In [1]:
import pandas as pd

# train and test datasets
df_train = pd.read_csv("final_match_pairs_train.txt", sep = '\t') # inferred match pairs
df_test = pd.read_csv("final_match_pairs_ground_truth_additional.txt", sep = '\t') # ground-truth match pairs

# retain test dataset for testing reidentification algorithm
data_test = df_test.copy(deep = True)

# filter rows with match = 1
df_train = df_train[df_train.match == 1]
df_test = df_test[df_test.match == 1]

# drop redundant columns
index_cols = ['file', 'adv', 'stop']
df_train.drop(index_cols + ['match'], axis = 1, inplace = True)
df_test.drop(index_cols + ['match'], axis = 1, inplace = True)

# test dataset of candidate adv where travel time are to be predicted
X_adv = data_test.drop(index_cols + ['match', 'travel_time'], axis = 1)

# split training features & target into train and test sets
random_state = 42
X_train = df_train.drop('travel_time', axis = 1)
y_train = df_train.travel_time
X_test = df_test.drop('travel_time', axis = 1)
y_test = df_test.travel_time

print(f"Train dataset size: {len(X_train)}")
print(f"Test dataset size: {len(X_test)}")

Train dataset size: 5391
Test dataset size: 619


In [2]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Define the SAINT model
class SAINTModel(nn.Module):
    def __init__(self, input_dim, d_model, n_heads, depth, dropout):
        super(SAINTModel, self).__init__()
        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dropout=dropout,
            dim_feedforward=d_model * 4,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=depth)
        self.fc = nn.Linear(d_model, 1)  # Output layer for regression

        # Embedding layer for tabular data
        self.embedding = nn.Linear(input_dim, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Ensure input compatibility
        x = self.embedding(x)  # Map input to `d_model` dimension
        x = self.dropout(x)
        x = self.encoder(x.unsqueeze(1))  # Add a sequence dimension for the encoder
        x = self.fc(x.mean(dim=1))  # Pool across sequence dimension
        return x

# Define a scikit-learn wrapper for SAINT
class SAINTRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, d_model=128, n_heads=4, depth=3, dropout=0.1, learning_rate=0.001, batch_size=32, epochs=50, random_state=None):
        self.d_model = d_model
        self.n_heads = n_heads
        self.depth = depth
        self.dropout = dropout
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.epochs = epochs
        self.random_state = random_state

    def fit(self, X, y):
        # Ensure y is a NumPy array
        if isinstance(y, pd.Series):
            y = y.values  # Convert to NumPy array

        # Convert data to PyTorch tensors
        X = torch.tensor(X, dtype=torch.float32)
        y = torch.tensor(y, dtype=torch.float32)

        # Create dataset and dataloader
        dataset = torch.utils.data.TensorDataset(X, y)
        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        # Initialize model, optimizer, and loss
        self.model = SAINTModel(
            input_dim=X.shape[1],
            d_model=self.d_model,
            n_heads=self.n_heads,
            depth=self.depth,
            dropout=self.dropout,
        )
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.criterion = nn.MSELoss()

        # Training loop
        self.model.train()
        for epoch in range(self.epochs):
            for batch_X, batch_y in dataloader:
                self.optimizer.zero_grad()
                preds = self.model(batch_X).squeeze()
                loss = self.criterion(preds, batch_y)
                loss.backward()
                self.optimizer.step()


    def predict(self, X):
        # Convert data to PyTorch tensor
        X = torch.tensor(X, dtype=torch.float32)

        # Predict
        self.model.eval()
        with torch.no_grad():
            preds = self.model(X).squeeze().numpy()
        return preds

    def get_params(self, deep=True):
        return {
            'd_model': self.d_model,
            'n_heads': self.n_heads,
            'depth': self.depth,
            'dropout': self.dropout,
            'learning_rate': self.learning_rate,
            'batch_size': self.batch_size,
            'epochs': self.epochs,
            'random_state': self.random_state,
        }

    def set_params(self, **params):
        for key, value in params.items():
            setattr(self, key, value)
        return self

# Define hyperparameter grid
saint_param_grid = {
    'd_model': [64, 128],
    'n_heads': [4, 8],
    'depth': [3, 6],
    'dropout': [0.1, 0.2],
    'learning_rate': [0.001, 0.01],
    'batch_size': [32, 64],
}

# Define the GridSearchCV
grid_search = GridSearchCV(
    estimator=SAINTRegressor(epochs=50, random_state=42),
    param_grid=saint_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
    n_jobs=-1
)

# Fit GridSearchCV
grid_search.fit(X_train.values, y_train)

# Best parameters and cross-validation score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score (negative MSE):", grid_search.best_score_)

Fitting 5 folds for each of 64 candidates, totalling 320 fits




Best Parameters: {'batch_size': 64, 'd_model': 64, 'depth': 3, 'dropout': 0.1, 'learning_rate': 0.001, 'n_heads': 8}
Best Cross-Validation Score (negative MSE): -0.9737012581376249
[CV] END batch_size=32, d_model=64, depth=3, dropout=0.1, learning_rate=0.01, n_heads=4; total time= 1.5min
[CV] END batch_size=32, d_model=64, depth=3, dropout=0.2, learning_rate=0.001, n_heads=8; total time= 1.4min
[CV] END batch_size=32, d_model=64, depth=6, dropout=0.1, learning_rate=0.001, n_heads=4; total time= 2.5min
[CV] END batch_size=32, d_model=64, depth=6, dropout=0.1, learning_rate=0.01, n_heads=8; total time= 2.6min
[CV] END batch_size=32, d_model=64, depth=6, dropout=0.2, learning_rate=0.01, n_heads=4; total time= 2.9min
[CV] END batch_size=32, d_model=128, depth=3, dropout=0.1, learning_rate=0.001, n_heads=8; total time= 3.4min
[CV] END batch_size=32, d_model=128, depth=3, dropout=0.2, learning_rate=0.001, n_heads=4; total time= 3.4min
[CV] END batch_size=32, d_model=128, depth=6, dropout=0.1

In [3]:
# Evaluate model performance on validation fold
rmse_valid = np.sqrt(abs(grid_search.best_score_))

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
y_pred_final = best_model.predict(X_test.values)

rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_final))
print(f"Validation RMSE: {rmse_valid:.4f}, Test RMSE: {rmse_test:.4f}")

Validation RMSE: 0.9868, Test RMSE: 1.1213


In [4]:
# make predictions on candidate adv for reidentifying algorithm
y_adv_pred = best_model.predict(X_adv.values)

# add predicted travel time to dataset with both 1 and 0 matches
data_pred = data_test.copy(deep = True)
data_pred['y_pred'] = y_adv_pred

# save predicted travel time values
data_pred.to_csv("predicted_travel_time/SAINT.txt", sep = '\t', index = False)

In [5]:
import os

tt_thru_min, tt_thru_max = 2.5, 12 # min, max of through travel time to constrain search space

# function to process candidate match pairs
def reidentifyMatchPairs(adf, sdf, id_adv, data_pred, file):
    thru_match_initial = [] # store initial candidate match pairs of adv to stop-bar det
    
    for i in id_adv:
        adv_time = adf[adf.ID == i].TimeStamp.values[0]
        adv_lane = adf[adf.ID == i].Lane.values[0]

        # stop-bar det IDs on the same lane to look for a match
        id_stop_look = set(sdf[sdf.Lane == adv_lane].ID)

        for j in id_stop_look:
            stop_time = sdf[sdf.ID == j].TimeStamp.values[0]

            if stop_time > adv_time: # look forward in timestamp
                tt_adv_stop = (stop_time - adv_time) / np.timedelta64(1, 's') # paired travel time

                if tt_thru_min <= tt_adv_stop <= tt_thru_max:
                    # get predicted travel time for file and id_adv
                    Xi = data_pred.copy(deep = True)
                    Xi = Xi[(Xi.file == file[:-4]) & (Xi.adv == i)].reset_index(drop = True) # discard .txt
                    
                    tt_predict = Xi.loc[0, 'y_pred'] # predicted travel time
                    tt_diff = round(abs(tt_adv_stop - tt_predict), 4) # abs diff between paired & predicted

                    # store adv ID, stop ID, travel time diff
                    thru_match_initial.append([i, j, tt_diff])

    # dicts to store the lowest error for each adv, stop ID
    seen_adv_id, seen_stop_id = {}, {}

    # iterate through each candidate pair
    for pair in thru_match_initial:
        adv_id, stop_id, error = pair

        # check if adv ID not seen or if error is lower than seen error for that adv ID
        if (adv_id not in seen_adv_id) or (error < seen_adv_id[adv_id][1]):
            seen_adv_id[adv_id] = list([stop_id, error])

        # check if stop ID not seen or if error is lower than seen error for that stop ID
        if (stop_id not in seen_stop_id) or (error < seen_stop_id[stop_id][1]):
            seen_stop_id[stop_id] = list([adv_id, error])

    # match pairs for adv with lowest error
    df_adv = pd.DataFrame(seen_adv_id, index = ['adv', 'stop']).T.reset_index()
    df_adv.columns = ['adv', 'stop', 'error']

    # match pairs for stop with lowest error
    df_stop = pd.DataFrame(seen_stop_id, index = ['stop', 'adv']).T.reset_index()
    df_stop.columns = ['stop', 'adv', 'error']
    
    return {'df_adv': df_adv, 'df_stop': df_stop}

file_path = "data"
files = os.listdir(file_path)  # list of processed files to run through reidentifying algorithm

df_result = [] # store reidentified match pairs from each file

for file in files:
    print("Running reidentification algorithm for file: ", file)
    # read events-processed file with timestamp data
    df = pd.read_csv(os.path.join(file_path, file), sep = '\t')
    df.TimeStamp = pd.to_datetime(df.TimeStamp, format = '%Y-%m-%d %H:%M:%S.%f').sort_values()
    df.dropna(axis = 0, inplace = True) # drop rows with Nan

    # data frames for adv and stop-bar det
    adf = df[df.Det == 'adv']
    sdf = df[df.Det == 'stop']
    id_adv = list(sorted(adf.ID))

    # process candidate match pairs to get datasets of adv and stop pairs
    candidate_match_result = reidentifyMatchPairs(adf, sdf, id_adv, data_pred, file)
    df_adv = candidate_match_result['df_adv']
    df_stop = candidate_match_result['df_stop']

    # resulting common match pairs
    df_match_pair = df_adv.merge(df_stop, on = ['adv', 'stop', 'error'])
    df_match_pair['file'] = file[:-4]
    df_result.append(df_match_pair)

match_result = pd.concat(df_result)
match_result.to_csv("reidentification_result/SAINT.txt", sep = '\t')

# ground-truth match pairs for index cols
match_ground = data_test.copy(deep = True)
num_candidate_pairs = match_ground.shape[0]
print(f"\nNum of candidate pairs: {num_candidate_pairs}\n")

# filter ground-truth match pairs for match == 1 and select index cols
match_ground = match_ground[match_ground.match == 1][index_cols]

# get true positive (TP), false positive (FP), and false negative (FN) matches   
match_TP = pd.merge(match_result, match_ground, on = index_cols)
match_FP = match_result.merge(match_ground, on = index_cols, how = 'left', indicator = True).query('_merge == "left_only"').drop(columns = '_merge')
match_FN = match_ground.merge(match_result, on = index_cols, how = 'left', indicator = True).query('_merge == "left_only"').drop(columns = '_merge')

# num of TP, FP, FN
TP, FP, FN = match_TP.shape[0], match_FP.shape[0], match_FN.shape[0]
TN = num_candidate_pairs - TP - FP - FN

# compute metrics
accuracy = round((TP + TN) / (TP + FP + FN + TN), 4)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1 = 2*precision*recall / (precision + recall)

print(f"TP, FP, FN: {TP}, {FP}, {FN}")
print(f"Accuracy, Precision, Recall, F1: {accuracy:.4f}, {precision:.4f}, {recall:.4f}, {f1:.4f}")

Running reidentification algorithm for file:  20230327_0700_1400.txt
Running reidentification algorithm for file:  20221206_0945_1200.txt
Running reidentification algorithm for file:  20221214_0645_0715.txt
Running reidentification algorithm for file:  20230327_1415_1900.txt
Running reidentification algorithm for file:  20221206_0845_0915.txt
Running reidentification algorithm for file:  20221214_0945_1015.txt

Num of candidate pairs: 1040

TP, FP, FN: 507, 48, 112
Accuracy, Precision, Recall, F1: 0.8462, 0.9135, 0.8191, 0.8637
