In [77]:
import os
import numpy as np
import pandas as pd
import itertools
import json

import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error

In [78]:
# load the inferred (train) and ground-truth (test) match pairs
df_train = pd.read_csv("final_match_pairs_train.txt", sep = '\t')
df_test = pd.read_csv("final_match_pairs_ground_truth.txt", sep = '\t')

# untouched copy of the test data (match = 1 and 0), kept for the reidentification algorithm
data_test = df_test.copy(deep = True)

# identifier columns that are not used as model features
index_cols = ['file', 'adv', 'stop']

# keep matched pairs only, then strip the identifier and label columns
df_train = df_train[df_train.match == 1].drop(columns = index_cols + ['match'])
df_test = df_test[df_test.match == 1].drop(columns = index_cols + ['match'])

# training target and features
y = df_train.travel_time
X = df_train.drop(columns = 'travel_time')

# hold out 25% of the training pairs for validation (fixed seed for reproducibility)
random_state = 42
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.25, random_state = random_state)

# testing target and features
y_test = df_test.travel_time
X_test = df_test.drop(columns = 'travel_time')

print(f"Train dataset size: {len(X_train)}")
print(f"Validation dataset size: {len(X_valid)}")
print(f"Test dataset size: {len(X_test)}")

Train dataset size: 4043
Validation dataset size: 1348
Test dataset size: 337


In [79]:
# function to fit & predict travel time and evaluate performance
def modelFitPredict(model):
    """Fit `model`, score its travel-time predictions, and predict for every candidate pair.

    The model is first fitted on the train split and scored on the validation
    and test sets, then refitted on train + validation and scored on the test
    set again. The refitted model produces the predictions stored in `y_pred`.

    Parameters
    ----------
    model : estimator exposing `fit(X, y)` and `predict(X)`; it is fitted in place.

    Returns
    -------
    dict with
        'metrics_pred' : dict of MAPE / RMSE values (validation, test, test after refit)
        'data_pred'    : deep copy of `data_test` with an added 'y_pred' column
    """
    def score(y_true, y_hat):
        # (MAPE, RMSE) of predictions against observed travel times
        return (mean_absolute_percentage_error(y_true, y_hat),
                np.sqrt(mean_squared_error(y_true, y_hat)))

    # stage 1: fit on the train split only
    model.fit(X_train, y_train)
    mape_valid, rmse_valid = score(y_valid, model.predict(X_valid))
    mape_test, rmse_test = score(y_test, model.predict(X_test))

    # stage 2: refit on train + validation, then score the test set again
    model.fit(X, y)
    mape_test_full, rmse_test_full = score(y_test, model.predict(X_test))

    metrics_pred = {
        'mape_valid': mape_valid,
        'rmse_valid': rmse_valid,
        'mape_test': mape_test,
        'rmse_test': rmse_test,
        'mape_test_full': mape_test_full,
        'rmse_test_full': rmse_test_full,
    }

    # features of all candidate pairs (match = 1 and 0) for the reidentifying algorithm
    features_adv = data_test.drop(index_cols + ['match', 'travel_time'], axis = 1)

    # attach the predicted travel time to a copy of the full test dataset
    data_pred = data_test.copy(deep = True)
    data_pred['y_pred'] = model.predict(features_adv)

    return {'metrics_pred': metrics_pred, 'data_pred': data_pred}

In [80]:
# Window (in seconds) of plausible through travel time between an adv and a stop-bar
# detection; reidentifyMatchPairs keeps only candidate pairs whose paired travel time
# lies inside [tt_thru_min, tt_thru_max].
tt_thru_min, tt_thru_max = 2.5, 12 # min, max of through travel time to constrain search space

# function to process candidate match pairs
def reidentifyMatchPairs(adf, sdf, id_adv, data_pred, file, tt_min = None, tt_max = None):
    """Build candidate adv -> stop-bar pairs and keep the lowest-error pair per ID.

    For every adv ID, each stop-bar detection on the same lane that occurs later
    in time and whose paired travel time lies in [tt_min, tt_max] seconds is a
    candidate. Its error is the absolute difference between the paired travel
    time and the predicted travel time for that (file, adv).

    Parameters
    ----------
    adf, sdf : DataFrames of adv / stop-bar detections with columns
        'ID', 'Lane' and 'TimeStamp' (datetime).
    id_adv : iterable of adv IDs to process (each must be present in `adf`).
    data_pred : DataFrame with columns 'file', 'adv' and 'y_pred'.
    file : file name; its last four characters (the '.txt' extension) are
        stripped before matching against `data_pred.file`.
    tt_min, tt_max : optional travel-time bounds in seconds; default to the
        notebook-level `tt_thru_min` / `tt_thru_max`.

    Returns
    -------
    dict with
        'df_adv'  : columns ['adv', 'stop', 'error'], best stop per adv ID
        'df_stop' : columns ['stop', 'adv', 'error'], best adv per stop ID

    Raises
    ------
    KeyError : if a candidate pair is found for an adv ID that has no row in
        `data_pred` for this file.
    """
    # fall back to the notebook-level travel-time window when no bounds are passed
    if tt_min is None:
        tt_min = tt_thru_min
    if tt_max is None:
        tt_max = tt_thru_max

    file_key = file[:-4] # discard .txt

    # predictions depend only on (file, adv): filter by file once per call instead
    # of deep-copying and filtering the whole frame for every candidate pair
    pred_file = data_pred[data_pred.file == file_key]

    thru_match_initial = [] # store initial candidate match pairs of adv to stop-bar det

    for i in id_adv:
        adv_row = adf[adf.ID == i]
        adv_time = adv_row.TimeStamp.values[0]
        adv_lane = adv_row.Lane.values[0]

        # stop-bar det IDs on the same lane to look for a match
        id_stop_look = set(sdf[sdf.Lane == adv_lane].ID)

        tt_predict = None # looked up lazily, only once a candidate is found

        for j in id_stop_look:
            stop_time = sdf[sdf.ID == j].TimeStamp.values[0]

            if stop_time > adv_time: # look forward in timestamp
                tt_adv_stop = (stop_time - adv_time) / np.timedelta64(1, 's') # paired travel time

                if tt_min <= tt_adv_stop <= tt_max:
                    if tt_predict is None:
                        pred_rows = pred_file[pred_file.adv == i]
                        if pred_rows.empty:
                            raise KeyError(f"no predicted travel time for file '{file_key}', adv {i}")
                        tt_predict = pred_rows['y_pred'].iloc[0] # predicted travel time

                    tt_diff = round(abs(tt_adv_stop - tt_predict), 4) # abs diff between paired & predicted

                    # store adv ID, stop ID, travel time diff
                    thru_match_initial.append([i, j, tt_diff])

    # dicts to store the lowest error for each adv, stop ID
    seen_adv_id, seen_stop_id = {}, {}

    # strict '<' keeps the first-seen pair when two candidates tie on error
    for adv_id, stop_id, error in thru_match_initial:
        if (adv_id not in seen_adv_id) or (error < seen_adv_id[adv_id][1]):
            seen_adv_id[adv_id] = [stop_id, error]

        if (stop_id not in seen_stop_id) or (error < seen_stop_id[stop_id][1]):
            seen_stop_id[stop_id] = [adv_id, error]

    # The two-row `index` only gives the intermediate frame the right shape; after
    # transposing and resetting the index the columns are renamed to their real meaning.
    # NOTE(review): each dict value mixes an ID with a float error, so the ID taken
    # from the values is likely upcast to float - confirm downstream merges tolerate it.

    # match pairs for adv with lowest error
    df_adv = pd.DataFrame(seen_adv_id, index = ['adv', 'stop']).T.reset_index()
    df_adv.columns = ['adv', 'stop', 'error']

    # match pairs for stop with lowest error
    df_stop = pd.DataFrame(seen_stop_id, index = ['stop', 'adv']).T.reset_index()
    df_stop.columns = ['stop', 'adv', 'error']

    return {'df_adv': df_adv, 'df_stop': df_stop}

In [81]:
file_path = "processed"

# list of processed files to run through reidentifying algorithm.
# os.listdir returns entries in arbitrary order and may include non-data entries
# (e.g. a .ipynb_checkpoints folder), so keep only the .txt files and sort them
# to make every run process the same files in the same order.
files = sorted(name for name in os.listdir(file_path) if name.endswith('.txt'))

# function to process each file for reidentifying match pairs
def processFiles(data_pred):
    """Run the reidentification algorithm on every processed file.

    For each name in the notebook-level `files` list, the events file is read
    from `file_path`, split into adv and stop-bar detections, and passed to
    `reidentifyMatchPairs`. A pair is kept only when it is the lowest-error
    pair both for its adv ID and for its stop ID.

    Parameters
    ----------
    data_pred : DataFrame of candidate pairs with predicted travel time
        (forwarded unchanged to `reidentifyMatchPairs`).

    Returns
    -------
    DataFrame with columns ['adv', 'stop', 'error', 'file'], the concatenation
    of the agreed match pairs of all files.
    """
    df_result = [] # store reidentified match pairs from each file

    for file in files:
        # read events-processed file with timestamp data
        df = pd.read_csv(os.path.join(file_path, file), sep = '\t')

        # parse timestamps. No sort is applied: assigning a Series back to a column
        # aligns on the index, so sorting the values first would not reorder any row,
        # and all lookups below are by ID rather than by position.
        df['TimeStamp'] = pd.to_datetime(df['TimeStamp'], format = '%Y-%m-%d %H:%M:%S.%f')
        df = df.dropna(axis = 0) # drop rows with Nan

        # data frames for adv and stop-bar det
        adf = df[df.Det == 'adv']
        sdf = df[df.Det == 'stop']
        id_adv = sorted(adf.ID)

        # process candidate match pairs to get datasets of adv and stop pairs
        candidate_match_result = reidentifyMatchPairs(adf, sdf, id_adv, data_pred, file)
        df_adv = candidate_match_result['df_adv']
        df_stop = candidate_match_result['df_stop']

        # resulting common match pairs: best for the adv ID and for the stop ID
        df_match_pair = df_adv.merge(df_stop, on = ['adv', 'stop', 'error'])
        df_match_pair['file'] = file[:-4] # discard .txt
        df_result.append(df_match_pair)

    match_result = pd.concat(df_result)
    return match_result

In [82]:
# ground-truth match pairs (match == 1), reduced to the identifying index cols
match_ground = data_test.loc[data_test.match == 1, index_cols].copy(deep = True)
num_ground_match_pairs = len(match_ground.index)

# function for reidentification algorithm
def evaluateMatchMetrics(data_pred):
    """Run the reidentification algorithm and score it against the ground truth.

    Parameters
    ----------
    data_pred : DataFrame of candidate pairs with predicted travel time
        (forwarded to `processFiles`).

    Returns
    -------
    dict with
        'num_match' : [TP, FP, FN] counts
        'accuracy'  : TP / number of ground-truth match pairs
        'precision', 'recall', 'f1'
    Any ratio whose denominator is zero is reported as 0.0 instead of raising,
    so one combination with no (true) matches cannot abort a whole sweep.
    """
    def ratio(numerator, denominator):
        # guarded division: 0.0 when the denominator is zero
        return numerator / denominator if denominator else 0.0

    # get match result from processing files for reidentifying algorithm
    match_result = processFiles(data_pred)

    # true positives: reidentified pairs that are also ground-truth pairs
    match_TP = pd.merge(match_result, match_ground, on = index_cols)
    # false positives: reidentified pairs absent from the ground truth
    match_FP = match_result.merge(match_ground, on = index_cols, how = 'left', indicator = True).query('_merge == "left_only"').drop(columns = '_merge')
    # false negatives: ground-truth pairs that were not reidentified
    match_FN = match_ground.merge(match_result, on = index_cols, how = 'left', indicator = True).query('_merge == "left_only"').drop(columns = '_merge')

    # num of TP, FP, FN
    TP, FP, FN = match_TP.shape[0], match_FP.shape[0], match_FN.shape[0]
    num_match = [TP, FP, FN]

    # compute metrics
    accuracy = ratio(TP, num_ground_match_pairs)
    precision = ratio(TP, TP + FP)
    recall = ratio(TP, TP + FN)
    f1 = ratio(2*precision*recall, precision + recall)

    return {'num_match': num_match, 'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}

In [83]:
# grid of hyperparameters for each model
# Each grid is expanded into the full Cartesian product of its value lists,
# so the number of model fits is the product of the list lengths.

# decision tree ('dt'): 5 * 3 * 3 * 3 * 2 = 270 combinations
dt_param = {
    'max_depth': [None, 5, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2'],
    'criterion': ['friedman_mse', 'absolute_error']
}

# support vector regression ('sv'): 3 * 3 * 4 * 3 = 108 combinations
# NOTE(review): per the scikit-learn docs 'degree' is only used by the 'poly' kernel,
# so the 'linear' and 'rbf' combinations repeat three times each - confirm this is intended
sv_param = {
    'kernel': ['linear', 'rbf', 'poly'],
    'C': [0.1, 1, 10],
    'epsilon': [0.01, 0.1, 0.2, 0.5],
    'degree': [2, 3, 4]
}

# random forest ('rf'): 4 * 5 * 3 * 3 * 2 = 360 combinations
rf_param = {
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# XGBoost ('xgb'): 4 * 3 * 5 * 4 * 3 * 3 * 3 = 6480 combinations
xgb_param = {
    'n_estimators': [50, 100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5, 7, 10],
    'min_child_weight': [1, 3, 5, 7],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0, 0.1, 0.5]
}

In [84]:
# function to produce combination of hyperparameters to test reidentification accuracy
def hyperparameterCombination(model_param):
    """Expand a hyperparameter grid into every possible combination.

    Parameters
    ----------
    model_param : dict mapping a parameter name to the list of values to try.

    Returns
    -------
    list of dicts, one per element of the Cartesian product of the value lists,
    each mapping every parameter name to one chosen value. The rightmost
    parameter varies fastest.
    """
    names = list(model_param)
    return [dict(zip(names, choice)) for choice in itertools.product(*model_param.values())]

In [85]:
def processModelMetrics(method, param_comb):
    """Evaluate one hyperparameter combination for prediction and reidentification.

    Parameters
    ----------
    method : str, one of 'dt' (decision tree), 'sv' (support vector regression),
        'rf' (random forest) or 'xgb' (XGBoost).
    param_comb : dict of keyword arguments passed to the model constructor.

    Returns
    -------
    dict with
        'metrics_match' : reidentification metrics from `evaluateMatchMetrics`
        'metrics_pred'  : travel-time prediction metrics from `modelFitPredict`
        'data_pred'     : DataFrame with the index columns and the signed
                          prediction error (travel_time - y_pred)

    Raises
    ------
    ValueError : if `method` is not one of the supported model codes.
    """
    # select model and its parameters
    if method == 'dt':
        model = DecisionTreeRegressor(**param_comb, random_state = random_state)
    elif method == 'sv':
        model = SVR(**param_comb)
    elif method == 'rf':
        model = RandomForestRegressor(**param_comb, random_state = random_state)
    elif method == 'xgb':
        model = xgb.XGBRegressor(**param_comb, random_state = random_state)
    else:
        # fail with a clear message instead of an unbound `model` further down
        raise ValueError(f"unknown method {method!r}; expected one of 'dt', 'sv', 'rf', 'xgb'")

    # evaluate model's performance for travel time prediction
    prediction_result = modelFitPredict(model)
    metrics_pred = prediction_result['metrics_pred']
    data_pred = prediction_result['data_pred']

    # result from reidentification algorithm
    metrics_match = evaluateMatchMetrics(data_pred)

    # compute error in prediction for test dataset with both 1 and 0 matches
    data_pred['error'] = data_pred['travel_time'] - data_pred['y_pred'] # compute error in predicting travel time
    data_pred = data_pred[index_cols + ['error']] # retain only file, adv, stop, error columns

    return {'metrics_match': metrics_match, 'metrics_pred': metrics_pred, 'data_pred': data_pred}

In [86]:
def processHyperparameterMetrics(method, model_param, max_comb = 3):
    """Sweep hyperparameter combinations for one model and persist the results.

    Every evaluated combination is appended to '<method>_all_comb_metrics.json'
    (three JSON lines: parameters, prediction metrics, matching metrics). After
    the sweep, the combination with the highest reidentification F1 and the one
    with the lowest 'mape_test_full' are appended to
    '<method>_best_comb_metrics.json', and their prediction errors are written to
    '<method>_best_match_data_pred.txt' and '<method>_best_pred_data_pred.txt'.

    Parameters
    ----------
    method : str, model code understood by `processModelMetrics`.
    model_param : dict mapping a parameter name to the list of values to try.
    max_comb : int or None, number of leading combinations to evaluate.
        The default of 3 keeps the sweep short; pass None to evaluate all of them.

    Raises
    ------
    ValueError : if there is no combination to evaluate.
    """
    model_comb = hyperparameterCombination(model_param) # all combinations of hyperparameters
    eval_comb = model_comb if max_comb is None else model_comb[:max_comb]
    print(f"Evaluating {len(eval_comb)} of {len(model_comb)} combinations")

    if not eval_comb:
        raise ValueError("no hyperparameter combinations to evaluate")

    all_comb_metrics_file = method + '_all_comb_metrics.json'

    # Best results so far. Starting from None (rather than fixed thresholds of 0 and 1)
    # guarantees a winner is recorded even if every F1 is 0 or every MAPE is >= 1.
    best_match = None # combination with the highest F1 from reidentification
    best_pred = None # combination with the lowest MAPE from travel time prediction

    for comb in eval_comb:
        comb_result = processModelMetrics(method, comb) # process model metrics for prediction and reidentification
        metrics_match = comb_result['metrics_match']
        metrics_pred = comb_result['metrics_pred']

        # store pred & match metrics for each combination of hyperparameters;
        # the file is reopened per combination so partial results survive an interrupted sweep
        with open(all_comb_metrics_file, 'a') as file:
            for record in (comb, metrics_pred, metrics_match):
                json.dump(record, file)
                file.write('\n')

        # strict comparisons keep the earliest combination when scores tie
        if best_match is None or metrics_match['f1'] > best_match['metrics_match']['f1']:
            best_match = {'comb': comb, **comb_result}

        if best_pred is None or metrics_pred['mape_test_full'] < best_pred['metrics_pred']['mape_test_full']:
            best_pred = {'comb': comb, **comb_result}

    best_records = (
        best_match['comb'],          # best hyperparameter combination for matching
        best_match['metrics_match'], # best match metrics
        best_match['metrics_pred'],  # prediction metrics for best match combination
        best_pred['comb'],           # best hyperparameter combination for prediction
        best_pred['metrics_pred'],   # best pred metrics
        best_pred['metrics_match'],  # matching metrics for best prediction combination
    )

    best_comb_metrics_file = method + '_best_comb_metrics.json'
    with open(best_comb_metrics_file, 'a') as file:
        for record in best_records:
            json.dump(record, file)
            # the file is opened in append mode: end every record (including the
            # last) with a newline so a later run does not glue onto this line
            file.write('\n')

    best_match['data_pred'].to_csv(method + '_best_match_data_pred.txt', sep = '\t')
    best_pred['data_pred'].to_csv(method + '_best_pred_data_pred.txt', sep = '\t')

In [87]:
# run reidentification framework
# Only the decision-tree sweep is enabled; the calls for the other models
# (support vector, random forest, XGBoost) are kept below but commented out.
processHyperparameterMetrics('dt', dt_param)
# processHyperparameterMetrics('sv', sv_param)
# processHyperparameterMetrics('rf', rf_param)
# processHyperparameterMetrics('xgb', xgb_param)

Evaluating 270 combinations


KeyboardInterrupt: 