In [50]:
import pandas as pd
import numpy as np
import csv

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [2]:
# Load data
train_data = pd.read_csv('archive/train.csv')
test_data=pd.read_csv('archive/test_public.csv')
print("train_data length:",len(train_data), "test_data length:", len(test_data))

train_data length: 1710670 test_data length: 320


In [34]:
# Preprocess Data
train_data = train_data[train_data['MISSING_DATA'] != True] #Drop any rows with missing data
preprocessed_data = train_data.drop(['TRIP_ID', 'DAY_TYPE', 'MISSING_DATA'], axis=1) #Drop irrelevant columns

preprocessed_data = pd.get_dummies(preprocessed_data, columns=['CALL_TYPE'])
preprocessed_data['CALL_TYPE'] = preprocessed_data['CALL_TYPE_A'].astype(int) + preprocessed_data['CALL_TYPE_B'].astype(int) * 2 + preprocessed_data['CALL_TYPE_C'].astype(int) * 3
preprocessed_data = preprocessed_data.drop(['CALL_TYPE_A', 'CALL_TYPE_B', 'CALL_TYPE_C'], axis=1)

preprocessed_data['ORIGIN_CALL'].fillna(0, inplace=True)
preprocessed_data['ORIGIN_STAND'].fillna(0, inplace=True)
preprocessed_data['TAXI_ID'] = preprocessed_data['TAXI_ID'] - 20000000 #Normalize taxi IDs

preprocessed_data['TIMESTAMP'] = pd.to_datetime(preprocessed_data['TIMESTAMP'], unit='s')
preprocessed_data['HOUR'] = preprocessed_data['TIMESTAMP'].dt.hour
preprocessed_data['DAY_OF_WEEK'] = preprocessed_data['TIMESTAMP'].dt.dayofweek + 1
preprocessed_data['WEEK_OF_YEAR'] = preprocessed_data['TIMESTAMP'].dt.isocalendar().week.astype(int)
preprocessed_data = preprocessed_data.drop('TIMESTAMP', axis=1)

def polyline_to_trip_duration(polyline):
  return max(polyline.count("[") - 2, 0) * 15

# This code creates a new column, "LEN", in our dataframe. The value is
# the (polyline_length - 1) * 15, where polyline_length = count("[") - 1
preprocessed_data["LEN"] = preprocessed_data["POLYLINE"].apply(polyline_to_trip_duration)

train, val = train_test_split(preprocessed_data, test_size=0.2, random_state=42)
train = train.sample(500000)
X_train = train.drop(['POLYLINE', 'LEN'], axis=1)
y_train = train['LEN']  # Calculate travel time in seconds
X_val = val.drop(['POLYLINE', 'LEN'], axis=1)
y_val = val['LEN']  # Calculate travel time in seconds

print(X_train)

         ORIGIN_CALL  ORIGIN_STAND  TAXI_ID  CALL_TYPE  HOUR  DAY_OF_WEEK  \
1547701          0.0           0.0      112          3    10            5   
1515924          0.0          15.0      525          2    18            5   
344382           0.0          12.0      100          2    14            1   
660091           0.0           0.0      353          3     9            3   
986649           0.0          27.0      128          2    11            5   
...              ...           ...      ...        ...   ...          ...   
952058           0.0          37.0      591          2     2            4   
54791            0.0           1.0      107          2    19            4   
1159806          0.0           0.0      503          3     7            7   
1095899      57773.0           0.0      346          1     6            1   
1490210          0.0           0.0      126          3    12            7   

         WEEK_OF_YEAR  
1547701            22  
1515924            21  
344

In [33]:
'''
# Calculate the threshold for the top 1% travel time
top_percentile = np.percentile(y_train, 99)

# Filter the data based on the travel time threshold
X_train = X_train[y_train <= top_percentile]
y_train = y_train[y_train <= top_percentile]
'''

In [37]:
param_grid = {
    #'n_estimators': [100, 300, 500],       300
    #'max_depth': [4,6,8,10],               4 (5 seems slightly better)
    #'learning_rate': [0.05,0.15,0.3],      0.15
    #'gamma': [0, 0.5, 1, 2],               IRRELEVANT
    #'colsample_bytree': [0.5,0.75,1],      0.5 (0.75 seems slightly better)
    #'min_child_weight': [1, 5, 10]         IRRELEVANT
}

model = xgb.XGBRegressor(learning_rate=0.15, n_estimators=300, max_depth=4, colsample_bytree=0.5)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, verbose=2)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print(best_params)

y_pred = best_model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"RMSE: {np.sqrt(mse)}")

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] END ........................................subsample=0; total time=   3.2s
[CV] END ........................................subsample=0; total time=   3.3s
[CV] END ........................................subsample=0; total time=   3.2s
[CV] END ......................................subsample=0.3; total time=  10.8s
[CV] END ......................................subsample=0.3; total time=  10.9s
[CV] END ......................................subsample=0.3; total time=  11.7s
[CV] END ......................................subsample=0.6; total time=  13.4s
[CV] END ......................................subsample=0.6; total time=  13.5s
[CV] END ......................................subsample=0.6; total time=  12.3s
{'subsample': 0.6}
Mean Squared Error: 442414.4959784708
RMSE: 665.142462919389


In [46]:
model = xgb.XGBRegressor(n_estimators=300,max_depth=5,learning_rate=0.15,colsample_bytree=0.75)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"RMSE: {np.sqrt(mse)}")

Mean Squared Error: 433833.78632658976
RMSE: 658.6606002537193


In [47]:
# Full data: mean 580 median 584 std 97
# 100000: mean 583 median 587 std 111
# 50000: mean 579 median 582 std 117
# 10000: mean 583 median 585 std 147

print(model.feature_importances_)
print(y_pred.mean(), np.median(y_pred), y_pred.std())
print(y_val.mean(), np.median(y_val), y_val.std())

[0.06173237 0.10853538 0.18023314 0.30610937 0.1566261  0.11687596
 0.06988759]
716.35706 691.9835 195.5302
717.5000876854548 600.0 694.5655365840385


In [48]:
# Preprocess testing data
trip_ids = test_data['TRIP_ID']
preprocessed_test_data = test_data.drop(['TRIP_ID', 'DAY_TYPE', 'MISSING_DATA'], axis=1)

preprocessed_test_data = pd.get_dummies(preprocessed_test_data, columns=['CALL_TYPE'])
preprocessed_test_data['CALL_TYPE'] = preprocessed_test_data['CALL_TYPE_A'].astype(int) + preprocessed_test_data['CALL_TYPE_B'].astype(int) * 2 + preprocessed_test_data['CALL_TYPE_C'].astype(int) * 3
preprocessed_test_data = preprocessed_test_data.drop(['CALL_TYPE_A', 'CALL_TYPE_B', 'CALL_TYPE_C'], axis=1)

preprocessed_test_data['ORIGIN_CALL'].fillna(0, inplace=True)
preprocessed_test_data['ORIGIN_STAND'].fillna(0, inplace=True)

preprocessed_test_data['TAXI_ID'] = preprocessed_test_data['TAXI_ID'] - 20000000  # Normalize taxi IDs

preprocessed_test_data['TIMESTAMP'] = pd.to_datetime(preprocessed_test_data['TIMESTAMP'], unit='s')
preprocessed_test_data['HOUR'] = preprocessed_test_data['TIMESTAMP'].dt.hour
preprocessed_test_data['DAY_OF_WEEK'] = preprocessed_test_data['TIMESTAMP'].dt.dayofweek + 1
preprocessed_test_data['WEEK_OF_YEAR'] = preprocessed_test_data['TIMESTAMP'].dt.isocalendar().week.astype(int)
preprocessed_test_data = preprocessed_test_data.drop('TIMESTAMP', axis=1)


In [51]:
#Write results to submission.csv
results = model.predict(preprocessed_test_data)
fields = ['TRIP_ID', 'TRAVEL_TIME']
rows = []

for i, result in enumerate(results):
    row = [str(trip_ids[i]), result]
    rows.append(row)

with open("submission.csv", 'w') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields)
    csvwriter.writerows(rows)