In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [2]:
# Load data
train_data = pd.read_csv('archive/train.csv')
test_data=pd.read_csv('archive/test_public.csv')
print("train_data length:",len(train_data), "test_data length:", len(test_data))

train_data length: 1710670 test_data length: 320


In [11]:
# Preprocess Data
train_data = train_data[train_data['MISSING_DATA'] != True] #Drop any rows with missing data
preprocessed_data = train_data.drop(['TRIP_ID', 'DAY_TYPE', 'MISSING_DATA'], axis=1) #Drop irrelevant columns

preprocessed_data = pd.get_dummies(preprocessed_data, columns=['CALL_TYPE'])
preprocessed_data['CALL_TYPE'] = preprocessed_data['CALL_TYPE_A'].astype(int) + preprocessed_data['CALL_TYPE_B'].astype(int) * 2 + preprocessed_data['CALL_TYPE_C'].astype(int) * 3
preprocessed_data = preprocessed_data.drop(['CALL_TYPE_A', 'CALL_TYPE_B', 'CALL_TYPE_C'], axis=1)

preprocessed_data['ORIGIN_CALL'].fillna(0, inplace=True)
preprocessed_data['ORIGIN_STAND'].fillna(0, inplace=True)
preprocessed_data['TAXI_ID'] = preprocessed_data['TAXI_ID'] - 20000000 #Normalize taxi IDs

preprocessed_data['TIMESTAMP'] = pd.to_datetime(preprocessed_data['TIMESTAMP'], unit='s')
preprocessed_data['HOUR'] = preprocessed_data['TIMESTAMP'].dt.hour
preprocessed_data['DAY_OF_WEEK'] = preprocessed_data['TIMESTAMP'].dt.dayofweek + 1
preprocessed_data['WEEK_OF_YEAR'] = preprocessed_data['TIMESTAMP'].dt.isocalendar().week.astype(int)
preprocessed_data = preprocessed_data.drop('TIMESTAMP', axis=1)

def polyline_to_trip_duration(polyline):
  return max(polyline.count("[") - 2, 0) * 15

# This code creates a new column, "LEN", in our dataframe. The value is
# the (polyline_length - 1) * 15, where polyline_length = count("[") - 1
preprocessed_data["LEN"] = preprocessed_data["POLYLINE"].apply(polyline_to_trip_duration)

train, val = train_test_split(preprocessed_data, test_size=0.2, random_state=42)
train = train.sample(100000)
X_train = train.drop(['POLYLINE', 'LEN'], axis=1)
y_train = train['LEN']  # Calculate travel time in seconds
X_val = val.drop(['POLYLINE', 'LEN'], axis=1)
y_val = val['LEN']  # Calculate travel time in seconds

print(X_train)

         ORIGIN_CALL  ORIGIN_STAND  TAXI_ID  CALL_TYPE  HOUR  DAY_OF_WEEK  \
1640598          0.0          15.0      671          2     9            2   
922701           0.0          36.0       99          2    18            3   
305200           0.0           0.0      504          3    16            7   
816540       22843.0           0.0      560          1     5            7   
1025357      13083.0           0.0       41          1     1            7   
...              ...           ...      ...        ...   ...          ...   
232647           0.0           0.0      352          3    19            3   
247256           0.0          38.0      465          2     1            1   
614241           0.0           9.0      151          2     6            6   
714745           0.0          24.0        8          2     9            7   
295412           0.0          15.0      571          2    15            5   

         WEEK_OF_YEAR  
1640598            25  
922701              3  
305

In [33]:
'''
# Calculate the threshold for the top 1% travel time
top_percentile = np.percentile(y_train, 99)

# Filter the data based on the travel time threshold
X_train = X_train[y_train <= top_percentile]
y_train = y_train[y_train <= top_percentile]
'''

In [14]:
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [4,6,8,10],
    'learning_rate': [0.05,0.15,0.3],
    #'colsample_bytree': [0.5,0.75,1],
    #'min_child_weight': [1, 5, 10]
}

model = xgb.XGBRegressor()
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, verbose=4)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print(best_params)

y_pred = best_model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"RMSE: {np.sqrt(mse)}")

Fitting 3 folds for each of 324 candidates, totalling 972 fits
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.05, max_depth=4, min_child_weight=1, n_estimators=100;, score=0.050 total time=   1.2s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.05, max_depth=4, min_child_weight=1, n_estimators=100;, score=0.035 total time=   1.1s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.05, max_depth=4, min_child_weight=1, n_estimators=100;, score=0.052 total time=   1.1s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.05, max_depth=4, min_child_weight=1, n_estimators=300;, score=0.064 total time=   3.3s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.05, max_depth=4, min_child_weight=1, n_estimators=300;, score=0.044 total time=   3.2s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.05, max_depth=4, min_child_weight=1, n_estimators=300;, score=0.068 total time=   3.2s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.05, max_depth=4, min_child_weight=1, n_estimators=500;, sc

KeyboardInterrupt: 

In [54]:
model = xgb.XGBRegressor(n_estimators=200,max_depth=6,learning_rate=0.15)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"RMSE: {np.sqrt(mse)}")

In [48]:
# Full data: mean 580 median 584 std 97
# 100000: mean 583 median 587 std 111
# 50000: mean 579 median 582 std 117
# 10000: mean 583 median 585 std 147

print(model.feature_importances_)
print(y_pred.mean(), np.median(y_pred), y_pred.std())
print(y_val.mean(), np.median(y_val), y_val.std())

[0.06171712 0.1356234  0.16502033 0.28839728 0.14757086 0.11730443
 0.08436663]
716.1102 690.5679 209.12238
716.6898495910959 600.0 685.2213387618457


In [49]:
# Preprocess testing data
trip_ids = test_data['TRIP_ID']
preprocessed_test_data = test_data.drop(['TRIP_ID', 'DAY_TYPE', 'MISSING_DATA'], axis=1)

preprocessed_test_data = pd.get_dummies(preprocessed_test_data, columns=['CALL_TYPE'])
preprocessed_test_data['CALL_TYPE'] = preprocessed_test_data['CALL_TYPE_A'].astype(int) + preprocessed_test_data['CALL_TYPE_B'].astype(int) * 2 + preprocessed_test_data['CALL_TYPE_C'].astype(int) * 3
preprocessed_test_data = preprocessed_test_data.drop(['CALL_TYPE_A', 'CALL_TYPE_B', 'CALL_TYPE_C'], axis=1)

preprocessed_test_data['ORIGIN_CALL'].fillna(0, inplace=True)
preprocessed_test_data['ORIGIN_STAND'].fillna(0, inplace=True)

preprocessed_test_data['TAXI_ID'] = preprocessed_test_data['TAXI_ID'] - 20000000  # Normalize taxi IDs

preprocessed_test_data['TIMESTAMP'] = pd.to_datetime(preprocessed_test_data['TIMESTAMP'], unit='s')
preprocessed_test_data['HOUR'] = preprocessed_test_data['TIMESTAMP'].dt.hour
preprocessed_test_data['DAY_OF_WEEK'] = preprocessed_test_data['TIMESTAMP'].dt.dayofweek + 1
preprocessed_test_data['WEEK_OF_YEAR'] = preprocessed_test_data['TIMESTAMP'].dt.isocalendar().week.astype(int)
preprocessed_test_data = preprocessed_test_data.drop('TIMESTAMP', axis=1)


In [50]:
import csv

results = model.predict(preprocessed_test_data)
fields = ['TRIP_ID', 'TRAVEL_TIME']
rows = []

for i, result in enumerate(results):
    row = [str(trip_ids[i]), result]
    rows.append(row)

with open("submission.csv", 'w') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields)
    csvwriter.writerows(rows)