In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import xgboost as xgb


In [2]:
# Load the training data
train_data = pd.read_csv('archive/train.csv')
test_data=pd.read_csv('archive/test_public.csv')
print(len(train_data))
print(len(test_data))


1710670
320


In [160]:
# Preprocess the data
# Drop irrelevant columns (TRIP_ID, TAXI_ID, etc.)
# Encode categorical variables (CALL_TYPE, DAY_TYPE)
# Handle missing values (ORIGIN_CALL, ORIGIN_STAND)

#Doesn't actually work right now
#train_data.drop(train_data[train_data['MISSING_DATA'].to_string() == 'True'].index, inplace = True)

preprocessed_data = train_data.drop(['TRIP_ID', 'DAY_TYPE', 'MISSING_DATA'], axis=1)
# preprocessed_data = preprocessed_data.sample(10000)

preprocessed_data = pd.get_dummies(preprocessed_data, columns=['CALL_TYPE'])
preprocessed_data['CALL_TYPE'] = preprocessed_data['CALL_TYPE_A'].astype(int) + preprocessed_data['CALL_TYPE_B'].astype(int) * 2 + preprocessed_data['CALL_TYPE_C'].astype(int) * 3
preprocessed_data = preprocessed_data.drop(['CALL_TYPE_A', 'CALL_TYPE_B', 'CALL_TYPE_C'], axis=1)


preprocessed_data['ORIGIN_CALL'].fillna(0, inplace=True)
preprocessed_data['ORIGIN_STAND'].fillna(0, inplace=True)

preprocessed_data['TAXI_ID'] = preprocessed_data['TAXI_ID'] - 20000000  # Normalize taxi IDs

preprocessed_data['TIMESTAMP'] = pd.to_datetime(preprocessed_data['TIMESTAMP'], unit='s')
preprocessed_data['HOUR'] = preprocessed_data['TIMESTAMP'].dt.hour
preprocessed_data['DAY_OF_WEEK'] = preprocessed_data['TIMESTAMP'].dt.dayofweek + 1
preprocessed_data['WEEK_OF_YEAR'] = preprocessed_data['TIMESTAMP'].dt.isocalendar().week.astype(int)
preprocessed_data = preprocessed_data.drop('TIMESTAMP', axis=1)

def polyline_to_trip_duration(polyline):
  return max(polyline.count("[") - 2, 0) * 15

# This code creates a new column, "LEN", in our dataframe. The value is
# the (polyline_length - 1) * 15, where polyline_length = count("[") - 1
preprocessed_data["LEN"] = preprocessed_data["POLYLINE"].apply(polyline_to_trip_duration)

train, val = train_test_split(preprocessed_data, test_size=0.2, random_state=42)
train = train.sample(100000)
X_train = train.drop(['POLYLINE', 'LEN'], axis=1)
y_train = train['LEN']  # Calculate travel time in seconds
X_val = val.drop(['POLYLINE', 'LEN'], axis=1)
y_val = val['LEN']  # Calculate travel time in seconds

print(y_train)


574299     3900
555312      165
1025323     675
647030      810
781004      465
           ... 
1396874     645
310291      480
1645396    1170
239116      345
1086428     660
Name: LEN, Length: 10000, dtype: int64


In [161]:
# Calculate the threshold for the top 5% travel time
top_percentile = np.percentile(y_train, 90)

# Filter the data based on the travel time threshold
X_train = X_train[y_train <= top_percentile]
y_train = y_train[y_train <= top_percentile]

# Repeat the filtering for the validation set
X_val = X_val[y_val <= top_percentile]
y_val = y_val[y_val <= top_percentile]

In [162]:
print(top_percentile)
print(X_train)

1230.0
         ORIGIN_CALL  ORIGIN_STAND  TAXI_ID  CALL_TYPE  HOUR  DAY_OF_WEEK   
555312           0.0           0.0       80          3    10            1  \
1025323      23789.0           0.0      472          1    23            6   
647030       60204.0           0.0      343          1    19            6   
781004        8585.0           0.0      391          1     9            7   
1659326      17343.0           0.0      499          1     0            6   
...              ...           ...      ...        ...   ...          ...   
1396874          0.0           0.0      166          3     6            4   
310291       14084.0           0.0      398          1    21            1   
1645396          0.0          15.0      160          2     8            3   
239116           0.0           0.0      675          3    22            5   
1086428          0.0          23.0      276          2     1            6   

         WEEK_OF_YEAR  
555312             44  
1025323             

In [168]:
model = xgb.XGBRegressor(n_estimators=500,max_depth=16,learning_rate=0.15)
model.fit(X_train, y_train)

In [169]:
# With 300 estimators, max_depth=8
# Mean Squared Error: 256933.71772899234
# RMSE: 506.8862966474753

# Evaluate the model
y_pred = model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"RMSE: {np.sqrt(mse)}")

# Calculate absolute differences
abs_diff = np.abs(y_val - y_pred)

# Find indices of inputs with the biggest losses
top_indices = np.argsort(abs_diff)[-5:][::-1]

# Print the inputs with the biggest losses
print("Inputs with the biggest losses:")
for i in top_indices:
    print(X_val.iloc[i])
    print("____\n")


Mean Squared Error: 87391.3379333748
RMSE: 295.62025968017616
Inputs with the biggest losses:
ORIGIN_CALL       0.0
ORIGIN_STAND      0.0
TAXI_ID         616.0
CALL_TYPE         3.0
HOUR             11.0
DAY_OF_WEEK       3.0
WEEK_OF_YEAR     31.0
Name: 143148, dtype: float64
____

ORIGIN_CALL       0.0
ORIGIN_STAND      0.0
TAXI_ID         280.0
CALL_TYPE         3.0
HOUR              1.0
DAY_OF_WEEK       1.0
WEEK_OF_YEAR     51.0
Name: 782663, dtype: float64
____

ORIGIN_CALL     2002.0
ORIGIN_STAND       0.0
TAXI_ID            5.0
CALL_TYPE          1.0
HOUR               3.0
DAY_OF_WEEK        6.0
WEEK_OF_YEAR      18.0
Name: 1406089, dtype: float64
____

ORIGIN_CALL       0.0
ORIGIN_STAND     61.0
TAXI_ID         167.0
CALL_TYPE         2.0
HOUR              8.0
DAY_OF_WEEK       2.0
WEEK_OF_YEAR     20.0
Name: 1465092, dtype: float64
____

ORIGIN_CALL       0.0
ORIGIN_STAND      0.0
TAXI_ID         172.0
CALL_TYPE         3.0
HOUR             14.0
DAY_OF_WEEK       1.0
WEEK_OF_Y

In [170]:
# Full data: mean 580 median 584 std 97
# 100000: mean 583 median 587 std 111
# 50000: mean 579 median 582 std 117
# 10000: mean 583 median 585 std 147

print(model.feature_importances_)
print(y_pred.mean(), np.median(y_pred), y_pred.std())
print(y_val.mean(), np.median(y_val), y_val.std())

[0.11550716 0.102992   0.17929403 0.06201489 0.16430415 0.17191538
 0.20397234]
577.8193 578.6564 158.59146
586.2583338164532 570.0 278.84985466471693


In [171]:
# Preprocess the data
# Drop irrelevant columns (TRIP_ID, TAXI_ID, etc.)
# Encode categorical variables (CALL_TYPE, DAY_TYPE)
# Handle missing values (ORIGIN_CALL, ORIGIN_STAND)
trip_ids = test_data['TRIP_ID']
preprocessed_test_data = test_data.drop(['TRIP_ID', 'DAY_TYPE', 'MISSING_DATA'], axis=1)

preprocessed_test_data = pd.get_dummies(preprocessed_test_data, columns=['CALL_TYPE'])
preprocessed_test_data['CALL_TYPE'] = preprocessed_test_data['CALL_TYPE_A'].astype(int) + preprocessed_test_data['CALL_TYPE_B'].astype(int) * 2 + preprocessed_test_data['CALL_TYPE_C'].astype(int) * 3
preprocessed_test_data = preprocessed_test_data.drop(['CALL_TYPE_A', 'CALL_TYPE_B', 'CALL_TYPE_C'], axis=1)

preprocessed_test_data['ORIGIN_CALL'].fillna(0, inplace=True)
preprocessed_test_data['ORIGIN_STAND'].fillna(0, inplace=True)

preprocessed_test_data['TAXI_ID'] = preprocessed_test_data['TAXI_ID'] - 20000000  # Normalize taxi IDs

preprocessed_test_data['TIMESTAMP'] = pd.to_datetime(preprocessed_test_data['TIMESTAMP'], unit='s')
preprocessed_test_data['HOUR'] = preprocessed_test_data['TIMESTAMP'].dt.hour
preprocessed_test_data['DAY_OF_WEEK'] = preprocessed_test_data['TIMESTAMP'].dt.dayofweek + 1
preprocessed_test_data['WEEK_OF_YEAR'] = preprocessed_test_data['TIMESTAMP'].dt.isocalendar().week.astype(int)
preprocessed_test_data = preprocessed_test_data.drop('TIMESTAMP', axis=1)


In [172]:
import csv

results = model.predict(preprocessed_test_data)

fields = ['TRIP_ID', 'TRAVEL_TIME']
rows = []

for i, result in enumerate(results):
    row = [str(trip_ids[i]), result]
    rows.append(row)

with open("submission.csv", 'w') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields)
    csvwriter.writerows(rows)