In [160]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import xgboost as xgb


In [161]:
# Load the training data
train_data = pd.read_csv('archive/train.csv')
test_data=pd.read_csv('archive/test_public.csv')
print(len(train_data))
print(len(test_data))


1710670
320


In [163]:
# Preprocess the data
# Drop irrelevant columns (TRIP_ID, TAXI_ID, etc.)
# Encode categorical variables (CALL_TYPE, DAY_TYPE)
# Handle missing values (ORIGIN_CALL, ORIGIN_STAND)

#Doesn't actually work right now
#train_data.drop(train_data[train_data['MISSING_DATA'].to_string() == 'True'].index, inplace = True)

preprocessed_data = train_data.drop(['TRIP_ID', 'DAY_TYPE', 'MISSING_DATA'], axis=1)

preprocessed_data = pd.get_dummies(preprocessed_data, columns=['CALL_TYPE'])
preprocessed_data['CALL_TYPE'] = preprocessed_data['CALL_TYPE_A'].astype(int) + preprocessed_data['CALL_TYPE_B'].astype(int) * 2 + preprocessed_data['CALL_TYPE_C'].astype(int) * 3
preprocessed_data = preprocessed_data.drop(['CALL_TYPE_A', 'CALL_TYPE_B', 'CALL_TYPE_C'], axis=1)


preprocessed_data['ORIGIN_CALL'].fillna(0, inplace=True)
preprocessed_data['ORIGIN_STAND'].fillna(0, inplace=True)

preprocessed_data['TAXI_ID'] = preprocessed_data['TAXI_ID'] - 20000000  # Normalize taxi IDs

preprocessed_data['TIMESTAMP'] = pd.to_datetime(preprocessed_data['TIMESTAMP'], unit='s')
preprocessed_data['HOUR'] = preprocessed_data['TIMESTAMP'].dt.hour
preprocessed_data['DAY_OF_WEEK'] = preprocessed_data['TIMESTAMP'].dt.dayofweek + 1
preprocessed_data['WEEK_OF_YEAR'] = preprocessed_data['TIMESTAMP'].dt.isocalendar().week.astype(int)
preprocessed_data = preprocessed_data.drop('TIMESTAMP', axis=1)

X = preprocessed_data.drop(['POLYLINE'], axis=1)
y = (train_data['POLYLINE'].str.count(',')-1) * 15  # Calculate travel time in seconds

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [164]:
# Calculate the threshold for the top 5% travel time
top_percentile = np.percentile(y_train, 90)

# Filter the data based on the travel time threshold
X_train = X_train[y_train <= top_percentile]
y_train = y_train[y_train <= top_percentile]

# Repeat the filtering for the validation set
X_val = X_val[y_val <= top_percentile]
y_val = y_val[y_val <= top_percentile]

In [165]:
print(top_percentile)
print(X_train)

2400.0
         ORIGIN_CALL  ORIGIN_STAND  TAXI_ID  CALL_TYPE  HOUR  DAY_OF_WEEK  \
1566255       3998.0           0.0      296          1    16            1   
795817           0.0          20.0      392          2    20            3   
1144137          0.0           0.0       39          3     7            4   
180789           0.0          36.0      597          2    12            5   
479959           0.0          42.0      285          2    22            6   
...              ...           ...      ...        ...   ...          ...   
259178       24662.0           0.0      184          1    18            3   
1414414          0.0          37.0       83          2    10            7   
131932           0.0          25.0      600          2    19            7   
671155           0.0          25.0      451          2    16            5   
121958           0.0           0.0      342          2    14            5   

         WEEK_OF_YEAR  
1566255            23  
795817             5

In [139]:
model = xgb.XGBRegressor(n_estimators=300,max_depth=8,learning_rate=0.15)
model.fit(X_train, y_train)


In [159]:
# Evaluate the model
y_pred = model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"RMSE: {np.sqrt(mse)}")

# Calculate absolute differences
abs_diff = np.abs(y_val - y_pred)

# Find indices of inputs with the biggest losses
top_indices = np.argsort(abs_diff)[-5:][::-1]

# Print the inputs with the biggest losses
print("Inputs with the biggest losses:")
for i in top_indices:
    print(X_val.iloc[i])
    print("____\n")


Mean Squared Error: 1922038.4755723134
RMSE: 1386.376022431257
Inputs with the biggest losses:
ORIGIN_CALL       0.0
ORIGIN_STAND      0.0
TAXI_ID         562.0
CALL_TYPE         3.0
HOUR              7.0
DAY_OF_WEEK       6.0
WEEK_OF_YEAR     20.0
Name: 1492417, dtype: float64
____

ORIGIN_CALL       0.0
ORIGIN_STAND      0.0
TAXI_ID         902.0
CALL_TYPE         3.0
HOUR             16.0
DAY_OF_WEEK       3.0
WEEK_OF_YEAR     31.0
Name: 147121, dtype: float64
____

ORIGIN_CALL       0.0
ORIGIN_STAND      0.0
TAXI_ID         510.0
CALL_TYPE         3.0
HOUR             12.0
DAY_OF_WEEK       5.0
WEEK_OF_YEAR     24.0
Name: 1627045, dtype: float64
____

ORIGIN_CALL       0.0
ORIGIN_STAND      0.0
TAXI_ID         665.0
CALL_TYPE         3.0
HOUR              8.0
DAY_OF_WEEK       3.0
WEEK_OF_YEAR     46.0
Name: 633224, dtype: float64
____

ORIGIN_CALL       0.0
ORIGIN_STAND      0.0
TAXI_ID         520.0
CALL_TYPE         3.0
HOUR             18.0
DAY_OF_WEEK       3.0
WEEK_OF_YEAR   

In [95]:
print(model.feature_importances_)
print(X_val)
print(y_pred)

[0.10981362 0.21117005 0.1868946  0.16885234 0.14956258 0.08111721
 0.09258962]
         ORIGIN_CALL  ORIGIN_STAND  TAXI_ID  CALL_TYPE  HOUR  DAY_OF_WEEK  \
1414161          0.0          10.0      213          2     7            7   
1397309          0.0           0.0       92          3     7            4   
560966           0.0           9.0      263          2    14            2   
1453634          0.0          13.0      503          2    21            6   
889280           0.0           0.0      678          3    14            2   
...              ...           ...      ...        ...   ...          ...   
1064661          0.0           0.0      367          3    15            1   
1195435          0.0           0.0      547          3    16            1   
1661714          0.0          56.0      692          2     7            6   
545123           0.0          14.0      648          2     2            6   
1215152      52782.0           0.0      621          1     9            6

In [141]:
# Preprocess the data
# Drop irrelevant columns (TRIP_ID, TAXI_ID, etc.)
# Encode categorical variables (CALL_TYPE, DAY_TYPE)
# Handle missing values (ORIGIN_CALL, ORIGIN_STAND)
trip_ids = test_data['TRIP_ID']
preprocessed_test_data = test_data.drop(['TRIP_ID', 'DAY_TYPE', 'MISSING_DATA'], axis=1)

preprocessed_test_data = pd.get_dummies(preprocessed_test_data, columns=['CALL_TYPE'])
preprocessed_test_data['CALL_TYPE'] = preprocessed_test_data['CALL_TYPE_A'].astype(int) + preprocessed_test_data['CALL_TYPE_B'].astype(int) * 2 + preprocessed_test_data['CALL_TYPE_C'].astype(int) * 3
preprocessed_test_data = preprocessed_test_data.drop(['CALL_TYPE_A', 'CALL_TYPE_B', 'CALL_TYPE_C'], axis=1)

preprocessed_test_data['ORIGIN_CALL'].fillna(0, inplace=True)
preprocessed_test_data['ORIGIN_STAND'].fillna(0, inplace=True)

preprocessed_test_data['TAXI_ID'] = preprocessed_test_data['TAXI_ID'] - 20000000  # Normalize taxi IDs

preprocessed_test_data['TIMESTAMP'] = pd.to_datetime(preprocessed_test_data['TIMESTAMP'], unit='s')
preprocessed_test_data['HOUR'] = preprocessed_test_data['TIMESTAMP'].dt.hour
preprocessed_test_data['DAY_OF_WEEK'] = preprocessed_test_data['TIMESTAMP'].dt.dayofweek + 1
preprocessed_test_data['WEEK_OF_YEAR'] = preprocessed_test_data['TIMESTAMP'].dt.isocalendar().week.astype(int)
preprocessed_test_data = preprocessed_test_data.drop('TIMESTAMP', axis=1)


In [142]:
import csv


results = model.predict(preprocessed_test_data)

fields = ['TRIP_ID', 'TRAVEL_TIME']
rows = []

for i, result in enumerate(results):
    row = [str(trip_ids[i]), result]
    rows.append(row)

with open("submission.csv", 'w') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields)
    csvwriter.writerows(rows)