In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import xgboost as xgb


In [3]:
# Load the training data
train_data = pd.read_csv('archive/train.csv')
print(len(train_data))

1710670


In [10]:
# Preprocess the data
# Drop irrelevant columns (TRIP_ID, TAXI_ID, etc.)
# Encode categorical variables (CALL_TYPE, DAY_TYPE)
# Handle missing values (ORIGIN_CALL, ORIGIN_STAND)
preprocessed_data = train_data.drop(['TRIP_ID', 'DAY_TYPE', 'MISSING_DATA'], axis=1)

preprocessed_data = pd.get_dummies(preprocessed_data, columns=['CALL_TYPE'])
preprocessed_data['CALL_TYPE'] = preprocessed_data['CALL_TYPE_A'].astype(int) + preprocessed_data['CALL_TYPE_B'].astype(int) * 2 + preprocessed_data['CALL_TYPE_C'].astype(int) * 3
preprocessed_data = preprocessed_data.drop(['CALL_TYPE_A', 'CALL_TYPE_B', 'CALL_TYPE_C'], axis=1)

preprocessed_data['ORIGIN_CALL'].fillna(0, inplace=True)
preprocessed_data['ORIGIN_STAND'].fillna(0, inplace=True)

preprocessed_data['TAXI_ID'] = preprocessed_data['TAXI_ID'] - 20000000  # Normalize taxi IDs

preprocessed_data['TIMESTAMP'] = pd.to_datetime(preprocessed_data['TIMESTAMP'], unit='s')
preprocessed_data['HOUR'] = preprocessed_data['TIMESTAMP'].dt.hour
preprocessed_data['DAY_OF_WEEK'] = preprocessed_data['TIMESTAMP'].dt.dayofweek
preprocessed_data['MONTH'] = preprocessed_data['TIMESTAMP'].dt.month
#preprocessed_data['YEAR'] = preprocessed_data['TIMESTAMP'].dt.year
preprocessed_data = preprocessed_data.drop('TIMESTAMP', axis=1)

X = preprocessed_data.drop(['POLYLINE'], axis=1)
y = train_data['POLYLINE'].str.count(',') * 15  # Calculate travel time in seconds

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
# Calculate the threshold for the top 5% travel time
top_percentile = np.percentile(y_train, 98)

# Filter the data based on the travel time threshold
X_train = X_train[y_train <= top_percentile]
y_train = y_train[y_train <= top_percentile]

# Repeat the filtering for the validation set
#X_val = X_val[y_val <= top_percentile]
#y_val = y_val[y_val <= top_percentile]

In [11]:
#print(top_percentile)
print(X_train)

         ORIGIN_CALL  ORIGIN_STAND  TAXI_ID  CALL_TYPE  HOUR  DAY_OF_WEEK  \
1566255       3998.0           0.0      296          1    16            0   
795817           0.0          20.0      392          2    20            2   
1144137          0.0           0.0       39          3     7            3   
180789           0.0          36.0      597          2    12            4   
479959           0.0          42.0      285          2    22            5   
...              ...           ...      ...        ...   ...          ...   
259178       24662.0           0.0      184          1    18            2   
1414414          0.0          37.0       83          2    10            6   
131932           0.0          25.0      600          2    19            6   
671155           0.0          25.0      451          2    16            4   
121958           0.0           0.0      342          2    14            4   

         MONTH  
1566255      6  
795817      12  
1144137      3  
180789 

In [6]:
# Train the Random Forest Regression model
model = xgb.XGBRegressor()
model.fit(X_train, y_train)


In [8]:
# Evaluate the model
y_pred = model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"RMSE: {np.sqrt(mse)}")

Mean Squared Error: 1673401.8391900184
RMSE: 1293.600339822937


In [8]:
print(X_val)
print(y_pred)

         ORIGIN_STAND  TAXI_ID  CALL_TYPE_A  CALL_TYPE_B  CALL_TYPE_C  \
1414161          10.0      213            0            1            0   
1397309           0.0       92            0            0            1   
560966            9.0      263            0            1            0   
1453634          13.0      503            0            1            0   
975156           10.0      174            0            1            0   
...               ...      ...          ...          ...          ...   
1064661           0.0      367            0            0            1   
1195435           0.0      547            0            0            1   
1661714          56.0      692            0            1            0   
545123           14.0      648            0            1            0   
1215152           0.0      621            1            0            0   

         DAY_TYPE_A  HOUR  DAY_OF_WEEK  MONTH  
1414161           1     7            6      5  
1397309           1     7  

In [67]:
# Load the test data
test_data = pd.read_csv('archive/test_public.csv')

# Preprocess the test data in the same way as the training data
preprocessed_test_data = test_data.drop(['TRIP_ID', 'MISSING_DATA'], axis=1)
preprocessed_test_data = pd.get_dummies(preprocessed_test_data, columns=['CALL_TYPE', 'DAY_TYPE'])
preprocessed_test_data['ORIGIN_CALL'].fillna(0, inplace=True)
preprocessed_test_data['ORIGIN_STAND'].fillna(0, inplace=True)

# Predict on the test data
test_predictions = model.predict(preprocessed_test_data)

# Create submission file
submission_df = pd.DataFrame({'TRIP_ID': test_data['TRIP_ID'], 'TRAVEL_TIME': test_predictions})
submission_df.to_csv('submission.csv', index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'test_public.csv'