In [60]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import math
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from scipy import stats

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [45]:
# Load the dataset
train_df = pd.read_csv('archive/train.csv')
test_df = pd.read_csv('archive/test_public.csv')
coord_lookup = pd.read_csv('archive/metaData_taxistandsID_name_GPSlocation.csv')

In [46]:
def get_lon(station_no):
    return float(coord_lookup.loc[coord_lookup['ID'] == station_no]['Longitude'].iloc[0])

def get_lat(station_no):
    return float(coord_lookup.loc[coord_lookup['ID'] == station_no]['Latitude'].iloc[0])

In [73]:
encoder = LabelEncoder()
scaler = MinMaxScaler()
train_df['CALL_TYPE'] = encoder.fit_transform(train_df['CALL_TYPE'])

# Normalize timestamp
train_df['TIMESTAMP'] = pd.to_datetime(train_df['TIMESTAMP'], unit='s')
train_df['MINUTE'] = train_df['TIMESTAMP'].dt.minute
train_df['HOUR'] = train_df['TIMESTAMP'].dt.hour
train_df['DAY'] = train_df['TIMESTAMP'].dt.dayofweek
train_df['MONTH'] = train_df['TIMESTAMP'].dt.month

# Split the training data based on whether CALL_TYPE == 'B' (i.e. there is an origin stand present)
has_origin = train_df['CALL_TYPE'] == 1
train_df_origin = train_df[has_origin]
train_df_nan = train_df[~has_origin]
train_df_origin = train_df_origin.dropna(subset=['ORIGIN_STAND'])



# Split the data into features and target
features_nan = pd.DataFrame({ 'MINUTE': train_df_nan['MINUTE'], 'HOUR': train_df_nan['HOUR'], 'DAY': train_df_nan['DAY'], 'MONTH': train_df_nan['MONTH'] })


target_nan = train_df_nan['POLYLINE'].str.count(',') * 15


target_origin = train_df_origin['POLYLINE'].str.count(',') * 15
starting_point_lon = train_df_origin.apply(lambda x : get_lon(int(x['ORIGIN_STAND'])), axis=1)
starting_point_lat = train_df_origin.apply(lambda x : get_lat(int(x['ORIGIN_STAND'])), axis=1)
features_origin = pd.concat([train_df_origin['HOUR'], train_df_origin['DAY'], train_df_origin['MONTH'], starting_point_lon, starting_point_lat], axis=1)

print(features_nan)

         TIMESTAMP         0          1
1              0.0 -8.641984  41.159980
15             0.0 -8.628468  41.157089
16             0.0 -8.584047  41.163222
23             0.0 -8.604246  41.160715
28             0.0 -8.613217  41.154965
...            ...       ...        ...
1710654        0.0 -8.585877  41.148628
1710661        0.0 -8.585877  41.148628
1710662        0.0 -8.605720  41.144253
1710668        0.0 -8.630466  41.155120
1710669        0.0 -8.615952  41.140517

[806579 rows x 3 columns]


In [48]:
X_train_nan, X_val_nan, y_train_nan, y_val_nan = train_test_split(features_nan, target_nan, test_size=0.2, random_state=42)
X_train_origin, X_val_origin, y_train_origin, y_val_origin = train_test_split(features_origin, target_origin, test_size=0.2, random_state=42)

# Check for NaN values and replace with zeros
X_train_origin[X_train_origin != X_train_origin] = 0
X_val_origin[X_val_origin != X_val_origin] = 0

In [49]:
def analytical_soln(X, Y):
    return np.linalg.inv(X.T @ X) @ X.T @ Y

In [50]:
def rmse(X, Y, w):
    L = 0
    for i in range(len(X)):
        xi = X[i]
        yi = Y[i]
        L += (yi - np.dot(w, xi))**2

    return math.sqrt(L/len(X))

In [51]:
# Adding bias columns (this seems to help a bit)
biases = np.ones(len(X_train_nan))
X_train_nan['x0'] = biases

biases = np.ones(len(X_val_nan))
X_val_nan['x0'] = biases

weights = analytical_soln(X_train_nan.values, y_train_nan.values)
print(weights, rmse(X_val_nan.values, y_val_nan.values, weights))

[2.04106448e-07 1.25684703e+03] 1723.206762070633


In [52]:
model_nan = GradientBoostingRegressor(n_estimators=200)
model_nan.fit(X_train_nan, y_train_nan)

In [53]:
X_train_origin.columns = X_train_origin.columns.astype(str)

biases = np.ones(len(X_train_origin))
X_train_origin['x0'] = biases

biases = np.ones(len(X_val_origin))
X_val_origin['x0'] = biases

model_origin = GradientBoostingRegressor(n_estimators=200)
model_origin.fit(X_train_origin, y_train_origin)

In [81]:
# Make predictions on the validation set
X_val_origin.columns = X_train_origin.columns.astype(str)
y_pred_nan = model_nan.predict(X_val_nan)
y_pred_origin = model_origin.predict(X_val_origin)

# Calculate the mean squared error on the validation set
mse_nan = mean_squared_error(y_val_nan, y_pred_nan)
print(f"RMSE (No origin): {math.sqrt(mse_nan)}")
mse_origin = mean_squared_error(y_val_origin, y_pred_origin)
print(f"RMSE (Origin): {math.sqrt(mse_origin)}")

          TIMESTAMP   x0
1208540  1395376728  1.0
309050   1378735304  1.0
1363726  1398323135  1.0
571553   1383247437  1.0
1151405  1394212151  1.0
...             ...  ...
799384   1387462996  1.0
451346   1381137045  1.0
1242931  1396029435  1.0
1098318  1393264219  1.0
1035992  1392116047  1.0

[178558 rows x 2 columns]
[1553.70234293 1526.68766512 1572.24095064 ... 1545.15514677 1546.08498487
 1530.85897794]
RMSE (No origin): 1719.3278540791728
RMSE (Origin): 944.1465424241857


In [76]:
test_df['CALL_TYPE'] = encoder.fit_transform(test_df['CALL_TYPE'])

# Split the training data based on whether CALL_TYPE == 'B' (i.e. there is an origin stand present)
has_origin = test_df['CALL_TYPE'] == 1
test_df_origin = test_df[has_origin]
test_df_nan = test_df[~has_origin]
test_df_origin = test_df_origin.dropna(subset=['ORIGIN_STAND'])

# Split the data into features and target
features_nan = pd.DataFrame({ 'TIMESTAMP': test_df_nan['TIMESTAMP'] })

starting_point_lon = test_df_origin.apply(lambda x : get_lon(int(x['ORIGIN_STAND'])), axis=1)
starting_point_lat = test_df_origin.apply(lambda x : get_lat(int(x['ORIGIN_STAND'])), axis=1)
features_origin = pd.concat([test_df_origin['TIMESTAMP'], starting_point_lon, starting_point_lat], axis=1)

biases = np.ones(len(features_origin))
features_origin['x0'] = biases
biases = np.ones(len(features_nan))
features_nan['x0'] = biases

Because the timestamps for the test data are all really close together compared to the training data, there is much less of a dependence on the timestamp; in fact, for the dataset with no location, the model always spits out the same output.

In [82]:
import csv

fields = ['TRIP_ID', 'TRAVEL_TIME']
rows = []

i_n = 0
i_o = 0

for i in range(320):
    trip_id = test_df.iloc[i, 0]
    row = test_df.iloc[i]
    # print(i_n, i_o)
    if row['CALL_TYPE'] == 1:
        # print(features_origin.iloc[i_o].to_numpy().reshape(1, -1))
        rows.append([trip_id, model_origin.predict(features_origin.iloc[i_o].to_numpy().reshape(1, -1))[0]])
        i_o += 1
    else:
        # print(features_nan.iloc[i_n].to_numpy().reshape(1, -1))
        rows.append([trip_id, model_nan.predict(features_nan.iloc[i_n].to_numpy().reshape(1, -1))[0]])
        i_n += 1
    
with open("submission.csv", 'w') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields)
    csvwriter.writerows(rows)

