In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import math
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from scipy import stats


from sklearn.svm import SVR
import xgboost as xgb


from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [7]:
# Load the dataset
train_df = pd.read_csv('archive/train.csv')
test_df = pd.read_csv('archive/test_public.csv')
coord_lookup = pd.read_csv('archive/metaData_taxistandsID_name_GPSlocation.csv')

In [8]:
def get_lon(station_no):
    return float(coord_lookup.loc[coord_lookup['ID'] == station_no]['Longitude'].iloc[0])

def get_lat(station_no):
    return float(coord_lookup.loc[coord_lookup['ID'] == station_no]['Latitude'].iloc[0])

In [22]:
encoder = LabelEncoder()
scaler = MinMaxScaler()
train_df['CALL_TYPE'] = encoder.fit_transform(train_df['CALL_TYPE'])

# Normalize timestamp
train_df['TIMESTAMP'] = pd.to_datetime(train_df['TIMESTAMP'], unit='s')
train_df['HOUR'] = train_df['TIMESTAMP'].dt.hour
train_df['DAY'] = train_df['TIMESTAMP'].dt.dayofweek
train_df['WEEK'] = train_df['TIMESTAMP'].dt.isocalendar().week
train_df['MONTH'] = train_df['TIMESTAMP'].dt.month

# Split the training data based on whether CALL_TYPE == 'B' (i.e. there is an origin stand present)
has_origin = train_df['CALL_TYPE'] == 1
has_call = train_df['ORIGIN_CALL'].notnull()

train_df_origin = train_df[has_origin]
print('train_df_origin', len(train_df_origin))

train_df_call = train_df[has_call]
print('train_df_call', len(train_df_call))

train_df_nan = train_df[~has_origin][~has_call]
print('train_df_nan', len(train_df_nan))

train_df_origin = train_df_origin.dropna(subset=['ORIGIN_STAND'])

train_df_nan['TAXI_ID'] = train_df_nan['TAXI_ID'] - 20000000  # Normalize taxi IDs
train_df_call['TAXI_ID'] = train_df_call['TAXI_ID'] - 20000000  # Normalize taxi IDs



# Split the data into features and target
features_nan = pd.DataFrame({'HOUR': train_df_nan['HOUR'], 'DAY': train_df_nan['DAY'], 'MONTH': train_df_nan['MONTH'], 
                             'TAXI_ID': train_df_nan['TAXI_ID']})
target_nan = train_df_nan['POLYLINE'].str.count(',') * 15


features_call = pd.concat([train_df_call['HOUR'], train_df_call['DAY'], train_df_call['MONTH'], train_df_call['ORIGIN_CALL'], train_df_call['TAXI_ID']], axis=1)
target_call = train_df_call['POLYLINE'].str.count(',') * 15



target_origin = train_df_origin['POLYLINE'].str.count(',') * 15
#starting_point_lon = train_df_origin.apply(lambda x : get_lon(int(x['ORIGIN_STAND'])), axis=1)
#starting_point_lat = train_df_origin.apply(lambda x : get_lat(int(x['ORIGIN_STAND'])), axis=1)
train_df_origin['TAXI_ID'] = train_df_origin['TAXI_ID'] - 20000000  # Normalize taxi IDs
features_origin = pd.concat([train_df_origin['HOUR'], train_df_origin['DAY'], train_df_origin['MONTH'], train_df_origin['ORIGIN_STAND'], train_df_origin['TAXI_ID']], axis=1)



train_df_origin 817881
train_df_call 364770
train_df_nan 528019


  train_df_nan = train_df[~has_origin][~has_call]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_call['TAXI_ID'] = train_df_call['TAXI_ID'] - 20000000  # Normalize taxi IDs


In [23]:
X_train_nan, X_val_nan, y_train_nan, y_val_nan = train_test_split(features_nan, target_nan, test_size=0.2, random_state=42)
X_train_call, X_val_call, y_train_call, y_val_call = train_test_split(features_call, target_call, test_size=0.2, random_state=42)
X_train_origin, X_val_origin, y_train_origin, y_val_origin = train_test_split(features_origin, target_origin, test_size=0.2, random_state=42)

# Check for NaN values and replace with zeros
X_train_origin[X_train_origin != X_train_origin] = 0
X_val_origin[X_val_origin != X_val_origin] = 0

In [11]:
def analytical_soln(X, Y):
    return np.linalg.inv(X.T @ X) @ X.T @ Y

In [12]:
def rmse(X, Y, w):
    L = 0  
    for i in range(len(X)):
        xi = X[i]
        yi = Y[i]
        L += (yi - np.dot(w, xi))**2

    return math.sqrt(L/len(X))

In [18]:
print(X_train_nan)

         HOUR  DAY  MONTH  TAXI_ID
614077      5    5     11      310
936021     21    5      1      196
923305     22    2      1      351
444818     22    5     10      678
857004     19    1     12      570
...       ...  ...    ...      ...
374348      5    6      9      483
862590      8    2      1      565
1205591    11    3      3      626
447296      7    6     10      464
414008      2    0      9      337

[422415 rows x 4 columns]


In [24]:
model_nan = xgb.XGBRegressor()
#model_nan = RandomForestRegressor(n_estimators=200)
model_nan.fit(X_train_nan, y_train_nan)

In [154]:
model_call = GradientBoostingRegressor(n_estimators=200)
#model_nan = RandomForestRegressor(n_estimators=200)
model_call.fit(X_train_call, y_train_call)

In [139]:
X_train_origin.columns = X_train_origin.columns.astype(str)

model_origin = GradientBoostingRegressor(n_estimators=200)
model_origin.fit(X_train_origin, y_train_origin)

In [26]:
# NAN DATA ERROR
y_pred_nan = model_nan.predict(X_val_nan)

mse_nan = mean_squared_error(y_val_nan, y_pred_nan)
print(f"RMSE (No origin): {math.sqrt(mse_nan)}")

RMSE (No origin): 1765.028946605621


In [21]:
#ORIGIN STAND DATA ERROR

X_val_origin.columns = X_train_origin.columns.astype(str)
y_pred_origin = model_origin.predict(X_val_origin)

mse_origin = mean_squared_error(y_val_origin, y_pred_origin)
print(f"RMSE (Origin): {math.sqrt(mse_origin)}")

NameError: name 'model_origin' is not defined

In [156]:
# CALL DATA ERROR
y_pred_call = model_call.predict(X_val_call)

mse_call = mean_squared_error(y_val_call, y_pred_call)
print(f"RMSE (Call): {math.sqrt(mse_call)}")

RMSE (Call): 1001.6551911453263


In [76]:
test_df['CALL_TYPE'] = encoder.fit_transform(test_df['CALL_TYPE'])

# Split the training data based on whether CALL_TYPE == 'B' (i.e. there is an origin stand present)
has_origin = test_df['CALL_TYPE'] == 1
test_df_origin = test_df[has_origin]
test_df_nan = test_df[~has_origin]
test_df_origin = test_df_origin.dropna(subset=['ORIGIN_STAND'])

# Split the data into features and target
features_nan = pd.DataFrame({ 'TIMESTAMP': test_df_nan['TIMESTAMP'] })

starting_point_lon = test_df_origin.apply(lambda x : get_lon(int(x['ORIGIN_STAND'])), axis=1)
starting_point_lat = test_df_origin.apply(lambda x : get_lat(int(x['ORIGIN_STAND'])), axis=1)
features_origin = pd.concat([test_df_origin['TIMESTAMP'], starting_point_lon, starting_point_lat], axis=1)

biases = np.ones(len(features_origin))
features_origin['x0'] = biases
biases = np.ones(len(features_nan))
features_nan['x0'] = biases

Because the timestamps for the test data are all really close together compared to the training data, there is much less of a dependence on the timestamp; in fact, for the dataset with no location, the model always spits out the same output.

In [82]:
import csv

fields = ['TRIP_ID', 'TRAVEL_TIME']
rows = []

i_n = 0
i_o = 0

for i in range(320):
    trip_id = test_df.iloc[i, 0]
    row = test_df.iloc[i]
    # print(i_n, i_o)
    if row['CALL_TYPE'] == 1:
        # print(features_origin.iloc[i_o].to_numpy().reshape(1, -1))
        rows.append([trip_id, model_origin.predict(features_origin.iloc[i_o].to_numpy().reshape(1, -1))[0]])
        i_o += 1
    else:
        # print(features_nan.iloc[i_n].to_numpy().reshape(1, -1))
        rows.append([trip_id, model_nan.predict(features_nan.iloc[i_n].to_numpy().reshape(1, -1))[0]])
        i_n += 1
    
with open("submission.csv", 'w') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields)
    csvwriter.writerows(rows)

