In [1]:
import pandas as pd
import numpy as np

## Loading data
The cell below loads data needed for creating a model. This would be stored in dropbox. It is a cleaned version of the data, with data sampled appropriately and with weather and holidays included

In [35]:
! wget -O 'yellow_taxi_combined.csv' 'https://www.dropbox.com/s/nbeqql7non7rclf/yellow_taxi_combined.csv?dl=0'
! wget -O 'bike_data_combined.csv' 'https://www.dropbox.com/s/3gd83ynx6icrapw/bike_data_combined.csv?dl=0'
! wget -O 'areas.csv' 'https://www.dropbox.com/s/whm1qyz6bqas1kj/areas.csv?dl=0'

--2023-04-13 11:40:28--  https://www.dropbox.com/s/nbeqql7non7rclf/yellow_taxi_combined.csv?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 2620:100:601f:18::a27d:912, 162.125.9.18
Connecting to www.dropbox.com (www.dropbox.com)|2620:100:601f:18::a27d:912|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /s/raw/nbeqql7non7rclf/yellow_taxi_combined.csv [following]
--2023-04-13 11:40:29--  https://www.dropbox.com/s/raw/nbeqql7non7rclf/yellow_taxi_combined.csv
Reusing existing connection to [www.dropbox.com]:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucc7df0e63d751fa6f61265394b9.dl.dropboxusercontent.com/cd/0/inline/B6HSPgnUe97InKJZDWmPTk7SlNGhRr3QTfFYke3uVyCl0opjc31HeebyCG5xh7I-rLIil-gLkiXnXtFErKbNQqYIxgpd46pCS0xkiBfGaQJ67RINx6pAF9I8ATDj248t60HomFW3ZMotlLIExyqBgUG37y38g4la6Ph80-Qaa1u_Yw/file# [following]
--2023-04-13 11:40:30--  https://ucc7df0e63d751fa6f61265394b9.dl.dropboxusercontent.com/cd/0/inline/B6HSPgnUe97InKJZDWmPTk7

# Data preparation

In [21]:
model_type = "fhvhv"
df = pd.read_csv(f"{model_type}_combined.csv")

# create time based columns
df['mean_time'] = pd.to_datetime(df['mean_time'])

# get the month
df['month'] = df['mean_time'].dt.month.astype(str)

# bin the hours into 6-hour intervals
df['hours'] = pd.cut(df['mean_time'].dt.hour, bins=[0,8,11,16,19,22,24], right = False, labels=['22-8','8-11', '11-16', '16-19', '19-22','22-8'], ordered = False)

# just have some control for year
df['year'] = df['mean_time'].dt.year


# combine information from areas
areas = pd.read_csv("areas.csv")
id_to_borough_map = {}
id_to_zone_map = {}
for i in range(len(areas)):
    id_to_borough_map[areas.loc[i, "LocationID"]] = areas.loc[i, "Borough"]
    id_to_zone_map[areas.loc[i, "LocationID"]] = areas.loc[i, "Zone"]
df['PULocationBorough'] = df['PULocationID'].map(id_to_borough_map)
df['DOLocationZone'] = df['DOLocationID'].map(id_to_zone_map)

# convert snow and precipitation to binary
df['SNOW'] = (df['SNOW'] > 1)
df['PRCP'] = (df['PRCP'] > 1)

# fix holiday name
df['holiday_name'] = df['holiday_name'].fillna("None")

# filter df
df= df[df['trip_duration'] <100]

# if distance isn't present
if 'trip_distance' not in df.columns:
    distances = pd.read_csv('ids_to_distance_mapping.csv')
    df = pd.merge(df, distances)

if "TAVG" not in df.columns:
    df['TAVG'] = df['TMIN'] + 10


  df = pd.read_csv(f"{model_type}_combined.csv")


In [18]:

cols_to_use = [ 'trip_distance', 'TAVG', 'PRCP', 'SNOW',
       'holiday_name', 'month', 'hours', 'year' ]
X = df[cols_to_use]
X

Unnamed: 0,trip_distance,TAVG,PRCP,SNOW,holiday_name,month,hours,year
0,6.90,62.0,True,False,,10,19-22,2019
1,1.70,62.0,True,False,,10,22-8,2019
2,1.90,62.0,True,False,,10,22-8,2019
3,8.60,62.0,True,False,,10,8-11,2019
4,1.14,62.0,True,False,,10,8-11,2019
...,...,...,...,...,...,...,...,...
2345971,1.13,83.0,False,False,,7,16-19,2022
2345972,1.32,84.0,False,False,,7,11-16,2022
2345973,1.30,38.0,False,False,,2,22-8,2022
2345974,3.15,50.0,False,False,,4,16-19,2022


## Fit Model

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

# sample some fo the dataset
df_final = df.sample(100000, random_state=42)
# cols_to_use = [ 'trip_distance', 'month', 'hours', 'year', 'PULocationBorough' ]
cols_to_use = [ 'trip_distance', 'TAVG', 'PRCP', 'SNOW',
       'holiday_name', 'month', 'hours', 'year', 'PULocationBorough' ]

# create train test split
X = df_final[cols_to_use]
X = pd.get_dummies(X)
y = df_final['trip_duration']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# fit model
model = RandomForestRegressor(n_estimators = 100, max_depth=10, random_state= 42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# calculate errors
mean_squared_error(y_test,y_pred)
mean_absolute_error(y_test,y_pred)


5.498703149883624

In [None]:
# save model
import pickle
pickle.dump(model, open(f"{model_type}_model.pkl","wb"))