This notebook will be dedicated to training and testing several ML models

In [1]:
from src import data_utils, preprocessing

Run the cell below only if you haven't downloaded the zones data yet; otherwise skip it (or comment out the call with `#`).

In [4]:
# Download the taxi-zones archive from the URL below (zones data only).
ZONES_ARCHIVE_URL = 'https://d37ci6vzurychx.cloudfront.net/misc/taxi_zones.zip'
data_utils.download_zones_data(ZONES_ARCHIVE_URL)

Zip file successfully downloaded from 'https://d37ci6vzurychx.cloudfront.net/misc/taxi_zones.zip' and saved to 'f:\Pf_anyoneAI\Anyone-AI-NYC-Taxis\data\taxi_zones\zones.zip'


In [6]:
# Build the cleaned trip data set from the 2022-05 yellow-taxi parquet file.
TRIP_DATA_FILE = 'yellow_tripdata_2022-05.parquet'
df_og = data_utils.clean_trip_data(TRIP_DATA_FILE)

In [7]:
# Preview the first three rows, transposed so each column reads as a row.
df_og.iloc[:3].transpose()

Unnamed: 0,0,1,2
PULocationID,246,238,163
DOLocationID,151,74,260
fare_amount,17.0,11.0,15.5
improvement_surcharge,0.3,0.3,0.3
congestion_surcharge,2.5,2.5,2.5
airport_fee,0.0,0.0,0.0
travel_time,1122.0,829.0,922.0
time_of_day,night,night,night
day,1,1,1
month,5,5,5


In [8]:
# Separate the feature matrix from the two targets (travel time and fare amount).
(
    X,
    y_travel_time,
    y_fare_amount,
) = data_utils.get_feature_target(df_og)

In [9]:
# Split features and both targets into training and testing sets.
# Two sets of y labels come back: one for travel_time and one for fare_amount.
(
    X_train,
    X_test,
    y_train_travel_time,
    y_test_travel_time,
    y_train_fare_amount,
    y_test_fare_amount,
) = data_utils.get_train_test_sets(X, y_travel_time, y_fare_amount)

In [10]:
# shape of the training features right after the train/test split
X_train.shape

(2701206, 10)

In [11]:
# Carve a validation set out of the training set, reusing the same split helper.
# NOTE: this rebinds X_train / y_train_* to the smaller split, so re-running this
# cell on its own splits the already-reduced training set again.
(
    X_train,
    X_val,
    y_train_travel_time,
    y_val_travel_time,
    y_train_fare_amount,
    y_val_fare_amount,
) = data_utils.get_train_test_sets(
    X_train,
    y_train_travel_time,
    y_train_fare_amount,
)

In [12]:
# shape of the training features once the validation split has been taken out
X_train.shape

(2160964, 10)

In [13]:
# Preprocess the training, validation and testing feature sets together.
# NOTE: the three names are rebound to the preprocessed results, so this cell
# should run exactly once per fresh split.
(X_train, X_val, X_test) = preprocessing.preprocess_data(
    X_train,
    X_val,
    X_test,
)

Input train data shape:  (2160964, 10)
Input val data shape:  (540242, 10)
Input test data shape:  (675302, 10) 



In [14]:
# shape of the training features after preprocessing
X_train.shape

(2160964, 12)

In [15]:
# Feature names, in the order they are paired with the models'
# feature_importances_ further down.
# NOTE(review): presumably this mirrors the column order produced by
# preprocessing.preprocess_data — confirm against that function.
column_names_order = [
    'PULocationID',
    'DOLocationID',
    'improvement_surcharge',
    'congestion_surcharge',
    'airport_fee',
    'day',
    'month',
    'is_weekend',
    'distance_between_zones',
    'morning',
    'afternoon',
    'night',
]

**Initial Results with no finetuning**

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error

In [13]:
# Baseline fare-amount model: default hyperparameters, no tuning.
# random_state is pinned so a Restart & Run All rebuilds the same forest and
# therefore reproduces the same metrics and feature importances.
fare_amount_rf = RandomForestRegressor(random_state=42)

First we will start with RandomForest to predict fare amount

In [14]:
# our initial approach with no fine tuning:
# fit the fare-amount forest on the training features and fare_amount labels
fare_amount_rf.fit(X_train,y_train_fare_amount)

In [15]:
# Predict fare amounts for the test split, then report MSE, MAE and RMSE
# (printed in that order, one per line).
# NOTE(review): scored on the test split; the validation split (X_val) is unused here.
preds_fare = fare_amount_rf.predict(X_test)

mse = mean_squared_error(y_true=y_test_fare_amount, y_pred=preds_fare)
mae = mean_absolute_error(y_true=y_test_fare_amount, y_pred=preds_fare)
rmse = root_mean_squared_error(y_true=y_test_fare_amount, y_pred=preds_fare)

print(mse, mae, rmse, sep='\n')

24.841699484786893
2.253509892500316


In [20]:
# Inspect the fitted fare-amount forest: number of trees, then one line per
# feature with its importance (names come from column_names_order).
print(f'Estimators: {len(fare_amount_rf.estimators_)}')

for column_name, feature_importance in zip(column_names_order, fare_amount_rf.feature_importances_):
    print(f'Feature {column_name} has an importance of {feature_importance}')

Estimators: 100
Feature PULocationID has an importance of 0.0237133501533072
Feature DOLocationID has an importance of 0.03686951600441979
Feature improvement_surcharge has an importance of 0.0009185169552815971
Feature congestion_surcharge has an importance of 0.004183153812484522
Feature airport_fee has an importance of 0.010232475916694374
Feature day has an importance of 0.02504503143276764
Feature month has an importance of 9.032521895527296e-06
Feature is_weekend has an importance of 0.0027921230437661696
Feature distance_between_zones has an importance of 0.8846067154526205
Feature morning has an importance of 0.0027889672879186844
Feature afternoon has an importance of 0.001888194071477481
Feature night has an importance of 0.006952923347366559


Now to predict the travel time

In [21]:
# Baseline travel-time model: initial run, no fine-tuning.
# random_state is pinned so a Restart & Run All rebuilds the same forest and
# therefore reproduces the same metrics and feature importances.
travel_time_rf = RandomForestRegressor(random_state=42)

In [22]:
# fit the travel-time forest on the training features and travel_time labels
travel_time_rf.fit(X_train, y_train_travel_time)

In [25]:
# Predict travel times for the test split, then report MSE, MAE and RMSE
# (printed in that order, one per line).
# NOTE(review): scored on the test split; the validation split (X_val) is unused here.
preds_tt = travel_time_rf.predict(X_test)

mse = mean_squared_error(y_true=y_test_travel_time, y_pred=preds_tt)
mae = mean_absolute_error(y_true=y_test_travel_time, y_pred=preds_tt)
rmse = root_mean_squared_error(y_true=y_test_travel_time, y_pred=preds_tt)

print(mse, mae, rmse, sep='\n')

144603.94700995611
234.5887471512303
380.26825664253926


In [26]:
# Inspect the fitted travel-time forest: number of trees, then one line per
# feature with its importance (names come from column_names_order).
print(f'Estimators: {len(travel_time_rf.estimators_)}')

for column_name, feature_importance in zip(column_names_order, travel_time_rf.feature_importances_):
    print(f'Feature {column_name} has an importance of {feature_importance}')

Estimators: 100
Feature PULocationID has an importance of 0.028099465177961924
Feature DOLocationID has an importance of 0.03637287268226878
Feature improvement_surcharge has an importance of 0.0020396874632975298
Feature congestion_surcharge has an importance of 0.005022326081492253
Feature airport_fee has an importance of 0.0047017326354928295
Feature day has an importance of 0.051869856951568274
Feature month has an importance of 0.0001260663368062028
Feature is_weekend has an importance of 0.012249099559072628
Feature distance_between_zones has an importance of 0.7981471071985252
Feature morning has an importance of 0.005058034871721715
Feature afternoon has an importance of 0.004635727118869083
Feature night has an importance of 0.05167802392292362


**Linear Regression: initial results without fine-tuning.**

In [16]:
from sklearn.linear_model import LinearRegression

In [18]:
# FARE AMOUNT LINEAR REGRESSION
# Baseline with default settings, fitted on the training split.
# NOTE: despite the "logreg" suffix this is a LinearRegression, not a logistic
# regression; the name is kept because later cells reference it.
fare_amount_logreg = LinearRegression()
fare_amount_logreg.fit(X_train, y_train_fare_amount)

In [21]:
# Predict fare amounts with the linear model for the test split, then report
# MSE, MAE and RMSE (printed in that order, one per line).
# NOTE(review): scored on the test split; the validation split (X_val) is unused here.
preds_fare_logreg = fare_amount_logreg.predict(X_test)

mse = mean_squared_error(y_true=y_test_fare_amount, y_pred=preds_fare_logreg)
mae = mean_absolute_error(y_true=y_test_fare_amount, y_pred=preds_fare_logreg)
rmse = root_mean_squared_error(y_true=y_test_fare_amount, y_pred=preds_fare_logreg)

print(mse, mae, rmse, sep='\n')

35.18971475390373
2.904904190053445
5.9320919374116015


In [22]:
# TRAVEL TIME LINEAR REGRESSION
# Baseline with default settings, fitted on the training split.
# NOTE: despite the "logreg" suffix this is a LinearRegression, not a logistic
# regression; the name is kept because later cells reference it.
travel_time_logreg = LinearRegression()
travel_time_logreg.fit(X_train, y_train_travel_time)

In [23]:
# Predict travel times with the linear model for the test split, then report
# MSE, MAE and RMSE (printed in that order, one per line).
# NOTE(review): scored on the test split; the validation split (X_val) is unused here.
preds_travel_logreg = travel_time_logreg.predict(X_test)

mse = mean_squared_error(y_true=y_test_travel_time, y_pred=preds_travel_logreg)
mae = mean_absolute_error(y_true=y_test_travel_time, y_pred=preds_travel_logreg)
rmse = root_mean_squared_error(y_true=y_test_travel_time, y_pred=preds_travel_logreg)

print(mse, mae, rmse, sep='\n')

208218.05153921922
301.46361977293384
456.3091622345745


Now we start fine-tuning our RandomForest models

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Fresh, unfitted forest to use as the base estimator for the hyper-parameter search.
# NOTE: this rebinds fare_amount_rf, replacing the baseline model fitted earlier.
# random_state is pinned so the search results are reproducible across runs.
fare_amount_rf = RandomForestRegressor(random_state=42)

In [None]:
# Search space for the cross-validated hyper-parameter search.
# Currently an empty placeholder — TODO: add the parameters to sample from.
rf_params = dict()

In [None]:
# Randomised hyper-parameter search for the fare-amount forest.
# param_distributions is a required argument of RandomizedSearchCV; leaving it
# out makes this cell fail with a TypeError, so the rf_params search space is
# passed explicitly. random_state pins which candidates get sampled.
# NOTE(review): rf_params is empty as written, so there is nothing to sample
# until it is populated.
clf_fare_amount = RandomizedSearchCV(
    fare_amount_rf,
    param_distributions=rf_params,
    random_state=42,
)