This notebook will be dedicated to training and testing several ML models

In [1]:
from src import data_utils, preprocessing

In [2]:
# first we get our cleaned data set
df_og = data_utils.clean_trip_data('yellow_tripdata_2022-05.parquet')

In [3]:
df_og.head(3).T

Unnamed: 0,0,1,2
PULocationID,246,238,163
DOLocationID,151,74,260
fare_amount,17.0,11.0,15.5
improvement_surcharge,0.3,0.3,0.3
congestion_surcharge,2.5,2.5,2.5
airport_fee,0.0,0.0,0.0
travel_time,1122.0,829.0,922.0
time_of_day,night,night,night
day,1,1,1
month,5,5,5


In [4]:
# now we get our X, and ys
X, y_travel_time, y_fare_amount = data_utils.get_feature_target(df_og)

In [5]:
# and we then get our training and testing sets
# we are going to get two sets of y labels, one for travel_time and another for fare_amount
X_train, X_test, y_train_travel_time, y_test_travel_time, y_train_fare_amount, y_test_fare_amount = data_utils.get_train_test_sets(X, y_travel_time, y_fare_amount)

In [6]:
X_train.shape

(2701206, 10)

In [7]:
# we can further split the training set into training and validation
X_train, X_val, y_train_travel_time, y_val_travel_time, y_train_fare_amount, y_val_fare_amount = data_utils.get_train_test_sets(X_train, y_train_travel_time, y_train_fare_amount)

In [8]:
X_train.shape

(2160964, 10)

In [9]:
# now using the training, validation, and testing sets we can preprocess our data
X_train, X_val, X_test = preprocessing.preprocess_data(X_train, X_val, X_test)

Input train data shape:  (2160964, 10)
Input val data shape:  (540242, 10)
Input test data shape:  (675302, 10) 



In [10]:
X_train.shape

(2160964, 12)

In [18]:
column_names_order = ['PULocationID','DOLocationID','improvement_surcharge','congestion_surcharge','airport_fee','day','month','is_weekend','distance_between_zones','morning','afternoon','night']

**Initial Results with no finetuning**

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error

In [13]:
# initial run without choosing any new hyperparameters
fare_amount_rf = RandomForestRegressor()

First we will start with RandomForest to predict fare amount

In [14]:
# our initial approach with no fine tuning
fare_amount_rf.fit(X_train,y_train_fare_amount)

In [15]:
preds_fare = fare_amount_rf.predict(X_test)
# measure our results
mse = mean_squared_error(y_test_fare_amount, preds_fare)
mae = mean_absolute_error(y_test_fare_amount, preds_fare)
rmse = root_mean_squared_error(y_test_fare_amount, preds_fare)

print(mse)
print(mae)

24.841699484786893
2.253509892500316


In [None]:
# from the previous exercise we now get some information

print(f'Estimators: {len(fare_amount_rf.estimators_)}')
for feature_importance,column_name in zip(fare_amount_rf.feature_importances_,column_names_order):
    print(f'Feature {column_name} has an importance of {feature_importance}')

Estimators: 100
Feature ['PULocationID', 'DOLocationID', 'improvement_surcharge', 'congestion_surcharge', 'airport_fee', 'day', 'month', 'is_weekend', 'distance_between_zones', 'morning', 'afternoon', 'night'] has an importance of [2.37133502e-02 3.68695160e-02 9.18516955e-04 4.18315381e-03
 1.02324759e-02 2.50450314e-02 9.03252190e-06 2.79212304e-03
 8.84606715e-01 2.78896729e-03 1.88819407e-03 6.95292335e-03]
Feature ['PULocationID', 'DOLocationID', 'improvement_surcharge', 'congestion_surcharge', 'airport_fee', 'day', 'month', 'is_weekend', 'distance_between_zones', 'morning', 'afternoon', 'night'] has an importance of [2.37133502e-02 3.68695160e-02 9.18516955e-04 4.18315381e-03
 1.02324759e-02 2.50450314e-02 9.03252190e-06 2.79212304e-03
 8.84606715e-01 2.78896729e-03 1.88819407e-03 6.95292335e-03]
Feature ['PULocationID', 'DOLocationID', 'improvement_surcharge', 'congestion_surcharge', 'airport_fee', 'day', 'month', 'is_weekend', 'distance_between_zones', 'morning', 'afternoon', '

Now to predict the travel time

In [38]:
# initial run, no funetuning
travel_time_rf = RandomForestRegressor()

In [39]:
travel_time_rf.fit(X_train, y_train_travel_time)

In [41]:
preds_tt = travel_time_rf.predict(X_test)
# measure our results
mse = mean_squared_error(y_test_travel_time, preds_tt)
mae = mean_absolute_error(y_test_travel_time, preds_tt)
rmse = root_mean_squared_error(y_test_travel_time, preds_tt)

print(mse)
print(mae)
print(rmse)

631929.452130803
550.9514718045573
794.9399047291581


Now we start finetuning our RandomFores models

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
fare_amount_rf = RandomForestRegressor()

In [None]:
# we will choose from the following parameters to perform our cross validation search
rf_params = {}

In [None]:
clf_fare_amount = RandomizedSearchCV(fare_amount_rf,)