This notebook trains and evaluates several ML models that predict two targets for NYC taxi trips: fare amount and travel time.

In [1]:
from src import data_utils, preprocessing

In [5]:
# Build the cleaned trip data set from the 2022-05 yellow-taxi parquet file.
# NOTE: the recorded run of this cell failed with a DataSourceError because
# data/taxi_zones/taxi_zones.shx could not be opened -- the shapefile's .shx
# companion file has to be present before this cell can succeed.
trip_data_file = 'yellow_tripdata_2022-05.parquet'
df_og = data_utils.clean_trip_data(trip_data_file)

DataSourceError: Unable to open f:\Pf_anyoneAI\Anyone-AI-NYC-Taxis\data\taxi_zones\taxi_zones.shx or f:\Pf_anyoneAI\Anyone-AI-NYC-Taxis\data\taxi_zones\taxi_zones.SHX. Set SHAPE_RESTORE_SHX config option to YES to restore or create it.

In [None]:
# Preview the first three cleaned trips, transposed so every column is shown as a row.
df_og.head(3).transpose()

In [4]:
# Separate the feature frame from the two regression targets
# (travel time and fare amount).
(
    X,
    y_travel_time,
    y_fare_amount,
) = data_utils.get_feature_target(df_og)

In [5]:
# Split features and both targets into training and testing sets.
# Each target (travel_time, fare_amount) gets its own train/test pair.
(
    X_train,
    X_test,
    y_train_travel_time,
    y_test_travel_time,
    y_train_fare_amount,
    y_test_fare_amount,
) = data_utils.get_train_test_sets(X, y_travel_time, y_fare_amount)

In [6]:
# Dimensions of the training features right after the train/test split.
X_train.shape

(2701206, 10)

In [7]:
# Carve a validation set out of the training set, reusing the same helper.
# NOTE: X_train and the y_train_* names are overwritten with the smaller
# split, so running this cell a second time shrinks the training set again.
(
    X_train,
    X_val,
    y_train_travel_time,
    y_val_travel_time,
    y_train_fare_amount,
    y_val_fare_amount,
) = data_utils.get_train_test_sets(
    X_train,
    y_train_travel_time,
    y_train_fare_amount,
)

In [8]:
# Dimensions of the training features after the validation rows were split off.
X_train.shape

(2160964, 10)

In [9]:
# Run the project's preprocessing step over the training, validation and
# testing feature sets.
# NOTE: the three names are overwritten with the preprocessed versions, so
# re-running this cell feeds already-preprocessed data back into the helper.
(X_train, X_val, X_test) = preprocessing.preprocess_data(
    X_train,
    X_val,
    X_test,
)

Input train data shape:  (2160964, 10)
Input val data shape:  (540242, 10)
Input test data shape:  (675302, 10) 



In [10]:
# Dimensions of the training features after preprocessing.
X_train.shape

(2160964, 12)

**Initial results without fine-tuning**

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error

In [13]:
# Baseline forest for fare amount, using scikit-learn's default hyperparameters.
# random_state pins the bootstrap and feature sampling so the reported metrics
# can be reproduced; n_jobs=-1 builds the trees on all CPU cores, which matters
# with a training set of roughly 2.16M rows.
fare_amount_rf = RandomForestRegressor(random_state=42, n_jobs=-1)

First we will start with RandomForest to predict fare amount

In [None]:
# Fit the untuned baseline forest on the preprocessed training set.
fare_amount_rf.fit(X=X_train, y=y_train_fare_amount)

In [None]:
# Score the fare-amount forest on the testing set.
preds_fare = fare_amount_rf.predict(X_test)

mse = mean_squared_error(y_test_fare_amount, preds_fare)
mae = mean_absolute_error(y_test_fare_amount, preds_fare)
rmse = root_mean_squared_error(y_test_fare_amount, preds_fare)

# One value per line, in the order MSE, MAE, RMSE.
print(mse, mae, rmse, sep='\n')

162.97555817113022
8.612743355634692


In [None]:
# Inspect the fitted fare-amount forest: number of trees and the importance
# assigned to each input feature, alongside the column names of X.
# NOTE: X holds the columns from before preprocessing, so they are printed
# for reference only.
n_trees = len(fare_amount_rf.estimators_)
importances = fare_amount_rf.feature_importances_

print(f'Estimators: {n_trees}')
print(f'Feature importances: {importances}')
print(f'Original columns: {X.columns}')

Estimators: 100
Feature importances: [0.         0.89045782 0.10954218]


Now to predict the travel time

In [38]:
# Baseline forest for travel time: initial run, no fine-tuning.
# random_state makes the fitted forest (and its metrics) reproducible;
# n_jobs=-1 trains the trees on all CPU cores.
travel_time_rf = RandomForestRegressor(random_state=42, n_jobs=-1)

In [39]:
# Fit the untuned travel-time forest on the preprocessed training set.
travel_time_rf.fit(X=X_train, y=y_train_travel_time)

In [41]:
# Score the travel-time forest on the testing set.
preds_tt = travel_time_rf.predict(X_test)

mse = mean_squared_error(y_test_travel_time, preds_tt)
mae = mean_absolute_error(y_test_travel_time, preds_tt)
rmse = root_mean_squared_error(y_test_travel_time, preds_tt)

# One value per line, in the order MSE, MAE, RMSE.
print(mse, mae, rmse, sep='\n')

631929.452130803
550.9514718045573
794.9399047291581


**Logistic Regression: initial results without fine-tuning.**

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression

In [None]:
# FARE AMOUNT LINEAR REGRESSION
# Fare amount is a continuous target, so a regressor is needed here.
# LogisticRegression is a classifier: given such a target it either rejects
# the labels as continuous or treats every distinct value as its own class.
# The `fare_amount_logreg` name is kept so later cells that use it still work.
fare_amount_logreg = LinearRegression()
fare_amount_logreg.fit(X_train, y_train_fare_amount)

In [None]:
# Score the fare-amount linear-model baseline on the testing set.
preds_fare_logreg = fare_amount_logreg.predict(X_test)

mse = mean_squared_error(y_test_fare_amount, preds_fare_logreg)
mae = mean_absolute_error(y_test_fare_amount, preds_fare_logreg)
rmse = root_mean_squared_error(y_test_fare_amount, preds_fare_logreg)

# One value per line, in the order MSE, MAE, RMSE.
print(mse, mae, rmse, sep='\n')

In [None]:
# TRAVEL TIME LINEAR REGRESSION
# Travel time is a continuous target, so a regressor is needed here.
# LogisticRegression is a classifier: given such a target it either rejects
# the labels as continuous or treats every distinct value as its own class.
# The `travel_time_logreg` name is kept so later cells that use it still work.
travel_time_logreg = LinearRegression()
travel_time_logreg.fit(X_train, y_train_travel_time)

In [None]:
# Score the travel-time linear-model baseline on the testing set.
preds_travel_logreg = travel_time_logreg.predict(X_test)

mse = mean_squared_error(y_test_travel_time, preds_travel_logreg)
mae = mean_absolute_error(y_test_travel_time, preds_travel_logreg)
rmse = root_mean_squared_error(y_test_travel_time, preds_travel_logreg)

# One value per line, in the order MSE, MAE, RMSE.
print(mse, mae, rmse, sep='\n')

Now we start fine-tuning our RandomForest models.

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Fresh, unfitted forest that serves as the base estimator for the
# hyperparameter search. NOTE: this rebinds `fare_amount_rf`, so the fitted
# baseline forest from earlier in the notebook is no longer reachable by name.
# random_state keeps every candidate fit reproducible; n_jobs=-1 trains the
# trees of each candidate on all CPU cores.
fare_amount_rf = RandomForestRegressor(random_state=42, n_jobs=-1)

In [None]:
# Candidate hyperparameter values for the randomized cross-validation search.
# Every key is a RandomForestRegressor constructor argument; a list of values
# is sampled uniformly by RandomizedSearchCV. An empty dict would leave the
# search with nothing to explore.
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', 1.0],
}

In [None]:
# Randomized hyperparameter search for the fare-amount forest.
# `param_distributions` is a required argument of RandomizedSearchCV; leaving
# it out raises a TypeError as soon as the object is constructed.
# Candidates are ranked by (negated) RMSE, the same error measure reported for
# the baselines; cv=3 keeps the number of forest fits manageable on a training
# set this large, and random_state makes the sampled candidates reproducible.
clf_fare_amount = RandomizedSearchCV(
    fare_amount_rf,
    param_distributions=rf_params,
    n_iter=10,
    scoring='neg_root_mean_squared_error',
    cv=3,
    random_state=42,
)