In [1]:
from src import data_utils, preprocessing
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load the May-2022 yellow-taxi trip file through the project's cleaning helper.
df_og = data_utils.clean_trip_data(
    'yellow_tripdata_2022-05.parquet',
)

In [3]:
# Split the cleaned frame into the feature set and the two targets
# (travel time and fare amount).
(
    X,
    y_travel_time,
    y_fare_amount,
) = data_utils.get_feature_target(df_og)

In [4]:
# Hold out a test set. The helper returns the split features followed by a
# train/test pair of labels for each target: travel_time first, then fare_amount.
(
    X_train,
    X_test,
    y_train_travel_time,
    y_test_travel_time,
    y_train_fare_amount,
    y_test_fare_amount,
) = data_utils.get_train_test_sets(X, y_travel_time, y_fare_amount)

In [5]:
# Carve a validation set out of the training portion, for both targets.
# NOTE: X_train and the y_train_* names are both inputs and outputs here, so
# re-running this cell feeds the already-reduced training data back into the helper.
(
    X_train,
    X_val,
    y_train_travel_time,
    y_val_travel_time,
    y_train_fare_amount,
    y_val_fare_amount,
) = data_utils.get_train_test_sets(
    X_train,
    y_train_travel_time,
    y_train_fare_amount,
)

In [6]:
# Run the project's preprocessing over the three feature sets.
# NOTE: the results overwrite X_train / X_val / X_test, so re-running this cell
# passes already-preprocessed data back into preprocess_data.
(
    X_train,
    X_val,
    X_test,
) = preprocessing.preprocess_data(X_train, X_val, X_test)

Input train data shape:  (2160964, 10)
Input val data shape:  (540242, 10)
Input test data shape:  (675302, 10) 



In [7]:
# Feature-column names, in order (not referenced by any other cell shown in this notebook).
# NOTE(review): 12 names are listed, while the shapes printed by the preprocessing cell
# report 10 input columns -- presumably preprocessing changes the column set; confirm.
column_names_order = [
    'PULocationID',
    'DOLocationID',
    'improvement_surcharge',
    'congestion_surcharge',
    'airport_fee',
    'day',
    'month',
    'is_weekend',
    'distance_between_zones',
    'morning',
    'afternoon',
    'night',
]

**Baseline MLP model (default hyperparameters, no tuning)**

In [9]:
from sklearn.base import clone
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.neural_network import MLPRegressor

In [10]:
# Baseline MLP with scikit-learn's default hyperparameters.
# Seeded with the same random_state=42 used for the tuned estimator further down, so the
# weight initialisation and shuffling (and therefore the baseline metrics) are
# reproducible from one run to the next.
mlp_model = MLPRegressor(random_state=42)

In [None]:
# NOTE(review): this scaling step is disabled; the models below are fit on X_train as
# returned by preprocessing.preprocess_data. Presumably that helper already scales the
# features -- confirm, then either delete this cell or re-enable it.
#from sklearn.preprocessing import StandardScaler
# Scale the data before feeding it to the model
#scaler = StandardScaler()

#X_train_scaled = scaler.fit_transform(X_train)
#X_val_scaled = scaler.transform(X_val)


In [18]:
# Fit the baseline MLP on the fare-amount target.
mlp_model.fit(
    X=X_train,
    y=y_train_fare_amount,
)



In [19]:
# Score the baseline MLP on the validation set (fare-amount target).
mlp_preds_fare = mlp_model.predict(X_val)

r2_fare = r2_score(y_true=y_val_fare_amount, y_pred=mlp_preds_fare)
mae_fare = mean_absolute_error(y_true=y_val_fare_amount, y_pred=mlp_preds_fare)
rmse_fare = root_mean_squared_error(y_true=y_val_fare_amount, y_pred=mlp_preds_fare)
# `rmse` is kept as a second name for the same value, as in the original cell.
rmse = rmse_fare

print(f'R2 score: {r2_fare}')
print(f'Mean Absolute Error: {mae_fare}')
print(f'Root Mean Squared Error: {rmse_fare}')

R2 score: 0.5442223045654946
Mean Absolute Error: 2.6273268137466372
Root Mean Squared Error: 10.700281756875249


**Hyperparameter tuning (randomized search)**

In [20]:
from sklearn.model_selection import RandomizedSearchCV
# Search space for the randomized hyperparameter search.
# Every entry is a finite collection of candidate values; alpha gets
# 10 log-spaced values between 1e-5 and 1e5.
param_dist = dict(
    hidden_layer_sizes=[(100,), (100, 50), (150, 100), (200,)],
    activation=['relu', 'tanh', 'logistic'],
    solver=['adam', 'sgd'],
    max_iter=[300, 500, 1000],
    alpha=np.logspace(-5, 5, num=10),
    learning_rate=['constant', 'adaptive'],
)

In [21]:
# Randomized hyperparameter search over param_dist for an MLP regressor.
# Cost: 50 sampled candidates x 5 CV folds = 250 fits, run in parallel via n_jobs=-1.
# The run recorded in this notebook was interrupted before completing.
# Both the estimator and the parameter sampler are seeded with 42.
random_search = RandomizedSearchCV(
    estimator=MLPRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    random_state=42,
    n_jobs=-1,
)

In [22]:
# Run the search against the fare-amount target.
random_search.fit(
    X=X_train,
    y=y_train_fare_amount,
)

KeyboardInterrupt: 

In [None]:
# Fare predictions come straight from the tuned model: the search was fit on
# y_train_fare_amount, so best_estimator_ is a fare-amount model.
y_pred_fare = random_search.best_estimator_.predict(X_val)

# Travel time needs its own fitted model. Calling best_estimator_.predict for this
# target would only return fare predictions again, so clone the best configuration
# (same hyperparameters, unfitted) and fit it on the travel-time target.
# NOTE: these hyperparameters were selected for fare_amount; run a separate search on
# y_train_travel_time if tuned travel-time performance is required.
travel_time_model = clone(random_search.best_estimator_)
travel_time_model.fit(X_train, y_train_travel_time)
y_pred_travel = travel_time_model.predict(X_val)

In [None]:
# Validation metrics for the tuned model's predictions.
# root_mean_squared_error already returns the RMSE and has no `squared` parameter
# (that flag belonged to mean_squared_error), so passing squared=False raised a TypeError.

# Fare amount
r2_fare = r2_score(y_val_fare_amount, y_pred_fare)
mae_fare = mean_absolute_error(y_val_fare_amount, y_pred_fare)
rmse_fare = root_mean_squared_error(y_val_fare_amount, y_pred_fare)

# Travel time
r2_travel = r2_score(y_val_travel_time, y_pred_travel)
mae_travel = mean_absolute_error(y_val_travel_time, y_pred_travel)
rmse_travel = root_mean_squared_error(y_val_travel_time, y_pred_travel)

In [None]:
# Report the validation metrics, one line per target.
print(
    f"Fare Amount MLP Model - R2: {r2_fare}, MAE: {mae_fare}, RMSE: {rmse_fare}"
)
print(
    f"Travel Time MLP Model - R2: {r2_travel}, MAE: {mae_travel}, RMSE: {rmse_travel}"
)