In [19]:
import pandas as pd
from datetime import datetime

import optuna
import joblib
from sklearn.metrics import mean_absolute_error

from src.paths import *
from src.data_split import *
from src.model import *

optuna.logging.set_verbosity(optuna.logging.WARNING)
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
df = pd.read_parquet(path = TRANSFORMED_PATH + "/features.parquet")
df['target_rides_next_hour'] = pd.read_parquet(path = TRANSFORMED_PATH + "/target.parquet")
df

Unnamed: 0,rides_previous_672_hours,rides_previous_671_hours,rides_previous_670_hours,rides_previous_669_hours,rides_previous_668_hours,rides_previous_667_hours,rides_previous_666_hours,rides_previous_665_hours,rides_previous_664_hours,rides_previous_663_hours,...,rides_previous_7_hours,rides_previous_6_hours,rides_previous_5_hours,rides_previous_4_hours,rides_previous_3_hours,rides_previous_2_hours,rides_previous_1_hours,pickup_hour,pickup_location_id,target_rides_next_hour
0,1,1,0,2,0,0,1,2,1,5,...,0,0,0,0,0,0,0,2022-01-29 04:00:00,1,0
1,0,4,1,2,1,2,0,1,1,3,...,1,0,0,0,0,0,0,2022-01-30 04:00:00,1,0
2,0,0,0,2,0,0,0,0,2,1,...,0,0,0,0,0,0,0,2022-01-31 04:00:00,1,1
3,0,0,0,0,1,1,0,0,0,2,...,1,0,0,0,0,0,0,2022-02-01 04:00:00,1,0
4,0,0,0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,2022-02-02 04:00:00,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72307,3,4,1,1,1,0,1,2,2,3,...,29,15,4,12,7,11,4,2022-10-27 00:00:00,265,12
72308,9,4,3,3,2,4,2,6,1,3,...,29,17,12,4,5,8,9,2022-10-28 00:00:00,265,3
72309,7,6,3,4,3,5,7,6,5,10,...,29,13,9,10,5,10,7,2022-10-29 00:00:00,265,6
72310,6,5,8,6,6,0,1,2,8,6,...,8,10,7,3,3,6,2,2022-10-30 00:00:00,265,10


### Feature Engineering, Hyperparameter Tuning and Model training

- Feature Engineering
    1. Add new feature by taking an average of rides taken in the last 4 weeks
    2. Extract new features from pickup timestamp
    3. Gecode the location(id) into meaningful coordinates (latitudes,longitudes)
- Convert user-defined functions into transformers that can be used further down the line in pipelines

In [3]:
# Train - Test split

cutoff_dt = datetime(year=2022, month=6, day=1, hour=0, minute=0, second=0)
X_train,y_train,X_test,y_test = train_test_split(df,cutoff_dt)

print("Training Subset : ",X_train.shape,y_train.shape)
print("Validation data : ",X_test.shape,y_test.shape)

Training Subset :  (32284, 674) (32284,)
Validation data :  (40028, 674) (40028,)


#### Hyperparameter tuning using Optuna

In [4]:
# Create an objective function that Optuna tries to minimize

def objective(trial):
    
    r_params = {
        "metric": "mae",
        "verbosity": -1,
        "num_leaves":trial.suggest_int('num_leaves',2,256), 
        "bagging_fraction":trial.suggest_float('bagging_fraction',0.2,1.0),
        "feature_fraction":trial.suggest_float('feature_fraction',0.2,1.0),
        "min_data_in_leaf":trial.suggest_int('min_data_in_leaf',3,100)
    }
    
    pipe = get_pipeline(r_params)
    pipe.fit(X_train,y_train)
    
    y_pred_lgb_ht = pipe.predict(X_test)
    error_metric_lgb_ht = mean_absolute_error(y_test,y_pred_lgb_ht)
    print("{:.4f}".format(error_metric_lgb_ht))
    
    return error_metric_lgb_ht

In [5]:
# Run the optimization (i.e., minimization here)

study = optuna.create_study(study_name='lgb_nyc_study',direction='minimize')
study.optimize(objective, n_trials=100, show_progress_bar=True)

  0%|          | 0/100 [00:00<?, ?it/s]

2.5014
2.5050
2.4858
2.5328
2.5516
2.5890
2.7489
2.5038
2.5061
2.5667
2.5176
2.5073
2.5038
2.5708
2.5155
2.5259
2.4979
2.5125
2.4999
2.5146
2.5000
2.5045
2.4956
2.5020
2.4884
2.5011
2.4898
2.4958
2.5025
2.5053
2.4861
2.4918
2.5375
2.4984
2.4944
2.4920
2.5155
2.4909
2.5306
2.5088
2.5180
2.5042
2.4931
2.4910
2.4988
2.4844
2.5400
2.5439
2.4986
2.6042
2.5264
2.4845
2.4760
2.5076
2.5114
2.4969
2.5008
2.6221
2.4957
2.4900
2.4954
2.4873
2.4836
2.4868
2.4864
2.4957
2.4911
2.4911
2.4879
2.5116
2.4856
2.4954
2.4990
2.4887
2.4993
2.5057
2.5087
2.4791
2.4917
2.5182
2.4900
2.4996
2.4840
2.4894
2.4929
2.4866
2.5058
2.5030
2.4847
2.4976
2.4862
2.4952
2.4782
2.4750
2.4782
2.4820
2.4807
2.4948
2.5024
2.4910


In [6]:
print('Best parameters', study.best_params)
print('Best score', study.best_value)
print('Best model', study.best_trial)

Best parameters {'num_leaves': 116, 'bagging_fraction': 0.40117538729933633, 'feature_fraction': 0.7572269855330722, 'min_data_in_leaf': 78}
Best score 2.474986091581983
Best model FrozenTrial(number=93, state=TrialState.COMPLETE, values=[2.474986091581983], datetime_start=datetime.datetime(2024, 2, 11, 0, 55, 21, 552038), datetime_complete=datetime.datetime(2024, 2, 11, 0, 55, 33, 98132), params={'num_leaves': 116, 'bagging_fraction': 0.40117538729933633, 'feature_fraction': 0.7572269855330722, 'min_data_in_leaf': 78}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'num_leaves': IntDistribution(high=256, log=False, low=2, step=1), 'bagging_fraction': FloatDistribution(high=1.0, log=False, low=0.2, step=None), 'feature_fraction': FloatDistribution(high=1.0, log=False, low=0.2, step=None), 'min_data_in_leaf': IntDistribution(high=100, log=False, low=3, step=1)}, trial_id=93, value=None)


#### Re-fit the model using the best parameters found

In [7]:
pipe = get_pipeline(study.best_params)
pipe.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.206501 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 154550
[LightGBM] [Info] Number of data points in the train set: 32284, number of used features: 676
[LightGBM] [Info] Start training from score 11.714534


In [24]:
### Save the model for later use

joblib.dump(pipe,MODEL_PATH+'nyc_pipe.joblib')

['/Users/reshma/AI/MLOPS Project/taxi_demand_predictor/model/nyc_pipe.joblib']

In [25]:
pipe_again = joblib.load(MODEL_PATH+'nyc_pipe.joblib')

In [26]:
y_pred_lgb_ht = pipe_again.predict(X_test)



#### Evaluate model

In [27]:
error_metric_lgb_ht = mean_absolute_error(y_test,y_pred_lgb_ht)
print("{:.4f}".format(error_metric_lgb_ht))

2.4750
