## LightGBM Hyperparameter Tuning

In [52]:
import warnings
warnings.filterwarnings('ignore')


In [53]:
# load our data
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.parquet')
df.head()

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,...,2.0,0.0,1.0,0.0,0.0,0.0,0.0,2022-01-29,1,0.0
1,0.0,0.0,0.0,0.0,0.0,4.0,1.0,2.0,1.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2022-01-30,1,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,1.0,2.0,0.0,0.0,0.0,0.0,2022-01-31,1,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,2.0,1.0,0.0,1.0,1.0,0.0,0.0,2022-02-01,1,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-02-02,1,0.0


In [54]:
# split in to training and testing
from src.data_split import train_test_split
from datetime import datetime

X_train, y_train, X_test, y_test = train_test_split(
    df,
    # have to use datetime here
    cutoff_date=datetime(2022, 6, 1, 0, 0, 0),
    target_column_name='target_rides_next_hour'
)

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(32595, 674)
y_train.shape=(32595,)
X_test.shape=(56710, 674)
y_test.shape=(56710,)


#### Define Optuna Objective Function

This guides the search for the optimal hyperparameters. We have to define a function that takes a **trial** object as input and returns a scaler that represents the performance of the model for the given hyperparameters. 

In [55]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import optuna
from src.model import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:
    hyperparams = {
        "metric": 'mae',
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),   
    }
       
    tss = KFold(n_splits=3)
    scores = []
        
    for train_index, val_index in tss.split(X_train):
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index, :]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]
        
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)
        
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)
        scores.append(mae)
   
    return np.array(scores).mean()

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

[I 2024-09-18 13:46:16,704] A new study created in memory with name: no-name-57aa887d-93f2-44c0-abd7-5559613a2bab
[W 2024-09-18 13:46:16,824] Trial 0 failed with parameters: {'num_leaves': 78, 'feature_fraction': 0.32314358136699095, 'bagging_fraction': 0.505293946042574, 'min_child_samples': 60} because of the following error: AttributeError("'NoneType' object has no attribute 'fit'").
Traceback (most recent call last):
  File "c:\Users\ryans\taxi_demand_predictor\.venv\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\ryans\AppData\Local\Temp\ipykernel_20592\656805343.py", line 25, in objective
    pipeline.fit(X_train_, y_train_)
AttributeError: 'NoneType' object has no attribute 'fit'
[W 2024-09-18 13:46:16,834] Trial 0 failed with value None.


AttributeError: 'NoneType' object has no attribute 'fit'

#### Optuna Study Object

In [50]:
# create study
study = optuna.create_study(direction = 'minimize')
study.optimize(objective, n_trials = 10)

[I 2024-09-18 13:22:33,957] A new study created in memory with name: no-name-fcd16d6f-3d3b-42cb-b97b-f09cd5e5f5e5
[W 2024-09-18 13:22:34,014] Trial 0 failed with parameters: {'num_leaves': 189, 'feature_fraction': 0.9460874848658272, 'bagging_fraction': 0.2981283379907696, 'min_child_samples': 59} because of the following error: AttributeError("'NoneType' object has no attribute 'fit'").
Traceback (most recent call last):
  File "c:\Users\ryans\taxi_demand_predictor\.venv\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\ryans\AppData\Local\Temp\ipykernel_20592\2932411385.py", line 35, in objective
    pipeline.fit(X_train_, y_train_)
AttributeError: 'NoneType' object has no attribute 'fit'
[W 2024-09-18 13:22:34,022] Trial 0 failed with value None.


AttributeError: 'NoneType' object has no attribute 'fit'