## LightGBM Hyperparameter Tuning

In [32]:
import warnings
warnings.filterwarnings('ignore')


In [33]:
# load our data
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.parquet')
df.head()

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,...,2.0,0.0,1.0,0.0,0.0,0.0,0.0,2022-01-29,1,0.0
1,0.0,0.0,0.0,0.0,0.0,4.0,1.0,2.0,1.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2022-01-30,1,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,1.0,2.0,0.0,0.0,0.0,0.0,2022-01-31,1,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,2.0,1.0,0.0,1.0,1.0,0.0,0.0,2022-02-01,1,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-02-02,1,0.0


In [34]:
# split in to training and testing
from src.data_split import train_test_split
from datetime import datetime

X_train, y_train, X_test, y_test = train_test_split(
    df,
    # have to use datetime here
    cutoff_date=datetime(2022, 6, 1, 0, 0, 0),
    target_column_name='target_rides_next_hour'
)

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(32595, 674)
y_train.shape=(32595,)
X_test.shape=(56710, 674)
y_test.shape=(56710,)


#### Define Optuna Objective Function

This guides the search for the optimal hyperparameters. We have to define a function that takes a **trial** object as input and returns a scaler that represents the performance of the model for the given hyperparameters. 

In [35]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna

from src.model import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyper-parameters, it trains a model and computes an average
    validation error based on a TimeSeriesSplit
    """
    # pick hyper-parameters
    hyperparams = {
        "metric": 'mae',
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),   
    }
       
    tss = KFold(n_splits=3)
    scores = []
        
    for train_index, val_index in tss.split(X_train):

        # split data for training and validation
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index,:]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # train the model
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)
        
        # evaluate the model
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)

        scores.append(mae)
   
    # Return the mean score
    return np.array(scores).mean()

In [25]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)

[I 2024-09-19 14:06:52,136] A new study created in memory with name: no-name-d5f8758a-e3fe-4bec-a8a6-252742ad437c


Creating pipeline with hyperparameters: {'metric': 'mae', 'verbose': -1, 'num_leaves': 253, 'feature_fraction': 0.797028687841669, 'bagging_fraction': 0.7304091239677686, 'min_child_samples': 68}
Added feature transformer for average rides
Added temporal features engineer
Pipeline created: Pipeline(steps=[('functiontransformer',
                 FunctionTransformer(func=<function average_rides_last_4_weeks at 0x00000196BBB77520>)),
                ('temporalfeaturesengineer', TemporalFeaturesEngineer()),
                ('lgbmregressor',
                 LGBMRegressor(bagging_fraction=0.7304091239677686,
                               feature_fraction=0.797028687841669, metric='mae',
                               min_child_samples=68, num_leaves=253,
                               verbose=-1))])
Creating pipeline with hyperparameters: {'metric': 'mae', 'verbose': -1, 'num_leaves': 253, 'feature_fraction': 0.797028687841669, 'bagging_fraction': 0.7304091239677686, 'min_child_samples': 

[I 2024-09-19 14:07:25,396] Trial 0 finished with value: 2.6037618542364007 and parameters: {'num_leaves': 253, 'feature_fraction': 0.797028687841669, 'bagging_fraction': 0.7304091239677686, 'min_child_samples': 68}. Best is trial 0 with value: 2.6037618542364007.


Creating pipeline with hyperparameters: {'metric': 'mae', 'verbose': -1, 'num_leaves': 30, 'feature_fraction': 0.54308755855437, 'bagging_fraction': 0.8061182813798904, 'min_child_samples': 5}
Added feature transformer for average rides
Added temporal features engineer
Pipeline created: Pipeline(steps=[('functiontransformer',
                 FunctionTransformer(func=<function average_rides_last_4_weeks at 0x00000196BBB77520>)),
                ('temporalfeaturesengineer', TemporalFeaturesEngineer()),
                ('lgbmregressor',
                 LGBMRegressor(bagging_fraction=0.8061182813798904,
                               feature_fraction=0.54308755855437, metric='mae',
                               min_child_samples=5, num_leaves=30,
                               verbose=-1))])
Creating pipeline with hyperparameters: {'metric': 'mae', 'verbose': -1, 'num_leaves': 30, 'feature_fraction': 0.54308755855437, 'bagging_fraction': 0.8061182813798904, 'min_child_samples': 5}
Added

[I 2024-09-19 14:07:35,763] Trial 1 finished with value: 2.612553999484412 and parameters: {'num_leaves': 30, 'feature_fraction': 0.54308755855437, 'bagging_fraction': 0.8061182813798904, 'min_child_samples': 5}. Best is trial 0 with value: 2.6037618542364007.


Creating pipeline with hyperparameters: {'metric': 'mae', 'verbose': -1, 'num_leaves': 86, 'feature_fraction': 0.7666437404583031, 'bagging_fraction': 0.653097012009402, 'min_child_samples': 52}
Added feature transformer for average rides
Added temporal features engineer
Pipeline created: Pipeline(steps=[('functiontransformer',
                 FunctionTransformer(func=<function average_rides_last_4_weeks at 0x00000196BBB77520>)),
                ('temporalfeaturesengineer', TemporalFeaturesEngineer()),
                ('lgbmregressor',
                 LGBMRegressor(bagging_fraction=0.653097012009402,
                               feature_fraction=0.7666437404583031,
                               metric='mae', min_child_samples=52,
                               num_leaves=86, verbose=-1))])
Creating pipeline with hyperparameters: {'metric': 'mae', 'verbose': -1, 'num_leaves': 86, 'feature_fraction': 0.7666437404583031, 'bagging_fraction': 0.653097012009402, 'min_child_samples': 52}

The Optuna study object uses the objective function to test the hyperparameters. Since we are trying to optimize mean absolute error, we use **direction = 'minimize'** but this can be changed to direction = 'maximize' if the scoring metric we were trying to optimize was accuracy score or similar metric. 

In [26]:
# best params
best_params = study.best_params
print(f'{best_params=}')

best_params={'num_leaves': 114, 'feature_fraction': 0.7481441912280344, 'bagging_fraction': 0.7139488991894807, 'min_child_samples': 51}


In [27]:
# retrain
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

Creating pipeline with hyperparameters: {'num_leaves': 114, 'feature_fraction': 0.7481441912280344, 'bagging_fraction': 0.7139488991894807, 'min_child_samples': 51}
Added feature transformer for average rides
Added temporal features engineer
Pipeline created: Pipeline(steps=[('functiontransformer',
                 FunctionTransformer(func=<function average_rides_last_4_weeks at 0x00000196BBB77520>)),
                ('temporalfeaturesengineer', TemporalFeaturesEngineer()),
                ('lgbmregressor',
                 LGBMRegressor(bagging_fraction=0.7139488991894807,
                               feature_fraction=0.7481441912280344,
                               min_child_samples=51, num_leaves=114))])


In [28]:
predictions = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
print(f'{mae=:.4f}')

mae=2.5296


##### Let's Try TimeSeriesSplit

This would be set up differently because order matters. We can see below that the indices increase by one and are not randomly split. This saves us from potentially using future dates in our training data. For example, if we use Kfold split when trying to forecast data we may have the dates mixed together such as February 10th in the training data and February 1st in the testing data. In the real world, we do not have information on future dates so we should not have information on future dates now. 

In [29]:
from sklearn.model_selection import TimeSeriesSplit

tss = TimeSeriesSplit(n_splits = 4)

for train_index, val_index in tss.split(X_train):
    X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index, :]
    y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]
    print(train_index, val_index)

[   0    1    2 ... 6516 6517 6518] [ 6519  6520  6521 ... 13035 13036 13037]
[    0     1     2 ... 13035 13036 13037] [13038 13039 13040 ... 19554 19555 19556]
[    0     1     2 ... 19554 19555 19556] [19557 19558 19559 ... 26073 26074 26075]
[    0     1     2 ... 26073 26074 26075] [26076 26077 26078 ... 32592 32593 32594]


The X_train and y_train will be split 3 different times. Each time, a model is created and predictions are made. Then, the mean_absolute_error will be added to our scores list. For example, TimeSeriesSplit of X_train and y_train will have 3 mean_absolute_error scores. Then, we are returning the mean of those three scores. 

Following this process we make a study. The study will run 10 different iterations. This study will use different values for our hyperparameters with each iteration. Each iteration will have the TimeSeriesSplit iteration inside of it as well. 

In [30]:
# using time series split
def objective_ts(trial: optuna.trial.Trial) -> float:
    '''Same objective with TimeSeriesSplit.'''
    hyperparams = {
        "metric": 'mae',
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),   
    }
    # split using kfolds - time stamps not used   
    tss = TimeSeriesSplit(n_splits = 4)
    scores = []

    # set up X_train, X_val, y_train, y_val
    for train_index, val_index in tss.split(X_train):
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index, :]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]

        # set up pipeline
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)

        # predictions and evaluation
        predictions = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, predictions)

        # add scores to our empty list
        scores.append(mae)
    
    # return the mean of the scores
    return np.array(scores).mean()


In [31]:
# create a study 
study2 = optuna.create_study(direction= 'minimize')
study2.optimize(objective_ts, n_trials= 5)

[I 2024-09-19 14:06:43,887] A new study created in memory with name: no-name-37dd24bd-f7db-4c31-a0e0-749196565bd8


Creating pipeline with hyperparameters: {'metric': 'mae', 'verbose': -1, 'num_leaves': 93, 'feature_fraction': 0.535961158163473, 'bagging_fraction': 0.6962605221555951, 'min_child_samples': 17}
Added feature transformer for average rides
Added temporal features engineer
Pipeline created: Pipeline(steps=[('functiontransformer',
                 FunctionTransformer(func=<function average_rides_last_4_weeks at 0x00000196BBB77520>)),
                ('temporalfeaturesengineer', TemporalFeaturesEngineer()),
                ('lgbmregressor',
                 LGBMRegressor(bagging_fraction=0.6962605221555951,
                               feature_fraction=0.535961158163473, metric='mae',
                               min_child_samples=17, num_leaves=93,
                               verbose=-1))])
Creating pipeline with hyperparameters: {'metric': 'mae', 'verbose': -1, 'num_leaves': 93, 'feature_fraction': 0.535961158163473, 'bagging_fraction': 0.6962605221555951, 'min_child_samples': 17}

[W 2024-09-19 14:06:50,053] Trial 0 failed with parameters: {'num_leaves': 93, 'feature_fraction': 0.535961158163473, 'bagging_fraction': 0.6962605221555951, 'min_child_samples': 17} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\ryans\taxi_demand_predictor\.venv\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\ryans\AppData\Local\Temp\ipykernel_33372\1271494902.py", line 23, in objective_ts
    pipeline.fit(X_train_, y_train_)
  File "c:\Users\ryans\taxi_demand_predictor\.venv\lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\ryans\taxi_demand_predictor\.venv\lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\Users\ryans\taxi_demand_predictor\.venv\lib\site-packages\lightgbm\sklearn.py", line 1189, in fi

KeyboardInterrupt: 

In [21]:
# best params
best_params2 = study2.best_params
print(f'{best_params2=}')

best_params2={'num_leaves': 225, 'feature_fraction': 0.5027892422656295, 'bagging_fraction': 0.25202947347290605, 'min_child_samples': 25}


In [22]:
# retrain
pipeline2 = get_pipeline(**best_params2)
pipeline2.fit(X_train, y_train)

Creating pipeline with hyperparameters: {'num_leaves': 225, 'feature_fraction': 0.5027892422656295, 'bagging_fraction': 0.25202947347290605, 'min_child_samples': 25}
Added feature transformer for average rides
Added temporal features engineer
Pipeline created: Pipeline(steps=[('functiontransformer',
                 FunctionTransformer(func=<function average_rides_last_4_weeks at 0x00000196BBB77520>)),
                ('temporalfeaturesengineer', TemporalFeaturesEngineer()),
                ('lgbmregressor',
                 LGBMRegressor(bagging_fraction=0.25202947347290605,
                               feature_fraction=0.5027892422656295,
                               min_child_samples=25, num_leaves=225))])


This model has a higher mean absolute error than the kfold split method but this is okay because with real data it is harder to predict the future and kfold does not have a realistic way of training and testing data because the dates do not matter in the splits. For this reason, the kfold splits have access to future dates and thus a lower mean absolute error is returned.

In [23]:
preds = pipeline2.predict(X_test)
mae = mean_absolute_error(y_test, preds)
print(f'{mae=:.4f}')

mae=2.6301
