# Tabular Playground Series October 2021 using Optuna's LightGBM Tuner

During last month's competition I discovered that Optuna has a [built-in solution for tuning LightGBM hyperparameters](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.integration.lightgbm.LightGBMTuner.html) in a sequential manner. This is supposed to be even faster than using Optuna and LightGBM in what [Kaggle grandmaster Kohei Ozaki calls a "naive way"](https://medium.com/optuna/lightgbm-tuner-new-optuna-integration-for-hyperparameter-optimization-8b7095e99258). Instead of searching the product of all hyperparameters, which results in a large search space, LightGBM Tuner follows a step-wise approach: it tunes lambda_l1, lambda_l2, num_leaves, feature_fraction, bagging_fraction, bagging_freq and min_child_samples one after another.

So let's see how well it works!

In [None]:
# Core numeric / tabular libraries and the timer used to measure cell run time.
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time

# scikit-learn supplies the train/validation split and the ROC AUC metric;
# the version is printed so runs can be reproduced later.
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
print("scikit-learn version: {}". format(sklearn.__version__))

# NOTE: `lgb` is bound to Optuna's LightGBM integration module here, not to
# plain LightGBM. A later cell runs `import lightgbm as lgb`, which rebinds
# the same name, so which module `lgb` refers to depends on cell order.
import optuna
import optuna.integration.lightgbm as lgb
print("Optuna version:  {}".format(optuna.__version__))

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and echo the full path of every file.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
# Load the three competition CSV files (train, test, sample submission) in
# that order and bind each resulting frame to its own name.
df_train, df_test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-oct-2021/train.csv",
        "../input/tabular-playground-series-oct-2021/test.csv",
        "../input/tabular-playground-series-oct-2021/sample_submission.csv",
    )
)

In [None]:
# Column groups used for the dtype downcast below (the choice of columns
# comes from the EDA notebooks).
# All feature columns: every column whose name starts with "f".
feature_cols = list(filter(lambda name: name.startswith("f"), df_train.columns))
# Binary features: f22, f43 and the columns from position 243 up to (but not
# including) the last column of the training frame.
bin_feat = ["f22", "f43", *df_train.columns[243:-1]]
# Numeric features: every feature column that is not listed as binary.
num_feat = [name for name in feature_cols if name not in bin_feat]

In [None]:
# Downcast the feature and target columns to smaller dtypes to reduce the
# memory footprint of the data frames, printing the frame summary before and
# after together with the elapsed wall-clock time.
ts = time.time()

print("Before downcast:")
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would additionally render a stray "None".
df_train.info()

# reduce memory size of data frames
df_train[num_feat] = df_train[num_feat].astype('float32')
df_train[bin_feat] = df_train[bin_feat].astype('uint8')
df_train['target'] = df_train['target'].astype('int8')

# apply the same feature dtypes to the test frame so both frames match
df_test[num_feat] = df_test[num_feat].astype('float32')
df_test[bin_feat] = df_test[bin_feat].astype('uint8')

print("\nAfter downcast:")
df_train.info()

execution_time = time.time() - ts
print(f"\nExecution time: {round(execution_time, 3)}s")

In [None]:
# Preview the first rows of the training frame (notebook cell output).
df_train.head()

In [None]:
# Separate the target from the features, then hold out a stratified
# validation subset. No split size is passed, so scikit-learn's default
# applies; the fixed random_state keeps the split reproducible.
y = df_train["target"]
X = df_train.drop(columns=["id", "target"])
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    random_state=29,
)
# Test features: everything except the row identifier.
X_test = df_test.drop(columns=["id"])
display(X_train.shape)

In [None]:
# Wrap the training and validation splits in LightGBM Dataset objects.
dtrain = lgb.Dataset(data=X_train, label=y_train)
dval = lgb.Dataset(data=X_val, label=y_val)

The cells below use Optuna's LightGBM Tuner.

In [None]:
# Fixed base settings for the tuner run: binary objective scored by AUC,
# gradient-boosted trees, logging silenced.
params = dict(
    objective="binary",
    metric="auc",
    verbosity=-1,
    boosting_type="gbdt",
)

In [None]:
#model = lgb.train(
#        params, 
#        dtrain, 
#        valid_sets=[dtrain, dval], 
#        verbose_eval=100, 
#        early_stopping_rounds=100
#    )
#best_params = model.params
#print("Best params:", best_params)

In [None]:
#print("  Params: ")
#for key, value in best_params.items():
#    print("    {}: {}".format(key, value))

In [None]:
#pred_val = model.predict(X_val, num_iteration=model.best_iteration)
#roc_auc_score(y_val, pred_val)

In [None]:
#predictions = model.predict(X_test, num_iteration=model.best_iteration)

I'm using plain LightGBM (without the tuner) here to train again with a higher number of iterations and a lower learning rate.

In [None]:
# NOTE: this rebinds `lgb`, which the first cell bound to
# optuna.integration.lightgbm. From here on `lgb` is plain LightGBM, so
# re-running earlier cells afterwards will use this module instead.
import lightgbm as lgb
print("LightGBM version:  {}".format(lgb.__version__))

In [None]:
# Parameters for the plain-LightGBM run: many boosting rounds at a low
# learning rate. The regularisation / tree-shape values are hard-coded
# (presumably taken from an earlier tuner run -- confirm against its output).
lgb_params = dict(
    objective='binary',
    metric='auc',
    num_iterations=12000,
    learning_rate=0.01,
    verbosity=-1,
    boosting_type='gbdt',
    lambda_l1=8.533875942246594,
    lambda_l2=2.0533270677941314e-06,
    num_leaves=13,
    feature_fraction=0.4,
    bagging_fraction=1.0,
    bagging_freq=0,
    min_child_samples=50,
)

In [None]:
#dict_eval = {}

In [None]:
#ts = time.time()

#model = lgb.train(        
#        lgb_params, 
#        dtrain, 
#        valid_sets=[dtrain, dval],
#        valid_names=['train','val'],
        #evals_result = dict_eval,  # use this to store the auc scores, can be handy in cv loop
#        verbose_eval=200, 
#        early_stopping_rounds=300
#    )

#execution_time = time.time() - ts
#print("\nTraining time: " + str(round(execution_time,3)) + "s")

In [None]:
# early stopping = 300
# learning rate 0.1, num_iter 3000 -> val's auc 0.856436 [925]
# learning rate 0.05, num_iter 3000 -> [2247]	train's auc: 0.870716	val's auc: 0.856854
# learning rate 0.01, num_iter 4000 -> [4000]	train's auc: 0.861236	val's auc: 0.855615
# learning rate 0.01, num_iter 8000 -> [8000]	train's auc: 0.867171	val's auc: 0.856897
# learning rate 0.01, num_iter 15000 -> [10712]	train's auc: 0.870415	val's auc: 0.856981
# learning rate 0.01, num_iter 12000, val set size = 10% -> [9721]	train's auc: 0.867667	val's auc: 0.855951

In [None]:
#lgb.plot_importance(model, figsize=(16,40))

In [None]:
#predictions = model.predict(X_test)

In [None]:
# generate submission file 
#submission_lgbm = pd.DataFrame(data={"id" : sample_submission.id,
#                                     "target" : predictions})

#submission_lgbm.to_csv('submission_lgbm.csv', index=False)
#submission_lgbm.head()

In [None]:
# New in version 6: same settings as before but with the number of trees
# raised to 25000, to see how much can be gained from the extra rounds.
lgb_params = dict(
    objective='binary',
    metric='auc',
    num_iterations=25000,
    learning_rate=0.01,
    verbosity=-1,
    boosting_type='gbdt',
    lambda_l1=8.533875942246594,
    lambda_l2=2.0533270677941314e-06,
    num_leaves=13,
    feature_fraction=0.4,
    bagging_fraction=1.0,
    bagging_freq=0,
    min_child_samples=50,
)

In [None]:
# Retrain on the whole labelled data set (training + validation rows) for a
# new submission. Only the parameters and the training set are passed, so no
# validation set or early stopping is involved in this run.
dtrain = lgb.Dataset(data=X, label=y)
model = lgb.train(params=lgb_params, train_set=dtrain)

In [None]:
# Score the test features with the retrained model; the objective is
# 'binary', so these are presumably predicted probabilities -- confirm.
predictions = model.predict(X_test)

In [None]:
# Build the submission frame from the ids of the sample submission and the
# model predictions, write it to disk without the index, and preview it.
submission_lgbm = pd.DataFrame(
    {
        "id": sample_submission["id"],
        "target": predictions,
    }
)

submission_lgbm.to_csv(path_or_buf='submission_lgbm_retrain.csv', index=False)
submission_lgbm.head()