## References

* Credits to @abdulravoofshaik for his work in implementing an efficient LGBM model
* Notebook link: https://www.kaggle.com/code/abdulravoofshaik/top-3-solution-lgbm-mean

This notebook aims to help people find the best hyperparameters for LGBM using the Bayesian Optimization tool.
The TRAINING_SIZE variable makes tuning faster by using only a sample of the dataset.
This is my first time implementing BO, so any feedback is welcome!

In [None]:
# pip install bayesian-optimization

In [None]:
import random
import time 
import warnings

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import missingno as msno
from lightgbm import LGBMRegressor
from sklearn.impute import SimpleImputer
from bayes_opt import BayesianOptimization
import lightgbm as lgb
import xgboost as xgb
from tqdm import tqdm


# Suppress every Python warning from here on to keep the cell output short.
warnings.filterwarnings("ignore")

%matplotlib inline

In [None]:
# Load the competition table; the `row_id` column becomes the DataFrame index.
PATH_DATA = "../input/tabular-playground-series-jun-2022/data.csv"
data = pd.read_csv(PATH_DATA, index_col='row_id')

In [None]:
# (number of rows, number of columns) of the loaded table
data.shape

In [None]:
# Display the loaded DataFrame.
data

In [None]:
# missingno matrix plot of `data`, to see where the missing values are located
msno.matrix(data)

In [None]:
# share of missing cells over the whole table, expressed in percent
total_cells = data.size
missing_cells = data.isna().sum().sum()
nans_percentage = missing_cells / total_cells * 100
print(f"Dataset contains {nans_percentage} % of Nans")

In [None]:
# How many columns there are of each dtype.
data.dtypes.value_counts()

In [None]:
# list the names of the int64 columns only

mask_int64 = data.dtypes.eq('int64')
data.columns[mask_int64]

In [None]:
# every column whose second name component is "2", i.e. the F_2_* group
list_col_2 = [name for name in data.columns if name.split('_')[1] == '2']
data_col_2 = data[list_col_2]

nans_in_col_2 = data_col_2.isna().sum().sum()
print(f"Columns starting with F_2 have {nans_in_col_2} Nans")

In [None]:
# Pearson correlation between every pair of columns in `data`
# (all columns, not only the ones containing nans)

pearsoncorr = data.corr()

# The heatmap shows absolute values, so the sign of each correlation is not visible.
plt.figure(figsize=(30, 30))
sns.heatmap(np.abs(pearsoncorr),
            xticklabels=pearsoncorr.columns,
            yticklabels=pearsoncorr.columns,
            cmap='RdBu_r',
            annot=True,
            linewidth=0.5)

In [None]:
# restrict the correlation matrix to the F_4_* rows and columns

rows_4 = [label for label in pearsoncorr.index if label.split('_')[1] == '4']
cols_4 = [label for label in pearsoncorr.columns if label.split('_')[1] == '4']
pearsoncorr_4 = pearsoncorr.loc[rows_4, cols_4]

# absolute values only: the plot shows correlation strength, not direction
abs_corr_4 = np.abs(pearsoncorr_4)

plt.figure(figsize=(15, 7))
sns.heatmap(
    abs_corr_4,
    xticklabels=pearsoncorr_4.columns,
    yticklabels=pearsoncorr_4.columns,
    cmap='RdBu_r',
    annot=True,
    linewidth=0.5,
)

In [None]:
list_col_nans = [col for col in data.columns if data[col].isna().sum() != 0]

# fraction of missing values in each of the affected columns
nan_share = data[list_col_nans].isna().mean()
avg_pct = round(nan_share.mean()*100, 2)
min_pct = round(nan_share.min()*100, 2)
max_pct = round(nan_share.max()*100, 2)

print(f"Columns with nans have an average of {avg_pct} "
      f"% nans with a mininum of {min_pct} "
      f"% of nans and a maximum of {max_pct} "
      f"% of nans")

In [None]:
# defining the training scope for Bayesian Optimization of group 4 columns

list_col_4 = [col for col in data.columns if col.split('_')[1] == '4']
data_col_4 = data[list_col_4]
TARGET_COL = 'F_4_7'
TRAINING_SIZE = 100000

# rows where the target itself is missing cannot be used for tuning
target_nan_idx = data_col_4[data_col_4[TARGET_COL].isnull()].index

train_set = data_col_4.drop(target_nan_idx, axis=0)
test_set = data_col_4[data_col_4.index.isin(target_nan_idx)]

X = train_set.drop([TARGET_COL], axis=1)
y = train_set[TARGET_COL]

# random.sample raises ValueError when asked for more items than exist,
# so cap the request at the number of rows actually available
sample_size = min(TRAINING_SIZE, len(X))

# NOTE(review): no seed is set in this cell, so the subset (and therefore the
# tuning result) presumably differs between runs
subset_idx = random.sample(list(X.index), sample_size)

X = X.loc[subset_idx]
y = y.loc[subset_idx]

X.shape

In [None]:
# XGBoost data container built once from the sampled X / y; xgb_evaluate reads it as a global.
dtrain = xgb.DMatrix(X, label=y)


def xgb_evaluate(max_depth, gamma, eta, colsample_bytree, min_child_weight):
    """Objective for the optimizer: 3-fold CV score of one XGBoost setup.

    ``max_depth`` and ``min_child_weight`` are cast to int before being
    passed on; the other arguments are forwarded unchanged. Cross-validation
    runs on the module-level ``dtrain`` for 300 boosting rounds.

    Returns the negated mean test RMSE of the last boosting round.
    """
    params = dict(
        eval_metric='rmse',
        max_depth=int(max_depth),
        subsample=0.8,
        eta=eta,
        gamma=gamma,
        colsample_bytree=colsample_bytree,
        min_child_weight=int(min_child_weight),
    )

    cv_result = xgb.cv(params, dtrain, num_boost_round=300, nfold=3)
    final_rmse = cv_result['test-rmse-mean'].iloc[-1]

    # the optimizer maximizes, hence the sign flip on the error
    return -1.0 * final_rmse

In [None]:
%%time

# search space: (lower, upper) bound for each xgb_evaluate argument
xgb_pbounds = {
    'max_depth': (1, 10),
    'gamma': (0, 5),
    'eta': (0.01, 0.1),
    'colsample_bytree': (0.3, 0.9),
    'min_child_weight': (3, 6),
}

xgb_bo = BayesianOptimization(f=xgb_evaluate, pbounds=xgb_pbounds)

# 5 initial points followed by 15 optimization steps
# NOTE(review): recent bayesian-optimization releases reportedly no longer
# accept `acq` in maximize() - confirm against the installed version
xgb_bo.maximize(init_points=5, n_iter=15, acq='ei')

In [None]:
# keep the best XGBoost trial, with the integer-valued parameters cast to int

xgb_best_params = xgb_bo.max
xgb_tuned = xgb_best_params['params']

best_max_depth = int(xgb_tuned['max_depth'])
best_min_child_weight = int(xgb_tuned['min_child_weight'])
xgb_tuned.update(max_depth=best_max_depth,
                 min_child_weight=best_min_child_weight)

# tag the result so the model comparison can tell the candidates apart
xgb_best_params['model_name'] = 'xgb'
xgb_best_params

In [None]:
def lgb_eval(learning_rate, num_leaves, max_bin):
    """Objective for the optimizer: 3-fold CV score of one LightGBM setup.

    Args:
        learning_rate: Forwarded unchanged.
        num_leaves: Cast to int before use.
        max_bin: Cast to int before use.

    Returns:
        The negated minimum of the per-iteration mean CV RMSE (negated
        because the optimizer maximizes).

    Raises:
        KeyError: If the CV result contains no mean-RMSE entry.
    """
    # A fresh Dataset is built from the module-level X / y on every call
    # (presumably because max_bin is applied at dataset construction - confirm).
    dtrain = lgb.Dataset(data=X, label=y)

    params = {'objective': 'regression',
              'learning_rate': learning_rate,
              'num_iterations': 500,
              'num_leaves': int(num_leaves),
              'max_bin': int(max_bin),
              'metric': 'rmse',
              'force_col_wise': 'true',
              'verbose': -1
             }

    cv_result = lgb.cv(params, train_set=dtrain, nfold=3, metrics='rmse', stratified=False)

    # LightGBM < 4 reports the scores under 'rmse-mean', LightGBM >= 4 under
    # 'valid rmse-mean'; accept either spelling instead of hard-coding one
    rmse_key = next((key for key in cv_result if key.endswith('rmse-mean')), None)
    if rmse_key is None:
        raise KeyError(
            f"no mean RMSE entry in lgb.cv result, got keys: {list(cv_result)}")

    # Bayesian only knows how to maximize so return the negative RMSE
    return -1.0 * min(cv_result[rmse_key])

In [None]:
%%time

# search space: (lower, upper) bound for each lgb_eval argument
lgb_pbounds = {
    'learning_rate': (0.01, 0.02),
    'num_leaves': (50, 150),
    'max_bin': (255, 1000),
}

lgbBO = BayesianOptimization(f=lgb_eval, pbounds=lgb_pbounds)

# 5 initial points followed by 15 optimization steps
# NOTE(review): recent bayesian-optimization releases reportedly no longer
# accept `acq` in maximize() - confirm against the installed version
lgbBO.maximize(init_points=5, n_iter=15, acq='ei')

In [None]:
# keep the best LightGBM trial, with the integer-valued parameters cast to int

lgb_best_params = lgbBO.max
lgb_tuned = lgb_best_params['params']

best_num_leaves = int(lgb_tuned['num_leaves'])
best_max_bin = int(lgb_tuned['max_bin'])
lgb_tuned.update(num_leaves=best_num_leaves, max_bin=best_max_bin)

# tag the result so the model comparison can tell the candidates apart
lgb_best_params['model_name'] = 'LGBM'
lgb_best_params

In [None]:
# pick the candidate whose best trial reached the highest target
# (targets are negated RMSE values, so higher means lower error)

all_results = [xgb_best_params, lgb_best_params]

# max() keeps the first candidate when targets are tied
winner = max(all_results, key=lambda result: result['target'])

best_target = winner['target']
best_model_name = winner['model_name']
best_params = winner['params']

print("Model performing best is: ", best_model_name)

In [None]:
# building the final model depending on the results

if best_model_name == 'xgb':
    best_model = xgb.XGBRegressor(
        n_estimators=5000,
        # same row subsampling as in xgb_evaluate, so the final model uses
        # the configuration the hyperparameters were tuned under
        subsample=0.8,
        colsample_bytree=best_params['colsample_bytree'],
        eta=best_params['eta'],
        gamma=best_params['gamma'],
        max_depth=best_params['max_depth'],
        min_child_weight=best_params['min_child_weight'],
    )
elif best_model_name == 'LGBM':
    best_model = LGBMRegressor(
        # A single iteration count: passing both n_estimators and its alias
        # num_iterations is ambiguous.
        # NOTE(review): LightGBM is expected to honour the alias when both
        # are given, i.e. 30000 was the effective value - confirm.
        n_estimators=30000,
        learning_rate=best_params['learning_rate'],
        num_leaves=best_params['num_leaves'],
        max_bin=best_params['max_bin'],
        metric='rmse',
    )
else:
    # fail here rather than with a NameError on best_model further down
    raise ValueError(f"Unknown model name: {best_model_name!r}")

In [None]:
%%time

# training and predicting with the final model:
# each F_4 column is predicted from the other F_4 columns

predictions = {}

data_col_4 = data[list_col_4]

for target_col in list_col_4:

    # the rows to impute are exactly those where this column is missing
    missing_mask = data_col_4[target_col].isnull()
    target_nan_idx = data_col_4.index[missing_mask]

    # nothing to impute: skip instead of asking the model to predict on an
    # empty frame, which the estimator is not expected to handle
    if target_nan_idx.empty:
        print('\nNo missing values in column : ', target_col)
        continue

    train_set = data_col_4[~missing_mask]
    test_set = data_col_4[missing_mask]

    X_train = train_set.drop([target_col], axis=1)
    y_train = train_set[target_col]
    X_test = test_set.drop(target_col, axis=1)

    print('\nProcessing Column Name : ', target_col)
    best_model.fit(X_train, y_train)
    # predictions are in the row order of the missing entries
    predictions[target_col] = best_model.predict(X_test)
    print(target_col, ' processed')

In [None]:
# Write each column's predictions into its missing cells.
# A single .loc assignment is used because chained indexing
# (data[col][mask] = ...) may write to a temporary copy and leave `data` unchanged.
for col, predicted in predictions.items():
    data.loc[data[col].isna(), col] = predicted

In [None]:
# sanity check: count the missing values remaining in the F_4 group
nans_left_col_4 = data[list_col_4].isna().sum().sum()
print(f"Columns starting with F_4 now have {nans_left_col_4} nans left")

In [None]:
# fill every remaining missing value with the mean of its column
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed_values = imp.fit_transform(data)

# slice assignment writes the values into the existing frame (index and columns kept)
data[:] = imputed_values

In [None]:
# sanity check: count the missing values remaining in the whole table
nans_left_total = data.isna().sum().sum()
print(f"The whole dataset now has {nans_left_total} nans left")

In [None]:
PATH_SAMPLE = '../input/tabular-playground-series-jun-2022/sample_submission.csv'

submission = pd.read_csv(PATH_SAMPLE, index_col='row-col')

# every submission id has the form "<row_id>-<column name>"
id_parts = [row_col.split('-') for row_col in submission.index]

# translate the labels into positions inside `data`
row_pos = data.index.get_indexer([int(parts[0]) for parts in id_parts])
col_pos = data.columns.get_indexer([parts[1] for parts in id_parts])

# get_indexer marks unknown labels with -1; fail loudly, as a .loc lookup would
if (row_pos < 0).any() or (col_pos < 0).any():
    raise KeyError("sample submission references a row or column missing from data")

# one vectorised lookup instead of a Python-level .loc call per requested cell
submission['value'] = data.to_numpy()[row_pos, col_pos]

submission.to_csv('submission.csv')