# Simple example of a LightGBM model tuned with Optuna

In [None]:
import optuna.integration.lightgbm as lgb

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

from lightgbm import LGBMRegressor

import optuna
import functools
import gc
import time

import warnings
warnings.filterwarnings('ignore')

In [None]:
#--------------------------------------read the data--------------------------------------

# Load the competition's train and test tables.
train = pd.read_csv(
    '../input/tabular-playground-series-feb-2021/train.csv'
)
test = pd.read_csv(
    '../input/tabular-playground-series-feb-2021/test.csv'
)

# Separate the target from the predictors; 'id' is dropped from both frames.
y_train = train['target']
X_train = train.drop(columns=['id', 'target'])
X_test = test.drop(columns=['id'])

# Every column whose name contains 'cat' is treated as categorical.
cat_cols = [name for name in train.columns if 'cat' in name]

#--------------------------------------encode categorical features--------------------------------------

def label_encoder(df, encoders=None):
    """Replace every column listed in ``cat_cols`` with integer codes.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame containing all ``cat_cols`` columns.
        encoders: optional mapping ``feature name -> fitted LabelEncoder``.
            When omitted, a fresh encoder is fitted on ``df[feature]``
            itself, so the codes depend only on the values present in
            ``df``.

    Returns:
        The same ``df`` object with the categorical columns encoded.
    """
    for feature in cat_cols:
        if encoders is None:
            le = LabelEncoder().fit(df[feature])
        else:
            le = encoders[feature]
        df[feature] = le.transform(df[feature])
    return df


# Fit one encoder per feature on train and test together. Fitting them
# separately can give the same category different integer codes in the two
# frames whenever their sets of categories differ.
shared_encoders = {
    feature: LabelEncoder().fit(pd.concat([X_train[feature], X_test[feature]]))
    for feature in cat_cols
}

X_train = label_encoder(X_train, shared_encoders)
X_test = label_encoder(X_test, shared_encoders)

In [None]:
#--------------------------------------making cross validation folds--------------------------------------

# Shuffled 5-fold splitter; the fixed seed keeps fold membership reproducible.
split = KFold(
    n_splits=5,
    shuffle=True,
    random_state=2,
)

In [None]:
#--------------------------------------optuna parameters function--------------------------------------

def lb_opt(trial):
    """Optuna objective: hold-out RMSE of an LGBMRegressor for one trial.

    Samples one hyperparameter set from ``trial``, trains on 80% of
    ``X_train``/``y_train`` with early stopping on the remaining 20%, and
    returns the RMSE on that held-out part so the study has a value to
    minimise.

    Args:
        trial: the ``optuna.trial.Trial`` that supplies the suggestions.

    Returns:
        The hold-out RMSE as a float (lower is better).
    """
    # NOTE: there must be no trailing comma after a suggestion -- a trailing
    # comma turns the value into a 1-tuple, which LightGBM cannot use.
    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 30),
        'subsample': trial.suggest_float('subsample', 0.001, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.001, 1),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 70),
        'reg_lambda': trial.suggest_int('reg_lambda', 1, 100),
        'reg_alpha': trial.suggest_int('reg_alpha', 1, 100),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        # LightGBM requires max_bin > 1, so the search starts at 2.
        'max_bin': trial.suggest_int('max_bin', 2, 1000),
        'cat_smooth': trial.suggest_int('cat_smooth', 1, 100),
        'cat_l2': trial.suggest_float('cat_l2', 0.001, 1.0),
        'random_state': trial.suggest_int('random_state', 1, 50),
    }

    # Fixed seed: every trial is scored on the same hold-out rows, so the
    # returned values are comparable across trials.
    X_fit, X_hold, y_fit, y_hold = train_test_split(
        X_train, y_train, test_size=0.2, random_state=2
    )

    lightgbm_tuna = LGBMRegressor(
        verbosity=0,
        n_estimators=20000,
        metric='rmse',
        **params,
    )

    # n_estimators is only an upper bound; early stopping on the hold-out
    # part picks the actual number of trees. The fit keywords follow the
    # same LightGBM API the cross-validation cell of this notebook uses.
    lightgbm_tuna.fit(
        X_fit, y_fit,
        eval_set=[(X_hold, y_hold)],
        eval_metric=['rmse'],
        early_stopping_rounds=250,
        # cat_smooth / cat_l2 only take effect for categorical features.
        categorical_feature=cat_cols,
        verbose=0,
    )

    hold_preds = lightgbm_tuna.predict(X_hold)
    return float(np.sqrt(mean_squared_error(y_hold, hold_preds)))
    
# Run a short minimisation study over lb_opt and show the best trial found.
study = optuna.create_study(direction="minimize")
study.optimize(func=lb_opt, n_trials=10)
best_trial = study.best_trial
print(best_trial)

In [None]:
#--------------------------------------params we got--------------------------------------

# Fixed LightGBM settings used by the cross-validated training below.
lgbm_params = {
    # tree shape
    'max_depth': 16,
    # row / column sampling
    'subsample': 0.7,
    'colsample_bytree': 0.2,
    # boosting step size
    'learning_rate': 0.007,
    # L2 / L1 regularisation
    'reg_lambda': 10.98,
    'reg_alpha': 17.3,
    # leaf constraints
    'min_child_samples': 31,
    'num_leaves': 100,
    # histogram and categorical handling
    'max_bin': 600,
    'cat_smooth': 81,
    'cat_l2': 0.03,
    # evaluation metric, parallelism and the upper bound on boosting rounds
    'metric': 'rmse',
    'n_jobs': -1,
    'n_estimators': 20000,
}

In [None]:
# Test-set predictions collected over the folds:
#   preds_list_base            - one entry per fold, from the base model
#   preds_list_final_iteration - one entry per fold, from the last refit
#   preds_list_all             - every model trained (base + all refits)
preds_list_base = []
preds_list_final_iteration = []
preds_list_all = []


def _fit_lgbm(params, X_fit, y_fit, X_eval, y_eval, stopping_rounds, init_model=None):
    """Fit an LGBMRegressor with early stopping on (X_eval, y_eval).

    ``init_model`` is forwarded to ``fit`` so training can continue from a
    previously fitted model; ``None`` starts from scratch.
    """
    # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords
    # belong to the pre-4.0 LightGBM API -- confirm the installed version
    # before upgrading.
    return LGBMRegressor(**params).fit(
        X_fit, y_fit,
        eval_set=[(X_eval, y_eval)],
        eval_metric=['rmse'],
        early_stopping_rounds=stopping_rounds,
        # presumably the positions of the ten categorical columns in X_train;
        # verify against the column order
        categorical_feature=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        verbose=0,
        init_model=init_model,
    )


def _rmse(y_true, y_pred):
    """Root mean squared error between two target vectors."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


for train_idx, val_idx in split.split(X_train):

    # ---- split the train data into train and validation parts ----
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # ---- fit the base model with the fixed lgbm_params ----
    Model = _fit_lgbm(lgbm_params, X_tr, y_tr, X_val, y_val, stopping_rounds=250)

    # Predict once and reuse: the same array goes into both lists.
    test_preds = Model.predict(X_test)
    preds_list_base.append(test_preds)
    preds_list_all.append(test_preds)

    first_rmse = _rmse(y_val, Model.predict(X_val))
    print(f'RMSE for Base model is {first_rmse}')

    # ---- continue training at a lower learning rate; from the third ----
    # ---- round on, relax regularisation and allow more leaves       ----
    params = lgbm_params.copy()
    params['learning_rate'] = 0.003  # constant for every refit
    for i in range(1, 8):
        if i > 2:
            params['reg_lambda'] *= 0.9
            params['reg_alpha'] *= 0.9
            params['num_leaves'] += 40

        # Each refit starts from the previously fitted model.
        Model = _fit_lgbm(params, X_tr, y_tr, X_val, y_val,
                          stopping_rounds=200, init_model=Model)

        test_preds = Model.predict(X_test)
        preds_list_all.append(test_preds)
        last_rmse = _rmse(y_val, Model.predict(X_val))
        print(f'RMSE for Incremental trial {i} model is {last_rmse}')

    print('', end='\n\n')
    print(f'Improvement of : {first_rmse - last_rmse}')
    print('-' * 100)
    # test_preds now holds the predictions of the last refit for this fold.
    preds_list_final_iteration.append(test_preds)

In [None]:
# Average the per-fold test predictions of the final models, element-wise.
y_preds_final_iteration = np.mean(preds_list_final_iteration, axis=0)
y_preds_final_iteration

In [None]:
# Pair each test id with its averaged prediction and write the submission file.
submission_columns = {
    'id': test['id'],
    'target': y_preds_final_iteration,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)