# Parameter Tuning Using Optuna

## PreProcess Class

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

class PreProcessDataFrame(object):
    """Load the train/test CSVs and derive three encodings of the features.

    After construction the instance exposes:

    * ``train`` / ``test`` -- the raw feature frames (``target`` removed
      from ``train``), and ``y`` -- the ``target`` column.
    * ``train_onehot`` / ``test_onehot`` -- ``pd.get_dummies`` output.
    * ``train_light_onehot`` / ``test_light_onehot`` -- the first
      ``N_CAT_COLUMNS`` columns collapsed into one normalised count per
      category label, followed by the remaining columns unchanged.
    * ``train_cat`` / ``test_cat`` -- the first ``N_CAT_COLUMNS`` columns
      label-encoded to integers.

    Every encoding is fitted on train and test stacked together so both
    halves share the same columns / integer codes, and is then split back
    using the actual number of training rows.
    """

    # Number of leading columns that the encoders treat as categorical.
    N_CAT_COLUMNS = 10
    # Labels the compact one-hot encoder is fitted on: 'A' .. 'O'.
    CATEGORY_LABELS = [chr(i) for i in range(65, 65 + 15)]

    def __init__(self):
        self.train = pd.read_csv('../input/tabular-playground-series-feb-2021/train.csv', index_col='id')
        self.test = pd.read_csv('../input/tabular-playground-series-feb-2021/test.csv', index_col='id')
        self.y = self.train['target']
        self.train = self.train.drop('target', axis=1)
        self.train_onehot, self.test_onehot = self.onehot_encoding()
        self.train_light_onehot, self.test_light_onehot = self.light_onehot_encoding()
        self.train_cat, self.test_cat = self.categorical_encoding()

    def _stacked(self):
        """Return train rows followed by test rows as a new frame."""
        return pd.concat([self.train, self.test], axis=0)

    def _split(self, frame):
        """Split a stacked frame back into its (train, test) parts."""
        # Use the real train length instead of a hard-coded row count so the
        # split stays correct for any dataset size.
        n_train = len(self.train)
        return frame.iloc[:n_train, :], frame.iloc[n_train:, :]

    def onehot_encoding(self):
        """Return (train, test) encoded with ``pd.get_dummies``."""
        return self._split(pd.get_dummies(self._stacked()))

    def light_onehot_encoding(self):
        """Return (train, test) with the leading columns merged into counts.

        Each of the first ``N_CAT_COLUMNS`` columns is one-hot encoded over
        ``CATEGORY_LABELS`` and the encodings are summed, giving per row the
        number of those columns holding each label. The counts are divided
        by their global maximum and placed in front of the other columns.
        """
        labels = self.CATEGORY_LABELS
        encoder = OneHotEncoder().fit(np.array(labels).reshape(-1, 1))
        concat = self._stacked()
        counts = np.zeros((len(concat), len(labels)))

        for column in concat.columns[:self.N_CAT_COLUMNS]:
            counts += encoder.transform(concat[column].values.reshape(-1, 1)).toarray()
        counts = counts / counts.max()
        counts = pd.DataFrame(counts, columns=labels, index=concat.index)
        light_onehot = pd.concat([counts, concat.iloc[:, self.N_CAT_COLUMNS:]], axis=1)
        return self._split(light_onehot)

    def categorical_encoding(self):
        """Return (train, test) with the leading columns label-encoded."""
        concat = self._stacked()
        for column in concat.columns[:self.N_CAT_COLUMNS]:
            # A fresh encoder per column: codes are independent across columns.
            concat[column] = LabelEncoder().fit_transform(concat[column])
        return self._split(concat)

In [None]:
# Build the preprocessor (reads both CSVs and computes every encoding),
# then preview the raw training features.
df = PreProcessDataFrame()
df.train.head()

In [None]:
# Preview the pd.get_dummies encoding of the training features.
df.train_onehot.head()

In [None]:
# Preview the compact ("light") one-hot encoding of the training features.
df.train_light_onehot.head()

In [None]:
# Preview the label-encoded training features.
df.train_cat.head()

## Cross-Validation Generator

In [None]:
import time
import numpy as np

from sklearn.model_selection import KFold

import lightgbm as lgb

In [None]:
def KFold_train_val(X, y, random_state=42, shuffle=True, n_splits=5):
    """Yield one (X_train, X_val, y_train, y_val) tuple per K-fold split.

    Args:
        X: Features; indexed positionally with ``.iloc``.
        y: Targets aligned row-for-row with ``X``; indexed with ``.iloc``.
        random_state: Seed for the shuffle. Ignored when ``shuffle`` is False.
        shuffle: Whether to shuffle rows before splitting.
        n_splits: Number of folds.

    Yields:
        The training and validation slices of ``X`` and ``y`` for each fold.
    """
    # scikit-learn raises ValueError when a seed is given with shuffle=False,
    # so only forward the seed when it is actually used.
    seed = random_state if shuffle else None
    kf = KFold(n_splits=n_splits, shuffle=shuffle, random_state=seed)
    for train_idx, val_idx in kf.split(X):
        yield X.iloc[train_idx], X.iloc[val_idx], y.iloc[train_idx], y.iloc[val_idx]

## Simple Cross Validation of LightGBM

In [None]:
class LightGBM(object):
    """Thin wrapper around ``lgb.train`` with optional validation scoring.

    Args:
        params: LightGBM parameters. The mapping is copied, so neither the
            caller's dict nor other instances are affected by the ``metric``
            and ``random_state`` entries added here.
        metric: Stored as ``params['metric']``; also the key used to read the
            validation score after training.
        random_state: Stored as ``params['random_state']``.
        num_boost_round: Forwarded to ``lgb.train``.
        early_stopping_rounds: Forwarded to ``lgb.train`` when a validation
            set is supplied to ``fit``.
        verbose_eval: Forwarded to ``lgb.train``.

    Attributes:
        model: The trained booster, or ``None`` before ``fit``.
        score: Best validation score of the last ``fit``, or ``None`` when no
            validation data was given.
    """

    def __init__(self, params=None, metric='rmse', random_state=42, num_boost_round=500, early_stopping_rounds=10, verbose_eval=False):
        # Copy instead of mutating a shared default / the caller's dict.
        self.params = dict(params) if params is not None else {}
        self.params['metric'] = metric
        self.params['random_state'] = random_state
        self.num_boost_round = num_boost_round
        self.early_stopping_rounds = early_stopping_rounds
        self.verbose_eval = verbose_eval
        self.random_state = random_state
        self.model = None
        self.score = None

    def fit(self, X_train, y_train, X_val=None, y_val=None):
        """Train the booster; record the validation score when possible.

        The first 10 columns of ``X_train`` are declared categorical.
        Validation (and therefore early stopping) is used only when both
        ``X_val`` and ``y_val`` are provided.
        """
        columns = list(X_train.columns)
        train_data = lgb.Dataset(X_train, y_train, feature_name=columns, categorical_feature=columns[:10])

        has_val = X_val is not None and y_val is not None
        val_data = lgb.Dataset(X_val, y_val, reference=train_data) if has_val else None

        self.model = lgb.train(
            params=self.params, train_set=train_data, valid_sets=val_data,
            num_boost_round=self.num_boost_round,
            # Early stopping needs a validation set; disable it otherwise.
            early_stopping_rounds=self.early_stopping_rounds if has_val else None,
            verbose_eval=self.verbose_eval,
            )
        if has_val:
            self.score = self.model.best_score['valid_0'][self.params['metric']]
        else:
            self.score = None

    def predict(self, X_test):
        """Return predictions for ``X_test`` at the booster's best iteration."""
        return self.model.predict(X_test, num_iteration=self.model.best_iteration)

In [None]:
# Fit one default-parameter LightGBM per fold, keeping every fitted model so
# the test predictions can be averaged across folds afterwards.
models = []

for i, (X_train, X_val, y_train, y_val) in enumerate(KFold_train_val(df.train_cat, df.y), 1):
    print(f'validation : {i}')
    start = time.time()

    lr = LightGBM()
    lr.fit(X_train, y_train, X_val, y_val)
    models.append(lr)

    print(f'elapsed_time : {time.time() - start}')
    print(f'score : {lr.score}\n')

# Mean of the per-fold validation scores.
score = [model.score for model in models]
print(f'mean score : {np.mean(score)}')

In [None]:
# One column of test predictions per fold model. The row count comes from the
# test frame itself rather than a hard-coded dataset size.
predict_values = np.zeros((len(df.test_cat), len(models)))

for i, model in enumerate(models):
    predict_values[:, i] = model.predict(df.test_cat)

predict_values, predict_values.shape

In [None]:
# Average the per-model predictions row-wise and write the submission file.
submit_df = pd.DataFrame(
    {
        'id': df.test.index,
        'target': np.mean(predict_values, axis=1),
    }
)
submit_df.to_csv('submit.csv', index=False)

## Parameter Tuning by Optuna

In [None]:
import optuna

def lgb_objective(X_train, X_val, y_train, y_val, trial):
    """Optuna objective: train LightGBM with sampled params, return its score.

    ``trial`` is the last parameter so the data arguments can be bound with
    ``functools.partial`` and the result passed to ``study.optimize``.

    Args:
        X_train, y_train: Training fold.
        X_val, y_val: Validation fold used for early stopping and scoring.
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The validation score recorded by ``LightGBM.fit``.
    """
    params = {
        # 'boosting': trial.suggest_categorical('boosting', ['gbdt', 'goss']),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'num_leaves': trial.suggest_int('num_leaves', 10, 200, step=2),
        # The Optuna name must match the LightGBM key: ``study.best_params``
        # is keyed by this name and is passed straight back to LightGBM.
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 50, step=2),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.01, 0.99),
        'bagging_freq': trial.suggest_int('bagging_freq', 0, 50, step=5),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.0, 10.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.0, 10.0),
    }

    lr = LightGBM(params=params)
    lr.fit(X_train, y_train, X_val, y_val)
    return lr.score

In [None]:
import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class OptunaEarlyStopCallback(object):
    """Optuna callback that stops a study once its best value stalls.

    The study is stopped when more than ``patient`` consecutive callback
    invocations pass without ``study.best_value`` improving. Improvement
    follows the study's direction (lower for minimize, higher for maximize).

    Args:
        patient: Number of consecutive non-improving trials tolerated.
    """

    def __init__(self, patient: int):
        self.patient = patient
        self._no_improve_count = 0
        # Best value seen so far, sign-normalised so that lower is better
        # (negated for maximize studies).
        self._best_value = float('inf')

    def __call__(self, study: optuna.study.Study, trial: optuna.trial.FrozenTrial) -> None:
        try:
            best = study.best_value
        except ValueError:
            # No trial has completed yet, so there is nothing to compare.
            return

        # Normalise so a single "<" comparison works for both directions.
        if study.direction.name == 'MAXIMIZE':
            best = -best

        if best < self._best_value:
            self._best_value = best
            self._no_improve_count = 0
        else:
            self._no_improve_count += 1

        if self._no_improve_count > self.patient:
            logger.info('Trial %s : early_stopping', trial.number)
            study.stop()

In [None]:
from functools import partial

# For every fold: tune hyperparameters with Optuna (stopping the study early
# when it stalls), then refit a model with the best parameters and keep it.
models = []

for i, (X_train, X_val, y_train, y_val) in enumerate(KFold_train_val(df.train_cat, df.y), 1):
    print(f'validation : {i}')
    start = time.time()

    # Bind this fold's data; Optuna supplies the remaining `trial` argument.
    f = partial(lgb_objective, X_train, X_val, y_train, y_val)
    study_stop_cb = OptunaEarlyStopCallback(patient=15)
    study = optuna.create_study()
    study.optimize(f, n_trials=100, callbacks=[study_stop_cb])

    print(f'best_params: {study.best_params}')
    lr = LightGBM(params=study.best_params)
    lr.fit(X_train, y_train, X_val, y_val)
    models.append(lr)

    print(f'elapsed_time : {time.time() - start}')
    print(f'score : {lr.score}\n')

# Mean of the per-fold validation scores of the tuned models.
score = [model.score for model in models]
print(f'mean score : {np.mean(score)}')

In [None]:
# One column of test predictions per tuned fold model. The row count comes
# from the test frame itself rather than a hard-coded dataset size.
predict_values = np.zeros((len(df.test_cat), len(models)))

for i, model in enumerate(models):
    predict_values[:, i] = model.predict(df.test_cat)

predict_values, predict_values.shape

In [None]:
# Average the tuned models' predictions row-wise and write the submission file.
submit_df = pd.DataFrame(
    {
        'id': df.test.index,
        'target': np.mean(predict_values, axis=1),
    }
)
submit_df.to_csv('submit2.csv', index=False)