# Problem definition


-  Details:

According to the description: "The dataset is used for this competition is synthetic but based on a real dataset and generated using a CTGAN. The original dataset deals with predicting the amount of an insurance claim. Although the features are anonymized, they have properties relating to real-world features."

-  Solution:

A LightGBM model is tuned with Bayesian optimization using the [Optuna](https://optuna.readthedocs.io/en/stable/) library (both the model hyperparameters and the preprocessing choices are optimized). The objective is to maximize the area under the ROC curve (AUC).

<p align="right"><span style="color:firebrick">Don't forget to upvote if the notebook was useful! <i class="fas fa-hand-peace"></i></span> </p>

# Import dependencies

In [None]:
import numpy as np
import pandas as pd

from lightgbm import LGBMClassifier

from sklearn.preprocessing import PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances
from optuna.integration import LightGBMPruningCallback

from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import OneSidedSelection, NeighbourhoodCleaningRule, TomekLinks


# Notebook display limits: up to 1000 columns, but only 20 rows per frame.
for option_name, option_value in (('display.max_columns', 1000),
                                  ('display.max_rows', 20)):
    pd.set_option(option_name, option_value)

# Prepare data

In [None]:
# Train and test use their first CSV column as the index; the sample
# submission is read with the default integer index.
X_train, X_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../input/tabular-playground-series-mar-2021/train.csv',
                     '../input/tabular-playground-series-mar-2021/test.csv')
)
submission = pd.read_csv('../input/tabular-playground-series-mar-2021/sample_submission.csv')

In [None]:
# Cast every object-dtype column of both frames to pandas' categorical dtype
# (presumably so LightGBM can consume them as categorical features).
for frame in (X_train, X_test):
    object_columns = frame.columns[frame.dtypes == "object"].tolist()
    for column in object_columns:
        frame[column] = frame[column].astype('category')

In [None]:
# Split the training frame into the label column and the feature matrix.
y = X_train['target']
X = X_train.drop(columns='target')

# Number of stratified folds used for cross-validation during tuning.
K = 10

# LightGBM settings held constant for every trial (not searched by Optuna).
fixedparams = dict(
    random_state=42,
    n_estimators=10000,
    learning_rate=0.03,
    metric='auc',
    verbose=-1,
)

# Custom Functions

In [None]:
def model_instance(hyperparams, fixedparams):
    """Build the unfitted pipeline: preprocessor -> resampler -> LightGBM.

    Parameters
    ----------
    hyperparams : dict
        ``'clf'``: keyword arguments forwarded to ``LGBMClassifier``.
        ``'resample'``: ``'random'`` enables random under-sampling of the
        majority class; any other value disables resampling.
        ``'power'``: ``True`` enables a standardized Yeo-Johnson power
        transform of the columns whose name contains ``'cont'``.
    fixedparams : dict
        Extra ``LGBMClassifier`` keyword arguments shared by all trials.
        Its ``'random_state'`` entry (if any) also seeds the under-sampler.

    Returns
    -------
    imblearn.pipeline.Pipeline
        Pipeline with steps ``'preprocessor'``, ``'resample'`` and ``'clf'``;
        a disabled step is ``None``.
    """
    clf = LGBMClassifier(**hyperparams['clf'], **fixedparams)

    if hyperparams['resample'] == 'random':
        # Seed the sampler from the shared seed so that repeated runs draw the
        # same subsample; without it the search is not reproducible even
        # though the classifier and the CV splits are seeded.
        resample = RandomUnderSampler(
            sampling_strategy='majority',
            random_state=fixedparams.get('random_state'))
    else:
        resample = None

    # Equality check (not truthiness) is kept on purpose so this stays in
    # lock-step with the identical check in fit_with_stop.
    if hyperparams['power'] == True:
        # Column names come from the module-level training frame.
        cont = [col for col in X_train.columns if 'cont' in col]
        numeric_transformer = PowerTransformer(method='yeo-johnson',
                                               standardize=True)

        # NOTE(review): ColumnTransformer defaults to remainder='drop', so
        # with power enabled only the `cont` columns reach the classifier and
        # every other column is discarded -- confirm this is intended.
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, cont)])
    else:
        preprocessor = None

    pipe = Pipeline([('preprocessor', preprocessor),
                     ('resample', resample),
                     ('clf', clf)])
    return pipe

In [None]:
def fit_with_stop(pipe, X, y, X_val, y_val, trial, hyperparams, early_stopping_rounds = 50):
    """Fit ``pipe`` on ``(X, y)`` with early stopping on ``(X_val, y_val)``.

    Parameters
    ----------
    pipe : pipeline with a ``'clf'`` step (and a ``'preprocessor'`` step when
        ``hyperparams['power']`` is ``True``).
    X, y : training features and labels.
    X_val, y_val : validation features and labels used as LightGBM's
        ``eval_set``.
    trial : Optuna trial used for pruning, or ``0`` / ``None`` to fit without
        a pruning callback.
    hyperparams : dict; only ``hyperparams['power']`` is read here.
    early_stopping_rounds : int, rounds without AUC improvement on the
        validation set before training stops.

    Returns
    -------
    The same ``pipe`` object, fitted in place.
    """
    # Function-scope imports: only needed to pick the right LightGBM fit API.
    import inspect
    from lightgbm import early_stopping as lgb_early_stopping

    # 0 is the sentinel callers use for "no Optuna trial"; None is accepted
    # too so that the obvious spelling also works.
    if trial is None or trial == 0:
        callbacks = []
    else:
        callbacks = [LightGBMPruningCallback(trial, 'auc')]

    if hyperparams['power'] == True:
        # The pipeline only transforms the data passed to fit(); the eval_set
        # handed to the classifier must be transformed by hand. The
        # preprocessor is fitted here on the training data and is fitted
        # again inside pipe.fit() below.
        fitted_preprocessor = pipe.named_steps.preprocessor.fit(X)
        X_val = fitted_preprocessor.transform(X_val)

    fit_kwargs = {
        'clf__eval_set': [(X_val, y_val)],
        'clf__eval_metric': "auc",
    }

    # LightGBM >= 4.0 removed `early_stopping_rounds` and `verbose` from the
    # scikit-learn fit() signature in favour of callbacks. Inspect the
    # installed version so that both old and new releases work.
    clf_fit_params = inspect.signature(pipe.named_steps.clf.fit).parameters
    if 'early_stopping_rounds' in clf_fit_params:
        # Legacy API: same keyword arguments as before.
        fit_kwargs['clf__early_stopping_rounds'] = early_stopping_rounds
        fit_kwargs['clf__verbose'] = 0
        fit_kwargs['clf__callbacks'] = callbacks or None
    else:
        # Callback API: early stopping is a callback; verbose=False keeps it
        # silent like the legacy `verbose=0`.
        fit_kwargs['clf__callbacks'] = (
            [lgb_early_stopping(early_stopping_rounds, verbose=False)]
            + callbacks)

    pipe.fit(X, y, **fit_kwargs)
    return pipe

In [None]:
def evaluate(model, X, y):
    """Return the ROC AUC of ``model`` on ``(X, y)``.

    The score passed to ``roc_auc_score`` is the second column (index 1) of
    ``model.predict_proba(X)``.
    """
    class_one_proba = model.predict_proba(X)[:, 1]
    return roc_auc_score(y, class_one_proba)

In [None]:
def kfold_prediction(X, y, X_test, k, hyperparams, fixedparams, early_stopping_rounds = 50):
    """Average test-set predictions over ``k`` stratified cross-validation fits.

    For each fold the pipeline is fitted on the training part, early-stopped
    on the held-out part, and its ``predict_proba(X_test)[:, 1]`` contributes
    ``1 / k`` of the returned vector.

    Parameters
    ----------
    X, y : training features and labels (indexed positionally via ``.iloc``).
    X_test : features to predict.
    k : number of stratified folds (shuffled, seed 42).
    hyperparams, fixedparams : forwarded to ``model_instance``.
    early_stopping_rounds : forwarded to ``fit_with_stop``.

    Returns
    -------
    numpy.ndarray of length ``len(X_test)`` with the fold-averaged scores.
    """
    splitter = StratifiedKFold(n_splits=k, random_state=42, shuffle=True)
    pipeline = model_instance(hyperparams, fixedparams)
    averaged_pred = np.zeros(len(X_test))

    for i, (fit_rows, holdout_rows) in enumerate(splitter.split(X, y)):
        print(f"\n FOLD {i} ...")
        # Trial argument 0 means "no Optuna trial": no pruning callback.
        fitted = fit_with_stop(pipeline,
                               X.iloc[fit_rows], y.iloc[fit_rows],
                               X.iloc[holdout_rows], y.iloc[holdout_rows],
                               0, hyperparams,
                               early_stopping_rounds)
        fold_pred = fitted.predict_proba(X_test)[:, 1]
        averaged_pred += fold_pred / k

    return averaged_pred

In [None]:
def objective(trial):
    """Optuna objective: mean stratified K-fold validation AUC of one trial.

    Samples a configuration from ``trial``, builds the pipeline with
    ``model_instance`` and fits it on each of the ``K`` folds of the
    module-level ``X`` / ``y`` via ``fit_with_stop`` (which attaches the
    pruning callback for ``trial``).

    Returns
    -------
    float
        ``np.nanmean`` of the per-fold validation AUC scores.
    """
    # Pipeline switches are sampled first, then the classifier knobs, in a
    # fixed order. Both switches currently have a single choice, i.e. they
    # are effectively disabled.
    resample_choice = trial.suggest_categorical("resample", [None])
    power_choice = trial.suggest_categorical("power", [False])

    # Knobs left out of the search for now: max_bin, min_data_in_leaf,
    # min_data_in_bin, min_split_gain, subsample.
    clf_params = {
        'boosting_type': trial.suggest_categorical("boosting_type", ['gbdt']),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'max_depth': trial.suggest_int('max_depth', 1, 64),
        'max_delta_step': trial.suggest_int('max_delta_step', 1, 15),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 10),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.5),
        'cat_smooth': trial.suggest_float('cat_smooth', 10, 100.0),
        'cat_l2': trial.suggest_int('cat_l2', 1, 20),
    }

    hyperparams = {
        'resample': resample_choice,
        'power': power_choice,
        'clf': clf_params,
    }

    pipeline = model_instance(hyperparams, fixedparams)
    splitter = StratifiedKFold(n_splits=K, random_state=42, shuffle=True)

    fold_scores = []
    for fit_rows, holdout_rows in splitter.split(X, y):
        X_holdout = X.iloc[holdout_rows]
        y_holdout = y.iloc[holdout_rows]

        fitted = fit_with_stop(pipeline,
                               X.iloc[fit_rows], y.iloc[fit_rows],
                               X_holdout, y_holdout,
                               trial, hyperparams)
        fold_scores.append(evaluate(fitted, X_holdout, y_holdout))

    return np.nanmean(fold_scores)


# Model Tuning

In [None]:
# The objective (mean CV AUC) is maximized; a Hyperband pruner is attached
# to the study.
hyperband_pruner = optuna.pruners.HyperbandPruner()
study = optuna.create_study(direction='maximize', pruner=hyperband_pruner)

In [None]:
%%time
import warnings

# Suppress all Python warnings from this point on.
warnings.filterwarnings('ignore')

# Search budget in seconds (five minutes). With n_trials=None the timeout is
# the only stopping criterion.
TUNING_TIMEOUT_SECONDS = 5 * 60

study.optimize(
    objective,
    n_trials=None,
    timeout=TUNING_TIMEOUT_SECONDS,
    n_jobs=-1,
    gc_after_trial=False,
)

# Evaluate optimization

In [None]:
# Tabulate the trials recorded by the study.
study.trials_dataframe()

In [None]:
# Best objective value so far, i.e. the mean cross-validated AUC returned by `objective`.
study.best_value

In [None]:
# Plot how the objective value evolved over the trials.
plot_optimization_history(study)

In [None]:
# Parallel-coordinate plot of the sampled hyperparameters against the objective value.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# Plot Optuna's estimate of each hyperparameter's importance for the objective.
plot_param_importances(study)

In [None]:
# Hyperparameter values of the best trial.
study.best_params

Update the best parameters below with the result of a long tuning run:

In [None]:
# Hard-coded configuration, in the same flat layout as `study.best_params`:
# the two pipeline switches first, then the LGBMClassifier keyword arguments.
best_params = dict(
    resample=None,
    power=False,
    boosting_type='gbdt',
    num_leaves=250,
    min_child_samples=75,
    max_depth=63,
    max_delta_step=5,
    reg_alpha=3.4218738754608045,
    reg_lambda=3.0962347920614643,
    colsample_bytree=0.3189138428868663,
    cat_smooth=45.74149068289875,
    cat_l2=19,
)

# Prepare to submit

In [None]:
# Reshape the flat `best_params` into the nested layout `model_instance`
# expects: the pipeline switches move to the top level and everything else
# stays under 'clf'. A copy is used so `best_params` itself is not modified.
clf_params = dict(best_params)
final_params = {
    'clf': clf_params,
    'resample': clf_params.pop('resample'),
    'power': clf_params.pop('power'),
}

# The final fit uses a smaller learning rate than the tuning runs (0.03).
# This mutates the shared `fixedparams` dict.
fixedparams['learning_rate'] = 0.005

In [None]:
%%time

# Average the test predictions of 10 cross-validated fits (early stopping
# patience of 500 rounds) and write them out as the submission file.
test_scores = kfold_prediction(X, y, X_test,
                               k=10,
                               hyperparams=final_params,
                               fixedparams=fixedparams,
                               early_stopping_rounds=500)
submission.loc[:, 'target'] = test_scores
submission.to_csv('submission.csv', index=False)

# References:

- <https://optuna.readthedocs.io/>
- <https://www.kaggle.com/rmiperrier/tps-mar-lgbm-optuna>
- <https://towardsdatascience.com/how-to-make-your-model-awesome-with-optuna-b56d490368af>
- <https://optuna.readthedocs.io/en/v1.0.0/tutorial/pruning.html>
- <https://www.kaggle.com/kst6690/dsb2019-tuning-lightgbm-parameter-using-optuna>