In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import optuna
import matplotlib.pyplot as plt
%matplotlib inline

import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train = pd.read_csv(filepath_or_buffer = '../input/tabular-playground-series-mar-2021/train.csv')
test = pd.read_csv(filepath_or_buffer = '../input/tabular-playground-series-mar-2021/test.csv')

**In this notebook I want to check which of CatBoost, XGB and LGBM performs best after Optuna optimization with 30 trials each. As a novice, I've heard a lot about how fast CatBoost is, and that XGB is the weapon of champions. Well, it's time to check it out in person.**

# EDA

In [None]:
# Preview the first three rows of the training table.
train.head(n = 3)

In [None]:
# Print column dtypes and non-null counts for the training table.
train.info()

There are no NA values. It's good.

In [None]:
# Preview the first three rows of the test table.
test.head(n = 3)

In [None]:
# Print column dtypes and non-null counts for the test table.
test.info()

There are no NA values here either.

Let's look at the correlation between features

In [None]:
# Correlation heatmap of the numeric columns (features and target), without the 'id' column.
trheat = train.drop('id', axis = 1)

# Restrict to numeric columns explicitly: DataFrame.corr() raises on non-numeric
# columns in recent pandas versions, and older versions silently dropped them anyway.
# Compute the matrix once and reuse it for both the mask and the plot.
corr = trheat.select_dtypes(include = 'number').corr()

# Boolean mask hiding the upper triangle (diagonal included). Building it from the
# matrix shape rather than from the correlation values means an exactly-zero
# correlation in the upper triangle is still hidden.
matrix = np.triu(np.ones_like(corr, dtype = bool))

plt.figure(figsize=(15, 10))
sns.heatmap(corr, annot = True, cmap = 'YlGn', fmt=".2f", mask = matrix, vmin = -1, vmax = 1, linewidths = 0.1, linecolor = 'white')

Let's look at the class distribution of the target

In [None]:
# Bar chart of how many training rows fall into each target class.
sns.set_style(style = 'whitegrid')
sns.set_palette(palette = 'Greens_r', n_colors = 2)
plt.figure(figsize = (15, 10))
sns.countplot(data = train, x = 'target')

# Preprocessing

In [None]:
# Label column, and the feature table with the label and 'id' columns removed.
y = train['target']
X = train.drop(columns = ['target', 'id'])

# Keep the test ids for the submission files, then remove them from the test features.
ID = test['id']
test = test.drop(columns = 'id')

# Column names grouped by dtype: numeric features vs. everything else (categorical).
num_cols = X.select_dtypes(include = 'number').columns.to_list()
cat_cols = X.select_dtypes(exclude = 'number').columns.to_list()

Encoding categorical features

In [None]:
# One-hot encode the categorical features on train and test together, so both
# tables end up with exactly the same dummy columns.
cols = cat_cols + num_cols
X_objs = len(X)  # number of training rows; marks where the stacked frame is split back

# Stack train on top of test (done once; the concat was previously repeated).
df = pd.concat(objs = [X[cols], test[cols]], axis = 0)
df = pd.get_dummies(df, columns = cat_cols)

# Split by position: the stacked frame's index holds duplicate labels
# (train and test indices overlap), so use explicit positional slicing.
X = df.iloc[:X_objs]
test = df.iloc[X_objs:]

Let's see what we got

In [None]:
# Preview the first three rows of the encoded training features.
X.head(n = 3)

In [None]:
# Preview the first three rows of the encoded test features.
test.head(n = 3)

# CatBoost and optuna's optimization

In [None]:
def objective(trial, data = X, target = y):
    """Optuna objective: hold-out ROC AUC of a CatBoost model for one trial.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample the hyperparameters for this run.
    data : feature table to split into train/validation parts
        (defaults to the notebook-level ``X``).
    target : labels aligned with ``data`` (defaults to the notebook-level ``y``).

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on the 20% validation split.
    """
    # Split the arguments, not the globals, so that a caller-supplied dataset is
    # actually used. The fixed random_state gives every trial the same split.
    X_train, X_val, y_train, y_val = train_test_split(data, target, test_size = 0.2, random_state = 0)

    params = {
        # Sampled hyperparameters
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.005, 0.02, 0.05, 0.08, 0.1]),
        'n_estimators': trial.suggest_int('n_estimators', 50, 3000),
        'max_bin': trial.suggest_int('max_bin', 200, 400),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 300),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.0001, 1.0, log = True),
        'subsample': trial.suggest_float('subsample', 0.1, 0.8),
        # Fixed settings (re-applied when the final model is fitted)
        'random_seed': 42,
        'task_type': 'GPU',
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'bootstrap_type': 'Poisson'
    }

    model = CatBoostClassifier(**params)
    model.fit(X_train, y_train, eval_set = [(X_val, y_val)], early_stopping_rounds = 222, verbose = False)

    # Score the probability of the positive class on the validation split.
    y_pred = model.predict_proba(X_val)[:,1]
    roc_auc = roc_auc_score(y_val, y_pred)

    return roc_auc

In [None]:
%%time
# Run a 30-trial study that maximizes the value returned by `objective`.
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 30)

# Report the trial count, the winning hyperparameters and their score.
for label, value in (
    ('Number of finished trials:', len(study.trials)),
    ('Best trial:', study.best_params),
    ('Best value:', study.best_value),
):
    print(label, value)

**Visualize optuna's optimization CatBoost**

In [None]:
# Plot the optimization history of the CatBoost study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Plot the hyperparameter importances for the CatBoost study.
optuna.visualization.plot_param_importances(study)

**Fit the CatBoost model with the best parameters**

In [None]:
# Best sampled hyperparameters plus the fixed settings used inside the objective.
paramsCB = dict(study.best_trial.params)
paramsCB.update({
    'task_type': 'GPU',
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'random_seed': 42,
    'bootstrap_type': 'Poisson',
})

In [None]:
%%time
from sklearn.model_selection import KFold

# 10-fold CV: fit one CatBoost model per fold and average their test probabilities.
folds = KFold(n_splits = 10, shuffle = True, random_state = 42)

# Running average of the positive-class probability for every test row.
predictions = np.zeros(len(test))

for fold, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = CatBoostClassifier(**paramsCB)
    model.fit(
        X_train,
        y_train,
        eval_set = [(X_val, y_val)],
        verbose = False,
        early_stopping_rounds = 222,
    )

    # Each fold contributes 1/n_splits of the final prediction.
    fold_proba = model.predict_proba(test)[:, 1]
    predictions += fold_proba / folds.n_splits

In [None]:
# Pair the saved test ids with the averaged CatBoost predictions and write the CSV.
submission = pd.DataFrame(data = {'id': ID, 'target': predictions})
submission.to_csv(path_or_buf = 'submissionCB.csv', index = False)

# XGB and optuna's optimization

In [None]:
def objective(trial, data = X, target = y):
    """Optuna objective: hold-out ROC AUC of an XGBoost model for one trial.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample the hyperparameters for this run.
    data : feature table to split into train/validation parts
        (defaults to the notebook-level ``X``).
    target : labels aligned with ``data`` (defaults to the notebook-level ``y``).

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on the 20% validation split.
    """
    # Split the arguments, not the globals, so that a caller-supplied dataset is
    # actually used. The fixed random_state gives every trial the same split.
    X_train, X_val, y_train, y_val = train_test_split(data, target, test_size = 0.2, random_state = 0)

    params = {
        # Sampled hyperparameters
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.005, 0.02, 0.05, 0.08, 0.1]),
        'n_estimators': trial.suggest_int('n_estimators', 50, 3000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'gamma': trial.suggest_float('gamma', 0.0001, 1.0, log = True),
        'alpha': trial.suggest_float('alpha', 0.0001, 10.0, log = True),
        'lambda': trial.suggest_float('lambda', 0.0001, 10.0, log = True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.8),
        'subsample': trial.suggest_float('subsample', 0.1, 0.8),
        # Fixed settings (re-applied when the final model is fitted)
        'tree_method': 'gpu_hist',
        'booster': 'gbtree',
        'random_state': 42,
        'use_label_encoder': False,
        'eval_metric': 'auc'
    }

    model = XGBClassifier(**params)
    model.fit(X_train, y_train, eval_set = [(X_val, y_val)], early_stopping_rounds = 222, verbose = False)

    # Score the probability of the positive class on the validation split.
    y_pred = model.predict_proba(X_val)[:,1]
    roc_auc = roc_auc_score(y_val, y_pred)

    return roc_auc

In [None]:
%%time
# Run a 30-trial study that maximizes the value returned by `objective`.
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 30)

# Report the trial count, the winning hyperparameters and their score.
for label, value in (
    ('Number of finished trials:', len(study.trials)),
    ('Best trial:', study.best_params),
    ('Best value:', study.best_value),
):
    print(label, value)

**Visualize optuna's optimization XGB**

In [None]:
# Plot the optimization history of the XGB study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Plot the hyperparameter importances for the XGB study.
optuna.visualization.plot_param_importances(study)

**Fit the XGB model with the best parameters**

In [None]:
# Best sampled hyperparameters plus the fixed settings used inside the objective.
paramsXGB = dict(study.best_trial.params)
paramsXGB.update({
    'tree_method': 'gpu_hist',
    'booster': 'gbtree',
    'eval_metric': 'auc',
    'random_state': 42,
    'use_label_encoder': False,
})

In [None]:
%%time
from sklearn.model_selection import KFold

# 10-fold CV: fit one XGB model per fold and average their test probabilities.
folds = KFold(n_splits = 10, shuffle = True, random_state = 42)

# Running average of the positive-class probability for every test row.
predictions = np.zeros(len(test))

for fold, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = XGBClassifier(**paramsXGB)
    model.fit(
        X_train,
        y_train,
        eval_set = [(X_val, y_val)],
        eval_metric = 'auc',
        verbose = False,
        early_stopping_rounds = 222,
    )

    # Each fold contributes 1/n_splits of the final prediction.
    fold_proba = model.predict_proba(test)[:, 1]
    predictions += fold_proba / folds.n_splits

In [None]:
# Pair the saved test ids with the averaged XGB predictions and write the CSV.
submission = pd.DataFrame(data = {'id': ID, 'target': predictions})
submission.to_csv(path_or_buf = 'submissionXGB.csv', index = False)

# LGBM and optuna's optimization

In [None]:
def objective(trial, data = X, target = y):
    """Optuna objective: hold-out ROC AUC of a LightGBM model for one trial.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample the hyperparameters for this run.
    data : feature table to split into train/validation parts
        (defaults to the notebook-level ``X``).
    target : labels aligned with ``data`` (defaults to the notebook-level ``y``).

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on the 20% validation split.
    """
    # Split the arguments, not the globals, so that a caller-supplied dataset is
    # actually used. The fixed random_state gives every trial the same split.
    X_train, X_val, y_train, y_val = train_test_split(data, target, test_size = 0.2, random_state = 0)

    params = {
        # Sampled hyperparameters
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 11, 333),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.02, 0.05, 0.005, 0.1]),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.5),
        'n_estimators': trial.suggest_int('n_estimators', 50, 3000),
        # Fixed settings (re-applied when the final model is fitted)
        'random_state': 42,
        'boosting_type': 'gbdt',
        'metric': 'AUC',
        'device': 'gpu'
    }

    model = LGBMClassifier(**params)
    model.fit(X_train, y_train, eval_set = [(X_val, y_val)], early_stopping_rounds = 222, verbose = False)

    # Score the probability of the positive class on the validation split.
    y_pred = model.predict_proba(X_val)[:,1]
    roc_auc = roc_auc_score(y_val, y_pred)

    return roc_auc

In [None]:
%%time
# Run a 30-trial study that maximizes the value returned by `objective`.
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 30)

# Report the trial count, the winning hyperparameters and their score.
for label, value in (
    ('Number of finished trials:', len(study.trials)),
    ('Best trial:', study.best_params),
    ('Best value:', study.best_value),
):
    print(label, value)

**Visualize optuna's optimization LGBM**

In [None]:
# Plot the optimization history of the LGBM study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Plot the hyperparameter importances for the LGBM study.
optuna.visualization.plot_param_importances(study)

**Fit the LGBM model with the best parameters**

In [None]:
# Best sampled hyperparameters plus the fixed settings used inside the objective.
paramsLGBM = study.best_trial.params
paramsLGBM['boosting_type'] = 'gbdt'
paramsLGBM['metric'] = 'AUC'
paramsLGBM['random_state'] = 42
# The objective tuned these hyperparameters with 'device': 'gpu'. Restore it here
# so the final models are trained under the same configuration as the trials,
# as the CatBoost and XGB cells do for their own fixed settings.
paramsLGBM['device'] = 'gpu'

In [None]:
%%time
from sklearn.model_selection import KFold

# 10-fold CV: fit one LGBM model per fold and average their test probabilities.
folds = KFold(n_splits = 10, shuffle = True, random_state = 42)

# Running average of the positive-class probability for every test row.
predictions = np.zeros(len(test))

for fold, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = LGBMClassifier(**paramsLGBM)
    model.fit(
        X_train,
        y_train,
        eval_set = [(X_val, y_val)],
        eval_metric = 'auc',
        verbose = False,
        early_stopping_rounds = 222,
    )

    # Each fold contributes 1/n_splits of the final prediction.
    fold_proba = model.predict_proba(test)[:, 1]
    predictions += fold_proba / folds.n_splits

In [None]:
# Pair the saved test ids with the averaged LGBM predictions and write the CSV.
submission = pd.DataFrame(data = {'id': ID, 'target': predictions})
submission.to_csv(path_or_buf = 'submissionLGBM.csv', index = False)

# Conclusion

In [None]:
# Summary table of the best score per algorithm.
# NOTE(review): the scores are hard-coded, presumably copied from earlier runs — confirm their source.
results = pd.DataFrame(
    {
        'Algorithm': ['CatBoost', 'XGB', 'LGBM'],
        'BestScore': [0.89144, 0.89414, 0.89490],
    }
)
results

**Well, based on the results in the table above, we can say that CatBoost has the lowest score, XGB showed an average result, and LGBM has the best result (so that's why everyone uses it in TPS). Of course, only 30 trials were done with Optuna, and CatBoost might achieve a better result with more, but this is just my little experiment, where all algorithms are compared under equal conditions. Conclusion: if we want to win TPS competitions on Kaggle and fight for every 0.001 or even 0.0001, then our choice is the slow but reliable one - LGBM.**