### Library import

In [None]:
import numpy as np
import pandas as pd

from lightgbm import LGBMClassifier

from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
# from sklearn.impute import SimpleImputer     # not needed: the data was verified to contain no null values

from sklearn.metrics import roc_auc_score, classification_report

import matplotlib.pyplot as plt
import seaborn as sns

import warnings

import optuna
from optuna.samplers import TPESampler
from tqdm import tqdm


# Global plotting / display configuration for the notebook.

# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

# The bundled 'seaborn' style sheet was renamed to 'seaborn-v0_8' in
# Matplotlib 3.6 and the old name was removed in later releases, so pick
# whichever name this Matplotlib installation actually provides.
mpl_style = 'seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8'
plt.style.use(mpl_style)

sns.set(font_scale=1)
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# NOTE(review): this silences *all* warnings, including deprecation notices
# from pandas / seaborn / LightGBM.
warnings.filterwarnings('ignore')

print("Let's start!")

### Data Load

In [None]:
# Read the train and test CSV files, using the `id` column as the row index.
train_path = '../input/tabular-playground-series-nov-2021/train.csv'
test_path = '../input/tabular-playground-series-nov-2021/test.csv'

trn = pd.read_csv(train_path, index_col='id')
tst = pd.read_csv(test_path, index_col='id')

# Shape followed by the first three rows, train first and then test.
for frame in (trn, tst):
    display(frame.shape, frame.head(3))

### Check for the existence of null data

In [None]:
# Missing-value report: per-column null counts (largest first) for both
# datasets; only columns that actually contain nulls are printed.

null_trn = trn.isnull().sum().sort_values(ascending=False)
null_tst = tst.isnull().sum().sort_values(ascending=False)

print("null data list of trn set\n")
for col_name, n_missing in null_trn[null_trn > 0].items():
    print(col_name, n_missing)

print("================================\n")
print("null data list of tst set")
for col_name, n_missing in null_tst[null_tst > 0].items():
    print(col_name, n_missing)

### Target variable

In [None]:
# Target values distribution: one bar per class, annotated with its count.

# Pass the column through the `x=` keyword. Handing the Series over
# positionally is deprecated in seaborn 0.11 and, in later releases, the
# first positional argument is `data`, which would change what gets plotted.
ax = sns.countplot(x=trn['target'])

# Write the count above each bar.
ax.bar_label(ax.containers[0])
# Fixed y-range; presumably chosen to leave headroom for the bar labels.
ax.set_ylim(0, 350000)

plt.show()

In [None]:
# Column layout: every column except the last is a predictor; the last one is
# the label.
all_columns = trn.columns
features, target = all_columns[:-1], all_columns[-1]

### Compare the distribution of train data and that of test data

In [None]:
# Thanks to https://www.kaggle.com/maximkazantsev/tps-11-21-eda-xgboost-optuna#Data-preprocessing
#
# Overlay the train and test histograms of every feature in a grid of small
# panels so that distribution differences between the two sets are visible.

df = pd.concat([trn[features], tst[features]], axis=0)

columns = df.columns.values

# Common histogram range per feature (over train + test), computed once
# instead of once per hist() call.
col_min = df.min()
col_max = df.max()

cols = 5
# Ceiling division: exactly as many rows as needed, so no blank row is added
# when the number of features is a multiple of `cols`.
rows = max(1, (len(columns) + cols - 1) // cols)

# squeeze=False keeps `axs` two-dimensional even when there is a single row.
fig, axs = plt.subplots(ncols=cols, nrows=rows, figsize=(16,65), sharex=False, squeeze=False)

plt.subplots_adjust(hspace = 0.3)

# axs.flat walks the grid row by row, matching the feature order.
for i, ax in enumerate(axs.flat):
    if i >= len(columns):
        # Unused panel at the end of the last row.
        ax.set_visible(False)
        continue

    name = columns[i]
    bin_range = (col_min[name], col_max[name])

    ax.hist(trn[name].values,
            range=bin_range,
            bins=40,
            color="deepskyblue",
            edgecolor="black",
            alpha=0.7,
            label="Train Dataset")
    ax.hist(tst[name].values,
            range=bin_range,
            bins=40,
            color="palevioletred",
            edgecolor="black",
            alpha=0.7,
            label="Test Dataset")

    ax.set_title(name, fontsize=12, pad=5)
    # Pin the tick positions before relabelling them in thousands ("k").
    ax.set_yticks(ax.get_yticks())
    ax.set_yticklabels([str(int(tick/1000))+"k" for tick in ax.get_yticks()])
    ax.tick_params(axis="y", labelsize=10)
    ax.tick_params(axis="x", labelsize=10)
    ax.grid(axis="y")
    # A single legend (on the first panel) is enough for the whole grid.
    if i == 0:
        ax.legend(fontsize=10)

plt.show();

### Standardization of the numerical variables using StandardScaler

In [None]:
# scaling
#
# Standardize every feature column. The scaler is fitted on the training data
# only and the *same* fitted statistics are then applied to the test data.
# Re-fitting on the test set (as fit_transform on `tst` would do) would
# standardize train and test with different means / variances, so the model
# would see test features on a different scale than the one it was trained on.

ss = StandardScaler()

trn[features] = ss.fit_transform(trn[features])
tst[features] = ss.transform(tst[features])

display(trn.head(3), tst.head(3))

In [None]:
# Separate the predictor columns from the label column. No rows are held out
# here; this only selects columns.

X_trn, y_trn = trn[features], trn[target]
X_tst = tst[features]

display(X_trn.shape, y_trn.shape, X_tst.shape)

In [None]:
# k-fold cross validation, trial with the default parameter.


# RANDOM_SEED = 42
# n_splits = 5
# skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = RANDOM_SEED)

# tst_preds = []
# mean_auc = 0

# model = LGBMClassifier(random_state = RANDOM_SEED)

# for fold, (trn_idx, val_idx) in tqdm(enumerate(skf.split(X_trn, y_trn))):
#     X_train, X_val = X_trn.loc[trn_idx], X_trn.loc[val_idx]
#     y_train, y_val = y_trn.loc[trn_idx], y_trn.loc[val_idx]
    
#     model.fit(X_train, y_train,
#              verbose = False,
#              eval_set = [(X_train, y_train), (X_val, y_val)],
#              eval_metric = 'auc',
#              early_stopping_rounds = 100)
    
#     y_pred = model.predict_proba(X_val)
#     score = roc_auc_score(y_val, y_pred[:,1])
#     mean_auc += score
    
#     print(f"Fold {fold}'s score: {score}")
        
#     tst_preds.append(model.predict_proba(X_tst)[:, 1])

# print("==========================================")
# print(f"Mean auc of all folds: {mean_auc / n_splits}")

### Hyperparameter optimization using Optuna

In [None]:
# # HPO using Optuna

# def lgb_objective(trial):
#     params = {
#         'boosting_type': 'gbdt',
#         'objective': 'binary',
#         'n_estimators': trial.suggest_int("n_estimators", 64, 8192),
#         'learning_rate': trial.suggest_float("learning_rate", 1e-3, 0.25, log=True),
#         'num_leaves': trial.suggest_int("num_leaves", 20, 3000),
#         'max_depth': trial.suggest_int("max_depth", 3, 12),
#         'feature_fraction': trial.suggest_float("feature_fraction", 0.1, 1.0),
#         'min_gain_to_split' : trial.suggest_int('min_gain_to_split', 0, 15),
#         'min_data_in_leaf' : trial.suggest_int("min_data_in_leaf", 100, 1000),
#         'lambda_l1': trial.suggest_loguniform("lambda_l1", 1e-8, 100.0),
#         'lambda_l2': trial.suggest_loguniform("lambda_l2", 1e-8, 100.0),
#         'bagging_fraction' : trial.suggest_float("bagging_fraction", 0, 0.8),
#         'bagging_freq' : trial.suggest_int("bagging_freq", 1, 100),
#         'seed': 42,
#         'deterministic': True,
#         'metric' : 'auc',
#         'verbose':-1
#     }
    
#     X_train, X_val, y_train, y_val = train_test_split(X_trn, y_trn, test_size = 0.3, random_state = 42)
    
#     model = LGBMClassifier(**params)
#     model.fit(X_train, y_train,
#              eval_set = [(X_train, y_train), (X_val, y_val)],
#              early_stopping_rounds = 100,
#              eval_metric = 'auc',
#              verbose = False
#              )
#     pred_val = model.predict(X_val)
    
#     return roc_auc_score(y_val, pred_val)

In [None]:
# sampler = TPESampler(seed = 42)
# study = optuna.create_study(study_name = 'lgbm_hpo',
#                            direction = 'maximize',
#                            sampler = sampler)
# study.optimize(lgb_objective, n_trials = 10)

# print("Best AUC:", study.best_value)
# print("Best params:", study.best_params)

In [None]:
# params = study.best_params

In [None]:
# Fixed LightGBM hyperparameters passed to LGBMClassifier below.
# NOTE(review): presumably the best parameters found by the Optuna study in
# the commented-out cells above - confirm.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    n_estimators=7276,
    learning_rate=0.013562603384785458,
    num_leaves=376,
    max_depth=10,
    feature_fraction=0.7847065437552077,
    min_gain_to_split=8,
    min_data_in_leaf=794,
    lambda_l1=0.0008668739724852811,
    lambda_l2=0.0016878284140548435,
    bagging_fraction=0.3420328146868397,
    bagging_freq=3,
    seed=42,
    deterministic=True,
    metric='auc',
    verbose=-1,
)

### Out Of Folds ensemble

In [None]:
# Stratified k-fold training: fit the model on each fold, score it on the
# held-out part with ROC AUC, and collect that fold's test-set probabilities
# so they can be averaged afterwards.
RANDOM_SEED = 42
n_splits = 5
skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = RANDOM_SEED)

tst_preds = []   # one array of positive-class test probabilities per fold
mean_auc = 0     # running sum of fold AUCs; divided by n_splits when reported

model = LGBMClassifier(**params)

# total= lets tqdm show progress out of n_splits (enumerate has no length).
for fold, (trn_idx, val_idx) in tqdm(enumerate(skf.split(X_trn, y_trn)), total = n_splits):
    # skf.split yields *positional* row numbers, while the frames are indexed
    # by the `id` column, so rows must be selected with .iloc. Using .loc
    # would only be correct if the ids happened to be exactly 0..n-1 in order.
    X_train, X_val = X_trn.iloc[trn_idx], X_trn.iloc[val_idx]
    y_train, y_val = y_trn.iloc[trn_idx], y_trn.iloc[val_idx]

    # NOTE(review): the `verbose` and `early_stopping_rounds` fit keywords
    # were removed in LightGBM 4.0; newer versions need the callback API.
    model.fit(X_train, y_train,
             verbose = False,
             eval_set = [(X_train, y_train), (X_val, y_val)],
             eval_metric = 'auc',
             early_stopping_rounds = 100)

    # Column 1 of predict_proba is the positive-class probability.
    y_pred = model.predict_proba(X_val)
    score = roc_auc_score(y_val, y_pred[:,1])
    mean_auc += score

    print(f"Fold {fold}'s score: {score}")

    tst_preds.append(model.predict_proba(X_tst)[:, 1])

print("==========================================")
print(f"Mean auc of all folds: {mean_auc / n_splits}")

### Prediction and submission

In [None]:
# Average the per-fold test probabilities and place them in the `target`
# column of the sample submission.
# NOTE(review): this assumes the sample submission rows are in the same order
# as the test set - confirm.
final_preds = np.asarray(tst_preds).mean(axis=0)
submission = pd.read_csv(
    '../input/tabular-playground-series-nov-2021/sample_submission.csv'
).assign(target=final_preds)
submission

In [None]:
# Write the submission to submission.csv in the working directory, without the row index.
submission.to_csv('submission.csv', index = False)

### 