# Libraries
---

In [None]:
import pandas as pd
import numpy as np
import random
import time
import sys
import os

import cudf

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score

import xgboost as xgb
import optuna
from optuna.samplers import TPESampler

import shap

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm

import warnings
warnings.simplefilter('ignore')

# Parameters
---

In [None]:
# Number of stratified CV folds; also the divisor used when averaging the
# per-fold test predictions.
N_SPLITS = 5
# Passed to XGBoost as `n_estimators`. Every fit in this notebook also uses
# early stopping, so this acts as an upper bound on boosting rounds.
N_ESTIMATORS = 80000
# Passed to XGBoost as `learning_rate`.
LEARNING_RATE = 1e-2
# Passed to `fit` as `early_stopping_rounds`.
EARLY_STOPPING_ROUNDS = 200
# Passed to `fit` as `verbose` in the cross-validation loop.
VERBOSE = 1000
# Shared seed: Python/NumPy RNGs, the hold-out split, XGBoost and the CV folds.
SEED = 42

In [None]:
# Optuna parameters
# Maximum number of trials (passed to `study.optimize` as `n_trials`).
N_TRIALS = 300
# Search time budget (passed to `study.optimize` as `timeout`);
# 3 * 60 * 60 reads as three hours expressed in seconds.
TRAIN_TIME = 3 * 60 * 60

In [None]:
def seed_everything(seed=42):
    """Make the Python-level sources of randomness repeatable.

    Exports ``PYTHONHASHSEED`` into ``os.environ`` and seeds both the
    ``random`` module and NumPy's legacy global generator with ``seed``.

    Args:
        seed: Value used for every seeding call (default 42).
    """
    os.environ['PYTHONHASHSEED'] = f"{seed}"
    # Same seed for both global generators.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
seed_everything(SEED)

# Datasets
---

In [None]:
%%time

INPUT = "../input/tabular-playground-series-nov-2021/"

# Load the training set, the test set and the sample submission template.
train = pd.read_csv(INPUT + "train.csv")
test = pd.read_csv(INPUT + "test.csv")
submission = pd.read_csv(INPUT + "sample_submission.csv")

TARGET = 'target'

# Select feature columns by prefix. The earlier substring test (`'f' in col`)
# would also have matched any column that merely contains the letter "f".
# NOTE(review): assumes features are named with a leading "f" — confirm
# against the CSV header.
features = [col for col in test.columns if col.startswith('f')]

# Fail early with a clear message instead of deep inside model fitting.
assert features, "no feature columns (prefix 'f') found in test.csv"
assert TARGET in train.columns, f"'{TARGET}' column missing from train.csv"

# Standardization
---

In [None]:
# Learn the per-feature mean/scale from the training rows only, then apply
# that same transformation to both frames, overwriting their feature columns.
scaler = StandardScaler().fit(train[features])

for frame in (train, test):
    frame[features] = scaler.transform(frame[features])

In [None]:
# `DataFrame.info()` prints its summary itself and returns None, so wrapping
# it in `display(...)` only appended a stray "None" to the cell output.
train.info()
display(train.head())

In [None]:
# `DataFrame.info()` prints its summary itself and returns None, so wrapping
# it in `display(...)` only appended a stray "None" to the cell output.
test.info()
display(test.head())

# Hyperparameter tuning
---

In [None]:
def objective(trial, X=train[features], y=train[TARGET]):
    """Optuna objective: hold-out ROC AUC of one XGBoost configuration.

    Splits ``X``/``y`` into a stratified 80/20 train/validation pair (fixed by
    ``SEED``), samples tree and regularisation hyperparameters from ``trial``,
    fits an ``XGBClassifier`` with early stopping on validation AUC, and scores
    the validation predictions.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature frame. The default is bound when this cell is executed,
            i.e. to ``train[features]`` as it exists at that moment.
        y: Binary labels aligned with ``X`` (default ``train[TARGET]``, bound
            at definition time as well).

    Returns:
        ROC AUC on the validation split (to be maximised).
    """
    # Stratify on the labels that were actually passed in. The previous
    # `stratify=train[TARGET]` ignored `y` and broke for any caller supplying
    # data other than the full training frame.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.2, shuffle=True, random_state=SEED, stratify=y
    )

    xgb_params = {
        # --- searched hyperparameters (order defines the order in trial.params) ---
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'subsample': trial.suggest_float('subsample', 0.2, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.9),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.2, 0.9),
        # `suggest_float(..., log=True)` replaces the deprecated
        # `suggest_loguniform`; the sampled distribution is the same.
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-1, 1e3, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 1e2, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 1e2, log=True),
        'gamma': trial.suggest_float('gamma', 1e-4, 1e2, log=True),
        # --- fixed settings ---
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'n_estimators': N_ESTIMATORS,
        'seed': SEED,
        'learning_rate': LEARNING_RATE,
        'use_label_encoder': False,
        'importance_type': 'gain',
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
    }

    model = xgb.XGBClassifier(**xgb_params)
    # N_ESTIMATORS is only a ceiling: training stops once validation AUC has
    # not improved for EARLY_STOPPING_ROUNDS rounds.
    model.fit(X_train,
              y_train,
              eval_set=[(X_valid, y_valid)],
              eval_metric='auc',
              early_stopping_rounds=EARLY_STOPPING_ROUNDS,
              verbose=False
             )

    # Last column of predict_proba is the positive-class probability.
    preds = model.predict_proba(X_valid)[:, -1]
    score = roc_auc_score(y_valid, preds)

    return score

In [None]:
# Seed the TPE sampler as well: `seed_everything` does not reach Optuna's own
# RNG, so an unseeded `TPESampler()` proposed different hyperparameters on
# every run even though the rest of the notebook is seeded.
study = optuna.create_study(sampler=TPESampler(seed=SEED), study_name='TPS09', direction='maximize')

# The search ends at whichever limit is hit first: N_TRIALS trials or the
# TRAIN_TIME budget.
study.optimize(objective, n_trials=N_TRIALS, timeout=TRAIN_TIME, show_progress_bar=True)

print('Number of finished trials: ', len(study.trials))
print('Best trial:')
# `trial` is reused further down to build the final model parameters.
trial = study.best_trial

print('\tValue: {}'.format(trial.value))
print('\tParams: ')
for key, value in trial.params.items():
    print('\t\t{}: {}'.format(key, value))

In [None]:
# Optimization history of the study (objective value per trial).
optuna.visualization.plot_optimization_history(study)

In [None]:
# Parallel-coordinate view of the hyperparameters sampled in the study.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# Slice plot of the study: one panel per searched hyperparameter.
optuna.visualization.plot_slice(study)

In [None]:
# Hyperparameter importances as estimated by Optuna for this study.
optuna.visualization.plot_param_importances(study)

# XGBoost
---

In [None]:
# Build the final model configuration from the best trial's hyperparameters
# plus the fixed settings used during tuning. A new dict is created on
# purpose: `xgb_params = trial.params` aliased the trial's own dict, so the
# fixed settings below were being written into `trial.params` as well.
xgb_params = {
    **trial.params,
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'n_estimators': N_ESTIMATORS,
    'seed': SEED,
    'learning_rate': LEARNING_RATE,
    'use_label_encoder': False,
    'importance_type': 'gain',
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
}

display(xgb_params)

## Cross-validation

In [None]:
# Loop-invariant views, sliced once instead of on every fold (previously
# `train[features]` was materialised twice per fold and `test[features]`
# once per fold).
X_all, y_all = train[features], train[TARGET]
X_test = test[features]

# Accumulators:
#   xgb_oof     - out-of-fold positive-class probability for every training row
#   xgb_pred    - test probabilities averaged over the N_SPLITS fold models
#   shap_values - out-of-fold SHAP values, one row per training row
xgb_oof = np.zeros(train.shape[0])
xgb_pred = np.zeros(test.shape[0])
shap_values = np.zeros((train.shape[0], X_all.shape[1]))

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

for fold, (trn_idx, val_idx) in enumerate(skf.split(X=train, y=y_all)):
    print(f"===== fold {fold} =====")
    X_train, y_train = X_all.iloc[trn_idx], y_all.iloc[trn_idx]
    X_valid, y_valid = X_all.iloc[val_idx], y_all.iloc[val_idx]

    start = time.time()
    model = xgb.XGBClassifier(**xgb_params)
    # Early stopping monitors AUC on this fold's validation rows.
    model.fit(X_train,
              y_train,
              eval_set=[(X_valid, y_valid)],
              eval_metric='auc',
              early_stopping_rounds=EARLY_STOPPING_ROUNDS,
              verbose=VERBOSE
             )

    # Each training row is explained and scored by the one model that did not
    # train on it; validation indices are disjoint across folds, so every row
    # is written exactly once.
    shap_values[val_idx] = shap.TreeExplainer(model).shap_values(X_valid)
    xgb_oof[val_idx] = model.predict_proba(X_valid)[:, -1]
    # Equal-weight average of the fold models' test predictions.
    xgb_pred += model.predict_proba(X_test)[:, -1] / N_SPLITS

    # Elapsed time covers fitting, SHAP computation and both predictions.
    elapsed = time.time() - start
    auc = roc_auc_score(y_valid, xgb_oof[val_idx])
    print(f"fold {fold} - xgb auc: {auc:.6f}, elapsed time: {elapsed:.2f}sec\n")

print(f"oof xgb auc = {roc_auc_score(train[TARGET], xgb_oof)}")

# Persist the predictions so they can be reused without re-training.
np.save("xgb_oof.npy", xgb_oof)
np.save("xgb_pred.npy", xgb_pred)

In [None]:
# Train-vs-test KDE for every feature. Off by default (one KDE pair per
# feature is slow); flip the flag to render the grid.
PLOT_FEATURE_DISTRIBUTIONS = False

if PLOT_FEATURE_DISTRIBUTIONS:
    print("Feature distribution: ")
    ncols = 5
    # Ceiling division; at least one row so plt.subplots always gets a valid grid.
    nrows = max(1, -(-len(features) // ncols))

    # 7.5 inches per row reproduces the original 18x150 canvas for 20 rows.
    # squeeze=False keeps `axes` two-dimensional even when there is one row.
    fig, axes = plt.subplots(nrows, ncols, figsize=(18, 7.5 * nrows),
                             facecolor='#EAEAF2', squeeze=False)
    flat_axes = axes.ravel()  # row-major, i.e. index r * ncols + c

    # Pairing axes with features stops at the last feature, so a feature count
    # that is not a multiple of `ncols` no longer raises IndexError.
    for ax, col in zip(flat_axes, features):
        sns.kdeplot(x=train[col], ax=ax, color='#58D68D', label='Train data')
        sns.kdeplot(x=test[col], ax=ax, color='#DE3163', label='Test data')
        ax.set_ylabel('')
        ax.set_xlabel(col, fontsize=8, fontweight='bold')
        ax.tick_params(labelsize=5, width=0.5)
        ax.xaxis.offsetText.set_fontsize(4)
        ax.yaxis.offsetText.set_fontsize(4)

    # Hide the unused cells of a partially filled last row.
    for ax in flat_axes[len(features):]:
        ax.set_visible(False)

    plt.show()

## SHAP values

In [None]:
# Summary plot of the out-of-fold SHAP values collected in the CV loop,
# shown against the corresponding training feature values.
shap.summary_plot(shap_values, train[features], show=False)

# Submission
---

In [None]:
# Put the fold-averaged test probabilities into the target column of the
# sample submission, write the file, and show the resulting frame.
submission = submission.assign(**{TARGET: xgb_pred})
submission.to_csv("submission.csv", index=False)
submission