<div style="color:White; display:fill; border-radius:5px;background-color:#336b87;font-size:270%;font-family:sans-serif;letter-spacing:0.5px;text-align: center">
Tabular Playground Series - Oct 2021
</div>

<a><img src="https://i.ibb.co/PWvpT9F/header.png" alt="header" border="0" width=800 height=400></a>

In [None]:
import pandas as pd, numpy as np, os, matplotlib.pyplot as plt, seaborn as sns
import datatable as dt
import warnings
import random
# Silence library warnings so they do not clutter the cell outputs.
warnings.filterwarnings('ignore')
# Show every column when a DataFrame is displayed. The fully qualified option
# name is used on purpose: the bare 'max_columns' pattern is ambiguous on newer
# pandas releases (it also matches 'styler.render.max_columns') and raises
# OptionError there.
pd.set_option('display.max_columns', None)
import gc
#import cudf #only works when gpu on
from sklearn.metrics import roc_auc_score,auc, roc_curve
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
import plotly.figure_factory as ff
import plotly.express as px

from time import time
import pprint
import joblib
from functools import partial
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold

In [None]:
# Set to True to run the exploratory plotting cells further down.
PLOT = False


# notebook setup
def seed_everything(seed=42):
    """Seed the random number generators used by this notebook.

    Exports ``seed`` as the ``PYTHONHASHSEED`` environment variable and seeds
    both Python's ``random`` module and NumPy's global generator with it.

    Parameters
    ----------
    seed : int, default 42
        Value handed to every seeding call.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for reseed in (random.seed, np.random.seed):
        reseed(seed)


TARGET = 'target'        # name of the label column
FOLD = 5
SEED = 69                # handed to seed_everything() below
N_ESTIMATORS = 10000
DEVICE = 'CPU'
EVAL_METRIC = "AUC"

STUDY_TIME = 8 * 60 * 60  # presumably seconds (8 h) -- TODO confirm the unit

seed_everything(SEED)

<div style="color:White; display:fill; border-radius:5px;background-color:#dd4124;font-size:170%;font-family:sans-serif;letter-spacing:0.5px;text-align: center">
Note: Upvote is Free!!!
</div>

In [None]:
%time

#import data
train = dt.fread(r"../input/d/ankitkalauni/tps-october-2021-dataset/train.csv").to_pandas()
test = dt.fread(r"../input/d/ankitkalauni/tps-october-2021-dataset/test.csv").to_pandas()

# train.columns = ['f1', 'f2', 'f3', 'f4', 'f5', 'target', 'f6',
#        'f7', 'f8', 'f9', 'f10', 'f11',
#        'f12', 'f13', 'f14', 'f15', 'f16',
#        'f17', 'f18', 'f19', 'f20','f21']

# test.columns = ['f1', 'f2', 'f3', 'f4', 'f5', 'f6',
#        'f7', 'f8', 'f9', 'f10', 'f11',
#        'f12', 'f13', 'f14', 'f15', 'f16',
#        'f17', 'f18', 'f19', 'f20','f21']


sample_submission = pd.read_csv('../input/tabular-playground-series-oct-2021/sample_submission.csv')

train[TARGET] = train[TARGET].astype('int64') 

In [None]:
# Report the dimensions of the training frame, then display its last ten rows
# with the index restarted at 0.
print(f'Train Shape:  {train.shape}')
train.tail(10).reset_index(drop=True)

In [None]:
# Report the dimensions of the test frame, then display its first ten rows.
print(f'Test Shape:  {test.shape}')
test.head(10)

<div style="color:White; display:fill; border-radius:5px;background-color:#336b87;font-size:170%;font-family:sans-serif;letter-spacing:0.5px;text-align: center">
Preprocessing
</div>

___

In [None]:
# Setup for preprocessing: separate the label from the features.
# The test frame is used unchanged as X_test.
y = train[TARGET]
X = train.drop(columns=TARGET)
X_test = test

# Remove the old dataframe names and run a garbage-collection pass.
# X_test still refers to the test frame, so only the name `test` goes away.
del train, test
gc.collect()

In [None]:
# helper functions
def get_auc(y_true, y_hat):
    """Return the area under the ROC curve of scores ``y_hat`` against labels ``y_true``.

    The ROC curve is built with ``roc_curve`` and integrated with ``auc``.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_hat)
    return auc(false_pos_rate, true_pos_rate)

In [None]:
# Best parameters found with an Optuna search; unpacked into
# HistGradientBoostingClassifier(**hist_params) in the training cell.
hist_params = {'l2_regularization': 1.3244040135051264e-10,
               # Must be a real boolean (or 'auto'): the string 'True' only
               # worked by being truthy and is rejected by the parameter
               # validation of recent scikit-learn releases.
               'early_stopping': True,
               'learning_rate': 0.0366777965884429,
               'max_iter': 10000,
               'max_depth': 3,
               'max_bins': 129,
               'min_samples_leaf': 13449,
               'max_leaf_nodes': 68}

<div style="color:White; display:fill; border-radius:5px;background-color:#336b87;font-size:170%;font-family:sans-serif;letter-spacing:0.5px;text-align: center">
Train-set KDE Plot
</div>

In [None]:
if PLOT:
    # Take the labels and the series from the same column list, so each
    # density trace is guaranteed to carry the name of the column it shows
    # (a hand-typed f1..f21 list can silently drift from X.columns).
    # X_data and group_labels are reused by the plotting cells below.
    group_labels = X.columns.to_list()
    X_data = [X[name] for name in group_labels]
    fig = ff.create_distplot(X_data, group_labels, bin_size=0.3, show_hist=False, show_rug=False)
    fig.show()

<div style="color:White; display:fill; border-radius:5px;background-color:#336b87;font-size:170%;font-family:sans-serif;letter-spacing:0.5px;text-align: center">
Test-set KDE Plot
</div>

In [None]:
if PLOT:
    # group_labels is defined by the train-set KDE cell. Selecting the test
    # columns by those labels keeps every trace aligned with its name instead
    # of relying on a hand-typed f1..f21 list. _data is reused by the test-set
    # heatmap cell.
    _data = [X_test[name] for name in group_labels]
    fig = ff.create_distplot(_data, group_labels, bin_size=0.3, show_hist=False, show_rug=False)
    fig.show()

<div style="color:White; display:fill; border-radius:5px;background-color:#336b87;font-size:170%;font-family:sans-serif;letter-spacing:0.5px;text-align: center">
Train-set Heatmap Plot
</div>

In [None]:
if PLOT:
    # Correlation matrix of the training features: np.corrcoef treats each
    # series in X_data (built in the train-set KDE cell) as one variable.
    data = np.corrcoef(X_data)
    fig = px.imshow(data, x=group_labels, y=group_labels)
    fig.show()

<div style="color:White; display:fill; border-radius:5px;background-color:#336b87;font-size:170%;font-family:sans-serif;letter-spacing:0.5px;text-align: center">
Test-set Heatmap Plot
</div>

In [None]:
if PLOT:
    # Correlation matrix of the test features: np.corrcoef treats each
    # series in _data (built in the test-set KDE cell) as one variable.
    data = np.corrcoef(_data)
    fig = px.imshow(data, x=group_labels, y=group_labels)
    fig.show()

<div style="color:White; display:fill; border-radius:5px;background-color:#336b87;font-size:170%;font-family:sans-serif;letter-spacing:0.5px;text-align: center">
HistGBM
</div>

# Histogram-based Gradient Boosting Classification Tree.

This estimator is much faster than
`sklearn.ensemble.GradientBoostingClassifier`
for big datasets (n_samples >= 10 000). The input data `X` is pre-binned
into integer-valued bins, which considerably reduces the number of
splitting points to consider, and allows the algorithm to leverage
integer-based data structures. For small sample sizes,
`sklearn.ensemble.GradientBoostingClassifier`
might be preferred since binning may lead to split points that are too
approximate in this setting.

This implementation is inspired by
[LightGBM](https://github.com/Microsoft/LightGBM).

## note:

  This estimator is still **experimental** in scikit-learn versions before 1.0:
  the predictions and the API might change without any deprecation cycle. To use
  it on those versions, you need to explicitly import `enable_hist_gradient_boosting`:

    >>> # explicitly require this experimental feature
    >>> from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    >>> # now you can import normally from ensemble
    >>> from sklearn.ensemble import HistGradientBoostingClassifier


[Tutorial Gradient Boosting - StatQuest](https://www.youtube.com/embed/3CC4N4z3GJc)

In [None]:
# Sanity check: train and test must expose the same columns in the same order.
assert X.columns.to_list() == X_test.columns.to_list()

meta_pred_tmp = []  # one array of test-set probabilities per fold
scores_tmp = []     # one validation AUC per fold

# Stratified, shuffled cross-validation splitter.
kf = StratifiedKFold(n_splits=50, shuffle=True, random_state=1)

for fold, (idx_train, idx_valid) in enumerate(kf.split(X, y), start=1):
    # Slice out this fold's training and validation rows.
    X_train, X_valid = X.iloc[idx_train], X.iloc[idx_valid]
    y_train, y_valid = y.iloc[idx_train], y.iloc[idx_valid]

    # A fresh model per fold, configured from hist_params.
    model = HistGradientBoostingClassifier(**hist_params)
    model.fit(X_train, y_train)

    # Score the held-out rows on the positive-class probability (column 1).
    valid_proba = model.predict_proba(X_valid)[:, 1]
    fold_auc = get_auc(y_valid, valid_proba)
    scores_tmp.append(fold_auc)

    print(f"Fold: {fold} Score: {fold_auc}")
    print('--' * 20)

    # Keep this fold's test-set probabilities; they are averaged after the loop.
    test_proba = model.predict_proba(X_test)[:, 1]
    meta_pred_tmp.append(test_proba)

# Overall validation score: the plain mean of the per-fold AUCs.
print(f"Overall Validation Score | Meta: {np.mean(scores_tmp)}")
print('::' * 20)

In [None]:
# Average the per-fold test predictions: one column per fold, mean across columns.
meta_predictions = np.column_stack(meta_pred_tmp).mean(axis=1)

# Build the submission from a copy of the sample file and write it to disk.
stacked_submission = sample_submission.assign(**{TARGET: meta_predictions})
stacked_submission.to_csv('./HistGBM.csv', index=False)

<div style="color:White; display:fill; border-radius:5px;background-color:#336b87;font-size:170%;font-family:sans-serif;letter-spacing:0.5px;text-align: center">
HistGBM Prediction KDE Plot
</div>

In [None]:
if PLOT:
    # Test features side by side with the averaged HistGBM prediction column.
    plot = pd.concat([X_test, stacked_submission[TARGET]], axis=1)

    # Labels and series come from the same column list, so every density
    # trace carries the name of the column it shows (no hand-typed f1..f21
    # list that can drift from the frame).
    # NOTE: this rebinds the notebook-level `group_labels`, which now also
    # contains the prediction column. Re-run the train-set KDE cell before
    # re-running any earlier plotting cell.
    group_labels = plot.columns.to_list()
    pred = [plot[name] for name in group_labels]
    fig = ff.create_distplot(pred, group_labels, bin_size=0.3, show_hist=False, show_rug=False)
    fig.show()