In this notebook I'm going to plot ROC curves with cross-validation, following the procedure described in the scikit-learn guide:
https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html#sphx-glr-auto-examples-model-selection-plot-roc-crossval-py

> ROC curves typically feature true positive rate on the Y axis, and false positive rate on the X axis. This means that the top left corner of the plot is the “ideal” point - a false positive rate of zero, and a true positive rate of one. This is not very realistic, but it does mean that a larger area under the curve (AUC) is usually better.
> 
> The “steepness” of ROC curves is also important, since it is ideal to maximize the true positive rate while minimizing the false positive rate.
> 
> This example shows the ROC response of different datasets, created from K-fold cross-validation. Taking all of these curves, it is possible to calculate the mean area under curve, and see the variance of the curve when the training set is split into different subsets. This roughly shows how the classifier output is affected by changes in the training data, and how different the splits generated by K-fold cross-validation are from one another.

# Libraries and Data import

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import datatable as dt  # pip install datatable - for faster data download

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

import random

from matplotlib.lines import Line2D

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBClassifier


from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.impute import SimpleImputer

from sklearn.metrics import roc_auc_score, plot_roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_recall_curve

import optuna

# Widen the pandas display limits so large frames print more rows/columns,
# never truncate cell text, and show floats with five decimals.
for option_name, option_value in (
    ("display.max_rows", 100),
    ("display.max_columns", 500),
    ("display.max_colwidth", None),
):
    pd.set_option(option_name, option_value)
pd.set_option("display.float_format", lambda value: "%.5f" % value)

# Silence UserWarning messages for the rest of the session.
import warnings
warnings.simplefilter("ignore", category=UserWarning)

In [None]:
# Load the competition CSVs with datatable, convert them to pandas frames and
# use the "id" column as the row index of each frame.
train = dt.fread("../input/tabular-playground-series-sep-2021/train.csv").to_pandas()
train = train.set_index("id")

test = dt.fread("../input/tabular-playground-series-sep-2021/test.csv").to_pandas()
test = test.set_index("id")

# NA values in train and test

In [None]:
# Total number of missing cells in each raw frame, shown as a (train, test) pair.
na_totals = tuple(frame.isna().sum().sum() for frame in (train, test))
print("(train, test) na --> ", na_totals)

In [None]:
# Per-row count of missing values; the "claim" target column is left out of
# the train count so both series are computed over the feature columns only.
is_na_train_df = train.isna().drop(columns="claim").sum(axis="columns")
print(is_na_train_df.shape)

is_na_test_df = test.isna().sum(axis="columns")
print(is_na_test_df.shape)

## Data preparation: Feature Engineering + Simple Imputer + NA to median

In [None]:
# Attach the per-row missing-value count to each frame as the "isNA" feature
# and report the resulting shape (train first, then test).
for frame, na_counts in ((train, is_na_train_df), (test, is_na_test_df)):
    frame["isNA"] = na_counts
    print(frame.shape)


In [None]:
### Sample the data to shorten run time while drafting the notebook

# Uncomment the lines below for a quick trial-and-error session
# train = train.head(10000)
# test = test.head(10000)

In [None]:
# Create the min-max scaler and extract the target.
#
# The scaler is only instantiated here. Fitting it on the raw frame at this
# point would be wasted work: the next cell imputes the missing values first
# and then calls fit_transform (which refits from scratch) to build X and
# X_test, so any X / X_test produced here would be discarded.
x_Mm_scaler = MinMaxScaler()

# X is rebuilt from a NumPy array and therefore carries a default 0..n-1
# index, while train.claim is indexed by "id". Drop the "id" labels so that X
# and y share the same positional index and row lookups stay aligned.
y = train.claim.reset_index(drop=True)

In [None]:
# Fill missing values with each column's median (despite its name, the
# imputer uses the "median" strategy, not zeros). The imputer is fitted on the
# train features and re-used unchanged on the test frame.
imputer_zeros = SimpleImputer(strategy="median")
X = pd.DataFrame(
    imputer_zeros.fit_transform(train.drop(columns="claim")),
    columns=train.columns.drop("claim"),
)
X_test = pd.DataFrame(imputer_zeros.transform(test), columns=test.columns)

# Min-max scale the imputed features: fit on train, apply the same scaling to test.
X = pd.DataFrame(x_Mm_scaler.fit_transform(X), columns=X.columns)
X_test = pd.DataFrame(x_Mm_scaler.transform(X_test), columns=X_test.columns)

# Count the missing values left in each frame after imputation.
print(
    "(train, test) na --> ",
    (X.isna().sum().sum(), X_test.isna().sum().sum()),
)

In [None]:
# Hyper-parameters found by a 2-hour Optuna search, see:
# https://www.kaggle.com/sgiuri/sep21tp-optuna-feature-eng-xgbc
xgb_params = dict(
    n_estimators=10000,
    learning_rate=0.08625196792060146,
    subsample=0.5959773829663169,
    colsample_bytree=0.7603045913120982,
    max_depth=7,
    booster='gbtree',
    tree_method='gpu_hist',  # comment this line if you don't have a GPU
    reg_lambda=74.60593770387143,
    reg_alpha=33.38858560681472,
    random_state=42,
    n_jobs=4,
)


# Creating Folds and Plotting the ROC Curve

In [None]:
# Stratified K-fold cross-validation of the XGBoost classifier.
#
# For every fold this cell fits a fresh model with early stopping, stores the
# out-of-fold probabilities, accumulates the averaged test-set probabilities
# and the feature importances, and draws the fold's ROC curve. After the loop
# it adds the chance line, the mean ROC curve and a +/- 1 std. dev. band.
splits = 8
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
oof_preds = np.zeros((X.shape[0],))  # out-of-fold positive-class probabilities
preds = 0  # running mean of the test-set probabilities over the folds
model_fi = 0  # running sum of the per-fold feature importances
total_mean_rmse = 0  # never updated in this cell; kept so the name stays defined
total_mean_roc_auc_score = 0

tprs = []  # per-fold TPR values interpolated onto the common FPR grid
aucs = []  # per-fold area under the ROC curve
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots(figsize=(16, 16))


for fold, (train_indicies, valid_indicies) in enumerate(skf.split(X, y)):

    # split() yields positional row numbers, so rows are selected with .iloc.
    # .loc would interpret the numbers as index labels, which is only correct
    # when the labels happen to be exactly 0..n-1.
    X_train, X_valid = X.iloc[train_indicies], X.iloc[valid_indicies]
    y_train, y_valid = y.iloc[train_indicies], y.iloc[valid_indicies]

    model = XGBClassifier(**xgb_params)
    print(f"Fitting fold {fold+1} of {splits} ")
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric="auc",
              early_stopping_rounds=100,
              verbose=False)

    # Score the validation fold once and reuse the probabilities for the
    # out-of-fold array, the ROC curve and the fold AUC.
    valid_proba = model.predict_proba(X_valid)[:, 1]
    oof_preds[valid_indicies] = valid_proba

    # The second column of predict_proba belongs to model.classes_[1], so that
    # class is used as the positive label for the curve.
    fpr, tpr, _ = roc_curve(y_valid, valid_proba, pos_label=model.classes_[1])
    fold_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, alpha=0.3, lw=1,
            label='ROC fold {} (AUC = {:0.2f})'.format(fold, fold_auc))

    # Resample the fold's curve on the shared FPR grid so that the curves can
    # be averaged point-wise; force the curve to start at (0, 0).
    interp_tpr = np.interp(mean_fpr, fpr, tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(fold_auc)

    preds += model.predict_proba(X_test)[:, 1] / splits
    model_fi += model.feature_importances_

    fold_roc_auc_score = roc_auc_score(y_valid, valid_proba)
    print(f"Fold {fold} ROC AUC Score: {fold_roc_auc_score}")
    total_mean_roc_auc_score += fold_roc_auc_score / splits
print(f"\nOverall ROC AUC Score: {total_mean_roc_auc_score}\n\n")

# Diagonal reference line of a classifier that guesses at random.
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
        label='Chance', alpha=.8)

# Mean ROC curve over the folds; force the curve to end at (1, 1).
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(mean_fpr, mean_tpr, color='b',
        label=r'Mean ROC (AUC = %0.2f $\pm$ %0.4f)' % (mean_auc, std_auc),
        lw=2, alpha=.8)

# Shaded band of one standard deviation around the mean curve, clipped to [0, 1].
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                label=r'$\pm$ 1 std. dev.')

ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
       xlabel="False Positive Rate", ylabel="True Positive Rate",
       title="Receiver operating characteristic")
ax.legend(loc="lower right")
plt.show()

In [None]:
# Build the submission frame (test ids plus fold-averaged probabilities) and
# write it to disk.
# Reference: xgb public score with untuned, fast parameters was 0.76817.
predictions = pd.DataFrame({"id": test.index, "claim": preds})

predictions.to_csv('submission.csv', index=False, header=predictions.columns)
predictions.head()