In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import numpy as np
import pandas as pd

from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

%matplotlib inline
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

import shap

In [None]:
%%time
train = pd.read_csv('../input/tabular-playground-series-nov-2021/train.csv', index_col=0)
test = pd.read_csv('../input/tabular-playground-series-nov-2021/test.csv', index_col=0)

sample_submission = pd.read_csv("../input/tabular-playground-series-nov-2021/sample_submission.csv")

feature_cols = test.columns.tolist()

# KFold

In [None]:
# Assign every training row to one of `folds` stratified validation folds and store
# the fold number in train["kfold"].
folds = 5
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

# StratifiedKFold.split yields *positional* row numbers. `train` is indexed by the
# id column read from the CSV, so writing through label-based `.loc` is only correct
# when those ids happen to be 0..n-1. Filling a positional array avoids that
# dependency on the index values.
fold_ids = np.full(len(train), -1, dtype=np.int64)
for fold, (_, valid_positions) in enumerate(kf.split(train, train["target"])):
    fold_ids[valid_positions] = fold

train["kfold"] = fold_ids

In [None]:
%%time
final_test_predictions = []
scores = []

for fold in range(folds):
    x_train = train[train.kfold != fold].copy()
    x_valid = train[train.kfold == fold].copy()
    x_test  = test[feature_cols].copy()
    
    y_train = x_train['target']
    y_valid = x_valid['target']
    
    x_train = x_train[feature_cols]
    x_valid = x_valid[feature_cols]
    xgb_params = {
        'eval_metric': 'auc', 
        'objective': 'binary:logistic', 
        'tree_method': 'gpu_hist', 
        'gpu_id': 0, 
        'predictor': 'gpu_predictor', 
        'n_estimators': 10000, 
        'learning_rate': 0.01063045229441343, 
        'gamma': 0.24652519525750877, 
        'max_depth': 4, 
        'seed': 42,       
        'min_child_weight': 366, 
        'subsample': 0.6423040816299684, 
        'colsample_bytree': 0.7751264493218339, 
        'colsample_bylevel': 0.8675692743597421, 
        'use_label_encoder': False,
        'lambda': 0, 
        'alpha': 10
    }
    
    xgb_model = XGBClassifier(**xgb_params)
    xgb_model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=False)
    
    preds_train = xgb_model.predict_proba(x_train)[:,1]
    preds_valid = xgb_model.predict_proba(x_valid)[:,1]
    auc_train = roc_auc_score(y_train, preds_train)
    auc = roc_auc_score(y_valid, preds_valid)
    print("Fold",fold,", train:", f"{auc_train:.5f}", ", valid:", f"{auc:.5f}")
    scores.append(auc)
    
    preds_test = xgb_model.predict_proba(x_test)[:,1]
    final_test_predictions.append(preds_test)
    
    
print("AVG AUC:",np.mean(scores))

# SHAP Values

In [None]:
# Explain the most recently trained fold model on that fold's validation rows:
# xgb_model and x_valid are whatever the training loop left behind on its last pass.
tree_explainer = shap.TreeExplainer(xgb_model)
shap_values = tree_explainer.shap_values(x_valid)

# Summary plot of the SHAP values against the validation features.
shap.summary_plot(shap_values, x_valid)

In [None]:
# Dependence plot for a single feature, using the SHAP values currently held in
# `shap_values` together with the matching validation features.
dependence_feature = "f34"
shap.dependence_plot(dependence_feature, shap_values, x_valid)

In [None]:
# Explain one validation row, selected by position.
idx = 5
data_for_prediction = x_valid.iloc[idx]

# The model and the explainer are both given the row as a (1, n_features) array.
row_values = data_for_prediction.values
data_for_prediction_array = row_values.reshape(1, -1)

single_row_proba = xgb_model.predict_proba(data_for_prediction_array)
print(single_row_proba)

shap.initjs()

# NOTE: from here on `shap_values` holds the SHAP values of this single row only,
# replacing whatever an earlier cell stored under the same name.
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(data_for_prediction_array)

# Kept as the final expression so the notebook renders the returned plot object.
shap.force_plot(explainer.expected_value, shap_values, data_for_prediction)

In [None]:
# Decision plot built from the explainer, SHAP values and feature row that are
# currently in scope (set by the single-row force-plot cell).
base_value = explainer.expected_value
shap.decision_plot(base_value, shap_values, data_for_prediction)

# Plot Prediction

In [None]:
# One density curve per fold model, drawn from that model's test-set probabilities.
labels = ['fold ' + str(fold_number) for fold_number in range(folds)]

# Histogram and rug layers are switched off, leaving only the curves.
fig = ff.create_distplot(
    hist_data=final_test_predictions,
    group_labels=labels,
    bin_size=.3,
    show_hist=False,
    show_rug=False,
)
fig.show()

In [None]:
# Stack the per-fold test predictions as columns (one column per fold) and take the
# row-wise mean, which gives one averaged probability per test row.
fold_prediction_matrix = np.column_stack(final_test_predictions)
sample_submission['target'] = np.mean(fold_prediction_matrix, axis=1)

# Write the submission file without the DataFrame index.
sample_submission.to_csv("submission.csv", index=False)