In [1]:
import numpy as np 
import pandas as pd 
import gc
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, cohen_kappa_score, precision_score, recall_score, confusion_matrix
import xgboost as xgb

In [2]:
# Read the training and test tables from CSV files in the working directory.
train_df = pd.read_csv(filepath_or_buffer="train.csv")
test_df = pd.read_csv(filepath_or_buffer="test.csv")

In [6]:
# Feature columns are every column except the row identifier and the label;
# errors="ignore" keeps this tolerant of a frame that lacks either column.
train_cols = train_df.columns.drop(["ID_code", "target"], errors="ignore").tolist()
# Binary label column used as the training target.
y_train = train_df["target"]

In [8]:
# 5-fold stratified splitter; shuffling with a fixed seed makes the fold
# assignment repeatable between runs.
folds = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=1001,
)

In [9]:
# XGBoost training parameters, obtained by Bayesian hyperparameter optimization.
params = {
    'tree_method': 'hist',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'learning_rate': 0.0936165921314771,
    'max_depth': 2,
    'colsample_bytree': 0.3561271102144279,
    'subsample': 0.8246604621518232,
    'min_child_weight': 53,
    'gamma': 9.943467991283027,
    # `verbosity` supersedes the deprecated `silent` flag; 0 keeps the
    # library from printing its own messages.
    'verbosity': 0,
}

In [10]:
%%time

oof_preds = np.zeros(train_df.shape[0])
sub_preds = np.zeros(test_df.shape[0])

feature_importance_df = pd.DataFrame()
folds.n_splits = 5;

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train_df, y_train)):
    
    trn_x, trn_y = train_df[train_cols].iloc[trn_idx], y_train.iloc[trn_idx]
    val_x, val_y = train_df[train_cols].iloc[val_idx], y_train.iloc[val_idx]
    
    dtrain = xgb.DMatrix(trn_x, trn_y, feature_names=trn_x.columns) # load train
    dval = xgb.DMatrix(val_x, val_y, feature_names=val_x.columns)   # load validation
    
    clf = xgb.train(params=params, dtrain=dtrain, num_boost_round=4000, evals=[(dtrain, "Train"), (dval, "Val")],
        verbose_eval= 100, early_stopping_rounds=50) # classifier
       
    oof_preds[val_idx] = clf.predict(xgb.DMatrix(val_x))  # prediction of validation
    sub_preds += clf.predict(xgb.DMatrix(test_df[train_cols])) / folds.n_splits
    # In a for loop every iteration we predict test data and append it to the list.
    # So we have predicted test data for n times and got mean value for test predictions.

    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = pd.DataFrame.from_dict(data=clf.get_fscore(), orient="index", columns=["FScore"])["FScore"].index
    fold_importance_df["fscore"] = pd.DataFrame.from_dict(data=clf.get_fscore(), orient="index", columns=["FScore"])["FScore"].values
    fold_importance_df["fold"] = n_fold + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    print('\nFold %1d AUC %.6f & std %.6f' %(n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx]), np.std([oof_preds[val_idx]])))
    print('Fold %1d Precision %.6f' %(n_fold + 1, precision_score(val_y, np.round(oof_preds[val_idx])) ))
    print('Fold %1d Recall %.6f' %(n_fold + 1, recall_score(val_y, np.round(oof_preds[val_idx]) )))
    print('Fold %1d F1 score %.6f' % (n_fold + 1,f1_score(val_y, np.round(oof_preds[val_idx]))))
    print('Fold %1d Kappa score %.6f\n' % (n_fold + 1,cohen_kappa_score(val_y, np.round(oof_preds[val_idx]))))
    gc.collect()

print('\nCV AUC score %.6f & std %.6f' % (roc_auc_score(y_train, oof_preds), np.std((oof_preds))))
print('CV Precision score %.6f' % (precision_score(y_train, np.round(oof_preds))))
print('CV Recall score %.6f' % (recall_score(y_train, np.round(oof_preds))))
print('CV F1 score %.6f' % (f1_score(y_train, np.round(oof_preds))))
print('CV Kappa score %.6f' % (cohen_kappa_score(y_train, np.round(oof_preds))))

[0]	Train-auc:0.581105	Val-auc:0.573174
Multiple eval metrics have been passed: 'Val-auc' will be used for early stopping.

Will train until Val-auc hasn't improved in 400 rounds.
[500]	Train-auc:0.897606	Val-auc:0.879851
[1000]	Train-auc:0.916257	Val-auc:0.892997
[1500]	Train-auc:0.923435	Val-auc:0.896094
[2000]	Train-auc:0.926792	Val-auc:0.897124
[2500]	Train-auc:0.928692	Val-auc:0.897144
[3000]	Train-auc:0.929838	Val-auc:0.897415
Stopping. Best iteration:
[3050]	Train-auc:0.929939	Val-auc:0.897454


Fold 1 AUC 0.897386 & std 0.169979
Fold 1 Precision 0.755238
Fold 1 Recall 0.367662
Fold 1 F1 score 0.494562
Fold 1 Kappa score 0.458956

[0]	Train-auc:0.578702	Val-auc:0.570255
Multiple eval metrics have been passed: 'Val-auc' will be used for early stopping.

Will train until Val-auc hasn't improved in 400 rounds.
[500]	Train-auc:0.898251	Val-auc:0.876441
[1000]	Train-auc:0.916242	Val-auc:0.890981
[1500]	Train-auc:0.923431	Val-auc:0.895461
[2000]	Train-auc:0.926737	Val-auc:0.896889
[25

In [18]:
# ROC AUC of the out-of-fold predictions against the training labels;
# the bare name on the last line displays the value as the cell output.
oof_roc = roc_auc_score(y_true=y_train, y_score=oof_preds)
oof_roc

0.897288546154305

In [21]:
# One-line summary: number of CV folds and the out-of-fold ROC AUC.
print("xgb %sFold %.6f" % (folds.get_n_splits(), oof_roc))

xgb 5Fold 0.897289
