In [None]:
import pandas as pd
import numpy as np
import os

import pickle
import time

from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

import xgboost as xgb

# Global random seed for the notebook; passed to np.random.seed() before the
# train/validation shuffle and resampling.
seed = 66

In [None]:
# Case of class imbalance - sampling and shuffling data.
# Strategy: hold out a balanced validation set first, then oversample both
# classes (with replacement) from the remaining rows to build a balanced
# training set, so no validation row can leak into training.
# NOTE(review): `X` is not defined in the cells shown here; it is assumed to be
# a DataFrame with a binary integer `label` column built earlier - confirm.
validation_split = 0.8   # fraction of the positive class kept for TRAINING (despite the name)
oversampling_factor = 3  # each class is resampled to train_count * oversampling_factor rows
np.random.seed(seed)     # seeds the shuffles and the DataFrame.sample() calls below

# Assuming binary classification with integer 0 and 1 for labels, so summing
# the column counts the positive rows.
positive_count = int(X['label'].sum())
train_count = int(positive_count * validation_split)
# Use the remainder rather than int(n * (1 - validation_split)): 1 - 0.8 is
# 0.19999999999999996 in floating point, so the old formula truncated e.g.
# 100 positives down to 19 validation rows and left 81 (not 80) for training.
valid_count = positive_count - train_count

# Positive class: shuffle the index labels, first `valid_count` go to validation.
idx_1 = X.loc[X['label'] == 1].index.tolist()
np.random.shuffle(idx_1)
validation_index_1 = idx_1[:valid_count]
train_index_1 = idx_1[valid_count:]

# Negative class: hold out the same number of rows so validation is balanced;
# everything else stays available for training.
idx_0 = X.loc[X['label'] == 0].index.tolist()
np.random.shuffle(idx_0)
validation_index_0 = idx_0[:valid_count]
train_index_0 = idx_0[valid_count:]

X_Valid = pd.concat([X.loc[validation_index_1], X.loc[validation_index_0]])

# Resample each class with replacement to the same size, giving a balanced
# training frame (positives are oversampled; negatives are resampled to match).
X_tr_1 = X.loc[train_index_1].sample(train_count * oversampling_factor, replace=True)
X_tr_0 = X.loc[train_index_0].sample(train_count * oversampling_factor, replace=True)
X_Train = pd.concat([X_tr_1, X_tr_0])

In [None]:
# Build the XGBoost matrices: every column except the target is a feature.
cols = [x for x in X_Train.columns if x != 'label']
train_x, train_y = X_Train[cols], X_Train['label']
train_mat = xgb.DMatrix(train_x, label=train_y)
valid_x, valid_y = X_Valid[cols], X_Valid['label']
valid_mat = xgb.DMatrix(valid_x, label=valid_y)

num_epoch = 2000  # upper bound on boosting rounds; early stopping usually ends sooner
xgb_params = {'eta': 0.1, # learning rate
              'max_depth': 8,
              'subsample': 0.9,
              'sampling_method' : 'uniform', # or 'gradient_based' (GPU hist only, per the XGBoost docs)
              'colsample_bytree': 0.9,
              'colsample_bylevel': 0.9,
              'min_child_weight' : 20,
              'objective': 'binary:logistic',
              'eval_metric': 'auc',
              'seed': seed,  # reuse the notebook-wide seed instead of a second hard-coded 66
              'lambda': 60,  # L2 regularisation
              'alpha': 140,  # L1 regularisation
             }

# Early stopping monitors the LAST entry of `evals`, i.e. the validation set.
watchlist = [(train_mat, 'train'), (valid_mat, 'valid')]
# `evals` and the options after it are passed by keyword: newer XGBoost
# releases treat them as keyword-only.
xgb_model = xgb.train(xgb_params, train_mat, num_boost_round=num_epoch,
                      evals=watchlist, verbose_eval=20, early_stopping_rounds=200)

# Predict with the trees up to and including the best round. `best_iteration`
# is 0-based, so the exclusive upper bound is best_iteration + 1; the previous
# `ntree_limit=best_iteration` dropped the best tree, and `ntree_limit` itself
# is removed in XGBoost 2.x (`iteration_range` requires XGBoost >= 1.4).
preds = xgb_model.predict(valid_mat, iteration_range=(0, xgb_model.best_iteration + 1))
# Hard class labels at the conventional 0.5 probability threshold.
pred_class = (preds >= 0.5).astype(int)

# Gain-based feature importance, one row per feature. Features never used in a
# split are absent from get_score(); naming the column at construction keeps
# this working even when the importance dict is empty.
feature_df = pd.DataFrame.from_dict(xgb_model.get_score(importance_type='gain'),
                                    orient='index', columns=['importance'])

In [None]:
# Report the validation ROC AUC computed from the predicted probabilities.
print('\nAUC score %.6f' % (roc_auc_score(y_true=valid_y, y_score=preds),))

In [None]:
# Confusion matrix of the thresholded predictions (rows: true class, columns:
# predicted class), printed as fractions of all validation rows.
conf_mat = confusion_matrix(y_true=valid_y, y_pred=pred_class)
# Fixed typo: the divisor was `cont_mat`, an undefined name that raised NameError.
print(conf_mat / conf_mat.sum())