In [1]:
import numpy as np
import pandas as pd

#!pip install xgboost
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

#!pip install shap
import matplotlib.pyplot as plt

In [2]:
fTest = pd.read_csv('Resources/test.csv')

fTest=fTest.sample(50000)

In [3]:
dfTrain=pd.read_csv('Resources/train.csv')
dfTrain=dfTrain.sample(50000)

In [4]:
var_columns=[c for c in dfTrain.columns if c not in ['ID_code', 'target']]
X = dfTrain.loc[:, var_columns]
y=dfTrain.loc[:, 'target']
X_train, X_valid, y_train, y_valid = train_test_split (X,y, test_size=.2)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((40000, 200), (10000, 200), (40000,), (10000,))

In [5]:
xgb.XGBClassifier().get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': True,
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'enable_categorical': False,
 'gamma': None,
 'gpu_id': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [6]:
model_xgboost = xgb.XGBClassifier(learning_rate=.01,
                                     max_depth=5,
                                     n_estimators=3000,
                                     subsample=.5,
                                     colsample_bytree=.5,
                                     eval_metric='auc',
                                     verbosity=1,
                                    random_state=1)

eval_set =[(X_valid, y_valid)]

model_xgboost.fit(X_train,
                 y_train,
                 early_stopping_rounds=10,
                 eval_set=eval_set,
                 verbose=True)



[0]	validation_0-auc:0.59643
[1]	validation_0-auc:0.62884
[2]	validation_0-auc:0.65616
[3]	validation_0-auc:0.67579
[4]	validation_0-auc:0.67798
[5]	validation_0-auc:0.69998
[6]	validation_0-auc:0.70079
[7]	validation_0-auc:0.69981
[8]	validation_0-auc:0.70430
[9]	validation_0-auc:0.70607
[10]	validation_0-auc:0.70820
[11]	validation_0-auc:0.70917
[12]	validation_0-auc:0.71121
[13]	validation_0-auc:0.71606
[14]	validation_0-auc:0.72355
[15]	validation_0-auc:0.72152
[16]	validation_0-auc:0.72034
[17]	validation_0-auc:0.72050
[18]	validation_0-auc:0.72529
[19]	validation_0-auc:0.72828
[20]	validation_0-auc:0.72739
[21]	validation_0-auc:0.73054
[22]	validation_0-auc:0.73064
[23]	validation_0-auc:0.73390
[24]	validation_0-auc:0.73580
[25]	validation_0-auc:0.73842
[26]	validation_0-auc:0.73817
[27]	validation_0-auc:0.74008
[28]	validation_0-auc:0.74162
[29]	validation_0-auc:0.74099
[30]	validation_0-auc:0.74224
[31]	validation_0-auc:0.74079
[32]	validation_0-auc:0.74047
[33]	validation_0-au

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5,
              enable_categorical=False, eval_metric='auc', gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=3000, n_jobs=4, num_parallel_tree=1,
              predictor='auto', random_state=1, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=0.5, tree_method='exact',
              validate_parameters=1, verbosity=1)

# Evaluate Model Performance

In [7]:
y_train_pred = model_xgboost.predict_proba(X_train)[:,1]
y_valid_pred = model_xgboost.predict_proba(X_valid)[:,1]

print("AUC Train: {:.4f}\nAUC Valid {:.4f}".format(roc_auc_score(y_train, y_train_pred), roc_auc_score(y_valid,y_valid_pred)))

AUC Train: 0.8448
AUC Valid 0.7770


# Hyperparameter Tuning

In [8]:
learning_rate_list = [0.02, 0.05, .1]
num_parallel_tree_list =[1,3,5]
n_estimators_list = [500, 750, 1000]
params_dict = {'learning_rate' : learning_rate_list,
               'num_parallel_tree': num_parallel_tree_list,
               'n_estimators': n_estimators_list}
num_combinations=1
for v in params_dict.values(): num_combinations  *= len (v)
    
print(num_combinations)
params_dict

27


{'learning_rate': [0.02, 0.05, 0.1],
 'num_parallel_tree': [1, 3, 5],
 'n_estimators': [500, 750, 1000]}

In [9]:
def my_roc_auc_score(model, X, y): return roc_auc_score(y, model.predict_proba(X)[:,1])
model_xgboost_hp= GridSearchCV(estimator=xgb.XGBClassifier(subsample=.5,
                                                          colsample_bytree=.25,
                                                          eval_metric='auc',
                                                          use_label_encoder=False),
                              param_grid=params_dict,
                              cv=2,
                              scoring=my_roc_auc_score,
                              return_train_score=True,
                              verbose=4)
model_xgboost_hp.fit(X,y)

Fitting 2 folds for each of 27 candidates, totalling 54 fits
[CV 1/2] END learning_rate=0.02, n_estimators=500, num_parallel_tree=1; total time=  50.4s
[CV 2/2] END learning_rate=0.02, n_estimators=500, num_parallel_tree=1; total time=  49.4s
[CV 1/2] END learning_rate=0.02, n_estimators=500, num_parallel_tree=3; total time= 2.3min
[CV 2/2] END learning_rate=0.02, n_estimators=500, num_parallel_tree=3; total time= 2.3min
[CV 1/2] END learning_rate=0.02, n_estimators=500, num_parallel_tree=5; total time= 3.9min
[CV 2/2] END learning_rate=0.02, n_estimators=500, num_parallel_tree=5; total time= 3.8min
[CV 1/2] END learning_rate=0.02, n_estimators=750, num_parallel_tree=1; total time= 1.2min
[CV 2/2] END learning_rate=0.02, n_estimators=750, num_parallel_tree=1; total time= 1.2min
[CV 1/2] END learning_rate=0.02, n_estimators=750, num_parallel_tree=3; total time= 3.5min
[CV 2/2] END learning_rate=0.02, n_estimators=750, num_parallel_tree=3; total time= 3.5min
[CV 1/2] END learning_rate=0.

GridSearchCV(cv=2,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.25,
                                     enable_categorical=False,
                                     eval_metric='auc', gamma=None, gpu_id=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone...
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=0.5, tree_method=None,
                           

In [12]:
df_cv_results2 = pd.DataFrame(model_xgboost_hp.cv_results_)
df_cv_reults2 = df_cv_results2[['rank_test_score', 'mean_test_score', 'param_learning_rate', 'param_num_parallel_tree', 'param_n_estimators' ]]
df_cv_results2.sort_values(by='rank_test_score', inplace=True)
df_cv_results2

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_n_estimators,param_num_parallel_tree,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,mean_train_score,std_train_score
8,457.407718,1.221733,1.3623,0.007978,0.02,1000,5,"{'learning_rate': 0.02, 'n_estimators': 1000, ...",0.883459,0.881575,0.882517,0.000942,1,0.999871,0.99994,0.999906,3.424651e-05
17,453.024094,1.410879,1.442583,0.011466,0.05,1000,5,"{'learning_rate': 0.05, 'n_estimators': 1000, ...",0.881281,0.880314,0.880797,0.000483,2,1.0,1.0,1.0,0.0
7,279.143487,1.18483,0.828726,0.006983,0.02,1000,3,"{'learning_rate': 0.02, 'n_estimators': 1000, ...",0.881889,0.879683,0.880786,0.001103,3,0.999834,0.999943,0.999889,5.434734e-05
14,345.500003,1.406228,1.066105,0.002981,0.05,750,5,"{'learning_rate': 0.05, 'n_estimators': 750, '...",0.881066,0.88007,0.880568,0.000498,4,1.0,1.0,1.0,0.0
26,441.481256,0.84873,1.359365,0.008976,0.1,1000,5,"{'learning_rate': 0.1, 'n_estimators': 1000, '...",0.881371,0.87888,0.880126,0.001245,5,1.0,1.0,1.0,0.0
5,349.88902,6.370701,1.025711,0.009489,0.02,750,5,"{'learning_rate': 0.02, 'n_estimators': 750, '...",0.881032,0.878631,0.879831,0.0012,6,0.999343,0.999558,0.99945,0.0001077581
11,230.892536,0.007978,0.707051,0.011968,0.05,500,5,"{'learning_rate': 0.05, 'n_estimators': 500, '...",0.880457,0.878956,0.879707,0.00075,7,0.999974,0.999988,0.999981,6.564798e-06
23,336.117235,0.469618,1.054152,0.006012,0.1,750,5,"{'learning_rate': 0.1, 'n_estimators': 750, 'n...",0.880423,0.877835,0.879129,0.001294,8,1.0,1.0,1.0,0.0
16,276.950909,1.071746,0.838256,0.019448,0.05,1000,3,"{'learning_rate': 0.05, 'n_estimators': 1000, ...",0.879495,0.878476,0.878985,0.00051,9,1.0,1.0,1.0,0.0
13,208.805068,0.27179,0.63678,0.01245,0.05,750,3,"{'learning_rate': 0.05, 'n_estimators': 750, '...",0.879643,0.878194,0.878918,0.000724,10,1.0,1.0,1.0,8.835528e-09


In [14]:

df_cv_results2.to_csv('output/model2.csv')
