In [1]:
import numpy as np
import pandas as pd

#!pip install xgboost
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

#!pip install shap
import matplotlib.pyplot as plt

In [2]:
fTest = pd.read_csv('Resources/test.csv')

fTest=fTest.sample(50000)

In [3]:
dfTrain=pd.read_csv('Resources/train.csv')
dfTrain=dfTrain.sample(50000)

In [4]:
var_columns=[c for c in dfTrain.columns if c not in ['ID_code', 'target']]
X = dfTrain.loc[:, var_columns]
y=dfTrain.loc[:, 'target']
X_train, X_valid, y_train, y_valid = train_test_split (X,y, test_size=.2)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((40000, 200), (10000, 200), (40000,), (10000,))

In [5]:
xgb.XGBClassifier().get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': True,
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'enable_categorical': False,
 'gamma': None,
 'gpu_id': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [6]:
model_xgboost = xgb.XGBClassifier(learning_rate=.01,
                                     max_depth=5,
                                     n_estimators=3000,
                                     subsample=.5,
                                     colsample_bytree=.5,
                                     eval_metric='auc',
                                     verbosity=1,
                                    random_state=1)

eval_set =[(X_valid, y_valid)]

model_xgboost.fit(X_train,
                 y_train,
                 early_stopping_rounds=10,
                 eval_set=eval_set,
                 verbose=True)



[0]	validation_0-auc:0.59713
[1]	validation_0-auc:0.63256
[2]	validation_0-auc:0.64794
[3]	validation_0-auc:0.66539
[4]	validation_0-auc:0.68281
[5]	validation_0-auc:0.69532
[6]	validation_0-auc:0.69578
[7]	validation_0-auc:0.70263
[8]	validation_0-auc:0.70588
[9]	validation_0-auc:0.70861
[10]	validation_0-auc:0.71042
[11]	validation_0-auc:0.71267
[12]	validation_0-auc:0.71900
[13]	validation_0-auc:0.72066
[14]	validation_0-auc:0.72251
[15]	validation_0-auc:0.72560
[16]	validation_0-auc:0.72573
[17]	validation_0-auc:0.73029
[18]	validation_0-auc:0.73186
[19]	validation_0-auc:0.73198
[20]	validation_0-auc:0.73171
[21]	validation_0-auc:0.73635
[22]	validation_0-auc:0.73622
[23]	validation_0-auc:0.73645
[24]	validation_0-auc:0.73655
[25]	validation_0-auc:0.73793
[26]	validation_0-auc:0.74088
[27]	validation_0-auc:0.74309
[28]	validation_0-auc:0.74526
[29]	validation_0-auc:0.74624
[30]	validation_0-auc:0.74611
[31]	validation_0-auc:0.74571
[32]	validation_0-auc:0.74630
[33]	validation_0-au

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5,
              enable_categorical=False, eval_metric='auc', gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=3000, n_jobs=4, num_parallel_tree=1,
              predictor='auto', random_state=1, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=0.5, tree_method='exact',
              validate_parameters=1, verbosity=1)

# Evaluate Model Performance

In [9]:
y_train_pred = model_xgboost.predict_proba(X_train)[:,1]
y_valid_pred = model_xgboost.predict_proba(X_valid)[:,1]

print("AUC Train: {:.4f}\nAUC Valid {:.4f}".format(roc_auc_score(y_train, y_train_pred), roc_auc_score(y_valid,y_valid_pred)))

AUC Train: 0.8482
AUC Valid 0.7685


# Hyperparameter Tuning

In [10]:
learning_rate_list = [0.02, 0.05, .1]
num_parallel_tree_list =[1,3,5]
n_estimators_list = [500, 750, 1000]
params_dict = {'learning_rate' : learning_rate_list,
               'num_parallel_tree': num_parallel_tree_list,
               'n_estimators': n_estimators_list}
num_combinations=1
for v in params_dict.values(): num_combinations  *= len (v)
    
print(num_combinations)
params_dict

27


{'learning_rate': [0.02, 0.05, 0.1],
 'num_parallel_tree': [1, 3, 5],
 'n_estimators': [500, 750, 1000]}

In [11]:
def my_roc_auc_score(model, X, y): return roc_auc_score(y, model.predict_proba(X)[:,1])
model_xgboost_hp= GridSearchCV(estimator=xgb.XGBClassifier(subsample=.5,
                                                          colsample_bytree=.25,
                                                          eval_metric='auc',
                                                          use_label_encoder=False),
                              param_grid=params_dict,
                              cv=2,
                              scoring=my_roc_auc_score,
                              return_train_score=True,
                              verbose=4)
model_xgboost_hp.fit(X,y)

Fitting 2 folds for each of 27 candidates, totalling 54 fits
[CV 1/2] END learning_rate=0.02, n_estimators=500, num_parallel_tree=1; total time= 1.1min
[CV 2/2] END learning_rate=0.02, n_estimators=500, num_parallel_tree=1; total time= 1.0min
[CV 1/2] END learning_rate=0.02, n_estimators=500, num_parallel_tree=3; total time= 2.9min
[CV 2/2] END learning_rate=0.02, n_estimators=500, num_parallel_tree=3; total time= 2.5min
[CV 1/2] END learning_rate=0.02, n_estimators=500, num_parallel_tree=5; total time= 3.9min
[CV 2/2] END learning_rate=0.02, n_estimators=500, num_parallel_tree=5; total time= 3.9min
[CV 1/2] END learning_rate=0.02, n_estimators=750, num_parallel_tree=1; total time= 1.3min
[CV 2/2] END learning_rate=0.02, n_estimators=750, num_parallel_tree=1; total time= 1.3min
[CV 1/2] END learning_rate=0.02, n_estimators=750, num_parallel_tree=3; total time= 3.6min
[CV 2/2] END learning_rate=0.02, n_estimators=750, num_parallel_tree=3; total time= 3.5min
[CV 1/2] END learning_rate=0.

GridSearchCV(cv=2,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.25,
                                     enable_categorical=False,
                                     eval_metric='auc', gamma=None, gpu_id=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone...
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=0.5, tree_method=None,
                           

In [16]:
df_cv_results = pd.DataFrame(model_xgboost_hp.cv_results_)
Model2Results = df_cv_results[['rank_test_score', 'mean_test_score', 'param_learning_rate', 'param_num_parallel_tree', 'param_n_estimators' ]]
Model2Results.sort_values(by='rank_test_score', inplace=True)
Model2Results

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Model2Results.sort_values(by='rank_test_score', inplace=True)


Unnamed: 0,rank_test_score,mean_test_score,param_learning_rate,param_num_parallel_tree,param_n_estimators
8,1,0.882748,0.02,5,1000
17,2,0.882553,0.05,5,1000
14,3,0.882526,0.05,5,750
26,4,0.88194,0.1,5,1000
7,5,0.88183,0.02,3,1000
11,6,0.881615,0.05,5,500
23,7,0.881182,0.1,5,750
16,8,0.881108,0.05,3,1000
13,9,0.880545,0.05,3,750
25,10,0.880287,0.1,3,1000


In [17]:

Model2Results.to_csv('output/model2.csv')
