In [1]:
import numpy as np
import pandas as pd

#!pip install xgboost
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

#!pip install shap
import matplotlib.pyplot as plt

In [None]:
# Read Resources/test.csv. The frame is named dfTest to match dfTrain;
# fTest is kept as an alias so any existing reference to the old name
# continues to work.
dfTest = pd.read_csv('Resources/test.csv')
fTest = dfTest
dfTest.head()

In [None]:
# Read Resources/train.csv into dfTrain and preview the first five rows.
dfTrain = pd.read_csv(filepath_or_buffer='Resources/train.csv')
dfTrain.head(n=5)

In [None]:
# Feature columns: every column of dfTrain except 'ID_code' and 'target'.
var_columns = [c for c in dfTrain.columns if c not in ('ID_code', 'target')]
X = dfTrain.loc[:, var_columns]
y = dfTrain.loc[:, 'target']

# Hold out 20% of the rows for validation. The split is seeded so that a
# fresh "Restart & Run All" produces the same partition, which keeps the
# AUC numbers computed from it comparable between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=.2, random_state=1
)

# Sanity check: shapes of the four resulting pieces.
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

In [None]:
# Display the parameters of an XGBClassifier constructed with no arguments,
# as a reference for what can be tuned.
default_classifier = xgb.XGBClassifier()
default_classifier.get_params()

In [None]:
# Baseline XGBoost classifier, evaluated with AUC on the validation split.
#
# early_stopping_rounds is given to the constructor rather than to fit():
# passing it to fit() was deprecated in xgboost 1.6 and removed in 2.0,
# where it raises TypeError. This form requires xgboost >= 1.6.
model_xgboost = xgb.XGBClassifier(learning_rate=.01,
                                  max_depth=5,
                                  n_estimators=500,
                                  subsample=.5,
                                  colsample_bytree=.5,
                                  eval_metric='auc',
                                  early_stopping_rounds=10,
                                  verbosity=1,
                                  random_state=1)

# Validation data monitored during training; training stops when the AUC on
# it has not improved for 10 consecutive rounds.
eval_set = [(X_valid, y_valid)]

model_xgboost.fit(X_train,
                  y_train,
                  eval_set=eval_set,
                  verbose=True)

# Evaluate Model Performance

In [None]:
# Column 1 of predict_proba is used as the score for each split.
y_train_pred = model_xgboost.predict_proba(X_train)[:, 1]
y_valid_pred = model_xgboost.predict_proba(X_valid)[:, 1]

# ROC AUC on the training split and on the validation split.
auc_train = roc_auc_score(y_train, y_train_pred)
auc_valid = roc_auc_score(y_valid, y_valid_pred)

report_template = "AUC Train: {:.4f}\nAUC Valid {:.4f}"
print(report_template.format(auc_train, auc_valid))

# Hyperparameter Tuning

In [None]:
# Candidate values for each hyperparameter included in the grid search.
learning_rate_list = [0.02, 0.05, .1]
num_parallel_tree_list = [1, 3, 5]
n_estimators_list = [500, 750, 1000]

# Grid definition: parameter name -> list of candidate values.
params_dict = dict(
    learning_rate=learning_rate_list,
    num_parallel_tree=num_parallel_tree_list,
    n_estimators=n_estimators_list,
)

# Size of the full grid: product of the number of candidates per parameter.
num_combinations = 1
for candidates in params_dict.values():
    num_combinations = num_combinations * len(candidates)

print(num_combinations)
params_dict

In [None]:
def my_roc_auc_score(model, X, y):
    """Return the ROC AUC of ``model`` on ``(X, y)``.

    Column 1 of ``model.predict_proba(X)`` is used as the score passed to
    ``roc_auc_score``. The ``(estimator, X, y)`` signature lets the function
    be handed to GridSearchCV as its ``scoring`` callable.
    """
    scores = model.predict_proba(X)[:, 1]
    return roc_auc_score(y, scores)
# Estimator settings held fixed during the search; the values in params_dict
# are varied on top of these.
base_estimator = xgb.XGBClassifier(
    subsample=.5,
    colsample_bytree=.25,
    eval_metric='auc',
    use_label_encoder=False,
)

# Exhaustive search over params_dict with 2-fold cross-validation, scored by
# my_roc_auc_score; train scores are recorded alongside the test scores.
search_settings = dict(
    param_grid=params_dict,
    cv=2,
    scoring=my_roc_auc_score,
    return_train_score=True,
    verbose=4,
)
model_xgboost_hp = GridSearchCV(estimator=base_estimator, **search_settings)
model_xgboost_hp.fit(X, y)

In [None]:
# Full cross-validation results table from the grid search.
df_cv_results = pd.DataFrame(model_xgboost_hp.cv_results_)

# Keep only the ranking, the mean test score and the three tuned parameters.
# GridSearchCV names parameter columns 'param_<name>', so the column for
# num_parallel_tree is 'param_num_parallel_tree'.
summary_columns = ['rank_test_score',
                   'mean_test_score',
                   'param_learning_rate',
                   'param_num_parallel_tree',
                   'param_n_estimators']

# Best-ranked combination first. sort_values returns a new frame, so
# re-running this cell always gives the same result.
df_cv_results2 = df_cv_results[summary_columns].sort_values(by='rank_test_score')
df_cv_results2

In [None]:

#df_cv_results.to_csv('output/model2.csv')
