In [74]:
import warnings
warnings.filterwarnings("ignore")

In [75]:
import pandas as pd 
import numpy as np 

from xgboost import XGBClassifier,DMatrix,train
from xgboost.callback import EarlyStopping
from xgboost import cv

from sklearn.metrics import f1_score,confusion_matrix,roc_auc_score,recall_score,accuracy_score
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.utils import resample

In [76]:
# Directory holding the pre-split CSV files. A raw string keeps the Windows
# backslashes literal instead of relying on them being invalid (and therefore
# untouched) escape sequences such as "\w" or "\d".
DATA_DIR = r'C:\workspace\Credit-Risk-Modeling\dataset\kaggle_pd_estimation'


def load_split(name):
    """Read ``<DATA_DIR>\\<name>.csv`` and return its values as a numpy array.

    The first CSV column is used as the index and is therefore not part of
    the returned array.
    """
    return pd.read_csv(f"{DATA_DIR}\\{name}.csv", index_col=0).to_numpy()


X_train = load_split('X_train')
# The label files are flattened to 1-D vectors.
y_train = load_split('y_train').reshape(-1)

X_test = load_split('X_test')
y_test = load_split('y_test').reshape(-1)

In [77]:
# Glue the label vector onto the feature matrix as the last column, so that
# features and label travel together through splitting and resampling.
train_np = np.column_stack((X_train, y_train))

In [78]:
# Hyper-parameter grid in the format expected by ensemble_xgb: list-valued
# entries are searched exhaustively, scalar entries are passed through.
# Tree counts are rounded to plain Python ints because a boosting-round count
# must be integral; np.linspace alone yields floats such as 105.26.
# NOTE(review): the grid still starts at 0 trees, as in the original - confirm
# whether a positive lower bound was intended.
param = {
    'n_estimators': np.linspace(0, 2000, 20).round().astype(int).tolist(),
    'max_depth': [3, 5, 8, 12, 15],
    'eta': np.linspace(0.01, 0.2, 10).tolist(),
    'objective': 'binary:logistic',
    'tree_method': 'hist',
}

In [79]:
def make_balance(fit_df, random_state=None):
    """Oversample the minority class until labels 0 and 1 are equally frequent.

    The last column of ``fit_df`` is read as the binary label. Every minority
    row is repeated ``n_major // n_minor`` times, and ``n_major % n_minor``
    distinct minority rows (drawn without replacement) are added on top so
    both classes end up with ``n_major`` rows. Majority rows are kept exactly
    once. Rows whose label is neither 0 nor 1 are dropped. The result is
    returned in shuffled order.

    Parameters
    ----------
    fit_df : 2-D array-like
        Feature columns followed by the label column.
    random_state : None, int or numpy.random.Generator, optional
        Seed for the top-up draw and the final shuffle. ``None`` (default)
        gives a different result on every call.

    Returns
    -------
    numpy.ndarray
        Array with ``2 * n_major`` rows and the same columns as ``fit_df``.

    Raises
    ------
    ValueError
        If either class has no rows, since balancing is then impossible.
    """
    fit_df = np.asarray(fit_df)
    labels = fit_df[:, -1]
    negatives = fit_df[labels == 0]
    positives = fit_df[labels == 1]

    if len(negatives) == 0 or len(positives) == 0:
        raise ValueError(
            "make_balance needs at least one row of each class "
            "(got {} with label 0 and {} with label 1)".format(len(negatives), len(positives))
        )

    # Do not assume class 0 is the larger one: oversample whichever is smaller.
    if len(negatives) >= len(positives):
        majority, minority = negatives, positives
    else:
        majority, minority = positives, negatives

    rng = np.random.default_rng(random_state)
    full_copies, remainder = divmod(len(majority), len(minority))

    # Distinct minority rows that close the gap left by the whole copies.
    top_up = minority[rng.choice(len(minority), size=remainder, replace=False)]

    balanced = np.concatenate((np.tile(minority, (full_copies, 1)), top_up, majority))
    return balanced[rng.permutation(len(balanced))]

In [80]:
def ensemble_xgb(train_np,params,test_size=0.3,random_state=None,verbose=True):
    """Grid-search XGBoost settings on one stratified hold-out split.

    ``train_np`` is split into a fitting part and a validation part,
    stratified on the last column (the label). Only the fitting part is
    passed through ``make_balance``; the validation part is left untouched.
    One ``XGBClassifier`` is trained for every combination of
    ``max_depth`` x ``eta`` x ``n_estimators`` and scored with F1 on the
    validation part.

    Parameters
    ----------
    train_np : numpy.ndarray
        2-D array whose last column is the binary label.
    params : dict
        Needs list-valued ``'n_estimators'``, ``'max_depth'`` and ``'eta'``
        plus scalar ``'objective'`` and ``'tree_method'``.
    test_size : float, optional
        Passed to ``train_test_split`` as the validation share.
    random_state : int or None, optional
        Seeds the train/validation split and each XGBoost model. It is not
        forwarded to ``make_balance``, which draws its own randomness, so
        results are only partly repeatable.
    verbose : bool, optional
        Print each result as soon as it is available (default, as before).

    Returns
    -------
    pandas.DataFrame
        One column per trained model (``model_number_1``, ...) with rows
        ``n_estimators``, ``max_depth``, ``learning_rate`` and
        ``accuracy_score``. Despite its name, ``accuracy_score`` holds the
        validation F1 score; the key is kept for compatibility.
    """
    objective=params['objective']  # was bound to `object`, shadowing the builtin
    tree_method=params['tree_method']

    fit_df,val_df=train_test_split(train_np,random_state=random_state,test_size=test_size,
                                   stratify=train_np[:,-1])
    fit_balanced=make_balance(fit_df)

    X_fit,y_fit=fit_balanced[:,:-1],fit_balanced[:,-1]
    X_val,y_val=val_df[:,:-1],val_df[:,-1]

    model_dic={}
    model_number=1
    for depth in params['max_depth']:
        for eta in params['eta']:
            for n_trees in params['n_estimators']:
                # The number of boosting rounds must be an integer; np.round
                # alone would leave a float for grids built with np.linspace.
                n_trees=int(round(n_trees))
                model=XGBClassifier(n_estimators=n_trees,objective=objective,tree_method=tree_method,
                                    learning_rate=eta,max_depth=depth,n_jobs=-1,
                                    random_state=random_state).fit(X_fit,y_fit)
                score=f1_score(y_val,model.predict(X_val))
                key='model_number_{}'.format(model_number)
                model_dic[key]={'n_estimators':n_trees,'max_depth':depth,
                                'learning_rate':eta,'accuracy_score':score}
                if verbose:
                    print(model_dic[key])
                model_number+=1

    return pd.DataFrame(model_dic)


In [84]:
# Second search grid: 20 integer tree counts evenly spaced over 100..5000 and
# a short list of learning rates.
param1 = {
    'n_estimators': np.linspace(100, 5000, 20).round().astype(int).tolist(),
    'max_depth': [3, 5, 8, 12, 15],
    'eta': [0.3, 0.2, 0.1, 0.05],
    'objective': 'binary:logistic',
    'tree_method': 'hist',
}
# Trains one model per grid point and prints each result as it goes.
selected_model1 = ensemble_xgb(train_np, param1)

{'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.3, 'accuracy_score': 0.7702432045779686}
{'n_estimators': 358, 'max_depth': 3, 'learning_rate': 0.3, 'accuracy_score': 0.7980266976204295}
{'n_estimators': 616, 'max_depth': 3, 'learning_rate': 0.3, 'accuracy_score': 0.7967193907439953}
{'n_estimators': 874, 'max_depth': 3, 'learning_rate': 0.3, 'accuracy_score': 0.799882283696292}
{'n_estimators': 1132, 'max_depth': 3, 'learning_rate': 0.3, 'accuracy_score': 0.800834824090638}
{'n_estimators': 1389, 'max_depth': 3, 'learning_rate': 0.3, 'accuracy_score': 0.8}
{'n_estimators': 1647, 'max_depth': 3, 'learning_rate': 0.3, 'accuracy_score': 0.8018018018018018}
{'n_estimators': 1905, 'max_depth': 3, 'learning_rate': 0.3, 'accuracy_score': 0.800478612025127}
{'n_estimators': 2163, 'max_depth': 3, 'learning_rate': 0.3, 'accuracy_score': 0.7985546522131888}
{'n_estimators': 2421, 'max_depth': 3, 'learning_rate': 0.3, 'accuracy_score': 0.7981845688350984}
{'n_estimators': 2679, 'max_dept

In [72]:
# Preview of a 10-point integer grid of tree counts between 100 and 5000.
np.linspace(100, 5000, 10).round().astype(int).tolist()

[100, 644, 1189, 1733, 2278, 2822, 3367, 3911, 4456, 5000]

In [55]:
# Oversample the full training set, then show the class counts as a check
# that both labels are now equally frequent.
train_np2 = make_balance(train_np)
balanced_labels = pd.Series(train_np2[:, -1])
balanced_labels.value_counts()

1.0    20378
0.0    20378
Name: count, dtype: int64

In [56]:
# NOTE: this rebinds X_train / y_train to the oversampled data, so the arrays
# loaded from disk are no longer reachable under these names afterwards.
X_train, y_train = train_np2[:, :-1], train_np2[:, -1]

# Settings of the final model, fitted on the balanced training set.
final_params = dict(
    n_estimators=5000,
    max_depth=15,
    learning_rate=0.01,
    objective='binary:logistic',
    tree_method='hist',
)
model = XGBClassifier(**final_params).fit(X_train, y_train)

In [57]:
# Class predictions for the test features, compared against the test labels.
pred = model.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=pred)

array([[5002,   93],
       [ 340, 1082]], dtype=int64)

In [58]:
# Share of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=pred)

0.9335583857603191

In [59]:
# F1 score on the test split (same metric used during the grid search).
f1_score(y_true=y_test, y_pred=pred)

0.8332691567192915

In [60]:
# Recall on the test split.
recall_score(y_true=y_test, y_pred=pred)

0.7609001406469761