### Model Tuning

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
import logging
import joblib
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, precision_recall_curve, auc


In [None]:
class WoE_Binning(BaseEstimator, TransformerMixin):
    """Encode loan features by their weight of evidence (WoE).

    ``fit`` builds one WoE lookup table per feature: categorical features are
    used as they are, numeric features are first cut into equal-width bins.
    ``transform`` then replaces every value by the WoE of its category or bin.

    The target given to ``fit`` must be a pandas Series named ``good_bad``
    containing the values 0 and 1, because ``woe`` looks up exactly those
    labels in the cross-tabulation.
    """

    def __init__(self):  # no *args or *kargs
        # One fitted WoE lookup table per feature; all stay None until fit().
        self.woe_grade = None
        self.woe_home = None
        self.woe_verStatus = None
        self.woe_purpose = None
        self.woe_term = None
        self.woe_int_rate = None
        self.woe_annInc = None
        self.woe_dti = None
        self.woe_revolUtil = None
        self.woe_outPrncp = None
        self.woe_totalPymnt = None
        self.woe_totalRev = None
        self.woe_mthsECL = None
        self.woe_mthsID = None
        self.woe_mthsLCP = None

    def fit(self, X, y):
        """Learn the WoE table of every feature from ``X`` and target ``y``.

        Parameters:
            X: DataFrame holding all categorical and numeric feature columns.
            y: Series named ``good_bad`` (values 0/1) sharing ``X``'s index.

        Returns:
            self, so the call can be chained.
        """
        # Sends the progress messages emitted by transform() to ./app.log.
        logging.basicConfig(filename='./app.log', filemode='w',
                            format='%(asctime)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S', level=logging.INFO)

        # Categorical features: one WoE value per category.
        self.woe_grade = self.woe(X, 'grade', y)
        self.woe_home = self.woe(X, 'home_ownership', y)
        self.woe_verStatus = self.woe(X, 'verification_status', y)
        self.woe_purpose = self.woe(X, 'purpose', y)
        self.woe_term = self.woe(X, 'term', y)

        # Numeric features: fine-classing into equal-width bins, one WoE per bin.
        # `upper` drops the sparse high tail before the bin edges are computed;
        # those values later fall into the open-ended last bin.
        self.woe_int_rate = self._fit_binned_woe(X, y, 'int_rate', bins=50, right=False)
        self.woe_annInc = self._fit_binned_woe(X, y, 'annual_inc', bins=12, upper=150500)
        self.woe_dti = self._fit_binned_woe(X, y, 'dti', bins=10)
        self.woe_revolUtil = self._fit_binned_woe(X, y, 'revol_util', bins=10, upper=1)
        self.woe_outPrncp = self._fit_binned_woe(X, y, 'out_prncp', bins=26)
        self.woe_totalPymnt = self._fit_binned_woe(X, y, 'total_pymnt', bins=20, upper=30000)
        self.woe_totalRev = self._fit_binned_woe(X, y, 'total_rev_hi_lim', bins=25, upper=79750)
        self.woe_mthsECL = self._fit_binned_woe(X, y, 'mths_since_earliest_cr_line', bins=50)
        self.woe_mthsID = self._fit_binned_woe(X, y, 'mths_since_issue_d', bins=24)
        self.woe_mthsLCP = self._fit_binned_woe(X, y, 'mths_since_last_credit_pull_d', bins=10)

        return self

    def _fit_binned_woe(self, X, y, column, bins, upper=None, right=True):
        """Bin one numeric column and return its WoE table with open-ended edges.

        Parameters:
            X: DataFrame containing ``column``.
            y: target Series named ``good_bad``.
            column: name of the numeric column to bin.
            bins: number of equal-width bins handed to ``pd.cut``.
            upper: when given, only rows with ``column <= upper`` are used.
            right: forwarded to ``pd.cut``.

        Returns:
            DataFrame with the columns ``<column>_factor`` (one ``pd.Interval``
            per bin) and ``woe``. The first interval is widened to start at 0
            and the last one to end at infinity.
        """
        factor = f'{column}_factor'
        values = X[column]
        target = y
        if upper is not None:
            values = values[values <= upper]
            # Keep only the target rows that survived the filter.
            target = y.loc[values.index]

        binned = pd.cut(values, bins, right=right).rename(factor).to_frame()
        table = self.woe(binned, factor, target)

        # Stretch the outer bins so that values outside the range seen during
        # fitting still land in a bin when transforming.
        first_bin, last_bin = table.index[0], table.index[-1]
        sheet = table.reset_index()[[factor, 'woe']].to_dict(orient='list')
        sheet[factor][0] = pd.Interval(left=0, right=first_bin.right, closed='left')
        sheet[factor][-1] = pd.Interval(left=last_bin.left, right=np.inf, closed='left')
        return pd.DataFrame(sheet)

    def transform(self, X):
        """Return a copy of ``X`` with every feature replaced by its WoE value.

        Categorical columns are mapped category -> WoE; numeric columns are
        mapped bin -> WoE through ``replace_woe``. ``X`` itself is not modified.
        """
        newX = X.copy()

        categorical = (
            ('grade', self.woe_grade),
            ('home_ownership', self.woe_home),
            ('verification_status', self.woe_verStatus),
            ('purpose', self.woe_purpose),
            ('term', self.woe_term),
        )
        for column, sheet in categorical:
            newX.replace({column: sheet['woe'].to_dict()}, inplace=True)

        # (source column, fitted table, log label, column receiving the WoE)
        # NOTE(review): the last feature is written to a new `_factor` column
        # and its raw column is kept, unlike every other feature. This layout is
        # preserved on purpose because later pipeline steps or models may have
        # been fitted on it; confirm before changing.
        numeric = (
            ('int_rate', self.woe_int_rate, 'int_rate processing', 'int_rate'),
            ('annual_inc', self.woe_annInc, 'annual_inc processing', 'annual_inc'),
            ('dti', self.woe_dti, 'dti processing', 'dti'),
            ('revol_util', self.woe_revolUtil, 'revol_util processing', 'revol_util'),
            ('out_prncp', self.woe_outPrncp, 'out_prncp processing', 'out_prncp'),
            ('total_pymnt', self.woe_totalPymnt, 'total_pymnt processing', 'total_pymnt'),
            ('total_rev_hi_lim', self.woe_totalRev, 'total_rev_hi_lim', 'total_rev_hi_lim'),
            ('mths_since_earliest_cr_line', self.woe_mthsECL,
             'mths_since_earliest_cr_line', 'mths_since_earliest_cr_line'),
            ('mths_since_issue_d', self.woe_mthsID, 'mths_since_issue_d', 'mths_since_issue_d'),
            ('mths_since_last_credit_pull_d', self.woe_mthsLCP,
             'mths_since_last_credit_pull_d', 'mths_since_last_credit_pull_d_factor'),
        )
        for source, sheet, label, target in numeric:
            logging.info('%s...started', label)
            newX[target] = self.replace_woe(newX[source], sheet, source + '_factor')
            logging.info('%s...end', label)

        return newX

    def woe(self, df, cat_variabe_name, y_df):
        """Compute weight of evidence and information value per category.

        Parameters:
            df: DataFrame containing the column ``cat_variabe_name``.
            cat_variabe_name: name of the categorical (or binned) column.
            y_df: target Series named ``good_bad`` with the values 0 and 1.

        Returns:
            DataFrame indexed by category with the columns ``0`` and ``1``
            (share of all 0-rows / 1-rows falling in the category), ``woe``
            (log of share-of-1 over share-of-0) and ``iv`` (the total
            information value, repeated on every row).
        """
        df = pd.concat([df[cat_variabe_name], y_df], axis=1)
        shares = pd.crosstab(df[cat_variabe_name], df['good_bad'], normalize='columns')
        df_woe_iv = shares.assign(
            woe=lambda dfx: np.log(dfx[1]/dfx[0])).assign(iv=lambda dfx: np.sum(dfx['woe']*(dfx[1]-dfx[0])))
        return df_woe_iv

    def replace_woe(self, col, woe_sheet, name):
        """Map every value of ``col`` to the WoE of the bin that contains it.

        Parameters:
            col: Series of numeric values.
            woe_sheet: DataFrame with an interval column ``name`` and a ``woe``
                column, as produced while fitting.
            name: name of the interval column in ``woe_sheet``.

        Returns:
            List with one WoE value per element of ``col``.

        Raises:
            IndexError: if a value (NaN included) lies in none of the bins.
        """
        # All bins are treated as left-closed, whatever their own `closed` is.
        arr_iv = pd.arrays.IntervalArray(woe_sheet[name], closed='left')
        lower = np.asarray(arr_iv.left, dtype=float)
        upper = np.asarray(arr_iv.right, dtype=float)
        woe_values = woe_sheet['woe'].to_numpy()

        values = np.asarray(col, dtype=float)
        if values.size == 0:
            return []

        # inside[i, j] is True when values[i] lies in the j-th bin.
        points = values[:, None]
        inside = (points >= lower) & (points < upper)
        covered = inside.any(axis=1)
        if not covered.all():
            offender = values[~covered][0]
            raise IndexError(f"{name}: value {offender!r} is not covered by any fitted bin")

        # argmax returns the first True per row, i.e. the first matching bin.
        return woe_values[inside.argmax(axis=1)].tolist()

In [None]:
# Load the loan dataset from a CSV file; the path is relative to the working directory.
data = pd.read_csv('../input/loan-data/loan_data_2007_2014.csv')

In [None]:
# create a new column based on the loan_status column that will be our target variable
# Build the binary target from loan_status: 0 for the statuses listed below,
# 1 for every other status.
bad_loan_statuses = [
    'Charged Off',
    'Default',
    'Late (31-120 days)',
    'Does not meet the credit policy. Status:Charged Off',
]
is_bad_loan = data['loan_status'].isin(bad_loan_statuses)
data['good_bad'] = np.where(is_bad_loan, 0, 1)
# The raw status column is dropped once the target has been derived from it.
data.drop(columns=['loan_status'], inplace=True)

In [None]:
# Impute missing numeric values with their column mean. numeric_only=True keeps
# the text columns out of the mean: without it pandas >= 2.0 raises TypeError
# on them, while older versions left them out of the result anyway.
data.fillna(data.mean(axis=0, numeric_only=True), inplace=True, axis=0)

In [None]:
# Columns fed to the model, in the order they appear in the feature matrix.
feature_columns = [
    'grade', 'home_ownership', 'verification_status', 'purpose', 'term',
    'int_rate', 'annual_inc', 'dti', 'revol_util', 'out_prncp', 'total_pymnt',
    'total_rev_hi_lim', 'mths_since_earliest_cr_line', 'mths_since_issue_d',
    'mths_since_last_credit_pull_d',
]
X = data.drop(columns=['good_bad'])[feature_columns]
y = data['good_bad']
# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Load a serialized preprocessing object from disk; presumably the already
# fitted binning pipeline, since it is only used for transform() below.
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a
# trusted source.
num_pipe = joblib.load('./binning.joblib')

In [None]:
# Run the training features through the loaded pipeline; X_train is overwritten
# with the transformed result.
X_train = num_pipe.transform(X_train)

In [None]:
# Apply the same transformation to the held-out test features.
X_test = num_pipe.transform(X_test)

In [None]:
# Repeated stratified 5-fold cross-validation with a fixed seed and the
# library-default number of repeats. RepeatedStratifiedKFold takes no n_jobs
# argument (passing one raises TypeError); parallelism is requested on the
# grid search instead.
cv = RepeatedStratifiedKFold(n_splits=5, random_state=42)


In [None]:
# Hyper-parameter grid for the search; single-element lists pin a setting to
# the same value for every candidate.
xgb_param_grid = dict(
    n_estimators=range(50, 400, 50),
    objective=['binary:logistic'],
    booster=['dart', 'gbtree'],
    eta=[0.05, 0.1],
    verbosity=[2],
    tree_method=['hist'],
)

In [None]:
# Base XGBoost classifier handed to the grid search, which supplies the
# remaining hyper-parameters from xgb_param_grid.
model = XGBClassifier(eval_metric='error', use_label_encoder=False)

In [None]:
# Grid search over xgb_param_grid, scored by accuracy on the folds produced by
# cv; n_jobs=-1 asks for the candidates to be evaluated in parallel.
grid_search = GridSearchCV(model, xgb_param_grid, scoring="accuracy", n_jobs=-1, cv=cv)

In [None]:
# Run the search on the transformed training data and keep the value returned
# by fit() for the reporting cells below.
grid_result = grid_search.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score and the parameter combination that achieved it.
best_score = grid_result.best_score_
best_params = grid_result.best_params_
print(f"Best: {best_score:f} using {best_params}")

In [None]:
# Report the mean and standard deviation of the CV score for every candidate.
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:f} ({stdev:f}) with: {param!r}")

In [None]:
# plot
# cv_results_ holds one score per parameter combination, not one per
# n_estimators value, so the scores cannot be drawn against the n_estimators
# grid directly. Draw one error-bar curve per (booster, eta) combination.
cv_table = pd.DataFrame(grid_result.cv_results_)
for (booster, eta), curve in cv_table.groupby(['param_booster', 'param_eta']):
    curve = curve.sort_values('param_n_estimators')
    plt.errorbar(curve['param_n_estimators'].astype(int), curve['mean_test_score'],
                 yerr=curve['std_test_score'], label=f'{booster}, eta={eta}')
# The search is scored with accuracy, so label the axis accordingly.
plt.title("XGBoost n_estimators vs Accuracy")
plt.xlabel('n_estimators')
plt.ylabel('Accuracy')
plt.legend()
plt.savefig('n_estimators.png')
plt.show()