# Data preparation

## Library import

In [1]:
import random
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from optbinning import BinningProcess, OptimalBinning
import statsmodels.api as sm

### Data import and overview

In [2]:
# Load the raw loan-level dataset from a comma-separated file.
oneypd = pd.read_csv(
    "data.csv",
    sep=",",
)

In [3]:
# Preview the first rows to sanity-check the load.
oneypd.head()

Unnamed: 0,id,vintage_year,monthly_installment,loan_balance,bureau_score,num_bankrupt_iva,time_since_bankrupt,num_ccj,time_since_ccj,ccj_amount,...,months_since_2mia,avg_mia_6m,max_arrears_bal_6m,max_mia_6m,avg_bal_6m,avg_bureau_score_6m,cc_util,annual_income,emp_length,months_since_recent_cc_delinq
0,6670001,2005,746.7,131304.44,541.0,0.0,0.0,0.0,0.0,0.0,...,,0.0,-42.0,0.0,132080.0,542.0,0.4578,76749,3,11
1,9131199,2006,887.4,115486.51,441.0,0.0,0.0,0.0,0.0,0.0,...,,0.0,0.0,0.0,116972.0,494.0,0.6299,78451,10,7
2,4963167,2004,1008.5,128381.73,282.0,0.0,0.0,1.0,36.0,459.0,...,0.0,0.0,1198.0,2.0,128500.0,290.0,0.6331,31038,3,6
3,3918582,2005,458.23,35482.96,461.0,0.0,0.0,0.0,0.0,0.0,...,,0.0,-114.0,0.0,36610.0,460.0,0.499,56663,8,6
4,5949777,2006,431.2,77086.31,466.0,0.0,0.0,0.0,0.0,0.0,...,,0.0,0.0,0.0,77518.0,468.0,0.9568,77014,10,3


In [4]:
# Column dtypes and non-null counts; shows which fields contain missing values.
oneypd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25906 entries, 0 to 25905
Data columns (total 44 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   id                             25906 non-null  int64  
 1   vintage_year                   25906 non-null  int64  
 2   monthly_installment            25906 non-null  float64
 3   loan_balance                   25906 non-null  float64
 4   bureau_score                   25473 non-null  float64
 5   num_bankrupt_iva               25473 non-null  float64
 6   time_since_bankrupt            25473 non-null  float64
 7   num_ccj                        25473 non-null  float64
 8   time_since_ccj                 25473 non-null  float64
 9   ccj_amount                     25473 non-null  float64
 10  num_bankrupt                   25473 non-null  float64
 11  num_iva                        25473 non-null  float64
 12  min_months_since_bankrupt      25473 non-null 

### Round arrears count fields

In [5]:
# Round both arrears fields to 4 decimal places.
for arrears_col in ("max_arrears_12m", "arrears_months"):
    oneypd[arrears_col] = oneypd[arrears_col].round(4)

## Default flag

In [6]:
# Target coding: 0 = default, 1 = non-default.
# A loan is treated as defaulted when any of the three event flags equals 1.
any_default_trigger = (
    (oneypd['arrears_event'] == 1)
    | (oneypd['term_expiry_event'] == 1)
    | (oneypd['bankrupt_event'] == 1)
)
oneypd['default_event'] = (~any_default_trigger).astype(int)

In [7]:
# Inspect the first ten target values to confirm the 0/1 coding.
oneypd["default_event"].head(10)

0    1
1    1
2    0
3    1
4    1
5    0
6    1
7    1
8    0
9    1
Name: default_event, dtype: int32

## Dataset split into train and test samples

In [8]:
# Fixed random seed so that the train/test split is reproducible
seed = 123

In [9]:
# Separate the dependent variable (y) from the independent variables (X)
y = oneypd['default_event']
X = oneypd.drop(columns='default_event')

In [10]:
# 70/30 split, stratified on the target so both samples keep the same class mix.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=seed,
    stratify=y,
)

# Univariate analysis

## IV assessment

In [11]:
# Numerical candidate variables for the univariate (IV) screening
num_list = [
    "max_arrears_12m",
    "arrears_months",
    "max_arrears_bal_6m",
    "cc_util",
    "bureau_score",
    "annual_income",
    "avg_bureau_score_6m",
    "emp_length",
    "remaining_mat",
    "num_ccj",
    "time_since_ccj",
    "months_since_recent_cc_delinq",
    "loan_term",
    "ltv",
    "avg_bal_6m",
    "loan_balance",
    "monthly_installment",
]

In [12]:
# Information Value (IV) of each numerical candidate, read from the "Totals"
# row of its optimal-binning table.
# NOTE(review): IV is computed on the full dataset (train + test). Consider
# restricting the screening to the training sample so the test set stays unseen.
iv_target = oneypd["default_event"].values  # loop-invariant, so read it once

iv_list = []
for val in num_list:
    # Dedicated names are used instead of `x` / `y` so that the notebook-level
    # target `y` (the Series used for the train/test split) is not silently
    # replaced by a NumPy array when this cell runs.
    optb = OptimalBinning(name=val, dtype="numerical", solver="cp")
    optb.fit(oneypd[val].values, iv_target)
    binning_table = optb.binning_table
    iv_list.append(binning_table.build().loc["Totals", "IV"])

In [13]:
# Rank the candidate variables by Information Value, strongest first
iv_dict = {"Variable": num_list, "IV": iv_list}
iv_df = pd.DataFrame(iv_dict).sort_values(by="IV", ascending=False)
iv_df

Unnamed: 0,Variable,IV
3,cc_util,2.290492
0,max_arrears_12m,1.066055
1,arrears_months,0.997844
2,max_arrears_bal_6m,0.843924
4,bureau_score,0.575394
6,avg_bureau_score_6m,0.572484
5,annual_income,0.561323
11,months_since_recent_cc_delinq,0.530156
7,emp_length,0.227386
8,remaining_mat,0.199165


## Binning WOE

In [14]:
# Shortlist for the model:
#   - variables with low IV are discarded
#   - variables with characteristics similar to another variable are discarded
num_list_final = [
    "max_arrears_12m",
    "cc_util",
    "bureau_score",
    "annual_income",
    "emp_length",
    "num_ccj",
    "months_since_recent_cc_delinq",
]

In [15]:
# Binning process restricted to the shortlisted variables
binning_process = BinningProcess(variable_names=num_list_final)

In [16]:
# Keep only the shortlisted variables in both samples
x_final = X_train.loc[:, num_list_final]
x_final_test = X_test.loc[:, num_list_final]

In [18]:
# Replace the raw values with their Weight of Evidence (WOE).
# The bins and WOE values are learned from the training sample only.
x_woe = binning_process.fit_transform(x_final, y_train)
# The test sample is only transformed with the bins fitted on the training data.
# Calling fit_transform here would re-fit the binning on the test target
# (target leakage) and produce WOE values that are inconsistent with the ones
# the model is trained on.
x_woe_test = binning_process.transform(x_final_test, metric="woe")

In [19]:
# Give the WOE frames the same row labels as their target series
x_woe.index = y_train.index
x_woe_test.index = y_test.index

In [20]:
# Pairwise correlations between the WOE-transformed training variables.
x_woe.corr()

Unnamed: 0,max_arrears_12m,cc_util,bureau_score,annual_income,emp_length,num_ccj,months_since_recent_cc_delinq
max_arrears_12m,1.0,0.096982,0.36132,0.059841,0.028309,0.245784,0.034351
cc_util,0.096982,1.0,0.0528,0.056827,0.028782,0.032979,0.424944
bureau_score,0.36132,0.0528,1.0,0.020279,0.01291,0.351756,0.017206
annual_income,0.059841,0.056827,0.020279,1.0,0.576568,0.034918,0.031467
emp_length,0.028309,0.028782,0.01291,0.576568,1.0,0.027246,0.011585
num_ccj,0.245784,0.032979,0.351756,0.034918,0.027246,1.0,0.012722
months_since_recent_cc_delinq,0.034351,0.424944,0.017206,0.031467,0.011585,0.012722,1.0


# Stepwise regression

In [21]:
def forward_regression(X, y,
                       threshold_in,
                       verbose=False):
    """
    Forward stepwise variable selection for a binomial GLM.

    Starting from an empty selection, every round fits one model per
    not-yet-selected column (selected columns + that candidate + constant)
    and adds the candidate with the smallest p-value, provided that p-value
    is below ``threshold_in``. The search stops when no candidate qualifies.

    args:
    ---------
    X : dataframe of independent variables (candidate predictors)
    y : series of the dependent (binary target) variable
    threshold_in : a variable is added only if its p-value is below this value
    verbose : if True, print each variable as it is added

    returns:
    ---------
    list of the selected column names, in the order they were added
    """
    included = []
    while True:
        # Follow X's column order (not an unordered set difference) so that
        # ties between equal p-values are broken the same way on every run.
        excluded = [column for column in X.columns if column not in included]
        # Explicit dtype: an index-only Series without a dtype falls back to
        # object and makes pandas emit a warning.
        new_pval = pd.Series(index=excluded, dtype=float)

        for new_column in excluded:
            glm_binom = sm.GLM(y, sm.add_constant(X[included + [new_column]]),
                               family=sm.families.Binomial())
            model = glm_binom.fit()
            new_pval[new_column] = model.pvalues[new_column]

        best_pval = new_pval.min()

        # When no candidates are left best_pval is NaN; the comparison is then
        # False and the search stops as well.
        if not best_pval < threshold_in:
            break

        best_feature = new_pval.idxmin()
        included.append(best_feature)
        if verbose:
            print(
                'Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

    return included

In [22]:
# Forward stepwise selection on the training WOE features (p-value threshold 0.05)
var_list = forward_regression(X=x_woe, y=y_train, threshold_in=0.05)

  app.launch_new_instance()


In [23]:
# Fit the binomial GLM on the training WOE features listed in var_list
train_design = sm.add_constant(pd.DataFrame(x_woe[var_list]))
glm_binom = sm.GLM(y_train, train_design, family=sm.families.Binomial()).fit()

In [24]:
# Summary of the fitted model: fit statistics and per-variable coefficient estimates
glm_binom.summary()

0,1,2,3
Dep. Variable:,default_event,No. Observations:,18134.0
Model:,GLM,Df Residuals:,18128.0
Model Family:,Binomial,Df Model:,5.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-2165.1
Date:,"Sun, 12 Jul 2020",Deviance:,4330.2
Time:,01:41:06,Pearson chi2:,16100.0
No. Iterations:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,2.9015,0.052,55.313,0.000,2.799,3.004
max_arrears_12m,-0.8089,0.039,-20.484,0.000,-0.886,-0.731
cc_util,-0.9840,0.037,-26.671,0.000,-1.056,-0.912
annual_income,-1.0598,0.050,-21.005,0.000,-1.159,-0.961
bureau_score,-0.5751,0.058,-9.934,0.000,-0.689,-0.462
months_since_recent_cc_delinq,-0.1906,0.082,-2.327,0.020,-0.351,-0.030


# GLM Calibration

## Score Normalisation

In [25]:
# Score scale: the anchor odds map to `offset` points and the odds double every
# `pdo` points (e.g. 660 points at 72:1 odds, doubling every 40 points).
def scaled_score(logit, odds, offset=500, pdo=20):
    """
    Map a probability onto a points scale.

    args:
    ---------
    logit : probability (scalar or Series), despite the name; the score is
            built from log((1 - logit) / logit), so 0 or 1 gives an infinite value
    odds : odds that should score exactly `offset` points
    offset : score assigned at the anchor `odds`
    pdo : number of points that doubles the odds

    returns the score rounded to zero decimals
    """
    factor = pdo / np.log(2)
    intercept = offset - factor * np.log(odds)
    log_odds = np.log((1 - logit) / logit)

    return round(intercept + factor * log_odds)

In [26]:
# Store 1 minus the model's predicted value in the "logit" column of both samples.
for woe_frame in (x_woe, x_woe_test):
    design = sm.add_constant(pd.DataFrame(woe_frame[var_list]))
    woe_frame["logit"] = 1 - glm_binom.predict(design)

In [27]:
# Scale both samples identically: 660 points at 72:1 odds, 40 points to double the odds.
score_scale = dict(odds=72, offset=660, pdo=40)
x_woe["score"] = scaled_score(x_woe["logit"], **score_scale)
x_woe_test["score"] = scaled_score(x_woe_test["logit"], **score_scale)