In [1]:
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns

# For tree-based models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# For logistic regression and VIF
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor

# For model metrics and misc
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score, f1_score, log_loss, roc_curve, auc, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.preprocessing import StandardScaler

# For information value
from xverse.transformer import WOE


In [2]:
# Helper functions

def ks(target=None, prob=None, return_ks_table=False):
    ''' 
        Function to compute KS statistic
    '''
    data = pd.DataFrame({'target':target,'prob':prob})
    data['target0'] = 1 - data['target']
    data['bucket'] = pd.qcut(data['prob'], 10)
    grouped = data.groupby('bucket', as_index = False)
    kstable = pd.DataFrame()
    kstable['min_prob'] = grouped.min()['prob']
    kstable['max_prob'] = grouped.max()['prob']
    kstable['events']   = grouped.sum()['target']
    kstable['nonevents'] = grouped.sum()['target0']
    kstable = kstable.sort_values(by="min_prob", ascending=False).reset_index(drop = True)
    kstable['event_rate'] = (kstable.events / data['target'].sum()).apply('{0:.2%}'.format)
    kstable['nonevent_rate'] = (kstable.nonevents / data['target0'].sum()).apply('{0:.2%}'.format)
    kstable['cum_eventrate']=(kstable.events / data['target'].sum()).cumsum()
    kstable['cum_noneventrate']=(kstable.nonevents / data['target0'].sum()).cumsum()
    kstable['KS'] = np.round(kstable['cum_eventrate']-kstable['cum_noneventrate'], 3) * 100

    #Formating
    kstable['cum_eventrate']= kstable['cum_eventrate'].apply('{0:.2%}'.format)
    kstable['cum_noneventrate']= kstable['cum_noneventrate'].apply('{0:.2%}'.format)
    kstable.index = range(1,11)
    kstable.index.rename('Decile', inplace=True)
    pd.set_option('display.max_columns', 9)
    
    #Display KS
    from colorama import Fore
    print("KS is " + str(max(kstable['KS']))+"%"+ " at decile " + str((kstable.index[kstable['KS']==max(kstable['KS'])][0])))
    if return_ks_table:
        return(kstable)
    
def plot_roc_curve(false_positive_rate, true_positive_rate, label="ROC Curve"):
    """
        Function to plot an ROC curve.
    """
    fig = plt.figure(figsize=(5,5))
    ax = fig.add_subplot(1,1,1)
    plt.plot(false_positive_rate, true_positive_rate, linewidth=1)
    plt.plot([0,1],[0,1], "r--") 
    plt.axis([0,1,0,1])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive rate") 
    
    title_obj = ax.set_title(label)
    plt.setp(title_obj, color='w')                     # Individual title colours.
    ax.xaxis.label.set_color('black')  
    ax.yaxis.label.set_color('black')          
    ax.tick_params(axis='x', colors='black')       #set the color of xticks
    ax.tick_params(axis='y', colors='black')       #set the color of yticks    
    ax.xaxis.set_major_formatter(mtick.FormatStrFormatter('%.2f'))
    ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.2f'))
    plt.grid(b=True, which='major', color='gray', linestyle='--')
    fig.tight_layout()  # Improves appearance a bit.
    plt.show()


In [3]:
# Read data

train = pd.read_csv('cs-training.csv')
kaggle_test = pd.read_csv('cs-test.csv')

print(train.shape)
print(kaggle_test.shape)

original_feature_cols = train.columns[2:]

(150000, 12)
(101503, 12)


### Exploratory Data Analysis and Data Treatment

In [4]:
# Univariate analysis (DIDQ report)

report_dict = {
    'column': [],
    'missing': [],
    'zero': [],
    'mean': [],
    'median': [],
    'stdev': [],
    'min': [],
    'max': [],
    'pctile_5th': [],
    'pctile_95th': []}

for col in train.columns[1:]:
    report_dict['column'].append(col)
    report_dict['missing'].append(train[col].isna().sum())
    report_dict['zero'].append((train[col]==0).sum())
    report_dict['mean'].append(train[col].mean())
    report_dict['median'].append(train[col].median())
    report_dict['stdev'].append(train[col].std())
    report_dict['min'].append(train[col].min())
    report_dict['max'].append(train[col].max())
    report_dict['pctile_5th'].append(train[col].quantile(0.05))
    report_dict['pctile_95th'].append(train[col].quantile(0.95))
    
pd.DataFrame(report_dict).to_csv('data_exploration_report.csv')
pd.DataFrame(report_dict)

Unnamed: 0,column,missing,zero,mean,median,stdev,min,max,pctile_5th,pctile_95th
0,SeriousDlqin2yrs,0,139974,0.06684,0.0,0.249746,0.0,1.0,0.0,1.0
1,RevolvingUtilizationOfUnsecuredLines,0,10878,6.048438,0.154181,249.755371,0.0,50708.0,0.0,1.0
2,age,0,1,52.295207,52.0,14.771866,0.0,109.0,29.0,78.0
3,NumberOfTime30-59DaysPastDueNotWorse,0,126018,0.421033,0.0,4.192781,0.0,98.0,0.0,2.0
4,DebtRatio,0,4113,353.005076,0.366508,2037.818523,0.0,329664.0,0.004329,2449.0
5,MonthlyIncome,29731,1634,6670.221237,5400.0,14384.674215,0.0,3008750.0,1300.0,14587.6
6,NumberOfOpenCreditLinesAndLoans,0,1888,8.45276,8.0,5.145951,0.0,58.0,2.0,18.0
7,NumberOfTimes90DaysLate,0,141662,0.265973,0.0,4.169304,0.0,98.0,0.0,1.0
8,NumberRealEstateLoansOrLines,0,56188,1.01824,1.0,1.129771,0.0,54.0,0.0,3.0
9,NumberOfTime60-89DaysPastDueNotWorse,0,142396,0.240387,0.0,4.155179,0.0,98.0,0.0,1.0


In [5]:
# Derived variables

train['IncomePerPerson'] = train.MonthlyIncome/(train.NumberOfDependents+1)

In [6]:
# Bi-variate analysis

def bivariate_charts(df, variables, n_rows, n_cols):
    df['SeriousDlqin2yrs'] = df.SeriousDlqin2yrs.astype('str')
    fig=plt.figure(figsize=(40, 20))
    for i, var_name in enumerate(variables):
        ax=fig.add_subplot(n_rows,n_cols,i+1)
        df.boxplot(column=var_name, by='SeriousDlqin2yrs', ax=ax)
        title_obj = ax.set_title(var_name)
        plt.setp(title_obj, color='black')                     # Individual title colours.

        ax.xaxis.label.set_color('white')  
        ax.yaxis.label.set_color('black')          
        ax.tick_params(axis='x', colors='black')       #set the color of xticks
        ax.tick_params(axis='y', colors='black')       #set the color of yticks
        
    #plt.rcParams.update({'font.size': 20})
    plt.show()


bivariate_charts(train, train.columns[2:], 3, 4)

  plt.show()


In [7]:
# Information value

train['SeriousDlqin2yrs'] = train.SeriousDlqin2yrs.astype(int)
clf = WOE()
clf.fit(train[train.columns[2:]], train['SeriousDlqin2yrs'])
clf.iv_df

Unnamed: 0,Variable_Name,Information_Value
9,RevolvingUtilizationOfUnsecuredLines,0.935429
7,NumberOfTimes90DaysLate,0.484202
5,NumberOfTime30-59DaysPastDueNotWorse,0.471831
6,NumberOfTime60-89DaysPastDueNotWorse,0.264824
10,age,0.219843
1,IncomePerPerson,0.086524
2,MonthlyIncome,0.066103
3,NumberOfDependents,0.025999
4,NumberOfOpenCreditLinesAndLoans,0.0233
0,DebtRatio,0.01236


In [8]:
# Check for target variable distribution

print(train.SeriousDlqin2yrs.value_counts())
print('Blanks: '+str(train.SeriousDlqin2yrs.isnull().sum()))

0    139974
1     10026
Name: SeriousDlqin2yrs, dtype: int64
Blanks: 0


In [9]:
# Data treatment 1: Remove record with zero age (one record)

train = train[train.age>0]

In [10]:
# Data treatment 2: Imputing missing values in monthly income using

temp = train[(train.age>0) & (train.age<=80) & (train.MonthlyIncome.notnull())][['age','MonthlyIncome']]
temp2 = train[(train.age>0) & (train.age<=80)][['age','MonthlyIncome']].groupby('age').mean().reset_index()

fit1 = np.polyfit(temp.age, temp.MonthlyIncome, 2, rcond=None, full=False, w=None, cov=False)
print(fit1)

fig = plt.figure()
plt
ax = plt.axes()
ax.scatter(x=temp2.age, y=temp2.MonthlyIncome)
ax.plot(np.arange(20,81), (np.arange(20,81)**2)*fit1[0] + np.arange(20,81)*fit1[1] + fit1[2], color='red')

ax.set_xlabel('Age')
ax.set_ylabel('Average Monthly Income')

plt.show()

train['MonthlyIncome'] = np.where(train.MonthlyIncome.isnull(),
                                 (train.age**2)*fit1[0] + train.age*fit1[1] + fit1[2],
                                 train.MonthlyIncome)

kaggle_test['MonthlyIncome'] = np.where(kaggle_test.MonthlyIncome.isnull(),
                                 (kaggle_test.age**2)*fit1[0] + kaggle_test.age*fit1[1] + fit1[2],
                                 kaggle_test.MonthlyIncome)

print(train.MonthlyIncome.isnull().sum())

[-5.58202612e+00  6.17521223e+02 -9.22116325e+03]
0


  plt.show()


In [11]:
# Data treatment 3: Imputing missing values in NumberOfDependents

train['NumberOfDependents'] = train.NumberOfDependents.fillna(0)
kaggle_test['NumberOfDependents'] = kaggle_test.NumberOfDependents.fillna(0)

In [12]:
# Checking for variable correlations and multicollinearity

# Check for variable correlations

corr = train[train.columns[1:]].corr()
fig = plt.figure(figsize=(10,10))    
ax = sns.heatmap(corr, annot=True, fmt='.2f', cmap = "Blues",xticklabels=True, yticklabels=True, cbar=True)
ax.xaxis.label.set_color('black')  
ax.yaxis.label.set_color('black')          
ax.tick_params(axis='x', colors='black')    
ax.tick_params(axis='y', colors='black')    
plt.show()

# Checking for VIF 

temp = train[original_feature_cols].copy(deep=True)
vif_data = pd.DataFrame()
vif_data["feature"] = temp.columns
vif_data["VIF"] = [variance_inflation_factor(temp.values, i) for i in range(len(temp.columns))]
print('VIF with original variables:')
print(vif_data)
print('\n')

# Checking if this gets solved by combining the 'NumberOfTime...' variables removes multi-collinearity
temp = train[original_feature_cols].copy(deep=True)
temp['NumberOfTimesLate'] = temp['NumberOfTime30-59DaysPastDueNotWorse'] + temp['NumberOfTime60-89DaysPastDueNotWorse'] + temp['NumberOfTimes90DaysLate']
temp = temp.drop(['NumberOfTime30-59DaysPastDueNotWorse','NumberOfTime60-89DaysPastDueNotWorse','NumberOfTimes90DaysLate'], axis=1)
vif_data = pd.DataFrame()
vif_data["feature"] = temp.columns
vif_data["VIF"] = [variance_inflation_factor(temp.values, i) for i in range(len(temp.columns))]
print('VIF after combining variables:')
print(vif_data)

  plt.show()


VIF with original variables:
                                feature        VIF
0  RevolvingUtilizationOfUnsecuredLines   1.000778
1                                   age   3.646245
2  NumberOfTime30-59DaysPastDueNotWorse  41.173681
3                             DebtRatio   1.048820
4                         MonthlyIncome   1.286248
5       NumberOfOpenCreditLinesAndLoans   4.570557
6               NumberOfTimes90DaysLate  73.196361
7          NumberRealEstateLoansOrLines   2.305697
8  NumberOfTime60-89DaysPastDueNotWorse  91.181585
9                    NumberOfDependents   1.402707


VIF after combining variables:
                                feature       VIF
0  RevolvingUtilizationOfUnsecuredLines  1.000778
1                                   age  3.631896
2                             DebtRatio  1.048807
3                         MonthlyIncome  1.286240
4       NumberOfOpenCreditLinesAndLoans  4.505016
5          NumberRealEstateLoansOrLines  2.304628
6                    Number

In [13]:
# Derived variables

train['IncomePerPerson'] = train.MonthlyIncome/(train.NumberOfDependents+1)
kaggle_test['IncomePerPerson'] = kaggle_test.MonthlyIncome/(kaggle_test.NumberOfDependents+1)

In [14]:
# Sample split - Train: 60%, Validation: 20%, Test: 20%

X_train, X_test, y_train, y_test = train_test_split(train[train.columns[2:]],
                                                    train['SeriousDlqin2yrs'],
                                                    random_state=2023, test_size = 0.2)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                    random_state=2023, test_size = 0.25)


### Logistic Regression V1 model

In [15]:
## Logistic regression (using statsmodels)

logit_model=sm.Logit(y_train,add_constant(X_train, prepend=False))
result=logit_model.fit(disp=0)
print(pd.DataFrame({'Co-efficient':result.params,'p-value':result.pvalues}))
print('\n')

preds = result.predict(add_constant(X_train, prepend=False))
fpr, tpr, threshold = metrics.roc_curve(y_train, preds)
roc_auc = metrics.auc(fpr, tpr)
print('Train AUC Score : ', round(roc_auc,7))

preds = result.predict(add_constant(X_val, prepend=False))
fpr, tpr, threshold = metrics.roc_curve(y_val, preds)
roc_auc = metrics.auc(fpr, tpr)
print('Validation AUC Score : ', round(roc_auc,7))

                                      Co-efficient        p-value
RevolvingUtilizationOfUnsecuredLines     -0.000012   8.582054e-01
age                                      -0.027803  5.501173e-146
NumberOfTime30-59DaysPastDueNotWorse      0.499849  4.692096e-260
DebtRatio                                -0.000026   5.368272e-02
MonthlyIncome                            -0.000060   1.860127e-14
NumberOfOpenCreditLinesAndLoans          -0.010753   1.076705e-03
NumberOfTimes90DaysLate                   0.484113  2.995688e-135
NumberRealEstateLoansOrLines              0.078953   2.909338e-09
NumberOfTime60-89DaysPastDueNotWorse     -0.951988   0.000000e+00
NumberOfDependents                        0.143943   9.706391e-15
IncomePerPerson                           0.000034   6.722531e-04
const                                    -1.379737  4.277711e-137


Train AUC Score :  0.6972364
Validation AUC Score :  0.6972798


In [16]:
## Logistic regression + removing outliers

drop_cols_3 = []#['IncomePerPerson']

X_train_3 = X_train.drop(drop_cols_3, axis=1).copy(deep=True)
X_val_3 = X_val.drop(drop_cols_3, axis=1).copy(deep=True)
X_test_3 = X_test.drop(drop_cols_3, axis=1).copy(deep=True)
kaggle_test_3 = kaggle_test.drop(drop_cols_3, axis=1).copy(deep=True)

exclude_cols_for_outlier_treatment = ['NumberOfTime30-59DaysPastDueNotWorse',
                                     'NumberOfTime60-89DaysPastDueNotWorse',
                                     'NumberOfTimes90DaysLate',
                                     'NumberOfTimesLate',
                                     'NumberOfDependents']

print('Upper limits:')
for col in X_train_3.columns:
    if col not in exclude_cols_for_outlier_treatment:
        q1 = X_train_3[col].quantile(0.25)
        q3 = X_train_3[col].quantile(0.75)
        upper_lim = q3 + 1.5 * (q3 - q1)
        if upper_lim > 0:
            print(col+' '+str(upper_lim))
            X_train_3[col] = np.where(X_train_3[col] > upper_lim, upper_lim, X_train_3[col])
            X_test_3[col] = np.where(X_test_3[col] > upper_lim, upper_lim, X_test_3[col])
            X_val_3[col] = np.where(X_val_3[col] > upper_lim, upper_lim, X_val_3[col])
            kaggle_test_3[col] = np.where(kaggle_test_3[col] > upper_lim, upper_lim, kaggle_test_3[col])
        else:
            print(col+' '+'NO TREATMENT: LIMIT = 0')
    else:
        print(col+' '+'NO TREATMENT: EXCLUDED COLUMN')
print('\n')

#['NumberOfOpenCreditLinesAndLoans', 'RevolvingUtilizationOfUnsecuredLines', 'IncomePerPerson']

import statsmodels.api as sm
logit_model=sm.Logit(y_train,add_constant(X_train_3, prepend=False))
result=logit_model.fit(disp=0, )#method='lbfgs')
print(pd.DataFrame({'Co-efficient':result.params,'p-value':result.pvalues}))
print('\n')

preds = result.predict(add_constant(X_train_3, prepend=False))
fpr, tpr, threshold = metrics.roc_curve(y_train, preds)
roc_auc = metrics.auc(fpr, tpr)
print('Train AUC Score : ', round(roc_auc,7))

preds = result.predict(add_constant(X_val_3, prepend=False))
fpr, tpr, thresh = metrics.roc_curve(y_val, preds)
roc_auc = metrics.auc(fpr, tpr)
#plot_roc_curve(fpr, tpr)
print('Validation AUC Score : ', round(roc_auc,7))

#gmeans = np.sqrt(tpr * (1-fpr))
#ix = np.argmax(gmeans)
#print('Best Threshold=%f, G-Mean=%.3f' % (thresh[ix], gmeans[ix]))
#print(confusion_matrix(y_test, np.where(preds >= thresh[ix], 1, 0)))

Upper limits:
RevolvingUtilizationOfUnsecuredLines 1.35518645525
age 96.0
NumberOfTime30-59DaysPastDueNotWorse NO TREATMENT: EXCLUDED COLUMN
DebtRatio 1.8901548084999997
MonthlyIncome 13956.829493889112
NumberOfOpenCreditLinesAndLoans 20.0
NumberOfTimes90DaysLate NO TREATMENT: EXCLUDED COLUMN
NumberRealEstateLoansOrLines 5.0
NumberOfTime60-89DaysPastDueNotWorse NO TREATMENT: EXCLUDED COLUMN
NumberOfDependents NO TREATMENT: EXCLUDED COLUMN
IncomePerPerson 13425.0


                                      Co-efficient        p-value
RevolvingUtilizationOfUnsecuredLines      2.516859   0.000000e+00
age                                      -0.014907   4.395801e-38
NumberOfTime30-59DaysPastDueNotWorse      0.311453  5.241580e-100
DebtRatio                                 0.076218   1.058134e-03
MonthlyIncome                            -0.000075   1.949807e-13
NumberOfOpenCreditLinesAndLoans           0.036362   4.519826e-24
NumberOfTimes90DaysLate                   0.304673   1.541656e-62
Num

In [17]:
## Logistic regression + removing outliers + treating multi-collinearity

drop_cols_2 = ['MonthlyIncome','age']

X_train_2 = X_train_3.copy(deep=True)
X_val_2 = X_val_3.copy(deep=True)
X_test_2 = X_test_3.copy(deep=True)
kaggle_test_2 = kaggle_test_3.copy(deep=True)

# Check VIF
vif_data = pd.DataFrame()
vif_data["feature"] = X_train_2.columns
vif_data["VIF"] = [variance_inflation_factor(X_train_2.values, i) for i in range(len(X_train_2.columns))]
print('VIF before:')
print(vif_data)
print('\n')

X_train_2['NumberOfTimesLate'] = X_train_2['NumberOfTime30-59DaysPastDueNotWorse'] + X_train_2['NumberOfTime60-89DaysPastDueNotWorse'] + X_train_2['NumberOfTimes90DaysLate']
X_train_2 = X_train_2.drop(['NumberOfTime30-59DaysPastDueNotWorse','NumberOfTime60-89DaysPastDueNotWorse','NumberOfTimes90DaysLate'], axis=1)

X_val_2['NumberOfTimesLate'] = X_val_2['NumberOfTime30-59DaysPastDueNotWorse'] + X_val_2['NumberOfTime60-89DaysPastDueNotWorse'] + X_val_2['NumberOfTimes90DaysLate']
X_val_2 = X_val_2.drop(['NumberOfTime30-59DaysPastDueNotWorse','NumberOfTime60-89DaysPastDueNotWorse','NumberOfTimes90DaysLate'], axis=1)

X_test_2['NumberOfTimesLate'] = X_test_2['NumberOfTime30-59DaysPastDueNotWorse'] + X_test_2['NumberOfTime60-89DaysPastDueNotWorse'] + X_test_2['NumberOfTimes90DaysLate']
X_test_2 = X_test_2.drop(['NumberOfTime30-59DaysPastDueNotWorse','NumberOfTime60-89DaysPastDueNotWorse','NumberOfTimes90DaysLate'], axis=1)

kaggle_test_2['NumberOfTimesLate'] = kaggle_test_2['NumberOfTime30-59DaysPastDueNotWorse'] + kaggle_test_2['NumberOfTime60-89DaysPastDueNotWorse'] + kaggle_test_2['NumberOfTimes90DaysLate']
kaggle_test_2 = kaggle_test_2.drop(['NumberOfTime30-59DaysPastDueNotWorse','NumberOfTime60-89DaysPastDueNotWorse','NumberOfTimes90DaysLate'], axis=1)

X_train_2 = X_train_2.drop(drop_cols_2, axis=1)
X_val_2 = X_val_2.drop(drop_cols_2, axis=1)
X_test_2 = X_test_2.drop(drop_cols_2, axis=1)
kaggle_test_2 = kaggle_test_2.drop(drop_cols_2, axis=1)

# Check VIF
vif_data = pd.DataFrame()
vif_data["feature"] = X_train_2.columns
vif_data["VIF"] = [variance_inflation_factor(X_train_2.values, i) for i in range(len(X_train_2.columns))]
print('VIF after:')
print(vif_data)
print('\n')

import statsmodels.api as sm
logit_model=sm.Logit(y_train,add_constant(X_train_2, prepend=False))
result=logit_model.fit(disp=0)
print(pd.DataFrame({'Co-efficient':result.params,'p-value':result.pvalues}))
print('\n')

preds = result.predict(add_constant(X_train_2, prepend=False))
fpr, tpr, threshold = metrics.roc_curve(y_train, preds)
roc_auc = metrics.auc(fpr, tpr)
print('Train AUC Score : ', round(roc_auc,7))

preds = result.predict(add_constant(X_val_2, prepend=False))
fpr, tpr, threshold = metrics.roc_curve(y_val, preds)
roc_auc = metrics.auc(fpr, tpr)
print('Validation AUC Score : ', round(roc_auc,7))

VIF before:
                                 feature        VIF
0   RevolvingUtilizationOfUnsecuredLines   1.714830
1                                    age   6.660850
2   NumberOfTime30-59DaysPastDueNotWorse  43.437364
3                              DebtRatio   2.055377
4                          MonthlyIncome  25.663061
5        NumberOfOpenCreditLinesAndLoans   5.247781
6                NumberOfTimes90DaysLate  75.875611
7           NumberRealEstateLoansOrLines   2.805854
8   NumberOfTime60-89DaysPastDueNotWorse  96.955916
9                     NumberOfDependents   4.696991
10                       IncomePerPerson  21.305351


VIF after:
                                feature       VIF
0  RevolvingUtilizationOfUnsecuredLines  1.615764
1                             DebtRatio  1.819304
2       NumberOfOpenCreditLinesAndLoans  4.192455
3          NumberRealEstateLoansOrLines  2.681367
4                    NumberOfDependents  1.671308
5                       IncomePerPerson  3.104675
6

In [18]:
## Logistic regression + removing outliers + treating multi-collinearity + backward variable elimination

drop_cols_4 = ['NumberOfDependents']#['IncomePerPerson']#['RevolvingUtilizationOfUnsecuredLines']#['NumberOfOpenCreditLinesAndLoans', ]

X_train_4 = X_train_2.drop(drop_cols_4, axis=1).copy(deep=True)
X_val_4 = X_val_2.drop(drop_cols_4, axis=1).copy(deep=True)
X_test_4 = X_test_2.drop(drop_cols_4, axis=1).copy(deep=True)
kaggle_test_4 = kaggle_test_2.drop(drop_cols_4, axis=1).copy(deep=True)

import statsmodels.api as sm
logit_model=sm.Logit(y_train,add_constant(X_train_4, prepend=False))
result=logit_model.fit(disp=0)
print(pd.DataFrame({'Co-efficient':result.params,'p-value':result.pvalues}).sort_values('p-value'))
print('\n')

preds = result.predict(add_constant(X_train_4, prepend=False))
fpr, tpr, threshold = metrics.roc_curve(y_train, preds)
roc_auc = metrics.auc(fpr, tpr)
print('Train AUC Score : ', round(roc_auc,7))
print('Train Gini Score : ', round(roc_auc*2-1,7))
ks(y_train, preds)
print('\n')

preds = result.predict(add_constant(X_val_4, prepend=False))
fpr, tpr, threshold = metrics.roc_curve(y_val, preds)
roc_auc = metrics.auc(fpr, tpr)
print('Validation AUC Score : ', round(roc_auc,7))
print('Validation Gini Score : ', round(roc_auc*2-1,7))
ks(y_val, preds)
print('\n')


                                      Co-efficient       p-value
RevolvingUtilizationOfUnsecuredLines      2.808794  0.000000e+00
const                                    -4.181514  0.000000e+00
NumberOfTimesLate                         0.007984  9.820190e-43
IncomePerPerson                          -0.000065  2.317757e-34
NumberOfOpenCreditLinesAndLoans           0.035833  4.793978e-26
DebtRatio                                 0.084981  1.267117e-04
NumberRealEstateLoansOrLines              0.048352  2.492849e-03


Train AUC Score :  0.7920739
Train Gini Score :  0.5841478
KS is 46.7% at decile 3


Validation AUC Score :  0.7891834
Validation Gini Score :  0.5783668
KS is 45.7% at decile 3




In [19]:
# Logistic regression final model on local test sample

preds = result.predict(add_constant(X_test_4, prepend=False))
fpr, tpr, thresh = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)
plot_roc_curve(fpr, tpr)
print('Test AUC Score : ', round(roc_auc,7))
print('Test Gini Score : ', round(roc_auc*2-1,7))

#Confusion matrix
gmeans = np.sqrt(tpr * (1-fpr))
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresh[ix], gmeans[ix]))
print(confusion_matrix(y_test, np.where(preds >= thresh[ix], 1, 0)))

#KS statistic
kstable = ks(y_test, preds, True)
kstable

Test AUC Score :  0.788467
Test Gini Score :  0.5769339
Best Threshold=0.067760, G-Mean=0.730
[[21018  6936]
 [  594  1452]]
KS is 45.5% at decile 3


  plt.show()


Unnamed: 0_level_0,min_prob,max_prob,events,nonevents,event_rate,nonevent_rate,cum_eventrate,cum_noneventrate,KS
Decile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0.188697,0.72729,755,2245,36.90%,8.03%,36.90%,8.03%,28.9
2,0.112116,0.188673,449,2551,21.95%,9.13%,58.85%,17.16%,41.7
3,0.060783,0.112103,278,2722,13.59%,9.74%,72.43%,26.89%,45.5
4,0.038162,0.060782,168,2832,8.21%,10.13%,80.65%,37.03%,43.6
5,0.02795,0.038145,120,2880,5.87%,10.30%,86.51%,47.33%,39.2
6,0.02281,0.027945,73,2927,3.57%,10.47%,90.08%,57.80%,32.3
7,0.019454,0.022807,70,2930,3.42%,10.48%,93.50%,68.28%,25.2
8,0.017001,0.019453,50,2950,2.44%,10.55%,95.94%,78.83%,17.1
9,0.01465,0.017,44,2956,2.15%,10.57%,98.09%,89.41%,8.7
10,0.006832,0.014649,39,2961,1.91%,10.59%,100.00%,100.00%,0.0


In [20]:
# Logistic regression Kaggle submission

preds = result.predict(add_constant(kaggle_test_4[kaggle_test_4.columns[2:]], prepend=False))
out = pd.DataFrame({'Id':list(kaggle_test[kaggle_test.columns[0]]),'Probability':list(preds)})
out.to_csv('kaggle_submission_logistic_regression.csv', index=False)

### Decision-tree based models

In [21]:
# Decision tree

clf = DecisionTreeClassifier(class_weight='balanced', random_state=2022, max_depth=5)
clf.fit(X_train, y_train)

clf_scores_proba = clf.predict_proba(X_train)
clf_preds = clf_scores_proba[:,1]
fpr, tpr, thresh = roc_curve(y_train, clf_preds)
roc_auc = metrics.auc(fpr, tpr)
print('Train AUC Score : ', round(roc_auc,7))

clf_scores_proba = clf.predict_proba(X_val)
clf_preds = clf_scores_proba[:,1]
fpr, tpr, thresh = roc_curve(y_val, clf_preds)
roc_auc = metrics.auc(fpr, tpr)
print('Validation AUC Score : ', round(roc_auc,7))


Train AUC Score :  0.8490429
Validation AUC Score :  0.8430704


In [22]:
# Decision tree final model on local test sample

clf_scores_proba = clf.predict_proba(X_test)
clf_preds = clf_scores_proba[:,1]
fpr, tpr, thresh = roc_curve(y_test, clf_preds)
roc_auc = metrics.auc(fpr, tpr)
print('Test AUC Score : ', round(roc_auc,7))

pd.DataFrame({'Feature':X_train.columns, 'Feature importance': [round(i,3) for i in clf.feature_importances_]}).sort_values('Feature importance', ascending=False)

# Kaggle submission

clf_scores_proba = clf.predict_proba(kaggle_test[kaggle_test.columns[2:]])
clf_preds = clf_scores_proba[:,1]
out = pd.DataFrame({'Id':list(kaggle_test[kaggle_test.columns[0]]),'Probability':clf_preds})
out.to_csv('kaggle_submission_decision_tree.csv', index=False)


Test AUC Score :  0.8440602


In [23]:
# Random Forest

clf = RandomForestClassifier(random_state=2022, n_estimators=200, class_weight='balanced', max_depth=6, n_jobs=-1)
clf.fit(X_train, y_train)

clf_scores_proba = clf.predict_proba(X_train)
clf_preds = clf_scores_proba[:,1]
fpr, tpr, thresh = roc_curve(y_train, clf_preds)
roc_auc = metrics.auc(fpr, tpr)
print('Train AUC Score : ', round(roc_auc,7))

clf_scores_proba = clf.predict_proba(X_val)
clf_preds = clf_scores_proba[:,1]
fpr, tpr, thresh = roc_curve(y_val, clf_preds)
roc_auc = metrics.auc(fpr, tpr)
print('Validation AUC Score : ', round(roc_auc,7))



Train AUC Score :  0.8676474
Validation AUC Score :  0.857928


In [24]:
# Random forest final model on local test sample

clf_scores_proba = clf.predict_proba(X_test)
clf_preds = clf_scores_proba[:,1]
fpr, tpr, thresh = roc_curve(y_test, clf_preds)
roc_auc = metrics.auc(fpr, tpr)
print('Test AUC Score : ', round(roc_auc,7))

print(confusion_matrix(y_test, np.where(clf_preds >= 0.5, 1, 0)))

pd.DataFrame({'Feature':X_train.columns, 'Feature importance': [round(i,3) for i in clf.feature_importances_]}).sort_values('Feature importance', ascending=False)

# Kaggle submission

clf_scores_proba = clf.predict_proba(kaggle_test[kaggle_test.columns[2:]])
clf_preds = clf_scores_proba[:,1]
out = pd.DataFrame({'Id':list(kaggle_test[kaggle_test.columns[0]]),'Probability':clf_preds})
out.to_csv('kaggle_submission_random_forest.csv', index=False)


Test AUC Score :  0.8588611
[[21903  6051]
 [  465  1581]]


In [25]:
# GBM

clf = GradientBoostingClassifier(random_state=2022, n_estimators=100, learning_rate=0.05, max_depth=6)
clf.fit(X_train, y_train)

clf_scores_proba = clf.predict_proba(X_train)
clf_preds = clf_scores_proba[:,1]
fpr, tpr, thresh = roc_curve(y_train, clf_preds)
roc_auc = metrics.auc(fpr, tpr)
print('Train AUC Score : ', round(roc_auc,7))

clf_scores_proba = clf.predict_proba(X_val)
clf_preds = clf_scores_proba[:,1]
fpr, tpr, thresh = roc_curve(y_val, clf_preds)
roc_auc = metrics.auc(fpr, tpr)
print('Validation AUC Score : ', round(roc_auc,7))


Train AUC Score :  0.8842281
Validation AUC Score :  0.8618999


In [26]:
# GBM final model on local test sample

clf_scores_proba = clf.predict_proba(X_test)
clf_preds = clf_scores_proba[:,1]
fpr, tpr, thresh = roc_curve(y_test, clf_preds)
roc_auc = metrics.auc(fpr, tpr)
print('Test AUC Score : ', round(roc_auc,7))

pd.DataFrame({'Feature':X_train.columns, 'Feature importance': [round(i,3) for i in clf.feature_importances_]}).sort_values('Feature importance', ascending=False)

# Kaggle submission

clf_scores_proba = clf.predict_proba(kaggle_test[kaggle_test.columns[2:]])
clf_preds = clf_scores_proba[:,1]
out = pd.DataFrame({'Id':list(kaggle_test[kaggle_test.columns[0]]),'Probability':clf_preds})
out.to_csv('kaggle_submission_gbm.csv', index=False)


Test AUC Score :  0.863382


In [27]:
# XGBoost

clf = XGBClassifier(random_state=2022, n_estimators=150, learning_rate=0.05, max_depth=6, n_jobs=-1)
clf.fit(X_train, y_train)
clf_scores_proba = clf.predict_proba(X_train)
clf_preds = clf_scores_proba[:,1]
fpr, tpr, thresh = roc_curve(y_train, clf_preds)
roc_auc = metrics.auc(fpr, tpr)
print('Train AUC Score : ', round(roc_auc,7))
print('Train Gini Score : ', round(roc_auc*2-1,7))
kstable = ks(y_train, clf_preds, True)
print('\n')

clf_scores_proba = clf.predict_proba(X_val)
clf_preds = clf_scores_proba[:,1]
fpr, tpr, thresh = roc_curve(y_val, clf_preds)
roc_auc = metrics.auc(fpr, tpr)
print('Validation AUC Score : ', round(roc_auc,7))
print('Validation Gini Score : ', round(roc_auc*2-1,7))
kstable = ks(y_val, clf_preds, True)
print('\n')




Train AUC Score :  0.8920086
Train Gini Score :  0.7840172
KS is 61.7% at decile 2


Validation AUC Score :  0.8614525
Validation Gini Score :  0.722905
KS is 56.3% at decile 3




In [28]:
# XGBoost final model on local test sample

clf_scores_proba = clf.predict_proba(X_test)
clf_preds = clf_scores_proba[:,1]
fpr, tpr, thresh = roc_curve(y_test, clf_preds)
roc_auc = metrics.auc(fpr, tpr)
plot_roc_curve(fpr, tpr)
print('Test AUC Score : ', round(roc_auc,7))
print('Test Gini Score : ', round(roc_auc*2-1,7))
kstable = ks(y_test, clf_preds, True)
print('\n')

gmeans = np.sqrt(tpr * (1-fpr))
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresh[ix], gmeans[ix]))
print(confusion_matrix(y_test, np.where(clf_preds >= thresh[ix], 1, 0)))

pd.DataFrame({'Feature':X_train.columns, 'Feature importance': [round(i,3) for i in clf.feature_importances_]}).sort_values('Feature importance', ascending=False)

# Kaggle submission

clf_scores_proba = clf.predict_proba(kaggle_test[kaggle_test.columns[2:]])
clf_preds = clf_scores_proba[:,1]
out = pd.DataFrame({'Id':list(kaggle_test[kaggle_test.columns[0]]),'Probability':clf_preds})
out.to_csv('kaggle_submission_xgboost.csv', index=False)


  plt.show()


Test AUC Score :  0.8623707
Test Gini Score :  0.7247414
KS is 56.49999999999999% at decile 3


Best Threshold=0.068511, G-Mean=0.785
[[22425  5529]
 [  474  1572]]


### Logistic Regression V2 model

In [29]:
# Logistic regression V2 - Outlier treatment

train_logistic_v2 = train.copy(deep=True)
kaggle_test_logistic_v2 = kaggle_test.copy(deep=True)

# Clip outliers

exclude_cols_for_outlier_treatment = ['NumberOfTime30-59DaysPastDueNotWorse',
                                     'NumberOfTime60-89DaysPastDueNotWorse',
                                     'NumberOfTimes90DaysLate',
                                     'NumberOfDependents']

print('Upper limits:')
for col in train_logistic_v2.columns[2:]:
    if col not in exclude_cols_for_outlier_treatment:
        q1 = train_logistic_v2[col].quantile(0.25)
        q3 = train_logistic_v2[col].quantile(0.75)
        upper_lim = q3 + 1.5 * (q3 - q1)
        if upper_lim > 0:
            print(col+' '+str(upper_lim))
            train_logistic_v2[col] = np.where(train_logistic_v2[col] > upper_lim, upper_lim, train_logistic_v2[col])
            kaggle_test_logistic_v2[col] = np.where(kaggle_test_logistic_v2[col] > upper_lim, upper_lim, kaggle_test_logistic_v2[col])
        else:
            print(col+' '+'NO TREATMENT: LIMIT = 0')
    else:
        print(col+' '+'NO TREATMENT: EXCLUDED COLUMN')
print('\n')



Upper limits:
RevolvingUtilizationOfUnsecuredLines 1.35280900425
age 96.0
NumberOfTime30-59DaysPastDueNotWorse NO TREATMENT: EXCLUDED COLUMN
DebtRatio 1.9080320674999998
MonthlyIncome 13968.829493889112
NumberOfOpenCreditLinesAndLoans 20.0
NumberOfTimes90DaysLate NO TREATMENT: EXCLUDED COLUMN
NumberRealEstateLoansOrLines 5.0
NumberOfTime60-89DaysPastDueNotWorse NO TREATMENT: EXCLUDED COLUMN
NumberOfDependents NO TREATMENT: EXCLUDED COLUMN
IncomePerPerson 13465.0




In [30]:
# Derived variables

# Number of dependents capped at 4
train_logistic_v2['NumberOfDependents'] = np.where(train_logistic_v2.NumberOfDependents>4, 4, train_logistic_v2.NumberOfDependents)
kaggle_test_logistic_v2['NumberOfDependents'] = np.where(kaggle_test_logistic_v2.NumberOfDependents>4, 4, kaggle_test_logistic_v2.NumberOfDependents)

# Income per person = income/(1 + dependents)
train_logistic_v2['IncomePerPerson'] = train_logistic_v2.MonthlyIncome/(train_logistic_v2.NumberOfDependents+1)
kaggle_test_logistic_v2['IncomePerPerson'] = kaggle_test_logistic_v2.MonthlyIncome/(kaggle_test_logistic_v2.NumberOfDependents+1)

# Flag for more than 2 open lines or loans
train_logistic_v2['MoreThan2OpenLinesFlag'] = np.where(train_logistic_v2.NumberOfOpenCreditLinesAndLoans>2, 1, 0)
kaggle_test_logistic_v2['MoreThan2OpenLinesFlag'] = np.where(kaggle_test_logistic_v2.NumberOfOpenCreditLinesAndLoans>2, 1, 0)

# Flag for more than 4 real estate loans
train_logistic_v2['MoreThan4RealEstateLoansFlag'] = np.where(train_logistic_v2.NumberRealEstateLoansOrLines>4, 1, 0)
kaggle_test_logistic_v2['MoreThan4RealEstateLoansFlag'] = np.where(kaggle_test_logistic_v2.NumberRealEstateLoansOrLines>4, 1, 0)

# Monthly expenses = monthly income * debt ratio
train_logistic_v2['MonthlyExpenses'] = train_logistic_v2.DebtRatio * train_logistic_v2.MonthlyIncome
kaggle_test_logistic_v2['MonthlyExpenses'] = kaggle_test_logistic_v2.DebtRatio * kaggle_test_logistic_v2.MonthlyIncome

# Net income per person = (monthly income - monthly expenses)/(1 + dependents)
train_logistic_v2['NetIncomePerPerson'] = (train_logistic_v2.MonthlyIncome - train_logistic_v2.MonthlyExpenses) / (train_logistic_v2.NumberOfDependents + 1)
kaggle_test_logistic_v2['NetIncomePerPerson'] = (kaggle_test_logistic_v2.MonthlyIncome - kaggle_test_logistic_v2.MonthlyExpenses) / (kaggle_test_logistic_v2.NumberOfDependents + 1)

# Weighted days past due = 1 * num of times bwn 30 and 60 + 2 * num of times bwn 60 and 90 + 4 * num of times past 90
train_logistic_v2['WeightedNumTimesLate'] = train_logistic_v2['NumberOfTime30-59DaysPastDueNotWorse'] + 2*train_logistic_v2['NumberOfTime60-89DaysPastDueNotWorse'] + 4*train_logistic_v2['NumberOfTimes90DaysLate']
kaggle_test_logistic_v2['WeightedNumTimesLate'] = kaggle_test_logistic_v2['NumberOfTime30-59DaysPastDueNotWorse'] + 2*kaggle_test_logistic_v2['NumberOfTime60-89DaysPastDueNotWorse'] + 4*kaggle_test_logistic_v2['NumberOfTimes90DaysLate']

# Flag for customer who has gone due beyond 60 day at least once
train_logistic_v2['AtLeastOnce60DaysLateFlag'] = np.where(train_logistic_v2['NumberOfTime60-89DaysPastDueNotWorse'] + train_logistic_v2['NumberOfTimes90DaysLate'] > 0, 1, 0)
kaggle_test_logistic_v2['AtLeastOnce60DaysLateFlag'] = np.where(kaggle_test_logistic_v2['NumberOfTime60-89DaysPastDueNotWorse'] + kaggle_test_logistic_v2['NumberOfTimes90DaysLate'] > 0, 1, 0)

# New credit utilization variable to capture the non-linear behaviour exhibited by the original variable
train_logistic_v2['UtilizationSigmoid'] = 1/(1+np.exp(-1*train_logistic_v2.RevolvingUtilizationOfUnsecuredLines)) 
kaggle_test_logistic_v2['UtilizationSigmoid'] = 1/(1+np.exp(-1*kaggle_test_logistic_v2.RevolvingUtilizationOfUnsecuredLines)) 

# Smoothing age 
train_logistic_v2['ageBinned'] = np.where(train_logistic_v2.age<=34, 30,
                                          np.where(train_logistic_v2.age<=44, 40,
                                                  np.where(train_logistic_v2.age<=54, 50,
                                                          np.where(train_logistic_v2.age<=64, 60, 70))))
kaggle_test_logistic_v2['ageBinned'] = np.where(kaggle_test_logistic_v2.age<=34, 30,
                                          np.where(kaggle_test_logistic_v2.age<=44, 40,
                                                  np.where(kaggle_test_logistic_v2.age<=54, 50,
                                                          np.where(kaggle_test_logistic_v2.age<=64, 60, 70))))


In [31]:
# Information value

train_logistic_v2['SeriousDlqin2yrs'] = train_logistic_v2.SeriousDlqin2yrs.astype(int)
clf = WOE()
clf.fit(train_logistic_v2[train_logistic_v2.columns[2:]], train_logistic_v2['SeriousDlqin2yrs'])
clf.iv_df

Unnamed: 0,Variable_Name,Information_Value
16,WeightedNumTimesLate,1.222337
0,AtLeastOnce60DaysLateFlag,1.069606
15,UtilizationSigmoid,0.935422
14,RevolvingUtilizationOfUnsecuredLines,0.935422
12,NumberOfTimes90DaysLate,0.484201
10,NumberOfTime30-59DaysPastDueNotWorse,0.471829
11,NumberOfTime60-89DaysPastDueNotWorse,0.264823
18,ageBinned,0.248028
17,age,0.21985
2,IncomePerPerson,0.084335


In [32]:
# Multicollinearity

drop_cols_v2 = ['age', 'NumberOfTime30-59DaysPastDueNotWorse', 'NumberOfTime60-89DaysPastDueNotWorse', 
                'NumberOfTimes90DaysLate', 'MonthlyIncome', 'MoreThan2OpenLinesFlag', 
                'MonthlyExpenses', 'DebtRatio','RevolvingUtilizationOfUnsecuredLines','NumberOfDependents']

temp = train_logistic_v2[train_logistic_v2.columns[2:]].copy(deep=True)
vif_data = pd.DataFrame()
vif_data["feature"] = temp.columns
vif_data["VIF"] = [variance_inflation_factor(temp.values, i) for i in range(len(temp.columns))]
print('VIF with original variables:')
print(vif_data)
print('\n')

# Checking if this gets solved by combining the 'NumberOfTime...' variables removes multi-collinearity
temp = train_logistic_v2[train_logistic_v2.columns[2:]].copy(deep=True)
temp = temp.drop(drop_cols_v2, axis=1)
vif_data = pd.DataFrame()
vif_data["feature"] = temp.columns
vif_data["VIF"] = [variance_inflation_factor(temp.values, i) for i in range(len(temp.columns))]
print('VIF after treatment:')
print(vif_data)

  vif = 1. / (1. - r_squared_i)


VIF with original variables:
                                 feature         VIF
0   RevolvingUtilizationOfUnsecuredLines    5.898479
1                                    age  173.782593
2   NumberOfTime30-59DaysPastDueNotWorse         inf
3                              DebtRatio   11.664471
4                          MonthlyIncome   28.777663
5        NumberOfOpenCreditLinesAndLoans    6.311832
6                NumberOfTimes90DaysLate         inf
7           NumberRealEstateLoansOrLines    3.287406
8   NumberOfTime60-89DaysPastDueNotWorse         inf
9                     NumberOfDependents    5.445803
10                       IncomePerPerson   29.651038
11                MoreThan2OpenLinesFlag   15.663496
12          MoreThan4RealEstateLoansFlag    1.209096
13                       MonthlyExpenses   29.911015
14                    NetIncomePerPerson   14.096935
15                  WeightedNumTimesLate         inf
16             AtLeastOnce60DaysLateFlag    1.287523
17               

In [33]:
# Sample split - Train: 60%, Validation: 20%, Test: 20%

train_logistic_v2_temp = train_logistic_v2.drop(drop_cols_v2, axis = 1).copy(deep=True)
kaggle_test_logistic_v2_temp = kaggle_test_logistic_v2.drop(drop_cols_v2, axis = 1).copy(deep=True)

X_train_v2, X_test_v2, y_train_v2, y_test_v2 = train_test_split(train_logistic_v2_temp[train_logistic_v2_temp.columns[2:]],
                                                    train_logistic_v2_temp['SeriousDlqin2yrs'],
                                                    random_state=2023, test_size = 0.2)

X_train_v2, X_val_v2, y_train_v2, y_val_v2 = train_test_split(X_train_v2, y_train_v2,
                                                    random_state=2023, test_size = 0.25)


In [34]:
# Variable elimination

drop_cols_4 = []#['UtilizationSquared']#['IncomePerPerson']#['RevolvingUtilizationOfUnsecuredLines']#['NumberOfOpenCreditLinesAndLoans', ]

X_train_v2_2 = X_train_v2.drop(drop_cols_4, axis=1).copy(deep=True)
X_val_v2_2 = X_val_v2.drop(drop_cols_4, axis=1).copy(deep=True)
X_test_v2_2 = X_test_v2.drop(drop_cols_4, axis=1).copy(deep=True)
kaggle_test_logistic_v2_2 = kaggle_test_logistic_v2_temp.drop(drop_cols_4, axis=1).copy(deep=True)

import statsmodels.api as sm
logit_model=sm.Logit(y_train_v2,add_constant(X_train_v2_2, prepend=False))
result=logit_model.fit(disp=0)
print(pd.DataFrame({'Co-efficient':result.params,'p-value':result.pvalues}).sort_values('p-value'))
print('\n')

preds = result.predict(add_constant(X_train_v2_2, prepend=False))
fpr, tpr, threshold = metrics.roc_curve(y_train_v2, preds)
roc_auc = metrics.auc(fpr, tpr)
print('Train AUC Score : ', round(roc_auc,7))
print('Train Gini Score : ', round(roc_auc*2-1,7))
ks(y_train, preds)
print('\n')

preds = result.predict(add_constant(X_val_v2_2, prepend=False))
fpr, tpr, threshold = metrics.roc_curve(y_val_v2, preds)
roc_auc = metrics.auc(fpr, tpr)
print('Validation AUC Score : ', round(roc_auc,7))
print('Validation Gini Score : ', round(roc_auc*2-1,7))
ks(y_val, preds)
print('\n')


                                 Co-efficient       p-value
AtLeastOnce60DaysLateFlag            1.999375  0.000000e+00
UtilizationSigmoid                   9.327389  0.000000e+00
const                               -8.377522  0.000000e+00
NumberOfOpenCreditLinesAndLoans      0.050632  1.939833e-44
ageBinned                           -0.015449  1.122885e-32
MoreThan4RealEstateLoansFlag         0.983907  8.259838e-16
IncomePerPerson                     -0.000038  7.334117e-11
NetIncomePerPerson                  -0.000021  1.388914e-06
WeightedNumTimesLate                 0.000979  4.458538e-05
NumberRealEstateLoansOrLines         0.053397  3.572270e-03


Train AUC Score :  0.8453463
Train Gini Score :  0.6906925
KS is 53.7% at decile 3


Validation AUC Score :  0.839726
Validation Gini Score :  0.6794521
KS is 52.400000000000006% at decile 3




In [35]:
# Final model on internal test sample

preds = result.predict(add_constant(X_test_v2_2, prepend=False))
fpr, tpr, threshold = metrics.roc_curve(y_test_v2, preds)
roc_auc = metrics.auc(fpr, tpr)
print('Validation AUC Score : ', round(roc_auc,7))
print('Validation Gini Score : ', round(roc_auc*2-1,7))
ks(y_test_v2, preds)
print('\n')

Validation AUC Score :  0.8397855
Validation Gini Score :  0.6795709
KS is 52.900000000000006% at decile 3




In [36]:
# Logistic regression V2 Kaggle submission

preds = result.predict(add_constant(kaggle_test_logistic_v2_2[kaggle_test_logistic_v2_2.columns[2:]], prepend=False))
out = pd.DataFrame({'Id':list(kaggle_test_logistic_v2_2[kaggle_test_logistic_v2_2.columns[0]]),'Probability':list(preds)})
out.to_csv('kaggle_submission_logistic_regression_v2.csv', index=False)

### Appendix: Trying out XGBoost with the additional features

In [37]:



X_train_v3, X_test_v3, y_train_v3, y_test_v3 = train_test_split(train_logistic_v2[train_logistic_v2.columns[2:]],
                                                    train_logistic_v2['SeriousDlqin2yrs'],
                                                    random_state=2023, test_size = 0.2)

X_train_v3, X_val_v3, y_train_v3, y_val_v3 = train_test_split(X_train_v3, y_train_v3,
                                                    random_state=2023, test_size = 0.25)


clf = XGBClassifier(random_state=2022, n_estimators=150, learning_rate=0.05, max_depth=6, n_jobs=-1)
clf.fit(X_train_v3, y_train_v3)
clf_scores_proba = clf.predict_proba(X_train_v3)
clf_preds = clf_scores_proba[:,1]
fpr, tpr, thresh = roc_curve(y_train_v3, clf_preds)
roc_auc = metrics.auc(fpr, tpr)
print('Train AUC Score : ', round(roc_auc,7))
print('Train Gini Score : ', round(roc_auc*2-1,7))
kstable = ks(y_train_v3, clf_preds, True)
print('\n')

clf_scores_proba = clf.predict_proba(X_val_v3)
clf_preds = clf_scores_proba[:,1]
fpr, tpr, thresh = roc_curve(y_val_v3, clf_preds)
roc_auc = metrics.auc(fpr, tpr)
print('Validation AUC Score : ', round(roc_auc,7))
print('Validation Gini Score : ', round(roc_auc*2-1,7))
kstable = ks(y_val_v3, clf_preds, True)
print('\n')

clf_scores_proba = clf.predict_proba(X_test_v3)
clf_preds = clf_scores_proba[:,1]
fpr, tpr, thresh = roc_curve(y_test_v3, clf_preds)
roc_auc = metrics.auc(fpr, tpr)
plot_roc_curve(fpr, tpr)
print('Test AUC Score : ', round(roc_auc,7))
print('Test Gini Score : ', round(roc_auc*2-1,7))
kstable = ks(y_test_v3, clf_preds, True)
print('\n')





Train AUC Score :  0.8970707
Train Gini Score :  0.7941413
KS is 63.0% at decile 2


Validation AUC Score :  0.8606991
Validation Gini Score :  0.7213982
KS is 56.00000000000001% at decile 2


Test AUC Score :  0.8635059
Test Gini Score :  0.7270117
KS is 56.699999999999996% at decile 3




  plt.show()
