In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support, r2_score

import warnings
import os

In [2]:
# Load the dataset
# Both workbooks are read from ./dataset, relative to the notebook's working directory.
# `i` holds the contents of Internal.xlsx and `e` the contents of External.xlsx.
i = pd.read_excel("./dataset/Internal.xlsx")
e = pd.read_excel("./dataset/External.xlsx")

In [3]:
# Work on copies so the frames read from disk are never modified by the cleaning steps
df1, df2 = i.copy(), e.copy()

In [4]:
# Report (rows, columns) for each working frame
for frame in (df1, df2):
    print(frame.shape)

(51336, 26)
(51336, 62)


In [5]:
# Missing values are stored as -99999; keep only the df1 rows whose Age_Oldest_TL is not that marker
age_oldest_tl_present = df1['Age_Oldest_TL'].ne(-99999)
df1 = df1[age_oldest_tl_present]

In [6]:
# Drop every df2 column in which the -99999 missing-value marker occurs in more than 10000 rows

# Per-column count of marker occurrences, computed for the whole frame at once
sentinel_counts = (df2 == -99999).sum()
columns_to_be_removed = sentinel_counts[sentinel_counts > 10000].index.tolist()

df2 = df2.drop(columns = columns_to_be_removed)

In [7]:
# Keep only the df2 rows in which no remaining column holds the -99999 missing-value marker
rows_without_sentinel = (df2 != -99999).all(axis = 1)
df2 = df2.loc[rows_without_sentinel]

In [8]:
# NA values in df2
# The filters above only removed the -99999 marker, so count genuine NaN entries per column separately.
df2.isna().sum()

PROSPECTID                    0
time_since_recent_payment     0
num_times_delinquent          0
max_recent_level_of_deliq     0
num_deliq_6mts                0
num_deliq_12mts               0
num_deliq_6_12mts             0
num_times_30p_dpd             0
num_times_60p_dpd             0
num_std                       0
num_std_6mts                  0
num_std_12mts                 0
num_sub                       0
num_sub_6mts                  0
num_sub_12mts                 0
num_dbt                       0
num_dbt_6mts                  0
num_dbt_12mts                 0
num_lss                       0
num_lss_6mts                  0
num_lss_12mts                 0
recent_level_of_deliq         0
tot_enq                       0
CC_enq                        0
CC_enq_L6m                    0
CC_enq_L12m                   0
PL_enq                        0
PL_enq_L6m                    0
PL_enq_L12m                   0
time_since_recent_enq         0
enq_L12m                      0
enq_L6m 

In [9]:
# Print every column name that appears in both df1 and df2 (candidate join keys)
df2_column_names = list(df2.columns)
for column_name in df1.columns:
    if column_name in df2_column_names:
        print (column_name)

PROSPECTID


In [10]:
# Inner join on PROSPECTID: only prospects that survived cleaning in both frames are kept,
# so the merged frame contains no rows with a missing side
df = df1.merge(df2, how = 'inner', on = 'PROSPECTID')


In [11]:
# Summarise the merged frame: row count plus the dtype and non-null count of every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42064 entries, 0 to 42063
Data columns (total 79 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   PROSPECTID                  42064 non-null  int64  
 1   Total_TL                    42064 non-null  int64  
 2   Tot_Closed_TL               42064 non-null  int64  
 3   Tot_Active_TL               42064 non-null  int64  
 4   Total_TL_opened_L6M         42064 non-null  int64  
 5   Tot_TL_closed_L6M           42064 non-null  int64  
 6   pct_tl_open_L6M             42064 non-null  float64
 7   pct_tl_closed_L6M           42064 non-null  float64
 8   pct_active_tl               42064 non-null  float64
 9   pct_closed_tl               42064 non-null  float64
 10  Total_TL_opened_L12M        42064 non-null  int64  
 11  Tot_TL_closed_L12M          42064 non-null  int64  
 12  pct_tl_open_L12M            42064 non-null  float64
 13  pct_tl_closed_L12M          420

In [12]:
# Total number of NaN cells across the whole merged frame (0 means nothing is missing)
df.isna().sum().sum()

0

In [13]:
# Split the feature columns into categorical (object dtype) and numerical (everything else).
# The identifier and the target are not features, so they go into neither list.
categorical_log = []
numerical_log = []

excluded_columns = ('PROSPECTID', 'Approved_Flag')
for column_name in df.columns:
    if column_name in excluded_columns:
        continue
    if df[column_name].dtype != 'object':
        numerical_log.append(column_name)
        continue
    # Categorical column: record it and show how its levels are distributed
    categorical_log.append(column_name)
    print(df[column_name].value_counts(), end = '\n\n')
        

MARITALSTATUS
Married    30886
Single     11178
Name: count, dtype: int64

EDUCATION
GRADUATE          14140
12TH              11703
SSC                7241
UNDER GRADUATE     4572
OTHERS             2291
POST-GRADUATE      1898
PROFESSIONAL        219
Name: count, dtype: int64

GENDER
M    37345
F     4719
Name: count, dtype: int64

last_prod_enq2
ConsumerLoan    16480
others          13653
PL               7553
CC               2195
AL               1353
HL                830
Name: count, dtype: int64

first_prod_enq2
others          20640
ConsumerLoan    11075
PL               4431
AL               2641
CC               1988
HL               1289
Name: count, dtype: int64



In [14]:
# No removal is needed before the chi-square test: 'Approved_Flag' is already skipped
# when categorical_log is built, so it never enters the list.
# categorical_log.remove('Approved_Flag')

In [15]:
# Chi-square test of independence between each categorical feature and the target
for feature in categorical_log:
    contingency_table = pd.crosstab(df[feature], df['Approved_Flag'])
    # chi2_contingency returns (statistic, p-value, dof, expected); only the p-value is needed
    pval = chi2_contingency(contingency_table)[1]
    print(f'{feature} : {pval}')

# Every p-value printed by this cell is below 0.05, so all categorical features are kept as significant.

MARITALSTATUS : 3.578180861038862e-233


EDUCATION : 2.6942265249737532e-30
GENDER : 1.907936100186563e-05
last_prod_enq2 : 0.0
first_prod_enq2 : 7.84997610555419e-287


In [16]:
# Display the names of the columns classified as numerical
numerical_log

['Total_TL',
 'Tot_Closed_TL',
 'Tot_Active_TL',
 'Total_TL_opened_L6M',
 'Tot_TL_closed_L6M',
 'pct_tl_open_L6M',
 'pct_tl_closed_L6M',
 'pct_active_tl',
 'pct_closed_tl',
 'Total_TL_opened_L12M',
 'Tot_TL_closed_L12M',
 'pct_tl_open_L12M',
 'pct_tl_closed_L12M',
 'Tot_Missed_Pmnt',
 'Auto_TL',
 'CC_TL',
 'Consumer_TL',
 'Gold_TL',
 'Home_TL',
 'PL_TL',
 'Secured_TL',
 'Unsecured_TL',
 'Other_TL',
 'Age_Oldest_TL',
 'Age_Newest_TL',
 'time_since_recent_payment',
 'num_times_delinquent',
 'max_recent_level_of_deliq',
 'num_deliq_6mts',
 'num_deliq_12mts',
 'num_deliq_6_12mts',
 'num_times_30p_dpd',
 'num_times_60p_dpd',
 'num_std',
 'num_std_6mts',
 'num_std_12mts',
 'num_sub',
 'num_sub_6mts',
 'num_sub_12mts',
 'num_dbt',
 'num_dbt_6mts',
 'num_dbt_12mts',
 'num_lss',
 'num_lss_6mts',
 'num_lss_12mts',
 'recent_level_of_deliq',
 'tot_enq',
 'CC_enq',
 'CC_enq_L6m',
 'CC_enq_L12m',
 'PL_enq',
 'PL_enq_L6m',
 'PL_enq_L12m',
 'time_since_recent_enq',
 'enq_L12m',
 'enq_L6m',
 'enq_L3m',

In [17]:
# VIF sequentially check
# Working state for the sequential VIF filter that runs in the following cell.
vif_data = df[numerical_log]        # numerical features only; failing columns are dropped from this frame
total_columns = vif_data.shape[1]   # number of candidate columns before any are dropped
columns_to_be_kept = []             # names of the columns that pass the VIF check
column_index = 0                    # position within vif_data of the column currently under test

In [18]:
# Sequential VIF filter: test the numerical columns one at a time against the columns
# still present in `vif_data`. A column whose VIF is at most VIF_THRESHOLD is kept and
# the cursor advances; otherwise the column is dropped and the cursor stays where it is,
# because the next candidate has just shifted into that position.
# Perfectly collinear columns yield a VIF of inf (with a divide-by-zero warning) and are
# dropped like any other column above the threshold.
VIF_THRESHOLD = 6

# Rebuild the working state here so that re-running this cell on its own starts from the
# full numerical feature set instead of the leftovers of a previous run (which would
# append duplicate names and try to drop columns that are already gone).
vif_data = df[numerical_log]
columns_to_be_kept = []
column_index = 0

for column_name in numerical_log:

    vif_value = variance_inflation_factor(vif_data, column_index)
    print (column_index,'---',vif_value)

    if vif_value <= VIF_THRESHOLD:
        # Acceptable collinearity: keep the column and move on to the next position
        columns_to_be_kept.append(column_name)
        column_index = column_index + 1
    else:
        # Too collinear with the remaining columns: remove it from the working frame
        vif_data = vif_data.drop([column_name], axis=1)

  vif = 1. / (1. - r_squared_i)


0 --- inf


  vif = 1. / (1. - r_squared_i)


0 --- inf
0 --- 11.320180023967996
0 --- 8.363698035000336
0 --- 6.520647877790928
0 --- 5.149501618212625
1 --- 2.611111040579735


  vif = 1. / (1. - r_squared_i)


2 --- inf
2 --- 1788.7926256209232
2 --- 8.601028256477228
2 --- 3.8328007921530785
3 --- 6.099653381646739
3 --- 5.581352009642762
4 --- 1.985584353098778


  vif = 1. / (1. - r_squared_i)


5 --- inf
5 --- 4.809538302819343
6 --- 23.270628983464636
6 --- 30.595522588100053
6 --- 4.3843464059655854
7 --- 3.064658415523423
8 --- 2.898639771299253
9 --- 4.377876915347324
10 --- 2.207853583695844
11 --- 4.916914200506864
12 --- 5.214702030064725
13 --- 3.3861625024231476
14 --- 7.840583309478997
14 --- 5.255034641721438


  vif = 1. / (1. - r_squared_i)


15 --- inf
15 --- 7.380634506427232
15 --- 1.4210050015175733
16 --- 8.083255010190316
16 --- 1.6241227524040112
17 --- 7.257811920140003
17 --- 15.59624383268298
17 --- 1.825857047132431
18 --- 1.5080839450032664
19 --- 2.172088834824577
20 --- 2.62339755352723
21 --- 2.2959970812106176
22 --- 7.360578319196439
22 --- 2.1602387773102554
23 --- 2.8686288267891458
24 --- 6.458218003637277
24 --- 2.8474118865638265
25 --- 4.753198156284083
26 --- 16.22735475594825
26 --- 6.424377256363877
26 --- 8.887080381808687
26 --- 2.3804746142952653
27 --- 8.609513476514548
27 --- 13.06755093547673
27 --- 3.500040056654654
28 --- 1.9087955874813773
29 --- 17.006562234161628
29 --- 10.730485153719197
29 --- 2.3538497522950275
30 --- 22.104855915136433
30 --- 2.7971639638512906
31 --- 3.4241712032176985
32 --- 10.175021454450935
32 --- 6.408710354561301
32 --- 1.001151196262561
33 --- 3.069197305397274
34 --- 2.8091261600643715
35 --- 20.249538381980678
35 --- 15.864576541593774
35 --- 1.833164974053

In [19]:
# Before : 72 numerical features
# After  : 39 numerical features after VIF check

In [20]:
# One-way ANOVA filter: keep a numerical feature only if its mean differs significantly
# between the four Approved_Flag classes (p-value at or below ANOVA_ALPHA).
from scipy.stats import f_oneway

ANOVA_ALPHA = 0.05
approval_classes = ['P1', 'P2', 'P3', 'P4']

# The class membership of each row is the same for every feature, so build the
# boolean row masks once instead of re-scanning the target column per feature.
class_masks = [df['Approved_Flag'] == label for label in approval_classes]

columns_to_be_kept_numerical = []
for column_name in columns_to_be_kept:
    # One sample of feature values per class, in P1..P4 order
    class_samples = [df.loc[mask, column_name] for mask in class_masks]

    f_statistic, p_value = f_oneway(*class_samples)

    if p_value <= ANOVA_ALPHA:
        columns_to_be_kept_numerical.append(column_name)

In [21]:
# Number of numerical features that passed the ANOVA filter
len(columns_to_be_kept_numerical)

37

In [22]:
# Feature selection is now complete for both the numerical and the categorical columns.

# Final modelling frame: the selected numerical features, then the categorical ones,
# then the target as the last column
features = [*columns_to_be_kept_numerical, *categorical_log]
df = df[[*features, 'Approved_Flag']]
df.shape

(42064, 43)

In [23]:
# Show the categorical feature names and the distinct EDUCATION labels present in the data
print(categorical_log)
print(df['EDUCATION'].unique())

['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']
['12TH' 'GRADUATE' 'SSC' 'POST-GRADUATE' 'UNDER GRADUATE' 'OTHERS'
 'PROFESSIONAL']


In [24]:
# Ordinal feature - EDUCATION
# Each label is replaced by an integer rank (several labels share a rank):
#   SSC, OTHERS                             -> 1
#   12TH                                    -> 2
#   GRADUATE, UNDER GRADUATE, PROFESSIONAL  -> 3
#   POST-GRADUATE                           -> 4
education_rank = {
    'SSC': 1,
    '12TH': 2,
    'GRADUATE': 3,
    'UNDER GRADUATE': 3,
    'POST-GRADUATE': 4,
    'OTHERS': 1,
    'PROFESSIONAL': 3,
}

# Values that are not in the table pass through unchanged, so an unexpected label still
# makes the int cast below fail, and re-running the cell on already-encoded data is a no-op.
df['EDUCATION'] = df['EDUCATION'].map(lambda label: education_rank.get(label, label))

print(df['EDUCATION'].value_counts())
df['EDUCATION'] = df['EDUCATION'].astype('int')

EDUCATION
3    18931
2    11703
1     9532
4     1898
Name: count, dtype: int64


In [25]:
# One-hot encode the nominal categorical features; every other column is carried over unchanged
nominal_columns = ['MARITALSTATUS', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']
df_encoded = pd.get_dummies(data = df, columns = nominal_columns)
df_encoded.shape

(42064, 55)

In [26]:
# info() prints its summary and returns None, so there is nothing worth binding to a name
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42064 entries, 0 to 42063
Data columns (total 55 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   pct_tl_open_L6M               42064 non-null  float64
 1   pct_tl_closed_L6M             42064 non-null  float64
 2   Tot_TL_closed_L12M            42064 non-null  int64  
 3   pct_tl_closed_L12M            42064 non-null  float64
 4   Tot_Missed_Pmnt               42064 non-null  int64  
 5   CC_TL                         42064 non-null  int64  
 6   Home_TL                       42064 non-null  int64  
 7   PL_TL                         42064 non-null  int64  
 8   Secured_TL                    42064 non-null  int64  
 9   Unsecured_TL                  42064 non-null  int64  
 10  Other_TL                      42064 non-null  int64  
 11  Age_Oldest_TL                 42064 non-null  int64  
 12  Age_Newest_TL                 42064 non-null  int64  
 13  t

In [27]:
# Class balance of the target: number of rows per Approved_Flag value
df['Approved_Flag'].value_counts()

Approved_Flag
P2    25452
P3     6440
P4     5264
P1     4908
Name: count, dtype: int64

In [28]:
# Model training

# 1. Random Forest Classifier

# Separate the target label from the predictor columns
Y = df_encoded['Approved_Flag']
X = df_encoded.drop(columns = ['Approved_Flag'])

# 80/20 hold-out split with a fixed seed so the split is repeatable
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

rf = RandomForestClassifier(n_estimators = 200, random_state = 42)
rf.fit(X_train, Y_train)

Y_pred = rf.predict(X_test)

accuracy = accuracy_score(Y_test, Y_pred)
print(f'Accuracy : {accuracy}')

# Per-class metrics; the fourth returned array (support) is not used.
# NOTE(review): the p1..p4 names assume the metric arrays follow sorted label order - confirm.
precision, recall, f1_score = precision_recall_fscore_support(Y_test, Y_pred)[:3]
class_names = ['p1', 'p2', 'p3', 'p4']
for position, class_name in enumerate(class_names):
    print(f'class {class_name} : precision : {precision[position]}, recall : {recall[position]}, f1_score : {f1_score[position]}')
    

Accuracy : 0.7636990372043266
class p1 : precision : 0.8370457209847597, recall : 0.7041420118343196, f1_score : 0.7648634172469202
class p2 : precision : 0.7957519116397621, recall : 0.9282457879088206, f1_score : 0.856907593778591
class p3 : precision : 0.4423380726698262, recall : 0.21132075471698114, f1_score : 0.28600612870275793
class p4 : precision : 0.7178502879078695, recall : 0.7269193391642371, f1_score : 0.7223563495895703


In [29]:
# !pip install xgboost

In [30]:
# 2. xgboost

from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# Separate the target label from the predictor columns
Y = df_encoded['Approved_Flag']
X = df_encoded.drop(columns = ['Approved_Flag'])

# Convert the string class labels into integer codes before training
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(Y)

# 80/20 hold-out split with a fixed seed so the split is repeatable
X_train, X_test, Y_train, Y_test = train_test_split(X, y_encoded, test_size = 0.2, random_state = 42)

xg = XGBClassifier(objective = 'multi:softmax', num_class = 4, random_state = 42)
xg.fit(X_train, Y_train)

Y_pred = xg.predict(X_test)

accuracy = accuracy_score(Y_test, Y_pred)
print(f'Accuracy : {accuracy}')

# Per-class metrics; the fourth returned array (support) is not used.
# NOTE(review): the p1..p4 names assume the encoded classes 0..3 correspond to P1..P4 - confirm.
precision, recall, f1_score = precision_recall_fscore_support(Y_test, Y_pred)[:3]
class_names = ['p1', 'p2', 'p3', 'p4']
for position, class_name in enumerate(class_names):
    print(f'class {class_name} : precision : {precision[position]}, recall : {recall[position]}, f1_score : {f1_score[position]}')

Accuracy : 0.7783192677998336
class p1 : precision : 0.823906083244397, recall : 0.7613412228796844, f1_score : 0.7913890312660175
class p2 : precision : 0.8255418233924413, recall : 0.913577799801784, f1_score : 0.8673315769665035
class p3 : precision : 0.4756380510440835, recall : 0.30943396226415093, f1_score : 0.37494284407864653
class p4 : precision : 0.7342386032977691, recall : 0.7356656948493683, f1_score : 0.7349514563106796


In [31]:
# 3. Decision Tree

from sklearn.tree import DecisionTreeClassifier

# Separate the target label from the predictor columns
Y = df_encoded['Approved_Flag']
X = df_encoded.drop(columns = ['Approved_Flag'])

# 80/20 hold-out split with a fixed seed so the split is repeatable
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

dt = DecisionTreeClassifier(random_state = 42)
dt.fit(X_train, Y_train)

Y_pred = dt.predict(X_test)

accuracy = accuracy_score(Y_test, Y_pred)
print(f'Accuracy : {accuracy}')

# Per-class metrics; the fourth returned array (support) is not used.
# NOTE(review): the p1..p4 names assume the metric arrays follow sorted label order - confirm.
precision, recall, f1_score = precision_recall_fscore_support(Y_test, Y_pred)[:3]
class_names = ['p1', 'p2', 'p3', 'p4']
for position, class_name in enumerate(class_names):
    print(f'class {class_name} : precision : {precision[position]}, recall : {recall[position]}, f1_score : {f1_score[position]}')

Accuracy : 0.7021276595744681
class p1 : precision : 0.7207661290322581, recall : 0.7051282051282052, f1_score : 0.7128614157527418
class p2 : precision : 0.8134373125374925, recall : 0.8063429137760159, f1_score : 0.8098745769460481
class p3 : precision : 0.3391430646332607, recall : 0.35245283018867923, f1_score : 0.3456698741672835
class p4 : precision : 0.6299137104506232, recall : 0.6384839650145773, f1_score : 0.6341698841698842


In [32]:
# XGBoost reached the highest test accuracy of the three models tried, so it is the one tuned here.

# Candidate values for each hyperparameter of the grid search
param_grid = dict(
    colsample_bytree = [0.1, 0.3, 0.5, 0.7, 0.9],
    learning_rate = [0.1, 0.01, 0.001],
    max_depth = [3, 5, 7, 9],
    n_estimators = [100, 200, 300, 400, 500],
    alpha = [10, 20, 30, 40, 50],
)

# Running count of the combinations evaluated so far
index = 0

# Column-oriented result log: the combination number, both accuracies, then one
# column per hyperparameter (in param_grid order). Every column gets its own list.
result_columns = ('combination', 'train_accuracy', 'test_accuracy', *param_grid)
answers_grid = {column: [] for column in result_columns}

In [33]:
# Exhaustive grid search: fit one XGBoost model per hyperparameter combination in
# param_grid and log its train and test accuracy in answers_grid.
# NOTE(review): if the winning combination is picked by test_accuracy, the test split has
# been used for model selection and no longer gives an unbiased estimate - consider a
# separate validation split or cross-validation.
from itertools import product

# The encoded labels and the train/test split do not depend on the hyperparameters
# (the split seed is fixed), so build them once instead of once per combination.
y = df_encoded['Approved_Flag']
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, Y_train, Y_test = train_test_split(X, y_encoded, test_size = 0.2, random_state = 42)

# Start from an empty log so that re-running this cell does not append a second copy
# of every result to answers_grid.
index = 0
for logged_values in answers_grid.values():
    logged_values.clear()

# product() varies its last argument fastest, which is the same visiting order as five
# nested loops written in the order colsample_bytree, learning_rate, max_depth,
# n_estimators, alpha.
for colsample_bytree, learning_rate, max_depth, n_estimators, alpha in product(
    param_grid['colsample_bytree'],
    param_grid['learning_rate'],
    param_grid['max_depth'],
    param_grid['n_estimators'],
    param_grid['alpha'],
):

    index = index + 1

    # Define and train the model
    model = XGBClassifier(
        colsample_bytree = colsample_bytree,
        learning_rate = learning_rate,
        max_depth = max_depth,
        n_estimators = n_estimators,
        alpha = alpha,
        objective = 'multi:softmax',
        num_class = 4,
        random_state = 42
    )
    model.fit(X_train, Y_train)

    # Predict on the training and test data
    Y_train_pred = model.predict(X_train)
    Y_test_pred = model.predict(X_test)

    # Calculate the accuracy
    train_accuracy = accuracy_score(Y_train, Y_train_pred)
    test_accuracy = accuracy_score(Y_test, Y_test_pred)

    # Save the results
    answers_grid['combination'].append(index)
    answers_grid['train_accuracy'].append(train_accuracy)
    answers_grid['test_accuracy'].append(test_accuracy)
    answers_grid['colsample_bytree'].append(colsample_bytree)
    answers_grid['learning_rate'].append(learning_rate)
    answers_grid['max_depth'].append(max_depth)
    answers_grid['n_estimators'].append(n_estimators)
    answers_grid['alpha'].append(alpha)

    # Print the results
    print(f'Combination : {index}')
    print(f'colsample_bytree : {colsample_bytree}, learning_rate : {learning_rate}, max_depth : {max_depth}, n_estimators : {n_estimators}, alpha : {alpha}')
    print(f'Train Accuracy : {train_accuracy:.2f}')
    print(f'Test Accuracy : {test_accuracy:.2f}')


                    

Combination : 1
colsample_bytree : 0.1, learning_rate : 0.1, max_depth : 3, n_estimators : 100, alpha : 10
Train Accuracy : 0.72
Test Accuracy : 0.71
Combination : 2
colsample_bytree : 0.1, learning_rate : 0.1, max_depth : 3, n_estimators : 100, alpha : 20
Train Accuracy : 0.72
Test Accuracy : 0.71
Combination : 3
colsample_bytree : 0.1, learning_rate : 0.1, max_depth : 3, n_estimators : 100, alpha : 30
Train Accuracy : 0.71
Test Accuracy : 0.71
Combination : 4
colsample_bytree : 0.1, learning_rate : 0.1, max_depth : 3, n_estimators : 100, alpha : 40
Train Accuracy : 0.71
Test Accuracy : 0.70
Combination : 5
colsample_bytree : 0.1, learning_rate : 0.1, max_depth : 3, n_estimators : 100, alpha : 50
Train Accuracy : 0.71
Test Accuracy : 0.70
Combination : 6
colsample_bytree : 0.1, learning_rate : 0.1, max_depth : 3, n_estimators : 200, alpha : 10
Train Accuracy : 0.76
Test Accuracy : 0.75
Combination : 7
colsample_bytree : 0.1, learning_rate : 0.1, max_depth : 3, n_estimators : 200, alph

In [37]:
# Persist the grid-search log: one row per hyperparameter combination
parameter = pd.DataFrame(answers_grid)

# to_csv does not create missing directories, so make sure ./model exists before writing
os.makedirs('./model', exist_ok = True)
parameter.to_csv('./model/parameter.csv', index = False)

In [39]:
# Refit XGBoost with one fixed hyperparameter combination and score it on the test split.
# NOTE(review): presumably the combination chosen from the grid-search log - confirm against parameter.csv.
best_params = {
    'colsample_bytree': 0.7,
    'learning_rate': 0.1,
    'max_depth': 3,
    'n_estimators': 300,
    'alpha': 40,
}
model = XGBClassifier(**best_params)

model.fit(X_train, Y_train)

Y_pred = model.predict(X_test)

accuracy = accuracy_score(Y_test, Y_pred)
print(f'Accuracy : {accuracy}')

Accuracy : 0.7782004041364555


In [44]:
import pickle

# Serialise the fitted model to disk
filename = './model/loan_v1.sav'

# open(..., 'wb') fails if the target directory is missing, so create it first
os.makedirs(os.path.dirname(filename), exist_ok = True)
with open(filename, 'wb') as f:
    pickle.dump(model, f)

In [46]:
# Round-trip check: load the pickled model back and score it on the same test split.
# pickle.load executes code stored in the file, so only ever point it at a file this notebook wrote.
filename = './model/loan_v1.sav'

with open(filename, mode = 'rb') as model_file:
    load_model = pickle.load(model_file)

Y_pred = load_model.predict(X_test)
accuracy = accuracy_score(y_true = Y_test, y_pred = Y_pred)

In [47]:
# Test accuracy of the model reloaded from disk; it should match the accuracy printed before saving
print(f'Accuracy : {accuracy}')

Accuracy : 0.7782004041364555
