### Import necessary libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import r2_score, classification_report, accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import chi2_contingency
from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings
import os
warnings.filterwarnings('ignore')

### Load the dataset

In [None]:
# Read both case-study workbooks; a1 and a2 keep the untouched raw data.
a1, a2 = (
    pd.read_excel(workbook)
    for workbook in ('/content/case_study1.xlsx', '/content/case_study2.xlsx')
)

In [None]:
# Work on copies so the raw frames a1 / a2 stay available for a re-run.
df1, df2 = a1.copy(), a2.copy()

### Remove nulls

In [None]:
# -99999 is the missing-value sentinel; keep only rows where Age_Oldest_TL is a real value.
df1 = df1[df1['Age_Oldest_TL'].ne(-99999)]

In [None]:
# A column is discarded when more than 10000 of its rows hold the -99999 sentinel.
columns_to_be_removed = [
    column
    for column in df2.columns
    if (df2[column] == -99999).sum() > 10000
]

df2 = df2.drop(columns=columns_to_be_removed)

In [None]:
# Drop every row that still carries the -99999 missing-value sentinel in any
# remaining column. The selection has to be assigned back to df2: a bare
# `df2.loc[...]` expression only builds a temporary frame and discards it.
for i in df2.columns:
    df2 = df2.loc[df2[i] != -99999]

# Print the column names present in both frames (candidate merge keys).
for i in list(df1.columns):
  if i in list(df2.columns):
    print(i)

PROSPECTID


### Merge the two dataframes, inner join so that no nulls are present

In [None]:
# Inner join on the shared key: only prospects present in both frames survive.
df = df1.merge(df2, how='inner', on='PROSPECTID')

In [None]:
# Total number of missing cells across the whole merged frame.
df.isnull().values.sum()

0

# Feature selection on the merged dataframe

In [None]:
# Print the name of every column stored with the generic object dtype.
for column_name, column_dtype in df.dtypes.items():
  if column_dtype == 'object':
    print(column_name)

MARITALSTATUS
EDUCATION
GENDER
last_prod_enq2
first_prod_enq2
Approved_Flag


In [None]:
# Chi-square test of independence between each object column and the target.
# The last entry is the target itself, so its p-value is trivially 0.
categorical_candidates = ['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2', 'Approved_Flag']
target_labels = df['Approved_Flag']

for i in categorical_candidates:
  contingency_table = pd.crosstab(df[i], target_labels)
  _, pval, *_ = chi2_contingency(contingency_table)
  print(i, ':', pval)

MARITALSTATUS : 2.7347247145640217e-257
EDUCATION : 6.772122013692765e-38
GENDER : 0.000259241464445085
last_prod_enq2 : 0.0
first_prod_enq2 : 0.0
Approved_Flag : 0.0


In [None]:
# Every non-object column except the row identifier and the target.
numeric_columns = [
    column
    for column in df.columns
    if df[column].dtype != 'object' and column not in ['PROSPECTID', 'Approved_Flag']
]

In [None]:
# State consumed by the VIF elimination loop in the next cell.
columns_to_be_kept = []          # names of the columns that pass the VIF check
column_index = 0                 # position inside vif_data of the column under test
vif_data = df[numeric_columns]   # shrinks as high-VIF columns are dropped
total_columns = len(vif_data.columns)

In [None]:
# Sequential VIF filter over the numeric columns.
# `i` walks the ORIGINAL column list (numeric_columns); `column_index` is the
# position of that same column inside the shrinking `vif_data` frame.
# Relies on vif_data / columns_to_be_kept / column_index as initialised in the
# previous cell, so that cell must be re-run before re-running this one.
for i in range(0, total_columns):
  # VIF of the column currently sitting at position column_index.
  vif_value = variance_inflation_factor(vif_data, column_index)
  print(column_index, ':', vif_value)

  if vif_value < 6:
    # Below the threshold of 6: keep the column and advance to the next position.
    columns_to_be_kept.append(numeric_columns[i])
    column_index = column_index + 1
  else:
    # At or above the threshold: drop the column. column_index is not advanced,
    # because the next column slides into the same position.
    vif_data = vif_data.drop([numeric_columns[i]], axis=1)

0 : inf
0 : inf
0 : 10.96731827150461
0 : 8.28043194336109
0 : 6.428482253706439
0 : 5.510394576267282
1 : 2.4900436238094175
2 : inf
2 : 1813.189446652738
2 : 8.26063898818775
2 : 3.670325423946726
3 : 5.694216446776847
4 : 4.970645827346287
5 : 2.002032552173907
6 : inf
6 : 4.788323772675481
7 : 21.142530080727536
7 : 32.035916588478756
7 : 4.45209231888243
8 : 2.9986771147212234
9 : 2.799174255749638
10 : 4.0849555302802525
11 : 2.180147515711328
12 : 5.113889440000764
13 : 3.247544480100828
14 : 1.2747264556566809
15 : 8.005890070879012
15 : 5.397672188311226
16 : inf
16 : 7.274564152451663
16 : 1.4153535674199744
17 : 8.503819690047568
17 : 1.6294794750057182
18 : 7.078919419257034
18 : 15.410978252728741
18 : 1.836324397616079
19 : 1.5650766286119036
20 : 2.546098272138208
21 : 3.1032891116488006
22 : 2.194287797978665
23 : 7.383989877498994
23 : 2.0751482560411603
24 : 2.726449304987256
25 : 6.293135612441871
25 : 2.710267787144797
26 : 4.923780392902989
27 : 347440509.5926095
2

In [None]:
from scipy.stats import f_oneway

# One-way ANOVA per VIF-surviving column: keep the column when its values
# differ across the four Approved_Flag grades at the 5% level.
columns_to_be_kept_numerical = []
approval_grade = df['Approved_Flag']

for feature in columns_to_be_kept:
  # One sample of feature values per grade, in P1..P4 order.
  samples_by_grade = [
      df.loc[approval_grade == grade, feature]
      for grade in ('P1', 'P2', 'P3', 'P4')
  ]

  f_statistic, p_value = f_oneway(*samples_by_grade)

  if p_value < 0.05:
    columns_to_be_kept_numerical.append(feature)

In [None]:
# Value left over from the final iteration of the f_oneway loop, i.e. the
# statistic for the last column in columns_to_be_kept only.
f_statistic

483.3714500929281

In [None]:
# Value left over from the final iteration of the f_oneway loop, i.e. the
# p-value for the last column in columns_to_be_kept only.
p_value

9.07042366922547e-310

In [None]:
# Final feature set: numeric columns that passed VIF + ANOVA, plus the
# categorical columns examined with the chi-square test.
features = columns_to_be_kept_numerical + ['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']

# Take an explicit copy: later cells write into this frame with .loc, and a
# frame produced by column selection is flagged as a slice of its parent.
df = df[features + ['Approved_Flag']].copy()

In [None]:
# Show the distinct values of every categorical feature. A notebook cell only
# displays its last bare expression, so each column is printed explicitly.
for column in ['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']:
    print(column, ':', df[column].unique())

array(['PL', 'ConsumerLoan', 'others', 'AL', 'HL', 'CC'], dtype=object)

In [None]:
# Ordinal encoding of EDUCATION: each label is overwritten with its rank.
education_codes = {
    'SSC': 1,
    '12TH': 2,
    'GRADUATE': 3,
    'UNDER GRADUATE': 3,
    'POST-GRADUATE': 4,
    'OTHERS': 1,
    'PROFESSIONAL': 3,
}

for label, code in education_codes.items():
    df.loc[df['EDUCATION'] == label, ['EDUCATION']] = code


In [None]:
# value_counts() is evaluated but not displayed (it is not the cell's last expression).
df['EDUCATION'].value_counts()
# The codes were written into an object column; store them as integers.
df = df.astype({'EDUCATION': int})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51296 entries, 0 to 51295
Data columns (total 40 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   pct_tl_open_L6M            51296 non-null  float64
 1   pct_tl_closed_L6M          51296 non-null  float64
 2   Tot_TL_closed_L12M         51296 non-null  int64  
 3   pct_tl_open_L12M           51296 non-null  float64
 4   pct_tl_closed_L12M         51296 non-null  float64
 5   Tot_Missed_Pmnt            51296 non-null  int64  
 6   CC_TL                      51296 non-null  int64  
 7   Home_TL                    51296 non-null  int64  
 8   PL_TL                      51296 non-null  int64  
 9   Secured_TL                 51296 non-null  int64  
 10  Unsecured_TL               51296 non-null  int64  
 11  Other_TL                   51296 non-null  int64  
 12  Age_Oldest_TL              51296 non-null  int64  
 13  Age_Newest_TL              51296 non-null  int

In [None]:
# One-hot encode the nominal categorical features (EDUCATION is already ordinal).
nominal_columns = ['MARITALSTATUS', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']
df_encoded = pd.get_dummies(df, columns=nominal_columns)

df_encoded.info()
# Summary statistics kept in `k` for inspection.
k = df_encoded.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51296 entries, 0 to 51295
Data columns (total 52 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   pct_tl_open_L6M               51296 non-null  float64
 1   pct_tl_closed_L6M             51296 non-null  float64
 2   Tot_TL_closed_L12M            51296 non-null  int64  
 3   pct_tl_open_L12M              51296 non-null  float64
 4   pct_tl_closed_L12M            51296 non-null  float64
 5   Tot_Missed_Pmnt               51296 non-null  int64  
 6   CC_TL                         51296 non-null  int64  
 7   Home_TL                       51296 non-null  int64  
 8   PL_TL                         51296 non-null  int64  
 9   Secured_TL                    51296 non-null  int64  
 10  Unsecured_TL                  51296 non-null  int64  
 11  Other_TL                      51296 non-null  int64  
 12  Age_Oldest_TL                 51296 non-null  int64  
 13  A

# Machine Learning model fitting


### Data processing
#### 1. Random Forest

In [None]:
# Target is the approval grade; every other encoded column is a feature.
y = df_encoded['Approved_Flag']
X = df_encoded.drop(['Approved_Flag'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(x_train, y_train)
y_pred = rf_classifier.predict(x_test)

# Store the score under its own name. Assigning it to `accuracy_score` would
# replace the imported sklearn function with a float, so re-running this cell
# (or any later cell calling accuracy_score) would raise a TypeError.
accuracy = accuracy_score(y_test, y_pred)
print()
print("Accuracy Score:", accuracy)
print()

# Per-class metrics, returned in sorted label order (P1..P4).
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1','p2','p3','p4']):
    print(f"class {v}:")
    print(f"precision: {precision[i]}")
    print(f"recall: {recall[i]}")
    print(f"f1_score: {f1_score[i]}")
    print()


Accuracy Score: 0.7783625730994153

class p1:
precision: 0.8125
recall: 0.7152838427947599
f1_score: 0.7607988852763586

class p2:
precision: 0.8155495978552278
recall: 0.9331288343558283
f1_score: 0.8703862660944205

class p3:
precision: 0.4323922734026746
recall: 0.19782460910944935
f1_score: 0.271455223880597

class p4:
precision: 0.707774798927614
recall: 0.7046263345195729
f1_score: 0.7061970575122604



#### 2. xgboost

In [None]:
!pip install xgboost -U -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.3.1+cu121 requires nvidia-cublas-cu12==12.1.3.1; platform_system == "Linux" and platform_machine == "x86_64", which is not installed.
torch 2.3.1+cu121 requires nvidia-cuda-cupti-cu12==12.1.105; platform_system == "Linux" and platform_machine == "x86_64", which is not installed.
torch 2.3.1+cu121 requires nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == "Linux" and platform_machine == "x86_64", which is not installed.
torch 2.3.1+cu121 requires nvidia-cuda-runtime-cu12==12.1.105; platform_system == "Linux" and platform_machine == "x86_64", which is not

In [None]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# The keyword is `objective` (singular). The misspelt `objectives` is not a
# recognised parameter, so the requested objective was never applied.
xgb_classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=4)

y = df_encoded['Approved_Flag']
X = df_encoded.drop(['Approved_Flag'], axis=1)

# XGBoost needs integer class ids, so the P1..P4 labels are encoded first.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

x_train, x_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
xgb_classifier.fit(x_train, y_train)
y_pred = xgb_classifier.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print()
print(f"Accuracy Score: {accuracy: .2f}")
print()

# Per-class metrics, indexed by encoded class id.
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1','p2','p3','p4']):
    print(f"class {v}:")
    print(f"precision: {precision[i]}")
    print(f"recall: {recall[i]}")
    print(f"f1_score: {f1_score[i]}")
    print()


Accuracy Score:  0.79

class p1:
precision: 0.7850045167118338
recall: 0.7589519650655022
f1_score: 0.7717584369449378

class p2:
precision: 0.8385925719072884
recall: 0.9211656441717792
f1_score: 0.8779418213711445

class p3:
precision: 0.45555555555555555
recall: 0.2787219578518015
f1_score: 0.3458456347532686

class p4:
precision: 0.7369385884509624
recall: 0.7153024911032029
f1_score: 0.7259593679458239



#### 3. Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

y = df_encoded['Approved_Flag']
X = df_encoded.drop(['Approved_Flag'], axis=1)

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# random_state pins the tree's internal randomness so the reported metrics are
# reproducible, matching the seeded split and the seeded Random Forest.
dt_model = DecisionTreeClassifier(max_depth = 20, min_samples_split = 10, random_state=42)
dt_model.fit(x_train, y_train)
y_pred = dt_model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print()
print(f"Accuracy Score: {accuracy: .2f}")
print()

# Per-class metrics, returned in sorted label order (P1..P4).
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1','p2','p3','p4']):
    print(f"class {v}:")
    print(f"precision: {precision[i]}")
    print(f"recall: {recall[i]}")
    print(f"f1_score: {f1_score[i]}")
    print()


Accuracy Score:  0.73

class p1:
precision: 0.6951827242524917
recall: 0.731004366812227
f1_score: 0.7126436781609194

class p2:
precision: 0.8263982102908277
recall: 0.8498466257668712
f1_score: 0.8379584120982987

class p3:
precision: 0.34103156274056967
recall: 0.30115567641060503
f1_score: 0.31985559566787003

class p4:
precision: 0.6473384030418251
recall: 0.6058718861209964
f1_score: 0.6259191176470589



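XGBoost gives the best results of the three models (accuracy 0.79, versus 0.78 for Random Forest and 0.73 for Decision Tree).

We will fine-tune it further.

First, apply a standard scaler to the continuous features.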

In [None]:
from sklearn.preprocessing import StandardScaler

import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

columns_to_be_scaled = ['Age_Oldest_TL','Age_Newest_TL','time_since_recent_payment',
'max_recent_level_of_deliq','recent_level_of_deliq','NETMONTHLYINCOME','Time_With_Curr_Empr']

y = df_encoded['Approved_Flag']
X = df_encoded.drop(['Approved_Flag'], axis=1)

# XGBoost needs integer class ids, so the P1..P4 labels are encoded first.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split BEFORE scaling. Fitting the scaler on the full dataset would let
# test-set statistics (mean / std) leak into the training features.
x_train, x_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Copies, so the scaled values are not written back into X / df_encoded and
# the cell stays safe to re-run.
x_train = x_train.copy()
x_test = x_test.copy()

# Learn mean / std on the training rows only, then apply them to both parts.
scaler = StandardScaler()
x_train[columns_to_be_scaled] = scaler.fit_transform(x_train[columns_to_be_scaled])
x_test[columns_to_be_scaled] = scaler.transform(x_test[columns_to_be_scaled])

# The keyword is `objective` (singular); the misspelt `objectives` is ignored.
xgb_classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=4)
xgb_classifier.fit(x_train, y_train)
y_pred = xgb_classifier.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print()
print(f"Accuracy Score: {accuracy: .2f}")
print()

# Per-class metrics, indexed by encoded class id.
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1','p2','p3','p4']):
    print(f"class {v}:")
    print(f"precision: {precision[i]}")
    print(f"recall: {recall[i]}")
    print(f"f1_score: {f1_score[i]}")
    print()


Accuracy Score:  0.79

class p1:
precision: 0.7850045167118338
recall: 0.7589519650655022
f1_score: 0.7717584369449378

class p2:
precision: 0.8385925719072884
recall: 0.9211656441717792
f1_score: 0.8779418213711445

class p3:
precision: 0.45555555555555555
recall: 0.2787219578518015
f1_score: 0.3458456347532686

class p4:
precision: 0.7369385884509624
recall: 0.7153024911032029
f1_score: 0.7259593679458239



#### Hyperparameter tuning in xgboost

In [None]:
from sklearn.model_selection import GridSearchCV

# Same split as the earlier model cells (same seed), so scores are comparable.
x_train, x_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# The keyword is `objective` (singular); the misspelt `objectives` is ignored.
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class = 4)

# 3 x 3 x 3 = 27 candidates, each evaluated with 3-fold CV (81 fits).
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
}

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='accuracy', cv=3, n_jobs=-1)
grid_search.fit(x_train, y_train)

print("Best Hyperparameters", grid_search.best_params_)

# best_estimator_ is refit on the whole training split; score it on the held-out test split.
best_model = grid_search.best_estimator_
accuracy = best_model.score(x_test, y_test)
print("Test Accuracy:", accuracy)

Best Hyperparameters {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
Test Accuracy: 0.7908382066276803
