In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os 

import warnings
warnings.filterwarnings("ignore")

from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats import chi2_contingency
from scipy.stats import f_oneway

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score 
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.preprocessing import LabelEncoder 

from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier




In [212]:
# load the datasets

df1 = pd.read_excel('case_study1.xlsx')

df2 = pd.read_excel('case_study2.xlsx')

In [213]:
print(df1.shape)
print(df2.shape)

(51336, 26)
(51336, 62)


In [214]:
# remove nulls

df1 = df1[df1['Age_Newest_TL'] != -99999]

df1.shape

(51296, 26)

In [215]:
# columns with more than 10k null values 

cols_removed = []

for i in df2.columns:
    if df2.loc[df2[i] == -99999].shape[0] > 10000:
        cols_removed.append(i)

cols_removed

['time_since_first_deliquency',
 'time_since_recent_deliquency',
 'max_delinquency_level',
 'max_deliq_6mts',
 'max_deliq_12mts',
 'CC_utilization',
 'PL_utilization',
 'max_unsec_exposure_inPct']

In [216]:
# remove cols with null values

df2 = df2.drop(cols_removed, axis=1)

df2.shape

(51336, 54)

In [217]:
# remove null rows

for i in df2.columns:
    df2 = df2.loc[df2[i] != -99999]

df2.shape
    

(42066, 54)

In [218]:
# checking commom columns

for i in list(df1.columns):
    if i in list(df2.columns):
        print(i)

PROSPECTID


In [219]:
# join both datasets

df = df1.merge(df2, how='inner', on='PROSPECTID')

df.shape

(42064, 79)

In [220]:
pd.set_option('display.max_columns', None)

df.describe()

Unnamed: 0,PROSPECTID,Total_TL,Tot_Closed_TL,Tot_Active_TL,Total_TL_opened_L6M,Tot_TL_closed_L6M,pct_tl_open_L6M,pct_tl_closed_L6M,pct_active_tl,pct_closed_tl,Total_TL_opened_L12M,Tot_TL_closed_L12M,pct_tl_open_L12M,pct_tl_closed_L12M,Tot_Missed_Pmnt,Auto_TL,CC_TL,Consumer_TL,Gold_TL,Home_TL,PL_TL,Secured_TL,Unsecured_TL,Other_TL,Age_Oldest_TL,Age_Newest_TL,time_since_recent_payment,num_times_delinquent,max_recent_level_of_deliq,num_deliq_6mts,num_deliq_12mts,num_deliq_6_12mts,num_times_30p_dpd,num_times_60p_dpd,num_std,num_std_6mts,num_std_12mts,num_sub,num_sub_6mts,num_sub_12mts,num_dbt,num_dbt_6mts,num_dbt_12mts,num_lss,num_lss_6mts,num_lss_12mts,recent_level_of_deliq,tot_enq,CC_enq,CC_enq_L6m,CC_enq_L12m,PL_enq,PL_enq_L6m,PL_enq_L12m,time_since_recent_enq,enq_L12m,enq_L6m,enq_L3m,AGE,NETMONTHLYINCOME,Time_With_Curr_Empr,pct_of_active_TLs_ever,pct_opened_TLs_L6m_of_L12m,pct_currentBal_all_TL,CC_Flag,PL_Flag,pct_PL_enq_L6m_of_L12m,pct_CC_enq_L6m_of_L12m,pct_PL_enq_L6m_of_ever,pct_CC_enq_L6m_of_ever,HL_Flag,GL_Flag,Credit_Score
count,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0
mean,25649.827477,5.26298,2.967383,2.295597,0.812643,0.48992,0.179032,0.097783,0.577452,0.422548,1.672142,0.825504,0.401271,0.160365,0.525746,0.667626,0.145921,1.3624,1.566304,0.076241,0.328,2.921334,2.341646,1.116489,46.498074,13.970046,218.601607,1.742939,14.314758,0.21163,0.548593,0.336963,0.773298,0.438879,9.118343,1.464887,3.279978,0.063831,0.002211,0.009224,0.02451,0.001284,0.004279,0.016713,0.001189,0.003376,11.803918,5.457303,0.485641,0.162277,0.268924,1.174971,0.516927,0.779194,264.854507,3.063189,2.002686,1.230458,33.752472,26929.9,110.345783,0.577452,0.309198,0.883693,0.102962,0.193063,0.219169,0.074833,0.195497,0.064186,0.252235,0.05658,679.326336
std,14844.173396,7.463383,6.141098,2.404086,1.383559,1.05892,0.278043,0.210957,0.36611,0.36611,2.249543,1.537208,0.381266,0.258831,1.106442,0.952677,0.549314,2.394966,5.500184,0.358582,0.916368,6.379764,3.405397,2.486801,42.10923,18.835191,422.282417,4.390599,54.056303,0.75794,1.625512,1.097356,2.860464,2.1484,21.514144,3.375811,7.566312,0.799989,0.081704,0.220786,0.62189,0.072637,0.184461,0.573762,0.08331,0.204293,46.422091,6.308943,1.710479,0.681683,1.019459,2.380981,1.37324,1.802092,466.585002,4.299207,3.165782,2.069461,8.774652,20843.0,75.629967,0.36611,0.400555,40.622275,0.303913,0.394707,0.3951,0.250658,0.367414,0.225989,0.4343,0.231042,21.133619
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,469.0
25%,12776.75,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,4.0,51.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,1.0,0.0,0.0,27.0,18000.0,61.0,0.286,0.0,0.152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,668.0
50%,25706.5,3.0,1.0,2.0,0.0,0.0,0.0,0.0,0.545,0.455,1.0,0.0,0.333,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,34.0,7.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,79.0,2.0,1.0,1.0,32.0,24000.0,92.0,0.545,0.0,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,679.0
75%,38518.25,6.0,3.0,3.0,1.0,1.0,0.333,0.1,1.0,0.714,2.0,1.0,0.714,0.25,1.0,1.0,0.0,2.0,1.0,0.0,0.0,3.0,3.0,1.0,65.0,16.0,146.0,1.0,15.0,0.0,0.0,0.0,0.0,0.0,8.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,7.0,0.0,0.0,0.0,1.0,0.0,1.0,302.0,4.0,3.0,2.0,39.0,31000.0,131.0,1.0,0.625,0.86,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,690.0
max,51336.0,235.0,216.0,47.0,27.0,19.0,1.0,1.0,1.0,1.0,34.0,33.0,1.0,1.0,34.0,27.0,27.0,41.0,235.0,10.0,29.0,235.0,55.0,80.0,385.0,359.0,6065.0,74.0,900.0,12.0,28.0,20.0,60.0,52.0,422.0,58.0,122.0,41.0,5.0,12.0,35.0,6.0,12.0,72.0,12.0,30.0,900.0,176.0,42.0,17.0,24.0,46.0,44.0,44.0,4768.0,87.0,66.0,42.0,67.0,2500000.0,1020.0,1.0,1.0,6327.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,809.0


In [221]:
# check categorical columns

for i in df.columns:
    if df[i].dtype == 'object':
        print(i)

MARITALSTATUS
EDUCATION
GENDER
last_prod_enq2
first_prod_enq2
Approved_Flag


In [222]:
print(df['MARITALSTATUS'].unique())
print(df['EDUCATION'].unique())
print(df['GENDER'].unique())
print(df['last_prod_enq2'].unique())
print(df['first_prod_enq2'].unique())
print(df['Approved_Flag'].unique())

['Married' 'Single']
['12TH' 'GRADUATE' 'SSC' 'POST-GRADUATE' 'UNDER GRADUATE' 'OTHERS'
 'PROFESSIONAL']
['M' 'F']
['PL' 'ConsumerLoan' 'AL' 'CC' 'others' 'HL']
['PL' 'ConsumerLoan' 'others' 'AL' 'HL' 'CC']
['P2' 'P1' 'P3' 'P4']


In [96]:
# chi-square test 

# for i in ['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']:
#     chi2, pval, _, _ = chi2_contingency(pd.crosstab(df[i], df['Approved_Flag']))
#     print(i, "---", pval)

MARITALSTATUS --- 3.5781808610388605e-233
EDUCATION --- 2.6942265249737532e-30
GENDER --- 1.9079361001865664e-05
last_prod_enq2 --- 0.0
first_prod_enq2 --- 7.849976105554191e-287


In [223]:
def cramers_v(confusion_matrix):
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    r, k = confusion_matrix.shape
    return np.sqrt(chi2 / (n * (min(r, k) - 1)))

cols = ['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']

for col in cols:
    ct = pd.crosstab(df[col], df['Approved_Flag'])
    chi2, pval, _, _ = chi2_contingency(ct)
    v = cramers_v(ct)
    print(f"{col:20s} | p-value: {pval:.4e} | Cramér’s V: {v:.4f}")

MARITALSTATUS        | p-value: 3.5782e-233 | Cramér’s V: 0.1600
EDUCATION            | p-value: 2.6942e-30 | Cramér’s V: 0.0386
GENDER               | p-value: 1.9079e-05 | Cramér’s V: 0.0242
last_prod_enq2       | p-value: 0.0000e+00 | Cramér’s V: 0.1392
first_prod_enq2      | p-value: 7.8500e-287 | Cramér’s V: 0.1049


In [224]:
# all values less than equal to 0.05, we can accept all 
# but cramers v is less than 0.05 for eduaction and gender, so remove those 
# associated with target variable 

# check with both columns - xgboost - 77.26
# check with education - 77.10
# removed both - 77.57

In [225]:
# check numerical columns

num_cols = []

for i in df.columns:
    if df[i].dtype != 'object' and i not in ['PROSPECTID', 'Approved_Flag']:
        num_cols.append(i)

num_cols

['Total_TL',
 'Tot_Closed_TL',
 'Tot_Active_TL',
 'Total_TL_opened_L6M',
 'Tot_TL_closed_L6M',
 'pct_tl_open_L6M',
 'pct_tl_closed_L6M',
 'pct_active_tl',
 'pct_closed_tl',
 'Total_TL_opened_L12M',
 'Tot_TL_closed_L12M',
 'pct_tl_open_L12M',
 'pct_tl_closed_L12M',
 'Tot_Missed_Pmnt',
 'Auto_TL',
 'CC_TL',
 'Consumer_TL',
 'Gold_TL',
 'Home_TL',
 'PL_TL',
 'Secured_TL',
 'Unsecured_TL',
 'Other_TL',
 'Age_Oldest_TL',
 'Age_Newest_TL',
 'time_since_recent_payment',
 'num_times_delinquent',
 'max_recent_level_of_deliq',
 'num_deliq_6mts',
 'num_deliq_12mts',
 'num_deliq_6_12mts',
 'num_times_30p_dpd',
 'num_times_60p_dpd',
 'num_std',
 'num_std_6mts',
 'num_std_12mts',
 'num_sub',
 'num_sub_6mts',
 'num_sub_12mts',
 'num_dbt',
 'num_dbt_6mts',
 'num_dbt_12mts',
 'num_lss',
 'num_lss_6mts',
 'num_lss_12mts',
 'recent_level_of_deliq',
 'tot_enq',
 'CC_enq',
 'CC_enq_L6m',
 'CC_enq_L12m',
 'PL_enq',
 'PL_enq_L6m',
 'PL_enq_L12m',
 'time_since_recent_enq',
 'enq_L12m',
 'enq_L6m',
 'enq_L3m',

In [226]:
# VIF sequentially check

vif_data = df[num_cols]
total_cols = vif_data.shape[1]
cols_to_keep = []
col_index = 0

for i in range(0, total_cols):
    vif_value = variance_inflation_factor(vif_data, col_index)
    print(col_index, "---", vif_value)

    if vif_value <= 6:
        cols_to_keep.append(num_cols[i])
        col_index = col_index + 1

    else: 
        vif_data = vif_data.drop(num_cols[i], axis=1)


0 --- inf
0 --- inf
0 --- 11.320180023967982
0 --- 8.36369803500036
0 --- 6.5206478777909425
0 --- 5.14950161821261
1 --- 2.611111040579735
2 --- inf
2 --- 1788.7926256209232
2 --- 8.601028256477212
2 --- 3.832800792153082
3 --- 6.0996533816466405
3 --- 5.581352009642814
4 --- 1.9855843530987702
5 --- inf
5 --- 4.809538302819332
6 --- 23.270628983464636
6 --- 30.595522588099946
6 --- 4.384346405965575
7 --- 3.0646584155234122
8 --- 2.898639771299225
9 --- 4.377876915347337
10 --- 2.2078535836958486
11 --- 4.916914200506877
12 --- 5.214702030064743
13 --- 3.3861625024231516
14 --- 7.84058330947899
14 --- 5.255034641721459
15 --- inf
15 --- 7.380634506427207
15 --- 1.421005001517572
16 --- 8.083255010190301
16 --- 1.6241227524040012
17 --- 7.257811920140015
17 --- 15.596243832683006
17 --- 1.825857047132431
18 --- 1.5080839450032724
19 --- 2.1720888348245815
20 --- 2.6233975535272367
21 --- 2.2959970812106216
22 --- 7.360578319196457
22 --- 2.1602387773102514
23 --- 2.8686288267891493
24

In [227]:
len(cols_to_keep)

39

In [125]:
# check anova

# cols_to_keep_num = []

# for i in cols_to_keep:
#     a = list(df[i])
#     b = list(df['Approved_Flag'])

#     group_P1 = [value for value, group in zip(a, b) if group == 'P1']
#     group_P2 = [value for value, group in zip(a, b) if group == 'P2']
#     group_P3 = [value for value, group in zip(a, b) if group == 'P3']
#     group_P4 = [value for value, group in zip(a, b) if group == 'P4']

#     f_statistics, p_value = f_oneway(group_P1, group_P2, group_P3, group_P4)

#     if p_value <= 0.05:
#         cols_to_keep_num.append(i)


# len(cols_to_keep_num)

37

In [228]:
import numpy as np
import pandas as pd
from scipy.stats import kruskal

def kruskal_eps2(df, feature, target):
    """
    Kruskal-Wallis test + epsilon-squared effect size for multi-class targets.
    epsilon^2 ~ proportion of variability explained (nonparametric analogue).
    """
    d = df[[feature, target]].dropna()

    # Build samples per class
    groups = [g[feature].values for _, g in d.groupby(target, sort=False)]
    k = len(groups)
    n = len(d)

    # Need at least 2 non-empty groups
    if k < 2 or n == 0:
        return np.nan, np.nan, np.nan, n, k

    H, p = kruskal(*groups)

    # Epsilon-squared for Kruskal-Wallis:
    # eps^2 = (H - k + 1) / (n - k)
    # Clamp at 0 to avoid tiny negatives due to floating error
    denom = (n - k)
    eps2 = (H - k + 1) / denom if denom > 0 else np.nan
    if not np.isnan(eps2):
        eps2 = max(0.0, eps2)

    return H, p, eps2, n, k


def run_kruskal_multiclass_report(df, target_col, numeric_cols):
    rows = []
    for col in numeric_cols:
        H, p, eps2, n, k = kruskal_eps2(df, col, target_col)
        rows.append({
            "feature": col,
            "H_stat": H,
            "p_value": p,
            "epsilon_sq": eps2,
            "n_nonnull": n,
            "n_classes": k
        })

    out = pd.DataFrame(rows).sort_values(["p_value", "epsilon_sq"], ascending=[True, False])
    return out.reset_index(drop=True)


# ---- Usage ----
report = run_kruskal_multiclass_report(df, "Approved_Flag", cols_to_keep)

report


Unnamed: 0,feature,H_stat,p_value,epsilon_sq,n_nonnull,n_classes
0,enq_L3m,12633.712184,0.0,0.300302,42064,4
1,Age_Oldest_TL,9390.306137,0.0,0.223188,42064,4
2,pct_PL_enq_L6m_of_ever,7281.843002,0.0,0.173059,42064,4
3,time_since_recent_enq,7121.527996,0.0,0.169247,42064,4
4,num_std_12mts,5665.151268,0.0,0.134621,42064,4
5,PL_enq_L12m,5300.178109,0.0,0.125943,42064,4
6,Secured_TL,3443.148288,0.0,0.081791,42064,4
7,recent_level_of_deliq,2081.218221,0.0,0.049411,42064,4
8,pct_CC_enq_L6m_of_ever,1956.247275,0.0,0.04644,42064,4
9,max_recent_level_of_deliq,1943.979023,0.0,0.046148,42064,4


In [229]:
feat = report.loc[report['epsilon_sq'] > 0.01, 'feature'].tolist()
feat

['enq_L3m',
 'Age_Oldest_TL',
 'pct_PL_enq_L6m_of_ever',
 'time_since_recent_enq',
 'num_std_12mts',
 'PL_enq_L12m',
 'Secured_TL',
 'recent_level_of_deliq',
 'pct_CC_enq_L6m_of_ever',
 'max_recent_level_of_deliq',
 'Other_TL',
 'CC_enq_L12m',
 'Home_TL',
 'GL_Flag',
 'pct_tl_open_L6M',
 'Time_With_Curr_Empr',
 'HL_Flag',
 'Age_Newest_TL',
 'Tot_Missed_Pmnt',
 'PL_TL',
 'Unsecured_TL',
 'PL_Flag',
 'num_times_60p_dpd',
 'num_deliq_6_12mts']

In [230]:
# listing all the final features

features = feat + ['MARITALSTATUS', 'last_prod_enq2', 'first_prod_enq2']
df = df[features + ['Approved_Flag']]
df.shape

(42064, 28)

In [196]:
# ordinal encoding for education

# df.loc[df['EDUCATION']=='SSC', ['EDUCATION']] = 1
# df.loc[df['EDUCATION']=='12TH', ['EDUCATION']] = 2
# df.loc[df['EDUCATION']=='UNDER GRADUATE', ['EDUCATION']] = 3
# df.loc[df['EDUCATION']=='GRADUATE', ['EDUCATION']] = 3
# df.loc[df['EDUCATION']=='POST-GRADUATE', ['EDUCATION']] = 4
# df.loc[df['EDUCATION']=='PROFESSIONAL', ['EDUCATION']] = 3
# df.loc[df['EDUCATION']=='OTHERS', ['EDUCATION']] = 1

# df['EDUCATION'] = df['EDUCATION'].astype(int)
# df['EDUCATION'].value_counts()

In [231]:
# one-hot encoding for other categorical columns

df_encoded = pd.get_dummies(df, columns=['MARITALSTATUS', 'last_prod_enq2', 'first_prod_enq2'], drop_first=True)
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42064 entries, 0 to 42063
Data columns (total 36 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   enq_L3m                       42064 non-null  int64  
 1   Age_Oldest_TL                 42064 non-null  int64  
 2   pct_PL_enq_L6m_of_ever        42064 non-null  float64
 3   time_since_recent_enq         42064 non-null  int64  
 4   num_std_12mts                 42064 non-null  int64  
 5   PL_enq_L12m                   42064 non-null  int64  
 6   Secured_TL                    42064 non-null  int64  
 7   recent_level_of_deliq         42064 non-null  int64  
 8   pct_CC_enq_L6m_of_ever        42064 non-null  float64
 9   max_recent_level_of_deliq     42064 non-null  int64  
 10  Other_TL                      42064 non-null  int64  
 11  CC_enq_L12m                   42064 non-null  int64  
 12  Home_TL                       42064 non-null  int64  
 13  G

In [239]:
df_encoded.columns

Index(['enq_L3m', 'Age_Oldest_TL', 'pct_PL_enq_L6m_of_ever',
       'time_since_recent_enq', 'num_std_12mts', 'PL_enq_L12m', 'Secured_TL',
       'recent_level_of_deliq', 'pct_CC_enq_L6m_of_ever',
       'max_recent_level_of_deliq', 'Other_TL', 'CC_enq_L12m', 'Home_TL',
       'GL_Flag', 'pct_tl_open_L6M', 'Time_With_Curr_Empr', 'HL_Flag',
       'Age_Newest_TL', 'Tot_Missed_Pmnt', 'PL_TL', 'Unsecured_TL', 'PL_Flag',
       'num_times_60p_dpd', 'num_deliq_6_12mts', 'Approved_Flag',
       'MARITALSTATUS_Single', 'last_prod_enq2_CC',
       'last_prod_enq2_ConsumerLoan', 'last_prod_enq2_HL', 'last_prod_enq2_PL',
       'last_prod_enq2_others', 'first_prod_enq2_CC',
       'first_prod_enq2_ConsumerLoan', 'first_prod_enq2_HL',
       'first_prod_enq2_PL', 'first_prod_enq2_others'],
      dtype='object')

In [232]:
# random forest

x = df_encoded.drop(['Approved_Flag'], axis=1)
y = df_encoded['Approved_Flag']

x_trian, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)

rf_classifier.fit(x_trian, y_train)

y_pred = rf_classifier.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print()
print(f'Accuracy = {accuracy}')
print()
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(f'Class {v}')
    print(f'Precision = {precision[i]}')
    print(f'Recall = {recall[i]}')
    print(f'F1_score = {f1_score[i]}')
    print()



Accuracy = 0.764174491857839

Class p1
Precision = 0.8373702422145328
Recall = 0.7159763313609467
F1_score = 0.7719298245614035

Class p2
Precision = 0.802311540451958
Recall = 0.9219028741328048
F1_score = 0.85795978601734

Class p3
Precision = 0.4347202295552367
Recall = 0.22867924528301886
F1_score = 0.2997032640949555

Class p4
Precision = 0.7119771863117871
Recall = 0.7278911564625851
F1_score = 0.7198462277751081



In [233]:
# decision tree

x = df_encoded.drop(['Approved_Flag'], axis=1)
y = df_encoded['Approved_Flag']

x_trian, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

dt_model = DecisionTreeClassifier(max_depth=20, min_samples_split=10)

dt_model.fit(x_trian, y_train)

y_pred = dt_model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print()
print(f'Accuracy = {accuracy}')
print()
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(f'Class {v}')
    print(f'Precision = {precision[i]}')
    print(f'Recall = {recall[i]}')
    print(f'F1_score = {f1_score[i]}')
    print()


Accuracy = 0.7190062997741591

Class p1
Precision = 0.7224926971762414
Recall = 0.7317554240631163
F1_score = 0.7270945614894659

Class p2
Precision = 0.8159735717061796
Recall = 0.8323092170465808
F1_score = 0.8240604454911196

Class p3
Precision = 0.36356209150326796
Recall = 0.33584905660377357
F1_score = 0.34915653197332286

Class p4
Precision = 0.6525590551181102
Recall = 0.6443148688046647
F1_score = 0.6484107579462103



In [234]:
# xgboost

xgb_classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=4)

x = df_encoded.drop(['Approved_Flag'], axis=1)
y = df_encoded['Approved_Flag']

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

x_trian, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

xgb_classifier.fit(x_trian, y_train)

y_pred = xgb_classifier.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print()
print(f'Accuracy = {accuracy}')
print()
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(f'Class {v}')
    print(f'Precision = {precision[i]}')
    print(f'Recall = {recall[i]}')
    print(f'F1_score = {f1_score[i]}')
    print()


Accuracy = 0.7722572209675502

Class p1
Precision = 0.8286637931034483
Recall = 0.7583826429980276
F1_score = 0.791967044284243

Class p2
Precision = 0.8232441171187354
Recall = 0.9084241823587711
F1_score = 0.8637391632114587

Class p3
Precision = 0.44680851063829785
Recall = 0.3011320754716981
F1_score = 0.35978358881875566

Class p4
Precision = 0.7278048780487805
Recall = 0.7249757045675413
F1_score = 0.7263875365141188



In [235]:
import optuna
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# -------------------------
# Prepare data ONCE
# -------------------------
y = df_encoded["Approved_Flag"]
X = df_encoded.drop(columns=["Approved_Flag"])

le = LabelEncoder()
y_enc = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc,
    test_size=0.2,
    random_state=42,
    stratify=y_enc
)

# -------------------------
# Optuna objective
# -------------------------
def objective(trial):

    params = {
        "objective": "multi:softmax",
        "num_class": 4,
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 0.9, step=0.2),
        "learning_rate": trial.suggest_categorical("learning_rate", [0.001, 0.01, 0.1, 1.0]),
        "max_depth": trial.suggest_categorical("max_depth", [3, 5, 6]),
        "reg_alpha": trial.suggest_categorical("alpha", [1, 10, 100]),
        "n_estimators": trial.suggest_categorical("n_estimators", [10, 50, 100]),
        "random_state": 42,
        "n_jobs": -1,
        "eval_metric": "mlogloss"
    }

    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)

    train_preds = model.predict(X_train)
    test_preds  = model.predict(X_test)

    train_acc = accuracy_score(y_train, train_preds)
    test_acc  = accuracy_score(y_test, test_preds)

    # store train accuracy for the best trial
    trial.set_user_attr("train_accuracy", train_acc)

    return test_acc

# -------------------------
# Run optimization
# -------------------------
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=200, show_progress_bar=False)

# -------------------------
# Print ONLY best result
# -------------------------
best_trial = study.best_trial

print("Best Test Accuracy :", best_trial.value)
print("Best Train Accuracy:", best_trial.user_attrs["train_accuracy"])
print("Best Parameters    :", best_trial.params)


[32m[I 2026-01-29 01:29:35,604][0m A new study created in memory with name: no-name-da4c7279-c0ad-4dc7-8bad-5cc2a0e06778[0m
[32m[I 2026-01-29 01:29:37,521][0m Trial 0 finished with value: 0.7751099488886247 and parameters: {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 5, 'alpha': 1, 'n_estimators': 100}. Best is trial 0 with value: 0.7751099488886247.[0m
[32m[I 2026-01-29 01:29:38,223][0m Trial 1 finished with value: 0.7519315345298943 and parameters: {'colsample_bytree': 0.30000000000000004, 'learning_rate': 0.1, 'max_depth': 5, 'alpha': 1, 'n_estimators': 50}. Best is trial 0 with value: 0.7751099488886247.[0m
[32m[I 2026-01-29 01:29:38,372][0m Trial 2 finished with value: 0.605016046594556 and parameters: {'colsample_bytree': 0.30000000000000004, 'learning_rate': 0.001, 'max_depth': 3, 'alpha': 100, 'n_estimators': 10}. Best is trial 0 with value: 0.7751099488886247.[0m
[32m[I 2026-01-29 01:29:39,184][0m Trial 3 finished with value: 0.7701176750267443 an

Best Test Accuracy : 0.7786758587899679
Best Train Accuracy: 0.8148049092151793
Best Parameters    : {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 6, 'alpha': 1, 'n_estimators': 100}


In [137]:
# # Apply standard scaler 

# from sklearn.preprocessing import StandardScaler

# columns_to_be_scaled = ['Age_Oldest_TL','Age_Newest_TL','time_since_recent_payment',
# 'max_recent_level_of_deliq','recent_level_of_deliq',
# 'time_since_recent_enq','NETMONTHLYINCOME','Time_With_Curr_Empr']

# for i in columns_to_be_scaled:
#     column_data = df_encoded[i].values.reshape(-1, 1)
#     scaler = StandardScaler()
#     scaled_column = scaler.fit_transform(column_data)
#     df_encoded[i] = scaled_column

# xgb_classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=4)

# x = df_encoded.drop(['Approved_Flag'], axis=1)
# y = df_encoded['Approved_Flag']

# label_encoder = LabelEncoder()
# y_encoded = label_encoder.fit_transform(y)

# x_trian, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

# xgb_classifier.fit(x_trian, y_train)

# y_pred = xgb_classifier.predict(x_test)

# accuracy = accuracy_score(y_test, y_pred)
# print()
# print(f'Accuracy = {accuracy}')
# print()
# precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

# for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
#     print(f'Class {v}')
#     print(f'Precision = {precision[i]}')
#     print(f'Recall = {recall[i]}')
#     print(f'F1_score = {f1_score[i]}')
#     print()

In [138]:
# y = df_encoded["Approved_Flag"]
# X = df_encoded.drop(columns=["Approved_Flag"])

# le = LabelEncoder()
# y_enc = le.fit_transform(y)

# X_train, X_test, y_train, y_test = train_test_split(
#     X, y_enc,
#     test_size=0.2,
#     random_state=42,
#     stratify=y_enc
# )

# # -------------------------
# # Optuna objective
# # -------------------------
# def objective(trial):

#     params = {
#         "objective": "multi:softmax",
#         "num_class": 4,
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 0.9, step=0.2),
#         "learning_rate": trial.suggest_categorical("learning_rate", [0.001, 0.01, 0.1, 1.0]),
#         "max_depth": trial.suggest_categorical("max_depth", [3, 5, 8, 10]),
#         "reg_alpha": trial.suggest_categorical("alpha", [1, 10, 100]),
#         "n_estimators": trial.suggest_categorical("n_estimators", [10, 50, 100]),
#         "random_state": 42,
#         "n_jobs": -1,
#         "eval_metric": "mlogloss"
#     }

#     model = xgb.XGBClassifier(**params)
#     model.fit(X_train, y_train)

#     train_preds = model.predict(X_train)
#     test_preds  = model.predict(X_test)

#     train_acc = accuracy_score(y_train, train_preds)
#     test_acc  = accuracy_score(y_test, test_preds)

#     # store train accuracy for the best trial
#     trial.set_user_attr("train_accuracy", train_acc)

#     return test_acc

# # -------------------------
# # Run optimization
# # -------------------------
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=200, show_progress_bar=False)

# # -------------------------
# # Print ONLY best result
# # -------------------------
# best_trial = study.best_trial

# print("Best Test Accuracy :", best_trial.value)
# print("Best Train Accuracy:", best_trial.user_attrs["train_accuracy"])
# print("Best Parameters    :", best_trial.params)


In [237]:
# predict for unseen data

un = pd.read_excel('/Users/shraddhagupta/Downloads/Unseen_Dataset.xlsx')
un.shape


(100, 42)

In [238]:
# ordinal encoding for education

# un.loc[un['EDUCATION']=='SSC', ['EDUCATION']] = 1
# un.loc[un['EDUCATION']=='12TH', ['EDUCATION']] = 2
# un.loc[un['EDUCATION']=='UNDER GRADUATE', ['EDUCATION']] = 3
# un.loc[un['EDUCATION']=='GRADUATE', ['EDUCATION']] = 3
# un.loc[un['EDUCATION']=='POST-GRADUATE', ['EDUCATION']] = 4
# un.loc[un['EDUCATION']=='PROFESSIONAL', ['EDUCATION']] = 3
# un.loc[un['EDUCATION']=='OTHERS', ['EDUCATION']] = 1

# un['EDUCATION'] = un['EDUCATION'].astype(int)


# one-hot encoding for other categorical columns


df_unseen = pd.get_dummies(un, columns=['MARITALSTATUS', 'last_prod_enq2', 'first_prod_enq2'], drop_first=True)
df_unseen.drop(columns=['EDUCATION', 'GENDER'], inplace=True)
df_unseen.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 48 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   pct_tl_open_L6M               100 non-null    float64
 1   pct_tl_closed_L6M             100 non-null    float64
 2   Tot_TL_closed_L12M            100 non-null    int64  
 3   pct_tl_closed_L12M            100 non-null    float64
 4   Tot_Missed_Pmnt               100 non-null    int64  
 5   CC_TL                         100 non-null    int64  
 6   Home_TL                       100 non-null    int64  
 7   PL_TL                         100 non-null    int64  
 8   Secured_TL                    100 non-null    int64  
 9   Unsecured_TL                  100 non-null    int64  
 10  Other_TL                      100 non-null    int64  
 11  Age_Oldest_TL                 100 non-null    int64  
 12  Age_Newest_TL                 100 non-null    int64  
 13  time_s

In [241]:
df_unseen = df_unseen[['enq_L3m', 'Age_Oldest_TL', 'pct_PL_enq_L6m_of_ever',
       'time_since_recent_enq', 'num_std_12mts', 'PL_enq_L12m', 'Secured_TL',
       'recent_level_of_deliq', 'pct_CC_enq_L6m_of_ever',
       'max_recent_level_of_deliq', 'Other_TL', 'CC_enq_L12m', 'Home_TL',
       'GL_Flag', 'pct_tl_open_L6M', 'Time_With_Curr_Empr', 'HL_Flag',
       'Age_Newest_TL', 'Tot_Missed_Pmnt', 'PL_TL', 'Unsecured_TL', 'PL_Flag',
       'num_times_60p_dpd', 'num_deliq_6_12mts', 
       'MARITALSTATUS_Single', 'last_prod_enq2_CC',
       'last_prod_enq2_ConsumerLoan', 'last_prod_enq2_HL', 'last_prod_enq2_PL',
       'last_prod_enq2_others', 'first_prod_enq2_CC',
       'first_prod_enq2_ConsumerLoan', 'first_prod_enq2_HL',
       'first_prod_enq2_PL', 'first_prod_enq2_others']]

In [242]:
model = xgb.XGBClassifier(objective='multi:softmax', num_class=4, 
                         colsample_bytree=0.5, learning_rate=0.1,
                         max_depth=6, alpha=1, n_estimators=100)

x = df_encoded.drop(['Approved_Flag'], axis=1)
y = df_encoded['Approved_Flag']

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

x_trian, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

model.fit(x_trian, y_train)

y_pred_unseen = model.predict(df_unseen)

df_unseen['Target'] = y_pred_unseen


In [243]:
y_pred = label_encoder.inverse_transform(y_pred_unseen)
df_unseen['Target_v'] = y_pred

df_unseen['Target_v'].value_counts()

Target_v
P2    75
P4    12
P1     7
P3     6
Name: count, dtype: int64

In [32]:
# # Define the hyperparameter grid
# param_grid = {
#   'colsample_bytree': [0.1, 0.3, 0.5, 0.7, 0.9],
#   'learning_rate'   : [0.001, 0.01, 0.1, 1],
#   'max_depth'       : [3, 5, 8, 10],
#   'alpha'           : [1, 10, 100],
#   'n_estimators'    : [10,50,100]
# }

# index = 0

# answers_grid = {
#     'combination'       :[],
#     'train_Accuracy'    :[],
#     'test_Accuracy'     :[],
#     'colsample_bytree'  :[],
#     'learning_rate'     :[],
#     'max_depth'         :[],
#     'alpha'             :[],
#     'n_estimators'      :[]

#     }


# # Loop through each combination of hyperparameters
# for colsample_bytree in param_grid['colsample_bytree']:
#   for learning_rate in param_grid['learning_rate']:
#     for max_depth in param_grid['max_depth']:
#       for alpha in param_grid['alpha']:
#           for n_estimators in param_grid['n_estimators']:
             
#               index = index + 1
             
#               # Define and train the XGBoost model
#               model = xgb.XGBClassifier(objective='multi:softmax',  
#                                        num_class=4,
#                                        colsample_bytree = colsample_bytree,
#                                        learning_rate = learning_rate,
#                                        max_depth = max_depth,
#                                        alpha = alpha,
#                                        n_estimators = n_estimators)
               
       
                     
#               y = df_encoded['Approved_Flag']
#               x = df_encoded. drop ( ['Approved_Flag'], axis = 1 )

#               label_encoder = LabelEncoder()
#               y_encoded = label_encoder.fit_transform(y)


#               x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)


#               model.fit(x_train, y_train)
  

       
#               # Predict on training and testing sets
#               y_pred_train = model.predict(x_train)
#               y_pred_test = model.predict(x_test)
       
       
#               # Calculate train and test results
              
#               train_accuracy =  accuracy_score (y_train, y_pred_train)
#               test_accuracy  =  accuracy_score (y_test , y_pred_test)
              
              
       
#               # Include into the lists
#               answers_grid ['combination']   .append(index)
#               answers_grid ['train_Accuracy']    .append(train_accuracy)
#               answers_grid ['test_Accuracy']     .append(test_accuracy)
#               answers_grid ['colsample_bytree']   .append(colsample_bytree)
#               answers_grid ['learning_rate']      .append(learning_rate)
#               answers_grid ['max_depth']          .append(max_depth)
#               answers_grid ['alpha']              .append(alpha)
#               answers_grid ['n_estimators']       .append(n_estimators)
       
       
#               # Print results for this combination
#               print(f"Combination {index}")
#               print(f"colsample_bytree: {colsample_bytree}, learning_rate: {learning_rate}, max_depth: {max_depth}, alpha: {alpha}, n_estimators: {n_estimators}")
#               print(f"Train Accuracy: {train_accuracy:.2f}")
#               print(f"Test Accuracy : {test_accuracy :.2f}")
#               print("-" * 30)



In [33]:
# # Convert answers_grid into a DataFrame
# answers_df = pd.DataFrame(answers_grid)

# # Find the row with the best (highest) test accuracy
# best_row = answers_df.loc[answers_df['test_Accuracy'].idxmax()]

# print("✅ Best Hyperparameter Combination:")
# print(f"Combination #: {int(best_row['combination'])}")
# print(f"Train Accuracy: {best_row['train_Accuracy']:.4f}")
# print(f"Test Accuracy : {best_row['test_Accuracy']:.4f}")
# print(f"colsample_bytree: {best_row['colsample_bytree']}")
# print(f"learning_rate  : {best_row['learning_rate']}")
# print(f"max_depth      : {int(best_row['max_depth'])}")
# print(f"alpha          : {best_row['alpha']}")
# print(f"n_estimators   : {int(best_row['n_estimators'])}")

