<a href="https://colab.research.google.com/github/sanhith25/Credit-risk-modelling-/blob/main/Credit_risk_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from scipy.stats import chi2_contingency, f_oneway
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    precision_recall_fscore_support
)

In [3]:
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")

In [4]:
df1 = pd.read_excel("case_study1.xlsx")
df2 = pd.read_excel("case_study2.xlsx")

In [5]:
df1 = df1.copy()
df2 = df2.copy()

In [6]:
# In df1, remove rows where Age_Oldest_TL is -99999
if "Age_Oldest_TL" in df1.columns:
    df1 = df1.loc[df1["Age_Oldest_TL"] != -99999]

# In df2, drop columns with too many -99999, then rows with remaining -99999
columns_to_drop = []
for col in df2.columns:
    if (df2[col] == -99999).sum() > 10000:
        columns_to_drop.append(col)

df2 = df2.drop(columns_to_drop, axis=1)

for col in df2.columns:
    df2 = df2.loc[df2[col] != -99999]

In [7]:
# 3. MERGE DATASETS ON PROSPECTID
df = pd.merge(df1, df2, on="PROSPECTID", how="inner")
print("Shape after merge:", df.shape)

Shape after merge: (42064, 79)



 CATEGORICAL FEATURE SELECTION (CHI-SQUARE TEST)

In [8]:
categorical_cols = [c for c in df.columns if df[c].dtype == "object"]
print("Categorical columns:", categorical_cols)

cat_candidates = ["MARITALSTATUS", "EDUCATION", "GENDER", "last_prod_enq2", "first_prod_enq2"]
cat_candidates = [c for c in cat_candidates if c in df.columns]

print("\nChi-square p-values vs Approved_Flag:")
useful_cats = []
for col in cat_candidates:
    contingency = pd.crosstab(df[col], df["Approved_Flag"])
    chi2, pval, _, _ = chi2_contingency(contingency)
    print(f"{col}: p-value = {pval:.4f}")
    if pval <= 0.05:
        useful_cats.append(col)

print("Selected categorical features:", useful_cats)

Categorical columns: ['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2', 'Approved_Flag']

Chi-square p-values vs Approved_Flag:
MARITALSTATUS: p-value = 0.0000
EDUCATION: p-value = 0.0000
GENDER: p-value = 0.0000
last_prod_enq2: p-value = 0.0000
first_prod_enq2: p-value = 0.0000
Selected categorical features: ['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']


In [9]:
numeric_cols = [
    c for c in df.columns
    if (df[c].dtype != "object") and (c not in ["PROSPECTID"]) and (c != "Approved_Flag")
]

vif_df = df[numeric_cols].copy()

In [10]:
# Iteratively drop columns with VIF > 6
while True:
    vif_values = pd.Series(
        [variance_inflation_factor(vif_df.values, i) for i in range(vif_df.shape[1])],
        index=vif_df.columns,
        name="VIF"
    )
    max_vif = vif_values.max()
    if max_vif > 6:
        drop_col = vif_values.idxmax()
        print(f"Dropping {drop_col} due to high VIF = {max_vif:.2f}")
        vif_df = vif_df.drop(columns=[drop_col])
    else:
        break

selected_numeric_vif = list(vif_df.columns)
print("\nNumeric features after VIF filtering:", len(selected_numeric_vif))


Dropping Total_TL due to high VIF = inf
Dropping Tot_Closed_TL due to high VIF = inf
Dropping pct_active_tl due to high VIF = inf
Dropping Auto_TL due to high VIF = inf
Dropping num_deliq_6mts due to high VIF = inf
Dropping pct_of_active_TLs_ever due to high VIF = 2688.95
Dropping Secured_TL due to high VIF = 91.07
Dropping enq_L12m due to high VIF = 36.97
Dropping Credit_Score due to high VIF = 33.38
Dropping num_std_12mts due to high VIF = 26.13
Dropping pct_PL_enq_L6m_of_L12m due to high VIF = 24.11
Dropping Total_TL_opened_L12M due to high VIF = 22.14
Dropping Unsecured_TL due to high VIF = 19.94
Dropping pct_CC_enq_L6m_of_L12m due to high VIF = 19.16
Dropping enq_L6m due to high VIF = 16.72
Dropping num_times_30p_dpd due to high VIF = 13.68
Dropping AGE due to high VIF = 12.92
Dropping PL_enq_L12m due to high VIF = 12.43
Dropping Tot_Active_TL due to high VIF = 12.32
Dropping num_dbt_12mts due to high VIF = 9.48
Dropping Tot_TL_closed_L12M due to high VIF = 9.23
Dropping CC_enq_L1

In [11]:
#ANOVA across Approved_Flag groups
columns_to_keep_numerical = []
target = "Approved_Flag"

for col in selected_numeric_vif:
    # building groups per class
    groups = []
    for cls in df[target].unique():
        groups.append(df.loc[df[target] == cls, col].values)

    # min 2 values each
    if all(len(g) > 1 for g in groups):
        f_stat, p_val = f_oneway(*groups)
        if p_val <= 0.05:
            columns_to_keep_numerical.append(col)

print("\nNumeric features after ANOVA:", len(columns_to_keep_numerical))
print(columns_to_keep_numerical)



Numeric features after ANOVA: 43
['Total_TL_opened_L6M', 'Tot_TL_closed_L6M', 'pct_tl_closed_L6M', 'pct_tl_open_L12M', 'pct_tl_closed_L12M', 'Tot_Missed_Pmnt', 'CC_TL', 'Consumer_TL', 'Gold_TL', 'Home_TL', 'PL_TL', 'Other_TL', 'Age_Oldest_TL', 'Age_Newest_TL', 'time_since_recent_payment', 'num_times_delinquent', 'max_recent_level_of_deliq', 'num_deliq_6_12mts', 'num_times_60p_dpd', 'num_std', 'num_std_6mts', 'num_sub', 'num_sub_6mts', 'num_sub_12mts', 'num_dbt', 'num_dbt_6mts', 'num_lss', 'recent_level_of_deliq', 'CC_enq', 'CC_enq_L6m', 'PL_enq', 'PL_enq_L6m', 'time_since_recent_enq', 'enq_L3m', 'NETMONTHLYINCOME', 'Time_With_Curr_Empr', 'pct_opened_TLs_L6m_of_L12m', 'CC_Flag', 'PL_Flag', 'pct_PL_enq_L6m_of_ever', 'pct_CC_enq_L6m_of_ever', 'HL_Flag', 'GL_Flag']


In [12]:
# 6. FINAL FEATURE SET
features = columns_to_keep_numerical + useful_cats
features = list(dict.fromkeys(features))  # removing duplicates

df = df[features + [target]].copy()
print("\nFinal feature set:", len(features))


Final feature set: 48


In [13]:
# 7. ENCODING CATEGORICAL FEATURES
# Clean EDUCATION into ordinal values if present
if "EDUCATION" in df.columns:
    df.loc[df["EDUCATION"] == "SSC", "EDUCATION"] = 1
    df.loc[df["EDUCATION"] == "12TH", "EDUCATION"] = 2
    df.loc[df["EDUCATION"] == "GRADUATE", "EDUCATION"] = 3
    df.loc[df["EDUCATION"] == "UNDER GRADUATE", "EDUCATION"] = 3
    df.loc[df["EDUCATION"] == "POST-GRADUATE", "EDUCATION"] = 4
    df.loc[df["EDUCATION"] == "OTHERS", "EDUCATION"] = 1
    df.loc[df["EDUCATION"] == "PROFESSIONAL", "EDUCATION"] = 3
    df["EDUCATION"] = df["EDUCATION"].astype(int)

In [14]:
# Onehot encoding remaining categoricals except EDUCATION if you treat it as numeric
one_hot_cols = [c for c in useful_cats if c != "EDUCATION"]

df_encoded = pd.get_dummies(df, columns=one_hot_cols)

print("\nEncoded data shape:", df_encoded.shape)
print(df_encoded.head())


Encoded data shape: (42064, 61)
   Total_TL_opened_L6M  Tot_TL_closed_L6M  pct_tl_closed_L6M  \
0                    0                  0                0.0   
1                    0                  0                0.0   
2                    1                  0                0.0   
3                    0                  0                0.0   
4                    0                  0                0.0   

   pct_tl_open_L12M  pct_tl_closed_L12M  Tot_Missed_Pmnt  CC_TL  Consumer_TL  \
0              0.00               0.000                0      0            0   
1              1.00               0.000                0      0            1   
2              0.25               0.000                1      0            6   
3              0.00               0.000                0      0            0   
4              0.00               0.167                0      0            0   

   Gold_TL  Home_TL  ...  last_prod_enq2_ConsumerLoan  last_prod_enq2_HL  \
0        1        0  ... 

In [15]:
y = df_encoded[target]
X = df_encoded.drop(columns=[target])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("\nTrain shape:", X_train.shape, " Test shape:", X_test.shape)


Train shape: (33651, 60)  Test shape: (8413, 60)


# **Random forest**

In [16]:
rf_clf = RandomForestClassifier(n_estimators=200, random_state=42)
rf_clf.fit(X_train, y_train)

y_pred_rf = rf_clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Accuracy: 0.7671460834422917
              precision    recall  f1-score   support

          P1       0.83      0.71      0.77       982
          P2       0.80      0.93      0.86      5090
          P3       0.43      0.21      0.28      1288
          P4       0.74      0.71      0.72      1053

    accuracy                           0.77      8413
   macro avg       0.70      0.64      0.66      8413
weighted avg       0.74      0.77      0.74      8413



# **Decision tree**

In [18]:
dt_clf = DecisionTreeClassifier(max_depth=20, min_samples_split=10, random_state=42)
dt_clf.fit(X_train, y_train)

y_pred_dt = dt_clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))

Accuracy: 0.7080708427433734
              precision    recall  f1-score   support

          P1       0.70      0.71      0.71       982
          P2       0.80      0.83      0.81      5090
          P3       0.32      0.30      0.31      1288
          P4       0.69      0.63      0.66      1053

    accuracy                           0.71      8413
   macro avg       0.63      0.62      0.62      8413
weighted avg       0.70      0.71      0.70      8413



# **XGB**

In [19]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train_xgb, X_test_xgb, y_train_xgb, y_test_xgb = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

xgb_clf = xgb.XGBClassifier(
    objective="multi:softmax",
    num_class=len(label_encoder.classes_),
    random_state=42
)

xgb_clf.fit(X_train_xgb, y_train_xgb)
y_pred_xgb = xgb_clf.predict(X_test_xgb)

print("Accuracy:", accuracy_score(y_test_xgb, y_pred_xgb))
print(classification_report(y_test_xgb, y_pred_xgb, target_names=label_encoder.classes_))

Accuracy: 0.7727326756210626
              precision    recall  f1-score   support

          P1       0.81      0.76      0.79       982
          P2       0.82      0.91      0.86      5090
          P3       0.42      0.28      0.34      1288
          P4       0.75      0.72      0.74      1053

    accuracy                           0.77      8413
   macro avg       0.70      0.67      0.68      8413
weighted avg       0.75      0.77      0.76      8413



# **HYperparamter tuning**

In [20]:
param_grid = {
    "colsample_bytree": [0.3, 0.5],
    "learning_rate": [0.05, 0.1],
    "max_depth": [3, 5],
    "alpha": [1, 10],
    "n_estimators": [50, 100],
}

best_test_acc = 0
best_params = None

results = []

index = 0
for colsample in param_grid["colsample_bytree"]:
    for lr in param_grid["learning_rate"]:
        for md in param_grid["max_depth"]:
            for alpha in param_grid["alpha"]:
                for n_est in param_grid["n_estimators"]:
                    model = xgb.XGBClassifier(
                        objective="multi:softmax",
                        num_class=len(label_encoder.classes_),
                        colsample_bytree=colsample,
                        learning_rate=lr,
                        max_depth=md,
                        alpha=alpha,
                        n_estimators=n_est,
                        random_state=42
                    )

                    model.fit(X_train_xgb, y_train_xgb)

                    y_train_pred = model.predict(X_train_xgb)
                    y_test_pred = model.predict(X_test_xgb)

                    train_acc = accuracy_score(y_train_xgb, y_train_pred)
                    test_acc = accuracy_score(y_test_xgb, y_test_pred)

                    results.append(
                        (
                            index,
                            train_acc,
                            test_acc,
                            colsample,
                            lr,
                            md,
                            alpha,
                            n_est,
                        )
                    )

                    if test_acc > best_test_acc:
                        best_test_acc = test_acc
                        best_params = (colsample, lr, md, alpha, n_est)

                    print(f"Combo {index}: train={train_acc:.3f}  test={test_acc:.3f}")
                    index += 1

Combo 0: train=0.706  test=0.706
Combo 1: train=0.747  test=0.743
Combo 2: train=0.704  test=0.703
Combo 3: train=0.746  test=0.742
Combo 4: train=0.737  test=0.731
Combo 5: train=0.772  test=0.760
Combo 6: train=0.730  test=0.724
Combo 7: train=0.766  test=0.756
Combo 8: train=0.747  test=0.744
Combo 9: train=0.771  test=0.765
Combo 10: train=0.745  test=0.742
Combo 11: train=0.769  test=0.763
Combo 12: train=0.772  test=0.761
Combo 13: train=0.796  test=0.774
Combo 14: train=0.766  test=0.758
Combo 15: train=0.786  test=0.771
Combo 16: train=0.736  test=0.730
Combo 17: train=0.759  test=0.750
Combo 18: train=0.735  test=0.729
Combo 19: train=0.757  test=0.748
Combo 20: train=0.765  test=0.755
Combo 21: train=0.785  test=0.770
Combo 22: train=0.761  test=0.751
Combo 23: train=0.780  test=0.768
Combo 24: train=0.757  test=0.750
Combo 25: train=0.776  test=0.767
Combo 26: train=0.757  test=0.751
Combo 27: train=0.774  test=0.765
Combo 28: train=0.785  test=0.769
Combo 29: train=0.803  t

In [21]:
print("\nBest XGBoost test accuracy:", best_test_acc)
print("Best params (colsample_bytree, learning_rate, max_depth, alpha, n_estimators):")
print(best_params)


Best XGBoost test accuracy: 0.7745156305717342
Best params (colsample_bytree, learning_rate, max_depth, alpha, n_estimators):
(0.5, 0.1, 5, 10, 100)
