In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler

In [5]:
# Load the scored applicant dataset.
# The relative path uses forward slashes so it resolves on Windows, macOS and
# Linux alike; a backslash separator ('data\\...') is only understood on Windows.
df = pd.read_csv('data/risk_scored_applicants_updated.csv')
print(df.columns)

Index(['Income', 'Age', 'Dependents', 'Occupation', 'City_Tier', 'Rent',
       'Loan_Repayment', 'Insurance', 'Groceries', 'Transport', 'Eating_Out',
       'Entertainment', 'Utilities', 'Healthcare', 'Education',
       'Miscellaneous', 'Missed_Rent', 'Missed_Utilities', 'Missed_Insurance',
       'Missed_Loan_Repayment', 'Missed_Groceries', 'Missed_Transport',
       'Missed_Eating_Out', 'Missed_Entertainment', 'Missed_Miscellaneous',
       'Missed_Healthcare', 'Missed_Education', 'Missed_Payment_Rate',
       'Payment_Reliability_Score', 'UPI_Remitter_Bank', 'Bank_CAELS_Score',
       'Bank_Risk_Tier', 'Expense_Volatility', 'Utility_Payment_Regularity',
       'Recurring_Payment_Stability', 'Savings_Gap_Index',
       'Payment_Irregularity_Score', 'Financial_Resilience_Index',
       'UPI_Success_Rate', 'CoLI_ridge', 'BRI_ridge', 'RiskScore_raw',
       'Default_Prob', 'Default_Prob_Final', 'Default_Label',
       'Total_Missed_Payments'],
      dtype='object')


In [9]:
# Columns to drop from the applicant dataset.
columns_to_remove = [
    'Expense_Volatility',
    'Utility_Payment_Regularity',
    'Recurring_Payment_Stability',
    'Savings_Gap_Index',
    'Payment_Irregularity_Score',
    'Financial_Resilience_Index',
]

# errors='ignore' skips any listed column that is already absent instead of
# raising, which keeps this cell re-runnable after the file has been rewritten.
df = df.drop(columns=columns_to_remove, errors='ignore')

# Save back to the same file.
# NOTE: this overwrites the CSV the notebook loads from, so a later
# "Restart & Run All" starts from the already-trimmed data.
# Forward slashes keep the path valid on every OS; with a backslash separator a
# POSIX system would write a file literally named 'data\risk_...csv'.
df.to_csv('data/risk_scored_applicants_updated.csv', index=False)

print(f"Updated dataset shape: {df.shape}")
print(f"Columns remaining: {len(df.columns)}")
print("\nDataset has been updated and saved!")

Updated dataset shape: (20000, 19)
Columns remaining: 19

Dataset has been updated and saved!


In [10]:
# Preview the first five rows. Leaving the frame as the cell's last expression
# lets Jupyter render it as a table rather than as wrapped plain text from print().
df.head()

         Income  Age  Dependents     Occupation City_Tier          Rent  \
0   44637.24964   49           0  Self_Employed    Tier_1  13391.174890   
1   26858.59659   34           2        Retired    Tier_2   5371.719318   
2   50367.60508   35           1        Student    Tier_3   7555.140763   
3  101455.60020   21           0  Self_Employed    Tier_3  15218.340040   
4   24875.28355   52           4   Professional    Tier_2   4975.056710   

   Loan_Repayment    Insurance     Groceries    Transport   Eating_Out  \
0        0.000000  2206.490129   6658.768341  2636.970696  1651.801726   
1        0.000000   869.522617   2818.444460  1543.018778   649.378103   
2     4612.103386  2201.800050   6313.222081  3221.396403  1513.814376   
3     6809.441427  4889.418087  14690.149360  7106.130005  5040.249158   
4     3112.609398   635.907170   3034.329665  1276.155163   692.827225   

   Entertainment    Utilities   Healthcare    Education  Miscellaneous  \
0    1536.184255  2911.792231 

# Fix Inverted Bank_Risk_Tier Labels

The CAELS scores are correct, but the Bank_Risk_Tier labels are inverted. The intended mapping is:
- High CAELS (0.8-1.0) → Low Risk
- Medium CAELS (0.3-0.8) → Medium Risk
- Low CAELS (0.0-0.3) → High Risk

The current labels are backwards, so we'll re-derive them from the actual scores.

In [11]:
# Summarise the CAELS score within each currently stored tier label
# (mean / spread / range / row count) so any label-score mismatch is visible.
print("Current Bank_Risk_Tier vs CAELS Score:")
print(
    df.groupby('Bank_Risk_Tier')['Bank_CAELS_Score']
      .agg(['mean', 'std', 'min', 'max', 'count'])
)
print("\nProblem: High CAELS scores are labeled 'High Risk' (should be 'Low Risk')")

Current Bank_Risk_Tier vs CAELS Score:
                    mean       std    min    max  count
Bank_Risk_Tier                                         
High Risk       0.900774  0.068776  0.812  1.000   7163
Low Risk        0.104138  0.071665  0.000  0.221   6324
Medium Risk     0.622741  0.161552  0.276  0.805   6513

Problem: High CAELS scores are labeled 'High Risk' (should be 'Low Risk')


In [12]:
# Fix the Bank_Risk_Tier labels based on actual CAELS scores
#   High CAELS   (>= 0.8)        = Low Risk    (good bank)
#   Medium CAELS (>= 0.3, < 0.8) = Medium Risk
#   Low CAELS    (< 0.3)         = High Risk   (poor bank)

def correct_bank_risk_tier(caels_score, low_risk_min=0.8, medium_risk_min=0.3):
    """Map a single CAELS score to its bank risk-tier label.

    Parameters
    ----------
    caels_score : float
        CAELS score of one bank; a higher score denotes a better bank.
    low_risk_min : float, default 0.8
        Smallest score that is labelled 'Low Risk' (inclusive).
    medium_risk_min : float, default 0.3
        Smallest score that is labelled 'Medium Risk' (inclusive).

    Returns
    -------
    str or float
        'Low Risk', 'Medium Risk' or 'High Risk'; NaN when the score is
        missing (None / NaN).
    """
    # A missing score fails both comparisons below and would otherwise be
    # silently labelled 'High Risk' (or raise for None); keep it missing.
    if pd.isna(caels_score):
        return np.nan
    if caels_score >= low_risk_min:
        return 'Low Risk'
    if caels_score >= medium_risk_min:
        return 'Medium Risk'
    return 'High Risk'

# Recompute every row's tier from its CAELS score, replacing the stored labels.
df['Bank_Risk_Tier'] = df['Bank_CAELS_Score'].apply(correct_bank_risk_tier)

# Repeat the per-tier summary so the relabelled tiers can be compared with the scores.
print("\nCorrected Bank_Risk_Tier vs CAELS Score:")
print(
    df.groupby('Bank_Risk_Tier')['Bank_CAELS_Score']
      .agg(['mean', 'std', 'min', 'max', 'count'])
)
print("\n✓ Fixed: High CAELS → Low Risk, Low CAELS → High Risk")


Corrected Bank_Risk_Tier vs CAELS Score:
                    mean       std    min    max  count
Bank_Risk_Tier                                         
High Risk       0.120846  0.085025  0.000  0.276   7005
Low Risk        0.879641  0.073093  0.800  1.000   9127
Medium Risk     0.592482  0.073725  0.521  0.698   3868

✓ Fixed: High CAELS → Low Risk, Low CAELS → High Risk


In [15]:
# Save the corrected dataset.
# NOTE: this overwrites the CSV the notebook loads from.
# The path uses forward slashes so it is valid on every OS; with a backslash
# separator a POSIX system would write a file literally named 'data\risk_...csv'.
output_path = 'data/risk_scored_applicants_updated.csv'
df.to_csv(output_path, index=False)
# Report the path that was actually written rather than a separately typed copy.
print(f"✓ Corrected dataset saved to '{output_path}'")

✓ Corrected dataset saved to 'data\risk_scored_applicants_updated.csv'
