In [6]:
import pandas as pd
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# 1. Load data
df = pd.read_csv(r"c:\Users\subha\Desktop\risk_credit_final\risk_scored_applicants_updated.csv")

# 2. Select the four components of the formula
features = pd.DataFrame({
    "PIS": df["Payment_Irregularity_Score"],
    "one_minus_CoLI": 1 - df["CoLI_ridge"],
    "one_minus_BRI": 1 - df["BRI_ridge"],
    "one_minus_FRI": 1 - df["Financial_Resilience_Index"]
})

# 3. Choose the target you want to approximate (default prob)
y = df["Default_Prob_Final"]      # or df["Default_Prob"]

# 4. Fit Ridge regression without an intercept (formula has no constant term)
alphas = (0.001, 0.01, 0.1, 1, 10, 100)   # search space for regularisation strength
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)

ridge = RidgeCV(alphas=alphas, fit_intercept=False)
ridge.fit(X_scaled, y)
w_std = ridge.coef_
w_orig = w_std / scaler.scale_

print("Weights (standardized):", w_std)
print("Weights (original scale):", w_orig)


# 5. Inspect learned weights
w1, w2, w3, w4 = ridge.coef_
print("Weights:")
print(f"  w1 (PIS)        = {w1:.6f}")
print(f"  w2 (1 - CoLI)   = {w2:.6f}")
print(f"  w3 (1 - BRI)    = {w3:.6f}")
print(f"  w4 (1 - FRI)    = {w4:.6f}")
print(f"Chosen alpha      = {ridge.alpha_}")

# 6. Optional: evaluate fit quality
y_pred = ridge.predict(X_scaled)
rmse = mean_squared_error(y, y_pred) ** 0.5
print("RMSE :", rmse)
print("R^2  :", r2_score(y, y_pred))

Weights (standardized): [0.01019808 0.01173606 0.03441564 0.05057033]
Weights (original scale): [0.42291579 0.76224303 0.71664486 0.15960954]
Weights:
  w1 (PIS)        = 0.010198
  w2 (1 - CoLI)   = 0.011736
  w3 (1 - BRI)    = 0.034416
  w4 (1 - FRI)    = 0.050570
Chosen alpha      = 100.0
RMSE : 0.1234357072628156
R^2  : 0.31216086151165845


In [8]:
import pandas as pd
df = pd.read_csv('risk_scored_applicants_realistic_defaults.csv')
print(df.columns)

Index(['Income', 'Age', 'Dependents', 'Occupation', 'City_Tier', 'Rent',
       'Loan_Repayment', 'Insurance', 'Groceries', 'Transport', 'Eating_Out',
       'Entertainment', 'Utilities', 'Healthcare', 'Education',
       'Miscellaneous', 'Missed_Rent', 'Missed_Utilities', 'Missed_Insurance',
       'Missed_Loan_Repayment', 'Missed_Groceries', 'Missed_Transport',
       'Missed_Eating_Out', 'Missed_Entertainment', 'Missed_Miscellaneous',
       'Missed_Healthcare', 'Missed_Education', 'Missed_Payment_Rate',
       'Payment_Reliability_Score', 'UPI_Remitter_Bank', 'Bank_CAELS_Score',
       'Bank_Risk_Tier', 'Expense_Volatility', 'Utility_Payment_Regularity',
       'Recurring_Payment_Stability', 'Savings_Gap_Index',
       'Payment_Irregularity_Score', 'Financial_Resilience_Index',
       'UPI_Success_Rate', 'CoLI_ridge', 'BRI_ridge', 'RiskScore_raw',
       'Default_Prob', 'Default_Prob_Final', 'Default_Label',
       'Total_Missed_Payments', 'Default_Prob_Ridge', 'Override_Catastroph

In [None]:
import pandas as pd

src = r"c:\Users\subha\Desktop\risk_credit_final\risk_scored_applicants_realistic_defaults.csv"
dst = r"c:\Users\subha\Desktop\risk_credit_final\risk_scored_applicants_minimal.csv"

df = pd.read_csv(src)

# Monthly spend categories
spend_cols = [
    "Rent", "Loan_Repayment", "Insurance", "Groceries", "Transport",
    "Eating_Out", "Entertainment", "Utilities", "Healthcare", "Education", "Miscellaneous"
]

# Missed-payment features you simulated
missed_cols = [
    "Missed_Rent", "Missed_Utilities", "Missed_Insurance", "Missed_Loan_Repayment",
    "Missed_Groceries", "Missed_Transport", "Missed_Eating_Out", "Missed_Entertainment",
    "Missed_Miscellaneous", "Missed_Healthcare", "Missed_Education",
    "Missed_Payment_Rate"
]

# Keep essentials + spends + missed + final flag
keep_cols = [
    "Income", "Age", "Dependents", "Occupation", "City_Tier",
    *spend_cols,
    "Payment_Irregularity_Score", "CoLI_ridge", "BRI_ridge", "Financial_Resilience_Index",
    *missed_cols,
    "Default_Prob_Ridge","Default_Label_Calibrated"
]

# Keep only available columns
keep_cols = [c for c in keep_cols if c in df.columns]
df_min = df[keep_cols].copy()

df_min.to_csv(dst, index=False)
print(f"Saved {len(df_min):,} rows to:\n{dst}")
print("Columns kept:", list(df_min.columns))
df_min.head()

Saved 20,000 rows to:
c:\Users\subha\Desktop\risk_credit_final\risk_scored_applicants_minimal.csv
Columns kept: ['Income', 'Age', 'Dependents', 'Occupation', 'City_Tier', 'Rent', 'Loan_Repayment', 'Insurance', 'Groceries', 'Transport', 'Eating_Out', 'Entertainment', 'Utilities', 'Healthcare', 'Education', 'Miscellaneous', 'Payment_Irregularity_Score', 'CoLI_ridge', 'BRI_ridge', 'Financial_Resilience_Index', 'Missed_Rent', 'Missed_Utilities', 'Missed_Insurance', 'Missed_Loan_Repayment', 'Missed_Groceries', 'Missed_Transport', 'Missed_Eating_Out', 'Missed_Entertainment', 'Missed_Miscellaneous', 'Missed_Healthcare', 'Missed_Education', 'Missed_Payment_Rate', 'Default_Prob_Ridge', 'Default_Label_Calibrated']


Unnamed: 0,Income,Age,Dependents,Occupation,City_Tier,Rent,Loan_Repayment,Insurance,Groceries,Transport,...,Missed_Groceries,Missed_Transport,Missed_Eating_Out,Missed_Entertainment,Missed_Miscellaneous,Missed_Healthcare,Missed_Education,Missed_Payment_Rate,Default_Prob_Ridge,Default_Label_Calibrated
0,44637.24964,49,0,Self_Employed,Tier_1,13391.17489,0.0,2206.490129,6658.768341,2636.970696,...,246.709511,134.218197,88.335504,97.384951,69.200856,114.588132,0.0,1.21,0.0,0
1,26858.59659,34,2,Retired,Tier_2,5371.719318,0.0,869.522617,2818.44446,1543.018778,...,132.315396,78.793901,62.504638,88.799302,25.530107,162.48412,125.541304,1.72,0.0,0
2,50367.60508,35,1,Student,Tier_3,7555.140763,4612.103386,2201.80005,6313.222081,3221.396403,...,1247.082631,453.232648,142.831755,369.478983,52.852165,637.305052,0.0,5.14,1.0,0
3,101455.6002,21,0,Self_Employed,Tier_3,15218.34004,6809.441427,4889.418087,14690.14936,7106.130005,...,2089.884236,1504.7972,778.83455,290.848863,460.82143,1539.814171,0.0,3.77,1.0,1
4,24875.28355,52,4,Professional,Tier_2,4975.05671,3112.609398,635.90717,3034.329665,1276.155163,...,122.013088,42.132943,57.585506,53.855413,40.734742,52.256501,147.889424,1.14,0.0,0


In [None]:
""""
spend_cols = [
    "Rent", "Loan_Repayment", "Insurance", "Groceries", "Transport",
    "Eating_Out", "Entertainment", "Utilities", "Healthcare", "Education", "Miscellaneous"
]

# Missed-payment features you simulated
missed_cols = [
    "Missed_Rent", "Missed_Utilities", "Missed_Insurance", "Missed_Loan_Repayment",
    "Missed_Groceries", "Missed_Transport", "Missed_Eating_Out", "Missed_Entertainment",
    "Missed_Miscellaneous", "Missed_Healthcare", "Missed_Education",
    "Missed_Payment_Rate"
]

# Keep essentials + spends + missed + final flag
keep_cols = [
    "Income", "Age", "Dependents", "Occupation", "City_Tier",
    *spend_cols,
    "Payment_Irregularity_Score", "CoLI_ridge", "BRI_ridge", "Financial_Resilience_Index",
    *missed_cols,
    "Default_Prob_Ridge","Default_Label_Calibrated"
]
"""

'"\nspend_cols = [\n    "Rent", "Loan_Repayment", "Insurance", "Groceries", "Transport",\n    "Eating_Out", "Entertainment", "Utilities", "Healthcare", "Education", "Miscellaneous"\n]\n\n# Missed-payment features you simulated\nmissed_cols = [\n    "Missed_Rent", "Missed_Utilities", "Missed_Insurance", "Missed_Loan_Repayment",\n    "Missed_Groceries", "Missed_Transport", "Missed_Eating_Out", "Missed_Entertainment",\n    "Missed_Miscellaneous", "Missed_Healthcare", "Missed_Education",\n    "Missed_Payment_Rate"\n]\n\n# Keep essentials + spends + missed + final flag\nkeep_cols = [\n    "Income", "Age", "Dependents", "Occupation", "City_Tier",\n    *spend_cols,\n    "Payment_Irregularity_Score", "CoLI_ridge", "BRI_ridge", "Financial_Resilience_Index",\n    *missed_cols,\n    "Default_Prob_Ridge","Default_Label_Calibrated"\n]\n\n\n'

In [21]:
print(df[["Missed_Rent", "Missed_Utilities", "Missed_Insurance", "Missed_Loan_Repayment",
    "Missed_Groceries", "Missed_Transport", "Missed_Eating_Out", "Missed_Entertainment",
    "Missed_Miscellaneous", "Missed_Healthcare", "Missed_Education",
    "Missed_Payment_Rate","Default_Label_Calibrated"]].corr())

                          Missed_Rent  Missed_Utilities  Missed_Insurance  \
Missed_Rent                  1.000000          0.819707          0.790992   
Missed_Utilities             0.819707          1.000000          0.840806   
Missed_Insurance             0.790992          0.840806          1.000000   
Missed_Loan_Repayment        0.407088          0.437520          0.443869   
Missed_Groceries             0.806188          0.852887          0.835743   
Missed_Transport             0.816194          0.858856          0.829982   
Missed_Eating_Out            0.797039          0.823766          0.807467   
Missed_Entertainment         0.801122          0.814853          0.814506   
Missed_Miscellaneous         0.764431          0.797035          0.794910   
Missed_Healthcare            0.779623          0.810348          0.811550   
Missed_Education             0.370444          0.404073          0.394423   
Missed_Payment_Rate          0.246054          0.320661          0.346148   

In [26]:
import pandas as pd

src = r"c:\Users\subha\Desktop\risk_credit_final\risk_scored_applicants_realistic_defaults.csv"
dst = r"c:\Users\subha\Desktop\risk_credit_final\risk_scored_applicants_minimal.csv"

df = pd.read_csv(src)

# Merge Missed_Education into Missed_Miscellaneous
df['Missed_Miscellaneous'] = df['Missed_Miscellaneous'] + df['Missed_Education']

# Monthly spend categories
spend_cols = [
    "Rent", "Loan_Repayment", "Insurance", "Groceries", "Transport",
    "Eating_Out", "Entertainment", "Utilities", "Healthcare", "Education", "Miscellaneous"
]

# Missed-payment features (excluding Missed_Education)
missed_cols = [
    "Missed_Rent", "Missed_Utilities", "Missed_Insurance", "Missed_Loan_Repayment",
    "Missed_Groceries", "Missed_Transport", "Missed_Eating_Out", "Missed_Entertainment",
    "Missed_Miscellaneous", "Missed_Healthcare"
]

# Keep essentials + spends + missed + final flag
keep_cols = [
    "Income", "Age", "Dependents", "Occupation", "City_Tier",
    *spend_cols,
    "Payment_Irregularity_Score", "CoLI_ridge", "BRI_ridge", "Financial_Resilience_Index",
    *missed_cols,
    "Default_Label_Calibrated"
]

# Keep only available columns
keep_cols = [c for c in keep_cols if c in df.columns]
df_min = df[keep_cols].copy()

df_min.to_csv(dst, index=False)
print(f"Saved {len(df_min):,} rows to:\n{dst}")
print(f"Merged Missed_Education into Missed_Miscellaneous")
print("Columns kept:", list(df_min.columns))
df_min.head()

Saved 20,000 rows to:
c:\Users\subha\Desktop\risk_credit_final\risk_scored_applicants_minimal.csv
Merged Missed_Education into Missed_Miscellaneous
Columns kept: ['Income', 'Age', 'Dependents', 'Occupation', 'City_Tier', 'Rent', 'Loan_Repayment', 'Insurance', 'Groceries', 'Transport', 'Eating_Out', 'Entertainment', 'Utilities', 'Healthcare', 'Education', 'Miscellaneous', 'Payment_Irregularity_Score', 'CoLI_ridge', 'BRI_ridge', 'Financial_Resilience_Index', 'Missed_Rent', 'Missed_Utilities', 'Missed_Insurance', 'Missed_Loan_Repayment', 'Missed_Groceries', 'Missed_Transport', 'Missed_Eating_Out', 'Missed_Entertainment', 'Missed_Miscellaneous', 'Missed_Healthcare', 'Default_Label_Calibrated']


Unnamed: 0,Income,Age,Dependents,Occupation,City_Tier,Rent,Loan_Repayment,Insurance,Groceries,Transport,...,Missed_Utilities,Missed_Insurance,Missed_Loan_Repayment,Missed_Groceries,Missed_Transport,Missed_Eating_Out,Missed_Entertainment,Missed_Miscellaneous,Missed_Healthcare,Default_Label_Calibrated
0,44637.24964,49,0,Self_Employed,Tier_1,13391.17489,0.0,2206.490129,6658.768341,2636.970696,...,349.756077,634.79065,0.0,246.709511,134.218197,88.335504,97.384951,69.200856,114.588132,0
1,26858.59659,34,2,Retired,Tier_2,5371.719318,0.0,869.522617,2818.44446,1543.018778,...,376.242717,188.993699,0.0,132.315396,78.793901,62.504638,88.799302,151.071411,162.48412,0
2,50367.60508,35,1,Student,Tier_3,7555.140763,4612.103386,2201.80005,6313.222081,3221.396403,...,2004.037567,927.681596,4963.988758,1247.082631,453.232648,142.831755,369.478983,52.852165,637.305052,0
3,101455.6002,21,0,Self_Employed,Tier_3,15218.34004,6809.441427,4889.418087,14690.14936,7106.130005,...,3387.497645,3156.001933,7109.821185,2089.884236,1504.7972,778.83455,290.848863,460.82143,1539.814171,1
4,24875.28355,52,4,Professional,Tier_2,4975.05671,3112.609398,635.90717,3034.329665,1276.155163,...,172.563886,142.893591,968.404032,122.013088,42.132943,57.585506,53.855413,188.624166,52.256501,0


In [None]:
import pandas as pd

# Load the data
df = pd.read_csv(r"c:\Users\subha\Desktop\risk_credit_final\risk_scored_applicants_minimal.csv")

# Select missed payment columns + target
corr_cols = [
    "Missed_Rent", "Missed_Utilities", "Missed_Insurance", "Missed_Loan_Repayment",
    "Missed_Groceries", "Missed_Transport", "Missed_Eating_Out", "Missed_Entertainment",
    "Missed_Miscellaneous", "Missed_Healthcare"
    "Default_Label_Calibrated"
]

# Print correlation matrix
print(df[corr_cols].corr())

                          Missed_Rent  Missed_Utilities  Missed_Insurance  \
Missed_Rent                  1.000000          0.819707          0.790992   
Missed_Utilities             0.819707          1.000000          0.840806   
Missed_Insurance             0.790992          0.840806          1.000000   
Missed_Loan_Repayment        0.407088          0.437520          0.443869   
Missed_Groceries             0.806188          0.852887          0.835743   
Missed_Transport             0.816194          0.858856          0.829982   
Missed_Eating_Out            0.797039          0.823766          0.807467   
Missed_Entertainment         0.801122          0.814853          0.814506   
Missed_Miscellaneous         0.516321          0.554034          0.545012   
Missed_Healthcare            0.779623          0.810348          0.811550   
Missed_Payment_Rate          0.246054          0.320661          0.346148   
Default_Label_Calibrated     0.536946          0.479766          0.475818   