In [12]:
import numpy as np
import pandas as pd

In [13]:
# Read Loan_Data.csv (a relative path, so it is resolved against the current
# working directory) and print a short preview as a sanity check.
data = pd.read_csv("Loan_Data.csv")

first_rows = data.head()      # head() with no argument -> first 5 rows
column_index = data.columns   # column labels as a pandas Index

print("Dataset preview:")
print(first_rows)
print("\nColumns:", column_index)



Dataset preview:
   customer_id  credit_lines_outstanding  loan_amt_outstanding  \
0      8153374                         0           5221.545193   
1      7442532                         5           1958.928726   
2      2256073                         0           3363.009259   
3      4885975                         0           4766.648001   
4      4700614                         1           1345.827718   

   total_debt_outstanding       income  years_employed  fico_score  default  
0             3915.471226  78039.38546               5         605        0  
1             8228.752520  26648.43525               2         572        1  
2             2027.830850  65866.71246               4         602        0  
3             2501.730397  74356.88347               5         612        0  
4             1768.826187  23448.32631               6         631        0  

Columns: Index(['customer_id', 'credit_lines_outstanding', 'loan_amt_outstanding',
       'total_debt_outstanding', '

In [14]:
# Step 2: keep only the two columns the rest of the notebook works with.
fico_col = "fico_score"
default_col = "default"

# Narrow the frame to those two columns, then discard every row in which
# either of them is missing.
selected_columns = [fico_col, default_col]
data = data.loc[:, selected_columns].dropna()


In [15]:
# Number of buckets to split the scores into; passed as `q` to pd.qcut below.
n_buckets = 5

In [16]:
# Method 1: Quantization (MSE)
#
# Split the scores into `n_buckets` quantile buckets, represent every score by
# the mean score of its bucket, and report the mean squared error of that
# approximation.

# Sort by score and rebuild the index so rows are laid out in ascending score
# (and therefore ascending bucket) order.
data = data.sort_values(fico_col).reset_index(drop=True)

# labels=False makes qcut return integer bucket codes rather than Interval
# labels; those codes are what the groupby below (and later cells) key on.
data["bucket"] = pd.qcut(data[fico_col], q=n_buckets, labels=False)
bucket_means = data.groupby("bucket")[fico_col].mean()

# Vectorised MSE: map every row's bucket code to that bucket's mean in a single
# pass instead of iterating row by row with iterrows(), which builds a Series
# per row and performs a Python-level lookup for each one.
quantized_score = data["bucket"].map(bucket_means)
mse = ((data[fico_col] - quantized_score) ** 2).mean()

print("\n===== Quantization (MSE) =====")
print("Bucket means (quantized values):\n", bucket_means)
print("Mean Squared Error =", round(mse, 2))
print()



===== Quantization (MSE) =====
Bucket means (quantized values):
 bucket
0    552.611707
1    606.751395
2    638.579688
3    670.346520
4    721.524837
Name: fico_score, dtype: float64
Mean Squared Error = 392.65



In [17]:
# Method 2: Log-Likelihood
#
# For every bucket, estimate the default rate as defaults / records and add
# that bucket's term  k*log(p) + (n - k)*log(1 - p)  to a running total.
log_likelihood = 0
bucket_probs = {}

for bucket_id, members in data.groupby("bucket"):
    n_records = len(members)                  # rows that fell into this bucket
    n_defaults = members[default_col].sum()   # sum of the default column for them

    # Observed default rate; 0.0001 is the fallback used for an empty bucket.
    if n_records > 0:
        default_rate = n_defaults / n_records
    else:
        default_rate = 0.0001

    # Keep the per-bucket rate so it can be printed afterwards.
    bucket_probs[bucket_id] = default_rate

    # A rate outside the open interval (0, 1) is skipped, so np.log is never
    # evaluated at 0.
    if not (0 < default_rate < 1):
        continue

    defaults_term = n_defaults * np.log(default_rate)
    survivors_term = (n_records - n_defaults) * np.log(1 - default_rate)
    log_likelihood += defaults_term + survivors_term

print("===== Log-Likelihood =====")
print("Default probabilities per bucket:\n", bucket_probs)
print("Log-Likelihood =", round(log_likelihood, 2))

===== Log-Likelihood =====
Default probabilities per bucket:
 {0: 0.3985365853658537, 1: 0.21562658548959918, 2: 0.15133232780291603, 3: 0.100150225338007, 4: 0.054189663823381834}
Log-Likelihood = -4321.03
