# Setup

In [67]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import mean_squared_error as mse

In [68]:
# Location of the loan dataset. "/content" looks like a Google Colab working
# directory (assumption) -- adjust ROOT when running somewhere else.
ROOT = "/content"
data_file = os.path.join(ROOT, "data/Task3and4_Loan_Data.csv")

# Fail with a real exception instead of print + sys.exit(1): inside a notebook
# sys.exit only surfaces a bare SystemExit, whereas FileNotFoundError stops
# "Run All" with a traceback that names the path that could not be found.
if not os.path.exists(data_file):
    raise FileNotFoundError(f"File not found: {data_file}")

df = pd.read_csv(data_file)

# Data Processing (Train, Val, Test)

In [69]:
# Single-feature setup: the FICO score is the only predictor used here.
features = ['fico_score']
target = 'default'

# binary target (default = 1, no default = 0)
X, y = df[features], df[target]

# 20% of the data is held out as the test set (do not use until the very end)
X_train_80pct, X_test_20pct, y_train_80pct, y_test_20pct = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 25% of the remaining 80% goes to validation (20% of all data),
# leaving 60% of all data for model training
X_train_60pct, X_val_20pct, y_train_60pct, y_val_20pct = train_test_split(
    X_train_80pct, y_train_80pct, test_size=0.25, random_state=42
)

# The split frames are selections of df[features]; take explicit copies so that
# adding columns below neither triggers SettingWithCopyWarning nor depends on
# pandas' view-versus-copy behaviour.
X_train_60pct = X_train_60pct.copy()
X_val_20pct = X_val_20pct.copy()
X_test_20pct = X_test_20pct.copy()

# Fit the FICO scaler on the known FICO bounds rather than on the data, so the
# scaling is identical for every split and independent of the sample drawn.
min_score, max_score = 300, 850
scaler_fico = MinMaxScaler(feature_range=(0, 1))
scaler_fico.fit(pd.DataFrame({'fico_score': [min_score, max_score]}))

# Scale the FICO scores using the fitted scaler
X_train_60pct['fico_score_scaled'] = scaler_fico.transform(X_train_60pct[['fico_score']])
X_val_20pct['fico_score_scaled'] = scaler_fico.transform(X_val_20pct[['fico_score']])
X_test_20pct['fico_score_scaled'] = scaler_fico.transform(X_test_20pct[['fico_score']])

In [70]:
# Preview the first five rows of the training split.
X_train_60pct.head(5)

Unnamed: 0,fico_score,fico_score_scaled
8588,647,0.630909
3178,693,0.714545
5200,597,0.54
8889,616,0.574545
5789,564,0.48


# Fico Score Bucketing

## Strategy: Minimize Mean Squared Error (MSE)

$$
\text{MSE} = \frac{1}{n} \sum_{i=1}^{n} \left( Y_i - \hat{Y_i} \right)^2
$$

Where:
- $n$ is the number of borrowers
- $Y_i$ is the observed/true value (credit score)
- $\hat{Y_i}$ is the predicted value (mean of credit score bucket)

In [71]:
NUM_BUCKETS = 10

range_fico = max_score - min_score
bucket_size = range_fico // NUM_BUCKETS
bucket_size_mid = bucket_size / 2


def add_bucket_columns(frame, bucket_means=None):
    """Return a copy of ``frame`` with equal-width FICO bucket columns added.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain a ``fico_score`` column.
    bucket_means : pd.Series, optional
        Mean FICO score per bucket, indexed by bucket number. When omitted the
        means are computed from ``frame`` itself (use this for the training
        split). Pass the training means for validation/test so the predicted
        value is never derived from the data it is evaluated on.

    Returns
    -------
    pd.DataFrame
        ``frame`` plus ``bucket_num``, ``y_true``, and for both the midpoint
        (``*_midp``) and the bucket-mean (``*_mean``) predictor the columns
        ``y_hat_*``, ``delta_y_*`` and ``delta_y_sq_*``.
    """
    scores = frame['fico_score']

    # Equal-width bucket index. Clipping keeps max_score (850) in the last
    # bucket and also covers ranges that are not divisible by NUM_BUCKETS.
    bucket_num = ((scores - min_score) // bucket_size).astype(int).clip(0, NUM_BUCKETS - 1)

    # Y hat can be average of scores in bucket or midpoint of bucket range
    y_hat_midp = bucket_num * bucket_size + bucket_size_mid + min_score
    if bucket_means is None:
        y_hat_mean = scores.groupby(bucket_num).transform('mean')
    else:
        # A bucket that never occurred in the reference split has no mean;
        # fall back to the bucket midpoint for those rows.
        y_hat_mean = bucket_num.map(bucket_means).fillna(y_hat_midp)

    # Residuals are always (prediction - observed score), for every split.
    delta_y_midp = y_hat_midp - scores
    delta_y_mean = y_hat_mean - scores

    return frame.assign(
        bucket_num=bucket_num,
        # we don't need the score twice, but it is easier next to the other vars
        y_true=scores,
        y_hat_midp=y_hat_midp,
        delta_y_midp=delta_y_midp,
        delta_y_sq_midp=delta_y_midp ** 2,
        y_hat_mean=y_hat_mean,
        delta_y_mean=delta_y_mean,
        delta_y_sq_mean=delta_y_mean ** 2,
    )


# Bucket means are learned on the training split only and then reused for
# validation and test.
X_train_60pct = add_bucket_columns(X_train_60pct)
train_bucket_means = X_train_60pct.groupby('bucket_num')['fico_score'].mean()
X_val_20pct = add_bucket_columns(X_val_20pct, bucket_means=train_bucket_means)
X_test_20pct = add_bucket_columns(X_test_20pct, bucket_means=train_bucket_means)

X_train_60pct.head(5)

Unnamed: 0,fico_score,fico_score_scaled,bucket_num,y_true,y_hat_midp,delta_y_midp,delta_y_sq_midp,y_hat_mean,delta_y_mean,delta_y_sq_mean
8588,647,0.630909,6,647,657.5,10.5,110.25,655.951759,8.951759,80.133986
3178,693,0.714545,7,693,712.5,19.5,380.25,706.834888,13.834888,191.404128
5200,597,0.54,5,597,602.5,5.5,30.25,604.511851,7.511851,56.427906
8889,616,0.574545,5,616,602.5,-13.5,182.25,604.511851,-11.488149,131.977567
5789,564,0.48,4,564,547.5,-16.5,272.25,552.806362,-11.193638,125.297523


In [72]:
# Compare the two bucket representatives on the training split: both MSEs are
# measured against the same observed scores.
y_true_train = X_train_60pct['y_true']

MSE_X_train_60pct_midp = mse(y_true_train, X_train_60pct['y_hat_midp'])
print(f"MSE for X_train using bucket midpoint as predicted value: {MSE_X_train_60pct_midp}")

MSE_X_train_60pct_mean = mse(y_true_train, X_train_60pct['y_hat_mean'])
print(f"MSE for X_train using bucket average as predicted value: {MSE_X_train_60pct_mean}")

MSE for X_train using bucket midpoint as predicted value: 254.19866666666667
MSE for X_train using bucket average as predicted value: 237.9354319344827


With simple equal-width buckets, the mean squared error is high for both choices of predicted value. Using the average of the scores in a bucket outperforms using the midpoint of the bucket's range by around 6%.

Next, we will try more sophisticated bucketing methods to reduce the error in the bucketed FICO scores.

## Strategy: Maximize Log-likelihood function

Log-likelihood
$$
LL(b_1, \ldots, b_{r-1}) = \sum_{i=1}^r \left[k_i \ln p_i + (n_i - k_i) \ln(1 - p_i)\right]
$$

Where:
- $r$ is the number of buckets,
- $b_i$ are the bucket boundaries,
- $n_i$ is the number of records in each bucket,
- $k_i$ is the number of defaults in each bucket, and
- $p_i = \frac{k_i}{n_i}$ is the probability of default in the bucket.