# Setup

In [217]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import mean_squared_error as mse

In [218]:
ROOT = "/content"
data_file = os.path.join(ROOT, "data/Task3and4_Loan_Data.csv")
if not os.path.exists(data_file):
    print("File not found")
    sys.exit(1)

df = pd.read_csv(data_file)

# Data Processing (Train, Val, Test)

In [219]:
# All features
features = ['fico_score']
target = 'default'

# binary target (default = 1, no default = 0)
X, y = df[features], df[target]

# 20% data allocated to test (don't use til absolute end)
X_train_80pct, X_test_20pct, y_train_80pct, y_test_20pct = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 25% of the 80% train data to model validation (20% overall data)
# 60% of data for model training
X_train_60pct, X_val_20pct, y_train_60pct, y_val_20pct = train_test_split(
    X_train_80pct, y_train_80pct, test_size=0.25, random_state=42
)

"""
omit scaling between 0 and 1 for now
# Fit the FICO scaler on the known FICO bounds
min_score, max_score = 300, 850
scaler_fico = MinMaxScaler(feature_range=(0, 1))
scaler_fico.fit(pd.DataFrame({'fico_score': [300, 850]}))

# Scale the FICO scores using the fitted scaler
X_train_60pct['fico_score_scaled'] = scaler_fico.transform(X_train_60pct[['fico_score']])
X_val_20pct['fico_score_scaled'] =   scaler_fico.transform(X_val_20pct[['fico_score']])
X_test_20pct['fico_score_scaled'] =  scaler_fico.transform(X_test_20pct[['fico_score']])
"""

"\nomit scaling between 0 and 1 for now\n# Fit the FICO scaler on the known FICO bounds\nmin_score, max_score = 300, 850\nscaler_fico = MinMaxScaler(feature_range=(0, 1))\nscaler_fico.fit(pd.DataFrame({'fico_score': [300, 850]}))\n\n# Scale the FICO scores using the fitted scaler\nX_train_60pct['fico_score_scaled'] = scaler_fico.transform(X_train_60pct[['fico_score']])\nX_val_20pct['fico_score_scaled'] =   scaler_fico.transform(X_val_20pct[['fico_score']])\nX_test_20pct['fico_score_scaled'] =  scaler_fico.transform(X_test_20pct[['fico_score']])\n"

In [220]:
X_train_60pct.head()

Unnamed: 0,fico_score
8588,647
3178,693
5200,597
8889,616
5789,564


# Fico Score Bucketing

## Strategy: Minimize Mean Squared Error (MSE)

$$
\text{MSE} = \frac{1}{n} \sum_{i=1}^{n} \left( Y_i - \hat{Y_i} \right)^2
$$

Where:
- $n$ is the number of borrowers
- $Y_i$ is the observed/true value (credit score)
- $\hat{Y_i}$ is the predicted value (mean of credit score bucket)

In [221]:
NUM_BUCKETS = 10

min_score, max_score = 300, 850
range_fico = max_score - min_score
bucket_size = range_fico // NUM_BUCKETS
bucket_size_mid = bucket_size / 2

# Y hat can be average of scores in bucket or midpoint of bucket range
# we don't see any scores in bucket 0 because our min score is 409 and 300-354 is below that
# 355-409 (bucket #1) caputures the min score in our training data
X_train_60pct['bucket_num'] = ((X_train_60pct['fico_score'] - min_score) // bucket_size).astype(int)
X_val_20pct['bucket_num'] =   ((X_val_20pct['fico_score'] - min_score) // bucket_size).astype(int)
X_test_20pct['bucket_num'] =  ((X_test_20pct['fico_score'] - min_score) // bucket_size).astype(int)

# Handle the edge case where FICO score is exactly max_score (850) (put in highest bucket)
X_train_60pct.loc[X_train_60pct['fico_score'] == max_score, 'bucket_num'] = NUM_BUCKETS - 1
X_val_20pct.loc[X_val_20pct['fico_score'] ==     max_score, 'bucket_num'] = NUM_BUCKETS - 1
X_test_20pct.loc[X_test_20pct['fico_score'] ==   max_score, 'bucket_num'] = NUM_BUCKETS - 1

# we don't need this twice, but easier when next to other vars
X_train_60pct['y_true'] = X_train_60pct['fico_score']
X_val_20pct['y_true'] =   X_val_20pct['fico_score']
X_test_20pct['y_true'] =  X_test_20pct['fico_score']

X_train_60pct['y_hat_midp'] = X_train_60pct['bucket_num'] * bucket_size + bucket_size_mid + min_score
X_val_20pct['y_hat_midp'] =   X_val_20pct['bucket_num'] * bucket_size + bucket_size_mid + min_score
X_test_20pct['y_hat_midp'] =  X_test_20pct['bucket_num'] * bucket_size + bucket_size_mid + min_score

X_train_60pct['delta_y_midp'] = X_train_60pct['y_hat_midp'] - X_train_60pct['y_true']
X_val_20pct['delta_y_midp'] =   X_val_20pct['y_hat_midp'] - X_train_60pct['y_true']
X_test_20pct['delta_y_midp'] =  X_test_20pct['y_hat_midp'] - X_train_60pct['y_true']

X_train_60pct['delta_y_sq_midp'] = X_train_60pct['delta_y_midp'] ** 2
X_val_20pct['delta_y_sq_midp'] =   X_val_20pct['delta_y_midp'] ** 2
X_test_20pct['delta_y_sq_midp'] =  X_test_20pct['delta_y_midp'] ** 2

# what is the mean fico score for all scores in that bucket? and update all the rows accordingly
X_train_60pct['y_hat_mean'] = X_train_60pct.groupby('bucket_num')['fico_score'].transform('mean')
X_val_20pct['y_hat_mean'] =   X_val_20pct.groupby('bucket_num')['fico_score'].transform('mean')
X_test_20pct['y_hat_mean'] =  X_test_20pct.groupby('bucket_num')['fico_score'].transform('mean')

X_train_60pct['delta_y_mean'] = X_train_60pct['y_hat_mean'] - X_train_60pct['fico_score']
X_val_20pct['delta_y_mean'] =   X_val_20pct['y_hat_mean'] - X_val_20pct['fico_score']
X_test_20pct['delta_y_mean'] =  X_test_20pct['y_hat_mean'] - X_test_20pct['fico_score']

X_train_60pct['delta_y_sq_mean'] = X_train_60pct['delta_y_mean'] ** 2
X_val_20pct['delta_y_sq_mean'] =   X_val_20pct['delta_y_mean'] ** 2
X_test_20pct['delta_y_sq_mean'] =  X_test_20pct['delta_y_mean'] ** 2

# sort by bucket num ascending
X_train_60pct = X_train_60pct.sort_values(by='bucket_num')
X_val_20pct =   X_val_20pct.sort_values(by='bucket_num')
X_test_20pct =  X_test_20pct.sort_values(by='bucket_num')

X_train_60pct.head(5)

Unnamed: 0,fico_score,bucket_num,y_true,y_hat_midp,delta_y_midp,delta_y_sq_midp,y_hat_mean,delta_y_mean,delta_y_sq_mean
6556,409,1,409,382.5,-26.5,702.25,409.0,0.0,0.0
8689,456,2,456,437.5,-18.5,342.25,453.75,-2.25,5.0625
2896,450,2,450,437.5,-12.5,156.25,453.75,3.75,14.0625
8100,462,2,462,437.5,-24.5,600.25,453.75,-8.25,68.0625
7001,418,2,418,437.5,19.5,380.25,453.75,35.75,1278.0625


In [222]:
MSE_X_train_60pct_midp = mse(X_train_60pct['y_true'], X_train_60pct['y_hat_midp'])
print(f"MSE for X_train using bucket midpoint as predicted value: {MSE_X_train_60pct_midp}")

MSE_X_train_60pct_mean = mse(X_train_60pct['y_true'], X_train_60pct['y_hat_mean'])
print(f"MSE for X_train using bucket average as predicted value: {MSE_X_train_60pct_mean}")

MSE for X_train using bucket midpoint as predicted value: 254.19866666666667
MSE for X_train using bucket average as predicted value: 237.9354319344827


We did the unsophisticated calculation of mean squared error, and it is high. Taking the average of the scores in a bucket outperforms the midpoint of the bucket's range by around 6%.

We will try more sophisticated methods to attempt to reduce the error of fico scores next.

## Strategy: Maximize Log-likelihood function

Log-likelihood
$$
LL(b_1, \ldots, b_{r-1}) = \sum_{i=1}^r \left[k_i \ln p_i + (n_i - k_i) \ln(1 - p_i)\right]
$$

Where:
- $r_i$ is the bucket id
- $b_i$ is the bucket boundaries,
- $n_i$ is the number of records in each bucket,
- $k_i$ is the number of defaults in each bucket, and
- $p_i = \frac{k_i}{n_i}$ is the probability of default in the bucket.

In [223]:
NUM_BUCKETS = 10

min_score, max_score = 300, 850
range_fico = max_score - min_score
bucket_size = range_fico // NUM_BUCKETS
bucket_size_mid = bucket_size / 2

# Check bucket boundaries for debugging
bucket_lower_bounds = [ x * bucket_size + min_score for x in range(NUM_BUCKETS) ]
bucket_upper_bounds = [ x * bucket_size + min_score + bucket_size - 1 for x in range(NUM_BUCKETS) ]
bucket_upper_bounds[-1] = max_score
# print(f"Bucket lower boundaries: {bucket_lower_bounds}")

df_temp = pd.DataFrame(columns=['fico_score', 'default', 'bucket_num'],
                       index=X_train_60pct.index)
df_temp['fico_score'] = X_train_60pct['fico_score']
df_temp['default'] = y_train_60pct
df_temp['bucket_num'] = X_train_60pct['bucket_num']

df_temp.head(10)

Unnamed: 0,fico_score,default,bucket_num
6556,409,1,1
8689,456,1,2
2896,450,0,2
8100,462,1,2
7001,418,1,2
6820,462,0,2
7953,456,1,2
3659,456,1,2
1846,460,0,2
1725,461,1,2


In [224]:
buckets = [ x for x in range(NUM_BUCKETS) ]
df_log = pd.DataFrame(columns=['bucket_num', 'score_range', 'k', 'n', 'p', 'log_likelihood'],
                      index=buckets)

# # may or may not have fico scores in each bucket
df_log['bucket_num'] = buckets
df_log['score_range'] = \
    df_log['bucket_num'].apply(lambda x: f"{bucket_lower_bounds[x]} - {bucket_upper_bounds[x]}")

# # k_i = num defaulting borrowers in each bucket b_i
df_log['k'] = df_temp.groupby('bucket_num')['default'].apply(lambda x: (x == 1).sum())
df_log['k'] = df_log['k'].fillna(0).astype(int) # counts always integers

# # n_i = num total borrowers in each bucket b_i
df_log['n'] = df_temp.groupby('bucket_num')['default'].apply(lambda x: (x == 0).sum())
df_log['n'] += df_log['k'] # add defaults to non defaults for total
df_log['n'] = df_log['n'].fillna(0).astype(int) # counts always integers

# p_i = k_i / n_i = default rate per bucket b_i
df_log['p'] = df_log['k'] / df_log['n']

# log likelihood (see formula above)
df_log['log_likelihood'] = df_log['k'] * np.log( df_log['p'] ) \
    + ( df_log['n'] - df_log['k'] ) * np.log( 1 - df_log['p'] )

# make every value be 0 in case there are empty buckets
df_log = df_log.fillna(0)

total_log_likelihood = df_log['log_likelihood'].sum()
print(f"Total Log-Likelihood: {total_log_likelihood}")

df_log.head(10)

Total Log-Likelihood: -2585.0152442119415


Unnamed: 0,bucket_num,score_range,k,n,p,log_likelihood
0,0,300 - 354,0,0,0.0,0.0
1,1,355 - 409,1,1,1.0,0.0
2,2,410 - 464,10,16,0.625,-10.585012
3,3,465 - 519,102,166,0.614458,-110.674229
4,4,520 - 574,275,723,0.38036,-480.245448
5,5,575 - 629,389,1772,0.219526,-932.616587
6,6,630 - 684,246,1990,0.123618,-744.40394
7,7,685 - 739,76,1072,0.070896,-274.377596
8,8,740 - 794,6,236,0.025424,-27.955508
9,9,795 - 850,1,24,0.041667,-4.156925


#### Having fewer buckets may capture the data better since bucket #0 has no borrowers, and buckets #1 and #9 only have 1 borrower.

In [225]:
# All features
features = ['fico_score']
target = 'default'

# binary target (default = 1, no default = 0)
X, y = df[features], df[target]

# 20% data allocated to test (don't use til absolute end)
X_train_80pct, X_test_20pct, y_train_80pct, y_test_20pct = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 25% of the 80% train data to model validation (20% overall data)
# 60% of data for model training
X_train_60pct, X_val_20pct, y_train_60pct, y_val_20pct = train_test_split(
    X_train_80pct, y_train_80pct, test_size=0.25, random_state=42
)

NUM_BUCKETS = 5

min_score, max_score = 300, 850
range_fico = max_score - min_score
bucket_size = range_fico // NUM_BUCKETS
bucket_size_mid = bucket_size / 2

# Y hat can be average of scores in bucket or midpoint of bucket range
# we don't see any scores in bucket 0 because our min score is 409 and 300-354 is below that
# 355-409 (bucket #1) caputures the min score in our training data
X_train_60pct['bucket_num'] = ((X_train_60pct['fico_score'] - min_score) // bucket_size).astype(int)
X_val_20pct['bucket_num'] =   ((X_val_20pct['fico_score'] - min_score) // bucket_size).astype(int)
X_test_20pct['bucket_num'] =  ((X_test_20pct['fico_score'] - min_score) // bucket_size).astype(int)

# Handle the edge case where FICO score is exactly max_score (850) (put in highest bucket)
X_train_60pct.loc[X_train_60pct['fico_score'] == max_score, 'bucket_num'] = NUM_BUCKETS - 1
X_val_20pct.loc[X_val_20pct['fico_score'] ==     max_score, 'bucket_num'] = NUM_BUCKETS - 1
X_test_20pct.loc[X_test_20pct['fico_score'] ==   max_score, 'bucket_num'] = NUM_BUCKETS - 1

# Check bucket boundaries for debugging
bucket_lower_bounds = [ x * bucket_size + min_score for x in range(NUM_BUCKETS) ]
bucket_upper_bounds = [ x * bucket_size + min_score + bucket_size - 1 for x in range(NUM_BUCKETS) ]
bucket_upper_bounds[-1] = max_score
# print(f"Bucket lower boundaries: {bucket_lower_bounds}")

df_temp = pd.DataFrame(columns=['fico_score', 'default', 'bucket_num'],
                       index=X_train_60pct.index)
df_temp['fico_score'] = X_train_60pct['fico_score']
df_temp['default'] = y_train_60pct
df_temp['bucket_num'] = X_train_60pct['bucket_num']

df_temp.head(5)

Unnamed: 0,fico_score,default,bucket_num
8588,647,0,3
3178,693,0,3
5200,597,0,2
8889,616,0,2
5789,564,0,2


In [226]:
buckets = [ x for x in range(NUM_BUCKETS) ]
df_log = pd.DataFrame(columns=['bucket_num', 'score_range', 'k', 'n', 'p', 'log_likelihood'],
                      index=buckets)

# # may or may not have fico scores in each bucket
df_log['bucket_num'] = buckets
df_log['score_range'] = \
    df_log['bucket_num'].apply(lambda x: f"{bucket_lower_bounds[x]} - {bucket_upper_bounds[x]}")

# # k_i = num defaulting borrowers in each bucket b_i
df_log['k'] = df_temp.groupby('bucket_num')['default'].apply(lambda x: (x == 1).sum())
df_log['k'] = df_log['k'].fillna(0).astype(int) # counts always integers

# # n_i = num total borrowers in each bucket b_i
df_log['n'] = df_temp.groupby('bucket_num')['default'].apply(lambda x: (x == 0).sum())
df_log['n'] += df_log['k'] # add defaults to non defaults for total
df_log['n'] = df_log['n'].fillna(0).astype(int) # counts always integers

# p_i = k_i / n_i = default rate per bucket b_i
df_log['p'] = df_log['k'] / df_log['n']

# log likelihood (see formula above)
df_log['log_likelihood'] = df_log['k'] * np.log( df_log['p'] ) \
    + ( df_log['n'] - df_log['k'] ) * np.log( 1 - df_log['p'] )

# make every value be 0 in case there are empty buckets
df_log = df_log.fillna(0)

# total log likelihood
total_log_likelihood = df_log['log_likelihood'].sum()
print(f"Total Log-Likelihood: {total_log_likelihood}")

# View df_log to debug
df_log.head(10)

Total Log-Likelihood: -2628.6827441698797


Unnamed: 0,bucket_num,score_range,k,n,p,log_likelihood
0,0,300 - 409,1,1,1.0,0.0
1,1,410 - 519,112,182,0.615385,-121.262677
2,2,520 - 629,664,2495,0.266132,-1445.537729
3,3,630 - 739,322,3062,0.10516,-1029.674026
4,4,740 - 850,7,260,0.026923,-32.208312


#### This looks to be on the right track with the top bucket having 7 defaults and 260 total borrowers. The bottom bucket only has once value; however, so it would be worth widening the buckets to capture more low fico scores. Since I use the hard limit of 300 and no one applied for loans with scores under 400, we could adjust the lower limit upwards - but it would not be versatile with unseen data and scores lower than our min of 409.

In [227]:
# All features
features = ['fico_score']
target = 'default'

# binary target (default = 1, no default = 0)
X, y = df[features], df[target]

# 20% data allocated to test (don't use til absolute end)
X_train_80pct, X_test_20pct, y_train_80pct, y_test_20pct = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 25% of the 80% train data to model validation (20% overall data)
# 60% of data for model training
X_train_60pct, X_val_20pct, y_train_60pct, y_val_20pct = train_test_split(
    X_train_80pct, y_train_80pct, test_size=0.25, random_state=42
)

NUM_BUCKETS = 3

min_score, max_score = 300, 850
range_fico = max_score - min_score
bucket_size = range_fico // NUM_BUCKETS
bucket_size_mid = bucket_size / 2

# Y hat can be average of scores in bucket or midpoint of bucket range
# we don't see any scores in bucket 0 because our min score is 409 and 300-354 is below that
# 355-409 (bucket #1) caputures the min score in our training data
X_train_60pct['bucket_num'] = ((X_train_60pct['fico_score'] - min_score) // bucket_size).astype(int)
X_val_20pct['bucket_num'] =   ((X_val_20pct['fico_score'] - min_score) // bucket_size).astype(int)
X_test_20pct['bucket_num'] =  ((X_test_20pct['fico_score'] - min_score) // bucket_size).astype(int)

# Handle the edge case where FICO score is exactly max_score (850) (put in highest bucket)
X_train_60pct.loc[X_train_60pct['fico_score'] == max_score, 'bucket_num'] = NUM_BUCKETS - 1
X_val_20pct.loc[X_val_20pct['fico_score'] ==     max_score, 'bucket_num'] = NUM_BUCKETS - 1
X_test_20pct.loc[X_test_20pct['fico_score'] ==   max_score, 'bucket_num'] = NUM_BUCKETS - 1

# Check bucket boundaries for debugging
bucket_lower_bounds = [ x * bucket_size + min_score for x in range(NUM_BUCKETS) ]
bucket_upper_bounds = [ x * bucket_size + min_score + bucket_size - 1 for x in range(NUM_BUCKETS) ]
bucket_upper_bounds[-1] = max_score
# print(f"Bucket lower boundaries: {bucket_lower_bounds}")

df_temp = pd.DataFrame(columns=['fico_score', 'default', 'bucket_num'],
                       index=X_train_60pct.index)
df_temp['fico_score'] = X_train_60pct['fico_score']
df_temp['default'] = y_train_60pct
df_temp['bucket_num'] = X_train_60pct['bucket_num']

df_temp.head(5)

Unnamed: 0,fico_score,default,bucket_num
8588,647,0,1
3178,693,0,2
5200,597,0,1
8889,616,0,1
5789,564,0,1


In [228]:
buckets = [ x for x in range(NUM_BUCKETS) ]
df_log = pd.DataFrame(columns=['bucket_num', 'score_range', 'k', 'n', 'p', 'log_likelihood'],
                      index=buckets)

# # may or may not have fico scores in each bucket
df_log['bucket_num'] = buckets
df_log['score_range'] = \
    df_log['bucket_num'].apply(lambda x: f"{bucket_lower_bounds[x]} - {bucket_upper_bounds[x]}")

# # k_i = num defaulting borrowers in each bucket b_i
df_log['k'] = df_temp.groupby('bucket_num')['default'].apply(lambda x: (x == 1).sum())
df_log['k'] = df_log['k'].fillna(0).astype(int) # counts always integers

# # n_i = num total borrowers in each bucket b_i
df_log['n'] = df_temp.groupby('bucket_num')['default'].apply(lambda x: (x == 0).sum())
df_log['n'] += df_log['k'] # add defaults to non defaults for total
df_log['n'] = df_log['n'].fillna(0).astype(int) # counts always integers

# p_i = k_i / n_i = default rate per bucket b_i
df_log['p'] = df_log['k'] / df_log['n']

# log likelihood (see formula above)
df_log['log_likelihood'] = df_log['k'] * np.log( df_log['p'] ) \
    + ( df_log['n'] - df_log['k'] ) * np.log( 1 - df_log['p'] )

# make every value be 0 in case there are empty buckets
df_log = df_log.fillna(0)

# total log likelihood
total_log_likelihood = df_log['log_likelihood'].sum()
print(f"Total Log-Likelihood: {total_log_likelihood}")

# View df_log to debug
df_log.head(10)

Total Log-Likelihood: -2711.671844266432


Unnamed: 0,bucket_num,score_range,k,n,p,log_likelihood
0,0,300 - 482,29,44,0.659091,-28.232012
1,1,483 - 665,936,4003,0.233825,-2177.05776
2,2,666 - 850,141,1953,0.072197,-506.382073


#### The high credit scores should not be more likley to default than the middle group (looking at p_i, the default percentage). The closer to 0 for log likelihood, the more likely the event (default). Therefore, since log likelihood is influenced by the n_i number of borrowers, we should aim to have similar amounts of borrowers in each bucket to get comparable log likelihood results.

## Dynamic programming to create buckets of similar sizes

In [235]:
# All features
features = ['fico_score']
target = 'default'

df = pd.read_csv(data_file)
# binary target (default = 1, no default = 0)
X, y = df[features], df[target]

# 20% data allocated to test (don't use til absolute end)
X_train_80pct, X_test_20pct, y_train_80pct, y_test_20pct = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 25% of the 80% train data to model validation (20% overall data)
# 60% of data for model training
X_train_60pct, X_val_20pct, y_train_60pct, y_val_20pct = train_test_split(
    X_train_80pct, y_train_80pct, test_size=0.25, random_state=42
)

X_train_60pct.head(5)

Unnamed: 0,fico_score
8588,647
3178,693
5200,597
8889,616
5789,564


In [315]:
# create 10 buckets, each with approx 1/10 of the training data
# scores distribution likely to vary a decent amount
NUM_BUCKETS = 10
min_score, max_score = 300, 850
# b_i : lower score, upper score, count of borrowers in bucket b_i
bucket_dict = {}

def bin_scores(lower_score_bound,
               delta,
               df_borrowers: pd.Series,
               bucket_id,
               num_buckets):
    """
    create buckets of approx the same amount of borrowers, and
    say the score range for each of the buckets
    """
    # exit conditions
    if bucket_id >= num_buckets or lower_score_bound >= max_score:
        return

    # final bucket: all remaining borrowers into this one
    if bucket_id == num_buckets - 1:
        borrowers_left = df_borrowers.shape[0]
        for _, value in bucket_dict.items():
            borrowers_left -= value['count']

        bucket_dict[bucket_id] = ({
            'lower_score_bound': lower_score_bound,
            'upper_score_bound': max_score,
            'count': borrowers_left
        })
        bin_scores(max_score, 0, df_borrowers, bucket_id + 1, num_buckets)

    # target num of borrowers per bucket
    # use delta to normalize bucket sizes
    # many borrowers had x score which pushed total above 600 for example
    target_count = (df_borrowers.shape[0] // num_buckets) + delta
    bucket_total = 0

    lp, rp = lower_score_bound, max_score
    while lp < rp:
        if bucket_total >= target_count:
            bucket_dict[bucket_id] = ({
                'lower_score_bound': lower_score_bound,
                'upper_score_bound': lp-1,
                'count': bucket_total
            })
            bin_scores(lp+1, target_count - bucket_total,
                        df_borrowers, bucket_id + 1, num_buckets)
            break

        else:
            # add num of borrowers with lp (score) to the bucket_total
            bucket_total += (df_borrowers['fico_score'] == lp).sum()
        lp += 1


bin_scores(min_score, 0, X_train_60pct, 0, NUM_BUCKETS)

_ = [ print(f'bucket #{k}: {v}\n') for k,v in bucket_dict.items() ]

bucket #0: {'lower_score_bound': 300, 'upper_score_bound': 558, 'count': 603}

bucket #1: {'lower_score_bound': 560, 'upper_score_bound': 587, 'count': 639}

bucket #2: {'lower_score_bound': 589, 'upper_score_bound': 608, 'count': 579}

bucket #3: {'lower_score_bound': 610, 'upper_score_bound': 624, 'count': 584}

bucket #4: {'lower_score_bound': 626, 'upper_score_bound': 641, 'count': 621}

bucket #5: {'lower_score_bound': 643, 'upper_score_bound': 657, 'count': 586}

bucket #6: {'lower_score_bound': 659, 'upper_score_bound': 676, 'count': 602}

bucket #7: {'lower_score_bound': 678, 'upper_score_bound': 697, 'count': 595}

bucket #8: {'lower_score_bound': 699, 'upper_score_bound': 731, 'count': 593}

bucket #9: {'lower_score_bound': 733, 'upper_score_bound': 850, 'count': 598}



In [342]:
df_dynamic_buckets = pd.DataFrame(columns=['bucket_num', 'count', 'score_range'],
                               index=sorted(bucket_dict.keys()))
df_dynamic_buckets['bucket_num'] = bucket_dict.keys()
df_dynamic_buckets['count'] = df_dynamic_buckets['bucket_num'].apply(lambda x: bucket_dict[x]['count'])
df_dynamic_buckets['score_range'] = \
    df_dynamic_buckets['bucket_num'].apply(lambda x: f"{bucket_dict[x]['lower_score_bound']} - {bucket_dict[x]['upper_score_bound']}")
df_dynamic_buckets['mean'] = df_dynamic_buckets['bucket_num'].apply(lambda x: (bucket_dict[x]['lower_score_bound'] + bucket_dict[x]['upper_score_bound']) / 2)

# apply our count rules on the X_train_60pct to assign buckets and calc log likelihood
df_temp = pd.DataFrame(columns=['fico_score', 'default', 'bucket_num'],
                       index=X_train_60pct.index)
df_temp['fico_score'] = X_train_60pct['fico_score']
df_temp['default'] = y_train_60pct
# use dynamic df lower and upper bound on df temp scores to bucketize
for score in df_temp['fico_score']:
    for bucket_num, bucket in bucket_dict.items():
        if bucket['lower_score_bound'] <= score <= bucket['upper_score_bound']:
            df_temp.loc[df_temp['fico_score'] == score, 'bucket_num'] = bucket_num
            break

# Log Likelihood
df_dynamic_buckets['k'] = df_temp.groupby('bucket_num')['default'].apply(lambda x: (x == 1).sum())
df_dynamic_buckets['k'] = df_dynamic_buckets['k'].fillna(0).astype(int) # counts always integers

df_dynamic_buckets['n'] = df_temp.groupby('bucket_num')['default'].apply(lambda x: (x == 0).sum())
df_dynamic_buckets['n'] += df_dynamic_buckets['k'] # add defaults to non defaults for total
df_dynamic_buckets['n'] = df_dynamic_buckets['n'].fillna(0).astype(int) # counts always integers

df_dynamic_buckets['p'] = df_dynamic_buckets['k'] / df_dynamic_buckets['n']

df_dynamic_buckets['log_likelihood'] = df_dynamic_buckets['k'] * np.log( df_dynamic_buckets['p'] ) \
    + ( df_dynamic_buckets['n'] - df_dynamic_buckets['k'] ) * np.log( 1 - df_dynamic_buckets['p'] )

df_dynamic_buckets['Rating'] = abs(df_dynamic_buckets['log_likelihood'])

df_dynamic_buckets = df_dynamic_buckets.fillna(0)

total_log_likelihood = df_dynamic_buckets['log_likelihood'].sum()
print(f"Total Log-Likelihood: {total_log_likelihood}")

df_dynamic_buckets

Total Log-Likelihood: -2439.8059686048455


Unnamed: 0,bucket_num,count,score_range,mean,k,n,p,log_likelihood,Rating
0,0,603,300 - 558,429.0,293,603,0.485904,-417.728083,417.728083
1,1,639,560 - 587,573.5,188,639,0.29421,-387.156087,387.156087
2,2,579,589 - 608,598.5,144,579,0.248705,-324.765496,324.765496
3,3,584,610 - 624,617.0,105,584,0.179795,-275.111751,275.111751
4,4,621,626 - 641,633.5,106,621,0.170692,-283.786104,283.786104
5,5,586,643 - 657,650.0,67,586,0.114334,-208.312876,208.312876
6,6,602,659 - 676,667.5,58,602,0.096346,-190.820897,190.820897
7,7,595,678 - 697,687.5,57,595,0.095798,-187.872209,187.872209
8,8,593,699 - 731,715.0,34,593,0.057336,-130.206465,130.206465
9,9,598,733 - 850,791.5,7,337,0.020772,-34.046001,34.046001
