## Import Packages

In [1]:
import pandas as pd
import numpy as np

## Read-In Data

In [2]:
# Load the processed training CSV into a DataFrame.
# low_memory=False makes pandas parse the file in one pass instead of chunks,
# which avoids mixed-dtype inference within a column.
df_default = pd.read_csv(
    "data_processed/01_binary_training.csv",
    low_memory=False,
)

## Features List

In [3]:
# Categorical columns of the training data.
# 'earliest_cr_line' is intentionally excluded for now: it still needs to be
# handled in a pipeline before it can be listed here.
features_categorical = [
    "grade",
    "home_ownership",
    "purpose",
    "sub_grade",
    "term",
    "verification_status",
]

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline

# Letter grades in ascending order, so the encoder maps 'A' -> 0, ..., 'G' -> 6.
grade_levels = list("ABCDEFG")

# Fill missing values with the most frequent category, then ordinal-encode.
categorical_transformer_1 = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ordinal", OrdinalEncoder(categories=[grade_levels])),
])

# Keeps only the 'grade' column and drops everything else.
grade_selector = ColumnTransformer(
    transformers=[("keep", "passthrough", ["grade"])],
    remainder="drop",
)

# Baseline model: logistic regression on the ordinal-encoded grade alone.
model = Pipeline(steps=[
    ("select", grade_selector),
    ("preprocessor", categorical_transformer_1),
    ("model", LogisticRegression()),
])

In [5]:
# Target is the 'charged_off' column; X is every remaining column.
# (The pipeline's "select" step later narrows X to the columns it uses.)
target_column = 'charged_off'
y = df_default[target_column]
X = df_default.drop(columns=[target_column])

In [6]:
# Fit every pipeline step in order: column selection, imputation/encoding,
# then the classifier.
model.fit(X=X, y=y)

In [7]:
# Mean accuracy on the same rows the model was fitted on, so this is a
# training score rather than a held-out estimate.
model.score(X=X, y=y)

0.7979802482974134

In [8]:
# Mean of the hard predictions; with 0/1 labels this is the share of rows
# predicted as charged off (assumes charged_off is coded 0/1 — confirm).
np.mean(model.predict(X))

0.030741028515140708

In [10]:
# Sum of loan_amnt weighted by the charged_off flag, i.e. the total amount on
# charged-off loans (assumes charged_off is coded 0/1 — confirm).
default_balance_actual = df_default["loan_amnt"].mul(df_default["charged_off"]).sum()
default_balance_actual

2930991650.0

In [23]:
# Loan amount summed over the rows the model labels as the positive class.
predicted_labels = model.predict(X)
default_balance_hard_prediction = (predicted_labels * df_default["loan_amnt"]).sum()
default_balance_hard_prediction

562074600.0

In [24]:
# Balance flagged by the hard class predictions as a fraction of the actual
# charged-off balance (1.0 would mean the totals match).
default_balance_hard_prediction / default_balance_actual

0.1917694306635094

In [25]:
# Probability-weighted balance: each loan amount times the predicted
# probability in column 1 of predict_proba, summed over all rows.
positive_proba = model.predict_proba(X)[:, 1]
default_balance_expected = (positive_proba * df_default["loan_amnt"]).sum()
default_balance_expected

2849805747.084288

In [26]:
# Probability-weighted balance as a fraction of the actual charged-off
# balance (1.0 would mean the totals match).
default_balance_expected / default_balance_actual

0.9723008753997263

## Threshold Tuning

In [27]:
# predict probabilities: one row per sample, one column per class
yhat = model.predict_proba(X)

In [28]:
# keep probabilities for the positive outcome only
# (columns follow model.classes_ order; assumes the positive label is the
# second class — TODO confirm)
probs = yhat[:, 1]
probs

array([0.08711724, 0.20165105, 0.20165105, ..., 0.40067307, 0.20165105,
       0.20165105])

In [29]:
# Candidate cut-offs 0.00, 0.01, ..., 0.99 (the stop value 1 is excluded).
thresholds = np.arange(0, 1, step=0.01)

In [30]:
# apply threshold to positive probabilities to create inferences
def to_inference(pos_probs, threshold):
    """Convert positive-class probabilities into 0/1 predictions.

    Parameters
    ----------
    pos_probs : array-like
        Probabilities (or scores) for the positive class. May be a NumPy
        array, a pandas Series, or a plain list/tuple/scalar.
    threshold : float
        Cut-off; values greater than or equal to it become 1, others 0.

    Returns
    -------
    Integer-typed array (or pandas object, if one was passed in) with the
    same shape as ``pos_probs``.
    """
    # np.greater_equal coerces plain Python sequences and scalars, where the
    # bare `>=` operator would raise TypeError (list) or return a Python bool
    # with no .astype (scalar). ndarray and Series inputs behave as before.
    return np.greater_equal(pos_probs, threshold).astype('int')

In [32]:
from sklearn.metrics import f1_score, accuracy_score
# Accuracy of the thresholded predictions at every candidate cut-off.
def _accuracy_at(threshold):
    """Accuracy of `probs` binarised at `threshold`, measured against `y`."""
    return accuracy_score(y, to_inference(probs, threshold))

scores = [_accuracy_at(t) for t in thresholds]

In [33]:
# Position of the highest score; np.argmax returns the first occurrence when
# several thresholds tie for the maximum.
ix = np.argmax(scores)
best_threshold, best_score = thresholds[ix], scores[ix]
print('Threshold=%.3f, Score=%.5f' % (best_threshold, best_score))

Threshold=0.640, Score=0.80020


In [35]:
# Loan amount flagged as default at the tuned cut-off. The threshold is read
# from the tuning result (thresholds[ix]) rather than hand-copied as a literal,
# so it stays in sync if the data or the model changes.
# In the recorded run this sum is 0.0, which suggests no row reaches the
# accuracy-optimal cut-off (assuming loan amounts are positive).
(to_inference(probs, thresholds[ix]) * df_default["loan_amnt"]).sum()

0.0

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Same grade-only preprocessing as the logistic baseline, with a random forest
# as the final estimator. Note: this rebinds `model`, replacing the logistic
# regression pipeline for all following cells.
model = Pipeline([
    ("select", 
        ColumnTransformer(
            [("keep", "passthrough", ["grade"])],
            remainder="drop"
        )
    ),  # keeps certain columns and drops everything else
    ("preprocessor", categorical_transformer_1),
    # random_state pins the forest's bootstrap/feature sampling so that
    # Restart & Run All reproduces the same scores and tuned threshold.
    ("model", RandomForestClassifier(n_estimators=50, max_depth=10, n_jobs=-1, random_state=42))
])

In [None]:
# Fit the random-forest pipeline on the full training frame.
model.fit(X=X, y=y)

In [None]:
# Mean accuracy on the rows used for fitting (a training score, not a
# held-out estimate).
model.score(X=X, y=y)

In [None]:
# Mean of the hard predictions; with 0/1 labels this is the share of rows
# predicted as the positive class (assumes charged_off is coded 0/1 — confirm).
predicted_labels = model.predict(X)
predicted_labels.mean()

In [None]:
# Probability-weighted balance: loan amount times the column-1 probability
# from predict_proba, summed over all rows.
default_balance_expected = df_default["loan_amnt"].mul(model.predict_proba(X)[:, 1]).sum()
default_balance_expected

In [None]:
# Sum of loan_amnt weighted by the charged_off flag (assumes a 0/1 flag —
# confirm). Recomputed here so this section does not depend on earlier cells.
default_balance_actual = df_default["loan_amnt"].mul(df_default["charged_off"]).sum()
default_balance_actual

In [None]:
# Probability-weighted balance as a fraction of the actual charged-off
# balance (1.0 would mean the totals match).
default_balance_expected / default_balance_actual

In [None]:
from sklearn.metrics import f1_score, accuracy_score

# Class probabilities from the current model; column 1 is kept as the
# positive-outcome probability.
yhat = model.predict_proba(X)
probs = yhat[:, 1]

# Candidate cut-offs 0.00, 0.01, ..., 0.99.
thresholds = np.arange(0, 1, 0.01)

# F1 score of the thresholded predictions at each candidate cut-off.
scores = []
for t in thresholds:
    scores.append(f1_score(y, to_inference(probs, t)))

# Report the best cut-off (np.argmax returns the first maximum on ties).
ix = np.argmax(scores)
print('Threshold=%.3f, Score=%.5f' % (thresholds[ix], scores[ix]))

In [None]:
# Loan amount flagged as default at the F1-optimal cut-off. The threshold is
# read from the tuning result (thresholds[ix]) instead of a hand-copied
# literal, which would go stale whenever the forest or the data changes.
(to_inference(probs, thresholds[ix]) * df_default["loan_amnt"]).sum()

In [None]:
# Average predicted positive-class probability; compare with y.mean() as a
# rough check that predicted and observed rates agree.
probs.mean()

In [None]:
# Mean of the target; with a 0/1 target this is the observed charged-off rate
# (assumes charged_off is coded 0/1 — confirm).
y.mean()