## Modeling

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Read in the data from the .pkl
df = pd.read_pickle("./data/baseline_fe_data.pkl")

num_cols = list(df._get_numeric_data().columns)
cat_cols = list(set(df.columns) - set(df._get_numeric_data().columns))
target = ['is_bad']
num_cols.remove('is_bad')

In [5]:
df[cat_cols].dtypes

zip_code               object
initial_list_status    object
addr_state             object
verification_status    object
policy_code            object
purpose_cat            object
home_ownership         object
pymnt_plan             object
dtype: object

In [6]:
df[num_cols].dtypes

emp_length                       int64
annual_inc                     float64
debt_to_income                 float64
delinq_2yrs                    float64
inq_last_6mths                 float64
mths_since_last_delinq         float64
mths_since_last_record           int64
open_acc                       float64
pub_rec                          int64
revol_bal                        int64
revol_util                     float64
total_acc                      float64
mths_since_last_major_derog      int64
cr_line_yrs                    float64
cr_line_mths                     int64
dtype: object

In [7]:
df[target].dtypes

is_bad    int64
dtype: object

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   is_bad                       10000 non-null  int64  
 1   emp_length                   10000 non-null  int64  
 2   home_ownership               10000 non-null  object 
 3   annual_inc                   10000 non-null  float64
 4   verification_status          10000 non-null  object 
 5   pymnt_plan                   10000 non-null  object 
 6   purpose_cat                  10000 non-null  object 
 7   zip_code                     10000 non-null  object 
 8   addr_state                   10000 non-null  object 
 9   debt_to_income               10000 non-null  float64
 10  delinq_2yrs                  10000 non-null  float64
 11  inq_last_6mths               10000 non-null  float64
 12  mths_since_last_delinq       10000 non-null  float64
 13  mths_since_last_r

In [9]:
#def zip_region(x):
#    return x[1:3]
#
#df['zip_region'] = df.zip_code.apply(zip_region)
df.drop('zip_code', axis=1, inplace=True)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   is_bad                       10000 non-null  int64  
 1   emp_length                   10000 non-null  int64  
 2   home_ownership               10000 non-null  object 
 3   annual_inc                   10000 non-null  float64
 4   verification_status          10000 non-null  object 
 5   pymnt_plan                   10000 non-null  object 
 6   purpose_cat                  10000 non-null  object 
 7   addr_state                   10000 non-null  object 
 8   debt_to_income               10000 non-null  float64
 9   delinq_2yrs                  10000 non-null  float64
 10  inq_last_6mths               10000 non-null  float64
 11  mths_since_last_delinq       10000 non-null  float64
 12  mths_since_last_record       10000 non-null  int64  
 13  open_acc         

#### Label Encode and One-Hot Encode Categorical Variables
- NOTE: I may need to reduce the dimensions after the one-hot encoding, but we will see how this goes first

In [11]:
# Save out the data with basic feature engineering
df.to_pickle("./data/non_standard_explore_data.pkl")

## Split out the data

In [12]:
from sklearn.model_selection import train_test_split
seed = 20

# Try just using a train/test split at first without sorting the values by time (I don't know how truly time-based this model will be)
X_train, X_test, y_train, y_test = train_test_split(df.drop('is_bad', axis=1), df['is_bad'], test_size=0.20, random_state=seed)

print("Training Distribution of is_bad:")
print(len(y_train[y_train==1])/len(y_train))

print()
print("Test Distribution of is_bad:")
print(len(y_test[y_test==1])/len(y_test))

Training Distribution of is_bad:
0.1285

Test Distribution of is_bad:
0.1335


## Standard scale the numerical data

In [14]:
#from sklearn.preprocessing import StandardScaler
#sc = StandardScaler()
#X_train = sc.fit_transform(X_train)
#X_test = sc.transform(X_test)

In [15]:
# See how much this boosts the correlations
corr = df.corr()['is_bad'].sort_values()

# Display correlations
print('Most Positive Correlations with "is_bad"')
print(20*"-")
print(corr.tail(10))
print()
print('Most Negative Correlations with "is_bad"')
print(20*"-")
print(corr.head(10))

Most Positive Correlations with "is_bad"
--------------------
inq_last_6mths                -0.001007
mths_since_last_delinq         0.014576
mths_since_last_record         0.014978
mths_since_last_major_derog    0.015481
delinq_2yrs                    0.021897
cr_line_yrs                    0.029565
debt_to_income                 0.029873
pub_rec                        0.032218
revol_util                     0.087797
is_bad                         1.000000
Name: is_bad, dtype: float64

Most Negative Correlations with "is_bad"
--------------------
total_acc                     -0.055271
annual_inc                    -0.050966
cr_line_mths                  -0.035356
emp_length                    -0.033449
open_acc                      -0.020341
revol_bal                     -0.016202
inq_last_6mths                -0.001007
mths_since_last_delinq         0.014576
mths_since_last_record         0.014978
mths_since_last_major_derog    0.015481
Name: is_bad, dtype: float64


In [16]:
num_cols = list(df._get_numeric_data().columns)
cat_cols = list(set(df.columns) - set(df._get_numeric_data().columns))
target = ['is_bad']
num_cols.remove('is_bad')

## Upsample the training data

In [18]:
# You need to try and balance the classes within the training set
#from imblearn.over_sampling import SMOTE
#sm = SMOTE(random_state=seed)
#x_train_r, y_train_r = sm.fit_resample(X_train, y_train)

## Try CatBoost Model

In [19]:
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

In [29]:
# Start with simple logistic regression classifier for second baseline attempt
clf_cb =  CatBoostClassifier(iterations=300, learning_rate=0.3, depth=6, eval_metric='Recall', random_seed=seed)
clf_cb.fit(X_train, y_train, cat_features=cat_cols, eval_set=(X_test, y_test), use_best_model=True)

0:	learn: 0.0009728	test: 0.0000000	best: 0.0000000 (0)	total: 11.5ms	remaining: 3.45s
1:	learn: 0.1206226	test: 0.1685393	best: 0.1685393 (1)	total: 27.3ms	remaining: 4.07s
2:	learn: 0.1206226	test: 0.1722846	best: 0.1722846 (2)	total: 40.3ms	remaining: 3.99s
3:	learn: 0.1225681	test: 0.1722846	best: 0.1722846 (2)	total: 46.3ms	remaining: 3.42s
4:	learn: 0.1546693	test: 0.1797753	best: 0.1797753 (4)	total: 61.3ms	remaining: 3.62s
5:	learn: 0.1556420	test: 0.1797753	best: 0.1797753 (4)	total: 75.3ms	remaining: 3.69s
6:	learn: 0.1517510	test: 0.1760300	best: 0.1797753 (4)	total: 92.8ms	remaining: 3.88s
7:	learn: 0.1556420	test: 0.1760300	best: 0.1797753 (4)	total: 110ms	remaining: 4s
8:	learn: 0.1605058	test: 0.1797753	best: 0.1797753 (4)	total: 125ms	remaining: 4.05s
9:	learn: 0.1585603	test: 0.1797753	best: 0.1797753 (4)	total: 138ms	remaining: 4s
10:	learn: 0.1605058	test: 0.1797753	best: 0.1797753 (4)	total: 150ms	remaining: 3.95s
11:	learn: 0.1605058	test: 0.1797753	best: 0.1797753

<catboost.core.CatBoostClassifier at 0x7f02fa9ad490>

In [30]:
ypred = clf_cb.predict(X_test)
score = roc_auc_score(y_test, ypred)

print(f"Test AUC score: {score}")
print(classification_report(y_test, ypred))

print("Min accuracy to beat for just random guessing in the TEST set:")
print(len(y_test[y_test == 0])/len(y_test))

Test AUC score: 0.6028438917596513
              precision    recall  f1-score   support

           0       0.89      0.99      0.94      1733
           1       0.74      0.22      0.34       267

    accuracy                           0.89      2000
   macro avg       0.82      0.60      0.64      2000
weighted avg       0.87      0.89      0.86      2000

Min accuracy to beat for just random guessing in the TEST set:
0.8665
