## Modeling

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Read in the data from the .pkl
df = pd.read_pickle("./data/baseline_fe_data.pkl")

num_cols = list(df._get_numeric_data().columns)
cat_cols = list(set(df.columns) - set(df._get_numeric_data().columns))
target = ['is_bad']
num_cols.remove('is_bad')

In [3]:
df[cat_cols].dtypes

verification_status    object
policy_code            object
purpose_cat            object
initial_list_status    object
pymnt_plan             object
zip_code               object
addr_state             object
home_ownership         object
dtype: object

In [4]:
df[num_cols].dtypes

emp_length                       int64
annual_inc                     float64
debt_to_income                 float64
delinq_2yrs                    float64
inq_last_6mths                 float64
mths_since_last_delinq         float64
mths_since_last_record           int64
open_acc                       float64
pub_rec                          int64
revol_bal                        int64
revol_util                     float64
total_acc                      float64
mths_since_last_major_derog      int64
cr_line_yrs                    float64
cr_line_mths                     int64
dtype: object

In [5]:
df[target].dtypes

is_bad    int64
dtype: object

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   is_bad                       10000 non-null  int64  
 1   emp_length                   10000 non-null  int64  
 2   home_ownership               10000 non-null  object 
 3   annual_inc                   10000 non-null  float64
 4   verification_status          10000 non-null  object 
 5   pymnt_plan                   10000 non-null  object 
 6   purpose_cat                  10000 non-null  object 
 7   zip_code                     10000 non-null  object 
 8   addr_state                   10000 non-null  object 
 9   debt_to_income               10000 non-null  float64
 10  delinq_2yrs                  10000 non-null  float64
 11  inq_last_6mths               10000 non-null  float64
 12  mths_since_last_delinq       10000 non-null  float64
 13  mths_since_last_r

In [7]:
#def zip_region(x):
#    return x[1:3]
#
#df['zip_region'] = df.zip_code.apply(zip_region)
df.drop('zip_code', axis=1, inplace=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   is_bad                       10000 non-null  int64  
 1   emp_length                   10000 non-null  int64  
 2   home_ownership               10000 non-null  object 
 3   annual_inc                   10000 non-null  float64
 4   verification_status          10000 non-null  object 
 5   pymnt_plan                   10000 non-null  object 
 6   purpose_cat                  10000 non-null  object 
 7   addr_state                   10000 non-null  object 
 8   debt_to_income               10000 non-null  float64
 9   delinq_2yrs                  10000 non-null  float64
 10  inq_last_6mths               10000 non-null  float64
 11  mths_since_last_delinq       10000 non-null  float64
 12  mths_since_last_record       10000 non-null  int64  
 13  open_acc         

#### Label Encode and One-Hot Encode Categorical Variables
- NOTE: I may need to reduce the dimensions after the one-hot encoding, but we will see how this goes first

In [9]:
from sklearn import preprocessing

count = 0

for col in df:
    if df[col].dtype == 'object':
        if len(list(df[col].unique())) <= 2:     
            le = preprocessing.LabelEncoder()
            df[col] = le.fit_transform(df[col])
            count += 1
            print (col)
            
print('%d columns were label encoded.' % count)

pymnt_plan
initial_list_status
2 columns were label encoded.


In [10]:
df = pd.get_dummies(df)
print(df.shape)

(10000, 108)


In [11]:
# Save out the data with basic feature engineering
df.to_pickle("./data/non_standard_explore_data.pkl")

## Split out the data

In [15]:
from sklearn.model_selection import train_test_split
seed = 20

# Try just using a train/test split at first without sorting the values by time (I don't know how truly time-based this model will be)
X_train, X_test, y_train, y_test = train_test_split(df.drop('is_bad',axis=1), df['is_bad'], test_size=0.20, random_state=seed)

print("Training Distribution of is_bad:")
print(len(y_train[y_train==1])/len(y_train))

print()
print("Test Distribution of is_bad:")
print(len(y_test[y_test==1])/len(y_test))

Training Distribution of is_bad:
0.1285

Test Distribution of is_bad:
0.1335


## Standard scale the numerical data

In [17]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [18]:
# See how much this boosts the correlations
corr = df.corr()['is_bad'].sort_values()

# Display correlations
print('Most Positive Correlations with "is_bad"')
print(20*"-")
print(corr.tail(10))
print()
print('Most Negative Correlations with "is_bad"')
print(20*"-")
print(corr.head(10))

Most Positive Correlations with "is_bad"
--------------------
purpose_cat_major purchase small business        0.057989
purpose_cat_educational small business           0.057989
purpose_cat_small business                       0.079963
purpose_cat_home improvement small business      0.086037
revol_util                                       0.087797
purpose_cat_small business small business        0.089867
purpose_cat_credit card small business           0.106990
purpose_cat_other small business                 0.110097
purpose_cat_debt consolidation small business    0.263194
is_bad                                           1.000000
Name: is_bad, dtype: float64

Most Negative Correlations with "is_bad"
--------------------
total_acc                          -0.055271
purpose_cat_credit card            -0.055271
annual_inc                         -0.050966
verification_status_not verified   -0.050158
cr_line_mths                       -0.035356
purpose_cat_wedding                -0.035

In [19]:
num_cols = list(df._get_numeric_data().columns)
cat_cols = list(set(df.columns) - set(df._get_numeric_data().columns))
target = ['is_bad']
num_cols.remove('is_bad')

## Upsample the training data

In [20]:
# You need to try and balance the classes within the training set
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=seed)
x_train_r, y_train_r = sm.fit_resample(X_train, y_train)

## Try 3 initial models: LR Classifier, RF Classifier, LightGBM Classifier

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split

In [32]:
# Start with simple logistic regression classifier for second baseline attempt
clf_lr = LogisticRegression(C = 0.001, random_state=seed)
clf_lr.fit(x_train_r, y_train_r)

LogisticRegression(C=0.001, random_state=20)

In [33]:
ypred = clf_lr.predict(X_test)
score = roc_auc_score(y_test, ypred)

print(f"Test AUC score: {score}")
print(classification_report(y_test, ypred))

print("Min accuracy to beat for just random guessing in the TEST set:")
print(len(y_test[y_test == 0])/len(y_test))

Test AUC score: 0.6321548439522725
              precision    recall  f1-score   support

           0       0.91      0.68      0.78      1733
           1       0.22      0.58      0.32       267

    accuracy                           0.67      2000
   macro avg       0.57      0.63      0.55      2000
weighted avg       0.82      0.67      0.72      2000

Min accuracy to beat for just random guessing in the TEST set:
0.8665


In [45]:
clf_rf = RandomForestClassifier(n_estimators=200, 
                                random_state=seed,
                                min_samples_split=4,
                                min_samples_leaf=2,
                                oob_score=True)
clf_rf.fit(x_train_r, y_train_r)

RandomForestClassifier(min_samples_leaf=2, min_samples_split=4,
                       n_estimators=200, oob_score=True, random_state=20)

In [46]:
ypred = clf_rf.predict(X_test)
score = roc_auc_score(y_test, ypred)

print(f"Test AUC score: {score}")
print(classification_report(y_test, ypred))

print("Min accuracy to beat for just random guessing in the TEST set:")
print(len(y_test[y_test == 0])/len(y_test))

Test AUC score: 0.5558912582583946
              precision    recall  f1-score   support

           0       0.88      1.00      0.94      1733
           1       0.97      0.11      0.20       267

    accuracy                           0.88      2000
   macro avg       0.92      0.56      0.57      2000
weighted avg       0.89      0.88      0.84      2000

Min accuracy to beat for just random guessing in the TEST set:
0.8665


In [28]:
import lightgbm as lgb

dtrain = lgb.Dataset(x_train_r, label=y_train_r)
dvalid = lgb.Dataset(X_test, label=y_test)

param = {'num_leaves': 64, 'objective': 'binary'}
param['metric'] = 'auc'
num_round = 1000
bst = lgb.train(param, dtrain, num_round, valid_sets=[dvalid], early_stopping_rounds=20, verbose_eval=False)

In [31]:
ypred = bst.predict(X_test)
ypred_thresh = [1 if x >= 0.5 else 0 for x in ypred]

score = roc_auc_score(y_test, ypred_thresh)

print(f"Test AUC score: {score}")
print(classification_report(y_test, ypred_thresh))

print("Min accuracy to beat for just random guessing in the TEST set:")
print(len(y_test[y_test == 0])/len(y_test))

Test AUC score: 0.5900291974904422
              precision    recall  f1-score   support

           0       0.89      1.00      0.94      1733
           1       0.89      0.18      0.30       267

    accuracy                           0.89      2000
   macro avg       0.89      0.59      0.62      2000
weighted avg       0.89      0.89      0.85      2000

Min accuracy to beat for just random guessing in the TEST set:
0.8665


In [49]:
from catboost import CatBoostClassifier


### Gradient Boosted Methods + Kfold Cross-validation

In [187]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import RepeatedStratifiedKFold
from lightgbm import LGBMClassifier

kfold = KFold(n_splits=10)


clf_lgb = LGBMClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=seed)
n_scores = cross_val_score(clf_lgb, x_train_r, y_train_r, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
#clf_lgb.fit(x_train_r, y_train_r)

In [190]:
for train_indices, test_indices in kfold.split(x_train_r):
    print('Train: %s | test: %s' % (train_indices, test_indices))

Train: [ 1569  1570  1571 ... 15681 15682 15683] | test: [   0    1    2 ... 1566 1567 1568]
Train: [    0     1     2 ... 15681 15682 15683] | test: [1569 1570 1571 ... 3135 3136 3137]
Train: [    0     1     2 ... 15681 15682 15683] | test: [3138 3139 3140 ... 4704 4705 4706]
Train: [    0     1     2 ... 15681 15682 15683] | test: [4707 4708 4709 ... 6273 6274 6275]
Train: [    0     1     2 ... 15681 15682 15683] | test: [6276 6277 6278 ... 7841 7842 7843]
Train: [    0     1     2 ... 15681 15682 15683] | test: [7844 7845 7846 ... 9409 9410 9411]
Train: [    0     1     2 ... 15681 15682 15683] | test: [ 9412  9413  9414 ... 10977 10978 10979]
Train: [    0     1     2 ... 15681 15682 15683] | test: [10980 10981 10982 ... 12545 12546 12547]
Train: [    0     1     2 ... 15681 15682 15683] | test: [12548 12549 12550 ... 14113 14114 14115]
Train: [    0     1     2 ... 14113 14114 14115] | test: [14116 14117 14118 ... 15681 15682 15683]


In [184]:
n_scores

array([0.93499044, 0.94582537, 0.93945188, 0.9292543 , 0.93813776,
       0.9375    , 0.93686224, 0.9317602 , 0.93239796, 0.92538265,
       0.94518802, 0.93881453, 0.93116635, 0.9292543 , 0.93112245,
       0.9375    , 0.93558673, 0.93558673, 0.94132653, 0.92091837,
       0.93116635, 0.93116635, 0.93371574, 0.94072658, 0.93813776,
       0.94706633, 0.93239796, 0.92283163, 0.93622449, 0.93558673])

In [181]:
#ypred = clf_lgb.predict(validation[feature_cols])
#score = metrics.roc_auc_score(validation['is_bad'], ypred)

ypred = clf_lgb.predict(X_test)
score = metrics.roc_auc_score(y_test, ypred)

print(f"Test AUC score: {score}")
print(metrics.classification_report(y_test, ypred))

print("Min accuracy to beat for just random guessing in the TEST set:")
print(len(validation[validation.is_bad == 0])/len(validation))

NotFittedError: Estimator not fitted, call `fit` before exploiting the model.