## Modeling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Read in the data from the .pkl
df = pd.read_pickle("./data/fe_data.pkl")

num_cols = list(df._get_numeric_data().columns)
cat_cols = list(set(df.columns) - set(df._get_numeric_data().columns))
target = ['is_bad']
num_cols.remove('is_bad')

In [2]:
df[cat_cols].dtypes

zip_code               object
inq_last_6mths_bin     object
delinq_2yrs_bin        object
policy_code            object
pymnt_plan             object
addr_state             object
home_ownership         object
initial_list_status    object
verification_status    object
purpose_cat            object
dtype: object

In [3]:
df[num_cols].dtypes

emp_length                       int64
annual_inc                     float64
debt_to_income                 float64
delinq_2yrs                    float64
inq_last_6mths                 float64
mths_since_last_delinq         float64
mths_since_last_record           int64
open_acc                       float64
pub_rec                          int64
revol_bal                        int64
revol_util                     float64
total_acc                      float64
mths_since_last_major_derog      int64
cr_line_yrs                    float64
cr_line_mths                     int64
dtype: object

In [4]:
df[target].dtypes

is_bad    int64
dtype: object

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 26 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   is_bad                       10000 non-null  int64  
 1   emp_length                   10000 non-null  int64  
 2   home_ownership               10000 non-null  object 
 3   annual_inc                   9999 non-null   float64
 4   verification_status          10000 non-null  object 
 5   pymnt_plan                   10000 non-null  object 
 6   purpose_cat                  10000 non-null  object 
 7   zip_code                     10000 non-null  object 
 8   addr_state                   10000 non-null  object 
 9   debt_to_income               10000 non-null  float64
 10  delinq_2yrs                  10000 non-null  float64
 11  inq_last_6mths               10000 non-null  float64
 12  mths_since_last_delinq       10000 non-null  float64
 13  mths_since_last_r

In [6]:
# Impute more of the missing data
df['annual_inc'].fillna(df.annual_inc.median(), inplace=True)

In [7]:
df['open_acc'].fillna(df.open_acc.mode()[0], inplace=True)
df['open_acc'] = df['open_acc'].astype(int)

In [8]:
df['revol_util'].fillna(df.revol_util.mean(), inplace=True)

In [9]:
df['total_acc'].fillna(df.total_acc.median(), inplace=True)
df['total_acc'] = df['total_acc'].astype(int)

In [10]:
df['cr_line_yrs'].fillna(df.cr_line_yrs.median(), inplace=True)
df['cr_line_yrs'] = df['cr_line_yrs'].astype(int)

In [11]:
def zip_region(x):
    return x[1:3]

df['zip_region'] = df.zip_code.apply(zip_region)
df.drop('zip_code', axis=1, inplace=True)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 26 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   is_bad                       10000 non-null  int64  
 1   emp_length                   10000 non-null  int64  
 2   home_ownership               10000 non-null  object 
 3   annual_inc                   10000 non-null  float64
 4   verification_status          10000 non-null  object 
 5   pymnt_plan                   10000 non-null  object 
 6   purpose_cat                  10000 non-null  object 
 7   addr_state                   10000 non-null  object 
 8   debt_to_income               10000 non-null  float64
 9   delinq_2yrs                  10000 non-null  float64
 10  inq_last_6mths               10000 non-null  float64
 11  mths_since_last_delinq       10000 non-null  float64
 12  mths_since_last_record       10000 non-null  int64  
 13  open_acc         

#### Label Encode and One-Hot Encode Categorical Variables
- NOTE: I may need to reduce the dimensions after the one-hot encoding, but we will see how this goes first

In [15]:
from sklearn import preprocessing

count = 0

for col in df:
    if df[col].dtype == 'object':
        if len(list(df[col].unique())) <= 2:     
            le = preprocessing.LabelEncoder()
            df[col] = le.fit_transform(df[col])
            count += 1
            print (col)
            
print('%d columns were label encoded.' % count)

pymnt_plan
initial_list_status
delinq_2yrs_bin
inq_last_6mths_bin
4 columns were label encoded.


In [16]:
df = pd.get_dummies(df)
print(df.shape)

(10000, 210)


In [191]:
# Save out the data with basic feature engineering
df.to_pickle("./data/explore_data.pkl")

In [69]:
num_cols = list(df._get_numeric_data().columns)
cat_cols = list(set(df.columns) - set(df._get_numeric_data().columns))
target = ['is_bad']
num_cols.remove('is_bad')

In [165]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
seed = 20

# Try just using a train/test split at first without sorting the values by time (I don't know how truly time-based this model will be)
X_train, X_test, y_train, y_test = train_test_split(df.drop('is_bad',axis=1), df['is_bad'], test_size=0.10, random_state=seed)

In [166]:
# You need to try and balance the classes within the training set
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=seed)
x_train_r, y_train_r = sm.fit_resample(X_train, y_train)

In [169]:
len(y_train_r)

15684

In [170]:
# Start with simple logistic regression classifier for second baseline attempt
clf_lr = LogisticRegression(C = 0.0001, random_state=seed)
clf_lr.fit(x_train_r, y_train_r)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=0.0001, random_state=20)

In [171]:
ypred = clf_lr.predict(X_test)
score = roc_auc_score(y_test, ypred)

print(f"Test AUC score: {score}")
print(classification_report(y_test, ypred))

print("Min accuracy to beat for just random guessing in the TEST set:")
print(len(y_test[y_test == 0])/len(y_test))

Test AUC score: 0.5815987346804138
              precision    recall  f1-score   support

           0       0.90      0.56      0.69       863
           1       0.18      0.61      0.28       137

    accuracy                           0.56      1000
   macro avg       0.54      0.58      0.48      1000
weighted avg       0.80      0.56      0.63      1000

Min accuracy to beat for just random guessing in the TEST set:
0.863


In [173]:
clf_rf = RandomForestClassifier(n_estimators=100, random_state=seed)
clf_rf.fit(x_train_r, y_train_r)

RandomForestClassifier(random_state=20)

In [174]:
ypred = clf_rf.predict(X_test)
score = roc_auc_score(y_test, ypred)

print(f"Test AUC score: {score}")
print(classification_report(y_test, ypred))

print("Min accuracy to beat for just random guessing in the TEST set:")
print(len(y_test[y_test == 0])/len(y_test))

Test AUC score: 0.5134397915944211
              precision    recall  f1-score   support

           0       0.87      1.00      0.93       863
           1       0.67      0.03      0.06       137

    accuracy                           0.86      1000
   macro avg       0.77      0.51      0.49      1000
weighted avg       0.84      0.86      0.81      1000

Min accuracy to beat for just random guessing in the TEST set:
0.863


In [175]:
import lightgbm as lgb

dtrain = lgb.Dataset(x_train_r, label=y_train_r)
dvalid = lgb.Dataset(X_test, label=y_test)

param = {'num_leaves': 64, 'objective': 'binary'}
param['metric'] = 'auc'
num_round = 1000
bst = lgb.train(param, dtrain, num_round, valid_sets=[dvalid], early_stopping_rounds=20, verbose_eval=False)

In [178]:
from sklearn import metrics
ypred = bst.predict(X_test)
ypred_thresh = [1 if x >= 0.5 else 0 for x in ypred]

score = metrics.roc_auc_score(y_test, ypred_thresh)

print(f"Test AUC score: {score}")
print(metrics.classification_report(y_test, ypred_thresh))

print("Min accuracy to beat for just random guessing in the TEST set:")
print(len(test[test.is_bad == 0])/len(test))

Test AUC score: 0.5846943694970017
              precision    recall  f1-score   support

           0       0.88      0.99      0.94       863
           1       0.83      0.18      0.29       137

    accuracy                           0.88      1000
   macro avg       0.86      0.58      0.61      1000
weighted avg       0.88      0.88      0.85      1000

Min accuracy to beat for just random guessing in the TEST set:
0.856


### Gradient Boosted Methods + Kfold Cross-validation

In [187]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import RepeatedStratifiedKFold
from lightgbm import LGBMClassifier

kfold = KFold(n_splits=10)


clf_lgb = LGBMClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=seed)
n_scores = cross_val_score(clf_lgb, x_train_r, y_train_r, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
#clf_lgb.fit(x_train_r, y_train_r)

In [190]:
for train_indices, test_indices in kfold.split(x_train_r):
    print('Train: %s | test: %s' % (train_indices, test_indices))

Train: [ 1569  1570  1571 ... 15681 15682 15683] | test: [   0    1    2 ... 1566 1567 1568]
Train: [    0     1     2 ... 15681 15682 15683] | test: [1569 1570 1571 ... 3135 3136 3137]
Train: [    0     1     2 ... 15681 15682 15683] | test: [3138 3139 3140 ... 4704 4705 4706]
Train: [    0     1     2 ... 15681 15682 15683] | test: [4707 4708 4709 ... 6273 6274 6275]
Train: [    0     1     2 ... 15681 15682 15683] | test: [6276 6277 6278 ... 7841 7842 7843]
Train: [    0     1     2 ... 15681 15682 15683] | test: [7844 7845 7846 ... 9409 9410 9411]
Train: [    0     1     2 ... 15681 15682 15683] | test: [ 9412  9413  9414 ... 10977 10978 10979]
Train: [    0     1     2 ... 15681 15682 15683] | test: [10980 10981 10982 ... 12545 12546 12547]
Train: [    0     1     2 ... 15681 15682 15683] | test: [12548 12549 12550 ... 14113 14114 14115]
Train: [    0     1     2 ... 14113 14114 14115] | test: [14116 14117 14118 ... 15681 15682 15683]


In [184]:
n_scores

array([0.93499044, 0.94582537, 0.93945188, 0.9292543 , 0.93813776,
       0.9375    , 0.93686224, 0.9317602 , 0.93239796, 0.92538265,
       0.94518802, 0.93881453, 0.93116635, 0.9292543 , 0.93112245,
       0.9375    , 0.93558673, 0.93558673, 0.94132653, 0.92091837,
       0.93116635, 0.93116635, 0.93371574, 0.94072658, 0.93813776,
       0.94706633, 0.93239796, 0.92283163, 0.93622449, 0.93558673])

In [181]:
#ypred = clf_lgb.predict(validation[feature_cols])
#score = metrics.roc_auc_score(validation['is_bad'], ypred)

ypred = clf_lgb.predict(X_test)
score = metrics.roc_auc_score(y_test, ypred)

print(f"Test AUC score: {score}")
print(metrics.classification_report(y_test, ypred))

print("Min accuracy to beat for just random guessing in the TEST set:")
print(len(validation[validation.is_bad == 0])/len(validation))

NotFittedError: Estimator not fitted, call `fit` before exploiting the model.