In [1]:
# Initial setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
plt.style.use('seaborn-white')
%matplotlib inline

In [2]:
# Load the lending data set used for model training and preview the first rows
lend = pd.read_csv(filepath_or_buffer='data/lending_ml.csv')
display(lend.head(n=5))

Unnamed: 0,purpose,yr_credit,dti,revol_util_dec,total_acc,addr_state,target
0,credit_card,26.0,27.65,0.837,9.0,AZ,1
1,car,12.0,1.0,0.094,4.0,GA,0
2,small_business,10.0,8.72,0.985,10.0,IL,1
3,other,15.0,20.0,2.1,37.0,CA,1
4,other,15.0,17.94,0.539,38.0,OR,1


### Features Engineering

1. Get dummy variables for purpose

In [3]:
# One-hot encode the loan purpose; the first level is dropped and acts as the reference category
purpose = pd.get_dummies(lend.purpose, drop_first=True)
lend = pd.concat(objs=[lend, purpose], axis='columns')

2. Get dummy variable for addr_state

Note from the inferential-statistics part of the exercise: some states, such as IA and MS, have so few records that treating their labels as features may lead to overfitting later. We replace every state label that has 200 or fewer records with SML so that the model recognizes them as low-applicant states in the feature columns.

In [4]:
# Minimum number of records (exclusive) a state needs to keep its own label
MIN_STATE_RECORDS = 200

# create a crosstab to get count by states (margins=True adds an 'All' total row and column)
table_state = pd.crosstab(lend['addr_state'], lend['target'], margins=True)

# Per-state totals only: the 'All' margin row is the grand total, not a state,
# so it must be removed before filtering or it ends up in the state list.
state_totals = table_state['All'].drop('All')

# keep the states that have more than MIN_STATE_RECORDS records, put that into a list
LT_list = state_totals[state_totals > MIN_STATE_RECORDS].index.tolist()
LT = pd.DataFrame(LT_list) # to be used later for the test set

In [5]:
# Temporary helper column (dropped later): True when the record's state is NOT in LT_list
in_listed_state = lend['addr_state'].isin(LT_list)
lend['helper_col'] = ~in_listed_state

In [6]:
# Records flagged by the helper column come from a small state and get the label 'SML';
# every other record keeps its original state label.
is_small_state = lend['helper_col'] == 1
lend['state'] = np.where(is_small_state, 'SML', lend['addr_state'])

In [7]:
# One-hot encode the collapsed state label; the first level is dropped as the reference category
state = pd.get_dummies(lend.state, drop_first=True)
lend = pd.concat(objs=[lend, state], axis='columns')

3. Get categorical feature column label for DTI grouping
    - In the inferential-statistics portion, DTI at the individual level does not show a significant influence on the paid-off rate, but at the binned level it does. Hence, we create a new column to hold the categorical DTI.
    - We will, however, test both with and without binning to see whether it actually improves model accuracy.

In [8]:
# create function for grouping dti based on the value range
def dti(data):
    """Return the DTI bucket label for a single record.

    ``data`` is any object that can be indexed with ``'dti'`` (for example a
    DataFrame row). The label is the upper bound of the range the value
    falls in: 5 for values up to 5, then 10, 15, 20 and 30 for the ranges
    (5, 10], (10, 15], (15, 20] and (20, 30]. Anything else (values above
    30, or values such as NaN for which every comparison is false) gets 40.
    """
    value = data['dti']
    # Bounds are scanned in ascending order, so the first one that is not
    # exceeded identifies the bucket.
    for upper_bound in (5, 10, 15, 20, 30):
        if value <= upper_bound:
            return upper_bound
    return 40

In [9]:
# Bucket every record's DTI; dti() is applied row-wise so it receives one record at a time
lend['dti_gp'] = lend.apply(func=dti, axis='columns')

In [10]:
# One-hot encode the DTI buckets (all levels are kept here) and append them to the frame
dti_gp = pd.get_dummies(lend.dti_gp)
lend = pd.concat(objs=[lend, dti_gp], axis='columns')

4. Fill missing values in revol_util_dec with 0

In [11]:
# New column holding revol_util_dec with missing values replaced by 0
lend['revol'] = lend['revol_util_dec'].fillna(value=0)

In [12]:
# drop the redundant columns and form the final dataframe for machine learning
redundant_cols = ['addr_state', 'helper_col', 'dti_gp', 'dti', 'state', 'purpose', 'revol_util_dec']
lend_fin = (
    lend.drop(columns=redundant_cols)
        # 'Unnamed: 0' (visible in the preview of `lend`) looks like a row index
        # that was saved into the CSV; it carries no predictive information and
        # must not be fed to the models as a feature. errors='ignore' keeps this
        # cell working when the column is absent.
        .drop(columns=['Unnamed: 0'], errors='ignore')
)

In [13]:
# Separate the model inputs (every column except the label) from the label itself
feature_frame = lend_fin.drop(columns=['target'])
features = feature_frame.values
target = lend_fin['target'].values

# Sanity check: both arrays must have the same number of rows
print(features.shape, target.shape, sep='\n')

(36182, 52)
(36182,)


In [14]:
# Create a function for feature engineering steps above to streamline future data processing
def data_clean(dataset):
    """Apply the notebook's feature-engineering steps to ``dataset``.

    Steps, mirroring the cells above:
      * one-hot encode ``purpose`` (first level dropped);
      * relabel every ``addr_state`` that is not in the notebook-level
        ``LT_list`` as ``'SML'`` and one-hot encode the result (first level
        dropped);
      * bucket ``dti`` with the notebook-level ``dti()`` function and one-hot
        encode the buckets;
      * copy ``revol_util_dec`` into ``revol`` with missing values set to 0;
      * drop the raw and helper columns.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns ``purpose``, ``addr_state``, ``dti`` and
        ``revol_util_dec``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered columns; ``dataset`` itself is left
        unmodified.

    NOTE(review): the dummy columns are derived from the categories present
    in ``dataset``, so they are not guaranteed to match the training
    columns in number or order -- confirm before feeding the result to a
    model fitted on the training frame.
    """
    # Work on a copy so the caller's frame does not silently gain helper columns.
    cleaned = dataset.copy()

    purpose = pd.get_dummies(cleaned['purpose'], drop_first=True)

    cleaned['helper_col'] = ~(cleaned['addr_state'].isin(LT_list))
    cleaned['state'] = np.where(cleaned['helper_col'] == 1, 'SML', cleaned['addr_state'])
    state = pd.get_dummies(cleaned['state'], drop_first=True)

    # Every step reads from the frame passed in; the notebook-level `lend`
    # training frame must not be used here.
    cleaned['dti_gp'] = cleaned.apply(dti, axis=1)
    dti_gp = pd.get_dummies(cleaned['dti_gp'])

    cleaned['revol'] = cleaned['revol_util_dec'].fillna(0)

    cleaned = pd.concat([cleaned, purpose, state, dti_gp], axis=1)
    return cleaned.drop(
        ['addr_state', 'helper_col', 'dti_gp', 'dti', 'state', 'purpose', 'revol_util_dec'],
        axis=1,
    )

### Base Loan Paid off rate

In [15]:
# Share of records whose target is 1, i.e. the score of always predicting 1
paid_off_rate = np.sum(target) / len(target)
'{:.2%}'.format(paid_off_rate)

'85.29%'

This is the baseline accuracy: a model that always predicts 1 for the target achieves 85.29% accuracy. We are dealing with imbalanced sample data, so a model needs an accuracy significantly higher than this to prove that it is helpful.

In [16]:
# Hold out 30% of the records as a test set shared by every model below;
# the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=5,
)

### Apply Logistic Regression

In [17]:
# Fit a logistic regression on the training data (fit() returns the estimator itself)
clf = LogisticRegression(solver='liblinear').fit(X_train, y_train)

# Score the predictions for the held-out test data
y_pred_clf = clf.predict(X_test)
for template, metric in (
    ('Precision {:.2}', precision_score),
    ('Recall {:.2}', recall_score),
    ('F1 score {:.2}', f1_score),
):
    print(template.format(metric(y_test, y_pred_clf)))

Precision 0.85
Recall 1.0
F1 score 0.92


In [18]:
# Cross validation
def cv_score(clf, x, y, score_func=f1_score, nfold=5):
    """Return the mean of ``score_func`` over ``nfold`` cross-validation folds.

    Parameters
    ----------
    clf : estimator
        Object with ``fit(X, y)`` and ``predict(X)``; it is re-fitted on every
        fold, so it is left fitted on the last training split.
    x, y : array-like
        Features and labels; both are indexed with the integer index arrays
        produced by ``KFold(nfold).split(x)``.
    score_func : callable, default ``f1_score``
        Called as ``score_func(y_true, y_pred)``.
    nfold : int, default 5
        Number of folds.

    Returns
    -------
    float
        Average score over the held-out folds.
    """
    result = 0
    for train, test in KFold(nfold).split(x):  # split data into train/test groups, nfold times
        clf.fit(x[train], y[train])  # fit on the training part of the fold
        # Metrics take the true labels first and the predictions second. The
        # order does not matter for F1 but it does for asymmetric metrics such
        # as precision or recall.
        result += score_func(y[test], clf.predict(x[test]))
    return result / nfold  # average

In [19]:
# Cross-validated F1 score of the untuned logistic regression.
# The estimator is not unregularized: it keeps scikit-learn's default
# L2 penalty with C=1.0, so the message states that explicitly.
clf = LogisticRegression(solver='liblinear')
score = cv_score(clf, X_train, y_train)
print('F1 score with default regularization (C=1.0): {:.2%}'.format(score))

F1 score without reguluarization: 92.08%


With accuracy almost identical to the baseline, this base logistic regression does not help improve the prediction of the default rate. The F1 score is 0.92; we will use this as the baseline benchmark and try to improve on it.

#### Tuning Model

In [20]:
# Grid search over the inverse regularization strength C, scored by F1 with 5-fold CV
Cs = [0.001, 0.1, 1, 10, 100]
gridsearch = GridSearchCV(clf, {'C': Cs}, scoring='f1', cv=5).fit(X_train, y_train)

# Report the winning setting and its mean cross-validated score
for template, value in (
    ('best parameter: {}', gridsearch.best_params_),
    ('best score: {:.2%}', gridsearch.best_score_),
):
    print(template.format(value))

best parameter: {'C': 0.001}
best score: 92.08%


No improvement in the F1 score. We will still set C in the model to avoid overfitting the data. However, logistic regression does not appear to help in predicting the loan default rate.

### Apply Random Forest

#### Base Model

In [21]:
# Baseline random forest: 100 bootstrapped trees, sqrt(n_features) candidates per split, fixed seed
rf_settings = dict(n_estimators=100, bootstrap=True, max_features='sqrt', random_state=33)
model = RandomForestClassifier(**rf_settings).fit(X_train, y_train)

# Predictions for the held-out test set
y_pred_rf = model.predict(X_test)

In [22]:
# Confusion matrix and per-class report for the baseline forest, followed by its F1 score
for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred_rf))
rf_f1 = f1_score(y_test, y_pred_rf)
print('F1 Score is {:.2%}'.format(rf_f1))

[[  32 1571]
 [ 103 9149]]
              precision    recall  f1-score   support

           0       0.24      0.02      0.04      1603
           1       0.85      0.99      0.92      9252

   micro avg       0.85      0.85      0.85     10855
   macro avg       0.55      0.50      0.48     10855
weighted avg       0.76      0.85      0.79     10855

F1 Score is 91.62%


The random forest baseline model, unfortunately, performs worse than the baseline accuracy. We will identify parameters that may help improve the performance.

#### Tuning Model

In [23]:
# Hyper-parameter values explored by the random-forest grid search
param_grid = dict(
    n_estimators=[100, 200, 500, 1000],
    max_depth=[4, 6, 8, 10],
    criterion=['gini', 'entropy'],
)

In [24]:
# Random forest to be tuned; the seed is fixed so the search is reproducible
rfc = RandomForestClassifier(random_state=33, bootstrap=True, max_features='sqrt')

# 5-fold grid search over param_grid. scoring='f1' makes model selection use the
# same metric as the logistic-regression grid search and as the results reported
# in this notebook; without it GridSearchCV falls back to the classifier's
# default score (accuracy).
CV_model = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5, n_jobs=3, scoring='f1')

In [25]:
# Run the grid search; fit() returns the search object itself, so best_model
# refers to the same fitted GridSearchCV instance as CV_model.
CV_model.fit(X_train, y_train)
best_model = CV_model

In [26]:
# Hyper-parameter combination selected by the grid search
best_model.best_params_

{'criterion': 'gini', 'max_depth': 4, 'n_estimators': 100}

In [27]:
# Estimator built with the selected hyper-parameters
best_model.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=33, verbose=0, warm_start=False)

In [28]:
# Predict the test set with the estimator refitted on the best parameter combination
tuned_forest = best_model.best_estimator_
y_pred_tune = tuned_forest.predict(X_test)

In [29]:
# F1 score of the tuned random forest on the held-out test set
tuned_f1 = f1_score(y_test, y_pred_tune)
print('F1 Score is {:.2%}'.format(tuned_f1))

F1 Score is 92.03%


The F1 score did not necessarily improve.

In [30]:
# importing error test data
# A forward slash is used as the path separator, matching the training-data
# path above: it works on every platform, whereas 'data\lending_test.csv' only
# resolves on Windows and contains the invalid escape sequence '\l'.
test_error = pd.read_csv('data/lending_test.csv')
print(test_error.head())
print(test_error.shape)

              purpose  yr_credit    dti  revol_util_dec  total_acc addr_state  \
0  debt_consolidation         24  15.55           0.444       22.0         MA   
1  debt_consolidation         15  16.73           0.545       41.0         NY   
2  debt_consolidation         22  15.75           0.346       16.0         CO   
3  debt_consolidation         19  18.55           0.546       31.0         CA   
4  debt_consolidation         14  27.06           0.709       17.0         CA   

   target  
0     1.0  
1     1.0  
2     0.0  
3     1.0  
4     1.0  
(188180, 7)


In [31]:
# Run the error test data through data_clean()
test_error_clean = data_clean(test_error)

In [32]:
# Obtain the features from the cleaned error test set and the target from the raw one
test_feature_frame = test_error_clean.drop(columns=['target'])
features_test = test_feature_frame.values
target_test = test_error['target'].values