In [1]:
# Initial setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
plt.style.use('seaborn-white')
%matplotlib inline

In [2]:
# Read off data
lend = pd.read_csv('data/lending_ml.csv')
display(lend.head())
print(lend.shape)

Unnamed: 0,home_ownership,purpose,yr_credit,dti,total_acc,addr_state,target
0,RENT,credit_card,29.0,27.65,9.0,AZ,1
1,RENT,car,15.0,1.0,4.0,GA,0
2,RENT,small_business,13.0,8.72,10.0,IL,1
3,RENT,other,18.0,20.0,37.0,CA,1
4,RENT,other,18.0,17.94,38.0,OR,1


(89549, 7)


#### This is a classification problem. 

The features being selected from inference statistic are 
- homeownership
- loan purpose
- years of credit
- debt to income ratio (DTI),
- total credit lines
- applicant's State

For the first phrase modeling, we will focus on the numeric variables, which are years of credit, DTI, and total credit lines. Remaining variable will be evaluated in the next phrase for model improvement.

#### Create features and target numpy array

In [3]:
features = lend[['dti', 'yr_credit', 'total_acc']].values
target = lend.target.values

# check shape of the features and target sets
print(features.shape)
print(target.shape)

(89549, 3)
(89549,)


#### Split the data into train and test set for all models training

In [4]:
X_train, X_val, y_train, y_val = train_test_split(features, target, test_size = .25, random_state=5)
print(X_train.shape)
print(X_val.shape)
print(y_train.shape)
print(y_val.shape)

(67161, 3)
(22388, 3)
(67161,)
(22388,)


### Baseline Model

Assume targets to be all 1s or 0s. We use F1 score for evaluation.

In [5]:
base_f1 = f1_score(y_val, np.ones(len(y_val)))
base_misclassify = 1 - precision_score(y_val, np.ones(len(y_val)))
print('Confusion Matrix \n {}'.format(confusion_matrix(y_val, np.ones(len(y_val)))))

Confusion Matrix 
 [[    0  3431]
 [    0 18957]]


In [6]:
print('Baseline Model achieves F1 score of {:.3f} with misclassification rate of {:.2f}'.format(base_f1, base_misclassify))

Baseline Model achieves F1 score of 0.917 with misclassification rate of 0.15


Our goal is to **maximize F1 score** by minimal misclassification rates. 

#### Define Cross Validation Function

In [7]:
def cv_score(clf, x, y, score_func=f1_score):
    result = 0
    nfold = 5
    for train, test in KFold(nfold).split(x): # split data into train/test groups, 5 times
        clf.fit(x[train], y[train]) # fit
        result += score_func(clf.predict(x[test]), y[test]) # evaluate score function on held-out data
    return result / nfold # average

### Apply Logistic Regression

**Base Model**

In [8]:
lf = LogisticRegression(class_weight='balanced', solver='liblinear', random_state=33)

lf.fit(X_train, y_train)

y_pred_lf = lf.predict(X_val)
lf_f1_base = f1_score(y_val, y_pred_lf)
lf_misclassify_base = 1- accuracy_score(y_val, y_pred_lf)

In [9]:
print('Confusion Matrix \n {}'.format(confusion_matrix(y_val, y_pred_lf)))
print('Classification report \n {}'.format(classification_report(y_val, y_pred_lf)))
print('Logistics Regression Base Model achieves F1 score of {:.3f} with misclassification rate of {:.2f}'.format(lf_f1_base, lf_misclassify_base))

Confusion Matrix 
 [[ 1815  1616]
 [ 8588 10369]]
Classification report 
               precision    recall  f1-score   support

           0       0.17      0.53      0.26      3431
           1       0.87      0.55      0.67     18957

   micro avg       0.54      0.54      0.54     22388
   macro avg       0.52      0.54      0.47     22388
weighted avg       0.76      0.54      0.61     22388

Logistics Regression Base Model achieves F1 score of 0.670 with misclassification rate of 0.46


In [10]:
score_lf = cv_score(lf, X_train, y_train)
print('Cross validation f1 score for Logistic Regression Base Model: {:.4}'. format(score_lf))

Cross validation f1 score for Logistic Regression Base Model: 0.668


**Comment**: Comparing to baseline model, Logistic Regression help on limiting Type 1 error (approved loan requests that will default), it also lead to miss opportunity of introducing Type 2 error (rejected loan requests that would paid off)

#### Tunning Model

In [11]:
# Perform Grid search to look for the best C
Cs = [0.001, 0.1, 1, 10, 100]
gridsearch_lf = GridSearchCV(estimator=lf, param_grid={'C': Cs}, cv=5, scoring='f1')
gridsearch_lf.fit(X_train, y_train)
y_pred_lf_c = gridsearch_lf.best_estimator_.predict(X_val)
print('best parameter: {}'.format(gridsearch_lf.best_params_))
print('Confusion Matrix:\n {}'.format(confusion_matrix(y_val, y_pred_lf_c)))
print('Classification Report:\n {}'.format(classification_report(y_val, y_pred_lf_c)))
lf_f1_tune = f1_score(y_val, y_pred_lf_c)
lf_misclassify_tune = 1- accuracy_score(y_val, y_pred_lf_c)
score_lf_tune = gridsearch_lf.best_score_

best parameter: {'C': 10}
Confusion Matrix:
 [[ 1815  1616]
 [ 8585 10372]]
Classification Report:
               precision    recall  f1-score   support

           0       0.17      0.53      0.26      3431
           1       0.87      0.55      0.67     18957

   micro avg       0.54      0.54      0.54     22388
   macro avg       0.52      0.54      0.47     22388
weighted avg       0.76      0.54      0.61     22388



In [12]:
print('Logistics Regression Tuned Model achieves F1 score of {:.3f} with misclassification rate of {:.2f}'.format(lf_f1_tune, lf_misclassify_tune))
print('Cross validation f1 score for Logistic Regression Tuned Model: {:.4}'. format(score_lf_tune))

Logistics Regression Tuned Model achieves F1 score of 0.670 with misclassification rate of 0.46
Cross validation f1 score for Logistic Regression Tuned Model: 0.668


Tunning does not improve model performance. We will still set C =10 to enable more consistent model performance when it is being applied to new data.

### Apply Random Forest

#### Base Model

In [13]:
# Create the model with 100 trees
rf = RandomForestClassifier(n_estimators=100, random_state=33, class_weight='balanced_subsample')

rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_val)
rf_f1_base = f1_score(y_val, y_pred_rf)
rf_misclassify_base = 1- accuracy_score(y_val, y_pred_rf)

In [14]:
print('Confusion matrix: \n {}'.format(confusion_matrix(y_val,y_pred_rf)))
print('Classification report \n {}'.format(classification_report(y_val, y_pred_rf)))
print('Random Forest Base Model achieves F1 score of {:.3f} with misclassification rate of {:.2f}'.format(rf_f1_base, rf_misclassify_base))

Confusion matrix: 
 [[  354  3077]
 [ 1584 17373]]
Classification report 
               precision    recall  f1-score   support

           0       0.18      0.10      0.13      3431
           1       0.85      0.92      0.88     18957

   micro avg       0.79      0.79      0.79     22388
   macro avg       0.52      0.51      0.51     22388
weighted avg       0.75      0.79      0.77     22388

Random Forest Base Model achieves F1 score of 0.882 with misclassification rate of 0.21


In [15]:
score_rf = cv_score(rf, X_train, y_train)
print('Cross validation f1 score for Random Forest Base Model: {:.4}'.format(score_rf))

Cross validation f1 score for Random Forest Base Model: 0.8803


**Comment**: Random Forest F1 score with cross validation performs better than Logistic Regression, though below baseline model. The model start disqualifying bad loans, but also disqualifying good loans.

We will tune the model to seek for performance improvement.

#### Tunning model

Due to computational costs, we pick two more popular hyper-parameters (n_estimators and max_features) for tuning.

In [16]:
param_grid = { 
    'n_estimators': [10, 100, 500, 1000],
    'max_features' : ['auto', 'sqrt']
}

In [17]:
rf_1 = RandomForestClassifier(random_state=33, class_weight='balanced_subsample')

In [18]:
gridsearch_rf = GridSearchCV(estimator=rf_1, param_grid=param_grid, cv= 5, n_jobs=3)

In [19]:
gridsearch_rf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators='warn', n_jobs=None, oob_score=False,
            random_state=33, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=3,
       param_grid={'n_estimators': [10, 100, 500, 1000], 'max_features': ['auto', 'sqrt']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [20]:
print('best parameters: {}'.format(gridsearch_rf.best_params_))

best parameters: {'max_features': 'auto', 'n_estimators': 1000}


In [21]:
# predicting using the model with the best estimators
y_pred_rf_tune = gridsearch_rf.best_estimator_.predict(X_val)
rf_f1_tune = f1_score(y_val, y_pred_rf_tune)
rf_misclassify_tune = 1 - accuracy_score(y_val, y_pred_rf_tune)

In [22]:
print('Confusion matrix: \n {}'.format(confusion_matrix(y_val,y_pred_rf)))
print('Classification report \n {}'.format(classification_report(y_val, y_pred_rf)))
print('Random Forest Tuned Model achieves F1 score of {:.3f} with misclassification rate of {:.2f}'.format(rf_f1_tune, rf_misclassify_tune))

Confusion matrix: 
 [[  354  3077]
 [ 1584 17373]]
Classification report 
               precision    recall  f1-score   support

           0       0.18      0.10      0.13      3431
           1       0.85      0.92      0.88     18957

   micro avg       0.79      0.79      0.79     22388
   macro avg       0.52      0.51      0.51     22388
weighted avg       0.75      0.79      0.77     22388

Random Forest Tuned Model achieves F1 score of 0.882 with misclassification rate of 0.21


Do not see improvement in F1 score. We will keep the base Random Forest classifier.

### Performance Summary

|Model | F1 score (cross validated)| misclassification rate |
|----- | -------|------|
|Baseline | 0.917| .15 |
|Logistic Regression | 0.670 | .46 |
|Random Forest | 0.883 | .21 |

### Model Recommendation

While neither model has F1 score higher than baseline, Random Forest has a high enough f1 score that we can keep improving on. We recommend Random Forest as the winning model.

### Measure errors

In [23]:
# importing error test data (2013)
test_data = pd.read_csv('data\lending_test.csv')
display(test_data.head())
print(test_data.shape)

   loan_amnt  funded_amnt  funded_amnt_inv        term int_rate  installment  \
0    10000.0      10000.0          10000.0   36 months    9.67%       321.13   
1    20800.0      20800.0          20800.0   36 months   13.53%       706.16   
2     8000.0       8000.0           8000.0   36 months   10.99%       261.88   
3    28000.0      28000.0          28000.0   36 months    7.62%       872.52   
4    11500.0      11500.0          11500.0   60 months   22.90%       323.54   

  grade sub_grade           emp_title emp_length  ... total_bal_ex_mort  \
0     B        B1    Registered Nurse    7 years  ...           39143.0   
1     B        B5  Operations Manager  10+ years  ...           23473.0   
2     B        B2       PARTS MANAGER    2 years  ...           15949.0   
3     A        A3  Area Sales Manager    5 years  ...          199739.0   
4     E        E4           Secretary    4 years  ...           24724.0   

   total_bc_limit total_il_high_credit_limit hardship_flag  \
0     

Getting the needed features and target from the test set

In [24]:
features_test = test_data[['yr_credit', 'dti', 'total_acc']].values
target_test = test_data.target.values

print(features_test.shape)
print(target_test.shape)

(134814, 3)
(134814,)


**Apply Random Forest Base Model**

In [26]:
test_pred = rf.predict(features_test)
f1_test = f1_score(target_test, test_pred)
misclassify_test = 1 - accuracy_score(target_test, test_pred)

In [31]:
print('Confusion Matrix \n {}'.format(confusion_matrix(target_test, y_pred_test)))
print('Classification Report \n {}'.format(classification_report(target_test, y_pred_test)))
print('F1 score with test data is {:.4f} with misclassification rate of {:.3f}'.format(f1_test, misclassify_test))

Confusion Matrix 
 [[  1500  19527]
 [  8625 105162]]
Classification Report 
               precision    recall  f1-score   support

           0       0.15      0.07      0.10     21027
           1       0.84      0.92      0.88    113787

   micro avg       0.79      0.79      0.79    134814
   macro avg       0.50      0.50      0.49    134814
weighted avg       0.73      0.79      0.76    134814

F1 score with test data is 0.8815 with misclassification rate of 0.209


F1 score is similar to the train score. The model performs consistently when applying to new data.

### Conclusion

1. Random Forest has balanced performance (high F1 score). 
2. The model enables Lending Club to filter out some bad loan requests which would reduce profit loss driven by loan defaults. However, the model introduces type 2 error which represents loss revenue by rejecting loan requests that would paid off. Lending Club would need to weigh between the two trade offs and define the threshold tolerance on the potential misclassification. 
2. From Data Exploratory Analysis, we identified that loan purpose is likely to impact loan paid off rate. A lot of applicants work in the military.

**Side Notes**: Based on the data samples, we found out average default rate for Lending Club's loan is ~15%, which is higher than residential and consumer loans default rates within the United States among similar period based on [Federal Reserve Broad](https://www.federalreserve.gov/releases/chargeoff/delallsa.htm) There are motivations to continue fine-tuning the model to help Lending Club to lower the loan default rate to be more in line with the industry standard.

### Further Improvements

- Introduce categorical features into the model to seek further improvement on F1 score.
- Cross-validation should be done in the form of Time Series Nested Cross-Validation to avoid contamination of time component on predicting results.
- Access 1-2 more classification models to have a broader measurement for model performance. I would consider Extreme Gradient Boosting and Support Vector Machines, for example.
- Features engineering which includes clustering to capture potential patterns that were not captured by looking at one feature alone.
- Run correlation test across features to access potential collinearity.