# Capstone 2 - Lending Club Loan Paid or Charged Off Prediction

# Documentation

Documentation is the sixth step in the Data Science Method.  The following will be performed in this step:

1. Review the Results
2. Finalize Code
3. Finalize Documentation
4. Create a Project Report
5. Create a Slide Deck for the Executive Audience

In [1]:
#load python packages
import os
import pandas as pd
import pandas.api.types as ptypes
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import warnings # For handling error messages.
#warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.filterwarnings('ignore')

# Best Model - Gradient Boosting

In [2]:
# Read data output created in the EDA step
df = pd.read_csv("../EDA/data/Cap2_step3_output.csv", dtype={'verification_status_joint':object,'hardship_type':object, 'hardship_reason':object, 'hardship_status':object, 'hardship_loan_status':object, 'settlement_status':object}, index_col="Unnamed: 0")
df.head(5)

Unnamed: 0,loan_amnt,int_rate,annual_inc,dti,delinq_2yrs,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,pub_rec,revol_bal,...,initial_list_status,application_type,verification_status_joint,hardship_flag,hardship_type,hardship_reason,hardship_status,hardship_loan_status,debt_settlement_flag,settlement_status
0,2500,13.56,55000.0,18.24,0.0,1.0,0.0,45.0,1.0,4341,...,w,Individual,,N,,,,,N,
1,30000,18.94,90000.0,26.52,0.0,0.0,71.0,75.0,1.0,12315,...,w,Individual,,N,,,,,N,
2,5000,17.97,59280.0,10.51,0.0,0.0,0.0,0.0,0.0,4599,...,w,Individual,,N,,,,,N,
3,4000,18.94,92000.0,16.74,0.0,0.0,0.0,0.0,0.0,5468,...,w,Individual,,N,,,,,N,
4,30000,16.14,57250.0,26.35,0.0,0.0,0.0,0.0,0.0,829,...,w,Individual,,N,,,,,N,


In [18]:
df.shape

(2150627, 87)

In [3]:
# Drop unwanted columns
df_sel = df.drop(['title', 'zip_code', 'addr_state'], axis=1)

In [4]:
# Drop irrelevant loan_status

df_sel.reset_index()
df_sel = df_sel[~(df_sel['loan_status'] == 'In Grace Period')]
df_sel = df_sel[~(df_sel['loan_status'] == 'Current')]
df_sel = df_sel[~(df_sel['loan_status'] == 'Late (31-120 days)')]
df_sel = df_sel[~(df_sel['loan_status'] == 'Default')]
df_sel = df_sel[~(df_sel['loan_status'] == 'Late (16-30 days)')]
df_sel.shape

(1250411, 84)

In [5]:
# Loan status:
# Setting '%Fully Paid' to 0
# Setting '%Charges Off' to 1

df_sel['loan_status'] = df_sel['loan_status'].replace(['Fully Paid', 'Does not meet the credit policy. Status:Fully Paid'], '0')
df_sel['loan_status'] = df_sel['loan_status'].replace(['Charged Off', 'Does not meet the credit policy. Status:Charged Off'], '1')

In [6]:
df_sel['loan_status'] = pd.to_numeric(df_sel['loan_status'])

In [7]:
# Create a list of object columns.  
cols_obj = []
for col in df_sel.columns:
    if df_sel[col].dtype == object:
        cols_obj.append(col)

['term',
 'grade',
 'sub_grade',
 'home_ownership',
 'verification_status',
 'pymnt_plan',
 'initial_list_status',
 'application_type',
 'verification_status_joint',
 'hardship_flag',
 'hardship_type',
 'hardship_reason',
 'hardship_status',
 'hardship_loan_status',
 'debt_settlement_flag',
 'settlement_status']

In [8]:
df_sel = pd.concat([df_sel.drop(cols_obj, axis=1), pd.get_dummies(df_sel[cols_obj])], axis=1)
df_sel.head(5)

Unnamed: 0,loan_amnt,int_rate,annual_inc,dti,delinq_2yrs,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,pub_rec,revol_bal,...,hardship_loan_status_Current,hardship_loan_status_In Grace Period,hardship_loan_status_Issued,hardship_loan_status_Late (16-30 days),hardship_loan_status_Late (31-120 days),debt_settlement_flag_N,debt_settlement_flag_Y,settlement_status_ACTIVE,settlement_status_BROKEN,settlement_status_COMPLETE
100,30000,22.35,100000.0,30.46,0.0,0.0,51.0,84.0,1.0,15603,...,0,0,0,0,0,1,0,0,0,0
152,40000,16.14,45000.0,50.53,0.0,0.0,0.0,0.0,0.0,34971,...,0,0,0,0,0,1,0,0,0,0
170,20000,7.56,100000.0,18.92,0.0,0.0,48.0,0.0,0.0,25416,...,0,0,0,0,0,1,0,0,0,0
186,4500,11.31,38500.0,4.64,0.0,0.0,25.0,0.0,0.0,4472,...,0,0,0,0,0,1,0,0,0,0
269,20000,17.97,57000.0,22.18,0.0,0.0,0.0,0.0,0.0,33356,...,0,0,0,0,0,1,0,0,0,0


In [9]:
# Using a sample size of 200,000 for modeling, due to Memory error when using all observations.
df_sample = df_sel.sample(200000)
df_sample.head()

Unnamed: 0,loan_amnt,int_rate,annual_inc,dti,delinq_2yrs,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,pub_rec,revol_bal,...,hardship_loan_status_Current,hardship_loan_status_In Grace Period,hardship_loan_status_Issued,hardship_loan_status_Late (16-30 days),hardship_loan_status_Late (31-120 days),debt_settlement_flag_N,debt_settlement_flag_Y,settlement_status_ACTIVE,settlement_status_BROKEN,settlement_status_COMPLETE
971888,22800,18.2,59000.0,36.03,0.0,0.0,34.0,0.0,0.0,22567,...,0,0,0,0,0,0,1,1,0,0
754115,16750,23.13,65000.0,21.77,0.0,1.0,42.0,0.0,0.0,12109,...,0,0,0,0,0,0,1,1,0,0
1854644,30000,7.62,156000.0,16.43,0.0,1.0,0.0,0.0,0.0,42612,...,0,0,0,0,0,1,0,0,0,0
1764961,35000,21.49,103500.0,29.04,0.0,1.0,37.0,0.0,0.0,32855,...,0,0,0,0,0,1,0,0,0,0
2041180,11850,14.16,35000.0,28.16,0.0,0.0,0.0,0.0,0.0,15125,...,0,0,0,0,0,1,0,0,0,0


In [10]:
y = df_sample['loan_status']
X = df_sample.drop(['loan_status'], axis=1)

In [11]:
from sklearn.model_selection import train_test_split

y = y.ravel()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

In [12]:
from sklearn import preprocessing as ppg

scaler = ppg.StandardScaler().fit(X_train)
X_train_scaled=scaler.transform(X_train)
X_train_scaled

array([[-1.13328785,  0.2482422 , -1.04646236, ..., -0.10677954,
        -0.0602199 , -0.10324284],
       [-0.47347742, -0.87185443,  1.02278796, ..., -0.10677954,
        -0.0602199 , -0.10324284],
       [ 0.83128283,  0.80618508, -0.69727637, ..., -0.10677954,
        -0.0602199 , -0.10324284],
       ...,
       [ 1.83288694,  0.68196383,  0.87406059, ..., -0.10677954,
        -0.0602199 , -0.10324284],
       [ 0.00206162,  0.14296996,  0.21448705, ..., -0.10677954,
        -0.0602199 , -0.10324284],
       [-0.47347742,  0.42088868, -0.27049349, ..., -0.10677954,
        -0.0602199 , -0.10324284]])

In [13]:
X_test_scaled = scaler.transform(X_test)
X_test_scaled

array([[ 2.14793156,  2.95373878,  0.37614723, ..., -0.10677954,
        -0.0602199 , -0.10324284],
       [-0.47347742,  0.03769772, -0.43215367, ..., -0.10677954,
        -0.0602199 , -0.10324284],
       [-1.08276182,  1.19569237, -1.26890677, ..., -0.10677954,
        -0.0602199 , -0.10324284],
       ...,
       [ 2.4986416 ,  1.40623685,  0.08515891, ..., -0.10677954,
        -0.0602199 , -0.10324284],
       [-1.12734361,  0.42088868, -1.62843902, ..., -0.10677954,
        -0.0602199 , -0.10324284],
       [ 0.12094638,  0.07770117, -0.82013811, ..., -0.10677954,
        -0.0602199 , -0.10324284]])

In [21]:
from sklearn.ensemble import GradientBoostingClassifier

learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
    
param_grid = {'learning_rate': learning_rates}
gb = GradientBoostingClassifier(n_estimators=30, max_features=2, max_depth = 2, random_state = 0)
gb_cv = GridSearchCV(gb, param_grid, cv=5)
gb_cv.fit(X_train_scaled, y_train)
y_pred_gdbst = gb_cv.predict(X_test_scaled)
print("Tuned Gradient Boosting Parameters: {}".format(gb_cv.best_params_)) 
print("Best score is {}".format(gb_cv.best_score_))
print("Accuracy score is {}".format(accuracy_score(y_test, y_pred_gdbst)))

Tuned Gradient Boosting Parameters: {'learning_rate': 1}
Best score is 0.8827666666666667
Accuracy score is 0.88142


# Review the results

In [22]:
from sklearn.metrics import confusion_matrix

pl_gbst = confusion_matrix(y_test,y_pred_gdbst)
pl_gbst

array([[38161,  1629],
       [ 4300,  5910]], dtype=int64)

In [23]:
from sklearn.metrics import classification_report

cl_rep_gbst = classification_report(y_test,y_pred_gdbst)
print(cl_rep_gbst)

              precision    recall  f1-score   support

           0       0.90      0.96      0.93     39790
           1       0.78      0.58      0.67     10210

    accuracy                           0.88     50000
   macro avg       0.84      0.77      0.80     50000
weighted avg       0.88      0.88      0.87     50000



In [24]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

ac = accuracy_score(y_test, y_pred_gdbst)
print('Test Data Set Gradient Boosting: Accuracy=%.3f' % (ac))

f1 = f1_score(y_test, y_pred_gdbst, average='weighted')
print('Test Data Set Gradient Boosting: f1-score=%.3f' % (f1))

roc_auc = roc_auc_score(y_test, y_pred_gdbst)
print('Test Data Set Gradient Boosting: roc_auc_score=%.3f' % (roc_auc))

Test Data Set Gradient Boosting: Accuracy=0.881
Test Data Set Gradient Boosting: f1-score=0.874
Test Data Set Gradient Boosting: roc_auc_score=0.769


# Figures for data story

1. Scatter plot Income vs Loan Amount, hue = Fully Paid / Charges Off (Show decision boundary??)<br>
2. ROC curve

# Finalize code

# Finalize documentation