In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from datetime import datetime


df = pd.read_csv('full2017.csv.zip', compression='zip', low_memory=False)

df.drop('Unnamed: 0', axis=1, inplace=True)

In [2]:
df.head()

Unnamed: 0,loan_amnt,term,int_rate,monthly_pmt,grade,issue_d,purpose,loan_status,dti,recoveries,...,tax_liens,pub_rec_bankruptcies,addr_state,delinq_2yrs,delinq_amnt,annual_inc,num_tl_90g_dpd_24m,num_tl_30dpd,total_return,roi
0,14000.0,60,12.74,316.69,C,Mar,credit_card,Current,17.48,0.0,...,0.0,0.0,CA,0.0,0.0,40000.0,0.0,0.0,19001.4,135.724286
1,15400.0,60,11.39,337.84,B,Mar,debt_consolidation,Current,11.52,0.0,...,0.0,0.0,CT,0.0,0.0,95000.0,0.0,0.0,20270.4,131.625974
2,4600.0,36,11.39,151.45,B,Mar,credit_card,Current,13.43,0.0,...,0.0,0.0,CT,0.0,0.0,44470.0,0.0,0.0,5452.2,118.526087
3,15000.0,36,5.32,451.73,A,Mar,debt_consolidation,Current,9.35,0.0,...,0.0,0.0,TX,0.0,0.0,182000.0,0.0,0.0,16262.28,108.4152
4,14000.0,36,15.99,492.13,C,Mar,credit_card,Current,31.07,0.0,...,0.0,0.0,KS,0.0,0.0,82000.0,0.0,0.0,17716.68,126.547714


## Feature Engineering

In [3]:
#binarizing the loan status column; 
#1 indicates the loan is performing or paid of 
#0 indicates not performing or defaulted/written off
df.loc[
    (df.loan_status == 'Current') | 
    (df.loan_status == 'Fully Paid') | 
    (df.loan_status =='In Grace Period'), 'loan_status'] = 1 
df.loc[
    (df.loan_status == 'Charged Off') | 
    (df.loan_status == 'Late (31-120 days)') | 
    (df.loan_status =='Default')|
    (df.loan_status == 'Late (16-30 days)'), 'loan_status'] = 0

In [4]:
df.loan_status.value_counts()

1    426548
0     16450
Name: loan_status, dtype: int64

In [5]:
16450/426548

0.03856541350563125

In [6]:
df.groupby('purpose').loan_status.value_counts()

purpose             loan_status
car                 1                5196
                    0                 157
credit_card         1               88797
                    0                2577
debt_consolidation  1              235296
                    0                9437
educational         1                   1
home_improvement    1               33393
                    0                1257
house               1                2890
                    0                 105
major_purchase      1               10668
                    0                 461
medical             1                6553
                    0                 324
moving              1                3419
                    0                 213
other               1               31605
                    0                1403
renewable_energy    1                 269
                    0                  22
small_business      1                4601
                    0                 331
va

In [7]:
#label encoding loan grades to make the column usable
#label encoding helps keep dimensionality low
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

#let's generate a list of how each var will be encoded
grades = df.grade.unique()
purposes = df.purpose.unique()


grades_encoded = le.fit_transform(grades)
purposes_encoded = le.fit_transform(purposes)

print('Grades: ', grades)
print('Grades Encoded: ', grades_encoded)
print('Purposes:\n', purposes)
print('Purposes Encoded: ', purposes_encoded)

Grades:  ['C' 'B' 'A' 'E' 'D' 'G' 'F']
Grades Encoded:  [2 1 0 4 3 6 5]
Purposes:
 ['credit_card' 'debt_consolidation' 'car' 'other' 'vacation'
 'major_purchase' 'home_improvement' 'medical' 'house' 'small_business'
 'moving' 'renewable_energy' 'wedding' 'educational']
Purposes Encoded:  [ 1  2  0  9 12  6  4  7  5 11  8 10 13  3]


In [8]:
df.loc[:,['grade','issue_d','purpose','addr_state']].head()

Unnamed: 0,grade,issue_d,purpose,addr_state
0,C,Mar,credit_card,CA
1,B,Mar,debt_consolidation,CT
2,B,Mar,credit_card,CT
3,A,Mar,debt_consolidation,TX
4,C,Mar,credit_card,KS


In [9]:
#label encoding loan grades to make the column usable
#label encoding helps keep dimensionality low
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

df['grade'] = le.fit_transform(df.grade)
df['purpose'] = le.fit_transform(df.purpose)
df['addr_state'] = le.fit_transform(df.addr_state)
#df.head()

In [10]:
months = {'Jan':1, 'Feb':2,'Mar':3, 'Apr':4, 'May':5, 'Jun':6, 'Jul':7,
         'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12}

df['issue_d'] = df.issue_d.map(months)

In [11]:
df.head()

Unnamed: 0,loan_amnt,term,int_rate,monthly_pmt,grade,issue_d,purpose,loan_status,dti,recoveries,...,tax_liens,pub_rec_bankruptcies,addr_state,delinq_2yrs,delinq_amnt,annual_inc,num_tl_90g_dpd_24m,num_tl_30dpd,total_return,roi
0,14000.0,60,12.74,316.69,2,3,1,1,17.48,0.0,...,0.0,0.0,4,0.0,0.0,40000.0,0.0,0.0,19001.4,135.724286
1,15400.0,60,11.39,337.84,1,3,2,1,11.52,0.0,...,0.0,0.0,6,0.0,0.0,95000.0,0.0,0.0,20270.4,131.625974
2,4600.0,36,11.39,151.45,1,3,1,1,13.43,0.0,...,0.0,0.0,6,0.0,0.0,44470.0,0.0,0.0,5452.2,118.526087
3,15000.0,36,5.32,451.73,0,3,2,1,9.35,0.0,...,0.0,0.0,42,0.0,0.0,182000.0,0.0,0.0,16262.28,108.4152
4,14000.0,36,15.99,492.13,2,3,1,1,31.07,0.0,...,0.0,0.0,15,0.0,0.0,82000.0,0.0,0.0,17716.68,126.547714


In [12]:
df.loc[:,['grade','issue_d','purpose','addr_state']].head()

Unnamed: 0,grade,issue_d,purpose,addr_state
0,2,3,1,4
1,1,3,2,6
2,1,3,1,6
3,0,3,2,42
4,2,3,1,15


## Modeling

__KNN Classifier Model__

In [17]:
start_time = datetime.now()

knn = KNeighborsClassifier(n_neighbors=10, weights='distance')

x1= pd.DataFrame({
    'loan_amnt': df.loan_amnt,
    'term': df.term,
    'int_rate': df.int_rate,
    'annual_inc': df.annual_inc,
    'dti': df.dti,
    'grade': df.grade,
    'purpose': df.purpose,
    'addr_state': df.addr_state,
    'issue_d': df.issue_d
    
})
y1= df.loan_status

knn.fit(x1,y1)

y_knn_pred = knn.predict(x1)


print('R-Squared: \n', knn.score(x1,y1))

#test using cross validation
from sklearn.model_selection import cross_val_score

display(cross_val_score(knn, x1,y1, cv=10))
score = cross_val_score(knn, x1, y1, cv=10)
print("KNN Cross Validation Accuracy: %0.3f (+/- %0.3f)" % (score.mean(), score.std() * 2))

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

R-Squared: 
 0.9999977426534657


array([0.96155756, 0.96167043, 0.96142212, 0.96146727, 0.96182844,
       0.96160271, 0.96214447, 0.96223476, 0.96218876, 0.96214361])

KNN Cross Validation Accuracy: 0.962 (+/- 0.001)
Duration: 0:01:26.867544


In [14]:
print("Number of mislabled points out of a total {} points: {}".format(x1.shape[0],
                                                                       (y1 != y_knn_pred).sum()))

Number of mislabled points out of a total 442998 points: 13


In [15]:
from sklearn.metrics import confusion_matrix
cf_matrix = confusion_matrix(y_knn_pred,y1)
display(cf_matrix)

sensitivity = cf_matrix[1][1]/sum(cf_matrix[1])
specificity = cf_matrix[0][0]/sum(cf_matrix[0])
print("Sensitivity : ", sensitivity)
print("Specificity : ", specificity)

array([[ 16450,     27],
       [     0, 426521]])

Sensitivity :  1.0
Specificity :  0.9983613521878982


In [25]:
#check the error rates
knn_table = pd.crosstab(y1, y_knn_pred, margins=True)
print(knn_table, '\n')

tI_errors = knn_table.loc[0.0,1.0] / knn_table.loc['All','All']
tII_errors = knn_table.loc[1.0,0.0] / knn_table.loc['All','All']

print((
    'KNN Accuracy:\n'
    'Percent Type I errors: {}\n'
    'Percent Type II errors: {}\n\n'
).format(tI_errors, tII_errors))

col_0            0       1     All
loan_status                       
0            16450       0   16450
1               27  426521  426548
All          16477  426521  442998 

Logistic Regression accuracy:
Percent Type I errors: 0.0
Percent Type II errors: 6.094835642598838e-05




__KNN v2__

In [24]:
start_time = datetime.now()

knn2 = KNeighborsClassifier(n_neighbors=10, weights='distance')

x2 = df.drop('loan_status', axis=1)
    
y2 = df.loan_status

knn2.fit(x2,y2)

y_knn_pred2 = knn2.predict(x2)


print('R-Squared: \n', knn2.score(x2,y2))

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

R-Squared: 
 1.0
Duration: 0:03:15.483031


In [25]:
print("Number of mislabled points out of a total {} points: {}".format(x2.shape[0],
                                                                       (y2 != y_knn_pred2).sum()))

Number of mislabled points out of a total 442998 points: 0


In [26]:
knn_score2 = cross_val_score(knn2, x2, y2, cv=10)
display(knn_score2)
print("Unweighted Accuracy: %0.3f (+/- %0.3f)" % (knn_score2.mean(), knn_score2.std() * 2))

array([0.96069977, 0.95968397, 0.9613544 , 0.96027088, 0.95981941,
       0.96110609, 0.96065463, 0.96045147, 0.96056344, 0.96013454])

Unweighted Accuracy: 0.960 (+/- 0.001)


__BernoulliNB Model__

In [27]:
#using the same features as the KNNC model
data = df.loc[:,['loan_amnt','term','int_rate','annual_inc','dti','grade']]
b_target = df.loan_status

In [28]:
start_time = datetime.now()
bnb = BernoulliNB()

bnb.fit(data,b_target)

y_pred = bnb.predict(data)

print('Accuracy: ', bnb.score(data,b_target))

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

Accuracy:  0.962866649510833
Duration: 0:00:00.686073


In [29]:
print("Number of mislabled points out of a total {} points: {}".format(data.shape[0],
                                                                       (b_target !=y_pred).sum()))

Number of mislabled points out of a total 442998 points: 16450


In [30]:
from sklearn.metrics import confusion_matrix
cf_matrix = confusion_matrix(y_pred, b_target)
display(cf_matrix)

sensitivity = cf_matrix[1][1]/sum(cf_matrix[1])
specificity = cf_matrix[0][0]/sum(cf_matrix[0])
print("Sensitivity : ", sensitivity)
print("Specificity : ", specificity)

array([[     0,      0],
       [ 16450, 426548]])

Sensitivity :  0.962866649510833
Specificity :  nan


  


In [31]:
display(cross_val_score(bnb, data,b_target, cv=10))
score = cross_val_score(bnb, data, b_target, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (score.mean(), score.std() * 2))

array([0.96286682, 0.96286682, 0.96286682, 0.96286682, 0.96286682,
       0.96286682, 0.96286682, 0.96286682, 0.96286598, 0.96286598])

Accuracy: 0.963 (+/- 0.000)


In [32]:
pd.Series(y_pred).value_counts()

1    442998
dtype: int64

The model always predicts "Performing"

__Random Forest Classification__

In [20]:
# from sklearn.ensemble import RandomForestClassifier; already loaded
start_time = datetime.now()

rfc = RandomForestClassifier()
X_rfc = df.loc[:,['loan_amnt','term','int_rate','annual_inc','dti','grade','purpose','addr_state']]
Y_rfc= df.loan_status

rfc.fit(X_rfc,Y_rfc)

y_rfc_pred = rfc.predict(X_rfc)

#use cross validation to verify the accuracy
rfc_score = cross_val_score(rfc, X_rfc, Y_rfc, cv=10)
display(rfc_score)
print("RFC Accuracy: %0.3f (+/- %0.3f)" % (rfc_score.mean(), rfc_score.std() * 2))

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

array([0.95799097, 0.95623025, 0.96121896, 0.95993228, 0.95959368,
       0.96115124, 0.96081264, 0.96180587, 0.96232421, 0.96207589])

RFC Accuracy: 0.960 (+/- 0.004)
Duration: 0:01:14.428323


In [19]:
print("Number of mislabled points out of a total {} points: {}".format(X_rfc.shape[0],
                                                                       (Y_rfc !=y_rfc_pred).sum()))

Number of mislabled points out of a total 442998 points: 1526


In [41]:
rfc_cf_matrix = confusion_matrix(y_rfc_pred, Y_rfc)
display(rfc_cf_matrix)

sensitivity = rfc_cf_matrix[1][1]/sum(rfc_cf_matrix[1])
specificity = rfc_cf_matrix[0][0]/sum(rfc_cf_matrix[0])
print("Sensitivity : ", sensitivity)
print("Specificity : ", specificity)

array([[ 14984,     73],
       [  1466, 426475]])

Sensitivity :  0.9965742941199839
Specificity :  0.9951517566580328


In [50]:
#check the error rates
rfc_table = pd.crosstab(Y_rfc, y_rfc_pred, margins=True)
print(rfc_table, '\n')

rfc_tI_errors = rfc_table.loc[0.0,1.0] / rfc_table.loc['All','All']
rfc_tII_errors = rfc_table.loc[1.0,0.0] / rfc_table.loc['All','All']

print((
    'Random Forest:\n'
    'Percent Type I errors: {}\n'
    'Percent Type II errors: {}\n\n'
).format(rfc_tI_errors, rfc_tII_errors))

col_0            0       1     All
loan_status                       
0            14984    1466   16450
1               73  426475  426548
All          15057  427941  442998 

Random Forest:
Percent Type I errors: 0.0033092700192777396
Percent Type II errors: 0.00016478629700359822




__Logistic Regression__

In [49]:
from sklearn.linear_model import LogisticRegression
start_time = datetime.now()

#the class_weight paramater handles class imbalance by assigning weights equal to the inverse
#of the class distribution
lr = LogisticRegression()
w_lr = LogisticRegression(class_weight='balanced')

X = df.loc[:,['loan_amnt','term','int_rate','annual_inc','dti','grade']]
Y = df.loan_status

lr.fit(X,Y)
w_lr.fit(X,Y)

#storing predictions in a variable
Y_ = lr.predict(X)
w_Y_pred = w_lr.predict(X)

u_score = cross_val_score(lr, X, Y, cv=10)
print("Unweighted Accuracy: %0.3f (+/- %0.3f)" % (u_score.mean(), u_score.std() * 2))
#print('Accuracy: ', lr.score(X,Y))

w_score = cross_val_score(w_lr, X, Y, cv=10)
print('Weighted Accuracy:  %0.3f (+/- %0.3f)' % (w_score.mean(), w_score.std() * 2))

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

Unweighted Accuracy: 0.963 (+/- 0.000)
Weighted Accuracy:  0.678 (+/- 0.036)
Duration: 0:00:54.159833


We can tell immediately from the logistic regression model that something is very off. The model as is, is currently not very accurate and greatly impacted by the class imbalance.

In [46]:
cf_matrix = confusion_matrix(Y, Y_)
display(cf_matrix)

sensitivity = cf_matrix[1][1]/sum(cf_matrix[1])
specificity = cf_matrix[0][0]/sum(cf_matrix[0])
print('Unweighted Confusion Matrix: \n')
print("Sensitivity : ", sensitivity)
print("Specificity : ", specificity)

array([[     1,  16449],
       [     0, 426548]])

Unweighted Confusion Matrix: 

Sensitivity :  1.0
Specificity :  6.0790273556231004e-05


The unweighted logistic regression model predicted 1 'Not Performing' loan, which is an improvement from the Bernoulli Naive Bayes model.

In [48]:
cf_matrix = confusion_matrix(Y, w_Y_pred)
display(cf_matrix)

sensitivity = cf_matrix[1][1]/sum(cf_matrix[1])
specificity = cf_matrix[0][0]/sum(cf_matrix[0])
print('Weighted Confusion Matrix: \n')
print("Sensitivity : ", sensitivity)
print("Specificity : ", specificity)

array([[  9944,   6506],
       [134666, 291882]])

Weighted Confusion Matrix: 

Sensitivity :  0.6842887553100706
Specificity :  0.6044984802431611


In [52]:
#check the error rates
logreg_table = pd.crosstab(Y, w_Y_pred, margins=True)
print(logreg_table, '\n')

logreg_tI_errors = logreg_table.loc[0.0,1.0] / logreg_table.loc['All','All']
logreg_tII_errors = logreg_table.loc[1.0,0.0] / logreg_table.loc['All','All']

print((
    'Logistic Regression Accuracy:\n'
    'Percent Type I errors: {}\n'
    'Percent Type II errors: {}\n\n'
).format(logreg_tI_errors, logreg_tII_errors))

col_0             0       1     All
loan_status                        
0              9944    6506   16450
1            134666  291882  426548
All          144610  298388  442998 

Logistic Regression Accuracy:
Percent Type I errors: 0.014686296552128903
Percent Type II errors: 0.30398782838748706


