In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [21]:
# Read the preprocess data and store it in a dataframe
row_count = 10000
loans_df = pd.DataFrame()
list_ = []
for chunk in pd.read_csv("final_filtered.csv", chunksize=row_count,index_col=0): 
    list_.append(chunk)
    
loans_df= pd.concat(list_)


For our analysis , we want to predict if a loan will be fully paid or charged off. So filter the loans which have loan_status = 0 (Charged off) or loan_status = 5 (Fully Paid)

In [3]:
loans_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1321847 entries, 0 to 103545
Columns: 163 entries, loan_amnt to debt_settlement_flag_Y
dtypes: float64(65), int64(98)
memory usage: 1.6 GB


In [4]:
loans_df.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,addr_state_TN,addr_state_TX,addr_state_UT,addr_state_VA,addr_state_VT,addr_state_WA,addr_state_WI,addr_state_WV,addr_state_WY,debt_settlement_flag_Y
0,5000.0,5000.0,4975.0,162.87,24000.0,27.65,0.0,1.0,3.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,2500.0,2500.0,2500.0,59.83,30000.0,1.0,0.0,5.0,3.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2400.0,2400.0,2400.0,84.33,12252.0,8.72,0.0,2.0,2.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,10000.0,10000.0,10000.0,339.31,49200.0,20.0,0.0,1.0,10.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,3000.0,3000.0,3000.0,67.79,80000.0,17.94,0.0,0.0,15.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
loans_df["loan_status"].value_counts()

5    639250
1    479981
0    170927
8     16750
6      8304
7      3773
4      1988
3       761
2       113
Name: loan_status, dtype: int64

For our analysis , we want to predict if a loan will be fully paid or charged off. So filter the loans which have loan_status = 0 (Charged off) or loan_status = 5 (Fully Paid). So filter the loans wiht loan status = 5 and 0

In [22]:
filtered_loans_df = loans_df[(loans_df["loan_status"] == 5) |
                            (loans_df["loan_status"] == 0)]


In [7]:
filtered_loans_df = filtered_loans_df[filtered_loans_df.columns.drop(list(filtered_loans_df.filter(regex='addr_state')))]

In [23]:
filtered_loans_df = filtered_loans_df.drop(['issue_year','funded_amnt' ,'funded_amnt_inv' ,'total_pymnt' ,'total_pymnt_inv','last_pymnt_amnt','last_pymnt_year','out_prncp_inv','out_prncp'],axis=1)

<h3>Classification Models :</h3>

<p>1.Logistic Regression</p>
<p>2.Decision Tree Classification</p>
<p>3.Random Forest Classsification</p>
<p>4.Gradient Boost Classifier</p>
    

In [9]:
filtered_loans_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 810177 entries, 0 to 103537
Columns: 104 entries, loan_amnt to debt_settlement_flag_Y
dtypes: float64(58), int64(46)
memory usage: 649.0 MB


In [10]:
filtered_loans_df.head()

Unnamed: 0,loan_amnt,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,total_acc,...,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,debt_settlement_flag_Y
0,5000.0,162.87,24000.0,27.65,0.0,1.0,3.0,0.0,13648.0,9.0,...,0,0,0,0,0,0,0,0,0,0
1,2500.0,59.83,30000.0,1.0,0.0,5.0,3.0,0.0,1687.0,4.0,...,0,0,0,0,0,0,0,0,0,0
2,2400.0,84.33,12252.0,8.72,0.0,2.0,2.0,0.0,2956.0,10.0,...,0,0,0,0,0,0,1,0,0,0
3,10000.0,339.31,49200.0,20.0,0.0,1.0,10.0,0.0,5598.0,37.0,...,0,0,0,0,1,0,0,0,0,0
4,3000.0,67.79,80000.0,17.94,0.0,0.0,15.0,0.0,27783.0,38.0,...,0,0,0,0,1,0,0,0,0,0


<h4>Split the data into training and test data set</h4>

<p>Update the target variable values. we want to predict if the loan gets charged off. so update the loan status values </p>

<p>charged off - 1 <p>
fully paid - 0 <p>

In [24]:
#Normalize the data and split it into trianing set and test set.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer

#Target Variable
y = filtered_loans_df.loan_status.values
#print(y)
y[y==0] = 1
y[y==5] = 0
#Independent variables
X = filtered_loans_df.drop(['loan_status'],axis=1)

#Normalize the data
normalizer = Normalizer()
scaled_X = normalizer.fit_transform(X)

# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size = 0.2, random_state=42, stratify=y)

In [25]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


def accuracy_metrics(y_test,y_pred):
    print("#--------------Confusion Matrix--------------------")
    print(confusion_matrix(y_test, y_pred))
    print("#--------------Classification Report--------------------")
    print(classification_report(y_test, y_pred))
    print("#--------------Accuracy Score --------------------")
    print(accuracy_score(y_test, y_pred))
    
        

In [26]:
def classify(clf,X_train,y_train,X_test,y_test) :
    # Fit the classifier to the training data
    clf.fit(X_train,y_train)
    # Predict the labels of the test set: y_pred
    y_pred = clf.predict(X_test)
    #Calculate the accuracy of the model
    accuracy_metrics(y_test,y_pred)
    

    print("#--------------cross vlaidation---------------------- ")
    cv_scores_3 = cross_val_score(clf,X_train,y_train,cv=3)
    print("3 fold Cross-Validation score : ", cv_scores_3 )
    print("Mean of 3 fold Cross-Validation score : ", np.mean(cv_scores_3) )
    print("#------------------------------------ ")
    cv_scores_5 = cross_val_score(clf,X_train,y_train,cv=5)
    print("5 fold Cross-Validation : " , cv_scores_5)
    print("Mean of 5 fold Cross-Validation score : ", np.mean(cv_scores_5) )
    print("#------------------------------------ ")

In [27]:
def classify_with_parameter_tuning(clf,param_grid,X_train,y_train,X_test,y_test) :
    
    clf_cv = GridSearchCV(clf,param_grid,cv=5)
    # Fit the classifier to the training data
    clf_cv.fit(X_train,y_train)
    # Predict the labels of the test set: y_pred
    y_pred = clf_cv.predict(X_test)
    
    # Print the optimal parameters and best score
    print("Tuned Classifier Parameter: {}".format(clf_cv.best_params_))
    print("Tuned Classifieer Regression Accuracy: {}".format(clf_cv.best_score_))
    accuracy_metrics(y_test,y_pred)
    

In [28]:
def plot_roc_graph(clf,X_train,y_train,X_test,y_test) :
    # Compute predicted probabilities: y_pred_prob
    logreg.fit(X_train,y_train)
    y_pred_prob =  logreg.predict_proba(X_test)[:,1]

    # Generate ROC curve values: fpr, tpr, thresholds
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

    # Plot ROC curve
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr, tpr)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.show()

<h4>Logistic Regression </h4>

In [16]:
#Logistic Regression 

# Create the classifier: logreg with default parameters
logreg = LogisticRegression()

print("#--------------Logistic Regression Analysis----------------------")
classify(logreg,X_train,y_train,X_test,y_test)


#--------------Logistic Regression Analysis----------------------
#--------------Confusion Matrix--------------------
[[127668    182]
 [  3801  30385]]
#--------------Classification Report--------------------
             precision    recall  f1-score   support

          0       0.97      1.00      0.98    127850
          1       0.99      0.89      0.94     34186

avg / total       0.98      0.98      0.97    162036

#--------------Accuracy Score --------------------
0.975419042682
#--------------cross vlaidation---------------------- 
3 fold Cross-Validation score :  [ 0.97236725  0.97201535  0.97177453]
Mean of 3 fold Cross-Validation score :  0.972052376577
#------------------------------------ 
5 fold Cross-Validation :  [ 0.97415702  0.97359367  0.97338538  0.97363224  0.97290709]
Mean of 5 fold Cross-Validation score :  0.973535078599
#------------------------------------ 


In [17]:
#Feature coefficents
coeffs = pd.DataFrame()
coeffs['features'] = X.columns
coeffs['estimatedCoefficients']  = logreg.coef_[0]
coeffs['absestimatedCoefficients'] = abs(logreg.coef_[0])

In [18]:
coeffs.sort_values(by=['absestimatedCoefficients'],ascending=False)

Unnamed: 0,features,estimatedCoefficients,absestimatedCoefficients
10,total_rec_prncp,-157.560343,157.560343
0,loan_amnt,117.135941,117.135941
11,total_rec_int,40.659001,40.659001
13,recoveries,29.403663,29.403663
64,last_credit_pull_year,10.166422,10.166422
61,earliest_cr_line_year,10.043729,10.043729
1,installment,6.481916,6.481916
14,collection_recovery_fee,4.782197,4.782197
30,mort_acc,-4.430355,4.430355
20,acc_open_past_24mths,-4.382102,4.382102


<h3>Paramter Tuning</h3>

In [None]:
from sklearn.model_selection import GridSearchCV

# Create the hyperparameter grid
c_space = [10,100.0]
param_grid = {'C': c_space, 'penalty': ['l1', 'l2'] }

logreg = LogisticRegression()
classify_with_parameter_tuning(logreg,param_grid,X_train,y_train,X_test,y_test)


Tuned Classifier Parameter: {'C': 100.0, 'penalty': 'l1'}
Tuned Classifieer Regression Accuracy: 0.9909479573117578
#--------------Confusion Matrix--------------------
[[127744    106]
 [  1316  32870]]
#--------------Classification Report--------------------
             precision    recall  f1-score   support

          0       0.99      1.00      0.99    127850
          1       1.00      0.96      0.98     34186

avg / total       0.99      0.99      0.99    162036

#--------------Accuracy Score --------------------
0.991224172406


In [None]:
# Create the classifier: logreg
logreg = LogisticRegression(C=100.0,penalty='l1')

# Fit the classifier to the training data
m = logreg.fit(X_train,y_train)

# Predict the labels of the test set: y_pred
y_pred = logreg.predict(X_test)

In [None]:
# Import necessary modules
from sklearn.metrics import roc_curve
from sklearn.linear_model import LogisticRegressionCV
logreg = LogisticRegression(C=100.0,penalty='l2')

# Compute predicted probabilities: y_pred_prob
logreg.fit(X_train,y_train)
y_pred_prob =  logreg.predict_proba(X_test)[:,1]

# Generate ROC curve values: fpr, tpr, thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Plot ROC curve
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

<h3>Decision Tree Classifier </h3>

In [11]:
#Decision Tree Classifier
decisionTree = DecisionTreeClassifier(random_state=32)
print("#--------------Decision Tree Classification Analysis----------------------")
classify(decisionTree,X_train,y_train,X_test,y_test)

#--------------Decision Tree Classification Analysis----------------------
#--------------Confusion Matrix--------------------
[[127705    145]
 [   142  34044]]
#--------------Classification Report--------------------
             precision    recall  f1-score   support

          0       1.00      1.00      1.00    127850
          1       1.00      1.00      1.00     34186

avg / total       1.00      1.00      1.00    162036

#--------------Accuracy Score --------------------
0.998228788664
#--------------cross vlaidation---------------------- 
3 fold Cross-Validation score :  [ 0.99788936  0.99788472  0.99821334]
Mean of 3 fold Cross-Validation score :  0.997995806968
#------------------------------------ 
5 fold Cross-Validation :  [ 0.99803285  0.99831826  0.99800969  0.99830284  0.99804055]
Mean of 5 fold Cross-Validation score :  0.998140836806
#------------------------------------ 


In [12]:
important_features = pd.Series(data=decisionTree.feature_importances_,index=X.columns)
important_features.sort_values(ascending=False,inplace=True)

In [13]:
important_features

recoveries                  5.821732e-01
loan_amnt                   2.937232e-01
total_rec_prncp             9.835423e-02
debt_settlement_flag_Y      1.627167e-02
total_rec_int               2.513662e-03
tot_hi_cred_lim             9.205183e-04
num_tl_90g_dpd_24m          8.905737e-04
installment                 8.864840e-04
pct_tl_nvr_dlq              2.828990e-04
num_bc_tl                   2.484337e-04
mort_acc                    1.970902e-04
total_rec_late_fee          1.510670e-04
num_il_tl                   1.509090e-04
earliest_cr_line_year       1.464719e-04
int_rate                    1.456431e-04
last_credit_pull_year       1.344724e-04
fico_average                1.317784e-04
mo_sin_rcnt_rev_tl_op       1.262058e-04
num_actv_bc_tl              1.160522e-04
avg_cur_bal                 1.020909e-04
term                        9.590718e-05
last_credit_pull_month      9.123394e-05
annual_inc                  8.840003e-05
earliest_cr_line_month      8.774945e-05
total_bal_ex_mor

<h3>Paramter Tuning</h3>

In [14]:
decisionTree = DecisionTreeClassifier(random_state=32)
#Hyper Parameters Set
param_grid = {'max_features': ['auto', 'sqrt', 'log2'],
          'min_samples_split': [2,4,6,8,10], 
          }
classify_with_parameter_tuning(decisionTree,param_grid,X_train,y_train,X_test,y_test)

Tuned Classifier Parameter: {'max_features': 'auto', 'min_samples_split': 10}
Tuned Classifieer Regression Accuracy: 0.9640263461191315
#--------------Confusion Matrix--------------------
[[124144   3706]
 [  4439  29747]]
#--------------Classification Report--------------------
             precision    recall  f1-score   support

          0       0.97      0.97      0.97    127850
          1       0.89      0.87      0.88     34186

avg / total       0.95      0.95      0.95    162036

#--------------Accuracy Score --------------------
0.949733392579


<h3>Randon Forest Tree Classifier </h3>

In [15]:
from sklearn.ensemble import RandomForestClassifier
#Random Forest Classifier
rfc = RandomForestClassifier(n_jobs = -1,random_state=32)
print("#--------------Random Forest Tree Classification Analysis----------------------")
classify(rfc,X_train,y_train,X_test,y_test)

#--------------Random Forest Tree Classification Analysis----------------------
#--------------Confusion Matrix--------------------
[[127832     18]
 [  2191  31995]]
#--------------Classification Report--------------------
             precision    recall  f1-score   support

          0       0.98      1.00      0.99    127850
          1       1.00      0.94      0.97     34186

avg / total       0.99      0.99      0.99    162036

#--------------Accuracy Score --------------------
0.986367227036
#--------------cross vlaidation---------------------- 
3 fold Cross-Validation score :  [ 0.98171703  0.98304998  0.98379512]
Mean of 3 fold Cross-Validation score :  0.982854042682
#------------------------------------ 
5 fold Cross-Validation :  [ 0.98387706  0.98305922  0.9807449   0.98462524  0.98548925]
Mean of 5 fold Cross-Validation score :  0.983559132476
#------------------------------------ 


<h3>Paramter Tuning</h3>

In [16]:
#making the instance
rfc = RandomForestClassifier(n_jobs = -1,random_state=32)
#hyper parameters set
param_grid = {'criterion':['gini','entropy'],
          'n_estimators':[10,15,20,25,30]
          
          }
classify_with_parameter_tuning(rfc,param_grid,X_train,y_train,X_test,y_test)

Tuned Classifier Parameter: {'criterion': 'gini', 'n_estimators': 25}
Tuned Classifieer Regression Accuracy: 0.9895933138005465
#--------------Confusion Matrix--------------------
[[127845      5]
 [  1266  32920]]
#--------------Classification Report--------------------
             precision    recall  f1-score   support

          0       0.99      1.00      1.00    127850
          1       1.00      0.96      0.98     34186

avg / total       0.99      0.99      0.99    162036

#--------------Accuracy Score --------------------
0.992156064085


In [17]:
rfc = RandomForestClassifier(n_jobs = -1,random_state=32,criterion='gini',n_estimators = 25)
rfc.fit(X_train,y_train)
y_pred = rfc.predict(X_test)
#Accuracy Score
accuracy_metrics(y_test,y_pred)

#--------------Confusion Matrix--------------------
[[127845      5]
 [  1266  32920]]
#--------------Classification Report--------------------
             precision    recall  f1-score   support

          0       0.99      1.00      1.00    127850
          1       1.00      0.96      0.98     34186

avg / total       0.99      0.99      0.99    162036

#--------------Accuracy Score --------------------
0.992156064085


In [18]:
important_features = pd.Series(data=rfc.feature_importances_,index=X.columns)
important_features.sort_values(ascending=False,inplace=True)

In [19]:
important_features

recoveries                    2.679578e-01
total_rec_prncp               2.524778e-01
collection_recovery_fee       1.862525e-01
installment                   4.390435e-02
loan_amnt                     3.744203e-02
debt_settlement_flag_Y        2.035187e-02
total_rec_int                 1.682141e-02
total_rec_late_fee            1.225751e-02
last_credit_pull_month        5.917209e-03
term                          5.416144e-03
total_bal_ex_mort             4.534167e-03
grade_E                       4.246865e-03
int_rate                      3.735086e-03
pct_tl_nvr_dlq                3.451724e-03
earliest_cr_line_year         3.420257e-03
total_il_high_credit_limit    3.223008e-03
tot_cur_bal                   3.094492e-03
dti                           3.064762e-03
grade_D                       3.008947e-03
last_credit_pull_year         3.001025e-03
tot_hi_cred_lim               2.973196e-03
revol_bal                     2.885543e-03
mo_sin_old_rev_tl_op          2.856448e-03
annual_inc 

<h4>Gradient Boosting Classifier</h4>

In [31]:
from sklearn.ensemble import GradientBoostingClassifier
#Random Forest Classifier
gbc = GradientBoostingClassifier()
print("#--------------Gradient Boosting Classification Analysis----------------------")
classify(gbc,X_train,y_train,X_test,y_test)

#--------------Gradient Boosting Classification Analysis----------------------
#--------------Confusion Matrix--------------------
[[127840     10]
 [   979  33207]]
#--------------Classification Report--------------------
             precision    recall  f1-score   support

          0       0.99      1.00      1.00    127850
          1       1.00      0.97      0.99     34186

avg / total       0.99      0.99      0.99    162036

#--------------Accuracy Score --------------------
0.99389641808
#--------------cross vlaidation---------------------- 
3 fold Cross-Validation score :  [ 0.99412631  0.99378839  0.99372356]
Mean of 3 fold Cross-Validation score :  0.993879417592
#------------------------------------ 
5 fold Cross-Validation :  [ 0.99413711  0.99376678  0.99409078  0.99374364  0.99362792]
Mean of 5 fold Cross-Validation score :  0.993873246309
#------------------------------------ 


In [32]:
important_features = pd.Series(data=gbc.feature_importances_,index=X.columns)
important_features.sort_values(ascending=False,inplace=True)

In [33]:
important_features

loan_amnt                       0.313358
total_rec_prncp                 0.289440
recoveries                      0.255396
total_rec_int                   0.050899
installment                     0.024370
total_rec_late_fee              0.019222
debt_settlement_flag_Y          0.012457
last_credit_pull_month          0.008683
num_tl_90g_dpd_24m              0.005597
last_credit_pull_year           0.005057
emp_length_n/a                  0.005030
total_rev_hi_lim                0.002104
total_bal_ex_mort               0.001944
num_tl_30dpd                    0.001653
avg_cur_bal                     0.001350
mort_acc                        0.001316
collection_recovery_fee         0.000929
tax_liens                       0.000293
annual_inc                      0.000268
addr_state_GA                   0.000236
addr_state_MS                   0.000218
collections_12_mths_ex_med      0.000154
tot_coll_amt                    0.000026
pub_rec_bankruptcies            0.000000
addr_state_UT   