In [73]:
import warnings
warnings.filterwarnings('ignore')

In [74]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [75]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Basic Data Cleaning

In [76]:
# https://help.lendingclub.com/hc/en-us/articles/215488038-What-do-the-different-Note-statuses-mean-

columns = [
    "loan_amnt", "int_rate", "installment", "home_ownership",
    "annual_inc", "verification_status", "issue_d", "loan_status",
    "pymnt_plan", "dti", "delinq_2yrs", "inq_last_6mths",
    "open_acc", "pub_rec", "revol_bal", "total_acc",
    "initial_list_status", "out_prncp", "out_prncp_inv", "total_pymnt",
    "total_pymnt_inv", "total_rec_prncp", "total_rec_int", "total_rec_late_fee",
    "recoveries", "collection_recovery_fee", "last_pymnt_amnt", "next_pymnt_d",
    "collections_12_mths_ex_med", "policy_code", "application_type", "acc_now_delinq",
    "tot_coll_amt", "tot_cur_bal", "open_acc_6m", "open_act_il",
    "open_il_12m", "open_il_24m", "mths_since_rcnt_il", "total_bal_il",
    "il_util", "open_rv_12m", "open_rv_24m", "max_bal_bc",
    "all_util", "total_rev_hi_lim", "inq_fi", "total_cu_tl",
    "inq_last_12m", "acc_open_past_24mths", "avg_cur_bal", "bc_open_to_buy",
    "bc_util", "chargeoff_within_12_mths", "delinq_amnt", "mo_sin_old_il_acct",
    "mo_sin_old_rev_tl_op", "mo_sin_rcnt_rev_tl_op", "mo_sin_rcnt_tl", "mort_acc",
    "mths_since_recent_bc", "mths_since_recent_inq", "num_accts_ever_120_pd", "num_actv_bc_tl",
    "num_actv_rev_tl", "num_bc_sats", "num_bc_tl", "num_il_tl",
    "num_op_rev_tl", "num_rev_accts", "num_rev_tl_bal_gt_0",
    "num_sats", "num_tl_120dpd_2m", "num_tl_30dpd", "num_tl_90g_dpd_24m",
    "num_tl_op_past_12m", "pct_tl_nvr_dlq", "percent_bc_gt_75", "pub_rec_bankruptcies",
    "tax_liens", "tot_hi_cred_lim", "total_bal_ex_mort", "total_bc_limit",
    "total_il_high_credit_limit", "hardship_flag", "debt_settlement_flag"
]

target = ["loan_status"]

In [77]:
# Load the data
file_path = Path('Resources/LoanStats_2019Q1.csv')
df = pd.read_csv(file_path, skiprows=1)[:-2]
df = df.loc[:, columns].copy()

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

# Remove the `Issued` loan status
issued_mask = df['loan_status'] != 'Issued'
df = df.loc[issued_mask]

# convert interest rate to numerical
df['int_rate'] = df['int_rate'].str.replace('%', '')
df['int_rate'] = df['int_rate'].astype('float') / 100


# Convert the target column values to low_risk and high_risk based on their values
x = {'Current': 'low_risk'}   
df = df.replace(x)

x = dict.fromkeys(['Late (31-120 days)', 'Late (16-30 days)', 'Default', 'In Grace Period'], 'high_risk')    
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,Mar-2019,low_risk,n,27.24,...,85.7,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N
1,25000.0,0.2,929.09,MORTGAGE,105000.0,Verified,Mar-2019,low_risk,n,20.23,...,91.2,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N
2,20000.0,0.2,529.88,MORTGAGE,56000.0,Verified,Mar-2019,low_risk,n,24.26,...,66.7,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N
3,10000.0,0.164,353.55,RENT,92000.0,Verified,Mar-2019,low_risk,n,31.44,...,100.0,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,Mar-2019,low_risk,n,18.76,...,100.0,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N


# Split the Data into Training and Testing

In [78]:
# Create our features
X = df.drop(columns="loan_status")

# Create our target
y = df['loan_status']

In [79]:
X = pd.get_dummies(X)
X.describe

<bound method NDFrame.describe of        loan_amnt  int_rate  installment  annual_inc    dti  delinq_2yrs  \
0        10500.0    0.1719       375.35     66000.0  27.24          0.0   
1        25000.0    0.2000       929.09    105000.0  20.23          0.0   
2        20000.0    0.2000       529.88     56000.0  24.26          0.0   
3        10000.0    0.1640       353.55     92000.0  31.44          0.0   
4        22000.0    0.1474       520.39     52000.0  18.76          0.0   
...          ...       ...          ...         ...    ...          ...   
68812    10000.0    0.1502       346.76     26000.0   9.60          0.0   
68813    12000.0    0.2727       368.37     63000.0  29.07          0.0   
68814     5000.0    0.1992       185.62     52000.0  14.86          0.0   
68815    40000.0    0.0646      1225.24    520000.0   9.96          0.0   
68816    16000.0    0.1131       350.36     72000.0   7.02          2.0   

       inq_last_6mths  open_acc  pub_rec  revol_bal  ...  issue_d

In [80]:
# Check the balance of our target values
y.value_counts()

low_risk     68470
high_risk      347
Name: loan_status, dtype: int64

In [81]:
# Split the X and y into X_train, X_test, y_train, y_test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
   y, random_state=1, stratify=y)
X_train.shape


(51612, 95)

# Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble AdaBoost classifier . For each algorithm, be sure to complete the folliowing steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Print the confusion matrix from sklearn.metrics.
4. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.
5. For the Balanced Random Forest Classifier onely, print the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [82]:
# Creating a StandardScaler instance.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [83]:
# Resample the training data with the RandomOversampler
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=1)



In [84]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [85]:
predictions = rf_model.predict(X_test_scaled)

In [86]:
y_pred = rf_model.predict(X_test_scaled)

In [87]:
#Calculate the balanced accuracy score 
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.67209249388625

In [88]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,30,57
Actual 1,11,17107


In [89]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.73      0.34      1.00      0.47      0.59      0.32        87
   low_risk       1.00      1.00      0.34      1.00      0.59      0.37     17118

avg / total       1.00      1.00      0.35      1.00      0.59      0.37     17205



Balanced Random Forest Classifier Test Results and Analysis 

This test consisted of a confusion matrix, classification report, and an accuracy score. For this test, the accuracy score was lower around 62%, that indicates that this classifier may not have been suitable. The precision and recall scores did well being at 1 for average and total as well as for low risk. However, the high risk scores were not as high. 

Future Recommendations

Since the accuracy score was not high, the Random Forest Classifier may not be a good fit for a dataset like this. It might help to evaluate training and test values and pick another column for analysis such as home ownership type which is a similar comparision to the loan status being low risk or high risk. 

In [90]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.01353588, 0.01297201, 0.02038753, 0.01318108, 0.01577101,
       0.00407065, 0.00499123, 0.00854937, 0.00220504, 0.01426264,
       0.0127941 , 0.01944821, 0.02214118, 0.06678644, 0.06919125,
       0.08051097, 0.06469571, 0.01430057, 0.        , 0.        ,
       0.06727015, 0.0012891 , 0.        , 0.        , 0.00391999,
       0.01272119, 0.00713076, 0.00648652, 0.00411488, 0.0069342 ,
       0.01226728, 0.01139244, 0.01252728, 0.00505316, 0.00744483,
       0.01414998, 0.01028238, 0.01217646, 0.00631934, 0.00894764,
       0.00726254, 0.00732178, 0.01371393, 0.01343748, 0.01317586,
       0.00049884, 0.        , 0.01404312, 0.01556393, 0.00962128,
       0.00900153, 0.00671149, 0.00928488, 0.00907981, 0.00390484,
       0.00874606, 0.00860621, 0.00603634, 0.00786158, 0.01009957,
       0.00854063, 0.00934174, 0.00927852, 0.0099795 , 0.        ,
       0.        , 0.00142504, 0.00603117, 0.00741941, 0.00611102,
       0.0025387 , 0.        , 0.01267206, 0.01493472, 0.01199

In [91]:
# List the features sorted in descending order by feature importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.08051097066308971, 'total_rec_prncp'),
 (0.06919124657611576, 'total_pymnt_inv'),
 (0.067270153810282, 'last_pymnt_amnt'),
 (0.06678643518631669, 'total_pymnt'),
 (0.06469570692744232, 'total_rec_int'),
 (0.022141182468728152, 'out_prncp_inv'),
 (0.020387534921032957, 'installment'),
 (0.019448213410804435, 'out_prncp'),
 (0.015771010666736102, 'dti'),
 (0.015563927850601083, 'mo_sin_old_rev_tl_op'),
 (0.014934716219988178, 'total_bal_ex_mort'),
 (0.014300565001213917, 'total_rec_late_fee'),
 (0.01426263711804363, 'revol_bal'),
 (0.014149977555926745, 'max_bal_bc'),
 (0.014043118080550856, 'mo_sin_old_il_acct'),
 (0.01371393197112525, 'avg_cur_bal'),
 (0.013535875990791299, 'loan_amnt'),
 (0.013437484985140215, 'bc_open_to_buy'),
 (0.013181079361250765, 'annual_inc'),
 (0.013175864403227659, 'bc_util'),
 (0.012972008493593977, 'int_rate'),
 (0.012794104263175381, 'total_acc'),
 (0.012721192914691728, 'tot_cur_bal'),
 (0.012672055340250002, 'tot_hi_cred_lim'),
 (0.012527282303194501

### Easy Ensemble AdaBoost Classifier

In [92]:
# Train the Classifier
from sklearn.ensemble import AdaBoostClassifier 
clf = AdaBoostClassifier(n_estimators = 100, random_state=1)
clf.fit(X, y)

AdaBoostClassifier(n_estimators=100, random_state=1)

In [93]:
from sklearn import metrics 

In [103]:
# Calculated the balanced accuracy score
y_pred = rf_model.predict(X_test)
print("accuracy:",metrics.accuracy_score(y_test, y_pred))

accuracy: 0.9949433304272014


In [106]:
# Display the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm

array([[   30,    57],
       [   11, 17107]], dtype=int64)

In [108]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, predictions ))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.73      0.34      1.00      0.47      0.59      0.32        87
   low_risk       1.00      1.00      0.34      1.00      0.59      0.37     17118

avg / total       1.00      1.00      0.35      1.00      0.59      0.37     17205



AdaBoost Classifier Test Results and Analysis 
The AdaBoost classifier test consisted of a confusion matrix, accruacy score, and the classification report. The accuracy score was high at 99% indicaing that for the X and y values wich related to loan status, that this test yielded the best results. The The high and low risk precision were high along with the the recall scores. Overall, the AdaBoost classifier was a good fit for this dataset. 

Future Recommendations 
Since this model showed great accuracy, there is no necessary changes. 