In [1]:
import warnings
# NOTE(review): with no category or module filter this silences every warning
# for the rest of the session, so pandas / scikit-learn deprecation and
# data-conversion warnings raised by later cells will not be shown.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Read the CSV and Perform Basic Data Cleaning

In [4]:
# https://help.lendingclub.com/hc/en-us/articles/215488038-What-do-the-different-Note-statuses-mean-

# Names of the CSV columns kept for the analysis, in the order they are
# selected from the raw file. Written as one whitespace-separated block and
# split into a list of strings; the order of the names is significant.
columns = """
    loan_amnt int_rate installment home_ownership
    annual_inc verification_status issue_d loan_status
    pymnt_plan dti delinq_2yrs inq_last_6mths
    open_acc pub_rec revol_bal total_acc
    initial_list_status out_prncp out_prncp_inv total_pymnt
    total_pymnt_inv total_rec_prncp total_rec_int total_rec_late_fee
    recoveries collection_recovery_fee last_pymnt_amnt next_pymnt_d
    collections_12_mths_ex_med policy_code application_type acc_now_delinq
    tot_coll_amt tot_cur_bal open_acc_6m open_act_il
    open_il_12m open_il_24m mths_since_rcnt_il total_bal_il
    il_util open_rv_12m open_rv_24m max_bal_bc
    all_util total_rev_hi_lim inq_fi total_cu_tl
    inq_last_12m acc_open_past_24mths avg_cur_bal bc_open_to_buy
    bc_util chargeoff_within_12_mths delinq_amnt mo_sin_old_il_acct
    mo_sin_old_rev_tl_op mo_sin_rcnt_rev_tl_op mo_sin_rcnt_tl mort_acc
    mths_since_recent_bc mths_since_recent_inq num_accts_ever_120_pd num_actv_bc_tl
    num_actv_rev_tl num_bc_sats num_bc_tl num_il_tl
    num_op_rev_tl num_rev_accts num_rev_tl_bal_gt_0
    num_sats num_tl_120dpd_2m num_tl_30dpd num_tl_90g_dpd_24m
    num_tl_op_past_12m pct_tl_nvr_dlq percent_bc_gt_75 pub_rec_bankruptcies
    tax_liens tot_hi_cred_lim total_bal_ex_mort total_bc_limit
    total_il_high_credit_limit hardship_flag debt_settlement_flag
""".split()

# The label column, kept as a one-element list so it can be used directly for
# column selection and for dropping the label from the feature frame.
target = ["loan_status"]

In [5]:
# Load the data
file_path = Path('./Resources/LoanStats_2019Q1.csv')
# skiprows=1 skips the first line of the file and [:-2] drops the last two
# rows (presumably non-data header/footer lines -- confirm against the raw CSV).
df = pd.read_csv(file_path, skiprows=1)[:-2]
df = df.loc[:, columns].copy()

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

# Remove the `Issued` loan status. The explicit .copy() makes the result an
# independent frame, so the column assignments below do not write to a slice.
issued_mask = df['loan_status'] != 'Issued'
df = df.loc[issued_mask].copy()

# convert interest rate to numerical: strip the literal '%' and scale to a fraction
df['int_rate'] = df['int_rate'].str.replace('%', '', regex=False).astype('float') / 100

# Convert the target column values to low_risk and high_risk based on their values.
# The mapping is applied to `loan_status` only, so an identical string in any
# other column can never be rewritten by accident. Statuses not listed here
# are left unchanged.
risk_by_status = {'Current': 'low_risk'}
risk_by_status.update(
    dict.fromkeys(
        ['Late (31-120 days)', 'Late (16-30 days)', 'Default', 'In Grace Period'],
        'high_risk',
    )
)
df['loan_status'] = df['loan_status'].replace(risk_by_status)

# Renumber the rows 0..n-1 after the filtering above
df = df.reset_index(drop=True)

df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,Mar-19,low_risk,n,27.24,...,85.7,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N
1,25000.0,0.2,929.09,MORTGAGE,105000.0,Verified,Mar-19,low_risk,n,20.23,...,91.2,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N
2,20000.0,0.2,529.88,MORTGAGE,56000.0,Verified,Mar-19,low_risk,n,24.26,...,66.7,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N
3,10000.0,0.164,353.55,RENT,92000.0,Verified,Mar-19,low_risk,n,31.44,...,100.0,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,Mar-19,low_risk,n,18.76,...,100.0,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N


# Split the Data into Training and Testing

In [6]:
# Work on a copy so the cleaned frame `df` itself is not modified by later cells
df_copy = df.copy()

# Target: the label column(s) named in `target`, kept as a DataFrame
y = df_copy.loc[:, target]

# Features: every remaining column
X = df_copy.drop(columns=target)

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the feature columns
X.describe()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
count,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,...,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0
mean,16677.594562,0.127718,480.652863,88213.71,21.778153,0.217766,0.497697,12.58734,0.12603,17604.142828,...,0.052138,2.219423,95.057627,30.626217,0.125972,0.0,210033.2,61338.43,29734.128558,55722.4
std,10277.34859,0.04813,288.062432,115580.0,20.199244,0.718367,0.758122,6.022869,0.336797,21835.8804,...,0.390633,1.897432,8.326426,33.631463,0.336732,0.0,192808.8,57387.98,26795.394232,50958.45
min,1000.0,0.06,30.89,40.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,20.0,0.0,0.0,0.0,3600.0,235.0,100.0,127.0
25%,9000.0,0.0881,265.73,50000.0,13.89,0.0,0.0,8.0,0.0,6293.0,...,0.0,1.0,93.0,0.0,0.0,0.0,66977.0,26503.0,11600.0,22880.0
50%,15000.0,0.118,404.56,73000.0,19.76,0.0,0.0,11.0,0.0,12068.0,...,0.0,2.0,100.0,20.0,0.0,0.0,146710.0,45357.0,22100.0,42000.0
75%,24000.0,0.1557,648.1,104000.0,26.66,0.0,1.0,16.0,0.0,21735.0,...,0.0,3.0,100.0,50.0,0.0,0.0,303640.0,76570.0,39300.0,72499.0
max,40000.0,0.3084,1676.23,8797500.0,999.0,18.0,5.0,72.0,4.0,587191.0,...,18.0,19.0,100.0,100.0,4.0,0.0,3292782.0,1295455.0,509400.0,1426964.0


In [8]:
# Show the text (object-dtype) feature columns; these are the ones that need
# to be encoded as numbers before a model can be fitted
X.select_dtypes(include=['object'])

Unnamed: 0,home_ownership,verification_status,issue_d,pymnt_plan,initial_list_status,next_pymnt_d,application_type,hardship_flag,debt_settlement_flag
0,RENT,Source Verified,Mar-19,n,w,May-19,Individual,N,N
1,MORTGAGE,Verified,Mar-19,n,w,May-19,Individual,N,N
2,MORTGAGE,Verified,Mar-19,n,w,May-19,Individual,N,N
3,RENT,Verified,Mar-19,n,w,May-19,Individual,N,N
4,MORTGAGE,Not Verified,Mar-19,n,w,May-19,Individual,N,N
...,...,...,...,...,...,...,...,...,...
68812,RENT,Source Verified,Jan-19,n,w,May-19,Individual,N,N
68813,RENT,Not Verified,Jan-19,n,w,May-19,Individual,N,N
68814,MORTGAGE,Source Verified,Jan-19,n,w,May-19,Individual,N,N
68815,MORTGAGE,Verified,Jan-19,n,f,May-19,Individual,N,N


In [9]:
# Check the balance of our target values (number of rows per risk class)
y['loan_status'].value_counts()

low_risk     68470
high_risk      347
Name: loan_status, dtype: int64

# Label Encoding

In [10]:
# Text-valued feature columns to convert to integer codes, in processing order
categorical_columns = [
    'home_ownership',
    'verification_status',
    'issue_d',
    'pymnt_plan',
    'initial_list_status',
    'next_pymnt_d',
    'application_type',
    'hardship_flag',
    'debt_settlement_flag',
]

# One shared encoder is re-fitted on each column in turn, so after the loop
# it only holds the classes of the last column processed.
le = LabelEncoder()
for column_name in categorical_columns:
    X[column_name] = le.fit_transform(X[column_name])


In [11]:
# Split the X and y into X_train, X_test, y_train, y_test
# No test_size is passed, so the library's default split proportion applies;
# random_state=1 fixes the split so reruns produce the same split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble AdaBoost Classifier. For each algorithm, be sure to complete the following steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Print the confusion matrix from sklearn.metrics.
4. Generate a classification report using `classification_report_imbalanced` from imbalanced-learn.
5. For the Balanced Random Forest Classifier only, print the feature importances sorted in descending order (most important feature to least important) along with each feature's score.

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [12]:
from imblearn.ensemble import BalancedRandomForestClassifier

# Balanced random forest with 100 trees; the fixed seed keeps runs repeatable
brf_model = BalancedRandomForestClassifier(
    n_estimators=100,
    random_state=1,
)

# Display the configured (not yet fitted) estimator
brf_model

BalancedRandomForestClassifier(bootstrap=True, class_weight=None,
                               criterion='gini', max_depth=None,
                               max_features='auto', max_leaf_nodes=None,
                               min_impurity_decrease=0.0, min_samples_leaf=2,
                               min_samples_split=2,
                               min_weight_fraction_leaf=0.0, n_estimators=100,
                               n_jobs=1, oob_score=False, random_state=1,
                               replacement=False, sampling_strategy='auto',
                               verbose=0, warm_start=False)

In [13]:
# Fitting the model
# y_train is a single-column DataFrame (it was selected with a list of column
# names), so flatten it to the 1-D label array that `fit` expects instead of
# relying on an implicit column-vector conversion.
brf_model = brf_model.fit(X_train, y_train.values.ravel())

In [14]:
# Making predictions using the testing data.
# y_pred holds one predicted class label per row of X_test.
y_pred = brf_model.predict(X_test)

In [15]:
# Calculate the balanced accuracy score of the Balanced Random Forest
# predictions against the held-out test labels
balanced_accuracy_score(y_test, y_pred)

0.7486897280700942

In [16]:
# Calculating the confusion matrix.
# `labels` pins the row/column order explicitly, so the headings built below
# are guaranteed to line up with the matrix entries.
class_labels = ["high_risk", "low_risk"]
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Create a DataFrame from the confusion matrix (rows = actual class,
# columns = predicted class), named after the real classes rather than 0/1.
cm_df = pd.DataFrame(
    cm,
    index=["Actual " + label for label in class_labels],
    columns=["Predicted " + label for label in class_labels],
)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,64,37
Actual 1,2331,14773


In [17]:
# Print the imbalanced classification report for the Balanced Random Forest
# predictions (one row of metrics per class plus an overall row)
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.03      0.63      0.86      0.05      0.74      0.53       101
   low_risk       1.00      0.86      0.63      0.93      0.74      0.56     17104

avg / total       0.99      0.86      0.64      0.92      0.74      0.56     17205



In [18]:
# Number of feature columns in X (the target column has already been dropped)
len(X.columns)

85

In [19]:
# Pair each feature name with its importance score from the fitted forest,
# then list the pairs from most to least important.
feature_scores = zip(X.columns, brf_model.feature_importances_)
sorted(feature_scores, key=lambda pair: pair[1], reverse=True)

[('total_rec_prncp', 0.08657085068523943),
 ('last_pymnt_amnt', 0.06619997738256855),
 ('total_pymnt', 0.06519395091973006),
 ('total_pymnt_inv', 0.05986789931351337),
 ('total_rec_int', 0.05222133190662811),
 ('int_rate', 0.030620450347263732),
 ('issue_d', 0.023938456635172214),
 ('mths_since_recent_inq', 0.021856043400737712),
 ('installment', 0.019058900689013905),
 ('out_prncp_inv', 0.018488120732016045),
 ('bc_util', 0.016901926863373162),
 ('max_bal_bc', 0.01623849715879394),
 ('tot_hi_cred_lim', 0.015802195647856256),
 ('dti', 0.015617231581198007),
 ('out_prncp', 0.015576644743312574),
 ('total_bal_il', 0.015172866422854281),
 ('avg_cur_bal', 0.014929971266126213),
 ('total_il_high_credit_limit', 0.014238868111630177),
 ('total_bc_limit', 0.014143708113171472),
 ('revol_bal', 0.01409964917169687),
 ('total_rev_hi_lim', 0.014089200392967016),
 ('all_util', 0.013591053596348992),
 ('mo_sin_old_il_acct', 0.013531856273250197),
 ('mo_sin_old_rev_tl_op', 0.013072758374079126),
 ('b

### Easy Ensemble AdaBoost Classifier

In [20]:
# Train the Classifier
from imblearn.ensemble import EasyEnsembleClassifier

# Easy Ensemble with 100 estimators; the fixed seed keeps runs repeatable
classifier = EasyEnsembleClassifier(n_estimators=100, random_state=1)

# y_train is a single-column DataFrame (it was selected with a list of column
# names), so flatten it to the 1-D label array that `fit` expects instead of
# relying on an implicit column-vector conversion.
classifier.fit(X_train, y_train.values.ravel())

EasyEnsembleClassifier(base_estimator=None, n_estimators=100, n_jobs=1,
                       random_state=1, replacement=False,
                       sampling_strategy='auto', verbose=0, warm_start=False)

In [21]:
# Predict a class label for every row of the test set with the Easy Ensemble model
predictions = classifier.predict(X_test)

In [22]:
# Calculate the balanced accuracy score of the Easy Ensemble predictions
# against the held-out test labels
balanced_accuracy_score(y_test, predictions)

0.9316600714093861

In [23]:
# Display the confusion matrix.
# `labels` pins the row/column order explicitly, so the headings built below
# are guaranteed to line up with the matrix entries.
class_labels = ["high_risk", "low_risk"]
cm2 = confusion_matrix(y_test, predictions, labels=class_labels)

# Create a DataFrame from the confusion matrix (rows = actual class,
# columns = predicted class), named after the real classes rather than 0/1.
cm2_df = pd.DataFrame(
    cm2,
    index=["Actual " + label for label in class_labels],
    columns=["Predicted " + label for label in class_labels],
)

cm2_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,93,8
Actual 1,983,16121


In [24]:
# Print the imbalanced classification report for the Easy Ensemble
# predictions (one row of metrics per class plus an overall row)
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.09      0.92      0.94      0.16      0.93      0.87       101
   low_risk       1.00      0.94      0.92      0.97      0.93      0.87     17104

avg / total       0.99      0.94      0.92      0.97      0.93      0.87     17205



# Findings Summary:

1.) Two Ensemble algorithms have been tried for this project to determine which algorithm results in the best performance:

### Balanced Random Forest Classifier:
	Balanced Accuracy Score: 0.7486897280700942
	Precision:
		high_risk       0.03
		low_risk       1.00

	Recall (Sensitivity):
		high_risk     0.63
		low_risk     0.86

	F1 Score:
		high_risk        0.05     
		low_risk       0.93 

### Easy Ensemble AdaBoost Classifier:
	Balanced Accuracy Score: 0.9316600714093861
	Precision:
		high_risk       0.09
		low_risk       1.00

	Recall (Sensitivity):
		high_risk     0.92
		low_risk     0.94

	F1 Score:
		high_risk        0.16   
		low_risk       0.97
 


4.) Based on the balanced accuracy scores (0.93 vs. 0.75), the Easy Ensemble AdaBoost Classifier clearly stands out.

5.) For the Easy Ensemble AdaBoost Classifier, recall (sensitivity) for high risk and low risk is closely in line (0.92 vs. 0.94); for the Balanced Random Forest the gap is wider (0.63 vs. 0.86). For both models, however, the precision for high risk (0.03 and 0.09) is much lower than it is for low risk (1.00). The lower precision for high risk is reflected in the lower high-risk F1 scores as well.

6.) Due to the nature of the business, having higher recall (sensitivity) scores is more important than precision.

7.) Considering both the balanced accuracy and recall scores, I would choose the Easy Ensemble AdaBoost Classifier for this project.

8.) Now we can clearly also see which features, or columns, of the dataset are more relevant. And to improve the model, we can drop some of the lower ranked features



