In [37]:
import warnings
warnings.filterwarnings('ignore')

In [38]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.ensemble import RandomForestClassifier


In [41]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced


# Read the CSV and Perform Basic Data Cleaning

In [42]:
# https://help.lendingclub.com/hc/en-us/articles/215488038-What-do-the-different-Note-statuses-mean-

columns = [
    "loan_amnt", "int_rate", "installment", "home_ownership",
    "annual_inc", "verification_status", "issue_d", "loan_status",
    "pymnt_plan", "dti", "delinq_2yrs", "inq_last_6mths",
    "open_acc", "pub_rec", "revol_bal", "total_acc",
    "initial_list_status", "out_prncp", "out_prncp_inv", "total_pymnt",
    "total_pymnt_inv", "total_rec_prncp", "total_rec_int", "total_rec_late_fee",
    "recoveries", "collection_recovery_fee", "last_pymnt_amnt", "next_pymnt_d",
    "collections_12_mths_ex_med", "policy_code", "application_type", "acc_now_delinq",
    "tot_coll_amt", "tot_cur_bal", "open_acc_6m", "open_act_il",
    "open_il_12m", "open_il_24m", "mths_since_rcnt_il", "total_bal_il",
    "il_util", "open_rv_12m", "open_rv_24m", "max_bal_bc",
    "all_util", "total_rev_hi_lim", "inq_fi", "total_cu_tl",
    "inq_last_12m", "acc_open_past_24mths", "avg_cur_bal", "bc_open_to_buy",
    "bc_util", "chargeoff_within_12_mths", "delinq_amnt", "mo_sin_old_il_acct",
    "mo_sin_old_rev_tl_op", "mo_sin_rcnt_rev_tl_op", "mo_sin_rcnt_tl", "mort_acc",
    "mths_since_recent_bc", "mths_since_recent_inq", "num_accts_ever_120_pd", "num_actv_bc_tl",
    "num_actv_rev_tl", "num_bc_sats", "num_bc_tl", "num_il_tl",
    "num_op_rev_tl", "num_rev_accts", "num_rev_tl_bal_gt_0",
    "num_sats", "num_tl_120dpd_2m", "num_tl_30dpd", "num_tl_90g_dpd_24m",
    "num_tl_op_past_12m", "pct_tl_nvr_dlq", "percent_bc_gt_75", "pub_rec_bankruptcies",
    "tax_liens", "tot_hi_cred_lim", "total_bal_ex_mort", "total_bc_limit",
    "total_il_high_credit_limit", "hardship_flag", "debt_settlement_flag"
]

target = ["loan_status"]

In [43]:
# Load the data
file_path = Path('LoanStats_2019Q1.csv')
df = pd.read_csv(file_path, skiprows=0)[:-2]
df = df.loc[:, columns].copy()

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

# Remove the `Issued` loan status
issued_mask = df['loan_status'] != 'Issued'
df = df.loc[issued_mask]

# convert interest rate to numerical
df['int_rate'] = df['int_rate'].str.replace('%', '')
df['int_rate'] = df['int_rate'].astype('float') / 100


# Convert the target column values to low_risk and high_risk based on their values
x = {'Current': 'low_risk'}   
df = df.replace(x)

x = dict.fromkeys(['Late (31-120 days)', 'Late (16-30 days)', 'Default', 'In Grace Period'], 'high_risk')    
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,19-Mar,low_risk,n,27.24,...,85.7,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N
1,25000.0,0.2,929.09,MORTGAGE,105000.0,Verified,19-Mar,low_risk,n,20.23,...,91.2,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N
2,20000.0,0.2,529.88,MORTGAGE,56000.0,Verified,19-Mar,low_risk,n,24.26,...,66.7,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N
3,10000.0,0.164,353.55,RENT,92000.0,Verified,19-Mar,low_risk,n,31.44,...,100.0,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,19-Mar,low_risk,n,18.76,...,100.0,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N


In [44]:
df.loc[df["debt_settlement_flag"] == "y"]

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag


# Split the Data into Training and Testing

In [45]:
# Create our features
#X = # YOUR CODE HERE

# Create our target
#y = # YOUR CODE HERE
X = df.drop(columns='loan_status')
X = pd.get_dummies(X)

y = df.loc[:,target].copy()

# x_ensm = [i for i in df.columns if i not in ('ID', 'debt_settlement_flag')]
# X = df[x_ensm]
# y = df['debt_settlement_flag']
# X = pd.get_dummies(X)

In [46]:
X.describe()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,issue_d_19-Mar,pymnt_plan_n,initial_list_status_f,initial_list_status_w,next_pymnt_d_19-Apr,next_pymnt_d_19-May,application_type_Individual,application_type_Joint App,hardship_flag_N,debt_settlement_flag_N
count,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,...,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0
mean,16677.594562,0.127718,480.652863,88213.71,21.778153,0.217766,0.497697,12.58734,0.12603,17604.142828,...,0.177238,1.0,0.123879,0.876121,0.383161,0.616839,0.86034,0.13966,1.0,1.0
std,10277.34859,0.04813,288.062432,115580.0,20.199244,0.718367,0.758122,6.022869,0.336797,21835.8804,...,0.381873,0.0,0.329446,0.329446,0.486161,0.486161,0.346637,0.346637,0.0,0.0
min,1000.0,0.06,30.89,40.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
25%,9000.0,0.0881,265.73,50000.0,13.89,0.0,0.0,8.0,0.0,6293.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0
50%,15000.0,0.118,404.56,73000.0,19.76,0.0,0.0,11.0,0.0,12068.0,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0
75%,24000.0,0.1557,648.1,104000.0,26.66,0.0,1.0,16.0,0.0,21735.0,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
max,40000.0,0.3084,1676.23,8797500.0,999.0,18.0,5.0,72.0,4.0,587191.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [47]:
y= pd.DataFrame(y)

In [48]:
print(type(X))

<class 'pandas.core.frame.DataFrame'>


In [49]:
print(type(y))

<class 'pandas.core.frame.DataFrame'>


In [50]:
from sklearn.model_selection import train_test_split
# YOUR CODE HERE
X_train,X_test,y_train,y_test = train_test_split(X, y, random_state=1)

In [51]:
print(X_train)

       loan_amnt  int_rate  installment  annual_inc    dti  delinq_2yrs  \
3903      1600.0    0.0881        50.74    35964.00  24.12          0.0   
28390     9000.0    0.0756       280.21    41000.00  16.89          1.0   
15470    10000.0    0.1033       214.10   112000.00  17.75          0.0   
4279     36000.0    0.1033      1167.21   120000.00  19.95          0.0   
57514    18000.0    0.0881       570.81    51000.00  19.11          1.0   
47334     9000.0    0.0881       285.41    55000.00  13.31          0.0   
60120    12000.0    0.1131       394.63    44000.00  18.90          1.0   
20190    22000.0    0.1447       756.94    35000.00  31.14          0.0   
2727     10000.0    0.2000       371.64    95000.00   9.85          0.0   
56480    10000.0    0.1131       328.86   165000.00  15.16          0.0   
54370    24000.0    0.2235       667.64    90000.00  22.73          0.0   
30486    23000.0    0.0646       704.51   144000.00   6.14          0.0   
54680    35000.0    0.103

# Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble AdaBoost classifier . For each algorithm, be sure to complete the folliowing steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Print the confusion matrix from sklearn.metrics.
4. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.
5. For the Balanced Random Forest Classifier onely, print the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [52]:
# Resample the training data with the BalancedRandomForestClassifier
# YOUR CODE HERE
from imblearn.ensemble import BalancedRandomForestClassifier




In [53]:
brf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1) 

In [54]:
# from sklearn.datasets import make_classification

# X, y = make_classification(n_samples=1000, n_classes=3,
#                            n_informative=4, weights=[0.2, 0.3, 0.5],
#                            random_state=0)


In [55]:
print(X.shape)

(68817, 95)


In [56]:
print(y.shape)

(68817, 1)


In [57]:
brf_model.fit(X_train, y_train)


BalancedRandomForestClassifier(bootstrap=True, class_weight=None,
                criterion='gini', max_depth=None, max_features='auto',
                max_leaf_nodes=None, min_impurity_decrease=0.0,
                min_samples_leaf=2, min_samples_split=2,
                min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
                oob_score=False, random_state=1, replacement=False,
                sampling_strategy='auto', verbose=0, warm_start=False)

In [58]:
predictions = brf_model.predict(X_test)

predictions

array(['low_risk', 'low_risk', 'high_risk', ..., 'low_risk', 'low_risk',
       'low_risk'], dtype=object)

In [59]:
# Calculated the balanced accuracy score
# YOUR CODE HERE
acc_score = balanced_accuracy_score(y_test, predictions)


In [60]:
acc_score

0.7855052723466922

In [61]:
# Display the confusion matrix
# YOUR CODE HERE
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test,predictions)


In [62]:
cm

array([[   68,    33],
       [ 1749, 15355]])

In [63]:
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,68,33
Actual 1,1749,15355


In [64]:
# Print the imbalanced classification report
# YOUR CODE HERE
print("Confusion Matrix")
display(cm_df)
#print(f"Accuracy Score : {acc_score}")
display(acc_score)
print("Classification Report")
print(classification_report_imbalanced(y_test, predictions))



Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,68,33
Actual 1,1749,15355


0.7855052723466922

Classification Report
                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.04      0.67      0.90      0.07      0.78      0.59       101
   low_risk       1.00      0.90      0.67      0.95      0.78      0.62     17104

avg / total       0.99      0.90      0.67      0.94      0.78      0.62     17205



In [65]:
# List the features sorted in descending order by feature importance
# YOUR CODE HERE

importances = brf_model.feature_importances_
importances

sorted(zip(brf_model.feature_importances_, X.columns), reverse=False)


[(0.0, 'acc_now_delinq'),
 (0.0, 'chargeoff_within_12_mths'),
 (0.0, 'collection_recovery_fee'),
 (0.0, 'debt_settlement_flag_N'),
 (0.0, 'delinq_amnt'),
 (0.0, 'hardship_flag_N'),
 (0.0, 'home_ownership_ANY'),
 (0.0, 'num_tl_120dpd_2m'),
 (0.0, 'num_tl_30dpd'),
 (0.0, 'policy_code'),
 (0.0, 'pymnt_plan_n'),
 (0.0, 'recoveries'),
 (0.0, 'tax_liens'),
 (0.0004192455022893127, 'initial_list_status_f'),
 (0.0005168345750594915, 'collections_12_mths_ex_med'),
 (0.000573414997420326, 'num_tl_90g_dpd_24m'),
 (0.0008125182396705508, 'initial_list_status_w'),
 (0.0008976722260399365, 'pub_rec'),
 (0.0012151288883862276, 'pub_rec_bankruptcies'),
 (0.0012213148580230454, 'application_type_Individual'),
 (0.0012263315437383057, 'application_type_Joint App'),
 (0.0015472230884974506, 'verification_status_Source Verified'),
 (0.001736019018028134, 'verification_status_Verified'),
 (0.0018404849590376573, 'home_ownership_OWN'),
 (0.0020851101815353096, 'home_ownership_RENT'),
 (0.0021899772867773103

### Easy Ensemble AdaBoost Classifier

In [66]:
# Train the EasyEnsembleClassifier
# YOUR CODE HERE
from imblearn.ensemble import EasyEnsembleClassifier 

eec = EasyEnsembleClassifier(n_estimators=100,random_state=1)





In [69]:
eec.fit(X_train, y_train)

AttributeError: 'Pipeline' object has no attribute '_iter'

In [70]:
# Calculated the balanced accuracy score
# YOUR CODE HERE
predictions = eec.predict(X_test)
balanced_accuracy_score(y_test,predictions)

0.5

In [73]:
# Display the confusion matrix
# YOUR CODE HERE
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"]
)

# Displaying results
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,101,0
Actual 1,17104,0


In [75]:
# Print the imbalanced classification report
# YOUR CODE HERE
print("Classification Report")
print(classification_report_imbalanced(y_test, predictions))

Classification Report
                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.01      1.00      0.00      0.01      0.00      0.00       101
   low_risk       0.00      0.00      1.00      0.00      0.00      0.00     17104

avg / total       0.00      0.01      0.99      0.00      0.00      0.00     17205

