In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
!pip install "imbalanced-learn<0.9"

Collecting imbalanced-learn<0.9
  Using cached imbalanced_learn-0.8.1-py3-none-any.whl (189 kB)
Installing collected packages: imbalanced-learn
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-learn 0.9.1
    Uninstalling imbalanced-learn-0.9.1:
      Successfully uninstalled imbalanced-learn-0.9.1
Successfully installed imbalanced-learn-0.8.1


In [4]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Basic Data Cleaning

In [5]:
# https://help.lendingclub.com/hc/en-us/articles/215488038-What-do-the-different-Note-statuses-mean-

columns = [
    "loan_amnt", "int_rate", "installment", "home_ownership",
    "annual_inc", "verification_status", "issue_d", "loan_status",
    "pymnt_plan", "dti", "delinq_2yrs", "inq_last_6mths",
    "open_acc", "pub_rec", "revol_bal", "total_acc",
    "initial_list_status", "out_prncp", "out_prncp_inv", "total_pymnt",
    "total_pymnt_inv", "total_rec_prncp", "total_rec_int", "total_rec_late_fee",
    "recoveries", "collection_recovery_fee", "last_pymnt_amnt", "next_pymnt_d",
    "collections_12_mths_ex_med", "policy_code", "application_type", "acc_now_delinq",
    "tot_coll_amt", "tot_cur_bal", "open_acc_6m", "open_act_il",
    "open_il_12m", "open_il_24m", "mths_since_rcnt_il", "total_bal_il",
    "il_util", "open_rv_12m", "open_rv_24m", "max_bal_bc",
    "all_util", "total_rev_hi_lim", "inq_fi", "total_cu_tl",
    "inq_last_12m", "acc_open_past_24mths", "avg_cur_bal", "bc_open_to_buy",
    "bc_util", "chargeoff_within_12_mths", "delinq_amnt", "mo_sin_old_il_acct",
    "mo_sin_old_rev_tl_op", "mo_sin_rcnt_rev_tl_op", "mo_sin_rcnt_tl", "mort_acc",
    "mths_since_recent_bc", "mths_since_recent_inq", "num_accts_ever_120_pd", "num_actv_bc_tl",
    "num_actv_rev_tl", "num_bc_sats", "num_bc_tl", "num_il_tl",
    "num_op_rev_tl", "num_rev_accts", "num_rev_tl_bal_gt_0",
    "num_sats", "num_tl_120dpd_2m", "num_tl_30dpd", "num_tl_90g_dpd_24m",
    "num_tl_op_past_12m", "pct_tl_nvr_dlq", "percent_bc_gt_75", "pub_rec_bankruptcies",
    "tax_liens", "tot_hi_cred_lim", "total_bal_ex_mort", "total_bc_limit",
    "total_il_high_credit_limit", "hardship_flag", "debt_settlement_flag"
]

target = ["loan_status"]

In [6]:
# Load the data
file_path = Path('LoanStats_2019Q1.csv')
df = pd.read_csv(file_path, skiprows=1)[:-2]
df = df.loc[:, columns].copy()

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

# Remove the `Issued` loan status
issued_mask = df['loan_status'] != 'Issued'
df = df.loc[issued_mask]

# convert interest rate to numerical
df['int_rate'] = df['int_rate'].str.replace('%', '')
df['int_rate'] = df['int_rate'].astype('float') / 100


# Convert the target column values to low_risk and high_risk based on their values
x = {'Current': 'low_risk'}   
df = df.replace(x)

x = dict.fromkeys(['Late (31-120 days)', 'Late (16-30 days)', 'Default', 'In Grace Period'], 'high_risk')    
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,Mar-2019,low_risk,n,27.24,...,85.7,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N
1,25000.0,0.2,929.09,MORTGAGE,105000.0,Verified,Mar-2019,low_risk,n,20.23,...,91.2,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N
2,20000.0,0.2,529.88,MORTGAGE,56000.0,Verified,Mar-2019,low_risk,n,24.26,...,66.7,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N
3,10000.0,0.164,353.55,RENT,92000.0,Verified,Mar-2019,low_risk,n,31.44,...,100.0,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,Mar-2019,low_risk,n,18.76,...,100.0,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N


In [7]:
df_binary_encoded =  pd.get_dummies(df, columns=['home_ownership','verification_status','pymnt_plan','initial_list_status','application_type'])
df_binary_encoded


Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,issue_d,loan_status,dti,delinq_2yrs,inq_last_6mths,open_acc,...,home_ownership_OWN,home_ownership_RENT,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App
0,10500.0,0.1719,375.35,66000.0,Mar-2019,low_risk,27.24,0.0,0.0,8.0,...,0,1,0,1,0,1,0,1,1,0
1,25000.0,0.2000,929.09,105000.0,Mar-2019,low_risk,20.23,0.0,0.0,17.0,...,0,0,0,0,1,1,0,1,1,0
2,20000.0,0.2000,529.88,56000.0,Mar-2019,low_risk,24.26,0.0,0.0,8.0,...,0,0,0,0,1,1,0,1,1,0
3,10000.0,0.1640,353.55,92000.0,Mar-2019,low_risk,31.44,0.0,1.0,10.0,...,0,1,0,0,1,1,0,1,1,0
4,22000.0,0.1474,520.39,52000.0,Mar-2019,low_risk,18.76,0.0,1.0,14.0,...,0,0,1,0,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68812,10000.0,0.1502,346.76,26000.0,Jan-2019,low_risk,9.60,0.0,0.0,9.0,...,0,1,0,1,0,1,0,1,1,0
68813,12000.0,0.2727,368.37,63000.0,Jan-2019,low_risk,29.07,0.0,0.0,8.0,...,0,1,1,0,0,1,0,1,1,0
68814,5000.0,0.1992,185.62,52000.0,Jan-2019,low_risk,14.86,0.0,0.0,5.0,...,0,0,0,1,0,1,0,1,1,0
68815,40000.0,0.0646,1225.24,520000.0,Jan-2019,low_risk,9.96,0.0,1.0,21.0,...,0,0,0,0,1,1,1,0,1,0


# Split the Data into Training and Testing

In [8]:
# Create our features
x_cols = [i for i in df_binary_encoded.columns if i not in ('loan_status')]
X = df_binary_encoded[x_cols]

# Create our target
y = df_binary_encoded[target]

In [9]:
X.describe()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,home_ownership_OWN,home_ownership_RENT,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App
count,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,...,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0
mean,16677.594562,0.127718,480.652863,88213.71,21.778153,0.217766,0.497697,12.58734,0.12603,17604.142828,...,0.106747,0.357659,0.478007,0.373992,0.148001,1.0,0.123879,0.876121,0.86034,0.13966
std,10277.34859,0.04813,288.062432,115580.0,20.199244,0.718367,0.758122,6.022869,0.336797,21835.8804,...,0.308793,0.479314,0.49952,0.483865,0.355104,0.0,0.329446,0.329446,0.346637,0.346637
min,1000.0,0.06,30.89,40.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,9000.0,0.0881,265.73,50000.0,13.89,0.0,0.0,8.0,0.0,6293.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
50%,15000.0,0.118,404.56,73000.0,19.76,0.0,0.0,11.0,0.0,12068.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
75%,24000.0,0.1557,648.1,104000.0,26.66,0.0,1.0,16.0,0.0,21735.0,...,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
max,40000.0,0.3084,1676.23,8797500.0,999.0,18.0,5.0,72.0,4.0,587191.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
pd.set_option('display.max_rows', 500)
print(X.dtypes)


loan_amnt                              float64
int_rate                               float64
installment                            float64
annual_inc                             float64
issue_d                                 object
dti                                    float64
delinq_2yrs                            float64
inq_last_6mths                         float64
open_acc                               float64
pub_rec                                float64
revol_bal                              float64
total_acc                              float64
out_prncp                              float64
out_prncp_inv                          float64
total_pymnt                            float64
total_pymnt_inv                        float64
total_rec_prncp                        float64
total_rec_int                          float64
total_rec_late_fee                     float64
recoveries                             float64
collection_recovery_fee                float64
last_pymnt_am

In [11]:
df_binary_encoded =  pd.get_dummies(df, columns=['home_ownership','verification_status','pymnt_plan','initial_list_status','application_type'])
df_binary_encoded


Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,issue_d,loan_status,dti,delinq_2yrs,inq_last_6mths,open_acc,...,home_ownership_OWN,home_ownership_RENT,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App
0,10500.0,0.1719,375.35,66000.0,Mar-2019,low_risk,27.24,0.0,0.0,8.0,...,0,1,0,1,0,1,0,1,1,0
1,25000.0,0.2000,929.09,105000.0,Mar-2019,low_risk,20.23,0.0,0.0,17.0,...,0,0,0,0,1,1,0,1,1,0
2,20000.0,0.2000,529.88,56000.0,Mar-2019,low_risk,24.26,0.0,0.0,8.0,...,0,0,0,0,1,1,0,1,1,0
3,10000.0,0.1640,353.55,92000.0,Mar-2019,low_risk,31.44,0.0,1.0,10.0,...,0,1,0,0,1,1,0,1,1,0
4,22000.0,0.1474,520.39,52000.0,Mar-2019,low_risk,18.76,0.0,1.0,14.0,...,0,0,1,0,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68812,10000.0,0.1502,346.76,26000.0,Jan-2019,low_risk,9.60,0.0,0.0,9.0,...,0,1,0,1,0,1,0,1,1,0
68813,12000.0,0.2727,368.37,63000.0,Jan-2019,low_risk,29.07,0.0,0.0,8.0,...,0,1,1,0,0,1,0,1,1,0
68814,5000.0,0.1992,185.62,52000.0,Jan-2019,low_risk,14.86,0.0,0.0,5.0,...,0,0,0,1,0,1,0,1,1,0
68815,40000.0,0.0646,1225.24,520000.0,Jan-2019,low_risk,9.96,0.0,1.0,21.0,...,0,0,0,0,1,1,1,0,1,0


In [12]:
# Check the balance of our target values
y['loan_status'].value_counts()

low_risk     68470
high_risk      347
Name: loan_status, dtype: int64

In [13]:
#pd.to_datetime(df_binary_encoded['issue_d'], format='%b-%y')
df_binary_encoded['issue_d'].value_counts()

Jan-2019    31041
Feb-2019    25579
Mar-2019    12197
Name: issue_d, dtype: int64

In [14]:
df_binary_encoded['next_pymnt_d'].value_counts()

May-2019    42449
Apr-2019    26368
Name: next_pymnt_d, dtype: int64

In [15]:
# Months dictionary 
# Small dictionary due to values in data. Would change to break out month and year if dataset encompassed a larger range.
months_num = {
    "Jan-2019": 1,
    "Feb-2019": 2,
    "Mar-2019": 3,
    "Apr-2019": 4,
    "May-2019": 5
}


In [16]:
df_binary_encoded["issued_month_num"] = df_binary_encoded["issue_d"].apply(lambda x: months_num[x])
df_binary_encoded['next_pymnt_month_num'] = df_binary_encoded["next_pymnt_d"].apply(lambda x: months_num[x])
df_binary_encoded[["issue_d", "next_pymnt_d","issued_month_num", 'next_pymnt_month_num']]

Unnamed: 0,issue_d,next_pymnt_d,issued_month_num,next_pymnt_month_num
0,Mar-2019,May-2019,3,5
1,Mar-2019,May-2019,3,5
2,Mar-2019,May-2019,3,5
3,Mar-2019,May-2019,3,5
4,Mar-2019,May-2019,3,5
...,...,...,...,...
68812,Jan-2019,May-2019,1,5
68813,Jan-2019,May-2019,1,5
68814,Jan-2019,May-2019,1,5
68815,Jan-2019,May-2019,1,5


In [17]:
df_binary_encoded[['hardship_flag','debt_settlement_flag']].value_counts()

hardship_flag  debt_settlement_flag
N              N                       68817
dtype: int64

In [18]:
# Can drop hardship_flag and debt_settlement_flag since all records have the same value

df_binary_encoded = df_binary_encoded.drop(["issue_d", "next_pymnt_d", "hardship_flag", "debt_settlement_flag"], axis=1)
df_binary_encoded


Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,loan_status,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,issued_month_num,next_pymnt_month_num
0,10500.0,0.1719,375.35,66000.0,low_risk,27.24,0.0,0.0,8.0,0.0,...,0,1,0,1,0,1,1,0,3,5
1,25000.0,0.2000,929.09,105000.0,low_risk,20.23,0.0,0.0,17.0,1.0,...,0,0,1,1,0,1,1,0,3,5
2,20000.0,0.2000,529.88,56000.0,low_risk,24.26,0.0,0.0,8.0,0.0,...,0,0,1,1,0,1,1,0,3,5
3,10000.0,0.1640,353.55,92000.0,low_risk,31.44,0.0,1.0,10.0,1.0,...,0,0,1,1,0,1,1,0,3,5
4,22000.0,0.1474,520.39,52000.0,low_risk,18.76,0.0,1.0,14.0,0.0,...,1,0,0,1,0,1,1,0,3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68812,10000.0,0.1502,346.76,26000.0,low_risk,9.60,0.0,0.0,9.0,0.0,...,0,1,0,1,0,1,1,0,1,5
68813,12000.0,0.2727,368.37,63000.0,low_risk,29.07,0.0,0.0,8.0,0.0,...,1,0,0,1,0,1,1,0,1,5
68814,5000.0,0.1992,185.62,52000.0,low_risk,14.86,0.0,0.0,5.0,1.0,...,0,1,0,1,0,1,1,0,1,5
68815,40000.0,0.0646,1225.24,520000.0,low_risk,9.96,0.0,1.0,21.0,0.0,...,0,0,1,1,1,0,1,0,1,5


In [19]:
x_cols = [i for i in df_binary_encoded.columns if i not in ('loan_status')]
X = df_binary_encoded[x_cols]

# Create our target
y = df_binary_encoded[target]

In [20]:
X.shape

(68817, 90)

In [21]:
y.shape

(68817, 1)

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble AdaBoost classifier . For each algorithm, be sure to complete the folliowing steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Print the confusion matrix from sklearn.metrics.
4. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.
5. For the Balanced Random Forest Classifier onely, print the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [23]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=1)


In [24]:
help(brf.fit)

Help on method fit in module imblearn.ensemble._forest:

fit(X, y, sample_weight=None) method of imblearn.ensemble._forest.BalancedRandomForestClassifier instance
    Build a forest of trees from the training set (X, y).
    
    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The training input samples. Internally, its dtype will be converted
        to ``dtype=np.float32``. If a sparse matrix is provided, it will be
        converted into a sparse ``csc_matrix``.
    
    y : array-like of shape (n_samples,) or (n_samples, n_outputs)
        The target values (class labels in classification, real numbers in
        regression).
    
    sample_weight : array-like of shape (n_samples,)
        Sample weights. If None, then samples are equally weighted. Splits
        that would create child nodes with net zero or negative weight are
        ignored while searching for a split in each node. In the case of
        classification, sp

In [25]:
X_train.shape

(51612, 90)

In [26]:
y_train.shape

(51612, 1)

In [27]:
len(y_train)

51612

In [28]:
brf.fit(X = X_train,y= y_train)
y_pred = brf.predict(X_test)

In [29]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.7834690976113514

In [30]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[   69,    32],
       [ 1988, 15116]])

In [31]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.03      0.68      0.88      0.06      0.78      0.59       101
   low_risk       1.00      0.88      0.68      0.94      0.78      0.62     17104

avg / total       0.99      0.88      0.68      0.93      0.78      0.62     17205



In [32]:
# List the features sorted in descending order by feature importance
print(print(brf.feature_importances_))

[0.01195588 0.02791759 0.02131641 0.01538105 0.01728903 0.00306034
 0.00425572 0.00886852 0.00164197 0.01424016 0.01027484 0.01583513
 0.01572009 0.0589642  0.06549005 0.07935725 0.05583892 0.0097067
 0.         0.         0.05676619 0.00047533 0.         0.
 0.00339996 0.01353289 0.00803686 0.00577572 0.00512149 0.00844509
 0.01505558 0.01298184 0.01270071 0.0044033  0.00471901 0.01685487
 0.01007593 0.01336574 0.00606542 0.00698434 0.01008464 0.00839481
 0.01444354 0.01440504 0.01640975 0.0001614  0.         0.01384593
 0.0133705  0.01108226 0.00869129 0.00469049 0.0133017  0.01726626
 0.0019719  0.00488215 0.00684411 0.00507072 0.00845383 0.01064887
 0.0083218  0.01065202 0.00769324 0.00947029 0.         0.
 0.00063244 0.00567521 0.00682612 0.00692475 0.00147208 0.
 0.01239357 0.01332274 0.01469188 0.01431498 0.00028549 0.00171349
 0.00148545 0.00147206 0.00227887 0.00240779 0.00161289 0.
 0.00079976 0.0006848  0.00196911 0.00154944 0.03859177 0.00686464]
None


### Easy Ensemble AdaBoost Classifier

In [40]:
!pip install "imbalanced-learn<0.9.0"

Collecting imbalanced-learn<0.9.0
  Using cached imbalanced_learn-0.8.1-py3-none-any.whl (189 kB)
Installing collected packages: imbalanced-learn
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-learn 0.9.1
    Uninstalling imbalanced-learn-0.9.1:
      Successfully uninstalled imbalanced-learn-0.9.1
Successfully installed imbalanced-learn-0.8.1


In [41]:
!pip install scikit-learn==1.0 -U



In [42]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier

In [43]:
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1, verbose=1, sampling_strategy=0)
eec.fit(X_train, y_train)

AttributeError: 'EasyEnsembleClassifier' object has no attribute 'n_features_in_'

I was unable to find the correct combination of packages in order for EasyEnsembleClassifier to run.
This bug indicates that it was fixed in imblearn 0.9.1, but this error is still appearing for me. I have tried in python 3.7, 3.8, and 3.9 with no luck. Further investigation is needed, but time constraints do not allow for it.

In [None]:
# Calculated the balanced accuracy score
# YOUR CODE HERE

0.9316600714093861

In [None]:
# Display the confusion matrix
# YOUR CODE HERE

array([[   93,     8],
       [  983, 16121]])

In [None]:
# Print the imbalanced classification report
# YOUR CODE HERE

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.09      0.92      0.94      0.16      0.93      0.87       101
   low_risk       1.00      0.94      0.92      0.97      0.93      0.87     17104

avg / total       0.99      0.94      0.92      0.97      0.93      0.87     17205

