# Credit Risk Ensemble Techniques

In [1]:
import warnings
# Suppress every warning category so library notices do not clutter the cell
# output. Note that this is a blanket filter: deprecation and other warnings
# raised by later cells are hidden as well.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import StandardScaler

# Read the CSV and Perform Basic Data Cleaning

In [4]:
# Load the lending data from the CSV file (path is relative to the notebook's
# working directory) and preview the first five rows
file_path = Path('Resources/lending_data.csv')
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,own,52800,0.431818,5,1,22800,low_risk
1,8400.0,6.692,own,43600,0.311927,3,0,13600,low_risk
2,9000.0,6.963,rent,46100,0.349241,3,0,16100,low_risk
3,10700.0,7.664,own,52700,0.43074,5,1,22700,low_risk
4,10800.0,7.698,mortgage,53000,0.433962,5,1,23000,low_risk


In [5]:
# Count of non-null values in each column of the DataFrame
df.count()

loan_size           77536
interest_rate       77536
homeowner           77536
borrower_income     77536
debt_to_income      77536
num_of_accounts     77536
derogatory_marks    77536
total_debt          77536
loan_status         77536
dtype: int64

In [6]:
# Total the missing (null/NaN) entries found in each column
df.isna().sum()

loan_size           0
interest_rate       0
homeowner           0
borrower_income     0
debt_to_income      0
num_of_accounts     0
derogatory_marks    0
total_debt          0
loan_status         0
dtype: int64

# Split the Data into Training and Testing

In [7]:
# Target: the loan_status label of each loan
y = df["loan_status"]
# Features: every remaining column, with the categorical homeowner column
# expanded into one indicator column per category
X = pd.get_dummies(df.drop(columns="loan_status"), columns=["homeowner"])
X.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,homeowner_mortgage,homeowner_own,homeowner_rent
0,10700.0,7.672,52800,0.431818,5,1,22800,0,1,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0,1,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0,0,1
3,10700.0,7.664,52700,0.43074,5,1,22700,0,1,0
4,10800.0,7.698,53000,0.433962,5,1,23000,1,0,0


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for each feature column
X.describe()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,homeowner_mortgage,homeowner_own,homeowner_rent
count,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0
mean,9805.562577,7.292333,49221.949804,0.377318,3.82661,0.392308,19221.949804,0.497472,0.398911,0.103616
std,2093.223153,0.889495,8371.635077,0.081519,1.904426,0.582086,8371.635077,0.499997,0.489678,0.304764
min,5000.0,5.25,30000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8700.0,6.825,44800.0,0.330357,3.0,0.0,14800.0,0.0,0.0,0.0
50%,9500.0,7.172,48100.0,0.376299,4.0,0.0,18100.0,0.0,0.0,0.0
75%,10400.0,7.528,51400.0,0.416342,4.0,1.0,21400.0,1.0,1.0,0.0
max,23800.0,13.235,105200.0,0.714829,16.0,3.0,75200.0,1.0,1.0,1.0


In [9]:
# Check the balance of our target values: count how many loans fall into
# each loan_status class
y.value_counts()

low_risk     75036
high_risk     2500
Name: loan_status, dtype: int64

In [10]:
# Split X and y into X_train, X_test, y_train, y_test
from sklearn.model_selection import train_test_split

# Passing stratify=y makes both splits keep the same proportion of each
# loan_status class as the full dataset; random_state=1 fixes the shuffle.
# No test_size is given, so scikit-learn's default split size applies.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)

### Scaling Data

In [11]:
# Create the scaler and fit it on the training split only, so the test split
# is transformed with statistics learned from the training data
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
# Apply the fitted scaler to both splits
X_train_scaled, X_test_scaled = map(X_scaler.transform, (X_train, X_test))
# Show the scaled training features as a DataFrame
pd.DataFrame(X_train_scaled)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.140456,0.123092,0.128595,0.321873,0.091152,1.044956,0.128595,1.007318,-0.818610,-0.337768
1,-0.289695,-0.273940,-0.277726,-0.208750,-0.434343,-0.674714,-0.277726,1.007318,-0.818610,-0.337768
2,-0.480874,-0.475267,-0.480886,-0.503997,-0.434343,-0.674714,-0.480886,1.007318,-0.818610,-0.337768
3,-0.433079,-0.420155,-0.421133,-0.414880,-0.434343,-0.674714,-0.421133,1.007318,-0.818610,-0.337768
4,-0.624258,-0.631605,-0.636244,-0.745204,-0.434343,-0.674714,-0.636244,-0.992735,1.221583,-0.337768
...,...,...,...,...,...,...,...,...,...,...
58147,1.191938,1.184841,1.180248,1.411744,1.142141,1.044956,1.180248,1.007318,-0.818610,-0.337768
58148,-0.719847,-0.740704,-0.743799,-0.920736,-0.959837,-0.674714,-0.743799,-0.992735,1.221583,-0.337768
58149,-1.484561,-1.467283,-1.472786,-2.336149,-1.485332,-0.674714,-1.472786,1.007318,-0.818610,-0.337768
58150,-0.289695,-0.268316,-0.265775,-0.192048,-0.434343,-0.674714,-0.265775,1.007318,-0.818610,-0.337768


# Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble Classifier. For each algorithm, be sure to complete the following steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Print the confusion matrix from sklearn.metrics.
4. Generate a classification report using `classification_report_imbalanced` from imbalanced-learn.
5. For the Balanced Random Forest Classifier only, print the feature importances sorted in descending order (most important feature to least important) along with each feature's score.

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [12]:
# Train a BalancedRandomForestClassifier on the scaled training data
# (n_estimators=100; random_state=1 keeps the run repeatable)
from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brf.fit(X_train_scaled, y_train)

BalancedRandomForestClassifier(random_state=1)

In [13]:
# Predict on the scaled test set, then calculate the balanced accuracy score
# of the balanced random forest against the true test labels
y_pred_rf = brf.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred_rf)

0.9936818806972654

In [14]:
# Display the confusion matrix for the balanced random forest predictions
# (rows are the actual classes, columns are the predicted classes)
confusion_matrix(y_test, y_pred_rf)

array([[  622,     3],
       [  147, 18612]], dtype=int64)

In [15]:
# Print the imbalanced classification report for the balanced random forest:
# per-class precision, recall, specificity, F1, geometric mean,
# index balanced accuracy, and support
print(classification_report_imbalanced(y_test, y_pred_rf))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.81      1.00      0.99      0.89      0.99      0.99       625
   low_risk       1.00      0.99      1.00      1.00      0.99      0.99     18759

avg / total       0.99      0.99      1.00      0.99      0.99      0.99     19384



In [16]:
# Importance score of every feature, taken from the fitted balanced random forest
importances = brf.feature_importances_

# Pair each score with its column name and rank the pairs from most to least
# important; X has 10 feature columns, so the slice below shows the full ranking
importances_sorted = sorted(zip(importances, X.columns), reverse=True)
importances_sorted[:10]

[(0.22759633831687995, 'borrower_income'),
 (0.1965574723230606, 'interest_rate'),
 (0.17417807060091195, 'debt_to_income'),
 (0.14944360428565667, 'total_debt'),
 (0.14229163194622926, 'loan_size'),
 (0.07859627283289125, 'num_of_accounts'),
 (0.02885770833789564, 'derogatory_marks'),
 (0.001219411955649641, 'homeowner_own'),
 (0.0008974701883394341, 'homeowner_mortgage'),
 (0.0003620192124855904, 'homeowner_rent')]

### Easy Ensemble Classifier

In [17]:
# Train the EasyEnsembleClassifier on the scaled training data
# (n_estimators=100; random_state=1 keeps the run repeatable)
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train_scaled, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [18]:
# Predict on the scaled test set, then calculate the balanced accuracy score
# of the easy ensemble against the true test labels
y_pred_eec = eec.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred_eec)

0.9944548430086891

In [19]:
# Display the confusion matrix for the easy ensemble predictions
# (rows are the actual classes, columns are the predicted classes)
confusion_matrix(y_test, y_pred_eec)

array([[  622,     3],
       [  118, 18641]], dtype=int64)

In [20]:
# Print the imbalanced classification report for the easy ensemble:
# per-class precision, recall, specificity, F1, geometric mean,
# index balanced accuracy, and support
print(classification_report_imbalanced(y_test, y_pred_eec))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.84      1.00      0.99      0.91      0.99      0.99       625
   low_risk       1.00      0.99      1.00      1.00      0.99      0.99     18759

avg / total       0.99      0.99      1.00      0.99      0.99      0.99     19384



* Which model had the best balanced accuracy score?

    The Easy Ensemble Classifier has a balanced accuracy score of 99.45%, while the Balanced Random Forest Classifier has 99.37%. Balanced accuracy is the average of the recall obtained on each class, so it is not dominated by the much larger low-risk class the way plain accuracy (the ratio of correctly predicted observations to the total number of observations) would be. Hence the Easy Ensemble Classifier is marginally better.
    

* Which model had the best recall score?

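    Both models have the same recall of nearly 100% for the high-risk class, which implies that almost all of the high-risk loans are correctly classified as high risk. Only 3 loans that are actually high risk are classified as low risk by each model. Recall for the low-risk class rounds to 0.99 for both models, although the Easy Ensemble Classifier flags fewer low-risk loans as high risk (118 versus 147).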


* Which model had the best geometric mean score?
    
    Both models perform equally in terms of geometric mean score: 0.99 for each class in both reports.


* What are the top three features?

    The top three features by importance in the Balanced Random Forest model are the borrower's income, interest rate, and debt-to-income ratio, which together contribute nearly 60% towards the decision on a loan's status. Additionally, total debt and loan size account for another 29%. The rest of the features are not as important when deciding whether a loan is low risk or high risk.