In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [4]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [5]:
# Load data
file_path = Path('Resources/LoanStats_2019Q1.csv')
df = pd.read_csv(file_path)

# Preview
df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,Mar-2019,low_risk,n,27.24,...,85.7,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N
1,25000.0,0.2,929.09,MORTGAGE,105000.0,Verified,Mar-2019,low_risk,n,20.23,...,91.2,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N
2,20000.0,0.2,529.88,MORTGAGE,56000.0,Verified,Mar-2019,low_risk,n,24.26,...,66.7,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N
3,10000.0,0.164,353.55,RENT,92000.0,Verified,Mar-2019,low_risk,n,31.44,...,100.0,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,Mar-2019,low_risk,n,18.76,...,100.0,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N


### Split Data

In [6]:
# Create features
X = df.drop(['loan_status'], axis=1)

# Create target
y = df['loan_status']

In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np

# Get data types of each column in X
column_types = X.dtypes

# Identify numerical features by selecting columns with numerical data types (e.g., int, float)
numerical_features_indices = np.where(column_types != object)[0]
cat_features_indices = np.where(column_types == object)[0]

# Define transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features_indices),  # Pass through numerical features
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features_indices)  # One-hot encode categorical features
    ],
    remainder='drop'
)

# Fit and transform data
X_encoded = preprocessor.fit_transform(X)

In [8]:
# Check balance of target values
y.value_counts()

loan_status
low_risk     68470
high_risk      347
Name: count, dtype: int64

In [9]:
# Split X and y into X_train, X_test, y_train, y_test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, random_state=1)

### Data Pre-Processing

In [10]:
# Create StandardScaler instance
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [11]:
# Fit Standard Scaler with training data
scaler.fit(X_train)

In [12]:
# Scale training and testing data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Ensemble Learners

### Balanced Random Forest Classifier

In [13]:
# Resample training data with BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

model = BalancedRandomForestClassifier(n_estimators = 500, random_state = 1)
model.fit(X_train, y_train)
Counter(y_train)

Counter({'low_risk': 51366, 'high_risk': 246})

In [14]:
# Display confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[   65,    36],
       [ 1662, 15442]])

In [15]:
# Calculate balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.773197051931573

In [16]:
# Print imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.04      0.64      0.90      0.07      0.76      0.57       101
   low_risk       1.00      0.90      0.64      0.95      0.76      0.60     17104

avg / total       0.99      0.90      0.65      0.94      0.76      0.60     17205



In [17]:
# List features sorted in descending order by feature importance
importances = model.feature_importances_
sorted(zip(importances, X.columns), reverse=True)

[(0.06661463495120314, 'initial_list_status'),
 (0.06318721529567178, 'out_prncp'),
 (0.06061788889673201, 'revol_bal'),
 (0.05826808075239343, 'total_acc'),
 (0.0517793455702452, 'total_rec_prncp'),
 (0.030814261825048914, 'int_rate'),
 (0.018223876144473468, 'debt_settlement_flag'),
 (0.01758748442537316, 'installment'),
 (0.01701350913498804, 'annual_inc'),
 (0.01643946938793743, 'acc_open_past_24mths'),
 (0.01638879681134762, 'delinq_amnt'),
 (0.01598335431546418, 'open_il_12m'),
 (0.01582245081932586, 'open_acc'),
 (0.015335352361938786, 'pub_rec'),
 (0.015039951662678231, 'num_tl_90g_dpd_24m'),
 (0.014672217013232325, 'max_bal_bc'),
 (0.014515194032758657, 'num_tl_30dpd'),
 (0.014340120491331286, 'delinq_2yrs'),
 (0.014264818693198206, 'home_ownership'),
 (0.013938521774605056, 'open_il_24m'),
 (0.013757719619549559, 'mths_since_rcnt_il'),
 (0.013740413820085784, 'total_rev_hi_lim'),
 (0.013613755428855435, 'num_tl_op_past_12m'),
 (0.01354320917024013, 'inq_last_12m'),
 (0.013299

### Easy Ensemble Classifier

In [19]:
# Resample training data with EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier

model = EasyEnsembleClassifier(n_estimators = 500, random_state = 1)
model.fit(X_train, y_train)
Counter(y_train)

Counter({'low_risk': 51366, 'high_risk': 246})

In [20]:
# Display confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[   83,    18],
       [ 1331, 15773]])

In [21]:
# Calculate balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.8719820619807537

In [22]:
# Print imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.06      0.82      0.92      0.11      0.87      0.75       101
   low_risk       1.00      0.92      0.82      0.96      0.87      0.77     17104

avg / total       0.99      0.92      0.82      0.95      0.87      0.77     17205



# Conclusion

1. Easy Ensemble has the best balanced accuracy score at 0.87

2. Easy Ensemble has the best recall score at 0.92

3. Easy Ensemble has the best geometric mean score at 0.87

4. The top three features are: Initial List Status, Outstanding Principal, and Revolving Balance.