In [1]:
# Credit Risk Resampling Techniques
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
# Read the CSV into DataFrame
# Load the data
file_path = Path('lending_data.csv')
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700,7.672,own,52800,0.431818,5,1,22800,low_risk
1,8400,6.692,own,43600,0.311927,3,0,13600,low_risk
2,9000,6.963,rent,46100,0.349241,3,0,16100,low_risk
3,10700,7.664,own,52700,0.43074,5,1,22700,low_risk
4,10800,7.698,mortgage,53000,0.433962,5,1,23000,low_risk


In [4]:
# Split the Data into Training and Testing
# Create our features
X = df.copy()
X.drop("loan_status", axis=1, inplace=True)

# Create our target
y = df["loan_status"].values.reshape(-1, 1)

In [5]:
X.head()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700,7.672,own,52800,0.431818,5,1,22800
1,8400,6.692,own,43600,0.311927,3,0,13600
2,9000,6.963,rent,46100,0.349241,3,0,16100
3,10700,7.664,own,52700,0.43074,5,1,22700
4,10800,7.698,mortgage,53000,0.433962,5,1,23000


In [6]:
# Review index Dtype
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77536 entries, 0 to 77535
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   loan_size         77536 non-null  int64  
 1   interest_rate     77536 non-null  float64
 2   homeowner         77536 non-null  object 
 3   borrower_income   77536 non-null  int64  
 4   debt_to_income    77536 non-null  float64
 5   num_of_accounts   77536 non-null  int64  
 6   derogatory_marks  77536 non-null  int64  
 7   total_debt        77536 non-null  int64  
dtypes: float64(2), int64(5), object(1)
memory usage: 4.7+ MB


In [7]:
#Convert column into dummy variable
X = pd.get_dummies(X, columns=["homeowner"])
X.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,homeowner_mortgage,homeowner_own,homeowner_rent
0,10700,7.672,52800,0.431818,5,1,22800,0,1,0
1,8400,6.692,43600,0.311927,3,0,13600,0,1,0
2,9000,6.963,46100,0.349241,3,0,16100,0,0,1
3,10700,7.664,52700,0.43074,5,1,22700,0,1,0
4,10800,7.698,53000,0.433962,5,1,23000,1,0,0


In [8]:
# Check the balance of our target values
df["loan_status"].value_counts()

low_risk     75036
high_risk     2500
Name: loan_status, dtype: int64

In [9]:
# Create X_train, X_test, y_train, y_test
# Use the train_test_split function to create training and testing subsets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [10]:
X_train.describe()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,homeowner_mortgage,homeowner_own,homeowner_rent
count,58152.0,58152.0,58152.0,58152.0,58152.0,58152.0,58152.0,58152.0,58152.0,58152.0
mean,9805.544091,7.292321,49221.982047,0.377239,3.826506,0.391577,19221.982047,0.49773,0.398662,0.103608
std,2101.606869,0.893036,8405.039562,0.081689,1.91189,0.583111,8405.039562,0.499999,0.489627,0.304754
min,5000.0,5.259,30100.0,0.003322,0.0,0.0,100.0,0.0,0.0,0.0
25%,8700.0,6.82375,44800.0,0.330357,3.0,0.0,14800.0,0.0,0.0,0.0
50%,9500.0,7.172,48100.0,0.376299,4.0,0.0,18100.0,0.0,0.0,0.0
75%,10400.0,7.525,51400.0,0.416342,4.0,1.0,21400.0,1.0,1.0,0.0
max,23800.0,13.235,105200.0,0.714829,16.0,3.0,75200.0,1.0,1.0,1.0


In [11]:
#  Data Pre-Processing
# Create the StandardScaler instance
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [12]:
# Fit the Standard Scaler with the training data
# When fitting scaling functions, only train on the training dataset
X_scaler = scaler.fit(X_train)

In [13]:
# Scale the training and testing data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [14]:
# Simple Logistic Regression
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_train, y_train)

LogisticRegression(random_state=1)

In [15]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9481182566723452

In [16]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[  541,    59],
       [  102, 18682]])

In [17]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.84      0.90      0.99      0.87      0.95      0.89       600
   low_risk       1.00      0.99      0.90      1.00      0.95      0.91     18784

avg / total       0.99      0.99      0.90      0.99      0.95      0.90     19384



In [18]:
# Oversampling
# In this section, we will compare two oversampling algorithms to determine which algorithm results in the best performance. 
# We will oversample the data using the naive random oversampling algorithm and the SMOTE algorithm

# Naive Random Oversampling
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler

# View the count of target classes with Counter
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'low_risk': 56252, 'high_risk': 56252})

In [19]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [20]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.9481182566723452

In [21]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[  597,     3],
       [  111, 18673]])

In [38]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.84      1.00      0.99      0.91      1.00      0.99       600
   low_risk       1.00      0.99      1.00      1.00      1.00      0.99     18784

avg / total       0.99      0.99      1.00      0.99      1.00      0.99     19384



In [39]:
# SMOTE Oversampling
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE

# View the count of target classes with Counter
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy=1.0).fit_resample(
    X_train, y_train
)
from collections import Counter
Counter(y_resampled)

Counter({'low_risk': 56252, 'high_risk': 56252})

In [40]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [41]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9945453577512777

In [42]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[  597,     3],
       [  111, 18673]])

In [43]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.84      0.99      0.99      0.91      0.99      0.99       600
   low_risk       1.00      0.99      0.99      1.00      0.99      0.99     18784

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



In [44]:
# Undersampling

# In this section, we will test an undersampling algorithm to determine which algorithm results (cont'd)
# in the best performance compared to the oversampling algorithms above. We will undersample the data using (cont'd)
# the Cluster Centroids algorithm

# Resample the data using the ClusterCentroids resampler
from imblearn.under_sampling import ClusterCentroids

# View the count of target classes with Counter
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'high_risk': 1900, 'low_risk': 1900})

In [45]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [46]:
# Calculate the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.9945453577512777

In [47]:
# Display the confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[  581,    19],
       [  110, 18674]])

In [48]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.84      0.97      0.99      0.90      0.98      0.96       600
   low_risk       1.00      0.99      0.97      1.00      0.98      0.97     18784

avg / total       0.99      0.99      0.97      0.99      0.98      0.96     19384



In [49]:
# Combination (Over and Under) Sampling

# In this section, we will test a combination over- and under-sampling algorithm to determine if the algorithm (cont'd)
# results in the best performance compared to the other sampling algorithms above. We will resample the data using (cont'd)
# the SMOTEENN algorithm

# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN

# View the count of target classes with Counter
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

Counter({'high_risk': 74065, 'low_risk': 74591})

In [50]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [51]:
# Calculate the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.9812386428165816

In [52]:
# Display the confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[  598,     2],
       [  114, 18670]])

In [53]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.84      1.00      0.99      0.91      1.00      0.99       600
   low_risk       1.00      0.99      1.00      1.00      1.00      0.99     18784

avg / total       0.99      0.99      1.00      0.99      1.00      0.99     19384

