In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
# Load data
file_path = Path('Resources/lending_data.csv')
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,own,52800,0.431818,5,1,22800,low_risk
1,8400.0,6.692,own,43600,0.311927,3,0,13600,low_risk
2,9000.0,6.963,rent,46100,0.349241,3,0,16100,low_risk
3,10700.0,7.664,own,52700,0.43074,5,1,22700,low_risk
4,10800.0,7.698,mortgage,53000,0.433962,5,1,23000,low_risk


In [4]:
# Create features
X = df.drop(['loan_status'], axis=1)

# Create target
y = df['loan_status']

In [5]:
# numerically encode 'homeowner' column
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder.fit(X['homeowner'])
list(encoder.classes_)
X["homeowner_e"] = encoder.transform(X['homeowner'])
X.head()
homeowner_num = {
    "mortgage": 1,
    "own": 2,
    "rent": 3}
X["homeowner_num"] = X["homeowner"].apply(lambda x: homeowner_num[x])
X.head()
X.drop(["homeowner", "homeowner_e"], axis=1, inplace=True)
X.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,homeowner_num
0,10700.0,7.672,52800,0.431818,5,1,22800,2
1,8400.0,6.692,43600,0.311927,3,0,13600,2
2,9000.0,6.963,46100,0.349241,3,0,16100,3
3,10700.0,7.664,52700,0.43074,5,1,22700,2
4,10800.0,7.698,53000,0.433962,5,1,23000,1


In [6]:
# Check balance of target values
y.value_counts()

loan_status
low_risk     75036
high_risk     2500
Name: count, dtype: int64

In [8]:
# Create X_train, X_test, y_train, y_test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

### Data Pre-Processing

In [9]:
# Create StandardScaler instance
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [10]:
# Fit Standard Scaler with training data
scaler.fit(X_train)

In [11]:
# Scale training and testing data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Simple Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_train, y_train)

In [13]:
# Calculate balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9520479254722232

In [14]:
# Display confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[  563,    56],
       [  102, 18663]])

In [15]:
# Print imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.85      0.91      0.99      0.88      0.95      0.90       619
   low_risk       1.00      0.99      0.91      1.00      0.95      0.91     18765

avg / total       0.99      0.99      0.91      0.99      0.95      0.91     19384



# Oversampling

### Naive Random Oversampling

In [16]:
# Resample training data with RandomOversampler
from imblearn.over_sampling import RandomOverSampler

# View count of target classes with Counter
ros = RandomOverSampler(random_state = 1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'low_risk': 56271, 'high_risk': 56271})

In [17]:
# Train Logistic Regression model using resampled data
model = LogisticRegression(solver = 'lbfgs', random_state = 1)
model.fit(X_resampled, y_resampled)

In [18]:
# Display confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[  615,     4],
       [  116, 18649]])

In [19]:
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.9936781215845847

In [20]:
# Print imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.84      0.99      0.99      0.91      0.99      0.99       619
   low_risk       1.00      0.99      0.99      1.00      0.99      0.99     18765

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



### SMOTE Oversampling

In [21]:
# Resample training data with SMOTE
from imblearn.over_sampling import SMOTE

# View count of target classes with Counter
X_resampled, y_resampled = SMOTE(random_state = 1, sampling_strategy = 1.0).fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'low_risk': 56271, 'high_risk': 56271})

In [22]:
# Train Logistic Regression model using resampled data
model = LogisticRegression(solver = 'lbfgs', random_state = 1)
model.fit(X_resampled, y_resampled)

In [23]:
# Calculate balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9936781215845847

In [24]:
# Display confusion matrix
confusion_matrix(y_test, y_pred)

array([[  615,     4],
       [  116, 18649]])

In [25]:
# Print imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.84      0.99      0.99      0.91      0.99      0.99       619
   low_risk       1.00      0.99      0.99      1.00      0.99      0.99     18765

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



# Undersampling

In [26]:
# Resample data using ClusterCentroids resampler
from imblearn.under_sampling import ClusterCentroids

# View count of target classes with Counter
cc = ClusterCentroids()
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)
Counter(y_train)

Counter({'low_risk': 56271, 'high_risk': 1881})

In [27]:
# Train Logistic Regression model using resampled data
model = LogisticRegression(solver = 'lbfgs', random_state = 1)
model.fit(X_resampled, y_resampled)

In [28]:
# Calculate balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9881304218875842

In [29]:
# Display confusion matrix
confusion_matrix(y_test, y_pred)

array([[  608,    11],
       [  112, 18653]])

In [30]:
# Print imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.84      0.98      0.99      0.91      0.99      0.98       619
   low_risk       1.00      0.99      0.98      1.00      0.99      0.98     18765

avg / total       0.99      0.99      0.98      0.99      0.99      0.98     19384



# Combination (Over and Under) Sampling

In [31]:
# Resample training data with SMOTEENN
from imblearn.combine import SMOTEENN

# View count of target classes with Counter
o_u = SMOTEENN(random_state = 1)
X_resampled, y_resampled = o_u.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'low_risk': 55948, 'high_risk': 55462})

In [32]:
# Train Logistic Regression model using resampled data
model = LogisticRegression(solver = 'lbfgs', random_state = 1)
model.fit(X_resampled, y_resampled)

In [33]:
# Calculate balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9934649587814939

In [34]:
# Display confusion matrix
confusion_matrix(y_test, y_pred)

array([[  615,     4],
       [  124, 18641]])

In [35]:
# Print imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.83      0.99      0.99      0.91      0.99      0.99       619
   low_risk       1.00      0.99      0.99      1.00      0.99      0.99     18765

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



# Conclusion

1. Random Oversampling has the best score of 0.99

2. Oversampling and Combination Sampling have the best recall score at 0.99

3. All models have an equal geometric mean score at 0.99
