In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#iimports needed for resampled modules
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced

In [2]:
df_lending=pd.read_csv(Path("./Resources/lending_data.csv"))


df_lending.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [3]:
#creating features and target dataframes from the base
df_features=df_lending.drop(['loan_status'],axis=1)

df_target=df_lending['loan_status']

display(df_target.value_counts())
display(df_target[:10])
display(df_features.head())

0    75036
1     2500
Name: loan_status, dtype: int64

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
Name: loan_status, dtype: int64

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


In [4]:
#training and testing data creation

# Use train_test_split to separate the data
training_features, testing_features, training_targets, testing_targets = train_test_split(df_features, df_target, random_state=1)

print("Training Target Summary")
display(training_targets.value_counts())
print("Training Target Summary")
display(testing_targets.value_counts())

Training Target Summary


0    56271
1     1881
Name: loan_status, dtype: int64

Training Target Summary


0    18765
1      619
Name: loan_status, dtype: int64

In [5]:
#Creating a Logistic Regression Model with the Original Data
orig_model=LogisticRegression(random_state=1)
orig_model=orig_model.fit(training_features,training_targets)

#Prediction of training & testing data
training_predictions_orig=orig_model.predict(training_features)
testing_predictions_orig=orig_model.predict(testing_features)

# Checking the accuracy score
print(f"Testing data accuracy score using original data: {balanced_accuracy_score(testing_predictions_orig, testing_targets):.4}\n")

# Generating confusion matrix (testing data)
testing_matrix_orig = confusion_matrix(testing_targets, testing_predictions_orig)
print(f'Confusion matrix & Classification Report for testing data: \n {testing_matrix_orig}')

testing_report_orig = classification_report_imbalanced(testing_targets, testing_predictions_orig)
print(testing_report_orig)

Testing data accuracy score using original data: 0.9218

Confusion matrix & Classification Report for testing data: 
 [[18663   102]
 [   56   563]]
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.99      0.91      1.00      0.95      0.91     18765
          1       0.85      0.91      0.99      0.88      0.95      0.90       619

avg / total       0.99      0.99      0.91      0.99      0.95      0.91     19384



## How well does the logistic regression model predict both the 0 (healthy loan) and 1 (high-risk loan) labels?

The accuracy score of the logistic regression is very good on the Testing data ~92%. Model performs extremely well on the healthy loans. But the performance in terms of the precision and recall for the high-risk loan is lower. A test precision score of 85% implies that of all the bad loans predicted by the model 85% were infact bad. The recall of 91% implies that model is able to capture 91% of the default loans accurately.  

In [6]:
#Logistic Regression Model with Resampled Training Data

random_oversampler = RandomOverSampler(random_state=1)
X_resampled, y_resampled = random_oversampler.fit_resample(training_features, training_targets)

y_resampled.value_counts()

0    56271
1    56271
Name: loan_status, dtype: int64

In [10]:
#Creating a Logistic Regression Model with the Original Data
resampled_model=LogisticRegression(random_state=1)
resampled_model=resampled_model.fit(X_resampled,y_resampled)

#Prediction of testing data
testing_predictions_resampled=resampled_model.predict(testing_features)

# Checking the accuracy score
print(f"Testing data accuracy score using resampled data: {balanced_accuracy_score(testing_targets,testing_predictions_resampled):.4}\n")

# Generating confusion matrix (testing data)
testing_matrix_resampled = confusion_matrix(testing_targets, testing_predictions_resampled)
print(f'Confusion matrix & Classification Report for testing data: \n {testing_matrix_resampled}')

testing_report_resampled = classification_report_imbalanced(testing_targets, testing_predictions_resampled)
print(testing_report_resampled)

Testing data accuracy score using resampled data: 0.9937

Confusion matrix & Classification Report for testing data: 
 [[18649   116]
 [    4   615]]
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.99      0.99      1.00      0.99      0.99     18765
          1       0.84      0.99      0.99      0.91      0.99      0.99       619

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



## How well does the logistic regression model predict both the 0 (healthy loan) and 1 (high-risk loan) labels?

The accuracy score of the logistic regression is better on resampled data ~99% (vs 92% in original data). New model continues to perform extremely well on the healthy loans. However the performance in terms of the precision and recall for the high-risk loan is different. In the resampled data the precision remain the same (~85%) but the recall shows an improvement from 91% originally to 99% after resampling. 