# Credit Risk Classification


In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

import warnings
warnings.filterwarnings('ignore')

---

## Split the Data into Training and Testing Sets

In [2]:
# Load the lending data CSV from the Resources folder into a Pandas DataFrame
lending_df = pd.read_csv(filepath_or_buffer=Path("./Resources/lending_data.csv"))

# Preview the first rows of the DataFrame
lending_df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [3]:
# Split the DataFrame into the target labels (y) and the features (X)

# Labels: the loan_status column
y = lending_df["loan_status"]

# Features: every column except loan_status
X = lending_df.drop(columns=["loan_status"])

In [4]:
# Review the y variable Series (the first rows of the loan_status labels)
y.head()

0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

In [5]:
# Review the X variable DataFrame (the first rows of the feature columns)
X.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


In [6]:
# Check the balance of target values by counting the rows in each loan_status class
y.value_counts()

0    75036
1     2500
Name: loan_status, dtype: int64

In [7]:
# Import the train_test_split function
from sklearn.model_selection import train_test_split

# Partition the features and labels into training and testing sets,
# passing random_state=1 to train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [23]:
# Check the class balance of the training labels produced by the split
y_train.value_counts()

0    56271
1     1881
Name: loan_status, dtype: int64

---

## Create a Logistic Regression Model with the Original Data

In [8]:
# Import the LogisticRegression class from scikit-learn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model with random_state=1
lr_model = LogisticRegression(random_state=1)

# Fit the model on the original training data; as the cell's last
# expression, the fitted estimator is also displayed
lr_model.fit(X=X_train, y=y_train)

LogisticRegression(random_state=1)

In [9]:
# Predict loan_status labels for the testing features
y_pred = lr_model.predict(X=X_test)

In [10]:
# Report the balanced accuracy of the predictions against the true test labels
print(balanced_accuracy_score(y_true=y_test, y_pred=y_pred))

0.9520479254722232


In [11]:
# Build the confusion matrix of true versus predicted test labels
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[18663,   102],
       [   56,   563]])

In [12]:
# Show the imbalanced-learn classification report for the model's test predictions
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.99      0.91      1.00      0.95      0.91     18765
          1       0.85      0.91      0.99      0.88      0.95      0.90       619

avg / total       0.99      0.99      0.91      0.99      0.95      0.91     19384



**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** The model trained on the original, imbalanced dataset predicts `0` (healthy loan) labels very well (precision 1.00, recall 0.99), but it is noticeably weaker at predicting `1` (high-risk loan) labels (precision 0.85, recall 0.91) because the training data is dominated by 0s.
However, the model must also perform well at predicting `1` (high-risk loan) labels, because any lender wants to identify high-risk loans in order to avoid them.
* Refer to analysis report.

---

## Predict a Logistic Regression Model with Resampled Training Data

In [22]:
# Import the RandomOverSampler class from imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler with a random_state parameter of 1
ros_model = RandomOverSampler(random_state=1)

# Resample the original training data with the random oversampler
X_oversampled, y_oversampled = ros_model.fit_resample(X=X_train, y=y_train)

In [21]:
# Count the distinct values of the resampled labels returned by the
# RandomOverSampler (y_oversampled) to check the class balance after resampling
y_oversampled.value_counts()

0    56271
1    56271
Name: loan_status, dtype: int64

In [24]:
# Create a second Logistic Regression model with a random_state parameter of 1
lr_model_ros = LogisticRegression(random_state=1)

# Train it on the resampled (oversampled) training data
lr_model_ros.fit(X=X_oversampled, y=y_oversampled)

# Predict loan_status labels for the testing features
y_pred_ros = lr_model_ros.predict(X=X_test)

In [26]:
# Report the balanced accuracy of the resampled model's test predictions
print(balanced_accuracy_score(y_true=y_test, y_pred=y_pred_ros))

0.9936781215845847


In [27]:
# Build the confusion matrix of true versus predicted test labels for the resampled model
confusion_matrix(y_true=y_test, y_pred=y_pred_ros)

array([[18649,   116],
       [    4,   615]])

In [31]:
# Show the imbalanced-learn classification report for the resampled model's test predictions
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred_ros))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.99      0.99      1.00      0.99      0.99     18765
          1       0.84      0.99      0.99      0.91      0.99      0.99       619

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

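**Answer:** When fitted with oversampled data, the logistic regression model still predicts `0` (healthy loan) labels very well (precision 1.00, recall 0.99) and is markedly better at catching `1` (high-risk loan) labels: recall for class `1` rises from 0.91 to 0.99 (only 4 of the 619 high-risk loans in the test set are missed), while precision dips slightly from 0.85 to 0.84. Because a lender must focus on avoiding high-risk loans, detecting 1s matters more than detecting 0s, and the model fitted with oversampled data does this better.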
* Refer to analysis report.