# Credit Risk Resampling Techniques

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE 
from sklearn.datasets import make_classification
from imblearn.under_sampling import ClusterCentroids 
from imblearn.combine import SMOTEENN 



# Read the CSV and Perform Basic Data Cleaning

In [3]:
columns = [
    "loan_amnt", "int_rate", "installment", "home_ownership",
    "annual_inc", "verification_status", "issue_d", "loan_status",
    "pymnt_plan", "dti", "delinq_2yrs", "inq_last_6mths",
    "open_acc", "pub_rec", "revol_bal", "total_acc",
    "initial_list_status", "out_prncp", "out_prncp_inv", "total_pymnt",
    "total_pymnt_inv", "total_rec_prncp", "total_rec_int", "total_rec_late_fee",
    "recoveries", "collection_recovery_fee", "last_pymnt_amnt", "next_pymnt_d",
    "collections_12_mths_ex_med", "policy_code", "application_type", "acc_now_delinq",
    "tot_coll_amt", "tot_cur_bal", "open_acc_6m", "open_act_il",
    "open_il_12m", "open_il_24m", "mths_since_rcnt_il", "total_bal_il",
    "il_util", "open_rv_12m", "open_rv_24m", "max_bal_bc",
    "all_util", "total_rev_hi_lim", "inq_fi", "total_cu_tl",
    "inq_last_12m", "acc_open_past_24mths", "avg_cur_bal", "bc_open_to_buy",
    "bc_util", "chargeoff_within_12_mths", "delinq_amnt", "mo_sin_old_il_acct",
    "mo_sin_old_rev_tl_op", "mo_sin_rcnt_rev_tl_op", "mo_sin_rcnt_tl", "mort_acc",
    "mths_since_recent_bc", "mths_since_recent_inq", "num_accts_ever_120_pd", "num_actv_bc_tl",
    "num_actv_rev_tl", "num_bc_sats", "num_bc_tl", "num_il_tl",
    "num_op_rev_tl", "num_rev_accts", "num_rev_tl_bal_gt_0",
    "num_sats", "num_tl_120dpd_2m", "num_tl_30dpd", "num_tl_90g_dpd_24m",
    "num_tl_op_past_12m", "pct_tl_nvr_dlq", "percent_bc_gt_75", "pub_rec_bankruptcies",
    "tax_liens", "tot_hi_cred_lim", "total_bal_ex_mort", "total_bc_limit",
    "total_il_high_credit_limit", "hardship_flag", "debt_settlement_flag"
]

target = ["loan_status"]

In [4]:
# Load the data
file_path = Path('LoanStats_2019Q1.csv')
df = pd.read_csv(file_path, skiprows=1)[:-2]
df = df.loc[:, columns].copy()

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

# Remove the `Issued` loan status
issued_mask = df['loan_status'] != 'Issued'
df = df.loc[issued_mask]

# convert interest rate to numerical
df['int_rate'] = df['int_rate'].str.replace('%', '')
df['int_rate'] = df['int_rate'].astype('float') / 100


# Convert the target column values to low_risk and high_risk based on their values
x = {'Current': 'low_risk'}   
df = df.replace(x)

x = dict.fromkeys(['Late (31-120 days)', 'Late (16-30 days)', 'Default', 'In Grace Period'], 'high_risk')    
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

df

pd.set_option('display.max_rows', 20)

df.dtypes

loan_amnt                     float64
int_rate                      float64
installment                   float64
home_ownership                 object
annual_inc                    float64
                               ...   
total_bal_ex_mort             float64
total_bc_limit                float64
total_il_high_credit_limit    float64
hardship_flag                  object
debt_settlement_flag           object
Length: 86, dtype: object

# Split the Data into Training and Testing

In [5]:
df_wo_status = df.drop('loan_status', axis=1)
df_wo_status

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,pymnt_plan,dti,delinq_2yrs,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,Mar-2019,n,27.24,0.0,...,85.7,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N
1,25000.0,0.2000,929.09,MORTGAGE,105000.0,Verified,Mar-2019,n,20.23,0.0,...,91.2,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N
2,20000.0,0.2000,529.88,MORTGAGE,56000.0,Verified,Mar-2019,n,24.26,0.0,...,66.7,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N
3,10000.0,0.1640,353.55,RENT,92000.0,Verified,Mar-2019,n,31.44,0.0,...,100.0,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,Mar-2019,n,18.76,0.0,...,100.0,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68812,10000.0,0.1502,346.76,RENT,26000.0,Source Verified,Jan-2019,n,9.60,0.0,...,80.0,0.0,0.0,0.0,20625.0,6798.0,11300.0,5425.0,N,N
68813,12000.0,0.2727,368.37,RENT,63000.0,Not Verified,Jan-2019,n,29.07,0.0,...,96.2,0.0,0.0,0.0,87939.0,60350.0,13500.0,62939.0,N,N
68814,5000.0,0.1992,185.62,MORTGAGE,52000.0,Source Verified,Jan-2019,n,14.86,0.0,...,100.0,0.0,1.0,0.0,30592.0,18611.0,3600.0,18492.0,N,N
68815,40000.0,0.0646,1225.24,MORTGAGE,520000.0,Verified,Jan-2019,n,9.96,0.0,...,98.2,12.5,0.0,0.0,1033574.0,95958.0,100800.0,78634.0,N,N


In [6]:
loan_status_df = df['loan_status']
loan_status_df

0        low_risk
1        low_risk
2        low_risk
3        low_risk
4        low_risk
           ...   
68812    low_risk
68813    low_risk
68814    low_risk
68815    low_risk
68816    low_risk
Name: loan_status, Length: 68817, dtype: object

In [7]:
# Create our features
X = pd.get_dummies(df_wo_status)

pd.set_option('display.max_rows', 20)

X.dtypes

# Create our target
y = pd.get_dummies(loan_status_df)
y.drop('high_risk', axis=1, inplace=True)

In [8]:
X.describe()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,issue_d_Mar-2019,pymnt_plan_n,initial_list_status_f,initial_list_status_w,next_pymnt_d_Apr-2019,next_pymnt_d_May-2019,application_type_Individual,application_type_Joint App,hardship_flag_N,debt_settlement_flag_N
count,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,...,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0
mean,16677.594562,0.127718,480.652863,88213.71,21.778153,0.217766,0.497697,12.58734,0.12603,17604.142828,...,0.177238,1.0,0.123879,0.876121,0.383161,0.616839,0.86034,0.13966,1.0,1.0
std,10277.34859,0.04813,288.062432,115580.0,20.199244,0.718367,0.758122,6.022869,0.336797,21835.8804,...,0.381873,0.0,0.329446,0.329446,0.486161,0.486161,0.346637,0.346637,0.0,0.0
min,1000.0,0.06,30.89,40.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
25%,9000.0,0.0881,265.73,50000.0,13.89,0.0,0.0,8.0,0.0,6293.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0
50%,15000.0,0.118,404.56,73000.0,19.76,0.0,0.0,11.0,0.0,12068.0,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0
75%,24000.0,0.1557,648.1,104000.0,26.66,0.0,1.0,16.0,0.0,21735.0,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
max,40000.0,0.3084,1676.23,8797500.0,999.0,18.0,5.0,72.0,4.0,587191.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Check the balance of our target values
y['low_risk'].value_counts()

1    68470
0      347
Name: low_risk, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [11]:
X_train_array = X_train.to_numpy()
X_train_array
y_train_array = y_train.to_numpy()
X_test_array = X_test.to_numpy()
y_test_array = y_test.to_numpy()

# Oversampling

In this section, you will compare two oversampling algorithms to determine which algorithm results in the best performance. You will oversample the data using the naive random oversampling algorithm and the SMOTE algorithm. For each algorithm, be sure to complete the folliowing steps:

1. View the count of the target classes using `Counter` from the collections library. 
3. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

### Naive Random Oversampling

In [12]:
# Resample the training data with the RandomOversampler
ros = RandomOverSampler(sampling_strategy=1, random_state=1)
X_train
#y_train
X_resampled, y_resampled = ros.fit_resample(X_train_array, y_train_array)


In [13]:
unique, counts = np.unique(y_resampled, return_counts=True)
counts

array([51366, 51366])

In [14]:
# Train the Logistic Regression model using the resampled data
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier
classifier.fit(X_resampled, y_resampled)
predictions = classifier.predict(X_test)
predictions


array([0, 0, 1, ..., 0, 1, 1], dtype=uint8)

In [15]:
X_test_array

array([[1.00000e+04, 1.30800e-01, 3.37330e+02, ..., 0.00000e+00,
        1.00000e+00, 1.00000e+00],
       [1.02750e+04, 1.61400e-01, 2.50640e+02, ..., 0.00000e+00,
        1.00000e+00, 1.00000e+00],
       [4.00000e+04, 8.19000e-02, 1.25697e+03, ..., 0.00000e+00,
        1.00000e+00, 1.00000e+00],
       ...,
       [1.50000e+04, 1.55700e-01, 3.61360e+02, ..., 0.00000e+00,
        1.00000e+00, 1.00000e+00],
       [5.60000e+03, 1.13100e-01, 1.84170e+02, ..., 0.00000e+00,
        1.00000e+00, 1.00000e+00],
       [4.00000e+04, 1.44700e-01, 1.37626e+03, ..., 1.00000e+00,
        1.00000e+00, 1.00000e+00]])

In [16]:
X_resampled

array([[1.6000e+03, 8.8100e-02, 5.0740e+01, ..., 0.0000e+00, 1.0000e+00,
        1.0000e+00],
       [9.0000e+03, 7.5600e-02, 2.8021e+02, ..., 0.0000e+00, 1.0000e+00,
        1.0000e+00],
       [1.0000e+04, 1.0330e-01, 2.1410e+02, ..., 0.0000e+00, 1.0000e+00,
        1.0000e+00],
       ...,
       [2.1000e+04, 7.0200e-02, 6.4862e+02, ..., 0.0000e+00, 1.0000e+00,
        1.0000e+00],
       [2.0000e+04, 1.5020e-01, 6.9351e+02, ..., 1.0000e+00, 1.0000e+00,
        1.0000e+00],
       [6.5000e+03, 1.2980e-01, 2.1895e+02, ..., 0.0000e+00, 1.0000e+00,
        1.0000e+00]])

In [17]:
y_test
#y_resampled

Unnamed: 0,low_risk
7034,1
38804,1
65533,1
37736,1
6390,1
...,...
31854,1
27835,1
3089,1
43343,1


In [18]:
# Calculated the balanced accuracy score
print(f"Training Data Score: {classifier.score(X_resampled, y_resampled)}")
print(f"Testing Data Score: {classifier.score(X_test_array, y_test_array)}")

Training Data Score: 0.6601448428921854
Testing Data Score: 0.587503632664923


In [19]:
# Display the confusion matrix
y_pred = classifier.predict(X_test)

cm = confusion_matrix(y_test_array, y_pred)
cm

array([[   73,    28],
       [ 7069, 10035]])

In [20]:
# Print the imbalanced classification report
report = classification_report(y_test_array, y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.01      0.72      0.02       101
           1       1.00      0.59      0.74     17104

    accuracy                           0.59     17205
   macro avg       0.50      0.65      0.38     17205
weighted avg       0.99      0.59      0.73     17205



### SMOTE Oversampling

In [21]:
# Resample the training data with SMOTE
sm = SMOTE(random_state=1)
X_resampled_smote, y_resampled_smote = sm.fit_resample(X_train_array, y_train_array)


In [22]:
# Train the Logistic Regression model using the resampled data
classifier_smote = LogisticRegression(solver='lbfgs', random_state=1)
classifier_smote
classifier_smote.fit(X_resampled_smote, y_resampled_smote)
predictions_smote = classifier_smote.predict(X_test)
predictions_smote


array([1, 0, 1, ..., 0, 1, 1], dtype=uint8)

In [23]:
# Calculated the balanced accuracy score
print(f"Training Data Score: {classifier_smote.score(X_resampled_smote, y_resampled_smote)}")
print(f"Testing Data Score: {classifier_smote.score(X_test_array, y_test_array)}")

Training Data Score: 0.6365786707160378
Testing Data Score: 0.6644580063934903


In [24]:
# Display the confusion matrix
y_pred_smote = classifier_smote.predict(X_test)

cm_smote = confusion_matrix(y_test_array, y_pred_smote)
cm_smote

array([[   62,    39],
       [ 5734, 11370]])

In [25]:
# Print the imbalanced classification report
report_smote = classification_report(y_test_array, y_pred_smote)
print(report_smote)


              precision    recall  f1-score   support

           0       0.01      0.61      0.02       101
           1       1.00      0.66      0.80     17104

    accuracy                           0.66     17205
   macro avg       0.50      0.64      0.41     17205
weighted avg       0.99      0.66      0.79     17205



# Undersampling

In this section, you will test an undersampling algorithms to determine which algorithm results in the best performance compared to the oversampling algorithms above. You will undersample the data using the Cluster Centroids algorithm and complete the folliowing steps:

1. View the count of the target classes using `Counter` from the collections library. 
3. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

In [26]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
cc = ClusterCentroids(random_state=1)
X_resampled_cc, y_resampled_cc = cc.fit_resample(X_train_array, y_train_array)


In [27]:
# Train the Logistic Regression model using the resampled data
classifier_cc = LogisticRegression(solver='lbfgs', random_state=1)
classifier_cc
classifier_cc.fit(X_resampled_cc, y_resampled_cc)
predictions_cc = classifier_cc.predict(X_test)
predictions_cc


array([0, 1, 0, ..., 0, 0, 0], dtype=uint8)

In [28]:
# Calculated the balanced accuracy score
print(f"Training Data Score: {classifier_cc.score(X_resampled_cc, y_resampled_cc)}")
print(f"Testing Data Score: {classifier_cc.score(X_test_array, y_test_array)}")

Training Data Score: 0.6686991869918699
Testing Data Score: 0.3919790758500436


In [29]:
# Display the confusion matrix
y_pred_cc = classifier_cc.predict(X_test)

cm_cc = confusion_matrix(y_test_array, y_pred_cc)
cm_cc

array([[   68,    33],
       [10428,  6676]])

In [30]:
# Print the imbalanced classification report
report_cc = classification_report(y_test_array, y_pred_cc)
print(report_cc)


              precision    recall  f1-score   support

           0       0.01      0.67      0.01       101
           1       1.00      0.39      0.56     17104

    accuracy                           0.39     17205
   macro avg       0.50      0.53      0.29     17205
weighted avg       0.99      0.39      0.56     17205



# Combination (Over and Under) Sampling

In this section, you will test a combination over- and under-sampling algorithm to determine if the algorithm results in the best performance compared to the other sampling algorithms above. You will resample the data using the SMOTEENN algorithm and complete the folliowing steps:

1. View the count of the target classes using `Counter` from the collections library. 
3. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

In [31]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
sme = SMOTEENN(random_state=1)
X_resampled_sme, y_resampled_sme = sme.fit_resample(X_train_array, y_train_array)


In [32]:
# Train the Logistic Regression model using the resampled data
classifier_sme = LogisticRegression(solver='lbfgs', random_state=1)
classifier_sme.fit(X_resampled_sme, y_resampled_sme)
predictions_sme = classifier_sme.predict(X_test)
predictions_sme


array([0, 0, 1, ..., 0, 1, 1], dtype=uint8)

In [33]:
# Calculated the balanced accuracy score
print(f"Training Data Score: {classifier_sme.score(X_resampled_sme, y_resampled_sme)}")
print(f"Testing Data Score: {classifier_sme.score(X_test_array, y_test_array)}")

Training Data Score: 0.6628950966188504
Testing Data Score: 0.5822144725370532


In [34]:
# Display the confusion matrix
y_pred_sme = classifier_sme.predict(X_test)

cm_sme = confusion_matrix(y_test_array, y_pred_sme)
cm_sme

array([[  71,   30],
       [7158, 9946]])

In [35]:
# Print the imbalanced classification report
report_sme = classification_report(y_test_array, y_pred_sme)
print(report_sme)


              precision    recall  f1-score   support

           0       0.01      0.70      0.02       101
           1       1.00      0.58      0.73     17104

    accuracy                           0.58     17205
   macro avg       0.50      0.64      0.38     17205
weighted avg       0.99      0.58      0.73     17205

