# Q3: How Robust Are Credit Risk Models Over Time?

This notebook compares Naive Bayes, Support Vector Machine, Decision Tree, K-Nearest Neighbours, and Neural Network classifiers on credit risk prediction when the training and test data come from different economic periods. The dataset is the Lending Club data from https://www.kaggle.com/datasets/wordsforthewise/lending-club/data, which covers loans issued between 2007 and 2018. This range is split in half so that 2007-2012 and 2013-2018 denote periods 1 and 2, respectively. All models are trained on period 1 and tested on period 2, and the results are then compared to assess the temporal stability of each model. Each classifier predicts whether a candidate has a low or high credit risk.  

## Imports

In [None]:
import numpy as np
import pandas as pd
import os
import sys
import json
import subprocess
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

# Put the parent of the working directory on the import path so the
# project-local packages imported below can be resolved.
PROJECT_ROOT = Path.cwd().resolve().parent
sys.path.append(f"{PROJECT_ROOT}")

# Suppress all warnings for the rest of the session.
warnings.filterwarnings(action="ignore")

from data_processings.datasets import LendingClubDataset
from data_processings.feature_engineering import process_q3_features
from models.neural import BinaryClassifier

## Pre-Processing of Data

Loading Dataset for Accepted Loans

In [None]:
# Value handed to the loader (presumably the number of rows to read -
# confirm against LendingClubDataset.load).
num_samples = 100_000

dataloader = LendingClubDataset()
accepted_df = dataloader.load(num_samples)

Feature Construction

In [None]:
# Retain relevant columns and build new features from existing features.
# The actual column selection and derivations are implemented in
# data_processings.feature_engineering.process_q3_features.
accepted_df = process_q3_features(accepted_df)
# Trailing expression: shows the resulting frame when run as a notebook cell.
accepted_df

In [None]:
# Distinct issue years present in the loaded sample
pd.unique(accepted_df["issue_year"])

Feature Type Conversion

In [None]:
# Trim leading/trailing whitespace in every string column so the label
# lookups below match exactly.
accepted_df = accepted_df.apply(lambda col: col.str.strip() if col.dtype == "object" else col)

# Two-valued categorical features become 0/1 flags.
# loan_status: Fully Paid -> 0 (Low Risk), Charged Off -> 1 (High Risk).
# NOTE(review): any label missing from these tables maps to NaN - confirm
# process_q3_features leaves only the labels listed here.
binary_encodings = {
    'loan_status': {'Fully Paid': 0, 'Charged Off': 1},
    'application_type': {'Individual': 0, 'Joint App': 1},
    'term': {'36 months': 0, '60 months': 1},
}
for column_name, encoding in binary_encodings.items():
    accepted_df[column_name] = accepted_df[column_name].map(encoding)

accepted_df

In [None]:
# Expand the multi-valued categorical features into 0/1 indicator columns;
# the first level of each feature is dropped and acts as the baseline.
categorical_features = ["purpose", "home_ownership", "emp_length", "verification_status"]
accepted_df = pd.get_dummies(
    data=accepted_df,
    columns=categorical_features,
    dtype=int,
    drop_first=True,
)
accepted_df

In [None]:
# Print every column next to its dtype to check the conversions above
for column_name, column_dtype in zip(accepted_df.columns, accepted_df.dtypes):
    print(f"{column_name}: {column_dtype}")

Train-Test Split Based on Economic Periods: Period 1 (Train) and Period 2 (Test)

where: Period 1 (2007-2012) and Period 2 (2013-2018)

In [None]:
# Temporal split: Period 1 (issue year <= 2012) is the training set and
# Period 2 (issue year >= 2013) is the test set.
year_indicator_col = 'issue_year'
target_col = 'loan_status'

# A single boundary keeps the two periods complementary. The earlier
# `> 2013` test filter silently discarded every loan issued in 2013.
last_train_year = 2012
train_df = accepted_df[accepted_df[year_indicator_col] <= last_train_year].copy()
test_df = accepted_df[accepted_df[year_indicator_col] > last_train_year].copy()

# NOTE(review): issue_year itself stays in the feature matrix, and its test
# values never occur in training - consider whether it should be a feature.
X_train = train_df.drop(columns=[target_col])
y_train = train_df[target_col]
X_test = test_df.drop(columns=[target_col])
y_test = test_df[target_col]

Scaling with MinMax

In [None]:
# Learn each feature's min/max from the training period only, then apply that
# same mapping to both periods so no test-period statistics leak into training.
MinMax_scaler = MinMaxScaler()
MinMax_scaler.fit(X_train)
X_train = MinMax_scaler.transform(X_train)
X_test = MinMax_scaler.transform(X_test)

## Model Training and Evaluation

In [None]:
# Display names for the two encoded loan_status classes
class_mapping = {0: 'Low Risk', 1: 'High Risk'}

# Folder that receives every JSON report and PNG figure written by this notebook
Q3_DATA_PATH = Path("..", "assets", "q3_data")

def dict_to_json(data: dict, folder_path: str, file_name: str):
    """Write ``data`` to ``<folder_path>/<file_name>.json`` with 4-space indentation.

    Args:
        data: JSON-serialisable dictionary to store.
        folder_path: Destination folder; created (with parents) when missing.
        file_name: Output file name without the ``.json`` extension.
    """
    destination = os.path.join(folder_path, f"{file_name}.json")
    os.makedirs(folder_path, exist_ok=True)

    with open(destination, "w") as handle:
        json.dump(data, handle, indent=4)

    print(f"JSON file is saved to {destination}!")

def conmat_to_png(conmat, class_mapping: dict, folder_path: str, file_name: str, model_name: str):
    """Draw a confusion matrix as an annotated heatmap and save it as a PNG.

    Args:
        conmat: Square confusion matrix, either raw counts or normalised rates.
        class_mapping: Maps class ids to display names; the values label both axes
            in insertion order.
        folder_path: Destination folder; created (with parents) when missing.
        file_name: Output file name without the ``.png`` extension.
        model_name: Name of the model, shown in the plot title.
    """
    file_name = f"{file_name}.png"
    file_path = os.path.join(folder_path, file_name)
    os.makedirs(folder_path, exist_ok=True)

    class_labels = list(class_mapping.values())

    # The 'd' format code only accepts integers; a normalised matrix holds
    # floats and would make the annotation step raise ValueError.
    holds_counts = np.issubdtype(np.asarray(conmat).dtype, np.integer)
    annotation_fmt = 'd' if holds_counts else '.2f'

    fig = plt.figure(figsize=(15, 10))
    axis = sns.heatmap(conmat, annot=True, fmt=annotation_fmt, cbar=True,
                       xticklabels=class_labels, yticklabels=class_labels)

    axis.set_ylabel("True Values")
    axis.set_xlabel("Predicted Values")

    # Include the model name so the saved figures can be told apart
    title = f"Normalized Confusion Matrix for {model_name}"
    axis.set_title(title)

    fig.savefig(file_path)

    print(f"PNG file is saved to {file_path}!")

**Model 1: Naive Bayes**

Training

In [None]:
# Gaussian Naive Bayes with default settings, fitted on the Period 1 data
GNB = GaussianNB()
GNB.fit(X=X_train, y=y_train)

Evaluation

In [None]:
# Score the fitted Naive Bayes model on the Period 2 data
GNB_preds = GNB.predict(X_test)
GNB_summary = classification_report(
    y_true=y_test,
    y_pred=GNB_preds,
    labels=list(class_mapping.keys()),
    target_names=list(class_mapping.values()),
    output_dict=True,
)

# Single quotes inside the f-string: reusing the outer double quotes is a
# SyntaxError on interpreters older than Python 3.12.
print(f"Naive Bayes Accuracy: {GNB_summary['accuracy']}")

GNB_file_name = "gnb_performance"
dict_to_json(GNB_summary, Q3_DATA_PATH, GNB_file_name)

# Pin the label order so rows/columns always line up with class_mapping,
# even if one class is absent from the test labels or predictions.
GNB_conmat = confusion_matrix(y_test, GNB_preds, labels=list(class_mapping.keys()), normalize='true')

GNB_conmat_name = "gnb_confusion_matrix"
GNB_model_name = "Naive Bayes"
conmat_to_png(GNB_conmat, class_mapping, Q3_DATA_PATH, GNB_conmat_name, GNB_model_name)

**Model 2: Support Vector Machine**

Training

In [None]:
# Grid of regularisation strengths tried for the linear SVM
C_vals = [0.01, 0.1, 1.0, 10.0, 100.0]
SVM_accuracies = []  # rounded accuracy per C value, in the same order as C_vals

best_SVM = None
best_C = None
best_accuracy = None  # rounded accuracy of the selected model (for reporting)
best_exact_accuracy = float("-inf")  # unrounded score used for the comparison

# NOTE(review): candidates are ranked by their accuracy on X_test, so the
# reported test accuracy is optimistically biased; a validation split carved
# out of Period 1 would avoid selecting on the test set.
for C_val in C_vals:
    SVM_candidate = SVC(kernel='linear', C=C_val, random_state=10)
    SVM_candidate.fit(X_train, y_train)

    SVM_candidate_preds = SVM_candidate.predict(X_test)

    exact_accuracy = accuracy_score(y_test, SVM_candidate_preds)
    SVM_candidate_acc = round(exact_accuracy, 2)
    SVM_accuracies.append(SVM_candidate_acc)

    # Compare unrounded scores so candidates that differ by less than 0.005
    # are still ranked correctly; ties keep the earliest (smallest) C.
    if exact_accuracy > best_exact_accuracy:
        best_SVM = SVM_candidate
        best_C = C_val
        best_accuracy = SVM_candidate_acc
        best_exact_accuracy = exact_accuracy

SVM = best_SVM

print(f"Best C Value: {best_C}")
print(f"Accuracy: {best_accuracy}")

In [None]:
# Persist the C sweep (each value with its accuracy) for later reporting
SVM_exp_file_name = "svm_exploration"
SVM_explore_dict = dict(C=C_vals, accuracies=SVM_accuracies)

dict_to_json(SVM_explore_dict, Q3_DATA_PATH, SVM_exp_file_name)

Evaluation

In [None]:
# Score the selected SVM on the Period 2 data
SVM_preds = SVM.predict(X_test)
SVM_summary = classification_report(
    y_true=y_test,
    y_pred=SVM_preds,
    labels=list(class_mapping.keys()),
    target_names=list(class_mapping.values()),
    output_dict=True,
)

# Report the accuracy of these predictions, like the other models do, rather
# than the shared `best_accuracy` variable (rounded, and overwritten by the
# K-Nearest Neighbours search when cells are re-run out of order).
print(f"Support Vector Machine Accuracy: {SVM_summary['accuracy']}")

SVM_file_name = "svm_performance"
dict_to_json(SVM_summary, Q3_DATA_PATH, SVM_file_name)

# Pin the label order so rows/columns always line up with class_mapping
SVM_conmat = confusion_matrix(y_test, SVM_preds, labels=list(class_mapping.keys()), normalize='true')

SVM_conmat_name = "svm_confusion_matrix"
SVM_model_name = "Support Vector Machine"
conmat_to_png(SVM_conmat, class_mapping, Q3_DATA_PATH, SVM_conmat_name, SVM_model_name)

**Model 3: Decision Tree**

Training

In [None]:
# Entropy-based decision tree with a fixed seed for reproducible splits
DTC = DecisionTreeClassifier(criterion="entropy", random_state=10)
DTC.fit(X=X_train, y=y_train)

Evaluation

In [None]:
# Score the fitted decision tree on the Period 2 data
DTC_preds = DTC.predict(X_test)
DTC_summary = classification_report(
    y_true=y_test,
    y_pred=DTC_preds,
    labels=list(class_mapping.keys()),
    target_names=list(class_mapping.values()),
    output_dict=True,
)

# Single quotes inside the f-string: reusing the outer double quotes is a
# SyntaxError on interpreters older than Python 3.12.
print(f"Decision Tree Accuracy: {DTC_summary['accuracy']}")

DTC_file_name = "dtc_performance"
dict_to_json(DTC_summary, Q3_DATA_PATH, DTC_file_name)

# Pin the label order so rows/columns always line up with class_mapping
DTC_conmat = confusion_matrix(y_test, DTC_preds, labels=list(class_mapping.keys()), normalize='true')

DTC_conmat_name = "dtc_confusion_matrix"
DTC_model_name = "Decision Tree"
conmat_to_png(DTC_conmat, class_mapping, Q3_DATA_PATH, DTC_conmat_name, DTC_model_name)

Plot and Save Tree

In [None]:
# X_train is a NumPy array after MinMax scaling and has no `.columns`, so take
# the feature names from the unscaled training frame; scaling keeps the same
# column order.
DTC_feature_names = list(train_df.drop(columns=[target_col]).columns)

fig = plt.figure(figsize=(30,12))
# Only the top levels are drawn (max_depth=3) to keep the figure legible
plot_tree(decision_tree=DTC, max_depth=3, fontsize=10, feature_names=DTC_feature_names)

DTC_tree_name = "dtc_tree.png"
DTC_tree_path = os.path.join(Q3_DATA_PATH, DTC_tree_name)
os.makedirs(Q3_DATA_PATH, exist_ok=True)

fig.savefig(DTC_tree_path)

**Model 4: K-Nearest Neighbours**

Training

In [None]:
# Neighbourhood sizes tried for K-Nearest Neighbours
K_vals = [1, 5, 10, 15, 20]
KNN_accuracies = []  # rounded accuracy per K value, in the same order as K_vals

best_KNN = None
best_K = None
best_accuracy = None  # rounded accuracy of the selected model (for reporting)
best_exact_accuracy = float("-inf")  # unrounded score used for the comparison

# NOTE(review): candidates are ranked by their accuracy on X_test, so the
# reported test accuracy is optimistically biased; a validation split carved
# out of Period 1 would avoid selecting on the test set.
for K_val in K_vals:
    KNN_candidate = KNeighborsClassifier(n_neighbors=K_val)

    KNN_candidate.fit(X_train, y_train)
    KNN_candidate_preds = KNN_candidate.predict(X_test)

    exact_accuracy = accuracy_score(y_test, KNN_candidate_preds)
    KNN_candidate_acc = round(exact_accuracy, 2)
    KNN_accuracies.append(KNN_candidate_acc)

    # Compare unrounded scores so candidates that differ by less than 0.005
    # are still ranked correctly; ties keep the earliest (smallest) K.
    if exact_accuracy > best_exact_accuracy:
        best_KNN = KNN_candidate
        best_K = K_val
        best_accuracy = KNN_candidate_acc
        best_exact_accuracy = exact_accuracy

KNN = best_KNN

print(f"Best K Value: {best_K}")
print(f"Accuracy: {best_accuracy}")

In [None]:
# Persist the K sweep (each value with its accuracy) for later reporting
KNN_explore_dict = {
    'K': K_vals,
    'accuracies': KNN_accuracies
}

KNN_exp_file_name = "knn_exploration"
# Write the KNN results; the SVM dictionary was previously saved here by mistake
dict_to_json(KNN_explore_dict, Q3_DATA_PATH, KNN_exp_file_name)

Evaluation

In [None]:
# Score the selected K-Nearest Neighbours model on the Period 2 data
KNN_preds = KNN.predict(X_test)
KNN_summary = classification_report(
    y_true=y_test,
    y_pred=KNN_preds,
    labels=list(class_mapping.keys()),
    target_names=list(class_mapping.values()),
    output_dict=True,
)

# Single quotes inside the f-string: reusing the outer double quotes is a
# SyntaxError on interpreters older than Python 3.12.
print(f"K-Nearest Neighbours Accuracy: {KNN_summary['accuracy']}")

KNN_file_name = "knn_performance"
dict_to_json(KNN_summary, Q3_DATA_PATH, KNN_file_name)

# Pin the label order so rows/columns always line up with class_mapping
KNN_conmat = confusion_matrix(y_test, KNN_preds, labels=list(class_mapping.keys()), normalize='true')

KNN_conmat_name = "knn_confusion_matrix"
KNN_model_name = "K Nearest Neighbours"
conmat_to_png(KNN_conmat, class_mapping, Q3_DATA_PATH, KNN_conmat_name, KNN_model_name)

**Model 5: Neural Net**

Training

In [None]:
# The classifier is constructed with the number of feature columns
input_size = X_train.shape[-1]

NN = BinaryClassifier(input_size)
NN.fit(X_train, y_train)

Evaluation

In [None]:
# Score the fitted neural network on the Period 2 data
NN_preds = NN.predict(X_test)
NN_summary = classification_report(
    y_true=y_test,
    y_pred=NN_preds,
    labels=list(class_mapping.keys()),
    target_names=list(class_mapping.values()),
    output_dict=True,
)

# Single quotes inside the f-string: reusing the outer double quotes is a
# SyntaxError on interpreters older than Python 3.12.
print(f"Neural Network Accuracy: {NN_summary['accuracy']}")

NN_file_name = "nn_performance"
dict_to_json(NN_summary, Q3_DATA_PATH, NN_file_name)

# Pin the label order so rows/columns always line up with class_mapping
NN_conmat = confusion_matrix(y_test, NN_preds, labels=list(class_mapping.keys()), normalize='true')

NN_conmat_name = "nn_confusion_matrix"
NN_model_name = "Neural Network"
conmat_to_png(NN_conmat, class_mapping, Q3_DATA_PATH, NN_conmat_name, NN_model_name)

# Project-specific export of the model description (see models.neural.BinaryClassifier)
NN_details_file_name = "nn_details"
NN.print_model_summary(Q3_DATA_PATH, NN_details_file_name)