In [1]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix

from sklearn.linear_model import RidgeClassifierCV
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import imblearn
from imblearn.over_sampling import SMOTE

from collections import Counter
from typing import Union, List
import joblib

import warnings
warnings.filterwarnings(action='ignore')

## load data

In [2]:
df = pd.read_csv('scaled_train.csv')

In [3]:
df.columns

Index(['cat_OTHERS', 'cat_OWN', 'cat_RENT', 'cat_credit_card',
       'cat_debt_consolidation', 'cat_educational', 'cat_home_improvement',
       'cat_house', 'cat_major_purchase', 'cat_medical', 'cat_moving',
       'cat_other', 'cat_renewable_energy', 'cat_small_business',
       'cat_vacation', 'cat_wedding', 'loan_amnt', 'term', 'int_rate',
       'installment', 'sub_grade', 'emp_length', 'verification_status',
       'delinq_2yrs', 'inq_last_6mths', 'pub_rec', 'revol_util',
       'collections_12_mths_ex_med', 'application_type', 'dti_joint',
       'acc_now_delinq', 'chargeoff_within_12_mths', 'mths_since_recent_inq',
       'num_tl_120dpd_2m', 'num_tl_30dpd', 'num_tl_90g_dpd_24m',
       'num_tl_op_past_12m', 'pct_tl_nvr_dlq', 'is_after_2015',
       'is_after_2012', 'fico_avg', 'all_util_log', 'annual_inc_log',
       'annual_inc_joint_log', 'bc_open_to_buy_log', 'delinq_amnt_log',
       'dti_log', 'max_bal_bc_log', 'mo_sin_old_il_acct_log',
       'mo_sin_old_rev_tl_op_log', 

In [4]:
print(df.shape)
df_y = df['loan_status']
df_X = df.drop(columns=['loan_status', 'raw_total_pymnt', 'raw_loan_amnt'])
origin_train_df = df.copy()

(1131682, 88)


In [8]:
def my_custom_roe(df):
    equity, returns = 0, 0
    for i in range(len(df)):
        if df.iloc[i]['y_pred'] == 0:   
            equity += df.iloc[i]['raw_loan_amnt']
            returns += df.iloc[i]['raw_total_pymnt']
        else:
            equity += df.iloc[i]['raw_loan_amnt']
            returns += df.iloc[i]['raw_loan_amnt'] * 1.03
    return returns / equity

## Model Fitting

### Logistic Regression - Lasso penalty

In [11]:
skf = StratifiedKFold(shuffle=True, random_state=30)
roelst_Lasso = []

for fold, (train_index, test_index) in enumerate(skf.split(df_X, df_y)):
    # 인덱스를 사용하여 학습 및 테스트 데이터를 추출
    X_train_fold = df_X.iloc[train_index]
    y_train_fold = df_y.iloc[train_index]
    X_test_fold = df_X.iloc[test_index]
    y_test_fold = df_y.iloc[test_index]

    # SMOTE 오버샘플링
    smote = SMOTE(random_state=30)
    X_train_fold, y_train_fold = smote.fit_resample(X_train_fold, y_train_fold)

    # RidgeClassifierCV 학습
    clfLasso_fold = LogisticRegressionCV(random_state = 30, solver = 'saga').fit(X_train_fold, y_train_fold)

    print('Test Error:')
    y_pred = clfLasso_fold.predict(X_test_fold)
    print(classification_report(y_test_fold, y_pred))

    origin_test_df = df.iloc[test_index]

    # ROE 계산 및 추가
    origin_test_df['y_pred'] = y_pred
    roelst_Lasso.append(my_custom_roe(origin_test_df))

Test Error:
              precision    recall  f1-score   support

           0       0.85      0.79      0.82    180600
           1       0.35      0.46      0.40     45737

    accuracy                           0.72    226337
   macro avg       0.60      0.62      0.61    226337
weighted avg       0.75      0.72      0.73    226337

Test Error:
              precision    recall  f1-score   support

           0       0.85      0.79      0.82    180600
           1       0.35      0.46      0.40     45737

    accuracy                           0.72    226337
   macro avg       0.60      0.62      0.61    226337
weighted avg       0.75      0.72      0.73    226337

Test Error:
              precision    recall  f1-score   support

           0       0.85      0.79      0.82    180599
           1       0.35      0.46      0.40     45737

    accuracy                           0.72    226336
   macro avg       0.60      0.62      0.61    226336
weighted avg       0.75      0.72     

In [12]:
print(roelst_Lasso)
ROE_Lasso = np.mean(roelst_Lasso)
print(ROE_Lasso)

[1.0447910683448645, 1.0442785382641457, 1.0450904476982943, 1.0451302780769849, 1.0452042055905486]
1.0448989075949675


### Logistic Regression - Ridge penalty

In [9]:
skf = StratifiedKFold(shuffle=True, random_state=30)
roelst_Ridge = []

for fold, (train_index, test_index) in enumerate(skf.split(df_X, df_y)):
    # 인덱스를 사용하여 학습 및 테스트 데이터를 추출
    X_train_fold = df_X.iloc[train_index]
    y_train_fold = df_y.iloc[train_index]
    X_test_fold = df_X.iloc[test_index]
    y_test_fold = df_y.iloc[test_index]

    # SMOTE 오버샘플링
    smote = SMOTE(random_state=30)
    X_train_fold, y_train_fold = smote.fit_resample(X_train_fold, y_train_fold)

    # RidgeClassifierCV 학습
    clfRidge_fold = RidgeClassifierCV().fit(X_train_fold, y_train_fold)

    print('Test Error:')
    y_pred = clfRidge_fold.predict(X_test_fold)
    print(classification_report(y_test_fold, y_pred))

    origin_test_df = df.iloc[test_index]

    # ROE 계산 및 추가
    origin_test_df['y_pred'] = y_pred
    roelst_Ridge.append(my_custom_roe(origin_test_df))

Test Error:
              precision    recall  f1-score   support

           0       0.85      0.80      0.82    180600
           1       0.36      0.45      0.40     45737

    accuracy                           0.73    226337
   macro avg       0.60      0.62      0.61    226337
weighted avg       0.75      0.73      0.74    226337

Test Error:
              precision    recall  f1-score   support

           0       0.85      0.80      0.82    180600
           1       0.36      0.44      0.39     45737

    accuracy                           0.73    226337
   macro avg       0.60      0.62      0.61    226337
weighted avg       0.75      0.73      0.74    226337

Test Error:
              precision    recall  f1-score   support

           0       0.85      0.80      0.82    180599
           1       0.36      0.45      0.40     45737

    accuracy                           0.73    226336
   macro avg       0.60      0.62      0.61    226336
weighted avg       0.75      0.73     

In [10]:
print(roelst_Ridge)
ROE_Ridge = np.mean(roelst_Ridge)
print(ROE_Ridge)

[1.044635060039317, 1.0440141923165451, 1.0451849616801596, 1.0450393229329094, 1.0450124921987727]
1.0447772058335407


### RandomForest

In [14]:
skf = StratifiedKFold(shuffle=True, random_state=30)
roelst_RF = []
RF_para = {'max_depth':[5, 10, None], 
           'n_estimators':[15, 30]}

for fold, (train_index, test_index) in enumerate(skf.split(df_X, df_y)):
    # 인덱스를 사용하여 학습 및 테스트 데이터를 추출
    X_train_fold = df_X.iloc[train_index]
    y_train_fold = df_y.iloc[train_index]
    X_test_fold = df_X.iloc[test_index]
    y_test_fold = df_y.iloc[test_index]

    # SMOTE 오버샘플링
    smote = SMOTE(random_state=30)
    X_train_fold, y_train_fold = smote.fit_resample(X_train_fold, y_train_fold)

    # Random Forest 학습 
    clfRF_fold = RandomForestClassifier(random_state = 30)
    clfRF_grid = GridSearchCV(clfRF_fold, RF_para, cv = 5, refit = True)
    clfRF_grid.fit(X_train_fold, y_train_fold)
    clfRF_fold = clfRF_grid.best_estimator_

    print('Test Error:')
    y_pred = clfRF_fold.predict(X_test_fold)
    print(classification_report(y_test_fold, y_pred))

    origin_test_df = df.iloc[test_index]

    # ROE 계산 및 추가
    origin_test_df['y_pred'] = y_pred
    roelst_RF.append(my_custom_roe(origin_test_df))

Test Error:
              precision    recall  f1-score   support

           0       0.82      0.95      0.88    180600
           1       0.43      0.16      0.23     45737

    accuracy                           0.79    226337
   macro avg       0.62      0.55      0.56    226337
weighted avg       0.74      0.79      0.75    226337

Test Error:
              precision    recall  f1-score   support

           0       0.82      0.94      0.88    180600
           1       0.43      0.16      0.24     45737

    accuracy                           0.79    226337
   macro avg       0.62      0.55      0.56    226337
weighted avg       0.74      0.79      0.75    226337

Test Error:
              precision    recall  f1-score   support

           0       0.82      0.95      0.88    180599
           1       0.43      0.16      0.24     45737

    accuracy                           0.79    226336
   macro avg       0.62      0.55      0.56    226336
weighted avg       0.74      0.79     

In [15]:
print(roelst_RF)
ROE_RF = np.mean(roelst_RF)
print(ROE_RF)

[1.0396726753183139, 1.039555335000076, 1.0408513802500143, 1.040219273108219, 1.0410321772136437]
1.0402661681780532


### XGBoost

In [16]:
skf = StratifiedKFold(shuffle=True, random_state=30)
roelst_xgB = []
xgB_para = {'n_estimators' : [100, None], 
           'max_depth' : [15, None],
           'gamma' : [0, 1],
           'colsample_bytree' : [0.8, 0.9]}

for fold, (train_index, test_index) in enumerate(skf.split(df_X, df_y)):
    # 인덱스를 사용하여 학습 및 테스트 데이터를 추출
    X_train_fold = df_X.iloc[train_index]
    y_train_fold = df_y.iloc[train_index]
    X_test_fold = df_X.iloc[test_index]
    y_test_fold = df_y.iloc[test_index]

    # SMOTE 오버샘플링
    smote = SMOTE(random_state=30)
    X_train_fold, y_train_fold = smote.fit_resample(X_train_fold, y_train_fold)

    # Random Forest 학습 
    clfxgB_fold = XGBClassifier(seed = 30)
    clfxgB_grid = GridSearchCV(clfxgB_fold, xgB_para, cv = 5, refit = True)
    clfxgB_grid.fit(X_train_fold, y_train_fold)
    clfxgB_fold = clfxgB_grid.best_estimator_

    print('Test Error:')
    y_pred = clfxgB_fold.predict(X_test_fold)
    print(classification_report(y_test_fold, y_pred))

    origin_test_df = df.iloc[test_index]

    # ROE 계산 및 추가
    origin_test_df['y_pred'] = y_pred
    roelst_xgB.append(my_custom_roe(origin_test_df))

Test Error:
              precision    recall  f1-score   support

           0       0.81      0.98      0.89    180600
           1       0.55      0.10      0.17     45737

    accuracy                           0.80    226337
   macro avg       0.68      0.54      0.53    226337
weighted avg       0.76      0.80      0.74    226337

Test Error:
              precision    recall  f1-score   support

           0       0.81      0.98      0.89    180600
           1       0.55      0.10      0.17     45737

    accuracy                           0.80    226337
   macro avg       0.68      0.54      0.53    226337
weighted avg       0.76      0.80      0.74    226337

Test Error:
              precision    recall  f1-score   support

           0       0.81      0.98      0.89    180599
           1       0.55      0.10      0.17     45737

    accuracy                           0.80    226336
   macro avg       0.68      0.54      0.53    226336
weighted avg       0.76      0.80     

In [17]:
print(roelst_xgB)
ROE_xgB = np.mean(roelst_xgB)
print(ROE_xgB)

[1.0400145732068389, 1.0397745231418913, 1.041344576153326, 1.0399808720841184, 1.0408086368831841]
1.0403846362938718


### QDA

In [18]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA

In [19]:
skf = StratifiedKFold(shuffle=True, random_state=30)
roelst_qda = []

for fold, (train_index, test_index) in enumerate(skf.split(df_X, df_y)):
    # 인덱스를 사용하여 학습 및 테스트 데이터를 추출
    X_train_fold = df_X.iloc[train_index]
    y_train_fold = df_y.iloc[train_index]
    X_test_fold = df_X.iloc[test_index]
    y_test_fold = df_y.iloc[test_index]

    # SMOTE 오버샘플링
    smote = SMOTE(random_state=30)
    X_train_fold, y_train_fold = smote.fit_resample(X_train_fold, y_train_fold)

    # RidgeClassifierCV 학습
    clfQDA_fold = QDA().fit(X_train_fold, y_train_fold)

    print('Test Error:')
    y_pred = clfQDA_fold.predict(X_test_fold)
    print(classification_report(y_test_fold, y_pred))

    origin_test_df = df.iloc[test_index]

    # ROE 계산 및 추가
    origin_test_df['y_pred'] = y_pred
    roelst_qda.append(my_custom_roe(origin_test_df))

Test Error:
              precision    recall  f1-score   support

           0       0.82      0.86      0.84    180600
           1       0.31      0.25      0.28     45737

    accuracy                           0.74    226337
   macro avg       0.56      0.55      0.56    226337
weighted avg       0.72      0.74      0.72    226337

Test Error:
              precision    recall  f1-score   support

           0       0.82      0.85      0.84    180600
           1       0.31      0.26      0.28     45737

    accuracy                           0.73    226337
   macro avg       0.56      0.56      0.56    226337
weighted avg       0.72      0.73      0.72    226337

Test Error:
              precision    recall  f1-score   support

           0       0.82      0.86      0.84    180599
           1       0.31      0.25      0.28     45737

    accuracy                           0.74    226336
   macro avg       0.57      0.55      0.56    226336
weighted avg       0.72      0.74     

In [20]:
print(roelst_qda)
ROE_QDA = np.mean(roelst_qda)
print(ROE_QDA)

[1.0448992145128417, 1.0451616186205173, 1.045871315992722, 1.0442149659778939, 1.0468134887794411]
1.045392120776683
