In [1]:
import joblib
import pandas as pd
import numpy as np
import warnings
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from typing import Union, List
#import ISLP
import imblearn
from imblearn.over_sampling import RandomOverSampler
import warnings
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN


warnings.filterwarnings(action='ignore')

In [2]:
df = pd.read_csv('0423_col40.csv')

In [3]:
#df = p.get_df()
df.shape

(1131682, 40)

new_loan_status = []
for i in range(len(df)) :
    if df['loan_status'][i] == 0 :
        new_loan_status.append(1)
    else :
        new_loan_status.append(0)

df.insert(0, 'new_loan_status', new_loan_status)
df

In [5]:
df_X = df.drop(columns=['loan_status', 'Unnamed: 0'])
df_y = df['loan_status']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size = 0.2, random_state = 42)

ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [7]:
smote = SMOTE()
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

In [8]:
adasyn = ADASYN(random_state = 42)
X_train_ad, y_train_ad = adasyn.fit_resample(X_train, y_train)

In [9]:
pca = PCA(n_components = 36)
pca.fit(df_X)
X_pca = pca.transform(df_X)
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, df_y, test_size = 0.2, random_state = 42)
#X_train, X_test, y_train, y_test = train_test_split(X_pca, df_y, test_size=0.2, random_state=42, stratify = y_train)

In [10]:
ros = RandomOverSampler(random_state=42)
X_resampled_pca, y_resampled_pca = ros.fit_resample(X_train_pca, y_train_pca)

In [11]:
adasyn = ADASYN(random_state = 42)
X_train_adpca, y_train_adpca = adasyn.fit_resample(X_train_pca, y_train_pca)

## RidgeClassifier

In [13]:
from sklearn.linear_model import RidgeClassifierCV

### Ridge : oversample X, pca X

In [11]:
clfRidge = RidgeClassifierCV().fit(X_train, y_train)

In [13]:
y_pred = clfRidge.predict(X_test)
joblib.dump(clfRidge, 'clfRidge.pkl')
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.03      0.06     45821
           1       0.80      0.99      0.89    180516

    accuracy                           0.80    226337
   macro avg       0.69      0.51      0.47    226337
weighted avg       0.76      0.80      0.72    226337



### Ridge : oversample(naive), pca X

In [14]:
clfRidge_over = RidgeClassifierCV().fit(X_resampled, y_resampled)

In [15]:
y_pred = clfRidge_over.predict(X_test)
joblib.dump(clfRidge_over, 'clfRidge_over.pkl')
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.29      0.64      0.40     45821
           1       0.87      0.60      0.71    180516

    accuracy                           0.61    226337
   macro avg       0.58      0.62      0.55    226337
weighted avg       0.75      0.61      0.65    226337



### Ridge : oversample(smooth), pca X

In [33]:
clfRidge_oversm = RidgeClassifierCV().fit(X_train_sm, y_train_sm)

In [34]:
y_pred = clfRidge_oversm.predict(X_test)
joblib.dump(clfRidge_oversm, 'clfRidge_oversm.pkl')
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.43      0.18      0.26     45821
           1       0.82      0.94      0.88    180516

    accuracy                           0.79    226337
   macro avg       0.63      0.56      0.57    226337
weighted avg       0.74      0.79      0.75    226337



### Ridge : oversample(adasyn), pca X

In [12]:
clfRidge_overad = RidgeClassifierCV().fit(X_train_ad, y_train_ad)

In [14]:
y_pred = clfRidge_overad.predict(X_test)
joblib.dump(clfRidge_overad, 'clfRidge_overad.pkl')
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.48      0.13      0.20     45821
           1       0.81      0.96      0.88    180516

    accuracy                           0.80    226337
   macro avg       0.65      0.55      0.54    226337
weighted avg       0.75      0.80      0.75    226337



### Ridge : oversample X, pca O

In [None]:
clfRidge_pca = RidgeClassifierCV().fit(X_train_pca, y_train_pca)

### Ridge : oversample(naive), pca O

In [25]:
clfRidge_pcaover = RidgeClassifierCV().fit(X_resampled_pca, y_resampled_pca)

In [26]:
y_pred = clfRidge_pcaover.predict(X_test_pca)
joblib.dump(clfRidge_overad, 'clfRidge_overad.pkl')
print(classification_report(y_test_pca, y_pred))

              precision    recall  f1-score   support

           0       0.33      0.65      0.43     45821
           1       0.88      0.66      0.75    180516

    accuracy                           0.66    226337
   macro avg       0.60      0.65      0.59    226337
weighted avg       0.77      0.66      0.69    226337



### Ridge : oversample(adasyn), pca O

In [15]:
clfRidge_pcaad = RidgeClassifierCV().fit(X_train_adpca, y_train_adpca)

In [16]:
y_pred = clfRidge_pcaad.predict(X_test_pca)
joblib.dump(clfRidge_pcaad, 'clfRidge_pcaad.pkl')
print(classification_report(y_test_pca, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.64      0.74    180516
           1       0.32      0.65      0.42     45821

    accuracy                           0.64    226337
   macro avg       0.60      0.65      0.58    226337
weighted avg       0.76      0.64      0.68    226337



## LassoClassifier

In [14]:
from sklearn.linear_model import LogisticRegressionCV

In [15]:
clfLasso = LogisticRegressionCV(penalty='l1', solver = 'saga').fit(X_train, y_train)

In [28]:
y_pred = clfLasso.predict(X_test)
print(classification_report(y_test, y_pred))
#48분 걸려서 recall 0.05!

              precision    recall  f1-score   support

           0       0.80      1.00      0.89    180516
           1       0.00      0.00      0.00     45821

    accuracy                           0.80    226337
   macro avg       0.40      0.50      0.44    226337
weighted avg       0.64      0.80      0.71    226337



In [None]:
clfLasso_over = LogisticRegressionCV(penalty='l1', solver = 'saga').fit(X_resampled, y_resampled)

In [None]:
y_pred = clfLasso_over.predict(X_test)
print(classification_report(y_test, y_pred))

In [11]:
clfLasso_pcaover = LogisticRegressionCV(penalty='l1', solver = 'saga').fit(X_resampled_pca, y_resampled_pca)

In [12]:
y_pred = clfLasso_pcaover.predict(X_test_pca)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.31      0.67      0.43     45821
           1       0.88      0.63      0.73    180516

    accuracy                           0.64    226337
   macro avg       0.60      0.65      0.58    226337
weighted avg       0.77      0.64      0.67    226337



## LDA, QDA

In [16]:
#LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [17]:
clfLDA = LinearDiscriminantAnalysis().fit(X_train, y_train)

In [18]:
y_pred = clfLDA.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.51      0.10      0.16     45821
           1       0.81      0.98      0.89    180516

    accuracy                           0.80    226337
   macro avg       0.66      0.54      0.52    226337
weighted avg       0.75      0.80      0.74    226337



In [26]:
clfLDA_over = LinearDiscriminantAnalysis().fit(X_resampled, y_resampled)

In [27]:
y_pred = clfLDA.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.51      0.10      0.16     45821
           1       0.81      0.98      0.89    180516

    accuracy                           0.80    226337
   macro avg       0.66      0.54      0.52    226337
weighted avg       0.75      0.80      0.74    226337



In [21]:
# QDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [22]:
clfQDA = QuadraticDiscriminantAnalysis().fit(X_train, y_train)

In [23]:
y_pred = clfQDA.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.37      0.31      0.34     45821
           1       0.83      0.86      0.85    180516

    accuracy                           0.75    226337
   macro avg       0.60      0.59      0.59    226337
weighted avg       0.74      0.75      0.74    226337



In [24]:
clfQDA_over = QuadraticDiscriminantAnalysis().fit(X_resampled, y_resampled)

In [25]:
y_pred = clfQDA_over.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.31      0.60      0.41     45821
           1       0.87      0.66      0.75    180516

    accuracy                           0.65    226337
   macro avg       0.59      0.63      0.58    226337
weighted avg       0.76      0.65      0.68    226337



In [28]:
clfQDA_pcaover = QuadraticDiscriminantAnalysis().fit(X_resampled_pca, y_resampled_pca)

In [30]:
y_pred = clfQDA_pcaover.predict(X_test_pca)
print(classification_report(y_test_pca, y_pred))

              precision    recall  f1-score   support

           0       0.32      0.53      0.40     45821
           1       0.86      0.72      0.78    180516

    accuracy                           0.68    226337
   macro avg       0.59      0.63      0.59    226337
weighted avg       0.75      0.68      0.70    226337



사실 LDA와 QDA는 자료의 정규성을 가정하고 하는 거라 자료가 연속이어야 함! 해당 자료는 그 차원에서 문제가 있기는 합니다. LDA와 QDA의 타당성에 대해서는 이와 같은 관점에서 무리가 있습니다.

## Naive Bayes

In [45]:
from sklearn.naive_bayes import GaussianNB

In [46]:
clfNB = GaussianNB().fit(X_train, y_train)

In [47]:
y_pred = clfNB.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.34      0.42      0.38     45821
           1       0.84      0.79      0.82    180516

    accuracy                           0.72    226337
   macro avg       0.59      0.61      0.60    226337
weighted avg       0.74      0.72      0.73    226337



In [48]:
clfNB_over = GaussianNB().fit(X_resampled, y_resampled)

In [49]:
y_pred = clfNB_over.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.31      0.60      0.41     45821
           1       0.87      0.66      0.75    180516

    accuracy                           0.65    226337
   macro avg       0.59      0.63      0.58    226337
weighted avg       0.75      0.65      0.68    226337



## SVM

from PyML import *

In [9]:
from sklearn import svm
clfSVM = svm.SVC()
clfSVM.fit(X_train_ad, y_train_ad)

In [10]:
y_pred = clfSVM.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.27      0.54      0.36     45821
           1       0.84      0.64      0.73    180516

    accuracy                           0.62    226337
   macro avg       0.56      0.59      0.54    226337
weighted avg       0.73      0.62      0.65    226337



In [11]:
joblib.dump(clfSVM, 'clfSAM.pkl')

['clfSAM.pkl']