In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from typing import Union, List
import joblib
import imblearn
from imblearn.over_sampling import RandomOverSampler
import warnings
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN

warnings.filterwarnings(action='ignore')

### load data

In [2]:
df = pd.read_csv('modified_log_var25_0426.csv')

In [3]:
#df = p.get_df()
df.shape

(1131682, 26)

In [None]:
scaler = StandardScaler()

scdf = scaler.fit_transform(df)
scdf = pd.DataFrame(scdf)
scdf

### design matirx

In [5]:
df_X = df.drop(columns=['loan_status'])
df_y = df['loan_status']

X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size = 0.2, random_state = 30)

In [7]:
pca = PCA(n_components = 15)
pca.fit(df_X)
df_Xpca = pca.transform(df_X)
Xpca_train, Xpca_test, ypca_train, ypca_test = train_test_split(df_Xpca, df_y, test_size = 0.2, random_state = 30)

### oversampling

In [8]:
#naive Random oversample
ros = RandomOverSampler(random_state = 30)

X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)
Xpca_train_ros, ypca_train_ros = ros.fit_resample(Xpca_train, ypca_train)

In [9]:
#smote oversample
smote = SMOTE(random_state = 30)

X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
Xpca_train_sm, ypca_train_sm = smote.fit_resample(Xpca_train, ypca_train)

In [10]:
#adasyn oversample
adasyn = ADASYN(random_state = 30)

X_train_ad, y_train_ad = adasyn.fit_resample(X_train, y_train)
Xpca_train_ad, ypca_train_ad = adasyn.fit_resample(Xpca_train, ypca_train)

## RidgeClassifier

In [11]:
from sklearn.linear_model import RidgeClassifierCV

### Ridge : pca X

In [12]:
clfRidge = RidgeClassifierCV().fit(X_train, y_train)

In [13]:
y_pred = clfRidge.predict(X_test)
joblib.dump(clfRidge, 'clfRidge.pkl')
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.99      0.89    180654
           1       0.57      0.03      0.06     45683

    accuracy                           0.80    226337
   macro avg       0.69      0.51      0.47    226337
weighted avg       0.76      0.80      0.72    226337



In [14]:
clfRidge_ros = RidgeClassifierCV().fit(X_train_ros, y_train_ros)

In [15]:
y_pred = clfRidge_ros.predict(X_test)
joblib.dump(clfRidge_ros, 'clfRidge_ros.pkl')
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.66      0.75    180654
           1       0.32      0.64      0.43     45683

    accuracy                           0.65    226337
   macro avg       0.60      0.65      0.59    226337
weighted avg       0.77      0.65      0.69    226337



In [16]:
clfRidge_sm = RidgeClassifierCV().fit(X_train_sm, y_train_sm)

In [17]:
y_pred = clfRidge_sm.predict(X_test)
joblib.dump(clfRidge_sm, 'clfRidge_sm.pkl')
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.92      0.87    180654
           1       0.41      0.23      0.29     45683

    accuracy                           0.78    226337
   macro avg       0.62      0.57      0.58    226337
weighted avg       0.74      0.78      0.75    226337



In [18]:
clfRidge_ad = RidgeClassifierCV().fit(X_train_ad, y_train_ad)

In [19]:
y_pred = clfRidge_ad.predict(X_test)
joblib.dump(clfRidge_ad, 'clfRidge_ad.pkl')
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.92      0.87    180654
           1       0.42      0.22      0.29     45683

    accuracy                           0.78    226337
   macro avg       0.62      0.57      0.58    226337
weighted avg       0.74      0.78      0.75    226337



### Ridge : pca O

In [20]:
pcaRidge = RidgeClassifierCV().fit(Xpca_train, ypca_train)

In [21]:
y_pred = pcaRidge.predict(Xpca_test)
joblib.dump(pcaRidge, 'pcaRidge.pkl')
print(classification_report(ypca_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.99      0.89    180654
           1       0.57      0.03      0.06     45683

    accuracy                           0.80    226337
   macro avg       0.68      0.51      0.47    226337
weighted avg       0.75      0.80      0.72    226337



In [22]:
pcaRidge_ros = RidgeClassifierCV().fit(Xpca_train_ros, ypca_train_ros)

In [23]:
y_pred = pcaRidge_ros.predict(Xpca_test)
joblib.dump(pcaRidge_ros, 'pcaRidge_ros.pkl')
print(classification_report(ypca_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.66      0.75    180654
           1       0.32      0.63      0.43     45683

    accuracy                           0.65    226337
   macro avg       0.60      0.65      0.59    226337
weighted avg       0.76      0.65      0.69    226337



In [24]:
pcaRidge_sm = RidgeClassifierCV().fit(Xpca_train_sm, ypca_train_sm)

In [25]:
y_pred = pcaRidge_sm.predict(Xpca_test)
joblib.dump(pcaRidge_sm, 'pcaRidge_sm.pkl')
print(classification_report(ypca_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.66      0.75    180654
           1       0.32      0.64      0.42     45683

    accuracy                           0.65    226337
   macro avg       0.60      0.65      0.59    226337
weighted avg       0.76      0.65      0.68    226337



In [26]:
pcaRidge_ad = RidgeClassifierCV().fit(Xpca_train_ad, ypca_train_ad)

In [27]:
y_pred = pcaRidge_ad.predict(Xpca_test)
joblib.dump(pcaRidge_ad, 'pcaRidge_ad.pkl')
print(classification_report(ypca_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.65      0.75    180654
           1       0.32      0.64      0.42     45683

    accuracy                           0.65    226337
   macro avg       0.60      0.65      0.59    226337
weighted avg       0.76      0.65      0.68    226337



## LassoClassifier

In [29]:
from sklearn.linear_model import LogisticRegressionCV

### Lasso : pca X

In [30]:
clfLasso = LogisticRegressionCV(penalty='l1', solver = 'saga').fit(X_train, y_train)

In [31]:
y_pred = clfLasso.predict(X_test)
joblib.dump(clfLasso, 'clfLasso.pkl')
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.99      0.89    180654
           1       0.51      0.04      0.07     45683

    accuracy                           0.80    226337
   macro avg       0.66      0.51      0.48    226337
weighted avg       0.74      0.80      0.72    226337



In [12]:
clfLasso_ros = LogisticRegressionCV(penalty='l1', solver = 'saga').fit(X_train_ros, y_train_ros)

In [13]:
y_pred = clfLasso_ros.predict(X_test)
joblib.dump(clfLasso_ros, 'clfLasso_ros.pkl')
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.67      0.76    180654
           1       0.32      0.61      0.42     45683

    accuracy                           0.66    226337
   macro avg       0.60      0.64      0.59    226337
weighted avg       0.76      0.66      0.69    226337



In [11]:
clfLasso_sm = LogisticRegressionCV(penalty='l1', solver = 'saga').fit(X_train_sm, y_train_sm)

In [12]:
y_pred = clfLasso_sm.predict(X_test)
joblib.dump(clfLasso_sm, 'clfLasso_sm.pkl')
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.69      0.77    180654
           1       0.33      0.59      0.42     45683

    accuracy                           0.67    226337
   macro avg       0.60      0.64      0.59    226337
weighted avg       0.76      0.67      0.70    226337



In [29]:
clfLasso_ad = LogisticRegressionCV(penalty='l1', solver = 'saga').fit(X_train_ad, y_train_ad)

In [30]:
y_pred = clfLasso_ad.predict(X_test)
joblib.dump(clfLasso_ad, 'clfLasso_ad.pkl')
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.69      0.77    180654
           1       0.33      0.59      0.42     45683

    accuracy                           0.67    226337
   macro avg       0.60      0.64      0.60    226337
weighted avg       0.76      0.67      0.70    226337



### Lasso : pca O

In [14]:
pcaLasso = LogisticRegressionCV(penalty='l1', solver = 'saga').fit(Xpca_train, ypca_train)

In [15]:
y_pred = pcaLasso.predict(Xpca_test)
joblib.dump(pcaLasso, 'pcaLasso.pkl')
print(classification_report(ypca_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.91      0.87    180654
           1       0.42      0.27      0.33     45683

    accuracy                           0.78    226337
   macro avg       0.63      0.59      0.60    226337
weighted avg       0.75      0.78      0.76    226337



In [18]:
pcaLasso_ov = LogisticRegressionCV(penalty='l1', solver = 'saga').fit(Xpca_train_ros, ypca_train_ros)

In [19]:
y_pred = pcaLasso_ov.predict(Xpca_test)
joblib.dump(pcaLasso_ov, 'pcaLasso_ov.pkl')
print(classification_report(ypca_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.63      0.73    180654
           1       0.31      0.66      0.42     45683

    accuracy                           0.64    226337
   macro avg       0.60      0.65      0.58    226337
weighted avg       0.77      0.64      0.67    226337



In [31]:
pcaLasso_sm = LogisticRegressionCV(penalty='l1', solver = 'saga').fit(Xpca_train_sm, ypca_train_sm)

In [32]:
y_pred = pcaLasso_ov.predict(Xpca_test)
joblib.dump(pcaLasso_sm, 'pcaLasso_sm.pkl')
print(classification_report(ypca_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.63      0.73    180654
           1       0.31      0.66      0.42     45683

    accuracy                           0.64    226337
   macro avg       0.60      0.65      0.58    226337
weighted avg       0.77      0.64      0.67    226337



In [33]:
pcaLasso_ad = LogisticRegressionCV(penalty='l1', solver = 'saga').fit(Xpca_train_ad, ypca_train_ad)

In [34]:
y_pred = pcaLasso_ad.predict(Xpca_test)
joblib.dump(pcaLasso_ad, 'pcaLasso_ad.pkl')
print(classification_report(ypca_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.62      0.73    180654
           1       0.31      0.67      0.43     45683

    accuracy                           0.63    226337
   macro avg       0.60      0.65      0.58    226337
weighted avg       0.77      0.63      0.67    226337



## LDA

In [16]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

### pca X

In [17]:
clfLDA = LinearDiscriminantAnalysis().fit(X_train, y_train)

In [18]:
y_pred = clfLDA.predict(X_test)
joblib.dump(clfLDA, 'clfLDA.pkl')
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.51      0.10      0.16     45821
           1       0.81      0.98      0.89    180516

    accuracy                           0.80    226337
   macro avg       0.66      0.54      0.52    226337
weighted avg       0.75      0.80      0.74    226337



In [26]:
clfLDA_ros = LinearDiscriminantAnalysis().fit(X_train_ros, y_train_ros)

In [27]:
y_pred = clfLDA_ros.predict(X_test)
joblib.dump(clfLDA_ros, 'clfLDA_ros.pkl')
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.51      0.10      0.16     45821
           1       0.81      0.98      0.89    180516

    accuracy                           0.80    226337
   macro avg       0.66      0.54      0.52    226337
weighted avg       0.75      0.80      0.74    226337



## QDA

In [21]:
# QDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [22]:
clfQDA = QuadraticDiscriminantAnalysis().fit(X_train, y_train)

In [23]:
y_pred = clfQDA.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.37      0.31      0.34     45821
           1       0.83      0.86      0.85    180516

    accuracy                           0.75    226337
   macro avg       0.60      0.59      0.59    226337
weighted avg       0.74      0.75      0.74    226337



In [24]:
clfQDA_over = QuadraticDiscriminantAnalysis().fit(X_resampled, y_resampled)

In [25]:
y_pred = clfQDA_over.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.31      0.60      0.41     45821
           1       0.87      0.66      0.75    180516

    accuracy                           0.65    226337
   macro avg       0.59      0.63      0.58    226337
weighted avg       0.76      0.65      0.68    226337



In [28]:
clfQDA_pcaover = QuadraticDiscriminantAnalysis().fit(X_resampled_pca, y_resampled_pca)

In [30]:
y_pred = clfQDA_pcaover.predict(X_test_pca)
print(classification_report(y_test_pca, y_pred))

              precision    recall  f1-score   support

           0       0.32      0.53      0.40     45821
           1       0.86      0.72      0.78    180516

    accuracy                           0.68    226337
   macro avg       0.59      0.63      0.59    226337
weighted avg       0.75      0.68      0.70    226337



사실 LDA와 QDA는 자료의 정규성을 가정하고 하는 거라 자료가 연속이어야 함! 해당 자료는 그 차원에서 문제가 있기는 합니다. LDA와 QDA의 타당성에 대해서는 이와 같은 관점에서 무리가 있습니다.

## Naive Bayes

In [45]:
from sklearn.naive_bayes import GaussianNB

### pca X

In [46]:
clfNB = GaussianNB().fit(X_train, y_train)

In [47]:
y_pred = clfNB.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.34      0.42      0.38     45821
           1       0.84      0.79      0.82    180516

    accuracy                           0.72    226337
   macro avg       0.59      0.61      0.60    226337
weighted avg       0.74      0.72      0.73    226337



In [48]:
clfNB_over = GaussianNB().fit(X_resampled, y_resampled)

In [49]:
y_pred = clfNB_over.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.31      0.60      0.41     45821
           1       0.87      0.66      0.75    180516

    accuracy                           0.65    226337
   macro avg       0.59      0.63      0.58    226337
weighted avg       0.75      0.65      0.68    226337



## K-NN

In [13]:
from sklearn.neighbors import KNeighborsClassifier

In [14]:
clfKNN = KNeighborsClassifier(n_neighbors = 4).fit(X_train, y_train)

In [15]:
y_pred = clfKNN.predict(X_test)
#joblib.dump(clfKNN, 'clfKNN.pkl') clfKNN은 데이터 창고일 뿐. predict 수행을 함으로써 계산함.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.96      0.88    180654
           1       0.35      0.08      0.14     45683

    accuracy                           0.78    226337
   macro avg       0.58      0.52      0.51    226337
weighted avg       0.71      0.78      0.73    226337



In [16]:
clfKNN_ov = KNeighborsClassifier(n_neighbors = 4).fit(X_train, y_train)

In [17]:
y_pred = clfKNN_ov.predict(X_test)
#joblib.dump(clfKNN, 'clfKNN.pkl') clfKNN은 데이터 창고일 뿐. predict 수행을 함으로써 계산함.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.96      0.88    180654
           1       0.35      0.08      0.14     45683

    accuracy                           0.78    226337
   macro avg       0.58      0.52      0.51    226337
weighted avg       0.71      0.78      0.73    226337



본원적으로, K-NN은 다차원 데이터에서 차원의 저주 문제가 발생하기 쉬움.

## SVM

In [1]:
from sklearn.svm import SVC
#from sklearn import svm

In [9]:
clfSVM = svm.SVC()
clfSVM.fit(Xpca_train, ypca_train)

In [10]:
y_pred = clfSVM.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.27      0.54      0.36     45821
           1       0.84      0.64      0.73    180516

    accuracy                           0.62    226337
   macro avg       0.56      0.59      0.54    226337
weighted avg       0.73      0.62      0.65    226337



In [11]:
joblib.dump(clfSVM, 'clfSAM.pkl')

['clfSAM.pkl']

### pca O

In [37]:
clfSVM = SVC(kernel='linear', C=1e10, random_state=30).fit(Xpca_train, ypca_train)

In [None]:
y_pred = pcaSVM.predict(Xpca_test)
joblib.dump(pcaSVM, 'pcaSVM.pkl')
print(classification_report(ypca_test, y_pred))

In [11]:
from sklearn.svm import LinearSVC

In [12]:
pcaSVM_ros = LinearSVC(C=1, random_state=30).fit(Xpca_train_ros, ypca_train_ros)

In [13]:
y_pred = pcaSVM_ros.predict(Xpca_test)
joblib.dump(pcaSVM_ros, 'pcaSVM_ros.pkl')
print(classification_report(ypca_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.42      0.55    180654
           1       0.18      0.50      0.27     45683

    accuracy                           0.44    226337
   macro avg       0.48      0.46      0.41    226337
weighted avg       0.65      0.44      0.49    226337



In [24]:
pcaSVM_sm = LinearSVC(C=1e10, random_state=30).fit(Xpca_train_sm, ypca_train_sm)

In [26]:
y_pred = pcaSVM_sm.predict(Xpca_test)
joblib.dump(pcaSVM_sm, 'pcaSVM_sm.pkl')
print(classification_report(ypca_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.62      0.71    180654
           1       0.25      0.50      0.34     45683

    accuracy                           0.60    226337
   macro avg       0.54      0.56      0.52    226337
weighted avg       0.72      0.60      0.64    226337



In [27]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.01, 0.1, 1, 10, 100, 1000]}
pcaSVM_ros_grid = LinearSVC(random_state=30)
grid = GridSearchCV(pcaSVM_ros_grid, param_grid)

In [28]:
grid.fit(Xpca_train_ros, ypca_train_ros)

In [35]:
sorted(grid.cv_results_.keys())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_C',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [44]:
print(grid.cv_results_['param_C'], grid.cv_results_['rank_test_score'])

[0.01 0.1 1 10 100 1000] [5 2 1 6 3 3]
