In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from datetime import datetime
import time
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score,confusion_matrix,ConfusionMatrixDisplay,roc_curve,roc_auc_score,precision_recall_curve
from sklearn.ensemble import RandomForestClassifier , StackingClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression,Lasso
from sklearn.preprocessing import Binarizer
from sklearn.model_selection import cross_val_score,GridSearchCV

# 한글 깨짐 방지
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'malgun Gothic'

In [3]:
train = pd.read_csv('/content/drive/MyDrive/datasets/train_Winsorization.csv', encoding='euc-kr')
test = pd.read_csv('/content/drive/MyDrive/datasets/test.csv', encoding='euc-kr')

In [4]:
train['기업수명주기'] = train['기업수명주기'].map({
    '도입기' : 1,
    '성장기' : 2,
    '성숙기' : 3,
    '수축기' : 4,
    '쇠퇴기' : 5
}).astype('category')

test['기업수명주기'] = test['기업수명주기'].map({
    '도입기' : 1,
    '성장기' : 2,
    '성숙기' : 3,
    '수축기' : 4,
    '쇠퇴기' : 5
}).astype('category')

train['파부비초과여부']=train['파부비초과여부'].astype('category')
test['파부비초과여부']=test['파부비초과여부'].astype('category')
train['파차의초과여부']=train['파차의초과여부'].astype('category')
test['파차의초과여부']=test['파차의초과여부'].astype('category')

In [5]:
X_train = train.drop('t-1감사의견코드',axis=1)
X_train = X_train.iloc[:,3:]
y_train = train[['t-1감사의견코드']]
X_test = test.drop('t-1감사의견코드',axis=1)
X_test = X_test.iloc[:,3:]
y_test = test[['t-1감사의견코드']]

In [6]:
scaler = StandardScaler()
train_sc = scaler.fit_transform(X_train)
test_sc = scaler.transform(X_test)

---
## 리샘플링 X

In [None]:
def model_basic(x_train, y_train, x_test, y_test):
    models = [
        LogisticRegression(random_state=0),
        RandomForestClassifier(random_state=0),
        XGBClassifier(random_state=0),
        LGBMClassifier(random_state=0),
        SVC(random_state=0)
    ]

    rdict={'model':[],'acc_train':[], 'auc_train':[], 'acc_test':[],'precision':[],'recall':[],'f1_score':[], 'AUC_test':[]}


    for clf in models:
        clf = clf.fit(x_train, y_train)
    #1열:Train
        y_hat = clf.predict(x_train)
        results_train  = (round(accuracy_score(y_train,y_hat),4),round(roc_auc_score(y_train,y_hat),4))
    #2열:Test
        y_hat = clf.predict(x_test)
        results = (round(accuracy_score(y_test,y_hat),4),
                        round(precision_score(y_test,y_hat),4),
                        round(recall_score(y_test,y_hat),4),
                        round(f1_score(y_test,y_hat),4),
                        round(roc_auc_score(y_test,y_hat),4))

        rdict['model'].append(clf);
        rdict['acc_train'].append(results_train[0])
        rdict['auc_train'].append(results_train[1])

        rdict['acc_test'].append(results[0])
        rdict['precision'].append(results[1])
        rdict['recall'].append(results[2])
        rdict['f1_score'].append(results[3])
        rdict['AUC_test'].append(results[4])

        confusion = confusion_matrix(y_test, y_hat)

        print(confusion)

    rdf_final = pd.DataFrame(data=rdict)
    return rdf_final

In [None]:
model_basic(train_sc, y_train, test_sc, y_test)

[[37624   633]
 [ 2456   412]]
[[37582   675]
 [ 1451  1417]]
[[37443   814]
 [ 1316  1552]]
[[37424   833]
 [ 1229  1639]]
[[37985   272]
 [ 2284   584]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(random_state=0),0.9293,0.5609,0.9249,0.3943,0.1437,0.2106,0.5636
1,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,0.9998,0.9483,0.6773,0.4941,0.5714,0.7382
2,"XGBClassifier(base_score=None, booster=None, c...",0.9655,0.8346,0.9482,0.656,0.5411,0.593,0.7599
3,LGBMClassifier(random_state=0),0.9552,0.7944,0.9499,0.663,0.5715,0.6139,0.7749
4,SVC(random_state=0),0.9409,0.6209,0.9378,0.6822,0.2036,0.3136,0.5983


---
2. 리샘플링

하이퍼 파라미터 튜닝

- 1.LogisticRegression

In [None]:
### LogisticRegression

from sklearn.model_selection import GridSearchCV
param_grid = {
        'C': [0.001, 0.01, 0.1, 1, 10],
        'penalty': ['l1', 'l2', 'elasticnet']
}

estimator = LogisticRegression(n_jobs=-1)

In [None]:
# define grid_search
lr_grid_search = GridSearchCV(estimator=estimator,
                           param_grid=param_grid,
                           n_jobs=-1,
                           verbose=0
                          )

# fit with (x_train, y_train)
lr_grid_search.fit(train_sc, y_train)

In [None]:
# 최적 하이퍼 파라미터 조합
lr_grid_search.best_params_

{'C': 0.001, 'penalty': 'l2'}

- 2.RandomforestClassifier

In [7]:
### 이부분만 수정하면 됨!(param_grid, estimator)

from sklearn.model_selection import GridSearchCV, StratifiedKFold
param_grid = {
        'criterion': ['entropy', 'gini'],
        'max_depth': [10,20,30],
        'max_features': ['auto', 'sqrt'],
        'min_samples_leaf': [20,30,50],
        'min_samples_split': [20,30,50],
        'n_estimators': [50,75,100]
        }

estimator = RandomForestClassifier(n_jobs=-1)

In [None]:
# define grid_search
rf_grid_search = GridSearchCV(estimator=estimator,
                           param_grid=param_grid,
                           n_jobs=-1,
                           verbose=0
                          )

# fit with (x_train, y_train)
rf_grid_search.fit(train_sc, y_train)

In [None]:
# 최적 하이퍼 파라미터 조합
rf_grid_search.best_params_

- 3.XGB classifier

In [None]:
### 이부분만 수정하면 됨!(param_grid, estimator)

from sklearn.model_selection import GridSearchCV, StratifiedKFold
param_grid = {
        'n_estimators':[40,50,70],
        'learning_rate':[0.005,0.01,0.03,0.01],
        'max_depth' : [10,15,20,30]
}

estimator = XGBClassifier(n_jobs=-1)

In [None]:
# define grid_search
xgb_grid_search = GridSearchCV(estimator=estimator,
                           param_grid=param_grid,
                           n_jobs=-1,
                           verbose=0
                          )

# fit with (x_train, y_train)
xgb_grid_search.fit(train_sc, y_train)

ValueError: ignored

In [None]:
# 최적 하이퍼 파라미터 조합
xgb_grid_search.best_params_

4.LGBM calssifier

In [None]:
### 이부분만 수정하면 됨!(param_grid, estimator)

from sklearn.model_selection import GridSearchCV, StratifiedKFold
param_grid = {
        'n_estimators':[40,50,70],
        'learning_rate':[0.005,0.01,0.03,0.01],
        'max_depth' :[10,15,20,30]
}

estimator = LGBMClassifier(n_jobs=-1)

In [None]:
# define grid_search
lgbm_grid_search = GridSearchCV(estimator=estimator,
                           param_grid=param_grid,
                           n_jobs=-1,
                           verbose=0
                          )

# fit with (x_train, y_train)
lgbm_grid_search.fit(train_sc, y_train)

In [None]:
# 최적 하이퍼 파라미터 조합
lgbm_grid_search.best_params_

5. SVC

In [None]:
### 이부분만 수정하면 됨!(param_grid, estimator)

from sklearn.model_selection import GridSearchCV, StratifiedKFold
param_grid = {
        'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],
        'C': [0.001, 0.01, 0.1, 1, 10]
}

estimator = SVC()

In [None]:
# define grid_search
svc_grid_search = GridSearchCV(estimator=estimator,
                           param_grid=param_grid,
                           n_jobs=-1,
                           verbose=0
                          )

# fit with (x_train, y_train)
svc_grid_search.fit(X_train, y_train)

In [None]:
# 최적 하이퍼 파라미터 조합
svc_grid_search.best_params_

---

## 하이퍼 파라미터 튜닝후 결과값

In [None]:
def model_basic(x_train, y_train, x_test, y_test):
    models = [
        # LogisticRegression(C = 10, penalty = 'l2', random_state=0),
        # SVC(C = 10, kernel = 'rbf', random_state=0),
        # DecisionTreeClassifier(max_depth = 5, min_samples_leaf = 6, random_state=0),
        # RandomForestClassifier(random_state=0, criterion = 'entropy', max_depth = 8, max_features = 'auto', min_samples_leaf = 5,
        #                         min_samples_split = 7, n_estimators = 20),
        # XGBClassifier(learning_rate = 0.03, max_depth = 3, n_estimators = 40, random_state=0),
        # LGBMClassifier(learning_rate = 0.03, max_depth = 4, n_estimators = 46, random_state=0)
    ]

    rdict={'model':[],'acc_train':[], 'auc_train':[], 'acc_test':[],'precision':[],'recall':[],'f1_score':[], 'AUC_test':[]}


    for clf in models:
        clf = clf.fit(x_train, y_train)
    #1열:Train
        y_hat = clf.predict(x_train)
        results_train  = (round(accuracy_score(y_train,y_hat),4),round(roc_auc_score(y_train,y_hat),4))
    #2열:Test
        y_hat = clf.predict(x_test)
        results = (round(accuracy_score(y_test,y_hat),4),
                        round(precision_score(y_test,y_hat),4),
                        round(recall_score(y_test,y_hat),4),
                        round(f1_score(y_test,y_hat),4),
                        round(roc_auc_score(y_test,y_hat),4))

        rdict['model'].append(clf);
        rdict['acc_train'].append(results_train[0])
        rdict['auc_train'].append(results_train[1])

        rdict['acc_test'].append(results[0])
        rdict['precision'].append(results[1])
        rdict['recall'].append(results[2])
        rdict['f1_score'].append(results[3])
        rdict['AUC_test'].append(results[4])

        confusion = confusion_matrix(y_test, y_hat)

        print(confusion)

    rdf_final = pd.DataFrame(data=rdict)
    return rdf_final