# 1. 모델링 위한 세팅

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score,accuracy_score, precision_score,roc_auc_score,f1_score,confusion_matrix
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from imblearn.ensemble import RUSBoostClassifier
from catboost import CatBoostClassifier

plt.rcParams['font.family'] = 'Malgun Gothic'

import warnings
warnings.filterwarnings('ignore')

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
df = pd.read_csv('../data/dataset/코스피_전처리완.csv')

X = df[df.columns[6:]]
y = df['분식기업']

X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state = 42)

n, d = X_train.shape
print("number of feature:", d)  # 변수 개수
print("number of data:", n)     # 데이터 개수

number of feature: 40
number of data: 4920


In [3]:
print(y_train.value_counts())
print(y_test.value_counts())

분식기업
0.0    4821
1.0      99
Name: count, dtype: int64
분식기업
0.0    2066
1.0      43
Name: count, dtype: int64


# 2. 모델링

In [4]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# CatBoost 모델 생성
model = CatBoostClassifier(iterations=100, learning_rate=0.1, loss_function='Logloss', random_state = 42)

# 모델 학습
model.fit(X_train, y_train, verbose=0)

# 테스트 데이터에 대한 예측
y_pred2 = model.predict(X_test)

# 다양한 평가 지표 출력
accuracy = accuracy_score(y_test, y_pred2)
precision = precision_score(y_test, y_pred2)
recall = recall_score(y_test, y_pred2)
f1 = f1_score(y_test, y_pred2)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Accuracy: 0.9810336652441916
Precision: 1.0
Recall: 0.06976744186046512
F1 Score: 0.13043478260869565


In [5]:
df2 = pd.DataFrame()

In [6]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_train = rf.predict(X_train)
y_pred_test = rf.predict(X_test)

rf_train = [accuracy_score(y_train, y_pred_train),precision_score(y_train, y_pred_train),recall_score(y_train, y_pred_train),f1_score(y_train, y_pred_train),roc_auc_score(y_train, y_pred_train)]
rf_test = [accuracy_score(y_test, y_pred_test),precision_score(y_test, y_pred_test),recall_score(y_test, y_pred_test),f1_score(y_test, y_pred_test),roc_auc_score(y_test, y_pred_test)]

df2['rf_train'] = rf_train
df2['rf_test'] = rf_test


print(confusion_matrix(y_test,y_pred_test))

[[2066    0]
 [  42    1]]


In [7]:
ad = AdaBoostClassifier(random_state=42)
ad.fit(X_train, y_train)
y_pred_train = ad.predict(X_train)
y_pred_test = ad.predict(X_test)

ad_train = [accuracy_score(y_train, y_pred_train),precision_score(y_train, y_pred_train),recall_score(y_train, y_pred_train),f1_score(y_train, y_pred_train),roc_auc_score(y_train, y_pred_train)]
ad_test = [accuracy_score(y_test, y_pred_test),precision_score(y_test, y_pred_test),recall_score(y_test, y_pred_test),f1_score(y_test, y_pred_test),roc_auc_score(y_test, y_pred_test)]

df2['ad_train'] = ad_train
df2['ad_test'] = ad_test


print(confusion_matrix(y_test,y_pred_test))

[[2064    2]
 [  40    3]]


In [8]:
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)
y_pred_train = gb.predict(X_train)
y_pred_test = gb.predict(X_test)

gb_train = [accuracy_score(y_train, y_pred_train),precision_score(y_train, y_pred_train),recall_score(y_train, y_pred_train),f1_score(y_train, y_pred_train),roc_auc_score(y_train, y_pred_train)]
gb_test = [accuracy_score(y_test, y_pred_test),precision_score(y_test, y_pred_test),recall_score(y_test, y_pred_test),f1_score(y_test, y_pred_test),roc_auc_score(y_test, y_pred_test)]

df2['gb_train'] = gb_train
df2['gb_test'] = gb_test


print(confusion_matrix(y_test,y_pred_test))

[[2054   12]
 [  38    5]]


In [9]:
rus = RUSBoostClassifier(random_state=42)
rus.fit(X_train, y_train)
y_pred_train = rus.predict(X_train)
y_pred_test = rus.predict(X_test)

rus_train = [accuracy_score(y_train, y_pred_train),precision_score(y_train, y_pred_train),recall_score(y_train, y_pred_train),f1_score(y_train, y_pred_train),roc_auc_score(y_train, y_pred_train)]
rus_test = [accuracy_score(y_test, y_pred_test),precision_score(y_test, y_pred_test),recall_score(y_test, y_pred_test),f1_score(y_test, y_pred_test),roc_auc_score(y_test, y_pred_test)]

df2['rus_train'] = rus_train
df2['rus_test'] = rus_test


print(confusion_matrix(y_test,y_pred_test))

[[1604  462]
 [  16   27]]


In [10]:
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train, y_train)
y_pred_train = xgb.predict(X_train)
y_pred_test = xgb.predict(X_test)

xgb_train = [accuracy_score(y_train, y_pred_train),precision_score(y_train, y_pred_train),recall_score(y_train, y_pred_train),f1_score(y_train, y_pred_train),roc_auc_score(y_train, y_pred_train)]
xgb_test = [accuracy_score(y_test, y_pred_test),precision_score(y_test, y_pred_test),recall_score(y_test, y_pred_test),f1_score(y_test, y_pred_test),roc_auc_score(y_test, y_pred_test)]

df2['xgb_train'] = xgb_train
df2['xgb_test'] = xgb_test


print(confusion_matrix(y_test,y_pred_test))

[[2064    2]
 [  38    5]]


In [11]:
lgb = LGBMClassifier(random_state=42)
lgb.fit(X_train, y_train)
y_pred_train = lgb.predict(X_train)
y_pred_test = lgb.predict(X_test)

lgb_train =[accuracy_score(y_train, y_pred_train),precision_score(y_train, y_pred_train),recall_score(y_train, y_pred_train),f1_score(y_train, y_pred_train),roc_auc_score(y_train, y_pred_train)]
lgb_test = [accuracy_score(y_test, y_pred_test),precision_score(y_test, y_pred_test),recall_score(y_test, y_pred_test),f1_score(y_test, y_pred_test),roc_auc_score(y_test, y_pred_test)]

df2['lgb_train'] = lgb_train
df2['lgb_test'] = lgb_test


print(confusion_matrix(y_test,y_pred_test))

[LightGBM] [Info] Number of positive: 99, number of negative: 4821
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002461 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9332
[LightGBM] [Info] Number of data points in the train set: 4920, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.020122 -> initscore=-3.885617
[LightGBM] [Info] Start training from score -3.885617
[[2066    0]
 [  38    5]]


In [12]:
cat = CatBoostClassifier(random_state=42, verbose=0)
cat.fit(X_train, y_train)
y_pred_train = cat.predict(X_train)
y_pred_test = cat.predict(X_test)

cat_train = [accuracy_score(y_train, y_pred_train),precision_score(y_train, y_pred_train),recall_score(y_train, y_pred_train),f1_score(y_train, y_pred_train),roc_auc_score(y_train, y_pred_train)]
cat_test = [accuracy_score(y_test, y_pred_test),precision_score(y_test, y_pred_test),recall_score(y_test, y_pred_test),f1_score(y_test, y_pred_test),roc_auc_score(y_test, y_pred_test)]

df2['cat_train'] = cat_train
df2['cat_test'] = cat_test


print(confusion_matrix(y_test,y_pred_test))

[[2066    0]
 [  41    2]]


## 성능요약

In [13]:
df2.index = ['accuracy','precision','recall','f1-score','roc-auc']
df2

Unnamed: 0,rf_train,rf_test,ad_train,ad_test,gb_train,gb_test,rus_train,rus_test,xgb_train,xgb_test,lgb_train,lgb_test,cat_train,cat_test
accuracy,1.0,0.980085,0.979878,0.980085,0.993902,0.976292,0.76626,0.773352,1.0,0.981034,1.0,0.981982,0.997764,0.98056
precision,1.0,1.0,0.5,0.6,1.0,0.294118,0.054283,0.055215,1.0,0.714286,1.0,1.0,1.0,1.0
recall,1.0,0.023256,0.121212,0.069767,0.69697,0.116279,0.646465,0.627907,1.0,0.116279,1.0,0.116279,0.888889,0.046512
f1-score,1.0,0.045455,0.195122,0.125,0.821429,0.166667,0.100156,0.101504,1.0,0.2,1.0,0.208333,0.941176,0.088889
roc-auc,1.0,0.511628,0.559362,0.5344,0.848485,0.555235,0.707592,0.702143,1.0,0.557656,1.0,0.55814,0.944444,0.523256


# 3. 교차검증

In [None]:
from sklearn.model_selection import GridSearchCV

