In [19]:
import pandas as pd
import numpy as np
pd.options.display.float_format = '{:.5f}'.format

import matplotlib.pyplot as plt
from matplotlib import rc
%matplotlib inline
import seaborn as sns
rc('font', family='AppleGothic')
plt.rcParams['axes.unicode_minus'] = False

from scipy import stats
import scipy.stats as stats
from scipy.stats import bartlett
from statsmodels.stats.outliers_influence import variance_inflation_factor

#### Dataset

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
df_good = pd.read_csv('./dataset/data_확장기.csv')
df_bad = pd.read_csv('./dataset/data_수축기.csv')
df_change = pd.read_csv('./dataset/data_전환기.csv')
df_all = pd.read_csv('./dataset/data_전체.csv')

In [22]:
def data(df):
    features = df.drop(['거래소코드', '회사명', '회계년도', 'target'], axis=1)
    target = df['target']

    X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=121, stratify=target, test_size=0.2)
    
    return X_train, X_test, y_train, y_test

In [23]:
X_train, X_test, y_train, y_test = data(df_good)

In [24]:
X_train

Unnamed: 0,자기자본비율,부채비율,유동비율,당좌비율,차입금의존도,OCF,총자산이익률,총자본경상이익률,자기자본순이익률,자기자본영업이익률,매출원가율,총자본회전율,자기자본회전율,순운전자본회전율,총자본증가율,유형자산증가율,순이익증가율
22520,81.71861,22.37115,370.86516,223.46498,0.00000,4022986.00000,12.68386,15.85383,15.52139,15.95515,90.80318,206.78441,253.04446,656.13637,4.32880,-1.17619,8.53339
47856,47.00158,112.75880,142.69388,49.20317,31.91125,12743343.00000,1.29576,1.36182,2.75683,3.02661,92.59168,66.29312,141.04444,442.91120,2.83499,-0.04845,107.22252
63906,64.00674,56.23354,38.44053,32.33705,26.71352,-4356478.00000,8.13533,8.96843,12.71011,0.11738,80.87208,20.57989,32.15269,-227.52822,13.50592,-4.25742,4.42448
19078,75.59666,32.28098,49.08312,47.24760,12.03471,14639719.00000,5.17894,5.74003,6.85075,3.23862,34.42434,18.02773,23.84726,-163.71880,7.22674,-3.24225,15.39770
44790,88.47718,6.12016,858.55475,740.30293,0.00000,-383084.00000,15.81969,21.04631,16.78788,-4.06156,0.00000,1.22121,0.00000,0.00000,20.17480,-5.32896,16.51874
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19373,31.26743,219.82163,122.71612,45.71768,17.78604,1093819.00000,-5.87094,-7.13913,-18.77655,-20.04596,44.78506,172.43162,551.47360,1171.04104,-15.80830,71.66411,-396.26439
47291,26.86878,272.17917,109.90786,49.04053,55.75835,758454.00000,8.20810,10.46860,30.54885,52.00868,81.06608,133.81252,498.02232,2020.21373,43.98610,4.84788,1.81287
17249,42.50339,135.27535,145.19849,123.27816,7.94677,1263421.00000,3.52219,4.95879,8.28685,13.29397,89.49929,260.22857,612.25368,1109.28352,9.03562,-2.56150,-17.98362
33974,54.40624,83.80245,267.18902,265.18611,10.10198,-2300336.00000,18.24019,22.20544,45.40161,53.28188,69.24655,136.75441,251.35796,237.09425,120.49152,119.47452,456.45928


#### 확장기

##### RFE

In [25]:
import sklearn
from sklearn.tree import DecisionTreeClassifier
# from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import RFECV

In [52]:
# define RFE

# cross-validation을 통해 자동으로 선택할 feature 개수 선택 -> RFECV
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=121)
# rfe_cv = RFECV(estimator=DecisionTreeClassifier(), cv=cv, scoring='f1')

rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=8)

# fit RFE
rfe = rfe.fit(X_train, y_train)

feature_names = np.array(X_train.columns)
print("Features selected by SelectFromModel: "
      f"{feature_names[rfe.support_]}")

Features selected by SelectFromModel: ['유동비율' 'OCF' '총자본경상이익률' '자기자본영업이익률' '총자본회전율' '자기자본회전율' '유형자산증가율' '순이익증가율']


##### SelectionFromModel

In [45]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LogisticRegression

from sklearn.feature_selection import SelectFromModel

In [39]:
rf = RandomForestClassifier(n_estimators=1000, random_state=121)
sfm_rf = SelectFromModel(rf, threshold = 'median')
sfm_rf = sfm_rf.fit(X_train, y_train)

In [40]:
feature_names = np.array(X_train.columns)
print("Features selected by SelectFromModel: "
      f"{feature_names[sfm_rf.get_support()]}")

Features selected by SelectFromModel: ['부채비율' 'OCF' '총자산이익률' '총자본경상이익률' '자기자본순이익률' '자기자본영업이익률' '총자본회전율'
 '자기자본회전율' '순이익증가율']


In [44]:
sfm_lasso = SelectFromModel(LassoCV(), threshold='median').fit(X_train, y_train)

feature_names = np.array(X_train.columns)
print("Features selected by SelectFromModel: "
      f"{feature_names[sfm_lasso.get_support()]}")

Features selected by SelectFromModel: ['자기자본비율' '부채비율' '유동비율' '당좌비율' '차입금의존도' 'OCF' '총자산이익률' '총자본경상이익률'
 '자기자본순이익률' '자기자본영업이익률' '매출원가율' '총자본회전율' '자기자본회전율' '순운전자본회전율' '총자본증가율'
 '유형자산증가율' '순이익증가율']


In [51]:
sfm_lr = SelectFromModel(LogisticRegression(max_iter=1000), threshold='median').fit(X_train, y_train)

feature_names = np.array(X_train.columns)
print("Features selected by SelectFromModel: "
      f"{feature_names[sfm_lr.get_support()]}")

Features selected by SelectFromModel: ['자기자본비율' '차입금의존도' '총자산이익률' '총자본경상이익률' '자기자본영업이익률' '매출원가율' '총자본회전율'
 '총자본증가율' '유형자산증가율']


##### Permutation Importance

In [54]:
import eli5
from eli5.sklearn import PermutationImportance

import xgboost
from xgboost import XGBClassifier
from sklearn import metrics

In [56]:
xgb = XGBClassifier(random_state = 121, n_estimatiors = 500).fit(X_train, y_train)

perm = PermutationImportance(xgb, scoring = "f1", random_state = 42).fit(X_test, y_test)
eli5.show_weights(perm, top = 80, feature_names = X_test.columns.tolist())

Weight,Feature
0.4005  ± 0.0208,자기자본영업이익률
0.1392  ± 0.0091,총자본회전율
0.0975  ± 0.0085,총자산이익률
0.0779  ± 0.0145,총자본경상이익률
0.0496  ± 0.0166,순이익증가율
0.0379  ± 0.0072,자기자본회전율
0.0345  ± 0.0038,유동비율
0.0263  ± 0.0027,OCF
0.0223  ± 0.0093,자기자본순이익률
0.0201  ± 0.0102,당좌비율
