In [1]:
import pandas as pd
import numpy as np
# Render floats with five decimal places in pandas output.
pd.options.display.float_format = '{:.5f}'.format

import matplotlib.pyplot as plt
from matplotlib import rc
%matplotlib inline
import seaborn as sns
# Select the AppleGothic font family for all plot text.
# NOTE(review): presumably chosen so Korean (Hangul) labels render; AppleGothic is
# likely only installed on macOS — confirm before running on another platform.
rc('font', family='AppleGothic')
# Draw minus signs as plain hyphens rather than the Unicode minus character.
plt.rcParams['axes.unicode_minus'] = False

#### Dataset

In [2]:
# Load the pre-built datasets from ./dataset (paths are relative to the notebook's working directory).
# File-name suffixes: 확장기 = expansion period, 수축기 = contraction period,
# 전환기 = transition period, 전체 = all periods combined.
df_good = pd.read_csv('./dataset/data_확장기.csv')
df_bad = pd.read_csv('./dataset/data_수축기.csv')
# NOTE(review): df_change is not referenced by any cell visible in this notebook section.
df_change = pd.read_csv('./dataset/data_전환기.csv')
df_all = pd.read_csv('./dataset/data_전체.csv')

#### 함수

In [3]:
def data(df, test_size=0.2, random_state=121):
    """Split a labelled frame into stratified train/test features and labels.

    The identifier columns '거래소코드' (exchange code), '회사명' (company name)
    and '회계년도' (fiscal year) plus the label column 'target' are removed to
    build the feature matrix; every remaining column is used as a feature.
    The input frame itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the three identifier columns and 'target'.
    test_size : float or int, default 0.2
        Forwarded unchanged to ``train_test_split``.
    random_state : int, default 121
        Seed forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``, stratified on 'target'.

    Raises
    ------
    KeyError
        If any of the identifier columns or 'target' is missing from ``df``.
    """
    # Imported locally because the notebook's import cell does not import scikit-learn.
    from sklearn.model_selection import train_test_split

    features = df.drop(columns=['거래소코드', '회사명', '회계년도', 'target'])
    target = df['target']

    # Stratify on the label so both splits keep the original class ratio.
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
        stratify=target,
    )

    return X_train, X_test, y_train, y_test

In [4]:
def rfe(df, n):
    """Print the ``n`` features kept by recursive feature elimination (RFE).

    The frame is split with ``data(df)`` and RFE, driven by a decision-tree
    classifier, is fitted on the training portion only.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame accepted by ``data``.
    n : int
        Passed to RFE as ``n_features_to_select``.

    Returns
    -------
    None
        The names of the selected feature columns are printed.
    """
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.feature_selection import RFE

    # Only the training split is needed for feature selection.
    X_train, _, y_train, _ = data(df)

    # Seed the tree: an unseeded DecisionTreeClassifier can rank features
    # differently between runs, which makes the selection non-reproducible.
    selector = RFE(
        estimator=DecisionTreeClassifier(random_state=121),
        n_features_to_select=n,
    ).fit(X_train, y_train)

    feature_names = np.array(X_train.columns)
    # support_ is a boolean mask over the training columns.
    print("Features selected by RFE: "
        f"{feature_names[selector.support_]}")

In [5]:
def rfecv(df):
    """Print the features kept by cross-validated recursive feature elimination.

    The frame is split with ``data(df)``; RFECV, driven by a decision-tree
    classifier and scored with F1, is fitted on the training portion only.
    The number of features to keep is chosen by RFECV through cross-validation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame accepted by ``data``.

    Returns
    -------
    None
        The names of the selected feature columns are printed.
    """
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import RepeatedStratifiedKFold
    from sklearn.feature_selection import RFECV

    # Only the training split is needed for feature selection.
    X_train, _, y_train, _ = data(df)

    # Let cross-validation pick the number of features automatically (RFECV):
    # 10 stratified folds repeated 3 times.
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=121)

    # Seed the tree: an unseeded DecisionTreeClassifier can rank features
    # differently between runs, which makes the selection non-reproducible.
    selector = RFECV(
        estimator=DecisionTreeClassifier(random_state=121),
        cv=cv,
        scoring='f1',
    ).fit(X_train, y_train)

    feature_names = np.array(X_train.columns)
    # support_ is a boolean mask over the training columns.
    print("Features selected by RFECV: "
        f"{feature_names[selector.support_]}")

In [6]:
def sfm(df):
    """Print the features SelectFromModel keeps for three different estimators.

    The frame is split with ``data(df)`` and each selector is fitted on the
    training portion with ``threshold='median'``. The estimators are, in
    order: a 1000-tree random forest (seeded with 121), LassoCV, and logistic
    regression (``max_iter=1000``). All three selectors are fitted before
    anything is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame accepted by ``data``.

    Returns
    -------
    None
        One line of selected feature names is printed per estimator.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import SelectFromModel
    from sklearn.linear_model import LassoCV, LogisticRegression

    X_train, _, y_train, _ = data(df)
    columns = np.array(X_train.columns)

    # (output prefix, estimator) pairs in the order they are fitted and reported.
    # NOTE(review): LassoCV is a regression estimator fitted here on the class
    # label, and the features are not scaled in this function — confirm intended.
    plan = (
        ("Features selected by SelectFromModel_Random Forest: ",
         RandomForestClassifier(n_estimators=1000, random_state=121)),
        ("Features selected by SelectFromModel_Lasso: ",
         LassoCV()),
        ("Features selected by SelectFromModel_LogisticRegression: ",
         LogisticRegression(max_iter=1000)),
    )

    # Fit every selector first so no output appears unless all fits succeed.
    fitted = [
        (prefix, SelectFromModel(estimator, threshold='median').fit(X_train, y_train))
        for prefix, estimator in plan
    ]

    for position, (prefix, selector) in enumerate(fitted):
        if position:
            # Blank line between reports, not before the first one.
            pass
        print(f"{prefix}{columns[selector.get_support()]}")

In [7]:
def pi(df):
    """Show permutation importances of an XGBoost classifier.

    The frame is split with ``data(df)``. An ``XGBClassifier`` (seed 121) is
    fitted on the training split, then eli5's ``PermutationImportance``
    (F1 scoring, seed 42) is fitted on the held-out test split.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame accepted by ``data``.

    Returns
    -------
    object
        Whatever ``eli5.show_weights`` returns for the fitted permutation
        importance, limited to the top 80 features and labelled with the
        test-split column names. Leave the call as the last expression of a
        cell so the notebook displays it.
    """
    import eli5
    from eli5.sklearn import PermutationImportance
    from xgboost import XGBClassifier

    X_train, X_test, y_train, y_test = data(df)

    model = XGBClassifier(random_state=121).fit(X_train, y_train)

    # Importances are measured on the test split, not on the data the model was fitted on.
    perm = PermutationImportance(model, scoring="f1", random_state=42).fit(X_test, y_test)
    return eli5.show_weights(perm, top=80, feature_names=X_test.columns.tolist())

#### 확장기

In [8]:
# Class balance of the expansion-period data: row count for target == 0, then target == 1.
print(*(len(df_good[df_good['target'] == label]) for label in (0, 1)), sep='\n')

67502
7972


In [9]:
# Expansion period: recursive feature elimination keeping 8 features.
rfe(df_good, 8)

Features selected by SelectFromModel: ['유동비율' 'OCF' '총자본경상이익률' '자기자본영업이익률' '총자본회전율' '자기자본회전율' '유형자산증가율' '순이익증가율']


In [10]:
# Expansion period: cross-validated recursive feature elimination (feature count chosen by CV).
rfecv(df_good)

Features selected by SelectFromModel: ['자기자본비율' '부채비율' '유동비율' '당좌비율' '차입금의존도' 'OCF' '총자산이익률' '총자본경상이익률'
 '자기자본순이익률' '자기자본영업이익률' '매출원가율' '총자본회전율' '자기자본회전율' '순운전자본회전율' '총자본증가율'
 '유형자산증가율' '순이익증가율']


In [11]:
# Expansion period: SelectFromModel with random forest, LassoCV and logistic regression.
sfm(df_good)

Features selected by SelectFromModel_Random Forest: ['부채비율' 'OCF' '총자산이익률' '총자본경상이익률' '자기자본순이익률' '자기자본영업이익률' '총자본회전율'
 '자기자본회전율' '순이익증가율']
Features selected by SelectFromModel_Lasso: ['자기자본비율' '부채비율' '유동비율' '당좌비율' '차입금의존도' 'OCF' '총자산이익률' '총자본경상이익률'
 '자기자본순이익률' '자기자본영업이익률' '매출원가율' '총자본회전율' '자기자본회전율' '순운전자본회전율' '총자본증가율'
 '유형자산증가율' '순이익증가율']
Features selected by SelectFromModel_LogisticRegression: ['자기자본비율' '차입금의존도' '총자산이익률' '총자본경상이익률' '자기자본영업이익률' '매출원가율' '총자본회전율'
 '총자본증가율' '유형자산증가율']


In [12]:
# Expansion period: permutation importance of an XGBoost classifier on the test split.
pi(df_good)

Weight,Feature
0.4005  ± 0.0208,자기자본영업이익률
0.1392  ± 0.0091,총자본회전율
0.0975  ± 0.0085,총자산이익률
0.0779  ± 0.0145,총자본경상이익률
0.0496  ± 0.0166,순이익증가율
0.0379  ± 0.0072,자기자본회전율
0.0345  ± 0.0038,유동비율
0.0263  ± 0.0027,OCF
0.0223  ± 0.0093,자기자본순이익률
0.0201  ± 0.0102,당좌비율


#### 수축기

In [13]:
# Class balance of the contraction-period data: row count for target == 0, then target == 1.
print(*(len(df_bad[df_bad['target'] == label]) for label in (0, 1)), sep='\n')

32000
2974


In [14]:
# Contraction period: recursive feature elimination keeping 7 features.
rfe(df_bad, 7)

Features selected by SelectFromModel: ['유동비율' 'OCF' '총자본경상이익률' '자기자본영업이익률' '총자본회전율' '자기자본회전율' '순이익증가율']


In [15]:
# Contraction period: cross-validated recursive feature elimination (feature count chosen by CV).
rfecv(df_bad)

Features selected by SelectFromModel: ['자기자본비율' '유동비율' '당좌비율' '차입금의존도' 'OCF' '총자본경상이익률' '자기자본순이익률' '자기자본영업이익률'
 '매출원가율' '총자본회전율' '자기자본회전율' '순운전자본회전율' '총자본증가율' '유형자산증가율' '순이익증가율']


In [16]:
# Contraction period: SelectFromModel with random forest, LassoCV and logistic regression.
sfm(df_bad)

Features selected by SelectFromModel_Random Forest: ['부채비율' 'OCF' '총자산이익률' '총자본경상이익률' '자기자본순이익률' '자기자본영업이익률' '총자본회전율'
 '자기자본회전율' '순이익증가율']
Features selected by SelectFromModel_Lasso: ['자기자본비율' '부채비율' '유동비율' '당좌비율' '차입금의존도' 'OCF' '총자산이익률' '총자본경상이익률'
 '자기자본순이익률' '자기자본영업이익률' '매출원가율' '총자본회전율' '자기자본회전율' '순운전자본회전율' '총자본증가율'
 '유형자산증가율' '순이익증가율']
Features selected by SelectFromModel_LogisticRegression: ['자기자본비율' '유동비율' '차입금의존도' '자기자본영업이익률' '매출원가율' '총자본회전율' '자기자본회전율' '총자본증가율'
 '유형자산증가율']


In [17]:
# Contraction period: permutation importance of an XGBoost classifier on the test split.
pi(df_bad)

Weight,Feature
0.3140  ± 0.0340,자기자본영업이익률
0.1855  ± 0.0201,총자본회전율
0.0886  ± 0.0226,총자본경상이익률
0.0613  ± 0.0180,총자산이익률
0.0401  ± 0.0083,순이익증가율
0.0379  ± 0.0160,자기자본회전율
0.0374  ± 0.0114,유동비율
0.0257  ± 0.0088,OCF
0.0181  ± 0.0175,자기자본순이익률
0.0165  ± 0.0125,부채비율


#### ALL

In [18]:
# Class balance of the full dataset: row count for target == 0, then target == 1.
print(*(len(df_all[df_all['target'] == label]) for label in (0, 1)), sep='\n')

156585
17512


In [20]:
# Full dataset: recursive feature elimination keeping 8 features.
rfe(df_all, 8)

Features selected by SelectFromModel: ['유동비율' 'OCF' '총자본경상이익률' '자기자본순이익률' '자기자본영업이익률' '총자본회전율' '자기자본회전율'
 '순이익증가율']


In [21]:
# Full dataset: cross-validated recursive feature elimination (feature count chosen by CV).
rfecv(df_all)

Features selected by SelectFromModel: ['부채비율' '유동비율' '당좌비율' '차입금의존도' 'OCF' '총자본경상이익률' '자기자본순이익률' '자기자본영업이익률'
 '매출원가율' '총자본회전율' '자기자본회전율' '순운전자본회전율' '유형자산증가율' '순이익증가율']


In [22]:
# Full dataset: SelectFromModel with random forest, LassoCV and logistic regression.
sfm(df_all)

Features selected by SelectFromModel_Random Forest: ['부채비율' 'OCF' '총자산이익률' '총자본경상이익률' '자기자본순이익률' '자기자본영업이익률' '총자본회전율'
 '자기자본회전율' '순이익증가율']
Features selected by SelectFromModel_Lasso: ['자기자본비율' '부채비율' '유동비율' '당좌비율' '차입금의존도' 'OCF' '총자산이익률' '총자본경상이익률'
 '자기자본순이익률' '자기자본영업이익률' '매출원가율' '총자본회전율' '자기자본회전율' '순운전자본회전율' '총자본증가율'
 '유형자산증가율' '순이익증가율']
Features selected by SelectFromModel_LogisticRegression: ['자기자본비율' '차입금의존도' '총자산이익률' '총자본경상이익률' '자기자본영업이익률' '매출원가율' '총자본회전율'
 '총자본증가율' '유형자산증가율']


In [23]:
# Full dataset: permutation importance of an XGBoost classifier on the test split.
pi(df_all)

Weight,Feature
0.3613  ± 0.0092,자기자본영업이익률
0.2028  ± 0.0054,총자본회전율
0.1073  ± 0.0122,총자산이익률
0.0642  ± 0.0066,자기자본회전율
0.0598  ± 0.0065,총자본경상이익률
0.0548  ± 0.0044,순이익증가율
0.0343  ± 0.0086,유동비율
0.0205  ± 0.0091,자기자본순이익률
0.0196  ± 0.0060,OCF
0.0148  ± 0.0072,부채비율
