In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from datetime import datetime
import time
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score,confusion_matrix,ConfusionMatrixDisplay,roc_curve,roc_auc_score,precision_recall_curve
from sklearn.ensemble import RandomForestClassifier , StackingClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression,Lasso
from sklearn.preprocessing import Binarizer
from sklearn.model_selection import cross_val_score,GridSearchCV

# 한글 깨짐 방지
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'malgun Gothic'

In [2]:
train = pd.read_csv('../data/train_Winsorization_시총_상장.csv', encoding='euc-kr')
test = pd.read_csv('../data/test_시총_상장.csv', encoding='euc-kr')

In [3]:
X_train_sum = train[['부채비율', '총자본회전률', '매출액대비잉여현금흐름', 'PBR',
       '총자산대비영업현금흐름', '자기자본증가율', '총자본투자효율', '총자본순이익률',
       '매출액영업이익률']]
y_train = train[['t-1감사의견코드']]
X_test_sum = test[['부채비율', '총자본회전률', '매출액대비잉여현금흐름', 'PBR',
       '총자산대비영업현금흐름', '자기자본증가율', '총자본투자효율', '총자본순이익률',
       '매출액영업이익률']]
y_test = test[['t-1감사의견코드']]

In [4]:
X_train = train.drop('t-1감사의견코드',axis=1)
y_train = train[['t-1감사의견코드']]

X_test = test.drop('t-1감사의견코드',axis=1)
y_test = test[['t-1감사의견코드']]

In [5]:
from imblearn.under_sampling import OneSidedSelection

# One Side Selection을 사용하여 언더샘플링된 데이터 생성
oss = OneSidedSelection()
X_test_sum, y_test = oss.fit_resample(X_test_sum, y_test)
X_train_sum, y_train = oss.fit_resample(X_train_sum, y_train)

In [6]:
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train_sum)
X_test_sc = scaler.transform(X_test_sum)

In [7]:
X_train_sc=pd.DataFrame(X_train_sc, columns=X_train_sum.columns)
X_test_sc=pd.DataFrame(X_test_sc, columns=X_test_sum.columns)

In [8]:
X_test_sc=pd.concat([X_test_sc,y_test],axis=1)
X_train_sc=pd.concat([X_train_sc,y_train],axis=1)

In [15]:
# X_test_sc.to_csv('test_다시.csv',index=False,encoding='euc-kr')

In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE 객체 생성
smote = SMOTE(random_state=0)

# 오버샘플링 수행
X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train_sc, y_train)

---

In [9]:
train['기업수명주기'] = train['기업수명주기'].map({
    '도입기' : 1,
    '성장기' : 2,
    '성숙기' : 3,
    '수축기' : 4,
    '쇠퇴기' : 5
}).astype('category')

test['기업수명주기'] = test['기업수명주기'].map({
    '도입기' : 1,
    '성장기' : 2,
    '성숙기' : 3,
    '수축기' : 4,
    '쇠퇴기' : 5
}).astype('category')

In [106]:
len(train.columns)

53

In [107]:
train_int = train[['부채비율', '당좌비율', '유동비율', '이자보상배율', '차입금의존도', '자기자본구성비율', '매출액영업이익률',
      '자기자본순이익률', '총자본순이익률', '총자본회전률', '자기자본회전률', '운전자본회전률', '순운전자본회전률', '재고자산회전률',
       '당좌자산회전률', '유동자산회전률', '매출액증가율', '총자본증가율', '자기자본증가율', '순이익증가율',
       '유형자산증가율', '유동자산증가율', '재고자산증가율', '영업이익증가율', '총자본투자효율', '부가가치율',
       '노동소득분배율', '자본분배율', '이윤분배율', 'log자산총계','PER','PBR',
       'OCF이자보상배율', '부채상환계수', '장기부채상환능력', '매출액대비금융비용상환능력', '연구개발비대비매출액', '매출액대비현금흐름',
       '매출액대비잉여현금흐름', '총자산대비현금흐름', '총자산대비영업현금흐름', '총자산대비잉여현금흐름','시가총액',
       't-1감사의견코드']]
test_int = test[['부채비율', '당좌비율', '유동비율', '이자보상배율', '차입금의존도', '자기자본구성비율', '매출액영업이익률',
      '자기자본순이익률', '총자본순이익률', '총자본회전률', '자기자본회전률', '운전자본회전률', '순운전자본회전률', '재고자산회전률',
       '당좌자산회전률', '유동자산회전률', '매출액증가율', '총자본증가율', '자기자본증가율', '순이익증가율',
       '유형자산증가율', '유동자산증가율', '재고자산증가율', '영업이익증가율', '총자본투자효율', '부가가치율',
       '노동소득분배율', '자본분배율', '이윤분배율', 'log자산총계','PER','PBR',
       'OCF이자보상배율', '부채상환계수', '장기부채상환능력', '매출액대비금융비용상환능력', '연구개발비대비매출액', '매출액대비현금흐름',
       '매출액대비잉여현금흐름', '총자산대비현금흐름', '총자산대비영업현금흐름', '총자산대비잉여현금흐름','시가총액',
       't-1감사의견코드']]

# Standard Scaler

In [108]:
from sklearn.preprocessing import StandardScaler
X_train = train_int.drop('t-1감사의견코드', axis=1)
y_train = train_int[['t-1감사의견코드']]

X_test = test_int.drop('t-1감사의견코드', axis=1)
y_test = test_int[['t-1감사의견코드']]

In [109]:
train_cat = train[['기업수명주기','이보배초과여부', '파부비초과여부', '파당비초과여부', '파차의초과여부', '파로이초과여부']]
test_cat = test[['기업수명주기','이보배초과여부', '파부비초과여부', '파당비초과여부', '파차의초과여부', '파로이초과여부']]

In [110]:
scaler = StandardScaler()
train_sc = scaler.fit_transform(X_train)
test_sc = scaler.transform(X_test)

train_sc = pd.DataFrame(train_sc, columns=X_train.columns)
test_sc = pd.DataFrame(test_sc, columns=X_test.columns)

# 스케일링 + 카테고리 변수 +타겟 데이터프레임
train_sc_total = pd.concat([train_sc,train_cat ,y_train], axis=1)
test_sc_total = pd.concat([test_sc, test_cat,y_test], axis=1)

---

# Undersampling

In [111]:
from imblearn.under_sampling import OneSidedSelection
from collections import Counter

In [112]:
print(train_sc_total['t-1감사의견코드'].value_counts())
print(test_sc_total['t-1감사의견코드'].value_counts())

t-1감사의견코드
0.0    9028
1.0      90
Name: count, dtype: int64
t-1감사의견코드
0.0    2697
1.0      27
Name: count, dtype: int64


In [113]:
train_sc_total.drop('t-1감사의견코드',axis=1)

Unnamed: 0,부채비율,당좌비율,유동비율,이자보상배율,차입금의존도,자기자본구성비율,매출액영업이익률,자기자본순이익률,총자본순이익률,총자본회전률,...,총자산대비현금흐름,총자산대비영업현금흐름,총자산대비잉여현금흐름,시가총액,기업수명주기,이보배초과여부,파부비초과여부,파당비초과여부,파차의초과여부,파로이초과여부
0,-0.240036,-0.296824,-0.313183,-0.252583,-0.001066,-0.089792,0.116696,-0.352024,-0.470948,-1.257022,...,-0.325849,0.079667,0.276497,-0.534745,3,0,0,0,0,0
1,0.946404,-0.493110,-0.516744,-0.247044,1.607750,-1.247596,0.194140,0.470169,0.197419,0.369686,...,-0.145500,0.777842,0.715655,0.273857,3,0,1,0,1,0
2,0.528165,-0.477719,-0.445573,-0.251717,0.787290,-0.973020,0.066276,0.099228,-0.008026,-0.120287,...,-0.075902,0.104853,0.187317,-0.240306,2,0,1,0,0,0
3,-0.857766,0.341673,0.443287,0.714891,-1.269212,1.642525,0.284824,0.398605,0.685461,-1.237423,...,-0.308847,-0.188579,0.152751,-0.286926,3,0,0,0,0,0
4,0.639334,-0.494966,-0.541166,-0.261271,1.234381,-1.054954,-0.143747,-1.751501,-1.551100,-0.257479,...,0.075077,-0.272026,0.083865,-0.448414,2,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9113,0.635570,-0.405849,-0.400583,-0.251580,0.097033,-1.052516,0.070992,0.083722,-0.002643,-0.394671,...,0.128448,-0.342146,0.000567,3.381146,2,0,1,0,0,0
9114,1.890287,-0.483912,-0.517683,-0.253974,0.991216,-1.639709,0.038164,-0.336916,-0.262812,1.212438,...,-0.079927,0.466609,0.036053,-0.498175,2,0,1,0,1,0
9115,-0.377377,-0.071210,-0.083635,-0.247732,-0.564092,0.167227,0.084413,0.315511,0.322121,0.506878,...,-0.016368,-0.624833,0.094073,1.421620,5,0,0,0,0,0
9116,-0.345690,-0.325266,-0.381773,-0.218620,-0.413080,0.103826,0.205204,0.373955,0.288029,-0.727852,...,0.776136,0.036300,0.670684,-0.417130,4,0,0,0,0,0


In [114]:
from imblearn.under_sampling import OneSidedSelection
from imblearn.under_sampling import RandomUnderSampler

# 't-1감사의견코드' 열을 제외한 데이터와 해당 열만을 추출합니다.
train_X = train_sc_total.drop('t-1감사의견코드', axis=1)
train_y = train_sc_total['t-1감사의견코드']

# OneSidedSelection 적용
undersampler = OneSidedSelection(random_state=7)
train_X_resampled, train_y_resampled = undersampler.fit_resample(train_X, train_y)

# 원하는 샘플링 비율로 데이터셋을 다시 조정합니다.
ratio = 0.33  # 3:1 비율을 의미합니다.
custom_undersampler = RandomUnderSampler(sampling_strategy=ratio, random_state=7)
train_X_resampled, train_y_resampled = custom_undersampler.fit_resample(train_X_resampled, train_y_resampled)


In [115]:
from imblearn.under_sampling import OneSidedSelection
from imblearn.under_sampling import RandomUnderSampler

# 't-1감사의견코드' 열을 제외한 데이터와 해당 열만을 추출합니다.
test_X = test_sc_total.drop('t-1감사의견코드', axis=1)
test_y = test_sc_total['t-1감사의견코드']

# OneSidedSelection 적용
undersampler = OneSidedSelection(random_state=7)
test_X_resampled, test_y_resampled = undersampler.fit_resample(test_X, test_y)

# 원하는 샘플링 비율로 데이터셋을 다시 조정합니다.
ratio = 0.33  # 3:1 비율을 의미합니다.
custom_undersampler = RandomUnderSampler(sampling_strategy=ratio, random_state=7)
test_X_resampled, test_y_resampled = custom_undersampler.fit_resample(test_X_resampled, test_y_resampled)


In [116]:
print(train_X_resampled.shape)
print(test_X_resampled.shape)

(362, 49)
(108, 49)


---


- 데이터 프레임 정리

In [117]:
train_y_resampled=pd.DataFrame(train_y_resampled,columns=['t-1감사의견코드'])
test_y_resampled=pd.DataFrame(test_y_resampled,columns=['t-1감사의견코드'])

In [118]:
train_int_resampled = train_X_resampled.drop(['기업수명주기','이보배초과여부', '파부비초과여부', '파당비초과여부', '파차의초과여부', '파로이초과여부'],axis=1)
test_int_resampled = test_X_resampled.drop(['기업수명주기','이보배초과여부', '파부비초과여부', '파당비초과여부', '파차의초과여부', '파로이초과여부'],axis=1)
train_cat_resampled= train_X_resampled[['기업수명주기','이보배초과여부', '파부비초과여부', '파당비초과여부', '파차의초과여부', '파로이초과여부']]
test_cat_resampled= test_X_resampled[['기업수명주기','이보배초과여부', '파부비초과여부', '파당비초과여부', '파차의초과여부', '파로이초과여부']]

In [119]:
train_resampled_sum = pd.concat([train_X_resampled,train_y_resampled],axis=1)
test_resampled_sum = pd.concat([test_X_resampled,test_y_resampled],axis=1)

train_int_resampled_sum = pd.concat([train_int_resampled,train_y_resampled],axis=1)
train_cat_resampled_sum =pd.concat([train_cat_resampled,train_y_resampled],axis=1)
test_cat_resampled_sum=pd.concat([test_cat_resampled,test_y_resampled],axis=1)

---

# MDA

In [120]:
# 1-1 정규성 테스트(샤피로)
from scipy.stats import norm
from scipy import stats
from statsmodels.formula.api import ols
from scipy.stats import kstest

for j in train_int_resampled_sum.columns:
    a = stats.shapiro(train_int_resampled_sum[j])
    p = a.pvalue
    print(j,a)

부채비율 ShapiroResult(statistic=0.6517183184623718, pvalue=1.556141996526758e-26)
당좌비율 ShapiroResult(statistic=0.5080589056015015, pvalue=1.7829098977113898e-30)
유동비율 ShapiroResult(statistic=0.5417087078094482, pvalue=1.2111193904318673e-29)
이자보상배율 ShapiroResult(statistic=0.26787954568862915, pvalue=2.274722909409432e-35)
차입금의존도 ShapiroResult(statistic=0.9391801357269287, pvalue=5.1375667609043774e-11)
자기자본구성비율 ShapiroResult(statistic=0.9692990779876709, pvalue=6.370469805005996e-07)
매출액영업이익률 ShapiroResult(statistic=0.30486464500427246, pvalue=1.0392941596197025e-34)
자기자본순이익률 ShapiroResult(statistic=0.7888702750205994, pvalue=2.4699972558970697e-21)
총자본순이익률 ShapiroResult(statistic=0.8849671483039856, pvalue=7.791794142508041e-16)
총자본회전률 ShapiroResult(statistic=0.9278445243835449, pvalue=3.264059161844912e-12)
자기자본회전률 ShapiroResult(statistic=0.6983218193054199, pvalue=5.577732735445396e-25)
운전자본회전률 ShapiroResult(statistic=0.36240023374557495, pvalue=1.2645558639833304e-33)
순운전자본회전률 Shapiro

In [121]:
# 1-2 정규성 테스트(K-S)
for j in train_resampled_sum.columns:
    a = kstest(train_resampled_sum[j],'norm')
    p = a.pvalue
    print(j,a)

부채비율 KstestResult(statistic=0.2139058166216583, pvalue=5.133250437977357e-15, statistic_location=-0.12799210925166862, statistic_sign=1)
당좌비율 KstestResult(statistic=0.29515970617853776, pvalue=1.912231456084068e-28, statistic_location=-0.5303860814400919, statistic_sign=-1)
유동비율 KstestResult(statistic=0.2789875439357965, pvalue=2.083692556219988e-25, statistic_location=-0.5776507298005372, statistic_sign=-1)
이자보상배율 KstestResult(statistic=0.43692709089952475, pvalue=2.1638892948144748e-63, statistic_location=-0.1401053451561435, statistic_sign=1)
차입금의존도 KstestResult(statistic=0.09739115690713246, pvalue=0.0019315517688805787, statistic_location=-1.2965608979867782, statistic_sign=-1)
자기자본구성비율 KstestResult(statistic=0.0823816678476137, pvalue=0.01384428239611211, statistic_location=0.7305224123215357, statistic_sign=-1)
매출액영업이익률 KstestResult(statistic=0.31255160573751584, pvalue=6.41968298581131e-32, statistic_location=0.3894728161776757, statistic_sign=1)
자기자본순이익률 KstestResult(statistic

-p-value가 0 이는 데이터개수가 많아서 p-value 자체가 너무작아 계산이 불가능하다고 판단. 중심극한 정리에 의해서 정규성이 있다고 가정하고 진행

In [122]:
# 부도기업과 정상기업의 피처별 등분산비교 (정규성 가정으로 bartlett)
Bad = train_int_resampled_sum[train_int_resampled_sum['t-1감사의견코드']== 1] #Existing Customer
Good = train_int_resampled_sum[train_int_resampled_sum['t-1감사의견코드']== 0] #Attrited Customer

c = []
for i in train_int_resampled_sum:
    # lresult = stats.levene(close[i], normal[i])
    lresult = stats.bartlett(Bad[i], Good[i])
    c.append([i,lresult[-1]])

c= pd.DataFrame(c)
c.columns=["피처값",'F-test']
c

Unnamed: 0,피처값,F-test
0,부채비율,1.47407e-16
1,당좌비율,0.0003316197
2,유동비율,0.0001999758
3,이자보상배율,0.001041025
4,차입금의존도,0.03789876
5,자기자본구성비율,0.00014518
6,매출액영업이익률,1.4449e-15
7,자기자본순이익률,1.573731e-22
8,총자본순이익률,2.145101e-11
9,총자본회전률,0.004089106


In [123]:
# F-test 결과 0.05 이상이면 homo 0.05 이하이면 hetero
c["분산"] =''
c["T-test"] =""
for i in c.index:
    if c.loc[i,"F-test"]>=0.05:
        c.loc[i,"분산"] = "homo"
    else:
        c.loc[i,"분산"] = "hetero"
c

Unnamed: 0,피처값,F-test,분산,T-test
0,부채비율,1.47407e-16,hetero,
1,당좌비율,0.0003316197,hetero,
2,유동비율,0.0001999758,hetero,
3,이자보상배율,0.001041025,hetero,
4,차입금의존도,0.03789876,hetero,
5,자기자본구성비율,0.00014518,hetero,
6,매출액영업이익률,1.4449e-15,hetero,
7,자기자본순이익률,1.573731e-22,hetero,
8,총자본순이익률,2.145101e-11,hetero,
9,총자본회전률,0.004089106,hetero,


In [124]:
c[c["분산"]=='homo']

Unnamed: 0,피처값,F-test,분산,T-test
11,운전자본회전률,0.648432,homo,
14,당좌자산회전률,0.068484,homo,
15,유동자산회전률,0.159531,homo,
19,순이익증가율,0.605182,homo,
23,영업이익증가율,0.069359,homo,
29,log자산총계,0.788062,homo,
32,OCF이자보상배율,0.364705,homo,
34,장기부채상환능력,0.360449,homo,
42,시가총액,0.397503,homo,


In [125]:
# homo 인 feature 는 student t-test, hetero이면 Welchs T-Test 진행
c["분산"] =''
c["T-test"] =""
for i in c.index:
    if c.loc[i,"F-test"]>=0.05:
        c.loc[i,"분산"] = "homo"
        result = stats.ttest_ind(Bad[c.loc[i,"피처값"]], Good[c.loc[i,"피처값"]], equal_var=True)       ## equal_var = True Student T-test
        c.loc[i,"T-test"] = result[-1]
        print(Bad[c.loc[i,"피처값"]])
    else:
        c.loc[i,"분산"] = "hetero"
        result = stats.ttest_ind(Bad[c.loc[i,"피처값"]], Good[c.loc[i,"피처값"]], equal_var=False)      ## equal_var = False Welchs T-Test
        c.loc[i,"T-test"] = result[-1]
c

272   -0.329097
273   -0.329097
274   -0.329097
275   -0.329097
276    0.015291
         ...   
357   -0.184092
358   -0.329097
359   -0.329097
360   -0.329097
361   -0.329097
Name: 운전자본회전률, Length: 90, dtype: float64
272   -1.023536
273    0.501651
274   -0.614050
275    0.608473
276   -0.720872
         ...   
357   -0.910779
358   -0.127414
359   -0.608115
360   -1.266853
361   -1.444891
Name: 당좌자산회전률, Length: 90, dtype: float64
272   -1.055160
273    0.498955
274   -0.617775
275    0.052263
276   -0.627081
         ...   
357   -0.887651
358    0.219773
359   -0.729448
360   -1.343649
361   -1.604219
Name: 유동자산회전률, Length: 90, dtype: float64
272    7.326681
273   -0.069053
274   -3.826375
275   -0.069053
276   -0.069053
         ...   
357   -0.069053
358    0.196720
359   -0.069053
360   -0.490112
361   -0.069053
Name: 순이익증가율, Length: 90, dtype: float64
272   -0.102689
273   -0.102689
274   -0.102689
275   -0.102689
276   -0.102689
         ...   
357   -0.102689
358    0.046683
3

Unnamed: 0,피처값,F-test,분산,T-test
0,부채비율,1.47407e-16,hetero,1.3e-05
1,당좌비율,0.0003316197,hetero,0.130001
2,유동비율,0.0001999758,hetero,0.066452
3,이자보상배율,0.001041025,hetero,0.22865
4,차입금의존도,0.03789876,hetero,3e-06
5,자기자본구성비율,0.00014518,hetero,2e-06
6,매출액영업이익률,1.4449e-15,hetero,0.106062
7,자기자본순이익률,1.573731e-22,hetero,0.0
8,총자본순이익률,2.145101e-11,hetero,0.0
9,총자본회전률,0.004089106,hetero,0.0


In [126]:
# 0.05 이상이면 유의미하지 않으므로 0.05 이하인 것만 추출
d = c[c["T-test"]<= 0.05]
d.sort_values('T-test',ascending=False)["피처값"].unique()
# d.sort_values('T-test',ascending=False)["피처값"]

array(['당좌자산회전률', '자본분배율', '부가가치율', '이윤분배율', '유동자산회전률', '매출액대비잉여현금흐름',
       '총자산대비잉여현금흐름', '총자산대비영업현금흐름', 'PBR', '부채비율', '차입금의존도', '자기자본구성비율',
       '총자본투자효율', '총자본회전률', '총자본순이익률', '순운전자본회전률', '자기자본순이익률',
       't-1감사의견코드'], dtype=object)

In [127]:
d.sort_values('T-test',ascending=False).dropna()

Unnamed: 0,피처값,F-test,분산,T-test
14,당좌자산회전률,0.0684844,homo,0.022147
27,자본분배율,0.0005226515,hetero,0.011374
25,부가가치율,0.03513533,hetero,0.008939
28,이윤분배율,3.803464e-09,hetero,0.005566
15,유동자산회전률,0.1595314,homo,0.005153
38,매출액대비잉여현금흐름,8.63119e-12,hetero,0.004058
41,총자산대비잉여현금흐름,4.464901e-08,hetero,0.001409
40,총자산대비영업현금흐름,0.0107316,hetero,0.001262
31,PBR,6.917363e-52,hetero,0.000404
0,부채비율,1.47407e-16,hetero,1.3e-05


In [128]:
d['피처값'].values

array(['부채비율', '차입금의존도', '자기자본구성비율', '자기자본순이익률', '총자본순이익률', '총자본회전률',
       '순운전자본회전률', '당좌자산회전률', '유동자산회전률', '총자본투자효율', '부가가치율', '자본분배율',
       '이윤분배율', 'PBR', '매출액대비잉여현금흐름', '총자산대비영업현금흐름', '총자산대비잉여현금흐름',
       't-1감사의견코드'], dtype=object)

In [130]:

fea = ['부채비율', '차입금의존도', '자기자본구성비율', '자기자본순이익률', '총자본순이익률', '총자본회전률',
       '순운전자본회전률', '당좌자산회전률', '유동자산회전률', '총자본투자효율', '부가가치율', '자본분배율',
       '이윤분배율', 'PBR', '매출액대비잉여현금흐름', '총자산대비영업현금흐름', '총자산대비잉여현금흐름',
       't-1감사의견코드']
mda_feature = train_resampled_sum[fea]
mda_feature

Unnamed: 0,부채비율,차입금의존도,자기자본구성비율,자기자본순이익률,총자본순이익률,총자본회전률,순운전자본회전률,당좌자산회전률,유동자산회전률,총자본투자효율,부가가치율,자본분배율,이윤분배율,PBR,매출액대비잉여현금흐름,총자산대비영업현금흐름,총자산대비잉여현금흐름,t-1감사의견코드
0,-0.811548,-1.109282,1.439153,-0.252629,-0.615387,0.212894,0.992025,0.335483,0.061569,-0.425728,-0.273753,-0.406512,-0.328276,-0.562243,0.282864,0.025255,0.657857,0.0
1,-0.335711,-0.165158,0.084317,0.376340,0.294309,-0.041891,-0.078423,-0.204564,-0.357204,0.549347,0.269209,0.154793,0.214508,-0.244790,0.174956,0.344645,0.147227,0.0
2,3.053530,2.225474,-1.914285,-2.405120,-1.305285,-0.022292,-0.650559,-0.174891,-0.217613,-0.498355,-0.280345,-0.848800,-1.009706,-0.185881,-0.056484,-1.383475,-0.868045,0.0
3,-0.227169,-0.146133,-0.111739,-0.082863,-0.196424,-0.002693,-0.650559,-0.519097,-0.561938,-0.221299,0.019003,-0.011026,0.034488,0.766478,-0.192914,-1.012077,-1.027834,0.0
4,0.473981,1.467440,-0.930102,-1.780921,-2.532567,-0.257479,-0.502911,-0.091807,-0.515408,-0.299305,-0.081978,-0.963687,-1.494475,0.156117,-0.537762,-2.577578,-2.353262,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,-0.945563,2.056625,-2.790197,-6.128047,-5.073258,-1.021836,-1.351887,-0.910779,-0.887651,-3.449807,-2.858514,-0.209881,0.092427,9.489893,-0.263992,-1.609200,-0.762472,1.0
358,-0.522509,-0.399405,0.497889,0.944480,1.243480,-0.100688,0.530625,-0.127414,0.219773,1.576203,0.944015,0.504006,0.351944,-0.122063,0.254376,2.471555,0.442046,1.0
359,0.577272,1.302753,-1.010086,-3.471028,-3.690771,-0.041891,-0.742839,-0.608115,-0.729448,-2.249456,-0.972831,-0.209881,0.092427,1.350657,0.037378,-3.097369,-0.684808,1.0
360,-0.876848,-1.281697,1.731774,0.032037,-0.060060,-1.080632,-1.259607,-1.266853,-1.343649,-0.694042,0.057957,0.024340,0.103848,0.561934,-0.072763,-0.260023,-0.226056,1.0


In [131]:
def vif(data):
    import pandas as pd
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    # VIF 출력을 위한 데이터 프레임 형성
    vif = pd.DataFrame()

    # VIF 값과 각 Feature 이름에 대해 설정
    vif["VIF Factor"] = [variance_inflation_factor(data.values, i) for i in range(len(data.columns))]
    vif["features"] = data.columns

    # VIF 값이 높은 순으로 정렬
    vif = vif.sort_values(by="VIF Factor", ascending=False)
    vif = vif.reset_index().drop(columns='index')

    return vif

vif(mda_feature)

Unnamed: 0,VIF Factor,features
0,18.108156,유동자산회전률
1,15.690418,순운전자본회전률
2,12.195206,총자본순이익률
3,10.86764,자본분배율
4,10.7111,자기자본순이익률
5,10.37218,이윤분배율
6,9.944742,자기자본구성비율
7,9.327045,당좌자산회전률
8,5.618691,총자본회전률
9,5.176078,총자본투자효율


In [132]:
mda_feature.columns

Index(['부채비율', '차입금의존도', '자기자본구성비율', '자기자본순이익률', '총자본순이익률', '총자본회전률',
       '순운전자본회전률', '당좌자산회전률', '유동자산회전률', '총자본투자효율', '부가가치율', '자본분배율', '이윤분배율',
       'PBR', '매출액대비잉여현금흐름', '총자산대비영업현금흐름', '총자산대비잉여현금흐름', 't-1감사의견코드'],
      dtype='object')

In [137]:

fea = ['부채비율', '차입금의존도', '자기자본구성비율', '자기자본순이익률', '총자본순이익률', '총자본회전률',
       '순운전자본회전률', '당좌자산회전률', '총자본투자효율', '부가가치율', '자본분배율', '이윤분배율',
       'PBR', '매출액대비잉여현금흐름', '총자산대비영업현금흐름', '총자산대비잉여현금흐름', 't-1감사의견코드']
mda_feature2 = train_resampled_sum[fea]
mda_feature2

Unnamed: 0,부채비율,차입금의존도,자기자본구성비율,자기자본순이익률,총자본순이익률,총자본회전률,순운전자본회전률,당좌자산회전률,총자본투자효율,부가가치율,자본분배율,이윤분배율,PBR,매출액대비잉여현금흐름,총자산대비영업현금흐름,총자산대비잉여현금흐름,t-1감사의견코드
0,-0.811548,-1.109282,1.439153,-0.252629,-0.615387,0.212894,0.992025,0.335483,-0.425728,-0.273753,-0.406512,-0.328276,-0.562243,0.282864,0.025255,0.657857,0.0
1,-0.335711,-0.165158,0.084317,0.376340,0.294309,-0.041891,-0.078423,-0.204564,0.549347,0.269209,0.154793,0.214508,-0.244790,0.174956,0.344645,0.147227,0.0
2,3.053530,2.225474,-1.914285,-2.405120,-1.305285,-0.022292,-0.650559,-0.174891,-0.498355,-0.280345,-0.848800,-1.009706,-0.185881,-0.056484,-1.383475,-0.868045,0.0
3,-0.227169,-0.146133,-0.111739,-0.082863,-0.196424,-0.002693,-0.650559,-0.519097,-0.221299,0.019003,-0.011026,0.034488,0.766478,-0.192914,-1.012077,-1.027834,0.0
4,0.473981,1.467440,-0.930102,-1.780921,-2.532567,-0.257479,-0.502911,-0.091807,-0.299305,-0.081978,-0.963687,-1.494475,0.156117,-0.537762,-2.577578,-2.353262,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,-0.945563,2.056625,-2.790197,-6.128047,-5.073258,-1.021836,-1.351887,-0.910779,-3.449807,-2.858514,-0.209881,0.092427,9.489893,-0.263992,-1.609200,-0.762472,1.0
358,-0.522509,-0.399405,0.497889,0.944480,1.243480,-0.100688,0.530625,-0.127414,1.576203,0.944015,0.504006,0.351944,-0.122063,0.254376,2.471555,0.442046,1.0
359,0.577272,1.302753,-1.010086,-3.471028,-3.690771,-0.041891,-0.742839,-0.608115,-2.249456,-0.972831,-0.209881,0.092427,1.350657,0.037378,-3.097369,-0.684808,1.0
360,-0.876848,-1.281697,1.731774,0.032037,-0.060060,-1.080632,-1.259607,-1.266853,-0.694042,0.057957,0.024340,0.103848,0.561934,-0.072763,-0.260023,-0.226056,1.0


In [138]:
vif(mda_feature2)

Unnamed: 0,VIF Factor,features
0,12.078422,총자본순이익률
1,10.867071,자본분배율
2,10.691515,자기자본순이익률
3,10.37077,이윤분배율
4,10.036204,순운전자본회전률
5,9.431047,자기자본구성비율
6,5.53098,총자본회전률
7,5.299303,당좌자산회전률
8,5.090975,총자본투자효율
9,4.881894,부채비율


In [139]:
mda_feature2.columns

Index(['부채비율', '차입금의존도', '자기자본구성비율', '자기자본순이익률', '총자본순이익률', '총자본회전률',
       '순운전자본회전률', '당좌자산회전률', '총자본투자효율', '부가가치율', '자본분배율', '이윤분배율', 'PBR',
       '매출액대비잉여현금흐름', '총자산대비영업현금흐름', '총자산대비잉여현금흐름', 't-1감사의견코드'],
      dtype='object')

In [140]:
fea = ['부채비율', '차입금의존도', '자기자본구성비율', '자기자본순이익률', '총자본회전률',
       '순운전자본회전률', '당좌자산회전률', '총자본투자효율', '부가가치율', '자본분배율', '이윤분배율',
       'PBR', '매출액대비잉여현금흐름', '총자산대비영업현금흐름', '총자산대비잉여현금흐름', 't-1감사의견코드']
mda_feature2 = train_resampled_sum[fea]
mda_feature2

Unnamed: 0,부채비율,차입금의존도,자기자본구성비율,자기자본순이익률,총자본회전률,순운전자본회전률,당좌자산회전률,총자본투자효율,부가가치율,자본분배율,이윤분배율,PBR,매출액대비잉여현금흐름,총자산대비영업현금흐름,총자산대비잉여현금흐름,t-1감사의견코드
0,-0.811548,-1.109282,1.439153,-0.252629,0.212894,0.992025,0.335483,-0.425728,-0.273753,-0.406512,-0.328276,-0.562243,0.282864,0.025255,0.657857,0.0
1,-0.335711,-0.165158,0.084317,0.376340,-0.041891,-0.078423,-0.204564,0.549347,0.269209,0.154793,0.214508,-0.244790,0.174956,0.344645,0.147227,0.0
2,3.053530,2.225474,-1.914285,-2.405120,-0.022292,-0.650559,-0.174891,-0.498355,-0.280345,-0.848800,-1.009706,-0.185881,-0.056484,-1.383475,-0.868045,0.0
3,-0.227169,-0.146133,-0.111739,-0.082863,-0.002693,-0.650559,-0.519097,-0.221299,0.019003,-0.011026,0.034488,0.766478,-0.192914,-1.012077,-1.027834,0.0
4,0.473981,1.467440,-0.930102,-1.780921,-0.257479,-0.502911,-0.091807,-0.299305,-0.081978,-0.963687,-1.494475,0.156117,-0.537762,-2.577578,-2.353262,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,-0.945563,2.056625,-2.790197,-6.128047,-1.021836,-1.351887,-0.910779,-3.449807,-2.858514,-0.209881,0.092427,9.489893,-0.263992,-1.609200,-0.762472,1.0
358,-0.522509,-0.399405,0.497889,0.944480,-0.100688,0.530625,-0.127414,1.576203,0.944015,0.504006,0.351944,-0.122063,0.254376,2.471555,0.442046,1.0
359,0.577272,1.302753,-1.010086,-3.471028,-0.041891,-0.742839,-0.608115,-2.249456,-0.972831,-0.209881,0.092427,1.350657,0.037378,-3.097369,-0.684808,1.0
360,-0.876848,-1.281697,1.731774,0.032037,-1.080632,-1.259607,-1.266853,-0.694042,0.057957,0.024340,0.103848,0.561934,-0.072763,-0.260023,-0.226056,1.0


In [141]:
vif(mda_feature2)

Unnamed: 0,VIF Factor,features
0,10.736803,자본분배율
1,10.364891,이윤분배율
2,10.035683,순운전자본회전률
3,8.924078,자기자본구성비율
4,5.516351,총자본회전률
5,5.29831,당좌자산회전률
6,4.952175,자기자본순이익률
7,4.869126,총자본투자효율
8,4.365608,차입금의존도
9,4.114163,부채비율


In [142]:
mda_feature2.columns

Index(['부채비율', '차입금의존도', '자기자본구성비율', '자기자본순이익률', '총자본회전률', '순운전자본회전률',
       '당좌자산회전률', '총자본투자효율', '부가가치율', '자본분배율', '이윤분배율', 'PBR', '매출액대비잉여현금흐름',
       '총자산대비영업현금흐름', '총자산대비잉여현금흐름', 't-1감사의견코드'],
      dtype='object')

In [143]:
fea = ['부채비율', '차입금의존도', '자기자본구성비율', '자기자본순이익률', '총자본회전률', '순운전자본회전률',
       '당좌자산회전률', '총자본투자효율', '부가가치율', '이윤분배율', 'PBR', '매출액대비잉여현금흐름',
       '총자산대비영업현금흐름', '총자산대비잉여현금흐름', 't-1감사의견코드']
mda_feature3 = train_resampled_sum[fea]
mda_feature3

Unnamed: 0,부채비율,차입금의존도,자기자본구성비율,자기자본순이익률,총자본회전률,순운전자본회전률,당좌자산회전률,총자본투자효율,부가가치율,이윤분배율,PBR,매출액대비잉여현금흐름,총자산대비영업현금흐름,총자산대비잉여현금흐름,t-1감사의견코드
0,-0.811548,-1.109282,1.439153,-0.252629,0.212894,0.992025,0.335483,-0.425728,-0.273753,-0.328276,-0.562243,0.282864,0.025255,0.657857,0.0
1,-0.335711,-0.165158,0.084317,0.376340,-0.041891,-0.078423,-0.204564,0.549347,0.269209,0.214508,-0.244790,0.174956,0.344645,0.147227,0.0
2,3.053530,2.225474,-1.914285,-2.405120,-0.022292,-0.650559,-0.174891,-0.498355,-0.280345,-1.009706,-0.185881,-0.056484,-1.383475,-0.868045,0.0
3,-0.227169,-0.146133,-0.111739,-0.082863,-0.002693,-0.650559,-0.519097,-0.221299,0.019003,0.034488,0.766478,-0.192914,-1.012077,-1.027834,0.0
4,0.473981,1.467440,-0.930102,-1.780921,-0.257479,-0.502911,-0.091807,-0.299305,-0.081978,-1.494475,0.156117,-0.537762,-2.577578,-2.353262,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,-0.945563,2.056625,-2.790197,-6.128047,-1.021836,-1.351887,-0.910779,-3.449807,-2.858514,0.092427,9.489893,-0.263992,-1.609200,-0.762472,1.0
358,-0.522509,-0.399405,0.497889,0.944480,-0.100688,0.530625,-0.127414,1.576203,0.944015,0.351944,-0.122063,0.254376,2.471555,0.442046,1.0
359,0.577272,1.302753,-1.010086,-3.471028,-0.041891,-0.742839,-0.608115,-2.249456,-0.972831,0.092427,1.350657,0.037378,-3.097369,-0.684808,1.0
360,-0.876848,-1.281697,1.731774,0.032037,-1.080632,-1.259607,-1.266853,-0.694042,0.057957,0.103848,0.561934,-0.072763,-0.260023,-0.226056,1.0


In [144]:
vif(mda_feature3)

Unnamed: 0,VIF Factor,features
0,10.035646,순운전자본회전률
1,8.922591,자기자본구성비율
2,5.491328,총자본회전률
3,5.28562,당좌자산회전률
4,4.867125,총자본투자효율
5,4.86095,자기자본순이익률
6,4.360986,차입금의존도
7,4.043554,부채비율
8,3.321708,총자산대비잉여현금흐름
9,3.288821,부가가치율


In [145]:
mda_feature3.columns

Index(['부채비율', '차입금의존도', '자기자본구성비율', '자기자본순이익률', '총자본회전률', '순운전자본회전률',
       '당좌자산회전률', '총자본투자효율', '부가가치율', '이윤분배율', 'PBR', '매출액대비잉여현금흐름',
       '총자산대비영업현금흐름', '총자산대비잉여현금흐름', 't-1감사의견코드'],
      dtype='object')

In [146]:
fea = ['부채비율', '차입금의존도', '자기자본구성비율', '자기자본순이익률', '총자본회전률',
       '당좌자산회전률', '총자본투자효율', '부가가치율', '이윤분배율', 'PBR', '매출액대비잉여현금흐름',
       '총자산대비영업현금흐름', '총자산대비잉여현금흐름', 't-1감사의견코드']
mda_feature3 = train_resampled_sum[fea]
mda_feature3

Unnamed: 0,부채비율,차입금의존도,자기자본구성비율,자기자본순이익률,총자본회전률,당좌자산회전률,총자본투자효율,부가가치율,이윤분배율,PBR,매출액대비잉여현금흐름,총자산대비영업현금흐름,총자산대비잉여현금흐름,t-1감사의견코드
0,-0.811548,-1.109282,1.439153,-0.252629,0.212894,0.335483,-0.425728,-0.273753,-0.328276,-0.562243,0.282864,0.025255,0.657857,0.0
1,-0.335711,-0.165158,0.084317,0.376340,-0.041891,-0.204564,0.549347,0.269209,0.214508,-0.244790,0.174956,0.344645,0.147227,0.0
2,3.053530,2.225474,-1.914285,-2.405120,-0.022292,-0.174891,-0.498355,-0.280345,-1.009706,-0.185881,-0.056484,-1.383475,-0.868045,0.0
3,-0.227169,-0.146133,-0.111739,-0.082863,-0.002693,-0.519097,-0.221299,0.019003,0.034488,0.766478,-0.192914,-1.012077,-1.027834,0.0
4,0.473981,1.467440,-0.930102,-1.780921,-0.257479,-0.091807,-0.299305,-0.081978,-1.494475,0.156117,-0.537762,-2.577578,-2.353262,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,-0.945563,2.056625,-2.790197,-6.128047,-1.021836,-0.910779,-3.449807,-2.858514,0.092427,9.489893,-0.263992,-1.609200,-0.762472,1.0
358,-0.522509,-0.399405,0.497889,0.944480,-0.100688,-0.127414,1.576203,0.944015,0.351944,-0.122063,0.254376,2.471555,0.442046,1.0
359,0.577272,1.302753,-1.010086,-3.471028,-0.041891,-0.608115,-2.249456,-0.972831,0.092427,1.350657,0.037378,-3.097369,-0.684808,1.0
360,-0.876848,-1.281697,1.731774,0.032037,-1.080632,-1.266853,-0.694042,0.057957,0.103848,0.561934,-0.072763,-0.260023,-0.226056,1.0


In [147]:
vif(mda_feature3)

Unnamed: 0,VIF Factor,features
0,8.318656,자기자본구성비율
1,4.843846,자기자본순이익률
2,4.738872,총자본투자효율
3,4.358232,차입금의존도
4,4.043543,부채비율
5,3.467816,총자본회전률
6,3.278199,총자산대비잉여현금흐름
7,3.254523,부가가치율
8,2.84536,매출액대비잉여현금흐름
9,2.60674,당좌자산회전률


In [148]:
mda_feature3.columns

Index(['부채비율', '차입금의존도', '자기자본구성비율', '자기자본순이익률', '총자본회전률', '당좌자산회전률',
       '총자본투자효율', '부가가치율', '이윤분배율', 'PBR', '매출액대비잉여현금흐름', '총자산대비영업현금흐름',
       '총자산대비잉여현금흐름', 't-1감사의견코드'],
      dtype='object')

In [149]:
d

Unnamed: 0,피처값,F-test,분산,T-test
0,부채비율,1.47407e-16,hetero,1.3e-05
4,차입금의존도,0.03789876,hetero,3e-06
5,자기자본구성비율,0.00014518,hetero,2e-06
7,자기자본순이익률,1.573731e-22,hetero,0.0
8,총자본순이익률,2.145101e-11,hetero,0.0
9,총자본회전률,0.004089106,hetero,0.0
12,순운전자본회전률,0.01367496,hetero,0.0
14,당좌자산회전률,0.0684844,homo,0.022147
15,유동자산회전률,0.1595314,homo,0.005153
24,총자본투자효율,0.001633222,hetero,2e-06


In [150]:
fea_hetero= ['부채비율', '차입금의존도', '자기자본구성비율', '자기자본순이익률', '총자본회전률',
       '총자본투자효율', '부가가치율', '이윤분배율', 'PBR', '매출액대비잉여현금흐름', '총자산대비영업현금흐름',
       '총자산대비잉여현금흐름']
fea_homo=['당좌자산회전률']

In [151]:
# Welch's t-test 수행
t_stat, p_value = stats.ttest_ind(Bad[fea_hetero], Good[fea_hetero], equal_var=False)
result_df_hetero = pd.DataFrame({'t-statistic': t_stat, 'p-value': p_value}, index=fea_hetero)

# t-test 수행
t_stat, p_value = stats.ttest_ind(Bad[fea_homo], Good[fea_homo], equal_var=True)
result_df_homo = pd.DataFrame({'t-statistic': t_stat, 'p-value': p_value}, index=fea_homo)

result_df = pd.concat([result_df_hetero,result_df_homo],axis=0)


result_df = result_df.sort_values('p-value', ascending=True).reset_index()

In [152]:
result_df = result_df[['index', 'p-value']]
result_df.columns = ['Variable', 'p-value']

---

# Chi 2

* 카이제곱 검정 조건
    * 종속변인은 범주형 자료여야 한다.
    * 기대빈도가 5이하인 셀이 전체의 20%가 넘지 않아야 한다.
    * 각 칸의 빈도는 다른 칸의 빈도와 독립적이어야 한다.

In [153]:
from scipy.stats import chi2_contingency

# 기대빈도가 5 이하인 항목의 비율을 저장할 딕셔너리
expected_freq_5_ratio = {}

# 각 독립 변수에 대해 기대빈도 계산 및 비율 확인
for column in train_cat_resampled_sum.columns[:-1]:
    # 교차 테이블 생성
    contingency_table = pd.crosstab(train_cat_resampled_sum[column], train_cat_resampled_sum['t-1감사의견코드'])

    # 카이제곱 검정 수행
    chi2, p_value, dof, expected_freq = chi2_contingency(contingency_table)

    # 기대빈도가 5 이하인 항목의 비율 계산
    expected_freq_5 = (expected_freq <= 5).mean()

    # 결과 저장
    expected_freq_5_ratio[column] = expected_freq_5

# 결과 출력
for column, ratio in expected_freq_5_ratio.items():
    print(f"변수 '{column}'의 기대빈도가 5 이하인 항목 비율: {ratio}")

변수 '기업수명주기'의 기대빈도가 5 이하인 항목 비율: 0.0
변수 '이보배초과여부'의 기대빈도가 5 이하인 항목 비율: 0.25
변수 '파부비초과여부'의 기대빈도가 5 이하인 항목 비율: 0.0
변수 '파당비초과여부'의 기대빈도가 5 이하인 항목 비율: 0.25
변수 '파차의초과여부'의 기대빈도가 5 이하인 항목 비율: 0.0
변수 '파로이초과여부'의 기대빈도가 5 이하인 항목 비율: 0.0


In [154]:
# 독립변수와 종속변수 간의 카이제곱 검정 수행
chi2_scores = []

for column in train_cat_resampled_sum.columns:
    # 교차 테이블 생성
    contingency_table = pd.crosstab(train_cat_resampled_sum[column], test_cat_resampled_sum['t-1감사의견코드'])
    chi2, p_value, dof, expected_freq = chi2_contingency(contingency_table)
    # 카이제곱 통계량(chi2), p-value(p_value), 자유도(dof), 예상빈도(expected_freq)
    print(p_value)
    chi2_scores.append((column, chi2))

# 카이제곱 검정 결과를 기준으로 변수 정렬
sorted_features = sorted(chi2_scores, key=lambda x : x[1], reverse=True)

# 선택된 변수 출력
sorted_features

0.4003076803959865
1.0
1.0
1.0
0.3908282308519907
0.4226780741706354
1.0


[('기업수명주기', 4.042328042328042),
 ('파차의초과여부', 0.7363636363636363),
 ('파로이초과여부', 0.6428571428571428),
 ('이보배초과여부', 0.0),
 ('파부비초과여부', 0.0),
 ('파당비초과여부', 0.0),
 ('t-1감사의견코드', 0.0)]

In [155]:
chi2_scores

[('기업수명주기', 4.042328042328042),
 ('이보배초과여부', 0.0),
 ('파부비초과여부', 0.0),
 ('파당비초과여부', 0.0),
 ('파차의초과여부', 0.7363636363636363),
 ('파로이초과여부', 0.6428571428571428),
 ('t-1감사의견코드', 0.0)]

In [156]:
import pandas as pd
from scipy.stats import chi2_contingency

# 독립변수와 종속변수 간의 카이제곱 검정 수행
chi2_scores = []

p_values = []
for column in test_cat_resampled_sum.columns:
    # 교차 테이블 생성
    contingency_table = pd.crosstab(test_cat_resampled_sum[column], test_cat_resampled_sum['t-1감사의견코드'])
    chi2, p_value, dof, expected_freq = chi2_contingency(contingency_table)
    p_values.append(p_value)
    chi2_scores.append((column, chi2))

# p-value가 0.05보다 작은 값을 출력하는 데이터프레임 생성
result_df_1 = pd.DataFrame({'Variable': test_cat_resampled_sum.columns, 'p-value': p_values})
filtered_df_chi = result_df_1[result_df_1['p-value'] < 0.05]

filtered_df_chi


Unnamed: 0,Variable,p-value
5,파로이초과여부,0.0004695851
6,t-1감사의견코드,3.835975e-24


In [157]:
result = pd.concat([result_df, filtered_df_chi], axis=0)

In [158]:
result.sort_values('p-value', ascending=True).reset_index(drop=True)

Unnamed: 0,Variable,p-value
0,t-1감사의견코드,3.835975e-24
1,자기자본순이익률,1.412922e-08
2,총자본회전률,3.222872e-07
3,총자본투자효율,1.750901e-06
4,자기자본구성비율,2.024238e-06
5,차입금의존도,2.560761e-06
6,부채비율,1.261084e-05
7,PBR,0.0004037682
8,파로이초과여부,0.0004695851
9,총자산대비영업현금흐름,0.001261726


---
# feature개수 정하기 위한 Logit

In [159]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
import statsmodels.api as sm
import numpy as np
lr_clf = LogisticRegression()

feature = train_resampled_sum.drop(['t-1감사의견코드'],axis=1)
target = train_resampled_sum[['t-1감사의견코드']]

logit = SelectFromModel(LogisticRegression())
logit.fit(feature, target)
logit_support = logit.get_support()
lr_feature = feature.loc[:,logit_support].columns.tolist()

In [160]:
lr_feature

['이자보상배율',
 '매출액영업이익률',
 '자기자본순이익률',
 '총자본순이익률',
 '총자본회전률',
 '재고자산회전률',
 '총자본증가율',
 '유동자산증가율',
 '부가가치율',
 '노동소득분배율',
 '이윤분배율',
 'OCF이자보상배율',
 '매출액대비금융비용상환능력',
 '연구개발비대비매출액',
 '매출액대비잉여현금흐름',
 '총자산대비잉여현금흐름',
 '이보배초과여부',
 '파로이초과여부']

In [161]:
len(lr_feature)

18

---

### Embedded Method

> Randomforeset

In [162]:
from sklearn.ensemble import RandomForestClassifier

In [168]:
selector = SelectFromModel(estimator=RandomForestClassifier(), threshold=0.0223).fit(feature, target)
rf = selector.get_support()
count = np.count_nonzero(rf)
count

18

In [169]:
rf_features = feature.loc[:, rf].columns.tolist()
rf_features

['부채비율',
 '당좌비율',
 '유동비율',
 '이자보상배율',
 '차입금의존도',
 '자기자본구성비율',
 '매출액영업이익률',
 '자기자본순이익률',
 '총자본순이익률',
 '총자본회전률',
 '순운전자본회전률',
 '유동자산회전률',
 '자기자본증가율',
 '유형자산증가율',
 '총자본투자효율',
 'PBR',
 '매출액대비잉여현금흐름',
 '시가총액']

> LASSO

In [176]:
lasso = SelectFromModel(estimator=LogisticRegression(penalty='l1', solver='liblinear', C=0.18)).fit(feature, target)
lasso_support = lasso.get_support()
lasso_feature = feature.loc[:,lasso_support].columns.tolist()

In [177]:
len(lasso_feature)

18

# Wrapper Method

In [178]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector

In [179]:
selector = SequentialFeatureSelector(estimator=LogisticRegression(), n_features_to_select=18, direction='backward', scoring='f1', cv=5, n_jobs=-1)

# 변수 선택 수행
selector.fit(feature, target)

# 선택된 변수의 인덱스
selected_features = selector.get_support(indices=True)

# 선택된 변수 출력
for i in selected_features:
    print(feature.columns[i])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

부채비율
매출액영업이익률
자기자본순이익률
총자본순이익률
총자본회전률
자기자본증가율
유동자산증가율
총자본투자효율
부가가치율
PER
PBR
연구개발비대비매출액
매출액대비잉여현금흐름
총자산대비현금흐름
총자산대비영업현금흐름
총자산대비잉여현금흐름
이보배초과여부
파로이초과여부


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [180]:
selected_columns = feature.columns[selected_features]
wrapper_features = list(selected_columns)

---

# 종합

In [181]:
result = result.sort_values('p-value', ascending=True).reset_index(drop=True)
result = result.head(18)
filter = result[['Variable']]

In [182]:
rf_features = pd.DataFrame(rf_features)
lasso_features = pd.DataFrame(lasso_feature)
wrapper_features = pd.DataFrame(wrapper_features)

In [183]:
total = pd.concat([filter, rf_features, lasso_features, wrapper_features], axis=1)

In [184]:
total.columns = ['t&chi', 'rf', 'lasso', 'wrapper']
total

Unnamed: 0,t&chi,rf,lasso,wrapper
0,t-1감사의견코드,부채비율,부채비율,부채비율
1,자기자본순이익률,당좌비율,매출액영업이익률,매출액영업이익률
2,총자본회전률,유동비율,자기자본순이익률,자기자본순이익률
3,총자본투자효율,이자보상배율,총자본순이익률,총자본순이익률
4,자기자본구성비율,차입금의존도,총자본회전률,총자본회전률
5,차입금의존도,자기자본구성비율,총자본증가율,자기자본증가율
6,부채비율,매출액영업이익률,자기자본증가율,유동자산증가율
7,PBR,자기자본순이익률,순이익증가율,총자본투자효율
8,파로이초과여부,총자본순이익률,유동자산증가율,부가가치율
9,총자산대비영업현금흐름,총자본회전률,재고자산증가율,PER


In [185]:
filter = total['t&chi'].tolist()
rf = total['rf'].tolist()
lasso = total['lasso'].tolist()
wrapper = total['wrapper'].tolist()

In [186]:
feature.columns

Index(['부채비율', '당좌비율', '유동비율', '이자보상배율', '차입금의존도', '자기자본구성비율', '매출액영업이익률',
       '자기자본순이익률', '총자본순이익률', '총자본회전률', '자기자본회전률', '운전자본회전률', '순운전자본회전률',
       '재고자산회전률', '당좌자산회전률', '유동자산회전률', '매출액증가율', '총자본증가율', '자기자본증가율',
       '순이익증가율', '유형자산증가율', '유동자산증가율', '재고자산증가율', '영업이익증가율', '총자본투자효율',
       '부가가치율', '노동소득분배율', '자본분배율', '이윤분배율', 'log자산총계', 'PER', 'PBR',
       'OCF이자보상배율', '부채상환계수', '장기부채상환능력', '매출액대비금융비용상환능력', '연구개발비대비매출액',
       '매출액대비현금흐름', '매출액대비잉여현금흐름', '총자산대비현금흐름', '총자산대비영업현금흐름', '총자산대비잉여현금흐름',
       '시가총액', '기업수명주기', '이보배초과여부', '파부비초과여부', '파당비초과여부', '파차의초과여부',
       '파로이초과여부'],
      dtype='object')

In [187]:
total_result = pd.DataFrame(index=feature.columns)

# 각 컬럼의 값에 인덱스 포함 여부에 따라 True 또는 False 입력
total_result['t&chi'] = total_result.index.isin(filter)
total_result['wrapper'] = total_result.index.isin(wrapper)
total_result['rf'] = total_result.index.isin(rf)
total_result['lasso'] = total_result.index.isin(lasso)
total_result["true_sum"] = total_result.sum(axis=1)

total_result.sort_values('true_sum', ascending=False, inplace=True)
total_result

Unnamed: 0,t&chi,wrapper,rf,lasso,true_sum
부채비율,True,True,True,True,4
총자본회전률,True,True,True,True,4
매출액대비잉여현금흐름,True,True,True,True,4
PBR,True,True,True,True,4
자기자본순이익률,True,True,True,True,4
총자산대비영업현금흐름,True,True,False,True,3
부가가치율,True,True,False,True,3
자기자본증가율,False,True,True,True,3
총자본투자효율,True,True,True,False,3
총자본순이익률,False,True,True,True,3


In [188]:
total_result_2 = total_result[total_result['true_sum']>=3]
total_result_2.reset_index()

Unnamed: 0,index,t&chi,wrapper,rf,lasso,true_sum
0,부채비율,True,True,True,True,4
1,총자본회전률,True,True,True,True,4
2,매출액대비잉여현금흐름,True,True,True,True,4
3,PBR,True,True,True,True,4
4,자기자본순이익률,True,True,True,True,4
5,총자산대비영업현금흐름,True,True,False,True,3
6,부가가치율,True,True,False,True,3
7,자기자본증가율,False,True,True,True,3
8,총자본투자효율,True,True,True,False,3
9,총자본순이익률,False,True,True,True,3


In [189]:
total_result_2.index

Index(['부채비율', '총자본회전률', '매출액대비잉여현금흐름', 'PBR', '자기자본순이익률', '총자산대비영업현금흐름',
       '부가가치율', '자기자본증가율', '총자본투자효율', '총자본순이익률', '매출액영업이익률'],
      dtype='object')

In [191]:
# train_resampled_sum.to_csv('Undersampling_0.33_train.csv',index=False,encoding='euc-kr')
# test_resampled_sum.to_csv('Undersampling_0.33_test.csv',index=False,encoding='euc-kr')