In [2]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import numpy as np

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
train = pd.read_csv('/content/drive/MyDrive/train.csv', encoding='euc-kr')
test = pd.read_csv('/content/drive/MyDrive/test.csv', encoding='euc-kr')

In [5]:
train['자산총계(천원)'] = train['자산총계(천원)'] * 1000
train['log자산총계'] = np.log(train['자산총계(천원)'])

test['자산총계(천원)'] = test['자산총계(천원)'] * 1000
test['log자산총계'] = np.log(test['자산총계(천원)'])

In [6]:
train['기업수명주기'] = train['기업수명주기'].map({
    '도입기' : 1,
    '성장기' : 2,
    '성숙기' : 3,
    '수축기' : 4,
    '쇠퇴기' : 5
}).astype('category')

test['기업수명주기'] = test['기업수명주기'].map({
    '도입기' : 1,
    '성장기' : 2,
    '성숙기' : 3,
    '수축기' : 4,
    '쇠퇴기' : 5
}).astype('category')

In [7]:
train = train[['회사명', '거래소코드', '회계년도', 'Year', '산업분류코드', '산업분류', '부채비율', '당좌비율',
       '유동비율', '이자보상배율', '차입금의존도', '자기자본구성비율',
       '매출액영업이익률', '자기자본순이익률', '총자본순이익률', '총자본회전률', '자기자본회전률', '운전자본회전률', '순운전자본회전률', '재고자산회전률',
       '당좌자산회전률', '유동자산회전률', '매출액증가율', '총자본증가율', '자기자본증가율', '순이익증가율',
       '유형자산증가율', '유동자산증가율', '재고자산증가율', '영업이익증가율', '총자본투자효율', '부가가치율',
       '노동소득분배율', '자본분배율', '이윤분배율', 'OCF이자보상배율', '부채상환계수', '장기부채상환능력', '매출액대비금융비용상환능력',
       '연구개발비대비매출액', '매출액대비현금흐름', '매출액대비잉여현금흐름', '총자산대비현금흐름',
       '총자산대비영업현금흐름', '총자산대비잉여현금흐름', '기업수명주기','이보배초과여부',
       '파부비초과여부','파당비초과여부', '파차의초과여부', '파로이초과여부', 'log자산총계',
       't-1감사의견코드']]

In [8]:
# 't-1감사의견코드' 컬럼을 추출하여 저장
t_1_column = train.pop('t-1감사의견코드')

# 't-1감사의견코드' 컬럼을 데이터프레임의 맨 뒤에 추가
train['t-1감사의견코드'] = t_1_column

# Feature 개수 정하기 위해 Logit

In [9]:
X_train = train.iloc[:,6:-1]
y_train = train[['t-1감사의견코드']]

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
import statsmodels.api as sm
import numpy as np
lr_clf = LogisticRegression()

feature = X_train
target = y_train

logit = SelectFromModel(LogisticRegression())
logit.fit(feature, target)
logit_support = logit.get_support()
lr_feature = feature.loc[:,logit_support].columns.tolist()

In [11]:
# 14개
lr_feature

['당좌비율',
 '유동비율',
 '차입금의존도',
 '자기자본구성비율',
 '매출액영업이익률',
 '재고자산회전률',
 '자기자본증가율',
 '순이익증가율',
 '유형자산증가율',
 '영업이익증가율',
 '총자본투자효율',
 '노동소득분배율',
 '자본분배율',
 'log자산총계']

# Filter Method

## t-test & Chi-Square

### t-test
- t-test 하기 위해 연속형 컬럼만 추출

In [12]:
train_int = train[['부채비율', '당좌비율', '유동비율', '이자보상배율', '차입금의존도', '자기자본구성비율', '매출액영업이익률',
      '자기자본순이익률', '총자본순이익률', '총자본회전률', '자기자본회전률', '운전자본회전률', '순운전자본회전률', '재고자산회전률',
       '당좌자산회전률', '유동자산회전률', '매출액증가율', '총자본증가율', '자기자본증가율', '순이익증가율',
       '유형자산증가율', '유동자산증가율', '재고자산증가율', '영업이익증가율', '총자본투자효율', '부가가치율',
       '노동소득분배율', '자본분배율', '이윤분배율', 'log자산총계',
       'OCF이자보상배율', '부채상환계수', '장기부채상환능력', '매출액대비금융비용상환능력', '연구개발비대비매출액', '매출액대비현금흐름',
       '매출액대비잉여현금흐름', '총자산대비현금흐름', '총자산대비영업현금흐름', '총자산대비잉여현금흐름',
       't-1감사의견코드']]

In [13]:
test_int = test[['부채비율', '당좌비율', '유동비율', '이자보상배율', '차입금의존도', '자기자본구성비율', '매출액영업이익률',
      '자기자본순이익률', '총자본순이익률', '총자본회전률', '자기자본회전률', '운전자본회전률', '순운전자본회전률', '재고자산회전률',
       '당좌자산회전률', '유동자산회전률', '매출액증가율', '총자본증가율', '자기자본증가율', '순이익증가율',
       '유형자산증가율', '유동자산증가율', '재고자산증가율', '영업이익증가율', '총자본투자효율', '부가가치율',
       '노동소득분배율', '자본분배율', '이윤분배율', 'log자산총계',
       'OCF이자보상배율', '부채상환계수', '장기부채상환능력', '매출액대비금융비용상환능력', '연구개발비대비매출액', '매출액대비현금흐름',
       '매출액대비잉여현금흐름', '총자산대비현금흐름', '총자산대비영업현금흐름', '총자산대비잉여현금흐름',
       't-1감사의견코드']]

#### MDA

In [14]:
# 1-1 정규성 테스트(샤피로)
from scipy.stats import norm
from scipy import stats
from statsmodels.formula.api import ols
from scipy.stats import kstest

for j in test_int.columns:
    a = stats.shapiro(test_int[j])
    p = a.pvalue
    print(j,a)

부채비율 ShapiroResult(statistic=0.00048786401748657227, pvalue=0.0)
당좌비율 ShapiroResult(statistic=0.0009923577308654785, pvalue=0.0)
유동비율 ShapiroResult(statistic=0.0010990500450134277, pvalue=0.0)
이자보상배율 ShapiroResult(statistic=0.012764692306518555, pvalue=0.0)
차입금의존도 ShapiroResult(statistic=0.7496182918548584, pvalue=0.0)
자기자본구성비율 ShapiroResult(statistic=0.9563366174697876, pvalue=0.0)
매출액영업이익률 ShapiroResult(statistic=0.007776379585266113, pvalue=0.0)
자기자본순이익률 ShapiroResult(statistic=0.014060437679290771, pvalue=0.0)
총자본순이익률 ShapiroResult(statistic=0.41211169958114624, pvalue=0.0)
총자본회전률 ShapiroResult(statistic=0.24337923526763916, pvalue=0.0)
자기자본회전률 ShapiroResult(statistic=0.000538170337677002, pvalue=0.0)
운전자본회전률 ShapiroResult(statistic=0.003760218620300293, pvalue=0.0)
순운전자본회전률 ShapiroResult(statistic=0.09678095579147339, pvalue=0.0)
재고자산회전률 ShapiroResult(statistic=0.001590132713317871, pvalue=0.0)
당좌자산회전률 ShapiroResult(statistic=0.026680588722229004, pvalue=0.0)
유동자산회전률 ShapiroResult

In [15]:
# 1-2 정규성 테스트(K-S)
for j in train_int.columns:
    a = kstest(train_int[j],'norm')
    p = a.pvalue
    print(j,a)

부채비율 KstestResult(statistic=0.9189314493432176, pvalue=0.0, statistic_location=3.07, statistic_sign=-1)
당좌비율 KstestResult(statistic=0.9686856307994549, pvalue=0.0, statistic_location=2.62, statistic_sign=-1)
유동비율 KstestResult(statistic=0.976092352768502, pvalue=0.0, statistic_location=2.84, statistic_sign=-1)
이자보상배율 KstestResult(statistic=0.5277894082488922, pvalue=0.0, statistic_location=1.5099936480913938, statistic_sign=-1)
차입금의존도 KstestResult(statistic=0.8135296230775833, pvalue=0.0, statistic_location=2.69, statistic_sign=-1)
자기자본구성비율 KstestResult(statistic=0.8964839268945669, pvalue=0.0, statistic_location=2.77, statistic_sign=-1)
매출액영업이익률 KstestResult(statistic=0.6279097711882253, pvalue=0.0, statistic_location=1.9, statistic_sign=-1)
자기자본순이익률 KstestResult(statistic=0.618309680556216, pvalue=0.0, statistic_location=2.16, statistic_sign=-1)
총자본순이익률 KstestResult(statistic=0.5093201554320996, pvalue=0.0, statistic_location=1.77, statistic_sign=-1)
총자본회전률 KstestResult(statistic=0.49

p-value가 0 이는 데이터개수가 많아서 p-value 자체가 너무작아 계산이 불가능하다고 판단. 중심극한 정리에 의해서 정규성이 있다고 가정하고 진행

In [16]:
# 부도기업과 정상기업의 피처별 등분산비교 (정규성 가정으로 bartlett)
Bad = train_int[train_int['t-1감사의견코드']== 1] #Existing Customer
Good = train_int[train_int['t-1감사의견코드']== 0] #Attrited Customer

c = []
for i in train_int:
    # lresult = stats.levene(close[i], normal[i])
    lresult = stats.bartlett(Bad[i], Good[i])
    c.append([i,lresult[-1]])

c= pd.DataFrame(c)
c.columns=["피처값",'F-test']
c

Unnamed: 0,피처값,F-test
0,부채비율,0.0
1,당좌비율,0.0
2,유동비율,0.0
3,이자보상배율,0.0
4,차입금의존도,0.0
5,자기자본구성비율,1.167465e-94
6,매출액영업이익률,0.0
7,자기자본순이익률,0.0
8,총자본순이익률,0.0
9,총자본회전률,0.0


In [17]:
# F-test 결과 0.05 이상이면 homo 0.05 이하이면 hetero
c["분산"] =''
c["T-test"] =""
for i in c.index:
    if c.loc[i,"F-test"]>=0.05:
        c.loc[i,"분산"] = "homo"
    else:
        c.loc[i,"분산"] = "hetero"
c

Unnamed: 0,피처값,F-test,분산,T-test
0,부채비율,0.0,hetero,
1,당좌비율,0.0,hetero,
2,유동비율,0.0,hetero,
3,이자보상배율,0.0,hetero,
4,차입금의존도,0.0,hetero,
5,자기자본구성비율,1.167465e-94,hetero,
6,매출액영업이익률,0.0,hetero,
7,자기자본순이익률,0.0,hetero,
8,총자본순이익률,0.0,hetero,
9,총자본회전률,0.0,hetero,


In [18]:
c[c["분산"]=='homo']

Unnamed: 0,피처값,F-test,분산,T-test


In [19]:
c

Unnamed: 0,피처값,F-test,분산,T-test
0,부채비율,0.0,hetero,
1,당좌비율,0.0,hetero,
2,유동비율,0.0,hetero,
3,이자보상배율,0.0,hetero,
4,차입금의존도,0.0,hetero,
5,자기자본구성비율,1.167465e-94,hetero,
6,매출액영업이익률,0.0,hetero,
7,자기자본순이익률,0.0,hetero,
8,총자본순이익률,0.0,hetero,
9,총자본회전률,0.0,hetero,


In [20]:
# homo 인 feature 는 student t-test, hetero이면 Welchs T-Test 진행
c["분산"] =''
c["T-test"] =""
for i in c.index:
    if c.loc[i,"F-test"]>=0.05:
        c.loc[i,"분산"] = "homo"
        result = stats.ttest_ind(Bad[c.loc[i,"피처값"]], Good[c.loc[i,"피처값"]], equal_var=True)       ## equal_var = True Student T-test
        c.loc[i,"T-test"] = result[-1]
        print(Bad[c.loc[i,"피처값"]])
    else:
        c.loc[i,"분산"] = "hetero"
        result = stats.ttest_ind(Bad[c.loc[i,"피처값"]], Good[c.loc[i,"피처값"]], equal_var=False)      ## equal_var = False Welchs T-Test
        c.loc[i,"T-test"] = result[-1]
c

Unnamed: 0,피처값,F-test,분산,T-test
0,부채비율,0.0,hetero,0.53001
1,당좌비율,0.0,hetero,0.008492
2,유동비율,0.0,hetero,0.006085
3,이자보상배율,0.0,hetero,0.169462
4,차입금의존도,0.0,hetero,0.02846
5,자기자본구성비율,1.167465e-94,hetero,0.0
6,매출액영업이익률,0.0,hetero,0.050517
7,자기자본순이익률,0.0,hetero,0.724641
8,총자본순이익률,0.0,hetero,0.000864
9,총자본회전률,0.0,hetero,0.0


In [21]:
# 0.05 이상이면 유의미하지 않으므로 0.05 이하인 것만 추출
d = c[c["T-test"]<= 0.05]
d.sort_values('T-test',ascending=False)["피처값"].unique()
# d.sort_values('T-test',ascending=False)["피처값"]

array(['장기부채상환능력', '차입금의존도', '자기자본회전률', '영업이익증가율', '매출액대비금융비용상환능력',
       'OCF이자보상배율', '당좌비율', '유동비율', '순운전자본회전률', '총자본순이익률', '순이익증가율',
       '재고자산회전률', '총자산대비현금흐름', '총자본회전률', '총자본증가율', 'log자산총계', '자기자본구성비율',
       't-1감사의견코드'], dtype=object)

In [22]:
d.sort_values('T-test',ascending=False).dropna()

Unnamed: 0,피처값,F-test,분산,T-test
32,장기부채상환능력,0.0,hetero,0.033151
4,차입금의존도,0.0,hetero,0.02846
10,자기자본회전률,0.0,hetero,0.027422
23,영업이익증가율,0.0,hetero,0.013501
33,매출액대비금융비용상환능력,0.0,hetero,0.012374
30,OCF이자보상배율,0.0,hetero,0.010711
1,당좌비율,0.0,hetero,0.008492
2,유동비율,0.0,hetero,0.006085
12,순운전자본회전률,0.0,hetero,0.002208
8,총자본순이익률,0.0,hetero,0.000864


In [23]:
d['피처값'].values

array(['당좌비율', '유동비율', '차입금의존도', '자기자본구성비율', '총자본순이익률', '총자본회전률',
       '자기자본회전률', '순운전자본회전률', '재고자산회전률', '총자본증가율', '순이익증가율', '영업이익증가율',
       'log자산총계', 'OCF이자보상배율', '장기부채상환능력', '매출액대비금융비용상환능력', '총자산대비현금흐름',
       't-1감사의견코드'], dtype=object)

In [24]:
# t-test가 가장 작은순으로 의미 있으므로 가장작은 순위로 12개 피처 선정
fea = ['당좌비율', '유동비율', '차입금의존도', '자기자본구성비율', '총자본순이익률', '총자본회전률',
       '자기자본회전률', '순운전자본회전률', '재고자산회전률', '총자본증가율', '순이익증가율', '영업이익증가율',
       'log자산총계', 'OCF이자보상배율', '장기부채상환능력', '매출액대비금융비용상환능력', '총자산대비현금흐름'
       ]
mda_feature = train[fea]
mda_feature

Unnamed: 0,당좌비율,유동비율,차입금의존도,자기자본구성비율,총자본순이익률,총자본회전률,자기자본회전률,순운전자본회전률,재고자산회전률,총자본증가율,순이익증가율,영업이익증가율,log자산총계,OCF이자보상배율,장기부채상환능력,매출액대비금융비용상환능력,총자산대비현금흐름
0,347.96,354.61,6.09,7.40,-90.18,0.17,2.28,0.14,9.53,0.00,0.00,0.00,22.828067,-9.332108,-0.000000,4.174742,0.772454
1,68.72,68.72,45.96,48.26,6.61,0.36,0.76,0.81,0.00,-0.04,-19.83,-6.98,23.977288,9.548280,2.352094,56.802819,0.000884
2,109.86,109.86,0.00,20.04,-1.80,0.10,0.68,0.06,0.00,-49.82,0.00,0.00,23.592060,-618.027372,-0.000000,-23.752940,-0.516536
3,55.31,55.31,70.59,27.22,18.17,0.40,1.48,2.09,0.00,0.00,0.00,0.00,23.265147,5.010647,4.843848,26.994572,-0.149208
4,30.73,124.85,1.76,20.41,27.88,1.17,17.19,0.69,1.47,3.50,0.00,0.00,23.488822,16.602420,0.000000,46.704264,-0.001722
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137673,84.46,129.25,19.60,19.98,10.04,2.20,14.88,1.33,6.91,-1.60,22.36,19.39,23.805059,4.818200,0.000000,213.217431,-0.003247
137674,148.14,158.66,25.22,42.55,3.32,3.04,8.55,2.13,36.81,-21.61,-62.60,-26.36,25.560289,4.835895,0.036460,195.625970,-0.018967
137675,223.79,300.12,18.78,66.65,36.01,2.00,3.27,1.99,10.11,52.23,0.00,0.00,22.865585,40.753692,0.163392,288.357414,0.215584
137676,272.56,280.21,7.13,78.62,1.43,0.88,1.15,1.09,33.58,-3.79,2238.26,0.00,24.057147,40.446933,0.295099,-288.743530,0.063097


In [25]:
def vif(data):
    import pandas as pd
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    # VIF 출력을 위한 데이터 프레임 형성
    vif = pd.DataFrame()

    # VIF 값과 각 Feature 이름에 대해 설정
    vif["VIF Factor"] = [variance_inflation_factor(data.values, i) for i in range(len(data.columns))]
    vif["features"] = data.columns

    # VIF 값이 높은 순으로 정렬
    vif = vif.sort_values(by="VIF Factor", ascending=False)
    vif = vif.reset_index().drop(columns='index')

    return vif

vif(mda_feature)

Unnamed: 0,VIF Factor,features
0,45.310796,당좌비율
1,45.150614,유동비율
2,3.307882,log자산총계
3,3.245975,자기자본구성비율
4,1.176584,장기부채상환능력
5,1.087598,총자본회전률
6,1.062461,총자본순이익률
7,1.033903,총자본증가율
8,1.030056,총자산대비현금흐름
9,1.028777,차입금의존도


In [26]:
# t-test가 가장 작은순으로 의미 있으므로 가장작은 순위로 12개 피처 선정
fea = ['당좌비율', '차입금의존도', '자기자본구성비율', '총자본순이익률', '총자본회전률',
       '자기자본회전률', '순운전자본회전률', '재고자산회전률', '총자본증가율', '순이익증가율', '영업이익증가율',
       'log자산총계', 'OCF이자보상배율', '장기부채상환능력', '매출액대비금융비용상환능력', '총자산대비현금흐름'
       ]
mda_feature2 = train[fea]
mda_feature2

Unnamed: 0,당좌비율,차입금의존도,자기자본구성비율,총자본순이익률,총자본회전률,자기자본회전률,순운전자본회전률,재고자산회전률,총자본증가율,순이익증가율,영업이익증가율,log자산총계,OCF이자보상배율,장기부채상환능력,매출액대비금융비용상환능력,총자산대비현금흐름
0,347.96,6.09,7.40,-90.18,0.17,2.28,0.14,9.53,0.00,0.00,0.00,22.828067,-9.332108,-0.000000,4.174742,0.772454
1,68.72,45.96,48.26,6.61,0.36,0.76,0.81,0.00,-0.04,-19.83,-6.98,23.977288,9.548280,2.352094,56.802819,0.000884
2,109.86,0.00,20.04,-1.80,0.10,0.68,0.06,0.00,-49.82,0.00,0.00,23.592060,-618.027372,-0.000000,-23.752940,-0.516536
3,55.31,70.59,27.22,18.17,0.40,1.48,2.09,0.00,0.00,0.00,0.00,23.265147,5.010647,4.843848,26.994572,-0.149208
4,30.73,1.76,20.41,27.88,1.17,17.19,0.69,1.47,3.50,0.00,0.00,23.488822,16.602420,0.000000,46.704264,-0.001722
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137673,84.46,19.60,19.98,10.04,2.20,14.88,1.33,6.91,-1.60,22.36,19.39,23.805059,4.818200,0.000000,213.217431,-0.003247
137674,148.14,25.22,42.55,3.32,3.04,8.55,2.13,36.81,-21.61,-62.60,-26.36,25.560289,4.835895,0.036460,195.625970,-0.018967
137675,223.79,18.78,66.65,36.01,2.00,3.27,1.99,10.11,52.23,0.00,0.00,22.865585,40.753692,0.163392,288.357414,0.215584
137676,272.56,7.13,78.62,1.43,0.88,1.15,1.09,33.58,-3.79,2238.26,0.00,24.057147,40.446933,0.295099,-288.743530,0.063097


In [27]:
vif(mda_feature2)

Unnamed: 0,VIF Factor,features
0,3.307603,log자산총계
1,3.24592,자기자본구성비율
2,1.176578,장기부채상환능력
3,1.173595,당좌비율
4,1.087595,총자본회전률
5,1.062461,총자본순이익률
6,1.033902,총자본증가율
7,1.030047,총자산대비현금흐름
8,1.028777,차입금의존도
9,1.012653,자기자본회전률


In [28]:
# Welch's t-test 수행
t_stat, p_value = stats.ttest_ind(Bad[fea], Good[fea], equal_var=False)

# 결과를 데이터프레임으로 변환
result_df = pd.DataFrame({'t-statistic': t_stat, 'p-value': p_value}, index=fea)

result_df = result_df.sort_values('p-value', ascending=True).reset_index()

In [29]:
result_df = result_df[['index', 'p-value']]
result_df.columns = ['Variable', 'p-value']

In [30]:
result_df

Unnamed: 0,Variable,p-value
0,자기자본구성비율,0.0
1,log자산총계,0.0
2,총자본증가율,4.006026e-74
3,총자본회전률,8.385601999999999e-36
4,총자산대비현금흐름,6.909689e-34
5,재고자산회전률,0.0001384389
6,순이익증가율,0.0006370057
7,총자본순이익률,0.0008639113
8,순운전자본회전률,0.002207923
9,당좌비율,0.008492177


### Chi
* 카이제곱 검정 조건
    * 종속변인은 범주형 자료여야 한다.
    * 기대빈도가 5이하인 셀이 전체의 20%가 넘지 않아야 한다.
    * 각 칸의 빈도는 다른 칸의 빈도와 독립적이어야 한다.

- 범주형 변수만 추출

In [31]:
train_cat = train[['기업수명주기', 't-1감사의견코드', '이보배초과여부', '파부비초과여부', '파당비초과여부', '파차의초과여부', '파로이초과여부']].astype('category')
test_cat = test[['기업수명주기', 't-1감사의견코드', '이보배초과여부', '파부비초과여부', '파당비초과여부', '파차의초과여부', '파로이초과여부']].astype('category')

In [32]:
X_train_cat = train_cat.drop('t-1감사의견코드', axis=1)
y_train_cat = train['t-1감사의견코드']
X_test_cat = test_cat.drop('t-1감사의견코드', axis=1)
y_test_cat = test['t-1감사의견코드']

In [33]:
from scipy.stats import chi2_contingency

# 기대빈도가 5 이하인 항목의 비율을 저장할 딕셔너리
expected_freq_5_ratio = {}

# 각 독립 변수에 대해 기대빈도 계산 및 비율 확인
for column in train_cat.columns[:-1]:
    # 교차 테이블 생성
    contingency_table = pd.crosstab(train_cat[column], train_cat['t-1감사의견코드'])

    # 카이제곱 검정 수행
    chi2, p_value, dof, expected_freq = chi2_contingency(contingency_table)

    # 기대빈도가 5 이하인 항목의 비율 계산
    expected_freq_5 = (expected_freq <= 5).mean()

    # 결과 저장
    expected_freq_5_ratio[column] = expected_freq_5

# 결과 출력
for column, ratio in expected_freq_5_ratio.items():
    print(f"변수 '{column}'의 기대빈도가 5 이하인 항목 비율: {ratio}")

변수 '기업수명주기'의 기대빈도가 5 이하인 항목 비율: 0.0
변수 't-1감사의견코드'의 기대빈도가 5 이하인 항목 비율: 0.0
변수 '이보배초과여부'의 기대빈도가 5 이하인 항목 비율: 0.0
변수 '파부비초과여부'의 기대빈도가 5 이하인 항목 비율: 0.0
변수 '파당비초과여부'의 기대빈도가 5 이하인 항목 비율: 0.0
변수 '파차의초과여부'의 기대빈도가 5 이하인 항목 비율: 0.0


In [34]:
# 독립변수와 종속변수 간의 카이제곱 검정 수행
chi2_scores = []

for column in X_train_cat.columns:
    # 교차 테이블 생성
    contingency_table = pd.crosstab(X_train_cat[column], y_train_cat)
    chi2, p_value, dof, expected_freq = chi2_contingency(contingency_table)
    # 카이제곱 통계량(chi2), p-value(p_value), 자유도(dof), 예상빈도(expected_freq)
    print(p_value)
    chi2_scores.append((column, chi2))

# 카이제곱 검정 결과를 기준으로 변수 정렬
sorted_features = sorted(chi2_scores, key=lambda x : x[1], reverse=True)

# 선택된 변수 출력
sorted_features

0.0
0.12908771021386392
0.0
0.00039229715645192347
0.0
1.9096172238484248e-59


[('기업수명주기', 3089.2507048662806),
 ('파차의초과여부', 1985.8292770838254),
 ('파부비초과여부', 1906.060822214597),
 ('파로이초과여부', 264.3747900999849),
 ('파당비초과여부', 12.568529415290646),
 ('이보배초과여부', 2.303441378969885)]

In [35]:
chi2_scores

[('기업수명주기', 3089.2507048662806),
 ('이보배초과여부', 2.303441378969885),
 ('파부비초과여부', 1906.060822214597),
 ('파당비초과여부', 12.568529415290646),
 ('파차의초과여부', 1985.8292770838254),
 ('파로이초과여부', 264.3747900999849)]

In [36]:
import pandas as pd
from scipy.stats import chi2_contingency

# 독립변수와 종속변수 간의 카이제곱 검정 수행
chi2_scores = []

p_values = []
for column in X_train_cat.columns:
    # 교차 테이블 생성
    contingency_table = pd.crosstab(X_train_cat[column], y_train_cat)
    chi2, p_value, dof, expected_freq = chi2_contingency(contingency_table)
    p_values.append(p_value)
    chi2_scores.append((column, chi2))

# p-value가 0.05보다 작은 값을 출력하는 데이터프레임 생성
result_df_1 = pd.DataFrame({'Variable': X_train_cat.columns, 'p-value': p_values})
filtered_df_chi = result_df_1[result_df_1['p-value'] < 0.05]

filtered_df_chi

Unnamed: 0,Variable,p-value
0,기업수명주기,0.0
2,파부비초과여부,0.0
3,파당비초과여부,0.0003922972
4,파차의초과여부,0.0
5,파로이초과여부,1.909617e-59


In [37]:
result_df

Unnamed: 0,Variable,p-value
0,자기자본구성비율,0.0
1,log자산총계,0.0
2,총자본증가율,4.006026e-74
3,총자본회전률,8.385601999999999e-36
4,총자산대비현금흐름,6.909689e-34
5,재고자산회전률,0.0001384389
6,순이익증가율,0.0006370057
7,총자본순이익률,0.0008639113
8,순운전자본회전률,0.002207923
9,당좌비율,0.008492177


In [38]:
result = pd.concat([result_df, filtered_df_chi], axis=0)

In [39]:
result

Unnamed: 0,Variable,p-value
0,자기자본구성비율,0.0
1,log자산총계,0.0
2,총자본증가율,4.006026e-74
3,총자본회전률,8.385601999999999e-36
4,총자산대비현금흐름,6.909689e-34
5,재고자산회전률,0.0001384389
6,순이익증가율,0.0006370057
7,총자본순이익률,0.0008639113
8,순운전자본회전률,0.002207923
9,당좌비율,0.008492177


In [40]:
result = result.sort_values('p-value', ascending=True).reset_index(drop=True)
result

Unnamed: 0,Variable,p-value
0,자기자본구성비율,0.0
1,log자산총계,0.0
2,파부비초과여부,0.0
3,기업수명주기,0.0
4,파차의초과여부,0.0
5,총자본증가율,4.006026e-74
6,파로이초과여부,1.909617e-59
7,총자본회전률,8.385601999999999e-36
8,총자산대비현금흐름,6.909689e-34
9,재고자산회전률,0.0001384389


In [41]:
result.head(14)

Unnamed: 0,Variable,p-value
0,자기자본구성비율,0.0
1,log자산총계,0.0
2,파부비초과여부,0.0
3,기업수명주기,0.0
4,파차의초과여부,0.0
5,총자본증가율,4.006026e-74
6,파로이초과여부,1.909617e-59
7,총자본회전률,8.385601999999999e-36
8,총자산대비현금흐름,6.909689e-34
9,재고자산회전률,0.0001384389


# Wrapper Method

In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector

In [43]:
from sklearn.linear_model import LogisticRegression

In [44]:
# selector = SequentialFeatureSelector(estimator=LogisticRegression(),
#                                      n_features_to_select=14,
#                                      direction='forward',
#                                      scoring='f1',
#                                      cv=5,
#                                      n_jobs=-1)

# # 변수 선택 수행
# selector.fit(X_train, y_train)

# # 선택된 변수의 인덱스
# selected_features = selector.get_support(indices=True)

# # 선택된 변수 출력
# for i in selected_features:
#     print(X_train.columns[i])

In [45]:
selector = SequentialFeatureSelector(estimator=LogisticRegression(), n_features_to_select=14, direction='backward', scoring='f1', cv=5, n_jobs=-1)

# 변수 선택 수행
selector.fit(X_train, y_train)

# 선택된 변수의 인덱스
selected_features = selector.get_support(indices=True)

# 선택된 변수 출력
for i in selected_features:
    print(X_train.columns[i])

부채비율
이자보상배율
자기자본구성비율
자기자본순이익률
총자본순이익률
자기자본회전률
유동자산증가율
부가가치율
OCF이자보상배율
부채상환계수
매출액대비금융비용상환능력
매출액대비현금흐름
총자산대비잉여현금흐름
파차의초과여부


In [57]:
selected_columns = X_train.columns[selected_features]
wrapper_features = list(selected_columns)

# Embedded Method

> Randomforest

In [46]:
from sklearn.ensemble import RandomForestClassifier

In [76]:
selector = SelectFromModel(estimator=RandomForestClassifier(), threshold=0.02398).fit(X_train, y_train)
rf = selector.get_support()
count = np.count_nonzero(rf)
count

14

In [77]:
rf_features = X_train.loc[:, rf].columns.tolist()
rf_features

['총자본회전률',
 '자기자본회전률',
 '순운전자본회전률',
 '재고자산회전률',
 '당좌자산회전률',
 '매출액증가율',
 '총자본증가율',
 '자기자본증가율',
 '유형자산증가율',
 '유동자산증가율',
 '부가가치율',
 '매출액대비금융비용상환능력',
 '총자산대비잉여현금흐름',
 'log자산총계']

In [None]:
len(rf_feautres)

> LASSO

In [74]:
lasso = SelectFromModel(estimator=LogisticRegression(penalty='l1', solver='liblinear', C=0.00055)).fit(X_train, y_train)
lasso_support = lasso.get_support()
lasso_feature = X_train.loc[:,lasso_support].columns.tolist()

In [75]:
len(lasso_feature)

14

In [78]:
lasso_feature

['차입금의존도',
 '자기자본구성비율',
 '총자본순이익률',
 '총자본회전률',
 '순운전자본회전률',
 '유동자산회전률',
 '총자본증가율',
 '자기자본증가율',
 '순이익증가율',
 '영업이익증가율',
 '총자본투자효율',
 '매출액대비잉여현금흐름',
 '기업수명주기',
 'log자산총계']

# 종합

In [79]:
result = result.sort_values('p-value', ascending=True).reset_index(drop=True)
result = result.head(14)
filter = result[['Variable']]

In [80]:
rf_features = pd.DataFrame(rf_features)
lasso_features = pd.DataFrame(lasso_feature)
wrapper_features = pd.DataFrame(wrapper_features)

In [81]:
total = pd.concat([filter, rf_features, lasso_features, wrapper_features], axis=1)

In [82]:
total.columns = ['t&chi', 'rf', 'lasso', 'wrapper']
total

Unnamed: 0,t&chi,rf,lasso,wrapper
0,자기자본구성비율,총자본회전률,차입금의존도,부채비율
1,log자산총계,자기자본회전률,자기자본구성비율,이자보상배율
2,파부비초과여부,순운전자본회전률,총자본순이익률,자기자본구성비율
3,기업수명주기,재고자산회전률,총자본회전률,자기자본순이익률
4,파차의초과여부,당좌자산회전률,순운전자본회전률,총자본순이익률
5,총자본증가율,매출액증가율,유동자산회전률,자기자본회전률
6,파로이초과여부,총자본증가율,총자본증가율,유동자산증가율
7,총자본회전률,자기자본증가율,자기자본증가율,부가가치율
8,총자산대비현금흐름,유형자산증가율,순이익증가율,OCF이자보상배율
9,재고자산회전률,유동자산증가율,영업이익증가율,부채상환계수


In [83]:
filter = total['t&chi'].tolist()
rf = total['rf'].tolist()
lasso = total['lasso'].tolist()
wrapper = total['wrapper'].tolist()

In [84]:
X_train.columns

Index(['부채비율', '당좌비율', '유동비율', '이자보상배율', '차입금의존도', '자기자본구성비율', '매출액영업이익률',
       '자기자본순이익률', '총자본순이익률', '총자본회전률', '자기자본회전률', '운전자본회전률', '순운전자본회전률',
       '재고자산회전률', '당좌자산회전률', '유동자산회전률', '매출액증가율', '총자본증가율', '자기자본증가율',
       '순이익증가율', '유형자산증가율', '유동자산증가율', '재고자산증가율', '영업이익증가율', '총자본투자효율',
       '부가가치율', '노동소득분배율', '자본분배율', '이윤분배율', 'OCF이자보상배율', '부채상환계수', '장기부채상환능력',
       '매출액대비금융비용상환능력', '연구개발비대비매출액', '매출액대비현금흐름', '매출액대비잉여현금흐름', '총자산대비현금흐름',
       '총자산대비영업현금흐름', '총자산대비잉여현금흐름', '기업수명주기', '이보배초과여부', '파부비초과여부', '파당비초과여부',
       '파차의초과여부', '파로이초과여부', 'log자산총계'],
      dtype='object')

In [85]:
total_result = pd.DataFrame(index=X_train.columns)

# 각 컬럼의 값에 인덱스 포함 여부에 따라 True 또는 False 입력
total_result['t&chi'] = total_result.index.isin(filter)
total_result['wrapper'] = total_result.index.isin(wrapper)
total_result['rf'] = total_result.index.isin(rf)
total_result['lasso'] = total_result.index.isin(lasso)
total_result["true_sum"] = total_result.sum(axis=1)

total_result.sort_values('true_sum', ascending=False, inplace=True)
total_result

Unnamed: 0,t&chi,wrapper,rf,lasso,true_sum
log자산총계,True,False,True,True,3
자기자본구성비율,True,True,False,True,3
총자본증가율,True,False,True,True,3
총자본회전률,True,False,True,True,3
총자본순이익률,True,True,False,True,3
순운전자본회전률,True,False,True,True,3
자기자본증가율,False,False,True,True,2
파차의초과여부,True,True,False,False,2
자기자본회전률,False,True,True,False,2
총자산대비잉여현금흐름,False,True,True,False,2


In [97]:
total_result_2 = total_result[total_result['true_sum']>=2]
total_result_2 = total_result_2.reset_index()
total_result_2

Unnamed: 0,index,t&chi,wrapper,rf,lasso,true_sum
0,log자산총계,True,False,True,True,3
1,자기자본구성비율,True,True,False,True,3
2,총자본증가율,True,False,True,True,3
3,총자본회전률,True,False,True,True,3
4,총자본순이익률,True,True,False,True,3
5,순운전자본회전률,True,False,True,True,3
6,자기자본증가율,False,False,True,True,2
7,파차의초과여부,True,True,False,False,2
8,자기자본회전률,False,True,True,False,2
9,총자산대비잉여현금흐름,False,True,True,False,2


In [87]:
total_result_2.index

Index(['log자산총계', '자기자본구성비율', '총자본증가율', '총자본회전률', '총자본순이익률', '순운전자본회전률',
       '자기자본증가율', '파차의초과여부', '자기자본회전률', '총자산대비잉여현금흐름', '부가가치율', '재고자산회전률',
       '유동자산증가율', '기업수명주기', '매출액대비금융비용상환능력', '순이익증가율'],
      dtype='object')

In [89]:
p = X_train[['자기자본증가율', '파차의초과여부', '자기자본회전률', '총자산대비잉여현금흐름', '부가가치율', '재고자산회전률',
       '유동자산증가율', '기업수명주기', '매출액대비금융비용상환능력', '순이익증가율']]

In [90]:
p

Unnamed: 0,자기자본증가율,파차의초과여부,자기자본회전률,총자산대비잉여현금흐름,부가가치율,재고자산회전률,유동자산증가율,기업수명주기,매출액대비금융비용상환능력,순이익증가율
0,0.00,0,2.28,-0.221444,-268.61,9.53,0.00,5,4.174742,0.00
1,1.72,0,0.76,0.069046,42.32,0.00,4.25,3,56.802819,-19.83
2,-11.83,0,0.68,-0.516536,60.22,0.00,-52.85,5,-23.752940,0.00
3,0.00,1,1.48,-0.798633,73.45,0.00,0.00,2,26.994572,0.00
4,0.00,0,17.19,0.418095,31.17,1.47,3.60,4,46.704264,0.00
...,...,...,...,...,...,...,...,...,...,...
137673,102.79,0,14.88,0.049512,13.92,6.91,-0.89,3,213.217431,22.36
137674,10.73,0,8.55,0.126453,6.05,36.81,-20.32,4,195.625970,-62.60
137675,90.71,0,3.27,0.213073,30.08,10.11,78.13,2,288.357414,0.00
137676,1.89,0,1.15,0.066068,6.24,33.58,-16.66,4,-288.743530,2238.26


In [91]:
vif(p)

Unnamed: 0,VIF Factor,features
0,1.128575,기업수명주기
1,1.127885,파차의초과여부
2,1.000277,재고자산회전률
3,1.000232,매출액대비금융비용상환능력
4,1.000148,자기자본증가율
5,1.000133,순이익증가율
6,1.000057,자기자본회전률
7,1.000041,유동자산증가율
8,1.000039,부가가치율
9,1.000033,총자산대비잉여현금흐름


In [100]:
# drop 항목은 다시 확인하고 결정
total_result_2 = total_result_2.drop([9, 10], axis=0).reset_index(drop=True)
total_result_2

Unnamed: 0,index,t&chi,wrapper,rf,lasso,true_sum
0,log자산총계,True,False,True,True,3
1,자기자본구성비율,True,True,False,True,3
2,총자본증가율,True,False,True,True,3
3,총자본회전률,True,False,True,True,3
4,총자본순이익률,True,True,False,True,3
5,순운전자본회전률,True,False,True,True,3
6,자기자본증가율,False,False,True,True,2
7,파차의초과여부,True,True,False,False,2
8,자기자본회전률,False,True,True,False,2
9,재고자산회전률,True,False,True,False,2
