# IMPORT LIBARAY

In [1]:
import pandas as pd
import numpy as np
import os
import glob
import datetime
import warnings
warnings.filterwarnings('ignore')

from collections import Counter
from scipy import interpolate
from scipy.stats import stats

import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import seaborn as sns
import plotly.graph_objects as go
from matplotlib import dates
from plotly.subplots import make_subplots
import plotly
import plotly.express as px
import plotly.figure_factory as ff
pd.options.display.float_format = '{:.5f}'.format

import FinanceDataReader as fdr

from sklearn.preprocessing import MinMaxScaler

### 설치된 폰트 출력
font_list = [font.name for font in fm.fontManager.ttflist]
font_list
plt.rc("font", family="DejaVu Sans")
parameters = {'axes.labelsize': 20,
          'axes.titlesize': 25,
          'xtick.labelsize': 20,
          'ytick.labelsize': 20,
        'legend.fontsize': 20,
          }
plt.rcParams.update(parameters)
plt.rcParams['font.family'] = 'DejaVu Sans'

current_dir = os.getcwd()
plt.style.use("seaborn-dark")
plt.rcParams['font.family'] = 'DejaVu Sans'

In [2]:
root_path = '../../../'
master_path = os.path.join(root_path, '01. Data','00. master_data')
current_dir = os.getcwd()

file_path = os.path.join(root_path, '01. Data','02. 재무지표2', '01. preprocessed')
result_path = os.path.join(current_dir, 'result')

In [3]:
desc = pd.read_excel(os.path.join(master_path, "desc_v2.2.xlsx"), sheet_name='desc', header=0, usecols=['no','raw','variable','group','importance'])
desc.head(2)

Unnamed: 0,raw,no,variable,group,importance
0,Date_x,1,Date_x,DT,9
1,BS_Earnings Quality Score,2,BS_Earnings Quality Score,BS,5


In [4]:
# desc 데이터 기준, group이 한글인 경우(재무팀에서 뽑아준 항목들) + 전체(영어 포함)를 target variable로 가져감
# target_variable = desc.loc[~desc['group'].apply(lambda x: x.encode().isalpha()),"variable"].values.tolist()

dic_group = dict()
for _, xcol in enumerate(desc['variable'].values):
    dic_group[xcol] = desc.loc[_,'group']


dic_imp = dict()
for _, xcol in enumerate(desc['variable'].values):
    dic_imp[xcol] = desc.loc[_,'importance']



dic_name = dict()
for _, xcol in enumerate(desc['raw'].values):
    dic_name[xcol] = desc.loc[_, 'variable']

dic_name['Dividend Yield - Common Stock - Net - Issue Specific - %, TTM.1'] = "FS_Dividend Yield - Common Stock - Net - Issue Specific - %, TTM.1"
dic_name['Dividend Yield - Common Stock - Net - Issue Specific - %, TTM.2'] = "FS_Dividend Yield - Common Stock - Net - Issue Specific - %, TTM.2"
dic_name['Dividend Yield - Common Stock - Net - Issue Specific - %, TTM.3'] = "FS_Dividend Yield - Common Stock - Net - Issue Specific - %, TTM.3"

dic_name['IS_Other, Net.1'] = "IS_Other, Net.1"
dic_name['IS_Current Tax - Total.1'] = "IS_Current Tax - Total.1"
dic_name['IS_Deferred Tax - Total.1'] = "IS_Deferred Tax - Total.1"


# # C1, C2, C3, C4, C5
# C = ['매출액증가율(YoY)(연도)', 'Gross Investment (GI)','무형자산회전율s','매출총이익(천원)','무형자산'] 
# dic_c = {C[0]:"C1", C[1]: "C2", C[2]: "C3", C[3]:"C4", C[4]: "C5"}
# C_ = ['C1','C2','C3','C4','C5']

# Y1, Y2, Y3, Y4
Y = ['close','close_weighted','PBR','EV_EBITDA']

remove_col = ['인당매출','무형자산회전율','인당영업이익','인당순이익','투자비 대비 매출 (Rev/GI)']
ev_col = ['FS_Enterprise Value', 'EBITDA2(천원)', 'EBIT(천원)' ]

In [5]:
file_lst = ['Accenture_non_shift_clean_total_v1.1.csv','Infosys_non_shift_clean_total_v1.1.csv','TCS_non_shift_clean_total_v1.1.csv']

# Accenture

In [6]:
df = pd.read_csv(os.path.join(file_path,file_lst[0]))
company = file_lst[0].split("_")[0]

In [7]:
final_cols = pd.read_csv(os.path.join(result_path,company+"_final_cols.csv"))['cols'].values.tolist()

In [8]:
### Y과 X들 간 corr 값 계산
target = 'EV_EBITDA'

Y = df[[target]]
X = df[df.columns.difference(['Date_x'])]

cor = X[final_cols+[target]].corr(method='pearson')[[target]].sort_values(by=target, ascending=False, key=abs)
cor = cor.loc[cor.index[cor.index != target],:]
cor

Unnamed: 0,EV_EBITDA
매출액(천원),0.92151
Gross Investment (GI),0.91834
매출총이익(천원),0.91264
BS_Additional Paid-In Capital,0.90161
무형자산감상비 누계,-0.88885
...,...
BS_Current Port. of LT Debt/Capital Leases,-0.01577
CF_Sale of Fixed Assets,-0.01437
EBITDA2증가율(YoY)(비율),-0.01187
매입채무회전율,-0.01106


In [9]:
pear = pd.DataFrame({'cols':[],'pearson':[]})
for _, col in enumerate(X.columns):
    pear.loc[_,'cols'] = col
    pearson = stats.pearsonr(X[col], X['EV_EBITDA'])[1]
    pear.loc[_, 'pearson'] = pearson
    
pear.set_index('cols', inplace=True)
pear.index.name = None
pear
# 0.05보다 작을 시 대립가설 기각(상관성이 유의함)

Unnamed: 0,pearson
"BPS(지배, Adj.)(원/주)",0.23959
BPS증가율(YoY)(비율),0.05308
BS_Accounts Payable,0.00000
"BS_Accounts Receivable - Trade, Gross",0.00000
"BS_Accounts Receivable - Trade, Net",0.00000
...,...
재고자산회전율(비율),0.00363
채무총계,0.00249
총자산회전율(비율),0.00000
투자비 대비 매출 (Rev/GI)s,0.65025


In [10]:
# spearman = pd.DataFrame({'cols':[],'spearman':[]})
# for _, col in enumerate(X.columns):
#     spearman.loc[_,'cols'] = col
#     sp = stats.spearmanr(X[col], X['EV_EBITDA'])[1]
#     spearman.loc[_, 'spearman'] = sp
    
# spearman.set_index('cols', inplace=True)
# spearman.index.name = None
# spearman
# # 0.05보다 작을 시 대립가설 기각(상관성이 유의함)

In [11]:
cor = pd.merge(cor, pear, how='left', left_index=True, right_index=True).sort_values(by='pearson', ascending=True)
# cor = pd.merge(cor, spearman, how='left', left_index=True, right_index=True).sort_values(by='pearson', ascending=True)
cor

Unnamed: 0,EV_EBITDA,pearson
매출액(천원),0.92151,0.00000
Gross Investment (GI),0.91834,0.00000
매출총이익(천원),0.91264,0.00000
BS_Additional Paid-In Capital,0.90161,0.00000
무형자산감상비 누계,-0.88885,0.00000
...,...,...
BS_Current Port. of LT Debt/Capital Leases,-0.01577,0.89248
CF_Sale of Fixed Assets,-0.01437,0.90196
EBITDA2증가율(YoY)(비율),-0.01187,0.91891
매입채무회전율,-0.01106,0.92449


In [12]:
# fig = ff.create_distplot([cor.values.reshape(-1)], ['EV_EBITDA'],bin_size=.2, show_rug=False)

# # Add title
# fig.update_layout(title_text='EV_EBITDA 상관성 분포')
# fig.show()

In [13]:
# plt.figure(figsize=(20,7))
# sns.distplot(cor ,bins=10, kde=False)

In [14]:
# tt = cor[abs(cor['pearson'])>0.05].dropna()
# print(len(tt), len(tt)/len(cor))
# tt

# # pvalue 기준으로 clustering하기 --> +-0.1기준으로



In [15]:
select_cols = cor[abs(cor['EV_EBITDA'])>=0.1].dropna()
print(len(select_cols), len(select_cols)/len(cor))
select_cols.reset_index(inplace=True)
select_cols.rename(columns={'index':'cols','EV_EBITDA':'corr'}, inplace=True)
select_cols['group'] = select_cols['cols'].map(dic_group)
select_cols['importance'] = select_cols['cols'].map(dic_imp)
select_cols

# pvalue 기준으로 clustering하기
# hi->상관분석




117 0.8731343283582089


Unnamed: 0,cols,corr,pearson,group,importance
0,매출액(천원),0.92151,0.00000,성장성,0
1,Gross Investment (GI),0.91834,0.00000,성장성,2
2,매출총이익(천원),0.91264,0.00000,수익성,0
3,BS_Additional Paid-In Capital,0.90161,0.00000,BS,5
4,무형자산감상비 누계,-0.88885,0.00000,안정성,1
...,...,...,...,...,...
112,ROGI(%),-0.13216,0.25508,수익성,0
113,EBITDA2마진율(비율),0.12788,0.27094,수익성,1
114,매출채권회전율,0.12416,0.28525,활동성,2
115,BS_Treas Shares - Common Stock Prmry Issue,-0.11100,0.33977,BS,5


In [16]:

C = ['매출액증가율(YoY)(연도)', 'Gross Investment (GI)','무형자산회전율s','매출총이익(천원)','무형자산']

set(C) & set(select_cols.index)


set()

## 파일 내보내기

In [17]:
select_cols.to_excel(os.path.join(result_path,"backup","Accenture_corr.xlsx"), index=False)

In [18]:
drop_cols = cor[~cor.index.isin(select_cols.cols.values.tolist())].reset_index()
drop_cols.rename(columns={'index':'cols'}, inplace=True)
drop_cols.to_excel(os.path.join(result_path,company+"_cor0.1_dropcols.xlsx"), index=False)

# Infosys

In [19]:
df = pd.read_csv(os.path.join(file_path,file_lst[1]))
company = file_lst[1].split("_")[0]
company

'Infosys'

In [20]:
final_cols = pd.read_csv(os.path.join(result_path,company+"_final_cols.csv"))['cols'].values.tolist()

In [21]:
### Y과 X들 간 corr 값 계산
target = 'EV_EBITDA'

Y = df[[target]]
X = df[df.columns.difference(['Date_x'])]

cor = X[final_cols].corr(method='pearson')[[target]].sort_values(by=target, ascending=False, key=abs)
cor = cor.loc[cor.index[cor.index != target],:]
cor

Unnamed: 0,EV_EBITDA
매출액증가율(YoY)(연도),0.71562
"BPS(지배, Adj.)(원/주)",0.71313
재고자산회전율(비율),0.66231
BS_Construction in Progress - Gross,-0.66142
"FS_Dividend Yield - Common Stock - Net - Issue Specific - %, TTM.3",-0.65435
...,...
이자보상율,-0.01398
CF_Foreign Exchange Effects,0.00890
"CF_Other Investing Cash Flow Items, Total",-0.00838
CF_Accounts Receivable,-0.00838


In [22]:
pear = pd.DataFrame({'cols':[],'pearson':[]})
for _, col in enumerate(X.columns):
    pear.loc[_,'cols'] = col
    pearson = stats.pearsonr(X[col], X['EV_EBITDA'])[1]
    pear.loc[_, 'pearson'] = pearson
    
pear.set_index('cols', inplace=True)
pear.index.name = None
pear
# 0.05보다 작을 시 대립가설 기각(상관성이 유의함)

Unnamed: 0,pearson
"BPS(지배, Adj.)(원/주)",0.00000
BPS증가율(YoY)(비율),0.00274
BS_Accounts Payable,0.37417
"BS_Accounts Receivable - Trade, Gross",0.01567
"BS_Accounts Receivable - Trade, Net",0.15582
...,...
자산총계(천원),0.00120
재고자산회전율(비율),0.00000
총자산회전율(비율),0.85466
투자비 대비 매출 (Rev/GI)s,0.00366


In [23]:
cor = pd.merge(cor, pear, how='left', left_index=True, right_index=True).sort_values(by='pearson', ascending=True)
cor

Unnamed: 0,EV_EBITDA,pearson
매출액증가율(YoY)(연도),0.71562,0.00000
"BPS(지배, Adj.)(원/주)",0.71313,0.00000
재고자산회전율(비율),0.66231,0.00000
BS_Construction in Progress - Gross,-0.66142,0.00000
"FS_Dividend Yield - Common Stock - Net - Issue Specific - %, TTM.3",-0.65435,0.00000
...,...,...
이자보상율,-0.01398,0.90524
CF_Foreign Exchange Effects,0.00890,0.93960
"CF_Other Investing Cash Flow Items, Total",-0.00838,0.94309
CF_Accounts Receivable,-0.00838,0.94309


In [24]:
select_cols = cor[abs(cor['EV_EBITDA'])>=0.1].dropna()
print(len(select_cols), len(select_cols)/len(cor))
select_cols.reset_index(inplace=True)
select_cols.rename(columns={'index':'cols','EV_EBITDA':'corr'}, inplace=True)
select_cols['group'] = select_cols['cols'].map(dic_group)
select_cols['importance'] = select_cols['cols'].map(dic_imp)
select_cols

# pvalue 기준으로 clustering하기
# hi->상관분석




104 0.7761194029850746


Unnamed: 0,cols,corr,pearson,group,importance
0,매출액증가율(YoY)(연도),0.71562,0.00000,성장성,0.00000
1,"BPS(지배, Adj.)(원/주)",0.71313,0.00000,안정성,2.00000
2,재고자산회전율(비율),0.66231,0.00000,활동성,2.00000
3,BS_Construction in Progress - Gross,-0.66142,0.00000,BS,5.00000
4,FS_Dividend Yield - Common Stock - Net - Issue...,-0.65435,0.00000,,
...,...,...,...,...,...
99,CF_Unusual Items,-0.12516,0.28464,CF,5.00000
100,자본금(천원),-0.11723,0.31651,안정성,2.00000
101,FS_Free Cash Flow Net of Dividends,-0.11440,0.32842,FS,5.00000
102,CF_Cash from Financing Activities,0.10747,0.35877,CF,5.00000


In [25]:

C = ['매출액증가율(YoY)(연도)', 'Gross Investment (GI)','무형자산회전율s','매출총이익(천원)','무형자산']

set(C) & set(select_cols.index)


set()

## 파일 내보내기

In [26]:
# cor=0.1기준으로 살아남은 컬럼 리스트
select_cols.to_excel(os.path.join(result_path,"backup",company+"_corr.xlsx"), index=False)

In [27]:
# cor=0.1기준 drop된 컬럼 리스트
drop_cols = cor[~cor.index.isin(select_cols.cols.values.tolist())].reset_index()
drop_cols.rename(columns={'index':'cols'}, inplace=True)
drop_cols

Unnamed: 0,cols,EV_EBITDA,pearson
0,FS_Earnings Retention Rate,-0.09241,0.43036
1,BS_Short Term Investments,-0.08887,0.44835
2,BS_Other Payables,0.08773,0.45419
3,BS_Discount Rate - Domestic,-0.08396,0.47388
4,IS_Deferred Tax - Foreign,-0.07962,0.49712
5,현금배당성향(%),0.06883,0.55733
6,IS_Domestic Pension Plan Expense,-0.06709,0.5674
7,"BS_Curr Derivative Liab. Hedging, Suppl.",0.06138,0.60089
8,"CF_Issuance (Retirement) of Stock, Net",0.04854,0.67923
9,IS_Investment Income - Non-Operating,0.04412,0.70703


In [28]:
drop_cols.to_excel(os.path.join(result_path,company+"_cor0.1_dropcols.xlsx"), index=False)

# TCS

In [29]:
df = pd.read_csv(os.path.join(file_path,file_lst[2]))
company = file_lst[2].split("_")[0]
company

'TCS'

In [30]:
final_cols = pd.read_csv(os.path.join(result_path,company+"_final_cols.csv"))['cols'].values.tolist()

In [31]:

### Y과 X들 간 corr 값 계산
target = 'EV_EBITDA'

Y = df[[target]]
X = df[df.columns.difference(['Date_x'])]

cor = X[list(set(final_cols)|set([target]))].corr(method='pearson')[[target]].sort_values(by=target, ascending=False, key=abs)
cor = cor.loc[cor.index[cor.index != target],:]
cor

Unnamed: 0,EV_EBITDA
"BPS(지배, Adj.)(원/주)",0.79911
"FS_Free Cash Flow Yield - %, TTM",-0.73477
"FS_Dividend Yield - Common Stock - Net - Issue Specific - %, TTM",-0.63767
ROGI(%),0.63102
투자비 대비 매출 (Rev/GI)s,0.61075
...,...
"BS_Other Equity, Total",0.02186
FS_Cash & Cash Equivalents,0.02091
IS_Total Pension Expense,0.01765
CF_Purchase of Investments,-0.01761


In [32]:
pear = pd.DataFrame({'cols':[],'pearson':[]})
for _, col in enumerate(X.columns):
    pear.loc[_,'cols'] = col
    pearson = stats.pearsonr(X[col], X['EV_EBITDA'])[1]
    pear.loc[_, 'pearson'] = pearson
    
pear.set_index('cols', inplace=True)
pear.index.name = None
pear
# 0.05보다 작을 시 대립가설 기각(상관성이 유의함)

Unnamed: 0,pearson
"BPS(지배, Adj.)(원/주)",0.00000
BPS증가율(YoY)(비율),0.00081
BS_Accounts Payable,0.00460
"BS_Accounts Receivable - Trade, Gross",0.00173
"BS_Accounts Receivable - Trade, Net",0.00068
...,...
재고자산회전율(비율),0.39083
채무총계,0.00001
총자산회전율(비율),0.16661
투자비 대비 매출 (Rev/GI)s,0.00000


In [33]:
cor = pd.merge(cor, pear, how='left', left_index=True, right_index=True).sort_values(by='pearson', ascending=True)
cor

Unnamed: 0,EV_EBITDA,pearson
"BPS(지배, Adj.)(원/주)",0.79911,0.00000
"FS_Free Cash Flow Yield - %, TTM",-0.73477,0.00000
"FS_Dividend Yield - Common Stock - Net - Issue Specific - %, TTM",-0.63767,0.00000
ROGI(%),0.63102,0.00000
투자비 대비 매출 (Rev/GI)s,0.61075,0.00000
...,...,...
"BS_Other Equity, Total",0.02186,0.86608
FS_Cash & Cash Equivalents,0.02091,0.87182
IS_Total Pension Expense,0.01765,0.89173
CF_Purchase of Investments,-0.01761,0.89193


In [34]:
select_cols = cor[abs(cor['EV_EBITDA'])>=0.1].dropna()
print(len(select_cols), len(select_cols)/len(cor))
select_cols.reset_index(inplace=True)
select_cols.rename(columns={'index':'cols','EV_EBITDA':'corr'}, inplace=True)
select_cols['group'] = select_cols['cols'].map(dic_group)
select_cols['importance'] = select_cols['cols'].map(dic_imp)
select_cols

# pvalue 기준으로 clustering하기
# hi->상관분석




128 0.8205128205128205


Unnamed: 0,cols,corr,pearson,group,importance
0,"BPS(지배, Adj.)(원/주)",0.79911,0.00000,안정성,2.00000
1,"FS_Free Cash Flow Yield - %, TTM",-0.73477,0.00000,FS,5.00000
2,FS_Dividend Yield - Common Stock - Net - Issue...,-0.63767,0.00000,FS,5.00000
3,ROGI(%),0.63102,0.00000,수익성,0.00000
4,투자비 대비 매출 (Rev/GI)s,0.61075,0.00000,성장성,0.00000
...,...,...,...,...,...
123,FS_Dividend Coverage - %,-0.11291,0.38223,FS,5.00000
124,재고자산회전율(비율),-0.11090,0.39083,활동성,2.00000
125,IS_Investment Income - Non-Operating,0.10966,0.39620,IS,5.00000
126,"IS_Other, Net",-0.10896,0.39921,IS,5.00000


In [35]:
select_cols['cols'].values.tolist()

['BPS(지배, Adj.)(원/주)',
 'FS_Free Cash Flow Yield - %, TTM',
 'FS_Dividend Yield - Common Stock - Net - Issue Specific - %, TTM',
 'ROGI(%)',
 '투자비 대비 매출 (Rev/GI)s',
 'BS_Income Taxes Payable',
 'BS_Pension Benefits - Underfunded',
 'BS_Total Common Shares Outstanding',
 'IS_Interest Expense, Net Non-Operating',
 'BS_Intangibles, Net',
 '부채총계(천원)',
 'BS_Intangibles - Gross',
 'IS_Amort of Intangibles, Supplemental',
 'IS_Special DPS - Common Stock Primary Issue',
 '채무총계',
 '자본금(천원)',
 'CF_Purchase/Acquisition of Intangibles',
 'CF_Acquisition of Business',
 'FS_Pretax ROE - %, TTM',
 '무형자산회전율s',
 '매출총이익(천원)',
 'BS_Land/Improvements - Gross',
 '매출액(천원)',
 '유형자산감상비 누계',
 'BS_Restricted Cash - Current',
 'BS_Other Property/Plant/Equipment - Gross',
 'BS_Notes Receivable - Short Term',
 'BS_Other Long Term Liabilities',
 'CF_Long Term Debt, Net',
 'BS_Other Current liabilities, Total',
 'Opex Margin',
 'BS_Prepaid Expenses',
 'CF_Issuance (Retirement) of Debt, Net',
 'BPS증가율(YoY)(비율)',
 'CF

In [36]:

C = ['매출액증가율(YoY)(연도)', 'Gross Investment (GI)','무형자산회전율s','매출총이익(천원)','무형자산']

set(C) & set(select_cols.index)


set()

## 파일 내보내기

In [37]:
# cor=0.1기준으로 살아남은 컬럼 리스트
select_cols.to_excel(os.path.join(result_path,"backup",company+"_corr.xlsx"), index=False)

In [38]:
# cor=0.1기준 drop된 컬럼 리스트
drop_cols = cor[~cor.index.isin(select_cols.cols.values.tolist())].reset_index()
drop_cols.rename(columns={'index':'cols'}, inplace=True)
drop_cols

Unnamed: 0,cols,EV_EBITDA,pearson
0,IS_Gain (Loss) on Sale of Assets,0.09922,0.44293
1,BS_Deferred Charges,-0.0976,0.45044
2,BS_Deferred Income Tax - LT Liability,0.09094,0.48211
3,유동비율(비율),-0.0872,0.50037
4,BS_Construction in Progress - Gross,-0.08655,0.50358
5,CF_Sale of Fixed Assets,-0.08326,0.51999
6,BS_Dividends Payable,-0.08299,0.52133
7,FS_Dividend Payout Ratio - %,0.08154,0.52869
8,FS_Earnings Retention Rate,-0.08105,0.53114
9,순이익률(비율),0.06832,0.59776


In [39]:
drop_cols.to_excel(os.path.join(result_path,company+"_cor0.1_dropcols.xlsx"), index=False)