In [43]:
import pandas as pd
import numpy as np
import os
from scipy.cluster import hierarchy as hc
import matplotlib
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
from scipy import stats
import math

%matplotlib inline
# Silence every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

# Figure text is rendered with the 'Malgun Gothic' font, and
# axes.unicode_minus is switched off alongside it.
matplotlib.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

# Load data

In [44]:
# All project folders are addressed relative to this root.
dir_root = '../../../../../'
# Working directory at the time the cell runs.
current_dir = os.getcwd()

# Build the three folders from their path segments in one pass:
# master data, preprocessed inputs, and the correlation-analysis results.
dir_master, dir_processed, dir_result = (
    os.path.join(dir_root, *segments)
    for segments in (
        ('01. Data', '00. master_data'),
        ('01. Data', '02. 재무지표2', '01. preprocessed'),
        ('04. 상관분석', '02. 재무지표2', '02. result'),
    )
)

In [45]:
# Target label; it is used below as both a folder name and a CSV file stem.
# The label reads "gross profit (thousand KRW)".
y = '매출총이익(천원)'
# The four company identifiers, bound in a single unpacking assignment.
c1, c2, c3, c4 = 'Accenture', 'Infosys', 'TCS', 'Cognizant'

In [46]:
# Two company groups; their concatenation fixes the column order used later.
# NOTE(review): the names suggest out-/under-performers — confirm the criterion.
outperform, lowperform = [c1, c2, c3], [c4]

In [47]:
# Output folder for the cross-company comparison of target `y`.
dir_result_all = os.path.join(dir_result, 'All_ITS', y)

# exist_ok=True creates the folder (and any missing parents) only when needed,
# replacing the separate exists() check, which could race with another process
# creating the same folder between the check and the makedirs call.
os.makedirs(dir_result_all, exist_ok=True)

# total 기간에 대해서, 존재하는 변수 기준으로 inner join

In [48]:
# Read each company's '<y>.csv' from its result folder and keep only the
# first five columns.
# NOTE(review): presumably variable / group / importance / corr / company,
# since those are the columns the following cells use — confirm against the files.
df_c1, df_c2, df_c3, df_c4 = (
    pd.read_csv(os.path.join(dir_result, company, y, y + '.csv')).iloc[:, :5]
    for company in (c1, c2, c3, c4)
)

In [49]:
# For every company frame: rename the 'corr' column to the first distinct
# value found in its 'company' column, then drop 'company'.
# Both steps mutate the frame in place, so re-running this cell without
# reloading the frames fails because 'company' is already gone.
for company_frame in (df_c1, df_c2, df_c3, df_c4):
    company_label = company_frame['company'].unique()[0]
    company_frame.rename(columns={'corr': company_label}, inplace=True)
    company_frame.drop(columns=['company'], inplace=True)

In [50]:
# Inner-join the four company frames left to right on the shared keys, so only
# (variable, group, importance) combinations present in every frame remain.
df_all_variable = (
    df_c1
    .merge(df_c2, on=['variable', 'group', 'importance'], how='inner')
    .merge(df_c3, on=['variable', 'group', 'importance'], how='inner')
    .merge(df_c4, on=['variable', 'group', 'importance'], how='inner')
)

In [51]:
# Fix the column order: join keys first, then the companies in
# outperform-then-lowperform order.
df_all_variable = df_all_variable[
    ['variable', 'group', 'importance', *outperform, *lowperform]
]

# 0.7 이상 상관계수가 하나라도 있으면, 남기도록 outer join

In [53]:
# Re-bind the per-company frames to each company's
# 'compare_period_corr_07.csv', read from the same result folders.
df_c1, df_c2, df_c3, df_c4 = (
    pd.read_csv(os.path.join(dir_result, company, y, 'compare_period_corr_07.csv'))
    for company in (c1, c2, c3, c4)
)

In [54]:
# For every company frame: rename 'corr_total' to the first distinct value in
# its 'company' column, then drop 'corr_sub' and 'company'.
# Both steps mutate the frame in place, so the cell is not re-runnable without
# reloading the frames first.
for company_frame in (df_c1, df_c2, df_c3, df_c4):
    company_label = company_frame['company'].unique()[0]
    company_frame.rename(columns={'corr_total': company_label}, inplace=True)
    company_frame.drop(columns=['corr_sub', 'company'], inplace=True)

In [55]:
# Discard every row that contains a missing value, frame by frame.
df_c1, df_c2, df_c3, df_c4 = (
    company_frame.dropna() for company_frame in (df_c1, df_c2, df_c3, df_c4)
)

In [56]:
# Disabled alternative: outer-join only df_c1, df_c3 and df_c4 (df_c2 left out).
# df_all = pd.merge(pd.merge(df_c1, df_c3, on=['variable','group', 'importance'], how='outer'), 
#                           df_c4, on=['variable','group', 'importance'], how='outer')
                 

In [57]:
# Outer-join the four company frames left to right on the shared keys: a key
# combination present in any frame is kept, and companies without a row for it
# get NaN in their column.
df_all = (
    df_c1
    .merge(df_c2, on=['variable', 'group', 'importance'], how='outer')
    .merge(df_c3, on=['variable', 'group', 'importance'], how='outer')
    .merge(df_c4, on=['variable', 'group', 'importance'], how='outer')
)
# df_all = pd.merge(pd.merge(df_c2, df_c3, on=['variable','group', 'importance'], how='outer'), 
#                           df_c4, on=['variable','group', 'importance'], how='outer')

In [59]:
# Fix the column order: join keys first, then the companies in
# outperform-then-lowperform order.
df_all = df_all[
    ['variable', 'group', 'importance', *outperform, *lowperform]
]

In [61]:
# Disabled alternative: column selection restricted to c1, c3 and c4 (c2 left out).
# df_all = df_all[['variable', 'group', 'importance']+[c1, c3, c4]]

# 위의 두 개 inner join

In [62]:
# Inner-join the two comparison tables on the shared keys. Column names that
# occur in both inputs are disambiguated: those from df_all get "_filtered",
# those from df_all_variable get "_show".
df_all_merged = df_all.merge(
    df_all_variable,
    how='inner',
    on=['variable', 'group', 'importance'],
    suffixes=("_filtered", "_show"),
)

In [63]:
# Disabled: CSV export of the merged table into the 'All_ITS' result folder.
# df_all_merged.to_csv(os.path.join(dir_result, 'All_ITS', 'corr_y_x_over7.csv'), index=False, encoding='utf-8-sig')

In [64]:
# Write the merged table to 'corr_y_x_over7.xlsx' in the comparison folder.
# The context manager saves and closes the workbook on exit, including when
# to_excel raises. It replaces the explicit writer.save() call, which was
# deprecated in pandas 1.5 and removed in pandas 2.0.
with pd.ExcelWriter(os.path.join(dir_result_all, 'corr_y_x_over7.xlsx'), engine='xlsxwriter') as writer:
    df_all_merged.to_excel(writer, index=False)

In [65]:
# Show the merged frame's column labels as the cell output.
df_all_merged.columns

Index(['variable', 'group', 'importance', 'Accenture_filtered',
       'Infosys_filtered', 'TCS_filtered', 'Cognizant_filtered',
       'Accenture_show', 'Infosys_show', 'TCS_show', 'Cognizant_show'],
      dtype='object')

In [66]:
# Display the merged comparison table as the cell output.
df_all_merged

Unnamed: 0,variable,group,importance,Accenture_filtered,Infosys_filtered,TCS_filtered,Cognizant_filtered,Accenture_show,Infosys_show,TCS_show,Cognizant_show
0,FS_Gross Profit - Industrials/Property - Total,FS,5.0,1.000,0.860,1.000,1.000,1.000,0.860,1.000,1.000
1,IS_Gross Profit,IS,5.0,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000
2,FS_Operating Profit before Non-Recurring Incom...,FS,5.0,0.995,0.743,0.984,0.988,0.995,0.743,0.984,0.988
3,IS_Normalized EBIT,IS,5.0,0.995,0.996,0.984,0.988,0.995,0.996,0.984,0.988
4,EBIT(천원),수익성,1.0,0.995,0.996,0.984,0.988,0.995,0.996,0.984,0.988
...,...,...,...,...,...,...,...,...,...,...,...
157,매출액이익률(비율),수익성,0.0,,,,-0.824,0.006,-0.534,0.676,-0.824
158,FS_Gross Profit Margin - %,FS,5.0,,,,-0.824,0.006,-0.534,0.676,-0.824
159,FS_Current Ratio,FS,5.0,,,,-0.855,-0.186,-0.248,0.602,-0.855
160,유동비율(비율),안정성,2.0,,,,-0.855,-0.186,-0.248,0.602,-0.855
