# Environmental Data Preprocessing

In [1]:
### package
import os 
import pandas as pd
import numpy as np
import copy
os.chdir("C:/Users/user/Desktop/ESG데이터/E")

### 데이터 로딩

In [2]:
### ESG
raw_esg = pd.read_excel("ESG_DATA.xlsx",converters={'종목코드':str})

### DART
raw_chief = pd.read_excel("DART임원_2021.xlsx",converters={'종목코드':str})
raw_emp = pd.read_excel("DART직원_2021.xlsx",converters={'종목코드':str})

### ESG임원관련(금융회사지배구조)
raw_fgover = pd.read_excel("금융지배구조_2021.xlsx")

### 지속가능경영보고서
raw_report1 = pd.read_excel("DART지가경공시_2021.xlsx")
raw_report2 = pd.read_excel("KSA지가경_2021.xlsx")

### CDP
raw_cdp = pd.read_excel("CDP1_2021.xlsx")

### ESG채권
raw_bond1 = pd.read_excel("국내사회적채권_2021.xlsx")
raw_bond2 = pd.read_excel("국외사회적채권_2021.xlsx") 

### 대기오염물질
raw_pol = pd.read_excel("대기오염물질2y_2021.xlsx",converters={'종목코드':str})

### 온실가스 & 에너지
raw_ngms = pd.read_excel("온실가스2y_2021.xlsx",converters={'종목코드':str})

### 지속가능경영보고서 환경인증
raw_env1 = pd.read_excel("환경1_2021.xlsx")

## 지속가능경영보고서 환경데이터
raw_env2 = pd.read_excel("환경2_2021.xlsx",converters={'종목코드':str})

## 지속가능경영보고서 환경데이터 (지주사)
raw_env3 = pd.read_excel("환경3_2021.xlsx",converters={'종목코드':str})

### 환경마크인증
raw_mark = pd.read_excel("환경마크인증2y_2021.xlsx",converters={'종목코드':str})

### 환경부 녹색기업
raw_envcomp = pd.read_excel("환경부녹색기업3y_2021.xlsx",converters={'종목코드':str})

### 환경정보공개시스템
raw_envdata = pd.read_excel("환경정보_2020.xlsx",converters={'종목코드':str})

### 환경법규위반
raw_envill = pd.read_excel("환경법규위반_2021.xlsx",converters={'종목코드':str})

### 분석대상 key id

In [3]:
### 분석대상 key 변수 : 분석대상에서 SBS미디어홀딩스, 에스케이머티리얼즈 제외
raw_esg = raw_esg[-raw_esg['종목코드'].isin(['101060','036490'])]
code = list(raw_esg['종목코드'])
corp = list(raw_esg['회사명'])
idx_esg = raw_esg[['종목코드','회사명']]
sector_esg = raw_esg[['종목코드','회사명','SICS_Sector_Kr']]

### Social 데이터 분석대상 Filtering

In [4]:
### ESG임원 및 환경관련임원
chief = raw_chief[raw_chief['종목코드'].isin(code)]
fgover = pd.merge(left = raw_fgover, right = idx_esg, how = 'left', on = '회사명')

### 지속가능경영보고서
report1 = pd.merge(left = raw_report1, right = idx_esg, how = 'left', on = '회사명')
report2 = pd.merge(left = raw_report2, right = idx_esg, how = 'inner', on = '회사명')

### ESG채권
bond1 = pd.merge(left = raw_bond1, right = idx_esg, how = 'left', on = '회사명')
bond2 = pd.merge(left = raw_bond2, right = idx_esg, how = 'left', on = '회사명')

### CDP
cdp = pd.merge(left = raw_cdp, right = idx_esg, how = 'left', on = '회사명')

### 지속가능경영보고서 환경
env1 = pd.merge(left = raw_env1, right = idx_esg, how = 'left', on = '회사명')

### 지속가능경영보고서 환경데이터
env2 = copy.deepcopy(raw_env2)

### 지속가능경영보고서 환경데이터 : 지주사
env3 = copy.deepcopy(raw_env3)

### Data Preprocessing

In [5]:
### DART : 임원

## 등기임원 추출
r_chief = chief[-chief['등기임원여부'].isin(['미등기임원','미등기'])]

## 총임원
total_chief = r_chief.groupby('종목코드').size().reset_index().rename(columns = {0:'총이사'})

## ESG관련 이사 수 : ESG, 지속가능, 미래전략, 지속경영 키워드가 관련된 ESG이사 추출
chief_esg = list(r_chief['담당업무'])
for i in range(r_chief.shape[0]) :
    if any(word in chief_esg[i] for word in ['ESG','esg','지속가능','미래전략','지속경영']) :
        chief_esg[i] = 1
    else : chief_esg[i] = 0
        
c = pd.DataFrame({'종목코드':list(r_chief['종목코드']),
                  'ESG이사':chief_esg})
esg_chief = c.groupby('종목코드')['ESG이사'].sum().reset_index()

## 환경관련 이사 여부 : 환경 키워드가 관련된 ESG이사 추출 (미등기임원 포함)
chief_env = list(chief['담당업무'])
for i in range(chief.shape[0]) :
    if any(word in chief_env[i] for word in ['환경']) :
        chief_env[i] = 1
    else : chief_env[i] = 0
        
c = pd.DataFrame({'종목코드':list(chief['종목코드']),
                  '환경이사':chief_env})
env_chief = c[c['환경이사'] == 1].drop_duplicates("종목코드", keep = 'first')

## 데이터 merge
new_chief = pd.merge(left = total_chief, right = esg_chief, how = 'left', on = '종목코드')
new_chief['ESG임원비율'] = new_chief['ESG이사']/new_chief['총이사']
fgover[['종목코드','ESG임원비율']]
new_chief = new_chief.drop(['총이사','ESG이사'],axis=1)
new_chief = pd.concat([fgover[['종목코드','ESG임원비율']],new_chief]).drop_duplicates("종목코드",keep='first')
new_chief = pd.merge(left = new_chief, right = env_chief, how = 'left', on = '종목코드').fillna(0)

### DART : 직원
emp = raw_emp[raw_emp['종목코드'].isin(code)]
emp = pd.concat([emp.iloc[:,:6],emp.iloc[:,6:].replace("-",0)],axis = 1)

to_emp = emp[emp['사업부문'].str.contains('합계')|emp['사업부문'].str.contains('총계')].reset_index()
rm_emp = emp[-emp['종목코드'].isin(to_emp['종목코드'])].reset_index()

emp1 = to_emp.groupby('종목코드')['합계'].sum().reset_index()
emp2 = rm_emp.groupby('종목코드')['합계'].sum().reset_index()
new_emp = pd.concat([emp1, emp2])

### CDP 데이터
r_cdp = cdp[[2021,'종목코드','scope3보고 수']]
r_cdp = r_cdp.drop(r_cdp[r_cdp[2021]=='응답'].index, axis=0).reset_index(drop=True)
r_cdp['CDP참여'] = 1

## CDP참여기업
new_cdp = r_cdp[['종목코드','CDP참여']]

### 지속가능경영보고서
report1['지가경공시'] = 1
report2['지가경발간'] = 1
report = pd.merge(left = report2[['종목코드','지가경발간']], 
                  right = report1[['종목코드','지가경공시']], how = 'left', on = '종목코드')
new_report = report.fillna(0).drop_duplicates('종목코드',keep='first')

### ESG채권
bond = pd.concat([bond1[bond1['채권종류'] != "사회적채권"]['종목코드'],bond2[bond2['채권유형'] != "소셜본드"]['종목코드']]).reset_index()
bond['ESG채권'] = 1
new_bond = bond.drop_duplicates("종목코드",keep ='first')[['종목코드','ESG채권']]

### ISO14001인증
new_iso14001 = env1[['종목코드','ISO14001']]

### ISO50001인증
new_iso50001 = env1[env1['ISO50001'] == 1][['종목코드','ISO50001']]

### 녹색기업선정
raw_envcomp[['녹색기업']] = 1
new_envcomp = raw_envcomp[['종목코드','녹색기업']]
new_envcomp = new_envcomp.drop_duplicates("종목코드", keep = 'first')

### 환경법규위반
new_envill = raw_envill.groupby("종목코드").size().reset_index().rename(columns = {0:'환경법규위반횟수'})

### 환경마크인증여부
raw_mark['환경마크인증'] = 1
new_mark = raw_mark.drop_duplicates("종목코드",keep = 'first')[['종목코드','환경마크인증']]

In [6]:
### 온실가스 Case1. 2020년도 데이터와 2021년도 데이터가 모두 있는 경우
gas = copy.deepcopy(raw_ngms)
gas = gas[['종목코드','대상연도','온실가스 배출량(tCO2-eq)']].groupby(['종목코드','대상연도'])['온실가스 배출량(tCO2-eq)'].sum().reset_index()

## NGMS온실가스 데이터 (모든 NA제외)
corp_gas = gas.iloc[:,[0,1,2]].rename(columns = {'온실가스 배출량(tCO2-eq)':'온실가스'})
gas2020 = corp_gas[corp_gas['대상연도'] == 2020].rename(columns = {'온실가스':'온실가스2020'}).drop("대상연도", axis = 1)
gas2021 = corp_gas[corp_gas['대상연도'] == 2021].rename(columns = {'온실가스':'온실가스2021'}).drop("대상연도", axis = 1)
gas1 = pd.merge(left = gas2020, right = gas2021, how = 'left', on = '종목코드')
all_gas1 = gas1[-(gas1['온실가스2021'].isna())|(gas1['온실가스2020'].isna())]

## 지속가능경영보고서 데이터 1
gas2 = env2[['종목코드','온실가스2020','온실가스2021']]
all_gas2 = gas2[(gas2['온실가스2020'] != '-') | (gas2['온실가스2021'] != '-')]
all_gas2 = all_gas2.astype({'온실가스2020':float, '온실가스2021':float})

## case1 merge (지주사 제외)
all_gas12 = pd.concat([all_gas1,all_gas2]).drop_duplicates('종목코드',keep = 'first')
all_gas12 = pd.merge(left = all_gas12, right = new_emp, how = 'left', on = '종목코드')
all_gas12['온실가스증감률'] = all_gas12['온실가스2021']/all_gas12['온실가스2020']
all_gas12['1인당온실가스'] = all_gas12['온실가스2021']/all_gas12['합계']

## 지속가능경영보고서 데이터 2 : 지주사 (지주사는 전체 그룹직원으로 나눔)
all_gas3 = env3[['종목코드','온실가스2020','온실가스2021','2021직원']]
all_gas3 = all_gas3[all_gas3['온실가스2020'] != '-']
all_gas3 = all_gas3.astype({'온실가스2020':float, '온실가스2021':float})
all_gas3['온실가스증감률'] = all_gas3['온실가스2021']/all_gas3['온실가스2020']
all_gas3['1인당온실가스'] = all_gas3['온실가스2021']/all_gas3['2021직원']

## case1 merge
all_gas = pd.concat([all_gas12[['종목코드','온실가스증감률','1인당온실가스']],all_gas3[['종목코드','온실가스증감률','1인당온실가스']]])

### 온실가스 Case2. 2020년도 데이터만 있고 2021년도 데이터가 없으며, 2019년도 대비 2020년 비율 데이터가 존재하는 경우

## 환경정보공개시스템
gas4 = raw_envdata[['종목코드','온실가스2020','온실가스증감비율2020']].rename(columns = {'온실가스증감비율2020':'온실가스증감률2020'})
gas4 = pd.merge(left = gas4, right = new_emp, how = 'left', on = '종목코드')

na_gas1 = gas4[gas4['온실가스증감률2020'] != float(np.inf)]
na_gas1['온실가스증감률2020'] = na_gas1['온실가스증감률2020']/100
na_gas1 = na_gas1.rename(columns = {'온실가스증감률2020':'온실가스증감률'})
na_gas1['온실가스2021'] = na_gas1['온실가스2020'] * na_gas1['온실가스증감률']
na_gas1['1인당온실가스'] = na_gas1['온실가스2021']/na_gas1['합계']
na_gas1 = na_gas1[['종목코드','온실가스증감률','1인당온실가스']]

### 온실가스 Case3. 2021년도에만 데이터가 있는 경우
gas5 = pd.merge(left = gas2020, right = gas2021, how = 'outer', on = '종목코드')
na_gas2 = gas5[gas5['온실가스2020'].isna()]
na_gas2 = pd.merge(left = na_gas2, right = new_emp, how = 'left', on = '종목코드')
na_gas2['1인당온실가스'] = na_gas2['온실가스2021']/na_gas2['합계']
na_gas2['온실가스증감률'] = float(np.NaN)
na_gas2 = na_gas2[['종목코드','온실가스증감률','1인당온실가스']]

### 온실가스 데이터 merge
new_gas = pd.concat([all_gas,na_gas1,na_gas2]).drop_duplicates("종목코드",keep = 'first')

### 온실가스 데이터 imputation : 결측치는 train set에서의 95% quantile 값으로 대체
new_gas = pd.merge(left = raw_esg[['종목코드','SICS_Sector_Kr']], right = new_gas, how = 'left', on = '종목코드')

## train, test split
train_idx = raw_esg.iloc[:661,1]
train1_q95 = new_gas[new_gas['종목코드'].isin(train_idx)].groupby('SICS_Sector_Kr')['온실가스증감률'].quantile(q=0.95).to_dict()
train2_q95 = new_gas[new_gas['종목코드'].isin(train_idx)].groupby('SICS_Sector_Kr')['1인당온실가스'].quantile(q=0.95).to_dict()

new_gas['온실가스증감률'] = new_gas['온실가스증감률'].fillna(pd.Series([train1_q95[x] for x in new_gas['SICS_Sector_Kr']]))
new_gas['1인당온실가스'] = new_gas['1인당온실가스'].fillna(pd.Series([train2_q95[x] for x in new_gas['SICS_Sector_Kr']]))
new_gas = new_gas.drop("SICS_Sector_Kr", axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  na_gas1['온실가스증감률2020'] = na_gas1['온실가스증감률2020']/100


In [7]:
### 에너지 Case1. 2020년도 데이터와 2021년도 데이터가 모두 있는 경우
ener = copy.deepcopy(raw_ngms)
ener = ener[['종목코드','대상연도','에너지 사용량(TJ)']].groupby(['종목코드','대상연도'])['에너지 사용량(TJ)'].sum().reset_index()

## NGMS에너지 데이터 (모든 NA제외)
corp_ener = ener.iloc[:,[0,1,2]].rename(columns = {'에너지 사용량(TJ)':'에너지'})
ener2020 = corp_ener[corp_ener['대상연도'] == 2020].rename(columns = {'에너지':'에너지2020'}).drop("대상연도", axis = 1)
ener2021 = corp_ener[corp_ener['대상연도'] == 2021].rename(columns = {'에너지':'에너지2021'}).drop("대상연도", axis = 1)
ener1 = pd.merge(left = ener2020, right = ener2021, how = 'left', on = '종목코드')
all_ener1 = ener1[-(ener1['에너지2021'].isna())|(ener1['에너지2020'].isna())]

## 지속가능경영보고서 데이터 1
ener2 = env2[['종목코드','에너지2020','에너지2021']]
all_ener2 = ener2[(ener2['에너지2020'] != '-') & (ener2['에너지2021'] != '-')]
all_ener2 = all_ener2.astype({'에너지2020':float, '에너지2021':float})

## case1 merge (지주사 제외)
all_ener12 = pd.concat([all_ener1,all_ener2]).drop_duplicates('종목코드',keep = 'first')
all_ener12 = pd.merge(left = all_ener12, right = new_emp, how = 'left', on = '종목코드')
all_ener12['에너지증감률'] = all_ener12['에너지2021']/all_ener12['에너지2020']
all_ener12['1인당에너지'] = all_ener12['에너지2021']/all_ener12['합계']

## 지속가능경영보고서 데이터 2 : 지주사 (지주사는 전체 그룹직원으로 나눔)
all_ener3 = env3[['종목코드','에너지2020','에너지2021','2021직원']]
all_ener3 = all_ener3[all_ener3['에너지2020'] != '-']
all_ener3 = all_ener3.astype({'에너지2020':float, '에너지2021':float})
all_ener3['에너지증감률'] = all_ener3['에너지2021']/all_ener3['에너지2020']
all_ener3['1인당에너지'] = all_ener3['에너지2021']/all_ener3['2021직원']

### 에너지 Case2. 2021년도에만 데이터가 있는 경우
ener4 = pd.merge(left = ener2020, right = ener2021, how = 'outer', on = '종목코드')
na_ener1 = ener4[ener4['에너지2020'].isna()]
na_ener1 = pd.merge(left = na_ener1, right = new_emp, how = 'left', on = '종목코드')
na_ener1['1인당에너지'] = na_ener1['에너지2021']/na_ener1['합계']
na_ener1['에너지증감률'] = float(np.NaN)
na_ener1 = na_ener1[['종목코드','에너지증감률','1인당에너지']]

## merge
new_ener = pd.concat([all_ener12[['종목코드','에너지증감률','1인당에너지']],all_ener3[['종목코드','에너지증감률','1인당에너지']],na_ener1])

### 에너지 데이터 imputation : 결측치는 train set에서의 95% quantile 값으로 대체
new_ener = pd.merge(left = raw_esg[['종목코드','SICS_Sector_Kr']], right = new_ener, how = 'left', on = '종목코드')

## train, test split
train_idx = raw_esg.iloc[:661,1]
train1_975 = new_ener[new_ener['종목코드'].isin(train_idx)].groupby('SICS_Sector_Kr')['에너지증감률'].quantile(q=0.95).to_dict()
train2_975 = new_ener[new_ener['종목코드'].isin(train_idx)].groupby('SICS_Sector_Kr')['1인당에너지'].quantile(q=0.95).to_dict()

new_ener['에너지증감률'] = new_ener['에너지증감률'].fillna(pd.Series([train1_975[x] for x in new_ener['SICS_Sector_Kr']]))
new_ener['1인당에너지'] = new_ener['1인당에너지'].fillna(pd.Series([train2_975[x] for x in new_ener['SICS_Sector_Kr']]))
new_ener = new_ener.drop("SICS_Sector_Kr", axis = 1)

In [8]:
### 대기오염

## 지속가능경영보고서 데이터 1
pol1 = env2[['종목코드','대기오염2020','대기오염2021']]
pol1 = pol1[(pol1['대기오염2020'] != "-") & (pol1['대기오염2021'] != "-")]
pol1 = pol1.astype({'대기오염2020':float, '대기오염2021':float})
pol1 = pd.merge(left = pol1, right = new_emp, how = 'left', on = '종목코드')

# case 1. 대기오염이 둘 다 0인 경우
no_pol1 = pol1[(pol1['대기오염2020'] == 0) & (pol1['대기오염2021'] == 0)]
no_pol1['대기오염증감률'] = 0
no_pol1['1인당대기오염'] = 0
no_pol1 = no_pol1[['종목코드','대기오염증감률','1인당대기오염']]

# case 2. 대기오염이 2020년에는 0, 2021년에 생긴 경우 (1.5부여)
no_pol2 = pol1[(pol1['대기오염2020'] == 0) & (pol1['대기오염2021'] != 0)]
no_pol2['대기오염증감률'] = 1.5
no_pol2['1인당대기오염'] = no_pol2['대기오염2021']/no_pol2['합계']
no_pol2 = no_pol2[['종목코드','대기오염증감률','1인당대기오염']]

# case 3. 대기오염이 둘 다 있는 경우
yes_pol1 = pol1[-((pol1['종목코드'].isin(no_pol1['종목코드'])) |(pol1['종목코드'].isin(no_pol2['종목코드'])))]
yes_pol1['대기오염증감률'] = yes_pol1['대기오염2021']/yes_pol1['대기오염2020']
yes_pol1['1인당대기오염'] = yes_pol1['대기오염2021']/yes_pol1['합계']
yes_pol1 = yes_pol1[['종목코드','대기오염증감률','1인당대기오염']]

## 지속가능경영보고서 데이터 2 : 지주사 (지주사는 전체 그룹직원으로 나눔)
yes_pol2 = env3[['종목코드','대기오염2020','대기오염2021','2021직원']]
yes_pol2 = yes_pol2[yes_pol2['대기오염2020'] != "-"]
yes_pol2 = yes_pol2.astype({'대기오염2020':float, '대기오염2021':float})

yes_pol2['대기오염증감률'] = yes_pol2['대기오염2021']/yes_pol2['대기오염2020']
yes_pol2['1인당대기오염'] = yes_pol2['대기오염2021']/yes_pol2['2021직원']
yes_pol2 = yes_pol2.iloc[:,[0,4,5]]

## 환경정보공개시스템
pol3 = raw_envdata[['종목코드','대기오염물질2020','대기오염물질증감비율2020']].rename(columns = {'대기오염물질2020':'대기오염2020','대기오염물질증감비율2020':'대기오염증감률2020'})
pol3 = pd.merge(left = pol3, right = new_emp, how = 'left', on = '종목코드')

na_pol3 = pol3[pol3['대기오염증감률2020'] != float(np.inf)]
na_pol3['대기오염증감률2020'] = na_pol3['대기오염증감률2020']/100
na_pol3 = na_pol3.rename(columns = {'대기오염증감률2020':'대기오염증감률'})
na_pol3['대기오염2021'] = na_pol3['대기오염2020'] * na_pol3['대기오염증감률']
na_pol3['1인당대기오염'] = na_pol3['대기오염2021']/na_pol3['합계']
na_pol3 = na_pol3[['종목코드','대기오염증감률','1인당대기오염']]

### 대기오염 merge
all_pol = pd.concat([no_pol1,no_pol2,yes_pol1,yes_pol2,na_pol3]).drop_duplicates("종목코드",keep='first')

### 대기오염 데이터 imputation
all_pol = pd.merge(left = raw_esg[['종목코드','SICS_Sector_Kr']], right = all_pol, how = 'left', on = '종목코드')
all_pol['대기오염증감률'] = all_pol['대기오염증감률'].fillna(all_pol.groupby('SICS_Sector_Kr')['대기오염증감률'].transform(lambda x: x.quantile(q=0.75)))
all_pol['1인당대기오염'] = all_pol['1인당대기오염'].fillna(all_pol.groupby('SICS_Sector_Kr')['1인당대기오염'].transform(lambda x: x.quantile(q=0.75)))
new_pol = all_pol.drop("SICS_Sector_Kr",axis=1)

### 대기오염 데이터 imputation : 결측치는 train set에서의 95% quantile 값으로 대체
new_pol = pd.merge(left = raw_esg[['종목코드','SICS_Sector_Kr']], right = new_pol, how = 'left', on = '종목코드')

## train, test split
train_idx = raw_esg.iloc[:661,1]
train1_q95 = new_pol[new_pol['종목코드'].isin(train_idx)].groupby('SICS_Sector_Kr')['대기오염증감률'].quantile(q=0.95).to_dict()
train2_q95 = new_pol[new_pol['종목코드'].isin(train_idx)].groupby('SICS_Sector_Kr')['1인당대기오염'].quantile(q=0.95).to_dict()

new_pol['대기오염증감률'] = new_pol['대기오염증감률'].fillna(pd.Series([train1_q95[x] for x in new_pol['SICS_Sector_Kr']]))
new_pol['1인당대기오염'] = new_pol['1인당대기오염'].fillna(pd.Series([train2_q95[x] for x in new_pol['SICS_Sector_Kr']]))
new_pol = new_pol.drop("SICS_Sector_Kr", axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_pol1['대기오염증감률'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_pol1['1인당대기오염'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_pol2['대기오염증감률'] = 1.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the ca

In [9]:
### 용수

## 지속가능경영보고서 데이터 1
wat1 = env2[['종목코드','용수2020','용수2021']]
wat1 = wat1[(wat1['용수2020'] != "-") & (wat1['용수2021'] != "-")]
wat1 = wat1.astype({'용수2020':float, '용수2021':float})
wat1 = pd.merge(left = wat1, right = new_emp, how = 'left', on = '종목코드')

# case 1. 용수가 2020년에는 0, 2021년에 생긴 경우 (1.5부여)
no_wat1 = wat1[(wat1['용수2020'] == 0) & (wat1['용수2021'] != 0)]
no_wat1['용수증감률'] = 1.5
no_wat1['1인당용수'] = no_wat1['용수2021']/no_wat1['합계']
no_wat1 = no_wat1[['종목코드','용수증감률','1인당용수']]

# case 2. 용수가 둘 다 있는 경우
yes1_wat1 = wat1[-wat1['종목코드'].isin(no_wat1['종목코드'])]
yes1_wat1['용수증감률'] = yes1_wat1['용수2021']/yes1_wat1['용수2020']
yes1_wat1['1인당용수'] = yes1_wat1['용수2021']/yes1_wat1['합계']
yes1_wat1 = yes1_wat1[['종목코드','용수증감률','1인당용수']]

## 지속가능경영보고서 데이터 2 : 지주사 (지주사는 전체 그룹직원으로 나눔)
yes2_wat1 = env3[['종목코드','용수2020','용수2021','2021직원']]
yes2_wat1 = yes2_wat1[yes2_wat1['용수2020'] != "-"]
yes2_wat1 = yes2_wat1.astype({'용수2020':float, '용수2021':float})

yes2_wat1['용수증감률'] = yes2_wat1['용수2021']/yes2_wat1['용수2020']
yes2_wat1['1인당용수'] = yes2_wat1['용수2021']/yes2_wat1['2021직원']
yes2_wat1 = yes2_wat1.iloc[:,[0,4,5]]

## 환경정보공개시스템
wat2 = raw_envdata[['종목코드','용수2020','용수증감비율2020']].rename(columns = {'용수증감비율2020':'용수증감률2020'})
wat2 = pd.merge(left = wat2, right = new_emp, how = 'left', on = '종목코드')

na_wat2 = wat2[wat2['용수증감률2020'] != float(np.inf)]
na_wat2['용수증감률2020'] = na_wat2['용수증감률2020']/100
na_wat2 = na_wat2.rename(columns = {'용수증감률2020':'용수증감률'})
na_wat2['용수2021'] = na_wat2['용수2020'] * na_wat2['용수증감률']
na_wat2['1인당용수'] = na_wat2['용수2021']/na_wat2['합계']
na_wat2 = na_wat2[['종목코드','용수증감률','1인당용수']]

### 용수 merge
new_wat = pd.concat([no_wat1,yes1_wat1,yes2_wat1,na_wat2]).drop_duplicates("종목코드",keep='first')

### 용수 데이터 imputation : 결측치는 train set에서의 95% quantile 값으로 대체
new_wat = pd.merge(left = raw_esg[['종목코드','SICS_Sector_Kr']], right = new_wat, how = 'left', on = '종목코드')

## train, test split
train_idx = raw_esg.iloc[:661,1]
train1_q95 = new_wat[new_wat['종목코드'].isin(train_idx)].groupby('SICS_Sector_Kr')['용수증감률'].quantile(q=0.95).to_dict()
train2_q95 = new_wat[new_wat['종목코드'].isin(train_idx)].groupby('SICS_Sector_Kr')['1인당용수'].quantile(q=0.95).to_dict()

new_wat['용수증감률'] = new_wat['용수증감률'].fillna(pd.Series([train1_q95[x] for x in new_wat['SICS_Sector_Kr']]))
new_wat['1인당용수'] = new_wat['1인당용수'].fillna(pd.Series([train2_q95[x] for x in new_wat['SICS_Sector_Kr']]))
new_wat1 = new_wat.drop("SICS_Sector_Kr", axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_wat1['용수증감률'] = 1.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_wat1['1인당용수'] = no_wat1['용수2021']/no_wat1['합계']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yes1_wat1['용수증감률'] = yes1_wat1['용수2021']/yes1_wat1['용수2020']
A value is trying to be set on a copy of a slice from a DataFrame.
Try

In [10]:
### 용수재활용

## 지속가능경영보고서 데이터 1
wat3 = env2[['종목코드','용수2021','용수재활용2021']]
wat3 = wat3[wat3['용수재활용2021'] != "-"]
wat3 = wat3.astype({'용수2021':float, '용수재활용2021':float})
wat3['용수재활용비율'] = wat3['용수재활용2021']/wat3['용수2021']
wat3 = wat3[['종목코드','용수재활용비율']]

## 지속가능경영보고서 데이터 2 : 지주사 
wat4 = env3[['종목코드','용수2021','용수재활용2021']]
wat4 = wat4[wat4['용수재활용2021'] != "-"]
wat4 = wat4.astype({'용수2021':float, '용수재활용2021':float})
wat4['용수재활용비율'] = wat4['용수재활용2021']/wat4['용수2021']
wat4 = wat4[['종목코드','용수재활용비율']]

## 환경정보공개시스템
wat5 = raw_envdata[['종목코드','용수2020','용수재활용2020']]
wat5['용수재활용비율'] = wat5['용수재활용2020']/wat5['용수2020']
wat5 = wat5[['종목코드','용수재활용비율']]

## merge
new_wat2 = pd.merge(left = raw_esg['종목코드'], right = pd.concat([wat3,wat4,wat5]).drop_duplicates("종목코드",keep = 'first'),
                    how = 'left',on = '종목코드').fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wat5['용수재활용비율'] = wat5['용수재활용2020']/wat5['용수2020']


In [11]:
### 폐기물

## 지속가능경영보고서 데이터 1
garb1 = env2[['종목코드','폐기물2020','폐기물2021']]
garb1 = garb1[(garb1['폐기물2020'] != "-") & (garb1['폐기물2021'] != "-")]
garb1 = garb1.astype({'폐기물2020':float, '폐기물2021':float})
garb1 = pd.merge(left = garb1, right = new_emp, how = 'left', on = '종목코드')

# case 1. 폐기물이 둘 다 0인 경우
no1_garb1 = garb1[(garb1['폐기물2020'] == 0) & (garb1['폐기물2021'] == 0)]
no1_garb1['폐기물증감률'] = 0
no1_garb1['1인당폐기물'] = 0
no1_garb1 = no1_garb1[['종목코드','폐기물증감률','1인당폐기물']]

# case 2. 폐기물이 2020년에는 0, 2021년에 생긴 경우 (1.5부여)
no2_garb1 = garb1[(garb1['폐기물2020'] == 0) & (garb1['폐기물2021'] != 0)]
no2_garb1['폐기물증감률'] = 1.5
no2_garb1['1인당폐기물'] = no2_garb1['폐기물2021']/no2_garb1['합계']
no2_garb1 = no2_garb1[['종목코드','폐기물증감률','1인당폐기물']]

# case 3. 폐기물이 둘 다 있는 경우
yes1_garb1 = garb1[-((garb1['종목코드'].isin(no1_garb1['종목코드'])) |(garb1['종목코드'].isin(no2_garb1['종목코드'])))]
yes1_garb1['폐기물증감률'] = yes1_garb1['폐기물2021']/yes1_garb1['폐기물2020']
yes1_garb1['1인당폐기물'] = yes1_garb1['폐기물2021']/yes1_garb1['합계']
yes1_garb1 = yes1_garb1[['종목코드','폐기물증감률','1인당폐기물']]

## 지속가능경영보고서 데이터 2 : 지주사 (지주사는 전체 그룹직원으로 나눔)
garb1_2 = env3[['종목코드','폐기물2020','폐기물2021','2021직원']]

# case 1. 폐기물이 2020년에는 0, 2021년에 생긴 경우 (1.5부여)
no3_garb1 = garb1_2[(garb1_2['폐기물2020'] == 0) & (garb1_2['폐기물2021'] != 0)]
no3_garb1['폐기물증감률'] = 1.5
no3_garb1['1인당폐기물'] = no3_garb1['폐기물2021']/no3_garb1['2021직원']
no3_garb1 = no3_garb1[['종목코드','폐기물증감률','1인당폐기물']]

# case 2. 폐기물이 둘 다 있는 경우
yes2_garb1 = garb1_2[-garb1_2['종목코드'].isin(no3_garb1['종목코드'])]
yes2_garb1['폐기물증감률'] = yes2_garb1['폐기물2021']/yes2_garb1['폐기물2020']
yes2_garb1['1인당폐기물'] = yes2_garb1['폐기물2021']/yes2_garb1['2021직원']
yes2_garb1 = yes2_garb1.iloc[:,[0,4,5]]

## 환경정보공개시스템
garb2 = raw_envdata[['종목코드','폐기물2020','폐기물증감비율2020']].rename(columns = {'폐기물증감비율2020':'폐기물증감률2020'})
garb2 = pd.merge(left = garb2, right = new_emp, how = 'left', on = '종목코드')

na_garb2 = garb2[garb2['폐기물증감률2020'] != float(np.inf)]
na_garb2['폐기물증감률2020'] = na_garb2['폐기물증감률2020']/100
na_garb2 = na_garb2.rename(columns = {'폐기물증감률2020':'폐기물증감률'})
na_garb2['폐기물2021'] = na_garb2['폐기물2020'] * na_garb2['폐기물증감률']
na_garb2['1인당폐기물'] = na_garb2['폐기물2021']/na_garb2['합계']
na_garb2 = na_garb2[['종목코드','폐기물증감률','1인당폐기물']]

### 폐기물 merge
new_garb = pd.concat([no1_garb1,no2_garb1,yes1_garb1,no3_garb1,yes2_garb1,na_garb2]).drop_duplicates("종목코드",keep='first')

### 폐기물 데이터 imputation : 결측치는 train set에서의 95% quantile 값으로 대체
new_garb = pd.merge(left = raw_esg[['종목코드','SICS_Sector_Kr']], right = new_garb, how = 'left', on = '종목코드')

## train, test split
train_idx = raw_esg.iloc[:661,1]
train1_q95 = new_garb[new_garb['종목코드'].isin(train_idx)].groupby('SICS_Sector_Kr')['폐기물증감률'].quantile(q=0.95).to_dict()
train2_q95 = new_garb[new_garb['종목코드'].isin(train_idx)].groupby('SICS_Sector_Kr')['1인당폐기물'].quantile(q=0.95).to_dict()

new_garb['폐기물증감률'] = new_garb['폐기물증감률'].fillna(pd.Series([train1_q95[x] for x in new_garb['SICS_Sector_Kr']]))
new_garb['1인당폐기물'] = new_garb['1인당폐기물'].fillna(pd.Series([train2_q95[x] for x in new_garb['SICS_Sector_Kr']]))
new_garb1 = new_garb.drop("SICS_Sector_Kr", axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no1_garb1['폐기물증감률'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no1_garb1['1인당폐기물'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no2_garb1['폐기물증감률'] = 1.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the

In [12]:
### 폐기물재활용

## 지속가능경영보고서 데이터 1
garb3 = env2[['종목코드','폐기물2021','폐기물재활용2021']]
garb3 = garb3[garb3['폐기물재활용2021'] != "-"]
garb3 = garb3.astype({'폐기물2021':float, '폐기물재활용2021':float})
garb3['폐기물재활용비율'] = garb3['폐기물재활용2021']/garb3['폐기물2021']
garb3 = garb3[['종목코드','폐기물재활용비율']]

## 지속가능경영보고서 데이터 2 : 지주사 
garb4 = env3[['종목코드','폐기물2021','폐기물재활용2021']]
garb4 = garb4[garb4['폐기물재활용2021'] != "-"]
garb4 = garb4.astype({'폐기물2021':float, '폐기물재활용2021':float})
garb4['폐기물재활용비율'] = garb4['폐기물재활용2021']/garb4['폐기물2021']
garb4 = garb4[['종목코드','폐기물재활용비율']]

## 환경정보공개시스템
garb5 = raw_envdata[['종목코드','폐기물2020','폐기물재활용2020']]
garb5['폐기물재활용비율'] = garb5['폐기물재활용2020']/garb5['폐기물2020']
garb5 = garb5[['종목코드','폐기물재활용비율']]

## merge
new_garb2 = pd.merge(left = raw_esg['종목코드'], right = pd.concat([garb3,garb4,garb5]).drop_duplicates("종목코드",keep = 'first'),
                     how = 'left',on = '종목코드').fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  garb5['폐기물재활용비율'] = garb5['폐기물재활용2020']/garb5['폐기물2020']


### Environmental Data Total Merge

In [13]:
environmental = pd.merge(left = raw_esg[['종목코드','회사명','SICS_Sector_Kr','E_GRADE']], right = new_wat1, how = 'left', on = '종목코드')
environmental = pd.merge(left = environmental, right = new_wat2, how = 'left', on = '종목코드')
environmental = pd.merge(left = environmental, right = new_ener, how = 'left', on = '종목코드')
environmental = pd.merge(left = environmental, right = new_pol, how = 'left', on = '종목코드')
environmental = pd.merge(left = environmental, right = new_gas, how = 'left', on = '종목코드')
environmental = pd.merge(left = environmental, right = new_garb1, how = 'left', on = '종목코드')
environmental = pd.merge(left = environmental, right = new_garb2, how = 'left', on = '종목코드')
environmental = pd.merge(left = environmental, right = new_cdp, how = 'left', on = '종목코드').fillna(0)
environmental = pd.merge(left = environmental, right = new_envcomp, how = 'left', on = '종목코드').fillna(0)
environmental = pd.merge(left = environmental, right = new_iso14001, how = 'left', on = '종목코드').fillna(0)
environmental = pd.merge(left = environmental, right = new_mark, how = 'left', on = '종목코드').fillna(0)
environmental = pd.merge(left = environmental, right = new_bond, how = 'left', on = '종목코드').fillna(0)
environmental = pd.merge(left = environmental, right = new_envill, how = 'left', on = '종목코드').fillna(0)
environmental = pd.merge(left = environmental, right = new_report, how = 'left', on = '종목코드').fillna(0)
environmental = pd.merge(left = environmental, right = new_chief, how = 'left', on = '종목코드').fillna(0)

### Enviromental Key Risk

In [14]:
### environmental key risk
ekr_base = raw_esg[['종목코드','SICS_Sector_Kr']]

## 지속가능경영보고서 
new_report['지속가능경영보고서'] = (new_report['지가경공시'] + new_report['지가경발간'])/2
ekr_k1 = new_report[['종목코드','지속가능경영보고서']]

## ESG임원비율
ekr_k2 = new_chief[['종목코드','ESG임원비율']]

## 환경법규위반
envill1 = new_envill[new_envill['환경법규위반횟수'] == 1]
envill2 = new_envill[new_envill['환경법규위반횟수'] > 1]
envill1['환경법규위반'] = 0.5
envill2['환경법규위반'] = 0
ekr_k3 = pd.merge(left = ekr_base['종목코드'], right = pd.concat([envill1,envill2])[['종목코드','환경법규위반']], how = 'left', on = '종목코드').fillna(1)

## CDP기업
ekr_k4 = copy.deepcopy(new_cdp)

## ESG채권
ekr_k5 = copy.deepcopy(new_bond)

## 환경경영시스템인증 (지가경)
ekr_k6 = copy.deepcopy(new_iso14001)

## 온실가스

# 온실가스 scope1,2
ekr_k7 = environmental[['종목코드','온실가스증감률']]
ekr_k7.loc[ekr_k7['온실가스증감률'] >= 2,'온실가스증감률'] = 2
ekr_k7['온실가스증감률'] = 1 - ekr_k7['온실가스증감률']/2 

# 온실가스 scope3 기입여부
ekr_k8 = pd.concat([env2[['종목코드','온실가스3_기입여부']],env3[['종목코드','온실가스3_기입여부']]])

## 대기오염물질
ekr_k9 = environmental[['종목코드','대기오염증감률']]
ekr_k9.loc[ekr_k9['대기오염증감률'] >= 2,'대기오염증감률'] = 2
ekr_k9['대기오염증감률'] = 1 - ekr_k9['대기오염증감률']/2 

## 에너지

# 에너지
ekr_k10 = environmental[['종목코드','에너지증감률']]
ekr_k10.loc[ekr_k10['에너지증감률'] >= 2,'에너지증감률'] = 2
ekr_k10['에너지증감률'] = 1 - ekr_k10['에너지증감률']/2 

# ISO50001인증
ekr_k11 = new_iso50001

## 용수
ekr_k12 = environmental[['종목코드','용수증감률']]
ekr_k12.loc[ekr_k12['용수증감률'] >= 2,'용수증감률'] = 2
ekr_k12['용수증감률'] = 1 - ekr_k12['용수증감률']/2

## 용수 재활용량
ekr_k13 = environmental[['종목코드','용수재활용비율']]
ekr_k13.loc[ekr_k13['용수재활용비율'] >= 2,'용수재활용비율'] = 1

## 폐기물
ekr_k14 = environmental[['종목코드','폐기물증감률']]
ekr_k14.loc[ekr_k14['폐기물증감률'] >= 2,'폐기물증감률'] = 2
ekr_k14['폐기물증감률'] = 1 - ekr_k14['폐기물증감률']/2

## 용수 재활용량
ekr_k15 = environmental[['종목코드','폐기물재활용비율']]
ekr_k15.loc[ekr_k15['폐기물재활용비율'] >= 2,'폐기물재활용비율'] = 1

### environmental key risk data set merge 
ekr = pd.merge(left = ekr_base, right = ekr_k1, how = 'left', on = '종목코드')
ekr = pd.merge(left = ekr, right = ekr_k2, how = 'left', on = '종목코드')
ekr = pd.merge(left = ekr, right = ekr_k3, how = 'left', on = '종목코드')
ekr = pd.merge(left = ekr, right = ekr_k4, how = 'left', on = '종목코드')
ekr = pd.merge(left = ekr, right = ekr_k5, how = 'left', on = '종목코드')
ekr = pd.merge(left = ekr, right = ekr_k6, how = 'left', on = '종목코드')
ekr = pd.merge(left = ekr, right = ekr_k7, how = 'left', on = '종목코드')
ekr = pd.merge(left = ekr, right = ekr_k8, how = 'left', on = '종목코드')
ekr = pd.merge(left = ekr, right = ekr_k9, how = 'left', on = '종목코드')
ekr = pd.merge(left = ekr, right = ekr_k10, how = 'left', on = '종목코드')
ekr = pd.merge(left = ekr, right = ekr_k11, how = 'left', on = '종목코드')
ekr = pd.merge(left = ekr, right = ekr_k12, how = 'left', on = '종목코드')
ekr = pd.merge(left = ekr, right = ekr_k13, how = 'left', on = '종목코드')
ekr = pd.merge(left = ekr, right = ekr_k14, how = 'left', on = '종목코드')
ekr = pd.merge(left = ekr, right = ekr_k15, how = 'left', on = '종목코드')
ekr = ekr.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  envill1['환경법규위반'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  envill2['환경법규위반'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ekr_k7['온실가스증감률'] = 1 - ekr_k7['온실가스증감률']/2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value 

In [15]:
### weight matrix
sics = pd.DataFrame({'소비재':[1/10,1/10,1/10,1/10,1/10,1/10,0,0,0,1/10,1/10,1/10,1/10,0,0],
                     '추출물 및 광물 처리':[1/16,1/16,1/16,1/16,1/16,1/16,2/16,2/16,2/16,1/16,1/16,2/16,2/16,2/16,2/16],
                     '금융':[1/6,1/6,1/6,1/6,1/6,1/6,0,0,0,0,0,0,0,0,0],
                     '식음료':[1/20,1/20,1/20,1/20,1/20,1/20,2/20,2/20,0,2/20,2/20,2/20,2/20,1/20,1/20],
                     '헬스케어':[1/12,1/12,1/12,1/12,1/12,1/12,1/12,1/12,0,1/12,1/12,0,0,1/12,1/12],
                     '인프라':[1/15,1/15,1/15,1/15,1/15,1/15,1/15,1/15,1/15,1/15,1/15,1/15,1/15,1/15,1/15],
                     '재생가능 자원 및 대체 에너지':[1/19,1/19,1/19,1/19,1/19,1/19,1/19,1/19,1/19,2/19,2/19,2/19,2/19,1/19,1/19],
                     '자원 변환':[1/19,1/19,1/19,1/19,1/19,1/19,1/19,1/19,1/19,2/19,2/19,1/19,1/19,2/19,2/19],
                     '서비스':[1/10,1/10,1/10,1/10,1/10,1/10,0,0,0,1/10,1/10,1/10,1/10,0,0],
                     '기술 및 통신':[1/16,1/16,1/16,1/16,1/16,1/16,1/16,1/16,0,2/16,2/16,1/16,1/16,1/16,1/16],
                     '운송':[1/16,1/16,1/16,1/16,1/16,1/16,2/16,2/16,2/16,1/16,1/16,0,0,1/16,1/16]})

In [16]:
### key risk : matrix 곱
a = [[0 for col in range(12)] for row in range(ekr.shape[0])]
for i in range(ekr.shape[0]) :
    a[i] = sics[ekr['SICS_Sector_Kr'][i]].tolist()

ekr['E_risk'] = np.diag(np.array(ekr.iloc[:,2:]) @ np.array(a).T)

In [17]:
### modeling에 필요한 data split stratified 작업
environmental = pd.merge(left = environmental, right = ekr[['종목코드','E_risk']], how = 'left', on = '종목코드')
environmental = environmental.fillna(0)

E_group = [0 for col in range(environmental.shape[0])]
for i in range(environmental.shape[0]) :
    if environmental['E_GRADE'].astype(int)[i] in [1,2,3] :
        E_group[i] = "G1"
    elif environmental['E_GRADE'].astype(int)[i] in [4,5,6] :
        E_group[i] = "G2"
    elif environmental['E_GRADE'].astype(int)[i] in [7,8,9] : 
        E_group[i] = "G3"
    else : E_group[i] = "NA"
        
environmental['Stratify'] = environmental['SICS_Sector_Kr'] + "_" + E_group
environmental.to_excel("C:/Users/user/Desktop/ESG데이터/E_final.xlsx",index = False)
environmental.to_excel("C:/Users/user/Desktop/ESG데이터/ESG/E_final.xlsx",index = False)
environmental.to_excel("C:/Users/user/Desktop/ESG데이터/ESGI/E_final.xlsx",index = False)
environmental.to_excel("C:/Users/user/Desktop/ESG데이터/Tableau/E_final.xlsx",index = False)