In [1]:
import OpenDartReader
from dotenv import load_dotenv
import os 
import pandas as pd

# Load variables from the .env file (python-dotenv) so the lookups below can see them.
load_dotenv()

# OpenDART client used by the disclosure-download cells further down.
api_key = os.getenv('OpenDartReader_key')
dart = OpenDartReader(api_key)

# Credentials handed to the S3 / Google Sheets / Tableau-DB helper functions.
AWS_KEY, AWS_SECRET = os.getenv('AWS_KEY'), os.getenv('AWS_SECRET')
tableau_key = os.getenv('tableau_red')

In [2]:
# Notebook display limits: up to 500 columns, but only 20 rows per rendered frame.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 20

In [3]:
def readFromPrestoDB(sql_command):
    '''Run a SQL query through Presto (Hive catalog) and return the result.

    Args:
        sql_command: SQL text passed unchanged to ``pandas.read_sql``.

    Returns:
        pandas.DataFrame with the query result, or ``None`` when anything
        fails (missing driver, connection error, invalid SQL). Failure is
        reported on stdout rather than raised, so callers must check for None.
    '''
    presto_conn = None
    try:
        from pyhive import presto
        import pandas as pd

        presto_conn = presto.connect(
            host = "presto-internal.dp.zigbang.net",
            port = 80,
            username = "biglabred", catalog = "hive", schema = ""
        )
        return pd.read_sql(sql_command, presto_conn)

    except Exception as exc:
        # Best-effort contract kept (print + None), but the cause is now shown and
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        print('Failed to read data from hive DB')
        print(f'  cause: {exc!r}')
        return None

    finally:
        # Close the connection on the failure path as well as on success.
        if presto_conn is not None:
            presto_conn.close()

In [4]:
def get_sql(AWS_KEY, AWS_SECRET, sql_command):
    """Execute ``sql_command`` on the Tableau MySQL database and return the rows.

    Args:
        AWS_KEY: Unused; kept so existing callers keep working.
        AWS_SECRET: Unused; kept so existing callers keep working.
        sql_command: SQL text executed as-is.

    Returns:
        pandas.DataFrame with one column per result field. The frame is empty
        (but still carries the column names) when the query matches no rows, and
        is a plain empty frame when the statement produces no result set.

    Notes:
        The password is read from the module-level ``tableau_key``. The encrypted
        key file on S3 used to be downloaded here but its content was never used,
        so that dead remote read has been removed.
    """
    import pymysql

    conn = pymysql.connect(
        # legacy endpoint: biglab.c3svvjp5iqfn.ap-northeast-1.rds.amazonaws.com
        host="rds-red-w.zigbang.io", port=3306,
        user="zigbang_tableau", passwd=tableau_key,
        db="tableau", charset="utf8"
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql_command)
            description = cursor.description
            if description is None:
                # Statement without a result set: nothing to tabulate.
                return pd.DataFrame()
            field_names = [field[0] for field in description]
            rows = cursor.fetchall()
        # Passing the names to the constructor also works for zero rows, where
        # assigning ``.columns`` afterwards raised a length-mismatch error.
        return pd.DataFrame(list(rows), columns=field_names)
    finally:
        # Release the connection even when execute/fetch raises.
        conn.close()

In [5]:
def load_xml_to_dataframe(response, keyword):
    """Convert repeated XML elements into a DataFrame, one row per element.

    Args:
        response: XML document (text or bytes) handed to BeautifulSoup's 'xml' parser.
        keyword: Tag name of the repeating element; each match becomes one row.

    Returns:
        pandas.DataFrame whose columns are the tag names found inside the first
        matching element and whose values are the text of those tags. An empty
        frame is returned when no element matches ``keyword``.
    """
    import bs4 as bs

    # The original body read an undefined global (``response_xml``) instead of
    # the ``response`` argument.
    soup = bs.BeautifulSoup(response, 'xml')

    rows = soup.find_all(keyword)
    if not rows:
        return pd.DataFrame()

    # The first element defines the header and the expected number of fields.
    header = [tag.name for tag in rows[0].find_all()]
    width = len(header)

    records = []
    for row in rows:
        cells = [tag.text for tag in row.find_all()[:width]]
        # Pad rows that have fewer fields than the first one instead of
        # failing with an IndexError.
        cells.extend([None] * (width - len(cells)))
        records.append(cells)

    return pd.DataFrame(records, columns=header)

In [6]:
def ExportDataFrameGspread(googlespread_link, sheetname, dataframe, AWS_KEY, AWS_SECRET):
    """Overwrite one worksheet of a Google Sheet with the contents of a DataFrame.

    Args:
        googlespread_link: URL of the target spreadsheet.
        sheetname: Name of the worksheet to clear and refill.
        dataframe: Data to export. It is not modified; +/-inf and NaN are
            replaced by '' in a copy before upload.
        AWS_KEY: AWS access key used to download the service-account key from S3.
        AWS_SECRET: AWS secret key used to download the service-account key from S3.

    Returns:
        None. The worksheet is cleared and then filled with a header row
        followed by the data rows.
    """
    import gspread
    from oauth2client.service_account import ServiceAccountCredentials
    import s3fs
    import numpy as np

    scope = ['https://spreadsheets.google.com/feeds',
             'https://www.googleapis.com/auth/drive']
    key_file_name = "gspreadkey.json"
    fs = s3fs.S3FileSystem(anon=False, key=AWS_KEY, secret=AWS_SECRET)

    try:
        fs.download(f"s3://zigbang-mlops/models/red/{key_file_name}", key_file_name)

        credentials = ServiceAccountCredentials.from_json_keyfile_name(key_file_name, scope)
        gc = gspread.authorize(credentials)
        doc = gc.open_by_url(googlespread_link)
        worksheet = doc.worksheet(sheetname)

        # Clean a copy so the caller's frame is left untouched (the in-place
        # version also triggered SettingWithCopyWarning on sliced frames).
        cleaned = dataframe.replace([np.inf, -np.inf], np.nan).fillna('')

        # Clear only after the payload is ready, so a failure while preparing
        # the data does not leave the sheet empty.
        worksheet.clear()
        worksheet.update([cleaned.columns.values.tolist()] + cleaned.values.tolist())
    finally:
        # Never leave the downloaded credential file on disk, even on failure.
        if os.path.exists(key_file_name):
            os.remove(key_file_name)

In [7]:
def push_into_tableau_DB(data, AWS_KEY, AWS_SECRET, table_name, sql_create, replace=False):
    """Insert a DataFrame into a table of the Tableau MySQL database.

    Args:
        data: DataFrame to insert; its column names must match the table's columns.
            Null values are replaced by '' before insertion.
        AWS_KEY: AWS access key used to read the encrypted DB password from S3.
        AWS_SECRET: AWS secret key; also the passphrase used to decrypt that password.
        table_name: Target table name (interpolated into the SQL as-is).
        sql_create: CREATE TABLE statement executed when ``table_name`` does not exist.
        replace: When True, all existing rows are deleted before inserting.

    Returns:
        None.
    """
    import cryptocode
    import pymysql
    import s3fs

    # Load the encrypted Tableau DB password from S3.
    fs = s3fs.S3FileSystem(anon=False, key=AWS_KEY, secret=AWS_SECRET)
    with fs.open("s3://zigbang-mlops/models/red/red_encrypted.txt", mode="r") as f:
        key = f.readline()

    tableau_conn = pymysql.connect(
        host="rds-red-w.zigbang.io", port=3306,
        user="zigbang_tableau", passwd=cryptocode.decrypt(key, AWS_SECRET),
        db="tableau", charset="utf8"
    )
    try:
        with tableau_conn.cursor() as cur:
            # Create the table from ``sql_create`` if it is not there yet.
            exist = cur.execute("SHOW TABLES LIKE %s", (table_name,))
            if not exist:
                cur.execute(sql_create)

            # Replace nulls with '' (same substitution as before).
            data = data.where(pd.notnull(data), "")

            # In replace mode, remove the existing rows first.
            if replace:
                cur.execute(f"DELETE FROM {table_name}")

            columns = ", ".join(data.columns)
            placeholders = '(' + ', '.join(['%s'] * len(data.columns)) + ')'
            statement = "INSERT INTO " + table_name + " (" + columns + ") VALUES " + placeholders
            cur.executemany(statement, [tuple(x) for x in data.values])

        # Commit only after every statement succeeded. If anything above raises,
        # the connection is closed without committing.
        tableau_conn.commit()
    finally:
        # Release the connection on both the success and the failure path.
        tableau_conn.close()

In [8]:
def readFromGspread(googlespread_link, sheetname, AWS_KEY, AWS_SECRET):
    '''Load one worksheet of a Google Sheet into a DataFrame.

    Args:
        googlespread_link: URL of the spreadsheet.
        sheetname: Name of the worksheet to read.
        AWS_KEY: AWS access key used to download the service-account key from S3.
        AWS_SECRET: AWS secret key used to download the service-account key from S3.

    Returns:
        pandas.DataFrame whose column labels are the first sheet row and whose
        rows are the remaining sheet rows (index starts at 1). An empty frame
        is returned for an empty worksheet.
    '''
    import os
    import gspread
    from oauth2client.service_account import ServiceAccountCredentials
    import s3fs

    scope = ['https://spreadsheets.google.com/feeds',
             'https://www.googleapis.com/auth/drive']
    key_file_name = "gspreadkey.json"
    fs = s3fs.S3FileSystem(anon=False, key=AWS_KEY, secret=AWS_SECRET)

    try:
        fs.download(f"s3://zigbang-mlops/models/red/{key_file_name}", key_file_name)

        credentials = ServiceAccountCredentials.from_json_keyfile_name(key_file_name, scope)
        gc = gspread.authorize(credentials)
        doc = gc.open_by_url(googlespread_link)
        values = doc.worksheet(sheetname).get_all_values()
    finally:
        # Remove the downloaded credential file even if authentication or the
        # read fails.
        if os.path.exists(key_file_name):
            os.remove(key_file_name)

    df = pd.DataFrame.from_records(values)
    if df.empty:
        # Nothing to promote to a header row.
        return df

    # Promote the first row to column labels and drop it from the data.
    df.columns = df.iloc[0]
    df = df[1:]

    return df

## 데이터 불러오기

#### 기업개요 및 주소 정보

In [9]:
# Company overview rows (X) left-joined to the address table (Y) on the "주소" column.
# Both sides are selected with `select *`, so Y's columns (including its own "주소")
# are appended after X's.
df_company = readFromPrestoDB('''
select * from 
(
select "고유번호", "정식명칭", "영문명칭", "종목명", "종목코드","법인구분", "법인등록번호",
"사업자등록번호", "주소", "업종코드", "업종명", "설립일",
"결산월", "created_at", "updated_at" from hive.ods.biglab_disclosure_company_overview
where "법인등록번호" !=''  and  "종목코드" != ''
) X
left join 
(
select * from
hive.ods.biglab_disclosure_company_address 
) Y
on X."주소" = Y."주소" 
'''
)

# Rows with an empty corporate registration number are excluded (per the original
# author's note, these are companies with overseas addresses).
# Rows are further limited to listed companies, i.e. those that have a stock code.

#### 기업 분류 기준

In [10]:
# Google Sheet that holds the industry classification lookup tables.
url_industry_categorization = 'https://docs.google.com/spreadsheets/d/1N8iOHx1M8PLqlGiA2tgOKl-AFoCNw3Ew9bwHMMhBIIU/edit#gid=201266420'

# Read the major-category sheet and keep only the label and its range columns.
df_industry_categorization = readFromGspread(
    url_industry_categorization, '표준산업대분류', AWS_KEY, AWS_SECRET
)[['산업대분류', 'range_start', 'range_end']]

#df_industry_categorization = readFromGspread(url_industry_categorization, '표준산업분류', AWS_KEY, AWS_SECRET) 
#df_industry_categorization  = df_industry_categorization.CODE.astype(str).str.zfill(2)

#### 직원현황 공시정보

In [45]:
# Employee summary rows for report code '11011'.
df_employee = readFromPrestoDB('''
select * from  hive.ods.biglab_disclosure_business_report_employee_summary    
where "보고서코드" ='11011'                         
''')

# Convert the count and salary columns to numbers; unparseable values become NaN.
cols = ['정규직수', '합계', '연간급여총액', '1인평균급여액']
df_employee = df_employee.assign(
    **{name: pd.to_numeric(df_employee[name], errors='coerce') for name in cols}
)

#### 사업보고서 공시 자료 api 호출 중. (추후 태블로 DB에서 가져올 예정)

# Total-assets rows from the consolidated statement of financial position in annual
# business reports. The two item codes are grouped with IN: written as
# `a AND b AND c OR d`, the OR branch bypassed the statement-type and report-type
# filters because AND binds tighter than OR.
df_financial_stat = readFromPrestoDB('''
select * from hive.ods.biglab_disclosure_financial_statement
where "재무제표종류"='재무상태표, 유동/비유동법-연결재무제표'  -- '재무상태표, 유동/비유동법-별도재무제표'
and "보고서종류"='사업보고서'
and "항목코드" in ('ifrs-full_Assets', 'ifrs_Assets') -- 자산총계
''')

In [62]:
%%time

api_key =  os.environ.get('OpenDartReader_key')
dart = OpenDartReader(api_key)
df = pd.DataFrame()
for yr in range(2018, 2021):
    for stock_code in df_company.종목코드:
        try:
            row = dart.finstate(corp=stock_code, bsns_year=yr , reprt_code='11011')
            if  row.shape!=(0,0):  #013 :조회된 데이타가 없습니다 일 경우에는 쉐입이 (0,0) 이 됨..
                df = df.append(row)
            else:
                print("해당보고서가 없습니다: ",stock_code, "  year:", yr)
                continue 
        except:
            row = []
            continue 


{'status': '020', 'message': '사용한도를 초과하였습니다.'}

해당보고서가 없습니다:  037600   year: 2018
{'status': '020', 'message': '사용한도를 초과하였습니다.'}

해당보고서가 없습니다:  007490   year: 2018
{'status': '020', 'message': '사용한도를 초과하였습니다.'}

해당보고서가 없습니다:  001950   year: 2018
{'status': '020', 'message': '사용한도를 초과하였습니다.'}

해당보고서가 없습니다:  114410   year: 2018
{'status': '020', 'message': '사용한도를 초과하였습니다.'}

해당보고서가 없습니다:  103150   year: 2018
{'status': '020', 'message': '사용한도를 초과하였습니다.'}

해당보고서가 없습니다:  001190   year: 2018
{'status': '020', 'message': '사용한도를 초과하였습니다.'}

해당보고서가 없습니다:  076850   year: 2018
{'status': '020', 'message': '사용한도를 초과하였습니다.'}

해당보고서가 없습니다:  008400   year: 2018
{'status': '020', 'message': '사용한도를 초과하였습니다.'}

해당보고서가 없습니다:  051820   year: 2018
{'status': '020', 'message': '사용한도를 초과하였습니다.'}

해당보고서가 없습니다:  052210   year: 2018
{'status': '020', 'message': '사용한도를 초과하였습니다.'}

해당보고서가 없습니다:  064060   year: 2018
{'status': '020', 'message': '사용한도를 초과하였습니다.'}

해당보고서가 없습니다:  060670   year: 2018
{'status': '020'

# Keep a copy of `df` as it is at this point; later cells reassign `df`.
df_org = df.copy()

#### 업종코드 얻기위해 api 호출, 추후 기업개황 db테이블 완료되면 가져오게 해야함.

In [None]:
# Fetch the company profile for every distinct stock code and stack the one-row frames.
company_frames = []
for stock_code in df_company.종목코드.unique():
    row = pd.DataFrame(dart.company(stock_code), index=[0])
    # Compare the scalar status value: testing the truth of the whole Series
    # (`if row['status'] != '100'`) raises "truth value of a Series is ambiguous".
    if row['status'].iloc[0] != '100':
        company_frames.append(row)
    else:
        # Responses with status '100' are skipped; print the code for follow-up.
        print(stock_code)

# Concatenate once (DataFrame.append was removed in pandas 2.0). As before, every
# row keeps index label 0.
df_company2 = pd.concat(company_frames) if company_frames else pd.DataFrame()

# First two characters of the industry code.
df_company2['induty_code_cat'] = df_company2['induty_code'].astype(str).str[:2]

#### 공시자료에서 연결재무제표 중 영업이익, 자산총계, 매출만 사용

In [20]:
# Keep consolidated-statement (CFS) rows whose account name mentions operating
# profit (영업이익), total assets (자산총계) or revenue (매출).
is_consolidated = df.fs_div == 'CFS'
is_wanted_account = df.account_nm.str.contains('영업이익|자산총계|매출')
df = df[is_consolidated & is_wanted_account]

In [34]:
# Keep only the rows reported in Korean won.
df = df.loc[df['currency'] == 'KRW']

## 데이터 결합

In [None]:
# One row per report, one column per account name, holding the current-term amount.
# aggfunc is the string 'sum': passing the builtin `sum` is deprecated in
# pandas >= 2.1, and the string keeps the same behaviour.
# NOTE(review): if thstrm_amount is still text at this point, 'sum' concatenates
# strings instead of adding numbers - confirm the dtype before relying on the totals.
df = df.pivot_table(
    index=['rcept_no', 'reprt_code', 'bsns_year', 'corp_code', 'stock_code'],
    columns='account_nm',
    values='thstrm_amount',
    aggfunc='sum',
).reset_index()

In [53]:
# Collapse the employee rows to one row per company / year / report.
df_employee = (
    df_employee
    .groupby(["고유번호", "사업연도", "보고서코드", "법인명"])[['정규직수', '합계', '연간급여총액']]
    .sum()
    .reset_index()
)

# Financial data joined with headcount (inner, on year + corp code), then with the
# company's industry and address columns (left, on stock code). "합계" is renamed
# to "종업원수".
df_merged = (
    df.merge(
        df_employee,
        how='inner',
        left_on=['bsns_year', 'corp_code'],
        right_on=['사업연도', '고유번호'],
    )
    .merge(
        df_company[['종목코드', '업종명', '주소', '시도', '시군구', '읍면동', '번지', '지역코드', '위도', '경도']],
        how='left',
        left_on='stock_code',
        right_on='종목코드',
    )
    .rename(columns={"합계": "종업원수"})
)


In [212]:

# Restrict df_merged to the columns of the upload table, in table order.
df_merged = df_merged[[
    # disclosure identifiers and the reported amount
    'rcept_no', 'bsns_year', 'account_nm', 'fs_nm', 'sj_nm',
    'stock_code', 'corp_code', 'thstrm_amount',
    # company name / industry and headcount figures
    '법인명', '업종명', '정규직수', '종업원수', '연간급여총액',
    # address breakdown and coordinates
    '시도', '시군구', '읍면동', '번지', '지역코드', '위도', '경도',
]]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [180]:
# Target table in the Tableau database, and the DDL passed to push_into_tableau_DB,
# which runs it when the table does not exist yet. The column list matches, in
# order, the columns selected into df_merged.
table_name = 'dart_disclosure'
sql_create = f"""
    CREATE TABLE {table_name} (
    rcept_no VARCHAR(100),
    bsns_year VARCHAR(4),
    account_nm  VARCHAR(100),
    fs_nm VARCHAR(100),
    sj_nm VARCHAR(100),
    stock_code VARCHAR(6) ,
    corp_code VARCHAR(10),
    thstrm_amount bigint ,
    법인명 VARCHAR(100),
    업종명 VARCHAR(100),
    정규직수 int, 
    종업원수 int, 
    연간급여총액  bigint,
    시도	 VARCHAR(10),
    시군구	VARCHAR(10),
    읍면동	VARCHAR(20),
    번지	VARCHAR(20),
    지역코드	VARCHAR(10),
    위도	double,
    경도    double
    ) CHARSET=utf8;
"""

In [181]:
# Upload df_merged; with replace=True the helper deletes every existing row of
# `table_name` before inserting.
push_into_tableau_DB(df_merged,  AWS_KEY, AWS_SECRET, table_name , sql_create, replace=True)

In [390]:
# First two characters of the industry code, stored as an integer.
df_company2['induty_code_cat'] = (
    df_company2['induty_code']
    .astype(str)
    .str.slice(stop=2)
    .astype(int)
)

In [391]:
# Closed intervals [range_start, range_end], one per row of the major-category table.
# Built from df_industry_categorization itself: the previous `df_category_big` is not
# defined by any cell visible in this notebook, and the intervals must line up
# row-for-row with the frame they index.
# NOTE(review): confirm df_category_big was only an earlier name for this table.
# Values read from Google Sheets arrive as text, so they are converted to numbers.
s = pd.IntervalIndex.from_arrays(
    pd.to_numeric(df_industry_categorization.range_start),
    pd.to_numeric(df_industry_categorization.range_end),
    closed='both',
)

# Look up each company's 2-digit code in the intervals and show the frame with the
# matching major-category label attached (the result is displayed, not stored).
df_company2.assign(industry = df_industry_categorization.set_index(s).loc[df_company2.induty_code_cat].산업대분류.values )

Unnamed: 0,status,message,corp_code,corp_name,corp_name_eng,stock_name,stock_code,ceo_nm,corp_cls,jurir_no,bizr_no,adres,hm_url,ir_url,phn_no,fax_no,induty_code,est_dt,acc_mt,induty_code_cat,industry
0,000,정상,01063990,(주)로보로보,"RoboRobo Co., Ltd.",로보로보,215100,최영석,K,1101115638741,1078841033,서울특별시 강북구 도봉로54길 6 로보로보빌딩(미아동),www.roborobo.co.kr,,02-909-5050,02-917-3511,289,20150209,12,28,제 조 업
0,000,정상,00442145,(주)아바코,"AVACOCO.,LTD.",아바코,083930,김광현,K,1701110166737,5038143827,대구 달서구 월암동 1107번지,www.avaco.co.kr,,053-583-8150,053-588-9209,29272,20000116,12,29,제 조 업
0,000,정상,01414936,에스케이제6호기업인수목적 주식회사,SK NO.6 SPECIAL PURPOSE ACQUISITION COMPANY,에스케이제6호기업인수목적,340350,송문규,E,1101117262936,2978101897,"서울특별시 영등포구 국제금융로8길 31, 4층",,,02-3773-9131,,66199,20191017,12,66,금융 및 보험업
0,000,정상,01050738,(주)솔트룩스,SALTLUX Inc,솔트룩스,304100,이경일,K,1101110301632,1028113061,서울특별시 강남구 언주로 538,www.saltlux.com/,,02-2193-1600,02-6499-0092,582,19810817,12,58,정보통신업
0,000,정상,00574611,에스디엔(주),SDN Company Ltd.,SDN,099220,최기혁,K,1101111020299,2078141182,경기도 성남시 수정구 대왕판교로 1281 -,www.sdn-i.com,,02-446-6691,02-446-6626,4659,19940318,12,46,도매 및 소매업
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,000,정상,00351630,(주)세코닉스,"SEKONIX CO., LTD.",세코닉스,053450,박은경,K,1158110006449,1278112192,경기도 동두천시 평화로2862번길 28,www.sekonix.com,www.sekonix.com,031-860-1000,031-860-1091,273,19881224,12,27,제 조 업
0,000,정상,00544452,(주)이리츠코크렙기업구조조정부동산투자회사,E KOCREF CR-REIT,이리츠코크렙,088260,허승재,Y,1101113275959,1078667081,서울특별시 강남구 삼성로 511,,,02-787-0000,02-2112-0939,68112,20050726,06,68,부동산업
0,000,정상,00608839,(주)에이루트,"Aroot Co., Ltd.",에이루트,096690,서문동군,K,1341110095394,1238177233,경기도 오산시 가장산업동로 28-6,www.aroot.co.kr,,031-8077-5000,02-6252-8150,26329,20020408,12,26,제 조 업
0,000,정상,00449254,주식회사쎄트렉아이,"SatrecInitiativeCo.,Ltd.",쎄트렉아이,099320,김이을,K,1601110091283,3148129846,대전광역시 유성구 유성대로1628번길 21 (주)쎄트렉아이,www.satreci.com,www.satreci.com,042-365-7500,042-365-7549,3131,19991229,12,31,제 조 업
