In [2]:
import OpenDartReader
from dotenv import load_dotenv
import os 
import pandas as pd

load_dotenv()
api_key =  os.environ.get('OpenDartReader_key')
dart = OpenDartReader(api_key)

AWS_KEY = os.environ.get('AWS_KEY')
AWS_SECRET = os.environ.get('AWS_SECRET')
tableau_key = os.environ.get('tableau_red')




In [3]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 20)

In [4]:
def readFromPrestoDB(sql_command):
    ''' Hive DB에서 데이터 로딩 '''
    try:
        from pyhive import presto
        import pandas as pd

        presto_conn = presto.connect(
            host = "presto-internal.dp.zigbang.net",
            port = 80,
            username = "biglabred", catalog = "hive", schema = ""
        )

        df = pd.read_sql(sql_command, presto_conn)

        presto_conn.close()
        return df
    
    except:
        print('Failed to read data from hive DB')

In [5]:
def get_sql(AWS_KEY, AWS_SECRET, sql_command):
    import cryptocode
    import pymysql
    import s3fs
    # 암호화된 태블로 DB 접속키 로딩
    # 암호화된 태블로 DB 접속키 로딩
    fs = s3fs.S3FileSystem(anon=False, key=AWS_KEY, secret=AWS_SECRET)
    with fs.open("s3://zigbang-mlops/models/red/red_encrypted.txt", mode="r") as f:
        key = f.readline()

    conn = pymysql.connect(
        #host="biglab.c3svvjp5iqfn.ap-northeast-1.rds.amazonaws.com", port=3306,
        host="rds-red-w.zigbang.io", port=3306,
        user="zigbang_tableau", passwd=tableau_key, #cryptocode.decrypt(key, AWS_SECRET),
        db="tableau", charset="utf8"
    )
    cursor = conn.cursor() 
    cursor.execute(sql_command)
    result = pd.DataFrame(cursor.fetchall())

    num_fileds = len(cursor.description)
    field_names = [i[0] for i in cursor.description]
    result.columns = field_names

    conn.close()
        
    return result

In [6]:
def load_xml_to_dataframe(response, keyword):
    import bs4 as bs
    import urllib.request
    soup = bs.BeautifulSoup(response_xml,'xml')
    
    rows = soup.find_all(keyword)
    columns = rows[0].find_all()
    
    rowList = []
    nameList = []
    columnList = []

    rowsLen = len(rows)
    columnsLen = len(columns)

    for i in range(0, rowsLen):
        columns = rows[i].find_all()

        for j in range(0, columnsLen):
            if i == 0:
                nameList.append(columns[j].name) #header
            eachColumn = columns[j].text #value
            columnList.append(eachColumn)
        rowList.append(columnList)
        columnList = []    # 다음 row의 값을 넣기 위해 비워준다

    result = pd.DataFrame(rowList, columns=nameList)
    return result

In [7]:
def ExportDataFrameGspread(googlespread_link, sheetname, dataframe, AWS_KEY, AWS_SECRET):    
    import gspread
    from oauth2client.service_account import ServiceAccountCredentials
    import s3fs
    import numpy as np

    scope = ['https://spreadsheets.google.com/feeds',
     'https://www.googleapis.com/auth/drive']
    fs = s3fs.S3FileSystem(anon=False, key=AWS_KEY, secret=AWS_SECRET)
    key_file_name = "gspreadkey.json"
    fs.download(f"s3://zigbang-mlops/models/red/{key_file_name}", key_file_name)

    credentials = ServiceAccountCredentials.from_json_keyfile_name(key_file_name, scope)
    gc = gspread.authorize(credentials)
    spreadsheet_url= googlespread_link
    doc = gc.open_by_url(spreadsheet_url)

    worksheet = doc.worksheet(sheetname)
    worksheet.clear()  #clear selected worksheet before export data frame
    
    dataframe.replace([np.inf, -np.inf], np.nan, inplace=True)
    dataframe.fillna('', inplace=True)
    worksheet.update([dataframe.columns.values.tolist()] + dataframe.values.tolist())
           
    if os.path.exists("gspreadkey.json"):
        os.remove("gspreadkey.json")

In [8]:
def push_into_tableau_DB(data, AWS_KEY, AWS_SECRET, table_name, sql_create, replace=False):
    import cryptocode
    import pymysql
    import s3fs
    
    # 암호화된 태블로 DB 접속키 로딩
    fs = s3fs.S3FileSystem(anon=False, key=AWS_KEY, secret=AWS_SECRET)
    with fs.open("s3://zigbang-mlops/models/red/red_encrypted.txt", mode="r") as f:
        key = f.readline()
    
    # 태블로 DB에 접속하는 코드
    tableau_conn = pymysql.connect(
        host="rds-red-w.zigbang.io", port=3306,
        user="zigbang_tableau", passwd=cryptocode.decrypt(key, AWS_SECRET),
        db="tableau", charset="utf8"
    )
    cur = tableau_conn.cursor()

    # table_name이 존재하는지 체크하는 코드
    exist = cur.execute(f"SHOW TABLES LIKE '{table_name}'")
    
    if not exist:
        # table_name이 없다면 data 기반으로 생성하는 코드
        cur.execute(sql_create)
        
    # NULL 데이터 처리
    data = data.where(pd.notnull(data),"")
    
    # run_date 추가
    #data["run_date"] = datetime.now(timezone('Asia/Seoul')).strftime("%Y-%m-%d %H:%M:%S")
    
    # Replace할 경우 기존 데이터 삭제
    if replace:
        cur.execute(f"DELETE FROM {table_name}")
    
    # 데이터를 INSERT 하는 코드
    columns = ", ".join(data.columns)
    values = '('+', '.join(['%s']*len(data.columns))+')'
    statement = "INSERT INTO " + table_name + " (" + columns + ") VALUES " + values
    insert = [tuple(x) for x in data.values]
    cur.executemany(statement, insert)
    
    # Commit 및 접속 종료
    cur.execute("COMMIT")
    tableau_conn.close()

In [9]:
df_company = readFromPrestoDB('''
select * from 
(
select "고유번호", "정식명칭", "영문명칭", "종목명", "종목코드","법인구분", "법인등록번호",
"사업자등록번호", "주소", "업종코드", "업종명", "설립일",
"결산월", "created_at", "updated_at" from hive.ods.biglab_disclosure_company_overview
where "법인등록번호" !=''  and  "종목코드" != ''
) X
left join 
(
select * from
hive.ods.biglab_disclosure_company_address 
) Y
on X."주소" = Y."주소" 
'''
)

#법인등록번호가 null값인 것은 해외소재주소지로 제외 
#종목코드가 있는 상장사로 필터

In [18]:
# 법인구분	법인구분 : Y(유가), K(코스닥), N(코넥스), E(기타)
df_company = df_company[df_company.법인구분!='E']

 df_company[df_company.법인구분=='E'].to_excel('E.xlsx')

df_company

In [19]:
codes = ''.join(','.join(df_company.종목코드))

In [20]:
codes[:104]

'215100,083930,340350,304100,099220,175250,064850,140670,005740,348150,048770,290380,337450,263800,251970'

dart.finstate(corp='168490,094800,007530,069080,115440,305090,066910,042600,331520,086220,215100,355150,339950,233250,033270' , bsns_year=2021 , reprt_code='11011')

In [165]:
api_key =  os.environ.get('OpenDartReader_key')
dart = OpenDartReader(api_key)
df = pd.DataFrame()
for yr in [2018, 2019, 2020, 2021]:
    for stock_code in df_company.종목코드:
        try:
            row = dart.finstate(corp=stock_code, bsns_year=yr , reprt_code='11011')
            if  row.shape!=(0,0):  #013 :조회된 데이타가 없습니다 일 경우에는 쉐입이 (0,0) 이 됨..
                df = df.append(row)
            else:
                print(stock_code, "  year:", yr)
                continue 
        except:
            row = []
            continue 


215100
083930
340350
{'status': '013', 'message': '조회된 데이타가 없습니다.'}

304100
{'status': '013', 'message': '조회된 데이타가 없습니다.'}

099220
175250
064850
140670
005740
348150
{'status': '013', 'message': '조회된 데이타가 없습니다.'}

048770
290380
337450
{'status': '013', 'message': '조회된 데이타가 없습니다.'}

263800
251970
{'status': '013', 'message': '조회된 데이타가 없습니다.'}

149300
{'status': '013', 'message': '조회된 데이타가 없습니다.'}

234070
{'status': '013', 'message': '조회된 데이타가 없습니다.'}

121060
{'status': '013', 'message': '조회된 데이타가 없습니다.'}

203650
115390
290670
208890
{'status': '013', 'message': '조회된 데이타가 없습니다.'}

010400
005430
356890
{'status': '013', 'message': '조회된 데이타가 없습니다.'}

267320
{'status': '013', 'message': '조회된 데이타가 없습니다.'}

335810
{'status': '013', 'message': '조회된 데이타가 없습니다.'}

038500
037440
228760
007540
050960
284740
182690
{'status': '013', 'message': '조회된 데이타가 없습니다.'}

267850
195940
{'status': '013', 'message': '조회된 데이타가 없습니다.'}

012610
090850
137400
059100
040420
004380
008600
084180
012700
208140
079810

In [247]:
pd.DataFrame.from_records([dart.finstate(corp='251970', bsns_year=2015 , reprt_code='11011')])#['status']


{'status': '013', 'message': '조회된 데이타가 없습니다.'}



dart.finstate(corp='005740,348150,048770,290380,337450,263800,251970', bsns_year=2021 , reprt_code='11011')

In [168]:
len(df)

6638

In [167]:
df = df[(df.fs_div=='CFS') & (df.account_nm.str.contains('자산총계'))]

In [169]:
df_employee = readFromPrestoDB('''
select * from  hive.ods.biglab_disclosure_business_report_employee_summary    
where "보고서코드" ='11011'                         
                               ''')

In [170]:
cols = ['정규직수', '합계', '연간급여총액', '1인평균급여액']
#cols = df.columns[df.dtypes.eq('object')]
df_employee[cols] = df_employee[cols].apply(pd.to_numeric, errors='coerce')

In [171]:
df_employee = df_employee.groupby(["고유번호", "사업연도" ,"보고서코드", "법인명" ])[['정규직수', '합계', '연간급여총액']].sum().reset_index()

In [172]:
df_employee[df_employee.고유번호 =='00140858']

Unnamed: 0,고유번호,사업연도,보고서코드,법인명,정규직수,합계,연간급여총액
1507,140858,2018,11011,영신금속,276.0,276.0,16227000000.0
1508,140858,2019,11011,영신금속,274.0,274.0,15428080000.0
1509,140858,2020,11011,영신금속,291.0,291.0,13670740000.0


In [173]:
df_merged = df.merge(df_employee, how='inner', left_on=['bsns_year', 'corp_code'], right_on=['사업연도', '고유번호']) 

In [175]:
df_merged = df_merged.merge(df_company[['종목코드', '업종명', '주소', '시도', '시군구', '읍면동', '번지', '지역코드', '위도', '경도' ]], 
                how='left',   left_on ='stock_code', right_on='종목코드')


df_financial_stat = readFromPrestoDB('''
select * from hive.ods.biglab_disclosure_financial_statement
where "재무제표종류"='재무상태표, 유동/비유동법-연결재무제표'  -- '재무상태표, 유동/비유동법-별도재무제표'
and "보고서종류"='사업보고서'
and "항목코드"='ifrs-full_Assets' or "항목코드"= 'ifrs_Assets' -- 자산총계
''')

In [212]:
df_merged.rename(columns={"합계":"종업원수"}, inplace= True)
df_merged = df_merged[['rcept_no', 'bsns_year', 'account_nm', 'fs_nm', 'sj_nm', 'stock_code', 'corp_code', 'thstrm_amount', '법인명', '업종명' , '정규직수', '종업원수', '연간급여총액',
                       '시도', '시군구', '읍면동', '번지', '지역코드', '위도', '경도' ]]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [178]:
len(df_merged)

4881

In [213]:
df_merged[df_merged.stock_code =='348150']

Unnamed: 0,rcept_no,bsns_year,account_nm,fs_nm,sj_nm,stock_code,corp_code,thstrm_amount,법인명,업종명,정규직수,종업원수,연간급여총액,시도,시군구,읍면동,번지,지역코드,위도,경도
3153,20210325000195,2020,자산총계,연결재무제표,재무상태표,348150,1399071,58537096638,고바이오랩,의학 및 약학 연구개발업,39.0,39.0,2090064000.0,서울,관악구,신림동,56-1,1162010200,37.468038,126.959294


In [180]:
table_name = 'dart_disclosure'
sql_create = f"""
    CREATE TABLE {table_name} (
    rcept_no VARCHAR(100),
    bsns_year VARCHAR(4),
    account_nm  VARCHAR(100),
    fs_nm VARCHAR(100),
    sj_nm VARCHAR(100),
    stock_code VARCHAR(6) ,
    corp_code VARCHAR(10),
    thstrm_amount bigint ,
    법인명 VARCHAR(100),
    업종명 VARCHAR(100),
    정규직수 int,
    종업원수 int, 
    연간급여총액  bigint,
    시도	 VARCHAR(10),
    시군구	VARCHAR(10),
    읍면동	VARCHAR(20),
    번지	VARCHAR(20),
    지역코드	VARCHAR(10),
    위도	double,
    경도    double
    ) CHARSET=utf8;
"""

In [181]:
push_into_tableau_DB(df_merged,  AWS_KEY, AWS_SECRET, table_name , sql_create, replace=True)

In [197]:
df_category = pd.read_excel('/Users/reejungkim/Downloads/20210401_기업공시정보.xlsx', sheet_name='표준산업분류')
df_category.CODE = df_category.CODE.astype(str).str.zfill(2)


In [303]:
pd.DataFrame(dart.company('000000'), index=[0])['status']=='100'

0    True
Name: status, dtype: bool

In [248]:
df_company2 = pd.DataFrame()
for stock_code in df_company.종목코드.unique():
    row = pd.DataFrame(dart.company(stock_code), index=[0])
    if row['status']!='100':
        df_company2 = df_company2.append(row)
    else:
        print(stock_code)
        continue

In [272]:
df_company2['induty_code_cat'] = df_company2['induty_code'].astype(str).str[:2]

In [298]:
d = df_company2.merge(df_category, how='left', left_on='induty_code_cat', right_on='CODE')


In [301]:
d[d.CODE.isnull()]['induty_code_cat'].unique()

array(['76'], dtype=object)