# 5 금융데이터 수집하기 (기본)
API와 크롤링을 이용한다면 비용을 지불하지 않고 얼마든지 금융 데이터를 수집할 수있습니다. 이 CHAPTER에서는 금융 데이터를 받기 위해 필요한 주식티커를 구하는 방법과 섹터별 구성종목을 크롤링하는 방법을 알아보겠습니다.


## 5.1.1 업종분류 현황 크롤링

In [1]:
import requests
import json 
import pandas as pd
import datetime
now = (datetime.datetime.now()-datetime.timedelta(1)).strftime("%Y%m%d")
# now = datetime.datetime.now().strftime("%Y%m%d")


gen_otp_url = 'http://data.krx.co.kr/comm/fileDn/GenerateOTP/generate.cmd'
down_url = 'http://data.krx.co.kr/comm/fileDn/download_csv/download.cmd'

# 코스피 크롤링
gen_otp_data = {
  'mktId' : 'STK',
  'trdDd' : f'{now}',
  'money' : '1',
  'csvxls_isNo' :'false',
  'name' : 'fileDown',
  'url' : 'dbms/MDC/STAT/standard/MDCSTAT03901'
}

otp = requests.post(gen_otp_url,  gen_otp_data).text

otp_json = {
    'code':otp
}

referer = {
    'referer':gen_otp_url
}

sector_ks = requests.post(down_url, otp_json, referer)
sector_ks.encoding = 'EUC-KR'

sector_ks = sector_ks.text

df = pd.DataFrame([[i][0].replace('"농업, 임업 및 어업"','"농업/임업 및 어업"').replace('"','').split(',') for i in sector_ks.split('\n')])
df.columns = df.iloc[0,:].values
df_kospi = df.iloc[1:,:-1]

# 코스닥 크롤링
gen_otp_data = {
  'mktId' : 'KSQ',
  'trdDd' : f'{now}',
  'money' : '1',
  'csvxls_isNo' :'false',
  'name' : 'fileDown',
  'url' : 'dbms/MDC/STAT/standard/MDCSTAT03901'
}

otp = requests.post(gen_otp_url,  gen_otp_data).text

otp_json = {
    'code':otp
}

referer = {
    'referer':gen_otp_url
}

sector_ks = requests.post(down_url, otp_json, referer)
sector_ks.encoding = 'EUC-KR'

sector_ks = sector_ks.text

df = pd.DataFrame([[i][0].replace('"농업, 임업 및 어업"','"농업/임업 및 어업"').replace('"','').split(',') for i in sector_ks.split('\n')])
df.columns = df.iloc[0,:].values
df_kosdaq = df.iloc[1:,:]

# 데이터 전처리
df_sec = pd.concat([df_kospi,df_kosdaq]).reset_index(drop=True)

df_sec['종목코드'] = df_sec['종목코드'].astype('str')
df_sec['종목명'] = df_sec['종목명'].astype('str')
df_sec['시장구분'] = df_sec['시장구분'].astype('str')
df_sec['업종명'] = df_sec['업종명'].astype('str')
df_sec['종가'] = df_sec['종가'].astype('int')
df_sec['대비'] = df_sec['대비'].astype('int')
df_sec['등락률'] = df_sec['등락률'].astype('float')
df_sec['시가총액'] = df_sec['시가총액'].fillna("0").astype('int')


df_sec.to_csv('./data/krx_sector.csv')
df_sec.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2572 entries, 0 to 2571
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   종목코드    2572 non-null   object 
 1   종목명     2572 non-null   object 
 2   시장구분    2572 non-null   object 
 3   업종명     2572 non-null   object 
 4   종가      2572 non-null   int64  
 5   대비      2572 non-null   int64  
 6   등락률     2572 non-null   float64
 7   시가총액    2572 non-null   int64  
dtypes: float64(1), int64(3), object(4)
memory usage: 160.9+ KB


In [2]:
df_kospi

Unnamed: 0,종목코드,종목명,시장구분,업종명,종가,대비,등락률
1,095570,AJ네트웍스,KOSPI,서비스업,5400,50,0.93
2,006840,AK홀딩스,KOSPI,기타금융,17050,-360,-2.07
3,027410,BGF,KOSPI,기타금융,4530,45,1.00
4,282330,BGF리테일,KOSPI,유통업,183900,1900,1.04
5,138930,BNK금융지주,KOSPI,기타금융,6550,-40,-0.61
...,...,...,...,...,...,...,...
939,005010,휴스틸,KOSPI,철강금속,6370,140,2.25
940,000540,흥국화재,KOSPI,보험,3700,-90,-2.37
941,000547,흥국화재2우B,KOSPI,보험,19770,100,0.51
942,000545,흥국화재우,KOSPI,보험,6860,-60,-0.87


## 5.1.2 개별종목 지표 크롤링

In [3]:
# 코스닥 크롤링

gen_otp_url = 'http://data.krx.co.kr/comm/fileDn/GenerateOTP/generate.cmd'
down_url = 'http://data.krx.co.kr/comm/fileDn/download_csv/download.cmd'

gen_otp_data = {
  'searchType ' : '1',
  'mktId' : 'ALL',
  'trdDd' : f'{now}',
  'csvxls_isNo' :'false',
  'name' : 'fileDown',
  'url' : 'dbms/MDC/STAT/standard/MDCSTAT03501'
}

otp = requests.post(gen_otp_url, gen_otp_data).text

otp_json = {
    'code':otp
}

referer = {
    'referer':gen_otp_url
}

down_ind = requests.post(down_url, otp_json, referer)
down_ind.encoding = 'EUC-KR'

down_ind = down_ind.text

df = pd.DataFrame([[i][0].replace('"','').split(',') for i in down_ind.split('\n')])
df
df.columns = df.iloc[0,:].values
df_ind = df.iloc[1:,:-1]

df_ind['종목코드'] = df_ind['종목코드'].astype('str')
df_ind['종목명'] = df_ind['종목명'].astype('str')
df_ind['종가'] = df_ind['종가'].astype('int')
df_ind['대비'] = df_ind['대비'].astype('int')
df_ind['등락률'] = df_ind['등락률'].astype('float')
df_ind['EPS'] = df_ind['EPS'].str.replace('','0').astype('float')
df_ind['PER'] = df_ind['PER'].str.replace('','0').astype('float')
df_ind['선행 EPS'] = df_ind['선행 EPS'].str.replace('','0').astype('float')
df_ind['선행 PER'] = df_ind['선행 PER'].str.replace('','0').astype('float')
df_ind['BPS'] = df_ind['BPS'].str.replace('','0').astype('float')
df_ind['PBR'] = df_ind['PBR'].str.replace('','0').astype('float')
df_ind['주당배당금'] = df_ind['주당배당금'].str.replace('','0').astype('float')



df_ind.to_csv('./data/krx_ind.csv')
df_ind.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2524 entries, 1 to 2524
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   종목코드    2524 non-null   object 
 1   종목명     2524 non-null   object 
 2   종가      2524 non-null   int64  
 3   대비      2524 non-null   int64  
 4   등락률     2524 non-null   float64
 5   EPS     2524 non-null   float64
 6   PER     2524 non-null   float64
 7   선행 EPS  2524 non-null   float64
 8   선행 PER  2524 non-null   float64
 9   BPS     2524 non-null   float64
 10  PBR     2524 non-null   float64
 11  주당배당금   2524 non-null   float64
dtypes: float64(8), int64(2), object(2)
memory usage: 236.8+ KB


## 5.1.4 거래소 데이터 정리하기
5.1.3은 별 쓸모 없어서 스킵합니다.


In [4]:
# 컬럼명의 교집합 찾기
list(set(df_sec.columns)& set(df_ind.columns))

['종목명', '종가', '종목코드', '대비', '등락률']

In [5]:
# 종목명의 교집합 찾기
drop = ['등락률', '종목명', '대비', '종가']
df_ind = df_ind.drop(drop,axis=1)
KOR_ticker = pd.merge(df_sec, df_ind,on = '종목코드',how='inner')
KOR_ticker = KOR_ticker.sort_values('시가총액',ascending=False)
KOR_ticker.to_csv('./data/KOR_ticker.csv')
KOR_ticker

Unnamed: 0,종목코드,종목명,시장구분,업종명,종가,대비,등락률,시가총액,EPS,PER,선행 EPS,선행 PER,BPS,PBR,주당배당금
1773,247540,에코프로비엠,KOSDAQ,일반전기전자,209000,3500,1.70,20440480896000,1.010503e+07,108010.0207,4.060004e+07,4050.0400,5.090609e+07,3050.0001,9.020000e+05
1516,091990,셀트리온헬스케어,KOSDAQ,유통,63900,1100,1.75,10112335452900,9.040000e+05,6070.0908,1.040404e+07,4040.0205,1.020906e+09,40.0903,2.060000e+05
1817,066970,엘앤에프,KOSDAQ,IT부품,246000,1000,0.41,8860505736000,0.000000e+00,0.0000,1.000007e+09,2040.0403,1.090809e+09,1020.0306,0.000000e+00
1772,086520,에코프로,KOSDAQ,금융,334000,42000,14.38,8600630594000,1.010200e+09,2090.0801,2.060804e+09,1020.0404,4.050806e+09,70.0208,4.030000e+05
1718,041510,에스엠,KOSDAQ,오락·문화,158500,8800,5.88,3773948558500,5.070500e+07,2070.0507,4.020806e+07,3060.0908,2.060303e+09,60.0002,2.000000e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454,248170,샘표식품,KOSPI,음식료품,30350,-350,-1.14,0,5.010808e+07,50.0805,0.000000e+00,0.0000,4.070007e+09,0.0604,2.000000e+05
453,007540,샘표,KOSPI,기타금융,48600,-750,-1.52,0,5.030506e+07,90.0007,0.000000e+00,0.0000,8.060007e+09,0.0506,2.000000e+05
779,002960,한국쉘석유,KOSPI,화학,235500,-500,-0.21,0,2.020304e+09,1000.0504,0.000000e+00,0.0000,8.040207e+09,20.0709,1.090000e+09
452,075180,새론오토모티브,KOSPI,운수장비,4695,5,0.11,0,7.030000e+05,60.0403,0.000000e+00,0.0000,1.020401e+09,0.0308,1.040000e+05


In [6]:
# 우선주 확인
KOR_ticker[KOR_ticker['종목코드'].apply(lambda x: x[-1]!='0')]
KOR_ticker[KOR_ticker['종목코드'].apply(lambda x: x[-1]!='0')]


Unnamed: 0,종목코드,종목명,시장구분,업종명,종가,대비,등락률,시가총액,EPS,PER,선행 EPS,선행 PER,BPS,PBR,주당배당금
2480,03481K,해성산업1우,KOSDAQ,금융,13160,-280,-2.08,18278213520,0.0,0.0,0.0,0.0,0.0,0.0,108000.0
1271,08537M,루트로닉3우C,KOSDAQ,의료·정밀기기,36000,1500,4.35,6645276000,0.0,0.0,0.0,0.0,0.0,0.0,300020.0
1521,032685,소프트센우,KOSDAQ,유통,28950,-650,-2.20,4135710150,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1166,021045,대호특수강우,KOSDAQ,금속,9200,-60,-0.65,3903063200,0.0,0.0,0.0,0.0,0.0,0.0,0.0
231,005257,녹십자홀딩스2우,KOSPI,서비스업,33200,-250,-0.75,0,0.0,0.0,0.0,0.0,0.0,0.0,400000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
432,000075,삼양홀딩스우,KOSPI,기타금융,55700,-800,-1.42,0,0.0,0.0,0.0,0.0,0.0,0.0,30005000.0
473,004365,세방우,KOSPI,운수창고업,11380,-770,-6.34,0,0.0,0.0,0.0,0.0,0.0,0.0,200050.0
468,004985,성신양회우,KOSPI,비금속광물,12450,50,0.40,0,0.0,0.0,0.0,0.0,0.0,0.0,205000.0
465,014915,성문전자우,KOSPI,전기전자,8690,-220,-2.47,0,0.0,0.0,0.0,0.0,0.0,0.0,1000.0


In [7]:
# 우선주 제외 항목만
KOR_ticker_tr= KOR_ticker[~KOR_ticker['종목코드'].apply(lambda x: x[-1]!='0')]
KOR_ticker_tr

Unnamed: 0,종목코드,종목명,시장구분,업종명,종가,대비,등락률,시가총액,EPS,PER,선행 EPS,선행 PER,BPS,PBR,주당배당금
1773,247540,에코프로비엠,KOSDAQ,일반전기전자,209000,3500,1.70,20440480896000,1.010503e+07,108010.0207,4.060004e+07,4050.0400,5.090609e+07,3050.0001,9.020000e+05
1516,091990,셀트리온헬스케어,KOSDAQ,유통,63900,1100,1.75,10112335452900,9.040000e+05,6070.0908,1.040404e+07,4040.0205,1.020906e+09,40.0903,2.060000e+05
1817,066970,엘앤에프,KOSDAQ,IT부품,246000,1000,0.41,8860505736000,0.000000e+00,0.0000,1.000007e+09,2040.0403,1.090809e+09,1020.0306,0.000000e+00
1772,086520,에코프로,KOSDAQ,금융,334000,42000,14.38,8600630594000,1.010200e+09,2090.0801,2.060804e+09,1020.0404,4.050806e+09,70.0208,4.030000e+05
1718,041510,에스엠,KOSDAQ,오락·문화,158500,8800,5.88,3773948558500,5.070500e+07,2070.0507,4.020806e+07,3060.0908,2.060303e+09,60.0002,2.000000e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454,248170,샘표식품,KOSPI,음식료품,30350,-350,-1.14,0,5.010808e+07,50.0805,0.000000e+00,0.0000,4.070007e+09,0.0604,2.000000e+05
453,007540,샘표,KOSPI,기타금융,48600,-750,-1.52,0,5.030506e+07,90.0007,0.000000e+00,0.0000,8.060007e+09,0.0506,2.000000e+05
779,002960,한국쉘석유,KOSPI,화학,235500,-500,-0.21,0,2.020304e+09,1000.0504,0.000000e+00,0.0000,8.040207e+09,20.0709,1.090000e+09
452,075180,새론오토모티브,KOSPI,운수장비,4695,5,0.11,0,7.030000e+05,60.0403,0.000000e+00,0.0000,1.020401e+09,0.0308,1.040000e+05


## 5.2 WICS 기준 섹터정보 크롤링
일반적으로 주식의 섹터를 나누는 기준은 MSCI와 S&P가 개발한 GICS12를 가장 많이 사용합니다. 국내 종목의 GICS 기준 정보 역시 한국거래소에서 제공하고 있으나, 이는 독점적 지적재산으로 명시했기에 사용하는 데 무리가 있습니다. 그러나 지수제공업체인 와이즈인덱스13에서는 GICS와 비슷한 WICS 산업분류를 발표하고 있습니다. WICS를 크롤링해 필요한 정보를 수집해보겠습니다.

In [8]:
import json
import pandas
import requests

secta = ['G25', 'G35', 'G50', 'G40', 'G10', 'G20', 'G55', 'G30', 'G15', 'G45']
cols = ['IDX_CD',	'IDX_NM_KOR',	'ALL_MKT_VAL',	'CMP_CD',	'CMP_KOR',	'MKT_VAL',	'WGT',	'S_WGT',	'CAL_WGT',	'SEC_CD',	'SEC_NM_KOR',	'SEQ',	'TOP60',	'APT_SHR_CNT']
df = pd.DataFrame(columns =cols)
for c in secta:
    url = f'http://www.wiseindex.com/Index/GetIndexComponets?ceil_yn=0&dt={now}&sec_cd={c}'
    data = requests.get(url).text
    json_ = json.loads(data)
    df_temp = pd.DataFrame(json_['list'])
    df = pd.concat([df,df_temp])

In [9]:
df

Unnamed: 0,IDX_CD,IDX_NM_KOR,ALL_MKT_VAL,CMP_CD,CMP_KOR,MKT_VAL,WGT,S_WGT,CAL_WGT,SEC_CD,SEC_NM_KOR,SEQ,TOP60,APT_SHR_CNT
0,G25,WICS 경기관련소비재,128243303,005380,현대차,24417928,19.04,19.04,1.0,G25,경기관련소비재,1,9,139610794
1,G25,WICS 경기관련소비재,128243303,000270,기아,19829564,15.46,34.50,1.0,G25,경기관련소비재,2,9,251325275
2,G25,WICS 경기관련소비재,128243303,012330,현대모비스,13613353,10.62,45.12,1.0,G25,경기관련소비재,3,9,63171013
3,G25,WICS 경기관련소비재,128243303,051900,LG생활건강,6006759,4.68,49.80,1.0,G25,경기관련소비재,4,9,9370918
4,G25,WICS 경기관련소비재,128243303,090430,아모레퍼시픽,4127483,3.22,53.02,1.0,G25,경기관련소비재,5,9,30416235
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
637,G45,WICS IT,514288637,009140,경인전자,13250,0.00,99.37,1.0,G45,IT,638,2,549779
638,G45,WICS IT,514288637,174880,장원테크,10779,0.00,99.37,1.0,G45,IT,639,2,16867870
639,G45,WICS IT,514288637,033200,모아텍,10545,0.00,99.37,1.0,G45,IT,640,2,1863054
640,G45,WICS IT,514288637,043590,크로바하이텍,10530,0.00,99.37,1.0,G45,IT,641,2,10019000


In [10]:
df.to_csv('./data/KOR_sector.csv')