# 최근 영업일 기준 데이터 받기

In [37]:
import requests as rq
from bs4 import BeautifulSoup

url = 'https://finance.naver.com/sise/sise_deposit.nhn'
data = rq.get(url)
# Name the parser explicitly: when it is omitted BeautifulSoup picks whichever
# parser happens to be installed (and warns about it), so the parsed tree can
# differ between machines.
data_html = BeautifulSoup(data.content, 'lxml')
# Text of the first "tah" span inside the chart note; the printed value below
# shows that it carries the page's reference date.
parse_day = data_html.select_one(
    'div.subtop_sise_graph2 > ul.subtop_chart_note > li > span.tah').text

print(parse_day)

  |  2024.06.20


In [14]:
import re

# Keep only the digit runs of the scraped text and join them into one string.
biz_day = ''.join(re.findall('[0-9]+', parse_day))
print(biz_day)

20240620


In [38]:
# Get the most recent business day
def get_biz_day(timeout=30.0):
    """Return the reference date shown on the Naver Finance deposit page.

    The page is downloaded, the date element inside the chart note is located,
    and the last ten characters of its text (``YYYY.MM.DD``) are returned with
    the dots removed.

    Args:
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The date as a digits-only string, e.g. ``"20240620"``.

    Raises:
        requests.HTTPError: If the page answers with an error status.
        ValueError: If the date element cannot be found in the page.
    """
    url = "https://finance.naver.com/sise/sise_deposit.nhn"
    data = rq.get(url, timeout=timeout)
    data.raise_for_status()
    data_html = BeautifulSoup(data.content, "lxml")

    date_tag = data_html.select_one(
        "#type_0 > div > ul.subtop_chart_note > li > span"
    )
    if date_tag is None:
        # Fail with a clear message instead of an AttributeError on ``None``
        # when the page layout no longer matches the selector.
        raise ValueError(f"business-day element not found on {url}")

    # Surrounding whitespace is dropped first so the slice always lands on the
    # trailing "YYYY.MM.DD" part of the text.
    parse_day = date_tag.text.strip()[-10:]

    return parse_day.replace(".", "")

# 한국거래소의 업종 분류 현황 및 개별지표 크롤링

## 업종분류 현황 크롤링

In [21]:
import pandas as pd
from io import BytesIO

In [22]:
def _fetch_krx_sector_table(market_id, biz_day, timeout):
    """Download one market's sector-classification CSV from KRX as a DataFrame."""
    # Step 1: submit the query to obtain a one-time password (OTP).
    gen_otp_url = "http://data.krx.co.kr/comm/fileDn/GenerateOTP/generate.cmd"
    # Step 2: submit that OTP to the download endpoint to receive the CSV.
    down_url = "http://data.krx.co.kr/comm/fileDn/download_csv/download.cmd"
    # Both requests are sent with a Referer header pointing at the KRX data page.
    headers = {
        "Referer": "http://data.krx.co.kr/contents/MDC/MDI/mdiLoader/index.cmd?menuId=MDC0201"
    }
    # Form fields taken from the query shown in the browser's developer tools.
    gen_otp_data = {
        "mktId": market_id,
        "trdDd": biz_day,
        "money": "1",
        "csvxls_isNo": "false",
        "name": "fileDown",
        "url": "dbms/MDC/STAT/standard/MDCSTAT03901",
    }

    otp_resp = rq.post(gen_otp_url, gen_otp_data, headers=headers, timeout=timeout)
    otp_resp.raise_for_status()

    down_resp = rq.post(
        down_url, {"code": otp_resp.text}, headers=headers, timeout=timeout
    )
    down_resp.raise_for_status()

    # Wrap the raw bytes in BytesIO so pandas can read them like a file; the
    # payload is decoded as EUC-KR.
    return pd.read_csv(BytesIO(down_resp.content), encoding="EUC_KR")


def get_krx_sector(biz_day, timeout=30.0):
    """Download the KRX sector classification for KOSPI and KOSDAQ.

    Args:
        biz_day: Trading day sent to KRX as the ``trdDd`` field (the notebook
            passes a ``YYYYMMDD`` string).
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A DataFrame with the KOSPI rows followed by the KOSDAQ rows, a fresh
        0..n-1 index, whitespace-stripped "종목명" values and a "기준일" column
        holding ``biz_day``.

    Raises:
        requests.HTTPError: If any KRX request answers with an error status.
    """
    # "STK" selects KOSPI and "KSQ" selects KOSDAQ.
    tables = [
        _fetch_krx_sector_table(market_id, biz_day, timeout)
        for market_id in ("STK", "KSQ")
    ]

    krx_sector = pd.concat(tables, ignore_index=True, axis=0)
    krx_sector["종목명"] = krx_sector["종목명"].str.strip()
    krx_sector["기준일"] = biz_day
    return krx_sector

In [30]:
krx_sector = get_krx_sector(biz_day)

In [31]:
krx_sector

Unnamed: 0,종목코드,종목명,시장구분,업종명,종가,대비,등락률,시가총액,기준일
0,095570,AJ네트웍스,KOSPI,서비스업,4680,125,2.74,211782912120,20240619
1,006840,AK홀딩스,KOSPI,기타금융,14570,-30,-0.21,193016963770,20240619
2,027410,BGF,KOSPI,기타금융,3500,-50,-1.41,335008768500,20240619
3,282330,BGF리테일,KOSPI,유통업,111300,-200,-0.18,1923698737800,20240619
4,138930,BNK금융지주,KOSPI,기타금융,8030,80,1.01,2586370157140,20240619
...,...,...,...,...,...,...,...,...,...
2683,024060,흥구석유,KOSDAQ,유통,14660,300,2.09,219900000000,20240619
2684,010240,흥국,KOSDAQ,기계·장비,5500,-10,-0.18,67774828000,20240619
2685,189980,흥국에프엔비,KOSDAQ,음식료·담배,2620,-180,-6.43,105161106740,20240619
2686,037440,희림,KOSDAQ,기타서비스,6260,30,0.48,87154693500,20240619


## 개별종목 지표 크롤링

In [34]:
# Crawl per-stock indicators
def get_krx_ind(biz_day, timeout=30.0):
    """Download the KRX per-stock indicator table for one trading day.

    The download is a two-step exchange: the query is posted to obtain a
    one-time password (OTP), and the OTP is then posted to the CSV endpoint.

    Args:
        biz_day: Trading day sent to KRX as the ``trdDd`` field (the notebook
            passes a ``YYYYMMDD`` string).
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A DataFrame read from the EUC-KR encoded CSV, with whitespace-stripped
        "종목명" values and a "기준일" column holding ``biz_day``.

    Raises:
        requests.HTTPError: If either KRX request answers with an error status.
    """
    gen_otp_url = "http://data.krx.co.kr/comm/fileDn/GenerateOTP/generate.cmd"
    gen_otp_data = {
        "searchType": "1",
        "mktId": "ALL",  # presumably every market at once - confirm against KRX
        "trdDd": biz_day,
        "csvxls_isNo": "false",
        "name": "fileDown",
        "url": "dbms/MDC/STAT/standard/MDCSTAT03501",
    }
    headers = {
        "Referer": "http://data.krx.co.kr/contents/MDC/MDI/mdiLoader/index.cmd?menuId=MDC0201"
    }
    otp_resp = rq.post(gen_otp_url, gen_otp_data, headers=headers, timeout=timeout)
    otp_resp.raise_for_status()

    down_url = "http://data.krx.co.kr/comm/fileDn/download_csv/download.cmd"
    down_resp = rq.post(
        down_url, {"code": otp_resp.text}, headers=headers, timeout=timeout
    )
    down_resp.raise_for_status()

    # The response and the parsed table get separate names so neither is
    # mistaken for the other.
    krx_ind = pd.read_csv(BytesIO(down_resp.content), encoding="EUC_KR")
    krx_ind["종목명"] = krx_ind["종목명"].str.strip()
    krx_ind["기준일"] = biz_day
    return krx_ind

In [35]:
krx_ind = get_krx_ind(biz_day)

In [36]:
krx_ind

Unnamed: 0,종목코드,종목명,종가,대비,등락률,EPS,PER,선행 EPS,선행 PER,BPS,PBR,주당배당금,배당수익률,기준일
0,060310,3S,2710,55,2.07,30.0,90.33,,,947.0,2.86,0,0.00,20240619
1,095570,AJ네트웍스,4680,125,2.74,367.0,12.75,873.0,5.36,9326.0,0.50,270,5.77,20240619
2,006840,AK홀딩스,14570,-30,-0.21,2635.0,5.53,,,44339.0,0.33,200,1.37,20240619
3,054620,APS,6690,-120,-1.76,667.0,10.03,,,11683.0,0.57,0,0.00,20240619
4,265520,AP시스템,27850,-500,-1.76,3997.0,6.97,4581.0,6.08,21396.0,1.30,270,0.97,20240619
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2634,000540,흥국화재,3880,-5,-0.13,4664.0,0.83,,,20881.0,0.19,0,0.00,20240619
2635,000545,흥국화재우,6000,0,0.00,,,,,,,0,0.00,20240619
2636,003280,흥아해운,2360,25,1.07,142.0,16.62,,,690.0,3.42,0,0.00,20240619
2637,037440,희림,6260,30,0.48,489.0,12.80,1113.0,5.62,5583.0,1.12,150,2.40,20240619


## 데이터 정리하기

In [41]:
# Tidy up the data
def get_kor_ticker(krx_sector, krx_ind):
    """Merge the sector and indicator tables and label each stock's type.

    Args:
        krx_sector: Sector table as returned by ``get_krx_sector``.
        krx_ind: Indicator table as returned by ``get_krx_ind``.

    Returns:
        A DataFrame restricted to the ticker columns, with a "종목구분" label
        per row, NaN replaced by ``None`` and "기준일" converted to datetime.
    """
    # Names that appear in only one of the two tables.
    unmatched_names = list(
        set(krx_sector["종목명"]).symmetric_difference(krx_ind["종목명"])
    )

    # Outer-join on every column the two tables have in common.
    shared_columns = krx_sector.columns.intersection(krx_ind.columns).tolist()
    merged = pd.merge(krx_sector, krx_ind, on=shared_columns, how="outer")

    names = merged["종목명"]
    is_spac = names.str.contains("스팩|[0-9]+호")
    is_preferred = merged["종목코드"].str[-1:] != "0"
    is_reit = names.str.endswith("리츠")
    is_unmatched = names.isin(unmatched_names)

    # Apply the rules from lowest to highest priority so that a later rule
    # overrides an earlier one: SPAC > preferred > REIT > unmatched > common.
    stock_kind = np.where(is_unmatched, "기타", "보통주")
    stock_kind = np.where(is_reit, "리츠", stock_kind)
    stock_kind = np.where(is_preferred, "우선주", stock_kind)
    stock_kind = np.where(is_spac, "스팩", stock_kind)
    merged["종목구분"] = stock_kind

    merged = merged.reset_index(drop=True)
    # Column names such as "선행 EPS" lose their inner space here.
    merged.columns = merged.columns.str.replace(" ", "")

    wanted_columns = [
        "종목코드",
        "종목명",
        "시장구분",
        "종가",
        "시가총액",
        "기준일",
        "EPS",
        "선행EPS",
        "BPS",
        "주당배당금",
        "종목구분",
    ]
    result = merged[wanted_columns].replace(np.nan, None)
    result["기준일"] = pd.to_datetime(result["기준일"])
    return result

In [42]:
import numpy as np

# Get the most recent business day
biz_day = get_biz_day()
# Crawl the sector classification table
krx_sector = get_krx_sector(biz_day)
# Crawl the per-stock indicators
krx_ind = get_krx_ind(biz_day)
# Merge and tidy the two tables
kor_ticker = get_kor_ticker(krx_sector, krx_ind)

In [43]:
kor_ticker

Unnamed: 0,종목코드,종목명,시장구분,종가,시가총액,기준일,EPS,선행EPS,BPS,주당배당금,종목구분
0,000020,동화약품,KOSPI,8140,227362165800,2024-06-20,991.0,,13413.0,180.0,보통주
1,000040,KR모터스,KOSPI,652,39206629936,2024-06-20,,,618.0,0.0,보통주
2,000050,경방,KOSPI,7750,212468342500,2024-06-20,,,29623.0,125.0,보통주
3,000070,삼양홀딩스,KOSPI,70900,607206813900,2024-06-20,22269.0,,257475.0,3500.0,보통주
4,000075,삼양홀딩스우,KOSPI,55500,16875219000,2024-06-20,,,,3550.0,우선주
...,...,...,...,...,...,...,...,...,...,...,...
2683,950170,JTC,KOSDAQ,5770,298576427960,2024-06-20,,,,,기타
2684,950190,고스트스튜디오,KOSDAQ,10440,141774072480,2024-06-20,,,,,기타
2685,950200,소마젠,KOSDAQ,4360,83869191080,2024-06-20,,,,,기타
2686,950210,프레스티지바이오파마,KOSPI,8950,537860587250,2024-06-20,,,,,기타


In [44]:
kor_ticker['시장구분'].unique()

array(['KOSPI', 'KOSDAQ'], dtype=object)

In [45]:
kor_ticker['종목구분'].unique()

array(['보통주', '우선주', '기타', '리츠', '스팩'], dtype=object)

In [47]:
!pip install pandas_market_calendars


Collecting pandas_market_calendars
  Downloading pandas_market_calendars-4.4.1-py3-none-any.whl.metadata (9.0 kB)
Collecting exchange-calendars>=3.3 (from pandas_market_calendars)
  Downloading exchange_calendars-4.5.4-py3-none-any.whl.metadata (37 kB)
Collecting pyluach (from exchange-calendars>=3.3->pandas_market_calendars)
  Downloading pyluach-2.2.0-py3-none-any.whl.metadata (4.3 kB)
Collecting toolz (from exchange-calendars>=3.3->pandas_market_calendars)
  Downloading toolz-0.12.1-py3-none-any.whl.metadata (5.1 kB)
Collecting korean-lunar-calendar (from exchange-calendars>=3.3->pandas_market_calendars)
  Downloading korean_lunar_calendar-0.3.1-py3-none-any.whl.metadata (2.8 kB)
Downloading pandas_market_calendars-4.4.1-py3-none-any.whl (107 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.3/107.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading exchange_calendars-4.5.4-py3-none-any.whl (192 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━

In [54]:
import pandas_market_calendars as mcal
import datetime
# Business days within the last `days` calendar days
def get_recent_biz_days(days):
    """List the 'XKRX' calendar sessions from `days` days ago through today.

    Args:
        days: Number of calendar days to look back from today.

    Returns:
        A list of ``YYYYMMDD`` strings, one per row of the calendar schedule.
    """
    now = datetime.datetime.today()
    window_start = now - datetime.timedelta(days=days)
    first_day = window_start.strftime('%Y-%m-%d')
    last_day = now.strftime('%Y-%m-%d')

    calendar = mcal.get_calendar('XKRX')
    # Drop the break_start / break_end market times before building the schedule.
    for market_time in ('break_start', 'break_end'):
        calendar.remove_time(market_time)

    schedule = calendar.schedule(start_date=first_day, end_date=last_day)
    return schedule.index.strftime('%Y%m%d').tolist()

In [56]:
get_recent_biz_days(30)



['20240527',
 '20240528',
 '20240529',
 '20240530',
 '20240531',
 '20240603',
 '20240604',
 '20240605',
 '20240607',
 '20240610',
 '20240611',
 '20240612',
 '20240613',
 '20240614',
 '20240617',
 '20240618',
 '20240619',
 '20240620',
 '20240621',
 '20240624']

In [59]:
import pandas_market_calendars as mcal
import datetime
import warnings

# Silence UserWarning messages raised from the pandas_market_calendars module
warnings.filterwarnings("ignore", category=UserWarning, module="pandas_market_calendars")

# Business days between two dates
def get_biz_days(start_date, end_date):
    """List the 'XKRX' calendar sessions between `start_date` and `end_date`.

    Args:
        start_date: First day of the range (the example passes 'YYYY-MM-DD').
        end_date: Last day of the range, in the same format.

    Returns:
        A list of ``YYYYMMDD`` strings, one per row of the calendar schedule.
    """
    calendar = mcal.get_calendar('XKRX')
    # Drop the break_start / break_end market times before building the schedule.
    for market_time in ('break_start', 'break_end'):
        calendar.remove_time(market_time)

    schedule = calendar.schedule(start_date=start_date, end_date=end_date)
    return schedule.index.strftime('%Y%m%d').tolist()

# Example call: list the business days from 2023-05-01 through 2023-06-01
start_date = '2023-05-01'
end_date = '2023-06-01'
print(get_biz_days(start_date, end_date))


['20230502', '20230503', '20230504', '20230508', '20230509', '20230510', '20230511', '20230512', '20230515', '20230516', '20230517', '20230518', '20230519', '20230522', '20230523', '20230524', '20230525', '20230526', '20230530', '20230531', '20230601']


# WICS 기준 섹터 정보 크롤링

In [69]:
from tqdm import tqdm 
import random
import time 

def get_kor_sector(biz_day, timeout=30.0):
    """Download the WICS sector constituents for every sector code.

    One request is made per sector code, with a random 3-5 second pause
    between consecutive requests.

    Args:
        biz_day: Reference day placed in the ``dt`` query parameter (the
            notebook passes a ``YYYYMMDD`` string).
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A DataFrame with the columns "IDX_CD", "CMP_CD", "CMP_KOR",
        "SEC_NM_KOR" and a datetime "기준일" column, indexed 0..n-1 across all
        sectors.

    Raises:
        requests.HTTPError: If a request answers with an error status.
    """
    sector_code = ["G25", "G35", "G50", "G40", "G10", "G20", "G55", "G30", "G15", "G45"]

    data_sector = []

    for position, code in enumerate(tqdm(sector_code)):
        if position:
            # Pause only between requests; there is nothing to wait for after
            # the final one.
            time.sleep(random.uniform(3, 5))

        url = f"http://www.wiseindex.com/Index/GetIndexComponets?ceil_yn=0&dt={biz_day}&sec_cd={code}"
        response = rq.get(url, timeout=timeout)
        response.raise_for_status()

        data_sector.append(pd.json_normalize(response.json()["list"]))

    # ignore_index gives every row a unique label instead of restarting at 0
    # for each sector.
    kor_sector = pd.concat(data_sector, axis=0, ignore_index=True)
    # Take an explicit copy so the column assignment below acts on an
    # independent frame rather than on a slice of the concatenated one.
    kor_sector = kor_sector[["IDX_CD", "CMP_CD", "CMP_KOR", "SEC_NM_KOR"]].copy()
    kor_sector["기준일"] = biz_day
    kor_sector["기준일"] = pd.to_datetime(kor_sector["기준일"])
    return kor_sector

In [70]:
biz_day = get_biz_day()

In [71]:
kor_sector = get_kor_sector(biz_day)

100%|███████████████████████████████████████████| 10/10 [00:43<00:00,  4.33s/it]


In [73]:
kor_sector.to_csv('kor_sector.csv', index=False)

# 수정주가 크롤링
## 개별종목 주가 크롤링