In [None]:
# Notebook for pre-fetching (backfilling) historical data for past dates as well.

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import calendar
import pandas_market_calendars as mcal
import time
import requests
import pandas as pd
import numpy as np
import os
import io
import boto3
from datetime import datetime, date
from zoneinfo import ZoneInfo 
from pykrx import stock
from bs4 import BeautifulSoup
from io import BytesIO

def get_kst_now() -> datetime:
    """Return the current moment as a timezone-aware datetime in Asia/Seoul."""
    seoul_tz = ZoneInfo("Asia/Seoul")
    return datetime.now(tz=seoul_tz)

def is_trading_day_krx(d: date) -> bool:
    """Tell whether the "XKRX" exchange calendar has a session on ``d``.

    A one-day schedule (start and end both equal to ``d``) is requested from
    the calendar; the day counts as a trading day when that schedule has at
    least one row.
    """
    day_key = f"{d:%Y-%m-%d}"
    one_day = mcal.get_calendar("XKRX").schedule(start_date=day_key, end_date=day_key)
    if one_day.empty:
        return False
    return True

def first_trading_day_of_month(year: int, month: int) -> date | None:
    krx = mcal.get_calendar("XKRX")
    start = date(year, month, 1)
    last_day = calendar.monthrange(year, month)[1]
    end = date(year, month, last_day)

    schedule = krx.schedule(start_date=start.strftime("%Y-%m-%d"),
                            end_date=end.strftime("%Y-%m-%d"))
    if schedule.empty:
        return None
    return schedule.index[0].date()

def is_first_trading_day(d: date) -> bool:
    """Tell whether ``d`` is the first trading day of its own month.

    ``d`` must itself be a trading day (per ``is_trading_day_krx``) and must
    equal the date returned by ``first_trading_day_of_month`` for ``d``'s
    year and month.
    """
    if is_trading_day_krx(d):
        first_session = first_trading_day_of_month(d.year, d.month)
        return first_session is not None and d == first_session
    return False

def get_stock_info(biz_day, mktId, timeout=30): #STK, KSQ
    """Download the per-ticker price table for one market and day from data.krx.co.kr.

    Two POSTs are made: the first asks the OTP endpoint for a one-time code
    describing the export, the second trades that code for an EUC-KR encoded
    CSV, which is parsed into a DataFrame.

    Args:
        biz_day: Value sent as ``trdDd`` (the notebook passes ``YYYYMMDD`` strings).
        mktId: Value sent as ``mktId`` (``'STK'`` or ``'KSQ'``).
        timeout: Seconds allowed for each HTTP request, so a stalled
            connection cannot block the caller forever.

    Returns:
        DataFrame indexed by ``ticker`` (always ``str``) with columns
        ``name``, ``market``, ``industry``, ``close``, ``change``,
        ``change_rate`` and ``market_cap``.

    Raises:
        requests.HTTPError: if either request answers with an HTTP error status.
        ValueError: if the downloaded CSV does not have exactly eight columns.
    """
    gen_otp_url = 'http://data.krx.co.kr/comm/fileDn/GenerateOTP/generate.cmd'
    gen_otp_stk = {
        'mktId': mktId,
        'trdDd': biz_day,
        'money': '1',
        'csvxls_isNo': 'false',
        'name': 'fileDown',
        'url': 'dbms/MDC/STAT/standard/MDCSTAT03901'
        }
    headers = {'Referer': 'http://data.krx.co.kr/contents/MDC/MDI/mdiLoader',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'}

    otp_response = requests.post(gen_otp_url, data=gen_otp_stk, headers=headers, timeout=timeout)
    otp_response.raise_for_status()
    otp_stk = otp_response.text

    down_url = 'http://data.krx.co.kr/comm/fileDn/download_csv/download.cmd'
    down_sector_stk = requests.post(down_url, data={'code': otp_stk}, headers=headers, timeout=timeout)
    down_sector_stk.raise_for_status()

    # Keep the first column (ticker) as text. With type inference, a file whose
    # codes are all digits would be parsed as integers, dropping leading zeros
    # ('005930' -> 5930) and no longer matching string-indexed tables.
    sector_stk = pd.read_csv(
        BytesIO(down_sector_stk.content),
        encoding='EUC-KR',
        converters={0: str},
    )
    # Assigning eight names also acts as a shape check: any other column count
    # (e.g. an error body instead of the CSV) raises ValueError here.
    sector_stk.columns = ['ticker', 'name', 'market', 'industry', 'close', 'change', 'change_rate', 'market_cap']
    return sector_stk.set_index('ticker')

def get_explanation(code, timeout=10):
    """Scrape the company description bullets for one ticker.

    Requests the ``c1010001.aspx`` page on navercomp.wisereport.co.kr for
    ``code`` and collects every ``<li class="dot_cmp">`` element.

    Args:
        code: Ticker placed in the ``cmp_cd`` query parameter.
        timeout: Seconds allowed for the HTTP request, so one unresponsive
            connection cannot stall a long scraping loop indefinitely.

    Returns:
        The stripped text of each matching element joined by single spaces;
        an empty string when the page has no such element.
    """
    url = f"https://navercomp.wisereport.co.kr/v2/company/c1010001.aspx?cmp_cd={code}"
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    elements = soup.find_all('li', class_="dot_cmp")
    return " ".join(li.text.strip() for li in elements)

def get_explanation_per_tickers(tickers, sleep):
    """Collect the description of every ticker into one DataFrame.

    Calls ``get_explanation`` once per ticker, pausing ``sleep`` seconds after
    each call, and prints a progress line after every 100 tickers.

    Args:
        tickers: Iterable of ticker codes.
        sleep: Seconds passed to ``time.sleep`` after each request.

    Returns:
        DataFrame indexed by ``ticker`` with a single ``explanation`` column.
        An empty ``tickers`` yields an empty frame of the same shape.
    """
    explained = []

    for idx, ticker in enumerate(tickers, 1):
        explained.append({'ticker': ticker, 'explanation': get_explanation(ticker)})
        time.sleep(sleep)
        if idx % 100 == 0:
            print(f"{idx} tickers processed...")

    # Naming the columns explicitly keeps set_index('ticker') valid even when
    # no rows were collected; without it an empty input raises KeyError.
    return pd.DataFrame(explained, columns=['ticker', 'explanation']).set_index('ticker')

def save_df_s3(prefix, today, market, df):
    """Serialize ``df`` to parquet in memory and upload it to S3.

    The bucket name is read from the ``S3_BUCKET`` environment variable and
    the object key is
    ``{prefix}/year={Y}/month={M}/market={market}/{YYYY-MM-DD}.parquet``
    (the month is not zero-padded).

    Args:
        prefix: Leading path component of the object key.
        today: Object exposing ``.year``, ``.month`` and ``.strftime``
            (e.g. ``datetime.date`` or ``pandas.Timestamp``).
        market: Value used for the ``market=`` path component.
        df: Frame to upload; written with the pyarrow engine, index included.

    Returns:
        ``0`` after the upload call returns.

    Raises:
        ValueError: if ``S3_BUCKET`` is unset or empty.
    """
    bucket = os.getenv("S3_BUCKET")
    if not bucket:
        # Fail early with a clear message instead of handing None to boto3.
        raise ValueError("S3_BUCKET environment variable is not set")
    s3 = boto3.client("s3")

    year, month = today.year, today.month
    # Format the date outside the f-string: reusing double quotes inside a
    # double-quoted f-string is a SyntaxError before Python 3.12.
    date_str = today.strftime("%Y-%m-%d")
    key = f"{prefix}/year={year}/month={month}/market={market}/{date_str}.parquet"

    buffer = io.BytesIO()
    df.to_parquet(buffer, engine="pyarrow", index=True)
    buffer.seek(0)  # rewind so the upload reads from the first byte

    s3.upload_fileobj(buffer, bucket, key)
    print(f"[S3] Uploaded to s3://{bucket}/{key}")
    return 0


# Take one KST timestamp and derive today's date from it, so both calendar
# checks below are evaluated against the same day.
now = get_kst_now()
today = now.date()
print(today)

# Flags: is today a KRX session, and is it the first session of its month?
trading_day, first_trading_day = is_trading_day_krx(today), is_first_trading_day(today)
print(trading_day, first_trading_day)

2025-09-30
True False


In [3]:
# Build the list of KRX sessions to backfill: every row of the exchange
# schedule between the two dates below.
backfill_start, backfill_end = '2024-01-01', '2025-09-29'
krx = mcal.get_calendar("XKRX")
schedule = krx.schedule(start_date=backfill_start, end_date=backfill_end)
fetch_list = schedule.index
fetch_list

DatetimeIndex(['2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05',
               '2024-01-08', '2024-01-09', '2024-01-10', '2024-01-11',
               '2024-01-12', '2024-01-15',
               ...
               '2025-09-16', '2025-09-17', '2025-09-18', '2025-09-19',
               '2025-09-22', '2025-09-23', '2025-09-24', '2025-09-25',
               '2025-09-26', '2025-09-29'],
              dtype='datetime64[ns]', length=426, freq=None)

In [9]:
def _with_roe(fundamental):
    """Return a copy of a fundamentals table with an added ``ROE`` column (EPS / BPS).

    Rows whose BPS is 0 get NaN rather than the inf (or NaN for 0/0) that a
    raw division would write into the stored data.
    """
    bps = fundamental['BPS']
    return fundamental.assign(ROE=fundamental['EPS'] / bps.where(bps != 0))


def _join_price_and_fundamental(prices, fundamental):
    """Join a price table and a fundamentals table column-wise on their index."""
    prices = prices.copy()
    # Both sides must carry the same string tickers for the axis=1 concat to
    # line rows up. If the CSV parser inferred integers, restore the text form.
    # NOTE(review): assumes 6-character KRX tickers — confirm.
    prices.index = prices.index.astype(str).str.zfill(6)
    return pd.concat([prices, fundamental], axis=1).dropna(how='all')


# Backfill: for every session in fetch_list, pull the price table and the
# pykrx fundamentals for both markets, join them per ticker, and upload one
# parquet file per (day, market).
for i in fetch_list:
    today_str = i.strftime("%Y%m%d")
    print(today_str)
    kospi = get_stock_info(today_str, 'STK')
    kosdaq = get_stock_info(today_str, 'KSQ')
    kospi_fundamental = _with_roe(stock.get_market_fundamental(today_str, market="KOSPI"))
    kosdaq_fundamental = _with_roe(stock.get_market_fundamental(today_str, market="KOSDAQ"))
    kospi_df = _join_price_and_fundamental(kospi, kospi_fundamental)
    kosdaq_df = _join_price_and_fundamental(kosdaq, kosdaq_fundamental)
    save_df_s3('price-financial-info', i, 'kospi', kospi_df)
    save_df_s3('price-financial-info', i, 'kosdaq', kosdaq_df)
    time.sleep(2)  # pause between days to go easy on the data providers

20240102
[S3] Uploaded to s3://swpp-12-bucket/price-financial-info/year=2024/month=1/market=kospi/2024-01-02.parquet
[S3] Uploaded to s3://swpp-12-bucket/price-financial-info/year=2024/month=1/market=kosdaq/2024-01-02.parquet
20240103
[S3] Uploaded to s3://swpp-12-bucket/price-financial-info/year=2024/month=1/market=kospi/2024-01-03.parquet
[S3] Uploaded to s3://swpp-12-bucket/price-financial-info/year=2024/month=1/market=kosdaq/2024-01-03.parquet
20240104
[S3] Uploaded to s3://swpp-12-bucket/price-financial-info/year=2024/month=1/market=kospi/2024-01-04.parquet
[S3] Uploaded to s3://swpp-12-bucket/price-financial-info/year=2024/month=1/market=kosdaq/2024-01-04.parquet
20240105
[S3] Uploaded to s3://swpp-12-bucket/price-financial-info/year=2024/month=1/market=kospi/2024-01-05.parquet
[S3] Uploaded to s3://swpp-12-bucket/price-financial-info/year=2024/month=1/market=kosdaq/2024-01-05.parquet
20240108
[S3] Uploaded to s3://swpp-12-bucket/price-financial-info/year=2024/month=1/market=kosp

In [7]:
# Load the previously saved company descriptions for both markets.
# The ticker column is read as str: with type inference, a file whose codes
# are all digits would be parsed as integers and lose its leading zeros
# (e.g. '005930' -> 5930).
kospi_explanation = pd.read_csv('20250929_kospi_explanation.csv', dtype={'ticker': str}, index_col='ticker')
kosdaq_explanation = pd.read_csv('20250929_kosdaq_explanation.csv', dtype={'ticker': str}, index_col='ticker')
kospi_explanation

Unnamed: 0_level_0,explanation
ticker,Unnamed: 1_level_1
095570,"동사는 2000년 OA기기 및 산업용 특수장비의 대여, 판매를 목적으로 설립되어 2..."
006840,"동사는 2012년 인적분할로 지주회사로 전환되어 애경케미칼, 애경산업, 제주항공, ..."
027410,동사는 1994년 설립되어 2017년 투자사업부문과 사업부문으로 분할 후 2018년...
282330,"동사는 2017년 비지에프에서 분할된 법인으로, 2025년 BGF RETAIL HA..."
138930,"동사는 2011년 주식 포괄적 이전으로 설립된 지주회사이며, 부산은행, BNK투자증..."
...,...
079980,동사는 2000년 삼양사와 SK케미칼의 폴리에스터 원사 사업을 출자해 50:50으로...
005010,동사는 1967년 설립되어 1973년 한국거래소에 상장된 강관 전문 제조업체임. 배...
000540,"동사는 1948년 고려화재해상보험으로 설립되었고 2006년 태광그룹에 편입, 현재 ..."
000545,"동사는 1948년 고려화재해상보험으로 설립되었고 2006년 태광그룹에 편입, 현재 ..."


In [8]:
# Upload both description tables under the 2025-09-29 partition of the
# company-profile prefix.
profile_prefix = 'company-profile'
i = pd.to_datetime('2025-09-29')
save_df_s3(profile_prefix, i, 'kospi', kospi_explanation)
save_df_s3(profile_prefix, i, 'kosdaq', kosdaq_explanation)

[S3] Uploaded to s3://swpp-12-bucket/company-profile/year=2025/month=9/market=kospi/2025-09-29.parquet
[S3] Uploaded to s3://swpp-12-bucket/company-profile/year=2025/month=9/market=kosdaq/2025-09-29.parquet


0