In [72]:
import pandas as pd
import plotly.io as pio
import importlib

# from utils import common_utils
# importlib.reload(common_utils)
from utils.common_utils import *

# from utils import chart_utils
# importlib.reload(chart_utils)
from utils.chart_utils import *

# from utils import krx_utils
# importlib.reload(krx_utils)
from utils.krx_utils import *

# from utils import ki_utils
# importlib.reload(ki_utils)
from utils.ki_utils import *

pd.options.display.float_format = "{:.3f}".format

pd.set_option('future.no_silent_downcasting', True)

# 출력 가능한 최대 행 수를 None으로 설정 (제한 없음)
pd.set_option('display.max_rows', None)

# 출력 가능한 최대 컬럼 수를 None으로 설정 (제한 없음)
pd.set_option('display.max_columns', None)

# Plotly 데이터를 Notebook 파일에 저장하지 않도록 설정
pio.renderers.default = "notebook_connected"

# 매매 전략
- 매매 (슬리피지 최소화)
  - 당일 거래 대금순 TopN 종목
  - 종가가 상한가 근처(29%~)인 경우 제외
  - 시간외 종가 매수(15:50~16:00)
  - n일 후 종가 매도
- 주요 피쳐
  - 캔들 상태
    - 양봉/음봉, 위/아래 꼬리 비율, 바디 비율
  - 전일 종가 대비 시가/고가/저가/종가 등락률
  - 거래량 변화율
  - 이동평균선
    - 종가 이평선 5, 20일 상태
    - 거래량 이평선 5, 20일 상태
  - 매매동향
    - 개인, 기관, 외국인 순매수량
  - 지수
    - 코스피/코스닥 지수, 각종 해외 선물 지수, 변동성 지수(VIX)
  - 각종 기술 지표
    - RSI, ATR, MACD
  - ~~한국/미국 선옵 만기일(변동성 고려)~~
- 수익률
  - 수수료 및 세금 제외

In [2]:
import pandas as pd
from datetime import datetime
from utils.common_utils import *
from pykrx.website.krx.market.ticker import StockTicker
import pandas_market_calendars as mcal
import warnings


# 상장 폐지 종목
delisted_stocks = StockTicker().delisted.index


def get_n_trading_days_before(year, ndays):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        
        # 한국거래소 캘린더 불러오기
        krx = mcal.get_calendar('XKRX')
        
        # 연도 기준 n거래일 전 날짜 가져오기
        schedule = krx.schedule(start_date=f'{year-1}-01-01', end_date=f'{year}-01-01')
        return schedule.index[-ndays].strftime('%Y%m%d')
        

# VWMA (Volume-Weighted Moving Average: 거래 가중 이동평균)
def vwma(price, volume, window):
    pv = price * volume
    return pv.rolling(window=window).sum() / volume.rolling(window=window).sum()

    
def get_topn_stocks(start_year, end_year, topn, min_market_cap):
    topn_df = get_market_years('market', start_year - 1, end_year)

    # 상장 폐지 종목 제외
    topn_df = topn_df[~topn_df['stock_code'].isin(delisted_stocks)]

    # 코스피/코스닥 종목
    topn_df = topn_df[topn_df['market_name'].isin(['KOSPI', 'KOSDAQ', 'KOSDAQGLOBAL'])]

    # 종목별 마지막 거래정지일
    last_suspend_date = (
        topn_df[topn_df['open'] == 0]
        .groupby('stock_code')['date']
        .max()
        .rename('suspend_date')
    )
    topn_df = topn_df.merge(last_suspend_date, on='stock_code', how='left')

    # 거래 일시 정지 이후만 유지
    topn_df = topn_df[(topn_df['suspend_date'].isna()) | (topn_df['date'] > topn_df['suspend_date'])]
    
    # 컬럼 지정
    topn_df = topn_df[['stock_code', 'stock_name', 'date', 'market_cap', 'market_name', 'trading_value']]

    # 시가총액 필터링 (전체 기간 동안 최소 시가 총액에 한번도 도달하지 못한 종목 제외)
    valid_stocks = topn_df.loc[topn_df['market_cap'] >= min_market_cap, 'stock_code'].unique()
    topn_df = topn_df[topn_df['stock_code'].isin(valid_stocks)]

    # 일별 거래대금 순 상위 종목 선정 (전체 기간 동안 topn에 한번도 도달하지 못한 종목 제외)
    rank = topn_df.groupby('date')['trading_value'].rank(ascending=False, method='min')
    topn_stocks = topn_df.loc[rank <= topn, 'stock_code'].unique()
    topn_df = topn_df[topn_df['stock_code'].isin(topn_stocks)]

    # 종목명 변경
    # topn_df.loc[topn_df['stock_code'] == '035420', 'stock_name'] = '네이버'

    # 시가/저가/고가/종가/거래량
    ohlcv_df = get_market_years('ohlcv', start_year - 1, end_year)
    topn_df = topn_df.merge(ohlcv_df, on=['stock_code', 'date'], how='left')

    # 개인/기관/외국인 순매수량
    trading_df = get_market_years('trading', start_year - 1, end_year)
    topn_df = topn_df.merge(trading_df, on=['stock_code', 'date'], how='left')

    # 코스피 VIX (변동성 지수: 공포 지수)
    # https://kr.investing.com/indices/kospi-volatility-historical-data
    # 종가는 15:30 확정
    kospi_vix_df = get_index_years('KOSPI_VIX', start_year - 1, end_year)
    kospi_vix_df = kospi_vix_df.rename(columns={
        'close': 'vix_close',
        # 'close_rate': 'vix_close_rate',
    })
    kospi_vix_df = kospi_vix_df[['date', 'vix_close']]#, 'vix_close_rate']]
    kospi_vix_df = kospi_vix_df.sort_values(by='date', ascending=True).reset_index(drop=True)
    kospi_vix_df["vix_close_rate"] = kospi_vix_df['vix_close'] / kospi_vix_df['vix_close'].shift(1)
    topn_df = topn_df.merge(kospi_vix_df, on=['date'], how='left')

    # 코스피
    kospi_df = get_index_years('KOSPI', start_year - 1, end_year)
    kospi_df = kospi_df.rename(columns={'close': 'kospi_close', 'trading_volume': 'kospi_trading_volume', 'trading_value': 'kospi_trading_value'})
    kospi_df = kospi_df[['date', 'kospi_close', 'kospi_trading_volume', 'kospi_trading_value']]
    # 코스피 등락률
    kospi_df = kospi_df.sort_values(by='date', ascending=True).reset_index(drop=True)
    kospi_df["kospi_close_rate"] = kospi_df['kospi_close'] / kospi_df['kospi_close'].shift(1)
    # 코스피 이동평균
    kospi_ma5 = kospi_df['kospi_close'].rolling(window=5).mean()
    kospi_ma20 = kospi_df['kospi_close'].rolling(window=20).mean()
    kospi_df["kospi_ma5_gap"] = (kospi_df["kospi_close"] - kospi_ma5) / kospi_ma5
    kospi_df["kospi_ma20_gap"] = (kospi_df["kospi_close"] - kospi_ma20) / kospi_ma20
    kospi_vwma5 = vwma(kospi_df["kospi_close"], kospi_df["kospi_trading_volume"], 5)
    kospi_vwma20 = vwma(kospi_df["kospi_close"], kospi_df["kospi_trading_volume"], 20)
    kospi_df["kospi_vwma5_gap"] = (kospi_df["kospi_close"] - kospi_vwma5) / kospi_vwma5
    kospi_df["kospi_vwma20_gap"] = (kospi_df["kospi_close"] - kospi_vwma20) / kospi_vwma20
    # 코스피 전일 대비 거래대금 변화율
    kospi_df["kospi_trading_rate"] = kospi_df['kospi_trading_value'] / kospi_df['kospi_trading_value'].shift(1)
    # 코스피 5일 평균 대비 거래대금 변화율
    kospi_trading_ma5 = kospi_df["kospi_trading_value"].rolling(5).mean()
    kospi_df["kospi_trading_dev"] = kospi_df["kospi_trading_value"] / kospi_trading_ma5
    topn_df = topn_df.merge(kospi_df, on=['date'], how='left')

    # 코스피200 선물
    # https://kr.investing.com/indices/kospi-200-future-historical-data
    # 종가는 15:50 확정
    kospi200f_df = get_index_years('KOSPI_200_FUTURES', start_year - 1, end_year)
    kospi200f_df = kospi200f_df.rename(columns={'close': 'kospi200f_close'})#, 'close_rate': 'kospi200f_close_rate'})
    kospi200f_df = kospi200f_df[['date', 'kospi200f_close']]#, 'kospi200f_close_rate']]
    kospi200f_df = kospi200f_df.sort_values(by='date', ascending=True).reset_index(drop=True)
    kospi200f_df["kospi200f_close_rate"] = kospi200f_df['kospi200f_close'] / kospi200f_df['kospi200f_close'].shift(1)
    topn_df = topn_df.merge(kospi200f_df, on=['date'], how='left')

    # 코스닥
    kosdaq_df = get_index_years('KOSDAQ', start_year - 1, end_year)
    kosdaq_df = kosdaq_df.rename(columns={'close': 'kosdaq_close', 'trading_volume': 'kosdaq_trading_volume', 'trading_value': 'kosdaq_trading_value'})
    kosdaq_df = kosdaq_df[['date', 'kosdaq_close', 'kosdaq_trading_volume', 'kosdaq_trading_value']]
    # 코스닥 등락률
    kosdaq_df = kosdaq_df.sort_values(by='date', ascending=True).reset_index(drop=True)
    kosdaq_df["kosdaq_close_rate"] = kosdaq_df['kosdaq_close'] / kosdaq_df['kosdaq_close'].shift(1)
    # 코스닥 이동평균
    kosdaq_ma5 = kosdaq_df['kosdaq_close'].rolling(window=5).mean()
    kosdaq_ma20 = kosdaq_df['kosdaq_close'].rolling(window=20).mean()
    kosdaq_df["kosdaq_ma5_gap"] = (kosdaq_df["kosdaq_close"] - kosdaq_ma5) / kosdaq_ma5
    kosdaq_df["kosdaq_ma20_gap"] = (kosdaq_df["kosdaq_close"] - kosdaq_ma20) / kosdaq_ma20
    kosdaq_vwma5 = vwma(kosdaq_df["kosdaq_close"], kosdaq_df["kosdaq_trading_volume"], 5)
    kosdaq_vwma20 = vwma(kosdaq_df["kosdaq_close"], kosdaq_df["kosdaq_trading_volume"], 20)
    kosdaq_df["kosdaq_vwma5_gap"] = (kosdaq_df["kosdaq_close"] - kosdaq_vwma5) / kosdaq_vwma5
    kosdaq_df["kosdaqi_vwma20_gap"] = (kosdaq_df["kosdaq_close"] - kosdaq_vwma20) / kosdaq_vwma20
    # 코스닥 전일 대비 거래대금 변화율
    kosdaq_df["kosdaq_trading_rate"] = kosdaq_df['kosdaq_trading_value'] / kosdaq_df['kosdaq_trading_value'].shift(1)
    # 코스닥 5일 평균 대비 거래대금 변화율
    kosdaq_trading_ma5 = kosdaq_df["kosdaq_trading_value"].rolling(5).mean()
    kosdaq_df["kosdaq_trading_dev"] = kosdaq_df["kosdaq_trading_value"] / kosdaq_trading_ma5
    topn_df = topn_df.merge(kosdaq_df, on=['date'], how='left')

    # 코스피 코스닥 거래대금 비율
    topn_df["kospi_kosdaq_ratio"] = topn_df["kosdaq_trading_value"] / topn_df["kospi_trading_value"]

    # 미국 선물 종가는 한국 시간 기준 다음날 아침에 마감되기 때문에 한국 날짜 기준으로 입력 데이터에 포함하면 데이터 누수가 발생한다.
    # 나스닥100, S&P500 선물 일별 종가(정산가) 결정 시간
    # CME(시카고상품거래소)에서 매일 오후 4~5시(ET) 정산가(Settlement Price)를 결정한다.
    # 이는 한국 시간(KST)으로 다음날 오전 5~6시 (인베스팅은 오전 10시쯤 업데이트된다.)

    # 나스닥100 선물
    # https://kr.investing.com/indices/nq-100-futures-historical-data
    nasdaq100f_df = get_index_years('NASDAQ_100_FUTURES', start_year - 1, end_year)
    nasdaq100f_df = nasdaq100f_df.rename(columns={'close': 'nasdaq100f_close'})#, 'close_rate': 'nasdaq100f_close_rate'})
    nasdaq100f_df = nasdaq100f_df[['date', 'nasdaq100f_close']]#, 'nasdaq100f_close_rate']]
    nasdaq100f_df = nasdaq100f_df.sort_values(by='date', ascending=True).reset_index(drop=True)
    nasdaq100f_df["nasdaq100f_close_rate"] = nasdaq100f_df['nasdaq100f_close'] / nasdaq100f_df['nasdaq100f_close'].shift(1)
    # 나스닥 선물 이동평균
    nasdaq100f_ma5 = nasdaq100f_df['nasdaq100f_close'].rolling(window=5).mean()
    nasdaq100f_ma20 = nasdaq100f_df['nasdaq100f_close'].rolling(window=20).mean()
    nasdaq100f_df["nasdaq100f_ma5_gap"] = (nasdaq100f_df["nasdaq100f_close"] - nasdaq100f_ma5) / nasdaq100f_ma5
    nasdaq100f_df["nasdaq100f_ma20_gap"] = (nasdaq100f_df["nasdaq100f_close"] - nasdaq100f_ma20) / nasdaq100f_ma20
    # 날짜를 하루 뒤로 이동해서, 실질적으로 하루 전 데이터가 매칭되도록 조정
    nasdaq100f_df['date'] = pd.to_datetime(nasdaq100f_df['date'], format='%Y%m%d') + pd.Timedelta(days=1)
    nasdaq100f_df['date'] = nasdaq100f_df['date'].dt.strftime('%Y%m%d')
    topn_df = topn_df.merge(nasdaq100f_df, on=['date'], how='left')

    # S&P500 선물
    # https://kr.investing.com/indices/us-spx-500-futures
    sp500f_df = get_index_years('S&P_500_FUTURES', start_year - 1, end_year)
    sp500f_df = sp500f_df.rename(columns={'close': 'sp500f_close'})#, 'close_rate': 'sp500f_close_rate'})
    sp500f_df = sp500f_df[['date', 'sp500f_close']]#, 'sp500f_close_rate']]
    sp500f_df = sp500f_df.sort_values(by='date', ascending=True).reset_index(drop=True)
    sp500f_df["sp500f_close_rate"] = sp500f_df['sp500f_close'] / sp500f_df['sp500f_close'].shift(1)
    # S&P500 선물 이동평균
    sp500f_ma5 = sp500f_df['sp500f_close'].rolling(window=5).mean()
    sp500f_ma20 = sp500f_df['sp500f_close'].rolling(window=20).mean()
    sp500f_df["sp500f_ma5_gap"] = (sp500f_df["sp500f_close"] - sp500f_ma5) / sp500f_ma5
    sp500f_df["sp500f_ma20_gap"] = (sp500f_df["sp500f_close"] - sp500f_ma20) / sp500f_ma20
    # 날짜를 하루 뒤로 이동해서, 실질적으로 하루 전 데이터가 매칭되도록 조정
    sp500f_df['date'] = pd.to_datetime(sp500f_df['date'], format='%Y%m%d') + pd.Timedelta(days=1)
    sp500f_df['date'] = sp500f_df['date'].dt.strftime('%Y%m%d')
    topn_df = topn_df.merge(sp500f_df, on=['date'], how='left')

    # S&P500 VIX
    # https://kr.investing.com/indices/volatility-s-p-500-historical-data
    sp500v_df = get_index_years('S&P_500_VIX', start_year - 1, end_year)
    sp500v_df = sp500v_df.rename(columns={'close': 'sp500v_close'})#, 'close_rate': 'sp500v_close_rate'})
    sp500v_df = sp500v_df[['date', 'sp500v_close']]#, 'sp500v_close_rate']]
    sp500v_df = sp500v_df.sort_values(by='date', ascending=True).reset_index(drop=True)
    sp500v_df["sp500v_close_rate"] = sp500v_df['sp500v_close'] / sp500v_df['sp500v_close'].shift(1)
    # 날짜를 하루 뒤로 이동해서, 실질적으로 하루 전 데이터가 매칭되도록 조정
    sp500v_df['date'] = pd.to_datetime(sp500v_df['date'], format='%Y%m%d') + pd.Timedelta(days=1)
    sp500v_df['date'] = sp500v_df['date'].dt.strftime('%Y%m%d')
    topn_df = topn_df.merge(sp500v_df, on=['date'], how='left')

    # S&P500 VIX 선물
    # https://kr.investing.com/indices/us-spx-vix-futures-historical-data
    sp500vf_df = get_index_years('S&P_500_VIX_FUTURES', start_year - 1, end_year)
    sp500vf_df = sp500vf_df.rename(columns={'close': 'sp500vf_close'})#, 'close_rate': 'sp500vf_close_rate'})
    sp500vf_df = sp500vf_df[['date', 'sp500vf_close']]#, 'sp500vf_close_rate']]
    sp500vf_df = sp500vf_df.sort_values(by='date', ascending=True).reset_index(drop=True)
    sp500vf_df["sp500vf_close_rate"] = sp500vf_df['sp500vf_close'] / sp500vf_df['sp500vf_close'].shift(1)
    # 날짜를 하루 뒤로 이동해서, 실질적으로 하루 전 데이터가 매칭되도록 조정
    sp500vf_df['date'] = pd.to_datetime(sp500vf_df['date'], format='%Y%m%d') + pd.Timedelta(days=1)
    sp500vf_df['date'] = sp500vf_df['date'].dt.strftime('%Y%m%d')
    topn_df = topn_df.merge(sp500vf_df, on=['date'], how='left')

    # 한국과 미국의 거래일 차이로 인해 데이터 병합 시 NaN 발생
    # 데이터가 없는 날은 가장 가까운 과거일 값 사용
    topn_df = topn_df.sort_values(by='date', ascending=True).reset_index(drop=True)
    cols_to_ffill = [
        "nasdaq100f_close", "nasdaq100f_close_rate", "nasdaq100f_ma5_gap", "nasdaq100f_ma20_gap",
        "sp500f_close", "sp500f_close_rate", "sp500f_ma5_gap", "sp500f_ma20_gap", 
        "sp500v_close", "sp500v_close_rate",
        "sp500vf_close", "sp500vf_close_rate",
    ]
    topn_df[cols_to_ffill] = topn_df[cols_to_ffill].ffill()

    # 마켓 타입
    topn_df["is_kospi"] = topn_df["market_name"] == "KOSPI"
    
    # 캔들
    body = abs(topn_df['open'] - topn_df['close'])
    upper_tail = topn_df['high'] - topn_df[['open', 'close']].max(axis=1)
    lower_tail = topn_df[['open', 'close']].min(axis=1) - topn_df['low']
    total_range = topn_df['high'] - topn_df['low']
    # 캔들 윗꼬리 비율
    topn_df['candle_upper_tail_ratio'] = np.where(total_range == 0, 0, upper_tail / total_range)
    # 캔들 아래 꼬리 비율
    topn_df['candle_lower_tail_ratio'] = np.where(total_range == 0, 0, lower_tail / total_range)
    # 캔들 바디 비율
    topn_df['candle_body_ratio'] = np.where(total_range == 0, 0, body / total_range)
    # 캔들 양봉/음봉
    topn_df['candle_sign'] = topn_df['close'] >= topn_df['open']
    # 변동성
    topn_df['open_to_close'] = (topn_df['close'] - topn_df['open']) / topn_df['open']
    topn_df['high_to_low'] = (topn_df['high'] - topn_df['low']) / topn_df['low']

    # 날짜 관련 컬럼 추가
    topn_df['datetime'] = pd.to_datetime(topn_df['date'])
    topn_df['year'] = topn_df['datetime'].dt.year
    topn_df['month'] = topn_df['datetime'].dt.month
    topn_df['day_of_week'] = topn_df['datetime'].dt.dayofweek
    # 월 정보
    topn_df['month_sin'] = np.sin(2 * np.pi * topn_df['month'] / 12)
    topn_df['month_cos'] = np.cos(2 * np.pi * topn_df['month'] / 12)
    # 요일 정보
    topn_df['dow_sin'] = np.sin(2 * np.pi * topn_df['day_of_week'] / 5)
    topn_df['dow_cos'] = np.cos(2 * np.pi * topn_df['day_of_week'] / 5)

    df_list = []
    for stock_code, df in topn_df.groupby('stock_code'):        
        # 시간순 정렬
        df = df.sort_values(by='date', ascending=True).reset_index(drop=True)

        # 표준산업분류코드
        stock_info = get_stock_info(stock_code)
        df['industry_code'] = stock_info['industry_code'][:4]

        # 상장일
        df['listing_date'] = stock_info['listing_date']
        df['listing_date'] = pd.to_datetime(df['listing_date'], format='%Y%m%d')
    
        # RSI(Relative Strength Index: 상대강도지수)
        # 현재 가격이 과매수(Overbought) 또는 과매도(Oversold) 상태인지 측정하는 모멘텀 지표
        df['rsi'] = calculate_rsi_ema(df, 14, True)

        # ATR(Average True Range: 평균 진폭)
        # 변동성을 측정하는 지표로, 주가의 평균적인 변동 폭을 나타낸다.
        atr = calculate_atr(df)
        df["atr_ratio"] = atr / df["close"]

        # MACD(Moving Average Convergence Divergence: 이동평균 수렴확산 지수)
        # 단기 이동평균과 장기 이동평균 간의 차이를 분석하여 추세 강도와 전환점을 파악하는 지표
        macd, macd_signal, macd_histogram = calculate_macd(df)
        df["macd_ratio"] = macd / df["close"]
        df["macd_signal_ratio"] = macd_signal / df["close"]
        # MACD가 Signal Line을 상향 돌파할 때 (골든 크로스)
        df["macd_golden_cross"] = (macd > macd_signal) & (macd.shift(1) <= macd_signal.shift(1))
        # MACD가 Signal Line을 하향 돌파할 때 (데드 크로스)
        df["macd_dead_cross"] = (macd < macd_signal) & (macd.shift(1) >= macd_signal.shift(1))

        # 이평선
        df["ma5"] = df['close'].rolling(window=5).mean()
        df["ma20"] = df['close'].rolling(window=20).mean()

        # 이평선과 종가 간 괴리도
        df["ma5_gap"] = (df["close"] - df["ma5"]) / df["ma5"]
        df["ma20_gap"] = (df["close"] - df["ma20"]) / df["ma20"]

        # VWMA 생성
        df["vwma5"] = vwma(df["close"], df["trading_volume"], 5)
        df["vwma20"] = vwma(df["close"], df["trading_volume"], 20)
        df["vwma12"] = vwma(df["close"], df["trading_volume"], 12)
        df["vwma26"] = vwma(df["close"], df["trading_volume"], 26)

        # VWMA와 종가 간 괴리도 (gap)
        df["vwma5_gap"] = (df["close"] - df["vwma5"]) / df["vwma5"]
        df["vwma20_gap"] = (df["close"] - df["vwma20"]) / df["vwma20"]

        # VWMA 간 추세 비율
        df["vwma5_to_20_ratio"] = df["vwma5"] / df["vwma20"]
        df["close_to_vwma5_ratio"] = df["close"] / df["vwma5"]
    
        # VWMA의 기울기
        df["vwma5_slope_ratio"] = df["vwma5"].diff() / df["vwma5"]

        # VWMA - SMA 차이
        df["vwma5_sma5_diff"] = (df["vwma5"] - df["ma5"]) / df["ma5"]
        df["vwma20_sma20_diff"] = (df["vwma20"] - df["ma20"]) / df["ma20"]

        # VWMA 기반 MACD
        vwma_macd = df["vwma12"] - df["vwma26"]
        vwma_macd_signal = vwma_macd.ewm(span=9).mean()
        df["vwma_macd_ratio"] = vwma_macd / df["close"]
        df["vwma_macd_signal_ratio"] = vwma_macd_signal / df["close"]
        df["vwma_macd_golden_cross"] = ((vwma_macd > vwma_macd_signal) & (vwma_macd.shift(1) <= vwma_macd_signal.shift(1))).astype(int)
        df["vwma_macd_dead_cross"] = ((vwma_macd < vwma_macd_signal) & (vwma_macd.shift(1) >= vwma_macd_signal.shift(1))).astype(int)

        # VWMA 기반 볼린저 밴드
        vwma_std = df["close"].rolling(window=20).std()
        df["vwma_bb_upper"] = df["vwma20"] + 2 * vwma_std
        df["vwma_bb_lower"] = df["vwma20"] - 2 * vwma_std
        df["vwma_bb_upper_ratio"] = df["vwma_bb_upper"] / df["close"]
        df["vwma_bb_lower_ratio"] = df["vwma_bb_lower"] / df["close"]
        df["vwma_bb_width"] = (df["vwma_bb_upper"] - df["vwma_bb_lower"]) / df["close"]

        # VWMA 크로스 신호
        df["close_vwma_golden_cross"] = ((df["close"] > df["vwma5"]) & (df["close"].shift(1) <= df["vwma5"].shift(1))).astype(int)
        df["close_vwma_dead_cross"] = ((df["close"] < df["vwma5"]) & (df["close"].shift(1) >= df["vwma5"].shift(1))).astype(int)
        df["vwma_golden_cross"] = ((df["vwma5"] > df["vwma20"]) & (df["vwma5"].shift(1) <= df["vwma20"].shift(1))).astype(int)
        df["vwma_dead_cross"] = ((df["vwma5"] < df["vwma20"]) & (df["vwma5"].shift(1) >= df["vwma20"].shift(1))).astype(int)

        # 볼린저 밴드
        # 가격 변동성과 추세를 분석하는 기술적 지표
        # 이동평균선(SMA:Simple Moving Average)과 표준편차(SD: Standard Deviation)를 활용하여 상한선과 하한선을 형성
        stddev = df["close"].rolling(window=20).std()
        df["bb_upper"] = df["ma20"] + (stddev * 2)  # 상한선
        df["bb_lower"] = df["ma20"] - (stddev * 2)  # 하한선
        df["bb_upper_ratio"] = df["bb_upper"] / df["close"]
        df["bb_lower_ratio"] = df["bb_lower"] / df["close"]
        df["bb_width"] = (df["bb_upper"] - df["bb_lower"]) / df["close"] * 100
        
        # 크로스 신호
        df["golden_cross"] = ((df["ma5"] > df["ma20"]) & (df["ma5"].shift(1) <= df["ma20"].shift(1))).astype(int)
        df["dead_cross"] = ((df["ma5"] < df["ma20"]) & (df["ma5"].shift(1) >= df["ma20"].shift(1))).astype(int)

        # 거래량 이평선
        df["vol_ma5"] = df['trading_volume'].rolling(window=5).mean()
        df["vol_ma20"] = df['trading_volume'].rolling(window=20).mean()
        
        # 거래량 크로스 신호
        df["trading_golden_cross"] = ((df["vol_ma5"] > df["vol_ma20"]) & (df["vol_ma5"].shift(1) <= df["vol_ma20"].shift(1))).astype(int)
        df["trading_dead_cross"] = ((df["vol_ma5"] < df["vol_ma20"]) & (df["vol_ma5"].shift(1) >= df["vol_ma20"].shift(1))).astype(int)
        # 거래량 변동성
        # 최근 10일간 거래량의 변동성(표준편차)을 평균 거래량으로 정규화한 값
        volatility_10d = df["trading_volume"].rolling(window=10).std()
        mean_volume_10d = df["trading_volume"].rolling(window=10).mean()
        df["trading_volume_volatility_ratio"] = volatility_10d / mean_volume_10d

         # 최근 10일 기관/외국인 순매수량 평균대비 당일 순매수량 증가율
        df["foreign_rate"] = get_net_buy_rate(df, "foreign", 10)
        df["institution_rate"] = get_net_buy_rate(df, "institution", 10)
        df["individual_rate"] = get_net_buy_rate(df, "individual", 10)

        # 순매수 유지 일수
        df["foreign_net_buy_days"] = get_net_buy_days(df, "foreign")
        df["institution_net_buy_days"] = get_net_buy_days(df, "institution")
        df["individual_net_buy_days"] = get_net_buy_days(df, "individual")

        df['prev_close'] = df["close"].shift(1)
        df['prev_trading_volume'] = df["trading_volume"].shift(1)
        df['next1_close'] = df["close"].shift(-1)
        df['next2_close'] = df["close"].shift(-2)
        df['next3_close'] = df["close"].shift(-3)
        df['next4_close'] = df["close"].shift(-4)
        df['next5_close'] = df["close"].shift(-5)
        df = df.dropna(subset=['next5_close'])

        # 전일 종가 대비 시가/저가/고가/종가 등락률
        df["open_rate"] = (df['open'] / df['prev_close'] - 1) * 100
        df["low_rate"] = (df['low'] / df['prev_close'] - 1) * 100
        df["high_rate"] = (df['high'] / df['prev_close'] - 1) * 100
        df["close_rate"] = (df['close'] / df['prev_close'] - 1) * 100
        
        # 전일 거래량 대비
        df["trading_change"] = df['trading_volume'] / df['prev_trading_volume']
        
        # 평균 거래량 대비
        df["trading_rolling_change"] = df["trading_volume"] / df["trading_volume"].rolling(5).mean()

        # 당일 종가 대비 n일 후 종가 등락률 (label로 사용)
        df["next1_close_rate"] = (df['next1_close'] / df['close'] - 1) * 100
        df["next2_close_rate"] = (df['next2_close'] / df['close'] - 1) * 100
        df["next3_close_rate"] = (df['next3_close'] / df['close'] - 1) * 100
        df["next4_close_rate"] = (df['next4_close'] / df['close'] - 1) * 100
        df["next5_close_rate"] = (df['next5_close'] / df['close'] - 1) * 100

        df["rsi"] = df["rsi"].clip(lower=0, upper=100) # 이상치 제거
        df["open_rate"] = df["open_rate"].clip(lower=-30, upper=30) # 이상치 제거
        df["low_rate"] = df["low_rate"].clip(lower=-30, upper=30) # 이상치 제거
        df["high_rate"] = df["high_rate"].clip(lower=-30, upper=30) # 이상치 제거
        df["close_rate"] = df["close_rate"].clip(lower=-30, upper=30) # 이상치 제거
        df = df.reset_index(drop=True)

        df_list.append(df)

    df = pd.concat(df_list)

    # 최소 시가총액 종목 대상 일별 거래대금 순 rank 저장
    mask = df['market_cap'] >= min_market_cap
    df['rank'] = np.nan
    df.loc[mask, 'rank'] = df[mask].groupby('date')['trading_value'].rank(ascending=False, method='min')
    df['rank'] = df['rank'].fillna(9999).astype(int)

    # # 섹터 순위
    # df_sector = (
    #     df.groupby(['date', 'industry_code'])['close_rate']
    #     .mean()
    #     .reset_index(name='sector_close_rate')
    # )
    # df_sector['sector_rank'] = (
    #     df_sector.groupby('date')['sector_close_rate']
    #     .rank(method='min', ascending=False)
    # )
    # df = df.merge(df_sector[['date', 'industry_code', 'sector_rank', 'sector_close_rate']], on=['date', 'industry_code'], how='left')

    # # 섹터 비율
    # df_top200 = df[df['rank'] <= 200]
    # df_sector_counts = df_top200.groupby(['date', 'industry_code']).size().reset_index(name='sector_count')
    # df_total_counts = df_top200.groupby('date').size().reset_index(name='total_count')
    # df_sector_counts = df_sector_counts.merge(df_total_counts, on='date')
    # df_sector_counts['sector_ratio'] = df_sector_counts['sector_count'] / df_sector_counts['total_count']
    # df = df.merge(df_sector_counts[['date', 'industry_code', 'sector_ratio']], on=['date', 'industry_code'], how='left')
    # df['sector_ratio'] = df['sector_ratio'].fillna(0)

    # 상장된지 6개월 이상 지난 종목만 사용
    df = df[(df['datetime'] - df['listing_date']).dt.days >= 125]

    # 30일 이상 데이터가 확보된 종목만 사용
    df = df.sort_values(['stock_code', 'date']).reset_index(drop=True)
    df['day_index'] = df.groupby('stock_code').cumcount()
    df = df[df['day_index'] >= 30]
    
    # 데이터를 확보하기 위해 시작 날짜 조정
    start_date = get_n_trading_days_before(start_year, 30)
    df = df[df['date'] >= start_date]
    
    print(f'{start_year} - {end_year}, topn: {topn}, min_market_cap: {min_market_cap:,}, count: {len(df):,}')
    return df

In [3]:
TOPN = 100
MIN_MARKET_CAP = 500_000_000_000
df_topn = get_topn_stocks(2020, 2025, TOPN, MIN_MARKET_CAP)

2020 - 2025, topn: 100, min_market_cap: 500,000,000,000, count: 907,585


In [4]:
nan = df_topn[df_topn.isna().any(axis=1)]
print(f'NaN: {len(nan)}')
# display(nan.head())

inf = df_topn[df_topn.isin([np.inf, -np.inf]).any(axis=1)]
print(f'INF: {len(inf)}')
# display(inf.head())

NaN: 0
INF: 0


# 피쳐 선정 및 전처리

In [5]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from utils.common_utils import get_industry_codes
from sklearn.preprocessing import LabelEncoder
from scipy.stats import skew, kurtosis


MAX_HOlDING_DAYS = 5


# 당일 종가 대비 n일 후 종가 등락률(라벨로 사용)
label_columns = [
    'next1_close_rate',
    'next2_close_rate',
    'next3_close_rate',
    'next4_close_rate',
    'next5_close_rate',
]


columns = [
    'rank',
    'market_cap',
    'market_name',    
    'stock_code', 
    'stock_name', 
    'datetime',
    'date',
    'year',
    'month',
    'open',
    'close',

    'industry_id', # 산업표준코드
    # 'sector_rank', 
    # 'sector_close_rate',
    # 'sector_ratio',

    "is_kospi",  # 마켓 타입

    'close_rate',  # 전일 종가 대비 종가 등락률

    "vwma5_gap",  # 종가 VWMA 5일선 갭
    "vwma20_gap",  # 종가 VWMA 20일선 갭

    "vwma_bb_upper_ratio",  # 현재 종가보다 몇 % 위에 볼린저 밴드가 있는가 (VWMA 기반)
    "vwma_bb_lower_ratio",  # 현재 종가보다 몇 % 아래 볼린저 밴드가 있는가 (VWMA 기반)
    "vwma_bb_width",  # 볼린저 밴드의 너비를 종가 대비 백분율로 나타냄 (VWMA 기반)

    "kospi_vwma5_gap",  # 코스피 VWMA 5일선 갭
    "kospi_vwma20_gap",  # 코스피 VWMA 20일선 갭
    "kosdaq_vwma5_gap",  # 코스닥 VWMA 5일선 갭
    "kosdaqi_vwma20_gap",  # 코스닥 VWMA 20일선 갭

    'open_to_close',  # 시가 대비 종가 비율
    'high_to_low',  # 고가 대비 저가 비율
    'rsi', # 상대 강도 지수
    'macd_ratio', # MACD 값이 종가 대비 몇 % 수준인지 (모멘텀 등락률)
    'macd_signal_ratio', # MACD Signal 값이 종가 대비 몇 % 수준인지 (모멘텀 등락률)
    'macd_golden_cross', # 모멘텀 개선 + 추세 반전 가능성
    'macd_dead_cross', # 모멘텀 약화 + 하락 가능성 증가
    'atr_ratio', # 변동성 변화율

    'kospi_close_rate',  # 코스피 등락률
    'kosdaq_close_rate',  # 코스닥 등락률
    'sp500f_close_rate',  # S&P500 선물 등락률
    'sp500f_ma5_gap',  # S&P500 선물 5일선 갭
    'sp500f_ma20_gap',  # S&P500 선물 20일선 갭
    'nasdaq100f_close_rate',  # 나스닥100 선물 등락률
    'nasdaq100f_ma5_gap',  # 나스닥100 선물 5일선 갭
    'nasdaq100f_ma20_gap',  # 나스닥100 선물 20일선 갭
    'vix_close_rate',  # 코스피 vix 등락률
    'sp500v_close_rate',  # S&P500 vix 등락률

    'trading_volume_volatility_ratio', # 최근 10일간 거래량의 변동성(표준편차)을 평균 거래량으로 정규화한 값
    'trading_change',  # 거래량 변화율
    'trading_rolling_change',  # 거래량 최근 평균 거래량 대비 변화율
    'foreign_rate',  # 외국 순매수량 변화율
    'institution_rate',  # 기관 순매수량 변화율
    'individual_rate',  # 개인 순매수량 변화율
    'foreign_net_buy_days',  # 외국인 연속 순매수 일 수
    'institution_net_buy_days',  # 기관 연속 순매수 일 수

    'candle_upper_tail_ratio',  # 캔들 위 꼬리 비율
    'candle_lower_tail_ratio',  # 캔들 아래 꼬리 비율
    'candle_body_ratio',  # 캔들 바디 비율
    'candle_sign',  # 캔들 양봉 유무
] + label_columns


exclude_columns = [
    'rank',
    'market_cap',
    'market_name',    
    'stock_code', 
    'stock_name', 
    'datetime',
    'date',
    'year',
    'month',
    'open',
    'close',

    'industry_id',
    # 'sector_rank', 
    # 'sector_close_rate',
    # 'sector_ratio',
]


feature_columns = [x for x in columns if x not in exclude_columns + label_columns]

# 산업 분류 코드 목록
defined_industry_codes = get_industry_mid_codes()
industry_encoder = LabelEncoder()
industry_encoder.fit(defined_industry_codes)


# 매월 두 번째 목요일(선옵 만기일) 찾기
def get_second_thursday(year, month):
    first_day = datetime(year, month, 1)
    first_thursday = first_day + timedelta(days=(3 - first_day.weekday() + 7) % 7)  # 첫 번째 목요일
    second_thursday = first_thursday + timedelta(weeks=1)  # 두 번째 목요일
    return second_thursday


def apply_log_transform(series, use_signed=True, shift_eps=1e-6):
    if use_signed:
        return np.sign(series) * np.log1p(np.abs(series))
    else:
        shift = abs(series.min()) + shift_eps
        return np.log1p(series + shift)
            

def preprocessing(df):
    df = df.copy()
    
    # # 월 주차 원핫 인코딩
    # df['week_of_month'] = (df['datetime'].dt.day.sub(1) // 7 + 1)
    # df = pd.get_dummies(df, columns=['week_of_month'], prefix='week')
    
    # # 각 날짜에 해당하는 월의 선물·옵션 만기일 추가
    # df['options_expiry_date'] = df['datetime'].apply(lambda d: get_second_thursday(d.year, d.month))
    
    # # 옵션 만기일 여부
    # df['is_options_expiry'] = (df['datetime'] == df['options_expiry_date']).astype(int)
    
    # # 옵션 만기일까지 남은 일수
    # df['days_to_options_expiry'] = (df['options_expiry_date'] - df['datetime']).dt.days
    # df['days_to_expiry_sin'] = np.sin(2 * np.pi * df['days_to_options_expiry'] / df['days_to_options_expiry'].max())
    # df['days_to_expiry_cos'] = np.cos(2 * np.pi * df['days_to_options_expiry'] / df['days_to_options_expiry'].max())
    
    # # 쿼드러플 위칭 여부
    # df['is_quadruple_witching'] = ((df['is_options_expiry'] == 1) & (df['datetime'].dt.month.isin([3, 6, 9, 12]))).astype(int)

    # # 산업 별 수익률
    # df['industry_avg_return'] = df.groupby(['date', 'industry_code'])['close_rate'].transform('mean')
    # df = df.sort_values(by=['industry_code', 'date'])
    # df['industry_momentum'] = df.groupby('industry_code')['industry_avg_return'].transform(lambda x: x.rolling(5, min_periods=1).mean())
    # # 해당 산업 내 수익률 순위
    # df['industry_rank'] = df.groupby('date')['industry_avg_return'].rank(ascending=False, pct=True)
    # df['industry_momentum_rank'] = df.groupby('date')['industry_momentum'].rank(ascending=False, pct=True)
    # # 해당 산업 내 수익률 표준편차 (산업 내 변동성)
    # df['industry_volatility'] = df.groupby(['date', 'industry_code'])['close_rate'].transform('std')
    
    # 산업 코드 인코딩    
    df['industry_id'] = industry_encoder.transform(df['industry_code'])
    df['industry_id'] = df['industry_id'].astype('category')
    # TODO: 신규 산업 코드는 000000(해당사항없음)을 사용하자.

    boolean_features = [x for x, y in df.dtypes.items() if y == 'bool']
    df[boolean_features] = df[boolean_features].astype(float)

    for col in [
        'trading_value', 'trading_change', 
        'foreign_net_buy_days', 'institution_net_buy_days', 'individual_net_buy_days', 
        'foreign_rate', 'institution_rate', 'individual_rate', 
        'macd_ratio', 'macd_signal_ratio', 
        'vwma_bb_upper_ratio', 'vwma_bb_width',
    ]:
        df[col] = apply_log_transform(df[col])

    return df[columns]

df_train = preprocessing(df_topn)
print(f'df_train: {len(df_train):,}')

df_train: 907,585


In [6]:
used_industry_codes = df_topn['industry_code'].unique()
print(f'defined_industry_codes: {len(defined_industry_codes)}, used_industry_codes; {len(used_industry_codes)}')
for code in used_industry_codes:
    if code not in defined_industry_codes:
        print(f'[Warning] Undefined industry code found in data: {code}')

defined_industry_codes: 78, used_industry_codes; 54


In [7]:
# from utils.chart_utils import draw_sin_cos_encoding

# # Sin-Cos Encoding은 적은 차원으로 주기성(순환) 데이터를 표현할 수 있는 이점이 있다.
# # One-Hot Encoding은 순환성이 없는 명확한 범주형 데이터에 적합하다.

# # 1월~12월
# months = [f'{x}월' for x in range(1, 13)]
# draw_sin_cos_encoding(months, 'Month')

# # 월~금
# weekdays = ["월", "화", "수", "목", "금"]
# draw_sin_cos_encoding(weekdays, 'Weekday')

# 피쳐 데이터 분포

In [8]:
import importlib
from utils import chart_utils
importlib.reload(chart_utils)

from utils.chart_utils import show_histogram

for col in feature_columns:
    show_histogram(df_train[col], col)

In [9]:
df_train[feature_columns].quantile(np.linspace(0, 1, 11))

Unnamed: 0,is_kospi,close_rate,vwma5_gap,vwma20_gap,vwma_bb_upper_ratio,vwma_bb_lower_ratio,vwma_bb_width,kospi_vwma5_gap,kospi_vwma20_gap,kosdaq_vwma5_gap,kosdaqi_vwma20_gap,open_to_close,high_to_low,rsi,macd_ratio,macd_signal_ratio,macd_golden_cross,macd_dead_cross,atr_ratio,kospi_close_rate,kosdaq_close_rate,sp500f_close_rate,sp500f_ma5_gap,sp500f_ma20_gap,nasdaq100f_close_rate,nasdaq100f_ma5_gap,nasdaq100f_ma20_gap,vix_close_rate,sp500v_close_rate,trading_volume_volatility_ratio,trading_change,trading_rolling_change,foreign_rate,institution_rate,individual_rate,foreign_net_buy_days,institution_net_buy_days,candle_upper_tail_ratio,candle_lower_tail_ratio,candle_body_ratio,candle_sign
0.0,0.0,-30.0,-0.565,-0.693,0.555,-2.846,0.005,-0.11,-0.243,-0.125,-0.275,-0.323,0.0,0.721,-0.786,-0.777,0.0,0.0,0.003,0.912,0.883,0.896,-0.105,-0.198,0.892,-0.098,-0.181,0.7,0.718,0.049,0.008,0.012,0.0,0.0,0.0,0.0,0.0,-0.1,0.0,0.0,0.0
0.1,0.0,-3.107,-0.037,-0.09,0.702,0.814,0.069,-0.015,-0.031,-0.02,-0.047,-0.03,0.016,27.57,-0.041,-0.039,0.0,0.0,0.022,0.987,0.981,0.987,-0.014,-0.029,0.981,-0.02,-0.04,0.942,0.93,0.272,0.399,0.512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.091,0.0
0.2,0.0,-1.903,-0.023,-0.058,0.709,0.872,0.09,-0.009,-0.019,-0.012,-0.031,-0.019,0.021,34.467,-0.025,-0.024,0.0,0.0,0.027,0.992,0.99,0.993,-0.007,-0.014,0.99,-0.011,-0.018,0.961,0.955,0.325,0.485,0.629,0.0,0.0,0.0,0.0,0.0,0.059,0.056,0.184,0.0
0.3,0.0,-1.158,-0.014,-0.038,0.717,0.903,0.108,-0.005,-0.01,-0.006,-0.017,-0.012,0.025,39.483,-0.016,-0.015,0.0,0.0,0.031,0.995,0.994,0.996,-0.004,-0.002,0.995,-0.005,-0.006,0.976,0.973,0.372,0.55,0.72,0.0,0.0,0.0,0.0,0.0,0.124,0.111,0.273,0.0
0.4,0.0,-0.581,-0.008,-0.023,0.724,0.925,0.125,-0.001,-0.004,-0.002,-0.005,-0.007,0.029,43.92,-0.009,-0.009,0.0,0.0,0.034,0.998,0.998,0.999,-0.001,0.004,0.999,-0.001,0.006,0.986,0.985,0.42,0.61,0.804,0.0,0.0,0.0,0.0,0.0,0.182,0.167,0.364,0.0
0.5,1.0,-0.078,-0.003,-0.01,0.733,0.942,0.145,0.002,0.002,0.002,0.006,-0.002,0.033,48.263,-0.003,-0.003,0.0,0.0,0.038,1.001,1.001,1.001,0.002,0.01,1.001,0.004,0.012,0.996,0.997,0.475,0.671,0.889,0.0,0.0,0.099,0.0,0.0,0.25,0.222,0.448,0.0
0.6,1.0,0.336,0.003,0.003,0.744,0.956,0.168,0.004,0.009,0.005,0.015,0.003,0.039,52.652,0.003,0.003,0.0,0.0,0.043,1.003,1.004,1.003,0.004,0.014,1.004,0.007,0.018,1.004,1.01,0.542,0.738,0.985,0.338,0.17,0.436,0.693,0.693,0.316,0.286,0.538,1.0
0.7,1.0,0.901,0.009,0.017,0.757,0.969,0.196,0.007,0.016,0.009,0.022,0.008,0.046,57.408,0.01,0.01,0.0,0.0,0.048,1.005,1.007,1.005,0.007,0.019,1.008,0.011,0.026,1.018,1.027,0.633,0.818,1.104,0.713,0.571,0.744,0.693,0.693,0.396,0.357,0.625,1.0
0.8,1.0,1.726,0.017,0.034,0.776,0.981,0.236,0.01,0.023,0.013,0.031,0.016,0.056,62.856,0.02,0.019,0.0,0.0,0.056,1.009,1.011,1.008,0.009,0.024,1.012,0.014,0.033,1.033,1.049,0.777,0.929,1.271,1.099,1.015,1.092,1.099,1.099,0.5,0.448,0.72,1.0
0.9,1.0,3.261,0.029,0.063,0.809,0.995,0.307,0.015,0.033,0.019,0.043,0.03,0.075,70.577,0.035,0.033,0.0,0.0,0.068,1.013,1.017,1.013,0.014,0.032,1.02,0.02,0.043,1.066,1.087,1.058,1.13,1.589,1.633,1.621,1.584,1.386,1.609,0.613,0.573,0.833,1.0


# 피쳐간 상관관계

In [10]:
from utils.chart_utils import show_heatmap

show_heatmap(df_train, feature_columns + label_columns[-1:], 1800, 1800)

# StockPatchTST(Stock Patch Time Series Transformer) 모델

In [11]:
import os
import torch
from torch import nn
from torch import optim
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import random
from sklearn.preprocessing import StandardScaler


class RankingTimeSeriesDataset(Dataset):
    def __init__(self, grouped_data, window_size=30):
        # grouped_data: dict[date] -> list of (x, y, industry_code, meta_info)
        self.data_by_date = list(grouped_data.items())
        # self.window_size = window_size

    def __len__(self):
        return len(self.data_by_date)

    def __getitem__(self, idx):
        date, daily_data = self.data_by_date[idx]
        return daily_data  # 텐서 변환 X (DataLoader에서 변환)

    
def get_ranking_dataset(df, start_year, end_year, sliding_window_size, is_train):
    # 30일간 데이터를 확보하기 위해 시작 날짜 조정
    start_date = get_n_trading_days_before(start_year, 30)
    end_date = f'{end_year}1231'
    df = df[(df['date'] >= start_date) & (df['date'] <= end_date)]

    # (임시 저장소) 날짜별 데이터 모음
    temp_grouped = defaultdict(list)

    # 종목별 슬라이딩 윈도우 생성
    df = df.sort_values(['stock_code', 'date'])
    for _, df_stock in df.groupby("stock_code"):
        features = df_stock[feature_columns].copy()
        for i in range(len(features) - sliding_window_size + 1):
            meta_info = df_stock.iloc[i + sliding_window_size - 1]
            if meta_info['year'] < start_year or meta_info['year'] > end_year:
                continue

            # 시총, 거래 대금, 상한가
            if meta_info['rank'] > 200 or meta_info['close_rate'] >= 29:
                continue

            x = features.iloc[i:i + sliding_window_size]  # (30, F)
            
            y = meta_info[label_columns[-1]] / 100
            industry_id = meta_info['industry_id']  # 업종
            date = meta_info['date']

            temp_grouped[date].append((x.values, y, industry_id, meta_info))

    # 분위수 라벨 처리
    grouped_by_date = defaultdict(list)
    for date, group in temp_grouped.items():
        # 해당 날짜의 수익률 리스트
        returns = [sample[1] for sample in group]
        quantiles = pd.qcut(returns, q=5, labels=[0.0, 1.0, 2.0, 3.0, 4.0])

        for i, (x, _, industry_id, meta_info) in enumerate(group):
            # label = int(quantiles[i])  # 분위수 기반 라벨
            grouped_by_date[date].append((x, quantiles[i], industry_id, meta_info))

    return RankingTimeSeriesDataset(grouped_by_date)

In [12]:
train_dataset = get_ranking_dataset(df_train, 2020, 2023, 30, True)
val_dataset = get_ranking_dataset(df_train, 2024, 2024, 30, False)
test_dataset = get_ranking_dataset(df_train, 2025, 2025, 30, False)

In [13]:
print(
    f'train_dataset: {len(train_dataset):,}, daily_stocks: {np.percentile([len(x) for x in train_dataset], [i * 10 for i in range(11)])}\n'
    f'val_dataset: {len(val_dataset):,}, daily_stocks: {np.percentile([len(x) for x in val_dataset], [i * 10 for i in range(11)])}\n'
    f'test_dataset: {len(test_dataset):,}, daily_stocks: {np.percentile([len(x) for x in test_dataset], [i * 10 for i in range(11)])}'
)

train_dataset: 987, daily_stocks: [174. 182. 184. 185. 186. 187. 188. 189. 190. 192. 195.]
val_dataset: 244, daily_stocks: [186.  189.  190.  191.  191.  192.  193.  193.  194.  195.7 198. ]
test_dataset: 52, daily_stocks: [191.  192.1 193.  193.  194.  194.  194.  194.  194.  195.  196. ]


In [14]:
show_histogram([y for daily_data in train_dataset.data_by_date for _, y, _, _ in daily_data[1]], 'Training labels')
show_histogram([y for daily_data in val_dataset.data_by_date for _, y, _, _ in daily_data[1]], 'Validation labels')
show_histogram([y for daily_data in test_dataset.data_by_date for _, y, _, _ in daily_data[1]], 'Test labels')

In [15]:
show_histogram([meta_info['next5_close_rate'] for daily_data in train_dataset.data_by_date for _, _, _, meta_info in daily_data[1]], 'Training labels')
show_histogram([meta_info['next5_close_rate'] for daily_data in val_dataset.data_by_date for _, _, _, meta_info in daily_data[1]], 'Validation labels')
show_histogram([meta_info['next5_close_rate'] for daily_data in test_dataset.data_by_date for _, _, _, meta_info in daily_data[1]], 'Test labels')

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F

        
class StockPatchTST(nn.Module):
    def __init__(self, input_dim, patch_len, stride, model_dim, num_heads, num_layers, output_dim, industry_vocab_size, industry_embed_dim, dropout=0.0):
        super(StockPatchTST, self).__init__()

        # num_patches = ((seq_len - patch_len) / stride) + 1
        self.patch_len = patch_len  # 시계열을 의미 있는 단위로 분할(예: 주간 패턴)
        self.stride = stride  # 적절한 중복으로 패치 수 증가 (예: 30일 사이즈 윈도우를 6일 사이즈 패치로 분할하는데 stride가 3이면 총 9개의 패치가 만들어진다.)

        # 산업군 임베딩
        self.industry_embedding = nn.Embedding(industry_vocab_size, industry_embed_dim)

        self.total_input_dim = industry_embed_dim + input_dim  # concat 이후 총 feature 차원

        # Patch Embedding (Conv1D)
        self.patch_embedding = nn.Conv1d(
            in_channels=self.total_input_dim,
            out_channels=model_dim,
            kernel_size=patch_len,
            stride=stride
        )

        # Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=model_dim,
            nhead=num_heads,
            dim_feedforward=2048,
            dropout=dropout,
            activation='relu',
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Fully Connected Output
        self.fc_out = nn.Linear(model_dim, output_dim)

    def forward(self, x, industry_ids):  # x: (batch_size, seq_len, input_dim)
        # 산업군 임베딩 추가
        industry_embed = self.industry_embedding(industry_ids)  # (batch_size, embed_dim)
        industry_embed = industry_embed.unsqueeze(1).expand(-1, x.size(1), -1)
        x = torch.cat([industry_embed, x], dim=-1)   # (batch, seq_len, total_input_dim)

        # Patch Embedding (Conv1D)
        x = x.permute(0, 2, 1)  # → (batch, total_input_dim, seq_len)
        x = self.patch_embedding(x)  # -> (batch, model_dim, num_patches)
        x = x.permute(0, 2, 1)  # → (batch, num_patches, model_dim)

        # Transformer Encoding
        x = self.transformer_encoder(x)  # (batch, num_patches, model_dim)

        # 평균 Pooling
        x = x.mean(dim=1)
        # 마지막 패치 (예측하려는 시점이 가장 최근 패치 직후에 있을 때 사용)
        # x = x[:, -1, :]  # shape: (batch_size, model_dim)

        # Fully Connected Output
        out = self.fc_out(x)  # shape: (batch_size, output_dim)
        return out

In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import scipy.stats


class LambdaRankLoss(nn.Module):
    def __init__(self, eps=1e-10, min_dcg=1e-6):
        super().__init__()
        self.eps = eps
        self.min_dcg = min_dcg

    def forward(self, preds, targets):
        """
        preds: (N,) 예측 점수
        targets: (N,) 실제 relevance
        """
        N = preds.size(0)
        device = preds.device

        # 1. relevance를 양수로 제한 (NDCG 정의 상)
        safe_targets = torch.clamp_min(targets, 0.0)

        # 2. ideal DCG 계산 (정답 순서대로 정렬)
        sorted_targets, _ = torch.sort(safe_targets, descending=True)
        ideal_dcg = self.dcg(sorted_targets)
        ideal_dcg = torch.clamp(ideal_dcg, min=self.min_dcg)  # 너무 작은 DCG 방지

        # 3. pairwise 차이 계산
        pred_diffs = preds.unsqueeze(1) - preds.unsqueeze(0)       # (N, N)
        target_diffs = targets.unsqueeze(1) - targets.unsqueeze(0) # (N, N)

        S_ij = torch.sign(target_diffs)  # 순서 비교 (+1, -1, 0)

        # 4. 예측값 순위 → 정확한 랭킹 계산
        rank_positions = self.compute_ranks(preds) - 1  # 0-based
        pos_i = rank_positions.unsqueeze(1)
        pos_j = rank_positions.unsqueeze(0)

        log_i = torch.log2(pos_i + 2.0)
        log_j = torch.log2(pos_j + 2.0)

        # 5. delta NDCG 계산 (쌍별 DCG 변화량 정규화)
        delta_dcg = torch.abs(1.0 / log_i - 1.0 / log_j) * torch.abs(target_diffs)
        delta_ndcg = delta_dcg / ideal_dcg
        delta_ndcg = torch.clamp(delta_ndcg, min=1e-4)  # 너무 작은 weight 방지

        # 6. loss 계산 (logsigmoid * delta_ndcg)
        log_loss = F.logsigmoid(S_ij * pred_diffs)

        # 유효한 쌍만 계산 (같은 타겟일 경우 제외)
        mask = (S_ij != 0).float()
        loss_matrix = -log_loss * delta_ndcg * mask

        loss_sum = loss_matrix.sum()
        pair_count = mask.sum()

        return loss_sum / pair_count if pair_count > 0 else torch.tensor(0.0, requires_grad=True, device=device)

    def dcg(self, relevance):
        device = relevance.device
        denom = torch.log2(torch.arange(relevance.size(0), device=device).float() + 2.0)
        return (relevance / denom).sum()

    def compute_ranks(self, scores):
        """
        정확한 rank 계산: 높은 점수에 낮은 순위 (1등 = 1.0)
        """
        scores_np = scores.detach().cpu().numpy()
        ranks = scipy.stats.rankdata(-scores_np, method="ordinal")  # 높은 점수에 낮은 순위
        return torch.tensor(ranks, dtype=torch.float32, device=scores.device)

    def compute_ranks_torch(self, scores):
        return (scores.unsqueeze(0) < scores.unsqueeze(1)).sum(dim=1).float() + 1.0

In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F


def ranking_collate_fn(batch):
    daily_data = batch[0]  # batch_size=1이므로

    # numpy array 변환 후 tensor
    x_batch = torch.tensor(np.array([item[0] for item in daily_data]), dtype=torch.float32)
    y_batch = torch.tensor(np.array([item[1] for item in daily_data]), dtype=torch.float32)

    # industry_id 추출하여 텐서로 변환
    industry_id_batch = torch.tensor([item[2] for item in daily_data], dtype=torch.long)

    # meta_info는 변형 없이 리스트 유지
    meta_batch = [item[3] for item in daily_data]

    return x_batch, y_batch, industry_id_batch, meta_batch

    
# 랜덤 시드 설정
def set_seed(seed):    
    # Python 및 NumPy 랜덤 시드 고정
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch 랜덤 시드 고정
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # 멀티 GPU 사용 시 모든 GPU에 적용

    # PyTorch 연산의 결정론적 실행 보장
    torch.backends.cudnn.deterministic = True # 결정론적 연산 유지
    torch.backends.cudnn.benchmark = False  # 일관된 연산을 보장 (성능보다 재현성을 우선)


def seed_worker(worker_id):
    # 각 worker의 시드 고정
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)


def train_stock_patch_tst(df, params, device):    
    # 롤링 윈도우 적용: N년 학습 + 1년 검증 + 이전 모델 파인튜닝
    start_year = 2020
    end_year = 2024
    rolling_window_size = 4  # 학습 기간
    validation_size = 1  # 검증 기간
    prev_model_path = None  # 이전 학습된 모델 경로

    for train_start in range(start_year, end_year - rolling_window_size + 1):
        train_end = train_start + (rolling_window_size - 1)
        val_start = train_end + 1
        val_end = val_start

        # 데이터셋 생성
        # train_dataset = get_dataset(df, train_start, train_end, params['sliding_window_size'], True)
        # val_dataset = get_dataset(df, val_start, val_end, params['sliding_window_size'], False)

        # 데이터 로더 생성
        # ✅ 학습은 epoch 마다 다른 샘플을 사용하고 검증은 epoch 마다 같은 샘플을 사용한다.
        train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=ranking_collate_fn, worker_init_fn=seed_worker, generator=g)
        val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, collate_fn=ranking_collate_fn, worker_init_fn=seed_worker, generator=g)
        
        print(
            f'==========================================================================================\n'
            f'✅ Fine-Tuning from {train_start} to {train_end}, Validation on {val_start}\n'
            f'==========================================================================================\n'
            f"Train size: {len(train_dataset):,}, Validation size: {len(val_dataset):,}\n"
        )

        # 모델 초기화 (이전 모델이 있으면 불러와서 파인튜닝)
        model = StockPatchTST(
            input_dim=params['input_dim'],
            patch_len=params['patch_len'],
            stride=params['stride'],
            model_dim=params['model_dim'],
            num_heads=params['num_heads'],
            num_layers=params['num_layers'],
            output_dim=params['output_dim'],
            industry_vocab_size=len(industry_encoder.classes_),
            industry_embed_dim=params['industry_embed_dim'],
            dropout=params['dropout']
        ).to(device)
        if prev_model_path and os.path.exists(prev_model_path):
            print(f"Loading previous model: {prev_model_path}")
            model.load_state_dict(torch.load(prev_model_path))
        
        # 손실 함수
        criterion = LambdaRankLoss()
        
        # 옵티마이저
        # weight_decay: L2 정규화와 동일한 역할을 수행.
        # L2 정규화는 손실 함수에 가중치의 제곱을 추가하는 방식으로 모델의 복잡도를 제한
        # 추천 값: 1e-5 ~ 1e-4
        # 과적합이 심한 경우: 1e-3 ~ 1e-2 (강한 정규화)
        # 과소적합(underfitting)이 발생하는 경우: 1e-6 이하로 조정
        optimizer = optim.Adam(model.parameters(), lr=params['learning_rate'], weight_decay=params['weight_decay'])
        
        # Early Stopping 설정
        min_loss = float('inf')  # 현재 최소 loss 초기화
        counter = 0  # 개선되지 않은 epoch 카운트
        best_model = None
        
        # 학습 루프
        num_epochs = params['num_epochs']
        for epoch in range(num_epochs):            
            # 학습 모드
            model.train()
            total_train_loss = 0
            
            for x, y, industry_ids, _ in train_loader:
                x = x.to(device)  # GPU 이동
                y = y.unsqueeze(1).float().to(device)  # -> shape: (B, 1)
                industry_ids = industry_ids.to(device)
                optimizer.zero_grad() # 이전 배치의 기울기 초기화
                outputs = model(x, industry_ids) # 순전파 -> shape: (B, 1)
                loss = criterion(outputs, y) # 손실 계산
                loss.backward() # 역전파: 손실 값을 기반으로 모델의 모든 가중치에 대한 기울기를 계산
                optimizer.step() # 모델의 가중치 업데이트
                total_train_loss += loss.item()
        
            avg_train_loss = total_train_loss / len(train_loader) # 평균 Loss 계산
        
            if params['skip_validation']:
                best_model = model.state_dict()
                print(f"Epoch [{epoch+1}/{num_epochs}] - Train Loss: {avg_train_loss:.8f}")
            else:
                # 검증 모드
                model.eval()
                total_val_loss = 0
                with torch.no_grad():
                    for x, y, industry_ids, _ in val_loader:
                        x = x.to(device)
                        y = y.unsqueeze(1).float().to(device)  # -> shape: (B, 1)
                        y = y.float().to(device)
                        industry_ids = industry_ids.to(device)
                        outputs = model(x, industry_ids) # -> shape: (B, 1)
                        loss = criterion(outputs, y)
                        total_val_loss += loss.item()
            
                avg_val_loss = total_val_loss / len(val_loader)  # 평균 Loss 계산
                print(f"Epoch [{epoch+1}/{num_epochs}] - Train Loss: {avg_train_loss:.8f}, Val Loss: {avg_val_loss:.8f}")
            
                # Early Stopping Check
                if avg_val_loss < min_loss:
                    min_loss = avg_val_loss  # 최소 loss 갱신
                    counter = 0  # 개선되었으므로 카운터 초기화
                    best_model = model.state_dict()
                    best_epoch = epoch+1
                else:
                    counter += 1  # 개선되지 않은 epoch 증가
            
                if counter >= params['epoch_patience']:  # 일정 횟수 동안 개선이 없으면 학습 종료
                    print(f"Early stopping at epoch {epoch+1} - Best Epoch: {best_epoch}, Best Val loss: {min_loss:.8f}")
                    break
        
        # 모델 저장 (Fine-Tuned Model)
        model_filename = f"stock_patch_tst_{train_start}_{val_start}.pth"
        torch.save(best_model, model_filename)
        print(f"Saved fine-tuned model: {model_filename}\n")
    
        # 다음 롤링 윈도우에서 이 모델을 파인튜닝
        prev_model_path = model_filename

In [19]:
# 시드 설정
random_seed=42
set_seed(random_seed)
g = torch.Generator()
g.manual_seed(random_seed)

# 하이퍼파라미터
params = {
    'input_dim': len(feature_columns),
    'model_dim': 64,
    'patch_len': 6,
    'stride': 3,    
    'num_heads': 4,
    'num_layers': 2,
    'output_dim': 1,
    'industry_embed_dim': 4,
    'dropout': 0.2,
    'learning_rate': 0.0005,  # 5e-4
    "weight_decay": 5e-5,  # 0.00005
    'num_epochs': 100,
    'sliding_window_size': 30,
    'epoch_patience': 10,
    'skip_validation': False
}

# GPU 사용: nvidia-smi (GPU-Util)
device_cuda = torch.device("cuda")

train_stock_patch_tst(df_train, params, device_cuda)

✅ Fine-Tuning from 2020 to 2023, Validation on 2024
Train size: 987, Validation size: 244

Epoch [1/100] - Train Loss: 0.01300784, Val Loss: 0.01335481
Epoch [2/100] - Train Loss: 0.01299615, Val Loss: 0.01335268
Epoch [3/100] - Train Loss: 0.01299996, Val Loss: 0.01335299
Epoch [4/100] - Train Loss: 0.01299180, Val Loss: 0.01336729
Epoch [5/100] - Train Loss: 0.01300844, Val Loss: 0.01334579
Epoch [6/100] - Train Loss: 0.01299963, Val Loss: 0.01334017
Epoch [7/100] - Train Loss: 0.01298800, Val Loss: 0.01331684
Epoch [8/100] - Train Loss: 0.01298803, Val Loss: 0.01332084
Epoch [9/100] - Train Loss: 0.01298447, Val Loss: 0.01333782
Epoch [10/100] - Train Loss: 0.01298076, Val Loss: 0.01333132
Epoch [11/100] - Train Loss: 0.01297853, Val Loss: 0.01333584
Epoch [12/100] - Train Loss: 0.01298157, Val Loss: 0.01332697
Epoch [13/100] - Train Loss: 0.01297816, Val Loss: 0.01331206
Epoch [14/100] - Train Loss: 0.01298302, Val Loss: 0.01332470
Epoch [15/100] - Train Loss: 0.01298032, Val Loss:

In [None]:
device_cpu = torch.device("cpu")

# 빈 모델 생성
best_model = StockPatchTST(
    input_dim=params['input_dim'],
    patch_len=params['patch_len'],
    stride=params['stride'],
    model_dim=params['model_dim'],
    num_heads=params['num_heads'],
    num_layers=params['num_layers'],
    output_dim=params['output_dim'],
    industry_vocab_size=len(industry_encoder.classes_),
    industry_embed_dim=params['industry_embed_dim'],
    dropout=params['dropout']
).to(device_cpu)

# 모델 가중치 로드
best_model.load_state_dict(torch.load('stock_patch_tst_2020_2024.pth', map_location=device_cpu))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pandas as pd
import numpy as np
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.express as px
import shap
import matplotlib.pyplot as plt
from sklearn.metrics import ndcg_score
from collections import Counter


def normalize_label_ranks(y_true):
    if len(y_true) == 0:
        return np.array([0.0])
    if np.all(y_true == y_true[0]):
        return np.zeros_like(y_true)  # 동일한 값일 때는 모두 0
    ranks = np.argsort(np.argsort(-y_true))  # 내림차순 순위
    return ranks / ranks.max()


def predict_stock_patch_tst(model, loader, device, topn):
    model.eval()
    predictions = []
    with torch.no_grad():
        for x, y, industry_codes, meta_infos in loader:
            x, y = x.to(device), y.to(device)
            industry_codes = industry_codes.to(device)
            outputs = model(x, industry_codes)
            preds = outputs.squeeze(1).cpu().numpy()

            for i in range(len(meta_infos)):
                row = meta_infos[i].copy()
                row['pred'] = preds[i].item()
                predictions.append(row)

    df = pd.DataFrame(predictions)

    # Precision, 평균 수익률, NDCG 계산
    total_buy_count = 0
    daily_stats = []
    for date, group in df.groupby("date"):            
        group = group.sort_values("pred", ascending=False)
        group = group.head(topn)
        actual_n = len(group)
        if actual_n < 1:
            continue

        total_buy_count += actual_n

        precision_at_k = (group[label_columns[-1]] > 0).mean()
        mean_return_at_k = group[label_columns[-1]].mean()

        y_true_rank = normalize_label_ranks(group[label_columns[-1]].values).reshape(1, -1)
        y_score_rank = group["pred"].values.reshape(1, -1)
        if actual_n > 1:
            ndcg = ndcg_score(y_true_rank, y_score_rank, k=actual_n)
        else:
            ndcg = 0.0

        daily_stats.append({
            "date": date,
            "buy_count": actual_n,
            "precision": precision_at_k,
            "mean_return": mean_return_at_k,
            "ndcg": ndcg
        })

    eval_df = pd.DataFrame(daily_stats)
    print(
        f'==========================================================================================\n'
        f'TOP{topn}\n'
        f'==========================================================================================\n'
        f"✅ 랭킹 모델 매수 카운트: {total_buy_count:,}\n"
        f"✅ 랭킹 모델 평균 Precision: {eval_df['precision'].mean():.4f}\n"
        f"✅ 랭킹 모델 평균 수익률: {eval_df['mean_return'].mean():.4f}\n"
        f"✅ 랭킹 모델 평균 NDCG: {eval_df['ndcg'].mean():.4f}\n"
    )

    return df

In [26]:
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, collate_fn=ranking_collate_fn, worker_init_fn=seed_worker, generator=g)
predictions_val = predict_stock_patch_tst(best_model, val_loader, device_cpu, topn=3)

TOP3
✅ 랭킹 모델 매수 카운트: 732
✅ 랭킹 모델 평균 Precision: 0.5260
✅ 랭킹 모델 평균 수익률: 0.8664
✅ 랭킹 모델 평균 NDCG: 0.7998



In [27]:
val_top3 = predictions_val.groupby('date').apply(lambda g: g.nlargest(3, 'pred'), include_groups=False)
show_histogram(val_top3['pred'], 'val_top3 pred')

In [28]:
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=ranking_collate_fn, worker_init_fn=seed_worker, generator=g)
predictions_test = predict_stock_patch_tst(best_model, test_loader, device_cpu, topn=3)

TOP3
✅ 랭킹 모델 매수 카운트: 156
✅ 랭킹 모델 평균 Precision: 0.6859
✅ 랭킹 모델 평균 수익률: 2.4409
✅ 랭킹 모델 평균 NDCG: 0.7988



In [29]:
test_top3 = predictions_test.groupby('date').apply(lambda g: g.nlargest(3, 'pred'), include_groups=False)
show_histogram(test_top3['pred'], 'test_top3 pred')

# 수익률 평가

In [30]:
def get_roi(buy_price, sell_price, investment):
    # 세전 수익률
    pre_tax_ev = (sell_price - buy_price) / buy_price

    # 매수/매도 수수료 및 세금 (매수/매도 대금이 100만원으로 동일할 경우 수수료&세금이 0.834% 정도 발생한다.)
    buying_fee = get_fee_tax(investment, False)
    buying_fee = buying_fee / investment
    total_balance = investment + (investment * pre_tax_ev)
    selling_fee_tax = get_fee_tax(total_balance, True)
    selling_fee_tax = selling_fee_tax / total_balance

    # 세후 수익률
    post_tax_ev = pre_tax_ev - (buying_fee + selling_fee_tax)
    return post_tax_ev

def detect_sell_signal(sell_row, buy_row, highest_price, holding_days, max_holding_days):
    return (    
        # 보유일이 n일 되는 시점
        holding_days >= max_holding_days
    )

def detect_stop_loss(sell_row, buy_row, highest_price, ndays, max_holding_days):
    return None

def set_signals(df, investment, topn):
    df_list = []
    df = df.sort_values(["stock_code", "date"]).reset_index(drop=True)
    for stock_code, stock_df in df.groupby('stock_code'):
        stock_df["sell_signal"] = False
        stock_df["roi"] = None
        stock_df["holding_trading_days"] = 0
        stock_df["holding_calendar_days"] = 0
        stock_df["stop_loss"] = False
        max_holding_days = None
        buy_trigger_index = None  # 최초 매수 발생 인덱스 저장
        highest_price = None

        stock_df = stock_df.reset_index(drop=True)
        for i in range(len(stock_df)):        
            # 매도 발생 시 최초 매도 신호 기록 (한 번만)
            if buy_trigger_index is not None:
                buy_price = stock_df.loc[buy_trigger_index, "close"]
                sell_price = stock_df.loc[i, "close"]
                sell_trigger = False

                # 보유 기간
                holding_trading_days = i - buy_trigger_index
                buy_date = stock_df.loc[buy_trigger_index, "datetime"]
                sell_date = stock_df.loc[i, "datetime"]
                holding_calendar_days = (sell_date - buy_date).days

                if highest_price < sell_price:
                    highest_price = sell_price

                # 매도 시그널 감지
                if detect_sell_signal(stock_df.loc[i], stock_df.loc[buy_trigger_index], highest_price, holding_trading_days, max_holding_days):
                    sell_trigger = True

                # 스탑로스 감지
                stop_loss = detect_stop_loss(stock_df.loc[i], stock_df.loc[buy_trigger_index], highest_price, holding_trading_days, max_holding_days)
                if stop_loss:
                    sell_price = stop_loss
                    stock_df.loc[i, "stop_loss"] = True
                    sell_trigger = True

                if sell_trigger:
                    stock_df.loc[i, "sell_signal"] = True
                    stock_df.loc[i, "holding_trading_days"] = holding_trading_days
                    stock_df.loc[buy_trigger_index, "holding_trading_days"] = holding_trading_days
                    stock_df.loc[i, "holding_calendar_days"] = holding_calendar_days
                    stock_df.loc[buy_trigger_index, "holding_calendar_days"] = holding_calendar_days
                    # 수익률 기록
                    roi = get_roi(buy_price, sell_price, investment)
                    stock_df.loc[i, "sell_roi"] = roi
                    stock_df.loc[buy_trigger_index, "buy_roi"] = roi
                    # 매수 기록 초기화 (새로운 매수 가능)
                    buy_trigger_index = None
                    highest_price = 0

            # 매수 신호 발생 시 기록
            if stock_df.loc[i, "buy_signal"]:
                if (i + MAX_HOlDING_DAYS) < len(stock_df):
                    if buy_trigger_index is None:
                        buy_trigger_index = i  # 매수 발생 행 기록
                        highest_price = stock_df.loc[i, "close"]
                        max_holding_days = MAX_HOlDING_DAYS
                    else:
                        # 주식 보유 중 매수 시그널이 발생하면 보유 기간을 연장한다. (Trailing Entry 전략)
                        max_holding_days = (i - buy_trigger_index) + MAX_HOlDING_DAYS
                else:
                    # 거래일이 부족하면 매수 시그널 제거
                    stock_df.loc[i, "buy_signal"] = False
        
            # 매수 이후 추가 매수 시그널 제거
            if i != buy_trigger_index:
                stock_df.loc[i, "buy_signal"] = False

        df_list.append(stock_df)

    return pd.concat(df_list).reset_index(drop=True)


def evaluate_roi(investment, predictions, topn):
    df = predictions.copy()
    year = df.head(1)['year'].item()

    # 매수 시그널
    df = df.reset_index(drop=True)
    df["pred_rank"] = df.groupby("date")["pred"].rank(method="first", ascending=False)
    df["buy_signal"] = (df["pred_rank"] <= topn) & (df["atr_ratio"] > 0.03) & (df["close_rate"] > -9)
    df = df[['stock_code', 'date', 'pred', 'buy_signal']]
    df = df_topn[df_topn['year'] == year].merge(df, on=['stock_code', 'date'], how='left')
    df["buy_signal"] = df["buy_signal"].fillna(False)

    # 전체
    total_count = len(df)

    # 매수/매도 시그널 설정
    df = set_signals(df, investment, topn)

    # 매수 횟수
    buy_count = int(df['buy_signal'].sum())

    # 매도 횟수
    sell_count = int(df['sell_signal'].sum())

    # 성공 횟수    
    df['success'] = (df['buy_signal'] == True) & (df['buy_roi'] > 0)
    success_count = int((df['buy_signal'] & df['success']).sum())
    success_rate = success_count * 100 / sell_count

    # 실패 횟수
    fail_count = sell_count - success_count
    fail_rate = fail_count * 100 / sell_count

    # 성공 시 수익률
    profit_rate = df.loc[df['buy_signal'] & (df["buy_roi"] > 0), "buy_roi"].mean() * 100
    profit_rate = 0 if np.isnan(profit_rate) else profit_rate

    # 성공 시 보유기간
    profit_holding_calendar_days = df.loc[df["buy_signal"] & (df["buy_roi"] > 0), "holding_calendar_days"].mean()
    profit_holding_trading_days = df.loc[df["buy_signal"] & (df["buy_roi"] > 0), "holding_trading_days"].mean()

    # 실패 시 손실률
    loss_rate = df.loc[df['buy_signal'] & (df["buy_roi"] <= 0), "buy_roi"].mean() * 100
    loss_rate = 0 if np.isnan(loss_rate) else loss_rate

    # 실패 시 보유기간
    loss_holding_calendar_days = df.loc[df["buy_signal"] & (df["buy_roi"] <= 0), "holding_calendar_days"].mean()
    loss_holding_trading_days = df.loc[df["buy_signal"] & (df["buy_roi"] <= 0), "holding_trading_days"].mean()

    # 수익률 10분위수
    quantile_roi = df.loc[df['buy_signal'], "buy_roi"].quantile([x/10 for x in range(11)]).to_list()
    quantile_roi = [x * 100 for x in quantile_roi]
    quantile_roi = ', '.join([f'{x:.1f}' for x in quantile_roi])

    # 기대 수익률
    roi = df.loc[df["buy_signal"], "buy_roi"].mean()
    
    # 누적 수익 금액
    cumulative_profit = sell_count * investment * roi

    print(
        f'==========================================================================================\n'
        f'✅ {year} TOP{topn}\n'
        f'==========================================================================================\n'
        f"샘플: {total_count:,}, 매수: {buy_count:,}, 매도: {sell_count:,}\n"
        f"승률: {success_count:04d}({success_rate:05.2f}%), 평균 등락률: {profit_rate:.2f}%, 평균 보유 기간: {profit_holding_calendar_days:.1f}({profit_holding_trading_days:.1f})\n"
        f"패율: {fail_count:04d}({fail_rate:05.2f}%), 평균 등락률: {loss_rate:.2f}%, 평균 보유 기간: {loss_holding_calendar_days:.1f}({loss_holding_trading_days:.1f})\n"
        f"수익률 10분위수: [{quantile_roi}]\n"
        f"거래 대금: {investment:,}\n"
        f"기대 수익률(세후): {roi * 100:.3f}%\n"
        f"누적 수익금(세후): {int(cumulative_profit):,}\n"
    )

    return df.reset_index(drop=True)

In [31]:
investment = 10_000_000
df_val_roi = evaluate_roi(investment, predictions_val, 3)
df_test_roi = evaluate_roi(investment, predictions_test, 3)

✅ 2024 TOP3
샘플: 191,718, 매수: 201, 매도: 201
승률: 0105(52.24%), 평균 등락률: 7.07%, 평균 보유 기간: 9.9(6.6)
패율: 0096(47.76%), 평균 등락률: -4.50%, 평균 보유 기간: 9.9(6.7)
수익률 10분위수: [-36.0, -7.1, -3.5, -1.7, -0.7, 0.5, 2.3, 4.1, 5.9, 9.2, 36.3]
거래 대금: 10,000,000
기대 수익률(세후): 1.544%
누적 수익금(세후): 31,043,339

✅ 2025 TOP3
샘플: 42,082, 매수: 38, 매도: 38
승률: 0027(71.05%), 평균 등락률: 7.99%, 평균 보유 기간: 9.7(6.3)
패율: 0011(28.95%), 평균 등락률: -3.56%, 평균 보유 기간: 10.2(6.2)
수익률 10분위수: [-8.2, -4.2, -1.7, 0.1, 0.9, 2.2, 4.0, 7.3, 8.7, 19.8, 30.7]
거래 대금: 10,000,000
기대 수익률(세후): 4.646%
누적 수익금(세후): 17,653,692



In [32]:
# columns = [x for x, y in df_val_roi.dtypes.items() if y != 'bool' and x in feature_columns]
# display(df_val_roi[df_val_roi['buy_signal'] & (df_val_roi['roi'] > 0)][columns].quantile(np.linspace(0, 1, 11)))
# display(df_val_roi[df_val_roi['buy_signal'] & (df_val_roi['roi'] <= 0)][columns].quantile(np.linspace(0, 1, 11)))

In [33]:
# columns = [x for x, y in df_test_roi.dtypes.items() if y != 'bool' and x in feature_columns]
# display(df_test_roi[df_test_roi['buy_signal'] & (df_test_roi['roi'] > 0)][columns].quantile(np.linspace(0, 1, 11)))
# display(df_test_roi[df_test_roi['buy_signal'] & (df_test_roi['roi'] <= 0)][columns].quantile(np.linspace(0, 1, 11)))

# 결과 분석

In [34]:
from utils.chart_utils import draw_month_roi

draw_month_roi(df_val_roi)

In [35]:
from utils.chart_utils import draw_daily_roi

draw_daily_roi(df_val_roi, 1)

In [36]:
from utils.chart_utils import draw_month_roi

draw_month_roi(df_test_roi)

In [37]:
from utils.chart_utils import draw_daily_roi

draw_daily_roi(df_test_roi, 1)

In [38]:
from utils.chart_utils import draw_stock_roi

draw_stock_roi(df_val_roi, 30)

In [39]:
from utils.chart_utils import show_binary_continuous_correlation

show_binary_continuous_correlation(df_val_roi, 'buy_signal', '매수', label_columns[-1:], 'n일 후 수익률')

✅ 2024


# 개별 종목 분석

In [40]:
# 등락률 기준 내림차순
display(df_val_roi[(df_val_roi['buy_signal'] == True) & (df_val_roi['success'] == True)].sort_values(by=label_columns[-1], ascending=False).head())

# 등락률 기준 오름차순
display(df_val_roi[(df_val_roi['buy_signal'] == True) & (df_val_roi['success'] == False)].sort_values(by=label_columns[-1], ascending=True).head())

Unnamed: 0,stock_code,stock_name,date,market_cap,market_name,trading_value,open,high,low,close,trading_volume,individual,foreign,institution,vix_close,vix_close_rate,kospi_close,kospi_trading_volume,kospi_trading_value,kospi_close_rate,kospi_ma5_gap,kospi_ma20_gap,kospi_vwma5_gap,kospi_vwma20_gap,kospi_trading_rate,kospi_trading_dev,kospi200f_close,kospi200f_close_rate,kosdaq_close,kosdaq_trading_volume,kosdaq_trading_value,kosdaq_close_rate,kosdaq_ma5_gap,kosdaq_ma20_gap,kosdaq_vwma5_gap,kosdaqi_vwma20_gap,kosdaq_trading_rate,kosdaq_trading_dev,kospi_kosdaq_ratio,nasdaq100f_close,nasdaq100f_close_rate,nasdaq100f_ma5_gap,nasdaq100f_ma20_gap,sp500f_close,sp500f_close_rate,sp500f_ma5_gap,sp500f_ma20_gap,sp500v_close,sp500v_close_rate,sp500vf_close,sp500vf_close_rate,is_kospi,candle_upper_tail_ratio,candle_lower_tail_ratio,candle_body_ratio,candle_sign,open_to_close,high_to_low,datetime,year,month,day_of_week,month_sin,month_cos,dow_sin,dow_cos,industry_code,listing_date,rsi,atr_ratio,macd_ratio,macd_signal_ratio,macd_golden_cross,macd_dead_cross,ma5,ma20,ma5_gap,ma20_gap,vwma5,vwma20,vwma12,vwma26,vwma5_gap,vwma20_gap,vwma5_to_20_ratio,close_to_vwma5_ratio,vwma5_slope_ratio,vwma5_sma5_diff,vwma20_sma20_diff,vwma_macd_ratio,vwma_macd_signal_ratio,vwma_macd_golden_cross,vwma_macd_dead_cross,vwma_bb_upper,vwma_bb_lower,vwma_bb_upper_ratio,vwma_bb_lower_ratio,vwma_bb_width,close_vwma_golden_cross,close_vwma_dead_cross,vwma_golden_cross,vwma_dead_cross,bb_upper,bb_lower,bb_upper_ratio,bb_lower_ratio,bb_width,golden_cross,dead_cross,vol_ma5,vol_ma20,trading_golden_cross,trading_dead_cross,trading_volume_volatility_ratio,foreign_rate,institution_rate,individual_rate,foreign_net_buy_days,institution_net_buy_days,individual_net_buy_days,prev_close,prev_trading_volume,next1_close,next2_close,next3_close,next4_close,next5_close,open_rate,low_rate,high_rate,close_rate,trading_change,trading_rolling_change,next1_close_rate,next2_close_rate,next3_close_rate,next4_close_rate,next5_close_rate,rank,day_index,pred,buy_signal,sell_signal,roi,holding_trading_days,holding_calendar_days,stop_loss,sell_roi,buy_roi,success
75121,47920,HLB제약,20240522,534315910820,KOSDAQ,32121729050,16950.0,16960.0,16070.0,16820.0,1939245.0,-190352.0,192426.0,47.0,15.71,0.991,2723.46,484723878.0,12378510253371.0,1.0,-0.004,0.009,-0.004,0.007,1.18,1.038,1863.68,0.999,845.72,914521669.0,9034343724969.0,0.999,-0.008,-0.016,-0.008,-0.016,0.944,0.981,0.73,19257.25,1.002,0.01,0.035,5345.25,1.003,0.003,0.027,11.86,0.976,12.2,0.971,False,0.011,0.843,0.146,False,-0.008,0.055,2024-05-22,2024,5,2,0.5,-0.866,0.588,-0.809,1370,2015-12-21,17.187,0.185,-0.201,-0.088,False,False,20284.0,29441.0,-0.171,-0.429,17602.694,22883.388,19648.063,23756.05,-0.044,-0.265,0.769,0.956,-0.04,-0.132,-0.223,-0.244,-0.127,0,0,35705.719,10061.057,2.123,0.598,1.525,0,0,0,0,42263.331,16618.669,2.513,0.988,152.465,0,0,4115517.8,1570273.1,0,0,1.387,10.732,0.196,0.0,1,2,0,16500.0,6564310.0,16700.0,17590.0,18990.0,22700.0,23000.0,2.727,-2.606,2.788,1.939,0.295,0.471,-0.713,4.578,12.901,34.958,36.742,88,1327,0.156,True,False,,5,7,False,,0.363,True
72103,42660,한화오션,20241101,8211878959200,KOSPI,17435622100,26500.0,27100.0,26100.0,26800.0,651694.0,130848.0,-27040.0,-101391.0,22.95,1.013,2542.36,319848769.0,7944318405637.0,0.995,-0.016,-0.02,-0.016,-0.02,0.723,0.855,1671.76,0.992,729.05,747067063.0,5452481185437.0,0.981,-0.013,-0.035,-0.014,-0.035,0.959,1.042,0.686,20021.75,0.975,-0.021,-0.018,5738.5,0.981,-0.016,-0.019,23.16,1.138,20.96,1.085,True,0.3,0.4,0.3,True,0.011,0.038,2024-11-01,2024,11,4,-0.5,0.866,-0.951,0.309,331,2001-02-02,13.218,0.04,-0.038,-0.022,False,False,27820.0,30067.5,-0.037,-0.109,27739.981,29786.797,28965.073,30153.316,-0.034,-0.1,0.931,0.966,-0.011,-0.003,-0.009,-0.044,-0.028,0,0,32805.384,26768.21,1.224,0.999,0.225,0,0,0,0,33086.087,27048.913,1.235,1.009,22.527,0,0,1155203.4,841485.7,0,0,0.542,0.0,0.0,0.621,0,0,9,26750.0,1302949.0,27050.0,27600.0,27800.0,33850.0,36200.0,-0.935,-2.43,1.308,0.187,0.5,0.564,0.933,2.985,3.731,26.306,35.075,107,1436,0.179,True,False,,5,7,False,,0.346,True
54672,28300,HLB,20240522,6450676445200,KOSDAQ,340136945700,49250.0,49600.0,47250.0,49300.0,6976093.0,-362786.0,347629.0,21397.0,15.71,0.991,2723.46,484723878.0,12378510253371.0,1.0,-0.004,0.009,-0.004,0.007,1.18,1.038,1863.68,0.999,845.72,914521669.0,9034343724969.0,0.999,-0.008,-0.016,-0.008,-0.016,0.944,0.981,0.73,19257.25,1.002,0.01,0.035,5345.25,1.003,0.003,0.027,11.86,0.976,12.2,0.971,False,0.128,0.851,0.021,True,0.001,0.05,2024-05-22,2024,5,2,0.5,-0.866,0.588,-0.809,321,1996-07-27,12.424,0.191,-0.224,-0.082,False,False,61540.0,94430.0,-0.199,-0.478,53330.492,71006.355,62361.801,74152.811,-0.076,-0.306,0.751,0.924,-0.047,-0.133,-0.248,-0.239,-0.071,0,0,115144.993,26867.717,2.336,0.545,1.791,0,0,0,0,138568.638,50291.362,2.811,1.02,179.061,0,0,10180854.8,3856814.1,0,0,1.2,7.432,0.9,0.0,2,2,0,48500.0,21085467.0,48700.0,50700.0,56200.0,64700.0,66100.0,1.546,-2.577,2.268,1.649,0.331,0.685,-1.217,2.84,13.996,31.237,34.077,10,1128,0.19,True,False,,5,7,False,,0.336,True
122215,115450,HLB테라퓨틱스,20240520,506401991820,KOSDAQ,56378456640,4781.0,6086.0,4781.0,6087.0,10591465.0,-684745.0,228839.0,-69589.0,15.95,0.998,2742.14,582071105.0,12502253995995.0,1.006,0.002,0.02,0.003,0.02,1.136,1.054,1872.68,1.009,847.08,1017956271.0,8345055428621.0,0.991,-0.013,-0.015,-0.012,-0.015,0.902,0.914,0.667,19103.5,0.998,0.015,0.037,5320.25,0.998,0.007,0.03,12.42,0.998,12.83,0.996,False,-0.001,0.0,1.001,True,0.273,0.273,2024-05-20,2024,5,0,0.5,-0.866,0.0,1.0,746,2010-03-26,9.405,0.16,-0.187,-0.109,False,False,7542.6,9902.35,-0.193,-0.385,6890.87,9139.754,8060.542,10602.101,-0.117,-0.334,0.754,0.883,-0.229,-0.086,-0.077,-0.418,-0.414,0,0,12682.183,5597.325,2.083,0.92,1.164,0,0,0,0,13444.779,6359.921,2.209,1.045,116.393,0,0,3287620.8,2033857.5,1,0,1.173,1.515,0.0,0.0,2,0,0,5944.0,447753.0,6773.0,7097.0,7249.0,7802.0,7687.0,-19.566,-19.566,2.389,2.406,23.655,3.222,11.27,16.593,19.09,28.175,26.286,49,1325,0.178,True,False,,6,8,False,,0.31,True
157978,272210,한화시스템,20240611,3249413490800,KOSPI,12975903260,17510.0,17520.0,17190.0,17200.0,748634.0,213945.0,-173602.0,-40595.0,17.16,0.999,2705.32,558892980.0,11776520079436.0,1.002,0.003,0.0,0.004,0.001,1.04,0.952,1843.56,1.002,868.36,1038631016.0,10531409778429.0,1.004,0.011,0.021,0.011,0.022,1.174,1.119,0.894,19592.75,1.004,0.006,0.02,5371.25,1.003,0.004,0.01,12.74,1.043,13.03,1.002,True,0.03,0.03,0.939,False,-0.018,0.019,2024-06-11,2024,6,1,0.0,-1.0,0.951,0.309,326,2019-11-13,40.379,0.034,-0.015,-0.01,False,False,17242.0,17869.0,-0.002,-0.037,17172.758,17907.118,17580.742,18257.747,0.002,-0.039,0.959,1.002,-0.008,-0.004,0.002,-0.039,-0.048,0,0,19032.554,16781.682,1.107,0.976,0.131,0,0,0,0,18994.436,16743.564,1.104,0.973,13.086,0,0,1101148.6,1159832.9,0,1,0.284,0.0,0.0,1.16,0,0,2,17410.0,900697.0,18120.0,18390.0,18700.0,19550.0,21700.0,0.574,-1.264,0.632,-1.206,0.831,0.68,5.349,6.919,8.721,13.663,26.163,178,1043,0.157,True,False,,5,7,False,,0.257,True


Unnamed: 0,stock_code,stock_name,date,market_cap,market_name,trading_value,open,high,low,close,trading_volume,individual,foreign,institution,vix_close,vix_close_rate,kospi_close,kospi_trading_volume,kospi_trading_value,kospi_close_rate,kospi_ma5_gap,kospi_ma20_gap,kospi_vwma5_gap,kospi_vwma20_gap,kospi_trading_rate,kospi_trading_dev,kospi200f_close,kospi200f_close_rate,kosdaq_close,kosdaq_trading_volume,kosdaq_trading_value,kosdaq_close_rate,kosdaq_ma5_gap,kosdaq_ma20_gap,kosdaq_vwma5_gap,kosdaqi_vwma20_gap,kosdaq_trading_rate,kosdaq_trading_dev,kospi_kosdaq_ratio,nasdaq100f_close,nasdaq100f_close_rate,nasdaq100f_ma5_gap,nasdaq100f_ma20_gap,sp500f_close,sp500f_close_rate,sp500f_ma5_gap,sp500f_ma20_gap,sp500v_close,sp500v_close_rate,sp500vf_close,sp500vf_close_rate,is_kospi,candle_upper_tail_ratio,candle_lower_tail_ratio,candle_body_ratio,candle_sign,open_to_close,high_to_low,datetime,year,month,day_of_week,month_sin,month_cos,dow_sin,dow_cos,industry_code,listing_date,rsi,atr_ratio,macd_ratio,macd_signal_ratio,macd_golden_cross,macd_dead_cross,ma5,ma20,ma5_gap,ma20_gap,vwma5,vwma20,vwma12,vwma26,vwma5_gap,vwma20_gap,vwma5_to_20_ratio,close_to_vwma5_ratio,vwma5_slope_ratio,vwma5_sma5_diff,vwma20_sma20_diff,vwma_macd_ratio,vwma_macd_signal_ratio,vwma_macd_golden_cross,vwma_macd_dead_cross,vwma_bb_upper,vwma_bb_lower,vwma_bb_upper_ratio,vwma_bb_lower_ratio,vwma_bb_width,close_vwma_golden_cross,close_vwma_dead_cross,vwma_golden_cross,vwma_dead_cross,bb_upper,bb_lower,bb_upper_ratio,bb_lower_ratio,bb_width,golden_cross,dead_cross,vol_ma5,vol_ma20,trading_golden_cross,trading_dead_cross,trading_volume_volatility_ratio,foreign_rate,institution_rate,individual_rate,foreign_net_buy_days,institution_net_buy_days,individual_net_buy_days,prev_close,prev_trading_volume,next1_close,next2_close,next3_close,next4_close,next5_close,open_rate,low_rate,high_rate,close_rate,trading_change,trading_rolling_change,next1_close_rate,next2_close_rate,next3_close_rate,next4_close_rate,next5_close_rate,rank,day_index,pred,buy_signal,sell_signal,roi,holding_trading_days,holding_calendar_days,stop_loss,sell_roi,buy_roi,success
60093,33100,제룡전기,20240830,901101144900,KOSDAQ,18598660600,56500.0,56700.0,55200.0,56100.0,332509.0,-21506.0,16990.0,2859.0,18.81,0.95,2674.31,278845310.0,10340760628054.0,1.005,-0.003,0.012,-0.003,0.018,0.902,1.044,1802.36,1.003,767.66,823296386.0,7401837254629.0,1.015,0.005,0.005,0.005,0.006,1.009,1.052,0.716,19625.5,0.999,-0.009,0.014,5610.0,1.0,-0.004,0.021,15.65,0.915,15.94,0.962,False,0.133,0.6,0.267,False,-0.007,0.027,2024-08-30,2024,8,4,-0.866,-0.5,-0.951,0.309,328,1997-08-18,27.064,0.07,-0.102,-0.09,False,False,57720.0,64815.0,-0.028,-0.134,57754.347,65595.746,62326.718,68835.634,-0.029,-0.145,0.88,0.971,-0.015,0.001,0.012,-0.116,-0.147,0,0,75852.303,55339.19,1.352,0.986,0.366,0,0,0,0,75071.556,54558.444,1.338,0.973,36.565,0,0,505405.2,638411.0,0,0,0.289,0.333,2.0,0.0,5,1,0,55000.0,555402.0,55100.0,54500.0,50800.0,46950.0,44750.0,2.727,0.364,3.091,2.0,0.599,0.658,-1.783,-2.852,-9.447,-16.31,-20.232,120,1397,0.175,True,False,,8,12,False,,-0.056,False
20915,5070,코스모신소재,20241107,2802427167200,KOSPI,41989652800,92900.0,93000.0,85600.0,86200.0,478572.0,20366.0,18370.0,-39264.0,19.77,0.954,2564.63,465146173.0,10809704016429.0,1.0,-0.001,-0.01,-0.001,-0.01,0.917,1.167,1686.83,0.998,733.52,812895086.0,6599193632501.0,0.987,-0.012,-0.021,-0.012,-0.021,0.865,1.054,0.61,20894.0,1.027,0.029,0.023,5958.25,1.025,0.027,0.018,16.27,0.794,16.25,0.886,True,0.014,0.081,0.905,False,-0.072,0.086,2024-11-07,2024,11,3,-0.5,0.866,-0.588,-0.809,320,1987-09-28,23.142,0.071,-0.061,-0.043,False,False,97240.0,105070.0,-0.114,-0.18,93576.184,102104.821,97441.987,107019.242,-0.079,-0.156,0.916,0.921,-0.057,-0.038,-0.028,-0.111,-0.104,0,0,116763.621,87446.021,1.355,1.014,0.34,0,0,0,0,119728.8,90411.2,1.389,1.049,34.011,0,0,242136.2,140725.6,0,0,0.801,3.474,0.0,1.994,1,0,3,93600.0,312763.0,85600.0,80500.0,78000.0,75000.0,71200.0,-0.748,-8.547,-0.641,-7.906,1.53,1.976,-0.696,-6.613,-9.513,-12.993,-17.401,67,1440,0.216,True,False,,28,40,False,,-0.36,False
49328,20150,롯데에너지머티리얼즈,20240719,2031182281750,KOSPI,9120253950,43900.0,44200.0,42950.0,44050.0,209460.0,15905.0,-7609.0,-8848.0,17.27,1.029,2795.46,426207952.0,9585234666046.0,0.99,-0.015,-0.011,-0.016,-0.01,0.616,0.75,1917.62,0.989,828.72,871199222.0,6829503353816.0,1.008,-0.007,-0.017,-0.007,-0.017,0.981,0.968,0.713,20148.5,0.995,-0.018,-0.019,5594.5,0.992,-0.012,-0.001,15.93,1.1,15.29,1.101,True,0.12,0.76,0.12,True,0.003,0.029,2024-07-19,2024,7,4,-0.5,-0.866,-0.951,0.309,326,2011-03-04,15.392,0.043,-0.043,-0.021,False,False,44910.0,50045.0,-0.019,-0.12,45088.92,50348.337,48096.673,52456.605,-0.023,-0.125,0.896,0.977,-0.017,0.004,0.006,-0.099,-0.067,0,0,57318.038,43378.636,1.301,0.985,0.316,0,0,0,0,57014.701,43075.299,1.294,0.978,31.645,0,0,252348.2,303090.9,0,0,0.378,0.0,0.0,0.735,0,0,1,44100.0,188764.0,40500.0,39200.0,38350.0,37500.0,36800.0,-0.454,-2.608,0.227,-0.113,1.11,0.83,-8.059,-11.01,-12.94,-14.869,-16.459,183,1368,0.168,True,False,,10,14,False,,-0.152,False
72530,42700,한미반도체,20240730,12483080695800,KOSPI,230013361700,135200.0,135600.0,125300.0,128700.0,1784351.0,-332106.0,347835.0,-6437.0,17.78,1.015,2738.19,392868090.0,10199651173207.0,0.99,-0.001,-0.027,-0.001,-0.027,0.93,0.866,1872.41,0.991,803.78,842014161.0,6262355850075.0,0.995,-0.0,-0.033,-0.0,-0.033,1.166,1.006,0.614,19450.75,1.014,0.006,-0.039,5503.0,1.001,0.004,-0.016,16.6,1.013,15.97,0.987,True,0.039,0.33,0.631,False,-0.048,0.082,2024-07-30,2024,7,1,-0.5,-0.866,0.951,0.309,329,2005-07-22,17.137,0.065,-0.062,-0.039,False,False,136500.0,153310.0,-0.057,-0.161,135564.321,153386.065,147286.974,157161.382,-0.051,-0.161,0.884,0.949,-0.03,-0.007,0.0,-0.077,-0.064,0,0,177227.374,129544.756,1.377,1.007,0.37,0,0,0,0,177151.309,129468.691,1.376,1.006,37.049,0,0,1289412.2,1253879.35,1,0,0.301,5.04,0.0,0.0,1,0,0,137400.0,665775.0,131200.0,127300.0,115400.0,102600.0,107600.0,-1.601,-8.806,-1.31,-6.332,2.68,1.384,1.943,-1.088,-10.334,-20.28,-16.395,8,571,0.182,True,False,,12,17,False,,-0.012,False
164007,298050,HS효성첨단소재,20241108,936309132000,KOSPI,9055575500,219000.0,220000.0,208000.0,209000.0,42675.0,10277.0,-922.0,-7747.0,19.03,0.963,2561.15,448734499.0,9921212340072.0,0.999,-0.004,-0.011,-0.003,-0.011,0.918,1.027,1684.11,0.998,743.38,766825742.0,7206375001249.0,1.013,-0.002,-0.006,-0.002,-0.006,1.092,1.09,0.726,21224.75,1.016,0.033,0.037,6003.75,1.008,0.025,0.024,15.2,0.934,15.64,0.962,True,0.083,0.083,0.833,False,-0.046,0.058,2024-11-08,2024,11,4,-0.5,0.866,-0.951,0.309,320,2018-07-13,16.17,0.054,-0.067,-0.044,False,False,225200.0,252975.0,-0.072,-0.174,222869.08,239780.113,232161.713,246067.434,-0.062,-0.128,0.929,0.938,-0.035,-0.01,-0.052,-0.067,-0.065,0,0,277575.233,201984.994,1.328,0.966,0.362,0,0,0,0,290770.119,215179.881,1.391,1.03,36.168,0,0,36961.6,17968.8,0,0,0.792,0.0,0.0,1.493,0,0,3,218000.0,23749.0,195300.0,190500.0,182100.0,179200.0,178700.0,0.459,-4.587,0.917,-4.128,1.797,1.155,-6.555,-8.852,-12.871,-14.258,-14.498,169,1441,0.175,True,False,,5,7,False,,-0.15,False


In [73]:
from utils.chart_utils import show_stock_chart

stock_name = 'HLB제약'
df_stock = df_val_roi[df_val_roi['stock_name'] == stock_name]
show_stock_chart(df_stock, '종가 베팅')
display(df_stock[df_stock['buy_signal']].sort_values(by='date', ascending=True))

Unnamed: 0,stock_code,stock_name,date,market_cap,market_name,trading_value,open,high,low,close,trading_volume,individual,foreign,institution,vix_close,vix_close_rate,kospi_close,kospi_trading_volume,kospi_trading_value,kospi_close_rate,kospi_ma5_gap,kospi_ma20_gap,kospi_vwma5_gap,kospi_vwma20_gap,kospi_trading_rate,kospi_trading_dev,kospi200f_close,kospi200f_close_rate,kosdaq_close,kosdaq_trading_volume,kosdaq_trading_value,kosdaq_close_rate,kosdaq_ma5_gap,kosdaq_ma20_gap,kosdaq_vwma5_gap,kosdaqi_vwma20_gap,kosdaq_trading_rate,kosdaq_trading_dev,kospi_kosdaq_ratio,nasdaq100f_close,nasdaq100f_close_rate,nasdaq100f_ma5_gap,nasdaq100f_ma20_gap,sp500f_close,sp500f_close_rate,sp500f_ma5_gap,sp500f_ma20_gap,sp500v_close,sp500v_close_rate,sp500vf_close,sp500vf_close_rate,is_kospi,candle_upper_tail_ratio,candle_lower_tail_ratio,candle_body_ratio,candle_sign,open_to_close,high_to_low,datetime,year,month,day_of_week,month_sin,month_cos,dow_sin,dow_cos,industry_code,listing_date,rsi,atr_ratio,macd_ratio,macd_signal_ratio,macd_golden_cross,macd_dead_cross,ma5,ma20,ma5_gap,ma20_gap,vwma5,vwma20,vwma12,vwma26,vwma5_gap,vwma20_gap,vwma5_to_20_ratio,close_to_vwma5_ratio,vwma5_slope_ratio,vwma5_sma5_diff,vwma20_sma20_diff,vwma_macd_ratio,vwma_macd_signal_ratio,vwma_macd_golden_cross,vwma_macd_dead_cross,vwma_bb_upper,vwma_bb_lower,vwma_bb_upper_ratio,vwma_bb_lower_ratio,vwma_bb_width,close_vwma_golden_cross,close_vwma_dead_cross,vwma_golden_cross,vwma_dead_cross,bb_upper,bb_lower,bb_upper_ratio,bb_lower_ratio,bb_width,golden_cross,dead_cross,vol_ma5,vol_ma20,trading_golden_cross,trading_dead_cross,trading_volume_volatility_ratio,foreign_rate,institution_rate,individual_rate,foreign_net_buy_days,institution_net_buy_days,individual_net_buy_days,prev_close,prev_trading_volume,next1_close,next2_close,next3_close,next4_close,next5_close,open_rate,low_rate,high_rate,close_rate,trading_change,trading_rolling_change,next1_close_rate,next2_close_rate,next3_close_rate,next4_close_rate,next5_close_rate,rank,day_index,pred,buy_signal,sell_signal,roi,holding_trading_days,holding_calendar_days,stop_loss,sell_roi,buy_roi,success
75121,47920,HLB제약,20240522,534315910820,KOSDAQ,32121729050,16950.0,16960.0,16070.0,16820.0,1939245.0,-190352.0,192426.0,47.0,15.71,0.991,2723.46,484723878.0,12378510253371.0,1.0,-0.004,0.009,-0.004,0.007,1.18,1.038,1863.68,0.999,845.72,914521669.0,9034343724969.0,0.999,-0.008,-0.016,-0.008,-0.016,0.944,0.981,0.73,19257.25,1.002,0.01,0.035,5345.25,1.003,0.003,0.027,11.86,0.976,12.2,0.971,False,0.011,0.843,0.146,False,-0.008,0.055,2024-05-22,2024,5,2,0.5,-0.866,0.588,-0.809,1370,2015-12-21,17.187,0.185,-0.201,-0.088,False,False,20284.0,29441.0,-0.171,-0.429,17602.694,22883.388,19648.063,23756.05,-0.044,-0.265,0.769,0.956,-0.04,-0.132,-0.223,-0.244,-0.127,0,0,35705.719,10061.057,2.123,0.598,1.525,0,0,0,0,42263.331,16618.669,2.513,0.988,152.465,0,0,4115517.8,1570273.1,0,0,1.387,10.732,0.196,0.0,1,2,0,16500.0,6564310.0,16700.0,17590.0,18990.0,22700.0,23000.0,2.727,-2.606,2.788,1.939,0.295,0.471,-0.713,4.578,12.901,34.958,36.742,88,1327,0.156,True,False,,5,7,False,,0.363,True
