In [1]:
from tqdm import tqdm
from statsmodels.tsa.arima.model import ARIMA

import warnings
warnings.filterwarnings("ignore")

In [4]:
import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import adfuller

def stationarity(timeseries):
    #1) Seasonality 
    for _ in range(5): #maximum differencing 5 times
        try:
            exception = 0
            result = adfuller(timeseries)
        except:
            exception = 1 # when input closing price data is the same, error occurs -> break
            break
        if result[1] < 0.05: # pvalue < 0.05
            break
        # differencing
        timeseries = timeseries.diff().dropna() #timeseries.diff() = timeseries - timeseries.shift()     
    
    #2) Trend
    if exception < 1:
        if result[1] > 0.05: #If Still not stationary, apply log transformation
            timeseries = timeseries.apply(np.log)
            logtrans = 1
    return timeseries

In [5]:
train = pd.read_csv("./open/train.csv")

In [6]:
# 추론 결과를 저장하기 위한 dataframe 생성
results_df = pd.DataFrame(columns=['종목코드', 'final_return'])

# train 데이터에 존재하는 독립적인 종목코드 추출
unique_codes = train['종목코드'].unique()

# 각 종목코드에 대해서 모델 학습 및 추론 반복
for code in tqdm(unique_codes):
    
    logtrans = 0
    # 학습 데이터 생성
    train_close = train[train['종목코드'] == code][['일자', '종가']]
    train_close['일자'] = pd.to_datetime(train_close['일자'], format='%Y%m%d')
    train_close['종가'] = stationarity(train_close['종가'])
    train_close.set_index('일자', inplace=True)
    tc = train_close['종가']
    
    # 모델 선언, 학습 및 추론
    model = ARIMA(tc, order=(2, 1, 2))
    model_fit = model.fit()
    predictions = model_fit.forecast(steps=15) # 향후 15개의 거래일에 대해서 예측
    
    if logtrans == 1:
        predictions = np.exp(predictions)
    # 최종 수익률 계산
    final_return = (predictions.iloc[-1] - predictions.iloc[0]) / predictions.iloc[0]
    
    # 결과 저장
    results_df = pd.concat([results_df, pd.DataFrame({'종목코드': [code], 'final_return': [final_return]})])

100%|███████████████████████████████████████████| 10/10 [00:02<00:00,  3.52it/s]


In [7]:
results_df['순위'] = results_df['final_return'].rank(method='first', ascending=False).astype('int') # 각 순위를 중복없이 생성
results_df

Unnamed: 0,종목코드,final_return,순위
0,A060310,-0.634147,8
0,A095570,1.07693,1
0,A006840,0.00304,3
0,A054620,-0.45927,7
0,A265520,-0.004132,4
0,A211270,-0.40587,6
0,A027410,-0.121139,5
0,A282330,-0.843253,9
0,A126600,-0.908344,10
0,A138930,0.528496,2
