In [1]:
# from google.colab import drive

# # Mount Google Drive
# drive.mount('/content/drive')

In [1]:
import pandas as pd
import numpy as np
# import category_encoders as ce
# import copy
# import polars as pl

# Visualization
import matplotlib.pylab as plt
from matplotlib import font_manager, rc
import matplotlib
import seaborn as sns
# import plotly.express as px
# %matplotlib inline
# matplotlib.rcParams['font.family'] = 'Malgun Gothic' # 한글 패치
# Preprocessing & Feature Engineering
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import SelectPercentile
from sklearn.decomposition import PCA

# Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Modeling
# from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.neural_network import MLPClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import ExtraTreesClassifier
# from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier, XGBRegressor, XGBRFRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier, VotingRegressor
from sklearn.ensemble import StackingClassifier, StackingRegressor
# from sklearn.base import ClassifierMixin

# CatBoost
# from catboost import CatBoostRegressor

# PyTorch
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.nn import Parameter
from torch import Tensor
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Evaluation
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss,mean_squared_error
import sklearn

# Utility
import os
import time
import datetime # ⚠️2019년 12월30일과 31일의 week of year가 1인 오류가 있음
import random
import warnings; warnings.filterwarnings("ignore")
from IPython.display import Image
import pickle
from tqdm import tqdm
import platform
from itertools import combinations
from scipy.stats.mstats import gmean
import holidays

# from bayes_opt import BayesianOptimization
# from num2words import num2words
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import OLSInfluence

In [2]:
# Show every row/column when a DataFrame is displayed (no truncation).
# The full option names are used instead of relying on pandas' key-prefix matching.
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)

### Setting universal random_state
SEED = 142
np.random.seed(SEED)   # also covers scikit-learn estimators left at random_state=None
random.seed(SEED)
# The former `sklearn.utils.check_random_state(142)` call was dropped: it only
# returns a new RandomState object and does not seed any global generator.
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic=True
torch.backends.cudnn.benchmark=False

# Detect Google Colab by trying to import its helper package.
# Only ImportError is caught so unrelated failures are not silently hidden.
try:
    import google.colab
    COLAB = True
    print("Note: using Google CoLab")
except ImportError:
    print("Note: not using Google CoLab")
    COLAB = False

# Make use of a GPU or MPS (Apple) if one is available.  (see module 3.2)
# `is_available()` checks that an MPS device can actually be used, whereas
# `is_built()` only reports that this PyTorch build was compiled with MPS support.
has_mps = torch.backends.mps.is_available()
device = "mps" if has_mps else "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Note: not using Google CoLab
Using device: cuda


In [3]:
class PrepareData():
    """Load the stock and news CSV files and merge them into one modelling frame.

    ``merging`` prepends a block of zero-valued calendar rows before the stock
    data so that the long rolling windows over the news features are already
    filled when the first stock row is reached, then drops those rows again.
    """

    # Calendar range of the zero-valued padding rows (inclusive, daily).
    _PAD_START = '2018-01-02'
    _PAD_END = '2018-12-31'
    # Row 42 of the padded frame is 2018-02-13 (42 days after _PAD_START).
    _LEADING_ROWS_TO_CUT = 42
    # Rows dropped after the rolling features are built. The original comment says
    # this removes everything before 2019-01-02; that holds when the merge yields
    # exactly one row per padding day (364 - 42 = 322) -- TODO confirm for new data.
    _WARMUP_ROWS_TO_DROP = 322
    # News-derived columns: forward-filled, used for rolling features, then dropped.
    _NEWS_COLUMNS = ['source_name', 'topics', 'rank_score',
                     'sentiment_Negative', 'sentiment_Neutral',
                     'sentiment_Positive', 'type_Article', 'type_Video']
    # Rolling window lengths (in rows) for the moving average / moving mode features.
    _WINDOWS = [5, 60, 120]
    # Suffix used in the generated column names -> source sentiment column.
    _SENTIMENT_COLUMNS = {'Neg': 'sentiment_Negative',
                          'Neu': 'sentiment_Neutral',
                          'Pos': 'sentiment_Positive'}

    def __init__(self,stock_fp,news_fp):
        """Store the two CSV paths; nothing is read until ``load_data`` is called."""
        self.stock_filepath = stock_fp
        self.news_filepath = news_fp
        self.topic_classes = ['CEO', 'CFO', 'Layoffs', 'Political', 'PressRelease', 'Undefined',
       'cramer', 'earnings', 'gold', 'manda', 'paylimitwall', 'paywall',
       'product', 'recession', 'tanalysis'] # 'Undefined' is class index 5

    def load_data(self):
        """Read both CSV files (first column as index) and parse their date columns.

        Returns:
            (stock, news): two DataFrames; ``stock['Date']`` and ``news['date']``
            are converted to datetimes.
        """
        stock = pd.read_csv(self.stock_filepath,index_col=0)
        news = pd.read_csv(self.news_filepath,index_col=0)
        ### parse date manually
        stock['Date'] = pd.to_datetime(stock['Date'])
        news['date'] = pd.to_datetime(news['date'])
        return stock, news

    @staticmethod
    def _window_mode(values):
        """Return the most frequent value of a window (smallest value on ties).

        ``values`` is the raw NumPy array of one rolling window. ``np.unique``
        returns the distinct values sorted ascending, so ``argmax`` over the
        counts picks the smallest of the most frequent values -- the same
        choice as ``Series.mode().iloc[0]``.
        """
        uniques, counts = np.unique(values, return_counts=True)
        if len(uniques) == 0:
            return np.nan
        return uniques[np.argmax(counts)]

    def merging(self, stock, news):
        """Merge stock prices with news features and add rolling news statistics.

        Args:
            stock: DataFrame with at least the columns ``Date`` and ``PINS``.
                NOTE: its ``PINS`` column is filled in place (see below).
            news: DataFrame with a ``date`` column plus the news feature columns.

        Returns:
            DataFrame holding the stock columns plus, for every window in
            (5, 60, 120): moving averages and moving modes of the three
            sentiment columns and the moving mode of ``topics``. The raw news
            columns are removed.
        """
        ### fill na value of PINS column
        # Missing PINS values are replaced by the value found at row position 75.
        # The result is assigned back onto the caller's frame on purpose: the
        # notebook later uses that same `stock` frame for the stock-only model.
        stock['PINS'] = stock['PINS'].fillna(stock['PINS'].iloc[75])

        ### add date range from 18.01.02 to 18.12.31
        # One zero-valued row per calendar day, built from the actual stock columns
        # instead of a hand-counted list of zeros.
        pad_dates = pd.date_range(start=self._PAD_START, end=self._PAD_END)
        padding = pd.DataFrame({col: (pad_dates if col == 'Date' else 0)
                                for col in stock.columns})
        stock_inc = pd.concat([padding,stock],axis=0)

        ### merge stock_inc and news (left join on 'Date')
        news = news.rename(columns={'date':'Date'})
        merged = pd.merge(left=stock_inc,right=news,on='Date',how='left')

        ### Cut before 2018-02-13
        merged = merged[self._LEADING_ROWS_TO_CUT:].reset_index(drop=True)

        # fill na with latest non-null values
        # Done as one assignment: the former chained
        # `df[col].fillna(method='ffill', inplace=True)` is deprecated and does not
        # modify the frame when pandas copy-on-write is active.
        merged_fillna = merged.copy()
        merged_fillna[self._NEWS_COLUMNS] = merged_fillna[self._NEWS_COLUMNS].ffill()

        ### add moving average to sentiments
        for num in self._WINDOWS:
            for short, column in self._SENTIMENT_COLUMNS.items():
                merged_fillna[f'{num}MA_sent_{short}'] = merged_fillna[column].rolling(
                    window=num).mean()
        ### add moving mode to sentiments
        for num in self._WINDOWS:
            for short, column in self._SENTIMENT_COLUMNS.items():
                merged_fillna[f'{num}MM_sent_{short}'] = merged_fillna[column].rolling(
                    window=num).apply(self._window_mode, raw=True)
        ### adding moving mode to topics
        for num in self._WINDOWS:
            merged_fillna[f'{num}MM_topics'] = merged_fillna['topics'].rolling(
                window=num).apply(self._window_mode, raw=True)

        ### drop before 2019-01-02
        total_df = merged_fillna.iloc[self._WARMUP_ROWS_TO_DROP:]
        total_df = total_df.reset_index(drop=True)

        ### drop the raw news columns; only the rolling features are kept
        total_df = total_df.drop(columns=self._NEWS_COLUMNS)

        return total_df

In [4]:
# Overall error ratio
def error_ratio(pred, true):
    """Return the mean absolute percentage error of ``pred`` against ``true``.

    Computes ``mean(|pred - true| / |true|)``. The denominator uses the
    magnitude of ``true`` so the ratio stays non-negative for negative targets;
    for positive targets the result is unchanged.

    Args:
        pred: predicted values (array-like; must broadcast against ``true``).
        true: reference values (array-like or pandas object). Zero entries
            lead to a division by zero.

    Returns:
        Whatever ``np.mean`` returns for the element-wise ratio.
    """
    return np.mean(np.abs(pred-true)/np.abs(true))

In [5]:
stock_filepath = '../../data/stock_price/netflix_60.csv' # set these paths for your own environment
news_filepath = '../../data/scraping/news_processed_filtered_2.csv'
# Alternative paths when the data lives on a mounted Google Drive (Colab):
# stock_filepath = './drive/MyDrive/Colab Notebooks/data/bitamin_mini_project/netflix_60.csv'
# news_filepath = './drive/MyDrive/Colab Notebooks/data/bitamin_mini_project/news_processed_filtered_2.csv'
loader = PrepareData(stock_filepath,news_filepath)
stock_df, news_df=loader.load_data() # >> stock_df can be used directly when running the model without sentiment features
total_df = loader.merging(stock=stock_df, news=news_df) # full dataset: stock data with the sentiment and topic features merged in

In [6]:
# Peek at the first PINS values: merging() assigns the NaN-filled PINS column
# back onto the frame it is given, so stock_df itself now holds the filled values.
stock_df['PINS'][:3]

0    24.99
1    24.99
2    24.99
Name: PINS, dtype: float64

In [7]:
### Set index as Date
# NOTE: not idempotent -- after the first run 'Date' is no longer a column, so
# re-running this cell without reloading the data raises a KeyError.
stock_df = stock_df.set_index('Date')
total_df = total_df.set_index('Date')

In [9]:
# Hold-out values the forecasts are compared against; the first CSV column is used as the index.
test_filepath = '../../data/test.csv'
# Alternative path when the data lives on a mounted Google Drive (Colab):
# test_filepath = './drive/MyDrive/Colab Notebooks/data/bitamin_mini_project/test.csv'
y_test = pd.read_csv(test_filepath,index_col=0)
y_test.head(2)

Unnamed: 0_level_0,Close Price
Date,Unnamed: 1_level_1
2024-01-02,468.5
2024-01-03,470.26


In [10]:
### Helper functions and classes
def split_xy(dataset, time_steps, y_column, target_col=5):
    """Cut a frame into sliding (input window, future target) pairs.

    Window ``i`` uses rows ``i .. i+time_steps-1`` of every column as input and
    the next ``y_column`` rows of the target column as output.

    Args:
        dataset: pandas DataFrame, one row per time step.
        time_steps: number of rows in each input window.
        y_column: number of future rows to predict (the forecast horizon;
            despite its name this is a length, not a column).
        target_col: positional index of the column to predict. Defaults to 5,
            the value that used to be hard-coded.

    Returns:
        (x, y): ``x`` has shape (n_windows, time_steps, n_columns) and ``y`` has
        shape (n_windows, y_column). Both are empty 1-D arrays when the frame
        is too short for a single window.
    """
    # Convert once instead of slicing the DataFrame for every window.
    features = dataset.to_numpy()
    target = dataset.iloc[:, target_col].to_numpy()

    n_windows = len(dataset) - time_steps - y_column + 1
    x, y = [], []
    for start in range(max(n_windows, 0)):
        x_end_number = start + time_steps
        y_end_number = x_end_number + y_column
        x.append(features[start:x_end_number])
        y.append(target[x_end_number:y_end_number])

    return np.array(x), np.array(y)

# Positional Encoding for Transformer
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence of embeddings.

    Args:
        d_model: embedding size of the input.
        dropout: dropout probability applied after the encoding is added.
        max_len: longest sequence length that can be encoded.
        batch_first: if True the input is (batch, seq, d_model); otherwise
            (seq, batch, d_model), which is the original behaviour and default.
    """

    def __init__(self, d_model, dropout=0.1, max_len=120, batch_first=False):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.batch_first = batch_first

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine slot fewer than sine slots.
        pe[:, 1::2] = torch.cos(angles)[:, :d_model // 2]
        # Buffer is kept as (max_len, 1, d_model) so its shape in state_dict is unchanged.
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encoding of each position, followed by dropout."""
        seq_len = x.size(1) if self.batch_first else x.size(0)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(0)}")
        pe = self.pe[:seq_len]
        if self.batch_first:
            pe = pe.transpose(0, 1)  # (1, seq_len, d_model), broadcast over the batch
        x = x + pe
        return self.dropout(x)

# Model definition using Transformer
class TransformerModel(nn.Module):
    """Transformer encoder that maps a window of features to ``output_size`` values.

    Input is batch-first: (batch, seq_len, input_dim). The prediction is read
    from the encoder output at the last time step.

    The encoder layer and the positional encoding are both configured as
    batch-first. With the sequence-first defaults, the (batch, seq, feature)
    tensors this model receives would have their batch axis treated as the time
    axis, so attention would mix different samples instead of time steps.

    Args:
        input_dim: number of features per time step.
        d_model: hidden size of the transformer (must be divisible by ``nhead``).
        nhead: number of attention heads.
        num_layers: number of stacked encoder layers.
        dropout: dropout used by the positional encoding.
            NOTE(review): it is not forwarded to ``nn.TransformerEncoderLayer``,
            which keeps its own default -- confirm whether that is intended.
        output_size: number of values predicted per sample.
        max_len: longest input sequence supported by the positional encoding.
    """

    def __init__(self, input_dim, d_model, nhead=4, num_layers=2, dropout=0.2, output_size=10,max_len=120):
        super(TransformerModel, self).__init__()

        self.encoder = nn.Linear(input_dim, d_model,bias=True)
        self.pos_encoder = PositionalEncoding(d_model, dropout,max_len,batch_first=True)
        # `bias` is left at its default (True), which also keeps older PyTorch versions working.
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead,batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.decoder = nn.Linear(d_model, output_size,bias=True)

    def forward(self, x):
        """Map (batch, seq_len, input_dim) to (batch, output_size)."""
        x = self.encoder(x)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)
        x = self.decoder(x[:, -1, :])  # representation of the last time step
        return x
    
class RMSELoss(nn.Module):
    """Root-mean-squared-error loss: ``sqrt(MSE(yhat, y) + eps)``.

    Args:
        eps: small constant added under the square root. The gradient of
            ``sqrt`` is infinite at 0, so without it a perfect prediction
            produces NaN gradients during backpropagation.
    """

    def __init__(self, eps=1e-8):
        super(RMSELoss,self).__init__()
        self.mse = nn.MSELoss()
        self.eps = eps

    def forward(self,yhat,y):
        """Return the scalar RMSE between predictions ``yhat`` and targets ``y``."""
        return torch.sqrt(self.mse(yhat,y) + self.eps)
    
import copy
class EarlyStopping:
    """Track the validation loss and signal when training should stop.

    Call the instance once per epoch with the model and its validation loss.
    It returns True once the loss has failed to improve by at least
    ``min_delta`` for ``patience`` consecutive calls; the best weights seen so
    far are then loaded back into the model if ``restore_best_weights`` is set.
    A human-readable summary of the last call is kept in ``status``.
    """

    def __init__(self, patience=5, min_delta=0, restore_best_weights=True):
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights
        self.best_model = None   # deep copy of the best state_dict seen so far
        self.best_loss = None    # validation loss belonging to best_model
        self.counter = 0         # consecutive calls without improvement
        self.status = ""

    def __call__(self, model, val_loss):
        """Record ``val_loss``; return True when training should stop."""
        is_first_call = self.best_loss is None
        if is_first_call or self.best_loss - val_loss >= self.min_delta:
            # New best: snapshot the weights.
            self.best_loss = val_loss
            self.best_model = copy.deepcopy(model.state_dict())
            if not is_first_call:
                # The very first call leaves counter and status untouched.
                self.counter = 0
                self.status = f"Improvement found, counter reset to {self.counter}"
            return False

        self.counter += 1
        if self.counter < self.patience:
            self.status = f"No improvement in the last {self.counter} epochs"
            return False

        self.status = f"Early stopping triggered after {self.counter} epochs."
        if self.restore_best_weights:
            model.load_state_dict(self.best_model)
        return True

In [11]:
# Results table, seeded with a single all-zero placeholder row.
# The grid-search cell appends one row per hyper-parameter combination and the
# save cell removes the placeholder again.
final = pd.DataFrame(
    0,
    index=range(1),
    columns=[
        'seq_size', 'batch_size', 'output_size', 'hidden_size',
        'error_ratio_total', 'r_squared_total', 'p_value_total',
        'error_ratio_stock', 'r_squared_stock', 'p_value_stock',
    ],
)
final

Unnamed: 0,seq_size,batch_size,output_size,hidden_size,error_ratio_total,r_squared_total,p_value_total,error_ratio_stock,r_squared_stock,p_value_stock
0,0,0,0,0,0,0,0,0,0,0


In [12]:
# Validation-loss table, seeded with a single all-zero placeholder row.
# The grid-search cell appends one row per hyper-parameter combination and the
# save cell removes the placeholder again.
val_loss_df = pd.DataFrame(
    0,
    index=range(1),
    columns=[
        'seq_size', 'batch_size', 'output_size', 'hidden_size',
        'val_loss_total', 'val_loss_stock',
    ],
)
val_loss_df

Unnamed: 0,seq_size,batch_size,output_size,hidden_size,val_loss_total,val_loss_stock
0,0,0,0,0,0,0


In [13]:
import itertools

### Hyper-parameter grid: every combination of these values is trained and evaluated.
SEQUENCE_SIZEs = [30,60,120]
# BATCH_SIZEs = [1,5,32,64,128]
BATCH_SIZEs = [1,4,8]
OUTPUT_SIZEs = [10,20]
HIDDEN_SIZEs = [64,128,256]

# Last-epoch validation loss of every run, in grid order.
last_val_losses_total, last_val_losses_stock = [],[]


def _prepare_loaders(frame, seq_size, output_size, batch_size, val_fraction=0.2):
    """Window, scale and split one dataset.

    Returns (train_loader, val_loader, x_test, n_features), where ``x_test`` is
    a float32 tensor of shape (1, seq_size, n_features) built from the last
    ``seq_size`` rows of ``frame``.
    """
    x_all, y_all = split_xy(frame, seq_size, output_size)
    n_features = x_all.shape[2]
    x_last = np.reshape(frame.iloc[-seq_size:].values, (1, seq_size, -1))

    # StandardScaler needs 2-D input, so every window is flattened to one row.
    # NOTE(review): the scaler is fitted on all windows, including the ones that
    # end up in the validation split, so validation statistics leak into the
    # scaling. Kept as in the original; consider fitting on the training part only.
    scaler = StandardScaler()
    x_all_scaled = scaler.fit_transform(np.reshape(x_all, (x_all.shape[0], -1)))
    x_last_scaled = scaler.transform(np.reshape(x_last, (1, -1)))

    # Chronological split: the last `val_fraction` of the windows is the validation set.
    n_val = int(x_all_scaled.shape[0] * val_fraction)
    split_at = x_all_scaled.shape[0] - n_val
    x_train = np.reshape(x_all_scaled[:split_at], (-1, seq_size, n_features))
    x_val = np.reshape(x_all_scaled[split_at:], (-1, seq_size, n_features))
    y_train, y_val = y_all[:split_at], y_all[split_at:]

    def to_loader(x, y, shuffle):
        dataset = TensorDataset(torch.tensor(x, dtype=torch.float32),
                                torch.tensor(y, dtype=torch.float32))
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    x_test = torch.tensor(np.reshape(x_last_scaled, (1, seq_size, -1)), dtype=torch.float32)
    return to_loader(x_train, y_train, True), to_loader(x_val, y_val, False), x_test, n_features


def _train_model(model, train_loader, val_loader, max_epochs=1000, patience=50):
    """Train ``model`` with RMSE loss, Adam, LR reduction on plateau and early stopping.

    Returns the validation loss of the last epoch that was run.
    """
    criterion = RMSELoss()
    # criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.3, patience=10)
    early_stopping = EarlyStopping(patience=patience)

    val_loss = float('nan')
    stopped_early = False
    for _ in range(max_epochs):
        ### Training
        model.train()
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            # Zero the gradients of the optimizer that owns THIS model. The old
            # "total" loop zeroed the stock optimizer here, so the gradients of
            # the total model were never reset and accumulated over all steps.
            optimizer.zero_grad()
            loss = criterion(model(x_batch), y_batch)
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        val_losses = []
        with torch.no_grad():
            for x_batch, y_batch in val_loader:
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                val_losses.append(criterion(model(x_batch), y_batch).item())
        val_loss = np.mean(val_losses)
        scheduler.step(val_loss)

        if early_stopping(model, val_loss):
            stopped_early = True
            break

    # EarlyStopping restores the best weights only when it triggers; do the same
    # when the epoch limit is reached so both exits evaluate the best model.
    if not stopped_early and early_stopping.best_model is not None:
        model.load_state_dict(early_stopping.best_model)
    return val_loss


def _evaluate_model(model, x_test, y_true, run_name, val_loss):
    """Predict the test window, print metrics, save the result CSV and the plot.

    Returns (result_frame, error_ratio, r_squared, p_value).
    """
    model.eval()
    with torch.no_grad():
        pred = model(x_test.to(device))
    pred = np.reshape(pred.to('cpu').detach().numpy(), (-1, 1))

    ### Evaluate with error_ratio
    print(f'🔸EVAL START🔸 \n {run_name}')
    err = error_ratio(pred, y_true)
    print('Error Ratio :', err)
    ### R-squared, p-value (true values regressed on the predictions; fitted once)
    # Author's reading of the numbers: R-squared closer to 1 is better, a
    # p-value below 0.05 is significant, and a run that fails both was treated
    # as overfitted.
    ols_fit = sm.OLS(y_true, sm.add_constant(pred)).fit()
    r_squared = ols_fit.rsquared
    p_value = ols_fit.f_pvalue
    print("R-squared:", r_squared)
    print("p-value:", p_value)
    print('last validation loss:', val_loss)
    print('🔸EVAL END🔸')

    ### Save result
    true_values = y_true.to_numpy().reshape(-1)
    pred_values = pred.reshape(-1)
    result = pd.DataFrame(data={'true': true_values,
                                'pred': pred_values,
                                'error_ratio': ((pred - y_true) / y_true).iloc[:, 0].values},
                          index=y_true.index)
    result.to_csv(f'../../data/results/{run_name}_0219.csv')

    ### Plot Result
    steps = list(range(len(true_values)))
    plt.plot(steps, true_values, color='blue', label='true value')
    plt.plot(steps, pred_values, color='red', alpha=0.6, label='prediction')
    plt.title(run_name)
    plt.legend()
    plt.savefig(f'../../plots/results/0219_{run_name}.png', dpi=300)
    plt.clf()  # clear the current figure so the next run starts with an empty plot

    return result, err, r_squared, p_value


for SEQUENCE_SIZE,BATCH_SIZE,OUTPUT_SIZE,HIDDEN_SIZE in itertools.product(SEQUENCE_SIZEs,BATCH_SIZEs,OUTPUT_SIZEs,HIDDEN_SIZEs):
    run_prefix = f'transformer_seq{SEQUENCE_SIZE}_batch{BATCH_SIZE}_out{OUTPUT_SIZE}_hidden{HIDDEN_SIZE}'

    ### Data for both variants: "total" (stock + news features) and "stock" (stock only)
    train_loader_total, val_loader_total, x_test_total, n_features_total = _prepare_loaders(
        total_df, SEQUENCE_SIZE, OUTPUT_SIZE, BATCH_SIZE)
    train_loader_stock, val_loader_stock, x_test_stock, n_features_stock = _prepare_loaders(
        stock_df, SEQUENCE_SIZE, OUTPUT_SIZE, BATCH_SIZE)

    ### Model Declare (both models are created before any training, as before,
    ### so the random initialisation order is unchanged)
    model_total = TransformerModel(input_dim=n_features_total,
                               max_len=240,
                               d_model=HIDDEN_SIZE,
                               output_size=OUTPUT_SIZE).to(device)
    model_stock = TransformerModel(input_dim=n_features_stock,
                               max_len=240,
                               d_model=HIDDEN_SIZE,
                               output_size=OUTPUT_SIZE).to(device)

    # The first OUTPUT_SIZE rows of the hold-out file are the forecast targets.
    y_test_SEQ = y_test[:OUTPUT_SIZE]

    ### Total dataset: train, then evaluate
    val_loss_total = _train_model(model_total, train_loader_total, val_loader_total)
    last_val_losses_total.append(val_loss_total)
    result_total, error_ratio_total, r_squared_total, p_value_total = _evaluate_model(
        model_total, x_test_total, y_test_SEQ, f'{run_prefix}_total', val_loss_total)

    ### Stock-only dataset: train, then evaluate
    val_loss_stock = _train_model(model_stock, train_loader_stock, val_loader_stock)
    last_val_losses_stock.append(val_loss_stock)
    result_stock, error_ratio_stock, r_squared_stock, p_value_stock = _evaluate_model(
        model_stock, x_test_stock, y_test_SEQ, f'{run_prefix}_stock', val_loss_stock)

    ### add results to final dataframe
    results_df = pd.DataFrame({
        'seq_size':[SEQUENCE_SIZE],
        'batch_size':[BATCH_SIZE],
        'output_size':[OUTPUT_SIZE],
        'hidden_size':[HIDDEN_SIZE],
        'error_ratio_total':[error_ratio_total],
        'r_squared_total':[r_squared_total],
        'p_value_total':[p_value_total],
        'error_ratio_stock':[error_ratio_stock],
        'r_squared_stock':[r_squared_stock],
        'p_value_stock':[p_value_stock]
    })
    final = pd.concat([final,results_df],axis=0)

    val_loss_df_temp = pd.DataFrame({
        'seq_size':[SEQUENCE_SIZE],
        'batch_size':[BATCH_SIZE],
        'output_size':[OUTPUT_SIZE],
        'hidden_size':[HIDDEN_SIZE],
        'val_loss_total':[val_loss_total],
        'val_loss_stock':[val_loss_stock]
    })
    val_loss_df = pd.concat([val_loss_df,val_loss_df_temp],axis=0)

KeyboardInterrupt: 

In [13]:
### save final
# Remove the all-zero placeholder row the result tables were created with.
# Filtering on seq_size (0 only for the placeholder; real runs use the values in
# SEQUENCE_SIZEs) keeps this cell safe to re-run: the former `iloc[1:]` dropped
# one more real result row on every execution.
final = final[final['seq_size'] != 0].reset_index(drop=True)
print(final.shape)
final.to_csv('../../data/results/final_all_results_0219.csv')
val_loss_df = val_loss_df[val_loss_df['seq_size'] != 0].reset_index(drop=True)
print(val_loss_df.shape)
val_loss_df.to_csv('../../data/results/val_loss_results_0219.csv')

(90, 10)


In [16]:
# Reload a saved results table for inspection.
# NOTE(review): this reads 'final_all_results.csv', while the save cell in this
# notebook writes 'final_all_results_0219.csv' -- confirm which file is intended.
final_df = pd.read_csv('../../data/results/final_all_results.csv',index_col=0)

In [17]:
# Display the reloaded results table (one row per hyper-parameter combination).
final_df

Unnamed: 0,seq_size,batch_size,output_size,hidden_size,error_ratio_total,r_squared_total,p_value_total,error_ratio_stock,r_squared_stock,p_value_stock
0,30,1,10,64,0.203406,0.02420091,0.667815,0.04517,0.407733,0.04691567
1,30,1,10,128,0.157909,0.003318244,0.874408,0.02165,0.008583,0.7990683
2,30,1,10,256,0.031342,0.03007018,0.631873,0.089128,0.183508,0.2167777
3,30,1,20,64,0.121427,0.1325274,0.114582,0.052569,0.201426,0.04715242
4,30,1,20,128,0.100733,0.6809472,8e-06,0.056673,0.06863,0.2645186
5,30,1,20,256,0.177341,0.1919437,0.053345,0.064862,0.808851,6.945284e-08
6,30,5,10,64,0.184827,0.01806441,0.711245,0.033213,0.089823,0.4001782
7,30,5,10,128,0.190569,0.001371833,0.91909,0.044442,0.028878,0.6388165
8,30,5,10,256,0.188393,0.07151231,0.455092,0.106056,0.158785,0.2540497
9,30,5,20,64,0.219501,0.02530574,0.502924,0.067613,0.000251,0.9471068
