In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import yfinance as yf
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
from typing import Union
from tqdm import tqdm

import os 

while "notebooks" in os.getcwd():
    os.chdir("..")

from src.preprocessing.lstm.make_dataset import StockReturnsDataset

In [36]:
aapl_prices = yf.download(
    tickers= "AAPL",
    start="2000-12-30",
    end="2016-12-30"
)

[*********************100%***********************]  1 of 1 completed


In [37]:
prices_df = aapl_prices[["Close"]]

In [38]:
returns = prices_df\
    .pct_change(-1)\
    .dropna()

In [47]:
prices_df\
    .pct_change()\
    .head(30)

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2001-01-02,
2001-01-03,0.100841
2001-01-04,0.041985
2001-01-05,-0.040294
2001-01-08,0.01145
2001-01-09,0.037737
2001-01-10,-0.036365
2001-01-11,0.086794
2001-01-12,-0.045139
2001-01-16,-0.003636


In [52]:
past_returns = prices_df\
            .pct_change()\
            .dropna()
        
future_returns = prices_df\
    .pct_change(-1)\
    .dropna()\
    .Close

for i in range(10, 0, -1):
    past_returns[f"d-{i}"] = past_returns\
        ["Close"]\
        .shift(i)

    past_returns = past_returns.dropna()

idx = future_returns.index\
    .intersection(past_returns.index)

past_returns = past_returns\
    .reindex(idx)

future_returns = future_returns\
            .reindex(idx)

In [53]:
past_returns

Unnamed: 0_level_0,Close,d-10,d-9,d-8,d-7,d-6,d-5,d-4,d-3,d-2,d-1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2001-03-23,0.063582,-0.027028,-0.080247,0.050335,0.044728,-0.036695,-0.003177,0.047773,-0.042553,0.022221,0.074535
2001-03-26,-0.053042,-0.080247,0.050335,0.044728,-0.036695,-0.003177,0.047773,-0.042553,0.022221,0.074535,0.063582
2001-03-27,0.050045,0.050335,0.044728,-0.036695,-0.003177,0.047773,-0.042553,0.022221,0.074535,0.063582,-0.053042
2001-03-28,-0.030608,0.044728,-0.036695,-0.003177,0.047773,-0.042553,0.022221,0.074535,0.063582,-0.053042,0.050045
2001-03-29,0.016237,-0.036695,-0.003177,0.047773,-0.042553,0.022221,0.074535,0.063582,-0.053042,0.050045,-0.030608
...,...,...,...,...,...,...,...,...,...,...,...
2016-12-21,0.000941,0.009823,0.009817,0.016322,-0.005704,0.016681,0.000000,0.005469,0.001295,0.005777,0.002658
2016-12-22,-0.006578,0.009817,0.016322,-0.005704,0.016681,0.000000,0.005469,0.001295,0.005777,0.002658,0.000941
2016-12-23,0.001978,0.016322,-0.005704,0.016681,0.000000,0.005469,0.001295,0.005777,0.002658,0.000941,-0.006578
2016-12-27,0.006351,-0.005704,0.016681,0.000000,0.005469,0.001295,0.005777,0.002658,0.000941,-0.006578,0.001978


In [54]:
future_returns

Date
2001-03-23    0.056013
2001-03-26   -0.047660
2001-03-27    0.031574
2001-03-28   -0.015977
2001-03-29    0.020842
                ...   
2016-12-21    0.006621
2016-12-22   -0.001974
2016-12-23   -0.006311
2016-12-27    0.004282
2016-12-28    0.000257
Name: Close, Length: 3967, dtype: float64

In [63]:
(0.410714 - 0.386161)/0.410714 

0.05978125897826723

In [132]:
returns_dataset = StockReturnsDataset(prices_df, lookback = 100)

  .Close


In [133]:
returns_dataset.future_returns

Date
2001-05-29   -0.078716
2001-05-30    0.008595
2001-05-31    0.047119
2001-06-01   -0.011010
2001-06-04    0.013553
                ...   
2016-12-21   -0.006578
2016-12-22    0.001978
2016-12-23    0.006351
2016-12-27   -0.004264
2016-12-28   -0.000257
Name: Close, Length: 3922, dtype: float64

In [134]:
returns_dataloader = DataLoader(
    returns_dataset,
    batch_size= 1,
    shuffle=True
)

## Building LSTM model


In [124]:
lookback = 100
input_dim = 1
hidden_dim = 5
num_layers = 2
output_dim = 1
num_epochs = 20

In [125]:
class LSTM(nn.Module):
    def __init__(self, 
                 input_dim, 
                 hidden_dim, 
                 num_layers, 
                 output_dim
                ):
        
        super(LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).requires_grad_()
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).requires_grad_()
        # print(h0.shape, x.shape)
        out, (h, c) = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :]) 
        return out

In [126]:
model = LSTM(
    input_dim,
    hidden_dim,
    num_layers,
    output_dim
)

In [127]:
treinable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

In [128]:
treinable_params

406

In [129]:
loss_fn = torch.nn.MSELoss()
optimiser = torch.optim.Adam(model.parameters(), lr=0.001)

In [130]:
X, y

(tensor([[[-0.0560],
          [-0.0589],
          [ 0.0402],
          [-0.0441],
          [ 0.0107],
          [-0.0706],
          [ 0.0588],
          [ 0.0140],
          [-0.0188],
          [-0.0445]],
 
         [[ 0.0060],
          [ 0.0340],
          [ 0.0370],
          [ 0.0291],
          [ 0.0212],
          [-0.0291],
          [ 0.0115],
          [ 0.0279],
          [ 0.0054],
          [-0.0159]],
 
         [[ 0.0134],
          [-0.0184],
          [-0.0171],
          [ 0.0023],
          [-0.0049],
          [ 0.0320],
          [ 0.0565],
          [-0.0376],
          [-0.0263],
          [ 0.0237]],
 
         [[-0.0012],
          [-0.0169],
          [ 0.0105],
          [ 0.0021],
          [ 0.0103],
          [ 0.0158],
          [-0.0132],
          [ 0.0023],
          [-0.0090],
          [ 0.0301]],
 
         [[ 0.0199],
          [-0.0013],
          [-0.0141],
          [ 0.0198],
          [-0.0154],
          [ 0.0014],
          [-0.0149],
 

In [131]:
hist = np.zeros(num_epochs)

for t in (range(num_epochs)):
    for X, y in tqdm(returns_dataloader):
        y_train_pred = model(X)

        loss = loss_fn(y_train_pred, y)
        
        hist[t] = loss.item()
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

    with torch.no_grad():
        y_pred = model(returns_dataset[:][0])
        y = returns_dataset[:][1]
        epoch_accuracy = (y_pred.sign() == y.sign()).sum()/y.size(0)

        print(f"Epoch {t+1} accuracy : {epoch_accuracy}")

  0%|          | 0/4012 [00:00<?, ?it/s]

100%|██████████| 4012/4012 [00:10<00:00, 385.01it/s]


Epoch 1 accuracy : 0.517198383808136


100%|██████████| 4012/4012 [00:10<00:00, 380.95it/s]


Epoch 2 accuracy : 0.47756728529930115


100%|██████████| 4012/4012 [00:11<00:00, 363.07it/s]


Epoch 3 accuracy : 0.517198383808136


100%|██████████| 4012/4012 [00:11<00:00, 357.94it/s]


Epoch 4 accuracy : 0.517198383808136


100%|██████████| 4012/4012 [00:10<00:00, 370.19it/s]


Epoch 5 accuracy : 0.517198383808136


100%|██████████| 4012/4012 [00:10<00:00, 382.62it/s]


Epoch 6 accuracy : 0.47756728529930115


100%|██████████| 4012/4012 [00:11<00:00, 364.26it/s]


Epoch 7 accuracy : 0.47756728529930115


100%|██████████| 4012/4012 [00:13<00:00, 289.26it/s]


Epoch 8 accuracy : 0.5196909308433533


100%|██████████| 4012/4012 [00:10<00:00, 395.20it/s]


Epoch 9 accuracy : 0.517198383808136


100%|██████████| 4012/4012 [00:13<00:00, 301.15it/s]


Epoch 10 accuracy : 0.5216849446296692


100%|██████████| 4012/4012 [00:10<00:00, 397.02it/s]


Epoch 11 accuracy : 0.5176969170570374


100%|██████████| 4012/4012 [00:12<00:00, 321.09it/s]


Epoch 12 accuracy : 0.5174476504325867


100%|██████████| 4012/4012 [00:13<00:00, 287.90it/s]


Epoch 13 accuracy : 0.518195390701294


100%|██████████| 4012/4012 [00:13<00:00, 306.48it/s]


Epoch 14 accuracy : 0.4975074827671051


100%|██████████| 4012/4012 [00:12<00:00, 327.83it/s]


Epoch 15 accuracy : 0.519940197467804


100%|██████████| 4012/4012 [00:10<00:00, 371.77it/s]


Epoch 16 accuracy : 0.5062313079833984


100%|██████████| 4012/4012 [00:12<00:00, 316.93it/s]


Epoch 17 accuracy : 0.5176969170570374


100%|██████████| 4012/4012 [00:11<00:00, 342.39it/s]


Epoch 18 accuracy : 0.517198383808136


100%|██████████| 4012/4012 [00:11<00:00, 340.73it/s]


Epoch 19 accuracy : 0.518943190574646


100%|██████████| 4012/4012 [00:10<00:00, 367.88it/s]


Epoch 20 accuracy : 0.48953139781951904


In [113]:
hist

array([6.20609440e-04, 1.21760252e-03, 3.61778715e-04, 1.43581280e-03,
       4.49815823e-04, 2.63162469e-03, 2.29830021e-05, 1.37747370e-03,
       1.28702712e-04, 1.52274035e-04, 1.19904277e-03, 1.67016638e-04,
       1.53039291e-05, 4.10288747e-04, 4.14957380e-04, 4.32084635e-05,
       2.35700369e-04, 1.12152286e-03, 3.36647063e-04, 2.39328248e-04])

In [40]:
y_pred = gru(returns_dataset[:][0])

In [45]:
y = returns_dataset[:][1]

In [49]:
(y_pred.sign() == y.sign()).sum()/y.size(0)

tensor(0.5223)

In [51]:
aapl_validation = yf.download(
    tickers= "AAPL",
    start="2016-12-30",
    end="2018-12-30"
)

[*********************100%***********************]  1 of 1 completed


In [54]:
returns_val = aapl_validation\
    .pct_change()\
    .dropna()