# Data preparation

In [8]:
import math
import os
from datetime import datetime

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import yfinance as yfin
from pandas_datareader import data as pdr
from pylab import rcParams
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from torch import nn, Tensor

yfin.pdr_override()

In [14]:
# Download roughly the last five years of daily AAPL quotes, ending now.
curr_time = datetime.now()
try:
    start_time = datetime(curr_time.year - 5, curr_time.month, curr_time.day)
except ValueError:
    # Only reachable on Feb 29: the year five years back is never a leap
    # year, so that calendar day does not exist. Fall back to Feb 28.
    start_time = datetime(curr_time.year - 5, curr_time.month, curr_time.day - 1)

data = pdr.get_data_yahoo('AAPL', start_time, curr_time)

data.head(5)

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-16,44.474998,44.8475,44.035,44.047501,41.88475,118263600
2018-01-17,44.037498,44.8125,43.767502,44.775002,42.576538,137547200
2018-01-18,44.842499,45.025002,44.5625,44.814999,42.614559,124773600
2018-01-19,44.6525,44.895,44.352501,44.615002,42.424389,129700400
2018-01-22,44.325001,44.445,44.150002,44.25,42.077324,108434400


In [15]:
# Per-column flag: True if the column contains at least one missing value.
data.isna().any()

Open         False
High         False
Low          False
Close        False
Adj Close    False
Volume       False
dtype: bool

# Dataset definition

In [110]:
class TransformerDataset(torch.utils.data.Dataset):
    """Sliding-window dataset yielding ``(src, trg, trg_y)`` float32 tensors.

    Each item is cut from ``data`` using one ``(start, end)`` pair from
    ``indices`` (``end`` is exclusive) and then split into:

    * ``src``   - the first ``enc_seq_len`` rows (encoder input),
    * ``trg``   - ``target_seq_len`` rows starting at the last row of ``src``
                  (decoder input, i.e. the target shifted right by one step),
    * ``trg_y`` - the last ``target_seq_len`` rows (what the loss compares to).

    All columns of ``data`` are kept in all three tensors.

    Args:
        data: pandas object (it must support ``.iloc`` and ``.values``).
        indices: sequence of ``(start, end)`` positional row bounds.
        enc_seq_len: number of rows fed to the encoder.
        dec_seq_len: stored on the instance; not used for slicing.
        target_seq_len: number of rows to predict.
        normalize: if True, standardise ``data`` with its own mean and std.
    """

    def __init__(self, data, indices, enc_seq_len,
                 dec_seq_len, target_seq_len, normalize=False):
        self.data = data
        self.indices = indices
        self.enc_seq_len = enc_seq_len
        self.dec_seq_len = dec_seq_len
        self.target_seq_len = target_seq_len
        if normalize:
            # Statistics come from the data handed to this instance.
            self.data = (self.data - self.data.mean()) / self.data.std()

    def __len__(self):
        """Number of windows, i.e. number of index pairs."""
        return len(self.indices)

    def __getitem__(self, index):
        """Return the ``(src, trg, trg_y)`` tuple for window number ``index``."""
        starting_index = self.indices[index][0]
        ending_index = self.indices[index][1]

        # Explicit positional slicing; ``ending_index`` is exclusive.
        sequence = self.data.iloc[starting_index:ending_index]

        return self.get_src_trg(sequence)

    def get_src_trg(self, sequence):
        """Split one window into encoder input, decoder input and target.

        Raises:
            AssertionError: if ``sequence`` does not hold exactly
                ``enc_seq_len + target_seq_len`` rows.
        """
        # The slices below are all expressed in terms of enc_seq_len, so the
        # window length must be checked against enc_seq_len as well (checking
        # against dec_seq_len only worked when both happened to be equal).
        expected_len = self.enc_seq_len + self.target_seq_len
        assert len(sequence) == expected_len, (
            f'expected a window of {expected_len} rows, got {len(sequence)}'
        )

        src = sequence.iloc[:self.enc_seq_len]

        # Starts at the last encoder row and stops one row before the end.
        trg = sequence.iloc[self.enc_seq_len - 1:len(sequence) - 1]
        assert len(trg) == self.target_seq_len

        trg_y = sequence.iloc[-self.target_seq_len:]
        assert len(trg_y) == self.target_seq_len

        src = torch.tensor(src.values.astype(np.float32))
        trg = torch.tensor(trg.values.astype(np.float32))
        trg_y = torch.tensor(trg_y.values.astype(np.float32))
        return src, trg, trg_y

# Model definition

In [111]:
class PositionalEncoder(nn.Module):
    """Adds fixed sinusoidal positional encodings to a sequence, then dropout.

    Args:
        dropout: dropout probability applied after the encodings are added.
        max_seq_len: number of positions precomputed in the ``pe`` buffer.
        d_model: size of the last (feature) dimension of the input.
        batch_first: if True the input is ``[batch, seq_len, d_model]``,
            otherwise ``[seq_len, batch, d_model]``.
    """

    def __init__(self, dropout=0.1, max_seq_len=5000, d_model=512, batch_first=False):
        super().__init__()

        self.d_model = d_model
        self.dropout = nn.Dropout(p=dropout)
        self.batch_first = batch_first

        # Axis of the input that indexes sequence positions.
        self.x_dim = 1 if batch_first else 0

        position = torch.arange(max_seq_len).unsqueeze(1)

        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))

        # Buffer is always stored as [max_seq_len, 1, d_model]; forward()
        # transposes it when the input is batch-first.
        pe = torch.zeros(max_seq_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer odd slot than even slots, so the
        # frequency vector is trimmed to fit.
        pe[:, 0, 1::2] = torch.cos(position * div_term[: d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + positional_encoding)`` with the shape of ``x``."""
        pe = self.pe[:x.size(self.x_dim)]  # [seq_len, 1, d_model]
        if self.batch_first:
            # -> [1, seq_len, d_model] so it broadcasts over the batch axis
            # instead of being lined up against it.
            pe = pe.transpose(0, 1)

        return self.dropout(x + pe)

In [112]:
class TransformerModel(nn.Module):
    """Encoder-decoder transformer for multi-step time-series forecasting.

    The encoder input is projected from ``input_size`` features to ``d_model``,
    positionally encoded and run through the encoder stack. The decoder input
    is projected from ``n_predicted_features`` to ``d_model``, decoded against
    the encoder output and mapped back to ``n_predicted_features``.

    Args:
        input_size: number of features per time step in the encoder input.
        nhead: number of attention heads in every layer.
        d_model: internal embedding width.
        batch_first: if True tensors are ``[batch, seq, feature]``, otherwise
            ``[seq, batch, feature]``. Forwarded to every sublayer.
        out_seq_len: accepted for interface compatibility; not used here.
        n_encoder_layers / n_decoder_layers: depth of each stack.
        dropout_enc / dropout_dec / dropout_pos_enc: dropout probabilities of
            the encoder layers, decoder layers and positional encoder.
        dim_feedforward_encoder / dim_feedforward_decoder: hidden width of the
            feed-forward sublayers.
        n_predicted_features: number of features in the decoder input and in
            the model output.
    """

    def __init__(self, input_size,
                 nhead=8,
                 d_model=512,
                 batch_first=False,
                 out_seq_len=10,
                 n_encoder_layers=4,
                 n_decoder_layers=4,
                 dropout_enc=.2,
                 dropout_dec=.2,
                 dropout_pos_enc=.1,
                 dim_feedforward_encoder=2048,
                 dim_feedforward_decoder=2048,
                 n_predicted_features=1):
        super().__init__()

        self.d_model = d_model
        self.nhead = nhead
        self.input_size = input_size
        self.batch_first = batch_first

        self.encoder_input_layer = nn.Linear(
            in_features=input_size,
            out_features=d_model
        )

        # batch_first is forwarded everywhere so that the layout flag given by
        # the caller is actually honoured by the attention layers.
        encoder_layer = torch.nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dropout=dropout_enc,
            dim_feedforward=dim_feedforward_encoder,
            batch_first=batch_first,
        )
        self.encoder = torch.nn.TransformerEncoder(encoder_layer,
                                                   num_layers=n_encoder_layers,
                                                   norm=None)

        self.decoder_input_layer = nn.Linear(
            in_features=n_predicted_features,
            out_features=d_model
        )

        decoder_layer = torch.nn.TransformerDecoderLayer(
            d_model=d_model,
            nhead=nhead,
            dropout=dropout_dec,
            dim_feedforward=dim_feedforward_decoder,
            batch_first=batch_first,
        )
        self.decoder = torch.nn.TransformerDecoder(decoder_layer,
                                                   num_layers=n_decoder_layers,
                                                   norm=None)

        self.linear_mapping = nn.Linear(
            in_features=d_model,
            out_features=n_predicted_features
        )

        self.positional_encoding_layer = PositionalEncoder(
            d_model=d_model,
            dropout=dropout_pos_enc,
            batch_first=batch_first
        )

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        """Run the encoder on ``src`` and decode ``tgt`` against its output.

        Args:
            src: encoder input whose last dimension is ``input_size``.
            tgt: decoder input whose last dimension is ``n_predicted_features``.
            src_mask: optional mask applied to the decoder's attention over the
                encoder output (handed to the decoder as ``memory_mask``).
            tgt_mask: optional mask applied to the decoder's self-attention.

        Returns:
            Tensor laid out like ``tgt`` with last dimension
            ``n_predicted_features``.
        """
        src = self.encoder_input_layer(src)
        src = self.positional_encoding_layer(src)
        memory = self.encoder(src=src)

        decoder_out = self.decoder_input_layer(tgt)

        # nn.TransformerDecoder has no `src_mask` argument; the mask over the
        # encoder output is called `memory_mask`.
        decoder_out = self.decoder(
            tgt=decoder_out,
            memory=memory,
            tgt_mask=tgt_mask,
            memory_mask=src_mask
        )

        return self.linear_mapping(decoder_out)

# Utility functions

In [113]:
def get_src_trg(self, sequence, enc_seq_len, target_seq_len):
    """Split one window into encoder input, decoder input and loss target.

    ``self`` is accepted but never used. ``sequence`` must contain exactly
    ``enc_seq_len + target_seq_len`` rows and be indexable as ``[:, 0]``.

    Returns:
        ``(src, trg, trg_y)`` where ``src`` is the first ``enc_seq_len`` rows
        with every column, ``trg`` is column 0 of the target window shifted
        one step back (with a trailing singleton axis added), and ``trg_y``
        is column 0 of the last ``target_seq_len`` rows.
    """
    total_len = len(sequence)
    assert total_len == enc_seq_len + target_seq_len

    # Decoder input: begins at the final encoder row and stops one row short
    # of the end, so it has the same length as the target but lags it by one.
    decoder_input = sequence[enc_seq_len - 1:total_len - 1][:, 0]
    if decoder_input.ndim == 1:
        decoder_input = decoder_input.unsqueeze(-1)
    assert len(decoder_input) == target_seq_len

    # Values the model output is compared against when computing the loss.
    target = sequence[-target_seq_len:][:, 0]
    assert len(target) == target_seq_len

    encoder_input = sequence[:enc_seq_len]
    return encoder_input, decoder_input, target.squeeze(-1)

In [114]:
def generate_square_subsequent_mask(target_sequence_length, encoder_sequence_length):
    """
    Build an additive attention mask of shape
    [target_sequence_length, encoder_sequence_length]: -inf strictly above
    the main diagonal, 0 on and below it.
    """
    neg_inf = float('-inf')
    filled = torch.ones(target_sequence_length, encoder_sequence_length).mul(neg_inf)
    return filled.triu(diagonal=1)

In [115]:
def get_indices_entire_sequence(data: pd.DataFrame, window_size, step_size):
    """Enumerate the ``(start, end)`` bounds of every full sliding window.

    ``end`` is exclusive, i.e. each pair is meant to be used as
    ``data[start:end]`` and selects exactly ``window_size`` rows.

    Args:
        data: any sized object; only ``len(data)`` is used.
        window_size: number of rows per window.
        step_size: offset between the starts of consecutive windows.

    Returns:
        List of ``(start, end)`` tuples in increasing order; empty when
        ``data`` is shorter than ``window_size``.

    Raises:
        ValueError: if ``step_size`` is not positive (the windows would never
            advance).
    """
    if step_size <= 0:
        raise ValueError(f'step_size must be positive, got {step_size}')

    # Because `end` is an exclusive slice bound, the last valid window ends at
    # len(data) itself; stopping at len(data) - 1 would drop that window.
    last_start = len(data) - window_size

    return [
        (start, start + window_size)
        for start in range(0, last_start + 1, step_size)
    ]

In [116]:
def reset_weights(m):
    """Re-initialise every direct child of ``m`` that has ``reset_parameters``.

    Only ``m.children()`` is inspected, so deeper descendants are untouched
    unless this function is also invoked on their parents. One line is
    printed for each layer that is reset.
    """
    resettable = (
        child for child in m.children()
        if hasattr(child, 'reset_parameters')
    )
    for child in resettable:
        print(f'Reset trainable parameters of layer = {child}')
        child.reset_parameters()

### train/test split

In [142]:
# Ordered hold-out split: the first 85% of the rows are used for training and
# the remaining rows for testing (no shuffling).
train_size = 0.85
train_size_abs = int(train_size * len(data))

train_data, test_data = data[0:train_size_abs], data[train_size_abs:]

In [144]:
# Model parameters

# -- optimisation ------------------------------------------------------------
n_epochs, lr, batch_size = 100, 1e-3, 10

# -- architecture ------------------------------------------------------------
dim_val, n_heads = 512, 8
n_decoder_layers = n_encoder_layers = 4
input_size = 6  # one input feature per column of the price frame
dropout_dec, dropout_enc, dropout_pos_enc = .2, .2, .1
batch_first = False

# -- windowing ---------------------------------------------------------------
dec_seq_len = enc_seq_len = 50
output_sequence_length = 10
# One window holds the encoder input followed by the values to predict.
window_size = enc_seq_len + output_sequence_length
max_seq_len = enc_seq_len  # NOTE(review): not referenced by the cells visible here

# -- preprocessing -----------------------------------------------------------
normalize = False

In [143]:
# Windowing options shared by the training and the test dataset.
dataset_kwargs = dict(
    enc_seq_len=enc_seq_len,
    dec_seq_len=dec_seq_len,
    target_seq_len=output_sequence_length,
    normalize=normalize,
)

# Training split: windows advance one row at a time.
training_indices = get_indices_entire_sequence(train_data, window_size, 1)
training_dataset = TransformerDataset(train_data, training_indices, **dataset_kwargs)
training_dataloader = torch.utils.data.DataLoader(training_dataset, batch_size)

# Test split: identical windowing applied to the held-out rows.
test_indices = get_indices_entire_sequence(test_data, window_size, 1)
test_dataset = TransformerDataset(test_data, test_indices, **dataset_kwargs)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size)

# Mask of shape [output_sequence_length, output_sequence_length].
trg_mask = generate_square_subsequent_mask(
    target_sequence_length=output_sequence_length,
    encoder_sequence_length=output_sequence_length,
)

# Mask of shape [output_sequence_length, enc_seq_len].
src_mask = generate_square_subsequent_mask(
    target_sequence_length=output_sequence_length,
    encoder_sequence_length=enc_seq_len,
)

model = TransformerModel(
    input_size=input_size,
    d_model=dim_val,
    nhead=n_heads,
    n_encoder_layers=n_encoder_layers,
    n_decoder_layers=n_decoder_layers,
    out_seq_len=output_sequence_length,
    dropout_enc=dropout_enc,
    dropout_dec=dropout_dec,
    dropout_pos_enc=dropout_pos_enc,
    batch_first=batch_first,
)

# Train

We use MSE loss because it penalizes large errors more heavily than small ones, and large prediction errors are the ones that would cost us money.

In [139]:
# Index of the feature column the model forecasts. The decoder is built with
# a single predicted feature, while the dataset yields every column, so one
# column has to be selected. Column 0 mirrors the standalone `get_src_trg`
# utility, which also takes `[:, 0]` - TODO confirm this is the intended target.
target_feature_idx = 0

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

model.train()
for epoch in range(n_epochs):
    epoch_loss = 0.0
    n_batches = 0

    for src, trg, trg_y in training_dataloader:
        optimizer.zero_grad()

        # Keep only the forecast column, preserving the trailing feature axis.
        trg = trg[:, :, target_feature_idx:target_feature_idx + 1]
        trg_y = trg_y[:, :, target_feature_idx:target_feature_idx + 1]

        if not batch_first:
            # The loader yields [batch, seq, feature]; a model built with
            # batch_first=False expects [seq, batch, feature]. The target is
            # permuted too so that it lines up with the model output.
            src = src.permute(1, 0, 2)
            trg = trg.permute(1, 0, 2)
            trg_y = trg_y.permute(1, 0, 2)

        # trg_mask keeps a decoder position from attending to later decoder
        # inputs, which already contain the values it is asked to predict.
        output = model(src, trg, src_mask=src_mask, tgt_mask=trg_mask)

        loss = criterion(output, trg_y)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        n_batches += 1

    # Report once per tenth epoch (mean batch loss), not once per batch.
    if (epoch + 1) % 10 == 0:
        mean_loss = epoch_loss / max(n_batches, 1)
        print(f'Epoch {epoch+1}/{n_epochs}, Loss: {mean_loss:.4f}')

# torch.save does not create missing parent directories.
os.makedirs('models', exist_ok=True)
torch.save({'state_dict': model.state_dict()},
           'models/transformer.pth')

# The model is a torch.nn.Module, so it is logged with the PyTorch flavor.
mlflow.pytorch.log_model(model,
                         'transformer.pth')

In [26]:
def train_kfolds(data, indices_generator, config):
    """Train one model per cross-validation fold and log each run to MLflow.

    For every ``(indices_train, indices_test)`` pair produced by
    ``indices_generator`` a fresh model is built, trained for 100 epochs
    with Adam and MSE loss inside its own MLflow run, saved under
    ``models/<fold>_fold_transformer.pth`` and logged as an MLflow model.

    Args:
        data: object indexable by the index collections of each fold.
        indices_generator: iterable of ``(indices_train, indices_test)`` pairs.
        config: currently unused.

    NOTE(review): this function is an unfinished draft. ``BATCH_SIZE``,
    ``SimpleTransformer``, ``input_dim``, ``d_model``, ``nhead``,
    ``num_layers`` and ``X_train`` are neither parameters nor defined in the
    body, so they must exist as globals. The two data loaders are created but
    not iterated yet, and the loss compares the prediction with ``X_train``
    itself - confirm that is intended.
    """
    for fold_counter, (indices_train, indices_test) in enumerate(indices_generator):
        train_loader = torch.utils.data.DataLoader(data[indices_train],
                                                   batch_size=BATCH_SIZE,
                                                   num_workers=4)
        test_loader = torch.utils.data.DataLoader(data[indices_test],
                                                  batch_size=BATCH_SIZE,
                                                  num_workers=4)
        # Define a fresh model for this fold
        model = SimpleTransformer(input_dim, d_model, nhead, num_layers)

        # Define the loss function and optimizer
        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        model.train()

        # Train the model
        num_epochs = 100
        with mlflow.start_run():
            mlflow.log_param("input_dim", input_dim)
            mlflow.log_param("d_model", d_model)
            mlflow.log_param("nhead", nhead)
            mlflow.log_param("num_layers", num_layers)
            for epoch in range(num_epochs):
                # TODO: iterate over train_loader instead of the full X_train

                # Forward pass
                y_pred = model(X_train)

                # Compute the loss
                loss = criterion(y_pred, X_train)
                mlflow.log_metric("train_loss", loss.item(), step=epoch)
                # Zero gradients, perform a backward pass, and update the weights
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # TODO: evaluate on test_loader and log the test loss per epoch

                if (epoch+1) % 10 == 0:
                    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}')

            # torch.save does not create missing parent directories.
            os.makedirs('models', exist_ok=True)
            torch.save({'state_dict': model.state_dict(),
                        'fold_num': fold_counter},
                       'models/{}_fold_transformer.pth'.format(str(fold_counter)))

            # The model is trained with torch optimizers, so it is logged with
            # the PyTorch flavor rather than the scikit-learn one.
            mlflow.pytorch.log_model(model,
                                     '{}_fold_transformer.pth'.format(str(fold_counter)))