# Data preparation

In [5]:
import math
import os

import mlflow
import numpy as np
import pandas as pd
import torch
from torch import nn, Tensor

# Read the AAPL series and discard the 'Date' and 'Name' columns in one chained step.
df = pd.read_csv('S&P_500/AAPL_diff.csv').drop(columns=['Date', 'Name'])

We will use a forward-chaining K-fold split strategy.<br>
scikit-learn provides this as the `TimeSeriesSplit` class.

In [6]:
from sklearn.model_selection import TimeSeriesSplit
# Number of forward-chaining folds. Defined here so the cell runs on a fresh
# kernel; 5 matches scikit-learn's default for n_splits -- adjust as needed.
N_FOLDS = 5
splitter = TimeSeriesSplit(n_splits=N_FOLDS)

## TODO: add a min/max scaler

# Model definition

In [76]:
class PositionalEncoder(nn.Module):
    """Add fixed sinusoidal positional encodings to a sequence of embeddings.

    Args:
        dropout: Dropout probability applied after the encoding is added.
        max_seq_len: Number of positions in the precomputed encoding table.
        d_model: Size of the last (embedding) dimension of the input.
        batch_first: If True the input is ``[batch, seq_len, d_model]``,
            otherwise ``[seq_len, batch, d_model]``.
    """

    def __init__(self, dropout=0.1, max_seq_len=5000, d_model=512, batch_first=False):
        super().__init__()

        self.d_model = d_model
        self.dropout = nn.Dropout(p=dropout)
        self.batch_first = batch_first

        # Index of the sequence-length axis in the input tensor.
        self.x_dim = 1 if batch_first else 0

        position = torch.arange(max_seq_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term  # [max_seq_len, ceil(d_model / 2)]

        # The buffer keeps the [max_seq_len, 1, d_model] layout regardless of
        # batch_first; forward() re-orients it when needed.
        pe = torch.zeros(max_seq_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer cosine column than sine columns.
        pe[:, 0, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + positional_encoding)`` with the same shape as ``x``."""
        pe = self.pe[:x.size(self.x_dim)]  # [seq_len, 1, d_model]
        if self.batch_first:
            # Move the sequence axis to position 1 so it lines up with
            # [batch, seq_len, d_model] and broadcasts over the batch axis.
            pe = pe.transpose(0, 1)  # [1, seq_len, d_model]

        return self.dropout(x + pe)

In [79]:
class TransformerModel(nn.Module):
    """Encoder-decoder transformer that maps a source sequence to a predicted sequence.

    The source is projected to ``d_model``, positionally encoded and passed
    through the encoder stack. The decoder input is projected to ``d_model``,
    decoded against the encoder memory and mapped back to
    ``n_predicted_features`` values per step.

    Args:
        input_size: Number of features per step of the encoder input.
        nhead: Number of attention heads in every encoder/decoder layer.
        num_layers: Stored on the instance only; the stack depths are set by
            ``n_encoder_layers`` / ``n_decoder_layers``.
        d_model: Width of the internal representation.
        batch_first: Layout flag forwarded to the encoder layers, decoder
            layers and positional encoder.
        out_seq_len: Length of the predicted sequence (stored on the instance).
        n_encoder_layers: Number of stacked encoder layers.
        n_decoder_layers: Number of stacked decoder layers.
        dropout_enc: Dropout inside the encoder layers.
        dropout_dec: Dropout inside the decoder layers.
        dropout_pos_enc: Dropout applied by the positional encoder.
        dim_feedforward_encoder: Feed-forward width in the encoder layers.
        dim_feedforward_decoder: Feed-forward width in the decoder layers.
        n_predicted_features: Number of features per step of the decoder
            input and of the model output.
        max_seq_len: Number of positions precomputed by the positional encoder.
    """

    def __init__(self, input_size,
                 nhead=8,
                 num_layers=2,
                 d_model=512,
                 batch_first=False,
                 out_seq_len=10,
                 n_encoder_layers=4,
                 n_decoder_layers=4,
                 dropout_enc=.2,
                 dropout_dec=.2,
                 dropout_pos_enc=.1,
                 dim_feedforward_encoder=2048,
                 dim_feedforward_decoder=2048,
                 n_predicted_features=1,
                 max_seq_len=5000):
        super(TransformerModel, self).__init__()

        self.d_model = d_model
        self.nhead = nhead
        self.num_layers = num_layers
        self.input_size = input_size
        # Previously assigned from an undefined name, which made the
        # constructor raise NameError on every call.
        self.out_seq_len = out_seq_len
        self.batch_first = batch_first

        self.encoder_input_layer = nn.Linear(
            in_features=input_size,
            out_features=d_model
        )

        encoder_layer = torch.nn.TransformerEncoderLayer(d_model=d_model,
                                                         nhead=nhead,
                                                         dropout=dropout_enc,
                                                         dim_feedforward=dim_feedforward_encoder,
                                                         batch_first=batch_first)
        self.encoder = torch.nn.TransformerEncoder(encoder_layer,
                                                   num_layers=n_encoder_layers,
                                                   norm=None)

        self.decoder_input_layer = nn.Linear(
            in_features=n_predicted_features,
            out_features=d_model
        )

        decoder_layer = torch.nn.TransformerDecoderLayer(d_model=d_model,
                                                         nhead=nhead,
                                                         dropout=dropout_dec,
                                                         dim_feedforward=dim_feedforward_decoder,
                                                         batch_first=batch_first)
        self.decoder = torch.nn.TransformerDecoder(decoder_layer,
                                                   num_layers=n_decoder_layers,
                                                   norm=None)

        self.linear_mapping = nn.Linear(
            in_features=d_model,
            out_features=n_predicted_features
        )

        # The positional encoder must use the same layout as the attention
        # layers, otherwise the encoding is added along the wrong axis.
        self.positional_encoding_layer = PositionalEncoder(
            d_model=d_model,
            dropout=dropout_pos_enc,
            max_seq_len=max_seq_len,
            batch_first=batch_first
        )

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        """Run the encoder on ``src`` and decode ``tgt`` against it.

        Args:
            src: Encoder input whose last dimension is ``input_size``.
            tgt: Decoder input whose last dimension is ``n_predicted_features``.
            src_mask: Optional mask handed to the decoder as ``memory_mask``
                (decoder positions x encoder positions).
            tgt_mask: Optional mask handed to the decoder as ``tgt_mask``
                (decoder positions x decoder positions).

        Returns:
            The decoder output mapped to ``n_predicted_features`` per step.
            The batch/sequence axis order follows ``batch_first``.
        """
        src = self.encoder_input_layer(src)        # last dim: input_size -> d_model
        src = self.positional_encoding_layer(src)
        src = self.encoder(src=src)

        decoder_out = self.decoder_input_layer(tgt)  # last dim: n_predicted_features -> d_model

        # nn.TransformerDecoder has no `src_mask` keyword; the mask over the
        # encoder memory is called `memory_mask`.
        decoder_out = self.decoder(
            tgt=decoder_out,
            memory=src,
            tgt_mask=tgt_mask,
            memory_mask=src_mask
        )

        decoder_out = self.linear_mapping(decoder_out)  # last dim: d_model -> n_predicted_features

        return decoder_out

# Utility functions

In [80]:
def get_src_trg(self, sequence, enc_seq_len, target_seq_len):
    """Split one window into encoder input, decoder input and target.

    Args:
        self: Unused; kept so existing call sites keep working.
        sequence: 2-D window of exactly ``enc_seq_len + target_seq_len`` rows.
            Must support ``unsqueeze``/``squeeze`` (presumably a torch tensor).
        enc_seq_len: Number of leading rows handed to the encoder.
        target_seq_len: Number of trailing rows to predict.

    Returns:
        ``(src, trg, trg_y)``: ``src`` is the first ``enc_seq_len`` rows,
        ``trg`` is column 0 of the target shifted right by one step (with a
        trailing feature axis), and ``trg_y`` is column 0 of the last
        ``target_seq_len`` rows.
    """
    total_len = len(sequence)
    assert (total_len == enc_seq_len + target_seq_len)

    # Encoder input: everything before the prediction horizon.
    src = sequence[:enc_seq_len]

    # Decoder input: starts at the last encoder step and stops one step
    # before the end, i.e. the target shifted right by one. Only column 0
    # is kept.
    decoder_in = sequence[enc_seq_len - 1:total_len - 1][:, 0]
    if len(decoder_in.shape) == 1:
        # Restore a feature axis so the decoder sees [target_seq_len, 1].
        decoder_in = decoder_in.unsqueeze(-1)
    assert (len(decoder_in) == target_seq_len)

    # Ground truth the model output is compared against when computing the loss.
    target = sequence[-target_seq_len:][:, 0]
    assert (len(target) == target_seq_len)

    return src, decoder_in, target.squeeze(-1)

In [None]:
def generate_square_subsequent_mask(target_sequence_length, encoder_sequence_length):
    """
    Build an additive attention mask of shape
    (target_sequence_length, encoder_sequence_length): -inf strictly above
    the main diagonal, zeros on and below it.
    """
    shape = (target_sequence_length, encoder_sequence_length)
    all_blocked = torch.ones(*shape) * float('-inf')
    # diagonal=1 keeps only the entries above the main diagonal; the rest become 0.
    return torch.triu(all_blocked, diagonal=1)

In [21]:
def reset_weights(m):
    """Re-initialise each direct child of ``m`` that defines ``reset_parameters``.

    Only ``m.children()`` is visited, so nested sub-modules are reached only
    if this function is applied to them as well.
    """
    for layer in m.children():
        if not hasattr(layer, 'reset_parameters'):
            continue
        print(f'Reset trainable parameters of layer = {layer}')
        layer.reset_parameters()

In [None]:
# Model parameters
dim_val = 512
n_heads = 8
n_decoder_layers = 4
n_encoder_layers = 4
input_size = 2
dec_seq_len = 100
enc_seq_len = 100
output_sequence_length = 10
max_seq_len = enc_seq_len

# Mask for the decoder's self-attention: each target step may only attend to
# itself and earlier target steps. Arguments are passed positionally because
# the helper has no `dim1` / `dim2` keywords.
tgt_mask = generate_square_subsequent_mask(
    output_sequence_length,
    output_sequence_length
)

# Mask for the decoder's attention over the encoder output
# (target positions x encoder positions).
src_mask = generate_square_subsequent_mask(
    output_sequence_length,
    enc_seq_len
)

# Keyword names must match TransformerModel.__init__: the width is `d_model`
# and the head count is `nhead`. `dec_seq_len` and `max_seq_len` are not
# passed to the model here; the variables above are kept for later cells.
model = TransformerModel(
    input_size=input_size,
    d_model=dim_val,
    nhead=n_heads,
    out_seq_len=output_sequence_length,
    n_encoder_layers=n_encoder_layers,
    n_decoder_layers=n_decoder_layers)

We will use MSE loss because it penalizes large errors more heavily, and large prediction errors are the ones that would lose us the most money.

## **TODO: rework the sliding window, rethink the transformer structure, and wrap training in MLflow**

In [None]:
# Smoke test: build the loaders for the first fold only and show one training batch.
for fold_counter, (indices_train, indices_test) in enumerate(splitter.split(data)):
    loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=4)
    train_loader = torch.utils.data.DataLoader(data[indices_train], **loader_kwargs)
    test_loader = torch.utils.data.DataLoader(data[indices_test], **loader_kwargs)
    for x in train_loader:
        print(x)
        break  # one batch is enough
    break  # one fold is enough

In [26]:
def _mean_loss(model, loader, criterion, optimizer=None):
    """Make one pass over ``loader`` and return the mean per-batch loss.

    With an ``optimizer`` the model is put in training mode and updated after
    every batch; without one the model is put in eval mode and gradients are
    disabled. Returns NaN when the loader yields no batches.
    """
    training = optimizer is not None
    model.train(training)
    total, n_batches = 0.0, 0
    with torch.set_grad_enabled(training):
        for batch in loader:
            # NOTE(review): assumes each batch is a single tensor the model
            # accepts directly, and keeps the original objective of comparing
            # the model output with its own input -- replace with a real
            # (input, target) pair once the sliding window is in place.
            y_pred = model(batch)
            loss = criterion(y_pred, batch)
            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            total += loss.item()
            n_batches += 1
    return total / n_batches if n_batches else float('nan')


def train_kfolds(data, indices_generator, config):
    """Train one model per fold, logging each fold as a separate MLflow run.

    For every ``(train_indices, test_indices)`` pair a fresh model is trained
    on ``data[train_indices]`` and evaluated on ``data[test_indices]`` after
    each epoch. The final weights are written to
    ``models/<fold>_fold_transformer.pth`` and logged to MLflow.

    Args:
        data: Container indexable by an array of integer positions.
        indices_generator: Iterable of ``(train_indices, test_indices)`` pairs.
        config: Dict of optional overrides: ``'batch_size'`` (falls back to
            the notebook-level ``BATCH_SIZE``), ``'num_epochs'`` (default 100)
            and ``'lr'`` (default 0.001). Non-dict values are ignored.

    Note:
        ``SimpleTransformer``, ``input_dim``, ``d_model``, ``nhead`` and
        ``num_layers`` are read from the notebook namespace and must be
        defined before this function is called.
    """
    cfg = config if isinstance(config, dict) else {}
    # Only touch the BATCH_SIZE global when no override is supplied.
    batch_size = cfg['batch_size'] if 'batch_size' in cfg else BATCH_SIZE
    num_epochs = cfg.get('num_epochs', 100)
    lr = cfg.get('lr', 0.001)

    # torch.save does not create missing parent directories.
    os.makedirs('models', exist_ok=True)

    for fold_counter, (indices_train, indices_test) in enumerate(indices_generator):
        train_loader = torch.utils.data.DataLoader(data[indices_train],
                                                   batch_size=batch_size,
                                                   num_workers=4)
        test_loader = torch.utils.data.DataLoader(data[indices_test],
                                                  batch_size=batch_size,
                                                  num_workers=4)
        # A fresh model per fold so no weights leak between folds.
        model = SimpleTransformer(input_dim, d_model, nhead, num_layers)

        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

        with mlflow.start_run():
            mlflow.log_param("input_dim", input_dim)
            mlflow.log_param("d_model", d_model)
            mlflow.log_param("nhead", nhead)
            mlflow.log_param("num_layers", num_layers)
            for epoch in range(num_epochs):
                # Train on this fold's data (previously a global X_train was
                # used, so every fold trained on identical data).
                train_loss = _mean_loss(model, train_loader, criterion, optimizer)
                mlflow.log_metric("train_loss", train_loss, step=epoch)

                # Held-out loss for the same epoch.
                test_loss = _mean_loss(model, test_loader, criterion)
                mlflow.log_metric("test_loss", test_loss, step=epoch)

                if (epoch+1) % 10 == 0:
                    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {train_loss:.4f}')

            torch.save({'state_dict': model.state_dict(),
                        'fold_num': fold_counter},
                       'models/{}_fold_transformer.pth'.format(str(fold_counter)))

            # The model is a torch module, so use the pytorch flavor rather
            # than the sklearn one.
            mlflow.pytorch.log_model(model,
                                     '{}_fold_transformer.pth'.format(str(fold_counter)))

In [None]:
# Run configuration handed to train_kfolds. An annotation such as
# `config: dict` is not valid inside a call, so build the dict first.
config: dict = {}
train_kfolds(data, splitter.split(data), config)

Finally, evaluate the quality of an ensemble of the models trained on the different folds.