In [74]:
# Notebook-wide configuration constants.
# Number of splits handed to TimeSeriesSplit.
N_FOLDS = 10
# Batch size used for the train/test DataLoaders.
BATCH_SIZE = 100
# Presumably the number of input features per time step — TODO confirm.
INPUT_DIM = 2
# Forwarded to train_kfolds as d_model.
D_MODEL = 128
# Forwarded to train_kfolds as nhead.
NHEAD = 2
# Forwarded to train_kfolds as num_layers.
NUM_LAYERS = 3

# Data preparation

In [5]:
import math
import os

import mlflow
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

# Read the CSV stored at S&P_500/AAPL_diff.csv.
df = pd.read_csv('S&P_500/AAPL_diff.csv')
# Remove the 'Date' and 'Name' columns; every other column is kept.
df = df.drop(['Date', 'Name'], axis=1)

We will use a forward-chaining K-fold split strategy. <br>
scikit-learn provides this as the `TimeSeriesSplit` class.

In [6]:
from sklearn.model_selection import TimeSeriesSplit
# Time-ordered cross-validation splitter configured with N_FOLDS splits.
splitter = TimeSeriesSplit(n_splits=N_FOLDS)

## TODO: add a min/max scaler

# Model definition

In [73]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch-first sequence.

    The precomputed table follows the standard formulation, for ``k = 0, 1, ...``::

        pe[pos, 2k]     = sin(pos / 10000 ** (2k / d_model))
        pe[pos, 2k + 1] = cos(pos / 10000 ** (2k / d_model))

    so each sine/cosine pair shares one frequency.

    Args:
        d_model: Size of the last (feature) dimension of the inputs.
        max_seq_len: Number of positions to precompute. Inputs with more
            positions along dimension 1 are rejected in ``forward``.
    """

    def __init__(self, d_model, max_seq_len=80):
        super().__init__()
        # Column vector of positions: shape (max_seq_len, 1).
        positions = torch.arange(max_seq_len, dtype=torch.float32).unsqueeze(1)
        # One inverse frequency per sin/cos pair: 10000 ** (-2k / d_model).
        # arange(0, d_model, 2) already enumerates the even indices 2k.
        inv_freq = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float32)
            * (-math.log(10000.0) / d_model)
        )
        angles = positions * inv_freq

        pe = torch.zeros(max_seq_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer cosine column than sine columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading batch axis of size 1 so the table broadcasts over the batch.
        # Registered as a buffer: it follows .to(device) and is saved in the
        # state dict, but is not a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encoding for its first ``x.size(1)`` positions.

        Args:
            x: Tensor whose dimension 1 is the sequence axis and whose last
                dimension has size ``d_model``.

        Raises:
            ValueError: If the sequence is longer than ``max_seq_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f'Sequence length {seq_len} exceeds the maximum supported '
                f'length {self.pe.size(1)}'
            )
        return x + self.pe[:, :seq_len]

In [65]:
# Define the transformer model
class TransformerModel(nn.Module):
    """Encoder-decoder transformer for real-valued, batch-first sequences.

    Inputs are projected to ``d_model`` with a linear layer rather than an
    ``nn.Embedding``: an embedding is an integer-index lookup table and cannot
    consume continuous features. The previous version also referenced the
    undefined names ``num_embeddings`` and ``num_classes``, so it could not be
    constructed.

    Args:
        d_model: Width of the transformer.
        nhead: Number of attention heads.
        num_layers: Number of encoder layers.
        input_dim: Number of features per time step in ``x`` and ``y``.
            The default of 2 matches the notebook's ``INPUT_DIM`` setting.
        output_dim: Number of features predicted per time step; defaults to
            ``input_dim``.
    """

    def __init__(self, d_model, nhead, num_layers, input_dim=2, output_dim=None):
        super().__init__()

        self.d_model = d_model
        self.nhead = nhead
        self.num_layers = num_layers
        self.input_dim = input_dim
        self.output_dim = input_dim if output_dim is None else output_dim

        self.embedding = nn.Linear(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        # batch_first=True because PositionalEncoding treats dimension 1 as the
        # sequence axis. As before, num_layers is passed as the third positional
        # argument, which sets only the encoder depth; the decoder keeps
        # PyTorch's default depth.
        # NOTE(review): confirm whether the decoder should use num_layers too.
        self.transformer_layers = nn.Transformer(d_model, nhead, num_layers,
                                                 batch_first=True)
        self.fc = nn.Linear(d_model, self.output_dim)

    def forward(self, x, y):
        """Map source ``x`` and target ``y`` to per-step predictions.

        Args:
            x: Source tensor of shape (batch, src_len, input_dim).
            y: Target tensor of shape (batch, tgt_len, input_dim).

        Returns:
            Tensor of shape (batch, tgt_len, output_dim).
        """
        src = self.pos_encoder(self.embedding(x))
        # The decoder expects d_model-wide inputs, so the target is projected
        # and position-encoded the same way as the source.
        tgt = self.pos_encoder(self.embedding(y))
        # NOTE(review): no causal target mask is applied (unchanged from the
        # previous version) — confirm whether one is needed for forecasting.
        decoded = self.transformer_layers(src, tgt)
        return self.fc(decoded)

In [21]:
def reset_weights(m):
    """Re-initialise every direct child of ``m`` that offers ``reset_parameters``.

    Only the immediate children are visited; modules nested deeper are left
    untouched. A line is printed for each child that gets reset.
    """
    resettable = (child for child in m.children()
                  if hasattr(child, 'reset_parameters'))
    for child in resettable:
        print(f'Reset trainable parameters of layer = {child}')
        child.reset_parameters()

We will use MSE loss because it penalizes large errors more heavily, and large prediction errors are the ones that would cost us the most money.

## **TODO: rework the sliding window, rethink the transformer structure, and wrap training in MLflow**

In [None]:
# Sanity check: build the loaders for the first fold only and show one training batch.
for fold_counter, (indices_train, indices_test) in enumerate(splitter.split(data)):
    train_loader = torch.utils.data.DataLoader(
        data[indices_train], batch_size=BATCH_SIZE, num_workers=4
    )
    test_loader = torch.utils.data.DataLoader(
        data[indices_test], batch_size=BATCH_SIZE, num_workers=4
    )
    # Print the first batch, then stop iterating the loader.
    for x in train_loader:
        print(x)
        break
    # Later folds are not inspected.
    break

In [26]:
def _mean_epoch_loss(model, loader, criterion, optimizer=None):
    """Run one pass over ``loader`` and return the sample-weighted mean loss.

    With an ``optimizer`` the pass trains the model; without one it only
    evaluates (eval mode, gradients disabled). Returns NaN for an empty loader.
    """
    training = optimizer is not None
    model.train(training)
    total_loss, n_samples = 0.0, 0
    with torch.set_grad_enabled(training):
        for batch in loader:
            # The prediction is scored against the input batch itself, exactly
            # as the previous code did with X_train.
            # NOTE(review): revisit the target once the sliding window exists.
            loss = criterion(model(batch), batch)
            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            total_loss += loss.item() * len(batch)
            n_samples += len(batch)
    return total_loss / n_samples if n_samples else float('nan')


def train_kfolds(data, indices_generator, n_folds, d_model, nhead, num_layers,
                 input_dim=None, num_epochs=100, lr=0.001):
    """Train one model per fold and record each fold as an MLflow run.

    Each fold trains on ``data[indices_train]`` and is evaluated on
    ``data[indices_test]`` after every epoch. Previously both loaders were
    built but never used and every fold trained on a global ``X_train``.

    Args:
        data: Indexable collection of samples; must accept the index arrays
            produced by ``indices_generator``.
        indices_generator: Iterable of ``(indices_train, indices_test)`` pairs.
        n_folds: Number of folds; logged as an MLflow parameter.
        d_model: Passed through to the model constructor.
        nhead: Passed through to the model constructor.
        num_layers: Passed through to the model constructor.
        input_dim: Passed through to the model constructor; falls back to the
            notebook-level ``INPUT_DIM`` when omitted.
        num_epochs: Number of passes over each fold's training data.
        lr: Learning rate for Adam.

    Side effects:
        Writes ``models/<fold>_fold_transformer.pth`` for every fold (creating
        ``models/`` if needed) and logs parameters, per-epoch ``train_loss`` /
        ``test_loss`` and the model to MLflow.
    """
    if input_dim is None:
        input_dim = INPUT_DIM
    # torch.save does not create missing parent directories.
    os.makedirs('models', exist_ok=True)

    for fold_counter, (indices_train, indices_test) in enumerate(indices_generator):
        train_loader = torch.utils.data.DataLoader(data[indices_train],
                                                   batch_size=BATCH_SIZE,
                                                   num_workers=4)
        test_loader = torch.utils.data.DataLoader(data[indices_test],
                                                  batch_size=BATCH_SIZE,
                                                  num_workers=4)
        # A fresh model per fold.
        # NOTE(review): SimpleTransformer is not defined in the notebook cells
        # visible here — confirm it exists or switch to the intended class.
        model = SimpleTransformer(input_dim, d_model, nhead, num_layers)

        # Define the loss function and optimizer
        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

        # Train the model
        with mlflow.start_run():
            mlflow.log_param("fold", fold_counter)
            mlflow.log_param("n_folds", n_folds)
            mlflow.log_param("input_dim", input_dim)
            mlflow.log_param("d_model", d_model)
            mlflow.log_param("nhead", nhead)
            mlflow.log_param("num_layers", num_layers)
            for epoch in range(num_epochs):
                train_loss = _mean_epoch_loss(model, train_loader, criterion, optimizer)
                mlflow.log_metric("train_loss", train_loss, step=epoch)

                test_loss = _mean_epoch_loss(model, test_loader, criterion)
                mlflow.log_metric("test_loss", test_loss, step=epoch)

                if (epoch+1) % 10 == 0:
                    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {train_loss:.4f}')

            torch.save({'state_dict': model.state_dict(),
                        'fold_num': fold_counter},
                       'models/{}_fold_transformer.pth'.format(str(fold_counter)))

            # The model is a torch nn.Module, so it is logged with the PyTorch
            # flavor rather than the scikit-learn one.
            mlflow.pytorch.log_model(model,
                                     '{}_fold_transformer.pth'.format(str(fold_counter)))

In [None]:
# Train one model per fold using the module-level configuration constants.
# NOTE(review): `data` is not assigned in the cells shown here (only `df` is) — confirm where it comes from.
train_kfolds(data, splitter.split(data), N_FOLDS, D_MODEL, NHEAD, NUM_LAYERS)

Finally, evaluate the quality of an ensemble of the models trained on the different folds.