In [1]:
# Mount into drive

from google.colab import drive

drive.mount("/content/drive")

%cd '/content/drive/MyDrive/Colab Notebooks/Money Printer/'

# !pip install -r requirements.txt
# !pip install pandas
# !pip install -U scikit-learn
# !pip install torchinfo

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/Money Printer


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from pathlib import Path

# Data Loader Script

In [4]:
def get_data_loaders(device, label="label_1", batch_size=64, use_custom_cols=True, window_size=10, flatten=True):
    train_data = pd.read_csv("./data/Train_NoAuction_Zscore_test.csv")
    test_data = pd.read_csv("./data/Test_NoAuction_Zscore_test.csv")

    X_train = train_data.drop(
        columns=["label_1", "label_2", "label_3", "label_5", "label_10"])
    X_test = test_data.drop(
        columns=["label_1", "label_2", "label_3", "label_5", "label_10"])

    y_train = train_data[label] - 1
    y_test = test_data[label] - 1
    if not use_custom_cols:
        X_train = X_train.iloc[:, :40]
        X_test = X_test.iloc[:, :40]

    X_train = X_train.to_numpy()
    X_test = X_test.to_numpy()
    y_train = y_train.to_numpy()
    y_test = y_test.to_numpy()

    # sliding window
    if window_size != 1:
        D = X_train.shape[1]
        X_train = np.lib.stride_tricks.sliding_window_view(
            X_train, (window_size, D))
        if flatten:
          X_train = X_train.reshape((-1, window_size*D))
        y_train = y_train[window_size-1:]

        X_test = np.lib.stride_tricks.sliding_window_view(
            X_test, (window_size, D)).squeeze()
        if flatten:
            X_test = X_test.reshape((-1, window_size*D))
        y_test = y_test[window_size-1:]

    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, shuffle=True, test_size=0.2)

    X_train_tensor = torch.tensor(
        X_train, dtype=torch.float32).to(device)
    X_val_tensor = torch.tensor(
        X_val, dtype=torch.float32).to(device)
    # Unsqueeze here for DeepLOB and TransLOB models:
    X_test_tensor = torch.tensor(
        X_test, dtype=torch.float32).unsqueeze(1).to(device)


    y_train_tensor = torch.tensor(y_train, dtype=torch.long).to(device)
    y_val_tensor = torch.tensor(y_val, dtype=torch.long).to(device)
    y_test_tensor = torch.tensor(y_test, dtype=torch.long).to(device)

    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

    train_loader = DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(
        val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(
        test_dataset, batch_size=batch_size, shuffle=False)
    return train_loader, val_loader, test_loader

# Model Script:

In [5]:
class MultiHeadSelfAttention(nn.Module):
    """
    Multi-head Self-Attention layer in PyTorch.
    """
    def __init__(self, d_model: int, num_heads: int, use_masking: bool = False):
        """
        :param d_model: Dimensionality of the input features.
        :param num_heads: Number of attention heads.
        :param use_masking: Whether to apply causal masking for autoregressive tasks.
        """
        super(MultiHeadSelfAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads."

        self.d_model = d_model
        self.num_heads = num_heads
        self.use_masking = use_masking
        self.depth = d_model // num_heads

        # Query, Key, and Value projections
        self.qkv_proj = nn.Linear(d_model, d_model * 3, bias=False)

        # Output projection
        self.out_proj = nn.Linear(d_model, d_model)

    def forward(self, x: torch.Tensor):
        """
        Forward pass for multi-head self-attention.
        :param x: Input tensor of shape (batch_size, seq_len, d_model).
        :return: Output tensor of shape (batch_size, seq_len, d_model).
        """
        batch_size, seq_len, _ = x.size()

        # Project inputs to Query, Key, and Value tensors
        qkv = self.qkv_proj(x)  # (batch_size, seq_len, d_model * 3)
        q, k, v = torch.chunk(qkv, chunks=3, dim=-1)

        # Reshape for multi-head attention
        q = q.view(batch_size, seq_len, self.num_heads, self.depth).permute(0, 2, 1, 3)
        k = k.view(batch_size, seq_len, self.num_heads, self.depth).permute(0, 2, 3, 1)
        v = v.view(batch_size, seq_len, self.num_heads, self.depth).permute(0, 2, 1, 3)

        # Scaled dot-product attention
        scores = torch.matmul(q, k) / np.sqrt(self.depth)  # (batch_size, num_heads, seq_len, seq_len)

        if self.use_masking:
            mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool().to(x.device)
            scores = scores.masked_fill(mask, float('-inf'))

        attention_weights = F.softmax(scores, dim=-1)
        attention = torch.matmul(attention_weights, v)  # (batch_size, num_heads, seq_len, depth)

        # Reshape and combine heads
        attention = attention.permute(0, 2, 1, 3).contiguous()  # (batch_size, seq_len, num_heads, depth)
        attention = attention.view(batch_size, seq_len, self.d_model)  # (batch_size, seq_len, d_model)

        # Apply output projection
        output = self.out_proj(attention)
        return output

In [6]:
class LayerNormalization(nn.Module):
    """
    Implementation of Layer Normalization.
    """
    def __init__(self, d_model: int, eps=1e-5):
        """
        :param d_model: Dimensionality of the input.
        :param eps: A small epsilon to avoid division by zero.
        """
        super(LayerNormalization, self).__init__()
        self.gain = nn.Parameter(torch.ones(d_model))
        self.bias = nn.Parameter(torch.zeros(d_model))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        normalized = (x - mean) / torch.sqrt(variance + self.eps)
        return self.gain * normalized + self.bias


class TransformerTransition(nn.Module):
    """
    Transformer transition function with feed-forward layers.
    """
    def __init__(self, d_model: int, size_multiplier: int = 4, activation=F.relu):
        """
        :param d_model: Dimensionality of the input/output.
        :param size_multiplier: Multiplier for the hidden layer size.
        :param activation: Activation function.
        """
        super(TransformerTransition, self).__init__()
        self.activation = activation
        self.hidden_layer = nn.Linear(d_model, size_multiplier * d_model)
        self.output_layer = nn.Linear(size_multiplier * d_model, d_model)

    def forward(self, x):
        x = self.activation(self.hidden_layer(x))
        x = self.output_layer(x)
        return x


class TransformerBlock(nn.Module):
    """
    A single Transformer block with self-attention, residual connections,
    normalization, and a feed-forward transition layer.
    """
    def __init__(self, d_model: int, num_heads: int, use_masking: bool = True, size_multiplier: int = 4, dropout_rate: float = 0.1):
        """
        :param d_model: Dimensionality of the model.
        :param num_heads: Number of attention heads.
        :param use_masking: Whether to apply masking in the attention mechanism.
        :param size_multiplier: Multiplier for the hidden size in the feed-forward layer.
        :param dropout_rate: Dropout probability.
        """
        super(TransformerBlock, self).__init__()
        self.self_attention = MultiHeadSelfAttention(d_model, num_heads, use_masking)
        self.norm1 = LayerNormalization(d_model)
        self.norm2 = LayerNormalization(d_model)
        self.transition = TransformerTransition(d_model, size_multiplier)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        # Self-attention with residual connection and layer normalization
        attention_output = self.self_attention(x)
        attention_output = self.dropout(attention_output)
        x = self.norm1(x + attention_output)

        # Feed-forward transition with residual connection and layer normalization
        transition_output = self.transition(x)
        transition_output = self.dropout(transition_output)
        x = self.norm2(x + transition_output)

        return x

In [7]:
class CausalConv1d(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, dilation):
        super(CausalConv1d, self).__init__()
        self.kernel_size = kernel_size
        self.dilation = dilation
        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride=stride, dilation=dilation)

    def forward(self, x):
        # Calculate the padding for causal convolution with dilation
        padding = (self.kernel_size - 1) * self.dilation
        # Pad the input tensor on the left (only past context)
        x = F.pad(x, (padding, 0), mode='constant', value=0)
        # Perform the convolution
        return self.conv(x)

In [8]:
class TransLOB(nn.Module):
    def __init__(self, n_classes=3):
        super(TransLOB, self).__init__()

        self.n_classes = n_classes
        # 1. Convolutional layers
        ## 1st convolution:

        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=(1,2), stride=(1,2)),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(32),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4,1)),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(32),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4,1)),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(32),
        )

        ## 2nd:
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(1,2), stride=(1,2)),
            nn.Tanh(),
            nn.BatchNorm2d(32),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4,1)),
            nn.Tanh(),
            nn.BatchNorm2d(32),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4,1)),
            nn.Tanh(),
            nn.BatchNorm2d(32),
        )

        ## 3rd:
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(1,10)),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(32),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4,1)),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(32),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4,1)),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(32),
        )

        # 2. Inception Layer:
        ##1st inc 1x1 3x1
        self.inc1 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(1, 1), padding='same'),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(64),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(3, 1), padding='same'),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(64),
        )

        ##2nd inc 1x1 5x1
        self.inc2 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(1, 1), padding='same'),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(64),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(5, 1), padding='same'),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(64),
        )

        ##3nd inc max_pool 1x1
        self.inc3 = nn.Sequential(
            nn.MaxPool2d(kernel_size=(3, 1), stride=1, padding=(1, 0)),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(1, 1), padding='same'),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(64),
        )

        #Dilated CNN:
        self.dilated = nn.Sequential(
            CausalConv1d(192, 14, kernel_size=2, stride=1, dilation=1),
            nn.ReLU(),
            CausalConv1d(14, 14, kernel_size=2, stride=1, dilation=2),
            nn.ReLU(),
            CausalConv1d(14, 14, kernel_size=2, stride=1, dilation=4),
            nn.ReLU(),
            CausalConv1d(14, 14, kernel_size=2, stride=1, dilation=8),
            nn.ReLU(),
            CausalConv1d(14, 14, kernel_size=2, stride=1, dilation=16),
            nn.ReLU(),
        )

        #Layer Normalization block:
        # self.norm1 = nn.LayerNorm()

        #Transformer block
        self.transformer1 = TransformerBlock(d_model=15, num_heads=3)
        self.transformer2 = TransformerBlock(d_model=15, num_heads=3)

        #Feed forward block (MLP)
        self.fc1 = nn.Linear(15*82,64)
        self.dropout = nn.Dropout(p=0.1)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, self.n_classes)
        self.softmax = nn.Softmax(dim=1)

    def positional_encoding(self, x):
        """
        Adds positional encoding to the input tensor in PyTorch.
        :param x: Input tensor of shape (batch_size, steps, d_model)
        :return: Tensor with added positional encoding (batch_size, steps, d_model+1)
        """
        # Extract sequence length and model dimension
        steps, d_model = x.shape[1], x.shape[2]

        # Compute positional encoding
        ps = torch.linspace(-1, 1, steps, dtype=x.dtype, device=x.device).view(-1, 1)  # Shape: (steps, 1)
        ps = ps.unsqueeze(0).expand(x.size(0), -1, -1)  # Expand to batch size, Shape: (batch_size, steps, 1)

        # Concatenate positional encoding to the input
        x = torch.cat([x, ps], dim=-1)  # Shape: (batch_size, steps, d_model + 1)

        return x

    def forward(self, x):
        #Normal CNN
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)

        #Inception:
        inception1 = self.inc1(x)
        inception2 = self.inc2(x)
        inception3 = self.inc3(x)
        x = torch.cat((inception1, inception2, inception3), dim=1)

        # Reshape for Dilated CNN (from 4D to 3D)
        batch, channels, height, width = x.size()
        x = x.view(batch, channels, -1)


        #Dilated CNN:
        x = self.dilated(x)
        # x = self.norm1(x)


        #Positional encoding:
        x = x.permute(0, 2, 1)
        x = self.positional_encoding(x)
        # print(x.shape)

        #Transformer block:
        x = self.transformer1(x)
        x = self.transformer2(x)

        #MLP and output:
        x = x.view(x.size(0), -1)
        x = self.dropout(self.fc1(x))
        x = self.relu(x)
        x = self.fc2(x)
        output = self.softmax(x)

        return output

# Training and Testing Script

In [9]:
def test_model(model, test_loader, experiment_name, save=True, display=False):
    test_predictions = []
    test_labels = []

    model.eval()
    with torch.no_grad():
        for batch, (X, y) in enumerate(test_loader):
          pred = model(X)
          _, predicted = torch.max(pred, 1)
          test_predictions.extend(predicted.cpu().numpy())
          test_labels.extend(y.cpu().numpy())
    report = classification_report(test_labels, test_predictions)
    if display:
        print(report)
    if save:
        with open(f"results/{experiment_name}/report.txt", "w") as f:
            f.write(report)


def visualize_curves(train_losses, val_losses, train_accuracies, val_accuracies, experiment_name, save=True, display=False):
    plt.plot(np.arange(epochs), train_losses, label="train")
    plt.plot(np.arange(epochs), val_losses, label="val")
    plt.xlabel("epoch")
    plt.ylabel("loss")
    plt.title(f"{experiment_name} loss curves")
    plt.legend()
    if display:
        plt.show()
    if save:
        plt.savefig(f"results/{experiment_name}/loss_curves.png")
    plt.close()

    plt.plot(np.arange(epochs), train_accuracies, label="train")
    plt.plot(np.arange(epochs), val_accuracies, label="val")
    plt.xlabel("epoch")
    plt.ylabel("accuracy")
    plt.title(f"{experiment_name} accuracy curves")
    plt.legend()
    if display:
        plt.show()
    if save:
        plt.savefig(f"results/{experiment_name}/accuracy_curves.png")
    plt.close()


def train_and_evaluate_model(train_loader, val_loader, test_loader, experiment_name, optimizer, criterion, epochs):
    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []

    for i in range(epochs):
        model.train()
        running_train_loss = 0
        train_correct_predictions = 0
        train_total_samples = 0
        for batch, (X, y) in enumerate(train_loader):
            pred = model(X)
            loss = criterion(pred, y)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            running_train_loss += loss.item()

            _, predicted = torch.max(pred, 1)
            train_correct_predictions += (predicted == y).sum().item()
            train_total_samples += y.size(0)

        print(f"Epoch {i} train loss {running_train_loss/len(train_loader)}")
        train_losses.append(running_train_loss/len(train_loader))

        train_accuracy = train_correct_predictions / train_total_samples
        train_accuracies.append(train_accuracy)
        print(f"Epoch {i} train accuracy {train_accuracy}")

        val_correct_predictions = 0
        val_total_samples = 0
        model.eval()
        with torch.no_grad():
            running_val_loss = 0
            for batch, (X, y) in enumerate(val_loader):
                pred = model(X)
                loss = criterion(pred, y)
                running_val_loss += loss.item()

                _, predicted = torch.max(pred, 1)
                val_correct_predictions += (predicted == y).sum().item()
                val_total_samples += y.size(0)

            print(f"Epoch {i} val loss {running_val_loss/len(val_loader)}")
            val_losses.append(running_val_loss/len(val_loader))

            val_accuracy = val_correct_predictions / val_total_samples
            val_accuracies.append(val_accuracy)
            print(f"Epoch {i} val accuracy {val_accuracy}")

    test_model(model, test_loader, experiment_name)
    visualize_curves(train_losses, val_losses,
                     train_accuracies, val_accuracies, experiment_name)

In [10]:
# get device
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

# hyperparameters
learning_rate = 5e-4
batch_size = 64
epochs = 50
window_size = 100

# model definition
# model = MLP(input_size=NUM_FEATURES*window_size, hidden_size=64)
model = TransLOB(n_classes = 3)
model = model.to(device)

# experiment name
experiment_name = "translob-s10"

# loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for label in ["label_1", "label_2", "label_3", "label_5", "label_10"]:
    new_experiment_name = experiment_name + f"/{label}"
    Path(
        f"results/{new_experiment_name}").mkdir(parents=True, exist_ok=True)

    # get data loaders
    train_loader, val_loader, test_loader = get_data_loaders(device=device, window_size=100, batch_size=64,
                                                              flatten=False,use_custom_cols=False)

    # training and evaluation
    train_and_evaluate_model(train_loader, val_loader,
                              test_loader, new_experiment_name, optimizer, criterion, epochs)

Using cuda device
Epoch 0 train loss 0.9520773955553126
Epoch 0 train accuracy 0.5939498278406297
Epoch 0 val loss 1.0297817424871027
Epoch 0 val accuracy 0.40304930529939753
Epoch 1 train loss 0.9456025285898819
Epoch 1 train accuracy 0.5962862764387604
Epoch 1 val loss 0.960586444940418
Epoch 1 val accuracy 0.5855157998278618


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 0 train loss 0.9461910195341279
Epoch 0 train accuracy 0.5943187407771766
Epoch 0 val loss 1.0011821035295725
Epoch 0 val accuracy 0.5911717693348087
Epoch 1 train loss 0.9449394047845792
Epoch 1 train accuracy 0.5938883423512051
Epoch 1 val loss 0.9729921412654221
Epoch 1 val accuracy 0.5911717693348087


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 0 train loss 0.9454477850955221
Epoch 0 train accuracy 0.5945031972454501
Epoch 0 val loss 0.9809569735080004
Epoch 0 val accuracy 0.5976884298536825
Epoch 1 train loss 0.9476392106363488
Epoch 1 train accuracy 0.5925664043285784
Epoch 1 val loss 0.9551817998290062
Epoch 1 val accuracy 0.5976884298536825


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 0 train loss 0.9457248271331338
Epoch 0 train accuracy 0.5929660600098376
Epoch 0 val loss 0.9425955102778971
Epoch 0 val accuracy 0.6043280462314029
Epoch 1 train loss 0.943529123875856
Epoch 1 train accuracy 0.5949028529267093
Epoch 1 val loss 1.1585507094860077
Epoch 1 val accuracy 0.19881962375507192


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 0 train loss 0.9419318106179153
Epoch 0 train accuracy 0.5968703885882931
Epoch 0 val loss 0.9445130885578692
Epoch 0 val accuracy 0.5959670478298291
Epoch 1 train loss 0.9430100647311782
Epoch 1 train accuracy 0.5946876537137236
Epoch 1 val loss 1.0299532958306372
Epoch 1 val accuracy 0.4817410549612689


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
