In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, roc_auc_score, log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import math
import matplotlib.pyplot as plt

In [2]:
messages = pd.read_csv("AMZN_2012-06-21_34200000_57600000_message_10.csv", header= None)
orderbook = pd.read_csv("AMZN_2012-06-21_34200000_57600000_orderbook_10.csv", header = None)

In [3]:
ask_p_cols = [4*i for i in range(10)] # 0,4,8,…,36
ask_v_cols = [4*i + 1 for i in range(10)] # 1,5,9,…,37
bid_p_cols = [4*i + 2 for i in range(10)] # 2,6,10,…,38
bid_v_cols = [4*i + 3 for i in range(10)] # 3,7,11,…,39
orderbook[ask_p_cols] = orderbook[ask_p_cols] /1e4
orderbook[ask_v_cols] = orderbook[ask_v_cols] /1e3
orderbook[bid_p_cols] = orderbook[bid_p_cols] /1e4
orderbook[bid_v_cols] = orderbook[bid_v_cols] /1e3

In [4]:
best_ask = orderbook[0]
best_bid = orderbook[2]
mid = 0.5 * (best_ask + best_bid)

In [5]:
horizon = 10
future_mid = mid.shift(-horizon)

In [6]:
y_raw = (future_mid < mid).astype(int).iloc[:-horizon].values
X_raw = orderbook.iloc[:-horizon].values

In [7]:
messages

Unnamed: 0,0,1,2,3,4,5
0,34200.017460,5,0,1,2238200,-1
1,34200.189608,1,11885113,21,2238100,1
2,34200.189608,1,3911376,20,2239600,-1
3,34200.189608,1,11534792,100,2237500,1
4,34200.189608,1,1365373,13,2240000,-1
...,...,...,...,...,...,...
269743,57599.872741,3,286560364,100,2207600,-1
269744,57599.903989,3,287142900,100,2206200,-1
269745,57599.955242,3,286967592,170,2206900,-1
269746,57599.958245,1,287174077,100,2206300,-1


In [8]:
def create_sequences(data, labels, window, step):
    X_seq, y_seq = [], []
    end = len(data) - window
    for i in range(0, end, step):
        X_seq.append(data[i:i+window])
        y_seq.append(labels[i+window-1])
    return np.array(X_seq), np.array(y_seq)

In [9]:
X_cnn, y_cnn = create_sequences(X_raw, y_raw, window=50, step=1)

In [10]:
split_idx = int(len(X_cnn) * 0.8)
X_train, X_test = X_cnn[:split_idx], X_cnn[split_idx:]
y_train, y_test = y_cnn[:split_idx], y_cnn[split_idx:]

In [11]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_flat = X_train.reshape(-1, 40)
X_test_flat = X_test.reshape(-1, 40)
scaler.fit(X_train_flat)
X_train = scaler.transform(X_train_flat).reshape(X_train.shape)
X_test = scaler.transform(X_test_flat).reshape(X_test.shape)

In [12]:
class LOBDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32).permute(0, 2, 1)
        self.y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)
        
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    
train_loader = DataLoader(LOBDataset(X_train, y_train), batch_size=64, shuffle=True)
test_loader = DataLoader(LOBDataset(X_test, y_test), batch_size=64)

In [13]:
class LOB_CNN(nn.Module):
    def __init__(self, num_features):
        super().__init__()
        self.conv1 = nn.Conv1d(num_features, 64, kernel_size=3)
        self.bn1 = nn.BatchNorm1d(64)
        self.pool1 = nn.MaxPool1d(2)
        self.dropout1 = nn.Dropout(0.3)

        self.conv2 = nn.Conv1d(64, 128, kernel_size=3)
        self.bn2 = nn.BatchNorm1d(128)
        self.pool2 = nn.MaxPool1d(2)
        self.dropout2 = nn.Dropout(0.3)

        dummy = torch.zeros(1, num_features, 50)
        x = self.pool1(F.relu(self.bn1(self.conv1(dummy))))
        x = self.pool2(F.relu(self.bn2(self.conv2(x))))
        flatten_size = x.shape[1] * x.shape[2]

        self.fc1 = nn.Linear(flatten_size, 64)
        self.dropout3 = nn.Dropout(0.3)
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        x = self.pool1(F.relu(self.bn1(self.conv1(x))))
        x = self.dropout1(x)
        x = self.pool2(F.relu(self.bn2(self.conv2(x))))
        x = self.dropout2(x)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.dropout3(x)
        return self.fc2(x)

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LOB_CNN(num_features=40).to(device)

pos_weight = torch.tensor([np.bincount(y_train)[0] / np.bincount(y_train)[1]], dtype=torch.float32).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(101):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")

Epoch 1, Loss: 1.0264
Epoch 2, Loss: 1.0186
Epoch 3, Loss: 1.0132
Epoch 4, Loss: 1.0065
Epoch 5, Loss: 1.0020
Epoch 6, Loss: 0.9968
Epoch 7, Loss: 0.9908
Epoch 8, Loss: 0.9862
Epoch 9, Loss: 0.9803
Epoch 10, Loss: 0.9748
Epoch 11, Loss: 0.9690
Epoch 12, Loss: 0.9629
Epoch 13, Loss: 0.9582
Epoch 14, Loss: 0.9520
Epoch 15, Loss: 0.9481
Epoch 16, Loss: 0.9425
Epoch 17, Loss: 0.9404
Epoch 18, Loss: 0.9360
Epoch 19, Loss: 0.9311
Epoch 20, Loss: 0.9297
Epoch 21, Loss: 0.9249
Epoch 22, Loss: 0.9225
Epoch 23, Loss: 0.9186
Epoch 24, Loss: 0.9164
Epoch 25, Loss: 0.9122
Epoch 26, Loss: 0.9106
Epoch 27, Loss: 0.9075
Epoch 28, Loss: 0.9032
Epoch 29, Loss: 0.9005
Epoch 30, Loss: 0.8943
Epoch 31, Loss: 0.8920
Epoch 32, Loss: 0.8892
Epoch 33, Loss: 0.8869
Epoch 34, Loss: 0.8846
Epoch 35, Loss: 0.8807
Epoch 36, Loss: 0.8791
Epoch 37, Loss: 0.8765
Epoch 38, Loss: 0.8747
Epoch 39, Loss: 0.8714
Epoch 40, Loss: 0.8694
Epoch 41, Loss: 0.8678
Epoch 42, Loss: 0.8647
Epoch 43, Loss: 0.8611
Epoch 44, Loss: 0.86

In [15]:
pos_weight

tensor([2.8992], device='cuda:0')

In [16]:
from sklearn.metrics import log_loss, roc_auc_score

In [17]:
print("Train class balance:", np.bincount(y_train))
print("Test  class balance:", np.bincount(y_test))

model.eval()
y_prob, y_true = [], []
with torch.no_grad():
    for xb, yb in test_loader:
        y_prob.append(torch.sigmoid(model(xb.to(device))).cpu())
        y_true.append(yb)
y_prob = torch.cat(y_prob).numpy().ravel()
y_true = torch.cat(y_true).numpy().ravel()

from sklearn.metrics import roc_auc_score
print("ROC‑AUC:", roc_auc_score(y_true, y_prob))

Train class balance: [160418  55332]
Test  class balance: [40627 13311]
ROC‑AUC: 0.5291486902165479


In [18]:
preds = (y_prob > 0.5).astype(int)
print(confusion_matrix(y_true, preds))
print(classification_report(y_true, preds, digits = 3))

[[29730 10897]
 [ 9316  3995]]
              precision    recall  f1-score   support

         0.0      0.761     0.732     0.746     40627
         1.0      0.268     0.300     0.283     13311

    accuracy                          0.625     53938
   macro avg      0.515     0.516     0.515     53938
weighted avg      0.640     0.625     0.632     53938

