In [1]:
import os
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import kagglehub
import shutil

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import DataLoader
from torchinfo import summary

from models.transLOB_v2 import TransLOB
from utils.preprocessing_v2 import create_windows, generate_labels, normalize_features, add_features
from utils.training import train, validate
from utils.loader import LOBDataset

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
from utils.fi2010_loader import Dataset_fi2010



# dataset_train, dataset_val = __get_dataset__(model_id, dataset_type, normalization, lighten, T, k, stock, train_test_ratio)

#     dataset_type = 'fi2010'
#     normalization = 'Zscore'
#     lighten = True
#     model_type = 'lobster'

#     T = 100
#     k = 4
#     stock = [0, 1, 2, 3, 4]
#     train_test_ratio = 0.7

#     # generate model id
#     model_id = logger.generate_id(model_type)
#     print(f"Model ID: {model_id}")

#     train.train(model_id=model_id, dataset_type=dataset_type, normalization=normalization,
#                 lighten=True,T=T, k=k, stock=stock, train_test_ratio=train_test_ratio,
#                 model_type=model_type)


dataset_type='fi2010'
normalization = 'DecPre'
lighten = True

T = 100
k = 4
stock = [0, 1, 2, 3, 4]
train_test_ratio = 0.8
auction = False
days = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

train_day_length = round(len(days) * train_test_ratio)
train_days = days[:train_day_length]
test_days = days[train_day_length:]

dataset_train_val = Dataset_fi2010(auction, normalization, stock, train_days, T, k, lighten)
dataset_test = Dataset_fi2010(auction, normalization, stock, test_days, T, k, lighten)

data FI-2010 NoAuction 3.NoAuction_DecPre NoAuction_DecPre_Training Train_Dst_NoAuction_DecPre_CF_1.txt
data FI-2010 NoAuction 3.NoAuction_DecPre NoAuction_DecPre_Testing Test_Dst_NoAuction_DecPre_CF_1.txt
data FI-2010 NoAuction 3.NoAuction_DecPre NoAuction_DecPre_Testing Test_Dst_NoAuction_DecPre_CF_2.txt
data FI-2010 NoAuction 3.NoAuction_DecPre NoAuction_DecPre_Testing Test_Dst_NoAuction_DecPre_CF_3.txt
data FI-2010 NoAuction 3.NoAuction_DecPre NoAuction_DecPre_Testing Test_Dst_NoAuction_DecPre_CF_4.txt
data FI-2010 NoAuction 3.NoAuction_DecPre NoAuction_DecPre_Testing Test_Dst_NoAuction_DecPre_CF_5.txt
data FI-2010 NoAuction 3.NoAuction_DecPre NoAuction_DecPre_Testing Test_Dst_NoAuction_DecPre_CF_6.txt
data FI-2010 NoAuction 3.NoAuction_DecPre NoAuction_DecPre_Testing Test_Dst_NoAuction_DecPre_CF_7.txt
data FI-2010 NoAuction 3.NoAuction_DecPre NoAuction_DecPre_Training Train_Dst_NoAuction_DecPre_CF_1.txt
data FI-2010 NoAuction 3.NoAuction_DecPre NoAuction_DecPre_Testing Test_Dst_No

In [17]:
dataset_train = dataset_train_val
dataset_val = dataset_test

print(f"Training Data Size : {dataset_train.__len__()}")
print(f"Validation Data Size : {dataset_val.__len__()}")


Training Data Size : 306268
Validation Data Size : 83119


In [18]:
if lighten:
    feature_size = 20
else:
    feature_size = 40

In [19]:
from torch.utils.data import DataLoader

batch_size= 128
learning_rate= 0.0001
epsilon= 1e-08
epoch= 30
num_workers= 4

train_loader = DataLoader(dataset=dataset_train, batch_size=batch_size, shuffle=True, num_workers=num_workers)
val_loader = DataLoader(dataset=dataset_val, batch_size=batch_size, shuffle=False, num_workers=num_workers)


In [20]:
from models.deepLOB.deepLOB_model import Deeplob
from torchinfo import summary

model = Deeplob(lighten=lighten)
model.to(model.device)
summary(model, (1, 1, 100, feature_size))

Layer (type:depth-idx)                   Output Shape              Param #
Deeplob                                  [1, 3]                    --
├─Sequential: 1-1                        [1, 32, 94, 10]           --
│    └─Conv2d: 2-1                       [1, 32, 100, 10]          96
│    └─LeakyReLU: 2-2                    [1, 32, 100, 10]          --
│    └─BatchNorm2d: 2-3                  [1, 32, 100, 10]          64
│    └─Conv2d: 2-4                       [1, 32, 97, 10]           4,128
│    └─LeakyReLU: 2-5                    [1, 32, 97, 10]           --
│    └─BatchNorm2d: 2-6                  [1, 32, 97, 10]           64
│    └─Conv2d: 2-7                       [1, 32, 94, 10]           4,128
│    └─LeakyReLU: 2-8                    [1, 32, 94, 10]           --
│    └─BatchNorm2d: 2-9                  [1, 32, 94, 10]           64
├─Sequential: 1-2                        [1, 32, 88, 5]            --
│    └─Conv2d: 2-10                      [1, 32, 94, 5]            2,080
│    └

In [21]:
from torch import nn

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [22]:
from tqdm import tqdm
from datetime import datetime

def batch_gd(model_id, model, criterion, optimizer, train_loader, val_loader, epochs, name):
    training_info = {
        'train_loss_hist': [],
        'val_loss_hist': [],
        'train_acc_hist': [],
        'val_acc_hist': []
    }

    best_test_loss = np.inf
    best_test_epoch = 0

    for iter in tqdm(range(epochs)):
        model.train()
        t0 = datetime.now()
        train_loss = []
        train_acc = []
        for inputs, targets in tqdm(train_loader):
            # move data to GPU
            inputs, targets = inputs.to(model.device, dtype=torch.float), targets.to(model.device, dtype=torch.int64)
            # zero the parameter gradients
            optimizer.zero_grad()
            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            # Backward and optimize
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())
            tmp_acc = torch.count_nonzero(torch.argmax(outputs, dim = 1) == targets).item()/targets.size(0)
            train_acc.append(tmp_acc)
        # Get train loss and test loss
        train_loss = np.mean(train_loss)
        train_acc = np.mean(train_acc)

        model.eval()
        val_loss = []
        val_acc = []
        for inputs, targets in tqdm(val_loader):
            inputs, targets = inputs.to(model.device, dtype=torch.float), targets.to(model.device, dtype=torch.int64)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            val_loss.append(loss.item())
            tmp_acc = torch.count_nonzero(torch.argmax(outputs, dim=1) == targets).item() / targets.size(0)
            val_acc.append(tmp_acc)
        val_loss = np.mean(val_loss)
        val_acc = np.mean(val_acc)

        # Save losses
        training_info['train_loss_hist'].append(train_loss)
        training_info['val_loss_hist'].append(val_loss)
        training_info['train_acc_hist'].append(train_acc)
        training_info['val_acc_hist'].append(val_acc)

        if val_loss < best_test_loss:
            torch.save(model, os.path.join(logger.find_save_path(model_id), 'best_val_model.pt'))
            best_test_loss = val_loss
            best_test_epoch = iter
            print('model saved')

        dt = datetime.now() - t0
        print(f'Epoch {iter + 1}/{epochs}, '
              f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc: .4f}, '
              f'Validation Loss: {val_loss:.4f}, Validation Acc: {val_acc: .4f}, '
              f'Duration: {dt}, Best Val Epoch: {best_test_epoch}')

    torch.save({
        'epoch': epochs,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': train_loss,
    }, os.path.join(logger.find_save_path(model_id), 'checkpoint.pt'))

    with open(os.path.join(logger.find_save_path(model_id), 'training_process.pkl'), 'wb') as f:
        pickle.dump(training_info, f)

    return


In [23]:
model_id = 'test_2'
batch_gd(model_id = model_id, model = model, criterion = criterion, optimizer = optimizer,
         train_loader = train_loader, val_loader = val_loader, epochs=epoch, name = model.name)

  0%|                                                                                                        | 0/30 [00:00<?, ?it/s]
  0%|                                                                                                      | 0/2393 [00:00<?, ?it/s][A
  0%|                                                                                              | 1/2393 [00:00<13:50,  2.88it/s][A
  0%|                                                                                              | 3/2393 [00:00<05:17,  7.52it/s][A
  0%|▏                                                                                             | 6/2393 [00:00<03:13, 12.37it/s][A
  0%|▎                                                                                             | 9/2393 [00:00<02:28, 16.01it/s][A
  1%|▍                                                                                            | 12/2393 [00:00<02:07, 18.66it/s][A
  1%|▌                                             

NameError: name 'logger' is not defined

In [24]:
from models.deepLOB.evaluate import evaluate

evaluate.test(model_id=model_id, model_type=model_type)
# classification_report.report(model_id=model_id)
# training_vis.vis_training_process(model_id=model_id)

ModuleNotFoundError: No module named 'loaders'

In [29]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

test_loader = DataLoader(dataset=dataset_test, batch_size=batch_size, shuffle=False, num_workers=num_workers)

all_midprices = dataset_test.get_midprice()
all_targets = []
all_predictions = []

for inputs, targets in tqdm(test_loader):
    # Move to GPU
    model.eval()
    inputs, targets = inputs.to(device, dtype=torch.float), targets.to(device, dtype=torch.int64)

    # Forward pass
    outputs = model(inputs)

    # Get prediction
    max_output, predictions = torch.max(outputs, 1)

    # update counts
    all_targets.append(targets.cpu().numpy())
    all_predictions.append(predictions.cpu().numpy())

all_targets = np.concatenate(all_targets)
all_predictions = np.concatenate(all_predictions)

# with open(os.path.join(logger.find_save_path(model_id), 'prediction.pkl'), 'wb') as f:
#     pickle.dump([all_midprices, all_targets, all_predictions], f)

test_acc = accuracy_score(all_targets, all_predictions)
print(f"Test acc: {test_acc:.4f}")
print(classification_report(all_targets, all_predictions, digits=4))
print(confusion_matrix(all_targets, all_predictions))

100%|████████████████████████████████████████████████████████████████████████████████████████████| 650/650 [00:04<00:00, 140.34it/s]

Test acc: 0.6259
              precision    recall  f1-score   support

           0     0.6170    0.5906    0.6035     29147
           1     0.7812    0.6163    0.6890     26310
           2     0.5397    0.6724    0.5988     27662

    accuracy                         0.6259     83119
   macro avg     0.6460    0.6264    0.6304     83119
weighted avg     0.6432    0.6259    0.6290     83119

[[17213  2390  9544]
 [ 3775 16214  6321]
 [ 6909  2152 18601]]





In [None]:
# from utils.fi2010_loader import __vis_sample_lob__
# __vis_sample_lob__('DecPre')