# Multi-Horizon Forecasting for Limit Order Books: Novel Deep Learning Approaches and Hardware Acceleration using Intelligent Processing Units
### Authors: Zihao Zhang and Stefan Zohren
### Oxford-Man Institute of Quantitative Finance, Department of Engineering Science, University of Oxford

This jupyter notebook is used to demonstrate our recent paper [2]. We use FI-2010 [1] dataset and present how model architecture is constructed here. The FI-2010 is publicly avilable and interested readers can check out their paper [1]. 

### Data:
The FI-2010 is publicly avilable and interested readers can check out their paper [1]. The dataset can be downloaded from: https://etsin.fairdata.fi/dataset/73eb48d7-4dbc-4a10-a52a-da745b47a649 

Otherwise, the notebook will download the data automatically or it can be obtained from: 

https://drive.google.com/drive/folders/1Xen3aRid9ZZhFqJRgEMyETNazk02cNmv?usp=sharing.

### References:
[1] Ntakaris A, Magris M, Kanniainen J, Gabbouj M, Iosifidis A. Benchmark dataset for mid‐price forecasting of limit order book data with machine learning methods. Journal of Forecasting. 2018 Dec;37(8):852-66. https://arxiv.org/abs/1705.03233

[2] Zhang Z, Zohren S. Multi-Horizon Forecasting for Limit Order Books: Novel Deep Learning Approaches and Hardware Acceleration using Intelligent Processing Units. https://arxiv.org/abs/2105.10430

#### This notebook demonstrates how to train DeepLOB-Seq2Seq by using tensorflow 2 on IPUs.

#### For more information about IPU, please check https://www.graphcore.ai/


In [None]:
# obtain data
import os 
if not os.path.isfile('data.zip'):
    !wget https://raw.githubusercontent.com/zcakhaa/DeepLOB-Deep-Convolutional-Neural-Networks-for-Limit-Order-Books/master/data/data.zip
    !unzip -n data.zip
    print('data downloaded.')
else:
    print('data already existed.')

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import logging
import glob
import argparse
import sys
import time
import tensorflow as tf
from tensorflow.python import ipu
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

# load my packages
from preprocess import *
from model import get_model_seq, get_model_attention

In [2]:
# please change the data_path to your local path
# data_path = '/home/zihaoz/deeplob/data'

T = 50 # lookback window size
epochs = 150 # number of training epochs
batch_size = 16 # gradient descent batch size
n_hidden = 64 # hidden state for decoder
SHUFFLE=True # shuffle the traning data
saved_model_path = './model_deeplob_seq/deeplob_seq' # saved model path

In [3]:
# load data
dec_train = np.loadtxt('Train_Dst_NoAuction_DecPre_CF_7.txt')
dec_test1 = np.loadtxt('Test_Dst_NoAuction_DecPre_CF_7.txt')
dec_test2 = np.loadtxt('Test_Dst_NoAuction_DecPre_CF_8.txt')
dec_test3 = np.loadtxt('Test_Dst_NoAuction_DecPre_CF_9.txt')
dec_test = np.hstack((dec_test1, dec_test2, dec_test3))

# extract limit order book data from the FI-2010 dataset
train_lob = prepare_x(dec_train)
test_lob = prepare_x(dec_test)

# extract label from the FI-2010 dataset
train_label = get_label(dec_train)
test_label = get_label(dec_test)

# prepare training data. We feed past T observations into our algorithms.
train_encoder_input, train_decoder_target = data_classification(train_lob, train_label, T)
train_decoder_input = prepare_decoder_input(train_encoder_input, teacher_forcing=False)

test_encoder_input, test_decoder_target = data_classification(test_lob, test_label, T)
test_decoder_input = prepare_decoder_input(test_encoder_input, teacher_forcing=False)

print(f'train_encoder_input.shape = {train_encoder_input.shape},'
      f'train_decoder_target.shape = {train_decoder_target.shape}')
print(f'test_encoder_input.shape = {test_encoder_input.shape},'
      f'test_decoder_target.shape = {test_decoder_target.shape}')


train_encoder_input.shape = (254701, 50, 40, 1),train_decoder_target.shape = (254701, 5, 3)
test_encoder_input.shape = (139538, 50, 40, 1),test_decoder_target.shape = (139538, 5, 3)


In [4]:
# Configure the IPU system
cfg = ipu.utils.create_ipu_config()
cfg = ipu.utils.auto_select_ipus(cfg, 1)
ipu.utils.configure_ipu_system(cfg)

In [5]:
strategy = ipu.ipu_strategy.IPUStrategy()
all_results = [[1000, 0]]
split_train_val = int(np.floor(len(train_encoder_input) * 0.8))

with strategy.scope():
    # Create an instance of the model
    model = get_model_seq(n_hidden)

    # Get the dataset
    train_ds = create_dataset(train_encoder_input[:split_train_val], train_decoder_input[:split_train_val], 
                              train_decoder_target[:split_train_val], batch_size, method='train', shuffle=SHUFFLE)
    val_ds = create_dataset(train_encoder_input[split_train_val:], train_decoder_input[split_train_val:], 
                            train_decoder_target[split_train_val:], batch_size, method='val')
    test_ds = create_dataset(test_encoder_input, test_decoder_input, 
                             test_decoder_target, batch_size, method='prediction')

    # Train the model
    adam = keras.optimizers.Adam(lr=0.00001, beta_1=0.9, beta_2=0.999)
    model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer=adam)
    epoch_ = 0
    epochs_per_fit = 5
    
    while epoch_ < epochs:
        
        model.fit(train_ds, steps_per_epoch=len(train_encoder_input) // batch_size,
                  initial_epoch=epoch_, epochs=epoch_ + epochs_per_fit)
        epoch_ = epoch_ + epochs_per_fit
        result = model.evaluate(val_ds)
        all_results.append(result)
        print(f'Epoch = {epoch_},' f'Validation Results = {result}')

        if all_results[-1][0] < all_results[-2][0]:
            model.save_weights(saved_model_path)

    model.load_weights(saved_model_path)
    pred = model.predict(test_ds)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch = 5,Validation Results = [1.0359435021652577, 0.47807884]
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch = 10,Validation Results = [1.0029647791260188, 0.48766887]
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch = 15,Validation Results = [0.9052797757730501, 0.5411679]
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch = 20,Validation Results = [0.9296963038780275, 0.5376767]
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch = 25,Validation Results = [0.8765886861279705, 0.55609095]
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch = 30,Validation Results = [0.9112871621392767, 0.54719603]
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35
Epoch = 35,Validation Results = [0.8559668276454883, 0.5727773]
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Epoch = 40,Validation Results = [0.8471099839110116, 0.5897149]
Epoch 41/45
Epoch 4

Epoch = 70,Validation Results = [0.845632915088671, 0.63105166]
Epoch 71/75
Epoch 72/75
Epoch 73/75
Epoch 74/75
Epoch 75/75
Epoch = 75,Validation Results = [0.7968672948113771, 0.6454642]
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80
Epoch = 80,Validation Results = [0.7814938865717298, 0.6561145]
Epoch 81/85
Epoch 82/85
Epoch 83/85
Epoch 84/85
Epoch 85/85
Epoch = 85,Validation Results = [0.7749453322488511, 0.65838045]
Epoch 86/90
Epoch 87/90
Epoch 88/90
Epoch 89/90
Epoch 90/90
Epoch = 90,Validation Results = [0.7742200728609427, 0.66035974]
Epoch 91/95
Epoch 92/95
Epoch 93/95
Epoch 94/95
Epoch 95/95
Epoch = 95,Validation Results = [0.7776556417379357, 0.6615614]
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Epoch = 100,Validation Results = [0.770497651477533, 0.664656]
Epoch 101/105
Epoch 102/105
Epoch 103/105
Epoch 104/105
Epoch 105/105
Epoch = 105,Validation Results = [0.7749558086253929, 0.663525]
Epoch 106/110
Epoch 107/110
Epoch 108/110
Epoch 109

Epoch 139/140
Epoch 140/140
Epoch = 140,Validation Results = [0.7885088693981676, 0.664923]
Epoch 141/145
Epoch 142/145
Epoch 143/145
Epoch 144/145
Epoch 145/145
Epoch = 145,Validation Results = [0.7770631442907431, 0.67090404]
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150
Epoch = 150,Validation Results = [0.773415037257608, 0.67012644]


In [6]:
evaluation_metrics(test_decoder_target, pred)

Prediction horizon = 0
accuracy_score = 0.8211357642472193
classification_report =               precision    recall  f1-score   support

           0     0.7356    0.5023    0.5970     21147
           1     0.8382    0.9574    0.8939     98622
           2     0.7658    0.4822    0.5918     19767

    accuracy                         0.8211    139536
   macro avg     0.7799    0.6473    0.6942    139536
weighted avg     0.8124    0.8211    0.8061    139536

-------------------------------
Prediction horizon = 1
accuracy_score = 0.7355377823644077
classification_report =               precision    recall  f1-score   support

           0     0.6543    0.4530    0.5354     27448
           1     0.7649    0.9120    0.8320     86603
           2     0.6494    0.4403    0.5248     25485

    accuracy                         0.7355    139536
   macro avg     0.6895    0.6018    0.6307    139536
weighted avg     0.7221    0.7355    0.7175    139536

-------------------------------
Predicti