In [1]:
import pandas as pd
import os
import zipfile
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from numba import njit
from scipy.optimize import curve_fit
import scipy.stats

In [2]:
# List the daily order-book files (sorted by file name).
DIR = "../data/tesla_2015/order/"
tesla_files = sorted(os.listdir(DIR))
# Column names for the order-book files: five levels, each with
# ask price, ask size, bid price and bid size (20 columns in total).
header_list = [f"{field}_{level}"
               for level in range(1, 6)
               for field in ("AskPrice", "AskSize", "BidPrice", "BidSize")]

# List the daily message files (sorted by file name).
DIR_1 = "../data/tesla_2015/message/"
tesla_files_1 = sorted(os.listdir(DIR_1))
header_list_1 = ["time", "event type", "order ID", "size", "price", "direction"]

# Message and order files are paired purely by their sorted position, so a
# missing or stray file in either folder would silently pair the wrong days
# (zip just stops at the shorter list). Fail loudly instead.
if len(tesla_files) != len(tesla_files_1):
    raise ValueError(
        f"Found {len(tesla_files_1)} message files but {len(tesla_files)} "
        "order files; the two folders must contain one file per day each."
    )

# Keep only events strictly inside this time window.
# NOTE(review): presumably `time` is seconds after midnight, which would make
# the window 10:30-15:30 (one hour after 34200, half an hour before 57600)
# -- confirm against the data documentation.
WINDOW_START = 34200 + 3600
WINDOW_END = 57600 - 1800

lst_message = []
lst_order = []

# Build, for every day, the messages inside the time window and the
# order-book rows that carry the same row labels.
for message, order in zip(tesla_files_1, tesla_files):

    # Load the message file and keep the rows inside the window.
    df_m = pd.read_csv(DIR_1 + message, names=header_list_1)
    in_window = (df_m["time"] > WINDOW_START) & (df_m["time"] < WINDOW_END)
    df_m = df_m.loc[in_window]

    # Load the order-book file and keep the rows with the same labels as the
    # retained messages.
    # NOTE(review): this assumes row i of the order file corresponds to
    # row i of the message file -- confirm.
    df_o = pd.read_csv(DIR + order, names=header_list, usecols=np.arange(20))
    df_o = df_o.loc[df_m.index]

    # Prices are left in the raw units of the file (no rescaling is applied).
    # Add the best-level spread and mid price.
    df_o["Spread"] = df_o["AskPrice_1"] - df_o["BidPrice_1"]
    df_o["MidPrice"] = (df_o["AskPrice_1"] + df_o["BidPrice_1"]) / 2

    lst_message.append(df_m)
    lst_order.append(df_o)

# Merge the daily data into two frames with a fresh 0..n-1 index.
df_message = pd.concat(lst_message, ignore_index=True)
df_order = pd.concat(lst_order, ignore_index=True)


In [3]:
# Label every row with the direction of the NEXT mid-price change:
# +1 if the mid price rises on the following row, -1 if it falls, 0 if it is
# unchanged. The last row has no successor and is labelled 0.
#
# The label is assigned in a single statement: the previous chained form
# `df_order["Class"].loc[mask] = value` raises SettingWithCopyWarning and is
# not guaranteed to write back into `df_order`.
#
# NOTE(review): `df_order` is the concatenation of all days, so the last row
# of each day is labelled using the first row of the following day.
next_move = df_order["MidPrice"].diff().shift(-1).fillna(0)
df_order["Class"] = np.sign(next_move)
df_order.Class.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


 0.0    3620847
-1.0     306729
 1.0     295836
Name: Class, dtype: int64

In [4]:
# Spread and MidPrice were only needed to derive the label; remove them
# from the frame, one after the other.
for helper_column in ("Spread", "MidPrice"):
    df_order.drop(columns=helper_column, inplace=True)

In [5]:
# Preview the first rows of the order-book frame (the 20 book columns plus
# the Class label added above).
df_order.head()

Unnamed: 0,AskPrice_1,AskSize_1,BidPrice_1,BidSize_1,AskPrice_2,AskSize_2,BidPrice_2,BidSize_2,AskPrice_3,AskSize_3,...,BidSize_3,AskPrice_4,AskSize_4,BidPrice_4,BidSize_4,AskPrice_5,AskSize_5,BidPrice_5,BidSize_5,Class
0,2165800,100,2163900,100,2166100,201,2163400,200,2166400,100,...,100,2166600,50,2162300,100,2166900,100,2162200,100,1.0
1,2166100,201,2163900,100,2166400,100,2163400,200,2166600,50,...,100,2166900,100,2162300,100,2168700,100,2162200,100,0.0
2,2166100,101,2163900,100,2166400,100,2163400,200,2166600,50,...,100,2166900,100,2162300,100,2168700,100,2162200,100,0.0
3,2166100,101,2163900,100,2166400,100,2163400,200,2166600,50,...,100,2168700,100,2162300,100,2169400,200,2162200,100,0.0
4,2166100,101,2163900,100,2166400,100,2163400,200,2166600,50,...,100,2167300,100,2162300,100,2168700,100,2162200,100,-1.0


In [6]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, GRU
from keras.layers import *
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam, SGD

In [11]:
# Split the data by row position, without shuffling:
# rows [0, 10_000) for training, [10_000, 11_000) for validation and
# [11_000, 13_000) for testing. Each split is a plain NumPy array.
train = df_order.iloc[:10_000].values
val = df_order.iloc[10_000:11_000].values
test = df_order.iloc[11_000:13_000].values

In [12]:
window = 10

# Build the supervised samples with a sliding window over `train`:
#   X_train[k] = the first four columns of rows k .. k+window-1
#   Y_train[k] = the last column of row k+window, i.e. the row that
#                immediately follows the window
n_samples = train.shape[0] - window

X_train = np.stack([
    train[start:start + window, 0:4].reshape(window, 4)
    for start in range(n_samples)
])
Y_train = np.stack([
    train[start + window, -1]
    for start in range(n_samples)
])

In [13]:
# Stacked recurrent model: four LSTM layers of 50 units each, followed by a
# single-unit Dense output layer.
# The first three LSTM layers return the full sequence so that the next LSTM
# layer receives one vector per time step; the last one returns only its
# final output. No activation is passed (Keras defaults apply) and no
# Dropout layer is added.
LSTM_model = Sequential([
    LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], 4)),
    LSTM(units=50, return_sequences=True),
    LSTM(units=50, return_sequences=True),
    LSTM(units=50),
    # Output layer
    Dense(units=1),
])
LSTM_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 10, 50)            11000     
                                                                 
 lstm_5 (LSTM)               (None, 10, 50)            20200     
                                                                 
 lstm_6 (LSTM)               (None, 10, 50)            20200     
                                                                 
 lstm_7 (LSTM)               (None, 50)                20200     
                                                                 
 dense_1 (Dense)             (None, 1)                 51        
                                                                 
Total params: 71,651
Trainable params: 71,651
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Train with the Adam optimizer on a mean-squared-error loss for 20 epochs,
# using mini-batches of 32 samples. No validation data is passed to fit().
# NOTE(review): the targets are the labels -1 / 0 / +1 regressed with MSE, so
# the "accuracy" metric may not be meaningful here -- confirm.
LSTM_model.compile(
    loss="mse",
    optimizer="adam",
    metrics=["accuracy"],
)
history = LSTM_model.fit(
    x=X_train,
    y=Y_train,
    batch_size=32,
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [30]:
# Per-epoch training loss recorded by fit(), one value per epoch.
history.history["loss"]

[0.17959517240524292,
 0.17962290346622467,
 0.17961610853672028,
 0.17959047853946686,
 0.179641455411911,
 0.17963169515132904,
 0.17961271107196808,
 0.17962554097175598,
 0.17965514957904816,
 0.17960664629936218,
 0.17960990965366364,
 0.1796056628227234,
 0.17966783046722412,
 0.17959736287593842,
 0.1796322911977768,
 0.17961034178733826,
 0.1796267181634903,
 0.1796322762966156,
 0.1795978844165802,
 0.1796339899301529]