In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import tensorflow as tf
import time
from pylab import *
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer
from skimage.util.shape import view_as_windows
from sklearn.metrics import roc_curve,roc_auc_score
from tcn import TCN, tcn_full_summary
from tensorflow.keras.layers import Dense, Conv2D, Layer, Lambda, Flatten
from tensorflow.keras import Input, Model
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import plot_model
from tensorflow.python.client import device_lib

module_path = os.path.abspath(os.path.join('..'))
project_path = os.path.abspath(os.path.join('../..'))
train_source = os.path.abspath(os.path.join('../../data/train_source/'))
test_source = os.path.abspath(os.path.join('../../data/test_source/'))
val_source = os.path.abspath(os.path.join('../../data/val_source/'))
train_path = os.path.abspath(os.path.join('../../data/train/'))
test_path = os.path.abspath(os.path.join('../../data/test/'))
val_path_bs = os.path.abspath(os.path.join('../../data/validation_bs/'))
val_path_bsu = os.path.abspath(os.path.join('../../data/validation_bsu/'))
dest_path_bsu = os.path.abspath(os.path.join('../../data/final_bsu/'))
dest_path_bs = os.path.abspath(os.path.join('../../data/final_bs/'))

if module_path not in sys.path:
    sys.path.append(module_path)
if project_path not in sys.path:
    sys.path.append(project_path)
    
np.set_printoptions(suppress=True)
np.set_printoptions(threshold=sys.maxsize)
#print(device_lib.list_local_devices())

In [4]:
# 1. VAL_BSU - DONE
# 2. VAL_BS 50 - DONE
# 3. VAL_BSU 50
# 4. VAL_BS 100
# 5. VAL_BSU 100
# 6. Train_BSU 50 - DONE
# 7. Train_BSU 100 - I think we got one day for train and one day for test
# 8. Train_BS = 100 - We may have to fun this over night

In [5]:
vol_imb_diff = 0.3

In [6]:
def convert_data_to_labels(data_source, frames):
    X = None
    Y = None
    W = None
    for subdir, dirs, files in os.walk(data_source):
        for file in files:
            data_path = os.path.join(subdir, file)
            print(data_path)
            npy = np.load(data_path)
            x, y, weight = retrieve_cleansed_data(npy, frames)
            if X is not None:
                X = np.append(X, x, axis=0)
            else:
                X = x
                
            if Y is not None:
                Y = Y.append(y)
            else:
                Y = y
                
            if W is not None:
                W = W.append(weight)
            else:
                W = weight
            print(X.shape)
    return X, Y, W

In [7]:
def retrieve_cleansed_data(lob, width, filtered=False):
    min_max_scaler = MinMaxScaler(feature_range=(0,50))
    quantile_transformer = QuantileTransformer()
    
    # As evidenced by above, we can technically select all in the second axis as there is only 1 element. However, 
    # because we need a 2d input we make it 0. The 3rd axis is side so we need this
    lob_qty_buy = pd.DataFrame(lob['quantity'][:,0,0,0:20])
    lob_qty_buy = lob_qty_buy.replace(0, np.NaN)
    avg_buy = lob_qty_buy.mean().mean()
    vol_sum_buy = lob_qty_buy.sum(axis=1)
    
    lob_qty_sell = pd.DataFrame(lob['quantity'][:,0,1,0:20])
    lob_qty_sell = lob_qty_sell.replace(0, np.NaN)
    avg_sell = lob_qty_sell.mean().mean()
    vol_sum_sell = lob_qty_sell.sum(axis=1)

    vol_imbalance = (vol_sum_buy - vol_sum_sell) / (vol_sum_buy + vol_sum_sell)
    
    # Imbalance tips. The only way this can happen is if a new order comes in or deletes. 
    
    label_df = pd.concat([vol_imbalance, pd.Series(lob['action'].ravel()), pd.Series(lob['side'].ravel())], axis=1)
    label_df[3] = label_df[0].diff() # Change in the values of the two states of OB (We take diff of prev row)
    label_df[4] = 0
    label_df[4] = np.where(((label_df[1] == 2) & (label_df[2] == 'B') & (np.abs(label_df[3]) > vol_imb_diff)), 1, 
                  np.where(((label_df[1] == 2) & (label_df[2] == 'S') & (np.abs(label_df[3]) > vol_imb_diff)), 2, 
                             label_df[4].values))
    label_df = label_df.iloc[width-1:]
    Y_labels = label_df[4].reset_index(drop=True)
    X_feature = np.abs(label_df[3]).reset_index(drop=True)
   
    # Normalise positive samples
    # these array manipulations are to get a final array where b-s in same array group
    lob_n, d, w, h = lob['quantity'].shape
    b_qty = lob['quantity'][:,0,0,:]
    s_qty = lob['quantity'][:,0,1,:]
    lob_qty = np.stack((b_qty, s_qty), axis=2)

    lob_qty = lob_qty.reshape(-1,1)
    lob_qty = min_max_scaler.fit_transform(lob_qty)
    lob_qty = lob_qty.reshape(lob_n, h, w)
    
    b_price = lob['price'][:,0,0,:]
    s_price = lob['price'][:,0,1,:]
    lob_price = np.stack((b_price, s_price), axis=2)

    lob_price = lob_price.reshape(-1,1)
    lob_price = quantile_transformer.fit_transform(lob_price)
    lob_price = lob_price.reshape(lob_n, h, w)

    lob_states = np.dstack((lob_qty, lob_price))
    lob_states = lob_states.reshape(lob_n, h, w, 2)
    lob_states = view_as_windows(lob_states,(width,1,1,1))[...,0,0,0].transpose(0,4,1,2,3)
    
    if filtered:
        Y_labels = Y_labels[(Y_labels==1) | (Y_labels==2)]
        lob_states = lob_states[Y_labels.index]
        X_feature = X_feature[Y_labels.index]
        Y_labels = Y_labels.reset_index(drop=True)
        X_feature = X_feature.reset_index(drop=True)
        print(lob_states.shape)
    else:
        Y_labels = Y_labels[(Y_labels==0)].sample(2000, replace=True)
        lob_states = lob_states[Y_labels.index]
        X_feature = X_feature[Y_labels.index]
        Y_labels = Y_labels.reset_index(drop=True)
        X_feature = X_feature.reset_index(drop=True)
        print(lob_states.shape)
    return lob_states, Y_labels, X_feature

In [8]:
def save_training_data(path, train_data_source, test_data_source, frames):
    X_train, Y_train, F_train = convert_data_to_labels(train_data_source, frames)
    X_test, Y_test, F_test = convert_data_to_labels(test_data_source, frames)
    Y_train = Y_train.reset_index(drop=True)
    F_train = F_train.reset_index(drop=True)
    Y_test = Y_test.reset_index(drop=True)
    F_test = F_test.reset_index(drop=True)
    np.save(path + '/' + str(frames) + '_X_train.npy', X_train)
    np.save(path + '/' + str(frames) + '_Y_train.npy', Y_train)
    np.save(path + '/' + str(frames) + '_F_train.npy', F_train)
    np.save(path + '/' + str(frames) + '_X_test.npy', X_test)
    np.save(path + '/' + str(frames) + '_Y_test.npy', Y_test)
    np.save(path + '/' + str(frames) + '_F_test.npy', F_test)

In [9]:
save_training_data(dest_path_bsu, train_path + '/20160907/', train_path + '/20160901/', 50)

/rds/general/user/kk2219/home/LimitOrderBookPatterns/data/train/20160907/IBM_NASDAQ.npy
(2000, 50, 30, 2, 2)
(2000, 50, 30, 2, 2)
/rds/general/user/kk2219/home/LimitOrderBookPatterns/data/train/20160907/SPY_EDGX.npy
(2000, 50, 30, 2, 2)
(4000, 50, 30, 2, 2)
/rds/general/user/kk2219/home/LimitOrderBookPatterns/data/train/20160907/ABEO_EDGX.npy
(2000, 50, 30, 2, 2)
(6000, 50, 30, 2, 2)
/rds/general/user/kk2219/home/LimitOrderBookPatterns/data/train/20160907/GOOG_NASDAQ.npy
(2000, 50, 30, 2, 2)
(8000, 50, 30, 2, 2)
/rds/general/user/kk2219/home/LimitOrderBookPatterns/data/train/20160907/ABEO_BATS.npy
(2000, 50, 30, 2, 2)
(10000, 50, 30, 2, 2)
/rds/general/user/kk2219/home/LimitOrderBookPatterns/data/train/20160907/ABEO_ARCA.npy
(2000, 50, 30, 2, 2)
(12000, 50, 30, 2, 2)
/rds/general/user/kk2219/home/LimitOrderBookPatterns/data/train/20160907/IBM_EDGA.npy
(2000, 50, 30, 2, 2)
(14000, 50, 30, 2, 2)
/rds/general/user/kk2219/home/LimitOrderBookPatterns/data/train/20160907/SPY_ARCA.npy
(2000, 

In [None]:
save_training_data(dest_path_bs, train_path, test_path, 100)

### Validation Pipeline

In [10]:
def save_val_data(path, val_source, frames):
    X_val, Y_val, F_val = convert_data_to_labels(val_source, frames)
    Y_val = Y_val.reset_index(drop=True)
    F_val = F_val.reset_index(drop=True)
    np.save(path + '/' + str(frames) + '_X_val.npy', X_val)
    np.save(path + '/' + str(frames) + '_Y_val.npy', Y_val)
    np.save(path + '/' + str(frames) + '_F_val.npy', F_val)

In [11]:
save_val_data(val_path_bsu, val_source, 50)

/rds/general/user/kk2219/home/LimitOrderBookPatterns/data/val_source/bid/20160914/IBM_NASDAQ.npy
(2000, 50, 30, 2, 2)
(2000, 50, 30, 2, 2)
/rds/general/user/kk2219/home/LimitOrderBookPatterns/data/val_source/bid/20160914/SPY_EDGX.npy
(2000, 50, 30, 2, 2)
(4000, 50, 30, 2, 2)
/rds/general/user/kk2219/home/LimitOrderBookPatterns/data/val_source/bid/20160914/ABEO_EDGX.npy
(2000, 50, 30, 2, 2)
(6000, 50, 30, 2, 2)
/rds/general/user/kk2219/home/LimitOrderBookPatterns/data/val_source/bid/20160914/GOOG_NASDAQ.npy
(2000, 50, 30, 2, 2)
(8000, 50, 30, 2, 2)
/rds/general/user/kk2219/home/LimitOrderBookPatterns/data/val_source/bid/20160914/ABEO_BATS.npy
(2000, 50, 30, 2, 2)
(10000, 50, 30, 2, 2)
/rds/general/user/kk2219/home/LimitOrderBookPatterns/data/val_source/bid/20160914/ABEO_ARCA.npy
(2000, 50, 30, 2, 2)
(12000, 50, 30, 2, 2)
/rds/general/user/kk2219/home/LimitOrderBookPatterns/data/val_source/bid/20160914/IBM_EDGA.npy
(2000, 50, 30, 2, 2)
(14000, 50, 30, 2, 2)
/rds/general/user/kk2219/home/

### Validation that data is Correct - Do not Run Normally

In [None]:
dataset = np.load(project_path + '/data/test_source/VOD_BATS.npy')
lob_t, y_t, f_t = retrieve_cleansed_data(dataset)

sell_side_label = y_t[y_t == 2]
buy_side_label = y_t[y_t == 1]
print(sell_side_label.count())
print(buy_side_label.count())
print(sell_side_label.index)
print(lob_t[11])