https://github.com/FSUHeting/DL_LOB_Trading_and_MidPirce_Movement/tree/master

In [28]:
# functions for data preprocessing

import numpy as np
from sklearn.preprocessing import StandardScaler


# generate (x, y) pair for cnn model. x: k by look back range by n. y: k by forecast size by n.
# n is 4 times level. k is the num_rows in the code.
def generate_data(data, forecast_size, look_back):
    """Slice a 2-D series into paired look-back / forecast windows.

    Args:
        data: 2-D array of shape (m, n); rows are consecutive time steps.
        forecast_size: number of rows in every target window.
        look_back: number of rows in every input window.

    Returns:
        Tuple ``(data_x, data_y)`` of float arrays with shapes
        ``(k, look_back, n)`` and ``(k, forecast_size, n)`` where
        ``k = m - look_back - forecast_size``. Sample ``i`` pairs rows
        ``[i, i + look_back)`` with the ``forecast_size`` rows that follow.
    """
    total_steps, num_features = data.shape
    num_samples = total_steps - look_back - forecast_size
    inputs = np.empty([num_samples, look_back, num_features])
    targets = np.empty([num_samples, forecast_size, num_features])

    # Position `step` of every window is one contiguous run of rows of
    # `data`, so each window position is filled for all samples at once.
    for step in range(look_back):
        inputs[:, step, :] = data[step: step + num_samples, :]
    for step in range(forecast_size):
        first_row = look_back + step
        targets[:, step, :] = data[first_row: first_row + num_samples, :]

    return inputs, targets

# load benchmark dataset and convert it back to raw limit order book dataset.
def read_benchmark_data(data_path, data_level):
    """Load the first ``4 * data_level`` rows of a benchmark file and rescale.

    The rows that are read get transposed, so every file row becomes one
    column of the result. Even-indexed columns are then multiplied by 100
    and odd-indexed columns by 1000000.

    Args:
        data_path: path of the text file to parse with ``np.genfromtxt``.
        data_level: number of order book levels to keep (4 file rows each).

    Returns:
        2-D array with one column per file row that was read.
    """
    rows_to_read = int(4 * data_level)
    with open(data_path, 'r') as handle:
        table = np.genfromtxt(handle, max_rows=rows_to_read).T

    # NOTE(review): presumably the even columns are prices and the odd ones
    # volumes, and the factors undo the dataset's decimal scaling - confirm.
    table[:, 0::2] *= 100
    table[:, 1::2] *= 1000000

    return table

# generate matrix price, volumes and target probability
def benchmark_data_for_model(data_x, data_y, alpha=0.00002):
    """Build centred price/volume inputs and one-hot movement labels.

    The last axis is read in repeating groups of four columns, used here as
    ask price, ask volume, bid price, bid volume (one group per level).

    Args:
        data_x: array of shape (n, look_back, 4 * levels) with the input
            windows. It is not modified.
        data_y: array of shape (n, forecast_size, 4 * levels) with the
            windows that follow each input window.
        alpha: relative threshold on the mid-price change. A sample counts
            as a move only when the average future mid-price leaves the
            band ``mid * (1 - alpha) .. mid * (1 + alpha)``.

    Returns:
        Tuple ``(price, volume, prob)``:
        price: (n, look_back, 2 * levels) prices ordered as bid levels
            reversed followed by ask levels, with each sample's per-column
            mean over the look-back axis subtracted.
        volume: same layout for the natural log of the volumes, centred
            the same way. A zero volume becomes ``-inf`` under ``np.log``.
        prob: (n, 3) one-hot rows; column 0 = average future mid-price
            above the band, column 1 = below the band, column 2 = inside.
    """
    price_ask = data_x[:, :, 0::4]
    volume_ask = data_x[:, :, 1::4]
    price_bid = data_x[:, :, 2::4]
    volume_bid = data_x[:, :, 3::4]

    # Bid levels are reversed before being joined with the ask levels.
    # np.concatenate already returns a fresh array, so data_x stays intact.
    price = np.concatenate((price_bid[:, :, ::-1], price_ask), axis=-1)
    volume = np.log(
        np.concatenate((volume_bid[:, :, ::-1], volume_ask), axis=-1))

    # Current mid-price: first ask/bid price column at the last look-back row.
    mid_price = (price_ask[:, -1, 0] + price_bid[:, -1, 0]) / 2
    # Average mid-price over the forecast window (columns 0 and 2 of data_y).
    mid_average = np.average(data_y[:, :, 0] + data_y[:, :, 2], axis=1) / 2

    # Same precedence as an if / elif / else chain: "up" is tested first.
    up = mid_price * (1 + alpha) < mid_average
    down = ~up & (mid_price * (1 - alpha) > mid_average)
    prob = np.zeros([data_x.shape[0], 3])
    prob[up, 0] = 1
    prob[down, 1] = 1
    prob[~(up | down), 2] = 1

    # Centre every sample by its own mean over the look-back axis. Building
    # a new array (instead of assigning back into `price`) keeps the
    # fractional part when data_x has an integer dtype.
    price = price - price.mean(axis=1, keepdims=True)
    volume = volume - volume.mean(axis=1, keepdims=True)

    return price, volume, prob

In [29]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [20]:
# Choose the Decimal preprocessed data from the benchmark dataset.
# We only use level 5 limit order book data.
# Prepare the data for training.
file_path = 'data/FI-2010/BenchmarkDatasets/Auction/3.Auction_DecPre/Auction_DecPre_Training/'
data_name = '_Dst_Auction_DecPre_CF_1.txt'
data_level = 5  # read_benchmark_data keeps the first 4 * data_level = 20 file rows
forecast_size = 10  # rows in each target window
look_back = 100  # rows in each input window
data_train = read_benchmark_data(file_path + 'Train' + data_name, data_level)
data_x, data_y = generate_data(data_train, forecast_size, look_back)
# The windows are copies, so the raw table is no longer needed.
del data_train

In [12]:
data_x.shape

(47232, 100, 20)

In [13]:
data_y.shape

(47232, 10, 20)

In [16]:
# Get input and target data for training our deep learning model,
# then check the class distribution.
train_price, train_volume, train_prob = benchmark_data_for_model(data_x, data_y)
# train_prob is one-hot: column 0 = average future mid-price above the
# threshold band, column 1 = below it, column 2 = inside it. The mean of a
# column is therefore the share of samples in that class.
print('positive sample ratio in train: ', np.mean(train_prob[:, 0]))
print('negative sample ratio in train: ', np.mean(train_prob[:, 1]))
print('neutral sample ratio in train: ', np.mean(train_prob[:, 2]))
# benchmark_data_for_model returned new arrays, so the windows can be dropped.
del data_x, data_y

positive sample ratio in train:  0.38465447154471544
negative sample ratio in train:  0.37510586043360433
neutral sample ratio in train:  0.24023966802168023


In [17]:
train_price.shape

(47232, 100, 10)

In [18]:
train_prob.shape

(47232, 3)

In [28]:
# Same settings as the training cell, but this time the whole file is kept
# in `data_all` so that rows beyond the first 4 * data_level can be inspected.
file_path = 'data/FI-2010/BenchmarkDatasets/Auction/3.Auction_DecPre/Auction_DecPre_Training/'
data_name = '_Dst_Auction_DecPre_CF_1.txt'
data_level = 5
forecast_size = 10
look_back = 100
data_path = file_path + 'Train' + data_name
with open(data_path, 'r') as f:
    #data = np.genfromtxt(f, max_rows=int(4*data_level))
    data_all = np.genfromtxt(f)
    # Basic slicing: `data` is a view on the first 4 * data_level rows of
    # `data_all`, not a copy, so in-place edits of `data` show up in `data_all`.
    data = data_all[:int(4*data_level), :]

In [29]:
data_all.shape

(149, 47342)

In [30]:
print(data.shape)

(20, 47342)


In [33]:
# Flip the table so that the file's rows become columns.
data = data.T
n = data.shape[1]
# Scale in place: even-indexed columns by 100, odd-indexed columns by 1000000.
# NOTE(review): `.T` returns a view, so if `data` is still the slice of
# `data_all` taken at load time, those rows of `data_all` are rescaled too.
data[:, 0::2] *= 100
data[:, 1::2] *= 1000000

In [34]:
# Plain rebinding: `data_train` and `data` refer to the same array (no copy).
data_train = data

In [35]:
# Cut the table into (look_back, forecast_size) window pairs; generate_data
# fills freshly allocated arrays, so data_x / data_y do not alias data_train.
data_x, data_y = generate_data(data_train, forecast_size, look_back)

In [36]:
data_x.shape, data_y.shape

((47232, 100, 20), (47232, 10, 20))

In [38]:
# Inline, step-by-step copy of the body of benchmark_data_for_model, run at
# cell level so that every intermediate array stays available for inspection.
n = data_x.shape[0]
# Last-axis columns come in groups of four, used here as:
# 0::4 ask price, 1::4 ask volume, 2::4 bid price, 3::4 bid volume.
price = np.empty_like(data_x[:,:,0::2])
price_ask = data_x[:,:,0::4]
price_bid = data_x[:,:,2::4]
# Bid levels reversed, followed by the ask levels.
np.copyto(price, np.concatenate((price_bid[:,:,::-1], price_ask), axis=-1))
volume = np.empty_like(data_x[:,:,1::2])
volume_ask = data_x[:,:,1::4]
volume_bid = data_x[:,:,3::4]
np.copyto(volume, np.concatenate((volume_bid[:,:,::-1], volume_ask), axis=-1))
# Natural log of the volumes; a zero volume would become -inf here.
volume = np.log(volume)
# Mid-price at the last row of each look-back window (first ask/bid column).
midPrice = (price_ask[:,-1,0] + price_bid[:,-1,0])/2
# Mean mid-price over each forecast window (columns 0 and 2 of data_y).
midAverage = np.average(data_y[:,:,0] + data_y[:,:,2], axis=1)/2
alpha = 0.00002  # relative width of the "no move" band around midPrice
# One-hot labels: column 0 = above the band, 1 = below the band, 2 = inside.
prob = np.zeros([n, 3])
for i in range(n):
    if midPrice[i]*(1+alpha) < midAverage[i]:
        prob[i][0] = 1
    elif midPrice[i]*(1-alpha) > midAverage[i]:
        prob[i][1] = 1
    else:
        prob[i][2] = 1

# Centre each sample: subtract the per-column mean over its look-back rows.
for i in range(n):
    price_mean = np.mean(price[i, :, :], axis=0)
    price[i, :, :] = price[i, :, :] - price_mean
    volume_mean = np.mean(volume[i, :, :], axis=0)
    volume[i, :, :] = volume[i, :, :] - volume_mean

# Same names as the function-based cell produces.
train_price = price
train_volume = volume
train_prob = prob

Comparison of the dataset's original labels (horizon = 5) with the labels computed here.
(They are almost the same but differ slightly.) ==> Why?

In [41]:
# Mid-price per column of the raw table: average of rows 0 and 2, the rows
# the cells above use as first-level ask price and bid price.
mid = (data_all[0,:] + data_all[2, :])/2.
# `mid` is 1-D, so the transpose leaves it unchanged.
mid = mid.T

In [62]:
# Recompute movement labels for horizon k and count how often they agree
# with the label row stored in the file.
k = 5
n = mid.shape[0] - k  # last index that still has k future values
thres = 0.00002

matching_cnt = 0
for i in range(n):
    m_i = mid[i]
    # Relative change of the mean of the next k mid-prices versus mid[i].
    l_i = (np.sum(mid[i+1:i+1+k])/k-m_i)/m_i
    # 0 = up (>= thres), 2 = down (<= -thres), 1 = otherwise.
    if l_i >= thres:
        y = 0
    elif l_i <= -thres:
        y = 2
    else:
        y = 1
    # Row 149-2 = 147 of the file is compared as the stored label, read at
    # column i+k-1 and shifted from 1-based to 0-based.
    # NOTE(review): presumably row 147 is the horizon-5 label row - confirm.
    if i <= 50:
        print(y, int(data_all[149-2, i+k-1])-1)
    if y == int(data_all[149-2, i+k-1])-1:
        matching_cnt += 1

print('horizon = %d, matching ratio = %f'%(k, matching_cnt/n))
    

0 1
0 0
2 0
2 0
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
1 1
1 1
1 2
2 1
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
0 0
2 2
0 2
2 2
2 2
2 2
2 2
2 1
0 0
2 1
1 0
0 0
0 0
0 0
0 0
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
horizon = 5, matching ratio = 0.948074


In [61]:
# Same comparison as the horizon-5 cell, for k = 10 and the last file row.
k = 10
n = mid.shape[0] - k  # last index that still has k future values
thres = 0.00002
label_index = 149-1 # for k=10

matching_cnt = 0
for i in range(n):
    m_i = mid[i]
    # Relative change of the mean of the next k mid-prices versus mid[i].
    l_i = (np.sum(mid[i+1:i+1+k])/k-m_i)/m_i
    # 0 = up (>= thres), 2 = down (<= -thres), 1 = otherwise.
    if l_i >= thres:
        y = 0
    elif l_i <= -thres:
        y = 2
    else:
        y = 1
    # Stored label is read at column i+k-1 and shifted to 0-based.
    if y == int(data_all[label_index, i+k-1])-1:
        matching_cnt += 1

print('horizon = %d, matching ratio = %f'%(k, matching_cnt/n))

horizon = 10, matching ratio = 0.964633


In [60]:
# Full sweep configuration. Each split is [directory suffix, file prefix];
# `horizons` maps the horizon (as a string) to an index.
auctions = ['Auction', 'NoAuction']
splits = [ ['Training', 'Train'], ['Testing', 'Test'] ]
folds = [1, 2, 3, 4, 5, 6, 7, 8, 9]
horizons = { '1': 0, '2': 1, '3': 2, '5': 3, '10': 4}

# Reduced configuration for a quick run: first auction type, first fold and
# horizon 1 only (both splits are kept because the next line is disabled).
#splits = splits[:1]
auctions = auctions[:1]
folds = folds[:1]
horizons = { '1': 0 }

In [55]:
import numpy as np
def check_mid_label(mid, mid_label):
    """Print ``False`` when ``mid`` rounded to 5 decimals differs from ``mid_label``.

    Nothing is printed when the two arrays are equal, and nothing is
    returned in either case.

    Args:
        mid: array of computed mid-prices.
        mid_label: array to compare the rounded mid-prices against.
    """
    rounded = np.round(mid, decimals=5)
    if not np.array_equal(rounded, mid_label):
        print(False)

In [82]:
# Relative mid-price change below which a sample is labelled "no move".
thres = 0.00002


def check_label(data_all, mid, k, h, max_offset=11):
    """Find the label shift that best matches recomputed movement labels.

    For every index ``i`` a label is recomputed from ``mid``: the relative
    change between ``mid[i]`` and the mean of the next ``k`` values is
    compared with the module-level ``thres`` (0 = at or above ``thres``,
    2 = at or below ``-thres``, 1 = otherwise). These labels are compared
    with the stored labels ``data_all[:, -h] - 1`` shifted by each offset in
    ``range(max_offset)``.

    Args:
        data_all: 2-D array with one row per time step; column ``-h`` holds
            the stored labels (1-based). It needs at least as many rows as
            ``mid`` has entries.
        mid: 1-D array of mid-prices, one per time step.
        k: horizon, i.e. how many future mid-prices are averaged.
        h: position of the label column counted from the last column.
        max_offset: number of shifts to try (0 .. max_offset - 1).

    Returns:
        Tuple ``(matching_best, offset_best)``: the highest fraction of
        matching labels and the smallest offset that reaches it.

    Raises:
        ValueError: if ``mid`` has too few entries to compare anything.
    """
    n = mid.shape[0] - k - max_offset
    if n <= 0:
        raise ValueError(
            'mid needs more than k + max_offset = %d entries, got %d'
            % (k + max_offset, mid.shape[0]))
    y_label = data_all[:, -h] - 1

    # The recomputed labels do not depend on the offset, so build them once
    # instead of once per offset.
    predicted = np.empty(n, dtype=int)
    for i in range(n):
        m_i = mid[i]
        l_i = (np.sum(mid[i+1:i+1+k])/k - m_i)/m_i
        if l_i >= thres:
            predicted[i] = 0
        elif l_i <= -thres:
            predicted[i] = 2
        else:
            predicted[i] = 1

    matching_best = 0
    offset_best = 0
    for offset in range(max_offset):
        matches = np.count_nonzero(predicted == y_label[offset:offset + n])
        ratio = matches / n
        # Strict comparison keeps the smallest offset on ties.
        if ratio > matching_best:
            matching_best = ratio
            offset_best = offset
    return matching_best, offset_best

In [86]:
# Sweep every dataset file (auction type x split x fold) and, for every
# horizon, record the best label match ratio and the shift that achieves it.
auctions = ['Auction', 'NoAuction']
splits = [ ['Training', 'Train'], ['Testing', 'Test'] ]
folds = [1, 2, 3, 4, 5, 6, 7, 8, 9]
horizons = { '1': 0, '2': 1, '3': 2, '5': 3, '10': 4}

#splits = splits[:1]
#auctions = auctions[:1]
#folds = folds[:1]
#horizons = { '1': 0 }

# Placeholders, in order: auction, auction, auction, split directory suffix,
# split file prefix, auction, fold number.
file_name = 'data/FI-2010/BenchmarkDatasets/%s/3.%s_DecPre/%s_DecPre_%s/%s_Dst_%s_DecPre_CF_%d.txt'

# Column-oriented results; one entry is appended to every list per
# (file, horizon) pair so the dict can be turned into a table afterwards.
result_dict = {
    'auction': [],
    'split': [],
    'fold': [],
    'horizon': [],
    'matching_ratio': [],
    'shift': [],
}

for auction in auctions:
    for split in splits:
        for fold in folds:
            data_path = file_name%(auction, auction, auction, split[0], split[1], auction, fold)
            print(data_path)
            with open(data_path, 'r') as f:
                #data = np.genfromtxt(f, max_rows=int(4*data_level))
                data_all = np.genfromtxt(f)
                #data = data_all[:int(4*data_level), :]
            
            # Mid-price from rows 0 and 2, rounded to 5 decimals. The
            # transpose is a no-op because `mid` is 1-D.
            mid = (data_all[0,:] + data_all[2, :])/2.
            mid = mid.T
            mid = np.round(mid, decimals=5)
            # NOTE(review): mid_label is assigned but not used in this cell.
            mid_label = data_all[50, :].T
            # check_label expects one row per time step.
            data = data_all.T
            
            for horizon in horizons.keys():
                k = int(horizon)
                # Label column counted from the end: horizon '1' -> -5,
                # '2' -> -4, '3' -> -3, '5' -> -2, '10' -> -1.
                h = 5 - horizons[horizon]
                res, offset = check_label(data, mid, k, h)
                #print(horizon, res, offset)
                result_dict['auction'].append(auction)
                # The whole [directory suffix, file prefix] pair is stored.
                result_dict['split'].append(split)
                result_dict['fold'].append(fold)
                result_dict['horizon'].append(horizon)
                result_dict['matching_ratio'].append(res)
                result_dict['shift'].append(offset)
                print(horizon, res, offset)
                
print('done')


data/FI-2010/BenchmarkDatasets/Auction/3.Auction_DecPre/Auction_DecPre_Training/Train_Dst_Auction_DecPre_CF_1.txt
1 0.9024720050707796 1
2 0.9231549367195588 1
3 0.9359575726842461 2
5 0.9480623758610489 4
10 0.9646245852792629 9
data/FI-2010/BenchmarkDatasets/Auction/3.Auction_DecPre/Auction_DecPre_Training/Train_Dst_Auction_DecPre_CF_2.txt
1 0.8982627320323655 1
2 0.9213569442791775 1
3 0.935267519093053 2
5 0.9467113803548247 4
10 0.9642451452371937 9
data/FI-2010/BenchmarkDatasets/Auction/3.Auction_DecPre/Auction_DecPre_Training/Train_Dst_Auction_DecPre_CF_3.txt
1 0.9000190228591357 1
2 0.9227586534879482 1
3 0.9360029168846404 2
5 0.9475190234622701 4
10 0.9640997186001348 9
data/FI-2010/BenchmarkDatasets/Auction/3.Auction_DecPre/Auction_DecPre_Training/Train_Dst_Auction_DecPre_CF_4.txt
1 0.8987935023846626 1
2 0.9214473334710622 1
3 0.9350643984558538 2
5 0.9472528510377068 4
10 0.9641691311457024 9
data/FI-2010/BenchmarkDatasets/Auction/3.Auction_DecPre/Auction_DecPre_Training/T

In [89]:
import pandas as pd

# One row per (file, horizon) pair; the dict keys become the column names.
df = pd.DataFrame.from_dict(result_dict)

In [95]:
# Lift the row limit so the whole table is shown when it is displayed.
pd.set_option('display.max_rows', None)

In [96]:
df

Unnamed: 0,auction,split,fold,horizon,matching_ratio,shift
0,Auction,"[Training, Train]",1,1,0.902472,1
1,Auction,"[Training, Train]",1,2,0.923155,1
2,Auction,"[Training, Train]",1,3,0.935958,2
3,Auction,"[Training, Train]",1,5,0.948062,4
4,Auction,"[Training, Train]",1,10,0.964625,9
5,Auction,"[Training, Train]",2,1,0.898263,1
6,Auction,"[Training, Train]",2,2,0.921357,1
7,Auction,"[Training, Train]",2,3,0.935268,2
8,Auction,"[Training, Train]",2,5,0.946711,4
9,Auction,"[Training, Train]",2,10,0.964245,9


In [99]:
# Save the table as a spreadsheet without the DataFrame index column.
# NOTE(review): writing .xlsx needs an Excel writer engine to be installed.
df.to_excel('data/FI-2010-shift.xlsx', index=False)