In [1]:
%reload_ext autoreload
%autoreload

import os
from itertools import product

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
assert os.path.exists("./BenchmarkDatasets")

In [3]:
def load_raw(day: int, normalization="DecPre") -> np.array:
    index = {"DecPre": 3, "Zscore": 1}[normalization]
    root = f"./BenchmarkDatasets/NoAuction/{index}.NoAuction_{normalization}"
    if normalization == "Zscore":
        normalization1 = "ZScore"
    else:
        normalization1 = normalization
    if day == 1:
        return np.loadtxt(
            f"{root}/NoAuction_{normalization}_Training/Train_Dst_NoAuction_{normalization1}_CF_1.txt"
        )
    else:
        return np.loadtxt(
            f"{root}/NoAuction_{normalization}_Testing/Test_Dst_NoAuction_{normalization1}_CF_{day-1}.txt"
        )

In [4]:
def split(array, n_stocks=5) -> (np.array, ...):
    boundaries = np.sort(
        np.argsort(np.abs(np.diff(array[0], prepend=np.inf)))[-n_stocks - 1 :]
    )
    return tuple(array[:, boundaries[i] : boundaries[i + 1]] for i in range(n_stocks))

In [5]:
def to_dataframe(array) -> pd.DataFrame:
    data = {}

    for level in range(10):
        data[f"PRICE_ASK_{level}"] = array[4 * level]
    for level in range(10):
        data[f"PRICE_BID_{level}"] = array[4 * level + 2]
    for level in range(10):
        data[f"VOLUME_ASK_{level}"] = array[4 * level + 1]
    for level in range(10):
        data[f"VOLUME_BID_{level}"] = array[4 * level + 3]
    data[f"LABEL_1TICK"] = 2 - array[-5]
    data[f"LABEL_2TICK"] = 2 - array[-4]
    data[f"LABEL_3TICK"] = 2 - array[-3]
    data[f"LABEL_5TICK"] = 2 - array[-2]
    data[f"LABEL_10TICK"] = 2 - array[-1]

    return pd.DataFrame(data)

In [6]:
def revert_decimal_normalization(df, max_denom=10 ** 8):
    assert (df.iloc[:, :-5].values != 0).all()

    df.iloc[:, :-5] = df.iloc[:, :-5] * max_denom
    df.iloc[:, :-5] = df.iloc[:, :-5].round()
    df.iloc[:, :-5] = df.iloc[:, :-5].astype(int)
    for c in df.columns[:-5]:
        while (df.loc[:, c] % 10 == 0).all():
            df.loc[:, c] //= 10
    return df

In [7]:
dfs = {i_stock: {} for i_stock in range(1, 5 + 1)}

for day in tqdm(range(1, 10 + 1)):
    a_raw = load_raw(day)
    for i_stock, a in enumerate(split(a_raw)):
        dfs[i_stock + 1][day] = to_dataframe(a)

100%|██████████| 10/10 [00:35<00:00,  3.50s/it]


In [8]:
listdf = []

for i_stock, day in tqdm(list(product(range(1, 5 + 1), range(1, 10 + 1)))):
    df = dfs[i_stock][day]
    df = revert_decimal_normalization(df)
    df["STOCK"] = i_stock
    df["DAY"] = day
    df = df[list(df.columns[-2:]) + list(df.columns[:-2])]

    listdf.append(df)

dataframe = pd.concat(listdf, axis=0)
dataframe = dataframe.reset_index(drop=True)
dataframe = dataframe.astype(int)
dataframe.to_csv(f"data.csv")

100%|██████████| 50/50 [00:07<00:00,  6.69it/s]


In [9]:
dataframe

Unnamed: 0,STOCK,DAY,PRICE_ASK_0,PRICE_ASK_1,PRICE_ASK_2,PRICE_ASK_3,PRICE_ASK_4,PRICE_ASK_5,PRICE_ASK_6,PRICE_ASK_7,...,VOLUME_BID_5,VOLUME_BID_6,VOLUME_BID_7,VOLUME_BID_8,VOLUME_BID_9,LABEL_1TICK,LABEL_2TICK,LABEL_3TICK,LABEL_5TICK,LABEL_10TICK
0,1,1,2615,2618,2619,2620,2621,2623,2625,2626,...,100,143,134,123,128,0,0,0,0,0
1,1,1,2615,2619,2620,2621,2625,2626,2629,2633,...,159,100,143,134,123,0,0,0,0,0
2,1,1,2614,2615,2617,2619,2620,2621,2625,2626,...,159,100,143,134,123,-1,-1,0,0,0
3,1,1,2614,2617,2619,2620,2621,2625,2629,2633,...,100,143,134,123,128,0,0,-1,0,0
4,1,1,2614,2617,2619,2620,2621,2625,2629,2633,...,100,143,134,123,128,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272936,5,10,3688,3690,3691,3692,3693,3695,3696,3697,...,100,768,300,682,950,0,-1,-1,0,1
272937,5,10,3688,3689,3690,3691,3692,3693,3694,3695,...,100,768,300,682,950,-1,-1,-1,-1,-1
272938,5,10,3677,3688,3689,3690,3691,3692,3693,3694,...,768,300,682,950,100,-1,-1,-1,-1,-1
272939,5,10,3677,3688,3689,3690,3691,3692,3693,3694,...,768,300,600,682,950,1,1,-1,-1,-1


In [10]:
dfs = {i_stock: {} for i_stock in range(1, 5 + 1)}

for day in tqdm(range(1, 10 + 1)):
    a_raw = load_raw(day, normalization="Zscore")
    for i_stock, a in enumerate(split(a_raw)):
        dfs[i_stock + 1][day] = to_dataframe(a)

100%|██████████| 10/10 [00:34<00:00,  3.40s/it]


In [11]:
listdf = []

for i_stock, day in tqdm(list(product(range(1, 5 + 1), range(1, 10 + 1)))):
    df = dfs[i_stock][day]
    # df = revert_decimal_normalization(df)
    df["STOCK"] = i_stock
    df["DAY"] = day
    df = df[list(df.columns[-2:]) + list(df.columns[:-2])]

    listdf.append(df)

dataframe = pd.concat(listdf, axis=0)
dataframe = dataframe.reset_index(drop=True)
# dataframe = dataframe.astype(int)
dataframe.iloc[:, -5:] = dataframe.iloc[:, -5:].astype(int)
dataframe.iloc[:100000].to_csv(f"data_zscore1.csv")
dataframe.iloc[100000:].to_csv(f"data_zscore2.csv")

100%|██████████| 50/50 [00:00<00:00, 464.43it/s]


In [12]:
dataframe

Unnamed: 0,STOCK,DAY,PRICE_ASK_0,PRICE_ASK_1,PRICE_ASK_2,PRICE_ASK_3,PRICE_ASK_4,PRICE_ASK_5,PRICE_ASK_6,PRICE_ASK_7,...,VOLUME_BID_5,VOLUME_BID_6,VOLUME_BID_7,VOLUME_BID_8,VOLUME_BID_9,LABEL_1TICK,LABEL_2TICK,LABEL_3TICK,LABEL_5TICK,LABEL_10TICK
0,1,1,0.408275,0.409877,0.409493,0.409170,0.408796,0.409242,0.409436,0.408336,...,-0.784121,-0.718656,-0.573157,-0.521503,-0.464096,0,0,0,0,0
1,1,1,0.408275,0.410898,0.410513,0.410191,0.412876,0.412301,0.413513,0.415468,...,-0.758706,-0.737892,-0.569944,-0.518372,-0.465155,0,0,0,0,0
2,1,1,0.407254,0.406815,0.407452,0.408150,0.407776,0.407202,0.409436,0.408336,...,-0.758706,-0.737892,-0.569944,-0.518372,-0.465155,-1,-1,0,0,0
3,1,1,0.407254,0.408857,0.409493,0.409170,0.408796,0.411282,0.413513,0.415468,...,-0.784121,-0.718656,-0.573157,-0.521503,-0.464096,0,0,-1,0,0
4,1,1,0.407254,0.408857,0.409493,0.409170,0.408796,0.411282,0.413513,0.415468,...,-0.784121,-0.718656,-0.573157,-0.521503,-0.464096,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284870,5,10,1.369639,1.371981,1.371401,1.371911,1.371474,1.370849,1.370002,1.368860,...,-0.807128,-0.463179,-0.562488,-0.370706,-0.297304,1,1,-1,-1,0
284871,5,10,1.369639,1.371001,1.370421,1.369952,1.370495,1.369870,1.369023,1.367882,...,-0.807128,-0.463179,-0.562488,-0.370706,-0.297304,0,1,1,0,-1
284872,5,10,1.369639,1.369041,1.369442,1.368973,1.368537,1.368891,1.368045,1.366904,...,-0.807128,-0.463179,-0.562488,-0.370706,-0.297304,1,1,1,-1,1
284873,5,10,1.369639,1.370021,1.369442,1.368973,1.368537,1.368891,1.368045,1.366904,...,-0.807128,-0.463179,-0.562488,-0.370706,-0.297304,0,-1,-1,0,1
