In [1]:
%reload_ext autoreload
%autoreload

import os
from itertools import product

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
assert os.path.exists("./BenchmarkDatasets")

In [3]:
def load_raw(day: int) -> np.array:
    root = "./BenchmarkDatasets/NoAuction/3.NOAuction_DecPre"
    if day == 1:
        return np.loadtxt(
            f"{root}/NoAuction_DecPre_Training/Train_Dst_NoAuction_DecPre_CF_1.txt"
        )
    else:
        return np.loadtxt(
            f"{root}/NoAuction_DecPre_Testing/Test_Dst_NoAuction_DecPre_CF_{day-1}.txt"
        )

In [4]:
def split(array, n_stocks=5) -> (np.array, ...):
    boundaries = np.sort(
        np.argsort(np.abs(np.diff(array[0], prepend=np.inf)))[-n_stocks - 1 :]
    )
    return tuple(array[:, boundaries[i] : boundaries[i + 1]] for i in range(n_stocks))

In [5]:
def to_dataframe(array) -> pd.DataFrame:
    data = {}

    for level in range(10):
        data[f"PRICE_ASK_{level}"] = array[4 * level]
    for level in range(10):
        data[f"PRICE_BID_{level}"] = array[4 * level + 2]
    for level in range(10):
        data[f"VOLUME_ASK_{level}"] = array[4 * level + 1]
    for level in range(10):
        data[f"VOLUME_BID_{level}"] = array[4 * level + 3]
    data[f"LABEL_1TICK"] = 2 - array[-5]
    data[f"LABEL_2TICK"] = 2 - array[-4]
    data[f"LABEL_3TICK"] = 2 - array[-3]
    data[f"LABEL_5TICK"] = 2 - array[-2]
    data[f"LABEL_10TICK"] = 2 - array[-1]

    return pd.DataFrame(data)

In [6]:
def revert_decimal_normalization(df, max_denom=10 ** 8):
    assert (df.iloc[:, :-5].values != 0).all()

    df.iloc[:, :-5] = df.iloc[:, :-5] * max_denom
    df.iloc[:, :-5] = df.iloc[:, :-5].round()
    df.iloc[:, :-5] = df.iloc[:, :-5].astype(int)
    for c in df.columns[:-5]:
        while (df.loc[:, c] % 10 == 0).all():
            df.loc[:, c] //= 10
    return df

In [7]:
dfs = {i_stock: {} for i_stock in range(1, 5 + 1)}

for day in tqdm(range(1, 10 + 1)):
    a_raw = load_raw(day)
    for i_stock, a in enumerate(split(a_raw)):
        dfs[i_stock + 1][day] = to_dataframe(a)

100%|██████████| 10/10 [00:27<00:00,  2.76s/it]


In [8]:
listdf = []

for i_stock, day in tqdm(list(product(range(1, 5 + 1), range(1, 10 + 1)))):
    df = dfs[i_stock][day]
    df = revert_decimal_normalization(df)
    df["STOCK"] = i_stock
    df["DAY"] = day
    df = df[list(df.columns[-2:]) + list(df.columns[:-2])]

    listdf.append(df)

dataframe = pd.concat(listdf, axis=0)
dataframe = dataframe.reset_index(drop=True)
dataframe.to_csv(f"data.csv")

100%|██████████| 50/50 [00:06<00:00,  7.88it/s]


In [9]:
dataframe

Unnamed: 0,STOCK,DAY,PRICE_ASK_0,PRICE_ASK_1,PRICE_ASK_2,PRICE_ASK_3,PRICE_ASK_4,PRICE_ASK_5,PRICE_ASK_6,PRICE_ASK_7,...,VOLUME_BID_5,VOLUME_BID_6,VOLUME_BID_7,VOLUME_BID_8,VOLUME_BID_9,LABEL_1TICK,LABEL_2TICK,LABEL_3TICK,LABEL_5TICK,LABEL_10TICK
0,1,1,2615.0,2618.0,2619.0,2620.0,2621.0,2623.0,2625.0,2626.0,...,100.0,143.0,134.0,123.0,128.0,0.0,0.0,0.0,0.0,0.0
1,1,1,2615.0,2619.0,2620.0,2621.0,2625.0,2626.0,2629.0,2633.0,...,159.0,100.0,143.0,134.0,123.0,0.0,0.0,0.0,0.0,0.0
2,1,1,2614.0,2615.0,2617.0,2619.0,2620.0,2621.0,2625.0,2626.0,...,159.0,100.0,143.0,134.0,123.0,-1.0,-1.0,0.0,0.0,0.0
3,1,1,2614.0,2617.0,2619.0,2620.0,2621.0,2625.0,2629.0,2633.0,...,100.0,143.0,134.0,123.0,128.0,0.0,0.0,-1.0,0.0,0.0
4,1,1,2614.0,2617.0,2619.0,2620.0,2621.0,2625.0,2629.0,2633.0,...,100.0,143.0,134.0,123.0,128.0,1.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272936,5,10,3688.0,3690.0,3691.0,3692.0,3693.0,3695.0,3696.0,3697.0,...,100.0,768.0,300.0,682.0,950.0,0.0,-1.0,-1.0,0.0,1.0
272937,5,10,3688.0,3689.0,3690.0,3691.0,3692.0,3693.0,3694.0,3695.0,...,100.0,768.0,300.0,682.0,950.0,-1.0,-1.0,-1.0,-1.0,-1.0
272938,5,10,3677.0,3688.0,3689.0,3690.0,3691.0,3692.0,3693.0,3694.0,...,768.0,300.0,682.0,950.0,100.0,-1.0,-1.0,-1.0,-1.0,-1.0
272939,5,10,3677.0,3688.0,3689.0,3690.0,3691.0,3692.0,3693.0,3694.0,...,768.0,300.0,600.0,682.0,950.0,1.0,1.0,-1.0,-1.0,-1.0
