In [1]:
%reload_ext autoreload
%autoreload

import os
from itertools import product

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
assert os.path.exists("./BenchmarkDatasets")

In [3]:
import mpl_hig

mpl_hig.set("whitegrid")

In [4]:
def load_raw(day: int) -> np.array:
    root = "./BenchmarkDatasets/NoAuction/3.NOAuction_DecPre"
    if day == 1:
        return np.loadtxt(
            f"{root}/NoAuction_DecPre_Training/Train_Dst_NoAuction_DecPre_CF_1.txt"
        )
    else:
        return np.loadtxt(
            f"{root}/NoAuction_DecPre_Testing/Test_Dst_NoAuction_DecPre_CF_{day-1}.txt"
        )

In [5]:
def split(array, n_stocks=5) -> (np.array, ...):
    boundaries = np.sort(
        np.argsort(np.abs(np.diff(array[0], prepend=np.inf)))[-n_stocks - 1 :]
    )
    return tuple(array[:, boundaries[i] : boundaries[i + 1]] for i in range(n_stocks))

In [6]:
def to_dataframe(array) -> pd.DataFrame:
    data = {}

    for level in range(10):
        data[f"PRICE_ASK_{level}"] = array[4 * level]
        data[f"PRICE_BID_{level}"] = array[4 * level + 2]
        data[f"VOLUME_ASK_{level}"] = array[4 * level + 1]
        data[f"VOLUME_BID_{level}"] = array[4 * level + 3]

    return pd.DataFrame(data)

In [7]:
def revert_decimal_normalization(df, max_denom=10 ** 8):
    assert (df.values != 0).all()

    df = df * max_denom
    df = df.round()
    df = df.astype(int)
    for c in df.columns:
        while (df.loc[:, c] % 10 == 0).all():
            df.loc[:, c] //= 10
    return df

In [8]:
dfs = {i_stock: {} for i_stock in range(1, 5 + 1)}

for day in tqdm(range(1, 10 + 1)):
    a_raw = load_raw(day)
    for i_stock, a in enumerate(split(a_raw)):
        dfs[i_stock + 1][day] = to_dataframe(a)

100%|██████████| 10/10 [00:33<00:00,  3.37s/it]


In [9]:
listdf = []

for i_stock, day in tqdm(list(product(range(1, 5 + 1), range(1, 10 + 1)))):
    df = dfs[i_stock][day]
    df = revert_decimal_normalization(df)
    df["STOCK"] = i_stock
    df["DAY"] = day
    df = df[list(df.columns[-2:]) + list(df.columns[:-2])]

    listdf.append(df)

dataframe = pd.concat(listdf, axis=0)
dataframe = dataframe.reset_index(drop=True)
dataframe.to_csv(f"data.csv")

100%|██████████| 50/50 [00:05<00:00,  8.37it/s]


In [10]:
dataframe

Unnamed: 0,STOCK,DAY,PRICE_ASK_0,PRICE_BID_0,VOLUME_ASK_0,VOLUME_BID_0,PRICE_ASK_1,PRICE_BID_1,VOLUME_ASK_1,VOLUME_BID_1,...,VOLUME_ASK_7,VOLUME_BID_7,PRICE_ASK_8,PRICE_BID_8,VOLUME_ASK_8,VOLUME_BID_8,PRICE_ASK_9,PRICE_BID_9,VOLUME_ASK_9,VOLUME_BID_9
0,1,1,2615,2606,353,326,2618,2604,200,682,...,787,134,2629,2588,146,123,2633,2579,311,128
1,1,1,2615,2606,211,326,2619,2604,164,682,...,311,143,2637,2591,165,134,2646,2588,138,123
2,1,1,2614,2606,122,326,2615,2604,200,682,...,787,143,2629,2591,146,134,2633,2588,311,123
3,1,1,2614,2606,322,326,2617,2604,938,682,...,311,134,2637,2588,165,123,2646,2579,138,128
4,1,1,2614,2606,322,326,2617,2604,938,682,...,311,134,2637,2588,165,123,2646,2579,138,128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272936,5,10,3688,3677,324,150,3690,3676,100,200,...,393,300,3698,3657,200,682,3699,3653,3268,950
272937,5,10,3688,3677,324,150,3689,3676,200,200,...,502,300,3696,3657,447,682,3697,3653,393,950
272938,5,10,3677,3676,250,200,3688,3672,324,869,...,280,682,3695,3653,502,950,3696,3650,447,100
272939,5,10,3677,3676,250,200,3688,3672,324,869,...,280,600,3695,3657,502,682,3696,3653,447,950
