In [1]:
%reload_ext autoreload
%autoreload

import os
from itertools import product

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
assert os.path.exists("./BenchmarkDatasets")

In [3]:
def load_raw(day: int) -> np.array:
    root = "./BenchmarkDatasets/NoAuction/3.NOAuction_DecPre"
    if day == 1:
        return np.loadtxt(
            f"{root}/NoAuction_DecPre_Training/Train_Dst_NoAuction_DecPre_CF_1.txt"
        )
    else:
        return np.loadtxt(
            f"{root}/NoAuction_DecPre_Testing/Test_Dst_NoAuction_DecPre_CF_{day-1}.txt"
        )

In [4]:
def split(array, n_stocks=5) -> (np.array, ...):
    boundaries = np.sort(
        np.argsort(np.abs(np.diff(array[0], prepend=np.inf)))[-n_stocks - 1 :]
    )
    return tuple(array[:, boundaries[i] : boundaries[i + 1]] for i in range(n_stocks))

In [5]:
def to_dataframe(array) -> pd.DataFrame:
    data = {}

    for level in range(10):
        data[f"PRICE_ASK_{level}"] = array[4 * level]
    for level in range(10):
        data[f"PRICE_BID_{level}"] = array[4 * level + 2]
    for level in range(10):
        data[f"VOLUME_ASK_{level}"] = array[4 * level + 1]
    for level in range(10):
        data[f"VOLUME_BID_{level}"] = array[4 * level + 3]

    return pd.DataFrame(data)

In [6]:
def revert_decimal_normalization(df, max_denom=10 ** 8):
    assert (df.values != 0).all()

    df = df * max_denom
    df = df.round()
    df = df.astype(int)
    for c in df.columns:
        while (df.loc[:, c] % 10 == 0).all():
            df.loc[:, c] //= 10
    return df

In [7]:
dfs = {i_stock: {} for i_stock in range(1, 5 + 1)}

for day in tqdm(range(1, 10 + 1)):
    a_raw = load_raw(day)
    for i_stock, a in enumerate(split(a_raw)):
        dfs[i_stock + 1][day] = to_dataframe(a)

100%|██████████| 10/10 [00:27<00:00,  2.76s/it]


In [8]:
listdf = []

for i_stock, day in tqdm(list(product(range(1, 5 + 1), range(1, 10 + 1)))):
    df = dfs[i_stock][day]
    df = revert_decimal_normalization(df)
    df["STOCK"] = i_stock
    df["DAY"] = day
    df = df[list(df.columns[-2:]) + list(df.columns[:-2])]

    listdf.append(df)

dataframe = pd.concat(listdf, axis=0)
dataframe = dataframe.reset_index(drop=True)
dataframe.to_csv(f"data.csv")

100%|██████████| 50/50 [00:05<00:00,  9.16it/s]


In [9]:
dataframe

Unnamed: 0,STOCK,DAY,PRICE_ASK_0,PRICE_ASK_1,PRICE_ASK_2,PRICE_ASK_3,PRICE_ASK_4,PRICE_ASK_5,PRICE_ASK_6,PRICE_ASK_7,...,VOLUME_BID_0,VOLUME_BID_1,VOLUME_BID_2,VOLUME_BID_3,VOLUME_BID_4,VOLUME_BID_5,VOLUME_BID_6,VOLUME_BID_7,VOLUME_BID_8,VOLUME_BID_9
0,1,1,2615,2618,2619,2620,2621,2623,2625,2626,...,326,682,786,893,159,100,143,134,123,128
1,1,1,2615,2619,2620,2621,2625,2626,2629,2633,...,326,682,786,393,500,159,100,143,134,123
2,1,1,2614,2615,2617,2619,2620,2621,2625,2626,...,326,682,786,393,500,159,100,143,134,123
3,1,1,2614,2617,2619,2620,2621,2625,2629,2633,...,326,682,786,893,159,100,143,134,123,128
4,1,1,2614,2617,2619,2620,2621,2625,2629,2633,...,326,682,786,893,159,100,143,134,123,128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272936,5,10,3688,3690,3691,3692,3693,3695,3696,3697,...,150,200,869,980,446,100,768,300,682,950
272937,5,10,3688,3689,3690,3691,3692,3693,3694,3695,...,150,200,869,980,446,100,768,300,682,950
272938,5,10,3677,3688,3689,3690,3691,3692,3693,3694,...,200,869,680,446,100,768,300,682,950,100
272939,5,10,3677,3688,3689,3690,3691,3692,3693,3694,...,200,869,380,446,100,768,300,600,682,950
