In [1]:
import datetime
import time

import numpy as np
import pandas as pd
import pyarrow as pa

# Dimensions of the synthetic data set generated below.
N = 6500  # UIDs in the universe (default size for get_universe)
D = 21  # Number of days (default num_days for get_full_timestamp_grid)
# One-minute steps from 09:30 through 16:00 inclusive give 391 per day.
# NOTE(review): T is not referenced by the code visible here.
T = 391  # Times per day
K = 2  # Number of features

# Despite the name, this is a fraction in [0, 1] (0.1 == 10% of rows dropped).
MISSING_PCT = 0.1  # Fraction of rows missing from the unstructured long form
IS_SORTED = False  # Whether the unstructured long form is sorted


def get_full_timestamp_grid(start_date=datetime.datetime(2018, 1, 1), num_days=D):
    """
    Generate a one-minute timestamp grid restricted to 09:30-16:00 (inclusive).

    A minute-frequency range is built from ``start_date`` through
    ``start_date + num_days`` days, then only timestamps whose time of day falls
    within [09:30, 16:00] are kept. Every calendar day in the range is included;
    weekends and holidays are NOT removed. With the default midnight start this
    yields ``num_days * 391`` timestamps.

    Parameters:
    start_date (datetime): The date from which to start the timestamp series.
    num_days (int): The number of days for which to generate the timestamp series.

    Returns:
    pandas.DatetimeIndex: Generated timestamp series, named "wq_timestamp".
    """
    wq_timestamp = pd.date_range(
        start=start_date,
        end=start_date + datetime.timedelta(days=num_days),
        # "min" is the supported minute alias; the single-letter "T" alias is
        # deprecated in recent pandas releases and emits a FutureWarning.
        freq="min",
        name="wq_timestamp",
    )
    # Compute the (object-dtype) time-of-day array once instead of twice.
    time_of_day = wq_timestamp.time
    in_session = (time_of_day >= datetime.time(9, 30)) & (
        time_of_day <= datetime.time(16, 0)
    )
    return wq_timestamp[in_session]


def get_universe(N=N):
    """
    Build the universe of string identifiers "EQ<i>" for i in 0..N-1.

    Each numeric suffix is zero-padded to the number of decimal digits in N
    itself, so all identifiers have the same width.

    Parameters:
    N (int): How many UIDs to produce. Defaults to the module-level constant N.

    Returns:
    pandas.Index: Index named "UID" holding the generated identifiers.
    """
    width = len(str(N))
    labels = []
    for i in range(N):
        # Left-pad the counter with zeros to the common width.
        labels.append("EQ" + str(i).zfill(width))
    return pd.Index(labels, name="UID")


def get_index(is_sorted: bool = False, missing_pct: float = 0.1) -> pd.MultiIndex:
    """
    Build a (wq_timestamp, UID) MultiIndex, optionally shuffled and subsampled.

    The full index is the Cartesian product of the timestamp grid and the UID
    universe. Rows are then optionally shuffled and a random subset is dropped.

    Parameters:
    is_sorted (bool): If True, the returned rows keep the order of the full
        product index; if False, they are returned in random order.
    missing_pct (float): Fraction (not percent) of rows to drop at random.
        Values <= 0 drop nothing.

    Returns:
    pandas.MultiIndex: The selected (wq_timestamp, UID) pairs.
    """
    wq_timestamp = get_full_timestamp_grid()
    U = get_universe()
    ix = pd.MultiIndex.from_product((wq_timestamp, U))
    np_ix = np.arange(len(ix))

    if not is_sorted:
        np.random.shuffle(np_ix)

    if missing_pct > 0:
        # Missing timestamps are unusual, but missing UIDs
        # are *ubiquitous*
        np_ix = np.random.choice(
            np_ix, size=int((1 - missing_pct) * len(np_ix)), replace=False
        )
        if is_sorted:
            # choice(replace=False) returns the kept positions in random order,
            # so restore ascending order to honour is_sorted=True.
            np_ix.sort()

    ix = ix[np_ix]

    return ix


def make_unstructured_long_form() -> pd.DataFrame:
    """Build a long-form frame of standard-normal features over a random index.

    The row index comes from get_index using the module-level IS_SORTED and
    MISSING_PCT settings; K feature columns named "feature_<k>" are filled with
    normal draws. The index levels are moved into ordinary columns on return.
    """
    row_index = get_index(is_sorted=IS_SORTED, missing_pct=MISSING_PCT)
    feature_names = [f"feature_{k}" for k in range(K)]
    values = np.random.normal(size=(len(row_index), K))
    long_frame = pd.DataFrame(values, index=row_index, columns=feature_names)
    return long_frame.reset_index()


def unstructured_to_structured_long_form(df: pd.DataFrame) -> pd.DataFrame:
    """Align an unstructured table onto the complete (wq_timestamp, UID) index.

    The input must carry "wq_timestamp" and "UID" columns. Any
    (timestamp, UID) pair absent from the input appears in the result as a
    NaN-filled row.
    """
    # Target index: the full, ordered grid with nothing dropped. It could also
    # subselect certain times of day (e.g. a 5T grid), but the full grid is the
    # most general choice.
    full_grid = get_index(is_sorted=True, missing_pct=0.0)
    keyed = df.set_index(["wq_timestamp", "UID"])
    return keyed.reindex(full_grid)


def structured_long_form_to_square_df(df: pd.DataFrame) -> pd.DataFrame:
    """Pivot the innermost index level (UID) into the columns.

    Turns a frame indexed by (datetime, UID) with one column per feature into a
    frame indexed by datetime with (feature, UID) columns.
    """
    # level=-1 is unstack's default: the last (innermost) index level.
    square = df.unstack(level=-1)
    return square


def square_df_to_square_numpy(df: pd.DataFrame) -> np.ndarray:
    """Converts from a datetime x (feature, UID) dataframe to a [d, t, i, k] numpy array.

    Rows are grouped by calendar date (in ascending date order). Within each
    day, the per-feature (time x UID) blocks are stacked along a trailing
    feature axis, and the days are then stacked along a new leading axis.

    Parameters:
    df (pandas.DataFrame): Frame with a DatetimeIndex and two-level
        (feature, UID) columns.

    Returns:
    numpy.ndarray: Array indexed as [day, time-within-day, UID, feature].

    Raises:
    ValueError: From numpy.stack if the frame is empty or the days do not all
        contain the same number of rows.
    """
    features = df.columns.levels[0]
    # Group by an explicit array of calendar dates. A callable key such as
    # `lambda ix: ix.date` only works when pandas applies it to the whole
    # DatetimeIndex (where `.date` is a property); applied per element,
    # `Timestamp.date` is a method and the keys become bound-method objects.
    day_keys = df.index.date
    X = np.stack(
        [
            np.stack([g[f].to_numpy() for f in features], axis=-1)
            for _, g in df.groupby(day_keys)
        ]
    )
    return X

In [2]:
# End-to-end demo: generate random long-form data, then convert it step by step
# into a dense numpy array. The name `df` is rebound at every stage, so each
# intermediate result is released as soon as the next one exists.
print("Data formatting examples.")

print("Constructing unstructured long form...")
# Recorded timing (presumably from %timeit):
# 7.09 s ± 219 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
df = make_unstructured_long_form()

# The timer starts only now, so the "total seconds" printed below covers the
# three conversion steps and excludes constructing the random input above.
start = time.perf_counter()
print("Converting to structured long form...")
# 10.4 s ± 138 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
df = unstructured_to_structured_long_form(df)

print("Converting to square dataframe...")
# 5.47 s ± 67.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
df = structured_long_form_to_square_df(df)

print("Converting to square numpy array...")
# 335 ms ± 941 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
# NOTE: from this line on `df` is a numpy array, not a DataFrame.
df = square_df_to_square_numpy(df)
print("total seconds: ", time.perf_counter() - start)
print("Done.")

Data formatting examples.
Constructing unstructured long form...
Converting to structured long form...
Converting to square dataframe...
Converting to square numpy array...
total seconds:  14.547978999995394
Done.


In [3]:
# `df` was rebound to the stacked numpy array at the end of the previous cell,
# so this shows an ndarray repr rather than a DataFrame.
df

array([[[[-0.15772064, -0.98098507],
         [-0.48405897, -0.97477624],
         [ 0.18231006,  2.7115689 ],
         ...,
         [ 0.57240422, -1.545617  ],
         [ 0.19176875, -0.05062415],
         [        nan,         nan]],

        [[-0.4585519 , -0.28106926],
         [ 1.75407845, -0.42523078],
         [-0.51200281,  1.13315976],
         ...,
         [ 0.47687742, -0.7444683 ],
         [-1.00934463, -1.32950269],
         [-1.54194743,  1.28764133]],

        [[ 0.17847358, -0.67475381],
         [ 0.23835649,  1.27995498],
         [ 0.69100629,  2.33093184],
         ...,
         [ 0.24491078,  0.30212223],
         [-0.39396666, -0.04862567],
         [-0.28428688,  0.70574697]],

        ...,

        [[ 1.91531281,  2.32789257],
         [-1.07127177, -1.25052058],
         [-1.21423329, -0.01752425],
         ...,
         [-0.59381532,  0.03440005],
         [ 1.17379708, -0.67411046],
         [ 0.6269321 ,  0.28680459]],

        [[-0.26835707,  0.86249969