In [1]:
import os
import numpy as np
import pandas as pd

def long_df_to_npz_data_key(
    df: pd.DataFrame,
    out_name: str,
    data_dir: str = "./timevae_data",
    pad_value: float = 0.0,
) -> str:
    """Convert a long-format series table into a padded, compressed ``.npz`` file.

    Rows are grouped by ``unique_id`` and ordered by ``ds`` within each group.
    Every group's ``y`` values become one row of a dense array that is
    right-padded with ``pad_value`` up to the longest group's length.

    Arrays written to the archive:
        data      -- float32, shape (N, T, 1); the padded ``y`` values.
        mask      -- float32, shape (N, T, 1); 1.0 on real steps, 0.0 on padding.
        lengths   -- int32, shape (N,); number of real steps per series.
        unique_id -- object, shape (N,); group keys in the same order as ``data``.
        ds_list   -- object, shape (N,); element ``i`` is the datetime64[ns]
                     array of timestamps of series ``i``.

    ``unique_id`` and ``ds_list`` are object arrays, so the file must be read
    back with ``np.load(path, allow_pickle=True)``.

    Args:
        df: Table containing at least the columns ``unique_id``, ``ds`` and
            ``y``. Other columns are ignored; the caller's frame is not modified.
        out_name: File name without the ``.npz`` extension.
        data_dir: Output directory; created if it does not exist.
        pad_value: Fill value for positions past the end of a series.

    Returns:
        Path of the written ``.npz`` file.

    Raises:
        ValueError: If a required column is missing, if any ``ds`` value cannot
            be parsed as a datetime, or if the table yields no groups.
    """
    required = {"unique_id", "ds", "y"}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {sorted(missing)}")

    # Work on a private copy so the caller's DataFrame is left untouched.
    df = df.loc[:, ["unique_id", "ds", "y"]].copy()
    df["ds"] = pd.to_datetime(df["ds"], errors="coerce")
    if df["ds"].isna().any():
        raise ValueError("Some 'ds' values could not be parsed as datetimes.")
    # Stable sort: rows sharing (unique_id, ds) keep their input order.
    df = df.sort_values(["unique_id", "ds"], kind="mergesort")

    groups = list(df.groupby("unique_id", sort=False))
    if not groups:
        raise ValueError("No samples found.")

    N = len(groups)
    unique_ids = np.array([gid for gid, _ in groups], dtype=object)
    series_list = [g["y"].to_numpy(dtype=np.float32, copy=False) for _, g in groups]

    # Fill a preallocated 1-D object array slot by slot. Building it with
    # np.array([...], dtype=object) would collapse equal-length series into a
    # 2-D (N, T) object array, so the stored layout would depend on the data.
    ds_list = np.empty(N, dtype=object)
    for i, (_, g) in enumerate(groups):
        ds_list[i] = g["ds"].to_numpy(dtype="datetime64[ns]")

    lengths = np.array([len(s) for s in series_list], dtype=np.int32)
    T = int(lengths.max())
    F = 1  # single feature channel: only the 'y' column is exported

    data = np.full((N, T, F), pad_value, dtype=np.float32)
    mask = np.zeros((N, T, 1), dtype=np.float32)

    for i, s in enumerate(series_list):
        L = len(s)
        data[i, :L, 0] = s
        mask[i, :L, 0] = 1.0

    os.makedirs(data_dir, exist_ok=True)
    out_path = os.path.join(data_dir, f"{out_name}.npz")
    np.savez_compressed(
        out_path,
        data=data,
        mask=mask,
        lengths=lengths,
        unique_id=unique_ids,
        ds_list=ds_list,
    )
    return out_path

In [3]:
import pandas as pd

# Load the long-format table and drop any column whose name starts with
# "Unnamed" (presumably an index column left behind by an earlier to_csv).
df_long = pd.read_csv("m3_m.csv")
df_long = df_long.loc[:, ~df_long.columns.str.contains("^Unnamed")]
df_long["ds"] = pd.to_datetime(df_long["ds"])

# NOTE(review): the input file is "m3_m.csv" but the archive is written as
# "m3_q" -- confirm which name is intended. The name is kept unchanged here
# because other cells may load npz_data_converted/m3_q.npz.
path = long_df_to_npz_data_key(df_long, "m3_q", data_dir="npz_data_converted")
print(path)

npz_data_converted/m3_q.npz
