# MiniRocket - Data Preparation

Run a few sanity checks on the data, then convert it into nested panels for MiniRocket.

In [None]:
# Imports & paths
import json
import math
import re
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# Filesystem layout: inputs live under DATA_DIR, outputs under its artifacts/ tree
DATA_DIR = Path("..") / "data"
ARTIFACTS_XGB = DATA_DIR.joinpath("artifacts", "xgb")
ARTIFACTS_MINI = DATA_DIR.joinpath("artifacts", "minirocket")
# Create the MiniRocket output folder (and any missing parents); no-op if present
ARTIFACTS_MINI.mkdir(exist_ok=True, parents=True)

# Column roles in the consolidated dataframe (adjust to your schema)
ID_COLS = ["subject", "experiment"]
TARGET_COL, TIME_COL = "label", "Time"

# Number of samples each experiment is resampled to for MiniRocket.
# Use None to keep native lengths (padding/windowing is then needed later).
RESAMPLE_LEN = 256
# Base seed for the subject-level splits
RANDOM_STATE = 42

In [None]:
# Read the consolidated dataset from parquet
df_path = DATA_DIR.joinpath("consolidated", "fall_dataset.parquet")
df = pd.read_parquet(df_path)

# Order rows by experiment identity, then by time, and renumber the index
df = df.sort_values([*ID_COLS, TIME_COL])
df = df.reset_index(drop=True)

print("Loaded rows:", df.shape[0])
print("Columns:", list(df.columns[:8]), "... (+ more)")

# Bare expression: shown as the cell output
df

## Basic Integrity Checks

In [None]:
# Every column that is not time, an identifier, or the target is treated as a sensor channel
EXCLUDE = [TIME_COL, *ID_COLS, TARGET_COL]
SENSOR_COLS = [col for col in df.columns.tolist() if col not in EXCLUDE]
assert SENSOR_COLS, "No sensor columns detected."

print(f"Detected {len(SENSOR_COLS)} sensor channels.")
print("Example channels:", SENSOR_COLS[:8])

In [None]:
# Each (subject, experiment) group must carry exactly one distinct label
lbl_counts = df.groupby(ID_COLS)[TARGET_COL].nunique()
multi_lbl = lbl_counts.loc[lbl_counts.gt(1)]
if not multi_lbl.empty:
    raise ValueError(
        f"Found {len(multi_lbl)} experiments with multiple labels. "
        f"First few: {multi_lbl.head().to_dict()}"
    )

In [None]:
# Time must be non-decreasing within every (subject, experiment) group.
# The dataframe was sorted when it was loaded, so this is a guard on that step.
is_monotonic = all(
    times.is_monotonic_increasing
    for _, times in df.groupby(ID_COLS)[TIME_COL]
)
assert is_monotonic, "Time must be monotonic per (subject, experiment)."

## Splits

In [None]:
# Label <-> integer id maps; ids follow the sorted order of the distinct labels
labels_sorted = sorted(df[TARGET_COL].unique())
label_to_id = dict(zip(labels_sorted, range(len(labels_sorted))))
id_to_label = dict(enumerate(labels_sorted))
print("Label map:", label_to_id)

# One row per (subject, experiment) holding that experiment's integer label id
exp_labels = (
    df.groupby(ID_COLS)[TARGET_COL]
    .first()
    .map(label_to_id)
    .astype(int)
    .reset_index(name="label_id")
)

print("Experiments total:", exp_labels.shape[0])

In [None]:
from sklearn.model_selection import GroupShuffleSplit

# One label per subject: the most frequent label_id among that subject's experiments.
# NOTE(review): GroupShuffleSplit partitions on `groups`; these labels are passed as
# `y` but presumably do not stratify the splits — confirm whether that was intended.
subj_major = (
    exp_labels.groupby("subject")["label_id"]
    .agg(lambda ids: ids.value_counts().index[0])
    .reset_index(name="subject_major_label")
)

subjects = subj_major["subject"].to_numpy()
subj_labels = subj_major["subject_major_label"].to_numpy()

# Stage 1: split subjects into train+val and test (test_size=0.2)
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_STATE)
trainval_idx, test_idx = next(gss.split(subjects, subj_labels, groups=subjects))
trainval_subjects = set(subjects[trainval_idx])
test_subjects = set(subjects[test_idx])

# Stage 2: split the train+val subjects again, group-wise, with a different seed
subj_tv, subj_tv_labels = subjects[trainval_idx], subj_labels[trainval_idx]
gss2 = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_STATE + 1)
train_idx2, val_idx2 = next(gss2.split(subj_tv, subj_tv_labels, groups=subj_tv))
train_subjects = set(subj_tv[train_idx2])
val_subjects = set(subj_tv[val_idx2])

print(
    "Subjects per split:",
    "train:", len(train_subjects),
    "val:", len(val_subjects),
    "test:", len(test_subjects),
)

# Look up the experiments that belong to a given set of subjects
def exps_for_subjects(subj_set):
    """Return an iterator of (subject, experiment) tuples from ``exp_labels``
    whose subject is contained in ``subj_set``."""
    belongs = exp_labels["subject"].isin(subj_set)
    selected = exp_labels[belongs][["subject", "experiment"]]
    return selected.itertuples(index=False, name=None)

# Materialise the (subject, experiment) keys of each split as lists.
# Note: test_idx is rebound here from subject positions to experiment keys.
train_idx, val_idx, test_idx = (
    list(exps_for_subjects(subject_set))
    for subject_set in (train_subjects, val_subjects, test_subjects)
)

print(
    "Experiments per split:",
    "train:", len(train_idx),
    "val:", len(val_idx),
    "test:", len(test_idx),
)

## Build Labels

Build one integer label per (subject, experiment) pair.

In [None]:
# Helper: integer label id for every experiment
def build_labels(df: pd.DataFrame) -> pd.Series:
    """Return a Series named ``label_id`` with one integer id per ID_COLS group.

    The first TARGET_COL value of each group is mapped through ``label_to_id``;
    the result is indexed by a MultiIndex whose levels are named after ID_COLS.
    """
    first_label = df.groupby(ID_COLS)[TARGET_COL].first()
    label_ids = first_label.map(label_to_id).astype(int).rename("label_id")
    # Rebuild the index explicitly so its level names are exactly ID_COLS
    label_ids.index = pd.MultiIndex.from_tuples(label_ids.index, names=ID_COLS)
    return label_ids

# Label ids for every experiment in the full dataframe, indexed by ID_COLS
labels_all = build_labels(df)
# Bare expression: shown as the cell output
labels_all

## Build Nested Panels

In [None]:
# Convert Time to numeric seconds relative to experiment start (robust for datetime or float)
def _to_seconds_relative(s: pd.Series) -> np.ndarray:
    # If datetime-like, convert; else assume numeric
    if np.issubdtype(s.dtype, np.datetime64):
        s = s.view("int64") / 1e9  # ns -> s
    return (s - s.iloc[0]).astype(float).to_numpy()

# Helper: resample a single experiment into fixed-length multivariate series
def resample_experiment(exp_df: pd.DataFrame, resample_len: int) -> dict:
    """Resample every sensor channel of one experiment onto a uniform grid.

    Time is made relative to the first sample and normalised to [0, 1]; each
    channel in SENSOR_COLS is then linearly interpolated at ``resample_len``
    evenly spaced points of that interval.

    Parameters
    ----------
    exp_df : pd.DataFrame
        Rows of a single experiment; must contain TIME_COL and all SENSOR_COLS.
    resample_len : int
        Number of samples in each output series.

    Returns
    -------
    dict
        ``{channel_name: pd.Series}``; each Series has length ``resample_len``
        and a RangeIndex named ``"t"``.

    Raises
    ------
    ValueError
        If ``exp_df`` has no rows.
    """
    if len(exp_df) == 0:
        raise ValueError("Cannot resample an experiment with no rows.")

    t_sec = _to_seconds_relative(exp_df[TIME_COL])
    duration = t_sec[-1]
    # Zero duration (a single row, or all rows sharing one timestamp) leaves no
    # time axis to interpolate along. Handle it explicitly instead of handing
    # np.interp a constant x-axis: the channel is held at its last sample.
    degenerate = len(t_sec) == 1 or duration == 0
    if not degenerate:
        t_norm = t_sec / duration
        grid = np.linspace(0.0, 1.0, resample_len, endpoint=True)

    out = {}
    for ch in SENSOR_COLS:
        values = exp_df[ch].to_numpy(dtype=float)
        # Fill gaps: forward-fill, then back-fill leading NaNs, then zeros if
        # the whole channel is missing.
        if np.isnan(values).any():
            values = pd.Series(values).ffill().bfill().fillna(0.0).to_numpy()
        if degenerate:
            resampled = np.full(resample_len, values[-1], dtype=float)
        else:
            resampled = np.interp(grid, t_norm, values)
        out[ch] = pd.Series(resampled, index=pd.RangeIndex(resample_len, name="t"))
    return out

# Helper: pack panel into sktime nested DataFrame
def pack_nested(panels: list) -> pd.DataFrame:
    """Stack per-experiment panels into one nested DataFrame.

    Parameters
    ----------
    panels : list
        List of dicts ``{channel: pd.Series, ...}``, one per experiment. The
        channel names are taken from the first dict; every other dict must
        contain the same keys.

    Returns
    -------
    pd.DataFrame
        One row per panel and one column per channel, each cell holding that
        experiment's Series for the channel. An empty ``panels`` list yields
        an empty DataFrame instead of raising.

    Raises
    ------
    KeyError
        If a later panel lacks a channel present in the first one.
    """
    if not panels:
        # An empty split is valid input; there are no channels to infer.
        return pd.DataFrame()
    channels = list(panels[0])
    return pd.DataFrame({ch: [panel[ch] for panel in panels] for ch in channels})

# Helper: replace NaNs in one channel (forward-fill, back-fill, then zeros)
def _fill_channel_gaps(values: np.ndarray) -> np.ndarray:
    """Return ``values`` with NaNs filled; a clean input is returned unchanged."""
    if not np.isnan(values).any():
        return values
    return pd.Series(values).ffill().bfill().fillna(0.0).to_numpy()


# Helper: panel for one experiment at its native length (no resampling)
def _native_length_panel(exp_df: pd.DataFrame) -> dict:
    """Return ``{channel: pd.Series}`` for SENSOR_COLS, indexed 0..T-1 as ``"t"``."""
    row_index = pd.RangeIndex(len(exp_df), name="t")
    return {
        ch: pd.Series(
            _fill_channel_gaps(exp_df[ch].to_numpy(dtype=float)), index=row_index
        )
        for ch in SENSOR_COLS
    }


# Build nested panels per split
def build_split_nested(df_all: pd.DataFrame, idx: pd.MultiIndex, resample_len: int):
    """Build the nested panel frame and label vector for one split.

    Parameters
    ----------
    df_all : pd.DataFrame
        Full long-format dataframe containing ID_COLS, TARGET_COL, TIME_COL
        and SENSOR_COLS.
    idx : pd.MultiIndex or sequence of tuples
        The (subject, experiment) keys of the split. It is iterated twice, so
        it must not be a one-shot iterator. Output rows follow this order.
    resample_len : int or None
        Fixed length to resample each experiment to, or None to keep each
        experiment at its native length (padding/windowing is then required
        before MiniRocket).

    Returns
    -------
    (pd.DataFrame, pd.Series)
        The nested frame (one row per experiment found, one column per
        channel) and the matching integer ``label_id`` Series.

    Notes
    -----
    Keys in ``idx`` that have no rows in ``df_all`` are skipped, and a warning
    lists them so that the omission is visible.
    """
    import warnings

    # Keep only the groups this split needs instead of materialising all of them.
    wanted = set(idx)
    groups = {
        key: g
        for key, g in df_all.groupby(ID_COLS, sort=False)
        if key in wanted
    }

    ordered_panels = []
    ordered_y = []
    missing_groups = []
    for key in idx:
        g = groups.get(key)
        if g is None:
            missing_groups.append(key)
            continue
        if resample_len is not None:
            panel = resample_experiment(g, resample_len=resample_len)
        else:
            panel = _native_length_panel(g)
        ordered_panels.append(panel)
        # The label is constant within an experiment, so the first row suffices.
        ordered_y.append(label_to_id[g[TARGET_COL].iloc[0]])

    if missing_groups:
        warnings.warn(
            f"{len(missing_groups)} requested experiment(s) not found in df_all "
            f"and skipped. First few: {missing_groups[:5]}",
            stacklevel=2,
        )

    if ordered_panels:
        nested = pack_nested(ordered_panels)
    else:
        # Empty split: keep the expected channel columns, with zero rows.
        nested = pd.DataFrame(columns=list(SENSOR_COLS))
    y_ser = pd.Series(ordered_y, name="label_id", dtype=int)
    return nested, y_ser

print("Building nested panels (this may take a minute depending on data size)...")
# Same builder for each split, evaluated in train -> val -> test order
(train_nested, y_train), (val_nested, y_val), (test_nested, y_test) = (
    build_split_nested(df, split_keys, RESAMPLE_LEN)
    for split_keys in (train_idx, val_idx, test_idx)
)

print("Nested shapes (rows, channels):")
print(
    "  train:", train_nested.shape,
    "  val:", val_nested.shape,
    "  test:", test_nested.shape,
)
print("Each cell is a pd.Series of length:", RESAMPLE_LEN or "(native length)")

In [None]:
# Bare expression: shows the training panel frame as the cell output
train_nested

## Persist Artifacts

In [None]:
# Nested panels: pickled, since each cell holds a pd.Series object
train_nested.to_pickle(ARTIFACTS_MINI / "train_nested.pkl")
val_nested.to_pickle(ARTIFACTS_MINI / "val_nested.pkl")
test_nested.to_pickle(ARTIFACTS_MINI / "test_nested.pkl")

# Label vectors: single-column parquet files, row order matching the panels
y_train.to_frame().to_parquet(ARTIFACTS_MINI / "train_y.parquet", index=False)
y_val.to_frame().to_parquet(ARTIFACTS_MINI / "val_y.parquet", index=False)
y_test.to_frame().to_parquet(ARTIFACTS_MINI / "test_y.parquet", index=False)


def _to_builtin(value):
    """Convert a NumPy scalar to the equivalent plain Python value.

    ``json`` cannot encode NumPy scalars (e.g. labels taken from a numeric
    column); other values are returned unchanged.
    """
    return value.item() if isinstance(value, np.generic) else value


# Splits file
splits = {
    "label_to_id": {_to_builtin(lbl): int(i) for lbl, i in label_to_id.items()},
    "id_to_label": {int(i): _to_builtin(lbl) for i, lbl in id_to_label.items()},
    "train_idx": train_idx,
    "val_idx": val_idx,
    "test_idx": test_idx,
    "split_strategy": {
        "group": "subject",
        "method": "GroupShuffleSplit(train/val/test)",
        "test_size_subjects": 0.2,
        "val_size_subjects_in_trainval": 0.2,
        "random_state": RANDOM_STATE,
        # The train/val split uses its own seed; record it for reproducibility.
        "val_random_state": RANDOM_STATE + 1,
    },
}
# ensure_ascii=False writes non-ASCII labels verbatim, so pin the encoding
# rather than relying on the platform default.
with open(ARTIFACTS_MINI / "splits.json", "w", encoding="utf-8") as f:
    json.dump(splits, f, ensure_ascii=False, indent=2)

# Manifest for the modeling notebook
manifest = {
    "version": 1,
    "time_col": TIME_COL,
    "id_cols": ID_COLS,
    "target_col": TARGET_COL,
    "sensor_channels": SENSOR_COLS,
    "resample_len": RESAMPLE_LEN,
    "paths": {
        "train_nested": str(ARTIFACTS_MINI / "train_nested.pkl"),
        "val_nested": str(ARTIFACTS_MINI / "val_nested.pkl"),
        "test_nested": str(ARTIFACTS_MINI / "test_nested.pkl"),
        "train_y": str(ARTIFACTS_MINI / "train_y.parquet"),
        "val_y": str(ARTIFACTS_MINI / "val_y.parquet"),
        "test_y": str(ARTIFACTS_MINI / "test_y.parquet"),
        "splits": str(ARTIFACTS_MINI / "splits.json"),
    },
}
with open(ARTIFACTS_MINI / "manifest.json", "w", encoding="utf-8") as f:
    json.dump(manifest, f, ensure_ascii=False, indent=2)

print("\n=== MiniRocket DATA PREP — DONE ===")
print("Artifacts saved in:", ARTIFACTS_MINI)