# XGBoost - Data Preparation

In [None]:
# Imports 
import json, math, pathlib
import numpy as np
import pandas as pd
from scipy.stats import iqr, skew, kurtosis, entropy as shannon_entropy
from numpy.fft import rfft, rfftfreq
from sklearn.model_selection import GroupShuffleSplit

In [None]:
# Filesystem locations: the data root and the output folder for XGBoost artifacts
DATA_DIR = pathlib.Path("../data")
ARTIFACTS = DATA_DIR / "artifacts/xgb"
# Create the output folder with any missing parents; a no-op if it already exists
ARTIFACTS.mkdir(exist_ok=True, parents=True)

In [None]:
# Column-role constants shared by the cells below
ID_COLS = ["subject", "experiment"]  # keys used to group rows into one recording
TARGET_COL = "label"  # column holding the class label
EXCLUDE = ["Time", "subject", "label", "experiment"]  # columns that are not sensor channels


In [None]:
# Sampling rate in Hz passed to the spectral features; assumed constant
# (recordings were observed at ~128 Hz). Change it here if that does not hold.
FS: float = 128.0


## Load Consolidated Dataset

In [None]:
# Read the consolidated dataframe written by an earlier step and, defensively,
# order it by time inside each (subject, experiment) run.
#    Expected columns: Time, subject, label, experiment, <sensor...>
df = (
    pd.read_parquet(DATA_DIR / "consolidated" / "fall_dataset.parquet")
    .sort_values(["subject", "experiment", "Time"])
    .reset_index(drop=True)
)

# Everything that is not listed in EXCLUDE is treated as a sensor channel
SENSOR_COLS = [name for name in df.columns if name not in EXCLUDE]

df

## Feature Engineering Functions

In [None]:
def zero_crossing_rate(x: np.ndarray) -> float:
    """Return the fraction of adjacent sample pairs whose sign bit differs.

    The number of sign flips is divided by ``len(x) - 1``, clamped to at
    least 1 so that empty and single-sample inputs give 0.0.
    """
    negative = np.signbit(x)
    # XOR of each sample's sign bit with its successor's marks a crossing
    flips = np.logical_xor(negative[:-1], negative[1:])
    pair_count = max(len(x) - 1, 1)
    return float(np.count_nonzero(flips)) / pair_count

def rms(x: np.ndarray) -> float:
    """Return the root-mean-square of ``x``, or 0.0 for empty input.

    The samples are converted to float before squaring. Squaring in the
    input dtype silently overflows for narrow integer arrays (e.g. int16
    sensor counts), which would yield NaN or a wrong value.
    """
    values = np.asarray(x, dtype=float)
    if values.size == 0:
        return 0.0
    return float(np.sqrt(np.mean(np.square(values))))

def energy(x: np.ndarray) -> float:
    """Return the sum of squared samples of ``x`` (not normalized by length).

    The samples are converted to float before squaring so that narrow
    integer inputs (e.g. int16) cannot overflow during the square.
    """
    values = np.asarray(x, dtype=float)
    return float(np.sum(np.square(values)))

def spectral_feats(x: np.ndarray, fs: float) -> dict:
    """Compute simple magnitude-spectrum features of a 1-D signal.

    Parameters
    ----------
    x : samples of the signal.
    fs : sampling rate in Hz, used to label the FFT bins.

    Returns
    -------
    dict with ``"spec_centroid"`` (amplitude-weighted mean frequency) and one
    ``"bp_<lo>_<hi>"`` entry per band holding the summed FFT magnitude of the
    bins in that band. Bands are 0-2, 2-5, 5-10 and 10-``fs/2`` Hz.
    """
    # Magnitude of the one-sided (real) FFT and the frequency of each bin
    magnitude = np.abs(rfft(x))
    freqs = rfftfreq(len(x), d=1.0 / fs)

    # The small constant keeps the centroid finite for an all-zero spectrum
    spec_sum = np.sum(magnitude) + 1e-12
    centroid = float(np.sum(freqs * magnitude) / spec_sum)

    # simple bandpowers (0–2Hz, 2–5Hz, 5–10Hz, >10Hz) – tune as needed
    bands = [(0, 2), (2, 5), (5, 10), (10, fs / 2)]
    last = len(bands) - 1
    bp = {}
    for i, (lo, hi) in enumerate(bands):
        if i == last:
            # The top band is open-ended upwards. A strict `< fs/2` bound
            # would drop the Nyquist bin that even-length inputs produce.
            mask = freqs >= lo
        else:
            mask = (freqs >= lo) & (freqs < hi)
        bp[f"bp_{lo}_{hi}"] = float(np.sum(magnitude[mask]))
    return {"spec_centroid": centroid, **bp}

def entropy_hist(x: np.ndarray, bins: int = 32) -> float:
    """Return the base-2 Shannon entropy of a ``bins``-bin histogram of ``x``.

    The histogram is renormalized to sum to one. A small constant guards the
    division and is added to each bin before the entropy is computed.
    """
    eps = 1e-12
    density, _edges = np.histogram(x, bins=bins, density=True)
    total = np.sum(density) + eps
    probs = density / total
    return float(shannon_entropy(probs + eps, base=2))

def summarize_1d(x: np.ndarray, fs: float) -> dict:
    """Summarize one signal as a flat dict of scalar features.

    The signal is cast to float, then described by time-domain statistics
    (mean, std, min, max, median, iqr, skew, kurtosis, rms, energy, zero
    crossing rate, histogram entropy) followed by the entries returned by
    ``spectral_feats``. Statistics that need more samples than are available
    fall back to 0.0.
    """
    signal = x.astype(float)
    n = len(signal)

    feats = {}
    feats["mean"] = float(np.mean(signal))
    # Sample standard deviation (ddof=1) needs at least two samples
    feats["std"] = float(np.std(signal, ddof=1)) if n > 1 else 0.0
    feats["min"] = float(np.min(signal))
    feats["max"] = float(np.max(signal))
    feats["median"] = float(np.median(signal))
    feats["iqr"] = float(iqr(signal)) if n > 1 else 0.0
    feats["skew"] = float(skew(signal)) if n > 2 else 0.0
    # fisher=True reports excess kurtosis (a normal distribution gives 0)
    feats["kurtosis"] = float(kurtosis(signal, fisher=True)) if n > 3 else 0.0
    feats["rms"] = rms(signal)
    feats["energy"] = energy(signal)
    feats["zcr"] = zero_crossing_rate(signal)
    feats["entropy"] = entropy_hist(signal, bins=32)

    # Frequency-domain features are appended after the time-domain ones
    return {**feats, **spectral_feats(signal, fs=fs)}

## Process Dataset

In [None]:

# Collapse every (subject, experiment) recording into one row of engineered
# features, computed independently for each sensor column
rows = []
labels = []
meta = []

grouped = df.groupby(ID_COLS, sort=False)
for (subject, experiment), g in grouped:
    # g keeps the time ordering established by the earlier sort;
    # short runs are neither filtered nor padded here
    feature_dict = {"subject": subject, "experiment": experiment}

    for col in SENSOR_COLS:
        feats = summarize_1d(g[col].to_numpy(), fs=FS)
        # namespace each feature as "<sensor>__<feature>"
        feature_dict.update({f"{col}__{k}": v for k, v in feats.items()})

    rows.append(feature_dict)
    # the label is taken from the first row; it is expected to be constant
    # within an experiment
    labels.append(g[TARGET_COL].iloc[0])
    meta.append(
        dict(
            subject=subject,
            experiment=experiment,
            n_samples=int(g.shape[0]),
            time_start=float(g["Time"].iloc[0]),
            time_end=float(g["Time"].iloc[-1]),
        )
    )

# Features, labels and per-run metadata all share the (subject, experiment) index
X = pd.DataFrame(rows).set_index(["subject", "experiment"])
y = pd.Series(labels, index=X.index, name="label")
metadata = pd.DataFrame(meta).set_index(["subject", "experiment"])

# Map each distinct label to an integer id (ids follow the sorted label order)
# and keep both directions of the mapping
label_to_id = {name: code for code, name in enumerate(sorted(y.unique()))}
# iterating the dict yields the labels in insertion order, i.e. by ascending id
id_to_label = dict(enumerate(label_to_id))
y_encoded = y.map(label_to_id).astype(int)

# Hold out a test set grouped by subject, so that all experiments of a subject
# land on the same side of the split (avoids leakage)
subjects = X.index.get_level_values("subject")
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y_encoded, groups=subjects))

# Carve a validation set out of the remaining training rows, again by subject
train_subjects = subjects.to_numpy()[train_idx]
gss2 = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=43)
tr2_idx, val_idx = next(
    gss2.split(
        X.iloc[train_idx],
        y_encoded.iloc[train_idx],
        groups=train_subjects,
    )
)
# tr2_idx / val_idx are positions within train_idx; translate them back to
# positions in X
final_train_idx = train_idx[tr2_idx]
final_val_idx = train_idx[val_idx]

# Everything needed to reproduce the split, stored as (subject, experiment) keys
splits = dict(
    label_to_id=label_to_id,
    id_to_label=id_to_label,
    train_idx=X.index[final_train_idx].tolist(),
    val_idx=X.index[final_val_idx].tolist(),
    test_idx=X.index[test_idx].tolist(),
    strategy="grouped_by_subject",
)

## Check for Split Intersections

In [None]:
def to_set(tlist):
    """Return the elements of ``tlist`` as a set, each converted to a tuple."""
    return {tuple(item) for item in tlist}

In [None]:
# Turn each stored split into a set of (subject, experiment) tuples
train_idxs, val_idxs, test_idxs = (
    to_set(splits[name]) for name in ("train_idx", "val_idx", "test_idx")
)

In [None]:
# Print every index entry shared by two splits; no entries under a heading
# means the two splits are disjoint.
# NOTE(review): this compares (subject, experiment) entries, so on its own it
# does not show whether the *subjects* of two splits are disjoint.

# Train x Validation
print("Train X Validation intersections:")
for idx in train_idxs:
    if idx not in val_idxs:
        continue
    print(idx)

# Train x Test
print("Train X Test intersections:")
for idx in train_idxs:
    if idx not in test_idxs:
        continue
    print(idx)

# Validation x Test
print("Validation X Test intersections:")
for idx in val_idxs:
    if idx not in test_idxs:
        continue
    print(idx)

## Persist Artifacts

In [None]:
# Persist artifacts
# Feature matrix, written together with its (subject, experiment) index
X.to_parquet(ARTIFACTS / "X.parquet", index=True)
# Integer-encoded labels as a single "label_id" column, indexed like X
y_encoded.to_frame("label_id").to_parquet(ARTIFACTS / "y.parquet", index=True)
# Per-run bookkeeping: n_samples, time_start, time_end
metadata.to_parquet(ARTIFACTS / "metadata.parquet", index=True)
# Label mappings and split membership
with open(ARTIFACTS / "splits.json", "w") as f:
    # NOTE: JSON object keys are always strings, so the integer keys of
    # splits["id_to_label"] come back as "0", "1", ... when the file is
    # reloaded, and the index tuples come back as lists.
    json.dump(splits, f, indent=2)

# Quick summary of what was written
print("Saved:", list(ARTIFACTS.iterdir()))
print("Shapes → X:", X.shape, " y:", y_encoded.shape)
print("Class counts:", y_encoded.value_counts().sort_index().to_dict())
