# Data Exploration

In [None]:
import pandas as pd
from pathlib import Path

In [None]:
## Load Data

In [None]:

# Load the consolidated parquet file; the relative path is resolved against
# the current working directory.
dataset_path = Path("../data/consolidated/fall_dataset.parquet")
df_dataset = pd.read_parquet(dataset_path)
df_dataset

## Number of samples by class 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.ticker import ScalarFormatter

# Step 1: number of rows recorded for every (subject, label, experiment) group.
group_sizes = df_dataset.groupby(["subject", "label", "experiment"]).size()
counts = group_sizes.reset_index(name="count")

# Step 2: per-label distribution of those row counts. The keys become the
# output column names (pandas named aggregation).
count_aggs = {
    "n_experiments": "count",
    "min": "min",
    "p05": lambda s: s.quantile(0.05),
    "p25": lambda s: s.quantile(0.25),
    "median": "median",
    "p75": lambda s: s.quantile(0.75),
    "p95": lambda s: s.quantile(0.95),
    "max": "max",
    "mean": "mean",
    "std": "std",
}
summary_counts = counts.groupby("label")["count"].agg(**count_aggs).reset_index()

# Round every statistic column (everything except the label) for display.
num_cols = summary_counts.columns.drop("label")
summary_counts[num_cols] = summary_counts[num_cols].round(decimals=3)

print("Summary statistics of samples per experiment:")
print(summary_counts.to_string(index=False))

# 3) Faceted histograms per class (zoomed, no x-axis offset)
g = sns.FacetGrid(counts, col="label", sharey=False, sharex=False, height=4)
g.map_dataframe(sns.histplot, x="count", bins=30)

for ax in g.axes.flat:
    # Print plain tick values instead of "base + offset" notation.
    ax.xaxis.set_major_formatter(ScalarFormatter(useOffset=False))

    # The short "{col_name}" titles are only applied after this loop, so the
    # title can still read "label = <name>" here. Compare on the bare class
    # name; matching the raw title made the zoom branches miss.
    title = ax.get_title()
    lab = title.split(" = ")[-1]
    data_this = counts.loc[counts["label"] == lab, "count"]

    # sensible zooms similar to the duration plot:
    # Tight classes (Falls/Near_Falls) around ~1921; ADLs can have longer tails.
    if lab in ("Falls", "Near_Falls"):
        ax.set_xlim(1890, 1960)   # ~1921 ± ~40
    elif lab == "ADLs" and not data_this.empty:
        # show full spread with a little padding
        lo = max(0, data_this.min() - 25)
        hi = data_this.max() + 25
        ax.set_xlim(lo, hi)

g.set_axis_labels("Number of samples per experiment", "Count of experiments")
g.set_titles(col_template="{col_name}")
plt.suptitle("Histogram of Sample Counts per Experiment by Class", y=1.05)
plt.tight_layout()
plt.show()


## Duration of Experiments

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import ScalarFormatter

# First and last timestamp of every (subject, label, experiment) group.
time_bounds = df_dataset.groupby(["subject", "label", "experiment"])["Time"].agg(
    min_t="min", max_t="max"
)
durations = time_bounds.reset_index()

# Time is treated as microseconds here: the span is divided by 1e6 to get seconds.
durations["duration_sec"] = durations["max_t"].sub(durations["min_t"]).div(1e6)

# ---- Summary table ----
# Keys become the output column names (pandas named aggregation).
duration_aggs = {
    "count": "count",
    "min": "min",
    "p05": lambda s: s.quantile(0.05),
    "p25": lambda s: s.quantile(0.25),
    "median": "median",
    "p75": lambda s: s.quantile(0.75),
    "p95": lambda s: s.quantile(0.95),
    "max": "max",
    "mean": "mean",
    "std": "std",
}
summary_dur = (
    durations.groupby("label")["duration_sec"].agg(**duration_aggs).reset_index()
)

# Round every statistic column (everything except the label) for display.
num_cols = summary_dur.columns.drop("label")
summary_dur[num_cols] = summary_dur[num_cols].round(decimals=4)

print("Summary statistics of experiment duration (seconds):")
print(summary_dur.to_string(index=False))

# Plot: one histogram of experiment durations per class
g = sns.FacetGrid(durations, col="label", sharey=False, sharex=False, height=4)
g.map_dataframe(sns.histplot, x="duration_sec", bins=30)

# turn off scientific offset and optionally zoom
for ax in g.axes.flat:
    ax.xaxis.set_major_formatter(ScalarFormatter(useOffset=False))

    # The short "{col_name}" titles are only applied after this loop, so the
    # title can still read "label = <name>" here. Compare on the bare class
    # name; matching the raw title meant no zoom was ever applied.
    title = ax.get_title()
    lab = title.split(" = ")[-1]

    if lab in ("Falls", "Near_Falls"):
        ax.set_xlim(14.995, 15.010)   # ~15.0 to 15.01 s window
    elif lab == "ADLs":
        ax.set_xlim(14.9, 20.1)

g.set_axis_labels("Duration (seconds)", "Count of experiments")
g.set_titles(col_template="{col_name}")
plt.suptitle("Histogram of Experiment Durations per Class", y=1.05)
plt.tight_layout()
plt.show()



## Sample Frequency

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter

# 1) Per-experiment sampling stats
def _per_experiment_freq(g):
    """Summarise the spacing between consecutive ``Time`` values of one group.

    ``g`` is the sub-DataFrame for a single (subject, label, experiment) group
    and must contain a ``Time`` column. The ``*_us`` names and the ``1e6``
    factor follow the notebook's convention that ``Time`` is in microseconds.

    Returns a ``pd.Series`` with, in order: ``n_samples``, ``n_intervals``,
    ``median_dt_us``, ``mean_dt_us``, ``p05_dt_us``, ``p95_dt_us``, ``cv_dt``
    (std / mean of the intervals), ``large_gap_ratio`` (share of intervals
    above 1.5x the median) and ``freq_hz`` (1e6 / median interval). When no
    strictly positive interval exists, every interval statistic is NaN and
    ``n_intervals`` is 0.
    """
    ordered = g.sort_values("Time")
    steps = ordered["Time"].diff().dropna()
    # Keep strictly positive steps only; duplicated or out-of-order
    # timestamps would otherwise distort the statistics.
    steps = steps[steps > 0]

    stats = {"n_samples": len(ordered), "n_intervals": len(steps)}
    interval_metrics = (
        "median_dt_us",
        "mean_dt_us",
        "p05_dt_us",
        "p95_dt_us",
        "cv_dt",
        "large_gap_ratio",
        "freq_hz",
    )

    if len(steps) == 0:
        stats.update(dict.fromkeys(interval_metrics, np.nan))
        return pd.Series(stats)

    typical_step = steps.median()
    average_step = steps.mean()
    stats.update(
        median_dt_us=typical_step,
        mean_dt_us=average_step,
        p05_dt_us=steps.quantile(0.05),
        p95_dt_us=steps.quantile(0.95),
        cv_dt=(steps.std() / average_step) if average_step > 0 else np.nan,
        large_gap_ratio=(steps > 1.5 * typical_step).mean(),
        freq_hz=1e6 / typical_step if typical_step > 0 else np.nan,
    )
    return pd.Series(stats)

# One row of sampling statistics per (subject, label, experiment) group.
experiment_groups = df_dataset.groupby(
    ["subject", "label", "experiment"], as_index=False
)
freq_per_exp = experiment_groups.apply(_per_experiment_freq).reset_index(drop=True)

# 2) Summary table per class (with percentiles, like before)
# Keys become the output column names (pandas named aggregation).
freq_aggs = {
    "n_experiments": "count",
    "min": "min",
    "p05": lambda s: s.quantile(0.05),
    "p25": lambda s: s.quantile(0.25),
    "median": "median",
    "p75": lambda s: s.quantile(0.75),
    "p95": lambda s: s.quantile(0.95),
    "max": "max",
    "mean": "mean",
    "std": "std",
}
summary_freq = freq_per_exp.groupby("label")["freq_hz"].agg(**freq_aggs).reset_index()

# Round every statistic column (everything except the label) for display.
num_cols = summary_freq.columns.drop("label")
summary_freq[num_cols] = summary_freq[num_cols].round(decimals=3)

print("Sampling frequency (Hz) per class — summary:")
print(summary_freq.to_string(index=False))

# 3) One histogram of per-experiment sampling frequency for each class
g = sns.FacetGrid(freq_per_exp, col="label", sharey=False, sharex=False, height=4)
g.map_dataframe(sns.histplot, x="freq_hz", bins=30)

# Plain tick labels (no offset notation) and an x-range clipped to the
# 5th-95th percentile band of each class, plus a little padding.
for ax in g.axes.flat:
    ax.xaxis.set_major_formatter(ScalarFormatter(useOffset=False))

    # Titles may still carry a "label = " prefix at this point; keep the name only.
    lab = ax.get_title().split(" = ")[-1]
    in_class = freq_per_exp["label"] == lab
    x = freq_per_exp.loc[in_class, "freq_hz"].dropna().values
    if len(x) == 0:
        continue

    q5, q95 = np.percentile(x, [5, 95])
    band_width = q95 - q5
    pad = max(0.05 * band_width, 0.25)  # at least 0.25 Hz on each side
    ax.set_xlim(q5 - pad, q95 + pad)

g.set_axis_labels("Sampling frequency (Hz)", "Count of experiments")
g.set_titles(col_template="{col_name}")
plt.suptitle("Histogram of Sampling Frequency per Class", y=1.05)
plt.tight_layout()
plt.show()



## Sensor behavior

Plotting every sensor value over time to check whether there is a pattern across the 3 classes.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# =========================
# Config (tweak as needed)
# =========================
# Columns treated as metadata; they are excluded when sensor columns are picked.
META_COLS = {"Time", "label", "experiment", "subject"}
# Preferred left-to-right panel order. Labels missing from the data are skipped
# and labels not listed here are appended after these.
LABEL_ORDER = ["Falls", "Near_Falls", "ADLs"]  # will auto-fallback if any is missing
ALPHA = 0.05                     # opacity of each scatter point
MARKER_SIZE = 0.7                # scatter marker size (matplotlib `s` argument)
MAX_POINTS_PER_PANEL = 200_000   # per (sensor, label) subplot; auto-downsamples if exceeded
STANDARDIZE = False              # z-score per sensor (helps when different units/scales)
RANDOM_SEED = 42                 # for reproducible downsampling
X_PCTL = 0.95                    # use 95th percentile duration per label for x-limit

# Shared generator used for the random downsampling; seeded for repeatable plots.
rng = np.random.default_rng(RANDOM_SEED)

# -------------------------
# Identify sensor columns
# -------------------------
def _is_sensor_column(name):
    """Return True for a non-metadata column whose dtype is a NumPy numeric type."""
    return name not in META_COLS and np.issubdtype(df_dataset[name].dtype, np.number)


sensor_cols = [name for name in df_dataset.columns if _is_sensor_column(name)]
if len(sensor_cols) == 0:
    raise ValueError("No numeric sensor columns found (after excluding meta cols).")

# -------------------------
# Duration of every (subject, label, experiment) group, in seconds
# (Time is treated as microseconds: the span is divided by 1e6)
# -------------------------
experiment_time = df_dataset.groupby(["subject", "label", "experiment"])["Time"]
exp_dur = experiment_time.agg(min_t="min", max_t="max").reset_index()
exp_dur["duration_sec"] = exp_dur["max_t"].sub(exp_dur["min_t"]).div(1e6)

# X_PCTL quantile of the durations for each label; used later as the x-limit.
duration_quantiles = exp_dur.groupby("label")["duration_sec"].quantile(X_PCTL)
dur_by_label = duration_quantiles.to_dict()

# -------------------------
# Per-sensor (mean, std) pairs used when STANDARDIZE is on
# -------------------------
def _mean_and_scale(values):
    """Return (nan-aware mean, nan-aware std), with a falsy std replaced by 1.0.

    NOTE: a NaN std is truthy, so it is returned unchanged rather than replaced.
    """
    return np.nanmean(values), np.nanstd(values) or 1.0


if STANDARDIZE:
    sensor_stats = {s: _mean_and_scale(df_dataset[s].to_numpy()) for s in sensor_cols}
else:
    # Identity transform: subtract 0, divide by 1.
    sensor_stats = dict.fromkeys(sensor_cols, (0.0, 1.0))

# -------------------------
# Utility: build (x,y) arrays for a label and sensor with downsampling
# -------------------------
def collect_xy_for_label_sensor(df_lab: pd.DataFrame, sensor: str):
    """
    Gather the scatter coordinates of one sensor across all experiments in ``df_lab``.

    Each (subject, label, experiment) group is sorted by ``Time``; x is the time
    elapsed since the group's first row divided by 1e6 (seconds, given that
    ``Time`` is in microseconds) and y is the value of ``sensor``. Groups with
    fewer than two rows are skipped and pairs with a non-finite x or y are
    dropped.

    When more than MAX_POINTS_PER_PANEL points remain, a random subset of that
    size is drawn without replacement using the module-level ``rng``. With
    STANDARDIZE enabled, y is shifted and scaled by ``sensor_stats[sensor]``.

    Returns ``(X, Y)`` as NumPy arrays; both are empty when nothing is usable.
    """
    time_chunks, value_chunks = [], []

    for _, trial in df_lab.groupby(["subject", "label", "experiment"]):
        if len(trial) < 2:
            continue
        trial = trial.sort_values("Time")

        # Seconds elapsed since the first sample of this experiment.
        stamps = trial["Time"]
        elapsed = ((stamps - stamps.iloc[0]) / 1e6).to_numpy()
        readings = trial[sensor].to_numpy()

        # Keep only pairs where both coordinates are finite.
        keep = np.isfinite(elapsed) & np.isfinite(readings)
        if not keep.any():
            continue

        time_chunks.append(elapsed[keep])
        value_chunks.append(readings[keep])

    if not time_chunks:
        return np.array([]), np.array([])

    X = np.concatenate(time_chunks, axis=0)
    Y = np.concatenate(value_chunks, axis=0)

    # Cap the number of plotted points per panel.
    n_points = len(X)
    if n_points > MAX_POINTS_PER_PANEL:
        picked = rng.choice(n_points, size=MAX_POINTS_PER_PANEL, replace=False)
        X, Y = X[picked], Y[picked]

    # Optional z-scoring with the statistics stored for this sensor.
    if STANDARDIZE:
        center, scale = sensor_stats[sensor]
        Y = (Y - center) / scale

    return X, Y

# -------------------------
# Main plotting loop: one figure per sensor, one subplot per label
# -------------------------
available_labels = list(df_dataset["label"].dropna().unique())
labels_to_plot = [lab for lab in LABEL_ORDER if lab in available_labels]
# include any extra labels not in LABEL_ORDER
labels_to_plot += [lab for lab in available_labels if lab not in labels_to_plot]

# Fail early with a clear message; plt.subplots would otherwise raise a less
# obvious ValueError for ncols=0.
if not labels_to_plot:
    raise ValueError("No labels found in the 'label' column; nothing to plot.")

y_axis_label = "Sensor value" + (" (z-score)" if STANDARDIZE else "")

for sensor in sensor_cols:
    # squeeze=False always returns a 2-D array of axes, so a single label
    # needs no special handling.
    fig, axes_grid = plt.subplots(
        nrows=1, ncols=len(labels_to_plot),
        figsize=(5.5 * len(labels_to_plot), 4.8), sharey=True, squeeze=False
    )
    axes = axes_grid[0]

    for col_idx, (ax, lab) in enumerate(zip(axes, labels_to_plot)):
        ax.set_xlabel("Time from start (s)")
        # Y-label only on the first subplot. It is set before the data check so
        # the figure keeps its y-label even when the first panel has no data.
        if col_idx == 0:
            ax.set_ylabel(y_axis_label)

        df_lab = df_dataset[df_dataset["label"] == lab]
        X, Y = collect_xy_for_label_sensor(df_lab, sensor)

        if X.size == 0:
            ax.set_title(f"{lab} — {sensor}\n(no data)")
            continue

        # Scatter (rasterized for speed)
        ax.scatter(X, Y, s=MARKER_SIZE, alpha=ALPHA, rasterized=True)
        ax.set_title(f"{lab} — {sensor}")

        # Sensible x-limit: 0 .. X_PCTL-quantile duration for that label
        grid_end = float(dur_by_label.get(lab, np.nan))
        if np.isfinite(grid_end) and grid_end > 0:
            ax.set_xlim(0.0, grid_end)

    plt.tight_layout()
    plt.show()
