In [None]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    precision_recall_fscore_support,
    roc_curve,
    auc,
)
from sklearn.preprocessing import label_binarize
import neurokit2 as nk
from scipy import stats
from tqdm import tqdm
import warnings
import time
import joblib

# Notebook-wide plotting defaults: seaborn "whitegrid" theme, 100-dpi figures.
sns.set_style(style="whitegrid")
plt.rcParams.update({"figure.dpi": 100})


In [None]:
# Location of the dataset files; left empty here.
dataset_path = ""

# Subject IDs used in the analysis: S2..S17 without S5 and S12.
# "S5" was present but commented out in the hand-written list, and "S11"
# carried a "#?" marker while still being included.
# NOTE(review): confirm whether S5 should stay excluded and S11 included.
subjects = [f"S{number}" for number in range(2, 18) if number not in (5, 12)]


In [None]:
# Label codes kept for classification and their display names.
label_names = {1: "Baseline", 2: "Stress", 3: "Amusement"}
valid_labels = list(label_names)

# Sampling rates of the chest stream and of each wrist stream
# (passed to the extractors below as `fs` / `sampling_rate`).
sf_chest = 700
sf_wrist_BVP = 64
sf_wrist_EDA = 4
sf_wrist_TEMP = 4
sf_wrist_ACC = 32

# Sliding-window geometry in seconds: the step is half a window.
window_size_sec = 60
step_size_sec = window_size_sec // 2

# Chest-device ("respiban") signal columns.
chest_sensors = [f"chest_acc_{axis}" for axis in "xyz"] + [
    "ecg",
    "emg",
    "chest_eda",
    "chest_temp",
    "resp",
]

In [None]:
def _nan_slope(x):
    x = np.asarray(x)
    idx = np.arange(len(x))
    mask = np.isfinite(x)
    if mask.sum() < 2:
        return 0.0
    try:
        s, _ = np.polyfit(idx[mask], x[mask], 1)
        return float(s)
    except:
        return 0.0


def _stats(signal, prefix):
    s = np.asarray(signal, dtype=float)
    out = {
        f"{prefix}_mean": np.nanmean(s) if s.size else 0.0,
        f"{prefix}_std": np.nanstd(s) if s.size else 0.0,
        f"{prefix}_min": np.nanmin(s) if s.size else 0.0,
        f"{prefix}_max": np.nanmax(s) if s.size else 0.0,
        f"{prefix}_median": np.nanmedian(s) if s.size else 0.0,
    }
    out[f"{prefix}_range"] = out[f"{prefix}_max"] - out[f"{prefix}_min"]
    if s.size:
        q25 = np.nanpercentile(s, 25)
        q75 = np.nanpercentile(s, 75)
        out[f"{prefix}_q25"] = q25
        out[f"{prefix}_q75"] = q75
        out[f"{prefix}_iqr"] = q75 - q25
        out[f"{prefix}_slope"] = _nan_slope(s)
    else:
        out.update(
            {
                f"{prefix}_q25": 0.0,
                f"{prefix}_q75": 0.0,
                f"{prefix}_iqr": 0.0,
                f"{prefix}_slope": 0.0,
            }
        )
    return out

In [None]:
def extract_ecg_features_nk(ecg_window, fs=700):
    """Extract heart-rate and HRV features from one ECG window via NeuroKit2.

    Parameters
    ----------
    ecg_window : array-like
        ECG samples of a single window.
    fs : int, default 700
        Sampling rate passed to the NeuroKit2 calls.

    Returns
    -------
    dict
        Always contains the same 17 keys (``hr_*``, ``hrv_*``,
        ``ecg_n_peaks``, ``ecg_quality_mean``, ``ecg_quality_std``). Keys
        keep their ``0.0`` default when the window holds fewer than
        ``5 * fs`` samples or when the corresponding processing step fails.
    """
    out_keys = [
        "hr_mean",
        "hr_std",
        "hr_min",
        "hr_max",
        "hr_range",
        "hrv_sdnn",
        "hrv_rmssd",
        "hrv_pnn50",
        "hrv_meannn",
        "hrv_sdsd",
        "hrv_lf",
        "hrv_hf",
        "hrv_lf_hf_ratio",
        "hrv_total_power",
        "ecg_n_peaks",
        "ecg_quality_mean",
        "ecg_quality_std",
    ]
    features = {k: 0.0 for k in out_keys}
    s = np.asarray(ecg_window, dtype=float)
    if s.size < int(5 * fs):
        return features

    def first_value_or_zero(frame, column):
        # ``frame.get(column, [0]).values`` raised AttributeError whenever the
        # column was missing (a list has no ``.values``), which aborted the
        # remaining assignments of the group. Look the column up explicitly.
        if column not in frame:
            return 0.0
        return float(frame[column].values[0])

    try:
        ecg_cleaned = nk.ecg_clean(s, sampling_rate=fs)
        peaks_df, info = nk.ecg_peaks(ecg_cleaned, sampling_rate=fs)
        hr = nk.ecg_rate(peaks_df, sampling_rate=fs, desired_length=len(ecg_cleaned))

        features["hr_mean"] = float(np.nanmean(hr))
        features["hr_std"] = float(np.nanstd(hr))
        features["hr_min"] = float(np.nanmin(hr))
        features["hr_max"] = float(np.nanmax(hr))
        features["hr_range"] = features["hr_max"] - features["hr_min"]

        # Time-domain HRV; a failure here leaves these keys at 0.0.
        try:
            hrv_time = nk.hrv_time(peaks_df, sampling_rate=fs)
            if not hrv_time.empty:
                for key, column in (
                    ("hrv_sdnn", "HRV_SDNN"),
                    ("hrv_rmssd", "HRV_RMSSD"),
                    ("hrv_pnn50", "HRV_pNN50"),
                    ("hrv_meannn", "HRV_MeanNN"),
                    ("hrv_sdsd", "HRV_SDSD"),
                ):
                    features[key] = first_value_or_zero(hrv_time, column)
        except Exception:
            pass

        # Frequency-domain HRV; a failure here leaves these keys at 0.0.
        try:
            hrv_freq = nk.hrv_frequency(
                peaks_df, sampling_rate=fs, method="fft", show=False
            )
            if not hrv_freq.empty:
                for key, column in (
                    ("hrv_lf", "HRV_LF"),
                    ("hrv_hf", "HRV_HF"),
                    ("hrv_lf_hf_ratio", "HRV_LFHF"),
                    ("hrv_total_power", "HRV_TP"),
                ):
                    features[key] = first_value_or_zero(hrv_freq, column)
        except Exception:
            pass

        rpeaks_idx = info.get("ECG_R_Peaks", [])
        features["ecg_n_peaks"] = float(len(rpeaks_idx))
        # Despite their names these two are the mean/std of the cleaned ECG
        # signal itself, not a signal-quality index.
        features["ecg_quality_mean"] = float(np.nanmean(ecg_cleaned))
        features["ecg_quality_std"] = float(np.nanstd(ecg_cleaned))
    except Exception:
        # Best effort: keep whatever was computed so far. ``Exception`` rather
        # than a bare except so kernel interrupts are not swallowed.
        pass

    return features

#---------------------------------
def extract_eda_features_nk(eda_window, fs, prefix=""):
    """Extract tonic/phasic EDA and SCR-peak features from one window.

    The docstring used to sit at a different indentation from the body,
    which made the definition fail with an IndentationError.

    Parameters
    ----------
    eda_window : array-like
        EDA samples of a single window.
    fs : int
        Sampling rate passed to the NeuroKit2 calls.
    prefix : str, default ""
        Prepended verbatim to every key (callers pass e.g. ``"chest_"``).

    Returns
    -------
    dict
        ``_stats`` summaries for ``{prefix}eda_tonic``, ``{prefix}eda_phasic``
        and ``{prefix}eda``, plus ``{prefix}eda_scr_n_peaks`` and
        ``{prefix}eda_scr_amplitude_{mean,max,sum}``. If NeuroKit2 processing
        fails, the tonic/phasic/SCR entries are set to 0.0 and
        ``{prefix}eda`` is summarised from the raw samples.
    """
    s = np.asarray(eda_window, dtype=float)
    out = {}
    try:
        eda_clean = nk.eda_clean(s, sampling_rate=fs)
        phasic = nk.eda_phasic(eda_clean, sampling_rate=fs)
        tonic = phasic["EDA_Tonic"].values
        ph = phasic["EDA_Phasic"].values

        out.update(_stats(tonic, f"{prefix}eda_tonic"))
        out.update(_stats(ph, f"{prefix}eda_phasic"))

        peaks_dict, peaks_info = nk.eda_peaks(ph, sampling_rate=fs)
        out[f"{prefix}eda_scr_n_peaks"] = float(
            np.nansum(peaks_dict.get("SCR_Peaks", 0))
        )

        # Keep only finite SCR amplitudes before aggregating.
        amps = peaks_info.get("SCR_Amplitude", [])
        amps = (
            np.asarray([a for a in amps if np.isfinite(a)], dtype=float)
            if len(amps)
            else np.array([])
        )
        out[f"{prefix}eda_scr_amplitude_mean"] = (
            float(np.nanmean(amps)) if amps.size else 0.0
        )
        out[f"{prefix}eda_scr_amplitude_max"] = (
            float(np.nanmax(amps)) if amps.size else 0.0
        )
        out[f"{prefix}eda_scr_amplitude_sum"] = (
            float(np.nansum(amps)) if amps.size else 0.0
        )

        out.update(_stats(eda_clean, f"{prefix}eda"))
    except Exception:
        # Fallback: raw-signal statistics plus zeros for everything that
        # needed the decomposition. ``Exception`` (not bare except) so kernel
        # interrupts still propagate.
        out.update(_stats(s, f"{prefix}eda"))
        for component in ("tonic", "phasic"):
            for stat in (
                "mean",
                "std",
                "min",
                "max",
                "range",
                "median",
                "q25",
                "q75",
                "iqr",
                "slope",
            ):
                out[f"{prefix}eda_{component}_{stat}"] = 0.0
        out[f"{prefix}eda_scr_n_peaks"] = 0.0
        out[f"{prefix}eda_scr_amplitude_mean"] = 0.0
        out[f"{prefix}eda_scr_amplitude_max"] = 0.0
        out[f"{prefix}eda_scr_amplitude_sum"] = 0.0
    return out

#---------------------------------#---------------------------------
def extract_rsp_features_nk(rsp_window, fs=700):
    """Extract respiration rate, amplitude and variability features.

    Parameters
    ----------
    rsp_window : array-like
        Respiration samples of a single window.
    fs : int, default 700
        Sampling rate passed to the NeuroKit2 calls.

    Returns
    -------
    dict
        Always the same 19 keys (``rsp_rate_*``, ``rsp_amplitude_*``,
        ``rrv_sdbb`` and the ``_stats`` summary under ``rsp_*``). Keys stay
        at 0.0 when the window is shorter than ``3 * fs`` samples or when
        processing fails.
    """
    s = np.asarray(rsp_window, dtype=float)
    out = {
        k: 0.0
        for k in [
            "rsp_rate_mean",
            "rsp_rate_std",
            "rsp_rate_min",
            "rsp_rate_max",
            "rsp_rate_range",
            "rsp_amplitude_mean",
            "rsp_amplitude_std",
            "rsp_amplitude_max",
            "rrv_sdbb",
            "rsp_mean",
            "rsp_std",
            "rsp_min",
            "rsp_max",
            "rsp_range",
            "rsp_median",
            "rsp_q25",
            "rsp_q75",
            "rsp_iqr",
            "rsp_slope",
        ]
    }
    if s.size < int(3 * fs):
        return out
    try:
        rsp_clean = nk.rsp_clean(s, sampling_rate=fs)
        sig, info = nk.rsp_process(rsp_clean, sampling_rate=fs)
        rate = sig.get("RSP_Rate", np.array([]))
        amp = sig.get("RSP_Amplitude", np.array([]))

        if len(rate):
            out["rsp_rate_mean"] = float(np.nanmean(rate))
            out["rsp_rate_std"] = float(np.nanstd(rate))
            out["rsp_rate_min"] = float(np.nanmin(rate))
            out["rsp_rate_max"] = float(np.nanmax(rate))
            out["rsp_rate_range"] = out["rsp_rate_max"] - out["rsp_rate_min"]

        if len(amp):
            out["rsp_amplitude_mean"] = float(np.nanmean(amp))
            out["rsp_amplitude_std"] = float(np.nanstd(amp))
            out["rsp_amplitude_max"] = float(np.nanmax(amp))

        # Respiratory-rate variability; a failure leaves rrv_sdbb at 0.0.
        try:
            rrv = nk.rsp_rrv(sig, sampling_rate=fs)
            # Check the column explicitly: the old ``rrv.get(col, [0]).values``
            # raised AttributeError when the column was missing.
            if not rrv.empty and "RRV_SDBB" in rrv:
                out["rrv_sdbb"] = float(rrv["RRV_SDBB"].values[0])
        except Exception:
            pass

        out.update(_stats(rsp_clean, "rsp"))
    except Exception:
        # Best effort: keep defaults/partial results. ``Exception`` instead of
        # a bare except so kernel interrupts are not swallowed.
        pass
    return out

#---------------------------------#---------------------------------#---------------------------------#---------------------------------
def extract_emg_features_nk(emg_window, fs=700):
    """Extract EMG amplitude, onset-count, RMS and summary features.

    Parameters
    ----------
    emg_window : array-like
        EMG samples of a single window.
    fs : int, default 700
        Sampling rate passed to the NeuroKit2 calls.

    Returns
    -------
    dict
        Always the same 17 keys (``emg_amplitude_*``, ``emg_n_onsets``,
        ``emg_rms`` and the ``_stats`` summary under ``emg_*``). Keys stay
        at 0.0 when the window is shorter than ``fs`` samples or when
        processing fails.
    """
    s = np.asarray(emg_window, dtype=float)
    out = {
        k: 0.0
        for k in [
            "emg_amplitude_mean",
            "emg_amplitude_std",
            "emg_amplitude_max",
            "emg_amplitude_min",
            "emg_amplitude_range",
            "emg_n_onsets",
            "emg_rms",
            "emg_mean",
            "emg_std",
            "emg_min",
            "emg_max",
            "emg_range",
            "emg_median",
            "emg_q25",
            "emg_q75",
            "emg_iqr",
            "emg_slope",
        ]
    }
    if s.size < int(1 * fs):
        return out
    try:
        emg_clean = nk.emg_clean(s, sampling_rate=fs)
        sig, info = nk.emg_process(emg_clean, sampling_rate=fs)
        # Prefer the "EMG_Amplitude" column, fall back to "EMG_Envelope".
        amp_series = sig.get("EMG_Amplitude", sig.get("EMG_Envelope", np.array([])))
        amp = np.asarray(amp_series) if amp_series is not None else np.array([])

        if amp.size:
            out["emg_amplitude_mean"] = float(np.nanmean(amp))
            out["emg_amplitude_std"] = float(np.nanstd(amp))
            out["emg_amplitude_max"] = float(np.nanmax(amp))
            out["emg_amplitude_min"] = float(np.nanmin(amp))
            out["emg_amplitude_range"] = (
                out["emg_amplitude_max"] - out["emg_amplitude_min"]
            )

        onsets = sig.get("EMG_Onsets", np.array([]))
        out["emg_n_onsets"] = float(np.nansum(onsets)) if len(onsets) else 0.0
        out["emg_rms"] = (
            float(np.sqrt(np.nanmean(emg_clean**2))) if emg_clean.size else 0.0
        )
        out.update(_stats(emg_clean, "emg"))
    except Exception:
        # Best effort: keep defaults/partial results. ``Exception`` instead of
        # a bare except so KeyboardInterrupt/SystemExit still propagate.
        pass
    return out

#---------------------------------#---------------------------------
def extract_bvp_features_nk(bvp_window, fs=64):
    """Extract pulse-rate, peak-count and summary features from a BVP window.

    Parameters
    ----------
    bvp_window : array-like
        BVP samples of a single window.
    fs : int, default 64
        Sampling rate passed to the NeuroKit2 PPG calls.

    Returns
    -------
    dict
        Always the same 16 keys (``bvp_rate_*``, ``bvp_n_peaks`` and the
        ``_stats`` summary under ``bvp_*``). Keys stay at 0.0 when the window
        holds fewer than 10 samples or when processing fails.
    """
    s = np.asarray(bvp_window, dtype=float)
    out = {
        k: 0.0
        for k in [
            "bvp_rate_mean",
            "bvp_rate_std",
            "bvp_rate_min",
            "bvp_rate_max",
            "bvp_rate_range",
            "bvp_n_peaks",
            "bvp_mean",
            "bvp_std",
            "bvp_min",
            "bvp_max",
            "bvp_range",
            "bvp_median",
            "bvp_q25",
            "bvp_q75",
            "bvp_iqr",
            "bvp_slope",
        ]
    }
    if s.size < 10:
        return out
    try:
        ppg_clean = nk.ppg_clean(s, sampling_rate=fs)
        sig, info = nk.ppg_process(ppg_clean, sampling_rate=fs)
        rate = sig["PPG_Rate"].values
        peaks = sig["PPG_Peaks"].values

        out["bvp_rate_mean"] = float(np.nanmean(rate))
        out["bvp_rate_std"] = float(np.nanstd(rate))
        out["bvp_rate_min"] = float(np.nanmin(rate))
        out["bvp_rate_max"] = float(np.nanmax(rate))
        out["bvp_rate_range"] = out["bvp_rate_max"] - out["bvp_rate_min"]
        # Summing the "PPG_Peaks" column gives the number of detected peaks.
        out["bvp_n_peaks"] = float(np.nansum(peaks))

        out.update(_stats(ppg_clean, "bvp"))
    except Exception:
        # Best effort: keep defaults/partial results. ``Exception`` instead of
        # a bare except so KeyboardInterrupt/SystemExit still propagate.
        pass
    return out


In [None]:
# Missing-value report for the chest and wrist streams.
# Reads df_chest_combined and the df_wrist_*_combined frames, which must have
# been built by an earlier cell.
chest_missing = pd.DataFrame(
    {
        "Column": chest_sensors,
        "Missing_Count": [
            df_chest_combined[col].isnull().sum() for col in chest_sensors
        ],
        "Missing_Percent": [
            df_chest_combined[col].isnull().sum() / len(df_chest_combined) * 100
            for col in chest_sensors
        ],
    }
)
print(chest_missing.to_string(index=False))

wrist_missing = pd.DataFrame(
    {
        "Sensor": ["BVP", "EDA", "TEMP", "ACC_X", "ACC_Y", "ACC_Z"],
        "Sampling_Rate": [
            sf_wrist_BVP,
            sf_wrist_EDA,
            sf_wrist_TEMP,
            sf_wrist_ACC,
            sf_wrist_ACC,
            sf_wrist_ACC,
        ],
        "Total_Samples": [
            len(df_wrist_bvp_combined),
            len(df_wrist_eda_combined),
            len(df_wrist_temp_combined),
            len(df_wrist_acc_combined),
            len(df_wrist_acc_combined),
            len(df_wrist_acc_combined),
        ],
        "Missing_Count": [
            df_wrist_bvp_combined["bvp"].isnull().sum(),
            df_wrist_eda_combined["wrist_eda"].isnull().sum(),
            df_wrist_temp_combined["wrist_temp"].isnull().sum(),
            df_wrist_acc_combined["wrist_acc_x"].isnull().sum(),
            df_wrist_acc_combined["wrist_acc_y"].isnull().sum(),
            df_wrist_acc_combined["wrist_acc_z"].isnull().sum(),
        ],
    }
)
# The counts live in "Missing_Count"; the previous "count" lookups raised
# KeyError. The percentage column is named like its chest counterpart
# (it was the mistyped "missing&").
wrist_missing["Missing_Percent"] = (
    wrist_missing["Missing_Count"] / wrist_missing["Total_Samples"] * 100
)
print(wrist_missing.to_string(index=False))

if (
    chest_missing["Missing_Count"].sum() == 0
    and wrist_missing["Missing_Count"].sum() == 0
):
    print("fine")
    
    
# Infinite-value check: count +/-inf entries in every chest and wrist signal.
chest_inf = pd.DataFrame(
    {
        "Column": chest_sensors,
        **{
            title: [detector(df_chest_combined[col]).sum() for col in chest_sensors]
            for title, detector in (
                ("Inf_Count", np.isinf),
                ("NegInf_Count", np.isneginf),
                ("PosInf_Count", np.isposinf),
            )
        },
    }
)
print(chest_inf.to_string(index=False))

wrist_inf = pd.DataFrame(
    {
        "Sensor": ["BVP", "EDA", "TEMP", "ACC_X", "ACC_Y", "ACC_Z"],
        "Inf_Count": [
            np.isinf(frame[column]).sum()
            for frame, column in (
                (df_wrist_bvp_combined, "bvp"),
                (df_wrist_eda_combined, "wrist_eda"),
                (df_wrist_temp_combined, "wrist_temp"),
                (df_wrist_acc_combined, "wrist_acc_x"),
                (df_wrist_acc_combined, "wrist_acc_y"),
                (df_wrist_acc_combined, "wrist_acc_z"),
            )
        ],
    }
)
print(wrist_inf.to_string(index=False))

# Report success only when neither device produced any infinite value.
if not (chest_inf["Inf_Count"].sum() != 0 or wrist_inf["Inf_Count"].sum() != 0):
    print("fine.")

# Range anomalies: per-signal descriptive statistics plus counts of values
# outside the ranges checked below.
chest_ranges = pd.DataFrame(
    {
        "Column": chest_sensors,
        **{
            title: [getattr(df_chest_combined[col], method)() for col in chest_sensors]
            for title, method in (
                ("Min", "min"),
                ("Max", "max"),
                ("Mean", "mean"),
                ("Std", "std"),
                ("Median", "median"),
            )
        },
    }
)
print(chest_ranges.to_string(index=False))

# Chest temperature readings below zero.
negative_chest_temp = df_chest_combined["chest_temp"].lt(0).sum()
wrist_ranges = pd.DataFrame(
    {
        "Sensor": ["BVP", "EDA", "TEMP", "ACC_X", "ACC_Y", "ACC_Z"],
        "Sampling_Rate": [sf_wrist_BVP, sf_wrist_EDA, sf_wrist_TEMP]
        + [sf_wrist_ACC] * 3,
        **{
            title: [
                getattr(frame[column], method)()
                for frame, column in (
                    (df_wrist_bvp_combined, "bvp"),
                    (df_wrist_eda_combined, "wrist_eda"),
                    (df_wrist_temp_combined, "wrist_temp"),
                    (df_wrist_acc_combined, "wrist_acc_x"),
                    (df_wrist_acc_combined, "wrist_acc_y"),
                    (df_wrist_acc_combined, "wrist_acc_z"),
                )
            ]
            for title, method in (("Min", "min"), ("Max", "max"), ("Mean", "mean"))
        },
    }
)
print(wrist_ranges.to_string(index=False))


# Wrist-side anomaly counts: negative temperature / EDA and BVP samples
# beyond +/-1000.
negative_wrist_temp = df_wrist_temp_combined["wrist_temp"].lt(0).sum()
negative_wrist_eda = df_wrist_eda_combined["wrist_eda"].lt(0).sum()
extreme_bvp_low = df_wrist_bvp_combined["bvp"].lt(-1000).sum()
extreme_bvp_high = df_wrist_bvp_combined["bvp"].gt(1000).sum()


In [None]:
# Class balance of the raw (sample-level) labels in df_filtered_preview.
overall_counts = df_filtered_preview["label"].value_counts().sort_index()
overall_pct = overall_counts.div(overall_counts.sum()).mul(100)
# Largest class size over smallest (the denominator is floored at 1).
imbalance_ratio = overall_counts.max() / max(1, overall_counts.min())

for lbl in valid_labels:
    c, p = int(overall_counts.get(lbl, 0)), overall_pct.get(lbl, 0.0)
    print(f"  {label_names[lbl]:12s}: {c:10,} ({p:5.1f}%)")


# Sample counts per subject (rows) and label (columns, in valid_labels order).
per_subject = (
    df_filtered_preview.groupby(["sid", "label"])
    .size()
    .unstack(level="label", fill_value=0)
    .reindex(columns=valid_labels, fill_value=0)
)
print(per_subject)

# Inverse-frequency class weights: total / (n_classes * class count),
# with each class count floored at 1.
total_n = overall_counts.sum()
class_weight = {
    lbl: total_n / (len(valid_labels) * max(1, int(overall_counts.get(lbl, 0))))
    for lbl in valid_labels
}

for lbl in class_weight:
    weight = class_weight[lbl]
    print(f"  {label_names[lbl]:12s}: {weight:.4f}")

# Figure: overall label share (pie) and per-subject label counts (bars).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
colors = ["#AED6F1", "#F1948A", "#F8B88B"]
# Tie each colour to a label so the pie and the bars agree even when a class
# is absent from overall_counts (positional colours would shift otherwise).
color_by_label = dict(zip(valid_labels, colors))
labels_text = [label_names[i] for i in overall_counts.index]

ax1.pie(
    overall_counts.values,
    labels=labels_text,
    autopct="%1.1f%%",
    colors=[color_by_label[i] for i in overall_counts.index],
    shadow=True,
    startangle=90,
)
ax1.set_title("Raw Data Label Distribution", fontweight="bold", fontsize=12)

per_subject.plot(
    kind="bar",
    ax=ax2,
    color=[color_by_label[lbl] for lbl in per_subject.columns],
    width=0.7,
)
ax2.set_title("Labels per Subject (Raw Data)", fontweight="bold", fontsize=12)
ax2.set_xlabel("Subject", fontweight="bold")
ax2.set_ylabel("Sample Count", fontweight="bold")
# Legend entries follow the plotted columns, not overall_counts.index, so the
# text always matches the bars.
ax2.legend([label_names[lbl] for lbl in per_subject.columns], title="Condition")
ax2.grid(True, alpha=0.3, axis="y")
plt.setp(ax2.xaxis.get_majorticklabels(), rotation=45, ha="right")

plt.tight_layout()
plt.savefig("EDA_1_label_distribution_raw.png", dpi=150, bbox_inches="tight")
plt.show()


# Display name -> wrist signal Series, reused by the outlier analysis.
wrist_sensors_data = {
    sensor: frame[column]
    for sensor, frame, column in (
        ("BVP", df_wrist_bvp_combined, "bvp"),
        ("EDA", df_wrist_eda_combined, "wrist_eda"),
        ("TEMP", df_wrist_temp_combined, "wrist_temp"),
        ("ACC_X", df_wrist_acc_combined, "wrist_acc_x"),
        ("ACC_Y", df_wrist_acc_combined, "wrist_acc_y"),
        ("ACC_Z", df_wrist_acc_combined, "wrist_acc_z"),
    )
}


In [None]:
# Per-subject wrist EDA level: means sorted ascending, and standard deviations.
# Note: the std Series is NOT reordered to match the sorted means.
subject_eda_means = (
    df_wrist_eda_combined.groupby("subject")["wrist_eda"].agg("mean").sort_values()
)
subject_eda_stds = df_wrist_eda_combined.groupby("subject")["wrist_eda"].agg("std")

# One integer position per subject.
x = np.arange(subject_eda_means.shape[0])

In [None]:
# IQR-based outlier count (1.5 * IQR fences) for every chest signal.
chest_outlier_summary = []
for col in chest_sensors:
    column_values = df_filtered_preview[col]
    Q1, Q3 = column_values.quantile(0.25), column_values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    # Samples strictly outside the fences.
    outliers = (column_values.lt(lower_bound) | column_values.gt(upper_bound)).sum()
    outlier_pct = (outliers / len(df_filtered_preview)) * 100

    chest_outlier_summary.append(
        dict(
            Column=col,
            Outliers=outliers,
            Percent=outlier_pct,
            Lower=lower_bound,
            Upper=upper_bound,
        )
    )

df_chest_outliers = pd.DataFrame(chest_outlier_summary)
print(df_chest_outliers.loc[:, ["Column", "Outliers", "Percent"]].to_string(index=False))


#---------------------------------#---------------------------------
# IQR-based outlier count (1.5 * IQR fences) for every wrist signal.
wrist_outlier_summary = []
for sensor_name, sensor_data in wrist_sensors_data.items():
    Q1, Q3 = sensor_data.quantile(0.25), sensor_data.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    # Samples strictly outside the fences.
    outliers = (sensor_data.lt(lower_bound) | sensor_data.gt(upper_bound)).sum()
    outlier_pct = (outliers / len(sensor_data)) * 100

    wrist_outlier_summary.append(
        dict(Sensor=sensor_name, Outliers=outliers, Percent=outlier_pct)
    )

df_wrist_outliers = pd.DataFrame(wrist_outlier_summary)
print(df_wrist_outliers.to_string(index=False))

In [None]:
# Keep only chest rows whose label is one of valid_labels.
df_filtered = df_chest_combined.loc[
    df_chest_combined["label"].isin(valid_labels)
].copy()

samples_before = len(df_filtered)
# Only when negative chest temperatures were counted earlier: keep rows with a
# strictly positive temperature (this also drops exact zeros).
if negative_chest_temp > 0:
    df_filtered = df_filtered.loc[df_filtered["chest_temp"].gt(0)].copy()


In [None]:
# Drop out-of-range wrist samples for every subject and count removed rows.
cleaning_count = 0

# (key inside all_data[subject], predicate selecting the rows to KEEP)
cleaning_rules = (
    ("temp", lambda frame: frame["wrist_temp"] > 0),
    ("eda", lambda frame: frame["wrist_eda"] >= 0),
    ("bvp", lambda frame: (frame["bvp"] >= -1000) & (frame["bvp"] <= 1000)),
)

for sensor_key, keep_rows in cleaning_rules:
    for subject_id in all_data.keys():
        unfiltered = all_data[subject_id][sensor_key]
        kept = unfiltered[keep_rows(unfiltered)].copy()
        cleaning_count += len(unfiltered) - len(kept)
        # Replace the stored frame with its cleaned copy.
        all_data[subject_id][sensor_key] = kept

In [None]:
# Store each subject's slice of the cleaned chest frame under "chest".
for subject_id, sensors in all_data.items():
    sensors["chest"] = df_filtered.loc[df_filtered["sid"] == subject_id].copy()

In [None]:
# Post-cleaning sanity check: count remaining out-of-range samples per subject.
total_issues = 0
for subject_id, sensors in all_data.items():
    bvp_values = sensors["bvp"]["bvp"]

    issues = sum(
        [
            # chest: negative temperature
            (sensors["chest"]["chest_temp"] < 0).sum(),
            # wrist: BVP beyond +/-1000, negative EDA, negative temperature
            ((bvp_values < -1000) | (bvp_values > 1000)).sum(),
            (sensors["eda"]["wrist_eda"] < 0).sum(),
            (sensors["temp"]["wrist_temp"] < 0).sum(),
        ]
    )
    total_issues += issues

    if issues > 0:
        print("issues found")

if total_issues != 0:
    print(f"missing count: {total_issues}")
else:
    print("complete")



In [None]:
def extract_window_features_time_based(
    subject_data, start_time, end_time, min_chest_samples=100, min_label_purity=0.8
):
    """Build the feature dict for one time window of one subject.

    Parameters
    ----------
    subject_data : dict
        Per-subject frames under the keys ``"chest"``, ``"bvp"``, ``"eda"``,
        ``"temp"`` and ``"acc"``; each frame has a ``"time"`` column plus the
        signal columns read below.
    start_time, end_time : float
        Window bounds on the ``"time"`` column; the window is
        ``start_time <= time < end_time``.
    min_chest_samples : int, default 100
        Windows with fewer chest rows are rejected.
    min_label_purity : float, default 0.8
        Minimum share of the majority label among the chest rows.

    Returns
    -------
    dict or None
        ``None`` when the window is rejected (too few chest rows, no rows, or
        majority label below ``min_label_purity``). Otherwise a dict with
        ``"label"`` (the majority label as int) plus all chest and wrist
        features.
    """

    def in_window(frame):
        # Half-open interval so consecutive windows never share a sample.
        return (frame["time"] >= start_time) & (frame["time"] < end_time)

    features = {}

    # ---- chest device ----------------------------------------------------
    df_chest = subject_data["chest"]
    chest_window = df_chest.loc[in_window(df_chest)]

    if len(chest_window) < min_chest_samples:
        return None

    labels = chest_window["label"].to_numpy()
    # Reachable when min_chest_samples <= 0; protects the argmax below.
    if labels.size == 0:
        return None
    vals, cnts = np.unique(labels, return_counts=True)
    if (cnts.max() / labels.size) < min_label_purity:
        return None
    features["label"] = int(vals[np.argmax(cnts)])

    features.update(extract_ecg_features_nk(chest_window["ecg"].to_numpy(), fs=700))
    features.update(
        extract_eda_features_nk(
            chest_window["chest_eda"].to_numpy(), fs=700, prefix="chest_"
        )
    )
    features.update(extract_rsp_features_nk(chest_window["resp"].to_numpy(), fs=700))
    features.update(extract_emg_features_nk(chest_window["emg"].to_numpy(), fs=700))
    features.update(_stats(chest_window["chest_temp"].to_numpy(), "chest_temp"))

    chest_axes = ["chest_acc_x", "chest_acc_y", "chest_acc_z"]
    for axis in chest_axes:
        features.update(_stats(chest_window[axis].to_numpy(), axis))

    # Euclidean magnitude of the three chest accelerometer axes.
    chest_acc_mag = np.sqrt(
        sum(chest_window[axis].to_numpy() ** 2 for axis in chest_axes)
    )
    features.update(_stats(chest_acc_mag, "chest_acc_mag"))

    # ---- wrist device ("empatica") ----------------------------------------
    df_bvp = subject_data["bvp"]
    bw = df_bvp.loc[in_window(df_bvp), "bvp"].to_numpy()
    features.update(extract_bvp_features_nk(bw, fs=64))

    df_eda = subject_data["eda"]
    ew = df_eda.loc[in_window(df_eda), "wrist_eda"].to_numpy()
    features.update(extract_eda_features_nk(ew, fs=4, prefix="wrist_"))

    df_temp = subject_data["temp"]
    tw = df_temp.loc[in_window(df_temp), "wrist_temp"].to_numpy()
    features.update(_stats(tw, "wrist_temp"))

    df_acc = subject_data["acc"]
    m = in_window(df_acc)
    wrist_axes = ["wrist_acc_x", "wrist_acc_y", "wrist_acc_z"]
    if m.sum() >= 10:
        for axis in wrist_axes:
            features.update(_stats(df_acc.loc[m, axis].to_numpy(), axis))
        wmag = np.sqrt(sum(df_acc.loc[m, axis].to_numpy() ** 2 for axis in wrist_axes))
        features.update(_stats(wmag, "wrist_acc_mag"))
    else:
        # Too few wrist accelerometer samples: emit zeros under the same keys.
        for axis in wrist_axes + ["wrist_acc_mag"]:
            features.update(
                {
                    f"{axis}_{k}": 0.0
                    for k in [
                        "mean",
                        "std",
                        "min",
                        "max",
                        "range",
                        "median",
                        "q25",
                        "q75",
                        "iqr",
                        "slope",
                    ]
                }
            )

    return features

In [None]:
# Slide a window_size_sec window (step step_size_sec) over every subject and
# collect one feature dict per accepted window.
all_features = []
failed_windows = 0  # windows whose extraction raised an exception

for subject_id in tqdm(sorted(all_data.keys()), desc="extracting"):
    subject_data = all_data[subject_id]
    df_chest = subject_data["chest"]
    df_chest = df_chest[df_chest["label"].isin(valid_labels)]

    if df_chest.empty:
        continue

    # Windows start at t = 0 and stop once they would pass the last labelled
    # chest timestamp.
    max_time = float(df_chest["time"].max())
    t = 0.0

    while t + window_size_sec <= max_time:
        try:
            feats = extract_window_features_time_based(
                subject_data, t, t + window_size_sec
            )
        except Exception:
            # Skip the window but keep count. ``Exception`` rather than a bare
            # except, so interrupting the kernel actually stops this loop.
            failed_windows += 1
            feats = None
        if feats is not None:
            feats["subject"] = subject_id
            all_features.append(feats)
        t += step_size_sec

if failed_windows:
    print(f"{failed_windows} window(s) skipped: feature extraction raised an error")

In [None]:
# One row per window; +/-inf and missing values are both replaced by 0.0.
df_features = (
    pd.DataFrame(all_features)
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0.0)
)

In [None]:
# Every column except the target and the grouping key is a model feature.
feature_columns = [
    name for name in df_features.columns if name not in ("label", "subject")
]
# Split the features by originating device, based on their name prefix.
chest_features = [
    name
    for name in feature_columns
    if name.startswith(("chest_", "ecg", "emg", "hr_", "hrv_", "rsp_", "rrv_"))
]
wrist_features = [
    name for name in feature_columns if name.startswith(("wrist_", "bvp_"))
]

In [None]:
# Persist the window-level feature table next to the notebook.
# Only unpickle this file from a trusted location: loading a pickle can
# execute arbitrary code.
df_features.to_pickle("wesad_features_final.pkl")

In [None]:
# Class balance at window level, overall and per subject.
label_dist = df_features["label"].value_counts().sort_index()
for label in valid_labels:
    count = label_dist.get(label, 0)
    pct = (count / len(df_features)) * 100
    print("  {:12s}: {:6,} ({:5.1f}%)".format(label_names[label], count, pct))


subject_dist = (
    df_features.groupby(["subject", "label"])
    .size()
    .unstack(level="label", fill_value=0)
)
print(subject_dist)


In [None]:
# Rank features by absolute correlation with the numeric label code.
# NOTE(review): the label is a nominal code (1/2/3), so this linear
# correlation is only a rough screening heuristic.
feature_label_corr = (
    df_features[feature_columns]
    .corrwith(df_features["label"])
    .abs()
    .sort_values(ascending=False)
)

# Print the 20 strongest features, tagged with their originating device.
for i, (feat, corr) in enumerate(feature_label_corr.head(20).items(), 1):
    if feat.startswith(("chest_", "ecg", "emg", "hr_", "hrv_", "rsp_", "rrv_")):
        sensor = "CHEST"
    else:
        sensor = "WRIST"
    print(f"  {i:2d}. {feat:45s}: {corr:.4f} [{sensor}]")


In [None]:
# Standardise features: statistics come from the training split only and are
# then applied unchanged to the validation and test splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [None]:
# Sweep the neighbour count and keep the k with the best weighted validation F1.
k_values = list(range(1, 10)) + [11, 15, 20, 25, 30, 40, 50]
validation_results = []

print(
    f"\n{'K':<8} {'Train Acc':<12} {'Val Acc':<12} {'Val F1':<12} {'Overfitting':<12}"
)

for k in k_values:
    knn = KNeighborsClassifier(
        n_neighbors=k, metric="euclidean", weights="uniform"
    ).fit(X_train_scaled, y_train)

    train_pred = knn.predict(X_train_scaled)
    val_pred = knn.predict(X_val_scaled)

    train_acc = accuracy_score(y_train, train_pred)
    val_acc = accuracy_score(y_val, val_pred)
    val_f1 = f1_score(y_val, val_pred, average="weighted")

    # Train/validation accuracy gap, reported as the overfitting indicator.
    diff = train_acc - val_acc

    validation_results.append(
        dict(
            k=k,
            train_acc=train_acc,
            val_acc=val_acc,
            val_f1=val_f1,
            overfitting=diff,
        )
    )

    print(f"{k:<8} {train_acc:<12.4f} {val_acc:<12.4f} {val_f1:<12.4f} {diff:<12.4f}")

val_df = pd.DataFrame(validation_results)
# idxmax returns the first row holding the maximum, so ties go to the smaller k.
best_idx = val_df["val_f1"].idxmax()
optimal_k = int(val_df.loc[best_idx, "k"])

In [None]:
# Train + validation data merged for the final model and for grouped CV.
X_train_full = np.vstack([X_train_scaled, X_val_scaled])
y_train_full = np.concatenate([y_train, y_val])
train_val_subjects_array = np.concatenate(
    [df_train["subject"].values, df_val["subject"].values]
)

# Unscaled copy of the same rows, used only for cross-validation.
# X_train_scaled / X_val_scaled come from a scaler fitted on ALL of X_train,
# so reusing them inside CV would leak statistics of each fold's held-out
# subjects into its training data. Each fold fits its own scaler instead.
X_train_full_raw = np.vstack([np.asarray(X_train), np.asarray(X_val)])

# Subject-wise folds: no subject appears on both sides of a split.
group_kfold = GroupKFold(n_splits=5)
cv_scores = []
cv_f1_scores = []

for fold_num, (train_idx, test_idx) in enumerate(
    group_kfold.split(X_train_full_raw, y_train_full, groups=train_val_subjects_array),
    start=1,
):
    fold_scaler = StandardScaler().fit(X_train_full_raw[train_idx])

    knn_cv = KNeighborsClassifier(
        n_neighbors=optimal_k, metric="euclidean", weights="uniform"
    )
    knn_cv.fit(fold_scaler.transform(X_train_full_raw[train_idx]), y_train_full[train_idx])

    y_cv_pred = knn_cv.predict(fold_scaler.transform(X_train_full_raw[test_idx]))
    cv_acc = accuracy_score(y_train_full[test_idx], y_cv_pred)
    cv_f1 = f1_score(y_train_full[test_idx], y_cv_pred, average="weighted")

    cv_scores.append(cv_acc)
    cv_f1_scores.append(cv_f1)

    print(f"Fold {fold_num:<4} {cv_acc:<12.4f} {cv_f1:<12.4f}")

cv_scores = np.array(cv_scores)
cv_f1_scores = np.array(cv_f1_scores)

In [None]:
# Final model: KNN with the selected k, fitted on train + validation data.
knn_final = KNeighborsClassifier(
    n_neighbors=optimal_k, metric="euclidean", weights="uniform"
)

# perf_counter is the monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
time_start = time.perf_counter()
knn_final.fit(X_train_full, y_train_full)
train_time = time.perf_counter() - time_start

In [None]:
# Held-out test evaluation of the final model.
# perf_counter is the monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
time_start = time.perf_counter()
y_pred_final = knn_final.predict(X_test_scaled)
inference_time = time.perf_counter() - time_start

# Support-weighted averages across the classes.
accuracy_final = accuracy_score(y_test, y_pred_final)
precision_final = precision_score(y_test, y_pred_final, average="weighted")
recall_final = recall_score(y_test, y_pred_final, average="weighted")
f1_final = f1_score(y_test, y_pred_final, average="weighted")


# Printed in order: k, accuracy, precision, recall, F1.
print(f"{optimal_k}")
print(f"{accuracy_final:.4f} ({accuracy_final*100:.2f}%)")
print(f"{precision_final:.4f}")
print(f"{recall_final:.4f}")
print(f"{f1_final:.4f}")