In [None]:
import os
import numpy as np
import pandas as pd
from scipy import stats
from scipy import signal as scipy_signal
from scipy.fft import fft, fftfreq
import warnings


# Install a blanket "ignore" filter: from here on no Python warning of any
# category is shown in this kernel session.
# NOTE(review): this also hides numpy/pandas/scipy warnings raised by the
# feature cells below — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [None]:
from biosppy.signals import eda as biosppy_eda

In [None]:
# Root folder that holds one sub-folder per participant. With the empty string,
# every os.path.join below resolves relative to the current working directory.
DATA_FINAL_DIR = ""
# Output folder for the per-signal feature CSV files written by the cells below.
FEATURE_DIR = os.path.join(DATA_FINAL_DIR, "Extracted_Features_Final")
# Create the output folder up front; exist_ok makes re-running this cell harmless.
os.makedirs(FEATURE_DIR, exist_ok=True)

In [None]:
# Sampling rates in samples per second; each is multiplied by a duration in
# seconds to obtain window/step sizes in samples.
FS_EDA = 4  # EDA stream
FS_HR = 1  # HR stream
FS_TEMP = 1  # TEMP stream
# NOTE(review): the sampling-rate check cell lists 64 Hz as the expected ACC
# rate, while 63 is used here — confirm which value matches the filtered files.
FS_ACC = 63  # ACC stream
# Sliding-window geometry, in seconds.
WINDOW_SIZE_SEC = 60
OVERLAP_SEC = 30

In [None]:
# Candidate IDs are MIT001 ... MIT025; keep only those whose folder contains a
# "Filtered_Signals" entry on disk.
participants = [
    candidate_id
    for candidate_id in (f"MIT{number:03d}" for number in range(1, 26))
    if os.path.exists(os.path.join(DATA_FINAL_DIR, candidate_id, "Filtered_Signals"))
]

In [None]:
import os
import pandas as pd
import numpy as np

# Sanity check: estimate the sampling rate of one recording from its
# timestamps and compare it with the rate each signal is expected to have.

# Same value as in the configuration cell; kept here so the cell can also be
# run on its own.
# NOTE(review): if the configuration cell is ever changed, update this too —
# later cells read DATA_FINAL_DIR after this assignment.
DATA_FINAL_DIR = ""

# Recording that is inspected.
pid = "MIT001"
side = "LEFT"

# signal name -> (a filtered file is expected, expected sampling rate in Hz)
signal_specs = {
    "EDA": (True, 4),
    "HR": (False, 1),
    "TEMP": (True, 1),
    "ACC": (True, 64),
    "BVP": (False, 64),
}

# Per signal: raw CSV path, filtered CSV path (None when there is none) and
# the expected sampling rate.
signals = {
    name: {
        "raw_path": os.path.join(
            DATA_FINAL_DIR, pid, "Signals", f"{pid}_{side}_{name.lower()}.csv"
        ),
        "filtered_path": (
            os.path.join(
                DATA_FINAL_DIR,
                pid,
                "Filtered_Signals",
                f"{pid}_{side}_{name.lower()}_filtered.csv",
            )
            if has_filtered
            else None
        ),
        "expected_fs": expected_fs,
    }
    for name, (has_filtered, expected_fs) in signal_specs.items()
}


def estimate_fs(csv_path):
    """Estimate the sampling rate of a CSV file from its ``timestamp`` column.

    Parameters
    ----------
    csv_path : str
        Path of a CSV file that has a ``timestamp`` column parseable by
        ``pd.to_datetime``.

    Returns
    -------
    tuple
        ``(n_samples, fs)`` where ``fs`` is in samples per second, or ``None``
        when it cannot be estimated (fewer than two rows, or the first and
        last timestamps do not span a positive duration).
    """
    frame = pd.read_csv(csv_path)
    n_samples = len(frame)
    if n_samples < 2:
        return n_samples, None

    stamps = pd.to_datetime(frame["timestamp"])
    duration_sec = (stamps.iloc[-1] - stamps.iloc[0]).total_seconds()
    if not duration_sec > 0:
        # Zero, negative or NaN duration: avoid dividing by it.
        return n_samples, None

    # n samples delimit n - 1 sampling intervals.
    return n_samples, (n_samples - 1) / duration_sec


results = []

for signal_name, paths in signals.items():
    print(f"{signal_name}:")

    # Reset per signal so a missing file can never reuse values left over
    # from the previous iteration.
    n_raw, fs_raw = None, None
    if os.path.exists(paths["raw_path"]):
        n_raw, fs_raw = estimate_fs(paths["raw_path"])

    # Check FILTERED
    fs_filt = None
    filtered_path = paths["filtered_path"]
    if filtered_path and os.path.exists(filtered_path):
        n_filt, fs_filt = estimate_fs(filtered_path)
        fs_filt_text = f"{fs_filt:.2f} Hz" if fs_filt is not None else "n/a"

        print(f"samples: {n_filt:,}")
        print(f"calculated Fs: {fs_filt_text}")
        if n_raw is not None:
            # Only meaningful when the raw file was actually loaded.
            print(f"sample match: {n_raw == n_filt}")

    # Prefer the raw estimate; fall back to the filtered one.
    actual_fs = fs_raw if fs_raw else fs_filt
    expected = paths["expected_fs"]
    match = "match " if actual_fs and abs(actual_fs - expected) < 1 else "no"
    actual_text = f"{actual_fs:.2f} Hz" if actual_fs else "n/a"
    print(f"expect: {expected} Hz, actual: {actual_text} ~~ {match}")

    results.append(
        {
            "signal": signal_name,
            "expected_fs": expected,
            "actual_fs": round(actual_fs, 2) if actual_fs else None,
            "match": match,
        }
    )


df_summary = pd.DataFrame(results)
print(df_summary.to_string(index=False))

In [None]:
# Heart-rate features: slide a fixed-length window over each phase of every
# participant/side HR recording and write one feature row per window to
# features_hr.csv.

# Window length in samples.
window_samples = WINDOW_SIZE_SEC * FS_HR
# Hop between consecutive window starts. Neighbouring windows share
# OVERLAP_SEC seconds, so the hop is the window length minus the overlap
# (identical to the overlap itself only when the overlap is exactly 50 %).
step_samples = (WINDOW_SIZE_SEC - OVERLAP_SEC) * FS_HR
if step_samples <= 0:
    raise ValueError("OVERLAP_SEC must be smaller than WINDOW_SIZE_SEC")

# HR samples outside this closed range are dropped before computing features.
hr_valid_min, hr_valid_max = 40, 200
# A window needs at least this many in-range samples, otherwise all of its
# features are NaN.
min_valid_samples = 5

# Feature columns, in the order they appear in the output file.
hr_feature_names = [
    "hr_mean",
    "hr_std",
    "hr_min",
    "hr_max",
    "hr_range",
    "hr_median",
    "hr_iqr",
    "hr_cv",
    "hr_rmssd",
    "hr_sdsd",
    "hr_pnn50",
    "hr_pnn20",
    "hr_sdnn",
    "hr_mean_rr",
]

all_hr_features = []

for participant_id in participants:
    for side in ["LEFT", "RIGHT"]:

        hr_path = os.path.join(
            DATA_FINAL_DIR, participant_id, "Signals", f"{participant_id}_{side}_hr.csv"
        )

        if not os.path.exists(hr_path):
            continue

        df_hr = pd.read_csv(hr_path)

        # Windows never straddle two phases: each phase is windowed on its own.
        for phase in df_hr["phase"].dropna().unique():
            phase_data = df_hr[df_hr["phase"] == phase].reset_index(drop=True)

            if len(phase_data) < window_samples:
                continue

            window_starts = range(
                0, len(phase_data) - window_samples + 1, step_samples
            )
            for window_idx, start in enumerate(window_starts):
                window = phase_data.iloc[start : start + window_samples]

                hr_vals = window["hr"].values
                # NaN fails both comparisons, so missing samples are dropped too.
                hr_valid = hr_vals[
                    (hr_vals >= hr_valid_min) & (hr_vals <= hr_valid_max)
                ]

                features = {
                    "participant_id": participant_id,
                    "side": side,
                    "phase": phase,
                    "window_idx": window_idx,
                    "window_start": window["timestamp"].iloc[0],
                    "window_end": window["timestamp"].iloc[-1],
                }

                if len(hr_valid) >= min_valid_samples:
                    hr_mean = np.mean(hr_valid)
                    hr_std = np.std(hr_valid)

                    # Assumes hr is in beats per minute, so 60000 / hr is the
                    # implied beat interval in milliseconds — TODO confirm.
                    # NOTE(review): these intervals come from the HR samples,
                    # not from detected beats, so the statistics below are
                    # HRV-style approximations.
                    rr_intervals = 60000 / hr_valid
                    # At least min_valid_samples - 1 differences exist here, so
                    # no empty-array guard is needed.
                    rr_diff = np.diff(rr_intervals)
                    abs_rr_diff = np.abs(rr_diff)

                    features.update(
                        {
                            # Basic statistics
                            "hr_mean": hr_mean,
                            "hr_std": hr_std,
                            "hr_min": np.min(hr_valid),
                            "hr_max": np.max(hr_valid),
                            "hr_range": np.ptp(hr_valid),
                            "hr_median": np.median(hr_valid),
                            "hr_iqr": np.percentile(hr_valid, 75)
                            - np.percentile(hr_valid, 25),
                            # Coefficient of variation in percent; the mean is
                            # >= hr_valid_min, hence never zero.
                            "hr_cv": (hr_std / hr_mean) * 100,
                            # Statistics of successive interval differences
                            "hr_rmssd": np.sqrt(np.mean(rr_diff**2)),
                            "hr_sdsd": np.std(rr_diff),
                            "hr_pnn50": (np.sum(abs_rr_diff > 50) / len(rr_diff))
                            * 100,
                            "hr_pnn20": (np.sum(abs_rr_diff > 20) / len(rr_diff))
                            * 100,
                            # Statistics of the intervals themselves
                            "hr_sdnn": np.std(rr_intervals),
                            "hr_mean_rr": np.mean(rr_intervals),
                        }
                    )
                else:
                    # Too few usable samples: keep the row, mark features missing.
                    features.update(dict.fromkeys(hr_feature_names, np.nan))

                all_hr_features.append(features)

    print(f"  {participant_id} done")

df_hr_features = pd.DataFrame(all_hr_features)
df_hr_features.to_csv(os.path.join(FEATURE_DIR, "features_hr.csv"), index=False)

In [None]:
# Temperature features: slide a fixed-length window over each phase of every
# participant/side filtered TEMP recording and write one feature row per
# window to features_temp.csv.

# Window length in samples.
window_samples = WINDOW_SIZE_SEC * FS_TEMP
# Hop between consecutive window starts: window length minus the overlap
# (identical to the overlap itself only when the overlap is exactly 50 %).
step_samples = (WINDOW_SIZE_SEC - OVERLAP_SEC) * FS_TEMP
if step_samples <= 0:
    raise ValueError("OVERLAP_SEC must be smaller than WINDOW_SIZE_SEC")

all_temp_features = []

for participant_id in participants:
    for side in ["LEFT", "RIGHT"]:

        temp_name = f"{participant_id}_{side}_temp_filtered.csv"
        participant_dir = os.path.join(DATA_FINAL_DIR, participant_id)
        # The filtered files are looked up in the participant's
        # "Filtered_Signals" folder (the folder the participant list is
        # filtered on); the participant folder itself is kept as a fallback
        # for files stored at the previous location.
        candidate_paths = [
            os.path.join(participant_dir, "Filtered_Signals", temp_name),
            os.path.join(participant_dir, temp_name),
        ]
        temp_path = next(
            (path for path in candidate_paths if os.path.exists(path)), None
        )

        if temp_path is None:
            continue

        df_temp = pd.read_csv(temp_path)

        # Windows never straddle two phases: each phase is windowed on its own.
        for phase in df_temp["phase"].dropna().unique():
            phase_data = df_temp[df_temp["phase"] == phase].reset_index(drop=True)

            if len(phase_data) < window_samples:
                continue

            window_starts = range(
                0, len(phase_data) - window_samples + 1, step_samples
            )
            for window_idx, start in enumerate(window_starts):
                window = phase_data.iloc[start : start + window_samples]

                temp_vals = window["temp_filtered"].values

                features = {
                    "participant_id": participant_id,
                    "side": side,
                    "phase": phase,
                    "window_idx": window_idx,
                    "window_start": window["timestamp"].iloc[0],
                    "window_end": window["timestamp"].iloc[-1],
                }

                # Distribution statistics
                features["temp_mean"] = np.mean(temp_vals)
                features["temp_std"] = np.std(temp_vals)
                features["temp_min"] = np.min(temp_vals)
                features["temp_max"] = np.max(temp_vals)
                features["temp_range"] = np.ptp(temp_vals)
                features["temp_median"] = np.median(temp_vals)
                features["temp_iqr"] = np.percentile(temp_vals, 75) - np.percentile(
                    temp_vals, 25
                )
                features["temp_skewness"] = stats.skew(temp_vals)
                features["temp_kurtosis"] = stats.kurtosis(temp_vals)

                # Trend: slope of a least-squares line over the sample index,
                # i.e. change per sample.
                sample_idx = np.arange(len(temp_vals))
                slope, _ = np.polyfit(sample_idx, temp_vals, 1)
                features["temp_slope"] = slope

                # Derivative: statistics of the sample-to-sample differences.
                temp_diff = np.diff(temp_vals)
                features["temp_mean_diff"] = np.mean(np.abs(temp_diff))
                features["temp_std_diff"] = np.std(temp_diff)

                all_temp_features.append(features)

    print(f"{participant_id} done")

df_temp_features = pd.DataFrame(all_temp_features)
df_temp_features.to_csv(os.path.join(FEATURE_DIR, "features_temp.csv"), index=False)


In [None]:
# Accelerometer features: slide a fixed-length window over each phase of every
# participant/side filtered ACC recording and write one feature row per window
# to features_acc.csv.

# Window length in samples (60 * 63 = 3780 with the current constants).
window_samples = WINDOW_SIZE_SEC * FS_ACC
# Hop between consecutive window starts: window length minus the overlap
# ((60 - 30) * 63 = 1890 with the current constants). It equals the overlap
# itself only when the overlap is exactly 50 %.
step_samples = (WINDOW_SIZE_SEC - OVERLAP_SEC) * FS_ACC
if step_samples <= 0:
    raise ValueError("OVERLAP_SEC must be smaller than WINDOW_SIZE_SEC")

all_acc_features = []

for participant_id in participants:
    for side in ["LEFT", "RIGHT"]:

        acc_name = f"{participant_id}_{side}_acc_filtered.csv"
        participant_dir = os.path.join(DATA_FINAL_DIR, participant_id)
        # The filtered files are looked up in the participant's
        # "Filtered_Signals" folder (the folder the participant list is
        # filtered on); the participant folder itself is kept as a fallback
        # for files stored at the previous location.
        candidate_paths = [
            os.path.join(participant_dir, "Filtered_Signals", acc_name),
            os.path.join(participant_dir, acc_name),
        ]
        acc_path = next(
            (path for path in candidate_paths if os.path.exists(path)), None
        )

        if acc_path is None:
            continue

        df_acc = pd.read_csv(acc_path)

        # Windows never straddle two phases: each phase is windowed on its own.
        for phase in df_acc["phase"].dropna().unique():
            phase_data = df_acc[df_acc["phase"] == phase].reset_index(drop=True)

            if len(phase_data) < window_samples:
                continue

            window_starts = range(
                0, len(phase_data) - window_samples + 1, step_samples
            )
            for window_idx, start in enumerate(window_starts):
                window = phase_data.iloc[start : start + window_samples]

                features = {
                    "participant_id": participant_id,
                    "side": side,
                    "phase": phase,
                    "window_idx": window_idx,
                    "window_start": window["timestamp"].iloc[0],
                    "window_end": window["timestamp"].iloc[-1],
                }

                acc_x = window["acc_x_filtered"].values
                acc_y = window["acc_y_filtered"].values
                acc_z = window["acc_z_filtered"].values

                # Per-axis distribution statistics
                for axis, vals in [("x", acc_x), ("y", acc_y), ("z", acc_z)]:
                    features[f"acc_{axis}_mean"] = np.mean(vals)
                    features[f"acc_{axis}_std"] = np.std(vals)
                    features[f"acc_{axis}_min"] = np.min(vals)
                    features[f"acc_{axis}_max"] = np.max(vals)
                    features[f"acc_{axis}_range"] = np.ptp(vals)
                    features[f"acc_{axis}_iqr"] = np.percentile(
                        vals, 75
                    ) - np.percentile(vals, 25)

                # Euclidean norm of the three filtered axes
                magnitude = np.sqrt(acc_x**2 + acc_y**2 + acc_z**2)
                features["acc_magnitude_mean"] = np.mean(magnitude)
                features["acc_magnitude_std"] = np.std(magnitude)
                features["acc_magnitude_max"] = np.max(magnitude)
                features["acc_magnitude_range"] = np.ptp(magnitude)

                # ENMO: use the precomputed column when the file has one,
                # otherwise max(norm - 1, 0) of the raw axes, otherwise of the
                # filtered axes. Every branch yields a plain ndarray so the
                # statistics below treat missing values the same way.
                # Assumes acceleration is expressed in g (hence the 1.0
                # subtracted from the norm) — TODO confirm.
                if "enmo" in window.columns:
                    enmo = window["enmo"].values
                elif "acc_x_raw" in window.columns:
                    mag_raw = np.sqrt(
                        window["acc_x_raw"].values ** 2
                        + window["acc_y_raw"].values ** 2
                        + window["acc_z_raw"].values ** 2
                    )
                    enmo = np.clip(mag_raw - 1.0, 0, None)
                else:
                    enmo = np.clip(magnitude - 1.0, 0, None)

                features["acc_enmo_mean"] = np.mean(enmo)
                features["acc_enmo_std"] = np.std(enmo)
                features["acc_enmo_max"] = np.max(enmo)
                features["acc_enmo_sum"] = np.sum(enmo)

                # Variance of the magnitude, used as an activity indicator
                features["acc_activity_level"] = np.var(magnitude)

                # Signal magnitude area: summed absolute values of the three
                # axes, averaged over the samples in the window.
                features["acc_sma"] = (
                    np.sum(np.abs(acc_x))
                    + np.sum(np.abs(acc_y))
                    + np.sum(np.abs(acc_z))
                ) / len(acc_x)

                # Share of the total variance carried by each axis
                var_x, var_y, var_z = np.var(acc_x), np.var(acc_y), np.var(acc_z)
                total_var = var_x + var_y + var_z

                if total_var > 0:
                    features["acc_x_dominance"] = var_x / total_var
                    features["acc_y_dominance"] = var_y / total_var
                    features["acc_z_dominance"] = var_z / total_var
                else:
                    # No variance on any axis: split evenly so the three
                    # shares still add up to 1.
                    equal_share = 1.0 / 3.0
                    features["acc_x_dominance"] = equal_share
                    features["acc_y_dominance"] = equal_share
                    features["acc_z_dominance"] = equal_share

                all_acc_features.append(features)

    print(f"  {participant_id} done")

df_acc_features = pd.DataFrame(all_acc_features)
df_acc_features.to_csv(os.path.join(FEATURE_DIR, "features_acc.csv"), index=False)