In [2]:
# Auto-reload imported modules before each cell executes, so edits to the
# project's preprocessing/merge code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

# Process young-adult (Empatica E4) cohort

This notebook builds per-subject, 64 Hz, UTC-aligned physiological time series and then snaps
self-report pain labels (`pain_data.csv`) onto the resulting time grid.

## Expected folder layout

```
young_adults/
  experiment_time.csv     # time-window CSV passed to the pain-label join (step 2)
  original_files/
    101/
      BVP.csv
      EDA.csv
      TEMP.csv
      HR.csv
      pain_data.csv
    102/
      ...
  processed_data/
    per_subject/          # physiological merge only
    combined/             # physiological + snapped pain labels
```

## Outputs

- `processed_data/per_subject/<subject>_merged_64hz.csv`
- `processed_data/combined/<subject>_merged_64hz_with_pain.csv`

All merged outputs include `timestamp_ns` (UTC, int64) and `datetime_utc`.


In [3]:
import glob
import os

import pandas as pd

# Import from either a packaged layout (src/) or a flat layout.
try:
    from src.preprocessing import MergeConfig, process_empatica_e4_physio_all_subjects, SCIPY_OK
    from src.merge import batch_join_subject_folders
except ImportError:
    from preprocessing import MergeConfig, process_empatica_e4_physio_all_subjects, SCIPY_OK
    from merge import batch_join_subject_folders

# Dataset root; every other path in this notebook is derived from it.
ROOT = './young_adults'

# Raw input: one sub-folder per subject (BVP/EDA/TEMP/HR/pain_data CSVs).
ROOT_ORIGINAL = os.path.join(ROOT, 'original_files')

# Outputs under processed_data/:
#   per_subject -> physiological merge only
#   combined    -> physiological + snapped pain labels
OUT_PHYSIO, OUT_COMBINED = (
    os.path.join(ROOT, 'processed_data', leaf)
    for leaf in ('per_subject', 'combined')
)


In [6]:
# Pipeline configuration shared by the physiological merge (step 1) and the
# pain-label join (step 2).
cfg = MergeConfig(
    # --- EDA/TEMP -> 64 Hz BVP-native grid ---
    map_method='snap',           # 'snap' (default) or 'interp'
    map_snap_kind='one_to_one',  # 'one_to_one' or 'per_grid'
    map_interp_kind='linear',

    # --- HR, read from Empatica HR.csv (already at 1 Hz) ---
    hr_map_method='snap',
    hr_map_interp_kind='cubic',

    # --- Self-report parsing and snapping tolerance ---
    pain_tz='America/New_York',
    pain_max_snap_s=0.25,
)

# Warn up front when the chosen options rely on SciPy but it is unavailable.
# NOTE(review): hr_map_interp_kind is checked regardless of hr_map_method, so the
# warning can appear even when HR is snapped rather than interpolated — confirm
# this is intended.
needs_scipy = cfg.map_method == 'interp' or cfg.hr_map_interp_kind in ('quadratic', 'cubic')
if needs_scipy and not SCIPY_OK:
    print('[WARN] SciPy not available: quadratic/cubic interpolation will fall back to linear; bandpass disabled.')

# Interpolate instantaneous HR onto 1Hz grid
    kind = cfg.hr_interp_kind
    # If SciPy not available, quadratic/cubic will fall back to linear inside interp_on_grid
    hr_1 = interp_on_grid(t_hr_inst, hr_inst, t1_ns, kind=kind)

    if cfg.hr_target == "1hz":
        # If a 1Hz dataset is needed, return HR broadcast to 64Hz grid by nearest-neighbor.
        # (Still keeps final merged DF at 64Hz.)
        hr_64 = nearest_neighbor_on_grid(t1_ns.astype(np.int64), hr_1.astype(np.float64), grid_ts, cfg.map_snap_kind)
        hr_64 = bandpass_filter_64hz(hr_64, cfg)
        return hr_64

    # Default: interpolate 1Hz HR to 64Hz grid
    hr_64 = interp_on_grid(t1_ns.astype(np.int64), hr_1.astype(np.float64), grid_ts, kind=kind)
    hr_64 = bandpass_filter_64hz(hr_64, cfg)

# HR mapping (separate knobs; default cubic interpolation onto 64Hz)
    if hr is None or hr.empty:
        hr_map = np.full(grid_ts.shape, np.nan, dtype=np.float64)
    else:
        hr_map = map_scalar_channel_to_grid(
            hr, grid_ts,
            method=getattr(cfg, "hr_map_method", "interp"),
            interp_kind=getattr(cfg, "hr_map_interp_kind", "cubic"),
            snap_kind=cfg.map_snap_kind,
        )
        # optional smoothing
        hr_map = bandpass_filter_64hz(hr_map, cfg)

In [None]:
# Step 1: merge the physiological channels for all subjects (no labels yet).
# Reads from ROOT_ORIGINAL and writes the merged files into OUT_PHYSIO.
physio_job = dict(root=ROOT_ORIGINAL, out_dir=OUT_PHYSIO, cfg=cfg)
process_empatica_e4_physio_all_subjects(**physio_job)

[OK] 101 -> ./young_adults/processed_data/per_subject/101_merged_64hz.csv


In [5]:
# Step 2: snap the self-report pain labels (pain_data.csv) onto the
# physiological grid produced by step 1.
experiment_windows_csv = os.path.join(ROOT, 'experiment_time.csv')

batch_join_subject_folders(
    # inputs
    merged_dir=OUT_PHYSIO,
    merged_glob='*_merged_64hz.csv',
    subjects_root=ROOT_ORIGINAL,
    report_filename='pain_data.csv',
    time_window_csv=experiment_windows_csv,
    # outputs
    out_dir=OUT_COMBINED,
    out_name_template='{subject_id}_merged_64hz_with_pain.csv',
    # label parsing / snapping options taken from the shared config
    tz_local=cfg.pain_tz,
    max_snap_s=cfg.pain_max_snap_s,
)

[OK] 101: ./young_adults/processed_data/per_subject/101_merged_64hz.csv + ./young_adults/original_files/101/pain_data.csv -> ./young_adults/processed_data/combined/101_merged_64hz_with_pain.csv
[OK] 102: ./young_adults/processed_data/per_subject/102_merged_64hz.csv + ./young_adults/original_files/102/pain_data.csv -> ./young_adults/processed_data/combined/102_merged_64hz_with_pain.csv
[OK] 103: ./young_adults/processed_data/per_subject/103_merged_64hz.csv + ./young_adults/original_files/103/pain_data.csv -> ./young_adults/processed_data/combined/103_merged_64hz_with_pain.csv
[OK] 104: ./young_adults/processed_data/per_subject/104_merged_64hz.csv + ./young_adults/original_files/104/pain_data.csv -> ./young_adults/processed_data/combined/104_merged_64hz_with_pain.csv
[OK] 105: ./young_adults/processed_data/per_subject/105_merged_64hz.csv + ./young_adults/original_files/105/pain_data.csv -> ./young_adults/processed_data/combined/105_merged_64hz_with_pain.csv
[OK] 106: ./young_adults/proce

In [None]:
# Sanity check: preview the first combined (physiological + pain labels) output.
# `glob` and `pandas` are imported in the notebook's top import cell, so this
# cell carries no imports of its own.
paths = sorted(glob.glob(os.path.join(OUT_COMBINED, '*_merged_64hz_with_pain.csv')))
if not paths:
    # Fail loudly instead of hitting an IndexError on paths[0] below.
    raise FileNotFoundError(f'No outputs found in {OUT_COMBINED}')

df = pd.read_csv(paths[0])
df.head()
