In [1]:
# Reload edited project modules automatically before each cell executes,
# so changes to the imported helpers are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Process young-adult (Empatica E4) cohort

This notebook builds per-subject, 64 Hz, UTC-aligned physiological time series, snaps
self-report pain labels (`pain_data.csv`) onto the resulting time grid, and finally
concatenates all subjects into a single Feather file.

## Expected folder layout

```
data/
  young_adults/
    original_files/
      101/
        BVP.csv
        EDA.csv
        TEMP.csv
        HR.csv
        pain_data.csv
      102/
        ...
    processed_data/
      combined/          # physiological merge only
      with_self_report/  # physiological + snapped pain labels
    experiment_time.csv  # passed to the label-snapping step as `time_window_csv`
```

## Outputs

- `processed_data/combined/<subject>_merged_64hz.csv`
- `processed_data/with_self_report/<subject>_merged_64hz_with_pain.csv`
- `young_adults.feather` (all subjects' labelled series stacked, with an added `subject` column)

All merged outputs include `timestamp_ns` (UTC, int64) and `datetime_utc`.


In [2]:
import os

# Project helpers are imported from the packaged layout (src/).
from src.preprocessing import MergeConfig, process_empatica_e4_physio_all_subjects, SCIPY_OK
from src.merge import batch_join_subject_folders

# Directory layout for the young-adult cohort; every path hangs off ROOT_DATA_FOLDER.
ROOT_DATA_FOLDER = "./data"
ROOT_FOLDER_YOUNG_ADULTS = os.path.join(ROOT_DATA_FOLDER, 'young_adults')

# Raw per-subject files (input side of the pipeline).
ROOT_ORIGINAL = os.path.join(ROOT_FOLDER_YOUNG_ADULTS, 'original_files')

# Output folders: physiological merge only, and merge plus snapped pain labels.
OUT_PHYSIO, OUT_WITH_SR = (
    os.path.join(ROOT_FOLDER_YOUNG_ADULTS, 'processed_data', leaf)
    for leaf in ('combined', 'with_self_report')
)


In [3]:
# Settings shared by the physiological merge and the label-snapping step.
cfg = MergeConfig(
    # How EDA/TEMP samples are placed on the 64 Hz grid native to BVP.
    map_method='snap',           # options: 'snap' (default) | 'interp'
    map_snap_kind='one_to_one',  # options: 'one_to_one' | 'per_grid'
    map_interp_kind='linear',

    # Heart rate is taken from Empatica's HR.csv, which is already at 1 Hz.
    hr_map_method='snap',
    hr_map_interp_kind='cubic',

    # Parsing of self-report timestamps and the tolerance used when snapping them.
    pain_tz='America/New_York',
    pain_max_snap_s=0.25,
)

# Surface a missing SciPy up front, before the long-running processing cells.
if not SCIPY_OK and (
    cfg.map_method == 'interp'
    or cfg.hr_map_interp_kind in ('quadratic', 'cubic')
):
    print('[WARN] SciPy not available: quadratic/cubic interpolation will fall back to linear; bandpass disabled.')

In [4]:
# Step 1: merge the physiological channels for every subject (pain labels are added later).
# Input folder is ROOT_ORIGINAL; results are written under OUT_PHYSIO.
process_empatica_e4_physio_all_subjects(root=ROOT_ORIGINAL, out_dir=OUT_PHYSIO, cfg=cfg)

[OK] 101 -> ./data/young_adults/processed_data/combined/101_merged_64hz.csv
[OK] 102 -> ./data/young_adults/processed_data/combined/102_merged_64hz.csv
[OK] 103 -> ./data/young_adults/processed_data/combined/103_merged_64hz.csv
[OK] 104 -> ./data/young_adults/processed_data/combined/104_merged_64hz.csv
[OK] 105 -> ./data/young_adults/processed_data/combined/105_merged_64hz.csv
[OK] 106 -> ./data/young_adults/processed_data/combined/106_merged_64hz.csv
[OK] 107 -> ./data/young_adults/processed_data/combined/107_merged_64hz.csv
[OK] 108 -> ./data/young_adults/processed_data/combined/108_merged_64hz.csv
[OK] 109 -> ./data/young_adults/processed_data/combined/109_merged_64hz.csv
[OK] 110 -> ./data/young_adults/processed_data/combined/110_merged_64hz.csv
[OK] 111 -> ./data/young_adults/processed_data/combined/111_merged_64hz.csv
[OK] 112 -> ./data/young_adults/processed_data/combined/112_merged_64hz.csv
[OK] 113 -> ./data/young_adults/processed_data/combined/113_merged_64hz.csv
[OK] 114 -> 

In [5]:
# 2) Snap pain labels (pain_data.csv) onto the physiological grid.
# The time zone and snapping tolerance are both read from `cfg`, so the values
# used here cannot drift from the ones declared when `cfg` was built.
batch_join_subject_folders(
    merged_dir=OUT_PHYSIO,
    subjects_root=ROOT_ORIGINAL,
    out_dir=OUT_WITH_SR,
    report_filename='pain_data.csv',
    out_name_template='{subject_id}_merged_64hz_with_pain.csv',
    merged_glob='*_merged_64hz.csv',
    time_window_csv=os.path.join(ROOT_FOLDER_YOUNG_ADULTS, 'experiment_time.csv'),
    tz_local=cfg.pain_tz,
    max_snap_s=cfg.pain_max_snap_s,
)

[OK] 101: ./data/young_adults/processed_data/combined/101_merged_64hz.csv + ./data/young_adults/original_files/101/pain_data.csv -> ./data/young_adults/processed_data/with_self_report/101_merged_64hz_with_pain.csv
[OK] 102: ./data/young_adults/processed_data/combined/102_merged_64hz.csv + ./data/young_adults/original_files/102/pain_data.csv -> ./data/young_adults/processed_data/with_self_report/102_merged_64hz_with_pain.csv
[OK] 103: ./data/young_adults/processed_data/combined/103_merged_64hz.csv + ./data/young_adults/original_files/103/pain_data.csv -> ./data/young_adults/processed_data/with_self_report/103_merged_64hz_with_pain.csv
[OK] 104: ./data/young_adults/processed_data/combined/104_merged_64hz.csv + ./data/young_adults/original_files/104/pain_data.csv -> ./data/young_adults/processed_data/with_self_report/104_merged_64hz_with_pain.csv
[OK] 105: ./data/young_adults/processed_data/combined/105_merged_64hz.csv + ./data/young_adults/original_files/105/pain_data.csv -> ./data/young

In [6]:
import os
import pandas as pd
from pathlib import Path

# Gather every labelled per-subject CSV, tag its rows with the subject ID, and
# stack everything into a single frame.
directory_path = Path(OUT_WITH_SR)
subject_frames = []

for file_path in sorted(directory_path.glob('*.csv')):
    if not file_path.is_file():
        continue
    print(file_path)
    subject_df = pd.read_csv(file_path)
    # The subject ID is the part of the file name before the first underscore.
    subject_id = file_path.name.split('_')[0]
    subject_df['subject'] = subject_id
    subject_frames.append(subject_df)

# Fail loudly instead of writing an empty dataset when nothing was found.
if not subject_frames:
    raise FileNotFoundError(f'No CSV files found in {str(directory_path)!r}; nothing to combine.')

# Concatenate once: growing a frame inside the loop re-copies all earlier rows
# on every iteration.
df = pd.concat(subject_frames, ignore_index=True)

# save all subjects' data to a single .feather file
df.to_feather(os.path.join(ROOT_FOLDER_YOUNG_ADULTS, 'young_adults.feather'))

data/young_adults/processed_data/with_self_report/101_merged_64hz_with_pain.csv
data/young_adults/processed_data/with_self_report/102_merged_64hz_with_pain.csv
data/young_adults/processed_data/with_self_report/103_merged_64hz_with_pain.csv
data/young_adults/processed_data/with_self_report/104_merged_64hz_with_pain.csv
data/young_adults/processed_data/with_self_report/105_merged_64hz_with_pain.csv
data/young_adults/processed_data/with_self_report/106_merged_64hz_with_pain.csv
data/young_adults/processed_data/with_self_report/107_merged_64hz_with_pain.csv
data/young_adults/processed_data/with_self_report/108_merged_64hz_with_pain.csv
data/young_adults/processed_data/with_self_report/109_merged_64hz_with_pain.csv
data/young_adults/processed_data/with_self_report/110_merged_64hz_with_pain.csv
data/young_adults/processed_data/with_self_report/111_merged_64hz_with_pain.csv
data/young_adults/processed_data/with_self_report/112_merged_64hz_with_pain.csv
data/young_adults/processed_data/with_se

In [9]:

# Optional quick-look plot of one subject's pain labels, left disabled.
# Uncomment the lines below to run it.
#
# df[df['subject']=='101'].ffill().bfill().plot(x='datetime_utc',
#                                               #y=['bvp','eda','temperature','hr','PainLevel'],
#                                               y=['PainLevel'],
#                                               )

" df[df['subject']=='101'].ffill().bfill().plot(x='datetime_utc',\n                                              #y=['bvp','eda','temperature','hr','PainLevel'],\n                                              y=['PainLevel'],\n                                              ) "