# Feature Extraction - Run Once, Use Everywhere

This notebook extracts all features (physiology, behavior, gaze) from preprocessing results and saves them to a pickle file.

**Parameterized**: Set `TIMEFRAME` to 'PRE' or 'POST' for pre-decision or post-decision analysis.

**Run this notebook ONCE after preprocessing completes.**

All other model notebooks will load the saved features instead of re-extracting.

In [4]:
# ============================================================================
# CONFIGURATION: Set timeframe for analysis
# ============================================================================
TIMEFRAME = 'POST'  # Options: 'PRE', 'POST' 
# ============================================================================

import json
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import re
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set time window based on TIMEFRAME
if TIMEFRAME == 'PRE':
    TIME_WINDOW = (-2.0, 0.0)  # PRE-decision: -2 to 0 seconds before submit
    SUFFIX = '_pre'
elif TIMEFRAME == 'POST':
    TIME_WINDOW = (0.0, 2.0)   # POST-decision: 0 to 2 seconds after submit
    SUFFIX = '_post'
else:
    raise ValueError("TIMEFRAME must be 'PRE' or 'POST'")

print(f"\n{'='*70}")
print(f"FEATURE EXTRACTION: {TIMEFRAME}-DECISION PERIOD")
print(f"Time window: {TIME_WINDOW[0]} to {TIME_WINDOW[1]} seconds")
print(f"{'='*70}\n")
print(f"Feature extraction started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


FEATURE EXTRACTION: POST-DECISION PERIOD
Time window: 0.0 to 2.0 seconds

Feature extraction started: 2026-01-11 07:33:46


## 1. Extract Physiology and Behavior Features

In [5]:
preprocessing_dir = Path('../../data/results/preprocessing_outputs/preprocessing')
preprocessing_files = sorted(preprocessing_dir.glob('preprocessing_*.json'))
raw_dir = Path('../../data/raw/json')
baseline_method = 't3_stable_pre_decision'

print(f"Found {len(preprocessing_files)} preprocessing files")
print(f"Baseline correction method: {baseline_method}")

Found 99 preprocessing files
Baseline correction method: t3_stable_pre_decision


In [6]:
# Extract all features
all_physiology_features = []
all_behavior_features = []
all_outcomes = []
all_subject_ids = []
all_trial_ids = []

total_trials = 0

for preprocessed_file in preprocessing_files:
    with open(preprocessed_file, 'r') as f:
        preprocessed = json.load(f)
    
    subject_id = preprocessed['subject_id']
    print(f"\nProcessing subject: {subject_id}", end=" ")
    
    matches = list(raw_dir.glob(f"*{subject_id.split('_')[-1]}.json"))
    pattern = subject_id.replace("_", ".*")
    match = next((f for f in matches if re.search(pattern, f.name)), None)
    if not match:
        print("❌ No matching JSON file")
        continue
    
    with open(match, 'r') as f:
        raw_data = json.load(f)
    
    subject_trial_count = 0
    
    for trial_id, trial_data in preprocessed['trial_data'].items():
        method_data = trial_data['methods'][baseline_method]
        
        if method_data['success'] != True:
            continue
        
        raw_trial = raw_data['trials'][int(trial_id)-1]
        
        if not raw_trial['gamble details']['submitted']:
            continue
        
        # Extract pupil data
        time_aligned = np.array(trial_data['time_relative_to_submit'])
        pupil_avg = np.array(method_data['pupil_avg_baselined'])
        pupil_L = np.array(method_data['pupil_L_baselined'])
        pupil_R = np.array(method_data['pupil_R_baselined'])

        valid_mask = ~np.isnan(pupil_avg)
        pupil_avg_clean = pupil_avg[valid_mask]
        pupil_L_clean = pupil_L[valid_mask]
        pupil_R_clean = pupil_R[valid_mask]
        time_clean = time_aligned[valid_mask]

        if len(pupil_avg_clean) < 20:
            continue

        # Filter to time window (PRE or POST)
        if TIMEFRAME == 'PRE':
            time_mask = (time_clean >= TIME_WINDOW[0]) & (time_clean < TIME_WINDOW[1])
        else:  # POST
            time_mask = (time_clean > TIME_WINDOW[0]) & (time_clean <= TIME_WINDOW[1])
        
        pupil = pupil_avg_clean[time_mask]
        pupil_L_filtered = pupil_L_clean[time_mask]
        pupil_R_filtered = pupil_R_clean[time_mask]
        time_filtered = time_clean[time_mask]

        if len(pupil) < 5:
            continue

        # Calculate derivatives
        pupil_velocity = np.diff(pupil) if len(pupil) > 1 else np.array([0])
        pupil_acceleration = np.diff(pupil_velocity) if len(pupil_velocity) > 1 else np.array([0])
        dilation_mask = pupil_velocity > 0 if len(pupil_velocity) > 0 else np.array([False])

        # PHYSIOLOGY FEATURES
        physiology_features = {
            f'pupil_mean{SUFFIX}': np.mean(pupil),
            f'pupil_std{SUFFIX}': np.std(pupil),
            f'pupil_slope{SUFFIX}': np.polyfit(time_filtered, pupil, 1)[0] if len(time_filtered) > 1 else 0,
            f'time_to_peak{SUFFIX}': time_filtered[np.argmax(pupil)] - time_filtered[0] if len(time_filtered) > 0 else 0,
            f'pupil_cv{SUFFIX}': np.std(pupil) / np.abs(np.mean(pupil)) if (len(pupil) > 0 and np.mean(pupil) != 0) else 0,
            f'pupil_velocity_mean{SUFFIX}': np.mean(np.abs(pupil_velocity)) if len(pupil_velocity) > 0 else 0,
            f'pupil_max_dilation_rate{SUFFIX}': np.max(pupil_velocity) if len(pupil_velocity) > 0 else 0,
            f'pupil_max_constriction_rate{SUFFIX}': np.abs(np.min(pupil_velocity)) if len(pupil_velocity) > 0 else 0,
            f'pupil_acceleration_std{SUFFIX}': np.std(pupil_acceleration) if len(pupil_acceleration) > 1 else 0,
            f'pct_time_dilating{SUFFIX}': np.mean(dilation_mask) if len(dilation_mask) > 0 else 0,
            f'num_dilation_peaks{SUFFIX}': np.sum(np.diff(np.sign(pupil_velocity)) > 0) if len(pupil_velocity) > 1 else 0,
            f'eye_asymmetry{SUFFIX}': np.nanmean(np.abs(pupil_L_filtered - pupil_R_filtered)) if len(pupil_L_filtered) > 0 else 0,
            f'eye_asymmetry_std{SUFFIX}': np.nanstd(pupil_L_filtered - pupil_R_filtered) if len(pupil_L_filtered) > 1 else 0,
        }
        
        # BEHAVIOR FEATURES
        gamble_params = raw_trial['gamble details']['gamble parameters']
        lct = raw_trial['lct']
        
        show_screen_time = None
        submit_time = None
        click_time = None
        
        for event in lct:
            if 'show screen' in event['event']:
                show_screen_time = event['time']
            elif 'gamble clicked' in event['event']:
                click_time = event['time']
            elif 'submit' in event['event']:
                submit_time = event['time']
        
        if show_screen_time is None or submit_time is None:
            continue
        
        reaction_time = (click_time - show_screen_time) if click_time else np.nan
        decision_time = (submit_time - show_screen_time)

        invest_ev = (gamble_params['invest amount 1'] * gamble_params['invest probability 1'] + 
                    gamble_params['invest amount 2'] * gamble_params['invest probability 2'])
        keep_ev = gamble_params['keep amount']
        ev_difference = invest_ev - keep_ev

        invest_variance = ((gamble_params['invest amount 1'] - invest_ev)**2 * gamble_params['invest probability 1'] +
                        (gamble_params['invest amount 2'] - invest_ev)**2 * gamble_params['invest probability 2'])
        
        final_choice = raw_trial['gamble details']['choices'][-1]['choice'] if len(raw_trial['gamble details']['choices']) > 0 else None
        chose_invest = 1 if final_choice == 'INVEST' else 0
        
        behavior_features = {
            'reaction_time': reaction_time if not np.isnan(reaction_time) else decision_time,
            'decision_time': decision_time,
            'ev_difference': ev_difference,
            'invest_variance': invest_variance,
            'ambiguity': gamble_params['ambiguity'],
            'condition_social': 1 if gamble_params['condition'] == 'social' else 0,
            'risk_premium': ev_difference / np.sqrt(invest_variance) if invest_variance > 0 else 0,
        }
                
        outcome = chose_invest
        
        all_physiology_features.append(physiology_features)
        all_behavior_features.append(behavior_features)
        all_outcomes.append(outcome)
        all_subject_ids.append(subject_id)
        all_trial_ids.append(f"{trial_id}_{subject_id}")
        
        subject_trial_count += 1
    
    print(f"✓ {subject_trial_count} trials")
    total_trials += subject_trial_count

print(f"\n{'='*80}")
print(f"Total trials extracted: {total_trials}")


Processing subject: 0727_1400_539136F ✓ 0 trials

Processing subject: 0727_1400_A6I5HI6 ✓ 0 trials

Processing subject: 0806_1000_539136F ✓ 118 trials

Processing subject: 0806_1000_U9TEJGM ✓ 131 trials

Processing subject: 0811_1000_4LI8GO7 ✓ 121 trials

Processing subject: 0811_1000_539136F ✓ 130 trials

Processing subject: 0811_1000_U9TEJGM ✓ 131 trials

Processing subject: 0813_1000_539136F ✓ 129 trials

Processing subject: 0813_1000_9M4VCHG ✓ 123 trials

Processing subject: 0813_1000_U9TEJGM ✓ 128 trials

Processing subject: 0813_1600_539136F ✓ 124 trials

Processing subject: 0813_1600_9M4VCHG ✓ 130 trials

Processing subject: 0813_1600_U9TEJGM ✓ 123 trials

Processing subject: 0816_1400_539136F ✓ 129 trials

Processing subject: 0816_1400_9M4VCHG ✓ 124 trials

Processing subject: 0816_1400_U9TEJGM ✓ 125 trials

Processing subject: 0817_1000_539136F ✓ 131 trials

Processing subject: 0817_1000_9M4VCHG ✓ 131 trials

Processing subject: 0817_1000_U9TEJGM ✓ 119 trials

Processing subj

## 2. Extract Gaze Features

In [7]:
def extract_gaze_features_from_trial(eye_data, submit_time=None, time_window=(-2.0, 0.0)):
    """
    Extract gaze features from raw eye tracking data.
    
    Parameters
    ----------
    eye_data : list
        Raw eye tracking samples from trial
    submit_time : float, optional
        Timestamp of submit button press for time-aligning
    time_window : tuple, optional
        (start, end) in seconds relative to submit_time
        Default: (-2.0, 0.0) for PRE-decision period
    
    Returns
    -------
    dict or None
        Dictionary of gaze features, or None if insufficient data
    """
    if not eye_data or len(eye_data) == 0:
        return None
    
    timestamps = np.array([s['time'] for s in eye_data])
    
    # FILTER TO TIME WINDOW if submit_time provided
    if submit_time is not None:
        time_relative = timestamps - submit_time
        time_mask = (time_relative >= time_window[0]) & (time_relative < time_window[1])
        
        # Filter all data to time window
        indices = np.where(time_mask)[0]
        if len(indices) < 5:  # Need minimum samples
            return None
        
        # Apply filter
        eye_data = [eye_data[i] for i in indices]
        timestamps = timestamps[indices]
    
    # Extract gaze data from filtered eye_data
    gaze_x_L = np.array([s.get('gazeL_X', np.nan) for s in eye_data])
    gaze_y_L = np.array([s.get('gazeL_Y', np.nan) for s in eye_data])
    gaze_x_R = np.array([s.get('gazeR_X', np.nan) for s in eye_data])
    gaze_y_R = np.array([s.get('gazeR_Y', np.nan) for s in eye_data])
    
    gaze_x = np.nanmean([gaze_x_L, gaze_x_R], axis=0)
    gaze_y = np.nanmean([gaze_y_L, gaze_y_R], axis=0)
    
    screen_x_L = np.array([s.get('pupilLSensorPosL_X', np.nan) for s in eye_data])
    screen_y_L = np.array([s.get('pupilLSensorPosL_Y', np.nan) for s in eye_data])
    screen_x_R = np.array([s.get('pupilLSensorPosR_X', np.nan) for s in eye_data])
    screen_y_R = np.array([s.get('pupilLSensorPosR_Y', np.nan) for s in eye_data])
    
    screen_x = np.nanmean([screen_x_L, screen_x_R], axis=0)
    screen_y = np.nanmean([screen_y_L, screen_y_R], axis=0)
    
    valid_L = np.array([s.get('validL', 0) for s in eye_data])
    valid_R = np.array([s.get('validR', 0) for s in eye_data])
    
    features = {}
    features['gaze_valid_pct'] = np.mean((valid_L > 0) & (valid_R > 0))
    
    valid_mask = (valid_L > 0) & (valid_R > 0)
    if valid_mask.sum() < 5:
        return None
    
    gaze_x_valid = gaze_x[valid_mask]
    gaze_y_valid = gaze_y[valid_mask]
    screen_x_valid = screen_x[valid_mask]
    screen_y_valid = screen_y[valid_mask]
    timestamps_valid = timestamps[valid_mask]
    
    features['gaze_x_mean'] = np.nanmean(gaze_x_valid)
    features['gaze_x_std'] = np.nanstd(gaze_x_valid)
    features['gaze_y_mean'] = np.nanmean(gaze_y_valid)
    features['gaze_y_std'] = np.nanstd(gaze_y_valid)
    features['screen_x_mean'] = np.nanmean(screen_x_valid)
    features['screen_x_std'] = np.nanstd(screen_x_valid)
    features['screen_y_mean'] = np.nanmean(screen_y_valid)
    features['screen_y_std'] = np.nanstd(screen_y_valid)
    
    dt = np.diff(timestamps_valid)
    dt[dt == 0] = 1e-6
    dx = np.diff(screen_x_valid)
    dy = np.diff(screen_y_valid)
    
    velocity = np.sqrt(dx**2 + dy**2) / dt
    features['gaze_velocity_mean'] = np.nanmean(velocity)
    features['gaze_velocity_std'] = np.nanstd(velocity)
    features['gaze_velocity_max'] = np.nanmax(velocity)
    
    acceleration = np.diff(velocity) / dt[:-1]
    features['gaze_acceleration_mean'] = np.nanmean(np.abs(acceleration))
    features['gaze_acceleration_std'] = np.nanstd(acceleration)
    
    fixation_mask = velocity < 30
    saccade_mask = velocity > 100
    features['fixation_ratio'] = np.mean(fixation_mask)
    features['saccade_ratio'] = np.mean(saccade_mask)
    features['saccade_count'] = np.sum(np.diff(saccade_mask.astype(int)) == 1)
    
    features['gaze_dispersion_x'] = np.nanmax(screen_x_valid) - np.nanmin(screen_x_valid)
    features['gaze_dispersion_y'] = np.nanmax(screen_y_valid) - np.nanmin(screen_y_valid)
    features['gaze_path_length'] = np.sum(np.sqrt(dx**2 + dy**2))
    
    return features

def map_subject_filename(json_filename):
    match = re.search(r'(\d{4})_(\d{4})_LCT_DESKTOP-([A-Z0-9]+)', json_filename)
    if match:
        date1, date2, desktop_id = match.groups()
        return f"{date1}_{date2}_{desktop_id}"
    return None

In [8]:
# Extract gaze features
# For PRE-decision: use raw JSON trial['eye'] data (PRE-decision window only)
# For POST-decision: use preprocessed PKL data (has POST-decision gaze data!)

if TIMEFRAME == 'PRE':
    print(f"Extracting gaze features from raw JSON with {TIMEFRAME}-decision time window filtering...")

    raw_json_files = sorted(raw_dir.glob('*.json'))
    all_gaze_data = []
    skipped_no_submit = 0

    for file_path in raw_json_files:
        with open(file_path, 'r') as f:
            data = json.load(f)

        subject_id = map_subject_filename(file_path.name)
        if not subject_id:
            continue

        trials = data.get('trials', [])

        for trial_idx, trial in enumerate(trials):
            eye_data = trial.get('eye', [])
            if not eye_data:
                continue

            # Get submit time from LCT
            lct = trial.get('lct', [])
            submit_time = None
            for event in lct:
                if 'submit' in event['event']:
                    submit_time = event['time']
                    break

            if submit_time is None:
                skipped_no_submit += 1
                continue

            gamble_details = trial.get('gamble details', {})
            trial_id = str(gamble_details.get('trial', trial_idx))

            # Extract gaze features with time window filtering
            gaze_features = extract_gaze_features_from_trial(
                eye_data,
                submit_time=submit_time,
                time_window=TIME_WINDOW
            )
            if gaze_features is None:
                continue

            gaze_features['subject_id'] = subject_id
            gaze_features['trial_id'] = f"{trial_id}_{subject_id}"
            all_gaze_data.append(gaze_features)

    print(f"Extracted gaze features from {len(all_gaze_data)} trials")
    print(f"Skipped {skipped_no_submit} trials without submit time")
    print(f"✓ Gaze features use same time window as pupil features ({TIME_WINDOW[0]} to {TIME_WINDOW[1]}s)")

elif TIMEFRAME == 'POST':
    print(f"Extracting gaze features from raw PKL data for {TIMEFRAME}-decision period...")
    print(f"  Time window: {TIME_WINDOW[0]} to {TIME_WINDOW[1]} seconds")
    print(f"  Using preprocessed files to get trial info (avoiding JSON files)")
    
    raw_pkl_dir = Path('../../data/raw/eye')
    
    # Step 1: Pre-load all PKL files
    print(f"  [1/3] Loading PKL files...")
    pkl_cache = {}
    for i, pkl_file in enumerate(raw_pkl_dir.glob('*.pkl')):
        if i % 20 == 0:
            print(f"    Loading PKL {i}...")
        subject_id_from_pkl = pkl_file.stem
        try:
            with open(pkl_file, 'rb') as f:
                eye_pkl = pickle.load(f)
            if not isinstance(eye_pkl, pd.DataFrame):
                eye_pkl = pd.DataFrame(eye_pkl)
            eye_pkl = eye_pkl.sort_values('time').reset_index(drop=True)
            pkl_cache[subject_id_from_pkl] = eye_pkl
        except Exception as e:
            print(f"    ⚠ Error loading {pkl_file.name}: {e}")
    print(f"  Loaded {len(pkl_cache)} PKL files")
    
    # Step 2: Build lookup of valid trials from all_trial_ids
    # These are trials that already passed all validation checks
    print(f"  [2/3] Building valid trial lookup...")
    valid_trials = set(all_trial_ids)
    print(f"  Found {len(valid_trials)} valid trials from preprocessing")
    
    # Step 3: Extract gaze features
    print(f"  [3/3] Extracting gaze features...")
    all_gaze_data = []
    skipped_count = 0
    trials_processed = 0
    
    for preprocessed_file in preprocessing_files:
        with open(preprocessed_file, 'r') as f:
            preprocessed = json.load(f)
        
        subject_id = preprocessed['subject_id']
        
        # Check if PKL exists
        if subject_id not in pkl_cache:
            continue
        
        eye_pkl = pkl_cache[subject_id]
        
        # Process each trial
        for trial_id, trial_data in preprocessed['trial_data'].items():
            trials_processed += 1
            if trials_processed % 1000 == 0:
                print(f"    Processed {trials_processed} trials, extracted {len(all_gaze_data)} gaze features...")
            
            # Skip if trial not in valid set (already filtered in physiology extraction)
            trial_key = f"{trial_id}_{subject_id}"
            if trial_key not in valid_trials:
                skipped_count += 1
                continue
            
            # Get submit time from preprocessed data
            # time_relative_to_submit has times relative to submit, so we can calculate submit_time
            time_relative = np.array(trial_data['time_relative_to_submit'])
            time_absolute = np.array(trial_data['time'])
            
            if len(time_relative) == 0 or len(time_absolute) == 0:
                skipped_count += 1
                continue
            
            # Calculate submit time: absolute_time = submit_time + relative_time
            # So: submit_time = absolute_time - relative_time
            submit_time = time_absolute[0] - time_relative[0]
            
            # Extract POST-decision window from PKL using time-based filtering
            post_window_start = submit_time
            post_window_end = submit_time + 2000  # 2 seconds
            
            # Simple time filter (very fast)
            post_decision_data = eye_pkl[
                (eye_pkl['time'] > post_window_start) & 
                (eye_pkl['time'] <= post_window_end)
            ]
            
            if len(post_decision_data) < 5:
                skipped_count += 1
                continue
            
            # Convert to list of dicts
            eye_data_list = post_decision_data.to_dict('records')
            
            # Extract gaze features
            gaze_features = extract_gaze_features_from_trial(
                eye_data_list,
                submit_time=None,
                time_window=None
            )
            
            if gaze_features is None:
                skipped_count += 1
                continue
            
            gaze_features['subject_id'] = subject_id
            gaze_features['trial_id'] = f"{trial_id}_{subject_id}"
            all_gaze_data.append(gaze_features)
    
    print(f"\n✓ Extracted gaze features from {len(all_gaze_data)} trials")
    print(f"  Total trials processed: {trials_processed}")
    print(f"  Skipped: {skipped_count} trials")

Extracting gaze features from raw PKL data for POST-decision period...
  Time window: 0.0 to 2.0 seconds
  Using preprocessed files to get trial info (avoiding JSON files)
  [1/3] Loading PKL files...
    Loading PKL 0...
    Loading PKL 20...
    Loading PKL 40...
    Loading PKL 60...
    Loading PKL 80...
    Loading PKL 100...
  Loaded 108 PKL files
  [2/3] Building valid trial lookup...
  Found 11467 valid trials from preprocessing
  [3/3] Extracting gaze features...
    Processed 1000 trials, extracted 962 gaze features...
    Processed 2000 trials, extracted 1941 gaze features...
    Processed 3000 trials, extracted 2908 gaze features...
    Processed 4000 trials, extracted 3859 gaze features...
    Processed 5000 trials, extracted 4822 gaze features...
    Processed 6000 trials, extracted 5792 gaze features...
    Processed 7000 trials, extracted 6771 gaze features...
    Processed 8000 trials, extracted 7762 gaze features...
    Processed 9000 trials, extracted 8739 gaze featu

## 3. Create DataFrames and Merge

In [9]:
# Create dataframes
physio_df = pd.DataFrame(all_physiology_features)
physio_df.insert(0, 'subject_id', all_subject_ids)
physio_df.insert(1, 'trial_id', all_trial_ids)
physio_df['outcome'] = all_outcomes

behavior_df = pd.DataFrame(all_behavior_features)
behavior_df.insert(0, 'subject_id', all_subject_ids)
behavior_df.insert(1, 'trial_id', all_trial_ids)
behavior_df['outcome'] = all_outcomes

gaze_df = pd.DataFrame(all_gaze_data)

# Merge all modalities
merged_df = physio_df.merge(
    behavior_df.drop(columns=['outcome']),
    on=['subject_id', 'trial_id'],
    how='inner'
)

# Only merge gaze data if available (PRE-decision only)
if len(gaze_df) > 0:
    merged_df = merged_df.merge(
        gaze_df,
        on=['subject_id', 'trial_id'],
        how='inner'
    )

print(f"\n{'='*80}")
print(f"FINAL MERGED DATASET ({TIMEFRAME}-DECISION):")
print(f"{'='*80}")
print(f"Total trials: {len(merged_df)}")
print(f"Subjects: {merged_df['subject_id'].nunique()}")
print(f"\nFeature counts:")
print(f"  Physiology ({TIMEFRAME}): {len([c for c in merged_df.columns if c.endswith(SUFFIX)])} features")
print(f"  Behavior: 7 features (reaction_time, decision_time, ev_difference, invest_variance, ambiguity, condition_social, risk_premium)")
if len(gaze_df) > 0:
    print(f"  Gaze: {len([c for c in merged_df.columns if c.startswith('gaze_') or c.startswith('screen_')])} features")
else:
    print(f"  Gaze: 0 features (not available for {TIMEFRAME}-decision)")
print(f"\nOutcome distribution:")
print(merged_df['outcome'].value_counts())


FINAL MERGED DATASET (POST-DECISION):
Total trials: 11467
Subjects: 97

Feature counts:
  Physiology (POST): 13 features
  Behavior: 7 features (reaction_time, decision_time, ev_difference, invest_variance, ambiguity, condition_social, risk_premium)
  Gaze: 17 features

Outcome distribution:
outcome
1    7550
0    3917
Name: count, dtype: int64


## 4. Save to Pickle File

In [10]:
# Save to pickle
output_dir = Path(f'../../data/results/features_{TIMEFRAME}')
output_dir.mkdir(parents=True, exist_ok=True)

output_file = output_dir / f'extracted_features_{TIMEFRAME}.pkl'

# Define feature column names
physio_cols = [c for c in merged_df.columns if c.endswith(SUFFIX)]
behavior_cols = ['reaction_time', 'decision_time', 'ev_difference',
                 'invest_variance', 'ambiguity', 'condition_social', 'risk_premium']
if len(gaze_df) > 0:
    gaze_cols = [c for c in merged_df.columns
                 if c.startswith('gaze_') or c.startswith('screen_') or
                 c in ['fixation_ratio', 'saccade_ratio', 'saccade_count', 'gaze_valid_pct',
                       'gaze_dispersion_x', 'gaze_dispersion_y', 'gaze_path_length']]
else:
    gaze_cols = []

# Save with metadata
feature_data = {
    'merged_df': merged_df,
    'physio_df': physio_df,
    'behavior_df': behavior_df,
    'gaze_df': gaze_df,
    'physio_cols': physio_cols,
    'behavior_cols': behavior_cols,
    'gaze_cols': gaze_cols,
    'metadata': {
        'extraction_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'n_trials': len(merged_df),
        'n_subjects': merged_df['subject_id'].nunique(),
        'baseline_method': baseline_method,
        'preprocessing_files': len(preprocessing_files),
        'time_window': f'{TIMEFRAME}-decision ({TIME_WINDOW[0]} to {TIME_WINDOW[1]} seconds)',
        'has_gaze_features': len(gaze_df) > 0
    }
}

with open(output_file, 'wb') as f:
    pickle.dump(feature_data, f)

print(f"\n{'='*80}")
print(f"✓ {TIMEFRAME}-decision features saved to: {output_file}")
print(f"  File size: {output_file.stat().st_size / 1024 / 1024:.2f} MB")
print(f"{'='*80}")


✓ POST-decision features saved to: ../../data/results/features_POST/extracted_features_POST.pkl
  File size: 8.01 MB
