# Feature Extraction (PRE-Decision) - Run Once, Use Everywhere

This notebook extracts all features (physiology PRE-decision, behavior, gaze) from preprocessing results and saves them to a pickle file.

**PRE-DECISION: Extracts pupil features from BEFORE the submit button press (decision period)**

**Run this notebook ONCE after preprocessing completes.**

All other model notebooks will load the saved features instead of re-extracting.

In [16]:
import json
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import re
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

print(f"Feature extraction started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

Feature extraction started: 2025-12-27 14:06:32


## 1. Extract Physiology and Behavior Features (PRE-Decision)

In [17]:
preprocessing_dir = Path('../../data/results/preprocessing_outputs/preprocessing')
preprocessing_files = sorted(preprocessing_dir.glob('preprocessing_*.json'))
raw_dir = Path('../../data/raw/json')
baseline_method = 't3_stable_pre_decision'

print(f"Found {len(preprocessing_files)} preprocessing files")
print(f"Baseline correction method: {baseline_method}")

Found 99 preprocessing files
Baseline correction method: t3_stable_pre_decision


In [18]:
# Extract all features
all_physiology_features = []
all_behavior_features = []
all_outcomes = []
all_subject_ids = []
all_trial_ids = []

total_trials = 0

for preprocessed_file in preprocessing_files:
    with open(preprocessed_file, 'r') as f:
        preprocessed = json.load(f)
    
    subject_id = preprocessed['subject_id']
    print(f"\nProcessing subject: {subject_id}", end=" ")
    
    matches = list(raw_dir.glob(f"*{subject_id.split('_')[-1]}.json"))
    pattern = subject_id.replace("_", ".*")
    match = next((f for f in matches if re.search(pattern, f.name)), None)
    if not match:
        print("❌ No matching JSON file")
        continue
    
    with open(match, 'r') as f:
        raw_data = json.load(f)
    
    subject_trial_count = 0
    
    for trial_id, trial_data in preprocessed['trial_data'].items():
        method_data = trial_data['methods'][baseline_method]
        
        if method_data['success'] != True:
            continue
        
        raw_trial = raw_data['trials'][int(trial_id)-1]
        
        if not raw_trial['gamble details']['submitted']:
            continue
        
        # Extract pupil data
        time_aligned = np.array(trial_data['time_relative_to_submit'])
        pupil_avg = np.array(method_data['pupil_avg_baselined'])
        pupil_L = np.array(method_data['pupil_L_baselined'])
        pupil_R = np.array(method_data['pupil_R_baselined'])

        valid_mask = ~np.isnan(pupil_avg)
        pupil_avg_clean = pupil_avg[valid_mask]
        pupil_L_clean = pupil_L[valid_mask]
        pupil_R_clean = pupil_R[valid_mask]
        time_clean = time_aligned[valid_mask]

        if len(pupil_avg_clean) < 20:
            continue

        # PRE-SUBMIT DATA ONLY (-2 to 0 seconds before submit)
        pre_submit_mask = (time_clean >= -2.0) & (time_clean < 0)
        pupil_pre = pupil_avg_clean[pre_submit_mask]
        pupil_L_pre = pupil_L_clean[pre_submit_mask]
        pupil_R_pre = pupil_R_clean[pre_submit_mask]
        time_pre = time_clean[pre_submit_mask]

        if len(pupil_pre) < 5:
            continue

        # Calculate derivatives
        pupil_velocity_pre = np.diff(pupil_pre) if len(pupil_pre) > 1 else np.array([0])
        pupil_acceleration_pre = np.diff(pupil_velocity_pre) if len(pupil_velocity_pre) > 1 else np.array([0])
        dilation_mask_pre = pupil_velocity_pre > 0 if len(pupil_velocity_pre) > 0 else np.array([False])

        # PHYSIOLOGY FEATURES (PRE-SUBMIT ONLY)
        physiology_features = {
            'pupil_mean_pre': np.mean(pupil_pre),
            'pupil_std_pre': np.std(pupil_pre),
            'pupil_slope_pre': np.polyfit(time_pre, pupil_pre, 1)[0] if len(time_pre) > 1 else 0,
            'time_to_peak_pre': time_pre[np.argmax(pupil_pre)] - time_pre[0] if len(time_pre) > 0 else 0,
            'pupil_cv_pre': np.std(pupil_pre) / np.abs(np.mean(pupil_pre)) if (len(pupil_pre) > 0 and np.mean(pupil_pre) != 0) else 0,
            'pupil_velocity_mean_pre': np.mean(np.abs(pupil_velocity_pre)) if len(pupil_velocity_pre) > 0 else 0,
            'pupil_max_dilation_rate_pre': np.max(pupil_velocity_pre) if len(pupil_velocity_pre) > 0 else 0,
            'pupil_max_constriction_rate_pre': np.abs(np.min(pupil_velocity_pre)) if len(pupil_velocity_pre) > 0 else 0,
            'pupil_acceleration_std_pre': np.std(pupil_acceleration_pre) if len(pupil_acceleration_pre) > 1 else 0,
            'pct_time_dilating_pre': np.mean(dilation_mask_pre) if len(dilation_mask_pre) > 0 else 0,
            'num_dilation_peaks_pre': np.sum(np.diff(np.sign(pupil_velocity_pre)) > 0) if len(pupil_velocity_pre) > 1 else 0,
            'eye_asymmetry_pre': np.nanmean(np.abs(pupil_L_pre - pupil_R_pre)) if len(pupil_L_pre) > 0 else 0,
            'eye_asymmetry_std_pre': np.nanstd(pupil_L_pre - pupil_R_pre) if len(pupil_L_pre) > 1 else 0,
        }
        
        # BEHAVIOR FEATURES
        gamble_params = raw_trial['gamble details']['gamble parameters']
        lct = raw_trial['lct']
        
        show_screen_time = None
        submit_time = None
        click_time = None
        
        for event in lct:
            if 'show screen' in event['event']:
                show_screen_time = event['time']
            elif 'gamble clicked' in event['event']:
                click_time = event['time']
            elif 'submit' in event['event']:
                submit_time = event['time']
        
        if show_screen_time is None or submit_time is None:
            continue
        
        reaction_time = (click_time - show_screen_time) if click_time else np.nan
        decision_time = (submit_time - show_screen_time)

        invest_ev = (gamble_params['invest amount 1'] * gamble_params['invest probability 1'] + 
                    gamble_params['invest amount 2'] * gamble_params['invest probability 2'])
        keep_ev = gamble_params['keep amount']
        ev_difference = invest_ev - keep_ev

        invest_variance = ((gamble_params['invest amount 1'] - invest_ev)**2 * gamble_params['invest probability 1'] +
                        (gamble_params['invest amount 2'] - invest_ev)**2 * gamble_params['invest probability 2'])
        
        final_choice = raw_trial['gamble details']['choices'][-1]['choice'] if len(raw_trial['gamble details']['choices']) > 0 else None
        chose_invest = 1 if final_choice == 'INVEST' else 0
        
        behavior_features = {
            'reaction_time': reaction_time if not np.isnan(reaction_time) else decision_time,
            'decision_time': decision_time,
            'ev_difference': ev_difference,
            'invest_variance': invest_variance,
            'ambiguity': gamble_params['ambiguity'],
            'condition_social': 1 if gamble_params['condition'] == 'social' else 0,
            'risk_premium': ev_difference / np.sqrt(invest_variance) if invest_variance > 0 else 0,
        }
                
        outcome = chose_invest
        
        all_physiology_features.append(physiology_features)
        all_behavior_features.append(behavior_features)
        all_outcomes.append(outcome)
        all_subject_ids.append(subject_id)
        all_trial_ids.append(f"{trial_id}_{subject_id}")
        
        subject_trial_count += 1
    
    print(f"✓ {subject_trial_count} trials")
    total_trials += subject_trial_count

print(f"\n{'='*80}")
print(f"Total trials extracted: {total_trials}")


Processing subject: 0727_1400_539136F ✓ 0 trials

Processing subject: 0727_1400_A6I5HI6 ✓ 0 trials

Processing subject: 0806_1000_539136F ✓ 118 trials

Processing subject: 0806_1000_U9TEJGM ✓ 131 trials

Processing subject: 0811_1000_4LI8GO7 ✓ 121 trials

Processing subject: 0811_1000_539136F ✓ 130 trials

Processing subject: 0811_1000_U9TEJGM ✓ 131 trials

Processing subject: 0813_1000_539136F ✓ 129 trials

Processing subject: 0813_1000_9M4VCHG ✓ 123 trials

Processing subject: 0813_1000_U9TEJGM ✓ 128 trials

Processing subject: 0813_1600_539136F ✓ 124 trials

Processing subject: 0813_1600_9M4VCHG ✓ 130 trials

Processing subject: 0813_1600_U9TEJGM ✓ 123 trials

Processing subject: 0816_1400_539136F ✓ 129 trials

Processing subject: 0816_1400_9M4VCHG ✓ 124 trials

Processing subject: 0816_1400_U9TEJGM ✓ 125 trials

Processing subject: 0817_1000_539136F ✓ 131 trials

Processing subject: 0817_1000_9M4VCHG ✓ 131 trials

Processing subject: 0817_1000_U9TEJGM ✓ 119 trials

Processing subj

## 2. Extract Gaze Features

In [19]:
def extract_gaze_features_from_trial(eye_data):
    """Extract gaze features from raw eye tracking data."""
    if not eye_data or len(eye_data) == 0:
        return None
    
    timestamps = np.array([s['time'] for s in eye_data])
    gaze_x_L = np.array([s.get('gazeL_X', np.nan) for s in eye_data])
    gaze_y_L = np.array([s.get('gazeL_Y', np.nan) for s in eye_data])
    gaze_x_R = np.array([s.get('gazeR_X', np.nan) for s in eye_data])
    gaze_y_R = np.array([s.get('gazeR_Y', np.nan) for s in eye_data])
    
    gaze_x = np.nanmean([gaze_x_L, gaze_x_R], axis=0)
    gaze_y = np.nanmean([gaze_y_L, gaze_y_R], axis=0)
    
    screen_x_L = np.array([s.get('pupilLSensorPosL_X', np.nan) for s in eye_data])
    screen_y_L = np.array([s.get('pupilLSensorPosL_Y', np.nan) for s in eye_data])
    screen_x_R = np.array([s.get('pupilLSensorPosR_X', np.nan) for s in eye_data])
    screen_y_R = np.array([s.get('pupilLSensorPosR_Y', np.nan) for s in eye_data])
    
    screen_x = np.nanmean([screen_x_L, screen_x_R], axis=0)
    screen_y = np.nanmean([screen_y_L, screen_y_R], axis=0)
    
    valid_L = np.array([s.get('validL', 0) for s in eye_data])
    valid_R = np.array([s.get('validR', 0) for s in eye_data])
    
    features = {}
    features['gaze_valid_pct'] = np.mean((valid_L > 0) & (valid_R > 0))
    
    valid_mask = (valid_L > 0) & (valid_R > 0)
    if valid_mask.sum() < 5:
        return None
    
    gaze_x_valid = gaze_x[valid_mask]
    gaze_y_valid = gaze_y[valid_mask]
    screen_x_valid = screen_x[valid_mask]
    screen_y_valid = screen_y[valid_mask]
    timestamps_valid = timestamps[valid_mask]
    
    features['gaze_x_mean'] = np.nanmean(gaze_x_valid)
    features['gaze_x_std'] = np.nanstd(gaze_x_valid)
    features['gaze_y_mean'] = np.nanmean(gaze_y_valid)
    features['gaze_y_std'] = np.nanstd(gaze_y_valid)
    features['screen_x_mean'] = np.nanmean(screen_x_valid)
    features['screen_x_std'] = np.nanstd(screen_x_valid)
    features['screen_y_mean'] = np.nanmean(screen_y_valid)
    features['screen_y_std'] = np.nanstd(screen_y_valid)
    
    dt = np.diff(timestamps_valid)
    dt[dt == 0] = 1e-6
    dx = np.diff(screen_x_valid)
    dy = np.diff(screen_y_valid)
    
    velocity = np.sqrt(dx**2 + dy**2) / dt
    features['gaze_velocity_mean'] = np.nanmean(velocity)
    features['gaze_velocity_std'] = np.nanstd(velocity)
    features['gaze_velocity_max'] = np.nanmax(velocity)
    
    acceleration = np.diff(velocity) / dt[:-1]
    features['gaze_acceleration_mean'] = np.nanmean(np.abs(acceleration))
    features['gaze_acceleration_std'] = np.nanstd(acceleration)
    
    fixation_mask = velocity < 30
    saccade_mask = velocity > 100
    features['fixation_ratio'] = np.mean(fixation_mask)
    features['saccade_ratio'] = np.mean(saccade_mask)
    features['saccade_count'] = np.sum(np.diff(saccade_mask.astype(int)) == 1)
    
    features['gaze_dispersion_x'] = np.nanmax(screen_x_valid) - np.nanmin(screen_x_valid)
    features['gaze_dispersion_y'] = np.nanmax(screen_y_valid) - np.nanmin(screen_y_valid)
    features['gaze_path_length'] = np.sum(np.sqrt(dx**2 + dy**2))
    
    return features

def map_subject_filename(json_filename):
    match = re.search(r'(\d{4})_(\d{4})_LCT_DESKTOP-([A-Z0-9]+)', json_filename)
    if match:
        date1, date2, desktop_id = match.groups()
        return f"{date1}_{date2}_{desktop_id}"
    return None

In [20]:
print("Extracting gaze features...")

raw_json_files = sorted(raw_dir.glob('*.json'))
all_gaze_data = []

for file_path in raw_json_files:
    with open(file_path, 'r') as f:
        data = json.load(f)
    
    subject_id = map_subject_filename(file_path.name)
    if not subject_id:
        continue
    
    trials = data.get('trials', [])
    
    for trial_idx, trial in enumerate(trials):
        eye_data = trial.get('eye', [])
        if not eye_data:
            continue
        
        gamble_details = trial.get('gamble details', {})
        trial_id = str(gamble_details.get('trial', trial_idx))
        
        gaze_features = extract_gaze_features_from_trial(eye_data)
        if gaze_features is None:
            continue
        
        gaze_features['subject_id'] = subject_id
        gaze_features['trial_id'] = f"{trial_id}_{subject_id}"
        all_gaze_data.append(gaze_features)

print(f"Extracted gaze features from {len(all_gaze_data)} trials")

Extracting gaze features...
Extracted gaze features from 15121 trials


## 3. Create DataFrames and Merge

In [21]:
# Create dataframes
physio_df = pd.DataFrame(all_physiology_features)
physio_df.insert(0, 'subject_id', all_subject_ids)
physio_df.insert(1, 'trial_id', all_trial_ids)
physio_df['outcome'] = all_outcomes

behavior_df = pd.DataFrame(all_behavior_features)
behavior_df.insert(0, 'subject_id', all_subject_ids)
behavior_df.insert(1, 'trial_id', all_trial_ids)
behavior_df['outcome'] = all_outcomes

gaze_df = pd.DataFrame(all_gaze_data)

# Merge all modalities
merged_df = physio_df.merge(
    behavior_df.drop(columns=['outcome']),
    on=['subject_id', 'trial_id'],
    how='inner'
)

merged_df = merged_df.merge(
    gaze_df,
    on=['subject_id', 'trial_id'],
    how='inner'
)

print(f"\n{'='*80}")
print("FINAL MERGED DATASET (PRE-DECISION):")
print(f"{'='*80}")
print(f"Total trials: {len(merged_df)}")
print(f"Subjects: {merged_df['subject_id'].nunique()}")
print(f"\nFeature counts:")
print(f"  Physiology (PRE): {len([c for c in merged_df.columns if c.endswith('_pre')])} features")
print(f"  Behavior: 7 features (reaction_time, decision_time, ev_difference, invest_variance, ambiguity, condition_social, risk_premium)")
print(f"  Gaze: {len([c for c in merged_df.columns if c.startswith('gaze_') or c.startswith('screen_')])} features")
print(f"\nOutcome distribution:")
print(merged_df['outcome'].value_counts())
print(f"\nData types:")
print(merged_df.dtypes)


FINAL MERGED DATASET (PRE-DECISION):
Total trials: 12511
Subjects: 97

Feature counts:
  Physiology (PRE): 13 features
  Behavior: 7 features (reaction_time, decision_time, ev_difference, invest_variance, ambiguity, condition_social, risk_premium)
  Gaze: 17 features

Outcome distribution:
outcome
1    8238
0    4273
Name: count, dtype: int64

Data types:
subject_id                          object
trial_id                            object
pupil_mean_pre                     float64
pupil_std_pre                      float64
pupil_slope_pre                    float64
time_to_peak_pre                   float64
pupil_cv_pre                       float64
pupil_velocity_mean_pre            float64
pupil_max_dilation_rate_pre        float64
pupil_max_constriction_rate_pre    float64
pupil_acceleration_std_pre         float64
pct_time_dilating_pre              float64
num_dilation_peaks_pre               int64
eye_asymmetry_pre                  float64
eye_asymmetry_std_pre              floa

## 4. Save to Pickle File

In [22]:
# Save to pickle
output_dir = Path('../../data/results/features_PRE')
output_dir.mkdir(parents=True, exist_ok=True)

output_file = output_dir / 'extracted_features_PRE.pkl'

# Define feature column names
physio_cols = [c for c in merged_df.columns if c.endswith('_pre')]
behavior_cols = ['reaction_time', 'decision_time', 'ev_difference', 
                 'invest_variance', 'ambiguity', 'condition_social', 'risk_premium']
gaze_cols = [c for c in merged_df.columns 
             if c.startswith('gaze_') or c.startswith('screen_') or 
             c in ['fixation_ratio', 'saccade_ratio', 'saccade_count', 'gaze_valid_pct',
                   'gaze_dispersion_x', 'gaze_dispersion_y', 'gaze_path_length']]

# Save with metadata
feature_data = {
    'merged_df': merged_df,
    'physio_df': physio_df,
    'behavior_df': behavior_df,
    'gaze_df': gaze_df,
    'physio_cols': physio_cols,
    'behavior_cols': behavior_cols,
    'gaze_cols': gaze_cols,
    'metadata': {
        'extraction_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'n_trials': len(merged_df),
        'n_subjects': merged_df['subject_id'].nunique(),
        'baseline_method': baseline_method,
        'preprocessing_files': len(preprocessing_files),
        'time_window': 'PRE-decision (-2 to 0 seconds before submit)'
    }
}

with open(output_file, 'wb') as f:
    pickle.dump(feature_data, f)

print(f"\n{'='*80}")
print(f"✓ PRE-decision features saved to: {output_file}")
print(f"  File size: {output_file.stat().st_size / 1024 / 1024:.2f} MB")
print(f"{'='*80}")


✓ PRE-decision features saved to: ../../data/results/features_PRE/extracted_features_PRE.pkl
  File size: 9.00 MB
