# Relabel Education and SART Videos

This notebook creates label files for Education and SART tasks by:
1. Reading annotation CSVs from preprocessed subjects
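2. Identifying the Education segment (the 3rd meaningful `Video/Begin` event, i.e. one lasting more than 1 s), the SART practice segment (the 4th such event), and SART actual segments (large acq files, over 500 s, that are not already labeled)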
3. Copying corresponding acq files as label files

**Input**: `ELM_preprocessed/sub-*/` folders with annotations and acq files  
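**Output**: New label files (`label-Education`, `label-SART-practice`, `label-SART-actual`; numbered `label-SART-actual-1`, `-2`, ... when a subject has several large unlabeled segments). With `DRY_RUN = True` nothing is written; the planned files are only printed.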

In [None]:
# Cell 1: Setup and Configuration
import pandas as pd
from pathlib import Path
import shutil

# Configuration
# NOTE: machine-specific absolute path - point it at the local copy of the data before running.
base_dir = Path("/Users/saewonchung/Desktop/ELM_MW_data_analysis")
preprocessed_dir = base_dir / "ELM_preprocessed"

# Education video durations in seconds (5 different videos, randomly selected per subject)
EDUCATION_DURATIONS = [376, 314, 326, 385, 319]
# Maximum difference (seconds) for an annotated duration to count as one of the videos above
DURATION_TOLERANCE = 5

# DRY RUN MODE: Set to True to test without creating files
DRY_RUN = True  # Change to False to actually create label files


def run_mode_banner(dry_run):
    """
    Build the banner line announcing whether label files will be written.

    Args:
        dry_run: True for dry-run mode, False for production mode

    Returns:
        Banner text (without the leading blank line)
    """
    if dry_run:
        return 'üîç DRY RUN MODE - Files will NOT be created'
    # Spelled out in full: dropping only the word "NOT" from a shared template
    # would leave a double space ("Files will  be created").
    return '‚úÖ PRODUCTION MODE - Files will be created'


print(f"Base directory: {base_dir}")
print(f"Preprocessed directory: {preprocessed_dir}")
print(f"Education durations: {EDUCATION_DURATIONS} ¬±{DURATION_TOLERANCE}s")
print(f"\n{run_mode_banner(DRY_RUN)}")

In [None]:
# Cell 2: Helper Functions

def identify_education_sart(annotations_df, education_durations=None,
                            duration_tolerance=None, sart_practice_range=(30, 110)):
    """
    Identify Education and SART practice segments from annotations.

    Only "meaningful" Video/Begin events are considered: rows whose
    'description' contains 'Video/Begin' and whose 'duration' is above 1s.
    The 3rd such event is taken as Education, the 4th as SART practice.
    Durations are truncated to whole seconds.

    Args:
        annotations_df: DataFrame with 'description' and 'duration' columns
        education_durations: Known Education video durations in seconds
            (defaults to the module-level EDUCATION_DURATIONS)
        duration_tolerance: Maximum difference in seconds for the Education
            duration to count as a known video (defaults to DURATION_TOLERANCE)
        sart_practice_range: Inclusive (min, max) expected SART practice
            duration in seconds

    Returns:
        dict with keys: 'education_duration', 'sart_practice_duration', 'warnings'
        (a duration is None when the corresponding event is missing; a duration
        that fails its check is still returned, with a warning added)
    """
    if education_durations is None:
        education_durations = EDUCATION_DURATIONS
    if duration_tolerance is None:
        duration_tolerance = DURATION_TOLERANCE
    sart_min, sart_max = sart_practice_range

    # Filter meaningful Video/Begin events (duration > 1s).
    # astype(str) + na=False + regex=False: a literal substring match that
    # treats missing or non-string descriptions as "not a Video/Begin event".
    is_video_begin = annotations_df['description'].astype(str).str.contains(
        'Video/Begin', na=False, regex=False
    )
    is_meaningful = annotations_df['duration'] > 1
    video_begin = annotations_df[is_video_begin & is_meaningful].reset_index(drop=True)

    result = {
        'education_duration': None,
        'sart_practice_duration': None,
        'warnings': []
    }

    # Position 3: Education (0-indexed = position 2)
    if len(video_begin) >= 3:
        edu_duration = int(video_begin['duration'].iloc[2])

        # Check if duration matches any education video
        matched = any(
            abs(edu_duration - known) <= duration_tolerance
            for known in education_durations
        )
        if not matched:
            result['warnings'].append(
                f"Education duration {edu_duration}s doesn't match known videos {education_durations}"
            )

        result['education_duration'] = edu_duration
    else:
        result['warnings'].append(f"Incomplete data: only {len(video_begin)} meaningful Video/Begin events")

    # Position 4: SART practice (0-indexed = position 3)
    if len(video_begin) >= 4:
        sart_practice_dur = int(video_begin['duration'].iloc[3])

        if not (sart_min <= sart_practice_dur <= sart_max):
            result['warnings'].append(
                f"SART practice duration {sart_practice_dur}s outside expected range [{sart_min}-{sart_max}]s"
            )

        result['sart_practice_duration'] = sart_practice_dur

    return result


def find_acq_file_by_duration(subject_dir, target_duration, tolerance=2):
    """
    Find the acq file whose duration best matches the target duration.

    The duration is read from the 'dur-<seconds>' token of the filename,
    e.g. acq-8_dur-375_desc-preproc_haemo.csv. Among all files within the
    tolerance, the one closest to the target is returned; ties are broken
    by filename so the result does not depend on directory listing order.

    Args:
        subject_dir: Path to subject's preprocessed folder
        target_duration: Target duration in seconds
        tolerance: Duration matching tolerance

    Returns:
        Path to matching acq file, or None
    """
    candidates = []

    for acq_file in subject_dir.glob("*_acq-*_dur-*_desc-preproc_haemo.csv"):
        for part in acq_file.stem.split('_'):
            if not part.startswith('dur-'):
                continue
            try:
                file_duration = int(part.split('-')[1])
            except ValueError:
                # Malformed token such as 'dur-abc': ignore it instead of aborting the search
                continue
            offset = abs(file_duration - target_duration)
            if offset <= tolerance:
                candidates.append((offset, acq_file.name, acq_file))

    if not candidates:
        return None

    # Smallest offset wins; filename decides between equally close files
    return min(candidates, key=lambda candidate: candidate[:2])[2]


def find_large_unlabeled_acq_files(subject_dir, min_duration=500):
    """
    Find large acq files that are unlabeled (potential SART actual).

    A file is "large" when the duration in its 'dur-<seconds>' filename token
    is strictly greater than min_duration. It counts as already labeled when
    that duration is within 10s of a known labeled video: Zima (~508s) or
    Splitscreen (~145s), each considered only if a label file with that name
    exists in the folder.

    Args:
        subject_dir: Path to subject's preprocessed folder
        min_duration: Duration in seconds a file must exceed to be "large"

    Returns:
        List of paths to large unlabeled acq files, sorted by filename so that
        any numbering derived from the order is stable between runs
    """
    # Get list of already labeled durations (from existing label files)
    labeled_durations = set()
    for label_file in subject_dir.glob("*_label-*_haemo.csv"):
        # Label files carry no duration in their name, so the known
        # durations of Zima (~508s) and Splitscreen (~145s) are used
        if 'Zima' in label_file.name:
            labeled_durations.add(508)
        elif 'Splitscreen' in label_file.name:
            labeled_durations.add(145)

    acq_files = sorted(
        subject_dir.glob("*_acq-*_dur-*_desc-preproc_haemo.csv"),
        key=lambda path: path.name,
    )

    large_files = []
    for acq_file in acq_files:
        # Parse duration from the first 'dur-' token of the filename
        file_duration = None
        for part in acq_file.stem.split('_'):
            if part.startswith('dur-'):
                try:
                    file_duration = int(part.split('-')[1])
                except ValueError:
                    # Malformed token such as 'dur-abc': skip this file
                    file_duration = None
                break

        if file_duration is None or file_duration <= min_duration:
            continue

        # Skip files whose duration is close to any labeled duration
        if any(abs(file_duration - ld) <= 10 for ld in labeled_durations):
            continue

        large_files.append(acq_file)

    return large_files

# Confirmation that this cell ran and the helper functions above are now defined
print("‚úÖ Helper functions defined")

In [None]:
# Cell 3: Process All Subjects

def _session_id_of(acq_file):
    """Return the id from the 'ses-<id>' token of an acq filename, or None if there is none."""
    for token in acq_file.stem.split('_'):
        if token.startswith('ses-'):
            return token[len('ses-'):]
    return None


def _duration_of(acq_file):
    """Return the integer from the first 'dur-<seconds>' filename token, or None if unreadable."""
    for token in acq_file.stem.split('_'):
        if token.startswith('dur-'):
            try:
                return int(token.split('-')[1])
            except ValueError:
                return None
    return None


def _create_label_file(subject_dir, source_acq, label_suffix, label_kind, duration):
    """
    Copy an acq file to its label filename, or only report the copy when DRY_RUN is set.

    Args:
        subject_dir: Path to subject's preprocessed folder (destination folder)
        source_acq: Path to the acq file to copy
        label_suffix: Text placed after 'label-' in the new filename
        label_kind: Label name used in the "Created ... label" message
        duration: Duration in seconds, used for reporting only

    Returns:
        True if the label file was created (or would be, in dry-run mode),
        False if the source filename has no 'ses-' token to build the name from
    """
    session_id = _session_id_of(source_acq)
    if session_id is None:
        print(f"Cannot build {label_kind} label name: no 'ses-' token in {source_acq.name}, skipping")
        return False

    label_file = subject_dir / f"{subject_dir.name}_ses-{session_id}_task-Video_label-{label_suffix}_haemo.csv"

    # Copy acq file to label file (or simulate in dry run)
    if DRY_RUN:
        print(f"üîç [DRY RUN] Would create: {label_file.name} (duration={duration}s)")
        print(f"   Source: {source_acq.name}")
    else:
        shutil.copy(source_acq, label_file)
        print(f"‚úÖ Created {label_kind} label: {label_file.name} (duration={duration}s)")
    return True


def process_subject(subject_dir):
    """
    Process a single subject and create Education/SART label files.

    Reads the subject's annotation CSV, locates the acq files matching the
    Education and SART practice durations, treats every large unlabeled acq
    file as SART actual, and copies each to a label filename. With DRY_RUN
    set, the copies are only reported.

    Args:
        subject_dir: Path to subject's preprocessed folder

    Returns:
        dict with keys 'subject', 'status' ('no_annotations' or 'processed'),
        'education_created', 'sart_practice_created', 'sart_actual_created'
        (True also when the file would have been created in dry-run mode)
        and 'warnings'
    """
    subject_name = subject_dir.name
    print(f"\n{'='*60}")
    print(f"Processing {subject_name}")
    print(f"{'='*60}")

    # Find annotation file (sorted so the choice is stable if several exist)
    annot_files = sorted(subject_dir.glob("*_annotations.csv"))
    if not annot_files:
        print(f"‚ö†Ô∏è  No annotation file found, skipping")
        return {'subject': subject_name, 'status': 'no_annotations',
                'education_created': False, 'sart_practice_created': False,
                'sart_actual_created': False, 'warnings': []}

    annotations = pd.read_csv(annot_files[0])

    # Identify Education and SART segments
    result = identify_education_sart(annotations)

    # Print warnings
    for warning in result['warnings']:
        print(f"‚ö†Ô∏è  {warning}")

    status = {
        'subject': subject_name,
        'status': 'processed',  # same key set as the no-annotations result
        'education_created': False,
        'sart_practice_created': False,
        'sart_actual_created': False,
        'warnings': result['warnings']
    }

    # Create Education label file
    edu_duration = result['education_duration']
    if edu_duration:
        edu_acq = find_acq_file_by_duration(subject_dir, edu_duration)
        if edu_acq:
            status['education_created'] = _create_label_file(
                subject_dir, edu_acq, 'Education', 'Education', edu_duration
            )
        else:
            print(f"‚ö†Ô∏è  No acq file found for Education duration {edu_duration}s")

    # Create SART practice label file
    sart_practice_duration = result['sart_practice_duration']
    if sart_practice_duration:
        sart_prac_acq = find_acq_file_by_duration(subject_dir, sart_practice_duration)
        if sart_prac_acq:
            status['sart_practice_created'] = _create_label_file(
                subject_dir, sart_prac_acq, 'SART-practice', 'SART-practice', sart_practice_duration
            )
        else:
            print(f"‚ö†Ô∏è  No acq file found for SART practice duration {sart_practice_duration}s")

    # Create SART actual label files (large unlabeled segments)
    large_acq_files = find_large_unlabeled_acq_files(subject_dir)
    if large_acq_files:
        for number, large_acq in enumerate(large_acq_files, start=1):
            # If multiple large files, number them
            if len(large_acq_files) > 1:
                label_suffix = f"SART-actual-{number}"
            else:
                label_suffix = "SART-actual"

            created = _create_label_file(
                subject_dir, large_acq, label_suffix, 'SART-actual', _duration_of(large_acq)
            )
            status['sart_actual_created'] = status['sart_actual_created'] or created
    else:
        print(f"‚ÑπÔ∏è  No large unlabeled segments found (SART actual may be missing or mislabeled)")

    return status


# Collect every sub-* folder under the preprocessed directory, in sorted order
subject_dirs = sorted(
    entry
    for entry in preprocessed_dir.iterdir()
    if entry.is_dir() and entry.name.startswith('sub-')
)
print(f"Found {len(subject_dirs)} subjects\n")

# Run the relabeling subject by subject, keeping one status dict per subject
results = []
for subject_dir in subject_dirs:
    status = process_subject(subject_dir)
    results.append(status)

# Header for the summary report that follows
print(f"\n{'='*60}", "SUMMARY", f"{'='*60}", sep="\n")

In [None]:
# Cell 4: Summary Report

# Convert results to DataFrame (one row per subject status dict)
results_df = pd.DataFrame(results)

print(f"\nTotal subjects processed: {len(results_df)}")

if results_df.empty:
    # With no subjects the frame has no columns at all, so the column
    # lookups of the breakdown would raise KeyError; report and skip it.
    print("No subjects were processed; nothing to summarize")
else:
    # Each count is the number of subjects for which the flag is True
    print(f"Education labels created: {results_df['education_created'].sum()}")
    print(f"SART-practice labels created: {results_df['sart_practice_created'].sum()}")
    print(f"SART-actual labels created: {results_df['sart_actual_created'].sum()}")

    # Subjects with warnings
    subjects_with_warnings = results_df[results_df['warnings'].apply(len) > 0]
    if len(subjects_with_warnings) > 0:
        print(f"\n‚ö†Ô∏è  {len(subjects_with_warnings)} subjects with warnings:")
        for _, row in subjects_with_warnings.iterrows():
            print(f"  {row['subject']}: {row['warnings']}")

    # Incomplete subjects: Education or SART-practice label missing
    incomplete = results_df[~results_df['education_created'] | ~results_df['sart_practice_created']]
    if len(incomplete) > 0:
        print(f"\n‚ö†Ô∏è  {len(incomplete)} incomplete subjects (missing Education or SART-practice):")
        for _, row in incomplete.iterrows():
            missing = []
            if not row['education_created']:
                missing.append('Education')
            if not row['sart_practice_created']:
                missing.append('SART-practice')
            print(f"  {row['subject']}: missing {', '.join(missing)}")

print("\n‚úÖ Relabeling complete!")