# Dataset Epoch Extraction - Individual MNE Files

This notebook extracts individual epochs from the Inner Speech EEG dataset and saves each epoch as:
- Individual MNE Epochs object (.fif files)
- Complete metadata CSV/pickle files
- Organized folder structure by subject/session

Each epoch will be saved as a separate MNE file that can be loaded directly with `mne.read_epochs()`.

## 1. Import Libraries

In [1]:
import mne
import numpy as np
import pandas as pd
import os
from pathlib import Path
import pickle
from tqdm import tqdm
import shutil

# Import custom processing functions
from Python_Processing.Data_extractions import (
    extract_block_data_from_subject,
    load_events
)
from Python_Processing.Utilitys import sub_name

# Set MNE logging level to reduce output
mne.set_log_level('WARNING')

print("Libraries imported successfully!")

Libraries imported successfully!


## 2. Configuration

In [2]:
# Dataset configuration
derivatives_path = "dataset"  # Path to your dataset folder
output_path = "extracted_epochs_mne"  # Output folder for extracted epochs

# Create output directory structure
os.makedirs(output_path, exist_ok=True)
os.makedirs(os.path.join(output_path, "individual_epochs"), exist_ok=True)
os.makedirs(os.path.join(output_path, "metadata"), exist_ok=True)

# Define mappings
class_names = {0: 'Up', 1: 'Down', 2: 'Right', 3: 'Left'}
condition_names = {0: 'Pronounced', 1: 'Inner', 2: 'Visualized'}

# Subject and session ranges
subjects = list(range(1, 11))  # Subjects 1-10
sessions = [1, 2, 3]  # Sessions 1-3

print(f"Dataset path: {derivatives_path}")
print(f"Output path: {output_path}")
print(f"Subjects: {subjects}")
print(f"Sessions: {sessions}")
print(f"\nOutput structure:")
print(f"  {output_path}/")
print(f"    ├── individual_epochs/  (MNE .fif files)")
print(f"    └── metadata/          (CSV and pickle files)")

Dataset path: dataset
Output path: extracted_epochs_mne
Subjects: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Sessions: [1, 2, 3]

Output structure:
  extracted_epochs_mne/
    ├── individual_epochs/  (MNE .fif files)
    └── metadata/          (CSV and pickle files)


## 3. Extract Individual Epochs as MNE Files

In [3]:
def extract_epochs_as_mne_files(derivatives_path, subjects, sessions, output_path):
    """
    Extract individual epochs and save each as a separate MNE Epochs file.
    
    Returns:
    - DataFrame with epoch metadata including file paths
    """
    
    all_epochs_metadata = []
    total_epochs = 0
    
    # Progress tracking
    total_combinations = len(subjects) * len(sessions)
    pbar = tqdm(total=total_combinations, desc="Processing subjects/sessions")
    
    for subject_num in subjects:
        subject_id = f"sub-{subject_num:02d}"
        
        # Create subject directory
        subject_dir = os.path.join(output_path, "individual_epochs", subject_id)
        os.makedirs(subject_dir, exist_ok=True)
        
        for session_num in sessions:
            session_id = f"ses-{session_num:02d}"
            
            # Create session directory
            session_dir = os.path.join(subject_dir, session_id)
            os.makedirs(session_dir, exist_ok=True)
            
            try:
                # Load EEG data and events for this subject/session
                X_session, Y_session = extract_block_data_from_subject(
                    derivatives_path, subject_num, "eeg", session_num
                )
                
                # Get sampling frequency and channel names
                sfreq = X_session.info['sfreq']
                ch_names = X_session.ch_names
                info = X_session.info.copy()
                
                # Process each epoch
                for trial_idx in range(len(X_session)):
                    # Extract event information
                    timestamp = Y_session[trial_idx, 0]
                    class_id = int(Y_session[trial_idx, 1])
                    condition_id = int(Y_session[trial_idx, 2])
                    session_from_events = int(Y_session[trial_idx, 3])
                    
                    # Create unique epoch ID
                    epoch_id = f"{subject_id}_{session_id}_trial_{trial_idx:03d}"
                    
                    # Create filename for this epoch
                    epoch_filename = f"{epoch_id}_{condition_names[condition_id]}_{class_names[class_id]}-epo.fif"
                    epoch_filepath = os.path.join(session_dir, epoch_filename)
                    
                    # Extract single epoch data
                    single_epoch_data = X_session._data[trial_idx:trial_idx+1]  # Keep as (1, n_channels, n_timepoints)
                    
                    # Create events array for this single epoch
                    # MNE expects events as (n_events, 3) with [sample, prev_id, event_id]
                    single_epoch_events = np.array([[0, 0, class_id + 1]])  # +1 because MNE event IDs should be > 0
                    
                    # Create event_id dictionary
                    event_id = {class_names[class_id]: class_id + 1}
                    
                    # Create MNE Epochs object for single epoch
                    single_epoch = mne.EpochsArray(
                        single_epoch_data,
                        info,
                        events=single_epoch_events,
                        event_id=event_id,
                        tmin=X_session.tmin,
                        verbose=False
                    )
                    
                    # Add metadata to the epoch
                    epoch_metadata_dict = {
                        'subject_id': subject_id,
                        'subject_number': subject_num,
                        'session_id': session_id,
                        'session_number': session_num,
                        'trial_number': trial_idx,
                        'speech_type': condition_names[condition_id],
                        'condition_id': condition_id,
                        'class': class_names[class_id],
                        'class_id': class_id,
                        'timestamp': timestamp
                    }
                    
                    # Convert to DataFrame for MNE metadata
                    single_epoch.metadata = pd.DataFrame([epoch_metadata_dict])
                    
                    # Save the single epoch as MNE file
                    single_epoch.save(epoch_filepath, overwrite=True, verbose=False)
                    
                    # Store epoch metadata for our master list
                    epoch_metadata = {
                        'epoch_id': epoch_id,
                        'subject_name': subject_id,
                        'subject_number': subject_num,
                        'session_number': session_num,
                        'session_id': session_id,
                        'trial_number': trial_idx,
                        'speech_type': condition_names[condition_id],
                        'condition_id': condition_id,
                        'class': class_names[class_id],
                        'class_id': class_id,
                        'timestamp': timestamp,
                        'sampling_frequency': sfreq,
                        'n_channels': len(ch_names),
                        'n_timepoints': single_epoch_data.shape[2],
                        'duration_seconds': single_epoch_data.shape[2] / sfreq,
                        'file_path': epoch_filepath,
                        'relative_path': os.path.relpath(epoch_filepath, output_path),
                        'filename': epoch_filename
                    }
                    
                    all_epochs_metadata.append(epoch_metadata)
                    total_epochs += 1
                
                pbar.set_postfix({
                    'Current': f"{subject_id}/{session_id}",
                    'Epochs': f"{len(X_session)}",
                    'Total': total_epochs
                })
                
            except Exception as e:
                print(f"\nError processing {subject_id}/{session_id}: {e}")
                continue
            
            pbar.update(1)
    
    pbar.close()
    
    # Convert to DataFrame
    epochs_df = pd.DataFrame(all_epochs_metadata)
    
    print(f"\nExtraction complete!")
    print(f"Total epochs extracted: {total_epochs}")
    print(f"Total subjects: {len(subjects)}")
    print(f"Total sessions per subject: {len(sessions)}")
    print(f"Individual MNE files saved to: {os.path.join(output_path, 'individual_epochs')}")
    
    return epochs_df

# Run the extraction
print("Starting epoch extraction as MNE files...")
epochs_df = extract_epochs_as_mne_files(
    derivatives_path, subjects, sessions, output_path
)

Starting epoch extraction as MNE files...





Error processing sub-06/ses-02: 'NoneType' object has no attribute 'kind'


Processing subjects/sessions:  97%|█████████▋| 29/30 [09:48<00:20, 20.30s/it, Current=sub-10/ses-03, Epochs=200, Total=5440]



Extraction complete!
Total epochs extracted: 5440
Total subjects: 10
Total sessions per subject: 3
Individual MNE files saved to: extracted_epochs_mne\individual_epochs


## 4. Analyze Extracted Data

In [4]:
# Display basic statistics
print("=== DATASET OVERVIEW ===")
print(f"Total epochs: {len(epochs_df)}")
print(f"Unique subjects: {epochs_df['subject_number'].nunique()}")
print(f"Sessions per subject: {epochs_df.groupby('subject_number')['session_number'].nunique().iloc[0]}")
print(f"Sampling frequency: {epochs_df['sampling_frequency'].iloc[0]} Hz")
print(f"Number of channels: {epochs_df['n_channels'].iloc[0]}")
print(f"Epoch duration: {epochs_df['duration_seconds'].iloc[0]:.2f} seconds")
print(f"Time points per epoch: {epochs_df['n_timepoints'].iloc[0]}")

print("\n=== DISTRIBUTION BY SPEECH TYPE ===")
speech_type_counts = epochs_df['speech_type'].value_counts()
for speech_type, count in speech_type_counts.items():
    print(f"{speech_type}: {count} epochs ({count/len(epochs_df)*100:.1f}%)")

print("\n=== DISTRIBUTION BY CLASS ===")
class_counts = epochs_df['class'].value_counts()
for class_name, count in class_counts.items():
    print(f"{class_name}: {count} epochs ({count/len(epochs_df)*100:.1f}%)")

print("\n=== EPOCHS PER SUBJECT ===")
subject_counts = epochs_df['subject_name'].value_counts().sort_index()
for subject, count in subject_counts.items():
    print(f"{subject}: {count} epochs")

print("\n=== FILE STRUCTURE SAMPLE ===")
print("Sample file paths:")
for i, filepath in enumerate(epochs_df['relative_path'].head(5)):
    print(f"  {i+1}. {filepath}")

=== DATASET OVERVIEW ===
Total epochs: 5440
Unique subjects: 10
Sessions per subject: 3
Sampling frequency: 256.0 Hz
Number of channels: 128
Epoch duration: 4.50 seconds
Time points per epoch: 1153

=== DISTRIBUTION BY SPEECH TYPE ===
Visualized: 2196 epochs (40.4%)
Inner: 2156 epochs (39.6%)
Pronounced: 1088 epochs (20.0%)

=== DISTRIBUTION BY CLASS ===
Left: 1360 epochs (25.0%)
Up: 1360 epochs (25.0%)
Right: 1360 epochs (25.0%)
Down: 1360 epochs (25.0%)

=== EPOCHS PER SUBJECT ===
sub-01: 500 epochs
sub-02: 600 epochs
sub-03: 500 epochs
sub-04: 600 epochs
sub-05: 600 epochs
sub-06: 340 epochs
sub-07: 600 epochs
sub-08: 500 epochs
sub-09: 600 epochs
sub-10: 600 epochs

=== FILE STRUCTURE SAMPLE ===
Sample file paths:
  1. individual_epochs\sub-01\ses-01\sub-01_ses-01_trial_000_Pronounced_Left-epo.fif
  2. individual_epochs\sub-01\ses-01\sub-01_ses-01_trial_001_Pronounced_Up-epo.fif
  3. individual_epochs\sub-01\ses-01\sub-01_ses-01_trial_002_Pronounced_Left-epo.fif
  4. individual_epo

## 5. Save Metadata Files

In [6]:
# Save metadata as CSV
metadata_file = os.path.join(output_path, "metadata", "epochs_metadata.csv")
epochs_df.to_csv(metadata_file, index=False)
print(f"Metadata saved to: {metadata_file}")

# Save metadata as pickle for easier loading
metadata_pkl_file = os.path.join(output_path, "metadata", "epochs_metadata.pkl")
epochs_df.to_pickle(metadata_pkl_file)
print(f"Metadata (pickle) saved to: {metadata_pkl_file}")

# Save a summary report
summary_file = os.path.join(output_path, "metadata", "extraction_summary.txt")
with open(summary_file, 'w') as f:
    f.write("Inner Speech EEG Dataset - Individual MNE Epochs Extraction Summary\n")
    f.write("=" * 70 + "\n\n")
    f.write(f"Total epochs extracted: {len(epochs_df)}\n")
    f.write(f"Number of subjects: {epochs_df['subject_number'].nunique()}\n")
    f.write(f"Sessions per subject: {epochs_df.groupby('subject_number')['session_number'].nunique().iloc[0]}\n")
    f.write(f"Sampling frequency: {epochs_df['sampling_frequency'].iloc[0]} Hz\n")
    f.write(f"Number of channels: {epochs_df['n_channels'].iloc[0]}\n")
    f.write(f"Epoch duration: {epochs_df['duration_seconds'].iloc[0]:.2f} seconds\n")
    f.write(f"Time points per epoch: {epochs_df['n_timepoints'].iloc[0]}\n\n")
    
    f.write("File Structure:\n")
    f.write(f"  {output_path}/\n")
    f.write(f"    ├── individual_epochs/\n")
    f.write(f"    │   ├── sub-01/\n")
    f.write(f"    │   │   ├── ses-01/\n")
    f.write(f"    │   │   │   ├── sub-01_ses-01_trial_000_Inner_Up-epo.fif\n")
    f.write(f"    │   │   │   ├── sub-01_ses-01_trial_001_Inner_Down-epo.fif\n")
    f.write(f"    │   │   │   └── ...\n")
    f.write(f"    │   │   ├── ses-02/\n")
    f.write(f"    │   │   └── ses-03/\n")
    f.write(f"    │   ├── sub-02/\n")
    f.write(f"    │   └── ...\n")
    f.write(f"    └── metadata/\n")
    f.write(f"        ├── epochs_metadata.csv\n")
    f.write(f"        ├── epochs_metadata.pkl\n")
    f.write(f"        └── extraction_summary.txt\n\n")
    
    f.write("Speech Type Distribution:\n")
    for speech_type, count in speech_type_counts.items():
        f.write(f"  {speech_type}: {count} epochs ({count/len(epochs_df)*100:.1f}%)\n")
    
    f.write("\nClass Distribution:\n")
    for class_name, count in class_counts.items():
        f.write(f"  {class_name}: {count} epochs ({count/len(epochs_df)*100:.1f}%)\n")
    
    f.write("\nHow to Load Individual Epochs:\n")
    f.write("  import mne\n")
    f.write("  epoch = mne.read_epochs('path/to/epoch-epo.fif')\n")
    f.write("  data = epoch.get_data()  # Shape: (1, n_channels, n_timepoints)\n")
    f.write("  metadata = epoch.metadata  # Pandas DataFrame with epoch info\n")

print(f"Summary report saved to: {summary_file}")

print(f"\n=== EXTRACTION COMPLETE ===")
print(f"All files saved to: {output_path}")
print(f"\nFiles created:")
print(f"  - {len(epochs_df)} individual MNE epoch files (.fif)")
print(f"  - {metadata_file} (CSV format)")
print(f"  - {metadata_pkl_file} (Pickle format)")
print(f"  - {summary_file} (Summary report)")

Metadata saved to: extracted_epochs_mne\metadata\epochs_metadata.csv
Metadata (pickle) saved to: extracted_epochs_mne\metadata\epochs_metadata.pkl


UnicodeEncodeError: 'charmap' codec can't encode characters in position 4-6: character maps to <undefined>

## 6. Test Loading Individual Epochs

In [None]:
# Test loading a few individual epochs
print("=== TESTING INDIVIDUAL EPOCH LOADING ===")

# Load first 3 epochs as examples
test_epochs = epochs_df.head(3)

for i, (_, row) in enumerate(test_epochs.iterrows()):
    print(f"\n--- Test Epoch {i+1} ---")
    print(f"Epoch ID: {row['epoch_id']}")
    print(f"File: {row['filename']}")
    print(f"Subject: {row['subject_name']}, Session: {row['session_number']}")
    print(f"Speech Type: {row['speech_type']}, Class: {row['class']}")
    
    # Load the epoch
    try:
        epoch = mne.read_epochs(row['file_path'], verbose=False)
        
        print(f"✓ Successfully loaded epoch")
        print(f"  Data shape: {epoch.get_data().shape}")
        print(f"  Sampling frequency: {epoch.info['sfreq']} Hz")
        print(f"  Number of channels: {len(epoch.ch_names)}")
        print(f"  Time range: {epoch.tmin:.2f} to {epoch.tmax:.2f} seconds")
        
        # Check metadata
        if epoch.metadata is not None:
            print(f"  Metadata columns: {list(epoch.metadata.columns)}")
            print(f"  Metadata values: {dict(epoch.metadata.iloc[0])}")
        
        # Check events
        print(f"  Events: {epoch.events}")
        print(f"  Event ID: {epoch.event_id}")
        
    except Exception as e:
        print(f"✗ Error loading epoch: {e}")

## 7. Create Usage Examples

In [None]:
print("=== USAGE EXAMPLES ===")

# Example 1: Load all Inner speech epochs for a specific subject
print("\n1. Loading all Inner speech epochs for subject 1:")
inner_speech_sub1 = epochs_df[
    (epochs_df['speech_type'] == 'Inner') & 
    (epochs_df['subject_number'] == 1)
]

print(f"Found {len(inner_speech_sub1)} Inner speech epochs for subject 1")
print("Sample file paths:")
for filepath in inner_speech_sub1['relative_path'].head(3):
    print(f"  - {filepath}")

# Example 2: Load epochs by class
print("\n2. Loading all 'Up' class epochs:")
up_epochs = epochs_df[epochs_df['class'] == 'Up']
print(f"Found {len(up_epochs)} 'Up' class epochs")
print(f"Distribution by speech type:")
print(up_epochs['speech_type'].value_counts())

# Example 3: Create a function to load multiple epochs
def load_epochs_by_criteria(metadata_df, **criteria):
    """
    Load multiple epochs based on filtering criteria.
    
    Parameters:
    - metadata_df: DataFrame with epoch metadata
    - **criteria: Filtering criteria (e.g., subject_number=1, speech_type='Inner')
    
    Returns:
    - List of loaded MNE Epochs objects
    - Filtered metadata DataFrame
    """
    
    # Apply filters
    filtered_df = metadata_df.copy()
    for key, value in criteria.items():
        if isinstance(value, list):
            filtered_df = filtered_df[filtered_df[key].isin(value)]
        else:
            filtered_df = filtered_df[filtered_df[key] == value]
    
    # Load epochs
    loaded_epochs = []
    for _, row in filtered_df.iterrows():
        try:
            epoch = mne.read_epochs(row['file_path'], verbose=False)
            loaded_epochs.append(epoch)
        except Exception as e:
            print(f"Warning: Could not load {row['filename']}: {e}")
    
    return loaded_epochs, filtered_df

print("\n3. Example function usage:")
print("# Load all Inner speech 'Up' epochs from subjects 1-3")
print("epochs_list, metadata = load_epochs_by_criteria(")
print("    epochs_df,")
print("    subject_number=[1, 2, 3],")
print("    speech_type='Inner',")
print("    class='Up'")
print(")")

# Test the function with a small sample
sample_epochs, sample_metadata = load_epochs_by_criteria(
    epochs_df.head(5),  # Just test with first 5 epochs
    subject_number=1
)

print(f"\nTest function result: Loaded {len(sample_epochs)} epochs")
if len(sample_epochs) > 0:
    print(f"First epoch data shape: {sample_epochs[0].get_data().shape}")

## 8. Create Quick Access Guide

In [None]:
# Create a quick access guide file
guide_file = os.path.join(output_path, "QUICK_ACCESS_GUIDE.md")

guide_content = f"""# Quick Access Guide - Individual MNE Epochs

## Overview
This dataset contains {len(epochs_df)} individual EEG epochs saved as MNE `.fif` files.

## File Structure
```
{output_path}/
├── individual_epochs/          # Individual MNE epoch files
│   ├── sub-01/
│   │   ├── ses-01/
│   │   │   ├── sub-01_ses-01_trial_000_Inner_Up-epo.fif
│   │   │   ├── sub-01_ses-01_trial_001_Inner_Down-epo.fif
│   │   │   └── ...
│   │   ├── ses-02/
│   │   └── ses-03/
│   ├── sub-02/
│   └── ...
├── metadata/                   # Metadata files
│   ├── epochs_metadata.csv     # Human-readable metadata
│   ├── epochs_metadata.pkl     # Python-optimized metadata
│   └── extraction_summary.txt  # Summary statistics
└── QUICK_ACCESS_GUIDE.md       # This guide
```

## Loading Individual Epochs

### Method 1: Load Single Epoch
```python
import mne

# Load a single epoch
epoch = mne.read_epochs('individual_epochs/sub-01/ses-01/sub-01_ses-01_trial_000_Inner_Up-epo.fif')

# Get data
data = epoch.get_data()  # Shape: (1, n_channels, n_timepoints)
metadata = epoch.metadata  # Pandas DataFrame with epoch info
```

### Method 2: Load Multiple Epochs Using Metadata
```python
import pandas as pd
import mne

# Load metadata
metadata = pd.read_csv('metadata/epochs_metadata.csv')

# Filter for specific criteria
inner_speech_up = metadata[
    (metadata['speech_type'] == 'Inner') & 
    (metadata['class'] == 'Up') &
    (metadata['subject_number'] == 1)
]

# Load the epochs
epochs_list = []
for _, row in inner_speech_up.iterrows():
    epoch = mne.read_epochs(row['file_path'])
    epochs_list.append(epoch)
```

### Method 3: Combine Multiple Epochs
```python
# Load multiple epochs and combine them
epochs_list = []  # Load as shown above

# Combine into single Epochs object
if len(epochs_list) > 1:
    combined_epochs = mne.concatenate_epochs(epochs_list)
else:
    combined_epochs = epochs_list[0]

# Now you can use all MNE functions
combined_epochs.plot()  # Plot the epochs
combined_epochs.average().plot()  # Plot average
```

## Metadata Columns
- `epoch_id`: Unique identifier for each epoch
- `subject_name`: Subject ID (sub-01, sub-02, ...)
- `subject_number`: Subject number (1, 2, ...)
- `session_number`: Session number (1, 2, 3)
- `trial_number`: Trial number within session
- `speech_type`: Type of speech (Pronounced, Inner, Visualized)
- `class`: Direction class (Up, Down, Right, Left)
- `timestamp`: Original timestamp from recording
- `file_path`: Full path to the MNE file
- `relative_path`: Relative path from output directory
- `filename`: Just the filename

## Common Use Cases

### 1. Load all epochs for one subject
```python
subject_1_epochs = metadata[metadata['subject_number'] == 1]
```

### 2. Load specific speech type
```python
inner_speech = metadata[metadata['speech_type'] == 'Inner']
```

### 3. Load specific class for classification
```python
up_vs_down = metadata[metadata['class'].isin(['Up', 'Down'])]
```

### 4. Load by session
```python
session_1 = metadata[metadata['session_number'] == 1]
```

## Dataset Statistics
- Total epochs: {len(epochs_df)}
- Subjects: {epochs_df['subject_number'].nunique()}
- Sessions per subject: {epochs_df.groupby('subject_number')['session_number'].nunique().iloc[0]}
- Sampling frequency: {epochs_df['sampling_frequency'].iloc[0]} Hz
- Channels: {epochs_df['n_channels'].iloc[0]}
- Epoch duration: {epochs_df['duration_seconds'].iloc[0]:.2f} seconds

## Speech Type Distribution
"""

for speech_type, count in speech_type_counts.items():
    guide_content += f"- {speech_type}: {count} epochs ({count/len(epochs_df)*100:.1f}%)\n"

guide_content += "\n## Class Distribution\n"
for class_name, count in class_counts.items():
    guide_content += f"- {class_name}: {count} epochs ({count/len(epochs_df)*100:.1f}%)\n"

guide_content += """
## Tips
1. Use the metadata CSV file to explore and filter epochs before loading
2. Each epoch file contains exactly one trial with complete MNE metadata
3. File names include subject, session, trial, speech type, and class for easy identification
4. All standard MNE functions work with these epoch files
5. Use `mne.concatenate_epochs()` to combine multiple epochs into one object
"""

with open(guide_file, 'w') as f:
    f.write(guide_content)

print(f"Quick access guide saved to: {guide_file}")
print(f"\n=== ALL PROCESSING COMPLETE ===")
print(f"\nYour dataset is now organized as individual MNE epoch files!")
print(f"Each epoch can be loaded directly with: mne.read_epochs('path/to/epoch-epo.fif')")
print(f"\nNext steps:")
print(f"1. Read the Quick Access Guide: {guide_file}")
print(f"2. Explore the metadata: {metadata_file}")
print(f"3. Start loading and analyzing individual epochs!")