In [36]:
import os
import glob
import re
from scipy.io import loadmat
import pandas as pd

# Base path for the EEG raw data.
# Built with os.path.join so the separator matches the host OS: a literal
# backslash is not a path separator on POSIX, where the raw-string form would
# name a single (non-existent) entry and the session glob would match nothing.
base_path = os.path.join("..", "data", "seed-iv", "eeg_raw_data")
# Parent data directory under which the derived output is written.
base_path_out = os.path.join("..", "data", "seed-iv")

In [2]:
# Per-trial labels, one list per recording session (24 entries each).
# Entry k (0-based) is the label paired with the k-th EEG signal of a file
# from that session.
# NOTE(review): the values 0..3 are presumably emotion class codes; confirm
# the code-to-class mapping against the dataset documentation.
session1_label = [
    1, 2, 3, 0, 2, 0, 0, 1,
    0, 1, 2, 1, 1, 1, 2, 3,
    2, 2, 3, 3, 0, 3, 0, 3,
]  # fmt: skip
session2_label = [
    2, 1, 3, 0, 0, 2, 0, 2,
    3, 3, 2, 3, 2, 0, 1, 1,
    2, 1, 0, 3, 0, 1, 3, 1,
]  # fmt: skip
session3_label = [
    1, 2, 2, 1, 3, 3, 3, 1,
    1, 2, 1, 0, 2, 3, 3, 0,
    2, 3, 0, 0, 2, 0, 1, 0,
]  # fmt: skip

In [31]:
# Load every .mat file of every session and collect one record per EEG signal.
#
# Result: all_sessions_data maps "session1"/"session2"/"session3" to a list of
# dicts with the keys file_path, subject_number, subject_date, signal_name,
# signal_index, prefix, data and label.

# Labels for each session, in the same order as the session folders below.
session_labels = {
    "session1": session1_label,
    "session2": session2_label,
    "session3": session3_label,
}

# Paths for each session (folder 1, 2, and 3)
session_paths = [os.path.join(base_path, folder) for folder in ("1", "2", "3")]

# Keys of interest look like "<prefix>_eeg<number>", e.g. "cz_eeg1".
_EEG_KEY_PATTERN = re.compile(r"(.*)_eeg(\d+)$")


def _parse_subject_file_name(file_name):
    """Split a name like "7_20150715.mat" into (subject_number, subject_date).

    Returns (None, None) when the part before the first "_" is not an
    integer. Returns (number, None) when the name has no "_"-separated date
    part, instead of raising IndexError.
    """
    # Drop the extension by name rather than by slicing a fixed 4 characters,
    # so names with extra "_" segments do not lose part of the date.
    stem = os.path.splitext(file_name)[0]
    subject_str, _, remainder = stem.partition("_")
    try:
        subject_number = int(subject_str)
    except ValueError:
        return None, None
    subject_date = remainder.split("_")[0] or None
    return subject_number, subject_date


def _collect_eeg_keys(keys):
    """Return (key, index, prefix) for each "<prefix>_eeg<N>" key, sorted by N."""
    found = []
    for key in keys:
        # Ignore meta keys
        if key.startswith("__"):
            continue
        match = _EEG_KEY_PATTERN.match(key)
        if match:
            found.append((key, int(match.group(2)), match.group(1)))
    found.sort(key=lambda item: item[1])
    return found


# A dictionary to store final data from all sessions
all_sessions_data = {"session1": [], "session2": [], "session3": []}

for (session_name, labels), session_path in zip(session_labels.items(), session_paths):
    # Get all .mat files in the current session folder
    mat_files = sorted(glob.glob(os.path.join(session_path, "*.mat")))
    if not mat_files:
        # An empty match usually means a wrong base path; say so instead of
        # silently producing an empty session.
        print(f"Warning: no .mat files found in '{session_path}'.")

    session_data = []

    for file_path in mat_files:
        file_name = os.path.basename(file_path)
        subject_number, subject_date = _parse_subject_file_name(file_name)

        # Load the .mat file
        data_dict = loadmat(file_path)
        valid_keys = _collect_eeg_keys(data_dict.keys())

        # Check if the number of found signals matches the number of labels
        if len(valid_keys) != len(labels):
            print(
                f"Warning: In file '{file_name}', found {len(valid_keys)} EEG signals but expected {len(labels)}."
            )
            print(
                "Labels are assigned by signal index; signals whose index has no label are skipped."
            )

        for key, idx, prefix in valid_keys:
            # Look the label up by the 1-based index parsed from the key, so a
            # missing or extra signal cannot shift the labels of the others.
            if not 1 <= idx <= len(labels):
                print(
                    f"Warning: In file '{file_name}', skipping '{key}': no label for index {idx}."
                )
                continue

            # Store a record of this signal
            session_data.append(
                {
                    "file_path": file_path,
                    "subject_number": subject_number,
                    "subject_date": subject_date,
                    "signal_name": key,  # e.g. 'mz_eeg1'
                    "signal_index": idx,  # e.g. 1
                    "prefix": prefix,  # e.g. 'mz'
                    "data": data_dict[key],  # the array stored under this key
                    "label": labels[idx - 1],  # label from session*_label
                }
            )

    # Save the accumulated data for the current session
    all_sessions_data[session_name] = session_data

print("All sessions loaded successfully.")

All sessions loaded successfully.


In [38]:
# Write every loaded signal to its own CSV file, laid out as
#   <base_path_out>/output_csv/<session_name>/<subject>/<signal_name>_<label>.csv

# Base directory for the output CSV files. os.path.join avoids mixing a
# hard-coded "/" with whatever separator base_path_out already uses.
output_base = os.path.join(base_path_out, "output_csv")

# Create the base directory if it doesn't exist
os.makedirs(output_base, exist_ok=True)

# Iterate over each session in the all_sessions_data dictionary
for session_name, session_records in all_sessions_data.items():
    # Create a folder for the current session
    session_dir = os.path.join(output_base, session_name)
    os.makedirs(session_dir, exist_ok=True)

    # Subject folders already created for this session, so makedirs runs once
    # per subject rather than once per signal.
    created_subject_dirs = set()

    # Iterate over each record (signal) in the current session
    for record in session_records:
        subject_number = record["subject_number"]  # The subject's numeric ID, or None
        signal_name = record["signal_name"]  # For example, 'mz_eeg1'
        label = record["label"]  # The label (e.g., 3)
        # The stored array is transposed before writing, so each CSV row is
        # one column of the original array.
        data = record["data"].T

        # Records whose file name had no numeric subject ID share one folder
        subject_str = "unknown_subject" if subject_number is None else str(subject_number)

        # Create a directory for this subject inside the session directory
        subject_dir = os.path.join(session_dir, subject_str)
        if subject_dir not in created_subject_dirs:
            os.makedirs(subject_dir, exist_ok=True)
            created_subject_dirs.add(subject_dir)

        # Define a CSV filename in the format: signal_name_label.csv
        csv_name = f"{signal_name}_{label}.csv"
        csv_path = os.path.join(subject_dir, csv_name)

        # Save the signal to a CSV file without header or index
        df = pd.DataFrame(data)
        df.to_csv(csv_path, header=False, index=False)

print("Done! CSV files have been saved in the specified folders.")

Done! CSV files have been saved in the specified folders.
