In [1]:
import pandas as pd 
import os
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import json
import mne
import numpy as np
from tqdm import tqdm
import re
from sklearn.preprocessing import StandardScaler
import umap
from sklearn.cluster import DBSCAN
from multiprocessing import Pool
import warnings

warnings.filterwarnings("ignore")
mne.set_log_level('ERROR')

os.chdir('../..')

# Load All The Experiments

In [2]:
def collect_valid_paths():
    root_dir = "Generated/Data"
    s_dir_pattern = re.compile(r"^S_\d+$")
    trial_dir_pattern = re.compile(r"^Trial_\d+$")
    valid_dirs = []
    for s_dir in os.listdir(root_dir):
        s_path = os.path.join(root_dir, s_dir)
        if os.path.isdir(s_path) and s_dir_pattern.match(s_dir):
            for trial_dir in os.listdir(s_path):
                trial_path = os.path.join(s_path, trial_dir)
                if os.path.isdir(trial_path) and trial_dir_pattern.match(trial_dir):
                    if any(os.path.isfile(os.path.join(trial_path, f)) for f in os.listdir(trial_path)):
                        valid_dirs.append(trial_path)
    return valid_dirs

all_paths = collect_valid_paths()

print(f"Найдено {len(all_paths)} подходящих директорий")

Найдено 31 подходящих директорий


In [3]:
def load_eeg(path):
    eeg_clean_path = os.path.join(path, "EEG_clean.fif")
    
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning, module="mne")
        clean_eeg = mne.io.read_raw_fif(eeg_clean_path, preload=True, verbose=False)
    
    clean_eeg.pick_types(eeg=True, verbose=False)

    # Извлечем S_13 и Trial_2 из пути
    parts = path.strip(os.sep).split(os.sep)
    subj = parts[-2]
    trial = parts[-1]

    subj_id = int(subj.split('_')[-1])
    trial_id = int(trial.split('_')[-1])

    
    key = f"S{subj_id}_T{trial_id}"

    return key, [clean_eeg, subj_id, trial_id]
        

# Параллельная загрузка
with Pool(processes=72)  as pool:
    results = list(tqdm(pool.imap(load_eeg, all_paths), total=len(all_paths)))

# Собрать словарь, исключив ошибки
data_tuple = {k: v for k, v in results if k is not None}

eeg, subj, trial = data_tuple['S6_T1']

100%|██████████| 31/31 [01:48<00:00,  3.51s/it]


# Average FFT Spectrums on All Experiment (PSD)
As a result, produces:
* `psds_array` - list of psds morlet wavelet spectras
* `metadata`   - list of [`sub_id`, `trial_id`, `gender`, `handiness`, `age`, ...] 

### Exclude 300 sec and 30 sec rest blocks
All subjects have the same structure of the experiment, so we will get timings of these blocks from S1_T1

In [4]:
experiment_path = f"./Generated/Data/S_1/Trial_1/Experiment.json"

# Load experiment sequence
with open(experiment_path,'r') as f:
    experiment_seq = json.load(f)
    
clean_blocks = [
    (block_data['timestamp'], block_data['content']['duration'])
    for block_data in experiment_seq.values()
    if not (block_data['type'] == 'rest' and (block_data['content']['duration'] == 300 or block_data['content']['duration'] == 30))
]

In [5]:
from multiprocessing import Pool, cpu_count
from functools import partial

# Функция для обработки одного элемента
def process_key(key_data, clean_blocks):
    key, (eeg, subj, trial) = key_data

    # Собираем отрезки с execution
    epochs = []
    for start_time, duration in clean_blocks:
        start_sample = int(start_time)
        stop_sample = min(int((start_time + duration)), len(eeg) / 1000.0 - 0.001)
        segment = eeg.copy().crop(tmin=start_sample, tmax=stop_sample)
        epochs.append(segment)

    # Объединяем execution-сегменты
    cropped_eeg = mne.concatenate_raws(epochs)
    print(f"EEG for {key} is cropped for {len(cropped_eeg)} samples.")

    return key, (cropped_eeg, subj, trial)

# Мультипроцессинг
with Pool(processes=72) as pool:
    func = partial(process_key, clean_blocks=clean_blocks)
    data_tuple = dict(pool.map(func, data_tuple.items()))

EEG for S4_T2 is cropped for 918080 samples.
EEG for S4_T1 is cropped for 918080 samples.
EEG for S14_T1 is cropped for 917995 samples.
EEG for S15_T2 is cropped for 918080 samples.
EEG for S15_T1 is cropped for 918080 samples.
EEG for S1_T2 is cropped for 918080 samples.
EEG for S1_T1 is cropped for 918080 samples.
EEG for S5_T2 is cropped for 918080 samples.
EEG for S5_T1 is cropped for 918080 samples.
EEG for S3_T2 is cropped for 918080 samples.
EEG for S3_T1 is cropped for 918080 samples.
EEG for S2_T2 is cropped for 918080 samples.
EEG for S2_T1 is cropped for 918080 samples.
EEG for S7_T2 is cropped for 918080 samples.
EEG for S7_T1 is cropped for 918080 samples.
EEG for S10_T2 is cropped for 918080 samples.
EEG for S10_T1 is cropped for 918080 samples.
EEG for S8_T2 is cropped for 918080 samples.
EEG for S8_T1 is cropped for 918080 samples.
EEG for S12_T2 is cropped for 918080 samples.
EEG for S12_T1 is cropped for 918080 samples.
EEG for S16_T2 is cropped for 918080 samples.
EE

### Get the subjects info
Will be usefull to pack ones for spectrums 

In [6]:
from datetime import datetime

# --- 1) Читаем Excel, пропуская первую строку «слитного» заголовка ---
file_path = './Supplementary/Experiment_Metadata.xlsx'
df_raw = pd.read_excel(file_path, header=1)

# --- 2) Переименовываем нужные колонки ---
df_raw = df_raw.rename(columns={
    'Subject ID'         : 'Subject_id',
    'Имя'                : 'Name',
    'Фамилия'            : 'Surname',
    'Дата Рождения'      : 'Birthdate',
    'Пол'                : 'Gender',
    'Правша / Левша'     : 'Handiness',
})

# --- 3) Оставляем только нужные поля, включая Handiness ---
df = df_raw[['Subject_id', 'Name', 'Surname', 'Birthdate', 'Gender', 'Handiness']].copy()

# --- 4) Преобразуем «Пол» из кириллицы в латинскую букву m/f ---
gender_map = {'М': 'm', 'Ж': 'f'}
df['Gender'] = df['Gender'].map(gender_map)

# --- 5) Преобразуем «Handiness» в r/l ---
# Предполагаем, что в файле пишется именно «Правша» или «Левша»
hand_map = {'Правша': 'r', 'Левша': 'l'}
df['Handiness'] = df['Handiness'].map(hand_map)

# --- 6) Переводим «Дата Рождения» в datetime ---
df['Birthdate'] = pd.to_datetime(df['Birthdate'], dayfirst=True, errors='coerce')

# --- 7) Считаем возраст в годах (целое число) ---
today = datetime.today()
def calculate_age(birth_dt):
    if pd.isna(birth_dt):
        return None
    years = today.year - birth_dt.year
    if (today.month, today.day) < (birth_dt.month, birth_dt.day):
        years -= 1
    return int(years)

df['Age'] = df['Birthdate'].apply(calculate_age)

# --- 8) Заменяем NaN в Age на 0, превращая колонку в целочисленную ---
df['Age'] = df['Age'].fillna(0).astype(int)

# --- 9) Удаляем колонку Birthdate, она больше не нужна ---
df = df.drop(columns=['Birthdate'])

# --- 10) Делаем Subject_id индексом ---
subject_metadata = df.set_index('Subject_id')

subject_metadata

Unnamed: 0_level_0,Name,Surname,Gender,Handiness,Age
Subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Софья,Гамершмидт,f,r,21
2,Сергей,Пешков,m,r,19
3,Рувшан,Давлитшин,m,r,18
4,Мухаббат,Давлатова,f,r,30
5,Камиль,Фоатов,m,r,0
6,Алия,Хадеева,f,r,18
7,Алёна,Петренко,f,r,19
8,Умар,Магомедов,m,r,18
9,Мария,Мельникова,f,r,31
10,Нурсултан,Абдулаев,m,r,20


### Compute and Save FFT Spectras (PSD)

In [8]:
from mne.time_frequency import psd_array_welch
from tqdm import tqdm

n_jobs = 72
results_arr = []

for key, (eeg, subj, trial) in tqdm(data_tuple.items()):
    # Получаем данные (n_channels, n_times)
    data = eeg.get_data()
    
    # Преобразуем в (n_epochs=1, n_channels, n_times)
    data_reshaped = data[np.newaxis, :, :]

    # FFT-параметры
    sfreq = eeg.info['sfreq']
    fmin, fmax = 2, 40  # Диапазон частот
    n_fft = int(sfreq * 10)  # 10 секунд окна → частотное разрешение = 0.1 Гц

    # Вычисляем спектры мощности через FFT (shape: (1, n_channels, n_freqs))
    psds, freqs = psd_array_welch(
        data_reshaped,
        sfreq=sfreq,
        fmin=fmin,
        fmax=fmax,
        n_fft=n_fft,
        n_jobs=n_jobs,
        verbose=False
    )

    # Усредняем по времени (в данном случае один отрезок, просто берем [0])
    power_mean = psds[0]  # (n_channels, n_freqs)

    # Метаданные
    gender    = subject_metadata.loc[subj, 'Gender']
    handiness = subject_metadata.loc[subj, 'Handiness']
    age       = subject_metadata.loc[subj, 'Age']

    results_arr.append([power_mean, subj, trial, gender, handiness, age])

# Пример формы спектра для первого результата
results_arr[0][0].shape  # (n_channels, n_freqs)

100%|██████████| 31/31 [02:01<00:00,  3.92s/it]


(63, 381)

In [9]:
save_dict = {}
for i, entry in enumerate(results_arr):
    psd, s_id, t_id, gender, handiness, age = entry
    save_dict[f'psd_{i}'] = psd
    save_dict[f'subject_id_{i}'] = np.array(s_id)
    save_dict[f'trial_id_{i}'] = np.array(t_id)
    save_dict[f'gender_{i}'] = np.array(gender, dtype='U1')
    save_dict[f'handiness_{i}']  = np.array(handiness, dtype='U1')
    save_dict[f'age_{i}'] = np.array(age, dtype=int)

# Сохраняем в .npz
np.savez('./Generated/Spectrums/psds_array_fft.npz', **save_dict)

### Visualizations (Optional)

In [10]:
from mne.time_frequency import psd_array_welch
from tqdm import tqdm

plt.ioff()

results_arr = []

for key, (eeg, subj, trial) in tqdm(data_tuple.items()):
    
    # Вычисляем PSD через MNE (дискретизация чуть меньше, чтоб было читабельно)
    psd_mne = eeg.compute_psd(method="welch", fmin=2, fmax=40, n_fft=int(eeg.info['sfreq'] * 5), verbose=False)
    
    # Рисуем PSD по всем каналам
    fig_all = psd_mne.plot(average=False, show=True)
    fig_all.suptitle(f'FFT Subject {subj} Trial {trial}', fontsize=14)
    fig_all.savefig(f'./Generated/Figures/Spectral_Analysis/All_Experiment/FFT_S{subj}_T{trial}.png')
    plt.close(fig_all)  # <-- Закрываем, чтобы не отображался в ноутбуке
    
    
    # Рисуем усреднённый PSD
    fig_avg = psd_mne.plot(average=True, show=True)
    fig_avg.suptitle(f'FFT Subject {subj} Trial{trial}', fontsize=14)
    fig_avg.savefig(f'./Generated/Figures/Spectral_Analysis/All_Experiment/FFT_S{subj}_T{trial}_Avg.png')
    plt.close(fig_avg)  # <-- Тоже закрываем

plt.ion()

pass

100%|██████████| 31/31 [01:00<00:00,  1.94s/it]
