In [1]:
import os
from pathlib import Path

import numpy as np
import pyedflib

# Root directory of the data, read from the DATA_PATH environment variable.
# Fail with an explicit message instead of the opaque TypeError that
# Path(None) raises when the variable is missing.
try:
    DATA_PATH = Path(os.environ["DATA_PATH"])
except KeyError as exc:
    raise RuntimeError(
        "Environment variable DATA_PATH is not set; "
        "point it at the data root directory before running this notebook."
    ) from exc

In [2]:
# Open a single sample recording below DATA_PATH for interactive inspection.
# EdfReader is given the path as a plain string.
edf_path = str(DATA_PATH.joinpath("tueg/aaaaawey_s001_t001.edf"))
f = pyedflib.EdfReader(edf_path)

In [3]:
# Show the number of channels in the file, then the label of every channel
print(f.signals_in_file, f.getSignalLabels(), sep="\n")

31
['EEG FP1-REF', 'EEG FP2-REF', 'EEG F3-REF', 'EEG F4-REF', 'EEG C3-REF', 'EEG C4-REF', 'EEG P3-REF', 'EEG P4-REF', 'EEG O1-REF', 'EEG O2-REF', 'EEG F7-REF', 'EEG F8-REF', 'EEG T3-REF', 'EEG T4-REF', 'EEG T5-REF', 'EEG T6-REF', 'EEG A1-REF', 'EEG A2-REF', 'EEG FZ-REF', 'EEG CZ-REF', 'EEG PZ-REF', 'EEG ROC-REF', 'EEG LOC-REF', 'EEG EKG1-REF', 'EMG-REF', 'EEG T1-REF', 'EEG T2-REF', 'PHOTIC-REF', 'IBI', 'BURSTS', 'SUPPR']


In [4]:
# Sampling frequency of every channel, one entry per signal in the file.
# In this recording the last three channels are sampled at a different rate
# than the others (see the output below).
f.getSampleFrequencies()

array([250., 250., 250., 250., 250., 250., 250., 250., 250., 250., 250.,
       250., 250., 250., 250., 250., 250., 250., 250., 250., 250., 250.,
       250., 250., 250., 250., 250., 250.,   1.,   1.,   1.])

In [11]:
# Number of samples stored for each channel, one entry per signal in the file.
# Channels recorded at a different rate hold a different number of samples.
f.getNSamples()

array([327750, 327750, 327750, 327750, 327750, 327750, 327750, 327750,
       327750, 327750, 327750, 327750, 327750, 327750, 327750, 327750,
       327750, 327750, 327750, 327750, 327750, 327750, 327750, 327750,
       327750, 327750, 327750, 327750,   1311,   1311,   1311])

In [7]:
# Collect the first `length` samples of every channel in one 2-D array
# (one row per channel, one column per sample).
# NOTE(review): the channels in this file do not all share one sampling
# rate (see the frequencies printed above), so equal sample counts do not
# cover equal time spans across rows.
n = f.signals_in_file
length = 500

signals = np.zeros((n, length))
for i, channel_row in enumerate(signals):
    # channel_row is a view into `signals`, so this fills row i in place
    channel_row[:] = f.readSignal(i, n=length)

In [8]:
# Sanity check: expected shape is (number of channels, samples read per channel)
signals.shape

(31, 500)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Number of consecutive samples per window along the time axis
CHUNK_SIZE = 150
# Maximum number of windows written into a single .npy file
FILE_SIZE = 1000
# Directory that process_split_chunk writes its .npy files into
PATH_BASE = Path("../../data/tueg_split")
def process_split_chunk(chunk):
    """Convert the EDF recordings listed in ``chunk`` into z-scored ``.npy`` files.

    For every row that is not yet processed, has ``match`` set and has a
    ``frequency`` of 250 or 256, the recording at ``row.path`` is read, its
    channels are placed into the rows given by the key order of
    ``electrode_mapping``, the signal is cut into windows of ``CHUNK_SIZE``
    samples, normalised with a global median / MAD z-score and written to
    ``PATH_BASE`` in files holding at most ``FILE_SIZE`` windows each.

    Args:
        chunk: pandas DataFrame (it is accessed through ``iterrows`` and
            ``.loc``) with the columns ``processed``, ``match``,
            ``frequency``, ``path`` and ``channels``. ``channels`` must be
            the string representation of a list of channel labels, in the
            order the signals appear in the EDF file.

    Returns:
        The same ``chunk`` object, with ``processed`` set to ``True`` for
        every row that was handled, including rows skipped because of an
        unsupported frequency or a recording shorter than one window.

    Notes:
        Each saved array has shape ``(n_windows, n_channels, CHUNK_SIZE)``;
        ``array[k, c]`` holds samples ``k * CHUNK_SIZE`` up to
        ``(k + 1) * CHUNK_SIZE`` of channel ``c``.
    """
    # Imported locally: no visible notebook cell imports these modules.
    import ast
    import multiprocessing

    n_data = len(chunk)
    proc = multiprocessing.current_process().pid

    # Loop-invariant lookup: electrode name -> row index in the output array.
    electrode_rows = {name: pos for pos, name in enumerate(electrode_mapping)}
    n_ch = len(electrode_rows)

    for iter_idx, (idx, row) in enumerate(chunk.iterrows()):
        if row.processed or not row.match:
            continue
        # If frequency does not match, do not do anything with file (for now?)
        if row.frequency not in (250, 256):
            chunk.loc[idx, 'processed'] = True
            continue

        ch_list = ast.literal_eval(row.channels)
        edf = pyedflib.EdfReader(row.path)
        try:
            # NOTE(review): the length of the first signal is used for all
            # channels; assumes every mapped electrode has that many samples.
            n_buf = edf.getNSamples()[0]
            # Rows of electrodes absent from this recording stay all-zero.
            signals = np.zeros((n_ch, n_buf), dtype=np.float32)
            for ch_idx, ch in enumerate(ch_list):
                # Labels look like "<name>-<reference>"; keep the name part.
                ch_name = ch.split('-')[0]
                if ch_name in electrode_rows:
                    signals[electrode_rows[ch_name], :] = edf.readSignal(ch_idx, n=n_buf)
        finally:
            # Always release the file handle, even if reading failed.
            edf.close()

        n_chunks = n_buf // CHUNK_SIZE
        if n_chunks == 0:
            # Recording is shorter than one window: nothing to save.
            chunk.loc[idx, 'processed'] = True
            continue

        # If LE, average reference
        # NOTE(review): the mean is taken over all rows, including the
        # all-zero rows of electrodes missing from the recording.
        if 'tcp_le' in row.path:
            signals = signals - np.mean(signals, axis=0)

        # Truncate so the time axis divides evenly by CHUNK_SIZE
        signals = signals[:, :n_chunks * CHUNK_SIZE]

        # Split the time axis into windows, then move the window axis to the
        # front. A direct reshape(-1, n_ch, CHUNK_SIZE) of the (n_ch, time)
        # array would mix the channel and time axes.
        windows = np.ascontiguousarray(
            signals.reshape(n_ch, n_chunks, CHUNK_SIZE).transpose(1, 0, 2)
        )

        # Robust z-score the data (global median and median absolute deviation)
        median = np.median(windows)
        mad = np.median(np.abs(windows - median), keepdims=True)
        mad[mad == 0] = 1e-6  # avoid division by zero for constant signals
        z_scored_signals = (windows - median) / mad

        # Save the windows in files of at most FILE_SIZE windows each; the
        # last file holds the remainder.
        file_stem = row.path.split('edf/')[1].replace('/', '_')[:-4]
        PATH_BASE.mkdir(parents=True, exist_ok=True)
        for i, start_idx in enumerate(range(0, n_chunks, FILE_SIZE)):
            filename = PATH_BASE / f"{file_stem}_chunk_{i}.npy"
            np.save(filename, z_scored_signals[start_idx:start_idx + FILE_SIZE])

        if (iter_idx + 1) % 10 == 0:
            print(f'Worker {proc}: {iter_idx + 1}/{n_data}, Finished processing {row.path}')
        chunk.loc[idx, 'processed'] = True

    return chunk

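# Deprecated: earlier outlier-removal and normalization experiments, kept for reference only
        # Remove entries with values that fall outside the percentile bounds (the code below uses the 1st and 99th percentiles)
        # First z-score, then remove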
        # z_score = (signals - signals.mean()) / signals.std()
        # print(f'Mean: {signals.mean()}, std: {signals.std()}')
        # mask = np.abs(z_score) < 3
        # valid_entries_mask = np.all(mask, axis=(1, 2))
        # plt.scatter(x=np.linspace(0, n_chunks * CHUNK_SIZE, n_chunks), y=(valid_entries_mask - 0.5) * 200, c=valid_entries_mask)
        # plt.xlim(0, 10000)
        # plt.ylim(-105, 105)
        # plt.show()
        # lower_bound = np.percentile(signals, 1)
        # upper_bound = np.percentile(signals, 99)
        # mask = (signals >= lower_bound) & (signals <= upper_bound)
        # valid_entries_mask = np.all(mask, axis=(1, 2))

        # filtered_signals = signals[valid_entries_mask]

        # Reorder channels dim
        # ch_indices = list(electrode_idx_mapping.values())
        # filtered_signals = filtered_signals[ch_indices]

        # Normalize over valid signals to [-1, 1], loses amplitude information compared to rest of data
        # min_val = filtered_signals.min()
        # max_val = filtered_signals.max()
        # filtered_signals = 2 * (filtered_signals - min_val) / (max_val - min_val) - 1