In [15]:
import pandas as pd
import os
import mne
import numpy as np
from tqdm.notebook import tqdm
import networkx as nx
import pickle
import shutil
import scipy.stats



The top-level directories:

 edf/dev/01_tcp_ar
 edf/dev/02_tcp_le
 edf/dev/03_tcp_ar_a
 edf/train/01_tcp_ar
 edf/train/02_tcp_le
 edf/train/03_tcp_ar_a

refer to the appropriate channel configurations for the
EEGs. 01_tcp_ar refers to an AR reference configuration, with
annotations referencing a TCP format described below.

The pathname of a typical EEG file can be explained as follows:

 Filename:
  edf/dev_test/01_tcp_ar/002/00000258/s002_2003_07_21/00000258_s002_t000.edf

 Components:
  edf: contains the edf data

  dev_test: part of the dev_test set (as opposed to the train set)

  01_tcp_ar: data that follows the averaged reference (AR) configuration,
             while annotations use the TCP channel configuration

  002: a three-digit identifier meant to keep the number of subdirectories
       in a directory manageable. This follows the TUH EEG v1.1.0 convention.

  00000258: official patient number that is linked to v1.1.0 of TUH EEG

  s002_2003_07_21: session two (s002) for this patient. The session
                   was archived on 07/21/2003.

  00000258_s002_t000.edf: the actual EEG file. These are split into a series of
  			  files starting with t000.edf, t001.edf, ... These
			  represent pruned EEGs, so the original EEG is 
			  split into these segments, and uninteresting
			  parts of the original recording were deleted
			  (common in clinical practice).

The easiest way to access the annotations is through the spreadsheet
provided (_SEIZURES_*.xlsx). This contains the start and stop time
of each seizure event in an easy to understand format. Convert the
file to .csv if you need a machine-readable version.

There are six types of files in this release:

 *.edf:    the EEG sampled data in European Data Format (edf)
 *.txt:    the EEG report corresponding to the patient and session
 *.tse:    term-based annotations using all available seizure type classes
 *.tse_bi: same as *.tse except bi-class annotations (seizure/background) 
 *.lbl:    event-based annotations using all available seizure type classes
 *.lbl_bi: same as *.lbl except bi-class annotations (seizure/background)


In [16]:
# Dataset root: the "data" folder located in the current working directory.
DATA_PATH = os.path.join(os.getcwd(), "data")

# Sampling frequency (Hz) every recording is brought to before windowing.
RESAMPLE_RATE = 128

# Duration (s) of one sliding window.
SLIDING_WINDOW = 1

# Upper bound on the number of windows kept for one seizure event.
MAX_SEQUENCE_LENGTH = 100

In [17]:
# Parse the seizure workbook a single time: asking for a list of sheet names
# makes pandas return a {sheet name: DataFrame} mapping.
_reference_sheets = pd.read_excel(
    os.path.join(DATA_PATH, "_DOCS/seizures_v36r.xlsx"),
    sheet_name=["train", "dev"],
    usecols='A:O',
    index_col=0,
)
ref_train = _reference_sheets["train"]
ref_dev = _reference_sheets["dev"]

In [18]:
def _clean_reference(ref):
    """Return a cleaned copy of one sheet of the seizure reference workbook.

    Steps, applied in this order:
      1. give the three unnamed / mislabelled columns their real names
         ("Filename", "Start Time", "Stop Time");
      2. drop the first row (presumably the second header line of the
         spreadsheet -- TODO confirm against the workbook);
      3. keep only rows that have a start time, a stop time and a seizure type.

    The function is idempotent: a frame that no longer has the raw
    "Unnamed: 11" column is considered already cleaned and is returned
    untouched, so re-running the cell does not silently drop one more row
    each time.
    """
    if "Unnamed: 11" not in ref.columns:
        return ref
    return (
        ref.rename(columns={"Unnamed: 11": "Filename", "Seizure Time": "Start Time", "Unnamed: 13": "Stop Time"})
        .iloc[1:, :]
        .dropna(subset=["Start Time", "Stop Time", "Seizure Type"])
    )


ref_train = _clean_reference(ref_train)
ref_dev = _clean_reference(ref_dev)


In [19]:
# Preview the first rows of the training reference table.
ref_train.head()

Unnamed: 0_level_0,File No.,Patient,Session,File,EEG Type,EEG SubType,LTM -or- Routine,Normal/ Abnormal,No. Seizures/ File,No. Seizures/ Session,Filename,Start Time,Stop Time,Seizure Type
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2.0,2.0,1479.0,s001,t001,ICU,ICU,Routine,Abnormal,1.0,1.0,./train/02_tcp_le/014/00001479/s001_2004_06_14...,950.004,1047.0178,FNSZ
3.0,3.0,1413.0,s001,t000,EMU,EMU,Routine,Abnormal,3.0,3.0,./train/02_tcp_le/014/00001413/s001_2004_07_13...,315.34,319.024,ABSZ
4.0,3.0,,,,,,,,,,./train/02_tcp_le/014/00001413/s001_2004_07_13...,327.668,330.032,ABSZ
5.0,3.0,,,,,,,,,,./train/02_tcp_le/014/00001413/s001_2004_07_13...,642.016,645.396,ABSZ
6.0,4.0,1413.0,s003,t001,Outpatient,Outpatient,Routine,Abnormal,3.0,3.0,./train/02_tcp_le/014/00001413/s003_2008_04_09...,422.04,430.3,ABSZ


In [20]:
# Preview the first rows of the dev reference table.
ref_dev.head()

Unnamed: 0_level_0,File No.,Patient,Session,File,EEG Type,EEG SubType,LTM -or- Routine,Normal/ Abnormal,No. Seizures/ File,No. Seizures/ Session,Filename,Start Time,Stop Time,Seizure Type
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
5.0,5.0,,,t002,,,,,1.0,,./dev/01_tcp_ar/002/00000258/s003_2003_07_22/0...,8.8275,173.3791,TCSZ
6.0,6.0,,,t003,,,,,1.0,,./dev/01_tcp_ar/002/00000258/s003_2003_07_22/0...,3.9623,175.5054,TCSZ
7.0,7.0,,,t004,,,,,1.0,,./dev/01_tcp_ar/002/00000258/s003_2003_07_22/0...,1.0,150.5157,TCSZ
8.0,8.0,,,t005,,,,,1.0,,./dev/01_tcp_ar/002/00000258/s003_2003_07_22/0...,207.9101,308.0,FNSZ
16.0,16.0,1027.0,s004,t000,Unknown,Unknown,LTM,Abnormal,1.0,3.0,./dev/01_tcp_ar/010/00001027/s004_2004_01_30/0...,34.0551,162.3118,CPSZ


In [21]:
# Electrode labels used as graph nodes; each one is exposed under the name
# "EEG <label>" in Signal_list.
_ELECTRODES = (
    "FP1", "FP2",
    "F3", "F4",
    "C3", "C4",
    "P3", "P4",
    "O1", "O2",
    "F7", "F8",
    "T3", "T4",
    "T5", "T6",
    "FZ", "CZ", "PZ",
)
Signal_list = [f"EEG {electrode}" for electrode in _ELECTRODES]

# Number of nodes in every graph.
n_signal = len(Signal_list)

In [22]:
def create_single_graph(start_time, f, ch_names):
    """Build the fully connected channel graph of one sliding window.

    Parameters
    ----------
    start_time : float
        Start of the window, in seconds, passed as-is to ``crop``.
    f : object
        Open recording exposing ``copy()``, ``crop()`` and ``get_data()``
        (the callers pass the object returned by ``mne.io.read_raw_edf``).
        It is not modified: the crop is applied to a copy.
    ch_names : array-like of str
        Channel names in the same order as the rows of ``f.get_data()``,
        already reduced to the form used in ``Signal_list``.

    Returns
    -------
    networkx.Graph
        One node per entry of ``Signal_list`` carrying the window samples in
        the ``"Signal"`` attribute, and one edge per unordered pair of nodes
        carrying ``"pearsonr"`` and ``"norm cross correlation"`` (absolute
        values).

    Raises
    ------
    IndexError
        If a channel of ``Signal_list`` is absent from ``ch_names``.
    """
    # crop() is applied in place, hence the copy to leave ``f`` intact.
    window = f.copy()
    window.crop(start_time, start_time + SLIDING_WINDOW, include_tmax=False)
    data = window.get_data()

    # asarray makes the element-wise comparison below valid for plain lists too.
    ch_names = np.asarray(ch_names)
    Graph_attributes = []
    for signal in Signal_list:
        matches = np.where(ch_names == signal)[0]
        if matches.size == 0:
            raise IndexError(
                f"channel {signal!r} not found in recording (available: {list(ch_names)})"
            )
        Graph_attributes.append((signal, {"Signal": data[matches[0], :]}))

    Graph = nx.Graph()
    Graph.add_nodes_from(Graph_attributes)

    # Connect every unordered pair exactly once (the graph is undirected).
    nodes = list(Graph.nodes())
    for position, node1 in enumerate(nodes):
        for node2 in nodes[position + 1:]:
            Graph.add_edge(node1, node2)

    Signals = nx.get_node_attributes(Graph, "Signal")
    edge_attrib = {}
    for edge in Graph.edges():
        node1, node2 = edge
        signal1 = Signals[node1]
        signal2 = Signals[node2]
        coeff_pearsonr, _ = scipy.stats.pearsonr(signal1, signal2)
        energy = np.sum(signal1**2) * np.sum(signal2**2)
        norm_ccorr = np.sum(signal1 * signal2) / np.sqrt(energy) if energy > 0 else 0.0
        # A flat channel makes both measures undefined; store 0 instead of
        # letting NaN/inf leak into the edge features.
        if not np.isfinite(coeff_pearsonr):
            coeff_pearsonr = 0.0
        if not np.isfinite(norm_ccorr):
            norm_ccorr = 0.0
        edge_attrib[edge] = {"pearsonr": np.abs(coeff_pearsonr), "norm cross correlation": np.abs(norm_ccorr)}
    nx.set_edge_attributes(Graph, edge_attrib)
    return Graph

In [23]:
def pad(a, i):
    """Force the 1-D array ``a`` to length ``i``.

    A longer array is cut to its first ``i`` items; otherwise zeros are
    appended on the right until the length is ``i``.
    """
    current_length = a.shape[0]
    if current_length > i:
        return a[0:i]
    filler = np.zeros(i - current_length)
    return np.hstack((a, filler))

In [24]:
def combine_graph(Graph_list):
    """Merge the per-window graphs of one event into a single sequence graph.

    Parameters
    ----------
    Graph_list : list of networkx.Graph
        Window graphs in chronological order, as produced by
        ``create_single_graph``. All graphs are expected to share the edge
        set (and edge orientation) of the first one.

    Returns
    -------
    networkx.Graph
        One node per entry of ``Signal_list``; its ``"Signal"`` attribute is
        the concatenation of the window signals, truncated or zero-padded to
        ``RESAMPLE_RATE * SLIDING_WINDOW * MAX_SEQUENCE_LENGTH`` samples.
        Every edge carries ``"pearsonr"`` and ``"norm cross correlation"`` as
        arrays with one value per window, truncated or zero-padded to
        ``MAX_SEQUENCE_LENGTH``.

    Raises
    ------
    ValueError
        If ``Graph_list`` is empty (the edge set is taken from its first item).
    """
    if len(Graph_list) == 0:
        raise ValueError("combine_graph needs at least one window graph")

    # Collect the pieces in lists and join them once at the end, rather than
    # re-allocating a growing array for every window.
    signal_chunks = {name: [] for name in Signal_list}
    Edges_combined = {
        edge: {"pearsonr": [], "norm cross correlation": []}
        for edge in Graph_list[0].edges()
    }

    # Combine the sliding windows
    for g in Graph_list:
        for name, signal in nx.get_node_attributes(g, "Signal").items():
            signal_chunks[name].append(signal)

        for node1, node2, attributes in g.edges.data():
            series = Edges_combined[(node1, node2)]
            series["pearsonr"].append(attributes["pearsonr"])
            series["norm cross correlation"].append(attributes["norm cross correlation"])

    # Truncate or pad with 0 to match sequence length
    n_samples = int(RESAMPLE_RATE * SLIDING_WINDOW * MAX_SEQUENCE_LENGTH)
    Graph_attributes = []
    for name in Signal_list:
        # The leading empty float array keeps the result float64 and makes the
        # call valid even when no window provided this channel.
        joined = np.concatenate([np.array([])] + signal_chunks[name], axis=0)
        Graph_attributes.append((name, {"Signal": pad(joined, n_samples)}))

    for series in Edges_combined.values():
        series["pearsonr"] = pad(np.array(series["pearsonr"]), MAX_SEQUENCE_LENGTH)
        series["norm cross correlation"] = pad(np.array(series["norm cross correlation"]), MAX_SEQUENCE_LENGTH)

    Graph_combined = nx.Graph()
    Graph_combined.add_nodes_from(Graph_attributes)
    Graph_combined.add_edges_from(Edges_combined)
    nx.set_edge_attributes(Graph_combined, Edges_combined)
    return Graph_combined

In [25]:
# Export every training seizure event as "data_processed/train/<i>.pickle",
# each file holding the tuple (seizure type, combined graph).
os.makedirs("data_processed/train", exist_ok=True)  # open() below does not create folders

i = 0  # running index used to name the pickles
for index, row in tqdm(ref_train.iterrows(), total=len(ref_train)):
    start_time = row["Start Time"]
    stop_time = row["Stop Time"]
    seizure_type = row["Seizure Type"]
    path_file = os.path.join(DATA_PATH, "edf/", row["Filename"])
    # Swap whatever extension the reference lists for ".edf".
    file_name = os.path.splitext(path_file)[0] + ".edf"
    file_code = os.path.splitext(os.path.basename(file_name))[0]

    with mne.io.read_raw_edf(file_name, verbose=False) as f:
        if f.info['sfreq'] != RESAMPLE_RATE:
            # Make sure the samples are in memory before resampling; the file
            # is opened without preloading.
            f.load_data()
            f.resample(RESAMPLE_RATE)  # Set the same frequency to all EEGs
        ch_names = np.array([x.split("-")[0] for x in f.ch_names])

        ### Run sliding window and construct a sequence
        compteur = 0  # number of windows collected so far
        Graph_list = []
        while start_time + SLIDING_WINDOW < stop_time:
            Graph = create_single_graph(start_time=start_time, f=f, ch_names=ch_names)
            Graph_list.append(Graph)
            start_time += SLIDING_WINDOW
            compteur += 1
            if compteur == MAX_SEQUENCE_LENGTH:
                break

    # An event shorter than one window yields no graph at all; skip it
    # instead of letting combine_graph fail on an empty list.
    if not Graph_list:
        continue

    Combined_graph = combine_graph(Graph_list)
    to_picle = (seizure_type, Combined_graph)
    # Distinct handle name so the recording variable ``f`` is not shadowed.
    with open(f"data_processed/train/{i}.pickle", "wb") as pickle_file:
        pickle.dump(to_picle, pickle_file)
    i += 1


  0%|          | 0/2377 [00:00<?, ?it/s]

In [26]:
# Export every dev seizure event as "data_processed/dev/<i>.pickle", each file
# holding the tuple (seizure type, combined graph).
# NOTE(review): ``i`` is not reset here, so this cell needs the counter left by
# the training export and the dev files continue that numbering.
os.makedirs("data_processed/dev", exist_ok=True)  # open() below does not create folders

for index, row in tqdm(ref_dev.iterrows(), total=len(ref_dev)):
    start_time = row["Start Time"]
    stop_time = row["Stop Time"]
    seizure_type = row["Seizure Type"]
    path_file = os.path.join(DATA_PATH, "edf/", row["Filename"])
    # Swap whatever extension the reference lists for ".edf".
    file_name = os.path.splitext(path_file)[0] + ".edf"
    file_code = os.path.splitext(os.path.basename(file_name))[0]

    with mne.io.read_raw_edf(file_name, verbose=False) as f:
        if f.info['sfreq'] != RESAMPLE_RATE:
            # Make sure the samples are in memory before resampling; the file
            # is opened without preloading.
            f.load_data()
            f.resample(RESAMPLE_RATE)  # Set the same frequency to all EEGs
        ch_names = np.array([x.split("-")[0] for x in f.ch_names])

        ### Run sliding window and construct a sequence
        compteur = 0  # number of windows collected so far
        Graph_list = []
        while start_time + SLIDING_WINDOW < stop_time:
            Graph = create_single_graph(start_time=start_time, f=f, ch_names=ch_names)
            Graph_list.append(Graph)
            start_time += SLIDING_WINDOW
            compteur += 1
            if compteur == MAX_SEQUENCE_LENGTH:
                break

    # An event shorter than one window yields no graph at all; skip it
    # instead of letting combine_graph fail on an empty list.
    if not Graph_list:
        continue

    Combined_graph = combine_graph(Graph_list)
    to_picle = (seizure_type, Combined_graph)
    # Distinct handle name so the recording variable ``f`` is not shadowed.
    with open(f"data_processed/dev/{i}.pickle", "wb") as pickle_file:
        pickle.dump(to_picle, pickle_file)
    i += 1

  0%|          | 0/673 [00:00<?, ?it/s]

In [27]:
# Bundle the "data_processed" directory into a zip archive named
# "data_processed.zip"; the call returns the path of the archive it wrote.
shutil.make_archive("data_processed", 'zip', "data_processed")

'/home/polyxene/MLNS-project-datascienthyris/data_processed.zip'