In [1]:
import pandas as pd
import os
import pyedflib
import numpy as np
from tqdm.notebook import tqdm

The top-level directories:

 edf/dev/01_tcp_ar
 edf/dev/02_tcp_le
 edf/dev/03_tcp_ar_a
 edf/train/01_tcp_ar
 edf/train/02_tcp_le
 edf/train/03_tcp_ar_a

refer to the appropriate channel configurations for the
EEGs. 01_tcp_ar refers to an AR reference configuration, with
annotations referencing a TCP format described below.

The pathname of a typical EEG file can be explained as follows:

 Filename:
  edf/dev_test/01_tcp_ar/002/00000258/s002_2003_07_21/00000258_s002_t000.edf

 Components:
  edf: contains the edf data

  dev_test: part of the dev_test set (as opposed to the train set)

  01_tcp_ar: data that follows the averaged reference (AR) configuration,
             while annotations use the TCP channel configuration

  002: a three-digit identifier meant to keep the number of subdirectories
       in a directory manageable. This follows the TUH EEG v1.1.0 convention.

  00000258: official patient number that is linked to v1.1.0 of TUH EEG

  s002_2003_07_21: session two (s002) for this patient. The session
                   was archived on 07/21/2003.

  00000258_s002_t000.edf: the actual EEG file. These are split into a series of
  			  files starting with t000.edf, t001.edf, ... These
			  represent pruned EEGs, so the original EEG is 
			  split into these segments, and uninteresting
			  parts of the original recording were deleted
			  (common in clinical practice).

The easiest way to access the annotations is through the spreadsheet
provided (_SEIZURES_*.xlsx). This contains the start and stop time
of each seizure event in an easy to understand format. Convert the
file to .csv if you need a machine-readable version.

There are six types of files in this release:

 *.edf:    the EEG sampled data in European Data Format (edf)
 *.txt:    the EEG report corresponding to the patient and session
 *.tse:    term-based annotations using all available seizure type classes
 *.tse_bi: same as *.tse except bi-class annotations (seizure/background) 
 *.lbl:    event-based annotations using all available seizure type classes
 *.lbl_bi: same as *.lbl except bi-class annotations (seizure/background)


In [2]:
# Root folder of the local TUH EEG data (it holds the `edf/` and `_DOCS/`
# sub-folders used by the cells below). The absolute path is specific to one
# machine, so it only serves as a fallback: set the TUSZ_DATA_PATH environment
# variable to point the notebook at a copy stored elsewhere.
DATA_PATH = os.environ.get(
    "TUSZ_DATA_PATH",
    "/mnt/d/rizwa/OneDrive - CentraleSupelec/Centrale/3A/MLNS Machine Learning in Network Science/MLNS-project-datascienthyris/data",
)


In [3]:
# Both reference sheets live in the same workbook, so parse it only once:
# passing a list to `sheet_name` makes pandas return a {sheet name: DataFrame}
# dict. For each sheet, spreadsheet columns A:O are kept and the first of them
# becomes the index.
_ref_sheets = pd.read_excel(
    os.path.join(DATA_PATH, "_DOCS/seizures_v36r.xlsx"),
    sheet_name=["train", "dev"],
    usecols="A:O",
    index_col=0,
)
ref_train = _ref_sheets["train"]
ref_dev = _ref_sheets["dev"]

In [4]:
# Column names as pandas produced them from the spreadsheet header, mapped to
# the readable names used in the rest of the notebook.
_REFERENCE_COLUMN_NAMES = {
    "Unnamed: 11": "Filename",
    "Seizure Time": "Start Time",
    "Unnamed: 13": "Stop Time",
}


def _tidy_reference(sheet):
    """Return `sheet` with readable column names and without its first row.

    The first row is presumably the second line of a two-line spreadsheet
    header rather than real data (TODO confirm against the workbook).

    A frame that no longer carries any of the raw column names is returned
    unchanged, so re-running this cell does not drop another row.
    """
    if not any(raw_name in sheet.columns for raw_name in _REFERENCE_COLUMN_NAMES):
        return sheet
    return sheet.rename(columns=_REFERENCE_COLUMN_NAMES).iloc[1:, :]


ref_train = _tidy_reference(ref_train)
ref_dev = _tidy_reference(ref_dev)

In [5]:
# Preview the first five rows of the training reference table.
ref_train.head(n=5)

Unnamed: 0_level_0,File No.,Patient,Session,File,EEG Type,EEG SubType,LTM -or- Routine,Normal/ Abnormal,No. Seizures/ File,No. Seizures/ Session,Filename,Start Time,Stop Time,Seizure Type
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1.0,1.0,1402.0,s003,t001,Outpatient,Outpatient,Routine,Normal,0.0,0.0,./train/02_tcp_le/014/00001402/s003_2008_07_21...,,,
2.0,2.0,1479.0,s001,t001,ICU,ICU,Routine,Abnormal,1.0,1.0,./train/02_tcp_le/014/00001479/s001_2004_06_14...,950.004,1047.0178,FNSZ
3.0,3.0,1413.0,s001,t000,EMU,EMU,Routine,Abnormal,3.0,3.0,./train/02_tcp_le/014/00001413/s001_2004_07_13...,315.34,319.024,ABSZ
4.0,3.0,,,,,,,,,,./train/02_tcp_le/014/00001413/s001_2004_07_13...,327.668,330.032,ABSZ
5.0,3.0,,,,,,,,,,./train/02_tcp_le/014/00001413/s001_2004_07_13...,642.016,645.396,ABSZ


In [6]:
# Preview the first five rows of the dev reference table.
ref_dev.head(n=5)

Unnamed: 0_level_0,File No.,Patient,Session,File,EEG Type,EEG SubType,LTM -or- Routine,Normal/ Abnormal,No. Seizures/ File,No. Seizures/ Session,Filename,Start Time,Stop Time,Seizure Type
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1.0,1.0,258.0,s002,t000,Outpatient,Outpatient,LTM,Abnormal,0.0,0.0,./dev/01_tcp_ar/002/00000258/s002_2003_07_21/0...,,,
2.0,2.0,,,t002,,,,,0.0,,./dev/01_tcp_ar/002/00000258/s002_2003_07_21/0...,,,
3.0,3.0,258.0,s003,t000,Outpatient,Outpatient,LTM,Abnormal,0.0,4.0,./dev/01_tcp_ar/002/00000258/s003_2003_07_22/0...,,,
4.0,4.0,,,t001,,,,,0.0,,./dev/01_tcp_ar/002/00000258/s003_2003_07_22/0...,,,
5.0,5.0,,,t002,,,,,1.0,,./dev/01_tcp_ar/002/00000258/s003_2003_07_22/0...,8.8275,173.3791,TCSZ


In [7]:
# Walk the training reference table and read the raw signals of each EDF
# recording it lists. Every read overwrites `signal_labels` and `sigbufs`, so
# only the last recording read remains available after the loop.
previous_edf = None
for index, row in tqdm(ref_train.iterrows(), total=len(ref_train)):
    path_file = os.path.join(DATA_PATH, "edf/", row["Filename"])
    # Swap whatever extension the spreadsheet lists for ".edf" (slicing off a
    # fixed three characters would break on longer extensions).
    file_name = os.path.splitext(path_file)[0] + ".edf"
    # Consecutive rows can point at the same recording (the table shows one
    # row per seizure event); reading it again would give identical buffers.
    if file_name == previous_edf:
        continue
    previous_edf = file_name
    with pyedflib.EdfReader(file_name) as f:
        n = f.signals_in_file
        signal_labels = f.getSignalLabels()
        # NOTE(review): the buffer width comes from the first channel only;
        # this assumes every channel has the same number of samples — confirm.
        sigbufs = np.zeros((n, f.getNSamples()[0]))
        for i in range(n):
            sigbufs[i, :] = f.readSignal(i)

  0%|          | 0/6107 [00:00<?, ?it/s]