# CHB-MIT Dataset Explore

In [1]:
%matplotlib widget

## Check patients list

In [1]:
import os

dataset_path = "./CHB-MIT"
patient_dirs = [d for d in os.listdir(dataset_path) if d.startswith("chb")]
patient_dirs

['chb01',
 'chb02',
 'chb03',
 'chb04',
 'chb05',
 'chb06',
 'chb07',
 'chb08',
 'chb09',
 'chb10',
 'chb11',
 'chb12',
 'chb13',
 'chb14',
 'chb15',
 'chb16',
 'chb17',
 'chb18',
 'chb19',
 'chb20',
 'chb21',
 'chb22',
 'chb23',
 'chb24']

## Parse summary files

In [4]:
from typing import List
from utils import info

summary_info: List[info.PatientSummary] = info.parse_all_summary_files(dataset_path)
filtered_summary_info = info.filter_common_channels(summary_info)

In [5]:
filtered_summary_info

[{'patient_id': 'chb01',
  'file_duration': 1,
  'sampling_rate': 256,
  'channels_list': [['FP1-F7',
    'F7-T7',
    'T7-P7',
    'P7-O1',
    'FP1-F3',
    'F3-C3',
    'C3-P3',
    'P3-O1',
    'FP2-F4',
    'F4-C4',
    'C4-P4',
    'P4-O2',
    'FP2-F8',
    'F8-T8',
    'T8-P8',
    'P8-O2',
    'FZ-CZ',
    'CZ-PZ',
    'P7-T7',
    'T7-FT9',
    'FT9-FT10',
    'FT10-T8',
    'T8-P8']],
  'files': [{'name': 'chb01_01.edf',
    'channels_set_idx': 0,
    'start_time': datetime.datetime(2075, 3, 10, 11, 42, 54),
    'end_time': datetime.datetime(2075, 3, 10, 12, 42, 54),
    'seizures': []},
   {'name': 'chb01_02.edf',
    'channels_set_idx': 0,
    'start_time': datetime.datetime(2075, 3, 10, 12, 42, 57),
    'end_time': datetime.datetime(2075, 3, 10, 13, 42, 57),
    'seizures': []},
   {'name': 'chb01_03.edf',
    'channels_set_idx': 0,
    'start_time': datetime.datetime(2075, 3, 10, 13, 43, 4),
    'end_time': datetime.datetime(2075, 3, 10, 14, 43, 4),
    'seizures': [{'st

Let check the result

In [3]:
from pprint import pprint

pprint(filtered_summary_info[0], width=80, indent=2)

{ 'channels_list': [ [ 'FP1-F7',
                       'F7-T7',
                       'T7-P7',
                       'P7-O1',
                       'FP1-F3',
                       'F3-C3',
                       'C3-P3',
                       'P3-O1',
                       'FP2-F4',
                       'F4-C4',
                       'C4-P4',
                       'P4-O2',
                       'FP2-F8',
                       'F8-T8',
                       'T8-P8',
                       'P8-O2',
                       'FZ-CZ',
                       'CZ-PZ',
                       'P7-T7',
                       'T7-FT9',
                       'FT9-FT10',
                       'FT10-T8',
                       'T8-P8']],
  'file_duration': 1,
  'files': [ { 'channels_set_idx': 0,
               'end_time': datetime.datetime(2075, 3, 10, 12, 42, 54),
               'name': 'chb01_01.edf',
               'seizures': [],
               'start_time': datetime.datetime(2075

In [6]:
filtered_summary_info[3]['files'][8:]

[{'name': 'chb04_09.edf',
  'channels_set_idx': 1,
  'start_time': datetime.datetime(2075, 3, 11, 22, 53, 33),
  'end_time': datetime.datetime(2075, 3, 12, 2, 53, 33),
  'seizures': []},
 {'name': 'chb04_10.edf',
  'channels_set_idx': 1,
  'start_time': datetime.datetime(2075, 3, 12, 2, 53, 41),
  'end_time': datetime.datetime(2075, 3, 12, 6, 53, 41),
  'seizures': []},
 {'name': 'chb04_11.edf',
  'channels_set_idx': 1,
  'start_time': datetime.datetime(2075, 3, 12, 6, 53, 48),
  'end_time': datetime.datetime(2075, 3, 12, 8, 14, 20),
  'seizures': []},
 {'name': 'chb04_12.edf',
  'channels_set_idx': 1,
  'start_time': datetime.datetime(2075, 3, 12, 8, 16, 27),
  'end_time': datetime.datetime(2075, 3, 12, 12, 16, 47),
  'seizures': []},
 {'name': 'chb04_13.edf',
  'channels_set_idx': 1,
  'start_time': datetime.datetime(2075, 3, 12, 12, 17, 31),
  'end_time': datetime.datetime(2075, 3, 12, 16, 17, 29),
  'seizures': []},
 {'name': 'chb04_14.edf',
  'channels_set_idx': 1,
  'start_time':

# Total Seizures in dataset

In [7]:
import pandas as pd

seizure_count = {}

for i, patient_info in enumerate(filtered_summary_info):
    patient_id = f"chb{(i + 1):02}"
    for file in patient_info['files']:
        for seizure in file["seizures"]:
            seizure_count[patient_id] = seizure_count.get(patient_id, 0) + 1

sezure_count = pd.DataFrame.from_dict(seizure_count, orient='index', columns=['seizure_count'])

display(sezure_count)
display(sezure_count.sum())

Unnamed: 0,seizure_count
chb01,7
chb02,3
chb03,7
chb04,4
chb05,5
chb06,10
chb07,3
chb08,5
chb09,4
chb10,7


seizure_count    181
dtype: int64

In [8]:
COMMON_CHANNELS = [
    "C3-P3",
    "C4-P4",
    "CZ-PZ",
    "F3-C3",
    "F4-C4",
    "F7-T7",
    "F8-T8",
    "FP1-F3",
    "FP1-F7",
    "FP2-F4",
    "FP2-F8",
    "FT10-T8",
    "FT9-FT10",
    "FZ-CZ",
    "P3-O1",
    "P4-O2",
    "P7-O1",
    "P7-T7",
    "P8-O2",
    "T7-FT9",
    "T7-P7",
    "T8-P8",
]

## Exclude abnormal data
We have checking each records and found that most of the record using this `"C3-P3","C4-P4","CZ-PZ","F3-C3","F4-C4","F7-T7","F8-T8","FP1-F3","FP1-F7","FP2-F4","FP2-F8","FT10-T8","FT9-FT10","FZ-CZ","P3-O1","P4-O2","P7-O1","P7-T7","P8-O2","T7-FT9","T7-P7","T8-P8"` excepts the `chb12_27.edf`, `chb12_28.edf` and `chb12_29.edf` that didn't use this channels configuration. But we already filter out the files those are not use the common channels already.

But for making sure, we will check which files those contain seizure don't contain all channels in common channels

In [9]:
from utils.constant import COMMON_CHANNELS

channels_set = set(COMMON_CHANNELS)

for i, patient_info in enumerate(filtered_summary_info):
    for file in patient_info['files']:
        if file["name"] in ["chb12_27.edf", "chb12_28.edf", "chb12_29.edf"]:
            continue
        
        current_file_channels = set(patient_info["channels_list"][file["channels_set_idx"]])
        missing_channels = channels_set - current_file_channels
        
        if (len(file["seizures"]) > 0) and len(missing_channels) > 0:
            print(f"Missing channels in {file['name']}: {missing_channels}, Seizure Count: {len(file['seizures'])}")

## Checking the total usable ictal class

In [10]:
from utils import dataset


ictal_list = dataset.list_ictal(filtered_summary_info)
ictal_df = pd.DataFrame(ictal_list)
ictal_df

Unnamed: 0,file,start_time,end_time,total_time,total_windows
0,chb01_03.edf,2996,3036,40,9
1,chb01_04.edf,1467,1494,27,5
2,chb01_15.edf,1732,1772,40,9
3,chb01_16.edf,1015,1066,51,11
4,chb01_18.edf,1720,1810,90,21
...,...,...,...,...,...
176,chb24_13.edf,3288,3304,16,3
177,chb24_14.edf,1939,1966,27,5
178,chb24_15.edf,3552,3569,17,3
179,chb24_17.edf,3515,3581,66,15


Now we can check how many windows we got for ictal class

In [11]:
ictal_df.sum()

file             chb01_03.edfchb01_04.edfchb01_15.edfchb01_16.e...
start_time                                                  473011
end_time                                                    484026
total_time                                                   11015
total_windows                                                 2509
dtype: object

## Checking the total usable interictal class

For the interictal, it is more tricky to choose as we need to consider the ictal class. If we choosing it too close to the ictal class it might capture the pre-ictal phrase. To be able to safely get the interictal class we need to get the item that in range before or after the ictal several hours.

In [12]:
interictal_list = dataset.list_interictal(filtered_summary_info, ictal_list)
interictal_df = pd.DataFrame(interictal_list)
interictal_df

Unnamed: 0,file,total_time,total_windows
0,chb01_09.edf,3600,899
1,chb01_10.edf,3600,899
2,chb01_32.edf,3600,899
3,chb01_33.edf,3600,899
4,chb01_34.edf,3600,899
...,...,...,...
272,chb22_15.edf,3600,899
273,chb23_16.edf,14400,3599
274,chb23_17.edf,14400,3599
275,chb23_19.edf,14400,3599


In [13]:
interictal_df.sum()

file             chb01_09.edfchb01_10.edfchb01_32.edfchb01_33.e...
total_time                                                 1742400
total_windows                                               435323
dtype: object

## Prepare dataset

From info above we can contruct the dataset that contain ictal and interictal class that both class has 2509 samples. And since the interictal class is for much more that the ictal class we can randomly select it to match the ictal class. Since we need 2509 windows we can roughly select 9 windows from each files and randomly discard the remainder.

In [14]:
dataset.save_ictal("./CHB-MIT", ictal_list)

Processing chb01_03.edf
Processing chb01_04.edf
Processing chb01_15.edf
Processing chb01_16.edf
Processing chb01_18.edf
Processing chb01_21.edf
Processing chb01_26.edf
Processing chb02_16.edf
Processing chb02_16+.edf
Processing chb02_19.edf
Processing chb03_01.edf
Processing chb03_02.edf
Processing chb03_03.edf
Processing chb03_04.edf
Processing chb03_34.edf
Processing chb03_35.edf
Processing chb03_36.edf
Processing chb04_05.edf
Processing chb04_08.edf
Processing chb04_28.edf
Processing chb04_28.edf
Processing chb05_06.edf
Processing chb05_13.edf
Processing chb05_16.edf
Processing chb05_17.edf
Processing chb05_22.edf
Processing chb06_01.edf
Processing chb06_01.edf
Processing chb06_01.edf
Processing chb06_04.edf
Processing chb06_04.edf
Processing chb06_09.edf
Processing chb06_10.edf
Processing chb06_13.edf
Processing chb06_18.edf
Processing chb06_24.edf
Processing chb07_12.edf
Processing chb07_13.edf
Processing chb07_19.edf
Processing chb08_02.edf
Processing chb08_05.edf
Processing chb0

In [16]:
dataset.save_interictal("./CHB-MIT", interictal_list)

Processing chb01_09.edf
Processing chb01_10.edf
Processing chb01_32.edf
Processing chb01_33.edf
Processing chb01_34.edf
Processing chb01_36.edf
Processing chb01_37.edf
Processing chb01_38.edf
Processing chb01_39.edf
Processing chb01_40.edf
Processing chb01_41.edf
Processing chb01_42.edf
Processing chb01_43.edf
Processing chb01_46.edf
Processing chb02_01.edf
Processing chb02_02.edf
Processing chb02_03.edf
Processing chb02_04.edf
Processing chb02_05.edf
Processing chb02_06.edf
Processing chb02_07.edf
Processing chb02_08.edf
Processing chb02_09.edf
Processing chb02_10.edf
Processing chb02_11.edf
Processing chb02_24.edf
Processing chb02_25.edf
Processing chb02_26.edf
Processing chb02_27.edf
Processing chb02_28.edf
Processing chb02_29.edf
Processing chb02_30.edf
Processing chb02_31.edf
Processing chb02_32.edf
Processing chb02_33.edf
Processing chb02_34.edf
Processing chb02_35.edf
Processing chb03_09.edf
Processing chb03_10.edf
Processing chb03_11.edf
Processing chb03_12.edf
Processing chb03

ValueError: Cannot take a larger sample than population when 'replace=False'