In [1]:
import sys
from pathlib import Path
import pandas as pd
import tarfile
import urllib

def load_metadata():
  """Read the training metadata table from ``data/train.csv``.

  The path is relative to the current working directory.

  Returns:
    pandas.DataFrame: the parsed contents of the CSV file.
  """
  return pd.read_csv(Path("data") / "train.csv")
  
# Read the metadata table once at module level; later cells reference it by
# the name `metadata`.
metadata = load_metadata()

def extract_eeg():
  """Download the EEG tarball if it is missing, then unpack it.

  The archive is stored at ``data/eeg.tar.gz`` and unpacked into the current
  working directory (``extractall()`` without a target path).

  Extraction runs after a fresh download, and also when the tarball is already
  on disk but ``data/eeg`` does not exist yet (for example when an earlier run
  stopped between download and extraction). When both the tarball and
  ``data/eeg`` are present the function does nothing.

  Returns:
    None. The function is called for its filesystem side effects.
  """
  # `import urllib` alone does not guarantee that the `request` submodule is
  # loaded, so import it explicitly before using `urllib.request`.
  import urllib.request

  # NOTE(review): assumes the archive unpacks to data/eeg relative to the
  # working directory (the location the parquet files are read from) - confirm.
  eeg_dir = Path("data/eeg")
  tarball_path = Path("data/eeg.tar.gz")
  url = 'https://dl.dropboxusercontent.com/scl/fi/5sina48c4naaxv6uze0fv/eeg.tar.gz?rlkey=r7ec191extynfcm8fy0tsiws5&dl=0'

  downloaded = False
  if not tarball_path.is_file():
    # Make sure the target directory exists before writing the download.
    tarball_path.parent.mkdir(parents=True, exist_ok=True)
    urllib.request.urlretrieve(url, tarball_path)
    downloaded = True

  if downloaded or not eeg_dir.is_dir():
    # NOTE(review): the archive comes from a remote URL; consider passing an
    # extraction filter on Python versions that support one.
    with tarfile.open(tarball_path) as eeg_tarball:
      eeg_tarball.extractall()
    
# Fetch and unpack the EEG archive (may download from the network).
extract_eeg()

# Last expression of the cell: renders the metadata table as the cell output.
metadata

Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,4144388963,140,604.0,1156825996,140,604.0,1451266906,59489,GRDA,0,0,0,0,3,0
1,2353475448,30,64.0,1002394133,30,64.0,4000072340,5339,LRDA,0,0,0,3,0,0
2,1618328341,9,52.0,900482955,9,52.0,4140697659,20198,GRDA,0,0,0,0,3,0
3,979865826,7,90.0,1626043434,7,90.0,919550440,1069,Other,1,1,4,1,4,5
4,521108392,0,0.0,827447277,0,0.0,1717414556,13134,Other,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2509824693,10,68.0,1005228554,15,330.0,219919562,14386,LPD,0,11,0,1,1,2
996,2882719839,14,50.0,2035369578,14,50.0,4193559045,2641,GPD,5,0,11,0,0,0
997,1322226281,2,18.0,1740512896,2,18.0,1697286566,49448,Other,0,0,0,0,0,3
998,628369060,15,98.0,13143748,17,292.0,1650460145,34998,GPD,0,3,7,0,2,4


In [2]:
import dask.dataframe as dd
from src.utils import compute_signal_hash

# Channel (column) names selected from every EEG recording, in the order the
# columns are kept. Built row by row to keep the original visual grouping.
channel_order = (
  ['Fp1', 'Fp2']
  + ['F7', 'F3', 'Fz', 'F4', 'F8']
  + ['T3', 'C3', 'Cz', 'C4', 'T4']
  + ['T5', 'P3', 'Pz', 'P4', 'T6']
  + ['O1', 'O2']
)

def load_signals(metadata, eeg_dir='data/eeg', sfreq=200, window_seconds=50):
  """Load one EEG window per metadata row and concatenate all windows.

  For every row of ``metadata`` the file ``{eeg_dir}/{eeg_id}.parquet`` is
  read and restricted to the columns listed in ``channel_order``. The rows at
  positions ``[offset * sfreq, (offset + window_seconds) * sfreq)`` are kept,
  where ``offset`` is the row's ``eeg_label_offset_seconds`` truncated to an
  integer. Every kept row is indexed by ``str(compute_signal_hash(row))``
  under the index name ``id``.

  Args:
    metadata: DataFrame providing at least the ``eeg_id`` and
      ``eeg_label_offset_seconds`` columns; the whole row is also passed to
      ``compute_signal_hash``.
    eeg_dir: Directory holding the ``<eeg_id>.parquet`` files. The default
      reproduces the previously hard-coded ``data/eeg``.
    sfreq: Number of rows per second used to turn seconds into row positions.
      The default reproduces the previously hard-coded 200.
    window_seconds: Window length in seconds. The default reproduces the
      previously hard-coded 50.

  Returns:
    The result of ``dd.concat`` over the per-row windows, in metadata order.
  """
  eeg_list = []

  for row in range(len(metadata)):
    sample = metadata.iloc[row]
    f_name = f'{eeg_dir}/{sample.eeg_id}.parquet'
    eeg = pd.read_parquet(f_name)[channel_order]

    eeg_offset = int(sample.eeg_label_offset_seconds)
    start = int(eeg_offset * sfreq)
    stop = int((eeg_offset + window_seconds) * sfreq)

    # Cut the window first, then tag it. `assign` returns a new frame, so no
    # column is set on a frame derived from another one, and the window no
    # longer has to keep the full recording referenced.
    # Positional slicing does not raise when the recording is shorter than
    # `stop`; such a window simply contains fewer rows.
    window = eeg.iloc[start:stop]
    window = window.assign(id=str(compute_signal_hash(sample))).set_index('id')
    eeg_list.append(window)

  return dd.concat(eeg_list)

# Build the concatenated signal collection for every row of `metadata`.
# This reads one parquet file per row.
ddf = load_signals(metadata)


In [3]:
# Materialise the concatenated signals as a single in-memory pandas DataFrame.
df = ddf.compute()

# Unique signal ids in order of first appearance in the index. Going through
# `set` gave an ordering that can differ between kernel sessions, which made
# any later "first N ids" selection non-reproducible.
sig_ids = df.index.unique().tolist()
sig_ids


['d5a85ea6f',
 'c74244a73',
 '0989f6ee4',
 '006e7f57f',
 '4da17e650',
 '15a0bd857',
 'fffe54864',
 '2f683794d',
 'f85af3c1b',
 'da0bd345b',
 'd624eb84f',
 '8634145c8',
 '0a5bd8674',
 '0bfde5f9d',
 'c938c8d44',
 '5812f1128',
 'afe7b95d3',
 'f9dad1dc2',
 'c713579f9',
 '3f604920a',
 'bf06dc0f2',
 '3ac70a6cf',
 'bfa0cb2ff',
 '9a6ae314e',
 'c835eb824',
 '62d2ee3e8',
 '206ef174a',
 'cb79eaa39',
 '47a3b463b',
 '120375b18',
 '7617b638a',
 'ac4e3ddde',
 '2cc3bd589',
 '4353b2615',
 '37e3ff0d9',
 'ed72a402e',
 '2d9f98444',
 'ff33b2297',
 '08c566588',
 'f8d6d160f',
 '20f72284e',
 'b698d2294',
 '2fbcf012d',
 '6c6b2d7e4',
 '9a2ce96c5',
 '7ac358ef7',
 '4d0072f7f',
 'da9f5402a',
 '9a9b55083',
 '9790edd63',
 '9c883ca7d',
 '8fb899e70',
 '3142cc877',
 'd8447d751',
 '5358bcc07',
 'df38f81cc',
 '185b54070',
 'f599e6794',
 '910e3c34c',
 '36ac90aff',
 '3e86e4c40',
 '08c7153a9',
 'd158e9c39',
 '410c6482f',
 '18fe5ea76',
 'b2bee22d3',
 '07c69fa08',
 '6f0f01997',
 '62b6cb9c0',
 '7635463d6',
 'cd1ebe55f',
 'e1b1

In [17]:
## Apply the wavelet transform
import pywt
from src.preprocessing import wavelet_transform

# Maximum decomposition level for a 10000-sample signal given the length of
# the 'coif1' decomposition filter.
coif1_filter_len = pywt.Wavelet('coif1').dec_len
max_levels = pywt.dwt_max_level(data_len=10000, filter_len=coif1_filter_len)

# Overwrite the rows of each signal id, one id at a time, with the output of
# `wavelet_transform`. This mutates `df` in place, so re-running the cell
# transforms the already transformed values again.
for sig_id in sig_ids:
  segment = df.loc[sig_id]
  df.loc[sig_id] = wavelet_transform(segment, max_levels)


KeyboardInterrupt: 

In [5]:
# Select every row whose index label is the signal id '554a28223'.
# NOTE(review): the id is hard-coded and must exist in `df.index` - confirm it
# is still present when the input metadata changes.
sig1 = df.loc['554a28223']

In [6]:
## MNE setup
import mne
import numpy as np

# Describe the recording to MNE: one EEG channel per column of `sig1`,
# sampled at 200 Hz, positioned with the 'standard_1020' montage.
channel_names = sig1.columns.tolist()
mne_info = mne.create_info(ch_names=channel_names, sfreq=200, ch_types='eeg')
mne_info.set_montage('standard_1020')

# Transpose so that each row holds one channel, and replace NaN values with
# finite numbers before handing the array to MNE.
sig1_data = np.nan_to_num(np.array(sig1.transpose()))

raw = mne.io.RawArray(sig1_data, mne_info)
# NOTE(review): the divisor 20e6 looks like a unit conversion - confirm the
# intended scale factor.
# Last expression of the cell: the returned object is rendered as the output.
raw.apply_function(lambda x: x / 20e6, picks='eeg')

Creating RawArray with float64 data, n_channels=19, n_times=10000
    Range : 0 ... 9999 =      0.000 ...    49.995 secs
Ready.


0,1
Measurement date,Unknown
Experimenter,Unknown
Participant,Unknown

0,1
Digitized points,22 points
Good channels,19 EEG
Bad channels,
EOG channels,Not available
ECG channels,Not available

0,1
Sampling frequency,200.00 Hz
Highpass,0.00 Hz
Lowpass,100.00 Hz
Duration,00:00:50 (HH:MM:SS)


In [7]:
# Apply filters
from src.preprocessing import notch_filter, bp_filter, standardize

# Band-pass edges in Hz. They are now actually passed to `bp_filter`; before,
# the same values were repeated as literals and these names were unused.
l_freq = 1.0
h_freq = 70.0

# NOTE(review): `df` is overwritten at every step, so re-running this cell
# filters already filtered data - restart from the loading cells to repeat it.
df = notch_filter(df, 60)  # presumably removes 60 Hz interference - confirm
df = bp_filter(df, l_freq, h_freq)
df = standardize(df)
df

Creating RawArray with float64 data, n_channels=19, n_times=10000000
    Range : 0 ... 9999999 =      0.000 ... 49999.995 secs
Ready.
Filtering raw data in 1 contiguous segment
Setting up band-stop filter from 59 - 61 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal bandstop filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuation
- Lower passband edge: 59.35
- Lower transition bandwidth: 0.50 Hz (-6 dB cutoff frequency: 59.10 Hz)
- Upper passband edge: 60.65 Hz
- Upper transition bandwidth: 0.50 Hz (-6 dB cutoff frequency: 60.90 Hz)
- Filter length: 1321 samples (6.605 s)



[Parallel(n_jobs=1)]: Done  17 tasks      | elapsed:    2.4s


Creating RawArray with float64 data, n_channels=19, n_times=10000000
    Range : 0 ... 9999999 =      0.000 ... 49999.995 secs
Ready.


KeyboardInterrupt: 

In [None]:
# Extracting top 3 channels based on variance for all samples
# 1000 samples computation duration = approx. 10 minutes
from src.feature_extraction import calculate_all_samples

# The third argument (10) limits the run to 10 samples for testing.
top_channels_df = calculate_all_samples(df, sig_ids, 10) # 10 samples for testing
# TODO: extract features from the channels held in this data structure.
top_channels_df # NEED TO EXTRACT FEATURES FROM CHANNELS IN THIS DATA STRUCTURE

Unnamed: 0,0,1,2
ec2a8e98d,F7,Pz,Fp1
e7ce7ea53,F3,Pz,P3
110270772,T6,F8,T4
4a8ace4a7,P3,T3,C3
f01a73f05,Fp1,F3,T5
...,...,...,...
bc498a548,,,
da0bd345b,,,
4f2d7b60b,,,
d5d2a9f36,,,


In [None]:
# Relevant code for testing purposes

# print(np.var(df['Fp1'], axis=0)) #variance for one col/channel
# fpl = df['Pz'].fillna(0).to_numpy() # converting to a NumPy array for easier computation
#print(np.var(df['Fp1'], axis=0)) #variance for one col/channel

# one sample and their channels
# sig1 = df.loc[['521108392']]
# sig1
# sig1['Fp1'] # one sample and single channel
# print(np.var(fpl, axis=0))

# variance for one channel (F7 in the line below) in one signal (eeg_id 4144388963)
# np.var(sig1['F7'].to_numpy())

# File created to test the correctness of extracted values using MATLAB
# Save Fp1 channel data into a MATLAB file
# import scipy.io
# scipy.io.savemat('Fp1_data.mat', {'Fp1_data': sig1['Fp1']})