In [1]:
import sys
from pathlib import Path
import pandas as pd
import tarfile
import urllib

def load_metadata():
  file_path = Path("data/train.csv")
  return pd.read_csv(file_path)
  
metadata = load_metadata()

def extract_eeg():
  eeg_dir = Path("../data/eeg")
  tarball_path = Path("data/eeg.tar.gz")
  if not tarball_path.is_file():
    url = 'https://dl.dropboxusercontent.com/scl/fi/5sina48c4naaxv6uze0fv/eeg.tar.gz?rlkey=r7ec191extynfcm8fy0tsiws5&dl=0'
    urllib.request.urlretrieve(url, tarball_path)
    with tarfile.open(tarball_path) as eeg_tarball:
      eeg_tarball.extractall()
    
extract_eeg()

metadata

Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,4144388963,140,604.0,1156825996,140,604.0,1451266906,59489,GRDA,0,0,0,0,3,0
1,2353475448,30,64.0,1002394133,30,64.0,4000072340,5339,LRDA,0,0,0,3,0,0
2,1618328341,9,52.0,900482955,9,52.0,4140697659,20198,GRDA,0,0,0,0,3,0
3,979865826,7,90.0,1626043434,7,90.0,919550440,1069,Other,1,1,4,1,4,5
4,521108392,0,0.0,827447277,0,0.0,1717414556,13134,Other,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2509824693,10,68.0,1005228554,15,330.0,219919562,14386,LPD,0,11,0,1,1,2
996,2882719839,14,50.0,2035369578,14,50.0,4193559045,2641,GPD,5,0,11,0,0,0
997,1322226281,2,18.0,1740512896,2,18.0,1697286566,49448,Other,0,0,0,0,0,3
998,628369060,15,98.0,13143748,17,292.0,1650460145,34998,GPD,0,3,7,0,2,4


In [19]:
import dask.dataframe as dd
from src.utils import compute_signal_hash

channel_order = ['Fp1', 'Fp2',
            'F7', 'F3', 'Fz', 'F4', 'F8', 
            'T3', 'C3', 'Cz', 'C4', 'T4', 
            'T5', 'P3', 'Pz', 'P4', 'T6', 
            'O1', 'O2',
          ]

def load_signals(metadata):
  rows = len(metadata)
  eeg_list = []

  for row in range(0,rows):
    sample = metadata.iloc[row]
    f_name = f'data/eeg/{sample.eeg_id}.parquet'
    eeg = pd.read_parquet(f_name)[channel_order]
    eeg_offset = int(sample.eeg_label_offset_seconds)

    eeg['id'] = str(compute_signal_hash(sample))
    eeg = eeg.set_index('id')

    eeg = eeg.iloc[eeg_offset*200:(eeg_offset+50)*200]
    eeg_list.append(eeg)

  return dd.concat(eeg_list)

ddf = load_signals(metadata)
ddf

Unnamed: 0_level_0,Fp1,Fp2,F7,F3,Fz,F4,F8,T3,C3,Cz,C4,T4,T5,P3,Pz,P4,T6,O1,O2
npartitions=1000,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [38]:
df = ddf.compute()
df

sig_ids = list(set(df.index.to_list()))
sig_ids

['ec2a8e98d',
 'e7ce7ea53',
 '110270772',
 '4a8ace4a7',
 'f01a73f05',
 '955d73257',
 'a6f07eec7',
 'c74244a73',
 '93d14b4e3',
 '322cc5813',
 '914849384',
 '2bd1c20f8',
 '866bd3fec',
 '5579f813c',
 '60269dbe0',
 '0c49aed96',
 'f1a5d7d46',
 '160354922',
 'fb8ae9775',
 '2cbfa6c0e',
 '820fbbf05',
 '89dfd0cfa',
 '08c566588',
 'ebff36d84',
 '89b8950d8',
 'c3c5f6b58',
 '74611891c',
 'c22c1634e',
 '550e5f044',
 '6d0845d85',
 '135f33192',
 'fb7b4bc2d',
 '93fb959ff',
 '6006f5e46',
 '617ad99f0',
 '5b3baedcb',
 '3f604920a',
 'b70df112d',
 '23b974dc9',
 'f0451b7c5',
 'faa42b5b3',
 'c4d128561',
 'f81c6d8b5',
 'e3d0d7ed6',
 'd9f6ae80b',
 '8374d9625',
 '8e112186d',
 'ab2192b99',
 '41631bf0a',
 'f41d9013e',
 '3973d5845',
 '46d2aa4dc',
 'b48e07930',
 'f98e0147c',
 '074596131',
 '213e6cec7',
 '1ed184b89',
 '0b6ec90f1',
 '0fa2bb843',
 '69ff37ace',
 '0ea91c9c1',
 'c7c93b5d1',
 'd01247125',
 '9da65221f',
 '6bcd5b7c3',
 '31eb59b57',
 'ff33b2297',
 '6d65ba8f3',
 'da9f5402a',
 'df38f81cc',
 '1cb65c7fd',
 '4d07

In [44]:
sig1 = ddf.loc['ec2a8e98d'].compute()
sig1

Unnamed: 0_level_0,Fp1,Fp2,F7,F3,Fz,F4,F8,T3,C3,Cz,C4,T4,T5,P3,Pz,P4,T6,O1,O2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
ec2a8e98d,12.390000,9.080000,-5.650000,33.340000,16.690001,22.990000,-5.870000,45.020000,91.080002,29.030001,6.580000,-47.340000,50.389999,26.320000,3.310000,10.680000,-12.160000,45.570000,35.75
ec2a8e98d,-22.879999,-4.470000,-6.090000,24.020000,13.990000,17.610001,0.810000,41.869999,87.900002,26.340000,6.180000,-42.810001,48.700001,25.680000,7.300000,11.690000,-11.200000,46.400002,37.57
ec2a8e98d,-35.770000,-16.549999,-0.580000,11.020000,2.820000,6.530000,-9.130000,34.119999,75.760002,11.650000,-6.260000,-53.369999,39.240002,14.600000,1.530000,0.870000,-20.049999,35.490002,29.23
ec2a8e98d,-13.310000,18.969999,11.560000,16.100000,0.090000,12.810000,6.050000,37.639999,78.099998,12.090000,-5.640000,-46.189999,42.529999,17.490000,-5.360000,5.150000,-12.950000,37.820000,31.60
ec2a8e98d,-32.540001,19.940001,12.040000,23.740000,4.320000,17.740000,10.620000,45.840000,83.510002,14.980000,-0.820000,-36.680000,46.590000,20.370001,-2.270000,9.790000,-7.170000,39.680000,37.27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ec2a8e98d,-121.769997,55.470001,-37.790001,-110.839996,-78.500000,-20.500000,-30.280001,-128.529999,-47.049999,-88.160004,-36.700001,-60.750000,-94.870003,-116.330002,54.700001,-44.250000,-18.790001,-35.560001,7.47
ec2a8e98d,-140.589996,44.259998,-66.930000,-131.259995,-106.120003,-44.279999,-32.360001,-127.150002,-59.369999,-103.029999,-48.520000,-64.040001,-94.620003,-119.120003,54.730000,-46.639999,-7.720000,-33.770000,13.01
ec2a8e98d,-177.429993,14.850000,-61.500000,-150.789993,-135.970001,-71.669998,-42.900002,-131.759995,-68.650002,-116.900002,-62.180000,-68.139999,-94.160004,-119.699997,42.139999,-51.709999,-20.660000,-33.400002,8.13
ec2a8e98d,-180.509995,-15.750000,-67.669998,-171.380005,-161.699997,-103.080002,-59.869999,-141.059998,-73.000000,-125.699997,-75.230003,-76.470001,-94.110001,-120.589996,40.270000,-53.930000,-22.670000,-32.340000,9.00


In [None]:
## MNE setup
import mne
import numpy as np

mne_info = mne.create_info(ch_names=sig1.columns.tolist(), sfreq=200, ch_types='eeg')
mne_info.set_montage('standard_1020')
    
sig1_data = np.array(sig1.transpose())
sig1_data = np.nan_to_num(sig1_data)
    
raw = mne.io.RawArray(sig1_data, mne_info)
raw.apply_function(lambda x: x / 20e6, picks='eeg')

In [None]:
# Apply filters
from src.preprocessing import notch_filter, bp_filter, standardize

l_freq = 1.0
h_freq = 70.0

df = notch_filter(df, 60)
df = bp_filter(df, 1.0, 70)
df = standardize(df)
df

In [39]:
# Extracting top 3 channels based on variance for all samples
# 1000 samples computation duration = approx. 10 minutes
from src.feature_extraction import calculate_all_samples

top_channels_df = calculate_all_samples(df, sig_ids, 10) # 10 samples for testing
top_channels_df # NEED TO EXTRACT FEATURES FROM CHANNELS IN THIS DATA STRUCTURE

Unnamed: 0,0,1,2
ec2a8e98d,F7,Pz,Fp1
e7ce7ea53,F3,Pz,P3
110270772,T6,F8,T4
4a8ace4a7,P3,T3,C3
f01a73f05,Fp1,F3,T5
...,...,...,...
bc498a548,,,
da0bd345b,,,
4f2d7b60b,,,
d5d2a9f36,,,


In [None]:
# Relevant code for testing purposes

# print(np.var(df['Fp1'], axis=0)) #variance for one col/channel
# fpl = df['Pz'].fillna(0).to_numpy() #converting to numby array for easier computation
#print(np.var(df['Fp1'], axis=0)) #variance for one col/channel

# one sample and their channels
# sig1 = df.loc[['521108392']]
# sig1
# sig1['Fp1'] # one sample and single channel
# print(np.var(fpl, axis=0))

# variance for one channel(Fp1) in one signal(4144388963)
# np.var(sig1['F7'].to_numpy())