In [1]:
import sys
from pathlib import Path
import pandas as pd
import tarfile
import urllib

def load_metadata():
  file_path = Path("data/train.csv")
  return pd.read_csv(file_path)
  
metadata = load_metadata()

def extract_eeg():
  eeg_dir = Path("../data/eeg")
  tarball_path = Path("data/eeg.tar.gz")
  if not tarball_path.is_file():
    url = 'https://dl.dropboxusercontent.com/scl/fi/5sina48c4naaxv6uze0fv/eeg.tar.gz?rlkey=r7ec191extynfcm8fy0tsiws5&dl=0'
    urllib.request.urlretrieve(url, tarball_path)
    with tarfile.open(tarball_path) as eeg_tarball:
      eeg_tarball.extractall()
    
extract_eeg()

metadata

Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,4144388963,140,604.0,1156825996,140,604.0,1451266906,59489,GRDA,0,0,0,0,3,0
1,2353475448,30,64.0,1002394133,30,64.0,4000072340,5339,LRDA,0,0,0,3,0,0
2,1618328341,9,52.0,900482955,9,52.0,4140697659,20198,GRDA,0,0,0,0,3,0
3,979865826,7,90.0,1626043434,7,90.0,919550440,1069,Other,1,1,4,1,4,5
4,521108392,0,0.0,827447277,0,0.0,1717414556,13134,Other,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2509824693,10,68.0,1005228554,15,330.0,219919562,14386,LPD,0,11,0,1,1,2
996,2882719839,14,50.0,2035369578,14,50.0,4193559045,2641,GPD,5,0,11,0,0,0
997,1322226281,2,18.0,1740512896,2,18.0,1697286566,49448,Other,0,0,0,0,0,3
998,628369060,15,98.0,13143748,17,292.0,1650460145,34998,GPD,0,3,7,0,2,4


In [2]:
import dask.dataframe as dd
from src.utils import compute_signal_hash

channel_order = ['Fp1', 'Fp2',
            'F7', 'F3', 'Fz', 'F4', 'F8', 
            'T3', 'C3', 'Cz', 'C4', 'T4', 
            'T5', 'P3', 'Pz', 'P4', 'T6', 
            'O1', 'O2',
          ]

def load_signals(metadata):
  rows = len(metadata)
  eeg_list = []

  for row in range(0,rows):
    sample = metadata.iloc[row]
    f_name = f'data/eeg/{sample.eeg_id}.parquet'
    eeg = pd.read_parquet(f_name)[channel_order]
    eeg_offset = int(sample.eeg_label_offset_seconds)

    eeg['id'] = str(compute_signal_hash(sample))
    eeg = eeg.set_index('id')

    eeg = eeg.iloc[eeg_offset*200:(eeg_offset+50)*200]
    eeg_list.append(eeg)

  return dd.concat(eeg_list)

ddf = load_signals(metadata)


In [3]:
df = ddf.compute()
df

sig_ids = list(set(df.index.to_list()))
sig_ids


['5035e2acd',
 '8a91cb1ad',
 '4a6d209e0',
 '99be200a4',
 '006e7f57f',
 '3636e9f35',
 '95d5c2973',
 'd6b0fcfaf',
 'e7e267993',
 '12d3d452f',
 'b2f06435e',
 '85f6c123e',
 'b9a0de027',
 'b45d5c6ee',
 'cab96cd31',
 'c4745e235',
 '06d0adee0',
 '7b7d47c43',
 '111390350',
 '4fb9790f4',
 'df20a2bdd',
 'c267c8f9f',
 '9c883ca7d',
 '96d03273a',
 '83eef2699',
 'e7ce7ea53',
 '122a594be',
 'fdd305b6a',
 '43f09d8e2',
 '38d1a58ff',
 '9039ac70b',
 '2e86dab84',
 'e492e31ee',
 '53d2b75d7',
 'f53931105',
 'cf2a707d4',
 '1ecff204c',
 '110270772',
 'cfac79190',
 '29fafdd8f',
 '4353b2615',
 '9856a2764',
 'daa92bca8',
 'e05e256ae',
 'edf43d851',
 'e991bd109',
 '0813aa5a7',
 'ab4c5ebd3',
 'a9c1362fa',
 '4c3c7255a',
 '2b4b9657b',
 '42fde4e24',
 '8135b03b0',
 '015eb1252',
 '7039fc846',
 '04da6a289',
 'd9636b936',
 'b409d1b51',
 'a4d64c67f',
 'd91b5cffe',
 '20ad68874',
 '2aaabc680',
 '51ce45a7d',
 'ffb9a987d',
 'a5f5a0de4',
 'c8f436b04',
 '84ad27c96',
 'c0c4b4ac4',
 '9d31c802a',
 '3ba3d8156',
 '7b693973a',
 '8c9a

In [4]:
## Apply the wavelet transform
import pywt
from src.preprocessing import wavelet_transform

sig1 = df.loc['554a28223']
max_levels = pywt.dwt_max_level(data_len=10000, filter_len=pywt.Wavelet('coif1').dec_len)

wavelet_transform(sig1, max_levels)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[channel] = cleaned_channel


Unnamed: 0_level_0,Fp1,Fp2,F7,F3,Fz,F4,F8,T3,C3,Cz,C4,T4,T5,P3,Pz,P4,T6,O1,O2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
554a28223,-984.511169,-876.734070,-1225.700684,-906.091736,-13.846349,-29.070328,-928.310669,-13.286700,27.069481,7.418628,-23.925968,-19.927547,19.441269,46.024136,35.952480,19.817194,-21.210072,65.054398,6.803778
554a28223,-260.331177,-270.122498,-475.205658,-212.220200,-9.920161,-32.047688,-203.939896,-23.594122,21.981354,10.798325,-26.403273,-23.975498,12.593662,39.720871,30.828548,13.370839,-26.334854,58.722820,1.418020
554a28223,489.202515,365.357178,358.590637,554.605774,-16.101706,-32.209091,449.888428,-24.820429,16.896597,3.263272,-27.147144,-23.186510,11.364991,36.389462,28.489307,12.381949,-25.586250,55.345875,1.749627
554a28223,-666.800537,-620.084778,-893.104858,-585.981323,-19.597702,-31.496923,-662.622070,-23.528051,13.968780,-1.274930,-27.446741,-19.829138,13.460982,35.238617,27.655241,12.328134,-22.897917,54.804611,3.037000
554a28223,-647.978271,-610.913391,-908.576233,-595.352356,-14.319089,-34.315193,-589.389404,-27.010290,11.278534,4.287117,-28.893570,-23.488106,8.393459,29.833605,21.684698,6.879334,-27.200588,50.027031,-1.334273
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
554a28223,-5065.542480,-3398.557861,-5890.711426,-4518.985352,-20.355412,-39.778660,-4315.083984,-9.850664,-10.341274,-25.150072,-43.656494,-10.814011,3.561765,8.876219,-7.731160,-8.667025,-9.507124,12.556088,-8.131554
554a28223,2149.230469,1263.906982,753.582397,1882.058472,-24.641586,-62.724018,1797.529175,-38.750835,-26.704504,-17.796974,-67.374870,-37.882484,-24.510349,-16.195755,-30.213629,-35.176456,-36.607941,-13.547744,-35.454292
554a28223,3338.802490,2796.140137,2563.083984,3421.479980,-44.665726,-52.331989,3362.990967,-21.062195,-30.859398,-49.483582,-55.373447,-19.159809,-6.244178,-4.644877,-20.930002,-18.833021,-17.113064,-0.398541,-16.994825
554a28223,-4850.237793,-2959.149414,-5332.207520,-4135.940430,-27.670534,-35.562897,-3893.759277,-3.329864,-13.393333,-37.226372,-40.189499,-1.732017,8.640246,11.209974,-5.805586,-2.985414,-1.309992,13.991497,-0.629811


In [5]:
sig1 = df.loc['554a28223']

In [6]:
## MNE setup
import mne
import numpy as np

mne_info = mne.create_info(ch_names=sig1.columns.tolist(), sfreq=200, ch_types='eeg')
mne_info.set_montage('standard_1020')
    
sig1_data = np.array(sig1.transpose())
sig1_data = np.nan_to_num(sig1_data)
    
raw = mne.io.RawArray(sig1_data, mne_info)
raw.apply_function(lambda x: x / 20e6, picks='eeg')

Creating RawArray with float64 data, n_channels=19, n_times=10000
    Range : 0 ... 9999 =      0.000 ...    49.995 secs
Ready.


0,1
Measurement date,Unknown
Experimenter,Unknown
Participant,Unknown

0,1
Digitized points,22 points
Good channels,19 EEG
Bad channels,
EOG channels,Not available
ECG channels,Not available

0,1
Sampling frequency,200.00 Hz
Highpass,0.00 Hz
Lowpass,100.00 Hz
Duration,00:00:50 (HH:MM:SS)


In [7]:
# Apply filters
from src.preprocessing import notch_filter, bp_filter, standardize

l_freq = 1.0
h_freq = 70.0

df = notch_filter(df, 60)
df = bp_filter(df, 1.0, 70)
df = standardize(df)
df

Creating RawArray with float64 data, n_channels=19, n_times=10000000
    Range : 0 ... 9999999 =      0.000 ... 49999.995 secs
Ready.
Filtering raw data in 1 contiguous segment
Setting up band-stop filter from 59 - 61 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal bandstop filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuation
- Lower passband edge: 59.35
- Lower transition bandwidth: 0.50 Hz (-6 dB cutoff frequency: 59.10 Hz)
- Upper passband edge: 60.65 Hz
- Upper transition bandwidth: 0.50 Hz (-6 dB cutoff frequency: 60.90 Hz)
- Filter length: 1321 samples (6.605 s)



KeyboardInterrupt: 

In [None]:
# Extracting top 3 channels based on variance for all samples
# 1000 samples computation duration = approx. 10 minutes
from src.feature_extraction import calculate_all_samples

top_channels_df = calculate_all_samples(df, sig_ids, 10) # 10 samples for testing
top_channels_df # NEED TO EXTRACT FEATURES FROM CHANNELS IN THIS DATA STRUCTURE

Unnamed: 0,0,1,2
ec2a8e98d,F7,Pz,Fp1
e7ce7ea53,F3,Pz,P3
110270772,T6,F8,T4
4a8ace4a7,P3,T3,C3
f01a73f05,Fp1,F3,T5
...,...,...,...
bc498a548,,,
da0bd345b,,,
4f2d7b60b,,,
d5d2a9f36,,,


In [None]:
# Relevant code for testing purposes

# print(np.var(df['Fp1'], axis=0)) #variance for one col/channel
# fpl = df['Pz'].fillna(0).to_numpy() #converting to numby array for easier computation
#print(np.var(df['Fp1'], axis=0)) #variance for one col/channel

# one sample and their channels
# sig1 = df.loc[['521108392']]
# sig1
# sig1['Fp1'] # one sample and single channel
# print(np.var(fpl, axis=0))

# variance for one channel(Fp1) in one signal(4144388963)
# np.var(sig1['F7'].to_numpy())

# File created to test the correctness of extracted values using MATLAB
# Save Fp1 channel data into a MATLAB file
# import scipy.io
# scipy.io.savemat('Fp1_data.mat', {'Fp1_data': sig1['Fp1']})