In [1]:
import sys
from pathlib import Path
import pandas as pd
import tarfile
import urllib

def load_metadata():
  file_path = Path("data/train.csv")
  return pd.read_csv(file_path)
  
metadata = load_metadata()

def extract_eeg():
  eeg_dir = Path("../data/eeg")
  tarball_path = Path("data/eeg.tar.gz")
  if not tarball_path.is_file():
    url = 'https://dl.dropboxusercontent.com/scl/fi/5sina48c4naaxv6uze0fv/eeg.tar.gz?rlkey=r7ec191extynfcm8fy0tsiws5&dl=0'
    urllib.request.urlretrieve(url, tarball_path)
    with tarfile.open(tarball_path) as eeg_tarball:
      eeg_tarball.extractall()
    
extract_eeg()

metadata

Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,4144388963,140,604.0,1156825996,140,604.0,1451266906,59489,GRDA,0,0,0,0,3,0
1,2353475448,30,64.0,1002394133,30,64.0,4000072340,5339,LRDA,0,0,0,3,0,0
2,1618328341,9,52.0,900482955,9,52.0,4140697659,20198,GRDA,0,0,0,0,3,0
3,979865826,7,90.0,1626043434,7,90.0,919550440,1069,Other,1,1,4,1,4,5
4,521108392,0,0.0,827447277,0,0.0,1717414556,13134,Other,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2509824693,10,68.0,1005228554,15,330.0,219919562,14386,LPD,0,11,0,1,1,2
996,2882719839,14,50.0,2035369578,14,50.0,4193559045,2641,GPD,5,0,11,0,0,0
997,1322226281,2,18.0,1740512896,2,18.0,1697286566,49448,Other,0,0,0,0,0,3
998,628369060,15,98.0,13143748,17,292.0,1650460145,34998,GPD,0,3,7,0,2,4


In [2]:
import dask.dataframe as dd
from src.utils import compute_signal_hash

channel_order = ['Fp1', 'Fp2',
            'F7', 'F3', 'Fz', 'F4', 'F8', 
            'T3', 'C3', 'Cz', 'C4', 'T4', 
            'T5', 'P3', 'Pz', 'P4', 'T6', 
            'O1', 'O2',
          ]

def load_signals(metadata):
  rows = len(metadata)
  eeg_list = []

  for row in range(0,rows):
    sample = metadata.iloc[row]
    f_name = f'data/eeg/{sample.eeg_id}.parquet'
    eeg = pd.read_parquet(f_name)[channel_order]
    eeg_offset = int(sample.eeg_label_offset_seconds)

    eeg['id'] = str(compute_signal_hash(sample))
    eeg = eeg.set_index('id')

    eeg = eeg.iloc[eeg_offset*200:(eeg_offset+50)*200]
    eeg_list.append(eeg)

  return dd.concat(eeg_list)

ddf = load_signals(metadata)


In [3]:
df = ddf.compute()
df

sig_ids = list(set(df.index.to_list()))
sig_ids


['18fe5ea76',
 '7b7d47c43',
 'bfb2a73b4',
 '7ac4030e6',
 'dc80c9dfe',
 '59a00790c',
 'ab4ed2798',
 '9df6b2c1e',
 'c8f436b04',
 '324961cd9',
 '19ca7a1f6',
 'cbf60be47',
 '704f22883',
 'e56fff22d',
 '41fabcf30',
 '8135b03b0',
 '9ad4c9cf7',
 'e991bd109',
 'fb7b93120',
 'eaddd42cb',
 'd35697e9f',
 '06d0adee0',
 '9c655e292',
 '35b9913cd',
 '074596131',
 '2e86dab84',
 '4a38a8134',
 '0bfde5f9d',
 '074b55d3d',
 '9bcc210e0',
 '616dbd9e2',
 '8cb20395b',
 '111390350',
 'b97046429',
 '5f5bb690b',
 'd9a794cdb',
 '9a2ce96c5',
 '1c5ed22bd',
 '54feb046e',
 '0d9f5e183',
 '0bced5c79',
 'b70df112d',
 'afd05328c',
 '559d655fe',
 '110270772',
 '28a8b5d34',
 'c9946ef2d',
 '000d515bc',
 '7824b136a',
 'cadd7f158',
 '4635ed902',
 '206ef174a',
 '62d2ee3e8',
 '68cfdf575',
 '7c4545352',
 '7d48f93d3',
 'f2e4e78da',
 'ad538d1d4',
 'b3caa8d8d',
 'ed0df7a47',
 'f1a1f5cca',
 'bcf30793a',
 'f686d91f2',
 'cc13f4074',
 '23052f302',
 'd451159f6',
 'eb283a303',
 '4127ebe87',
 '89571ba38',
 '31aef1468',
 'ef9bc8aca',
 'ded9

In [4]:
## Apply the wavelet transform
import pywt
from src.preprocessing import wavelet_transform

sig1 = df.loc['554a28223']
max_levels = pywt.dwt_max_level(data_len=10000, filter_len=pywt.Wavelet('coif1').dec_len)

wavelet_transform(sig1, max_levels)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[channel] = cleaned_channel


Unnamed: 0_level_0,Fp1,Fp2,F7,F3,Fz,F4,F8,T3,C3,Cz,C4,T4,T5,P3,Pz,P4,T6,O1,O2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
554a28223,-321.262634,-276.429047,-481.192169,-219.418854,-11.192525,-28.735918,-244.464645,-20.542229,20.920938,7.780495,-26.900488,-24.411654,16.755802,40.199436,31.532873,18.424831,-28.062050,58.375713,4.613134
554a28223,-322.056244,-277.201935,-482.492645,-219.677582,-10.683583,-27.933582,-244.652390,-20.523211,21.909281,7.975334,-26.197697,-24.748871,17.381937,41.442375,32.975235,20.210745,-28.768248,59.112938,5.099830
554a28223,-323.161835,-278.292969,-484.306061,-220.074295,-9.982683,-26.781773,-244.956161,-20.499950,23.701698,8.249864,-25.130291,-25.218176,18.255968,43.527622,35.330185,22.896301,-29.729155,60.271263,5.794386
554a28223,-322.007019,-277.078705,-482.403046,-219.471100,-10.672517,-28.161940,-244.419174,-20.507420,19.674946,7.947012,-26.704594,-24.730492,17.333971,39.514023,31.075871,19.063669,-28.844727,58.378559,4.983001
554a28223,-321.019867,-276.035461,-480.775665,-218.942078,-11.259166,-29.360046,-243.944519,-20.511358,16.054626,7.684843,-28.096663,-24.313259,16.545521,35.932495,27.295282,15.703547,-28.102045,56.708237,4.284483
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
554a28223,-1983.957520,-818.085388,-2241.071289,-1517.300415,-29.736858,-49.850250,-1473.876953,-19.657503,-15.668918,-34.555775,-54.498058,-21.346954,-4.167595,1.932437,-15.786781,-16.464008,-18.666327,5.816536,-15.878128
554a28223,-650.010376,-561.674744,-1471.194824,-602.110352,-29.705521,-54.954521,-481.271240,-21.842779,-20.850029,-34.486015,-57.790001,-23.360613,-5.562373,-1.765835,-18.923599,-18.816568,-20.560150,3.346204,-17.280712
554a28223,-940.404968,-620.586060,-1648.391846,-806.463562,-29.675159,-52.331905,-702.181641,-19.935198,-19.086557,-34.415524,-55.642334,-21.825686,-4.816867,0.295480,-17.512161,-17.222387,-19.186632,4.281776,-16.264330
554a28223,-1021.950806,-637.805420,-1700.359253,-863.887512,-29.645067,-49.889622,-764.252502,-17.973743,-18.215117,-34.346516,-53.409626,-20.190914,-4.027854,2.445187,-16.225748,-15.533817,-17.724480,5.305260,-15.193349


In [5]:
sig1 = df.loc['554a28223']

In [6]:
## MNE setup
import mne
import numpy as np

mne_info = mne.create_info(ch_names=sig1.columns.tolist(), sfreq=200, ch_types='eeg')
mne_info.set_montage('standard_1020')
    
sig1_data = np.array(sig1.transpose())
sig1_data = np.nan_to_num(sig1_data)
    
raw = mne.io.RawArray(sig1_data, mne_info)
raw.apply_function(lambda x: x / 20e6, picks='eeg')

Creating RawArray with float64 data, n_channels=19, n_times=10000
    Range : 0 ... 9999 =      0.000 ...    49.995 secs
Ready.


0,1
Measurement date,Unknown
Experimenter,Unknown
Participant,Unknown

0,1
Digitized points,22 points
Good channels,19 EEG
Bad channels,
EOG channels,Not available
ECG channels,Not available

0,1
Sampling frequency,200.00 Hz
Highpass,0.00 Hz
Lowpass,100.00 Hz
Duration,00:00:50 (HH:MM:SS)


In [7]:
# Apply filters
from src.preprocessing import notch_filter, bp_filter, standardize

l_freq = 1.0
h_freq = 70.0

df = notch_filter(df, 60)
df = bp_filter(df, 1.0, 70)
df = standardize(df)
df

Creating RawArray with float64 data, n_channels=19, n_times=10000000
    Range : 0 ... 9999999 =      0.000 ... 49999.995 secs
Ready.


KeyboardInterrupt: 

In [None]:
# Extracting top 3 channels based on variance for all samples
# 1000 samples computation duration = approx. 10 minutes
from src.feature_extraction import calculate_all_samples

top_channels_df = calculate_all_samples(df, sig_ids, 10) # 10 samples for testing
top_channels_df # NEED TO EXTRACT FEATURES FROM CHANNELS IN THIS DATA STRUCTURE

Unnamed: 0,0,1,2
ec2a8e98d,F7,Pz,Fp1
e7ce7ea53,F3,Pz,P3
110270772,T6,F8,T4
4a8ace4a7,P3,T3,C3
f01a73f05,Fp1,F3,T5
...,...,...,...
bc498a548,,,
da0bd345b,,,
4f2d7b60b,,,
d5d2a9f36,,,


In [None]:
# Relevant code for testing purposes

# print(np.var(df['Fp1'], axis=0)) #variance for one col/channel
# fpl = df['Pz'].fillna(0).to_numpy() #converting to numby array for easier computation
#print(np.var(df['Fp1'], axis=0)) #variance for one col/channel

# one sample and their channels
# sig1 = df.loc[['521108392']]
# sig1
# sig1['Fp1'] # one sample and single channel
# print(np.var(fpl, axis=0))

# variance for one channel(Fp1) in one signal(4144388963)
# np.var(sig1['F7'].to_numpy())

# File created to test the correctness of extracted values using MATLAB
# Save Fp1 channel data into a MATLAB file
# import scipy.io
# scipy.io.savemat('Fp1_data.mat', {'Fp1_data': sig1['Fp1']})