In [3]:
import sys
from pathlib import Path
import pandas as pd
import tarfile
import urllib

def load_metadata():
  """Read the training metadata table.

  Returns:
    pandas.DataFrame: the parsed contents of ``data/train.csv`` (path is
    relative to the current working directory).
  """
  return pd.read_csv(Path("data") / "train.csv")
  
# Load the metadata table once; the cells below read it through the name `metadata`.
metadata = load_metadata()

def extract_eeg():
  """Download the EEG tarball if it is not cached yet, then unpack it.

  The archive is stored at ``data/eeg.tar.gz``. Download and extraction only
  run when that file is missing; an existing tarball is taken to mean the data
  was already unpacked by an earlier run.
  """
  # `import urllib` on its own does not load the `request` submodule, so
  # `urllib.request` can raise AttributeError unless it is imported explicitly.
  import urllib.request

  tarball_path = Path("data/eeg.tar.gz")
  if tarball_path.is_file():
    return

  # urlretrieve does not create missing directories for its target file.
  tarball_path.parent.mkdir(parents=True, exist_ok=True)

  url = 'https://dl.dropboxusercontent.com/scl/fi/5sina48c4naaxv6uze0fv/eeg.tar.gz?rlkey=r7ec191extynfcm8fy0tsiws5&dl=0'
  urllib.request.urlretrieve(url, tarball_path)

  with tarfile.open(tarball_path) as eeg_tarball:
    # Members are extracted relative to the current working directory.
    # NOTE(review): load_signals reads 'data/eeg/<eeg_id>.parquet' -- confirm the
    # archive's internal layout produces that path.
    eeg_tarball.extractall()

extract_eeg()

# Display the metadata table as the cell output.
metadata

Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,4144388963,140,604.0,1156825996,140,604.0,1451266906,59489,GRDA,0,0,0,0,3,0
1,2353475448,30,64.0,1002394133,30,64.0,4000072340,5339,LRDA,0,0,0,3,0,0
2,1618328341,9,52.0,900482955,9,52.0,4140697659,20198,GRDA,0,0,0,0,3,0
3,979865826,7,90.0,1626043434,7,90.0,919550440,1069,Other,1,1,4,1,4,5
4,521108392,0,0.0,827447277,0,0.0,1717414556,13134,Other,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2509824693,10,68.0,1005228554,15,330.0,219919562,14386,LPD,0,11,0,1,1,2
996,2882719839,14,50.0,2035369578,14,50.0,4193559045,2641,GPD,5,0,11,0,0,0
997,1322226281,2,18.0,1740512896,2,18.0,1697286566,49448,Other,0,0,0,0,0,3
998,628369060,15,98.0,13143748,17,292.0,1650460145,34998,GPD,0,3,7,0,2,4


In [None]:
import dask.dataframe as dd
from src.utils import compute_signal_hash

# EEG channel names to keep, in the column order applied to every loaded frame.
channel_order = (
    'Fp1 Fp2 '
    'F7 F3 Fz F4 F8 '
    'T3 C3 Cz C4 T4 '
    'T5 P3 Pz P4 T6 '
    'O1 O2'
).split()

def load_signals(metadata, eeg_dir='data/eeg', sfreq=200, window_seconds=50):
  """Load one fixed-length EEG window per metadata row into a single frame.

  For every row, ``<eeg_dir>/<eeg_id>.parquet`` is read (only the columns in
  ``channel_order``), the window starting at ``eeg_label_offset_seconds`` is cut
  out, and the window is indexed by ``str(eeg_id)``.

  Args:
    metadata: DataFrame with ``eeg_id`` and ``eeg_label_offset_seconds`` columns.
    eeg_dir: directory containing the per-recording parquet files.
    sfreq: rows per second used to turn seconds into row positions.
    window_seconds: length of the window to keep, in seconds.

  Returns:
    The result of ``dd.concat`` over all windows. Rows of ``metadata`` that
    share an ``eeg_id`` produce several windows under the same index label.
  """
  eeg_list = []

  # Iterate the two needed columns directly: building a row Series with
  # metadata.iloc[row] mixes dtypes and is slower for no benefit.
  for eeg_id, offset_seconds in zip(metadata['eeg_id'], metadata['eeg_label_offset_seconds']):
    start = int(offset_seconds) * sfreq
    stop = start + window_seconds * sfreq

    # Read only the channels we keep, then enforce their order.
    eeg = pd.read_parquet(f'{eeg_dir}/{eeg_id}.parquet', columns=channel_order)[channel_order]

    # Slice first so only the kept window gets labelled and indexed.
    window = eeg.iloc[start:stop].assign(eeg_id=str(eeg_id)).set_index('eeg_id')
    eeg_list.append(window)

  return dd.concat(eeg_list)

ddf = load_signals(metadata)


In [None]:
# Materialise the concatenated windows as an in-memory pandas DataFrame.
df = ddf.compute()

# eeg_id values in metadata row order (repeated ids are kept as-is).
eeg_ids = metadata['eeg_id'].to_list()

# Only the last expression of a cell is displayed, so `df` has to come last.
df

In [None]:
# Pick out every row stored under one recording id. The key is a string because
# load_signals indexes df by str(eeg_id).
sig1 = df.loc['4144388963']

Extracting top 3 channels based on max variance for all samples
- 1000 samples computation duration = approx. 15 minutes

In [None]:
from src.feature_extraction import calculate_all_samples

# NOTE(review): the third argument (1000) is presumably the number of samples to
# process, matching the "1000 samples" note above -- confirm in src.feature_extraction.
top_channels_df = calculate_all_samples(df, eeg_ids, 1000) 
# Display the result as the cell output.
top_channels_df 

Extracting Statistical Features from every sample with extraction function
- 1000 samples computation duration = approx. 20 minutes

In [None]:
from src.feature_extraction import extract_features_all_samples

# Build the feature table from the signals and the per-sample channel selection
# computed in the previous cell.
# NOTE(review): the next cell treats the index of the result as the eeg_id -- confirm.
features_df = extract_features_all_samples(df, top_channels_df)
# Display the result as the cell output.
features_df

Putting the feature data and target data into the format needed to split them and feed them to Microsoft's Light Gradient Boosting Machine (LightGBM)

In [None]:
# Integer class id assigned to each expert-consensus label.
label_to_id = {'Seizure': 0, 'LPD': 1, 'GPD': 2, 'LRDA': 3, 'GRDA': 4, 'Other': 5}

# Expose the index of features_df as an 'eeg_id' column. The guard keeps the
# cell re-runnable: without it a second run would add another 'eeg_id' column.
if 'eeg_id' not in features_df.columns:
  features_df = features_df.reset_index().rename(columns={'index': 'eeg_id'})

# Keep a single metadata row per recording.
metadata = metadata.drop_duplicates(subset='eeg_id')

# Attach the features to the metadata; only ids present in both survive.
# NOTE(review): both 'eeg_id' columns must share a dtype for the merge. df is
# indexed by str(eeg_id) upstream -- confirm features_df carries integer ids.
x_train = pd.merge(metadata, features_df, on='eeg_id', how='inner')

# Encode labels with an explicit mapping, failing loudly on anything unexpected
# instead of leaving stray strings in the target column.
unknown_labels = set(x_train['expert_consensus']) - set(label_to_id)
if unknown_labels:
  raise ValueError(f'Unexpected expert_consensus values: {sorted(map(str, unknown_labels))}')
x_train['expert_consensus'] = x_train['expert_consensus'].map(label_to_id).astype(int)

x_train = x_train.set_index('eeg_id')

Inputting Parameters for LGBM Model
- Parameters were taken from a similar LGBM-based implementation in the same competition (see the report document's citations/acknowledgements for details).
- Slight adjustments to parameters applied to fit our implementation

In [None]:
from lightgbm import LGBMClassifier
import lightgbm as lgb


# Hyper-parameters passed to the LightGBM classifier.
# NOTE(review): LightGBM only applies 'bagging_fraction' when 'bagging_freq' is
# greater than 0, and 'bagging_freq' is not set here -- confirm whether bagging
# was intended.
params = {
    'objective': 'multiclass',
    'num_class': 6,
    'boosting_type': 'gbdt',
    'metric': 'multi_logloss',
    'num_leaves': 121,
    'learning_rate': 0.018623105710769177,
    'feature_fraction': 1.0,
    'bagging_fraction': 0.756777580360579,
    'max_depth': 8,
    'verbose': 0
}

lgb_model = lgb.LGBMClassifier(**params)

# Metadata, vote and label columns that must not be fed to the model as features.
non_feature_columns = [
    'eeg_sub_id', 'eeg_label_offset_seconds',
    'spectrogram_id', 'spectrogram_sub_id', 'spectrogram_label_offset_seconds',
    'label_id', 'patient_id',
    'seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote',
    'expert_consensus',
]
lgb_train = (
    x_train
    .drop(columns=non_feature_columns)
    # A CSV written with its index comes back with an 'Unnamed: 0' column; a row
    # counter is not a feature, so drop it when present (no-op otherwise).
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .copy()
)


Splitting Data 80/20 and adjusting params to obtain training and testing sets

In [None]:
from sklearn.model_selection import train_test_split

# Feature columns the model relies on; checked up front so a missing one fails
# here with a clear message.
columns_to_convert = ['std','mean','max','min','var','med','skew','kurt','ent','mom','pow']
missing_columns = [name for name in columns_to_convert if name not in lgb_train.columns]
if missing_columns:
  raise KeyError(f'Feature columns missing from lgb_train: {missing_columns}')

# Cast to float once, before splitting, so both partitions share the same dtypes
# and nothing is assigned into a slice of the split result.
features = lgb_train.astype(float)

# features = feature data, x_train.expert_consensus = target labels
X_train, X_test, y_train, y_test = train_test_split(features, x_train.expert_consensus, test_size=0.2, random_state=42)

# Setting the feature column names to be integers for correct indexing
X_train.columns = range(X_train.shape[1])
X_test.columns = range(X_test.shape[1])

# Converting y_train and y_test to a one-dimensional series
y_train = y_train.squeeze()
y_test = y_test.squeeze()

Training our LGBM model on the training data and evaluating it on the test data - Probabilities for each target label are obtained and Displayed

In [None]:
lgb_model.fit(X_train, y_train)
y_pred_proba = lgb_model.predict_proba(X_test)

# Output column for each class id, in class-id order (0..5).
vote_columns = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']

# predict_proba returns one column per class seen during fit, so name the
# columns from classes_ rather than assuming all six classes were present.
seen_columns = [vote_columns[int(class_id)] for class_id in lgb_model.classes_]

# Predicted probabilities to DataFrame; a class absent from training gets 0.0.
pred_df = (
    pd.DataFrame(y_pred_proba, columns=seen_columns)
    .reindex(columns=vote_columns, fill_value=0.0)
)
pred_df.insert(0, 'eeg_id', X_test.index)

pred_df

Generating Confusion Matrix with predicted labels and true labels

In [None]:
from sklearn.metrics import confusion_matrix

y_pred = lgb_model.predict(X_test)

# Fix the label set so the matrix is always 6x6 with row/column i meaning class
# id i, even when a class is missing from y_test or y_pred.
class_ids = list(range(6))
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
print(cm)

Generating result metrics to evaluate our multiclassification model

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Overall accuracy, plus precision / recall / F1 computed with average='weighted'.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# Report each score on its own line.
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Preprocessing Data: Applying Wavelet Transform, Notch filter, Standardization to our data
- The model above was trained without this preprocessing because it scored higher that way

In [None]:
## Apply the wavelet transform
import pywt
from src.preprocessing import wavelet_transform

# Deepest decomposition level for a 10000-sample signal with the 'coif1' wavelet.
max_levels = pywt.dwt_max_level(data_len=10000, filter_len=pywt.Wavelet('coif1').dec_len)

# Iterate over the labels actually present in df's index: eeg_ids holds integers
# (with repeats) while df is indexed by str(eeg_id), so looking ids up from
# eeg_ids would raise KeyError and repeated ids would be transformed twice.
# NOTE(review): assumes wavelet_transform returns data shaped like its input so
# the assignment back into df lines up -- confirm in src.preprocessing.
for sig_id in df.index.unique():
  df.loc[sig_id] = wavelet_transform(df.loc[sig_id], max_levels)

In [None]:
## MNE setup
import mne
import numpy as np

# Channel description for MNE: names taken from sig1's columns, 200 Hz, all EEG.
mne_info = mne.create_info(ch_names=list(sig1.columns), sfreq=200, ch_types='eeg')
mne_info.set_montage('standard_1020')

# Transpose so channels run along the first axis, and replace NaNs with numbers.
sig1_data = np.nan_to_num(np.array(sig1.T))

raw = mne.io.RawArray(sig1_data, mne_info)
# NOTE(review): the divisor is 20e6; a plain microvolt-to-volt conversion would
# be 1e6 -- confirm the intended scaling.
raw.apply_function(lambda x: x / 20e6, picks='eeg')

In [None]:
# Apply filters
from src.preprocessing import notch_filter, bp_filter, standardize

# Filter settings, named once and actually passed to the filters below.
l_freq = 1.0
h_freq = 70.0
notch_freq = 60

# Each step rebinds `df`, so re-running this cell filters already-filtered data;
# re-run the cell that creates `df` first when repeating it.
df = notch_filter(df, notch_freq)
df = bp_filter(df, l_freq, h_freq)
df = standardize(df)
df

In [None]:
# File created to test the correctness of extracted values using MATLAB
# Save Fp1 channel data into a MATLAB file
# import scipy.io
# scipy.io.savemat('Fp1_data.mat', {'Fp1_data': sig1['Fp1']})