## Introduction to the automated analysis of EEG quality

This notebook will introduce you to the challenge by going through the data and working towards a first very simple model.

## Loading the data

In [1]:
## First let's load the training data
from pathlib import Path
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.signal import butter, lfilter
import pandas as pd

ROOT_PATH = Path("../data/train/")
training_data = [(np.load(ROOT_PATH / f"data_{i}.npy"),np.load(ROOT_PATH / f"target_{i}.npy")) for i in range(4)]


We expect to have five channels and one label per channel for each two seconds of data.
Let's have a look at the data duration and shape

To remove the DC component and high frequency component we apply a band-pass filter

In [2]:
# Let's filter the signal to improve the visualisation

def butter_bandpass(lowcut, highcut, fs, order=5):
    return butter(order, [lowcut, highcut], fs=fs, btype='band')

def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    b, a = butter_bandpass(lowcut, highcut, fs, order=order)
    y = lfilter(b, a, data)
    return y

We see that some of the high amplitude is classified as bad quality, we could use that to build a first simple model

## Exploring the statistics of the good and bad quality EEG

In [3]:
# First we need to get the point that maps to a label

def reshape_array_into_windows(x, sample_rate, window_duration_in_seconds):
    """
    Reshape the data into an array of shape (C, T, window) where 'window' contains
    the points corresponding to 'window_duration' seconds of data.

    Parameters:
    x (numpy array): The input data array.
    sample_rate (int): The number of samples per second.
    window_duration_in_seconds (float): The duration of each window in seconds.

    Returns:
    reshaped_x (numpy array): The reshaped array with shape (C, T, window).
    """
    # Calculate the number of samples in one window
    window_size = int(window_duration_in_seconds * sample_rate)
    
    # Ensure the total length of x is a multiple of window_size
    total_samples = x.shape[-1]
    if total_samples % window_size != 0:
        # Truncate or pad x to make it divisible by window_size
        x = x[..., :total_samples - (total_samples % window_size)]
    # Reshape x into (C, T, window)
    reshaped_x = x.reshape(x.shape[0], -1, window_size)

    return reshaped_x


## Building a XGBOOST Classifier model based on our observation

In [4]:
# We first load and reshape all the data
all_data = []
all_targets = []
for (data,target) in training_data:
    filtered_data =  butter_bandpass_filter(data,0.1,18,250,4)
    reshaped_data = reshape_array_into_windows(filtered_data,250,2)
    targets_flatten = target[..., :len(reshaped_data[0])].reshape(-1)
    reshaped_data = reshaped_data.reshape((-1,reshaped_data.shape[-1]))
    all_data.append(reshaped_data)
    all_targets.append(targets_flatten)
all_data = np.concatenate(all_data)
all_targets = np.concatenate(all_targets)
assert all_data.shape[0] == all_targets.shape[0]


## Loading the features

In [5]:
features = pd.read_csv("features.csv", index_col=0)
features.head()

Unnamed: 0,amplitude,mean,max,min,stdev,skewness,kurtosis,hjorth_activity,hjorth_mobility,hjorth_complexity,...,sample_entropy,spectral_entropy,energy_band_0,energy_band_1,energy_band_2,energy_band_3,energy_band_4,energy_band_5,energy_band_6,energy_band_7
0,28600.257975,1245.670285,21471.069232,-7129.188743,7780.162127,0.765047,-0.610001,60530920.0,0.043994,6.48883,...,0.004544,0.055527,31317360000.0,202624900.0,2364575.0,273852.478188,7335.565786,10011.650727,17660.037549,11613.001761
1,7506.462109,-4965.798852,-476.438958,-7982.901067,2433.203314,0.304853,-1.31126,5920478.0,0.005689,34.624977,...,0.012692,0.115011,17250960000.0,20532.29,1374.747,1052.824111,176.143959,67.214865,3.894931,3.055691
2,4054.175414,2364.739039,3590.077431,-464.097984,1187.694358,-0.797484,-0.582127,1410618.0,0.00591,50.729171,...,0.00269,0.075448,3576688000.0,15743.83,889.0063,812.85486,145.044263,43.013947,2.302323,1.947904
3,2187.981464,2755.580822,3620.974382,1432.992918,695.837658,-0.312038,-1.245956,484190.0,0.006904,59.065314,...,0.015968,0.136786,4525491000.0,6757.636,384.9655,667.420601,125.16616,32.429763,1.707716,1.597904
4,2362.44708,8.122686,1420.37748,-942.0696,649.347508,0.350397,-0.977942,421652.2,0.006834,68.005797,...,0.017123,0.088179,290369800.0,16947.03,818.4609,814.47183,120.684325,28.069911,2.113378,2.217117


In [6]:
from features.utils import remove_collinear_features

# Remove collinear features
features = remove_collinear_features(features, threshold=0.6)
features.head()

Removed features: ['max', 'min', 'stdev', 'hjorth_activity', 'delta_power', 'alpha_power', 'beta_power', 'gamma_power', 'sample_entropy', 'spectral_entropy', 'energy_band_0', 'energy_band_1', 'energy_band_2', 'energy_band_3', 'energy_band_5', 'energy_band_6', 'energy_band_7']


Unnamed: 0,amplitude,mean,skewness,kurtosis,hjorth_mobility,hjorth_complexity,theta_power,shannon_entropy,energy_band_4
0,28600.257975,1245.670285,0.765047,-0.610001,0.043994,6.48883,422.099286,0.300821,7335.565786
1,7506.462109,-4965.798852,0.304853,-1.31126,0.005689,34.624977,529.876066,0.749798,176.143959
2,4054.175414,2364.739039,-0.797484,-0.582127,0.00591,50.729171,155.719216,0.747064,145.044263
3,2187.981464,2755.580822,-0.312038,-1.245956,0.006904,59.065314,362.701007,1.113054,125.16616
4,2362.44708,8.122686,0.350397,-0.977942,0.006834,68.005797,100.71629,1.015574,120.684325


In [7]:
%load_ext autoreload
%autoreload 2

In [8]:
# We train a model on 70% of the data and evaluate the model on the remaining 30%
from sklearn.model_selection import StratifiedKFold
from models.xgboost import train_xgboost_model, evaluate_model

# Define the number of splits for k-fold cross-validation
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Initialize lists to store evaluation results
evaluation_results = []

# Perform k-fold cross-validation
for train_index, val_index in skf.split(features, all_targets):
    x_train, x_val = features.iloc[train_index], features.iloc[val_index]
    y_train, y_val = all_targets[train_index], all_targets[val_index]
    
    # Train the model
    model = train_xgboost_model(x_train, y_train)
    
    # Evaluate the model
    evaluation_result = evaluate_model(model, x_val, y_val)
    evaluation_results.append(evaluation_result)

# Print the evaluation results
for i, result in enumerate(evaluation_results):
    print(f"Fold {i+1} evaluation result: {result}")

# Train the final model on the entire dataset
model = train_xgboost_model(features, all_targets)


Cohen's Kappa: 0.8789606108469012
F1 Score: 0.9513204351924337
Cohen's Kappa: 0.8775750197476239
F1 Score: 0.9507573336741868
Cohen's Kappa: 0.879366453561998
F1 Score: 0.9513752220035521
Cohen's Kappa: 0.8829990931093434
F1 Score: 0.9529122178888818
Cohen's Kappa: 0.880826643224924
F1 Score: 0.9521209508756805
Fold 1 evaluation result: None
Fold 2 evaluation result: None
Fold 3 evaluation result: None
Fold 4 evaluation result: None
Fold 5 evaluation result: None


We can now evaluate the cohen kappa

- What do you think of the performances ?
- What do you think of the split strategy ?
- What are additional features you could use ?

## Running the model on the test data and submitting to the leaderboard


In [25]:
from features.frequency_domain_features import extract_frequency_domain_features_multichannel
from features.complexity_features import extract_multichannel_entropy_features
from features.wavelet_decomposition import extract_wavelet_energy_features_multichannel
from features.time_domain_features import extract_time_domain_features
import xgboost as xgb
from models.features_list import list_features, collinear_features

ROOT_TEST_PATH = Path("../data/test/")
test_data = {i:np.load(ROOT_TEST_PATH / f"data_{i}.npy") for i in [4,5]}
# We process each record independantly

def compute_features_on_record(data):
    """
    We compute each of the feature for each window and each channel
    Each value of the output dict has shape (Channels,T)
    """
    filtered_data =  butter_bandpass_filter(data,0.1,18,250,4)
    reshaped_data = reshape_array_into_windows(filtered_data,250,2)
    print("Before any feature extraction: ", reshaped_data.shape)
    
    time_features = extract_time_domain_features(reshaped_data, return_type="numpy")
    print("Time features shape: ", {k: time_features[k].shape for k in time_features})

    frequency_features = extract_frequency_domain_features_multichannel(reshaped_data)
    print("Frequency features shape: ", {k: frequency_features[k].shape for k in frequency_features})

    entropy_features = extract_multichannel_entropy_features(reshaped_data)
    print("Entropy features shape: ", {k: entropy_features[k].shape for k in entropy_features})

    wavelet_features = extract_wavelet_energy_features_multichannel(reshaped_data)
    print("Wavelet features shape: ", {k: wavelet_features[k].shape for k in wavelet_features})

    features = {**time_features, **frequency_features, **entropy_features, **wavelet_features}
    print("Features shape: ", {k:features[k].shape for k in features})
    print("Features name: ", list(features.keys()))
    
    return features  # {5 ch x 13k, 5 ch x 13k, . . .}

def compute_predictions_on_record(data,model,features_name_for_model):
    predictions = []
    features = compute_features_on_record(data)

    features = np.array([features[k] for k in features_name_for_model])  # (26, 5, 13000)
    print("Features shape for model: ", features.shape)
    features = features.swapaxes(0,1).swapaxes(1,2)  # (5, 13000, 26)
    
    print("Features shape for model: ", features.shape)
    for channel in range(features.shape[0]):
        features_df = pd.DataFrame(features[channel], columns=list_features)
        features_df.to_csv(f"features/test/features_test_channel_{channel}.csv")
        predictions.append(
            np.round(
                model.predict(
                    xgb.DMatrix(
                        data=features_df[collinear_features]
                    )
                )
            )
        )
    return np.array(predictions)

def format_array_to_target_format(array, record_number):
    assert isinstance(record_number, int)
    assert isinstance(array, np.ndarray)
    assert len(array.shape) == 2
    assert array.shape[0] == 5
    print(set(np.unique(array)))
    assert set(np.unique(array)) == {0, 1}
    formatted_target = []
    for i in range(array.shape[0]):
        channel_encoding = (i + 1) * 100000
        record_number_encoding = record_number * 1000000
        for j in range(array.shape[1]):
            formatted_target.append(
                {
                    "identifier": record_number_encoding + channel_encoding + j,
                    "target": array[i, j],
                }
            )
    return formatted_target


We the functions defined above, we can now run the model and submit the predictions

In [26]:
from models.features_list import list_features

results = []
for record_number, data in test_data.items():
    preds = compute_predictions_on_record(data, model, list_features)
    formatted_preds = format_array_to_target_format(preds,record_number)
    results.extend(formatted_preds)
df = pd.DataFrame(results)
df.to_csv("../results/xgboost-balanced-cv.csv",index = False)

Before any feature extraction:  (5, 13204, 500)
Time features shape:  {'amplitude': (5, 13204), 'mean': (5, 13204), 'max': (5, 13204), 'min': (5, 13204), 'stdev': (5, 13204), 'skewness': (5, 13204), 'kurtosis': (5, 13204), 'hjorth_activity': (5, 13204), 'hjorth_mobility': (5, 13204), 'hjorth_complexity': (5, 13204)}
Frequency features shape:  {'delta_power': (5, 13204), 'theta_power': (5, 13204), 'alpha_power': (5, 13204), 'beta_power': (5, 13204), 'gamma_power': (5, 13204)}
Entropy features shape:  {'shannon_entropy': (5, 13204), 'sample_entropy': (5, 13204), 'spectral_entropy': (5, 13204)}
Wavelet features shape:  {'energy_band_0': (5, 13204), 'energy_band_1': (5, 13204), 'energy_band_2': (5, 13204), 'energy_band_3': (5, 13204), 'energy_band_4': (5, 13204), 'energy_band_5': (5, 13204), 'energy_band_6': (5, 13204), 'energy_band_7': (5, 13204)}
Features shape:  {'amplitude': (5, 13204), 'mean': (5, 13204), 'max': (5, 13204), 'min': (5, 13204), 'stdev': (5, 13204), 'skewness': (5, 13204