## Introduction to the automated analysis of EEG quality

This notebook will introduce you to the challenge by going through the data and working towards a first very simple model.

## Loading the data

In [2]:
## First let's load the training data
from pathlib import Path
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.signal import butter, lfilter
import pandas as pd

# Folder holding the training recordings, relative to the notebook location.
ROOT_PATH = Path("../data/train/")
# One (signal, labels) pair of arrays per training record, for records 0 to 3.
training_data = [(np.load(ROOT_PATH / f"data_{i}.npy"),np.load(ROOT_PATH / f"target_{i}.npy")) for i in range(4)]


We expect to have five channels and one label per channel for each two seconds of data.
Let's have a look at the data duration and shape

In [3]:
# Let's filter the signal to improve the visualisation

def butter_bandpass(lowcut, highcut, fs, order=5):
    """
    Design a Butterworth band-pass filter in transfer-function (b, a) form.

    Parameters:
    lowcut (float): Lower edge of the pass band, in the same unit as `fs`.
    highcut (float): Upper edge of the pass band, in the same unit as `fs`.
    fs (float): Sampling frequency of the signal the filter will be applied to.
    order (int): Order passed to `scipy.signal.butter`.

    Returns:
    (b, a): Numerator and denominator coefficients of the filter.
    """
    passband_edges = [lowcut, highcut]
    numerator, denominator = butter(order, passband_edges, btype='band', fs=fs)
    return numerator, denominator

def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    """
    Band-pass filter `data` with a Butterworth filter.

    The coefficients come from `butter_bandpass` and are applied with
    `scipy.signal.lfilter` using its default axis.

    Parameters:
    data (array-like): Signal to filter.
    lowcut (float): Lower edge of the pass band.
    highcut (float): Upper edge of the pass band.
    fs (float): Sampling frequency of `data`.
    order (int): Order of the Butterworth filter.

    Returns:
    The filtered signal, as returned by `lfilter`.
    """
    numerator, denominator = butter_bandpass(lowcut, highcut, fs, order=order)
    return lfilter(numerator, denominator, data)

In [4]:
# First we need to get the points that map to each label

def reshape_array_into_windows(x, sample_rate, window_duration_in_seconds):
    """
    Reshape the data into an array of shape (C, T, window) where 'window' contains
    the points corresponding to 'window_duration' seconds of data.

    Windows are consecutive and non-overlapping. Trailing samples that do not
    fill a complete window are dropped: the data is truncated, never padded.

    Parameters:
    x (numpy array): The input data array, expected to be 2-D with the samples
        on the last axis (channels x samples).
    sample_rate (int): The number of samples per second.
    window_duration_in_seconds (float): The duration of each window in seconds.

    Returns:
    reshaped_x (numpy array): The reshaped array with shape (C, T, window), where
        window = int(window_duration_in_seconds * sample_rate) and T is the
        number of complete windows.

    Raises:
    ValueError: If the requested window contains less than one sample.
    """
    # Calculate the number of samples in one window
    window_size = int(window_duration_in_seconds * sample_rate)
    if window_size <= 0:
        # Fail with a clear message instead of an opaque division/reshape error.
        raise ValueError(
            "window_duration_in_seconds * sample_rate must be at least one sample, "
            f"got {window_size}"
        )

    # Drop the trailing samples that do not fill a complete window
    total_samples = x.shape[-1]
    leftover = total_samples % window_size
    if leftover != 0:
        x = x[..., :total_samples - leftover]

    # Reshape x into (C, T, window)
    reshaped_x = x.reshape(x.shape[0], -1, window_size)

    return reshaped_x


## Building a simple model based on our observation

In [5]:
# Filter every recording, cut it into 2-second windows and flatten the
# (channel, window) grid so that each row is one labelled window.
all_data, all_targets = [], []
for data, target in training_data:
    filtered_data = butter_bandpass_filter(data, 0.1, 18, 250, 4)
    reshaped_data = reshape_array_into_windows(filtered_data, 250, 2)
    n_windows = reshaped_data.shape[1]
    # Keep only the labels that have a complete window, then flatten them in
    # the same channel-major order as the windows below.
    targets_flatten = target[..., :n_windows].reshape(-1)
    reshaped_data = reshaped_data.reshape((-1, reshaped_data.shape[-1]))
    all_targets.append(targets_flatten)
    all_data.append(reshaped_data)
all_data = np.concatenate(all_data)
all_targets = np.concatenate(all_targets)
# Every window must have exactly one label.
assert all_data.shape[0] == all_targets.shape[0]


In [6]:
# Load the pre-computed feature table; the first CSV column is used as the index.
# NOTE(review): rows are presumably in the same order as `all_targets`
# (one row per window, channel-major within each record) -- confirm.
features = pd.read_csv("features/features.csv", index_col=0)
features.head()

Unnamed: 0,amplitude,mean,max,min,stdev,skewness,kurtosis,hjorth_activity,hjorth_mobility,hjorth_complexity,...,sample_entropy,spectral_entropy,energy_band_0,energy_band_1,energy_band_2,energy_band_3,energy_band_4,energy_band_5,energy_band_6,energy_band_7
0,28600.257975,1245.670285,21471.069232,-7129.188743,7780.162127,0.765047,-0.610001,60530920.0,0.043994,6.48883,...,0.004544,0.055527,31317360000.0,202624900.0,2364575.0,273852.478188,7335.565786,10011.650727,17660.037549,11613.001761
1,7506.462109,-4965.798852,-476.438958,-7982.901067,2433.203314,0.304853,-1.31126,5920478.0,0.005689,34.624977,...,0.012692,0.115011,17250960000.0,20532.29,1374.747,1052.824111,176.143959,67.214865,3.894931,3.055691
2,4054.175414,2364.739039,3590.077431,-464.097984,1187.694358,-0.797484,-0.582127,1410618.0,0.00591,50.729171,...,0.00269,0.075448,3576688000.0,15743.83,889.0063,812.85486,145.044263,43.013947,2.302323,1.947904
3,2187.981464,2755.580822,3620.974382,1432.992918,695.837658,-0.312038,-1.245956,484190.0,0.006904,59.065314,...,0.015968,0.136786,4525491000.0,6757.636,384.9655,667.420601,125.16616,32.429763,1.707716,1.597904
4,2362.44708,8.122686,1420.37748,-942.0696,649.347508,0.350397,-0.977942,421652.2,0.006834,68.005797,...,0.017123,0.088179,290369800.0,16947.03,818.4609,814.47183,120.684325,28.069911,2.113378,2.217117


In [7]:
# We train a model on 70% of the data and evaluate the model on the remaining 30%
# The features are read from a CSV file while the labels are rebuilt from the raw
# recordings, so check that both describe the same number of windows before
# slicing them with a shared index.
assert len(features) == len(all_targets), (
    f"features has {len(features)} rows but there are {len(all_targets)} labels"
)

prop_train = 0.7
n_train = int(prop_train * len(features))

# The split is positional (no shuffling): the first rows train, the last rows validate.
x_train = features[:n_train]
y_train = all_targets[:n_train]

x_val = features[n_train:]
y_val = all_targets[n_train:]

In [10]:
from models.automl import train_automl_model

# Train the AutoML model
# `train_automl_model` is a project helper; the log printed below reports a
# generation-by-generation search ending on a "Best pipeline".
# NOTE(review): no random seed is passed here -- confirm whether the helper
# fixes one, otherwise the selected pipeline may differ between runs.
model = train_automl_model(x_train, y_train)


                                                                                   
Generation 1 - Current best internal CV score: 0.9408058660414016
                                                                                    
Generation 2 - Current best internal CV score: 0.9408058660414016
                                                                                    
Generation 3 - Current best internal CV score: 0.9408058660414016
                                                                                   
Generation 4 - Current best internal CV score: 0.9408058660414016
                                                                                
Generation 5 - Current best internal CV score: 0.9408058660414016
                                                             
Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=entropy, max_features=0.7000000000000001, min_samples_leaf=11, min_samples_split=4, n_estimators=100)


In [11]:
from models.automl import evaluate_model

# Evaluate the model
# Scores the fitted model on the held-out 30% split; the output shown below
# reports a Cohen score and an F1 score.
evaluate_model(model, x_val, y_val)

Cohen:  0.6903692881768673
F1 score:  0.7940495245268807


Save the model

In [14]:
# Persist the trained model to disk.
# NOTE(review): if `model` is a TPOT estimator, `export` writes the pipeline as
# Python source code rather than a pickle, so the ".pkl" extension would be
# misleading -- confirm against `models.automl`.
model.export("models/automl_model.pkl")

We can now evaluate Cohen's kappa

- What do you think of the performance?
- What do you think of the split strategy?
- What additional features could you use?

## Running the model on the test data and submitting to the leaderboard


In [27]:
# %load_ext autoreload
# %autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
from features.frequency_domain_features import extract_frequency_domain_features_multichannel
from features.time_domain_features import extract_time_domain_features
from features.complexity_features import extract_multichannel_entropy_features
from features.wavelet_decomposition import extract_wavelet_energy_features_multichannel

# Folder holding the test recordings, relative to the notebook location.
ROOT_TEST_PATH = Path("../data/test/")
# Map each test record number (4 and 5) to its signal array.
test_data = {i:np.load(ROOT_TEST_PATH / f"data_{i}.npy") for i in [4,5]}
# We process each record independently

def compute_features_on_record(data):
    """
    Compute every feature for each window and each channel of one record.

    The record is band-pass filtered (0.1-18 with fs=250, order 4), cut into
    2-second windows, and passed to the time-domain, frequency-domain, entropy
    and wavelet-energy extractors. The shapes of the intermediate results are
    printed along the way.

    Returns a single dict merging the four feature groups; each value has
    shape (Channels, T).
    """
    def shapes_of(feature_group):
        # Shape of every feature array, keyed by feature name.
        return {name: feature_group[name].shape for name in feature_group}

    windows = reshape_array_into_windows(
        butter_bandpass_filter(data, 0.1, 18, 250, 4), 250, 2
    )
    print("Before any feature extraction: ", windows.shape)

    time_features = extract_time_domain_features(windows, return_type="numpy")
    print("Time features shape: ", shapes_of(time_features))

    frequency_features = extract_frequency_domain_features_multichannel(windows)
    print("Frequency features shape: ", shapes_of(frequency_features))

    entropy_features = extract_multichannel_entropy_features(windows)
    print("Entropy features shape: ", shapes_of(entropy_features))

    wavelet_features = extract_wavelet_energy_features_multichannel(windows)
    print("Wavelet energy features shape: ", shapes_of(wavelet_features))

    # Merge in a fixed order; on a name clash the later group wins.
    features = {}
    for group in (time_features, frequency_features, entropy_features, wavelet_features):
        features.update(group)
    print("Features shape: ", shapes_of(features))

    return features



def compute_predictions_on_record(data,model,features_name_for_model):
    """
    Predict one label per channel and per window for a single record.

    Parameters:
    data: Raw signal of the record, forwarded to `compute_features_on_record`.
    model: Fitted estimator exposing `predict`.
    features_name_for_model: Names of the features to feed to the model, in
        the column order the model expects.

    Returns:
    numpy array stacking the output of `model.predict` for each channel.
    """
    feature_by_name = compute_features_on_record(data)
    # Stack the selected features: (feature, channel, window).
    stacked = np.array([feature_by_name[name] for name in features_name_for_model])
    # Move the feature axis last so each channel is a (window, feature) matrix.
    per_channel = np.moveaxis(stacked, 0, 2)
    return np.array([model.predict(channel_matrix) for channel_matrix in per_channel])

def format_array_to_target_format(array, record_number):
    """
    Flatten a (5, T) array of binary predictions into submission rows.

    Each prediction gets the identifier
    ``record_number * 1000000 + (channel + 1) * 100000 + window``.

    Parameters:
    array (numpy array): Predictions of shape (5, T) containing only 0 and 1.
    record_number (int): Number of the record the predictions belong to.

    Returns:
    list of dict: One ``{"identifier": ..., "target": ...}`` entry per
        prediction, ordered by channel and then by window.

    Raises:
    AssertionError: If an input does not satisfy the constraints above.
    """
    channel_stride = 100000
    record_stride = 1000000

    assert isinstance(record_number, int)
    assert isinstance(array, np.ndarray)
    assert len(array.shape) == 2
    assert array.shape[0] == 5
    labels = set(np.unique(array))
    print(labels)
    # A record predicted as a single class is still a valid submission, so only
    # require that no value other than 0 and 1 is present.
    assert labels <= {0, 1}, f"predictions must be 0 or 1, got {labels}"
    # Beyond this many windows the identifiers of consecutive channels overlap.
    assert array.shape[1] <= channel_stride, (
        f"at most {channel_stride} windows per channel can be encoded, got {array.shape[1]}"
    )

    record_number_encoding = record_number * record_stride
    formatted_target = []
    for i in range(array.shape[0]):
        channel_encoding = (i + 1) * channel_stride
        for j in range(array.shape[1]):
            formatted_target.append(
                {
                    "identifier": record_number_encoding + channel_encoding + j,
                    "target": array[i, j],
                }
            )
    return formatted_target


With the functions defined above, we can now run the model and submit the predictions

In [24]:
from models.features_list import best_features

# Create the output folder up front so that a missing directory does not make
# the final write fail after the slow feature extraction has already run.
output_path = Path("../results/auto-ml.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)

# Predict every test record and collect the rows in the submission format.
results = []
for record_number, data in test_data.items():
    preds = compute_predictions_on_record(data, model, best_features)
    formatted_preds = format_array_to_target_format(preds,record_number)
    results.extend(formatted_preds)
df = pd.DataFrame(results)
df.to_csv(output_path,index = False)

Before any feature extraction:  (5, 13204, 500)
Time features shape:  {'amplitude': (5, 13204), 'mean': (5, 13204), 'max': (5, 13204), 'min': (5, 13204), 'stdev': (5, 13204), 'skewness': (5, 13204), 'kurtosis': (5, 13204), 'hjorth_activity': (5, 13204), 'hjorth_mobility': (5, 13204), 'hjorth_complexity': (5, 13204)}
Frequency features shape:  {'delta_power': (5, 13204), 'theta_power': (5, 13204), 'alpha_power': (5, 13204), 'beta_power': (5, 13204), 'gamma_power': (5, 13204)}
Entropy features shape:  {'shannon_entropy': (5, 13204), 'sample_entropy': (5, 13204), 'spectral_entropy': (5, 13204)}
Wavelet energy features shape:  {'energy_band_0': (5, 13204), 'energy_band_1': (5, 13204), 'energy_band_2': (5, 13204), 'energy_band_3': (5, 13204), 'energy_band_4': (5, 13204), 'energy_band_5': (5, 13204), 'energy_band_6': (5, 13204), 'energy_band_7': (5, 13204)}
Features shape:  {'amplitude': (5, 13204), 'mean': (5, 13204), 'max': (5, 13204), 'min': (5, 13204), 'stdev': (5, 13204), 'skewness': (5

In [19]:
# Reload the submission file from disk and display its number of rows as a sanity check.
# Note that this rebinds `results`, which held the list of prediction rows in the previous cell.
results = pd.read_csv("../results/auto-ml.csv")
len(results)

112615