## Introduction to the automated analysis of EEG quality

This notebook will introduce you to the challenge by going through the data and working towards a first very simple model.

## Loading the data

In [1]:
## First let's load the training data
from pathlib import Path
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.signal import butter, lfilter
import pandas as pd


In [4]:
ROOT_PATH = Path("../data/train/")
training_data = [(np.load(ROOT_PATH / f"data_{i}.npy"),np.load(ROOT_PATH / f"target_{i}.npy")) for i in range(4)]

We expect to have five channels and one label per channel for each two seconds of data.
Let's have a look at the data duration and shape

In [6]:
# Let's filter the signal to improve the visualisation

def butter_bandpass(lowcut, highcut, fs, order=5):
    return butter(order, [lowcut, highcut], fs=fs, btype='band')

def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    b, a = butter_bandpass(lowcut, highcut, fs, order=order)
    y = lfilter(b, a, data)
    return y

In [7]:
# First we need to get the point that maps to a label

def reshape_array_into_windows(x, sample_rate, window_duration_in_seconds):
    """
    Reshape the data into an array of shape (C, T, window) where 'window' contains
    the points corresponding to 'window_duration' seconds of data.

    Parameters:
    x (numpy array): The input data array.
    sample_rate (int): The number of samples per second.
    window_duration_in_seconds (float): The duration of each window in seconds.

    Returns:
    reshaped_x (numpy array): The reshaped array with shape (C, T, window).
    """
    # Calculate the number of samples in one window
    window_size = int(window_duration_in_seconds * sample_rate)
    
    # Ensure the total length of x is a multiple of window_size
    total_samples = x.shape[-1]
    if total_samples % window_size != 0:
        # Truncate or pad x to make it divisible by window_size
        x = x[..., :total_samples - (total_samples % window_size)]
    # Reshape x into (C, T, window)
    reshaped_x = x.reshape(x.shape[0], -1, window_size)

    return reshaped_x


## Building a simple model based on our observation

In [8]:
# We first load and reshape all the data
all_data = []
all_targets = []
for (data,target) in training_data:
    filtered_data =  butter_bandpass_filter(data,0.1,18,250,4)
    reshaped_data = reshape_array_into_windows(filtered_data,250,2)
    targets_flatten = target[..., :len(reshaped_data[0])].reshape(-1)
    reshaped_data = reshaped_data.reshape((-1,reshaped_data.shape[-1]))
    all_data.append(reshaped_data)
    all_targets.append(targets_flatten)
all_data = np.concatenate(all_data)
all_targets = np.concatenate(all_targets)
assert all_data.shape[0] == all_targets.shape[0]


In [5]:
# We can now compute the features over each 2 seconds segment
from features.time_domain_features import extract_time_domain_features, extract_amplitude_modulation_features
from features.frequency_domain_features import extract_frequency_domain_features, extract_relative_power
from features.complexity_features import extract_entropy_features
from features.wavelet_decomposition import extract_wavelet_energy_features

time_features = extract_time_domain_features(all_data, return_type="dataframe")
amplitude_modulation_features = extract_amplitude_modulation_features(all_data)
frequency_features = extract_frequency_domain_features(all_data)
relative_power_features = extract_relative_power(all_data)
entropy_features = extract_entropy_features(all_data)
wavelet_energy_features = extract_wavelet_energy_features(all_data)

# Combine the time, frequency, and entropy features into a single DataFrame
features = pd.concat([time_features, amplitude_modulation_features, 
                      frequency_features, relative_power_features,
                      entropy_features, wavelet_energy_features], axis=1)

In [7]:
features.to_csv("features/more-features.csv", index=False)

In [2]:
features = pd.read_csv("features/more-features.csv")
features.columns

Index(['amplitude', 'mean', 'max', 'min', 'stdev', 'skewness', 'kurtosis',
       'hjorth_activity', 'hjorth_mobility', 'hjorth_complexity',
       'envelope_mean', 'envelope_std', 'envelope_max', 'envelope_min',
       'delta_power', 'theta_power', 'alpha_power', 'beta_power',
       'gamma_power', 'delta_relative_power', 'theta_relative_power',
       'alpha_relative_power', 'beta_relative_power', 'gamma_relative_power',
       'shannon_entropy', 'sample_entropy', 'spectral_entropy',
       'energy_band_0', 'energy_band_1', 'energy_band_2', 'energy_band_3',
       'energy_band_4', 'energy_band_5', 'energy_band_6', 'energy_band_7'],
      dtype='object')

In [7]:
# We train a model on 70% of the data and evaluate the model on the remaining 30%
prop_train = 0.7
n_train = int(prop_train * len(features))

x_train = features[:n_train]
y_train = all_targets[:n_train]

x_val = features[n_train:]
y_val = all_targets[n_train:]

In [9]:
from models.automl import train_automl_model

# Train the AutoML model
model = train_automl_model(features, all_targets)


                                                                                    
Generation 1 - Current best internal CV score: 0.9178162785811159
                                                                                      
Generation 2 - Current best internal CV score: 0.9178162785811159
                                                                                    
Generation 3 - Current best internal CV score: 0.9178162785811159
                                                                                    
Generation 4 - Current best internal CV score: 0.9183358484078623
                                                               
Generation 5 - Current best internal CV score: 0.91869878321331
                                                               
Best pipeline: XGBClassifier(CombineDFs(input_matrix, input_matrix), learning_rate=0.1, max_depth=2, min_child_weight=5, n_estimators=100, n_jobs=1, subsample=0.1, verbosity=0)


In [11]:
from models.automl import evaluate_model

# Evaluate the model
evaluate_model(model, x_val, y_val)

Cohen:  0.6903692881768673
F1 score:  0.7940495245268807


Save the model

In [10]:
model.export("models/automl_model_all_data.pkl")

We can now evaluate the cohen kappa

- What do you think of the performances ?
- What do you think of the split strategy ?
- What are additional features you could use ?

## Running the model on the test data and submitting to the leaderboard


In [27]:
# %load_ext autoreload
# %autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
from features.frequency_domain_features import extract_frequency_domain_features_multichannel
from features.time_domain_features import extract_time_domain_features
from features.complexity_features import extract_multichannel_entropy_features
from features.wavelet_decomposition import extract_wavelet_energy_features_multichannel
from features.time_domain_features import extract_amplitude_modulation_features
from features.frequency_domain_features import extract_relative_power

ROOT_TEST_PATH = Path("../data/test/")
test_data = {i:np.load(ROOT_TEST_PATH / f"data_{i}.npy") for i in [4,5]}
# We process each record independantly

def compute_features_on_record(data):
    """
    We compute each of the feature for each window and each channel
    Each value of the output dict has shape (Channels,T)
    """
    filtered_data =  butter_bandpass_filter(data,0.1,18,250,4)
    reshaped_data = reshape_array_into_windows(filtered_data,250,2)
    print("Before any feature extraction: ", reshaped_data.shape)
    
    time_features = extract_time_domain_features(reshaped_data, return_type="numpy")
    print("Time features shape: ", {k: time_features[k].shape for k in time_features})

    amplitude_modulation_features = extract_amplitude_modulation_features(reshaped_data)
    print("Amplitude modulation features shape: ", {k: amplitude_modulation_features[k].shape for k in amplitude_modulation_features})

    frequency_features = extract_frequency_domain_features_multichannel(reshaped_data)
    print("Frequency features shape: ", {k: frequency_features[k].shape for k in frequency_features})

    relative_power_features = extract_relative_power(reshaped_data)
    print("Relative power features shape: ", {k: relative_power_features[k].shape for k in relative_power_features})

    entropy_features = extract_multichannel_entropy_features(reshaped_data)
    print("Entropy features shape: ", {k: entropy_features[k].shape for k in entropy_features})

    wavelet_energy_features_multichannel = extract_wavelet_energy_features_multichannel(reshaped_data)
    print("Wavelet energy features shape: ", {k: wavelet_energy_features_multichannel[k].shape for k in wavelet_energy_features_multichannel})
    
    features = {**time_features, **frequency_features, **entropy_features, **wavelet_energy_features_multichannel}
    print("Features shape: ", {k:features[k].shape for k in features})
    
    return features  # {5 ch x 13k, 5 ch x 13k, . . .}



def compute_predictions_on_record(data,model,features_name_for_model):
    predictions = []
    features = compute_features_on_record(data)
    features = np.array([features[k] for k in features_name_for_model]) 
    features = features.swapaxes(0,1).swapaxes(1,2)
    for channel in range(features.shape[0]):
        predictions.append(model.predict(features[channel]))
    return np.array(predictions)

def format_array_to_target_format(array, record_number):
    assert isinstance(record_number, int)
    assert isinstance(array, np.ndarray)
    assert len(array.shape) == 2
    assert array.shape[0] == 5
    print(set(np.unique(array)))
    assert set(np.unique(array)) == {0, 1}
    formatted_target = []
    for i in range(array.shape[0]):
        channel_encoding = (i + 1) * 100000
        record_number_encoding = record_number * 1000000
        for j in range(array.shape[1]):
            formatted_target.append(
                {
                    "identifier": record_number_encoding + channel_encoding + j,
                    "target": array[i, j],
                }
            )
    return formatted_target


We the functions defined above, we can now run the model and submit the predictions

In [12]:
from models.features_list import list_features

results = []
for record_number, data in test_data.items():
    preds = compute_predictions_on_record(data, model, list_features)
    formatted_preds = format_array_to_target_format(preds,record_number)
    results.extend(formatted_preds)
df = pd.DataFrame(results)
df.to_csv("../results/auto-ml-all-data.csv",index = False)

Before any feature extraction:  (5, 13204, 500)
Time features shape:  {'amplitude': (5, 13204), 'mean': (5, 13204), 'max': (5, 13204), 'min': (5, 13204), 'stdev': (5, 13204), 'skewness': (5, 13204), 'kurtosis': (5, 13204), 'hjorth_activity': (5, 13204), 'hjorth_mobility': (5, 13204), 'hjorth_complexity': (5, 13204)}


ValueError: Per-column arrays must each be 1-dimensional

In [19]:
results = pd.read_csv("../results/auto-ml-all-data.csv")
len("It needs to be 112615, it's : ", results)

112615