In [None]:
import os
from pathlib import Path

# Define a base directory for the dataset
if 'google.colab' in str(get_ipython()):
    # Code is running in Google Colab
    from google.colab import drive
    drive.mount('/content/drive')
    base_dir = Path('/content/drive/My Drive/Supervised-Project/Data')
    !pip install numpy seaborn matplotlib scipy pandas ts2vg scikit-image pyfeats scikit-learn==1.2.0 lazypredict dask[dataframe] antropy pywt
else:
    # Code is running locally
    base_dir = Path('Data/')  # Adjust to your local relative path

## Loading the data

In [1]:
## First let's load the training data
from pathlib import Path
import numpy as np
from scipy.signal import butter, lfilter
import pandas as pd
from features.wavelet_decomposition import extract_wavelet_energy_features
from lazypredict.Supervised import LazyClassifier
from lazypredict.Supervised import CLASSIFIERS
from scipy.stats import skew, kurtosis
from scipy.signal import welch
import antropy as ant
import pywt


In [2]:
ROOT_PATH = Path("../data/train/")
training_data = [(np.load(ROOT_PATH / f"data_{i}.npy"),np.load(ROOT_PATH / f"target_{i}.npy")) for i in range(4)]


## Functions

In [3]:
def butter_bandpass(lowcut, highcut, fs, order=5):
    return butter(order, [lowcut, highcut], fs=fs, btype='band')

def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    b, a = butter_bandpass(lowcut, highcut, fs, order=order)
    y = lfilter(b, a, data)
    return y

def reshape_array_into_windows(x, sample_rate, window_duration_in_seconds):
    """
    Reshape the data into an array of shape (C, T, window) where 'window' contains
    the points corresponding to 'window_duration' seconds of data.

    Parameters:
    x (numpy array): The input data array.
    sample_rate (int): The number of samples per second.
    window_duration_in_seconds (float): The duration of each window in seconds.

    Returns:
    reshaped_x (numpy array): The reshaped array with shape (C, T, window).
    """
    # Calculate the number of samples in one window
    window_size = int(window_duration_in_seconds * sample_rate)
    
    # Ensure the total length of x is a multiple of window_size
    total_samples = x.shape[-1]
    if total_samples % window_size != 0:
        # Truncate or pad x to make it divisible by window_size
        x = x[..., :total_samples - (total_samples % window_size)]
    # Reshape x into (C, T, window)
    reshaped_x = x.reshape(x.shape[0], -1, window_size)

    return reshaped_x

In [None]:
# Extract time domain features
def extract_time_domain_features(data, return_type='dataframe'):
    """
    Extracts time-domain features from EEG data.
    Parameters:
    data (numpy.ndarray): A 2D or 3D array where each row (or each slice in the case of 3D) represents a 2-second window of EEG data sampled at 250 Hz (i.e., each row has 500 data points).
    return_type (str): The type of the return value, either 'dataframe' or 'numpy'.
    Returns:
    pandas.DataFrame or numpy.ndarray: A DataFrame or ndarray containing the following time-domain features for each row (or slice) of the input data:
        - amplitude: The difference between the maximum and minimum values.
        - mean: The mean value.
        - max: The maximum value.
        - min: The minimum value.
        - stdev: The standard deviation.
        - skewness: The skewness of the data.
        - kurtosis: The kurtosis of the data.
        - hjorth_activity: The Hjorth activity parameter.
        - hjorth_mobility: The Hjorth mobility parameter.
        - hjorth_complexity: The Hjorth complexity parameter.
    """

    is_3d = data.ndim == 3

    amplitude = np.max(data, axis=-1) - np.min(data, axis=-1)
    mean_values = np.mean(data, axis=-1)
    max_values = np.max(data, axis=-1)
    min_values = np.min(data, axis=-1)
    stdev_values = np.std(data, axis=-1)
    skewness_values = skew(data, axis=-1)
    kurtosis_values = kurtosis(data, axis=-1)

    # Hjorth parameters
    def hjorth_parameters(data):
        first_deriv = np.diff(data, axis=-1)
        second_deriv = np.diff(first_deriv, axis=-1)
        var_zero = np.var(data, axis=-1)
        var_d1 = np.var(first_deriv, axis=-1)
        var_d2 = np.var(second_deriv, axis=-1)
        activity = var_zero
        mobility = np.sqrt(var_d1 / var_zero)
        complexity = np.sqrt(var_d2 / var_d1) / mobility
        return activity, mobility, complexity

    hjorth_activity, hjorth_mobility, hjorth_complexity = hjorth_parameters(data)

    features = {
        "amplitude": amplitude,
        "mean": mean_values,
        "max": max_values,
        "min": min_values,
        "stdev": stdev_values,
        "skewness": skewness_values,
        "kurtosis": kurtosis_values,
        "hjorth_activity": hjorth_activity,
        "hjorth_mobility": hjorth_mobility,
        "hjorth_complexity": hjorth_complexity,
    }

    if not is_3d:
        for key in features:
            features[key] = features[key].reshape(-1)

    if return_type == 'dataframe':
        return pd.DataFrame(features)
    elif return_type == 'numpy':
        return features
    else:
       raise ValueError("return_type must be either 'dataframe' or 'numpy'")
    

# Extract frequency domain features
def extract_frequency_domain_features(data, fs=250):
    """
    Extracts frequency domain features from EEG signal data.
    Parameters:
    data (np.ndarray): A NumPy array where each row represents a 2-second window of the EEG signal, 
                       with each row containing 500 data points.
    fs (int): Sampling frequency of the EEG signal. Default is 250 Hz.
    Returns:
    pd.DataFrame: A DataFrame containing the extracted frequency domain features.
    """

    # Define frequency bands
    bands = {
        'delta': (0.5, 4),
        'theta': (4, 8),
        'alpha': (8, 12),
        'beta': (12, 30),
        'gamma': (30, 40)
    }

    features = []

    for window in data:
        f, Pxx = welch(window, fs=fs, nperseg=fs*2)
        band_powers = {}
        for band, (low, high) in bands.items():
            band_power = np.trapz(Pxx[(f >= low) & (f <= high)], f[(f >= low) & (f <= high)])
            band_powers[f'{band}_power'] = band_power
        features.append(band_powers)

    return pd.DataFrame(features)
    

def extract_frequency_domain_features_multichannel(data, fs=250):
    """
    Extracts frequency domain features from multi-channel EEG signal data.
    Parameters:
    data (np.ndarray): A NumPy array with shape (channel, segment, 500), where each segment represents a 2-second window of the EEG signal.
    fs (int): Sampling frequency of the EEG signal. Default is 250 Hz.
    Returns:
    dict: A dictionary containing the extracted frequency domain features with keys as {band}_power and values as numpy arrays of shape (channel, segment).
    """

    # Define frequency bands
    bands = {
        'delta': (0.5, 4),
        'theta': (4, 8),
        'alpha': (8, 12),
        'beta': (12, 30),
        'gamma': (30, 40)
    }

    num_channels, num_segments, _ = data.shape
    features = {f'{band}_power': np.zeros((num_channels, num_segments)) for band in bands}

    for ch in range(num_channels):
        for seg in range(num_segments):
            window = data[ch, seg, :]
            f, Pxx = welch(window, fs=fs, nperseg=fs*2)
            for band, (low, high) in bands.items():
                band_power = np.trapz(Pxx[(f >= low) & (f <= high)], f[(f >= low) & (f <= high)])
                features[f'{band}_power'][ch, seg] = band_power

    return features

def extract_entropy_features(signal):
    """
    Extracts entropy features from a 2D signal array.
    Parameters:
    signal (numpy.ndarray): A 2D array where each row represents a signal.
    Returns:
    pandas.DataFrame: A DataFrame containing the extracted entropy features:
        - 'shannon_entropy': List of Shannon entropy values for each signal.
        - 'sample_entropy': List of sample entropy values for each signal.
        - 'spectral_entropy': List of spectral entropy values for each signal.
    """
    
    features = {
        'shannon_entropy': [],
        'sample_entropy': [],
        'spectral_entropy': []
    }
    
    # Iterate over the last dimension
    for i in range(signal.shape[0]):
        features['shannon_entropy'].append(ant.perm_entropy(signal[i, ...]))
        features['sample_entropy'].append(ant.sample_entropy(signal[i, ...]))
        features['spectral_entropy'].append(ant.spectral_entropy(signal[i, ...], sf=250, method='welch', normalize=True))
    
    return pd.DataFrame(features)

def extract_multichannel_entropy_features(signal):
    """
    Extracts multichannel entropy features from a given EEG signal.
    Parameters:
    signal (numpy.ndarray): A 3D numpy array of shape (channels, windows, samples) representing the EEG signal.
    Returns:
    dict: A dictionary containing the following keys:
        - 'shannon_entropy': A 2D numpy array of shape (channels, windows) with Shannon entropy values.
        - 'sample_entropy': A 2D numpy array of shape (channels, windows) with Sample entropy values.
        - 'spectral_entropy': A 2D numpy array of shape (channels, windows) with Spectral entropy values.
    """

    channels, windows, _ = signal.shape
    
    features = {
        'shannon_entropy': np.zeros((channels, windows)),
        'sample_entropy': np.zeros((channels, windows)),
        'spectral_entropy': np.zeros((channels, windows))
    }
    
    for ch in range(channels):
        for win in range(windows):
            features['shannon_entropy'][ch, win] = ant.perm_entropy(signal[ch, win, :])
            features['sample_entropy'][ch, win] = ant.sample_entropy(signal[ch, win, :])
            features['spectral_entropy'][ch, win] = ant.spectral_entropy(signal[ch, win, :], sf=250, method='welch', normalize=True)
    
    return features

def extract_wavelet_energy_features(signal, wavelet='db4', max_level=3):  # db4 commonly used for EEG signals
    """
    Extracts wavelet energy features from a 2D numpy array signal.
    Parameters:
    signal (numpy.ndarray): A 2D numpy array where each row represents a signal segment.
    wavelet (str): The type of wavelet to use for decomposition. Default is 'db4'.
    max_level (int): The maximum level of wavelet decomposition. Default is 3.
    Returns:
    pandas.DataFrame: A DataFrame containing the wavelet energy features for each segment.
                        Each column corresponds to the energy of a specific sub-band.
    """
    
    features = []

    for segment in signal:
        wp = pywt.WaveletPacket(data=segment, wavelet=wavelet, maxlevel=max_level)
        feature_vector = []
        for node in wp.get_level(max_level, 'freq'):
            # Calculate energy of each node
            energy = np.sum(np.square(node.data))
            feature_vector.append(energy)
        features.append(feature_vector)

    # Name the features according to the sub-band
    feature_names = [f'energy_band_{i}' for i in range(len(features[0]))]
    df_features = pd.DataFrame(features, columns=feature_names)
    
    return df_features

def extract_wavelet_energy_features_multichannel(signal, wavelet='db4', max_level=3):
    """
    Extracts wavelet energy features from a 3D numpy array signal.
    Parameters:
    signal (numpy.ndarray): A 3D numpy array with shape (channels, windows, 500).
    wavelet (str): The type of wavelet to use for decomposition. Default is 'db4'.
    max_level (int): The maximum level of wavelet decomposition. Default is 3.
    Returns:
    dict: A dictionary where keys are feature names and values are numpy arrays of shape (channels, windows).
    """
    
    channels, windows, _ = signal.shape
    features_dict = {}

    for ch in range(channels):
        channel_features = []
        for win in range(windows):
            segment = signal[ch, win, :]
            wp = pywt.WaveletPacket(data=segment, wavelet=wavelet, maxlevel=max_level)
            feature_vector = []
            for node in wp.get_level(max_level, 'freq'):
                # Calculate energy of each node
                energy = np.sum(np.square(node.data))
                feature_vector.append(energy)
            channel_features.append(feature_vector)
        
        # Convert list of features to numpy array and store in dictionary
        feature_names = [f'energy_band_{i}' for i in range(len(channel_features[0]))]
        for i, feature_name in enumerate(feature_names):
            if feature_name not in features_dict:
                features_dict[feature_name] = np.zeros((channels, windows))
            features_dict[feature_name][ch, :] = np.array([cf[i] for cf in channel_features])
    
    return features_dict

## Preprocessing

In [4]:
# We first load and reshape all the data
all_data = []
all_targets = []
for (data,target) in training_data:
    filtered_data =  butter_bandpass_filter(data,0.1,18,250,4)
    reshaped_data = reshape_array_into_windows(filtered_data,250,2)
    targets_flatten = target[..., :len(reshaped_data[0])].reshape(-1)
    reshaped_data = reshaped_data.reshape((-1,reshaped_data.shape[-1]))
    all_data.append(reshaped_data)
    all_targets.append(targets_flatten)
all_data = np.concatenate(all_data)
all_targets = np.concatenate(all_targets)
assert all_data.shape[0] == all_targets.shape[0]


## Feature extraction

In [6]:
# We can now compute the features over each 2 seconds segment

time_features = extract_time_domain_features(all_data, return_type="dataframe")
frequency_features = extract_frequency_domain_features(all_data)
entropy_features = extract_entropy_features(all_data)
wavelet_energy_features = extract_wavelet_energy_features(all_data)

# Combine the time, frequency, and entropy features into a single DataFrame
features = pd.concat([time_features, frequency_features, entropy_features, wavelet_energy_features], axis=1)


In [7]:
features.to_csv("features.csv")
features.head()

Unnamed: 0,amplitude,mean,max,min,stdev,skewness,kurtosis,hjorth_activity,hjorth_mobility,hjorth_complexity,...,sample_entropy,spectral_entropy,energy_band_0,energy_band_1,energy_band_2,energy_band_3,energy_band_4,energy_band_5,energy_band_6,energy_band_7
0,28600.257975,1245.670285,21471.069232,-7129.188743,7780.162127,0.765047,-0.610001,60530920.0,0.043994,6.48883,...,0.004544,0.055527,31317360000.0,202624900.0,2364575.0,273852.478188,7335.565786,10011.650727,17660.037549,11613.001761
1,7506.462109,-4965.798852,-476.438958,-7982.901067,2433.203314,0.304853,-1.31126,5920478.0,0.005689,34.624977,...,0.012692,0.115011,17250960000.0,20532.29,1374.747,1052.824111,176.143959,67.214865,3.894931,3.055691
2,4054.175414,2364.739039,3590.077431,-464.097984,1187.694358,-0.797484,-0.582127,1410618.0,0.00591,50.729171,...,0.00269,0.075448,3576688000.0,15743.83,889.0063,812.85486,145.044263,43.013947,2.302323,1.947904
3,2187.981464,2755.580822,3620.974382,1432.992918,695.837658,-0.312038,-1.245956,484190.0,0.006904,59.065314,...,0.015968,0.136786,4525491000.0,6757.636,384.9655,667.420601,125.16616,32.429763,1.707716,1.597904
4,2362.44708,8.122686,1420.37748,-942.0696,649.347508,0.350397,-0.977942,421652.2,0.006834,68.005797,...,0.017123,0.088179,290369800.0,16947.03,818.4609,814.47183,120.684325,28.069911,2.113378,2.217117


## Feature selection

In [5]:
features = pd.read_csv("features.csv", index_col=0)
features.head()

Unnamed: 0,amplitude,mean,max,min,stdev,skewness,kurtosis,hjorth_activity,hjorth_mobility,hjorth_complexity,...,sample_entropy,spectral_entropy,energy_band_0,energy_band_1,energy_band_2,energy_band_3,energy_band_4,energy_band_5,energy_band_6,energy_band_7
0,28600.26,1245.67,21471.07,-7129.19,7780.16,0.77,-0.61,60530922.73,0.04,6.49,...,0.0,0.06,31317359394.08,202624914.52,2364574.84,273852.48,7335.57,10011.65,17660.04,11613.0
1,7506.46,-4965.8,-476.44,-7982.9,2433.2,0.3,-1.31,5920478.37,0.01,34.62,...,0.01,0.12,17250962984.42,20532.29,1374.75,1052.82,176.14,67.21,3.89,3.06
2,4054.18,2364.74,3590.08,-464.1,1187.69,-0.8,-0.58,1410617.89,0.01,50.73,...,0.0,0.08,3576687818.39,15743.83,889.01,812.85,145.04,43.01,2.3,1.95
3,2187.98,2755.58,3620.97,1432.99,695.84,-0.31,-1.25,484190.05,0.01,59.07,...,0.02,0.14,4525490622.25,6757.64,384.97,667.42,125.17,32.43,1.71,1.6
4,2362.45,8.12,1420.38,-942.07,649.35,0.35,-0.98,421652.19,0.01,68.01,...,0.02,0.09,290369777.33,16947.03,818.46,814.47,120.68,28.07,2.11,2.22


In [6]:
from features.utils import remove_collinear_features

# Remove collinear features
features = remove_collinear_features(features, threshold=0.6)
features.head()

Removed features: ['max', 'min', 'stdev', 'hjorth_activity', 'delta_power', 'alpha_power', 'beta_power', 'gamma_power', 'sample_entropy', 'spectral_entropy', 'energy_band_0', 'energy_band_1', 'energy_band_2', 'energy_band_3', 'energy_band_5', 'energy_band_6', 'energy_band_7']


Unnamed: 0,amplitude,mean,skewness,kurtosis,hjorth_mobility,hjorth_complexity,theta_power,shannon_entropy,energy_band_4
0,28600.26,1245.67,0.77,-0.61,0.04,6.49,422.1,0.3,7335.57
1,7506.46,-4965.8,0.3,-1.31,0.01,34.62,529.88,0.75,176.14
2,4054.18,2364.74,-0.8,-0.58,0.01,50.73,155.72,0.75,145.04
3,2187.98,2755.58,-0.31,-1.25,0.01,59.07,362.7,1.11,125.17
4,2362.45,8.12,0.35,-0.98,0.01,68.01,100.72,1.02,120.68


In [7]:
prop_train = 0.7
n_train = int(prop_train * len(features))

x_train = features[:n_train]
y_train = all_targets[:n_train]

x_val = features[n_train:]
y_val = all_targets[n_train:]

In [None]:
'''
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

scaler = StandardScaler()

x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)

selectKBest = SelectKBest(f_classif, k=20)

x_train_sel = selectKBest.fit_transform(x_train, y_train)
x_val_sel = selectKBest.transform(x_val)
'''

## Find the best model

In [13]:
highmem_classifiers = ["LabelSpreading", "LabelPropagation", "BernoulliNB", "KNeighborsClassifier", "ElasticNetClassifier", "GradientBoostingClassifier", "HistGradientBoostingClassifier", "BaggingClassifier", "RandomForestClassifier", "SVC", "ExtraTreesClassifier", "AdaBoostClassifier", "KNeighborsClassifier"]

# Remove the high memory classifiers from the list
classifiers = [c for c in CLASSIFIERS if c[0] not in highmem_classifiers]
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None, classifiers=classifiers)
models = clf.fit(x_train[:80000], x_val[:50000], y_train[:80000], y_val[:50000])  # Only use a subset of the data for faster computation
models

'tuple' object has no attribute '__name__'
Invalid Classifier(s)


 95%|█████████▌| 21/22 [00:06<00:00,  4.60it/s]

[LightGBM] [Info] Number of positive: 77195, number of negative: 2805
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002941 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.964938 -> initscore=3.314931
[LightGBM] [Info] Start training from score 3.314931


100%|██████████| 22/22 [00:07<00:00,  2.92it/s]


(                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
 Model                                                                           
 LGBMClassifier                     0.83               0.82     0.82      0.83   
 XGBClassifier                      0.78               0.80     0.80      0.78   
 QuadraticDiscriminantAnalysis      0.80               0.79     0.79      0.80   
 ExtraTreeClassifier                0.80               0.79     0.79      0.80   
 GaussianNB                         0.77               0.78     0.78      0.78   
 DecisionTreeClassifier             0.72               0.77     0.77      0.73   
 Perceptron                         0.58               0.68     0.68      0.56   
 CalibratedClassifierCV             0.47               0.59     0.59      0.41   
 PassiveAggressiveClassifier        0.47               0.59     0.59      0.42   
 SGDClassifier                      0.44               0.57     0.57      0.36   
 LinearSVC      