In [1]:
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import mne
import matplotlib.pyplot as plt
import pyvista
import ipywidgets
import ipyevents
import pyvistaqt
import yasa

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.utils import class_weight

import tensorflow as tf
from tensorflow.keras import layers, models

import scipy.signal as signal
from scipy.signal import hilbert

In [2]:
%matplotlib qt

## Importing labelled data

In [3]:
file_path = r"C:\EEG DATA\FL_label_data.pickle"
# The r prefix makes this a raw string, so the backslashes of the Windows path
# are kept literally instead of being read as escape sequences (e.g. \n, \t).
# NOTE(review): absolute local path - consider a configurable data directory.

# Open the pickle file in binary mode and load the labelled data.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(file_path, "rb") as file:
    label_data = pickle.load(file)

# Show the type of the loaded object
print(type(label_data))

<class 'dict'>


In [4]:
def extract_onsets(label_data):
    """Collect, per key, the onsets whose label is 1 or 2.

    Parameters
    ----------
    label_data : dict
        Maps a key to a record exposing ``'label'`` and ``'onset'`` entries
        (scalars or sequences; both are promoted to at least 1-D arrays).

    Returns
    -------
    dict
        Maps each key to the array of onsets found at the positions where the
        label equals 1 or 2. Keys with no such label, or whose matching
        positions fall outside the onset array, are left out and a warning
        line is printed for them.
    """
    onset_dict = {}
    for key, value in label_data.items():
        labels = np.atleast_1d(value['label'])
        onsets = np.atleast_1d(value['onset'])

        # Positions of the entries labelled 1 or 2.
        wanted = np.logical_or(labels == 1, labels == 2)
        indices = np.nonzero(wanted)[0]

        # Skip keys with nothing to extract or with positions that would
        # index past the end of the onset array.
        usable = indices.size > 0 and np.all(indices < len(onsets))
        if not usable:
            print(f"Key: {key}, Warning: The indices do not match")
            continue

        onset_dict[key] = onsets[indices]
    return onset_dict
    # returning the onset_dict and what you're printing
    # should I be only returning what is supposed to be printed? or maybe only the dict, since already has commas?

label_data_onsets = extract_onsets(label_data)



In [5]:
# The onset values passed in are the onsets extracted for labels 1 and 2
# (one array per participant, as returned by extract_onsets).

# Function 1: split the onset values into sublists in which consecutive values differ by exactly `increment` (30 by default); any other difference starts a new sublist.

def group_by_increment(onset_values, increment=30):
    """Split onsets into runs whose consecutive values differ by ``increment``.

    Parameters
    ----------
    onset_values : sequence of numbers (list, tuple or 1-D array)
        Onset values in the order they should be scanned.
    increment : number, default 30
        Required difference between two neighbouring values for them to
        belong to the same run (compared with exact equality).

    Returns
    -------
    list of list of float
        One sublist per run. Runs made of a single value are dropped, so an
        isolated onset never appears in the result. An empty input gives an
        empty list.
    """
    # Guard: the original indexed onset_values[0] unconditionally, which
    # raised IndexError when there were no onsets.
    if len(onset_values) == 0:
        return []

    groups = []
    current_group = [float(onset_values[0])]

    # Walk over neighbouring pairs (previous, current).
    for previous, current in zip(onset_values[:-1], onset_values[1:]):
        if current - previous == increment:
            # Still inside the same run.
            current_group.append(float(current))
            continue

        # Gap found: close the run (only kept if it has more than one value)
        # and start a new one at the current value.
        if len(current_group) > 1:
            groups.append(current_group)
        current_group = [float(current)]

    # The last run is never closed inside the loop, so handle it here.
    if len(current_group) > 1:
        groups.append(current_group)

    return groups

In [7]:
def extract_segments(raw, groups):
    """Crop one copy of ``raw`` per group, from the group's first to last value.

    Parameters
    ----------
    raw : object exposing ``copy()`` and ``crop(tmin=..., tmax=...)``
        The recording to cut; it is copied before every crop.
    groups : iterable of sequences
        Each group supplies ``group[0]`` as ``tmin`` and ``group[-1]`` as
        ``tmax``.

    Returns
    -------
    list
        The cropped copies, in the order of ``groups``.
    """
    # NOTE(review): tmax is the last value of the group itself. If the values
    # are epoch onsets, the epoch starting at that last onset is not part of
    # the segment - confirm this is intended.
    return [
        raw.copy().crop(tmin=group[0], tmax=group[-1])
        for group in groups
    ]

## Spindle detection

In [8]:
#import scipy.signal as signal
#from scipy.signal import find_peaks

def detect_spindles_times(eeg_raw):
    """Detect candidate spindles on channel Fz and return their start/end times.

    Steps applied to a copy of ``eeg_raw`` (the input itself is not modified):
    keep channel 'Fz', band-pass 12-16 Hz, resample to 100 Hz, take the
    magnitude of the Hilbert analytic signal (amplitude envelope), smooth it
    with a 0.2 s moving average, threshold it at its own 75th percentile and
    keep the supra-threshold runs lasting between 0.5 and 3 seconds.

    Parameters
    ----------
    eeg_raw : MNE Raw-like object
        Must contain a channel named 'Fz'.

    Returns
    -------
    list of (start, end) tuples
        Start and end of every retained run, in seconds counted from the
        first sample of the processed signal. Empty list when nothing is
        retained.
    """
    # 1. Keep Fz only and band-pass between 12 and 16 Hz.
    #    pick() replaces the legacy pick_channels().
    fz_raw = eeg_raw.copy().pick(['Fz'])
    fz_raw.filter(l_freq=12, h_freq=16)

    # 2. Downsample to 100 Hz and read back the effective sampling rate,
    #    which converts sample indices to seconds below.
    fz_raw.resample(100)
    sfreq = fz_raw.info['sfreq']
    channel_data = fz_raw.get_data()[0]

    # 3. Amplitude envelope: magnitude of the analytic signal.
    envelope = np.abs(hilbert(channel_data))

    # 4. Smooth the envelope with a 0.2 s moving average; mode='same' keeps
    #    the output the same length as the envelope.
    sliding_window = int(0.2 * sfreq)
    kernel = np.ones(sliding_window) / sliding_window
    smoothed_envelope = np.convolve(envelope, kernel, mode='same')

    # 5. Relative threshold: 75th percentile of the smoothed envelope.
    #    By construction about a quarter of the samples lie above it.
    threshold = np.percentile(smoothed_envelope, 75)
    above_threshold = np.flatnonzero(smoothed_envelope > threshold)
    if above_threshold.size == 0:
        return []

    # 6. Split the supra-threshold indices into runs of consecutive samples.
    #    A jump larger than 1 between neighbours marks the end of a run; the
    #    last run is closed by the final index, so no separate pass is needed.
    gaps = np.flatnonzero(np.diff(above_threshold) > 1)
    run_starts = above_threshold[np.concatenate(([0], gaps + 1))]
    run_ends = above_threshold[np.concatenate((gaps, [above_threshold.size - 1]))]

    # 7. Keep only runs lasting 0.5 to 3 seconds.
    spindles = []
    for start_idx, end_idx in zip(run_starts, run_ends):
        duration = (end_idx - start_idx) / sfreq
        if 0.5 <= duration <= 3:
            spindles.append((start_idx / sfreq, end_idx / sfreq))

    return spindles

In [9]:
import scipy.signal as signal
from scipy.signal import find_peaks

def detect_spindles_peaks(eeg_raw):
    """Detect candidate spindles on channel Fz and return their peak times.

    Steps applied to a copy of ``eeg_raw`` (the input itself is not modified):
    keep channel 'Fz', band-pass 12-16 Hz, resample to 100 Hz, take the
    magnitude of the Hilbert analytic signal (amplitude envelope), smooth it
    with a 0.2 s moving average, threshold it at its own 75th percentile and
    keep the supra-threshold runs lasting between 0.5 and 3 seconds. For each
    retained run the peak is the sample where the band-passed signal (not the
    envelope) is largest.

    Parameters
    ----------
    eeg_raw : MNE Raw-like object
        Must contain a channel named 'Fz'.

    Returns
    -------
    list of float
        Peak time of every retained run, in seconds counted from the first
        sample of the processed signal. Empty list when nothing is retained.
    """
    # 1. Keep Fz only and band-pass between 12 and 16 Hz.
    #    pick() replaces the legacy pick_channels().
    fz_raw = eeg_raw.copy().pick(['Fz'])
    fz_raw.filter(l_freq=12, h_freq=16)

    # 2. Downsample to 100 Hz and read back the effective sampling rate,
    #    which converts sample indices to seconds below.
    fz_raw.resample(100)
    sfreq = fz_raw.info['sfreq']
    channel_data = fz_raw.get_data()[0]

    # 3. Amplitude envelope: magnitude of the analytic signal.
    envelope = np.abs(hilbert(channel_data))

    # 4. Smooth the envelope with a 0.2 s moving average; mode='same' keeps
    #    the output the same length as the envelope.
    sliding_window = int(0.2 * sfreq)
    kernel = np.ones(sliding_window) / sliding_window
    smoothed_envelope = np.convolve(envelope, kernel, mode='same')

    # 5. Relative threshold: 75th percentile of the smoothed envelope.
    #    By construction about a quarter of the samples lie above it.
    threshold = np.percentile(smoothed_envelope, 75)
    above_threshold = np.flatnonzero(smoothed_envelope > threshold)
    if above_threshold.size == 0:
        return []

    # 6. Split the supra-threshold indices into runs of consecutive samples.
    #    A jump larger than 1 between neighbours marks the end of a run; the
    #    last run is closed by the final index, so no separate pass is needed.
    gaps = np.flatnonzero(np.diff(above_threshold) > 1)
    run_starts = above_threshold[np.concatenate(([0], gaps + 1))]
    run_ends = above_threshold[np.concatenate((gaps, [above_threshold.size - 1]))]

    # 7. Keep only runs lasting 0.5 to 3 seconds and locate their peak.
    spindles = []
    for start_idx, end_idx in zip(run_starts, run_ends):
        duration = (end_idx - start_idx) / sfreq
        if 0.5 <= duration <= 3:
            # end_idx is itself above threshold, so the slice must include it
            # (the previous exclusive slice skipped the run's last sample).
            segment = channel_data[start_idx:end_idx + 1]
            peak_idx = start_idx + np.argmax(segment)
            spindles.append(peak_idx / sfreq)

    return spindles

### Import EEG data

In [10]:
participant_067_file = r"C:\EEG DATA\067\eeg\TMR.vhdr"

participant_067_raw = mne.io.read_raw_brainvision(vhdr_fname=participant_067_file, preload=True)

Extracting parameters from C:\EEG DATA\067\eeg\TMR.vhdr...
Setting channel info structure...
Reading 0 ... 14024599  =      0.000 ... 28049.198 secs...


### Extract onset times

In [12]:
label_data_onsets_067 = label_data_onsets['067']
groups_067 = group_by_increment(label_data_onsets_067, increment=30)
groups_067

[[1650.0,
  1680.0,
  1710.0,
  1740.0,
  1770.0,
  1800.0,
  1830.0,
  1860.0,
  1890.0,
  1920.0,
  1950.0,
  1980.0,
  2010.0,
  2040.0,
  2070.0,
  2100.0,
  2130.0,
  2160.0,
  2190.0,
  2220.0,
  2250.0,
  2280.0,
  2310.0,
  2340.0,
  2370.0,
  2400.0,
  2430.0,
  2460.0,
  2490.0,
  2520.0,
  2550.0,
  2580.0,
  2610.0,
  2640.0,
  2670.0,
  2700.0,
  2730.0,
  2760.0,
  2790.0],
 [2940.0, 2970.0, 3000.0, 3030.0, 3060.0, 3090.0, 3120.0, 3150.0],
 [3360.0, 3390.0, 3420.0, 3450.0, 3480.0, 3510.0, 3540.0, 3570.0, 3600.0],
 [3690.0, 3720.0, 3750.0, 3780.0, 3810.0, 3840.0, 3870.0, 3900.0, 3930.0],
 [3990.0,
  4020.0,
  4050.0,
  4080.0,
  4110.0,
  4140.0,
  4170.0,
  4200.0,
  4230.0,
  4260.0,
  4290.0,
  4320.0,
  4350.0,
  4380.0,
  4410.0,
  4440.0,
  4470.0,
  4500.0,
  4530.0,
  4560.0,
  4590.0,
  4620.0,
  4650.0,
  4680.0,
  4710.0,
  4740.0,
  4770.0,
  4800.0,
  4830.0,
  4860.0,
  4890.0,
  4920.0,
  4950.0,
  4980.0,
  5010.0,
  5040.0,
  5070.0,
  5100.0,
  5130.0,
  

### Concatenate raw data

In [13]:
# Cut the recording into one raw segment per group of consecutive onsets.
segments_067 = extract_segments(participant_067_raw, groups_067)

# NOTE(review): combined_raw_067 is only assigned when at least one segment
# exists; with an empty list the later cells would fail with a NameError.
if segments_067:
    combined_raw_067 = mne.concatenate_raws(segments_067)
    # concatenates the raw segments as if they were continuous;
    # the boundaries between the segments are annotated as bad
    combined_raw_067.pick(["Fz"]).filter(l_freq=0.1, h_freq=40)
    # keep only Fz and band-pass it to 0.1-40 Hz; the result is not
    # reassigned, so this relies on both calls modifying combined_raw_067

Filtering raw data in 30 contiguous segments
Setting up band-pass filter from 0.1 - 40 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal bandpass filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuation
- Lower passband edge: 0.10
- Lower transition bandwidth: 0.10 Hz (-6 dB cutoff frequency: 0.05 Hz)
- Upper passband edge: 40.00 Hz
- Upper transition bandwidth: 10.00 Hz (-6 dB cutoff frequency: 45.00 Hz)
- Filter length: 16501 samples (33.002 s)



### Spindle detection

In [27]:
spindles_067_times = detect_spindles_times(combined_raw_067)

NOTE: pick_channels() is a legacy function. New code should use inst.pick(...).
Filtering raw data in 30 contiguous segments
Setting up band-pass filter from 12 - 16 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal bandpass filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuation
- Lower passband edge: 12.00
- Lower transition bandwidth: 3.00 Hz (-6 dB cutoff frequency: 10.50 Hz)
- Upper passband edge: 16.00 Hz
- Upper transition bandwidth: 4.00 Hz (-6 dB cutoff frequency: 18.00 Hz)
- Filter length: 551 samples (1.102 s)



In [28]:
len(spindles_067_times)

3072

In [20]:
spindles_starts_067, spindles_ends_067 = zip(*spindles_067_times) if spindles_067_times else([],[])
# splits the spindle list into two list
# one with all the spindle starts
# and one with all the spindle ends

# if statement is in case found no spindles
# would just return empty lists

In [24]:
print(len(spindles_starts_067))
print(len(spindles_ends_067))

3072
3072


## Epoch the data

### Define 1-second epochs of EEG data

### Resample the EEG data (so it has same sampling rate as the data after spindle detection)

In [30]:
combined_raw_067.resample(100)

Unnamed: 0,General,General.1
,Filename(s),TMR.eeg  TMR.eeg  TMR.eeg  TMR.eeg  TMR.eeg  TMR.eeg  TMR.eeg  TMR.eeg  TMR.eeg  TMR.eeg  TMR.eeg  TMR.eeg  TMR.eeg  TMR.eeg  TMR.eeg  TMR.eeg  TMR.eeg  TMR.eeg  TMR.eeg  TMR.eeg  TMR.eeg  TMR.eeg  TMR.eeg  TMR.eeg  TMR.eeg  TMR.eeg  TMR.eeg  TMR.eeg  TMR.eeg  TMR.eeg
,MNE object type,RawBrainVision
,Measurement date,2023-09-06 at 23:28:45 UTC
,Participant,Unknown
,Experimenter,Unknown
,Acquisition,Acquisition
,Duration,04:38:30 (HH:MM:SS)
,Sampling frequency,100.00 Hz
,Time points,1671000
,Channels,Channels


In [82]:
combined_raw_067.info['sfreq']

100.0

In [76]:
# use mne function to make epochs

epochs_067 = mne.make_fixed_length_epochs(
    combined_raw_067,
    duration=1.0,          
    overlap=0.0,           
    preload=True,
    reject_by_annotation=False
)

# don't reject bad epochs to ensure that matches with spindles data

Not setting metadata
16710 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 16710 events and 100 original time points ...
0 bad epochs dropped


In [77]:
epochs_067.get_data().shape

(16710, 1, 100)

In [83]:
# NOTE(review): this scales a private attribute in place, so the cell is not
# idempotent - re-running it multiplies the data by 1e6 again. Run it exactly
# once after the epochs are created.
epochs_067._data *= 1e6

# to convert from volts to microvolts
# otherwise units are really small and harder to detect events for CNN model

###  Define labels for each 1-second epoch

In [84]:
def label_spindle_epochs(epochs, spindle_starts, spindle_ends, epoch_length_sec=1):
    """Label every fixed-length epoch that overlaps at least one spindle.

    Epoch ``i`` is taken to span ``[i * epoch_length_sec,
    (i + 1) * epoch_length_sec)``. It gets label 1 when some spindle
    satisfies ``start < epoch_end`` and ``end > epoch_start``, else 0.

    Parameters
    ----------
    epochs : sized object
        Only ``len(epochs)`` is used (the number of epochs).
    spindle_starts, spindle_ends : sequences of numbers
        Start and end time of each detected spindle, in seconds. They are
        paired by position; extra items in the longer one are ignored.
    epoch_length_sec : number, default 1
        Duration of one epoch in seconds; expected to be positive.

    Returns
    -------
    numpy.ndarray of int, shape (len(epochs),)
        1 for epochs overlapping a spindle, 0 otherwise.
    """
    n_epochs = len(epochs)
    epoch_starts = np.arange(n_epochs) * epoch_length_sec
    epoch_ends = epoch_starts + epoch_length_sec
    epoch_labels = np.zeros(n_epochs, dtype=int)

    starts = np.asarray(spindle_starts, dtype=float)
    ends = np.asarray(spindle_ends, dtype=float)
    # Pair by position, stopping at the shorter sequence.
    n_pairs = min(starts.size, ends.size)
    starts, ends = starts[:n_pairs], ends[:n_pairs]
    # A NaN bound can never satisfy the overlap test, so drop such pairs.
    valid = ~(np.isnan(starts) | np.isnan(ends))
    starts, ends = starts[valid], ends[valid]

    # Epoch boundaries are sorted, so the overlapping epochs of a spindle form
    # one contiguous index range. This replaces a scan over every
    # (spindle, epoch) pair with two binary searches per spindle.
    # first: index of the first epoch whose end is strictly after the start.
    first = np.searchsorted(epoch_ends, starts, side='right')
    # stop: number of epochs whose start is strictly before the spindle end.
    stop = np.searchsorted(epoch_starts, ends, side='left')

    for lo, hi in zip(first, stop):
        # An empty slice (lo >= hi) means no epoch overlaps this spindle.
        epoch_labels[lo:hi] = 1

    return epoch_labels

In [85]:
epoch_labels_067 = label_spindle_epochs(epochs_067, spindles_starts_067, spindles_ends_067)

In [86]:
epoch_labels_067[:10]

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 0])

In [87]:
spindles_starts_067[:10]

(np.float64(7.95),
 np.float64(14.93),
 np.float64(43.64),
 np.float64(45.79),
 np.float64(47.28),
 np.float64(49.45),
 np.float64(61.61),
 np.float64(66.12),
 np.float64(79.36),
 np.float64(85.38))

In [88]:
spindles_ends_067[:10]

(np.float64(8.6),
 np.float64(15.51),
 np.float64(44.83),
 np.float64(46.78),
 np.float64(47.93),
 np.float64(50.08),
 np.float64(62.56),
 np.float64(66.76),
 np.float64(79.89),
 np.float64(85.95))

## Building the CNN

### Prepare EEG data for CNN input

### Build the model

In [89]:
def build_cnn_model(input_shape=(100, 1)):
    """Build and compile the multi-kernel CNN + GRU binary classifier.

    The input feeds three parallel convolutional branches that differ only in
    kernel size (5, 11 and 21). Each branch zero-pads the input by half the
    kernel size on both sides, applies a 10-filter 'valid' Conv1D with stride
    1, a LeakyReLU, a max-pooling of size 2 and a batch normalisation. The
    branch outputs are concatenated and passed through a 64-unit GRU, a
    64-unit ReLU dense layer with 50 % dropout and a single sigmoid unit.

    Parameters
    ----------
    input_shape : tuple, default (100, 1)
        Shape of one input sample, passed to the Keras ``Input`` layer.

    Returns
    -------
    tf.keras Model
        Compiled with the Adam optimizer, binary cross-entropy loss and the
        accuracy metric.
    """
    keras_layers = tf.keras.layers
    input_layer = keras_layers.Input(shape=input_shape)

    def conv_branch(kernel_size):
        # Padding by kernel_size // 2 (2, 5, 10) before the 'valid'
        # convolution keeps the sequence length unchanged for odd kernels.
        branch = keras_layers.ZeroPadding1D(padding=kernel_size // 2)(input_layer)
        branch = keras_layers.Conv1D(
            filters=10, kernel_size=kernel_size, strides=1, padding='valid'
        )(branch)
        branch = keras_layers.LeakyReLU(alpha=0.01)(branch)
        branch = keras_layers.MaxPooling1D(pool_size=2)(branch)
        return keras_layers.BatchNormalization()(branch)

    # Branches are built in the order 5, 11, 21 so layers are created in the
    # same sequence as before.
    branches = [conv_branch(kernel_size) for kernel_size in (5, 11, 21)]
    merged = keras_layers.Concatenate()(branches)

    # Recurrent summary of the merged feature sequence.
    recurrent = keras_layers.GRU(64)(merged)

    # Dense head; dropout is there to limit overfitting.
    hidden = keras_layers.Dense(64, activation='relu')(recurrent)
    hidden = keras_layers.Dropout(0.5)(hidden)

    # Single sigmoid unit for the binary decision.
    output = keras_layers.Dense(1, activation='sigmoid')(hidden)

    model = tf.keras.models.Model(inputs=input_layer, outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [90]:
input_shape = (100, 1)
cnn_model = build_cnn_model(input_shape)
cnn_model.summary()



In [91]:
epochs_067_np = np.array(epochs_067)
epochs_067_np = epochs_067_np.squeeze(axis=1)
# reshape to go from (16710, 1, 100) to (16710, 100)
print(epochs_067_np.shape)
print(epoch_labels_067.shape)

(16710, 100)
(16710,)


In [92]:
epochs_067_np[:5]
# do I need to convert back to correct unit?

array([[  0.09903025,  -0.24223909,  -1.21410004,  -1.96946127,
         -1.55216253,  -0.98461706,  -2.56594618,  -2.20750312,
          0.88334022,   3.18598336,   2.87988113,   1.34308405,
          1.82961451,   0.24389127,  -1.16891375,   0.46549835,
          0.35997208,  -0.45700827,  -2.14105873,  -4.04281693,
         -3.11719163,  -1.43340696,   0.45855997,   0.37620958,
         -0.87682963,   1.13601301,   2.52063196,   4.0420954 ,
          6.75054792,  10.51176425,  14.19537222,  11.68838946,
          9.69750434,   8.96449328,   6.7854667 ,   4.85932119,
          3.17424733,   1.97351461,  -0.85384191,  -2.99914857,
         -5.52888067,  -8.12274585, -10.28168662, -11.63217793,
        -12.31193521, -13.7051641 , -12.60996424, -10.36984018,
         -8.0356544 ,  -5.60597266,  -2.61708705,  -1.26840108,
         -1.22665561,  -0.61210984,  -1.80820875,  -1.3003169 ,
         -0.62498953,  -2.02394163,  -2.89565194,  -4.10086994,
         -5.20493944,  -6.08279338,  -4.

### Prepare X and y train and test sets

In [93]:
# split into X and y (labels) data
X = epochs_067_np
y = epoch_labels_067

# split into train and test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# stratify = y to ensure that same proportion of classes in both training and test set

### Train the model

In [63]:
#class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
#class_weight_dict ={0: class_weights[0], 1: class_weights[1]}

In [94]:
training_info = cnn_model.fit(X_train, y_train, validation_split=0.2, epochs=50, batch_size=64)

Epoch 1/50
[1m168/168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 31ms/step - accuracy: 0.6805 - loss: 0.5977 - val_accuracy: 0.8455 - val_loss: 0.3901
Epoch 2/50
[1m168/168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 0.8235 - loss: 0.4248 - val_accuracy: 0.8369 - val_loss: 0.3982
Epoch 3/50
[1m168/168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 27ms/step - accuracy: 0.8403 - loss: 0.3886 - val_accuracy: 0.8534 - val_loss: 0.3622
Epoch 4/50
[1m168/168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 26ms/step - accuracy: 0.8422 - loss: 0.3855 - val_accuracy: 0.8500 - val_loss: 0.3658
Epoch 5/50
[1m168/168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 26ms/step - accuracy: 0.8440 - loss: 0.3739 - val_accuracy: 0.8549 - val_loss: 0.3430
Epoch 6/50
[1m168/168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 27ms/step - accuracy: 0.8476 - loss: 0.3545 - val_accuracy: 0.8534 - val_loss: 0.3483
Epoch 7/50
[1m168/168

### Evaluate the model

##### Plot the training history

In [95]:
def plot_training_history(training_info):
  """Plot loss and accuracy curves of a Keras training run side by side.

  Parameters
  ----------
  training_info : object with a ``history`` mapping
      As returned by ``model.fit``. The left panel uses the 'loss' and
      'val_loss' entries, the right panel 'accuracy' and 'val_accuracy'.
      Entries that are missing are skipped instead of raising, so a run
      without validation data or without the accuracy metric still plots
      whatever is available.
  """
  history = training_info.history
  fig, axs = plt.subplots(1, 2, figsize=(16, 5))

  panels = (("loss", "Loss"), ("accuracy", "Accuracy"))
  for ax, (metric, ylabel) in zip(axs, panels):
    drawn = False
    for key, label in ((metric, "training set"), ("val_" + metric, "validation set")):
      # Explicit key check replaces the former bare `except: pass`, which
      # also hid unrelated errors.
      if key in history:
        ax.plot(history[key], label=label)
        drawn = True
    ax.set_xlabel("Epoch")
    ax.set_ylabel(ylabel)
    ax.grid(True)
    if drawn:
      # Only add a legend when there is at least one curve to describe.
      ax.legend()

  plt.show()

plot_training_history(training_info)

##### Assess the model on the test set

In [96]:
cnn_model.evaluate(X_test, y_test)

[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8508 - loss: 0.4055 


[0.43038758635520935, 0.8470975756645203]

##### Metrics: precision, recall, f1-score

In [97]:
y_pred = cnn_model.predict(X_test)
y_pred_labels = (y_pred > 0.5).astype(int)

print(confusion_matrix(y_test, y_pred_labels))
print(classification_report(y_test, y_pred_labels))

[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step   
[[2021  222]
 [ 289  810]]
              precision    recall  f1-score   support

           0       0.87      0.90      0.89      2243
           1       0.78      0.74      0.76      1099

    accuracy                           0.85      3342
   macro avg       0.83      0.82      0.82      3342
weighted avg       0.85      0.85      0.85      3342

