In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import matplotlib as mpl
from cycler import cycler
import scipy.io

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Some words on this notebook

Under the pretext of 'analysing' the EEG files a bit, I created this notebook, which in reality is more of a playground for me to get familiar with the data. For this purpose I wanted to investigate how to extract the information from the .mat files, how to plot the data of the EEG files, etc.
I also produced some artificial data and tried to find patterns that could be used to make a prediction.
I also wanted to see if it was possible to use the image of a spectrogram for the same purposes.
All this without understanding really anything about EEG.

After a bit of research I realized it could have been interesting to know what kind of montage was used to produce these EEG values, so that we could train an algorithm to identify which electrodes (and the value differences between them) are more likely to yield a good prediction. However, after looking at the EEG plot (below) it seems that the EEGs were produced using one common reference for all electrode measurements.


So, I don't want to spend too much time getting educated on the vast world of neuroscience interpretations. Since I wasn't able to -very- easily find something visually useful, I will stop this notebook and instead go directly into trying to train an RNN algorithm for predictions.

In [None]:
# Visualizing an example: load one interictal segment of Patient_1 and
# print every (key, value) pair stored in the .mat file.
interictal_tst = '/kaggle/input/seizure-prediction/Patient_1/Patient_1/Patient_1_interictal_segment_0001.mat'
# scipy.io.loadmat returns the file contents as a dictionary
interictal_data = scipy.io.loadmat(interictal_tst)
for key_value_pair in interictal_data.items():
    print(key_value_pair)

In [None]:
# Display the record stored under the segment key of the dictionary loaded in
# the previous cell; dict.get yields None (no output) if the key is absent.
interictal_data.get('interictal_segment_1')

Check for data inconsistencies: for example, verify that the number of data samples in each file equals the recording duration (in seconds) multiplied by the sampling rate (samples per second).

In [None]:
# Consistency check over every file of the data set: the number of samples per
# channel must equal round(duration_in_seconds * samples_per_second).
search_key = "_segment_"       # substring identifying the data entry of a loaded .mat file

for dirname, _, filenames in os.walk('/kaggle/input/seizure-prediction/'):
    for filename in filenames:
        if filename == "sampleSubmission.csv":
            continue
        file_path = os.path.join(dirname, filename)
        try:
            segment_dict = scipy.io.loadmat(file_path)        # loads the .mat file as a dictionary
            # keep only the entries whose key contains the search key
            segment_list_of_values = [value for key, value in segment_dict.items() if search_key in key]
            record = segment_list_of_values[0][0][0]
            # Field meaning follows the names used throughout this notebook:
            # [0] sample matrix, [1] duration in seconds, [2] samples per second.
            # Plain float() replaces the np.float alias, which no longer exists
            # in current NumPy releases.
            total_num_dataSamples = float(record[0].shape[1])
            datarows_duration_secs = float(np.squeeze(record[1]))
            datasamples_perSecond = float(np.squeeze(record[2]))
        except Exception as error:
            # Report the unreadable/unexpected file and carry on with the next
            # one, so a single bad file does not hide the rest of its directory.
            print("Exception %s caught on file %s " % (type(error), file_path))
            continue

        if total_num_dataSamples != np.around(datarows_duration_secs * datasamples_perSecond):
            print("Data inconsistency in file: %s" % file_path)
print("Check done.")

Do all the files of the same patient or dog (i.e. Patient_1 or Patient_2 or Dog_1, etc.) have the same number of electrodes?
And do they all have the same names?

In [None]:
# For every subject directory, compare each file with the previously analysed
# file of the same directory and report differences in the number of
# electrodes or in the electrode names.
search_key = "_segment_"       # substring identifying the data entry of a loaded .mat file
subdirs = [
    '/kaggle/input/seizure-prediction/Patient_1/Patient_1',
    '/kaggle/input/seizure-prediction/Patient_2/Patient_2',
    '/kaggle/input/seizure-prediction/Dog_1/Dog_1',
    '/kaggle/input/seizure-prediction/Dog_2/Dog_2',
    '/kaggle/input/seizure-prediction/Dog_3/Dog_3',
    '/kaggle/input/seizure-prediction/Dog_4/Dog_4',
    '/kaggle/input/seizure-prediction/Dog_5/Dog_5',
]

for subdir in subdirs:
    # distinct loop names: the walk no longer overwrites the outer loop variable
    for current_dir, _, filenames in os.walk(subdir):
        # State carried over from the previously analysed file of this
        # directory; None means that no file has been analysed yet.
        numElectrodes_lastFile = None
        electrodeNames_lastFile = None
        lastFileName = ""
        for filename in filenames:
            matFile = scipy.io.loadmat(os.path.join(current_dir, filename))
            # keep only the entries whose key contains the search key
            segment_list_values = [value for key, value in matFile.items() if search_key in key]
            electrode_labels = segment_list_values[0][0][0][3][0]
            # number and names of the electrodes on the currently analyzed file
            numElectrodes_currentFile = electrode_labels.shape[0]
            electrode_names = [name[0] for name in electrode_labels]

            if numElectrodes_lastFile is not None:
                # check if number of electrodes between the current file and the
                # last file analyzed are equal
                if numElectrodes_currentFile != numElectrodes_lastFile:
                    print("Discrepancy in number of electrodes between files:")
                    print("     Last file    %s with %d electrodes" % (lastFileName, numElectrodes_lastFile))
                    print("     Current file %s with %d electrodes" % (filename, numElectrodes_currentFile))
                # Compare the name sets in both directions, so a name present in
                # only one of the two files is always reported.
                if set(electrode_names) != set(electrodeNames_lastFile):
                    print("Different electrode names on files: ")
                    print("     %s (last faile name)" % lastFileName)
                    print("         Electrode names: ", electrodeNames_lastFile)
                    print("     %s " % filename)
                    print("         Electrode names: ", electrode_names)

            numElectrodes_lastFile = numElectrodes_currentFile
            electrodeNames_lastFile = electrode_names
            lastFileName = filename
print("Check done")

Here's an example of what an EEG looks like.

In [None]:
# Code in this cell was partially taken from:
# https://matplotlib.org/3.3.1/gallery/specialty_plots/mri_with_eeg.html
#
# Draws every electrode of one segment file as a separate trace; the traces
# are stacked vertically, one tick (labelled with the electrode name) per trace.

import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
import numpy as np

file_to_inspect = scipy.io.loadmat('/kaggle/input/seizure-prediction/Patient_1/Patient_1/Patient_1_interictal_segment_0001.mat')
# keep only the entries whose key contains the search key
search_key = '_segment_'
segment = [value for key, value in file_to_inspect.items() if search_key in key]
data = segment[0][0][0][0]
num_electrodes = data.shape[0]             # rows (i.e. electrodes) in the data matrix
n_samples = data.shape[1]                  # number of samples on each row (i.e. electrode's samples)
electrode_names = segment[0][0][0][3][0]   # name or labels of the electrodes
# Each label is itself wrapped in an array; unwrap it so the tick shows the bare name.
electrode_labels = [name[0] for name in electrode_names]
# The time axis is spread over 0..10 to match the 'minutes' axis label below.
# NOTE(review): assumes the segment spans 10 minutes - confirm against the
# duration stored in the file.
t = 10 * np.arange(n_samples) / n_samples

# create the 'figure'
fig = plt.figure("EEG samples", figsize=(30, 15))
ax = fig.add_subplot(1, 1, 1)
ax.set_xlim(0, 10)
ax.set_xticks(np.arange(10))

# Convert the extremes to Python floats before subtracting: if the samples are
# stored in a narrow integer type, max - min could otherwise overflow.
data_min = float(data.min())
data_max = float(data.max())
dr = (data_max - data_min) * 0.7   # vertical distance between traces (crowd it a bit)
ax.set_ylim(data_min, (num_electrodes - 1) * dr + data_max)

# Each trace is shifted by its own offset directly in data coordinates, so the
# result does not depend on LineCollection's offsets / transOffset keywords.
# NOTE(review): `transOffset` was renamed `offset_transform` in newer
# Matplotlib releases - confirm against the installed version.
ticklocs = [electrode * dr for electrode in range(num_electrodes)]
segs = [
    np.column_stack((t, data[electrode, :].astype(float) + ticklocs[electrode]))
    for electrode in range(num_electrodes)
]

lines = LineCollection(segs)
ax.add_collection(lines)

# One tick per trace, placed at the trace's vertical offset
ax.set_yticks(ticklocs)
ax.set_yticklabels(electrode_labels)

ax.set_xlabel('Time (minutes)')

plt.show()

Figure out if the average maximum and average minimum levels of each one of the subjects (for both the interictal and preictal measurements) are similar (or near to each other).
In the following cells it will be done:
 * Create one .csv file per subject for its interictal measurements and one for its preictal measurements. Each .csv file contains, for every channel, the maximum and the minimum value found in each one of the corresponding (interictal or preictal) .mat segment files
 * Plot all the averages to be able to make a visual comparison

In [None]:
# For every subject directory, collect the maximum and minimum sample value of
# each electrode in each interictal / preictal segment file and write them to
# "<directory name>_interictal.csv" and "<directory name>_preictal.csv" in the
# current working directory.
search_key = "_segment_"       # substring identifying the data entry of a loaded .mat file
subdirs = [
    '/kaggle/input/seizure-prediction/Patient_1/Patient_1',
    '/kaggle/input/seizure-prediction/Patient_2/Patient_2',
    '/kaggle/input/seizure-prediction/Dog_1/Dog_1',
    '/kaggle/input/seizure-prediction/Dog_2/Dog_2',
    '/kaggle/input/seizure-prediction/Dog_3/Dog_3',
    '/kaggle/input/seizure-prediction/Dog_4/Dog_4',
    '/kaggle/input/seizure-prediction/Dog_5/Dog_5',
]
for subdir in subdirs:
    # distinct loop names: the walk no longer overwrites the outer loop variable
    for current_dir, _, filenames in os.walk(subdir):
        # extremes[marker][column][electrode name] -> list with one value per file;
        # the markers are tested in this order, like the original if / elif chain
        extremes = {
            '_interictal_': {'max': {}, 'min': {}},
            '_preictal_': {'max': {}, 'min': {}},
        }
        print("In directory: ", os.path.basename(os.path.normpath(current_dir)))
        # process the files of the current directory in alphabetical order
        for file in sorted(filenames):
            marker = next((candidate for candidate in extremes if candidate in file), None)
            if marker is None:
                # Neither interictal nor preictal: its values would be thrown
                # away anyway, so do not even load the file.
                continue
            matFile = scipy.io.loadmat(os.path.join(current_dir, file))
            # keep only the entries whose key contains the search key
            segment_list_values = [value for key, value in matFile.items() if search_key in key]
            record = segment_list_values[0][0][0]
            for idx, electrode in enumerate(record[3][0]):
                channel = record[0][idx]
                # .item() stores plain Python numbers, so the text written to the
                # CSV does not depend on how NumPy prints its scalar types.
                extremes[marker]['max'].setdefault(electrode[0], []).append(channel.max().item())
                extremes[marker]['min'].setdefault(electrode[0], []).append(channel.min().item())

        # for now path is just the name of the folder holding the files
        basePath = os.path.basename(os.path.normpath(current_dir))
        for marker, suffix in (('_interictal_', "_interictal.csv"), ('_preictal_', "_preictal.csv")):
            path = basePath + suffix
            # rows: electrode names; columns: 'max' and 'min'; cells: lists of values
            df = pd.DataFrame(extremes[marker])
            df.to_csv(path)
            print("File %s written." % path)
print("Done")

In [None]:
def plotMaxMinAveragesDF(dataframe, title):
    """Plot the 'max' and 'min' value lists of every row of a CSV file.

    Parameters
    ----------
    dataframe : path or file-like object accepted by ``pd.read_csv``
        CSV whose first column is the row index (one row per electrode) and
        whose 'max' and 'min' columns each hold the text form of a list.
    title : str
        Title of the figure.

    Every row produces two lines (its 'max' list and its 'min' list) drawn in
    the same colour; the legend shows the row label once per pair.
    """
    # NOTE(review): eval executes whatever expression the CSV cell contains.
    # Only use this on CSV files generated by this notebook, never on
    # untrusted input.
    df = pd.read_csv(dataframe, converters={'max': eval, 'min':eval}, index_col=0)
    fig = plt.figure(figsize=(30,15))
    ax = fig.add_subplot(1,1,1)
    # One colour per electrode, used twice in a row so that the 'max' and the
    # 'min' line of an electrode match.
    pair_colors = [
        '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b',
        '#e377c2', '#7f7f7f', '#bcbd22', '#17becf', '#aaaaaa', '#123456',
        '#cdcdcd', '#ff00ff', '#000000', '#174038', '#362819', '#4B0082',
        '#76EE00', '#8B7500', '#FF4500', '#FF83FA', '#FFE600', '#CC4E5C',
    ]
    # Set the cycle on this axes only: changing mpl.rcParams after the axes has
    # been created would not affect it, and would leak into every later plot.
    ax.set_prop_cycle(cycler(color=[color for color in pair_colors for _ in range(2)]))
    legend = []
    for electrode, row in df.iterrows():
        # one legend entry per line: the name for 'max', an empty one for 'min'
        legend.extend([electrode, ''])
        for values in row:
            ax.plot(values, 'o:')
    ax.legend(legend)
    ax.set_title(title)
    plt.show()

# Plot every min/max CSV file found below the current working directory, in
# alphabetical order of the file names.
averagesPaths = []
for dirname, _, filenames in os.walk('./'):
    for f in filenames:
        # Only the files produced by the cell that writes the CSVs; anything
        # else found in the working directory cannot be plotted by
        # plotMaxMinAveragesDF.
        if '_notebook_' not in f and f.endswith(('_interictal.csv', '_preictal.csv')):
            # keep the directory together with the name, so the path stays
            # valid whichever directory the file was found in
            averagesPaths.append(os.path.join(dirname, f))
averagesPaths.sort(key=os.path.basename)      # sort once, after collecting
averagesFiles = [os.path.basename(path) for path in averagesPaths]
for path, f in zip(averagesPaths, averagesFiles):
    plotMaxMinAveragesDF(path, f)

Observations on the plots:
 * There seems to exist some sort of symmetry between the positive and negative averages curves.
 * The values of both positive and negative curves are different between dogs for all the interictal and preictal curves, as well as the values for the human patients. That is, there doesn't seem to be a relation between the positive and negative values of the different dog patients nor of the different human patients.
 * On the human patients, the values of the curves during the preictal measurements get very close to zero forming an almost flat line pattern.
 * On the dogs, the values of the curves during the preictal measurements seem to follow some sort of alignment pattern which doesn't seem to exist on the interictal measurements. This sharpens the symmetry pattern on the preictal curves.

 In general it is not clear if there is really information on these graphs that could be used as a clear indication of whether and when a seizure is going to occur, other than the fact that the values of the electrodes (all of them) seem to stay close around the same values during a preictal phase.

 Maybe an alternative for visual inspection could be to produce a spectrogram of the patients' EEGs.

How does a spectrogram of an EEG electrode look? Could we use it to make a prediction?

In [None]:
def plotSpectograms(file, electrode=0):
    """Plot the signal of one electrode and, below it, its spectrogram.

    Parameters
    ----------
    file : str
        Path of the .mat segment file to load.
    electrode : int, default 0
        Row index of the electrode inside the sample matrix of the file.
    """
    search_key = "_segment_"       # substring identifying the data entry of the loaded file
    matFile = scipy.io.loadmat(file)
    segment_list_values = [value for key, value in matFile.items() if search_key in key]
    record = segment_list_values[0][0][0]

    # Plain float replaces the np.float alias, which no longer exists in
    # current NumPy releases.
    x = np.asarray(record[0][electrode], dtype=float)
    Fs = float(np.squeeze(record[2]))      # samples per second
    NFFT = 1024  # the length of the windowing segments
    # One time stamp per sample, derived from the signal itself: t always has
    # the same length as x, whatever the duration or sampling rate of the file.
    t = np.arange(x.size) / Fs

    fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(15,10))
    ax1.plot(t, x)
    Pxx, freqs, bins, im = ax2.specgram(x, NFFT=NFFT, Fs=Fs, noverlap=100, sides='twosided')
    # The `specgram` method returns 4 objects. They are:
    # - Pxx: the periodogram
    # - freqs: the frequency vector
    # - bins: the centers of the time bins
    # - im: the .image.AxesImage instance representing the data in the plot
    plt.show()
    
# Segment files to inspect: an interictal / preictal pair of one human patient
# followed by an interictal / preictal pair of one dog.
file = [
    '/kaggle/input/seizure-prediction/Patient_1/Patient_1/Patient_1_interictal_segment_0001.mat',
    '/kaggle/input/seizure-prediction/Patient_1/Patient_1/Patient_1_preictal_segment_0001.mat',
    '/kaggle/input/seizure-prediction/Dog_3/Dog_3/Dog_3_interictal_segment_0005.mat',
    '/kaggle/input/seizure-prediction/Dog_3/Dog_3/Dog_3_preictal_segment_0005.mat',
]
# the same electrode row is plotted for every file
electrode_to_plot = 5
for f in file:
    plotSpectograms(f, electrode=electrode_to_plot)