In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Additional libraries: plotting, signal processing, statistics and modelling.
%matplotlib inline
import matplotlib.pyplot as plt
import warnings
# Silence every warning category for the rest of the session
# (note: this also hides deprecation notices from the libraries below).
warnings.filterwarnings("ignore")
# Apply matplotlib's "ggplot" style sheet to all figures.
plt.style.use('ggplot')
from inspect import signature
from sklearn import tree
from scipy import stats 
from scipy import signal
from sklearn import preprocessing
from scipy.fft import fft, fftfreq
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.ar_model import AutoReg
from sklearn import svm
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV, validation_curve
from sklearn import metrics 
from pandas.plotting import lag_plot, autocorrelation_plot
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegressionCV
from imblearn.over_sampling import RandomOverSampler
import seaborn as sns
np.random.seed(42) # seed NumPy's global random generator so random draws repeat across runs

In [None]:
# Load the seizure-recognition CSV and report its name and dimensions.
csv_path = "/kaggle/input/epileptic-seizure-recognition/Epileptic Seizure Recognition.csv"
df = pd.read_csv(csv_path)
df.name = "seizure_data"
n_rows, n_cols = df.shape
print("DataFrame name: {data_name}".format(data_name = df.name))
print(f"DataFrame size: {df.shape}")
print(f"# of datapoints: {n_rows*n_cols}")
df.head(3)

### Data preprocessing

In [None]:
# Remove the "Unnamed" identifier column in place, then check it no longer leads the frame.
df.drop(columns = ["Unnamed"], inplace = True)
assert df.columns[0] != "Unnamed"

In [None]:
# Number of columns that contain at least one missing value.
df.isnull().any(axis = 0).sum()

In [None]:
# Descriptive statistics for every signal column (the label column "y" is excluded).
df.drop(columns = ["y"]).describe()

In [None]:
# Pairwise correlation matrix of all signal columns (the label column "y" is excluded).
df.drop(columns = ["y"]).corr()

In [None]:
# Correlation between the first two consecutive columns, X1 and X2.
# positive, very high correlation
print(df[["X1", "X2"]].corr())
print()

# Correlation between the first and the last column, X1 and X178.
# no correlation
print(df[["X1", "X178"]].corr())

# Plot both columns side by side on shared axes.
fig, panels = plt.subplots(1, 2, sharex = True, sharey = True, figsize = (15, 5))
for panel, column, colour in zip(panels, ["X1", "X178"], ["c", "g"]):
    panel.plot(df[column], color = colour)
    panel.set_title("Signal Curve of " + column)
plt.show()

In [None]:
# Smooth two columns with a moving average and plot them side by side.
# Both panels are addressed through their own Axes handle; the handle is never
# overwritten, so nothing depends on pyplot's implicit "current axes".
fig, (ax1, ax2) = plt.subplots(1, 2, sharex = True, sharey = True, figsize = (15, 5))
interval = 1000
window = np.ones(interval) / interval

# Left panel: moving average of X1 via discrete linear convolution
# (mode "same" keeps the output the same length as the input).
moving_avg = np.convolve(df["X1"], window, "same")
ax1.plot(moving_avg, c = "c")
ax1.set_title("Smoothed Signal Curve of X1")

# Right panel: moving average of X178 via pandas' rolling mean over the same window size
# (the first interval - 1 values are NaN because the window is not yet full).
rolling_avg = df["X178"].rolling(window = interval).mean()
ax2.plot(rolling_avg, c = "g")
ax2.set_title("Smoothed Signal Curve of X178")
plt.show()

In [None]:
# Split the frame into the class column "y" (labels) and everything else (features).
labels = df["y"]
features = df.drop(columns = ["y"])

In [None]:
# Transpose so that every row is one signal column of `df` and every column is one recording.
# The frame is rebuilt from `df` instead of flipping `features` in place, so re-running
# this cell cannot toggle the orientation back.
features = df.drop(["y"], axis = 1).T

In [None]:
# switch from time domain to frequency domain with fft

# Sampling rate = number of signal samples in one recording row.
# The label column "y" is not a sample and must not be counted.
# (Alternative noted by the author: num_samples / duration = 4094 / 23.)
# NOTE(review): this treats one row as one second of signal — confirm against the dataset description.
sampling_rate = df.drop(columns = ["y"]).shape[1]

# remove DC component: subtract each recording's own mean.
# Recordings are columns here, so the mean is taken down the rows (axis = 0);
# the axis is explicit because np.mean(DataFrame) is pandas-version dependent.
features = features - features.mean(axis = 0)

# fast fourier transformation of all recordings concatenated one after another
flat_signal = features.T.values.ravel()
fourier_range = fft(flat_signal)
fourier_domain = fftfreq(flat_signal.size, 1/sampling_rate)

# first half of the FFT output
half = fourier_domain.size // 2
half_freqs = fourier_domain[:half]
# use abs to deal with complex numbers
half_amps = np.abs(fourier_range[:half])

fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (15,5))
# left: full two-sided spectrum with the first half overlaid; right: first half only
ax1.plot(fourier_domain, np.abs(fourier_range), c = "c")
ax1.plot(half_freqs, half_amps, c = "g")
ax1.set_xlabel("frequency [Hz]", fontweight = "bold")
ax1.set_ylabel("Amplitude [m]", fontweight = "bold")
ax2.plot(half_freqs, half_amps, c = "g")
ax2.set_xlabel("frequency [Hz]", fontweight = "bold")
ax2.set_ylabel("Amplitude [m]", fontweight = "bold")
plt.show()

In [None]:
# Welch's Method
# make a plot with log scaling on the y-axis

# `features` holds one recording per column, so it has to be transposed before
# flattening; otherwise ravel() interleaves samples taken from different recordings.
# This is the same flattening the FFT cell uses.
flat_signal = features.T.values.ravel()
freqs, power_spectrum = signal.welch(flat_signal, sampling_rate, "flattop", 1024, scaling = "spectrum")

# filter frequency and power spectrum: keep 1 Hz < f < 89 Hz
in_band = (freqs > 1) & (freqs < 89)
freqs, power_spectrum = freqs[in_band], power_spectrum[in_band]

plt.figure(figsize = (15, 5))
plt.semilogy(freqs, np.sqrt(power_spectrum), c = "c")
plt.xlabel("frequency [Hz]", fontweight = "bold")
plt.ylabel("Linear spectrum [V RMS]", fontweight = "bold")
plt.show()

# RMS estimate: square root of the largest retained spectrum value
print(f"RMS estimate: {round(np.sqrt(power_spectrum.max()), 2)}")

### Significant Frequency Bands 

1. **Delta:** has a frequency of 3 Hz or below.
1. **Theta:** has a frequency of 3.5 to 7.5 Hz and is classified as "slow" activity.
1. **Alpha:** has a frequency between 7.5 and 13 Hz.
1. **Beta:** has a frequency greater than 13 Hz.

> [See source](https://www.medicine.mcgill.ca/physio/vlab/biomed_signals/eeg_n.htm)


In [None]:
# Tabulate the Welch output: one row per retained frequency with its power.
# Author's observations: there is a spike in the beta waves, which could be important;
# the majority of spikes in the power spectrum is composed of beta waves.
welch_df = pd.DataFrame(dict(frequency = freqs, power = power_spectrum))
welch_df.head()

In [None]:
# extract individual wave amplitudes
# Each band is computed once. The band edges are disjoint (7.5 Hz belongs to theta,
# 13 Hz to alpha), so no frequency row can land in two bands and create duplicate
# index labels when the bands are concatenated later.
freq = welch_df.frequency
delta = welch_df[freq <= 3].power
theta = welch_df[(freq >= 3.5) & (freq <= 7.5)].power
alpha = welch_df[(freq > 7.5) & (freq <= 13)].power
beta = welch_df[freq > 13].power


# of datapoints in each wave 
print(f"Delta size: {delta.size}")
print(f"Theta size: {theta.size}")
print(f"Alpha size: {alpha.size}")
print(f"Beta size: {beta.size}")
print(f"# of significant frequencies: {delta.size + theta.size + alpha.size + beta.size}")

# plot all waves, one panel per band
fig, axs = plt.subplots(2, 2, figsize = (15,5))
axs[0, 0].plot(delta, c = "c")
axs[0, 1].plot(theta, c = "g")
axs[1, 0].plot(alpha, c = "g")
axs[1, 1].plot(beta, c = "c")

for ax in axs.flat:
    ax.set(xlabel="#", ylabel="[m]")
    # keep axis labels only on the outer edge of the grid
    ax.label_outer()

In [None]:
# --- add feature ---
# Append the concatenated band powers to `features` as one extra row.
# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame is the
# supported equivalent (values align on the column labels, the rest become NaN).

waves = [delta, theta, alpha, beta]
features = pd.concat([features, pd.concat(waves).to_frame().T])

In [None]:
# run cell after running the above cell only 
# (this cell replaces the last row of `features`, so it must run exactly once)

# recall individual wave counts 
print(f"Delta size: {delta.size}")
print(f"Theta size: {theta.size}")
print(f"Alpha size: {alpha.size}")
print(f"Beta size: {beta.size}")
print(f"# of significant frequencies: {delta.size + theta.size + alpha.size + beta.size}")

# extend each wave, construct amplitude feature vector:
# every band is repeated `factor` times so the row spans (almost) every column
row_len = features.shape[1]
total = delta.size + theta.size + alpha.size + beta.size
factor = row_len // total
feat_vector = [value for wave in waves for value in wave.to_list() * factor]

# pad whatever is still missing with the average of the four band means
avg = (delta.mean() + theta.mean() + alpha.mean() + beta.mean()) / 4 
remainder = row_len - len(feat_vector)
print(f"# of beta waves for extension: {remainder}")
feat_vector.extend([avg] * remainder)

# check computation
assert len(feat_vector) == row_len

# --- add feature ---
# Drop the partially filled last row and add the full-length "amplitudes" row.
# DataFrame.append was removed in pandas 2.0, hence pd.concat.
features = features.iloc[:-1]
features = pd.concat([features, pd.DataFrame(feat_vector, columns = ["amplitudes"]).T])

In [None]:
# run cell after running the above cell only 

# add fourier domain, i.e frequencies to the feature space
# Each band is selected once, with disjoint edges (7.5 Hz belongs to theta, 13 Hz to
# alpha), so the concatenated series cannot contain a duplicated index label.
freq = welch_df.frequency
delta = freq[freq <= 3]
theta = freq[(freq >= 3.5) & (freq <= 7.5)]
alpha = freq[(freq > 7.5) & (freq <= 13)]
beta = freq[freq > 13]

# --- add feature ---
# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame replaces it.
waves = [delta, theta, alpha, beta]
features = pd.concat([features, pd.concat(waves).to_frame(name = "frequency").T])

In [None]:
# run cell after running the above cell only 
# (this cell replaces the last row of `features`, so it must run exactly once)

# replace amplitude with frequency
# perform same arithmetic operations: repeat every band `factor` times
row_len = features.shape[1]
total = delta.size + theta.size + alpha.size + beta.size
factor = row_len // total
feat_vector = [value for wave in waves for value in wave.to_list() * factor]

# pad whatever is still missing with the average of the four band means
avg = (delta.mean() + theta.mean() + alpha.mean() + beta.mean()) / 4
remainder = row_len - len(feat_vector)
print(f"# of beta waves for extension: {remainder}")
feat_vector.extend([avg] * remainder)

# check computation
assert len(feat_vector) == row_len

# --- add feature --- 
# Drop the partially filled last row and add the full-length "frequency" row.
# DataFrame.append was removed in pandas 2.0, hence pd.concat.
features = features.iloc[:-1]
features = pd.concat([features, pd.DataFrame(feat_vector, columns = ["frequency"]).T])

In [None]:
# Draw three random row positions in [0, 178) and plot each selected row in its own panel,
# alternating the line colour between cyan and green.
sample = np.random.randint(low = 0, high = 178, size = 3)
fig, ax = plt.subplots(3, figsize = (15, 5))
for x, (panel, row_pos) in enumerate(zip(ax, sample)):
    color = "g" if x % 2 else "c"
    panel.plot(features.iloc[row_pos], c = color)
    panel.set_xticks([])
    ax[x].set_xticks([])

### Binary Classification

In [None]:
# make labels binary: class 1 stays 1, every other class becomes 0
def binary(label):
    """Return 1 for class label 1 and 0 for any other label."""
    return 1 if label == 1 else 0

labels = labels.apply(binary)

# features, labels as numpy arrays 
# Up to here `features` holds one recording per column. The splits and estimators
# below pair each row of `features` with one entry of `labels`, so the frame is
# transposed back to one recording per row before conversion.
features = features.T.values
labels = labels.values
assert features.shape[0] == labels.shape[0], "features and labels must have one row per recording"

In [None]:
# Plain hold-out split (60 % train / 40 % test), stratified on the labels.
# The principal component analysis that follows is optional; it is only needed
# when the high dimensionality causes trouble.
split = train_test_split(features, labels, test_size = 0.4, random_state = 42, stratify = labels)
X_train, X_test, y_train, y_test = split

# every feature row must have a matching label
assert(X_train.shape[0] == y_train.shape[0])
assert(X_test.shape[0] == y_test.shape[0])

print(f"Size of training sample: {X_train.shape}")
print(f"Size of test sample: {X_test.shape}")

### TODO: Multi-class Classification

In [None]:
# principal component analysis 

# Standardise every column, then project onto the first 50 principal components.
n_components = 50
features = preprocessing.StandardScaler().fit_transform(features)

pca = PCA(n_components = n_components)
reduced = pca.fit_transform(features)
component_names = [f"component_{idx}" for idx in range(1, n_components + 1)]
pca_data = pd.DataFrame(reduced, columns = component_names)

# display first five principal components 
print(pca_data[component_names[:5]])

# total information stored 
print(f"\nCumulative variance explained: {np.sum(pca.explained_variance_ratio_)}")

In [None]:
# linear support vector machine 
# train, test split on the PCA-reduced data
# The split is stratified on the labels, like the earlier hold-out split, so the
# minority class keeps the same share in the training and the test partition.
X_train, X_test, y_train, y_test = train_test_split(pca_data.values, labels, 
                                                    test_size = 0.33, random_state = 42,
                                                    stratify = labels)

In [None]:
# Share of each class in the full label vector (labels are 0/1, so the sum counts the ones).
positive_ratio = np.sum(labels) / labels.size
print("Class distribution before over-sampling")
print(f"Ratio of label = 1: {positive_ratio}")
print(f"Ratio of label = 0: {1 - positive_ratio}")

# data is heavily unbalanced

### Oversampling
* Do not run the following cell; it is disabled for computational-complexity reasons.

In [None]:
"""
# oversample training data
# apply to training set only 
X_train, y_train = RandomOverSampler(sampling_strategy = "minority").fit_resample(X_train, y_train)

# observe new ratio 
print("\nClass distribution after over-sampling")
print(f"Ratio of label = 1: {np.sum(y_train) / y_train.size}")
print(f"Ratio of label = 0: {1 - np.sum(y_train) / y_train.size}")
"""

In [None]:
# Fit a linear-kernel SVM on the training split and predict the held-out split.
classifier = svm.SVC(kernel = "linear", C = 1, random_state = 42).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [None]:
# Overall accuracy, followed by the per-class classification report.
svm_accuracy = metrics.accuracy_score(y_test, y_pred)
print(f"Accuracy of linear SVM: {svm_accuracy}\n")
print(metrics.classification_report(y_test, y_pred))

In [None]:
# Fit a random forest (default settings, fixed seed) on the training split
# and predict the held-out split.
classifier = RandomForestClassifier(random_state = 42).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [None]:
# Overall accuracy, followed by the per-class classification report.
forest_accuracy = metrics.accuracy_score(y_test, y_pred)
print(f"Accuracy of random forest classifier: {forest_accuracy}\n")
print(metrics.classification_report(y_test, y_pred))

In [None]:
# Validation curve: accuracy as a function of the number of trees.

# Create range of values for parameter
param_range = np.arange(1, 250, 50)

# Calculate accuracy on training and test set using range of parameter values.
# The forest is seeded like the other estimators in this notebook; an unseeded
# forest evaluated with n_jobs=-1 may give a different curve on every run.
train_scores, test_scores = validation_curve(RandomForestClassifier(random_state = 42), 
                                             features, 
                                             labels, 
                                             param_name="n_estimators", 
                                             param_range=param_range,
                                             cv=3, 
                                             scoring="accuracy", 
                                             n_jobs=-1)


# Calculate mean and standard deviation across folds for training set scores
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)

# Calculate mean and standard deviation across folds for test set scores
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# Plot mean accuracy scores for training and test sets
plt.plot(param_range, train_mean, label="Training score", color="black")
plt.plot(param_range, test_mean, label="Cross-validation score", color="dimgrey")

# Plot accuracy bands (mean +/- one standard deviation) for training and test sets
plt.fill_between(param_range, train_mean - train_std, train_mean + train_std, color="gray")
plt.fill_between(param_range, test_mean - test_std, test_mean + test_std, color="gainsboro")

# Create plot
plt.title("Validation Curve With Random Forest")
plt.xlabel("Number Of Trees")
plt.ylabel("Accuracy Score")
plt.tight_layout()
plt.legend(loc="best")
plt.show()

In [None]:
# Mean accuracy of the current classifier over 10 cross-validation folds
# of the training split, expressed in percent.
fold_scores = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
accs = fold_scores.mean()*100
print(f"10-fold accuracy: {round(accs, 2)}%")

In [None]:
# no parameter optimization necessary 

### Can we predict seizures beforehand?

In [None]:
# Start again from the raw frame: drop the label column and transpose,
# so each row is one signal column of `df` and each column is one recording.
features = df.drop(columns = ["y"]).transpose()

In [None]:
# Relabel the rows "X1" .. "X178" with the floats 1.0 .. 178.0.
idx = {f"X{step}": float(step) for step in range(1, 179)}
features.rename(index = idx, inplace = True)
features.head(5)

In [None]:
# Average across all recordings for every row, giving one series stored in column "t".
avg_df = features.mean(axis = 1).to_frame(name = "t")
avg_df.head(5)

In [None]:
# Scatter each value of the averaged series against the value one step earlier.
lag_plot(avg_df, lag = 1)

In [None]:
# check the lag-1 correlation: pair every value (t) with its predecessor (t-1)
# shift(1) moves the series down one row, so the first column is t-1 and the
# second column is the unshifted series t (not t+1).
corr_df = pd.concat([avg_df.shift(1), avg_df], axis=1)
corr_df.columns = ["t-1", "t"]
corr_df.corr()

In [None]:
# for further analysis 
# The shift left a NaN in the first row; back-fill it so later cells get a complete array.
# DataFrame.bfill() replaces fillna(method = "bfill"), whose `method` argument is deprecated.
corr_df = corr_df.bfill()
corr_df.head(1)

In [None]:
# Autocorrelation of the averaged series: pandas' plot first,
# then statsmodels' ACF plot limited to the first 31 lags.
max_lag = 31
autocorrelation_plot(avg_df)
plot_acf(avg_df, lags = max_lag)
plt.show()

In [None]:
# baseline persistence model: the forecast for a step is the value one step before it

# lagged data as an array: column 0 is the lagged value, column 1 the value itself
X = corr_df.values 

# hold out the last 7 rows as the test set
cut = len(X) - 7
train, test = X[:cut], X[cut:]
X_train, y_train = train[:,0], train[:,1]
X_test, y_test = test[:,0], test[:,1]

# every item in X_test is a forecast 
pred = list(X_test)

# scoring
mse = metrics.mean_squared_error(y_test, pred)
print(f"Mean squared error for persistence model: {round(mse,2)}")

# plot persistence model: true values against the persisted forecasts
fig, ax = plt.subplots(figsize = (15,5))
ax.plot(y_test)
ax.plot(pred, c = "b")
ax.legend(["true", "prediction"])
ax.set_title("Baseline model for autoregression", fontweight = "bold")
plt.show()

In [None]:
# make arrays one dimensional 
# Column 1 of X is the series itself; column 0 is only its lag-1 copy. Flattening
# both columns with ravel() would interleave them and repeat every observation
# twice, so the univariate series is taken from column 1 alone. Working from X
# (not from the previous train/test) also keeps this cell safe to re-run.
series = X[:, 1]
train, test = series[:len(series)-7], series[len(series)-7:]

In [None]:
# autoregression model: fit AutoReg with 31 lags on the training series

model = AutoReg(train, lags = 31)
model_fit = model.fit()
print(f"Coefficients are {model_fit.params}")

# make predictions for the positions right after the training data, one per test value
print()
first_step = len(train)
last_step = len(train) + len(test) - 1
pred = model_fit.predict(first_step, last_step, dynamic = False)
for predicted, actual in zip(pred, test):
    print("predicted = {predicted}\tactual = {actual}".format(predicted = predicted, actual = actual))
    
# root mean squared error 
rmse = np.sqrt(metrics.mean_squared_error(test, pred))
print(f"\nRoot mean squared error: {rmse}")

# plot true, pred values with cutoff
fig, ax = plt.subplots(figsize = (15,5))
ax.axvline(x = 6, c = "g")
ax.plot(test)
ax.plot(pred, c = "b")
ax.legend(["cutoff","true", "prediction"])
ax.set_title("Autoregression model", fontweight = "bold")
plt.show()

### Discrepancy starts at lag = 6

In [None]:
# learn the coefficients 
window = 31
model = AutoReg(train, lags = window)
model_fit = model.fit()
coef = model_fit.params

# get prior 31 observations 
history = list(train[len(train) - window:])

# make predictions: one-step-ahead forecasts, feeding the true observation back
# into the history after each step
pred = []
for obs in test:
    # newest observation first, so it lines up with coef[1], coef[2], ...
    lag = history[-window:][::-1]
    # coef[0] is used as the constant term, coef[1..window] as the lag weights
    yhat = coef[0] + np.dot(coef[1:window + 1], lag)
    pred.append(yhat)
    history.append(obs)
    print("predicted = {predicted}\tactual = {actual}".format(predicted = yhat, actual = obs))

# root mean squared error 
rmse = np.sqrt(metrics.mean_squared_error(test, pred))
print(f"\nRoot mean squared error: {rmse}")

# plot true, pred values with cutoff
# Labels are attached to the artists themselves; a positional legend list would be
# matched against both cutoff lines first and mislabel the true and predicted curves.
fig, ax = plt.subplots(figsize = (15,5))
ax.axvline(x = 6, c = "g", label = "cutoff")
ax.axvline(x = 8, c = "g")
ax.plot(test, label = "true")
ax.plot(pred, c = "b", label = "prediction")
ax.legend()
ax.set_title("Autoregression model (learned coefficients)", fontweight = "bold")
plt.show()

### Major improvement in RMSE

### Multivariate linear regression, robustness, polynomial, lasso, etc. 