In [1]:
import os
import sys

import pandas as pd
import numpy as np

import pywt
import scipy.io as spio
from scipy.stats import entropy
from collections import Counter

from sklearn import svm
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report

import timeit

In [39]:
'''
   Authors: Shivam Chaudhary
            Centre for Brain and Cognitive Science, Indian Institute of Technology Gandhinagar 
   In this project we will be recognising Emotion of a Human being from EEG signal.
   About the data set : The data set is called the seed data set.
   It contains data of 15 people that underwent trails 15 times each thrice.

           Total data items =  15 (subjects) * 15 (trials each) * 3 (sessions each)
                            = 675 data items

   Our project consists of 4 modules, namely : pre processing, feature extraction, feature reduction and classification,
   all of which are mentioned in detail in the black book.

'''

'\n   Authors: Shivam Chaudhary\n            Centre of Behavioural and Cognitive Sciences, Allahabad\n   In this project we will be recognising Emotion of a Human being from EEG signal.\n   About the data set : The data set is called the seed data set.\n   It contains data of 15 people that underwent trails 15 times each thrice.\n\n           Total data items =  15 (subjects) * 15 (trials each) * 3 (sessions each)\n                            = 675 data items\n\n   Our project consists of 4 modules, namely : pre processing, feature extraction, feature reduction and classification,\n   all of which are mentioned in detail in the black book.\n\n'

In [2]:
WAVELET = "db6"
MAX_LEVEL = 5

In [41]:
# # Connecting google drive
# from google.colab import drive
# drive.mount('/content/drive')
# #4/1AY0e-g7fln4EfIjw7IjtOGqkoh3Y0EUpKlncOdTR6o4uuu3XqPjYElTbIwo

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# dir = "/content/drive/MyDrive/EEG-emotion-python/seed dataset/SEED"
dir = "data/eeg_raw_data/1"
# os.chdir(dir)
# os.getcwd()

In [4]:
# Unzip the data file
# # Do only once
# import zipfile
# with zipfile.ZipFile("ExtractedFeatures.zip", 'r') as zip_ref:
#     zip_ref.extractall(os.getcwd() + "/temp")
# !unzip -u "/content/drive/MyDrive/EEG-emotion-python/seed dataset/SEED/Preprocessed_EEG.zip" -d "/content/drive/MyDrive/EEG-emotion-python/seed dataset/SEED/temp"

In [5]:
# reading the channel order for dataframe
channel_order = pd.read_excel("Channel Order.xlsx", header=None)
channel_order.head()

Unnamed: 0,0
0,FP1
1,FPZ
2,FP2
3,AF3
4,AF4


In [44]:
# extract labels file
# labels = spio.loadmat("label.mat")
# labels_df = pd.DataFrame(np.hstack(labels["label"]))
# labels_df.head()

Unnamed: 0,0
0,1
1,0
2,-1
3,-1
4,0


In [31]:
session1_label = [1,2,3,0,2,0,0,1,0,1,2,1,1,1,2,3,2,2,3,3,0,3,0,3]
labels = session1_label * 10
labels_df = pd.DataFrame(labels)
labels_df.columns = ['label']
labels_df

Unnamed: 0,label
0,1
1,2
2,3
3,0
4,2
...,...
235,3
236,0
237,3
238,0


In [6]:
# files = os.listdir("temp/Preprocessed_EEG/")
files = os.listdir("data/eeg_raw_data/1")
files

['4_20151111.mat',
 '9_20151028.mat',
 '8_20151103.mat',
 '10_20151014.mat',
 '2_20150915.mat',
 '11_20150916.mat',
 '7_20150715.mat',
 '6_20150507.mat',
 '15_20150508.mat',
 '12_20150725.mat']

**Feature extraction**

In [7]:
def calc_wavelet_energy(data_set):
  """
    Input : 1 * N vector
    Output: Float with the wavelet energy of the input vector,
    rounded to 3 decimal places.
  """
  # p_sqr = [i ** 2 for i in data_set]
  wavelet_energy = np.nansum(np.log2(np.square(data_set)))
  return round(wavelet_energy, 3)

In [8]:
def calc_shannon_entropy(data_set):
    """
        Input : 1 * N vector
        Output: Float with the wavelet entropy of the input vector,
                rounded to 3 decimal places.
    """
    # probability = [i ** 2 for i in data_set]
    probability = np.square(data_set)
    shannon_entropy = -np.nansum(probability * np.log2(probability))
    return round(shannon_entropy, 3)

In [19]:
len(files)

10

In [20]:
participant_trial = []
features_table = pd.DataFrame(columns=range(620))
for file in files:
  # mat_file = spio.loadmat("temp/Preprocessed_EEG/" + file)
  mat_file = spio.loadmat("data/eeg_raw_data/1/" + file)
  keys = [key for key, values in mat_file.items() if key != '__header__' and key != '__version__' and key != '__globals__' ]
  for data_file in keys:
    data_df = pd.DataFrame(mat_file[data_file])
    # print(data_df)
    M = data_df.shape[0]
    N = data_df.shape[1]
    # Feature extraction part of the module
    Entropy = []
    Energy = []
    for channel in data_df.iterrows():  # Iterating through the 62 channels
      dwt_bands = []
      data = channel[1]
      int_ent = []
      int_eng = []
      for band in range(MAX_LEVEL):
        (data, coeff_d) = pywt.dwt(data, WAVELET)
        dwt_bands.append(coeff_d)
      
      for band in range(len(dwt_bands)):  # DWT_bands = 23504, 11755
        int_ent.append(calc_shannon_entropy(dwt_bands[len(dwt_bands) - band - 1]))
        int_eng.append(calc_wavelet_energy(dwt_bands[len(dwt_bands) - band - 1]))
      
      Entropy.append(int_ent)
      Energy.append(int_eng)
    
    unroll_entropy = []
    unroll_energy = []
    '''
        Transforming 2D array into 1D vector of features and then 
        combining the two 1D arrays.
    '''
    for i in range(len(Entropy)):
      for j in range(len(Entropy[0])):
        unroll_entropy.append(Entropy[i][j])
        unroll_energy.append(Energy[i][j])

    features = unroll_entropy + unroll_energy
    participant_trial.append(features)
    features_table.loc[len(features_table.index)] = features
    # print(data_file)
    # print(features)
  print(file)


4_20151111.mat
9_20151028.mat
8_20151103.mat
10_20151014.mat
2_20150915.mat
11_20150916.mat
7_20150715.mat
6_20150507.mat
15_20150508.mat
12_20150725.mat


In [18]:
features_table.shape

(24, 620)

In [10]:
dir

'data/eeg_raw_data/1'

In [11]:
file_dir = dir + "/temp/analysis/"
# file_dir = "outputs/"

In [13]:
features_table.to_csv(file_dir + "features" + WAVELET + ".csv", index=False)

In [15]:
features_table.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,610,611,612,613,614,615,616,617,618,619
0,-63974650.0,-18094990.0,-13498460.0,-22401240.0,-22384050.0,-67625350.0,-18627620.0,-11205630.0,-14722450.0,-17367880.0,...,7008.546,11507.271,18140.951,28128.404,48368.756,7040.383,11533.766,17855.794,27906.063,47703.254
1,-35791580.0,-7784747.0,-8271466.0,-13836960.0,-13793310.0,-29919300.0,-8139283.0,-6296407.0,-10209270.0,-9714509.0,...,4036.81,6342.075,10478.842,17803.473,30390.247,4016.292,6364.069,10436.333,17593.498,30016.484
2,-80573790.0,-23260970.0,-16507470.0,-21891060.0,-19313410.0,-88882040.0,-23978940.0,-12240320.0,-15250260.0,-14133460.0,...,8393.314,13902.817,23362.412,40077.574,69262.408,8365.401,13865.91,23159.602,39701.311,68417.309
3,-60607950.0,-16640130.0,-10682610.0,-15347850.0,-14226280.0,-69410040.0,-17541740.0,-10705630.0,-14831940.0,-13011400.0,...,5455.819,9183.598,13465.981,20463.425,33391.519,5493.282,9218.534,13319.612,20113.996,32496.45
4,-146821000.0,-36034300.0,-12434330.0,-23508350.0,-20583790.0,-156944500.0,-38400530.0,-9859054.0,-17564870.0,-12008120.0,...,3688.1,6142.809,9545.184,16689.145,28243.48,3717.69,6168.68,9403.942,16447.35,27864.277


**Principal Components Analysis**

In [21]:
data = pd.read_csv(file_dir + "features" + WAVELET + ".csv")

In [22]:
data.shape

(240, 620)

In [23]:
# 1. Normalising data and getting transpose
normalised = pd.DataFrame(normalize(data, axis = 0))

# 2. Finding covariance matrix
covariance_df = normalised.cov()

# 3. Eigen Vectors
u, s, v = np.linalg.svd(covariance_df)

# 4. Principal Components
data_reduced = normalised @ u
data_reduced.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,610,611,612,613,614,615,616,617,618,619
0,1.132079,0.834258,-0.568735,-0.279137,0.232388,0.130625,0.200517,0.073107,-0.087576,0.025902,...,7.9e-05,-0.000414,-6.4e-05,-0.000187,3.6e-05,9.2e-05,-0.000125,-0.000289,-0.000121,0.000163
1,0.671489,0.491039,-0.26975,-0.180885,0.112637,0.047381,0.094276,0.039127,-0.032007,0.009261,...,7.9e-05,-0.000414,-6.4e-05,-0.000187,3.6e-05,9.2e-05,-0.000125,-0.000289,-0.000121,0.000163
2,1.447622,1.044359,-0.591106,-0.392303,0.255709,0.164169,0.221489,0.060476,-0.096166,0.01984,...,7.9e-05,-0.000414,-6.4e-05,-0.000187,3.6e-05,9.2e-05,-0.000125,-0.000289,-0.000121,0.000163
3,0.852206,0.696203,-0.484486,-0.162725,0.168144,0.08574,0.145414,0.072946,-0.052177,0.01955,...,7.9e-05,-0.000414,-6.4e-05,-0.000187,3.6e-05,9.2e-05,-0.000125,-0.000289,-0.000121,0.000163
4,0.731689,0.545708,-0.263686,-0.146944,0.104475,0.122076,0.03684,0.006239,-0.033168,0.03439,...,7.9e-05,-0.000414,-6.4e-05,-0.000187,3.6e-05,9.2e-05,-0.000125,-0.000289,-0.000121,0.000163


In [25]:
data_reduced.shape

(240, 620)

In [26]:
data_reduced.to_csv(file_dir + "pc" + WAVELET + ".csv", index=False)

In [27]:
file_dir

'data/eeg_raw_data/1/temp/analysis/'

**Running data through classifiers**


1.   SVM



In [32]:
# Reading data and splitting 
pcs = pd.read_csv(file_dir + "pc" + WAVELET + ".csv")
# outputs = pd.read_csv(file_dir + "outputs_main.csv", header=None)
outputs = labels_df

X = pcs.iloc[:, :].values
Y = outputs.iloc[:, :].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=42)

In [33]:
print(X_train.shape, X_test.shape)
print(Y_train.shape, Y_test.shape)


(192, 620) (48, 620)
(192, 1) (48, 1)


In [34]:
svc = SVC()
parameters = {"C": (100, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9), "gamma": (1e-08, 1e-7, 1e-6, 1e-5)}
grid_search = GridSearchCV(svc, parameters, n_jobs=-1, cv=5)
start_time = timeit.default_timer()
grid_search.fit(X_train, Y_train)
print("--- {0:.3f} seconds ---".format(timeit.default_timer() - start_time))
print(grid_search.best_params_)
svc_best = grid_search.best_estimator_
accuracy = svc_best.score(X_test, Y_test)
print("Accuracy on the testing set is: {0:.1f}%".format(accuracy*100))
prediction = svc_best.predict(X_test)
report = classification_report(Y_test, prediction)
print(report)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

--- 7.311 seconds ---
{'C': 1000000.0, 'gamma': 1e-05}
Accuracy on the testing set is: 60.4%
              precision    recall  f1-score   support

           0       0.53      1.00      0.70         8
           1       0.69      0.73      0.71        15
           2       0.44      0.31      0.36        13
           3       0.75      0.50      0.60        12

    accuracy                           0.60        48
   macro avg       0.60      0.64      0.59        48
weighted avg       0.61      0.60      0.59        48



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


# SVM

In [38]:
from sklearn import svm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

#create a SVM clssifier
clf = svm.SVC(kernel='rbf', C=1.0)

#train
clf.fit(X_train, Y_train)
#predict
Y_preds = clf.predict(X_test)
# Accuracy
print(f"accuracy: {accuracy_score(Y_test, Y_preds)}")
print(f"precision: {precision_score(Y_test, Y_preds, average='micro')}")
print(f"recall: {recall_score(Y_test, Y_preds, average='micro')}")
print(f"f1 score: {f1_score(Y_test, Y_preds, average='micro')}")


accuracy: 0.4166666666666667
precision: 0.4166666666666667
recall: 0.4166666666666667
f1 score: 0.4166666666666667


  y = column_or_1d(y, warn=True)


### RandomForest

In [39]:
import numpy as np
# from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load the Iris dataset
# iris = load_iris()

# Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.25)

# Create the random forest classifier
clf_rfc = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)

# Train the classifier
clf_rfc.fit(X_train, Y_train)

# Make predictions on the test set
Y_preds = clf_rfc.predict(X_test)

# # Calculate the accuracy
# accuracy = np.mean(y_pred == y_test)
# print('Accuracy:', accuracy)
print(classification_report(Y_test, Y_preds))

              precision    recall  f1-score   support

           0       0.17      0.50      0.26         8
           1       0.56      0.33      0.42        15
           2       0.43      0.23      0.30        13
           3       0.33      0.25      0.29        12

    accuracy                           0.31        48
   macro avg       0.37      0.33      0.32        48
weighted avg       0.40      0.31      0.33        48



  return fit_method(estimator, *args, **kwargs)


### LightGBM

### GradientBoosted

In [50]:
import numpy as np
# from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingClassifier

# Load the Iris dataset
# iris = load_iris()

# Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.25)

# Create the gradient boosted tree classifier
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.01, max_depth=3, random_state=0)

# Train the classifier
clf.fit(X_train, Y_train)

# Make predictions on the test set
Y_preds = clf.predict(X_test)

# Calculate the accuracy
print(classification_report(Y_test, Y_preds))

  


              precision    recall  f1-score   support

           0       0.47      1.00      0.64         8
           1       0.58      0.47      0.52        15
           2       0.50      0.31      0.38        13
           3       0.36      0.33      0.35        12

    accuracy                           0.48        48
   macro avg       0.48      0.53      0.47        48
weighted avg       0.49      0.48      0.46        48

