# Emotion Classification

### Import packages

In [None]:
from tqdm import tqdm
import os
import sys

import pandas as pd
import numpy as np

import pywt
import scipy.io as spio
from scipy.stats import entropy
from collections import Counter

from sklearn import svm
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from typing import Dict, Tuple, List

import timeit
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)


In [None]:
'''
   Authors: Shivam Chaudhary
            Centre for Brain and Cognitive Science, Indian Institute of Technology Gandhinagar
   In this project we will be recognising Emotion of a Human being from EEG signal.
   About the data set : The data set is called the seed data set.
   It contains data of 15 people, each of whom underwent 15 trials in each of 3 sessions.

           Total data items =  15 (subjects) * 15 (trials each) * 3 (sessions each)
                            = 675 data items

   Our project consists of 4 modules, namely : pre processing, feature extraction, feature reduction and classification,
   all of which are mentioned in detail in the black book.

'''

'\n   Authors: Shivam Chaudhary\n            Centre for Brain and Cognitive Science, Indian Institute of Technology Gandhinagar \n   In this project we will be recognising Emotion of a Human being from EEG signal.\n   About the data set : The data set is called the seed data set.\n   It contains data of 15 people that underwent trails 15 times each thrice.\n\n           Total data items =  15 (subjects) * 15 (trials each) * 3 (sessions each)\n                            = 675 data items\n\n   Our project consists of 4 modules, namely : pre processing, feature extraction, feature reduction and classification,\n   all of which are mentioned in detail in the black book.\n\n'

### Define some params

In [None]:
# Wavelet name passed to pywt.dwt; also embedded in the names of the saved CSV files
WAVELET = "db6"
# Number of successive single-level DWT passes applied to every channel
MAX_LEVEL = 5

In [None]:
# Directory holding the raw .mat recordings
# (keep the trailing slash: other cells build paths by plain string concatenation)
input_dir = "data/eeg_raw_data/1/"
# Directory where the processed results (extracted features, principal components) are saved
# (trailing slash needed for the same reason)
output_dir = "outputs_bp/"
# Create the output directory if it does not exist yet
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Read the channel order sheet; header=None means the first row is data, not column names
# NOTE(review): channel_order is not used by the other visible cells — confirm it is still needed
channel_order = pd.read_excel("Channel Order.xlsx", header=None)
channel_order.head()

In [None]:
# Collect the input recordings.
# - `input_dir` is reused instead of repeating the path literal, so this cell
#   cannot drift away from the directory the loader reads from.
# - Only ".mat" files are kept, so stray files in the folder are skipped.
# - The list is sorted because os.listdir returns entries in arbitrary order;
#   sorting makes the row order of the feature table reproducible.
files = sorted(name for name in os.listdir(input_dir) if name.endswith(".mat"))
files

['4_20151111.mat',
 '9_20151028.mat',
 '8_20151103.mat',
 '10_20151014.mat',
 '2_20150915.mat',
 '11_20150916.mat',
 '7_20150715.mat',
 '6_20150507.mat',
 '15_20150508.mat',
 '12_20150725.mat']

In [None]:
# Build the labels DataFrame.
# session1_label holds one label per trial (24 trials, classes 0-3); the same
# sequence is repeated once for every input file, so the number of label rows
# depends on len(files).
session1_label = [1,2,3,0,2,0,0,1,0,1,2,1,1,1,2,3,2,2,3,3,0,3,0,3]
labels = session1_label * len(files)
labels_df = pd.DataFrame(labels)
labels_df.columns = ['label']
labels_df

Unnamed: 0,label
0,1
1,2
2,3
3,0
4,2
...,...
235,3
236,0
237,3
238,0


### Feature extraction

Now that we have the input data and the labels, let's get ready for feature extraction.

In [None]:
def calculate_band_power(coeff_d, band_limits):
    """Integrate the squared magnitude of wavelet coefficients.

    Args:
        coeff_d: 1-D sequence of coefficients for one band.
        band_limits: Indexable pair ``(low, high)``. Only the difference
            ``high - low`` is used, as the sample spacing ``dx`` of the
            trapezoidal integration.

    Returns:
        The trapezoidal-rule integral of ``|coeff_d| ** 2`` with spacing
        ``band_limits[1] - band_limits[0]``.
    """
    # np.trapz was deprecated in NumPy 2.0 in favour of np.trapezoid; use
    # whichever one this NumPy provides so the function works on both.
    integrate = getattr(np, "trapezoid", None) or np.trapz

    # Power of every coefficient.
    psd = np.abs(coeff_d) ** 2

    # Integrate the power over the band.
    band_power = integrate(psd, dx=(band_limits[1] - band_limits[0]))

    return band_power

In [None]:
# Extract band-power features: one row per trial, one column per
# (channel, DWT level) pair.
participant_trial = []

for file in tqdm(files, desc='tqdm() Progress Bar'):
    # os.path.join works whether or not input_dir ends with a separator
    mat_file = spio.loadmat(os.path.join(input_dir, file))
    # Skip the '__header__', '__version__' and '__globals__' entries;
    # every remaining key is treated as the signal array of one trial.
    keys = [key for key in mat_file
            if key not in ('__header__', '__version__', '__globals__')]

    for data_file in keys:
        channel_bps = []
        # Each row of the trial array is processed as one channel.
        for channel_signal in np.asarray(mat_file[data_file]):
            approximation = channel_signal
            band_powers = []
            # Repeated single-level DWT: every pass splits the current
            # approximation into a coarser approximation and detail
            # coefficients. pywt's default boundary mode ('symmetric') pads
            # the signal with its reflection at the boundaries.
            for level in range(MAX_LEVEL):
                approximation, detail = pywt.dwt(approximation, WAVELET)
                band_limits = (2**level, 2**(level + 1))
                band_powers.append(calculate_band_power(detail, band_limits))
            channel_bps.append(band_powers)  # expected 62 channels x 5 levels

        # Flatten the channels x levels table into a 1D feature vector
        unroll_bps = [power for channel_powers in channel_bps
                      for power in channel_powers]
        participant_trial.append(unroll_bps)

# Build the table once at the end: enlarging a DataFrame row by row with .loc
# re-allocates it on every insert and leaves the columns with object dtype.
# columns=range(310) keeps the original contract (a row of any other length
# raises an error).
features_table = pd.DataFrame(participant_trial, columns=range(310))

tqdm() Progress Bar: 100%|██████████| 10/10 [00:47<00:00,  4.74s/it]


In [None]:
# Sanity check: expect (number of trials, 310 features)
print(features_table.shape)

(240, 310)


In [None]:
# Persist the feature table in the output directory; the file name carries the wavelet name
features_table.to_csv(f"{output_dir}features{WAVELET}.csv", index=False)

### Principal Components Analysis

In [None]:
# Reload the saved feature table from disk
data = pd.read_csv(f"{output_dir}features{WAVELET}.csv")

In [None]:
from sklearn.decomposition import PCA

# Normalize data: axis=0 rescales each feature column (not each sample) to unit norm
normalised_data = pd.DataFrame(normalize(data, axis=0))

# Fit the PCA model and project the data onto 100 principal components
# NOTE(review): PCA is fitted on all rows, before the train/test split made in a
# later cell, so test samples influence the projection — consider fitting on the
# training rows only.
pca_model = PCA(n_components=100)
components = pca_model.fit_transform(normalised_data)
components_df = pd.DataFrame(data=components)


In [None]:
# Sanity check: show the first three rows of the projected data
components_df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.074571,0.262085,0.051566,-0.152872,-0.062298,0.035531,0.034749,-0.008298,0.002771,0.0594,...,-0.009981,0.00577,-0.003674,-0.014739,-0.00142,-0.006554,0.002716,-0.003074,-0.00034,0.000958
1,-0.18599,0.044514,0.164302,-0.083018,0.007252,0.070198,-0.009881,-0.020256,-0.000429,-0.011892,...,0.000778,0.010145,-0.006817,-0.00908,0.002871,-0.004952,-0.000317,-0.002045,-0.00347,0.006633
2,0.283499,0.430817,0.11146,-0.185843,-0.063564,0.029101,0.041894,-0.003181,0.018374,0.055384,...,0.008095,-0.001944,0.001796,0.005582,-0.000262,-0.008191,-0.000938,5.6e-05,0.008669,-0.004381


In [None]:
# Store the principal components so later cells can be re-run without recomputing them
components_df.to_csv(f"{output_dir}pc{WAVELET}.csv", index=False)

### Multiclass classifiers



#### Data Splitting

In [None]:
# Load the saved principal components (features) and pair them with the labels
pcs = pd.read_csv(f"{output_dir}pc{WAVELET}.csv")
# pcs = components_df
outputs = labels_df

# Plain NumPy arrays for scikit-learn; Y keeps its single-column 2D shape
X = pcs.to_numpy()
Y = outputs.to_numpy()

# Hold out 20% of the samples for testing, with a fixed seed
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [None]:
# Sanity check: container types and overall shapes of the features and labels
print(type(X), type(Y))
print(X.shape, Y.shape)

# Shapes of the train/test partitions
print(X_train.shape, X_test.shape)
print(Y_train.shape, Y_test.shape)

<class 'numpy.ndarray'> <class 'numpy.ndarray'>
(240, 100) (240, 1)
(192, 100) (48, 100)
(192, 1) (48, 1)


In [None]:
# Check whether the classes are imbalanced
X_df = pd.DataFrame.from_records(X)
Y_df = pd.DataFrame.from_records(Y)

# ignore_index=True renumbers the columns 0..N, so the label column comes last
data_merged = pd.concat([X_df, Y_df], axis=1, ignore_index=True)

print(data_merged.shape)
# Column 100 is the label (X has 100 feature columns, numbered 0-99): count samples per class
data_merged.groupby(100).count().reset_index()


(240, 101)


Unnamed: 0,100,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,0,60,60,60,60,60,60,60,60,60,...,60,60,60,60,60,60,60,60,60,60
1,1,60,60,60,60,60,60,60,60,60,...,60,60,60,60,60,60,60,60,60,60
2,2,60,60,60,60,60,60,60,60,60,...,60,60,60,60,60,60,60,60,60,60
3,3,60,60,60,60,60,60,60,60,60,...,60,60,60,60,60,60,60,60,60,60


#### SVC

In [None]:
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)


# Search space: regularisation strength C and kernel coefficient gamma
parameters = dict(
    C=(100, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9),
    gamma=(1e-08, 1e-7, 1e-6, 1e-5),
)

# 5-fold cross-validated grid search over a default SVC, using all CPU cores
svc = SVC()
grid_search = GridSearchCV(estimator=svc, param_grid=parameters, cv=5, n_jobs=-1)

# Time the search; the labels are flattened to 1D for fitting
start_time = timeit.default_timer()
grid_search.fit(X_train, Y_train.ravel())
print("--- {0:.3f} seconds ---".format(timeit.default_timer() - start_time))

# Best parameter combination and the estimator refitted with it
print(grid_search.best_params_)
svc_best = grid_search.best_estimator_

# Accuracy of the tuned model on the held-out test set
accuracy = svc_best.score(X_test, Y_test)
print("Accuracy on the testing set is: {0:.1f}%".format(accuracy*100))

# Per-class precision / recall / F1 report
prediction = svc_best.predict(X_test)
report = classification_report(Y_test, prediction)
print(report)


--- 4.970 seconds ---
{'C': 10000000.0, 'gamma': 1e-05}
Accuracy on the testing set is: 47.9%
              precision    recall  f1-score   support

           0       0.50      0.88      0.64         8
           1       0.47      0.47      0.47        15
           2       0.45      0.38      0.42        13
           3       0.50      0.33      0.40        12

    accuracy                           0.48        48
   macro avg       0.48      0.51      0.48        48
weighted avg       0.48      0.48      0.46        48



#### RandomForest

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Create the random forest classifier.
# A fixed seed (the same one used for the train/test split) makes the forest,
# and therefore the reported scores, reproducible between runs.
clf_rfc = RandomForestClassifier(random_state=42)

# Hyper-parameter grid: 3 * 2 * 3 * 3 * 3 = 162 combinations
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}

# 5-fold grid search; n_jobs=-1 runs the fits on all CPU cores, as the SVC
# search does
grid_search = GridSearchCV(clf_rfc, param_grid, cv=5, n_jobs=-1)

# Train the classifier (labels flattened to 1D) and time the search
start_time = timeit.default_timer()
grid_search.fit(X_train, np.ravel(Y_train))
print("--- {0:.3f} seconds ---".format(timeit.default_timer() - start_time))

# Report the best parameters and predict on the test set with the refitted
# best estimator
print(grid_search.best_params_)
Y_preds = grid_search.predict(X_test)

# Print the classification report (labels flattened to match the 1D predictions)
print(classification_report(np.ravel(Y_test), Y_preds))

              precision    recall  f1-score   support

           0       0.17      0.38      0.23         8
           1       0.45      0.33      0.38        15
           2       0.25      0.15      0.19        13
           3       0.27      0.25      0.26        12

    accuracy                           0.27        48
   macro avg       0.29      0.28      0.27        48
weighted avg       0.31      0.27      0.28        48



#### GradientBoosted

In [None]:
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier

# Create the gradient boosting classifier.
# A fixed seed (the same one used for the train/test split) makes the model,
# and therefore the reported scores, reproducible between runs.
clf_gbc = GradientBoostingClassifier(random_state=42)

# Hyper-parameter grid: 3 * 3 * 3 * 3 * 3 = 243 combinations
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}

# 5-fold grid search; n_jobs=-1 runs the fits on all CPU cores, as the SVC
# search does
grid_search = GridSearchCV(clf_gbc, param_grid, cv=5, n_jobs=-1)

# Train the classifier (labels flattened to 1D) and time the search
start_time = timeit.default_timer()
grid_search.fit(X_train, np.ravel(Y_train))
print("--- {0:.3f} seconds ---".format(timeit.default_timer() - start_time))

# Report the best parameters and predict on the test set with the refitted
# best estimator
print(grid_search.best_params_)
Y_preds = grid_search.predict(X_test)

# Print the classification report (labels flattened to match the 1D predictions)
print(classification_report(np.ravel(Y_test), Y_preds))

              precision    recall  f1-score   support

           0       0.38      0.75      0.50         8
           1       0.70      0.47      0.56        15
           2       0.50      0.38      0.43        13
           3       0.25      0.25      0.25        12

    accuracy                           0.44        48
   macro avg       0.46      0.46      0.44        48
weighted avg       0.48      0.44      0.44        48



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

def run_experiments(df, models, results_path='results.csv', random_state=None):
    """Tune an SVC by grid search, then fit and score every model in ``models``.

    Args:
        df: DataFrame containing the feature columns plus a ``'target'``
            column with the labels.
        models: Iterable of scikit-learn style estimators to compare. They
            are cloned before fitting, so the caller's objects are left
            untouched.
        results_path: Path of the CSV file that receives one
            ``(model class name, test score)`` row per model.
        random_state: Seed forwarded to ``train_test_split``. ``None`` (the
            default) gives a different random split on every call.

    Returns:
        None. The tuned SVC score is printed and the per-model scores are
        written to ``results_path``.
    """
    # Local imports: the surrounding cell imports neither of these names.
    import csv
    from sklearn.base import clone

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop('target', axis=1), df['target'],
        test_size=0.25, random_state=random_state)

    # Create a grid of hyperparameters to search over
    param_grid = {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'sigmoid'],
    }

    # Create a grid search object
    grid_search = GridSearchCV(SVC(), param_grid, cv=5)

    # Fit the grid search object to the training set
    grid_search.fit(X_train, y_train)

    # Evaluate the best model on the test set
    best_model = grid_search.best_estimator_
    score = best_model.score(X_test, y_test)
    print('Model: {} Score: {:.2f}'.format(best_model.__class__.__name__, score))

    # Export the experiment results.
    # newline='' is what the csv module expects for files it writes; without
    # it, extra blank lines appear on platforms that translate line endings.
    with open(results_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Model', 'Score'])
        for model in models:
            # An estimator must be fitted before score() can be called;
            # fit a clone on the training set and score it on the test set.
            fitted = clone(model).fit(X_train, y_train)
            writer.writerow([model.__class__.__name__, fitted.score(X_test, y_test)])

# Entry point: compare three default classifiers on the dataset stored in data.csv
if __name__ == '__main__':

    # Load the dataset
    # NOTE(review): 'data.csv' is not written by any of the visible cells, and
    # run_experiments expects it to contain a 'target' column — confirm the intended input.
    df = pd.read_csv('data.csv')

    # Create a list of models to test
    models = [SVC(), RandomForestClassifier(), GradientBoostingClassifier()]

    # Run the experiments
    run_experiments(df, models)


### Findings
The results above show that the SVC performs best on the held-out test set, with an accuracy of 0.48 and a macro-averaged F1 score of 0.48. The gradient boosting classifier follows with an accuracy and macro F1 of 0.44, and the random forest comes last with an accuracy and macro F1 of 0.27.

This suggests that the SVC is the best-performing classifier for this multiclass dataset. Gradient boosting is close behind, while the random forest is barely above the 25% chance level for four balanced classes. None of the three classifiers performs well in absolute terms.

There are a few possible reasons for these results. One possibility is that the classes are not linearly separable in the PCA feature space. All three classifiers can model non-linear decision boundaries (the SVC through its default RBF kernel, the random forest and gradient boosting through ensembles of decision trees), so the gap between them more likely reflects how well each method copes with 100 principal components and only 192 training samples than a linear-versus-non-linear distinction.

Another possibility is class imbalance, meaning that some classes have more samples than others. The full dataset is balanced (60 samples per class, as the check above shows), but the random 80/20 split is not stratified, so the test set contains between 8 and 15 samples per class. With only 48 test samples, the per-class scores are therefore noisy.

Finally, it is also possible that the hyperparameters of the classifiers were not tuned well. Hyperparameters are the parameters that control the behavior of the classifier, and they need to be tuned carefully in order to get the best performance. For example, the best SVC `gamma` (1e-05) lies on the edge of the searched grid, which suggests that the grid should be extended; the random forest and gradient boosting grids may likewise not cover the best settings for this dataset.

Overall, the SVC is the best-performing classifier for this multiclass dataset, followed by gradient boosting and then the random forest. All three leave substantial room for improvement, possibly because the classes are hard to separate in the chosen feature space, because the small unstratified test set makes the estimates noisy, or because the hyperparameters of the classifiers were not tuned well.