# Emotion Classification

### Import packages

In [3]:
from tqdm import tqdm
import os
import sys

import pandas as pd
import numpy as np

import pywt
import scipy.io as spio
from collections import Counter

from sklearn import svm
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from typing import Dict, Tuple, List

import timeit
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)


### Define some params/ args

In [4]:
# Directory containing the raw EEG recordings (.mat files)
input_dir = "data/eeg_raw_data/1/"
# Directory where processed results (features, principal components) are saved
output_dir = "outputs_bp/"
# Create the output directory up front; exist_ok avoids an error on re-runs
os.makedirs(output_dir, exist_ok=True)

In [5]:
# Read the EEG channel names from the spreadsheet; header=None because the
# first row is already a channel name rather than a header
channel_order = pd.read_excel("Channel Order.xlsx", header=None)
# One row per channel is expected
print(channel_order.shape)
channel_order.head(2)

(62, 1)


Unnamed: 0,0
0,FP1
1,FPZ


In [6]:
# Build the labels dataframe: one label per (participant file, trial).
# Only .mat recordings are kept, and they are sorted because os.listdir
# returns entries in arbitrary order; a fixed order keeps the row order of the
# feature table (and therefore the seeded train/test split) reproducible.
files = sorted(
    name for name in os.listdir(input_dir)
    if name.lower().endswith(".mat")
)
# Label ids 0, 1, 2 and 3 denote the neutral, sad, fear and happy emotions,
# respectively
id2labels = {
    0: "neutral",
    1: "sad",
    2: "fear",
    3: "happy",
}
# Ground-truth label id of each of the 24 trials; the same sequence is
# repeated once per participant file
session1_label = [1,2,3,0,2,0,0,1,0,1,2,1,1,1,2,3,2,2,3,3,0,3,0,3]
labels = session1_label * len(files)
labels_df = pd.DataFrame({'label': labels})
labels_df['label_name'] = labels_df['label'].map(id2labels)
print(labels_df.shape)
labels_df.head(2)

(240, 2)


Unnamed: 0,label,label_name
0,1,sad
1,2,fear


### Feature extraction

Now that we have the input data and labels, we are ready for feature extraction. The total number of data points is calculated as follows:
- Number of participants = number of .mat files = 10
- Number of trials that each individual will experience = 24
- Number of channels = 62

Therefore:
- For each participant and trial, the collected EEG signals form a matrix of shape (62, n), where n is the number of samples, which depends on the duration of the video.
- There are 24 such (62, n) matrices for each participant.
- Each channel is converted into the band powers of 5 sub-bands, so the number of features per trial is 62 x 5 = 310.
- The full dataset therefore has shape (10 x 24, 62 x 5) = (240, 310).

In [7]:
def calculate_band_power(coeff_d, band_limits):
    """Estimate the power of one wavelet sub-band from its detail coefficients.

    The squared magnitudes of the coefficients are integrated with the
    trapezoidal rule, using the width of the band as the sample spacing.

    Args:
        coeff_d: 1-D array-like of detail coefficients for the sub-band.
        band_limits: Sequence whose first two items are the lower and upper
            limits of the band; only their difference is used, as the
            integration step ``dx``.

    Returns:
        The band power as a scalar.
    """
    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; use
    # whichever this NumPy version provides so the function works on both.
    trapezoid = getattr(np, "trapezoid", None) or np.trapz

    # Power of each coefficient.
    psd = np.abs(coeff_d) ** 2

    # Integrate the power over the band.
    return trapezoid(psd, dx=(band_limits[1] - band_limits[0]))

In [8]:
WAVELET = "db6"  # wavelet used to transform the EEG signals
MAX_LEVEL = 5  # number of sub-bands each channel is decomposed into
n_features = 310  # n_sub_bands x number of channels = 5 x 62


def _channel_band_powers(signal, wavelet=WAVELET, max_level=MAX_LEVEL):
    """Return one band power per decomposition level for a single channel."""
    band_powers = []
    approx = signal
    for level in range(max_level):
        # pywt.dwt uses mode='symmetric' by default: the signal is padded
        # with its reflection at the boundaries. The approximation
        # coefficients are fed back in as the input of the next level.
        approx, coeff_d = pywt.dwt(approx, wavelet)
        # calculate_band_power only uses the difference of these limits, as
        # its integration step.
        band_limits = (2**level, 2**(level + 1))
        band_powers.append(calculate_band_power(coeff_d, band_limits))
    return band_powers


# One flat feature vector per (participant file, trial)
participant_trial = []

for file in tqdm(files, desc='tqdm() Progress Bar'):
    mat_file = spio.loadmat(os.path.join(input_dir, file))

    # loadmat adds metadata entries such as '__header__', '__version__' and
    # '__globals__'; every other key holds the signal matrix of one trial
    trial_keys = [key for key in mat_file if not key.startswith('__')]

    for trial_key in trial_keys:
        # Rows are channels; each row is decomposed independently
        trial_signals = np.asarray(mat_file[trial_key])

        # Flatten the (channels x sub-bands) band powers channel by channel
        unroll_bps = []
        for channel_signal in trial_signals:
            unroll_bps.extend(_channel_band_powers(channel_signal))

        participant_trial.append(unroll_bps)

# Build the table once instead of growing it row by row with .loc, which
# re-allocates on every insert and leaves the columns with object dtype
features_table = pd.DataFrame(participant_trial, columns=range(n_features))


tqdm() Progress Bar:   0%|          | 0/10 [00:00<?, ?it/s]

tqdm() Progress Bar: 100%|██████████| 10/10 [00:22<00:00,  2.25s/it]


In [9]:
# Sanity check: expect (participants x trials, channels x sub-bands)
print(features_table.shape)

(240, 310)


In [10]:
# Persist the extracted features in the output directory
features_table.to_csv(f"{output_dir}features{WAVELET}.csv", index=False)

### Principal Components Analysis

In [11]:
# Reload the saved feature table from disk
data = pd.read_csv(f"{output_dir}features{WAVELET}.csv")

In [12]:
from sklearn.decomposition import PCA

# Normalise the data: axis=0 scales each feature column (not each trial) to
# unit norm
normalised_data = pd.DataFrame(normalize(data, axis=0))

# Project the normalised features onto 100 principal components.
# NOTE(review): the normalisation and PCA are fitted on all trials before the
# train/test split further down, so test trials influence the projection —
# consider fitting on the training split only.
pca_model = PCA(n_components=100)
components = pca_model.fit_transform(normalised_data)
components_df = pd.DataFrame(data=components)

In [13]:
# Sanity check: preview the first three rows of the principal components
components_df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.422924,-0.14031,0.083086,-0.04711,-0.003453,0.038523,-0.011103,-0.019908,0.019422,-0.019295,...,0.004081,-0.001275,0.003786,-0.000558,0.003711,-7.9e-05,0.002481,-0.002651,0.003744,0.004836
1,-0.515743,-0.257238,0.228098,-0.028227,0.100964,0.174305,-0.390715,2.11912,-0.23006,0.099261,...,-8.1e-05,-2.5e-05,-1.1e-05,1.4e-05,2.1e-05,1e-06,2.2e-05,-3.3e-05,-1.4e-05,1.6e-05
2,-0.330618,-0.109024,0.071122,-0.011108,-0.071077,0.021504,0.008361,-0.015203,0.00059,-0.017432,...,0.006847,-0.001291,-0.000137,0.006385,-0.000426,-0.005574,0.000814,-0.003628,-0.005696,0.000415


In [14]:
# Persist the principal components so the experiments can be reproduced
components_df.to_csv(f"{output_dir}pc{WAVELET}.csv", index=False)

### Multiclass classifiers



#### Data Splitting

In [15]:
# Load the saved principal components and pair them with the labels
pcs = pd.read_csv(f"{output_dir}pc{WAVELET}.csv")
# (components_df could be used directly instead of reloading from disk)
outputs = labels_df

# Features: every principal-component column
X = pcs.to_numpy()
# Targets: first column of the labels frame only, kept 2-D
Y = outputs.iloc[:, [0]].to_numpy()

# Hold out 20% of the trials for testing; the seed fixes the split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [16]:
# Sanity check: types and shapes of the full feature matrix and targets
print(type(X), type(Y))
print(X.shape, Y.shape)

# Shapes of the train/test split
print(X_train.shape, X_test.shape)
print(Y_train.shape, Y_test.shape)

<class 'numpy.ndarray'> <class 'numpy.ndarray'>
(240, 100) (240, 1)
(192, 100) (48, 100)
(192, 1) (48, 1)


In [17]:
# Check whether the classes are imbalanced by counting trials per emotion
X_df = pd.DataFrame.from_records(X)

# ignore_index=True renumbers the concatenated columns 0..N-1, so the feature
# columns come first and the two label columns (label id, label name) last
data_merged = pd.concat([X_df, labels_df], axis=1, ignore_index=True)
# print(data_merged.head(2))

print(data_merged.shape)
# Column 101 holds the label name, so this counts the rows for each emotion
data_merged.groupby(101).count().reset_index()

(240, 102)


Unnamed: 0,101,0,1,2,3,4,5,6,7,8,...,91,92,93,94,95,96,97,98,99,100
0,fear,60,60,60,60,60,60,60,60,60,...,60,60,60,60,60,60,60,60,60,60
1,happy,60,60,60,60,60,60,60,60,60,...,60,60,60,60,60,60,60,60,60,60
2,neutral,60,60,60,60,60,60,60,60,60,...,60,60,60,60,60,60,60,60,60,60
3,sad,60,60,60,60,60,60,60,60,60,...,60,60,60,60,60,60,60,60,60,60


### Experiments

In [63]:
def report2csv(report):
    """Convert a text classification report into a per-class DataFrame.

    Each class row of the report ends with four whitespace-separated values
    (precision, recall, f1-score, support). Rows are split from the right so
    that class labels of any length, including labels containing spaces, are
    kept intact. The header, blank lines, the accuracy row and the averaged
    summary rows are skipped.

    Args:
        report: Report text as returned by
            ``sklearn.metrics.classification_report``.

    Returns:
        DataFrame with the string columns ``class``, ``precision``,
        ``recall`` and ``f1_score``, one row per class, with surrounding
        whitespace removed. The columns are present even when no class row
        is found.
    """
    summary_rows = {
        'accuracy', 'micro avg', 'macro avg', 'weighted avg', 'samples avg',
    }
    columns = ['class', 'precision', 'recall', 'f1_score']
    report_data = []

    for line in report.splitlines():
        # Split off the four trailing values; whatever remains on the left is
        # the class label. The header has only four tokens and the accuracy
        # row three, so they never yield five parts.
        parts = line.rsplit(None, 4)
        if len(parts) != 5:
            continue

        class_name = parts[0].strip()
        if class_name in summary_rows:
            continue

        report_data.append({
            'class': class_name,
            'precision': parts[1],
            'recall': parts[2],
            'f1_score': parts[3],
        })

    return pd.DataFrame(report_data, columns=columns)

In [90]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

def run_experiments(X_train, X_test, Y_train, Y_test, param_grid, model, model_name):
    """Tune ``model`` with a 5-fold grid search and report its test scores.

    The best parameters and the text classification report are printed.

    Args:
        X_train: Training features.
        X_test: Test features.
        Y_train: Training targets; flattened to 1-D before fitting.
        Y_test: Test targets.
        param_grid: Parameter grid handed to ``GridSearchCV``.
        model: Estimator to tune.
        model_name: Name stored in the ``model`` column of the result.

    Returns:
        The DataFrame produced by ``report2csv`` for the test-set report,
        with an added ``model`` column.
    """
    # Search the grid with 5-fold cross-validation on the training set
    search = GridSearchCV(model, param_grid, cv=5)
    search.fit(X_train, np.ravel(Y_train))
    print(search.best_params_)

    # Score the tuned model on the held-out test set
    predictions = search.predict(X_test)
    text_report = classification_report(Y_test, predictions)
    print(text_report)

    per_class_df = report2csv(text_report)
    per_class_df['model'] = model_name
    return per_class_df


In [106]:
import time

# Models to compare and the hyperparameter grid searched for each of them;
# the three lists are aligned by position.
# NOTE(review): the tree ensembles are created without a random_state, so
# their scores can vary between runs.
model_names = ["SVM", "Random Forest", "Gradient Boosting"]
models = [SVC(), RandomForestClassifier(), GradientBoostingClassifier()]
param_grids = [
    {
        "C": (100, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9),
        "gamma": (1e-08, 1e-7, 1e-6, 1e-5)
    },
    {
        'n_estimators': [100, 200, 300],
        'max_depth': [5, 10, 12],
    },
    {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1],
    }
]

# Elapsed seconds per model, in the same order as model_names
process_time = []
# Per-model reports, concatenated once after the loop
reports = []

for param_grid, model, model_name in zip(param_grids, models, model_names):
    print(f"Process model: {model}")
    # perf_counter is a monotonic clock intended for measuring durations
    start_time = time.perf_counter()
    report_df = run_experiments(X_train, X_test, Y_train, Y_test, param_grid, model, model_name)
    elapsed = time.perf_counter() - start_time

    process_time.append(elapsed)
    report_df['process_time'] = elapsed
    reports.append(report_df)

final_report_df = pd.concat(reports, axis=0, ignore_index=True)


Process model: SVC()
{'C': 10000000.0, 'gamma': 1e-05}
              precision    recall  f1-score   support

           0       0.50      0.75      0.60         8
           1       0.53      0.60      0.56        15
           2       0.62      0.38      0.48        13
           3       0.55      0.50      0.52        12

    accuracy                           0.54        48
   macro avg       0.55      0.56      0.54        48
weighted avg       0.55      0.54      0.54        48

           0       0.50      0.75      0.60         8
           1       0.53      0.60      0.56        15
           2       0.62      0.38      0.48        13
           3       0.55      0.50      0.52        12
Process model: RandomForestClassifier()
{'max_depth': 12, 'n_estimators': 200}
              precision    recall  f1-score   support

           0       0.43      0.75      0.55         8
           1       0.54      0.47      0.50        15
           2       0.62      0.38      0.48        1

In [107]:
# Display the combined per-class report of all models
final_report_df

Unnamed: 0,class,precision,recall,f1_score,model,process_time
0,0,0.5,0.75,0.6,SVM,0.50227
1,1,0.53,0.6,0.56,SVM,0.50227
2,2,0.62,0.38,0.48,SVM,0.50227
3,3,0.55,0.5,0.52,SVM,0.50227
4,0,0.43,0.75,0.55,Random Forest,9.089895
5,1,0.54,0.47,0.5,Random Forest,9.089895
6,2,0.62,0.38,0.48,Random Forest,9.089895
7,3,0.38,0.42,0.4,Random Forest,9.089895
8,0,0.3,0.38,0.33,Gradient Boosting,99.496126
9,1,0.43,0.4,0.41,Gradient Boosting,99.496126


In [108]:
# Save the combined report next to the other outputs; output_dir is the
# shared output directory, so the path is no longer hard-coded twice
final_report_df.to_csv(output_dir + "final_report.csv")

### Findings
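The results show that the SVM (scikit-learn's `SVC`) achieves the best scores, with per-class F1 scores between 0.48 and 0.60 and a test accuracy of 0.54. Random Forest follows with per-class F1 scores between 0.40 and 0.55, and Gradient Boosting is the weakest, with F1 scores of 0.33 and 0.41 on the classes shown in the report table.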

This suggests that the SVM is the best-performing classifier for this multiclass dataset. Random Forest performs reasonably but not as well as the SVM. Gradient Boosting performs worst of the three and is also by far the slowest to tune (about 99 s, versus about 9 s for Random Forest and 0.5 s for the SVM).

There are a few possible reasons for these results. One possibility is that the data is not linearly separable. However, all three models can learn non-linear decision boundaries (the SVM through its default RBF kernel, Random Forest and Gradient Boosting through ensembles of decision trees), so the differences more likely reflect how well each model copes with only 192 training samples described by 100 principal components.

Another possibility is class imbalance. Imbalanced data means that some classes have more samples than others. The full dataset is balanced (60 trials per emotion), but the 80/20 split is not stratified, so the test set contains between 8 and 15 trials per class. With so few test samples, the per-class scores are noisy and the differences between models should be interpreted with caution.

Finally, it is also possible that the hyperparameters of the classifiers were not tuned well. Hyperparameters are the parameters that control the behavior of the classifier, and they need to be tuned carefully to get the best performance. Only a small grid was searched for each model, and the best SVM `gamma` (1e-05) lies on the edge of its grid, so better settings for the SVM, Random Forest, or Gradient Boosting classifier may lie outside the ranges tried.

Overall, the results suggest that the SVM is the best-performing classifier for this multiclass dataset, followed by Random Forest, with Gradient Boosting performing worst. Possible explanations are the small number of training samples relative to the number of features, the small and unevenly distributed test set, and hyperparameter grids that may not cover the best settings.