# Import Libraries

In [4]:
# Import necessary libraries to get started
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
from tqdm import tqdm
import pickle
import os

# Data Preprocessing

In [5]:
# Define the dataset loader (the data is actually loaded when get_data() is called)
def get_data(feature_dir='../EEG-DE-feature/eeg_used_1s'):
    """
    Load SEED data from every .npz file in a directory and merge it.

    Each file must provide four entries: 'train_data' and 'test_data'
    (pickled dicts keyed by the band names 'delta', 'theta', 'alpha', 'beta',
    'gamma') plus 'train_label' and 'test_label' (label arrays). Files are
    processed in sorted filename order. Per band, the arrays of all files are
    concatenated along axis 0; the five bands are then stacked side by side
    with np.hstack in the order listed above.

    Labels are returned exactly as stored in the files; no remapping is done.

    :param feature_dir: Directory containing the .npz files. Defaults to the
        path used by this notebook.
    :return: Combined train data, combined test data, train labels, test labels.
    :raises ValueError: If feature_dir contains no files.
    """
    bands = ('delta', 'theta', 'alpha', 'beta', 'gamma')

    file_list = sorted(os.listdir(feature_dir))
    if not file_list:
        # Fail early with a readable message instead of the opaque
        # "need at least one array to concatenate" from np.concatenate.
        raise ValueError('No data files found in {!r}'.format(feature_dir))

    all_train_data = {key: [] for key in bands}
    all_test_data = {key: [] for key in bands}
    all_train_labels = []
    all_test_labels = []

    for item in tqdm(file_list, desc="Loading data"):
        # The context manager closes the underlying zip archive once the
        # arrays have been read, so no file handle is leaked per input file.
        with np.load(os.path.join(feature_dir, item)) as npz_data:
            # SECURITY: pickle.loads can execute arbitrary code. Only point
            # feature_dir at files from a trusted source.
            train_data = pickle.loads(npz_data['train_data'])
            test_data = pickle.loads(npz_data['test_data'])
            # Indexing materialises the arrays, so they stay valid after close.
            all_train_labels.append(npz_data['train_label'])
            all_test_labels.append(npz_data['test_label'])

        # Append data from this file to the overall dictionary
        for key in bands:
            all_train_data[key].append(train_data[key])
            all_test_data[key].append(test_data[key])

    # Combine train and test data across all files
    train_data = {key: np.concatenate(all_train_data[key], axis=0) for key in bands}
    test_data = {key: np.concatenate(all_test_data[key], axis=0) for key in bands}
    train_labels = np.concatenate(all_train_labels, axis=0)
    test_labels = np.concatenate(all_test_labels, axis=0)

    # Combine features from all bands into a single feature vector
    trainX = np.hstack([train_data[key] for key in bands])
    testX = np.hstack([test_data[key] for key in bands])

    # Debug information
    print("Combined Train Data Shape:", trainX.shape)
    print("Combined Test Data Shape:", testX.shape)
    print("Combined Train Labels Shape:", train_labels.shape)
    print("Combined Test Labels Shape:", test_labels.shape)

    return trainX, testX, train_labels, test_labels

# Modelling, Training and Testing

In [6]:
# Load the train/test features and labels. The split is the one returned by
# get_data() (each file's 'train_*' and 'test_*' entries); no 80/20 split is made here.
# NOTE(review): train_test_split is imported but not called in the visible cells.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = get_data()

Loading data: 100%|██████████| 36/36 [00:00<00:00, 38.30it/s]


Combined Train Data Shape: (72360, 310)
Combined Test Data Shape: (49824, 310)
Combined Train Labels Shape: (72360,)
Combined Test Labels Shape: (49824,)


In [7]:
# Import RandomForestClassifier and fit it on the training set.
# The forest has 100 trees, each limited to depth 2; random_state=0 fixes the seed.
from sklearn.ensemble import RandomForestClassifier
estimator = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
estimator.fit(X_train, y_train)

In [8]:
# Apply the trained estimator to the test set.
# score() gives the mean accuracy; predict() gives the per-sample labels that
# the following cells display and feed into the metrics.
mean_accuracy=estimator.score(X_test, y_test)
y_pred = estimator.predict(X_test)
print(mean_accuracy)
# Print the fitted forest's importance value for each input feature column.
print(estimator.feature_importances_)

0.5798410404624278
[0.00128948 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.00065135 0.
 0.         0.         0.00289204 0.         0.         0.
 0.         0.         0.         0.         0.00098366 0.
 0.         0.         0.         0.         0.         0.00254791
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.0006027
 0.         0.         0.00167444 0.         0.         0.
 0.         0.         0.00377079 0.         0.00125824 0.00104893
 0.         0.         0.00180232 0.00131991 0.00110507 0.
 0.         0.         0.00093195 0.00075327 0.00432052 0.
 0.00228454 0.         0.         0.00088308 0.         0.
 0.         0.00426875 0.         0.         0.         0.
 0.         0.         0.         0.00124131 0.         0.
 0.         0.

In [9]:
# Display the predicted test-set labels (the bare last expression shows the array repr)
y_pred

array([2., 2., 2., ..., 0., 0., 0.])

In [10]:
# Import the relevant metrics and summarise test-set performance per class.
# In the confusion matrix, rows are true labels and columns are predicted labels.
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[ 6913  4242  4649]
 [ 4916  8299  3705]
 [ 1831  1591 13678]]
              precision    recall  f1-score   support

         0.0       0.51      0.44      0.47     15804
         1.0       0.59      0.49      0.53     16920
         2.0       0.62      0.80      0.70     17100

    accuracy                           0.58     49824
   macro avg       0.57      0.58      0.57     49824
weighted avg       0.57      0.58      0.57     49824

