# Datasets

## Importing packages

In [None]:
# Install the third-party packages this notebook needs in a fresh Colab runtime.
# NOTE(review): versions are unpinned apart from tqdm — consider pinning for reproducibility.
!pip install speechbrain
# transformers provides Wav2Vec2Model / HubertModel, imported in the imports cell.
!pip install transformers
# serab_byols is installed in editable mode from a clone of its GitHub repository.
!git clone https://github.com/GasserElbanna/serab-byols.git
!python3 -m pip install -e ./serab-byols

# NOTE(review): the reason for pinning tqdm to 4.60.0 is not visible here — confirm it is still required.
!pip install tqdm==4.60.0
!pip install opensmile


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting speechbrain
  Downloading speechbrain-0.5.12-py3-none-any.whl (496 kB)
[K     |████████████████████████████████| 496 kB 5.0 MB/s 
Collecting hyperpyyaml
  Downloading HyperPyYAML-1.0.1.tar.gz (14 kB)
Collecting huggingface-hub
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 10.1 MB/s 
Collecting torch<=1.11,>=1.7
  Downloading torch-1.11.0-cp37-cp37m-manylinux1_x86_64.whl (750.6 MB)
[K     |████████████████████████████████| 750.6 MB 10 kB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 44.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |█████████████████████

In [None]:
! pip install -q kaggle

from google.colab import files
files.upload()
files.upload()

# Name directory
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


Saving utilities.py to utilities.py


In [None]:
import os
import numpy as np
from tqdm import tqdm
from glob import glob
from random import sample
from pathlib import Path
import pandas as pd

import librosa
import soundfile as sf

import torch
import opensmile
import serab_byols
from transformers import Wav2Vec2Model, HubertModel

from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV, train_test_split

import warnings
warnings.filterwarnings('ignore')

from utilities import load_audio_files, audio_embeddings_model, audio_embeddings, speaker_normalisation, split_train_test, get_hyperparams


  _resample_loop_p(x, t_out, interp_win, interp_delta, num_table, scale, y)


# Defining a function for the pipeline

In [None]:
# Defining a function for all steps 

def pipeline(audio_list, speakers, labels, model_names, dataset = None):
  '''
  Runs the evaluation pipeline once for every requested embedding model.

  For each model name the function extracts embeddings for all audios,
  passes them through speaker_normalisation, splits them into train and
  test sets and hands three classifiers (logistic regression, SVM, random
  forest) with their parameter grids to get_hyperparams. Intermediate
  shapes and a zero-mean sanity check are printed along the way.

  Parameters
  ------------
  audio_list: list
      The loaded audios, as returned by load_audio_files
  speakers: list
      One speaker identifier per audio, in the same order as audio_list
  labels: list
      One class label per audio, in the same order as audio_list
  model_names: list of strings
      Names forwarded to audio_embeddings_model and audio_embeddings
  dataset: string
      Name of the dataset; currently unused, kept so existing calls keep working

  Returns
  ------------
  None
      Everything is reported through print (get_hyperparams presumably
      prints the accuracies and best parameters -- confirm in utilities.py)

  '''
  for model_name in model_names:
    print('MODEL: {}'.format(model_name))

    # Embeddings Extraction
    model = audio_embeddings_model(model_name = model_name)
    embeddings_array = audio_embeddings(audio_list, model_name=model_name, model=model)
    print('embeddings_array shape: {}'.format(embeddings_array.shape))

    # Speaker Normalisation
    normalised_embeddings = speaker_normalisation(embeddings_array, speakers)
    print('normalised_embeddings shape: {}'.format(normalised_embeddings.shape))
    columnwise_mean = torch.mean(normalised_embeddings, 0)
    # Compare magnitudes: a signed `<` would also pass for arbitrarily large
    # negative means, so the check could never flag them.
    if torch.all(torch.abs(columnwise_mean) < 10**(-6)):
      print('PASSED: All means are less than 10**-6')
    else:
      print('FAILED: All means are NOT less than 10**-6')

    # Train Test Splitting
    # NOTE(review): speakers is passed to the splitter, presumably so the split
    # is made per speaker -- confirm in utilities.split_train_test.
    X_train, X_test, y_train, y_test = split_train_test(normalised_embeddings, labels, speakers, test_size = 0.30)
    print('X_train shape: {}'.format(X_train.shape))
    print('X_test shape: {}'.format(X_test.shape))
    print('y_train len: {}'.format(len(y_train)))
    print('y_test len: {}'.format(len(y_test)))
    print()

    # Getting hyperparameters and checking Accuracy
    searches = [
      # Only 'l2' is searched: the lbfgs and sag solvers reject an 'l1'
      # penalty, so those candidates could never be fitted.
      ('Logistic Regression:',
       LogisticRegression(),
       {'penalty' : ['l2'], 'C': np.logspace(-3,2,6), 'solver': ['lbfgs', 'sag']}),
      ('Support Vector Machine:',
       SVC(),
       {'C': np.logspace(-2,4,4), 'gamma': np.logspace(-5,3,5), 'kernel':['rbf']}),
      # NOTE(review): max_features='auto' is not accepted by recent scikit-learn
      # releases (for classifiers it meant 'sqrt') -- confirm the installed version.
      ('Random Forest Classifier:',
       RandomForestClassifier(),
       {'n_estimators' : [50,100,200], 'max_features' : ['auto', 'log2', 'sqrt'], 'bootstrap' : [False]}),
    ]
    for title, classifier, parameters in searches:
      print(title)
      get_hyperparams(X_train, X_test, y_train, y_test, classifier, parameters)
    print()
    print()


# Dataset: Canadian French Emotion (CaFE)

In [None]:
# Phase_1
# Load dataset
! wget https://zenodo.org/record/1478765/files/CaFE_48k.zip?download=1
! unzip -q CaFE_48k.zip?download=1 -d /content/cafe

# Select all the audio files
audios = []
for file in Path('/content/cafe').glob("**/*.wav"):
    if not file.is_file(): 
        continue
    audios.append(str(file))

# Load and resample audio files
audio_list = load_audio_files(audios, resampling_frequency=16000)

# Making speakers list and labels list 
speakers = []
labels = []
for audio_file in audios:
  file_name = audio_file.split('/')[-1]
  speakers.append(file_name.split('-')[-1].split('.')[0])
  labels.append(audio_file.split('/')[3])


# Verify phase_1
print('Number of audio files: {}'.format(len(audio_list)))
print('Number of speaker classes: {}'.format(len(set(speakers))))
print('Speaker classes: {}'.format(set(speakers)))
print('Number of speakers: {}'.format(len(speakers)))
print('Number of label classes: {}'.format(len(set(labels))))
print('Label classes: {}'.format(set(labels)))
print('Number of labels: {}'.format(len(labels)))

--2022-07-31 15:53:44--  https://zenodo.org/record/1478765/files/CaFE_48k.zip?download=1
Resolving zenodo.org (zenodo.org)... 137.138.76.77
Connecting to zenodo.org (zenodo.org)|137.138.76.77|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 187220691 (179M) [application/octet-stream]
Saving to: ‘CaFE_48k.zip?download=1’


2022-07-31 15:53:54 (22.3 MB/s) - ‘CaFE_48k.zip?download=1’ saved [187220691/187220691]

Number of audio files: 936
Number of speaker classes: 6
Speaker classes: {'6', '3', '2', '5', '1', '4'}
Number of speakers: 936
Number of label classes: 7
Label classes: {'Neutre', 'Surprise', 'Joie', 'Peur', 'DВgoЦt', 'ColКre', 'Tristesse'}
Number of labels: 936


## Getting accuracy of all the models

In [None]:
# model_names = ['wav2vec', 'hubert']
# pipeline(audio_list, speakers, labels, model_names, 'CaFE')


In [None]:
# Run the pipeline on the CaFE data once per listed model name.
model_names = ['hybrid_byols', 'compare', 'egemaps']
pipeline(audio_list, speakers, labels, model_names, 'CaFE')


MODEL: hybrid_byols


Generating Embeddings...: 100%|██████████| 936/936 [01:16<00:00, 12.17it/s]


embeddings_array shape: torch.Size([936, 2048])
normalised_embeddings shape: torch.Size([936, 2048])
PASSED: All means are less than 10**-6
X_train shape: torch.Size([780, 2048])
X_test shape: torch.Size([156, 2048])
y_train len: 780
y_test len: 156

Logistic Regression:
Accuracy : 0.780952380952381
Best Parameters: {'C': 100.0, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy on test_set: 0.8452380952380952
Support Vector Machine:
Accuracy : 0.7904761904761906
Best Parameters: {'C': 100.0, 'gamma': 1e-05, 'kernel': 'rbf'}
Accuracy on test_set: 0.8511904761904763
Random Forest Classifier:
Accuracy : 0.7178571428571429
Best Parameters: {'bootstrap': False, 'max_features': 'auto', 'n_estimators': 200}
Accuracy on test_set: 0.7857142857142857


MODEL: compare


100%|██████████| 936/936 [02:14<00:00,  6.94it/s]


embeddings_array shape: torch.Size([936, 6373])
normalised_embeddings shape: torch.Size([936, 6373])
PASSED: All means are less than 10**-6
X_train shape: torch.Size([780, 6373])
X_test shape: torch.Size([156, 6373])
y_train len: 780
y_test len: 156

Logistic Regression:
Accuracy : 0.6702380952380953
Best Parameters: {'C': 0.01, 'penalty': 'l2', 'solver': 'sag'}
Accuracy on test_set: 0.6547619047619048
Support Vector Machine:
Accuracy : 0.6452380952380953
Best Parameters: {'C': 100.0, 'gamma': 1e-05, 'kernel': 'rbf'}
Accuracy on test_set: 0.6369047619047619
Random Forest Classifier:
Accuracy : 0.6011904761904763
Best Parameters: {'bootstrap': False, 'max_features': 'sqrt', 'n_estimators': 200}
Accuracy on test_set: 0.6071428571428571


MODEL: egemaps


100%|██████████| 936/936 [02:36<00:00,  6.00it/s]


embeddings_array shape: torch.Size([936, 88])
normalised_embeddings shape: torch.Size([936, 88])
PASSED: All means are less than 10**-6
X_train shape: torch.Size([780, 88])
X_test shape: torch.Size([156, 88])
y_train len: 780
y_test len: 156

Logistic Regression:
Accuracy : 0.5916666666666666
Best Parameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy on test_set: 0.5833333333333334
Support Vector Machine:
Accuracy : 0.5880952380952381
Best Parameters: {'C': 100.0, 'gamma': 0.001, 'kernel': 'rbf'}
Accuracy on test_set: 0.5595238095238095
Random Forest Classifier:
Accuracy : 0.6654761904761904
Best Parameters: {'bootstrap': False, 'max_features': 'log2', 'n_estimators': 200}
Accuracy on test_set: 0.5773809523809524




# Dataset: Persian Speech Emotion Detection Dataset (ShEMO)

In [None]:
# Phase_1
# Load dataset
! kaggle datasets download -d mansourehk/shemo-persian-speech-emotion-detection-database
! unzip -q shemo-persian-speech-emotion-detection-database.zip -d shemo;

# Select all the audio files
audios = []
for file in Path('/content/shemo').glob("**/*.wav"):
    if not file.is_file(): 
        continue
    audios.append(str(file))

# Load and resample audio files
audio_list = load_audio_files(audios, resampling_frequency=16000)

# Making speakers list and labels list 
speakers = []
labels = []
for audio_file in audios:
  file_name = audio_file.split('/')[4]
  speakers.append(file_name[4:6])
  labels.append(file_name[3])


# Verify phase_1
print('Number of audio files: {}'.format(len(audio_list)))
print('Number of speaker classes: {}'.format(len(set(speakers))))
print('Speaker classes: {}'.format(set(speakers)))
print('Number of speakers: {}'.format(len(speakers)))
print('Number of label classes: {}'.format(len(set(labels))))
print('Label classes: {}'.format(set(labels)))
print('Number of labels: {}'.format(len(labels)))

Downloading shemo-persian-speech-emotion-detection-database.zip to /content
 99% 824M/829M [00:06<00:00, 166MB/s]
100% 829M/829M [00:06<00:00, 131MB/s]
Number of audio files: 3000
Number of speaker classes: 87
Speaker classes: {'44', '23', '16', '38', '86', '15', '68', '72', '11', '09', '84', '51', '17', '80', '41', '79', '45', '08', '43', '67', '36', '76', '34', '06', '75', '30', '70', '31', '58', '32', '74', '28', '33', '56', '64', '62', '66', '59', '78', '53', '18', '55', '49', '73', '04', '81', '69', '27', '46', '40', '48', '42', '10', '20', '26', '12', '82', '07', '87', '37', '24', '47', '39', '54', '57', '13', '77', '19', '01', '52', '85', '60', '22', '65', '63', '21', '25', '71', '14', '50', '61', '05', '03', '29', '83', '35', '02'}
Number of speakers: 3000
Number of label classes: 6
Label classes: {'W', 'S', 'H', 'A', 'N', 'F'}
Number of labels: 3000


## Getting accuracy of all the models

In [None]:
# model_names = ['wav2vec', 'hubert']
# pipeline(audio_list, speakers, labels, model_names, 'ShEMO')


In [None]:
# Run the pipeline on the ShEMO data once per listed model name.
model_names = ['hybrid_byols', 'compare', 'egemaps']
pipeline(audio_list, speakers, labels, model_names, 'ShEMO')


MODEL: hybrid_byols


Generating Embeddings...: 100%|██████████| 3000/3000 [04:11<00:00, 11.92it/s]


embeddings_array shape: torch.Size([3000, 2048])
normalised_embeddings shape: torch.Size([3000, 2048])
PASSED: All means are less than 10**-6
X_train shape: torch.Size([1626, 2048])
X_test shape: torch.Size([1374, 2048])
y_train len: 1626
y_test len: 1374

Logistic Regression:
Accuracy : 0.6241436715715076
Best Parameters: {'C': 10.0, 'penalty': 'l2', 'solver': 'sag'}
Accuracy on test_set: 0.5406500133083051
Support Vector Machine:
Accuracy : 0.6103820788523637
Best Parameters: {'C': 10000.0, 'gamma': 1e-05, 'kernel': 'rbf'}
Accuracy on test_set: 0.5088360425600119
Random Forest Classifier:
Accuracy : 0.48096135022230657
Best Parameters: {'bootstrap': False, 'max_features': 'auto', 'n_estimators': 200}
Accuracy on test_set: 0.3835617820821429


MODEL: compare


100%|██████████| 3000/3000 [07:00<00:00,  7.14it/s]


embeddings_array shape: torch.Size([3000, 6373])
normalised_embeddings shape: torch.Size([3000, 6373])
PASSED: All means are less than 10**-6
X_train shape: torch.Size([1979, 6373])
X_test shape: torch.Size([1021, 6373])
y_train len: 1979
y_test len: 1021

Logistic Regression:
Accuracy : 0.5428248104221408
Best Parameters: {'C': 1.0, 'penalty': 'l2', 'solver': 'sag'}
Accuracy on test_set: 0.6126867592495242
Support Vector Machine:
Accuracy : 0.515592372361884
Best Parameters: {'C': 10000.0, 'gamma': 1e-05, 'kernel': 'rbf'}
Accuracy on test_set: 0.6144201896699778
Random Forest Classifier:
Accuracy : 0.4166988058224397
Best Parameters: {'bootstrap': False, 'max_features': 'auto', 'n_estimators': 50}
Accuracy on test_set: 0.4379319816774401


MODEL: egemaps


100%|██████████| 3000/3000 [08:09<00:00,  6.13it/s]


embeddings_array shape: torch.Size([3000, 88])
normalised_embeddings shape: torch.Size([3000, 88])
PASSED: All means are less than 10**-6
X_train shape: torch.Size([1977, 88])
X_test shape: torch.Size([1023, 88])
y_train len: 1977
y_test len: 1023

Logistic Regression:
Accuracy : 0.4686051551668714
Best Parameters: {'C': 1.0, 'penalty': 'l2', 'solver': 'sag'}
Accuracy on test_set: 0.5025268582186633
Support Vector Machine:
Accuracy : 0.4704030410043183
Best Parameters: {'C': 10000.0, 'gamma': 0.001, 'kernel': 'rbf'}
Accuracy on test_set: 0.45075655053032015
Random Forest Classifier:
Accuracy : 0.42650405787906076
Best Parameters: {'bootstrap': False, 'max_features': 'auto', 'n_estimators': 100}
Accuracy on test_set: 0.4300003123614043




# Dataset: EmoDB 

In [None]:
# Phase_1
# Load dataset
! kaggle datasets download -d piyushagni5/berlin-database-of-emotional-speech-emodb
! unzip -q berlin-database-of-emotional-speech-emodb.zip

# Load and resample audio files
audio_files = glob(os.path.join('/content/wav','*.wav'))
audio_list= load_audio_files(audio_files, resampling_frequency=16000)

# Making speakers list and labels list 
speakers = []
labels = []
for audio_file in audio_files:
  file_name = audio_file.split('/')[3]
  speakers.append(int(file_name[:2]))
  labels.append(file_name[5:6])


# Verify phase_1
print('Number of audio files: {}'.format(len(audio_list)))
print('Number of speaker classes: {}'.format(len(set(speakers))))
print('Speaker classes: {}'.format(set(speakers)))
print('Number of speakers: {}'.format(len(speakers)))
print('Number of label classes: {}'.format(len(set(labels))))
print('Label classes: {}'.format(set(labels)))
print('Number of labels: {}'.format(len(labels)))

Downloading berlin-database-of-emotional-speech-emodb.zip to /content
 63% 24.0M/38.0M [00:00<00:00, 130MB/s] 
100% 38.0M/38.0M [00:00<00:00, 149MB/s]
Number of audio files: 535
Number of speaker classes: 10
Speaker classes: {3, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Number of speakers: 535
Number of label classes: 7
Label classes: {'T', 'W', 'L', 'A', 'E', 'N', 'F'}
Number of labels: 535


## Getting accuracy of all the models

In [None]:
# model_names = ['wav2vec', 'hubert']
# pipeline(audio_list, speakers, labels, model_names, 'EmoDB')


In [None]:
# Run the pipeline on the EmoDB data once per listed model name.
model_names = ['hybrid_byols', 'compare', 'egemaps']
pipeline(audio_list, speakers, labels, model_names, 'EmoDB')


MODEL: hybrid_byols


Generating Embeddings...: 100%|██████████| 535/535 [00:28<00:00, 18.89it/s]


embeddings_array shape: torch.Size([535, 2048])
normalised_embeddings shape: torch.Size([535, 2048])
PASSED: All means are less than 10**-6
X_train shape: torch.Size([398, 2048])
X_test shape: torch.Size([137, 2048])
y_train len: 398
y_test len: 137

Logistic Regression:
Accuracy : 0.8763512677798394
Best Parameters: {'C': 100.0, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy on test_set: 0.9182839802587701
Support Vector Machine:
Accuracy : 0.876091527520099
Best Parameters: {'C': 100.0, 'gamma': 1e-05, 'kernel': 'rbf'}
Accuracy on test_set: 0.8915566226490597
Random Forest Classifier:
Accuracy : 0.7946661024856514
Best Parameters: {'bootstrap': False, 'max_features': 'sqrt', 'n_estimators': 100}
Accuracy on test_set: 0.7447827615894843


MODEL: compare


100%|██████████| 535/535 [00:59<00:00,  9.04it/s]


embeddings_array shape: torch.Size([535, 6373])
normalised_embeddings shape: torch.Size([535, 6373])
PASSED: All means are less than 10**-6
X_train shape: torch.Size([387, 6373])
X_test shape: torch.Size([148, 6373])
y_train len: 387
y_test len: 148

Logistic Regression:
Accuracy : 0.8846693082827537
Best Parameters: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy on test_set: 0.8397108471180035
Support Vector Machine:
Accuracy : 0.8763696714116882
Best Parameters: {'C': 100.0, 'gamma': 1e-05, 'kernel': 'rbf'}
Accuracy on test_set: 0.8530255278289977
Random Forest Classifier:
Accuracy : 0.8092630991790655
Best Parameters: {'bootstrap': False, 'max_features': 'auto', 'n_estimators': 100}
Accuracy on test_set: 0.7786581381932427


MODEL: egemaps


100%|██████████| 535/535 [01:02<00:00,  8.55it/s]


embeddings_array shape: torch.Size([535, 88])
normalised_embeddings shape: torch.Size([535, 88])
PASSED: All means are less than 10**-6
X_train shape: torch.Size([381, 88])
X_test shape: torch.Size([154, 88])
y_train len: 381
y_test len: 154

Logistic Regression:
Accuracy : 0.8160259414770692
Best Parameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy on test_set: 0.7805394605394606
Support Vector Machine:
Accuracy : 0.7850852781303909
Best Parameters: {'C': 10000.0, 'gamma': 0.001, 'kernel': 'rbf'}
Accuracy on test_set: 0.7710089910089909
Random Forest Classifier:
Accuracy : 0.784940706745218
Best Parameters: {'bootstrap': False, 'max_features': 'sqrt', 'n_estimators': 100}
Accuracy on test_set: 0.7266067266067265




# Dataset: RAVDESS

In [None]:
# Phase_1
# Load dataset
! kaggle datasets download -d uwrfkaggler/ravdess-emotional-speech-audio
! unzip -q ravdess-emotional-speech-audio.zip -d '/content/ravdess'

# Select all the audio files
audios = []
for file in Path('/content/ravdess/audio_speech_actors_01-24').glob("**/*.wav"):
    if not file.is_file(): 
        continue
    audios.append(str(file))

# Load and resample audio files
audio_list = load_audio_files(audios, resampling_frequency=16000)

# Making speakers list and labels list 
speakers = []
labels = []
for audio_file in audios:
  file_name = audio_file.split('/')[5]
  speakers.append(file_name[18:20])
  labels.append(file_name[6:8])


# Verify phase_1
print('Number of audio files: {}'.format(len(audio_list)))
print('Number of speaker classes: {}'.format(len(set(speakers))))
print('Speaker classes: {}'.format(set(speakers)))
print('Number of speakers: {}'.format(len(speakers)))
print('Number of label classes: {}'.format(len(set(labels))))
print('Label classes: {}'.format(set(labels)))
print('Number of labels: {}'.format(len(labels)))

Downloading ravdess-emotional-speech-audio.zip to /content
 96% 414M/429M [00:02<00:00, 159MB/s]
100% 429M/429M [00:02<00:00, 165MB/s]
Number of audio files: 1440
Number of speaker classes: 24
Speaker classes: {'10', '22', '20', '06', '12', '23', '07', '16', '24', '21', '15', '13', '19', '11', '14', '18', '01', '05', '09', '03', '04', '17', '08', '02'}
Number of speakers: 1440
Number of label classes: 8
Label classes: {'05', '07', '03', '04', '06', '08', '01', '02'}
Number of labels: 1440


## Getting accuracy of all the models

In [None]:
# model_names = ['wav2vec', 'hubert']
# pipeline(audio_list, speakers, labels, model_names, 'RAVDESS')


In [None]:
# Run the pipeline on the RAVDESS data once per listed model name.
model_names = ['hybrid_byols', 'compare', 'egemaps']
pipeline(audio_list, speakers, labels, model_names, 'RAVDESS')


MODEL: hybrid_byols


Generating Embeddings...: 100%|██████████| 1440/1440 [01:36<00:00, 14.85it/s]


embeddings_array shape: torch.Size([1440, 2048])
normalised_embeddings shape: torch.Size([1440, 2048])
PASSED: All means are less than 10**-6
X_train shape: torch.Size([1020, 2048])
X_test shape: torch.Size([420, 2048])
y_train len: 1020
y_test len: 420

Logistic Regression:
Accuracy : 0.8034035409035409
Best Parameters: {'C': 10.0, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy on test_set: 0.7544642857142857
Support Vector Machine:
Accuracy : 0.8018594831094831
Best Parameters: {'C': 100.0, 'gamma': 1e-05, 'kernel': 'rbf'}
Accuracy on test_set: 0.7678571428571429
Random Forest Classifier:
Accuracy : 0.6595924908424908
Best Parameters: {'bootstrap': False, 'max_features': 'sqrt', 'n_estimators': 200}
Accuracy on test_set: 0.6875


MODEL: compare


100%|██████████| 1440/1440 [02:59<00:00,  8.02it/s]


embeddings_array shape: torch.Size([1440, 6373])
normalised_embeddings shape: torch.Size([1440, 6373])
PASSED: All means are less than 10**-6
X_train shape: torch.Size([1020, 6373])
X_test shape: torch.Size([420, 6373])
y_train len: 1020
y_test len: 420

Logistic Regression:
Accuracy : 0.6754070004070003
Best Parameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy on test_set: 0.6629464285714286
Support Vector Machine:
Accuracy : 0.6596891534391534
Best Parameters: {'C': 100.0, 'gamma': 1e-05, 'kernel': 'rbf'}
Accuracy on test_set: 0.6495535714285714
Random Forest Classifier:
Accuracy : 0.5954772079772079
Best Parameters: {'bootstrap': False, 'max_features': 'auto', 'n_estimators': 200}
Accuracy on test_set: 0.5825892857142858


MODEL: egemaps


100%|██████████| 1440/1440 [03:22<00:00,  7.12it/s]


embeddings_array shape: torch.Size([1440, 88])
normalised_embeddings shape: torch.Size([1440, 88])
PASSED: All means are less than 10**-6
X_train shape: torch.Size([1020, 88])
X_test shape: torch.Size([420, 88])
y_train len: 1020
y_test len: 420

Logistic Regression:
Accuracy : 0.6084554334554334
Best Parameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy on test_set: 0.5602678571428571
Support Vector Machine:
Accuracy : 0.5886345136345137
Best Parameters: {'C': 100.0, 'gamma': 0.001, 'kernel': 'rbf'}
Accuracy on test_set: 0.5424107142857143
Random Forest Classifier:
Accuracy : 0.5794210419210419
Best Parameters: {'bootstrap': False, 'max_features': 'sqrt', 'n_estimators': 100}
Accuracy on test_set: 0.53125




# Dataset: Crowd Sourced Emotional Multimodal Actors Dataset (CREMA-D)

In [None]:
# Phase_1
# Load dataset
! kaggle datasets download -d ejlok1/cremad
! unzip -q cremad.zip

# Load and resample audio files
audio_files = glob(os.path.join('/content/AudioWAV','*.wav'))
audio_list = load_audio_files(audio_files, resampling_frequency=16000)

# Make speakers list and labels list 
speakers = []
labels = []
for audio_file in audio_files:
  file_name = audio_file.split('/')[3]
  speakers.append(int(file_name[:4]))
  labels.append(file_name[9:12])


# Verify phase_1
print('Number of audio files: {}'.format(len(audio_list)))
print('Number of speaker classes: {}'.format(len(set(speakers))))
print('Speaker classes: {}'.format(set(speakers)))
print('Number of speakers: {}'.format(len(speakers)))
print('Number of label classes: {}'.format(len(set(labels))))
print('Label classes: {}'.format(set(labels)))
print('Number of labels: {}'.format(len(labels)))

Downloading cremad.zip to /content
100% 451M/451M [00:18<00:00, 23.4MB/s]
100% 451M/451M [00:18<00:00, 25.6MB/s]
Number of audio files: 7442
Number of speaker classes: 91
Speaker classes: {1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 1074, 1075, 1076, 1077, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1091, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023}
Number of speakers: 7442
Number of label classes: 6
Label classes: {'NEU', 'ANG', 'DIS', 'FEA', 'SAD', 'HAP'}
Number of labels: 7442


## Getting accuracy of all the models

In [None]:
# model_names = ['wav2vec', 'hubert']
# pipeline(audio_list, speakers, labels, model_names, 'CREMA-D')


In [None]:
# Run the pipeline on the CREMA-D data for hybrid_byols only; the other
# models are run for this dataset in the following cells.
model_names = ['hybrid_byols']
pipeline(audio_list, speakers, labels, model_names, 'CREMA-D')


MODEL: hybrid_byols


Generating Embeddings...: 100%|██████████| 7442/7442 [05:44<00:00, 21.60it/s]


embeddings_array shape: torch.Size([7442, 2048])
normalised_embeddings shape: torch.Size([7442, 2048])
PASSED: All means are less than 10**-6
X_train shape: torch.Size([5235, 2048])
X_test shape: torch.Size([2207, 2048])
y_train len: 5235
y_test len: 2207

Logistic Regression:
Accuracy : 0.7349894609501295
Best Parameters: {'C': 0.001, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy on test_set: 0.7621230593494462
Support Vector Machine:
Accuracy : 0.7175696211152425
Best Parameters: {'C': 100.0, 'gamma': 1e-05, 'kernel': 'rbf'}
Accuracy on test_set: 0.7275318384763662
Random Forest Classifier:
Accuracy : 0.6261708205415812
Best Parameters: {'bootstrap': False, 'max_features': 'sqrt', 'n_estimators': 200}
Accuracy on test_set: 0.6505112828201284




In [None]:
# Run the pipeline on the CREMA-D data for egemaps only.
model_names = ['egemaps']
pipeline(audio_list, speakers, labels, model_names, 'CREMA-D')

MODEL: egemaps


100%|██████████| 7442/7442 [12:35<00:00,  9.85it/s]


embeddings_array shape: torch.Size([7442, 88])
normalised_embeddings shape: torch.Size([7442, 88])
PASSED: All means are less than 10**-6
X_train shape: torch.Size([5241, 88])
X_test shape: torch.Size([2201, 88])
y_train len: 5241
y_test len: 2201

Logistic Regression:
Accuracy : 0.6142979092206828
Best Parameters: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy on test_set: 0.6009934049181415
Support Vector Machine:
Accuracy : 0.6245229551670562
Best Parameters: {'C': 100.0, 'gamma': 0.001, 'kernel': 'rbf'}
Accuracy on test_set: 0.6086048142992864
Random Forest Classifier:
Accuracy : 0.5926097429547971
Best Parameters: {'bootstrap': False, 'max_features': 'log2', 'n_estimators': 200}
Accuracy on test_set: 0.5664189368330351




In [None]:
# Run the pipeline on the CREMA-D data for compare only.
model_names = ['compare']
pipeline(audio_list, speakers, labels, model_names, 'CREMA-D')

MODEL: compare


100%|██████████| 7442/7442 [12:13<00:00, 10.14it/s]


embeddings_array shape: torch.Size([7442, 6373])
normalised_embeddings shape: torch.Size([7442, 6373])
PASSED: All means are less than 10**-6
X_train shape: torch.Size([5235, 6373])
X_test shape: torch.Size([2207, 6373])
y_train len: 5235
y_test len: 2207

Logistic Regression:
Accuracy : 0.6805116194853924
Best Parameters: {'C': 0.001, 'penalty': 'l2', 'solver': 'sag'}
Accuracy on test_set: 0.71636159942007
Support Vector Machine:
Accuracy : 0.6576467155676393
Best Parameters: {'C': 100.0, 'gamma': 1e-05, 'kernel': 'rbf'}
Accuracy on test_set: 0.6872044746857341
Random Forest Classifier:
Accuracy : 0.6136913013623977
Best Parameters: {'bootstrap': False, 'max_features': 'auto', 'n_estimators': 200}
Accuracy on test_set: 0.632019155257536




# Dataset: SAVEE 

In [None]:
# Phase_1
# Load dataset
! kaggle datasets download -d barelydedicated/savee-database
! unzip -q savee-database.zip 

# Select all the audio files
audios = []
for file in Path('/content/AudioData').glob("**/*.wav"):
    if not file.is_file(): 
        continue
    audios.append(str(file))

# Load and resample audio files
audio_list = load_audio_files(audios, resampling_frequency=16000)

# Making speakers list and labels list 
speakers = []
labels = []
for audio_file in audios:
  file_name = audio_file.split('/')[4]
  speakers.append(audio_file.split('/')[3])
  labels.append(file_name[0])


# Verify phase_1
print('Number of audio files: {}'.format(len(audio_list)))
print('Number of speaker classes: {}'.format(len(set(speakers))))
print('Speaker classes: {}'.format(set(speakers)))
print('Number of speakers: {}'.format(len(speakers)))
print('Number of label classes: {}'.format(len(set(labels))))
print('Label classes: {}'.format(set(labels)))
print('Number of labels: {}'.format(len(labels)))

Downloading savee-database.zip to /content
 90% 193M/215M [00:07<00:00, 24.1MB/s]
100% 215M/215M [00:07<00:00, 28.3MB/s]
Number of audio files: 480
Number of speaker classes: 4
Speaker classes: {'DC', 'JK', 'KL', 'JE'}
Number of speakers: 480
Number of label classes: 6
Label classes: {'h', 's', 'n', 'd', 'a', 'f'}
Number of labels: 480


## Getting accuracy of all the models

In [None]:
# model_names = ['wav2vec', 'hubert']
# pipeline(audio_list, speakers, labels, model_names, 'SAVEE')


In [None]:
# Run the pipeline on the SAVEE data once per listed model name.
model_names = ['hybrid_byols', 'compare', 'egemaps']
pipeline(audio_list, speakers, labels, model_names, 'SAVEE')


MODEL: hybrid_byols


Generating Embeddings...: 100%|██████████| 480/480 [00:32<00:00, 14.71it/s]


embeddings_array shape: torch.Size([480, 2048])
normalised_embeddings shape: torch.Size([480, 2048])
PASSED: All means are less than 10**-6
X_train shape: torch.Size([360, 2048])
X_test shape: torch.Size([120, 2048])
y_train len: 360
y_test len: 120

Logistic Regression:
Accuracy : 0.8296296296296296
Best Parameters: {'C': 0.01, 'penalty': 'l2', 'solver': 'sag'}
Accuracy on test_set: 0.4888888888888889
Support Vector Machine:
Accuracy : 0.8481481481481481
Best Parameters: {'C': 100.0, 'gamma': 1e-05, 'kernel': 'rbf'}
Accuracy on test_set: 0.4388888888888889
Random Forest Classifier:
Accuracy : 0.7351851851851852
Best Parameters: {'bootstrap': False, 'max_features': 'auto', 'n_estimators': 200}
Accuracy on test_set: 0.48888888888888876


MODEL: compare


100%|██████████| 480/480 [01:01<00:00,  7.80it/s]


embeddings_array shape: torch.Size([480, 6373])
normalised_embeddings shape: torch.Size([480, 6373])
PASSED: All means are less than 10**-6
X_train shape: torch.Size([360, 6373])
X_test shape: torch.Size([120, 6373])
y_train len: 360
y_test len: 120

Logistic Regression:
Accuracy : 0.6203703703703705
Best Parameters: {'C': 1.0, 'penalty': 'l2', 'solver': 'sag'}
Accuracy on test_set: 0.6166666666666666
Support Vector Machine:
Accuracy : 0.5907407407407407
Best Parameters: {'C': 100.0, 'gamma': 1e-05, 'kernel': 'rbf'}
Accuracy on test_set: 0.6111111111111112
Random Forest Classifier:
Accuracy : 0.6148148148148149
Best Parameters: {'bootstrap': False, 'max_features': 'sqrt', 'n_estimators': 100}
Accuracy on test_set: 0.5


MODEL: egemaps


100%|██████████| 480/480 [01:08<00:00,  6.96it/s]


embeddings_array shape: torch.Size([480, 88])
normalised_embeddings shape: torch.Size([480, 88])
PASSED: All means are less than 10**-6
X_train shape: torch.Size([360, 88])
X_test shape: torch.Size([120, 88])
y_train len: 360
y_test len: 120

Logistic Regression:
Accuracy : 0.65
Best Parameters: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy on test_set: 0.39444444444444443
Support Vector Machine:
Accuracy : 0.6518518518518518
Best Parameters: {'C': 100.0, 'gamma': 0.001, 'kernel': 'rbf'}
Accuracy on test_set: 0.4777777777777778
Random Forest Classifier:
Accuracy : 0.7055555555555555
Best Parameters: {'bootstrap': False, 'max_features': 'auto', 'n_estimators': 50}
Accuracy on test_set: 0.5944444444444444




# Results

In [2]:
# Test-set accuracies of the logistic-regression runs, copied from the outputs
# above: one column per dataset, one row per embedding model.
# The wav2vec and hubert rows hold 0 as a placeholder; their runs are
# commented out in this notebook.
_lr_model_order = ('wav2vec', 'hubert', 'hybrid_byols', 'compare', 'egemaps')
_lr_scores_by_dataset = (
    ('EmoDB', (0, 0, 0.9182839802587701, 0.8397108471180035, 0.7805394605394606)),
    ('CaFE', (0, 0, 0.8452380952380952, 0.6547619047619048, 0.5833333333333334)),
    ('ShEMO', (0, 0, 0.5406500133083051, 0.6126867592495242, 0.5025268582186633)),
    ('CREMA-D', (0, 0, 0.7621230593494462, 0.71636159942007, 0.6009934049181415)),
    ('RAVDESS', (0, 0, 0.7544642857142857, 0.6629464285714286, 0.5602678571428571)),
    ('SAVEE', (0, 0, 0.4888888888888889, 0.6166666666666666, 0.39444444444444443)),
)
logistic_regression_results = {
    dataset_name: dict(zip(_lr_model_order, scores))
    for dataset_name, scores in _lr_scores_by_dataset
}

lr_data = pd.DataFrame(logistic_regression_results)
lr_data


Unnamed: 0,EmoDB,CaFE,ShEMO,CREMA-D,RAVDESS,SAVEE
wav2vec,0.0,0.0,0.0,0.0,0.0,0.0
hubert,0.0,0.0,0.0,0.0,0.0,0.0
hybrid_byols,0.918284,0.845238,0.54065,0.762123,0.754464,0.488889
compare,0.839711,0.654762,0.612687,0.716362,0.662946,0.616667
egemaps,0.780539,0.583333,0.502527,0.600993,0.560268,0.394444


In [3]:
# Test-set accuracies of the support-vector-machine runs, copied from the
# outputs above: one column per dataset, one row per embedding model.
# The wav2vec and hubert rows hold 0 as a placeholder; their runs are
# commented out in this notebook.
_svm_model_order = ('wav2vec', 'hubert', 'hybrid_byols', 'compare', 'egemaps')
_svm_scores_by_dataset = (
    ('EmoDB', (0, 0, 0.8915566226490597, 0.8530255278289977, 0.7710089910089909)),
    ('CaFE', (0, 0, 0.8511904761904763, 0.6369047619047619, 0.5595238095238095)),
    ('ShEMO', (0, 0, 0.5088360425600119, 0.6144201896699778, 0.45075655053032015)),
    ('CREMA-D', (0, 0, 0.7275318384763662, 0.6872044746857341, 0.6086048142992864)),
    ('RAVDESS', (0, 0, 0.7678571428571429, 0.6495535714285714, 0.5424107142857143)),
    ('SAVEE', (0, 0, 0.4388888888888889, 0.6111111111111112, 0.4777777777777778)),
)
support_vector_machine_results = {
    dataset_name: dict(zip(_svm_model_order, scores))
    for dataset_name, scores in _svm_scores_by_dataset
}

svm_data = pd.DataFrame(support_vector_machine_results)
svm_data


Unnamed: 0,EmoDB,CaFE,ShEMO,CREMA-D,RAVDESS,SAVEE
wav2vec,0.0,0.0,0.0,0.0,0.0,0.0
hubert,0.0,0.0,0.0,0.0,0.0,0.0
hybrid_byols,0.891557,0.85119,0.508836,0.727532,0.767857,0.438889
compare,0.853026,0.636905,0.61442,0.687204,0.649554,0.611111
egemaps,0.771009,0.559524,0.450757,0.608605,0.542411,0.477778


In [4]:
# Test-set accuracies of the random-forest runs, copied from the outputs
# above: one column per dataset, one row per embedding model.
# The wav2vec and hubert rows hold 0 as a placeholder; their runs are
# commented out in this notebook.
_rfc_model_order = ('wav2vec', 'hubert', 'hybrid_byols', 'compare', 'egemaps')
_rfc_scores_by_dataset = (
    ('EmoDB', (0, 0, 0.7447827615894843, 0.7786581381932427, 0.7266067266067265)),
    ('CaFE', (0, 0, 0.7857142857142857, 0.6071428571428571, 0.5773809523809524)),
    ('ShEMO', (0, 0, 0.3835617820821429, 0.4379319816774401, 0.4300003123614043)),
    ('CREMA-D', (0, 0, 0.6505112828201284, 0.632019155257536, 0.5664189368330351)),
    ('RAVDESS', (0, 0, 0.6875, 0.5825892857142858, 0.53125)),
    ('SAVEE', (0, 0, 0.48888888888888876, 0.5, 0.5944444444444444)),
)
random_forest_classifier_results = {
    dataset_name: dict(zip(_rfc_model_order, scores))
    for dataset_name, scores in _rfc_scores_by_dataset
}

rfc_data = pd.DataFrame(random_forest_classifier_results)
rfc_data


Unnamed: 0,EmoDB,CaFE,ShEMO,CREMA-D,RAVDESS,SAVEE
wav2vec,0.0,0.0,0.0,0.0,0.0,0.0
hubert,0.0,0.0,0.0,0.0,0.0,0.0
hybrid_byols,0.744783,0.785714,0.383562,0.650511,0.6875,0.488889
compare,0.778658,0.607143,0.437932,0.632019,0.582589,0.5
egemaps,0.726607,0.577381,0.43,0.566419,0.53125,0.594444
