# Phase 3

## Importing packages

In [3]:
import numpy as np
import soundfile as sf
import librosa
import os
from glob import glob
import torch

In [2]:
! pip install -q kaggle

from google.colab import files
files.upload()

# Name directory
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [3]:
# CREMA-D Dataset

! kaggle datasets download -d ejlok1/cremad
! unzip cremad.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: AudioWAV/1031_DFA_DIS_XX.wav  
  inflating: AudioWAV/1031_DFA_FEA_XX.wav  
  inflating: AudioWAV/1031_DFA_HAP_XX.wav  
  inflating: AudioWAV/1031_DFA_NEU_XX.wav  
  inflating: AudioWAV/1031_DFA_SAD_XX.wav  
  inflating: AudioWAV/1031_IEO_ANG_HI.wav  
  inflating: AudioWAV/1031_IEO_ANG_LO.wav  
  inflating: AudioWAV/1031_IEO_ANG_MD.wav  
  inflating: AudioWAV/1031_IEO_DIS_HI.wav  
  inflating: AudioWAV/1031_IEO_DIS_LO.wav  
  inflating: AudioWAV/1031_IEO_DIS_MD.wav  
  inflating: AudioWAV/1031_IEO_FEA_HI.wav  
  inflating: AudioWAV/1031_IEO_FEA_LO.wav  
  inflating: AudioWAV/1031_IEO_FEA_MD.wav  
  inflating: AudioWAV/1031_IEO_HAP_HI.wav  
  inflating: AudioWAV/1031_IEO_HAP_LO.wav  
  inflating: AudioWAV/1031_IEO_HAP_MD.wav  
  inflating: AudioWAV/1031_IEO_NEU_XX.wav  
  inflating: AudioWAV/1031_IEO_SAD_HI.wav  
  inflating: AudioWAV/1031_IEO_SAD_LO.wav  
  inflating: AudioWAV/1031_IEO_SAD_MD.wav  
  inflating

# Phase 1: Loading and resampling audio files

In [4]:
# Defining function for loading and resampling audio files

# Processing audio files
audio_files_crema_d = glob(os.path.join('/content/AudioWAV','*.wav'))

def load_audio_files(audio_files, resampling_frequency=16000, audio_list=None):
    '''
    Loads and resamples audio files 
    
    Parameters
    ------------
    audio_files: string
        The paths of the wav files 
    resampling_frequency: integer
        The frequency which all audios will be resampled to
    audio_list: list of torch arrays of audios to which more audios need too be added, empty by default

    Returns
    ------------
    audio_list: list of torch arrays
        A list of torch arrays, one array for each audio file
        
    '''

    # Making audio_list
    if audio_list is None:
      audio_list = []

    # Resampling
    for audio in audio_files:
        signal, fs = librosa.load(audio, sr=resampling_frequency)
        audio_list.append(torch.from_numpy(signal))
        
    return audio_list
        

In [34]:

audio_files_crema_d = glob(os.path.join('/content/AudioWAV','*.wav'))

# Using load_audio_files function
audio_list_crema_d = load_audio_files(audio_files_crema_d, resampling_frequency=16000)
print('number of audio files: {}'.format(len(audio_list_crema_d)))
print(audio_list_crema_d[0].shape)

number of audio files: 7442
torch.Size([43777])


# Phase 2: Embedding Extraction

In [1]:
!pip install speechbrain
!pip install transformers
!git clone https://github.com/GasserElbanna/serab-byols.git
!python3 -m pip install -e ./serab-byols

!pip install tqdm==4.60.0
!pip install opensmile


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
fatal: destination path 'serab-byols' already exists and is not an empty directory.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Obtaining file:///content/serab-byols
Installing collected packages: serab-byols
  Attempting uninstall: serab-byols
    Found existing installation: serab-byols 0.0.0
    Can't uninstall 'serab-byols'. No files were found to uninstall.
  Running setup.py develop for serab-byols
Successfully installed serab-byols-0.0.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [30]:
from tqdm import tqdm
import serab_byols
import opensmile
from transformers import Wav2Vec2Model, HubertModel
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
from sklearn.model_selection import train_test_split

## Defining the functions

In [5]:
def audio_embeddings_model(model_name):
  '''
  Generates model for embedding extraction 
  
  Parameters
  ------------
  mode_name: string
      The model to used, could be 'wav2vec', 'hubert' or 'hybrid_byols'

  Returns
  ------------
  model: object

  '''
  if model_name=='wav2vec2':
    model_hub = 'facebook/wav2vec2-large-960h-lv60-self'
    model = Wav2Vec2Model.from_pretrained(model_hub)
  elif model_name=='hubert':
    model_hub = 'facebook/hubert-xlarge-ll60k'
    model = HubertModel.from_pretrained(model_hub)
  elif model_name=='hybrid_byols':
    model_name = 'cvt'
    checkpoint_path = "serab-byols/checkpoints/cvt_s1-d1-e64_s2-d1-e256_s3-d1-e512_BYOLAs64x96-osandbyolaloss6373-e100-bs256-lr0003-rs42.pth"
    model = serab_byols.load_model(checkpoint_path, model_name)
  elif model_name=='compare':
    model = opensmile.Smile(
        feature_set=opensmile.FeatureSet.ComParE_2016,
        feature_level=opensmile.FeatureLevel.Functionals,
    )
  elif model_name=='egemaps':
    model = opensmile.Smile(
        feature_set=opensmile.FeatureSet.eGeMAPSv02,
        feature_level=opensmile.FeatureLevel.Functionals,
    )
  return model


def audio_embeddings(audio_list, model_name, model, sampling_rate=16000):
  '''
  Loads and resamples audio files 
  
  Parameters
  ------------
  audio_list: list of arrays
      A list of arrays, one array for each audio file
  model_name: string
      The model to used, could be 'wav2vec', 'hubert' or 'hybrid_byols'
  model: object
      The model generated by audio_embeddings_model function
  n_feats: int
      The number of features of each audio file, 6373 for 'compare' and 88 for 'egemaps'

  Returns
  ------------
  embeddings_array: torch array
      The array containg embeddings of all audio_files, dimension (number of audio files × n_feats)
      
  '''
  if model_name=='hybrid_byols':
    embeddings_array = serab_byols.get_scene_embeddings(audio_list, model)
  else:
    embeddings_list = []
    for i in tqdm(range(len(audio_list))):
      if model_name=='wav2vec2' or model_name=='hubert':
        embeddings = model(audio_list[i].reshape(1,-1)).last_hidden_state.mean(1)
        embeddings_list.append(embeddings[0])
      elif model_name=='compare' or model_name=='egemaps':
        embeddings = model.process_signal(audio_list[i], sampling_rate)
        embeddings_list.append(torch.tensor(embeddings.values[0], dtype=torch.float32))
    embeddings_array = torch.stack(embeddings_list)
  return embeddings_array


## 1. Wav2vec 2.0

In [35]:
model = audio_embeddings_model(model_name='wav2vec2')


Some weights of the model checkpoint at facebook/wav2vec2-large-960h-lv60-self were not used when initializing Wav2Vec2Model: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
embeddings_array_wav2vec = audio_embeddings(audio_list_crema_d[:2], model_name='wav2vec2', model=model)
print(embeddings_array_wav2vec.shape)

100%|██████████| 2/2 [00:07<00:00,  3.76s/it]

torch.Size([2, 1024])





## 2. Hubert

In [11]:
model = audio_embeddings_model(model_name='hubert')


Downloading:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

In [12]:
embeddings_array_hubert = audio_embeddings(audio_list_crema_d[:2], model_name='hubert', model=model)


100%|██████████| 2/2 [00:14<00:00,  7.05s/it]


In [13]:
print(embeddings_array_hubert.shape)
print(embeddings_array_hubert)

torch.Size([2, 1280])
tensor([[ 0.0828, -0.0093,  0.0158,  ...,  0.0499, -0.0047, -0.0357],
        [ 0.0589,  0.0189,  0.0697,  ..., -0.1487, -0.0101, -0.0034]],
       grad_fn=<StackBackward0>)


## 3. Hybrid BYOL-S

In [36]:
model = audio_embeddings_model(model_name='hybrid_byols')


In [37]:
embeddings_array_byols = audio_embeddings(audio_list_crema_d, model_name='hybrid_byols', model=model)

Generating Embeddings...: 100%|██████████| 7442/7442 [05:02<00:00, 24.62it/s]


In [38]:
print(embeddings_array_byols.shape)
print(embeddings_array_byols)

torch.Size([7442, 2048])
tensor([[ 4.6005,  3.3193,  1.6502,  ...,  3.0076,  0.7459,  3.8370],
        [ 5.1524,  3.6945,  1.3245,  ...,  4.2870, -0.1191,  5.4642],
        [ 3.5540,  3.3808,  1.4869,  ...,  3.0218, -1.3578,  3.7170],
        ...,
        [ 5.3331,  3.4665,  1.4162,  ...,  2.9429, -1.9563,  4.2440],
        [ 6.1522,  3.4810,  2.0665,  ...,  2.4415,  0.8819,  3.7001],
        [ 2.7747,  2.9199,  1.9036,  ...,  2.9964, -0.0655,  4.2349]])


## 4. openSMILE: ComParE_2016

In [39]:
model = audio_embeddings_model(model_name='compare')
embeddings_array_compare = audio_embeddings(audio_list_crema_d, model_name='compare', model=model)
print(embeddings_array_compare.shape)

100%|██████████| 7442/7442 [11:40<00:00, 10.62it/s]


torch.Size([7442, 6373])


In [40]:
print(embeddings_array_compare)

tensor([[1.2293e+00, 3.3835e-01, 1.8797e-02,  ..., 5.5680e+01, 1.1914e+02,
         6.0391e+01],
        [3.5037e+00, 4.5392e-01, 6.8259e-03,  ..., 6.2398e+01, 1.1730e+02,
         6.2005e+01],
        [5.2160e-01, 2.8049e-01, 4.0650e-03,  ..., 3.7978e+01, 8.8560e+01,
         4.5760e+01],
        ...,
        [1.1129e+00, 4.2487e-01, 2.8497e-01,  ..., 3.7853e+01, 9.3689e+01,
         7.5141e+01],
        [8.0568e-01, 5.7812e-01, 1.2500e-01,  ..., 4.4861e+01, 1.0203e+02,
         4.5219e+01],
        [9.5088e-01, 3.9331e-01, 7.1130e-02,  ..., 5.4985e+01, 1.1222e+02,
         4.8933e+01]])


## 5. openSMILE: eGeMAPSv02

In [41]:
model = audio_embeddings_model(model_name='egemaps')
embeddings_array_egemaps = audio_embeddings(audio_list_crema_d, model_name='egemaps', model=model)
print(embeddings_array_egemaps.shape)

100%|██████████| 7442/7442 [12:08<00:00, 10.22it/s]

torch.Size([7442, 88])





In [42]:
print(embeddings_array_egemaps)

tensor([[ 2.6969e+01,  1.7750e-01,  2.0970e+01,  ...,  4.1600e-01,
          3.8624e-01, -2.9704e+01],
        [ 2.7472e+01,  2.2636e-01,  2.4899e+01,  ...,  2.0400e-01,
          1.5292e-01, -2.3412e+01],
        [ 3.3227e+01,  2.5948e-02,  3.2583e+01,  ...,  7.4333e-01,
          2.8312e-01, -3.7683e+01],
        ...,
        [ 2.7446e+01,  7.9522e-02,  2.5437e+01,  ...,  5.9000e-01,
          3.7417e-02, -2.9595e+01],
        [ 3.7473e+01,  1.1554e-01,  3.3526e+01,  ...,  3.9250e-01,
          3.8906e-01, -3.1436e+01],
        [ 3.3596e+01,  5.2283e-02,  3.2301e+01,  ...,  2.8333e-01,
          2.5882e-01, -3.5557e+01]])


<B>Problems:</B>
- Colab crashes when using hubert or wav2vec for all 200 audio files, works well for a list of 20 audio files for wav2vec and a list of 5 audio files for hubert

# Phase 3: Downstream Task - Speech Emotion Recognotion

## Extracting labels and speaker ID

In [43]:
speakers = []
labels = []

for audio_file in audio_files_crema_d:
  file_name = audio_file.split('/')[3]
  segments = file_name.split('_')
  speakers.append(int(segments[0]))
  labels.append(segments[2])

print(speakers)
print(labels)

[1070, 1086, 1047, 1024, 1013, 1018, 1033, 1087, 1090, 1026, 1048, 1020, 1070, 1080, 1074, 1011, 1048, 1033, 1005, 1089, 1060, 1005, 1004, 1023, 1065, 1073, 1090, 1080, 1035, 1019, 1012, 1060, 1043, 1005, 1081, 1081, 1063, 1055, 1033, 1091, 1032, 1016, 1067, 1061, 1034, 1008, 1071, 1017, 1029, 1076, 1083, 1039, 1063, 1078, 1077, 1069, 1021, 1081, 1085, 1061, 1002, 1054, 1055, 1055, 1057, 1085, 1022, 1068, 1036, 1078, 1050, 1089, 1054, 1078, 1018, 1087, 1078, 1006, 1075, 1026, 1007, 1084, 1048, 1005, 1043, 1065, 1037, 1018, 1021, 1032, 1062, 1080, 1091, 1029, 1024, 1076, 1030, 1076, 1077, 1013, 1011, 1068, 1048, 1019, 1069, 1022, 1014, 1069, 1079, 1011, 1041, 1006, 1027, 1076, 1003, 1082, 1080, 1025, 1060, 1033, 1073, 1056, 1053, 1051, 1047, 1064, 1064, 1013, 1082, 1025, 1058, 1065, 1041, 1019, 1046, 1028, 1033, 1009, 1055, 1046, 1071, 1079, 1009, 1058, 1064, 1052, 1023, 1002, 1039, 1015, 1088, 1020, 1074, 1071, 1053, 1089, 1085, 1090, 1057, 1079, 1011, 1006, 1084, 1063, 1075, 1047, 105

## Speaker normalisation

In [44]:

def speaker_normalisation(embeddings_array, speakers):
  '''
  Normalises embeddings_array for each speaker
  
  Parameters
  ------------
  embeddings_array: torch tensor
      The tensor of embeddings, one row for each audio file
  speakers: list of integers
      The list of speakers

  Returns
  ------------
  embeddings_array: torch tensor
      The tensor containg normalised embeddings 
      
  '''
  speaker_ids = set(speakers)
  for speaker_id in speaker_ids:
    speaker_embeddings_indices = np.where(np.array(speakers)==speaker_id)[0]
    speaker_embeddings = embeddings_array[speaker_embeddings_indices,:]
    normalised_speaker_embeddings = scaler.fit_transform(speaker_embeddings)
    embeddings_array[speaker_embeddings_indices] = torch.tensor(normalised_speaker_embeddings).float()
  return embeddings_array


## Example 1: wav2vec 2.0

In [45]:
# normalised_embeddings_wav2vec = speaker_normalisation(embeddings_array_wav2vec, speakers)
 
# print('The shape of the normalised embeddings array is: {}'.format(normalised_embeddings_wav2vec.shape))
# print('Normalised Embeddings Array:')
# print((normalised_embeddings_wav2vec))
# print()
# columnwise_mean = torch.mean(speaker_normalisation(embeddings_array_wav2vec, speakers), 0)
# print('Columnwise_mean:')
# print(columnwise_mean)
# if torch.all(columnwise_mean < 10**(-6)):
#   print('All means are less than 10**-6')
# else:
#   print('All means are NOT less than 10**-6')

## Example 2: Hubert

In [46]:
# normalised_embeddings_hubert = speaker_normalisation(embeddings_array_hubert, speakers)
 
# print('The shape of the normalised embeddings array is: {}'.format(normalised_embeddings_hubert.shape))
# print('Normalised Embeddings Array:')
# print((normalised_embeddings_hubert))
# print()
# columnwise_mean = torch.mean(speaker_normalisation(embeddings_array_hubert, speakers), 0)
# print('Columnwise_mean:')
# print(columnwise_mean)
# if torch.all(columnwise_mean < 10**(-6)):
#   print('All means are less than 10**-6')
# else:
#   print('All means are NOT less than 10**-6')

## Example 3: Hybrid BYOLS

In [47]:
normalised_embeddings_byols = speaker_normalisation(embeddings_array_byols, speakers)
 
print('The shape of the normalised embeddings array is: {}'.format(normalised_embeddings_byols.shape))
print('Normalised Embeddings Array:')
print((normalised_embeddings_byols))
print()
columnwise_mean = torch.mean(speaker_normalisation(embeddings_array_byols, speakers), 0)
print('Columnwise_mean:')
print(columnwise_mean)
if torch.all(columnwise_mean < 10**(-6)):
  print('All means are less than 10**-6')
else:
  print('All means are NOT less than 10**-6')

The shape of the normalised embeddings array is: torch.Size([7442, 2048])
Normalised Embeddings Array:
tensor([[ 0.7446, -0.5880,  0.9038,  ..., -0.7028,  0.6437, -0.9952],
        [ 0.3341,  0.5120,  0.2099,  ...,  2.2052,  0.3882,  2.4736],
        [-1.4732, -0.4100, -0.2244,  ..., -0.4470, -0.5203, -0.7952],
        ...,
        [ 1.2575,  0.5924,  0.2838,  ..., -0.6039, -0.3173, -0.3609],
        [ 1.7047,  0.5935,  1.0780,  ..., -0.8513,  1.5532, -0.3280],
        [-1.1692, -1.5588, -0.0137,  ..., -0.1902,  0.2838,  0.7877]])

Columnwise_mean:
tensor([ 2.5630e-10,  6.4074e-10,  1.0252e-09,  ...,  2.0504e-09,
        -5.7666e-10,  2.0504e-09])
All means are less than 10**-6


## Example 4: openSMILE compare

In [48]:
normalised_embeddings_compare = speaker_normalisation(embeddings_array_compare, speakers)
 
print('The shape of the normalised embeddings array is: {}'.format(normalised_embeddings_compare.shape))
print('Normalised Embeddings Array:')
print((normalised_embeddings_compare))
print()
columnwise_mean = torch.mean(speaker_normalisation(embeddings_array_compare, speakers), 0)
print('Columnwise_mean:')
print(columnwise_mean)
if torch.all(columnwise_mean < 10**(-6)):
  print('All means are less than 10**-6')
else:
  print('All means are NOT less than 10**-6')

The shape of the normalised embeddings array is: torch.Size([7442, 6373])
Normalised Embeddings Array:
tensor([[-0.4050, -0.0596, -0.8373,  ...,  0.1758,  1.2240,  0.7034],
        [ 2.7049,  0.3318, -1.0071,  ...,  1.6309,  1.4679,  1.3748],
        [-0.6486, -0.7046, -0.9542,  ..., -1.3291, -1.0214, -0.5734],
        ...,
        [ 0.3511, -0.1359, -0.1063,  ..., -1.3793, -0.3577,  3.2220],
        [ 0.2447,  1.3583, -0.4678,  ..., -0.3359,  0.2238, -0.4654],
        [-0.1443, -0.3259, -0.5513,  ...,  0.1735,  0.8594, -0.3413]])

Columnwise_mean:
tensor([ 3.0755e-09,  1.5378e-09, -7.6889e-10,  ..., -1.2815e-10,
         0.0000e+00, -1.0252e-09])
All means are less than 10**-6


## Example 5: openSMILE egemaps

In [49]:
normalised_embeddings_egemaps = speaker_normalisation(embeddings_array_egemaps, speakers)
 
print('The shape of the normalised embeddings array is: {}'.format(normalised_embeddings_egemaps.shape))
print('Normalised Embeddings Array:')
print((normalised_embeddings_egemaps))
print()
columnwise_mean = torch.mean(speaker_normalisation(embeddings_array_egemaps, speakers), 0)
print('Columnwise_mean:')
print(columnwise_mean)
if torch.all(columnwise_mean < 10**(-6)):
  print('All means are less than 10**-6')
else:
  print('All means are NOT less than 10**-6')

The shape of the normalised embeddings array is: torch.Size([7442, 88])
Normalised Embeddings Array:
tensor([[-0.6591,  2.0093, -1.4414,  ...,  0.3270,  0.5491, -0.4299],
        [ 0.2242,  0.1654,  0.4036,  ..., -0.4577, -0.5073,  1.4695],
        [-1.0651, -0.9953, -0.8824,  ...,  2.2602,  0.3693, -0.7927],
        ...,
        [ 1.1003, -0.9123,  1.2628,  ...,  0.6397, -1.6709,  0.8557],
        [ 0.8310,  0.9314,  0.3174,  ..., -0.2864,  0.1737,  0.9185],
        [-0.7987, -0.2932, -0.8332,  ..., -0.3181, -0.0755, -0.5659]])

Columnwise_mean:
tensor([-1.0252e-09,  2.5630e-09,  1.0252e-09, -4.1007e-09,  2.0504e-09,
        -1.0252e-09, -1.5378e-09,  6.7277e-10,  0.0000e+00,  1.5378e-09,
         2.0504e-09, -4.1007e-09, -3.9726e-09, -2.0504e-09,  0.0000e+00,
        -3.0755e-09,  0.0000e+00, -1.0252e-09,  1.0252e-09,  0.0000e+00,
         1.0252e-09,  5.1259e-10,  1.0252e-09, -5.1259e-10, -1.0252e-09,
        -1.5378e-09,  1.0252e-09, -1.5378e-09, -1.0252e-09, -3.0755e-09,
        -

## Dividing into Training and Test sets

In [50]:
# Dividing into Training and Test sets
def split_train_test(normalised_embeddings_array, labels, test_size = 0.30):
  X_train, X_test, y_train, y_test = train_test_split(normalised_embeddings_array, labels, test_size=test_size, random_state=1)
  return X_train, X_test, y_train, y_test

In [52]:
# Train Test splitting
# X_train_wav2vec, X_test_wav2vec, y_train_wav2vec, y_test_wav2vec = split_train_test(normalised_embeddings_wav2vec, labels, test_size = 0.30)
# X_train_hubert, X_test_hubert, y_train_hubert, y_test_hubert = split_train_test(normalised_embeddings_hubert, labels, test_size = 0.30)
X_train_byols, X_test_byols, y_train_byols, y_test_byols = split_train_test(normalised_embeddings_byols, labels, test_size = 0.30)
X_train_compare, X_test_compare, y_train_compare, y_test_compare = split_train_test(normalised_embeddings_compare, labels, test_size = 0.30)
X_train_egemaps, X_test_egemaps, y_train_egemaps, y_test_egemaps = split_train_test(normalised_embeddings_egemaps, labels, test_size = 0.30)

models = ['byols', 'compare', 'egemaps']
normalised_embeddings_arrays = [normalised_embeddings_byols, normalised_embeddings_compare, normalised_embeddings_egemaps]

# Verify
for i in range(len(models)):
  print()
  print()
  print('MODEL: {}'.format(models[i]))
  print()
  X_train, X_test, y_train, y_test = split_train_test(normalised_embeddings_arrays[i], labels, test_size = 0.30)
  print('The shape of X_train is: {}'.format(X_train.shape))
  print('X_train')
  print(X_train)
  print()
  print('The shape of X_test is: {}'.format(X_test.shape))
  print('X_test')
  print(X_test)
  print()
  print('The length of y_train is: {}'.format(len(y_train)))
  print('y_train')
  print(y_train)
  print()
  print('The length of y_test is: {}'.format(len(y_test)))
  print('y_test')
  print(y_test)




MODEL: byols

The shape of X_train is: torch.Size([5209, 2048])
X_train
tensor([[-0.7098,  0.1866, -0.4347,  ..., -0.5939,  1.7977,  0.2677],
        [-1.4140, -0.1380,  0.6529,  ..., -2.0226, -1.1296, -0.5417],
        [ 1.5970,  0.6306, -0.8873,  ..., -1.5388, -0.9297, -0.0992],
        ...,
        [-1.9555, -0.5569,  0.5279,  ...,  0.0810,  0.5017, -1.7509],
        [ 0.5073,  0.7398, -1.9712,  ..., -0.7826, -0.4282, -0.9772],
        [ 0.0717,  0.4705,  1.4781,  ...,  0.9536,  0.9126,  0.1855]])

The shape of X_test is: torch.Size([2233, 2048])
X_test
tensor([[-1.4345,  1.7667,  0.2888,  ...,  0.5796, -0.4333, -0.2417],
        [-0.8196, -0.8285,  0.7556,  ..., -0.8025, -1.5675, -0.9801],
        [-1.1978, -1.0728, -0.1663,  ..., -0.0934, -0.0636, -1.4311],
        ...,
        [-0.6771, -0.2425,  0.9134,  ...,  0.7599, -1.0981,  2.0553],
        [ 0.7506,  0.7905,  2.0447,  ..., -0.3995,  0.1458,  1.6060],
        [-0.6730,  0.0987,  1.1640,  ...,  0.0543,  0.6453,  0.8884]])



# EmoDB

## Phase 1

In [53]:
# Phase_1
# Load dataset
! kaggle datasets download -d piyushagni5/berlin-database-of-emotional-speech-emodb
! unzip berlin-database-of-emotional-speech-emodb.zip

# Resample dataset
audio_files_emo = glob(os.path.join('/content/wav','*.wav'))
audio_list_emo= load_audio_files(audio_files_emo, resampling_frequency=16000)


# Verify phase_1
print()
print('number of audio files: {}'.format(len(audio_list_emo)))
print(audio_list_emo[0].shape)


berlin-database-of-emotional-speech-emodb.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  berlin-database-of-emotional-speech-emodb.zip
replace wav/03a01Fa.wav? [y]es, [n]o, [A]ll, [N]one, [r]ename: None

number of audio files: 535
torch.Size([87957])


## Phase 2

In [54]:
# Phase_2
# Wav2vec
# model = audio_embeddings_model(model_name='wav2vec2')
# embeddings_array_wav2vec = audio_embeddings(audio_list_emo[:2], model_name='wav2vec2', model=model)

# Hubert
# model = audio_embeddings_model(model_name='hubert')
# embeddings_array_hubert = audio_embeddings(audio_list_emo[:2], model_name='hubert', model=model)

# Hybrid BYOLS
model = audio_embeddings_model(model_name='hybrid_byols')
embeddings_array_byols = audio_embeddings(audio_list_emo, model_name='hybrid_byols', model=model)

# EmoDB compare
model = audio_embeddings_model(model_name='compare')
embeddings_array_compare = audio_embeddings(audio_list_emo, model_name='compare', model=model)

# EmoDB egemaps
model = audio_embeddings_model(model_name='egemaps')
embeddings_array_egemaps = audio_embeddings(audio_list_emo, model_name='egemaps', model=model)


# Verify Phase_2
models = ['byols', 'compare', 'egemaps']
embeddings_arrays = [embeddings_array_byols, embeddings_array_compare, embeddings_array_egemaps]

for i in range(len(models)):
  print()
  print()
  print('MODEL: {}'.format(models[i]))
  print()
  print('The shape of the embeddings array is {}'.format(embeddings_arrays[i].shape))
  print('The embeddings array is: ')
  print((embeddings_arrays[i]))


Generating Embeddings...: 100%|██████████| 535/535 [00:24<00:00, 21.72it/s]
100%|██████████| 535/535 [00:55<00:00,  9.65it/s]
100%|██████████| 535/535 [01:00<00:00,  8.87it/s]




MODEL: byols

The shape of the embeddings array is torch.Size([535, 2048])
The embeddings array is: 
tensor([[ 3.2043,  5.1246,  0.7757,  ...,  5.2862, -1.5630,  4.2663],
        [ 3.6598,  7.1753,  2.1226,  ...,  4.0313,  0.7807,  3.5573],
        [ 4.9119,  4.5869,  1.1540,  ...,  4.8820, -0.8673,  2.8869],
        ...,
        [ 5.5373,  3.1271,  1.8133,  ...,  4.4415, -1.1598,  3.0143],
        [ 3.0584,  3.6032, -0.1855,  ...,  5.9404, -0.3081,  3.4181],
        [ 4.2166,  3.6123,  1.7138,  ...,  4.7688,  0.1352,  3.7743]])


MODEL: compare

The shape of the embeddings array is torch.Size([535, 6373])
The embeddings array is: 
tensor([[2.3726e+00, 1.0517e-01, 9.6310e-01,  ..., 6.7568e+01, 1.1536e+02,
         5.9326e+01],
        [2.2908e+00, 1.0602e-01, 9.7135e-01,  ..., 6.3124e+01, 1.3373e+02,
         7.4071e+01],
        [2.5637e+00, 1.5772e-01, 3.5570e-01,  ..., 6.2385e+01, 1.1408e+02,
         5.0285e+01],
        ...,
        [3.4053e+00, 4.1453e-01, 3.4188e-02,  ..., 7.7

## Phase 3

In [55]:
# Phase_3
speakers = []
labels = []

for audio_file in audio_files_emo:
  file_name = audio_file.split('/')[3]
  speakers.append(int(file_name[:2]))
  labels.append(file_name[5:7])


# Verify speakers and labels array
print()
print()
print('Speakers:')
print(speakers)
print('Labels:')
print(labels)


# Normalised arrays
# normalised_embeddings_wav2vec = speaker_normalisation(embeddings_array_wav2vec, speakers)
# normalised_embeddings_hubert = speaker_normalisation(embeddings_array_hubert, speakers)
normalised_embeddings_byols = speaker_normalisation(embeddings_array_byols, speakers)
normalised_embeddings_compare= speaker_normalisation(embeddings_array_compare, speakers)
normalised_embeddings_egemaps = speaker_normalisation(embeddings_array_egemaps, speakers)


# Verifying Phase_3
normalised_embeddings_arrays = [normalised_embeddings_byols, normalised_embeddings_compare, normalised_embeddings_egemaps]

for i in range(len(models)):
  print()
  print()
  print('MODEL: {}'.format(models[i]))
  print()
  print('The shape of the normalised embeddings array is: {}'.format(normalised_embeddings_arrays[i].shape))
  print('Normalised Embeddings Array:')
  print((normalised_embeddings_arrays[i]))
  print()
  columnwise_mean = torch.mean(speaker_normalisation(embeddings_arrays[i], speakers), 0)
  print('Columnwise_mean:')
  print(columnwise_mean)
  if torch.all(columnwise_mean < 10**(-6)):
    print('All means are less than 10**-6')
  else:
    print('All means are NOT less than 10**-6')




Speakers:
[14, 13, 14, 14, 10, 16, 3, 14, 16, 15, 9, 15, 14, 9, 8, 11, 14, 11, 8, 11, 3, 13, 8, 14, 14, 13, 14, 12, 10, 3, 8, 16, 3, 13, 14, 14, 12, 11, 16, 13, 3, 15, 15, 15, 14, 15, 14, 16, 8, 3, 8, 3, 11, 12, 15, 14, 14, 8, 14, 13, 16, 12, 14, 15, 12, 16, 10, 9, 14, 13, 15, 16, 13, 15, 10, 3, 8, 3, 11, 10, 15, 8, 11, 13, 8, 9, 11, 14, 13, 14, 16, 3, 16, 11, 11, 13, 14, 9, 13, 15, 8, 16, 15, 9, 14, 8, 14, 13, 13, 16, 15, 11, 3, 16, 13, 13, 14, 11, 13, 15, 16, 12, 3, 9, 14, 3, 14, 16, 13, 13, 15, 12, 10, 16, 9, 10, 14, 8, 16, 13, 15, 15, 3, 3, 14, 11, 10, 16, 14, 9, 9, 8, 16, 3, 11, 9, 14, 9, 14, 13, 14, 16, 14, 10, 9, 14, 11, 14, 13, 8, 15, 8, 15, 3, 15, 12, 8, 13, 13, 9, 3, 9, 8, 8, 14, 9, 3, 15, 11, 16, 16, 9, 8, 3, 16, 9, 15, 8, 8, 8, 12, 16, 15, 14, 3, 13, 14, 14, 16, 9, 13, 16, 13, 11, 12, 10, 10, 12, 9, 14, 13, 3, 3, 16, 13, 16, 3, 11, 10, 10, 13, 8, 10, 10, 11, 10, 16, 16, 16, 16, 12, 9, 14, 15, 16, 16, 16, 9, 13, 13, 12, 3, 10, 8, 15, 11, 11, 15, 16, 15, 13, 15, 3, 15, 15, 

In [56]:
# Phase_3

# Train Test splitting
# X_train_wav2vec, X_test_wav2vec, y_train_wav2vec, y_test_wav2vec = split_train_test(normalised_embeddings_wav2vec, labels, test_size = 0.30)
# X_train_hubert, X_test_hubert, y_train_hubert, y_test_hubert = split_train_test(normalised_embeddings_hubert, labels, test_size = 0.30)
X_train_byols, X_test_byols, y_train_byols, y_test_byols = split_train_test(normalised_embeddings_byols, labels, test_size = 0.30)
X_train_compare, X_test_compare, y_train_compare, y_test_compare = split_train_test(normalised_embeddings_compare, labels, test_size = 0.30)
X_train_egemaps, X_test_egemaps, y_train_egemaps, y_test_egemaps = split_train_test(normalised_embeddings_egemaps, labels, test_size = 0.30)

# Verify
for i in range(len(models)):
  print()
  print()
  print('MODEL: {}'.format(models[i]))
  print()
  X_train, X_test, y_train, y_test = split_train_test(normalised_embeddings_arrays[i], labels, test_size = 0.30)
  print('The shape of X_train is: {}'.format(X_train.shape))
  print('X_train')
  print(X_train)
  print()
  print('The shape of X_test is: {}'.format(X_test.shape))
  print('X_test')
  print(X_test)
  print()
  print('The length of y_train is: {}'.format(len(y_train)))
  print('y_train')
  print(y_train)
  print()
  print('The length of y_test is: {}'.format(len(y_test)))
  print('y_test')
  print(y_test)




MODEL: byols

The shape of X_train is: torch.Size([374, 2048])
X_train
tensor([[ 2.8226e-01,  3.3452e+00,  8.1016e-01,  ...,  1.2256e-01,
         -1.7933e-01, -1.1523e+00],
        [-3.9343e-02,  1.7592e+00, -7.0430e-01,  ...,  6.9959e-01,
          1.1014e+00,  6.0119e-01],
        [ 1.3394e+00,  5.6732e-02, -4.5411e-02,  ..., -9.6064e-01,
         -8.1373e-01,  1.8339e-02],
        ...,
        [-2.0256e+00,  1.1691e+00, -9.3264e-01,  ..., -1.0528e+00,
          1.4449e+00,  1.3276e+00],
        [-1.1762e+00,  6.9049e-04, -3.6890e-01,  ..., -5.2358e-01,
          4.1650e-01,  3.3237e+00],
        [ 9.7718e-01,  1.6894e+00,  2.5897e+00,  ...,  9.0606e-01,
         -8.5933e-01,  5.4565e-01]])

The shape of X_test is: torch.Size([161, 2048])
X_test
tensor([[ 1.2725, -1.4142,  0.5159,  ...,  0.7724,  1.2364, -0.9796],
        [ 1.0047,  0.7296, -0.5194,  ..., -0.1874, -0.2266,  0.8707],
        [-1.7569, -0.7789,  0.9907,  ...,  0.5762,  0.5052,  1.3295],
        ...,
        [ 1.1123