# Phase 2: Embedding Extraction

## Importing packages

In [None]:
import numpy as np
import soundfile as sf
import librosa
import os
from glob import glob
import torch

In [None]:
! pip install -q kaggle

from google.colab import files
files.upload()

# Name directory
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [None]:
# CREMA-D Dataset

! kaggle datasets download -d ejlok1/cremad
! unzip cremad.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: AudioWAV/1031_DFA_DIS_XX.wav  
  inflating: AudioWAV/1031_DFA_FEA_XX.wav  
  inflating: AudioWAV/1031_DFA_HAP_XX.wav  
  inflating: AudioWAV/1031_DFA_NEU_XX.wav  
  inflating: AudioWAV/1031_DFA_SAD_XX.wav  
  inflating: AudioWAV/1031_IEO_ANG_HI.wav  
  inflating: AudioWAV/1031_IEO_ANG_LO.wav  
  inflating: AudioWAV/1031_IEO_ANG_MD.wav  
  inflating: AudioWAV/1031_IEO_DIS_HI.wav  
  inflating: AudioWAV/1031_IEO_DIS_LO.wav  
  inflating: AudioWAV/1031_IEO_DIS_MD.wav  
  inflating: AudioWAV/1031_IEO_FEA_HI.wav  
  inflating: AudioWAV/1031_IEO_FEA_LO.wav  
  inflating: AudioWAV/1031_IEO_FEA_MD.wav  
  inflating: AudioWAV/1031_IEO_HAP_HI.wav  
  inflating: AudioWAV/1031_IEO_HAP_LO.wav  
  inflating: AudioWAV/1031_IEO_HAP_MD.wav  
  inflating: AudioWAV/1031_IEO_NEU_XX.wav  
  inflating: AudioWAV/1031_IEO_SAD_HI.wav  
  inflating: AudioWAV/1031_IEO_SAD_LO.wav  
  inflating: AudioWAV/1031_IEO_SAD_MD.wav  
  inflating

# Phase 1: Loading and resampling audio files

In [None]:
# Defining function for loading and resampling audio files

# Processing audio files
audio_files_crema_d = glob(os.path.join('/content/AudioWAV','*.wav'))

def load_audio_files(audio_files, resampling_frequency=16000, audio_list=None):
    '''
    Loads and resamples audio files 
    
    Parameters
    ------------
    audio_files: string
        The paths of the wav files 
    resampling_frequency: integer
        The frequency which all audios will be resampled to
    audio_list: list of torch arrays of audios to which more audios need too be added, empty by default

    Returns
    ------------
    audio_list: list of torch arrays
        A list of torch arrays, one array for each audio file
        
    '''

    # Making audio_list
    if audio_list is None:
      audio_list = []

    # Resampling
    for audio in audio_files:
        signal, fs = librosa.load(audio, sr=resampling_frequency)
        audio_list.append(torch.from_numpy(signal))
        
    return audio_list
        

In [None]:

audio_files_crema_d = glob(os.path.join('/content/AudioWAV','*.wav'))

# Using load_audio_files function
audio_list_crema_d = load_audio_files(audio_files_crema_d, resampling_frequency=16000)
print('number of audio files: {}'.format(len(audio_list_crema_d)))
print(audio_list_crema_d[0].shape)

number of audio files: 7442
torch.Size([26159])


# Phase 2: Embedding Extraction

In [None]:
!pip install speechbrain
!pip install transformers
!git clone https://github.com/GasserElbanna/serab-byols.git
!python3 -m pip install -e ./serab-byols

!pip install tqdm==4.60.0
!pip install opensmile


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting speechbrain
  Downloading speechbrain-0.5.12-py3-none-any.whl (496 kB)
[K     |████████████████████████████████| 496 kB 4.1 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 53.9 MB/s 
Collecting hyperpyyaml
  Downloading HyperPyYAML-1.0.1.tar.gz (14 kB)
Collecting huggingface-hub
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 5.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 73.3 MB/s 
Collecting ruamel.yaml>=0.17.8
  Downloading ruamel.yaml-0.17.21-py3-none-any.whl (109 kB)
[K     |████████████████████████████████| 109

In [None]:
from tqdm import tqdm
import serab_byols
import opensmile
from transformers import Wav2Vec2Model, HubertModel

## Defining the functions

In [None]:
def audio_embeddings_model(model_name):
  '''
  Generates model for embedding extraction 
  
  Parameters
  ------------
  mode_name: string
      The model to used, could be 'wav2vec', 'hubert' or 'hybrid_byols'

  Returns
  ------------
  model: object

  '''
  if model_name=='wav2vec2':
    model_hub = 'facebook/wav2vec2-large-960h-lv60-self'
    model = Wav2Vec2Model.from_pretrained(model_hub)
  elif model_name=='hubert':
    model_hub = 'facebook/hubert-xlarge-ll60k'
    model = HubertModel.from_pretrained(model_hub)
  elif model_name=='hybrid_byols':
    model_name = 'cvt'
    checkpoint_path = "serab-byols/checkpoints/cvt_s1-d1-e64_s2-d1-e256_s3-d1-e512_BYOLAs64x96-osandbyolaloss6373-e100-bs256-lr0003-rs42.pth"
    model = serab_byols.load_model(checkpoint_path, model_name)
  elif model_name=='compare':
    model = opensmile.Smile(
        feature_set=opensmile.FeatureSet.ComParE_2016,
        feature_level=opensmile.FeatureLevel.Functionals,
    )
  elif model_name=='egemaps':
    model = opensmile.Smile(
        feature_set=opensmile.FeatureSet.eGeMAPSv02,
        feature_level=opensmile.FeatureLevel.Functionals,
    )
  return model


def audio_embeddings(audio_list, model_name, model, sampling_rate=16000):
  '''
  Loads and resamples audio files 
  
  Parameters
  ------------
  audio_list: list of arrays
      A list of arrays, one array for each audio file
  model_name: string
      The model to used, could be 'wav2vec', 'hubert' or 'hybrid_byols'
  model: object
      The model generated by audio_embeddings_model function
  n_feats: int
      The number of features of each audio file, 6373 for 'compare' and 88 for 'egemaps'

  Returns
  ------------
  embeddings_array: torch array
      The array containg embeddings of all audio_files, dimension (number of audio files × n_feats)
      
  '''
  if model_name=='hybrid_byols':
    embeddings_array = serab_byols.get_scene_embeddings(audio_list, model)
  else:
    embeddings_list = []
    for i in tqdm(range(len(audio_list))):
      if model_name=='wav2vec2' or model_name=='hubert':
        embeddings = model(audio_list[i].reshape(1,-1)).last_hidden_state.mean(1)
        embeddings_list.append(embeddings[0])
      elif model_name=='compare' or model_name=='egemaps':
        embeddings = model.process_signal(audio_list[i], sampling_rate)
        embeddings_list.append(torch.tensor(embeddings.values[0], dtype=torch.float32))
    embeddings_array = torch.stack(embeddings_list)
  return embeddings_array


## 1. Wav2vec 2.0

In [None]:
model = audio_embeddings_model(model_name='wav2vec2')


Downloading:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/wav2vec2-large-960h-lv60-self were not used when initializing Wav2Vec2Model: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
embeddings_array_wav2vec = audio_embeddings(audio_list_crema_d[:2], model_name='wav2vec2', model=model)
print(embeddings_array_wav2vec.shape)

100%|██████████| 2/2 [00:04<00:00,  2.49s/it]

torch.Size([2, 1024])





## 2. Hubert

In [None]:
model = audio_embeddings_model(model_name='hubert')


Downloading:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

In [None]:
embeddings_array_hubert = audio_embeddings(audio_list_crema_d[:2], model_name='hubert', model=model)


100%|██████████| 2/2 [00:14<00:00,  7.28s/it]


In [None]:
print(embeddings_array_hubert.shape)
print(embeddings_array_hubert)

torch.Size([2, 1280])
tensor([[-0.1331, -0.0381, -0.0212,  ...,  0.1765, -0.0201, -0.0123],
        [ 0.0113, -0.0210,  0.0478,  ...,  0.1125, -0.0100, -0.0545]],
       grad_fn=<StackBackward0>)


## 3. Hybrid BYOL-S

In [None]:
model = audio_embeddings_model(model_name='hybrid_byols')


In [None]:
embeddings_array_byols = audio_embeddings(audio_list_crema_d, model_name='hybrid_byols', model=model)

Generating Embeddings...: 100%|██████████| 7442/7442 [16:20<00:00,  7.59it/s]


In [None]:
print(embeddings_array_byols.shape)
print(embeddings_array_byols)

torch.Size([7442, 2048])
tensor([[ 3.7383,  3.5858,  1.6220,  ...,  4.1465,  0.4608,  4.7381],
        [ 4.4155,  2.8294,  2.1176,  ...,  4.2396, -2.4157,  4.4473],
        [ 4.5136,  2.9190,  1.1278,  ...,  3.3659,  0.7211,  4.4434],
        ...,
        [ 4.0179,  2.9099,  1.6047,  ...,  3.6393, -1.2134,  4.7288],
        [ 4.8532,  2.9524,  1.8612,  ...,  3.1700,  0.0949,  3.8128],
        [ 2.9410,  3.9386,  1.3845,  ...,  2.8808, -0.7497,  4.5383]])


## 4. openSMILE: ComParE_2016

In [None]:
model = audio_embeddings_model(model_name='compare')
embeddings_array_compare = audio_embeddings(audio_list_crema_d, model_name='compare', model=model)
print(embeddings_array_compare.shape)

100%|██████████| 7442/7442 [12:40<00:00,  9.79it/s]

torch.Size([7442, 6373])





In [None]:
print(embeddings_array_compare)

tensor([[2.1515e+00, 2.6282e-01, 9.9359e-01,  ..., 4.5654e+01, 1.1108e+02,
         3.9020e+01],
        [1.5562e+00, 2.8326e-01, 3.0043e-02,  ..., 7.0729e+01, 1.0730e+02,
         5.6093e+01],
        [1.2251e+00, 3.6073e-01, 5.9361e-02,  ..., 5.3312e+01, 8.9132e+01,
         4.6726e+01],
        ...,
        [1.5438e+00, 2.9155e-01, 8.4257e-01,  ..., 5.0345e+01, 9.2052e+01,
         5.4620e+01],
        [2.2159e+00, 4.1636e-01, 1.5242e-01,  ..., 4.1756e+01, 1.1056e+02,
         6.2525e+01],
        [7.3584e-01, 6.1232e-01, 0.0000e+00,  ..., 4.6456e+01, 9.6339e+01,
         4.4667e+01]])


## 5. openSMILE: eGeMAPSv02

In [None]:
model = audio_embeddings_model(model_name='egemaps')
embeddings_array_egemaps = audio_embeddings(audio_list_crema_d, model_name='egemaps', model=model)
print(embeddings_array_egemaps.shape)

100%|██████████| 7442/7442 [13:08<00:00,  9.44it/s]

torch.Size([7442, 88])





In [None]:
print(embeddings_array_egemaps)

tensor([[ 32.9292,   0.0568,  32.1032,  ...,   0.2025,   0.1737, -22.1917],
        [ 43.6140,   0.1619,  37.6572,  ...,   0.3233,   0.3570, -28.8015],
        [ 22.6801,   0.0748,  21.9986,  ...,   0.1337,   0.1166, -27.5571],
        ...,
        [ 33.0593,   0.0738,  31.0595,  ...,   0.2900,   0.4182, -26.6729],
        [ 40.6903,   0.1076,  39.1804,  ...,   0.2867,   0.2730, -25.3926],
        [ 24.4548,   0.1244,  22.0018,  ...,   0.3050,   0.2615, -30.9298]])


<B>Problems:</B>
- Colab crashes when using hubert or wav2vec for all 200 audio files, works well for a list of 20 audio files for wav2vec and a list of 5 audio files for hubert

# Phase 3: Downstream Task - Speech Emotion Recognotion

## Extracting labels and speaker ID

In [None]:
speakers = []
labels = []

for audio_file in audio_files_crema_d:
  file_name = audio_file.split('/')[3]
  segments = file_name.split('_')
  speakers.append(int(segments[0]))
  labels.append(segments[2])

print(speakers)
print(labels)

[1015, 1002, 1077, 1078, 1016, 1013, 1055, 1017, 1020, 1006, 1024, 1062, 1042, 1001, 1037, 1067, 1065, 1009, 1029, 1034, 1033, 1030, 1064, 1089, 1054, 1013, 1038, 1039, 1045, 1005, 1022, 1088, 1047, 1035, 1082, 1056, 1078, 1012, 1042, 1010, 1080, 1016, 1045, 1032, 1010, 1051, 1076, 1058, 1063, 1027, 1007, 1075, 1009, 1086, 1081, 1029, 1019, 1063, 1052, 1056, 1024, 1032, 1063, 1008, 1040, 1064, 1021, 1024, 1055, 1004, 1086, 1053, 1025, 1041, 1002, 1055, 1041, 1039, 1060, 1082, 1042, 1067, 1016, 1024, 1024, 1060, 1039, 1062, 1031, 1005, 1013, 1054, 1084, 1079, 1048, 1046, 1017, 1079, 1076, 1064, 1048, 1052, 1014, 1033, 1072, 1068, 1028, 1076, 1022, 1069, 1037, 1071, 1069, 1024, 1059, 1070, 1077, 1044, 1034, 1021, 1066, 1050, 1057, 1009, 1044, 1020, 1057, 1063, 1061, 1019, 1027, 1061, 1029, 1074, 1009, 1032, 1061, 1043, 1014, 1053, 1088, 1056, 1049, 1012, 1084, 1028, 1056, 1082, 1082, 1064, 1068, 1085, 1091, 1024, 1078, 1023, 1087, 1074, 1019, 1002, 1061, 1007, 1081, 1066, 1053, 1091, 105

## Speaker normalisation

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()


In [None]:

def speaker_normalisation(embeddings_array, speakers):
  '''
  Normalises embeddings_array for each speaker
  
  Parameters
  ------------
  embeddings_array: torch tensor
      The tensor of embeddings, one row for each audio file
  speakers: list of integers
      The list of speakers

  Returns
  ------------
  embeddings_array: torch tensor
      The tensor containg normalised embeddings 
      
  '''
  speaker_ids = set(speakers)
  for speaker_id in speaker_ids:
    speaker_embeddings_indices = np.where(np.array(speakers)==speaker_id)[0]
    speaker_embeddings = embeddings_array[speaker_embeddings_indices,:]
    normalised_speaker_embeddings = scaler.fit_transform(speaker_embeddings)
    embeddings_array[speaker_embeddings_indices] = torch.tensor(normalised_speaker_embeddings).float()
  return embeddings_array


## Example 1: wav2vec 2.0

In [None]:
# print(embeddings_array_wav2vec.shape)
# print(embeddings_array_wav2vec)
# print(speaker_normalisation(embeddings_array_wav2vec, speakers).shape)
# print(speaker_normalisation(embeddings_array_wav2vec, speakers))

# validate by taking column-wise mean
# print(torch.mean(speaker_normalisation(embeddings_array_wav2vec, speakers), 1))

## Example 2: Hubert

In [None]:
# print(embeddings_array_hubert.shape)
# print(embeddings_array_hubert)
# print(speaker_normalisation(embeddings_array_hubert, speakers).shape)
# print(speaker_normalisation(embeddings_array_hubert, speakers))

# validate by taking column-wise mean
# print(torch.mean(speaker_normalisation(embeddings_array_hubert, speakers), 1))

## Example 3: Hybrid BYOLS

In [None]:
print(embeddings_array_byols.shape)
print(embeddings_array_byols)
print(speaker_normalisation(embeddings_array_byols, speakers).shape)
print(speaker_normalisation(embeddings_array_byols, speakers))

# validate by taking column-wise mean
print(torch.mean(speaker_normalisation(embeddings_array_byols, speakers), 1))

torch.Size([7442, 2048])
tensor([[ 3.7383,  3.5858,  1.6220,  ...,  4.1465,  0.4608,  4.7381],
        [ 4.4155,  2.8294,  2.1176,  ...,  4.2396, -2.4157,  4.4473],
        [ 4.5136,  2.9190,  1.1278,  ...,  3.3659,  0.7211,  4.4434],
        ...,
        [ 4.0179,  2.9099,  1.6047,  ...,  3.6393, -1.2134,  4.7288],
        [ 4.8532,  2.9524,  1.8612,  ...,  3.1700,  0.0949,  3.8128],
        [ 2.9410,  3.9386,  1.3845,  ...,  2.8808, -0.7497,  4.5383]])
torch.Size([7442, 2048])
tensor([[-0.1946,  1.2379,  0.2978,  ...,  1.3946,  1.6653,  1.4108],
        [-0.0304, -1.3611,  1.1428,  ...,  2.3054, -1.3158, -0.0924],
        [ 0.9255, -0.2377, -1.1589,  ..., -0.5091,  0.5474,  0.6157],
        ...,
        [ 0.6414,  0.0873,  0.1530,  ...,  0.9645, -0.5425,  0.4777],
        [ 0.4065,  0.4576, -0.8603,  ...,  0.1212,  0.8779, -0.2968],
        [-1.1490,  1.1902, -0.4735,  ..., -0.6292, -0.4608,  0.7694]])


## Example 4: openSMILE compare

In [None]:
print(embeddings_array_compare.shape)
print(embeddings_array_compare)
print(speaker_normalisation(embeddings_array_compare, speakers).shape)
print(speaker_normalisation(embeddings_array_compare, speakers))

# validate by taking column-wise mean
print(torch.mean(speaker_normalisation(embeddings_array_compare, speakers), 1))

torch.Size([7442, 6373])
tensor([[2.1515e+00, 2.6282e-01, 9.9359e-01,  ..., 4.5654e+01, 1.1108e+02,
         3.9020e+01],
        [1.5562e+00, 2.8326e-01, 3.0043e-02,  ..., 7.0729e+01, 1.0730e+02,
         5.6093e+01],
        [1.2251e+00, 3.6073e-01, 5.9361e-02,  ..., 5.3312e+01, 8.9132e+01,
         4.6726e+01],
        ...,
        [1.5438e+00, 2.9155e-01, 8.4257e-01,  ..., 5.0345e+01, 9.2052e+01,
         5.4620e+01],
        [2.2159e+00, 4.1636e-01, 1.5242e-01,  ..., 4.1756e+01, 1.1056e+02,
         6.2525e+01],
        [7.3584e-01, 6.1232e-01, 0.0000e+00,  ..., 4.6456e+01, 9.6339e+01,
         4.4667e+01]])
torch.Size([7442, 6373])
tensor([[ 1.4951, -1.1937,  1.6263,  ..., -0.1372,  1.1085, -1.0800],
        [ 0.3217, -0.9272, -1.2508,  ...,  1.9619,  0.3010,  0.4407],
        [-0.2444,  0.0856, -0.9097,  ...,  0.5958, -0.7331,  0.1181],
        ...,
        [ 0.0213, -0.1868,  0.6908,  ...,  0.0659, -0.7856,  0.5868],
        [ 1.0842, -0.2237, -0.4513,  ..., -0.7978,  0.6979,  

## Example 5: openSMILE egemaps

In [None]:
print(embeddings_array_egemaps.shape)
print(embeddings_array_egemaps)
print(speaker_normalisation(embeddings_array_egemaps, speakers).shape)
print(speaker_normalisation(embeddings_array_egemaps, speakers))

# validate by taking column-wise mean
print(torch.mean(speaker_normalisation(embeddings_array_egemaps, speakers), 1))

torch.Size([7442, 88])
tensor([[ 32.9292,   0.0568,  32.1032,  ...,   0.2025,   0.1737, -22.1917],
        [ 43.6140,   0.1619,  37.6572,  ...,   0.3233,   0.3570, -28.8015],
        [ 22.6801,   0.0748,  21.9986,  ...,   0.1337,   0.1166, -27.5571],
        ...,
        [ 33.0593,   0.0738,  31.0595,  ...,   0.2900,   0.4182, -26.6729],
        [ 40.6903,   0.1076,  39.1804,  ...,   0.2867,   0.2730, -25.3926],
        [ 24.4548,   0.1244,  22.0018,  ...,   0.3050,   0.2615, -30.9298]])
torch.Size([7442, 88])
tensor([[ 0.7936, -0.7183,  1.0777,  ..., -0.9591, -1.3260,  1.9072],
        [ 1.1886,  1.9200,  0.2709,  ..., -0.8155, -0.2885,  0.7240],
        [-0.5616, -0.1517, -0.2618,  ..., -1.7764, -1.6499,  0.2195],
        ...,
        [ 0.6619, -0.5094,  0.7873,  ..., -0.6515,  0.3256,  0.7100],
        [ 1.1256, -0.1432,  1.1934,  ..., -0.1999, -0.0109,  0.9992],
        [-1.2436,  0.1313, -1.1348,  ..., -0.5049, -0.3859, -0.2636]])
