# Phase 2: Embedding Extraction

## Importing packages

In [2]:
import numpy as np
import soundfile as sf
import librosa
import os
from glob import glob
import torch

In [3]:
! pip install -q kaggle

from google.colab import files
files.upload()

# Name directory
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [4]:
# TESS Dataset

# Paste API from kaggle
! kaggle datasets download -d ejlok1/toronto-emotional-speech-set-tess
# Unzip
! unzip toronto-emotional-speech-set-tess.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: TESS Toronto emotional speech set data/OAF_angry/OAF_back_angry.wav  
  inflating: TESS Toronto emotional speech set data/OAF_angry/OAF_bar_angry.wav  
  inflating: TESS Toronto emotional speech set data/OAF_angry/OAF_base_angry.wav  
  inflating: TESS Toronto emotional speech set data/OAF_angry/OAF_bath_angry.wav  
  inflating: TESS Toronto emotional speech set data/OAF_angry/OAF_bean_angry.wav  
  inflating: TESS Toronto emotional speech set data/OAF_angry/OAF_beg_angry.wav  
  inflating: TESS Toronto emotional speech set data/OAF_angry/OAF_bite_angry.wav  
  inflating: TESS Toronto emotional speech set data/OAF_angry/OAF_boat_angry.wav  
  inflating: TESS Toronto emotional speech set data/OAF_angry/OAF_bone_angry.wav  
  inflating: TESS Toronto emotional speech set data/OAF_angry/OAF_book_angry.wav  
  inflating: TESS Toronto emotional speech set data/OAF_angry/OAF_bought_angry.wav  
  inflating: TESS Toro

# Phase 1: Loading and resampling audio files

In [3]:
# Defining function for loading and resampling audio files

# Glob pattern matching every wav file inside the OAF_Fear folder
wav_pattern = os.path.join('/content/TESS Toronto emotional speech set data/OAF_Fear','*.wav')
# Paths of the audio files to process
audio_files = glob(wav_pattern)

def load_audio_files(audio_files, resampling_frequency=16000, audio_list=None):
    '''
    Loads wav files with librosa at a common sampling rate and collects them as tensors

    Parameters
    ------------
    audio_files: iterable of strings
        The paths of the wav files to load
    resampling_frequency: integer
        The sampling rate passed to librosa.load for every file
    audio_list: list, optional
        An existing list to which the loaded audios are appended;
        a new empty list is created when it is None (the default)

    Returns
    ------------
    audio_list: list of tensors
        The list passed in (or the newly created one) with one torch
        tensor appended for each audio file

    '''

    # Start a fresh list unless the caller supplied one to extend
    loaded = [] if audio_list is None else audio_list

    # librosa.load yields a (signal, sampling rate) pair; only the signal is kept
    decoded = (librosa.load(wav_path, sr=resampling_frequency) for wav_path in audio_files)
    loaded.extend(torch.from_numpy(signal) for signal, _ in decoded)

    return loaded
        

In [4]:

# Collect the wav paths of the OAF_Fear folder
fear_dir = '/content/TESS Toronto emotional speech set data/OAF_Fear'
audio_files = glob(os.path.join(fear_dir, '*.wav'))

# Load every file as a tensor at a sampling rate of 16000
audio_list_tess = load_audio_files(
    audio_files,
    resampling_frequency=16000,
)
print('number of audio files: {}'.format(len(audio_list_tess)))
print(audio_list_tess[0].shape)

number of audio files: 200
torch.Size([26147])


# Phase 2: Embedding Extraction

In [7]:
!pip install speechbrain
!pip install transformers
!git clone https://github.com/GasserElbanna/serab-byols.git
!python3 -m pip install -e ./serab-byols

!pip install tqdm==4.60.0
!pip install opensmile
import serab_byols
import opensmile
from transformers import Wav2Vec2Model, HubertModel


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
fatal: destination path 'serab-byols' already exists and is not an empty directory.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Obtaining file:///content/serab-byols
Installing collected packages: serab-byols
  Attempting uninstall: serab-byols
    Found existing installation: serab-byols 0.0.0
    Can't uninstall 'serab-byols'. No files were found to uninstall.
  Running setup.py develop for serab-byols
Successfully installed serab-byols-0.0.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Defining the functions

In [5]:
def audio_embeddings_model(model_name):
  '''
  Builds the model or feature extractor identified by model_name

  Parameters
  ------------
  model_name: string
      One of 'wav2vec2', 'hubert', 'hybrid_byols', 'compare' or 'egemaps'

  Returns
  ------------
  model:
      A Wav2Vec2Model or HubertModel loaded with from_pretrained, the object
      returned by serab_byols.load_model, or an opensmile.Smile extractor
      configured at the Functionals feature level

  Raises
  ------------
  ValueError
      If model_name is not one of the supported names
  '''
  if model_name=='wav2vec2':
    model_hub = 'facebook/wav2vec2-large-960h-lv60-self'
    model = Wav2Vec2Model.from_pretrained(model_hub)
  elif model_name=='hubert':
    model_hub = 'facebook/hubert-xlarge-ll60k'
    model = HubertModel.from_pretrained(model_hub)
  elif model_name=='hybrid_byols':
    # serab_byols identifies this architecture as 'cvt'; pass it directly
    # instead of overwriting the model_name argument
    checkpoint_path = "serab-byols/checkpoints/cvt_s1-d1-e64_s2-d1-e256_s3-d1-e512_BYOLAs64x96-osandbyolaloss6373-e100-bs256-lr0003-rs42.pth"
    model = serab_byols.load_model(checkpoint_path, 'cvt')
  elif model_name=='compare':
    model = opensmile.Smile(
        feature_set=opensmile.FeatureSet.ComParE_2016,
        feature_level=opensmile.FeatureLevel.Functionals,
    )
  elif model_name=='egemaps':
    model = opensmile.Smile(
        feature_set=opensmile.FeatureSet.eGeMAPSv02,
        feature_level=opensmile.FeatureLevel.Functionals,
    )
  else:
    # Fail with a clear message instead of an UnboundLocalError on `return model`
    raise ValueError(
        "Unknown model_name {!r}; expected one of 'wav2vec2', 'hubert', "
        "'hybrid_byols', 'compare', 'egemaps'".format(model_name)
    )
  return model


def audio_embeddings(audio_list, model_name, model, n_feats):
  '''
  Computes one embedding vector per audio in audio_list

  Parameters
  ------------
  audio_list: list of tensors
      One 1-D tensor per audio (each is reshaped to (1, -1) before being
      passed to the model)
  model_name: string
      'hybrid_byols', 'wav2vec2' or 'hubert'
  model:
      The model matching model_name
  n_feats: integer
      Size of the embedding produced by the model; sets the number of columns
      of the output for 'wav2vec2' and 'hubert' and is not used for
      'hybrid_byols'

  Returns
  ------------
  embeddings_array: tensor
      For 'hybrid_byols', whatever serab_byols.get_scene_embeddings returns;
      otherwise a (len(audio_list), n_feats) tensor holding the mean over
      dimension 1 of each audio's last_hidden_state

  Raises
  ------------
  ValueError
      If model_name is not one of the supported names
  '''
  if model_name=='hybrid_byols':
    return serab_byols.get_scene_embeddings(audio_list, model)

  if model_name not in ('wav2vec2', 'hubert'):
    # Previously this ended in an unbound `embeddings` variable inside the loop
    raise ValueError(
        "Unsupported model_name {!r}; expected 'hybrid_byols', 'wav2vec2' "
        "or 'hubert'".format(model_name)
    )

  embeddings_array = torch.zeros([len(audio_list), n_feats])
  # Inference only: without no_grad every forward pass records an autograd
  # graph, and writing the result into embeddings_array keeps all of those
  # graphs alive, so memory grows with each processed audio
  with torch.no_grad():
    for i, audio in enumerate(audio_list):
      hidden_states = model(audio.reshape(1,-1)).last_hidden_state
      embeddings_array[i] = hidden_states.mean(1)
  return embeddings_array

def audio_features(audio_files, model, n_feats):
  '''
  Extracts one feature vector per audio file with model.process_file

  Parameters
  ------------
  audio_files: list of strings
      The paths of the audio files to process
  model:
      An object exposing process_file(path) whose result has a .values attribute
  n_feats: integer
      The number of features extracted per file

  Returns
  ------------
  feature_matrix: tensor
      A (len(audio_files), n_feats) tensor with one row per audio file
  '''
  feature_matrix = torch.zeros([len(audio_files), n_feats])
  for row, wav_path in enumerate(audio_files):
    # Row `row` receives the values extracted from the file at wav_path
    feature_matrix[row] = torch.tensor(model.process_file(wav_path).values)
  return feature_matrix


## 1. Wav2vec 2.0

In [6]:
# Load wav2vec 2.0 and embed the first 20 audios only
model = audio_embeddings_model('wav2vec2')
embeddings_array = audio_embeddings(
    audio_list_tess[:20],
    model_name='wav2vec2',
    model=model,
    n_feats=1024,
)
print(embeddings_array.shape)

Some weights of the model checkpoint at facebook/wav2vec2-large-960h-lv60-self were not used when initializing Wav2Vec2Model: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([20, 1024])


## 2. Hubert

In [8]:
# Load Hubert and embed the first 5 audios only
model = audio_embeddings_model('hubert')
embeddings_array = audio_embeddings(
    audio_list_tess[:5],
    model_name='hubert',
    model=model,
    n_feats=1280,
)
print(embeddings_array.shape)

torch.Size([5, 1280])


## 3. Hybrid BYOL-S

In [9]:
# Load Hybrid BYOL-S and embed every loaded audio
model = audio_embeddings_model('hybrid_byols')
embeddings_array = audio_embeddings(
    audio_list_tess,
    model_name='hybrid_byols',
    model=model,
    n_feats=2048,
)
print()
print(embeddings_array.shape)

Generating Embeddings...: 100%|██████████| 200/200 [00:06<00:00, 29.40it/s]


torch.Size([200, 2048])





## 4. openSMILE: ComParE_2016

In [12]:
# Build the openSMILE ComParE_2016 extractor
model = audio_embeddings_model('compare')

# openSMILE reads the wav files itself, so it is given paths rather than loaded tensors
fear_dir = '/content/TESS Toronto emotional speech set data/OAF_Fear'
audio_files = glob(os.path.join(fear_dir, '*.wav'))
embeddings_array = audio_features(
    audio_files,
    model=model,
    n_feats=6373,
)

print(embeddings_array.shape)

torch.Size([200, 6373])


## 5. openSMILE: eGeMAPSv02

In [13]:
# Build the openSMILE eGeMAPSv02 extractor
model = audio_embeddings_model('egemaps')

# openSMILE reads the wav files itself, so it is given paths rather than loaded tensors
fear_dir = '/content/TESS Toronto emotional speech set data/OAF_Fear'
audio_files = glob(os.path.join(fear_dir, '*.wav'))
embeddings_array = audio_features(
    audio_files,
    model=model,
    n_feats=88,
)

print(embeddings_array.shape)

torch.Size([200, 88])


<B>Problems:</B>
- Colab crashes when extracting Hubert or wav2vec 2.0 embeddings for all 200 audio files (most likely because the session runs out of memory). Extraction works for a subset of 20 audio files with wav2vec 2.0 and a subset of 5 audio files with Hubert.