<a href="https://colab.research.google.com/github/satvik-dixit/speech_emotion_recognition/blob/main/speech_emotion_recogniser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Speech Emotion Recogniser

A notebook to identify the emotion of an utterance in English. Trained on RAVDESS. The demo has been divided into 3 phases:
- Phase 1: Uploading Audio File
- Phase 2: Loading RAVDESS and extracting metadata
- Phase 3: Speech Emotion Recognition



### About RAVDESS:
- English
- 7356 recordings
- 24 actors (12 female, 12 male)
- 8 emotions: neutral, calm, happy, sad, angry, fearful, surprise, and disgust

### References:
- Dataset: https://zenodo.org/record/1188976#.YvyPHexBy3K
- Paper: https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0196391


Let's start by importing the packages and uploading the audio clip.


### Importing packages

In [None]:
!git clone -q https://github.com/GasserElbanna/serab-byols.git
!python3 -m pip install -q -e ./serab-byols

!pip install -q tqdm==4.60.0
!pip install ffmpeg-python

[?25l[K     |████▎                           | 10 kB 26.0 MB/s eta 0:00:01[K     |████████▋                       | 20 kB 31.3 MB/s eta 0:00:01[K     |█████████████                   | 30 kB 37.3 MB/s eta 0:00:01[K     |█████████████████▎              | 40 kB 22.8 MB/s eta 0:00:01[K     |█████████████████████▋          | 51 kB 16.6 MB/s eta 0:00:01[K     |██████████████████████████      | 61 kB 18.9 MB/s eta 0:00:01[K     |██████████████████████████████▎ | 71 kB 17.9 MB/s eta 0:00:01[K     |████████████████████████████████| 75 kB 3.4 MB/s 
[?25hLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Installing collected packages: ffmpeg-python
Successfully installed ffmpeg-python-0.2.0


In [None]:
import os
import numpy as np
from tqdm import tqdm
from glob import glob
from random import sample
from pathlib import Path 

import librosa
import soundfile as sf

import torch
import serab_byols

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')


# Phase 1: Uploading Audio File

### Function for recording audio

In [None]:
from IPython.display import HTML, Audio
from google.colab.output import eval_js
from base64 import b64decode
import numpy as np
from scipy.io.wavfile import read as wav_read
import io
import ffmpeg

# HTML/JavaScript snippet that get_audio() injects into the cell output.
# What the script does, as written:
#   * adds a button to the page and immediately labels it
#     "Recording... press to stop";
#   * requests microphone access through navigator.mediaDevices.getUserMedia and,
#     on success, starts a MediaRecorder with the browser's default settings (the
#     `options` object is built but not passed to the constructor);
#   * when recorded data arrives, appends an <audio> preview to the page and reads
#     the blob as a data URL into the global `base64data` (initialised to 0);
#   * exposes a global promise `data` that resolves 2000 ms after the button is
#     clicked with `base64data.toString()`. The wait is a fixed delay and is not
#     tied to the FileReader finishing, so `data` can resolve to "0".
AUDIO_HTML = """
<script>
var my_div = document.createElement("DIV");
var my_p = document.createElement("P");
var my_btn = document.createElement("BUTTON");
var t = document.createTextNode("Press to start recording");

my_btn.appendChild(t);
//my_p.appendChild(my_btn);
my_div.appendChild(my_btn);
document.body.appendChild(my_div);

var base64data = 0;
var reader;
var recorder, gumStream;
var recordButton = my_btn;

var handleSuccess = function(stream) {
  gumStream = stream;
  var options = {
    //bitsPerSecond: 8000, //chrome seems to ignore, always 48k
    mimeType : 'audio/webm;codecs=opus'
    //mimeType : 'audio/webm;codecs=pcm'
  };            
  //recorder = new MediaRecorder(stream, options);
  recorder = new MediaRecorder(stream);
  recorder.ondataavailable = function(e) {            
    var url = URL.createObjectURL(e.data);
    var preview = document.createElement('audio');
    preview.controls = true;
    preview.src = url;
    document.body.appendChild(preview);

    reader = new FileReader();
    reader.readAsDataURL(e.data); 
    reader.onloadend = function() {
      base64data = reader.result;
      //console.log("Inside FileReader:" + base64data);
    }
  };
  recorder.start();
  };

recordButton.innerText = "Recording... press to stop";

navigator.mediaDevices.getUserMedia({audio: true}).then(handleSuccess);


function toggleRecording() {
  if (recorder && recorder.state == "recording") {
      recorder.stop();
      gumStream.getAudioTracks()[0].stop();
      recordButton.innerText = "Saving the recording... pls wait!"
  }
}

// https://stackoverflow.com/a/951057
function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

var data = new Promise(resolve=>{
//recordButton.addEventListener("click", toggleRecording);
recordButton.onclick = ()=>{
toggleRecording()

sleep(2000).then(() => {
  // wait 2000ms for the data to be available...
  // ideally this should use something like await...
  //console.log("Inside data:" + base64data)
  resolve(base64data.toString())

});

}
});
      
</script>
"""

def get_audio():
  '''
  Records audio from the browser microphone and decodes it to PCM samples

  Displays the AUDIO_HTML recorder, waits for its JavaScript `data` promise,
  converts the recording to WAV with ffmpeg and parses the result with scipy.

  Returns
  ------------
  audio: numpy array
      The decoded samples as returned by scipy.io.wavfile.read
  sr: int
      The sampling rate read from the WAV header

  Raises
  ------------
  RuntimeError
      If the browser returned no recording, or ffmpeg failed to convert it

  '''
  display(HTML(AUDIO_HTML))
  data = eval_js("data")

  # The recorder resolves with a data URL ("<prefix>,<base64 payload>"). When the
  # recording was not ready in time it resolves with "0", which has no payload.
  _, separator, payload = data.partition(',')
  if not separator or not payload:
    raise RuntimeError('No audio was captured; re-run the cell and record again.')
  binary = b64decode(payload)

  process = (ffmpeg
    .input('pipe:0')
    .output('pipe:1', format='wav')
    .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)
  )
  output, err = process.communicate(input=binary)
  if process.returncode != 0 or not output:
    details = err.decode(errors='replace') if err else 'no diagnostics available'
    raise RuntimeError('ffmpeg could not convert the recording to WAV: {}'.format(details))

  # Overwrite bytes 4:8 of the header with the actual RIFF chunk size (total length
  # minus the 8-byte prefix), stored little-endian and truncated to 32 bits.
  riff_chunk_size = (len(output) - 8) & 0xFFFFFFFF
  riff = output[:4] + riff_chunk_size.to_bytes(4, 'little') + output[8:]

  sr, audio = wav_read(io.BytesIO(riff))

  return audio, sr

### Uploading audio

In [None]:
# Record a clip in the browser; `sr` is the sampling rate of the recording.
audio, sr = get_audio()

# Bring the recording into the same form as the RAVDESS training clips, which are
# read with librosa.load(..., sr=16000): mono float waveforms at 16 kHz.
TARGET_SR = 16000
waveform = np.asarray(audio)
if np.issubdtype(waveform.dtype, np.signedinteger):
  # Signed integer PCM -> floats in [-1, 1)
  waveform = waveform.astype(np.float32) / float(np.iinfo(waveform.dtype).max + 1)
else:
  waveform = waveform.astype(np.float32)
if waveform.ndim > 1:
  # scipy returns multi-channel audio as (samples, channels); average to mono
  waveform = waveform.mean(axis=1)
if sr != TARGET_SR:
  waveform = librosa.resample(waveform, orig_sr=sr, target_sr=TARGET_SR)

audio = torch.from_numpy(np.ascontiguousarray(waveform, dtype=np.float32))
test_list = [audio]
print(test_list)

[tensor([  0.,   0.,   0.,  ..., -25., -21., -18.])]


# Phase 2: Loading RAVDESS audio files and extracting metadata

Includes downloading the dataset, loading audio files, resampling audio files, extracting metadata



### Defining a function for loading and resampling audio files

In [None]:
# Defining a function for loading and resampling audio files

def load_audio_files(audio_files, resampling_frequency=16000, audio_list=None):
  '''
  Reads every audio file at the requested sampling rate and collects the waveforms

  Parameters
  ------------
  audio_files: iterable of strings
      The paths of the wav files to read
  resampling_frequency: integer
      The sampling rate passed to librosa.load for every file
  audio_list: list
      An existing list to append the new waveforms to; a new list is created when None

  Returns
  ------------
  audio_list: list
      The list with one torch tensor appended per audio file, in input order

  '''
  # Reuse the caller's list when one is supplied, otherwise start a fresh one
  collected = [] if audio_list is None else audio_list

  # librosa.load returns (signal, sampling rate); only the signal is kept
  collected.extend(
      torch.from_numpy(librosa.load(path, sr=resampling_frequency)[0])
      for path in audio_files
  )
  return collected
        

### Metadata:
Speakers: (24 speakers) 
- Odd numbered actors are male 
- Even numbered actors are female

Labels: (8 labels)
- 01 = neutral
- 02 = calm
- 03 = happy
- 04 = sad
- 05 = angry
- 06 = fearful
- 07 = disgust
- 08 = surprised

### Loading and resampling audio files and collecting metadata on the RAVDESS dataset

In [None]:
# Phase_1
# Load dataset
! wget -O ravdess-emotional-speech-audio.zip -q https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip?download=1
! unzip -q ravdess-emotional-speech-audio.zip -d '/content/ravdess'

# Select all the audio files
audios = []
for file in Path('/content/ravdess').glob("**/*.wav"):
    if not file.is_file(): 
        continue
    audios.append(str(file))

# Load and resample audio files
audio_list = load_audio_files(audios, resampling_frequency=16000)

# Making speakers list and labels list 
speakers = []
old_labels = []
for audio_file in audios:
  file_name = audio_file.split('/')[4]
  speakers.append(file_name[18:20])
  old_labels.append(file_name[6:8])

label_dict = {'01':'NEUTRAL', '02':'CALM', '03':'HAPPY', '04':'SAD', '05':'ANGRY', '06':'FEARFUL', '07':'DISGUST', '08':'SURPRISE'}
labels = []
for old_label in old_labels:
  labels.append(label_dict[old_label])

# Verify phase_1
print('Number of audio files: {}'.format(len(audio_list)))
print('Number of speaker classes: {}'.format(len(set(speakers))))
print('Speaker classes: {}'.format(set(speakers)))
print('Number of speakers: {}'.format(len(speakers)))
print('Number of label classes: {}'.format(len(set(labels))))
print('Label classes: {}'.format(set(labels)))
print('Number of labels: {}'.format(len(labels)))


Number of audio files: 1440
Number of speaker classes: 24
Speaker classes: {'10', '17', '09', '18', '16', '02', '22', '05', '01', '12', '08', '06', '19', '21', '20', '23', '03', '04', '14', '11', '07', '24', '13', '15'}
Number of speakers: 1440
Number of label classes: 8
Label classes: {'HAPPY', 'FEARFUL', 'SURPRISE', 'NEUTRAL', 'DISGUST', 'CALM', 'SAD', 'ANGRY'}
Number of labels: 1440


# Phase 3: Defining functions for Speech Emotion Recognition



### Audio embeddings extraction functions

In [None]:
# Defining a function for generating audio embedding extraction models

def audio_embeddings_model(model_name):
  '''
  Generates model for embedding extraction

  Parameters
  ------------
  model_name: string
      The model to be used; only 'hybrid_byols' is supported

  Returns
  ------------
  model: object
      The embedding extraction model returned by serab_byols.load_model

  Raises
  ------------
  ValueError
      If model_name is not a supported model
  '''
  # Fail with a clear message instead of reaching the return with `model` unbound
  if model_name != 'hybrid_byols':
    raise ValueError("Unsupported model_name {!r}; expected 'hybrid_byols'".format(model_name))

  # 'hybrid_byols' is loaded under the name 'cvt' from this checkpoint, whose path
  # is relative to the working directory (inside the serab-byols folder)
  checkpoint_path = "serab-byols/checkpoints/cvt_s1-d1-e64_s2-d1-e256_s3-d1-e512_BYOLAs64x96-osandbyolaloss6373-e100-bs256-lr0003-rs42.pth"
  return serab_byols.load_model(checkpoint_path, 'cvt')


# Defining a function for embedding extraction from the audio list
def audio_embeddings(audio_list, model_name, model, sampling_rate=16000):
  '''
  Extracts one embedding per audio in audio_list

  Parameters
  ------------
  audio_list: list
      A list of torch tensors, one for each audio file
  model_name: string
      The model to be used; only 'hybrid_byols' is supported
  model: object
      The embedding extraction model generated by audio_embeddings_model function
  sampling_rate: int
      The sampling rate, 16 kHz by default (accepted for compatibility; the
      function does not currently use it)

  Returns
  ------------
  embeddings_array: array
      The array containing embeddings of all audio_files, dimension (number of audio files × n_feats)

  Raises
  ------------
  ValueError
      If model_name is not a supported model

  '''
  # Fail with a clear message instead of reaching the return with the result unbound
  if model_name != 'hybrid_byols':
    raise ValueError("Unsupported model_name {!r}; expected 'hybrid_byols'".format(model_name))
  return serab_byols.get_scene_embeddings(audio_list, model)


### Speaker normalisation functions

In [None]:
# Defining a function for speaker normalisation using standard scaler

def speaker_normalisation(embeddings_array, speakers):
  '''
  Normalises embeddings_array for each speaker

  The rows belonging to each speaker are passed through their own freshly fitted
  StandardScaler. The input is left unmodified; a normalised copy is returned.

  Parameters
  ------------
  embeddings_array: array
      The array of embeddings, one row for each audio file
  speakers: list
      The list of speakers, one entry per row of embeddings_array

  Returns
  ------------
  embeddings_array: array
      The array containing normalised embeddings of all audio_files, dimension (number of audio files × n_feats)

  Raises
  ------------
  ValueError
      If the number of speakers differs from the number of embedding rows

  '''
  # Convert once instead of on every loop iteration
  speaker_array = np.asarray(speakers)
  if len(speaker_array) != len(embeddings_array):
    raise ValueError('Got {} speakers for {} embeddings; expected one speaker per row'.format(
        len(speaker_array), len(embeddings_array)))

  # Work on a copy so the caller's embeddings are not overwritten
  if torch.is_tensor(embeddings_array):
    normalised = embeddings_array.clone()
  else:
    normalised = np.array(embeddings_array)

  for speaker_id in set(speakers):
    rows = np.where(speaker_array == speaker_id)[0]
    scaled = StandardScaler().fit_transform(embeddings_array[rows, :])
    normalised[rows] = torch.tensor(scaled).float()
  return normalised


### Hyperparameter tuning functions

In [None]:
# Defining a function for hyperparameter tuning and getting the accuracy on the test set

def get_hyperparams(X_train, X_test, y_train, classifier, parameters):
  '''
  Tunes the classifier with a grid search and predicts labels for the test data

  Parameters
  ------------
  X_train: torch tensor
    The normalised embeddings used for fitting the grid search
  X_test: torch tensor
    The embeddings to predict labels for
  y_train: list
    The training labels, one per row of X_train
  classifier: object
    The instance of the classification model
  parameters: dictionary
    The parameter grid handed to GridSearchCV

  Returns
  ------------
  prediction: array
    The labels predicted for X_test by the fitted grid search

  '''
  # 5-fold cross-validated search scored by macro-averaged recall
  search = GridSearchCV(classifier, param_grid=parameters, cv=5, scoring='recall_macro')
  search.fit(X_train, y_train)

  # Report the best cross-validation score and the parameters that produced it
  print('recall_macro :', search.best_score_)
  print(f'Best Parameters: {search.best_params_}')

  prediction = search.predict(X_test)
  print(f'PREDICTION: {prediction}')
  return prediction


### Pipeline function

In [None]:
# Defining a function for all steps 

def pipeline(audio_list, test_list, speakers, labels):
  '''
  Trains an emotion classifier on the training audios and predicts the emotion of the test audios

  Steps: embedding extraction with the 'hybrid_byols' model, per-speaker
  normalisation of the training embeddings, and an SVM grid search fitted on the
  training data and applied to the test embeddings.

  Parameters
  ------------
  audio_list: list
      The list of torch tensors of the training audios
  test_list: list
      The list of torch tensors of the audios to classify
  speakers: list
      The speaker of each training audio
  labels: list
      The emotion label of each training audio

  Returns
  ------------
  prediction: array
      The predicted label for each audio in test_list

  '''

  # Embeddings Extraction
  model_name = 'hybrid_byols'
  model = audio_embeddings_model(model_name = model_name)
  embeddings_array = audio_embeddings(audio_list, model_name = model_name, model=model)
  test_embeddings_array = audio_embeddings(test_list, model_name = model_name, model=model)
  print('embeddings_array shape: {}'.format(embeddings_array.shape))
  print('test_embeddings_array shape: {}'.format(test_embeddings_array.shape))

  # Speaker Normalisation
  normalised_embeddings = speaker_normalisation(embeddings_array, speakers)
  print('normalised_embeddings shape: {}'.format(normalised_embeddings.shape))

  # Sanity check: the column means should be close to zero after normalisation.
  # The absolute value is compared so that large negative means do not pass.
  columnwise_mean = torch.mean(normalised_embeddings, 0)
  if torch.all(columnwise_mean.abs() < 10**(-6)):
    print('PASSED: All means are less than 10**-6')
  else:
    print('FAILED: All means are NOT less than 10**-6')

  X_train = normalised_embeddings
  y_train = labels
  # NOTE(review): the test embeddings are used as extracted, while the training
  # embeddings are speaker-normalised — confirm this mismatch is intended.
  X_test = test_embeddings_array

  # Getting hyperparameters and checking max_recall
  print('Support Vector Machine:')
  classifier = SVC()
  parameters = {'C': np.logspace(-2,4,7), 'gamma': np.logspace(-5,-3,9), 'kernel':['linear', 'poly', 'rbf', 'sigmoid']}

  # Hand the prediction back to the caller instead of discarding it
  return get_hyperparams(X_train, X_test, y_train, classifier, parameters)
  

# Results

Getting the emotion of the audio based on a model trained using Hybrid BYOL-S on RAVDESS

In [None]:
# Train on the loaded RAVDESS clips (audio_list, speakers, labels) and print the
# predicted emotion for the recorded clip held in test_list
pipeline(audio_list, test_list, speakers, labels)

Generating Embeddings...: 100%|██████████| 1440/1440 [01:33<00:00, 15.39it/s]
Generating Embeddings...: 100%|██████████| 1/1 [00:00<00:00,  6.12it/s]


embeddings_array shape: torch.Size([1440, 2048])
test_embeddings_array shape: torch.Size([1, 2048])
normalised_embeddings shape: torch.Size([1440, 2048])
PASSED: All means are less than 10**-6
Support Vector Machine:
recall_macro : 0.7818572874493926
Best Parameters: {'C': 10.0, 'gamma': 0.00031622776601683794, 'kernel': 'rbf'}
PREDICTION: ['FEARFUL']
