## Reading the .pkl raw data and understanding how it works

In [1]:
import pandas as pd
import random

In [2]:
pkl_path = 'audio/MELD_features/MELD_features_raw.pkl'
videoIDs, videoSpeakers, videoLabels, videoText, videoAudio, videoSentence, trainVid, testVid, _ = pd.read_pickle(pkl_path)

In [3]:
videoAudio[0][0].shape

(300,)

In [4]:
# label_map = {'neutral': 0, 'surprise': 1, 'fear': 2, 'sadness': 3, 'joy': 4, 'disgust': 5, 'anger':6}
num_to_label_map = {0: 'neutral', 1: 'surprise', 2: 'fear', 3: 'sadness', 4: 'joy', 5: 'disgust', 6: 'anger'}

random_idx = random.randint(0, len(videoSentence))

list_of_speakers = set(''.join(str(e) for e in speaker) for speaker in videoSpeakers[random_idx])
num_of_speakers = len(list_of_speakers)
nums = [i+1 for i in range(num_of_speakers)]
speaker_dict = dict(zip(list_of_speakers, nums))

print(f"Loading dialogue #{random_idx} with {num_of_speakers} speakers, {len(videoSentence[random_idx])} utterances.\n")

i=1
for sentence, speaker, label in zip(videoSentence[random_idx], videoSpeakers[random_idx], videoLabels[random_idx]):
    speaker = ''.join(str(e) for e in speaker)

    print(f'[#{i}] Person {speaker_dict[speaker]}: {sentence} ({num_to_label_map[label]})')
    i+=1

Loading dialogue #1007 with 4 speakers, 16 utterances.

[#1] Person 1: Tell us what happened, Brown Bird Ross. (neutral)
[#2] Person 3: Well, I lost. (sadness)
[#3] Person 3: Some little girl loaned her uniform to her nineteen year old sister, who went down to the U.S.S. (sadness)
[#4] Person 3: Nimitz, and sold over 2,000 boxes. (anger)
[#5] Person 1: Hey! Howd the interview go? (joy)
[#6] Person 4: Oh, I blew it. I wouldnt of even hired me. (sadness)
[#7] Person 3: Oh, come here sweetie, listen, youre gonna go on like a thousand interviews before you get a job. (sadness)
[#8] Person 3: Thats not how that was supposed to come out. (sadness)
[#9] Person 2: This is the worst Christmas ever. (sadness)
[#10] Person 1: Y'know what Rach, maybe you should just, y'know stay here at the coffee house. (neutral)
[#11] Person 4: I cant! (anger)
[#12] Person 4: Its too late! (anger)
[#13] Person 4: Terry already hired that girl over there. (sadness)
[#14] Person 4: Look at her, shes even go

## Training

In [81]:
%%time
!python main.py

Namespace(no_cuda=True, dir='MELD_features/', lr=0.0001, l2=0.0003, dropout=0.5, batch_size=8, epochs=2, class_weight=False, seed=100, mu=0, verbose=True)
Running on CPU
EmoNet model.
EmoNet(
  (dropout): Dropout(p=0.5, inplace=False)
  (emo_rnn_b): EmotionRNN(
    (dropout): Dropout(p=0.5, inplace=False)
    (cell): EmotionGRUCell(
      (g_cell): GRUCell(600, 150)
      (p_cell): GRUCell(450, 150)
      (pl_cell): GRUCell(450, 150)
      (r_cell): GRUCell(450, 150)
      (rl_cell): GRUCell(450, 150)
      (e_cell): GRUCell(600, 150)
      (dropout): Dropout(p=0.5, inplace=False)
      (attention): SimpleAttention(
        (scalar): Linear(in_features=150, out_features=1, bias=False)
      )
    )
  )
  (emo_rnn_f): EmotionRNN(
    (dropout): Dropout(p=0.5, inplace=False)
    (cell): EmotionGRUCell(
      (g_cell): GRUCell(600, 150)
      (p_cell): GRUCell(450, 150)
      (pl_cell): GRUCell(450, 150)
      (r_cell): GRUCell(450, 150)
      (rl_cell): GRUCell(450, 150)
      (e_cell): 

## Inference

In [71]:
from audio.encoder import *
from audio.dataloader import *
from audio.model import *
from audio.trainer import *

In [72]:
# load model
global D_s
D_m = 300
D_g = D_q = D_r = D_e = 150
D_h = 100
n_classes = 7

model = EmoNet(D_m, D_q, D_g, D_r, D_e, D_h, n_classes=n_classes)

saved_model_ckpt = "audio/MELD_features/models/EmoNet_31.pt"

model.load_state_dict(torch.load(saved_model_ckpt))

<All keys matched successfully>

In [73]:
train_loader, _, _ = get_MELD_loaders(pkl_path,
                                    7,
                                    valid=0.2,
                                    batch_size=32,
                                    num_workers=0)

In [9]:
for data in train_loader:
    textf, acouf, qmask, umask, label = data[:-1]
    
labels_ = label.view(-1).data.cpu().numpy()

log_prob, alpha_f, alpha_b = model(acouf, qmask, umask)

lp_ = log_prob.transpose(0,1).contiguous().view(-1,log_prob.size()[2]) # batch*seq_len, n_classes
pred_ = torch.argmax(lp_,1) # batch*seq_len
preds = pred_.data.cpu().numpy()

for i in preds[:5]:
    print(f"Predicted: {num_to_label_map[i]}\nActual: {num_to_label_map[labels_[i].item()]}\n")

Predicted: neutral
Actual: fear

Predicted: neutral
Actual: fear

Predicted: joy
Actual: neutral

Predicted: neutral
Actual: fear

Predicted: joy
Actual: neutral



## Using OpenSMILE

In [23]:
import opensmile
import numpy as np
import torch.nn as nn

In [24]:
def opensmile_feature(audio_file_path, feature_type):
    if feature_type == "emobase":
        smile = opensmile.Smile(
            feature_set=opensmile.FeatureSet.emobase,
            feature_level=opensmile.FeatureLevel.Functionals,
        )
    elif feature_type == "ComParE":
        smile = opensmile.Smile(
            feature_set=opensmile.FeatureSet.ComParE_2016,
            feature_level=opensmile.FeatureLevel.Functionals,
        )
    else:
        smile = opensmile.Smile(
            feature_set=opensmile.FeatureSet.eGeMAPSv02,
            feature_level=opensmile.FeatureLevel.Functionals,
        )
    return np.array(smile.process_file(audio_file_path))

As suggested in https://aclanthology.org/N18-1193.pdf, we use openSMILE with the ComParE configuration to get 6373 features for each utterance video. Z- standardization is performed for voice normaliza- tion and dimension of the audio vector is reduced to 100 using a fully-connected neural layer. This provides the final audio feature vector.

In [51]:
def get_audio_vector(path):
    
    # Extract ComParE features to (1, 6373)
    a = opensmile_feature(path, 'ComParE')

    # Z-score standardization
    mean = np.mean(a)
    std_dev = np.std(a)
    standardized_a= (a - mean) / std_dev

    # Dimensionality reduction to (300,)
    fc_layer = nn.Linear(6373, 300)
    a_ = torch.tensor(standardized_a).float()
    output_tensor = fc_layer(a_).detach().numpy().reshape(-1,)
    
    return output_tensor

In [52]:
output_emb = get_audio_vector('audio/tmp/examples/utt0.wav')

In [53]:
from numpy.testing import assert_allclose
assert_allclose(actual=output_tensor, desired=videoAudio[1][0], atol=2, err_msg='The output tensor is different.', verbose=True)

This shows that our audio features are nearly same to the ones provided as part of open-source code. The difference in exact values arises due to the use of a fully connected layer to reduce the dimension of the audio vector to 300, since each initialisation of weights and biases is done randomly, thus unable to ensure reproducibility.

In [68]:
eg = make_embs_for_dialogue('audio/tmp/examples')

MoviePy - Writing audio in audio/tmp/examples/dia1_utt0.wav


                                                       

MoviePy - Done.




MoviePy - Writing audio in audio/tmp/examples/dia1_utt1.wav


                                                       

MoviePy - Done.




MoviePy - Writing audio in audio/tmp/examples/dia1_utt2.wav


                                                        

MoviePy - Done.




MoviePy - Writing audio in audio/tmp/examples/dia1_utt3.wav


                                                       

MoviePy - Done.




MoviePy - Writing audio in audio/tmp/examples/dia1_utt4.wav


                                                       

MoviePy - Done.




MoviePy - Writing audio in audio/tmp/examples/dia1_utt5.wav


                                                       

MoviePy - Done.




MoviePy - Writing audio in audio/tmp/examples/dia1_utt8.wav


                                                       

MoviePy - Done.




In [70]:
eg.shape

(7, 300)

In [64]:
videoAudio[1].shape

(7, 300)