In [1]:
import pandas as pd
import random
import numpy as np

from src.inference.utils import *
from src.trainer import load_model_from_ckpt, run_inference

In [2]:
# Load the pre-extracted MELD feature bundle. The pickle holds a 9-tuple that
# is unpacked positionally, so the order of the names below must match the
# order in which the file was written.
# NOTE: unpickling executes arbitrary code -- only load files from a trusted source.
pkl_path = 'audio/MELD_features/MELD_features_raw.pkl'
(
    videoIDs,
    videoSpeakers,
    videoLabels,
    videoText,
    videoAudio,
    videoSentence,
    trainVid,
    testVid,
    labels,
) = pd.read_pickle(pkl_path)

In [3]:
def load_conversation(idx, pred_emotions=None):
    """Print every utterance of one dialogue with its speaker and emotion label.

    Reads the notebook-level ``videoSentence``, ``videoSpeakers`` and
    ``videoLabels`` containers, each indexed by ``idx``.

    Args:
        idx: Key/index of the dialogue to print.
        pred_emotions: Optional indexable sequence of predicted label ids, one
            per utterance, in utterance order. When given, each line shows
            ``actual - predicted``; otherwise only the actual label is shown.

    Returns:
        None. Output is written to stdout.

    Raises:
        KeyError: If a label id is not one of the seven known ids (0-6).
    """
    num_to_label_map = {0: 'neutral', 1: 'surprise', 2: 'fear', 3: 'sadness', 4: 'joy', 5: 'disgust', 6: 'anger'}

    # Collapse each per-utterance speaker encoding into a hashable string key
    # (presumably a one-hot vector per utterance -- TODO confirm against the pickle).
    speaker_keys = [''.join(str(e) for e in speaker) for speaker in videoSpeakers[idx]]

    # Number speakers in order of first appearance. dict.fromkeys de-duplicates
    # while keeping insertion order, so "Person N" is stable across kernel
    # restarts (iterating a set of strings is not, because of hash randomisation).
    speaker_dict = {key: number for number, key in enumerate(dict.fromkeys(speaker_keys), start=1)}
    num_of_speakers = len(speaker_dict)

    print(f"Loading dialogue #{idx} with {num_of_speakers} speakers, {len(videoSentence[idx])} utterances.\n")

    utterances = zip(videoSentence[idx], speaker_keys, videoLabels[idx])
    for i, (sentence, speaker, label) in enumerate(utterances, start=1):
        if pred_emotions is not None:
            # int() lets tensor / NumPy scalar predictions index the label map.
            predicted = int(pred_emotions[i - 1])
            print(f'[#{i}] Person {speaker_dict[speaker]} ({num_to_label_map[label]} - {num_to_label_map[predicted]})\t:\t{sentence}')
        else:
            print(f'[#{i}] Person {speaker_dict[speaker]} ({num_to_label_map[label]}) :\t{sentence}')

In [4]:
# label_map = {'neutral': 0, 'surprise': 1, 'fear': 2, 'sadness': 3, 'joy': 4, 'disgust': 5, 'anger':6}
# randrange excludes the upper bound; random.randint(0, n) can return n itself,
# which is one past the last valid position.
# NOTE(review): assumes dialogues are addressable by 0..len-1 -- confirm against the pickle.
random_idx = random.randrange(len(videoSentence))
# Dialogue 1 is shown (rather than random_idx) so the printed output is reproducible.
load_conversation(1)

Loading dialogue #1 with 2 speakers, 7 utterances.

[#1] Person 1 (surprise) :	But then who? The waitress I went out with last month?
[#2] Person 2 (sadness) :	You know? Forget it!
[#3] Person 1 (surprise) :	No-no-no-no, no! Who, who were you talking about?
[#4] Person 2 (fear) :	No, I-I-I-I don't, I actually don't know
[#5] Person 1 (neutral) :	Ok!
[#6] Person 1 (neutral) :	All right, well...
[#7] Person 2 (neutral) :	Yeah, sure!


In [5]:
# Extract model inputs for the dialogue stored under `dialogue_dir`.
# NOTE(review): the dialogue #1 printout in this notebook shows the turn order
# 1,2,1,2,1,1,2, but the list below has speaker 2 for utterance 3. If `X` is
# meant to be the per-utterance speaker id for that same dialogue, confirm
# which one is correct.
dialogue_dir = 'example'
speakers_for_utterance = [1, 2, 2, 2, 1, 1, 2]
acouf, qmask, umask = get_features_for_dialogue(
    dialogue_dir=dialogue_dir,
    X=speakers_for_utterance,
)

In [6]:
# Restore the trained network from a saved checkpoint file.
# NOTE(review): path is relative to the notebook's working directory.
model = load_model_from_ckpt('audio/MELD_features/models/EmoNet_38.pt')

In [7]:
# Bare expression: the cell output is the model's repr (its layer structure).
model

EmotionNet(
  (dropout): Dropout(p=0.5, inplace=False)
  (emo_rnn_b): EmotionRNN(
    (dropout): Dropout(p=0.5, inplace=False)
    (cell): EmotionGRUCell(
      (g_cell): GRUCell(600, 150)
      (p_cell): GRUCell(450, 150)
      (pl_cell): GRUCell(450, 150)
      (r_cell): GRUCell(450, 150)
      (rl_cell): GRUCell(450, 150)
      (e_cell): GRUCell(600, 150)
      (dropout): Dropout(p=0.5, inplace=False)
      (attention): SimpleAttention(
        (scalar): Linear(in_features=150, out_features=1, bias=False)
      )
    )
  )
  (emo_rnn_f): EmotionRNN(
    (dropout): Dropout(p=0.5, inplace=False)
    (cell): EmotionGRUCell(
      (g_cell): GRUCell(600, 150)
      (p_cell): GRUCell(450, 150)
      (pl_cell): GRUCell(450, 150)
      (r_cell): GRUCell(450, 150)
      (rl_cell): GRUCell(450, 150)
      (e_cell): GRUCell(600, 150)
      (dropout): Dropout(p=0.5, inplace=False)
      (attention): SimpleAttention(
        (scalar): Linear(in_features=150, out_features=1, bias=False)
      )
 

In [8]:
# Run the model on the single dialogue prepared above.
# unsqueeze inserts a size-1 axis: position 1 for acouf/qmask, position 0 for umask
# (presumably the batch axis expected by run_inference -- TODO confirm).
preds = run_inference(model, acouf.unsqueeze(1), qmask.unsqueeze(1), umask.unsqueeze(0))

In [9]:
load_conversation(1, preds)
# Each printed line shows "(actual_emotion - predicted_emotion)": the dataset
# label first, then the label id taken from `preds` for that utterance.

Loading dialogue #1 with 2 speakers, 7 utterances.

[#1] Person 1 (surprise - neutral)	:	But then who? The waitress I went out with last month?
[#2] Person 2 (sadness - neutral)	:	You know? Forget it!
[#3] Person 1 (surprise - anger)	:	No-no-no-no, no! Who, who were you talking about?
[#4] Person 2 (fear - sadness)	:	No, I-I-I-I don't, I actually don't know
[#5] Person 1 (neutral - neutral)	:	Ok!
[#6] Person 1 (neutral - joy)	:	All right, well...
[#7] Person 2 (neutral - joy)	:	Yeah, sure!
