In [1]:
import pickle
import argparse
import torch
from src.model import EmotionNet
from src.dataloader import get_MELD_loaders

In [2]:
def map_speakers_to_int(videoSpeakers):
    unique_speakers = {}
    speaker_id = 1  # Start numbering speakers from 1

    # Assign a unique integer to each unique speaker vector
    for speaker_vector in videoSpeakers:

        # Convert the vector to a tuple to use it as a key in the dictionary
        speaker_tuple = tuple(speaker_vector)
        if speaker_tuple not in unique_speakers:
            unique_speakers[speaker_tuple] = speaker_id
            speaker_id += 1

    return unique_speakers, len(unique_speakers)

In [3]:
labels_map = {0: 'neutral', 1: 'surprise',
              2: 'fear', 3: 'sadness',
              4: 'joy', 5: 'disgust', 6: 'anger'}

In [4]:
def parse_opt():
    parser = argparse.ArgumentParser()

    parser.add_argument('--no-cuda', action='store_true',
                        default=True, help='does not use CUDA')
    parser.add_argument('--dir', type=str, default='./MELD_features/',
                        help='dataset directory (for .pkl file)')
    parser.add_argument('--n-classes', type=int, default=7,
                        help='number of classes')
    parser.add_argument('--val-split', type=float,
                        default=0.1, help='validation split')
    parser.add_argument('--num-workers', type=int,
                        default=0, help='number of workers')

    parser.add_argument('--loss-fn', type=str, default='masked_nll',
                        help='loss function (masked_nll or unmaksed_weighted_nll or masked_mse)')
    parser.add_argument('--optimizer', type=str, default='sgd',
                        help='optimizer (adam or sgd or rmsprop)')

    parser.add_argument('--lr', type=float, default=1e-4,
                        metavar='LR', help='learning rate')
    parser.add_argument('--l2', type=float, default=3e-4,
                        metavar='L2', help='L2 regularization weight')
    parser.add_argument('--dropout', type=float, default=0.25,
                        metavar='dropout', help='dropout rate')
    parser.add_argument('--batch-size', type=int, default=20,
                        metavar='BS', help='batch size')
    parser.add_argument('--epochs', type=int, default=50,
                        metavar='E', help='number of epochs')

    parser.add_argument('--class-weight', action='store_true',
                        default=True, help='use class weights (true or false)')
    parser.add_argument('--mu', type=float, default=0,
                        help='class weight (mu)')

    parser.add_argument('--seed', type=int, default=42,
                        metavar='seed', help='seed')

    parser.add_argument('--feature_type', type=str, default='multimodal',
                        help='features (text or audio or multimodal)')
    parser.add_argument('--attention', type=str, default='general',
                        help='attention type (simple or general or general2 or concat or dot)')

    parser.add_argument('--verbose', action='store_true',
                        default=True, help='verbose (true or false)')

    args = parser.parse_args("")

    return args


args = parse_opt()
args.cuda = not args.no_cuda and torch.cuda.is_available()

In [5]:
D_m = 900
D_g = D_q = D_r = 150
D_h = D_e = 100

model = EmotionNet(D_m, D_q, D_g, D_r, D_e, D_h, n_classes=args.n_classes, dropout=args.dropout, attention=args.attention)

# load the model ckpt
model.load_state_dict(torch.load('model_v2.pt'))

<All keys matched successfully>

In [6]:
raw_features_pkl_filepath = './MELD_features/MELD_features_raw.pkl'

videoIDs, videoSpeakers, videoLabels, videoText, \
    videoAudio, videoSentence, trainVid, testVid, vids = pickle.load(
        open(raw_features_pkl_filepath, 'rb'))

In [12]:
def seed_everything(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    
seed_everything(420)

In [13]:
train_loader , valid_loader , test_loader = get_MELD_loaders(path=raw_features_pkl_filepath, n_classes=7, batch_size=1)

In [14]:
print(f'Evaluating on train set')

data = next(iter(train_loader))

textf, acouf, qmask, umask, label = [
    d.to('cuda') for d in data[:-1]] if args.cuda else data[:-1]

log_prob, alpha_f, alpha_b = model(torch.cat((textf,acouf),dim=-1), qmask,umask) # seq_len, batch, n_classes
lp_ = log_prob.transpose(0,1).contiguous().view(-1,log_prob.size()[2]) # batch*seq_len, n_classes
pred_ = torch.argmax(lp_,1) # batch*seq_len
pred = pred_.data.cpu().numpy()
pred

random_idx = data[-1][0]
print(f'\nVideo conversation #{random_idx}')

speakers, unique_speakers = map_speakers_to_int(videoSpeakers[random_idx])

print(f'Speakers in the video : {unique_speakers}\n')
count = total = 0
for i, j, k, p in zip(videoSentence[random_idx], videoSpeakers[random_idx], videoLabels[random_idx], pred):
    if p == k:
        count += 1
        
    print(
        f'[pred: {labels_map[p]}]',
        f'Speaker {speakers[tuple(j)]}',
        f'\t:\t {i}')
    print(f'[actual: {labels_map[k]}]')
    print()
    total += 1

print(f'\nAccuracy : {count/total*100:.3f}%')

Evaluating on train set

Video conversation #959
Speakers in the video : 2

[pred: neutral] Speaker 1 	:	 I didnt even realise how late it was, until I noticed the 5 oclock shadow on her head.
[actual: neutral]

[pred: neutral] Speaker 1 	:	 Anyway, she didnt want to stay.
[actual: sadness]

[pred: neutral] Speaker 1 	:	 I called a cab. she just left.
[actual: neutral]

[pred: neutral] Speaker 2 	:	 I wrote you a letter.
[actual: neutral]

[pred: joy] Speaker 1 	:	 Ohh! Thank you! I like mail.
[actual: joy]

[pred: neutral] Speaker 2 	:	 Its just some things Ive been thinking about.
[actual: neutral]

[pred: sadness] Speaker 2 	:	 Some things about us, and before we can even think about the two of us getting back together, I just need to know how you feel about this stuff.
[actual: neutral]

[pred: disgust] Speaker 1 	:	 Okay.  Wow, its-its 5:30 in the morning.  So, Id better get cracking on this baby.
[actual: neutral]

[pred: sadness] Speaker 2 	:	 Well, Ill be waiting for y

In [15]:
print(f'Evaluating on validation set')

data = next(iter(valid_loader))

textf, acouf, qmask, umask, label = [
    d.to('cuda') for d in data[:-1]] if args.cuda else data[:-1]

log_prob, alpha_f, alpha_b = model(
    torch.cat((textf, acouf), dim=-1), qmask, umask)  # seq_len, batch, n_classes
lp_ = log_prob.transpose(0, 1).contiguous(
).view(-1, log_prob.size()[2])  # batch*seq_len, n_classes
pred_ = torch.argmax(lp_, 1)  # batch*seq_len
pred = pred_.data.cpu().numpy()
pred

random_idx = data[-1][0]
print(f'\nVideo conversation #{random_idx}')

speakers, unique_speakers = map_speakers_to_int(videoSpeakers[random_idx])

print(f'Speakers in the video : {unique_speakers}\n')
count = total = 0
for i, j, k, p in zip(videoSentence[random_idx], videoSpeakers[random_idx], videoLabels[random_idx], pred):
    if p == k:
        count += 1
    print(
        f'[pred: {labels_map[p]}]',
        f'Speaker {speakers[tuple(j)]}',
        f'\t:\t {i}')
    print(f'[actual: {labels_map[k]}]')
    print()

    total += 1

print(f'\nAccuracy : {count/total*100:.3f}%')

Evaluating on validation set

Video conversation #94
Speakers in the video : 2

[pred: neutral] Speaker 1 	:	 My drinking?
[actual: neutral]

[pred: fear] Speaker 2 	:	 Oh, I mustve said that after you left.
[actual: neutral]

[pred: surprise] Speaker 1 	:	 Said what? Exactly.
[actual: fear]

[pred: sadness] Speaker 2 	:	 That you enjoyed the occasional drinking binge.
[actual: neutral]

[pred: anger] Speaker 1 	:	 Oh my God!! Ohh, that is it! Im leaving! You are just a horrible person!
[actual: disgust]

[pred: anger] Speaker 2 	:	 Wait-wait-wait-wait-wait-wait-wait-wait!!
[actual: fear]

[pred: sadness] Speaker 2 	:	 If youre gonna get all sensitive about it!
[actual: anger]

[pred: sadness] Speaker 2 	:	 I dont want to lose you.
[actual: sadness]

[pred: sadness] Speaker 2 	:	 What if I, create a position for you?
[actual: neutral]

[pred: sadness] Speaker 2 	:	 Ill make you an assistant buyer in this department.
[actual: neutral]

[pred: sadness] Speaker 1 	:	 Say more things

In [16]:
print(f'Evaluating on test set')

data = next(iter(test_loader))
textf, acouf, qmask, umask, label = [
    d.to('cuda') for d in data[:-1]] if args.cuda else data[:-1]

log_prob, alpha_f, alpha_b = model(
    torch.cat((textf, acouf), dim=-1), qmask, umask)  # seq_len, batch, n_classes
lp_ = log_prob.transpose(0, 1).contiguous().view(-1, log_prob.size()[2])  # batch*seq_len, n_classes
pred_ = torch.argmax(lp_, 1)  # batch*seq_len
pred = pred_.data.cpu().numpy()
print(pred)

random_idx = data[-1][0]
print(f'\nVideo conversation #{random_idx}')

speakers, unique_speakers = map_speakers_to_int(videoSpeakers[random_idx])

print(f'Speakers in the video : {unique_speakers}\n')
count = total = 0
for i, j, k, p in zip(videoSentence[random_idx], videoSpeakers[random_idx], videoLabels[random_idx], pred):
    if p == k:
        count += 1
    print(
        f'[pred: {labels_map[p]}]',
        f'Speaker {speakers[tuple(j)]}',
        f'\t:\t {i}')
    print(f'[actual: {labels_map[k]}]')
    print()
    total += 1

print(f'\nAccuracy : {count/total*100:.3f}%')

Evaluating on test set
[5 3]

Video conversation #1360
Speakers in the video : 2

[pred: disgust] Speaker 1 	:	 Ah, oh God. Oh, honey, oh that's OK.
[actual: sadness]

[pred: sadness] Speaker 2 	:	 What. Oh no, you just rolled over the juice box.
[actual: sadness]


Accuracy : 50.000%
