In [17]:
import pickle
import argparse
import torch
from src.model import EmotionNet
from src.dataloader import get_MELD_loaders

In [18]:
def map_speakers_to_int(videoSpeakers):
    unique_speakers = {}
    speaker_id = 1  # Start numbering speakers from 1

    # Assign a unique integer to each unique speaker vector
    for speaker_vector in videoSpeakers:

        # Convert the vector to a tuple to use it as a key in the dictionary
        speaker_tuple = tuple(speaker_vector)
        if speaker_tuple not in unique_speakers:
            unique_speakers[speaker_tuple] = speaker_id
            speaker_id += 1

    return unique_speakers, len(unique_speakers)

In [19]:
labels_map = {0: 'neutral', 1: 'surprise',
              2: 'fear', 3: 'sadness',
              4: 'joy', 5: 'disgust', 6: 'anger'}

In [20]:
def parse_opt():
    parser = argparse.ArgumentParser()

    parser.add_argument('--no-cuda', action='store_true',
                        default=True, help='does not use CUDA')
    parser.add_argument('--dir', type=str, default='./MELD_features/',
                        help='dataset directory (for .pkl file)')
    parser.add_argument('--n-classes', type=int, default=7,
                        help='number of classes')
    parser.add_argument('--val-split', type=float,
                        default=0.1, help='validation split')
    parser.add_argument('--num-workers', type=int,
                        default=0, help='number of workers')

    parser.add_argument('--loss-fn', type=str, default='masked_nll',
                        help='loss function (masked_nll or unmaksed_weighted_nll or masked_mse)')
    parser.add_argument('--optimizer', type=str, default='sgd',
                        help='optimizer (adam or sgd or rmsprop)')

    parser.add_argument('--lr', type=float, default=1e-4,
                        metavar='LR', help='learning rate')
    parser.add_argument('--l2', type=float, default=3e-4,
                        metavar='L2', help='L2 regularization weight')
    parser.add_argument('--dropout', type=float, default=0.25,
                        metavar='dropout', help='dropout rate')
    parser.add_argument('--batch-size', type=int, default=20,
                        metavar='BS', help='batch size')
    parser.add_argument('--epochs', type=int, default=50,
                        metavar='E', help='number of epochs')

    parser.add_argument('--class-weight', action='store_true',
                        default=True, help='use class weights (true or false)')
    parser.add_argument('--mu', type=float, default=0,
                        help='class weight (mu)')

    parser.add_argument('--seed', type=int, default=42,
                        metavar='seed', help='seed')

    parser.add_argument('--feature_type', type=str, default='multimodal',
                        help='features (text or audio or multimodal)')
    parser.add_argument('--attention', type=str, default='general',
                        help='attention type (simple or general or general2 or concat or dot)')

    parser.add_argument('--verbose', action='store_true',
                        default=True, help='verbose (true or false)')

    args = parser.parse_args("")

    return args


args = parse_opt()
args.cuda = not args.no_cuda and torch.cuda.is_available()

In [21]:
D_m = 900
D_g = D_q = D_r = 150
D_h = D_e = 100

model = EmotionNet(D_m, D_q, D_g, D_r, D_e, D_h, n_classes=args.n_classes, dropout=args.dropout, attention=args.attention)

# load the model ckpt
model.load_state_dict(torch.load('model_v2.pt'))

<All keys matched successfully>

In [22]:
raw_features_pkl_filepath = './MELD_features/MELD_features_raw.pkl'

videoIDs, videoSpeakers, videoLabels, videoText, \
    videoAudio, videoSentence, trainVid, testVid, vids = pickle.load(
        open(raw_features_pkl_filepath, 'rb'))

In [35]:
def seed_everything(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    
seed_everything(12345)

In [36]:
train_loader , valid_loader , test_loader = get_MELD_loaders(path=raw_features_pkl_filepath, n_classes=7, batch_size=1)

In [37]:
print(f'Evaluating on train set')

data = next(iter(train_loader))

textf, acouf, qmask, umask, label = [
    d.to('cuda') for d in data[:-1]] if args.cuda else data[:-1]

log_prob, alpha_f, alpha_b = model(torch.cat((textf,acouf),dim=-1), qmask,umask) # seq_len, batch, n_classes
lp_ = log_prob.transpose(0,1).contiguous().view(-1,log_prob.size()[2]) # batch*seq_len, n_classes
pred_ = torch.argmax(lp_,1) # batch*seq_len
pred = pred_.data.cpu().numpy()
pred

random_idx = data[-1][0]
print(f'\nVideo conversation #{random_idx}')

speakers, unique_speakers = map_speakers_to_int(videoSpeakers[random_idx])

print(f'Speakers in the video : {unique_speakers}\n')
count = total = 0
for i, j, k, p in zip(videoSentence[random_idx], videoSpeakers[random_idx], videoLabels[random_idx], pred):
    if p == k:
        count += 1
        
    print(
        f'[pred: {labels_map[p]}]',
        f'Speaker {speakers[tuple(j)]}',
        f'\t:\t {i}')
    print(f'[actual: {labels_map[k]}]')
    print()
    total += 1

print(f'\nAccuracy : {count/total*100:.3f}%')

Evaluating on train set

Video conversation #221
Speakers in the video : 2

[pred: surprise] Speaker 1 	:	 Your parents?
[actual: neutral]

[pred: neutral] Speaker 2 	:	 Yeah, theyre out of town.
[actual: neutral]

[pred: joy] Speaker 1 	:	 Ohh.
[actual: surprise]

[pred: neutral] Speaker 2 	:	 Yeah-yeah, its this
[actual: neutral]

[pred: neutral] Speaker 1 	:	 Yeah that works.
[actual: neutral]

[pred: fear] Speaker 2 	:	 They-they-they can smell fear.
[actual: fear]


Accuracy : 66.667%


In [38]:
print(f'Evaluating on test set')

data = next(iter(test_loader))
textf, acouf, qmask, umask, label = [
    d.to('cuda') for d in data[:-1]] if args.cuda else data[:-1]

log_prob, alpha_f, alpha_b = model(
    torch.cat((textf, acouf), dim=-1), qmask, umask)  # seq_len, batch, n_classes
lp_ = log_prob.transpose(0, 1).contiguous().view(-1, log_prob.size()[2])  # batch*seq_len, n_classes
pred_ = torch.argmax(lp_, 1)  # batch*seq_len
pred = pred_.data.cpu().numpy()
print(pred)

random_idx = data[-1][0]
print(f'\nVideo conversation #{random_idx}')

speakers, unique_speakers = map_speakers_to_int(videoSpeakers[random_idx])

print(f'Speakers in the video : {unique_speakers}\n')
count = total = 0
for i, j, k, p in zip(videoSentence[random_idx], videoSpeakers[random_idx], videoLabels[random_idx], pred):
    if p == k:
        count += 1
    print(
        f'[pred: {labels_map[p]}]',
        f'Speaker {speakers[tuple(j)]}',
        f'\t:\t {i}')
    print(f'[actual: {labels_map[k]}]')
    print()
    total += 1

print(f'\nAccuracy : {count/total*100:.3f}%')

Evaluating on test set
[0 0 0 6 6 6 6 6 6 0 6]

Video conversation #1155
Speakers in the video : 2

[pred: neutral] Speaker 1 	:	 Okay.
[actual: neutral]

[pred: neutral] Speaker 2 	:	 Ross, didn't you say that there was an elevator in here?
[actual: neutral]

[pred: neutral] Speaker 1 	:	 Uhh, yes I did but there isn't. Okay, here we go.
[actual: sadness]

[pred: anger] Speaker 1 	:	 Okay, go left. Left! Left!
[actual: surprise]

[pred: anger] Speaker 2 	:	 Okay, y'know what? There is no more left, left!
[actual: anger]

[pred: anger] Speaker 1 	:	 Oh okay, lift it straight up over your head!
[actual: anger]

[pred: anger] Speaker 1 	:	 Straight up over your head!
[actual: anger]

[pred: anger] Speaker 1 	:	 You can do it!
[actual: joy]

[pred: anger] Speaker 1 	:	 You can do it!
[actual: joy]

[pred: neutral] Speaker 1 	:	 Okay.
[actual: neutral]

[pred: anger] Speaker 1 	:	 You got it?
[actual: neutral]


Accuracy : 54.545%
