### A notebook that queries the trained embedding with the test set to qualitatively evaluate text-video retrieval

In [23]:
import torch as th
from torch.utils.data import DataLoader
import numpy as np
import torch.optim as optim
from args import get_args
import random
import os
from model import Net
from metrics import compute_metrics, print_computed_metrics
from loss import MaxMarginRankingLoss
from gensim.models.keyedvectors import KeyedVectors
import pickle
from m2e2_dataloader import M2E2DataLoader
import pandas as pd 
import json

In [3]:
# Location of the pretrained word2vec binary used to embed the query text.
word2vec_path = 'data/GoogleNews-vectors-negative300.bin'

# Loading the binary is slow, so announce when it starts and when it finishes.
print('Loading word vectors: {}'.format(word2vec_path))
we = KeyedVectors.load_word2vec_format(
    word2vec_path,
    binary=True,
)
print('done')

Loading word vectors: data/GoogleNews-vectors-negative300.bin
done


In [4]:
# --- Evaluation configuration -------------------------------------------
# JSON file that is later loaded as a mapping from video file name to its
# sentence(s); it is also handed to the dataset below.
sentences_path = "/kiwi-data/users/shoya/AIDA/event_occurences_video_and_text_pairs.json"

# Text-side settings shared by the dataset and the model.
we_dim = 300      # presumably the word-embedding dimension of `we` -- confirm
max_words = 20    # presumably the max number of words kept per sentence -- confirm

# Loader settings. `batch_size` is not used by the cells visible here.
num_workers = 4
batch_size = 256
batch_size_val = 3500

# --- Test-set dataset and loader ----------------------------------------
dataset_val_m2e2 = M2E2DataLoader(
    csv="/home/shoya/howto100m/data_paths_test.csv",
    sentences=sentences_path,
    we=we,
    we_dim=we_dim,
    max_words=max_words,
)

# shuffle=False keeps the batches in dataset order.
dataloader_val_m2e2 = DataLoader(
    dataset_val_m2e2,
    shuffle=False,
    batch_size=batch_size_val,
    num_workers=num_workers,
)

In [5]:
# Build the joint text-video embedding model with the same text settings the
# dataset was created with, then restore the trained weights.
# `we_dim` and `max_words` come from the evaluation-config cell so that the
# model and the dataset cannot silently drift apart.
net = Net(
    video_dim=4096,
    embd_dim=6144,
    we_dim=we_dim,
    n_pair=1,
    max_words=max_words,
    sentence_dim=-1,
)

# Checkpoint path is relative to the notebook's working directory.
net.load_checkpoint('e31.pth')

In [6]:
# Put the model in evaluation mode (`eval()`) and move it to the GPU
# (`cuda()`). `net.cuda()` is the cell's last expression, so its return value
# is what gets rendered as the module summary in the cell output.
net.eval() 
net.cuda()

Net(
  (text_pooling): Sentence_Maxpool(
    (fc): Linear(in_features=300, out_features=6144, bias=True)
  )
  (GU_text): Gated_Embedding_Unit(
    (fc): Linear(in_features=6144, out_features=6144, bias=True)
    (cg): Context_Gating(
      (fc): Linear(in_features=6144, out_features=6144, bias=True)
      (batch_norm): BatchNorm1d(6144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (GU_video): Gated_Embedding_Unit(
    (fc): Linear(in_features=4096, out_features=6144, bias=True)
    (cg): Context_Gating(
      (fc): Linear(in_features=6144, out_features=6144, bias=True)
      (batch_norm): BatchNorm1d(6144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
)

In [8]:
# Pull only the first batch from the validation loader. With
# batch_size_val=3500 and shuffle=False this is the first (up to) 3500 test
# samples in dataset order; the remaining cells work on this single batch.
batch = next(iter(dataloader_val_m2e2))

In [11]:
# Move both modalities of the batch onto the GPU, where the model lives.
video = batch['video'].cuda()
text = batch['text'].cuda()

In [13]:
# Run the model on the whole batch. This is inference only, so disable
# autograd: no graph is recorded and no activations are kept for a backward
# pass, which matters with a batch this large.
with th.no_grad():
    output = net(video, text)

# Bring the result back to host memory as a NumPy array for the analysis below.
# NOTE(review): later cells index this as output[i][j], i.e. as a 2-D score
# matrix -- presumably rows are text queries and columns are videos; confirm
# against Net.forward.
output = output.detach().cpu().numpy()

In [40]:
# File names of the test videos, taken from the 'video_id' column of the same
# CSV the validation dataset was built from.
eval_paths = pd.read_csv("/home/shoya/howto100m/data_paths_test.csv")
eval_file_names = eval_paths['video_id'].values

# Mapping from video file name to its sentence(s). Use a context manager so
# the file handle is closed as soon as the JSON has been parsed.
with open(sentences_path) as sentences_file:
    video2sentence = json.load(sentences_file)

In [69]:
# Qualitative report for the first few test samples: for each query, print
# its sentence(s), the video it belongs to, where that video landed in the
# ranking, and the top-scoring videos.
#
# Row i of `output` is treated as the scores of query i against every video in
# the batch, with the matching video at column i.
# NOTE(review): this presumably holds because the loader is unshuffled and
# follows the CSV row order -- confirm against M2E2DataLoader.
num_samples_to_show = 30  # how many queries to print
top_k = 5                 # how many best-scoring videos to list per query

for correct_idx, prob in enumerate(output[:num_samples_to_show]):
    corresponding_video = eval_file_names[correct_idx]
    corresponding_sentences = video2sentence[corresponding_video]

    # 0-based position of the correct video in the descending ranking, i.e.
    # the number of videos scored strictly higher. For finite scores this is
    # the same value a full sort plus lookup would give (ties resolve to the
    # best position), but it needs no sort and cannot fail on NaN/inf scores.
    correct_guessed_at = int(np.count_nonzero(prob > prob[correct_idx]))

    print('============= Test Sample {} ============='.format(correct_idx+1))
    print("Query Sentence: {}".format(corresponding_sentences))
    print("Corresponding Video: {}".format(corresponding_video))
    # The index is 0-based while guesses are numbered from 1, so
    # "Index k" corresponds to "Guess k+1" in the list below.
    print("Correctly Guessed on Index {}".format(correct_guessed_at))
    print()

    # Column indices of the top_k highest scores, best first.
    top_k_index = np.argsort(prob)[::-1][:top_k]
    for guess_number, pred_idx in enumerate(top_k_index, start=1):
        print("Guess {}: {}".format(guess_number, eval_file_names[pred_idx]))
    print()
    

Query Sentence: ['US President Donald Trump has told his citizens they should brace for " painful " weeks ahead .']
Corresponding Video: Coronavirus Trump warns of very painful weeks ahead - BBC News clipped_23.557_86.748.mp4
Correctly Guessed on Index 1

Guess 1: Coronavirus President Trump Cuts US Funding to WHO clipped_0_38.157.mp4
Guess 2: Coronavirus Trump warns of very painful weeks ahead - BBC News clipped_23.557_86.748.mp4
Guess 3: Blasts Heard at Ashraf Ghani Inauguration as Afghan President clipped_38.802_46.198.mp4
Guess 4: Senate Committee Holds Hearing into Fort Hood Attack clipped_116.027_155.277.mp4
Guess 5: Prince William attends Manchester attack remembrance service in UK clipped_19.004_30.04.mp4

Query Sentence: ["Police officers and army personnel in Mosul , Kirkuk and Ramadi queued to cast their votes two days before the rest of the nation ' s voters can go to the polls to elect a new parliament ."]
Corresponding Video: Early voting for Iraq military ahead of 12 May

A few notable test samples:
- Sample 3 - mostly about corona and masks 
- Sample 4 - mostly about demonstrators 
- Sample 7 - mostly about violent protests 