In [129]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [186]:
# `from __future__` has to be the first statement of whatever unit is compiled;
# keeping it at the top lets this cell also work when the notebook is exported
# to a plain .py script.
from __future__ import print_function

import sys

# Make the project sources importable (presumably where the `utils` and
# `models` packages imported below live). The membership check keeps
# sys.path from growing each time the cell is re-run.
if '../src/' not in sys.path:
  sys.path.append('../src/')
import time
import os
import numpy as np
import matplotlib.pyplot as plt
import nltk
import pickle
import pandas as pd
import argparse

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torch.autograd import Variable

from utils.Vocabulary import Vocabulary
from utils.VideoDataloader import get_video_dataloader, VideoDataset
from models.VideoCaptioner import VideoCaptioner

In [190]:
# --- Data locations ---
# expanduser('~') resolves the home directory without raising KeyError when the
# HOME environment variable is not set; when HOME is set the resulting string
# is the same as os.environ['HOME'] + '/Database/...'.
videos_path = os.path.expanduser('~') + '/Database/MSR-VTT/train-video/'
captions_path = '../data/processed/msrvtt_captions.csv'
models_path = '../models/'

# --- Data loading options (forwarded to get_video_dataloader) ---
batch_size = 64
load_features = True
preload = True
load_captions = True

# --- Visual feature extractor ---
# embedding_size is handed both to the dataloader and to VideoCaptioner as the
# size of the incoming video feature vector, so it has to agree with base_model.
base_model = 'vgg16'  # alternative: 'resnet152'
embedding_size = 25088  # alternative: 2048 (presumably the 'resnet152' size)

In [191]:
# Path to the vocabulary file that is passed to get_video_dataloader.
vocab_path = '../data/processed/msrvtt_vocab.pkl'

In [192]:
# Build the DataLoader for the 'dev' split from the configuration above.
# The first message ends with '\r' and no newline so that the "Done" message
# overwrites it on the same line.
print("Loading validation data...\r", end="")

loader_options = dict(
  load_features=load_features,
  load_captions=load_captions,
  preload=preload,
  model=base_model,
  embedding_size=embedding_size,
  num_workers=0,
)
val_loader = get_video_dataloader(
  'dev', videos_path, vocab_path, captions_path, batch_size,
  **loader_options
)

# Explicitly set the dataset mode to 'dev' as well.
val_loader.dataset.mode = 'dev'

print("Loading validation data...Done")

Loading validation data...Done


In [193]:
# Read the vocabulary-derived values needed to construct the captioner
# straight from the validation dataset.
val_dataset = val_loader.dataset
val_vocab = val_dataset.vocab

vocab_size = val_dataset.get_vocab_size()
# Indices of the start and end words, looked up in the dataset's index mapping.
start_id = val_dataset.get_idx()[val_vocab.start_word]
end_id = val_dataset.get_idx()[val_vocab.end_word]
max_caption_length = val_dataset.max_len

In [194]:
# Decoder hyperparameters passed to VideoCaptioner. They line up with the
# layer sizes printed when the checkpoint is loaded further down
# (Embedding(..., 256) and LSTM(256, 512)).
embed_size, hidden_size = 256, 512
rnn_type = 'lstm'

In [195]:
# Instantiate the captioning network from the feature size, the decoder
# hyperparameters and the vocabulary-derived values collected above.
captioner_args = (
  embedding_size, embed_size,
  hidden_size, vocab_size,
  max_caption_length,
  start_id, end_id,
)
captioner = VideoCaptioner(*captioner_args, rnn_type=rnn_type)

# Move the model to the GPU whenever one is available.
use_cuda = torch.cuda.is_available()
if use_cuda:
  captioner.cuda()

In [198]:
# Checkpoint to evaluate. It is located through `models_path` from the
# configuration cell instead of repeating the '../models/' literal here, so
# the models directory is defined in a single place.
model_path = models_path + 'video_caption-model10-45-0.3175-5.0.pkl'

In [199]:
# Without a GPU, remap the stored tensors to the CPU so that a checkpoint
# saved from CUDA can still be loaded; with a GPU, keep torch.load's default
# placement (map_location=None).
map_location = None if torch.cuda.is_available() else 'cpu'

# NOTE: torch.load unpickles the file, which can execute arbitrary code --
# only load checkpoints that come from a trusted source.
checkpoint = torch.load(model_path, map_location=map_location)

# The weights are stored under the 'params' key of the checkpoint.
captioner.load_state_dict(checkpoint['params'])

# Switch to evaluation mode (affects the Dropout and BatchNorm layers shown in
# the printed architecture). Left as the last expression so the cell displays
# the module.
captioner.eval()

VideoCaptioner(
  (inp): Linear(in_features=25088, out_features=256, bias=True)
  (inp_dropout): Dropout(p=0.2)
  (inp_bn): BatchNorm1d(256, eps=1e-05, momentum=0.01, affine=True, track_running_stats=True)
  (embed): Embedding(9648, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (out): Linear(in_features=512, out_features=9648, bias=True)
)

In [200]:
# Beam width passed to captioner.predict in the next cell.
# NOTE(review): 0 presumably selects greedy (non-beam) decoding -- confirm in
# VideoCaptioner.predict.
beam_size = 0

In [201]:
# Score the captioner on the validation split with the `beam_size` chosen in
# the previous cell.
# NOTE(review): this loop is repeated for every beam size evaluated in this
# notebook; consider extracting it into a helper that takes `beam_size`.
val_bleu = 0.0

val_dataset = val_loader.dataset
num_val_steps = val_dataset.get_seq_len()

# Inference only: under no_grad autograd does not record the decoding pass,
# so no graph or intermediate activations are kept alive.
with torch.no_grad():
  for val_id, val_batch in enumerate(val_loader):
    idxs, vid_embeddings, caption_embeddings = val_batch

    # Only the video features are fed to the model, so only they are moved to
    # the GPU. The caption tensor is not used below (the references are
    # fetched through get_references), so it is left where it is.
    if torch.cuda.is_available():
      vid_embeddings = vid_embeddings.cuda()

    # Ground-truth captions for the items of this batch
    refs = val_dataset.get_references(idxs)

    preds = captioner.predict(vid_embeddings, beam_size=beam_size)

    # Decode each predicted id sequence and score it against the references
    # at the same position in the batch.
    batch_bleu = 0.0
    for pred_id, pred in enumerate(preds):
      pred_ids = pred.cpu().numpy().astype(int)
      pred_caption = val_dataset.vocab.decode(pred_ids, clean=True)
      batch_bleu += val_dataset.vocab.evaluate(refs[pred_id], pred_caption)
    batch_mean_bleu = batch_bleu / len(preds)
    val_bleu += batch_mean_bleu

    # Progress line, rewritten in place via '\r'
    stats = "Validation step [%d/%d], Bleu: %.4f" \
              % (val_id, num_val_steps, batch_mean_bleu)

    print("\r" + stats, end="")
    sys.stdout.flush()

    # Keep a permanent copy of the progress line every 250 steps
    if val_id % 250 == 0:
      print('\r' + stats)

# NOTE(review): this is the mean of the per-batch means, so a shorter final
# batch weighs as much as a full one. Left unchanged so the number stays
# comparable with the values already reported in this notebook.
val_bleu /= num_val_steps
print("\nValidation -- bleu: %.4f" % (val_bleu))

Validation step [0/24], Bleu: 0.2822
Validation step [23/24], Bleu: 0.2290
Validation -- bleu: 0.2831


In [202]:
# Score the captioner on the validation split using beam search of width 3.
# NOTE(review): this loop is repeated for every beam size evaluated in this
# notebook; consider extracting it into a helper that takes `beam_size`.
val_bleu = 0.0
beam_size = 3

val_dataset = val_loader.dataset
num_val_steps = val_dataset.get_seq_len()

# Inference only: under no_grad autograd does not record the decoding pass,
# so no graph or intermediate activations are kept alive.
with torch.no_grad():
  for val_id, val_batch in enumerate(val_loader):
    idxs, vid_embeddings, caption_embeddings = val_batch

    # Only the video features are fed to the model, so only they are moved to
    # the GPU. The caption tensor is not used below (the references are
    # fetched through get_references), so it is left where it is.
    if torch.cuda.is_available():
      vid_embeddings = vid_embeddings.cuda()

    # Ground-truth captions for the items of this batch
    refs = val_dataset.get_references(idxs)

    preds = captioner.predict(vid_embeddings, beam_size=beam_size)

    # Decode each predicted id sequence and score it against the references
    # at the same position in the batch.
    batch_bleu = 0.0
    for pred_id, pred in enumerate(preds):
      pred_ids = pred.cpu().numpy().astype(int)
      pred_caption = val_dataset.vocab.decode(pred_ids, clean=True)
      batch_bleu += val_dataset.vocab.evaluate(refs[pred_id], pred_caption)
    batch_mean_bleu = batch_bleu / len(preds)
    val_bleu += batch_mean_bleu

    # Progress line, rewritten in place via '\r'
    stats = "Validation step [%d/%d], Bleu: %.4f" \
              % (val_id, num_val_steps, batch_mean_bleu)

    print("\r" + stats, end="")
    sys.stdout.flush()

    # Keep a permanent copy of the progress line every 250 steps
    if val_id % 250 == 0:
      print('\r' + stats)

# NOTE(review): this is the mean of the per-batch means, so a shorter final
# batch weighs as much as a full one. Left unchanged so the number stays
# comparable with the values already reported in this notebook.
val_bleu /= num_val_steps
print("\nValidation -- bleu: %.4f" % (val_bleu))

Validation step [0/24], Bleu: 0.2867
Validation step [23/24], Bleu: 0.2408
Validation -- bleu: 0.3102


In [203]:
# Score the captioner on the validation split using beam search of width 5.
# NOTE(review): this loop is repeated for every beam size evaluated in this
# notebook; consider extracting it into a helper that takes `beam_size`.
val_bleu = 0.0
beam_size = 5

val_dataset = val_loader.dataset
num_val_steps = val_dataset.get_seq_len()

# Inference only: under no_grad autograd does not record the decoding pass,
# so no graph or intermediate activations are kept alive.
with torch.no_grad():
  for val_id, val_batch in enumerate(val_loader):
    idxs, vid_embeddings, caption_embeddings = val_batch

    # Only the video features are fed to the model, so only they are moved to
    # the GPU. The caption tensor is not used below (the references are
    # fetched through get_references), so it is left where it is.
    if torch.cuda.is_available():
      vid_embeddings = vid_embeddings.cuda()

    # Ground-truth captions for the items of this batch
    refs = val_dataset.get_references(idxs)

    preds = captioner.predict(vid_embeddings, beam_size=beam_size)

    # Decode each predicted id sequence and score it against the references
    # at the same position in the batch.
    batch_bleu = 0.0
    for pred_id, pred in enumerate(preds):
      pred_ids = pred.cpu().numpy().astype(int)
      pred_caption = val_dataset.vocab.decode(pred_ids, clean=True)
      batch_bleu += val_dataset.vocab.evaluate(refs[pred_id], pred_caption)
    batch_mean_bleu = batch_bleu / len(preds)
    val_bleu += batch_mean_bleu

    # Progress line, rewritten in place via '\r'
    stats = "Validation step [%d/%d], Bleu: %.4f" \
              % (val_id, num_val_steps, batch_mean_bleu)

    print("\r" + stats, end="")
    sys.stdout.flush()

    # Keep a permanent copy of the progress line every 250 steps
    if val_id % 250 == 0:
      print('\r' + stats)

# NOTE(review): this is the mean of the per-batch means, so a shorter final
# batch weighs as much as a full one. Left unchanged so the number stays
# comparable with the values already reported in this notebook.
val_bleu /= num_val_steps
print("\nValidation -- bleu: %.4f" % (val_bleu))

Validation step [0/24], Bleu: 0.3220
Validation step [23/24], Bleu: 0.3549
Validation -- bleu: 0.3184
