In [37]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [38]:
import sys
# Append the project's source directory (relative to the notebook's working
# directory) to the module search path. Presumably this is what lets the
# `utils` and `models` packages imported further down resolve — confirm.
sys.path.append('../src/')

from __future__ import print_function
import time
import os
import numpy as np
import matplotlib.pyplot as plt
import nltk
import pickle
import pandas as pd
import argparse

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torch.autograd import Variable

from utils.Vocabulary import Vocabulary
from utils.ImageDataloader import get_image_dataloader, ImageDataset
from models.ImageCaptioner import ImageCaptioner

In [39]:
# --- Filesystem locations ---------------------------------------------------
# Both dataset paths hang off the current user's home directory.
home_dir = os.environ['HOME']
images_path = home_dir + '/Database/coco/images/'
captions_path = home_dir + '/programs/cocoapi/annotations/coco_captions.csv'
models_path = '../models/'

# --- Dataset / loader settings ----------------------------------------------
coco_set = 2014
batch_size = 64
preload = True
load_features = True
load_captions = True

# --- Image feature extractor ------------------------------------------------
# The author's inline notes list 'vgg16' / 'resnet152' as model options and
# 25088 / 2048 as feature sizes (presumably paired in that order — confirm).
base_model = 'vgg16'
embedding_size = 25088

In [40]:
# Location of the vocabulary file, passed to get_image_dataloader when the
# validation loader is built (.pkl, so presumably a pickled Vocabulary — confirm).
vocab_path = '../data/processed/coco_vocab.pkl'

In [41]:
# Announce the load; the "\r" with end="" leaves the cursor at the start of
# the line so the final "Done" message overwrites this one.
print ("Loading validation data...\r", end="")

# Keyword settings for the loader, collected in one place.
loader_options = dict(
  embedding_size=embedding_size,
  load_features=load_features,
  load_captions=load_captions,
  model=base_model,
  preload=preload,
)
val_loader = get_image_dataloader(
  'val', coco_set,
  images_path, vocab_path, captions_path,
  batch_size,
  **loader_options)

# Set the underlying dataset's `mode` attribute to 'val'.
val_loader.dataset.mode = 'val'
print ("Loading validation data...Done")

Loading validation data...Done


In [42]:
# Read the vocabulary-derived values that the captioner constructor takes
# from the validation dataset.
val_dataset = val_loader.dataset
val_vocab = val_dataset.vocab

vocab_size = val_dataset.get_vocab_size()
# get_idx() is indexed with the vocabulary's start / end words to get their ids.
start_id = val_dataset.get_idx()[val_vocab.start_word]
end_id = val_dataset.get_idx()[val_vocab.end_word]
max_caption_length = val_dataset.max_len

In [43]:
# Captioner hyperparameters.
rnn_type = 'lstm'    # not referenced by any other cell visible in this notebook
hidden_size = 512    # third argument to ImageCaptioner
embed_size = 256     # second argument to ImageCaptioner

In [44]:
# Build the captioning model from the image-feature size, the network sizes
# and the vocabulary-derived values computed in the earlier cells.
captioner = ImageCaptioner(
  embedding_size,
  embed_size,
  hidden_size,
  vocab_size,
  max_caption_length,
  start_id,
  end_id,
)

# Move the model to the GPU when one is available; otherwise it stays on the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
  captioner.cuda()

In [45]:
# Checkpoint to evaluate. Built from `models_path` (defined in the
# configuration cell, where it ends with a trailing slash) instead of
# repeating the '../models/' literal, so the models directory is set in one place.
model_path = models_path + 'image_caption-model10-80-0.1179-5.0.pkl'

In [46]:
# Load the saved checkpoint. Without map_location, torch.load tries to restore
# each tensor to the device it was saved from, which raises on a CPU-only
# machine when the checkpoint was written from a GPU. Remap to the CPU in that
# case; with a GPU present the default behaviour (None) is kept.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a
# trusted source.
map_location = None if torch.cuda.is_available() else 'cpu'
checkpoint = torch.load(model_path, map_location=map_location)

# The model weights are stored under the checkpoint's 'params' key.
captioner.load_state_dict(checkpoint['params'])
# Switch to evaluation mode. As the cell's last expression, this also
# displays the model summary.
captioner.eval()

ImageCaptioner(
  (inp): Linear(in_features=25088, out_features=256, bias=True)
  (inp_dropout): Dropout(p=0.2)
  (inp_bn): BatchNorm1d(256, eps=1e-05, momentum=0.01, affine=True, track_running_stats=True)
  (embed): Embedding(12433, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (out): Linear(in_features=512, out_features=12433, bias=True)
)

In [47]:
# Validation BLEU with beam_size = 0 (presumably greedy decoding inside
# ImageCaptioner.predict — confirm).
# NOTE(review): this cell is repeated below for other beam sizes; factoring the
# loop into a function taking `beam_size` would remove the duplication.
val_bleu = 0.0
beam_size = 0

val_dataset = val_loader.dataset
# Loop-invariant: used for the progress line and for the final average.
num_val_steps = val_dataset.get_seq_len()

for val_id, val_batch in enumerate(val_loader):
  idxs, im_embeddings, caption_embeddings = val_batch

  # Only the image features are fed to the model, so only they are moved to
  # the GPU; the caption tensor is not used in this loop.
  if torch.cuda.is_available():
    im_embeddings = im_embeddings.cuda()

  # Get ground truth captions
  refs = val_dataset.get_references(idxs.numpy())

  # Inference only: no_grad avoids recording an autograd graph while decoding.
  with torch.no_grad():
    preds = captioner.predict(im_embeddings, beam_size=beam_size)

  # Score every prediction in the batch against its references, then add the
  # batch mean to the running val_bleu total.
  batch_bleu = 0.0
  for pred_id, pred_tensor in enumerate(preds):
    pred = pred_tensor.cpu().numpy().astype(int)
    pred_embed = val_dataset.vocab.decode(pred, clean=True)
    batch_bleu += val_dataset.vocab.evaluate(refs[pred_id], pred_embed)
  batch_mean_bleu = batch_bleu / len(preds)
  val_bleu += batch_mean_bleu

  # Progress line for this step
  stats = "Validation step [%d/%d], Bleu: %.4f" \
            % (val_id, num_val_steps, batch_mean_bleu)

  # Rewrite the same console line on every step ...
  print("\r" + stats, end="")
  sys.stdout.flush()

  # ... and keep a permanent line every 250 steps.
  if val_id % 250 == 0:
    print('\r' + stats)

# Average of the per-batch means over the number of validation steps.
val_bleu /= num_val_steps
print ("\nValidation -- bleu: %.4f" % (val_bleu))

Validation step [0/608], Bleu: 0.0943
Validation step [250/608], Bleu: 0.1091
Validation step [500/608], Bleu: 0.1104
Validation step [607/608], Bleu: 0.1444
Validation -- bleu: 0.1076


In [None]:
# Validation BLEU with beam_size = 3.
# NOTE(review): this cell duplicates the other validation cells apart from
# `beam_size`; factoring the loop into a function taking `beam_size` would
# remove the duplication.
val_bleu = 0.0
beam_size = 3

val_dataset = val_loader.dataset
# Loop-invariant: used for the progress line and for the final average.
num_val_steps = val_dataset.get_seq_len()

for val_id, val_batch in enumerate(val_loader):
  idxs, im_embeddings, caption_embeddings = val_batch

  # Only the image features are fed to the model, so only they are moved to
  # the GPU; the caption tensor is not used in this loop.
  if torch.cuda.is_available():
    im_embeddings = im_embeddings.cuda()

  # Get ground truth captions
  refs = val_dataset.get_references(idxs.numpy())

  # Inference only: no_grad avoids recording an autograd graph while decoding.
  with torch.no_grad():
    preds = captioner.predict(im_embeddings, beam_size=beam_size)

  # Score every prediction in the batch against its references, then add the
  # batch mean to the running val_bleu total.
  batch_bleu = 0.0
  for pred_id, pred_tensor in enumerate(preds):
    pred = pred_tensor.cpu().numpy().astype(int)
    pred_embed = val_dataset.vocab.decode(pred, clean=True)
    batch_bleu += val_dataset.vocab.evaluate(refs[pred_id], pred_embed)
  batch_mean_bleu = batch_bleu / len(preds)
  val_bleu += batch_mean_bleu

  # Progress line for this step
  stats = "Validation step [%d/%d], Bleu: %.4f" \
            % (val_id, num_val_steps, batch_mean_bleu)

  # Rewrite the same console line on every step ...
  print("\r" + stats, end="")
  sys.stdout.flush()

  # ... and keep a permanent line every 250 steps.
  if val_id % 250 == 0:
    print('\r' + stats)

# Average of the per-batch means over the number of validation steps.
val_bleu /= num_val_steps
print ("\nValidation -- bleu: %.4f" % (val_bleu))

Validation step [0/608], Bleu: 0.0876
Validation step [250/608], Bleu: 0.1547
Validation step [500/608], Bleu: 0.1245
Validation step [582/608], Bleu: 0.0905

In [26]:
# Validation BLEU with beam_size = 5.
# NOTE(review): this cell duplicates the other validation cells apart from
# `beam_size`; factoring the loop into a function taking `beam_size` would
# remove the duplication.
val_bleu = 0.0
beam_size = 5

val_dataset = val_loader.dataset
# Loop-invariant: used for the progress line and for the final average.
num_val_steps = val_dataset.get_seq_len()

for val_id, val_batch in enumerate(val_loader):
  idxs, im_embeddings, caption_embeddings = val_batch

  # Only the image features are fed to the model, so only they are moved to
  # the GPU; the caption tensor is not used in this loop.
  if torch.cuda.is_available():
    im_embeddings = im_embeddings.cuda()

  # Get ground truth captions
  refs = val_dataset.get_references(idxs.numpy())

  # Inference only: no_grad avoids recording an autograd graph while decoding.
  with torch.no_grad():
    preds = captioner.predict(im_embeddings, beam_size=beam_size)

  # Score every prediction in the batch against its references, then add the
  # batch mean to the running val_bleu total.
  batch_bleu = 0.0
  for pred_id, pred_tensor in enumerate(preds):
    pred = pred_tensor.cpu().numpy().astype(int)
    pred_embed = val_dataset.vocab.decode(pred, clean=True)
    batch_bleu += val_dataset.vocab.evaluate(refs[pred_id], pred_embed)
  batch_mean_bleu = batch_bleu / len(preds)
  val_bleu += batch_mean_bleu

  # Progress line for this step
  stats = "Validation step [%d/%d], Bleu: %.4f" \
            % (val_id, num_val_steps, batch_mean_bleu)

  # Rewrite the same console line on every step ...
  print("\r" + stats, end="")
  sys.stdout.flush()

  # ... and keep a permanent line every 250 steps.
  if val_id % 250 == 0:
    print('\r' + stats)

# Average of the per-batch means over the number of validation steps.
val_bleu /= num_val_steps
print ("\nValidation -- bleu: %.4f" % (val_bleu))

KeyboardInterrupt: 