In [2]:
from __future__ import print_function
import os
import pickle

import numpy
from data import get_test_loader
import time
import numpy as np
from vocab import Vocabulary  # NOQA
import torch
from model import VSE, order_sim
from collections import OrderedDict

from evaluation import encode_data

In [3]:
def t2i(images, captions, npts=None, return_ranks=False):
    """
    Text->Images (Image Search)

    For every caption, the distinct images are sorted by dot-product score
    (highest first) and the position of the caption's own image is recorded.

    Images: (5N, K) matrix of images; every 5th row is taken as one image
    Captions: (5N, K) matrix of captions; rows 5*i .. 5*i+4 query image i
    npts: number of images to evaluate; when None it is derived from the
        number of image rows divided by 5, and printed
    return_ranks: when True, additionally return the (ranks, top1) arrays

    Returns (r1, r5, r10, medr, meanr), followed by (ranks, top1) when
    return_ranks is set. Ranks are stored 0-based; medr and meanr are 1-based.
    """
    if npts is None:
        npts = images.shape[0] // 5
        print(npts)

    # Keep a single row per image (rows 0, 5, 10, ...).
    gallery = numpy.array([images[row] for row in range(0, len(images), 5)])

    n_queries = 5 * npts
    ranks = numpy.zeros(n_queries)
    top1 = numpy.zeros(n_queries)

    for index in range(npts):
        start = 5 * index

        # Scores of this image's captions against every image in the gallery.
        scores = numpy.dot(captions[start:start + 5], gallery.T)

        for offset, row in enumerate(scores):
            # Image indices ordered from best to worst score.
            order = numpy.argsort(row)[::-1]
            ranks[start + offset] = numpy.where(order == index)[0][0]
            top1[start + offset] = order[0]

    def recall_at(k):
        # Percentage of captions whose own image is ranked within the top k.
        return 100.0 * len(numpy.flatnonzero(ranks < k)) / len(ranks)

    metrics = (
        recall_at(1),
        recall_at(5),
        recall_at(10),
        numpy.floor(numpy.median(ranks)) + 1,
        ranks.mean() + 1,
    )
    if return_ranks:
        return metrics, (ranks, top1)
    return metrics

In [4]:
# Checkpoint to evaluate. Alternative checkpoint:
#   "data/runs/coco_vse++/model_best.pth.tar"
model_path = "data/runs/coco_vse++_resnet_restval/model_best.pth.tar"

# When not None, these replace the corresponding paths stored in the
# checkpoint options.
data_path = "data/data/"
vocab_path = None

# Dataset split handed to the test loader.
split = "test"

# Selects 'cuda' (True) or 'cpu' (False) when loading and encoding.
on_gpu = False

In [5]:
# Device onto which the checkpoint's tensors are mapped.
device = 'cuda' if on_gpu else 'cpu'

# NOTE(review): torch.load unpickles the file; only load checkpoints that come
# from a trusted source.
checkpoint = torch.load(model_path, map_location=torch.device(device))

# Options object stored in the checkpoint under the 'opt' key.
opt = checkpoint['opt']

FileNotFoundError: [Errno 2] No such file or directory: 'data/runs/coco_vse++_resnet_restval/model_best.pth.tar'

In [21]:
# Display the options object read from the checkpoint's 'opt' entry.
opt

Namespace(Diters=5, Gimage_size=32, Giters=1, batch_size=128, beta1=0.9, beta2=0.999, betas=(0.9, 0.999), clamp_lower=-0.01, clamp_upper=0.01, cnn_type='resnet152', crop_size=224, data_name='coco', embed_size=1024, eta=1.0, eta_m=1.0, finetune=False, gamma=0.1, grad_clip=2.0, img_dim=4096, learning_rate=0.0002, log_step=10, logger_name='runs/coco_uvs_resnet_restval_l2norm', lr_update=15, margin=0.2, max_violation=True, measure='cosine', model_name='UVS', model_path='./model/', ndf=64, no_imgnorm=False, no_prel2norm=False, noadam=False, nol2norm=False, num_epochs=30, num_layers=1, resume='', save_step=1000, txt_dim=6000, use_abs=False, use_mask=False, use_restval=True, val_step=500, vocab_path='./data/', vocab_size=11755, word_dim=300, workers=10)

In [22]:
# Let the notebook-level settings take precedence over the paths stored in
# the checkpoint options.
if data_path is not None:
    opt.data_path = data_path

if vocab_path is not None:
    opt.vocab_path = vocab_path

# load vocabulary used by the model
# NOTE(review): pickle.load can execute arbitrary code; only open vocabulary
# files that come from a trusted source.
vocab_file = os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name)
with open(vocab_file, 'rb') as f:
    vocab = pickle.load(f)
opt.vocab_size = len(vocab)

# construct model
model = VSE(opt)

# load model state
model.load_state_dict(checkpoint['model'])

# Directory holding the images. The previous hard-coded location is kept as
# the default so existing setups behave the same; set the COCO_IMAGE_DIR
# environment variable to run the notebook on another machine.
image_location = os.environ.get(
    'COCO_IMAGE_DIR',
    '/home/simeon/Dokumente/Code/Uni/Repos/Adaptive/data/images/mscoco/val2014/')

print('Loading dataset')
data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                              opt.batch_size, opt.workers, opt,
                              image_location=image_location)

=> using pre-trained model 'resnet152'
Loading dataset
loading annotations into memory...
0:00:00.197571
creating index...
index created!


In [32]:
# Pull a few batches from the loader for interactive inspection. The loop
# stops once the batch with index 2 has been unpacked, so afterwards
# `images`, `targets`, `lengths` and `ids` hold that batch (or the last batch
# seen, if the loader yields fewer).
for i, (images, targets, lengths, ids) in enumerate(data_loader):
    if i > 1:
        break

In [44]:
# Inspect the first entry of the dataset: its caption, the id of the image it
# belongs to, and that image's file name.
# NOTE(review): `coco[0]` presumably is a pycocotools COCO object - confirm.
ann_id = data_loader.dataset.ids[0]
annotation = data_loader.dataset.coco[0].anns[ann_id]
caption = annotation['caption']
img_id = annotation['image_id']
path = data_loader.dataset.coco[0].loadImgs(img_id)[0]['file_name']

In [None]:
print('Computing results...')

# Embed the images and captions served by the loader using the loaded model.
img_embs, cap_embs = encode_data(model, data_loader, on_gpu=on_gpu)

# NOTE(review): the division by 5 presumably reflects one image row per
# caption with 5 captions per image - confirm against encode_data.
n_images = img_embs.shape[0] / 5
n_captions = cap_embs.shape[0]
print('Images: %d, Captions: %d' % (n_images, n_captions))

In [9]:
# Text-to-image retrieval over all embeddings. `npts` is left unset, so t2i
# derives it from the number of image rows and prints it; the cell output is
# the metrics tuple followed by the (ranks, top1) arrays.
t2i(img_embs, cap_embs, return_ranks=True)

1000


((33.76, 68.8, 81.02, 3.0, 12.9344),
 (array([ 0.,  1., 16., ...,  4.,  2., 20.]),
  array([  0.,  21., 789., ..., 387., 839., 958.])))

In [23]:
# no cross-validation, full evaluation
# `ri` holds (r1, r5, r10, medr, meanr); `rti` holds the (ranks, top1) arrays.
ri, rti = t2i(img_embs, cap_embs, npts=1000, return_ranks=True)

# Mean of the three recall values R@1, R@5 and R@10.
ari = sum(ri[:3]) / 3

print("Average t2i Recall: %.1f" % ari)
print("Text to image: %.1f %.1f %.1f %.1f %.1f" % ri)
print("Text to image: %.1f %.1f %.1f %.1f %.1f" % ri)

Average t2i Recall: 61.2
Text to image: 33.8 68.8 81.0 3.0 12.9
