In [46]:
import json
import glob
import pickle
from PIL import Image
import torch
import torch.nn.functional as F
import torchvision
from torchvision import transforms
from build_vocab import Vocabulary
from adaptive import Encoder2Decoder
import random
from os.path import join
from tqdm.autonotebook import tqdm

from coco.pycocotools.coco import COCO
from coco.pycocoevalcap.eval import COCOEvalCap

In [2]:
def beam_search(images, model, beam_width=5, max_len=20):
    ''' Beam search decoder '''
    # initilizer model to obtain V, v_g
    V, v_g, captions = model.init_sampler(images)
    states = None

    for step in range(max_len):
        scores, states, _, _ = model.decoder(V, v_g, captions, states)
        scores = F.softmax(scores, dim=2)

        best_scores, token_index = torch.topk(scores, beam_width, dim=2)
        best_scores = best_scores.squeeze()
        token_index = token_index.squeeze()

        if step == 0:
            # expand vectors
            V = V.repeat(beam_width, 1, 1)
            v_g = v_g.repeat(beam_width, 1)
            states = (states[0].repeat(1, beam_width, 1),
                      states[1].repeat(1, beam_width, 1))

            scores = best_scores.view(-1, 1)
            prev_scores = scores
            captions = token_index.view(-1, 1)
            candidates = captions

        else:
        # update scores for unfinished candidate captions
            non_eos = candidates[:, -1] != 2
            #non_eos = non_eos.squeeze()

            best_scores[candidates[:, -1] == 2] = -1e20
            #n_scores = prev_scores
            prev_scores = (prev_scores.view(-1, 1).expand_as(
                best_scores)[non_eos]*step + best_scores[non_eos])/(step+1)

            token_index[candidates[:, -1] == 2] = 2

            scores = prev_scores.view(-1)
            tokens = token_index.view(-1)

            # keep the best beam_width candidates
            prev_scores, indices = torch.topk(scores, beam_width, dim=0)
            tokens = torch.gather(tokens, 0, indices).view(-1)
            states = (states[0][:, indices/beam_width, :],
                      states[1][:, indices/beam_width, :])

            captions = tokens.view(-1, 1)
            candidates = torch.cat([candidates[indices/beam_width], captions],
                                   dim=1)

            if candidates[prev_scores.topk(1)[1]].squeeze().tolist()[-1] == 2:
                return (prev_scores.topk(1)[0].item(),
                        candidates[prev_scores.topk(1)[1]].squeeze().tolist())

    return (prev_scores.topk(1)[0].item(),
            candidates[prev_scores.topk(1)[1]].squeeze().tolist())

In [16]:
with open('/mnt/local/AdaptiveAttentionModel/coco_splits.json') as f:
    karpathy = json.load(f)

In [22]:
val_imgs = [i for i in karpathy['images'] if i['split'] == 'val']
val_files = [i['filename'] for i in val_imgs]

In [30]:
CROP_SIZE = 224

with open('/mnt/local/AdaptiveAttentionModel/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)

transform = transforms.Compose([transforms.Resize((CROP_SIZE, CROP_SIZE)),
                                transforms.ToTensor(),
                                transforms.Normalize((0.485, 0.456, 0.406),
                                                     (0.229, 0.224, 0.225))])

model = Encoder2Decoder(256, len(vocab), 512)
model.load_state_dict(torch.load("/mnt/local/AdaptiveAttentionModel/adaptive-11.pkl",map_location='cpu' ))
model.cuda()
model.eval()

img_dir = '/home/vu48pok/.data/compling/data/corpora/external/MSCOCO/COCO/resized/val2014/'

In [54]:
max_beam_width = 15
results = []
bw = 3

for img in tqdm(val_imgs):
    filename = img['filename']
    img_id = img['cocoid']
    
    img_path = join(img_dir, filename)
    image = Image.open(img_path).convert('RGB')
    image = image.resize([224, 224], Image.LANCZOS)
    image = transform(image)

    
    (prob, best) = beam_search(image.cuda().view(1, 3, 224, 224), model, beam_width=bw, max_len=20)
    words = []
    for ix in best:
        #ix = ix.tolist()[0][0]
        words.append(vocab.idx2word[ix])
        
    sentence = " ".join(words[:-1])    
    temp = {'image_id': img_id, 'caption': sentence}
    
    results.append(temp)

HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




In [55]:
resFile = "/home/vu48pok/Dokumente/Projekte/diversity/AdaptiveAttentionModel/results/beam_search_results.json"
with open(resFile, "w") as f:
    json.dump(results, f)

In [56]:
annFile = '/home/vu48pok/Dokumente/Projekte/diversity/AdaptiveAttentionModel/data/annotations/karpathy_split_val.json'

coco = COCO(annFile)
cocoRes = coco.loadRes(resFile)

cocoEval = COCOEvalCap(coco, cocoRes)
cocoEval.params['image_id'] = cocoRes.getImgIds()
cocoEval.evaluate()

loading annotations into memory...
0:00:00.021655
creating index...
index created!
Loading and preparing results...     
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 46841, 'reflen': 46805, 'guess': [46841, 41841, 36841, 31841], 'correct': [30914, 14585, 6536, 2896]}
ratio: 1.0007691485952142
Bleu_1: 0.660
Bleu_2: 0.480
Bleu_3: 0.344
Bleu_4: 0.247
computing CIDEr score...
CIDEr: 0.770
