In [1]:
import pandas as pd
import torch
import torchvision.transforms as transforms
import pickle

from data_utils import get_karpathy_split, refcoco_splits
from data_loader import get_caption_loader, COCOCaptionDataset, get_reg_loader

from build_vocab import Vocabulary

In [2]:
import sys
# Make the local nlg-eval checkout importable.
# NOTE(review): absolute, machine-specific path — adjust it when running elsewhere.
sys.path.append('/home/simeon/Dokumente/Code/Uni/Repos/Adaptive/nlg-eval')
from nlgeval import NLGEval
# The evaluator instance is bound to `nlgeval`, i.e. the same name as the
# package it was imported from. The two flags presumably skip loading the
# skip-thought and GloVe based metrics — TODO confirm against the nlg-eval docs.
nlgeval = NLGEval(no_skipthoughts=True, no_glove=True)  # loads the models

In [3]:
# Side length (in pixels) that the image transform resizes every image to.
crop_size=224
# Image root directory handed to the data loaders as `image_dir`.
# NOTE(review): absolute, machine-specific path.
image_dir='/home/simeon/Dokumente/Code/Data/COCO/'

In [4]:
# Load the pickled COCO vocabulary into `vocab`.
# Presumably this is why build_vocab.Vocabulary is imported at the top: the
# class has to be importable for unpickling — TODO confirm.
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
with open('data/coco_vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

In [5]:
# Caption table returned by get_karpathy_split; later cells read its `split`,
# `image_id` and `caption` columns.
# NOTE(review): both paths are absolute and machine-specific.
caps_df = get_karpathy_split(splits_path='/home/simeon/Dokumente/Code/Data/COCO/splits/karpathy/caption_datasets/', caps_path='/home/simeon/Dokumente/Code/Data/COCO/')

In [6]:
# Image preprocessing shared by both loaders below: resize to a
# crop_size x crop_size square, convert to a tensor, then normalise each of
# the three channels with a fixed mean / std.
# NOTE(review): the constants look like the commonly used ImageNet
# statistics — confirm they match what the image encoder expects.
transform = transforms.Compose([
    transforms.Resize((crop_size, crop_size)),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406),
                         (0.229, 0.224, 0.225))
])

In [23]:
# First 1000 rows of the caption table whose split is 'restval'.
# `c` is not referenced by any other cell visible in this notebook.
c = caps_df[caps_df['split'] == 'restval'].head(1000)

In [29]:
# Loader over the 'val' split of the caption table: word-level decoding,
# batches of 20, original order kept (shuffle=False) and the final partial
# batch retained (drop_last=False).
loader = get_caption_loader(
        decoding_level='word', 
        split=['val'],
        data_df=caps_df, 
        image_dir=image_dir, 
        vocab=vocab,
        transform=transform, 
        batch_size=20, 
        shuffle=False,
        num_workers=2, 
        drop_last=False
)

In [30]:
# Size of the loader — presumably the number of batches (TODO confirm the
# object returned by get_caption_loader is a torch DataLoader).
len(loader)

1251

In [35]:
from tqdm.autonotebook import tqdm

hypotheses = []
references = []

# Index all captions by image id once, up front. Filtering caps_df with a
# boolean mask for every single image rescans the whole frame per lookup;
# the grouped dict yields the same lists (captions keep caps_df row order).
refs_by_image_id = (
    caps_df.groupby('image_id', sort=False)['caption'].apply(list).to_dict()
)

for i, (images, _, _, image_ids, _) in enumerate(loader):

    # Coarse progress indicator: one line every 100 batches
    if i % 100 == 0:
        print(i)

    # Collect the reference captions for every image in the batch
    for image_idx in range(images.size()[0]):

        img_id = int(image_ids[image_idx])

        # Copy so every entry of `references` is an independent list; an id
        # without captions gives an empty list, as the mask-based lookup did.
        refs = list(refs_by_image_id.get(img_id, ()))
        references.append(refs)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200


In [39]:
# Load the pickled RefCOCO vocabulary. This rebinds `vocab`, replacing the
# vocabulary loaded from data/coco_vocab.pkl earlier in the notebook.
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
with open('data/refcoco_vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

In [40]:
# Keep only the first element of what refcoco_splits returns; later cells
# treat it as a DataFrame with `ann_id` and `caption` columns.
# NOTE(review): absolute, machine-specific path.
ref_df = refcoco_splits('/home/simeon/Dokumente/Code/Data/RefCOCO/refcoco/')[0]

In [49]:
# Loader over the 'val' split of the RefCOCO table. This rebinds `loader`,
# replacing the caption loader created earlier in the notebook.
# data_df is collapsed to a single row per ann_id (the first entry of each
# column within the group), so each annotation is passed in only once.
loader = get_reg_loader(
        decoding_level='word', 
        split=['val'],
        data_df=ref_df.groupby('ann_id').agg('first').reset_index(), 
        image_dir=image_dir, 
        vocab=vocab,
        transform=transform, 
        batch_size=20, 
        shuffle=False,
        num_workers=2, 
        drop_last=False
)

In [51]:
# Size of the loader — presumably the number of batches (TODO confirm the
# object returned by get_reg_loader is a torch DataLoader).
len(loader)

191

In [52]:
from tqdm.autonotebook import tqdm

hypotheses = []
references = []

# Index all captions by annotation id once, up front. Filtering ref_df with
# a boolean mask for every single annotation rescans the whole frame per
# lookup; the grouped dict yields the same lists (captions keep ref_df row
# order). The full ref_df is used here, so every caption of an annotation
# is collected.
refs_by_ann_id = (
    ref_df.groupby('ann_id', sort=False)['caption'].apply(list).to_dict()
)

for i, (images, targets, positions, lengths, ann_ids, filenames) in enumerate(loader):

    # Coarse progress indicator: one line every 100 batches
    if i % 100 == 0:
        print(i)

    # Collect the reference expressions for every annotation in the batch
    for ann_idx in range(images.size()[0]):

        ann_id = int(ann_ids[ann_idx])

        # Copy so every entry of `references` is an independent list; an id
        # without captions gives an empty list, as the mask-based lookup did.
        refs = list(refs_by_ann_id.get(ann_id, ()))
        references.append(refs)

0
100


In [8]:
# Pull a few batches from the loader, printing the index of each batch that
# passes the check. The unpacked names stay bound after the loop and are
# read by the following cell.
# NOTE: the break only fires after the batch with i == 3 has already been
# unpacked, so (given the loader yields at least four batches) `captions`
# etc. end up holding batch 3, not the last printed batch 2.
for i, (images, captions, positions, lengths, ann_ids, filenames) in enumerate(loader):    
    if i > 2:
        break
    print(i)

0
1
2


In [9]:
# Decode the caption at position 2 of the currently bound `captions` batch:
# turn each element into a plain Python value, then map it to its word.
idx = [token.item() for token in captions[2]]
' '.join(vocab.idx2word[word_id] for word_id in idx)

'<start> leftmost animal edge of pic <end> <pad> <pad> <pad>'

In [21]:
# Every caption that the loader's dataset frame holds for annotation 1719310.
df = loader.dataset.df
df[df['ann_id'] == 1719310]['caption'].tolist()

['the lady with the blue shirt', 'lady with back to us', 'blue shirt']