In [1]:
import pandas as pd
import torch
import torchvision.transforms as transforms
import pickle

from data_utils import get_karpathy_split, refcoco_splits
from data_loader import get_caption_loader, COCOCaptionDataset, get_reg_loader

from build_vocab import Vocabulary

In [2]:
# Directory handed to both data loaders below as `image_dir`.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
image_dir = '/home/simeon/Dokumente/Code/Data/COCO/'

# Side length used for the square `Resize` in the preprocessing pipeline.
crop_size = 224

In [6]:
# Restore the pickled COCO vocabulary object from disk.
# NOTE(review): unpickling runs arbitrary code, so only load files you trust.
# Presumably the stored object is a `build_vocab.Vocabulary`, which would be
# why that class is imported at the top of the notebook -- confirm.
with open('data/coco_vocab.pkl', 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)

In [7]:
# Load the Karpathy caption split used by the caption loader below.
# Both locations live under the same directory as `image_dir`, so they are
# derived from it instead of repeating the absolute path; the resulting
# strings are identical to the previously hard-coded ones.
caps_df = get_karpathy_split(
    splits_path=image_dir + 'splits/karpathy/caption_datasets/',
    caps_path=image_dir,
)

In [6]:
# Preprocessing pipeline handed to the data loaders:
#   1. resize every image to a fixed crop_size x crop_size square,
#   2. convert it to a tensor,
#   3. normalise each of the three channels with the given mean / std.
# NOTE(review): the constants look like the usual ImageNet channel statistics --
# confirm they match the encoder that consumes these images.
transform = transforms.Compose([
    transforms.Resize(size=(crop_size, crop_size)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=(0.485, 0.456, 0.406),
        std=(0.229, 0.224, 0.225),
    ),
])

In [10]:
# Build the caption loader over the 'train' split of the Karpathy frame.
# Shuffling is off and incomplete final batches are kept, so successive runs
# should see the same batches in the same order (assuming the helper simply
# forwards these options to a DataLoader -- TODO confirm in data_loader.py).
loader = get_caption_loader(
    decoding_level='word',
    split=['train'],
    data_df=caps_df,
    image_dir=image_dir,
    vocab=vocab,
    transform=transform,
    batch_size=20,
    shuffle=False,
    num_workers=2,
    drop_last=False
)

In [11]:
# Smoke test: pull a few batches from the caption loader and print their index.
# The break is checked only after a batch has been unpacked, so a fourth batch
# (index 3) is fetched as well and is what `images`, `captions` and `lengths`
# refer to once the loop ends.
for batch_idx, batch in enumerate(loader):
    images, captions, lengths, _, _ = batch
    if batch_idx >= 3:
        break
    print(batch_idx)

0
1
2


In [12]:
# Turn the third caption of the most recently unpacked batch back into text:
# first extract plain Python values from the sequence, then map each one
# through the vocabulary's index-to-word table.
idx = [token.item() for token in captions[2]]
' '.join(vocab.idx2word[word_id] for word_id in idx)

'<start> the top of a kitchen cabinet covered with brass pots and pans <end> <pad> <pad> <pad>'

In [3]:
# Restore the pickled RefCOCO vocabulary object from disk.
# This rebinds `vocab`: any vocabulary loaded under that name earlier is
# replaced from here on.
# NOTE(review): unpickling runs arbitrary code, so only load files you trust.
with open('data/refcoco_vocab.pkl', 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)

In [4]:
# Keep only the first element of what `refcoco_splits` returns; it is used as
# `data_df` for the REG loader below.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
ref_df = refcoco_splits(
    '/home/simeon/Dokumente/Code/Data/RefCOCO/refcoco/'
)[0]

In [7]:
# Build the referring-expression (REG) loader over the 'train' split of the
# RefCOCO frame. This rebinds `loader`, replacing any loader created earlier
# under the same name. Images are read from the same `image_dir` and go
# through the same `transform` as for the caption loader.
loader = get_reg_loader(
    decoding_level='word',
    split=['train'],
    data_df=ref_df,
    image_dir=image_dir,
    vocab=vocab,
    transform=transform,
    batch_size=20,
    shuffle=False,
    num_workers=2,
    drop_last=False
)

In [8]:
# Smoke test: pull a few batches from the REG loader and print their index.
# Each batch unpacks into six fields. The break is checked only after a batch
# has been unpacked, so a fourth batch (index 3) is fetched as well and is
# what the unpacked names refer to once the loop ends.
for batch_idx, batch in enumerate(loader):
    images, captions, positions, lengths, ann_ids, filenames = batch
    if batch_idx >= 3:
        break
    print(batch_idx)

0
1
2


In [9]:
# Turn the third expression of the most recently unpacked batch back into
# text: extract plain Python values from the sequence, then map each one
# through the vocabulary's index-to-word table.
idx = [token.item() for token in captions[2]]
' '.join(vocab.idx2word[word_id] for word_id in idx)

'<start> leftmost animal edge of pic <end> <pad> <pad> <pad>'