In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import torch
import torchvision.transforms as transforms
import pickle

from argparse import Namespace

from data_utils import get_karpathy_split, refcoco_splits
from data_loader import get_caption_loader, COCOCaptionDataset, get_caption_loader, get_reg_loader, RefCOCOClusters

from build_vocab import Vocabulary

In [2]:
# Notebook-wide configuration, exposed as attributes on `args` so the cells
# below can read e.g. `args.image_dir` the same way a CLI script would.

# Dataset / vocabulary locations, all relative to the working directory.
data_paths = dict(
    splits_path='./data/splits/karpathy/caption_datasets/',
    caption_path='./data/captions/',
    refcoco_path='./data/refcoco/',
    image_dir='./data/images/mscoco/',
    vocab_path='./data/coco_vocab.pkl',
)

# Image size used by the transform and the settings handed to the loaders.
loader_settings = dict(
    crop_size=224,
    batch_size=1,
    num_workers=1,
)

args = Namespace(**data_paths, **loader_settings)

In [3]:
# Load the pickled vocabulary object that is handed to every loader below.
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# it on a vocab file you built yourself.
with open(args.vocab_path, 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)

# Image preprocessing shared by all loaders in this notebook:
# resize to a square of side `args.crop_size`, random horizontal flip,
# tensor conversion, then per-channel normalisation with the widely used
# ImageNet mean / std values.
# NOTE(review): the random flip is also applied by the RefCOCO loaders
# further down; confirm that flipping is intended there and for the 'val'
# split, since it makes the printed samples non-deterministic.
target_size = (args.crop_size, args.crop_size)
transform = transforms.Compose([
    transforms.Resize(target_size),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=(0.485, 0.456, 0.406),
        std=(0.229, 0.224, 0.225),
    ),
])

# COCO DataLoader

In [4]:
# Caption table for the Karpathy splits (presumably a DataFrame, given that
# it is passed on as `data_df` -- confirm in data_utils).
caps_df = get_karpathy_split(splits_path=args.splits_path, caps_path=args.caption_path)

# Word-level caption loader over the 'train' and 'restval' splits.
data_loader = get_caption_loader(
    # what to load
    data_df=caps_df,
    split=['train', 'restval'],
    image_dir=args.image_dir,
    # how to encode it
    decoding_level='word',
    vocab=vocab,
    transform=transform,
    # batching
    batch_size=args.batch_size,
    shuffle=True,
    num_workers=args.num_workers,
    drop_last=False,
)

# Smoke test: pull a single batch, print every field, and stop.
field_names = ('images', 'targets', 'lengths', 'img_ids', 'filenames')
for i, batch in enumerate(data_loader):
    images, targets, lengths, img_ids, filenames = batch
    print(i)
    for field_name, field_value in zip(field_names, (images, targets, lengths, img_ids, filenames)):
        print(field_name, field_value)
    break

0
images tensor([[[[ 1.5982,  1.6153,  1.6153,  ..., -0.1486, -0.0629,  0.1254],
          [ 1.5982,  1.6153,  1.6153,  ..., -0.1828, -0.1999, -0.0287],
          [ 1.5982,  1.5982,  1.6153,  ..., -0.3369, -0.3541, -0.3712],
          ...,
          [ 1.0502,  1.0159,  1.0502,  ...,  0.9474,  0.9646,  0.9474],
          [ 0.9988,  0.9817,  1.0159,  ...,  0.9646,  0.9817,  1.0331],
          [ 0.9646,  0.9303,  0.9817,  ...,  0.9646,  0.9646,  1.0331]],

         [[ 1.7808,  1.7983,  1.7983,  ...,  0.5903,  0.6779,  0.7479],
          [ 1.7808,  1.7983,  1.7983,  ...,  0.5028,  0.5553,  0.6254],
          [ 1.7808,  1.7808,  1.7983,  ...,  0.5028,  0.4678,  0.4853],
          ...,
          [ 0.7654,  0.7304,  0.8004,  ...,  0.6779,  0.6779,  0.6954],
          [ 0.7304,  0.7304,  0.7304,  ...,  0.7304,  0.7654,  0.7654],
          [ 0.7304,  0.7129,  0.7304,  ...,  0.7129,  0.7129,  0.7479]],

         [[ 2.0300,  2.0474,  2.0474,  ...,  1.4897,  1.5071,  1.5942],
          [ 2.0300,  

# RefCOCO DataLoader

In [5]:
# refcoco_splits returns an indexable result; only its first element is used
# here (presumably the RefCOCO DataFrame -- confirm in data_utils).
refcoco_frames = refcoco_splits(args.refcoco_path)
ref_df = refcoco_frames[0]

# Word-level referring-expression loader over the 'train' split.
data_loader = get_reg_loader(
    # what to load
    data_df=ref_df,
    split=['train'],
    image_dir=args.image_dir,
    # how to encode it
    decoding_level='word',
    vocab=vocab,
    transform=transform,
    # batching: fixed at 2 for this cell instead of args.batch_size (which is 1)
    batch_size=2,
    shuffle=True,
    num_workers=args.num_workers,
    drop_last=False,
)

# Smoke test: pull a single batch, print every field, and stop.
field_names = ('images', 'targets', 'positions', 'lengths', 'ann_ids', 'filenames')
for i, batch in enumerate(data_loader):
    images, targets, positions, lengths, ann_ids, filenames = batch
    print(i)
    for field_name, field_value in zip(field_names, (images, targets, positions, lengths, ann_ids, filenames)):
        print(field_name, field_value)
    break

0
images tensor([[[[ 1.1529,  1.1700,  1.1872,  ..., -2.0665, -2.0837, -2.0837],
          [ 1.1700,  1.1529,  1.1700,  ..., -2.0837, -2.1008, -2.1008],
          [ 1.1872,  1.1872,  1.2557,  ..., -2.1008, -2.1179, -2.1179],
          ...,
          [-2.0665, -2.0494, -2.0837,  ..., -0.6794, -0.3369, -0.1314],
          [-2.0152, -2.0494, -2.1179,  ..., -0.6623, -0.3712, -0.1828],
          [-1.9809, -2.0494, -2.1179,  ..., -0.6623, -0.3541, -0.1999]],

         [[ 1.4132,  1.4307,  1.4307,  ..., -1.9132, -1.9307, -1.9307],
          [ 1.4307,  1.4132,  1.4132,  ..., -1.9307, -1.9657, -2.0007],
          [ 1.3957,  1.3957,  1.4657,  ..., -1.9832, -2.0182, -2.0182],
          ...,
          [-1.9482, -1.9132, -1.9482,  ..., -0.7577, -0.3025, -0.0749],
          [-1.8957, -1.9132, -1.9657,  ..., -0.7227, -0.3375, -0.1099],
          [-1.8431, -1.8957, -1.9482,  ..., -0.7227, -0.3200, -0.1275]],

         [[ 2.0474,  2.0997,  2.0997,  ..., -1.4733, -1.4907, -1.4907],
          [ 2.0648,  

# RefCOCO Cluster Loader

In [6]:
# Iterable over the RefCOCO 'val' split, built directly from the
# RefCOCOClusters class rather than through a get_*_loader helper.
cluster_loader = RefCOCOClusters(
    data_df=ref_df,
    split=['val'],
    image_dir=args.image_dir,
    decoding_level='word',
    vocab=vocab,
    transform=transform,
)

# Smoke test: fetch the first item, print every field, and stop.
field_names = ('sent_ids', 'ann_ids', 'images', 'positions', 'targets', 'filenames')
for i, cluster in enumerate(cluster_loader):
    sent_ids, ann_ids, images, positions, targets, filenames = cluster
    for field_name, field_value in zip(field_names, (sent_ids, ann_ids, images, positions, targets, filenames)):
        print(field_name, field_value)
    break

sent_ids [222, 226]
ann_ids [1537681, 710706]
images [tensor([[[-0.3712, -0.4226, -0.4739,  ..., -0.7650, -0.8164, -0.8335],
         [-0.3712, -0.4226, -0.4568,  ..., -0.7650, -0.8164, -0.8335],
         [-0.3883, -0.4054, -0.4397,  ..., -0.7822, -0.8335, -0.8507],
         ...,
         [ 0.1254,  0.3481,  0.4508,  ...,  0.6734,  0.6734,  0.6734],
         [ 0.3138,  0.3994,  0.3823,  ...,  0.6906,  0.6906,  0.6906],
         [ 0.4166,  0.4337,  0.3481,  ...,  0.6906,  0.6906,  0.6906]],

        [[-1.3004, -1.3179, -1.3179,  ..., -0.6352, -0.7052, -0.7402],
         [-1.3004, -1.3179, -1.3179,  ..., -0.6352, -0.7052, -0.7402],
         [-1.3179, -1.3354, -1.3179,  ..., -0.6527, -0.7227, -0.7577],
         ...,
         [ 0.1877,  0.4853,  0.6254,  ...,  0.8354,  0.8354,  0.8354],
         [ 0.4678,  0.6078,  0.6604,  ...,  0.8529,  0.8529,  0.8529],
         [ 0.6078,  0.6779,  0.6779,  ...,  0.8529,  0.8529,  0.8529]],

        [[-1.6999, -1.6999, -1.6999,  ..., -0.5147, -0.5844, -