In [140]:
import json
import os

import nltk
import numpy as np
import pandas as pd
import torch
import torch.utils.data as data
from PIL import Image
from torch.utils.data import DataLoader
from torchvision import transforms

from model import EncoderCNN, DecoderRNN
from vocabulary import Vocabulary


# python3 -m pip install pycocotools

In [109]:
# Notebook configuration: everything a reader might want to tune lives here.

# Set the minimum word count threshold.
vocab_threshold = 5

# Specify the batch size.
batch_size = 5

# Root directory of the COCO download. Set the COCO_DATA_DIR environment variable
# to point somewhere else; the default is the original author's local path.
base_data_path = os.environ.get('COCO_DATA_DIR', '/Users/vignesh/Desktop/code/cerebro_etl_stuff/coco')
train_annotations_path = base_data_path + "/annotations/captions_train2014.json"
val_annotations_path = base_data_path + "/annotations/captions_val2014.json"
# Keeps its trailing slash: the preprocessing routine concatenates file names onto it.
images_path = base_data_path + "/train2014/"

In [110]:
# Build the caption vocabulary with the configured word-count threshold.
# `vocab_from_file=True` asks for the cached vocabulary (the cell output reports
# it was loaded from vocab.pkl) rather than building it from the annotations.
vocab = Vocabulary(vocab_threshold, vocab_from_file=True) # TODO(review): consider passing the annotations file explicitly instead of relying on the cached file

Vocabulary successfully loaded from vocab.pkl file!


In [111]:
# row preprocessing function: processing images to tensors and captions to their index vectors (using vocab)
def row_preprocessing_routine(row, to_root_path, **kwargs):
    """Turn one metadata row into an ``(image_tensor, caption_tensor)`` pair.

    Parameters
    ----------
    row : mapping-like (e.g. one row of the metadata DataFrame)
        Must provide ``row["file_name"]`` (image file name) and
        ``row["captions"]`` (the caption text for that image).
    to_root_path : str
        Prefix that is concatenated directly with the file name, so it must
        already end with a path separator.
    **kwargs
        ``vocab`` (required): callable that maps a token to its index and
        exposes ``start_word`` and ``end_word`` attributes.

    Returns
    -------
    tuple
        ``(image_tensor, caption_tensor)``. The image is resized, randomly
        cropped to 224x224, randomly flipped and normalized. The caption is a
        1-D ``torch.long`` tensor laid out as ``[start, token..., end]``.

    Raises
    ------
    KeyError
        If ``vocab`` is not supplied as a keyword argument.
    """
    # Function-scope import retained from the original cell.
    from torchvision import transforms

    # Fail before touching the disk if the vocabulary was not supplied.
    vocab = kwargs["vocab"]

    # The crop and flip are random, so repeated calls on the same row differ.
    transform_train = transforms.Compose([
        transforms.Resize(256),                          # smaller edge of image resized to 256
        transforms.RandomCrop(224),                      # get 224x224 crop from random location
        transforms.RandomHorizontalFlip(),               # horizontally flip image with probability=0.5
        transforms.ToTensor(),                           # convert the PIL Image to a tensor
        transforms.Normalize((0.485, 0.456, 0.406),      # normalize image for pre-trained model
                             (0.229, 0.224, 0.225)),
    ])

    input_image_path = to_root_path + str(row["file_name"])
    # Close the file handle deterministically; the converted copy is independent of it.
    with Image.open(input_image_path) as image_file:
        image = image_file.convert("RGB")
    image_tensor = transform_train(image)

    tokens = nltk.tokenize.word_tokenize(str(row["captions"]).lower())
    caption = [vocab(vocab.start_word)]
    caption.extend(vocab(token) for token in tokens)
    caption.append(vocab(vocab.end_word))
    # Build the integer tensor directly; torch.Tensor(...).long() would go
    # through float32 first, which cannot represent every large index exactly.
    caption_tensor = torch.tensor(caption, dtype=torch.long)
    return image_tensor, caption_tensor

In [112]:
# Build the metadata DataFrame: one row per (image, caption) pair.
# JSON is UTF-8 by specification, so do not depend on the platform default encoding.
with open(train_annotations_path, encoding="utf-8") as f:
    data_json = json.load(f)

# Not referenced by any visible cell; kept so cells that may rely on it still run.
feature_names = ["id", "file_name", "height", "width", "caption", "date_captured"]

# Column order of the resulting frame.
metadata_columns = ['id', 'file_name', 'height', 'width', 'captions', 'date_captured']
dataset = {column: [] for column in metadata_columns}
# Not referenced by any visible cell; kept so cells that may rely on it still run.
dataset_modified = {column: [] for column in metadata_columns}

# Group caption strings by the id of the image they describe.
annotations = {}
annotations_list = data_json['annotations']
for annotation in annotations_list:
    annotations.setdefault(annotation["image_id"], []).append(annotation["caption"])

# Fields copied verbatim from each image record; 'captions' comes from the grouping above.
image_fields = ('id', 'file_name', 'height', 'width', 'date_captured')
for image_info in data_json['images']:
    # .get(): an image without any caption contributes no rows instead of raising KeyError.
    for caption in annotations.get(image_info['id'], []):
        for field in image_fields:
            dataset[field].append(image_info[field])
        dataset['captions'].append(caption)

pd_df = pd.DataFrame(dataset)


In [113]:
# Example: preprocess a single row. Use `.iloc[0]` (a Series); `.iloc[[0]]` would pass a one-row DataFrame.
# image_tensor, caption_tensor = row_preprocessing_routine(pd_df.iloc[0], images_path, vocab=vocab)

In [114]:
# Work on the first 20 (image, caption) rows only.
small_df = pd_df.iloc[:20]
small_df

Unnamed: 0,id,file_name,height,width,captions,date_captured
0,57870,COCO_train2014_000000057870.jpg,480,640,A restaurant has modern wooden tables and chairs.,2013-11-14 16:28:13
1,57870,COCO_train2014_000000057870.jpg,480,640,A long restaurant table with rattan rounded ba...,2013-11-14 16:28:13
2,57870,COCO_train2014_000000057870.jpg,480,640,a long table with a plant on top of it surroun...,2013-11-14 16:28:13
3,57870,COCO_train2014_000000057870.jpg,480,640,A long table with a flower arrangement in the ...,2013-11-14 16:28:13
4,57870,COCO_train2014_000000057870.jpg,480,640,A table is adorned with wooden chairs with blu...,2013-11-14 16:28:13
5,384029,COCO_train2014_000000384029.jpg,429,640,A man preparing desserts in a kitchen covered ...,2013-11-14 16:29:45
6,384029,COCO_train2014_000000384029.jpg,429,640,A chef is preparing and decorating many small ...,2013-11-14 16:29:45
7,384029,COCO_train2014_000000384029.jpg,429,640,A baker prepares various types of baked goods.,2013-11-14 16:29:45
8,384029,COCO_train2014_000000384029.jpg,429,640,a close up of a person grabbing a pastry in a ...,2013-11-14 16:29:45
9,384029,COCO_train2014_000000384029.jpg,429,640,Close up of a hand touching various pastries.,2013-11-14 16:29:45


In [115]:
# Run the preprocessing routine on every row (axis=1). `result_type='expand'`
# spreads the returned (image_tensor, caption_tensor) tuple over columns 0 and 1.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html
ret = small_df.apply(lambda row: row_preprocessing_routine(row, images_path, vocab=vocab), axis=1, result_type='expand')

In [85]:
# Type of the first row's image entry (first row, column 0).
type(ret.iloc[0, 0])

torch.Tensor

In [86]:
# Preview only the first rows; every cell holds a full tensor, so rendering the
# whole frame is slow and bloats the saved notebook.
ret.head()

Unnamed: 0,0,1
0,"[[[tensor(-1.3987), tensor(-1.4158), tensor(-1...","[tensor(0), tensor(3), tensor(535), tensor(105..."
1,"[[[tensor(-0.7308), tensor(-0.7479), tensor(-0...","[tensor(0), tensor(3), tensor(890), tensor(535..."
2,"[[[tensor(-0.1486), tensor(-0.6794), tensor(-1...","[tensor(0), tensor(3), tensor(890), tensor(112..."
3,"[[[tensor(0.5022), tensor(0.4679), tensor(0.45...","[tensor(0), tensor(3), tensor(890), tensor(112..."
4,"[[[tensor(2.2147), tensor(2.2147), tensor(2.19...","[tensor(0), tensor(3), tensor(112), tensor(130..."


In [116]:
# Persist the preprocessed (image, caption) frame so later cells can reload it.
# The path is relative, so the file lands in the kernel's working directory.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_pickle.html
pkl_data_file = "./small_tensor_data.pkl"
ret.to_pickle(pkl_data_file)

In [4]:
# Round-trip check: reload the frame written above.
# NOTE: unpickling can execute arbitrary code; only read pickle files you created yourself.
unpickled_df = pd.read_pickle(pkl_data_file)

In [132]:
class GeneralPytorchDataset(data.Dataset):
    """Map-style dataset over a pickled DataFrame of pre-processed samples.

    The pickled frame must have a column labelled ``0`` holding image tensors
    and a column labelled ``1`` holding 1-D caption tensors.

    Attributes
    ----------
    data_df : pandas.DataFrame
        The unpickled frame.
    caption_lengths : list of int
        Length of each caption, in row order.
    batch_size : int
        Number of indices returned by :meth:`get_indices`.
    """

    def __init__(self, pickled_data_file, batch_size):
        """Load the frame from ``pickled_data_file`` and record caption lengths."""
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        self.data_df = pd.read_pickle(pickled_data_file)
        self.caption_lengths = [caption.size(dim=0) for caption in self.data_df[1]]
        self.batch_size = batch_size

    def __getitem__(self, index):
        """Return the ``(image, caption)`` pair stored at row position ``index``.

        Rows are addressed by position, so the frame's index labels do not have
        to be ``0..n-1``. An out-of-range position raises ``IndexError``, which
        is also what lets plain ``for sample in dataset`` iteration stop.
        """
        image = self.data_df[0].iloc[index]
        caption = self.data_df[1].iloc[index]
        return image, caption

    def get_indices(self):
        """Sample ``batch_size`` row positions whose captions share one length.

        A caption length is drawn at random (each row's length counts once, so
        common lengths are more likely), then positions with that length are
        drawn with replacement, so a position may appear more than once.

        Raises
        ------
        ValueError
            If the dataset is empty (``np.random.choice`` rejects an empty input).
        """
        selected_length = np.random.choice(self.caption_lengths)
        # Vectorized comparison instead of a Python-level list of booleans.
        all_indices = np.where(np.asarray(self.caption_lengths) == selected_length)[0]
        indices = list(np.random.choice(all_indices, size=self.batch_size))
        return indices

    def __len__(self):
        """Return the number of samples (rows) in the dataset."""
        return len(self.data_df)

In [136]:
# Load the pickled samples and print each sample's image shape and caption length.
coco_dataset = GeneralPytorchDataset(pkl_data_file, batch_size)
for i in range(len(coco_dataset)):
    image, caption = coco_dataset[i]
    print(i, image.shape, caption.shape[0])

0 torch.Size([3, 224, 224]) 11
1 torch.Size([3, 224, 224]) 12
2 torch.Size([3, 224, 224]) 16
3 torch.Size([3, 224, 224]) 14
4 torch.Size([3, 224, 224]) 13
5 torch.Size([3, 224, 224]) 13
6 torch.Size([3, 224, 224]) 12
7 torch.Size([3, 224, 224]) 11
8 torch.Size([3, 224, 224]) 14
9 torch.Size([3, 224, 224]) 11
10 torch.Size([3, 224, 224]) 13
11 torch.Size([3, 224, 224]) 10
12 torch.Size([3, 224, 224]) 14
13 torch.Size([3, 224, 224]) 12
14 torch.Size([3, 224, 224]) 12
15 torch.Size([3, 224, 224]) 11
16 torch.Size([3, 224, 224]) 12
17 torch.Size([3, 224, 224]) 12
18 torch.Size([3, 224, 224]) 13
19 torch.Size([3, 224, 224]) 14


In [137]:
# Pick one batch worth of sample indices that share a caption length, then wrap
# them in a batch sampler so the loader yields them as a single batch.
indices = coco_dataset.get_indices()
initial_sampler = data.sampler.SubsetRandomSampler(indices=indices)
initial_batch_sampler = data.sampler.BatchSampler(
    sampler=initial_sampler,
    batch_size=coco_dataset.batch_size,
    drop_last=False,
)
data_loader = data.DataLoader(
    dataset=coco_dataset,
    num_workers=0,
    batch_sampler=initial_batch_sampler,
)

In [138]:
from collections import Counter

# Tally how many captions there are of each length, most frequent first.
counter = Counter(data_loader.dataset.caption_lengths)
lengths = counter.most_common()
for value, count in lengths:
    print('value: %2d --- count: %5d' % (value, count))


value: 12 --- count:     6
value: 11 --- count:     4
value: 14 --- count:     4
value: 13 --- count:     4
value: 16 --- count:     1
value: 10 --- count:     1


In [139]:
# Draw a fresh set of same-length sample indices and swap them into the existing
# loader's batch sampler, then pull the resulting batch.
indices = data_loader.dataset.get_indices()
new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
data_loader.batch_sampler.sampler = new_sampler
# NOTE: `images` and `captions` from the last iteration stay defined after the
# loop; the encoder and decoder cells further down read them.
for batch in data_loader:
    images, captions = batch[0], batch[1]
    print('images.shape:', images.shape)
    print('captions.shape:', captions.shape)


images.shape: torch.Size([5, 3, 224, 224])
captions.shape: torch.Size([5, 13])


In [121]:
# Reload the pickled frame for ad-hoc inspection of the caption lengths.
# NOTE: unpickling can execute arbitrary code; only read pickle files you created yourself.
data_df = pd.read_pickle(pkl_data_file)

In [126]:
# Length of every caption tensor in column 1, in row order.
[caption.size(dim=0) for caption in data_df[1]]

[11,
 12,
 16,
 14,
 13,
 13,
 12,
 11,
 14,
 11,
 13,
 10,
 14,
 12,
 12,
 11,
 12,
 12,
 13,
 14]

In [141]:
# Specify the dimensionality of the image embedding.
embed_size = 256

# Initialize the encoder. (We can add additional arguments if necessary.)
# The recorded output shows ResNet-50 weights being downloaded into the torch
# hub cache when this cell ran.
encoder = EncoderCNN(embed_size)

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /Users/vignesh/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████████████| 97.8M/97.8M [00:03<00:00, 29.2MB/s]


In [142]:
# Encode the image batch. `images` is whatever the DataLoader loop cell above
# left behind from its last iteration, so that cell must be run first.
features = encoder(images)


In [143]:
# Report the encoder output's type and shape.
print(f"type(features): {type(features)}")
print(f"features.shape: {features.shape}")

type(features): <class 'torch.Tensor'>
features.shape: torch.Size([5, 256])


In [144]:
# Sanity check: the encoder must return one `embed_size`-wide row per batch image.
# Short-circuit `and` replaces the bitwise `&` that was applied to booleans.
assert features.shape[0] == batch_size and features.shape[1] == embed_size, "The shape of the encoder output is incorrect."


In [145]:
# Size passed to the decoder as its hidden dimension.
hidden_size = 512
# Number of entries in the loaded vocabulary.
vocab_size = len(vocab)

In [146]:
# Initialize the decoder with the embedding size, hidden size and vocabulary size.
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)


In [147]:
# Run the decoder on the encoded images and their captions. `captions` comes from
# the last batch of the DataLoader loop cell above, like `images` did.
outputs = decoder(features, captions)


In [148]:
# Report the decoder output's type and shape.
print(f"type(outputs): {type(outputs)}")
print(f"outputs.shape: {outputs.shape}")


type(outputs): <class 'torch.Tensor'>
outputs.shape: torch.Size([5, 13, 8855])


In [149]:
# Sanity check: decoder output must be (batch_size, caption length, vocab_size).
# Short-circuit `and` replaces the bitwise `&` that was applied to booleans.
assert outputs.shape[0] == batch_size and outputs.shape[1] == captions.shape[1] and outputs.shape[2] == vocab_size, "The shape of the decoder output is incorrect."
