In [34]:
from pycocotools.coco import COCO

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

import torchvision.models as models
import torchvision.transforms as transforms
from torchvision.datasets import CocoCaptions

from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab

from PIL import Image

import numpy as np

from tqdm import tqdm

from typing import Any, Callable, Optional, Tuple, List

import time

In [20]:
coco_dataset = CocoCaptions(
    root="../data/images/train2014", 
    annFile="../data/annotations/captions_train2014.json", 
    target_transform=lambda cap: cap[0]
)

loading annotations into memory...
Done (t=0.47s)
creating index...
index created!


In [12]:
tokenizer = get_tokenizer('basic_english')
counter = Counter()
for (img, caption) in tqdm(coco_dataset):
    counter.update(tokenizer(caption))
vocab = Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

100%|███████████████████████████████████████████████████████████████████████████| 82783/82783 [10:44<00:00, 128.51it/s]


In [22]:
batch_size = 4

In [24]:
bos_index = vocab['<bos>']
eos_index = vocab['<eos>']
pad_index = vocab['<pad>']

In [23]:
[vocab[token] for token in tokenizer('a herd of giraffes <eos>')]

[4, 203, 6, 231, 3]

In [10]:
coco = COCO('../data/annotations/captions_train2014.json')

loading annotations into memory...
Done (t=0.46s)
creating index...
index created!


In [17]:
coco.loadAnns(coco.getAnnIds(57870)[0])

[{'image_id': 57870,
  'id': 787980,
  'caption': 'A restaurant has modern wooden tables and chairs.'}]

In [11]:
coco.loadAnns([57870])

[{'image_id': 523700,
  'id': 57870,
  'caption': 'A group of bike riders waiting to load bikes on a truck'}]

In [25]:
coco_dataset[0][0].size()

torch.Size([3, 224, 224])

In [27]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [28]:
def generate_batch(data_batch):
    img_batch, cap_batch = [], []
    for (img, cap) in data_batch:
        img_batch.append(img.unsqueeze(0))
        cap_batch.append(torch.cat([torch.tensor([bos_index]), cap, torch.Tensor([eos_index])]))
    
    img_batch = torch.cat(img_batch, dim=0)
    cap_batch = pad_sequence(cap_batch, batch_first=True, padding_value=pad_index)
    
    return img_batch.to(device), cap_batch.to(device)

In [50]:
resnet = models.resnet50(pretrained=True)

<generator object Module.children at 0x000001F521198510>

In [51]:
list(resnet.children())

[Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False),
 BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
 ReLU(inplace=True),
 MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False),
 Sequential(
   (0): Bottleneck(
     (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
     (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
     (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
     (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
     (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
     (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
     (relu): ReLU(inplace=True)
     (downsample): Sequential(
       (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
       (1): BatchNorm2d(256, eps=1e-05, momentum

In [60]:
class Encoder(nn.Module):
    
    def __init__(self, hidden_size):
        super(Encoder, self).__init__()
        
        self.hidden_size = hidden_size
        
        resnet = models.resnet50(pretrained=True)
        
        for params in resnet.parameters(): # will not be fine-tuning resnet
            params.requires_grad = False
        
        self.resnet = nn.Sequential(*(list(resnet.children())[:-1])) # will not be using last layer of resnet since that layer outputs a 1000-D vector for imagenet classification
        self.embedding = nn.Linear(2048, hidden_size) # add a linear layer to get a feature vector to pass into the decoder
        
    def forward(self, inputs):
        output = self.resnet(inputs)
        output = output.view(output.size(0), -1)
        output = self.embedding(output)
        return output

In [67]:
class Decoder(nn.Module):
    
    def __init__(self, vocab_size, hidden_size):
        super(Decoder, self).__init__()
        
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=hidden_size)
        self.rnn = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, batch_first=True) # word embeddings will be size hidden_size, hidden states will be size hidden_size, and batch_first=True since we want input dimensions (batch, seq, hidden)
        self.out = nn.Linear(in_features=hidden_size, out_features=vocab_size)
    
    def forward(self, inputs, encoder_out):
        """
        inputs: (batch, seq_len) set of captions, each entry is an integer index into the vocabulary
        encoder_out: (batch, hidden_size) vector of CNN encoder output
        """
        input_embdedded = self.embedding(inputs)
        output = self.rnn(input_embdedded, (encoder_out, encoder_out))
        output = self.out(output)
        return output

In [70]:
def train(batch_size, learning_rate, num_epochs, hidden_size):
    train_dataset = CocoCaptions(
    root="../data/images/train2014", 
    annFile="../data/annotations/captions_train2014.json", 
    transform=transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ]),
    target_transform=lambda cap: torch.Tensor([vocab[token] for token in tokenizer(cap[0])]),
    )
    
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        collate_fn=generate_batch
    )
    
    val_dataset = CocoCaptions(
    root="../data/images/val2014", 
    annFile="../data/annotations/captions_val2014.json", 
    transform=transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ]),
    target_transform=lambda cap: torch.Tensor([vocab[token] for token in tokenizer(cap[0])]),
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        collate_fn=generate_batch
    )
    
    encoder = Encoder(hidden_size).to(device)
    decoder = Decoder(len(vocab), hidden_size).to(device)
    
    critereon = nn.CrossEntropyLoss(ignore_index=pad_index)
    optimizer = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=learning_rate)
    
    for epoch in tqdm(range(num_epochs)):
        epoch_loss = 0
        
        start_time = time.time()
        
        for images, captions in train_loader:
            print(captions)
            optimizer.zero_grad()
            features = encoder(images)
            output = decoder(captions, features)

            output = output[1:].view(-1, output.shape[-1])
            captions = captions[1:].view(-1)
            
            loss = critereon(output, captions)
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
        
        epoch_loss /= len(train_loader)
        
        end_time = time.time()
        
        elapsed_time = end_time - start_time
        
        print('Epoch: {} | Time: {}s'.format(epoch, elapsed_time))
        print('\tTrain Loss: {}'.format(epoch_loss))
    
    return encoder, decoder

In [71]:
encoder, decoder = train(128, 0.005, 20, 128)

loading annotations into memory...
Done (t=0.46s)
creating index...
index created!
loading annotations into memory...
Done (t=0.23s)
creating index...
index created!


  0%|                                                                                           | 0/20 [00:00<?, ?it/s]

tensor([[2.0000e+00, 8.5400e+02, 6.0000e+00,  ..., 1.0000e+00, 1.0000e+00,
         1.0000e+00],
        [2.0000e+00, 4.0000e+00, 1.0600e+02,  ..., 1.0000e+00, 1.0000e+00,
         1.0000e+00],
        [2.0000e+00, 4.0000e+00, 4.9600e+02,  ..., 1.0000e+00, 1.0000e+00,
         1.0000e+00],
        ...,
        [2.0000e+00, 2.1000e+01, 1.2010e+03,  ..., 1.0000e+00, 1.0000e+00,
         1.0000e+00],
        [2.0000e+00, 4.0000e+00, 2.7000e+02,  ..., 3.0000e+00, 1.0000e+00,
         1.0000e+00],
        [2.0000e+00, 4.0000e+00, 1.0000e+02,  ..., 1.0000e+00, 1.0000e+00,
         1.0000e+00]])


  0%|                                                                                           | 0/20 [00:08<?, ?it/s]


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)