In [None]:
# bahdanau             - 0.5530967579684627
# luong                - 0.5891869092912313
# par-inject           - 0.34457920062888253
# par-inject 4-lstm    - 0.39452743009536984
# init-inject          - 0.27340657084194314
# transformer          - 0.37336743809031714

# Libraries

In [None]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/ImageCaptioning

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/13dGpwyY-c5FPJTEacGkw8XNTkbGVWT2D/ImageCaptioning


In [None]:
import re
import os
import cv2
import math
import glob
import spacy
import random
import numpy as np
import pandas as pd
from time import time
from PIL import Image
from tqdm import tqdm
import tensorflow as tf
from collections import Counter
import matplotlib.pyplot as plt
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction


import torch
import torch.nn.functional as F
from torchvision import transforms
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from torchvision.models import resnet50, ResNet50_Weights
from torch.nn import TransformerDecoder, TransformerDecoderLayer, TransformerEncoder, TransformerEncoderLayer



device = 'cuda' if torch.cuda.is_available() else 'cpu'

spacy_eng = spacy.load("en_core_web_sm")

# Utils

In [None]:
def map_target(in_caption):
    out_caption = list()
    for caption5s in in_caption:
        temp5 = list()
        for cap in caption5s:
            out_cap = list()
            for idx in cap:
                if idx == 0:
                    break
                else:
                    out_cap.append(dataset.vocab.index2word[idx])
            temp5.append(out_cap)
        out_caption.append(temp5)
    return out_caption


def map_predict(in_caption):
    out_caption = list()
    for idx in in_caption:
        if idx == 2:
            break
        else:
            out_caption.append(dataset.vocab.index2word[idx])
    return out_caption

# Dataset

In [None]:
class Vocabulary:
    def __init__(self, freq_threshold):
        self.index2word = {0:"<PAD>", 1:"<SOS>", 2:"<EOS>", 3:"<UNK>"}
        self.word2index = {v: k for k, v in self.index2word.items()}

        self.freq_threshold = freq_threshold

    def __len__(self):
        return len(self.index2word)

    @staticmethod
    def tokenize(text):
        return [token.text.lower() for token in spacy_eng.tokenizer(text)]

    def build_vocab(self, sentence_list):
        frequencies = Counter()
        idx = 4

        for sentence in sentence_list:
            for word in self.tokenize(sentence):
                frequencies[word] += 1

                #add the word to the vocab if it reaches minum frequecy threshold
                if frequencies[word] == self.freq_threshold:
                    self.word2index[word] = idx
                    self.index2word[idx] = word
                    idx += 1

    def numericalize(self, text):
        """ For each word in the text corresponding index token for that word form the vocab built as list """
        tokenized_text = self.tokenize(text)
        return [self.word2index[token] if token in self.word2index else self.word2index["<UNK>"] for token in tokenized_text ]

In [None]:
class ImageCaptioningDataset(Dataset):
    """Image Captioning dataset"""

    def __init__(self, csv_file, transform, freq_threshold=5):
        self.dataframe = pd.read_csv(csv_file)
        self.transform = transform

        self.images = sorted(os.listdir("dataset/Images"))
        self.captions = self.dataframe['caption']

        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocab(self.captions.tolist())


    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        captions = self.captions[5 * idx: 5 * idx + 5].tolist()
        image_path = self.images[idx]

        image = cv2.imread(f'dataset/Images/{image_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.transform:
            image = self.transform(image)

        caption_vec = []
        caption_vec.append(torch.full((50,), 0))
        for cap in captions:
            temp = self.vocab.numericalize(cap)
            caption_vec.append(torch.tensor(temp))

        targets = pad_sequence(caption_vec, batch_first=True, padding_value=0)

        return image, targets

In [None]:
dataset = ImageCaptioningDataset(
                    csv_file=f"dataset/captions.txt",
                    transform=transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Resize(232, antialias=True),
                    transforms.CenterCrop(224),
                    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                        std=[0.229, 0.224, 0.225])]))



loader = DataLoader(
                dataset=dataset,
                batch_size=32,
                num_workers=2)

# Bahdanau Atttention and Luong Atttention

## Model

### Encoder

In [None]:
class ImageFeatureExtractor(torch.nn.Module):
    def __init__(self):
        super(ImageFeatureExtractor, self).__init__()

        # Load pretrained model and remove last fc layer
        pretrained_model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
        self.model = torch.nn.Sequential(*list(pretrained_model.children())[:-2]).to(device)

        # Freeze layer
        for param in self.model.parameters():
            param.requires_grad = False

    def forward(self, images):
        # Preprocess images
        images = images.to(device)

        features = self.model(images)                                       # (batch_size, 2048, 7, 7)
        features = features.permute(0, 2, 3, 1)                             # (batch_size, 7, 7, 2048)
        features = features.view(features.size(0), -1, features.size(-1))   # (batch_size, 49, 2048)
        return features

### Attention


In [None]:
class Attention(torch.nn.Module):
    def __init__(self, attention_dim, encoder_dim, decoder_dim):
        super(Attention, self).__init__()

        self.attention_dim = attention_dim
        self.W_layer = torch.nn.Linear(decoder_dim, attention_dim).to(device)
        self.U_layer = torch.nn.Linear(encoder_dim, attention_dim).to(device)
        self.V_layer = torch.nn.Linear(attention_dim, 1).to(device)

    def forward(self, keys, query):
        U = self.U_layer(keys)     # (batch_size, num_layers, attention_dim)
        W = self.W_layer(query) # (batch_size, attention_dim)

        combined = torch.tanh(U + W.unsqueeze(1)) # (batch_size, num_layers, attention_dim)
        score = self.V_layer(combined)  # (batch_size, num_layers, 1)
        score = score.squeeze(2) # (batch_size, num_layers)

        weights = F.softmax(score, dim=1)    # (batch_size, num_layers)

        context = keys * weights.unsqueeze(2) # (batch_size, num_layers, feature_dim)
        context = context.sum(dim=1)   # (batch_size, feature_dim)
        return context, weights


### Decoder

In [None]:
class TextFeatureExtractor(torch.nn.Module):
    def __init__(self, vocab_size, embed_dim, attention_dim, encoder_dim, decoder_dim, drop_prob=0.3):
        super(TextFeatureExtractor, self).__init__()
        self.vocab_size = vocab_size

        # Embedding layer
        self.embedding = torch.nn.Embedding(vocab_size, embed_dim).to(device)

        # LSTM layer
        self.lstm = torch.nn.LSTMCell(input_size=embed_dim + encoder_dim,
                                      hidden_size=decoder_dim, bias=True).to(device)



        # Attention layer
        self.init_h = torch.nn.Linear(encoder_dim, decoder_dim).to(device)
        self.init_c = torch.nn.Linear(encoder_dim, decoder_dim).to(device)
        self.attention = Attention(attention_dim, encoder_dim, decoder_dim)
        self.attention_type = 'luong'

        if self.attention_type == 'bahdanau':
            self.fcn = torch.nn.Linear(decoder_dim, self.vocab_size).to(device)
        elif self.attention_type == 'luong':
            self.fcn = torch.nn.Linear(decoder_dim + encoder_dim, self.vocab_size).to(device)

        self.drop = torch.nn.Dropout(drop_prob)

    def init_hidden_state(self, features):
        mean_features = features.mean(dim=1)
        h = self.init_h(mean_features)
        c = self.init_c(mean_features)
        return h, c

    def forward_step(self, embed_word, features, hidden_state, cell_state):
        # Computation between features and hidden state to create a context vector
        context, attn_weight = self.attention(features, hidden_state)

        # Compute feature vector of input text
        lstm_input = torch.cat((embed_word, context), dim=1)

        hidden_state, cell_state = self.lstm(lstm_input, (hidden_state, cell_state))

        # Predicted vector
        output = None
        if self.attention_type == 'bahdanau':
            output = self.fcn(self.drop(hidden_state))
        elif self.attention_type == 'luong':
            input_linear = torch.cat((hidden_state, context), dim=1)
            output = self.fcn(self.drop(input_linear))

        return output, hidden_state, cell_state, attn_weight

    def forward(self, features, sequences):
        # Sequence
        sequence_length = len(sequences[0]) - 1
        sequences = sequences.to(device)

        # Prediction store
        preds = torch.zeros(sequences.shape[0], sequence_length, self.vocab_size).to(device)

        # Embedding sequence
        embeds = self.embedding(sequences)
        embeds = embeds.to(torch.float32)

        # Init hidden state
        hidden_state, cell_state = self.init_hidden_state(features)

        # Forward pass
        for idx in range(sequence_length):
            embed_word = embeds[:, idx]

            # Predicted vector
            output, hidden_state, cell_state, _ = self.forward_step(embed_word, features, hidden_state, cell_state)

            # Store output
            preds[:, idx] = output

        return preds

    def predict(self, feature, max_length, vocab=None):
        # Starting input
        word = torch.tensor(vocab.word2index['<SOS>']).view(1, -1).to(device)
        feature = feature.to(device)

        # Embedding sequence
        embeds = self.embedding(word)

        captions = []
        attention = []
        hidden_state, cell_state = self.init_hidden_state(feature)

        for idx in range(max_length):
            embed_word = embeds[:, 0]
            output, hidden_state, cell_state, attn_weight = self.forward_step(embed_word, feature, hidden_state, cell_state)
            attention.append(attn_weight.cpu().detach().numpy())

            # Predict word index
            predicted_word_idx = output.argmax(dim=1)

            # End if <EOS> appears
            if vocab.index2word[predicted_word_idx.item()] == "<EOS>":
                break

            captions.append(predicted_word_idx.item())

            # Send generated word as the next caption
            embeds = self.embedding(predicted_word_idx.unsqueeze(0))

        # Convert the vocab idx to words and return sentence
        return ' '.join([vocab.index2word[idx] for idx in captions]), attention


    def predict_batch(self, feature, max_length, vocab=None):
        # Starting input
        word = torch.full((feature.shape[0], 1), vocab.word2index['<SOS>']).to(device)
        feature = feature.to(device)

        # Embedding sequence
        embeds = self.embedding(word)
        predicted_captions = torch.zeros(max_length, feature.shape[0])
        hidden_state, cell_state = self.init_hidden_state(feature)

        for idx in range(max_length):
            embed_word = embeds[:, 0]
            output, hidden_state, cell_state, attn_weight = self.forward_step(embed_word, feature, hidden_state, cell_state)
            # Predict word index
            predicted_word_idx = output.argmax(dim=1)
            predicted_captions[idx, :] = predicted_word_idx.unsqueeze(0)[:, :]

            # Send generated word as the next caption
            embeds = self.embedding(predicted_word_idx.unsqueeze(1))
        predicted_captions = predicted_captions.permute(1, 0)
        return predicted_captions

### Captioner

In [None]:
class Captioner(torch.nn.Module):
    def __init__(self, vocab_size, embed_dim, attention_dim, encoder_dim, decoder_dim, vocab):
        super(Captioner, self).__init__()
        self.image_encoder =  ImageFeatureExtractor()
        self.text_decoder = TextFeatureExtractor(vocab_size, embed_dim, attention_dim,
                                                 encoder_dim, decoder_dim)
        self.vocab = vocab

    def forward(self, images, captions):

        features = self.image_encoder(images)
        output = self.text_decoder(features, captions)

        return output

    def generate_caption(self, image, max_length=20):
        image = image.to(device)
        feature = self.image_encoder(image)
        predicted_caption, attn_weights = self.text_decoder.predict(feature, max_length, self.vocab)

        return predicted_caption, attn_weights

    def generate_caption_batch(self, images, max_length=20):
        images = images.to(device)
        features = self.image_encoder(images)
        predicted_captions = self.text_decoder.predict_batch(features, max_length, self.vocab)

        return predicted_captions


## Test

In [None]:
def load_model(path):
    checkpoint = torch.load(path, map_location=torch.device('cpu'))

    model = Captioner(
        vocab_size=checkpoint['vocab_size'],
        embed_dim=checkpoint['embed_dim'],
        attention_dim=checkpoint['attention_dim'],
        encoder_dim=checkpoint['encoder_dim'],
        decoder_dim=checkpoint['decoder_dim'],
        vocab=checkpoint['vocab']
    )
    model.load_state_dict(checkpoint['model_state_dict'])
    return model

In [None]:
model = load_model("models/luong_attn/model_best.pth")
model.eval()
model = model.to(device)
print("Load model successfully")

Load model successfully


In [None]:
with torch.no_grad():
    list_of_references = []
    hypotheses = []
    bleu_score = []
    for idx, (image, target) in tqdm(enumerate(iter(loader))):
        image, target = image.to(device), target[:, 1:, :].tolist()


        mapped_target = map_target(target)
        list_of_references.extend(mapped_target)

        predicted_captions = model.generate_caption_batch(image).tolist()
        predicted_captions= list(map(map_predict, predicted_captions))

        hypotheses.extend(predicted_captions)
        score = corpus_bleu(list_of_references, hypotheses)
        bleu_score.append(score)

253it [07:45,  1.84s/it]


In [None]:
sum(bleu_score) / len(bleu_score)

0.5891869092912313

# Transformer

## Model

### Positional Encoding

In [None]:
class PositionalEncoding(torch.nn.Module):

    def __init__(self, d_model, dropout = 0.1, max_len = 50):
        super(PositionalEncoding, self).__init__()
        self.dropout = torch.nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        self.pe = torch.zeros(1, max_len, d_model).to(device)
        self.pe[0, :, 0::2] = torch.sin(position * div_term)
        self.pe[0, :, 1::2] = torch.cos(position * div_term)

    def forward(self, x):
        """
        Arguments:
            x: Tensor, shape ``[batch_size, seq_len, embedding_dim]``
        """
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)


### Encoder

In [None]:
class Encoder(torch.nn.Module):
    def __init__(self, encoder_dim, d_model):
        super(Encoder, self).__init__()

        # Load pretrained model and remove last fc layer
        pretrained_model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
        self.model = torch.nn.Sequential(*list(pretrained_model.children())[:-2]).to(device)

        # Freeze layer
        for param in self.model.parameters():
            param.requires_grad = False

        self.linear = torch.nn.Linear(encoder_dim, d_model).to(device)
    def forward(self, images):
        images = images.to(device)

        features = self.model(images)
        features = features.view(features.size(0), features.size(1), -1)
        features = features.permute(0, 2, 1)
        features = self.linear(features)
        return features # (batch_size, 49, d_model)

### Decoder

In [None]:
class Decoder(torch.nn.Module):

    def __init__(self, n_tokens, d_model,
                 n_heads, dim_forward,
                 n_layers, dropout = 0.2):
        super(Decoder, self).__init__()
        self.embedding = torch.nn.Embedding(n_tokens, d_model).to(device) # embedding layer
        self.pos_encoder = PositionalEncoding(d_model, dropout).to(device) # positional encoder

        decoder_layers = TransformerDecoderLayer(d_model, n_heads, dim_forward, dropout, batch_first=True) # encoder layer
        self.transformer_decoder = TransformerDecoder(decoder_layers, n_layers).to(device) # transformer encoder


        self.d_model = d_model # number of features
        self.linear = torch.nn.Linear(d_model, n_tokens).to(device) # last linear model for prediction

    def forward(self, features, captions, padding_mask, captions_mask = None):
        """
        Arguments:
            captions: Tensor, shape ``[batch_size, seq_len]``
            captions_mask: Tensor, shape ``[seq_len, seq_len]``

        Returns:
            output Tensor of shape ``[batch_size, seq_len, n_tokens]``
        """
        captions = captions.to(device)
        captions = self.embedding(captions)
        captions = captions * math.sqrt(self.d_model)
        captions = self.pos_encoder(captions)

        if captions_mask is None:
            """Generate a square causal mask for the sequence. The masked positions are filled with float('-inf').
            Unmasked positions are filled with float(0.0).
            """
            captions_mask = torch.nn.Transformer.generate_square_subsequent_mask(captions.size(1)).to(device)

        output = self.transformer_decoder(tgt=captions,
                                          memory=features,
                                          tgt_key_padding_mask=padding_mask,
                                          tgt_mask=captions_mask)
        output = self.linear(output)
        return output


    def predict(self, feature, max_length, vocab):
        word = torch.tensor([vocab.word2index['<SOS>']] + [0] * (max_length - 1)).view(1, -1).to(device)
        padding_mask = torch.Tensor([True] * max_length).view(1, -1).to(device)

        predicted_captions = []

        for i in range(max_length - 1):
            # Update the padding masks
            padding_mask[:, i] = False

            # Get the model prediction for the next word
            output = self.forward(feature, word, padding_mask)
            output = output[0, i]
            predicted_word_idx = output.argmax(dim=-1)
            predicted_captions.append(predicted_word_idx.item())
            word[:, i + 1] = predicted_word_idx.item()

            # End if <EOS> appears
            if vocab.index2word[predicted_word_idx.item()] == "<EOS>":
                break

        return ' '.join([vocab.index2word[idx] for idx in predicted_captions])

    def predict_batch(self, features, max_length, vocab):
        n_samples = features.size(0)

        word = torch.tensor([vocab.word2index['<SOS>']] + [0] * (max_length - 1)).view(1, -1).to(device)
        word = word.repeat(n_samples, 1)

        padding_mask = torch.Tensor([True] * max_length).view(1, -1).to(device)
        padding_mask = padding_mask.repeat(n_samples, 1)

        predicted_captions = [[] for _ in range(n_samples)]
        is_predicted = [False] * n_samples

        for i in range(max_length - 1):
            # Update the padding masks
            padding_mask[:, i] = False

            # Get the model prediction for the next word
            output = self.forward(features, word, padding_mask)
            output = output[torch.arange(n_samples), [i] * n_samples].clone()
            predicted_word_idx = output.argmax(dim=-1)

            for idx in range(n_samples):
                if is_predicted[idx]:
                    continue
                predicted_captions[idx].append(predicted_word_idx[idx].item())
                if predicted_word_idx[idx].item() == 2:
                    is_predicted[idx] = True
            if np.all(is_predicted):
                break

            word[torch.arange(n_samples), [i + 1] * n_samples] = predicted_word_idx.view(-1)
        return predicted_captions


### Captioner

In [None]:
class Captioner(torch.nn.Module):
    def __init__(self, n_tokens, d_model, n_heads, dim_forward, n_layers, encoder_dim, vocab):
        super(Captioner, self).__init__()
        self.encoder =  Encoder(encoder_dim=encoder_dim,
                                d_model=d_model)
        self.decoder = Decoder(n_tokens=n_tokens,
                               d_model=d_model,
                               n_heads=n_heads,
                               dim_forward=dim_forward,
                               n_layers=n_layers)

        self.vocab = vocab

    def forward(self, images, captions, padding_mask):

        features = self.encoder(images)
        output = self.decoder(features, captions, padding_mask)

        return output

    def generate_caption(self, image, max_length=50):
        image = image.to(device)
        feature = self.encoder(image)
        predicted_caption = self.decoder.predict(feature, max_length, self.vocab)
        return predicted_caption


    def generate_caption_batch(self, images, max_length=50):
        images = images.to(device)
        feature = self.encoder(images)
        predicted_captions = self.decoder.predict_batch(feature, max_length, self.vocab)
        return predicted_captions


## Test

In [None]:
def load_model(path):
    checkpoint = torch.load(path)
    model = Captioner(
        n_tokens=checkpoint['n_tokens'],
        d_model=checkpoint['d_model'],
        n_heads=checkpoint['n_heads'],
        dim_forward=checkpoint['dim_forward'],
        n_layers=checkpoint['n_layers'],
        encoder_dim=checkpoint['encoder_dim'],
        vocab=checkpoint['vocab'],

    )
    model.load_state_dict(checkpoint['model_state_dict'])
    return model


In [None]:
model = load_model("models/transformer2/model_best.pth")
model.eval()
print("Load model successfully")

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 107MB/s]


Load model successfully


In [None]:
with torch.no_grad():
    list_of_references = []
    hypotheses = []
    bleu_score = []
    for idx, (images, targets) in tqdm(enumerate(iter(loader))):
        images, targets = images.to(device), targets[:, 1:, :].tolist()


        mapped_target = map_target(targets)
        list_of_references.extend(mapped_target)

        predicted_captions = model.generate_caption_batch(images)
        predicted_captions= list(map(map_predict, predicted_captions))

        hypotheses.extend(predicted_captions)
        score = corpus_bleu(list_of_references, hypotheses)
        bleu_score.append(score)
        break

0it [00:02, ?it/s]

[['a', 'woman', 'in', 'a', 'pink', 'dress', 'is', 'sitting', 'on', 'a', 'wooden', 'bench', '.'], ['two', 'dogs', 'are', 'playing', 'with', 'a', 'ball', 'on', 'the', 'grass', '.'], ['a', 'little', 'girl', 'is', 'sitting', 'on', 'a', 'yellow', 'and', 'yellow', 'tent', '.'], ['a', 'man', 'laying', 'on', 'a', 'bench', 'with', 'a', 'dog', 'on', 'his', 'back', '.'], ['a', 'man', 'with', 'a', 'mohawk', 'and', 'a', 'hat', 'is', 'wearing', 'a', 'hat', '.'], ['a', 'little', 'girl', 'in', 'a', 'red', 'shirt', 'is', 'swinging', 'a', 'rope', '.'], ['a', 'dog', 'running', 'in', 'the', 'grass', 'with', 'a', 'ball', 'in', 'its', 'mouth', '.'], ['a', 'white', 'dog', 'with', 'a', 'red', 'collar', 'is', 'running', 'on', 'the', 'beach', '.'], ['a', 'little', 'boy', 'is', 'standing', 'on', 'a', 'sidewalk', 'with', 'his', 'arms', 'outstretched', '.'], ['a', 'black', 'and', 'white', 'dog', 'is', 'jumping', 'over', 'a', 'log', '.'], ['a', 'white', 'dog', 'is', 'running', 'through', 'the', 'snow', '.'], ['a', 




In [None]:
sum(bleu_score) / len(bleu_score)

0.29821697109124284

# Par-Inject

## Model

### Encoder

In [None]:
class Encoder(torch.nn.Module):
    def __init__(self, encoder_dim):
        super(Encoder, self).__init__()
        self.encoder_dim = encoder_dim

        # Load pretrained model and remove last fc layer
        pretrained_model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
        self.model = torch.nn.Sequential(*list(pretrained_model.children())[:-1]).to(device)

        # Freeze layer
        for param in self.model.parameters():
            param.requires_grad = False

        # Add a linear layer add the end of model
        self.linear = torch.nn.Linear(2048, self.encoder_dim).to(device)
        self.drop = torch.nn.Dropout(0.3)

    def forward(self, images):
        # Preprocess images
        images = images.to(device)

        # Forward pass
        features = self.model(images)                     # (batch_size, 2048, 1, 1)
        features = features.view(images.shape[0], 1, -1)  # (batch_size, 1, 2048)
        features = self.linear(self.drop(features))       # (batch_size, 1, 512)
        features = features.squeeze(1)                    # (batch_size, 512)
        return features

### Decoder

In [None]:
class Decoder(torch.nn.Module):
    def __init__(self, vocab_size, embed_dim, encoder_dim, decoder_dim):
        super(Decoder, self).__init__()
        self.vocab_size = vocab_size
        self.encoder_dim = encoder_dim
        self.decoder_dim = decoder_dim


        # Embedding layer
        self.embedding = torch.nn.Embedding(vocab_size, embed_dim).to(device)

        # LSTM layer
        self.lstm = torch.nn.LSTMCell(input_size=embed_dim + encoder_dim, hidden_size=decoder_dim).to(device)

        # Linear layer
        self.linear1 = torch.nn.Linear(decoder_dim, decoder_dim).to(device)
        self.linear2 = torch.nn.Linear(decoder_dim, vocab_size).to(device)
        self.drop = torch.nn.Dropout(0.3)


    def init_hidden_state(self, features):
        hidden = torch.zeros(features.size(0), self.decoder_dim).to(device)
        cell = torch.zeros(features.size(0), self.decoder_dim).to(device)
        return hidden, cell


    def forward_step(self, embed_words, features, hidden, cell):
        lstm_input = torch.cat((embed_words, features), dim=1)
        hidden, cell = self.lstm(lstm_input, (hidden, cell))

        decoded = self.linear1(hidden)
        decoded = self.drop(decoded)
        output = self.linear2(decoded)

        return output, hidden, cell

    def forward(self, features, sequences):

        sequence_length = len(sequences[0]) - 1
        preds = torch.zeros(sequences.shape[0], sequence_length, self.vocab_size)

        sequences = sequences.to(device)
        preds = preds.to(device)

        # Embedding sequence
        embeds = self.embedding(sequences)
        embeds = embeds.to(torch.float32)

        hidden, cell = self.init_hidden_state(features)

        # Forward pass
        for idx in range(sequence_length):
            # Compute feature vector of input text
            embed_words = embeds[:, idx]

            output, hidden, cell = self.forward_step(embed_words, features, hidden, cell)

            # Predicted vector
            preds[:, idx] = output

        return preds

    def predict(self, feature, max_length=20, vocab=None):
        # Starting input
        word = torch.tensor(vocab.word2index['<SOS>']).view(1, -1)
        word = word.to(device)
        feature = feature.to(device)

        # Embedding sequence
        embeds = self.embedding(word)

        captions = []

        hidden, cell = self.init_hidden_state(feature)


        for idx in range(max_length):
            embed_word = embeds[:, 0]
            output, hidden, cell = self.forward_step(embed_word, feature, hidden, cell)
            # Predict word index
            predicted_word_idx = output.argmax(dim=1)
            captions.append(predicted_word_idx.item())

            # End if <EOS> appears
            if vocab.index2word[predicted_word_idx.item()] == "<EOS>":
                break

            # Send generated word as the next caption
            embeds = self.embedding(predicted_word_idx.unsqueeze(0))

        # Convert the vocab idx to words and return sentence
        return ' '.join([vocab.index2word[idx] for idx in captions])


    def predict_batch(self, features, max_length=20, vocab=None):
        word = torch.full((features.shape[0], 1), vocab.word2index['<SOS>']).to(device)
        features = features.to(device)

        # Embedding sequence
        embeds = self.embedding(word)
        predicted_captions = torch.zeros(max_length, features.shape[0])
        hidden, cell = self.init_hidden_state(features)


        for idx in range(max_length):
            embed_words = embeds[:, 0]
            output, hidden, cell = self.forward_step(embed_words, features, hidden, cell)

            # Predict word index
            predicted_word_idx = output.argmax(dim=1)
            predicted_captions[idx, :] = predicted_word_idx.unsqueeze(0)[:, :]

            # Send generated word as the next caption
            embeds = self.embedding(predicted_word_idx.unsqueeze(1))
        predicted_captions = predicted_captions.permute(1, 0)
        return predicted_captions

### Captioner

In [None]:
class Captioner(torch.nn.Module):
    def __init__(self, vocab_size,  vocab, embed_dim, encoder_dim, decoder_dim):
        super().__init__()
        self.encoder =  Encoder(encoder_dim)
        self.decoder = Decoder(vocab_size, embed_dim, encoder_dim, decoder_dim)
        self.vocab = vocab

    def forward(self, images, captions):

        image_fv = self.encoder(images)
        output = self.decoder(image_fv, captions)

        return output

    def generate_caption(self, image, max_length=20):
        feature = self.encoder(image)
        predicted_caption = self.decoder.predict(feature, max_length, self.vocab)

        return predicted_caption

    def generate_caption_batch(self, images, max_length=20):
        features = self.encoder(images)
        predicted_captions = self.decoder.predict_batch(features, max_length, self.vocab)

        return predicted_captions


## Test

In [None]:
def load_model(path):
    checkpoint = torch.load(path)
    model = Captioner(
        vocab_size=checkpoint['vocab_size'],
        vocab=checkpoint['vocab'],
        embed_dim=checkpoint['embed_dim'],
        encoder_dim=checkpoint['encoder_dim'],
        decoder_dim=checkpoint['decoder_dim'],
    )
    model.load_state_dict(checkpoint['model_state_dict'])
    return model

In [None]:
model = load_model("models/par_inject/model_best.pth")
model.eval()
print("Load model successfully")

Load model successfully


In [None]:
with torch.no_grad():
    list_of_references = []
    hypotheses = []
    bleu_score = []
    for idx, (image, target) in tqdm(enumerate(iter(loader))):
        image, target = image.to(device), target[:, 1:, :].tolist()


        mapped_target = map_target(target)
        list_of_references.extend(mapped_target)

        predicted_captions = model.generate_caption_batch(image).tolist()
        predicted_captions = list(map(map_predict, predicted_captions))

        hypotheses.extend(predicted_captions)
        score = corpus_bleu(list_of_references, hypotheses)
        bleu_score.append(score)

253it [08:20,  1.98s/it]


In [None]:
sum(bleu_score) / len(bleu_score)

0.34457920062888253

# Init-Inject

## Model

In [None]:
class Encoder(torch.nn.Module):
    def __init__(self, encoder_dim):
        super(Encoder, self).__init__()
        self.encoder_dim = encoder_dim

        # Load pretrained model and remove last fc layer
        pretrained_model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
        self.model = torch.nn.Sequential(*list(pretrained_model.children())[:-1]).to(device)

        # Freeze layer
        for param in self.model.parameters():
            param.requires_grad = False

        # Add a linear layer add the end of model
        self.linear = torch.nn.Linear(2048, self.encoder_dim).to(device)
        self.drop = torch.nn.Dropout(0.3)

    def forward(self, images):
        # Preprocess images
        images = images.to(device)

        # Forward pass
        features = self.model(images)                     # (batch_size, 2048, 1, 1)
        features = features.view(images.shape[0], 1, -1)  # (batch_size, 1, 2048)
        features = self.linear(self.drop(features))       # (batch_size, 1, 512)
        features = features.squeeze(1)                    # (batch_size, 512)
        return features

In [None]:
class Decoder(torch.nn.Module):
    def __init__(self, vocab_size, embed_dim, encoder_dim, decoder_dim):
        super(Decoder, self).__init__()
        self.vocab_size = vocab_size
        self.encoder_dim = encoder_dim
        self.decoder_dim = decoder_dim


        # Embedding layer
        self.embedding = torch.nn.Embedding(vocab_size, embed_dim).to(device)

        # LSTM layer
        self.lstm = torch.nn.LSTMCell(input_size=embed_dim, hidden_size=decoder_dim).to(device)

        # Linear layer
        self.linear1 = torch.nn.Linear(decoder_dim, decoder_dim).to(device)
        self.linear2 = torch.nn.Linear(decoder_dim, vocab_size).to(device)
        self.drop = torch.nn.Dropout(0.3)

    def init_hidden_state(self, features):
        hidden = features
        cell = features
        return hidden, cell

    def forward_step(self, embed_words, features, hidden, cell):
        hidden, cell = self.lstm(embed_words, (hidden, cell))

        decoded = self.linear1(hidden)
        decoded = self.drop(decoded)
        output = self.linear2(decoded)

        return output, hidden, cell

    def forward(self, features, sequences):

        sequence_length = len(sequences[0]) - 1
        preds = torch.zeros(sequences.shape[0], sequence_length, self.vocab_size)

        sequences = sequences.to(device)
        preds = preds.to(device)

        # Embedding sequence
        embeds = self.embedding(sequences)
        embeds = embeds.to(torch.float32)

        hidden, cell = self.init_hidden_state(features)

        # Forward pass
        for idx in range(sequence_length):
            # Compute feature vector of input text
            embed_words = embeds[:, idx]

            output, hidden, cell = self.forward_step(embed_words, features, hidden, cell)

            # Predicted vector
            preds[:, idx] = output

        return preds

    def predict(self, feature, max_length=20, vocab=None):
        # Starting input
        word = torch.tensor(vocab.word2index['<SOS>']).view(1, -1).to(device)
        feature = feature.to(device)

        # Embedding sequence
        embeds = self.embedding(word)

        captions = []

        hidden, cell = self.init_hidden_state(feature)

        for idx in range(max_length):
            embed_word = embeds[:, 0]
            output, hidden, cell = self.forward_step(embed_word, feature, hidden, cell)
            # Predict word index
            predicted_word_idx = output.argmax(dim=1)
            captions.append(predicted_word_idx.item())

            # End if <EOS> appears
            if vocab.index2word[predicted_word_idx.item()] == "<EOS>":
                break

            # Send generated word as the next caption
            embeds = self.embedding(predicted_word_idx.unsqueeze(0))

        # Convert the vocab idx to words and return sentence
        return ' '.join([vocab.index2word[idx] for idx in captions])

    def predict_batch(self, features, max_length=20, vocab=None):
        word = torch.full((features.shape[0], 1), vocab.word2index['<SOS>']).to(device)
        features = features.to(device)

        # Embedding sequence
        embeds = self.embedding(word)
        predicted_captions = torch.zeros(max_length, features.shape[0])
        hidden, cell = self.init_hidden_state(features)


        for idx in range(max_length):
            embed_words = embeds[:, 0]
            output, hidden, cell = self.forward_step(embed_words, features, hidden, cell)

            # Predict word index
            predicted_word_idx = output.argmax(dim=1)
            predicted_captions[idx, :] = predicted_word_idx.unsqueeze(0)[:, :]

            # Send generated word as the next caption
            embeds = self.embedding(predicted_word_idx.unsqueeze(1))
        predicted_captions = predicted_captions.permute(1, 0)
        return predicted_captions

In [None]:
class Captioner(torch.nn.Module):
    def __init__(self, vocab_size,  vocab, embed_dim, encoder_dim, decoder_dim):
        super().__init__()
        self.encoder =  Encoder(encoder_dim)
        self.decoder = Decoder(vocab_size, embed_dim, encoder_dim, decoder_dim)
        self.vocab = vocab

    def forward(self, images, captions):

        image_fv = self.encoder(images)
        output = self.decoder(image_fv, captions)

        return output

    def generate_caption(self, image, max_length=20):
        feature = self.encoder(image)
        predicted_caption = self.decoder.predict(feature, max_length, self.vocab)

        return predicted_caption

    def generate_caption_batch(self, images, max_length=20):
        features = self.encoder(images)
        predicted_captions = self.decoder.predict_batch(features, max_length, self.vocab)

        return predicted_captions

## Test

In [None]:
def load_model(path):
    checkpoint = torch.load(path)
    model = Captioner(
        vocab_size=checkpoint['vocab_size'],
        vocab=checkpoint['vocab'],
        embed_dim=checkpoint['embed_dim'],
        encoder_dim=checkpoint['encoder_dim'],
        decoder_dim=checkpoint['decoder_dim'],
    )
    model.load_state_dict(checkpoint['model_state_dict'])
    return model

In [None]:
model = load_model("models/init_inject/model_best.pth")
model.eval()
print("Load model successfully")

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 151MB/s]


Load model successfully


In [None]:
with torch.no_grad():
    list_of_references = []
    hypotheses = []
    bleu_score = []
    for idx, (image, target) in tqdm(enumerate(iter(loader))):
        image, target = image.to(device), target[:, 1:, :].tolist()


        mapped_target = map_target(target)
        list_of_references.extend(mapped_target)

        predicted_captions = model.generate_caption_batch(image).tolist()
        predicted_captions = list(map(map_predict, predicted_captions))

        hypotheses.extend(predicted_captions)
        score = corpus_bleu(list_of_references, hypotheses)
        bleu_score.append(score)

253it [09:32,  2.26s/it]


In [None]:
sum(bleu_score) / len(bleu_score)

0.27340657084194314