# **Building the DataLoader**

We build the dataloader to enable easier and more systematic access of our images and captions for the actual CNN and RNN models.

First, we import all the necessary libraries for our DataLoader.

In [None]:
# Mount Google Drive at /content/drive so the files under MyDrive are
# reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# %cd /content/drive/MyDrive/
# !unzip image_captions.zip

In [None]:
import os
import re

import matplotlib.pyplot as plt
import pandas as pd
import spacy
import torch
from nltk.translate.bleu_score import sentence_bleu
from PIL import Image
from sklearn.model_selection import train_test_split
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, Subset
from torchvision import transforms
from tqdm import tqdm


Here we create two variables that store the locations of the image folder and the caption file from the dataset. `spacy_eng` is loaded for later use in tokenising the captions; its English language data lets it tokenise better around punctuation.



In [None]:
# Work from the dataset directory on the mounted Drive. Relative paths used
# later (e.g. the checkpoint files written by torch.save) resolve against it.
os.chdir('/content/drive/MyDrive/image_captions')

In [None]:
# spaCy English pipeline; only its tokenizer is used (see vocab.tokenizer_eng).
spacy_eng = spacy.load("en_core_web_sm")
# Folder holding the image files referenced by the caption file.
img_folder = r"/content/drive/MyDrive/image_captions/Images"
# Caption file, read with pd.read_csv; it must provide "image" and "caption" columns.
caption_file = r"/content/drive/MyDrive/image_captions/captions.txt"

I had originally created a function `read_data` to read the data from the captions.txt file and store the image IDs and captions in a dictionary, with each image ID as a key pointing to a list of all 5 captions for that image. I had to discard that and use `pd.read_csv` instead, because I got a size-mismatch error in the `forward` of the `DecoderRNN` class while concatenating the features and embeddings tensors (also mentioned in that block). With all 5 captions for each image provided together as the dataset for the DataLoader, the captions came out as a tensor of shape (5, batch_size, max_len), but for the concatenation I needed a tensor of shape (batch_size, max_len).

In [None]:
# # Extracting data from Captions file
# def read_data(caption_file):
#     img_captions = {}

#     with open(caption_file, "r") as f:
#         next(f)
#         cap = f.readlines()

#     # Mapping image ID's with their captions
#     for line in cap:

#         line = line.split(',')
#         img_id, caption = line[0], line[1:]
#         caption = ",".join(caption)                 # if any caption had commas, combining the split caption
#         if img_id not in img_captions:
#             img_captions[img_id] = []
#         img_captions[img_id].append(caption)

#     return img_captions

Now we create another function `caption_cleaner` which is used to preprocess our stored captions. It lowercases all characters, removes all punctuations, all non-alphabetical characters, and all multiple spaces from the captions. It also adds the `<SOS>` and the `<EOS>` token at the start and end of each caption respectively, representing the start and the end of sequence.

In [None]:
# Preprocess caption for easier implementation
def caption_cleaner(captions):
    """Normalise captions in place and wrap each one in sequence markers.

    Every caption is lowercased, stripped of every character that is not a
    lowercase letter or whitespace (punctuation, quotes, digits, ...), has
    runs of whitespace collapsed to a single space, and is then wrapped as
    ``"<SOS> ... <EOS>"``.

    Args:
        captions: Mutable, integer-indexable collection of strings
            (e.g. a list, or a pandas Series with the default index).

    Returns:
        The same ``captions`` object, with each entry replaced by its
        cleaned version.
    """
    for i, raw in enumerate(captions):
        text = raw.lower()
        # str.replace() only matches literal text, so real regex substitution
        # is required. Whitespace is kept here, otherwise the words would be
        # glued together before the collapse step below.
        text = re.sub(r"[^a-z\s]", "", text)
        text = re.sub(r"\s+", " ", text).strip()
        captions[i] = f"<SOS> {text} <EOS>"
    return captions

### **Creating a Vocabulary**
We create a vocabulary to store all of the words used in the captions into a dictionary, thus forming a mini-dictionary(not just a python dictionary, but an actual one) of words. We used this vocabulary to assign indexes to each and every word, allowing us to convert these words to their specific indexes and those indexes back into words using the two dictionaries `string_to_index` and `index_to_string`.

We do this by creating the class **vocab**, which contains the two dictionaries, and pass to it the frequency threshold, which specifies the minimum number of times a word must appear in the captions for it to be included in the vocabulary. We have a `__len__()` method which gives the length of the vocabulary, the `tokenizer_eng` method (a static method, because it does not need any attributes of the vocab class, only the text provided) to tokenize the captions, and the `numericalize` method, which converts a caption into a list of numbers that identifies its words for the captioning model. Lastly, the `build_vocab` method actually builds the vocabulary: it takes each word from each caption, checks whether it meets the required `freq_thresh`, and then adds it to the dictionaries with a unique index.

In [None]:
class vocab():
    """Word <-> index vocabulary built from caption text.

    Indices 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>``. Other words are added by ``build_vocab`` once
    they have been seen at least ``freq_thresh`` times.
    """

    def __init__(self, freq_thresh):
        """Create a vocabulary holding only the four special tokens.

        Args:
            freq_thresh: Minimum number of occurrences a word needs before
                ``build_vocab`` assigns it an index.
        """
        self.string_to_index = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.index_to_string = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.freq_thresh = freq_thresh

    def __len__(self):
        """Return the number of entries, special tokens included."""
        return len(self.string_to_index)

    @staticmethod
    def tokenizer_eng(text):
        """Split ``text`` into a list of token strings with the spaCy tokenizer."""
        # Return the token *text*: spaCy Token objects are distinct per
        # occurrence, so using them as dict keys would never accumulate a
        # count and would never match the string keys of the vocabulary.
        # NOTE(review): confirm the tokenizer keeps "<SOS>"/"<EOS>" as single
        # tokens; if it splits them, they will be numericalized as <UNK>.
        return [token.text for token in spacy_eng.tokenizer(text)]

    def build_vocab(self, captions):
        """Count words in ``captions`` and register the frequent ones.

        Args:
            captions: Iterable of caption strings.
        """
        freq = {}
        # Continue after the entries that already exist (4 on a fresh vocab).
        idx = len(self.string_to_index)

        for sentence in captions:
            for word in self.tokenizer_eng(sentence):
                freq[word] = freq.get(word, 0) + 1

                # Register each word exactly once, when it reaches the
                # threshold; later occurrences must not consume new indices.
                if freq[word] >= self.freq_thresh and word not in self.string_to_index:
                    self.string_to_index[word] = idx
                    self.index_to_string[idx] = word
                    idx += 1

    def numericalize(self, text):
        """Convert ``text`` to a list of indices, using ``<UNK>`` for unknown tokens."""
        unk_idx = self.string_to_index["<UNK>"]
        return [
            self.string_to_index.get(token, unk_idx)
            for token in self.tokenizer_eng(text)
        ]

### **Creating the Dataset**
Finally, we create the dataset which allows us to access all the images with their respective captions, all in form of pytorch tensors.

The **dataset8k** class inherits from the `torch.utils.data.Dataset` class. We provide the path to the image folder, the caption file, an optional transform, and a default `freq_thresh` of 5 for creating the vocabulary. It reads the data from the captions file, cleans the captions, and then builds a vocabulary from them. Each row of the captions file is one image–caption pair, so an image with 5 captions appears as 5 separate items. In the `__getitem__` method, it loads the image for the requested row, applies the transform if one was provided, and applies the `numericalize` method to that row's caption.

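The captions are not padded inside the dataset itself. Padding with the `<PAD>` token (whose `string_to_index` value is 0) is applied later, per batch, by the collate function, up to the length of the longest caption in that batch. The maximum length of 50 words only applies when generating captions with the model.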

In [None]:
class dataset8k(Dataset):
    """Image/caption dataset backed by a CSV file with "image" and "caption" columns.

    Each row of the CSV is one sample: the image file name (relative to
    ``img_folder``) and one caption for it.
    """

    def __init__(self, img_folder, caption_file, transform = None, freq_thresh = 5):
        """Load the caption table, clean the captions and build the vocabulary.

        Args:
            img_folder: Directory containing the image files.
            caption_file: Path of the CSV file read with ``pd.read_csv``.
            transform: Optional callable applied to each loaded PIL image.
            freq_thresh: Frequency threshold handed to ``vocab``.
        """
        self.img_folder = img_folder
        self.transform = transform
        self.dataset = pd.read_csv(caption_file)

        # Image ids and their (cleaned) captions, row-aligned.
        self.imgs = self.dataset["image"]
        self.captions = caption_cleaner(self.dataset["caption"])

        # Vocabulary over every cleaned caption.
        self.vocab = vocab(freq_thresh)
        self.vocab.build_vocab(list(self.captions))

    def __len__(self):
        """Return the number of rows in the caption table."""
        return len(self.dataset)

    def __getitem__(self, index):
        """Return ``(image, caption_tensor)`` for row ``index``.

        The image is opened as RGB and passed through ``transform`` when one
        is set; the caption is converted to a tensor of vocabulary indices.
        """
        img_id = self.imgs[index]
        caption = self.captions[index]

        img_path = os.path.join(self.img_folder, img_id)
        img = Image.open(img_path).convert("RGB")
        if self.transform:
            img = self.transform(img)

        token_ids = self.vocab.numericalize(caption)
        return img, torch.tensor(token_ids)

We define `MyCollate` class to arrange the images and captions in form of sequential batches, while also applying formatting such as padding each caption upto the max length of caption for each batch, using `pad_sequence` function in torch utils. `MyCollate` is passed as an argument to the `DataLoader` which then actually performs the batching.

In [None]:
class MyCollate():
    """Collate callable: stacks the images and pads the captions of a batch."""

    def __init__(self, pad_idx):
        """Remember ``pad_idx``, the value used to pad shorter captions."""
        self.pad_idx = pad_idx

    def __call__(self, batch):
        """Turn a list of ``(image, caption)`` samples into two batch tensors.

        Returns:
            ``(imgs, cap)`` where ``imgs`` has a new leading batch dimension
            and ``cap`` is sequence-first (``batch_first=False``), padded
            with ``pad_idx`` to the longest caption in the batch.
        """
        # Stacking along a new dim 0 == unsqueeze(0) on each image + cat.
        imgs = torch.stack([sample[0] for sample in batch], dim=0)
        token_seqs = [sample[1] for sample in batch]
        cap = pad_sequence(
            token_seqs,
            batch_first=False,
            padding_value=self.pad_idx,
        )
        return imgs, cap

### **Creating The Loader Function**
To finish off the DataLoader, we create the `get_loader` function, which takes all the parameters of the dataset class along with some hyperparameters such as the batch size and the number of workers. We also provide the val and test splits (the fractions of the dataset forming the val set and the test set) to split the dataset into train, val and test sets. It creates the dataset, splits it into the three sets, and then creates the train_loader, val_loader and test_loader, each with the given `batch_size` and `num_workers`.

It returns a tuple: first the train, val and test loaders as a dictionary of length 3 with keys --> `'train', 'val', 'test'`, and second the underlying dataset (which carries the vocabulary).

In [None]:
def get_loader(
        img_folder,
        caption_file,
        transform,
        val_split,
        test_split,
        batch_size = 32,
        num_workers = 2,
        shuffle = True,
        pin_memory = False,
):
    """Build the dataset and disjoint train/val/test DataLoaders over it.

    Args:
        img_folder: Directory containing the images.
        caption_file: Caption CSV passed to ``dataset8k``.
        transform: Transform applied to every image.
        val_split: Fraction of the whole dataset used for validation.
        test_split: Fraction of the whole dataset used for testing.
        batch_size: Batch size of every loader.
        num_workers: Worker processes of every loader.
        shuffle: Whether the loaders shuffle their subset.
        pin_memory: Forwarded to every ``DataLoader``.

    Returns:
        ``(loaders, dataset)`` where ``loaders`` is a dict with the keys
        ``'train'``, ``'val'`` and ``'test'``.
    """
    dataset = dataset8k(img_folder, caption_file, transform = transform)
    pad_idx = dataset.vocab.string_to_index["<PAD>"]

    holdout_split = val_split + test_split
    train_idx, holdout_idx = train_test_split(
        list(range(len(dataset))), test_size=holdout_split
    )
    # Split the held-out *indices themselves* (not range(len(...)), which
    # would point back into the training rows). test_size is rescaled so
    # that test_split stays a fraction of the whole dataset.
    val_idx, test_idx = train_test_split(
        holdout_idx, test_size=test_split / holdout_split
    )

    collate = MyCollate(pad_idx)
    loaders = {
        name: DataLoader(
            dataset=Subset(dataset, indices),
            batch_size=batch_size,
            num_workers=num_workers,
            shuffle=shuffle,
            pin_memory=pin_memory,
            collate_fn=collate,
        )
        for name, indices in (
            ('train', train_idx),
            ('val', val_idx),
            ('test', test_idx),
        )
    }
    return loaders, dataset

This is sample code to check that the dataloader works, with an example transform. It is important to note that the CNN encoder we will be using later is based on the VGG16 architecture, which accepts images of size (3, 224, 224). So any transform we apply should include a step that resizes or crops the image to (224, 224):

In [None]:
# transform = transforms.Compose(
#     [
#     transforms.Resize((224,224)),
#     transforms.ToTensor(),]
# )

# dataloader, _ = get_loader(
#     img_folder=img_folder,
#     caption_file=caption_file,
#     transform = transform,
#     val_split=0.2,
#     test_split = 0.1,
# )

# for idx, (imgs, captions) in enumerate(dataloader['train']):
#     print(imgs.shape)
#     print(captions.shape)

---

# **Creating The Model**

For an image captioning model, we need to first identify the image features and then use those image features to identify the features of the caption, to calculate the loss function between the generated caption and the actual caption for backward propagation. So we need to implement two models, one to identify the image features and another to generate the captions from the features, and then link those two together to form the final caption.

### **EncoderCNN**

To identify image features, we use a CNN model, which specialises in extracting image features. I have implemented the CNN based on the VGG16 architecture, which is one of the most popular image recognition models.

I have not used a pretrained VGG16 model, but implemented its structure with all the layers from its paper. I have removed its last SoftMax layer, which is used for classification, because we just need the features of the image, not to classify them.

NB: In the `forward` method of the model, we reshape the feature maps after applying all the conv layers, because the fully connected linear layers that follow need each image's features flattened into one dimension.

In [None]:
class EncoderCNN(nn.Module):
    """VGG16-style image encoder (with batch norm) producing ``embed_size`` features.

    Five convolutional blocks (2, 2, 3, 3, 3 conv layers; each block ends in a
    2x2 max-pool) are followed by fully connected layers. The first linear
    layer expects ``7*7*512`` inputs, i.e. 224x224 images after five halvings.
    """

    def __init__(self, embed_size, num_classes = 1000):
        """Build the network.

        Args:
            embed_size: Size of the returned feature vector.
            num_classes: Width of the third fully connected layer, which
                feeds the final projection to ``embed_size``.
        """
        super().__init__()

        self.block1 = self._conv_block(3, 64, 2)
        self.block2 = self._conv_block(64, 128, 2)
        self.block3 = self._conv_block(128, 256, 3)
        self.block4 = self._conv_block(256, 512, 3)
        self.block5 = self._conv_block(512, 512, 3)

        self.fc_layers = nn.Sequential(
            nn.Linear(7*7*512, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096, num_classes),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(num_classes, embed_size),
        )

    @staticmethod
    def _conv_block(in_channels, out_channels, num_convs):
        """Return ``num_convs`` x (3x3 conv, batch norm, ReLU) followed by a 2x2 max-pool."""
        layers = []
        channels = in_channels
        for _ in range(num_convs):
            layers.append(
                nn.Conv2d(channels, out_channels, kernel_size=3, stride=1, padding=1)
            )
            layers.append(nn.BatchNorm2d(out_channels))
            layers.append(nn.ReLU())
            channels = out_channels
        layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
        return nn.Sequential(*layers)

    def forward(self, image):
        """Encode a batch of images into feature vectors of size ``embed_size``."""
        features = image
        for block in (self.block1, self.block2, self.block3, self.block4, self.block5):
            features = block(features)
        # Flatten everything but the batch dimension for the linear layers.
        features = features.reshape(features.size(0), -1)
        return self.fc_layers(features)

### **DecoderRNN**
For the caption generation, we use an RNN model, as it is made for analysing sequential data, such as lists, text(sentences), etc. We use specifically LSTM for this task as it can predict the next element in sequences from the previous element, which is really handy for caption generation.

In [None]:
class DecoderRNN(nn.Module):
    """LSTM caption decoder conditioned on an image feature vector."""

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        """Create the embedding, LSTM, output projection and dropout layers.

        Args:
            embed_size: Size of word embeddings (and of the image features).
            hidden_size: LSTM hidden state size.
            vocab_size: Number of entries in the vocabulary.
            num_layers: Number of stacked LSTM layers.
        """
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(0.5)

    def forward(self, features, captions):
        """Return vocabulary scores for every step of the sequence.

        The image features are prepended as the first LSTM input step, so the
        output has one more step (dim 0) than ``captions``.
        """
        word_embeddings = self.dropout(self.embed(captions))

        # The features need a leading step dimension so that they can be
        # concatenated with the embeddings along dim 0; a shape mismatch
        # here was the source of an earlier concatenation error.
        image_step = features.unsqueeze(0)
        lstm_inputs = torch.cat((image_step, word_embeddings), dim=0)

        hidden_states, _ = self.lstm(lstm_inputs)
        return self.linear(hidden_states)

### **Combining CNN and RNN**
Now, we need to find a way to attach the two models together so that, when we feed it an image, it can generate the respective caption in one go, by extracting the features and then linking the features to the caption.

Here, we first provided the CNN with the image, then took the generated output features and the caption for the image, and fed it to the RNN. We have another function for caption generation which predicts the caption output, and converts it into words, and return the caption as a list of its tokens(or words).

In [None]:
class Image_Captioning(nn.Module):
    """Encoder-decoder captioning model: ``EncoderCNN`` followed by ``DecoderRNN``."""

    def __init__(self, num_classes, embed_size, hidden_size, vocab_size, num_layers):
        """Build the encoder and the decoder.

        Args:
            num_classes: Forwarded to ``EncoderCNN`` as its ``num_classes``.
            embed_size: Image feature / word embedding size.
            hidden_size: LSTM hidden state size.
            vocab_size: Number of entries in the vocabulary.
            num_layers: Number of stacked LSTM layers.
        """
        super().__init__()
        # Pass num_classes through; it used to be silently replaced by
        # embed_size, which made the hyperparameter a no-op.
        self.encoderCNN = EncoderCNN(embed_size, num_classes)
        self.decoderRNN = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)

    def forward(self, images, captions):
        """Return the decoder's vocabulary scores for ``captions`` given ``images``."""
        features = self.encoderCNN(images)
        return self.decoderRNN(features, captions)

    def caption_return(self, images, vocab, max_len = 50):
        """Greedily generate one caption per image.

        Decoding runs for the whole batch in lockstep. A sequence is finished
        once it predicts ``<EOS>``; from that step on its positions are filled
        with ``<PAD>``. Decoding stops when every sequence has finished or
        after ``max_len`` steps. The module is switched to eval mode for the
        duration of the call and its previous mode is restored afterwards.

        Args:
            images: Batch of images accepted by the encoder.
            vocab: Object exposing ``string_to_index`` and ``index_to_string``.
            max_len: Maximum number of decoding steps.

        Returns:
            CPU ``LongTensor`` of shape ``(batch, steps)`` with
            ``steps <= max_len``; ``<EOS>`` itself is never included.
        """
        pad_idx = vocab.string_to_index["<PAD>"]
        eos_idx = vocab.string_to_index["<EOS>"]
        unk_idx = vocab.string_to_index["<UNK>"]
        batch_size = images.size(0)

        steps = []
        was_training = self.training
        # Inference must not apply dropout or update batch-norm statistics.
        self.eval()
        try:
            with torch.no_grad():
                step_input = torch.unsqueeze(self.encoderCNN(images), 0)
                state = None
                finished = torch.zeros(
                    batch_size, dtype=torch.bool, device=step_input.device
                )

                for _ in range(max_len):
                    hiddens, state = self.decoderRNN.lstm(step_input, state)
                    output = self.decoderRNN.linear(torch.squeeze(hiddens, 0))
                    prediction = output.argmax(1)

                    finished = finished | (prediction == eos_idx)
                    if bool(finished.all()):
                        break

                    # Keep one entry per sample at every step so that the
                    # result stays rectangular.
                    steps.append(prediction.masked_fill(finished, pad_idx))
                    step_input = torch.unsqueeze(self.decoderRNN.embed(prediction), 0)
        finally:
            self.train(was_training)

        if not steps:
            return torch.empty((batch_size, 0), dtype=torch.long)

        known = vocab.index_to_string
        rows = [
            [tok if tok in known else unk_idx for tok in row]
            for row in torch.stack(steps, dim=1).tolist()
        ]
        return torch.tensor(rows, dtype=torch.long)

The `caption_return` method here was the toughest to deal with, as while training the dataset, there were a bunch of errors popping up, and I had to run multiple times to identify every single one, while waiting for 5-10 min for all the code blocks to run before generating any output/error each time.

---

# **Training the Model**

For training, we first define our hyperparamters and the transforms we want to apply to the images. Here, I have resized the image to(224,224) because that is the image size VGG16 accepts. Then I converted the resized image to a tensor, and finally normalized the tensor.

In [None]:
# Hyperparameters
embed_size = 256        # size of the image feature vector and of the word embeddings
hidden_size = 256       # LSTM hidden state size
batch_size = 32         # samples per DataLoader batch
num_epochs = 2          # passes over the training loader in train()
learning_rate = 1e-3    # Adam learning rate
num_layers = 10         # number of stacked LSTM layers in DecoderRNN
num_classes = 1000      # passed to Image_Captioning as num_classes
num_workers = 2         # DataLoader worker processes

# Resize to 224x224 (EncoderCNN's first linear layer expects 7*7*512 inputs),
# convert to a tensor, then normalise each channel with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,0.5,0.5), (0.5,0.5,0.5)),]
)

Then we create our dataloader that loads the images and captions after splitting the dataset into train-val-test batches. Then, we calculate the vocabulary size of our dataset, which is required by our DecoderRNN model.

In [None]:
# Build the train/val/test loaders (15% validation, 10% test) together with
# the dataset they are drawn from.
dataloader, dataset = get_loader(
    img_folder=img_folder,
    caption_file=caption_file,
    transform=transform,
    val_split=0.15,
    test_split=0.1,
    batch_size=batch_size,
    num_workers=num_workers,
)

# The vocabulary lives on the dataset; its size fixes the decoder's
# embedding table and output layer.
vocabulary = dataset.vocab
vocab_size = len(vocabulary)



We finally define the train function, which goes over our training dataset and feeds it to the Image_Captioning model; it then calculates the loss and optimizes the model parameters to reduce it. After each epoch of training, we save the model's weights and biases in a `model_weights<epoch>.pth` file and the optimizer parameters in an `optimiser<epoch>.pth` file (where `<epoch>` is the epoch number, starting at 1), so that the model at every stage is saved and we can load it at any later stage to further optimize or use it.

In [None]:
def _strip_special_tokens(token_ids, special_ids):
    """Return ``token_ids`` as a list without the ids contained in ``special_ids``."""
    return [tok for tok in token_ids if tok not in special_ids]


def train(dataloader, vocab_size):
    """Train a fresh ``Image_Captioning`` model and checkpoint it every epoch.

    Uses the module-level hyperparameters (``num_classes``, ``embed_size``,
    ``hidden_size``, ``num_layers``, ``learning_rate``, ``num_epochs``) and
    the module-level ``vocabulary``. After each epoch the average loss and
    the average sentence BLEU (printed as "Accuracy") are reported, and the
    model and optimizer state dicts are written to
    ``model_weights<epoch>.pth`` and ``optimiser<epoch>.pth``.

    Args:
        dataloader: Dict of loaders; only ``dataloader['train']`` is used.
        vocab_size: Size of the vocabulary, used to size the decoder.
    """
    train_loader = dataloader['train']
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # The model has to live on the same device as the batches fed to it.
    model = Image_Captioning(
        num_classes= num_classes,
        embed_size=embed_size,
        hidden_size=hidden_size,
        vocab_size=vocab_size,
        num_layers=num_layers,
    ).to(device)

    pad_idx = vocabulary.string_to_index["<PAD>"]
    special_ids = {
        vocabulary.string_to_index[token] for token in ("<PAD>", "<SOS>", "<EOS>")
    }

    # creating loss function and optimizer
    loss_fn = nn.CrossEntropyLoss(ignore_index=pad_idx)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in tqdm(range(num_epochs)):
        model.train()
        score = 0.0
        scored_samples = 0
        total_loss = 0.0

        for imgs, captions in train_loader:
            imgs = imgs.to(device)
            captions = captions.to(device)
            optimizer.zero_grad()

            # The decoder prepends the image step, so feeding captions[:-1]
            # yields exactly one output step per target token.
            outputs = model(imgs, captions[:-1])
            # CrossEntropyLoss already averages over the non-padding
            # targets; no further division by the batch size is needed.
            loss = loss_fn(
                torch.reshape(outputs, (-1, outputs.shape[2])),
                captions.reshape(-1),
            )
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            # Score greedy captions without dropout / batch-norm updates.
            model.eval()
            predicted_captions = model.caption_return(imgs, vocabulary)
            model.train()

            # BLEU needs plain hashable tokens: compare lists of ints with
            # the special tokens removed, one reference column per sample.
            references = captions.t().tolist()
            for reference, hypothesis in zip(references, predicted_captions.tolist()):
                score += sentence_bleu(
                    [_strip_special_tokens(reference, special_ids)],
                    _strip_special_tokens(hypothesis, special_ids),
                )
                scored_samples += 1

        # BLEU is accumulated per sample, the loss per batch.
        avg_score = score / max(scored_samples, 1)
        avg_loss = total_loss / max(len(train_loader), 1)
        print(f"Avg Loss: {avg_loss}\nAccuracy: {avg_score*100}%")

        torch.save(model.state_dict(), f"model_weights{epoch + 1}.pth")
        torch.save(optimizer.state_dict(), f"optimiser{epoch + 1}.pth")

In [None]:
# Train on dataloader['train']; vocab_size sizes the decoder's embedding and output layers.
train(dataloader, vocab_size)

  self.pid = os.fork()


# **Testing the Model**


In [None]:
def test(dataloader, vocab_size):
    """Evaluate saved weights on the test loader and print the average BLEU.

    Loads ``model_weights.pth`` when it exists, otherwise the checkpoint of
    the last training epoch (``model_weights<num_epochs>.pth``). Uses the
    module-level hyperparameters and ``vocabulary``.

    Args:
        dataloader: Dict of loaders; only ``dataloader['test']`` is used.
        vocab_size: Size of the vocabulary, used to size the decoder.
    """
    test_loader = dataloader['test']
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = Image_Captioning(
        num_classes= num_classes,
        embed_size=embed_size,
        hidden_size=hidden_size,
        vocab_size=vocab_size,
        num_layers=num_layers,
    ).to(device)

    # train() writes one checkpoint per epoch (model_weights<epoch>.pth),
    # so fall back to the last epoch's file when the plain name is absent.
    weights_path = "model_weights.pth"
    if not os.path.exists(weights_path):
        weights_path = f"model_weights{num_epochs}.pth"
    # map_location lets weights saved on a GPU load on a CPU-only runtime.
    model.load_state_dict(torch.load(weights_path, map_location=device))
    model.eval()

    special_ids = {
        vocabulary.string_to_index[token] for token in ("<PAD>", "<SOS>", "<EOS>")
    }
    score = 0.0
    scored_samples = 0

    for imgs, captions in test_loader:
        imgs = imgs.to(device)
        predicted_captions = model.caption_return(imgs, vocabulary)

        # BLEU needs plain hashable tokens: compare lists of ints with the
        # special tokens removed, one reference column per sample.
        references = captions.t().tolist()
        for reference, hypothesis in zip(references, predicted_captions.tolist()):
            score += sentence_bleu(
                [[tok for tok in reference if tok not in special_ids]],
                [tok for tok in hypothesis if tok not in special_ids],
            )
            scored_samples += 1

    # Report the per-sample average, not the accumulated sum.
    avg_score = score / max(scored_samples, 1)
    print(f"Accuracy: {avg_score*100}%")