## Image Captioning using Resnet101 and 1 Layered LSTM Network

In [2]:
# !pip install nbimporter torchmetrics transformers pycocotools wandb

In [1]:
import os
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence

from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

from torchvision import transforms
from torchvision import models

from tqdm import tqdm
from PIL import Image
import pickle as pkl

import nbimporter
from lstmcell import Encoder, DecoderWithAttention

import nltk
nltk.download('wordnet')

device = torch.device("cuda")

[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
import wandb

config = {
    "learning_rate": 8e-4,
    "batch_size": 32,
    "num_epochs": 6
}

wandb.init(
  project="Image-Captioning",
  config=config,
)

train_path = "../processed-files/train.csv"
val_path = "../processed-files/validation.csv"
test_path = "../processed-files/test.csv"

In [None]:
with open("../processed-files/vocab.pkl", "rb") as file:
    vocab = pkl.load(file)
    file.close()
    
with open("../processed-files/w2i.pkl", "rb") as file:
    w2i = pkl.load(file)
    file.close()

### Hyperparameters

In [None]:
ATTENTION_DIM = 512
EMBEDDING_DIM = 512
DECODER_DIM = 512
VOCAB_SIZE = len(vocab)
batch_size = 32
LR = 8e-4
START_EPOCH = 1
NUM_EPOCHS = 3

checkpoint_path = "./checkpoint/current_checkpoint.pt"
best_model_path = "./best_model/best_model.pt"

### Building Vocabulary

In [3]:
class Vocabulary:
    def __init__(self):
        self.word2count = {}
        self.index2word = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.word2index = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.num_words = 4
        self.num_sentences = 0
        self.longest_sentence = 0

    def __len__(self):
        return len(self.word2index)

    def add_word(self, word):
        if word not in self.word2index:
            # First entry of word into vocabulary
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            # Word exists; increase word count
            self.word2count[word] += 1
            
    def add_sentence(self, sentence):
        sentence_len = 0
        for word in sentence.split(" "):
            sentence_len += 1
            self.add_word(word)
        if sentence_len > self.longest_sentence:
            # This is the longest sentence
            self.longest_sentence = sentence_len
        # Count the number of sentences
        self.num_sentences += 1

    def build_vocabulary(self, sentences):
        for sentence in sentences:
            self.add_sentence(sentence)

    def to_word(self, index):
        return self.index2word[index]

    def to_index(self, sentence):
        sentence = sentence.split(" ")
        return [self.word2index[word] if word in self.word2index else self.word2index["<UNK>"] for word in sentence]

### Custom Pytorch Dataset

In [4]:
class Flickr8kDataset(Dataset):
    def __init__(self, train_path, train_image_path, val_path=None, test_path=None, category=None, transform=None) -> None:

        self.transform = transform
        self.category = category
        self.train_image_path = train_image_path
        self.train_data = self.load_files(train_path)
        self.initialize()

        if self.category == "validation":
            self.val_data = self.load_files(val_path)

        elif self.category == "testing":
            self.test_data = self.load_files(test_path)

    def initialize(self):
        self.vocab = Vocabulary()
        self.vocab.build_vocabulary(self.train_data.caption.tolist())

    def load_files(self, path):
        df = pd.read_csv(path, sep=",")
        return df

    def __len__(self):
        if self.category == "validation":
            return len(self.val_data)
        elif self.category == "testing":
            return len(self.test_data)
        else:
            return len(self.train_data)

    def __getitem__(self, index):
        if self.category == "validation":
            image = self.val_data.image[index]
            caption = self.val_data.caption[index]
        elif self.category == "testing":
            image = self.test_data.image[index]
            caption = self.test_data.caption[index]
        else:
            image = self.train_data.image[index]
            caption = self.train_data.caption[index]

        img = Image.open(os.path.join(self.train_image_path, image)).convert('RGB')
        
        if (self.transform):
            img = self.transform(img)

        numericalized_caption = [self.vocab.word2index["<SOS>"]]
        numericalized_caption += self.vocab.to_index(caption)
        numericalized_caption.append(self.vocab.word2index["<EOS>"])
        
        return img, torch.tensor(numericalized_caption)

In [5]:
train_transform = transforms.Compose([transforms.Resize((256)),
transforms.RandomCrop(224),
transforms.ToTensor(),
transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

val_transform = transforms.Compose([transforms.Resize((256)),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

train_dataset = Flickr8kDataset(train_path=train_path, transform=train_transform)
val_dataset = Flickr8kDataset(train_path=train_path, val_path=val_path, category="validation", transform=val_transform)
test_dataset = Flickr8kDataset(train_path=train_path, test_path=test_path, category="testing", transform=val_transform)

### DataLoaders

In [6]:
class MyCollate:
    def __init__(self, pad_value):
        self.pad_value = pad_value
    
    def __call__(self, batch):
        imgs = [item[0].unsqueeze(0) for item in batch]
        img = torch.cat(imgs, dim=0)
        targets = [item[1] for item in batch]
        lengths = torch.tensor([len(i) for i in targets])
        targets = pad_sequence(targets, batch_first=True, padding_value=self.pad_value)
        
        return img, targets, lengths

In [7]:
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, collate_fn=MyCollate(0))
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False, num_workers=4, collate_fn=MyCollate(0))
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True, num_workers=4, collate_fn=MyCollate(0))

In [8]:
len(train_dataset), len(val_dataset), len(test_dataset)

(30000, 5454, 5000)

In [9]:
len(train_dataset.vocab.word2index), len(train_dataset.vocab.index2word)

(8369, 8369)

### Model Architecture

In [11]:
class CaptionerAttention(nn.Module):
    def __init__(self, attention_dim, embed_size, decoder_dim, vocab_size, encoder_dim=2048):
        """ Initialize Resnet101 Encoder and Attention LSTM based Decoder """
        super(CaptionerAttention, self).__init__()
        self.encoder = Encoder()
        self.decoder = DecoderWithAttention(attention_dim, embed_size, decoder_dim, vocab_size, encoder_dim)
        
    def forward(self, image, caption, lengths):
        x = self.encoder(image)
        scores, caps_sorted, decode_lengths, alphas, sort_ind = self.decoder(x, caption, lengths)
        return scores, caps_sorted, decode_lengths, alphas, sort_ind
    
    def captionImage(self, images, max_len):
        features = self.encoder(images)
        captions = self.decoder.sample(features, max_len)
        return captions 

In [14]:
captioner = CaptionerAttention(ATTENTION_DIM, EMBEDDING_DIM, DECODER_DIM, VOCAB_SIZE).to(device)

wandb.watch(captioner, log_freq=100)

In [15]:
params = list(captioner.decoder.parameters())
optimizer = torch.optim.RAdam(params, lr=LR)
scheduler = ReduceLROnPlateau(optimizer, mode="min", patience=2, verbose=True)

### Training and Validation Code

In [16]:
import shutil

def save_checkpoint(state, is_best, checkpoint_path, best_model_path):
    f_path = checkpoint_path
    torch.save(state, f_path)
    if is_best:
        best_fpath = best_model_path
        shutil.copyfile(f_path, best_fpath)

In [20]:
def training(start_epoch, batch_size, train_loader, val_loader, valid_loss_min_input, model, optimizer, scheduler, 
             num_epochs, device, checkpoint_path, best_model_path):
    
    # initialize tracker for minimum validation loss
    valid_loss_min = valid_loss_min_input
    criterion = nn.CrossEntropyLoss(ignore_index=0).to(device)
    
    trainSteps = len(train_loader) // batch_size
    valSteps = len(val_loader) // batch_size
    
    for epoch in range(start_epoch, num_epochs):
        model.train()
        totalTrainLoss, totalValLoss = 0, 0
        
        for i, (image, caption, length) in tqdm(enumerate(train_loader)):
            sort_ind = torch.argsort(length, descending=True)
            image = image[sort_ind]
            caption = caption[sort_ind]
            length = length[sort_ind]
            image, caption = image.to(device), caption.to(device)

            optimizer.zero_grad()
            
            # Forward, backward and optimize
            scores, caps_sorted, decode_lengths, alphas, sort_ind = model(image, caption, length)

            targets = caps_sorted[:, 1:]

            scores = pack_padded_sequence(scores, decode_lengths, batch_first=True)[0]
            targets = pack_padded_sequence(targets, decode_lengths, batch_first=True)[0]
    
            loss = criterion(scores, targets)
            loss.backward()
            optimizer.step()
            
            # Gather data and report
            totalTrainLoss += loss
            
        avgTrainLoss = totalTrainLoss / trainSteps
        wandb.log({"training_loss": avgTrainLoss})

        # Evaluation Phase
        model.eval()
        with torch.no_grad():
            for i, (image, caption, length) in tqdm(enumerate(val_loader)):
                sort_ind = torch.argsort(length, descending=True)
                image = image[sort_ind]
                caption = caption[sort_ind]
                length = length[sort_ind]
                image, caption = image.to(device), caption.to(device)

                scores, caps_sorted, decode_lengths, alphas, sort_ind = model(image, caption, length)

                targets = caps_sorted[:, 1:]

                scores = pack_padded_sequence(scores, decode_lengths, batch_first=True)[0]
                targets = pack_padded_sequence(targets, decode_lengths, batch_first=True)[0]

                totalValLoss += criterion(scores, targets)

        avgTestLoss = totalValLoss / valSteps   
        
        wandb.log({"val_loss": avgTestLoss})
        
        # Store the state of Model, Optimizer and Scheduler to retrain the model continuosly
        checkpoint = {
            'epoch': epoch,
            'valid_loss_min': avgTestLoss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict(),
        }

        print('Epoch: {} \tTraining Loss: {:.6f} \t Validation Loss: {:.6f}'.format(epoch, avgTrainLoss, avgTestLoss))
        
        # Save the model if validation loss has decreased
        if avgTestLoss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving the model ...'.format(valid_loss_min, avgTestLoss))
            save_checkpoint(checkpoint, True, checkpoint_path, best_model_path)
            valid_loss_min = avgTestLoss
            
    return model

In [21]:
model = training(START_EPOCH, train_loader, val_loader, np.Inf, captioner, optimizer,
                 scheduler, NUM_EPOCHS, device, checkpoint_path, best_model_path)

Epoch: 1 	Training Loss: 0.004079 	Validation Loss: 0.019259
Validation loss decreased (inf --> 0.019259).  Saving model ...
Epoch: 2 	Training Loss: 0.003307 	Validation Loss: 0.017841
Validation loss decreased (0.019259 --> 0.017841).  Saving model ...
Epoch: 3 	Training Loss: 0.003035 	Validation Loss: 0.017300
Validation loss decreased (0.017841 --> 0.017300).  Saving model ...


In [22]:
def load_checkpoint(checkpoint_fpath, model, optimizer, scheduler):
    checkpoint = torch.load(checkpoint_fpath)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    scheduler.load_state_dict(checkpoint['scheduler'])
    valid_loss_min = checkpoint['valid_loss_min']
    return model, optimizer, scheduler, checkpoint['epoch'], valid_loss_min.item()

In [23]:
optimizer = torch.optim.Adam(captioner.decoder.parameters(), lr=LR)
scheduler = ReduceLROnPlateau(optimizer, mode="min", patience=2, verbose=True)

for state in optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

In [24]:
model, optimizer, scheduler, start_epoch, valid_loss_min = load_checkpoint(checkpoint_path, captioner, optimizer, scheduler)

In [27]:
restart_model = training(start_epoch, train_loader, val_loader, valid_loss_min, model, optimizer, scheduler,
                         6, device, checkpoint_path, best_model_path)

Epoch: 3 	Training Loss: 0.002826 	Validation Loss: 0.016918
Validation loss decreased (0.017300 --> 0.016918).  Saving model ...
Epoch: 4 	Training Loss: 0.002674 	Validation Loss: 0.016794
Validation loss decreased (0.016918 --> 0.016794).  Saving model ...
Epoch: 5 	Training Loss: 0.002544 	Validation Loss: 0.016745
Validation loss decreased (0.016794 --> 0.016745).  Saving model ...
Epoch: 6 	Training Loss: 0.002430 	Validation Loss: 0.016816


### Inference Code

In [None]:
def save_model(model, path):
    """ Saves the model in given path """
    torch.save(model.state_dict(), path)
    
def load_model(path):
    """ Loads Pytorch model from the given path """
    model = CaptionerAttention(ATTENTION_DIM, EMBEDDING_DIM, DECODER_DIM, VOCAB_SIZE).to(device)
    model.load_state_dict(torch.load(path))
    return model.eval()

# save_model(captioner, path)
model = load_model(path)

In [None]:
from matplotlib.pyplot import imread
import skimage.transform

def process_image(image_path):
    # Read image and process
    img = imread(image_path)
    if len(img.shape) == 2:
        img = img[:, :, np.newaxis]
        img = np.concatenate([img, img, img], axis=2)
    img = skimage.transform.resize(img, (256, 256))
    img = img.transpose(2, 0, 1)
    img = torch.FloatTensor(img).to("cuda")
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    transform = transforms.Compose([normalize])
    image = transform(img)  # (3, 256, 256)

    # Encode
    image = image.unsqueeze(0)  # (1, 3, 256, 256)
    return image


def caption_image_beam_search(model, image_path, word_map, beam_size, device, word2index):
    """
    Reads an image and captions it with beam search.
    :param image_path: path to image
    :param word_map: word map
    :param beam_size: number of sequences to consider at each decode-step
    :return: caption, weights for visualization
    """

    k = beam_size
    vocab_size = len(word_map)
    w2i = word2index
    
    image = process_image(image_path)
    
    encoder_out = model.encoder(image)  # (1, enc_image_size, enc_image_size, encoder_dim)
    enc_image_size = encoder_out.size(1)
    encoder_dim = encoder_out.size(3)

    # Flatten encoding
    encoder_out = encoder_out.view(1, -1, encoder_dim)  # (1, num_pixels, encoder_dim)
    num_pixels = encoder_out.size(1)

    # We'll treat the problem as having a batch size of k
    encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)  # (k, num_pixels, encoder_dim)

    # Tensor to store top k previous words at each step; now they're just <start>
    k_prev_words = torch.LongTensor([[w2i["<SOS>"]]] * k).to(device)  # (k, 1)

    # Tensor to store top k sequences; now they're just <start>
    seqs = k_prev_words  # (k, 1)

    # Tensor to store top k sequences' scores; now they're just 0
    top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

    # Tensor to store top k sequences' alphas; now they're just 1s
    seqs_alpha = torch.ones(k, 1, enc_image_size, enc_image_size).to(device)  # (k, 1, enc_image_size, enc_image_size)

    # Lists to store completed sequences, their alphas and scores
    complete_seqs = list()
    complete_seqs_alpha = list()
    complete_seqs_scores = list()

    # Start decoding
    step = 1
    h, c = model.decoder.init_hidden_state(encoder_out)

    # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
    while True:

        embeddings = model.decoder.embedding(k_prev_words).squeeze(1)  # (s, embed_dim)

        awe, alpha = model.decoder.attention(encoder_out, h)  # (s, encoder_dim), (s, num_pixels)

        alpha = alpha.view(-1, enc_image_size, enc_image_size)  # (s, enc_image_size, enc_image_size)

        gate = model.decoder.sigmoid(model.decoder.f_beta(h))  # gating scalar, (s, encoder_dim)
        awe = gate * awe

        h, c = model.decoder.decode_step(torch.cat([embeddings, awe], dim=1), (h, c))  # (s, decoder_dim)

        scores = model.decoder.fc(h)  # (s, vocab_size)
        scores = F.log_softmax(scores, dim=1)

        # Add
        scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

        # For the first step, all k points will have the same scores (since same k previous words, h, c)
        if step == 1:
            top_k_scores, top_k_words = scores[0].topk(k, 0, True, True)  # (s)
        else:
            # Unroll and find top scores, and their unrolled indices
            top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True)  # (s)

        # Convert unrolled indices to actual indices of scores
        prev_word_inds = torch.div(top_k_words, vocab_size, rounding_mode='floor') # (s)
        next_word_inds = top_k_words % vocab_size  # (s)

        # Add new words to sequences, alphas

        seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)
        seqs_alpha = torch.cat([seqs_alpha[prev_word_inds], alpha[prev_word_inds].unsqueeze(1)],
                               dim=1)  # (s, step+1, enc_image_size, enc_image_size)

        # Which sequences are incomplete (didn't reach <end>)?
        incomplete_inds = [ind for ind, next_word in enumerate(next_word_inds) if
                           next_word != w2i["<EOS>"]]
        complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))

        # Set aside complete sequences
        if len(complete_inds) > 0:
            complete_seqs.extend(seqs[complete_inds].tolist())
            complete_seqs_alpha.extend(seqs_alpha[complete_inds].tolist())
            complete_seqs_scores.extend(top_k_scores[complete_inds])
        k -= len(complete_inds)  # reduce beam length accordingly

        # Proceed with incomplete sequences
        if k == 0:
            break
        seqs = seqs[incomplete_inds]
        seqs_alpha = seqs_alpha[incomplete_inds]
        h = h[prev_word_inds[incomplete_inds]]
        c = c[prev_word_inds[incomplete_inds]]
        encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
        top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
        k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

        # Break if things have been going on too long
        if step > 50:
            break
        step += 1

    i = complete_seqs_scores.index(max(complete_seqs_scores))
    seq = complete_seqs[i]
    alphas = complete_seqs_alpha[i]

    return seq, alphas

In [None]:
def inference(image_path, model, vocab, beam_size, device, w2i):
    """ 
    Predicts the caption given an Image
    image: [channels, dim_x, dim_y] --> numpy array
    vocab: vocab_size -> dictionary
    """
    caption = " "
    sequence_ids, alph = caption_image_beam_search(model, image_path, vocab, beam_size, device, w2i)

    for ids in sequence_ids:
        caption = caption + " " + vocab[ids]
    return caption

In [None]:
import random, os
import matplotlib.pyplot as plt

def load_image(image_name, transforms):
    image = Image.open(image_name).convert("RGB")
    tensor_image = transforms(image)
    return image, tensor_image

folder = "../Images"
ids = [i for i in range(5000)]
index = random.choice(ids)

img_path, caption = test_dataset.test_data.image[index], test_dataset.test_data.caption[index]
image_name = os.path.join(folder, img_path)
print(image_name)

image, img_tensor = load_image(image_name, val_transform)
plt.imshow(image)

preds = inference(image_name, model, vocab, 10, device, w2i)

print(f"Expected Caption : {caption}")
print(f"Generated Caption : {preds}")

In [None]:
folder = "../Images"
ids = [i for i in range(5000)]
index = random.choice(ids)

img_path = test_dataset.test_data.image[index]
image_name = os.path.join(folder, img_path)
print(image_name)

captions = test_dataset.test_data.loc[test_dataset.test_data.image == img_path].caption.tolist()

image, img_tensor = load_image(image_name, val_transform)
plt.imshow(image)

preds = inference(image_name, model, vocab, 10, device, w2i)

print(f"Expected Caption/(s) : {captions}")
print(f"\nGenerated Caption : {preds}")

In [None]:
folder = "../Images"
ids = [i for i in range(5000)]
index = random.choice(ids)

img_path = test_dataset.test_data.image[index]
image_name = os.path.join(folder, img_path)
print(image_name)

captions = test_dataset.test_data.loc[test_dataset.test_data.image == img_path].caption.tolist()

image, img_tensor = load_image(image_name, val_transform)
plt.imshow(image)

preds = inference(image_name, model, vocab, 10, device, w2i)

print(f"Expected Caption/(s) : {captions}")
print(f"\nGenerated Caption : {preds}")

### Compute Metrics for Model Evaluation

In [None]:
from torchmetrics import BLEUScore
from torchmetrics.text.rouge import ROUGEScore
from torchmetrics.multimodal import CLIPScore

r_scorer = ROUGEScore(use_stemmer=True, rouge_keys=('rouge1', 'rouge2', 'rougeL'))

clip = CLIPScore(model_name_or_path="openai/clip-vit-base-patch16")

bleu1 = BLEUScore(n_gram=1, weights=[1])
bleu2 = BLEUScore(n_gram=2, weights=[0.5, 0.5])
bleu3 = BLEUScore(n_gram=3, weights=[0.33, 0.33, 0.33])
bleu4 = BLEUScore(n_gram=4, weights=[0.25, 0.25, 0.25, 0.25])

def rouge_score(prediction, reference):
    p1, p2, pL, r1, r2, rL, f1, f2, fL = 0, 0, 0, 0, 0, 0, 0, 0, 0

    for ref in reference:
        score = r_scorer(prediction, ref)
        precision1, recall1, fmeasure1 = score["rouge1_precision"], score["rouge1_recall"], score['rouge1_fmeasure']
        precision2, recall2, fmeasure2 = score["rouge2_precision"], score["rouge2_recall"], score['rouge2_fmeasure']
        precisionL, recallL, fmeasureL = score["rougeL_precision"], score["rougeL_recall"], score['rougeL_fmeasure']
        
        p1, p2, pL, r1, r2, rL, f1, f2, fL = max(precision1, p1), max(precision2, p2), max(precisionL, pL), max(recall1, r1), max(recall2, r2), max(recallL, rL), max(fmeasure1, f1), max(fmeasure2, f2), max(fmeasureL, fL)
    
    return [p1, p2, pL, r1, r2, rL, f1, f2, fL]

def eval_metrics(image, reference, prediction):
    bleu_1 = bleu1(prediction, reference)
    bleu_2 = bleu2(prediction, reference)
    bleu_3 = bleu3(prediction, reference)
    bleu_4 = bleu4(prediction, reference)
    rouge = rouge_score(prediction, reference)
    clip_score = clip(image, prediction[0])
    return rouge, bleu_1, bleu_2, bleu_3, bleu_4, clip_score.detach().cpu().numpy()

In [None]:
folder = "../Images"

import time

start = time.time()

m, rL, b1, b2, b3, b4, cs = [], [], [], [], [], [], []
for index in range(100):
    img_path = test_dataset.test_data.image[index]
    image_name = os.path.join(folder, img_path)
    image_tensor = test_dataset[index][0]

    captions = test_dataset.test_data.loc[test_dataset.test_data.image == img_path].caption.tolist()
    captions_mod = [sent for sent in captions]

    image, img_tensor = load_image(image_name, val_transform)
    preds = inference(image_name, model, vocab, 5, device, w2i)
    
    preds = preds.split(" ")[3:-1]
    preds = [" ".join(preds)]
    rouge, b_1, b_2, b_3, b_4, c_score = eval_metrics(image_tensor, [captions_mod], preds)
    rL.append(rouge[-1]), b1.append(b_1), b2.append(b_2), b3.append(b_3), b4.append(b_4), cs.append(c_score)
    
end = time.time()

print(f"Time Elaspsed : {end-start}")
print(f"Rouge Score on Test Set is :  {round(np.mean(rL),2)}")
print(f"BLEU-1 Score on Test Set is : {round(np.mean(b1),2)}")
print(f"BLEU-2 Score on Test Set is : {round(np.mean(b2),2)}")
print(f"BLEU-3 Score on Test Set is : {round(np.mean(b3),2)}")
print(f"BLEU-4 Score on Test Set is : {round(np.mean(b4),2)}")
print(f"CLIP Score on Test Set is : {round(np.mean(cs),2)}")