In [28]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils, models
from collections import Counter
from skimage import io, transform
from torch.nn.utils.rnn import pack_padded_sequence
import matplotlib.pyplot as plt # for plotting
import numpy as np
from time import time
import collections
import pickle
import os
import nltk
import re
from scipy import ndimage
import nltk

nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
import string
from nltk.tokenize import word_tokenize

  from .collection import imread_collection_wrapper
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/prakank/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/prakank/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
class Rescale(object):
    """Rescale the image in a sample to a given size.
    Args:
        output_size (tuple or int): Desired output size. If tuple, output is
            matched to output_size. If int, smaller of image edges is matched
            to output_size keeping aspect ratio the same.
    """

    def __init__(self, output_size):
        assert isinstance(output_size, (int, tuple))
        self.output_size = output_size

    def __call__(self, image):
        h, w = image.shape[:2]
        #print("TA RESCALE INPUT", image.shape)
        if isinstance(self.output_size, int):
            if h > w:
                new_h, new_w = self.output_size * h / w, self.output_size
            else:
                new_h, new_w = self.output_size, self.output_size * w / h
        else:
            new_h, new_w = self.output_size

        new_h, new_w = int(new_h), int(new_w)
        img = transform.resize(image, (new_h, new_w))
        #print("TA RESCALE OUTPUT", image.shape)
        return img

In [5]:
class ToTensor(object):
    """Convert ndarrays in sample to Tensors."""

    def __call__(self, image):
        # numpy image: H x W x C
        # torch image: C X H X W
        image = image.transpose((2, 0, 1))
        return torch.tensor(image)

In [6]:
IMAGE_RESIZE = (256, 256)
device = "cuda" if torch.cuda.is_available() else "cpu"
img_transform = transforms.Compose([Rescale(IMAGE_RESIZE), ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ])
print("Current device set to {}".format(device))

Current device set to cpu


In [7]:
class CaptionsPreprocessing:
    """Preprocess the captions, generate vocabulary and convert words to tensor tokens

    Args:
        captions_file_path (string): captions tsv file path
    """
    def __init__(self, captions_file_path):
        self.captions_file_path = captions_file_path
        self.raw_captions_dict = self.read_raw_captions()
        self.captions_dict = self.process_captions()
        self.vocab = self.generate_vocabulary()
    def read_raw_captions(self):
        # Dictionary with raw captions list keyed by image ids (integers)
        captions_dict = {}
        with open(self.captions_file_path, 'r', encoding='utf-8') as f:
            for img_caption_line in f.readlines():
                img_captions = img_caption_line.strip().split('\t')
                image_path = '/content/drive/MyDrive/data/train_data_main/' + img_captions[0]
                
                image_path = '/home/prakank/IIT Delhi/3rd_year/Sem5/COL774_Machine_Learning/COL774-Machine-Learning-Assignments/Assignment-4/data/' + img_captions[0]
                
                if os.path.exists(image_path):
                    captions_dict[img_captions[0]] = img_captions[1]
                
                if len(captions_dict) == 5000:
                    break

        return captions_dict

    def process_captions(self):
        """
        Use this function to generate dictionary and other preprocessing on captions
        """

        raw_captions_dict = self.raw_captions_dict

        # Do the preprocessing here
        # Can remove the stopwords and gibberish in the caption
        stop_words = stopwords.words('english')
        self.allowedLength = 7
        punctuation = list(string.punctuation)

        for key, value in raw_captions_dict.items():
            cleaned_caption = re.sub('[^A-Za-z0-9]+', ' ', value) #Extra space removal
            tokens = word_tokenize(cleaned_caption)
            cleaned_tokens = [token for token in tokens if token not in stop_words and token not in punctuation] # Remove stopwords and punctuation
            cleaned_caption = "[START] " + " ".join(cleaned_tokens[:self.allowedLength]) + " [END]"
            raw_captions_dict[key] = cleaned_caption        

        captions_dict = raw_captions_dict

        return captions_dict

    def generate_vocabulary(self):
        """
        Use this function to generate dictionary and other preprocessing on captions
        """

        captions_dict = self.captions_dict
        vocabulary = {}
        max_caption = 0
        idx = 1
        index_to_word = {}
        for key, value in captions_dict.items():
            val = value.split()
            max_caption = max(max_caption, len(val))

            for i in val:
                if i not in vocabulary.keys():
                    vocabulary[i] = idx
                    index_to_word[idx] = i
                    idx+=1

        self.max_caption = max_caption
        self.max_caption = (self.allowedLength+2)

        index_to_word[0] = "NIL"

        self.index_to_word = index_to_word
        
        # Generate the vocabulary
        print("Size of Vocabulary = {}".format(len(vocabulary)))
        return vocabulary


    def get_captions(self, tensor_tokens):
        caption = [self.index_to_word[int(x)] for x in tensor_tokens]
        return " ".join(caption)

    def captions_transform(self, img_caption):
        """
        Use this function to generate tensor tokens for the text captions
        Args:
            img_caption_list: List of captions for a particular image
        """
        vocab = self.vocab

        caption = img_caption.split(" ")
        
        
        # print(img_caption, caption)

        caption_mapped = np.zeros(self.max_caption)
        for i in range(len(caption)):
            try: caption_mapped[i] = self.vocab[caption[i]]
            except: print(img_caption, caption, i)

        # caption_mapped = np.zeros((self.max_caption, len(self.vocab)))
        # for i in range(len(caption)):
        #     val = np.zeros(len(self.vocab))
        #     val[self.vocab[caption[i]]] = 1
        #     caption_mapped[i,:] = val 

        #captions_mapped = np.argmax(captions_mapped, axis = 1)
        
        return torch.LongTensor(caption_mapped)

# Set the captions tsv file path

# CAPTIONS_FILE_PATH = '/content/drive/MyDrive/data/train_text.tsv'
#CAPTIONS_FILE_PATH = '/Users/pratyushsaini/Documents/Semester 5/COL774/Assignment-4/Train_text.tsv'

BASE_DIR = '/home/prakank/IIT Delhi/3rd_year/Sem5/COL774_Machine_Learning/COL774-Machine-Learning-Assignments/Assignment-4/'
CAPTIONS_FILE_PATH = os.path.join(BASE_DIR, 'data', 'train_text.tsv')

captions_preprocessing_obj = CaptionsPreprocessing(CAPTIONS_FILE_PATH)

Size of Vocabulary = 2512


In [8]:
# print(captions_preprocessing_obj.index_to_word)
# print(captions_preprocessing_obj.vocab)

In [9]:
print(len(captions_preprocessing_obj.index_to_word))

2513


## DataSet Loader

In [10]:
class ImageCaptionsDataset(Dataset):

    def __init__(self, img_dir, captions_dict, img_transform=None, captions_transform=None):
        """
        Args:
            img_dir (string): Directory with all the images.
            captions_dict: Dictionary with captions list keyed by image paths (strings)
            img_transform (callable, optional): Optional transform to be applied
                on the image sample.

            captions_transform: (callable, optional): Optional transform to be applied
                on the caption sample (list).
        """
        self.img_dir = img_dir
        self.captions_dict = captions_dict
        self.img_transform = img_transform
        self.captions_transform = captions_transform

        self.image_ids = list(captions_dict.keys())

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, idx):
        img_name = self.image_ids[idx]
        image = io.imread(img_name)
        captions = self.captions_dict[img_name]

        if self.img_transform:
            image = self.img_transform(image)

        if self.captions_transform:
            captions = self.captions_transform(captions)

        sample = {'image': image, 'captions': captions}

        return sample

In [11]:
def collate_fn(batch):
  res = {}
#   print('Initial shape: ',np.asarray(batch).shape)

  res['image'] = [sample['image'].unsqueeze(0) for sample in batch] 

#   print('Res image1:',(res['image']))

  res['image'] = torch.cat((res['image']), dim=0)
  
#   print('Res image2:',res['image'].shape)

  res['captions'] = [sample['captions'] for sample in batch]
  
#   print('Res caption1:',res['captions'].shape)

  res['captions'] = torch.nn.utils.rnn.pad_sequence(res['captions'], batch_first=True)

#   print(res)

  return res

In [12]:
class TestDatasetLoader(Dataset):
    
    def __init__(self, img_dir, img_transform):
        """
        Args:
            img_dir (string): Directory with all the test images.            
            img_transform (callable, optional): Optional transform to be applied
                on the image sample.
        """
        self.img_dir = img_dir
        self.img_transform = img_transform
        
        self.image_ids = ['test_data/test' + str(i) + '.jpg' for i in range(1, 5001)]
        
    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, idx):
        img_name = self.image_ids[idx]
        image = io.imread(img_name)
        
        if self.img_transform:
            image = self.img_transform(image)
        angle_in_degrees = 45

        #output = torch.from_numpy(ndimage.rotate(alpha, angle_in_degrees, reshape=False))
        # sample = {
        #     'top': image,
        #     'left': torch.from_numpy(ndimage.rotate(image, 90, reshape=False)),
        #     'bottom': torch.from_numpy(ndimage.rotate(image, 180, reshape=False)),
        #     'right': torch.from_numpy(ndimage.rotate(image, 270, reshape=False))
        #     }
        sample['image'] = image
        
        return sample

## Model Architecture

In [13]:
# the VGG11 architecture
from torch.nn.utils.rnn import pack_padded_sequence

class Encoder(nn.Module):
    def __init__(self, embed_dim):
        super(Encoder,self).__init__()
        self.in_channels = 3
        self.num_classes = embed_dim
        # convolutional layers 
        self.conv_layers = nn.Sequential(
            nn.Conv2d(self.in_channels, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        # fully connected linear layers
        self.linear_layers = nn.Sequential(
            nn.Linear(in_features=32768, out_features=4096),
            nn.ReLU(),
            nn.Dropout2d(0.5),
            nn.Linear(in_features=4096, out_features=4096),
            nn.ReLU(),
            nn.Dropout2d(0.5),
            nn.Linear(in_features=4096, out_features=embedding_dim)
        )
    def forward(self, x):
        x = self.conv_layers(x)
        # flatten to prepare for the fully connected layers
        #rint(x.shape)
        x = x.view(x.size(0), -1)
        x = self.linear_layers(x)
        print(x.shape)
        return x
        
# class Decoder(nn.Module):
#     '''
#     This class represents the Decoder Module which consists of LSTM layers
#     Parameters:
#         embed_size : Embedding dimension of words and images
#         hidden_size : hidden_state dimension of LSTM
#         vocab_size : Length of vocabulary
#         num_layers : Number of LSTM layers
        
#     Input :
#         features : Encoded image features
#         captions : Tokenized training captions
#         lengths: Length of each sequence 
    
#     Output :
#         Outputs probability distribution over vocabulary ( dimension : 1 * vocab_size)
#     '''
#     def __init__(self,embed_dim, lstm_hidden_size, num_layers=1):
#         """Set the hyper-parameters and build the layers."""
#         super(Decoder, self).__init__()
#         self.embed = nn.Embedding(self.vocab_size, embed_size)
#         self.lstm = nn.LSTM(embed_dim, lstm_hidden_size, num_layers, batch_first=True)
#         self.linear = nn.Linear(lstm_hidden_size, self.vocab_size)
#         #self.relu = nn.ReLU(inplace = True)   # Performs worse since LSTM already has sigmoid activation
#         self.dropout = nn.Dropout(p=0.5, inplace = False)
# #        self.init_weights
#         self.vocab_size = len(captions_preprocessing_obj.vocab)
        
#     def init_weights(self):
#         self.embed.weight.data.uniform_(-0.1, 0.1)
#         self.linear.weight.data.uniform_(-0.1, 0.1)
#         self.linear.bias.data.fill_(0)
        
#     def forward(self, features, captions, lengths):
#         """Decode image feature vectors and generates captions."""
#         embeddings = self.embed(captions)   #Embedd tokenized captions into latent space
#         embeddings = torch.cat((features.unsqueeze(1), embeddings), 1) # Concatenate image enocded features with embedded captions
#         #embeddings = self.dropout(embeddings)
#         # Dropout after concatenation leads to better Bleu Score
#         packed_seq = pack_padded_sequence(embeddings, lengths, batch_first=True, enforce_sorted= False)
#         hiddens , _ = self.lstm(packed_seq)
#         #outputs = self.linear(hiddens_rasha[0])  #Pass output of lstm through a linear layer to get prob. dist. over vocab
#         outputs = self.linear(self.dropout(hiddens[0]))  #Pass output of lstm through a linear layer to get prob. dist. over vocab
#         return outputs


#     def get_pred(self, features, hidden=None):
#         '''Helper function for max_predictions'''
#         output, hidden = self.lstm(features, hidden)
#         output = self.linear(output.squeeze(1))
#         return output, hidden

class Decoder(nn.Module):
    def __init__(self, embed_dim, lstm_hidden_size, lstm_layers=1):
        super(Decoder, self).__init__()
        self.lstm_hidden_size = lstm_hidden_size
        self.vocab_size = len(captions_preprocessing_obj.vocab)
        # print("VOCAB SIZE = ", self.vocab_size)
        
        self.lstm = nn.LSTM(input_size = embed_dim, hidden_size = lstm_hidden_size,
                            num_layers = lstm_layers, batch_first = True)
        #self.attention = AttentionBlock(embed_dim, lstm_hidden_size, self.vocab_size)
        self.linear = nn.Linear(lstm_hidden_size, self.vocab_size)        
        #self.embed = nn.Embedding.from_pretrained(init_weights)
        self.embed = nn.Embedding(self.vocab_size, embed_dim)
        self.dropout = nn.Dropout(0.5)

        
    def forward(self, image_features, image_captions):
        image_features = image_features.unsqueeze(1)
        embeddings = self.dropout(self.embed(image_captions))
        print("Dimension:",embeddings.shape, image_features.shape)
        embeddings = torch.cat((image_features, embeddings[:,:-1]), dim = 1)
        hiddens, _ = self.lstm(embeddings)
        outputs = self.linear(hiddens)

        return outputs


# class Decoder(nn.Module):
#     def __init__(self, embed_dim, lstm_hidden_size,lstm_layers=1):
#         super(Decoder, self).__init__()

# #         self.embed.weight.data.uniform_(-0.1, 0.1)
# #         self.linear.weight.data.uniform_(-0.1, 0.1)
# #         self.linear.bias.data.fill_(0)

#         self.lstm_hidden_size = lstm_hidden_size
#         self.vocab_size = len(captions_preprocessing_obj.vocab)

#         self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=lstm_hidden_size, num_layers=lstm_layers, batch_first=True)
        
#         # self.attention = AttentionBlock(embed_dim, lstm_hidden_size, self.vocab_size)

#         self.linear = nn.Linear(lstm_hidden_size, self.vocab_size)
#         self.dropout = nn.Dropout(p=0.4, inplace = False)

#         #self.embed = nn.Embedding.from_pretrained(init_weights)

#         self.embed = nn.Embedding(self.vocab_size, embed_dim)
#         self.init_weights()

#     def init_weights(self):
#         """
#         Initializes some parameters with values from the uniform distribution, for easier convergence.
#         """
#         # self.lstm.weight.data.uniform_(-1,1)
#         self.embed.weight.data.uniform_(-0.1, 0.1)
#         # self.fc.bias.data.fill_(0)
#         # self.fc.weight.data.uniform_(-0.1, 0.1)


#     def forward(self, image_features, image_captions):
#         #print("DECODER INPUT", image_features)
#         # if phase == "Train":
#         #     #print(image)
#         #     image_features = torch.Tensor.repeat_interleave(image_features, repeats=5 , dim=0)

# #         embeddings = torch.cat((features.unsqueeze(1), embeddings), 1) # Concatenate image enocded features with embedded captions
# #         #embeddings = self.dropout(embeddings)
# #         # Dropout after concatenation leads to better Bleu Score
# #         packed_seq = pack_padded_sequence(embeddings, lengths, batch_first=True, enforce_sorted= False)
# #         hiddens , _ = self.lstm(packed_seq)
# #         #outputs = self.linear(hiddens_rasha[0])  #Pass output of lstm through a linear layer to get prob. dist. over vocab
# #         outputs = self.linear(self.dropout(hiddens[0]))  #Pass output of lstm through a linear layer to get prob. dist. over vocab
# #         return outputs


#         if not torch.cuda.is_available():
#             image_features = torch.LongTensor(image_features)            

#         image_features = image_features.unsqueeze(1)
    
#         embedded_captions = self.embed(image_captions)
#         embedded_captions = self.dropout(embedded_captions)
        
#         input_lstm = torch.cat((image_features, embedded_captions[:,:-1]), dim = 1) # Teacher Forcing :)

#         #input_lstm = pack_padded_sequence(input_lstm, lengths, batch_first=True, enforce_sorted=False)

#         lstm_outputs, _ = self.lstm(input_lstm)
#         #lstm_outputs = self.linear(lstm_outputs[0]) 
#         # print("lstm_outputs.shape", lstm_outputs.shape)
#         lstm_outputs = self.linear(lstm_outputs) 
        
#         return lstm_outputs

In [14]:
class ImageCaptionsNet(nn.Module):
    def __init__(self, embed_size, hidden_size, num_of_layers):
        super(ImageCaptionsNet, self).__init__()              
        self.Encoder = Encoder(embed_size)
        self.Decoder = Decoder(embed_size, hidden_size, num_of_layers)

    def forward(self, img_batch, cap_batch):
        x = self.Encoder(img_batch)

        if torch.cuda.is_available():
            # x = torch.LongTensor(x)
            x = x.cpu().long().numpy()
            x = torch.LongTensor(x)
            x = x.cuda()
        else:
            x = x.long().numpy()

        try:
            print('Working',x.shape, cap_batch.shape)
            x = self.Decoder(x, cap_batch)
        except:
            print('Error',x.shape, cap_batch.shape)
        return x
    
    # def predict(self, device, test_loader):
    #     self.Encoder.eval()
    #     self.Decoder.eval()

    #     with torch.no_grad():
    #         conc_out = []
    #         captions = []
    #         # conc_label = []

    #         for batch_idx, sample in enumerate(test_loader):
    #             # Move tensor to the proper device
    #             image_batch = sample['image'].float()
                
    #             #image_batch = image_batch.to(device)
    #             # Encode data
    #             encoded_data = self.Encoder(image_batch)
    #             # Decode data
    #             decoded_data = self.Decoder(encoded_data)  # 32*11

    #             conc_out = decoded_data.cpu().numpy()
                                
    #             for i in conc_out:
    #                 captions.append(self.captions_preprocessing_obj.get_caption(i))

    #             print("Batch:{}".format(batch_idx))
                
    #             if batch_idx > 3:
    #                 return captions

    #     return captions
        
device = "cuda" if torch.cuda.is_available() else "cpu"

In [15]:
# os.chdir('/content/drive/MyDrive/data/train_data_main/')
# IMAGE_DIR = '/content/drive/MyDrive/data/train_data_main/'

# IMAGE_DIR = '/Users/pratyushsaini/Documents/Semester 5/COL774/Assignment-4'

os.chdir(os.path.join(BASE_DIR, 'data'))
IMAGE_DIR = os.path.join(BASE_DIR, 'data')

embedding_dim = 200
units = 512
lstm_layers = 4

net = ImageCaptionsNet(embedding_dim, units, lstm_layers)

if torch.cuda.is_available():
    net = net.to(torch.device("cuda:0"))
else:
    net = net.to(torch.device("cpu"))
# Creating the Dataset
train_dataset = ImageCaptionsDataset(
    IMAGE_DIR, captions_preprocessing_obj.captions_dict, img_transform=img_transform,
    captions_transform=captions_preprocessing_obj.captions_transform
)
print("Train Dataset loaded")
# Define your hyperparameters
NUMBER_OF_EPOCHS = 5
LEARNING_RATE = 1e-1
BATCH_SIZE = 10
NUM_WORKERS = 0 # Parallel threads for dataloading

loss_function = nn.CrossEntropyLoss(ignore_index=0).to(device)
# optimizer = optim.SGD(list(net.Decoder.parameters()) + list(net.Encoder.parameters()), lr=LEARNING_RATE)

optimizer_encoder = optim.SGD(list(net.Encoder.parameters()), lr=0.01, momentum=0.9, weight_decay=0.0005)
optimizer_decoder = optim.Adam(list(net.Decoder.parameters()), lr=LEARNING_RATE)

print("Optimizer loaded")

# Creating the DataLoader for batching purposes
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, collate_fn = collate_fn)
print("Train Loader loaded")

start = time()
loss_list = []
import os

torch.backends.cudnn.benchmark = True

for epoch in range(NUMBER_OF_EPOCHS):
    print("Epoch {}".format(epoch+1))
    iteration = 0
    for batch_idx, sample in enumerate(train_loader):
        net.Encoder.zero_grad()
        net.Decoder.zero_grad()
        
        # optimizer.zero_grad()
        optimizer_encoder.zero_grad()
        optimizer_decoder.zero_grad()

        image_batch, captions_batch = sample['image'], sample['captions']

        #If GPU training required
        image_batch = image_batch.float()
        #captions_batch = captions_batch.float()

        if torch.cuda.is_available():
            image_batch, captions_batch = image_batch.cuda(), captions_batch.cuda()

        try:
            output_captions = net(image_batch, captions_batch) # Batch_size * max_caption_length * vocab_size
            print('Normal',captions_batch.shape, output_captions.shape)    
        except:
            print('Error---',captions_batch.shape, output_captions.shape)    
            
        if len(output_captions.shape) < 3:
            continue


        # print(captions_batch.shape, output_captions.shape)

        if batch_idx > 10 or epoch > 0:
            for i in range(captions_batch.shape[0]):
                if len(captions_batch.shape) > 2:
                    val1 = captions_batch[i,::]
                    val1 = np.argmax(val1, axis=1).numpy()
                else:
                    val1 = captions_batch[i,::]
                
                if torch.cuda.is_available():
                    val1 = val1.cpu().detach().numpy()
                    val2 = output_captions[i,::]
                    val2 = np.argmax((val2.cpu().detach().numpy()), axis=1)
                else:
                    val1 = val1.detach().numpy()
                    val2 = output_captions[i,::]
                    val2 = np.argmax((val2.detach().numpy()), axis=1)

                print(captions_preprocessing_obj.get_captions(val1))
                print(captions_preprocessing_obj.get_captions(val2),'\n')
        
        # Analyze the loss_function
        # Pass the matrices with same dimension


                # val2 = np.argmax((output_captions.detach().numpy())[i,:,:], axis=1)
                # print(captions_preprocessing_obj.get_captions(val1.detach().numpy()))
                # print(captions_preprocessing_obj.get_captions(val2),'\n')
            # print(list(map(lambda x : captions_preprocessing_obj.index_to_word[int(x)], val1)))
            # print(list(map(lambda x : captions_preprocessing_obj.index_to_word[int(x)], val2)))

        # for j in captions_batch[0][i].split():
        #     print(captions_preprocessing_obj.get_captions(caption_batch[i]))
        #     print(captions_preprocessing_obj.get_captions(output_batch[i]))
        #     print((caption_batch[i]))
        #     print((output_batch[i]))

        #print(np.argmax(captions_batch, axis = 2).shape, np.argmax(output_captions.detach().numpy(), axis = 2).shape, image_batch.shape)

        # loss = loss_function(output_captions.reshape(output_captions.shape[0], -1), captions_batch.reshape(captions_batch.shape[0], -1)) # 32 * max_caption_length * vocab_size
        # loss = loss_function(output_captions.reshape(output_captions.shape[0], -1), captions_batch.reshape(captions_batch.shape[0], -1)) # 32 * max_caption_length * vocab_size
        # loss = loss_function(np.argmax(torch.Tensor(np.argmax(output_captions.detach().numpy(), axis = 2))), torch.Tensor(np.argmax(captions_batch, axis = 2))) # 32 * max_caption_length * vocab_size

        # Look into the loss function
        # Output_captions - Batch_size * max_caption_length * vocab_size
        # Captions_batch  - Batch_size * max_caption_length

        # loss = loss_function(output_captions.reshape(-1, output_captions.shape[2]), captions_batch.reshape(-1))
        loss = loss_function(output_captions.reshape(-1, output_captions.shape[2]), captions_batch.reshape(-1))

        loss_list.append(loss.item())
        loss.backward()

        # optimizer.step()
        optimizer_encoder.step()
        optimizer_decoder.step()

        print("Epoch:{}, Iteration: {}, Loss: {}, TimeElapsed: {}Min".format(epoch, iteration+1, round(loss.item(), 2), round((time()-start)/60,2), ))
        iteration+=1

        if (iteration > 15):
            break

# Encoder Output - Batch_size * 

Train Dataset loaded
Optimizer loaded
Train Loader loaded
Epoch 1
torch.Size([10, 200])
Working (10, 200) torch.Size([10, 9])
Error (10, 200) torch.Size([10, 9])
Normal torch.Size([10, 9]) (10, 200)
torch.Size([10, 200])
Working (10, 200) torch.Size([10, 9])
Error (10, 200) torch.Size([10, 9])
Normal torch.Size([10, 9]) (10, 200)
torch.Size([10, 200])
Working (10, 200) torch.Size([10, 9])
Error (10, 200) torch.Size([10, 9])
Normal torch.Size([10, 9]) (10, 200)
torch.Size([10, 200])
Working (10, 200) torch.Size([10, 9])
Error (10, 200) torch.Size([10, 9])
Normal torch.Size([10, 9]) (10, 200)
torch.Size([10, 200])
Working (10, 200) torch.Size([10, 9])
Error (10, 200) torch.Size([10, 9])
Normal torch.Size([10, 9]) (10, 200)
torch.Size([10, 200])
Working (10, 200) torch.Size([10, 9])
Error (10, 200) torch.Size([10, 9])
Normal torch.Size([10, 9]) (10, 200)
torch.Size([10, 200])
Working (10, 200) torch.Size([10, 9])
Error (10, 200) torch.Size([10, 9])
Normal torch.Size([10, 9]) (10, 200)
tor

In [None]:
placeholder = np.zeros(15)
for i in captions_preprocessing_obj.captions_dict.values():
    placeholder[len(i.split(" "))] += 1

for i in placeholder:
    print(round(i,3))

## Prediction

In [None]:
TEST_IMAGE_DIR = '/content/drive/MyDrive/data/test_data/'

test_img_transform = transforms.Compose([Rescale(IMAGE_RESIZE), ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]) # Applied sequentially

# Creating the Dataset
test_dataset = TestDatasetLoader(TEST_IMAGE_DIR, img_transform=test_img_transform)

test_loader  = DataLoader(test_dataset, batch_size=1, shuffle=True, num_workers=NUM_WORKERS)

#output_caption = net.predict(device, test_loader)

In [None]:
def caption_image(image_feature, max_words=20):
        x = image_feature.unsqueeze(0)
        results = []
        states = None

        with torch.no_grad():
            for i in range(max_words):
                
                hiddens, states = net.Decoder.lstm(x, states)
                decoder_op = net.Decoder.linear(hiddens.squeeze(1))
                # predicted_word = decoder_op.argmax(1) # 1, 7356
                predicted_word = np.argmax(decoder_op, axis=1) # 1, 7356
                decoder_op = decoder_op[0].tolist()
                print(max(decoder_op))
                prob = max(decoder_op)
                x = net.Decoder.embed(predicted_word)
                x = x.unsqueeze(0)
                word = np.argmax(predicted_word[0])
                results.append(word)
                if predicted_word == captions_preprocessing_obj.vocab["[END]"]:
                    break
        print(results)
        caption = [captions_preprocessing_obj.index_to_word[int(i)] for i in  results]
        cap = ' '.join(caption)
        cap = cap.replace("[START]","").replace("[END]","")
        return cap
def to_device(data, device):
        if isinstance(data,(list,tuple)):
            return [to_device(x,device) for x in data]
        return data.to(device)

def max_prediction(encoded_features, decoder_model, max_length=80):
    #word_2_ix, ix_2_word = vocab_dict
    start_token = captions_preprocessing_obj.vocab['[START]']
    end_token = captions_preprocessing_obj.vocab['[END]']
    hidden = None # In the beginning the hidden state is None
    caption_word_id = []
    for i in range(max_length):
        encoded_features = encoded_features.unsqueeze(1)
        if(hidden == None):
            output, hidden = net.Decoder.get_pred(encoded_features)
        else:
            output, hidden = net.Decoder.get_pred(encoded_features, to_device(hidden, device))
    
        _ , predicted_id = output.max(1)
        caption_word_id.append(predicted_id)
        if (predicted_id == end_token):
            break
        encoded_features = decoder_model.embed(predicted_id)
    caption_word_id = torch.stack(caption_word_id, 1)
    return caption_word_id.cpu().numpy()[0]

In [None]:
pred_caps = {}
for batch_idx, sample in enumerate(test_loader):
        print("Image_idx", batch_idx)
        image = sample['image']
        image = image.float()
        img_features = net.Encoder(image)
        print(img_features.shape)
        pred_cap = caption_image(img_features, 11)
        pred_caps[batch_idx] = pred_cap
        print("Predicted",batch_idx, pred_cap)

In [None]:
def test_pred(embed_dim, hidden_dim,num_layer, vocab_len, vocab_dict, train_enc=False):
    encoder_model = net.Encoder(embed_dim, train_enc)
    decoder_model = net.Decoder(embed_dim, hidden_dim, num_layer)
        
    #encoder_model = encoder_model.cuda()
    encoder_model.eval() # Makes the model ready to be used in evaluation by taking care of batch norm and dropout
    #decoder_model = decoder_model.cuda()
    decoder_model.eval()
    
    test_dl = get_test_data()
    private_test_dl = get_private_test_data()
    bleu_score=[]
    hypo_complete = []
    ref = [[],[],[],[],[]]
    #for batchid, (images, img_id) in enumerate(private_test_dl):
    for batch_idx, sample in enumerate(test_loader):
        image = sample['image']
        images = images.cuda()
        encode_feat = encoder_model(images.float(), train_enc)
        #output_bs_pred =  beam_search_pred(encode_feat_rasha, decoder_model, vocab_dict, bw)
        output = max_prediction(encode_feat, decoder_model,  max_length=80):
        output_converted = [captions_preprocessing_obj.index_to_word[int(x)] for x in output]
        #hypo = convert_rasha_max_pred(output_bs_pred, ix_2_word)
        #private.write(f'{(img_id[0])}\t {" ".join(hypo)}\n') 
        print(output_converted)

    # sacre_bleu = sacrebleu.corpus_bleu(hypo_complete, ref).score
    # print(f"BLEU SCORE on public test data by beam search : {np.mean(np.array(bleu_score))}")
    # print(f"SACREBLEU SCORE by beam search : {sacre_bleu}")
    # return np.mean(np.array(bleu_score))

In [16]:
a =  np.arange(100).reshape(5,20)