Very minor differences in this model are 
1. The model dimensions are bigger. Batch size is 32.
2. The threshold is 4.
 def deep_output_layer(self, embedded_caption, h, context_vector):
        """
        :param embedded_caption: embedded caption, a tensor with shape (batch_size, embed_dim)
        :param h: hidden state, a tensor with shape (batch_size, decoder_dim)
        :param context_vector: context vector, a tensor with shape (batch_size, encoder_dim)
        :return: output
        """
        # Deep output is essentially multilayer perceptron for output
        dropout = nn.Dropout(0.2)
        scores = self.relu(dropout(self.L_h(h)))
        scores = (self.fc(h))
        return scores

In [75]:
import pandas as pd
data_location =  "../input/flickr-image-dataset/flickr30k_images/"
caption_file = '../input/captionstxt/captions.txt'


## Custom dataset loader

In [76]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [77]:
import os
import pandas as pd
import spacy # for tokenizer
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import matplotlib.pyplot as plt
import torchvision.transforms as transforms

# python -m spacy download en
spacy_eng = spacy.load('en_core_web_sm')

class Vocab_Builder:
    
    def __init__ (self,freq_threshold):

        # freq_threshold is to allow only words with a frequency higher 
        # than the threshold

        self.itos = {0 : "<PAD>", 1 : "<SOS>", 2 : "<EOS>", 3 : "<UNK>"}  #index to string mapping
        self.stoi = {"<PAD>" : 0, "<SOS>" : 1, "<EOS>" : 2, "<UNK>" : 3}  # string to index mapping
        self.freq_threshold = freq_threshold

    def __len__(self):
        return len(self.itos)
    
    @staticmethod
    def tokenizer_eng(text):
        #Removing spaces, lower, general vocab related work

        return [token.text.lower() for token in spacy_eng.tokenizer(text)]
    
    def build_vocabulary(self, sentence_list):
        frequencies = {} # dict to lookup for words
        idx = 4

        # FIXME better ways to do this are there
        for sentence in sentence_list:
            for word in self.tokenizer_eng(sentence):
                if word not in frequencies:
                    frequencies[word] = 1
                else:
                    frequencies[word] += 1 
                if(frequencies[word] == self.freq_threshold):
                    #Include it
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1
    
    # Convert text to numericalized values
    def numericalize(self,text):
        tokenized_text = self.tokenizer_eng(text) # Get the tokenized text
        
        # Stoi contains words which passed the freq threshold. Otherwise, get the <UNK> token
        return [self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
        for token in tokenized_text ]
    
    def denumericalize(self, tensors):
        text = [self.itos[token] if token in self.itos else self.itos[3]]
        return text

class FlickrDataset(Dataset):
    def __init__(self, root_dir, captions_file, test, transform = None, freq_threshold = 5):
        self.root_dir = root_dir
        self.df = pd.read_csv(caption_file , delimiter='|')
        self.transform = transform
        # Get images, caption column from pandas
        self.split_factor = 153915 # 4000/ 5 = reserving 200 images for testing
        
        self.imgs = self.df["image_name"]
        self.imgs_test = self.imgs[self.split_factor:]
        self.imgs = self.imgs[0:self.split_factor]
        self.captions = self.df["caption_text"]
        self.captions_test = self.captions[self.split_factor:]
        self.captions = self.captions[0:self.split_factor]
        self.test = test
        #Init and Build vocab
        self.vocab = Vocab_Builder(freq_threshold) # freq threshold is experimental
        self.vocab.build_vocabulary(self.captions.tolist())

    def __len__(self):
        if (self.test == True):
            return len(self.imgs_test)
        
        return len(self.imgs)
    
    def __getitem__(self, index: int):
        
        if self.test == False:
            caption = self.captions[index]
            img_id = self.imgs[index]
        elif self.test == True:
            index += self.split_factor
            caption = self.captions_test[index]
            img_id = self.imgs_test[index]
            
        img = Image.open(os.path.join(self.root_dir, img_id)).convert("RGB")
        if self.transform is not None:
            img = self.transform(img)
        
        numericalized_caption = [self.vocab.stoi["<SOS>"]] #stoi is string to index, start of sentence
        numericalized_caption += self.vocab.numericalize(caption) # Convert each word to a number in our vocab
        numericalized_caption.append(self.vocab.stoi["<EOS>"])

        #return tensor
        
        return img, torch.tensor(numericalized_caption)
    
    @staticmethod
    def evaluation(self, index : int):
        caption = self.captions_test[index]
        img_id = self.imgs_test[index]
        img = Image.open(os.path.join(self.root_dir, img_id)).convert("RGB")

        if self.transform is not None:
            img = self.transform(img)
        caption = self.vocab.tokenizer_eng(caption)
        return img, caption
# Caption lengths will be different, in our batch all have to be same length


'''
Goes to the dataloader
'''
class Collate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        imgs = [item[0].unsqueeze(0) for item in batch]
        
        imgs = torch.cat(imgs, dim=0)
        
        targets = [item[1] for item in batch]
        
        lengths = torch.tensor([len(cap) for cap in targets]).long()
        
        targets = pad_sequence(targets, batch_first=True, padding_value=self.pad_idx)
        
        return imgs, targets, lengths

# caption file, Maybe change num_workers

def get_loader( root_folder,annotation_file,  transform, batch_size = 32,  num_workers = 8, shuffle = True, pin_memory = False, test = False):
    


    dataset =  FlickrDataset(root_folder,  annotation_file, test, transform = transform)
    pad_idx = dataset.vocab.stoi["<PAD>"]

    loader = DataLoader(
        dataset = dataset,
        batch_size = batch_size,
        num_workers = num_workers,
        shuffle = shuffle,
        pin_memory = pin_memory,
        collate_fn =  Collate(pad_idx = pad_idx)
    )

    return loader, dataset

In [78]:
# vocabulary = Vocab_Builder(freq_threshold = 4)

# df.columns = ['image_name', 'comment_number', 'comment']
# captions = df['comment']
# vocabulary.build_vocabulary(captions.tolist())
# print(len(vocabulary))

In [79]:
# import pickle
# with open('vocab.pickle', 'wb') as f:
#     pickle.dump(vocabulary, f, protocol=pickle.HIGHEST_PROTOCOL)

In [80]:
import torch
import torch.nn as nn
import torchvision.models as models
# resnet50 = models.resnet50(pretrained = True)
import torch.nn.functional as F



In [81]:
class BahdanauAttention(nn.Module):
    
    '''
    Soft attention which is deterministic in nature. First introducted in 
    the paper Neural Machine Translation by Jointly Learning to Align and Translate (Bahdanau Et Al)
    '''
    
    def __init__(self, encoder_dim, decoder_dim, attention_dim):
        
        # Get the L attention dimension vector using this. Pass through softmax to get the 
        # score
        super(BahdanauAttention, self).__init__()
        
        self.attention = nn.Linear(attention_dim, 1)

        self.softmax = nn.Softmax(dim=1)
        
        self.relu = nn.ReLU()
        
        self.encoder_to_attention_dim = nn.Linear(encoder_dim, attention_dim)
        
        self.decoder_to_attention_dim = nn.Linear(decoder_dim, attention_dim)
        
        self.dropout = nn.Dropout(0.5)
        
        self.tanh = nn.Tanh()
        
    def forward(self, encoder_output, hidden_states):
        
        '''
        encoder_output : shape (batch_size, L, D)
        decoder_output : shape (batch_size, hidden_state dimension) 
        
        '''
      
        
        encoder_attention = self.encoder_to_attention_dim(encoder_output) # (batch_size, L, attention_dim)
        
        decoder_attention = self.decoder_to_attention_dim(hidden_states) # (batch_size, attention_dim)
        
        # Torch.cat() ?? 
        # >>> a = torch.cat((encoder,decoder.unsqueeze(1)),dim=1)
        # No, its actually adds the dim = 1 (Adds one more item in dim = 1)
        # We just want to add.
        
        
        #   (batch_size, L, attention_dim) + (batch_size, 1, attention_dim) 
        encoder_decoder = encoder_attention + decoder_attention.unsqueeze(1)  # (batch_size, L, attention_dim)
        
        encoder_decoder = self.tanh(encoder_decoder)
        
        attention_full = (self.attention(encoder_decoder)).squeeze(2) # (batch_size, L)
        
        alpha = self.softmax(attention_full) # Take the softmax across L(acc to paper)
        
        
        '''
        Equation 13 in the paper - classic Bahdanau attention
        '''
        
        z = (encoder_output * alpha.unsqueeze(2) ).sum(dim = 1) # Sum across L (pixels)
        
        return z, alpha
        
        


In [82]:


# Major changes include the ignoring of the last two layers. Author use a lower layer for more dense features.


class EncoderCNN(nn.Module):
   
    '''Takes in the image, encode it in shape (L,D) and return to decoder
        
        "The extractor produces L vectors, each of which is
        a D-dimensional representation corresponding to a part of
        the image"
        
    '''

    def __init__(self, encoded_size=14, train_CNN = False):
        
        super(EncoderCNN, self).__init__()
        
        # Fine-tune parameter
        self.train_CNN = train_CNN
        
        self.encoded_size =encoded_size
        
        # Load the resnet
#         self.resnet50 = models.resnet50(pretrained=True)
        self.resnet50 = models.resnet50(pretrained = False)
        
        # Remove adaptive pool and FC from the end
        layers_to_use = list(self.resnet50.children())[:-3]
        
        # Unpack and make it the conv_net
        self.resnet = nn.Sequential(*layers_to_use)
        
#         self.fc = nn.Linear(in_features,encoded_size)
        
        self.adaptive_pool = nn.AdaptiveAvgPool2d((encoded_size, encoded_size))
        
        self.relu = nn.ReLU()
        
        self.dropout = nn.Dropout(0.5)
        
        if not train_CNN:
            for param in self.resnet.parameters():
                param.requires_grad = False
           
        
    def forward(self, images):
         
        # images.shape (batch_size, 3, image_size, image_size)    
            
        # Change the image_size dimensions. Check them yourself.
        batch_size = images.shape[0]
        
        with torch.no_grad():
            features = self.resnet(images) # Shape : (batch_size, encoder_dim, image_size/32, image_size/32)
        
        features = self.adaptive_pool(features) # Shape (batch_size, encoder_dim, encoded_size, encoded_size)
        
        features = features.permute(0, 2, 3, 1) # Shape : (batch_size, encoded_size, encoded_size, encoder_dim)
        
        # The above transformation is needed because we are going to do some computation in the 
        # decoder.
        encoder_dim = features.shape[-1]
        # When in doubt https://stackoverflow.com/questions/42479902/how-does-the-view-method-work-in-pytorch
        features = features.view(batch_size, -1, encoder_dim)  # (batch_size, L, D)
        
#         print("-" * 80 )
        
#         print("Features shape : " , features.shape)
        
#         print("-" * 80 )
        
        return features
    
    
# In decoder, we use an LSTM cell. So, remove num_layers
# https://stackoverflow.com/questions/57048120/pytorch-lstm-vs-lstmcell
# In seq to seq model, it's more like gettign the state and ending the for loop when 
# you get the <EOS>

class Decoder(nn.Module):
    def __init__(self,encoder_dim, decoder_dim, embed_size, vocab_size, attention_dim, dropout = 0.5):
        
        super(Decoder,self).__init__()
        
        # Setting everything for the perfect model!
        
        self.embed = nn.Embedding(vocab_size, embed_size) # Embedding layer courtesy Pytorch
        
        
        self.encoder_dim = encoder_dim
        
        self.decoder_dim = decoder_dim
        
        self.attention_dim = attention_dim
        
        self.embed_dim = embed_size
        
        self.vocab_size = vocab_size
        
        self.dropout = nn.Dropout(0.2)
        
        # Note, it's an LSTM Cell, features + embedding
        self.lstm = nn.LSTMCell(self.encoder_dim + self.embed_dim, self.decoder_dim, bias=True)
        
        self.attention = BahdanauAttention(encoder_dim, decoder_dim, attention_dim)
        
        self.f_beta = nn.Linear(self.decoder_dim, self.encoder_dim)
        
        self.sigmoid = nn.Sigmoid()
        
        self.relu = nn.ReLU()
        
        # See the paper 
        '''
        The initial memory state and hidden state of the LSTM
        are predicted by an average of the annotation vectors fed.
        through two separate MLPs (init,c and init,h):
        '''
        
        self.init_h = nn.Linear(encoder_dim, decoder_dim)
        self.init_c = nn.Linear(encoder_dim, decoder_dim)
        
        # deep output layers
        self.L_h = nn.Linear(decoder_dim, embed_size)
        # self.L_z = nn.Linear(encoder_dim, embed_size)
        # self.L_o = nn.Linear(embed_size, vocab_size)
        
        self.tanh = nn.Tanh()
        self.fc = nn.Linear(decoder_dim, vocab_size)  # linear layer to find scores over vocabulary
        self.init_weights()
        self.init_weights()
        
    # Encoder output is the annotated vector a (L,D) in the paper
    
    def initialise_hidden_states(self, encoder_output):
        
        '''
        Initialise the hidden states before forward prop. As given in the paper.
        Authors take the mean of annotation vector across L dimension. Pass it through an MLP.
        '''
        
        # encoder_output : shape (batch_size, L, encoder_dim=D)
        
        mean = (encoder_output).mean(dim = 1) # Take mean over L
        
        # Pass through Fully connected
        
        c_0 = self.init_c(mean)
        c_0 = self.tanh(c_0)
        h_0 = self.init_h(mean)
        h_0 = self.tanh(h_0)
        
        return h_0, c_0, 

    def init_weights(self):
        
        # This helps initially. Fill the following weights before starting.
        
        self.embed.weight.data.uniform_(-0.1,0.1)
        self.fc.weight.data.uniform_(-0.1,0.1)
        self.fc.bias.data.fill_(0)
    
    
    # Thankful to sgrvinod for this part. b
    def forward(self, encoder_output, caption, caption_lengths):
        
        '''
        encoder_output : shape(batch_size, L, D)
        caption : (max_length, batch_size )
        
        Get the encoder_output i.e the features.
        '''
        
        batch_size = encoder_output.size(0)
        # num_pixels 
        L = encoder_output.size(1)
        
        max_caption_length = caption.shape[-1] # shape : (batch_size, max_caption) 
        
        caption_lengths, sort_ind = caption_lengths.sort(dim=0, descending=True)
        encoder_output = encoder_output[sort_ind]
        caption = caption[sort_ind]
        
#         print(sort_ind)
        
        lengths = [l - 1 for l in caption_lengths]
        
        embedding_of_all_captions = self.embed(caption)
        
        predictions = torch.zeros(batch_size, max_caption_length - 1, self.vocab_size).to(device)
        alphas = torch.zeros(batch_size, max_caption_length - 1, L).to(device)  
        
        # Concat and pass through lstm to get hidden states
        
        h, c = self.initialise_hidden_states(encoder_output)
        
        
        
        # Exclude <EOS>, t is the th timestep
        # We get all the embeddings for the t timestep
        # Then we get the encoded_output aka annotation vector
        # Use soft attention to get the context vector.
        # Concat and pass through the lstm cell to get hidden states --> predictions
        
#         print(max_caption_length)
        
        for t in range(max_caption_length - 1):
            
            batch_size_t = sum([l > t for l in lengths]) 
            
            # z from the returning function
            context_vector, alpha = self.attention(encoder_output[:batch_size_t], h[:batch_size_t])
            
            # Changes inspirsed from SgdrVinod(Suggested in paper also)
            gate = self.sigmoid(self.f_beta(h[:batch_size_t]))
            
            gated_context = gate * context_vector
#             context_vector : torch.Size([32, 1024]), embedded_caption_t : torch.Size([32, 256])

            h, c = self.lstm(torch.cat([ embedding_of_all_captions[:batch_size_t,t,:], gated_context], dim=1),(h[:batch_size_t], c[:batch_size_t]))
            
            predict_deep = self.deep_output_layer(embedding_of_all_captions[:batch_size_t,t,:], h, context_vector)
            
            predictions[:batch_size_t, t, :] = predict_deep 
            
            alphas[:batch_size_t, t, :] = alpha
            
        return predictions, alphas, caption, lengths
        
        
 
    def deep_output_layer(self, embedded_caption, h, context_vector):
        """
        :param embedded_caption: embedded caption, a tensor with shape (batch_size, embed_dim)
        :param h: hidden state, a tensor with shape (batch_size, decoder_dim)
        :param context_vector: context vector, a tensor with shape (batch_size, encoder_dim)
        :return: output
        """
        # Deep output is essentially multilayer perceptron for output
        scores = self.relu(self.dropout(self.L_h(h)))
        scores = (self.fc(h))
        return scores
    
    def predict_caption(self, encoder_output, captions):
        
        # "<SOS>" 1
        caption_list = [1]
        alphas = [] 
        h, c = self.initialise_hidden_states(encoder_output)
        
        
        # 2 is <EOS>
        while len(caption_list) < 40 :
            word = caption_list[-1]
            
            embedded_caption = self.embed(  torch.LongTensor([word]).to(device)  )  # (1, embed_dim)
            
            context_vector, alpha = self.attention(encoder_output, h);
            
            gate = self.sigmoid(self.f_beta(h))
            
            gated_context = gate * context_vector
            
            h, c = self.lstm(torch.cat([embedded_caption, gated_context], dim=1), (h, c))
            
            predictions = self.deep_output_layer(embedded_caption, h, context_vector)  # (1, vocab_size)
            
            # item converts to python scalar otherwise expect CUDA re-assert trigger
            
            next_word = (torch.argmax(predictions, dim=1, keepdim=True).squeeze()).item()
            
            caption_list.append(next_word)
            
            alphas.append(alpha)
            
            if(caption_list[-1] == 2):
                break
        return caption_list, alphas
      
    def beam_search(self, encoder_output, beam_size = 3):
        
        k = beam_size
        
        vocab_size = self.vocab_size
        
        encoder_size = encoder_output.size(-1)
        
        encoder_output = encoder_output.view(1, -1, encoder_size)
        
        num_pixels = encoder_output.size(1)
        
        encoder_output = encoder_output.expand(k, num_pixels, encoder_size)  # (k, num_pixels, encoder_dim)
        
        # Vocab.stoi(SOS)
        k_prev_words = torch.LongTensor([[1]] * k).to(device) 
        seqs = k_prev_words
        
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)
        
        complete_seqs = list()
        complete_seqs_scores = list()
        
        step = 1
        
        h, c = self.initialise_hidden_states(encoder_output)
        
        while True:
            embedded_caption = self.embed(k_prev_words).squeeze(1)
            
            context_vector, alpha = self.attention(encoder_output, h);
            
            gate = self.sigmoid(self.f_beta(h))
            
            gated_context = gate * context_vector
            
            h, c = self.lstm(torch.cat([embedded_caption, gated_context], dim=1), (h, c))
            
            scores = self.deep_output_layer(embedded_caption, h, context_vector)
            
            scores = F.log_softmax(scores, dim=1)
            
            scores = top_k_scores.expand_as(scores) + scores
            
            if step == 1:
                top_k_scores, top_k_words = scores[0].topk(k, dim=0)  # (s)
            else:
                top_k_scores, top_k_words = scores.view(-1).topk(k, dim=0)  # (s)
                
            prev_word_inds = torch.true_divide(top_k_words , vocab_size).long().cpu()  # (s)
            next_word_inds = top_k_words % vocab_size  # (s)
            
             # Add new words to sequences
            seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)

            # Which sequences are incomplete (didn't reach <EOS>)?
            incomplete_inds = [ind for ind, next_word in enumerate(next_word_inds) if
                               next_word != 2 ] #vocab.itos['<EOS>']
            complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))

            # Set aside complete sequences
            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds])
            
            k -= len(complete_inds)  # reduce beam length accordingly

            # Proceed with incomplete sequences
            if k == 0:
                break
                
            seqs = seqs[incomplete_inds]
            
            h = h[prev_word_inds[incomplete_inds]]
            c = c[prev_word_inds[incomplete_inds]]
            
            encoder_output = encoder_output[prev_word_inds[incomplete_inds]]
            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            if step > 50:
                break
            step += 1
        
        if len(complete_seqs_scores) > 0:
            i = complete_seqs_scores.index(max(complete_seqs_scores))
            seq = complete_seqs[i]
            return seq
        else:
            return [1,2]
        return complete_seqs


         

In [83]:
import torch
import torchvision.transforms as transforms
from PIL import Image


def save_checkpoint(state, filename="./LastModel.pth.tar"):
    print("=> Saving checkpoint")
    torch.save(state, filename)


def load_checkpoint(checkpoint, model, optimizer):
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])
    step = checkpoint["step"]
    return step

In [84]:
from matplotlib import pyplot as plt
import random


def show_image(inp, title=None):
    """Imshow for Tensor."""
    inp = inp.numpy().transpose((1, 2, 0))
    plt.imshow(inp)
    
 
    if title is not None:
        plt.title(title)
    plt.savefig('foo.png', bbox_inches='tight')
    
    plt.pause(0.001)  # pause a bit so that plots are updated

# Model parameters

In [85]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence
# from convert_text import get_loader
# from utils import *
# from model import CNNtoRNN
# from utils save_checkpoint, load_checkpoint, print_examples


# Training loop


In [87]:


mean = [0.485, 0.456, 0.406]

std = [0.229, 0.224, 0.225]

transform = transforms.Compose(
    [transforms.Resize((256,256)),
    transforms.ToTensor(),
    transforms.RandomHorizontalFlip(),
    transforms.Normalize(mean, std)]
)
transform_val = transforms.Compose(
    [transforms.Resize((256,256)),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)]
)

train_loader, dataset = get_loader(
    root_folder = data_location+"/flickr30k_images",
    annotation_file = '../input/captionstxt/captions.txt',
    transform = transform, 
    num_workers = 4,
    test = False
)
test_loader, test_dataset = get_loader(
    root_folder = data_location+"/flickr30k_images",
    annotation_file = '../input/captionstxt/captions.txt',
    transform = transform_val, 
    num_workers = 4,
    test = True
)


# Test_dataset gonna come here soon
# Think about that later. We will do some training phases. It will take time but keep your calm.

torch.backends.cudnn.benchmark = True # Get some boost probaby

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

load_model = True

save_model = True

train_CNN = False
#Hyperparameters

embed_size = 350
encoder_dim = 1024
decoder_dim = 512
attention_dim = 512
vocab_size = len(dataset.vocab)
learning_rate = 4e-4 # Earlier 4e-4 till epoch 7 and then 2e-4
num_epochs = 15

print("Vocab_size", vocab_size)

# Tensorboard
writer = SummaryWriter("runs/flickr")

step = 0
# init model, loss
encoder = EncoderCNN() # Default arguments already given as encoder_size 14, train_CNN = False
encoder = encoder.to(device)

decoder = Decoder(encoder_dim, decoder_dim, embed_size, vocab_size, attention_dim)    
decoder = decoder.to(device)

print(device)

alpha_c = 1.0  # Not used variable in code, used just 1.0  
# regularization parameter for 'doubly stochastic attention', as in the paper

# https://pytorch.org/tutorials/beginner/torchtext_translation_tutorial.html
# criterion = nn.CrossEntropyLoss(ignore_index=dataset.vocab.stoi["<PAD>"]).to(device)

criterion = nn.CrossEntropyLoss().to(device)

optimizer = optim.Adam(decoder.parameters(), lr = learning_rate)

encoder.train()
decoder.train()

if load_model:
    # load the decoder weights
    step = load_checkpoint(torch.load("./Flickr30k_Decoder_11.pth.tar"), decoder, optimizer)
    
    # load the encoder weights
    m_state_dict = torch.load('./resnet5011.pt')
    encoder.load_state_dict(m_state_dict)
    encoder = encoder.to(device)
    
    print(encoder.training, decoder.training)
    
 # Do after epoch 10 .. 14
for epoch in range(11,13):
    if save_model:
        checkpoint = {
            "state_dict" : decoder.state_dict(),
            "optimizer" : optimizer.state_dict(),
            "step" : step
        }
        # This also got saved when epoch 6 started(latest)
        # torch.save(encoder.state_dict(), 'resnet50.pt')
        for param_group in optimizer.param_groups:
            print(param_group['lr'])
            param_group['lr'] = 4e-5
            print("After", param_group['lr'])
        
        
        if epoch > 11:
            filename = './Flickr30k_Decoder_' + str(epoch) +  '.pth.tar'
            print("Saving in ", filename, epoch)
            save_checkpoint(checkpoint, filename)
            torch.save(encoder.state_dict(), 'resnet50' + str(epoch) + '.pt')
    losses = []
    mvl = []
    for idx, (imgs, captions, lengths) in enumerate(train_loader):
#         optimizer.zero_grad() Init config

        imgs = imgs.to(device)
        captions = captions.to(device)
        lengths = lengths.to(device)
        # Pass through the encoder and get the annotation vector
        encoded_images = encoder(imgs)

        scores, alphas, sorted_cap, decode_lengths = decoder(encoded_images, captions, lengths)
        
        # We don't want <SOS>
        sorted_cap = sorted_cap[:,1:] # shape (batch_size, max_caption)
        
        # Remove timesteps that we didn't decode at, or are pads
        # pack_padded_sequence is an easy trick to do this
        
        scores = pack_padded_sequence(scores, decode_lengths, batch_first=True).data
        targets = pack_padded_sequence(sorted_cap, decode_lengths, batch_first=True).data
        

        # Calculate loss
        loss = criterion(scores, targets)
        
        # batch_size = sorted_cap.shape[0]
        # caption_length = sorted_cap.shape[1]
        # loss = criterion(predictions.view(batch_size * caption_length, -1), sorted_cap.reshape(-1))
        # Doubly stochastic attention regularization but not using the beta so not proper
        loss += 1.0 * ((1. - alphas.sum(dim=1)) ** 2).mean()

        losses.append(loss.item())
        
        decoder.zero_grad()
        encoder.zero_grad()
        
        loss.backward()
        
        if( idx % 200 == 0):
            print("Step", idx, loss.item())

        writer.add_scalar("training loss", loss.item(), global_step = step)

        step += 1

        optimizer.step()

        if (idx)% 2000 == 0:
            print("Epoch: {} loss: {:.5f}".format(epoch,loss.item()))
            encoder.eval()
            decoder.eval()
            with torch.no_grad():
                bleu_score_checker() # BLEU should be done separately
                dataiter = iter(train_loader)
                imgs,captions, lengths = next(dataiter)
                imgs = imgs
                captions = captions.to(device)
                encoded_output = encoder( (imgs[0].unsqueeze(0).to(device)) )

                # Does not make a difference for the caption since we are not using it
                caption_greedy, alphas = decoder.predict_caption(encoded_output, captions)
                caps_greedy =[dataset.vocab.itos[idx] for idx in caption_greedy]
                caption = ' '.join(caps_greedy)
                print("Greedy search", caption)
                
                caption = decoder.beam_search(encoded_output, 3)
                caps = [dataset.vocab.itos[idx] for idx in caption]
                print("Beam search", ' '.join(caps) )

                show_image(imgs[0],title=' '.join(caps))
            decoder.train()
            encoder.train()
            
        # Valid loss
        if (idx ) % 1000 == 0 :
            valid_losses = []
            decoder.eval()
            encoder.eval()
            print("Valid section")
            with torch.no_grad():
                for index, (imgs, captions, lengths) in enumerate(test_loader):
                    imgs = imgs.to(device)
                    captions = captions.to(device)
                    lengths = lengths.to(device)
                    
                    encoded_images = encoder(imgs)
                    scores, alphas, sorted_cap, decode_lengths = decoder(encoded_images, captions, lengths)

                    # We don't want <SOS>
                    sorted_cap = sorted_cap[:,1:] # shape (batch_size, max_caption)

                    scores = pack_padded_sequence(scores, decode_lengths, batch_first=True).data
                    targets = pack_padded_sequence(sorted_cap, decode_lengths, batch_first=True).data

                    valid_loss = criterion(scores, targets)
                    # batch_size = sorted_cap.shape[0]
                    # caption_length = sorted_cap.shape[1]
                    # valid_loss = criterion(predictions.view(batch_size * caption_length, -1), sorted_cap.reshape(-1))
                    
                    valid_loss += 1.0 * ((1. - alphas.sum(dim=1)) ** 2).mean()
                    
                    valid_losses.append(valid_loss.item())

                    # print("Step", index, valid_loss.item())
            decoder.train()
            encoder.train()
            print("-" * 80)
            mean_valid_loss = sum(valid_losses)/len(valid_losses)
            mvl.append(mean_valid_loss)
            print(mean_valid_loss)        
            print("-" * 80)                        
    
    mean_loss = sum(losses)/len(losses)
    print("-" * 80)
    print("Mean loss", mean_loss)
    print(mvl)
    print("-" * 80)




Vocab_size 7547
cuda
=> Loading checkpoint
True True
4e-05
After 4e-05
Step 0 2.7402756214141846
Epoch: 10 loss: 2.74028


KeyboardInterrupt: 

Epoch 10 --> 11 best till now.
BLEU-1 0.5930380578473908
BLEU-2 0.40598468860863113
BLEU-3 0.2727534747496662
BLEU-4 0.1858028279503759
Nltk metrics
BLEU-1 0.6111496417318056
BLEU-2 0.4183835766419379
BLEU-3 0.28463034547490357
BLEU-4 0.19147730644802838

Torch metrics epoch 10 --> epoch 11
BLEU-1 0.5920705441713625
BLEU-2 0.4084211654090082
BLEU-3 0.28045407870035294
BLEU-4 0.19565882267053974

In between epoch 9
Torch metrics
BLEU-1 0.5865305591713187
BLEU-2 0.4036900261383535
BLEU-3 0.2721799185648416
BLEU-4 0.186270821154543

## Saving resnet weights for deployment purpose

In [13]:
from torchtext.data.metrics import bleu_score

In [14]:

def show_image2(inp, index, title=None):
    """Imshow for Tensor."""
    inp = inp.numpy().transpose((1, 2, 0))
    plt.imshow(inp)
    
 
    if title is not None:
        plt.title(title)
    name  = 'showcase' + str(index) + '.png'
    plt.savefig(name, bbox_inches='tight')
    
    plt.pause(0.001)  # pause a bit so that plots are updated

In [48]:
import nltk
def bleu_score_checker():
    gc = []
    test = []
    # Will execute only 200 times. Inner 5 times
    offset = 153915
    
    for i in range(0,5000,5):
        temp_gc = []
        encoder.eval()
        decoder.eval()
        with torch.no_grad():
            img, caption = dataset.evaluation(dataset,i + offset)
            img = img.unsqueeze(0)
#             print(caption)
            
            encoded_output = encoder(img.to(device))
            
            caps = decoder.beam_search(encoded_output, 3)
            
            caps = [dataset.vocab.itos[idx] for idx in caps]
            
            generated_caption = ' '.join(caps)
#             show_image2(img.squeeze(0),i,title=generated_caption)
            generated_caption = generated_caption.split()[1:]
            generated_caption = generated_caption[:-2]
            test.append(generated_caption)
            temp_gc.append(caption)
        for j in range(1,5):
            img, caption = dataset.evaluation(dataset, i + j + offset)
            temp_gc.append(caption)
        gc.append(temp_gc)
        decoder.train()
        encoder.train()
    print("-" * 80)
    print("Torch metrics")
    print("BLEU-1", bleu_score(test,gc, max_n = 1, weights = [1.0] ) )
    
    print("BLEU-2", bleu_score(test, gc, max_n = 2, weights = [0.5,0.5]))
    
    print("BLEU-3", bleu_score(test, gc, max_n = 3, weights = [1/3,1/3,1/3]))
    
    print("BLEU-4", bleu_score(test, gc, max_n = 4, weights = [0.25,0.25, 0.25, 0.25]))
    
    
    print("-"*80)
    print("Nltk metrics")
    BLEU4 = nltk.translate.bleu_score.corpus_bleu(gc, test,weights=(0.25,0.25,0.25,0.25))
    BLEU1 = nltk.translate.bleu_score.corpus_bleu(gc, test,weights=(1.0,0,0,0))
    BLEU2 = nltk.translate.bleu_score.corpus_bleu(gc, test,weights=(0.5,0.5,0,0))
    BLEU3 = nltk.translate.bleu_score.corpus_bleu(gc, test,weights=(0.33,0.33,0.33,0))
    
    
    print(f"BLEU-1 {BLEU1}")
    print(f"BLEU-2 {BLEU2}")
    print(f"BLEU-3 {BLEU3}")
    print(f"BLEU-4 {BLEU4}")
    
#     print("GC" , gc)
#     print("Predictions", test)
        
    

In [50]:
bleu_score_checker()

--------------------------------------------------------------------------------
Torch metrics
BLEU-1 0.5904854430327624
BLEU-2 0.40363062428799307
BLEU-3 0.2730071269421867
BLEU-4 0.1869351372648633
--------------------------------------------------------------------------------
Nltk metrics
BLEU-1 0.6076569796548017
BLEU-2 0.4153683443070568
BLEU-3 0.28448294552872605
BLEU-4 0.19237128974019688


Epoch 10 - It's good over 5000/5 images

Torch metrics
BLEU-1 0.5921179879859123
BLEU-2 0.40628215260341893
BLEU-3 0.27486911607441983
BLEU-4 0.18863532216414042
Nltk metrics
BLEU-1 0.6074745138570442
BLEU-2 0.41681903972525775
BLEU-3 0.28552963821582755
BLEU-4 0.19352754860699