In [None]:
# Mount Google Drive inside the Colab VM; the dataset paths used below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Pin NLTK to 3.4.5; it provides the tokenizer and the BLEU/METEOR metrics imported in the next cell.
!pip install nltk==3.4.5

In [None]:
import os
import numpy as np
import joblib
import cv2
from google.colab.patches import cv2_imshow

from torch.nn.utils.rnn import pack_padded_sequence

import torch
import torchvision
from torchvision.transforms import ToTensor, Normalize, Compose, Resize
from PIL import Image
import torch.nn as nn
import torch.nn.functional as F

from sklearn.model_selection import train_test_split
from matplotlib.pyplot import imshow
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.translate.meteor_score import meteor_score
import nltk.translate.bleu_score
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

In [None]:
# Training split: image directory plus a pickled mapping that the code below iterates as
# {image_id: [caption, ...]}.
# NOTE(review): joblib.load unpickles the file, so only load caption files from a trusted source.
training_data_images_path= '/content/drive/MyDrive/Deep Learning/A4/Data/Train/Images'
training_data_images_captions= joblib.load('/content/drive/MyDrive/Deep Learning/A4/Data/Train/train_captions.pkl')
# Keep only the first 3000 entries (in dict order) — presumably to bound load time and memory; confirm.
training_data_images_captions = {k: training_data_images_captions[k] for k in list(training_data_images_captions)[:3000]}

# Validation split, used in full (the subsetting line below is disabled).
val_data_images_path= '/content/drive/MyDrive/Deep Learning/A4/Data/Val/Images'
val_data_images_captions= joblib.load('/content/drive/MyDrive/Deep Learning/A4/Data/Val/val_captions.pkl')
# val_data_images_captions = {k:val_data_images_captions[k] for k in list(val_data_images_captions)[:64]}


def generate_vocabulary(data):
    """Build a word -> integer-id vocabulary from every caption in ``data``.

    Args:
        data: mapping of image id -> list of caption strings.

    Returns:
        dict mapping each token to a unique id. The special tokens
        ``<start>``, ``<end>``, ``<pad>`` and ``<unk>`` always receive ids
        0-3 (in that order); the remaining tokens follow in sorted order so
        that the ids are identical on every run and stay compatible with
        previously saved vocabularies/checkpoints.
    """
    special_tokens = ['<start>', '<end>', '<pad>', '<unk>']

    tokens = set()
    for caption_list in data.values():
        for caption in caption_list:
            # Strip one wrapping double quote from each side. startswith /
            # endswith are safe on empty strings, unlike caption[0] / caption[-1].
            if caption.startswith('"'):
                caption = caption[1:]
            if caption.endswith('"'):
                caption = caption[:-1]
            if not caption:
                # Nothing to tokenize for an empty caption.
                continue
            tokens.update(nltk.word_tokenize(caption))

    # Guard the reserved ids: a corpus token equal to a special token must
    # not be appended again and overwrite ids 0-3.
    tokens.difference_update(special_tokens)

    # Sorting makes the mapping deterministic (set iteration order of
    # strings changes between interpreter sessions).
    return {word: idx for idx, word in enumerate(special_tokens + sorted(tokens))}

def pre_process_captions(data,vocab_dict):
    """Encode every caption as a fixed-length sequence of vocabulary ids.

    Each caption is tokenized with NLTK, wrapped in ``<start>`` / ``<end>``
    ids, and right-padded with the ``<pad>`` id up to the longest caption in
    ``data``. Words missing from ``vocab_dict`` map to ``<unk>``.

    Args:
        data: mapping of image id -> list of caption strings.
        vocab_dict: token -> id mapping containing the four special tokens.

    Returns:
        (captions, max_length) where ``captions`` is a ``torch.FloatTensor``
        built from the nested list [image][caption][position] and
        ``max_length`` is the longest token count plus 2 (start and end).
        The ids are stored as floats; callers cast with ``.long()``.

    Raises:
        ValueError/TypeError from ``torch.FloatTensor`` if the images do not
        all carry the same number of captions (ragged nested list).
    """
    start_id = vocab_dict['<start>']
    end_id = vocab_dict['<end>']
    pad_id = vocab_dict['<pad>']
    unk_id = vocab_dict['<unk>']

    encoded_images = []
    max_length = 0
    caption_max = 'hello'  # placeholder printed when there are no captions

    for caption_list in data.values():
        encoded_captions = []
        for caption in caption_list:
            # Strip one wrapping double quote from each side (safe on empty strings).
            if caption.startswith('"'):
                caption = caption[1:]
            if caption.endswith('"'):
                caption = caption[:-1]

            # An empty caption has no tokens; it still becomes <start> <end>.
            words = nltk.word_tokenize(caption) if caption else []

            if len(words) > max_length:
                max_length = len(words)
                caption_max = caption

            ids = [start_id]
            ids.extend(vocab_dict.get(word, unk_id) for word in words)
            ids.append(end_id)
            encoded_captions.append(ids)
        encoded_images.append(encoded_captions)

    max_length += 2  # account for <start> and <end>
    print("max length",max_length)
    print(caption_max)

    # Right-pad every sequence in place so the nested list is rectangular.
    for encoded_captions in encoded_images:
        for ids in encoded_captions:
            ids.extend([pad_id] * (max_length - len(ids)))

    return torch.FloatTensor(encoded_images),max_length

def caption_sentence_torch(data,max_length):
    """Return every caption as a ``<start> ... <end> <pad>...`` string.

    Args:
        data: mapping of image id -> list of caption strings.
        max_length: target number of space-separated tokens per sentence.

    Returns:
        list (one entry per image, in dict order) of lists of padded
        caption strings. Captions already at or above ``max_length`` tokens
        are returned unpadded (they are not truncated).

    NOTE(review): length is counted by splitting on single spaces, whereas
    ``pre_process_captions`` counts NLTK tokens, so the two notions of
    length can disagree for captions with punctuation — confirm before
    pairing their outputs.
    """
    padded_sentences = []

    for caption_list in data.values():
        sentences = []
        for caption in caption_list:
            # Strip one wrapping double quote from each side. startswith /
            # endswith avoid the IndexError that caption[0] raises on ''.
            if caption.startswith('"'):
                caption = caption[1:]
            if caption.endswith('"'):
                caption = caption[:-1]

            sentence = '<start> ' + caption + ' <end>'
            # A negative count yields an empty pad string.
            num_pad = max_length - len(sentence.split(' '))
            sentences.append(sentence + ' <pad>' * num_pad)

        padded_sentences.append(sentences)

    return padded_sentences
    
# The vocabulary is built from the training captions only.
vocab_dict= generate_vocabulary(training_data_images_captions)
# joblib.dump(vocab_dict,'/content/drive/MyDrive/Deep Learning/A4/models/vocab_dict')

# Encode the training captions as padded id sequences; max_length includes <start> and <end>.
train_caption_in_no_tensor,max_length= pre_process_captions(training_data_images_captions,vocab_dict)
# train_caption_sentences_list=caption_sentence_torch(training_data_images_captions,max_length)

# Validation captions reuse the training vocabulary (unseen words become <unk>) and get their own max length.
val_caption_in_no_tensor,max_length_val= pre_process_captions(val_data_images_captions,vocab_dict)
# val_caption_sentences_list=caption_sentence_torch(val_data_images_captions,max_length_val)

In [None]:
# Sanity check: shape of the encoded training-caption tensor returned by pre_process_captions.
print(train_caption_in_no_tensor.shape)

In [None]:
# Sanity check: shape of the encoded validation-caption tensor returned by pre_process_captions.
print(val_caption_in_no_tensor.shape)

In [None]:
# print(len(training_data_images_captions))

# Size of x_train = (6000, H, W), but the images are resized first while loading and preprocessing
# Size of y_train= (6000, 5 )

                   #########  DONT RUN UNLESS NEEDED ####  (takes time)

def pre_process_and_load(image_path,captions):
    """Load, resize and tensorise every captioned image found in ``image_path``.

    Each image is read with OpenCV, resized to 224x224, scaled to [0, 1],
    cast to float32 and permuted to channel-first layout.

    Args:
        image_path: directory containing the image files.
        captions: mapping of image file name -> list of caption strings.

    Returns:
        (x_train, y_train): a list of image tensors of shape (3, 224, 224)
        and the parallel list of caption lists. Entries whose image file is
        missing or unreadable are skipped in BOTH lists.

    NOTE(review): callers that pair ``x_train`` with tensors built from the
    full ``captions`` dict will be misaligned if any entry is skipped here.
    NOTE(review): ``cv2.imread`` yields BGR channel order and no ImageNet
    normalisation is applied, while torchvision's pretrained VGG weights
    expect normalised RGB input — confirm whether a conversion is wanted.
    """
    # A set gives O(1) membership tests instead of scanning the directory
    # listing once per caption entry.
    available_images = set(os.listdir(image_path))

    x_train=[]
    y_train=[]
    for cnt, (image_id, caption_list) in enumerate(captions.items()):
        print(cnt)  # progress counter (counts skipped entries too)
        if image_id not in available_images:
            continue

        img = cv2.imread(os.path.join(image_path, image_id))
        if img is None:
            # cv2.imread signals an unreadable/corrupt file by returning
            # None; passing that on to cv2.resize would raise.
            print("skipping unreadable image", image_id)
            continue

        img = cv2.resize(img,(224,224))
        img = (img/255.).astype('float32')
        x_train.append(torch.tensor(img).permute((2,0,1)))
        y_train.append(caption_list)

    return x_train,y_train


In [None]:

    

# Load and preprocess the images of both splits (slow: reads every image from Drive).
x_train, y_train=  pre_process_and_load(training_data_images_path,training_data_images_captions)  
# x_train is a list of image tensors
# y_train is a list of caption lists (list of list of str)
x_val  ,y_val=   pre_process_and_load(val_data_images_path,val_data_images_captions)  



# Stack the per-image tensors into one batch-first tensor per split.
x_train_tensor= torch.stack(x_train)
y_train=y_train  # no-op self-assignment
# x_train_tensor is a single tensor holding all training images
x_val_tensor= torch.stack(x_val)
y_val=y_val  # no-op self-assignment

###################################################
# train_ds = list(zip(x_train_tensor,y_train))
# val_ds = list(zip(x_val_tensor,y_val))
###################################################

# Pair each image with its encoded captions by position.
# NOTE(review): pre_process_and_load skips entries whose image file is missing, while the caption
# tensors cover every entry of the caption dicts; if anything was skipped, zip() pairs images with
# the wrong captions and silently truncates — confirm that the lengths match.
train_ds = list(zip(x_train_tensor,train_caption_in_no_tensor))
val_ds = list(zip(x_val_tensor,val_caption_in_no_tensor))




In [None]:
#################### shifting my training and val data to GPU  ###########################
#check whether cuda is available
def to_device(data, device):
    """Move ``data`` to ``device``, recursing into lists and tuples.

    Containers (list or tuple) come back as plain lists; anything else is
    moved with ``.to(device, non_blocking=True)``.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)

    moved = []
    for item in data:
        moved.append(to_device(item, device))
    return moved
    
class DeviceDataLoader:
    """Wrap a data loader so each batch is moved to ``device`` as it is yielded."""

    def __init__(self, dl, device):
        self.dl, self.device = dl, device

    def __len__(self):
        # Number of batches of the wrapped loader.
        return len(self.dl)

    def __iter__(self):
        # Lazily transfer one batch at a time.
        yield from (to_device(batch, self.device) for batch in self.dl)

In [None]:
from torch.utils.data import DataLoader
# Mini-batch size shared by the training and validation loaders.
batch_size = 32
# Only the training loader shuffles its samples.
train_dataset = DataLoader(train_ds, batch_size,shuffle=True)
val_dataset = DataLoader(val_ds,batch_size)

# Re-bind the names to wrappers that move every batch to the GPU as it is yielded.
# Despite the *_dataset names these are loaders; this cell requires a CUDA device.
train_dataset = DeviceDataLoader(train_dataset,torch.device('cuda'))  
val_dataset = DeviceDataLoader(val_dataset,torch.device('cuda'))

In [None]:
def save_model(model,file_name):
    """Checkpoint hook; currently a no-op because the ``torch.save`` call below is disabled."""
    return None
#   torch.save(model.state_dict(),'/content/drive/My Drive/Deep Learning/A4/models/'+file_name+'_cp.pth')
    
def load_model(file_name):
    """Load and return the checkpoint stored as ``<file_name>_cp.pth`` in the models folder."""
    models_dir = '/content/drive/My Drive/Deep Learning/A4/models/'
    checkpoint_path = models_dir + file_name + '_cp.pth'
    return torch.load(checkpoint_path)

In [None]:
from torchvision.models import vgg19


class Encoder(nn.Module):
    """Image encoder built from the convolutional part of a pretrained VGG19.

    ``network`` is stored on the instance but not otherwise consulted; the
    backbone is always VGG19 with the last child of ``features`` removed.
    ``dim`` records the channel width (512) of the produced features.
    """

    def __init__(self, network='vgg19'):
        super().__init__()
        self.network = network
        backbone = vgg19(pretrained=True)
        feature_layers = list(backbone.features.children())
        # Drop the final layer of the feature extractor.
        self.net = nn.Sequential(*feature_layers[:-1])
        self.dim = 512

    def forward(self, x):
        """Map images (batch, C, H, W) to features (batch, H'*W', channels)."""
        feature_map = self.net(x)
        batch, channels = feature_map.size(0), feature_map.size(1)
        # Channel-last, then flatten the spatial grid into one axis.
        return feature_map.permute(0, 2, 3, 1).view(batch, -1, channels)


# test_img=x_train_tensor[0].reshape(1,3,227,227)
# model1= Encoder()
# img_features=model1(test_img)
# print(img_features.shape)
# # print(model)


In [None]:
class Attention(nn.Module):
    """Additive (tanh) soft attention over encoder feature locations.

    Args:
        encoder_dim: channel width of the encoder features.
        decoder_dim: width of the decoder hidden state (default 512).
        attention_dim: width of the shared projection space (default 512).

    The defaults reproduce the previously hard-coded 512-wide layers, so
    ``Attention(encoder_dim)`` builds exactly the same module as before.
    """

    def __init__(self, encoder_dim, decoder_dim=512, attention_dim=512):
        super(Attention, self).__init__()
        self.U = nn.Linear(decoder_dim, attention_dim)   # projects the decoder state
        self.W = nn.Linear(encoder_dim, attention_dim)   # projects the encoder features
        self.tanh = nn.Tanh()

        self.v = nn.Linear(attention_dim, 1)             # scores each location
        self.softmax = nn.Softmax(1)                     # normalise over locations

    def forward(self, img_features, hidden_state):
        """Compute the attended context vector.

        Args:
            img_features: (batch, num_locations, encoder_dim) encoder output.
            hidden_state: (batch, decoder_dim) current decoder hidden state.

        Returns:
            (context, alpha): ``context`` is (batch, encoder_dim), the
            alpha-weighted sum of the features; ``alpha`` is
            (batch, num_locations) and sums to 1 along dim 1.
        """
        # unsqueeze(1) lets the decoder projection broadcast over locations.
        U_h = self.U(hidden_state).unsqueeze(1)
        W_s = self.W(img_features)

        att = self.tanh(W_s + U_h)
        e = self.v(att).squeeze(2)      # one unnormalised score per location
        alpha = self.softmax(e)

        context = (img_features * alpha.unsqueeze(2)).sum(1)
        return context, alpha


# model2=Attention(512)
# ans0,ans1= model2(img_features,img_features[0:1,195,:])
# print("context :",ans0.shape)
# print("alpha :",ans1.shape)

In [None]:
class Decoder(nn.Module):
    """Attention-based LSTM caption decoder.

    Args:
        vocabulary_size: number of entries in the vocabulary.
        encoder_dim: channel width of the encoder features.

    The hidden state and word embeddings are fixed at width 512.
    """

    def __init__(self, vocabulary_size, encoder_dim):
        super(Decoder, self).__init__()

        self.vocabulary_size = vocabulary_size
        self.encoder_dim = encoder_dim

        # Projections of the mean image feature to the initial LSTM state.
        self.init_h = nn.Linear(encoder_dim, 512)
        self.init_c = nn.Linear(encoder_dim, 512)
        self.tanh = nn.Tanh()

        # Gate applied to the attention context before it enters the LSTM.
        self.f_beta = nn.Linear(512, encoder_dim)
        self.sigmoid = nn.Sigmoid()

        self.deep_output = nn.Linear(512, vocabulary_size)
        self.dropout = nn.Dropout(p=0.2)

        self.attention = Attention(encoder_dim)
        self.embedding = nn.Embedding(vocabulary_size, 512)
        self.lstm = nn.LSTMCell(512 + encoder_dim, 512)
        # Not used by forward (raw logits are returned); the explicit dim
        # avoids the deprecated implicit-dimension form.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, img_features, captions,max_len, flag=True):
        """Unroll the decoder for ``max_len`` steps.

        Args:
            img_features: (batch, num_locations, encoder_dim) encoder output.
            captions: (batch, captions_per_image, seq_len) token ids; only
                caption index 0 is read, and only when ``flag`` is true.
            max_len: number of decoding steps.
            flag: True -> teacher forcing (inputs come from ``captions``);
                False -> greedy decoding (inputs are the argmax of the
                previous step's prediction).

        Returns:
            (preds, alphas): ``preds`` is (batch, max_len, vocabulary_size)
            of unnormalised scores; ``alphas`` is
            (batch, max_len, num_locations) attention weights.

        All work tensors are created on ``img_features.device`` rather than
        a hard-coded CUDA device, so the decoder also runs on CPU.
        """
        device = img_features.device
        batch_size = img_features.size(0)

        h, c = self.get_init_lstm_state(img_features)
        max_timespan = max_len

        preds = torch.zeros(batch_size, max_timespan, self.vocabulary_size, device=device)
        alphas = torch.zeros(batch_size, max_timespan, img_features.size(1), device=device)

        # Input of step 0: the first token of the caption when teacher
        # forcing, otherwise id 0 (which is '<start>' in the vocabulary
        # produced by generate_vocabulary).
        if flag:
            prev_words = captions[:, 0, 0].long().to(device)
        else:
            prev_words = torch.zeros(batch_size, dtype=torch.long, device=device)

        for t in range(max_timespan):
            context, alpha = self.attention(img_features, h)
            gate = self.sigmoid(self.f_beta(h))
            gated_context = gate * context

            if t != 0:
                if flag:
                    # Step t is fed caption token t-1 (so steps 0 and 1 both
                    # receive token 0) and its output is scored against token t.
                    prev_words = captions[:, 0, t - 1].long().to(device)
                else:
                    prev_words = torch.argmax(preds[:, t - 1, :], dim=1)

            embedding = self.embedding(prev_words)          # (batch, 512)
            lstm_input = torch.cat((embedding, gated_context), dim=1)

            h, c = self.lstm(lstm_input, (h, c))
            output = self.deep_output(self.dropout(h))

            preds[:, t] = output
            alphas[:, t] = alpha

        return preds, alphas

    def get_init_lstm_state(self, img_features):
        """Derive the initial (h, c) from the location-averaged image features."""
        avg_features = img_features.mean(dim=1)

        c = self.tanh(self.init_c(avg_features))
        h = self.tanh(self.init_h(avg_features))

        return h, c

# model3 = Decoder(len(vocab_dict),512)
# # print(y_train[0])
# preds, alphas = model3(img_features,y_train[0])

In [None]:
# from torch.nn.utils.rnn import pack_padded_sequence
"""
def calc_loss(pred_for_29_timestamps,targets_for_29_timestamps,max_len):  #(10,29)   (10,29)

    cross_entropy_loss=nn.CrossEntropyLoss().cuda()
    # print("pred_for_29_timestamps shape", pred_for_29_timestamps.shape)
    # print("targets_for_29_timestamps shape", targets_for_29_timestamps.shape)
    inner_loss=0
    for i in range(max_len):
        inner_loss+=cross_entropy_loss(pred_for_29_timestamps[:,i,:].float(),targets_for_29_timestamps[:,i].long())
        
        # break
    # print(loss)
    return inner_loss/max_len    
"""

def validation_step( batch,encoder,decoder,val_max_len,vocab_dict):
    """Score one validation batch with greedy decoding.

    Args:
        batch: (imgs, captions) pair; ``captions`` is indexed as
            (batch, captions_per_image, seq_len) token ids.
        encoder, decoder: the captioning models.
        val_max_len: number of decoding steps / padded caption length.
        vocab_dict: token -> id mapping used to turn ids back into words.

    Returns:
        (metrics, preds): ``metrics`` is a dict with ``val_loss`` (detached
        tensor), ``bleu_score_val1``..``bleu_score_val4`` and
        ``meteor_score_val``; ``preds`` is the raw decoder output.

    The batch is moved to the decoder's device instead of a hard-coded CUDA
    device, so evaluation follows wherever the model lives.
    """
    device = next(decoder.parameters()).device
    cross_entropy_loss = nn.CrossEntropyLoss()

    imgs, captions = batch
    imgs = imgs.to(device)
    captions = captions.to(device)
    img_features = encoder(imgs)
    preds, _ = decoder(img_features, captions, val_max_len, flag=False)

    # Text metrics compare the prediction with all reference captions.
    preds_sentences = generate_sentence_from_preds(preds, vocab_dict, val_max_len)
    captions_sentences_list = generate_sentence_from_targets(captions, vocab_dict, val_max_len)
    bleu_score_val1, bleu_score_val2, bleu_score_val3, bleu_score_val4 = bleu_score(
        captions_sentences_list, preds_sentences)
    meteor_score_val = meteor_score_calc(captions_sentences_list, preds_sentences)

    # The loss is computed against the first reference caption only.
    # Every length passed to pack_padded_sequence equals seq_len - 1, so
    # packing just drops the last time step from both tensors.
    # NOTE(review): <pad> positions therefore still contribute to the loss.
    targets = captions[:, 0, :]
    targets = pack_padded_sequence(targets, [len(tar) - 1 for tar in targets], batch_first=True)[0]
    packed_preds = pack_padded_sequence(preds, [len(pred) - 1 for pred in preds], batch_first=True)[0]

    val_loss = cross_entropy_loss(packed_preds.float(), targets.long())

    return ({'val_loss': val_loss.detach(), 'bleu_score_val1': bleu_score_val1,'bleu_score_val2': bleu_score_val2,'bleu_score_val3': bleu_score_val3,\
            'bleu_score_val4': bleu_score_val4,'meteor_score_val':meteor_score_val},preds)

def validation_epoch_end( outputs):
    """Average the per-batch validation metrics into one epoch summary.

    ``outputs`` is a list of per-batch dicts; ``val_loss`` entries are
    tensors (stacked and averaged, returned as a Python float) and the BLEU
    and METEOR entries are plain numbers averaged arithmetically.
    """
    epoch_loss = torch.stack([step['val_loss'] for step in outputs]).mean()

    def _mean_of(key):
        values = [step[key] for step in outputs]
        return sum(values) / len(values)

    summary = {'val_loss': epoch_loss.item()}
    for key in ('bleu_score_val1', 'bleu_score_val2', 'bleu_score_val3',
                'bleu_score_val4', 'meteor_score_val'):
        summary[key] = _mean_of(key)
    return summary

@torch.no_grad()     ## evaluation never needs a computation graph
def evaluate(val_loader,encoder,decoder,val_max_len,vocab_dict, flag="val"):
    """Run the models over ``val_loader`` and aggregate the metrics.

    Args:
        val_loader: iterable of (imgs, captions) batches.
        encoder, decoder: the captioning models (switched to eval mode).
        val_max_len: number of decoding steps / padded caption length.
        vocab_dict: token -> id mapping.
        flag: ``"val"`` returns only the metric summary; ``"test"`` also
            returns the per-batch raw prediction tensors.

    Returns:
        The dict from ``validation_epoch_end`` for ``"val"``, or
        ``(summary, predictions)`` for ``"test"``.

    Raises:
        ValueError: if ``flag`` is neither ``"val"`` nor ``"test"``
            (previously the whole pass ran and ``None`` was returned).
    """
    if flag not in ("val", "test"):
        raise ValueError('flag must be "val" or "test", got {!r}'.format(flag))

    encoder.eval()
    decoder.eval()

    outputs = []
    predicted_sentences = []
    for batch in val_loader:
        metrics, preds = validation_step(batch, encoder, decoder, val_max_len, vocab_dict)
        outputs.append(metrics)
        if flag == "test":
            # Only keep the (potentially large) prediction tensors when the
            # caller actually asked for them.
            predicted_sentences.append(preds)

    if flag == "test":
        return validation_epoch_end(outputs), predicted_sentences
    return validation_epoch_end(outputs)


def epoch_end(epoch, result):
    """Print a one-line summary of the epoch's training and validation metrics."""
    template = (
        "Epoch [{}], train_loss: {:.4f}, val_loss: {:.4f},"
        "val_blue_score1: {:.4f},val_blue_score2: {:.4f},"
        "val_blue_score3: {:.4f},val_blue_score4: {:.4f},"
        "  val_meteor_score: {:.4f}"
    )
    metric_keys = (
        'train_loss', 'val_loss',
        'bleu_score_val1', 'bleu_score_val2', 'bleu_score_val3', 'bleu_score_val4',
        'meteor_score_val',
    )
    print(template.format(epoch, *(result[key] for key in metric_keys)))


In [None]:
def generate_sentence_from_preds(preds,vocab_dict,max_len):
    """Turn decoder scores into one space-joined sentence per sample.

    Args:
        preds: (batch, steps, vocabulary_size) scores.
        vocab_dict: token -> id mapping.
        max_len: number of leading steps to decode (must not exceed steps).

    Returns:
        list of strings; every word, including the first, is preceded by a
        single space (e.g. ``' <start> a dog'``).
    """
    # Invert the vocabulary once instead of rebuilding two lists per token.
    # setdefault keeps the first word for an id, matching list.index().
    id_to_word = {}
    for word, word_id in vocab_dict.items():
        id_to_word.setdefault(word_id, word)

    # One argmax + one host transfer instead of a .item() call per token.
    best_ids = torch.argmax(preds, dim=2).tolist()

    sentence_list = []
    for sample_ids in best_ids:
        sentence_list.append(
            ''.join(' ' + id_to_word[sample_ids[j]] for j in range(max_len)))
    return sentence_list

def generate_sentence_from_targets(targets,vocab_dict,max_len):
    """Turn reference id tensors back into space-joined sentences.

    Args:
        targets: (batch, captions_per_image, seq_len) token ids (float or
            integer dtype).
        vocab_dict: token -> id mapping.
        max_len: number of leading positions to decode (must not exceed
            seq_len).

    Returns:
        list (per sample) of lists (per caption) of strings; every word,
        including the first, is preceded by a single space. All captions
        present in ``targets`` are decoded (previously exactly five were
        required).
    """
    # Invert the vocabulary once instead of rebuilding two lists per token.
    # setdefault keeps the first word for an id, matching list.index().
    id_to_word = {}
    for word, word_id in vocab_dict.items():
        id_to_word.setdefault(word_id, word)

    sentences_list = []
    for sample_captions in targets.tolist():
        sentence_list = []
        for caption_ids in sample_captions:
            # int() maps float-encoded ids (e.g. 3.0) onto the integer keys.
            sentence_list.append(
                ''.join(' ' + id_to_word[int(caption_ids[j])] for j in range(max_len)))
        sentences_list.append(sentence_list)
    return sentences_list


#requires editing
def bleu_score( captions_sentences_list ,preds_sentences):
    """Average individual 1- to 4-gram BLEU scores over a batch.

    Args:
        captions_sentences_list: per sample, a list of reference sentences.
        preds_sentences: per sample, the predicted sentence.

    Returns:
        ``[bleu1, bleu2, bleu3, bleu4]`` where entry n uses a weight of 1 on
        n-grams only (individual, not cumulative, scores), smoothed with
        NLTK's ``method4``. ``[0.0, 0.0, 0.0, 0.0]`` for an empty batch.

    Sentences are split on whitespace with ``str.split()`` so the leading
    space produced by the sentence generators no longer adds a spurious
    empty-string token to references and candidate. ``<start>`` and
    ``<pad>`` tokens are dropped.
    NOTE(review): ``<end>`` and any words predicted after it are still
    scored — confirm whether candidates should be cut at ``<end>``.
    """
    num_samples = len(captions_sentences_list)
    if num_samples == 0:
        # Avoid dividing by zero on an empty batch.
        return [0.0, 0.0, 0.0, 0.0]

    ignored_tokens = ('<start>', '<pad>')
    ngram_weights = ((1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1))
    smoothing = SmoothingFunction().method4  # built once, reused for every call

    def _tokens(sentence):
        return [word for word in sentence.split() if word not in ignored_tokens]

    totals = [0.0, 0.0, 0.0, 0.0]
    for i, reference_sentences in enumerate(captions_sentences_list):
        references = [_tokens(sentence) for sentence in reference_sentences]
        candidates = _tokens(preds_sentences[i])

        for n, weights in enumerate(ngram_weights):
            totals[n] += sentence_bleu(references, candidates, weights=weights,
                                       smoothing_function=smoothing)

    return [total / num_samples for total in totals]


def pre_process_sentences(sentence):
    """Strip the ``<start>``, ``<pad>`` and ``<end>`` markers from a sentence.

    For each marker, in that order, occurrences followed by a space are
    removed first and occurrences preceded by a space second. Other text,
    including any leading space, is left untouched.
    """
    cleaned = sentence
    for marker in ("<start>", "<pad>", "<end>"):
        cleaned = cleaned.replace(marker + " ", "").replace(" " + marker, "")
    return cleaned


def meteor_score_calc(captions_sentences_list ,preds_sentences):
    """Average METEOR score of the predictions against their references.

    Args:
        captions_sentences_list: per sample, a list of reference sentences.
        preds_sentences: per sample, the predicted sentence.

    Returns:
        Mean METEOR score over the batch, or ``0.0`` for an empty batch.

    Special tokens are removed with ``pre_process_sentences`` and the raw
    strings are handed to ``meteor_score``. All references supplied for a
    sample are used (previously exactly five were required).
    NOTE(review): passing plain strings matches the nltk==3.4.5 pin in this
    notebook; later NLTK releases appear to expect pre-tokenized input —
    confirm before upgrading.
    """
    num_samples = len(captions_sentences_list)
    if num_samples == 0:
        # Avoid dividing by zero on an empty batch.
        return 0.0

    total = 0
    for i, reference_sentences in enumerate(captions_sentences_list):
        references = [pre_process_sentences(sentence) for sentence in reference_sentences]
        candidate = pre_process_sentences(preds_sentences[i])
        total += meteor_score(references, candidate)

    return total / num_samples

In [None]:
from torch.autograd import Variable

In [None]:
def train(epochs, encoder, decoder, data_loader,val_data_loader, word_dict,alpha_c,max_len,val_max_len,l_rate,filename,history_filename):
    """Train the decoder with teacher forcing and validate after every epoch.

    Only the decoder's parameters are optimised (Adam); the encoder is kept
    in eval mode and used as a fixed feature extractor.

    Args:
        epochs: number of passes over ``data_loader``.
        encoder, decoder: the captioning models.
        data_loader: iterable of (imgs, captions) training batches.
        val_data_loader: iterable of validation batches.
        word_dict: token -> id mapping, forwarded to ``evaluate``.
        alpha_c: weight of the attention regulariser
            ``((1 - sum_t alpha_t) ** 2).mean()``.
        max_len, val_max_len: padded caption lengths / decoding steps for
            training and validation.
        l_rate: Adam learning rate.
        filename, history_filename: currently unused (checkpoint and history
            saving are disabled).

    Returns:
        (history, encoder, decoder, best_epoch): ``history`` holds one metric
        dict per epoch (validation metrics plus ``train_loss``) and
        ``best_epoch`` is the index of the epoch with the lowest validation
        loss (0 if no epoch ran).
    """
    # Follow the decoder's device instead of hard-coding CUDA.
    device = next(decoder.parameters()).device
    cross_entropy_loss = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(decoder.parameters(),lr=l_rate)

    history=[]
    min_val_loss = 1e8
    best_epoch=0
    for epoch in range(epochs):
        encoder.eval()
        decoder.train()

        train_losses=[]
        for imgs, captions in data_loader:
            imgs = imgs.to(device)
            captions = captions.to(device)

            # The encoder is not optimised, so building its autograd graph
            # would only cost time and memory.
            with torch.no_grad():
                img_features = encoder(imgs)

            optimizer.zero_grad()
            preds, alphas = decoder(img_features, captions ,max_len , flag=True)

            # Train against the first reference caption only. Every length
            # handed to pack_padded_sequence equals seq_len - 1, so packing
            # merely drops the last time step from both tensors.
            # NOTE(review): <pad> positions therefore still contribute to the loss.
            targets = captions[:, 0, :]
            targets = pack_padded_sequence(targets, [len(tar) - 1 for tar in targets], batch_first=True)[0]
            packed_preds = pack_padded_sequence(preds, [len(pred) - 1 for pred in preds], batch_first=True)[0]

            att_regularization = alpha_c * ((1 - alphas.sum(1))**2).mean()
            loss_train = cross_entropy_loss(packed_preds.float(), targets.long()) + att_regularization

            loss_train.backward()
            optimizer.step()

            # Detach so the per-batch graphs can be freed instead of being
            # kept alive until the end of the epoch.
            train_losses.append(loss_train.detach())

        # Use the loader and vocabulary passed in, not notebook globals.
        result = evaluate(val_data_loader,encoder,decoder,val_max_len,word_dict)

        if result['val_loss']< min_val_loss:
            min_val_loss = result['val_loss']
            best_epoch=epoch

        if train_losses:
            result['train_loss'] = torch.stack(train_losses).mean().item()
        else:
            # No training batches: report NaN rather than failing in torch.stack.
            result['train_loss'] = float('nan')

        epoch_end(epoch , result)  # print the epoch summary
        history.append(result)

    return history,encoder,decoder,best_epoch 

In [None]:
from torch.autograd import Variable
# Build both models on the GPU; the decoder's output size equals the vocabulary size and its
# encoder feature width is 512.
encoder = Encoder().cuda()
print("length of vocab",len(vocab_dict))
# attention = Attention(512).cuda()
decoder = Decoder(len(vocab_dict),512).cuda()
# 20 epochs, attention-regularisation weight 0.3, learning rate 1e-4.
history,encoder,decoder,best_epoch = train(20,encoder,decoder,train_dataset,val_dataset,vocab_dict,0.3,max_length,max_length_val,l_rate=0.0001,filename='run1',history_filename='run1')
# history1,encoder,decoder = train(30,encoder,decoder,train_dataset,val_dataset,vocab_dict,0.1,max_length,max_length_val,l_rate=0.0001,filename='run2')
# history2,encoder,decoder = train(30,encoder,decoder,train_dataset,val_dataset,vocab_dict,0.1,max_length,max_length_val,l_rate=0.0001,filename='run3')


In [None]:
# torch.save(encoder.state_dict(),'/content/drive/My Drive/Deep Learning/A4/models/'+'20th_epoch_encoder'+'_cp.pth')
# torch.save(decoder.state_dict(),'/content/drive/My Drive/Deep Learning/A4/models/'+'20th_epoch_decoder'+'_cp.pth')

In [None]:
'''
file_path_enc = '/content/drive/MyDrive/Deep Learning/A4/models/run1_encoder_cp.pth'
file_path_dec = '/content/drive/MyDrive/Deep Learning/A4/models/run1_decoder_cp.pth'

vocab_dict = joblib.load('/content/drive/MyDrive/Deep Learning/A4/models/vocab_dict')

encoder = Encoder().cuda()
checkpoint_enc = torch.load(file_path_enc)
encoder.load_state_dict(checkpoint_enc)

decoder = Decoder(len(vocab_dict),512).cuda()
checkpoint_dec = torch.load(file_path_dec)
decoder.load_state_dict(checkpoint_dec)

history1,encoder,decoder = train(5,encoder,decoder,train_dataset,val_dataset,vocab_dict,0.3,max_length,max_length_val,l_rate=0.001,filename='run2',history_filename='run2')
'''

In [None]:
"""
file_path_enc = '/content/drive/MyDrive/Deep Learning/A4/models/run1_encoder_cp.pth'
file_path_dec = '/content/drive/MyDrive/Deep Learning/A4/models/run1_decoder_cp.pth'

vocab_dict = joblib.load('/content/drive/MyDrive/Deep Learning/A4/models/vocab_dict')

enc = Encoder().cuda()
checkpoint_enc = torch.load(file_path_enc)
enc.load_state_dict(checkpoint_enc)

dec = Decoder(len(vocab_dict),512).cuda()
checkpoint_dec = torch.load(file_path_dec)
dec.load_state_dict(checkpoint_dec)

history2,encoder,decoder = train(100,enc,dec,train_dataset,val_dataset,vocab_dict,0.0001,max_length,max_length_val,l_rate=0.0001,filename='run2')
"""

In [None]:
def helper_plot(history):
  """Turn the per-epoch history records into one list per metric.

  Args:
    history: indexable collection (positions 0..len-1) of dict-like records,
      each holding the keys 'val_loss', 'train_loss', 'bleu_score_val1' ..
      'bleu_score_val4' and 'meteor_score_val'.

  Returns:
    A 7-tuple of lists, in this order: validation loss, training loss,
    the four validation BLEU series (val1..val4), validation METEOR.
  """
  def _column(key):
    # Positional indexing (not iteration) mirrors how the records are stored.
    return [history[epoch][key] for epoch in range(len(history))]

  return (
      _column('val_loss'),
      _column('train_loss'),
      _column('bleu_score_val1'),
      _column('bleu_score_val2'),
      _column('bleu_score_val3'),
      _column('bleu_score_val4'),
      _column('meteor_score_val'),
  )

def loss_plot(file,train_loss,val_loss):
    """Plot validation and training loss against the epoch number.

    The epoch axis is derived from the length of each series (epochs are
    numbered from 1), so histories of any length can be plotted; it used to be
    fixed at 20 epochs.

    Args:
        file: target file name for the figure. Currently unused because the
            savefig call below is commented out.
        train_loss: sequence of per-epoch training losses.
        val_loss: sequence of per-epoch validation losses.
    """
    val_epochs = np.arange(1, len(val_loss) + 1)
    train_epochs = np.arange(1, len(train_loss) + 1)
    plt.plot(val_epochs, val_loss, color="red", label="validation loss")
    plt.plot(train_epochs, train_loss, color="blue", label="train loss")
    plt.xlabel("epoch")
    plt.ylabel("loss")
    plt.title('loss plot')
    # plt.savefig('/content/drive/MyDrive/Deep Learning/A4/plots/'+file)
    plt.legend()


def bleu_plot(file,val_bleu_score_1,val_bleu_score_2,val_bleu_score_3,val_bleu_score_4):
    """Plot the four validation BLEU series against the epoch number.

    The epoch axis is derived from the length of each series (epochs are
    numbered from 1) instead of being fixed at 20 epochs.

    Args:
        file: target file name for the figure. Currently unused because the
            savefig call below is commented out.
        val_bleu_score_1 .. val_bleu_score_4: sequences of per-epoch
            validation BLEU scores, one sequence per plotted curve.
    """
    curves = (
        (val_bleu_score_1, "red", "val_bleu_score_1"),
        (val_bleu_score_2, "blue", "val_bleu_score_2"),
        (val_bleu_score_3, "green", "val_bleu_score_3"),
        (val_bleu_score_4, "black", "val_bleu_score_4"),
    )
    for scores, colour, name in curves:
        epochs = np.arange(1, len(scores) + 1)
        plt.plot(epochs, scores, color=colour, label=name)
    plt.xlabel("epoch")
    plt.ylabel("blue scores")
    plt.title('bleu plot')
    # plt.savefig('/content/drive/MyDrive/Deep Learning/A4/plots/'+file)
    plt.legend()


def meteor_plot(file,val_meteor_score):
    """Plot the validation METEOR score against the epoch number.

    The epoch axis is derived from the length of the series (epochs are
    numbered from 1) instead of being fixed at 20 epochs.

    Args:
        file: target file name for the figure. Currently unused because the
            savefig call below is commented out.
        val_meteor_score: sequence of per-epoch validation METEOR scores.
    """
    x_axis = np.arange(1, len(val_meteor_score) + 1)
    plt.plot(x_axis, val_meteor_score, color="red", label="val_meteor_score")
    plt.xlabel("epoch")
    plt.ylabel("val_meteor_score")
    plt.title('meteor plot')
    # plt.savefig('/content/drive/MyDrive/Deep Learning/A4/plots/'+file)
    plt.legend()

# Reload the saved 'run1' history from Drive (this replaces any in-memory
# `history`), split it into per-metric series and draw the loss curves.
history = joblib.load('/content/drive/MyDrive/Deep Learning/A4/models/run1')
(
    val_loss,
    train_loss,
    val_bleu_score_1,
    val_bleu_score_2,
    val_bleu_score_3,
    val_bleu_score_4,
    val_meteor_score,
) = helper_plot(history)
loss_plot('plot.jpg', train_loss, val_loss)

In [None]:
# Draw the four validation BLEU series produced by helper_plot in the cell above.
bleu_plot('bleu.jpg',val_bleu_score_1,val_bleu_score_2,val_bleu_score_3,val_bleu_score_4)

In [None]:
# Draw the validation METEOR series produced by helper_plot in the cell above.
meteor_plot('meteor.jpg',val_meteor_score) 

Show attention weights for each word in 5 images.

In [None]:
import math

import skimage
import skimage.transform

# Restore the vocabulary and the '20th_epoch' encoder/decoder weights from Drive
# into fresh CUDA models named enc/dec (used by the visualisation cells below).
vocab_dict = joblib.load('/content/drive/MyDrive/Deep Learning/A4/models/vocab_dict')

# Encoder
file_path_enc = '/content/drive/MyDrive/Deep Learning/A4/models/20th_epoch_encoder_cp.pth'
enc = Encoder().cuda()
checkpoint_enc = torch.load(file_path_enc)
enc.load_state_dict(checkpoint_enc)

# Decoder: first argument is the vocabulary size, second is 512 as in the other
# Decoder(...) constructions in this notebook.
file_path_dec = '/content/drive/MyDrive/Deep Learning/A4/models/20th_epoch_decoder_cp.pth'
dec = Decoder(len(vocab_dict), 512).cuda()
checkpoint_dec = torch.load(file_path_dec)
dec.load_state_dict(checkpoint_dec)

In [None]:
# enc = encoder
# dec = decoder

# Visualise, for up to five images of the first training batch, the attention
# map the decoder produced for each predicted word (one subplot per word).

enc.eval()
dec.eval()

for batch_idx, (imgs, captions) in enumerate(train_dataset):
  imgs = imgs.cuda()
  captions = captions.cuda()

  # Inference only: no autograd graph is needed for the forward pass.
  with torch.no_grad():
    img_features = enc(imgs)
    preds, alphas = dec(img_features, captions, max_length)

  # One 14x14 attention map per image and decoding step.
  alphas = alphas.reshape(alphas.shape[0], alphas.shape[1], 14, 14)

  # Never index past the end of a batch that holds fewer than five images.
  for images in range(min(5, imgs.shape[0])):
    preds_sentence = generate_sentence_from_preds(preds[images:images+1], vocab_dict, max_length)
    print("predicted caption", preds_sentence)
    captions_sentences_list = generate_sentence_from_targets(captions[images:images+1], vocab_dict, max_length)
    print("original captions", captions_sentences_list)

    preds_sentence_words = preds_sentence[0].split(' ')

    # Convert the image tensor (C,H,W) to an HxWx3 uint8 array once per image.
    # NOTE(review): the *255 scaling assumes pixel values in [0, 1] -- confirm
    # against the preprocessing used to build the dataset.
    corresponding_img = imgs[images].permute(1, 2, 0).detach().cpu().numpy()
    corresponding_img = (corresponding_img * 255).astype('uint8')
    corresponding_img = cv2.resize(corresponding_img, (224, 224))

    num_steps = captions[images].shape[1]
    columns = math.ceil(math.sqrt(num_steps))
    rows = columns
    fig = plt.figure(figsize=(30, 30))

    # Word i of the prediction is paired with attention step i-1. Stop at
    # whichever runs out first: caption length, attention steps or words.
    last_step = min(num_steps, alphas.shape[1], len(preds_sentence_words) - 1)
    for i in range(1, last_step + 1):
      if preds_sentence_words[i] == '<end>':
        # End of the predicted caption: show the plain image and stop.
        cv2_imshow(corresponding_img)
        break

      # Keep the weights as floats. Casting alpha*255 to uint8 before
      # upsampling truncated the (small) weights to a handful of levels.
      alpha = alphas[images][i - 1].detach().cpu().numpy()
      alpha_img = skimage.transform.pyramid_expand(alpha, upscale=16, sigma=2)

      # Min-max normalise to 0..255; a constant map would divide by zero.
      low, high = alpha_img.min(), alpha_img.max()
      if high > low:
        alpha_img = (alpha_img - low) / (high - low)
      else:
        alpha_img = np.zeros_like(alpha_img)
      alpha_img = (alpha_img * 255).astype('uint8')
      alpha_img = np.dstack((alpha_img, alpha_img, alpha_img))

      Attention_images = cv2.addWeighted(corresponding_img, 0.4, alpha_img, 0.8, 0)

      ax = fig.add_subplot(rows, columns, i)
      ax.set_title(preds_sentence_words[i])
      ax.imshow(Attention_images)
      ax.axis('off')
    # fig.savefig('/content/drive/MyDrive/Deep Learning/A4/visualise_attention_weights/attention_plot'+str(images)+'.png')
    plt.show()

  # Only the first batch is visualised.
  break

Evaluation on Test Set 

In [None]:
from torch.utils.data import DataLoader
def test(model_dir='/content/drive/MyDrive/Deep Learning/A4/models',
         data_dir='/content/drive/MyDrive/Deep Learning/A4/Data/Test',
         run_name='run1',
         batch_size=32):
  """Evaluate a saved encoder/decoder pair on the test set and print the metrics.

  Called with no arguments it uses the same files as before (the 'run1'
  checkpoints and the Test data folder on Drive).

  Args:
    model_dir: folder holding '<run_name>_encoder_cp.pth',
      '<run_name>_decoder_cp.pth' and the pickled 'vocab_dict'.
    data_dir: folder holding the 'Images' directory and 'test_captions.pkl'.
    run_name: checkpoint prefix to load.
    batch_size: batch size of the test DataLoader.

  Returns:
    (test_dataset, test_max_len): the device-wrapped test loader and the
    caption length reported by pre_process_captions.
  """
  file_path_enc = os.path.join(model_dir, run_name + '_encoder_cp.pth')
  file_path_dec = os.path.join(model_dir, run_name + '_decoder_cp.pth')

  vocab_dict = joblib.load(os.path.join(model_dir, 'vocab_dict'))

  enc = Encoder().cuda()
  enc.load_state_dict(torch.load(file_path_enc))
  dec = Decoder(len(vocab_dict), 512).cuda()
  dec.load_state_dict(torch.load(file_path_dec))

  # Freshly constructed modules start in training mode; switch to inference
  # mode explicitly (harmless if evaluate() does the same).
  enc.eval()
  dec.eval()

  test_data_images_path = os.path.join(data_dir, 'Images')
  test_data_images_captions = joblib.load(os.path.join(data_dir, 'test_captions.pkl'))
  # test_data_images_captions = {k: test_data_images_captions[k] for k in list(test_data_images_captions)[:100]}

  test_caption_in_no_tensor, test_max_len = pre_process_captions(test_data_images_captions, vocab_dict)
  # The second return value of pre_process_and_load is not needed here.
  x_test, _ = pre_process_and_load(test_data_images_path, test_data_images_captions)
  x_test_tensor = torch.stack(x_test)
  test_ds = list(zip(x_test_tensor, test_caption_in_no_tensor))
  test_dataset = DataLoader(test_ds, batch_size)
  test_dataset = DeviceDataLoader(test_dataset, torch.device('cuda'))

  result, predicted_sentences = evaluate(test_dataset, enc, dec, test_max_len, vocab_dict, flag="test")

  # evaluate() reports its metrics under 'val_*' keys; they are test-set
  # numbers here, so label them as such in the printed summary.
  print(
      "test_loss: {:.4f}, test_bleu_score1: {:.4f}, test_bleu_score2: {:.4f}, "
      "test_bleu_score3: {:.4f}, test_bleu_score4: {:.4f}, test_meteor_score: {:.4f}".format(
          result['val_loss'],
          result['bleu_score_val1'],
          result['bleu_score_val2'],
          result['bleu_score_val3'],
          result['bleu_score_val4'],
          result['meteor_score_val'],
      )
  )

  # Show the decoded predictions of the first batch, if there is one.
  if len(predicted_sentences) > 0:
    sentence_list = generate_sentence_from_preds(predicted_sentences[0], vocab_dict, test_max_len)
    print(sentence_list)
  return test_dataset, test_max_len


test_dataset,test_max_len=test()

In [None]:


# Visualise, for up to twenty images of the first test batch, the attention map
# the decoder produced for each predicted word (one subplot per word).
# NOTE(review): enc/dec here are the notebook-level models (loaded earlier from
# the '20th_epoch' checkpoints), not the 'run1' models that test() builds
# locally -- confirm this is intended.

enc.eval()
dec.eval()

for batch_idx, (imgs, captions) in enumerate(test_dataset):
  imgs = imgs.cuda()
  captions = captions.cuda()

  # Inference only: no autograd graph is needed for the forward pass.
  with torch.no_grad():
    img_features = enc(imgs)
    print(img_features.shape)
    print(captions.shape)
    # Report the length actually passed to the decoder below.
    print(test_max_len)
    preds, alphas = dec(img_features, captions, test_max_len)

  # One 14x14 attention map per image and decoding step.
  alphas = alphas.reshape(alphas.shape[0], alphas.shape[1], 14, 14)

  # Never index past the end of a batch that holds fewer than twenty images.
  for images in range(min(20, imgs.shape[0])):
    preds_sentence = generate_sentence_from_preds(preds[images:images+1], vocab_dict, test_max_len)
    print("predicted caption", preds_sentence)
    captions_sentences_list = generate_sentence_from_targets(captions[images:images+1], vocab_dict, test_max_len)
    print("original captions", captions_sentences_list)

    preds_sentence_words = preds_sentence[0].split(' ')

    # Convert the image tensor (C,H,W) to an HxWx3 uint8 array once per image.
    # NOTE(review): the *255 scaling assumes pixel values in [0, 1] -- confirm
    # against the preprocessing used to build the dataset.
    corresponding_img = imgs[images].permute(1, 2, 0).detach().cpu().numpy()
    corresponding_img = (corresponding_img * 255).astype('uint8')
    corresponding_img = cv2.resize(corresponding_img, (224, 224))

    num_steps = captions[images].shape[1]
    columns = math.ceil(math.sqrt(num_steps))
    rows = columns
    fig = plt.figure(figsize=(30, 30))

    # Word i of the prediction is paired with attention step i-1. Stop at
    # whichever runs out first: caption length, attention steps or words.
    last_step = min(num_steps, alphas.shape[1], len(preds_sentence_words) - 1)
    for i in range(1, last_step + 1):
      if preds_sentence_words[i] == '<end>':
        # End of the predicted caption: show the plain image and stop.
        cv2_imshow(corresponding_img)
        break

      # Keep the weights as floats. Casting alpha*255 to uint8 before
      # upsampling truncated the (small) weights to a handful of levels.
      alpha = alphas[images][i - 1].detach().cpu().numpy()
      alpha_img = skimage.transform.pyramid_expand(alpha, upscale=16, sigma=2)

      # Min-max normalise to 0..255; a constant map would divide by zero.
      low, high = alpha_img.min(), alpha_img.max()
      if high > low:
        alpha_img = (alpha_img - low) / (high - low)
      else:
        alpha_img = np.zeros_like(alpha_img)
      alpha_img = (alpha_img * 255).astype('uint8')
      alpha_img = np.dstack((alpha_img, alpha_img, alpha_img))

      Attention_images = cv2.addWeighted(corresponding_img, 0.4, alpha_img, 0.8, 0)

      ax = fig.add_subplot(rows, columns, i)
      ax.set_title(preds_sentence_words[i])
      ax.imshow(Attention_images)
      ax.axis('off')
    # fig.savefig('/content/drive/MyDrive/Deep Learning/A4/visualise_attention_weights/attention_plot'+str(images)+'.png')
    plt.show()

  # Only the first batch is visualised.
  break