In [1]:
import os

In [2]:
import cv2
import random
import numpy as np
import random

In [3]:
SEED=1234
np.random.seed(SEED)

In [4]:
import torch
from torch.utils import data
import torch.nn as nn

In [5]:
from dataloader import Dataset
from vocabulary import Vocabulary

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
dataset_folder="/floyd/input/bangla_image_caption/"

In [7]:
!pip install bnltk

[31mscikit-umfpack 0.3.2 has requirement numpy>=1.15.3, but you'll have numpy 1.15.1 which is incompatible.[0m
[31mmenpo 0.9.2 has requirement matplotlib>=3.0, but you'll have matplotlib 2.2.3 which is incompatible.[0m
[33mYou are using pip version 10.0.1, however version 20.2b1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [8]:
import json

In [9]:
dataset_folder="/floyd/input/bangla_image_caption/"

In [10]:
caption_json_path=dataset_folder+"captions.json"

In [11]:
filenames_with_captions=json.load(open(caption_json_path))

In [12]:
from bnltk.tokenize import Tokenizers
t = Tokenizers()
tokenizer = t.bn_word_tokenizer

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
train,test=train_test_split(filenames_with_captions,test_size=0.1)
train,valid=train_test_split(filenames_with_captions,test_size=0.1)

In [15]:
image_names=[]
all_captions=[]
train_with_captions=[]
val_with_captions=[]
test_with_captions=[]
image_folder= dataset_folder+"images/"
for filename_caption in train:
    image_name = filename_caption["filename"]
    captions=filename_caption["caption"]
    for caption in captions:
        train_with_captions.append((image_name,caption))
        all_captions.append(caption)
for filename_caption in valid:
    image_name = filename_caption["filename"]
    captions=filename_caption["caption"]
    for caption in captions:
        val_with_captions.append((image_name,caption))
        all_captions.append(caption)
for filename_caption in test:
    image_name = filename_caption["filename"]
    captions=filename_caption["caption"]
    for caption in captions:
        test_with_captions.append((image_name,caption))
        all_captions.append(caption)

In [16]:
vocab=Vocabulary(vocab_threshold=6,captions=all_captions,tokenizer=t.bn_word_tokenizer)

[0/20140] Tokenizing captions...
[1000/20140] Tokenizing captions...
[2000/20140] Tokenizing captions...
[3000/20140] Tokenizing captions...
[4000/20140] Tokenizing captions...
[5000/20140] Tokenizing captions...
[6000/20140] Tokenizing captions...
[7000/20140] Tokenizing captions...
[8000/20140] Tokenizing captions...
[9000/20140] Tokenizing captions...
[10000/20140] Tokenizing captions...
[11000/20140] Tokenizing captions...
[12000/20140] Tokenizing captions...
[13000/20140] Tokenizing captions...
[14000/20140] Tokenizing captions...
[15000/20140] Tokenizing captions...
[16000/20140] Tokenizing captions...
[17000/20140] Tokenizing captions...
[18000/20140] Tokenizing captions...
[19000/20140] Tokenizing captions...
[20000/20140] Tokenizing captions...


In [17]:
from torchvision import transforms

# Define a transform to pre-process the training images.
transform_train = transforms.Compose([ 
    transforms.Resize(224),                           # get 224x224 crop from random location
    transforms.RandomHorizontalFlip(),               # horizontally flip image with probability=0.5
    transforms.ToTensor(),                           # convert the PIL Image to a tensor
])
transform_test = transforms.Compose([ 
    transforms.Resize(224),                          # smaller edge of image resized to 256
    transforms.ToTensor(),                           # convert the PIL Image to a tensor
])

In [18]:
dataset= Dataset(image_folder,train_with_captions,transform_train,vocab,tokenizer=t.bn_word_tokenizer)
valid_dataset= Dataset(image_folder,val_with_captions,transform_test,vocab,tokenizer=t.bn_word_tokenizer)
test_dataset= Dataset(image_folder,test_with_captions,transform_test,vocab,tokenizer=t.bn_word_tokenizer)

In [19]:
from model import EncoderCNN,EncoderVGG,EncoderVGGAtt
from model import DecoderRNN,DecoderRNNAttention,Attention

In [20]:
batch_size = 128          # batch size
vocab_threshold = 6        # minimum word count threshold
vocab_from_file = True    # if True, load existing vocab file
embed_size = 512           # dimensionality of image and word embeddings
hidden_size = 512          # number of features in hidden state of the RNN decoder
num_epochs = 20             # number of training epochs (1 for testing)
save_every = 1             # determines frequency of saving model weights
print_every = 200          # determines window for printing average loss
log_file = 'training_log_attention.txt'       # name of file with saved training loss and perplexity
val_log_file = 'validation_log_attention.txt'
vocab_size=len(vocab)

In [21]:
vocab_size

1448

In [None]:
encoder = EncoderVGGAtt()
decoder = DecoderRNNAttention(embed_size, hidden_size, vocab_size,batch_size,dropout=0.2)

# Move models to GPU if CUDA is available. 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
decoder.to(device)

# Define the loss function. 
criterion = nn.CrossEntropyLoss().cuda() if torch.cuda.is_available() else nn.CrossEntropyLoss()

In [None]:
params = list(decoder.parameters())# + list(encoder.embed.parameters()) 
optimizer = torch.optim.Adam(params, lr=0.001,weight_decay=0.001)

In [None]:
validation_loss_min = np.inf

In [None]:
train_losses=[]
validation_losses=[]

In [None]:
f = open(log_file, 'w')
validation_f=open(val_log_file,'w')
i_step=0
print_every=100
# torch.autograd.set_detect_anomaly(True)
for epoch in range(1, num_epochs+1):
    try:
        for images,captions in dataset.load_data(batch_size):
            images=torch.cat(images)


            # Move batch of images and captions to GPU if CUDA is available.
            images_gpu = images.to(device)
            captions_gpu = [caption.to(device) for caption in captions]
            target_captions = [caption[1:] for caption in captions_gpu]
            captions_padded=nn.utils.rnn.pad_sequence(target_captions,batch_first=True)
            # Zero the gradients.
            decoder.zero_grad()
            encoder.eval()

            # Pass the inputs through the CNN-RNN model.
            features = encoder(images_gpu)
            outputs,alphas = decoder(features, captions_gpu)

            # Calculate the batch loss.
    #         print("outputs.shape: ", outputs.shape)
            loss = criterion(outputs.contiguous().view(-1, vocab_size), captions_padded.view(-1))
            # Backward pass.
            loss.backward()

            # Update the parameters in the optimizer.
            optimizer.step()

            # Get training statistics.
            stats = 'Epoch [%d/%d], Step [%d], Loss: %.4f, Perplexity: %5.4f' % (epoch, num_epochs, i_step, loss.item(), np.exp(loss.item()))

            # Print training statistics (on same line).
            print('\r' + stats, end="")


            # Print training statistics to file.
            f.write(stats + '\n')
            f.flush()

            # Print training statistics (on different line).
            if i_step % print_every == 0:
                print('\r' + stats)
            i_step=i_step+1
        train_losses.append(loss.item())
    except RuntimeError as e:
        print(e)
        break
    validation_loss=0
    validation_iter=1
    for images,captions in valid_dataset.load_data(batch_size):
        images=torch.cat(images)
        

        # Move batch of images and captions to GPU if CUDA is available.
        images = images.to(device)
        captions = [caption.to(device) for caption in captions]
        target_captions=[caption[1:] for caption in captions]
        captions_padded=nn.utils.rnn.pad_sequence(target_captions,batch_first=True)
        # Zero the gradients.
        decoder.zero_grad()
        encoder.zero_grad()
        
        # Pass the inputs through the CNN-RNN model.
        features = encoder(images)
        outputs,alphas = decoder(features, captions)

        
        # Calculate the batch loss.
#         print("outputs.shape: ", outputs.shape)
        loss = criterion(outputs.contiguous().view(-1, vocab_size), captions_padded.view(-1))
        validation_loss=validation_loss+loss.item()
        validation_iter=validation_iter+1
        # Get training statistics.
    validation_loss = validation_loss/validation_iter
    validation_losses.append(validation_loss)
    stats = 'Epoch [%d/%d], Step [%d], Validation Loss: %.4f, Perplexity: %5.4f\n' % (epoch, num_epochs, validation_iter, validation_loss, np.exp(validation_loss))
    if validation_loss<validation_loss_min:
        print("\nmodel improved!")
        torch.save(decoder.state_dict(), os.path.join('bengali_models_attention', 'decoder.pkl'))
        torch.save(encoder.state_dict(), os.path.join('bengali_models_attention', 'encoder.pkl'))
        validation_loss_min=validation_loss
    else:
        print("\nnot improved yet!")
    # Print training statistics (on same line).
    print('\n' + stats, end="")


    # Print training statistics to file.
    validation_f.write(stats + '\n')
    validation_f.flush()

    # Print training statistics (on different line).



# Close the training log file.
f.close()
validation_f.close()

In [None]:
encoder.load_state_dict(torch.load("bengali_models_attention/encoder.pkl",map_location="cpu"))
decoder.load_state_dict(torch.load("bengali_models_attention/decoder.pkl",map_location="cpu"))

In [None]:
import matplotlib.pyplot as plt

In [None]:
validation_image=validation_image.to(device)

In [None]:
validation_image=validation_image.view(1,3,224,224)

In [None]:
actual_outputs=validation_caption.cpu().numpy().tolist()

In [None]:
encoder.eval()

In [None]:

from torch import nn


In [None]:
len(valid_dataset)

In [None]:
import matplotlib.font_manager as fm
prop = fm.FontProperties(fname='Kalpurush.ttf')

In [None]:
encoder.eval()
decoder.eval()
count=0
for validation_image,validation_caption in valid_dataset:
    features=encoder(validation_image.unsqueeze(0))
    start =0
    start_tensor=torch.tensor(start)
    start_tensor.to("cpu")
    word_embedding= decoder.word_embeddings(start_tensor)
    word_embedding.size()
    batch_size=features.shape[0]
    word_alphas=[]
    bangla_words=[]

    decoder.hidden,decoder.cell=decoder.init_hidden(batch_size)

    while True:
        attention_feature,alpha = decoder.attention(features,decoder.hidden)

        word_alphas.append(alpha)

        gate= decoder.sigmoid(decoder.f_beta(decoder.hidden))

        attention_feature=attention_feature*gate

        embedding= torch.cat((attention_feature,word_embedding.unsqueeze(0)),dim=1)

        hidden=decoder.hidden
        cell=decoder.cell
        hidden,cell=decoder.decode_step(embedding,(hidden,cell))
        decoder.hidden,decoder.cell=hidden,cell

        pred_word= decoder.linear(decoder.hidden)

        pred_word = nn.functional.softmax(pred_word)

        word_indice=torch.max(pred_word,dim=1)[1]
        bangla_words.append(vocab.idx2word[word_indice.item()])
        word_embedding = decoder.word_embeddings(word_indice[0])
        if word_indice==1:
            break
    plt.figure()
    plt.imshow(validation_image.cpu().numpy().transpose((1,2,0)))
    plt.title(" ".join(bangla_words), fontproperties=prop)
    count=count+1
    if count>99:
        break

In [None]:

""" Define the feattention_feature behavior of the model """
input_captions=[[self.word_embeddings(word) for word in single_caption[:-1]] for single_caption in captions]
pred_captions=[]
caption_alphas=[]

#         input_captions=nn.utils.rnn.pad_sequence(input_captions,batch_first=True)

#         input_captions_lengths=[len(input_caption)+1 for input_caption in input_captions]
#         embeddings = self.word_embeddings(input_captions) # embeddings new shape : (batch_size, captions length - 1, embed_size)

batch_size = features.shape[0] # features is of shape (batch_size, embed_size)
for i,input_caption in enumerate(input_captions):
    self.hidden,self.cell=self.init_hidden(1)
    feature=features[i]
    #print("feature size")
    #print(feature.size())

    #print(self.hidden.size())
    #print(self.cell.size())
    pred_words=[]
    word_alphas=[]
    for input_word in input_caption:
        attention_feature,alpha= self.attention(feature.view(1,feature.size(0),feature.size(1)),self.hidden)
        word_alphas.append(alpha)
        gate=self.sigmoid(self.f_beta(self.hidden))
        #print("attention feature")
        #print(attention_feature.size())
        #print("gate ")
        #print(gate.size())
        attention_feature=attention_feature*gate
        #print(attention_feature.size())
        #print(input_word.size())
        embedding= torch.cat((attention_feature,input_word.unsqueeze(0)),dim=1)
        hidden=self.hidden
        cell=self.cell
        hidden,cell=self.decode_step(embedding,(hidden,cell))
        #print("hidden and cell")
        #print(hidden.size())
        #print(cell.size())
        self.hidden,self.cell =hidden,cell
        pred_word=self.linear(self.dropout(self.hidden))
        pred_words.append(pred_word)
    pred_captions.append(torch.cat(pred_words))
    caption_alphas.append(word_alphas)
pred_captions_padded=nn.utils.rnn.pad_sequence(pred_captions,batch_first=True)



In [None]:
def sample(features):
    " accepts pre-processed image tensor (features) and returns predicted sentence (list of tensor ids of length max_len) "


    output = []
    batch_size = features.shape[0] # batch_size is 1 at inference, features shape : (1, 1, embed_size)
    hidden = decoder.init_hidden(batch_size) # Get initial hidden state of the LSTM

    while True:
        lstm_out, hidden = decoder.lstm(features, hidden) # lstm_out shape : (1, 1, hidden_size)
        outputs = decoder.linear(lstm_out)  # outputs shape : (1, 1, vocab_size)
        outputs= nn.functional.softmax(outputs,dim=2)
        outputs = outputs.squeeze(1) # outputs shape : (1, vocab_size)
        _, max_indice = torch.max(outputs, dim=1) # predict the most likely next word, max_indice shape : (1)

        output.append(max_indice.cpu().numpy()[0].item()) # storing the word predicted

        if (max_indice == 0):
            # We predicted the <end> word, so there is no further prediction to do
            break

        ## Prepare to embed the last predicted word to be the new input of the lstm
        features = decoder.word_embeddings(max_indice) # features shape : (1, embed_size)
        features = features.unsqueeze(1) # features shape : (1, 1, embed_size)

    return output

In [None]:
outputs=sample(features.view(1,1,4096))

In [None]:
for output in outputs:
    print(vocab.idx2word[output])

In [None]:
for output in actual_outputs:
    print(vocab.idx2word[output])