In [2]:
import os
import numpy as np
import json
import torch
import torch.nn as nn
from torch.nn import Parameter
from scipy.misc import imread, imresize
from torch.utils.data import Dataset
import matplotlib
import matplotlib.pyplot as plt
from PIL import Image
import torchvision
import torchvision.transforms as transforms
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
#         resnet = torchvision.models.resnet101(pretrained=True)
        vgg = torchvision.models.vgg16(pretrained=True)
        all_modules = list(vgg.children())
        modules = all_modules[:-2]
        self.vgg = nn.Sequential(*modules)
        self.avgpool = nn.AvgPool2d(8)
        self.fine_tune()
    
    def forward(self, images):
        """
        Forward function
        input: - images : with shape (batch_size, 3, 224, 224) channel first
        """
        # get the images features
        encoded_image = self.vgg(images) # (batch_size, 512, 8, 8)
        
        batch_size = encoded_image.shape[0]
        features = encoded_image.shape[1]
        num_pixels = encoded_image.shape[2] * encoded_image.shape[3]
        # get the global feature by using average pooling and rsshape it to batch_size, 512 (14)
        global_features = self.avgpool(encoded_image).view(batch_size, -1) # (batch_size, 512)
        # get the encoded image by resize the image feature
        enc_image = encoded_image.permute(0, 2, 3, 1) # (batch_size, 7, ,7, 512)
        enc_image = enc_image.view(batch_size, num_pixels, features) # (batch_size, num_pixels, 512)
        return enc_image, global_features
    
    def fine_tune(self, status=False):
        if not status:
            for param in self.vgg.parameters():
                param.requires_grad = False
        else:
            for module in list(self.vgg.children())[5:]: # last layer only, len total layer is 8
                for param in module.parameters():
                    param.requires_grad = True

In [49]:
class AdaptiveLSTMCell(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(AdaptiveLSTMCell, self).__init__()
        # create LSTM cell
        self.lstm_cell = nn.LSTMCell(input_size, hidden_size)
        # create input gate
        self.x_gate = nn.Linear(input_size, hidden_size)
        # crate hidden gate
        self.h_gate = nn.Linear(hidden_size, hidden_size)
        
    def forward(self, x, states):
        """
        Forward function for LSTM
        input: - x : word token combined with encoded_image
               - states : the old hidden cell and memory cell
        """
        h_old, m_old = states
        # do LSTM, and get new hidden and output
        ht, mt = self.lstm_cell(x, (h_old, m_old))
        # do sigmoid to the input and hidden to get visual sentinel St (9)
        gt = F.sigmoid(self.x_gate(x) + self.h_gate(h_old))
        # and then do tanh to get visual sentinel (10)
        st = gt * F.tanh(mt)
        return ht, mt, st

In [50]:
class AdaptiveAttention(nn.Module):
    def __init__(self, hidden_size, attention_dimension):
        super(AdaptiveAttention, self).__init__()
        self.sentinel_affine= nn.Linear(hidden_size, hidden_size)
        self.sentinel_attention = nn.Linear(hidden_size, attention_dimension)
        self.hidden_affine = nn.Linear(hidden_size, hidden_size)
        self.hidden_attention = nn.Linear(hidden_size, attention_dimension)
        self.visual_attention = nn.Linear(hidden_size, attention_dimension)
        self.alphas = nn.Linear(attention_dimension, 1)
        self.context_hidden = nn.Linear(hidden_size, hidden_size)
   
    def forward(self, spatial_image, decoder_output, st):
        """
        Forward function for Adaptive Attention
        input: - spatial_image : the spatial image with shape (batch_size, num_pixels, hidden_size)
               - decoder_output : the decoder hidden state with shape (batch_size, hidden_size)
               - st : Visual sentinel returned by sentinel class with shape (batch_size, hidden_size)
        """
        # extract num_pixels
        num_pixels = spatial_image.shape[1]
        # get the visual attention using spatial_image as input
        visual_attn = self.visual_attention(spatial_image) # (batch_size, num_pixels, att_dim)
        # get sentinel affine using st as input with ReLU activation
        sentinel_affine = F.relu(self.sentinel_affine(st)) # (batch_size, hidden_size)
        # get sentinel attention using sentinel_affine as input
        sentinel_attn = self.sentinel_attention(sentinel_affine) # (batch_size, att_dim)
        
        hidden_affine = F.tanh(self.hidden_affine(decoder_output)) # (batch_sizem hidden_size)
        hidden_attn = self.hidden_attention(hidden_affine) # (batch_size, attention_dimension)
        
        hidden_resized = hidden_attn.unsqueeze(1).expand(hidden_attn.size(0), num_pixels + 1, hidden_attn.size(1))
        
        concat_features = torch.cat([spatial_image, sentinel_affine.unsqueeze(1)], dim=1) # (batch_size, num_pixels+1, hidden_size)
        attended_features = torch.cat([visual_attn, sentinel_attn.unsqueeze(1)], dim=1) # (batch_size, num_pixels, attn_dim)
        
        # do tanh to attended and hidden (6)
        attention = F.tanh(attended_features + hidden_resized) # (batch_size, num_pixles+1, attn_dim)
        # do a forward linear layer
        alpha = self.alphas(attention).squeeze(2) # (batch_size, num_pixels+1)
        # and do softmax
        att_weights = F.softmax(alpha, dim=1) # (batch_size, num_pixels+1)
        
        context = (concat_features * att_weights.unsqueeze(2)).sum(dim=1) # (batch_size, hidden_size)
        # get the new beta value by getting the last value of att_weights
        beta_value = att_weights[:, -1].unsqueeze(1) #(batch_size, 1)
        
        out_l = F.tanh(self.context_hidden(context + hidden_affine))
        
        return out_l, att_weights, beta_value
        
        
        

In [51]:
class Decoder(nn.Module):
    def __init__(self, hidden_size, vocab_size, attention_dimension, embed_size, encoded_dimension):
        super(Decoder, self).__init__()
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.encoded_to_hidden = nn.Linear(encoded_dimension, hidden_size)
        self.global_features = nn.Linear(encoded_dimension, embed_size)
        # input of the LSTMCell should be of shape (batch_size, input_size)
        # because the input and global features are concenated, then input_features should be embed_size*2
        self.LSTM = AdaptiveLSTMCell(embed_size*2, hidden_size)
        self.adaptive_attention = AdaptiveAttention(hidden_size, attention_dimension)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.vocab_size = vocab_size
        self.dropout = nn.Dropout(0.5)
        self.init_weights()
        
    def init_weights(self):
        self.fc.weight.data.uniform_(-0.1, 0.1)
        self.fc.bias.data.fill_(0)
        self.embedding.weight.data.uniform_(-0.1, 0.1)
    
    def init_hidden_state(self, encoded_image):
        h = torch.zeros(encoded_image.shape[0], 512).to(device)
        c = torch.zeros(encoded_image.shape[0], 512).to(device)
        return h, c

    def forward(self, encoded_image, global_features, encoded_captions, caption_lengths):
        """
        Forward function for decoder
        input: - encoded_image : the encoded images from encoder with shape (batch_size, num_pixels, 2048)
               - global_features : the global features from encoder with shape (batch_size, 2048)
               - encoded_captions : encoded captions with shape (batch_size, max_caption_length)
               - caption_lengths : encoded caption length with dimension (batch_size, 1)
        """
        # extract the batch size and num_pixels
        batch_size = encoded_image.shape[0]
        num_pixels = encoded_image.shape[1]
        # get the spatial image
        spatial_image = F.relu(self.encoded_to_hidden(encoded_image)) # (batch_size, num_pixels, hidden_size)
        global_image = F.relu(self.global_features(global_features)) # (batch_size, embed_size)
        # sort input data by decreasing length
        # caption_length will contains the sorted length, and sort_idx will contains the sorted elements indices
        caption_lengths, sort_idx = caption_lengths.squeeze(1).sort(dim=0, descending=True)
        # sort spatial_image, global_image, encoded_captions and encoded_image batches by caption length
        spatial_image = spatial_image[sort_idx]
        global_image = global_image[sort_idx]
        encoded_captions = encoded_captions[sort_idx]
        encoded_image = encoded_image[sort_idx]
        
        # Embedding, each batch contains a caption. All batch have the same number of rows (words), since we previously
        # padded the ones shorter than max_caption_lengths, as well as the same number of columns (embed_dimension)
        embeddings = self.embedding(encoded_captions) # (batch_size, max_caption_length, embed_dimesion)
        
        # initialize LSTM
        h, c = self.init_hidden_state(encoded_image) # (batch_size, hidden_size)
        
        # we won't decode at the <end> position, since we've finished generating as soon as we generate <end>
        decode_lengths =(caption_lengths - 1).tolist()
        
        # create tensors to store word prediction score, alphas and betas
        predictions = torch.zeros(batch_size, max(decode_lengths), self.vocab_size).to(device)
        alphas = torch.zeros(batch_size, max(decode_lengths), num_pixels+1).to(device)
        betas = torch.zeros(batch_size, max(decode_lengths), 1).to(device)
        
        # concenate the embeddings and global image feature for LSTM input
        global_image = global_image.unsqueeze(1).expand_as(embeddings)
        inputs = torch.cat((embeddings, global_image), dim=2) # (batch_size, max_caption_length, embed_dimension * 2)
        
        # start decoding
        for timestep in range(max(decode_lengths)):
            # create a packet sequence to process the only effective batch size N_t at that timestep
            batch_size_t = sum([l > timestep for l in decode_lengths])
            current_input = inputs[:batch_size_t, timestep, :] # (batch_size_t, embed_dimension * 2)
            # do LSTM
            h, c, st = self.LSTM(current_input, (h[:batch_size_t], c[:batch_size_t])) # (batch_size, hidden_size)
            # run the adaptive attention
            out_l, alpha_t, beta_t = self.adaptive_attention(spatial_image[:batch_size_t], h, st)
            # compute the probability over the vocabulary with fullt connected layer
            pred = self.fc(self.dropout(out_l))
            # store the prediction, alphas and betas value
            predictions[:batch_size_t, timestep, :] = pred
            alphas[:batch_size_t, timestep, :] = alpha_t
            betas[:batch_size_t, timestep, :] = beta_t
        return predictions, alphas, betas, encoded_captions, decode_lengths, sort_idx

In [52]:
%load_ext autoreload
%autoreload
from torch.nn.utils.rnn import pack_padded_sequence
from cococaptioncider.pycocotools.coco import COCO
from cococaptioncider.pycocoevalcap.eval import COCOEvalCap
from util import *
from dataset import *
from tqdm import tqdm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [53]:
# Model Parameter
embed_dim = 512 # dimension of word embeddings
attention_dim = 512 # attention hidden size
hidden_size = 512 # dimension of decoder LSTM
cudnn.benchmark = True # set to true only if inputs to model are fixed size; otherwise lot of computational overhead

# Training parameter
start_epoch = 0
epochs = 20
epochs_since_improvement = 0
batch_size = 100
workers = 1
encoder_lr = 1e-4
decoder_lr = 5e-4
grad_clip = 0.1
best_cider = 0
print_freq = 100
fine_tune_encoder = False
checkpoint = None
annFile = 'cococaptioncider/annotations/new_indo_caption_val.json'

In [54]:
def train(train_loader, encoder, decoder, criterion, encoder_optimizer, decoder_optimizer, epoch, vocab_size):
    decoder.train()
    encoder.train()
    losses = AverageMeter()
    top5accs = AverageMeter()
    
    for i, (imgs, caps, caplens) in enumerate(train_loader):
        # move to GPU
        imgs = imgs.to(device)
        caps = caps.to(device)
        caplens = caplens.to(device)
        
        # Feed Forward
        encoded_image, global_features = encoder(imgs)
        predictions, alphas, betas, encoded_captions, decode_lengths, sort_idx = decoder(encoded_image, global_features, caps, caplens)
        
        # Since we decoded starting caption with <start> token, the targets are all words after <start> up to <end>
        targets = encoded_captions[:, 1:]
        # Remove timesteps that we didn't decode at, or are pads
        # pack_padded_sequence is an easy trick to do this
        scores, _ = pack_padded_sequence(predictions, decode_lengths, batch_first=True)
        targets, _ = pack_padded_sequence(targets, decode_lengths, batch_first=True)
        
        # Calculate loss
        loss = criterion(scores, targets)
        
        # Back prop
        decoder_optimizer.zero_grad()
        if encoder_optimizer is not None:
            encoder_optimizer.zero_grad()
        
        loss.backward()
        
        # Update weights
        decoder_optimizer.step()
        if encoder_optimizer is not None:
            encoder_optimizer.step()
        
        # Keep track if metrics
        top5 = accuracy(scores, targets, 5)
        losses.update(loss.item(), sum(decode_lengths))
        top5accs.update(top5, sum(decode_lengths))
        
        # Print status every print_freq
        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})\t'.format(epoch, i, len(train_loader),
                                                                            loss=losses,
                                                                            top5=top5accs))
        
        

In [55]:
def validate(val_loader, encoder, decoder, beam_size, epoch, vocab_size):
    """
    Funtion to validate over the complete dataset
    """
    encoder.eval()
    decoder.eval()
    results = []

    for i, (img, image_id) in enumerate(tqdm(val_loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size))):

        k = beam_size
        infinite_pred = False
        
        # Encode
        image = img.to(device)       # (1, 3, 224, 224)
        enc_image, global_features = encoder(image) # enc_image of shape (1,num_pixels,features)
        # Flatten encoding
        num_pixels = enc_image.size(1)
        encoder_dim = enc_image.size(2)
        # We'll treat the problem as having a batch size of k
        enc_image = enc_image.expand(k, num_pixels, encoder_dim)  # (k, num_pixels, encoder_dim)
        # Tensor to store top k previous words at each step; now they're just <start>
        k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(device)  # (k, 1)
        # Tensor to store top k sequences; now they're just <start>
        seqs = k_prev_words  # (k, 1)
        # Tensor to store top k sequences' scores; now they're just 0
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)
        # Lists to store completed sequences, their alphas and scores
        complete_seqs = list()
        complete_seqs_scores = list()
        # Start decoding
        step = 1
        h, c = decoder.init_hidden_state(enc_image)
        spatial_image = F.relu(decoder.encoded_to_hidden(enc_image))  # (k,num_pixels,hidden_size)
        global_image = F.relu(decoder.global_features(global_features))      # (1,embed_dim)
        # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
        while True:
            embeddings = decoder.embedding(k_prev_words).squeeze(1)  # (k,embed_dim)
            inputs = torch.cat((embeddings, global_image.expand_as(embeddings)), dim = 1)    
            h, c, st = decoder.LSTM(inputs , (h, c))  # (batch_size_t, hidden_size)
            # Run the adaptive attention model
            out_l, _, _ = decoder.adaptive_attention(spatial_image, h, st)
            # Compute the probability over the vocabulary
            scores = decoder.fc(out_l)      # (batch_size, vocab_size)
            scores = F.log_softmax(scores, dim=1)   # (s, vocab_size)
            # (k,1) will be (k,vocab_size), then (k,vocab_size) + (s,vocab_size) --> (s, vocab_size)
            scores = top_k_scores.expand_as(scores) + scores  
            # For the first step, all k points will have the same scores (since same k previous words, h, c)
            if step == 1:
                #Remember: torch.topk returns the top k scores in the first argument, and their respective indices in the second argument
                top_k_scores, top_k_words = scores[0].topk(k, 0, True, True)  # (s)
            else:
                # Unroll and find top scores, and their unrolled indices
                top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True)  # (s)

            # Convert unrolled indices to actual indices of scores
            prev_word_inds = top_k_words / vocab_size  # (s) 
            next_word_inds = top_k_words % vocab_size  # (s) 
            # Add new words to sequences, alphas
            seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)
            # Which sequences are incomplete (didn't reach <end>)?
            incomplete_inds = [ind for ind, next_word in enumerate(next_word_inds) if next_word != word_map['<end>']]
            complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))

            # Set aside complete sequences
            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds])
            k -= len(complete_inds)  # reduce beam length accordingly

            if k == 0:
                break

            # Proceed with incomplete sequences
            seqs = seqs[incomplete_inds]              
            h = h[prev_word_inds[incomplete_inds]]
            c = c[prev_word_inds[incomplete_inds]]
            spatial_image = spatial_image[prev_word_inds[incomplete_inds]]
            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            # Break if things have been going on too long
            if step > 50:
                infinite_pred = True
                break

            step += 1
            
        if infinite_pred is not True:
            i = complete_seqs_scores.index(max(complete_seqs_scores))
            seq = complete_seqs[i]
        else:
            seq = seqs[0][:20]
            seq = [seq[i].item() for i in range(len(seq))]
                
        # Construct Sentence
        sen_idx = [w for w in seq if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}]
        sentence = ' '.join([rev_word_map[sen_idx[i]] for i in range(len(sen_idx))])
        item_dict = {"image_id": image_id.item(), "caption": sentence}
        results.append(item_dict)
    
    print("Calculating Evalaution Metric Scores......\n")
    
    resFile = 'cococaptioncider/results/captions_val2014_results_' + str(epoch) + '.json' 
    evalFile = 'cococaptioncider/results/captions_val2014_eval_' + str(epoch) + '.json' 
    # Calculate Evaluation Scores
    with open(resFile, 'w') as wr:
        json.dump(results,wr)
        
    coco = COCO(annFile)
    cocoRes = coco.loadRes(resFile)
    # create cocoEval object by taking coco and cocoRes
    cocoEval = COCOEvalCap(coco, cocoRes)
    # evaluate on a subset of images
    # please remove this line when evaluating the full validation set
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    # evaluate results
    cocoEval.evaluate()    
    # Save Scores for all images in resFile
    with open(evalFile, 'w') as w:
        json.dump(cocoEval.eval, w)

    return cocoEval.eval['CIDEr'], cocoEval.eval['Bleu_4']
            

In [56]:
with open('caption data/WORDMAP.json', 'r') as j:
    word_map = json.load(j)
rev_word_map = {v: k for k, v in word_map.items()}  # idx2word

In [57]:
if checkpoint is None:
    decoder = Decoder(hidden_size, 
                      vocab_size=len(word_map), 
                      attention_dimension = attention_dim, 
                      embed_size = embed_dim,
                      encoded_dimension = 512) 
    
    encoder = Encoder()
    decoder_optimizer = torch.optim.Adam(params=decoder.parameters(),lr=decoder_lr, betas = (0.8,0.999))
    encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),
                                         lr=encoder_lr, betas = (0.9,0.999)) if fine_tune_encoder else None
else:
    checkpoint = torch.load(checkpoint)
    start_epoch = checkpoint['epoch'] + 1
    epochs_since_improvement = checkpoint['epochs_since_improvement']
    best_cider = checkpoint['cider']
    decoder = checkpoint['decoder']
    decoder_optimizer = checkpoint['decoder_optimizer']
    encoder = checkpoint['encoder']
    encoder_optimizer = checkpoint['encoder_optimizer']
    if fine_tune_encoder is True and encoder_optimizer is None:
        encoder.fine_tune(fine_tune_encoder)
        encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),lr=encoder_lr)
        print("Finetuning the CNN")

In [58]:
# Move to GPU, if available
decoder = decoder.to(device)
encoder = encoder.to(device)

In [59]:
# Loss function
criterion = nn.CrossEntropyLoss().to(device)

In [60]:
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])

In [61]:
#dataset_path = "C:/Users/share/Documents/Machine Learning/Datasets/karpathy/output/TRAIN_IMAGES_coco_5_cap_per_img_100_min_word_freq.hdf5"
#caption = "C:/Users/share/Documents/Machine Learning/Datasets/karpathy/output/TRAIN_CAPTIONS_coco_5_cap_per_img_100_min_word_freq.json"
#caplens = "C:/Users/share/Documents/Machine Learning/Datasets/karpathy/output/TRAIN_CAPLENS_coco_5_cap_per_img_100_min_word_freq.json"

train_loader = torch.utils.data.DataLoader(COCOTrainDataset(transform=transforms.Compose([normalize])),
                                           batch_size = batch_size, 
                                           shuffle=True, 
                                           pin_memory=True)

#validation_path = "C:/Users/share/Documents/Machine Learning/Datasets/karpathy/output/TRAIN_IMAGES_coco_5_cap_per_img_100_min_word_freq.hdf5"

val_loader = torch.utils.data.DataLoader(COCOValidationDataset(transform=transforms.Compose([normalize])),
                                         batch_size = 1,
                                         shuffle=True, 
                                         pin_memory=True)

In [69]:
for epoch in range(start_epoch, 10):
    if epochs_since_improvement == 8:
        print('No Improvement for the last 8 epochs. Training terminated')
        break
    
    # Decay
    if epoch % 3 == 0 and epoch != 0:
        adjust_learning_rate(decoder_optimizer, 0.8)
    
    # One epoch training
    train(train_loader=train_loader,
         encoder=encoder,
         decoder=decoder,
         criterion=criterion,
         encoder_optimizer=encoder_optimizer,
         decoder_optimizer=decoder_optimizer,
         epoch=epoch,
         vocab_size=len(word_map))
    
    
    recent_cider, recent_blue4 = validate(val_loader,
                                         encoder,
                                         decoder,
                                         3,
                                         epoch,
                                         len(word_map))
    print("Epoch {}:\tCIDEr Score: {}".format(epoch, recent_cider))
    
    # Check if there was an improvement
    #recent_cider = best_cider + 0.01
    is_best = recent_cider > best_cider
    best_cider = max(recent_cider, best_cider)
    if not is_best:
        epochs_since_improvement += 1
        print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement,))
    else:
        epochs_since_improvement = 0
        
    print('Predict: ')
    predict_output("test1.jpg", rev_word_map)
    

    save_checkpoint(epoch, epochs_since_improvement, encoder, decoder, encoder_optimizer, decoder_optimizer, recent_cider, is_best)
    

Epoch: [1][0/5729]	Loss 2.9255 (2.9255)	Top-5 Accuracy 65.765 (65.765)	
Epoch: [1][100/5729]	Loss 2.9783 (3.0864)	Top-5 Accuracy 66.283 (64.068)	
Epoch: [1][200/5729]	Loss 3.1902 (3.0779)	Top-5 Accuracy 62.959 (64.323)	
Epoch: [1][300/5729]	Loss 3.1377 (3.0798)	Top-5 Accuracy 63.796 (64.319)	
Epoch: [1][400/5729]	Loss 3.1814 (3.0823)	Top-5 Accuracy 61.426 (64.327)	
Epoch: [1][500/5729]	Loss 3.1644 (3.0826)	Top-5 Accuracy 61.516 (64.307)	
Epoch: [1][600/5729]	Loss 3.1464 (3.0816)	Top-5 Accuracy 64.734 (64.280)	
Epoch: [1][700/5729]	Loss 3.0619 (3.0783)	Top-5 Accuracy 64.458 (64.346)	
Epoch: [1][800/5729]	Loss 3.1007 (3.0761)	Top-5 Accuracy 64.875 (64.384)	
Epoch: [1][900/5729]	Loss 3.1417 (3.0742)	Top-5 Accuracy 62.464 (64.389)	
Epoch: [1][1000/5729]	Loss 3.0270 (3.0741)	Top-5 Accuracy 67.554 (64.397)	
Epoch: [1][1100/5729]	Loss 2.9033 (3.0742)	Top-5 Accuracy 66.114 (64.401)	
Epoch: [1][1200/5729]	Loss 2.9569 (3.0724)	Top-5 Accuracy 65.610 (64.432)	
Epoch: [1][1300/5729]	Loss 3.1001 (3.

EVALUATING AT BEAM SIZE 3: 100%|█████████████████████████████████████████████████| 40504/40504 [47:02<00:00, 14.58it/s]


Calculating Evalaution Metric Scores......

loading annotations into memory...
0:00:00.543992
creating index...
index created!
Loading and preparing results...     
DONE (t=0.11s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 300885, 'reflen': 306886, 'guess': [300885, 260381, 219877, 179373], 'correct': [197924, 96421, 40966, 17056]}
ratio: 0.9804455074522755
Bleu_1: 0.645
Bleu_2: 0.484
Bleu_3: 0.350
Bleu_4: 0.251
computing CIDEr score...
CIDEr: 0.889
Epoch 1:	CIDEr Score: 0.8887264183756494
Predict: 
sebuah sebuah bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus
Epoch: [2][0/5729]	Loss 2.8062 (2.8062)	Top-5 Accuracy 68.781 (68.781)	
Epoch: [2][100/5729]	Loss 2.6551 (2.8564)	Top-5 Accuracy 69.776 (67.404)	
Epoch: [2][200/5729]	Loss 2.8643 (2.8703)	Top-5 Accuracy 66.433 (67.203)	
Epoch: [2][300/5729]	Loss 2.8204 (2.8773)	Top-5 Accuracy 69.087 (67.068)	
Epoch: [2][400/5729]	Loss 2.8934 (2.8824)	Top-5 Acc

EVALUATING AT BEAM SIZE 3: 100%|█████████████████████████████████████████████████| 40504/40504 [47:30<00:00, 15.27it/s]


Calculating Evalaution Metric Scores......

loading annotations into memory...
0:00:00.542001
creating index...
index created!
Loading and preparing results...     
DONE (t=0.11s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 299223, 'reflen': 305641, 'guess': [299223, 258719, 218215, 177711], 'correct': [196999, 96408, 41605, 17324]}
ratio: 0.9790015083054925
Bleu_1: 0.644
Bleu_2: 0.485
Bleu_3: 0.353
Bleu_4: 0.254
computing CIDEr score...
CIDEr: 0.902
Epoch 2:	CIDEr Score: 0.9021348722085023
Predict: 
sebuah bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus

DECAYING learning rate.
The new learning rate is 0.000400

Epoch: [3][0/5729]	Loss 2.8223 (2.8223)	Top-5 Accuracy 67.638 (67.638)	
Epoch: [3][100/5729]	Loss 2.8444 (2.7731)	Top-5 Accuracy 67.026 (68.586)	
Epoch: [3][200/5729]	Loss 2.7380 (2.7610)	Top-5 Accuracy 68.519 (68.681)	
Epoch: [3][300/5729]	Loss 2.5743 (2.7590)	Top-5 Accuracy 71.095 (68.

EVALUATING AT BEAM SIZE 3: 100%|█████████████████████████████████████████████████| 40504/40504 [45:41<00:00, 14.77it/s]


Calculating Evalaution Metric Scores......

loading annotations into memory...
0:00:00.521995
creating index...
index created!
Loading and preparing results...     
DONE (t=0.11s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 298729, 'reflen': 305165, 'guess': [298729, 258225, 217721, 177217], 'correct': [198104, 96208, 41688, 17700]}
ratio: 0.9789097701243558
Bleu_1: 0.649
Bleu_2: 0.486
Bleu_3: 0.354
Bleu_4: 0.257
computing CIDEr score...
CIDEr: 0.915
Epoch 3:	CIDEr Score: 0.9148574687741924
Predict: 
sebuah sebuah bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus
Epoch: [4][0/5729]	Loss 2.7087 (2.7087)	Top-5 Accuracy 68.586 (68.586)	
Epoch: [4][100/5729]	Loss 2.6717 (2.6759)	Top-5 Accuracy 69.223 (69.858)	
Epoch: [4][200/5729]	Loss 2.6700 (2.6731)	Top-5 Accuracy 71.257 (69.934)	
Epoch: [4][300/5729]	Loss 2.6694 (2.6752)	Top-5 Accuracy 70.000 (69.936)	
Epoch: [4][400/5729]	Loss 2.7085 (2.6731)	Top-5 Acc

EVALUATING AT BEAM SIZE 3: 100%|█████████████████████████████████████████████████| 40504/40504 [47:24<00:00, 14.24it/s]


Calculating Evalaution Metric Scores......

loading annotations into memory...
0:00:00.541900
creating index...
index created!
Loading and preparing results...     
DONE (t=0.11s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 311760, 'reflen': 313617, 'guess': [311760, 271256, 230752, 190248], 'correct': [203104, 97768, 42102, 17881]}
ratio: 0.9940787648628711
Bleu_1: 0.648
Bleu_2: 0.482
Bleu_3: 0.348
Bleu_4: 0.250
computing CIDEr score...
CIDEr: 0.900
Epoch 4:	CIDEr Score: 0.899638820494321

Epochs since last improvement: 1

Predict: 
sebuah sebuah bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus
Epoch: [5][0/5729]	Loss 2.5942 (2.5942)	Top-5 Accuracy 72.915 (72.915)	
Epoch: [5][100/5729]	Loss 2.7098 (2.6081)	Top-5 Accuracy 70.060 (70.738)	
Epoch: [5][200/5729]	Loss 2.4974 (2.6050)	Top-5 Accuracy 71.238 (70.854)	
Epoch: [5][300/5729]	Loss 2.5800 (2.6081)	Top-5 Accuracy 71.812 (70.766)	
Epoch: [5][400/57

EVALUATING AT BEAM SIZE 3: 100%|█████████████████████████████████████████████████| 40504/40504 [47:04<00:00, 14.34it/s]


Calculating Evalaution Metric Scores......

loading annotations into memory...
0:00:00.541987
creating index...
index created!
Loading and preparing results...     
DONE (t=0.11s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 314718, 'reflen': 314843, 'guess': [314718, 274214, 233710, 193206], 'correct': [203099, 96448, 40762, 17431]}
ratio: 0.999602976721728
Bleu_1: 0.645
Bleu_2: 0.476
Bleu_3: 0.341
Bleu_4: 0.244
computing CIDEr score...
CIDEr: 0.893
Epoch 5:	CIDEr Score: 0.8929402969977082

Epochs since last improvement: 2

Predict: 
sebuah sebuah bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus

DECAYING learning rate.
The new learning rate is 0.000320

Epoch: [6][0/5729]	Loss 2.5678 (2.5678)	Top-5 Accuracy 70.965 (70.965)	
Epoch: [6][100/5729]	Loss 2.6210 (2.5394)	Top-5 Accuracy 71.279 (71.869)	
Epoch: [6][200/5729]	Loss 2.5257 (2.5310)	Top-5 Accuracy 73.154 (71.904)	
Epoch: [6][300/5729]	Loss 2.593

EVALUATING AT BEAM SIZE 3: 100%|█████████████████████████████████████████████████| 40504/40504 [46:33<00:00, 14.14it/s]


Calculating Evalaution Metric Scores......

loading annotations into memory...
0:00:00.561079
creating index...
index created!
Loading and preparing results...     
DONE (t=0.10s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 309788, 'reflen': 312365, 'guess': [309788, 269284, 228780, 188276], 'correct': [203576, 97291, 41532, 17457]}
ratio: 0.9917500360155556
Bleu_1: 0.652
Bleu_2: 0.483
Bleu_3: 0.348
Bleu_4: 0.249
computing CIDEr score...
CIDEr: 0.903
Epoch 6:	CIDEr Score: 0.9033666380065309

Epochs since last improvement: 3

Predict: 
sebuah sebuah bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus
Epoch: [7][0/5729]	Loss 2.4159 (2.4159)	Top-5 Accuracy 74.713 (74.713)	
Epoch: [7][100/5729]	Loss 2.3843 (2.4603)	Top-5 Accuracy 72.771 (72.897)	
Epoch: [7][200/5729]	Loss 2.4300 (2.4648)	Top-5 Accuracy 73.456 (72.816)	
Epoch: [7][300/5729]	Loss 2.4127 (2.4665)	Top-5 Accuracy 72.029 (72.798)	
Epoch: [7][400/5

EVALUATING AT BEAM SIZE 3: 100%|█████████████████████████████████████████████████| 40504/40504 [47:02<00:00, 14.85it/s]


Calculating Evalaution Metric Scores......

loading annotations into memory...
0:00:00.570997
creating index...
index created!
Loading and preparing results...     
DONE (t=0.11s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 313717, 'reflen': 314815, 'guess': [313717, 273213, 232709, 192206], 'correct': [202589, 95093, 40409, 17032]}
ratio: 0.9965122373457396
Bleu_1: 0.644
Bleu_2: 0.472
Bleu_3: 0.338
Bleu_4: 0.242
computing CIDEr score...
CIDEr: 0.885
Epoch 7:	CIDEr Score: 0.8850022203548268

Epochs since last improvement: 4

Predict: 
sebuah bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus
Epoch: [8][0/5729]	Loss 2.5875 (2.5875)	Top-5 Accuracy 71.001 (71.001)	
Epoch: [8][100/5729]	Loss 2.6307 (2.4320)	Top-5 Accuracy 70.549 (73.237)	
Epoch: [8][200/5729]	Loss 2.3776 (2.4257)	Top-5 Accuracy 74.472 (73.361)	
Epoch: [8][300/5729]	Loss 2.4352 (2.4282)	Top-5 Accuracy 74.032 (73.359)	
Epoch: [8][400/5729

EVALUATING AT BEAM SIZE 3: 100%|█████████████████████████████████████████████████| 40504/40504 [46:39<00:00, 16.03it/s]


Calculating Evalaution Metric Scores......

loading annotations into memory...
0:00:00.526997
creating index...
index created!
Loading and preparing results...     
DONE (t=0.11s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 311526, 'reflen': 313734, 'guess': [311526, 271022, 230518, 190014], 'correct': [202938, 96516, 41180, 17596]}
ratio: 0.9929621909005687
Bleu_1: 0.647
Bleu_2: 0.478
Bleu_3: 0.344
Bleu_4: 0.247
computing CIDEr score...
CIDEr: 0.892
Epoch 8:	CIDEr Score: 0.8920561695740918

Epochs since last improvement: 5

Predict: 
sebuah sebuah bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus

DECAYING learning rate.
The new learning rate is 0.000256

Epoch: [9][0/5729]	Loss 2.2291 (2.2291)	Top-5 Accuracy 77.196 (77.196)	
Epoch: [9][100/5729]	Loss 2.2580 (2.3798)	Top-5 Accuracy 76.572 (74.083)	
Epoch: [9][200/5729]	Loss 2.5705 (2.3818)	Top-5 Accuracy 71.570 (74.039)	
Epoch: [9][300/5729]	Loss 2.26

EVALUATING AT BEAM SIZE 3: 100%|█████████████████████████████████████████████████| 40504/40504 [48:36<00:00, 13.89it/s]


Calculating Evalaution Metric Scores......

loading annotations into memory...
0:00:00.607013
creating index...
index created!
Loading and preparing results...     
DONE (t=0.11s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 312143, 'reflen': 313928, 'guess': [312143, 271639, 231135, 190632], 'correct': [202559, 96604, 41486, 17756]}
ratio: 0.9943139828240839
Bleu_1: 0.645
Bleu_2: 0.478
Bleu_3: 0.344
Bleu_4: 0.248
computing CIDEr score...
CIDEr: 0.896
Epoch 9:	CIDEr Score: 0.8962054076344467

Epochs since last improvement: 6

Predict: 
sebuah bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus


In [39]:
img = imread('test_imgs/test1.jpg')
img = imresize(img, (256, 256)) # (224, 224, 3)
img = img.transpose(2, 0, 1) # channel first (3, 224, 224)
img = img / 255. # normalize the input to 0 - 1
img = torch.FloatTensor(img).to(device) # convert to tensor
# normalize the input image
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225])
transform = transforms.Compose([normalize])
image = transform(img.to('cpu')).to(device)  # (3, 224, 224)

# add single batch
image = image.unsqueeze(0) # (1, 3, 224, 224)
# encoded_image, global_features = encoder(image)

In [42]:
x = encoder.vgg(image)

In [43]:
x.shape

torch.Size([1, 512, 8, 8])

In [63]:
def predict_output(image, rev_word_map):
    """
    predict output with beam size of 1 (predict the word and feet it to the next LSTM)
    print out the generated sentence
    """
    max_len = 20
    sampled = []
    img = imread(image)
    img = imresize(img, (256, 256)) # (224, 224, 3)
    img = img.transpose(2, 0, 1) # channel first (3, 224, 224)
    img = img / 255. # normalize the input to 0 - 1
    img = torch.FloatTensor(img).to(device) # convert to tensor
    # normalize the input image
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225])
    transform = transforms.Compose([normalize])
    image = transform(img.to('cpu')).to(device)  # (3, 224, 224)
    
    # add single batch
    image = image.unsqueeze(0) # (1, 3, 224, 224)
    encoded_image, global_features = encoder(image)
    num_pixels = encoded_image.shape[1]
    spatial_image = F.relu(decoder.encoded_to_hidden(encoded_image)) # (batch_size, num_pixels, hidden_size)
    global_image = F.relu(decoder.global_features(global_features)) # (batch_size, embed_size)
    alphas = torch.zeros(max_len, num_pixels+1)
    betas = torch.zeros(max_len, 1)
    # create prediction with initial <start> token
    predictions = torch.LongTensor([[word_map['<start>']]]).to(device) # (1, 1)
    h, c = decoder.init_hidden_state(encoded_image)
    
    for timestep in range(max_len):
        embeddings = decoder.embedding(predictions).squeeze(1) # (1, 1, embed_dim) --> (1, embed_dim)
        inputs = torch.cat((embeddings, global_image), dim=1) # (1, embed_dim*2)
        h, c, st = decoder.LSTM(inputs, (h, c))
        out, alpha, beta = decoder.adaptive_attention(spatial_image, h, st)
        pt = decoder.fc(out)
        _, pred = pt.max(1)
        sampled.append(pred.item())
        alphas[timestep] = alpha
        betas[timestep] = beta.item()
    
    generated_words = [rev_word_map[sampled[i]] for i in range(len(sampled))]
    filtered_words = ' '.join([word for word in generated_words if word != '<end>'])
    print(filtered_words)
    
    

In [64]:
save_checkpoint(epoch, epochs_since_improvement, encoder, decoder, encoder_optimizer, decoder_optimizer, recent_cider, is_best)


In [65]:
predict_output("test1.jpg", rev_word_map)

sebuah sebuah sebuah sebuah bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus bus


In [68]:
start_epoch = 1

In [1]:
encoder.vgg

NameError: name 'encoder' is not defined