In [1]:
import json
import torch
import torch.nn as nn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from torch.nn.utils.rnn import pack_padded_sequence
from cococaption.pycocotools.coco import COCO
from cococaption.pycocoevalcap.eval import COCOEvalCap
import torch.backends.cudnn as cudnn
from models.captioning_models import *
from util import *

from data_utils import get_karpathy_split
from data_loader_captions import get_caption_loader
from build_vocab import Vocabulary
from tqdm.autonotebook import tqdm
from string import punctuation
import nltk
import pickle

from data_loader_captions import filename_from_id
from PIL import Image

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

#Model Parameters
emb_dim = 512                  # dimension of word embeddings
attention_dim = 512            # attention hidden size
hidden_size = 512              # dimension of decoder RNN
cudnn.benchmark = True         # set to true only if inputs to model are fixed size; otherwise lot of computational overhead
# Training parameters
start_epoch = 0
epochs = 40                             # number of epochs to train before finetuning the encoder. Set to 18 when finetuning ecoder
epochs_since_improvement = 0            # keeps track of number of epochs since there's been an improvement in validation BLEU
batch_size = 80                         # set to 32 when finetuning the encoder
workers = 1                             # number of workers for data-loading
encoder_lr = 1e-4                       # learning rate for encoder. if fine-tuning, change to 1e-5 for CNN parameters only
decoder_lr = 5e-4                       # learning rate for decoder
grad_clip = 0.1                         # clip gradients at an absolute value of
best_cider = 0.                         # Current BLEU-4 score 
print_freq = 1# 100                        # print training/validation stats every __ batches
fine_tune_encoder = False                # set to true after 20 epochs 
checkpoint = None    # path to checkpoint, None at the begining
annFile = '/home/vu48pok/Dokumente/Projekte/reg/knowing-when-to-look-adaptive-attention/cococaption/annotations/captions_val2014.json'  # Location of validation annotations

splits_path = '/home/vu48pok/.data/compling/data/corpora/external/MSCOCO/COCO/splits/karpathy/caption_datasets/'
caps_path = '/home/vu48pok/.data/compling/data/corpora/external/MSCOCO/COCO/'
image_dir = '/home/vu48pok/.data/compling/data/corpora/external/MSCOCO/COCO/'
crop_size=224

In [3]:
with open('/home/vu48pok/Dokumente/Projekte/reg/knowing-when-to-look-adaptive-attention/data/coco_vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

In [4]:
def train(train_loader, encoder, decoder, criterion, encoder_optimizer, decoder_optimizer, epoch, vocab_size):

    decoder.train()                 # train mode (dropout and batchnorm is used)
    encoder.train()
    losses = AverageMeter()         # loss (per decoded word)
    top5accs = AverageMeter()       # top5 accuracy

    # Batches
    for i, (imgs, caps, caplens) in enumerate(train_loader):

        # Move to GPU, if available
        imgs = imgs.to(device)
        caps = caps.to(device)
        caplens = caplens.to(device)
        # Forward prop.
        enc_image,  global_features = encoder(imgs)
        predictions, alphas, betas, encoded_captions, decode_lengths, sort_ind = decoder(enc_image, global_features, 
                                                                                         caps, caplens)
        # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
        targets = encoded_captions[:, 1:]
        # Remove timesteps that we didn't decode at, or are pads
        # pack_padded_sequence is an easy trick to do this
        
        scores, _, _, _ = pack_padded_sequence(predictions, decode_lengths, batch_first=True)
        targets, _, _, _ = pack_padded_sequence(targets, decode_lengths, batch_first=True)
        # Calculate loss
        loss = criterion(scores, targets)
        # Back prop.
        decoder_optimizer.zero_grad()
        if encoder_optimizer is not None:
            encoder_optimizer.zero_grad() 
            
        loss.backward()

        # Update weights
        decoder_optimizer.step()
        if encoder_optimizer is not None:
            encoder_optimizer.step()
        # Keep track of metrics
        top5 = accuracy(scores, targets, 5)
        losses.update(loss.item(), sum(decode_lengths))    
        top5accs.update(top5, sum(decode_lengths))
        # Print status every print_freq iterations --> (print_freq * batch_size) images
        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})\t'.format(epoch, i, len(train_loader),
                                                                            loss=losses,
                                                                            top5=top5accs))

In [4]:
checkpoint = torch.load('/home/vu48pok/.data/compling/data/misc/models/AdaptiveAttention/BEST_checkpoint_12.pth.tar', map_location=torch.device('cpu'))

start_epoch = checkpoint['epoch'] + 1
epochs_since_improvement = checkpoint['epochs_since_improvement']
best_cider = checkpoint['cider']
decoder = checkpoint['decoder']
decoder_optimizer = checkpoint['decoder_optimizer']
encoder = checkpoint['encoder']
encoder_optimizer = checkpoint['encoder_optimizer']
if fine_tune_encoder is True and encoder_optimizer is None:
    encoder.fine_tune(fine_tune_encoder)
    encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),lr=encoder_lr)
    print("Finetuning the CNN")



In [9]:
caps_df = get_karpathy_split(splits_path=splits_path, caps_path=caps_path)


transform = transforms.Compose([
    transforms.Resize((crop_size, crop_size)),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406),
                         (0.229, 0.224, 0.225))
])

In [10]:
validate(caps_df, encoder, decoder, 0, vocab)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




AttributeError: 'AvgPool2d' object has no attribute 'divisor_override'

In [6]:
def validate(caps_df, encoder, decoder, epoch, vocab, max_len=20):
    """
    Funtion to validate over the complete dataset
    """

    encoder.eval()
    decoder.eval()
    results = []
    
    val_ids = np.unique(caps_df.loc[caps_df.split == 'val'].image_id)
    for i, image_id in tqdm(enumerate(val_ids[:100])):

        entry = caps_df.loc[caps_df.image_id == image_id].iloc[0]
        # get image filename
        img_path = os.path.join(
            image_dir, '{coco_split}2014/',
            filename_from_id(image_id, prefix='COCO_{coco_split}2014_')
            )
        img_path = img_path.format(coco_split=entry.coco_split)
        # read image file and transform
        image = Image.open(img_path).convert('RGB')

        img = transform(image).unsqueeze(0)

        enc_image,  global_features = encoder(img)
        num_pix = enc_image.shape[1]

        spatial_image = F.relu(decoder.encoded_to_hidden(enc_image))  # (batch_size,num_pixels,hidden_size)
        global_image = F.relu(decoder.global_features(global_features))      # (batch_size,embed_size)

        alphas_stored = torch.zeros(max_len, num_pix+1)
        betas_stored = torch.zeros(max_len,1)
        pred = torch.LongTensor([[vocab('<start>')]]).to(device)   # (1, 1)  
        betas_stored = torch.zeros(max_len,1)

        h,c = decoder.init_hidden_state(enc_image)                    #  (1,hidden_size)

        for timestep in range(max_len):
            embeddings = decoder.embedding(pred).squeeze(1)       # (1,1,embed_dim) --> (1,embed_dim)    
            inputs = torch.cat((embeddings,global_image), dim = 1)    # (1, embed_dim * 2)
            h, c, st = decoder.LSTM(inputs, (h, c))  # (1, hidden_size)
            # Run the adaptive attention model
            out, alpha, beta = decoder.adaptive_attention(spatial_image, h, st)
            # Compute the probability
            pt = decoder.fc(out)  
            _,pred = pt.max(1)
            sampled.append(pred.item())
            alphas_stored[timestep] = alpha
            betas_stored[timestep] = beta.item()

        generated_words = [vocab.idx2word[sampled[i]] for i in range(len(sampled))]
        sentence = ' '.join([word for word in generated_words if word != '<end>'])
        
        item_dict = {"image_id": image_id.item(), "caption": sentence}
        results.append(item_dict)
        
    print("Calculating Evalaution Metric Scores......\n")
    
    resFile = 'results/captions_val2014_results_' + str(epoch) + '.json' 
    evalFile = 'results/captions_val2014_eval_' + str(epoch) + '.json' 
    # Calculate Evaluation Scores
    with open(resFile, 'w') as wr:
        json.dump(results,wr)
        
    coco = COCO(annFile)
    cocoRes = coco.loadRes(resFile)
    # create cocoEval object by taking coco and cocoRes
    cocoEval = COCOEvalCap(coco, cocoRes)
    # evaluate on a subset of images
    # please remove this line when evaluating the full validation set
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    # evaluate results
    cocoEval.evaluate()    
    # Save Scores for all images in resFile
    with open(evalFile, 'w') as w:
        json.dump(cocoEval.eval, w)

    return cocoEval.eval['CIDEr'], cocoEval.eval['Bleu_4']       

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

with open('caption data/WORDMAP_coco.json', 'r') as j:
    word_map = json.load(j)

rev_word_map = {v: k for k, v in word_map.items()}  # idx2word

if checkpoint is None:
    decoder = DecoderWithAttention(hidden_size = hidden_size,
                                   vocab_size = len(word_map), 
                                   att_dim = attention_dim, 
                                   embed_size = emb_dim,
                                   encoded_dim = 2048) 

    encoder = Encoder(hidden_size = hidden_size, embed_size = emb_dim)
    decoder_optimizer = torch.optim.Adam(params=decoder.parameters(),lr=decoder_lr, betas = (0.8,0.999))
    encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),
                                         lr=encoder_lr, betas = (0.8,0.999)) if fine_tune_encoder else None

else:
    checkpoint = torch.load(checkpoint)
    start_epoch = checkpoint['epoch'] + 1
    epochs_since_improvement = checkpoint['epochs_since_improvement']
    best_cider = checkpoint['cider']
    decoder = checkpoint['decoder']
    decoder_optimizer = checkpoint['decoder_optimizer']
    encoder = checkpoint['encoder']
    encoder_optimizer = checkpoint['encoder_optimizer']
    if fine_tune_encoder is True and encoder_optimizer is None:
        encoder.fine_tune(fine_tune_encoder)
        encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),lr=encoder_lr)
        print("Finetuning the CNN")

# Move to GPU, if available
decoder = decoder.to(device)
encoder = encoder.to(device)

# Loss function
criterion = nn.CrossEntropyLoss().to(device)


transform = transforms.Compose([
    transforms.Resize((crop_size, crop_size)),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406),
                         (0.229, 0.224, 0.225))
])

caps_df = get_karpathy_split(splits_path=splits_path, caps_path=caps_path)

train_loader = get_caption_loader(
    decoding_level='word', 
    split='train',
    data_df=caps_df.loc[caps_df.split == 'train'].iloc[:10000], 
    image_dir=image_dir, 
    vocab=vocab,
    transform=transform, 
    batch_size=batch_size, 
    shuffle=False,
    num_workers=2, 
    drop_last=False   
)

In [None]:
# Epochs

for epoch in range(start_epoch, epochs):
    
    if epoch > 2:
        break
    

    # Terminate training if there is no improvmenet for 8 epochs
    if epochs_since_improvement == 8:
        print("No Improvement for the last 6 epochs. Training Terminated")
        break

    # Decay the learning rate by 0.8 every 3 epochs
    if epoch % 3 == 0 and epoch !=0:
        adjust_learning_rate(decoder_optimizer, 0.8)

    # One epoch's training
    train(train_loader=train_loader,
          encoder=encoder,
          decoder=decoder,
          criterion=criterion,
          encoder_optimizer=encoder_optimizer,
          decoder_optimizer=decoder_optimizer,
          epoch=epoch,
          vocab_size = len(vocab))

In [34]:
val_ids = np.unique(caps_df.loc[caps_df.split == 'val'].image_id)

image_id = val_ids[2]

entry = caps_df.loc[caps_df.image_id == image_id].iloc[0]
# get image filename
img_path = os.path.join(
    image_dir, '{coco_split}2014/',
    filename_from_id(image_id, prefix='COCO_{coco_split}2014_')
    )
img_path = img_path.format(coco_split=entry.coco_split)
# read image file and transform
image = Image.open(img_path).convert('RGB')

img = transform(image).unsqueeze(0)

enc_image, global_features = encoder(img)
num_pix = enc_image.shape[1]

spatial_image = F.relu(decoder.encoded_to_hidden(enc_image))  # (batch_size,num_pixels,hidden_size)
global_image = F.relu(decoder.global_features(global_features))      # (batch_size,embed_size)

alphas_stored = torch.zeros(max_len, num_pix+1)
betas_stored = torch.zeros(max_len,1)
pred = torch.LongTensor([[vocab('<start>')]]).to(device)   # (1, 1)  
betas_stored = torch.zeros(max_len,1)

h,c = decoder.init_hidden_state(enc_image)                    #  (1,hidden_size)

for timestep in range(max_len):
    embeddings = decoder.embedding(pred).squeeze(1)       # (1,1,embed_dim) --> (1,embed_dim)    
    inputs = torch.cat((embeddings,global_image), dim = 1)    # (1, embed_dim * 2)
    h, c, st = decoder.LSTM(inputs, (h, c))  # (1, hidden_size)
    # Run the adaptive attention model
    out, alpha, beta = decoder.adaptive_attention(spatial_image, h, st)
    # Compute the probability
    pt = decoder.fc(out)  
    _,pred = pt.max(1)
    sampled.append(pred.item())
    alphas_stored[timestep] = alpha
    betas_stored[timestep] = beta.item()

generated_words = [vocab.idx2word[sampled[i]] for i in range(len(sampled))]
filtered_words = ' '.join([word for word in generated_words if word != '<end>'])

print(filtered_words)

a kitchen with a toilet and a toilet a man is in a kitchen with a toilet and a toilet a man is in a kitchen with a toilet and a toilet a man is in a kitchen with a toilet and a toilet a man is in a kitchen with a toilet and a toilet
