In [1]:
#%pip install ipywidgets nltk #cython

#import nltk
#nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

# Make the `pycocotools` directory under the current working directory importable,
# adding it to sys.path only when it is not already listed.
_pycocotools_dir = os.path.join(os.getcwd(), 'pycocotools')
if _pycocotools_dir not in sys.path:
    sys.path.append(_pycocotools_dir)

import string
import pandas as pd
import numpy as np
import torch
import json
import gc

from collections import defaultdict

#from torchvision import transforms
from torch.utils.data import DataLoader
#from torch.nn.utils.rnn import pack_padded_sequence

from torch.optim import Adam
from torch.optim.lr_scheduler import LambdaLR
from torch.nn import NLLLoss

from tqdm.auto import tqdm

from loader.dataset_coco import COCODataset
from loader.images import ImageS3
from loader.model import ModelS3
from loader.vocab import construct_vocab

from commons.utils import embedding_matrix, tensor_to_word_fn
from models.transformer import MemoryAugmentedEncoder, MeshedDecoder, Transformer, ScaledDotProductAttentionMemory

from eval.metrics import bleu, cider, rouge, spice, meteor, bleu_score_fn

from IPython.core.display import HTML

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Seed the PyTorch and NumPy global random generators with the same fixed value.
torch.manual_seed(1111)
np.random.seed(1111)

In [2]:
# Training and validation splits, both returning tensors (ret_type='tensor').
# NOTE(review): `partial` is presumably a cap on the number of samples used for
# quick experiments - confirm against COCODataset.
train = COCODataset(
    dtype='train',
    ret_type='tensor',
    copy_img_to_mem=False,
    device=device,
    partial=100,
)
val = COCODataset(
    dtype='val',
    ret_type='tensor',
    copy_img_to_mem=False,
    device=device,
    partial=50,
)

loading annotations into memory...
Done (t=1.17s)
creating index...
index created!
tokenizing caption... done!!
loading annotations into memory...
Done (t=0.17s)
creating index...
index created!
tokenizing caption... done!!


In [3]:
# Build one vocabulary from `train.df` and `val.df`, then attach that same object
# to both datasets so they share it.
vocabulary = construct_vocab(train.df, val.df)
train.vocabulary = val.vocabulary = vocabulary

In [4]:
# Second view of the validation split for metric evaluation: same split as `val`,
# but with ret_type='corpus' and the shared vocabulary passed at construction.
val_eval = COCODataset(
    dtype='val',
    ret_type='corpus',
    copy_img_to_mem=False,
    vocabulary=vocabulary,
    device=device,
    partial=50,
)

loading annotations into memory...
Done (t=0.05s)
creating index...
index created!
tokenizing caption... done!!


In [5]:
# Test split (ret_type='corpus') built with the shared vocabulary.
test = COCODataset(
    dtype='test',
    ret_type='corpus',
    copy_img_to_mem=False,
    vocabulary=vocabulary,
    device=device,
    partial=10,
)

loading annotations into memory...
Done (t=0.13s)
creating index...
index created!
no caption found... done!!


In [6]:
# --- Storage locations -------------------------------------------------------
BUCKET = 'assistive-vision'
GLOVE_DIR = 'annotations/glove'
LOCAL_PATH = 'bin/'
KEY_PATH = 'bin/'
CHEKCPOINTS_PATH = 'checkpoints/'  # (sic) the epoch loop refers to this exact spelling
CAPTIONS_PATH = 'captions/'

# --- Model identity and architecture -----------------------------------------
MODEL = 'm2_transformer_multiheadatt'
VERSION = '1.0'
MEMORY_SLOTS = 40  # passed to the encoder attention module as `m`
NUM_HEAD = 8       # only used in MODEL_NAME within this notebook
NUM_WORKERS = 0

# --- Optimisation -------------------------------------------------------------
BATCH_SIZE = 16 #10
NUM_EPOCHS = 5
LR = 1             # base rate; LambdaLR multiplies it by lambda_lr(step)
LR_WARMUP = 10000  # warm-up length read by lambda_lr
LOG_INTERVAL = (256 // BATCH_SIZE) * 25  # print a log line every this many batches

# Stem shared by every saved model / checkpoint file name.
MODEL_NAME = '_'.join((MODEL, f'b{BATCH_SIZE}', f'mha{NUM_HEAD}'))


In [7]:
def lambda_lr(s):
    """Learning-rate multiplier handed to ``LambdaLR``.

    Returns ``model.d_model ** -0.5 * min(step ** -0.5, step * LR_WARMUP ** -1.5)``
    with ``step = s + 1``: the value grows linearly in ``step`` until the two
    terms meet and then decays with the inverse square root of ``step``.

    Reads the module-level ``model`` (for ``d_model``) and ``LR_WARMUP``.

    s: step counter supplied by the scheduler; shifting it by one keeps the
       first call away from ``0 ** -0.5``.
    """
    step = s + 1
    inverse_sqrt = step ** -.5
    linear_ramp = step * LR_WARMUP ** -1.5
    return (model.d_model ** -.5) * min(inverse_sqrt, linear_ramp)

def training(model, dataloader, optim, desc=''):
    """Run one cross-entropy training pass followed by one validation pass.

    Parameters
    ----------
    model
        Captioning model, called as ``model(images, captions)``.
    dataloader
        Mapping with ``'train'`` and ``'val'`` entries. Each entry must support
        ``len()`` and yield batches of
        ``(images, captions, lengths, fname, image_id)``.
    optim
        Optimizer; zeroed and stepped once per training batch.
    desc
        Prefix used for the progress-bar description and the log lines.

    Returns
    -------
    tuple
        ``(train_loss, val_loss)``: the mean per-batch loss of each phase.

    Notes
    -----
    Relies on the module-level ``scheduler``, ``loss_fn``, ``vocabulary``,
    ``LOG_INTERVAL`` and ``device``.
    """
    # Training with cross-entropy
    means = dict()

    for phase in ['train', 'val']:
        # Use the loaders that were passed in rather than a notebook global.
        loader = dataloader[phase]
        num_batches = len(loader)

        running_loss = .0

        if phase == 'train':
            model.train()
            # NOTE(review): this steps the scheduler before any optimizer step of
            # the epoch, in addition to the per-batch step below. PyTorch >= 1.1
            # documents scheduler.step() as following optimizer.step(); confirm
            # that the extra step per epoch is intended.
            scheduler.step()
        else:
            model.eval()

        t = tqdm(iter(loader), desc=f'{desc} ::: {phase}')

        # Gradients are only tracked during the training phase.
        with torch.set_grad_enabled(phase == 'train'):
            for batch_idx, batch in enumerate(t):
                images, captions, lengths, fname, image_id = batch

                out = model(images, captions)

                if phase == 'train':
                    optim.zero_grad()

                # Align predictions with targets: position i of the output is
                # scored against caption token i + 1.
                captions_gt = captions[:, 1:].contiguous()
                out = out[:, :-1].contiguous()
                loss = loss_fn(out.view(-1, len(vocabulary)), captions_gt.view(-1))

                if phase == 'train':
                    loss.backward()
                    optim.step()

                running_loss += loss.item()
                mean_loss = running_loss / (batch_idx + 1)

                t.set_postfix({
                        'loss': mean_loss
                    }, refresh=True)

                if (batch_idx + 1) % LOG_INTERVAL == 0:
                    # Report the running mean and the real number of batches.
                    print(f'{desc}_{phase} {batch_idx + 1}/{num_batches} '
                          f'{phase}_loss: {mean_loss:.4f} ')

                if phase == 'train':
                    scheduler.step()

                # release gpu memory
                del images
                del captions
                gc.collect()
                if device.type == 'cuda':
                    torch.cuda.empty_cache()

            means[phase] = running_loss / num_batches

    return means['train'], means['val']

def evaluate(model, dataloader, bleu_score_fn, tensor_to_word_fn, desc=''):
    """Caption every batch of ``dataloader`` with beam search and score the result.

    Parameters
    ----------
    model
        Captioning model exposing ``beam_search``; switched to eval mode here.
    dataloader
        Sized iterable yielding ``(images, captions, lengths, fname, image_id)``.
    bleu_score_fn
        Callable invoked as
        ``bleu_score_fn(reference_corpus=..., candidate_corpus=..., n=...)``.
    tensor_to_word_fn
        Callable turning the beam-search output (as a NumPy array) into token
        sequences.
    desc
        Progress-bar description.

    Returns
    -------
    dict
        ``'bleu'``: list of five floats where index ``n`` (1-4) is the mean
        per-batch BLEU-n and index 0 is unused; ``'coco_bleu'``, ``'cider'``
        and ``'rouge'``: whatever the corresponding metric helpers return for
        the captions grouped by file name.

    Notes
    -----
    Relies on the module-level ``vocabulary``, ``detokenize`` and ``device``.
    """
    model.eval()

    pred_byfname = dict()
    caps_byfname = defaultdict(list)
    scores = dict()

    # Five slots so that running_bleu[n] holds BLEU-n; slot 0 stays at 0.0.
    running_bleu = [0.0] * 5

    num_batches = len(dataloader)
    t = tqdm(iter(dataloader), desc=f'{desc}')
    for batch_idx, batch in enumerate(t):
        images, captions, lengths, fname, image_id = batch

        # NOTE(review): the positional 20 and 5 are presumably the maximum
        # caption length and the beam size - confirm against beam_search.
        out, _ = model.beam_search(images, 20, vocabulary.word2idx['<end>'], 5, out_size=1)
        outputs = tensor_to_word_fn(out.cpu().numpy())

        for i in range(1, 5):
            running_bleu[i] += bleu_score_fn(reference_corpus=captions, candidate_corpus=outputs, n=i)
        t.set_postfix({
            'bleu1': running_bleu[1] / (batch_idx + 1),
            'bleu4': running_bleu[4] / (batch_idx + 1)
        }, refresh=True)

        for f, o, c in zip(fname, outputs, captions):
            # Keep the first prediction seen for a file, but collect every
            # reference caption listed for it.
            if f not in pred_byfname:
                pred_byfname[f] = [detokenize(o)]
            caps_byfname[f].append(detokenize(c))

        # release gpu memory
        del images
        del captions
        gc.collect()
        if device.type == 'cuda':
            torch.cuda.empty_cache()

    # mean running_bleu score
    for i in range(1, 5):
        running_bleu[i] /= num_batches
    scores['bleu'] = running_bleu

    # calculate overall score
    scores['coco_bleu'] = bleu(caps_byfname, pred_byfname, verbose=0)
    scores['cider'] = cider(caps_byfname, pred_byfname)
    scores['rouge'] = rouge(caps_byfname, pred_byfname)
    #scores['spice'] = spice(caps_byfname, pred_byfname)
    #scores['meteor'] = meteor(caps_byfname, pred_byfname)

    return scores

def generate_captions(dataloader, model, desc='', tensor_to_word_fn=None):
    """Generate one caption per image for an unlabeled loader.

    Parameters
    ----------
    dataloader
        Iterable yielding ``(images, fname, image_id)`` batches.
    model
        Captioning model exposing ``beam_search``; switched to eval mode here,
        as ``evaluate`` does.
    desc
        Progress-bar description.
    tensor_to_word_fn
        Callable turning the beam-search output (as a NumPy array) into token
        sequences. Defaults to the module-level ``tensor2word_fn``. The name
        ``tensor_to_word_fn`` imported at module level is the factory that
        builds such a converter, so it must not be applied to the output
        directly.

    Returns
    -------
    list of dict
        One ``{'image_id': int, 'caption': str}`` entry per image.

    Notes
    -----
    Relies on the module-level ``vocabulary``, ``detokenize`` and ``device``.
    """
    if tensor_to_word_fn is None:
        tensor_to_word_fn = tensor2word_fn

    model.eval()
    rlist = []

    t = tqdm(iter(dataloader), desc=f'{desc}')
    # Pure inference: no gradients are needed.
    with torch.no_grad():
        for batch_idx, batch in enumerate(t):
            images, fname, image_id = batch

            # NOTE(review): the positional 20 and 5 are presumably the maximum
            # caption length and the beam size - confirm against beam_search.
            out, _ = model.beam_search(images, 20, vocabulary.word2idx['<end>'], 5, out_size=1)
            outputs = tensor_to_word_fn(out.cpu().numpy())

            # `tokens` rather than `out`, so the tensor above is not shadowed.
            for tokens, img in zip(outputs, image_id):
                result = dict(
                    image_id = int(img),
                    caption = detokenize(tokens)
                )
                rlist.append(result)

            # release gpu memory
            del images
            gc.collect()
            if device.type == 'cuda':
                torch.cuda.empty_cache()

    return rlist

def detokenize(tokens):
    """Join word tokens back into a single caption string.

    A token is glued to the preceding text when it starts with an apostrophe
    or is found in ``string.punctuation``; every other token is preceded by a
    single space. Leading and trailing whitespace is stripped from the result.
    """
    pieces = []
    for tok in tokens:
        glued = tok.startswith("'") or tok in string.punctuation
        pieces.append(tok if glued else ' ' + tok)
    return ''.join(pieces).strip()

In [8]:
# Model: memory-augmented encoder + meshed decoder wrapped in a Transformer.
# NOTE(review): the positional literals (3, 0) and (54, 3) are passed through
# unchanged; their meaning is defined by the project constructors - confirm there.
encoder = MemoryAugmentedEncoder(
    3, 0,
    attention_module=ScaledDotProductAttentionMemory,
    attention_module_kwargs={'m': MEMORY_SLOTS},
)
decoder = MeshedDecoder(len(vocabulary), 54, 3, vocabulary.word2idx['<pad>'])
model = Transformer(vocabulary.word2idx['<start>'], encoder, decoder).to(device)

# Optimisation: Adam whose base rate LR is scaled every step by lambda_lr,
# and a negative log-likelihood loss that skips <pad> targets.
optim = Adam(model.parameters(), lr=LR, betas=(0.9, 0.98))
scheduler = LambdaLR(optim, lambda_lr)
loss_fn = NLLLoss(ignore_index=vocabulary.word2idx['<pad>'])

# Evaluation helpers built from the imported factories.
tensor2word_fn = tensor_to_word_fn(idx2word=vocabulary.idx2word)
corpus_bleu_score_fn = bleu_score_fn(4, 'corpus')

# Persist the freshly constructed (still untrained) model object.
model_bin = ModelS3()

fname = f'{MODEL_NAME}_ep{NUM_EPOCHS}_bin_v{VERSION}.pkl'
model_bin.save_pkl(model, os.path.join(LOCAL_PATH, fname), os.path.join(KEY_PATH, fname))

In [9]:
# Loaders used by training(): tensor batches through the default collation.
dataloaders = {
    'train': DataLoader(train, batch_size=BATCH_SIZE, shuffle=True, sampler=None, pin_memory=False),
    'val': DataLoader(val, batch_size=BATCH_SIZE, shuffle=True, sampler=None, pin_memory=False),
}


def eval_collate_fn(batch):
    """Collate five-field samples: stack field 0, keep fields 1-4 as per-sample lists."""
    stacked = torch.stack([sample[0] for sample in batch])
    columns = tuple([sample[k] for sample in batch] for k in range(1, 5))
    return (stacked,) + columns


def test_collate_fn(batch):
    """Collate three-field samples: stack field 0, keep fields 1-2 as per-sample lists."""
    stacked = torch.stack([sample[0] for sample in batch])
    columns = tuple([sample[k] for sample in batch] for k in range(1, 3))
    return (stacked,) + columns


# Loaders used for scoring and caption generation: fixed order, custom collation.
val_eval_loader = DataLoader(
    val_eval,
    batch_size=BATCH_SIZE,
    shuffle=False,
    sampler=None,
    pin_memory=False,
    collate_fn=eval_collate_fn,
)
test_loader = DataLoader(
    test,
    batch_size=BATCH_SIZE,
    shuffle=False,
    sampler=None,
    pin_memory=False,
    collate_fn=test_collate_fn,
)


In [12]:
# Epoch loop: train + validate, score captions on the validation set, keep the
# best model by CIDEr and write a checkpoint after every epoch.
train_loss_min = 100
val_loss_min = 100
val_cider_max = 0.0
patience = 0

for epoch in range(NUM_EPOCHS):
    train_loss, val_loss = training(model, dataloaders, optim, desc=f'Epoch {epoch+1} of {NUM_EPOCHS}')

    # Carry the best losses forward so the checkpointed minima cover every
    # epoch so far, not just the current epoch versus the initial 100.
    train_loss_min = min(train_loss, train_loss_min)
    val_loss_min = min(val_loss, val_loss_min)

    with torch.no_grad():
        scores = evaluate(model, val_eval_loader, bleu_score_fn=corpus_bleu_score_fn,
                          tensor_to_word_fn=tensor2word_fn, desc='Eval Score')

        print(f'Epoch {epoch + 1}/{NUM_EPOCHS}')
        print('=' * 95)
        print(''.join([f'val_bleu{i}: {scores["bleu"][i]:.4f} ' for i in range(1, 5)]))
        print(''.join([f'val_coco_bleu{i + 1}{":":>5} {scores["coco_bleu"][0][i]:.4f} ' for i in range(0, 4)]))
        print(f'val_cider{":":>5} {scores["cider"][0]:.4f}')
        print(f'val_rouge{":":>5} {scores["rouge"][0]:.4f}')
        #print(f'val_spcie{":":>5} {scores["spice"][0]:.4f}')
        #print(f'val_meteor{":":>5} {scores["meteor"][0]:.4f}')
        print('-' * 95)

        # Snapshot written to every checkpoint file for this epoch.
        state = dict(
            epoch = epoch + 1,
            state_dict = model.state_dict(),
            train_loss_latest = train_loss,
            val_loss_latest = val_loss,
            train_loss_min = train_loss_min,
            val_loss_min = val_loss_min,
            val_bleu1 = scores['bleu'][1],
            val_bleu4 = scores['bleu'][4],
            val_coco_bleu1 = scores['coco_bleu'][0][0],
            val_coco_bleu4 = scores['coco_bleu'][0][3],
            val_cider = scores['cider'][0],
            val_cider_max = max(scores['cider'][0], val_cider_max),
            val_rouge = scores['rouge'][0]
        )

        if scores['cider'][0] > val_cider_max:
            val_cider_max = scores['cider'][0]
            fname = f'{MODEL_NAME}_best_v{VERSION}.pth'
            # keep the best model
            model_bin.save(state, os.path.join(LOCAL_PATH, fname), os.path.join(KEY_PATH, fname))
            patience = 0
        else:
            patience += 1

        if patience == 5:
            patience = 0
            # NOTE(review): the module-level `scheduler` was built on the previous
            # optimizer, so it does not adjust this one; its rate stays at 5e-6
            # unless a new scheduler is created - confirm that is intended.
            optim = Adam(model.parameters(), lr=5e-6)

        # save as checkpoint
        fname = f'{MODEL_NAME}_ep{epoch + 1}_chkpoint_v{VERSION}.pth'
        model_bin.save(state, os.path.join(CHEKCPOINTS_PATH, fname), os.path.join(KEY_PATH, fname))

# After the last epoch, store its state once more under the "latest" name.
fname = f'{MODEL_NAME}_ep{NUM_EPOCHS}_latest_v{VERSION}.pth'
model_bin.save(state, os.path.join(LOCAL_PATH, fname), os.path.join(KEY_PATH, fname))
    

[2021-07-20 21:57:47.087 pytorch-1-6-cpu-py36--ml-t3-medium-370ee60fbc7a856e8f67ac271515:799 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None




[2021-07-20 21:57:47.275 pytorch-1-6-cpu-py36--ml-t3-medium-370ee60fbc7a856e8f67ac271515:799 INFO profiler_config_parser.py:102] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.


HBox(children=(HTML(value='Epoch 1 of 5 ::: train'), FloatProgress(value=0.0, max=7.0), HTML(value='')))




RuntimeError: stack expects each tensor to be equal size, but got [3, 424, 640] at entry 0 and [3, 360, 640] at entry 1