In [1]:
#%pip install ipywidgets nltk tqdm #cython
#%pip install git+https://github.com/facebookresearch/fvcore.git
#python -m pip install detectron2 -f \
#  https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.7/index.html
#python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'

# https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html 
# https://stackoverflow.com/questions/62081155/pytorch-indexerror-index-out-of-range-in-self-how-to-solve
# https://www.programmersought.com/article/97387644893/

#import nltk
#nltk.download('punkt')

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

#if not os.path.join(os.getcwd(), 'pycocotools') in sys.path:
#    sys.path.append(os.path.join(os.getcwd(), 'pycocotools'))

import string
import pandas as pd
import numpy as np
import torch
import json
import gc

from collections import defaultdict

#from torchvision import transforms
from torch.utils.data import DataLoader
#from torch.nn.utils.rnn import pack_padded_sequence

from torch.optim import Adam
from torch.optim.lr_scheduler import LambdaLR
from torch.nn import NLLLoss

from tqdm.auto import tqdm

from loader.dataset_coco import COCODataset
from loader.images import ImageS3
from loader.model import ModelS3
from loader.vocab import construct_vocab

from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg

from commons.utils import embedding_matrix, tensor_to_word_fn
from models.transformer import MemoryAugmentedEncoder, MeshedDecoder, Transformer, ScaledDotProductAttentionMemory

from eval.metrics import bleu, cider, rouge, spice, meteor, bleu_score_fn

from IPython.core.display import HTML

# Run on the first CUDA device when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Seed PyTorch and NumPy with the same fixed value so repeated runs start from the same RNG state.
torch.manual_seed(1111)
np.random.seed(1111)

In [2]:
import detectron2

# Report the PyTorch, detectron2 and CUDA versions, one per line.
print(torch.__version__, detectron2.__version__, torch.version.cuda, sep='\n')

1.7.1
0.1
10.1


In [2]:
# Build the detectron2 config for the Faster R-CNN feature extractor: start from the
# detectron2 defaults, then overlay the project's VG-Detection YAML file.
cfg = get_cfg()
#cfg.set_new_allowed(True)
cfg.merge_from_file('bua/configs/VG-Detection/faster_rcnn_R_101_C4_attr_caffemaxpool.yaml')
# Test-time overrides applied on top of the YAML values.
cfg.MODEL.RPN.POST_NMS_TOPK_TEST = 300
cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST = 0.6
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.2

# Weights are read from a local file. To fetch it:
# curl https://nlp.cs.unc.edu/models/faster_rcnn_from_caffe_attr_original.pkl --output bua/weights/faster_rcnn_from_caffe_attr_original.pkl
cfg.MODEL.WEIGHTS = 'bua/weights/faster_rcnn_from_caffe_attr_original.pkl'
# Alternative weight locations kept for reference:
# 'https://nlp.cs.unc.edu/models/faster_rcnn_from_caffe_attr_original.pkl'
# 'http://nlp.cs.unc.edu/models/faster_rcnn_from_caffe_attr.pkl'

# `device.type` is the bare device kind ('cuda' or 'cpu') without an index, so the
# detector follows whichever device was selected for the notebook.
cfg.MODEL.DEVICE = device.type

#cfg
# The predictor is handed to every COCODataset built in the following cells.
predictor = DefaultPredictor(cfg)

Config 'bua/configs/VG-Detection/faster_rcnn_R_101_C4_attr_caffemaxpool.yaml' has no VERSION. Assuming it to be compatible with latest v2.


Modifications for VG in ResNet Backbone (modeling/backbone/resnet.py):
	Using pad 0 in stem max_pool instead of pad 1.

Modifications for VG in RPN (modeling/proposal_generator/rpn.py):
	Use hidden dim 512 instead fo the same dim as Res4 (1024).

Modifications for VG in RoI heads (modeling/roi_heads/roi_heads.py):
	1. Change the stride of conv1 and shortcut in Res5.Block1 from 2 to 1.
	2. Modifying all conv2 with (padding: 1 --> 2) and (dilation: 1 --> 2).
	For more details, please check 'https://github.com/peteanderson80/bottom-up-attention/blob/master/models/vg/ResNet-101/faster_rcnn_end2end_final/test.prototxt'.

Modifications for VG in RoI heads (modeling/roi_heads/fast_rcnn.py))
	Embedding: 1601 --> 256	Linear: 2304 --> 512	Linear: 512 --> 401



In [3]:
# Train / val splits with ret_type='tensor'; these feed the DataLoaders used by the
# loss-driven training loop. Each dataset receives the detectron2 predictor.
# NOTE(review): `partial` presumably limits how many samples are loaded for a quick
# run (100 train / 50 val) - confirm in loader.dataset_coco.
train = COCODataset(dtype='train', ret_type='tensor', predictor=predictor, copy_img_to_mem=False, device=device, partial=100)
val = COCODataset(dtype='val', ret_type='tensor', predictor=predictor, copy_img_to_mem=False, device=device, partial=50)

loading annotations into memory...
Done (t=1.39s)
creating index...
index created!
tokenizing caption... done!!
loading annotations into memory...
Done (t=0.04s)
creating index...
index created!
tokenizing caption... done!!


In [4]:
# Build a single vocabulary from `train.df` and `val.df`, then attach that same object
# to both datasets so they index words identically.
vocabulary = construct_vocab(train.df, val.df)
train.vocabulary = vocabulary
val.vocabulary = vocabulary

In [5]:
# Second view of the val split with ret_type='corpus', used for metric evaluation
# (BLEU / CIDEr / ROUGE) rather than for the loss. It is given the shared vocabulary
# at construction time.
# NOTE(review): 'corpus' presumably returns captions as tokens instead of index
# tensors - confirm in loader.dataset_coco.
val_eval = COCODataset(dtype='val', ret_type='corpus', predictor=predictor, copy_img_to_mem=False, 
                       vocabulary=vocabulary, device=device, partial=50)

loading annotations into memory...
Done (t=0.08s)
creating index...
index created!
tokenizing caption... done!!


In [6]:
# Test split with ret_type='corpus' and the shared vocabulary. The loader reports
# "no caption found" for this split, so it is only usable for caption generation.
test = COCODataset(dtype='test', ret_type='corpus', predictor=predictor, copy_img_to_mem=False, 
                   vocabulary=vocabulary, device=device, partial=10)

loading annotations into memory...
Done (t=0.13s)
creating index...
index created!
no caption found... done!!


In [7]:
# --- Naming / storage --------------------------------------------------------
BUCKET = 'assistive-vision'
MODEL = 'm2_transformer_multiheadatt'
GLOVE_DIR = 'annotations/glove'

# --- Model -------------------------------------------------------------------
MEMORY_SLOTS = 40  # passed as `m` to the encoder's attention module
NUM_HEAD = 8       # in the visible cells this only feeds MODEL_NAME
NUM_WORKERS = 0

# --- Optimisation ------------------------------------------------------------
BATCH_SIZE = 16 #10
# Log every 25 * (256 // BATCH_SIZE) batches, i.e. every 400 batches at batch size 16.
LOG_INTERVAL = 25 * (256 // BATCH_SIZE)
# Base learning rate of 1: the effective rate is this value multiplied by the
# factor returned by `lambda_lr` through LambdaLR.
LR = 1
LR_WARMUP = 10000  # warm-up length (in scheduler steps) used by `lambda_lr`
NUM_EPOCHS = 2

# --- Output locations --------------------------------------------------------
LOCAL_PATH = 'bin/'
KEY_PATH = 'bin/'
CHECKPOINTS_PATH = 'checkpoints/'
# Misspelled legacy name kept as an alias so existing cells that reference it still run.
CHEKCPOINTS_PATH = CHECKPOINTS_PATH
CAPTIONS_PATH = 'captions/'
VERSION = '1.0'

# Stem shared by every saved artefact of this run.
MODEL_NAME = f'{MODEL}_b{BATCH_SIZE}_mha{NUM_HEAD}'


In [8]:
def lambda_lr(s):
    """Learning-rate multiplier for LambdaLR: linear warm-up, then inverse-sqrt decay.

    Args:
        s: zero-based step counter supplied by the scheduler; it is shifted by one
           so the first call does not evaluate ``0 ** -0.5``.

    Returns:
        ``d_model ** -0.5 * min(step ** -0.5, step * LR_WARMUP ** -1.5)`` with
        ``step = s + 1``.

    Reads the notebook globals ``model`` (for ``d_model``) and ``LR_WARMUP``.
    """
    step = s + 1
    decay_term = step ** -.5
    warmup_term = step * LR_WARMUP ** -1.5
    return (model.d_model ** -.5) * min(decay_term, warmup_term)

def training(model, dataloader, optim, desc=''):
    """Run one epoch: a cross-entropy training pass followed by a validation pass.

    Args:
        model: captioning model, called as ``model(images, captions)``.
        dataloader: mapping with ``'train'`` and ``'val'`` entries. Each entry must be
            iterable, support ``len()``, and yield
            ``(images, captions, lengths, fname, image_id)`` batches.
        optim: optimizer stepped once per training batch.
        desc: prefix used in the progress-bar description and log lines.

    Returns:
        tuple: ``(train_mean_loss, val_mean_loss)``; each is the sum of the per-batch
        losses of that phase divided by the number of batches.

    Relies on the notebook globals ``loss_fn``, ``scheduler``, ``vocabulary``,
    ``device`` and ``LOG_INTERVAL``.
    """
    means = dict()

    for phase in ['train', 'val']:
        # Use the loaders that were passed in rather than a notebook global.
        loader = dataloader[phase]
        running_loss = .0

        if phase == 'train':
            model.train()
        else:
            model.eval()

        t = tqdm(iter(loader), desc=f'{desc} ::: {phase}')

        # Gradients are only tracked during the training phase.
        with torch.set_grad_enabled(phase == 'train'):
            for batch_idx, batch in enumerate(t):
                images, captions, lengths, fname, image_id = batch

                if phase == 'train':
                    optim.zero_grad()

                out = model(images, captions)

                # Shift by one: the output at position t is scored against token t + 1.
                captions_gt = captions[:, 1:].contiguous()
                out = out[:, :-1].contiguous()
                loss = loss_fn(out.view(-1, len(vocabulary)), captions_gt.view(-1))

                if phase == 'train':
                    loss.backward()
                    optim.step()
                    # Advance the LR schedule exactly once per optimizer step, and only
                    # after it, so the warm-up length matches LR_WARMUP.
                    scheduler.step()

                running_loss += loss.item()
                mean_loss = running_loss / (batch_idx + 1)

                t.set_postfix({
                        'loss': mean_loss
                    }, refresh=True)

                if (batch_idx + 1) % LOG_INTERVAL == 0:
                    # Report the running mean and the batch count of this phase's loader.
                    print(f'{desc}_{phase} {batch_idx + 1}/{len(loader)} '
                          f'{phase}_loss: {mean_loss:.4f} ')

                # release gpu memory
                del images
                del captions
                gc.collect()
                if device.type == 'cuda':
                    torch.cuda.empty_cache()

            means[phase] = running_loss / len(loader)

    return means['train'], means['val']

def evaluate(model, dataloader, bleu_score_fn, tensor_to_word_fn, desc=''):
    """Caption every batch with beam search and score the result against the references.

    Args:
        model: captioning model exposing ``beam_search``; it is switched to eval mode.
        dataloader: iterable supporting ``len()`` that yields
            ``(images, captions, lengths, fname, image_id)`` batches.
            NOTE(review): ``captions`` is assumed to be a list of token sequences, one
            per sample - confirm against the dataset's 'corpus' return type.
        bleu_score_fn: callable invoked as
            ``bleu_score_fn(reference_corpus=..., candidate_corpus=..., n=...)``.
        tensor_to_word_fn: callable turning the beam-search output (as a NumPy array)
            into one token sequence per sample.
        desc: progress-bar description.

    Returns:
        dict with keys:
            ``'bleu'``: list of five floats; index ``n`` (1-4) is the BLEU-n score
            averaged over batches, index 0 is unused and stays 0.0.
            ``'coco_bleu'``, ``'cider'``, ``'rouge'``: whatever the corresponding
            ``eval.metrics`` functions return for the collected captions.

    Relies on the notebook globals ``vocabulary``, ``device`` and ``detokenize``.
    """
    model.eval()

    pred_byfname = dict()
    caps_byfname = defaultdict(list)
    scores = dict()

    # Index 0 is padding so that running_bleu[n] accumulates BLEU-n.
    running_bleu = [0.0] * 5

    t = tqdm(iter(dataloader), desc=f'{desc}')
    # Inference only: make sure no autograd graph is built, whatever the caller does.
    with torch.no_grad():
        for batch_idx, batch in enumerate(t):
            images, captions, lengths, fname, image_id = batch

            # NOTE(review): 20 and 5 are presumably the maximum caption length and the
            # beam width - confirm against Transformer.beam_search.
            out, _ = model.beam_search(images, 20, vocabulary.word2idx['<end>'], 5, out_size=1)
            outputs = tensor_to_word_fn(out.cpu().numpy())

            for i in range(1, 5):
                running_bleu[i] += bleu_score_fn(reference_corpus=captions, candidate_corpus=outputs, n=i)
            t.set_postfix({
                'bleu1': running_bleu[1] / (batch_idx + 1),
                'bleu4': running_bleu[4] / (batch_idx + 1)
            }, refresh=True)

            for f, o, c in zip(fname, outputs, captions):
                # Keep one prediction per file name, but collect every reference caption.
                if f not in pred_byfname:
                    pred_byfname[f] = [detokenize(o)]
                caps_byfname[f].append(detokenize(c))

            # release gpu memory
            del images
            del captions
            gc.collect()
            if device.type == 'cuda':
                torch.cuda.empty_cache()

    # mean running_bleu score
    for i in range(1, 5):
        running_bleu[i] /= len(dataloader)
    scores['bleu'] = running_bleu

    # calculate overall score
    scores['coco_bleu'] = bleu(caps_byfname, pred_byfname, verbose=0)
    scores['cider'] = cider(caps_byfname, pred_byfname)
    scores['rouge'] = rouge(caps_byfname, pred_byfname)
    #scores['spice'] = spice(caps_byfname, pred_byfname)
    #scores['meteor'] = meteor(caps_byfname, pred_byfname)

    return scores

def generate_captions(dataloader, model, desc='', tensor_to_word_fn=None):
    """Generate one caption per image with beam search.

    Args:
        dataloader: iterable yielding ``(images, fname, image_id)`` batches.
        model: captioning model exposing ``beam_search``; it is switched to eval mode.
        desc: progress-bar description.
        tensor_to_word_fn: callable turning the beam-search output (as a NumPy array)
            into one token sequence per sample. Defaults to the notebook-level
            ``tensor2word_fn``. The imported ``tensor_to_word_fn`` is a factory that
            takes ``idx2word`` and must not be applied to model output directly.

    Returns:
        list of ``{'image_id': int, 'caption': str}`` dicts, in loader order.

    Relies on the notebook globals ``vocabulary``, ``device``, ``detokenize`` and,
    when no converter is passed, ``tensor2word_fn``.
    """
    if tensor_to_word_fn is None:
        tensor_to_word_fn = tensor2word_fn

    rlist = []

    # Inference only: disable dropout-style layers and autograd bookkeeping.
    model.eval()

    t = tqdm(iter(dataloader), desc=f'{desc}')
    with torch.no_grad():
        for batch_idx, batch in enumerate(t):
            images, fname, image_id = batch

            # NOTE(review): 20 and 5 are presumably the maximum caption length and the
            # beam width - confirm against Transformer.beam_search.
            out, _ = model.beam_search(images, 20, vocabulary.word2idx['<end>'], 5, out_size=1)
            outputs = tensor_to_word_fn(out.cpu().numpy())

            for tokens, img in zip(outputs, image_id):
                result = dict(
                    image_id = int(img),
                    caption = detokenize(tokens)
                )
                rlist.append(result)

            # release gpu memory
            del images
            gc.collect()
            if device.type == 'cuda':
                torch.cuda.empty_cache()

    return rlist

def detokenize(tokens):
    """Join tokens into a sentence string.

    A token is glued to the preceding text without a space when it starts with an
    apostrophe or is found in ``string.punctuation``; every other token is preceded
    by a single space. Leading and trailing whitespace is stripped from the result.
    """
    pieces = []
    for tok in tokens:
        glued = tok.startswith("'") or tok in string.punctuation
        pieces.append(tok if glued else ' ' + tok)
    return ''.join(pieces).strip()

In [9]:
# Model, optimizer, scheduler, loss and scoring helpers.
# Reminder: the vocabulary, encoder and decoder still need to be saved as well.
# NOTE(review): the positional arguments below (3, 0 for the encoder; 54, 3 for the
# decoder) are presumably layer counts, a padding index and the maximum caption
# length - confirm against models.transformer. 54 is hard-coded where
# `vocabulary.max_len` was once intended.
encoder = MemoryAugmentedEncoder(3, 0, attention_module=ScaledDotProductAttentionMemory,
                                 attention_module_kwargs={'m': MEMORY_SLOTS})
decoder = MeshedDecoder(len(vocabulary), 54, 3, vocabulary.word2idx['<pad>']) # vocabulary.max_len
model = Transformer(vocabulary.word2idx['<start>'], encoder, decoder).to(device)

# The base rate LR is scaled by the multiplier from `lambda_lr`; `lambda_lr` reads the
# global `model`, so the model must exist before the scheduler is constructed.
optim = Adam(model.parameters(), lr=LR, betas=(0.9, 0.98))
scheduler = LambdaLR(optim, lambda_lr)
# Padding positions do not contribute to the loss.
loss_fn = NLLLoss(ignore_index=vocabulary.word2idx['<pad>'])

# `tensor_to_word_fn` is a factory: given idx2word it returns the converter that is
# actually applied to model output.
corpus_bleu_score_fn = bleu_score_fn(4, 'corpus')
tensor2word_fn = tensor_to_word_fn(idx2word=vocabulary.idx2word)

model_bin = ModelS3()

# Persist the freshly initialised (untrained) model object before training starts.
fname = f'{MODEL_NAME}_bin_v{VERSION}.pkl'
model_bin.save_pkl(model, os.path.join(LOCAL_PATH, fname), os.path.join(KEY_PATH, fname))

sinusoid_encoding_table tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  8.2186e-01,  ...,  1.0000e+00,
          1.0366e-04,  1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  9.3641e-01,  ...,  1.0000e+00,
          2.0733e-04,  1.0000e+00],
        ...,
        [ 9.8663e-01, -1.6299e-01, -1.0289e-01,  ...,  9.9998e-01,
          5.3905e-03,  9.9999e-01],
        [ 3.9593e-01, -9.1828e-01,  7.5888e-01,  ...,  9.9998e-01,
          5.4941e-03,  9.9998e-01],
        [-5.5879e-01, -8.2931e-01,  9.6755e-01,  ...,  9.9998e-01,
          5.5978e-03,  9.9998e-01]])
sinusoid_encoding_table.shape torch.Size([55, 512])


In [10]:
# Loaders for the loss-driven train / val phases (default collation, shuffled).
dataloaders = dict(
    train = DataLoader(
        train,
        batch_size=BATCH_SIZE,
        shuffle=True,
        sampler=None,
        pin_memory=False,
    ),
    val = DataLoader(
        val,
        batch_size=BATCH_SIZE,
        shuffle=True,
        sampler=None,
        pin_memory=False,
    ),
)


def eval_collate_fn(batch):
    """Stack field 0 of every sample into a tensor; keep fields 1-4 as plain lists."""
    stacked = torch.stack([sample[0] for sample in batch])
    listed = tuple([sample[i] for sample in batch] for i in range(1, 5))
    return (stacked, *listed)


def test_collate_fn(batch):
    """Stack field 0 of every sample into a tensor; keep fields 1-2 as plain lists."""
    stacked = torch.stack([sample[0] for sample in batch])
    listed = tuple([sample[i] for sample in batch] for i in range(1, 3))
    return (stacked, *listed)


# Loaders for metric evaluation / caption generation (custom collation, fixed order).
val_eval_loader = DataLoader(
    val_eval,
    batch_size=BATCH_SIZE,
    shuffle=False,
    sampler=None,
    pin_memory=False,
    collate_fn=eval_collate_fn,
)
test_loader = DataLoader(
    test,
    batch_size=BATCH_SIZE,
    shuffle=False,
    sampler=None,
    pin_memory=False,
    collate_fn=test_collate_fn,
)


In [11]:
# Best-so-far trackers carried across epochs.
train_loss_min = 100
val_loss_min = 100
val_cider_max = 0.0
patience = 0   # consecutive epochs without a CIDEr improvement
state = None   # last checkpoint payload; stays None if no epoch runs

for epoch in range(NUM_EPOCHS):
    train_loss, val_loss = training(model, dataloaders, optim, desc=f'Epoch {epoch+1} of {NUM_EPOCHS}')

    with torch.no_grad():
        scores = evaluate(model, val_eval_loader, bleu_score_fn=corpus_bleu_score_fn,
                          tensor_to_word_fn=tensor2word_fn, desc='Eval Score')

    print(f'Epoch {epoch + 1}/{NUM_EPOCHS}')
    print('=' * 95)
    print(''.join([f'val_bleu{i}: {scores["bleu"][i]:.4f} ' for i in range(1, 5)]))
    print(''.join([f'val_coco_bleu{i + 1}{":":>5} {scores["coco_bleu"][0][i]:.4f} ' for i in range(0, 4)]))
    print(f'val_cider{":":>5} {scores["cider"][0]:.4f}')
    print(f'val_rouge{":":>5} {scores["rouge"][0]:.4f}')
    #print(f'val_spcie{":":>5} {scores["spice"][0]:.4f}')
    #print(f'val_meteor{":":>5} {scores["meteor"][0]:.4f}')
    print('-' * 95)

    # Persist the running bests so the values written to the checkpoint are minima and
    # maxima over all epochs so far, not just a comparison against the initial values.
    cider_score = scores['cider'][0]
    is_best = cider_score > val_cider_max
    train_loss_min = min(train_loss, train_loss_min)
    val_loss_min = min(val_loss, val_loss_min)
    val_cider_max = max(cider_score, val_cider_max)

    state = dict(
        epoch = epoch + 1,
        state_dict = model.state_dict(),
        train_loss_latest = train_loss,
        val_loss_latest = val_loss,
        train_loss_min = train_loss_min,
        val_loss_min = val_loss_min,
        val_bleu1 = scores['bleu'][1],
        val_bleu4 = scores['bleu'][4],
        val_coco_bleu1 = scores['coco_bleu'][0][0],
        val_coco_bleu4 = scores['coco_bleu'][0][3],
        val_cider = cider_score,
        val_cider_max = val_cider_max,
        val_rouge = scores['rouge'][0]
    )

    if is_best:
        fname = f'{MODEL_NAME}_best_v{VERSION}.pth'
        # keep the best model
        model_bin.save(state, os.path.join(LOCAL_PATH, fname), os.path.join(KEY_PATH, fname))
        patience = 0
    else:
        patience += 1

    if patience == 5:
        # After five epochs without improvement, continue with a fresh Adam at a small
        # fixed learning rate.
        # NOTE(review): `scheduler` was built around the previous optimizer object, so
        # its steps no longer affect this new one - confirm that is intended.
        patience = 0
        optim = Adam(model.parameters(), lr=5e-6)

    # save as checkpoint
    fname = f'{MODEL_NAME}_ep{epoch + 1}_chkpoint_v{VERSION}.pth'
    model_bin.save(state, os.path.join(CHEKCPOINTS_PATH, fname), os.path.join(KEY_PATH, fname))

# Save the final state only if at least one epoch actually produced one.
if state is not None:
    fname = f'{MODEL_NAME}_ep{NUM_EPOCHS}_latest_v{VERSION}.pth'
    model_bin.save(state, os.path.join(LOCAL_PATH, fname), os.path.join(KEY_PATH, fname))
    



Epoch 1 of 2 ::: train:   0%|          | 0/7 [00:00<?, ?it/s]

	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /opt/conda/conda-bld/pytorch_1607370116979/work/torch/csrc/utils/python_arg_parser.cpp:882.)
  filter_inds = filter_mask.nonzero()


mask_queries tensor([[[1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.]],

        [[1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.]],

        [[1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.]],

        [[1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.]],

        [[1.],
         [1.],
         [1.],
         [1.],
         [1.],
    

KeyboardInterrupt: 