In [1]:
import torch 
from torch import optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

import matplotlib.pyplot as plt

from model.model import SentenceEncoder, SentenceDecoder, ImageEncoder, cnnTransforms, Gesd
from dataset import VisDialDataset
from utils.token import Lang

from VQAFeature.model import VQADualModel
from VQAFeature.utils import setDualData

# Locations of the MS-COCO images and the VisDial dialog annotations,
# plus the auxiliary files read by the cells below.
cocoDir = "/home/ball/dataset/mscoco/"
jsonFile = "/home/ball/dataset/mscoco/visdialog/visdial_1.0_val.json"
langFile = "dataset/lang.pkl"
sentFeature = "visdial_train.h5"

# Use the GPU when one is visible to torch; otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
# DEVICE = torch.device("cpu")
print(DEVICE)

cuda


In [2]:
# Restore the saved Lang object; its sentenceToVector method is handed to the
# dataset as the sentence converter.
lang = Lang.load(langFile)

# Sentences are wrapped with torch.LongTensor and images go through cnnTransforms.
dataset = VisDialDataset(
    dialFile=jsonFile,
    cocoDir=cocoDir,
    sentTransform=torch.LongTensor,
    imgTransform=cnnTransforms,
    convertSentence=lang.sentenceToVector,
)

Load lang model: dataset/lang.pkl. Word size: 43974


Preparing image paths with image_ids: 133351it [00:00, 376983.53it/s]


In [15]:
# Shuffled mini-batches of two dataset entries, loaded in the main process
# (num_workers=0) and merged with the dataset's own collate function.
loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=2,
    shuffle=True,
    num_workers=0,
    collate_fn=VisDialDataset.collate_fn,
)

# Keep a live iterator so later cells can pull one batch at a time.
it = iter(loader)

In [36]:
# Pull the next mini-batch from the loader iterator. The built-in next() is the
# iterator-protocol call; the `.next()` method is not available on current
# DataLoader iterators.
data = next(it)
# Unpack the batch into image, question, answer and label tensors on DEVICE.
images_t, questions_t, answers_t, label_t = setDualData(data, lang, DEVICE)

In [39]:
# Settings passed to VQADualModel for its image and sentence parts.
image_setting = dict(output_size=1024, pretrained=False)
# word_size is the size of the loaded Lang vocabulary (len(lang)).
sentence_setting = dict(word_size=len(lang), output_size=512)

# Build the model, then move it to the selected device.
model = VQADualModel(image_setting, sentence_setting)
model = model.to(DEVICE)

In [48]:
# Replace the freshly built model with the saved one and switch it to eval mode.
# map_location remaps the stored tensors onto DEVICE at load time, so a
# checkpoint written on a GPU can still be loaded when DEVICE falls back to CPU.
# NOTE(review): torch.load unpickles the file, which can run arbitrary code —
# only load checkpoints from a trusted source.
model = torch.load("VQAFeature/models/dualfix/VQAmodel.2.pth", map_location=DEVICE).to(DEVICE).eval()

In [4]:
class Gesd(torch.nn.Module):
    """Similarity between two feature tensors.

    The score is the product of two terms, both reduced over ``dim``:

    * ``1 / (1 + sum((f1 - f2) ** 2))`` -- shrinks as the squared distance grows;
    * ``1 / (1 + exp(-gamma * (sum(f1 * f2) + c)))`` -- a sigmoid of the shifted,
      scaled dot product.

    Args:
        gamma: scale applied to the shifted dot product inside the sigmoid.
        c: constant added to the dot product before scaling.
        dim: dimension over which the features are reduced.
    """

    def __init__(self, gamma=1, c=1, dim=1):
        super().__init__()
        self.gamma, self.c, self.dim = gamma, c, dim

    def forward(self, f1, f2):
        """Return the similarity of ``f1`` and ``f2`` with ``dim`` reduced away."""
        # Squared Euclidean distance (no square root is taken).
        squared_distance = (f1 - f2).pow(2).sum(dim=self.dim)
        dot_product = (f1 * f2).sum(dim=self.dim)

        distance_term = 1 / (1 + squared_distance)
        exponent = -(self.gamma * (dot_product + self.c))
        sigmoid_term = 1 / (1 + torch.exp(exponent))

        return distance_term * sigmoid_term

In [54]:
from tqdm import tqdm

# Evaluate the first 100 dataset entries one at a time and collect what the
# notebook's `eval` helper returns for each of them.
# NOTE: `eval` here is the helper defined in this notebook, not the built-in;
# its defining cell must be run before this one.
scores = []
# Pure evaluation: disable autograd so the forward passes do not build and
# hold computation graphs.
with torch.no_grad():
    for i in tqdm(range(100)):
        # A one-element slice keeps the input in the form collate_fn is given here.
        data = VisDialDataset.collate_fn(dataset[i:i+1])
        images_t, questions_t, answers_t, label_t = setDualData(data, lang, DEVICE, negsimple=0)
        iq_outputs = model.imageQuestion(images_t, questions_t)
        a_outputs = model.answer(answers_t)
        scores.append(eval(iq_outputs, a_outputs))
# torch.stack(scores).mean()

100%|██████████| 100/100 [00:03<00:00, 28.73it/s]


In [55]:
# Display the per-entry results appended by the evaluation loop.
scores

[(tensor(0.4013), tensor(0.3724, device='cuda:0')),
 (tensor(0.4617), tensor(0.4036, device='cuda:0')),
 (tensor(0.4004), tensor(0.3810, device='cuda:0')),
 (tensor(0.6167), tensor(0.2920, device='cuda:0')),
 (tensor(0.5293), tensor(0.3317, device='cuda:0')),
 (tensor(0.4560), tensor(0.3103, device='cuda:0')),
 (tensor(0.3936), tensor(0.3613, device='cuda:0')),
 (tensor(0.5093), tensor(0.3719, device='cuda:0')),
 (tensor(0.6117), tensor(0.2983, device='cuda:0')),
 (tensor(0.3845), tensor(0.3995, device='cuda:0')),
 (tensor(0.4926), tensor(0.2993, device='cuda:0')),
 (tensor(0.5408), tensor(0.3293, device='cuda:0')),
 (tensor(0.3926), tensor(0.3467, device='cuda:0')),
 (tensor(0.5301), tensor(0.3283, device='cuda:0')),
 (tensor(0.4676), tensor(0.3884, device='cuda:0')),
 (tensor(0.5760), tensor(0.2738, device='cuda:0')),
 (tensor(0.5621), tensor(0.3154, device='cuda:0')),
 (tensor(0.7083), tensor(0.3258, device='cuda:0')),
 (tensor(0.5587), tensor(0.3359, device='cuda:0')),
 (tensor(0.5

In [53]:
def eval(inputs, targets):
    """Score every input row against every target row and summarise the ranking.

    Row ``i`` of ``targets`` is treated as the correct match for row ``i`` of
    ``inputs``. All pairs are scored with ``Gesd(dim=2)``; for each input the
    targets are sorted by descending score and the position of the matching
    target is located.

    NOTE: this function shadows the built-in ``eval``; the name is kept because
    other cells call it.

    Args:
        inputs: tensor of shape batch * feature.
        targets: tensor of shape batch * feature (each answer's feature).

    Returns:
        A tuple ``(mrr, mean_score)`` of scalar tensors: ``mrr`` is the mean of
        ``1 / (rank + 1)`` with 0-based ranks, returned on the CPU; ``mean_score``
        is the mean of all pairwise scores, left on the inputs' device.

    Raises:
        ValueError: if ``inputs`` and ``targets`` have different batch sizes.
    """
    batch = inputs.size(0)
    if targets.size(0) != batch:
        # A mismatched batch could otherwise broadcast silently and yield a
        # meaningless ranking.
        raise ValueError(
            "inputs and targets must have the same batch size, got %d and %d"
            % (batch, targets.size(0))
        )

    criterion = Gesd(dim=2)
    # Evaluation only: no autograd graph is needed for any of this.
    with torch.no_grad():
        # (batch, batch, feature) views pairing input i with target j; expand
        # creates the pairing without copying the data.
        paired_inputs = inputs.unsqueeze(1).expand(-1, batch, -1)
        paired_targets = targets.unsqueeze(0).expand(batch, -1, -1)

        score = criterion(paired_inputs, paired_targets)
        _, orders = score.sort(descending=True, dim=1)

        # Column at which each row's own index appears = 0-based rank of the
        # correct target. Done on the scores' device to avoid copying the whole
        # order matrix to the host.
        row_ids = torch.arange(batch, device=orders.device).unsqueeze(1)
        ranks = (orders == row_ids).nonzero()[:, 1].float()
        mrr = (1 / (ranks + 1)).mean()
#     mrr = ranks.mean()
        return mrr.cpu(), score.mean()

In [132]:
# NOTE(review): `outputs` is not defined in any cell visible in this notebook, and
# `eval` as defined in this notebook returns an (mrr, mean score) tuple rather than
# indices — this cell looks left over from an earlier version; confirm before re-running.
idx = eval(outputs, answers_t)