In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [3]:
import torch
from torch.autograd import Variable

from dataset import Dictionary, HMQAFeatureDataset
from model import SoftCount
from config import *
from datetime import datetime, timedelta

import h5py
import numpy as np
import _pickle as pkl
import json
import torch.nn.functional as F

In [4]:
dictionary = Dictionary.load_from_file('data/dictionary.pkl')

loading dictionary from data/dictionary.pkl


In [5]:
%%time
print('loading features from train hdf5 file')
train_h5_loc = './data/train36.hdf5'
with h5py.File(train_h5_loc, 'r') as hf:
    train_image_features = np.array(hf.get('image_features'))
    train_spatials_features = np.array(hf.get('spatial_features'))
# np.save( open("/tmp/vqa/train_image_features", "wb"), train_image_features)
# np.save( open("/tmp/vqa/train_spatials_features", "wb"), train_spatials_features)

In [6]:
# %%time
# train_image_features = np.load(open("/tmp/vqa/train_image_features", "rb"))
# train_spatials_features = np.load(open("/tmp/vqa/train_spatials_features", "rb"))

CPU times: user 92 ms, sys: 42.6 s, total: 42.7 s
Wall time: 7min 15s


In [7]:
from dataset import HMQAFeatureDataset

hmqa_train_dset = HMQAFeatureDataset(
    img_id2hqma_idx = pkl.load(open("./data/train36_imgid2idx.pkl", "rb")),
    image_features = train_image_features, 
    spatial_features = train_spatials_features, 
    qid2count = json.load(open("./data/how_many_qa/qid2count.json", "rb")), 
    qid2count2score = json.load(open("./data/how_many_qa/qid2count2score.json", "rb")), 
    name="train", 
    dictionary=dictionary
)
del HMQAFeatureDataset

In [8]:
len(hmqa_train_dset)

83642

In [9]:
len(set([x["image_id"] for x in hmqa_train_dset.entries]))

45546

In [10]:
%%time
print('loading features from val hdf5 file')
val_h5_loc = './data/val36.hdf5'
with h5py.File(val_h5_loc, 'r') as hf:
    val_image_features = np.array(hf.get('image_features'))
    val_spatials_features = np.array(hf.get('spatial_features'))
# np.save( open("/tmp/vqa/val_image_features", "wb"), val_image_features)
# np.save( open("/tmp/vqa/val_spatials_features", "wb"), val_spatials_features)

In [11]:
# %%time
# val_image_features = np.load(open("/tmp/vqa/val_image_features", "rb"))
# val_spatials_features = np.load(open("/tmp/vqa/val_spatials_features", "rb"))

CPU times: user 24 ms, sys: 24.1 s, total: 24.1 s
Wall time: 1min 44s


In [12]:
# len(train_image_features)

from dataset import HMQAFeatureDataset

hmqa_dev_dset = HMQAFeatureDataset(
    img_id2hqma_idx = pkl.load(open("./data/val36_imgid2idx.pkl", "rb")),
    image_features = val_image_features, 
    spatial_features = val_spatials_features, 
    qid2count = json.load(open("./data/how_many_qa/qid2count.json", "rb")), 
    qid2count2score = json.load(open("./data/how_many_qa/qid2count2score.json", "rb")), 
    name="dev", 
    dictionary=dictionary
)

hmqa_test_dset = HMQAFeatureDataset(
    img_id2hqma_idx = pkl.load(open("./data/val36_imgid2idx.pkl", "rb")),
    image_features = val_image_features, 
    spatial_features = val_spatials_features, 
    qid2count = json.load(open("./data/how_many_qa/qid2count.json", "rb")), 
    qid2count2score = json.load(open("./data/how_many_qa/qid2count2score.json", "rb")), 
    name="test", 
    dictionary=dictionary
)
del HMQAFeatureDataset

In [13]:
len(hmqa_dev_dset), len(hmqa_test_dset)

(17714, 5000)

In [14]:
from torch.utils.data import DataLoader

hmqa_train_loader = DataLoader(hmqa_train_dset, 64, shuffle=True, num_workers=0)
hmqa_dev_loader = DataLoader(hmqa_dev_dset, 64, shuffle=True, num_workers=0)
hmqa_test_loader = DataLoader(hmqa_test_dset, 64, shuffle=True, num_workers=0)

In [15]:
def evaluate(model, hmqa_loader):
    
    all_acc = []
    all_se = []
    for i, (v_emb, b, q, c, c2s) in enumerate(hmqa_loader):
        v_emb = Variable(v_emb)
        b = Variable(b)
        q = Variable(q)
        c = Variable(c).view(-1).float()
        
        if USE_CUDA:
            v_emb = v_emb.cuda()
            b = b.cuda()
            q = q.cuda()
            c = c.cuda()
        
        kappa_0, rho = model.compute_vars(v_emb, b, q)
        count, greedy_count, logPA, entP, A, rho, P = model.take_mc_samples(kappa_0, rho, 1)
        
#         pred = model(v_emb, q)
        
#         ret = torch.stack((count.float(), greedy_count.float(), c.float()))
#         break
    
        nearest_pred = (greedy_count + 0.5).long().clamp(0, 20)
        for one_c, one_c2s, one_pred in zip(c, c2s, nearest_pred):
            one_c = one_c.cpu().data
            one_pred = one_pred.cpu().data
            
#             print("one_c = ", one_c, " and nearest pred = ", one_pred)
            
            all_se.append((one_c - one_pred.float()) ** 2)
            all_acc.append(one_c2s[one_pred])
    
    acc = torch.stack(all_acc).mean()
    rmse = torch.stack(all_se).mean() ** 0.5
    
    return acc, rmse  #, ret

In [17]:
def isnan(x):
    check = (x != x)
    check = check.float().sum().data[0]
    return check > 0

In [30]:
from model import IRLC
model = IRLC(ques_dim=1024, score_dim=2048, dropout=0.4)
del IRLC

initialising with glove embeddings
done.


In [32]:
if USE_CUDA:
    model.cuda()
model

IRLC(
  (ques_parser): QuestionParser(
    (embd): Embedding(20159, 300, padding_idx=20158)
    (rnn): GRU(300, 1024)
    (dropout): Dropout(p=0.4)
  )
  (f_s): ScoringFunction(
    (v_drop): Dropout(p=0.4)
    (q_drop): Dropout(p=0.4)
    (v_proj): FCNet(
      (main): Sequential(
        (0): Linear(in_features=2048, out_features=2048, bias=True)
        (1): LeakyReLU(negative_slope=0.01)
      )
    )
    (q_proj): FCNet(
      (main): Sequential(
        (0): Linear(in_features=1024, out_features=2048, bias=True)
        (1): LeakyReLU(negative_slope=0.01)
      )
    )
    (dropout): Dropout(p=0.4)
  )
  (W): Linear(in_features=2048, out_features=1, bias=True)
  (f_rho): RhoScorer(
    (W): Linear(in_features=1024, out_features=1, bias=True)
    (f_rho): FCNet(
      (main): Sequential(
        (0): Linear(in_features=17, out_features=100, bias=True)
        (1): LeakyReLU(negative_slope=0.01)
      )
    )
    (dense): Linear(in_features=100, out_features=1, bias=True)
  )
  (ex

In [33]:
test_acc, test_rmse = evaluate(model, hmqa_test_loader)
test_acc, test_rmse

(tensor(0.1995), tensor(3.5940))

In [34]:
opt = torch.optim.Adam(model.parameters(), lr=5e-4)
sched = torch.optim.lr_scheduler.ExponentialLR(opt, gamma=.99)

In [35]:
test_accs = []
test_rmses = []

dev_accs = []
dev_rmses = []

In [36]:
exp_name = "hope-2048-0.4"

In [37]:
for epoch in range(0, 100):
    # adjust learning rate
    sched.step()
    print("learning rate is {}".format(opt.param_groups[0]["lr"]))
    
    # save model every 10 epochs
    if epoch % 10 == 0:
        print("saving model..")
        torch.save(model.state_dict(), "./saved_models/{}-epoch-{}-acc-{}".format(exp_name, epoch, test_acc))
        print("Done.")
    
    for i, (v_emb, b, q, c, _) in enumerate(hmqa_train_loader):
        v_emb = Variable(v_emb)
        b = Variable(b)
        q = Variable(q)
        c = Variable(c).view(-1).float()

        if USE_CUDA:
            v_emb = v_emb.cuda()
            q = q.cuda()
            c = c.cuda()
            b = b.cuda()

        B, k, _ = v_emb.size()

        kappa_0, rho = model.compute_vars(v_emb, b, q)
        if isnan(kappa_0) or isnan(rho) or isnan(model.eps):
            raise Exception("there are nans here")

        num_samples = 32
        count, greedy_count, logPA, entP, A, rho, P = model.take_mc_samples(kappa_0, rho, num_samples)
        c_gt = torch.cat([c] * num_samples)

        loss = model.get_loss(c_gt, count, greedy_count, logPA, entP, A, rho)

        if i % 100 == 0:
            print("epoch = {}, i = {}, loss = {},".format(epoch, i, loss.data[0],))

        opt.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 0.25)
        opt.step()


    print("evaluating model on dev and test...")

    model.eval()
    dev_acc, dev_rmse = evaluate(model, hmqa_dev_loader)
    print("dev_acc: {}, dev_rmse: {}".format(dev_acc, dev_rmse))
    test_acc, test_rmse = evaluate(model, hmqa_test_loader)
    print("test_acc: {}, test_rmse: {}".format(test_acc, test_rmse))
    model.train()
    
    test_accs.append(test_acc)
    test_rmses.append(test_rmse)
    dev_accs.append(dev_acc)
    dev_rmses.append(dev_rmse)

learning rate is 0.0005
saving model..
Done.
epoch = 0, i = 0, loss = -30.549734115600586,


  This is separate from the ipykernel package so we can avoid doing imports until


epoch = 0, i = 100, loss = -0.04085271432995796,
epoch = 0, i = 200, loss = -0.038132987916469574,
epoch = 0, i = 300, loss = -0.04279591888189316,
epoch = 0, i = 400, loss = -0.010144894942641258,
epoch = 0, i = 500, loss = -0.005301458761096001,
epoch = 0, i = 600, loss = -0.004563407972455025,
epoch = 0, i = 700, loss = 0.004216013476252556,
epoch = 0, i = 800, loss = 0.016204925253987312,
epoch = 0, i = 900, loss = -0.05258560925722122,
epoch = 0, i = 1000, loss = -0.051397718489170074,
epoch = 0, i = 1100, loss = -0.15002067387104034,
epoch = 0, i = 1200, loss = -0.07665669918060303,
epoch = 0, i = 1300, loss = -0.0640316978096962,
evaluating model on dev and test...
dev_acc: 0.3863780200481415, dev_rmse: 3.383089780807495
test_acc: 0.39278000593185425, test_rmse: 3.1613287925720215
learning rate is 0.000495
epoch = 1, i = 0, loss = 0.0041228048503398895,
epoch = 1, i = 100, loss = -0.027404343709349632,
epoch = 1, i = 200, loss = -0.12366552650928497,
epoch = 1, i = 300, loss = 0

epoch = 9, i = 700, loss = -0.008578334003686905,
epoch = 9, i = 800, loss = -0.05775934085249901,
epoch = 9, i = 900, loss = -0.0366673581302166,
epoch = 9, i = 1000, loss = -0.04420008137822151,
epoch = 9, i = 1100, loss = -0.017653504386544228,
epoch = 9, i = 1200, loss = 0.027088219299912453,
epoch = 9, i = 1300, loss = -0.0018306095153093338,
evaluating model on dev and test...
dev_acc: 0.4874393045902252, dev_rmse: 2.7815847396850586
test_acc: 0.5030800104141235, test_rmse: 2.505992889404297
learning rate is 0.0004521910375044022
saving model..
Done.
epoch = 10, i = 0, loss = -0.02495328150689602,
epoch = 10, i = 100, loss = -0.039953842759132385,
epoch = 10, i = 200, loss = -0.022315451875329018,
epoch = 10, i = 300, loss = -0.016358336433768272,
epoch = 10, i = 400, loss = 0.013644600287079811,
epoch = 10, i = 500, loss = -0.023220647126436234,
epoch = 10, i = 600, loss = -0.011892015114426613,
epoch = 10, i = 700, loss = 0.025094714015722275,
epoch = 10, i = 800, loss = -0.008

epoch = 18, i = 900, loss = -0.002788200043141842,
epoch = 18, i = 1000, loss = -0.047101009637117386,
epoch = 18, i = 1100, loss = -0.030959278345108032,
epoch = 18, i = 1200, loss = -0.032042741775512695,
epoch = 18, i = 1300, loss = -0.04250055178999901,
evaluating model on dev and test...
dev_acc: 0.5091848373413086, dev_rmse: 2.688485622406006
test_acc: 0.528219997882843, test_rmse: 2.4686434268951416
learning rate is 0.00041308431191779333
epoch = 19, i = 0, loss = -0.022804858162999153,
epoch = 19, i = 100, loss = -0.024233344942331314,
epoch = 19, i = 200, loss = -0.015853775665163994,
epoch = 19, i = 300, loss = -0.02966458350419998,
epoch = 19, i = 400, loss = -0.025421209633350372,
epoch = 19, i = 500, loss = -0.034431032836437225,
epoch = 19, i = 600, loss = -0.05846156179904938,
epoch = 19, i = 700, loss = -0.05565168708562851,
epoch = 19, i = 800, loss = 0.0030388599261641502,
epoch = 19, i = 900, loss = 0.03302712365984917,
epoch = 19, i = 1000, loss = 0.0361199043691158

epoch = 27, i = 1100, loss = -0.03125246614217758,
epoch = 27, i = 1200, loss = -0.059032514691352844,
epoch = 27, i = 1300, loss = -0.007645882666110992,
evaluating model on dev and test...
dev_acc: 0.515355110168457, dev_rmse: 2.667637825012207
test_acc: 0.5336199998855591, test_rmse: 2.458373546600342
learning rate is 0.0003773596436018163
epoch = 28, i = 0, loss = 0.026255887001752853,
epoch = 28, i = 100, loss = 0.00615723617374897,
epoch = 28, i = 200, loss = -0.012430164031684399,
epoch = 28, i = 300, loss = -0.006967688910663128,
epoch = 28, i = 400, loss = -0.01525468472391367,
epoch = 28, i = 500, loss = -0.41006457805633545,
epoch = 28, i = 600, loss = -0.02421398274600506,
epoch = 28, i = 700, loss = -0.06134917959570885,
epoch = 28, i = 800, loss = 0.024557527154684067,
epoch = 28, i = 900, loss = -0.010952990502119064,
epoch = 28, i = 1000, loss = -0.03454614803195,
epoch = 28, i = 1100, loss = -0.028432462364435196,
epoch = 28, i = 1200, loss = -0.009449493139982224,
epo

epoch = 36, i = 1300, loss = 0.023219497874379158,
evaluating model on dev and test...
dev_acc: 0.5202664732933044, dev_rmse: 2.6313629150390625
test_acc: 0.5428400039672852, test_rmse: 2.3883886337280273
learning rate is 0.0003447245429345389
epoch = 37, i = 0, loss = -0.010875942185521126,
epoch = 37, i = 100, loss = -0.014991720207035542,
epoch = 37, i = 200, loss = -0.04361578822135925,
epoch = 37, i = 300, loss = -0.006254150532186031,
epoch = 37, i = 400, loss = -0.02520851045846939,
epoch = 37, i = 500, loss = -0.020277826115489006,
epoch = 37, i = 600, loss = -0.014478952623903751,
epoch = 37, i = 700, loss = 0.01225356012582779,
epoch = 37, i = 800, loss = -0.018824825063347816,
epoch = 37, i = 900, loss = -0.0011497270315885544,
epoch = 37, i = 1000, loss = -0.035898853093385696,
epoch = 37, i = 1100, loss = -0.04223382845520973,
epoch = 37, i = 1200, loss = -0.0005515972152352333,
epoch = 37, i = 1300, loss = -0.0061407266184687614,
evaluating model on dev and test...
dev_ac

dev_acc: 0.5225415229797363, dev_rmse: 2.6668124198913574
test_acc: 0.5439800024032593, test_rmse: 2.4220240116119385
learning rate is 0.00031491181560161616
epoch = 46, i = 0, loss = -0.004718436859548092,
epoch = 46, i = 100, loss = 0.0015805186703801155,
epoch = 46, i = 200, loss = -0.04852796345949173,
epoch = 46, i = 300, loss = -0.0520845465362072,
epoch = 46, i = 400, loss = -0.05394623801112175,
epoch = 46, i = 500, loss = -0.035280585289001465,
epoch = 46, i = 600, loss = 0.015183246694505215,
epoch = 46, i = 700, loss = -0.012959063053131104,
epoch = 46, i = 800, loss = -0.0037667127326130867,
epoch = 46, i = 900, loss = -0.018088780343532562,
epoch = 46, i = 1000, loss = -0.03448844328522682,
epoch = 46, i = 1100, loss = -0.45374590158462524,
epoch = 46, i = 1200, loss = -0.007784370332956314,
epoch = 46, i = 1300, loss = -0.012522186152637005,
evaluating model on dev and test...
dev_acc: 0.5232584476470947, dev_rmse: 2.6595311164855957
test_acc: 0.549239993095398, test_rmse

epoch = 55, i = 100, loss = -0.028195932507514954,
epoch = 55, i = 200, loss = -0.04071984440088272,
epoch = 55, i = 300, loss = -0.03370020538568497,
epoch = 55, i = 400, loss = -0.025333739817142487,
epoch = 55, i = 500, loss = -0.017864253371953964,
epoch = 55, i = 600, loss = -0.034663230180740356,
epoch = 55, i = 700, loss = -0.019950423389673233,
epoch = 55, i = 800, loss = -0.048903800547122955,
epoch = 55, i = 900, loss = -0.028548864647746086,
epoch = 55, i = 1000, loss = -0.025361821055412292,
epoch = 55, i = 1100, loss = 0.00863731000572443,
epoch = 55, i = 1200, loss = 0.0399763248860836,
epoch = 55, i = 1300, loss = -0.056219421327114105,
evaluating model on dev and test...
dev_acc: 0.5279891490936279, dev_rmse: 2.650888681411743
test_acc: 0.548799991607666, test_rmse: 2.399458169937134
learning rate is 0.0002848006012385796
epoch = 56, i = 0, loss = 0.004852686543017626,
epoch = 56, i = 100, loss = 0.0015433356165885925,
epoch = 56, i = 200, loss = -0.072960764169693,
epo

epoch = 64, i = 300, loss = -0.0011990414932370186,
epoch = 64, i = 400, loss = -0.001948455348610878,
epoch = 64, i = 500, loss = -0.03064824268221855,
epoch = 64, i = 600, loss = -0.020827051252126694,
epoch = 64, i = 700, loss = -0.014279475435614586,
epoch = 64, i = 800, loss = -0.014966766349971294,
epoch = 64, i = 900, loss = -0.015165104530751705,
epoch = 64, i = 1000, loss = 0.013441766612231731,
epoch = 64, i = 1100, loss = -0.021977948024868965,
epoch = 64, i = 1200, loss = -0.004662880674004555,
epoch = 64, i = 1300, loss = -0.010748039931058884,
evaluating model on dev and test...
dev_acc: 0.5253866910934448, dev_rmse: 2.6517508029937744
test_acc: 0.5433599948883057, test_rmse: 2.4067821502685547
learning rate is 0.0002601702613251532
epoch = 65, i = 0, loss = -0.019633766263723373,
epoch = 65, i = 100, loss = -0.049929115921258926,
epoch = 65, i = 200, loss = -0.023207560181617737,
epoch = 65, i = 300, loss = -0.014705920591950417,
epoch = 65, i = 400, loss = -0.0276557840

epoch = 73, i = 400, loss = -0.031430669128894806,
epoch = 73, i = 500, loss = -0.026872092857956886,
epoch = 73, i = 600, loss = -0.026969334110617638,
epoch = 73, i = 700, loss = -0.04847537353634834,
epoch = 73, i = 800, loss = -0.03446422144770622,
epoch = 73, i = 900, loss = -0.05851098150014877,
epoch = 73, i = 1000, loss = -0.009957882575690746,
epoch = 73, i = 1100, loss = -0.01762247271835804,
epoch = 73, i = 1200, loss = 0.007139238528907299,
epoch = 73, i = 1300, loss = -0.028675507754087448,
evaluating model on dev and test...
dev_acc: 0.525877833366394, dev_rmse: 2.6342897415161133
test_acc: 0.5517200231552124, test_rmse: 2.4065744876861572
learning rate is 0.00023767002100285347
epoch = 74, i = 0, loss = -0.011224345304071903,
epoch = 74, i = 100, loss = -0.023520799353718758,
epoch = 74, i = 200, loss = -0.037581466138362885,
epoch = 74, i = 300, loss = -0.014605973847210407,
epoch = 74, i = 400, loss = -0.028190433979034424,
epoch = 74, i = 500, loss = -0.24627599120140

epoch = 82, i = 600, loss = -0.028876369819045067,
epoch = 82, i = 700, loss = -0.009369701147079468,
epoch = 82, i = 800, loss = 0.003153274767100811,
epoch = 82, i = 900, loss = -0.0047690365463495255,
epoch = 82, i = 1000, loss = -0.020427411422133446,
epoch = 82, i = 1100, loss = -0.024498451501131058,
epoch = 82, i = 1200, loss = -0.04436325281858444,
epoch = 82, i = 1300, loss = -0.02783777564764023,
evaluating model on dev and test...
dev_acc: 0.5275375247001648, dev_rmse: 2.6397488117218018
test_acc: 0.5529199838638306, test_rmse: 2.4022905826568604
learning rate is 0.00021711566339590582
epoch = 83, i = 0, loss = -0.0058473581448197365,
epoch = 83, i = 100, loss = -0.029404930770397186,
epoch = 83, i = 200, loss = -0.018726443871855736,
epoch = 83, i = 300, loss = -0.048185523599386215,
epoch = 83, i = 400, loss = -0.019754517823457718,
epoch = 83, i = 500, loss = -0.020870834589004517,
epoch = 83, i = 600, loss = -0.02379833161830902,
epoch = 83, i = 700, loss = 0.00477411411

epoch = 91, i = 700, loss = -0.03520131856203079,
epoch = 91, i = 800, loss = -0.2377886176109314,
epoch = 91, i = 900, loss = -0.004952789284288883,
epoch = 91, i = 1000, loss = -0.028306398540735245,
epoch = 91, i = 1100, loss = 0.023639490827918053,
epoch = 91, i = 1200, loss = -0.016020189970731735,
epoch = 91, i = 1300, loss = -0.029301635921001434,
evaluating model on dev and test...
dev_acc: 0.5289657711982727, dev_rmse: 2.640796422958374
test_acc: 0.5477200150489807, test_rmse: 2.4084019660949707
learning rate is 0.00019833890321101254
epoch = 92, i = 0, loss = -0.04072896018624306,
epoch = 92, i = 100, loss = -0.009392433799803257,
epoch = 92, i = 200, loss = 0.005980800371617079,
epoch = 92, i = 300, loss = -0.025384392589330673,
epoch = 92, i = 400, loss = -0.016171500086784363,
epoch = 92, i = 500, loss = 0.0024192817509174347,
epoch = 92, i = 600, loss = -0.033555421978235245,
epoch = 92, i = 700, loss = -0.024097610265016556,
epoch = 92, i = 800, loss = -0.034619722515344

In [39]:
torch.save(model.state_dict(), "./saved_models/{}-epoch-99-acc".format(exp_name))

In [40]:
len(test_accs), len(dev_accs)

(100, 100)

In [41]:
top_dev_accs = sorted(zip(dev_accs, test_accs, test_rmses), reverse=True)
best_dev_acc, corr_test_acc, corr_test_rmse = top_dev_accs[0]
print("The best dev accuracy is {}. The corresponding test accuracy and test RMSE are {} and {} respectively".format(
    best_dev_acc, corr_test_acc, corr_test_rmse
))

The best dev accuracy is 0.5324206948280334. The corresponding test accuracy and test RMSE are 0.5515599846839905 and 2.4053690433502197 respectively
