In [1]:
# Enable IPython's autoreload extension in mode 2, so imported project modules
# are re-imported before each cell executes and edits to them take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
# Expose only GPU index 1 to this process. This is set in its own cell,
# ahead of the cell that imports torch.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# Debugging aid (disabled): uncomment to make CUDA kernel launches synchronous.
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [3]:
import torch
from torch.autograd import Variable

from dataset import Dictionary, HMQAFeatureDataset
from model import SoftCount
from config import *
from datetime import datetime, timedelta

import h5py
import numpy as np
import _pickle as pkl
import json
import torch.nn.functional as F

In [4]:
# Load the project's Dictionary from its pickle file; the same instance is
# passed to every HMQAFeatureDataset constructed below.
dictionary = Dictionary.load_from_file('data/dictionary.pkl')

loading dictionary from data/dictionary.pkl


In [6]:
%%time
print('loading features from train hdf5 file')
train_h5_loc = './data/train36.hdf5'
with h5py.File(train_h5_loc, 'r') as hf:
    train_image_features = np.array(hf.get('image_features'))
    train_spatials_features = np.array(hf.get('spatial_features'))
# np.save( open("/tmp/vqa/train_image_features", "wb"), train_image_features)
# np.save( open("/tmp/vqa/train_spatials_features", "wb"), train_spatials_features)

In [7]:
# %%time
# train_image_features = np.load(open("/tmp/vqa/train_image_features", "rb"))
# train_spatials_features = np.load(open("/tmp/vqa/train_spatials_features", "rb"))

CPU times: user 80 ms, sys: 11.7 s, total: 11.8 s
Wall time: 4min 25s


In [8]:
from dataset import HMQAFeatureDataset

# Build the train split of the dataset.
# The three metadata files are opened in a single `with` block so their
# handles are closed as soon as the dataset has been constructed (the bare
# open() calls used previously were never closed).
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# with trusted, locally generated pickles.
with open("./data/train36_imgid2idx.pkl", "rb") as imgid2idx_file, \
        open("./data/how_many_qa/qid2count.json", "rb") as qid2count_file, \
        open("./data/how_many_qa/qid2count2score.json", "rb") as qid2count2score_file:
    hmqa_train_dset = HMQAFeatureDataset(
        img_id2hqma_idx=pkl.load(imgid2idx_file),
        image_features=train_image_features,
        spatial_features=train_spatials_features,
        qid2count=json.load(qid2count_file),
        qid2count2score=json.load(qid2count2score_file),
        name="train",
        dictionary=dictionary,
    )

# The class name is removed from the namespace again after use.
# NOTE(review): presumably so that each cell re-imports the latest version of
# the class - confirm.
del HMQAFeatureDataset

In [9]:
# Size of the train dataset (displayed as the cell output).
len(hmqa_train_dset)

83642

In [10]:
# Number of distinct image ids referenced by the train entries.
len({entry["image_id"] for entry in hmqa_train_dset.entries})

45546

In [11]:
%%time
print('loading features from val hdf5 file')
val_h5_loc = './data/val36.hdf5'
with h5py.File(val_h5_loc, 'r') as hf:
    val_image_features = np.array(hf.get('image_features'))
    val_spatials_features = np.array(hf.get('spatial_features'))
# np.save( open("/tmp/vqa/val_image_features", "wb"), val_image_features)
# np.save( open("/tmp/vqa/val_spatials_features", "wb"), val_spatials_features)

In [12]:
# %%time
# val_image_features = np.load(open("/tmp/vqa/val_image_features", "rb"))
# val_spatials_features = np.load(open("/tmp/vqa/val_spatials_features", "rb"))

CPU times: user 44 ms, sys: 22.7 s, total: 22.7 s
Wall time: 1min 9s


In [13]:
# len(train_image_features)

from dataset import HMQAFeatureDataset

# Build the dev and test datasets. Both are constructed from the same val36
# feature arrays and the same metadata files; only `name` differs between the
# two calls.
# Each construction opens its metadata files inside a `with` block so the
# handles are closed afterwards (the bare open() calls used previously were
# never closed). The files are still read once per dataset, so the two
# datasets do not share the loaded mapping objects.
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# with trusted, locally generated pickles.
with open("./data/val36_imgid2idx.pkl", "rb") as imgid2idx_file, \
        open("./data/how_many_qa/qid2count.json", "rb") as qid2count_file, \
        open("./data/how_many_qa/qid2count2score.json", "rb") as qid2count2score_file:
    hmqa_dev_dset = HMQAFeatureDataset(
        img_id2hqma_idx=pkl.load(imgid2idx_file),
        image_features=val_image_features,
        spatial_features=val_spatials_features,
        qid2count=json.load(qid2count_file),
        qid2count2score=json.load(qid2count2score_file),
        name="dev",
        dictionary=dictionary,
    )

with open("./data/val36_imgid2idx.pkl", "rb") as imgid2idx_file, \
        open("./data/how_many_qa/qid2count.json", "rb") as qid2count_file, \
        open("./data/how_many_qa/qid2count2score.json", "rb") as qid2count2score_file:
    hmqa_test_dset = HMQAFeatureDataset(
        img_id2hqma_idx=pkl.load(imgid2idx_file),
        image_features=val_image_features,
        spatial_features=val_spatials_features,
        qid2count=json.load(qid2count_file),
        qid2count2score=json.load(qid2count2score_file),
        name="test",
        dictionary=dictionary,
    )

# The class name is removed from the namespace again after use.
del HMQAFeatureDataset

In [14]:
# Sizes of the dev and test datasets (displayed as the cell output).
len(hmqa_dev_dset), len(hmqa_test_dset)

(17714, 5000)

In [15]:
from torch.utils.data import DataLoader

# One loader per split: batches of 64, reshuffled on every pass, loaded in the
# main process (no worker subprocesses).
hmqa_train_loader = DataLoader(
    hmqa_train_dset, batch_size=64, shuffle=True, num_workers=0
)
hmqa_dev_loader = DataLoader(
    hmqa_dev_dset, batch_size=64, shuffle=True, num_workers=0
)
hmqa_test_loader = DataLoader(
    hmqa_test_dset, batch_size=64, shuffle=True, num_workers=0
)

In [55]:
def evaluate(model, hmqa_loader, max_count=20):
    """Score a counting model on one data split.

    The model's real-valued predictions are rounded to the nearest integer
    (by adding 0.5 and truncating) and clamped to ``[0, max_count]``. Two
    metrics are computed over every sample yielded by ``hmqa_loader``:

    * accuracy - the mean of the score looked up in ``c2s`` at the predicted
      count of each sample;
    * RMSE - the root mean squared error between the rounded prediction and
      the target count ``c``.

    Evaluation runs with gradient tracking disabled and with the model in
    eval mode; the mode the model was in on entry is restored before
    returning, even if an error occurs.

    Args:
        model: a ``torch.nn.Module`` called as ``model(v_emb, q)``.
        hmqa_loader: iterable yielding ``(v_emb, b, q, c, c2s)`` batches; the
            second element is not used here.
            NOTE(review): assumes ``c2s`` is a CPU tensor of shape
            ``(batch, n_counts)`` with ``n_counts > max_count``, as implied by
            the row-wise indexing this function used before - confirm against
            the dataset.
        max_count: largest count a prediction may be rounded to.

    Returns:
        ``(acc, rmse)`` as 0-dim tensors.

    Raises:
        ValueError: if ``hmqa_loader`` yields no batches.
    """
    was_training = model.training
    model.eval()

    batch_scores = []
    batch_sq_errs = []
    try:
        # No autograd graph is needed for scoring; skipping it saves memory.
        with torch.no_grad():
            for v_emb, _, q, c, c2s in hmqa_loader:
                if USE_CUDA:
                    v_emb = v_emb.cuda()
                    q = q.cuda()

                pred = model(v_emb, q)

                # Round half up, clamp to the valid range, and move the whole
                # batch to the CPU once instead of once per sample.
                nearest_pred = (pred + 0.5).long().clamp(0, max_count).view(-1).cpu()
                target = c.float().view(-1).cpu()

                batch_sq_errs.append((target - nearest_pred.float()) ** 2)
                # Pick, for every row, the score stored at its predicted count.
                batch_scores.append(
                    c2s.gather(1, nearest_pred.unsqueeze(1)).squeeze(1)
                )
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    if not batch_scores:
        raise ValueError("evaluate() received a loader that yielded no batches")

    acc = torch.cat(batch_scores).mean()
    rmse = torch.cat(batch_sq_errs).mean() ** 0.5

    return acc, rmse

In [67]:
from model import SoftCount
# Instantiate the SoftCount model for this run: 1024-d question encoding,
# 512-d scoring space, dropout probability 0.2.
model = SoftCount(ques_dim=1024, score_dim=512, dropout=0.2)
# The class name is removed from the namespace again after use.
del SoftCount

initialising with glove embeddings
done.


In [68]:
# Move the model to the GPU when USE_CUDA is set, then display its architecture
# as the cell output.
if USE_CUDA:
    model.cuda()
model

SoftCount(
  (ques_parser): QuestionParser(
    (embd): Embedding(20159, 300, padding_idx=20158)
    (rnn): GRU(300, 1024)
    (drop): Dropout(p=0.2)
  )
  (f): ScoringFunction(
    (v_drop): Dropout(p=0.2)
    (q_drop): Dropout(p=0.2)
    (v_proj): FCNet(
      (main): Sequential(
        (0): Linear(in_features=2048, out_features=512, bias=True)
        (1): LeakyReLU(negative_slope=0.01)
      )
    )
    (q_proj): FCNet(
      (main): Sequential(
        (0): Linear(in_features=1024, out_features=512, bias=True)
        (1): LeakyReLU(negative_slope=0.01)
      )
    )
    (s_drop): Dropout(p=0.2)
  )
  (W): Linear(in_features=512, out_features=1, bias=True)
)

In [69]:
# Baseline: score the freshly initialised (untrained) model on the test split.
# The model is still in training mode at this point, so dropout would be
# active during the measurement; switch to eval mode for the evaluation and
# back to training mode afterwards, as the training loop does.
model.eval()
test_acc, test_rmse = evaluate(model, hmqa_test_loader)
model.train()
test_acc, test_rmse

(tensor(1.00000e-03 *
        3.0400), tensor(15.4749))

In [70]:
# Adam optimizer over all model parameters, learning rate 3e-4.
opt = torch.optim.Adam(model.parameters(), lr=3e-4)

In [71]:
# Per-epoch metric histories; the training loop appends one value to each
# list after every epoch.
test_accs, test_rmses = [], []
dev_accs, dev_rmses = [], []

In [72]:
# Train for 20 epochs; after each epoch, score the model on dev and test and
# record the metrics in the history lists.
for epoch in range(20):
    for i, (v_emb, _, q, c, _) in enumerate(hmqa_train_loader):
        # Flatten the targets so they line up with the predictions in the loss.
        c = c.float().view(-1)

        if USE_CUDA:
            v_emb = v_emb.cuda()
            q = q.cuda()
            c = c.cuda()

        pred = model(v_emb, q)
        loss = F.smooth_l1_loss(pred, c)

        # Log the loss of every 100th batch.
        if i % 100 == 0:
            print("epoch = {}, i = {}, loss = {}".format(
                epoch, i, loss.item()))

        opt.zero_grad()
        loss.backward()
        # clip_grad_norm_ is the in-place replacement for the deprecated
        # clip_grad_norm; it caps the total gradient norm at 0.25.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)
        opt.step()

    print("evaluating model on dev and test...")

    # Disable dropout and gradient tracking while measuring.
    model.eval()
    with torch.no_grad():
        dev_acc, dev_rmse = evaluate(model, hmqa_dev_loader)
        print("dev_acc: {}, dev_rmse: {}".format(dev_acc, dev_rmse))
        test_acc, test_rmse = evaluate(model, hmqa_test_loader)
        print("test_acc: {}, test_rmse: {}".format(test_acc, test_rmse))
    model.train()

    test_accs.append(test_acc)
    test_rmses.append(test_rmse)
    dev_accs.append(dev_acc)
    dev_rmses.append(dev_rmse)
        

epoch = 0, i = 0, loss = 14.628400802612305




epoch = 0, i = 100, loss = 1.1435465812683105
epoch = 0, i = 200, loss = 1.0605370998382568
epoch = 0, i = 300, loss = 0.9011174440383911
epoch = 0, i = 400, loss = 1.6539443731307983
epoch = 0, i = 500, loss = 1.0184824466705322
epoch = 0, i = 600, loss = 1.2407264709472656
epoch = 0, i = 700, loss = 1.3670611381530762
epoch = 0, i = 800, loss = 1.5058083534240723
epoch = 0, i = 900, loss = 0.9516408443450928
epoch = 0, i = 1000, loss = 0.7743334174156189
epoch = 0, i = 1100, loss = 0.8096193671226501
epoch = 0, i = 1200, loss = 1.2751123905181885
epoch = 0, i = 1300, loss = 1.3818869590759277
evaluating model on dev and test...
dev_acc: 0.41113242506980896, dev_rmse: 2.9276204109191895
test_acc: 0.4032999873161316, test_rmse: 2.686856985092163
epoch = 1, i = 0, loss = 1.0717486143112183
epoch = 1, i = 100, loss = 0.7833133935928345
epoch = 1, i = 200, loss = 0.8350750207901001
epoch = 1, i = 300, loss = 0.9446524381637573
epoch = 1, i = 400, loss = 0.6972931623458862
epoch = 1, i = 5

epoch = 10, i = 600, loss = 0.4486617147922516
epoch = 10, i = 700, loss = 0.4893093705177307
epoch = 10, i = 800, loss = 0.8087934851646423
epoch = 10, i = 900, loss = 0.6663355827331543
epoch = 10, i = 1000, loss = 0.38856399059295654
epoch = 10, i = 1100, loss = 0.6736136674880981
epoch = 10, i = 1200, loss = 0.8147845268249512
epoch = 10, i = 1300, loss = 0.7092846632003784
evaluating model on dev and test...
dev_acc: 0.473275363445282, dev_rmse: 2.5685806274414062
test_acc: 0.4786800146102905, test_rmse: 2.3469128608703613
epoch = 11, i = 0, loss = 0.5136302709579468
epoch = 11, i = 100, loss = 0.6258996725082397
epoch = 11, i = 200, loss = 0.5074462294578552
epoch = 11, i = 300, loss = 0.8129042983055115
epoch = 11, i = 400, loss = 0.5542230606079102
epoch = 11, i = 500, loss = 0.5161349773406982
epoch = 11, i = 600, loss = 0.5896819233894348
epoch = 11, i = 700, loss = 0.6863290667533875
epoch = 11, i = 800, loss = 0.5083870887756348
epoch = 11, i = 900, loss = 0.326534032821655

In [73]:
# Rank the epochs from best to worst dev accuracy. Each row is
# (dev_acc, test_acc, test_rmse); rows are compared as tuples, so ties on
# dev accuracy fall through to the following fields.
top_dev_accs = list(zip(dev_accs, test_accs, test_rmses))
top_dev_accs.sort(reverse=True)
top_dev_accs

[(tensor(0.4835), tensor(0.4965), tensor(2.3054)),
 (tensor(0.4830), tensor(0.5041), tensor(2.3137)),
 (tensor(0.4822), tensor(0.4945), tensor(2.3373)),
 (tensor(0.4806), tensor(0.4864), tensor(2.3323)),
 (tensor(0.4791), tensor(0.4903), tensor(2.3581)),
 (tensor(0.4791), tensor(0.5011), tensor(2.3345)),
 (tensor(0.4788), tensor(0.4883), tensor(2.3233)),
 (tensor(0.4771), tensor(0.4764), tensor(2.3527)),
 (tensor(0.4733), tensor(0.4787), tensor(2.3469)),
 (tensor(0.4731), tensor(0.4937), tensor(2.3661)),
 (tensor(0.4718), tensor(0.4808), tensor(2.4389)),
 (tensor(0.4649), tensor(0.4721), tensor(2.4418)),
 (tensor(0.4612), tensor(0.4730), tensor(2.3175)),
 (tensor(0.4581), tensor(0.4670), tensor(2.4323)),
 (tensor(0.4572), tensor(0.4596), tensor(2.3268)),
 (tensor(0.4526), tensor(0.4577), tensor(2.3845)),
 (tensor(0.4525), tensor(0.4663), tensor(2.4992)),
 (tensor(0.4399), tensor(0.4389), tensor(2.5628)),
 (tensor(0.4387), tensor(0.4358), tensor(2.6740)),
 (tensor(0.4111), tensor(0.4033

In [74]:
# Report the epoch selected by dev accuracy: the first row of the ranking,
# together with the test metrics recorded for that same epoch.
best_dev_acc, corr_test_acc, corr_test_rmse = top_dev_accs[0]
print(
    "The best dev accuracy is {}. The corresponding test accuracy and test RMSE are {} and {} respectively"
    .format(best_dev_acc, corr_test_acc, corr_test_rmse)
)

The best dev accuracy is 0.4835158586502075. The corresponding test accuracy and test RMSE are 0.49654000997543335 and 2.3054285049438477 respectively
