In [14]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [15]:
import sys
import os.path
sys.path.append('/scratch/pramodkm/vqa/vqa_kazemi2017show/')

In [16]:
import math
import json

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.backends.cudnn as cudnn
from tqdm import tqdm

import config
import data
import model_IG
import utils
import numpy as np

In [17]:
%reload_ext autoreload
%autoreload 2

In [18]:
# Load pretrained model
log = torch.load('/scratch/pramodkm/vqa/vqa_kazemi2017show/logs/2017-08-04_00.55.19.pth')
tokens = len(log['vocab']['question']) + 1

net = torch.nn.DataParallel(model_IG.Net(tokens))
net.load_state_dict(log['weights'])

In [19]:
# Load vocabulary
with open(config.vocabulary_path, 'r') as fd:
    vocab_json = json.load(fd)
reverse_vocab_question = dict([(v, k) for k, v in vocab_json['question'].items()])
reverse_vocab_answer = dict([(v, k) for k, v in vocab_json['answer'].items()])

In [20]:
# Create embedding matrix
question_emb_lookup = log['weights']['module.text.embedding.weight']
embedding = nn.Embedding(question_emb_lookup.shape[0], question_emb_lookup.shape[1], padding_idx=0)
embedding.weight.data = question_emb_lookup

In [21]:
# Load validation data
val_loader = data.get_loader(val=True)
LOADER=val_loader
PREFIX="val"

In [22]:
# Image index dict
reverse_coco_idxs = dict([(v,k) for k, v in val_loader.dataset.coco_id_to_index.items()])

In [23]:
def get_answer(a):
    indices = torch.nonzero(a >= 3)
    if len(indices) == 0:
        return ''
    return '|'.join([reverse_vocab_answer[int(index)] for index in indices])

In [25]:
def scale_input(q_emb, num_batches=1, padding_emb=[]):
    """ Create scaled versions of input and stack along batch dimension
    q_emb shape = (q_length, emb_dim)
    """
    num_points = config.batch_size*num_batches
    scale = 1.0/num_points
    step = q_emb.unsqueeze(0) * scale
    ans = torch.cat([step*i for i in range(num_points)], dim=0)
    return ans, step.squeeze()

In [None]:
def compute_attributions(q_emb, q_len, v, idx, num_batches=5):
    ans = ''
    for batch_i in range(config.batch_size):
        scaled_q_emb, step = scale_input(q_emb[batch_i,:,:], num_batches=num_batches, embedding())
        diff = 0
        total_grads = 0
        for j in range(num_batches):
            repeated_q_len = torch.ones_like(q_len)*q_len[batch_i]
            repeated_v = torch.ones_like(v)*v[batch_i]
            scaled_answer, gradients = net(repeated_v, scaled_q_emb[j*config.batch_size:(j+1)*config.batch_size], repeated_q_len, compute_gradient=True, ans_index=answer[batch_i])
            # at this point, shape(gradients) = 128 x 23 x 300
            total_grads += torch.sum(gradients, dim=0)
            if j == 0:
                diff -= scaled_answer[0, answer[batch_i]]
                baseline_softmax = scaled_answer[0, :]
            if j == num_batches - 1:
                diff += scaled_answer[-1, answer[batch_i]]
        attributions = torch.sum(total_grads * step, dim = 1)
        area = torch.sum(attributions, dim=0)
        #print('--------------------------')
        #print('diff: ', float(diff))
        #print('area: ', float(area))
        if abs(float(diff) - float(area)) > 0.1:
            print('WARNING: attribution sanity check not matching up!! Diff = ', abs(float(diff) - float(area)))

        predicted_answer = reverse_vocab_answer[int(answer[batch_i])]
        correct_answer = get_answer(a[batch_i, :])
        _, baseline_topk_answers = baseline_softmax.topk(10)
        baseline_topk_answers = ', '.join([reverse_vocab_answer[int(i)] for i in baseline_topk_answers])
        
        if baseline_topk_answers[0] == predicted_answer:
            attributions = attributions*0
        
        #print('Predicted answer: ', predicted_answer)
        #print('Baseline top k answers : ', ' | '.join(baseline_topk_answers))
        #print('Prediction is correct?: ', int(acc[batch_i]))
        #print('Image ID: ', val_loader.dataset.coco_ids[int(idx[batch_i])])
        question_attrs = []
        for j, w in enumerate(q[batch_i, :]):
            if int(w) != 0:
                #print(reverse_vocab_question[int(w)], ': ', float(attributions[j]))
                question_attrs.append('|'.join([str(reverse_vocab_question[int(w)]), str(float(attributions[j]))]))
        tsv_string = [', '.join(question_attrs), baseline_topk_answers, predicted_answer, correct_answer, str(int(acc[batch_i])), str(val_loader.dataset.coco_ids[int(idx[batch_i])])]
        ans += '\t'.join(tsv_string) + '\n'
    return ans

In [None]:
tq = tqdm(LOADER, desc='{} E{:03d}'.format(PREFIX, 0), ncols=0)
net.eval()
answ = []
idxs = []
accs = []
num_iters = 0
batch_id = 0
attrs_tsv_string = ''
#outf = open('/scratch/pramodkm/vqa/tsv/temp_attrs.tsv', 'w')
for v, q, a, idx, q_len in tq:
        
        var_params = {
            'volatile': False,
            'requires_grad': False,        
        }
        q[3, 1] = vocab_json['question']['fast']
        q[3, 4] = vocab_json['question']['bricks']
        q[3, 5] = vocab_json['question']['speaking']
        v = Variable(v.cuda(async=True), **var_params)
        q = Variable(q.cuda(async=True), **var_params)
        a = Variable(a.cuda(async=True), **var_params)
        q_len = Variable(q_len.cuda(async=True), **var_params)
        
        q_emb = embedding(q)
        
        out = net(v, q_emb, q_len)            
        
        acc = utils.batch_accuracy(out.data, a.data).cpu()
        
        _, answer = out.data.cpu().max(dim=1)
        
#        attrs_tsv_string = compute_attributions(q_emb, q_len, v, idx, num_batches=5)
        
#        outf.write(attrs_tsv_string)
        
        answ.append(answer.view(-1))
        accs.append(acc.view(-1))
        idxs.append(idx.view(-1).clone())
        print(acc.mean())
        num_iters += 1
        batch_id += 1
        print(' '.join([reverse_vocab_question[int(w)] for w in q[3,:] if int(w)!=0]))
        print(acc[3])
        #if num_iters == 10:
        break
            
#outf.close()

answ = list(torch.cat(answ, dim=0))
accs = list(torch.cat(accs, dim=0))
idxs = list(torch.cat(idxs, dim=0))

In [None]:
int(indices[0])

## Visualization

In [26]:
from IPython.display import clear_output, display, Image, HTML
import seaborn
import matplotlib.pyplot as plt
import numpy as np

def visualize_attrs(tokens, attrs):
    html_text = ""
    for i, tok in enumerate(tokens):
        r,g,b = get_color(attrs[i])
        html_text += " <strong><span style='size:16;color:rgb(%d,%d,%d)'>%s</span></strong>" % (r, g, b, tok)
    return html_text

def get_latex(tokens, attrs):
    ans = ""
    for i, tok in enumerate(tokens):
        [r,g,b] = [w/256.0 for w in get_color(attrs[i])]
        ans += " {\color[rgb]{%f,%f,%f}%s}" % (r, g, b, tok)
    return ans

# normalize attrs to be between -1 to 1. 
def normalize_attrs(attrs):
    bound = max(abs(attrs.max()), abs(attrs.min()))
    return attrs/bound

# get color for attr between -1 to 1
def get_color(attr):
    if attr > 0:
        return int(128*attr) + 127, 128 - int(64*attr), 128 - int(64*attr)
    return 128 + int(64*attr), 128 + int(64*attr), int(-128*attr) + 127

In [106]:
def make_visualization_html(tsv_filename, html_filename):
    html_str = '<head><link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous"></head>'
    html_str += '<body> <div class="container"> <h3> Visualizations of the attributions for the Visual QA network <br> <small> Red indicates high values, blue and gray indicates low values <br> A green (or red) block before the question indicates whether the network got the answer right (or wrong)</small></h3></div><br>'
    with open(tsv_filename) as f, open(html_filename,'w') as outf:
        html_str += '<div class="container">'
        html_str += '-'*40 + '<br>'
        outf.write(html_str)
        for line in f:
            line = line.strip()
            question_attrs, baseline_topk_answers, predicted_answer, correct_answer, is_correct, image_id = line.split('\t')
            question_tokens = []
            attrs = []
            for word_attr in question_attrs.split(','):
                word, attr = word_attr.split('|')
                question_tokens.append(word)
                attrs.append(float(attr))
            print('-'*40)
            html_str = visualize_attrs(question_tokens, normalize_attrs(np.array(attrs)))
            
            #plt.figure(figsize=(16,0.5))
            #seaborn.heatmap([normalize_attrs(np.array(attrs))], annot=np.array([question_tokens]), annot_kws={"size": 14}, fmt='', xticklabels=False, yticklabels=False, cbar=False)
            #plt.show()
            if is_correct == '1': 
                html_str = '<span style="background-color:green">&nbsp&nbsp</span> ' + html_str
            else:
                html_str = '<span style="background-color:red">&nbsp&nbsp</span> ' + html_str
            #display(HTML(html_str))
            #html_str += '<br>(prediction, ground truth) = (' + predicted_answer + ', ' + correct_answer + ')'
            html_str += '<br>prediction :' + predicted_answer
            #html_str += '<br>baseline top answers: ' + baseline_topk_answers
            #html_str += '<br>diff : %.3f' % (sum(attrs))
            #html_str += '<br>image ID: ' + str(image_id)
            html_str += '<br><img src="val2014/COCO_val2014_' + '0'*(12 - len(str(image_id))) + str(image_id) + '.jpg" width="256" height="256"></img><br><br>' 
            #display(Image('/scratch/pramodkm/vqa/data_vqa1.0/val2014/COCO_val2014_' + '0'*(12 - len(str(image_id))) + str(image_id) + '.jpg', width=256, height=256))
            outf.write(html_str + '\n')
        outf.write('</div></body>')

In [107]:
make_visualization_html(tsv_filename = '/scratch/pramodkm/vqa/tsv/temp_attrs.tsv', 
                       html_filename = '/scratch/pramodkm/vqa/tsv/temp_attrs2.html')

----------------------------------------
----------------------------------------
----------------------------------------
----------------------------------------
----------------------------------------
----------------------------------------
----------------------------------------
----------------------------------------
----------------------------------------
----------------------------------------
----------------------------------------
----------------------------------------
----------------------------------------
----------------------------------------
----------------------------------------
----------------------------------------
----------------------------------------
----------------------------------------
----------------------------------------
----------------------------------------
----------------------------------------
----------------------------------------
----------------------------------------
----------------------------------------
----------------

ValueError: not enough values to unpack (expected 2, got 1)

## Attack by prefixing sentences

In [40]:
tq = tqdm(LOADER, desc='{} E{:03d}'.format(PREFIX, 0), ncols=0)
net.eval()
answ = []
idxs = []
accs = []
num_iters = 0
batch_id = 0

#prefix_phrase = 'in not a lot of words'
#prefix_phrase = 'what is the answer to'
prefix_phrase = 'in not many words'
#prefix_phrase = 'answer this'
prefix = []
for word in prefix_phrase.split():
    prefix.append(vocab_json['question'][word])
    
prefix = torch.LongTensor(prefix) 
prefix = prefix.unsqueeze(0).repeat(config.batch_size,1)
attrs_tsv_string = ''
for v, q, a, idx, q_len in tq:
        var_params = {
            'volatile': False,
            'requires_grad': False,
        }

        q = torch.cat([prefix, q], dim=1)[:,:23]
        q = q.contiguous()
        q_len = q_len + prefix.shape[1]
        q_len = torch.min(q_len, torch.LongTensor([23]).expand_as(q_len))
        
        v = Variable(v.cuda(async=True), **var_params)
        q = Variable(q.cuda(async=True), **var_params)
        a = Variable(a.cuda(async=True), **var_params)
        q_len = Variable(q_len.cuda(async=True), **var_params)
        
        q_emb = embedding(q)
        
        out = net(v, q_emb, q_len)            
        
        acc = utils.batch_accuracy(out.data, a.data).cpu()
        
        _, answer = out.data.cpu().max(dim=1)
        
#        attrs_tsv_string += compute_attributions(q_emb, q_len, v, idx, num_batches=5)

        answ.append(answer.view(-1))
        accs.append(acc.view(-1))
        idxs.append(idx.view(-1).clone())
        print('===============')
        print(acc.mean())
        print('===============')
        num_iters += 1
        batch_id += 1
        if num_iters == 5:
            break
            
#if attrs_tsv_string:
#    with open('/scratch/pramodkm/vqa/tsv/attack_what_is_the_answer_to_attrs.tsv', 'w') as outf:
#        outf.write(attrs_tsv_string)

answ = list(torch.cat(answ, dim=0))
accs = list(torch.cat(accs, dim=0))
idxs = list(torch.cat(idxs, dim=0))


val E000:   0% 0/950 [00:00<?, ?it/s][A
  attention = F.softmax(attention)
val E000:   0% 3/950 [00:01<09:21,  1.69it/s]

0.3273437521420419
0.36328125139698386
0.3328125011175871
0.28906250186264515
0.2781250011175871


In [51]:
accsx = []
for a1, a2, a3 in zip(accs1, accs2, accs3):
    if a1 == 0 or a2 == 0 or a3 == 0:
        accsx.append(0)
    else:
        accsx.append(np.min([a1,a2,a3]))
        

In [52]:
np.mean(accsx)

0.16156250089406968

In [None]:
make_visualization_html(tsv_filename = '/scratch/pramodkm/vqa/tsv/attack_what_is_the_answer_to_attrs.tsv', 
                       html_filename = '/scratch/pramodkm/vqa/tsv/attack_what_is_the_answer_to_attrs.html')

## Whitelist attack

In [None]:
from collections import Counter

In [None]:
tsv_filename = '/scratch/pramodkm/vqa/tsv/attrs.tsv'
counts_list = []
top_k = 1
with open(tsv_filename) as f:
    for line in f:
        line = line.strip()
        question_attrs, predicted_answer, correct_answer, is_correct, image_id = line.split('\t')
        question_tokens = []
        attrs = []
        for word_attr in question_attrs.split(','):
            if len(word_attr.split('|')) < 2:
                print('skipped')
                continue
            word, attr = word_attr.split('|')
            question_tokens.append(word)
            attrs.append(float(attr))
        k = min(top_k, len(question_tokens))
        counts_list.extend([question_tokens[i].strip() for i in np.argpartition(attrs, -k)[-k:]])

In [None]:
whitelist = set([vocab_json['question'][w] for w, c in Counter(counts_list).most_common(15)])

In [None]:
unattributed_words = set(vocab_json['question'].keys()) - set(counts_list)

In [None]:
', '.join([reverse_vocab_question[i] for i in whitelist])
np.unique(np.floor(np.geomspace(1, len(Counter(counts_list)), 25)))

In [None]:
curve_data = []
for K in np.unique(np.floor(np.geomspace(1, len(Counter(counts_list)), 25))):
    whitelist = set([vocab_json['question'][w] for w, c in Counter(counts_list).most_common(int(K))])
    tq = tqdm(LOADER, desc='{} E{:03d}'.format(PREFIX, 0), ncols=0)
    net.eval()
    answ = []
    idxs = []
    accs = []
    num_iters = 0
    batch_id = 0
    attrs_tsv_string = ''
    avg_question_length_orig = 0
    avg_question_length_new = 0
    num_questions = 0
    html_str = '<head><link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous"></head>'
    html_str += '<body>'
    html_str += '<table class="container">'
    for v, q, a, idx, q_len in tq:
            var_params = {
                'volatile': False,
                'requires_grad': False,
            }


            old_q = np.asarray(q).copy()
            old_q_len = np.asarray(q_len).copy()

            new_q = np.zeros([config.batch_size, 23])
            for batch_i in range(config.batch_size):
                len_counter = 0
                avg_question_length_orig += int(q_len[batch_i])
                for word_i, w in enumerate(q[batch_i,:int(q_len[batch_i])]):
                    if int(w) == 0 or int(w) in whitelist:
                        new_q[batch_i,word_i] = int(w)
                        len_counter += 1
                if len_counter == 0:
                    len_counter = 1
                avg_question_length_new += int(len_counter)
                num_questions += 1
                q_len[batch_i] = len_counter
            q_len, sorted_idxs = torch.sort(q_len, descending=True)
            new_q = new_q[sorted_idxs, :]
            idx = idx[sorted_idxs]
            v = v[sorted_idxs,:,:,:]
            a = a[sorted_idxs, :]
            old_q = old_q[sorted_idxs, :]
            old_q_len = old_q_len[sorted_idxs]
            q = torch.LongTensor(new_q)

            v = Variable(v.cuda(async=True), **var_params)
            q = Variable(q.cuda(async=True), **var_params)
            a = Variable(a.cuda(async=True), **var_params)
            q_len = Variable(q_len.cuda(async=True), **var_params)

            q_emb = embedding(q)

            out = net(v, q_emb, q_len)            

            acc = utils.batch_accuracy(out.data, a.data).cpu()

            _, answer = out.data.cpu().max(dim=1)


            for batch_i in range(config.batch_size):
                html_str += "<tr>"
                old_question = ' '.join([reverse_vocab_question[int(i)] for i in old_q[batch_i, :] if int(i)!=0])
                new_question = ' '.join([reverse_vocab_question[int(i)] for i in q[batch_i, :] if int(i)!=0])
                is_correct = float(acc[batch_i])==1.0
                #print(old_question, new_question, is_correct)
                html_str += "<td>" + old_question + "</td>"
                html_str += "<td><span style='color:" + ("green" if is_correct else "red") + "'>" + new_question + "</span></td>"
                html_str += "</tr>"
            answ.append(answer.view(-1))
            accs.append(acc.view(-1))
            idxs.append(idx.view(-1).clone())
            #print(acc.mean())
            num_iters += 1
            batch_id += 1
            if num_iters == 5:
                break
    html_str += "</table></body>"
    #with open('/scratch/pramodkm/vqa/tsv/whitelist_attack.html','w') as outf:
    #    outf.write(html_str)

    print("avg question length orig: ", float(avg_question_length_orig)/num_questions)
    print("avg question length new: ", float(avg_question_length_new)/num_questions)
    answ = list(torch.cat(answ, dim=0))
    accs = list(torch.cat(accs, dim=0))
    idxs = list(torch.cat(idxs, dim=0))
    print(sum(np.asarray(accs))/len(accs))
    curve_data.append((K,sum(np.asarray(accs))/len(accs)))

In [None]:
sum(np.asarray(accs) == 1.0)/len(accs)

In [None]:
with open('/scratch/pramodkm/vqa/tsv/whitelist_curve.tsv','w') as outf:
    for k, v in curve_data:
        outf.write(str(k) + '\t' + str(v) + '\n')

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.plot([w[0] for w in curve_data], [w[1] for w in curve_data])
plt.xscale('log')
plt.xlabel('num. words in vocab')
plt.ylabel('accuracy')
plt.savefig('/scratch/pramodkm/vqa/tsv/whitelist_attack_curve.eps', format='eps')

In [None]:
curve_data

In [None]:
Counter(counts_list).most_common(50)

## Ablation attack

In [None]:
import spacy
from collections import Counter

In [None]:
tsv_filename = '/scratch/pramodkm/vqa/tsv/attrs.tsv'
counts_list = []
top_k = 5
with open(tsv_filename) as f:
    for line in f:
        line = line.strip()
        question_attrs, predicted_answer, correct_answer, is_correct, image_id = line.split('\t')
        question_tokens = []
        attrs = []
        for word_attr in question_attrs.split(','):
            if len(word_attr.split('|')) < 2:
                print('skipped')
                continue
            word, attr = word_attr.split('|')
            question_tokens.append(word)
            attrs.append(float(attr))
        k = min(top_k, len(question_tokens))
        counts_list.extend([question_tokens[i].strip() for i in np.argpartition(attrs, -k)[-k:]])

In [None]:
unattributed_words = set(vocab_json['question'].keys()) - set(counts_list)

In [None]:
list(unattributed_words)[-10:]

In [None]:
nlp = spacy.load('en')
sent = "how symmetrical are the white bricks on either side of the building"
doc=nlp(sent)

sub_toks = [tok for tok in doc if (tok.dep_ == "nobj") ]

print(sub_toks)

In [None]:
[d.dep_ for d in doc]

In [None]:
tq = tqdm(LOADER, desc='{} E{:03d}'.format(PREFIX, 0), ncols=0)
net.eval()
answ = []
idxs = []
accs = []
num_iters = 0
batch_id = 0
attrs_tsv_string = ''
for v, q, a, idx, q_len in tq:
        
        var_params = {
            'volatile': False,
            'requires_grad': False,        
        }
        out_string = ''
        for i in range(config.batch_size):
            if len(np.nonzero(a[i, :] >= 3))==0:
                continue
            answers = [reverse_vocab_answer[int(w)] for w in np.nonzero(a[i, :] >= 3)]
            if 'yes' in answers or 'no' in answers:
                continue
            string_question = [reverse_vocab_question[int(w)] if int(w) != 0 else '' for w in q[i, :]]
            out_string += '-'*50 + '\n'
            out_string += 'orig: ' + ' '.join(string_question) + '\n'
            out_string += 'answers: ' + ' '.join(answers) + '\n'
            doc = nlp(' '.join(string_question))
            pos_tags = [d.dep_ for d in doc]
            #print(pos_tags)
            subject_index = [i for i, t in enumerate(pos_tags) if 'nsubj' in t]
            if len(subject_index) == 0:
                continue
            q[i, subject_index[0]] = vocab_json['question']['civilian']
            string_question = [reverse_vocab_question[int(w)] if int(w) != 0 else '' for w in q[i, :]]
            out_string += 'ablated: ' + ' '.join(string_question) + '\n'

            
        v = Variable(v.cuda(async=True), **var_params)
        q = Variable(q.cuda(async=True), **var_params)
        a = Variable(a.cuda(async=True), **var_params)
        q_len = Variable(q_len.cuda(async=True), **var_params)
        
        q_emb = embedding(q)
        
        out = net(v, q_emb, q_len)            
        
        acc = utils.batch_accuracy(out.data, a.data).cpu()
        
        _, answer = out.data.cpu().max(dim=1)
        
#        attrs_tsv_string = compute_attributions(q_emb, q_len, v, idx, num_batches=5)
        
#        outf.write(attrs_tsv_string)
        #for i in range(config.batch_size):
            #if int(acc[i]) >= 1.0:
                #print(out_string)
        
        answ.append(answer.view(-1))
        accs.append(acc.view(-1))
        idxs.append(idx.view(-1).clone())
        print(acc.mean())
        num_iters += 1
        batch_id += 1
        #print(' '.join([reverse_vocab_question[int(w)] for w in q[3,:] if int(w)!=0]))
        #print(acc[3])
        if num_iters == 5:
            break
            
#outf.close()

answ = list(torch.cat(answ, dim=0))
accs = list(torch.cat(accs, dim=0))
idxs = list(torch.cat(idxs, dim=0))

print('final: ' + str(np.mean(accs)))

In [None]:
reverse_vocab_answer[int(np.nonzero(a[0, :] > 3))]

In [None]:
np.nonzero(a[i, :] >= 3)

## Image specific bias

In [66]:
import json
import scipy.stats as stats
from collections import Counter

In [54]:
json_data=open('/scratch/pramodkm/vqa/data_vqa1.0/OpenEnded_mscoco_val2014_questions.json').read()
data = json.loads(json_data)

In [87]:
data['questions']

[{'image_id': 350623,
  'question': 'What is the table made of?',
  'question_id': 3506232},
 {'image_id': 350623,
  'question': 'Is the food napping on the table?',
  'question_id': 3506230},
 {'image_id': 350623,
  'question': 'What has been upcycled to make lights?',
  'question_id': 3506231},
 {'image_id': 8647,
  'question': 'Is this an Spanish town?',
  'question_id': 86472},
 {'image_id': 8647,
  'question': 'Are there shadows on the sidewalk?',
  'question_id': 86470},
 {'image_id': 8647,
  'question': 'What is in the top right corner?',
  'question_id': 86471},
 {'image_id': 434410,
  'question': 'Is it cold outside?',
  'question_id': 4344100},
 {'image_id': 434410,
  'question': 'What is leaning against the house?',
  'question_id': 4344101},
 {'image_id': 434410,
  'question': 'How many windows can you see?',
  'question_id': 4344102},
 {'image_id': 56205, 'question': 'Is this in a park?', 'question_id': 562050},
 {'image_id': 56205,
  'question': 'Is there a bicycle in thi

In [59]:
json_data=open('/scratch/pramodkm/vqa/data_vqa1.0/mscoco_val2014_annotations.json').read()
annot_data = json.loads(json_data)

In [79]:
annot_data['annotations']

[{'answer_type': 'other',
  'answers': [{'answer': 'wood', 'answer_confidence': 'yes', 'answer_id': 1},
   {'answer': 'wood', 'answer_confidence': 'yes', 'answer_id': 2},
   {'answer': 'wood', 'answer_confidence': 'yes', 'answer_id': 3},
   {'answer': 'wood', 'answer_confidence': 'yes', 'answer_id': 4},
   {'answer': 'wood', 'answer_confidence': 'yes', 'answer_id': 5},
   {'answer': 'wood', 'answer_confidence': 'yes', 'answer_id': 6},
   {'answer': 'wood', 'answer_confidence': 'yes', 'answer_id': 7},
   {'answer': 'wood', 'answer_confidence': 'maybe', 'answer_id': 8},
   {'answer': 'wood', 'answer_confidence': 'yes', 'answer_id': 9},
   {'answer': 'wood', 'answer_confidence': 'yes', 'answer_id': 10}],
  'image_id': 350623,
  'multiple_choice_answer': 'wood',
  'question_id': 3506232,
  'question_type': 'what is the'},
 {'answer_type': 'yes/no',
  'answers': [{'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 1},
   {'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 2},
  

In [90]:
image_ans = dict()
for ans_annot in annot_data['annotations']:
    turk_answers = [ans['answer'] for ans in ans_annot['answers']]
    if ans_annot['image_id'] not in image_ans:
        image_ans[ans_annot['image_id']] = [turk_answers]
    else:
        image_ans[ans_annot['image_id']].append(turk_answers)

In [81]:
image_ans

{67213: ['6', 'yes', 'dog'],
 458755: ['sheep', '1', 'petting zoo'],
 262148: ['picnic table', 'down', 'watching'],
 578902: ['4', 'buildings', 'green'],
 410265: ['2', 'no', 'no'],
 393225: ['yes', 'chopsticks', 'foodiebakercom'],
 393226: ['ice cream', '1', 'walking'],
 109229: ['frisbee', 'leaves', '1'],
 458768: ['lights', 'yes', 'yes'],
 131089: ['yes', 'no', 'green'],
 262162: ['7', 'yes', 'chair'],
 281512: ['policeman', 'white', 'yes'],
 327701: ['yes', '2', 'tennis'],
 458778: ['kitchen', 'black', 'white'],
 393243: ['donut', 'yes', 'blonde'],
 458781: ['laptop', 'iphone', 'bike racing'],
 65567: ['cloudy', 'safford', 'divider'],
 10928: ['2', 'new york', 'usa'],
 131108: ['inward', 'no', 'yes'],
 458790: ['water', 'yes', 'yes'],
 196650: ['2', 'giraffe', 'no'],
 131115: ['no', '3', 'yes'],
 316429: ['smoke', 'airplanes', 'v formation'],
 262189: ['ross st', '4', 'white'],
 122208: ['12', 'yes', 'yes'],
 393266: ['no', 'green', 'red'],
 393267: ['wine tasting', 'wine', 'black'

In [95]:
def visualize_baseline_answers(tokens, attrs, image_ans):
    html_text = ""
    count = 0
    for i, tok in enumerate(tokens):
        r,g,b = get_color(attrs[i])
        val = []
        for ans in image_ans:
            val += [sum(tok == np.array(ans))]
        if sum(np.array(val)>=3)>0:
            tok = '<u>' + tok + '</u>'
            count += 1
        html_text += "<span style='size:16;color:rgb(%d,%d,%d)'>%s</span>, " % (r, g, b, tok)
    return html_text, count

In [99]:
tq = tqdm(LOADER, desc='{} E{:03d}'.format(PREFIX, 0), ncols=0)
net.eval()
answ = []
idxs = []
accs = []
num_iters = 0
batch_id = 0
outf = open('/scratch/pramodkm/vqa/tsv/baseline_answers.html','w')
html_str = '<html><head><link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous"></head>'
html_str += '<body><div class="container"> <h3> Top 15 answer classes for each image </h3><br>Generated by passing an empty question to the network. Underlined classes appear as answers to questions on the image. <br>'
html_str += '<table class="table">'
outf.write(html_str)

question = 'what color besides blue is there'
question_tokens = torch.LongTensor([vocab_json['question'][w] for w in question.strip().split()] + [0]*(23-len(question.strip().split())))
question_tokens = question_tokens.unsqueeze(0).repeat(config.batch_size,1)
new_q_len = torch.LongTensor([len(question.strip().split())])
new_q_len = new_q_len.repeat(config.batch_size)

covered_image_ids = set()

avg_count = []
for v, q, a, idx, q_len in tq:
        
        var_params = {
            'volatile': False,
            'requires_grad': False,        
        }
        
        q = question_tokens
            
        q_len = new_q_len
        
        v = Variable(v.cuda(async=True), **var_params)
        q = Variable(q.cuda(async=True), **var_params)
        a = Variable(a.cuda(async=True), **var_params)
        q_len = Variable(q_len.cuda(async=True), **var_params)
        
        q_emb = embedding(q)
                
        out = net(v, q_emb, q_len)    
        
        softmax = torch.nn.functional.softmax(out)
        
        acc = utils.batch_accuracy(out.data, a.data).cpu()
        
        _, answer = out.data.cpu().max(dim=1)

        # for baseline answers
        baseline_q = q * 0   
            
        baseline_q_len = q_len/q_len
        
        baseline_q_emb = embedding(baseline_q)
                
        baseline_out = net(v, baseline_q_emb, baseline_q_len)    
        
        baseline_softmax = torch.nn.functional.softmax(baseline_out)


        for batch_i in range(config.batch_size):
            baseline_probs, baseline_idxs = baseline_softmax[batch_i, :].sort(descending=True)
            baseline_answers = [reverse_vocab_answer[int(ix)] for ix in baseline_idxs]
            
            baseline_probs = [float(prob) for prob in baseline_probs]
            print_k = 15
            #outf.write('... ' + visualize_baseline_answers(baseline_answers[-10:], baseline_probs[-10:]))
            image_id = str(val_loader.dataset.coco_ids[int(idx[batch_i])])
            if image_id in covered_image_ids:
                continue
            covered_image_ids.add(image_id)
            #outf.write('<br>Question: ' + ' '.join([reverse_vocab_question[int(w)] for w in q[batch_i, :] if int(w)!=0]))
            #outf.write('<br>Pred. ans.: ' + reverse_vocab_answer[answer[batch_i]])
            outf.write('<br><tr><td><img src="val2014/COCO_val2014_' + '0'*(12 - len(str(image_id))) + str(image_id) + '.jpg" width="256" height="256"></img></td>')
            vis_string, count = visualize_baseline_answers(baseline_answers[:print_k], baseline_probs[:print_k], image_ans[int(image_id)])
            avg_count.append(count)
            outf.write('<td>' + vis_string + '<br> #classes appearing as answers: ' + str(count) + '</td></tr>')
            
            outf.write('<hr>')
                
        answ.append(answer.view(-1))
        accs.append(acc.view(-1))
        idxs.append(idx.view(-1).clone())
        num_iters += 1
        batch_id += 1
        if num_iters == 1:
            break
            
outf.write('</table></div></body></html>')
outf.close()

print(np.mean(avg_count))
answ = list(torch.cat(answ, dim=0))
accs = list(torch.cat(accs, dim=0))
idxs = list(torch.cat(idxs, dim=0))


val E000:   0% 0/950 [00:00<?, ?it/s][A
  attention = F.softmax(attention)


1.60465116279


In [89]:
vocab_json['answer']['wood']

20

In [None]:
tq = tqdm(LOADER, desc='{} E{:03d}'.format(PREFIX, 0), ncols=0)
net.eval()
answ = []
idxs = []
accs = []
num_iters = 0
batch_id = 0
outf = open('/scratch/pramodkm/vqa/tsv/baseline_answers.html','w')
html_str = '<head><link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous"></head>'
html_str += '<body><div class="container"> '
outf.write(html_str)

question = 'how many are not'
question_tokens = torch.LongTensor([vocab_json['question'][w] for w in question.strip().split()] + [0]*(23-len(question.strip().split())))
question_tokens = question_tokens.unsqueeze(0).repeat(config.batch_size,1)
new_q_len = torch.LongTensor([len(question.strip().split())])
new_q_len = new_q_len.repeat(config.batch_size)

batch_percentages = np.zeros(3000)
for v, q, a, idx, q_len in tq:
        
        var_params = {
            'volatile': False,
            'requires_grad': False,        
        }
                
        v = Variable(v.cuda(async=True), **var_params)
        q = Variable(q.cuda(async=True), **var_params)
        question_tokens = Variable(question_tokens.cuda(async=True), **var_params)
        a = Variable(a.cuda(async=True), **var_params)
        q_len = Variable(q_len.cuda(async=True), **var_params)
        new_q_len = Variable(new_q_len.cuda(async=True), **var_params)
        
        
        # for baseline answers
        baseline_q = q * 0   
            
        baseline_q_len = q_len/q_len
        
        baseline_q_emb = embedding(baseline_q)
                
        baseline_out = net(v, baseline_q_emb, baseline_q_len)    
        
        baseline_softmax = torch.nn.functional.softmax(baseline_out)

        
        test_k = 300          
        test_q = question_tokens


        batch_baseline_answers = []
        for batch_i in range(config.batch_size):
            baseline_probs, baseline_idxs = baseline_softmax[batch_i, :].sort(descending=True)
            baseline_answers = [reverse_vocab_answer[int(ix)] for ix in baseline_idxs]
            
            counter = 4
            for ba in baseline_answers[2:]:
                if counter == 7:
                    break
                if ba not in vocab_json['question']:
                    continue
                test_q[batch_i, counter] = vocab_json['question'][ba]
                counter += 1
            test_out = net(v, embedding(test_q), new_q_len)
            _, answer = test_out.data.cpu().max(dim=1)

        for batch_i in range(config.batch_size):
            baseline_probs, baseline_idxs = baseline_softmax[batch_i, :].sort(descending=True)
            baseline_answers = [reverse_vocab_answer[int(ix)] for ix in baseline_idxs]
            
            baseline_probs = [float(prob) for prob in baseline_probs]
            print_k = 100
            outf.write(visualize_baseline_answers(baseline_answers[:print_k], baseline_probs[:print_k]))
            outf.write('... ' + visualize_baseline_answers(baseline_answers[-10:], baseline_probs[-10:]))
            image_id = str(val_loader.dataset.coco_ids[int(idx[batch_i])])
            outf.write('<br>Question: ' + ' '.join([reverse_vocab_question[int(w)] for w in test_q[batch_i, :] if int(w)!=0]))
            outf.write('<br>Pred. ans.: ' + reverse_vocab_answer[answer[batch_i]])
            outf.write('<br><img src="val2014/COCO_val2014_' + '0'*(12 - len(str(image_id))) + str(image_id) + '.jpg" width="256" height="256"></img><br><br>')
            outf.write('<hr>')
                      
        

        answ.append(answer.view(-1))
        accs.append(acc.view(-1))
        idxs.append(idx.view(-1).clone())
        num_iters += 1
        batch_id += 1
        if num_iters == 1:
            break
            
outf.write('</div></body>')
outf.close()
answ = list(torch.cat(answ, dim=0))
accs = list(torch.cat(accs, dim=0))
idxs = list(torch.cat(idxs, dim=0))

In [None]:
plt.plot(batch_percentages[:300]/300)

In [None]:
batch_percentages/300

In [None]:
vocab_json['question']['rooster']

In [None]:
set(vocab_json['question'].keys()) - set(vocab_json['answer'].keys())