# QA - gluon
* http://gluon.mxnet.io/chapter08_computer-vision/visual-question-answer.html

In [3]:
from __future__ import print_function
import numpy as np
import mxnet as mx
import mxnet.ndarray as F
import mxnet.contrib.ndarray as C
import mxnet.gluon as gluon
from mxnet.gluon import nn
from mxnet import autograd
import bisect
from IPython.core.display import display, HTML
import logging
logging.basicConfig(level=logging.INFO)
import os
from mxnet.test_utils import download
import json
from IPython.display import HTML, display

# Define the model

In [4]:
# Shared settings read by the model and data cells further down.
ctx = mx.cpu()  # device context on which arrays are allocated
gpus = 2  # presumably the number of devices a batch is split across -- confirm
out_dim = 10000  # output width requested from the count-sketch projection
# The FFT compute size is kept equal to the batch size.
batch_size = compute_size = 64

## In the first model, we will concatenate the image and question features and use a multilayer perceptron (MLP) to predict the answer

In [14]:
class Net1(gluon.Block):
    """Baseline model: concatenate two feature batches and classify with an MLP.

    The network L2-normalizes each of the two inputs, joins them along
    axis 1 and runs Dense(8192, relu) -> BatchNorm -> Dropout(0.3) ->
    Dense(1000).
    """

    def __init__(self, **kwargs):
        super(Net1, self).__init__(**kwargs)
        # Child layers are created inside the block's own name scope.
        with self.name_scope():
            self.bn = nn.BatchNorm()
            self.dropout = nn.Dropout(0.3)
            self.fc1 = nn.Dense(8192, activation='relu')
            self.fc2 = nn.Dense(1000)

    def forward(self, x):
        """Return the output of the final dense layer for one batch.

        ``x`` is indexed at positions 0 and 1 to obtain the two feature
        batches that get fused.
        """
        normalized = [F.L2Normalization(x[idx]) for idx in (0, 1)]
        fused = F.concat(*normalized, dim=1)
        hidden = self.dropout(self.bn(self.fc1(fused)))
        return self.fc2(hidden)


In [15]:
# Build the first model. Ending the cell with the bare name makes the
# notebook echo the block's layer summary.
net1 = Net1()
net1

Net1(
  (bn): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, in_channels=None)
  (dropout): Dropout(p = 0.3)
  (fc1): Dense(None -> 8192, Activation(relu))
  (fc2): Dense(None -> 1000, linear)
)

## In the second model, instead of linearly combining the image and text features, we use count sketch to estimate the outer product of the image and question features. This is also known as multimodal compact bilinear pooling (MCB).

In [None]:
class Net2(gluon.Block):
    """Model that fuses text and image features with compact bilinear pooling.

    Each input is L2-normalized and padded with ones to a common width,
    projected with its own count sketch, and the two sketches are combined
    as ``ifft(fft(cs1) * fft(cs2))``. The fused vector is classified by
    Dense(8192, relu) -> BatchNorm -> Dropout(0.3) -> Dense(1000).

    Reads the module-level ``out_dim`` (sketch output width) and
    ``compute_size`` (forwarded to the contrib FFT operators).

    NOTE: the hash tables are plain NumPy attributes, not Gluon parameters,
    so they are not included in ``collect_params()``. They must be stored
    separately if a trained model is to be reloaded in another process.
    """

    # Width both feature vectors are padded to; also the hash-table length.
    _SKETCH_IN_DIM = 3072

    def __init__(self, **kwargs):
        super(Net2, self).__init__(**kwargs)
        with self.name_scope():
            self.bn = nn.BatchNorm()
            self.dropout = nn.Dropout(0.3)
            self.fc1 = nn.Dense(8192, activation='relu')
            self.fc2 = nn.Dense(1000)
        # Draw the count-sketch hash tables once so every forward pass uses
        # the same projection; re-drawing them per call would hand the MLP
        # a different random embedding on every batch.
        table_shape = (1, self._SKETCH_IN_DIM)
        # s: random signs in {-1, +1}; h: random bucket index in [0, out_dim).
        self._s1 = np.random.randint(0, 2, table_shape) * 2 - 1
        self._h1 = np.random.randint(0, out_dim, table_shape)
        self._s2 = np.random.randint(0, 2, table_shape) * 2 - 1
        self._h2 = np.random.randint(0, out_dim, table_shape)

    def _pad_with_ones(self, feat):
        """Right-pad ``feat`` with ones along axis 1 up to ``_SKETCH_IN_DIM``.

        The row count and device are taken from ``feat`` itself, so the
        model does not depend on a global batch size or context.

        Raises:
            ValueError: if ``feat`` is already wider than the target width.
        """
        pad_width = self._SKETCH_IN_DIM - feat.shape[1]
        if pad_width < 0:
            raise ValueError(
                'feature width %d exceeds sketch input width %d'
                % (feat.shape[1], self._SKETCH_IN_DIM))
        if pad_width == 0:
            return feat
        padding = F.ones((feat.shape[0], pad_width), ctx=feat.context)
        return F.concat(feat, padding, dim=1)

    def forward(self, x):
        """Return the output of the final dense layer for one batch.

        ``x[0]`` is treated as the text features and ``x[1]`` as the image
        features.
        """
        text_data = self._pad_with_ones(F.L2Normalization(x[0]))
        image_data = self._pad_with_ones(F.L2Normalization(x[1]))

        # Move the fixed hash tables onto the same device as the data.
        device = text_data.context
        S1 = F.array(self._s1, ctx=device)
        H1 = F.array(self._h1, ctx=device)
        S2 = F.array(self._s2, ctx=device)
        H2 = F.array(self._h2, ctx=device)

        # Count sketch: each modality gets its own (s, h) pair.
        cs1 = C.count_sketch(data=image_data, s=S1, h=H1,
                             name='cs1', out_dim=out_dim)
        cs2 = C.count_sketch(data=text_data, s=S2, h=H2,
                             name='cs2', out_dim=out_dim)
        fft1 = C.fft(data=cs1, name='fft1', compute_size=compute_size)
        fft2 = C.fft(data=cs2, name='fft2', compute_size=compute_size)
        c = fft1 * fft2  # Elementwise product
        ifft1 = C.ifft(data=c, name='ifft1', compute_size=compute_size)

        # MLP
        z = self.fc1(ifft1)
        z = self.bn(z)
        z = self.dropout(z)
        z = self.fc2(z)
        return z

# Data Iterator

In [None]:
class VQAtrainIter(mx.io.DataIter):
    def __init__(self, img, sentences, answer, batch_size, buckets=None, )