In [1]:
import sys
import os
import os.path
import random
import collections
import shutil
import time
import glob
import logging
import numpy as np
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.trainer as trainer
import torch.utils.trainer.plugins
import torch.utils.data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torchvision import models

from PIL import Image


torch.manual_seed(1)
model = models.resnet152(pretrained=True)
use_cuda = True
# print model

In [2]:
class Affine2D(nn.Module):
    def __init__(self, L, M, N):
        super(Affine2D, self).__init__()
        
        self.weight = nn.Parameter(torch.Tensor(M, N))
        self.bias = nn.Parameter(torch.Tensor(L, N))
        
        self.weight.data.uniform_(-0.1, 0.1)
        self.bias.data.uniform_(-0.1, 0.1)
        
        self.L = L
        self.M = M
        self.N = N
        
    #   expect a 3d tensor : Batch * L * M
    def forward(self, input):
        input = input.contiguous().view(-1, self.M)
        input = torch.mm(input, self.weight)
        input = input.view(-1, self.L, self.N)
        input += self.bias.unsqueeze(0).expand_as(input)
        return input

In [3]:
class BoW(nn.Module):
    def __init__(self, T, N):
        super(BoW, self).__init__()
        
        self.weight = nn.Parameter(torch.Tensor(1, T))
        self.bias = nn.Parameter(torch.Tensor(1, N))
        
        self.weight.data.uniform_(-0.1, 0.1)
        self.bias.data.uniform_(-0.1, 0.1)
        
        self.T = T
        self.N = N
        
    #   expect a 3d sentence tensor : Batch * T * N
    def forward(self, input):
        output = autograd.Variable(torch.Tensor(input.size()[0], input.size()[2]))
        for b in xrange(input.size()[0]):
            output[b, :] = (torch.mm(self.weight, input[b, :, :]) + self.bias).squeeze()
        return output

In [4]:
class SMN(nn.Module):
    def __init__(self, model, hop, L, T, M, N, K, dic_size):
        super(SMN, self).__init__()

        # load resnet
        modules = list(model.children())
        modules.pop()
        modules.pop()
        modules.pop()
        self.features = nn.Sequential(*modules)
        for param in self.features.parameters():
            param.require_grad = False

        self.L = L
        self.T = T
        self.M = M
        self.N = N
        self.K = K
        self.dic_size = dic_size
        
        self.hop = hop
        
        # word embedding
        self.embedding = nn.Embedding(self.dic_size, self.N)

        self.attention = nn.Sequential(
            Affine2D(self.L, self.M, self.N)
        )
        self.evidence = nn.Sequential(
            Affine2D(self.L, self.M, self.N)
        )
        self.BoW = BoW(self.T, self.N)
        self.linear = nn.Linear(self.N, self.K)
        
        
    def forward(self, img, question):
        #   extract features from resNet
        S = self.features(img)
        
        # embedding
        V = self.embedding(question)
        
        #   word attention
        S = ((S.view(-1, self.M, self.L)).permute(0, 2, 1))
        att = self.attention(S)
        
        batch_size = att.size()[0]
        C = autograd.Variable(torch.FloatTensor(batch_size, self.T, self.L))
        C = C.cuda()
        for b in xrange(batch_size):
            C[b, :, :] = torch.mm(V[b, :, :], att.transpose(1, 2)[b, :, :])

        # C : Batch * T * L
        W_att = nn.Softmax()(torch.squeeze(torch.max(C, 1)[0])).unsqueeze(1)
    
        # evidence * attention
        evid = self.evidence(S)
        S_att = autograd.Variable(torch.Tensor(batch_size, self.N))
        for b in xrange(batch_size):
            S_att[b, :] = torch.mm(W_att[b, :, :], evid[b, :, :]).squeeze()
        
        Q = self.BoW(V)
        
        if self.hop == 1:    
            # 1-hop output
            P = nn.LogSoftmax()(self.linear(nn.ReLU()(S_att + Q)))
        elif self.hop == 2:
            O_hop1 = S_att + Q
            C_hop2 = autograd.Variable(torch.Tensor(batch_size, self.L))
            for b in xrange(batch_size):
                C_hop2[b, :] = torch.mm(evid[b, :, :], O_hop1[b, :].unsqueeze(1)).squeeze()
            
            W_att2 = nn.Softmax()(C_hop2).unsqueeze(1)
            evid2 = self.evidence(S)
            S_att2 = autograd.Variable(torch.Tensor(batch_size, self.N))
            for b in xrange(batch_size):
                S_att2[b, :] = torch.mm(W_att2[b, :, :], evid2[b, :, :]).squeeze()
            
            P = nn.LogSoftmax()(self.linear(nn.ReLU()(O_hop1 + S_att2)))
        return P

In [5]:
stc_idx_to_img_name = {}
question_word_to_idx = {'UNK': 0, 'END': 1}
answer_word_to_idx = {'UNK': 0}

# read txt questions and answers
f = open('./datasets/texts/train/train.txt', 'r')

# preload
is_question = True
line = f.readline()
max_len = 0
cnt = 0
while line != '':
    if is_question:
        cnt = cnt + 1
        if len(line.split()[:-4]) > max_len:
            max_len = len(line.split()[:-4])
        for word in line.split()[:-4]:
            if word not in question_word_to_idx:
                question_word_to_idx[word] = len(question_word_to_idx)
    else:
        word = line.split()[0]
        if word not in answer_word_to_idx:
            answer_word_to_idx[word] = len(answer_word_to_idx)
    line = f.readline()
    is_question = not is_question
    

# reload, add padding and convert into integers
f.seek(0)
line = f.readline()
train_q = torch.LongTensor(cnt, max_len).zero_() + 1
train_a = torch.LongTensor(cnt)
train_img_names = []
idx = 0
while line != '':
    if is_question:
        q = line.split()[:-4]
        train_img_names.append(line.split()[-2])
        for i in xrange(len(q)):
            train_q[idx, i] = question_word_to_idx[q[i]]
    else:
        word = line.split()[0]
        train_a[idx] = answer_word_to_idx[word]
        idx = idx + 1
    line = f.readline()
    is_question = not is_question

f.close()

# print max_len
# print len(question_word_to_idx)
# print torch.max(train_q)
# print len(train_img_names)
# print len(answer_word_to_idx)


smn_model = SMN(model, 
                2,
                L=14*14, 
                T=max_len, 
                M=1024, 
                N=300, 
                K=len(answer_word_to_idx), 
                dic_size=len(question_word_to_idx)
                )

if use_cuda:
    smn_model = torch.nn.DataParallel(smn_model).cuda()

print cnt

3876


In [6]:
f = open('./datasets/texts/val/val.txt', 'r')

cnt = 0
line = f.readline()
while line != '':
    cnt = cnt + 1
    line = f.readline()

cnt /= 2
f.seek(0)
line = f.readline()
val_q = torch.LongTensor(cnt, max_len).zero_() + 1
val_a = torch.LongTensor(cnt)
val_img_names = []
idx = 0
is_question = True
while line != '':
    if is_question:
        q = line.split()[:-4]
        val_img_names.append(line.split()[-2])
        for i in xrange(len(q)):
            if q[i] not in question_word_to_idx:
                val_q[idx, i] = 0
            else:
                val_q[idx, i] = question_word_to_idx[q[i]]
    else:
        word = line.split()[0]
        if word not in answer_word_to_idx:
            val_a[idx] = 0
        else:
            val_a[idx] = answer_word_to_idx[word]
        idx = idx + 1
    line = f.readline()
    is_question = not is_question

f.close()
print cnt

297


In [7]:
class VQADataset(torch.utils.data.Dataset):
    def __init__(self, root, questions, answers, imgs, transform=None):
        super(VQADataset, self).__init__()
        self.root = root
        self.questions = questions
        self.answers = answers
        self.imgs = imgs
        self.transform = transform
        
    def __getitem__(self, index):
        filename = self.imgs[index]
        img = Image.open(os.path.join(self.root, filename + '.png'))
        img = img.resize((224, 224))
        if self.transform is not None:
            img = self.transform(img)
        return img, self.questions[index], self.answers[index]

    def __len__(self):
        return len(self.answers)

In [19]:
train_data = VQADataset('./datasets/images',
                          train_q,
                          train_a,
                          train_img_names,
                          transforms.Compose([
#                                 transforms.Scale(224),
#                                 transforms.RandomHorizontalFlip(),
                                transforms.ToTensor(),
                                transforms.Normalize(mean=[0.485, 0.456, 0.406], 
                                                     std=[0.229, 0.224, 0.225])
                            ])
                         )
val_data = VQADataset('./datasets/images',
                          val_q,
                          val_a,
                          val_img_names,
                          transforms.Compose([
#                                 transforms.Scale(224),
#                                 transforms.RandomHorizontalFlip(),
                                transforms.ToTensor(),
                                transforms.Normalize(mean=[0.485, 0.456, 0.406], 
                                                     std=[0.229, 0.224, 0.225])
                            ])
                         )

# print train_data.__getitem__(2)
train_loader = torch.utils.data.DataLoader(
    train_data, batch_size=2, shuffle=True)
val_loader = torch.utils.data.DataLoader(
    val_data, batch_size=5, shuffle=True)

In [20]:
class AverageLogger:
    def __init__(self):
        self.N = 0
        self.val = 0
        self.avg = 0
    def update(self, val, n=1):
        self.val += val
        self.N += n
        self.avg = self.val / self.N
    def getAverage(self):
        return self.avg
    

In [21]:
def accuracy(pred, target):
    pred_max, indices = torch.sort(pred, dim=1, descending=True)
    indices = torch.squeeze(indices[:, 0])
    return float((indices == target).sum())
    

In [22]:
def train(train_loader, model, criterion, optimizer, epoch):
    losses = AverageLogger()
    acc = AverageLogger()
    
    # switch to train mode
    model.train()
    
    old_time= time.time()
    for i, (images, questions, target) in enumerate(train_loader):
        if use_cuda:
            target = target.cuda(async=True)

        # compute y_pred
        y_pred = model(autograd.Variable(images), autograd.Variable(questions))
        loss = criterion(y_pred, autograd.Variable(target))
        
        # measure accuracy and record loss
#         print y_pred.size()
#         print target.size()
        pred_acc = accuracy(y_pred.data, target)
        losses.update(loss.data[0], y_pred.size(0))
        acc.update(pred_acc, y_pred.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        new_time = time.time()
        print "Batch %d's time: %.3f" % (i, new_time - old_time)
        old_time = new_time
        break
    
    print "Epoch %d: Losses: %.3f, Accuracy: %.3f" % (epoch, losses.getAverage(), acc.getAverage())

In [23]:
criterion = nn.NLLLoss()
if use_cuda:
    criterion = criterion.cuda()
optimizer = torch.optim.SGD(smn_model.parameters(), 1e-3, 0.9)

epochs = 1
for epoch in xrange(epochs):
    train(train_loader, smn_model, criterion, optimizer, epoch)

TypeError: torch.addmm received an invalid combination of arguments - got (int, torch.FloatTensor, int, torch.FloatTensor, torch.cuda.FloatTensor, out=torch.FloatTensor), but expected one of:
 * (torch.FloatTensor source, torch.FloatTensor mat1, torch.FloatTensor mat2, *, torch.FloatTensor out)
 * (torch.FloatTensor source, torch.SparseFloatTensor mat1, torch.FloatTensor mat2, *, torch.FloatTensor out)
 * (float beta, torch.FloatTensor source, torch.FloatTensor mat1, torch.FloatTensor mat2, *, torch.FloatTensor out)
 * (torch.FloatTensor source, float alpha, torch.FloatTensor mat1, torch.FloatTensor mat2, *, torch.FloatTensor out)
 * (float beta, torch.FloatTensor source, torch.SparseFloatTensor mat1, torch.FloatTensor mat2, *, torch.FloatTensor out)
 * (torch.FloatTensor source, float alpha, torch.SparseFloatTensor mat1, torch.FloatTensor mat2, *, torch.FloatTensor out)
 * (float beta, torch.FloatTensor source, float alpha, torch.FloatTensor mat1, torch.FloatTensor mat2, *, torch.FloatTensor out)
      didn't match because some of the arguments have invalid types: ([32;1mint[0m, [32;1mtorch.FloatTensor[0m, [32;1mint[0m, [32;1mtorch.FloatTensor[0m, [31;1mtorch.cuda.FloatTensor[0m, [32;1mout=torch.FloatTensor[0m)
 * (float beta, torch.FloatTensor source, float alpha, torch.SparseFloatTensor mat1, torch.FloatTensor mat2, *, torch.FloatTensor out)
      didn't match because some of the arguments have invalid types: ([32;1mint[0m, [32;1mtorch.FloatTensor[0m, [32;1mint[0m, [31;1mtorch.FloatTensor[0m, [31;1mtorch.cuda.FloatTensor[0m, [32;1mout=torch.FloatTensor[0m)
