In [1]:
import os
import sys
sys.path.append('/'.join(os.getcwd().split('/')[:-1]))
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import random
import numpy as np
import os
from model.bAbI_utils import bAbI_data_loader, data_loader, pad_to_batch, pad_to_story

# Run on the GPU whenever torch reports that CUDA is available.
USE_CUDA = torch.cuda.is_available()
# 0 when CUDA is available, otherwise -1.
# NOTE(review): DEVICE is not referenced by any cell visible in this notebook — confirm it is still needed.
DEVICE = 0 if USE_CUDA else -1

---

![](./figs/E2EMN.png)

## Single Layer
Sentences: 

$$X = [x_1, x_2, \cdots, x_n]: n \times T_c$$

* $n$: number of sentences in context
* $T_c$: max length of a sentence in context

Embedding matrices: 

$$\begin{aligned}
A &: d \times V \\
B &: d \times V \\
C &: d \times V
\end{aligned}$$

inputs: 

$$\begin{aligned}
m_i &: T_c \times d \\ 
u &: T_q \times d
\end{aligned}$$

total embedding of context: $M : n \times T_c \times d$
* $m_i$ ($c_i$): the word embeddings of context sentence $i$ (from $A$, resp. $C$) summed over its $T_c$ tokens; stacked over the $n$ sentences this gives $n \times d$
* $u$: the word embeddings of the query (question) (from $B$) summed over its $T_q$ tokens, $1 \times d$
* $score = m u^T: (n \times d) \cdot (d \times 1) = n \times 1$, where $m$ stacks the $m_i$ row-wise

attention:
$$\begin{aligned}
p_i &= softmax(score): n \times 1 \\
o &= \sum_i c_i p_i : d \times 1 \\
\end{aligned}$$

sum of the two vectors, fed to the linear layer:
$$\begin{aligned}
inputs &= u + o : d \times 1 \\
a &= softmax(W \cdot inputs) : (V \times d) \times (d \times 1) = V \times 1
\end{aligned}$$

https://arxiv.org/pdf/1503.08895.pdf

https://github.com/nmhkahn/MemN2N-pytorch/blob/master/memn2n/model.py

# Model

In [2]:
class E2EMN(nn.Module):
    """End-to-End Memory Network with one set of embeddings shared by all hops.

    Each sentence is encoded as the sum of its word embeddings (bag of words).
    The query state ``u`` attends over the sentence memories and is updated as
    ``u <- u + o`` for ``n_hops`` hops; the final state is projected to
    vocabulary logits (no softmax is applied here).

    Args:
        vocab_size: number of rows in every embedding table and size of the output layer.
        embed_size: embedding dimension ``d``.
        n_hops: number of attention hops (0 means the query embedding is projected directly).
    """

    def __init__(self, vocab_size, embed_size, n_hops=3):
        super(E2EMN, self).__init__()

        self.n_hops = n_hops

        # A (memory), B (query) and C (output) embeddings are shared by all hops;
        # index 0 is the padding index and always embeds to the zero vector.
        self.embed_A = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.embed_B = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.embed_C = nn.Embedding(vocab_size, embed_size, padding_idx=0)

        self.linear = nn.Linear(embed_size, vocab_size)

    def forward(self, stories, questions):
        """Compute answer logits for a batch of (story, question) pairs.

        Args:
            stories: word indices of shape (B, n, T_c), given either as one tensor
                or as a sequence of B tensors of identical shape (n, T_c).
            questions: word indices of shape (B, T_q).

        Returns:
            Tensor of shape (B, V) holding unnormalised scores over the vocabulary.
        """
        # Accept a list/tuple of per-story tensors as well as a single batch tensor.
        if not torch.is_tensor(stories):
            stories = torch.stack(list(stories))  # B, n, T_c

        # u^0: bag-of-words embedding of the question
        u = self.embed_B(questions).sum(1)  # B, T_q, d > B, d

        # The embedding tables are shared across hops, so the sentence memories are
        # hop-invariant: embed the whole batch once, outside the hop loop.
        memories = self.embed_A(stories).sum(2)  # B, n, T_c, d > B, n, d
        contexts = self.embed_C(stories).sum(2)  # B, n, T_c, d > B, n, d

        for _ in range(self.n_hops):
            # attention: select which sentence to attend to
            score = torch.bmm(memories, u.unsqueeze(2))  # (B, n, d) x (B, d, 1) = B, n, 1
            probs = F.softmax(score, dim=1)  # p: B, n, 1

            # output: element-wise mul & sum (B, n, d) * (B, n, 1) = B, n, d > B, d
            o = torch.sum(contexts * probs, 1)

            u = u + o  # u^{k+1}: B, d

        return self.linear(u)  # B, d > B, V

In [17]:
# Load the qa1 (single supporting fact) training split and the vocabulary built from it.
path = '../data/QA_bAbI_tasks/en-10k/qa1_single-supporting-fact_train.txt'
train_data, w2idx = bAbI_data_loader(path)
train_loader = data_loader(train_data, batch_size=32, shuffle=True)
# Reverse lookup (index -> word), used later to print decoded predictions.
idx2w = dict((index, word) for word, index in w2idx.items())

In [18]:
# Hyper-parameters for the model and the training loop.
VOCAB_SIZE = len(w2idx)  # number of entries in the word -> index mapping
EMBED_SIZE = 50  # embedding dimension passed to E2EMN
N_HOPS = 3  # number of memory hops passed to E2EMN
LR = 0.001  # initial learning rate handed to the optimizer
STEP = 100  # number of passes over the training loader (the outer training loop count)

In [19]:
# Build the model and move it to the GPU when one is available.
model = E2EMN(VOCAB_SIZE, EMBED_SIZE, n_hops=N_HOPS)
if USE_CUDA:
    model = model.cuda()

# Targets equal to 0 are excluded from the loss (index 0 is also the embeddings' padding index).
loss_function = nn.CrossEntropyLoss(ignore_index=0)
# `optim.Adm` does not exist; the Adam optimizer is `optim.Adam`.
optimizer = optim.Adam(model.parameters(), lr=LR)
# Halve the learning rate once the scheduler has been stepped 25, 50 and 75 times.
scheduler = optim.lr_scheduler.MultiStepLR(gamma=0.5, milestones=[25, 50, 75], optimizer=optimizer)

In [20]:
def get_cuda(*args):
    """Call ``.cuda()`` on every positional argument and return the results as a list, in order."""
    moved = []
    for item in args:
        moved.append(item.cuda())
    return moved

In [21]:
# Train for STEP passes over the data, logging the mean batch loss every 5th pass.
model.train()
for step in range(STEP):
    losses = []
    for batch in train_loader:
        stories, _, questions, _, answers, supports = pad_to_batch(batch, w2idx)

        if USE_CUDA:
            stories, questions, answers = get_cuda(stories, questions, answers)

        model.zero_grad()

        preds = model(stories, questions)

        loss = loss_function(preds, answers.view(-1))
        # .item() extracts the Python float; indexing a 0-dim loss tensor with [0] raises on current PyTorch.
        losses.append(loss.item())

        loss.backward()
        optimizer.step()

    if step % 5 == 0:
        # Read the learning rate actually in use from the optimizer; scheduler.get_lr() is not
        # meant to be called outside of scheduler.step().
        current_lr = optimizer.param_groups[0]['lr']
        string = '[{}/{}] loss: {:.4f}, lr: {}'.format(step+1, STEP, np.mean(losses), current_lr)
        print(string)

    # Advance the LR schedule once per pass, after the optimizer updates of that pass.
    scheduler.step()
    # Rebuild the loader so the next pass gets a fresh shuffle.
    # NOTE(review): presumably data_loader returns a single-pass iterator — confirm in model/bAbI_utils.
    train_loader = data_loader(train_data, batch_size=32, shuffle=True)

[1/100] loss: 2.3857, lr: 0.001
[6/100] loss: 1.5484, lr: 0.001
[11/100] loss: 1.2506, lr: 0.001
[16/100] loss: 1.0635, lr: 0.001
[21/100] loss: 0.9784, lr: 0.001
[26/100] loss: 0.9061, lr: 0.0005
[31/100] loss: 0.8622, lr: 0.0005
[36/100] loss: 0.8411, lr: 0.0005
[41/100] loss: 0.8273, lr: 0.0005
[46/100] loss: 0.8082, lr: 0.0005
[51/100] loss: 0.7880, lr: 0.00025
[56/100] loss: 0.7795, lr: 0.00025
[61/100] loss: 0.7772, lr: 0.00025
[66/100] loss: 0.7697, lr: 0.00025
[71/100] loss: 0.7690, lr: 0.00025
[76/100] loss: 0.7614, lr: 0.000125
[81/100] loss: 0.7610, lr: 0.000125
[86/100] loss: 0.7579, lr: 0.000125
[91/100] loss: 0.7565, lr: 0.000125
[96/100] loss: 0.7580, lr: 0.000125


In [49]:
# Load the qa1 test split, passing in the vocabulary obtained from the training split.
# NOTE(review): w2idx is rebound to whatever bAbI_data_loader returns; if the helper adds
# unseen words, VOCAB_SIZE and idx2w computed earlier would be stale — confirm in model/bAbI_utils.
path_test = '../data/QA_bAbI_tasks/en-10k/qa1_single-supporting-fact_test.txt'
test_data, w2idx = bAbI_data_loader(path_test, w2idx)
test_loader = data_loader(test_data, batch_size=32, shuffle=True)

In [50]:
# Evaluate one story at a time and report the fraction of correctly predicted answers.
model.eval()
accuracy = 0
# No gradients are needed for evaluation.
with torch.no_grad():
    for batch in test_loader:
        stories, _, questions, _, answers, supports = pad_to_story(batch, w2idx)

        if USE_CUDA:
            # get_cuda returns a list, so unpack the stories into it instead of wrapping
            # each story in its own one-element list.
            stories = get_cuda(*stories)
            questions, answers = get_cuda(questions, answers)

        for story, q, a in zip(stories, questions, answers):
            pred = model(story.unsqueeze(0), q.unsqueeze(0))
            # .sum().item() yields a plain Python int, so `accuracy` never becomes a tensor.
            accuracy += torch.eq(torch.max(pred, 1)[1], a).sum().item()

print('Accuracy: {}'.format(accuracy/len(test_data)))

Accuracy: 0.629


In [81]:
# Pick one random test example, run the model on it and print the decoded result.
story, _, q, _, a, s = pad_to_story([random.choice(test_data)], w2idx)

# Keep the original tensors for decoding; send copies to the GPU when the model lives there.
story_in, q_in = story[0].unsqueeze(0), q
if USE_CUDA:
    story_in, q_in = get_cuda(story_in, q_in)

# Inference only: no gradients are needed.
with torch.no_grad():
    pred = model(story_in, q_in)
pred_a = torch.max(pred, 1)[1]

print("Facts : ")
print('-'*45)
print('\n'.join([' '.join([idx2w[x] for x in f]) for f in story[0].data.tolist()]))
print('-'*45)
print("Question : ",' '.join([idx2w[x] for x in q.data.tolist()[0]]))
print('-'*45)
print("Answer : ",' '.join([idx2w[x] for x in a.squeeze(1).data.tolist()]))
print("Prediction : ",' '.join([idx2w[x] for x in pred_a.data.cpu().tolist()]))

Facts : 
---------------------------------------------
sandra went back to the bathroom
sandra journeyed to the office <pad>
---------------------------------------------
Question :  where is sandra ?
---------------------------------------------
Answer :  office
Prediction :  bathroom
