In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import random
import numpy as np
import os
from bAbI_data_utils import bAbI_data_load, data_loader, pad_to_batch, pad_to_fact

# Probe for a usable GPU once so that later cells can branch on a plain flag.
USE_CUDA = torch.cuda.is_available()
# Integer device index: 0 when a GPU is available, -1 otherwise.
DEVICE = -1 if not USE_CUDA else 0

In [2]:
train_data, word2index = bAbI_data_load("../data/QA_bAbI_tasks/en-10k/qa1_single-supporting-fact_train.txt")

Start to data loading...


In [3]:
train_loader = data_loader(train_data, batch_size=32, shuffle=True)

In [4]:
# Pull a single batch to inspect. Unlike a `for ... break` loop, this fails right here
# (StopIteration) if the loader yields nothing, instead of leaving `batch` undefined.
batch = next(iter(train_loader))

In [5]:
len(batch)

32

In [6]:
facts, fact_masks, questions, question_masks, answers = pad_to_batch(batch, word2index)

In [13]:
len(question_masks)

32

In [8]:
facts[0]

Variable containing:
    6    14     4    15     7     3     0
    8    12     4    15    19     3     0
   18    11     4    15    21     3     0
    8     9     4    15     7     3     0
    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
[torch.LongTensor of size 10x7]

In [9]:
fact_masks[0]

Variable containing:
    0     0     0     0     0     0     1
    0     0     0     0     0     0     1
    0     0     0     0     0     0     1
    0     0     0     0     0     0     1
    1     1     1     1     1     1     1
    1     1     1     1     1     1     1
    1     1     1     1     1     1     1
    1     1     1     1     1     1     1
    1     1     1     1     1     1     1
    1     1     1     1     1     1     1
[torch.ByteTensor of size 10x7]

In [16]:
questions

Variable containing:
   10    20    18     5
   10    20     6     5
   10    20    22     5
   10    20     6     5
   10    20     6     5
   10    20    18     5
   10    20     8     5
   10    20     8     5
   10    20     6     5
   10    20    22     5
   10    20     6     5
   10    20     6     5
   10    20     6     5
   10    20     8     5
   10    20     8     5
   10    20     8     5
   10    20    22     5
   10    20    22     5
   10    20    18     5
   10    20    18     5
   10    20    18     5
   10    20     8     5
   10    20     6     5
   10    20    18     5
   10    20     6     5
   10    20    18     5
   10    20     6     5
   10    20    18     5
   10    20     6     5
   10    20    22     5
   10    20    18     5
   10    20     8     5
[torch.LongTensor of size 32x4]

In [15]:
question_masks

Variable containing:
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
[torch.ByteTensor of size 32x4]

![](./figs/E2EMN.png)

## Single Layer
Sentences: 

$$X = [x_1, x_2, \cdots, x_n]: n \times T_c$$

* $n$: number of sentences in context
* $T_c$: max length of a sentence in context

Embedding matrices: 

$$\begin{aligned}
A &: d \times V \\
B &: d \times V \\
C &: d \times V
\end{aligned}$$

inputs (word-level embeddings, before summing over word positions): 

$$\begin{aligned}
m_i &: T_c \times d \\ 
u &: T_q \times d
\end{aligned}$$

total embedding of context: $M : n \times T_c \times d$
* $m_i$: embedding of sentence $i$ summed over its $T_c$ word positions, $1 \times d$; stacking all $n$ sentences gives $m : n \times d$
* $u$: embedding of the query (question) summed over its $T_q$ word positions, $1 \times d$
* $score = m u^T: (n \times d) \cdot (d \times 1) = n \times 1$

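attention:
$$\begin{aligned}
p &= softmax(score): n \times 1 \\
o &= c^T p : d \times 1
\end{aligned}$$

* $c$: output embedding (matrix $C$) of each sentence summed over its $T_c$ word positions, $n \times d$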

sum of the query and memory-output vectors, fed to the final linear layer:
$$\begin{aligned}
inputs &= u^T + o : d \times 1 \\
a &= softmax(W \cdot inputs) : (V \times d) \cdot (d \times 1) = V \times 1
\end{aligned}$$

https://arxiv.org/pdf/1503.08895.pdf

# Model

In [7]:
facts[0]  # n, T_c

Variable containing:
    8     6    17    19    23     3     0
   22    21    15    17    19     7     3
    8    13    17    19    18     3     0
    8    12    17    19     7     3     0
    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
[torch.LongTensor of size 10x7]

In [11]:
questions[0] # T_q

Variable containing:
 16
 10
  8
 14
[torch.LongTensor of size 4]

In [12]:
# Toy sizes for the step-by-step walkthrough.
d = 5  # embedding dimension passed to nn.Embedding
n = 10  # number of sentence rows in one padded story (facts[0] is printed as 10x7)

In [13]:
# A single embedding table that this walkthrough applies to both the facts and the
# question. padding_idx=0 keeps the vector for index 0 at zero, so padded positions
# contribute nothing when word embeddings are summed.
embed = nn.Embedding(len(word2index), d, padding_idx=0)

In [20]:
embed.parameters

<bound method Module.parameters of Embedding(24, 5, padding_idx=0)>

In [14]:
embed_a = embed(facts[0])

In [15]:
embed_b = embed(questions[0])

In [16]:
embed_b.size()  # T_q, d

torch.Size([4, 5])

In [17]:
embed_a.size()  # n, T_c, d

torch.Size([10, 7, 5])

In [22]:
m = embed_a.sum(dim=1) # n, d
m

Variable containing:
-3.4360  5.4795  2.6442  1.1397 -2.2188
-5.3981  3.0593 -0.1660  0.2619 -1.0017
-5.2118  2.7386  1.0787 -1.1378 -2.8889
-4.8005 -0.0609  0.0309 -2.9232 -0.8567
-5.3981  3.0593 -0.1660  0.2619 -1.0017
-4.8060  1.8243 -3.2936 -1.6170  0.2388
 0.0000  0.0000  0.0000  0.0000  0.0000
 0.0000  0.0000  0.0000  0.0000  0.0000
 0.0000  0.0000  0.0000  0.0000  0.0000
 0.0000  0.0000  0.0000  0.0000  0.0000
[torch.FloatTensor of size 10x5]

In [25]:
u = embed_b.sum(dim=0) # d
u

Variable containing:
-0.4348
 3.1010
-2.8605
 3.2210
-1.8064
[torch.FloatTensor of size 5]

In [56]:
score = torch.mm(m, u.unsqueeze(1)) # n, 1
score

Variable containing:
 18.6007
 14.9615
  9.2263
 -6.0585
 14.9615
 11.5281
  0.0000
  0.0000
  0.0000
  0.0000
[torch.FloatTensor of size 10x1]

In [60]:
p = F.softmax(score, dim=0) # n, 1
p

Variable containing:
 9.4923e-01
 2.4941e-02
 8.0560e-05
 1.8537e-11
 2.4941e-02
 8.0498e-04
 7.9288e-09
 7.9288e-09
 7.9288e-09
 7.9288e-09
[torch.FloatTensor of size 10x1]

In [48]:
# Output-side embedding of the story. This walkthrough reuses the same `embed` table
# that produced embed_a, whereas the E2EMN class keeps a separate embed_C table.
embed_c = embed(facts[0])

In [50]:
c = embed_c.sum(1) # n, d
c.size()

torch.Size([10, 5])

In [68]:
o = torch.mm(c.t(), p) # d, 1
o

Variable containing:
-3.5351
 5.3556
 2.4991
 1.0935
-2.1562
[torch.FloatTensor of size 5x1]

In [72]:
inputs = u.unsqueeze(1) + o

In [73]:
inputs

Variable containing:
-3.9699
 8.4565
-0.3614
 4.3145
-3.9626
[torch.FloatTensor of size 5x1]

In [None]:
class E2EMN(nn.Module):
    """End-to-end memory network whose embedding matrices are shared across hops.

    Each sentence of a story is embedded with ``embed_A`` (memory) and ``embed_C``
    (output) and summed over its word positions. The question is embedded with
    ``embed_B`` and summed the same way to give the initial controller state ``u``.
    On every hop the memories are scored against ``u``, softmax-normalised over the
    sentences, and the weighted sum of the output vectors is added to ``u``. The
    final ``u`` is projected onto the vocabulary by ``linear``.

    Args:
        vocab_size: number of rows in each embedding table and size of the output layer.
        embed_size: embedding dimension ``d``.
        mem_size: accepted for interface compatibility; the number of memory slots
            is taken from each story tensor at call time, so it is not used here.
        n_hops: number of attention hops over the memory.
    """

    def __init__(self, vocab_size, embed_size, mem_size, n_hops=3):
        super(E2EMN, self).__init__()

        self.n_hops = n_hops

        # The same A / B / C matrices are reused on every hop.
        self.embed_A = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.embed_B = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.embed_C = nn.Embedding(vocab_size, embed_size, padding_idx=0)

        # W in a = softmax(W (u + o)): maps the d-dim state to vocabulary scores.
        self.linear = nn.Linear(embed_size, vocab_size)

    def forward(self, stories, questions):
        """Run ``n_hops`` attention hops and score every vocabulary word.

        Args:
            stories: iterable of B LongTensors, each of shape (n, T_c) holding the
                word indices of one story; ``n`` may differ between stories.
            questions: LongTensor of shape (B, T_q) with question word indices.

        Returns:
            FloatTensor of shape (B, vocab_size) with unnormalised scores (logits);
            apply a softmax / cross-entropy loss on top of it.
        """
        # u^0: bag-of-words embedding of each question.
        u = self.embed_B(questions).sum(1)  # B, d

        # A and C are shared across hops, so the sentence representations are
        # hop-invariant and only need to be computed once.
        memories = [self.embed_A(story).sum(1) for story in stories]  # each n, d
        outputs = [self.embed_C(story).sum(1) for story in stories]  # each n, d

        for _ in range(self.n_hops):
            responses = []
            for i, (m, c) in enumerate(zip(memories, outputs)):
                # (n, d) x (d, 1) = (n, 1): match every memory against the query.
                score = torch.mm(m, u[i].unsqueeze(1))
                # Normalise over the n sentences. Rows that embed to all zeros
                # (padding_idx=0) score 0 and add nothing to o, because their
                # output vectors are zero as well.
                p = F.softmax(score, dim=0)  # n, 1
                # (d, n) x (n, 1) = (d, 1) -> d: attention-weighted output vector.
                responses.append(torch.mm(c.t(), p).squeeze(1))
            # u^{k+1} = u^k + o^k
            u = u + torch.stack(responses)  # B, d

        return self.linear(u)  # B, V
            
            
                

In [30]:
questions.size()

torch.Size([32, 4])

In [38]:
embed(questions).sum(1)[0].unsqueeze(1)

Variable containing:
-3.1572
-1.4834
-1.7043
 2.3171
-0.9361
[torch.FloatTensor of size 5x1]