In [1]:
import os
import sys
sys.path.append('/'.join(os.getcwd().split('/')[:-1]))
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import random
import numpy as np
import os
from model.bAbI_utils_loader import bAbIDataset, bAbIDataLoader

USE_CUDA = torch.cuda.is_available()
DEVICE = 0 if USE_CUDA else -1

---

![](./figs/E2EMN.png)

## Single Layer
Sentences: 

$$X = [x_1, x_2, \cdots, x_n]: n \times T_c$$

* $n$: number of sentences in context
* $T_c$: max length of a sentence in context

Embeding Matrix: 

$$\begin{aligned}
A &: d \times V \\
B &: d \times V \\
C &: d \times V
\end{aligned}$$

$$\begin{aligned}
m_i &= \sum_j Ax_{ij}: T_c \times d \\ 
c_i &= \sum_j Cx_{ij}: T_c \times d\\
u &= \sum_j Bq_{j}: T_q \times d
\end{aligned}$$

total embedding of context: $M : n \times T_c \times d$
* $m_i(c_i)$: summation embedded for each sentence in context as length of $T_c$, $n \times d$
* $u$: summation embedded for query(question) as length of $T_q$, $1 \times d$
* $score = m_iu^T: (n \times d) \cdot (d \times 1) = n \times 1$

attention:
$$\begin{aligned}
p_i &= softmax(score): n \times 1 \\
o &= \sum_i c_i p_i : d \times 1 \\
\end{aligned}$$

summation vectors to linear layer:

$$\begin{aligned}
inputs &= u + o : d \times 1 \\
a &= softmax(W \cdot inputs) : (V \times d) \times (d \times 1) = V \times 1
\end{aligned}$$

https://arxiv.org/pdf/1503.08895.pdf

https://github.com/nmhkahn/MemN2N-pytorch/blob/master/memn2n/model.py

postion encoding(PE):

$$\begin{aligned}
m_i &= \sum_j l_j \otimes Ax_{ij}: T_c \times d \\ 
l_{kj} &= (1-\frac{j}{J}) - (\frac{k}{d})(1-\frac{2j}{J})
\end{aligned}$$

* $J$: number of word in sentences
* $d$: dimension of embedding

# Model

In [28]:
class E2EMN(nn.Module):
    def __init__(self, vocab_size, embed_size, n_hops=3, encoding_method=['basic'], use_cuda=False):
        super(E2EMN, self).__init__()
        assert isinstance(encoding_method, list), 'encoding_method must be a list type'
        
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.n_hops = n_hops
        self.encoding_method = [e.lower() for e in encoding_method]
        self.use_cuda = use_cuda
        
        
        # sharing matrix for k hops & and init to normal dist.
        self.embed_A = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.embed_B = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.embed_C = nn.Embedding(vocab_size, embed_size, padding_idx=0)
    
        self.linear = nn.Linear(embed_size, vocab_size)
    
    def _weigth_init(self):
        pass
    
    def _temporal_encoding_requirements(self, story_len):
        # temporal encoding
        if 'temp' in self.encoding_method:
            assert story_len is not None, 'must have a fixed story_len, insert "story_len" as a number'
            assert isinstance(story_len, int), '"story_len" must be a integer'
            
            self.embed_A_T = nn.Embedding(story_len, embed_size)
            self.embed_C_T = nn.Embedding(story_len, embed_size)
        else:
            self.embed_A_T = None
            self.embed_C_T = None
            
    def _pe_requirements(self, stories_masks):
        if stories_masks is not None:
            pe_word_lengths = stories_masks.eq(0).sum(2) #B, n : byte tensor
        else:
            pe_word_lengths = None
        return pe_word_lengths
    
    def encoding2memory(self, embeded_x, word_length=None):
        """
        embed_x: n, T_c, d
        word_length: n
        """
        if (len(self.encoding_method) == 1) and self.encoding_method[0] == 'basic':
            return embeded_x.sum(1) # n, d
        
        if 'pe' in self.encoding_method:
            assert word_length is not None, 'insert stories_masks when forward'
            
            T_c, d = embeded_x.size()[1:]
            j = Variable(torch.arange(1, T_c+1).unsqueeze(1).repeat(1, d), requires_grad=False)
            k = Variable(torch.arange(1, d+1).unsqueeze(1).repeat(1, T_c).t(), requires_grad=False)
            if self.use_cuda:
                    j, k = j.cuda(), k.cuda()
                    
            embeded_x_pe = []
            for embed, J in zip(embeded_x, word_length.float()): # iteration of n size
                if J.eq(0).data[0]: # all pad data which word length = 0
                    embeded_x_pe.append(embed)
                else:
                    l = (torch.ones_like(embed).float() - j/J) - (k/d)*(torch.ones_like(embed) - (2*j)/J)
                    embed = embed * l
                    embeded_x_pe.append(embed) # T_c, d
            embeded_x_pe = torch.stack(embeded_x_pe) # n, T_c, d
            return embeded_x_pe.sum(1) # n, d
        
        
        
    def forward(self, stories, questions, stories_masks=None, questions_masks=None, story_len=None):
        """
        stories, stories_masks: B, n, T_c
        questions, questions_masks: B, T_q
        """
        # init some requirements
        self._temporal_encoding_requirements(story_len)
        pe_word_lengths = self._pe_requirements(stories_masks)
        
        # Start Learning
        o_list = []
        # questions: B, T_q
        embeded_B = self.embed_B(questions) # B, T_q, d
        u = embeded_B.sum(1) # u: B, d
        o_list.append(u) # [(B, d)]
        
        for k in range(self.n_hops):
            batch_memories = [] # B, n, d
            batch_contexts = [] # B, n, d
            for i, inputs in enumerate(stories): # iteration of batch
                # inputs: n, T_c
                embeded_A = self.embed_A(inputs) # n, T_c, d
                embeded_C = self.embed_C(inputs)

                m = self.encoding2memory(embeded_A, pe_word_lengths[i]) # n, d
                c = self.encoding2memory(embeded_C, pe_word_lengths[i])
                
                batch_memories.append(m)
                batch_contexts.append(c)

            batch_memories = torch.stack(batch_memories) # B, n, d
            batch_contexts = torch.stack(batch_contexts) # B, n, d

            # attention: select which sentence to attent
            # score = m * u[-1] : (B, n, d) * (B, d, 1) = B, n, 1
            score = torch.bmm(batch_memories, o_list[-1].unsqueeze(2)) 
            probs = F.softmax(score, dim=1) # p: B, n, 1

            # output: element-wies mul & sum (B, n, d) x (B, n, 1) = B, n, d > B, d
            o = torch.sum(batch_contexts * probs, 1)

            o_next = o_list[-1] + o
            o_list.append(o_next) # B, d

        outputs = self.linear(o_list[-1]) # B, d > B, V
        return outputs

## Settings: Train_loader & Parameters

In [29]:
path_train = '../data/QA_bAbI_tasks/en-10k/qa1_single-supporting-fact_train.txt'
bAbI_train = bAbIDataset(path_train, train=True, return_masks=True)
train_loader = bAbIDataLoader(dataset=bAbI_train, batch_size=32, shuffle=True, to_tensor=True)

In [30]:
VOCAB_SIZE = len(bAbI_train.word2idx)
EMBED_SIZE = 50
N_HOPS = 3
LR = 0.01
STEP = 100

## Settings: Loss Function & Optimizer

In [31]:
model = E2EMN(VOCAB_SIZE, EMBED_SIZE, n_hops=N_HOPS, encoding_method=['pe'])
if USE_CUDA:
    model = model.cuda()
    
loss_function = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=LR)
scheduler = optim.lr_scheduler.MultiStepLR(gamma=0.5, milestones=[25, 50, 75], optimizer=optimizer)

In [32]:
def get_cuda(*args):
    return [x.cuda() for x in args]

## Train

In [33]:
model.train()
for step in range(STEP):
    losses=[]
    scheduler.step()
    for i, batch in enumerate(train_loader.load()):
        stories, stories_masks, questions, _, answers, _ = batch
        
        if USE_CUDA:
            stories, stories_masks, questions, answers = get_cuda(stories, stories_masks, questions, answers)

        model.zero_grad()
        
        preds = model.forward(stories, questions, stories_masks=stories_masks)
        
        loss = loss_function(preds, answers.view(-1))
        losses.append(loss.data[0])
        
        loss.backward()
        optimizer.step()
    
    if (step+1) % 10 == 0:
        string = '[{}/{}] loss: {:.4f}, lr: {}'.format(step+1, STEP, np.mean(losses), scheduler.get_lr()[0])
        print(string)
        losses=[]


KeyboardInterrupt: 

## Test

In [14]:
path_test = '../data/QA_bAbI_tasks/en-10k/qa1_single-supporting-fact_test.txt'
bAbI_test = bAbIDataset(path_test, train=False, vocab=bAbI_train.word2idx, return_masks=False)
test_loader = bAbIDataLoader(dataset=bAbI_test, batch_size=32, shuffle=True, to_tensor=True)

In [17]:
model.eval()
accuracy = 0
for i, batch in enumerate(test_loader.load()):
    stories, questions, answers, _ = batch
    
    if USE_CUDA:
        stories = [get_cuda(x) for x in stories]
        questions, answers = get_cuda(questions, answers)
        
    for story, q, a in zip(stories, questions, answers):
        model.zero_grad()
        pred = model(story.unsqueeze(0), q.unsqueeze(0))
        accuracy += torch.eq(torch.max(pred, 1)[1], a).data[0]

print('Accuracy: {}'.format(accuracy/len(bAbI_test)))

Accuracy: 0.599


## Test: random print

In [19]:
story, q, a, s = bAbI_test.pad_to_story([random.choice(bAbI_test.data)], w2idx)
model.zero_grad()
pred = model(story[0].unsqueeze(0), q)
pred_a = torch.max(pred, 1)[1]

print("Facts : ")
print('-'*45)
print('\n'.join([' '.join(list(map(lambda x: idx2w[x], f))) for f in story[0].data.tolist()]))
print('-'*45)
print("Question : ",' '.join(list(map(lambda x: idx2w[x], q.data.tolist()[0]))))
print('-'*45)
print("Answer : ",' '.join(list(map(lambda x: idx2w[x], a.squeeze(1).data.tolist()))))
print("Prediction : ",' '.join(list(map(lambda x: idx2w[x], pred_a.data.tolist()))))

NameError: name 'w2idx' is not defined

In [36]:
path_train = '../data/QA_bAbI_tasks/en-10k/qa1_single-supporting-fact_train.txt'
bAbI_train = bAbIDataset(path_train, train=True, return_masks=True)
train_loader = bAbIDataLoader(dataset=bAbI_train, batch_size=32, shuffle=True, to_tensor=True)

In [37]:
for i, batch in enumerate(train_loader.load()):
    stories, stories_masks, questions, questions_masks, answers, _ = batch
    break

In [38]:
word_length = stories_masks.eq(0).sum(2).float()  # J

In [39]:
word_length.size()

torch.Size([32, 10])

In [100]:
temp_x = model.embed_A(stories[0])

In [101]:
embed_x = temp_x

In [152]:
T_c, d = embed.size()
j = Variable(torch.arange(1, T_c+1).unsqueeze(1).repeat(1, d), requires_grad=False)
k = Variable(torch.arange(1, d+1).unsqueeze(1).repeat(1, T_c).t(), requires_grad=False)

tt = []
for i, (embed, J) in enumerate(zip(embed_x, word_length[0])):
    if J.eq(0).data[0]: # all pad data
        tt.append(embed)
    else:
        l = (torch.ones_like(embed).float() - j/J) - (k/d)*(torch.ones_like(embed) - (2*j)/J)
        embed = embed * l
        tt.append(embed)

In [43]:
stories[0][0]

Variable containing:
  7
 16
 10
 14
 12
  0
[torch.LongTensor of size 6]

In [40]:
n= 10

In [41]:
embed_temp = nn.Embedding(n, EMBED_SIZE, padding_idx=0)

In [44]:
embed_temp(stories[0][0])

RuntimeError: index out of range at /Users/soumith/minicondabuild3/conda-bld/pytorch_1518385717421/work/torch/lib/TH/generic/THTensorMath.c:277

In [118]:
embed_temp.state_dict()

OrderedDict([('weight', 
               0.9967 -1.5335 -0.2404  ...  -0.5641 -0.4126 -0.6967
               0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
              -1.1828 -0.8180 -0.6191  ...  -0.5717  0.8391  0.1589
                        ...             ⋱             ...          
              -0.6489  1.3622  1.0723  ...   1.0169  1.2179 -1.1631
               0.0083 -1.2892  0.5822  ...   0.7219 -1.1516  0.1734
              -0.2452  1.2055  0.0817  ...   1.1516 -1.6721 -0.0940
              [torch.FloatTensor of size 22x50])])

In [107]:
for i in range(temp_x.size(0)):
    

10

In [91]:
torch.zeros

Variable containing:

Columns 0 to 5 
-2.2702e-01 -2.5431e-02  1.2480e-01 -9.1293e-02  1.0095e-02  3.6205e-02
 5.9523e-03 -3.7339e-02 -7.9318e-02  1.0661e-01 -7.0634e-02  4.7868e-02
 7.0597e-03  5.0365e-02  6.3651e-02 -1.1283e-01 -9.9317e-03  9.4435e-02
-2.4348e-02  7.9932e-02 -2.8611e-02 -3.5126e-02 -1.0483e-02 -2.4387e-02
-2.2498e-04 -8.6771e-03 -3.4956e-03  4.8646e-02 -3.2025e-02 -1.3837e-02
-3.4523e-03  1.3824e-03  1.9319e-03 -1.8455e-02  1.9208e-02 -3.5778e-03

Columns 6 to 11 
-1.0558e-01 -1.0705e-01 -6.8083e-02  2.7638e-02  5.5749e-02 -2.7952e-02
 1.6787e-02 -5.2203e-02  6.7505e-02  6.5956e-02 -6.8389e-02  1.2085e-01
-3.6121e-02  8.4510e-02 -6.3131e-02 -8.0532e-02 -1.6561e-02  3.2549e-02
-3.3294e-02  4.0431e-02 -9.3410e-04 -1.8585e-02 -1.0578e-01  8.2174e-02
 1.2895e-02 -8.1865e-03 -4.9495e-02  1.2060e-02  3.1218e-02  2.6556e-02
-4.6101e-03 -3.7042e-02 -1.7590e-02  2.0497e-03  1.2112e-02  3.0038e-02

Columns 12 to 17 
 5.6343e-02  2.8727e-02 -3.0791e-03  4.1079e-02 -1.5735e-02 -