In [1]:
import os
import sys
sys.path.append('/'.join(os.getcwd().split('/')[:-1]))
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import random
import numpy as np
import os
from model.bAbI_utils_loader import bAbIDataset, bAbIDataLoader

USE_CUDA = torch.cuda.is_available()
DEVICE = 0 if USE_CUDA else -1

---

![](./figs/E2EMN.png)

## Single Layer

### Sentences: 

$$X = [x_1, x_2, \cdots, x_n]: n \times T_c$$

* $n$: number of sentences in context
* $T_c$: max length of a sentence in context

### Embeding Matrix: 

$$\begin{aligned}
A &: d \times V \\
B &: d \times V \\
C &: d \times V
\end{aligned}$$

$$\begin{aligned}
m_i &= \sum_j Ax_{ij}: T_c \times d \\ 
c_i &= \sum_j Cx_{ij}: T_c \times d\\
u &= \sum_j Bq_{j}: T_q \times d
\end{aligned}$$

total embedding of context: $M : n \times T_c \times d$
* $m_i(c_i)$: summation embedded for each sentence in context as length of $T_c$, $n \times d$
* $u$: summation embedded for query(question) as length of $T_q$, $1 \times d$
* $score = m_iu^T: (n \times d) \cdot (d \times 1) = n \times 1$

### attention:
$$\begin{aligned}
p_i &= softmax(score): n \times 1 \\
o &= \sum_i c_i p_i : d \times 1 \\
\end{aligned}$$

### summation vectors to linear layer:

$$\begin{aligned}
inputs &= u + o : d \times 1 \\
a &= softmax(W \cdot inputs) : (V \times d) \times (d \times 1) = V \times 1
\end{aligned}$$

https://arxiv.org/pdf/1503.08895.pdf

https://github.com/nmhkahn/MemN2N-pytorch/blob/master/memn2n/model.py

### postion encoding(PE):

for each story(sentence) memory $m_i, c_i$
$$\begin{aligned}
m_i &= \sum_j l_j \otimes Ax_{ij}: T_c \times d \\ 
l_{jk} &= (1-\frac{j}{J}) - (\frac{k}{d})(1-\frac{2j}{J})
\end{aligned}$$

remember, $l_j$ is a matrix that size is $T_c \times d$

* $J$: number of word in sentences
* $j$: index of words
* $d$: dimension of embedding
* $k$: index of embedding dimension

### temporal encoding(TE):

for each story(sentence)

# Model

In [2]:
class E2EMN(nn.Module):
    def __init__(self, vocab_size, embed_size, n_hops=3, encoding_method='basic', temporal=True, \
                 use_cuda=False, max_story_len=None):
        super(E2EMN, self).__init__()
        
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.n_hops = n_hops
        self.encoding_method = encoding_method.lower()
        self.te = temporal
        self.use_cuda = use_cuda
        
        # sharing matrix for k hops & and init to normal dist.
        self.embed_A = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.embed_B = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.embed_C = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        
        # TE: temporal encoding
        if self.te:
            assert max_story_len is not None, 'must have a fixed story_len, insert "max_story_len" as a number'
            assert isinstance(max_story_len, int), '"max_story_len" must be a integer'
            
            self.embed_A_T = nn.Embedding(max_story_len+1, self.embed_size, padding_idx=0)
            self.embed_C_T = nn.Embedding(max_story_len+1, self.embed_size, padding_idx=0)
            if self.use_cuda:
                self.embed_A_T = self.embed_A_T.cuda()
                self.embed_C_T = self.embed_C_T.cuda()            
            
        self.linear = nn.Linear(embed_size, vocab_size)
        self._weight_init()
    
    def _weight_init(self):
        for x in [self.embed_A, self.embed_B, self.embed_C]:
            nn.init.normal(x.weight, mean=0, std=0.1)
        if self.te:
            for x in [self.embed_A_T, self.embed_C_T]:
                nn.init.normal(x.weight, mean=0, std=0.1)
    
    def _temporal_encoding_requirements(self, stories_masks):
        # temporal encoding
        if self.te:
            story_len = stories_masks.size(1)
            temp = stories_masks.eq(0).sum(2) # B, n : byte tensor
            te_idx_matrix = Variable(torch.arange(1, story_len+1).repeat(temp.size(0)).view(temp.size()), \
                                     requires_grad=False).long()
            if self.use_cuda:
                te_idx_matrix = te_idx_matrix.cuda()
            te_idx_matrix = te_idx_matrix * temp.ge(1).long() # B, n
        else:
            te_idx_matrix = None
            
        return te_idx_matrix

            
    def _pe_requirements(self, stories_masks):
        if stories_masks is not None:
            pe_word_lengths = stories_masks.eq(0).sum(2) # B, n : byte tensor
        else:
            pe_word_lengths = None
        return pe_word_lengths
    
    def encoding2memory(self, embeded_x, word_length=None):
        """
        embed_x: n, T_c, d
        word_length: n
        """
        if self.encoding_method == 'basic':
            return embeded_x.sum(1) # n, d
        
        elif self.encoding_method == 'pe':
            assert word_length is not None, 'insert stories_masks when forward'
            
            T_c, d = embeded_x.size()[1:]
            j = Variable(torch.arange(1, T_c+1).unsqueeze(1).repeat(1, d), requires_grad=False)
            k = Variable(torch.arange(1, d+1).unsqueeze(1).repeat(1, T_c).t(), requires_grad=False)
            if self.use_cuda:
                j, k = j.cuda(), k.cuda()
                    
            embeded_x_pe = []
            for embed, J in zip(embeded_x, word_length.float()): # iteration of n size
                # embed: T_c d
                # J: scalar
                if J.eq(0).data[0]: # all words are pad data, which means word_length = 0
                    embeded_x_pe.append(embed)
                else:
                    l = (torch.ones_like(embed).float() - j/J) - (k/d)*(torch.ones_like(embed) - (2*j)/J)
                    embed = embed * l
                    embeded_x_pe.append(embed) # T_c, d
            embeded_x_pe = torch.stack(embeded_x_pe) # n, T_c, d
            return embeded_x_pe.sum(1) # n, d
        
        else:
            assert True, 'insert encoding_method key value in the model, default is "basic".'
        
    def forward(self, stories, questions, stories_masks=None, questions_masks=None):
        """
        stories, stories_masks: B, n, T_c
        questions, questions_masks: B, T_q
        """
        # init some requirements
        te_idx_matrix = self._temporal_encoding_requirements(stories_masks)
        pe_word_lengths = self._pe_requirements(stories_masks) # B, n 
        
        # Start Learning
        o_list = []
        # questions: B, T_q
        embeded_B = self.embed_B(questions) # B, T_q, d
        u = embeded_B.sum(1) # u: B, d
        o_list.append(u) # [(B, d)]
        
        for k in range(self.n_hops):
            # encoding part: PE, TE
            batch_memories = [] # B, n, d
            batch_contexts = [] # B, n, d
            for i, inputs in enumerate(stories): # iteration of batch
                # inputs: n, T_c
                embeded_A = self.embed_A(inputs) # n, T_c, d
                embeded_C = self.embed_C(inputs)
                # basic or PE
                m = self.encoding2memory(embeded_A, pe_word_lengths[i]) # n, d
                c = self.encoding2memory(embeded_C, pe_word_lengths[i]) # n, d
                # TE
                if self.te:
                    A_T = self.embed_A_T(te_idx_matrix[i]) # n, d
                    C_T = self.embed_C_T(te_idx_matrix[i]) # n, d
                    m = m + A_T
                    c = c + C_T
                batch_memories.append(m)
                batch_contexts.append(c)

            batch_memories = torch.stack(batch_memories) # B, n, d
            batch_contexts = torch.stack(batch_contexts) # B, n, d

            # attention part: select which sentence to attent
            # score = m * u[-1] : (B, n, d) * (B, d, 1) = B, n, 1
            score = torch.bmm(batch_memories, o_list[-1].unsqueeze(2))
            probs = F.softmax(score, dim=1) # p: B, n, 1

            # output: element-wies mul & sum (B, n, d) x (B, n, 1) = B, n, d > B, d
            o = torch.sum(batch_contexts * probs, 1)

            o_next = o_list[-1] + o
            o_list.append(o_next) # B, d
        
        # guessing part:
        outputs = self.linear(o_list[-1]) # B, d > B, V
        return outputs

## Settings: Train_loader & Parameters

In [3]:
path_train = '../data/QA_bAbI_tasks/en-10k/qa1_single-supporting-fact_train.txt'
bAbI_train = bAbIDataset(path_train, train=True, return_masks=True)
train_loader = bAbIDataLoader(dataset=bAbI_train, batch_size=32, shuffle=True, to_tensor=True)

In [4]:
VOCAB_SIZE = len(bAbI_train.word2idx)
EMBED_SIZE = 50
N_HOPS = 3
LR = 0.01
STEP = 100
MAX_STORY_LEN = bAbI_train.max_story_len
BATCH_SIZE = 32
EARLY_STOPPING = False
# ENCODING_METHOD = 'basic'
# TEMPORAL = False
ENCODING_METHOD = 'pe'
TEMPORAL = True

In [5]:
def get_cuda(*args):
    return [x.cuda() for x in args]

## Settings: Loss Function & Optimizer

In [6]:
model = E2EMN(VOCAB_SIZE, EMBED_SIZE, n_hops=N_HOPS, encoding_method=ENCODING_METHOD, 
              temporal=TEMPORAL, use_cuda=USE_CUDA, max_story_len=MAX_STORY_LEN)

if USE_CUDA:
    model = model.cuda()
    
loss_function = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=LR)
scheduler = optim.lr_scheduler.MultiStepLR(gamma=0.5, milestones=[25, 50, 75], optimizer=optimizer)

## Train

In [7]:
model.train()
for step in range(STEP):
    losses=[]
    scheduler.step()
    if EARLY_STOPPING:
        break
    for i, batch in enumerate(train_loader.load()):
        stories, stories_masks, questions, _, answers, _ = batch
        
        if USE_CUDA:
            stories, stories_masks, questions, answers = get_cuda(stories, stories_masks, questions, answers)

        model.zero_grad()
        
        preds = model(stories, questions, stories_masks=stories_masks)
        
        loss = loss_function(preds, answers.view(-1))
        losses.append(loss.data[0])
        
        loss.backward()
        optimizer.step()
    
    if step % 5 == 0:
        string = '[{}/{}] loss: {:.4f}, lr: {},'.format(step+1, STEP, np.mean(losses), scheduler.get_lr()[0])
        print(string)
        if np.mean(losses) < 0.01:
            EARLY_STOPPING = True
            print("Early Stopping!")
            break
        losses=[]

[1/100] loss: 0.4770, lr: 0.01,
[6/100] loss: 0.0001, lr: 0.01,
Early Stopping!


In [8]:
# model_path = '../model/E2EMN_basic.model'
# ENCODING_METHOD = 'basic'
# TEMPORAL = False

In [6]:
model_path = '../model/E2EMN_te_pe.model'
ENCODING_METHOD = 'pe'
TEMPORAL = True

In [10]:
torch.save(model.state_dict(), model_path)

In [11]:
model.state_dict()

OrderedDict([('embed_A.weight', 
               8.7212e-02 -2.4311e-02 -5.3458e-02  ...  -1.0072e-01 -7.0019e-02 -8.0092e-02
              -1.2276e-01  8.6810e-03  5.4344e-02  ...   1.3872e-01 -1.1800e-03  6.2671e-02
              -1.4740e-01  1.6852e-01 -1.3936e-01  ...   1.6313e-01  9.0611e-02 -4.2169e-01
                              ...                   ⋱                   ...                
              -2.3257e-01 -1.2822e-01  5.6160e-01  ...   2.9820e-01  1.0062e-01  1.2733e-01
               2.0542e-02 -6.0076e-02 -2.2000e-02  ...   2.1464e-02  4.2375e-02  7.6490e-02
              -1.7213e-05  8.0307e-02 -1.4132e-02  ...  -3.1378e-02 -1.9798e-01 -4.4935e-02
              [torch.FloatTensor of size 22x50]), ('embed_B.weight', 
              -0.0114 -0.0028  0.0633  ...  -0.0429  0.1092  0.1136
               0.0152 -0.1037 -0.2155  ...  -0.0356 -0.0099 -0.1166
              -0.0591  0.0252  0.1548  ...  -0.0778  0.0614  0.0145
                        ...             ⋱        

### Load model

In [7]:
model = E2EMN(VOCAB_SIZE, EMBED_SIZE, n_hops=N_HOPS, encoding_method=ENCODING_METHOD, 
              temporal=TEMPORAL, use_cuda=USE_CUDA, max_story_len=MAX_STORY_LEN)
model.load_state_dict(torch.load(model_path))
# if USE_CUDA:
#     model = model.cuda()
#     model.load_state_dict(torch.load(model_path))
# else:
#     pass
#     model.load_state_dict(torch.load(model_path, map_location=lambda storage, loc: storage))

In [8]:
model.state_dict()

OrderedDict([('embed_A.weight', 
               8.7212e-02 -2.4311e-02 -5.3458e-02  ...  -1.0072e-01 -7.0019e-02 -8.0092e-02
              -1.2276e-01  8.6810e-03  5.4344e-02  ...   1.3872e-01 -1.1800e-03  6.2671e-02
              -1.4740e-01  1.6852e-01 -1.3936e-01  ...   1.6313e-01  9.0611e-02 -4.2169e-01
                              ...                   ⋱                   ...                
              -2.3257e-01 -1.2822e-01  5.6160e-01  ...   2.9820e-01  1.0062e-01  1.2733e-01
               2.0542e-02 -6.0076e-02 -2.2000e-02  ...   2.1464e-02  4.2375e-02  7.6490e-02
              -1.7213e-05  8.0307e-02 -1.4132e-02  ...  -3.1378e-02 -1.9798e-01 -4.4935e-02
              [torch.FloatTensor of size 22x50]), ('embed_B.weight', 
              -0.0114 -0.0028  0.0633  ...  -0.0429  0.1092  0.1136
               0.0152 -0.1037 -0.2155  ...  -0.0356 -0.0099 -0.1166
              -0.0591  0.0252  0.1548  ...  -0.0778  0.0614  0.0145
                        ...             ⋱        

## Test

In [9]:
path_test = '../data/QA_bAbI_tasks/en-10k/qa1_single-supporting-fact_test.txt'
bAbI_test = bAbIDataset(path_test, train=False, vocab=bAbI_train.word2idx, return_masks=True)
test_loader = bAbIDataLoader(dataset=bAbI_test, batch_size=32, shuffle=False, to_tensor=True)

---

막 training 끝마치고 나서 결과

In [15]:
model.eval()
accuracy = 0
for i, batch in enumerate(test_loader.load()):
    stories, stories_masks, questions, _, answers, _ = batch
    
    if USE_CUDA:
        stories = [get_cuda(x) for x in stories]
        stories_masks = [get_cuda(x) for x in stories_masks]
        questions, answers = get_cuda(questions, answers)
    
    for story, mask, q, a in zip(stories, stories_masks, questions, answers):
        model.zero_grad()
        pred = model(story.unsqueeze(0), q.unsqueeze(0), stories_masks=mask.unsqueeze(0))
        accuracy += torch.eq(torch.max(pred, 1)[1], a).data[0]

print('Accuracy: {}'.format(accuracy/len(bAbI_test)))

Accuracy: 0.999


나갔다가 load state 하고 나온 결과

In [10]:
model.eval()
accuracy = 0
for i, batch in enumerate(test_loader.load()):
    stories, stories_masks, questions, _, answers, _ = batch
    
    if USE_CUDA:
        stories = [get_cuda(x) for x in stories]
        stories_masks = [get_cuda(x) for x in stories_masks]
        questions, answers = get_cuda(questions, answers)
    
    for story, mask, q, a in zip(stories, stories_masks, questions, answers):
        model.zero_grad()
        pred = model(story.unsqueeze(0), q.unsqueeze(0), stories_masks=mask.unsqueeze(0))
        accuracy += torch.eq(torch.max(pred, 1)[1], a).data[0]

print('Accuracy: {}'.format(accuracy/len(bAbI_test)))

Accuracy: 0.181


---

* basic: 0.653
* pe_te: 1.0

In [18]:
bAbI_test = bAbIDataset(path_test, train=False, vocab=bAbI_train.word2idx, return_masks=True)

In [19]:
random.choice(bAbI_test.data)

[[[3, 7, 15, 16, 10],
  [3, 17, 2, 15, 16, 6],
  [14, 7, 15, 16, 6],
  [19, 17, 2, 15, 16, 10],
  [14, 17, 15, 16, 10],
  [19, 5, 15, 16, 6],
  [3, 13, 15, 16, 10],
  [3, 13, 15, 16, 6]],
 [18, 20, 3, 21],
 [6],
 11]

In [None]:
story, q, a, s = bAbI_test.pad_to_story([random.choice(bAbI_test.data)])

## Test: random print

In [None]:
story, q, a, s = bAbI_test.pad_to_story([random.choice(bAbI_test.data)], w2idx)
model.zero_grad()
pred = model(story[0].unsqueeze(0), q)
pred_a = torch.max(pred, 1)[1]

print("Facts : ")
print('-'*45)
print('\n'.join([' '.join(list(map(lambda x: idx2w[x], f))) for f in story[0].data.tolist()]))
print('-'*45)
print("Question : ",' '.join(list(map(lambda x: idx2w[x], q.data.tolist()[0]))))
print('-'*45)
print("Answer : ",' '.join(list(map(lambda x: idx2w[x], a.squeeze(1).data.tolist()))))
print("Prediction : ",' '.join(list(map(lambda x: idx2w[x], pred_a.data.tolist()))))