In [1]:
# -*- coding: utf-8 -*-

#https://gluon-nlp.mxnet.io/examples/word_embedding/word_embedding.html#Applications-of-Word-Embeddings
from __future__ import division, print_function

import os
import sys
import babi

import math
import numpy as np

In [2]:
BABI_DIR = "../data/tasks_1-20_v1-2/en"
EMB_DATA_DIR = "../data/"
TASK_NBR = 1
WORD2VEC_EMBED_SIZE = EMBED_HIDDEN_SIZE = 300
BATCH_SIZE = 32
NBR_EPOCHS = 40

train_file, test_file = babi.get_files_for_task(TASK_NBR, BABI_DIR)

data_train = babi.get_stories(os.path.join(BABI_DIR, train_file))
data_test = babi.get_stories(os.path.join(BABI_DIR, test_file))

['qa1_single-supporting-fact_train.txt', 'qa1_single-supporting-fact_test.txt']


In [3]:
cnt = 0
for context, query, answer in data_train:

    print ('CONTEXT: '  + " ".join(context))   
    print ( 'QUERY: ' +  " ".join(query))
    print ( 'ANSWER: ' +  answer + '\n')
    
    cnt += 1
    
    if cnt == 3:
        break

CONTEXT: Mary moved to the bathroom . John went to the hallway .
QUERY: Where is Mary ?
ANSWER: bathroom

CONTEXT: Mary moved to the bathroom . John went to the hallway . Daniel went back to the hallway . Sandra moved to the garden .
QUERY: Where is Daniel ?
ANSWER: hallway

CONTEXT: Mary moved to the bathroom . John went to the hallway . Daniel went back to the hallway . Sandra moved to the garden . John moved to the office . Sandra journeyed to the bathroom .
QUERY: Where is Daniel ?
ANSWER: hallway



In [4]:
word2idx = babi.build_vocab([data_train, data_test])
vocab_size = len(word2idx) + 1
print("vocab_size=", vocab_size)

story_maxlen, question_maxlen = babi.get_maxlens([data_train, data_test])
print("story_maxlen=", story_maxlen)
print("question_maxlen=", question_maxlen)

Xs_train, Xq_train, Y_train = babi.vectorize(data_train, word2idx, 
                                             story_maxlen, question_maxlen)
Xs_test, Xq_test, Y_test = babi.vectorize(data_test, word2idx,
                                          story_maxlen, question_maxlen)
print(Xs_train.shape, Xq_train.shape, Y_train.shape)
print(Xs_test.shape, Xq_test.shape, Y_test.shape)

vocab_size= 22
story_maxlen= 66
question_maxlen= 4
(1000, 66) (1000, 4) (1000, 22)
(1000, 66) (1000, 4) (1000, 22)


In [5]:
## LSTM

In [6]:
import mxnet as mx
from mxnet import autograd, gluon, nd, init
from mxnet.gluon import Block, nn, rnn

class MemN2N(Block):
    def __init__(self, vocab_size=22, emb_dim=50, init_std=0.015, **kwargs):
        super(MemN2N, self).__init__(**kwargs)
        ## 고정
        self.vocab_size = vocab_size
        
        ## 
        self.init_std = init_std
        self.emb_dim = emb_dim

        with self.name_scope():
             
            # Embedding C (컨텍스트 벡터)
            self.C = nn.Embedding(input_dim=self.vocab_size,
                                                 output_dim=self.emb_dim, 
                                                 weight_initializer=init.Normal(self.init_std))
            
            # Embedding Q (쿼리 벡터)
            self.B = nn.Embedding(input_dim=self.vocab_size, 
                                                     output_dim=self.emb_dim, 
                                                     weight_initializer=init.Normal(self.init_std))

            # Final Predict
            self.W_ = nn.Dense(self.vocab_size, weight_initializer=init.Normal(self.init_std))

            self.C_encoder= rnn.LSTM(self.emb_dim,  bidirectional=True)
            self.C_dropout = nn.Dropout(0.3)
            self.Q_encoder= rnn.LSTM(self.emb_dim,  bidirectional=True)
            self.Q_dropout = nn.Dropout(0.3)
            
    def forward(self, sentences, question):
       
        c_i = self.C(sentences)         
        q_i = self.B(question)
       
        c_i_encode = self.C_encoder(c_i )
        c_i_encode = self.C_dropout(c_i_encode)
        q_i_encode = self.Q_encoder(q_i)
        q_i_encode = self.Q_dropout(q_i_encode)

        m_out = nd.batch_dot(c_i_encode, q_i_encode.swapaxes(1,2))
        
        #print (np.shape(m_out))
        #print (np.shape(q_i))
        
        z = self.W_(m_out)

        return z

In [7]:
def train(model, trainer, softmax_cross_entropy, x_input, x_query, x_answers, batch_size):
    max_grad_norm=40
    
    with autograd.record():
        out = model(x_input, x_query)

        loss = softmax_cross_entropy(out, x_answers)
        loss.backward()
        
    grads = [i.grad() for i in model.collect_params().values()]
    gluon.utils.clip_global_norm(grads, max_grad_norm)
            
    trainer.step(batch_size)

    return loss

In [8]:
def model_train(model, epochs=3000, learning_rate=0.1):

    softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
    trainer = gluon.Trainer(model.collect_params(), 'adam', {'learning_rate': learning_rate})#, 'clip_gradient':40, 'wd':1.01})
    log_loss = []
    log_perp = []
    batch_size = 16
    epoch_log_loss = []
    for epoch in range(epochs):
        dataiter = mx.io.NDArrayIter([inputs_train, queries_train], answers_train, batch_size, shuffle=False, last_batch_handle='discard')

        for batch in dataiter:
            log_loss = []
            train_loss = train(model, trainer, softmax_cross_entropy,
                               batch.data[0].as_in_context(ctx), 
                               batch.data[1].as_in_context(ctx),
                               batch.label[0].as_in_context(ctx),
                               batch_size)

            # Logging
            log_loss.append([train_loss.asnumpy()/batch_size])#, test_loss])

        epoch_log_loss.append([np.mean(log_loss)])

        state = { 'epoch': epoch, 'learning_rate': trainer.learning_rate, 'perplexity': math.exp(epoch_log_loss[epoch][0])}
        print(state)

        #lr_decay = 1.01
        #if (len(epoch_log_loss) > 1) and (epoch_log_loss[epoch][0] > epoch_log_loss[epoch-1][0] * 0.9999):
          #  print ('update learning rate from %.3f to %.3f' % (trainer.learning_rate, trainer.learning_rate/lr_decay))
           # trainer.set_learning_rate(trainer.learning_rate / lr_decay)
        if trainer.learning_rate < 1e-5: 
            break
            
    return model

In [9]:
#https://github.com/apache/incubator-mxnet/issues/9486
#sym_c_data = mx.sym.Variable('data')
#sym_q_data = mx.sym.Variable('data')
#net = model(sym_c_data, sym_q_data)
#viz = mx.viz.plot_network(net, title='lstm', save_format='png', shape={'data':(1,3,256,256)})
#viz.render('images/sam')
#from IPython.display import Image
#Image("images/sam.png")

In [10]:
inputs_train = nd.array(Xs_train)
queries_train = nd.array(Xq_train)
answers_train = nd.array([np.where(i == 1)[0][0] for i in Y_train])

inputs_test = nd.array(Xs_test)
queries_test = nd.array(Xq_test)
answers_test = nd.array([np.where(i == 1)[0][0] for i in Y_test])

In [11]:
import mxnet as mx
ctx = mx.gpu()
model = MemN2N(vocab_size=vocab_size, emb_dim=50, init_std=0.015)
model.collect_params().initialize(mx.init.Xavier(),ctx=ctx)

In [None]:
from line_profiler import LineProfiler

profile = LineProfiler()
lp_wrapper=profile(train)
lp_wrapper=profile(model_train)
lp_wrapper(model, epochs=2, learning_rate=0.2)
profile.print_stats()

  return f(*args, **kwds)


{'learning_rate': 0.2, 'perplexity': 64.80204280860912, 'epoch': 0}
{'learning_rate': 0.2, 'perplexity': 4.51971724392699, 'epoch': 1}
Timer unit: 1e-06 s

Total time: 2.75367 s
File: <ipython-input-7-4e772d2b80f8>
Function: train at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def train(model, trainer, softmax_cross_entropy, x_input, x_query, x_answers, batch_size):
     2       124         59.0      0.5      0.0      max_grad_norm=40
     3                                               
     4       124       1137.0      9.2      0.0      with autograd.record():
     5       124     379029.0   3056.7     13.8          out = model(x_input, x_query)
     6                                           
     7       124      24663.0    198.9      0.9          loss = softmax_cross_entropy(out, x_answers)
     8       124      75660.0    610.2      2.7          loss.backward()
     9                                    

In [None]:
model = model_train(model, epochs=5000)

{'learning_rate': 0.1, 'perplexity': 12.563814869331576, 'epoch': 0}
{'learning_rate': 0.1, 'perplexity': 5.714457203734448, 'epoch': 1}
{'learning_rate': 0.1, 'perplexity': 4.155200294894872, 'epoch': 2}
{'learning_rate': 0.1, 'perplexity': 4.601557001660292, 'epoch': 3}
{'learning_rate': 0.1, 'perplexity': 3.8386680205685435, 'epoch': 4}


In [None]:
def print_res():
    import numpy as np
    res = model(mx.nd.array(inputs_test, ctx), mx.nd.array(queries_test, ctx))
    res = res.asnumpy()
    answers_test = [np.where(i == 1)[0][0] for i in Y_test]
    from collections import Counter

    print (Counter(np.argmax(i) for i in res).items())
    print (Counter(i for i in answers_test).items())
    print ([np.argmax(i) for i in res[0:20]])
    print ([i for i in answers_test[0:20]])

    print ('ACC')
    print (np.sum([bool(np.argmax(i) == answers_test[idx:idx+1]) for idx, i in enumerate(res)])/len(res))

    print ('BASE')
    print (1/len(set(i for i in answers_test)))

In [None]:
print_res()

In [None]:
## CUSTOM EMBEDDING

import gensim
WORD2VEC_BIN = "GoogleNews-vectors-negative300.bin.gz"
WORD2VEC_EMBED_SIZE = 300
word2vec = gensim.models.KeyedVectors.load_word2vec_format(
    os.path.join(EMB_DATA_DIR, WORD2VEC_BIN), binary=True)

In [None]:
import gluonnlp as nlp
import numpy as np

In [None]:
#nlp.embedding.list_sources()
#nlp.embedding.list_sources('fasttext')
#nlp.embedding.list_sources('glove')

In [None]:
fasttext_emb = nlp.embedding.create('fasttext', source='wiki.en')

In [None]:
vocab = nlp.Vocab(nlp.data.Counter(fasttext_emb.idx_to_token))
vocab.set_embedding(fasttext_emb)

In [None]:
len(vocab)

In [None]:
embedding_weights = np.zeros((vocab_size, WORD2VEC_EMBED_SIZE))
np.shape(embedding_weights)

In [None]:
for word, index in word2idx.items():
    print (word)
    try:
        embedding_weights[index, :] = vocab.embedding[word.lower()].asnumpy()
    except KeyError:
        pass  # keep as zero (not ideal, but what else can we do?)

In [None]:
from mxnet import nd
def cos_sim(x, y):
    return nd.dot(x, y) / (nd.norm(x) * nd.norm(y))

def norm_vecs_by_row(x):
    return x / nd.sqrt(nd.sum(x * x, axis=1)).reshape((-1,1))

def get_knn(vocab, k, word):
    word_vec = vocab.embedding[word].reshape((-1, 1))
    vocab_vecs = norm_vecs_by_row(vocab.embedding.idx_to_vec)
    dot_prod = nd.dot(vocab_vecs, word_vec)
    indices = nd.topk(dot_prod.reshape((len(vocab), )), k=k+4, ret_typ='indices')
    indices = [int(i.asscalar()) for i in indices]
    # Remove unknown and input tokens.
    return vocab.to_tokens(indices[4:])


In [None]:
cos_sim(vocab.embedding['baby'], vocab.embedding['babies'])

In [None]:
cos_sim(vocab.embedding['bathroom'], vocab.embedding['office'])

In [None]:
model = MemN2N(vocab_size=vocab_size, emb_dim=300, init_std=0.015)
model.collect_params().initialize(mx.init.Xavier(),ctx=ctx)

In [None]:
keys = [i for i in list(model.collect_params().keys()) if 'embedding' in i]
print (keys)

In [None]:
model.collect_params()[list(keys)[0]].data().asnumpy()

In [None]:
for key in keys:
    model.collect_params()[key].set_data(embedding_weights)

In [None]:
model.collect_params()[list(keys)[0]].data().asnumpy()

In [None]:
model = model_train(model, epochs=5000)

In [None]:
print_res()