In [2]:
!pip install mxnet-cu100
!pip install gluonnlp pandas tqdm



## Entity Tagging

In [0]:
import pandas as pd
import numpy as np
from mxnet.gluon import nn, rnn
from mxnet import gluon, autograd
import gluonnlp as nlp
from mxnet import nd 
import mxnet as mx
import time
import itertools
from tqdm import tqdm
import multiprocessing as mp

In [0]:
# Load the tab-separated corpora straight from their download URLs. Column names are
# supplied explicitly via `names=` (intent label, per-character entity mask, sentence).
train_raw = pd.read_csv("https://www.dropbox.com/s/83n0uoy20rd2vq4/trainset.txt?dl=1",names=['intent', 'entity', 'sentence'], sep='\t')
# Alternative evaluation file (test_hidden), currently disabled in favour of validation.txt:
#validation_raw = pd.read_csv("https://www.dropbox.com/s/kbl7kw54jdo2550/test_hidden.txt?dl=1",names=['intent', 'entity', 'sentence'], sep='\t')
validation_raw = pd.read_csv("https://www.dropbox.com/s/enxp9yt9cstcal2/validation.txt?dl=1",names=['intent', 'entity', 'sentence'], sep='\t')

In [5]:
train_raw.head(30)

Unnamed: 0,intent,entity,sentence
0,area,EECCCCCCCCCCCCCCCCCCC,자강의 면적은 얼마 정도되는지 알려줄래
1,birth_date,CCCCCCCCCCCCEEECCCCCCCCCCCC,WIKI PEDIA로 변재일 생년월일을 알고 싶어
2,age,EEEEEEEEEEECCCCCCCCCCCCCCCCC,남쪽 물고기자리 알파 나이가 위키백과사전으로 얼마야
3,length,EEEECCCCCCCCCCCCCCCCCC,삼양터널의 총 길이 위키백과사전에서 뭐야
4,birth_place,EEEEEECCCCCCCCCCC,코니 윌리스의 태어난 곳은 뭐지
5,weight,CCCCCCCCCCCCEEEECCCCCCCCCCCCC,WIKI백과사전 검색 AA12의 무게가 얼만지 찾아봐
6,definition,CCCCCCCCCCCCCEEECCCCCCCC,WIKIPEDIA백과로 라이프 찾아서 말해줘
7,height,EEEEEEEECCCCCCCCCCCCCCCCCCC,송파 헬리오시티 구조물 높이 위키 피디아에서 뭐야
8,birth_date,CCCEEEEEECCCCCCCCCCCCCCC,검색 HLKVAM 언제 출생했는지를 검색해라
9,height,CCCCCCCCEEEEEECCCCCCCC,위키 피디아에 푸조 508 전고가 몇이야


#### 데이터 전처리

In [0]:
# Pair every sentence with its per-character entity mask as (sentence, entity) tuples.
train_dataset = list(zip(train_raw['sentence'], train_raw['entity']))
valid_dataset = list(zip(validation_raw['sentence'], validation_raw['entity']))

In [0]:
# Fixed number of characters kept per example.
seq_len = 32

# Callable that brings a token list to exactly `seq_len` items, filling with "<pad>"
# (see the GluonNLP PadSequence documentation for the clip/pad rules).
length_clip = nlp.data.PadSequence(seq_len, pad_val="<pad>")

def preprocess(data):
    """Convert one ``(sentence, entity)`` pair into fixed-length character sequences.

    Parameters
    ----------
    data : tuple
        ``(sentence, entity)``. Both items are passed through ``str()`` before being
        split into characters.

    Returns
    -------
    tuple
        ``(sentence_chars, valid_length, entity_chars)``. Both character lists are
        produced by ``length_clip``; ``valid_length`` is the number of real
        (non-padding) characters kept in ``sentence_chars``.
    """
    sent, entity = data
    char_sent = list(str(sent))
    char_entity = list(str(entity))
    clipped_sent = length_clip(char_sent)
    # Measure the length on the str()-converted characters (the raw value is not
    # guaranteed to support len()) and cap it at the padded length so it can never
    # point past the end of the sequence handed to the model.
    valid_length = min(len(char_sent), len(clipped_sent))
    return (clipped_sent, valid_length, length_clip(char_entity))

def preprocess_dataset(dataset):
    """Run ``preprocess`` over every item of ``dataset`` in a process pool.

    Prints the elapsed wall-clock time and the number of items, then returns the
    results wrapped in a ``gluon.data.SimpleDataset``.
    """
    started_at = time.time()
    with mp.Pool() as workers:
        records = workers.map(preprocess, dataset)
        processed = gluon.data.SimpleDataset(records)
    elapsed = time.time() - started_at
    print('Done! Tokenizing Time={:.2f}s, #Sentences={}'.format(elapsed, len(processed)))
    return processed


In [8]:
# Character-split and pad both corpora; each call prints its own timing line.
train_preprocessed  = preprocess_dataset(train_dataset)
valid_preprocessed  = preprocess_dataset(valid_dataset)

Done! Tokenizing Time=0.34s, #Sentences=9000
Done! Tokenizing Time=0.13s, #Sentences=1000


In [0]:
# Token frequencies over the preprocessed training file: input characters (first
# field) and tag characters (third field). Padding tokens are counted as well.
counter_sent = nlp.data.count_tokens(
    itertools.chain.from_iterable(chars for chars, _, _ in train_preprocessed))
counter_entity = nlp.data.count_tokens(
    itertools.chain.from_iterable(tags for _, _, tags in train_preprocessed))

In [0]:
# Input vocabulary: no BOS/EOS tokens; min_freq=15 is the frequency threshold for a
# character to receive its own index.
vocab_sent = nlp.Vocab(counter_sent, bos_token=None, eos_token=None, min_freq=15)
# Tag vocabulary: same settings, but created without an unknown token.
vocab_entity = nlp.Vocab(counter_entity, bos_token=None, eos_token=None, unknown_token=None ,min_freq=15)

In [11]:
vocab_sent.idx_to_token[:10], vocab_entity.idx_to_token[:10], 

(['<unk>', '<pad>', ' ', 'I', '이', '색', '검', '의', '지', '아'],
 ['<pad>', 'C', 'E'])

In [0]:
# Replace characters and tags with their vocabulary ids; the length field is carried
# through unchanged.
train_preprocessed_encoded = [
    (vocab_sent[chars], valid_len, vocab_entity[tags])
    for chars, valid_len, tags in train_preprocessed
]
valid = [
    (vocab_sent[chars], valid_len, vocab_entity[tags])
    for chars, valid_len, tags in valid_preprocessed
]

In [0]:
# Split the encoded training examples with valid_ratio=0.1. The second part is named
# `test` in this notebook; `valid` is built separately from the validation file.
# NOTE(review): no random seed is set anywhere in the notebook, so the split is
# presumably different on every run -- confirm and seed if reproducibility matters.
train, test = nlp.data.train_valid_split(train_preprocessed_encoded, valid_ratio=0.1)

In [0]:
nbatch = 30

# One Stack per field of an example: character ids, length (stacked as float32), tag ids.
batchify_fn = nlp.data.batchify.Tuple(
    nlp.data.batchify.Stack(),
    nlp.data.batchify.Stack('float32'),
    nlp.data.batchify.Stack(),
)

# All three loaders share the same batch size, batchify function and shuffling.
loader_options = dict(batch_size=nbatch, batchify_fn=batchify_fn, shuffle=True)
train_dataloader = gluon.data.DataLoader(train, **loader_options)
test_dataloader = gluon.data.DataLoader(test, **loader_options)
valid_dataloader = gluon.data.DataLoader(valid, **loader_options)

#### 모델링 

In [0]:
class EntityTagger(gluon.HybridBlock):
    """Per-position tagger: embedding -> bidirectional GRU -> optional attention -> two dense layers.

    Parameters
    ----------
    vocab_size : int
        Number of rows in the input embedding table.
    vocab_out_size : int
        Number of output units of the final dense layer (one score per tag).
    num_embed : int
        Embedding dimension.
    hidden_size : int
        Hidden size of the GRU (per direction).
    use_attention : bool, default False
        When True, an ``MLPAttentionCell`` is applied to the masked GRU output,
        using it as both query and key.
    **kwargs
        Forwarded to ``gluon.HybridBlock``.
    """

    def __init__(self, vocab_size, vocab_out_size, num_embed, hidden_size, use_attention=False, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.vocab_out_size = vocab_out_size
        self.use_attention = use_attention
        with self.name_scope():
            # Child blocks are created in a fixed order so parameter naming stays stable.
            self.embed = nn.Embedding(input_dim=vocab_size, output_dim=num_embed)
            self.bigru = rnn.GRU(hidden_size, dropout=0.2, bidirectional=True)
            self.dense_prev = nn.Dense(10, flatten=False)
            self.dense = nn.Dense(vocab_out_size, flatten=False)
            if use_attention:
                self.attention = nlp.model.MLPAttentionCell(30, dropout=0.2)

    def hybrid_forward(self, F, inputs, length):
        """Return per-position tag scores.

        ``inputs`` holds token ids and ``length`` the per-example sequence lengths
        used for masking. The GRU output is masked with ``length`` and its first two
        axes are swapped before the attention/dense layers.
        """
        embedded = self.embed(inputs)
        encoded = self.bigru(embedded)
        masked = F.SequenceMask(encoded,
                                sequence_length=length,
                                use_sequence_length=True)
        features = masked.transpose((1, 0, 2))
        if self.use_attention:
            # The cell returns (context, attention weights); only the context is used.
            features, _ = self.attention(features, features)
        hidden = self.dense_prev(features)
        return self.dense(hidden)

In [0]:
# Use the GPU when MXNet reports at least one; otherwise fall back to the CPU so the
# notebook still runs (more slowly) on hosts without CUDA.
ctx = mx.gpu() if mx.context.num_gpus() > 0 else mx.cpu()

# Vocabulary sizes are taken from the two vocabularies so the embedding table and the
# output layer match the data; attention is switched on for this run.
model = EntityTagger(vocab_size=len(vocab_sent.idx_to_token),
                     vocab_out_size=len(vocab_entity.idx_to_token),
                     num_embed=50, hidden_size=30, use_attention=True)

In [0]:
# Initialise the model parameters with the Xavier initializer on the selected context.
model.initialize(mx.initializer.Xavier(), ctx=ctx)

In [0]:
# Adam optimizer with its default hyper-parameters over all model parameters.
trainer = gluon.Trainer(model.collect_params(),"Adam")
# Softmax cross-entropy with default settings. The training loop calls it without a
# sample_weight, so every position of the padded sequence (padding included)
# contributes to the loss.
loss = gluon.loss.SoftmaxCELoss() 

In [0]:
# Enable hybridized execution for the HybridBlock (see Gluon's HybridBlock.hybridize).
model.hybridize()

In [54]:
model

EntityTagger(
  (embed): Embedding(481 -> 50, float32)
  (bigru): GRU(None -> 30, TNC, dropout=0.2, bidirectional)
  (dense_prev): Dense(None -> 10, linear)
  (dense): Dense(None -> 3, linear)
  (attention): MLPAttentionCell(
    (_act): Activation(tanh)
    (_dropout_layer): Dropout(p = 0.2, axes=())
    (_query_mid_layer): Dense(None -> 30, linear)
    (_key_mid_layer): Dense(None -> 30, linear)
    (_attention_score): Dense(30 -> 1, linear)
  )
)

In [0]:
def evaluate_accuracy(model, data_iter, ctx=ctx):
    """Return the fraction of sentences whose tags are all predicted correctly.

    A sentence counts as correct only when every position inside its valid length
    matches the label; positions beyond the valid length (padding) are ignored.

    Parameters
    ----------
    model : callable
        Called as ``model(data.T, length)``; expected to return per-position scores
        with the tag axis last (axis 2).
    data_iter : iterable
        Yields ``(data, length, label)`` batches.
    ctx : mxnet context
        Device the batches are moved to before the forward pass.

    Returns
    -------
    float
        Sentence-level accuracy, or ``nan`` when ``data_iter`` yields no samples.
    """
    n_correct = 0
    n_total = 0
    for data, length, label in data_iter:
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        length = length.as_in_context(ctx)
        output = model(data.T, length)
        predictions = nd.argmax(output, axis=2)
        # Copy the whole batch to host memory once instead of synchronising with the
        # device for every single sample.
        matches = (predictions.astype('int64') == label).asnumpy()
        valid_lengths = length.asnumpy().astype('int64')
        for row, valid_len in zip(matches, valid_lengths):
            # Slicing caps the window at the row width, so a recorded length larger
            # than the padded sequence no longer makes the sentence unmatchable.
            n_correct += int(row[:valid_len].all())
            n_total += 1
    if n_total == 0:
        return float('nan')
    return n_correct / n_total

In [0]:
def calculate_loss(model, data_iter, loss_obj, ctx=ctx):
    """Return the mean loss over all samples yielded by ``data_iter``.

    Parameters
    ----------
    model : callable
        Called as ``model(data.T, length)``.
    data_iter : iterable
        Yields ``(data, length, label)`` batches; ``data`` has the batch on axis 0.
    loss_obj : callable
        Called as ``loss_obj(output, label)``; its result is averaged per batch.
    ctx : mxnet context
        Device the batches are moved to before the forward pass.

    Returns
    -------
    float
        Sample-weighted mean loss, or ``nan`` when ``data_iter`` yields no samples.
    """
    weighted_loss_sum = 0.0
    n_samples = 0
    for te_data, te_length, te_label in data_iter:
        te_data = te_data.as_in_context(ctx)
        te_label = te_label.as_in_context(ctx)
        te_length = te_length.as_in_context(ctx)
        te_output = model(te_data.T, te_length)
        loss_te = loss_obj(te_output, te_label)
        batch_size = te_data.shape[0]
        # Weight each batch mean by its size so that a short final batch does not
        # count as much as a full one.
        weighted_loss_sum += float(nd.mean(loss_te).asscalar()) * batch_size
        n_samples += batch_size
    if n_samples == 0:
        return float('nan')
    return weighted_loss_sum / n_samples

In [57]:
# Number of full passes over the training loader.
epochs = 100


# Metric history; one entry is appended per evaluation round (every 10th epoch).
tot_test_loss = []
tot_test_accu = []
tot_train_loss = []
tot_train_accu = []
tot_valid_accu = [] 
for e in range(epochs):
    # Mini-batch training pass over the training loader.
    for i, (data, length, label) in enumerate(tqdm(train_dataloader)):
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        length = length.as_in_context(ctx)
        with autograd.record():
            # The batch is transposed before being handed to the model.
            output = model(data.T, length)
            loss_ = loss(output, label)
            loss_.backward()
        # Pass the batch size (axis 0 of the untransposed batch) to the trainer step.
        trainer.step(data.shape[0])

    # Periodic evaluation: runs only when e is a multiple of 10, so with epochs = 100
    # the last evaluated epoch is 90 and the final epochs are not measured.
    if e % 10 == 0: 
        test_loss = calculate_loss(model, test_dataloader, loss_obj = loss, ctx=ctx) 
        train_loss = calculate_loss(model, train_dataloader, loss_obj = loss, ctx=ctx) 
        test_accu = evaluate_accuracy(model, test_dataloader,  ctx=ctx)
        train_accu = evaluate_accuracy(model, train_dataloader,  ctx=ctx)
        valid_accu = evaluate_accuracy(model, valid_dataloader,  ctx=ctx)

        print("Epoch %s. Train Loss: %s, Test Loss : %s," \
        " Test Accuracy : %s," \
        " Train Accuracy : %s : Valid Accuracy : %s" % (e, train_loss, test_loss, test_accu, train_accu, valid_accu))    
        tot_test_loss.append(test_loss)
        tot_train_loss.append(train_loss)
        tot_test_accu.append(test_accu)
        tot_train_accu.append(train_accu)
        tot_valid_accu.append(valid_accu)

100%|██████████| 270/270 [00:01<00:00, 240.68it/s]
  9%|▊         | 23/270 [00:00<00:01, 226.74it/s]

Epoch 0. Train Loss: 0.04581105, Test Loss : 0.04551146, Test Accuracy : 0.7133333333333334, Train Accuracy : 0.7050617283950618 : Valid Accuracy : 0.704


100%|██████████| 270/270 [00:01<00:00, 175.59it/s]
100%|██████████| 270/270 [00:01<00:00, 191.02it/s]
100%|██████████| 270/270 [00:01<00:00, 167.02it/s]
100%|██████████| 270/270 [00:01<00:00, 184.27it/s]
100%|██████████| 270/270 [00:01<00:00, 202.12it/s]
100%|██████████| 270/270 [00:01<00:00, 181.01it/s]
100%|██████████| 270/270 [00:01<00:00, 176.99it/s]
100%|██████████| 270/270 [00:01<00:00, 182.01it/s]
100%|██████████| 270/270 [00:01<00:00, 197.47it/s]
100%|██████████| 270/270 [00:01<00:00, 196.37it/s]
  7%|▋         | 19/270 [00:00<00:01, 188.03it/s]

Epoch 10. Train Loss: 0.0020585118, Test Loss : 0.00977939, Test Accuracy : 0.9533333333333334, Train Accuracy : 0.9849382716049383 : Valid Accuracy : 0.963


100%|██████████| 270/270 [00:01<00:00, 178.87it/s]
100%|██████████| 270/270 [00:01<00:00, 193.61it/s]
100%|██████████| 270/270 [00:01<00:00, 180.71it/s]
100%|██████████| 270/270 [00:01<00:00, 193.57it/s]
100%|██████████| 270/270 [00:01<00:00, 186.82it/s]
100%|██████████| 270/270 [00:01<00:00, 192.00it/s]
100%|██████████| 270/270 [00:01<00:00, 179.13it/s]
100%|██████████| 270/270 [00:01<00:00, 188.45it/s]
100%|██████████| 270/270 [00:01<00:00, 206.38it/s]
100%|██████████| 270/270 [00:01<00:00, 192.83it/s]
  9%|▉         | 24/270 [00:00<00:01, 238.61it/s]

Epoch 20. Train Loss: 0.0019672026, Test Loss : 0.010476824, Test Accuracy : 0.9588888888888889, Train Accuracy : 0.9837037037037037 : Valid Accuracy : 0.961


100%|██████████| 270/270 [00:01<00:00, 200.94it/s]
100%|██████████| 270/270 [00:01<00:00, 175.08it/s]
100%|██████████| 270/270 [00:01<00:00, 185.28it/s]
100%|██████████| 270/270 [00:01<00:00, 176.77it/s]
100%|██████████| 270/270 [00:01<00:00, 175.91it/s]
100%|██████████| 270/270 [00:01<00:00, 186.16it/s]
100%|██████████| 270/270 [00:01<00:00, 184.73it/s]
100%|██████████| 270/270 [00:01<00:00, 183.85it/s]
100%|██████████| 270/270 [00:01<00:00, 195.25it/s]
100%|██████████| 270/270 [00:01<00:00, 178.42it/s]
  8%|▊         | 22/270 [00:00<00:01, 219.21it/s]

Epoch 30. Train Loss: 0.0002923226, Test Loss : 0.0132401, Test Accuracy : 0.9611111111111111, Train Accuracy : 0.9969135802469136 : Valid Accuracy : 0.972


100%|██████████| 270/270 [00:01<00:00, 178.83it/s]
100%|██████████| 270/270 [00:01<00:00, 180.42it/s]
100%|██████████| 270/270 [00:01<00:00, 185.86it/s]
100%|██████████| 270/270 [00:01<00:00, 204.22it/s]
100%|██████████| 270/270 [00:01<00:00, 179.28it/s]
100%|██████████| 270/270 [00:01<00:00, 191.57it/s]
100%|██████████| 270/270 [00:01<00:00, 177.67it/s]
100%|██████████| 270/270 [00:01<00:00, 191.65it/s]
100%|██████████| 270/270 [00:01<00:00, 198.73it/s]
100%|██████████| 270/270 [00:01<00:00, 198.85it/s]
  9%|▊         | 23/270 [00:00<00:01, 229.68it/s]

Epoch 40. Train Loss: 3.341425e-05, Test Loss : 0.012503282, Test Accuracy : 0.9688888888888889, Train Accuracy : 0.9997530864197531 : Valid Accuracy : 0.98


100%|██████████| 270/270 [00:01<00:00, 200.50it/s]
100%|██████████| 270/270 [00:01<00:00, 195.91it/s]
100%|██████████| 270/270 [00:01<00:00, 175.03it/s]
100%|██████████| 270/270 [00:01<00:00, 200.46it/s]
100%|██████████| 270/270 [00:01<00:00, 195.08it/s]
100%|██████████| 270/270 [00:01<00:00, 185.90it/s]
100%|██████████| 270/270 [00:01<00:00, 207.01it/s]
100%|██████████| 270/270 [00:01<00:00, 197.68it/s]
100%|██████████| 270/270 [00:01<00:00, 174.51it/s]
100%|██████████| 270/270 [00:01<00:00, 189.14it/s]
  6%|▌         | 16/270 [00:00<00:01, 154.69it/s]

Epoch 50. Train Loss: 7.684146e-06, Test Loss : 0.013977752, Test Accuracy : 0.9711111111111111, Train Accuracy : 1.0 : Valid Accuracy : 0.982


100%|██████████| 270/270 [00:01<00:00, 173.55it/s]
100%|██████████| 270/270 [00:01<00:00, 180.13it/s]
100%|██████████| 270/270 [00:01<00:00, 196.53it/s]
100%|██████████| 270/270 [00:01<00:00, 178.13it/s]
100%|██████████| 270/270 [00:01<00:00, 188.66it/s]
100%|██████████| 270/270 [00:01<00:00, 176.26it/s]
100%|██████████| 270/270 [00:01<00:00, 191.22it/s]
100%|██████████| 270/270 [00:01<00:00, 173.48it/s]
100%|██████████| 270/270 [00:01<00:00, 179.11it/s]
100%|██████████| 270/270 [00:01<00:00, 189.78it/s]
  8%|▊         | 22/270 [00:00<00:01, 217.96it/s]

Epoch 60. Train Loss: 0.0005088213, Test Loss : 0.014564858, Test Accuracy : 0.9644444444444444, Train Accuracy : 0.9950617283950617 : Valid Accuracy : 0.972


100%|██████████| 270/270 [00:01<00:00, 170.08it/s]
100%|██████████| 270/270 [00:01<00:00, 190.07it/s]
100%|██████████| 270/270 [00:01<00:00, 174.73it/s]
100%|██████████| 270/270 [00:01<00:00, 170.88it/s]
100%|██████████| 270/270 [00:01<00:00, 189.08it/s]
100%|██████████| 270/270 [00:01<00:00, 201.39it/s]
100%|██████████| 270/270 [00:01<00:00, 176.72it/s]
100%|██████████| 270/270 [00:01<00:00, 183.48it/s]
100%|██████████| 270/270 [00:01<00:00, 167.09it/s]
100%|██████████| 270/270 [00:01<00:00, 187.38it/s]
  7%|▋         | 20/270 [00:00<00:01, 198.38it/s]

Epoch 70. Train Loss: 2.6472359e-05, Test Loss : 0.012694985, Test Accuracy : 0.9744444444444444, Train Accuracy : 0.9998765432098765 : Valid Accuracy : 0.983


100%|██████████| 270/270 [00:01<00:00, 171.07it/s]
100%|██████████| 270/270 [00:01<00:00, 173.73it/s]
100%|██████████| 270/270 [00:01<00:00, 178.87it/s]
100%|██████████| 270/270 [00:01<00:00, 194.86it/s]
100%|██████████| 270/270 [00:01<00:00, 190.54it/s]
100%|██████████| 270/270 [00:01<00:00, 174.13it/s]
100%|██████████| 270/270 [00:01<00:00, 183.71it/s]
100%|██████████| 270/270 [00:01<00:00, 183.54it/s]
100%|██████████| 270/270 [00:01<00:00, 195.52it/s]
100%|██████████| 270/270 [00:01<00:00, 201.67it/s]
  8%|▊         | 21/270 [00:00<00:01, 209.25it/s]

Epoch 80. Train Loss: 3.950536e-06, Test Loss : 0.013301165, Test Accuracy : 0.98, Train Accuracy : 1.0 : Valid Accuracy : 0.983


100%|██████████| 270/270 [00:01<00:00, 182.13it/s]
100%|██████████| 270/270 [00:01<00:00, 194.95it/s]
100%|██████████| 270/270 [00:01<00:00, 180.37it/s]
100%|██████████| 270/270 [00:01<00:00, 193.45it/s]
100%|██████████| 270/270 [00:01<00:00, 182.30it/s]
100%|██████████| 270/270 [00:01<00:00, 192.52it/s]
100%|██████████| 270/270 [00:01<00:00, 178.73it/s]
100%|██████████| 270/270 [00:01<00:00, 182.85it/s]
100%|██████████| 270/270 [00:01<00:00, 184.27it/s]
100%|██████████| 270/270 [00:01<00:00, 175.67it/s]
  9%|▉         | 25/270 [00:00<00:01, 240.53it/s]

Epoch 90. Train Loss: 1.37819925e-05, Test Loss : 0.009812501, Test Accuracy : 0.9822222222222222, Train Accuracy : 0.9997530864197531 : Valid Accuracy : 0.985


100%|██████████| 270/270 [00:01<00:00, 192.52it/s]
100%|██████████| 270/270 [00:01<00:00, 188.30it/s]
100%|██████████| 270/270 [00:01<00:00, 184.50it/s]
100%|██████████| 270/270 [00:01<00:00, 187.15it/s]
100%|██████████| 270/270 [00:01<00:00, 178.01it/s]
100%|██████████| 270/270 [00:01<00:00, 186.42it/s]
100%|██████████| 270/270 [00:01<00:00, 198.87it/s]
100%|██████████| 270/270 [00:01<00:00, 197.53it/s]
100%|██████████| 270/270 [00:01<00:00, 182.71it/s]


In [0]:
# Move all model parameters to the CPU context before the single-sentence inference below.
model.collect_params().reset_ctx(mx.cpu())

In [0]:
def get_entitytag(sent):
    """Predict one tag per character of ``sent`` and return them as a string.

    Parameters
    ----------
    sent : str
        Raw sentence; it is split into characters and padded/clipped with
        ``length_clip`` before being fed to the model.

    Returns
    -------
    str
        Concatenation of the predicted tag tokens for the characters that fit into
        the padded sequence; ``''`` for an empty sentence.
    """
    chars = list(sent)
    if not chars:
        # Nothing to tag; skip the forward pass.
        return ''
    coded_sent = vocab_sent[length_clip(chars)]
    # The model only sees the padded/clipped sequence, so the valid length must not
    # exceed it.
    valid_len = min(len(chars), len(coded_sent))
    co = nd.array(coded_sent).expand_dims(axis=1)
    ret_code = model(co, nd.array([valid_len]))
    tag_ids = ret_code.argmax(axis=2)[0].asnumpy().astype('int').tolist()
    ret_seq = vocab_entity.to_tokens(tag_ids)
    # Slice the token list before joining: a multi-character token such as '<pad>'
    # would otherwise shift the character-based cut-off.
    return ''.join(ret_seq[:valid_len])

In [60]:
get_entitytag("파이콘이 뭔지 알려줘")

'EEEECCCCCCC'

### TODO
- Test Accuracy 95% 이상 올리기
- test_hidden 셋의 성능 90% 이상 올리기 
- Entity Tagging과 Intent Classification을 MultiTask Learning으로 통합해보기(성능이 좋아지나? 나빠지나?)