In [3]:
# Install the MXNet build named `mxnet-cu100` plus the helper packages imported below.
# NOTE(review): no versions are pinned; the recorded run resolved gluonnlp to 0.8.0.
!pip install mxnet-cu100
!pip install gluonnlp pandas tqdm

Collecting gluonnlp
[?25l  Downloading https://files.pythonhosted.org/packages/c1/c8/e180cd98ab190e7ac3c6a767a909918e719be33f967bca13d0d4cd7c5468/gluonnlp-0.8.0.tar.gz (235kB)
[K     |████████████████████████████████| 245kB 1.4MB/s 
Building wheels for collected packages: gluonnlp
  Building wheel for gluonnlp (setup.py) ... [?25l[?25hdone
  Created wheel for gluonnlp: filename=gluonnlp-0.8.0-cp36-none-any.whl size=292704 sha256=4fed194e608655d54c907bdce8c379817cb6c1e42c09936efccaee6111328e97
  Stored in directory: /root/.cache/pip/wheels/28/ff/33/d73801f242fb93c02f2076f81232fcb9a29305480cc42c5454
Successfully built gluonnlp
Installing collected packages: gluonnlp
Successfully installed gluonnlp-0.8.0


## Intent Classification

In [0]:
import pandas as pd
import numpy as np
from mxnet.gluon import nn, rnn
from mxnet import gluon, autograd
import gluonnlp as nlp
from mxnet import nd 
import mxnet as mx
import time
import itertools
from tqdm import tqdm
import multiprocessing as mp

In [0]:
# Both files are header-less, tab-separated, with the same three columns.
_read_opts = dict(names=['intent', 'entity', 'sentence'], sep='\t')

train_raw = pd.read_csv(
    "https://www.dropbox.com/s/83n0uoy20rd2vq4/trainset.txt?dl=1",
    **_read_opts)
# Alternative validation source left disabled by the original author:
#validation_raw = pd.read_csv("https://www.dropbox.com/s/kbl7kw54jdo2550/test_hidden.txt?dl=1",names=['intent', 'entity', 'sentence'], sep='\t')
validation_raw = pd.read_csv(
    "https://www.dropbox.com/s/enxp9yt9cstcal2/validation.txt?dl=1",
    **_read_opts)

In [6]:
# Preview up to the first 30 rows of the raw training frame.
train_raw.head(30)

Unnamed: 0,intent,entity,sentence
0,area,EECCCCCCCCCCCCCCCCCCC,자강의 면적은 얼마 정도되는지 알려줄래
1,birth_date,CCCCCCCCCCCCEEECCCCCCCCCCCC,WIKI PEDIA로 변재일 생년월일을 알고 싶어
2,age,EEEEEEEEEEECCCCCCCCCCCCCCCCC,남쪽 물고기자리 알파 나이가 위키백과사전으로 얼마야
3,length,EEEECCCCCCCCCCCCCCCCCC,삼양터널의 총 길이 위키백과사전에서 뭐야
4,birth_place,EEEEEECCCCCCCCCCC,코니 윌리스의 태어난 곳은 뭐지
5,weight,CCCCCCCCCCCCEEEECCCCCCCCCCCCC,WIKI백과사전 검색 AA12의 무게가 얼만지 찾아봐
6,definition,CCCCCCCCCCCCCEEECCCCCCCC,WIKIPEDIA백과로 라이프 찾아서 말해줘
7,height,EEEEEEEECCCCCCCCCCCCCCCCCCC,송파 헬리오시티 구조물 높이 위키 피디아에서 뭐야
8,birth_date,CCCEEEEEECCCCCCCCCCCCCCC,검색 HLKVAM 언제 출생했는지를 검색해라
9,height,CCCCCCCCEEEEEECCCCCCCC,위키 피디아에 푸조 508 전고가 몇이야


### Intent Classification

#### 데이터 전처리

In [0]:
# Pair every sentence with its intent label as (sentence, intent) tuples.
train_dataset = list(zip(train_raw['sentence'], train_raw['intent']))
valid_dataset = list(zip(validation_raw['sentence'], validation_raw['intent']))

In [0]:
# Fixed number of characters each sentence is brought to before encoding.
seq_len = 32

# Callable that pads a token list with "<pad>" up to `seq_len` items.
# NOTE(review): relies on PadSequence's default handling of over-long inputs
# (clipping) — confirm against the installed gluonnlp version.
length_clip = nlp.data.PadSequence(seq_len, pad_val="<pad>")

def preprocess(data):
    """Turn one (sentence, intent) pair into a fixed-length character example.

    Args:
        data: 2-tuple ``(sentence, intent)``. Both items are coerced with
            ``str`` so non-string cells (e.g. NaN or numbers parsed by pandas)
            do not raise.

    Returns:
        tuple: ``(chars, length, intent)`` where ``chars`` is the character
        list after ``length_clip`` (padded/clipped to ``seq_len``), ``length``
        is the number of real characters capped at ``seq_len``, and ``intent``
        is the label as a string.
    """
    sent, intent = data
    char_sent = list(str(sent))
    # Measure the same string that was tokenized, and never report a length
    # larger than the padded sequence that the model will actually see.
    valid_len = min(len(char_sent), seq_len)
    return (length_clip(char_sent), valid_len, str(intent))

def preprocess_dataset(dataset):
    """Run ``preprocess`` over every example in parallel and report the timing.

    Args:
        dataset: iterable of examples accepted by ``preprocess``.

    Returns:
        gluon.data.SimpleDataset wrapping the preprocessed examples, in the
        order produced by ``Pool.map``.
    """
    t_start = time.time()
    with mp.Pool() as workers:
        processed = gluon.data.SimpleDataset(workers.map(preprocess, dataset))
    elapsed = time.time() - t_start
    message = 'Done! Tokenizing Time={:.2f}s, #Sentences={}'
    print(message.format(elapsed, len(processed)))
    return processed


In [9]:
# Character-tokenize and pad both splits; each call prints its own timing line.
train_preprocessed  = preprocess_dataset(train_dataset)
valid_preprocessed  = preprocess_dataset(valid_dataset)

Done! Tokenizing Time=0.13s, #Sentences=9000
Done! Tokenizing Time=0.38s, #Sentences=1000


In [0]:
# Character frequencies over all (padded) training sentences.
counter_sent = nlp.data.count_tokens(
    itertools.chain.from_iterable(chars for chars, _, _ in train_preprocessed))
# Frequency of each intent label in the training set.
counter_intent = nlp.data.count_tokens(
    [intent for _, _, intent in train_preprocessed])

In [11]:
# Display the per-intent example counts.
counter_intent

Counter({'age': 900,
         'area': 900,
         'belong_to': 900,
         'birth_date': 900,
         'birth_place': 900,
         'definition': 900,
         'height': 900,
         'length': 900,
         'weight': 900,
         'width': 900})

In [0]:
# Character vocabulary: no BOS/EOS tokens, characters seen fewer than 15 times are dropped.
vocab_sent = nlp.Vocab(
    counter_sent,
    bos_token=None,
    eos_token=None,
    min_freq=15,
)
# Label vocabulary: every special token disabled, so indices map only to intent names.
vocab_intent = nlp.Vocab(
    counter_intent,
    bos_token=None,
    eos_token=None,
    unknown_token=None,
    padding_token=None,
)

In [13]:
# Show the first ten entries of the character and intent vocabularies.
vocab_sent.idx_to_token[:10], vocab_intent.idx_to_token[:10], 

(['<unk>', '<pad>', ' ', 'I', '이', '색', '검', '의', '지', '아'],
 ['age',
  'area',
  'belong_to',
  'birth_date',
  'birth_place',
  'definition',
  'height',
  'length',
  'weight',
  'width'])

In [0]:
def _encode_examples(examples):
    """Map (chars, length, intent) triples to (char ids, length, intent id)."""
    return [(vocab_sent[chars], n_chars, vocab_intent[intent])
            for chars, n_chars, intent in examples]


train_preprocessed_encoded = _encode_examples(train_preprocessed)
valid = _encode_examples(valid_preprocessed)

In [0]:
# Hold out 10% of the encoded training examples; the held-out part is named `test` here.
# NOTE(review): the split appears to be random and no seed is set, so results may vary between runs — confirm.
train, test = nlp.data.train_valid_split(train_preprocessed_encoded, valid_ratio=0.1)

In [0]:
nbatch = 30  # mini-batch size shared by all three loaders

# Per-field collation for (char ids, length, intent id) examples:
# stack the ids, stack the lengths as float32, stack the labels.
batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Stack(),
                                      nlp.data.batchify.Stack('float32'),
                                      nlp.data.batchify.Stack())

# Only the training loader is shuffled. The evaluation loaders keep dataset
# order so that batch composition (and therefore per-batch statistics) is
# deterministic from one evaluation to the next.
train_dataloader = gluon.data.DataLoader(train, batch_size=nbatch, batchify_fn=batchify_fn, shuffle=True)
test_dataloader = gluon.data.DataLoader(test, batch_size=nbatch, batchify_fn=batchify_fn, shuffle=False)
valid_dataloader = gluon.data.DataLoader(valid, batch_size=nbatch, batchify_fn=batchify_fn, shuffle=False)

#### 모델링 

In [0]:
class IntentClassification(gluon.HybridBlock):
    """Character-level intent classifier: embedding -> bi-GRU -> two dense layers.

    Args:
        vocab_size: number of rows in the input embedding table.
        vocab_out_size: number of output units of the final dense layer.
        num_embed: embedding dimension.
        seq_len: stored on the instance; not read by the forward pass.
        hidden_size: hidden units per GRU direction.
        **kwargs: forwarded to ``gluon.HybridBlock``.
    """

    def __init__(self, vocab_size, vocab_out_size, num_embed, seq_len, hidden_size, **kwargs):
        super().__init__(**kwargs)
        self.seq_len = seq_len
        self.hidden_size = hidden_size
        self.vocab_out_size = vocab_out_size
        # Child blocks are created in the same order as before so that the
        # automatically generated parameter names stay the same.
        with self.name_scope():
            self.embed = nn.Embedding(input_dim=vocab_size, output_dim=num_embed)
            self.bigru = rnn.GRU(hidden_size, dropout=0.2, bidirectional=True)
            self.dense_prev = nn.Dense(10, flatten=False)
            self.dense = nn.Dense(vocab_out_size)

    def hybrid_forward(self, F, inputs, length):
        """Compute the class scores for a batch.

        Args:
            F: ``mxnet.nd`` or ``mxnet.sym``, supplied by Gluon.
            inputs: token ids; callers in this notebook pass ``data.T``,
                i.e. the sequence axis first.
            length: per-example number of valid positions, used as the
                ``sequence_length`` of ``SequenceMask``.

        Returns:
            Output of the final dense layer (one row of scores per example).
        """
        embedded = self.embed(inputs)
        encoded = self.bigru(embedded)
        # Mask the GRU outputs according to each example's length.
        masked = F.SequenceMask(encoded,
                                sequence_length=length,
                                use_sequence_length=True)
        # Swap the first two axes before the dense layers.
        swapped = masked.transpose((1, 0, 2))
        projected = self.dense_prev(swapped)
        return self.dense(projected)

In [0]:
# Prefer a GPU when one is visible; otherwise fall back to the CPU so the
# notebook still runs on hosts without a usable CUDA device.
ctx = mx.gpu() if mx.context.num_gpus() > 0 else mx.cpu()

# Embedding width 50 and 30 hidden units per GRU direction; the input and
# output sizes follow the two vocabularies built above.
model = IntentClassification(vocab_size=len(vocab_sent.idx_to_token),
                             vocab_out_size=len(vocab_intent.idx_to_token),
                             num_embed=50, seq_len=seq_len, hidden_size=30)

In [0]:
# Initialize the model parameters on `ctx` with the Xavier initializer.
model.initialize(mx.initializer.Xavier(), ctx=ctx)

In [0]:
# Adam optimizer over all model parameters; no optimizer options are overridden.
trainer = gluon.Trainer(model.collect_params(),"Adam")
# Softmax cross-entropy between the model outputs and the encoded intent labels.
loss = gluon.loss.SoftmaxCELoss() 

In [0]:
# Hybridize the HybridBlock before training starts.
model.hybridize()

In [0]:
def evaluate_accuracy(model, data_iter, ctx=ctx):
    """Return the classification accuracy of ``model`` over ``data_iter``.

    Args:
        model: callable taking ``(data.T, length)`` and returning class scores.
        data_iter: iterable of ``(data, length, label)`` batches.
        ctx: context every batch is copied to before the forward pass.

    Returns:
        The accuracy value reported by ``mx.metric.Accuracy``.
    """
    metric = mx.metric.Accuracy()
    for batch in data_iter:
        # Move the whole batch to the target context in one go.
        tokens, lengths, labels = (part.as_in_context(ctx) for part in batch)
        scores = model(tokens.T, lengths)
        predicted = nd.argmax(scores, axis=1)
        metric.update(preds=predicted, labels=labels)
    _, accuracy = metric.get()
    return accuracy

In [0]:
def calculate_loss(model, data_iter, loss_obj, ctx=ctx):
    """Return the mean per-sample loss of ``model`` over ``data_iter``.

    Each batch's mean loss is weighted by the number of samples in that
    batch, so a smaller final batch does not skew the result (a plain mean of
    batch means would over-weight its samples).

    Args:
        model: callable taking ``(data.T, length)`` and returning class scores.
        data_iter: iterable of ``(data, length, label)`` batches.
        loss_obj: loss callable applied as ``loss_obj(output, label)``.
        ctx: context every batch is copied to before the forward pass.

    Returns:
        numpy.float32: sample-weighted mean loss, or NaN when ``data_iter``
        yields no samples.
    """
    weighted_loss_sum = 0.0
    n_samples = 0
    for te_data, te_length, te_label in data_iter:
        te_data = te_data.as_in_context(ctx)
        te_label = te_label.as_in_context(ctx)
        te_length = te_length.as_in_context(ctx)
        te_output = model(te_data.T, te_length)
        loss_te = loss_obj(te_output, te_label)
        batch_size = te_label.shape[0]
        # Batch mean times batch size is the summed loss of this batch.
        weighted_loss_sum += float(nd.mean(loss_te).asscalar()) * batch_size
        n_samples += batch_size
    if n_samples == 0:
        # Same value the unweighted mean of an empty list would produce.
        return np.float32('nan')
    return np.float32(weighted_loss_sum / n_samples)

In [24]:
epochs = 100
eval_every = 10  # evaluate on every `eval_every`-th epoch (and on the last one)

# Metric histories; one entry is appended per evaluated epoch.
tot_test_loss = []
tot_test_accu = []
tot_train_loss = []
tot_train_accu = []
tot_valid_accu = []
for e in range(epochs):
    # batch training
    for data, length, label in tqdm(train_dataloader):
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        length = length.as_in_context(ctx)
        # Only the forward pass and the loss need to be recorded.
        with autograd.record():
            output = model(data.T, length)
            loss_ = loss(output, label)
        # Backpropagate once recording has ended, then update with the
        # gradients normalized by the batch size.
        loss_.backward()
        trainer.step(data.shape[0])

    # calculate losses and accuracies; the final epoch is always evaluated so
    # the metrics of the finished model are reported as well
    if e % eval_every == 0 or e == epochs - 1:
        test_loss = calculate_loss(model, test_dataloader, loss_obj=loss, ctx=ctx)
        train_loss = calculate_loss(model, train_dataloader, loss_obj=loss, ctx=ctx)
        test_accu = evaluate_accuracy(model, test_dataloader, ctx=ctx)
        train_accu = evaluate_accuracy(model, train_dataloader, ctx=ctx)
        valid_accu = evaluate_accuracy(model, valid_dataloader, ctx=ctx)

        print(("Epoch %s. Train Loss: %s, Test Loss : %s,"
               " Test Accuracy : %s,"
               " Train Accuracy : %s : Valid Accuracy : %s")
              % (e, train_loss, test_loss, test_accu, train_accu, valid_accu))
        tot_test_loss.append(test_loss)
        tot_train_loss.append(train_loss)
        tot_test_accu.append(test_accu)
        tot_train_accu.append(train_accu)
        tot_valid_accu.append(valid_accu)

100%|██████████| 270/270 [00:01<00:00, 222.23it/s]
  9%|▉         | 24/270 [00:00<00:01, 227.36it/s]

Epoch 0. Train Loss: 0.18470986, Test Loss : 0.18911381, Test Accuracy : 0.9455555555555556, Train Accuracy : 0.9514814814814815 : Valid Accuracy : 0.939


100%|██████████| 270/270 [00:01<00:00, 234.59it/s]
100%|██████████| 270/270 [00:01<00:00, 245.90it/s]
100%|██████████| 270/270 [00:01<00:00, 209.04it/s]
100%|██████████| 270/270 [00:01<00:00, 236.16it/s]
100%|██████████| 270/270 [00:01<00:00, 235.68it/s]
100%|██████████| 270/270 [00:01<00:00, 230.73it/s]
100%|██████████| 270/270 [00:01<00:00, 226.29it/s]
100%|██████████| 270/270 [00:01<00:00, 243.93it/s]
100%|██████████| 270/270 [00:01<00:00, 206.43it/s]
100%|██████████| 270/270 [00:01<00:00, 208.71it/s]
  9%|▊         | 23/270 [00:00<00:01, 228.15it/s]

Epoch 10. Train Loss: 0.0107786665, Test Loss : 0.03377559, Test Accuracy : 0.99, Train Accuracy : 0.9971604938271605 : Valid Accuracy : 0.989


100%|██████████| 270/270 [00:01<00:00, 233.27it/s]
100%|██████████| 270/270 [00:01<00:00, 232.03it/s]
100%|██████████| 270/270 [00:01<00:00, 241.32it/s]
100%|██████████| 270/270 [00:01<00:00, 227.35it/s]
100%|██████████| 270/270 [00:01<00:00, 226.52it/s]
100%|██████████| 270/270 [00:01<00:00, 230.68it/s]
100%|██████████| 270/270 [00:01<00:00, 227.91it/s]
100%|██████████| 270/270 [00:01<00:00, 242.82it/s]
100%|██████████| 270/270 [00:01<00:00, 227.59it/s]
100%|██████████| 270/270 [00:01<00:00, 226.63it/s]
  9%|▉         | 25/270 [00:00<00:01, 241.38it/s]

Epoch 20. Train Loss: 6.623845e-05, Test Loss : 0.022001153, Test Accuracy : 0.9933333333333333, Train Accuracy : 1.0 : Valid Accuracy : 0.995


100%|██████████| 270/270 [00:01<00:00, 213.77it/s]
100%|██████████| 270/270 [00:01<00:00, 237.85it/s]
100%|██████████| 270/270 [00:01<00:00, 230.91it/s]
100%|██████████| 270/270 [00:01<00:00, 235.98it/s]
100%|██████████| 270/270 [00:01<00:00, 235.30it/s]
100%|██████████| 270/270 [00:01<00:00, 243.00it/s]
100%|██████████| 270/270 [00:01<00:00, 237.30it/s]
100%|██████████| 270/270 [00:01<00:00, 226.72it/s]
100%|██████████| 270/270 [00:01<00:00, 240.38it/s]
100%|██████████| 270/270 [00:01<00:00, 231.97it/s]
  9%|▉         | 25/270 [00:00<00:01, 242.76it/s]

Epoch 30. Train Loss: 1.3015229e-05, Test Loss : 0.029010795, Test Accuracy : 0.9933333333333333, Train Accuracy : 1.0 : Valid Accuracy : 0.994


100%|██████████| 270/270 [00:01<00:00, 226.23it/s]
100%|██████████| 270/270 [00:01<00:00, 228.21it/s]
100%|██████████| 270/270 [00:01<00:00, 225.57it/s]
100%|██████████| 270/270 [00:01<00:00, 230.60it/s]
100%|██████████| 270/270 [00:01<00:00, 245.43it/s]
100%|██████████| 270/270 [00:01<00:00, 241.21it/s]
100%|██████████| 270/270 [00:01<00:00, 222.03it/s]
100%|██████████| 270/270 [00:01<00:00, 231.73it/s]
100%|██████████| 270/270 [00:01<00:00, 245.55it/s]
100%|██████████| 270/270 [00:01<00:00, 238.74it/s]
 10%|▉         | 26/270 [00:00<00:00, 254.18it/s]

Epoch 40. Train Loss: 2.8318507e-06, Test Loss : 0.036456905, Test Accuracy : 0.9911111111111112, Train Accuracy : 1.0 : Valid Accuracy : 0.992


100%|██████████| 270/270 [00:01<00:00, 222.40it/s]
100%|██████████| 270/270 [00:01<00:00, 242.70it/s]
100%|██████████| 270/270 [00:01<00:00, 234.89it/s]
100%|██████████| 270/270 [00:01<00:00, 234.75it/s]
100%|██████████| 270/270 [00:01<00:00, 240.13it/s]
100%|██████████| 270/270 [00:01<00:00, 247.74it/s]
100%|██████████| 270/270 [00:01<00:00, 241.33it/s]
100%|██████████| 270/270 [00:01<00:00, 232.07it/s]
100%|██████████| 270/270 [00:01<00:00, 225.22it/s]
100%|██████████| 270/270 [00:01<00:00, 232.54it/s]
  8%|▊         | 22/270 [00:00<00:01, 218.93it/s]

Epoch 50. Train Loss: 6.434988e-07, Test Loss : 0.041608997, Test Accuracy : 0.9911111111111112, Train Accuracy : 1.0 : Valid Accuracy : 0.992


100%|██████████| 270/270 [00:01<00:00, 223.49it/s]
100%|██████████| 270/270 [00:01<00:00, 230.21it/s]
100%|██████████| 270/270 [00:01<00:00, 245.84it/s]
100%|██████████| 270/270 [00:01<00:00, 224.68it/s]
100%|██████████| 270/270 [00:01<00:00, 219.22it/s]
100%|██████████| 270/270 [00:01<00:00, 204.98it/s]
100%|██████████| 270/270 [00:01<00:00, 248.60it/s]
100%|██████████| 270/270 [00:01<00:00, 229.96it/s]
100%|██████████| 270/270 [00:01<00:00, 223.54it/s]
100%|██████████| 270/270 [00:01<00:00, 232.01it/s]
  9%|▉         | 24/270 [00:00<00:01, 237.49it/s]

Epoch 60. Train Loss: 1.4549369e-07, Test Loss : 0.051583666, Test Accuracy : 0.9911111111111112, Train Accuracy : 1.0 : Valid Accuracy : 0.992


100%|██████████| 270/270 [00:01<00:00, 237.42it/s]
100%|██████████| 270/270 [00:01<00:00, 219.72it/s]
100%|██████████| 270/270 [00:01<00:00, 221.40it/s]
100%|██████████| 270/270 [00:01<00:00, 229.97it/s]
100%|██████████| 270/270 [00:01<00:00, 200.17it/s]
100%|██████████| 270/270 [00:01<00:00, 206.36it/s]
100%|██████████| 270/270 [00:01<00:00, 224.62it/s]
100%|██████████| 270/270 [00:01<00:00, 224.25it/s]
100%|██████████| 270/270 [00:01<00:00, 238.12it/s]
100%|██████████| 270/270 [00:01<00:00, 236.44it/s]
 10%|█         | 27/270 [00:00<00:00, 256.06it/s]

Epoch 70. Train Loss: 9.1341244e-05, Test Loss : 0.021057624, Test Accuracy : 0.9944444444444445, Train Accuracy : 1.0 : Valid Accuracy : 0.995


100%|██████████| 270/270 [00:01<00:00, 231.05it/s]
100%|██████████| 270/270 [00:01<00:00, 232.42it/s]
100%|██████████| 270/270 [00:01<00:00, 220.26it/s]
100%|██████████| 270/270 [00:01<00:00, 237.14it/s]
100%|██████████| 270/270 [00:01<00:00, 221.74it/s]
100%|██████████| 270/270 [00:01<00:00, 227.62it/s]
100%|██████████| 270/270 [00:01<00:00, 237.04it/s]
100%|██████████| 270/270 [00:01<00:00, 231.40it/s]
100%|██████████| 270/270 [00:01<00:00, 224.03it/s]
100%|██████████| 270/270 [00:01<00:00, 228.41it/s]
  8%|▊         | 22/270 [00:00<00:01, 213.39it/s]

Epoch 80. Train Loss: 1.5107484e-05, Test Loss : 0.020093959, Test Accuracy : 0.9955555555555555, Train Accuracy : 1.0 : Valid Accuracy : 0.995


100%|██████████| 270/270 [00:01<00:00, 246.41it/s]
100%|██████████| 270/270 [00:01<00:00, 239.37it/s]
100%|██████████| 270/270 [00:01<00:00, 235.47it/s]
100%|██████████| 270/270 [00:01<00:00, 236.65it/s]
100%|██████████| 270/270 [00:01<00:00, 243.75it/s]
100%|██████████| 270/270 [00:01<00:00, 227.64it/s]
100%|██████████| 270/270 [00:01<00:00, 245.69it/s]
100%|██████████| 270/270 [00:01<00:00, 240.43it/s]
100%|██████████| 270/270 [00:01<00:00, 235.14it/s]
100%|██████████| 270/270 [00:01<00:00, 251.99it/s]
 10%|█         | 28/270 [00:00<00:00, 270.58it/s]

Epoch 90. Train Loss: 4.6649916e-06, Test Loss : 0.019216476, Test Accuracy : 0.9944444444444445, Train Accuracy : 1.0 : Valid Accuracy : 0.996


100%|██████████| 270/270 [00:01<00:00, 231.63it/s]
100%|██████████| 270/270 [00:01<00:00, 235.63it/s]
100%|██████████| 270/270 [00:01<00:00, 246.76it/s]
100%|██████████| 270/270 [00:01<00:00, 234.70it/s]
100%|██████████| 270/270 [00:01<00:00, 228.10it/s]
100%|██████████| 270/270 [00:01<00:00, 233.21it/s]
100%|██████████| 270/270 [00:01<00:00, 241.09it/s]
100%|██████████| 270/270 [00:01<00:00, 238.56it/s]
100%|██████████| 270/270 [00:01<00:00, 239.69it/s]


In [0]:
# Move the model parameters to the CPU context.
# NOTE(review): presumably so get_intent below can feed arrays created with plain nd.array — confirm.
model.collect_params().reset_ctx(mx.cpu())

In [0]:
def get_intent(sent):
    """Predict the intent label of a single sentence.

    Args:
        sent: the sentence to classify; coerced with ``str`` and split into
            characters, mirroring the training-time preprocessing.

    Returns:
        str: the predicted intent token from ``vocab_intent``.
    """
    chars = list(str(sent))
    # The model only sees `seq_len` positions after padding/clipping, so the
    # mask length must not exceed that.
    sent_len = min(len(chars), seq_len)
    coded_sent = vocab_sent[length_clip(chars)]
    # Add a second axis of size 1 so the input is a batch holding one sentence.
    co = nd.array(coded_sent).expand_dims(axis=1)
    ret_code = model(co, nd.array([sent_len]))
    ret_seq = vocab_intent.to_tokens(ret_code.argmax(axis=1).asnumpy().astype('int').tolist())
    return ''.join(ret_seq)

In [31]:
# Try the trained classifier on a single hand-written sentence.
get_intent("파이콘이 뭔지 알려줘?")

'definition'

### TODO
- 개별 Intent와 Entity 모형을 하나의 모형으로 구축해본다. (Multi-Task Learning) 
  - 분류 성능이 좋아지는가? 학습 수렴 속도는 어떠한가?