<a href="https://colab.research.google.com/github/saitros/C-estLaVie/blob/master/intent_classification_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install mxnet
!pip install gluonnlp
!pip install konlpy
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)
!apt-get install mecab mecab-ipadic-utf8 libmecab-dev swig

In [0]:
import mxnet as mx
from mxnet import gluon, autograd
from mxnet.gluon import nn, rnn
import numpy as np
import warnings
warnings.simplefilter('ignore')

import time
import itertools
from tqdm import tqdm
import multiprocessing as mp
from mxnet import nd
import gluonnlp as nlp
import re
from konlpy.tag import Mecab
import re
import pandas as pd
mecab = Mecab()

In [0]:
mecab.nouns('공부를 합시다. 커피를 먹습니다')

In [0]:
from google.colab import drive
drive.mount('/gdrive')

In [0]:
!ls /gdrive/My\ Drive/Colab\ Notebooks/

In [0]:
train_raw = pd.read_csv('/gdrive/My Drive/Colab Notebooks/intent_v3.txt',names=['sentence', 'intent'], sep='\t')

In [0]:
train_raw.head(0-30)

Unnamed: 0,sentence,intent
0,QUESTION,INTENT
1,윈터솔져 나오는 2018년 개봉한 영화 제목이 뭐지?,findMovie
2,타노스 나오는 영화 알려줘,findMovie
3,어벤져스 3편 이름 알려줘,findMovie
4,어벤져스 최신작 알려줘,findMovie
5,어벤져스 최근작품이 뭐지?,findMovie
6,루소형제 감독의 최근작품이 뭐지?,findMovie
7,MCU영화 알려줘,findMovie
8,많은 히어로들이 나오는 영화 알려줘,findMovie
9,초호화캐스팅의 헐리웃 영화 알려줘,findMovie


In [0]:
train_dataset = [(l, d) for d,l in zip(train_raw['intent'], train_raw['sentence'])]
train_dataset[:10]

[('QUESTION', 'INTENT'),
 ('윈터솔져 나오는 2018년 개봉한 영화 제목이 뭐지?', 'findMovie'),
 ('타노스 나오는 영화 알려줘', 'findMovie'),
 ('어벤져스 3편 이름 알려줘', 'findMovie'),
 ('어벤져스 최신작 알려줘', 'findMovie'),
 ('어벤져스 최근작품이 뭐지?', 'findMovie'),
 ('루소형제 감독의 최근작품이 뭐지?', 'findMovie'),
 ('MCU영화 알려줘', 'findMovie'),
 ('많은 히어로들이 나오는 영화 알려줘', 'findMovie'),
 ('초호화캐스팅의 헐리웃 영화 알려줘', 'findMovie')]

In [0]:
seq_len = 32

length_clip = nlp.data.PadSequence(seq_len, pad_val="<pad>")

#sent: sentence
def preprocess(data):
    sent, entity = data
    char_sent = list(str(sent))
    char_entity = str(entity)
    return(length_clip(char_sent), len(sent),char_entity)

def preprocess_dataset(dataset):
    start = time.time()
    with mp.Pool() as pool:
        dataset = gluon.data.SimpleDataset(pool.map(preprocess, dataset))
    end = time.time()
    print('Done! Tokenizing Time={:.2f}s, #Sentences={}'
          .format(end - start, len(dataset)))
    return dataset

In [0]:
train_preprocessed  = preprocess_dataset(train_dataset)
train_preprocessed

Done! Tokenizing Time=0.13s, #Sentences=2937


<mxnet.gluon.data.dataset.SimpleDataset at 0x7f4713b84588>

In [0]:
counter_sent   = nlp.data.count_tokens(itertools.chain.from_iterable([c for c, _, _ in train_preprocessed]))
counter_intent = nlp.data.count_tokens([c for _,_, c in train_preprocessed])

In [0]:
counter_intent

Counter({'INTENT': 1,
         'findMovie': 395,
         'findPerson': 190,
         'findPlot': 6,
         'findSimilar': 19,
         'getCast': 18,
         'getGanre': 5,
         'getGrade': 167,
         'getMaker': 383,
         'getOfficialSite': 129,
         'getOpenDate': 244,
         'getPlot': 273,
         'getPoster': 207,
         'getRunningTime': 163,
         'getTrailer': 157,
         'isTrue': 1,
         'recommend': 320,
         'search': 259})

In [0]:
counter_sent

Counter({' ': 9835,
         '-': 1,
         '.': 4,
         '0': 84,
         '1': 120,
         '2': 73,
         '3': 44,
         '4': 1,
         '5': 13,
         '6': 2,
         '7': 11,
         '8': 49,
         '9': 2,
         '<pad>': 46503,
         '?': 964,
         'A': 2,
         'C': 19,
         'D': 119,
         'E': 1,
         'F': 3,
         'G': 1,
         'I': 5,
         'J': 1,
         'K': 1,
         'L': 34,
         'M': 20,
         'N': 1,
         'O': 46,
         'P': 1,
         'Q': 1,
         'R': 33,
         'S': 11,
         'T': 8,
         'U': 53,
         'V': 77,
         'X': 1,
         'a': 5,
         'e': 6,
         'i': 2,
         'l': 6,
         'm': 2,
         'n': 3,
         'r': 6,
         's': 3,
         't': 6,
         'x': 2,
         '가': 168,
         '각': 10,
         '간': 271,
         '갈': 1,
         '감': 152,
         '같': 13,
         '개': 258,
         '객': 5,
         '거': 84,
         '걸': 31,
     

In [0]:
# 개별 글자(단어 아님)의 수 
len(counter_sent)

537

In [0]:
vocab_sent = nlp.Vocab(counter_sent, bos_token=None, eos_token=None, min_freq=3)
vocab_intent = nlp.Vocab(counter_intent, bos_token=None, eos_token=None, unknown_token=None, padding_token=None)

In [0]:
vocab_sent.idx_to_token[:10], vocab_intent.idx_to_token[:10], 

(['<unk>', '<pad>', ' ', '영', '화', '줘', '는', '?', '알', '라'],
 ['findMovie',
  'getMaker',
  'recommend',
  'getPlot',
  'search',
  'getOpenDate',
  'getPoster',
  'findPerson',
  'getGrade',
  'getRunningTime'])

In [0]:
train_preprocessed_encoded  = [(vocab_sent[sent], length ,vocab_intent[intent])  for sent, length ,intent in train_preprocessed ]
train_preprocessed_encoded[1]

([369,
  15,
  364,
  266,
  2,
  18,
  20,
  6,
  2,
  114,
  106,
  85,
  142,
  163,
  2,
  37,
  42,
  40,
  2,
  3,
  4,
  2,
  46,
  198,
  12,
  2,
  48,
  25,
  7,
  1,
  1,
  1],
 29,
 0)

In [0]:
vocab_sent.idx_to_token[369],vocab_sent.idx_to_token[15], vocab_sent.idx_to_token[364], vocab_sent.idx_to_token[266],  vocab_sent.idx_to_token[2]\
,vocab_sent.idx_to_token[18] 

('윈', '터', '솔', '져', ' ', '나')

In [0]:
vocab_intent.idx_to_token[0]

'findMovie'

In [0]:
train, test = nlp.data.train_valid_split(train_preprocessed_encoded, valid_ratio=0.1)

In [0]:
nbatch = 30
batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Stack(),
                                      nlp.data.batchify.Stack('float32'),
                                      nlp.data.batchify.Stack())

train_dataloader  = gluon.data.DataLoader(train, batch_size=nbatch, batchify_fn=batchify_fn, shuffle=True)
test_dataloader  = gluon.data.DataLoader(test, batch_size=nbatch, batchify_fn=batchify_fn, shuffle=True)

In [0]:
class IntentClassification(gluon.HybridBlock):
    def __init__(self, vocab_size, vocab_out_size, num_embed, seq_len, hidden_size, **kwargs):
        super(IntentClassification, self).__init__(**kwargs)
        self.seq_len = seq_len
        self.hidden_size = hidden_size 
        self.vocab_out_size = vocab_out_size
        with self.name_scope():
            self.embed = nn.Embedding(input_dim=vocab_size, output_dim=num_embed)
            self.bigru = rnn.GRU(self.hidden_size, dropout=0.2, bidirectional=True)
            self.dense_prev = nn.Dense(10, flatten=False)
            self.dense = nn.Dense(self.vocab_out_size)  
            
    def hybrid_forward(self, F ,inputs, length):
        em_out = self.embed(inputs)
        bigruout = self.bigru(em_out)
        masked_encoded = F.SequenceMask(bigruout,
                                        sequence_length=length,
                                        use_sequence_length=True).transpose((1,0,2))
        dense_out = self.dense_prev(masked_encoded)
        outs = self.dense(dense_out) 
        return(outs)

In [0]:
ctx = mx.cpu()

In [0]:
model = IntentClassification(vocab_size = len(vocab_sent.idx_to_token), 
                             vocab_out_size=len(vocab_intent.idx_to_token), num_embed=50, seq_len=seq_len, hidden_size=30)

In [0]:
model.initialize(mx.initializer.Xavier(), ctx=ctx)

In [0]:
trainer = gluon.Trainer(model.collect_params(),"Adam")
loss = gluon.loss.SoftmaxCELoss() 

In [0]:
model.hybridize()

In [0]:
def evaluate_accuracy(model, data_iter, ctx=ctx):
    acc = mx.metric.Accuracy()
    for i, (data, length, label) in enumerate(data_iter):
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        length = length.as_in_context(ctx)
        output = model(data.T, length)
        predictions = nd.argmax(output, axis=1)
        acc.update(preds=predictions, labels=label)
    return(acc.get()[1])

In [0]:
def calculate_loss(model, data_iter, loss_obj, ctx=ctx):
    test_loss = []
    for i, (te_data, te_length, te_label) in enumerate(data_iter):
        te_data = te_data.as_in_context(ctx)
        te_label = te_label.as_in_context(ctx)
        te_length = te_length.as_in_context(ctx)
        te_output = model(te_data.T, te_length)
        loss_te = loss_obj(te_output, te_label)
        curr_loss = nd.mean(loss_te).asscalar()
        test_loss.append(curr_loss)
    return(np.mean(test_loss))

In [0]:
epochs = 100


tot_test_loss = []
tot_test_accu = []
tot_train_loss = []
tot_train_accu = []
tot_valid_accu = [] 
for e in range(epochs):
    #batch training 
    for i, (data, length, label) in enumerate(tqdm(train_dataloader)):
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        length = length.as_in_context(ctx)
        with autograd.record():
            output = model(data.T, length)
            loss_ = loss(output, label)
            loss_.backward()
        trainer.step(data.shape[0])

    #caculate test loss
    if e % 10 == 0: 
        test_loss = calculate_loss(model, test_dataloader, loss_obj = loss, ctx=ctx) 
        train_loss = calculate_loss(model, train_dataloader, loss_obj = loss, ctx=ctx) 
        test_accu = evaluate_accuracy(model, test_dataloader,  ctx=ctx)
        train_accu = evaluate_accuracy(model, train_dataloader,  ctx=ctx)

        print("Epoch %s. Train Loss: %s, Test Loss : %s," \
        " Test Accuracy : %s," \
        " Train Accuracy : %s" % (e, train_loss, test_loss, test_accu, train_accu))    
        tot_test_loss.append(test_loss)
        tot_train_loss.append(train_loss)
        tot_test_accu.append(test_accu)
        tot_train_accu.append(train_accu)

100%|██████████| 89/89 [00:01<00:00, 53.70it/s]
  7%|▋         | 6/89 [00:00<00:01, 57.31it/s]

Epoch 0. Train Loss: 1.6732327, Test Loss : 1.6632694, Test Accuracy : 0.5408163265306123, Train Accuracy : 0.5187287173666288


100%|██████████| 89/89 [00:01<00:00, 50.66it/s]
100%|██████████| 89/89 [00:01<00:00, 50.85it/s]
100%|██████████| 89/89 [00:01<00:00, 50.45it/s]
100%|██████████| 89/89 [00:01<00:00, 50.91it/s]
100%|██████████| 89/89 [00:01<00:00, 50.83it/s]
100%|██████████| 89/89 [00:01<00:00, 50.43it/s]
100%|██████████| 89/89 [00:01<00:00, 50.01it/s]
100%|██████████| 89/89 [00:01<00:00, 49.91it/s]
100%|██████████| 89/89 [00:01<00:00, 50.13it/s]
100%|██████████| 89/89 [00:01<00:00, 50.17it/s]
  7%|▋         | 6/89 [00:00<00:01, 57.15it/s]

Epoch 10. Train Loss: 0.028381426, Test Loss : 0.23261285, Test Accuracy : 0.9421768707482994, Train Accuracy : 0.9958380628074158


100%|██████████| 89/89 [00:01<00:00, 50.50it/s]
100%|██████████| 89/89 [00:01<00:00, 49.84it/s]
100%|██████████| 89/89 [00:01<00:00, 50.48it/s]
100%|██████████| 89/89 [00:01<00:00, 49.81it/s]
100%|██████████| 89/89 [00:01<00:00, 49.66it/s]
100%|██████████| 89/89 [00:01<00:00, 49.96it/s]
100%|██████████| 89/89 [00:01<00:00, 50.13it/s]
100%|██████████| 89/89 [00:01<00:00, 49.80it/s]
100%|██████████| 89/89 [00:01<00:00, 50.08it/s]
100%|██████████| 89/89 [00:01<00:00, 50.01it/s]
  7%|▋         | 6/89 [00:00<00:01, 56.36it/s]

Epoch 20. Train Loss: 0.00397077, Test Loss : 0.32617038, Test Accuracy : 0.9455782312925171, Train Accuracy : 0.9996216420734014


100%|██████████| 89/89 [00:01<00:00, 50.18it/s]
100%|██████████| 89/89 [00:01<00:00, 49.85it/s]
100%|██████████| 89/89 [00:01<00:00, 49.89it/s]
100%|██████████| 89/89 [00:01<00:00, 50.26it/s]
100%|██████████| 89/89 [00:01<00:00, 49.81it/s]
100%|██████████| 89/89 [00:01<00:00, 49.91it/s]
100%|██████████| 89/89 [00:01<00:00, 49.30it/s]
100%|██████████| 89/89 [00:01<00:00, 49.74it/s]
100%|██████████| 89/89 [00:01<00:00, 49.48it/s]
100%|██████████| 89/89 [00:01<00:00, 49.66it/s]
  7%|▋         | 6/89 [00:00<00:01, 56.47it/s]

Epoch 30. Train Loss: 0.00095306983, Test Loss : 0.40781444, Test Accuracy : 0.9455782312925171, Train Accuracy : 1.0


100%|██████████| 89/89 [00:01<00:00, 49.06it/s]
100%|██████████| 89/89 [00:01<00:00, 49.58it/s]
100%|██████████| 89/89 [00:01<00:00, 49.72it/s]
100%|██████████| 89/89 [00:01<00:00, 48.88it/s]
100%|██████████| 89/89 [00:01<00:00, 49.98it/s]
100%|██████████| 89/89 [00:01<00:00, 49.59it/s]
100%|██████████| 89/89 [00:01<00:00, 49.79it/s]
100%|██████████| 89/89 [00:01<00:00, 49.87it/s]
100%|██████████| 89/89 [00:01<00:00, 49.29it/s]
100%|██████████| 89/89 [00:01<00:00, 49.37it/s]
  7%|▋         | 6/89 [00:00<00:01, 56.41it/s]

Epoch 40. Train Loss: 0.00040844482, Test Loss : 0.44551325, Test Accuracy : 0.9455782312925171, Train Accuracy : 1.0


100%|██████████| 89/89 [00:01<00:00, 49.82it/s]
100%|██████████| 89/89 [00:01<00:00, 50.00it/s]
100%|██████████| 89/89 [00:01<00:00, 49.89it/s]
100%|██████████| 89/89 [00:01<00:00, 49.25it/s]
100%|██████████| 89/89 [00:01<00:00, 49.19it/s]
100%|██████████| 89/89 [00:01<00:00, 49.13it/s]
100%|██████████| 89/89 [00:01<00:00, 49.44it/s]
100%|██████████| 89/89 [00:01<00:00, 49.23it/s]
100%|██████████| 89/89 [00:01<00:00, 49.25it/s]
100%|██████████| 89/89 [00:01<00:00, 49.31it/s]
  7%|▋         | 6/89 [00:00<00:01, 55.88it/s]

Epoch 50. Train Loss: 0.00020473589, Test Loss : 0.46764532, Test Accuracy : 0.9455782312925171, Train Accuracy : 1.0


100%|██████████| 89/89 [00:01<00:00, 49.33it/s]
100%|██████████| 89/89 [00:01<00:00, 49.34it/s]
100%|██████████| 89/89 [00:01<00:00, 48.13it/s]
100%|██████████| 89/89 [00:01<00:00, 46.96it/s]
100%|██████████| 89/89 [00:01<00:00, 46.89it/s]
100%|██████████| 89/89 [00:01<00:00, 46.77it/s]
100%|██████████| 89/89 [00:01<00:00, 46.78it/s]
100%|██████████| 89/89 [00:01<00:00, 47.04it/s]
100%|██████████| 89/89 [00:01<00:00, 49.17it/s]
100%|██████████| 89/89 [00:01<00:00, 49.50it/s]
  7%|▋         | 6/89 [00:00<00:01, 56.94it/s]

Epoch 60. Train Loss: 0.00010783896, Test Loss : 0.502421, Test Accuracy : 0.9455782312925171, Train Accuracy : 1.0


100%|██████████| 89/89 [00:01<00:00, 49.23it/s]
100%|██████████| 89/89 [00:01<00:00, 49.52it/s]
100%|██████████| 89/89 [00:01<00:00, 49.48it/s]
100%|██████████| 89/89 [00:01<00:00, 49.40it/s]
100%|██████████| 89/89 [00:01<00:00, 49.21it/s]
100%|██████████| 89/89 [00:01<00:00, 49.37it/s]
100%|██████████| 89/89 [00:01<00:00, 49.68it/s]
100%|██████████| 89/89 [00:01<00:00, 49.54it/s]
100%|██████████| 89/89 [00:01<00:00, 49.44it/s]
100%|██████████| 89/89 [00:01<00:00, 49.26it/s]
  7%|▋         | 6/89 [00:00<00:01, 55.33it/s]

Epoch 70. Train Loss: 6.120158e-05, Test Loss : 0.5331315, Test Accuracy : 0.9455782312925171, Train Accuracy : 1.0


100%|██████████| 89/89 [00:01<00:00, 49.34it/s]
100%|██████████| 89/89 [00:01<00:00, 49.35it/s]
100%|██████████| 89/89 [00:01<00:00, 49.26it/s]
100%|██████████| 89/89 [00:01<00:00, 48.85it/s]
100%|██████████| 89/89 [00:01<00:00, 49.44it/s]
100%|██████████| 89/89 [00:01<00:00, 49.21it/s]
100%|██████████| 89/89 [00:01<00:00, 49.35it/s]
100%|██████████| 89/89 [00:01<00:00, 48.94it/s]
100%|██████████| 89/89 [00:01<00:00, 48.89it/s]
100%|██████████| 89/89 [00:01<00:00, 49.46it/s]
  7%|▋         | 6/89 [00:00<00:01, 56.92it/s]

Epoch 80. Train Loss: 3.435269e-05, Test Loss : 0.5833335, Test Accuracy : 0.9455782312925171, Train Accuracy : 1.0


100%|██████████| 89/89 [00:01<00:00, 49.52it/s]
100%|██████████| 89/89 [00:01<00:00, 49.37it/s]
100%|██████████| 89/89 [00:01<00:00, 49.22it/s]
100%|██████████| 89/89 [00:01<00:00, 46.29it/s]
100%|██████████| 89/89 [00:01<00:00, 46.50it/s]
100%|██████████| 89/89 [00:01<00:00, 46.66it/s]
100%|██████████| 89/89 [00:01<00:00, 49.19it/s]
100%|██████████| 89/89 [00:01<00:00, 49.00it/s]
100%|██████████| 89/89 [00:01<00:00, 48.86it/s]
100%|██████████| 89/89 [00:01<00:00, 49.20it/s]
  7%|▋         | 6/89 [00:00<00:01, 56.93it/s]

Epoch 90. Train Loss: 2.0578658e-05, Test Loss : 0.5823115, Test Accuracy : 0.9455782312925171, Train Accuracy : 1.0


100%|██████████| 89/89 [00:01<00:00, 49.00it/s]
100%|██████████| 89/89 [00:01<00:00, 49.18it/s]
100%|██████████| 89/89 [00:01<00:00, 48.93it/s]
100%|██████████| 89/89 [00:01<00:00, 49.24it/s]
100%|██████████| 89/89 [00:01<00:00, 48.96it/s]
100%|██████████| 89/89 [00:01<00:00, 49.10it/s]
100%|██████████| 89/89 [00:01<00:00, 49.33it/s]
100%|██████████| 89/89 [00:01<00:00, 49.18it/s]
100%|██████████| 89/89 [00:01<00:00, 49.07it/s]


In [0]:
model.export("ka-model3")

In [0]:
load_model = gluon.nn.SymbolBlock.imports("ka-model3-symbol.json", ['data0', 'data1'], "ka-model3-0000.params")

NameError: ignored

In [0]:
def get_intent(sent):
    sent_len = len(sent)
    coded_sent = vocab_sent[length_clip(list(sent))]
    co = nd.array(coded_sent).expand_dims(axis=1)
    ret_code = load_model(co, nd.array([sent_len,]))
    ret_seq = vocab_intent.to_tokens(ret_code.argmax(axis=1).asnumpy().astype('int').tolist())
    #print(ret_code.argmax(axis=1).asnumpy().astype('int').tolist())
    #print(ret_seq)
    return(''.join(ret_seq))

## 인텐트 테스트

데이터가 300개 이상인 인텐트

In [0]:
get_intent(list(str("캡틴아메리카 맡은 배우 누구야")))

'findPerson'

In [0]:
get_intent(list(str("해리포터 만든 배급사 알려줘")))

'getMaker'

In [0]:
get_intent(list(str("영웅들이 많이 나오는 영화 알려줘")))

'recommend'

In [0]:
get_intent(list(str("우울할 때 보기 좋은 영화 알려줘")))

'recommend'

아래 부터는 데이터가 300개 미만인 것들입니다. 정확도가 다소 떨어집니다

In [0]:
get_intent(list(str("인턴 영화 개봉일 찾아줘")))

'getOpenDate'

In [0]:
get_intent(list(str("인터스텔라 영화 포스터 보여줘")))

'getPoster'

In [0]:
get_intent(list(str("스타워즈 예고편 찾아줘")))

'getPoster'

In [0]:
get_intent(list(str("스파이더맨 상영등급 좀 알려줄래")))

'getGrade'

In [0]:
get_intent(list(str("겨울왕국 줄거리 알려줘")))

'getPlot'

In [0]:
get_intent(list(str("다크나이트 공식 사이트 주소 좀")))

'recommend'

In [0]:
get_intent(list(str("타노스 나오는 영화 찾아봐")))