In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import codecs
from random import sample

import tensorflow as tf

## 1. Data loading, preprocessing and transforming

### 데이터 정보
- 나루토 자막 데이터

### 데이터 불러온 후, 탐색

In [2]:
fileObj = codecs.open("./data/naruto_send_tokenized_corpus.txt", "r", "utf-8" )
pairs = fileObj.readlines()

In [3]:
input_set = []
output_set = []

In [4]:
for pair in pairs:
    pair.replace('\r\n','')
    input_, output_ = pair.split('\t')
    input_set.append(input_)
    output_set.append(output_)

In [5]:
print(input_set[0], ' ====> ', output_set[0])

어떠냐  ====>  이걸 네가 빌린 사륜안으로 없앨 수 있겠어



In [6]:
print(input_set[12345], ' ====> ', output_set[12345])

그런 미친 소리 지랄 하지 말라 그래  ====>  알겠냐



In [7]:
len(input_set)

382996

In [8]:
len(output_set)

382996

### 가장 길이가 긴 문서 찾기
- 문서를 단어 임베딩 벡터로 이루어진 이미지처럼 만들기 위해, max_document_length를 추출

In [9]:
def max_doc_length(documents):
    max_length = 0
    longest_doc = 0
    for i, document in enumerate(documents):
        length = len(document)
        if length > max_length:
            max_length = length
            longest_doc = i
    return max_length, longest_doc

In [10]:
input_max_length, input_longest = max_doc_length(input_set)
output_max_length, output_longest = max_doc_length(output_set)

In [11]:
print('input seq 최대 길이: ', input_max_length)
print('output seq 최대 길이: ', output_max_length)

input seq 최대 길이:  107
output seq 최대 길이:  109


### Data transforming
tensorflow.contrib.learn.preprocessing 내에 **VocabularyProcessor**라는 클래스를 이용
- 모든 문서에 등장하는 단어들에 인덱스를 할당
- 길이가 다른 문서를 max_document_length로 맞춰주는 역할


In [12]:
vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(max(input_max_length, output_max_length))

In [13]:
input_seq_set = np.array(list(vocab_processor.fit_transform(input_set)))
output_seq_set = np.array(list(vocab_processor.fit_transform(output_set)))

- input sequence의 최대 길이는 107, output sequence의 최대 길이는 109
- input_seq_set의 한 성분에서 뒤의 2개 요소는 zero padding이 추가적으로 들어가므로, 이를 제거

In [14]:
input_seq_set = np.array([seq[0:-2] for seq in input_seq_set])

In [15]:
input_seq_set.shape

(382996, 107)

In [16]:
output_seq_set.shape

(382996, 109)

- vocabulary 추출

In [17]:
# http://stackoverflow.com/questions/40661684/tensorflow-vocabularyprocessor

# Extract word:id mapping from the object.
vocab_dict = vocab_processor.vocabulary_._mapping

# Sort the vocabulary dictionary on the basis of values(id).
sorted_vocab = sorted(vocab_dict.items(), key = lambda x : x[1])

# Treat the id's as index into list and create a list of words in the ascending order of id's
# word with id i goes at index i of the list.
vocabulary = list(list(zip(*sorted_vocab))[0])

# print(vocabulary)

- Data를 train / validation / test set으로 분할

In [18]:
def split_dataset(X, Y, ratio = [0.7, 0.15, 0.15] ):
    # number of examples
    data_len = len(X)
    lens = [ int(data_len*item) for item in ratio ]

    trainX, trainY = X[:lens[0]], Y[:lens[0]]
    testX, testY = X[lens[0]:lens[0]+lens[1]], Y[lens[0]:lens[0]+lens[1]]
    validX, validY = X[-lens[-1]:], Y[-lens[-1]:]

    return (trainX,trainY), (testX,testY), (validX,validY)

In [19]:
(trainX, trainY), (testX, testY), (validX, validY) = split_dataset(input_seq_set, output_seq_set)

## 2. Seq2seq modeling

In [20]:
import seq2seq_wrapper

### Model parameter

In [21]:
# parameters for data shape
xseq_len = trainX.shape[-1]
yseq_len = trainY.shape[-1]
xvocab_size = len(vocabulary)  
yvocab_size = xvocab_size

In [22]:
# parameters for training model
train_batch_size = 32
test_batch_size = 256
emb_dim = 128
num_layers = 1

In [23]:
model = seq2seq_wrapper.Seq2Seq(xseq_len=xseq_len,
                               yseq_len=yseq_len,
                               xvocab_size=xvocab_size,
                               yvocab_size=yvocab_size,
                               ckpt_path='ckpt/naruto/',
                               emb_dim=emb_dim,
                               num_layers=num_layers
                               )

<log> Building Graph </log>

In [24]:
def rand_batch_gen(x, y, batch_size):
    while True:
        sample_idx = sample(list(np.arange(len(x))), batch_size)
        yield x[sample_idx].T, y[sample_idx].T

In [25]:
val_batch_gen = rand_batch_gen(validX, validY, test_batch_size)
test_batch_gen = rand_batch_gen(testX, testY, test_batch_size)
train_batch_gen = rand_batch_gen(trainX, trainY, train_batch_size)

In [26]:
sess = model.train(train_batch_gen, val_batch_gen)


<log> Training started </log>

Model saved to disk at iteration #1000
val   loss : 0.448158

Model saved to disk at iteration #2000
val   loss : 0.427563

Model saved to disk at iteration #3000
val   loss : 0.418473

Model saved to disk at iteration #4000
val   loss : 0.417144

Model saved to disk at iteration #5000
val   loss : 0.413452

Model saved to disk at iteration #6000
val   loss : 0.404385

Model saved to disk at iteration #7000
val   loss : 0.406439

Model saved to disk at iteration #8000
val   loss : 0.405627

Model saved to disk at iteration #9000
val   loss : 0.407131
Interrupted by user at iteration 9884


## 3. Model test

In [65]:
sess = model.restore_last_session()

In [67]:
input_ = test_batch_gen.__next__()[0]
output = model.predict(sess, input_)
# print(output.shape)

(256, 109)


In [68]:
def decode(sequence, lookup, separator=''): # 0 used for padding, is ignored
    return separator.join([ lookup[element] for element in sequence if element ])

In [69]:
replies = []
for ii, oi in zip(input_.T, output):
    q = decode(sequence=ii, lookup=vocabulary, separator=' ')
    decoded = decode(sequence=oi, lookup=vocabulary, separator=' ').split(' ')
    if decoded.count('<UNK>') == 0:
        if decoded not in replies:
            print('q : [{0}]; a : [{1}]'.format(q, ' '.join(decoded)))
            replies.append(decoded)

q : [자 잠깐 기다 려보 라고 아무 리 그 녀석 이 못되 먹은 자식 이라 도 우리 를 배신 할리]; a : [그 의 의 을 거야]
