# seq2seq Model


In [None]:
!pip install -q tensorflow-gpu==2.0.0-rc1

In [None]:
!pip install konlpy

In [1]:
import os

In [None]:
from google.colab import auth
auth.authenticate_user()

from google.colab import drive
drive.mount('/content/gdrive')

In [3]:
base_path = '/content/gdrive/My Drive/Colab Notebooks'

In [8]:
data_in_path = base_path + '/data_in/'
data_out_path = base_path + '/data_out/'

In [9]:
if not os.path.exists(data_in_path):
    os.makedirs(data_in_path)
    
if not os.path.exists(data_out_path):
    os.makedirs(data_out_path)

In [10]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from konlpy.tag import Okt

In [11]:
data = pd.read_csv(data_in_path + 'ChatBotData.csv')

In [12]:
def tokenize_by_morph(text, okt):
    tokenized_question = []
    tokenized_answer = []
    for question, answer in text:
        tokenized_question.append(okt.morphs(question))
        tokenized_answer.append(okt.morphs(answer))
    
    return tokenized_question, tokenized_answer

In [13]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [14]:
def build_vocab(word_list, special_tokens = ['<PAD>', '<BOS>', '<EOS>']):
    from collections import Counter

    word_counts = Counter(word_list)
    idx2word = special_tokens + [word for word, _ in word_counts.most_common()]
    word2idx = {word:idx for idx, word in enumerate(idx2word)}

    return idx2word, word2idx

In [15]:
def text_to_sequence(text_list, word2idx):
    sequence = []
    for text in text_list:
        sequence.append([word2idx[word] for word in text if word in word2idx.keys()])
    
    return sequence    

In [16]:
def sequence_to_text(sequence, vocab):
    return [vocab[idx] for idx in sequence if idx != 0]

In [17]:
train_data, test_data = train_test_split(data, test_size = 0.1)

okt = Okt()
train_question, train_answer = tokenize_by_morph(train_data[['Q', 'A']].values, okt)
test_question, test_answer = tokenize_by_morph(test_data[['Q', 'A']].values, okt)

-------------------------------------------------------------------------------
Deprecated: convertStrings was not specified when starting the JVM. The default
behavior in JPype will be False starting in JPype 0.8. The recommended setting
for new code is convertStrings=False.  The legacy value of True was assumed for
please file a ticket with the developer.
-------------------------------------------------------------------------------

  """)


In [18]:
print('train qeustion')
print(train_question[:3])
print('\ntest qeustion')
print(test_question[:3])

train qeustion
[['처음', '보는', '사람', '이랑', '뭔', '말', '을', '하지'], ['요가', '온', '아줌마', '들', '이', '참견', '해'], ['재회', '그리고', '다시', '이별']]

test qeustion
[['첫', '눈', '에', '반함'], ['남친', '이', '사진', '잘', '찍었으면'], ['혼자', '가', '편하대']]


In [19]:
flatten_word = sum(train_question+train_answer, [])
idx2word, word2idx = build_vocab(sum(train_question+train_answer, []))

In [20]:
print('flatten word')
print(flatten_word[:5])
print('\nidx2word')
print(idx2word[:5])
print('\nword2idx')
print(word2idx)

flatten word
['처음', '보는', '사람', '이랑', '뭔']

idx2word
['<PAD>', '<BOS>', '<EOS>', '.', '이']

word2idx
{'<PAD>': 0, '<BOS>': 1, '<EOS>': 2, '.': 3, '이': 4, '가': 5, '을': 6, '?': 7, '거': 8, '사람': 9, '에': 10, '예요': 11, '도': 12, '은': 13, '를': 14, '요': 15, '사랑': 16, '것': 17, '해보세요': 18, '생각': 19, '안': 20, '잘': 21, '의': 22, '나': 23, '마음': 24, '너무': 25, '수': 26, '하는': 27, '는': 28, '이별': 29, '더': 30, '좋아하는': 31, '봐요': 32, '보세요': 33, '하고': 34, '말': 35, '해': 36, '연락': 37, '내': 38, '할': 39, '한': 40, '게': 41, '시간': 42, '만': 43, '많이': 44, '좋은': 45, '못': 46, '들': 47, '한테': 48, '좀': 49, '때': 50, '썸': 51, '같아요': 52, '친구': 53, '으로': 54, '에서': 55, '저': 56, '하세요': 57, '그': 58, '같아': 59, '하지': 60, '있을': 61, '있어요': 62, '싶어': 63, '다': 64, '마세요': 65, '오늘': 66, '!': 67, '고': 68, '일': 69, '이에요': 70, '뭐': 71, '건': 72, '남자': 73, '이제': 74, '하면': 75, '연애': 76, '랑': 77, '에게': 78, '자신': 79, '로': 80, '죠': 81, '여자친구': 82, '왜': 83, '적': 84, '다른': 85, '있는': 86, '남자친구': 87, '해도': 88, '어떻게': 89, '네': 90, '인': 91, '혼자': 92, 

In [22]:
index_train_question = text_to_sequence(train_question, word2idx)
index_train_answer = text_to_sequence(train_answer, word2idx)

index_test_question = text_to_sequence(test_question, word2idx)
index_test_answer = text_to_sequence(test_answer, word2idx)

In [23]:
print('index train question')
print(index_train_question[:3])
print('\nindex test question')
print(index_test_question[:3])

index train question
[[292, 314, 9, 123, 581, 35, 6, 60], [4263, 1120, 4264, 47, 4, 1737, 36], [378, 698, 102, 29]]

index test question
[[661, 178, 10], [268, 4, 437, 21], [92, 5]]


In [24]:
train_src_inputs = []
train_tgt_inputs = []
train_labels = []

test_src_inputs = []
test_tgt_inputs = []
test_labels = []

# 수정
for question, answer in zip(index_train_question, index_train_answer):
    train_src_inputs.append(question)
#     train_tgt_inputs.append([word2idx['<BOS>']]+answer[1:])
    train_tgt_inputs.append([word2idx['<BOS>']]+answer[:-1])
    train_labels.append(answer[1:]+[word2idx['<EOS>']])
    
for question, answer in zip(index_test_question, index_test_answer):
    test_src_inputs.append(question)
#     test_tgt_inputs.append([word2idx['<BOS>']]+answer[1:])
    test_tgt_inputs.append([word2idx['<BOS>']]+answer[:-1])
    test_labels.append(answer[1:]+[word2idx['<EOS>']])

In [25]:
max_length = 31
batch_size = 64

In [64]:
pad_train_src_inputs = pad_sequences(train_src_inputs, maxlen = max_length, padding = 'post')
pad_train_tgt_inputs = pad_sequences(train_tgt_inputs, maxlen = max_length, padding = 'post')
pad_train_labels = pad_sequences(train_labels, maxlen = max_length, padding = 'post')

pad_test_src_inputs = pad_sequences(test_src_inputs, maxlen = max_length, padding = 'post')
pad_test_tgt_inputs = pad_sequences(test_tgt_inputs, maxlen = max_length, padding = 'post')
pad_test_labels = pad_sequences(test_labels, maxlen = max_length, padding = 'post')

In [65]:
print(pad_train_src_inputs.shape)
print(pad_train_tgt_inputs.shape)
print(pad_train_labels.shape)

print(pad_test_src_inputs.shape)
print(pad_test_tgt_inputs.shape)
print(pad_test_labels.shape)

(10640, 31)
(10640, 31)
(10640, 31)
(1183, 31)
(1183, 31)
(1183, 31)


In [72]:
num_epochs = 3
batch_size = 64
model_name = 'seq2seq'

In [67]:
def mapping_function(src, tgt, label=None):
    feature = {'src': src, 'tgt': tgt}
    if label is not None:
        return feature, label
    else:
        return feature
    
dataset = tf.data.Dataset.from_tensor_slices((pad_train_src_inputs, pad_train_tgt_inputs, pad_train_labels))
dataset = dataset.shuffle(len(pad_train_labels))
dataset = dataset.batch(batch_size) 
dataset = dataset.map(mapping_function)
    
test_dataset = tf.data.Dataset.from_tensor_slices((pad_test_src_inputs, pad_test_tgt_inputs, pad_test_labels))
test_dataset = test_dataset.shuffle(len(pad_test_labels))
test_dataset = test_dataset.batch(batch_size) 
test_dataset = test_dataset.map(mapping_function)

In [68]:
from tensorflow.keras import layers

In [69]:
class Encoder(layers.Layer):
    def __init__(self, **kargs):
        super(Encoder, self).__init__()
        self.embedding = layers.Embedding(input_dim=kargs['vocab_size'],
                                         output_dim=kargs['embedding_dimension'])
        self.lstm_layers = [layers.LSTM(units=kargs['lstm_dimension'],
                                        return_sequences=True,
                                        return_state=True) for _ in range(kargs['num_lstm'])]
        
    def call(self, inputs):
        h_states = []
        c_states = []
        x = tf.reverse(inputs, [1])
        x = self.embedding(x)
        for layer in self.lstm_layers:
            x, h, c = layer(x)
            h_states.append(h)
            c_states.append(c)
            
        return x, h_states, c_states 

In [70]:
class Decoder(layers.Layer):
    def __init__(self, **kargs):
        super(Decoder, self).__init__()
        self.embedding = layers.Embedding(input_dim=kargs['vocab_size'],
                                         output_dim=kargs['embedding_dimension'])
        self.lstm_layers = [layers.LSTM(units=kargs['lstm_dimension'],
                                        return_sequences=True) for _ in range(kargs['num_lstm'])]
        
    def call(self, inputs, h_states, c_states):
        x = self.embedding(inputs)
        for layer, h, c in zip(self.lstm_layers, h_states, c_states):
            x = layer(x, initial_state=[h, c])
            
        return x

In [71]:
class Seq2seq(tf.keras.Model):
    def __init__(self, **kargs):
        super(Seq2seq, self).__init__(name=model_name)
        self.encoder = Encoder(**kargs)
        self.decoder = Decoder(**kargs)
        self.generator = layers.Dense(units=kargs['vocab_size'])
    
    def call(self, inputs):
        src = inputs['src']
        tgt = inputs['tgt']
        
        _, h_states, c_states = self.encoder(src)
        decoder_outputs = self.decoder(tgt, h_states, c_states)
        outputs = self.generator(decoder_outputs)
            
        return outputs

In [73]:
kargs = {'vocab_size': len(word2idx),
        'embedding_dimension': 128,
        'lstm_dimension': 128,
        'num_lstm': 4}

In [74]:
model = Seq2seq(**kargs)

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['sparse_categorical_accuracy'])

In [77]:
checkpoint_path = data_out_path + model_name + '/weights.{epoch:02d}-{val_loss:.2f}'
checkpoint_dir = os.path.dirname(checkpoint_path)

cp_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_path, verbose=1, save_weights_only=True)

model.fit(dataset, epochs=num_epochs,
             validation_data=test_dataset,
         callbacks=[cp_callback])

Epoch 1/3
    167/Unknown - 69s 412ms/step - loss: 1.8835 - sparse_categorical_accuracy: 0.7932
Epoch 00001: saving model to ./data_out/seq2seq/weights.01-1.85
Epoch 2/3
Epoch 00002: saving model to ./data_out/seq2seq/weights.02-1.74
Epoch 3/3
Epoch 00003: saving model to ./data_out/seq2seq/weights.03-1.49


<tensorflow.python.keras.callbacks.History at 0x7feaeb5ed9e8>