# seq2seq Model


In [None]:
!pip install -q tensorflow-gpu==2.0.0-rc1

In [None]:
!pip install konlpy

In [2]:
import os

In [None]:
from google.colab import auth
auth.authenticate_user()

from google.colab import drive
drive.mount('/content/gdrive')

In [3]:
base_path = '/content/gdrive/My Drive/Colab Notebooks'

In [5]:
data_in_path = base_path + '/data_in/'
data_out_path = base_path + '/data_out/'

In [6]:
if not os.path.exists(data_in_path):
    os.makedirs(data_in_path)
    
if not os.path.exists(data_out_path):
    os.makedirs(data_out_path)

In [7]:
import tensorflow as tf
import pandas as pd
import pickle

from random import shuffle
from sklearn.model_selection import train_test_split
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [8]:
data = pd.read_csv(data_in_path + 'ChatBotData.csv')

In [9]:
train_data, test_data = train_test_split(data, test_size = 0.1)

In [18]:
train_text_list = (train_data.Q + ' // ' + train_data.A).tolist()
test_text_list = (test_data.Q + ' // ' + test_data.A).tolist()

In [19]:
train_text = '\n'.join(train_text_list)
test_text = '\n'.join(test_text_list)

In [20]:
train_text_path = data_in_path + 'train_chat_data.txt' 
test_text_path = data_in_path + 'test_chat_data.txt' 

In [21]:
with open(train_text_path, 'w') as f:
    f.write(train_text)

with open(test_text_path, 'w') as f:
    f.write(test_text)

In [122]:
class Dataset:
    
    def __init__(self, train_path, test_path, is_shuffle, train_bs,
                 test_bs, epoch, max_length, vocab_path):
        self.train_path = train_path
        self.test_path = test_path
        self.is_shuffle = is_shuffle
        self.train_bs = train_bs
        self.test_bs = test_bs
        self.epoch = epoch
        self.max_length = max_length
        self.okt = Okt()
        self.special_tokens = ['<PAD>', '<BOS>', '<EOS>']
        
        if not os.path.exists(vocab_path):
            print('There is no vocabulary...')
            print('Building vocabulary...')
            self.build_vocab_by_chatdata(vocab_path)
            print('Successfully build vocabulary!')
        
        print('Loading vocabulary...')    
        self.idx2word, self.word2idx = pickle.load(open(vocab_path, 'rb'))
        print('Successfully load vocabulary!')
    
    def build_vocab(self, word_list):
        from collections import Counter

        word_counts = Counter(word_list)
        idx2word = self.special_tokens + [word for word, _ in word_counts.most_common()]
        word2idx = {word:idx for idx, word in enumerate(idx2word)}

        return idx2word, word2idx
    
    def build_vocab_by_chatdata(self, vocab_path):
        data = []
        with open(self.train_path, 'r') as f:
            for line in f.readlines():
                data += line.split('//')

        tokenized_data = self.tokenize_by_morph(data)
        
        word_list = sum(tokenized_data, [])
        idx2word, word2idx = self.build_vocab(word_list)
        
        vocab = (idx2word, word2idx)
        pickle.dump(vocab, open(vocab_path, 'wb'))
        
    def tokenize_by_morph(self, text):
        tokenized_text = []
        for sentence in text:
            tokenized_text.append(self.okt.morphs(sentence))

        return tokenized_text
    
    def text_to_sequence(self, text_list):
        sequences = []
        for text in text_list:
            sequences.append([self.word2idx[word] for word in text if word in self.word2idx.keys()])

        return sequences

    def sequence_to_text(self, sequence):
        
        return [self.idx2word[idx] for idx in sequence if idx != 0]
    
    def make_decoder_input_and_label(self, answers):
        
        decoder_input = []
        labels = []
        
        for sentence in answers:
            decoder_input.append(['<BOS>'] + sentence[:-1])
            labels.append(sentence[1:] + ['<EOS>'])
        
        return decoder_input, labels
            
    
    def read_lines(self, indices, path):
        questions = []
        answers = []

        with open(path, 'r') as f:
            lines = f.readlines()

        for index in indices:
            text = lines[index]
            question, answer = text.split('//')
            questions.append(question)
            answers.append(answer)

        return questions, answers

    def data_generator(self, is_train):

        if is_train:
            batch_size = self.train_bs
            is_shuffle = self.is_shuffle
            path = self.train_path
        else:
            batch_size = self.test_bs
            is_shuffle = False
            path = self.test_path

        with open(path, 'r') as f:
            data_length = len(f.readlines())

        indices = list(range(data_length))
        if is_shuffle:
            shuffle(indices)

        current_count = 0
        while True:
            if current_count >= data_length:
                return
            else:
                target_indices = indices[current_count:current_count+batch_size]
                questions, answers = self.read_lines(target_indices, path)

                tokenized_questions = self.tokenize_by_morph(questions)
                tokenized_answers = self.tokenize_by_morph(answers)
                
                tokenized_encoder_inputs = tokenized_questions
                tokenized_decoder_inputs, tokenized_labels = self.make_decoder_input_and_label(tokenized_answers)
                

                indexed_encoder_inputs = self.text_to_sequence(tokenized_encoder_inputs)
                indexed_decoder_inputs = self.text_to_sequence(tokenized_decoder_inputs)
                indexed_labels = self.text_to_sequence(tokenized_labels)


                padded_encoder_inputs = pad_sequences(indexed_encoder_inputs,
                                                      maxlen = self.max_length,
                                                      padding = 'post')
                padded_decoder_inputs = pad_sequences(indexed_decoder_inputs,
                                                      maxlen = self.max_length,
                                                      padding = 'post')

                padded_labels = pad_sequences(indexed_labels,
                                              maxlen = self.max_length,
                                              padding = 'post')


                yield padded_encoder_inputs, padded_decoder_inputs, padded_labels

In [123]:
train_text_path = data_in_path + 'train_chat_data.txt' 
test_text_path = data_in_path + 'test_chat_data.txt' 
vocab_path = data_in_path+'ChatBotData.voc'

dataset = Dataset(train_path = train_text_path,
                  test_path = test_text_path,
                  is_shuffle = True,
                  train_bs = 64,
                  test_bs = 128,
                  epoch = 10,
                  max_length = 30,
                  vocab_path = vocab_path)

Loading vocabulary...
Successfully load vocabulary!


In [124]:
model_name = 'seq2seq_attn'

In [125]:
def mapping_fn(src, tgt, label = None):
    features = {"src": src, 'tgt': tgt}
    if label is not None:
        return features, label
    else:
        return features
    
train_dataset = tf.data.Dataset.from_generator(generator = lambda: dataset.data_generator(is_train=True),
                                        output_types = (tf.int64, tf.int64, tf.int64),
                                        output_shapes = ((None, dataset.max_length),
                                                         (None, dataset.max_length),
                                                         (None, dataset.max_length)))
train_dataset = train_dataset.map(mapping_fn)
        

test_dataset = tf.data.Dataset.from_generator(generator = lambda: dataset.data_generator(is_train=False),
                                        output_types = (tf.int64, tf.int64),
                                        output_shapes = ((None, dataset.max_length),
                                                         (None, dataset.max_length)))
test_dataset = test_dataset.map(mapping_fn)

In [126]:
from tensorflow.keras import layers

In [146]:
class Encoder(layers.Layer):
    def __init__(self, **kargs):
        super(Encoder, self).__init__()
        self.embedding = layers.Embedding(input_dim=kargs['vocab_size'],
                                         output_dim=kargs['embedding_dimension'])
        self.gru_layer = layers.GRU(units=kargs['gru_dimension'],
                                    return_sequences=True,
                                    return_state=True)
        
    def call(self, inputs):
        x = self.embedding(inputs)
        x, state = self.gru_layer(x)
            
        return x, state

In [135]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, attention_dimension):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(attention_dimension)
        self.W2 = tf.keras.layers.Dense(attention_dimension)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        hidden_with_time_axis = tf.expand_dims(query, 1)

        score = self.V(tf.nn.tanh(
            self.W1(values) + self.W2(hidden_with_time_axis)))

        attention_weights = tf.nn.softmax(score, axis=1)

        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [152]:
class Decoder(layers.Layer):
    def __init__(self, **kargs):
        super(Decoder, self).__init__()
        self.embedding = layers.Embedding(input_dim=kargs['vocab_size'],
                                         output_dim=kargs['embedding_dimension'])
        self.gru_layer = layers.GRU(units=kargs['gru_dimension'],
                                     return_sequences=True,
                                     return_state=True)
        self.attention_layer = BahdanauAttention(attention_dimension=kargs['attention_dimension'])
        
    def call(self, x, state, encoder_outputs):
        
        context_vector, attention_weights = self.attention_layer(state, encoder_outputs)
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state = self.gru_layer(x, initial_state=state)
            
        return x, state

In [161]:
class Seq2seq(tf.keras.Model):
    def __init__(self, **kargs):
        super(Seq2seq, self).__init__(name=model_name)
        self.encoder = Encoder(**kargs)
        self.decoder = Decoder(**kargs)
        self.generator = layers.Dense(units=kargs['vocab_size'])
        self.teacher_forcing_rate = kargs['teacher_forcing_rate']
    
    @tf.function
    def call(self, inputs):
        src = inputs['src']
        tgt = inputs['tgt']
        
        encoder_outputs, state = self.encoder(src)
        
        decoder_input = tgt[:, :1]
        
        output_list = []
        for t in range(tgt.shape[1]):
            output, state = self.decoder(decoder_input, state, encoder_outputs)
            output = self.generator(output)
            output_list.append(output)
            if tf.random.uniform(()) > self.teacher_forcing_rate:
                output = tf.reshape(output, (-1, output.shape[2]))
                decoder_input = tf.expand_dims(tf.argmax(output, -1), -1)
            else:
                decoder_input = tgt[:, t:t+1]
        
        outputs = tf.concat(output_list, 1)
            
        return outputs

In [162]:
kargs = {'vocab_size': len(dataset.word2idx),
        'embedding_dimension': 128,
        'gru_dimension': 128,
        'teacher_forcing_rate': 0.5,
        'attention_dimension': 128}

In [163]:
model = Seq2seq(**kargs)

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['sparse_categorical_accuracy'])

In [164]:
checkpoint_path = data_out_path + model_name + '/weights.{epoch:02d}-{val_loss:.2f}'
checkpoint_dir = os.path.dirname(checkpoint_path)

cp_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_path, verbose=1, save_weights_only=True)

model.fit(train_dataset, epochs=10,
         validation_data=test_dataset,
         callbacks=[cp_callback])

Epoch 1/10
     89/Unknown - 79s 891ms/step - loss: 2.2854 - sparse_categorical_accuracy: 0.7603

KeyboardInterrupt: 