# seq2seq Model


In [1]:
import os

In [None]:
from google.colab import auth
auth.authenticate_user()

from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
base_path = './gdrive/My Drive/Colab Notebooks'

In [98]:
data_in_path = base_path + '/data_in/'
data_out_path = base_path + '/data_out/'

In [99]:
if not os.path.exists(data_in_path):
    os.makedirs(data_in_path)
    
if not os.path.exists(data_out_path):
    os.makedirs(data_out_path)

In [5]:
import tensorflow as tf
import pandas as pd
import pickle

from random import shuffle
from sklearn.model_selection import train_test_split
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
!wget -P /content/gdrive/My\ Drive/Colab\ Notebooks/data_in/ https://raw.githubusercontent.com/changwookjun/learningspoons/master/Data/ChatBotData.csv

In [46]:
data = pd.read_csv(data_in_path + 'ChatBotData.csv')

In [40]:
train_data, test_data = train_test_split(data, test_size = 0.1)

In [10]:
train_path = data_in_path + 'train_chat_data.csv'
test_path = data_in_path + 'test_chat_data.csv'

In [42]:
train_data.to_csv(train_path, index=False)
test_data.to_csv(test_path, index=False)

In [299]:
class Dataset:
    
    def __init__(self, train_path, test_path, is_shuffle, train_bs,
                 test_bs, epoch, max_length, vocab_path, is_header_first):
        self.train_path = train_path
        self.test_path = test_path
        self.is_shuffle = is_shuffle
        self.train_bs = train_bs
        self.test_bs = test_bs
        self.epoch = epoch
        self.max_length = max_length
        self.is_header_first = is_header_first
        self.okt = Okt()
        self.special_tokens = ['<PAD>', '<BOS>', '<EOS>']
        
        if not os.path.exists(vocab_path):
            print('There is no vocabulary...')
            print('Building vocabulary...')
            self.build_vocab_by_chatdata(vocab_path)
            print('Successfully build vocabulary!')
        
        print('Loading vocabulary...')    
        self.idx2word, self.word2idx = pickle.load(open(vocab_path, 'rb'))
        print('Successfully load vocabulary!')
    
    def build_vocab(self, word_list):
        from collections import Counter

        word_counts = Counter(word_list)
        idx2word = self.special_tokens + [word for word, _ in word_counts.most_common()]
        word2idx = {word:idx for idx, word in enumerate(idx2word)}

        return idx2word, word2idx
    
    def build_vocab_by_chatdata(self, vocab_path):
        data = pd.read_csv(self.train_path)

        questions = self.tokenize_by_morph(data['Q'].values)
        answers = self.tokenize_by_morph(data['Q'].values)
        
        word_list = sum(questions+answers, [])
        idx2word, word2idx = self.build_vocab(word_list)
        
        vocab = (idx2word, word2idx)
        pickle.dump(vocab, open(vocab_path, 'wb'))
        
    def tokenize_by_morph(self, text):
        tokenized_text = []
        for sentence in text:
            tokenized_text.append(self.okt.morphs(sentence))

        return tokenized_text
    
    def text_to_sequence(self, text_list):
        sequences = []
        for text in text_list:
            sequences.append([self.word2idx[word] for word in text if word in self.word2idx.keys()])

        return sequences

    def sequence_to_text(self, sequence):
        
        return [self.idx2word[idx] for idx in sequence if idx != 0]
    
    def make_decoder_input_and_label(self, answers):
        
        decoder_input = []
        labels = []
        
        for sentence in answers:
            decoder_input.append(['<BOS>'] + sentence)
            labels.append(sentence + ['<EOS>'])
        
        return decoder_input, labels
            
    
    def read_lines(self, indices, path):
        line_count = 0
        questions = []
        answers = []

        with open(path, 'r') as f:
            if self.is_header_first:
                f.readline()
            for line in f.readlines():
                if line_count in indices:
                    try:
                        question, answer, _ = next(csv.reader([line], skipinitialspace=True))
                    except:
                        print(line)
                        print(line_count)
                    questions.append(question)
                    answers.append(answer)
                line_count += 1

        return questions, answers

    def data_generator(self, is_train):

        if is_train:
            batch_size = self.train_bs
            is_shuffle = self.is_shuffle
            path = self.train_path
        else:
            batch_size = self.test_bs
            is_shuffle = False
            path = self.test_path

        with open(path, 'r') as f:
            if self.is_header_first:
                data_length = len(f.readlines())-1
            else:
                data_length = len(f.readlines())

        indices = list(range(data_length))
        if is_shuffle:
            shuffle(indices)

        current_count = 0
        while True:
            if current_count >= data_length:
                return
            else:
                target_indices = indices[current_count:current_count+batch_size]
                questions, answers = self.read_lines(target_indices, path)

                tokenized_questions = self.tokenize_by_morph(questions)
                tokenized_answers = self.tokenize_by_morph(answers)
                
                tokenized_encoder_inputs = tokenized_questions
                tokenized_decoder_inputs, tokenized_labels = self.make_decoder_input_and_label(tokenized_answers)
                

                indexed_encoder_inputs = self.text_to_sequence(tokenized_encoder_inputs)
                indexed_decoder_inputs = self.text_to_sequence(tokenized_decoder_inputs)
                indexed_labels = self.text_to_sequence(tokenized_labels)


                padded_encoder_inputs = pad_sequences(indexed_encoder_inputs,
                                                      maxlen = self.max_length,
                                                      padding = 'pre')
                padded_decoder_inputs = pad_sequences(indexed_decoder_inputs,
                                                      maxlen = self.max_length,
                                                      padding = 'pre')

                padded_labels = pad_sequences(indexed_labels,
                                              maxlen = self.max_length,
                                              padding = 'pre')

                
                yield padded_encoder_inputs, padded_decoder_inputs, padded_labels

    
    def mapping_fn(self, question, answer, labels=None):
        features = {"question": question, 'answer': answer}

        return features, labels
    
    def train_input_fn(self):
        dataset = tf.data.Dataset.from_generator(generator = lambda: self.data_generator(is_train=True),
                                                output_types = (tf.int64, tf.int64, tf.int64),
                                                output_shapes = ((None, self.max_length),
                                                                 (None, self.max_length),
                                                                 (None, self.max_length)))
        dataset = dataset.map(self.mapping_fn)
        dataset = dataset.repeat(self.epoch)
        
        return dataset

    def test_input_fn(self):
        dataset = tf.data.Dataset.from_generator(generator = lambda: self.data_generator(is_train=False),
                                                output_types = (tf.int64, tf.int64),
                                                output_shapes = ((None, self.max_length),
                                                                 (None, self.max_length)))
        dataset = dataset.map(self.mapping_fn)
        
        return dataset

In [300]:
vocab_path = data_in_path+'ChatBotData.voc'
dataset = Dataset(train_path = train_path,
                  test_path = test_path,
                  is_shuffle = True,
                  train_bs = 64,
                  test_bs = 128,
                  epoch = 10,
                  max_length = 30,
                  vocab_path = vocab_path,
                  is_header_first = True)

Loading vocabulary...
Successfully load vocabulary!


In [291]:
def seq2seq(features, labels, mode, params):

    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT
    
    embedding = tf.keras.layers.Embedding(params['vocab_size'], params['embedding_dimension'])
    encoder = tf.keras.layers.GRU(params['gru_dimension'], return_state = True, return_sequences=True)
    decoder = tf.keras.layers.GRU(params['gru_dimension'], return_state = True)
    predicition_layer = tf.keras.layers.Dense(params['vocab_size'])
    W = tf.keras.layers.Dense(params['attention_dimension'])
    U = tf.keras.layers.Dense(params['attention_dimension'])
    v = tf.keras.layers.Dense(1)
    
    
    embedded_question = embedding(features['question'])
    encoder_output, encoder_state = encoder(embedded_question)
    
    logits = []
    predictions = []
    
    # prepare decoder
    decoder_state = encoder_state
    predicted_token = tf.ones_like(features['answer'][:, :1]) * params['start_token_index']
    
    for i in range(params['max_length']):
        with tf.variable_scope(f'decoder_{i}_step', reuse=tf.AUTO_REUSE):

            if i > 0:
                random_value = tf.random_uniform(shape=(), maxval=1)
                input_embed = tf.cond(tf.logical_and(TRAIN, (random_value <= params['teacher_forcing_rate'])),
                                      lambda: features['answer'][:, i:i+1],
                                      lambda: predicted_token)
            else:
                decoder_inputs = predicted_token
            if params['use_attention']:
                state_with_3dim = tf.expand_dims(decoder_state, 1)
                energy = v(W(state_with_3dim) + U(encoder_output))
                attention = tf.nn.softmax(energy, 1)
                initial_state = tf.reduce_sum(attention*encoder_output, 1)                

            else:
                initial_state = decoder_state
                
            decoder_inputs = predicted_token

            embed_decoder_inputs = embedding(decoder_inputs)
            decoder_output, decoder_state = decoder(embed_decoder_inputs, initial_state=initial_state)
            predicted_logits = predicition_layer(decoder_output)
            predicted_token = tf.argmax(predicted_logits, -1)

            predictions.append(predicted_token)
            predicted_token = tf.reshape(predicted_token, (-1, 1))
            logits.append(predicted_logits)

    output_logits = tf.stack(logits, 1)
    output_predictions = tf.stack(predictions, 1)
    
    one_hot_label = tf.one_hot(labels, params['vocab_size'])

    loss = tf.losses.softmax_cross_entropy(one_hot_label, output_logits, reduction=tf.losses.Reduction.NONE)
    loss = tf.where(labels==0, tf.zeros_like(loss), loss)
    loss = tf.reduce_sum(loss, 1)
    sequence_size = tf.reduce_sum(tf.cast(labels, tf.float32), -1)
    loss = tf.reduce_mean((loss / sequence_size))
    
    global_step = tf.train.get_global_step()
    train_op = tf.train.AdamOptimizer(1e-3).minimize(loss, global_step)

    return tf.estimator.EstimatorSpec(
                        mode=mode,
                        train_op=train_op,
                        loss=loss,
                        predictions={'prediction': predicted_token})

In [292]:
hyper_params = {'vocab_size': len(dataset.word2idx),
                'embedding_dimension': 128,
                'gru_dimension': 128,
                'attention_dimension': 256,
                'start_token_index': dataset.word2idx['<BOS>'],
                'max_length': 30,
                'teacher_forcing_rate': 0.5,
                'use_attention': True}

In [293]:
tf.logging.set_verbosity(tf.logging.INFO)

In [294]:
estimator = tf.estimator.Estimator(model_fn = seq2seq,
                                   params=hyper_params,
                                   model_dir =data_out_path+'basic_seq2seq3')

I1003 19:52:08.646857 4755715520 estimator.py:1790] Using default config.
I1003 19:52:08.648288 4755715520 estimator.py:209] Using config: {'_model_dir': './data_out/basic_seq2seq3', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fe1383b4710>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [295]:
estimator.train(dataset.train_input_fn)

I1003 19:52:09.006586 4755715520 estimator.py:1145] Calling model_fn.


(?, 30)
(?, 30)
(?,)
(?, 30)
(?,)


I1003 19:52:27.097210 4755715520 estimator.py:1147] Done calling model_fn.
I1003 19:52:27.099187 4755715520 basic_session_run_hooks.py:541] Create CheckpointSaverHook.
I1003 19:52:27.200532 4755715520 monitored_session.py:240] Graph was finalized.
I1003 19:52:27.203590 4755715520 saver.py:1280] Restoring parameters from ./data_out/basic_seq2seq3/model.ckpt-0
I1003 19:52:28.411020 4755715520 session_manager.py:500] Running local_init_op.
I1003 19:52:28.771840 4755715520 session_manager.py:502] Done running local_init_op.
I1003 19:52:38.849664 4755715520 basic_session_run_hooks.py:606] Saving checkpoints for 0 into ./data_out/basic_seq2seq3/model.ckpt.
I1003 19:52:53.409914 4755715520 basic_session_run_hooks.py:262] loss = 4.732609, step = 1


KeyboardInterrupt: 