In [1]:
import os

In [None]:
# Authenticate this Colab session with the user's Google account.
from google.colab import auth
auth.authenticate_user()

# Mount Google Drive at /content/gdrive; the data directories used below live there.
from google.colab import drive
drive.mount('/content/gdrive')

In [2]:
# Absolute path so it always matches the Drive mount point ('/content/gdrive')
# and the download target of the wget cell, regardless of the current working
# directory of the kernel.
base_path = '/content/gdrive/My Drive/Colab Notebooks'

In [3]:
# Input directory (CSV, train/test text files, vocabulary) and output directory
# (model checkpoints). Both keep a trailing '/' because later cells build file
# paths by plain string concatenation.
data_in_path = base_path + '/data_in/'
data_out_path = base_path + '/data_out/'

In [4]:
# Create the data directories when they are missing. exist_ok=True keeps the
# cell idempotent and avoids the check-then-create race of
# os.path.exists() followed by os.makedirs().
for directory in (data_in_path, data_out_path):
    os.makedirs(directory, exist_ok=True)

In [5]:
import tensorflow as tf
import pandas as pd
import numpy as np
import pickle

from random import shuffle
from sklearn.model_selection import train_test_split
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [6]:
!wget -P /content/gdrive/My\ Drive/Colab\ Notebooks/data_in/ https://raw.githubusercontent.com/changwookjun/learningspoons/master/Data/ChatBotData.csv

--2019-10-16 15:11:48--  https://raw.githubusercontent.com/changwookjun/learningspoons/master/Data/ChatBotData.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.228.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.228.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 889842 (869K) [text/plain]
Saving to: ‘./data_in/ChatBotData.csv’


2019-10-16 15:11:50 (1.16 MB/s) - ‘./data_in/ChatBotData.csv’ saved [889842/889842]



In [6]:
data = pd.read_csv(data_in_path + 'ChatBotData.csv')

In [7]:
data[['Q','A']].values[0]

array(['12시 땡!', '하루가 또 가네요.'], dtype=object)

In [8]:
# Join every question/answer pair into one tab-separated line.
string_list = [
    question + '\t' + answer
    for question, answer in data[['Q', 'A']].values
]

In [104]:
string_list[:5]

['12시 땡!\t하루가 또 가네요.',
 '1지망 학교 떨어졌어\t위로해 드립니다.',
 '3박4일 놀러가고 싶다\t여행은 언제나 좋죠.',
 '3박4일 정도 놀러가고 싶다\t여행은 언제나 좋죠.',
 'PPL 심하네\t눈살이 찌푸려지죠.']

In [12]:
# TODO: this needs rework, starting with how train/test are saved as text.
# A fixed seed keeps the split reproducible across kernel restarts, so the
# train/test files and the vocabulary cached on disk stay consistent.
train_data, test_data = train_test_split(string_list, test_size=0.1, random_state=42)

# One pair per line.
train_string_data = '\n'.join(train_data)
test_string_data = '\n'.join(test_data)

In [16]:
train_path = data_in_path + 'train_chat_data.txt'
test_path = data_in_path + 'test_chat_data.txt'

In [17]:
# Persist both splits as text. Each payload is already a single newline-joined
# string, so write() is the right call; writelines() would iterate over it
# character by character.
with open(train_path, 'w') as f:
    f.write(train_string_data)
with open(test_path, 'w') as f:
    f.write(test_string_data)

In [56]:
# Open the training file so the next cell can peek at its first line.
# NOTE(review): this handle is not closed anywhere in the visible cells.
f = open(train_path, 'r')

In [57]:
f.readline()

'남자친구하고 커플티를 하고싶은데 어디거를 할까?\t좋아하는 브랜드가 좋겠어요.\n'

In [58]:
class Dataset:
    """Input pipeline for the chatbot question/answer corpus.

    Every line of the train/test files holds one tab-separated
    question/answer pair. Sentences are split into morphemes with Okt, mapped
    to vocabulary indices and served as padded batches through ``tf.data``.
    """

    def __init__(self, train_path, test_path, is_shuffle, train_bs,
                 test_bs, epoch, max_length, vocab_path):
        """Store the configuration and load the vocabulary, building it first if needed.

        Args:
            train_path: Text file with the training pairs.
            test_path: Text file with the test pairs.
            is_shuffle: Whether training batches are drawn in shuffled order.
            train_bs: Batch size used for training.
            test_bs: Batch size used for testing.
            epoch: Number of times the training dataset is repeated.
            max_length: Length every sequence is padded/truncated to.
            vocab_path: Pickle file holding ``(idx2word, word2idx)``; it is
                created from the training file when it does not exist.
        """
        self.train_path = train_path
        self.test_path = test_path
        self.is_shuffle = is_shuffle
        self.train_bs = train_bs
        self.test_bs = test_bs
        self.epoch = epoch
        self.max_length = max_length
        self.okt = Okt()
        # Order matters: the position in this list is the token's index.
        self.special_tokens = ['<PAD>', '<BOS>', '<EOS>']

        if not os.path.exists(vocab_path):
            print('There is no vocabulary...')
            print('Building vocabulary...')
            self.build_vocab_by_chatdata(vocab_path)
            print('Successfully build vocabulary!')

        print('Loading vocabulary...')
        # NOTE: unpickling can execute arbitrary code; only load vocabulary
        # files that were written by build_vocab_by_chatdata.
        with open(vocab_path, 'rb') as f:
            self.idx2word, self.word2idx = pickle.load(f)
        print('Successfully load vocabulary!')

    def build_vocab(self, word_list):
        """Build index<->word mappings from a flat list of tokens.

        Special tokens come first, followed by the words ordered from most to
        least frequent.

        Returns:
            ``(idx2word, word2idx)``: a list and the inverse dictionary.
        """
        from collections import Counter

        word_counts = Counter(word_list)
        idx2word = self.special_tokens + [word for word, _ in word_counts.most_common()]
        word2idx = {word: idx for idx, word in enumerate(idx2word)}

        return idx2word, word2idx

    def build_vocab_by_chatdata(self, vocab_path):
        """Build the vocabulary from the training file and pickle it to `vocab_path`."""
        questions = []
        answers = []

        with open(self.train_path, 'r') as f:
            for line in f:
                question, answer = line.replace('\n', '').split('\t')
                questions.append(question)
                answers.append(answer)

        questions = self.tokenize_by_morph(questions)
        answers = self.tokenize_by_morph(answers)

        # Flatten in one pass; sum(list_of_lists, []) copies the accumulator
        # on every step and is quadratic in the corpus size.
        word_list = [word for sentence in questions + answers for word in sentence]
        vocab = self.build_vocab(word_list)

        # Context manager guarantees the pickle is flushed and closed before
        # the constructor re-opens it for reading.
        with open(vocab_path, 'wb') as f:
            pickle.dump(vocab, f)

    def tokenize_by_morph(self, text):
        """Split every sentence in `text` into a list of morphemes using Okt."""
        return [self.okt.morphs(sentence) for sentence in text]

    def text_to_sequence(self, text_list):
        """Map tokenized sentences to index lists; out-of-vocabulary tokens are dropped."""
        return [[self.word2idx[word] for word in text if word in self.word2idx]
                for text in text_list]

    def sequence_to_text(self, sequence):
        """Map indices back to tokens, skipping index 0 (the padding token)."""
        return [self.idx2word[idx] for idx in sequence if idx != 0]

    def make_decoder_input_and_label(self, answers):
        """Derive decoder inputs and labels from tokenized answers.

        Decoder inputs are the answers prefixed with ``<BOS>``; labels are the
        answers suffixed with ``<EOS>``.
        """
        decoder_input = [['<BOS>'] + sentence for sentence in answers]
        labels = [sentence + ['<EOS>'] for sentence in answers]

        return decoder_input, labels

    def read_lines(self, indices, path):
        """Return the questions and answers found at the given line numbers of `path`.

        Lines are returned in file order, not in the order of `indices`.
        """
        # Set membership is O(1); the original list lookup was O(batch) per line.
        wanted = set(indices)
        questions = []
        answers = []

        with open(path, 'r') as f:
            for line_number, line in enumerate(f):
                if line_number in wanted:
                    question, answer = line.replace('\n', '').split('\t')
                    questions.append(question)
                    answers.append(answer)

        return questions, answers

    def _pad(self, sequences):
        """Pad/truncate index sequences to `max_length`, padding on the left."""
        # NOTE(review): padding is applied on the left, while model_function's
        # prediction loop places the start token at position 0 - confirm the
        # two are meant to differ.
        return pad_sequences(sequences, maxlen=self.max_length, padding='pre')

    def data_generator(self, is_train):
        """Yield ``(encoder_inputs, decoder_inputs, labels)`` batches of padded index arrays.

        Args:
            is_train: Selects the training file, batch size and shuffle
                setting when True; the test file (never shuffled) otherwise.
        """
        if is_train:
            batch_size = self.train_bs
            is_shuffle = self.is_shuffle
            path = self.train_path
        else:
            batch_size = self.test_bs
            is_shuffle = False
            path = self.test_path

        with open(path, 'r') as f:
            data_length = sum(1 for _ in f)

        indices = list(range(data_length))
        if is_shuffle:
            shuffle(indices)

        for start in range(0, data_length, batch_size):
            target_indices = indices[start:start + batch_size]
            questions, answers = self.read_lines(target_indices, path)

            tokenized_encoder_inputs = self.tokenize_by_morph(questions)
            tokenized_answers = self.tokenize_by_morph(answers)
            tokenized_decoder_inputs, tokenized_labels = \
                self.make_decoder_input_and_label(tokenized_answers)

            yield (self._pad(self.text_to_sequence(tokenized_encoder_inputs)),
                   self._pad(self.text_to_sequence(tokenized_decoder_inputs)),
                   self._pad(self.text_to_sequence(tokenized_labels)))

    def mapping_fn(self, question, answer, labels=None):
        """Pack a batch into the ``(features, labels)`` structure used by the Estimator."""
        features = {"question": question, 'answer': answer}

        return features, labels

    def _dataset_from_generator(self, is_train):
        """Wrap `data_generator` in a ``tf.data.Dataset`` and apply `mapping_fn`."""
        batch_shape = (None, self.max_length)
        # The generator always yields three arrays, so three types and shapes
        # must be declared for both the training and the test pipeline.
        dataset = tf.data.Dataset.from_generator(
            generator=lambda: self.data_generator(is_train=is_train),
            output_types=(tf.int64, tf.int64, tf.int64),
            output_shapes=(batch_shape, batch_shape, batch_shape))

        return dataset.map(self.mapping_fn)

    def train_input_fn(self):
        """Input function for training: mapped batches repeated `epoch` times."""
        return self._dataset_from_generator(is_train=True).repeat(self.epoch)

    def test_input_fn(self):
        """Input function for evaluation/prediction: a single pass over the test file.

        The original declared only two output components although the
        generator yields three, which fails as soon as the dataset is iterated.
        """
        return self._dataset_from_generator(is_train=False)

In [60]:
# Vocabulary cache file; the Dataset constructor builds it from the training
# file when it does not exist yet and then loads it.
vocab_path = data_in_path+'ChatBotData.voc'
dataset = Dataset(train_path = train_path,
                  test_path = test_path,
                  is_shuffle = True,
                  train_bs = 64,
                  test_bs = 128,
                  epoch = 10,
                  max_length = 30,
                  vocab_path = vocab_path)

There is no vocabulary...
Building vocabulary...
Successfully build vocabulary!
Loading vocabulary...
Successfully load vocabulary!


In [68]:
def layer_norm(inputs, eps=1e-6):
    """Normalize `inputs` over the last axis, then apply a learned scale and shift.

    Used as the ``LayerNorm`` in ``LayerNorm(x + Sublayer(x))``. The variables
    "gamma" (initialized to ones) and "beta" (initialized to zeros) are fetched
    with ``tf.get_variable`` in the current variable scope.

    Args:
        inputs: Tensor to normalize.
        eps: Small constant added to the standard deviation before dividing.
    """
    last_axis_shape = inputs.get_shape()[-1:]
    # Mean and standard deviation over the feature axis.
    mu = tf.keras.backend.mean(inputs, [-1], keepdims=True)
    sigma = tf.keras.backend.std(inputs, [-1], keepdims=True)
    beta = tf.get_variable("beta", initializer=tf.zeros(last_axis_shape))
    gamma = tf.get_variable("gamma", initializer=tf.ones(last_axis_shape))

    scaled = gamma * (inputs - mu)
    return scaled / (sigma + eps) + beta

In [69]:
def sublayer_connection(inputs, sublayer, dropout=0.2):
    """Residual connection: add the dropped-out sublayer output to `inputs` and layer-normalize.

    Args:
        inputs: Tensor that was fed into the sublayer.
        sublayer: Output tensor of the sublayer.
        dropout: Rate passed to the Keras Dropout layer.
    """
    dropped = tf.keras.layers.Dropout(dropout)(sublayer)
    residual = inputs + dropped
    return layer_norm(residual)

In [70]:
def positional_encoding(dim, sentence_length):
    """Build a sinusoidal position table of shape ``(sentence_length, dim)``.

    Entry ``(pos, i)`` starts as ``pos / 10000 ** (2 * i / dim)``; sine is then
    applied to even feature columns and cosine to odd feature columns.

    The original applied sine/cosine by the parity of the *flattened* index,
    which only coincides with the column parity when `dim` is even. Working on
    the 2-D table keeps the result identical for even `dim` and makes it
    correct for odd `dim` as well.

    Args:
        dim: Feature width of the encoding.
        sentence_length: Number of positions.

    Returns:
        A ``tf.float32`` constant of shape ``(sentence_length, dim)``.
    """
    positions = np.arange(sentence_length, dtype=np.float64)[:, np.newaxis]
    feature_ids = np.arange(dim, dtype=np.float64)[np.newaxis, :]
    table = positions / np.power(10000, 2 * feature_ids / dim)

    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])

    return tf.constant(table, dtype=tf.float32)

In [1]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product attention.

    Heads are realized by splitting the projected query/key/value along the
    last axis and stacking the pieces along the batch axis, so attention runs
    once on a ``heads``-times larger batch.
    """

    def __init__(self, num_units, heads, sub_masked=False):
        """
        Args:
            num_units: Output width of each of the four bias-free projections.
            heads: Number of pieces the projections are split into.
            sub_masked: When True, a lower-triangular mask additionally
                restricts every query position to key positions at or before it.
        """
        super(MultiHeadAttention, self).__init__()

        self.heads = heads
        self.sub_masked = sub_masked

        self.query_dense = tf.keras.layers.Dense(num_units, use_bias=False)
        self.key_dense = tf.keras.layers.Dense(num_units, use_bias=False)
        self.value_dense = tf.keras.layers.Dense(num_units, use_bias=False)
        self.out_dense = tf.keras.layers.Dense(num_units, use_bias=False)

    def scaled_dot_product_attention(self, query, key, value, key_mask=None):
        """Attend from `query` over `key`/`value`.

        Args:
            query, key, value: Rank-3 tensors (the score computation multiplies
                `query` with `key` transposed over its last two axes).
            key_mask: Optional mask over key positions; zero/False entries are
                excluded from attention. It is expanded on axis 1 and
                broadcast against the score matrix. ``None`` (the declared
                default) means no key is masked; the original crashed on it.

        Returns:
            The attention-weighted sum of `value`.
        """
        # Scale by the square root of the key feature width (last axis of `key`).
        key_depth = float(key.get_shape().as_list()[-1])
        scores = tf.matmul(query, key, transpose_b=True) / tf.sqrt(key_depth)

        # Start from "everything visible" in the full score shape so that
        # tf.where below receives a condition of exactly the same shape.
        allowed = tf.cast(tf.ones_like(scores), tf.bool)
        if key_mask is not None:
            allowed = tf.logical_and(allowed,
                                     tf.cast(tf.expand_dims(key_mask, 1), tf.bool))
        if self.sub_masked:
            lower = tf.linalg.LinearOperatorLowerTriangular(
                tf.ones_like(scores[0, :, :])).to_dense()
            allowed = tf.logical_and(allowed,
                                     tf.cast(tf.expand_dims(lower, 0), tf.bool))

        # Large negative score so masked positions get ~0 weight after softmax.
        blocked = tf.ones_like(scores) * (-2 ** 32 + 1)
        scores = tf.where(allowed, scores, blocked)

        attention_map = tf.nn.softmax(scores)

        return tf.matmul(attention_map, value)

    def call(self, query, key, value, key_mask):
        """Project the inputs, attend per head and merge the heads again.

        `key_mask` must already be laid out for the head-stacked batch
        (model_function tiles its masks by the number of heads).
        """
        query = self.query_dense(query)
        key = self.key_dense(key)
        value = self.value_dense(value)

        # Split the feature axis into heads and stack them on the batch axis.
        query = tf.concat(tf.split(query, self.heads, axis=-1), axis=0)
        key = tf.concat(tf.split(key, self.heads, axis=-1), axis=0)
        value = tf.concat(tf.split(value, self.heads, axis=-1), axis=0)

        attended = self.scaled_dot_product_attention(query, key, value, key_mask)

        # Undo the stacking: heads go back from the batch axis to the feature axis.
        merged = tf.concat(tf.split(attended, self.heads, axis=0), axis=-1)

        return self.out_dense(merged)

NameError: name 'tf' is not defined

In [91]:
class PositionWiseFeedForward(tf.keras.layers.Layer):
    """Two dense layers in sequence: ReLU with `num_units` outputs, then linear with `feature_shape` outputs."""

    def __init__(self, num_units, feature_shape):
        """
        Args:
            num_units: Width of the inner ReLU layer.
            feature_shape: Width of the output layer.
        """
        super().__init__()

        self.inner_dense = tf.keras.layers.Dense(num_units, activation=tf.nn.relu)
        self.output_dense = tf.keras.layers.Dense(feature_shape)

    def call(self, inputs):
        """Apply the inner layer followed by the output layer."""
        return self.output_dense(self.inner_dense(inputs))

In [92]:
class Encoder(tf.keras.layers.Layer):
    """Stack of `num_layers` encoder layers: self-attention followed by a feed-forward block."""

    def __init__(self, model_dims, ffn_dims, attn_heads, num_layers=1):
        """
        Args:
            model_dims: Width of the attention projections and of the feed-forward output.
            ffn_dims: Width of the inner feed-forward layer.
            attn_heads: Number of attention heads.
            num_layers: Number of stacked encoder layers.
        """
        super().__init__()

        layer_ids = range(num_layers)
        self.self_attention = [MultiHeadAttention(model_dims, attn_heads) for _ in layer_ids]
        self.position_feedforward = [PositionWiseFeedForward(ffn_dims, model_dims) for _ in layer_ids]

    def call(self, inputs, src_mask):
        """Run `inputs` through every layer; returns None when there are no layers."""
        hidden = inputs
        result = None

        layer_pairs = zip(self.self_attention, self.position_feedforward)
        for layer_no, (attention, feed_forward) in enumerate(layer_pairs, start=1):
            # Each layer gets its own variable scope (used by layer_norm's variables).
            with tf.variable_scope('encoder_layer_' + str(layer_no)):
                attended = sublayer_connection(hidden, attention(hidden, hidden, hidden, src_mask))
                result = sublayer_connection(attended, feed_forward(attended))

            hidden = result

        return result

In [93]:
class Decoder(tf.keras.layers.Layer):
    """Stack of `num_layers` decoder layers.

    Each layer applies subsequent-masked self-attention, attention over the
    encoder outputs and a feed-forward block, each wrapped in
    `sublayer_connection`.
    """

    def __init__(self, model_dims, ffn_dims, attn_heads, num_layers=1):
        """
        Args:
            model_dims: Width of the attention projections and of the feed-forward output.
            ffn_dims: Width of the inner feed-forward layer.
            attn_heads: Number of attention heads.
            num_layers: Number of stacked decoder layers.
        """
        super().__init__()

        layer_ids = range(num_layers)
        self.self_attention = [MultiHeadAttention(model_dims, attn_heads, sub_masked=True) for _ in layer_ids]
        self.encoder_decoder_attention = [MultiHeadAttention(model_dims, attn_heads) for _ in layer_ids]
        self.position_feedforward = [PositionWiseFeedForward(ffn_dims, model_dims) for _ in layer_ids]

    def call(self, inputs, encoder_outputs, src_mask, tgt_mask):
        """Run `inputs` through every layer; returns None when there are no layers."""
        hidden = inputs
        result = None

        stacks = zip(self.self_attention, self.encoder_decoder_attention, self.position_feedforward)
        for layer_no, (masked_attn, cross_attn, feed_forward) in enumerate(stacks, start=1):
            with tf.variable_scope('decoder_layer_' + str(layer_no)):
                # Self-attention over the decoder input, masked by tgt_mask.
                self_attended = sublayer_connection(hidden, masked_attn(hidden, hidden, hidden, tgt_mask))
                # Attention over the encoder outputs, masked by src_mask.
                cross_output = cross_attn(self_attended, encoder_outputs, encoder_outputs, src_mask)
                cross_attended = sublayer_connection(self_attended, cross_output)
                result = sublayer_connection(cross_attended, feed_forward(cross_attended))

            hidden = result

        return result

In [99]:
def model_function(features, labels, mode, params):
    """Estimator ``model_fn`` for the encoder/decoder chatbot model.

    Args:
        features: Dict with index tensors under 'question' and 'answer';
            index 0 is treated as padding when the masks are built.
        labels: Target index tensor; not used in PREDICT mode.
        mode: One of ``tf.estimator.ModeKeys``.
        params: Dict read for 'attention_head_size', 'model_hidden_size',
            'max_sequence_length', 'vocabulary_length', 'ffn_hidden_size',
            'layer_size' and 'learning_rate'.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for the requested mode.
    """
    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT

    # Padding masks (index 0 is padding).
    src_mask = tf.not_equal(features['question'], 0)
    tgt_mask = tf.not_equal(features['answer'], 0)

    # MultiHeadAttention stacks the heads on the batch axis, so the masks are
    # repeated once per head to line up with that layout.
    src_mask = tf.tile(src_mask, [params['attention_head_size'], 1])
    tgt_mask = tf.tile(tgt_mask, [params['attention_head_size'], 1])

    position_encode = positional_encoding(params['model_hidden_size'], params['max_sequence_length'])

    # One embedding layer shared by the encoder and decoder inputs.
    embedding = tf.keras.layers.Embedding(params['vocabulary_length'],
                                          params['model_hidden_size'])

    encoder_layers = Encoder(params['model_hidden_size'], params['ffn_hidden_size'],
                      params['attention_head_size'], params['layer_size'])

    decoder_layers = Decoder(params['model_hidden_size'], params['ffn_hidden_size'],
                      params['attention_head_size'], params['layer_size'])

    logit_layer = tf.keras.layers.Dense(params['vocabulary_length'])

    with tf.variable_scope('encoder', reuse=tf.AUTO_REUSE):
        x_embedded_matrix = embedding(features['question']) + position_encode
        encoder_outputs = encoder_layers(x_embedded_matrix, src_mask)

    # Prediction feeds the previous output back in once per position;
    # training and evaluation need a single pass.
    loop_count = params['max_sequence_length'] if PREDICT else 1

    predict, output, logits = None, None, None

    for i in range(loop_count):
        with tf.variable_scope('decoder', reuse=tf.AUTO_REUSE):
            if i > 0:
                # Shift the previous prediction right by one and put index 1
                # in front. The batch size is unknown when the graph is built
                # (the static dimension is None), so the leading column is
                # derived from the prediction tensor rather than from
                # output.shape[0].
                start_tokens = tf.ones_like(predict[:, :1])
                output = tf.concat([start_tokens, predict[:, :-1]], axis=-1)
            else:
                output = features['answer']

            y_embedded_matrix = embedding(output) + position_encode
            decoder_outputs = decoder_layers(y_embedded_matrix, encoder_outputs, src_mask, tgt_mask)

            logits = logit_layer(decoder_outputs)
            predict = tf.argmax(logits, 2)

    if PREDICT:
        predictions = {
            'indexs': predict,
            'logits': logits,
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels))
    accuracy = tf.metrics.accuracy(labels=labels, predictions=predict, name='accOp')

    metrics = {'accuracy': accuracy}
    # accuracy is a (value, update_op) pair; the update op is what gets logged.
    tf.summary.scalar('accuracy', accuracy[1])

    if EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)

    assert TRAIN

    optimizer = tf.train.AdamOptimizer(learning_rate=params['learning_rate'])
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())

    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

In [100]:
# Hyper-parameters handed to model_function through the Estimator's `params`.
hyper_params = {'vocabulary_length': len(dataset.word2idx),
                # Not read by model_function; kept for reference.
                'embedding_size': 128,
                'model_hidden_size': 128,
                'ffn_hidden_size': 128*4,
                'layer_size': 3,
                'attention_head_size': 8,
                # Must equal the padded sequence length produced by the input
                # pipeline, so it is taken from the dataset instead of being
                # repeated as a literal.
                'max_sequence_length': dataset.max_length,
                'learning_rate': 0.001}

In [101]:
tf.logging.set_verbosity(tf.logging.INFO)

In [102]:
# Wrap model_function in an Estimator; its model directory is
# data_out_path + 'transformer'.
estimator = tf.estimator.Estimator(model_fn = model_function,
                                   params=hyper_params,
                                   model_dir =data_out_path+'transformer')

I1016 21:04:50.374693 4554216896 estimator.py:1790] Using default config.
I1016 21:04:50.375674 4554216896 estimator.py:209] Using config: {'_model_dir': './data_out/transformer', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f8b265fc390>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [None]:
# Train on the training split; Dataset.train_input_fn repeats the data `epoch` times.
estimator.train(dataset.train_input_fn)