In [None]:
from google.colab import auth
auth.authenticate_user()

from google.colab import drive
drive.mount('/content/gdrive')

In [9]:
base_path = '/content/gdrive/My Drive/Colab Notebooks'

In [3]:
data_in_path = base_path + '/data_in/'
data_out_path = base_path + '/data_out/'

In [4]:
if not os.path.exists(data_in_path):
    os.makedirs(data_in_path)
    
if not os.path.exists(data_out_path):
    os.makedirs(data_out_path)

In [5]:
import tensorflow as tf
import pandas as pd
import numpy as np
import pickle
import os

from random import shuffle
from sklearn.model_selection import train_test_split
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [6]:
data = pd.read_csv(data_in_path + 'ChatBotData.csv')

In [7]:
train_data, test_data = train_test_split(data, test_size = 0.1)

In [8]:
train_text_list = (train_data.Q + ' // ' + train_data.A).tolist()
test_text_list = (test_data.Q + ' // ' + test_data.A).tolist()

In [None]:
train_text = '\n'.join(train_text_list)
test_text = '\n'.join(test_text_list)

In [None]:
train_text_path = data_in_path + 'train_chat_data.txt' 
test_text_path = data_in_path + 'test_chat_data.txt' 

In [8]:
with open(train_text_path, 'w') as f:
    f.write(train_text)

with open(test_text_path, 'w') as f:
    f.write(test_text)

In [9]:
class Dataset:
    
    def __init__(self, train_path, test_path, is_shuffle, train_bs,
                 test_bs, epoch, max_length, vocab_path):
        self.train_path = train_path
        self.test_path = test_path
        self.is_shuffle = is_shuffle
        self.train_bs = train_bs
        self.test_bs = test_bs
        self.epoch = epoch
        self.max_length = max_length
        self.okt = Okt()
        self.special_tokens = ['<PAD>', '<BOS>', '<EOS>']
        
        if not os.path.exists(vocab_path):
            print('There is no vocabulary...')
            print('Building vocabulary...')
            self.build_vocab_by_chatdata(vocab_path)
            print('Successfully build vocabulary!')
        
        print('Loading vocabulary...')    
        self.idx2word, self.word2idx = pickle.load(open(vocab_path, 'rb'))
        print('Successfully load vocabulary!')
    
    def build_vocab(self, word_list):
        from collections import Counter

        word_counts = Counter(word_list)
        idx2word = self.special_tokens + [word for word, _ in word_counts.most_common()]
        word2idx = {word:idx for idx, word in enumerate(idx2word)}

        return idx2word, word2idx
    
    def build_vocab_by_chatdata(self, vocab_path):
        data = []
        with open(self.train_path, 'r') as f:
            for line in f.readlines():
                data += line.split('//')

        tokenized_data = self.tokenize_by_morph(data)
        
        word_list = sum(tokenized_data, [])
        idx2word, word2idx = self.build_vocab(word_list)
        
        vocab = (idx2word, word2idx)
        pickle.dump(vocab, open(vocab_path, 'wb'))
        
    def tokenize_by_morph(self, text):
        tokenized_text = []
        for sentence in text:
            tokenized_text.append(self.okt.morphs(sentence))

        return tokenized_text
    
    def text_to_sequence(self, text_list):
        sequences = []
        for text in text_list:
            sequences.append([self.word2idx[word] for word in text if word in self.word2idx.keys()])

        return sequences

    def sequence_to_text(self, sequence):
        
        return [self.idx2word[idx] for idx in sequence if idx != 0]
    
    def make_decoder_input_and_label(self, answers):
        
        decoder_input = []
        labels = []
        
        for sentence in answers:
            decoder_input.append(['<BOS>'] + sentence[:-1])
            labels.append(sentence[1:] + ['<EOS>'])
        
        return decoder_input, labels
            
    
    def read_lines(self, indices, path):
        questions = []
        answers = []

        with open(path, 'r') as f:
            lines = f.readlines()

        for index in indices:
            text = lines[index]
            question, answer = text.split('//')
            questions.append(question)
            answers.append(answer)

        return questions, answers

    def data_generator(self, is_train):

        if is_train:
            batch_size = self.train_bs
            is_shuffle = self.is_shuffle
            path = self.train_path
        else:
            batch_size = self.test_bs
            is_shuffle = False
            path = self.test_path

        with open(path, 'r') as f:
            data_length = len(f.readlines())

        indices = list(range(data_length))
        if is_shuffle:
            shuffle(indices)

        current_count = 0
        while True:
            if current_count >= data_length:
                return
            else:
                target_indices = indices[current_count:current_count+batch_size]
                questions, answers = self.read_lines(target_indices, path)

                tokenized_questions = self.tokenize_by_morph(questions)
                tokenized_answers = self.tokenize_by_morph(answers)
                
                tokenized_encoder_inputs = tokenized_questions
                tokenized_decoder_inputs, tokenized_labels = self.make_decoder_input_and_label(tokenized_answers)
                

                indexed_encoder_inputs = self.text_to_sequence(tokenized_encoder_inputs)
                indexed_decoder_inputs = self.text_to_sequence(tokenized_decoder_inputs)
                indexed_labels = self.text_to_sequence(tokenized_labels)


                padded_encoder_inputs = pad_sequences(indexed_encoder_inputs,
                                                      maxlen = self.max_length,
                                                      padding = 'post')
                padded_decoder_inputs = pad_sequences(indexed_decoder_inputs,
                                                      maxlen = self.max_length,
                                                      padding = 'post')

                padded_labels = pad_sequences(indexed_labels,
                                              maxlen = self.max_length,
                                              padding = 'post')


                yield padded_encoder_inputs, padded_decoder_inputs, padded_labels

In [10]:
train_text_path = data_in_path + 'train_chat_data.txt' 
test_text_path = data_in_path + 'test_chat_data.txt' 
vocab_path = data_in_path+'ChatBotData.voc'

dataset = Dataset(train_path = train_text_path,
                  test_path = test_text_path,
                  is_shuffle = True,
                  train_bs = 64,
                  test_bs = 128,
                  epoch = 10,
                  max_length = 30,
                  vocab_path = vocab_path)

Loading vocabulary...
Successfully load vocabulary!


-------------------------------------------------------------------------------
Deprecated: convertStrings was not specified when starting the JVM. The default
behavior in JPype will be False starting in JPype 0.8. The recommended setting
for new code is convertStrings=False.  The legacy value of True was assumed for
please file a ticket with the developer.
-------------------------------------------------------------------------------

  """)


In [25]:
def mapping_fn(src, tgt, label = None):
    features = {"src": src, 'tgt': tgt}
    if label is not None:
        return features, label
    else:
        return features
    
train_dataset = tf.data.Dataset.from_generator(generator = lambda: dataset.data_generator(is_train=True),
                                        output_types = (tf.int64, tf.int64, tf.int64),
                                        output_shapes = ((None, dataset.max_length),
                                                         (None, dataset.max_length),
                                                         (None, dataset.max_length)))
train_dataset = train_dataset.map(mapping_fn)
        

test_dataset = tf.data.Dataset.from_generator(generator = lambda: dataset.data_generator(is_train=False),
                                        output_types = (tf.int64, tf.int64),
                                        output_shapes = ((None, dataset.max_length),
                                                         (None, dataset.max_length)))
test_dataset = test_dataset.map(mapping_fn)

In [26]:
def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
    return pos * angle_rates

In [27]:
def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                          np.arange(d_model)[np.newaxis, :],
                          d_model)

    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    pos_encoding = angle_rads[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)

In [28]:
def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)

    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

In [29]:
def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask  # (seq_len, seq_len)

In [46]:
def create_masks(src, tgt):
    enc_padding_mask = create_padding_mask(src)

    dec_padding_mask = create_padding_mask(src)

    look_ahead_mask = create_look_ahead_mask(tf.shape(tgt)[1])
    dec_target_padding_mask = create_padding_mask(tgt)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

    return enc_padding_mask, combined_mask, dec_padding_mask

In [47]:
def scaled_dot_product_attention(q, k, v, mask):
    matmul_qk = tf.matmul(q, k, transpose_b=True)
    
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    if mask is not None:
        scaled_attention_logits += (mask * -1e9)  

    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)

    output = tf.matmul(attention_weights, v)

    return output, attention_weights

In [48]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]

        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)

        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        scaled_attention, attention_weights = scaled_dot_product_attention(
            q, k, v, mask)

        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

        concat_attention = tf.reshape(scaled_attention, 
                                      (batch_size, -1, self.d_model))

        output = self.dense(concat_attention)

        return output, attention_weights

In [49]:
class PositionWiseFeedForwardNetwork(tf.keras.layers.Layer):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForwardNetwork, self).__init__()
        self.d_model = d_model
        self.d_ff = d_ff
        self.ff1 = tf.keras.layers.Dense(d_ff, activation='relu')
        self.ff2 = tf.keras.layers.Dense(d_model)
    
    def call(self, x):
        x = self.ff1(x)
        x = self.ff2(x)
        
        return x

In [50]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, d_ff, rate=0.1):
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = PositionWiseFeedForwardNetwork(d_model, d_ff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, mask):

        attn_output, _ = self.mha(x, x, x, mask)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(x + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        out2 = self.layernorm2(out1 + ffn_output)

        return out2

In [51]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, d_ff, rate=0.1):
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = PositionWiseFeedForwardNetwork(d_model, d_ff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)
    
    
    def call(self, x, enc_output, 
           look_ahead_mask, padding_mask):

        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
        attn1 = self.dropout1(attn1)
        out1 = self.layernorm1(attn1 + x)

        attn2, attn_weights_block2 = self.mha2(
            enc_output, enc_output, out1, padding_mask)
        attn2 = self.dropout2(attn2)
        out2 = self.layernorm2(attn2 + out1)

        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output)
        out3 = self.layernorm3(ffn_output + out2)

        return out3, attn_weights_block1, attn_weights_block2

In [52]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, d_ff, input_vocab_size,
               maximum_position_encoding, rate=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, 
                                                self.d_model)


        self.enc_layers = [EncoderLayer(d_model, num_heads, d_ff, rate) 
                           for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, mask):

        seq_len = tf.shape(x)[1]

        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x, mask)

        return x

In [53]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, d_ff, target_vocab_size,
               maximum_position_encoding, rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, d_ff, rate) 
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, 
           look_ahead_mask, padding_mask):

        seq_len = tf.shape(x)[1]
        attention_weights = {}

        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x)

        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output,
                                                 look_ahead_mask, padding_mask)

            attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i+1)] = block2

        # x.shape == (batch_size, target_seq_len, d_model)
        return x, attention_weights

In [60]:
class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, d_ff, input_vocab_size, 
               target_vocab_size, pe_input, pe_target, rate=0.1):
        super(Transformer, self).__init__()

        self.encoder = Encoder(num_layers, d_model, num_heads, d_ff, 
                               input_vocab_size, pe_input, rate)

        self.decoder = Decoder(num_layers, d_model, num_heads, d_ff, 
                               target_vocab_size, pe_target, rate)

        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inputs):
        
        src = inputs['src']
        tgt = inputs['tgt']
        
        enc_padding_mask, look_ahead_mask, dec_padding_mask = create_masks(src, tgt)

        enc_output = self.encoder(src, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)

        dec_output, _ = self.decoder(
            tgt, enc_output, look_ahead_mask, dec_padding_mask)

        final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)

        return final_output

In [61]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [62]:
model_name = 'transformer'
num_layers = 2
d_model = 256
num_heads = 4
dff = 256 * 4
input_vocab_size = len(dataset.word2idx)
target_vocab_size = len(dataset.word2idx)
pe_input = len(dataset.word2idx)
pe_target = len(dataset.word2idx)
dropout_rate = 0.1
learning_rate = 0.001

In [63]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')

learning_rate = CustomSchedule(d_model)

In [64]:
transformer = Transformer(num_layers, d_model, num_heads, dff,
                          input_vocab_size, target_vocab_size, 
                          pe_input=input_vocab_size, 
                          pe_target=target_vocab_size,
                          rate=dropout_rate)

transformer.compile(optimizer=tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                                 epsilon=1e-9),
                    loss=loss_function,
                    metric=[train_loss, train_accuracy])

In [65]:
checkpoint_path = data_out_path + model_name + '/weights.{epoch:02d}-{val_loss:.2f}'
checkpoint_dir = os.path.dirname(checkpoint_path)

cp_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_path, verbose=1, save_weights_only=True)

transformer.fit(train_dataset, epochs=10,
                 validation_data=test_dataset,
                 callbacks=[cp_callback])

Epoch 1/10
    201/Unknown - 114s 567ms/step - loss: 1.9670

KeyError: 'val_loss'