In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from konlpy.tag import Mecab
import nltk
import gensim
import random
import re
from tqdm.notebook import tqdm
from collections import Counter
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

In [21]:
data = pd.read_csv("./ChatbotData.csv")

In [22]:
data
question = data["Q"]
answer = data["A"]

In [23]:
def preprocessing(sentences):
    s = []
    for sentence in sentences:
        temp = sentence.lower().strip()
        temp = re.sub("ㅋ+", " ㅋㅋ", temp)
        temp = re.sub("ㅎ+", " ㅎㅎ", temp)
        temp = re.sub("ㅜ+", " ㅜㅜ", temp)
        temp = re.sub("ㅠ+", " ㅠㅠ", temp)
        temp = re.sub(r"!+", "!", temp)
        temp = re.sub(r"~+", "~", temp)
        temp = re.sub(r";+", ";", temp)
        temp = re.sub(r"\"", "", temp)
        temp = re.sub(r"([.,!?~;])", r" \1", temp)
        s.append(temp)
    return s

In [24]:
question = preprocessing(question)
answer = preprocessing(answer)

In [25]:
valid_size = 10
question = question[valid_size:]
answer = answer[valid_size:]

valid_input = question[:valid_size]
valid_output = answer[:valid_size]

In [26]:
print(question[:10])

['sns보면 나만 빼고 다 행복해보여', '가끔 궁금해', '가끔 뭐하는지 궁금해', '가끔은 혼자인게 좋다', '가난한 자의 설움', '가만 있어도 땀난다', '가상화폐 쫄딱 망함', '가스불 켜고 나갔어', '가스불 켜놓고 나온거 같아', '가스비 너무 많이 나왔다 .']


In [27]:
def cut_length(question, answer):
    temp_question = []
    temp_answer = []
    for q, a in zip(question, answer):
        if len(q)< 40 and len(a) < 40:
            temp_question.append(q)
            temp_answer.append(a)
    return temp_question, temp_answer

In [28]:
question, answer = cut_length(question, answer)
print(len(question), len(answer))

11678 11678


In [29]:
from sklearn.model_selection import train_test_split
question, test_input, answer, test_output = train_test_split(question, answer, test_size=0.1)

In [30]:
def build_corpus(source, target, tokenizer):
    words = []
    source_tokens = []
    for s in source:
        temp = tokenizer.morphs(s)
        source_tokens.append(temp)
        words += temp
    
    target_tokens = []
    for t in target:
        temp = tokenizer.morphs(t)
        target_tokens.append(temp)
        words += temp
        
    counter = Counter(words)
    counter = counter.most_common()
    
    words = ["<pad>", "<sos>", "<eos>"] + [key for key, _ in counter]
    word_to_index = {word : index for index, word in enumerate(words)}
    
    source_corpus = []
    target_corpus = []
    for source_token in source_tokens:
        source_corpus.append([word_to_index[s] for s in source_token])
    for target_token in target_tokens:
        target_corpus.append([word_to_index[t] for t in target_token])
        
    return source_corpus, target_corpus, word_to_index

In [31]:
mecab = Mecab()
que_corpus, ans_corpus, word_to_index = build_corpus(question, answer, mecab)

In [32]:
index_to_word = {index:value for index, value in enumerate(word_to_index)}

In [33]:
def make_sentence(tokens):
    return [index_to_word[token] for token in tokens]

In [34]:
def return_tokens(sentence):
    return [word_to_index[s] for s in sentence.split()]

In [35]:
from gensim.models.word2vec import Word2Vec

file_path = "./ko.bin"
wv_model = Word2Vec.load(file_path)

In [36]:
def lexical_sub(toks, word2vec):
    res = ""
    
    try:
        _from = random.choice(toks)
        _to = word2vec.most_similar(_from)[0][0]
        
    except:
        return None, None

    for tok in toks:
        if tok is _from: res += _to + " "
        else: res += tok + " "

    return res, _to

In [37]:
questions = []
answers = []
for q, a in zip(que_corpus, ans_corpus):
    questions.append(q)
    answers.append(a)
    
    temp, new_word = lexical_sub(make_sentence(q), wv_model)
    if temp is not None and new_word in word_to_index:
        questions.append(return_tokens(temp))
        answers.append(a)
    
    temp, new_word = lexical_sub(make_sentence(a), wv_model)
    if temp is not None and new_word in word_to_index:
        questions.append(q)
        answers.append(return_tokens(temp))

  _to = word2vec.most_similar(_from)[0][0]


In [38]:
answers = [[word_to_index["<sos>"]] + corpus + [word_to_index["<eos>"]] for corpus in answers]

In [39]:
print(len(questions), len(answers))
print(questions[:10])
print(answers[:10])
print(len(word_to_index))

21105 21105
[[3108, 5, 6, 43, 4, 62, 2170, 107, 111, 3, 529, 125, 663, 193, 29, 65, 58, 102], [3108, 5, 6, 43, 4, 62, 2170, 107, 111, 3, 529, 125, 663, 193, 29, 65, 58, 102], [494, 4366, 100, 107, 67, 3], [494, 4366, 100, 107, 67, 3], [3109, 466, 6, 168, 94, 949, 104, 24, 18, 19], [2568, 130, 14], [2568, 130, 14], [65, 582, 59, 334], [105, 6, 194, 403, 21, 682, 184], [2127, 6, 194, 403, 21, 682, 184]]
[[1, 2170, 7, 1235, 15, 8, 3, 2], [1, 2170, 7, 1235, 15, 8, 50, 2], [1, 113, 57, 48, 5, 6, 26, 17, 2301, 5, 8, 3, 2], [1, 113, 57, 48, 1120, 6, 26, 17, 2301, 1120, 8, 3, 2], [1, 2977, 100, 107, 168, 4, 147, 104, 7, 13, 27, 3, 2], [1, 47, 365, 40, 16, 7, 13, 27, 3, 2], [1, 47, 365, 40, 16, 30, 13, 27, 3, 2], [1, 179, 1054, 4, 38, 20, 24, 28, 3, 2], [1, 565, 32, 166, 115, 45, 286, 59, 13, 147, 257, 28, 3, 2], [1, 565, 32, 166, 115, 45, 286, 59, 13, 147, 257, 28, 3, 2]]
6515


In [40]:
encoder_input = tf.keras.preprocessing.sequence.pad_sequences(questions, padding='post')
decoder_input = tf.keras.preprocessing.sequence.pad_sequences(answers, padding='post')

In [41]:
BATCH_SIZE = 128
train_dataset = tf.data.Dataset.from_tensor_slices((encoder_input, decoder_input)).batch(batch_size=BATCH_SIZE)

In [42]:
def positional_encoding(pos, d_model):
    table = np.zeros((pos, d_model))
    for i in range(pos):
        for j in range(d_model):
            table[i][j] = i / (10000 ** (2*(j//2) / d_model))
    
    sinusoid_table = table

    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])   
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])
    return sinusoid_table

In [43]:
def generate_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

def generate_lookahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask

def generate_masks(src, tgt):
    enc_mask = generate_padding_mask(src)
    dec_enc_mask = generate_padding_mask(src)

    dec_lookahead_mask = generate_lookahead_mask(tgt.shape[1])
    dec_tgt_padding_mask = generate_padding_mask(tgt)
    dec_mask = tf.maximum(dec_tgt_padding_mask, dec_lookahead_mask)

    return enc_mask, dec_enc_mask, dec_mask

In [44]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        
        self.depth = d_model // self.num_heads
        
        self.W_q = tf.keras.layers.Dense(d_model)
        self.W_k = tf.keras.layers.Dense(d_model)
        self.W_v = tf.keras.layers.Dense(d_model)
        
        self.linear = tf.keras.layers.Dense(d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask):
        d_k = tf.cast(K.shape[-1], tf.float32)
        QK = tf.matmul(Q, K, transpose_b=True)

        scaled_qk = QK / tf.math.sqrt(d_k)

        if mask is not None: scaled_qk += (mask * -1e9)  

        attentions = tf.nn.softmax(scaled_qk, axis=-1)
        out = tf.matmul(attentions, V)

        return out, attentions
        

    def split_heads(self, x):
        bsz = x.shape[0]
        split_x = tf.reshape(x, (bsz, -1, self.num_heads, self.depth))
        split_x = tf.transpose(split_x, perm=[0, 2, 1, 3])

        return split_x

    def combine_heads(self, x):
        bsz = x.shape[0]
        combined_x = tf.transpose(x, perm=[0, 2, 1, 3])
        combined_x = tf.reshape(combined_x, (bsz, -1, self.d_model))

        return combined_x

    
    def call(self, Q, K, V, mask):
        WQ = self.W_q(Q)
        WK = self.W_k(K)
        WV = self.W_v(V)
        
        WQ_splits = self.split_heads(WQ)
        WK_splits = self.split_heads(WK)
        WV_splits = self.split_heads(WV)
        
        out, attention_weights = self.scaled_dot_product_attention(
            WQ_splits, WK_splits, WV_splits, mask)
                        
        out = self.combine_heads(out)
        out = self.linear(out)
            
        return out, attention_weights

In [45]:
class PoswiseFeedForwardNet(tf.keras.layers.Layer):
    def __init__(self, d_model, d_ff):
        super(PoswiseFeedForwardNet, self).__init__()
        self.d_model = d_model
        self.d_ff = d_ff

        self.fc1 = tf.keras.layers.Dense(d_ff, activation='relu')
        self.fc2 = tf.keras.layers.Dense(d_model)

    def call(self, x):
        out = self.fc1(x)
        out = self.fc2(out)
        return out

In [46]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, n_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()

        self.enc_self_attn = MultiHeadAttention(d_model, n_heads)
        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.do = tf.keras.layers.Dropout(dropout)
        
    def call(self, x, mask):
        
        residual = x
        out = self.norm_1(x)
        out, enc_attn = self.enc_self_attn(out, out, out, mask)
        out = self.do(out)
        out += residual
        
        residual = out
        out = self.norm_2(out)
        out = self.ffn(out)
        out = self.do(out)
        out += residual
        
        return out, enc_attn

In [47]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()

        self.dec_self_attn = MultiHeadAttention(d_model, num_heads)
        self.enc_dec_attn = MultiHeadAttention(d_model, num_heads)

        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.do = tf.keras.layers.Dropout(dropout)
    
    def call(self, x, enc_out, dec_enc_mask, padding_mask):
        residual = x
        out = self.norm_1(x)
        out, dec_attn = self.dec_self_attn(out, out, out, padding_mask)
        out = self.do(out)
        out += residual

        residual = out
        out = self.norm_2(out)
        out, dec_enc_attn = self.enc_dec_attn(Q=out, K=enc_out, V=enc_out, mask=dec_enc_mask)
        out = self.do(out)
        out += residual
        
        residual = out
        out = self.norm_3(out)
        out = self.ffn(out)
        out = self.do(out)
        out += residual

        return out, dec_attn, dec_enc_attn

In [48]:
class Encoder(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    dropout):
        super(Encoder, self).__init__()
        self.n_layers = n_layers
        self.enc_layers = [EncoderLayer(d_model, n_heads, d_ff, dropout) 
                        for _ in range(n_layers)]
    
        self.do = tf.keras.layers.Dropout(dropout)
        
    def call(self, x, mask): 
        out = x
    
        enc_attns = list()
        for i in range(self.n_layers):
            out, enc_attn = self.enc_layers[i](out, mask)
            enc_attns.append(enc_attn)
        
        return out, enc_attns

In [49]:
class Decoder(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    dropout):
        super(Decoder, self).__init__()
        self.n_layers = n_layers
        self.dec_layers = [DecoderLayer(d_model, n_heads, d_ff, dropout) 
                            for _ in range(n_layers)]
                            
                            
    def call(self, x, enc_out, dec_enc_mask, padding_mask):
        out = x
    
        dec_attns = list()
        dec_enc_attns = list()
        for i in range(self.n_layers):
            out, dec_attn, dec_enc_attn = self.dec_layers[i](out, enc_out, dec_enc_mask, padding_mask)

            dec_attns.append(dec_attn)
            dec_enc_attns.append(dec_enc_attn)

        return out, dec_attns, dec_enc_attns

In [50]:
class Transformer(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    src_vocab_size,
                    tgt_vocab_size,
                    pos_len,
                    dropout=0.2,
                    shared_fc=True,
                    shared_emb=False):
        super(Transformer, self).__init__()
        
        self.d_model = tf.cast(d_model, tf.float32)

        if shared_emb:
            self.enc_emb = self.dec_emb = \
            tf.keras.layers.Embedding(src_vocab_size, d_model)
        else:
            self.enc_emb = tf.keras.layers.Embedding(src_vocab_size, d_model)
            self.dec_emb = tf.keras.layers.Embedding(tgt_vocab_size, d_model)

        self.pos_encoding = positional_encoding(pos_len, d_model)
        self.do = tf.keras.layers.Dropout(dropout)

        self.encoder = Encoder(n_layers, d_model, n_heads, d_ff, dropout)
        self.decoder = Decoder(n_layers, d_model, n_heads, d_ff, dropout)

        self.fc = tf.keras.layers.Dense(tgt_vocab_size)

        self.shared_fc = shared_fc

        if shared_fc:
            self.fc.set_weights(tf.transpose(self.dec_emb.weights))

    def embedding(self, emb, x):
        seq_len = x.shape[1]

        out = emb(x)

        if self.shared_fc: out *= tf.math.sqrt(self.d_model)

        out += self.pos_encoding[np.newaxis, ...][:, :seq_len, :]
        out = self.do(out)

        return out

    def call(self, enc_in, dec_in, enc_mask, dec_enc_mask, dec_mask):
        enc_in = self.embedding(self.enc_emb, enc_in)
        dec_in = self.embedding(self.dec_emb, dec_in)

        enc_out, enc_attns = self.encoder(enc_in, enc_mask)
        
        dec_out, dec_attns, dec_enc_attns = \
        self.decoder(dec_in, enc_out, dec_enc_mask, dec_mask)
        
        logits = self.fc(dec_out)
        
        return logits, enc_attns, dec_attns, dec_enc_attns

In [51]:
transformer = Transformer(
    n_layers=4,
    d_model=512,
    n_heads=8,
    d_ff=2048,
    src_vocab_size=len(word_to_index),
    tgt_vocab_size=len(word_to_index),
    pos_len=200,
    dropout=0.3,
    shared_fc=True,
    shared_emb=True)

In [52]:
class LearningRateScheduler(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(LearningRateScheduler, self).__init__()
        
        self.d_model = d_model
        self.warmup_steps = warmup_steps
    
    def __call__(self, step):
        arg1 = step ** -0.5
        arg2 = step * (self.warmup_steps ** -1.5)
        return (self.d_model ** -0.5) * tf.math.minimum(arg1, arg2)

In [53]:
learning_rate = LearningRateScheduler(512)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

In [54]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

In [55]:
@tf.function()
def train_step(src, tgt, model, optimizer):
    tgt_in = tgt[:, :-1] 
    gold = tgt[:, 1:]   

    enc_mask, dec_enc_mask, dec_mask = generate_masks(src, tgt_in)
    
    with tf.GradientTape() as tape:
        predictions, enc_attns, dec_attns, dec_enc_attns = \
        model(src, tgt_in, enc_mask, dec_enc_mask, dec_mask)
        loss = loss_function(gold, predictions)

    gradients = tape.gradient(loss, model.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return loss, enc_attns, dec_attns, dec_enc_attns

In [68]:
EPOCHS = 1

for epoch in range(EPOCHS):
    total_loss = 0
    
    dataset_count = tf.data.experimental.cardinality(train_dataset).numpy()
    tqdm_bar = tqdm(total=dataset_count)
    for step, (enc_batch, dec_batch) in enumerate(train_dataset):
        batch_loss, enc_attns, dec_attns, dec_enc_attns = \
        train_step(enc_batch,
                    dec_batch,
                    transformer,
                    optimizer)

        total_loss += batch_loss
        
        tqdm_bar.set_description_str('Epoch %2d' % (epoch + 1))
        tqdm_bar.set_postfix_str('Loss %.4f' % (total_loss.numpy() / (step + 1)))
        tqdm_bar.update()


  0%|          | 0/165 [00:00<?, ?it/s]

In [60]:
def translate(input_sentence):
    sentence = preprocessing([input_sentence])
    input_sentence = [word_to_index[s] for s in mecab.morphs(input_sentence)]
    input_sentence = tf.keras.preprocessing.sequence.pad_sequences([input_sentence], padding="post", maxlen=39)
    
    output_sentence = []
    output_sequence = tf.expand_dims([word_to_index["<sos>"]], 0)
    
    flag = True
    while(flag):
        enc_padding_mask, combined_mask, dec_padding_mask = generate_masks(input_sentence, output_sequence)
        predictions, _, _, _ = transformer(input_sentence, output_sequence, enc_padding_mask, combined_mask, dec_padding_mask)
        
        predicted_id = tf.argmax(tf.math.softmax(predictions, axis=-1)[0, -1]).numpy().item()
        
        if predicted_id == word_to_index["<eos>"]:
            flag = False
            
        output_sentence.append(index_to_word[predicted_id])
        output_sequence = tf.concat([output_sequence, tf.expand_dims([predicted_id], 0)], axis=-1)
    return " ".join(output_sentence[:-1])

In [76]:
print(translate("배고파"))
print(translate("밥먹을까"))
print(translate("김치찌개 먹을래"))

챙겨 챙겨 챙겨 챙겨 드세요 .
맛있 게 드세요 !
맛있 게 드세요 .


In [70]:
def eval_bleu_single(model, src_sentence, tgt_sentence, verbose=True):
    if (len(src_sentence) > 40): return None
    if (len(tgt_sentence) > 40): return None

    reference = tgt_sentence
    candidate = translate(src_sentence)

    score = sentence_bleu([reference], candidate,
                          smoothing_function=SmoothingFunction().method1)

    if verbose:
        print("Source Sentence: ", src_sentence)
        print("Model Prediction: ", candidate)
        print("Real: ", reference)
        print("Score: %lf\n" % score)
        
    return score

In [71]:
def eval_bleu(model, src_sentences, tgt_sentence, verbose=True):
    total_score = 0.0
    sample_size = len(src_sentences)
    
    for idx in tqdm(range(sample_size)):
        score = eval_bleu_single(model, src_sentences[idx], tgt_sentence[idx], verbose)
        if not score: continue
        
        total_score += score
    
    print("Num of Sample:", sample_size)
    print("Total Score:", total_score / sample_size)

In [72]:
eval_bleu(transformer, valid_input, valid_output)

  0%|          | 0/10 [00:00<?, ?it/s]

Source Sentence:  sns보면 나만 빼고 다 행복해보여
Model Prediction:  자랑 하 는 자리 니까요 .
Real:  자랑하는 자리니까요 .
Score: 0.457883

Source Sentence:  가끔 궁금해
Model Prediction:  그럴 수 있 어요 .
Real:  그 사람도 그럴 거예요 .
Score: 0.128357

Source Sentence:  가끔 뭐하는지 궁금해
Model Prediction:  애정 과 표현 하 는 게 좋 을 것 같 아요 .
Real:  그 사람도 그럴 거예요 .
Score: 0.042764

Source Sentence:  가끔은 혼자인게 좋다
Model Prediction:  혼자 가 서 도 중요 해요 .
Real:  혼자를 즐기세요 .
Score: 0.080121

Source Sentence:  가난한 자의 설움
Model Prediction:  다시 다시 다시 다시 들어올 거 예요 .
Real:  돈은 다시 들어올 거예요 .
Score: 0.475952

Source Sentence:  가만 있어도 땀난다
Model Prediction:  땀 식혀 주 세요 .
Real:  땀을 식혀주세요 .
Score: 0.361328

Source Sentence:  가상화폐 쫄딱 망함
Model Prediction:  좋 은 사람 이 길 바라 요 .
Real:  어서 잊고 새출발 하세요 .
Score: 0.067701

Source Sentence:  가스불 켜고 나갔어
Model Prediction:  천천히 집 에 돌아가 돌아가 돌아가 고 있 어요 .
Real:  빨리 집에 돌아가서 끄고 나오세요 .
Score: 0.202849

Source Sentence:  가스불 켜놓고 나온거 같아
Model Prediction:  빨리 집 에 돌아가 돌아가 돌아가 돌아가 고 있 가 세요 .
Real:  빨리 집에 돌아가서 끄고 나오세요 .
Score: 0.266153

Source Sentence