In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import os
import re
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
from collections import Counter
import random
import math
from konlpy.tag import Mecab


In [2]:
# 포지셔널 인코딩 레이어
def positional_encoding(pos, d_model):
    def cal_angle(position, i):
        return position / np.power(10000, (2*(i//2)) / np.float32(d_model))

    def get_posi_angle_vec(position):
        return [cal_angle(position, i) for i in range(d_model)]

    sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(pos)])

    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])

    return sinusoid_table

In [3]:
def generate_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

def generate_lookahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask

def generate_masks(src, tgt):
    enc_mask = generate_padding_mask(src)
    dec_enc_mask = generate_padding_mask(src)

    dec_lookahead_mask = generate_lookahead_mask(tgt.shape[1])
    dec_tgt_padding_mask = generate_padding_mask(tgt)
    dec_mask = tf.maximum(dec_tgt_padding_mask, dec_lookahead_mask)

    return enc_mask, dec_enc_mask, dec_mask

In [4]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        
        self.depth = d_model // self.num_heads
        
        self.W_q = tf.keras.layers.Dense(d_model)
        self.W_k = tf.keras.layers.Dense(d_model)
        self.W_v = tf.keras.layers.Dense(d_model)
        
        self.linear = tf.keras.layers.Dense(d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask):
        d_k = tf.cast(K.shape[-1], tf.float32)
        QK = tf.matmul(Q, K, transpose_b=True)

        scaled_qk = QK / tf.math.sqrt(d_k)

        if mask is not None: scaled_qk += (mask * -1e9)  

        attentions = tf.nn.softmax(scaled_qk, axis=-1)
        out = tf.matmul(attentions, V)

        return out, attentions
        

    def split_heads(self, x):
        bsz = x.shape[0]
        split_x = tf.reshape(x, (bsz, -1, self.num_heads, self.depth))
        split_x = tf.transpose(split_x, perm=[0, 2, 1, 3])

        return split_x

    def combine_heads(self, x):
        bsz = x.shape[0]
        combined_x = tf.transpose(x, perm=[0, 2, 1, 3])
        combined_x = tf.reshape(combined_x, (bsz, -1, self.d_model))

        return combined_x

    
    def call(self, Q, K, V, mask):
        WQ = self.W_q(Q)
        WK = self.W_k(K)
        WV = self.W_v(V)
        
        WQ_splits = self.split_heads(WQ)
        WK_splits = self.split_heads(WK)
        WV_splits = self.split_heads(WV)
        
        out, attention_weights = self.scaled_dot_product_attention(
            WQ_splits, WK_splits, WV_splits, mask)
                        
        out = self.combine_heads(out)
        out = self.linear(out)
            
        return out, attention_weights

In [5]:
class PoswiseFeedForwardNet(tf.keras.layers.Layer):
    def __init__(self, d_model, d_ff):
        super(PoswiseFeedForwardNet, self).__init__()
        self.d_model = d_model
        self.d_ff = d_ff

        self.fc1 = tf.keras.layers.Dense(d_ff, activation='relu')
        self.fc2 = tf.keras.layers.Dense(d_model)

    def call(self, x):
        out = self.fc1(x)
        out = self.fc2(out)
            
        return out

In [6]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, n_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()

        self.enc_self_attn = MultiHeadAttention(d_model, n_heads)
        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.do = tf.keras.layers.Dropout(dropout)
        
    def call(self, x, mask):
        '''
        Multi-Head Attention
        '''
        residual = x
        out = self.norm_1(x)
        out, enc_attn = self.enc_self_attn(out, out, out, mask)
        out = self.do(out)
        out += residual
        
        '''
        Position-Wise Feed Forward Network
        '''
        residual = out
        out = self.norm_2(out)
        out = self.ffn(out)
        out = self.do(out)
        out += residual
        
        return out, enc_attn

In [7]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()

        self.dec_self_attn = MultiHeadAttention(d_model, num_heads)
        self.enc_dec_attn = MultiHeadAttention(d_model, num_heads)

        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.do = tf.keras.layers.Dropout(dropout)
    
    def call(self, x, enc_out, dec_enc_mask, padding_mask):
        '''
        Masked Multi-Head Attention
        '''
        residual = x
        out = self.norm_1(x)
        out, dec_attn = self.dec_self_attn(out, out, out, padding_mask)
        out = self.do(out)
        out += residual

        '''
        Multi-Head Attention
        '''
        residual = out
        out = self.norm_2(out)
        # Q, K, V 순서에 주의하세요!
        out, dec_enc_attn = self.enc_dec_attn(Q=out, K=enc_out, V=enc_out, mask=dec_enc_mask)
        out = self.do(out)
        out += residual
        
        '''
        Position-Wise Feed Forward Network
        '''
        residual = out
        out = self.norm_3(out)
        out = self.ffn(out)
        out = self.do(out)
        out += residual

        return out, dec_attn, dec_enc_attn

In [8]:
class Encoder(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    dropout):
        super(Encoder, self).__init__()
        self.n_layers = n_layers
        self.enc_layers = [EncoderLayer(d_model, n_heads, d_ff, dropout) 
                        for _ in range(n_layers)]
    
        self.do = tf.keras.layers.Dropout(dropout)
        
    def call(self, x, mask):
        out = x
    
        enc_attns = list()
        for i in range(self.n_layers):
            out, enc_attn = self.enc_layers[i](out, mask)
            enc_attns.append(enc_attn)
        
        return out, enc_attns

In [9]:
class Decoder(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    dropout):
        super(Decoder, self).__init__()
        self.n_layers = n_layers
        self.dec_layers = [DecoderLayer(d_model, n_heads, d_ff, dropout) 
                            for _ in range(n_layers)]
                            
    def call(self, x, enc_out, dec_enc_mask, padding_mask):
        out = x
    
        dec_attns = list()
        dec_enc_attns = list()
        for i in range(self.n_layers):
            out, dec_attn, dec_enc_attn = \
            self.dec_layers[i](out, enc_out, dec_enc_mask, padding_mask)

            dec_attns.append(dec_attn)
            dec_enc_attns.append(dec_enc_attn)

        return out, dec_attns, dec_enc_attns

In [10]:
class Transformer(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    src_vocab_size,
                    tgt_vocab_size,
                    pos_len,
                    dropout=0.2,
                    shared_fc=True,
                    shared_emb=False):
        super(Transformer, self).__init__()
        
        self.d_model = tf.cast(d_model, tf.float32)

        if shared_emb:
            self.enc_emb = self.dec_emb = \
            tf.keras.layers.Embedding(src_vocab_size, d_model)
        else:
            self.enc_emb = tf.keras.layers.Embedding(src_vocab_size, d_model)
            self.dec_emb = tf.keras.layers.Embedding(tgt_vocab_size, d_model)

        self.pos_encoding = positional_encoding(pos_len, d_model)
        self.do = tf.keras.layers.Dropout(dropout)

        self.encoder = Encoder(n_layers, d_model, n_heads, d_ff, dropout)
        self.decoder = Decoder(n_layers, d_model, n_heads, d_ff, dropout)

        self.fc = tf.keras.layers.Dense(tgt_vocab_size)

        self.shared_fc = shared_fc

        if shared_fc:
            self.fc.set_weights(tf.transpose(self.dec_emb.weights))

    def embedding(self, emb, x):
        seq_len = x.shape[1]

        out = emb(x)

        if self.shared_fc: out *= tf.math.sqrt(self.d_model)

        out += self.pos_encoding[np.newaxis, ...][:, :seq_len, :]
        out = self.do(out)

        return out

        
    def call(self, enc_in, dec_in, enc_mask, dec_enc_mask, dec_mask):
        enc_in = self.embedding(self.enc_emb, enc_in)
        dec_in = self.embedding(self.dec_emb, dec_in)

        enc_out, enc_attns = self.encoder(enc_in, enc_mask)
        
        dec_out, dec_attns, dec_enc_attns = \
        self.decoder(dec_in, enc_out, dec_enc_mask, dec_mask)
        
        logits = self.fc(dec_out)
        
        return logits, enc_attns, dec_attns, dec_enc_attns

In [11]:
class LearningRateScheduler(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(LearningRateScheduler, self).__init__()
        
        self.d_model = d_model
        self.warmup_steps = warmup_steps
    
    def __call__(self, step):
        arg1 = step ** -0.5
        arg2 = step * (self.warmup_steps ** -1.5)
        
        return (self.d_model ** -0.5) * tf.math.minimum(arg1, arg2)

In [12]:
d_model = 512
learning_rate = LearningRateScheduler(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate,
                                        beta_1=0.9,
                                        beta_2=0.98, 
                                        epsilon=1e-9)

In [13]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

In [14]:
@tf.function()
def train_step(src, tgt, model, optimizer):
    tgt_in = tgt[:, :-1]  # Decoder의 input
    gold = tgt[:, 1:]     # Decoder의 output과 비교하기 위해 right shift를 통해 생성한 최종 타겟

    enc_mask, dec_enc_mask, dec_mask = generate_masks(src, tgt_in)

    with tf.GradientTape() as tape:
        predictions, enc_attns, dec_attns, dec_enc_attns = \
        model(src, tgt_in, enc_mask, dec_enc_mask, dec_mask)
        loss = loss_function(gold, predictions)

    gradients = tape.gradient(loss, model.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return loss, enc_attns, dec_attns, dec_enc_attns

In [15]:
#https://github.com/songys/Chatbot_data
train_data = pd.read_csv('~/aiffel/transformer_chatbot/data/ChatbotData.csv')
train_data.head(30)


Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0
5,SD카드 망가졌어,다시 새로 사는 게 마음 편해요.,0
6,SD카드 안돼,다시 새로 사는 게 마음 편해요.,0
7,SNS 맞팔 왜 안하지ㅠㅠ,잘 모르고 있을 수도 있어요.,0
8,SNS 시간낭비인 거 아는데 매일 하는 중,시간을 정하고 해보세요.,0
9,SNS 시간낭비인데 자꾸 보게됨,시간을 정하고 해보세요.,0


In [16]:
print(train_data.isnull().sum())


Q        0
A        0
label    0
dtype: int64


In [17]:
def preprocess_sentence(sentence):
  # 입력받은 sentence를 소문자로 변경하고 양쪽 공백을 제거
  sentence = sentence.lower()
  # 단어와 구두점(punctuation) 사이의 거리를 만듭니다.
  # 예를 들어서 "I am a student." => "I am a student ."와 같이
  # student와 온점 사이에 거리를 만듭니다.
  sentence = re.sub(r"[^a-zA-Z?.!,가-힣ㄱ-ㅎㅏ-ㅣ0-9]+", " ", sentence)  
  sentence = sentence.strip()
  return sentence

In [18]:
# 데이터를 전처리하고 토큰화하는 함수
def build_corpus(data, tokenizer):
    train_data = data.dropna(how = 'any') 
    train_data.drop_duplicates(subset=['Q'], inplace=True)#Q중복제거
    train_data.drop_duplicates(subset=['A'], inplace=True)
    que_corpus=[]
    ans_corpus=[]
    max_len = 50  # 일정 길이 이상인 문장은 제외

    for _, row in data.iterrows():
        src_sentence = preprocess_sentence(row['Q'])
        tgt_sentence = preprocess_sentence(row['A'])

        src_tokens = tokenizer.morphs(src_sentence)
        tgt_tokens = tokenizer.morphs(tgt_sentence)

        if len(src_tokens) <= 0 or len(tgt_tokens) <= 0:
            continue  # 토큰의 개수가 0인 문장은 제외

        if len(src_tokens) > max_len or len(tgt_tokens) > max_len:
            continue  # 일정 길이 이상인 문장은 제외
        que_corpus.append(src_tokens)
        ans_corpus.append(["<start>"] + tgt_tokens + ["<end>"])
    words = np.concatenate(que_corpus+ans_corpus).tolist()
    counter = Counter(words)
    counter = counter.most_common()
    vocab = ['<PAD>', '<BOS>', '<UNK>', '<UNUSED>'] + [key for key, _ in counter]
    word_to_index = {word:index for index, word in enumerate(vocab)}
    return que_corpus,ans_corpus,word_to_index, {index:word for word, index in word_to_index.items()},len(vocab)
mecab = Mecab()

    
que_corpus, ans_corpus, word_to_index, index_to_word, VOCAB_SIZE = build_corpus(train_data, mecab)


In [19]:
# 데이터를 로드하고 전처리하여 질문을 questions, 답변을 answers에 저장합니다.
print('전체 샘플 수 :', len(que_corpus))
print('전체 샘플 수 :', len(ans_corpus))
print('전처리 후의 22번째 질문 샘플: {}'.format(que_corpus[21]))
print('전처리 후의 22번째 답변 샘플: {}'.format(ans_corpus[21]))

전체 샘플 수 : 11823
전체 샘플 수 : 11823
전처리 후의 22번째 질문 샘플: ['가스', '비', '장난', '아님']
전처리 후의 22번째 답변 샘플: ['<start>', '다음', '달', '에', '는', '더', '절약', '해', '봐요', '.', '<end>']


In [20]:
# 문장 1개를 활용할 딕셔너리와 함께 주면, 단어 인덱스 리스트 벡터로 변환해 주는 함수입니다. 
# 단, 모든 문장은 <BOS>로 시작하는 것으로 합니다. 
def get_encoded_sentence(sentence, word_to_index):
    return [word_to_index[word] if word in word_to_index else word_to_index['<UNK>'] for word in sentence]

# 여러 개의 문장 리스트를 한꺼번에 단어 인덱스 리스트 벡터로 encode해 주는 함수입니다. 
def get_encoded_sentences(sentences, word_to_index):
    return [get_encoded_sentence(sentence, word_to_index) for sentence in sentences]

# 숫자 벡터로 encode된 문장을 원래대로 decode하는 함수입니다. 
def get_decoded_sentence(encoded_sentence, index_to_word):
    return ' '.join(index_to_word[index] if index in index_to_word else '<UNK>' for index in encoded_sentence) 
#     return ' '.join(index_to_word[index] if index in index_to_word else '<UNK>' for index in encoded_sentence[1:])  #[1:]를 통해 <BOS>를 제외

# 여러 개의 숫자 벡터로 encode된 문장을 한꺼번에 원래대로 decode하는 함수입니다. 
def get_decoded_sentences(encoded_sentences, index_to_word):
    return [get_decoded_sentence(encoded_sentence, index_to_word) for encoded_sentence in encoded_sentences]

In [21]:
test_sentence_count = len(que_corpus)//200

enc_train = get_encoded_sentences(que_corpus[:-test_sentence_count], word_to_index)
dec_train = get_encoded_sentences(ans_corpus[:-test_sentence_count], word_to_index)

enc_test = get_encoded_sentences(que_corpus[-test_sentence_count:], word_to_index)
dec_test = get_encoded_sentences(ans_corpus[-test_sentence_count:], word_to_index)

In [22]:
MAX_LEN = 50
enc_ndarray = tf.keras.preprocessing.sequence.pad_sequences(enc_train, maxlen=MAX_LEN, padding='post')
dec_ndarray = tf.keras.preprocessing.sequence.pad_sequences(dec_train, maxlen=MAX_LEN, padding='post')


In [23]:
BATCH_SIZE = 64
train_dataset = tf.data.Dataset.from_tensor_slices((enc_ndarray, dec_ndarray)).batch(batch_size=BATCH_SIZE)


In [24]:
transformer = Transformer(
    n_layers=2,
    d_model=d_model,
    n_heads=8,
    d_ff=2048,
    src_vocab_size=VOCAB_SIZE,
    tgt_vocab_size=VOCAB_SIZE,
    pos_len=200,
    dropout=0.3,
    shared_fc=True,
    shared_emb=True)


In [25]:
# Q. 위의 코드를 활용하여 모델을 훈련시켜봅시다!
EPOCHS = 10

for epoch in range(EPOCHS):
    total_loss = 0
    
    dataset_count = tf.data.experimental.cardinality(train_dataset).numpy()
    tqdm_bar = tqdm(total=dataset_count)
    for step, (enc_batch, dec_batch) in enumerate(train_dataset):
        batch_loss, enc_attns, dec_attns, dec_enc_attns = \
        train_step(enc_batch,
                    dec_batch,
                    transformer,
                    optimizer)

        total_loss += batch_loss
        
        tqdm_bar.set_description_str('Epoch %2d' % (epoch + 1))
        tqdm_bar.set_postfix_str('Loss %.4f' % (total_loss.numpy() / (step + 1)))
        tqdm_bar.update()

  0%|          | 0/184 [00:00<?, ?it/s]

  0%|          | 0/184 [00:00<?, ?it/s]

  0%|          | 0/184 [00:00<?, ?it/s]

  0%|          | 0/184 [00:00<?, ?it/s]

  0%|          | 0/184 [00:00<?, ?it/s]

  0%|          | 0/184 [00:00<?, ?it/s]

  0%|          | 0/184 [00:00<?, ?it/s]

  0%|          | 0/184 [00:00<?, ?it/s]

  0%|          | 0/184 [00:00<?, ?it/s]

  0%|          | 0/184 [00:00<?, ?it/s]

In [26]:
def calculate_bleu(reference, candidate, weights=[0.25, 0.25, 0.25, 0.25]):
    return sentence_bleu([reference],
                         candidate,
                         weights=weights,
                         smoothing_function=SmoothingFunction().method1)  # smoothing_function 적용

In [27]:
def translate(tokens, model, src_tokenizer, tgt_tokenizer):
    padded_tokens = tf.keras.preprocessing.sequence.pad_sequences([tokens],
                                                           maxlen=MAX_LEN,
                                                           padding='post')
    ids = []
    output = tf.expand_dims([word_to_index['<start>']], 0)   
    for i in range(MAX_LEN):
        enc_padding_mask, combined_mask, dec_padding_mask = \
        generate_masks(padded_tokens, output)

        predictions, _, _, _ = model(padded_tokens, 
                                      output,
                                      enc_padding_mask,
                                      combined_mask,
                                      dec_padding_mask)

        predicted_id = \
        tf.argmax(tf.math.softmax(predictions, axis=-1)[0, -1]).numpy().item()

        if word_to_index['<end>'] == predicted_id:
#             result = tgt_tokenizer.decode_ids(ids)  
            print(ids)
            print([word_to_index['<start>']])
            result = ids
            return result

        ids.append(predicted_id)
        output = tf.concat([output, tf.expand_dims([predicted_id], 0)], axis=-1)
#     result = tgt_tokenizer.decode_ids(ids)  
    result = ids  
    return result

In [28]:
def eval_bleu_single(model, src_sentence, tgt_sentence, src_tokenizer, tgt_tokenizer, verbose=True):
#     src_tokens = src_tokenizer.encode_as_ids(src_sentence)
#     tgt_tokens = tgt_tokenizer.encode_as_ids(tgt_sentence)

    if (len(src_sentence) > MAX_LEN): return None
    if (len(tgt_sentence) > MAX_LEN): return None

#     reference = tgt_sentence.split()
    candidate = translate(src_sentence, model, src_tokenizer, tgt_tokenizer)#.split()

    score = sentence_bleu([tgt_sentence], candidate,
                          smoothing_function=SmoothingFunction().method1)

    if verbose:
        print("Source Sentence: ", src_sentence)
        print("Model Prediction: ", candidate)
        print("Real: ", reference)
        print("Score: %lf\n" % score)
        
    return score

In [29]:
def eval_bleu(model, src_sentences, tgt_sentence, src_tokenizer, tgt_tokenizer, verbose=True):
    total_score = 0.0
    sample_size = len(src_sentences)
    
    for idx in tqdm(range(sample_size)):
        score = eval_bleu_single(model, src_sentences[idx], tgt_sentence[idx], src_tokenizer, tgt_tokenizer, verbose)
        if not score: continue
        
        total_score += score
    
    print("Num of Sample:", sample_size)
    print("Total Score:", total_score / sample_size)

In [30]:
eval_bleu(transformer, enc_test, dec_test, mecab, mecab, verbose=False)

  0%|          | 0/59 [00:00<?, ?it/s]

[15, 24, 8, 268, 15, 24, 55, 16, 30, 4]
[5]
[15, 24, 8, 268, 94, 14, 62, 16, 30, 4]
[5]
[37, 20, 102, 42, 25, 18, 21, 54, 9, 95, 195, 106, 102, 24, 19, 11, 4]
[5]
[623, 414, 37, 20, 102, 42, 18, 9, 18, 10, 16, 30, 4]
[5]
[214, 214, 1815, 42, 18, 554, 62, 42, 18, 31, 4]
[5]
[15, 24, 8, 9, 551, 7, 79, 89, 189, 165, 13, 19, 9, 95, 276, 93, 4]
[5]
[15, 24, 8, 9, 95, 276, 93, 4]
[5]
[15, 24, 8, 9, 56, 10, 499, 9, 26, 15, 27, 38, 4]
[5]
[15, 24, 8, 14, 18, 9, 95, 276, 93, 4]
[5]
[15, 24, 8, 9, 95, 115, 7, 59, 66, 23, 47, 4]
[5]
[65, 193, 8, 21, 54, 89, 23, 47, 4]
[5]
[134, 7, 44, 9, 193, 59, 9, 95, 276, 93, 4]
[5]
[37, 7, 38, 4]
[5]
[15, 24, 8, 76, 201, 4]
[5]
[37, 48, 369, 28, 456, 491, 12, 93, 4]
[5]
[15, 24, 8, 9, 56, 7, 59, 66, 23, 47, 4]
[5]
[15, 24, 8, 9, 95, 732, 214, 61, 79, 24, 19, 9, 95, 276, 93, 4]
[5]
[804, 81, 944, 14, 18, 9, 233, 348, 7, 143, 16, 30, 4]
[5]
[15, 24, 8, 9, 95, 109, 145, 50, 8, 9, 26, 15, 110, 4]
[5]
[37, 7, 44, 9, 16, 145, 269, 31, 4]
[5]
[247, 21, 54, 89, 23, 2

In [31]:
print(get_decoded_sentence(translate(get_encoded_sentence(mecab.morphs('지루하다, 놀러가고 싶어.'), word_to_index), transformer, mecab, mecab), index_to_word))
print(get_decoded_sentence(translate(get_encoded_sentence(mecab.morphs('오늘 일찍 일어났더니 피곤하다.'), word_to_index), transformer, mecab, mecab), index_to_word))

# sentence_generation('배고파')
# sentence_generation('졸려')
# sentence_generation('안녕')
# sentence_generation('내일 날씨')
# sentence_generation('재밌는 얘기 해봐')
# sentence_generation('입출력이 뭐야?')

[56, 78, 14, 23, 138, 211, 14, 165, 13, 19, 9, 95, 276, 93, 4]
[5]
마음 먹 고 나 서 놀 고 물 어 보 는 건 어떨까 요 .
[170, 64, 614, 347, 344, 7, 36, 4]
[5]
참 기 위해 그게 인생 이 죠 .


In [32]:
from gensim.models.keyedvectors import Word2VecKeyedVectors
wv = Word2VecKeyedVectors.load('/aiffel/data/word2vec_ko.model')


In [33]:
wv.wv.most_similar(positive='영화', topn=10)

[('드라마', 0.8418774008750916),
 ('뮤지컬', 0.7775140404701233),
 ('코미디', 0.7489107251167297),
 ('다큐멘터리', 0.7401294708251953),
 ('헐리우드', 0.7397844195365906),
 ('애니메이션', 0.7170552015304565),
 ('독립영화', 0.7113528251647949),
 ('로맨틱', 0.7107657194137573),
 ('장편', 0.7101576924324036),
 ('극영화', 0.7045413255691528)]

In [34]:
def lexical_sub(sentence, word2vec):
    import random

    res = []
    toks = sentence

    try:
        _from = random.choice(toks)
        _to = word2vec.most_similar(_from)[0][0]

    except:   # 단어장에 없는 단어
        return None

    for tok in toks:
        if tok is _from: res.append(_to)
        else: res.append(tok)

    return res

In [46]:
que_arg = []
ans_arg = []
for enc_tokens, dec_tokens in tqdm(zip(enc_train, dec_train)):
    new_enc = lexical_sub([index_to_word[index] if index in index_to_word else '<UNK>' for index in enc_tokens], wv.wv)
    new_dec = lexical_sub([index_to_word[index] if index in index_to_word else '<UNK>' for index in dec_tokens[1:-1]], wv.wv)
        
    if new_enc is not None: 
        que_arg.append(new_enc)
        ans_arg.append([index_to_word[index] if index in index_to_word else '<UNK>' for index in dec_tokens])
    if new_dec is not None: 
        que_arg.append([index_to_word[index] if index in index_to_word else '<UNK>' for index in enc_tokens])
#         print(new_dec)
        new_dec = ["<start>"]+new_dec+['<end>']
        ans_arg.append(new_dec)
    que_arg.append([index_to_word[index] if index in index_to_word else '<UNK>' for index in enc_tokens])
    ans_arg.append([index_to_word[index] if index in index_to_word else '<UNK>' for index in dec_tokens])
    
for enc_tokens, dec_tokens in tqdm(zip(que_arg, ans_arg)):
    print(enc_tokens)
    print(dec_tokens)
    break

0it [00:00, ?it/s]

0it [00:00, ?it/s]

['12', '시', '땡', '!">']
['<start>', '하루', '가', '또', '가', '네요', '.', '<end>']


In [47]:
enc_train = get_encoded_sentences(que_arg, word_to_index)
dec_train = get_encoded_sentences(ans_arg, word_to_index)
enc_ndarray = tf.keras.preprocessing.sequence.pad_sequences(enc_train, maxlen=MAX_LEN, padding='post')
dec_ndarray = tf.keras.preprocessing.sequence.pad_sequences(dec_train, maxlen=MAX_LEN, padding='post')
train_dataset = tf.data.Dataset.from_tensor_slices((enc_ndarray, dec_ndarray)).batch(batch_size=BATCH_SIZE)

In [48]:
transformer = Transformer(
    n_layers=2,
    d_model=d_model,
    n_heads=8,
    d_ff=2048,
    src_vocab_size=VOCAB_SIZE,
    tgt_vocab_size=VOCAB_SIZE,
    pos_len=200,
    dropout=0.3,
    shared_fc=True,
    shared_emb=True)

In [50]:
@tf.function()
def train_step(src, tgt, model, optimizer):
    tgt_in = tgt[:, :-1]  # Decoder의 input
    gold = tgt[:, 1:]     # Decoder의 output과 비교하기 위해 right shift를 통해 생성한 최종 타겟

    enc_mask, dec_enc_mask, dec_mask = generate_masks(src, tgt_in)

    with tf.GradientTape() as tape:
        predictions, enc_attns, dec_attns, dec_enc_attns = \
        model(src, tgt_in, enc_mask, dec_enc_mask, dec_mask)
        loss = loss_function(gold, predictions)

    gradients = tape.gradient(loss, model.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return loss, enc_attns, dec_attns, dec_enc_attns

In [51]:
EPOCHS = 10

for epoch in range(EPOCHS):
    total_loss = 0
    
    dataset_count = tf.data.experimental.cardinality(train_dataset).numpy()
    tqdm_bar = tqdm(total=dataset_count)
    for step, (enc_batch, dec_batch) in enumerate(train_dataset):
        batch_loss, enc_attns, dec_attns, dec_enc_attns = \
        train_step(enc_batch,
                    dec_batch,
                    transformer,
                    optimizer)

        total_loss += batch_loss
        
        tqdm_bar.set_description_str('Epoch %2d' % (epoch + 1))
        tqdm_bar.set_postfix_str('Loss %.4f' % (total_loss.numpy() / (step + 1)))
        tqdm_bar.update()

  0%|          | 0/547 [00:00<?, ?it/s]

  0%|          | 0/547 [00:00<?, ?it/s]

  0%|          | 0/547 [00:00<?, ?it/s]

  0%|          | 0/547 [00:00<?, ?it/s]

  0%|          | 0/547 [00:00<?, ?it/s]

  0%|          | 0/547 [00:00<?, ?it/s]

  0%|          | 0/547 [00:00<?, ?it/s]

  0%|          | 0/547 [00:00<?, ?it/s]

  0%|          | 0/547 [00:00<?, ?it/s]

  0%|          | 0/547 [00:00<?, ?it/s]

In [52]:
eval_bleu(transformer, enc_test, dec_test, mecab, mecab, verbose=False)

  0%|          | 0/59 [00:00<?, ?it/s]

[393, 1403, 61, 74, 4]
[5]
[576, 28, 165, 13, 19, 14, 40, 13, 19, 171, 15, 27, 31, 4]
[5]
[15, 24, 8, 9, 26, 109, 145, 1403, 84, 9, 562, 8, 14, 18, 31, 4]
[5]
[23, 7, 171, 650, 171, 15, 27, 12, 15, 27, 128, 15, 27, 12, 15, 27, 12, 15, 27, 12, 15, 27, 12, 570, 8, 9, 26, 15, 27, 12, 15, 27, 12, 15, 27, 12, 15, 27, 12, 15, 27, 12, 15, 27, 31, 4]
[5]
[114, 1030, 55, 84, 12, 109, 14, 250, 4]
[5]
[347, 427, 350, 7, 118, 80, 4]
[5]
[56, 7, 1656, 337, 47, 4]
[5]
[15, 24, 8, 9, 2, 15, 27, 31, 4]
[5]
[686, 9, 26, 44, 215, 98, 21, 54, 9, 26, 15, 27, 31, 4]
[5]
[104, 401, 7, 193, 12, 79, 64, 596, 7, 80, 4]
[5]
[677, 2387, 29, 7, 38, 4]
[5]
[677, 35, 134, 10, 4658, 50, 17, 57, 11, 4]
[5]
[92, 9, 260, 350, 7, 1447, 4]
[5]
[68, 10, 55, 42, 18, 128, 2018, 50, 358, 7, 80, 4]
[5]
[397, 103, 551, 93, 4]
[5]
[334, 222, 405, 8, 14, 944, 529, 1024, 11, 4]
[5]
[23, 85, 2, 351, 4]
[5]
[866, 103, 7, 32, 375, 8, 11, 4]
[5]
[1260, 31, 4]
[5]
[697, 81, 37, 810, 221, 42, 18, 31, 4]
[5]
[476, 35, 871, 328, 12, 75, 

In [53]:
print(get_decoded_sentence(translate(get_encoded_sentence(mecab.morphs('지루하다, 놀러가고 싶어.'), word_to_index), transformer, mecab, mecab), index_to_word))
print(get_decoded_sentence(translate(get_encoded_sentence(mecab.morphs('오늘 일찍 일어났더니 피곤하다.'), word_to_index), transformer, mecab, mecab), index_to_word))


[56, 25, 142, 121, 55, 84, 12, 18, 31, 4]
[5]
마음 도 정리 해야 할 때 가 있 어요 .
[1509, 27, 31, 4, 475, 407, 21, 54, 20, 229, 7, 164, 61, 275, 11, 4]
[5]
두근거리 겠 어요 . 자연 스럽 지 않 은 곳 이 조금 만 기다리 세요 .
