In [1]:
import random
import re

from konlpy.tag import Mecab
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
import numpy as np
import pandas as pd
import sentencepiece as spm
import tensorflow as tf
from tqdm.notebook import tqdm
import nltk
from gensim.models import Word2Vec

# 데이터 불러오기 

In [2]:
dataset_df = pd.read_csv("~/data/ChatbotData.csv")
dataset_df.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


## 결측치 확인 

In [3]:
dataset_df.isna().sum()

Q        0
A        0
label    0
dtype: int64

## 중복 데이터 확인

In [4]:
dataset_df.duplicated().sum()

0

## 질문/답변 분리

In [5]:
questions = dataset_df['Q'].to_numpy()
answers = dataset_df['A'].to_numpy()

In [6]:
print(questions.shape, answers.shape)

(11823,) (11823,)


# 데이터 정제

In [7]:
def preprocess_sentence(sentence):
    sentence = sentence.lower()
    for p in ['!', '?', '.', ',']:
        sentence = sentence.replace(p, ' ' + p + ' ')
    sentence = re.sub(r'[^a-zA-Zㄱ-ㅎ가-힣0-9!?\., ]', ' ', sentence)
    sentence = re.sub(r' +', ' ', sentence)
    sentence = sentence.strip()
    
    return sentence

# 토큰화

In [8]:
mecab = Mecab()

def build_corpus(sentences, tokenizer):
    corpus = []
    for sentence in sentences:
        sentence = preprocess_sentence(sentence)
        tokens = tokenizer.morphs(sentence)
        corpus.append(' '.join(tokens))
    return corpus

In [9]:
que_corpus = build_corpus(questions, mecab)
ans_corpus = build_corpus(answers, mecab)

In [10]:
print(len(que_corpus), len(ans_corpus))
print(questions[0])
print(que_corpus[0])

11823 11823
12시 땡!
12 시 땡 !


# 데이터 증대 

## 학습된 Word2Vec 모델 불러오기

In [11]:
wv = Word2Vec.load('data/word2vec_ko.model').wv

In [12]:
wv.most_similar('바나나')

[('파인애플', 0.8756669759750366),
 ('아몬드', 0.8629516363143921),
 ('딸기', 0.8533044457435608),
 ('땅콩', 0.8528051376342773),
 ('바닐라', 0.8519390225410461),
 ('당근', 0.8484757542610168),
 ('카사바', 0.8473575711250305),
 ('코코넛', 0.8457838892936707),
 ('블루베리', 0.8431717157363892),
 ('망고', 0.8348483443260193)]

## (원본, 원본), (augmented, 원본), (원본, augmented) 쌍으로 데이터 3배 증대하기 

In [13]:
# 어휘 대치 함수
def lexical_sub(sample_sentence, wv):
    sample_tokens = sample_sentence.split()

    # 랜덤으로 Word2Vec 사전에 존재하는 단어 고르기
    for _ in range(len(sample_tokens)):
        selected_tok = random.choice(sample_tokens)
        if selected_tok in wv.key_to_index:
            break
        else:
            # 못찾는다면 어휘 대치 하지 않음
            selected_tok = None

    result = ""
    for tok in sample_tokens:
        if tok is selected_tok:
            result += wv.most_similar(tok)[0][0] + " "
        else:
            result += tok + " "
    return result

In [14]:
que_corpus_augmented = []
ans_corpus_augmented = []

for question, answer in tqdm(zip(que_corpus, ans_corpus), total=len(que_corpus)):
    # 어휘 대치
    question_augmented = lexical_sub(question, wv)
    answer_augmented = lexical_sub(answer, wv)
    
    # 원본, 원본 쌍
    que_corpus_augmented.append(question)
    ans_corpus_augmented.append(answer)
    
    # augmented, 원본 쌍
    que_corpus_augmented.append(question_augmented)
    ans_corpus_augmented.append(answer)
    
    # 원본, augmented 쌍
    que_corpus_augmented.append(question)
    ans_corpus_augmented.append(answer_augmented)
    
print(len(que_corpus_augmented), len(ans_corpus_augmented))

  0%|          | 0/11823 [00:00<?, ?it/s]

35469 35469


## 벡터화 

In [15]:
# 시작 토큰, 종료 토큰 추가
for i in tqdm(range(len(que_corpus_augmented))):
    que_corpus_augmented[i] = '<start> ' + que_corpus_augmented[i] + ' <end>'
    
for i in tqdm(range(len(ans_corpus_augmented))):
    ans_corpus_augmented[i] = '<start> ' + ans_corpus_augmented[i] + ' <end>'

  0%|          | 0/35469 [00:00<?, ?it/s]

  0%|          | 0/35469 [00:00<?, ?it/s]

In [16]:
# SentencePiece를 통해 vectorizer 생성
def generate_vectorizer(corpus,
                       vocab_size,
                       pad_id=0,
                       bos_id=1,
                       eos_id=2,
                       unk_id=3):
    file = "./corpus.txt"
    model = "kor_spm"

    with open(file, 'w') as f:
        for row in corpus:
            f.write(str(row) + '\n')

    spm.SentencePieceTrainer.Train(
        '--input=./%s --model_prefix=%s --vocab_size=%d --model_type=bpe '\
        % (file, model, vocab_size) + \
        '--pad_id=%d --bos_id=%d --bos_piece=<start> --eos_id=%d --eos_piece=<end> --unk_id=%d --unk_piece=<unk>'\
        % (pad_id, bos_id, eos_id, unk_id)
    )

    vectorizer = spm.SentencePieceProcessor()
    vectorizer.Load('%s.model' % model)

    return vectorizer

In [17]:
VOCAB_SIZE = 8000

vectorizer = generate_vectorizer(que_corpus_augmented + ans_corpus_augmented,
                                vocab_size=VOCAB_SIZE)

sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=././corpus.txt --model_prefix=kor_spm --vocab_size=8000 --model_type=bpe --pad_id=0 --bos_id=1 --bos_piece=<start> --eos_id=2 --eos_piece=<end> --unk_id=3 --unk_piece=<unk>
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: ././corpus.txt
  input_format: 
  model_prefix: kor_spm
  model_type: BPE
  vocab_size: 8000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk

In [18]:
# 코퍼스 벡터화 & 패딩 추가
def vectorize_corpus(corpus, max_len, vectorizer):
    sequences = []
    
    for sentence in corpus:
        sequence = vectorizer.encode_as_ids(sentence)
        sequences.append(sequence)
        
    sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_len, padding='post')
    
    return sequences

ymbols. max_freq=210 min_freq=43
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=196 size=520 all=13525 active=1041 piece=▁벌
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=188 size=540 all=13572 active=1088 piece=▁찾아
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=180 size=560 all=13641 active=1157 piece=▁조심
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=174 size=580 all=13697 active=1213 piece=▁크
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=165 size=600 all=13738 active=1254 piece=▁확실
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=164 min_freq=39
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=158 size=620 all=13791 active=1053 piece=▁밥
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=150 size=640 all=13861 active=1123 piece=▁건강
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=142 size=660 all=13872 active=1134 piece=▁글
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=139 size=680 all=13915 active=1177 piece=▁마주
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=6 size=5040 all=12864 active=964 piece=▁임신
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=6 size=5060 all=12849 active=949 piece=▁전형
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=6 size=5080 all=12831 active=931 piece=▁쥬스
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=6 size=5100 all=12814 active=914 piece=▁차임
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=6 min_freq=4
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=6 size=5120 all=12797 active=984 piece=▁최우
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=6 size=5140 all=12781 active=968 piece=▁쿠폰
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=6 size=5160 all=12761 active=948 piece=▁푸석
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=6 size=5180 all=12746 active=933 piece=▁헬멧
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=6 size=5200 all=12730 active=917 piece=라이너
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=6 min_freq=4
bpe_model_train

In [19]:
MAX_LEN = 20

enc_train = vectorize_corpus(que_corpus_augmented, MAX_LEN, vectorizer)
dec_train = vectorize_corpus(ans_corpus_augmented, MAX_LEN, vectorizer)

print(enc_train[0])

dded: freq=6 size=5560 all=12387 active=944 piece=▁째깍째깍
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=6 size=5580 all=12367 active=924 piece=▁흘러갑니다
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=5 size=5600 all=12366 active=923 piece=곡차
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=5 min_freq=3
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=5 size=5620 all=12370 active=1003 piece=모이
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=5 size=5640 all=12372 active=1005 piece=처지
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=5 size=5660 all=12363 active=996 piece=▁고작
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=5 size=5680 all=12349 active=982 piece=▁녀석
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=5 size=5700 all=12331 active=964 piece=▁동전
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=5 min_freq=3
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=5 size=5720 all=12312 active=982 piece=▁멋졌
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=

[6899    3 7317 7728 7532 7856 7728 7444 3490   41 2235  133 6899    3
 7676 7440 7877 7444    0    0]


# 모델 생성

## Positional Encoding 

In [20]:
def positional_encoding(pos, d_model):
    def cal_angle(position, i):
        return position / np.power(10000, (2*(i//2)) / np.float32(d_model))

    def get_posi_angle_vec(position):
        return [cal_angle(position, i) for i in range(d_model)]

    sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(pos)])

    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])

    return sinusoid_table

## Masking 

In [21]:
def generate_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

def generate_lookahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask

def generate_masks(src, tgt):
    enc_mask = generate_padding_mask(src)
    dec_enc_mask = generate_padding_mask(src)

    dec_lookahead_mask = generate_lookahead_mask(tgt.shape[1])
    dec_tgt_padding_mask = generate_padding_mask(tgt)
    dec_mask = tf.maximum(dec_tgt_padding_mask, dec_lookahead_mask)

    return enc_mask, dec_enc_mask, dec_mask

## Multi-Head Attention 

In [22]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        self.depth = d_model // self.num_heads

        self.W_q = tf.keras.layers.Dense(d_model)
        self.W_k = tf.keras.layers.Dense(d_model)
        self.W_v = tf.keras.layers.Dense(d_model)

        self.linear = tf.keras.layers.Dense(d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask):
        d_k = tf.cast(K.shape[-1], tf.float32)
        QK = tf.matmul(Q, K, transpose_b=True)

        scaled_qk = QK / tf.math.sqrt(d_k)

        if mask is not None: scaled_qk += (mask * -1e9)  

        attentions = tf.nn.softmax(scaled_qk, axis=-1)
        out = tf.matmul(attentions, V)
        return out, attentions

    def split_heads(self, x):
        bsz = x.shape[0]
        split_x = tf.reshape(x, (bsz, -1, self.num_heads, self.depth))
        split_x = tf.transpose(split_x, perm=[0, 2, 1, 3])

        return split_x

    def combine_heads(self, x):
        bsz = x.shape[0]
        combined_x = tf.transpose(x, perm=[0, 2, 1, 3])
        combined_x = tf.reshape(combined_x, (bsz, -1, self.d_model))

        return combined_x

    def call(self, Q, K, V, mask):
        WQ = self.W_q(Q)
        WK = self.W_k(K)
        WV = self.W_v(V)

        WQ_splits = self.split_heads(WQ)
        WK_splits = self.split_heads(WK)
        WV_splits = self.split_heads(WV)

        out, attention_weights = self.scaled_dot_product_attention(
            WQ_splits, WK_splits, WV_splits, mask)

        out = self.combine_heads(out)
        out = self.linear(out)

        return out, attention_weights

## Position-Wise Feed Forward Network 

In [23]:
class PoswiseFeedForwardNet(tf.keras.layers.Layer):
    def __init__(self, d_model, d_ff):
        super(PoswiseFeedForwardNet, self).__init__()
        self.d_model = d_model
        self.d_ff = d_ff

        self.fc1 = tf.keras.layers.Dense(d_ff, activation='relu')
        self.fc2 = tf.keras.layers.Dense(d_model)

    def call(self, x):
        out = self.fc1(x)
        out = self.fc2(out)

        return out

## Encoder Layer 

In [24]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, n_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()

        self.enc_self_attn = MultiHeadAttention(d_model, n_heads)
        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.do = tf.keras.layers.Dropout(dropout)

    def call(self, x, mask):
        '''
        Multi-Head Attention
        '''
        residual = x
        out = self.norm_1(x)
        out, enc_attn = self.enc_self_attn(out, out, out, mask)
        out = self.do(out)
        out += residual

        '''
        Position-Wise Feed Forward Network
        '''
        residual = out
        out = self.norm_2(out)
        out = self.ffn(out)
        out = self.do(out)
        out += residual

        return out, enc_attn

## Decoder Layer 

In [25]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()

        self.dec_self_attn = MultiHeadAttention(d_model, num_heads)
        self.enc_dec_attn = MultiHeadAttention(d_model, num_heads)

        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.do = tf.keras.layers.Dropout(dropout)

    def call(self, x, enc_out, dec_enc_mask, padding_mask):
        '''
        Masked Multi-Head Attention
        '''
        residual = x
        out = self.norm_1(x)
        out, dec_attn = self.dec_self_attn(out, out, out, padding_mask)
        out = self.do(out)
        out += residual

        '''
        Multi-Head Attention
        '''
        residual = out
        out = self.norm_2(out)
        # Q, K, V 순서에 주의하세요!
        out, dec_enc_attn = self.enc_dec_attn(Q=out, K=enc_out, V=enc_out, mask=dec_enc_mask)
        out = self.do(out)
        out += residual

        '''
        Position-Wise Feed Forward Network
        '''
        residual = out
        out = self.norm_3(out)
        out = self.ffn(out)
        out = self.do(out)
        out += residual

        return out, dec_attn, dec_enc_attn

## Encoder 

In [26]:
class Encoder(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    dropout):
        super(Encoder, self).__init__()
        self.n_layers = n_layers
        self.enc_layers = [EncoderLayer(d_model, n_heads, d_ff, dropout) 
                        for _ in range(n_layers)]

        self.do = tf.keras.layers.Dropout(dropout)

    def call(self, x, mask):
        out = x

        enc_attns = list()
        for i in range(self.n_layers):
            out, enc_attn = self.enc_layers[i](out, mask)
            enc_attns.append(enc_attn)

        return out, enc_attns

## Decoder 

In [27]:
class Decoder(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    dropout):
        super(Decoder, self).__init__()
        self.n_layers = n_layers
        self.dec_layers = [DecoderLayer(d_model, n_heads, d_ff, dropout) 
                            for _ in range(n_layers)]

    def call(self, x, enc_out, dec_enc_mask, padding_mask):
        out = x

        dec_attns = list()
        dec_enc_attns = list()
        for i in range(self.n_layers):
            out, dec_attn, dec_enc_attn = \
            self.dec_layers[i](out, enc_out, dec_enc_mask, padding_mask)

            dec_attns.append(dec_attn)
            dec_enc_attns.append(dec_enc_attn)

        return out, dec_attns, dec_enc_attns

## Transformer 

In [28]:
class Transformer(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    src_vocab_size,
                    tgt_vocab_size,
                    pos_len,
                    dropout=0.2,
                    shared_fc=True,
                    shared_emb=False):
        super(Transformer, self).__init__()

        self.d_model = tf.cast(d_model, tf.float32)

        if shared_emb:
            self.enc_emb = self.dec_emb = \
            tf.keras.layers.Embedding(src_vocab_size, d_model)
        else:
            self.enc_emb = tf.keras.layers.Embedding(src_vocab_size, d_model)
            self.dec_emb = tf.keras.layers.Embedding(tgt_vocab_size, d_model)

        self.pos_encoding = positional_encoding(pos_len, d_model)
        self.do = tf.keras.layers.Dropout(dropout)

        self.encoder = Encoder(n_layers, d_model, n_heads, d_ff, dropout)
        self.decoder = Decoder(n_layers, d_model, n_heads, d_ff, dropout)

        self.fc = tf.keras.layers.Dense(tgt_vocab_size)

        self.shared_fc = shared_fc

        if shared_fc:
            self.fc.set_weights(tf.transpose(self.dec_emb.weights))

    def embedding(self, emb, x):
        seq_len = x.shape[1]

        out = emb(x)

        if self.shared_fc: out *= tf.math.sqrt(self.d_model)

        out += self.pos_encoding[np.newaxis, ...][:, :seq_len, :]
        out = self.do(out)

        return out

    def call(self, enc_in, dec_in, enc_mask, dec_enc_mask, dec_mask):
        enc_in = self.embedding(self.enc_emb, enc_in)
        dec_in = self.embedding(self.dec_emb, dec_in)

        enc_out, enc_attns = self.encoder(enc_in, enc_mask)

        dec_out, dec_attns, dec_enc_attns = \
        self.decoder(dec_in, enc_out, dec_enc_mask, dec_mask)

        logits = self.fc(dec_out)
        
        return logits, enc_attns, dec_attns, dec_enc_attns

## Learning Rate Scheduler

In [29]:
class LearningRateScheduler(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(LearningRateScheduler, self).__init__()

        self.d_model = d_model
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = step ** -0.5
        arg2 = step * (self.warmup_steps ** -1.5)

        return (self.d_model ** -0.5) * tf.math.minimum(arg1, arg2)

## Loss Function 

In [30]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

## Optimizer 

In [31]:
D_MODEL = 512
WARMUP_STEPS = 4000

lr_scheduler = LearningRateScheduler(D_MODEL, WARMUP_STEPS)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_scheduler)

# 모델 훈련하기 

In [60]:
@tf.function()
def train_step(src, tgt, model, optimizer):
    tgt_in = tgt[:, :-1]
    gold = tgt[:, 1:]

    enc_mask, dec_enc_mask, dec_mask = generate_masks(src, tgt_in)

    with tf.GradientTape() as tape:
        predictions, enc_attns, dec_attns, dec_enc_attns = \
        model(src, tgt_in, enc_mask, dec_enc_mask, dec_mask)
        loss = loss_function(gold, predictions)

    gradients = tape.gradient(loss, model.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return loss, enc_attns, dec_attns, dec_enc_attns

In [61]:
hyper_params = {
    'n_layers': 2,
    'd_model': D_MODEL,
    'n_heads': 4,
    'd_ff': 2048,
    'src_vocab_size': VOCAB_SIZE,
    'tgt_vocab_size': VOCAB_SIZE,
    'pos_len': MAX_LEN, # prev: 200
    'dropout': 0.5,
    'shared_fc': True,
    'shared_emb': True
}

transformer = Transformer(**hyper_params)

transformer.compile(optimizer=optimizer)

In [62]:
EPOCHS = 40
BATCH_SIZE = 50

for epoch in range(EPOCHS):
    num_batches = len(enc_train) // BATCH_SIZE
    
    shuffled_index = np.arange(len(enc_train))
    np.random.shuffle(shuffled_index)
    
    total_loss = 0.
    for i in tqdm(range(num_batches)):
        start_idx = i * BATCH_SIZE
        end_idx = (i + 1) * BATCH_SIZE
        batch_index = shuffled_index[start_idx:end_idx]
        
        enc_batch = enc_train[batch_index]
        dec_batch = dec_train[batch_index]
        
        loss, _, _, _ = train_step(enc_batch, dec_batch, transformer, optimizer)
        total_loss += loss
    print(f'epoch {epoch} loss: {total_loss}')

  0%|          | 0/709 [00:00<?, ?it/s]

epoch 0 loss: 1171.553955078125


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 1 loss: 429.31561279296875


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 2 loss: 203.98739624023438


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 3 loss: 128.5432586669922


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 4 loss: 103.64532470703125


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 5 loss: 93.74603271484375


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 6 loss: 88.4327163696289


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 7 loss: 83.51567840576172


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 8 loss: 80.37123107910156


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 9 loss: 76.33719635009766


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 10 loss: 73.25294494628906


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 11 loss: 70.69779968261719


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 12 loss: 68.81723022460938


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 13 loss: 66.36868286132812


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 14 loss: 63.88459777832031


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 15 loss: 61.830055236816406


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 16 loss: 60.637386322021484


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 17 loss: 59.0507926940918


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 18 loss: 57.76933288574219


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 19 loss: 55.52130889892578


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 20 loss: 54.383426666259766


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 21 loss: 53.6436653137207


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 22 loss: 52.97811508178711


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 23 loss: 51.121089935302734


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 24 loss: 50.473323822021484


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 25 loss: 49.049835205078125


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 26 loss: 47.99140167236328


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 27 loss: 47.50832748413086


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 28 loss: 47.33024597167969


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 29 loss: 46.55543899536133


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 30 loss: 45.27193832397461


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 31 loss: 44.38750076293945


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 32 loss: 43.82415008544922


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 33 loss: 43.490413665771484


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 34 loss: 42.35816192626953


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 35 loss: 42.16437530517578


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 36 loss: 42.33056640625


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 37 loss: 41.41029739379883


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 38 loss: 40.45881271362305


  0%|          | 0/709 [00:00<?, ?it/s]

epoch 39 loss: 40.31974411010742


# 훈련 결과 확인하기

In [63]:
def translate(tokens, model, tgt_vectorizer):
    ids = []
    output = tf.expand_dims([tgt_vectorizer.bos_id()], 0)   
    for i in range(MAX_LEN):
        enc_padding_mask, combined_mask, dec_padding_mask = generate_masks(tokens, output)

        predictions, _, _, _ = model(tokens, 
                                      output,
                                      enc_padding_mask,
                                      combined_mask,
                                      dec_padding_mask)

        predicted_id = \
        tf.argmax(tf.math.softmax(predictions, axis=-1)[0, -1]).numpy().item()

        if tgt_vectorizer.eos_id() == predicted_id:
            result = tgt_vectorizer.decode_ids(ids)  
            return result

        ids.append(predicted_id)
        output = tf.concat([output, tf.expand_dims([predicted_id], 0)], axis=-1)

    result = tgt_vectorizer.decode_ids(ids)  
    return result

In [64]:
test_text = [
    "지루하다, 놀러가고 싶어.",
    "오늘 일찍 일어났더니 피곤하다.",
    "간만에 여자친구랑 데이트 하기로 했어.",
    "집에 있는다는 소리야.",
]

test_corpus = build_corpus(test_text, mecab)
enc_test = vectorize_corpus(test_corpus, MAX_LEN, vectorizer)

print(enc_test)

[[3123    7   24  325  262  392   12   15   60    8    4    0    0    0
     0    0    0    0    0    0]
 [ 154  907 2084  847  795    7   24    4    0    0    0    0    0    0
     0    0    0    0    0    0]
 [3703   97   71  184  405    7   38  118   85    8    4    0    0    0
     0    0    0    0    0    0]
 [ 178   20   25 1429  911  108    4    0    0    0    0    0    0    0
     0    0    0    0    0    0]]


In [65]:
dec_preds = []

for i in range(enc_test.shape[0]):
    pred = translate(enc_test[i:i+1], transformer, vectorizer)
    dec_preds.append(pred)
    
print(dec_preds)

['에 한 번 웃 어 봐요 . 기분 이 좋 겠 어요 .  ⁇ end>>', '는 것 처럼 반응 이 일어나 서 완전히 바뀌 게 되 죠 .  ⁇ end>>', '을 받 아 가 서 같이 먹 는 게 좋 겠 어요 .  ⁇ end>>', '한다는 말 아끼 지 않 는 선 이 좋 겠 어요 .  ⁇ end>>>']


In [66]:
print('# 예문')
for i, question in enumerate(test_text):
    print(f"{i}. {question}")

print("\n---\n")

print("# 제출\n")
for i, answer in enumerate(dec_preds):
    print(f"> {i}. {answer}")
    
print("\nHyperparameters")
for key, value in hyper_params.items():
    print(f"> {key}: {value}")
    
print("\nTraining Parameters")
print("> Warmup Steps:", WARMUP_STEPS)
print("> Batch Size: ", BATCH_SIZE)
print("> Epoch At:", EPOCHS)

# 예문
0. 지루하다, 놀러가고 싶어.
1. 오늘 일찍 일어났더니 피곤하다.
2. 간만에 여자친구랑 데이트 하기로 했어.
3. 집에 있는다는 소리야.

---

# 제출

> 0. 에 한 번 웃 어 봐요 . 기분 이 좋 겠 어요 .  ⁇ end>>
> 1. 는 것 처럼 반응 이 일어나 서 완전히 바뀌 게 되 죠 .  ⁇ end>>
> 2. 을 받 아 가 서 같이 먹 는 게 좋 겠 어요 .  ⁇ end>>
> 3. 한다는 말 아끼 지 않 는 선 이 좋 겠 어요 .  ⁇ end>>>

Hyperparameters
> n_layers: 2
> d_model: 512
> n_heads: 4
> d_ff: 2048
> src_vocab_size: 8000
> tgt_vocab_size: 8000
> pos_len: 20
> dropout: 0.5
> shared_fc: True
> shared_emb: True

Training Parameters
> Warmup Steps: 4000
> Batch Size:  50
> Epoch At: 40


# 성능 측정하기

In [67]:
def calculate_bleu(reference, candidate, weights=[0.25, 0.25, 0.25, 0.25]):
    return sentence_bleu([reference],
                         candidate,
                         weights=weights,
                         smoothing_function=SmoothingFunction().method1)

In [68]:
enc_samples = enc_train[0:15:3, :]
dec_samples = dec_train[0:15:3, :]

for i in range(len(test_text)):
    question = vectorizer.decode_ids(enc_samples[i].tolist())
    answer = vectorizer.decode_ids(dec_samples[i].tolist())[9:]
    print("질문:", question)
    print("챗봇 대답:", dec_preds[i])
    print("원본 대답:", answer)
    print("BLEU score:", calculate_bleu(answer, dec_preds[i]))

질문:  ⁇ start> 12 시 땡 !  ⁇ end>
챗봇 대답: 에 한 번 웃 어 봐요 . 기분 이 좋 겠 어요 .  ⁇ end>>
원본 대답:  하루 가 또 가 네요 .  ⁇ end>
BLEU score: 0.2919781303036754
질문:  ⁇ start> 1 지망 학교 떨어졌 어  ⁇ end>
챗봇 대답: 는 것 처럼 반응 이 일어나 서 완전히 바뀌 게 되 죠 .  ⁇ end>>
원본 대답:  위로 해 드립니다 .  ⁇ end>
BLEU score: 0.22786124148191833
질문: art> 3 박 4 일 놀 러 가 고 싶 다  ⁇ end>
챗봇 대답: 을 받 아 가 서 같이 먹 는 게 좋 겠 어요 .  ⁇ end>>
원본 대답:  여행 은 언제나 좋 죠 .  ⁇ end>
BLEU score: 0.29759282342490984
질문: rt> 3 박 4 일 정도 놀 러 가 고 싶 다  ⁇ end>
챗봇 대답: 한다는 말 아끼 지 않 는 선 이 좋 겠 어요 .  ⁇ end>>>
원본 대답:  여행 은 언제나 좋 죠 .  ⁇ end>
BLEU score: 0.2892014777444793


# 회고

- SentencePiece에서 bpe 방식으로 토큰화할 때 vocab_size가 최대 개수를 넘어가면 무한 루프에 빠짐
    - 지금까지 bpe로 돌렸을때 4시간동안 안되는 이유
    - unigram의 경우에는 Warning 문구로 가능한 최대 단어 개수를 초과했다고 알려줌
- SentencePiece 라이브러리를 좀 더 살펴봐야함
    - `<start>`와 `<end>`를 토큰으로 등록해도 인코딩 문제가 존재함