In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import sentencepiece as spm
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

import re
import os
import random
import math

from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

In [2]:
# CSV 파일 읽기
file_path = './data/ChatbotData.csv'
chatbot_data = pd.read_csv(file_path)

# 데이터 확인
print(chatbot_data.head())

                 Q            A  label
0           12시 땡!   하루가 또 가네요.      0
1      1지망 학교 떨어졌어    위로해 드립니다.      0
2     3박4일 놀러가고 싶다  여행은 언제나 좋죠.      0
3  3박4일 정도 놀러가고 싶다  여행은 언제나 좋죠.      0
4          PPL 심하네   눈살이 찌푸려지죠.      0


In [3]:
question = chatbot_data['Q'].tolist()
answer = chatbot_data['A'].tolist()

In [4]:
print('question: ', len(question))
for sen in question[0:100][::20]: 
    print(">>", sen)
print('answer: ', len(answer))
for sen in answer[0:100][::20]: 
    print(">>", sen)

question:  11823
>> 12시 땡!
>> 가스비 비싼데 감기 걸리겠어
>> 간만에 떨리니까 좋더라
>> 감정컨트롤을 못하겠어
>> 개강룩 입어볼까
answer:  11823
>> 하루가 또 가네요.
>> 따뜻하게 사세요!
>> 떨리는 감정은 그 자체로 소중해요.
>> 그건 습관이에요.
>> 개시해보세요.


In [5]:
def preprocess_sentence(sentence):
    # 1. 모든 영문자를 소문자로 변환
    sentence = sentence.lower()

    # 2. 영문자, 한글, 숫자, 주요 특수문자만 남기고 나머지 제거, 탭문자 살리기
    sentence = re.sub(r"[^a-z0-9ㄱ-ㅎ가-힣.,!?()\t]", " ", sentence)

    # 3. 둘 이상의 공백을 하나의 공백으로 치환
    sentence = re.sub(r"\s{2,}", " ", sentence).strip()

    return sentence

In [6]:
question = list(map(preprocess_sentence, question))
answer = list(map(preprocess_sentence, answer))

In [7]:
total_sentence_count = len(question)
test_sentence_count = total_sentence_count // 100

print("Test Size: ", test_sentence_count)
print("\n")

train_question = question[:-test_sentence_count]
train_answer = answer[:-test_sentence_count]

test_question = question[-test_sentence_count:]
test_answer = answer[-test_sentence_count:]

print("Train question:", len(train_question))
for sen in train_question[0:100][::20]: 
    print(">>", sen)
print("\n")
print("Train answer:", len(train_answer))
for sen in train_answer[0:100][::20]: 
    print(">>", sen)
print("\n")   
print("Test question:", len(test_question))
for sen in test_question[0:100][::20]: 
    print(">>", sen)
print("\n")
print("Test answer:", len(test_answer))
for sen in test_answer[0:100][::20]: 
    print(">>", sen)

Test Size:  118


Train question: 11705
>> 12시 땡!
>> 가스비 비싼데 감기 걸리겠어
>> 간만에 떨리니까 좋더라
>> 감정컨트롤을 못하겠어
>> 개강룩 입어볼까


Train answer: 11705
>> 하루가 또 가네요.
>> 따뜻하게 사세요!
>> 떨리는 감정은 그 자체로 소중해요.
>> 그건 습관이에요.
>> 개시해보세요.


Test question: 118
>> 친구의 구남친을 좋아하게 됐어요.
>> 카페 알바생이 좋아졌는데 들이대도 괜찮을까?
>> 키 큰 여자는 별로인가?
>> 학교에 심남 있는데 연락해볼까?
>> 헤어지고 얼마 안됐는데 썸타는 거 가능?


Test answer: 118
>> 선택을 해야 겠네요.
>> 언제 끝나냐고 물어보세요.
>> 키는 중요하지 않을 거예요.
>> 호감을 어느 정도 표현해보는 것도 좋을 것 같아요.
>> 썸 정도는 언제 타든 상관 없는 거 같아요.


In [8]:
from konlpy.tag import Mecab

# Mecab 객체 생성
mecab = Mecab()

def build_corpus(src_data, tgt_data, tokenizer, max_len=50):
    """
    소스 및 타겟 문장 데이터를 전처리하고 토큰화하여 중복을 제거한 코퍼스 반환
    - src_data: 소스 문장 리스트
    - tgt_data: 타겟 문장 리스트
    - tokenizer: 토크나이즈 함수 (예: mecab.morphs)
    - max_len: 최대 토큰 길이
    """
    src_corpus, tgt_corpus = [], []
    # 중복 방지용 세트
    unique_src = set()
    unique_tgt = set()

    for src_sentence, tgt_sentence in zip(src_data, tgt_data):
        # 1. 문장 정제
        src_sentence = preprocess_sentence(src_sentence)
        tgt_sentence = preprocess_sentence(tgt_sentence)

        # 2. 토큰화
        src_tokens = tokenizer(src_sentence)
        tgt_tokens = tokenizer(tgt_sentence)

        # 3. 토큰 길이 검사
        if len(src_tokens) > max_len or len(tgt_tokens) > max_len:
            continue  # 길이 초과 문장은 제외

        # 4. 중복 체크 후 저장
        src_duplicated = tuple(src_tokens)
        tgt_duplicated = tuple(tgt_tokens)
            
        if src_duplicated not in unique_src and tgt_duplicated not in unique_tgt:
            src_corpus.append(src_tokens)
            tgt_corpus.append(tgt_tokens)
                
            unique_src.add(src_duplicated)
            unique_tgt.add(tgt_duplicated)

    return src_corpus, tgt_corpus

In [9]:
train_question, train_answer = build_corpus(train_question, train_answer, mecab.morphs)

print(question[:5])
print()
print(answer[:5])

['12시 땡!', '1지망 학교 떨어졌어', '3박4일 놀러가고 싶다', '3박4일 정도 놀러가고 싶다', 'ppl 심하네']

['하루가 또 가네요.', '위로해 드립니다.', '여행은 언제나 좋죠.', '여행은 언제나 좋죠.', '눈살이 찌푸려지죠.']


In [10]:
from gensim.models import Word2Vec

# Word2Vec 모델 로드
embedding_model_path = './data/ko.bin'  # 다운로드한 ko.bin 파일 경로

wv = Word2Vec.load(embedding_model_path)

In [11]:
def lexical_sub(sentence, wv):
    # 문장에서 무작위로 단어 선택
    selected_tok = random.choice(sentence)
    
    # 유사한 단어로 변환된 결과 문장을 저장할 변수
    result = []
    
    for tok in sentence:
        if tok == selected_tok:
            try:
                # 선택된 단어와 유사한 단어로 대체
                similar_word = wv.most_similar(tok)[0][0]
                result.append(similar_word)
            except KeyError:
                # 단어가 임베딩 모델에 없을 경우 원래 단어 사용
                result.append(tok)
        else:
            result.append(tok)
    
    return result

In [12]:
# 데이터 증강 함수
def augment_data(que_corpus, ans_corpus, wv):
    augmented_que_corpus = []
    augmented_ans_corpus = []

    # 1. 질문 문장에 Lexical Substitution 적용
    for que_sentence in tqdm(que_corpus, desc="Augmenting Questions"):
        augmented_que_corpus.append(lexical_sub(que_sentence, wv))


    # 2. 답변 문장에 Lexical Substitution 적용
    for ans_sentence in tqdm(ans_corpus, desc="Augmenting Answers"):
        augmented_ans_corpus.append(lexical_sub(ans_sentence, wv))


    # 3. 병렬 데이터셋 구성 (총 3배 데이터)
    final_que_corpus = que_corpus + augmented_que_corpus + que_corpus
    final_ans_corpus = ans_corpus + ans_corpus + augmented_ans_corpus

    return final_que_corpus, final_ans_corpus

In [13]:
# 데이터 증강
augmented_que_corpus, augmented_ans_corpus = augment_data(train_question, train_answer, wv)

# 결과 확인
print("증강 후 질문 데이터 개수:", len(augmented_que_corpus))
print("증강 후 답변 데이터 개수:", len(augmented_ans_corpus))
print("예시 증강 데이터:")
print("질문:", augmented_que_corpus[:5])
print("답변:", augmented_ans_corpus[:5])

Augmenting Questions:   0%|          | 0/7594 [00:00<?, ?it/s]

  similar_word = wv.most_similar(tok)[0][0]


Augmenting Answers:   0%|          | 0/7594 [00:00<?, ?it/s]

증강 후 질문 데이터 개수: 22782
증강 후 답변 데이터 개수: 22782
예시 증강 데이터:
질문: [['12', '시', '땡', '!'], ['1', '지망', '학교', '떨어졌', '어'], ['3', '박', '4', '일', '놀', '러', '가', '고', '싶', '다'], ['ppl', '심하', '네'], ['sd', '카드', '망가졌', '어']]
답변: [['하루', '가', '또', '가', '네요', '.'], ['위로', '해', '드립니다', '.'], ['여행', '은', '언제나', '좋', '죠', '.'], ['눈살', '이', '찌푸려', '지', '죠', '.'], ['다시', '새로', '사', '는', '게', '마음', '편해요', '.']]


In [14]:
# 타겟 데이터에 <start>와 <end> 토큰 추가
ans_question = [["<start>"] + corpus + ["<end>"] for corpus in augmented_que_corpus]
ans_answer = [["<start>"] + corpus + ["<end>"] for corpus in augmented_ans_corpus]

In [15]:
print("토큰 추가 후 ans_question:", ans_question[:5])
print("토큰 추가 후 ans_answer:", ans_answer[:5])

토큰 추가 후 ans_question: [['<start>', '12', '시', '땡', '!', '<end>'], ['<start>', '1', '지망', '학교', '떨어졌', '어', '<end>'], ['<start>', '3', '박', '4', '일', '놀', '러', '가', '고', '싶', '다', '<end>'], ['<start>', 'ppl', '심하', '네', '<end>'], ['<start>', 'sd', '카드', '망가졌', '어', '<end>']]
토큰 추가 후 ans_answer: [['<start>', '하루', '가', '또', '가', '네요', '.', '<end>'], ['<start>', '위로', '해', '드립니다', '.', '<end>'], ['<start>', '여행', '은', '언제나', '좋', '죠', '.', '<end>'], ['<start>', '눈살', '이', '찌푸려', '지', '죠', '.', '<end>'], ['<start>', '다시', '새로', '사', '는', '게', '마음', '편해요', '.', '<end>']]


In [16]:
def ko_tokenize(tokenized_corpus, VOCA_SIZE):
   
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCA_SIZE, filters='')
    tokenizer.fit_on_texts(tokenized_corpus)

    return tokenizer

def vectorize(tokenized_corpus, tokenizer):
    tensor = tokenizer.texts_to_sequences(tokenized_corpus)

    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post', maxlen=50)
    return tensor

In [17]:
VOCAB_SIZE = 20000

tokenizer = ko_tokenize(ans_question + ans_answer, VOCAB_SIZE)
enc_train = vectorize(ans_question, tokenizer)
dec_train = vectorize(ans_answer, tokenizer)

In [18]:
print(enc_train.shape)
print(dec_train.shape)

(22782, 50)
(22782, 50)


In [19]:
BATCH_SIZE = 64
train_dataset = tf.data.Dataset.from_tensor_slices((enc_train, dec_train)).batch(batch_size=BATCH_SIZE)

In [20]:
def positional_encoding(pos, d_model):
    def cal_angle(position, i):
        return position / np.power(10000, (2*(i//2)) / np.float32(d_model))

    def get_posi_angle_vec(position):
        return [cal_angle(position, i) for i in range(d_model)]

    sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(pos)])

    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])

    return sinusoid_table

In [21]:
def generate_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

def generate_lookahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask

def generate_masks(src, tgt):
    enc_mask = generate_padding_mask(src)
    dec_enc_mask = generate_padding_mask(src)

    dec_lookahead_mask = generate_lookahead_mask(tgt.shape[1])
    dec_tgt_padding_mask = generate_padding_mask(tgt)
    dec_mask = tf.maximum(dec_tgt_padding_mask, dec_lookahead_mask)

    return enc_mask, dec_enc_mask, dec_mask

In [22]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        
        self.depth = d_model // self.num_heads
        
        self.W_q = tf.keras.layers.Dense(d_model)
        self.W_k = tf.keras.layers.Dense(d_model)
        self.W_v = tf.keras.layers.Dense(d_model)
        
        self.linear = tf.keras.layers.Dense(d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask):
        d_k = tf.cast(K.shape[-1], tf.float32)
        QK = tf.matmul(Q, K, transpose_b=True)

        scaled_qk = QK / tf.math.sqrt(d_k)

        if mask is not None: scaled_qk += (mask * -1e9)  

        attentions = tf.nn.softmax(scaled_qk, axis=-1)
        out = tf.matmul(attentions, V)

        return out, attentions
        

    def split_heads(self, x):
        bsz = x.shape[0]
        split_x = tf.reshape(x, (bsz, -1, self.num_heads, self.depth))
        split_x = tf.transpose(split_x, perm=[0, 2, 1, 3])

        return split_x

    def combine_heads(self, x):
        bsz = x.shape[0]
        combined_x = tf.transpose(x, perm=[0, 2, 1, 3])
        combined_x = tf.reshape(combined_x, (bsz, -1, self.d_model))

        return combined_x

    
    def call(self, Q, K, V, mask):
        WQ = self.W_q(Q)
        WK = self.W_k(K)
        WV = self.W_v(V)
        
        WQ_splits = self.split_heads(WQ)
        WK_splits = self.split_heads(WK)
        WV_splits = self.split_heads(WV)
        
        out, attention_weights = self.scaled_dot_product_attention(
            WQ_splits, WK_splits, WV_splits, mask)
                        
        out = self.combine_heads(out)
        out = self.linear(out)
            
        return out, attention_weights

In [23]:
class PoswiseFeedForwardNet(tf.keras.layers.Layer):
    def __init__(self, d_model, d_ff):
        super(PoswiseFeedForwardNet, self).__init__()
        self.d_model = d_model
        self.d_ff = d_ff

        self.fc1 = tf.keras.layers.Dense(d_ff, activation='relu')
        self.fc2 = tf.keras.layers.Dense(d_model)

    def call(self, x):
        out = self.fc1(x)
        out = self.fc2(out)
            
        return out

In [24]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, n_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()

        self.enc_self_attn = MultiHeadAttention(d_model, n_heads)
        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.do = tf.keras.layers.Dropout(dropout)
        
    def call(self, x, mask):
        '''
        Multi-Head Attention
        '''
        residual = x
        out = self.norm_1(x)
        out, enc_attn = self.enc_self_attn(out, out, out, mask)
        out = self.do(out)
        out += residual
        
        '''
        Position-Wise Feed Forward Network
        '''
        residual = out
        out = self.norm_2(out)
        out = self.ffn(out)
        out = self.do(out)
        out += residual
        
        return out, enc_attn

In [25]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()

        self.dec_self_attn = MultiHeadAttention(d_model, num_heads)
        self.enc_dec_attn = MultiHeadAttention(d_model, num_heads)

        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.do = tf.keras.layers.Dropout(dropout)
    
    def call(self, x, enc_out, dec_enc_mask, padding_mask):
        '''
        Masked Multi-Head Attention
        '''
        residual = x
        out = self.norm_1(x)
        out, dec_attn = self.dec_self_attn(out, out, out, padding_mask)
        out = self.do(out)
        out += residual

        '''
        Multi-Head Attention
        '''
        residual = out
        out = self.norm_2(out)
        # Q, K, V 순서에 주의하세요!
        out, dec_enc_attn = self.enc_dec_attn(Q=out, K=enc_out, V=enc_out, mask=dec_enc_mask)
        out = self.do(out)
        out += residual
        
        '''
        Position-Wise Feed Forward Network
        '''
        residual = out
        out = self.norm_3(out)
        out = self.ffn(out)
        out = self.do(out)
        out += residual

        return out, dec_attn, dec_enc_attn

In [26]:
class Encoder(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    dropout):
        super(Encoder, self).__init__()
        self.n_layers = n_layers
        self.enc_layers = [EncoderLayer(d_model, n_heads, d_ff, dropout) 
                        for _ in range(n_layers)]
    
        self.do = tf.keras.layers.Dropout(dropout)
        
    def call(self, x, mask):
        out = x
    
        enc_attns = list()
        for i in range(self.n_layers):
            out, enc_attn = self.enc_layers[i](out, mask)
            enc_attns.append(enc_attn)
        
        return out, enc_attns

In [27]:
class Decoder(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    dropout):
        super(Decoder, self).__init__()
        self.n_layers = n_layers
        self.dec_layers = [DecoderLayer(d_model, n_heads, d_ff, dropout) 
                            for _ in range(n_layers)]
                            
    def call(self, x, enc_out, dec_enc_mask, padding_mask):
        out = x
    
        dec_attns = list()
        dec_enc_attns = list()
        for i in range(self.n_layers):
            out, dec_attn, dec_enc_attn = \
            self.dec_layers[i](out, enc_out, dec_enc_mask, padding_mask)

            dec_attns.append(dec_attn)
            dec_enc_attns.append(dec_enc_attn)

        return out, dec_attns, dec_enc_attns

In [28]:
class Transformer(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    src_vocab_size,
                    tgt_vocab_size,
                    pos_len,
                    dropout=0.2,
                    shared_fc=True,
                    shared_emb=False):
        super(Transformer, self).__init__()
        
        self.d_model = tf.cast(d_model, tf.float32)

        if shared_emb:
            self.enc_emb = self.dec_emb = \
            tf.keras.layers.Embedding(src_vocab_size, d_model)
        else:
            self.enc_emb = tf.keras.layers.Embedding(src_vocab_size, d_model)
            self.dec_emb = tf.keras.layers.Embedding(tgt_vocab_size, d_model)

        self.pos_encoding = positional_encoding(pos_len, d_model)
        self.do = tf.keras.layers.Dropout(dropout)

        self.encoder = Encoder(n_layers, d_model, n_heads, d_ff, dropout)
        self.decoder = Decoder(n_layers, d_model, n_heads, d_ff, dropout)

        self.fc = tf.keras.layers.Dense(tgt_vocab_size)

        self.shared_fc = shared_fc

        if shared_fc:
            self.fc.set_weights(tf.transpose(self.dec_emb.weights))

    def embedding(self, emb, x):
        seq_len = x.shape[1]

        out = emb(x)

        if self.shared_fc: out *= tf.math.sqrt(self.d_model)

        out += self.pos_encoding[np.newaxis, ...][:, :seq_len, :]
        out = self.do(out)

        return out

        
    def call(self, enc_in, dec_in, enc_mask, dec_enc_mask, dec_mask):
        enc_in = self.embedding(self.enc_emb, enc_in)
        dec_in = self.embedding(self.dec_emb, dec_in)

        enc_out, enc_attns = self.encoder(enc_in, enc_mask)
        
        dec_out, dec_attns, dec_enc_attns = \
        self.decoder(dec_in, enc_out, dec_enc_mask, dec_mask)
        
        logits = self.fc(dec_out)
        
        return logits, enc_attns, dec_attns, dec_enc_attns

In [29]:
transformer = Transformer(
    n_layers=1,
    d_model=368,
    n_heads=8,
    d_ff=1024,
    src_vocab_size=VOCAB_SIZE,
    tgt_vocab_size=VOCAB_SIZE,
    pos_len=200,
    dropout=0.2,
    shared_fc=True,
    shared_emb=True)

d_model=368

In [30]:
class LearningRateScheduler(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(LearningRateScheduler, self).__init__()
        
        self.d_model = tf.cast(d_model, tf.float32)  # d_model을 float32로 변환
        self.warmup_steps = tf.cast(warmup_steps, tf.float32)  # warmup_steps을 float32로 변환
    
    def __call__(self, step):
        step = tf.cast(step, tf.float32)  # step도 float32로 변환
        
        # 모든 지수 연산에 대해 tf.constant()를 사용하여 명시적 타입 지정
        arg1 = tf.math.pow(step, tf.constant(-0.5, dtype=tf.float32))
        arg2 = step * tf.math.pow(self.warmup_steps, tf.constant(-1.5, dtype=tf.float32))
        
        return tf.math.pow(self.d_model, tf.constant(-0.5, dtype=tf.float32)) * tf.math.minimum(arg1, arg2)

In [31]:
learning_rate = LearningRateScheduler(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate,
                                        beta_1=0.9,
                                        beta_2=0.98, 
                                        epsilon=1e-9)

In [32]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

In [33]:
@tf.function()
def train_step(src, tgt, model, optimizer):
    tgt_in = tgt[:, :-1]  # Decoder의 input
    gold = tgt[:, 1:]     # Decoder의 output과 비교하기 위해 right shift를 통해 생성한 최종 타겟

    enc_mask, dec_enc_mask, dec_mask = generate_masks(src, tgt_in)

    with tf.GradientTape() as tape:
        predictions, enc_attns, dec_attns, dec_enc_attns = \
        model(src, tgt_in, enc_mask, dec_enc_mask, dec_mask)
        loss = loss_function(gold, predictions)

    gradients = tape.gradient(loss, model.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return loss, enc_attns, dec_attns, dec_enc_attns

In [34]:
EPOCHS = 10

for epoch in range(EPOCHS):
    total_loss = 0
    dataset_count = tf.data.experimental.cardinality(train_dataset).numpy()
    tqdm_bar = tqdm(total=dataset_count, desc=f'Epoch {epoch + 1}/{EPOCHS}')

    # 배치 단위로 학습
    for batch, (inputs, targets) in enumerate(train_dataset):
        loss, _, _, _ = train_step(inputs, targets, transformer, optimizer)
        total_loss += float(loss)

        # 진행 상황 업데이트
        tqdm_bar.update(1)
        tqdm_bar.set_postfix(loss=float(loss))

    tqdm_bar.close()
    print(f'Epoch {epoch + 1}, Loss: {total_loss / dataset_count:.4f}')

Epoch 1/10:   0%|          | 0/356 [00:00<?, ?it/s]

Epoch 1, Loss: 6.3047


Epoch 2/10:   0%|          | 0/356 [00:00<?, ?it/s]

Epoch 2, Loss: 3.8588


Epoch 3/10:   0%|          | 0/356 [00:00<?, ?it/s]

Epoch 3, Loss: 3.2788


Epoch 4/10:   0%|          | 0/356 [00:00<?, ?it/s]

Epoch 4, Loss: 2.8477


Epoch 5/10:   0%|          | 0/356 [00:00<?, ?it/s]

Epoch 5, Loss: 2.3135


Epoch 6/10:   0%|          | 0/356 [00:00<?, ?it/s]

Epoch 6, Loss: 1.6401


Epoch 7/10:   0%|          | 0/356 [00:00<?, ?it/s]

Epoch 7, Loss: 1.1064


Epoch 8/10:   0%|          | 0/356 [00:00<?, ?it/s]

Epoch 8, Loss: 0.7714


Epoch 9/10:   0%|          | 0/356 [00:00<?, ?it/s]

Epoch 9, Loss: 0.5874


Epoch 10/10:   0%|          | 0/356 [00:00<?, ?it/s]

Epoch 10, Loss: 0.5074


In [35]:
def sequences_to_texts(sequences, tokenizer):
    reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))
    return [" ".join([reverse_word_map.get(i, "") for i in seq]) for seq in sequences]


def translate(tokens, model, src_tokenizer, tgt_tokenizer):
    padded_tokens = tf.keras.preprocessing.sequence.pad_sequences(tokens,
                                                           maxlen=MAX_LEN,
                                                           padding='post')
    
    ids = []
    output = tf.expand_dims([tgt_tokenizer.word_index['<start>']], 0)  
    for i in range(MAX_LEN):
        enc_padding_mask, combined_mask, dec_padding_mask = \
        generate_masks(padded_tokens, output)

        predictions, _, _, _ = model(padded_tokens, 
                                      output,
                                      enc_padding_mask,
                                      combined_mask,
                                      dec_padding_mask)

        predicted_id = \
        tf.argmax(tf.math.softmax(predictions, axis=-1)[0, -1]).numpy().item()

        if tgt_tokenizer.word_index['<end>'] == predicted_id:
            result = sequences_to_texts(ids, tgt_tokenizer)
            return result

        ids.append([predicted_id])
        output = tf.concat([output, tf.expand_dims([predicted_id], 0)], axis=-1)
    
    result = sequences_to_texts(ids, tgt_tokenizer)
    return result

In [36]:
def eval_bleu_single(model, src_sentence, tgt_sentence, src_tokenizer, tgt_tokenizer, verbose=True):
    src_tokens = src_tokenizer.texts_to_sequences(src_sentence)
    tgt_tokens = tgt_tokenizer.texts_to_sequences(tgt_sentence)

    if (len(src_tokens) > MAX_LEN): return None
    if (len(tgt_tokens) > MAX_LEN): return None

    reference = tgt_sentence
    candidate = translate(src_tokens, model, src_tokenizer, tgt_tokenizer)

    score = sentence_bleu([reference], candidate,
                          smoothing_function=SmoothingFunction().method1)

    if verbose:
        print("Source Sentence: ", src_sentence)
        print("Model Prediction: ", candidate)
        print("Real: ", reference)
        print("Score: %lf\n" % score)
        
    return score

In [37]:
def eval_bleu(model, src_sentences, tgt_sentence, src_tokenizer, tgt_tokenizer, verbose=True):
    total_score = 0.0
    sample_size = len(src_sentences)
    
    for idx in tqdm(range(sample_size)):
        score = eval_bleu_single(model, src_sentences[idx], tgt_sentence[idx], src_tokenizer, tgt_tokenizer, verbose)
        if not score: continue
        
        total_score += score
    
    print("Num of Sample:", sample_size)
    print("Total Score:", total_score / sample_size)

In [39]:
MAX_LEN = 50
eval_bleu(transformer, test_question, test_answer, tokenizer, tokenizer, verbose=False)

  0%|          | 0/118 [00:00<?, ?it/s]

Num of Sample: 118
Total Score: 0.003889679875584514
