# 12-1. 들어가며

학습 목표

1. 번역 모델을 활용한 챗봇 만들기

# 12-2. 번역 데이터 준비

데이터셋 : 영어 - 스페인어 데이터 셋

In [8]:
# sentencepiece 설치

!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 5.1 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97


In [9]:
# 라이브러리 불러오기

import numpy as np
import pandas as pd
import tensorflow as tf
import sentencepiece as spm
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

import re
import os
import random
import math

from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

print(tf.__version__)

2.9.2


In [3]:
# 데이터 불러오기

zip_path = tf.keras.utils.get_file(
    'spa-eng.zip',
    origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [4]:
# 데이터 출력

file_path = os.path.dirname(zip_path)+"/spa-eng/spa.txt"

with open(file_path, "r") as f:
    spa_eng_sentences = f.read().splitlines()

spa_eng_sentences = list(set(spa_eng_sentences)) 
total_sentence_count = len(spa_eng_sentences)
print("Example:", total_sentence_count)

for sen in spa_eng_sentences[0:100][::20]: 
    print(">>", sen)

# 총 118964개 데이터가 있다.

Example: 118964
>> Blow the horn so that car will let us pass.	Toque la bocina para que ese auto nos deje pasar.
>> Would you like to be considered for the job?	¿Le gustaría que se le considerase para el trabajo?
>> Women generally live longer than men.	Las mujeres generalmente viven más que los hombres.
>> I won't have enough time for everything I want to do.	No voy a tener suficiente tiempo para todo lo que quiero hacer.
>> The floor was very cold.	El piso estaba muy frío.


In [5]:
# 데이터 전처리

def preprocess_sentence(sentence):
    sentence = sentence.lower()
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = sentence.strip()
    return sentence

In [6]:
# 전처리 적용

spa_eng_sentences = list(map(preprocess_sentence, spa_eng_sentences))

In [7]:
# 테스트에 사용할 데이터 분리

test_sentence_count = total_sentence_count // 200
print("Test Size: ", test_sentence_count)
print("\n")

train_spa_eng_sentences = spa_eng_sentences[:-test_sentence_count]
test_spa_eng_sentences = spa_eng_sentences[-test_sentence_count:]
print("Train Example:", len(train_spa_eng_sentences))
for sen in train_spa_eng_sentences[0:100][::20]: 
    print(">>", sen)
print("\n")
print("Test Example:", len(test_spa_eng_sentences))
for sen in test_spa_eng_sentences[0:100][::20]: 
    print(">>", sen)

Test Size:  594


Train Example: 118370
>> blow the horn so that car will let us pass.	toque la bocina para que ese auto nos deje pasar.
>> would you like to be considered for the job?	¿le gustaría que se le considerase para el trabajo?
>> women generally live longer than men.	las mujeres generalmente viven más que los hombres.
>> i won't have enough time for everything i want to do.	no voy a tener suficiente tiempo para todo lo que quiero hacer.
>> the floor was very cold.	el piso estaba muy frío.


Test Example: 594
>> he never fails to call his mother on her birthday.	él siempre llama a su madre el día de su cumpleaños.
>> the sun is shining in my face.	el sol brilla en mi cara.
>> i met him at a club.	lo conocí en el club.
>> how much is this going to cost us?	¿cuánto nos va a costar esto?
>> my father was lost in thought.	mi padre estaba en las nubes.


In [8]:
# 영어 문장과 스페인어 문장 구분하기
# tab으로 연결되어 있기에 \t을 기준으로 분리

def split_spa_eng_sentences(spa_eng_sentences):
    spa_sentences = []
    eng_sentences = []
    for spa_eng_sentence in tqdm(spa_eng_sentences):
        eng_sentence, spa_sentence = spa_eng_sentence.split('\t')
        spa_sentences.append(spa_sentence)
        eng_sentences.append(eng_sentence)
    return eng_sentences, spa_sentences

In [9]:
# train 데이터 나누기

train_eng_sentences, train_spa_sentences = split_spa_eng_sentences(train_spa_eng_sentences)
print(len(train_eng_sentences))
print(train_eng_sentences[0])
print('\n')
print(len(train_spa_sentences))
print(train_spa_sentences[0])

  0%|          | 0/118370 [00:00<?, ?it/s]

118370
blow the horn so that car will let us pass.


118370
toque la bocina para que ese auto nos deje pasar.


In [10]:
# test 데이터 나누기

test_eng_sentences, test_spa_sentences = split_spa_eng_sentences(test_spa_eng_sentences)
print(len(test_eng_sentences))
print(test_eng_sentences[0])
print('\n')
print(len(test_spa_sentences))
print(test_spa_sentences[0])

  0%|          | 0/594 [00:00<?, ?it/s]

594
he never fails to call his mother on her birthday.


594
él siempre llama a su madre el día de su cumpleaños.


토큰화

이제 문장 데이터를 토큰화를 해야 할 차례입니다. Sentencepiece 기반의 토크나이저를 생성해 주는 generate_tokenizer() 함수를 정의하여 토크나이저를 얻어보도록 하죠!

In [11]:
def generate_tokenizer(corpus,
                       vocab_size,
                       lang="spa-eng",
                       pad_id=0,   # pad token의 일련번호
                       bos_id=1,  # 문장의 시작을 의미하는 bos token(<s>)의 일련번호
                       eos_id=2,  # 문장의 끝을 의미하는 eos token(</s>)의 일련번호
                       unk_id=3):   # unk token의 일련번호
    file = "./%s_corpus.txt" % lang
    model = "%s_spm" % lang

    with open(file, 'w') as f:
        for row in corpus: f.write(str(row) + '\n')

    import sentencepiece as spm
    spm.SentencePieceTrainer.Train(
        '--input=./%s --model_prefix=%s --vocab_size=%d'\
        % (file, model, vocab_size) + \
        '--pad_id==%d --bos_id=%d --eos_id=%d --unk_id=%d'\
        % (pad_id, bos_id, eos_id, unk_id)
    )

    tokenizer = spm.SentencePieceProcessor()
    tokenizer.Load('%s.model' % model)

    return tokenizer

한-영 번역 때와 다륵, 두 언어가 단어 사전을 공유하도록 한다. 영어와 스페인어는 모두 알파벳으로 이뤄지는데다가 인도유럽족이기 때문에 기대할 수 있는 효과가 많다. 후에 챗봇을 만들 때에도 질문과 답변이 모두 한글로 이루어져 있기 때문에 Embedding 층을 공유하는 것이 성능에 도움이 됩니다.

In [12]:
VOCAB_SIZE = 20000
tokenizer = generate_tokenizer(train_eng_sentences + train_spa_sentences, VOCAB_SIZE, 'spa-eng')
tokenizer.set_encode_extra_options("bos:eos")  # 문장 양 끝에 <s> , </s> 추가

True

위에서 두 언어 사이에 단어 사전을 공유하기로 하였으므로 Encoder와 Decoder의 전용 토크나이저를 만들지 않고, 방금 만들어진 토크나이저를 두 언어 사이에서 공유하게 됩니다.

토크나이저가 준비되었으니 본격적으로 데이터를 토큰화하도록 하겠습니다. 토큰화를 해주는 함수를 만들어 줍니다.

In [13]:
def make_corpus(sentences, tokenizer):
    corpus = []
    for sentence in tqdm(sentences):
        tokens = tokenizer.encode_as_ids(sentence)
        corpus.append(tokens)
    return corpus

In [14]:
eng_corpus = make_corpus(train_eng_sentences, tokenizer)
spa_corpus = make_corpus(train_spa_sentences, tokenizer)

  0%|          | 0/118370 [00:00<?, ?it/s]

  0%|          | 0/118370 [00:00<?, ?it/s]

In [15]:
print(train_eng_sentences[0])
print(eng_corpus[0])
print('\n')
print(train_spa_sentences[0])
print(spa_corpus[0])

blow the horn so that car will let us pass.
[1, 5579, 9, 8393, 155, 32, 184, 90, 165, 182, 1432, 0, 2]


toque la bocina para que ese auto nos deje pasar.
[1, 6473, 16, 14679, 66, 15, 230, 468, 229, 2305, 754, 0, 2]


In [16]:
# 토큰의 길이 50으로 제한 후 패딩 작업

MAX_LEN = 50
enc_ndarray = tf.keras.preprocessing.sequence.pad_sequences(eng_corpus, maxlen=MAX_LEN, padding='post')
dec_ndarray = tf.keras.preprocessing.sequence.pad_sequences(spa_corpus, maxlen=MAX_LEN, padding='post')

In [17]:
# 데이터셋

BATCH_SIZE = 64
train_dataset = tf.data.Dataset.from_tensor_slices((enc_ndarray, dec_ndarray)).batch(batch_size=BATCH_SIZE)

# 12-3. 번역 모델 만들기

1. 트랜스포머 구현하기

아래 웹페이지를 참고하여 구현한다.

[위키독스: 트랜스포머](https://wikidocs.net/31379)

[Trax: Transformer](https://github.com/google/trax/blob/master/trax/models/transformer.py)

[Tensorflow: Transformer](https://www.tensorflow.org/text/tutorials/transformer)

In [18]:
# Positional Encoding 구현

def positional_encoding(pos, d_model):
    def cal_angle(position, i):
        return position / np.power(10000, int(i) / d_model)

    def get_posi_angle_vec(position):
        return [cal_angle(position, i) for i in range(d_model)]

    sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(pos)])
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])
    return sinusoid_table

tf.cast() : 배열의 dtype을 변경시켜준다
tf.math.equal() : 
[텐서플로우를 이용한 논리연산](https://chan-lab.tistory.com/m/9)

![](https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&fname=https%3A%2F%2Fk.kakaocdn.net%2Fdn%2FcRzJTh%2FbtqwMdHnKKz%2F64aVKDH4W04DsJm2ybOSr1%2Fimg.png)
<center>tf.math.equal 예시/center>



In [19]:
# Mask 생성하기

def generate_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

def generate_lookahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask

def generate_masks(src, tgt):
    enc_mask = generate_padding_mask(src)
    dec_enc_mask = generate_padding_mask(src)

    dec_lookahead_mask = generate_lookahead_mask(tgt.shape[1])
    dec_tgt_padding_mask = generate_padding_mask(tgt)
    dec_mask = tf.maximum(dec_tgt_padding_mask, dec_lookahead_mask)

    return enc_mask, dec_enc_mask, dec_mask

In [20]:
# Multi Head Attention 구현
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        
        self.depth = d_model // self.num_heads
        
        self.W_q = tf.keras.layers.Dense(d_model)
        self.W_k = tf.keras.layers.Dense(d_model)
        self.W_v = tf.keras.layers.Dense(d_model)
        
        self.linear = tf.keras.layers.Dense(d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask):
        d_k = tf.cast(K.shape[-1], tf.float32)
        QK = tf.matmul(Q, K, transpose_b=True)

        scaled_qk = QK / tf.math.sqrt(d_k)

        if mask is not None: scaled_qk += (mask * -1e9)  

        attentions = tf.nn.softmax(scaled_qk, axis=-1)
        out = tf.matmul(attentions, V)

        return out, attentions
        

    def split_heads(self, x):
        bsz = x.shape[0]
        split_x = tf.reshape(x, (bsz, -1, self.num_heads, self.depth))
        split_x = tf.transpose(split_x, perm=[0, 2, 1, 3])

        return split_x

    def combine_heads(self, x):
        bsz = x.shape[0]
        combined_x = tf.transpose(x, perm=[0, 2, 1, 3])
        combined_x = tf.reshape(combined_x, (bsz, -1, self.d_model))

        return combined_x

    
    def call(self, Q, K, V, mask):
        WQ = self.W_q(Q)
        WK = self.W_k(K)
        WV = self.W_v(V)
        
        WQ_splits = self.split_heads(WQ)
        WK_splits = self.split_heads(WK)
        WV_splits = self.split_heads(WV)
        
        out, attention_weights = self.scaled_dot_product_attention(
            WQ_splits, WK_splits, WV_splits, mask)
                        
        out = self.combine_heads(out)
        out = self.linear(out)
            
        return out, attention_weights

In [22]:
# Position-wise Feed Forward Network 구현
class PoswiseFeedForwardNet(tf.keras.layers.Layer):
    def __init__(self, d_model, d_ff):
        super(PoswiseFeedForwardNet, self).__init__()
        self.d_model = d_model
        self.d_ff = d_ff

        self.fc1 = tf.keras.layers.Dense(d_ff, activation='relu')
        self.fc2 = tf.keras.layers.Dense(d_model)

    def call(self, x):
        out = self.fc1(x)
        out = self.fc2(out)
            
        return out

In [23]:
# Encoder의 레이어 구현
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, n_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()

        self.enc_self_attn = MultiHeadAttention(d_model, n_heads)
        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.do = tf.keras.layers.Dropout(dropout)
        
    def call(self, x, mask):
        '''
        Multi-Head Attention
        '''
        residual = x
        out = self.norm_1(x)
        out, enc_attn = self.enc_self_attn(out, out, out, mask)
        out = self.do(out)
        out += residual
        
        '''
        Position-Wise Feed Forward Network
        '''
        residual = out
        out = self.norm_2(out)
        out = self.ffn(out)
        out = self.do(out)
        out += residual
        
        return out, enc_attn

In [24]:
# Decoder 레이어 구현
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()

        self.dec_self_attn = MultiHeadAttention(d_model, num_heads)
        self.enc_dec_attn = MultiHeadAttention(d_model, num_heads)

        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.do = tf.keras.layers.Dropout(dropout)
    
    def call(self, x, enc_out, dec_enc_mask, padding_mask):
        '''
        Masked Multi-Head Attention
        '''
        residual = x
        out = self.norm_1(x)
        out, dec_attn = self.dec_self_attn(out, out, out, padding_mask)
        out = self.do(out)
        out += residual

        '''
        Multi-Head Attention
        '''
        residual = out
        out = self.norm_2(out)
        # Q, K, V 순서에 주의하세요!
        out, dec_enc_attn = self.enc_dec_attn(Q=out, K=enc_out, V=enc_out, mask=dec_enc_mask)
        out = self.do(out)
        out += residual
        
        '''
        Position-Wise Feed Forward Network
        '''
        residual = out
        out = self.norm_3(out)
        out = self.ffn(out)
        out = self.do(out)
        out += residual

        return out, dec_attn, dec_enc_attn

In [25]:
# Encoder 구현
class Encoder(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    dropout):
        super(Encoder, self).__init__()
        self.n_layers = n_layers
        self.enc_layers = [EncoderLayer(d_model, n_heads, d_ff, dropout) 
                        for _ in range(n_layers)]
    
        self.do = tf.keras.layers.Dropout(dropout)
        
    def call(self, x, mask):
        out = x
    
        enc_attns = list()
        for i in range(self.n_layers):
            out, enc_attn = self.enc_layers[i](out, mask)
            enc_attns.append(enc_attn)
        
        return out, enc_attns

In [26]:
# Decoder 구현
class Decoder(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    dropout):
        super(Decoder, self).__init__()
        self.n_layers = n_layers
        self.dec_layers = [DecoderLayer(d_model, n_heads, d_ff, dropout) 
                            for _ in range(n_layers)]
                            
    def call(self, x, enc_out, dec_enc_mask, padding_mask):
        out = x
    
        dec_attns = list()
        dec_enc_attns = list()
        for i in range(self.n_layers):
            out, dec_attn, dec_enc_attn = \
            self.dec_layers[i](out, enc_out, dec_enc_mask, padding_mask)

            dec_attns.append(dec_attn)
            dec_enc_attns.append(dec_enc_attn)

        return out, dec_attns, dec_enc_attns

In [27]:
# 트랜스포머 전체 모델 조립

class Transformer(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    src_vocab_size,
                    tgt_vocab_size,
                    pos_len,
                    dropout=0.2,
                    shared_fc=True,
                    shared_emb=False):
        super(Transformer, self).__init__()
        
        self.d_model = tf.cast(d_model, tf.float32)

        if shared_emb:
            self.enc_emb = self.dec_emb = \
            tf.keras.layers.Embedding(src_vocab_size, d_model)
        else:
            self.enc_emb = tf.keras.layers.Embedding(src_vocab_size, d_model)
            self.dec_emb = tf.keras.layers.Embedding(tgt_vocab_size, d_model)

        self.pos_encoding = positional_encoding(pos_len, d_model)
        self.do = tf.keras.layers.Dropout(dropout)

        self.encoder = Encoder(n_layers, d_model, n_heads, d_ff, dropout)
        self.decoder = Decoder(n_layers, d_model, n_heads, d_ff, dropout)

        self.fc = tf.keras.layers.Dense(tgt_vocab_size)

        self.shared_fc = shared_fc

        if shared_fc:
            self.fc.set_weights(tf.transpose(self.dec_emb.weights))

    def embedding(self, emb, x):
        seq_len = x.shape[1]

        out = emb(x)

        if self.shared_fc: out *= tf.math.sqrt(self.d_model)

        out += self.pos_encoding[np.newaxis, ...][:, :seq_len, :]
        out = self.do(out)

        return out

        
    def call(self, enc_in, dec_in, enc_mask, dec_enc_mask, dec_mask):
        enc_in = self.embedding(self.enc_emb, enc_in)
        dec_in = self.embedding(self.dec_emb, dec_in)

        enc_out, enc_attns = self.encoder(enc_in, enc_mask)
        
        dec_out, dec_attns, dec_enc_attns = \
        self.decoder(dec_in, enc_out, dec_enc_mask, dec_mask)
        
        logits = self.fc(dec_out)
        
        return logits, enc_attns, dec_attns, dec_enc_attns

In [28]:
# 트랜스포머 하이퍼 파라미터 지정

transformer = Transformer(
    n_layers=2,
    d_model=512,
    n_heads=8,
    d_ff=2048,
    src_vocab_size=VOCAB_SIZE,
    tgt_vocab_size=VOCAB_SIZE,
    pos_len=200,
    dropout=0.3,
    shared_fc=True,
    shared_emb=True)
		
d_model = 512

In [29]:
# Learning Rate Scheduler 구현
class LearningRateScheduler(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(LearningRateScheduler, self).__init__()
        
        self.d_model = d_model
        self.warmup_steps = warmup_steps
    
    def __call__(self, step):
        arg1 = step ** -0.5
        arg2 = step * (self.warmup_steps ** -1.5)
        
        return (self.d_model ** -0.5) * tf.math.minimum(arg1, arg2)

In [30]:
# Learning Rate 인스턴스 선언 & Optimizer 구현
learning_rate = LearningRateScheduler(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate,
                                        beta_1=0.9,
                                        beta_2=0.98, 
                                        epsilon=1e-9)

In [31]:
# Loss Function 정의
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

In [32]:
# Train Step 정의
@tf.function()
def train_step(src, tgt, model, optimizer):
    tgt_in = tgt[:, :-1]  # Decoder의 input
    gold = tgt[:, 1:]     # Decoder의 output과 비교하기 위해 right shift를 통해 생성한 최종 타겟

    enc_mask, dec_enc_mask, dec_mask = generate_masks(src, tgt_in)

    with tf.GradientTape() as tape:
        predictions, enc_attns, dec_attns, dec_enc_attns = \
        model(src, tgt_in, enc_mask, dec_enc_mask, dec_mask)
        loss = loss_function(gold, predictions)

    gradients = tape.gradient(loss, model.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return loss, enc_attns, dec_attns, dec_enc_attns

In [33]:
# 훈련시키기

EPOCHS = 3

for epoch in range(EPOCHS):
    total_loss = 0
    
    dataset_count = tf.data.experimental.cardinality(train_dataset).numpy()
    tqdm_bar = tqdm(total=dataset_count)
    for step, (enc_batch, dec_batch) in enumerate(train_dataset):
        batch_loss, enc_attns, dec_attns, dec_enc_attns = \
        train_step(enc_batch,
                    dec_batch,
                    transformer,
                    optimizer)

        total_loss += batch_loss
        
        tqdm_bar.set_description_str('Epoch %2d' % (epoch + 1))
        tqdm_bar.set_postfix_str('Loss %.4f' % (total_loss.numpy() / (step + 1)))
        tqdm_bar.update()

  0%|          | 0/1850 [00:00<?, ?it/s]

  0%|          | 0/1850 [00:00<?, ?it/s]

  0%|          | 0/1850 [00:00<?, ?it/s]

# 12-4. 번역 성능 측정하기 (1) BLEU Score



1. NLTK를 활용한 BLEU Score

NLTK는 Natural Language Tool Kit 의 준말로 이름부터 자연어 처리에 큰 도움이 될 것 같은 라이브러리입니다.😃 nltk 가 BLEU Score를 지원하니 이를 활용하도록 합시다.

In [34]:
# BLEU socre

reference = "많 은 자연어 처리 연구자 들 이 트랜스포머 를 선호 한다".split()
candidate = "적 은 자연어 학 개발자 들 가 트랜스포머 을 선호 한다 요".split()

print("원문:", reference)
print("번역문:", candidate)
print("BLEU Score:", sentence_bleu([reference], candidate))

원문: ['많', '은', '자연어', '처리', '연구자', '들', '이', '트랜스포머', '를', '선호', '한다']
번역문: ['적', '은', '자연어', '학', '개발자', '들', '가', '트랜스포머', '을', '선호', '한다', '요']
BLEU Score: 8.190757052088229e-155


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU Socer는 0~1 사이의 값을 가지지만, 100을 곱한 백분율 값으로 표기하는 경우도 많다. BLEU Score의 점수대별 해석은 [여기](https://cloud.google.com/translate/automl/docs/evaluate?hl=ko#bleu)를 참고

BLEU Socre가 50점을 넘는다면 좋은 번역을 생성했다는 의미이다. 보통 논문에서 제시하는 BLEU Socre는 20점에서 높으면 40점을 바라본다. 지금 위에서 나온 점수는 0점이다. 이건 BLEU Score의 점수 측정 방식때문에 그런 것으로 추측된다.

BLEU Score의 정의를 다시 생각해보자

BLEU Score가 N-gram으로 점수를 측정한다는 것을 기억할 수 있다.

아래 수식을 보면 1-gram부터 4-gram까지의 점수(Precision)를 모두 곱한 후, 루트를 두 번 씌우면(1/4) BLEU Score가 된다. 만약 번역 결과가 정말 괜찮았다면 모든 N-gram에서 좋은 점수를 얻었을 것이다.

아래 코드에서 각 그램마다 점수가 몇 인지 확인해보자

![](https://miro.medium.com/max/828/1*0Zi8SI4CkOMd7avkk9BTTA.png)
<center></center>



In [35]:
print("1-gram:", sentence_bleu([reference], candidate, weights=[1, 0, 0, 0]))
print("2-gram:", sentence_bleu([reference], candidate, weights=[0, 1, 0, 0]))
print("3-gram:", sentence_bleu([reference], candidate, weights=[0, 0, 1, 0]))
print("4-gram:", sentence_bleu([reference], candidate, weights=[0, 0, 0, 1]))

1-gram: 0.5
2-gram: 0.18181818181818182
3-gram: 2.2250738585072626e-308
4-gram: 2.2250738585072626e-308


3, 4gram에 점수를보면 왜 위에서 점수가 0점에 가까웠는지 알 수 있다. 하지만 만약 nltk의 낮은 버전을 사용할 경우, 간혹 이런 경우에 3-gram, 4-gram 점수가 1이 나와서, 전체적인 BLEU 점수가 50점 이상으로 매우 높게 나오게 될 수도 있습니다.

예전 버전에서는 위 수식에서 어떤 N-gram이 0의 값을 갖는다면 그 하위 N-gram 점수들이 곱했을 때 모두 소멸해버리기 때문에 일치하는 N-gram이 없더라도 점수를 1.0 으로 유지하여 하위 점수를 보존하게끔 구현되어 있었습니다. 하지만 1.0 은 모든 번역을 완벽히 재현했음을 의미하기 때문에 총점이 의도치 않게 높아질 수 있디. 그럴 경우에는 BLEU Score가 바람직하지 못할 것(Undesirable) 이라는 경고문이 추가되긴 합니다.

2. SmoothingFunction()으로 BLEU Score 보정하기

그래서 BLEU 계산시 특정 N-gram이 0점이 나와서 BLEU가 너무 커지거나 작아지는 쪽으로 왜곡되는 문제를 보완하기 위해 SmoothingFunction() 을 사용하고 있습니다.

Smoothing 함수는 모든 Precision에 아주 작은 epsilon 값을 더해주는 역할을 하는데, 이로써 0점이 부여된 Precision도 완전한 0이 되지 않으니 점수를 1.0 으로 대체할 필요가 없어지죠. 즉 우리의 의도대로 점수가 계산되는 거예요.

진실된 BLEU Score를 확인하기 위해 어서 SmoothingFunction() 을 적용해 봅시다! 아래 코드에서는 SmoothingFunction().method1을 사용해 보겠습니다. 자신만의 Smoothing 함수를 구현해서 적용할 수도 있겠지만, nltk에서는 method0부터 method7까지를 이미 제공하고 있습니다.

- (참고) 각 method들의 상세한 설명은 [nltk의 bleu_score 소스코드](https://www.nltk.org/_modules/nltk/translate/bleu_score.html)를 참고해 봅시다. sentence_bleu() 함수에 smoothing_function=None을 적용하면 method0가 기본 적용됨을 알 수 있습니다.

In [36]:
# SmoothingFunction을 사용한 BLEU score

def calculate_bleu(reference, candidate, weights = [0.25,0.25,0.25,0.25]):
  return sentence_bleu([reference],
                       candidate, weights =weights,
                       smoothing_function = SmoothingFunction().method1) # smoothing_function 적용

print("BLEU-1:", calculate_bleu(reference, candidate, weights=[1, 0, 0, 0]))
print("BLEU-2:", calculate_bleu(reference, candidate, weights=[0, 1, 0, 0]))
print("BLEU-3:", calculate_bleu(reference, candidate, weights=[0, 0, 1, 0]))
print("BLEU-4:", calculate_bleu(reference, candidate, weights=[0, 0, 0, 1]))

print("\nBLEU-Total:", calculate_bleu(reference, candidate))         

BLEU-1: 0.5
BLEU-2: 0.18181818181818182
BLEU-3: 0.010000000000000004
BLEU-4: 0.011111111111111112

BLEU-Total: 0.05637560315259291


아까와 비교했을때 5점이나 더 상승한 모습을 볼 수 있다.

여기서 BLEU-4가 BLEU-3보다 조금이나마 점수가 높은 이유는 한 문장에서 발생하는 3-gram 쌍의 개수와 4-gram 쌍의 개수를 생각해 보면 이해할 수 있습니다. 각 Precision을 N-gram 개수로 나누는 부분에서 차이가 발생하는 것이죠.

3. 트랜스포머 모델의 번역 성능 알아보기

아까 제외시켜둔 test 데이터셋을 사용하여 트랜스포머의 번역 성능을 알아보도록하자.

In [37]:
# translate 함수 구현

def translate(tokens, model, src_tokenizer, tgt_tokenizer):
  padded_tokens = tf.keras.preprocessing.sequence.pad_sequences([tokens],
                                                                maxlen=MAX_LEN,
                                                                padding = 'post')
  ids = []
  output = tf.expand_dims([tgt_tokenizer.bos_id()], 0)
  for i in range(MAX_LEN):
    enc_padding_mask, combined_mask, dec_padding_mask = \
    generate_masks(padded_tokens, output)

    predictions,_,_,_ = model(padded_tokens, output, enc_padding_mask,
                              combined_mask, dec_padding_mask)
    
    predicted_id = tf.argmax(tf.math.softmax(predictions, axis = -1)[0, -1]).numpy().item()

    if tgt_tokenizer.eos_id() == predicted_id:
      result = tgt_tokenizer.decode_ids(ids)
      return result

    ids.append(predicted_id)
    output = tf.concat([output, tf.expand_dims([predicted_id], 0)], axis = -1)

  result = tgt_tokenizer.decode_ids(ids)
  return result

In [38]:
# eval_bleu_single 구현

def eval_bleu_single(model, src_sentence, tgt_sentence, src_tokenizer, tgt_tokenizer, verbose=True):
  src_tokens = src_tokenizer.encode_as_ids(src_sentence)
  tgt_tokens = tgt_tokenizer.encode_as_ids(tgt_sentence)

  if (len(src_tokens) > MAX_LEN) : return None
  if (len(tgt_tokens) > MAX_LEN) : return None

  reference = tgt_sentence.split()
  candidate = translate(src_tokens, model, src_tokenizer, tgt_tokenizer).split()

  score = sentence_bleu([reference], candidate,
                        smoothing_function=SmoothingFunction().method1)
  
  if verbose:
    print("Source Sentence: ", src_sentence)
    print("Model Prediction: ", candidate)
    print("Real: ", reference)
    print(f'Score: {score:.2f}')

  return score

In [39]:
# 테스트해보기

test_idx = range(5)

for num in test_idx:
  eval_bleu_single(transformer, 
                 test_eng_sentences[num], 
                 test_spa_sentences[num], 
                 tokenizer, 
                 tokenizer)

Source Sentence:  he never fails to call his mother on her birthday.
Model Prediction:  ['él', 'nunca', 'se', 'fracasara', 'a', 'su', 'madre', 'en', 'su', 'cumpleaños', 'de', 'cumpleaños', 'de', 'cumpleaños', 'de', 'cumpleaños', 'de', 'cumpleaños', 'de', 'cumpleaños', 'de', 'cumpleaños', 'regaló', 'su', 'cumpleaños', 'cumpleaños', 'cumpleaños', 'cumpleaños', 'cumpleaños', 'cumpleaños', 'cumpleaños', 'cumpleaños', 'cumpleaños', 'cumpleaños', 'cumpleaños', 'su', 'cumpleaños', 'regaló', 'su', 'cumpleaños', 'cumpleaños', 'cumpleaños', 'cumpleaños', 'regaló', 'su', 'cumpleaños', 'cumpleaños', 'cumpleaños', 'cumpleaños']
Real:  ['él', 'siempre', 'llama', 'a', 'su', 'madre', 'el', 'día', 'de', 'su', 'cumpleaños.']
Score: 0.02
Source Sentence:  there was fear in her eyes.
Model Prediction:  ['en', 'los', 'ojos', 'le', 'dan', 'ojos', 'en', 'los', 'ojos', 'de', 'los', 'ojos', 'en', 'los', 'ojos', 'en', 'los', 'ojos', 'de', 'ellas', 'estaban', 'temor', 'estaban', 'temors', 'en', 'los', 'ojos', 'e

In [40]:
# 전체 문장을 평가하는 eval_bleu 구현
def eval_bleu(model, src_sentences, tgt_sentence, src_tokenizer, tgt_tokenizer, verbose=True):
  total_score = 0.0
  sample_size = len(src_sentences)

  for idx in tqdm(range(sample_size)):
    score = eval_bleu_single(model, src_sentences[idx], tgt_sentence[idx],
                             src_tokenizer, tgt_tokenizer, verbose)
    if not score: continue

    total_score += score

  print('Num of Sample:', sample_size)
  print('Total Score : ', total_score / sample_size)

In [41]:
eval_bleu(transformer, test_eng_sentences, test_spa_sentences, tokenizer, tokenizer, verbose=False)

  0%|          | 0/594 [00:00<?, ?it/s]

Num of Sample: 594
Total Score :  0.08351059642317914


# 12-5. 번역 성능 측정하기 (2) Beam Search Decoder

 Greedy Decoding 대신 새로운 기법을 사용하여 모델을 더 잘 평가할 수 있을것이다.

In [42]:
# Bean Sarch 코드

def beam_search_decoder(prob, beam_size):
    sequences = [[[], 1.0]]  # 생성된 문장과 점수를 저장

    for tok in prob:
        all_candidates = []

        for seq, score in sequences:
            for idx, p in enumerate(tok): # 각 단어의 확률을 총점에 누적 곱
                candidate = [seq + [idx], score * -math.log(-(p-1))]
                all_candidates.append(candidate)

        ordered = sorted(all_candidates,
                         key=lambda tup:tup[1],
                         reverse=True) # 총점 순 정렬
        sequences = ordered[:beam_size] # Beam Size에 해당하는 문장만 저장 

    return sequences

In [43]:
vocab = {
    0: "<pad>",
    1: "까요?",
    2: "커피",
    3: "마셔",
    4: "가져",
    5: "될",
    6: "를",
    7: "한",
    8: "잔",
    9: "도",
}

prob_seq = [[0.01, 0.01, 0.60, 0.32, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01],
            [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.75, 0.01, 0.01, 0.17],
            [0.01, 0.01, 0.01, 0.35, 0.48, 0.10, 0.01, 0.01, 0.01, 0.01],
            [0.24, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.68],
            [0.01, 0.01, 0.12, 0.01, 0.01, 0.80, 0.01, 0.01, 0.01, 0.01],
            [0.01, 0.81, 0.01, 0.01, 0.01, 0.01, 0.11, 0.01, 0.01, 0.01],
            [0.70, 0.22, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01],
            [0.91, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01],
            [0.91, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01],
            [0.91, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01]]

prob_seq = np.array(prob_seq)
beam_size = 3

result = beam_search_decoder(prob_seq, beam_size)

for seq, score in result:
    sentence = ""

    for word in seq:
        sentence += vocab[word] + " "

    print(sentence, "// Score: %.4f" % score)

커피 를 가져 도 될 까요? <pad> <pad> <pad> <pad>  // Score: 42.5243
커피 를 마셔 도 될 까요? <pad> <pad> <pad> <pad>  // Score: 28.0135
마셔 를 가져 도 될 까요? <pad> <pad> <pad> <pad>  // Score: 17.8983


사실 이 예시는 Beam Search를 설명하는 데에는 더 없이 적당하지만 실제로 모델이 문장을 생성하는 과정과는 거리가 멀다. 당장 모델이 문장을 생성하는 과정만 떠올려도 위의 prob_seq처럼 확률을 정의할 수 없다. 각 단어에 대한 확률은 prob_seq처럼 한 번에 정의가 되지 않고 이전 스텝까지의 단어에 따라서 결정되기 때문이다.

간단한 예시로, Beam Size가 2이고 Time-Step이 2인 순간의 두 문장이 나는 밥을, 나는 커피를 이라고 한다면 세 번째 단어로 먹는다, 마신다 를 고려할 수 있다. 이때, 전자에서 마신다에 할당하는 확률과 후자에서 마신다 에 할당하는 확률과 후자에서 마신다 에 할당하는 확률은 각각 이전 단어들인 나는 밥을, 나는 커피를 에 따라서 결정되기 때문에 서로 독립적인 확률을 갖는다. 예컨대 후자가 마신다 에 더 높을 확률을 할당할 것을 할 수 있다. 위 소스에서 처럼 3번째 단어는 항상 [마신디:0.3, 먹는다:0.5..]의 확률을 가진다고 할 수 없다.

따라서 Beam Search를 생성 기법으로 구현할 때에는 분기를 잘 나눠줘야 합니다. Beam Size가 5라고 가정하면 맨 첫 단어로 적합한 5개의 단어를 생성하고, 두 번째 단어로 각 첫 단어(5개 단어)에 대해 5순위까지 확률을 구하여 총 25개의 문장을 생성하죠. 그 25개의 문장들은 각 단어에 할당된 확률을 곱하여 구한 점수(존재 확률) 를 가지고 있으니 각각의 순위를 매길 수 있겠다 점수 상위 5개의 표본만 살아남아 세 번째 단어를 구할 자격을 얻게 됩니다.

위 과정을 반복하면 최종적으로 점수가 가장 높은 5개의 문장을 얻게 됩니다. 물론 Beam Size를 조절해 주면 그 수는 유동적으로 변할 수 있다.

1. Beam Search Decoder 작성 및 평가하기

각 단어의 확률값을 계산하는 calc_prob()와 Beam Search를 기반으로 동작하는 beam_search_decoder() 를 구현하고 생성된 문장에 대해 BLEU Score를 출력하는 beam_bleu() 를 구현하세요!

편의에 따라서 두 기능을 하나의 함수에 구현해도 좋습니다!


In [44]:
# calc_prob() 구현
def calc_prob(src_ids, tgt_ids, model):
  enc_padding_mask, combined_mask, dec_padding_mask = generate_masks(src_ids, tgt_ids)

  predictions, enc_attns, dec_attns, dec_enc_attns = \
  model(src_ids, tgt_ids, enc_padding_mask, combined_mask, dec_padding_mask)

  return tf.math.softmax(predictions, axis = -1)

In [1]:
# beam_search_decoder() 구현

def beam_search_decoder(sentence, src_len, tgt_len, model, src_tokenizer, tgt_tokenizer, beam_size):
  tokens = src_tokenizer.encode_as_ids(sentence)

  src_in = tf.keras.preprocessing.sequence.pad_sequences([tokens],
                                                         maxlen=src_len,
                                                         padding='post')
  pred_cache = np.zeros((beam_size * beam_size, tgt_len), dtype = np.int64)
  pred_tmp = np.zeros((beam_size, tgt_len), dtype = np.int64)

  eos_flag = np.zeros((beam_size,), dtype = np.int64)
  scores = np.ones((beam_size,))

  pred_tmp[:, 0] = tgt_tokenizer.bos_id()

  dec_in = tf.expand_dims(pred_tmp[0, :1], 0)
  prob = calc_prob(src_in, dec_in, model)[0, -1].numpy()

  for seq_pos in range(1, tgt_len):
    score_cache = np.ones((beam_size * beam_size, ))

    # init
    for branch_idx in range(beam_size):
      cache_pos = branch_idx * beam_size

      score_cache[cache_pos:cache_pos+beam_size] = scores[branch_idx]
      pred_cache[cache_pos:cache_pos+beam_size, :seq_pos] = \
      pred_tmp[branch_idx, :seq_pos]

    for branch_idx in range(beam_size):
      cache_pos = branch_idx * beam_size

      if seq_pos != 1: # 모든 Branch를 로 시작하는 경우를 방지
        dec_in = pred_cache[branch_idx, :seq_pos]
        dec_in = tf.expand_dims(dec_in, 0)

        prob = calc_prob(src_in, dec_in, model)

      for beam_idx in range(beam_size):
        max_idx = np.argmax(prob)

        score_cache[cache_pos+beam_idx] *= prob[max_idx]
        pred_cache[cache_pos+beam_idx, seq_pos] = max_idx

        prob[max_idx] = -1

    for beam_idx in range(beam_size):
      if eos_flag[beam_idx] == -1: continue

      max_idx = np.argmax(score_cache)
      prediction = pred_cache[max_idx, :seq_pos + 1]

      pred_tmp[beam_idx, :seq_pos + 1] = prediction
      scores[beam_idx] = score_cache[max_idx]
      score_cache[max_idx] = -1

      if prediction[-1] == tgt_tokenizer.eos_id():
        eos_flag[beam_idx] = -1

  pred = []
  for long_pred in pred_tmp:
    zero_idx = long_pred.tolist().index(tgt_tokenizer.eos_id())
    short_pred = long_pred[:zero_idx + 1]
    pred.append(short_pred)
  
  return pred

In [2]:
def calculate_bleu(reference, candidate, weights=[0.25, 0.25, 0.25, 0.25]):
    return sentence_bleu([reference],
                            candidate,
                            weights=weights,
                            smoothing_function=SmoothingFunction().method1)

In [3]:
# beam_bleu() 구현
def beam_bleu(reference, ids, tokenizer):
    reference = reference.split()

    total_score = 0.0
    for _id in ids:
        candidate = tokenizer.decode_ids(_id.tolist()).split()
        score = calculate_bleu(reference, candidate)

        print("Reference:", reference)
        print("Candidate:", candidate)
        print("BLEU:", calculate_bleu(reference, candidate))

        total_score += score

In [None]:
# 인덱스를 바꿔가며 확인해 보세요
test_idx = 1

ids = \
beam_search_decoder(test_eng_sentences[test_idx],
                    MAX_LEN,
                    MAX_LEN,
                    transformer,
                    tokenizer,
                    tokenizer,
                    beam_size=5)

bleu = beam_bleu(test_spa_sentences[test_idx], ids, tokenizer)
print(bleu)

# 12-6. 데이터 부풀리기

이번 스텝에서는 Data Augmentation, 그중에서도 Embedding을 활용한 Lexical Substitution을 구현해 볼 거예요. gensim 라이브러리를 활용하면 어렵지 않게 해낼 수 있습니다.

gensim 에 사전 훈련된 Embedding 모델을 불러오는 것은 두 가지 방법이 있습니다.

1. 직접 모델을 다운로드해 load 하는 방법

2. gensim 이 자체적으로 지원하는 downloader 를 활용해 모델을 load 하는 방법

한국어는 gensim 에서 지원하지 않으므로 두 번째 방법을 사용할 수 없지만, 영어라면 얘기가 달라지죠! 아래 웹페이지의 Available data → Model 부분에서 공개된 모델의 종류를 확인할 수 있습니다.

[RaRe-Technologies/gensim-data](https://github.com/RaRe-Technologies/gensim-data)

대표적으로 사용되는 Embedding 모델은 word2vec-google-news-300 이지만 용량이 커서 다운로드에 많은 시간이 소요되므로 이번 실습엔 적합하지 않습니다. 우리는 적당한 사이즈의 모델인 glove-wiki-gigaword-300 을 사용할게요! 아래 소스를 실행해 사전 훈련된 Embedding 모델을 다운로드해 주세요.


In [5]:
import gensim.downloader as api

wv = api.load('glove-wiki-gigaword-300')



불러온 모델은 아래와 같이 활용할 수 있습니다.

In [6]:
wv.most_similar("banana")

[('bananas', 0.6691170930862427),
 ('mango', 0.580410361289978),
 ('pineapple', 0.5492371916770935),
 ('coconut', 0.5462779402732849),
 ('papaya', 0.541056752204895),
 ('fruit', 0.5218108296394348),
 ('growers', 0.4877638816833496),
 ('nut', 0.4839959144592285),
 ('peanut', 0.48062020540237427),
 ('potato', 0.4806118607521057)]

주어진 데이터를 토큰 단위로 분리한 후, 랜덤하게 하나를 선정하여 해당 토큰과 가장 유사한 단어를 찾아 대치하면 그것으로 Lexical Substitution은 완성되겠죠? 가볍게 확인해 봅시다!



In [11]:
sample_sentence = 'you know ? all you need is attention .'
sample_tokens = sample_sentence.split()

selected_tok = random.choice(sample_tokens)

result = ''

for tok in sample_tokens:
  if tok is selected_tok:
    result += wv.most_similar(tok)[0][0] + ' '
  else:
    result += tok + ' '

print("From:", sample_sentence)
print("To:", result)

From: you know ? all you need is attention .
To: you think ? all you need is attention . 


1. Lexical Substitution 구현하기
- 입력된 문장을 Embedding 유사도를 기반으로 Augmentation 하여 반환하는 lexical_sub() 를 구현하세요!

In [12]:
# Lexical Substitution 구현

def lexical_sub(sentence, word2vec):
  res = ''
  toks = sentence.split()

  try:
    _from = random.choice(toks)
    _to = word2vec.most_similar(_from)[0][0]

  except: # 단어장에 없는 단어
    return None
  
  for tok in toks:
    if tok is _from: res += _to + ' '
    else: res += tok + ' '
    
  return res

In [13]:
new_corpus = []

for old_src in tqdm(test_eng_sentences):
  new_src = lexical_sub(old_src, wv)
  if new_src is not None:
    new_corpus.append(new_src)
  # Augmentation이 없더라도 원본 문장을 포함시킵니다
  new_corpus.append(old_src)

print(new_corpus[:10])

NameError: ignored