# 12. Seq2seq으로 번역기 만들기 [프로젝트]
프로젝트: 한영 번역기 만들기

## 0. 준비

In [1]:
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import re
from konlpy.tag import Mecab

import warnings

warnings.filterwarnings(action='ignore') 

In [2]:
with open('./data/korean-english-park.train.ko', 'r') as f:
    train_ko_data = f.read().splitlines()

with open('./data/korean-english-park.train.en', 'r') as f:
    train_en_data = f.read().splitlines()

In [3]:
print('train_ko_data : ', len(train_ko_data))
print('train_en_data : ', len(train_en_data))

train_ko_data :  94123
train_en_data :  94123


In [4]:
for i in range(0,500,100):
    print('>> ', train_ko_data[i])
    print('>> ', train_en_data[i], '\n')

>>  개인용 컴퓨터 사용의 상당 부분은 "이것보다 뛰어날 수 있느냐?"
>>  Much of personal computing is about "can you top this?" 

>>  제 23차 연례 컴덱스 박람회의 개회사를 한 케이츠는 2년여전 기술 산업의 거품이 붕괴된 이후에 첨단 기술에 대해 부정적인 인식이 있다고 말했다.
>>  Gates, who opened the 23rd annual Comdex trade show, said there was a negative perception of high tech following the collapse of the tech bubble about two years ago. 

>>  국제 원자력 기구는 북한이 핵 무기 개발 계획을 중지하고, 즉각적으로 "모든 관련 시설들"을 공개하여 사찰을 받으라고 요구했다.
>>  The International Atomic Energy Agency called on North Korea to end any nuclear weapon program and open "all relevant facilities" to inspections immediately. 

>>  일본에서 124명의 사망자를 낸 폐암 치료제
>>  Microsoft had "leveraged its PC monopoly in which it is unfairly advantaged." 

>>  그러나 미국은 개개의 시험과 성인의 식자율(識字率)에서 하위에 머물렀다.
>>  The United States, however, finished low in each test and in adult literacy. 



## Step 2. 데이터 정제

In [5]:
# set을 이용한 중복 제거
cleaned_corpus = list(set(zip(train_ko_data, train_en_data)))

In [6]:
len(cleaned_corpus)

78968

In [7]:
for i in range(0,500,100):
    print('>> ', cleaned_corpus[i][0])
    print('>> ', cleaned_corpus[i][1], '\n')

>>  회장 밖에서는, 경찰들이 G8에 반대하는 사람들을 해산시키기 위해 물대포를 사용했다.
>>  Outside, police have used water cannon to try to break up anti-G8 protesters. 

>>  그는 '어글리 베티'에서 어설픈 베티 수아레즈의 역할로 올해 골든글로브와 미국배우조합상을 수상했다.
>>  She won Golden Globe and Screen Actors Guild awards this year for her role as awkward assistant Betty Suarez on "Ugly Betty," and has appeared on a host of magazine covers. 

>>  그는 “이에 대해 생각할 시간 조차 없었다”며 “몸을 굽혀 날렵하게 피했다”고 말했다.
>>  "I didn't have much time to reflect on anything, I was ducking and dodging," Bush said. 

>>  탑승객 165명 중에 인명 피해는 없었는데, 일부 승객은 여객기가 거대한 불덩이가 되어 폭발하기 직전 탈출에 성공했다.
>>  All 165 people on board survived, with some escaping just seconds before the plane exploded into a huge fireball. 

>>  펜실베니아주 환경보호국 대변인은 당국이 페트롤리아 인근을 조사해 유독한 화학물질이 남은 흔적이 없다는 판단을 내렸다고 밝혔다.
>>  Authorities surveyed the neighborhood in Petrolia and determined that no traces of the toxic chemical remained, said Freda Tarbell, spokeswoman for the Pennsylvania Department of Environmental Protection. 



In [8]:
# 한국어 전처리
# 정규식 변경, mecab 사용
def preprocess_sentence_ko(sentence):
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^가-힣?.!,]+", " ", sentence)

    sentence = sentence.strip()
    
    mecab = Mecab()
    sentence = mecab.morphs(sentence)

    return sentence

In [9]:
# 영어 전처리
def preprocess_sentence_en(sentence, s_token=False, e_token=False):
    sentence = sentence.lower().strip()

    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^a-zA-Z?.!,]+", " ", sentence)

    sentence = sentence.strip()

    if s_token:
        sentence = '<start> ' + sentence

    if e_token:
        sentence += ' <end>'
    
    sentence = sentence.split()
    
    return sentence

In [10]:
# 토큰 길이가 40 이하인 데이터 선별
kor_corpus = []
eng_corpus = []

for x in cleaned_corpus:
    ko = preprocess_sentence_ko(x[0])
    en = preprocess_sentence_en(x[1], s_token=True, e_token=True)
    
    if len(ko) <= 40 and len(en) <= 40:
        kor_corpus.append(ko)
        eng_corpus.append(en)

In [11]:
print(len(kor_corpus), len(eng_corpus))

for i in range(0,500,100):
    print('>> ', kor_corpus[i])
    print('>> ', eng_corpus[i], '\n')

63127 63127
>>  ['회장', '밖', '에서', '는', ',', '경찰', '들', '이', '에', '반대', '하', '는', '사람', '들', '을', '해산', '시키', '기', '위해', '물대포', '를', '사용', '했', '다', '.']
>>  ['<start>', 'outside', ',', 'police', 'have', 'used', 'water', 'cannon', 'to', 'try', 'to', 'break', 'up', 'anti', 'g', 'protesters', '.', '<end>'] 

>>  ['한편', '소치', '에서', '는', '콘서트', '와', '개최지', '발표', '실황', '중계', '를', '보', '기', '위해', '모인', '만', '명', '은', '일제히', '환호', '했', '다', '.']
>>  ['<start>', 'in', 'sochi', ',', 'cheers', 'erupted', 'from', 'the', 'crowd', 'of', 'more', 'than', ',', 'that', 'had', 'gathered', 'for', 'a', 'pop', 'concert', 'and', 'the', 'announcement', 'in', 'a', 'main', 'square', '.', '<end>'] 

>>  ['결혼식', '날', '에', '는', '둘', '사이', '에', '웃음', '도', '없', '어', '지', '고', '관계', '가', '멀', '어', '졌', '다', '.']
>>  ['<start>', 'on', 'the', 'big', 'day', ',', 'smiles', 'were', 'few', 'and', 'far', 'between', '.', '<end>'] 

>>  ['올리', '펜', '트', '회장', '은', '감독', '이', '가족', '딜레마', '를', '안', '고', '있', '다', '며', '부인', '의

## Step 3. 데이터 토큰화

In [12]:
# 단어수 15000으로 설정, 텐서 변환
def tokenize(corpus):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=15000, filters='', oov_token="<unk>")
    tokenizer.fit_on_texts(corpus)
    
    tensor = tokenizer.texts_to_sequences(corpus)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    
    return tensor, tokenizer

In [13]:
enc_tensor, enc_tokenizer = tokenize(kor_corpus)
dec_tensor, dec_tokenizer = tokenize(eng_corpus)

In [14]:
len(enc_tokenizer.index_word), len(enc_tokenizer.index_word)

(37141, 37141)

In [15]:
enc_tensor[0]

array([ 667,  524,   17,    5,   18,   70,   16,    4,    9,  339,   12,
          5,   77,   16,    6, 2103,  244,   34,   65, 8592,   10,  177,
         11,    3,    2,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0], dtype=int32)

In [16]:
dec_tensor[0]

array([   4,  466,    6,   64,   27,  259,  411, 9961,    7,  852,    7,
        989,   57,  574, 2704,  624,    3,    5,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0], dtype=int32)

## Step 4. 모델 설계

In [17]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.w_dec = tf.keras.layers.Dense(units)
        self.w_enc = tf.keras.layers.Dense(units)
        self.w_com = tf.keras.layers.Dense(1)
    
    def call(self, h_enc, h_dec):
        # h_enc shape: [batch x length x units]
        # h_dec shape: [batch x units]

        h_enc = self.w_enc(h_enc)
        h_dec = tf.expand_dims(h_dec, 1)
        h_dec = self.w_dec(h_dec)

        score = self.w_com(tf.nn.tanh(h_dec + h_enc))
        
        attn = tf.nn.softmax(score, axis=1)

        context_vec = attn * h_enc
        context_vec = tf.reduce_sum(context_vec, axis=1)

        return context_vec, attn

In [18]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units):
        super(Encoder, self).__init__()

        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(enc_units, return_sequences=True)

    def call(self, x):
        out = self.embedding(x)
        out = self.gru(out)

        return out

In [19]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units):
        super(Decoder, self).__init__()
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(dec_units,
                                       return_sequences=True,
                                       return_state=True)
        self.fc = tf.keras.layers.Dense(vocab_size)

        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, h_dec, enc_out):
        context_vec, attn = self.attention(enc_out, h_dec)

        out = self.embedding(x)
        out = tf.concat([tf.expand_dims(context_vec, 1), out], axis=-1)

        out, h_dec = self.gru(out)
        out = tf.reshape(out, (-1, out.shape[2]))
        out = self.fc(out)

        return out, h_dec, attn

In [20]:
BATCH_SIZE     = 32
SRC_VOCAB_SIZE = len(enc_tokenizer.index_word) + 1
TGT_VOCAB_SIZE = len(dec_tokenizer.index_word) + 1

units         = 1024
embedding_dim = 512

encoder = Encoder(SRC_VOCAB_SIZE, embedding_dim, units)
decoder = Decoder(TGT_VOCAB_SIZE, embedding_dim, units)

# sample input
sequence_len = 30

sample_enc = tf.random.uniform((BATCH_SIZE, sequence_len))
sample_output = encoder(sample_enc)

print ('Encoder Output:', sample_output.shape)

sample_state = tf.random.uniform((BATCH_SIZE, units))

sample_logits, h_dec, attn = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                     sample_state, sample_output)

print ('Decoder Output:', sample_logits.shape)
print ('Decoder Hidden State:', h_dec.shape)
print ('Attention:', attn.shape)

Encoder Output: (32, 30, 1024)
Decoder Output: (32, 38525)
Decoder Hidden State: (32, 1024)
Attention: (32, 30, 1)


In [21]:
optimizer = tf.keras.optimizers.legacy.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss = loss_object(real, pred)
    
    mask = tf.cast(mask, dtype=loss.dtype)  # 패딩을 위한 토큰
    loss *= mask
    
    return tf.reduce_mean(loss)

In [22]:
@tf.function 
def train_step(src, tgt, encoder, decoder, optimizer, dec_tok):
    bsz = src.shape[0]
    loss = 0

    # 학습하며 발생한 모든 연산을 기록하는 테이프로
    # 모델이 각 스텝의 최종 단계에서 미분값을 구하는 데에 사용됨
    with tf.GradientTape() as tape:
        enc_out = encoder(src)  # encoder에 소스문장을 전달하여 컨텍스트 벡터인 enc_out을 생성
        h_dec = enc_out[:, -1]  # encoder의 마지막을 가져와(encoder의 final state) decoder의 hidden state로 정의 (t=0일때)
        
        dec_src = tf.expand_dims([dec_tok.word_index['<start>']] * bsz, 1) # Decoder에 입력으로 전달할 <start> 토큰 문장 생성

        for t in range(1, tgt.shape[1]):
            # pred : <start> 문장과 enc_out, Hidden State를 기반으로 다음 단어(t=1)를 예측
            pred, h_dec, _ = decoder(dec_src, h_dec, enc_out)

            # 예측된 단어와 정답 간의 Loss를 구한 후, t=1의 정답 단어를 다음 입력으로 사용 (예측 단어 X) -> 그 후 반복!
            loss += loss_function(tgt[:, t], pred)
            dec_src = tf.expand_dims(tgt[:, t], 1)
        
    batch_loss = (loss / int(tgt.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    
    return batch_loss

## Step 5. 훈련하기

In [23]:
from tqdm import tqdm    # tqdm
import random

EPOCHS = 15
BATCH_SIZE = 16

for epoch in range(EPOCHS):
    total_loss = 0

    idx_list = list(range(0, enc_tensor.shape[0], BATCH_SIZE))
    random.shuffle(idx_list)
    t = tqdm(idx_list)

    for (batch, idx) in enumerate(t):
        batch_loss = train_step(enc_tensor[idx:idx+BATCH_SIZE],
                                dec_tensor[idx:idx+BATCH_SIZE],
                                encoder,
                                decoder,
                                optimizer,
                                dec_tokenizer)

        total_loss += batch_loss

        t.set_description_str('Epoch %2d' % (epoch + 1))
        t.set_postfix_str('Loss %.4f' % (total_loss.numpy() / (batch + 1)))

  0%|          | 0/3946 [00:00<?, ?it/s]

2023-08-24 11:34:36.783749: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-08-24 11:34:36.784627: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-08-24 11:34:36.785369: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

KeyboardInterrupt: 