# Machine Translation (기계번역)

- 챗봇과 비교했을 때, 기계번역은 embedding layer와 사전(vocabulary)을 한글용과 영어용으로 각각 따로 만든다는 점이 다르다.

In [80]:
!pip install sentencepiece



In [81]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import sentencepiece as spm
import re
import pickle
from nltk.translate.bleu_score import sentence_bleu

In [82]:
%cd '/content/drive/My Drive/Colab Notebooks'

/content/drive/My Drive/Colab Notebooks


In [83]:
# Load the parallel corpus and split it into source (Korean) and
# target (English) sentence lists.
df = pd.read_csv('data/machine_trans.csv')
question = df['source'].tolist()
answer = df['target'].tolist()
df.head()

Unnamed: 0,source,target
0,12시 땡,12 o'clock
1,1지망 학교 떨어졌어,I fell on the one.
2,3박4일 놀러가고 싶다,I want to go to 4 nights and 4 days.
3,3박4일 정도 놀러가고 싶다,I want to go to three nights and four days.
4,PPL 심하네,PPL serious


In [84]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23646 entries, 0 to 23645
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   source  23646 non-null  object
 1   target  23646 non-null  object
dtypes: object(2)
memory usage: 369.6+ KB


In [85]:
# Mean length of the source sentences, measured in characters (len of each string).
df['source'].apply(len).mean()

13.257760297724774

In [86]:
# Mean length of the target sentences, measured in characters (len of each string).
df['target'].apply(len).mean()

26.232005413177703

In [87]:
# Remove punctuation characters from every source and target sentence.
filters = "([~.,!?\"':;)(])"
question, answer = (
    [re.sub(filters, "", text) for text in corpus]
    for corpus in (question, answer)
)

In [88]:
print(question[:5])
print(answer[:5])

['12시 땡', '1지망 학교 떨어졌어', '3박4일 놀러가고 싶다', '3박4일 정도 놀러가고 싶다', 'PPL 심하네']
['12 oclock', 'I fell on the one', 'I want to go to 4 nights and 4 days', 'I want to go to three nights and four days', 'PPL serious']


In [89]:
len(question), len(answer)

(23646, 23646)

In [90]:
# Write each cleaned corpus to a text file, one sentence per line; these
# files are passed to the SentencePiece trainer through --input below.
ko_data_file = "data/machine_trans_ko.txt"
en_data_file = "data/machine_trans_en.txt"
for corpus_path, sentences in ((ko_data_file, question), (en_data_file, answer)):
    with open(corpus_path, 'w', encoding='utf-8') as f:
        f.writelines(sent + '\n' for sent in sentences)

# Trainer argument template: fixes the ids of the four special pieces and
# leaves the input file, model prefix and vocabulary size as placeholders.
templates = "--input={} \
            --pad_id=0 --pad_piece=<PAD>\
            --unk_id=1 --unk_piece=<UNK>\
            --bos_id=2 --bos_piece=<BOS>\
            --eos_id=3 --eos_piece=<EOS>\
            --model_prefix={}\
            --vocab_size={}"

KO_VOCAB_SIZE = 4000
EN_VOCAB_SIZE = 4000
ko_model_prefix = "data/machine_trans_ko_model"
en_model_prefix = "data/machine_trans_en_model"
ko_params = templates.format(ko_data_file, ko_model_prefix, KO_VOCAB_SIZE)
en_params = templates.format(en_data_file, en_model_prefix, EN_VOCAB_SIZE)

# Korean (source) tokenizer: train, load, then read the generated .vocab
# file, whose lines are "<piece>\t<score>".
spm.SentencePieceTrainer.Train(ko_params)
ko_sp = spm.SentencePieceProcessor()
ko_sp.Load(ko_model_prefix + '.model')

with open(ko_model_prefix + '.vocab', encoding='utf-8') as f:
    ko_vocab = [line.strip().split('\t') for line in f]

# The line number of a piece in the .vocab file is used as its id.
ko_word2idx = {piece: idx for idx, (piece, _) in enumerate(ko_vocab)}
ko_idx2word = dict(enumerate(piece for piece, _ in ko_vocab))

# English (target) tokenizer: same procedure.
spm.SentencePieceTrainer.Train(en_params)
en_sp = spm.SentencePieceProcessor()
en_sp.Load(en_model_prefix + '.model')

with open(en_model_prefix + '.vocab', encoding='utf-8') as f:
    en_vocab = [line.strip().split('\t') for line in f]

en_word2idx = {piece: idx for idx, (piece, _) in enumerate(en_vocab)}
en_idx2word = dict(enumerate(piece for piece, _ in en_vocab))

In [91]:
print(ko_word2idx)
print(en_word2idx)

{'<PAD>': 0, '<UNK>': 1, '<BOS>': 2, '<EOS>': 3, '▁': 4, '가': 5, '을': 6, '이': 7, '도': 8, '요': 9, '는': 10, '지': 11, '게': 12, '은': 13, '고': 14, '▁거예요': 15, '를': 16, '세요': 17, '▁수': 18, '의': 19, '보세요': 20, '▁너무': 21, '▁나': 22, '죠': 23, '▁거': 24, '한': 25, '서': 26, '에': 27, '▁사람': 28, '▁좋아하는': 29, '▁잘': 30, '▁것': 31, '▁더': 32, '▁안': 33, '면': 34, '만': 35, '▁사랑': 36, '나': 37, '어': 38, '다': 39, '네': 40, '▁많이': 41, '▁좋은': 42, '해': 43, '히': 44, '▁이별': 45, '▁그': 46, '로': 47, '▁싶어': 48, '할': 49, '▁좀': 50, '기': 51, '랑': 52, '▁같아요': 53, '네요': 54, '▁사람이': 55, '▁있을': 56, '▁내': 57, '하고': 58, '▁마음': 59, '▁있어요': 60, '으로': 61, '▁때': 62, '▁썸': 63, '▁생각': 64, '▁말': 65, '하세요': 66, '야': 67, '길': 68, '▁같아': 69, '▁다': 70, '▁게': 71, '하는': 72, '▁좋': 73, '일': 74, '▁있': 75, '해보세요': 76, '거': 77, '▁마세요': 78, '▁건': 79, '▁오늘': 80, '겠네요': 81, '나봐요': 82, '인': 83, '▁가': 84, '▁이제': 85, '에서': 86, '▁있는': 87, '▁마음이': 88, '겠어요': 89, '▁내가': 90, '▁연애': 91, '▁왜': 92, '▁일': 93, '▁뭐': 94, '▁못': 95, '는데': 96, '▁저': 97, '▁다른': 98, '▁

여기서부터 copy&paste

In [92]:
# Padded sequence lengths. They are set from the mean sentence lengths
# computed earlier (about 13 characters for the source, 26 for the target).
KO_MAX_LEN = 13
EN_MAX_LEN = 26

enc_input = []
dec_input = []
dec_output = []

for Q, A in zip(question, answer):
    # Encoder input: sub-word ids of the Korean sentence.
    enc_input.append(ko_sp.encode_as_ids(Q))

    # Teacher forcing pair: the decoder input is <BOS> + target ids and the
    # decoder output is the same ids shifted left by one, ending in <EOS>.
    ans_ids = en_sp.encode_as_ids(A)
    dec_i = [en_sp.bos_id()] + ans_ids
    dec_o = ans_ids + [en_sp.eos_id()]

    # The sequences are later cut to EN_MAX_LEN items with truncating='post',
    # which keeps indices 0 .. EN_MAX_LEN - 1. For an over-long target the
    # last kept position is overwritten with <EOS> so the end marker survives
    # the truncation. (Index EN_MAX_LEN itself would be discarded.)
    if len(dec_o) > EN_MAX_LEN:
        dec_o[EN_MAX_LEN - 1] = en_sp.eos_id()

    dec_input.append(dec_i)
    dec_output.append(dec_o)

In [93]:
# Bring every sequence to a fixed length: longer ones are cut at the end,
# shorter ones are filled at the end with the tokenizer's <PAD> id.
enc_input = pad_sequences(
    enc_input,
    maxlen=KO_MAX_LEN,
    padding='post',
    truncating='post',
    value=ko_sp.pad_id(),
)
dec_input = pad_sequences(
    dec_input,
    maxlen=EN_MAX_LEN,
    padding='post',
    truncating='post',
    value=en_sp.pad_id(),
)
dec_output = pad_sequences(
    dec_output,
    maxlen=EN_MAX_LEN,
    padding='post',
    truncating='post',
    value=en_sp.pad_id(),
)

In [94]:
enc_input[1], dec_input[1], dec_output[1]

(array([ 278,   11, 1351,  731, 1838,    0,    0,    0,    0,    0,    0,
           0,    0], dtype=int32),
 array([  2,   4, 658,  75,   8, 136,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       dtype=int32),
 array([  4, 658,  75,   8, 136,   3,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       dtype=int32))

In [95]:
# Persist the Korean and English vocabulary tables (word->id, id->word).
for voc_path, voc_tables in (
    ('data/chatbot_voc_ko.pkl', [ko_word2idx, ko_idx2word]),
    ('data/chatbot_voc_en.pkl', [en_word2idx, en_idx2word]),
):
    with open(voc_path, 'wb') as f:
        pickle.dump(voc_tables, f, protocol=pickle.HIGHEST_PROTOCOL)

# Persist the padded training arrays: encoder input, decoder input,
# decoder output.
with open('data/chatbot_train.pkl', 'wb') as f:
    pickle.dump([enc_input, dec_input, dec_output], f, protocol=pickle.HIGHEST_PROTOCOL)


## Seq2Seq

In [96]:
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.layers import Embedding, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras import optimizers
import tensorflow.keras.backend as K
import matplotlib.pyplot as plt
import pickle

In [97]:
# %cd '/content/drive/My Drive/Colab Notebooks'
!pwd

/content/drive/MyDrive/Colab Notebooks


In [98]:
# Read back the sub-word vocabularies and the padded training arrays
# (encoder input, decoder input, decoder output) written by the
# preprocessing step.
with open('data/chatbot_voc_ko.pkl', 'rb') as f_ko, \
        open('data/chatbot_voc_en.pkl', 'rb') as f_en, \
        open('data/chatbot_train.pkl', 'rb') as f_train:
    ko_word2idx, ko_idx2word = pickle.load(f_ko)
    en_word2idx, en_idx2word = pickle.load(f_en)
    trainXE, trainXD, trainYD = pickle.load(f_train)

# Hyper-parameters. Vocabulary sizes come from the loaded tables.
EMB_SIZE = 128
LSTM_HIDDEN = 128
KO_VOCAB_SIZE = len(ko_idx2word)
EN_VOCAB_SIZE = len(en_idx2word)
MODEL_PATH = 'data/machine_trans_trained.h5'
LOAD_MODEL = False

In [99]:
trainYD[1]

array([  4, 658,  75,   8, 136,   3,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
      dtype=int32)

In [100]:
print([ko_idx2word[i] for i in trainXE[1]])
print([en_idx2word[i] for i in trainYD[1]])

['▁1', '지', '망', '▁학교', '▁떨어졌어', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['▁I', '▁fell', '▁on', '▁the', '▁one', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']


In [101]:
# Word-embedding layers: one for the Korean (encoder) vocabulary and a
# separate one for the English (decoder) vocabulary.
K.clear_session()
ko_wordEmbedding = Embedding(input_dim=KO_VOCAB_SIZE, output_dim=EMB_SIZE)
en_wordEmbedding = Embedding(input_dim=EN_VOCAB_SIZE, output_dim=EMB_SIZE)

# Encoder
# -------
# Two stacked LSTMs used many-to-one: the per-step outputs of the top layer
# are discarded and only the final h and c states of each layer are handed
# to the decoder. return_state=True exposes those states.
encoderX = Input(batch_shape=(None, trainXE.shape[1]))
encEMB = ko_wordEmbedding(encoderX)
encLSTM1 = LSTM(LSTM_HIDDEN, return_sequences=True, return_state = True)        # return_sequences: layer 2 needs the per-step outputs of layer 1
encLSTM2 = LSTM(LSTM_HIDDEN, return_state = True)
ey1, eh1, ec1 = encLSTM1(encEMB)    # LSTM layer 1
_, eh2, ec2 = encLSTM2(ey1)       # LSTM layer 2

# Decoder
# -------
# Two stacked LSTMs used many-to-many: every time step produces an output so
# the whole target sequence can be learned. Each decoder layer starts from the
# final h/c of the encoder layer at the same depth (initial_state).
# The final layer is a per-step softmax over the English vocabulary.
decoderX = Input(batch_shape=(None, trainXD.shape[1]))
decEMB = en_wordEmbedding(decoderX)
decLSTM1 = LSTM(LSTM_HIDDEN, return_sequences=True, return_state=True)
decLSTM2 = LSTM(LSTM_HIDDEN, return_sequences=True, return_state=True)
dy1, _, _ = decLSTM1(decEMB, initial_state = [eh1, ec1])
dy2, _, _ = decLSTM2(dy1, initial_state = [eh2, ec2])
decOutput = TimeDistributed(Dense(EN_VOCAB_SIZE, activation='softmax'))
outputY = decOutput(dy2)

# Model
# -----
# Inputs: [encoder ids, decoder ids]; output: per-step vocabulary
# probabilities. The sparse loss takes integer target ids directly, so the
# targets do not need to be one-hot encoded.
model = Model([encoderX, decoderX], outputY)
model.compile(optimizer=optimizers.Adam(learning_rate=0.0005), 
              loss='sparse_categorical_crossentropy')

# Optional warm start from previously saved weights (currently disabled):
# if LOAD_MODEL:
#     model.load_weights(MODEL_PATH)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 13)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 26)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 13, 128)      512000      ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 26, 128)      512000      ['input_2[0][0]']                
                                                                                              

In [None]:
# Training with teacher forcing: the decoder is fed the ground-truth
# target sequence (trainXD) and learns to predict the shifted one (trainYD).
hist = model.fit(
    x=[trainXE, trainXD],
    y=trainYD,
    batch_size=512,
    epochs=300,
    shuffle=True,
)

# Save the trained weights for the inference notebook section.
model.save_weights(MODEL_PATH)

# Plot the training-loss curve.
plt.plot(hist.history['loss'], label='Train loss')
plt.title("Loss history")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.legend()
plt.show()

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

## Machine Translation Model (Seq2Seq를 이용한 기계번역 모델링)

In [None]:
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.layers import Embedding, TimeDistributed
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import sentencepiece as spm
import numpy as np
import pickle
import random

In [None]:
# %cd '/content/drive/My Drive/Colab Notebooks'
!pwd

In [None]:
# Load the sub-word vocabularies written by the preprocessing step.
with open('data/chatbot_voc_ko.pkl', 'rb') as f:
    ko_word2idx,  ko_idx2word = pickle.load(f)
with open('data/chatbot_voc_en.pkl', 'rb') as f:
    en_word2idx,  en_idx2word = pickle.load(f)

KO_VOCAB_SIZE = len(ko_idx2word)
EN_VOCAB_SIZE = len(en_idx2word)
EMB_SIZE = 128
LSTM_HIDDEN = 128
# Sequence lengths must match the ones used to build the training data
# (13 for the encoder, 26 for the decoder). The encoder has no padding mask,
# so feeding it a different amount of trailing <PAD> than it was trained on
# changes the states passed to the decoder. A smaller EN_MAX_LEN would also
# cut generated sentences shorter than the training targets.
KO_MAX_LEN = 13            # encoder input length (sub-word tokens)
EN_MAX_LEN = 26            # maximum number of generated sub-word tokens
MODEL_PATH = 'data/machine_trans_trained.h5'

# Load the SentencePiece models created during preprocessing.
KO_SPM_MODEL = "data/machine_trans_ko_model.model"
ko_sp = spm.SentencePieceProcessor()
ko_sp.Load(KO_SPM_MODEL)
EN_SPM_MODEL = "data/machine_trans_en_model.model"
en_sp = spm.SentencePieceProcessor()
en_sp.Load(EN_SPM_MODEL)

In [None]:
# Word-embedding layers: one for the Korean (encoder) vocabulary and a
# separate one for the English (decoder) vocabulary.
K.clear_session()
ko_wordEmbedding = Embedding(input_dim=KO_VOCAB_SIZE, output_dim=EMB_SIZE)
en_wordEmbedding = Embedding(input_dim=EN_VOCAB_SIZE, output_dim=EMB_SIZE)

# Encoder
# -------
# Two stacked LSTMs; only the final h (hidden) and c (cell) states of each
# layer are used. (Original author's note: c is the cell state that
# presumably controls the long-term / short-term balance.)
encoderX = Input(batch_shape=(None, KO_MAX_LEN))
encEMB = ko_wordEmbedding(encoderX)
encLSTM1 = LSTM(LSTM_HIDDEN, return_sequences=True, return_state = True)
encLSTM2 = LSTM(LSTM_HIDDEN, return_state = True)
ey1, eh1, ec1 = encLSTM1(encEMB)    # LSTM layer 1
_, eh2, ec2 = encLSTM2(ey1)         # LSTM layer 2

# Decoder
# -------
# Unlike the training graph, the decoder input here is a single token
# (sequence length 1): at inference time tokens are generated one at a time,
# whereas training fed the whole target sequence (teacher forcing).
decoderX = Input(batch_shape=(None, 1))
decEMB = en_wordEmbedding(decoderX)
decLSTM1 = LSTM(LSTM_HIDDEN, return_sequences=True, return_state=True)
decLSTM2 = LSTM(LSTM_HIDDEN, return_sequences=True, return_state=True)
dy1, _, _ = decLSTM1(decEMB, initial_state = [eh1, ec1])
dy2, _, _ = decLSTM2(dy1, initial_state = [eh2, ec2])
decOutput = TimeDistributed(Dense(EN_VOCAB_SIZE, activation='softmax'))
outputY = decOutput(dy2)

# Model
# -----
# Same layer structure as the training model, built only so the trained
# weights can be loaded into these layer objects.
model = Model([encoderX, decoderX], outputY)
model.load_weights(MODEL_PATH)
print(model.summary())


# Inference models (they reuse the layer objects above, so they share the
# loaded weights).
# model_enc: encoder ids -> final (h, c) of both encoder LSTM layers.
model_enc = Model(encoderX, [eh1, ec1, eh2, ec2])

# Placeholders for the decoder states that are fed back in at every step.
ih1 = Input(batch_shape = (None, LSTM_HIDDEN))
ic1 = Input(batch_shape = (None, LSTM_HIDDEN))
ih2 = Input(batch_shape = (None, LSTM_HIDDEN))
ic2 = Input(batch_shape = (None, LSTM_HIDDEN))

dec_output1, dh1, dc1 = decLSTM1(decEMB, initial_state = [ih1, ic1])
dec_output2, dh2, dc2 = decLSTM2(dec_output1, initial_state = [ih2, ic2])

# model_dec: (one token, current states) -> (vocabulary probabilities,
# updated states).
dec_output = decOutput(dec_output2)
model_dec = Model([decoderX, ih1, ic1, ih2, ic2], [dec_output, dh1, dc1, dh2, dc2])

# -----------
# Generate the translated sentence for one encoded source sentence.
def genAnswer(question):
    """Greedily decode an English sentence from an encoded Korean sentence.

    Args:
        question: 1-D array of Korean sub-word ids. A batch axis is added
            here before it is passed to the encoder, so its length is
            expected to equal the encoder input length (KO_MAX_LEN).

    Returns:
        The decoded English string built from the generated sub-word pieces
        (empty when the first prediction is already <EOS> or <PAD>).
    """
    question = question[np.newaxis, :]
    init_h1, init_c1, init_h2, init_c2 = model_enc.predict(question)

    # The decoder consumes English ids, so the start token comes from the
    # English tokenizer (not the Korean one).
    word = np.array(en_sp.bos_id()).reshape(1, 1)

    # Predicting either of these ids ends the sentence.
    stop_ids = (en_sp.eos_id(), en_sp.pad_id())

    answer = []
    for _ in range(EN_MAX_LEN):
        # One decoder step: feed the previous token and states, get the
        # vocabulary distribution and the updated states back.
        dY, init_h1, init_c1, init_h2, init_c2 = model_dec.predict(
            [word, init_h1, init_c1, init_h2, init_c2])

        # Greedy choice: take the most probable vocabulary id.
        nextWord = int(np.argmax(dY[0, 0]))
        if nextWord in stop_ids:
            break

        answer.append(en_idx2word[nextWord])

        # The chosen token becomes the decoder input of the next step.
        word = np.array(nextWord).reshape(1, 1)

    return en_sp.decode_pieces(answer)

In [None]:
def make_question(que_string):
    """Turn a raw Korean sentence into a list of exactly KO_MAX_LEN ids.

    The sentence is split into sub-word pieces, each piece is looked up in
    ko_word2idx (pieces missing from the table map to the <UNK> id), and the
    result is truncated or right-padded with the <PAD> id to KO_MAX_LEN.
    """
    unk = ko_sp.unk_id()
    pieces = ko_sp.encode_as_pieces(que_string)

    # Look up ids and keep at most KO_MAX_LEN of them.
    q_idx = [ko_word2idx.get(piece, unk) for piece in pieces][:KO_MAX_LEN]

    # Fill the remainder (if any) with <PAD>.
    q_idx += [ko_sp.pad_id()] * (KO_MAX_LEN - len(q_idx))
    return q_idx

In [None]:
# Chatting
# dummy : 최초 1회는 모델을 로드하는데 약간의 시간이 걸리므로 이것을 가리기 위함.
def chatting(n=100):
    """Run an interactive loop for at most n turns.

    Each turn reads a sentence from stdin, encodes it, generates the answer
    and prints it. Typing 'quit' ends the loop early.
    """
    for _ in range(n):
        user_text = input('Q : ')
        if user_text == 'quit':
            return

        encoded = np.array(make_question(user_text))
        print('A :', genAnswer(encoded))

In [None]:
####### Start the interactive session #######
print("\nSeq2Seq ChatBot (ver. 1.0)")
print("Chatting 모듈을 로드하고 있습니다 ...")

# The first prediction is slow, so run one dummy all-<PAD> question up front.
# The dummy input must have the encoder's input length, KO_MAX_LEN
# (MAX_LEN is not defined anywhere in this notebook).
answer = genAnswer(np.zeros(KO_MAX_LEN))
print("ChatBot이 준비 됐습니다.")

# Start the interactive loop.
chatting(100)