# RNN을 이용한 인코더-디코더
- 번역기, 텍스트 요약에 활용

## Sequence-to-Sequence
- 챗봇, 기계번역, 내용요약, stt(Speech to Text)

![](https://wikidocs.net/images/page/24996/%EB%8B%A8%EC%96%B4%ED%86%A0%ED%81%B0%EB%93%A4%EC%9D%B4.PNG)
1. 인코더 RNN 마지막 시점의 은닉상태가 **컨텍스트 벡터**
2. 컨텍스트 벡터는 디코더 RNN 첫번째 시점의 은닉상태에 사용
3. 디코더는 RNNLM(다음 시점의 단어를 예측하는 모델)
4. 훈련과정에서는 기존 정답을 RNN의 인풋으로 사용
5. 테스트과정에서는 전시점의 RNN의 예측을 다음 시점의 인풋으로 사용

### 교사 강요(Teacher forcing)
- 이전 디코더셀의 출력을 현재 디코더셀의 입력으로 활용 (예측시)
- 이전 시점의 실제값을 현재 디코더셀의 입력으로 활용 (훈련시, 교사강요)
- 이전 예측이 틀려서 다음 셀의 예측의 영향을 주는 것을 막기 위한 전략

## Character-Level Neural Machine Translation (실습)

### 병렬 코퍼스 데이터에 대한 전처리

In [1]:
import os
import shutil
import zipfile
from pathlib import Path

import pandas as pd
import tensorflow as tf
import urllib3
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [2]:
data_dir = Path('C:/Users/011/sinjy1203/data')
lines = pd.read_csv(data_dir / 'fra.txt', names=['src', 'tar', 'lic'], sep='\t')
del lines['lic']
lines.shape

(197463, 2)

In [3]:
lines.head()

Unnamed: 0,src,tar
0,Go.,Va !
1,Go.,Marche.
2,Go.,En route !
3,Go.,Bouge !
4,Hi.,Salut !


In [4]:
lines = lines.loc[:, 'src':'tar']
lines = lines[0:60000]
lines.head(10)

Unnamed: 0,src,tar
0,Go.,Va !
1,Go.,Marche.
2,Go.,En route !
3,Go.,Bouge !
4,Hi.,Salut !
5,Hi.,Salut.
6,Run!,Cours !
7,Run!,Courez !
8,Run!,Prenez vos jambes à vos cous !
9,Run!,File !


In [5]:
lines.sample(10)

Unnamed: 0,src,tar
40438,You're off the hook.,Tu es tiré d'affaire.
6900,Everyone's up.,Tout le monde s'est levé.
14048,I feel relieved.,Je me sens soulagé.
48894,He was chosen captain.,Il a été choisi comme capitaine.
46895,Who is he talking to?,Avec qui parle-t-il ?
992,He's lazy.,Il est paresseux.
8312,"OK, I'm ready.","Bon, je suis prêt."
8822,Tom hesitated.,Tom a hésité.
38411,The view is amazing.,La vue est fantastique.
161,Be fair.,Sois juste !


In [6]:
# 시작과 종료를 의미하는 <sos>와 <eos> 추가
# 여기서는 \t와 \n을 각각 시작 종료 심볼로 간주
lines.tar = lines.tar.apply(lambda x: '\t ' + x + ' \n')
lines.sample(10)

Unnamed: 0,src,tar
38635,This beef is tender.,\t Le bœuf est tendre. \n
38901,Tom is a con artist.,\t Tom est un escroc. \n
19200,I was sure of it.,\t J'en étais sûre. \n
3012,He's a slob.,\t C'est un flemmard. \n
30072,I need a secretary.,\t J'ai besoin d'une secrétaire. \n
58046,Is Tom related to Mary?,\t Tom et Mary sont-ils parents ? \n
4606,Do you smoke?,\t Est-ce que tu fumes ? \n
20611,The light was on.,\t La lumière était allumée. \n
36293,I only speak French.,\t Je parle seulement français. \n
27309,We're out of ammo.,\t Nous sommes à court de munitions. \n


In [7]:
# 문자 집합 생성
src_vocab = set()
for line in lines.src:
    for char in line:
        src_vocab.add(char)

tar_vocab = set()
for line in lines.tar:
    for char in line:
        tar_vocab.add(char)

In [8]:
src_vocab_size = len(src_vocab) + 1 # padding때문에 1개 추가
tar_vocab_size = len(tar_vocab) + 1 
src_vocab_size, tar_vocab_size

(79, 105)

In [9]:
src_to_index = dict([(word, i+1) for i, word in enumerate(src_vocab)])
tar_to_index = dict([(word, i+1) for i, word in enumerate(tar_vocab)])
print(src_to_index)
print(tar_to_index)

{'’': 1, 'R': 2, 'Y': 3, 'Z': 4, '-': 5, 'y': 6, 'i': 7, '0': 8, 'b': 9, 'H': 10, 'z': 11, '5': 12, 'C': 13, 'E': 14, '2': 15, 'n': 16, ' ': 17, 'T': 18, 'w': 19, '&': 20, '8': 21, 'F': 22, 'N': 23, 'V': 24, 'a': 25, 'D': 26, 'f': 27, 'U': 28, '!': 29, ':': 30, 'B': 31, 'h': 32, 'e': 33, 'q': 34, 't': 35, '"': 36, 'W': 37, 'é': 38, '4': 39, 'p': 40, 'r': 41, '?': 42, 'K': 43, '1': 44, 's': 45, 'M': 46, 'L': 47, 'm': 48, 'G': 49, 'S': 50, '3': 51, 'v': 52, 'd': 53, 'x': 54, '7': 55, 'u': 56, 'A': 57, '€': 58, 'O': 59, 'l': 60, 'g': 61, '%': 62, 'k': 63, ',': 64, 'X': 65, 'P': 66, "'": 67, 'j': 68, '/': 69, '9': 70, '$': 71, 'J': 72, 'I': 73, 'o': 74, '6': 75, 'Q': 76, 'c': 77, '.': 78}
{'’': 1, 'ï': 2, 'R': 3, 'Y': 4, 'ç': 5, '-': 6, 'Z': 7, 'y': 8, 'œ': 9, 'i': 10, '\u202f': 11, '0': 12, 'À': 13, 'b': 14, 'à': 15, 'H': 16, 'z': 17, '5': 18, 'É': 19, 'C': 20, 'Ê': 21, 'E': 22, '2': 23, 'n': 24, '\t': 25, '‘': 26, '\xa0': 27, ' ': 28, 'T': 29, 'w': 30, '&': 31, 'Ç': 32, '8': 33, 'ô': 34,

In [10]:
# integer encoding (encoder)
encoder_input = []

for line in lines.src:
    encoded_line = []
    for char in line:
        encoded_line.append(src_to_index[char])
    encoder_input.append(encoded_line)
encoder_input[:5]

[[49, 74, 78], [49, 74, 78], [49, 74, 78], [49, 74, 78], [10, 7, 78]]

In [11]:
# decoder encoding (decoder)
decoder_input = []
for line in lines.tar:
    decoded_line = []
    for char in line:
        decoded_line.append(tar_to_index[char])
    decoder_input.append(decoded_line)
decoder_input[:5]

[[25, 28, 37, 39, 28, 43, 28, 87],
 [25, 28, 65, 39, 59, 103, 47, 48, 104, 28, 87],
 [25, 28, 22, 24, 28, 59, 99, 78, 50, 48, 28, 43, 28, 87],
 [25, 28, 46, 99, 78, 82, 48, 28, 43, 28, 87],
 [25, 28, 70, 39, 81, 78, 50, 28, 43, 28, 87]]

In [12]:
# target data
decoder_target = []
for line in lines.tar:
    timestep = 0
    decoded_line = []
    for char in line:
        if timestep > 0:
            decoded_line.append(tar_to_index[char])
        timestep += 1
    decoder_target.append(decoded_line)
decoder_target[:5]

[[28, 37, 39, 28, 43, 28, 87],
 [28, 65, 39, 59, 103, 47, 48, 104, 28, 87],
 [28, 22, 24, 28, 59, 99, 78, 50, 48, 28, 43, 28, 87],
 [28, 46, 99, 78, 82, 48, 28, 43, 28, 87],
 [28, 70, 39, 81, 78, 50, 28, 43, 28, 87]]

In [13]:
# padding
max_src_len = max([len(line) for line in lines.src])
max_tar_len = max([len(line) for line in lines.tar])
max_src_len, max_tar_len

(23, 76)

In [14]:
encoder_input = pad_sequences(encoder_input, maxlen=max_src_len, padding='post')
decoder_input = pad_sequences(decoder_input, maxlen=max_tar_len, padding='post')
decoder_target = pad_sequences(decoder_target, maxlen=max_tar_len, padding='post')

In [15]:
encoder_input = to_categorical(encoder_input)
decoder_input = to_categorical(decoder_input)
decoder_target = to_categorical(decoder_target)

In [16]:
encoder_input.shape

(60000, 23, 79)

In [17]:
decoder_input.shape

(60000, 76, 105)

In [18]:
decoder_target.shape

(60000, 76, 105)

### training seq2seq model

In [20]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model
import numpy as np

In [21]:
encoder_inputs = Input(shape=(None, src_vocab_size))
encoder_lstm = LSTM(units=256, return_state=True)

encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)

encoder_states = [state_h, state_c]

In [23]:
decoder_inputs = Input(shape=(None, tar_vocab_size))
decoder_lstm = LSTM(units=256, return_sequences=True, return_state=True)

decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

decoder_softmax_layer = Dense(tar_vocab_size, activation='softmax')
decoder_outputs = decoder_softmax_layer(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [24]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None, 79)]   0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, None, 105)]  0           []                               
                                                                                                  
 lstm (LSTM)                    [(None, 256),        344064      ['input_1[0][0]']                
                                 (None, 256),                                                     
                                 (None, 256)]                                                     
                                                                                              

In [25]:
model.fit(x=[encoder_input, decoder_input], y=decoder_target, batch_size=64,
         epochs=40, validation_split=0.2)

Epoch 1/40

KeyboardInterrupt: 

### seq2seq 동작