# RNN을 이용한 인코더-디코더
- 번역기, 텍스트 요약에 활용

## Sequence-to-Sequence
- 챗봇, 기계번역, 내용요약, stt(Speech to Text)

![](https://wikidocs.net/images/page/24996/%EB%8B%A8%EC%96%B4%ED%86%A0%ED%81%B0%EB%93%A4%EC%9D%B4.PNG)
1. 인코더 RNN 마지막 시점의 은닉상태가 **컨텍스트 벡터**
2. 컨텍스트 벡터는 디코더 RNN 첫번째 시점의 은닉상태에 사용
3. 디코더는 RNNLM(다음 시점의 단어를 예측하는 모델)
4. 훈련과정에서는 기존 정답을 RNN의 인풋으로 사용
5. 테스트과정에서는 전시점의 RNN의 예측을 다음 시점의 인풋으로 사용

## Character-Level Neural Machine Translation (실습)

### 병렬 코퍼스 데이터에 대한 전처리

In [3]:
import os
import shutil
import zipfile
from pathlib import Path

import pandas as pd
import tensorflow as tf
import urllib3
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [11]:
data_dir = Path('C:/Users/011/sinjy1203/data')
lines = pd.read_csv(data_dir / 'fra.txt', names=['src', 'tar', 'lic'], sep='\t')
del lines['lic']
lines.shape

(197463, 2)

In [12]:
lines.head()

Unnamed: 0,src,tar
0,Go.,Va !
1,Go.,Marche.
2,Go.,En route !
3,Go.,Bouge !
4,Hi.,Salut !


In [13]:
lines = lines.loc[:, 'src':'tar']
lines = lines[0:60000]
lines.head(10)

Unnamed: 0,src,tar
0,Go.,Va !
1,Go.,Marche.
2,Go.,En route !
3,Go.,Bouge !
4,Hi.,Salut !
5,Hi.,Salut.
6,Run!,Cours !
7,Run!,Courez !
8,Run!,Prenez vos jambes à vos cous !
9,Run!,File !


In [14]:
lines.sample(10)

Unnamed: 0,src,tar
6928,Give it to me!,Donnez-la-moi.
13382,Dinner is ready.,Le dîner est prêt.
7168,I bribed them.,Je les ai soudoyées.
41733,He made her his wife.,Il a fait d'elle sa femme.
34918,Everybody likes her.,Tout le monde l'aime.
43124,I thought I knew you.,Je pensais te connaître.
14606,I went for help.,Je suis allé chercher de l'aide.
19420,I'm ignoring you.,Je t'ignore.
23214,Haste makes waste.,La vitesse crée de la perte.
35965,I have but one wish.,Je n'ai qu'un vœu.


In [15]:
# 시작과 종료를 의미하는 <sos>와 <eos> 추가
# 여기서는 \t와 \n을 각각 시작 종료 심볼로 간주
lines.tar = lines.tar.apply(lambda x: '\t ' + x + ' \n')
lines.sample(10)

Unnamed: 0,src,tar
13840,How are you now?,\t Comment vas-tu maintenant ? \n
59578,The weather turned bad.,\t Le temps devint mauvais. \n
33011,We can't afford it.,\t Nous ne pouvons nous le permettre. \n
47993,Did you hear all that?,\t Avez-vous tout entendu ? \n
53128,Tom looks embarrassed.,\t Tom a l'air gêné. \n
6031,This is ugly.,\t C'est hideux. \n
19909,Let me handle it.,\t Laisse-moi m'en débrouiller ! \n
19974,Look at your map.,\t Regarde ta carte. \n
12523,We think alike.,\t Je partage votre point de vue. \n
58026,I've only used it once.,\t Je l'ai utilisé seulement une fois. \n


In [16]:
# 문자 집합 생성
src_vocab = set()
for line in lines.src:
    for char in line:
        src_vocab.add(char)

tar_vocab = set()
for line in lines.tar:
    for char in line:
        tar_vocab.add(char)

In [20]:
src_vocab_size = len(src_vocab) + 1 # padding때문에 1개 추가
tar_vocab_size = len(tar_vocab) + 1 
src_vocab_size, tar_vocab_size

(79, 105)

In [21]:
src_to_index = dict([(word, i+1) for i, word in enumerate(src_vocab)])
tar_to_index = dict([(word, i+1) for i, word in enumerate(tar_vocab)])
print(src_to_index)
print(tar_to_index)

{'P': 1, 'G': 2, '1': 3, ':': 4, '4': 5, 'A': 6, 'i': 7, 't': 8, '’': 9, '/': 10, 'B': 11, 'h': 12, ',': 13, 'F': 14, 'a': 15, 'y': 16, '8': 17, '2': 18, 'J': 19, 'b': 20, 'r': 21, 'X': 22, 'D': 23, ' ': 24, 'j': 25, 'd': 26, 'M': 27, 'L': 28, 'K': 29, 'U': 30, 'u': 31, 'm': 32, '5': 33, 'O': 34, 'o': 35, 'c': 36, 'e': 37, 'W': 38, 'Y': 39, 'N': 40, '7': 41, '€': 42, '3': 43, '!': 44, 'l': 45, '&': 46, 'é': 47, 'q': 48, '?': 49, 'H': 50, '"': 51, 'g': 52, 'V': 53, 'z': 54, '.': 55, 'p': 56, 'f': 57, 'E': 58, 'I': 59, '6': 60, '%': 61, 'S': 62, 'Q': 63, 'T': 64, "'": 65, 's': 66, 'v': 67, '$': 68, 'x': 69, 'k': 70, '0': 71, 'w': 72, '9': 73, '-': 74, 'n': 75, 'R': 76, 'Z': 77, 'C': 78}
{'Ê': 1, 'P': 2, 'G': 3, '1': 4, ':': 5, '4': 6, 'A': 7, 'i': 8, 'Ô': 9, 'û': 10, 'ù': 11, '\u200b': 12, 't': 13, '’': 14, 'À': 15, 'É': 16, 'B': 17, 'h': 18, ',': 19, 'F': 20, 'a': 21, 'y': 22, 'œ': 23, '8': 24, '2': 25, 'J': 26, 'b': 27, 'r': 28, 'â': 29, 'X': 30, 'D': 31, ' ': 32, 'ê': 33, 'î': 34, 'j'

In [22]:
# integer encoding (encoder)
encoder_input = []

for line in lines.src:
    encoded_line = []
    for char in line:
        encoded_line.append(src_to_index[char])
    encoder_input.append(encoded_line)
encoder_input[:5]

[[2, 35, 55], [2, 35, 55], [2, 35, 55], [2, 35, 55], [50, 7, 55]]

In [23]:
# decoder encoding (decoder)
decoder_input = []
for line in lines.tar:
    decoded_line = []
    for char in line:
        decoded_line.append(tar_to_index[char])
    decoder_input.append(decoded_line)
decoder_input[:5]

[[40, 32, 70, 21, 32, 60, 32, 85],
 [40, 32, 39, 21, 28, 51, 18, 52, 72, 32, 85],
 [40, 32, 77, 99, 32, 28, 50, 46, 13, 52, 32, 60, 32, 85],
 [40, 32, 17, 50, 46, 69, 52, 32, 60, 32, 85],
 [40, 32, 81, 21, 61, 46, 13, 32, 60, 32, 85]]

In [24]:
# target data
decoder_target = []
for line in lines.tar:
    timestep = 0
    decoded_line = []
    for char in line:
        if timestep > 0:
            decoded_line.append(tar_to_index[char])
        timestep += 1
    decoder_target.append(decoded_line)
decoder_target[:5]

[[32, 70, 21, 32, 60, 32, 85],
 [32, 39, 21, 28, 51, 18, 52, 72, 32, 85],
 [32, 77, 99, 32, 28, 50, 46, 13, 52, 32, 60, 32, 85],
 [32, 17, 50, 46, 69, 52, 32, 60, 32, 85],
 [32, 81, 21, 61, 46, 13, 32, 60, 32, 85]]

In [25]:
# padding