# 1 Seq2Seq

## Machine Translation
### Seq2Seq 모델을 활용한 기계 번역
(영어를 불어로)

https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

## 모듈 임포트

In [None]:
import os
import random
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

from tqdm.auto import tqdm

## 1. 데이터셋 준비

In [None]:
# Toy token -> index vocabulary: the two special tokens take the first ids,
# regular tokens follow.
word2idx = {
    token: index
    for index, token in enumerate(['<SOS>', '<EOS>', 'token1'])
}

In [None]:
# Ids reserved for the start-of-sequence / end-of-sequence markers.
SOS_token, EOS_token = 0, 1

# Training configuration.
batch_size = 64        # samples per mini-batch
epochs = 20            # number of training epochs
latent_dim = 256       # dimensionality of the context vector
num_samples = 10000    # number of sentence pairs to read from the file
data_path = 'fra.txt'  # training data file

In [None]:
# Read the tab-separated sentence pairs and collect the character vocabularies.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

for line in lines[: min(num_samples, len(lines) - 1)]:
    fields = line.split('\t')  # columns are tab-separated
    if len(fields) < 2:
        # Blank or malformed line: there is no source/target pair to extract.
        continue
    # Only the first two columns are used, so files with or without extra
    # trailing columns are both accepted.
    input_text, target_text = fields[:2]
    # Wrap the target in '\t' (start marker) and '\n' (end marker). These are
    # ordinary vocabulary characters, separate from the SOS_token/EOS_token ids.
    target_text = '\t' + target_text + '\n'
    input_texts.append(input_text)
    target_texts.append(target_text)
    input_characters.update(input_text)
    target_characters.update(target_text)

input_characters = sorted(input_characters)    # characters seen in source sentences
target_characters = sorted(target_characters)  # characters seen in target sentences
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max(len(txt) for txt in input_texts)   # longest source sentence
max_decoder_seq_length = max(len(txt) for txt in target_texts)  # longest target sentence

In [None]:
num_encoder_tokens, num_decoder_tokens

(71, 93)

In [None]:
max_encoder_seq_length, max_decoder_seq_length

(15, 59)

In [None]:
# Source vocabulary size including the two special tokens (<SOS>, <EOS>).
# Derived from num_encoder_tokens because input_token_index is only built in
# a later cell, so referencing it here fails on a top-to-bottom run.
num_encoder_tokens + 2

73

In [None]:
# Character -> id lookup tables. Ids 0 and 1 are reserved for <SOS>/<EOS>,
# so regular characters are numbered from 2.
special_tokens = {'<SOS>': SOS_token, '<EOS>': EOS_token}
# Encoder-side vocabulary.
input_token_index = {
    **special_tokens,
    **{ch: pos + 2 for pos, ch in enumerate(input_characters)},
}
# Decoder-side vocabulary.
target_token_index = {
    **special_tokens,
    **{ch: pos + 2 for pos, ch in enumerate(target_characters)},
}

# Zero-initialised id matrices, one row per sentence; the extra column leaves
# room for the EOS id appended to every sequence. Unused cells stay 0.
n = len(input_texts)
encoder_input_data = np.zeros((n, max_encoder_seq_length + 1), dtype=np.int32)
decoder_input_data = np.zeros((n, max_decoder_seq_length + 1), dtype=np.int32)

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    # Source sentence: character ids followed by EOS.
    source_ids = [input_token_index[ch] for ch in input_text] + [EOS_token]
    encoder_input_data[i, :len(source_ids)] = source_ids

    # Target sentence: character ids (including the '\t' and '\n' markers)
    # followed by EOS. Stored as integer class ids, not one-hot vectors.
    target_ids = [target_token_index[ch] for ch in target_text] + [EOS_token]
    decoder_input_data[i, :len(target_ids)] = target_ids

In [None]:
encoder_input_data.shape

(10000, 16)

In [None]:
encoder_input_data[0]

array([28, 60, 10,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
      dtype=int32)

In [None]:
decoder_input_data.shape

(10000, 60)

In [None]:
decoder_input_data[0]

array([ 2, 45, 47,  4,  5,  3,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the (source, target) rows for testing; rows are shuffled
# before the split.
X_train, X_test, y_train, y_test = train_test_split(
    encoder_input_data,
    decoder_input_data,
    test_size=0.2,
    shuffle=True,
)

In [None]:
# Dataset definition
class TextDataset(Dataset):
    """Pairs of (source ids, target ids) stored as ``torch.long`` tensors."""

    def __init__(self, input_datas, target_datas):
        # Convert both id collections to int64 tensors once, up front.
        self.input_ids = torch.tensor(input_datas, dtype=torch.long)
        self.target_ids = torch.tensor(target_datas, dtype=torch.long)

    def __len__(self):
        """Return the number of stored source sequences."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(source, target)`` tensors stored at ``index``."""
        source = self.input_ids[index]
        target = self.target_ids[index]
        return source, target

# Wrap the splits in datasets and batch them. The batch size comes from the
# `batch_size` constant instead of a repeated literal, and only the training
# loader shuffles.
train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
X_train[0]

array([30,  2, 68, 46, 59, 65,  2, 61, 63, 60, 60, 51, 10,  1,  0,  0],
      dtype=int32)

In [None]:
y_train[0]

array([ 2, 33, 51,  4, 68, 51, 67, 69,  4, 67, 60, 51,  4, 62, 64, 51, 67,
       68, 51, 14,  3,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)

In [None]:
train_dataset[0]

(tensor([30,  2, 68, 46, 59, 65,  2, 61, 63, 60, 60, 51, 10,  1,  0,  0]),
 tensor([ 2, 33, 51,  4, 68, 51, 67, 69,  4, 67, 60, 51,  4, 62, 64, 51, 67, 68,
         51, 14,  3,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0]))

In [None]:
# Peek at the first mini-batch only, to sanity-check what the loader yields.
for first_batch in train_loader:
    input_ids, target_ids = first_batch
    print(input_ids)         # one batch of source id sequences
    print(target_ids.shape)  # shape of the matching target batch
    break

tensor([[45, 60, 66,  ..., 10,  1,  0],
        [41, 53, 50,  ...,  1,  0,  0],
        [22, 63, 50,  ...,  1,  0,  0],
        ...,
        [41, 53, 46,  ..., 65, 10,  1],
        [41, 53, 54,  ...,  0,  0,  0],
        [30,  7, 58,  ...,  0,  0,  0]])
torch.Size([64, 60])


## 2. 모델 정의

In [None]:
class EncoderRNN(nn.Module):
    """Embedding -> dropout -> single GRU encoder (batch-first)."""

    def __init__(self, input_size, hidden_size=256, dropout_p=0.1):
        super().__init__()

        # Sub-modules are created in the same order as before so parameter
        # initialisation and state-dict layout are unchanged.
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        """Encode a batch of token ids.

        Returns:
            ``(output, hidden)`` as produced by the GRU. The training code
            only consumes ``hidden`` (the context vector); ``output`` is
            passed along but not otherwise used by this decoder.
        """
        token_vectors = self.embedding(input)
        token_vectors = self.dropout(token_vectors)
        return self.gru(token_vectors)

# Vocabulary size is the number of distinct source characters plus two
# embedding rows for <SOS> and <EOS> (equals len(input_token_index)).
encoder = EncoderRNN(input_size=num_encoder_tokens + 2)

In [None]:
class DecoderRNN(nn.Module):
    """GRU decoder that emits one vector of vocabulary logits per time step.

    Depends on the module-level constants ``SOS_token`` and (for free-running
    decoding) ``max_decoder_seq_length``.
    """

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        # Projects the GRU state onto the vocabulary (multi-class logits).
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Decode a batch, with teacher forcing when ``target_tensor`` is given.

        Args:
            encoder_outputs: encoder outputs; only the batch size (dim 0) and
                the device are read from it.
            encoder_hidden: initial hidden state for the GRU.
            target_tensor: optional ``(batch, steps)`` tensor of target ids.
                When given, the target at each step is fed as the next input
                and one output is produced per target column.

        Returns:
            ``(decoder_outputs, decoder_hidden, None)`` where
            ``decoder_outputs`` holds raw logits concatenated along dim 1.
            ``None`` is kept for consistency with the training loop.
        """
        batch_size = encoder_outputs.size(0)
        # Build the start tokens on the same device as the encoder output
        # rather than a global `device`, which may not match (or not exist yet).
        decoder_input = torch.full(
            (batch_size, 1),
            SOS_token,
            dtype=torch.long,
            device=encoder_outputs.device,
        )
        decoder_hidden = encoder_hidden

        if target_tensor is not None:
            # One prediction per target position keeps outputs and targets aligned.
            num_steps = target_tensor.size(1)
        else:
            num_steps = max_decoder_seq_length + 1

        decoder_outputs = []
        for step in range(num_steps):
            decoder_output, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                # Teacher forcing: feed the ground-truth token as the next input.
                decoder_input = target_tensor[:, step].unsqueeze(1)
            else:
                # Free running: feed back the highest-scoring token, detached
                # from the graph.
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        # Raw logits are returned; no log-softmax is applied here.
        return decoder_outputs, decoder_hidden, None

    def forward_step(self, input, hidden):
        """Run one decoding step: embed, ReLU, GRU, then project to logits."""
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden

# Output size is the number of distinct target characters plus the two
# special-token ids (equals len(target_token_index)).
decoder = DecoderRNN(hidden_size=256, output_size=num_decoder_tokens + 2)

In [None]:
learning_rate = 1e-3

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder = encoder.to(device)
decoder = decoder.to(device)

# One Adam optimiser per network, sharing the same learning rate.
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
# NOTE(review): padded positions are stored as id 0 and are included in this
# loss; consider whether they should be ignored.
criterion = nn.CrossEntropyLoss()

## 3. 모델 학습

In [None]:
epochs = 10

# Per-epoch metrics; only 'loss' is filled in by this loop.
history = {'loss': [], 'val_loss': [], 'val_acc': []}
log_every = 1  # print the training loss every `log_every` epochs

for epoch in range(1, epochs + 1):
    # Training phase
    encoder.train()
    decoder.train()
    batch_losses = []
    for input_tensor, target_tensor in tqdm(train_loader):
        input_tensor = input_tensor.to(device)
        target_tensor = target_tensor.to(device)

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        # Encode, then decode with teacher forcing (targets are passed in).
        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        # Flatten to (positions, vocab) vs. (positions,) for the loss.
        vocab_size = decoder_outputs.size(-1)
        loss = criterion(decoder_outputs.view(-1, vocab_size), target_tensor.view(-1))
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        batch_losses.append(loss.item())

    epoch_loss = np.mean(batch_losses)
    history['loss'].append(epoch_loss)

    if epoch % log_every == 0:
        print(f'Epoch [{epoch}], Train Loss : [{epoch_loss:.5f}]')


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [1], Train Loss : [0.92789]


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [2], Train Loss : [0.57135]


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [3], Train Loss : [0.48526]


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [4], Train Loss : [0.43386]


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [5], Train Loss : [0.39738]


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [6], Train Loss : [0.36834]


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [7], Train Loss : [0.34554]


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [8], Train Loss : [0.32504]


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [9], Train Loss : [0.30788]


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch [10], Train Loss : [0.29281]


## 4. 예측

In [None]:
# Reverse lookups (id -> token) for turning model ids back into text.
input_idx2word = dict(zip(input_token_index.values(), input_token_index.keys()))
target_idx2word = dict(zip(target_token_index.values(), target_token_index.keys()))

In [None]:
# Take the first held-out pair and move both tensors to the model's device.
input_tensor, target_tensor = (t.to(device) for t in test_dataset[0])

In [None]:
# Source sentence: its encoded length, then the tokens its ids map back to.
print(len(input_tensor))
print(''.join(input_idx2word[token_id] for token_id in input_tensor.tolist()), end='')

16
You look smart.<EOS>

In [None]:
target_tensor

tensor([ 2, 45, 61, 67, 65,  4, 47, 68, 51, 71,  4, 58,  9, 47, 55, 64,  4, 55,
        60, 66, 51, 58, 58, 55, 53, 51, 60, 66, 51, 14,  3,  1,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0])

In [None]:
# Reference translation, decoded id by id (every id in the tensor is looked
# up, including any trailing zeros).
print(''.join(target_idx2word[token_id] for token_id in target_tensor.tolist()), end='')

	Vous avez l'air intelligente.
<EOS><SOS><SOS><SOS><SOS><SOS><SOS><SOS><SOS><SOS><SOS><SOS><SOS><SOS><SOS><SOS><SOS><SOS><SOS><SOS><SOS><SOS><SOS><SOS><SOS><SOS><SOS><SOS><SOS>

In [None]:
# Prediction for the single sample selected above
encoder.eval()
decoder.eval()
with torch.no_grad():
    # Add a batch dimension of 1, then decode without teacher forcing.
    encoder_outputs, encoder_hidden = encoder(input_tensor.unsqueeze(0))
    decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

    vocab_size = decoder_outputs.size(-1)
    loss = criterion(decoder_outputs.view(-1, vocab_size), target_tensor.view(-1))
    print(loss)

    # Highest-scoring id at every step, flattened to one sequence.
    decoded_ids = decoder_outputs.topk(1)[1].squeeze()

    # Map ids to tokens, stopping right after the first <EOS>.
    decoded_words = []
    for token_id in decoded_ids.tolist():
        if token_id == EOS_token:
            decoded_words.append('<EOS>')
            break
        decoded_words.append(target_idx2word[token_id])

tensor(3.4437)


In [None]:
# \t == <SOS>
# \n == <EOS>

print(''.join(decoded_words))

	Tu as l'air en train de main.
<EOS>


In [49]:
# Prediction wrapped in a function
def predict_sentence(input_tensor, target_tensor=None):
    """Greedily decode one encoded source sentence.

    Args:
        input_tensor: tensor of source token ids for a single sentence; a
            batch dimension is added here. Expected to already be on the
            same device as the models.
        target_tensor: optional tensor of reference ids, used only to compute
            the loss. When omitted, the module-level ``target_tensor`` is used
            if one exists, which is what callers relied on before this
            parameter was added.

    Returns:
        ``(decoded_words, loss)``: the decoded tokens up to and including
        ``'<EOS>'`` (or every step if no EOS id is produced), and the
        criterion value against the reference, or ``None`` when no reference
        is available.
    """
    if target_tensor is None:
        # Backward-compatible fallback to the notebook-level variable.
        target_tensor = globals().get('target_tensor')

    encoder.eval()
    decoder.eval()
    with torch.no_grad():
        encoder_outputs, encoder_hidden = encoder(input_tensor.unsqueeze(0))
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden)

        loss = None
        if target_tensor is not None:
            loss = criterion(
                decoder_outputs.view(-1, decoder_outputs.size(-1)),
                target_tensor.view(-1)
            )

        # Highest-scoring id at every step, flattened to one sequence.
        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_words = []
        for idx in decoded_ids:
            if idx.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(target_idx2word[idx.item()])
    return decoded_words, loss


In [53]:
for seq_index in range(10, 20):
    # Keep the names `input_tensor` / `target_tensor`: predict_sentence may
    # read the module-level `target_tensor` when computing its loss.
    input_tensor, target_tensor = test_dataset[seq_index]
    input_tensor = input_tensor.to(device)
    target_tensor = target_tensor.to(device)

    # Map ids back to characters, stopping before the first EOS id.
    source_chars = []
    for token_id in input_tensor.tolist():
        if token_id == EOS_token:
            break
        source_chars.append(input_idx2word[token_id])

    target_chars = []
    for token_id in target_tensor.tolist():
        if token_id == EOS_token:
            break
        target_chars.append(target_idx2word[token_id])

    print('------------')
    # Source sentence
    print('input sentence: ', end=' ')
    print(''.join(source_chars), end='')

    # Reference translation
    print('\ntarget sentence: ', end=' ')
    print(''.join(target_chars), end='')

    # Model prediction
    print('\npredict sentence: ', end=' ')
    decoded_words, loss = predict_sentence(input_tensor)
    print(*decoded_words, sep='')

------------
input sentence:  She choked him.
target sentence:  	Elle l'étrangla.

predict sentence:  	Elle est connuire.
<EOS>
------------
input sentence:  It is my cat.
target sentence:  	C'est mon chat.

predict sentence:  	C'est la mien.
<EOS>
------------
input sentence:  Do you gamble?
target sentence:  	Est-ce que vous jouez ?

predict sentence:  	Est-ce que je vous ai besoin de vous ?
<EOS>
------------
input sentence:  It's done.
target sentence:  	C'est fait.

predict sentence:  	C'est le sien.
<EOS>
------------
input sentence:  Don't kill me.
target sentence:  	Ne me tue pas !

predict sentence:  	Ne sois pas chanter.
<EOS>
------------
input sentence:  I cringed.
target sentence:  	Je suis rentré en moi-même.

predict sentence:  	J'ai besoin de main.
<EOS>
------------
input sentence:  You did it!
target sentence:  	C'est vous qui l'avez fait !

predict sentence:  	Tu peux de la maison !
<EOS>
------------
input sentence:  Do it right.
target sentence:  	Fais-le comme il 