## 개념 짚고가기

- Seq2Seq(Encoder-Decoder LSTM)모델은 RNN을 이용해 input을 feature vector로 인코딩함
- 이렇게 인코딩된 vector를 여기선 'context vector'라 함
- 'context vector' : input문장의 추상적인/압축된 representation
- 이 context vector는 두번째 RNN을 통해 디코딩되어 번역된 문장을 생성함

In [16]:

# Demo cell: pad every word of every pair to a fixed width and print it.
input_batch, output_batch, target_batch = [], [], []
seq_data = [
    ['man', 'women'],
    ['black', 'white'],
    ['king', 'queen'],
    ['girl', 'boy'],
    ['up', 'down'],
    ['high', 'low'],
]

n_step = 5  # width every word is padded to

for pair in seq_data:
    # Each entry holds a (source, target) word pair; both are padded in place.
    for pos in (0, 1):
        # Right-fill with the 'P' symbol until the word is n_step characters long.
        pair[pos] = pair[pos].ljust(n_step, 'P')
        print(pair[pos])


manPP
women
black
white
kingP
queen
girlP
boyPP
upPPP
downP
highP
lowPP


In [15]:
# Vocabulary: the three control symbols (S, E, P) followed by the letters a-z.
char_arr = list('SEPabcdefghijklmnopqrstuvwxyz')
# Map every symbol to its position in char_arr.
num_dic = dict(zip(char_arr, range(len(char_arr))))
num_dic

{'E': 1,
 'P': 2,
 'S': 0,
 'a': 3,
 'b': 4,
 'c': 5,
 'd': 6,
 'e': 7,
 'f': 8,
 'g': 9,
 'h': 10,
 'i': 11,
 'j': 12,
 'k': 13,
 'l': 14,
 'm': 15,
 'n': 16,
 'o': 17,
 'p': 18,
 'q': 19,
 'r': 20,
 's': 21,
 't': 22,
 'u': 23,
 'v': 24,
 'w': 25,
 'x': 26,
 'y': 27,
 'z': 28}

In [18]:
# Demo cell: pad the word pairs, then show the encoder-side index sequence.
input_batch, output_batch, target_batch = [], [], []
seq_data = [
    ['man', 'women'],
    ['black', 'white'],
    ['king', 'queen'],
    ['girl', 'boy'],
    ['up', 'down'],
    ['high', 'low'],
]

n_step = 5  # width every word is padded to

for pair in seq_data:
    # Pad both words of the pair in place with the 'P' symbol.
    for pos in (0, 1):
        pair[pos] = pair[pos].ljust(n_step, 'P')
    # Encoder input: the source word split into characters, each mapped to its
    # vocabulary index through num_dic.
    # NOTE: the name `input` shadows the builtin for the rest of the session.
    input = [num_dic[ch] for ch in pair[0]]
    print(input)

[15, 3, 16, 2, 2]
[4, 14, 3, 5, 13]
[13, 11, 16, 9, 2]
[9, 11, 20, 14, 2]
[23, 18, 2, 2, 2]
[10, 11, 9, 10, 2]


In [20]:
# Demo cell: pad the word pairs, then build the three index sequences used for
# training and show the target one.
input_batch, output_batch, target_batch = [], [], []
seq_data = [
    ['man', 'women'],
    ['black', 'white'],
    ['king', 'queen'],
    ['girl', 'boy'],
    ['up', 'down'],
    ['high', 'low'],
]

n_step = 5  # width every word is padded to

for pair in seq_data:
    # Pad both words of the pair in place with the 'P' symbol.
    for pos in (0, 1):
        pair[pos] = pair[pos].ljust(n_step, 'P')
    source_word, target_word = pair[0], pair[1]
    # Encoder input: indices of the source word's characters.
    # NOTE: the name `input` shadows the builtin for the rest of the session.
    input = [num_dic[ch] for ch in source_word]
    # Decoder input: the target word prefixed with the start symbol 'S'.
    output = [num_dic[ch] for ch in 'S' + target_word]
    # Training target: the target word followed by the end symbol 'E'.
    target = [num_dic[ch] for ch in target_word + 'E']
    print(target)

[25, 17, 15, 7, 16, 1]
[25, 10, 11, 22, 7, 1]
[19, 23, 7, 7, 16, 1]
[4, 17, 27, 2, 2, 1]
[6, 17, 25, 16, 2, 1]
[14, 17, 25, 2, 2, 1]


---------------------------------------------------------------------------------------------

In [22]:
import torch
import torch.nn as nn

# S: 디코딩 입력의 시작을 나타내는 심볼
# E: 디코딩 출력의 끝을 나타내는 심볼
# P: 현재 배치 데이터의 time step 크기보다 작은 경우 빈 시퀀스를 채우는 심볼
#    예) 현재 배치 데이터의 최대 크기가 4 인 경우
#       word -> ['w', 'o', 'r', 'd']
#       to   -> ['t', 'o', 'P', 'P']


if __name__ == '__main__':
    # Hyperparameters.
    n_step = 5      # width every word is padded to
    n_hidden = 128  # hidden size used by the RNN cells

    # Vocabulary: control symbols S, E, P followed by a-z, mapped to indices.
    char_arr = list('SEPabcdefghijklmnopqrstuvwxyz')
    num_dic = {symbol: index for index, symbol in enumerate(char_arr)}

    # Training data: (source word, target word) pairs.
    seq_data = [
        ['man', 'women'],
        ['black', 'white'],
        ['king', 'queen'],
        ['girl', 'boy'],
        ['up', 'down'],
        ['high', 'low'],
    ]

    n_class = len(num_dic)      # number of distinct symbols
    batch_size = len(seq_data)  # all pairs form a single batch

    # NOTE: the Seq2Seq class has to be defined before this line executes;
    # running this block first raises NameError.
    model = Seq2Seq()
    

def make_batch():
    """Turn the module-level ``seq_data`` word pairs into training tensors.

    Reads the module-level names ``seq_data``, ``n_step``, ``num_dic`` and
    ``n_class``. Every word is right-padded with the 'P' symbol to ``n_step``
    characters; the padding is applied to local copies, so ``seq_data`` itself
    is left unmodified.

    Returns:
        A tuple ``(input_batch, output_batch, target_batch)``:

        * ``input_batch``  - float tensor, one-hot rows of the padded source
          word, one sample per pair.
        * ``output_batch`` - float tensor, one-hot rows of 'S' + padded
          target word (decoder input).
        * ``target_batch`` - long tensor of vocabulary indices of the padded
          target word + 'E' (not one-hot).
    """
    # Local import: numpy is needed here but may not be imported at file level.
    import numpy as np

    # Row i of the identity matrix is the one-hot vector for class i; build it
    # once instead of once per sample.
    one_hot = np.eye(n_class)

    input_batch, output_batch, target_batch = [], [], []

    for seq in seq_data:
        # Pad copies of the two words; words already n_step long (or longer)
        # are left unchanged, exactly like `w + 'P' * (n_step - len(w))`.
        source_word = seq[0].ljust(n_step, 'P')
        target_word = seq[1].ljust(n_step, 'P')

        # Encoder input: the source word, character by character.
        enc_ids = [num_dic[ch] for ch in source_word]
        # Decoder input: start symbol 'S' followed by the target word.
        dec_ids = [num_dic[ch] for ch in 'S' + target_word]
        # Expected decoder output: the target word followed by end symbol 'E'.
        target_ids = [num_dic[ch] for ch in target_word + 'E']

        input_batch.append(one_hot[enc_ids])
        output_batch.append(one_hot[dec_ids])
        target_batch.append(target_ids)  # kept as class indices, not one-hot

    # Collapse each list of arrays into one ndarray first: building a tensor
    # straight from a list of ndarrays is very slow and triggers a warning.
    return (
        torch.FloatTensor(np.array(input_batch)),
        torch.FloatTensor(np.array(output_batch)),
        torch.LongTensor(target_batch),
    )


NameError: name 'Seq2Seq' is not defined

In [None]:
# Model
class Seq2Seq(nn.Module):
    """Encoder-decoder network built from two single-layer ``nn.RNN`` modules.

    The encoder RNN consumes the source sequence; its final hidden state is
    used as the initial hidden state of the decoder RNN. A linear layer maps
    every decoder output step to one score per class.

    Args:
        input_size: Size of each one-hot input vector and number of output
            classes. Defaults to the module-level ``n_class``.
        hidden_size: Hidden state size of both RNNs. Defaults to the
            module-level ``n_hidden``.
    """

    def __init__(self, input_size=None, hidden_size=None):
        super(Seq2Seq, self).__init__()

        # Fall back to the module-level settings so `Seq2Seq()` keeps working.
        if input_size is None:
            input_size = n_class
        if hidden_size is None:
            hidden_size = n_hidden

        # No `dropout` argument: nn.RNN only applies dropout between stacked
        # layers, so on a single-layer RNN it has no effect and merely emits
        # a UserWarning.
        self.enc_cell = nn.RNN(input_size=input_size, hidden_size=hidden_size)
        self.dec_cell = nn.RNN(input_size=input_size, hidden_size=hidden_size)
        self.fc = nn.Linear(hidden_size, input_size)

    def forward(self, enc_input, enc_hidden, dec_input):
        """Run the encoder, then the decoder, and score every decoder step.

        Args:
            enc_input: Encoder input, expected as
                [batch_size, n_step, n_class] (batch first).
            enc_hidden: Initial encoder hidden state, expected as
                [num_layers(=1) * num_directions(=1), batch_size, n_hidden].
            dec_input: Decoder input, expected as
                [batch_size, n_step + 1, n_class] (batch first).

        Returns:
            Tensor of class scores, time-major:
            [n_step + 1, batch_size, n_class].
        """
        # The RNNs are created with the default batch_first=False, so move the
        # time axis to the front: [time, batch_size, n_class].
        enc_input = enc_input.transpose(0, 1)
        dec_input = dec_input.transpose(0, 1)

        # Only the encoder's final hidden state is kept:
        # [num_layers(=1) * num_directions(=1), batch_size, n_hidden]
        _, enc_states = self.enc_cell(enc_input, enc_hidden)
        # The decoder starts from that state;
        # outputs: [n_step + 1, batch_size, n_hidden]
        outputs, _ = self.dec_cell(dec_input, enc_states)

        # Project every time step onto the classes.
        logits = self.fc(outputs)
        return logits