* 덧셈 데이터 구성
* seq2seq Encoder / Decoder 정의
* 학습
* 평가 (예측)

## 덧셈 데이터 구성

In [None]:
# Data for computing the sum of two numbers with at most three digits each

In [None]:
import numpy as np

In [None]:
# 99 + 1 => '9','9','+','1'

In [None]:
def get_sum_dict():
  """Build the token vocabulary used by the addition task.

  The vocabulary is the ten digit characters '0'..'9' followed by '_'
  and '+', numbered 0..11 in that order.

  Returns:
    A pair ``(id_to_item, item_to_id)``: the first dict maps integer ids
    to tokens, the second is its inverse (token -> id).
  """
  vocab = [str(digit) for digit in range(10)] + ['_', '+']

  id_to_item = dict(enumerate(vocab))
  item_to_id = {token: idx for idx, token in id_to_item.items()}

  return id_to_item, item_to_id

In [None]:
def convert_item_to_id(items, item_to_id):
  """Translate a sequence of tokens into a list of integer ids.

  Args:
    items: iterable of tokens.
    item_to_id: mapping from token to id.

  Returns:
    A new list holding the id of every token, in order.

  Raises:
    KeyError: if a token is missing from ``item_to_id``.
  """
  return [item_to_id[token] for token in items]

def convert_id_to_item(ids, id_to_item):
  """Translate a sequence of integer ids back into a list of tokens.

  Args:
    ids: iterable of ids.
    id_to_item: mapping from id to token.

  Returns:
    A new list holding the token of every id, in order.

  Raises:
    KeyError: if an id is missing from ``id_to_item``.
  """
  return [id_to_item[token_id] for token_id in ids]

In [None]:
def sum_data_gen():
  """Generate one random addition problem as character lists.

  Two integers are drawn from [0, 1000) with ``np.random.randint``.

  Returns:
    A pair ``(question, answer)``. ``question`` is the characters of
    "<num1>+<num2>"; ``answer`` is '_' followed by the sum zero-padded
    to four digits (e.g. 99 + 1 -> ['_', '0', '1', '0', '0']).
  """
  left = np.random.randint(0, 1000)
  right = np.random.randint(0, 1000)
  question = f"{left}+{right}"
  # The leading '_' is the first token of every answer sequence.
  answer = '_' + str(left + right).zfill(4)
  return list(question), list(answer)

In [None]:
def get_dataset(data_num=100):
  """Generate ``data_num`` random addition problems encoded as id lists.

  Args:
    data_num: number of (question, answer) pairs to generate.

  Returns:
    A pair ``(xs, ys)`` of equal-length lists; ``xs[i]`` is the id
    sequence of the i-th question and ``ys[i]`` that of its answer.
  """
  item_to_id = get_sum_dict()[1]
  xs, ys = [], []
  for _ in range(data_num):
    question, answer = sum_data_gen()
    xs.append(convert_item_to_id(question, item_to_id))
    ys.append(convert_item_to_id(answer, item_to_id))
  return xs, ys

In [None]:
# Draw 20,000 training and 10,000 test problems, already encoded as id lists.
train_x, train_y = get_dataset(data_num=20000)
test_x, test_y = get_dataset(data_num=10000)
# Lookup tables for converting between tokens and ids.
id_to_item, item_to_id = get_sum_dict()

In [None]:
import tensorflow as tf

In [None]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [None]:
# Source and target sequences share one vocabulary, so both sizes equal
# the number of entries in the id -> token dictionary.
src_vocab_size = len(id_to_item) 
tar_vocab_size = len(id_to_item)

In [None]:
# Left-pad ('pre') every sequence to a fixed length: 7 for questions
# (the longest is "999+999") and 5 for answers ('_' plus four digits).
# NOTE(review): pad_sequences presumably fills with 0, which is also the id
# of the token '0' in this vocabulary — confirm this is intended.
train_x = pad_sequences(train_x, maxlen=7, padding='pre')
test_x = pad_sequences(test_x, maxlen=7, padding='pre')
train_y = pad_sequences(train_y, maxlen=5, padding='pre')
test_y = pad_sequences(test_y, maxlen=5, padding='pre')

In [None]:
# Teacher-forcing targets: the decoder input shifted one step to the left,
# so that at step t the decoder is trained to predict token t+1.
train_y_t = train_y[:,1:]
test_y_t = test_y[:,1:]

# Restore length 5 by padding at the END ('post'). Padding at the front
# would re-align the targets with the decoder inputs, and the decoder would
# merely learn to copy its own input instead of predicting the next token.
# NOTE(review): the fill value is presumably 0, which is also the id of the
# token '0'; a dedicated end-of-sequence token would be cleaner — confirm.
train_y_t = pad_sequences(train_y_t, maxlen=5, padding='post')
test_y_t = pad_sequences(test_y_t, maxlen=5, padding='post')

In [None]:
# One-hot encode every id sequence. The class counts are the vocabulary
# sizes defined earlier in the file: encoder inputs use the source
# vocabulary, decoder inputs and targets use the target vocabulary.
# (The bare name `vocab_size` is not defined anywhere in the visible file.)
train_x = to_categorical(train_x, num_classes=src_vocab_size)
test_x = to_categorical(test_x, num_classes=src_vocab_size)
train_y = to_categorical(train_y, num_classes=tar_vocab_size)
test_y = to_categorical(test_y, num_classes=tar_vocab_size)
train_y_t = to_categorical(train_y_t, num_classes=tar_vocab_size)
test_y_t = to_categorical(test_y_t, num_classes=tar_vocab_size)

In [None]:
# Sanity check: display the shapes of all one-hot encoded arrays.
train_x.shape, test_x.shape, train_y.shape, test_y.shape, train_y_t.shape, test_y_t.shape

((20000, 7, 12),
 (10000, 7, 12),
 (20000, 5, 12),
 (10000, 5, 12),
 (20000, 5, 12),
 (10000, 5, 12))

In [None]:
# Peek at the first two one-hot encoded target sequences.
train_y_t[:2]

array([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]]], dtype=float32)

In [None]:
# Encoder
# Unused alternative kept for reference: an Embedding layer instead of one-hot input.
# encoder_input = Embedding(vocab_size, 5,  input_length=7)
# Input: one-hot token sequences of unspecified length (None time steps).
encoder_inputs = Input(shape=(None, src_vocab_size))
# 8-unit LSTM; return_state=True makes the call below yield three values
# (output, hidden state h, cell state c).
encoder_lstm = LSTM(8, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)

# Only the final states are kept; they are handed to the decoder LSTM as
# its initial state.
encoder_states = [state_h, state_c]

In [None]:
# Decoder
# Input: one-hot target token sequences of unspecified length.
decoder_inputs = Input(shape=(None, tar_vocab_size))
# return_sequences=True so that an output is produced for every time step.
decoder_lstm = LSTM(8, return_state=True, return_sequences=True)
# The call returns (output, h, c); the decoder's h and c are not used here,
# so they are bound to _.
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

# Per-step softmax over the target vocabulary.
decoder_softmax_layer = Dense(tar_vocab_size, activation='softmax')
decoder_outputs = decoder_softmax_layer(decoder_outputs)

# Model(input, output): takes [encoder input, decoder input] and returns the
# decoder's per-step token distribution.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
# Print the layer-by-layer overview of the assembled model.
model.summary()

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            [(None, None, 12)]   0                                            
__________________________________________________________________________________________________
input_9 (InputLayer)            [(None, None, 12)]   0                                            
__________________________________________________________________________________________________
lstm_10 (LSTM)                  [(None, 8), (None, 8 672         input_8[0][0]                    
__________________________________________________________________________________________________
lstm_11 (LSTM)                  [(None, None, 8), (N 672         input_9[0][0]                    
                                                                 lstm_10[0][1]         

In [None]:
# train_x.shape, test_x.shape, 
# train_y.shape, test_y.shape, 
# train_y_t.shape, test_y_t.shape

In [None]:
# Categorical cross-entropy matches the one-hot encoded targets and the
# softmax output layer; Adam is used as the optimizer.
model.compile(optimizer='adam', loss='categorical_crossentropy') 

In [None]:
# Train with teacher forcing: the encoder receives the question (train_x),
# the decoder receives the answer sequence (train_y), and the loss is
# computed against train_y_t. 20% of the training data is held out for
# validation.
model.fit(x=[train_x, train_y], 
              y=train_y_t, 
              batch_size=512, 
              epochs=50, 
              validation_split=0.2)             

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7fc383051ba8>