# Neural Machine Translation by Jointly Learning to Align and Translate

Use: attention mechanism, GRUCell

- The encoder: Bidirectional RNN
- To do: build the encoder (a bidirectional RNN using GRU cells) and the decoder
- 문장 -> 단어(tokenization) -> embedding -> enc-dec -> loss check -> embedding -> output

This is not a complete implementation of the paper and it may raise errors in several places; it is meant to illustrate the process described in the paper.



In [None]:
import tensorflow as tf
from tensorflow.python.layers.core import Dense
import math
from tensorflow.contrib import rnn
import re

# Start from an empty default graph and open an interactive session.
# NOTE(review): this session is never closed in the visible code, and the
# training section later opens its own tf.Session — confirm both are wanted.
tf.reset_default_graph()
sess = tf.InteractiveSession()

# Extra vocabulary symbols
_GO = '_GO'
_EOS = '_EOS'
_UNK = '_UNK'

# The position of each symbol in this list is its reserved vocabulary id.
extra_tokens = [_GO, _EOS, _UNK]

start_token = extra_tokens.index(_GO)   # start = 0
end_token = extra_tokens.index(_EOS)    # end = 1
unk_token = extra_tokens.index(_UNK)    # unknown = 2

# Punctuation characters used to split fragments (basic_tokenizer) and to
# strip punctuation from the raw text (make_data).
_WORD_SPLIT = re.compile(r'[,.!?"\':;)(]')

# Hyper-parameters.
hidden_units = 1000            # size of the GRU cells and of the attention layer
embedding_size = 620           # width of the encoder/decoder embedding tables
max_decode_step = 500          # maximum_iterations for the inference decoder
alignment_hidden_units = 1000  # not referenced anywhere in the visible code
learning_rate = 0.01           # passed to the Adam optimizer

# Toy parallel corpus: Korean source text and its English translation, one
# sentence per period.
inputs = "정부는 공공기관 주차장을 전면 폐쇄하고 관용차 3만3000여대 운행을 중단한다. 15일 첫 시행 당시 여러 허점을 노출한 터라 서울시 미세먼지 대책 실효성 논란은 더욱 커질 전망이다. 나는 개를 좋아한다. 나는 수박을 좋아한다. 나는 너를 사랑한다. 나는 컴퓨터를 사랑한다."
outputs = "The government will shut down parking lots at public institutions and shut down the operation of about 33000 private cars. The Seoul metropolitan government is expected to increase the controversy on the effectiveness of its countermeasures. I like dogs. I like watermelons. I love you. I love computers."
# English rendering of the note below: "inputs and outputs may instead be read
# from a file; they only need to be str."
"""
inputs, outputs는 파일에서 읽어오는 방식으로 바꿔도 가능, str 형태면 됨
"""


def basic_tokenizer(sentence):
    """Split *sentence* into word tokens wrapped in the _GO/_EOS markers.

    The sentence is stripped and split on whitespace; each fragment is then
    split again on the punctuation matched by ``_WORD_SPLIT``. Empty pieces
    produced by that second split are discarded.

    Args:
        sentence: the text of a single sentence.

    Returns:
        A list of the form ``[_GO, token, ..., _EOS]``.
    """
    pieces = (
        piece
        for fragment in sentence.strip().split()
        for piece in _WORD_SPLIT.split(fragment)
    )

    tokens = [_GO]
    tokens.extend(piece for piece in pieces if piece)
    tokens.append(_EOS)
    return tokens


def make_data(inputs):
    """Convert period-delimited text into per-sentence lists of token ids.

    The text is split into sentences on ``'.'``; each sentence is tokenized
    with ``basic_tokenizer`` (which wraps it in ``_GO`` / ``_EOS``). The
    vocabulary is built from those very tokens, so every token is guaranteed
    to have an id. Ids 0..2 are reserved for ``_GO``, ``_EOS`` and ``_UNK``;
    ordinary words follow in sorted order so that ids are reproducible
    between runs.

    Args:
        inputs: the whole corpus as a single ``str``.

    Returns:
        A tuple ``(index_tensor, num_symbols)`` where ``index_tensor`` is a
        list with one list of ids per sentence (lengths may differ) and
        ``num_symbols`` is the vocabulary size *including* the three reserved
        symbols, i.e. one more than the largest id that can occur. It is
        therefore safe to use as the first dimension of an embedding table.
    """
    # Blank fragments (e.g. the empty string after the final period) carry no
    # words and are dropped.
    sentences = [sent for sent in inputs.split(".") if sent.strip()]
    tokenized = [basic_tokenizer(sent) for sent in sentences]

    # Reserved symbols keep their fixed ids.
    word2idx = {_GO: start_token, _EOS: end_token, _UNK: unk_token}

    # Sorting makes the word -> id mapping deterministic (set order is not).
    vocabulary = sorted(
        {word for tokens in tokenized for word in tokens} - set(word2idx))
    offset = len(word2idx)
    for position, word in enumerate(vocabulary):
        word2idx[word] = position + offset

    index_tensor = [[word2idx[word] for word in tokens] for tokens in tokenized]

    return index_tensor, len(word2idx)

# ---------------------------------------------------------------------------
# Data -> graph inputs
# ---------------------------------------------------------------------------
encoder_inputs, num_encoder_symbols = make_data(inputs)
decoder_correct, num_decoder_symbols = make_data(outputs)

# Sentences have different lengths, but TensorFlow needs a rectangular id
# matrix. Pad every sentence to the longest one with the _EOS id; the true
# lengths are passed separately so the padding is ignored by the RNN and by
# the attention mechanism. ``encoder_inputs`` itself is left untouched.
encoder_lengths = [len(ids) for ids in encoder_inputs]
max_encoder_length = max(encoder_lengths)
encoder_inputs_padded = [
    ids + [end_token] * (max_encoder_length - len(ids)) for ids in encoder_inputs]

# Still feedable under the same name, but defaults to the real sentence
# lengths so the graph can be run without an explicit feed.
encoder_inputs_length = tf.placeholder_with_default(
    tf.constant(encoder_lengths, dtype=tf.int32),
    shape=(None,), name='encoder_inputs_length')
keep_prob_placeholder = tf.placeholder(tf.float32, shape=[], name='keep_prob')
batch_size = tf.shape(encoder_inputs_padded)[0]

# ---------------------------------------------------------------------------
# Encoder: bidirectional GRU
# ---------------------------------------------------------------------------
enc_fw_cell = rnn.GRUCell(hidden_units)
enc_bw_cell = rnn.GRUCell(hidden_units)

# Initialize encoder_embeddings to have variance=1.
sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
initializer = tf.random_uniform_initializer(-sqrt3, sqrt3, dtype=tf.float32)

# The encoder and decoder tables need distinct variable names: two
# tf.get_variable calls with the same name in one scope raise ValueError.
encoder_embeddings = tf.get_variable(name='encoder_embedding', initializer=initializer,
                                     dtype=tf.float32,
                                     shape=[num_encoder_symbols, embedding_size])
encoder_inputs_embedded = tf.nn.embedding_lookup(
    params=encoder_embeddings, ids=encoder_inputs_padded)

# Projects embeddings to the RNN width; the same layer is reused for the
# decoder inputs inside decoding().
input_layer = Dense(hidden_units, dtype=tf.float32, name='input_projection')

encoder_inputs_embedded = input_layer(encoder_inputs_embedded)

encoder_outputs, encoder_output_states = tf.nn.bidirectional_dynamic_rnn(
    cell_fw=enc_fw_cell,
    cell_bw=enc_bw_cell,
    inputs=encoder_inputs_embedded,
    dtype=tf.float32,
    sequence_length=encoder_inputs_length)

# ---------------------------------------------------------------------------
# Attention + decoder cell
# ---------------------------------------------------------------------------
# bidirectional_dynamic_rnn returns an (output_fw, output_bw) tuple; the
# attention memory must be one tensor, so join the two directions on the
# feature axis.
attention_memory = tf.concat(encoder_outputs, axis=-1)

# Building attention mechanism: Default Bahdanau.
# memory_sequence_length is the per-sentence length (not the encoder states),
# so that padded positions receive no attention weight.
attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
    num_units=hidden_units, memory=attention_memory,
    memory_sequence_length=encoder_inputs_length)

decoder_cell = rnn.GRUCell(hidden_units)
initial_state = list(encoder_output_states)

# Not used anywhere in the visible code; kept so the name stays defined.
# AttentionWrapper's keyword is ``attention_layer_size`` (``attention_size``
# is not accepted).
attn_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism,
                                                attention_layer_size=hidden_units,
                                                name="attention_init")
wrapped_decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
    cell=decoder_cell,
    attention_mechanism=attention_mechanism,
    attention_layer_size=hidden_units,
    initial_cell_state=encoder_output_states[-1],  # final backward-GRU state
    name='Attention_Wrapper')

# A cell wrapped in AttentionWrapper must be started from an
# AttentionWrapperState. zero_state() builds one and, because
# initial_cell_state was supplied above, uses the encoder state as the inner
# cell state.
decoder_initial_state = wrapped_decoder_cell.zero_state(batch_size, tf.float32)

decoder_embeddings = tf.get_variable(name='decoder_embedding',
                                     shape=[num_decoder_symbols, embedding_size],
                                     initializer=initializer, dtype=tf.float32)

# Maps decoder outputs to logits over the target vocabulary.
output_layer = Dense(num_decoder_symbols, name='output_projection')


def _build_training_loss():
    """Build the teacher-forced training decoder and return its loss tensor.

    Creates two placeholders, ``decoder_inputs`` (int32 ids, rank 2) and
    ``decoder_inputs_length`` (int32, rank 1). The start token is prepended
    to form the decoder inputs and the end token is appended to form the
    targets, so the fed ids must not already contain them.
    """
    decoder_inputs = tf.placeholder(
        dtype=tf.int32, shape=(None, None), name='decoder_inputs')
    decoder_inputs_length = tf.placeholder(
        dtype=tf.int32, shape=(None,), name='decoder_inputs_length')

    decoder_start_token = tf.ones(
        shape=[batch_size, 1], dtype=tf.int32) * start_token
    decoder_end_token = tf.ones(
        shape=[batch_size, 1], dtype=tf.int32) * end_token

    # Inputs are shifted right by one (_GO first); targets end with _EOS.
    decoder_inputs_train = tf.concat([decoder_start_token,
                                      decoder_inputs], axis=1)
    decoder_inputs_length_train = decoder_inputs_length + 1
    decoder_targets_train = tf.concat([decoder_inputs,
                                       decoder_end_token], axis=1)

    decoder_inputs_embedded = tf.nn.embedding_lookup(
        params=decoder_embeddings, ids=decoder_inputs_train)
    decoder_inputs_embedded = input_layer(decoder_inputs_embedded)

    training_helper = tf.contrib.seq2seq.TrainingHelper(
        inputs=decoder_inputs_embedded,
        sequence_length=decoder_inputs_length_train,
        time_major=False,
        name='training_helper')

    training_decoder = tf.contrib.seq2seq.BasicDecoder(
        cell=wrapped_decoder_cell,
        helper=training_helper,
        initial_state=decoder_initial_state,
        output_layer=output_layer)

    max_decoder_length = tf.reduce_max(decoder_inputs_length_train)

    decoder_outputs_train, _, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder=training_decoder,
        output_time_major=False,
        impute_finished=True,
        maximum_iterations=max_decoder_length)

    decoder_logits_train = tf.identity(decoder_outputs_train.rnn_output)

    # Weight 1.0 for real time steps, 0.0 for padding.
    masks = tf.sequence_mask(lengths=decoder_inputs_length_train,
                             maxlen=max_decoder_length, dtype=tf.float32,
                             name='masks')

    return tf.contrib.seq2seq.sequence_loss(logits=decoder_logits_train,
                                            targets=decoder_targets_train,
                                            weights=masks,
                                            average_across_timesteps=True,
                                            average_across_batch=True)


def _build_greedy_prediction():
    """Build the greedy inference decoder and return the predicted ids.

    Decoding starts from ``start_token`` for every batch entry and runs for
    at most ``max_decode_step`` steps. The returned tensor is the decoder's
    ``sample_id`` with one extra trailing axis.
    """
    start_tokens = tf.ones([batch_size, ], tf.int32) * start_token

    def embed_and_input_proj(ids):
        # Same embedding + projection path the training decoder uses.
        return input_layer(tf.nn.embedding_lookup(decoder_embeddings, ids))

    decoding_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        start_tokens=start_tokens,
        end_token=end_token,
        embedding=embed_and_input_proj)

    inference_decoder = tf.contrib.seq2seq.BasicDecoder(
        cell=wrapped_decoder_cell,
        helper=decoding_helper,
        initial_state=decoder_initial_state,
        output_layer=output_layer)

    # impute_finished is left at its default: the original author noted that
    # enabling it here raised an error.
    decoder_outputs_decode, _, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder=inference_decoder,
        output_time_major=False,
        maximum_iterations=max_decode_step)

    return tf.expand_dims(decoder_outputs_decode.sample_id, -1)


def decoding(mode):
    """Build the decoder sub-graph for *mode* and return its output tensor.

    Every call adds new operations to the default graph, so call it once per
    mode and reuse the returned tensor.
    NOTE(review): whether the training and inference decoders share their
    variables depends on the TensorFlow version — confirm before relying on
    the inference output.

    Args:
        mode: ``'train'`` to get the sequence loss of the teacher-forced
            decoder, ``'decode'`` to get the greedy predictions.

    Returns:
        The loss tensor (``'train'``) or the predicted-id tensor
        (``'decode'``).

    Raises:
        ValueError: if *mode* is neither ``'train'`` nor ``'decode'``.
    """
    if mode == 'train':
        return _build_training_loss()
    if mode == 'decode':
        return _build_greedy_prediction()
    raise ValueError(
        "mode must be 'train' or 'decode', got {!r}".format(mode))


def check_feeds(encoder_inputs, encoder_inputs_length,
                decoder_inputs, decoder_inputs_length, decode):
    """Assemble a feed dictionary keyed by each argument's ``name``.

    NOTE(review): every argument is used both as the key source (``.name``)
    and as the fed value, and ``encoder_inputs`` must also expose ``.shape``.
    Presumably the keys were meant to come from graph placeholders and the
    values from data arrays — confirm the intended contract with the callers.

    Args:
        encoder_inputs: object with ``name`` and ``shape`` attributes.
        encoder_inputs_length: object with a ``name`` attribute.
        decoder_inputs: object with a ``name`` attribute; only read when
            ``decode`` is false.
        decoder_inputs_length: object with a ``name`` attribute; only read
            when ``decode`` is false.
        decode: when true, the two decoder entries are left out.

    Returns:
        A dict mapping ``<argument>.name`` to the argument itself.
    """
    # Read but never used afterwards; kept so that an argument without a
    # ``shape`` fails here exactly as before.
    input_batch_size = encoder_inputs.shape[0]

    feed = {
        encoder_inputs.name: encoder_inputs,
        encoder_inputs_length.name: encoder_inputs_length,
    }
    if decode:
        return feed

    feed[decoder_inputs.name] = decoder_inputs
    feed[decoder_inputs_length.name] = decoder_inputs_length
    return feed


# Inference op per graph, so that repeated predict() calls reuse one decoder
# instead of adding a new copy of it to the graph every time.
_DECODE_OP_CACHE = {}


def predict(sess, encoder_inputs, encoder_inputs_length):
    """Run the greedy inference decoder and return the predicted ids.

    The inference sub-graph is built by ``decoding('decode')`` on the first
    call for a given graph and reused afterwards; building it on every call
    would keep growing the graph (for example once per training step).

    Args:
        sess: the session to run in; its ``graph`` attribute keys the cache.
        encoder_inputs: forwarded to ``check_feeds``.
        encoder_inputs_length: forwarded to ``check_feeds``.

    Returns:
        The evaluated output of the inference decoder.
    """
    graph = sess.graph
    if graph not in _DECODE_OP_CACHE:
        _DECODE_OP_CACHE[graph] = decoding('decode')
    decoder_pred_decode = _DECODE_OP_CACHE[graph]

    input_feed = check_feeds(encoder_inputs, encoder_inputs_length,
                             decoder_inputs=None, decoder_inputs_length=None,
                             decode=True)

    # keep_prob is fed as 1.0 at inference time.
    input_feed[keep_prob_placeholder.name] = 1.0

    return sess.run(decoder_pred_decode, input_feed)

sequence_loss = decoding('train')
loss = tf.reduce_mean(sequence_loss)
train = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

# Greedy-decoded target ids. (``outputs`` is the raw target *string*, so it
# cannot be passed to tf.argmax.) The inference decoder is built once, before
# the variables are initialised, rather than inside the training loop.
prediction = decoding('decode')

# decoding('train') creates its placeholders internally and returns only the
# loss, so look them up by the names it gives them.
# NOTE(review): this assumes decoding('train') was called exactly once on a
# fresh graph; otherwise TensorFlow suffixes the names (decoder_inputs_1, ...).
_graph = tf.get_default_graph()
decoder_inputs_ph = _graph.get_tensor_by_name('decoder_inputs:0')
decoder_inputs_length_ph = _graph.get_tensor_by_name('decoder_inputs_length:0')

# basic_tokenizer wraps every sentence in _GO ... _EOS, and the training
# decoder adds both markers again itself, so strip them before feeding.
decoder_sequences = [ids[1:-1] for ids in decoder_correct]
decoder_lengths = [len(ids) for ids in decoder_sequences]
max_decoder_len = max(decoder_lengths)
# Pad with the _EOS id so that the target right after each sentence's last
# word is _EOS; everything beyond that is masked out of the loss.
decoder_batch = [ids + [end_token] * (max_decoder_len - len(ids))
                 for ids in decoder_sequences]

encoder_lengths_feed = [len(ids) for ids in encoder_inputs]

train_feed = {
    encoder_inputs_length: encoder_lengths_feed,
    decoder_inputs_ph: decoder_batch,
    decoder_inputs_length_ph: decoder_lengths,
}
infer_feed = {encoder_inputs_length: encoder_lengths_feed}

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # The step count follows the original loop bound (the source vocabulary
    # size); adjust as needed.
    for step in range(num_encoder_symbols):
        # Fetch into ``l`` rather than overwriting the ``loss`` tensor.
        l, _ = sess.run([loss, train], feed_dict=train_feed)
        # predict()/check_feeds() key the feed dict on the ``.name`` of the
        # very objects they are given as values, so they cannot take plain
        # Python data; the decode op is therefore run directly.
        result = sess.run(prediction, feed_dict=infer_feed)