In [1]:
import tensorflow as tf
from tensorflow.python.layers.core import Dense
import numpy as np
import time

In [2]:
with open('data/letters_source.txt', 'r', encoding='utf-8') as f:
    source_data = f.read()
    
with open('data/letters_target.txt', 'r', encoding='utf-8') as f:
    target_data = f.read()

In [3]:
# 数据预览
source_data.split('\n')[:10]

['bsaqq',
 'npy',
 'lbwuj',
 'bqv',
 'kial',
 'tddam',
 'edxpjpg',
 'nspv',
 'huloz',
 'kmclq']

In [4]:
target_data.split('\n')[:10]

['abqqs',
 'npy',
 'bjluw',
 'bqv',
 'aikl',
 'addmt',
 'degjppx',
 'npsv',
 'hlouz',
 'cklmq']

In [6]:
# 数据预处理
def extract_character_vocab(data):
    # 构造映射表
    special_words = ['<PAD>', '<UNK>', '<GO>', '<EOS>']
    
    set_words = list(set([character for line in data.split('\n') for character in line]))
    # 这里要把四个特殊字符添加进词典
    int_to_vocab = {idx: word for idx, word in enumerate(special_words + set_words)}
    vocab_to_int = {word: idx for idx, word in int_to_vocab.items()}
    
    return int_to_vocab, vocab_to_int

source_int_to_letter, source_letter_to_int = extract_character_vocab(source_data)
target_int_to_letter, target_letter_to_int = extract_character_vocab(target_data)
len(source_letter_to_int)
len(target_letter_to_int)

30

In [6]:
# 构造映射表
source_int_to_letter, source_letter_to_int = extract_character_vocab(source_data)
target_int_to_letter, target_letter_to_int = extract_character_vocab(target_data)

# 对字母进行转换
source_int = [[source_letter_to_int.get(letter, source_letter_to_int['<UNK>'])
               for letter in line] for line in source_data.split('\n')]
target_int = [[target_letter_to_int.get(letter, target_letter_to_int['<UNK>'])
               for letter in line] + [target_letter_to_int['<EOS>']] for line in target_data.split('\n')]

In [7]:
source_int[:10]

[[17, 22, 8, 24, 24],
 [20, 9, 4],
 [11, 17, 19, 29, 12],
 [17, 24, 16],
 [14, 5, 8, 11],
 [26, 28, 28, 8, 10],
 [6, 28, 23, 9, 12, 9, 21],
 [20, 22, 9, 16],
 [18, 29, 11, 13, 15],
 [14, 10, 7, 11, 24]]

In [8]:
target_int[:10]

[[8, 17, 24, 24, 22, 3],
 [20, 9, 4, 3],
 [17, 12, 11, 29, 19, 3],
 [17, 24, 16, 3],
 [8, 5, 14, 11, 3],
 [8, 28, 28, 10, 26, 3],
 [28, 6, 21, 12, 9, 9, 23, 3],
 [20, 9, 22, 16, 3],
 [18, 11, 13, 29, 15, 3],
 [7, 14, 11, 10, 24, 3]]

# 构建模型

In [9]:
# 输入层

def get_inputs():
    # 模型输入tensor
    inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    learning_rate = tf.placeholder(tf.float32, name='learning_rate')
    
    # 定义target序列最大长度（之后target_sequence_length和source_sequence_length会作为feed_dict的参数）
    target_sequence_length = tf.placeholder(tf.int32, (None, ), name='target_sequence_length')
    max_target_sequence_length = tf.reduce_max(target_sequence_length, name='max_target_len')
    source_sequence_length = tf.placeholder(tf.int32, (None, ), name='source_sequence_length')
    
    return inputs, targets, learning_rate, target_sequence_length, max_target_sequence_length,source_sequence_length

# Encoder
在encoder端，我们需要进行两步，第一步要对我们的输入进行Embedding,再把Embedding以后的向量传给RNN进行处理。在Embedding中，我们使用tf.contrib.layers.embed_sequence,它会对每个batch执行embedding操作
- tf.contrib.layers.embed_sequence
对序列数据执行embedding操作，输入[batch_size, sequence_length]中的tensor,返回[batch_size,sequence_length,embed_dim]中的tensor
features = [[1,2,3],[4,5,6]]
outputs=tf.contrib.layers.embed_sequence(features,vocab_size,embed_dim)
如果embed_dim=4,输出结果为
[
[[0.1,0.2,0.3,0.1],[0.2,0.5,0.7,0.2],[0.1,0.6,0.1,0.2]],
[[0.6,0.2,0.8,0.2],[0.5,0.6,0.9,0.2],[0.3,0.9,0.2,0.2]]
]
- tf.contrib.rnn.MultiRNNCell
对RNN单元进行序列堆叠。接受参数为一个由RNN cell组成的list.
mn_size代表一个rnn单元中隐层节点数量，layer_nums代表堆叠的rnn cell个数
- tf.nn.dynamic_rnn:
构建RNN,接受动态输入序列。返回RNN的输出以及最终状态的tensor。
dynamic_rnn与rnn的区别在于，dynamic_rnn对于不同的batch,可以接受不同的sequence_length。
例如，第一个batch是[batch_size, 10]，第二个batch是[batch_size,20]。而rnn只能接收定长的sequence_length

In [10]:
def get_encoder_layer(input_data, run_size, num_layers, source_sequence_length, source_vocab_size,
                      encoding_embedding_size):
    '''
    构造Encoder层
    
    参数说明
    - input_data: 输入tensor
    - run_size: rnn隐层节点数量
    - num_layers: 堆叠的rnn cell数量
    - source_sequence_length: 源数据的序列长度
    - source_vocab_size: 源数据的词典大小
    - encoding_embedding_size: embedding的大小
    '''
    # Encoder embedding
    encoder_embed_input = tf.contrib.layers.embed_sequence(input_data, source_vocab_size, encoding_embedding_size)
    
    # RNN cell
    def get_lstm_cell(rnn_size):
        lstm_cell = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        return lstm_cell
    
    cell = tf.contrib.rnn.MultiRNNCell([get_lstm_cell(run_size) for _ in range(num_layers)])
    
    encoder_output, encoder_state = tf.nn.dynamic_rnn(cell, encoder_embed_input, sequence_length=source_sequence_length, dtype=tf.float32)
    
    return encoder_output, encoder_state

# Decoder
该模型中最复杂的一部分

- 处理解码器输入
- 设置解码器组件
- 嵌入Embedding
- 解码器单元Decoder cell
- 全连接输出层Dense output layer
- 训练解码器
- 推理解码器Inference decoder
- Process Decoder Input
- 目标序列target sequences将被用于两种不同的地方：

计算损失
训练阶段传入解码器中，使模型鲁棒性更好
在传入解码器之前，我们需要做一个小小的转换：

在当前每一步，我们将序列的前一个字母做为输入。考虑到最后一步，解码器输出了最后一个字母。该步骤的解码器输入是倒数第2个字母，所以解码器的最后一个字母是无法作为输入的，我们移除掉。使用 tensorflow’s tf.strided_slice() 方法。传入tensor，截取起始索引号，和结束索引号。
对任何解码器的输入，其第一个序列位置都是GO 符号。将其加入到我们的输入中。

In [11]:
# 处理解码器输入：
def process_decoder_input(target_data, vocab_to_int, batch_size):
    '''Remove the last word id from each batch and
    concat the <GO> to the begining of each batch'''
    ending = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
    #tf.concat(list,axis) axis代表是沿着第几维度进行拼接。
    dec_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), ending], 1)

    return dec_input

# Set up the decoder components
- 嵌入Embedding
- 解码器单元Decoder cell
- 全连接输出层Dense output layer
- 训练解码器
- 推理解码器Inference decoder

1、Embedding
首先将解码器输入嵌入转换。
我们创建嵌入矩阵（解码器的），然后使用 函数 tf.nn.embedding_lookup将解码器输入转换成嵌入向量。

2、Decoder Cell
声明解码器单元（cell）,和编码器一样，使用函数 tf.contrib.rnn.LSTMCell
我们需要声明2个解码器，一个是用于训练，一个用于推理（即预测）。并且，这两个解码器共享权重。（所以我们才可以通过训练所学习到的权重来进行推理）
首先，我们定义用于解码器RNNS的单元的类型，建议用 LSTM

3、Dense output layer
创建全连接输出层。 可以使用tensorflow.contrib.layers.Dense函数。目的：将解码器的输出转换为logits，该logits告诉我们在每一个时间步骤中概率最大的那个字母，并传入下一步

4、Training decoder
我们需要创建2个共享权重的解码器：一个用于训练，一个用于推理。共同之处在于：都使用 tf.contrib.seq2seq.BasicDecoder 和 tf.contrib.seq2seq.dynamic_decode创建。不同在于：在每一个时间步骤中，我们会将目标序列传入训练解码器进行训练。
训练解码器 不会将每一步的预测值传给下一步。而是将目标序列作为输入传入给训练解码器。

5、Inference decoder
推理解码器，是用于部署模型的解码器
将编码器的hidden state，同时传入：训练解码器和推理解码器，并处理。TensorFlow将处理步骤已经封装好，需要我们调用恰当的方法（tf.contrib.seq2seq ） ，并传入相应的数据。


In [12]:
def decoding_layer(target_letter_to_int, decoding_embedding_size, num_layers, rnn_size,
                   target_sequence_length, max_target_sequence_length, enc_state,
                   dec_input):
    # 1. Decoder Embedding
    target_vocab_size = len(target_letter_to_int)
    dec_embeddings = tf.Variable(tf.random_uniform([target_vocab_size,
                                                    decoding_embedding_size]))
    dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, dec_input)

    # 2. Construct the decoder cell
    def make_cell(rnn_size):
        dec_cell = tf.contrib.rnn.LSTMCell(rnn_size,
                                           initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        return dec_cell

    dec_cell = tf.contrib.rnn.MultiRNNCell([make_cell(rnn_size) for _ in range(num_layers)])
     
    # 3. Dense layer to translate the decoder's output at each time 
    # step into a choice from the target vocabulary
    output_layer = Dense(target_vocab_size,
                         kernel_initializer = tf.truncated_normal_initializer(mean = 0.0, stddev=0.1))


    # 4. Set up a training decoder and an inference decoder
    # Training Decoder
    with tf.variable_scope("decode"):

        # Helper for the training process. Used by BasicDecoder to read inputs.
        training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embed_input,
                                                            sequence_length=target_sequence_length,
                                                            time_major=False)
        
        
        # Basic decoder
        training_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,
                                                           training_helper,
                                                           enc_state,
                                                           output_layer) 
        
        # Perform dynamic decoding using the decoder
        training_decoder_output = tf.contrib.seq2seq.dynamic_decode(training_decoder,
                                                                   impute_finished=True,
                                            maximum_iterations=max_target_sequence_length)[0]
    # 5. Inference Decoder
    # Reuses the same parameters trained by the training process
    with tf.variable_scope("decode", reuse=True):
        start_tokens = tf.tile(tf.constant([target_letter_to_int['<GO>']], 
                                           dtype=tf.int32), 
                                           [batch_size], name='start_tokens')

        # Helper for the inference process.
        inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(dec_embeddings,
                                                                start_tokens,
                                                                target_letter_to_int['<EOS>'])

        # Basic decoder
        inference_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,
                                                        inference_helper,
                                                        enc_state,
                                                        output_layer)
        
        # Perform dynamic decoding using the decoder
        inference_decoder_output = tf.contrib.seq2seq.dynamic_decode(inference_decoder,
                                                            impute_finished=True,
                                                            maximum_iterations=max_target_sequence_length)[0]
         

    
    return training_decoder_output, inference_decoder_output


In [13]:
def seq2seq_model(input_data, targets, lr, target_sequence_length, 
                  max_target_sequence_length, source_sequence_length,
                  source_vocab_size, target_vocab_size,
                  enc_embedding_size, dec_embedding_size, 
                  rnn_size, num_layers):
    
    # 将input data输入编码器，忽略输出
    _, enc_state = get_encoder_layer(input_data, 
                                  rnn_size, 
                                  num_layers, 
                                  source_sequence_length,
                                  source_vocab_size, 
                                  encoding_embedding_size)
    
    
    # 预处理解码器输入（目标序列）
    dec_input = process_decoder_input(targets, target_letter_to_int, batch_size)
    
    # 将encoder state 和 decoder inputs 输入解码器中
    training_decoder_output, inference_decoder_output = decoding_layer(target_letter_to_int, 
                                                                       decoding_embedding_size, 
                                                                       num_layers, 
                                                                       rnn_size,
                                                                       target_sequence_length,
                                                                       max_target_sequence_length,
                                                                       enc_state, 
                                                                       dec_input) 
    
    return training_decoder_output, inference_decoder_output

In [14]:
# 超参数
# Number of Epochs
epochs = 60
# Batch Size 
batch_size = 128
# RNN Size
rnn_size = 50
# Number of Layers
num_layers = 2
# Embedding Size
encoding_embedding_size = 15
decoding_embedding_size = 15
# Learning Rate
learning_rate = 0.001

In [16]:
# 构造graph
train_graph = tf.Graph()
# Set the graph to default to ensure that it is ready for training
with train_graph.as_default():
    
    # Load the model inputs    
    input_data, targets, lr, target_sequence_length, max_target_sequence_length, source_sequence_length = get_inputs()
    
    # Create the training and inference logits
    training_decoder_output, inference_decoder_output = seq2seq_model(input_data, 
                                                                      targets, 
                                                                      lr, 
                                                                      target_sequence_length, 
                                                                      max_target_sequence_length, 
                                                                      source_sequence_length,
                                                                      len(source_letter_to_int),
                                                                      len(target_letter_to_int),
                                                                      encoding_embedding_size, 
                                                                      decoding_embedding_size, 
                                                                      rnn_size, 
                                                                      num_layers)    
    
    # Create tensors for the training logits and inference logits
    #tf.identity(）返回与输入张量或值相同形状和内容的张量
    training_logits = tf.identity(training_decoder_output.rnn_output, 'logits')
    inference_logits = tf.identity(inference_decoder_output.sample_id, name='predictions')
    
    # Create the weights for sequence_loss
    masks = tf.sequence_mask(target_sequence_length, max_target_sequence_length, 
                             dtype=tf.float32, name='masks')

    with tf.name_scope("optimization"):
        
        # Loss function
        cost = tf.contrib.seq2seq.sequence_loss(
            training_logits,
            targets,
            masks)

        # Optimizer
        optimizer = tf.train.AdamOptimizer(lr)

        # Gradient Clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)

In [17]:
def pad_sentence_batch(sentence_batch, pad_int):
    """Pad sentences with <PAD> so that each sentence of a batch has the same length"""
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [pad_int] * (max_sentence - len(sentence)) for sentence in sentence_batch]
def get_batches(targets, sources, batch_size, source_pad_int, target_pad_int):
    """Batch targets, sources, and the lengths of their sentences together"""
    for batch_i in range(0, len(sources)//batch_size):
        start_i = batch_i * batch_size
        sources_batch = sources[start_i:start_i + batch_size]
        targets_batch = targets[start_i:start_i + batch_size]
        pad_sources_batch = np.array(pad_sentence_batch(sources_batch, source_pad_int))
        pad_targets_batch = np.array(pad_sentence_batch(targets_batch, target_pad_int))
        
        # Need the lengths for the _lengths parameters
        pad_targets_lengths = []
        for target in pad_targets_batch:
            pad_targets_lengths.append(len(target))
        
        pad_source_lengths = []
        for source in pad_sources_batch:
            pad_source_lengths.append(len(source))
        
        yield pad_targets_batch, pad_sources_batch, pad_targets_lengths, pad_source_lengths


In [18]:
# Split data to training and validation sets
train_source = source_int[batch_size:]
train_target = target_int[batch_size:]
valid_source = source_int[:batch_size]
valid_target = target_int[:batch_size]
(valid_targets_batch, valid_sources_batch, valid_targets_lengths, valid_sources_lengths) = next(get_batches(valid_target, valid_source, batch_size,
                           source_letter_to_int['<PAD>'],
                           target_letter_to_int['<PAD>']))

display_step = 50 # Check training loss after every 20 batches

checkpoint = "./trained_model.ckpt" 
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
        
    for epoch_i in range(1, epochs+1):
        for batch_i, (targets_batch, sources_batch, targets_lengths, sources_lengths) in enumerate(
                get_batches(train_target, train_source, batch_size,
                           source_letter_to_int['<PAD>'],
                           target_letter_to_int['<PAD>'])):
            
            # Training step
            _, loss = sess.run(
                [train_op, cost],
                {input_data: sources_batch,
                 targets: targets_batch,
                 lr: learning_rate,
                 target_sequence_length: targets_lengths,
                 source_sequence_length: sources_lengths})

            # Debug message updating us on the status of the training
            if batch_i % display_step == 0 and batch_i > 0:
                
                # Calculate validation cost
                validation_loss = sess.run(
                [cost],
                {input_data: valid_sources_batch,
                 targets: valid_targets_batch,
                 lr: learning_rate,
                 target_sequence_length: valid_targets_lengths,
                 source_sequence_length: valid_sources_lengths})
                
                print('Epoch {:>3}/{} Batch {:>4}/{} - Loss: {:>6.3f}  - Validation loss: {:>6.3f}'
                      .format(epoch_i,
                              epochs, 
                              batch_i, 
                              len(train_source) // batch_size, 
                              loss, 
                              validation_loss[0]))

    
    
    # Save Model
    saver = tf.train.Saver()
    saver.save(sess, checkpoint)
    print('Model Trained and Saved')

Epoch   1/60 Batch   50/77 - Loss:  2.344  - Validation loss:  2.108
Epoch   2/60 Batch   50/77 - Loss:  1.787  - Validation loss:  1.581
Epoch   3/60 Batch   50/77 - Loss:  1.583  - Validation loss:  1.407
Epoch   4/60 Batch   50/77 - Loss:  1.506  - Validation loss:  1.343
Epoch   5/60 Batch   50/77 - Loss:  1.370  - Validation loss:  1.224
Epoch   6/60 Batch   50/77 - Loss:  1.291  - Validation loss:  1.146
Epoch   7/60 Batch   50/77 - Loss:  1.236  - Validation loss:  1.098
Epoch   8/60 Batch   50/77 - Loss:  1.134  - Validation loss:  1.002
Epoch   9/60 Batch   50/77 - Loss:  1.023  - Validation loss:  0.889
Epoch  10/60 Batch   50/77 - Loss:  0.938  - Validation loss:  0.805
Epoch  11/60 Batch   50/77 - Loss:  0.857  - Validation loss:  0.718
Epoch  12/60 Batch   50/77 - Loss:  0.779  - Validation loss:  0.646
Epoch  13/60 Batch   50/77 - Loss:  0.715  - Validation loss:  0.586
Epoch  14/60 Batch   50/77 - Loss:  0.662  - Validation loss:  0.535
Epoch  15/60 Batch   50/77 - Loss:

In [21]:
def source_to_seq(text):
    '''Prepare the text for the model'''
    sequence_length = 7
    return [source_letter_to_int.get(word, source_letter_to_int['<UNK>']) for word in text]+ [source_letter_to_int['<PAD>']]*(sequence_length-len(text))

input_sentence = 'common'
text = source_to_seq(input_sentence)

checkpoint = "./trained_model.ckpt"

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)

    input_data = loaded_graph.get_tensor_by_name('inputs:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    source_sequence_length = loaded_graph.get_tensor_by_name('source_sequence_length:0')
    target_sequence_length = loaded_graph.get_tensor_by_name('target_sequence_length:0')
    
    #Multiply by batch_size to match the model's input parameters
    answer_logits = sess.run(logits, {input_data: [text]*batch_size, 
                                      target_sequence_length: [len(text)]*batch_size, 
                                      source_sequence_length: [len(text)]*batch_size})[0] 


pad = source_letter_to_int["<PAD>"] 

print('Original Text:', input_sentence)

print('\nSource')
print('  Word Ids:    {}'.format([i for i in text]))
print('  Input Words: {}'.format(" ".join([source_int_to_letter[i] for i in text])))

print('\nTarget')
print('  Word Ids:       {}'.format([i for i in answer_logits if i != pad]))
print('  Response Words: {}'.format(" ".join([target_int_to_letter[i] for i in answer_logits if i != pad])))


INFO:tensorflow:Restoring parameters from ./trained_model.ckpt
Original Text: common

Source
  Word Ids:    [7, 13, 10, 10, 13, 20, 0]
  Input Words: c o m m o n <PAD>

Target
  Word Ids:       [7, 10, 10, 20, 13, 13, 3]
  Response Words: c m m n o o <EOS>
