In [3]:
%reload_ext watermark
%watermark -a 'Scott Ming' -v -m -d -p numpy,sklearn,tensorflow

Scott Ming 2017-05-30 

CPython 3.6.0
IPython 6.0.0

numpy 1.12.1
sklearn 0.18.1
tensorflow 1.0.1

compiler   : GCC 4.9.2
system     : Linux
release    : 3.16.0-4-amd64
machine    : x86_64
processor  : 
CPU cores  : 8
interpreter: 64bit


## 2. 让机器学会做加法：

- 例如:
    + 输入 "1+12", 输出"13"
    + 输入 "324+176", 输出"500"
    + 输入 "154+33", 输出"187"
- 注意输入和输出都是`字符串`类型，我们的目的是让机器在不知道加法法则的情况下学会加法运算。
- 为减小难度，输入字符串在加号左右的数字限定为三位数之内的正整数
- 训练集、验证集和测试集数据由自己生成
- 文档中需要说明
    + 数据的生成过程
    + 模型是如何设计的
    + 超参数是如何选择的
    + 目标函数和优化方法如何选择
    + 模型在测试集上的准确率是多少

In [1]:
from collections import Counter, deque
import itertools
import numpy as np
import tensorflow as tf
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tensorflow.contrib.legacy_seq2seq import (basic_rnn_seq2seq, 
                                                embedding_rnn_seq2seq, sequence_loss)
from tensorflow.python.ops import variable_scope

### 2.1 数据预处理

In [2]:
def build_texts(low, high, size):
    x1 = np.random.randint(low=low, high=high, size=size)
    x2 = np.random.randint(low=low, high=high, size=size)
    y = x1 + x2
    # Concat x1, x2
    x_texts = [str(left) + '+' + str(right) for left, right in zip(x1, x2)]
    y_texts = [str(i) for i in y]
    return x_texts, y_texts


def build_dict(words):
    word_counts = Counter(words).most_common()
    count = [['<UNK>', -1]]
    count.extend(word_counts)
    vocab2ix = {key: ix for ix, (key, _) in enumerate(count)}
    vocab2ix['<GO>'] = max(vocab2ix.values()) + 1
    vocab2ix['<EOS>'] = max(vocab2ix.values()) + 1
    ix2vocab = {value: key for key, value in vocab2ix.items()}
    return vocab2ix, ix2vocab


def append_go_eos(nested_list):
    nested_list = [deque(list_) for list_ in nested_list]
    for deque_ in nested_list:
        deque_.appendleft('<GO>')
        deque_.append('<EOS>')
    nested_list = [list(deque_) for deque_ in nested_list]
    return nested_list

In [3]:
def seq2seq_pad(encoder_inputs, encoder_length, decoder_inputs, decoder_length, vocab, pad_symbol='<UNK>'):
    """
    - encoder_input: A nested list of symbol str for encoding, length: batch_size
    - encoder_length: max length of encoder input
    - decoder_input: A nested list of symbol str for decoding, length: batch_size
    - decoder_length: max length of decoder input
    - vocab: vocabulary index, symbol (str) -> index (int)
    
    Example: 
    ["hello", "world"] -> ["hi", "<EOS>"]
    ["cover", "me"] -> ["roger", "<EOS>"]
    
    seq2seq_pad([['hello', 'world'], ['cover', 'me']], 4, [['hi', '<EOS>'], ['roger', '<EOS>']], 4, vocab)
    
    Assume that index of "<PAD>" is 0

    Output:
    [[0, 0, <index of 'hello'>, <index of 'world'>], [0, 0, <index of 'cover'>, <index of 'me'>]],
    [[<index of 'hi'>, <index of 'EOS'>, 0, 0], [<index of 'roger'>, <index of 'EOS'>, 0, 0]]
    """
    pad_index = vocab[pad_symbol]
    def to_index(inputs, length, pad_from_start=True):
        inputs_to_index = []
        for cur_input in inputs:
            cur_input_to_index = [pad_index] * length
            l = len(cur_input)
            if l < length:
                if pad_from_start:
                    cur_input_to_index[(length - l):] = [vocab[i] for i in cur_input]
                else:
                    cur_input_to_index[:l] = [vocab[i] for i in cur_input]
            else:
                cur_input_to_index = [vocab[i] for i in cur_input[:length]]
            inputs_to_index.append(cur_input_to_index)    
        return inputs_to_index
    return to_index(encoder_inputs, encoder_length, True), to_index(decoder_inputs, decoder_length, False)


def left_shift(decoder_inputs, pad_idx):
    # for generating targets
    return [list(input_[1:]) + [pad_idx] for input_ in decoder_inputs]

In [4]:
x_texts, y_texts = build_texts(low=0, high=999, size=500000)
x_words = [list(sequence) for sequence in x_texts]
y_words = [list(sequence) for sequence in y_texts]
data = itertools.chain(x_words, y_words)  # Concat all data
words = itertools.chain.from_iterable(data)  # Flat data
vocab2ix, ix2vocab = build_dict(words)
y_words = append_go_eos(y_words)  # 方便排列，<GO>/<EOS> 在创建字典之后添加

In [5]:
# 分割训练数据和验证数据
dataset = np.c_[x_words, y_words]  # 按列拼接
train_set, test_set = train_test_split(dataset, test_size=0.2, random_state=42)
x_train, y_train = train_set[:, 0], train_set[:, 1]
x_test, y_test = test_set[:, 0], test_set[:, 1]

### 2.2 构建模型

In [6]:
tf.reset_default_graph()
sess = tf.Session()

encoder_length = max(len(x) for x in x_train)
decoder_length = max(len(x) for x in y_train)

cell = tf.contrib.rnn.LSTMCell(128)
num_encoder_symbols = len(vocab2ix)
num_decoder_symbols = len(vocab2ix)
train_dataset_size = y_train.shape[0]
batch_size = 50
embedding_size = 128
epochs = 20001
print_loss_every = 1000
learning_rate = 0.0003

encoder_placeholders = [tf.placeholder(tf.int32, shape=[None],
                                       name="encoder_%d" % i) for i in range(encoder_length)]
decoder_placeholders = [tf.placeholder(tf.int32, shape=[None],
                                       name="decoder_%d" % i) for i in range(decoder_length)]
target_placeholders = [tf.placeholder(tf.int32, shape=[None],
                                      name="target_%d" % i) for i in range(decoder_length)]
target_weights_placeholders = [tf.placeholder(tf.float32, shape=[None],
                                              name="decoder_weight_%d" % i) for i in range(decoder_length)]
outputs, states = embedding_rnn_seq2seq(encoder_placeholders, decoder_placeholders, cell,
                                        num_encoder_symbols, num_decoder_symbols,
                                        embedding_size, output_projection=None,
                                        feed_previous=False)

loss = sequence_loss(outputs, target_placeholders, target_weights_placeholders)
train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)

In [7]:
def get_feed_dict(encoder_inputs, decoder_inputs):
    encoder_inputs = list(zip(*encoder_inputs))
    target_inputs = list(zip(*left_shift(decoder_inputs, vocab2ix['<UNK>'])))
    decoder_inputs = list(zip(*decoder_inputs))
    
    feed_dict = dict()
    # Prepare input data    
    for (i, placeholder) in enumerate(encoder_placeholders):
        # 这里用 placeholder 或者 placeholder.name 都可以
        feed_dict[placeholder.name] = np.asarray(encoder_inputs[i], dtype=int)
    for i in range(len(decoder_placeholders)):
        feed_dict[decoder_placeholders[i].name] = np.asarray(decoder_inputs[i], dtype=int)
        feed_dict[target_placeholders[i].name] = np.asarray(target_inputs[i], dtype=int)        
        # 这里使用 weights 把 <PAD> 的损失屏蔽了
        feed_dict[target_weights_placeholders[i].name] = np.asarray(
            [float(idx != vocab2ix['<UNK>'])
             for idx in target_inputs[i]], dtype=float)
    return feed_dict


def get_output_words(outputs_list):
    test_output_array = np.asarray(outputs_list).T
    # 将获取的最高概率 numbers 转为 字符串
    test_output_list = [[ix2vocab[idx] for idx in sublist]
                                       for sublist in test_output_array]
    # 删除 <EOS> 之后的词
    test_output_list = [itertools.takewhile(lambda x: x != '<EOS>', sublist)
                        for sublist in test_output_list]
    # 把解码后的字符串拼接成句子
    test_output_list = [''.join(i) for i in test_output_list]
    return test_output_list

In [8]:
# Pad data
train_encoders, train_decoders = seq2seq_pad(x_train, encoder_length, 
                                             y_train, decoder_length, vocab2ix)
test_encoders, test_decoders = seq2seq_pad(x_test, encoder_length, 
                                           y_test, decoder_length, vocab2ix)
# Set feed_dict
train_feed_dict = get_feed_dict(train_encoders, train_decoders) 
test_feed_dict = get_feed_dict(test_encoders, test_decoders)

In [9]:
sess.run(tf.global_variables_initializer())

for i in range(epochs):
    start = (i * batch_size) % train_dataset_size
    end = min(start + batch_size, train_dataset_size)
    feed_dict = get_feed_dict(train_encoders[start:end], train_decoders[start:end])
    cost = sess.run(loss, feed_dict)
    sess.run(train_step, feed_dict)
    if i % print_loss_every == 0:
        print(f'Epoch: {i:04d} | Cost: {cost:.4f}')

Epoch: 0000 | Cost: 2.6877
Epoch: 1000 | Cost: 1.3043
Epoch: 2000 | Cost: 1.1225
Epoch: 3000 | Cost: 0.9496
Epoch: 4000 | Cost: 0.6554
Epoch: 5000 | Cost: 0.5188
Epoch: 6000 | Cost: 0.4227
Epoch: 7000 | Cost: 0.4274
Epoch: 8000 | Cost: 0.3680
Epoch: 9000 | Cost: 0.3236
Epoch: 10000 | Cost: 0.2814
Epoch: 11000 | Cost: 0.2653
Epoch: 12000 | Cost: 0.1887
Epoch: 13000 | Cost: 0.1795
Epoch: 14000 | Cost: 0.1269
Epoch: 15000 | Cost: 0.1272
Epoch: 16000 | Cost: 0.0919
Epoch: 17000 | Cost: 0.0695
Epoch: 18000 | Cost: 0.0583
Epoch: 19000 | Cost: 0.0832
Epoch: 20000 | Cost: 0.0574


In [10]:
with variable_scope.variable_scope(variable_scope.get_variable_scope(), reuse=True):
    outputs, states = embedding_rnn_seq2seq(encoder_placeholders, decoder_placeholders, 
                                            cell, num_encoder_symbols, num_decoder_symbols,
                                            embedding_size, output_projection=None,
                                            feed_previous=True)
    
    train_raw_outputs = [np.argmax(sess.run(o, train_feed_dict), axis=1) for o in outputs]
    test_raw_outputs = [np.argmax(sess.run(o, test_feed_dict), axis=1) for o in outputs]
    train_outputs = get_output_words(train_raw_outputs)
    test_outputs  = get_output_words(test_raw_outputs)
    train_targets = [''.join(i[1:-1]) for i in y_train]
    test_targets  = [''.join(i[1:-1]) for i in y_test]
    
    # Compute acc
    train_acc = accuracy_score(train_targets, train_outputs) 
    test_acc = accuracy_score(test_targets, test_outputs)
    print(f'Train/Test ACC: {train_acc:.3f}/{test_acc:.3f}')

Train/Test ACC: 0.944/0.942


In [11]:
# 随机选择 20 个测试样本看看具体结果
r = np.random.permutation(y_test.shape[0])[:20]
inputs_ = [''.join(i) for i in x_test[r]]
targets_ = np.array(test_targets)[r]
outputs_ = np.array(test_outputs)[r]
for i, o, t in zip(inputs_, outputs_, targets_):
    print('Input:', i)
    print('Target:', t)
    print('Output:', o)
    print('-=-' * 5)

Input: 94+933
Target: 1027
Output: 1027
-=--=--=--=--=-
Input: 504+538
Target: 1042
Output: 1042
-=--=--=--=--=-
Input: 847+926
Target: 1773
Output: 1773
-=--=--=--=--=-
Input: 511+856
Target: 1367
Output: 1367
-=--=--=--=--=-
Input: 212+886
Target: 1098
Output: 1098
-=--=--=--=--=-
Input: 486+1
Target: 487
Output: 507
-=--=--=--=--=-
Input: 299+415
Target: 714
Output: 714
-=--=--=--=--=-
Input: 995+602
Target: 1597
Output: 1597
-=--=--=--=--=-
Input: 224+528
Target: 752
Output: 752
-=--=--=--=--=-
Input: 972+305
Target: 1277
Output: 1277
-=--=--=--=--=-
Input: 149+326
Target: 475
Output: 475
-=--=--=--=--=-
Input: 986+979
Target: 1965
Output: 1965
-=--=--=--=--=-
Input: 357+808
Target: 1165
Output: 1165
-=--=--=--=--=-
Input: 575+940
Target: 1515
Output: 1515
-=--=--=--=--=-
Input: 794+843
Target: 1637
Output: 1637
-=--=--=--=--=-
Input: 189+131
Target: 320
Output: 320
-=--=--=--=--=-
Input: 769+499
Target: 1268
Output: 1268
-=--=--=--=--=-
Input: 720+765
Target: 1485
Output: 1485
-=-

## Change Log:

### 两位数时：

**1**

* Cell: BasicRNNCell
* dataset size: 20000
* Epochs: 5000
* Learning_rate: 0.001
* Train/Test Acc: 0.997/0.986

### 三位数时：

**1**

* Cell: BasicRNNCell
* dataset size: 100000
* Epochs: 50000
* Learning_rate: 0.001
* Train/Test Acc: 0.344/0.322

**2**

* Cell: BasicRNNCell
* dataset size: 200000
* Epochs: 5000
* Learning_rate: 0.01
* Train/Test Acc: 0.005/0.004
* Final cost: 1.3729

**3**

* Cell: BasicRNNCell
* dataset size: 200000
* Epochs: 50000
* Learning_rate: 0.01
* Train/Test Acc: 0.001/0.001
* Final cost: 1.6427
* note: 单纯的加大步数和学习率，感觉没用，反而 cost 有所上升

**4**

* Cell: BasicRNNCell
* dataset size: 2000000
* Epochs: 50000
* Learning_rate: 0.001
* Train/Test Acc: 0.346/0.344
* Final cost: 0.3691
* note: 加大了数据集和降低了学习率，cost 明显下来了，应该是学习率的问题，准确率也有较大提升

**5**

* Cell: BasicRNNCell
* dataset size: 2000000
* Epochs: 50000
* Learning_rate: 0.003
* Train/Test Acc: 0.982/0.982
* Final cost: 0.0941
* note: 调整了学习率之后，cost 下降的比较好，最后的结果也不错

**6**

* Cell: BasicRNNCell
* dataset size: 500000
* Epochs: 20001
* Learning_rate: 0.003
* Train/Test Acc: 0.941/0.940
* Final cost: 0.0394
* note: 试着减少了数据量(现实中数据量很难有第5次那么大)，和迭代次数，发现结果也还不错