# Gated CNN

《Language Modeling with Gated Convolutional Networks》

Yann N. Dauphin  
Angela Fan  
Michael Auli  
David Grangier  
Facebook AI Research

### author qhduan@memect.co

In [1]:
import pickle
import numpy as np
import tensorflow as tf

In [2]:
from data import X_train, X_test, y_train, y_test
from data import fit_vectorizer, fit_onehot
from data import batch_flow, test_batch_flow

训练集样本量：12126，测试集样本量：3032


In [3]:
tf.set_random_seed(0)

In [4]:
embedding_size = 128
PAD = ' ' # 句子不到max_len长度时的占位符
max_len = max(len(x) for x in X_train)
print('单个训练样本最大长度：{}'.format(max_len))

单个训练样本最大长度：14


In [5]:
vectorizer = fit_vectorizer(X_train, embedding_size, max_len, PAD)
onehot = fit_onehot(y_train)

In [6]:
learning_rate = 0.001
n_epoch = 20
hidden_size = 512
batch_size = 256
time_steps = max_len
input_size = embedding_size
target_size = len(onehot.feature_indices_)
print('time_steps', time_steps)
print('input_size', input_size)
print('target_size', target_size)

time_steps 14
input_size 128
target_size 2


In [7]:
test_batch_flow(X_train, y_train, batch_size, vectorizer, onehot, max_len, PAD)

(256, 14, 128) (256, 2)


In [8]:
X = tf.placeholder(tf.float32, [batch_size, time_steps, input_size, 1], name='X')
y = tf.placeholder(tf.float32, [batch_size, target_size], name='X')

In [9]:
pitch_1 = tf.Variable(tf.random_normal([5, 19, 1, 32]), name='pitch_1')
pitch_1_bias = tf.Variable(tf.random_normal([32]), name='pitch_1_bias')

pitch_2 = tf.Variable(tf.random_normal([5, 19, 1, 32]), name='pitch_2')
pitch_2_bias = tf.Variable(tf.random_normal([32]), name='pitch_2_bias')

In [10]:
conv_1 = tf.nn.bias_add(
    tf.nn.conv2d(
        X, pitch_1, strides=[1, 1, 1, 1], padding='VALID'
    ),
    pitch_1_bias,
    name='bias_add_1'
)

In [11]:
conv_2 = tf.nn.bias_add(
    tf.nn.conv2d(
        X, pitch_2, strides=[1, 1, 1, 1], padding='VALID'
    ),
    pitch_2_bias,
    name='bias_add_2'
)

In [12]:
conv_2 = tf.sigmoid(conv_2)

In [13]:
print(conv_1.get_shape())

(256, 10, 110, 32)


In [14]:
print(conv_2.get_shape())

(256, 10, 110, 32)


In [15]:
flatten = conv_1 * conv_2
print(flatten.get_shape())

(256, 10, 110, 32)


In [16]:
flatten = tf.reshape(flatten, [batch_size, -1])

In [17]:
print(flatten.get_shape())

(256, 35200)


In [18]:
weight_1 = tf.Variable(tf.random_normal([int(flatten.get_shape()[1]), target_size]), name='weight_1')
bias_1 = tf.Variable(tf.random_normal([target_size]), name='bias_1')

In [19]:
full_connect = tf.add(
    tf.matmul(flatten, weight_1, name='matmul_2'),
    bias_1,
    name='add_2'
)

In [20]:
full_connect.get_shape()

TensorShape([Dimension(256), Dimension(2)])

In [21]:
pred = full_connect

In [22]:
cost = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(
        pred, y
    )
)

In [23]:
correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [24]:
train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

In [25]:
# 初始化所有变量
init = tf.global_variables_initializer()

In [26]:
# 本来是要关，不过CNN不用GPU真的好慢……
# disable GPU，关闭GPU支持
config = tf.ConfigProto(
#     device_count = {'GPU': 0}
)

In [27]:
with tf.Session(config=config) as sess:
    sess.run(init)
    for epoch in range(n_epoch + 1):
        costs = []
        accs = []
        for X_sample, y_sample in batch_flow(X_train, y_train, batch_size, vectorizer, onehot, max_len, PAD):
            feeds = {X: X_sample.reshape([batch_size, time_steps, input_size, 1]), y: y_sample}
            sess.run(train_step, feeds)
            c, acc = sess.run([cost, accuracy], feeds)
            costs.append(c)
            accs.append(acc)
        print('epoch {} cost: {:.4f} acc: {:.4f}'.format(
            epoch, np.mean(costs), np.mean(acc)
        ))
    # train
    costs = []
    accs = []
    for X_sample, y_sample in batch_flow(X_train, y_train, batch_size, vectorizer, onehot, max_len, PAD):
        feeds = {X: X_sample.reshape([batch_size, time_steps, input_size, 1]), y: y_sample}
        c, acc = sess.run([cost, accuracy], feeds)
        costs.append(c)
        accs.append(acc)
    print('train cost: {:.4f} acc: {:.4f}'.format(np.mean(costs), np.mean(acc)))
    # test
    costs = []
    accs = []
    for X_sample, y_sample in batch_flow(X_test, y_test, batch_size, vectorizer, onehot, max_len, PAD):
        feeds = {X: X_sample.reshape([batch_size, time_steps, input_size, 1]), y: y_sample}
        c, acc = sess.run([cost, accuracy], feeds)
        costs.append(c)
        accs.append(acc)
    print('test cost: {:.4f} acc: {:.4f}'.format(np.mean(costs), np.mean(acc)))

epoch 0 cost: 47.4133 acc: 0.5625
epoch 1 cost: 40.7791 acc: 0.5859
epoch 2 cost: 35.5414 acc: 0.5859
epoch 3 cost: 31.1322 acc: 0.6094
epoch 4 cost: 27.4881 acc: 0.6328
epoch 5 cost: 24.4705 acc: 0.6523
epoch 6 cost: 21.9796 acc: 0.6641
epoch 7 cost: 19.9095 acc: 0.6680
epoch 8 cost: 18.1795 acc: 0.6641
epoch 9 cost: 16.6883 acc: 0.6680
epoch 10 cost: 15.4159 acc: 0.6836
epoch 11 cost: 14.2943 acc: 0.6992
epoch 12 cost: 13.3381 acc: 0.7188
epoch 13 cost: 12.4947 acc: 0.7148
epoch 14 cost: 11.7470 acc: 0.7109
epoch 15 cost: 11.1106 acc: 0.7500
epoch 16 cost: 10.5260 acc: 0.7695
epoch 17 cost: 9.9895 acc: 0.7305
epoch 18 cost: 9.5880 acc: 0.7344
epoch 19 cost: 9.2583 acc: 0.7812
epoch 20 cost: 8.8758 acc: 0.7305
train cost: 8.6559 acc: 0.7305
test cost: 15.0292 acc: 0.6328


要么过拟合，要么拟合不了，文本pitch的参数没什么经验～～不知道是不是程序写的有问题……应该不是吧