In [398]:
import tensorflow as tf
import numpy as np
import data
tf.reset_default_graph()

In [399]:
vocab_size = 1681
d, id_to_token = data.get_data(vocab_size)
for k in id_to_token:
    if id_to_token[k] == 'eos':
        id_to_token[k] = '\n'
print(len(d))
x, y = d[10:20], d[11:21]
max_word_length = max([len(id_to_token[np.argmax(j)]) for j in d])
print(max_word_length)

['i', 'would', 'like', 'to', 'talk', 'today', 'about', 'how', 'to', 'develop', 'a', 'new', 'foreign', 'policy', 'direction', 'for', 'our', 'country', 'one', 'that', 'replaces', 'randomness', 'with', 'purpose', 'ideology', 'with', 'strategy', 'and', 'chaos', 'with', 'peace', 'it', 'is', 'time', 'to', 'shake', 'the', 'rust', 'off', 'of']
1681
10229
15


In [400]:
c_to_i, embedding = data.get_char_embedding()

def word_generator(ix, num=10):
    return d[ix : ix + num]

def sequence_char_matrix(ix, num = 10):
    # returns char matrices of all words in a sequence starting from ix and of length num    
    words = [id_to_token[np.argmax(j)] for j in d[ix : ix + num]]
    chars = [embedding[[c_to_i[c] for c in w]] for w in words]
    to_pad = [max_word_length - t for t in map(lambda x: len(x), words)]
    pad = [np.zeros([tp, embedding.shape[0]]) for tp in to_pad]
    reshape = [1, max_word_length, embedding.shape[0], 1]
    padded = np.concatenate([np.reshape(np.r_[ch, pd], reshape) for ch, pd in zip(chars, pad)], axis=0)
    return padded

_ = sequence_char_matrix(3, 13)
_ = word_generator(0, 13)

In [401]:
# CNN hyperparameters
input_height = max_word_length
input_width = embedding.shape[0]
batch_size = 13
print(input_height, input_width)
filter_heights = [2, 3, 4]
feature_maps = [3, 4, 5]

15 87


In [402]:
def inference(inp):
    with tf.variable_scope('conv1') as scope:        
        weight = tf.get_variable('weights', [filter_heights[0], input_width, 1, feature_maps[0]], initializer=tf.random_uniform_initializer(minval=-0.05, maxval=0.05))
        conv1 = tf.nn.tanh(tf.nn.conv2d(inp, weight, strides=[1, 1, 1, 1], padding='VALID'))
    pool1 = tf.squeeze(tf.nn.max_pool(conv1, ksize=[1, conv1.get_shape()[1], 1, 1], strides=[1, 1, 1, 1], padding='VALID'))
    
    with tf.variable_scope('conv2') as scope:        
        weight = tf.get_variable('weights', [filter_heights[1], input_width, 1, feature_maps[1]], initializer=tf.random_uniform_initializer(minval=-0.05, maxval=0.05))
        conv2 = tf.nn.tanh(tf.nn.conv2d(inp, weight, strides=[1, 1, 1, 1], padding='VALID'))
    pool2 = tf.squeeze(tf.nn.max_pool(conv2, ksize=[1, conv2.get_shape()[1], 1, 1], strides=[1, 1, 1, 1], padding='VALID'))
    
    with tf.variable_scope('conv3') as scope:        
        weight = tf.get_variable('weights', [filter_heights[2], input_width, 1, feature_maps[2]], initializer=tf.random_uniform_initializer(minval=-0.05, maxval=0.05))
        conv3 = tf.nn.tanh(tf.nn.conv2d(inp, weight, strides=[1, 1, 1, 1], padding='VALID'))
    pool3 = tf.squeeze(tf.nn.max_pool(conv3, ksize=[1, conv3.get_shape()[1], 1, 1], strides=[1, 1, 1, 1], padding='VALID'))
    
    pool_total = tf.concat(0, [pool1, pool2, pool3], name='total_pool')
    return pool_total

In [403]:
tf.reset_default_graph()
input_ = tf.placeholder(shape=[batch_size, max_word_length, embedding.shape[0], 1], dtype=tf.float32, name = 'cnn_in')
words = tf.split(0, batch_size, input_)
cnn_outputs = []
with tf.variable_scope("CNN") as scope:
    for idx, word in enumerate(words):
#         inp = tf.Variable(word, trainable=False, dtype=tf.float32)
        if idx != 0:
            scope.reuse_variables()
        p = inference(word)
        cnn_outputs.append(p)

In [404]:
# highway network
highway_outputs = []
initializer = tf.random_uniform_initializer(minval=-0.05, maxval=0.05, dtype=tf.float32)
# transformation gate parameters
W_t = tf.get_variable('Wt', shape=[sum(feature_maps)], initializer=initializer)
b_t = tf.get_variable('bt', shape=[sum(feature_maps)], initializer=initializer)
# highway parameters
W_h = tf.get_variable('Wh', shape=[sum(feature_maps)], initializer=initializer)
b_h = tf.get_variable('bh', shape=[sum(feature_maps)], initializer=initializer)
# one layer highway network
for co in cnn_outputs:
    t = tf.nn.sigmoid(W_t * co + b_t)
    z = t * tf.nn.tanh(W_h * co + b_h) + (tf.ones_like(t) -  t) * co
    highway_outputs.append(z)

In [405]:
# RNN hyperparameters

epochs = 7000
hidden_layer = 128
input_size = sum(feature_maps)
output_size = vocab_size
learning_rate = 0.1
num_steps = batch_size
print_step = 500
input_size, output_size

(12, 1681)

In [406]:
initializer = tf.random_uniform_initializer(minval=-0.05, maxval=0.05, dtype=tf.float32)
Wxh = tf.get_variable('Wxh', shape=[input_size, hidden_layer], initializer=initializer)
Whh = tf.get_variable('Whh', shape=[hidden_layer, hidden_layer], initializer=initializer)
Why = tf.get_variable('Why',shape=[hidden_layer, output_size], initializer=initializer)
# weights associated with update gate
Wxz = tf.get_variable('Wxz', shape=[input_size, hidden_layer], initializer=initializer)
Whz = tf.get_variable('Whz', shape=[hidden_layer, hidden_layer], initializer=initializer)
# weights associated with the reset gate
Wxr = tf.get_variable('Wxr', shape=[input_size, hidden_layer], initializer=initializer)
Whr = tf.get_variable('Whr', shape=[hidden_layer, hidden_layer], initializer=initializer)


In [407]:
def GRU(prev, inp):
    i = tf.reshape(inp, shape=[1, -1])
    p = tf.reshape(prev, shape=[1, -1])
    z = tf.nn.sigmoid(tf.matmul(i, Wxz) + tf.matmul(p, Whz))    # update gate
    r = tf.nn.sigmoid(tf.matmul(i, Wxr) + tf.matmul(p, Whr))    # reset gate
    h_ = tf.nn.tanh(tf.matmul(i, Wxh) + tf.matmul(tf.mul(p, r), Whh))
    h = tf.mul(tf.sub(tf.ones_like(z), z), h_) + tf.mul(z, p)
#     h = zoneout(h, p)
    return tf.reshape(h, [hidden_layer])

In [408]:
b = tf.placeholder(shape=[batch_size, vocab_size], dtype=tf.float32, name='targets')
initial = tf.placeholder(shape=[hidden_layer], dtype=tf.float32)
states = tf.scan(GRU, highway_outputs, initializer=initial)
outputs = tf.nn.softmax(tf.matmul(states, Why), name='model_out')
loss = -tf.reduce_sum(b * tf.log(outputs))
# loss = tf.sqrt(tf.reduce_sum(tf.square(tf.sub(outputs, b))))
optimizer = tf.train.AdagradOptimizer(learning_rate)

# clipping gradients between -1 and 1.
grad_var_pairs = optimizer.compute_gradients(loss, tf.trainable_variables())
clipped_grad_var_pairs = [(tf.clip_by_value(gv[0], -1, 1), gv[1]) for gv in grad_var_pairs]
optimize_op = optimizer.apply_gradients(clipped_grad_var_pairs)

In [409]:
def generate(sess, n):
    x = np.array(sequence_char_matrix(ix, 1))
    gen = [id_to_token[np.argmax(x[0])]]
    h = np.zeros(hidden_layer)
    for i in range(n):
        o, h = sess.run([outputs, states], {a:x, initial: h})
        h = h.reshape(hidden_layer)
        o = np.argmax(o[0])
        gen.append(id_to_token[o])
        x = [0] * input_size
        x[o] = 1
#         print np.argmax(x)
        x = [x]
    print(' '.join(gen))

In [410]:
sess = tf.Session()
ix = 0
sess.run(tf.initialize_all_variables())

In [411]:
for i in range(epochs):
    if ix + num_steps >= len(d):
        ix = 0
    x = np.array(sequence_char_matrix(ix, batch_size))
    y = np.array(word_generator(ix, batch_size))
    feed = {input_: x, initial: np.zeros(hidden_layer), b: y}    
    o, l, _ = sess.run([outputs, loss, optimize_op], feed_dict=feed)
    ix += num_steps
    if i % print_step == 0:
        print(l)

96.5508
59.2356
35.8521
8.5517
5.55948
3.48802
8.50309
2.92953
3.24301
2.84464
1.56812
0.737502
0.970936
1.44722
