Notebook written by [Zhedong Zheng](https://github.com/zhedongzheng)

In [1]:
"""
pip3 install tensor2tensor
"""
import pos
import numpy as np
import tensorflow as tf

from tensor2tensor.layers.common_attention import add_timing_signal_1d
from sklearn.metrics import classification_report

In [2]:
params = {
    'seq_len': 20,
    'batch_size': 128,
    'hidden_dim': 128,
    'num_heads': 2,
    'text_iter_step': 1,
    'lr': {'start': 5e-3, 'end': 5e-4},
    'n_epoch': 1,
    'display_step': 50,
}

In [3]:
def to_test_seq(*args):
    return [np.reshape(x[:(len(x)-len(x)%params['seq_len'])],
        [-1,params['seq_len']]) for x in args]

def iter_seq(x):
    return np.array([x[i: i+params['seq_len']] for i in range(
        0, len(x)-params['seq_len'], params['text_iter_step'])])

def to_train_seq(*args):
    return [iter_seq(x) for x in args]

In [4]:
def pipeline_train(X, y, sess):
    dataset = tf.data.Dataset.from_tensor_slices((X, y))
    dataset = dataset.shuffle(len(X)).batch(params['batch_size'])
    iterator = dataset.make_initializable_iterator()
    X_ph = tf.placeholder(tf.int32, [None, params['seq_len']])
    y_ph = tf.placeholder(tf.int32, [None, params['seq_len']])
    init_dict = {X_ph: X, y_ph: y}
    sess.run(iterator.initializer, init_dict)
    return iterator, init_dict

def pipeline_test(X, sess):
    dataset = tf.data.Dataset.from_tensor_slices(X)
    dataset = dataset.batch(params['batch_size'])
    iterator = dataset.make_initializable_iterator()
    X_ph = tf.placeholder(tf.int32, [None, params['seq_len']])
    init_dict = {X_ph: X}
    sess.run(iterator.initializer, init_dict)
    return iterator, init_dict


x_train, y_train, x_test, y_test, params['vocab_size'], params['n_class'], word2idx, tag2idx = pos.load_data()
X_train, Y_train = to_train_seq(x_train, y_train)
X_test, Y_test = to_test_seq(x_test, y_test)

sess = tf.Session()
params['lr']['steps'] = len(X_train) // params['batch_size']

iter_train, init_dict_train = pipeline_train(X_train, Y_train, sess)
iter_test, init_dict_test = pipeline_test(X_test, sess)

Vocab Size: 19124 | x_train: 211727 | x_test: 47377


In [5]:
def embed_seq(x, vocab_sz, embed_dim, zero_pad=False, scale=False):
    embedding = tf.get_variable('word2vec', [vocab_sz, embed_dim])
    if zero_pad:
        embedding = tf.concat([tf.zeros([1, embed_dim]), embedding[1:, :]], 0)
    x = tf.nn.embedding_lookup(embedding, x)
    if scale:
        x = x * tf.sqrt(tf.to_float(embed_dim))
    return x


def layer_norm(inputs, epsilon=1e-8):
    mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
    normalized = (inputs - mean) * (tf.rsqrt(variance + epsilon))
    params_shape = inputs.get_shape()[-1:]
    gamma = tf.get_variable('gamma', params_shape, tf.float32, tf.ones_initializer())
    beta = tf.get_variable('beta', params_shape, tf.float32, tf.zeros_initializer())
    return gamma * normalized + beta


def self_attention(inputs,
                   is_training,
                   num_units = params['hidden_dim'],
                   num_heads = params['num_heads'],
                   reverse=False):
    T_q = T_k = inputs.get_shape()[1].value

    Q_K_V = tf.layers.dense(inputs, 3*num_units)
    Q, K, V = tf.split(Q_K_V, 3, -1)
    
    if num_heads > 1:
        Q = tf.concat(tf.split(Q, num_heads, axis=2), 0)                         
        K = tf.concat(tf.split(K, num_heads, axis=2), 0)                        
        V = tf.concat(tf.split(V, num_heads, axis=2), 0)                         

    align = tf.matmul(Q, K, transpose_b=True)                               
    align *= tf.rsqrt(tf.to_float(K.get_shape()[-1].value))
    
    paddings = tf.fill(tf.shape(align), float('-inf'))         
    lower_tri = tf.ones([T_q, T_k])                                                
    lower_tri = tf.linalg.LinearOperatorLowerTriangular(lower_tri).to_dense()
    if reverse:
        lower_tri = tf.transpose(lower_tri)
    masks = tf.tile(tf.expand_dims(lower_tri,0), [tf.shape(align)[0],1,1])       
    align = tf.where(tf.equal(masks, 0), paddings, align)          
    
    align = tf.nn.softmax(align)                                                  
    align = tf.layers.dropout(align, 0.1, training=is_training)
    
    x = tf.matmul(align, V)
    
    if num_heads > 1:
        x = tf.concat(tf.split(x, num_heads, axis=0), 2)                                                                                                           
    return x


def ffn(inputs):
    x = tf.layers.conv1d(inputs, params['hidden_dim'], 1, activation=tf.nn.relu)
    x = tf.layers.conv1d(x, params['hidden_dim'], 1)
    return x

In [6]:
def forward(x, reuse, is_training):
    dropout = lambda x: tf.layers.dropout(x, 0.1, training=is_training)
    regul = lambda x: dropout(layer_norm(x))
    
    with tf.variable_scope('model', reuse=reuse):
        x = embed_seq(x,
                      params['vocab_size'],
                      params['hidden_dim'],
                      zero_pad=True,
                      scale=True)
        
        with tf.variable_scope('local'):
            sub_fn = lambda x: tf.layers.conv1d(x,
                                                params['hidden_dim'],
                                                kernel_size=3,
                                                padding='same',
                                                activation=tf.nn.relu)
            x += sub_fn(regul(x))
        
        with tf.variable_scope('forward'):
            x = add_timing_signal_1d(x)
            x += self_attention(regul(x), is_training)
        
        with tf.variable_scope('backward'):
            x = add_timing_signal_1d(x)
            x += self_attention(regul(x), is_training, reverse=True)
            
        with tf.variable_scope('pointwise'):
            x += ffn(regul(x))

        logits = tf.layers.dense(x, params['n_class'])
    return logits

In [7]:
ops = {}

X_train_batch, y_train_batch = iter_train.get_next()
X_test_batch = iter_test.get_next()

logits_tr = forward(X_train_batch, reuse=False, is_training=True)
logits_te = forward(X_test_batch, reuse=True, is_training=False)

log_likelihood, trans_params = tf.contrib.crf.crf_log_likelihood(
    logits_tr, 
    y_train_batch,
    tf.count_nonzero(X_train_batch, 1))

ops['loss'] = tf.reduce_mean(-log_likelihood)

ops['global_step'] = tf.Variable(0, trainable=False)

ops['lr'] = tf.train.exponential_decay(params['lr']['start'],
                                       ops['global_step'],
                                       params['lr']['steps'],
                                       params['lr']['end']/params['lr']['start'])

ops['train'] = tf.train.AdamOptimizer(ops['lr']).minimize(ops['loss'],
                                                          global_step=ops['global_step'])

ops['crf_decode'] = tf.contrib.crf.crf_decode(logits_te,
                                              trans_params,
                                              tf.count_nonzero(X_test_batch, 1))[0]

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [8]:
sess.run(tf.global_variables_initializer())
for epoch in range(1, params['n_epoch']+1):
    while True:
        try:
            _, step, loss, lr = sess.run([ops['train'],
                                          ops['global_step'],
                                          ops['loss'],
                                          ops['lr']])
        except tf.errors.OutOfRangeError:
            break
        else:
            if step % params['display_step'] == 0 or step == 1:
                print("Epoch %d | Step %d | Loss %.3f | LR: %.4f" % (epoch, step, loss, lr))
    
    Y_pred = []
    while True:
        try:
            Y_pred.append(sess.run(ops['crf_decode']))
        except tf.errors.OutOfRangeError:
            break
    Y_pred = np.concatenate(Y_pred)
    
    if epoch != params['n_epoch']:
        sess.run(iter_train.initializer, init_dict_train)
        sess.run(iter_test.initializer, init_dict_test)

Epoch 1 | Step 1 | Loss 109.691 | LR: 0.0050
Epoch 1 | Step 50 | Loss 3.378 | LR: 0.0047
Epoch 1 | Step 100 | Loss 1.681 | LR: 0.0044
Epoch 1 | Step 150 | Loss 0.826 | LR: 0.0041
Epoch 1 | Step 200 | Loss 0.688 | LR: 0.0038
Epoch 1 | Step 250 | Loss 0.431 | LR: 0.0035
Epoch 1 | Step 300 | Loss 0.346 | LR: 0.0033
Epoch 1 | Step 350 | Loss 0.397 | LR: 0.0031
Epoch 1 | Step 400 | Loss 0.248 | LR: 0.0029
Epoch 1 | Step 450 | Loss 0.240 | LR: 0.0027
Epoch 1 | Step 500 | Loss 0.107 | LR: 0.0025
Epoch 1 | Step 550 | Loss 0.083 | LR: 0.0023
Epoch 1 | Step 600 | Loss 0.304 | LR: 0.0022
Epoch 1 | Step 650 | Loss 0.132 | LR: 0.0020
Epoch 1 | Step 700 | Loss 0.230 | LR: 0.0019
Epoch 1 | Step 750 | Loss 0.161 | LR: 0.0018
Epoch 1 | Step 800 | Loss 0.187 | LR: 0.0016
Epoch 1 | Step 850 | Loss 0.210 | LR: 0.0015
Epoch 1 | Step 900 | Loss 0.110 | LR: 0.0014
Epoch 1 | Step 950 | Loss 0.144 | LR: 0.0013
Epoch 1 | Step 1000 | Loss 0.179 | LR: 0.0012
Epoch 1 | Step 1050 | Loss 0.124 | LR: 0.0012
Epoch 1 |

In [9]:
print(classification_report(Y_test.ravel(),
                            Y_pred.ravel(),
                            target_names=tag2idx.keys()))

sample = ['I', 'love', 'you']
x = np.atleast_2d([word2idx[w] for w in sample] + [0]*(params['seq_len']-len(sample)))

ph = tf.placeholder(tf.int32, [None, params['seq_len']])
logits = forward(ph, reuse=True, is_training=False)
infer_op = tf.contrib.crf.crf_decode(logits,
                                     trans_params,
                                     tf.count_nonzero(ph, 1))[0]
idx2tag = {idx : tag for tag, idx in tag2idx.items()}

x = sess.run(infer_op, {ph: x})[0][:len(sample)]
print(' '.join(sample))
print(' '.join([idx2tag[idx] for idx in x if idx != 0]))

  .format(len(labels), len(target_names))


             precision    recall  f1-score   support

      <pad>       0.93      0.93      0.93      6639
         NN       0.99      0.99      0.99      5070
         IN       1.00      1.00      1.00      4020
         DT       0.98      0.92      0.95       912
        VBZ       0.93      0.95      0.94      1354
         RB       0.82      0.91      0.86      1103
        VBN       1.00      1.00      1.00      1177
         TO       0.92      0.95      0.94      1269
         VB       0.93      0.88      0.91      2962
         JJ       0.95      0.90      0.93      3034
        NNS       0.90      0.95      0.92      4803
        NNP       1.00      1.00      1.00      2389
          ,       1.00      1.00      1.00      1214
         CC       0.99      1.00      1.00       433
        POS       1.00      1.00      1.00      1974
          .       0.92      0.91      0.91       539
        VBP       0.98      0.83      0.90       727
        VBG       1.00      1.00      1.00   