In [1]:
import warnings
# Suppress every Python warning for the rest of the kernel session
# (keeps library deprecation notices out of the notebook output).
warnings.filterwarnings('ignore')

import tensorflow as tf
import numpy as np

# Seed NumPy's global random generator.
# NOTE(review): this only covers values drawn through np.random; it presumably
# does not seed TensorFlow's own variable initialisers -- confirm whether a
# TensorFlow-side seed is also needed for reproducible runs.
np.random.seed(42)
# Bare last expression: the notebook displays the installed TensorFlow version.
tf.__version__

'1.5.0'

In [2]:
class LstmCrfTagger(object):
    """Bidirectional LSTM + CRF sequence tagger written against the TensorFlow 1.x graph API.

    The model processes one sentence at a time (no batching): word ids are
    embedded, run through a bidirectional LSTM, projected to per-tag emission
    scores and combined with a learned tag-transition matrix. Training
    minimises ``log Z - score(gold path)``; prediction uses Viterbi decoding.

    Convention used throughout: ``transitions[i, j]`` is the score of moving
    *to* tag ``i`` *from* tag ``j``.

    ``tag_to_ix`` must contain entries for ``START_TAG`` and ``END_TAG``.
    """
    START_TAG = '<START>'
    END_TAG   = '<END>'
    # Score given to transitions / start states that must never be selected.
    _IMPOSSIBLE = -10000.

    @staticmethod
    def log_sum_exp(vec):
        """Return ``log(sum(exp(vec)))`` reduced over axis 1 of ``vec``.

        The per-row maximum is subtracted before exponentiating and added back
        afterwards so that large scores do not overflow.
        """
        max_score = tf.reduce_max(vec, axis=1)
        shifted = vec - tf.expand_dims(max_score, axis=1)
        return max_score + tf.log(tf.reduce_sum(tf.exp(shifted), axis=1))

    @staticmethod
    def _ensure_parent_dir(path):
        """Create the directory that will hold ``path`` when it does not exist yet.

        A bare file name (no directory component) needs nothing created.
        """
        import os  # local import so the class stays self-contained in its cell
        parent = os.path.dirname(path)
        if parent and not os.path.isdir(parent):
            os.makedirs(parent)

    @property
    def tag_size(self):
        """Number of entries in the tag-to-index mapping."""
        return len(self._tag_to_ix)

    def __init__(self, vocab_size, tag_to_ix, embedding_dim=5, hidden_dim=4, learning_rate=0.1):
        """Store the hyper-parameters; no TensorFlow objects are created here.

        Args:
            vocab_size: number of rows of the word-embedding matrix.
            tag_to_ix: mapping from tag name to integer id, including
                ``START_TAG`` and ``END_TAG``.
            embedding_dim: width of each word embedding.
            hidden_dim: number of units of each LSTM direction.
            learning_rate: step size handed to the Adam optimizer.
        """
        self._vocab_size = vocab_size
        self._tag_to_ix = tag_to_ix
        self._embedding_dim = embedding_dim
        self._hidden_dim = hidden_dim
        self._learning_rate = learning_rate

    def _make_graph(self, graph):
        """Build the complete model inside ``graph``.

        Returns:
            ``(sentence, tags, loss, training_op, best_score, best_path)`` where
            ``sentence`` and ``tags`` are 1-D int32 placeholders for one
            sentence, ``loss`` is the CRF negative log-likelihood,
            ``training_op`` is one Adam step on it, and ``best_score`` /
            ``best_path`` are the Viterbi decoding results.
        """
        with graph.as_default():
            sentence = tf.placeholder(tf.int32, shape=[None], name='sentence')
            tags     = tf.placeholder(tf.int32, shape=[None], name='tags')
            with tf.variable_scope('embedding'):
                embedding_var = tf.get_variable('var', shape=(self._vocab_size, self._embedding_dim), dtype=tf.float32, trainable=True)
                embeds        = tf.nn.embedding_lookup(embedding_var, sentence)
            with tf.variable_scope('transitions'):
                transitions_val = np.random.randn(self.tag_size, self.tag_size)
                # Nothing may transition *to* START (row) or *from* END (column).
                transitions_val[self._tag_to_ix[self.START_TAG], :] = self._IMPOSSIBLE
                transitions_val[:, self._tag_to_ix[self.END_TAG]]   = self._IMPOSSIBLE
                transitions = \
                    tf.get_variable('var', shape=(self.tag_size, self.tag_size), \
                                    initializer=tf.constant_initializer(transitions_val), dtype=tf.float32, trainable=True)
            with tf.variable_scope('lstm'):
                cell_fw = tf.nn.rnn_cell.BasicLSTMCell(self._hidden_dim)
                cell_bw = tf.nn.rnn_cell.BasicLSTMCell(self._hidden_dim)
                # A leading axis of size 1 is added so the single sentence forms a batch of one.
                (outputs_fw, outputs_bw), _ = \
                    tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, tf.expand_dims(embeds, axis=0), dtype=tf.float32)
                outputs = tf.concat([outputs_fw, outputs_bw], axis=2)
            with tf.variable_scope('hidden2tag'):
                # Drop the batch axis again: feats is (sentence_length, tag_size).
                feats = tf.layers.dense(tf.squeeze(outputs, axis=[0]), self.tag_size, activation=None)
            total_score = self._total_score(feats, transitions)
            path_score  = self._path_score(feats, tags, transitions)
            loss = total_score - path_score
            training_op = tf.train.AdamOptimizer(learning_rate=self._learning_rate).minimize(loss)
            best_score, best_path = self._viterbi_decode(feats, transitions)
            return sentence, tags, loss, training_op, best_score, best_path

    def _path_score(self, feats, tags, transitions):
        """Score of the given tag sequence under the current parameters.

        Sums, for every position, the transition score from the previous tag
        (START for the first position) plus the emission score of the current
        tag, then adds the transition from the last tag into END.
        """
        tags = tf.concat([[self._tag_to_ix[self.START_TAG]], tags], axis=0)

        def position_score(x):
            # x = (current tag, previous tag, emission row for this position)
            return transitions[x[0], x[1]] + x[2][x[0]]

        score = tf.reduce_sum(tf.map_fn(position_score, (tags[1:], tags[:-1], feats), dtype=tf.float32))
        sum_score = score + transitions[self._tag_to_ix[self.END_TAG], tags[-1]]
        return sum_score

    def _total_score(self, feats, transitions):
        """Log partition function: log-sum-exp of the scores of all tag paths (forward algorithm)."""
        def step(forward, feat):
            # Broadcasts to (tag_size, tag_size): entry [i, j] = feat[i] + transitions[i, j] + forward[0, j].
            result = self.log_sum_exp(tf.expand_dims(feat, axis=1) + transitions + forward)
            return tf.expand_dims(result, axis=0)
        # Initial forward variables: all mass on START.
        alpha = np.ones((1, self.tag_size), dtype=np.float32) * self._IMPOSSIBLE
        alpha[:, self._tag_to_ix[self.START_TAG]] = 0.
        return tf.squeeze(self.log_sum_exp(tf.foldl(step, feats, alpha) + transitions[self._tag_to_ix[self.END_TAG], :]))

    def _viterbi_decode(self, feats, transitions):
        """Find the highest-scoring tag path for ``feats``.

        Returns:
            ``(best_score, best_path)``: the scalar score of the best path and a
            1-D int32 tensor with one tag id per position.
        """
        def step(forward, feat):
            # Best score of reaching each tag from any previous tag, plus its emission.
            next_var = forward + transitions
            return tf.reduce_max(next_var, axis=1) + tf.expand_dims(feat, axis=0)
        def step_tag(forward):
            # For each tag, the previous tag that achieves the maximum above.
            return tf.argmax(forward + transitions, axis=1, output_type=tf.int32)
        def step_path(best_tag_id, backpointer):
            return backpointer[best_tag_id]
        init_vvars = np.ones((1, self.tag_size), dtype=np.float32) * self._IMPOSSIBLE
        init_vvars[:, self._tag_to_ix[self.START_TAG]] = 0.
        forward_vars = tf.scan(step, feats, initializer=init_vvars, back_prop=False)
        # Backpointers for position t are computed from the forward variables of position t-1.
        backpointers = tf.map_fn(step_tag, tf.concat([init_vvars[np.newaxis, :, :], forward_vars[:-1]], axis=0), back_prop=False, dtype=tf.int32)
        terminal_var = forward_vars[-1] + transitions[self._tag_to_ix[self.END_TAG]]
        best_score   = tf.reduce_max(terminal_var)
        best_tag_id  = tf.squeeze(tf.argmax(terminal_var, axis=1, output_type=tf.int32))
        # Walk the backpointers from the last position to the first.
        reverse_path = tf.scan(step_path, tf.reverse(backpointers, axis=[0]), initializer=best_tag_id, back_prop=False)
        # Sanity check: following the pointers all the way back must land on START.
        assert_op    = tf.Assert(tf.equal(reverse_path[-1], self._tag_to_ix[self.START_TAG]), [reverse_path])
        with tf.control_dependencies([assert_op]):
            # Drop the leading START and append the final tag to obtain the path in forward order.
            best_path = tf.concat([tf.reverse(reverse_path, axis=[0])[1:], [best_tag_id]], axis=0)
        return best_score, best_path

    def fit(self, sentence_seq, tags_seq, num_epochs=10, model_file=None):
        """Train on paired sentences and tag sequences, one sentence per step.

        Args:
            sentence_seq: iterable of 1-D integer sequences of word ids.
            tags_seq: iterable of 1-D integer sequences of tag ids, aligned
                with ``sentence_seq``.
            num_epochs: number of passes over the data.
            model_file: checkpoint path to save to after training; ``None``
                skips saving.

        Prints the summed loss after every epoch and the checkpoint path when
        a model is saved.
        """
        graph = tf.Graph()
        sentence, tags, loss, training_op, _, _ = self._make_graph(graph)
        saver = None
        if model_file is not None:
            # Saver.save refuses to write into a missing directory; create it up
            # front so a finished training run is not lost at save time.
            self._ensure_parent_dir(model_file)
            with graph.as_default():
                saver = tf.train.Saver()
        # Pair the data once so one-shot iterables survive several epochs.
        examples = list(zip(sentence_seq, tags_seq))
        with tf.Session(graph=graph) as sess:
            sess.run(tf.global_variables_initializer())
            for epoch in range(num_epochs):
                loss_val = 0
                for sentence_val, tags_val in examples:
                    _, loss_val_1 = sess.run([training_op, loss], feed_dict={sentence : sentence_val, tags : tags_val})
                    loss_val += loss_val_1
                print('epoch [%d/%d], loss: %.3f' % (epoch + 1, num_epochs, loss_val))
            if saver is not None:
                model_path = saver.save(sess, model_file)
                print('saved model to %s' % model_path)

    def predict(self, model_file):
        """Restore a saved model and return callables for tagging and cleanup.

        Args:
            model_file: checkpoint path previously written by ``fit``.

        Returns:
            ``(run, close)``: ``run(sents)`` returns one ``[best_score,
            best_path]`` pair per sentence; ``close()`` releases the underlying
            session and must be called when done.
        """
        graph = tf.Graph()
        sentence, _, _, _, best_score, best_path = self._make_graph(graph)
        sess = tf.Session(graph=graph)
        try:
            with graph.as_default():
                tf.train.Saver().restore(sess, model_file)
        except Exception:
            # Do not leak the session when the checkpoint cannot be restored.
            sess.close()
            raise

        def run(sents):
            return [sess.run([best_score, best_path], feed_dict={sentence: sent}) for sent in sents]

        def close():
            sess.close()

        return run, close

In [3]:
from six import iteritems

def prepare_sequence(seq, to_ix):
    """Translate every item of ``seq`` through the mapping ``to_ix``.

    Returns a NumPy array holding ``to_ix[item]`` for each item, in order.
    An item missing from ``to_ix`` raises whatever the mapping raises on lookup.
    """
    mapped = []
    for item in seq:
        mapped.append(to_ix[item])
    return np.array(mapped)

# Make up some training data
training_data = [ (
    "the wall street journal reported today that apple corporation made money".split(),
    "B I I I O O O B I O O".split()
), (
    "georgia tech is a university in georgia".split(),
    "B I O O O O B".split()
) ]

# Give every distinct word an integer id in order of first appearance,
# and keep the inverse mapping for printing.
word_to_ix = {}
for sentence, tags in training_data:
    for word in sentence:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
ix_to_word = {ix: word for word, ix in iteritems(word_to_ix)}

# Tag ids; the tagger requires entries for its START and END markers.
tag_to_ix = { "B": 0, "I": 1, "O": 2, LstmCrfTagger.START_TAG: 3, LstmCrfTagger.END_TAG: 4 }
ix_to_tag = {ix: tag for tag, ix in iteritems(tag_to_ix)}

# Checkpoint location, defined once so training and prediction cannot drift apart.
MODEL_DIR  = 'model'
MODEL_FILE = MODEL_DIR + '/lstm-crt-tagger.ckpt'
# The saver cannot write into a missing directory; create it so the notebook
# also runs on a fresh checkout. MakeDirs succeeds if the directory exists.
tf.gfile.MakeDirs(MODEL_DIR)

model = LstmCrfTagger(vocab_size=len(word_to_ix), tag_to_ix=tag_to_ix)
# Encode words and tags as integer arrays, then split into two parallel tuples.
sent_seq, tags_seq = \
    zip(*[(prepare_sequence(sent, word_to_ix), prepare_sequence(tags, tag_to_ix)) for sent, tags in training_data])

model.fit(sent_seq, tags_seq, model_file=MODEL_FILE, num_epochs=10)
# predict() hands back a tagging function and a function that releases the session.
fn, closure = model.predict(MODEL_FILE)
try:
    scores, paths = zip(*fn(sent_seq))
    for i, (sent, tags, score, path) in enumerate(zip(sent_seq, tags_seq, scores, paths)):
        print(i)
        print('\tsentence: ', ' '.join(prepare_sequence(sent, ix_to_word)))
        print('\ttarget: ', ' '.join(prepare_sequence(tags, ix_to_tag)))
        print('\tprediction: ', ' '.join(prepare_sequence(path, ix_to_tag)))
        print('\tscore: ', score)
finally:
    # Always release the session, even if decoding or printing fails.
    closure()

epoch [1/10], loss: 19.832
epoch [2/10], loss: 13.962
epoch [3/10], loss: 10.032
epoch [4/10], loss: 7.943
epoch [5/10], loss: 6.008
epoch [6/10], loss: 3.794
epoch [7/10], loss: 2.217
epoch [8/10], loss: 1.224
epoch [9/10], loss: 0.658
epoch [10/10], loss: 0.344
saved model to model/lstm-crt-tagger.ckpt
INFO:tensorflow:Restoring parameters from model/lstm-crt-tagger.ckpt
0
	sentence:  the wall street journal reported today that apple corporation made money
	target:  B I I I O O O B I O O
	prediction:  B I I I O O O B I O O
	score:  45.0344
1
	sentence:  georgia tech is a university in georgia
	target:  B I O O O O B
	prediction:  B I O O O O B
	score:  32.1796
