In [1]:
import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
import numpy as np

# Seed NumPy's global RNG so NumPy-side randomness is repeatable across runs.
# Note: this does not seed TensorFlow; no TensorFlow seed is set in this cell.
SEED = 42
np.random.seed(SEED)

# Bare last expression: the notebook displays the installed TensorFlow version.
tf.__version__

'1.5.0'

In [2]:
class LstmCrfTagger(object):
    """Bidirectional-LSTM + CRF sequence tagger built with TensorFlow 1.x graphs.

    Every word of a sentence is represented by the concatenation of
      * the word-level BiLSTM output at that position, and
      * the final hidden states (forward and backward) of a character-level
        BiLSTM run over the characters of that word.
    A dense layer turns that representation into per-tag emission scores.  A
    learned matrix ``transitions[next_tag, prev_tag]`` scores tag bigrams; the
    training loss is the CRF negative log-likelihood of the gold tag path, and
    decoding uses Viterbi.

    ``tag_to_ix`` must contain ``START_TAG`` and ``END_TAG`` because the graph
    indexes the transition matrix with both.

    The model processes one sentence per ``session.run`` call (no batching).
    """
    START_TAG = '<START>'
    END_TAG   = '<END>'
    # Large negative score used in place of -inf to mark forbidden transitions
    # and impossible start states while keeping the arithmetic finite.
    _IMPOSSIBLE = -10000.

    @staticmethod
    def log_sum_exp(vec):
        """Numerically stable ``log(sum(exp(vec)))`` along axis 1 of a 2-D tensor.

        The per-row maximum is subtracted before exponentiating and added back
        afterwards.  The result has the rank of ``vec`` minus one.
        """
        max_score = tf.reduce_max(vec, axis=1)
        return max_score + tf.log(tf.reduce_sum(tf.exp(vec - tf.expand_dims(max_score, axis=1)), axis=1))

    @property
    def tag_size(self):
        """Number of entries in ``tag_to_ix`` (START/END entries included)."""
        return len(self._tag_to_ix)

    def __init__(self, vocab_size, tag_to_ix, char_size, embedding_dim=5, hidden_dim=4, learning_rate=0.1):
        """Store the hyper-parameters; no TensorFlow objects are created here.

        Args:
            vocab_size: number of rows of the word embedding table.
            tag_to_ix: mapping tag -> integer id, including START_TAG and END_TAG.
            char_size: number of distinct characters; the character embedding
                table gets one extra row because fit/predict feed ``ids + 1``.
            embedding_dim: width of both the word and character embeddings.
            hidden_dim: number of units of each LSTM cell (per direction).
            learning_rate: Adam learning rate used by the training op.
        """
        self._vocab_size = vocab_size
        self._tag_to_ix = tag_to_ix
        self._embedding_dim = embedding_dim
        self._hidden_dim = hidden_dim
        self._char_size = char_size
        self._learning_rate = learning_rate

    def _make_graph(self, graph):
        """Build the full model inside ``graph``.

        Returns:
            ``(dropout, sentence, tags, words, words_ln, loss, training_op,
            best_score, best_path)`` where the first five are placeholders:
              * dropout  - scalar fed to ``input_keep_prob`` of every LSTM cell
                           (a keep probability despite its graph name); defaults to 1.
              * sentence - int32 [T], word ids of one sentence.
              * tags     - int32 [T], gold tag ids.
              * words    - int32 [T, max_chars], character ids per word.
              * words_ln - int32 [T], number of real characters per word.
        """
        with graph.as_default():
            sentence = tf.placeholder(tf.int32, shape=[None], name='sentence')
            tags     = tf.placeholder(tf.int32, shape=[None], name='tags')
            words    = tf.placeholder(tf.int32, shape=[None, None], name='words')
            words_ln = tf.placeholder(tf.int32, shape=[None], name='words_length')
            dropout  = tf.placeholder_with_default(1., shape=(), name='dropout_ratio')
            with tf.variable_scope('embedding'):
                embedding_words = tf.get_variable('words', shape=(self._vocab_size, self._embedding_dim), dtype=tf.float32, trainable=True)
                embeds          = tf.nn.embedding_lookup(embedding_words, sentence, name='word_lookup')
            with tf.variable_scope('transitions'):
                # transitions[i, j] scores moving from tag j to tag i.
                transitions_val = np.random.randn(self.tag_size, self.tag_size)
                # Nothing may transition *into* START ...
                transitions_val[self._tag_to_ix[self.START_TAG], :] = self._IMPOSSIBLE
                # ... and nothing may transition *out of* END.
                transitions_val[:, self._tag_to_ix[self.END_TAG]]   = self._IMPOSSIBLE
                transitions = \
                    tf.get_variable('matrix', shape=(self.tag_size, self.tag_size), \
                                    initializer=tf.constant_initializer(transitions_val), dtype=tf.float32, trainable=True)
            with tf.variable_scope('words_lstm'):
                cell_fw = \
                    tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.BasicLSTMCell(self._hidden_dim), input_keep_prob=dropout)
                cell_bw = \
                    tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.BasicLSTMCell(self._hidden_dim), input_keep_prob=dropout)
                # The sentence is run as a batch of size 1, then the batch axis is dropped.
                (outputs_fw, outputs_bw), _ = \
                    tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, tf.expand_dims(embeds, axis=0), dtype=tf.float32)
                words_outputs = tf.squeeze(tf.concat([outputs_fw, outputs_bw], axis=2), axis=[0])
            with tf.variable_scope('embedding'):
                embedding_chars = tf.get_variable('chars', shape=(self._char_size + 1, self._embedding_dim), dtype=tf.float32, trainable=True)
                embeds          = tf.nn.embedding_lookup(embedding_chars, words, name='char_lookup')
            with tf.variable_scope('chars_lstm'):
                cell_fw = \
                    tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.BasicLSTMCell(self._hidden_dim), input_keep_prob=dropout)
                cell_bw = \
                    tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.BasicLSTMCell(self._hidden_dim), input_keep_prob=dropout)
                # Here the words of the sentence form the batch; only the final
                # hidden state (the second element of each LSTM state tuple) is kept.
                _, ((_, output_state_fw), (_, output_state_bw)) = \
                    tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, embeds, sequence_length=words_ln, dtype=tf.float32)
                chars_outputs = tf.concat([output_state_fw, output_state_bw], axis=1)
            outputs = tf.concat([words_outputs, chars_outputs], axis=1)
            with tf.variable_scope('hidden2tag'):
                # Emission scores, one row per word and one column per tag.
                feats = tf.layers.dense(outputs, self.tag_size, activation=None)
            total_score = self._total_score(feats, transitions)
            path_score  = self._path_score(feats, tags, transitions)
            # CRF negative log-likelihood: log-partition minus gold path score.
            loss = total_score - path_score
            training_op = tf.train.AdamOptimizer(learning_rate=self._learning_rate).minimize(loss)
            best_score, best_path = self._viterbi_decode(feats, transitions)
            return dropout, sentence, tags, words, words_ln, loss, training_op, best_score, best_path

    def _path_score(self, feats, tags, transitions):
        """Score of the given tag path: transitions plus emissions, START..END."""
        tags = tf.concat([[self._tag_to_ix[self.START_TAG]], tags], axis=0)
        # x = (current tag, previous tag, emission row of the current word).
        score = tf.reduce_sum(tf.map_fn(lambda x: transitions[x[0], x[1]] + x[2][x[0]], (tags[1:], tags[:-1], feats), dtype=tf.float32))
        sum_score = score + transitions[self._tag_to_ix[self.END_TAG], tags[-1]]
        return sum_score

    def _total_score(self, feats, transitions):
        """Log-partition function over all tag paths (CRF forward algorithm)."""
        def step(forward, feat):
            # feat as a column broadcasts over previous tags, forward (1, tags)
            # broadcasts over next tags; reducing axis 1 sums out the previous tag.
            result = self.log_sum_exp(tf.expand_dims(feat, axis=1) + transitions + forward)
            return tf.expand_dims(result, axis=0)
        # Before the first word only START is reachable.
        alpha = np.ones((1, self.tag_size), dtype=np.float32) * self._IMPOSSIBLE
        alpha[:, self._tag_to_ix[self.START_TAG]] = 0.
        return tf.squeeze(self.log_sum_exp(tf.foldl(step, feats, alpha) + transitions[self._tag_to_ix[self.END_TAG], :]))

    def _viterbi_decode(self, feats, transitions):
        """Return ``(best_score, best_path)`` of the highest scoring tag path."""
        def step(forward, feat):
            # Best score of reaching each tag at this word, kept as shape (1, tags).
            next_var = forward + transitions
            return tf.reduce_max(next_var, axis=1) + tf.expand_dims(feat, axis=0)
        def step_tag(forward):
            # For each tag: the previous tag that maximises the score.
            return tf.argmax(forward + transitions, axis=1, output_type=tf.int32)
        def step_path(best_tag_id, backpointer):
            return backpointer[best_tag_id]
        init_vvars = np.ones((1, self.tag_size), dtype=np.float32) * self._IMPOSSIBLE
        init_vvars[:, self._tag_to_ix[self.START_TAG]] = 0.
        forward_vars = tf.scan(step, feats, initializer=init_vvars, back_prop=False)
        # Backpointers for word t are computed from the scores *before* word t,
        # hence the initial scores are prepended and the last row is dropped.
        backpointers = tf.map_fn(step_tag, tf.concat([init_vvars[np.newaxis, :, :], forward_vars[:-1]], axis=0), back_prop=False, dtype=tf.int32)
        terminal_var = forward_vars[-1] + transitions[self._tag_to_ix[self.END_TAG]]
        best_score   = tf.reduce_max(terminal_var)
        best_tag_id  = tf.squeeze(tf.argmax(terminal_var, axis=1, output_type=tf.int32))
        # Follow the backpointers from the last word to the first one.
        reverse_path = tf.scan(step_path, tf.reverse(backpointers, axis=[0]), initializer=best_tag_id, back_prop=False)
        # Sanity check: the walk back must end on START.
        assert_op    = tf.Assert(tf.equal(reverse_path[-1], self._tag_to_ix[self.START_TAG]), [reverse_path])
        with tf.control_dependencies([assert_op]):
            # Drop the leading START and append the tag of the last word.
            best_path = tf.concat([tf.reverse(reverse_path, axis=[0])[1:], [best_tag_id]], axis=0)
        return best_score, best_path

    def fit(self, sentence_seq, tags_seq, words_seq, words_len_seq, num_epochs=10, model_file=None, drop_out=.5):
        """Train on the given sentences, one sentence per optimisation step.

        Args:
            sentence_seq: iterable of word-id arrays, one per sentence.
            tags_seq: iterable of tag-id arrays aligned with ``sentence_seq``.
            words_seq: iterable of [num_words, max_chars] character-id arrays;
                1 is added to every entry before feeding, so the arrays must
                support ``+ 1`` (e.g. NumPy arrays).
            words_len_seq: iterable of per-word character counts.
            num_epochs: number of passes over the data.
            model_file: checkpoint path; when given, the variables are saved
                there after the last epoch.
            drop_out: value fed to the LSTM cells' ``input_keep_prob`` during
                training, i.e. the probability of *keeping* an input.

        Prints the summed loss of every epoch and, if saving, the checkpoint path.
        """
        if model_file is not None:
            # Make sure the checkpoint directory exists *before* training: saving
            # can fail on a missing parent directory, and that failure would only
            # surface after every epoch has already been run.
            import os
            checkpoint_dir = os.path.dirname(model_file)
            if checkpoint_dir:
                tf.gfile.MakeDirs(checkpoint_dir)
        graph = tf.Graph()
        dropout, sentence, tags, words, words_ln, loss, training_op, _, _ = self._make_graph(graph)
        with graph.as_default():
            saver = tf.train.Saver() if model_file is not None else None
        with tf.Session(graph=graph) as sess:
            sess.run(tf.global_variables_initializer())
            for epoch in range(num_epochs):
                loss_val = 0
                for sentence_val, tags_val, words_val, words_len_val in zip(sentence_seq, tags_seq, words_seq, words_len_seq):
                    feed_dict = { dropout: drop_out, sentence : sentence_val, tags : tags_val, words : words_val + 1, words_ln : words_len_val }
                    _, loss_val_1 = sess.run([training_op, loss], feed_dict=feed_dict)
                    loss_val += loss_val_1
                print('epoch [%d/%d], loss: %.3f' % (epoch + 1, num_epochs, loss_val))
            if model_file is not None:
                model_path = saver.save(sess, model_file)
                print('saved model to %s' % model_path)

    def predict(self, model_file):
        """Restore a checkpoint and return ``(decode, close)``.

        ``decode(sents, ws, wls)`` runs Viterbi decoding on each sentence and
        returns a list of ``[best_score, best_path]`` results; ``close()``
        releases the underlying session and must be called when done.

        If restoring the checkpoint fails, the session is closed before the
        exception is re-raised, because the caller never receives ``close``.
        """
        graph = tf.Graph()
        _, sentence, _, words, words_len, _, _, best_score, best_path = self._make_graph(graph)
        sess = tf.Session(graph=graph)
        try:
            with graph.as_default():
                tf.train.Saver().restore(sess, model_file)
        except Exception:
            sess.close()
            raise
        return \
            lambda sents, ws, wls : [sess.run([best_score, best_path], feed_dict={sentence: sent, words : w + 1, words_len : wl}) for sent, w, wl in zip(sents, ws, wls)], \
            lambda : sess.close()

In [3]:
from six import iteritems

def prepare_sequence(seq, to_ix):
    """Look up every item of ``seq`` in ``to_ix`` and return the results as a NumPy array.

    The order of ``seq`` is preserved; an item missing from ``to_ix`` raises
    ``KeyError``.
    """
    mapped = []
    for item in seq:
        mapped.append(to_ix[item])
    return np.array(mapped)

# Toy corpus: each entry pairs a tokenised sentence with its B/I/O tags.
training_data = [
    (
        "the wall street journal reported today that apple corporation made money".split(),
        "B I I I O O O B I O O".split(),
    ),
    (
        "georgia tech is a university in georgia".split(),
        "B I O O O O B".split(),
    ),
]

# Word vocabulary: ids are handed out in order of first appearance.
word_to_ix = {}
for sentence, tags in training_data:
    for word in sentence:
        word_to_ix.setdefault(word, len(word_to_ix))
ix_to_word = {ix: word for word, ix in iteritems(word_to_ix)}

# Character vocabulary, built from the characters of every known word.
char_to_ix = {}
for word in word_to_ix.keys():
    for char in word:
        char_to_ix.setdefault(char, len(char_to_ix))
ix_to_char = {ix: char for char, ix in iteritems(char_to_ix)}

# Tag vocabulary: the three B/I/O labels plus the tagger's START/END markers.
tag_to_ix = {
    "B": 0,
    "I": 1,
    "O": 2,
    LstmCrfTagger.START_TAG: 3,
    LstmCrfTagger.END_TAG: 4,
}
ix_to_tag = {ix: tag for tag, ix in iteritems(tag_to_ix)}

def prepare_input(sentence, tags):
    """Encode one (sentence, tags) pair into the four arrays the tagger is fed.

    Uses the module-level ``word_to_ix``, ``tag_to_ix`` and ``char_to_ix``
    vocabularies.

    Args:
        sentence: sequence of word strings.
        tags: sequence of tag strings aligned with ``sentence``.

    Returns:
        ``(sent_seq, tags_seq, word_seq, word_len_seq)`` where
          * sent_seq     - word ids, one per word,
          * tags_seq     - tag ids, one per word,
          * word_seq     - character ids per word, right-padded with -1 to the
                           length of the longest word,
          * word_len_seq - 1-D array with the number of characters of each word.
    """
    sent_seq = prepare_sequence(sentence, word_to_ix)
    tags_seq = prepare_sequence(tags, tag_to_ix)
    word_seq = \
        tf.keras.preprocessing.sequence.pad_sequences(
            [prepare_sequence(word, char_to_ix) for word in sentence],
            padding='post',
            value=-1)
    # Each word contributes one character id per character, so its unpadded
    # length is simply len(word).  The result is deliberately kept 1-D even for
    # a one-word sentence: squeezing it would yield a 0-d scalar, which does not
    # match a per-word lengths vector.
    word_len_seq = np.array([len(word) for word in sentence])
    return sent_seq, tags_seq, word_seq, word_len_seq

# Build the tagger with sizes taken from the vocabularies above.
model = LstmCrfTagger(
    vocab_size=len(word_to_ix),
    tag_to_ix=tag_to_ix,
    char_size=len(char_to_ix),
)

# Encode every training pair, then transpose into four parallel tuples.
encoded_examples = [prepare_input(sent, tags) for sent, tags in training_data]
sent_seq, tags_seq, word_seq, word_len_seq = zip(*encoded_examples)

model.fit(
    sent_seq, tags_seq, word_seq, word_len_seq,
    model_file='model/lstm-crf-tagger.ckpt',
    num_epochs=10,
)

# predict() returns a decoding function and a callable that closes its session;
# the finally clause guarantees the session is released.
fn, closure = model.predict('model/lstm-crf-tagger.ckpt')
try:
    decoded = fn(sent_seq, word_seq, word_len_seq)
    scores, paths = zip(*decoded)
    examples = zip(sent_seq, tags_seq, scores, paths)
    for i, (sent, tags, score, path) in enumerate(examples):
        print(i)
        print('\tsentence: ', ' '.join(prepare_sequence(sent, ix_to_word)))
        print('\ttarget: ', ' '.join(prepare_sequence(tags, ix_to_tag)))
        print('\tprediction: ', ' '.join(prepare_sequence(path, ix_to_tag)))
        print('\tscore: ', score)
finally:
    closure()

epoch [1/10], loss: 21.324
epoch [2/10], loss: 13.394
epoch [3/10], loss: 11.670
epoch [4/10], loss: 8.206
epoch [5/10], loss: 5.775
epoch [6/10], loss: 4.223
epoch [7/10], loss: 2.475
epoch [8/10], loss: 2.080
epoch [9/10], loss: 1.824
epoch [10/10], loss: 3.460
saved model to model/lstm-crf-tagger.ckpt
INFO:tensorflow:Restoring parameters from model/lstm-crf-tagger.ckpt
0
	sentence:  the wall street journal reported today that apple corporation made money
	target:  B I I I O O O B I O O
	prediction:  B I I I I O O B I O O
	score:  36.250656
1
	sentence:  georgia tech is a university in georgia
	target:  B I O O O O B
	prediction:  B I O O O O B
	score:  30.8023
