In [1]:
import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
import numpy as np

tf.set_random_seed(42)
np.random.seed(42)

tf.__version__

'1.5.0'

In [2]:
class HierarchicalAttentionNetwork(object):
    def __init__(self, embedding_matrix, num_class, hidden_dim = None):
        super(HierarchicalAttentionNetwork, self).__init__()
        self._embedding_matrix = embedding_matrix
        self._num_class = num_class
        self._hidden_dim = \
            hidden_dim if hidden_dim is not None else self._embedding_matrix.shape[1]
    def _make_graph_batch(self, graph):
        with graph.as_default():
            words = tf.placeholder(tf.int32, [None, None, None], name='words')
            words_length = tf.placeholder(tf.int32, [None, None], name='words_length')
            sentences_length = tf.placeholder(tf.int32, [None], name='sentences_length')
            labels = tf.placeholder(tf.int32, [None], name='labels')
            
            with tf.variable_scope('embeddings'):
                embedding = \
                    tf.get_variable('parameter', 
                                    shape=self._embedding_matrix.shape, 
                                    initializer=tf.constant_initializer(embedding_matrix), 
                                    dtype=tf.float32, trainable=True)
                embedded  = tf.nn.embedding_lookup(embedding, words, name='lookup')
            with tf.variable_scope('words_lstm'):
                cell_fw = tf.nn.rnn_cell.GRUCell(num_units=self._hidden_dim)
                cell_bw = tf.nn.rnn_cell.GRUCell(num_units=self._hidden_dim)
                def fn(inp):
                    (outputs_fw, outputs_bw), _ = \
                        tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, inp[0], sequence_length=inp[1], dtype=tf.float32)
                    return tf.concat([outputs_fw, outputs_bw], axis=2)
                outputs = tf.map_fn(fn, (embedded, words_length), dtype=tf.float32)
            with tf.variable_scope('words_attention'):
                hidden = tf.layers.dense(outputs, units=self._hidden_dim * 2, activation=tf.nn.tanh)
                attention = tf.layers.dense(outputs, units=1, activation=None)
                attention = tf.transpose(tf.nn.softmax(tf.transpose(attention, perm=[0, 1, 3, 2])), perm=[0, 1, 3, 2])
            outputs = tf.reduce_sum(outputs * attention, axis=2)
            with tf.variable_scope('sentence_lstm'):
                cell_fw = tf.nn.rnn_cell.GRUCell(num_units=self._hidden_dim)
                cell_bw = tf.nn.rnn_cell.GRUCell(num_units=self._hidden_dim)
                (outputs_fw, outputs_bw), _ = \
                    tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, outputs, sequence_length=sentences_length, dtype=tf.float32)
            outputs = tf.concat([outputs_fw, outputs_bw], axis=2)
            with tf.variable_scope('sentence_attention'):
                hidden = tf.layers.dense(outputs, units=self._hidden_dim * 2, activation=tf.nn.tanh)
                attention = tf.layers.dense(hidden, units=1, activation=None)
                attention = tf.transpose(tf.nn.softmax(tf.transpose(attention, perm=[0, 2, 1])), perm=[0, 2, 1])
            outputs = tf.reduce_sum(outputs * attention, axis=1)
            logits = tf.layers.dense(outputs, units=self._num_class, activation=None)
            loss = tf.reduce_sum(tf.one_hot(labels, self._num_class) * tf.nn.softmax(logits), name='loss')
#             loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)
#             loss = tf.reduce_mean(loss)
            training_op = tf.train.AdamOptimizer(learning_rate=0.01).minimize(loss)
            return words, words_length, sentences_length, labels, logits, loss, training_op
    def _make_graph(self, graph):
        with graph.as_default():
            words  = tf.placeholder(tf.int32, [None, None], name='words')
            length = tf.placeholder(tf.int32, [None], name='length')
            labels = tf.placeholder(tf.int32, (), name='labels') 
            
            with tf.variable_scope('embeddings'):
                embedding = \
                    tf.get_variable('parameter', 
                                    shape=self._embedding_matrix.shape, 
                                    initializer=tf.constant_initializer(embedding_matrix), 
                                    dtype=tf.float32, trainable=True)
                embedded  = tf.nn.embedding_lookup(embedding, words, name='lookup')
            with tf.variable_scope('words_lstm'):
                cell_fw = tf.nn.rnn_cell.GRUCell(num_units=self._hidden_dim)
                cell_bw = tf.nn.rnn_cell.GRUCell(num_units=self._hidden_dim)
                (outputs_fw, outputs_bw), _ = \
                    tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, embedded, sequence_length=length, dtype=tf.float32)
            outputs = tf.concat([outputs_fw, outputs_bw], axis=2)
            with tf.variable_scope('words_attention'):
                hidden = \
                    tf.layers.dense(outputs, units=self._hidden_dim * 2, 
                                    activation=tf.nn.tanh, kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))
                attention = \
                    tf.layers.dense(outputs, units=1, 
                                    activation=None, kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))
                attention = tf.transpose(tf.nn.softmax(tf.transpose(attention, perm=[0, 2, 1])), perm=[0, 2, 1])
            sentence_embedding = tf.reduce_sum(outputs * attention, axis=1)
            sentence_embedding = tf.expand_dims(sentence_embedding, axis=0)
            
            with tf.variable_scope('sentence_lstm'):
                cell_fw = tf.nn.rnn_cell.GRUCell(num_units=self._hidden_dim)
                cell_bw = tf.nn.rnn_cell.GRUCell(num_units=self._hidden_dim)
                (outputs_fw, outputs_bw), _ = \
                    tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, sentence_embedding, dtype=tf.float32)
            outputs = tf.squeeze(tf.concat([outputs_fw, outputs_bw], axis=2), axis=[0])
            with tf.variable_scope('sentence_attention'):
                hidden = \
                    tf.layers.dense(outputs, units=self._hidden_dim * 2, 
                                    activation=tf.nn.tanh, kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))
                attention = \
                    tf.layers.dense(hidden, units=1, 
                                    activation=None, kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))
                attention = tf.transpose(tf.nn.softmax(tf.transpose(attention)))                
            outputs = tf.reduce_sum(outputs * attention, axis=0)
            outputs = tf.expand_dims(outputs, axis=0)
            logits = tf.layers.dense(outputs, units=self._num_class, activation=None)
            loss = -tf.log(tf.nn.softmax(logits)[:, labels], name='loss')
#             loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=tf.expand_dims(labels, axis=0))
#             loss = tf.squeeze(loss, axis=[0])
            training_op = tf.train.AdamOptimizer(learning_rate=0.01).minimize(loss)
            return words, length, labels, logits, loss, training_op
    def fit(self, words_seq, length_seq, labels_seq, num_epochs=10, model=None):
        graph = tf.Graph()
        words, length, labels, _, loss, training_op = self._make_graph(graph)
        
        with graph.as_default():
            saver = tf.train.Saver() if model is not None else None
        
        with tf.Session(graph=graph) as sess:
            sess.run(tf.global_variables_initializer())
            for epoch in range(num_epochs):
                for words_val, length_val, labels_val in zip(words_seq, length_seq, labels_seq):
                    feed_dict = { words : words_val, length : length_val, labels : labels_val }
                    sess.run(training_op, feed_dict=feed_dict)
                loss_total = []
                for words_val, length_val, labels_val in zip(words_seq, length_seq, labels_seq):
                    feed_dict = { words : words_val, length : length_val, labels : labels_val }
                    loss_val = sess.run(loss, feed_dict=feed_dict)
                    loss_total.append(loss_val)
                print('Epoch [%d/%d], Loss: %.3f' % (epoch + 1, num_epochs, np.sum(loss_val)))
            if model is not None:
                saver.save(sess, model)
                print('saved model to %s' % model)                
    def evaluate(self, words_seq, length_seq, labels_seq, model):
        graph = tf.Graph()
        words, length, labels, logits, _, _ = self._make_graph(graph)
        with graph.as_default():
            prediction = \
                tf.to_float(tf.nn.in_top_k(logits, tf.expand_dims(labels, axis=0), k=1))
        with tf.Session(graph=graph) as sess:
            tf.train.Saver().restore(sess, model)
            preds = []
            for words_val, length_val, labels_val in zip(words_seq, length_seq, labels_seq):
                feed_dict = { words : words_val, length : length_val, labels : labels_val }
                logits_val, pred_val = sess.run([logits, prediction], feed_dict=feed_dict)
                preds.append(pred_val)
        return np.mean(preds)

In [3]:
from sklearn.datasets import fetch_20newsgroups
import spacy
# from gensim.scripts.glove2word2vec import glove2word2vec
# from gensim.models import KeyedVectors

train_data = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
test_data  = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))
nlp = spacy.load('en')

# glove2word2vec('data/glove.6B.100d.txt', 'data/glove.6B.100d.converted.txt')
# embeddings = KeyedVectors.load_word2vec_format('data/glove.6B.100d.converted.txt')

print('train_data shape: ', train_data.target.shape[0])
print('test_data shape: ', test_data.target.shape[0])
# print('embeddings shape: ', embeddings.syn0.shape)

train_data shape:  11314
test_data shape:  7532


In [4]:
tag_to_ix = dict([(n, i) for i, n in enumerate(train_data.target_names)])

train_data_size = 100
test_data_size = 100

batch_size = train_data_size / 10

def prepare_embeddings(data):
    embeddings = dict()
    for i, doc in enumerate(data):
        for token in nlp(doc):
            if token.is_punct: continue
            w = token.lower_.strip()
            if not w: continue
            embeddings[w] = len(embeddings)
#         if (i + 1) % batch_size == 0: print('processed %d documents' % (i + 1))
    return embeddings

def prepare_data(data, labels, embeddings):
    sequences_seq = []
    for i, doc in enumerate(data):
        sequences = []
        for sent in nlp(doc).sents:
            sequence = []
            for token in sent:
                if token.is_punct: continue
                w = token.lower_.strip()
                if not w: continue
                
                if w in embeddings: sequence.append(embeddings[w] + 1)
                else: sequence.append(0)
            if sequence: sequences.append(sequence)
        if sequences: sequences_seq.append(sequences)
#         if (i + 1) % batch_size == 0: print('processed %d documents' % (i + 1))
                
    words_seq = []
    length_seq = []
    label_seq = []
    for i, (sequences, label) in enumerate(zip(sequences_seq, labels)):
        words = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post', value=-1)
        length = np.apply_along_axis(lambda ws : next(i for i, d in enumerate(ws) if d < 0), axis=1, arr=np.c_[words, np.ones((len(words), 1)) * -1])
        words_seq.append(words + np.where(words < 0, 1, 0).astype(np.int32))
        length_seq.append(length)
        label_seq.append(label)
#         if (i + 1) % batch_size == 0: print('padded %d documents' % (i + 1))
    return words_seq, length_seq, label_seq

# print('preparing embeddings')
embeddings = prepare_embeddings(train_data.data[:train_data_size])
# print('converting training documents')
train_words_seq, train_length_seq, train_label_seq = \
    prepare_data(train_data.data[:train_data_size], train_data.target[:train_data_size], embeddings)
# print('converting test documents')
test_words_seq, test_length_seq, test_label_seq = \
    prepare_data(test_data.data[:test_data_size], test_data.target[:test_data_size], embeddings)

In [5]:
embedding_matrix = np.random.randn(len(embeddings) + 2, 10)
model = HierarchicalAttentionNetwork(embedding_matrix, len(tag_to_ix), hidden_dim=10)
model.fit(train_words_seq, train_length_seq, train_label_seq, num_epochs=20, model='models/han.ckpt')

Epoch [1/20], Loss: 2.980
Epoch [2/20], Loss: 2.873
Epoch [3/20], Loss: 2.589
Epoch [4/20], Loss: 2.291
Epoch [5/20], Loss: 2.112
Epoch [6/20], Loss: 1.101
Epoch [7/20], Loss: 0.480
Epoch [8/20], Loss: 0.147
Epoch [9/20], Loss: 0.074
Epoch [10/20], Loss: 0.058
Epoch [11/20], Loss: 0.024
Epoch [12/20], Loss: 0.028
Epoch [13/20], Loss: 0.019
Epoch [14/20], Loss: 0.014
Epoch [15/20], Loss: 0.012
Epoch [16/20], Loss: 0.010
Epoch [17/20], Loss: 0.009
Epoch [18/20], Loss: 0.008
Epoch [19/20], Loss: 0.007
Epoch [20/20], Loss: 0.006
saved model to models/han.ckpt


In [6]:
model = HierarchicalAttentionNetwork(embedding_matrix, len(tag_to_ix), hidden_dim=10)
print('accuracy on training data', 
      model.evaluate(train_words_seq, train_length_seq, train_label_seq, model='models/han.ckpt'))
print('accuracy on test data', 
     model.evaluate(test_words_seq, test_length_seq, test_label_seq, model='models/han.ckpt'))

INFO:tensorflow:Restoring parameters from models/han.ckpt
accuracy on training data 1.0
INFO:tensorflow:Restoring parameters from models/han.ckpt
accuracy on test data 0.0707071
