# Document Similarity - w/ Attention

In [1]:
from __future__ import division, print_function
from keras import backend as K
from keras.layers import Input
from keras.layers.core import Dense, Dropout, Lambda
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU
from keras.layers.pooling import GlobalMaxPooling1D
from keras.layers.wrappers import TimeDistributed, Bidirectional
from keras.layers.merge import concatenate
from keras.models import Model
from keras.optimizers import SGD
from keras.utils import to_categorical
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
import custom_attn
import logging
import matplotlib.pyplot as plt
import nltk
import numpy as np
import os
%matplotlib inline

Using TensorFlow backend.


In [2]:
# --- Input files, all resolved relative to DATA_DIR ---
DATA_DIR = "../data"
VOCAB_FILE = os.path.join(DATA_DIR, "ng-vocab.tsv")
GLOVE_FILE = os.path.join(DATA_DIR, "glove.840B.300d.txt")
DOCSIM_IDLABELS = os.path.join(DATA_DIR, "docsim-idlabels.tsv")
DOCSIM_TEXTS = os.path.join(DATA_DIR, "docsim-texts.tsv")

# --- Vocabulary ---
# Vocabulary loading stops at the first word whose count is <= this value.
MIN_OCCURS = 5

# --- Document matrix shape ---
# (sentences per document, words per sentence); covers about 95% of input data
MAX_SENTS, MAX_WORDS = 40, 60

# --- Model sizes ---
WORD_EMBED_SIZE = 300  # width of one word vector (one row of the embedding matrix)
SENT_EMBED_SIZE = 100  # GRU units per direction in the sentence encoder
DOC_EMBED_SIZE = 50    # GRU units per direction in the document encoder
NUM_CLASSES = 2        # width of the one-hot label and of the softmax output

# --- Training ---
BATCH_SIZE, NUM_EPOCHS = 64, 10

logging.basicConfig()

## Load Vocabulary

In [3]:
# Build the word -> id lookup. Ids 0 and 1 are reserved for the padding and
# unknown-word tokens, so words read from the file are numbered from 2 upwards.
word2id = {"PAD": 0, "UNK": 1}
with open(VOCAB_FILE, "rb") as fvocab:
    for line in fvocab:
        word, count = line.strip().split("\t")
        # Stop at the first word at or below the threshold. This keeps every
        # frequent word only if the file is sorted by descending count
        # (assumed -- TODO confirm against the script that writes the file).
        if int(count) <= MIN_OCCURS:
            break
        # Hand out the next free id. Using the raw line index instead would
        # give the first two words ids 0 and 1, colliding with PAD and UNK.
        # setdefault also leaves an already-known word (including the two
        # reserved tokens) on its existing id.
        word2id.setdefault(word, len(word2id))
id2word = {v: k for k, v in word2id.items()}
vocab_size = len(word2id)
print("vocab_size: {:d}".format(vocab_size))

vocab_size: 40730


## Load GloVe Embeddings

In [4]:
# Embedding matrix with one row per word id. Every row starts as zeros,
# row 1 is seeded with a random vector, and any row whose word occurs in the
# GloVe file is then overwritten with that word's GloVe vector.
E = np.zeros((vocab_size, WORD_EMBED_SIZE))
E[1] = np.random.random(WORD_EMBED_SIZE)
with open(GLOVE_FILE, "rb") as fglove:
    for line in fglove:
        cols = line.strip().split(" ")
        # The vector is the last WORD_EMBED_SIZE columns and whatever comes
        # before it is the token. Slicing from the right keeps a token that
        # itself contains spaces from being misread as the start of the
        # vector; such tokens simply fail the vocabulary lookup below.
        word = " ".join(cols[:-WORD_EMBED_SIZE])
        if word not in word2id:
            continue
        E[word2id[word]] = np.array([float(x) for x in cols[-WORD_EMBED_SIZE:]])
print(E.shape)

(40730, 300)


## Compute Document Vectors

In [5]:
def pad_or_truncate(xs, maxlen):
    """Return ``xs`` adjusted to exactly ``maxlen`` items.

    A list that is too long keeps only its last ``maxlen`` items; a list
    that is too short is padded on the left with the string ``"PAD"``.
    A list that already has the right length is returned unchanged.
    """
    shortfall = maxlen - len(xs)
    if shortfall > 0:
        # too short: prepend the padding tokens
        return ["PAD"] * shortfall + xs
    if shortfall < 0:
        # too long: drop items from the front, keep the tail
        return xs[-shortfall:]
    return xs

xs = ["The", "cat", "fought", "like", "a", "mouse"]
print(pad_or_truncate(xs, 3))
print(pad_or_truncate(xs, 7))

['like', 'a', 'mouse']
['PAD', 'The', 'cat', 'fought', 'like', 'a', 'mouse']


In [6]:
# Encode every document as a (MAX_SENTS, MAX_WORDS) matrix of word ids,
# keyed by its integer record id.
docid2mat = {}
unk_id = word2id["UNK"]
with open(DOCSIM_TEXTS, "rb") as ftext:
    for line in ftext:
        # Split on the first tab only, so a tab inside the text cannot break
        # the two-way unpacking.
        rec_id, text = line.strip().split("\t", 1)
        M = np.zeros((MAX_SENTS, MAX_WORDS))
        # Short documents are left-padded with "PAD" placeholder sentences;
        # presumably those tokenize to the single PAD token -- TODO confirm.
        sents = pad_or_truncate(nltk.sent_tokenize(text), MAX_SENTS)
        for sid, sent in enumerate(sents):
            words = pad_or_truncate(nltk.word_tokenize(sent), MAX_WORDS)
            # `words` has exactly MAX_WORDS entries, so it fills one full row.
            # Out-of-vocabulary words fall back to the UNK id.
            M[sid] = [word2id.get(word, unk_id) for word in words]
        docid2mat[int(rec_id)] = M
print(len(docid2mat), docid2mat[list(docid2mat.keys())[0]].shape)

1885 (40, 60)


## Extract Label and DocID pairs

In [7]:
# Read (label, left doc id, right doc id) triples. X holds the id pairs and
# Y the one-hot encoded labels.
xdata, ydata = [], []
# The context manager closes the file; the handle was previously never closed.
with open(DOCSIM_IDLABELS, "rb") as fidl:
    for line in fidl:
        label, docid_left, docid_right = line.strip().split("\t")
        xdata.append((int(docid_left), int(docid_right)))
        ydata.append(int(label))
X = np.array(xdata)
Y = to_categorical(np.array(ydata), num_classes=NUM_CLASSES)
print(X.shape, Y.shape)

(351557, 2) (351557, 2)


## Partition into training, validation and test

In [8]:
# Fixed seed so that both splits come out the same on every run; without it
# the train/validation/test membership changes each time the cell executes.
SPLIT_SEED = 42

# 70% train+validation vs 30% test, then 90% train vs 10% validation.
Xtv, Xtest, Ytv, Ytest = train_test_split(X, Y, train_size=0.7,
                                          random_state=SPLIT_SEED)
Xtrain, Xval, Ytrain, Yval = train_test_split(Xtv, Ytv, train_size=0.9,
                                              random_state=SPLIT_SEED)
print(Xtrain.shape, Ytrain.shape, Xval.shape, Yval.shape, 
      Xtest.shape, Ytest.shape)

(221480, 2) (221480, 2) (24609, 2) (24609, 2) (105468, 2) (105468, 2)


## Build Data Generator

In [9]:
def datagen(X, Y, docid2mat, batch_size=BATCH_SIZE, shuffle=True):
    """Yield ``([left_batch, right_batch], label_batch)`` tuples forever.

    Args:
        X: array whose rows are ``(left doc id, right doc id)`` pairs.
        Y: array of labels, row-aligned with ``X``.
        docid2mat: mapping from doc id to its (MAX_SENTS, MAX_WORDS) matrix.
        batch_size: number of pairs per yielded batch.
        shuffle: when true (the default) the rows are visited in a fresh
            random order on every pass; when false they are visited in
            their stored order.

    Each pass yields ``len(X) // batch_size`` full batches; rows left over
    after the last full batch are not yielded in that pass.

    Raises:
        ValueError: on first use, if there are fewer rows than ``batch_size``
            (the loop would otherwise spin forever without yielding).
    """
    num_recs = X.shape[0]
    num_batches = num_recs // batch_size
    if num_batches == 0:
        raise ValueError(
            "need at least batch_size={:d} records, got {:d}".format(
                batch_size, num_recs))
    while True:
        if shuffle:
            indices = np.random.permutation(num_recs)
        else:
            indices = np.arange(num_recs)
        for bid in range(num_batches):
            batch_ids = indices[bid * batch_size : (bid + 1) * batch_size]
            Xbatch_l = np.zeros((batch_size, MAX_SENTS, MAX_WORDS))
            Xbatch_r = np.zeros((batch_size, MAX_SENTS, MAX_WORDS))
            # Look up the id matrix of each document in the pair.
            for idx, (docid_l, docid_r) in enumerate(X[batch_ids, :]):
                Xbatch_l[idx] = docid2mat[docid_l]
                Xbatch_r[idx] = docid2mat[docid_r]
            Ybatch = Y[batch_ids, :]
            yield [Xbatch_l, Xbatch_r], Ybatch

# Smoke test: pull one batch and check its shapes.
train_gen = datagen(Xtrain, Ytrain, docid2mat)
# The built-in next() works on Python 2 and 3; the .next() method is Python 2 only.
[Xbatch_left, Xbatch_right], Ybatch = next(train_gen)
print(Xbatch_left.shape, Xbatch_right.shape, Ybatch.shape)

(64, 40, 60) (64, 40, 60) (64, 2)


## Define Network

### Sentence Networks

In [10]:
# Left-hand sentence encoder: a row of MAX_WORDS word ids is embedded
# (weights initialised from E) and summarised by a bidirectional GRU that
# returns only its final state.
sent_in_l = Input(dtype="int32", shape=(MAX_WORDS,))

word_embedding_l = Embedding(input_dim=vocab_size,
                             output_dim=WORD_EMBED_SIZE,
                             weights=[E])
sent_emb_l = word_embedding_l(sent_in_l)

word_gru_l = GRU(SENT_EMBED_SIZE, return_sequences=False)
sent_enc_l = Bidirectional(word_gru_l)(sent_emb_l)

sent_model_l = Model(inputs=sent_in_l, outputs=sent_enc_l)
sent_model_l.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 60)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 60, 300)           12219000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200)               240600    
Total params: 12,459,600
Trainable params: 12,459,600
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Right-hand sentence encoder: same layout as the left-hand one, built from
# its own layer instances: word ids -> embedding (initialised from E) ->
# bidirectional GRU returning only its final state.
sent_in_r = Input(dtype="int32", shape=(MAX_WORDS,))

word_embedding_r = Embedding(input_dim=vocab_size,
                             output_dim=WORD_EMBED_SIZE,
                             weights=[E])
sent_emb_r = word_embedding_r(sent_in_r)

word_gru_r = GRU(SENT_EMBED_SIZE, return_sequences=False)
sent_enc_r = Bidirectional(word_gru_r)(sent_emb_r)

sent_model_r = Model(inputs=sent_in_r, outputs=sent_enc_r)
sent_model_r.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 60)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 60, 300)           12219000  
_________________________________________________________________
bidirectional_2 (Bidirection (None, 200)               240600    
Total params: 12,459,600
Trainable params: 12,459,600
Non-trainable params: 0
_________________________________________________________________


### Document Networks

In [36]:
def sum_over_axis(X, axis):
    """Reduce ``X`` along ``axis`` using the backend mean.

    NOTE(review): despite its name this averages rather than sums. Confirm
    which reduction is intended before renaming or changing it.
    """
    reduced = K.mean(X, axis=axis)
    return reduced

In [13]:
def _encode_document(sent_model):
    """Build one document tower on top of ``sent_model``.

    Returns the tower's input tensor, the per-sentence encodings obtained by
    applying ``sent_model`` to every sentence slot, and the bidirectional GRU
    output sequence computed over those encodings.
    """
    doc_in = Input(shape=(MAX_SENTS, MAX_WORDS), dtype="int32")
    doc_emb = TimeDistributed(sent_model)(doc_in)
    doc_gru = GRU(DOC_EMBED_SIZE, return_sequences=True)
    doc_enc = Bidirectional(doc_gru)(doc_emb)
    return doc_in, doc_emb, doc_enc

# LHS and RHS documents, each with its own sentence model
doc_in_l, doc_emb_l, doc_enc_l = _encode_document(sent_model_l)
doc_in_r, doc_emb_r, doc_enc_r = _encode_document(sent_model_r)

# Attention over the two encoded sequences.
# NOTE(review): the training run recorded in this notebook fails with
# "Incompatible shapes: [64,40,100] vs. [64,40]", raised from the multiply
# inside this layer's call(). The layer lives in custom_attn.py, which is not
# part of this notebook, so the cause has to be checked there.
attention = custom_attn.AttentionMM("concat")
doc_att = attention([doc_enc_l, doc_enc_r])

# Prediction head: dropout -> dense relu -> dropout -> softmax over classes
fc1_dropout = Dropout(0.2)(doc_att)
fc1 = Dense(50, activation="relu")(fc1_dropout)
fc2_dropout = Dropout(0.2)(fc1)
doc_pred = Dense(NUM_CLASSES, activation="softmax")(fc2_dropout)

model = Model(inputs=[doc_in_l, doc_in_r], outputs=doc_pred)
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_5 (InputLayer)             (None, 40, 60)        0                                            
____________________________________________________________________________________________________
input_6 (InputLayer)             (None, 40, 60)        0                                            
____________________________________________________________________________________________________
time_distributed_3 (TimeDistribu (None, 40, 200)       12459600    input_5[0][0]                    
____________________________________________________________________________________________________
time_distributed_4 (TimeDistribu (None, 40, 200)       12459600    input_6[0][0]                    
___________________________________________________________________________________________

## Train Network

In [14]:
# Labels are one-hot encoded, hence categorical cross-entropy.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [15]:
# One epoch is one pass over the full batches the generator can form.
num_train_steps = len(Xtrain) // BATCH_SIZE
num_val_steps = len(Xval) // BATCH_SIZE

train_gen = datagen(Xtrain, Ytrain, docid2mat, batch_size=BATCH_SIZE)
val_gen = datagen(Xval, Yval, docid2mat, batch_size=BATCH_SIZE)

fit_args = dict(steps_per_epoch=num_train_steps,
                epochs=NUM_EPOCHS,
                validation_data=val_gen,
                validation_steps=num_val_steps)
history = model.fit_generator(train_gen, **fit_args)

kwargs passed to function are ignored with Tensorflow backend


Epoch 1/10


InvalidArgumentError: Incompatible shapes: [64,40,100] vs. [64,40]
	 [[Node: gradients/attention_mm_2/mul_grad/BroadcastGradientArgs = BroadcastGradientArgs[T=DT_INT32, _class=["loc:@attention_mm_2/mul"], _device="/job:localhost/replica:0/task:0/cpu:0"](gradients/attention_mm_2/mul_grad/Shape, gradients/attention_mm_2/mul_grad/Shape_1)]]

Caused by op u'gradients/attention_mm_2/mul_grad/BroadcastGradientArgs', defined at:
  File "/Users/palsujit/anaconda2/lib/python2.7/runpy.py", line 174, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/Users/palsujit/anaconda2/lib/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/ipykernel/kernelapp.py", line 474, in start
    ioloop.IOLoop.instance().start()
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/tornado/ioloop.py", line 887, in start
    handler_func(fd_obj, events)
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 390, in execute_request
    user_expressions, allow_stdin)
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/ipykernel/zmqshell.py", line 501, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-15-6b0877afcbab>", line 11, in <module>
    validation_steps=num_val_steps)
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/Keras-2.0.4-py2.7.egg/keras/legacy/interfaces.py", line 88, in wrapper
    return func(*args, **kwargs)
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/Keras-2.0.4-py2.7.egg/keras/engine/training.py", line 1790, in fit_generator
    self._make_train_function()
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/Keras-2.0.4-py2.7.egg/keras/engine/training.py", line 1013, in _make_train_function
    self.total_loss)
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/Keras-2.0.4-py2.7.egg/keras/optimizers.py", line 381, in get_updates
    grads = self.get_gradients(loss, params)
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/Keras-2.0.4-py2.7.egg/keras/optimizers.py", line 47, in get_gradients
    grads = K.gradients(loss, params)
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/Keras-2.0.4-py2.7.egg/keras/backend/tensorflow_backend.py", line 2266, in gradients
    return tf.gradients(loss, variables, colocate_gradients_with_ops=True)
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/gradients_impl.py", line 540, in gradients
    grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/gradients_impl.py", line 346, in _MaybeCompile
    return grad_fn()  # Exit early
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/gradients_impl.py", line 540, in <lambda>
    grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/math_grad.py", line 663, in _MulGrad
    rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy)
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/gen_array_ops.py", line 395, in _broadcast_gradient_args
    name=name)
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
    op_def=op_def)
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2506, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1269, in __init__
    self._traceback = _extract_stack()

...which was originally created as op u'attention_mm_2/mul', defined at:
  File "/Users/palsujit/anaconda2/lib/python2.7/runpy.py", line 174, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
[elided 18 identical lines from previous traceback]
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-13-b45d3888af7d>", line 18, in <module>
    doc_att = custom_attn.AttentionMM("concat")([doc_enc_l, doc_enc_r])
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/Keras-2.0.4-py2.7.egg/keras/engine/topology.py", line 585, in __call__
    output = self.call(inputs, **kwargs)
  File "custom_attn.py", line 238, in call
    ot1 = x1 * at1
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/math_ops.py", line 838, in binary_op_wrapper
    return func(x, y, name=name)
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/math_ops.py", line 1061, in _mul_dispatch
    return gen_math_ops._mul(x, y, name=name)
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/gen_math_ops.py", line 1377, in _mul
    result = _op_def_lib.apply_op("Mul", x=x, y=y, name=name)
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
    op_def=op_def)
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2506, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/Users/palsujit/anaconda2/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1269, in __init__
    self._traceback = _extract_stack()

InvalidArgumentError (see above for traceback): Incompatible shapes: [64,40,100] vs. [64,40]
	 [[Node: gradients/attention_mm_2/mul_grad/BroadcastGradientArgs = BroadcastGradientArgs[T=DT_INT32, _class=["loc:@attention_mm_2/mul"], _device="/job:localhost/replica:0/task:0/cpu:0"](gradients/attention_mm_2/mul_grad/Shape, gradients/attention_mm_2/mul_grad/Shape_1)]]


In [None]:
# Two stacked panels, accuracy above loss, each with train (red) and
# validation (blue) curves taken from the recorded training history.
panels = [
    (211, "accuracy", "acc", "val_acc"),
    (212, "loss", "loss", "val_loss"),
]
for position, title, train_key, val_key in panels:
    plt.subplot(position)
    plt.title(title)
    plt.plot(history.history[train_key], color="r", label="train")
    plt.plot(history.history[val_key], color="b", label="val")
    plt.legend(loc="best")

plt.tight_layout()
plt.show()

## Evaluate Network

In [None]:
np.set_printoptions(linewidth=120)
test_gen = datagen(Xtest, Ytest, docid2mat, batch_size=BATCH_SIZE)
num_test_steps = len(Xtest) // BATCH_SIZE

# datagen visits the rows in a random order and only yields full batches, so
# its predictions cannot be compared row-by-row with Ytest: the order differs
# and the leftover rows are missing. Collect the labels that travel with each
# batch instead, so every prediction is scored against its own label.
Ypred_batches, Ytrue_batches = [], []
for _step in range(num_test_steps):
    Xbatch_test, Ybatch_test = next(test_gen)
    Ypred_batches.append(model.predict_on_batch(Xbatch_test))
    Ytrue_batches.append(Ybatch_test)
Ytest_ = np.concatenate(Ypred_batches, axis=0)
Ytest_true = np.concatenate(Ytrue_batches, axis=0)

# Convert softmax outputs and one-hot labels back to class ids.
ytest_ = np.argmax(Ytest_, axis=1)
ytest = np.argmax(Ytest_true, axis=1)

print("accuracy score: {:.3f}".format(accuracy_score(ytest, ytest_)))
print("\nconfusion matrix\n")
print(confusion_matrix(ytest, ytest_))