# Embed, Encode and Predict

In [1]:
from __future__ import division, print_function
from keras.layers import Input
from keras.layers.core import Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU
from keras.layers.pooling import GlobalMaxPooling1D
from keras.layers.wrappers import TimeDistributed, Bidirectional
from keras.models import Model
from keras.optimizers import SGD
from keras.utils import to_categorical
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
import logging
import matplotlib.pyplot as plt
import nltk
import numpy as np
import os

import custom_layers

%matplotlib inline

Using TensorFlow backend.


In [2]:
DATA_DIR = "../data"

VOCAB_FILE = os.path.join(DATA_DIR, "ng-vocab.tsv")
MIN_OCCURS = 5

GLOVE_FILE = os.path.join(DATA_DIR, "glove.840B.300d.txt")

# covers about 95% of input data
MAX_SENTS = 40 # maximum number of sentences per document
MAX_WORDS = 60 # maximum number of words per sentence

WORD_EMBED_SIZE = 300
SENT_EMBED_SIZE = 100
DOC_EMBED_SIZE = 50
NUM_CLASSES = 20

BATCH_SIZE = 64
NUM_EPOCHS = 10

logging.basicConfig()

## Load Vocabulary

In [3]:
word2id = {"PAD": 0, "UNK": 1}
fvocab = open(VOCAB_FILE, "rb")
for i, line in enumerate(fvocab):
    word, count = line.strip().split("\t")
    if int(count) <= MIN_OCCURS:
        break
    word2id[word] = i
fvocab.close()
id2word = {v:k for k, v in word2id.items()}
vocab_size = len(word2id)
print("vocab_size: {:d}".format(vocab_size))

vocab_size: 40730


## Load GloVe Embeddings

In [5]:
E = np.zeros((vocab_size, WORD_EMBED_SIZE))
E[1] = np.random.random(WORD_EMBED_SIZE)
fglove = open(GLOVE_FILE, "rb")
for line in fglove:
    cols = line.strip().split(" ")
    word = cols[0]
    if not word2id.has_key(word):
        continue
    vec = np.array([float(x) for x in cols[1:]])
    idx = word2id[word]
    E[idx] = vec
fglove.close()
print(E.shape)

(40730, 300)


## Compute Document Vectors

In [6]:
ng_data = fetch_20newsgroups(subset='all',
                             data_home=DATA_DIR,
                             shuffle=True, 
                             random_state=42)
num_docs = len(ng_data.data)
print(num_docs)

18846


In [7]:
def pad_or_truncate(xs, maxlen):
    if len(xs) > maxlen:
        xs = xs[len(xs) - maxlen:]
    elif len(xs) < maxlen:
        xs = ["PAD"] * (maxlen - len(xs)) + xs
    return xs

xs = ["The", "cat", "fought", "like", "a", "mouse"]
print(pad_or_truncate(xs, 3))
print(pad_or_truncate(xs, 7))

['like', 'a', 'mouse']
['PAD', 'The', 'cat', 'fought', 'like', 'a', 'mouse']


In [8]:
X = np.zeros((num_docs, MAX_SENTS, MAX_WORDS))
for docid in range(num_docs):
    text = ng_data.data[docid]
    sents = pad_or_truncate(nltk.sent_tokenize(text), MAX_SENTS)
    for sid, sent in enumerate(sents):
        words = nltk.word_tokenize(sent)
        words = pad_or_truncate(words, MAX_WORDS)
        for wid, word in enumerate(words):
            try:
                word_id = word2id[word]
            except KeyError:
                word_id = word2id["UNK"]
            X[docid, sid, wid] = word_id
print(X.shape)

(18846, 40, 60)


In [9]:
y = ng_data.target
Y = to_categorical(y, num_classes=NUM_CLASSES)
print(Y.shape)

(18846, 20)


In [10]:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, train_size=0.7)
print(Xtrain.shape, Ytrain.shape, Xtest.shape, Ytest.shape)

(13192, 40, 60) (13192, 20) (5654, 40, 60) (5654, 20)


## Define Network

In [49]:
def sent_encoder():
    sent_inputs = Input(shape=(MAX_WORDS,), dtype="int32")
    # embed
    sent_emb = Embedding(input_dim=vocab_size,
                         output_dim=WORD_EMBED_SIZE,
                         weights=[E])(sent_inputs)
    # encode
    sent_enc = Bidirectional(GRU(SENT_EMBED_SIZE,
                                return_sequences=False))(sent_emb)
    # attend
    m_shape = (BATCH_SIZE, MAX_WORDS, SENT_EMBED_SIZE * 2)
    sent_att = custom_layers.AttentionM(m_shape)(sent_enc)

    sent_encoder = Model(inputs=sent_inputs, outputs=sent_att)
    return sent_encoder

sent_encoder().summary()

AssertionError: 

In [35]:
from keras import backend as K
from keras.engine.topology import Layer

class AttentionMc(Layer):
    
    """
    Keras layer to compute an attention vector on an incoming matrix.
    
    # Input
        enc - 3D Tensor of shape (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)

    # Output
        2D Tensor of shape (BATCH_SIZE, EMBED_SIZE)

    # Usage
        enc = LSTM(EMBED_SIZE, return_sequences=True)(...)
        att = AttentionM((BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE))(enc)

    """    

    def __init__(self, m_shape, **kwargs):
        self.m_shape = m_shape
        super(AttentionMc, self).__init__(**kwargs)

    
    def build(self, input_shape):
        print("input_shape", input_shape)
        assert (self.m_shape[1] == input_shape[1] and
                self.m_shape[2] == input_shape[2])
        # W: (BATCH_SIZE, EMBED_SIZE, 1)
        # b: (BATCH_SIZE, MAX_TIMESTEPS)
        self.W = K.random_normal_variable(
                shape=(self.m_shape[0], self.m_shape[-1], 1), 
                mean=0.0, scale=0.05)
        self.b = K.zeros((self.m_shape[0], self.m_shape[1]))
        print("W=", self.W, "\nb=", self.b)
        super(AttentionMc, self).build(input_shape)


    def call(self, x, mask=None):
        print("x", x)
        # input: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
        # alpha: (BATCH_SIZE, MAX_TIMESTEPS)
        
        alpha = K.softmax(K.tanh(K.batch_dot(x, self.W) + self.b))
        if mask is not None:
            alpha *= K.cast(mask, K.floatx())
        # output: (BATCH_SIZE, EMBED_SIZE)
        alpha_emb = K.expand_dims(alpha, axis=-1)
        return K.sum(x * alpha_emb, axis=1)

    
    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None
    
    
    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[2])
        

In [46]:
doc_inputs = Input(shape=(MAX_SENTS, MAX_WORDS), dtype="int32")
print("doc_inputs", doc_inputs)
# embed
doc_emb = TimeDistributed(sent_encoder)(doc_inputs)
print("doc_emb", doc_emb)
# encode
doc_enc = Bidirectional(GRU(DOC_EMBED_SIZE,
                            return_sequences=True))(doc_emb)
# attend
print("doc_enc=", doc_enc)
m_shape = (BATCH_SIZE, MAX_SENTS, DOC_EMBED_SIZE * 2)
print("m_shape=", m_shape)
doc_att = AttentionMc(m_shape)(doc_enc)
# predict
fc1_dropout = Dropout(0.2)(doc_att)
fc1 = Dense(50, activation="relu")(fc1_dropout)
fc2_dropout = Dropout(0.2)(fc1)
outputs = Dense(NUM_CLASSES, activation="softmax")(fc2_dropout)

model = Model(inputs=doc_inputs, outputs=outputs)
model.summary()

doc_inputs Tensor("input_26:0", shape=(?, 40, 60), dtype=int32)
doc_emb Tensor("time_distributed_22/Reshape_1:0", shape=(?, 40, 200), dtype=float32)
doc_enc= Tensor("bidirectional_24/concat_2:0", shape=(?, ?, 100), dtype=float32)
m_shape= (64, 40, 100)
input_shape (None, 40, 100)
W= Tensor("attention_mc_12/Variable/read:0", shape=(64, 100, 1), dtype=float32) 
b= Tensor("attention_mc_12/Variable_1/read:0", shape=(64, 40), dtype=float32)
x Tensor("bidirectional_24/concat_2:0", shape=(?, ?, 100), dtype=float32)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_26 (InputLayer)        (None, 40, 60)            0         
_________________________________________________________________
time_distributed_22 (TimeDis (None, 40, 200)           12459600  
_________________________________________________________________
bidirectional_24 (Bidirectio (None, 40, 100)           75300     
________________________

In [None]:
model.compile(optimizer="adam", loss="categorical_crossentropy",
              metrics=["accuracy"])

In [None]:
history = model.fit(Xtrain, Ytrain, batch_size=BATCH_SIZE,
                   epochs=NUM_EPOCHS, validation_split=0.1)

In [None]:
plt.subplot(211)
plt.title("accuracy")
plt.plot(history.history["acc"], color="r", label="train")
plt.plot(history.history["val_acc"], color="b", label="val")
plt.legend(loc="best")

plt.subplot(212)
plt.title("loss")
plt.plot(history.history["loss"], color="r", label="train")
plt.plot(history.history["val_loss"], color="b", label="val")
plt.legend(loc="best")

plt.tight_layout()
plt.show()

## Evaluate Network

In [None]:
np.set_printoptions(linewidth=120)
Ytest_ = model.predict(Xtest)
ytest_ = np.argmax(Ytest_, axis=1)
ytest = np.argmax(Ytest, axis=1)
print("accuracy score: {:.3f}".format(accuracy_score(ytest, ytest_)))
print("\nconfusion matrix\n")
print(confusion_matrix(ytest, ytest_))