This notebook explores the use of a bidirectional LSTM with attention for text classification.

In [None]:
import keras
import numpy as np
from sklearn import preprocessing
from keras.layers import Dense, Input, Embedding, Lambda, Layer, Multiply, Dropout, Dot, Bidirectional, LSTM
from keras.models import Model
from keras import backend as K
import tensorflow as tf
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
import pandas as pd
from scipy.stats import norm
from math import sqrt 

In [None]:
def load_embeddings(filename, max_vocab_size):
    """Load word2vec-format text embeddings as a float matrix plus a vocabulary.

    The first line of the file must be a ``"<num_words> <dim>"`` header; every
    following line is a word followed by its space-separated vector values.

    Two all-zero rows are always reserved at the top of the matrix:
    row 0 (``"_0_"``) for padding and row 1 (``"_UNK_"``) for unknown words.
    Words read from the file receive ids starting at 2, in file order.

    Args:
        filename: path to the embeddings file (read as UTF-8).
        max_vocab_size: maximum number of rows to load, counting the two
            reserved rows.

    Returns:
        A tuple ``(embeddings, vocab, size)`` where ``embeddings`` is a float
        array with one row per vocabulary entry, ``vocab`` maps each word to
        its row index, and ``size`` is the dimensionality read from the header.
    """
    vocab = {"_0_": 0, "_UNK_": 1}

    with open(filename, encoding="utf-8") as file:
        header = file.readline().split(" ")
        size = int(header[1])

        # row 0 = padding, row 1 = UNK
        embeddings = [np.zeros(size), np.zeros(size)]

        for idx, line in enumerate(file):
            # idx + 2 is the row this word would occupy
            if idx + 2 >= max_vocab_size:
                break

            cols = line.rstrip().split(" ")
            # Parse the values as floats; leaving them as strings would turn
            # the whole stacked matrix into a string array.
            embeddings.append(np.array(cols[1:], dtype=float))
            vocab[cols[0]] = idx + 2

    return np.array(embeddings), vocab, size

In [None]:
def read_data(filename, vocab):
    """Read a tab-separated ``label<TAB>text`` file.

    Each line contributes one label (first column) and one document (second
    column, split on single spaces; the text is assumed to be tokenized
    already).  ``vocab`` is accepted but not consulted here.

    Returns:
        ``(X, Y)``: the list of token lists and the parallel list of labels.
    """
    documents, labels = [], []
    with open(filename, encoding="utf-8") as handle:
        for raw_line in handle:
            fields = raw_line.rstrip().split("\t")
            label, tokens = fields[0], fields[1].split(" ")
            documents.append(tokens)
            labels.append(label)
    return documents, labels

In [None]:
def get_word_ids(docs, vocab, max_length=200):
    """Convert tokenized documents into fixed-width rows of vocabulary ids.

    Tokens are lowercased before lookup; tokens missing from ``vocab`` map to
    id 1.  Documents longer than ``max_length`` are truncated and shorter ones
    are right-padded with id 0.

    Returns:
        A NumPy array built from one id row per document.
    """
    padded_rows = []
    for tokens in docs:
        row = [vocab.get(tok.lower(), 1) for tok in tokens[:max_length]]
        # right-pad with 0 so every row has the same width
        row.extend([0] * (max_length - len(row)))
        padded_rows.append(row)

    return np.array(padded_rows)

If you haven't downloaded the GloVe vectors, do so first -- the top 50K words in the "Common Crawl (42B)" vectors (300-dimensional) can be found here: [glove.42B.300d.50K.txt](https://drive.google.com/file/d/1n1jt0UIdI3CD26cY1EIeks39XH5S8O8M/view?usp=sharing); download the file and place it in your `data` directory.

In [None]:
from gensim.scripts.glove2word2vec import glove2word2vec

# Input: the raw GloVe text file; output: a copy of it in word2vec text format.
glove_file="../data/glove.42B.300d.50K.txt"
glove_in_w2v_format="../data/glove.42B.300d.50K.w2v.txt"
# Run gensim's converter; its return value is not needed, only the written file.
# NOTE(review): load_embeddings expects a "<count> <dim>" header as the first
# line -- presumably that is what this conversion adds; confirm against gensim docs.
_ = glove2word2vec(glove_file, glove_in_w2v_format)

In [None]:
# Load at most 50000 embedding rows from the converted file, together with the
# word -> row-index vocabulary and the vector dimensionality.
embeddings, vocab, embedding_size=load_embeddings("../data/glove.42B.300d.50K.w2v.txt", 50000)

In [None]:
# Change this to the directory that holds your data (from the CheckData_TODO.ipynb exercise).
# The directory must contain train.tsv, dev.tsv and test.tsv; the cells below read
# all three files from it.
directory="../data/lmrd"

In [None]:
# Read the tokenized documents and their labels for each of the three splits.
trainText, trainY = read_data(f"{directory}/train.tsv", vocab)
devText, devY = read_data(f"{directory}/dev.tsv", vocab)
testText, testY = read_data(f"{directory}/test.tsv", vocab)

In [None]:
# Map every split to fixed-width (200-token) rows of vocabulary ids.
trainX, devX, testX = (
    get_word_ids(split_docs, vocab, max_length=200)
    for split_docs in (trainText, devText, testText)
)

In [None]:
# Fit the label -> integer mapping on the training labels only, then apply the
# same mapping to all three splits.
le = preprocessing.LabelEncoder()
le.fit(trainY)
Y_train, Y_dev, Y_test = (
    np.array(le.transform(split_labels))
    for split_labels in (trainY, devY, testY)
)

In [None]:
class AttentionLayerMasking(Layer):
    """Attention scorer that honours the incoming padding mask.

    Every time step of the input is scored by a dot product with a single
    learned vector; the scores are exponentiated, masked positions are zeroed,
    and the result is normalized over the time axis (axis 1).  The layer
    returns only the attention weights, one per time step; it does not
    propagate a mask to downstream layers.
    """

    def __init__(self, output_dim, **kwargs):
        # output_dim is stored (and serialized by get_config) but is not used
        # in any computation performed by this layer.
        self.output_dim = output_dim
        super(AttentionLayerMasking, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the scoring vector, sized to the last input dimension."""
        input_embedding_dim = input_shape[-1]

        self.kernel = self.add_weight(name='kernel',
                                      shape=(input_embedding_dim, 1),
                                      initializer='uniform',
                                      trainable=True)
        super(AttentionLayerMasking, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # The mask is consumed in call(); nothing is passed on.
        return None

    def call(self, x, mask=None):
        """Return attention weights normalized over axis 1.

        Args:
            x: input tensor whose last axis matches the kernel built in
                ``build``.
            mask: optional mask over the time steps; masked steps get
                weight 0.
        """
        # dot product with the scoring vector -> one score per time step
        x = K.dot(x, self.kernel)

        # Shift by the per-sequence maximum before exponentiating.  The shift
        # cancels in the normalization below but keeps exp() from overflowing.
        x = x - K.max(x, axis=1, keepdims=True)
        x = K.exp(x)

        # zero out elements that are masked
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            mask = K.expand_dims(mask, axis=-1)
            x = x * mask

        # normalize by the sum over time steps, then drop the trailing axis
        x = x / K.sum(x, axis=1, keepdims=True)
        return K.squeeze(x, axis=2)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[1])

    def get_config(self):
        """Include ``output_dim`` so the layer can be rebuilt from its config."""
        config = super(AttentionLayerMasking, self).get_config()
        config['output_dim'] = self.output_dim
        return config

Q1: Implement a BiLSTM with attention. Feel free to base your code on the models in Attention.ipynb and LSTM.ipynb

In [None]:
def get_bilstm_with_attention_masking(embeddings, lstm_size=25, dropout_rate=0.25,
                                      attention_key_dim=300):
    """Build and compile a BiLSTM binary classifier with masked attention.

    Args:
        embeddings: 2-D array (vocab_size x embedding_dim) used to initialize
            the frozen embedding layer; index 0 is treated as padding
            (``mask_zero=True``).
        lstm_size: hidden size of each LSTM direction.
        dropout_rate: input dropout applied inside the LSTM.
        attention_key_dim: size of the tanh projection that is scored by the
            attention layer (previously hard-coded to 300).

    Returns:
        A compiled Keras ``Model`` mapping sequences of word ids to a single
        sigmoid output, trained with binary cross-entropy.
    """
    vocab_size, word_embedding_dim = embeddings.shape

    word_sequence_input = Input(shape=(None,), dtype='int32')

    word_embedding_layer = Embedding(vocab_size,
                                     word_embedding_dim,
                                     weights=[embeddings],
                                     mask_zero=True,
                                     trainable=False)

    embedded_sequences = word_embedding_layer(word_sequence_input)
    bilstm_output = Bidirectional(
        LSTM(lstm_size, return_sequences=True, activation='tanh', dropout=dropout_rate),
        merge_mode='concat')(embedded_sequences)

    # first transform each BiLSTM output into a new vector used only for
    # measuring that time step's importance
    attention_input = Dense(attention_key_dim, activation='tanh')(bilstm_output)

    # next pass those transformed vectors through the attention layer, getting
    # back a normalized attention value a_i for each token i;
    # \forall i, 0 <= a_i <= 1; for a document with N words, \sum_{i=1}^N a_i = 1
    attention_output = AttentionLayerMasking(word_embedding_dim, name="attention")(attention_input)

    # now weight the BiLSTM outputs by their attention values and sum over
    # time to get a single document vector
    document_representation = Lambda(lambda x: K.batch_dot(x[0], x[1], axes=1),
                                     name='dot')([attention_output, bilstm_output])

    x = Dense(1, activation="sigmoid")(document_representation)

    model = Model(inputs=word_sequence_input, outputs=x)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

    return model

In [None]:
bilstm_attention_model=get_bilstm_with_attention_masking(embeddings, lstm_size=25, dropout_rate=0.25)
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
bilstm_attention_model.summary()

In [None]:
# Train the BiLSTM-with-attention model built above.
model=bilstm_attention_model

# Checkpoint file: it is only overwritten when the monitored validation loss
# improves (save_best_only=True, mode='min'), so it holds the best epoch seen.
modelName="bilstm_attention_model.hdf5"
checkpoint = ModelCheckpoint(modelName, monitor='val_loss', verbose=0, save_best_only=True, mode='min')

# The dev split serves as validation data; the test split is not touched here.
model.fit(trainX, Y_train, 
            validation_data=(devX, Y_dev),
            epochs=30, batch_size=128,
            callbacks=[checkpoint])

Q2. What is the accuracy of your model on the test data?  Report the accuracy score with 95% confidence intervals.  Feel free to use the dev data for model selection (e.g., to make hyperparameter choices such as the size of the hidden LSTM state), but be careful not to use the test data for this.  See keras [model.predict](https://keras.io/models/model/#predict) to generate predictions for a trained model.

In [None]:
def binomial_confidence_intervals(predictions, truth, confidence_level=0.95):
    """Print accuracy with a normal-approximation binomial confidence interval.

    Args:
        predictions: predicted labels; may be a flat sequence or an array with
            a trailing singleton axis such as the (N, 1) output of a model
            with one output unit.
        truth: gold labels, compared element-wise with ``predictions``.
        confidence_level: two-sided confidence level, e.g. 0.95.

    Raises:
        ValueError: if there are no prediction/label pairs to score.

    The result is printed, not returned.
    """
    # Flatten both inputs so each comparison is between two scalars; comparing
    # a length-1 row against a scalar and calling int() on the resulting
    # array relies on a conversion NumPy has deprecated.
    flat_predictions = np.ravel(predictions)
    flat_truth = np.ravel(truth)

    correct = [int(pred == gold) for pred, gold in zip(flat_predictions, flat_truth)]
    if not correct:
        raise ValueError("cannot compute a confidence interval from zero predictions")

    success_rate = np.mean(correct)

    # two-tailed test: probability mass left in each tail
    tail_probability = (1 - confidence_level) / 2
    # ppf finds z such that p(X < z) = tail_probability (a negative z), so
    # negate it to get the positive critical value
    z_alpha = -1 * norm.ppf(tail_probability)

    # the standard error is the square root of the variance/sample size;
    # the variance for a binomial test is p*(1-p)
    standard_error = sqrt((success_rate * (1 - success_rate)) / len(correct))

    lower = success_rate - z_alpha * standard_error
    upper = success_rate + z_alpha * standard_error
    print("%.3f, %s%% Confidence interval: [%.3f,%.3f]" % (success_rate, confidence_level*100, lower, upper))

In [None]:
model=bilstm_attention_model

# Restore the weights from the checkpoint file written during training
# (the epoch with the lowest validation loss), rather than using the weights
# left over from the final epoch.
model.load_weights("bilstm_attention_model.hdf5")

In [None]:
# Score the test split with the restored model.
predictions = model.predict(testX, batch_size=128)
# Threshold the sigmoid outputs at 0.5 to get boolean class predictions.
# NOTE(review): the model ends in Dense(1), so this is presumably shaped (N, 1)
# rather than (N,) -- confirm before comparing element-wise with Y_test.
binarized_predictions=predictions > .5

In [None]:
# Report test accuracy with a 95% confidence interval.  The helper defined
# above is named binomial_confidence_intervals (plural); the singular name is
# undefined and raised a NameError.
binomial_confidence_intervals(binarized_predictions, Y_test, confidence_level=0.95)

Q3. Take the sentence "I do not like this movie." How is representing this sentence by using attention over the individual word embeddings different from representing it with attention over the output of each time step in a bidirectional LSTM?  What information does the LSTM output encode that individual word embeddings don't have access to?

A3: Word embeddings encode information about the word *type* but not about its specific use in context; the output of an LSTM at time t encodes information about the context a word *token* was used in -- for a single forward LSTM, the context of the sequence from word 1 through word t; for a BiLSTM, the context of the entire sequence.