In [1]:
import itertools
import os

import keras
import nltk
import numpy as np
import pandas as pd
import sklearn
from keras import backend as K

# Fetch the NLTK 'punkt' tokenizer data up front (presumably the model that
# nltk.tokenize.sent_tokenize relies on when reviews are split into sentences).
nltk.download('punkt')

Using TensorFlow backend.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sh160\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [None]:
# Maximum words per sentence: every sentence is padded/truncated to this
# many word ids when the input matrix is built.
MAX_WORDS = 100
# Maximum sentences per doc: reviews with more sentences are truncated,
# shorter ones are zero-padded up to this count.
MAX_SENT = 15
# Max vocabulary size, passed to the Keras Tokenizer as `num_words`.
MAX_VOCAB = 20000
# Dimension of the GloVe vectors (width of the embedding matrix); also the
# default `output_dim` of the attention layer.
GLOVE_DIM = 100

In [3]:
# --------------------------------------------------
# Load Kaggle IMDB Dataset
# --------------------------------------------------

dataset = pd.read_csv("./Data/labeledTrainData.tsv", sep="\t")
reviews = dataset["review"].values
sentiments = dataset["sentiment"].values
print(dataset.head())

# Fit the word -> index vocabulary on the raw review texts
tokenizer = keras.preprocessing.text.Tokenizer(num_words=MAX_VOCAB)
tokenizer.fit_on_texts(reviews)

# Model input: one (MAX_SENT, MAX_WORDS) grid of word ids per review.
# The matrix starts out as all zeros, so any sentence slot that is not
# written below stays zero-padded and does not affect the attention
# mechanism's predictions.
x = np.zeros((len(reviews), MAX_SENT, MAX_WORDS), dtype="int32")

for row, text in enumerate(reviews):
    # Separate the review into individual sentences
    # https://www.nltk.org/api/nltk.tokenize.html
    sentence_texts = nltk.tokenize.sent_tokenize(text)

    # Word ids per sentence, each sentence padded/truncated to MAX_WORDS
    sentence_ids = keras.preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences(sentence_texts), maxlen=MAX_WORDS)

    # Keep at most MAX_SENT sentences; remaining rows keep their zeros
    n_kept = min(sentence_ids.shape[0], MAX_SENT)
    x[row, :n_kept] = sentence_ids[:n_kept]

# One-hot encode the sentiment labels for Keras
y = keras.utils.to_categorical(sentiments)

# Fixed, unshuffled split over the first 5000 reviews:
# 4000 train / 500 validation / 500 test.
# (A random sklearn.model_selection.train_test_split was tried here before.)
x_train, y_train = x[:4000], y[:4000]
x_val, y_val = x[4000:4500], y[4000:4500]
x_test, y_test = x[4500:5000], y[4500:5000]

       id  sentiment                                             review
0  5814_8          1  With all this stuff going down at the moment w...
1  2381_9          1  \The Classic War of the Worlds\" by Timothy Hi...
2  7759_3          0  The film starts with a manager (Nicholas Bell)...
3  3630_4          0  It must be assumed that those who praised this...
4  9495_8          1  Superbly trashy and wondrously unpretentious 8...


In [5]:
# --------------------------------------------------
# Load word embeddings from GloVe
# --------------------------------------------------

# Read in embeddings: each line is "<word> <v1> <v2> ... <vN>".
# The file is streamed line by line inside a `with` block so the handle is
# always closed and the whole text file is never held in memory at once.
embeddings = dict()
with open("./Data/glove_6B/glove_6B_100d.txt", "r", encoding="utf-8") as glove_file:
    for line in glove_file:
        vals = line.split()
        if not vals:
            # Skip blank lines instead of failing on vals[0]
            continue
        embeddings[vals[0]] = np.asarray(vals[1:], dtype="float32")

# Create weight matrix from embeddings.
# Rows are indexed by the tokenizer's word index; words that have no GloVe
# vector keep their random initialisation. Row 0 is zeroed because index 0
# is the padding id used in the input matrix.
embed_matrix = np.random.random((len(tokenizer.word_index) + 1, GLOVE_DIM))
embed_matrix[0] = 0
for word, i in tokenizer.word_index.items():
    embed_vec = embeddings.get(word)
    if embed_vec is not None:
        embed_matrix[i] = embed_vec


In [6]:
# --------------------------------------------------
# Create custom layer for HAN
# https://keras.io/layers/writing-your-own-keras-layers/
# --------------------------------------------------

class han_attention_layer(keras.layers.Layer):
    """Attention pooling layer for the Hierarchical Attention Network.

    For an input ``x`` of shape ``(batch, steps, features)`` the layer scores
    each step as ``tanh(x . W) . u``, normalises the scores with a softmax
    over the step axis and returns the attention-weighted sum of the steps,
    i.e. a tensor of shape ``(batch, features)``.

    The feature size is always read from the LAST axis of the input shape,
    so the layer builds correctly for sequence (3-D) inputs while behaving
    exactly as before for 2-D ``(batch, features)`` inputs.

    NOTE(review): with a 2-D input there is only one score per sample, so
    the softmax weight is 1 and the layer presumably acts as a pass-through
    - confirm whether the upstream encoders should return full sequences.

    Args:
        output_dim: Width of the hidden attention projection (number of
            columns of ``W`` and rows of ``u``).
        **kwargs: Forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, output_dim=GLOVE_DIM, **kwargs):
        self.output_dim = output_dim
        super(han_attention_layer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the trainable projection ``W`` and context vector ``u``."""
        # The feature axis is the last one for both 2-D and 3-D inputs
        dim = input_shape[-1]
        self.W = self.add_weight(name='W',
                                 shape=(dim, self.output_dim),
                                 initializer=keras.initializers.get("uniform"),
                                 trainable=True)

        # Context vector; the weight name 'output' is kept unchanged so
        # previously saved weights still match by name.
        self.u = self.add_weight(name='output',
                                 shape=(self.output_dim, 1),
                                 initializer=keras.initializers.get("uniform"),
                                 trainable=True)

        super(han_attention_layer, self).build(input_shape)

    def get_att_weights(self, x):
        """Return the softmax-normalised attention weights for ``x``."""
        u_tw = K.tanh(K.dot(x, self.W))
        tw_stimulus = K.dot(u_tw, self.u)
        # Drop the trailing singleton axis so softmax runs over axis 1
        tw_stimulus = K.reshape(tw_stimulus, (-1, tw_stimulus.shape[1]))
        return K.softmax(tw_stimulus)

    def call(self, x):
        """Weight ``x`` by its attention weights and sum over axis 1."""
        weights = self.get_att_weights(x)
        # Expand the weights to one value per feature so they can be
        # multiplied element-wise with the input
        weights = K.reshape(weights, (-1, weights.shape[1], 1))
        weights = K.repeat_elements(weights, x.shape[-1], -1)
        weighted_input = keras.layers.Multiply()([x, weights])
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Output is ``(batch, features)``: the summed axis 1 is removed."""
        return input_shape[0], input_shape[-1]

    def get_config(self):
        """Include ``output_dim`` so a saved model can rebuild this layer."""
        config = super(han_attention_layer, self).get_config()
        config["output_dim"] = self.output_dim
        return config


In [8]:
# --------------------------------------------------
# Create HAN Model
# --------------------------------------------------
class han(keras.models.Model):
    """Hierarchical Attention Network wired up as a Keras functional model.

    The full graph is assembled by ``build_network`` during construction and
    its input/output tensors are handed to ``keras.models.Model``.

    NOTE(review): both encoders use a Bidirectional GRU without
    ``return_sequences``, so the attention layers appear to receive one
    vector per sentence/document rather than a sequence - confirm that this
    is intended.

    Args:
        max_words: Number of word ids per sentence.
        max_sents: Number of sentences per document.
        output_size: Units of the final softmax layer.
        embed_matrix: 2-D array ``(vocab_size, embed_dim)`` used as frozen
            embedding weights.
        word_encode_dim: Output width of the word-level encoder.
        sent_encode_dim: Output width of the sentence-level encoder.
        name: Name given to the Keras model.
    """

    def __init__(self, max_words, max_sents, output_size, embed_matrix,
                 word_encode_dim=200, sent_encode_dim=200,
                 name="Hierarchical_Attention_Network"):
        # Hyper-parameters are stored first because build_network() reads them
        self.max_words = max_words
        self.max_sents = max_sents
        self.output_size = output_size
        self.embed_matrix = embed_matrix
        self.word_encode_dim = word_encode_dim
        self.sent_encode_dim = sent_encode_dim

        net_input, net_output = self.build_network()

        super().__init__(inputs=net_input, outputs=net_output, name=name)

    def build_word_encoder(self, max_words, embed_matrix, encode_dim=200):
        """Return a model mapping one sentence of word ids to a vector."""
        n_rows = embed_matrix.shape[0]
        n_cols = embed_matrix.shape[1]
        # Pre-trained vectors are loaded as weights and kept frozen
        frozen_embedding = keras.layers.Embedding(
            n_rows, n_cols,
            weights=[embed_matrix],
            input_length=max_words,
            trainable=False)
        word_ids = keras.layers.Input(shape=(max_words,), dtype="int32")
        # Half the units per direction so the concatenation is encode_dim wide
        bi_gru = keras.layers.Bidirectional(keras.layers.GRU(int(encode_dim / 2)))
        sentence_vec = bi_gru(frozen_embedding(word_ids))
        return keras.Model(inputs=[word_ids], outputs=[sentence_vec], name="word_encoder")

    def build_sent_encoder(self, max_sents, summary_dim, encode_dim=200):
        """Return a model mapping a sequence of sentence vectors to a vector."""
        sentence_vecs = keras.layers.Input(shape=(max_sents, summary_dim))
        bi_gru = keras.layers.Bidirectional(keras.layers.GRU(int(encode_dim / 2)))
        doc_vec = bi_gru(sentence_vecs)
        return keras.Model(inputs=[sentence_vecs], outputs=[doc_vec], name="sentence_encoder")

    def build_network(self):
        """Assemble the whole graph and return ``(input, output)`` tensors."""
        in_tensor = keras.layers.Input(shape=(self.max_sents, self.max_words))

        # Word level: encode every sentence, then apply attention per sentence
        word_encoder = self.build_word_encoder(
            self.max_words, self.embed_matrix, self.word_encode_dim)
        encoded_words = keras.layers.TimeDistributed(
            word_encoder, name="word_encoder")(in_tensor)
        encoded_sentences = keras.layers.TimeDistributed(
            han_attention_layer(), name="word_attention")(encoded_words)

        # Sentence level: encode the sentence vectors, then apply attention
        sent_encoder = self.build_sent_encoder(
            self.max_sents, self.word_encode_dim, self.sent_encode_dim)
        encoded_doc = sent_encoder(encoded_sentences)
        doc_summary = han_attention_layer(name="sentence_attention")(encoded_doc)

        # Softmax classifier over the document summary
        classifier = keras.layers.Dense(
            self.output_size, activation="softmax", name="class_prediction")
        out_tensor = classifier(doc_summary)
        return in_tensor, out_tensor


In [None]:
# --------------------------------------------------
# Train / Test HAN Model
# --------------------------------------------------

# Instantiate the network with 2 output classes and 100-wide encoders
han_model = han(
    MAX_WORDS,
    MAX_SENT,
    2,
    embed_matrix,
    word_encode_dim=100,
    sent_encode_dim=100,
)
han_model.summary()

# Adagrad + categorical cross-entropy, reporting accuracy
han_model.compile(
    optimizer="adagrad",
    loss="categorical_crossentropy",
    metrics=["acc"],
)
# Train for 10 epochs, validating on the held-out validation slice
han_model.fit(
    x_train,
    y_train,
    batch_size=20,
    epochs=10,
    validation_data=(x_val, y_val),
)

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 15, 100)           0         
_________________________________________________________________
word_encoder (TimeDistribute (None, 15, 100)           8903600   
_________________________________________________________________
word_attention (TimeDistribu (None, 15, 100)           10100     
_________________________________________________________________
sentence_encoder (Model)     (None, 100)               45300     
_________________________________________________________________
sentence_attention (han_atte (None, 100)               10100     
_________________________________________________________________
class_prediction (Dense)     (None, 2)                 202       
Total params: 8,969,302
Trainable params: 111,002
Non-trainable params

In [None]:
# Grid search: train and save one HAN model per hyper-parameter combination
n_epochs = [10]
learning_rate = [0.001] #[0.1, 0.001, 0.0001]
n_sent_encode_dim = [100, 200, 300]
n_batch_size = [32, 64, 128]

# Create the output directory first so a finished training run is never
# lost to a missing folder at save time.
model_dir = "./HAN_models_large/"
os.makedirs(model_dir, exist_ok=True)

# itertools.product iterates in the same order as the equivalent nested
# loops (epochs outermost, sent_encode_dim innermost).
for epoch, l_rate, batch_size, sent_encode_dim in itertools.product(
        n_epochs, learning_rate, n_batch_size, n_sent_encode_dim):
    print("Current Model: Epochs = {0}, l_rate = {1}, batch_size = {2}, sent_encode_dim = {3}".format(epoch, l_rate, batch_size, sent_encode_dim))

    # Build a fresh model using the sentence-encoder width under test
    han_model = han(MAX_WORDS, MAX_SENT, 2, embed_matrix,
                    word_encode_dim=100, sent_encode_dim=sent_encode_dim)

    # Use an explicit optimizer instance so the swept learning rate is
    # actually applied (`lr` is the Keras 2.x argument name).
    han_model.compile(optimizer=keras.optimizers.Adagrad(lr=l_rate),
                      loss="categorical_crossentropy", metrics=["acc"])

    # `epochs` must be the scalar for this run, not the list of candidates
    han_model.fit(x_train, y_train, batch_size=batch_size, epochs=epoch,
                  validation_data=(x_val, y_val), verbose=1)

    # Save the model that was just trained
    model_name = "{4}_epochs_{0}_lrate_{1}_batch_size_{2}_sent_encode_dim_{3}".format(epoch, l_rate, batch_size, sent_encode_dim, "HAN")
    han_model.save(model_dir + model_name + ".h5")

In [7]:
# --------------------------------------------------
# Sources:
# https://www.nltk.org/api/nltk.tokenize.html
# https://keras.io/layers/writing-your-own-keras-layers/
# https://machinelearningmastery.com/develop-word-embedding-model-predicting-movie-review-sentiment/
# https://github.com/FlorisHoogenboom/keras-han-for-docla
# https://richliao.github.io/supervised/classification/2016/12/26/textclassifier-HATN/
# https://medium.com/jatana/report-on-text-classification-using-cnn-rnn-han-f0e887214d5f
# --------------------------------------------------