## Importing libraries

In [16]:
# Importing the relevant libraries
import collections
import os
import re
import string
import time

import numpy as np
import pandas as pd
from nltk import tokenize

import keras.callbacks
from keras import backend as K
from keras import initializers, regularizers, constraints
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.engine.topology import Layer, InputSpec
from keras.layers import Embedding, Input, Dense, Bidirectional, LSTM, Layer, GRU, TimeDistributed
from keras.models import Model
from keras.models import load_model
from keras.layers.merge import concatenate
from keras.callbacks import ModelCheckpoint, EarlyStopping

## Data Loading

In [2]:
# We define a function to load the data
def _read_text_files(directory):
    """Return the contents of every file in ``directory`` as a list of strings.

    Files are read in sorted filename order so that repeated runs yield the
    same ordering, and every handle is closed as soon as it has been read.
    """
    texts = []
    for filename in sorted(os.listdir(directory)):
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as handle:
            texts.append(handle.read())
    return texts


def _load_yelp_split(csv_path, clean):
    """Read one Yelp CSV and return ``(texts, labels)`` with 3-star rows removed.

    ``clean`` is applied to every review text. Labels are 1 for ratings above
    3 stars and 0 otherwise.
    """
    frame = pd.read_csv(csv_path)
    frame = frame[frame['stars'] != 3]
    texts = [clean(text) for text in frame['text']]
    labels = [0 if stars <= 3 else 1 for stars in frame['stars']]
    return texts, labels


def load_data(path_to_dir, dataset):
    '''
    Load a sentiment dataset and return (X_train, Y_train, X_test, Y_test).

    The loading procedure is different for the two datasets, therefore we add a parameter to the load
    function that specifies the dataset to load.

    Parameters
    ----------
    path_to_dir : str
        Directory holding the data. For "imdb" it must contain
        train/pos, train/neg, test/pos and test/neg sub-directories of text files.
        For "yelp" it must contain Yelp_train_data.csv and Yelp_test_data.csv,
        each with 'stars' and 'text' columns. A trailing path separator is optional.
    dataset : str
        Either "imdb" or "yelp".

    Returns
    -------
    tuple of four lists
        X_train / X_test are lower-cased review strings with all punctuation
        except periods removed; Y_train / Y_test are 0/1 labels (1 = positive).

    Raises
    ------
    ValueError
        If `dataset` is not one of the supported names.
    '''

    # Regular Expression to remove punctuation
    # We leave the periods in since we want to split the reviews into sentences later
    Remove_punctuation = re.compile('[%s]' % re.escape(string.punctuation.replace('.','')))

    def clean(text):
        return Remove_punctuation.sub('', text).lower()

    if dataset == "imdb":
        # Positive reviews come first in each split, followed by the negative ones.
        train_pos = [clean(text) for text in _read_text_files(os.path.join(path_to_dir, "train", "pos"))]
        train_neg = [clean(text) for text in _read_text_files(os.path.join(path_to_dir, "train", "neg"))]
        test_pos = [clean(text) for text in _read_text_files(os.path.join(path_to_dir, "test", "pos"))]
        test_neg = [clean(text) for text in _read_text_files(os.path.join(path_to_dir, "test", "neg"))]

        X_train = train_pos + train_neg
        X_test = test_pos + test_neg

        Y_train = [1]*len(train_pos) + [0]*len(train_neg)
        Y_test = [1]*len(test_pos) + [0]*len(test_neg)

        return X_train, Y_train, X_test, Y_test

    if dataset == "yelp":
        X_train, Y_train = _load_yelp_split(os.path.join(path_to_dir, "Yelp_train_data.csv"), clean)
        X_test, Y_test = _load_yelp_split(os.path.join(path_to_dir, "Yelp_test_data.csv"), clean)

        return X_train, Y_train, X_test, Y_test

    # Fail loudly instead of returning None, which would only surface later as
    # a confusing unpacking error at the call site.
    raise ValueError('Unknown dataset %r; expected "imdb" or "yelp".' % (dataset,))


In [3]:
# Load the IMDB reviews; the directory is given relative to the notebook's working directory.
# X_* are cleaned review strings, Y_* are the matching 0/1 sentiment labels.
X_train, Y_train, X_test, Y_test = load_data('../../../Data/IMDB Stanford/', "imdb")

## Data Preprocessing

In [4]:
# Converting the reviews into sentences
# Each review is split on periods; the empty strings produced by consecutive
# or trailing periods are dropped in the same pass.
X_train_sentences = [
    [piece for piece in review.split('.') if piece]
    for review in X_train
]

X_test_sentences = [
    [piece for piece in review.split('.') if piece]
    for review in X_test
]



In [6]:
# Setting up parameters for the word embeddings and for the fixed-size review tensors

Max_Words = 20000 #Vocabulary cut-off: words whose tokenizer index is >= this value are skipped when encoding
Max_Sequence_length = 100 #Max number of words kept per sentence (last axis of the encoded data)
Embedding_dimension = 100 #Using 100 dimensional Glove data
Max_Sentence_length = 15 #Max number of sentences kept per review (middle axis of the encoded data)


In [7]:
# Converting the words into tokens
# The tokenizer is fitted on the training reviews only, so words that appear
# solely in the test set are absent from word_index.

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
# word_index maps each word seen in X_train to its integer token id
word_index = tokenizer.word_index

print('Total %s unique tokens.' % len(word_index))

Total 107359 unique tokens.


In [8]:
def encode_reviews(review_sentences, word_to_index, max_sentences, max_words, vocabulary_limit):
    """Encode reviews as a zero-padded int32 array of token ids.

    Parameters
    ----------
    review_sentences : list of list of str
        One list of sentence strings per review.
    word_to_index : dict
        Mapping from word to integer token id (the fitted tokenizer's word_index).
    max_sentences : int
        Number of sentences kept per review; later sentences are ignored.
    max_words : int
        Number of token slots per sentence; once they are filled the rest of
        the sentence is ignored.
    vocabulary_limit : int
        Words whose token id is >= this value are skipped without using a slot.

    Returns
    -------
    numpy.ndarray
        Array of shape (number of reviews, max_sentences, max_words), dtype int32.

    Notes
    -----
    Words missing from ``word_to_index`` are looked up as 0 and, because 0 is
    below ``vocabulary_limit``, still occupy a slot. This reproduces the
    encoding previously applied to the test set; the training loop used a
    direct ``word_index[word]`` lookup, which gives the same ids for known
    words but raised KeyError on an unknown one.
    """
    encoded = np.zeros((len(review_sentences), max_sentences, max_words), dtype='int32')

    for review_idx, sentences in enumerate(review_sentences):
        # Only the first max_sentences sentences of a review are encoded.
        for sentence_idx, sentence in enumerate(sentences[:max_sentences]):
            position = 0
            # text_to_word_sequence converts the sentence into a list of words
            for word in text_to_word_sequence(sentence):
                if position >= max_words:
                    break  # the row for this sentence is full
                token_id = word_to_index.get(word, 0)
                if token_id < vocabulary_limit:
                    encoded[review_idx, sentence_idx, position] = token_id
                    position += 1

    return encoded


#(Number of reviews, number of sentences per review = 15, number of words per sentence = 100)
Train_data = encode_reviews(X_train_sentences, tokenizer.word_index,
                            Max_Sentence_length, Max_Sequence_length, Max_Words)

Test_data = encode_reviews(X_test_sentences, tokenizer.word_index,
                           Max_Sentence_length, Max_Sequence_length, Max_Words)


In [9]:
# Creating the labels
# Both label lists are converted to NumPy arrays in one statement.
Train_labels, Test_labels = (np.array(label_list) for label_list in (Y_train, Y_test))

In [17]:
def dot_product(x, kernel):
    """
    Backend-aware dot product of an input tensor with a weight tensor.

    On the TensorFlow backend the kernel is given an extra trailing axis
    before the dot product and the last axis of the result is squeezed away
    afterwards; on any other backend ``K.dot`` is called directly.

    Args:
        x (): input
        kernel (): weights
    Returns:
        The backend tensor produced by the dot product.
    """
    if K.backend() != 'tensorflow':
        return K.dot(x, kernel)

    # todo (carried over from the original code): check that this is correct
    kernel_as_column = K.expand_dims(kernel)
    projected = K.dot(x, kernel_as_column)
    return K.squeeze(projected, axis=-1)



class Attention(Layer):
    """
    Keras Layer that implements an Attention mechanism for temporal data.
    Supports Masking.
    Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.

    # Arguments
        W_regularizer: regularizer (or identifier) for the attention weight vector.
        b_regularizer: regularizer (or identifier) for the per-step bias.
        W_constraint: constraint (or identifier) for the attention weight vector.
        b_constraint: constraint (or identifier) for the per-step bias.
        bias: whether to add a bias term to the attention scores.
        **kwargs: forwarded to the base `Layer` constructor.

    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    The dimensions are inferred based on the output shape of the RNN.
    Note (from the original author): the layer has been tested with Keras 1.x
    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(Attention())
        # next add a Dense layer (for classification/regression) or whatever...
    """

    def __init__(self,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the weight vector (one entry per feature) and optional bias (one entry per step)."""
        assert len(input_shape) == 3

        # `shape` is passed by keyword so the call does not depend on the
        # positional order of add_weight's parameters.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        super(Attention, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over axis 1 (the steps axis)."""
        # Unnormalised score per step: tanh(x . W + b)
        eij = dot_product(x, self.W)

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # Add a trailing axis so the per-step weights broadcast across the features.
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild the layer as configured."""
        config = {
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

## Model Evaluation on Test Data

In [18]:
# Loading the trained model and measuring accuracy on test data
# load_model comes from keras.models and must be imported with the other Keras names
# in the import cell. custom_objects maps the name 'Attention' to the Attention class
# defined in this notebook so that load_model can use it.
HAN_model = load_model('HAN_model.h5', custom_objects = {'Attention': Attention})

# Evaluating accuracy on Test data
# Index 1 of metrics_names / Test_score is the entry that is reported below.
Test_score = HAN_model.evaluate(Test_data, Test_labels, verbose=1)
print("%s: %.2f%%" % (HAN_model.metrics_names[1], Test_score[1]*100))


acc: 87.20%
