In [180]:
"""
Created on Fri Oct 13 21:39:24 2017

@author: rmisra
"""

import numpy as np
import pandas as pd
from collections import defaultdict
import re

from bs4 import BeautifulSoup

import sys
import os

os.environ['KERAS_BACKEND']='tensorflow'

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
from keras.models import Model

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializations
from nltk import tokenize
from nltk.corpus import stopwords
from keras.callbacks import EarlyStopping

def dot_product(x, kernel):
    """
    Wrapper for dot product operation, in order to be compatible with both
    Theano and Tensorflow
    Args:
        x (): input
        kernel (): weights
    Returns:
    """
    if K.backend() == 'tensorflow':
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    else:
        return K.dot(x, kernel)

## Read data

In [356]:
data_train = pd.read_csv('./amazon_review_data.tsv', sep='\t')
print(data_train.shape)

(9996, 2)


In [357]:
data_train.head(5)

Unnamed: 0,rating,review
0,1,Received this item yesterday and eagerly opene...
1,1,I can understand the low reviews. Service was ...
2,0,I can't begin to tell you how awful this stuff...
3,0,"Just another Lame story,w/a poor girl and a ri..."
4,0,Never thought i will be playing a PS1 game on ...


## Clean reviews

In [360]:
MAX_SENT_LENGTH = 100
MAX_SENTS = 25
MAX_NB_WORDS = 20000  ## dictionary size
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\\", "", string)    
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)
    string = string.strip().lower()
    stopWords = set(stopwords.words('english'))
    trans_string = ' '.join([word for word in string.strip().split() if word not in stopWords])
    return trans_string

In [361]:
reviews = []
labels = []
texts = []

for idx in range(data_train.review.shape[0]):
    text = BeautifulSoup(data_train.review[idx], "lxml")
    text = clean_str(text.get_text())
    texts.append(text)
    sentences = tokenize.sent_tokenize(text)
    reviews.append(sentences)

    labels.append(data_train.rating[idx])

tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)           ## top nb words in the reviews


In [362]:
len(tokenizer.word_index)

42037

## Prepare data

In [363]:
data = np.zeros((len(texts), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

for i, sentences in enumerate(reviews):
    for j, sent in enumerate(sentences):
        if j< MAX_SENTS:
            wordTokens = text_to_word_sequence(sent)
            k=0
            for _, word in enumerate(wordTokens):
                if k<MAX_SENT_LENGTH and tokenizer.word_index[word]<MAX_NB_WORDS:
                    data[i,j,k] = tokenizer.word_index[word]
                    k=k+1                    
                    
word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))

labels = to_categorical(np.asarray(labels), nb_classes=5)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Total 42037 unique tokens.
Shape of data tensor: (9996, 25, 100)
Shape of label tensor: (9996, 5)


## Shuffle and split data

In [364]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

## Copy over word embeddings for top NB words

In [365]:
## download glove embeddings from https://nlp.stanford.edu/projects/glove/
GLOVE_DIR = "./"
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors.' % len(embeddings_index))

Total 400000 word vectors.


In [366]:
# building Hierachical Attention network
embedding_matrix = np.random.random((MAX_NB_WORDS + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if i < MAX_NB_WORDS:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector


## Define Attention layer 

In [367]:
class AttLayer(Layer):
    def __init__(self, **kwargs):
        self.init = initializations.get('normal')
        #self.input_spec = [InputSpec(ndim=3)]
        self.supports_masking = True
        self.mask_zero = True
        super(AttLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape)==3
        self.W = self.init((input_shape[-1],))
        self.trainable_weights = [self.W]
        super(AttLayer, self).build(input_shape)
    
    def call(self, x, mask=None):
        eij = K.tanh(dot_product(x, self.W))

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def get_output_shape_for(self, input_shape):
        return (input_shape[0], input_shape[-1])
    
    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

## Construct the model

In [368]:
embedding_layer = Embedding(MAX_NB_WORDS + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            trainable=False)

sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32') 
embedded_sequences = embedding_layer(sentence_input)
l_lstm = Bidirectional(GRU(50, return_sequences=True))(embedded_sequences)
l_dense = TimeDistributed(Dense(100))(l_lstm)
l_att = AttLayer()(l_dense)
sentEncoder = Model(sentence_input, l_att)

review_input = Input(shape=(MAX_SENTS,MAX_SENT_LENGTH), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)
l_lstm_sent = Bidirectional(GRU(50, return_sequences=True))(review_encoder)
l_dense_sent = TimeDistributed(Dense(100))(l_lstm_sent)
l_att_sent = AttLayer()(l_dense_sent)
preds = Dense(5, activation='softmax')(l_att_sent)
model = Model(review_input, preds)

## Training

In [369]:
model.compile(loss='mean_squared_error',
              optimizer='rmsprop',
              metrics=['mse'])

callback = [EarlyStopping(monitor='val_loss',
                              min_delta=0,
                              patience=1,
                              verbose=0, mode='auto')]

print("model fitting - Hierachical attention network")
print(sentEncoder.summary())
print(model.summary())
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          nb_epoch=10, batch_size=100, callbacks = callback)

model fitting - Hierachical attention network
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_18 (InputLayer)            (None, 100)           0                                            
____________________________________________________________________________________________________
embedding_10 (Embedding)         (None, 100, 100)      2000100     input_18[0][0]                   
____________________________________________________________________________________________________
bidirectional_18 (Bidirectional) (None, 100, 100)      45300       embedding_10[0][0]               
____________________________________________________________________________________________________
timedistributed_26 (TimeDistribu (None, 100, 100)      10100       bidirectional_18[0][0]           
_____________________________________________

<keras.callbacks.History at 0x1eed2030d68>

## Test word and sentence attention

### Prep data

In [409]:
test_data = np.zeros((1, MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
## sample reviews

## rating = 1
#rev = 'Described as &#34;super soft&#34; and &#34;Exceptionally heavy weight&#34;. But what I received looked more like a set of cheap towels coming from a discount store, and looked neither &#34;super soft&#34; nor &#34;Exceptionally heavy weight&#34;. My order came from Warehouse Deals and was packed in a transparent plastic bag, so maybe a mistake was made?Anyway they are being sent back..'

## rating = 4
#rev = "I guess after looking at other reviews that we got lucky  to have great service. Nicole was our server and she first of all informed that the computers were down so orders were going in slowly. Secondly, my wife ordered a lamb burger to be cooked medium and when it came out it was almost raw. Nicole took it back immediately and five minutes later my wife had a perfect lamb burger. On top of that Nicole gave us a free desert because of the burger and the wait.  My burger was great and with service like this who wouldn't come back!"

## rating = 3
#rev = "We have at least 2 birthday parties to attend here a year. The good part is the fact that the rides are contained in one secluded area so it's easy to keep and eye on the kiddos. The bad part is the cheesy arcade located within. Don't get me wrong I don't mind arcade games, what I detest is the stupid ticket program. You know, the one where the kids collect tickets from the various arcade games then turn them in for a prize? The program here is ridiculous. You have to get and ungodly amount of tickets to get anything, and that \"anything\" is usually something that can be purchased at the dollar store. The other annoying factor is that the ticket redemption booth is always understaffed and there you are...waiting in a line of indecisive kids so that your child can pick one of the many booby prizes available. It's torture I tell you! If they got rid of this ticket BS I would frequent this place 10 times more. My kids love the go carts and the kiddie roller coaster."

## rating = 2
rev = "Excellent service, clean restaurant, absolutely gorgeous wooden Buddah table next to the bar. When I arrived I should have known what to expect when a bus of elderly retirees were leaving. This is not authentic Chinese food. Bad sauces. Too much sugar. No balance of sweet, spicy and salty. I'm really disappointed because I was referred by Check Please Arizona (channel 8). The sauces were terrible in flavor. Proteins were delicious and so were all the veggies. I think I came when chef was having an off night. I'll give them another try."
revs = []
word_atten_sen_number = 2

text = BeautifulSoup(rev, "lxml")
text = clean_str(text.get_text())
revs.append(tokenize.sent_tokenize(text))

for i, sentences in enumerate(revs):
    for j, sent in enumerate(sentences):
        if j < MAX_SENTS:
            wordTokens = text_to_word_sequence(sent)
            k=0
            for _, word in enumerate(wordTokens):
                if k<MAX_SENT_LENGTH and tokenizer.word_index[word]<MAX_NB_WORDS:
                    test_data[i,j,k] = tokenizer.word_index[word]
                    k=k+1

test_words = []
test_data_words = np.zeros((1, MAX_SENT_LENGTH), dtype='int32')
for i, sentences in enumerate(revs):
    for j, sent in enumerate(sentences):
        if(j!=word_atten_sen_number):
            continue
        print(sent)
        wordTokens = text_to_word_sequence(sent)
        k=0
        for _, word in enumerate(wordTokens):
            if k<MAX_SENT_LENGTH and tokenizer.word_index[word]<MAX_NB_WORDS:
                test_data_words[i,k] = tokenizer.word_index[word]
                k=k+1
                test_words.append(word)
        break

authentic chinese food.


### Check sentence attention

In [403]:
sent_atten_output = K.function([model.layers[0].input, K.learning_phase()], [model.layers[2].output])
output = sent_atten_output([test_data, 0])[0]  # test mode
print(output[0].shape)

(25, 100)


In [404]:
eij = np.tanh(np.dot(model.layers[-2].get_weights(), np.transpose(output[0])))
ai = np.exp(eij)
for i in range(len(np.sum(test_data[0], axis = 1))):
    if np.sum(test_data[0], axis = 1)[i] == 0:
        ai[0][i] = 0.0
weights = ai/np.sum(ai)

print(weights[0][:len(revs[0])])
print(len(revs[0]))

[ 0.05552082  0.07892545  0.06577173  0.09960775  0.11561187  0.08201135
  0.14752722  0.16850677  0.05990895  0.06151862  0.06508944]
11


In [405]:
list(zip(revs[0], weights[0][:len(revs[0])]))

[('excellent service, clean restaurant, absolutely gorgeous wooden buddah table next bar.',
  0.055520818),
 ('arrived known expect bus elderly retirees leaving.', 0.078925453),
 ('authentic chinese food.', 0.065771729),
 ('bad sauces.', 0.099607751),
 ('much sugar.', 0.11561187),
 ('balance sweet, spicy salty.', 0.082011349),
 ('im really disappointed referred check please arizona (channel 8).',
  0.14752722),
 ('sauces terrible flavor.', 0.16850677),
 ('proteins delicious veggies.', 0.059908953),
 ('think came chef night.', 0.061518621),
 ('ill give another try.', 0.065089442)]

### Check word attention

In [406]:
word_atten_output = K.function([sentEncoder.layers[0].input, K.learning_phase()], [sentEncoder.layers[2].output])
out = word_atten_output([test_data_words, 0])[0]  # test mode
print(out[0].shape)

(100, 100)


In [407]:
eij = np.tanh(np.dot(sentEncoder.layers[-1].get_weights(), np.transpose(out[0])))
ai = np.exp(eij)
count=0
for i in range(len(np.sum(test_data_words, axis = 0))):
    if np.sum(test_data_words, axis = 0)[i] == 0:
        ai[0][i] = 0.0
    else:
        count+=1
weights = ai/np.sum(ai)

print(weights[0][:count])
print(count)

[ 0.44965282  0.27460587  0.27574128]
3


In [408]:
list(zip(test_words, weights[0][:count]))

[('authentic', 0.44965282), ('chinese', 0.27460587), ('food', 0.27574128)]