In [65]:
from keras.models import Model
from keras import initializers
from keras.layers import Layer
from keras.layers import TimeDistributed
from keras.layers import Dense, Input, Embedding, GRU, Bidirectional
from keras_preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import to_categorical
from keras.initializers import Constant
from nltk import tokenize   
from bs4 import BeautifulSoup
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import re

# Hyperparameters shared by the preprocessing and model cells.
embedding_dim = 100  # width of each word vector in the Embedding layer
batch_size = 64  # intended mini-batch size for training
gru_dim = 50  # units per direction in each Bidirectional GRU
max_words = 20000  # vocabulary cap given to Tokenizer(num_words=...) and used to filter word ids
max_sentences = 15  # sentences kept per review (second axis of the input tensor)
max_len = 100  # words kept per sentence (third axis of the model input)

In [66]:
class HierarchicalAttentionNetwork(Layer):
    """Additive attention pooling over the step axis of a 3-D tensor.

    For an input ``x`` of shape ``(batch, steps, features)`` the layer computes

        u_t    = tanh(x_t @ W + b)
        a_t    = exp(u_t @ u) / (sum_s exp(u_s @ u) + epsilon)
        output = sum_t a_t * x_t

    and returns a tensor of shape ``(batch, features)``. When a mask is
    supplied, masked steps get zero attention weight before normalisation.

    Args:
        attention_dim: size of the hidden projection used to score each step.
        **kwargs: standard ``Layer`` keyword arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, attention_dim, **kwargs):
        # Initialise the base Layer first so attribute tracking is set up
        # before any attribute is assigned on the instance.
        super(HierarchicalAttentionNetwork, self).__init__(**kwargs)
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = attention_dim

    def build(self, input_shape):
        """Create the projection ``W``, bias ``b`` and context vector ``u``."""
        if len(input_shape) != 3:
            raise ValueError(
                "HierarchicalAttentionNetwork expects a 3-D input "
                "(batch, steps, features); got shape %s" % (tuple(input_shape),)
            )
        feature_dim = input_shape[-1]

        # add_weight registers each variable with the layer, so it is listed in
        # trainable_weights, updated by the optimizer and saved with the model.
        self.W = self.add_weight(
            name='W',
            shape=(feature_dim, self.attention_dim),
            initializer=self.init,
            trainable=True,
        )
        self.b = self.add_weight(
            name='b',
            shape=(self.attention_dim,),
            initializer=self.init,
            trainable=True,
        )
        self.u = self.add_weight(
            name='u',
            shape=(self.attention_dim, 1),
            initializer=self.init,
            trainable=True,
        )

        super(HierarchicalAttentionNetwork, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        # The step axis is summed away in call(), so a (batch, steps) mask no
        # longer lines up with the (batch, features) output; stop propagating it.
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        # Hidden representation of every step: (batch, steps, attention_dim).
        uit = tf.tanh(tf.matmul(x, self.W) + self.b)

        # Unnormalised, strictly positive score per step: (batch, steps).
        ait = tf.exp(tf.squeeze(tf.matmul(uit, self.u), -1))

        if mask is not None:
            # Zero the score of masked steps so they receive no weight.
            ait *= tf.cast(mask, ait.dtype)

        # Normalise over the step axis; epsilon guards fully masked rows.
        ait /= tf.cast(
            tf.reduce_sum(ait, axis=1, keepdims=True) + tf.keras.backend.epsilon(),
            ait.dtype,
        )

        weighted_input = x * tf.expand_dims(ait, axis=-1)

        output = tf.reduce_sum(weighted_input, axis=1)
        return output

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(HierarchicalAttentionNetwork, self).get_config()
        config.update({'attention_dim': self.attention_dim})
        return config
    

In [67]:
# Input files, relative to the notebook's working directory. Both are read
# line by line: each line of the first becomes one entry of `texts`, each line
# of the second one entry of `scores`.
yelp_path_text = "./Dataset/yelp_2013_texts.txt"
yelp_path_score = "./Dataset/yelp_2013_score.txt"

In [68]:
# Read the review texts, one whitespace-stripped line per entry; undecodable
# bytes are dropped rather than raising.
with open(yelp_path_text, 'r', encoding='utf-8', errors="ignore") as review_file:
    texts = [raw_line.strip() for raw_line in review_file]

# Read the score lines in the same way (kept as strings at this point).
with open(yelp_path_score, 'r') as score_file:
    scores = [raw_line.strip() for raw_line in score_file]

In [69]:
# Shuffle reviews and scores with one shared, seeded permutation so the pairs
# stay aligned and the order is the same after every kernel restart.
shuffle_seed = 42
# Like zip(), only positions present in both lists are kept.
pair_count = min(len(texts), len(scores))
shuffle_order = np.random.default_rng(shuffle_seed).permutation(pair_count)
texts = [texts[idx] for idx in shuffle_order]
scores = [scores[idx] for idx in shuffle_order]

In [70]:
# Keep at most the first 20000 review/score pairs.
texts, scores = texts[:20000], scores[:20000]

In [71]:
def clean_str(string):
    """Normalise one raw review before sentence splitting and tokenisation.

    Steps, in order:
      1. drop every non-ASCII character;
      2. lower-case the text;
      3. remove backslashes, single quotes and double quotes;
      4. remove the ``<sssss>``, ``-lrb-`` and ``-rrb-`` markers and any
         literal three-dot ellipsis;
      5. strip leading and trailing whitespace.

    Lower-casing runs before the marker removal so that upper- or mixed-case
    markers such as ``-LRB-`` are removed as well, instead of surviving as
    lower-cased residue in the output.

    Args:
        string: raw review text.

    Returns:
        The cleaned, lower-case ASCII string. Removed spans are replaced by
        nothing, so whitespace that surrounded them is left in place.
    """
    string = re.sub(r'[^\x00-\x7F]+', r'', string)
    string = string.lower()
    # Applied one after another (not as a single alternation) so a match
    # exposed by an earlier removal is still caught by a later pattern.
    for pattern in (r"\\", r"\'", r"\"", r"<sssss>", r"-lrb-", r"-rrb-", r"\.\.\."):
        string = re.sub(pattern, "", string)
    return string.strip()

In [72]:
# Clean every review in place (the tokenizer is later fitted on `texts`),
# then split each cleaned review into a list of sentences.
texts[:] = [clean_str(raw_review) for raw_review in texts]
reviews = [tokenize.sent_tokenize(cleaned_review) for cleaned_review in texts]

In [73]:
# Build the vocabulary from the cleaned review texts; num_words is set to the
# max_words cap.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)

In [74]:
# word -> integer id lookup taken from the fitted tokenizer. It holds every
# word seen during fitting, so its length can exceed max_words.
word_index = tokenizer.word_index
print("Word index length:", len(word_index))

Word index length: 37970


In [75]:
# Zero-initialised id tensor of shape (reviews, max_sentences, max_len).
# The last axis uses max_len (not a literal) so it always matches the bound
# used when filling the tensor and the model's Input shape.
x = np.zeros((len(reviews), max_sentences, max_len), dtype='int32')

In [76]:
# Fill x with word ids: at most max_sentences sentences per review and at most
# max_len in-vocabulary words per sentence; unused slots stay 0.
for i, sentences in enumerate(reviews):
    for j, sen in enumerate(sentences[:max_sentences]):
        k = 0
        for word in text_to_word_sequence(sen):
            if k >= max_len:
                # Sentence row is full; remaining words cannot be stored.
                break
            word_id = word_index.get(word)
            # Skip words missing from the index (avoids a KeyError) and words
            # whose id lies outside the max_words vocabulary cap.
            if word_id is None or word_id >= max_words:
                continue
            x[i, j, k] = word_id
            k += 1

In [77]:
# Sanity check: expect (number of reviews, max_sentences, words per sentence).
print(x.shape)

(20000, 15, 100)


In [78]:
# One-hot encode the scores. The class count is pinned to 6 (the size of the
# model's softmax output) instead of being inferred from the largest score in
# the sample, so the label width cannot drift from the output layer.
num_classes = 6
y = to_categorical([int(score) for score in scores], num_classes=num_classes)

# Hold out the last 10% of the samples for validation. Integer arithmetic
# gives 18000 for 20000 samples and still scales to smaller corpora.
split_at = len(x) * 9 // 10
x_train, x_val = x[:split_at], x[split_at:]
y_train, y_val = y[:split_at], y[split_at:]

In [79]:
# Parse the GloVe text file into a word -> float32 vector lookup. Each line is
# split on whitespace: first field is the word, the rest are its coefficients.
embeddings = {}
with open('./glove.6B.100d.txt', encoding='utf-8') as glove_file:
    for row in glove_file:
        fields = row.split()
        embeddings[fields[0]] = np.asarray(fields[1:], dtype='float32')

In [80]:
# One row per word id (row 0 is never assigned a word here). Rows start as
# uniform random values and are overwritten with the pretrained vector when
# the word has one. The width comes from embedding_dim so the matrix always
# matches the Embedding layer it initialises.
embedding_matrix = np.random.random((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [85]:
# Input: one review as a (max_sentences, max_len) grid of word ids.
review_input = Input(shape=(max_sentences, max_len), dtype='int32')

# Word level: embed the ids of each sentence, encode them with a bidirectional
# GRU and pool the word states into one vector per sentence.
word_embedding = Embedding(
    embedding_matrix.shape[0],
    embedding_dim,
    embeddings_initializer=Constant(embedding_matrix),
    mask_zero=True,
    trainable=True,
)
embedded_sequences = TimeDistributed(word_embedding)(review_input)
word_gru = Bidirectional(GRU(gru_dim, return_sequences=True))
word_encoder = TimeDistributed(word_gru)(embedded_sequences)
word_pooling = HierarchicalAttentionNetwork(100)
word_attn = TimeDistributed(word_pooling)(word_encoder)

# Sentence level: encode the sequence of sentence vectors and pool it into a
# single review vector.
sentence_gru = Bidirectional(GRU(gru_dim, return_sequences=True))
sentence_encoder = sentence_gru(word_attn)
sentence_attn = HierarchicalAttentionNetwork(100)(sentence_encoder)

# Classifier head: softmax over 6 classes.
output = Dense(6, activation='softmax')(sentence_attn)
model = Model(review_input, output)


In [86]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [87]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Use the batch_size constant from the configuration cell rather than a
# repeated literal, and keep the History object so the per-epoch metrics can be
# inspected later (this also keeps its repr out of the cell output).
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=5,
    batch_size=batch_size,
)

Epoch 1/5
282/282 ━━━━━━━━━━━━━━━━━━━━ 790s 3s/step - accuracy: 0.3410 - loss: 1.4626 - val_accuracy: 0.3610 - val_loss: 1.3815
Epoch 2/5
282/282 ━━━━━━━━━━━━━━━━━━━━ 764s 3s/step - accuracy: 0.4210 - loss: 1.2777 - val_accuracy: 0.5625 - val_loss: 1.0012
Epoch 3/5
282/282 ━━━━━━━━━━━━━━━━━━━━ 706s 3s/step - accuracy: 0.5839 - loss: 0.9280 - val_accuracy: 0.5860 - val_loss: 0.9597
Epoch 4/5
282/282 ━━━━━━━━━━━━━━━━━━━━ 548s 2s/step - accuracy: 0.6558 - loss: 0.7836 - val_accuracy: 0.5870 - val_loss: 0.9700
Epoch 5/5
282/282 ━━━━━━━━━━━━━━━━━━━━ 588s 2s/step - accuracy: 0.7205 - loss: 0.6763 - val_accuracy: 0.5840 - val_loss: 1.0429


<keras.src.callbacks.history.History at 0x1fb2f8db3e0>