In [1]:
# importing the required libraries
import xml.etree.ElementTree as ET
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
import gensim.downloader as api
from tensorflow.keras.layers import Embedding
import pandas as pd
import numpy as np

tf.random.set_seed(123)

In [None]:
# downloading the dataset
!gdown 1r8SeRlZgWXd5UDuWBUMZ4_4A9bb_6SjY --fuzzy
!gdown 1UmEt8uQuTF6aa2nwTywqFOLRPHp6kkVv --fuzzy
!gdown 1uw1xQ-ryaWtZeDRzvCAIlKJlGFadUdyL --fuzzy
!gdown 1Zg7-x5iHtFelx_sKQJfsgrU_YHtKXXCr --fuzzy
!gdown 1TJDZsa5IOqxuQryhhFQovN72i-1rBLPZ --fuzzy
!gdown 1uXQSKet_KYkQVYyTWmjnYqFebadRtikn --fuzzy

Downloading...
From (original): https://drive.google.com/uc?id=1r8SeRlZgWXd5UDuWBUMZ4_4A9bb_6SjY
From (redirected): https://drive.google.com/uc?id=1r8SeRlZgWXd5UDuWBUMZ4_4A9bb_6SjY&confirm=t&uuid=06ef58af-29b9-48ec-a7e6-9eb77204dfce
To: /content/ABSA16_Restaurants_Train_SB1_v2.xml
100% 723k/723k [00:00<00:00, 16.2MB/s]
Downloading...
From: https://drive.google.com/uc?id=1UmEt8uQuTF6aa2nwTywqFOLRPHp6kkVv
To: /content/EN_LAPT_SB1_TEST_.xml.gold
100% 272k/272k [00:00<00:00, 62.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=1uw1xQ-ryaWtZeDRzvCAIlKJlGFadUdyL
To: /content/EN_REST_SB1_TEST.xml.gold
100% 266k/266k [00:00<00:00, 25.4MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1Zg7-x5iHtFelx_sKQJfsgrU_YHtKXXCr
From (redirected): https://drive.google.com/uc?id=1Zg7-x5iHtFelx_sKQJfsgrU_YHtKXXCr&confirm=t&uuid=4089e18e-c442-458c-ab02-9ad95c6fa1cd
To: /content/Laptop_Train_v2.xml
100% 687k/687k [00:00<00:00, 77.3MB/s]
Downloading...
From: https://drive.google.com

In [None]:
# defining labels
polar_idx={'positive':0,'negative':1,'neutral':2}
idx_polar={v:k for k,v in polar_idx.items()}

In [None]:
# extract xml file into dictionary form.
def parse_xml_2014(fn):

        root = ET.parse(fn).getroot()
        corpus = []
        for review in root.iter("Review"):
            for sent in review.iter("sentence"):
                target2polarity = {}
                forbid = []
                for ix, opin in enumerate(sent.iter('Opinion')):
                    if opin.attrib['polarity'] in polar_idx:
                        if opin.attrib['target'] in target2polarity and target2polarity[opin.attrib['target']] != opin.attrib['polarity']:
                            forbid.append(opin.attrib['target'])
                        target2polarity[opin.attrib['target']] = opin.attrib['polarity']

                for ix, opin in enumerate(sent.iter('Opinion')):
                    if opin.attrib['target'] not in forbid:
                        corpus.append({"id": sent.attrib['id']+"_"+str(ix),
                                        "sentence": sent.find('text').text,
                                        "term": opin.attrib['target'],
                                        "polarity": opin.attrib['polarity']})

        return corpus


# extract xml file into dictionary form.
def parse_xml_2016(fn):
  import json
  f = open(fn)
  data = json.load(f)
  corpus = []
  for i in range(len(data)):
    txt = data[i]['text']
    tmp = data[i]['opinions']['aspect_term']
    for j in  range(len(tmp)):
      labl = tmp[j]['polarity']
      trm = tmp[j]['term']
      corpus.append({'sentence':txt, 'term': trm, 'polarity': labl})

  return corpus

In [None]:
#V1 datset
restaurant_v1 = pd.DataFrame.from_dict(parse_xml_2014('ABSA16_Restaurants_Train_SB1_v2.xml'))

#V2 dataset
restaurant_v2 = pd.DataFrame.from_dict(parse_xml_2014('EN_REST_SB1_TEST.xml.gold'))

# V3 dataset
restaurant_v3 = pd.DataFrame.from_dict(parse_xml_2016('restaurants-train.json'))

# combining the V1,V2 and V3 into a single dataframe
df=pd.concat([restaurant_v1,restaurant_v2,restaurant_v3],axis=0)

# dropping the id column
df=df.drop(columns=['id'])

In [None]:
df

Unnamed: 0,sentence,term,polarity
0,Judging from previous posts this used to be a ...,place,negative
1,"We, there were four of us, arrived at noon - t...",staff,negative
2,"They never brought us complimentary noodles, i...",,negative
3,The food was lousy - too sweet or too salty an...,food,negative
4,The food was lousy - too sweet or too salty an...,portions,negative
...,...,...,...
3688,Each table has a pot of boiling water sunken i...,pot of boiling water,neutral
3689,Each table has a pot of boiling water sunken i...,meats,neutral
3690,Each table has a pot of boiling water sunken i...,vegetables,neutral
3691,Each table has a pot of boiling water sunken i...,rice,neutral


## EDA

In [None]:
# basic check
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6935 entries, 0 to 3692
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  6935 non-null   object
 1   term      6935 non-null   object
 2   polarity  6935 non-null   object
dtypes: object(3)
memory usage: 216.7+ KB


In [None]:
print(f"The shape of the Dataset is {df.shape}")

The shape of the Dataset is (6935, 3)


In [None]:
df.nunique()

sentence    3606
term        1804
polarity       4
dtype: int64

In [None]:
# checking what kind of polarities do we have
df['polarity'].unique()

array(['negative', 'positive', 'neutral', 'conflict'], dtype=object)

In [None]:
# checking the distribution of polarities
df['polarity'].value_counts()

positive    4381
negative    1704
neutral      759
conflict      91
Name: polarity, dtype: int64

In [None]:
# removing the rows where label is "conflict"
df=df[df['polarity']!='conflict']

print("The shape of the Dataframe after removing the rows where the label is conflict ",df.shape)

The shape of the Dataframe after removing the rows where the label is conflict  (6844, 3)


In [None]:
df['polarity'].value_counts()

positive    4381
negative    1704
neutral      759
Name: polarity, dtype: int64

In [None]:
df['polarity'].value_counts(normalize=True)

positive    0.640123
negative    0.248977
neutral     0.110900
Name: polarity, dtype: float64

The Data is imbalanced

In [None]:
df['term'] = df['term'].apply(lambda x: x if len(x.split(' '))<=2 else 'NULL')
print(f"We have {sum(df['term']=='NULL')} rows where term=NULL")

We have 1320 rows where term=NULL


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['term'] = df['term'].apply(lambda x: x if len(x.split(' '))<=2 else 'NULL')


In [None]:
# so removing those rows
mask=df['term']!='NULL'
df=df[mask]
print(f"The shape of the Dataframe after removing the rows is {df.shape}")

The shape of the Dataframe after removing the rows is (5524, 3)


### Pre-Processing

In [11]:
# defining a function which removes the punctuations from a string
import string
def clean_text(s):
  out = s.translate(str.maketrans('', '', string.punctuation))
  return out

In [None]:
# encoding the labels
df=df.replace({'polarity' : { 'positive' : 2, 'negative' : 0, 'neutral' : 1}})

# removing the punctuations in the senteneces
df['sentence']=df['sentence'].apply(lambda x: clean_text(x))

# converting the text into lowercase
df['sentence']=df['sentence'].apply(lambda x: x.lower() )

# converting the terms to the lower case
df['term']=df['term'].apply(lambda x: x.lower() )


In [None]:
# sanity check
import random
idx = random.randint(0, 100)
df.iloc[idx]['sentence']

'i tend to judge a sushi restaurant by its sea urchin which was heavenly at sushi rose'

In [None]:
df['polarity'].value_counts()

2    3555
0    1302
1     667
Name: polarity, dtype: int64

Our data is imbalanced

In [None]:
# doing oversampling since the data is imbalanced
def oversample(train_df,max_size=1356):

  # iterating through each group
  for class_idx,group in train_df.groupby('polarity'):
    # checking the if the no of datapoints in the class is less than max_size
    if len(group)<max_size:
      # sample max_size-len(group) no of points
      sampled=group.sample(max_size-len(group), replace=True)
      train_df = pd.concat([train_df, sampled], ignore_index=True)

  return train_df

df=oversample(df,max_size=1356)

In [None]:
# sanity check whether the data is balanced
df['polarity'].value_counts()

2    3555
0    1356
1    1356
Name: polarity, dtype: int64

Yes our data is balanced

In [None]:
# train test split
from sklearn.model_selection import train_test_split

y=df['polarity']
x=df.drop(columns=['polarity'])

x_train,x_val_test,y_train,y_val_test= train_test_split(x,y,test_size=0.4,random_state=42)
x_test,x_val,y_test,y_val=train_test_split(x_val_test,y_val_test,test_size=0.5,random_state=42)

print(f"The shape of the training data {x_train.shape}")
print(f"The shape of the testing data {x_test.shape}")
print(f"The shape of the validation data {x_val.shape}")

The shape of the training data (3760, 2)
The shape of the testing data (1253, 2)
The shape of the validation data (1254, 2)


In [None]:
# tokenization
from keras.preprocessing.text import Tokenizer

# creating and fitting the tokenizer
tokenizer=Tokenizer(lower=True)
tokenizer.fit_on_texts(x_train['sentence'])


In [None]:
import keras as ke
ke.__version__

'2.15.0'

In [None]:
import pickle
with open('tokenizer_new.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'is': 4,
 'to': 5,
 'was': 6,
 'food': 7,
 'of': 8,
 'for': 9,
 'i': 10,
 'in': 11,
 'service': 12,
 'great': 13,
 'with': 14,
 'we': 15,
 'but': 16,
 'you': 17,
 'it': 18,
 'are': 19,
 'good': 20,
 'not': 21,
 'that': 22,
 'have': 23,
 'this': 24,
 'place': 25,
 'were': 26,
 'at': 27,
 'had': 28,
 'on': 29,
 'very': 30,
 'they': 31,
 'my': 32,
 'so': 33,
 'be': 34,
 'as': 35,
 'our': 36,
 'an': 37,
 'restaurant': 38,
 'if': 39,
 'wine': 40,
 'all': 41,
 'or': 42,
 'like': 43,
 'their': 44,
 'get': 45,
 'menu': 46,
 'there': 47,
 'from': 48,
 'best': 49,
 'staff': 50,
 'its': 51,
 'dinner': 52,
 'sushi': 53,
 'your': 54,
 'been': 55,
 'delicious': 56,
 'one': 57,
 'pizza': 58,
 'out': 59,
 'has': 60,
 'atmosphere': 61,
 'fish': 62,
 'prices': 63,
 'nice': 64,
 'excellent': 65,
 'here': 66,
 'about': 67,
 'more': 68,
 'when': 69,
 'which': 70,
 'drinks': 71,
 'bar': 72,
 'us': 73,
 'what': 74,
 'some': 75,
 'table': 76,
 'would': 77,
 'dishes': 78,
 'only

In [None]:
# #load word embeddings from gensim api
# word2vec = api.load("word2vec-google-news-300")
# embedding_dim = 300

In [None]:
# preparation of embedding matrix (tokens,embedding_dim)
n_tokens=len(tokenizer.word_index)+1
embedding_matrix=np.zeros((n_tokens,embedding_dim))
hits=miss=0

# filling the embedding matrix
for word,i in tokenizer.word_index.items():
  word_embedding=None

  try:
    word_embedding=word2vec[word]
  except Exception as e:
    pass

  if word_embedding is not None:
    embedding_matrix[i]=word_embedding
    hits+=1
  else:
    miss+=1

print(f"Got embedding for {hits} words")
print(f"Embeddings not found for {miss} words")

Got embedding for 3721 words
Embeddings not found for 461 words


Implementing Bahdanau Attention

In [None]:
class Attention(tf.keras.layers.Layer):

  def __init__(self,units):
    super(Attention,self).__init__(name='Attention')

    self.units=units
    self.W1=tf.keras.layers.Dense(self.units)
    self.W2=tf.keras.layers.Dense(self.units)
    self.V=tf.keras.layers.Dense(1)

  def get_config(self):
    config = super().get_config().copy()
    config.update({'units': self.units, })
    return config

  def call(self,query,keys,flag=False):
    """
    query: (batch_size,embedding_dim)
    keys : (batch_size,max_len,embedding_dim)
    """

    # expanding the query along the time axis ==> query: (batch_size,1,embedding_dim)
    if not flag:
      query_time_axis=tf.expand_dims(query,1)
    else:
      query_time_axis=query

    # calculating the attention scores
    i=self.W1(query_time_axis)    # i=(batch_size,units)
    j=self.W2(keys)               # j=(batch_size,max_len,units)


    # adding i and j (i will get broadcasted to match j's dimension and the result will be (batch_size,max_len,units) ) and applying tanh
    k=tf.nn.tanh(i+j)
    # passing the result to the get the attention scores (batch_size,max_len,1)
    scores=self.V(k)
    # applying softmax along axis=1
    attention_weights=tf.nn.softmax(scores,axis=1)

    # getting the context vector (batch_size,max_len,1) * (batch_size,max_len,embedding_dim) = (batch_size,max_len,embedding_dim) attention_weights tensor will get broadcasted automatically
    context_vector=attention_weights*keys
    # summing up along the max_len axis
    context_vector=tf.reduce_sum(context_vector,axis=1)   # context_vector: (batch_size,embedding_dim)

    return context_vector,attention_weights

Model Building

In [None]:
# defining the embedding layer (we have our own word2vec embedding so we need to specify the embeding matrix)
embedding_layer=Embedding(n_tokens,
                          embedding_dim,
                          embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix), # initializing with the word2vec embeddings
                          trainable=False,
                          name='embedding_layer')

# defining the review input (the input shape is 20, which means each review should be 20 words long)
review_input=tf.keras.Input(shape=(20,),dtype="float64",name='review_input_text')

# defining the aspect input  (the input shape is 2, which means there should be 2 aspects)
aspect_input=tf.keras.Input(shape=(2,),dtype='float64',name='aspect_input_text')

# passing the review and aspects to the embdding layer
review_embedding=embedding_layer(review_input)

aspect_embedding=embedding_layer(aspect_input)

# passing the review_embeddings to a GRU layer
gru_review,_= tf.keras.layers.GRU(64,return_sequences=True,return_state=True)(review_embedding)

# passing the aspect_embedding to the GRU layer
_,gru_aspect= tf.keras.layers.GRU(64,return_sequences=True,return_state=True)(aspect_embedding)

# passing the aspect and reviews from gru layers to attention layer
context_vector,attention_weights= Attention(128)(gru_aspect,gru_review)

# passing the context vectors to the dense layers
dense64_output=tf.keras.layers.Dense(64,activation='selu')(context_vector)
dense16_output=tf.keras.layers.Dense(16,activation='selu')(dense64_output)
output=tf.keras.layers.Dense(3,activation='softmax')(dense16_output)

# creating the model
model= tf.keras.Model(inputs=[review_input,aspect_input],outputs=output)
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["acc"],run_eagerly=True)


In [None]:
model.summary()

Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 aspect_input_text (InputLa  [(None, 2)]                  0         []                            
 yer)                                                                                             
                                                                                                  
 review_input_text (InputLa  [(None, 20)]                 0         []                            
 yer)                                                                                             
                                                                                                  
 embedding_layer (Embedding  multiple                     1254900   ['review_input_text[0][0]',   
 )                                                                   'aspect_input_text[0][0

Train and Test Data prep

In [None]:
# converting the training reviews to review sequences
x_train_reviews=tokenizer.texts_to_sequences(x_train['sentence'])
# padding the review sequences
x_train_reviews_padded=tf.keras.preprocessing.sequence.pad_sequences(x_train_reviews,maxlen=20)

# converting the traning aspects to aspect sequences
x_train_aspects=tokenizer.texts_to_sequences(x_train['term'])
x_train_aspects_padded=tf.keras.preprocessing.sequence.pad_sequences(x_train_aspects,maxlen=2)

# train label processing
y_train= tf.keras.utils.to_categorical(y_train,num_classes=3)


In [None]:
# converting the validation reviews to review sequences
x_val_reviews=tokenizer.texts_to_sequences(x_val['sentence'])
# padding the review sequences
x_val_reviews_padded=tf.keras.preprocessing.sequence.pad_sequences(x_val_reviews,maxlen=20)

# converting the validation aspects to aspect sequences
x_val_aspects=tokenizer.texts_to_sequences(x_val['term'])
x_val_aspects_padded=tf.keras.preprocessing.sequence.pad_sequences(x_val_aspects,maxlen=2)

# val label processing
y_val= tf.keras.utils.to_categorical(y_val,num_classes=3)

In [None]:
# converting the test reviews to review sequences
x_test_reviews=tokenizer.texts_to_sequences(x_test['sentence'])
# padding the review sequences
x_test_reviews_padded=tf.keras.preprocessing.sequence.pad_sequences(x_test_reviews,maxlen=20)

# converting the test aspects to aspect sequences
x_test_aspects=tokenizer.texts_to_sequences(x_test['term'])
x_test_aspects_padded=tf.keras.preprocessing.sequence.pad_sequences(x_test_aspects,maxlen=2)

In [None]:
# tensorboard call back
log_dir = "logs/26th_march"
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# save model callback
save_model = tf.keras.callbacks.ModelCheckpoint(filepath='aspect_based_sa.h5',
  monitor='val_acc',
  mode='max',
  save_weights_only=True,
  save_best_only=True,
  verbose=1
)

# early stopping callback
es = tf.keras.callbacks.EarlyStopping(monitor='val_acc', mode='max', verbose=1, patience=10)

callbacks = [save_model, es]

In [None]:
# training
history=model.fit(
          [x_train_reviews_padded,x_train_aspects_padded],
          y_train,
          batch_size=64,
          epochs=300,
          validation_data=([x_val_reviews_padded,x_val_aspects_padded],y_val),
          callbacks=callbacks,
          shuffle=True
          )

Epoch 1/300
Epoch 1: val_acc improved from -inf to 0.65391, saving model to aspect_based_sa.h5
Epoch 2/300
Epoch 2: val_acc improved from 0.65391 to 0.69378, saving model to aspect_based_sa.h5
Epoch 3/300
Epoch 3: val_acc improved from 0.69378 to 0.71691, saving model to aspect_based_sa.h5
Epoch 4/300
Epoch 4: val_acc improved from 0.71691 to 0.72648, saving model to aspect_based_sa.h5
Epoch 5/300
Epoch 5: val_acc did not improve from 0.72648
Epoch 6/300
Epoch 6: val_acc improved from 0.72648 to 0.73844, saving model to aspect_based_sa.h5
Epoch 7/300
Epoch 7: val_acc improved from 0.73844 to 0.74960, saving model to aspect_based_sa.h5
Epoch 8/300
Epoch 8: val_acc improved from 0.74960 to 0.75837, saving model to aspect_based_sa.h5
Epoch 9/300
Epoch 9: val_acc did not improve from 0.75837
Epoch 10/300
Epoch 10: val_acc did not improve from 0.75837
Epoch 11/300
Epoch 11: val_acc improved from 0.75837 to 0.76555, saving model to aspect_based_sa.h5
Epoch 12/300
Epoch 12: val_acc did not im

In [None]:
# saving the tokenizer
import pickle

# Save tokenizer to a file
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [None]:
# Save the trained model
model.save("absa.h5")


  saving_api.save_model(


In [None]:
import tensorflow as tf

# Save the model in Protobuf format
tf.saved_model.save(model, 'model_pb')


In [None]:
# converting our y_test label into OHE
y_test= tf.keras.utils.to_categorical(y_test,num_classes=3)

# Evaluate the model on test data
evaluation = model.evaluate([x_test_reviews_padded, x_test_aspects_padded], y_test)




In [None]:
import tensorflow as tf

# Load the saved model
model = tf.saved_model.load("pb_model")

Model Inferencing

In [12]:
test_review = None
test_aspect = None
label_map = {
    0:'Negative',
    1: 'Neutral',
    2:'Positive'
}

# getting the test review
while not test_review:
    test_review = input("Enter your review: ")
    if not test_review.strip():
        print("Please enter a valid non-empty review.")

# preprocessing the test review
cleaned_test_review=clean_text(test_review)
cleaned_test_review=cleaned_test_review.lower()

# getting the aspect
while not test_aspect:
    test_aspect = input("Enter the aspect: ")
    if not test_aspect.strip():  # Check if the input is empty or only whitespace
        print("Please enter a non-empty aspect.")
    elif len(test_aspect.split(" "))!=1:
      print("Please enter a single aspect")


# preprocessing the test aspect
cleaned_test_aspect=clean_text(test_review)
cleaned_test_aspect=cleaned_test_aspect.lower()

Enter your review: This food tastes good
Enter the aspect: Food


In [14]:
# loading the tokenzier
with open('pb_model/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# converting the test reviews to review sequences
test_review_sequence=tokenizer.texts_to_sequences([cleaned_test_review])
# padding the review sequences
test_reviews_sequence_padded=tf.keras.preprocessing.sequence.pad_sequences(test_review_sequence,maxlen=20)

# converting the test aspect to aspect sequence
test_aspect_sequence=tokenizer.texts_to_sequences([cleaned_test_aspect])
# padding the review sequences
test_aspect_sequence_padded=tf.keras.preprocessing.sequence.pad_sequences(test_aspect_sequence,maxlen=2)

In [17]:
# Load the saved model
model = tf.saved_model.load("pb_model")

# Convert input data to tensors
test_review_tensor = tf.constant(test_reviews_sequence_padded, dtype=tf.float64)
test_aspect_tensor = tf.constant(test_aspect_sequence_padded, dtype=tf.float64)

# Perform inference using the appropriate signature
infer = model.signatures["serving_default"]
output = infer(aspect_input_text=test_aspect_tensor, review_input_text=test_review_tensor)

# Extract predictions from the output
predictions = output["dense_56"].numpy()

# Get the predicted label using argmax
predicted_label = label_map[np.argmax(predictions)]

# Print the predicted label
print(predicted_label)


Positive


In [18]:
# # prediction
# test_review_input= tf.keras.Input(shape=(20,),dtype="float64")
# test_aspect_input=tf.keras.Input(shape=(2,),dtype="float64")

# pred=model.predict([test_reviews_sequence_padded,test_aspect_sequence_padded])
# print(label_map[np.argmax(pred)])