In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import re

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

from sklearn.metrics import f1_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


### Read data

In [None]:
def read_data(filename):
  data = []

  with open('drive/MyDrive/CS4248 group project/ABSA_data/'+filename, 'r') as xml_file:
    tree = ET.parse(xml_file)
    sentences = tree.getroot()

    for sent in sentences:
      record = dict()
      record["id"] = sent.attrib['id']
      record["text"] = sent.findall(".//text")[0].text
      record["aspectTerms"] = []
      record["aspectCats"] = []

      aspectTerms = sent.findall(".//aspectTerms")
      aspectCats = sent.findall(".//aspectCategories")
    
      if aspectTerms:
        record["aspectTerms"] = [term.attrib for term in sent.findall(".//aspectTerms")[0]]
      if aspectCats:
        record["aspectCats"] = [cat.attrib for cat in sent.findall(".//aspectCategories")[0]]

      data.append(record)
  return pd.DataFrame(data)

In [None]:
data_train = read_data("Restaurants_Train.xml")
data_test = read_data("Restaurants_Test.xml")

In [None]:
data_train = read_data("Laptops_Train.xml")
data_test = read_data("Laptops_Test.xml")

### BIO tagging

In [None]:
def BIO_tagging(text, aspectTerms):
  origin_len = len(text)  # get the original length of the text

  terms_info = []
  for term in aspectTerms:
    terms_info.append((int(term['from']), int(term['to']), len(term['term'].split())))  # (from, to, #words) for each aspect term

  terms_info = sorted(terms_info, key=lambda t: t[0]) # sort the aspect terms by their indices for tagging in order
  indent = 0
  for fr, to, length in terms_info:
    tagging = '_B'+ ' _I'*(length-1)
    text = text[:fr-indent] + tagging + text[to-indent:]
    indent = origin_len - len(text)
  res = []
  for token in word_tokenize(text):
    if token == '_B':
      res.append(1)
    elif token == '_I':
      res.append(2)
    else:
      res.append(0)
  return res

In [None]:
text_train = data_train.apply(lambda x: word_tokenize(x['text']), axis=1)
Y_train = data_train.apply(lambda x: BIO_tagging(x['text'], x['aspectTerms']), axis=1)

In [None]:
text_test = data_test.apply(lambda x: word_tokenize(x['text']), axis=1)
Y_test = data_test.apply(lambda x: BIO_tagging(x['text'], x['aspectTerms']), axis=1)

In [None]:
N_train = len(Y_train)
N_test = len(Y_test)

### Word embeddings

1. Spacy en_core_web_sm

In [None]:
!pip install --upgrade spacy
!python -m spacy download en_core_web_sm

Collecting spacy
  Downloading spacy-3.2.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.0 MB)
[K     |████████████████████████████████| 6.0 MB 22.1 MB/s 
Collecting thinc<8.1.0,>=8.0.12
  Downloading thinc-8.0.15-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (653 kB)
[K     |████████████████████████████████| 653 kB 45.6 MB/s 
Collecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.7-py3-none-any.whl (17 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.2-py3-none-any.whl (7.2 kB)
Collecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Downloading pydantic-1.8.2-cp37-cp37m-manylinux2014_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 58.7 MB/s 
[?25hCollecting typer<0.5.0,>=0.3.0
  Downloading typer-0.4.1-py3-none-any.whl (27 kB)
Collecting langcodes<4.0.0,>=3.2.0
  Downloading langcodes-3.3.0-py3-none-any.whl (181 kB)
[K     |████████████████████████████████| 181 kB 58.7 MB/s 
[?25hCollecting pathy>=0.3.

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
embed_size = 96

In [None]:
X_train = []
X_train_len = []
for sent in text_train:
  embeddings = [nlp(word).vector for word in sent]
  X_train.append(embeddings)
  X_train_len.append(len(embeddings))



In [None]:
X_test = []
X_test_len = []
for sent in text_test:
  embeddings = [nlp(word).vector for word in sent]
  X_test.append(embeddings)
  X_test_len.append(len(embeddings))



2. Gensim glove

In [None]:
import gensim.downloader
# Show all available models in gensim-data
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [None]:
# glove_vectors = gensim.downloader.load('glove-twitter-100')



In [None]:
import pickle

# with open('drive/MyDrive/CS4248 group project/glove_vectors_100.pkl', 'wb') as outp:
#     pickle.dump(glove_vectors, outp, pickle.HIGHEST_PROTOCOL)

In [None]:
# load Glove word embeddings
with open('drive/MyDrive/CS4248 group project/glove_vectors_100.pkl', 'rb') as inp:
    glove_vectors = pickle.load(inp)

In [None]:
embed_size = 100

In [None]:
X_train = []
X_train_len = []
for sent in text_train:
  embeddings = []
  for word in sent:
    if word in glove_vectors:
      embeddings.append(glove_vectors[word.lower()])
    else:
      embeddings.append(np.zeros(100))
  X_train.append(embeddings)
  X_train_len.append(len(embeddings))


In [None]:
X_test = []
X_test_len = []
for sent in text_test:
  embeddings = []
  for word in sent:
    if word in glove_vectors:
      embeddings.append(glove_vectors[word.lower()])
    else:
      embeddings.append(np.zeros(100))
  X_test.append(embeddings)
  X_test_len.append(len(embeddings))

## Modelling

In [None]:
import tensorflow as tf
from tensorflow.keras import Input, Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import Dense, Embedding, SimpleRNN, TimeDistributed, Bidirectional, LSTM
from tensorflow.keras.optimizers import Adam
from tqdm.keras import TqdmCallback

In [None]:
# gpus = tf.config.experimental.list_physical_devices('GPU')
# if gpus:
#   # Restrict TensorFlow to only allocate 1GB of memory on the first GPU
#   try:
#     tf.config.experimental.set_virtual_device_configuration(
#         gpus[0],
#         [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)])
#     logical_gpus = tf.config.experimental.list_logical_devices('GPU')
#     print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
#   except RuntimeError as e:
#     # Virtual devices must be set before GPUs have been initialized
#     print(e)

### Padding

In [None]:
X_train = sequence.pad_sequences(X_train, padding='post')
X_train.shape

(3044, 79, 96)

In [None]:
padded_len = X_train.shape[1]

In [None]:
Y_train = to_categorical(sequence.pad_sequences(Y_train, padding='post'))
Y_train.shape

(3044, 79, 3)

In [None]:
X_test = sequence.pad_sequences(X_test, maxlen=padded_len, padding='post')
X_test.shape

(800, 79, 96)

In [None]:
Y_test = to_categorical(sequence.pad_sequences(Y_test, maxlen=padded_len, padding='post'))
Y_test.shape

(800, 79, 3)

In [None]:
n_tags = 3

### Vanilla RNN

In [None]:
input = Input(shape=(padded_len, embed_size))
model = SimpleRNN(64, return_sequences=True,
                  recurrent_dropout=0.1) (input)
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)

model = Model(input, out)
model.compile(optimizer="adam", loss='categorical_crossentropy', metrics=['acc'])
model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 83, 100)]         0         
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 83, 64)            10560     
                                                                 
 time_distributed_3 (TimeDis  (None, 83, 3)            195       
 tributed)                                                       
                                                                 
Total params: 10,755
Trainable params: 10,755
Non-trainable params: 0
_________________________________________________________________


In [None]:
history = model.fit(
  X_train,
  Y_train,
  batch_size=32,
  epochs=15,
  validation_data=(X_test, Y_test),
  callbacks=[TqdmCallback(verbose=1)],
  verbose=0,
)

0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

In [None]:
Y_pred = model.predict(X_test)
tag_pred = np.array([np.argmax(sent, axis=1) for sent in Y_pred])
tag_pred[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
tag_true = np.array([np.argmax(sent, axis=1) for sent in Y_test])
tag_true[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
f1_score(tag_true.flatten(), tag_pred.flatten(), average='macro')

0.5394446811856677

### LSTM

In [None]:
input = Input(shape=(padded_len, embed_size))
model = LSTM(units=64, return_sequences=True, 
             recurrent_dropout=0.1)(input)
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)

model = Model(input, out)
model.compile(optimizer="adam", loss='categorical_crossentropy', metrics=['acc'])
model.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 83, 100)]         0         
                                                                 
 lstm_2 (LSTM)               (None, 83, 64)            42240     
                                                                 
 time_distributed_4 (TimeDis  (None, 83, 3)            195       
 tributed)                                                       
                                                                 
Total params: 42,435
Trainable params: 42,435
Non-trainable params: 0
_________________________________________________________________


In [None]:
history = model.fit(
  X_train,
  Y_train,
  batch_size=32,
  epochs=15,
  validation_data=(X_test, Y_test),
  callbacks=[TqdmCallback(verbose=1)],
  verbose=0,
)

0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

In [None]:
Y_pred = model.predict(X_test)
tag_pred = np.array([np.argmax(sent, axis=1) for sent in Y_pred])
tag_true = np.array([np.argmax(sent, axis=1) for sent in Y_test])
f1_score(tag_true.flatten(), tag_pred.flatten(), average='macro')

0.5603888836608214

### Bi-LSTM

In [None]:
input = Input(shape=(padded_len,embed_size))
model = Bidirectional(LSTM(units=64, return_sequences=True, 
                           recurrent_dropout=0.1))(input)  
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)

model = Model(input, out)
model.compile(optimizer="adam", loss='categorical_crossentropy', metrics=['acc'])
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 79, 96)]          0         
                                                                 
 bidirectional (Bidirectiona  (None, 79, 128)          82432     
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 79, 3)            387       
 ibuted)                                                         
                                                                 
Total params: 82,819
Trainable params: 82,819
Non-trainable params: 0
_________________________________________________________________


In [None]:
history = model.fit(
  X_train,
  Y_train,
  batch_size=32,
  epochs=15,
  validation_data=(X_test, Y_test),
  callbacks=[TqdmCallback(verbose=1)],
  verbose=0,
)

0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

In [None]:
Y_pred = model.predict(X_test)
tag_pred = np.array([np.argmax(sent, axis=1) for sent in Y_pred])
tag_true = np.array([np.argmax(sent, axis=1) for sent in Y_test])
f1_score(tag_true.flatten(), tag_pred.flatten(), average='macro')

0.7947936432053693

In [None]:
tag_pred_vary_len = [tag_pred[i,:X_test_len[i]] for i in range(N_test)]
tag_true_vary_len = [tag_true[i,:X_test_len[i]] for i in range(N_test)]

In [None]:
def decode_BIO_taggings(text, tags):
  extracted_terms = []
  current_term = ""
  for i, tag in enumerate(tags):
    if tag == 1:
      current_term += text[i]
    elif tag == 2:
      current_term += " " + text[i]
    else:
      if current_term != "" :
        extracted_terms.append(current_term)
        current_term = ""
  return extracted_terms

In [None]:
comparison = pd.DataFrame()
comparison['text'] = data_test['text']
comparison['truth'] = [decode_BIO_taggings(text_test[i], tag_true_vary_len[i]) for i in range(N_test)]
comparison['pred'] = [decode_BIO_taggings(text_test[i], tag_pred_vary_len[i]) for i in range(N_test)]
comparison

Unnamed: 0,text,truth,pred
0,The bread is top notch as well.,[bread],[bread]
1,I have to say they have one of the fastest del...,[delivery times],[]
2,Food is always fresh and hot- ready to eat!,[Food],[Food]
3,Did I mention that the coffee is OUTSTANDING?,[coffee],[coffee]
4,"Certainly not the best sushi in New York, howe...","[sushi, place]","[sushi, place]"
...,...,...,...
795,"Anyway, the owner was fake.",[owner],[owner]
796,Owner is pleasant and entertaining.,[Owner],[Owner]
797,"I have never in my life sent back food before,...","[food, waiter]","[food, waiter]"
798,"Although the restaurant itself is nice, I pref...",[food],[food]


In [None]:
comparison.to_csv("drive/MyDrive/CS4248 group project/tables/test_spacy", index=False)