In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import xml.etree.ElementTree as ET
import pandas as pd


In [2]:
def import_file_to_df(file_path):
  """Load an ABSA XML file into a DataFrame with one row per sentence.

  The root element is expected to contain sentence elements, each carrying an
  ``id`` attribute, a ``<text>`` child and, optionally, an ``<aspectTerms>``
  child whose children hold the aspect-term attributes.

  Args:
    file_path: Path of the XML file (anything ``ET.parse`` accepts).

  Returns:
    pandas.DataFrame with columns ``id``, ``text`` and ``aspectTerms``, where
    ``aspectTerms`` is a list of attribute dicts (empty when the sentence has
    no ``<aspectTerms>`` element).

  Raises:
    KeyError: a sentence element has no ``id`` attribute.
    IndexError: a sentence element has no ``<text>`` descendant.
  """
  # Give ElementTree the path itself so the parser reads bytes and honours the
  # encoding declared in the XML, instead of the locale default that a
  # text-mode open() would apply.
  sentences = ET.parse(file_path).getroot()

  data = []
  for sent in sentences:
    # Look the container up once; compare with None because an element with
    # no children is falsy.
    terms_node = sent.find(".//aspectTerms")
    data.append({
      "id": sent.attrib['id'],
      "text": sent.findall(".//text")[0].text,
      "aspectTerms": [term.attrib for term in terms_node] if terms_node is not None else [],
    })
  return pd.DataFrame(data)

In [None]:
df = import_file_to_df('drive/MyDrive/ABSA_data/Restaurants_Train.xml')
df

Unnamed: 0,id,text,aspectTerms
0,3121,But the staff was so horrible to us.,"[{'term': 'staff', 'polarity': 'negative', 'fr..."
1,2777,"To be completely fair, the only redeeming fact...","[{'term': 'food', 'polarity': 'positive', 'fro..."
2,1634,"The food is uniformly exceptional, with a very...","[{'term': 'food', 'polarity': 'positive', 'fro..."
3,2534,Where Gabriela personaly greets you and recomm...,[]
4,583,"For those that go once and don't enjoy it, all...",[]
...,...,...,...
3039,1063,But that is highly forgivable.,[]
3040,777,"From the appetizers we ate, the dim sum and ot...","[{'term': 'appetizers', 'polarity': 'positive'..."
3041,875,"When we arrived at 6:00 PM, the restaurant was...",[]
3042,671,Each table has a pot of boiling water sunken i...,"[{'term': 'table', 'polarity': 'neutral', 'fro..."


## POS tagging

In [None]:
!pip install --upgrade spacy
!python -m spacy download en_core_web_sm

In [None]:
import spacy

In [None]:
nlp = spacy.load("en_core_web_sm")

In [None]:
# Run every sentence through the spaCy pipeline once (roughly 30s) and keep,
# per sentence, the token texts, the token.tag_ values and the noun chunks.
tokens_series, tags_series, noun_chunks_series = [], [], []
for _, row in df.iterrows():
  doc = nlp(row['text'])
  tokens_series.append([tok.text for tok in doc])
  tags_series.append([tok.tag_ for tok in doc])
  noun_chunks_series.append(list(doc.noun_chunks))

# Attach the three per-sentence lists as new columns of df.
df['tokens'] = tokens_series
df['tags'] = tags_series
df['noun_chunks'] = noun_chunks_series

In [None]:
df

Unnamed: 0,id,text,aspectTerms,tokens,tags,noun_chunks
0,3121,But the staff was so horrible to us.,"[{'term': 'staff', 'polarity': 'negative', 'fr...","[But, the, staff, was, so, horrible, to, us, .]","[CC, DT, NN, VBD, RB, JJ, IN, PRP, .]","[(the, staff), (us)]"
1,2777,"To be completely fair, the only redeeming fact...","[{'term': 'food', 'polarity': 'positive', 'fro...","[To, be, completely, fair, ,, the, only, redee...","[TO, VB, RB, JJ, ,, DT, JJ, VBG, NN, VBD, DT, ...","[(the, only, redeeming, factor), (the, food), ..."
2,1634,"The food is uniformly exceptional, with a very...","[{'term': 'food', 'polarity': 'positive', 'fro...","[The, food, is, uniformly, exceptional, ,, wit...","[DT, NN, VBZ, RB, JJ, ,, IN, DT, RB, JJ, NN, W...","[(The, food), (a, very, capable, kitchen), (wh..."
3,2534,Where Gabriela personaly greets you and recomm...,[],"[Where, Gabriela, personaly, greets, you, and,...","[WRB, NNP, NNP, VBZ, PRP, CC, VBZ, PRP, WP, TO...","[(Gabriela, personaly), (you), (you), (what)]"
4,583,"For those that go once and don't enjoy it, all...",[],"[For, those, that, go, once, and, do, n't, enj...","[IN, DT, WDT, VBP, RB, CC, VBP, RB, VB, PRP, ,...","[(those), (that), (it), (all), (I), (they), (it)]"
...,...,...,...,...,...,...
3039,1063,But that is highly forgivable.,[],"[But, that, is, highly, forgivable, .]","[CC, DT, VBZ, RB, JJ, .]",[(that)]
3040,777,"From the appetizers we ate, the dim sum and ot...","[{'term': 'appetizers', 'polarity': 'positive'...","[From, the, appetizers, we, ate, ,, the, dim, ...","[IN, DT, NNS, PRP, VBD, ,, DT, JJ, NN, CC, JJ,...","[(the, appetizers), (we), (foods), (it), (the,..."
3041,875,"When we arrived at 6:00 PM, the restaurant was...",[],"[When, we, arrived, at, 6:00, PM, ,, the, rest...","[WRB, PRP, VBD, IN, CD, NN, ,, DT, NN, VBD, RB...","[(we), (6:00, PM), (the, restaurant)]"
3042,671,Each table has a pot of boiling water sunken i...,"[{'term': 'table', 'polarity': 'neutral', 'fro...","[Each, table, has, a, pot, of, boiling, water,...","[DT, NN, VBZ, DT, NN, IN, NN, NN, VBN, IN, PRP...","[(Each, table), (a, pot), (boiling, water), (i..."


## B-LSTM + CRF Model

NeuroNER needs TensorFlow 1, so we need to downgrade from the TensorFlow 2 that comes with Colab.

In [3]:
%tensorflow_version 1.15.2
import tensorflow as tf
print(tf.__version__)

`%tensorflow_version` only switches the major version: 1.x or 2.x.
You set: `1.15.2`. This will be interpreted as: `1.x`.


TensorFlow 1.x selected.
1.15.2


Download neuroNER engine

In [4]:
!pip3 install pyneuroner[cpu]

Collecting pyneuroner[cpu]
  Downloading pyneuroner-1.0.8-py2.py3-none-any.whl (26.9 MB)
[K     |████████████████████████████████| 26.9 MB 3.9 MB/s 
[?25hCollecting pycorenlp>=0.3.0
  Downloading pycorenlp-0.3.0.tar.gz (1.3 kB)
Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
[K     |████████████████████████████████| 462 kB 39.9 MB/s 
Building wheels for collected packages: pycorenlp
  Building wheel for pycorenlp (setup.py) ... [?25l[?25hdone
  Created wheel for pycorenlp: filename=pycorenlp-0.3.0-py3-none-any.whl size=2145 sha256=1070d7619633e98bd60e6aac16f2aff948e207a71eaf61bd31fce174beda1375
  Stored in directory: /root/.cache/pip/wheels/83/d8/ad/6b2276343ac605ee47e6beddb28331e96377909e5c816539c3
Successfully built pycorenlp
Installing collected packages: tf-estimator-nightly, pycorenlp, pyneuroner
Successfully installed pycorenlp-0.3.0 pyneuroner-1.0.8 tf-estimator-nightly-2.8.0.dev202112

Download the SpaCy English module

In [6]:
!python -m spacy download en

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 7.9 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


Download the word embeddings. The original paper uses fastText embeddings; we start with GloVe and can try fastText later.

In [7]:
!wget -P data/word_vectors http://neuroner.com/data/word_vectors/glove.6B.100d.zip
!unzip data/word_vectors/glove.6B.100d.zip -d data/word_vectors/

--2022-04-06 07:10:12--  http://neuroner.com/data/word_vectors/glove.6B.100d.zip
Resolving neuroner.com (neuroner.com)... 142.44.246.184
Connecting to neuroner.com (neuroner.com)|142.44.246.184|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 122612186 (117M) [application/zip]
Saving to: ‘data/word_vectors/glove.6B.100d.zip’


2022-04-06 07:10:23 (11.3 MB/s) - ‘data/word_vectors/glove.6B.100d.zip’ saved [122612186/122612186]

Archive:  data/word_vectors/glove.6B.100d.zip
  inflating: data/word_vectors/glove.6B.100d.txt  


Some preparation work

In [4]:
import numpy as np
# from keras.preprocessing import sequence
# from keras.models import Sequential
# from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from neuroner import neuromodel
import spacy
from sklearn.model_selection import train_test_split

In [5]:
nlp = spacy.load("en_core_web_sm")

Generate input file

In [6]:
def get_idx_iob(idx, range_list):
  """Return the IOB2 label for the position ``idx``.

  ``range_list`` holds (start, end) pairs with an exclusive end. The first pair
  that contains ``idx`` decides the label: 'B-MISC' when ``idx`` equals the
  start, 'I-MISC' otherwise. 'O' is returned when no pair contains ``idx``.
  """
  for span in range_list:
    if not (span[0] <= idx < span[1]):
      continue
    return 'B-MISC' if span[0] == idx else 'I-MISC'
  return 'O'

# CoNLL-2003 format files
# Each word to be put on a separate line and there is an empty line after each sentence.
# Each line  has format: <token> + <single space> + <iob tag>, e.g.: "This O"
def generate_input_file(path, filename, data_df):
  """Write ``data_df`` as a CoNLL-style token/tag file.

  One line per token in the form ``<token> <iob tag>``, with an empty line
  after each sentence. Tokens come from the module-level spaCy pipeline
  ``nlp``; tags come from ``get_idx_iob`` applied to each token's character
  offset and the sentence's aspect-term (from, to) ranges.

  Args:
    path: Directory prefix; it is concatenated with ``filename`` as-is, so it
      must already end with a path separator.
    filename: Name of the file to create.
    data_df: DataFrame with a ``text`` column and an ``aspectTerms`` column
      holding lists of dicts with ``from`` and ``to`` keys.

  Raises:
    FileExistsError: the target file already exists (exclusive-create mode).
  """
  # The context manager closes the file even if tokenisation or a malformed
  # aspect term raises part-way through.
  # NOTE(review): UTF-8 is assumed to be what the downstream reader expects - confirm.
  with open(path + filename, "x", encoding="utf-8") as f:
    for _, row in data_df.iterrows():
      # A list of tuples, each tuple is of (<from>, <to>)
      aspectTermRanges = [(int(aspectTerm['from']), int(aspectTerm['to'])) for aspectTerm in row['aspectTerms']]
      sentence_doc = nlp(row['text'])
      for token in sentence_doc:
        # Whitespace-only tokens would produce a line without a token.
        if len(token.text.strip()) == 0:
          continue
        f.write(token.text + ' ' + get_idx_iob(token.idx, aspectTermRanges) + '\n')
      f.write('\n')

In [7]:
input_df = import_file_to_df('drive/MyDrive/ABSA_data/Restaurants_Train.xml')
test_df = import_file_to_df('drive/MyDrive/ABSA_data/Restaurants_Test.xml')

In [None]:
# Drop every sentence whose text contains 'touchpad' (train and test) or
# 'programs' (test only). str.contains treats the pattern as a regex by default.
# NOTE(review): these words look laptop-specific while the files loaded above
# are the restaurant sets - confirm this filter is still intended here.
input_df = input_df.drop(input_df[input_df.text.str.contains('touchpad')].index)
test_df = test_df.drop(test_df[test_df.text.str.contains('touchpad')].index)
test_df = test_df.drop(test_df[test_df.text.str.contains('programs')].index)

In [12]:
# Hold out 20% of the training sentences for validation. The fixed seed makes
# the split (and therefore the reported scores) reproducible across runs.
train_df, validate_df = train_test_split(input_df, test_size=0.2, random_state=42)

In [13]:
# Takes roughly 40s to run.
# generate_input_file opens its target in exclusive-create mode ("x"), so
# re-running this cell fails with FileExistsError until the existing
# train/valid/test files are removed.
generate_input_file('./', 'train.txt', train_df)
generate_input_file('./', 'valid.txt', validate_df)
generate_input_file('./', 'test.txt', test_df)

In [None]:
!rm -r /content/valid*
!rm -r /content/train*
!rm -r /content/test*

Let's roll

In [12]:
# Obtain a model, all configurable params: https://github.com/Franck-Dernoncourt/NeuroNER/blob/master/parameters.ini
# The parameters used here follow the original paper except for mini-batch SGD. This is because vanilla NeuroNER does not support batch training (i.e. it runs with batch size = 1).
# The paper used mini-batch SGD with batch size 64. We need to modify the NeuroNER source to support this.
# So as a baseline, use batch size = 1 first.
# Reset the default TF1 graph first so this cell can be re-run in the same kernel.
tf.reset_default_graph()
nn = neuromodel.NeuroNER(use_crf=True, use_character_lstm=False, train_model=True, use_pretrained_model=False, dataset_text_folder='./', optimizer='adam', learning_rate = 0.01)

{'character_embedding_dimension': 25,
 'character_lstm_hidden_state_dimension': 25,
 'check_for_digits_replaced_with_zeros': 1,
 'check_for_lowercase': 1,
 'dataset_text_folder': './',
 'debug': 0,
 'dropout_rate': 0.5,
 'experiment_name': 'experiment',
 'freeze_token_embeddings': 0,
 'gradient_clipping_value': 5.0,
 'learning_rate': 0.01,
 'load_all_pretrained_token_embeddings': 0,
 'load_only_pretrained_token_embeddings': 0,
 'main_evaluation_mode': 'conll',
 'maximum_number_of_epochs': 100,
 'number_of_cpu_threads': 8,
 'number_of_gpus': 0,
 'optimizer': 'adam',
 'output_folder': './output',
 'output_scores': 0,
 'parameters_filepath': './parameters.ini',
 'patience': 10,
 'plot_format': 'pdf',
 'pretrained_model_folder': './trained_models/conll_2003_en',
 'reload_character_embeddings': 1,
 'reload_character_lstm': 1,
 'reload_crf': 1,
 'reload_feedforward': 1,
 'reload_token_embeddings': 1,
 'reload_token_lstm': 1,
 'remap_unknown_tokens_to_unk': 1,
 'spacylanguage': 'en',
 'taggin

In [13]:
nn.fit()


Starting epoch 0
Training completed in 0.00 seconds
Evaluate model on the train set
processed 38203 tokens with 2960 phrases; found: 22177 phrases; correct: 1153.
accuracy:  32.79%; precision:   5.20%; recall:  38.95%; FB1:   9.17
             MISC: precision:   5.20%; recall:  38.95%; FB1:   9.17  22177

Evaluate model on the valid set
processed 9378 tokens with 739 phrases; found: 5475 phrases; correct: 301.
accuracy:  33.01%; precision:   5.50%; recall:  40.73%; FB1:   9.69
             MISC: precision:   5.50%; recall:  40.73%; FB1:   9.69  5475

Evaluate model on the test set
processed 12758 tokens with 1134 phrases; found: 7409 phrases; correct: 445.
accuracy:  33.16%; precision:   6.01%; recall:  39.24%; FB1:  10.42
             MISC: precision:   6.01%; recall:  39.24%; FB1:  10.42  7409

Generating plots for the train set
Generating plots for the valid set
Generating plots for the test set
/usr/local/lib/python3.7/dist-packages/neuroner
shell_command: perl /usr/local/lib/pyth

Glove + single b-LSTM + CRF: 


*   Restaurants: 60s, acc-0.9559, f1-0.7743
*   laptop: 57.23s, acc-0.9513, f1-0.6086

Glove and bi-LSTM char embedding + single b-LSTM + CRF: 


*   Restaurants: 76s, acc-0.9485, f1-0.7417
*   laptop: 77.39s, acc-0.9475, f1-0.6133



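To save the model, set the parameters in the `__main__` block at the bottom of the cell below (`output_folder_name`, `epoch_number`, `model_name`, `delete_token_mappings`) and run the cell.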

In [None]:
import glob
import os
import pickle
from pprint import pprint
import shutil
import neuroner.utils as utils

from neuroner.entity_lstm import EntityLSTM
import tensorflow as tf
from tensorflow.python.tools.inspect_checkpoint import print_tensors_in_checkpoint_file

from neuroner import utils_tf
from neuroner import neuromodel

def trim_dataset_pickle(input_dataset_filepath, output_dataset_filepath=None, delete_token_mappings=False):
    '''
    Remove the dataset and labels from dataset.pickle.

    input_dataset_filepath: pickle file holding the dataset object.
    output_dataset_filepath: where the trimmed object is written; defaults to
        'dataset_trimmed.pickle' in the same folder as the input file.
    delete_token_mappings: if True, token_to_index and index_to_token are also
        reduced to the single UNK entry.

    The removed keys and the remaining attributes are printed. Returns None.
    '''
    print("Trimming dataset.pickle..")
    if output_dataset_filepath is None:
        output_dataset_filepath = os.path.join(os.path.dirname(input_dataset_filepath),
            'dataset_trimmed.pickle')
    # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
    with open(input_dataset_filepath, 'rb') as dataset_file:
        dataset = pickle.load(dataset_file)
    attributes = dataset.__dict__
    count = 0
    print("Keys removed:")
    keys_to_remove = ['character_indices', 'character_indices_padded', 'characters',
        'label_indices', 'label_vector_indices', 'labels', 'token_indices',
        'token_lengths', 'tokens', 'infrequent_token_indices', 'tokens_mapped_to_unk']
    for key in keys_to_remove:
        if key in attributes:
            del attributes[key]
            print('\t' + key)
            count += 1
    if delete_token_mappings:
        # Keep only the UNK entry in both directions of the vocabulary mapping.
        attributes['token_to_index'] = {attributes['UNK']: attributes['UNK_TOKEN_INDEX']}
        attributes['index_to_token'] = {attributes['UNK_TOKEN_INDEX']: attributes['UNK']}
    print("Number of keys removed: {0}".format(count))
    pprint(attributes)
    # Close the output explicitly so the trimmed pickle is fully flushed to disk
    # before any later step reads it back.
    with open(output_dataset_filepath, 'wb') as trimmed_file:
        pickle.dump(dataset, trimmed_file)
    print("Done!")


def trim_model_checkpoint(parameters_filepath, dataset_filepath, input_checkpoint_filepath, 
    output_checkpoint_filepath):
    '''
    Remove all token embeddings except UNK.

    parameters_filepath: parameters.ini used to rebuild the model.
    dataset_filepath: dataset pickle; it is rewritten in place with
        vocabulary_size set to 1.
    input_checkpoint_filepath: checkpoint to restore the trained weights from.
    output_checkpoint_filepath: where the trimmed checkpoint is saved.
    '''
    parameters, _ = neuromodel.load_parameters(parameters_filepath=parameters_filepath)
    # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
    with open(dataset_filepath, 'rb') as dataset_file:
        dataset = pickle.load(dataset_file)
    model = EntityLSTM(dataset, parameters)
    with tf.Session() as sess:
        model_saver = tf.train.Saver()  # defaults to saving all variables

        # Restore the pretrained model
        model_saver.restore(sess, input_checkpoint_filepath) # Works only when the dimensions of tensor variables are matched.

        # Get pretrained embeddings
        token_embedding_weights = sess.run(model.token_embedding_weights)

        # Resize the token embedding variable to [1, token_embedding_dimension]
        utils_tf.resize_tensor_variable(sess, model.token_embedding_weights,
            [1, parameters['token_embedding_dimension']])

        # Copy the trained UNK vector into the resized variable.
        # NOTE(review): indexing the resized weights with UNK_TOKEN_INDEX
        # presumably relies on that index being 0 - confirm.
        initial_weights = sess.run(model.token_embedding_weights)
        initial_weights[dataset.UNK_TOKEN_INDEX] = token_embedding_weights[dataset.UNK_TOKEN_INDEX]
        sess.run(tf.assign(model.token_embedding_weights, initial_weights, validate_shape=False))

        token_embedding_weights = sess.run(model.token_embedding_weights)
        print("token_embedding_weights: {0}".format(token_embedding_weights))

        model_saver.save(sess, output_checkpoint_filepath)

    dataset.__dict__['vocabulary_size'] = 1
    # Close the handle explicitly so the rewritten pickle is flushed to disk.
    with open(dataset_filepath, 'wb') as dataset_file:
        pickle.dump(dataset, dataset_file)
    pprint(dataset.__dict__)


def prepare_pretrained_model_for_restoring(output_folder_name, epoch_number, 
    model_name, delete_token_mappings=False):
    '''
    Export one training run as a reusable model folder.

    Reads from ./output/<output_folder_name>/model and writes to
    ./trained_models/<model_name> (created if missing):

    * dataset.pickle is passed through trim_dataset_pickle, which strips the
      training data and labels from it.
    * parameters.ini is copied unchanged.
    * The checkpoint of the given epoch (epoch number zero-padded to five
      digits) is exported as model.ckpt*.

    With delete_token_mappings = False the checkpoint files are copied as-is,
    so the token list and the embeddings learned from the dataset remain in
    the exported model.

    With delete_token_mappings = True the token mappings are removed from
    dataset.pickle and the checkpoint goes through trim_model_checkpoint, so
    the exported model keeps no information about the training dataset. When
    sharing such a model, it is highly recommended to train with external
    pre-trained token embeddings and keep them frozen
    (token_pretrained_embedding_filepath and freeze_token_embeddings = True in
    parameters.ini) to obtain high performance.
    '''
    source_dir = os.path.join('.', 'output', output_folder_name, 'model')
    target_dir = os.path.join('.', 'trained_models', model_name)
    utils.create_folder_if_not_exists(target_dir)

    # dataset.pickle: trimmed copy
    trimmed_dataset_path = os.path.join(target_dir, 'dataset.pickle')
    trim_dataset_pickle(os.path.join(source_dir, 'dataset.pickle'), trimmed_dataset_path,
        delete_token_mappings=delete_token_mappings)

    # parameters.ini: plain copy
    parameters_path = os.path.join(source_dir, 'parameters.ini')
    shutil.copy(parameters_path, target_dir)

    # checkpoint: plain copy with the epoch suffix dropped, or trimmed export
    epoch_tag = str(epoch_number).zfill(5)
    checkpoint_stem = 'model_{0}.ckpt'.format(epoch_tag)
    if not delete_token_mappings:
        for checkpoint_part in glob.glob(os.path.join(source_dir, checkpoint_stem + '*')):
            exported_name = os.path.basename(checkpoint_part).replace('_' + epoch_tag, '')
            shutil.copyfile(checkpoint_part, os.path.join(target_dir, exported_name))
        return

    trim_model_checkpoint(parameters_path, trimmed_dataset_path,
        os.path.join(source_dir, checkpoint_stem),
        os.path.join(target_dir, 'model.ckpt'))

 
def check_contents_of_dataset_and_model_checkpoint(model_folder):
    '''
    Check the contents of dataset.pickle and model.ckpt.
    model_folder: folder containing dataset.pickle and model.ckpt to be checked.

    Prints the dataset attributes, their names, and the tensors stored in the
    checkpoint. Returns None.
    '''
    dataset_filepath = os.path.join(model_folder, 'dataset.pickle')
    # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
    # The context manager closes the handle, which the bare open() call did not.
    with open(dataset_filepath, 'rb') as dataset_file:
        dataset = pickle.load(dataset_file)
    pprint(dataset.__dict__)
    pprint(list(dataset.__dict__.keys()))

    checkpoint_filepath = os.path.join(model_folder, 'model.ckpt')
    with tf.Session():
        # First call lists every tensor in the checkpoint, second only the
        # token embedding weights.
        print_tensors_in_checkpoint_file(checkpoint_filepath,
            tensor_name='token_embedding/token_embedding_weights', all_tensors=True)
        print_tensors_in_checkpoint_file(checkpoint_filepath,
            tensor_name='token_embedding/token_embedding_weights', all_tensors=False)


# Export settings - edit these before running the cell.
if __name__ == '__main__':
    # Run folder under ./output whose 'model' subfolder holds the checkpoints.
    output_folder_name = '_2022-03-12_16-42-57-891711'
    # Epoch whose checkpoint (model_00007.ckpt*) is exported.
    epoch_number = 7
    # Name of the destination folder under ./trained_models.
    model_name = 'default_glove'
    # False keeps the token mappings and learned token embeddings in the export.
    delete_token_mappings = False
    prepare_pretrained_model_for_restoring(output_folder_name, epoch_number, 
        model_name, delete_token_mappings)

In [None]:
!zip -r ./trained_model.zip /content/trained_models/
from google.colab import files
files.download("./trained_model.zip")

  adding: content/trained_models/ (stored 0%)
  adding: content/trained_models/default_glove/ (stored 0%)
  adding: content/trained_models/default_glove/model.ckpt.index (deflated 53%)
  adding: content/trained_models/default_glove/parameters.ini (stored 0%)
  adding: content/trained_models/default_glove/model.ckpt.data-00000-of-00001 (deflated 32%)
  adding: content/trained_models/default_glove/dataset.pickle (deflated 52%)
  adding: content/trained_models/default_glove/model.ckpt.meta (deflated 40%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [14]:
# Generate result table: one row per sentence of the prediction file, with the
# sentence text, the gold aspect terms and the predicted aspect terms.
import re

test_predict_filename = 'test.txt'
train_predict_filename = 'train.txt'
model_num = '001_'
run_num = '_2022-04-06_08-57-15-219265'


def _aspect_string(marked_tokens):
  """Join tokens and ',' placeholders into a comma-separated list of terms.

  Runs of placeholders collapse to a single ',' and leading/trailing
  separators are stripped.
  """
  joined = ' '.join(marked_tokens)
  joined = re.sub(r"(, ?)+", ',', joined)
  return re.sub(r"^, ?| ?, ?$", "", joined)


def _sentence_record(tokens, truth, prediction):
  """Build the result row for one finished sentence."""
  return {'case': ' '.join(tokens),
          'truth': _aspect_string(truth),
          'prediction': _aspect_string(prediction)}


# Collect plain dicts and build the frame once: DataFrame.append in a loop
# copies the whole frame on every row and is no longer available in pandas 2.x.
result_rows = []
c_tokens = []
c_truth = []
c_prediction = []

with open('/content/output/' + run_num + '/' + model_num + test_predict_filename, 'r') as output_file:
  for line in output_file:
    values = line.split()
    if not values:
      # A blank line ends the current sentence.
      result_rows.append(_sentence_record(c_tokens, c_truth, c_prediction))
      c_tokens = []
      c_truth = []
      c_prediction = []
      continue
    # First field is the token; the last two fields are read as the gold tag
    # and the predicted tag.
    c_tokens.append(values[0])
    # Tokens tagged 'O' become ',' placeholders so separate terms stay apart.
    c_truth.append(values[0] if values[-2][0] != 'O' else ',')
    c_prediction.append(values[0] if values[-1][0] != 'O' else ',')

# Do not lose the last sentence when the file has no trailing blank line.
if c_tokens:
  result_rows.append(_sentence_record(c_tokens, c_truth, c_prediction))
  c_tokens = []
  c_truth = []
  c_prediction = []

result_df = pd.DataFrame(result_rows, columns=['case', 'truth', 'prediction'])



In [15]:
result_df

Unnamed: 0,case,truth,prediction
0,The bread is top notch as well .,bread,bread
1,I have to say they have one of the fastest del...,delivery times,delivery
2,Food is always fresh and hot- ready to eat !,Food,Food
3,Did I mention that the coffee is OUTSTANDING ?,coffee,coffee
4,"Certainly not the best sushi in New York , how...","sushi ,place",sushi
...,...,...,...
795,"Anyway , the owner was fake .",owner,owner
796,Owner is pleasant and entertaining .,Owner,
797,I have never in my life sent back food before ...,"food ,waiter","food ,waiter"
798,"Although the restaurant itself is nice , I pre...",food,food


In [16]:
result_df.to_csv('test_result_labels.csv')

# No embedding, b-LSTM + CRF

In [None]:
!pip install git+https://www.github.com/keras-team/keras-contrib.git

Collecting git+https://www.github.com/keras-team/keras-contrib.git
  Cloning https://www.github.com/keras-team/keras-contrib.git to /tmp/pip-req-build-gmd9n0n_
  Running command git clone -q https://www.github.com/keras-team/keras-contrib.git /tmp/pip-req-build-gmd9n0n_


In [None]:
!pip install -U tensorflow-addons

Collecting tensorflow-addons
  Downloading tensorflow_addons-0.16.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[?25l[K     |▎                               | 10 kB 19.4 MB/s eta 0:00:01[K     |▋                               | 20 kB 23.6 MB/s eta 0:00:01[K     |▉                               | 30 kB 12.7 MB/s eta 0:00:01[K     |█▏                              | 40 kB 14.0 MB/s eta 0:00:01[K     |█▌                              | 51 kB 9.7 MB/s eta 0:00:01[K     |█▊                              | 61 kB 11.2 MB/s eta 0:00:01[K     |██                              | 71 kB 11.6 MB/s eta 0:00:01[K     |██▍                             | 81 kB 10.5 MB/s eta 0:00:01[K     |██▋                             | 92 kB 11.5 MB/s eta 0:00:01[K     |███                             | 102 kB 11.9 MB/s eta 0:00:01[K     |███▏                            | 112 kB 11.9 MB/s eta 0:00:01[K     |███▌                            | 122 kB 11.9 MB/s eta 0:00:01[K 

In [None]:
import pandas as pd
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF

Using TensorFlow backend.


In [None]:
# train_file = '/content/drive/MyDrive/ABSA_data/Restaurants_Train.xml'
# test_file = '/content/drive/MyDrive/ABSA_data/Restaurants_Test.xml'

train_file = '/content/drive/MyDrive/ABSA_data/Laptops_Train.xml'
test_file = '/content/drive/MyDrive/ABSA_data/Laptops_Test.xml'

In [None]:
def xml_to_dataframe(filepath):
  """Load an ABSA XML file into a DataFrame with one row per sentence.

  The root element is expected to contain sentence elements, each carrying an
  ``id`` attribute, a ``<text>`` child and, optionally, an ``<aspectTerms>``
  child whose children hold the aspect-term attributes.

  Args:
    filepath: Path of the XML file (anything ``ET.parse`` accepts).

  Returns:
    pandas.DataFrame with columns ``id``, ``text`` and ``aspectTerms``, where
    ``aspectTerms`` is a list of attribute dicts (empty when the sentence has
    no ``<aspectTerms>`` element).

  Raises:
    KeyError: a sentence element has no ``id`` attribute.
    IndexError: a sentence element has no ``<text>`` descendant.
  """
  # Give ElementTree the path itself so the parser reads bytes and honours the
  # encoding declared in the XML, instead of the locale default that a
  # text-mode open() would apply.
  sentences = ET.parse(filepath).getroot()

  data = []
  for sent in sentences:
    # Look the container up once; compare with None because an element with
    # no children is falsy.
    terms_node = sent.find(".//aspectTerms")
    data.append({
      "id": sent.attrib['id'],
      "text": sent.findall(".//text")[0].text,
      "aspectTerms": [term.attrib for term in terms_node] if terms_node is not None else [],
    })
  return pd.DataFrame(data)

In [None]:
train_df = xml_to_dataframe(train_file)

In [None]:
nlp = spacy.load("en_core_web_sm")

In [None]:
# Tokenise every training sentence with spaCy; each entry of tokens_list is the
# list of Token objects of one sentence.
tokens_list = [list(nlp(row['text'])) for _, row in train_df.iterrows()]

In [None]:
flat_list = [item.text for sublist in tokens_list for item in sublist]
vocab = set(flat_list)
# Index 0 is kept free of vocabulary words: it stands for unknown words
# (word2idx.get(..., 0)) and is also the padding value.
# Enumerate the vocabulary in sorted order. Iteration order of a set of strings
# changes between interpreter runs (hash randomisation), which would give every
# kernel a different word -> index mapping for the same data.
word2idx = {w: i + 1 for i, w in enumerate(sorted(vocab))}
tag2idx = {"PAD":0, "O":1, "I":2, "B":3}

In [None]:
def get_idx_iob(idx, range_list):
  """Return the IOB2 label for the position ``idx``.

  ``range_list`` holds (start, end) pairs with an exclusive end. The first pair
  that contains ``idx`` decides the label: 'B' when ``idx`` equals the start,
  'I' otherwise. 'O' is returned when no pair contains ``idx``.
  """
  for span in range_list:
    if not (span[0] <= idx < span[1]):
      continue
    return 'B' if span[0] == idx else 'I'
  return 'O'

def get_train_inputs(xml_df):
  """Encode every sentence of ``xml_df`` as word indices and tag indices.

  Each row's ``text`` is tokenised with the module-level spaCy pipeline. Tokens
  are mapped through ``word2idx`` (0 for words not in the mapping) and labelled
  through ``get_idx_iob`` / ``tag2idx`` using the row's aspect-term ranges.

  Returns:
    (word_index_lists, tag_index_lists), two lists with one entry per row.
  """
  encoded_sentences, encoded_tags = [], []

  for _, row in xml_df.iterrows():
    # (from, to) character offsets of every aspect term in this sentence
    spans = [(int(term['from']), int(term['to'])) for term in row['aspectTerms']]
    doc = nlp(row['text'])
    encoded_sentences.append([word2idx.get(tok.text,0) for tok in doc])
    encoded_tags.append([tag2idx[get_idx_iob(tok.idx, spans)] for tok in doc])
  return encoded_sentences, encoded_tags

In [None]:
train_x, train_y = get_train_inputs(train_df)

In [None]:
# Pad to the length of the longest training sentence.
max_len = max(map(len, train_x))

# Pad words and tags at the end with index 0.
train_x_pad = np.array(pad_sequences(sequences=train_x, maxlen=max_len, padding="post", value=0))
train_y_pad = pad_sequences(sequences=train_y, maxlen=max_len, padding="post", value=0)

# One-hot encode the four tag indices of every sentence.
train_y_onehot = np.array(
    [to_categorical(tag_row, num_classes=4) for tag_row in train_y_pad]
).astype(int)

In [None]:
from keras import backend as K
def get_f1(y_true, y_pred): #taken from old keras source code
    """Return F1 computed with Keras backend ops.

    Values are clipped to [0, 1] and rounded before counting, and the backend
    epsilon is added to each denominator.
    """
    def _rounded_total(tensor):
        # Clip to [0, 1], round, then sum over all elements.
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    eps = K.epsilon()
    hits = _rounded_total(y_true * y_pred)
    actual = _rounded_total(y_true)
    predicted = _rounded_total(y_pred)
    precision = hits / (predicted + eps)
    recall = hits / (actual + eps)
    return 2*(precision*recall)/(precision+recall+eps)

In [None]:
# Model: embedding -> BiLSTM -> per-token dense layer -> CRF.
# The intermediate tensors get their own names so that the built-in `input`
# is not shadowed for the rest of the session and `model` only ever refers to
# the finished Keras Model.
token_ids = Input(shape=(max_len,))
embedded = Embedding(input_dim=len(word2idx) + 1, output_dim=20,
                     input_length=max_len, mask_zero=False)(token_ids)  # 20-dim embedding
encoded = Bidirectional(LSTM(units=50, return_sequences=True,
                             recurrent_dropout=0.1))(embedded)  # variational biLSTM
projected = TimeDistributed(Dense(50, activation="tanh"))(encoded)  # a dense layer as suggested by neuralNer
crf = CRF(4)  # CRF layer, one output per entry of tag2idx
out = crf(projected)  # output

model = Model(token_ids, out)
model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
model.summary()

Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 84)                0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 84, 20)            104500    
_________________________________________________________________
bidirectional_7 (Bidirection (None, 84, 100)           28400     
_________________________________________________________________
time_distributed_7 (TimeDist (None, 84, 50)            5050      
_________________________________________________________________
crf_7 (CRF)                  (None, 84, 4)             228       
Total params: 138,178
Trainable params: 138,178
Non-trainable params: 0
_________________________________________________________________


In [None]:
history = model.fit(train_x_pad, train_y_onehot, batch_size=32, epochs=8,
                    validation_split=0.1, verbose=1)

Train on 2743 samples, validate on 305 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [None]:
# Encode the test set with the vocabulary built from the training data; words
# missing from word2idx map to 0, the same index that is used for padding.
# NOTE(review): `text_y` looks like a typo for `test_y`; it is only used within this cell.
test_df = xml_to_dataframe(test_file)
test_x, text_y = get_train_inputs(test_df)

# Pad to the training max_len so the inputs match the model's input shape.
test_x_pad = pad_sequences(maxlen=max_len, sequences=test_x, padding="post", value=0)
test_y_pad = pad_sequences(maxlen=max_len, sequences=text_y, padding="post", value=0)

In [None]:
test_predict = model.predict(test_x_pad)
p_all_label= np.argmax(test_predict, axis=-1) 

In [None]:
# Reverse lookups: index -> word and index -> tag.
idx2word = {idx: word for word, idx in word2idx.items()}
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

# Gold tags with the padding positions (index 0) removed.
true_all_tags = [[idx2tag[idx] for idx in s if idx!=0] for s in test_y_pad]

# Predicted tags cut to the gold length of each sentence; a predicted 'PAD'
# inside the sentence counts as 'O'.
p_all_tags = [
    [idx2tag[idx].replace('PAD', 'O') for idx in predicted[:len(gold)]]
    for gold, predicted in zip(true_all_tags, p_all_label)
]

In [None]:
# Evaluation
# NOTE(review): the `!pip install seqeval` cell sits below this one in the
# notebook, so on a fresh kernel this import fails unless that cell is run first.
from seqeval.metrics import f1_score, classification_report
# Overall F1 followed by the precision/recall/F1 table.
print(f1_score(true_all_tags, p_all_tags))
print(classification_report(true_all_tags, p_all_tags))

0.38716356107660455
              precision    recall  f1-score   support

           _       0.60      0.29      0.39       653

   micro avg       0.60      0.29      0.39       653
   macro avg       0.60      0.29      0.39       653
weighted avg       0.60      0.29      0.39       653



In [None]:
!pip install seqeval