# NLP - Homework 2 - task a

## Setup

In [None]:
# install lightning
!pip install pytorch_lightning &> /dev/null
!pip install transformers



In [None]:
# here go all the imports
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pytorch_lightning as pl


import nltk

from pprint import pprint
from typing import *
from string import punctuation
import re

import transformers
from transformers import BertTokenizer, BertConfig

nltk.download('punkt')

torch.__version__, torch.cuda.get_device_name(0), transformers.__version__

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


('1.8.1+cu101', 'Tesla T4', '4.6.1')

## Allow reproducibility

In [None]:
# Single seed shared by every source of randomness used in the notebook.
SEED = 1234

# sets seeds for numpy, torch, python.random and PYTHONHASHSEED.
pl.seed_everything(SEED)

# Ask cuDNN to select deterministic algorithms.
# NOTE(review): this flag only concerns cuDNN; other CUDA kernels may still be
# non-deterministic -- confirm whether stricter settings are wanted.
torch.backends.cudnn.deterministic = True

Global seed set to 1234


## Importing data

In [None]:
# Create the data folder (no error if it already exists, so the cell can be re-run) and download the dataset.
! mkdir -p data
! wget -O data/laptops_train.json https://raw.githubusercontent.com/SapienzaNLP/nlp2021-hw2/master/data/laptops_train.json
! wget -O data/laptops_dev.json https://raw.githubusercontent.com/SapienzaNLP/nlp2021-hw2/master/data/laptops_dev.json
! wget -O data/restaurants_train.json https://raw.githubusercontent.com/SapienzaNLP/nlp2021-hw2/master/data/restaurants_train.json
! wget -O data/restaurants_dev.json https://raw.githubusercontent.com/SapienzaNLP/nlp2021-hw2/master/data/restaurants_dev.json

mkdir: cannot create directory ‘data’: File exists
--2021-06-11 11:16:36--  https://raw.githubusercontent.com/SapienzaNLP/nlp2021-hw2/master/data/laptops_train.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 668974 (653K) [text/plain]
Saving to: ‘data/laptops_train.json’


2021-06-11 11:16:36 (21.9 MB/s) - ‘data/laptops_train.json’ saved [668974/668974]

--2021-06-11 11:16:36--  https://raw.githubusercontent.com/SapienzaNLP/nlp2021-hw2/master/data/laptops_dev.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 148196 (145K) 

In [None]:
import pandas as pd
import os
import json

# Folder the JSON files were downloaded into.
root_folder: str = r"./data" # to save dataset

laptops_train_filename = "laptops_train.json"
laptops_valid_filename = "laptops_dev.json"
restaurants_train_filename = "restaurants_train.json"
restaurants_valid_filename = "restaurants_dev.json"

# Each file is read as a list of records (one row per sentence).
laptops_train_dataframe = pd.read_json(os.sep.join([root_folder, laptops_train_filename]), orient='records')
laptops_valid_dataframe = pd.read_json(os.sep.join([root_folder, laptops_valid_filename]), orient='records')
restaurants_train_dataframe = pd.read_json(os.sep.join([root_folder, restaurants_train_filename]), orient='records')
restaurants_valid_dataframe = pd.read_json(os.sep.join([root_folder, restaurants_valid_filename]), orient='records')

# The restaurants files carry an extra 'categories' column; it is dropped so
# both domains share the same columns before being concatenated.
restaurants_train_dataframe.drop('categories', inplace=True, axis=1)
restaurants_valid_dataframe.drop('categories', inplace=True, axis=1)

# Laptops and restaurants are merged into a single train and a single validation frame.
train_dataframe = pd.concat([laptops_train_dataframe, restaurants_train_dataframe], ignore_index=True)
valid_dataframe = pd.concat([laptops_valid_dataframe, restaurants_valid_dataframe], ignore_index=True)

# Drop the references to the per-domain frames once they have been merged.
laptops_train_dataframe = None
laptops_valid_dataframe = None
restaurants_train_dataframe = None
restaurants_valid_dataframe = None

# The validation frame size is reused as the test size by the data module defaults.
VALID_SIZE = len(valid_dataframe)
TEST_SIZE = VALID_SIZE

train_dataframe

Unnamed: 0,targets,text
0,"[[[22, 31], hard disk, neutral]]",I always use a backup hard disk to store impor...
1,"[[[34, 38], size, positive]]","I also love the small, convenient size of my l..."
2,"[[[76, 84], mousepad, neutral]]",I thought the white Mac computers looked dirty...
3,"[[[40, 48], responds, positive]]","It is always reliable, never bugged and respon..."
4,"[[[55, 63], keyboard, positive], [[73, 78], sp...",The real stand out on this computer is the fee...
...,...,...
4995,[],I actually gave Patroon another chance before ...
4996,"[[[77, 84], service, positive], [[123, 128], p...",Although they do the typical what kind of wate...
4997,[],"That said, I thought Scalini Fedeli was one of..."
4998,"[[[27, 34], waiters, positive], [[125, 131], d...",Have always found that the waiters will go out...


In [None]:
valid_dataframe

Unnamed: 0,targets,text
0,[],It was over rated!
1,"[[[55, 75], adding the bluetooth, negative]]",But Sony said we could send it back and be cha...
2,"[[[4, 21], Windows 7 Starter, positive]]","The Windows 7 Starter is, in my opinion, a gre..."
3,"[[[4, 14], powerpoint, positive]]",The powerpoint opened seamlessly in the apple ...
4,"[[[74, 80], screen, positive]]","I chose the iBookG4, a laptop that is an attra..."
...,...,...
1081,[],"(or sister, in my case!)."
1082,"[[[0, 5], Staff, positive]]",Staff is very accomodating.
1083,"[[[26, 40], yellowfun tuna, positive], [[51, 6...",I particularly love their yellowfun tuna and t...
1084,[],Enjoy!


In [None]:
def spans_for_test(txt: str) -> List[Tuple[int, int]]:
    """Return the character spans of the word-level tokens found in ``txt``.

    The text is tokenized with NLTK, tokens that are pure punctuation are
    discarded, leading/trailing punctuation is trimmed from longer tokens and
    each token is further split on punctuation separators. Double quotes,
    dots and apostrophes are deliberately *not* separators.

    Args:
        txt: the raw sentence.

    Returns:
        A list of ``(start, end)`` character offsets into ``txt``, in reading
        order. Pieces that cannot be located in ``txt`` are skipped.
    """
    # The separators are escaped before being placed in the character class:
    # unescaped, the sequence ",-/" is read as a range that also contains ".",
    # which silently undoes the removal of "." below.
    separators = punctuation.replace('"', '').replace('.', '').replace("'", '')
    split_pattern = re.compile("[" + re.escape(separators) + "]+")

    tokens = nltk.word_tokenize(txt)
    tokens = filter(lambda token: preprocess_term_for_test(token) != '', tokens)
    offset = 0
    spans_list = list()

    for token in tokens:
        # NLTK rewrites double quotes as `` and ''; map them back so that the
        # token can be found in the original text.
        if token == '``' or token == "''":
            token = '"'
        elif token[-1] in punctuation and len(token) > 2:
            token = token[:-1]
        if token[0] in punctuation and len(token) > 2:
            token = token[1:]

        for piece in split_pattern.split(token):
            if piece == '':
                continue
            position = txt.find(piece, offset)
            if position == -1:
                # The piece does not occur after the current offset (e.g. the
                # tokenizer rewrote it). Recording (-1, ...) would rewind the
                # offset and corrupt every following span, so skip it.
                continue
            spans_list.append((position, position + len(piece)))
            offset = position + len(piece)

    return spans_list



def preprocess_term_for_test(term: str) -> str:
    """Strip punctuation from a single tokenized term.

    Every ASCII punctuation character except the backtick is removed, as are
    the curly double quotes. The backtick is kept so that the NLTK opening
    quote token (two backticks) is not reduced to an empty string.

    Args:
        term: one token.

    Returns:
        The term without the removed characters (possibly empty).
    """
    removable = set(punctuation.replace('`', '')) | set('“”')
    return ''.join(char for char in term if char not in removable)

def preprocess_text_for_test(text: str, start_indexes: List[int], end_indexes: List[int]) -> List[Dict[str, Union[str, Tuple[int, int]]]]:
    """Tokenize ``text`` and tag every token as aspect term ('AT') or other ('O').

    Args:
        text: the raw sentence.
        start_indexes: start character offsets of the gold aspect terms.
        end_indexes: end character offsets of the gold aspect terms.

    Returns:
        One dict per token with keys 'token' (the text slice), 'ne_label'
        ('AT' when the token span lies inside a gold span, 'O' otherwise) and
        'indexes' (the ``(start, end)`` character span of the token).
    """
    # sorted() works on copies: the caller's lists are left untouched.
    AT_indexes = list(zip(sorted(start_indexes), sorted(end_indexes)))

    def label_for(start: int, end: int) -> str:
        # A token is an aspect term when it is fully contained in a gold span.
        inside_gold_span = any(start >= gold_start and end <= gold_end
                               for gold_start, gold_end in AT_indexes)
        return 'AT' if inside_gold_span else 'O'

    return [{'token': text[start:end],
             'ne_label': label_for(start, end),
             'indexes': (start, end)}
            for start, end in spans_for_test(text)]

In [None]:
# Training-time counterparts of the functions above: since the positions of the gold aspect terms are known, the text is split around them (aspect terms are split on spaces, the remaining text is tokenized with NLTK).

def spans_for_train(txt: str) -> List[Tuple[int, int]]:
    """Return the character spans of the NLTK tokens of ``txt``.

    Tokens made only of punctuation are discarded.

    Args:
        txt: the text fragment to tokenize.

    Returns:
        A list of ``(start, end)`` character offsets into ``txt``, in reading
        order. Tokens that cannot be located in ``txt`` are skipped.
    """
    spans_list = list()
    offset = 0
    for token in nltk.word_tokenize(txt):
        if preprocess_term_for_train(token) == '':
            continue
        position = txt.find(token, offset)
        if position == -1:
            # Same guard as spans_for_test: a (-1, ...) span would rewind the
            # offset and yield an empty token downstream.
            continue
        spans_list.append((position, position + len(token)))
        offset = position + len(token)
    return spans_list

def preprocess_term_for_train(term: str) -> str:
    """Strip every ASCII punctuation character and curly double quote from ``term``.

    Args:
        term: one token.

    Returns:
        The term without the removed characters (possibly empty).
    """
    removable = set(punctuation) | set('“”')
    return ''.join(char for char in term if char not in removable)

def preprocess_text_for_train(text: str, start_indexes: List[int], end_indexes: List[int]) -> List[Dict[str, Union[str, Tuple[int, int]]]]:
    """Tokenize ``text`` around the known aspect-term spans and tag every token.

    The text outside the aspect terms is tokenized with ``spans_for_train``
    and tagged 'O'; every aspect term is split on single spaces, cleaned with
    ``preprocess_term_for_train`` and tagged 'AT'.

    Args:
        text: the raw sentence.
        start_indexes: start character offsets of the gold aspect terms.
        end_indexes: end character offsets of the gold aspect terms.

    Returns:
        One dict per token, in reading order, with keys 'token', 'ne_label'
        ('AT' or 'O') and 'indexes' (``(start, end)`` character span in
        ``text``).
    """
    def outside_tokens(segment_start: int, segment_end: int) -> List[Dict[str, Union[str, Tuple[int, int]]]]:
        # Tokenize text[segment_start:segment_end] and shift the spans back to
        # full-text coordinates.
        return [{'token': text[segment_start + start: segment_start + end],
                 'ne_label': 'O',
                 'indexes': (segment_start + start, segment_start + end)}
                for start, end in spans_for_train(text[segment_start:segment_end])]

    # Work on sorted copies so that the caller's lists are never modified.
    starts = sorted(start_indexes)
    # boundaries[k] is where the k-th non-aspect segment begins: the start of
    # the text, then the end of every aspect term.
    boundaries = sorted([0] + list(end_indexes))

    preprocessed_text = []
    for k, term_start in enumerate(starts):
        term_end = boundaries[k + 1]
        preprocessed_text += outside_tokens(boundaries[k], term_start)

        # cursor tracks the character offset of the current space-separated word.
        cursor = term_start
        for term in text[term_start:term_end].split(' '):
            preprocessed_term = preprocess_term_for_train(term)
            if preprocessed_term != '':
                preprocessed_text.append({'token': preprocessed_term,
                                          'ne_label': 'AT',
                                          'indexes': (cursor, cursor + len(term))})
            cursor += len(term) + 1  # +1 for the space consumed by split(' ')

    # Remaining text after the last aspect term (the whole text when there is none).
    preprocessed_text += outside_tokens(boundaries[-1], len(text))

    return preprocessed_text

In [None]:
# Checkpoint shared by the tokenizer and, through the hyperparameters, the model.
BERT_MODEL = 'bert-large-cased' # Since we have to do NE recognition, I chose the cased model ---> do_lower_case=False
# Module-level tokenizer used by the dataset encoding and by the evaluation helpers.
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=False, do_basic_tokenize=False) # I have already done the tokenization with the preprocessing function ---> do_basic_tokenize=False

class ABSA_A_Dataset(Dataset):
    """In-memory dataset for aspect-term identification.

    Every sample is first preprocessed into a list of tagged tokens
    (``__init__``) and then, on request, encoded into word-piece ids, label
    ids and word-to-word-piece spans (``index_dataset``).
    """

    # static variables
    tag_values = ['O', 'AT', 'PAD', 'X']
    tag2idx = {t: i for i, t in enumerate(tag_values)}
    padding_tag_index = tag2idx['PAD']
    x_tag_index = tag2idx['X'] # X tags will not be considered at evaluation 
    o_tag_index = tag2idx['O']


    def __init__(self, 
                 targets: pd.Series,
                 text: pd.Series,
                 preprocess_text: Callable[[str, List[int], List[int]], List[Dict[str, Union[str, Tuple[int, int]]]]]=preprocess_text_for_train):
        """
        We assume that the dataset can fit in memory.
        Args:
            targets (pd.Series): Pandas Dataframe column containing the targets.
                Each target is read as ``[[start, end], term, ...]``.
            text (pd.Series): Pandas Dataframe column containing the text.
            preprocess_text (Callable): function that is used in order to preprocess the input texts
        """

        self.data = []
        for row_targets, row_text in zip(targets, text):
            start_indexes, end_indexes, true_aspect_terms = [], [], []
            for target in row_targets:
                indexes = target[0]
                start_indexes.append(indexes[0])
                end_indexes.append(indexes[1])
                true_aspect_terms.append(target[1])
            preprocessed_text = preprocess_text(row_text, start_indexes, end_indexes)
            self.data.append({"text": row_text,
                              "preprocessed_text": preprocessed_text,
                              "true_aspect_terms": true_aspect_terms})
        # Filled by index_dataset(); None means "not encoded yet".
        self.encoded_data = None
    
    def index_dataset(self):
        """Encode every preprocessed sample into tensors and store them in ``encoded_data``."""
        self.encoded_data = list()
        for data_i in self.data:
            # for each sentence
            elem = data_i["preprocessed_text"]
            encoded_elem, encoded_labels, word_span_indexes = self.encode_text_and_labels(elem)
            indexes_list = [w["indexes"] for w in elem]

            self.encoded_data.append({"inputs": encoded_elem, 
                                      "outputs": encoded_labels,
                                      "text": data_i["text"],
                                      "indexes": indexes_list,
                                      "true_aspect_terms": data_i["true_aspect_terms"],
                                      "word_span_indexes": word_span_indexes})

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if self.encoded_data is None:
            raise RuntimeError("""Trying to retrieve elements but index_dataset
            has not been invoked yet! Be sure to invoce index_dataset on this object
            before trying to retrieve elements. In case you want to retrieve raw
            elements, use the method get_raw_element(idx)""")
        return self.encoded_data[idx]
    
    def get_raw_element(self, idx):
        """Return the preprocessed (not yet encoded) sample at position ``idx``."""
        return self.data[idx]

    @staticmethod
    def encode_text_and_labels(sentence:list):
        """
        Args:
            sentence (list): list of Dicts, each carrying the information about
            one token ('token' and 'ne_label' are read here).
        Return:
            Three LongTensors: the word-piece ids (wrapped in [CLS] ... [SEP]),
            the label ids, and one ``(start, end)`` word-piece span per word
            (including [CLS] and [SEP]).

            The label tensor holds one label per word ([CLS] and [SEP] are
            labelled 'O'), followed by one 'X' label for every additional
            word piece, so that it has the same length as the id tensor.
        """
        words_pieces_list, label_indexes, word_span_indexes = ['[CLS]'], [ABSA_A_Dataset.o_tag_index], [(0, 1)]
        words_list, labels_list = [w["token"] for w in sentence], [ABSA_A_Dataset.tag2idx.get(w['ne_label']) for w in sentence]
        i = 1  # index of the next word piece; 0 is [CLS]
        n_x_tags_to_add = 0
        
        for word, label in zip(words_list, labels_list):
          
            tokens = tokenizer.tokenize(word)
            if not tokens:
                # A word that produces no word piece would get the empty span
                # (i, i); averaging over it downstream yields NaN. Represent
                # such a word with the tokenizer's unknown token instead.
                tokens = [tokenizer.unk_token]
            n_word_pieces = len(tokens)
          
            words_pieces_list.extend(tokens)
          
            label_indexes.append(label)
            n_x_tags_to_add += (n_word_pieces - 1)

            word_span_indexes.append((i, i + n_word_pieces))

            i += n_word_pieces
        
        words_pieces_list.append('[SEP]')
        label_indexes.append(ABSA_A_Dataset.o_tag_index)
        word_span_indexes.append((i, i + 1))
        # X labels are appended after the word-level labels, not interleaved.
        label_indexes.extend(n_x_tags_to_add * [ABSA_A_Dataset.x_tag_index])

        return torch.LongTensor(tokenizer.convert_tokens_to_ids(words_pieces_list)), torch.LongTensor(label_indexes), torch.LongTensor(word_span_indexes)
    
    @staticmethod
    def decode_output(max_indices:List[List[int]]):
        """
        Args:
            max_indices: a List where the i-th entry is a List containing the
            indexes preds for the i-th sample
        Output:
            The method returns a list of batch_size length where each element is a list
            of labels, one for each input token.
        """
        return [[ABSA_A_Dataset.tag_values[i] for i in indices] for indices in max_indices]

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




In [None]:
def test_dataset_class():
    """Print the tagged tokens of the first ten training samples as a visual check."""
    dataset = ABSA_A_Dataset(train_dataframe['targets'], train_dataframe['text'])

    print('Dataset test:')
    for sample_idx in range(10):
        tagged_tokens = dataset.get_raw_element(sample_idx)['preprocessed_text']
        rendered = [entry["token"] + ":" + entry["ne_label"] for entry in tagged_tokens]
        print('  sample {}: {}'.format(sample_idx, rendered))

test_dataset_class()

Dataset test:
  sample 0: ['I:O', 'always:O', 'use:O', 'a:O', 'backup:O', 'hard:AT', 'disk:AT', 'to:O', 'store:O', 'important:O', 'files:O', 'at:O', 'all:O', 'times:O']
  sample 1: ['I:O', 'also:O', 'love:O', 'the:O', 'small:O', 'convenient:O', 'size:AT', 'of:O', 'my:O', 'laptop:O', 'making:O', 'it:O', 'a:O', 'perfect:O', 'tool:O', 'for:O', 'my:O', 'academic:O', 'studies:O']
  sample 2: ['I:O', 'thought:O', 'the:O', 'white:O', 'Mac:O', 'computers:O', 'looked:O', 'dirty:O', 'too:O', 'quicly:O', 'where:O', 'you:O', 'use:O', 'the:O', 'mousepad:AT', 'and:O', 'where:O', 'you:O', 'place:O', 'your:O', 'hands:O', 'when:O', 'typing:O']
  sample 3: ['It:O', 'is:O', 'always:O', 'reliable:O', 'never:O', 'bugged:O', 'and:O', 'responds:AT', 'well:O']
  sample 4: ['The:O', 'real:O', 'stand:O', 'out:O', 'on:O', 'this:O', 'computer:O', 'is:O', 'the:O', 'feel:O', 'of:O', 'the:O', 'keyboard:AT', 'and:O', 'it:O', "'s:O", 'speed:AT']
  sample 5: ['-Called:O', 'headquarters:O', 'again:O', 'they:O', 'report:

In [None]:
# Build and encode the training set once; it is only used by the DataLoader
# demo cell further down.
dataset = ABSA_A_Dataset(train_dataframe['targets'], train_dataframe['text'])
dataset.index_dataset()

In [None]:
def rnn_collate_fn(
    data_elements: List[Dict[str, Union[torch.Tensor, List]]]) -> Dict[str, Union[torch.Tensor, List]]:
    """Collate encoded samples into a single padded batch.

    Args:
        data_elements: the encoded samples, as produced by
            ``ABSA_A_Dataset.index_dataset``.

    Returns:
        A dict with the padded tensors 'inputs' (padded with 0), 'outputs'
        (padded with the PAD tag index) and 'attention_masks', plus the
        per-sample lists 'text', 'indexes', 'true_aspect_terms' and
        'word_span_indexes'.

        'attention_masks' is a *word-level* mask: for every sample it is 1 on
        the first ``len(word_span_indexes)`` positions and 0 elsewhere, i.e.
        it marks the positions that carry a word-level label, not every
        non-padding word piece.
    """
    batch = {}
    batch['inputs'] = pad_sequence([de['inputs'] for de in data_elements], batch_first=True, padding_value=0.0)
    batch['outputs'] = pad_sequence([de['outputs'] for de in data_elements], batch_first=True, padding_value=ABSA_A_Dataset.padding_tag_index)
    batch['text'] = [de['text'] for de in data_elements]
    batch['indexes'] = [de['indexes'] for de in data_elements]
    batch['true_aspect_terms'] = [de['true_aspect_terms'] for de in data_elements]
    batch['word_span_indexes'] = [de['word_span_indexes'] for de in data_elements]

    # Vectorized equivalent of "1.0 if position < number of words else 0.0".
    padded_length = batch['inputs'].shape[1]
    words_per_sample = torch.tensor([len(words_span) for words_span in batch['word_span_indexes']])
    positions = torch.arange(padded_length).unsqueeze(0)
    batch['attention_masks'] = (positions < words_per_sample.unsqueeze(1)).to(torch.get_default_dtype())

    return batch

In [None]:
class DataModuleABSA_A(pl.LightningDataModule):
    """Lightning data module serving the train, validation and test loaders.

    The validation and the test set are both built from the dev split.
    """

    def __init__(self, training_targets, training_text, dev_targets, dev_text, batch_size=32, number_of_valid_samples=VALID_SIZE, number_of_test_samples=TEST_SIZE):
        """
        Args:
            training_targets: targets column of the training split.
            training_text: text column of the training split.
            dev_targets: targets column of the dev split.
            dev_text: text column of the dev split.
            batch_size: training batch size.
            number_of_valid_samples: size of the validation set; the
                validation loader uses half of it as batch size.
            number_of_test_samples: size of the test set; the test loader
                uses half of it as batch size.
        """
        super().__init__()
        self.training_targets = training_targets
        self.training_text = training_text
        self.dev_targets = dev_targets
        self.dev_text = dev_text
        self.batch_size = batch_size
        self.number_of_valid_samples = number_of_valid_samples
        self.number_of_test_samples = number_of_test_samples

    def setup(self, stage=None):
        """Build and encode the three datasets."""
        self.trainingset = ABSA_A_Dataset(self.training_targets, self.training_text)
        self.devset = ABSA_A_Dataset(self.dev_targets, self.dev_text, preprocess_text=preprocess_text_for_test)
        self.testset = ABSA_A_Dataset(self.dev_targets, self.dev_text, preprocess_text=preprocess_text_for_test)

        self.trainingset.index_dataset()
        self.devset.index_dataset()
        self.testset.index_dataset()

    def train_dataloader(self):
        return DataLoader(self.trainingset,
                          batch_size=self.batch_size,
                          shuffle=True,
                          collate_fn=rnn_collate_fn,
                          num_workers=0)

    def _eval_dataloader(self, dataset, number_of_samples):
        # I would test considering the whole validation/test set, in order to
        # get a more representative loss (that is not just an average), but in
        # order to fit in CUDA memory the set is split in two batches.
        # max(1, ...) keeps the batch size valid when fewer than two samples
        # are declared (n // 2 would be 0, which DataLoader rejects).
        return DataLoader(dataset,
                          batch_size=max(1, number_of_samples // 2),
                          shuffle=False,
                          collate_fn=rnn_collate_fn,
                          num_workers=0)

    def val_dataloader(self):
        return self._eval_dataloader(self.devset, self.number_of_valid_samples)

    def test_dataloader(self):
        return self._eval_dataloader(self.testset, self.number_of_test_samples)

In [None]:
# Visual check of the collate function: print the content and the shapes of
# the first batch only. Relies on `dataset` built in an earlier cell.
train_dataloader = DataLoader(dataset, batch_size=32, collate_fn=rnn_collate_fn)
for batch in train_dataloader:
    print(batch['inputs'])
    print(batch['outputs'])
    print(batch['text'])
    print(batch['indexes'])
    print(batch['true_aspect_terms'])
    print(batch['attention_masks'])
    print(batch['word_span_indexes'])
    print(batch['inputs'].shape)
    print(batch['outputs'].shape)
    print(len(batch['text']))
    print(len(batch['indexes']))
    print(len(batch['true_aspect_terms']))
    print(len(batch['word_span_indexes']))
    print(batch['attention_masks'].shape)
    # Per sample: (word piece, mask value, label id) triples, position by position.
    for (indexes, mask_values, labels) in zip(batch['inputs'], batch['attention_masks'], batch['outputs']):
      print(list(zip(tokenizer.convert_ids_to_tokens(indexes), mask_values, labels)))
    break  # one batch is enough

dataset = None # To free up RAM
train_dataloader = None

tensor([[ 101,  146, 1579,  ...,    0,    0,    0],
        [ 101,  146, 1145,  ...,    0,    0,    0],
        [ 101,  146, 1354,  ...,    0,    0,    0],
        ...,
        [ 101, 6844, 1536,  ...,    0,    0,    0],
        [ 101, 1262,  146,  ...,    0,    0,    0],
        [ 101, 1109, 3934,  ...,    0,    0,    0]])
tensor([[0, 0, 0,  ..., 2, 2, 2],
        [0, 0, 0,  ..., 2, 2, 2],
        [0, 0, 0,  ..., 2, 2, 2],
        ...,
        [0, 0, 0,  ..., 2, 2, 2],
        [0, 0, 0,  ..., 2, 2, 2],
        [0, 0, 1,  ..., 2, 2, 2]])
['I always use a backup hard disk to store important files at all times.', 'I also love the small, convenient size of my laptop, making it a perfect tool for my academic studies.', 'I thought the white Mac computers looked dirty too quicly where you use the mousepad and where you place your hands when typing.', 'It is always reliable, never bugged and responds well.', "The real stand out on this computer is the feel of the keyboard and it's speed.", '-

## Model Building

In [None]:
import transformers.models.bert.modeling_bert
from transformers.models.bert.modeling_bert import BertPreTrainedModel, BertModel
from torch.nn import CrossEntropyLoss
from transformers.modeling_outputs import TokenClassifierOutput
import torch.nn.functional as F

N_LAST_HIDDEN_LAYERS = 4

class BertForTokenClassificationConcat(BertPreTrainedModel):
    """BERT encoder with a word-level classification head.

    The last ``N_LAST_HIDDEN_LAYERS`` hidden layers are concatenated and
    projected, the word pieces of every word are averaged into a single
    vector (using ``word_span_indexes``) and the result is classified by a
    small feed-forward head.
    """

    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.linear1 = nn.Linear(N_LAST_HIDDEN_LAYERS * config.hidden_size, config.hidden_size)
        self.linear2 = nn.Linear(config.hidden_size, config.hidden_size//2)
        self.swish = nn.SiLU()
        self.relu = nn.ReLU()
        self.classifier = nn.Linear(config.hidden_size//2, config.num_labels)

        self.init_weights()

    @staticmethod
    def _pool_word_pieces(token_features, word_span_indexes):
        """Average the word-piece vectors of every word.

        Args:
            token_features: tensor indexed as (sample, word piece, feature).
            word_span_indexes: for every sample, the ``(start, end)``
                word-piece span of each word.

        Returns:
            A pair ``(word_features, word_mask)``: ``word_features`` has the
            shape of ``token_features``, with one averaged vector per word
            followed by zero rows; ``word_mask`` is a boolean tensor that is
            True on the positions holding a word.
        """
        batch_size, seq_len = token_features.shape[0], token_features.shape[1]
        word_mask = torch.zeros(batch_size, seq_len, dtype=torch.bool, device=token_features.device)
        pooled_sentences = []
        for row, words_span in enumerate(word_span_indexes):
            sentence_features = token_features[row]
            word_vectors = [torch.mean(sentence_features[int(start):int(end)], dim=0)
                            for start, end in words_span]
            pooled = torch.stack(word_vectors, dim=0)
            n_words = pooled.shape[0]
            # Zero-pad on the word axis back to the word-piece length so that
            # the samples can be stacked into one tensor.
            pooled_sentences.append(F.pad(pooled, pad=(0, 0, 0, seq_len - n_words)))
            word_mask[row, :n_words] = True
        return torch.stack(pooled_sentences, dim=0), word_mask

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        word_span_indexes=None
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
            1]``.
        word_span_indexes (`optional`):
            For every sample, the ``(start, end)`` word-piece span of each word. When given, the hidden states of
            the word pieces of a word are averaged and the loss is restricted to the positions holding a word.
            When omitted, the head is applied to the word pieces directly.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # The hidden states of all layers are always needed by the head.
        output_hidden_states = True

        # The encoder must attend to every real word piece. Callers in this
        # file pass a word-level mask (1 on the first "number of words"
        # positions), which hides the trailing word pieces and [SEP] as soon
        # as a word is split. Extend the mask with every non-padding input id;
        # a mask that already covers all word pieces is left unchanged.
        bert_attention_mask = attention_mask
        pad_token_id = self.config.pad_token_id
        if attention_mask is not None and input_ids is not None and pad_token_id is not None:
            real_word_pieces = input_ids != pad_token_id
            bert_attention_mask = ((attention_mask > 0) | real_word_pieces).to(attention_mask.dtype)

        outputs = self.bert(
            input_ids,
            attention_mask=bert_attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )

        # Skip the first entry of the hidden states and concatenate the last
        # N_LAST_HIDDEN_LAYERS layers on the feature axis.
        hidden_states = outputs.hidden_states[1:] if return_dict else outputs[2][1:]
        token_features = torch.cat(hidden_states[-N_LAST_HIDDEN_LAYERS:], dim=-1)
        token_features = self.swish(self.linear1(token_features))

        if word_span_indexes is None:
            word_features, word_mask = token_features, None
        else:
            word_features, word_mask = self._pool_word_pieces(token_features, word_span_indexes)

        word_features = self.dropout(word_features)
        word_features = self.relu(self.linear2(word_features))
        word_features = self.dropout(word_features)
        logits = self.classifier(word_features)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss: the positions enabled by the
            # caller's mask and, when words were pooled, holding a word.
            active_loss = None
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
            if word_mask is not None:
                flat_word_mask = word_mask.view(-1)
                active_loss = flat_word_mask if active_loss is None else (active_loss & flat_word_mask)

            if active_loss is not None:
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [None]:
class AspectTermIdentificationModel(nn.Module):
    """Thin wrapper that loads the pretrained classifier described by ``hparams``."""

    # we provide the hyperparameters as input
    def __init__(self, hparams):
        """
        Args:
            hparams: object exposing ``bert_model``, ``num_labels``,
                ``output_attentions`` and ``output_hidden_states``.
        """
        super(AspectTermIdentificationModel, self).__init__()

        pprint(hparams)
        self.bert = BertForTokenClassificationConcat.from_pretrained(
            hparams.bert_model,
            num_labels=hparams.num_labels,
            output_attentions = hparams.output_attentions,
            output_hidden_states = hparams.output_hidden_states)

    
    def forward(self, x, word_span_indexes):
        """Run the classifier.

        Args:
            x: triple ``(input ids, attention mask, labels)``; labels may be
                None.
            word_span_indexes: per-sample word-piece spans of the words.

        Returns:
            ``(loss, logits)``; ``loss`` is None when no labels are given.
        """
        b_input_ids, b_input_mask, b_labels = x
        outputs = self.bert(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels,
                            word_span_indexes=word_span_indexes)
        if b_labels is None:
            # Without labels no loss is produced and the logits come first;
            # indexing [0], [1] would return (logits, hidden states) instead.
            return None, outputs[0]
        loss, logits = outputs[0], outputs[1]
        return loss, logits

In [None]:
def evaluate_extraction(samples: List[List[str]],
                        predictions_b: List[List[str]]) -> Tuple[Dict[str, int], float, float, float]:
    """Score predicted aspect terms against the gold ones.

    Terms are compared as sets, sentence by sentence, and the counts are
    accumulated over all sentences.

    Args:
        samples: gold aspect terms, one list per sentence.
        predictions_b: predicted aspect terms, one list per sentence.

    Returns:
        ``(scores, precision, recall, f1)`` where ``scores`` holds the 'tp',
        'fp' and 'fn' counts and the three metrics are percentages. A metric
        whose denominator is zero is reported as 0.0.
    """
    scores = {"tp": 0, "fp": 0, "fn": 0}
    for label, pred in zip(samples, predictions_b):
        pred_terms = set(pred)
        gt_terms = set(label)

        scores["tp"] += len(pred_terms & gt_terms)
        scores["fp"] += len(pred_terms - gt_terms)
        scores["fn"] += len(gt_terms - pred_terms)

    n_predicted = scores["tp"] + scores["fp"]
    n_gold = scores["tp"] + scores["fn"]
    # Guard every division: no prediction, no gold term or no true positive
    # would otherwise raise ZeroDivisionError.
    precision = 100 * scores["tp"] / n_predicted if n_predicted else 0.0
    recall = 100 * scores["tp"] / n_gold if n_gold else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

    return scores, precision, recall, f1

def get_aspect_terms(labels: List[str],
                     text: str,
                     indexes: List[Tuple[int, int]]) -> List[str]:
    """Turn per-token labels into the list of predicted aspect terms.

    Every maximal run of consecutive 'AT' tokens is mapped to the slice of
    ``text`` going from the start of its first token to the end of its last
    one. The slice is then split on punctuation (except ``- " / . ' +``) and
    every non-blank piece, stripped of surrounding whitespace, is an aspect
    term.

    Args:
        labels: one label per token ('AT' marks an aspect-term token).
        text: the raw sentence.
        indexes: the ``(start, end)`` character span of every token.

    Returns:
        The aspect terms in reading order.

    Raises:
        AssertionError: if ``labels`` and ``indexes`` differ in length.
    """
    assert len(labels) == len(indexes)

    filtered_punctuation = punctuation.replace('-', '').replace('"', '').replace('/', '').replace('.', '').replace("'", '').replace('+', '')
    # Escaped so that no character is interpreted as class syntax.
    split_pattern = re.compile("[" + re.escape(filtered_punctuation) + "]+")

    aspect_terms = []

    def flush(start: int, end: int) -> None:
        # Strip before testing for emptiness, so that a whitespace-only piece
        # (e.g. the blank between two separators) is not emitted as ''.
        for piece in split_pattern.split(text[start:end]):
            piece = piece.strip()
            if piece != '':
                aspect_terms.append(piece)

    start_index = None  # start of the run being collected, None outside a run
    end_index = None
    for label, token_span in zip(labels, indexes):
        if label == 'AT':
            if start_index is None:
                start_index = token_span[0]
            end_index = token_span[1]
        elif start_index is not None:
            flush(start_index, end_index)
            start_index = None

    # A run that reaches the end of the sentence is still pending.
    if start_index is not None:
        flush(start_index, end_index)

    return aspect_terms

def evaluate_aspect_terms(inputs_indexes: torch.Tensor,
                          decoded_labels: List[List[str]],
                          texts: List[str],
                          batch_indexes: List[List[Tuple[int, int]]],
                          true_aspect_terms: List[List[str]]) -> Tuple[Dict[str, int], float, float, float]:
  """Extract the predicted aspect terms of a batch and score them.

  Args:
      inputs_indexes: padded word-piece ids, one row per sentence.
      decoded_labels: per sentence, the decoded labels; the first and the
          last label of every sentence are discarded.
      texts: the raw sentences.
      batch_indexes: per sentence, the character span of every word.
      true_aspect_terms: per sentence, the gold aspect terms.

  Returns:
      The result of ``evaluate_extraction`` on the gold and predicted terms.
  """
  CLS_token_id = tokenizer.convert_tokens_to_ids('[CLS]')
  SEP_token_id = tokenizer.convert_tokens_to_ids('[SEP]')

  # 1) Recover the word pieces of every sentence: stop at the first padding
  #    id (0) and leave out [CLS] and [SEP].
  batch_input_tokens = []
  for id_row in inputs_indexes:
    kept_ids = []
    for token_id in id_row:
      if token_id == 0.0:
        break
      if token_id != CLS_token_id and token_id != SEP_token_id:
        kept_ids.append(token_id)
    batch_input_tokens.append(tokenizer.convert_ids_to_tokens(kept_ids))

  # 2) Keep one label per word: a word starts at every piece that is not a
  #    "##" continuation.
  assert len(batch_input_tokens) == len(decoded_labels)
  new_batch_labels = []
  for tokens, labels in zip(batch_input_tokens, decoded_labels):
    word_labels = labels[1:-1]
    n_words = sum(1 for token in tokens if not token.startswith("##"))
    new_batch_labels.append([word_labels[position] for position in range(n_words)])
    assert n_words == len(word_labels)

  # 3) Turn the labels into terms and compare them with the gold ones.
  assert len(new_batch_labels) == len(texts)
  gt = []
  preds = []
  for i, word_labels in enumerate(new_batch_labels):
    preds.append(get_aspect_terms(word_labels, texts[i], batch_indexes[i]))
    gt.append(true_aspect_terms[i])

  return evaluate_extraction(gt, preds)

In [None]:
from torchmetrics import F1
from transformers import get_linear_schedule_with_warmup, AdamW


class AspectTermModule(pl.LightningModule):
    """Lightning module that trains and evaluates ``AspectTermIdentificationModel``.

    Hyper-parameters read from ``hparams``: ``num_labels``, ``device``,
    ``full_finetuning``, ``n_batches`` and ``n_epochs`` (the whole mapping is
    also forwarded to ``AspectTermIdentificationModel``).

    Besides the logged per-batch metrics, the module accumulates the
    true-positive / false-positive / false-negative counts of the aspect-term
    extraction over a test run; ``get_test_results`` turns them into
    corpus-level precision, recall and F1.
    """

    def __init__(self, hparams, *args, **kwargs):
        super(AspectTermModule, self).__init__(*args, **kwargs)

        self.save_hyperparameters(hparams)
        self.F1 = F1(num_classes=self.hparams.num_labels, average='macro')
        self.model = AspectTermIdentificationModel(self.hparams)
        # Extraction counters, accumulated batch after batch by `test_step`.
        self.test_tp = 0
        self.test_fp = 0
        self.test_fn = 0

    @staticmethod
    def _is_scored_label(label_index) -> bool:
        """Return True when a gold label is neither the padding tag nor the X tag."""
        label_index = int(label_index)
        return (label_index != ABSA_A_Dataset.padding_tag_index
                and label_index != ABSA_A_Dataset.x_tag_index)

    def _scored_predictions(self, predictions, labels):
        """Per sample, keep the predictions whose gold label is scored (see `_is_scored_label`)."""
        return [[int(p_i) for p_i, l_i in zip(p, l) if self._is_scored_label(l_i)]
                for p, l in zip(predictions, labels)]

    def _flat_tensor(self, nested_values):
        """Flatten a list of per-sample lists into one tensor on the configured device."""
        flat_values = [value for sample_values in nested_values for value in sample_values]
        return torch.tensor(flat_values).to(self.hparams.device)

    def _flat_scored_labels(self, labels):
        """Flatten the gold labels of a batch, dropping the padding and X tags."""
        return self._flat_tensor([[int(l_i) for l_i in l if self._is_scored_label(l_i)]
                                  for l in labels])

    # Forward pass: delegates to the wrapped model and returns its outputs unchanged
    # (the step methods unpack them as `(loss, logits)`).
    def forward(self, x, word_span_indexes):
        outputs = self.model(x, word_span_indexes)
        return outputs

    # Single training step: computes the loss that Lightning back-propagates and logs loss / F1.
    def training_step(self, batch, batch_nb):
        labels = batch['outputs']
        inputs = (batch['inputs'], batch['attention_masks'], labels)
        loss, logits = self.forward(inputs, batch['word_span_indexes'])
        predictions = torch.argmax(logits, -1)
        # F1 is computed only on the positions whose gold label is a real tag.
        predictions = self._flat_tensor(self._scored_predictions(predictions, labels))
        labels = self._flat_scored_labels(labels)

        F1 = self.F1(predictions, labels)
        # Log it:
        self.log('train_loss', loss, prog_bar=True)
        self.log('train_f1', F1, prog_bar=True)
        # Very important for PL to return the loss that will be used to update the weights:
        return loss

    # Single validation step: logs the loss ('valid_loss' is what the checkpoint callback monitors) and F1.
    def validation_step(self, batch, batch_nb):
        labels = batch['outputs']
        inputs = (batch['inputs'], batch['attention_masks'], labels)
        sample_loss, logits = self.forward(inputs, batch['word_span_indexes'])
        predictions = torch.argmax(logits, -1)
        predictions = self._flat_tensor(self._scored_predictions(predictions, labels))
        labels = self._flat_scored_labels(labels)

        sample_F1 = self.F1(predictions, labels)

        self.log('valid_loss', sample_loss, prog_bar=True)
        self.log('valid_f1', sample_F1, prog_bar=True)

    def on_test_epoch_start(self):
        # Start every test run from zero, otherwise calling `trainer.test` twice
        # (e.g. re-running the notebook cell) would keep adding to the old counts.
        self.test_tp = 0
        self.test_fp = 0
        self.test_fn = 0

    # Single test step: token-level F1 plus aspect-term extraction scores for the batch.
    def test_step(self, batch, batch_nb):
        labels = batch['outputs']
        inputs_indexes = batch['inputs']
        inputs = (inputs_indexes, batch['attention_masks'], labels)
        sample_loss, logits = self.forward(inputs, batch['word_span_indexes'])
        predictions = torch.argmax(logits, -1)
        # Kept per sample here because the extraction metric needs one label list per sentence.
        predictions = self._scored_predictions(predictions, labels)
        labels = self._flat_scored_labels(labels)
        decoded_labels = ABSA_A_Dataset.decode_output(predictions)
        scores, precision, recall, f1 = evaluate_aspect_terms(inputs_indexes, decoded_labels, batch['text'], batch['indexes'], batch['true_aspect_terms'])

        predictions = self._flat_tensor(predictions)
        sample_F1 = self.F1(predictions, labels)
        self.test_tp += scores['tp']
        self.test_fp += scores['fp']
        self.test_fn += scores['fn']

        self.log('test_loss', sample_loss, prog_bar=True)
        self.log('test_f1_on_NE_labels', sample_F1, prog_bar=True)
        self.log('test_tp', scores['tp'], prog_bar=True)
        self.log('test_fp', scores['fp'], prog_bar=True)
        self.log('test_fn', scores['fn'], prog_bar=True)
        self.log('test_precision', precision, prog_bar=True)
        self.log('test_recall', recall, prog_bar=True)
        self.log('test_f1', f1, prog_bar=True)

    def configure_optimizers(self):
        """Build AdamW and a linear-decay schedule that is stepped after every batch.

        With ``full_finetuning`` every parameter of ``self.model.bert`` is
        optimized, with weight decay 0.01 on everything except biases and
        LayerNorm weights; otherwise only ``self.model.bert.classifier`` is
        optimized.
        """
        if self.hparams.full_finetuning:
            named_parameters = list(self.model.bert.named_parameters())
            # 'LayerNorm.weight' is the name used by Hugging Face BERT; 'gamma'/'beta'
            # are kept for checkpoints that still use the old TensorFlow-style names.
            without_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta']
            # The per-group option must be called 'weight_decay': any other key
            # (the original used 'weight_decay_rate') is ignored by AdamW, which
            # then falls back to its default of 0.0 for every group.
            optimizer_grouped_parameters = [
                {'params': [p for n, p in named_parameters if not any(wd in n for wd in without_decay)],
                 'weight_decay': 0.01},
                {'params': [p for n, p in named_parameters if any(wd in n for wd in without_decay)],
                 'weight_decay': 0.0}
            ]
        else:
            named_parameters = list(self.model.bert.classifier.named_parameters())
            optimizer_grouped_parameters = [{"params": [p for n, p in named_parameters]}]

        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=3e-5,
            eps=1e-8
        )

        total_train_steps = self.hparams.n_batches * self.hparams.n_epochs

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=total_train_steps
        )

        # `total_train_steps` counts optimizer steps, so the scheduler has to advance
        # once per step; Lightning's default for a bare scheduler is once per epoch.
        scheduler_config = {'scheduler': scheduler, 'interval': 'step', 'frequency': 1}

        return [optimizer], [scheduler_config]

    def get_test_results(self):
        """Return ``(tp, fp, fn, precision, recall, f1)`` from the accumulated test counters.

        Precision, recall and F1 are percentages (0-100). A score whose
        denominator is zero is reported as 0.0 instead of raising
        ``ZeroDivisionError``.
        """
        tp, fp, fn = self.test_tp, self.test_fp, self.test_fn
        predicted_total = tp + fp
        gold_total = tp + fn

        precision = 100 * tp / predicted_total if predicted_total > 0 else 0.0
        recall = 100 * tp / gold_total if gold_total > 0 else 0.0
        score_sum = precision + recall
        f1 = 2 * precision * recall / score_sum if score_sum > 0 else 0.0

        return tp, fp, fn, precision, recall, f1


## Model Training

In [None]:
# Gradient-norm threshold handed to the trainer as `gradient_clip_val`.
MAX_GRAD_NORM = 1.0

# Keep only the checkpoint with the lowest validation loss.
check_point_callback = pl.callbacks.ModelCheckpoint(
    monitor='valid_loss',  # the value that we want to use for model selection.
    verbose=True,  # whether or not to log information in the console.
    save_top_k=1,  # the number of checkpoints we want to store.
    mode='min',  # whether we want to maximize (max) or minimize (min) the "monitor" value.
    dirpath='experiments/Aspect_Term_Classifier',  # output directory path
    filename='{epoch}-{valid_loss:.4f}'  # the prefix on the checkpoint values. Metrics stored by the trainer can be used to dynamically change the name.
)

# Hyper-parameters consumed by AspectTermModule.
# NOTE(review): the "- 2" presumably removes the padding and X tags from the tag set — confirm against ABSA_A_Dataset.tag_values.
hparams = {'num_labels': len(ABSA_A_Dataset.tag_values) - 2, # number of different NE labels in our case
           'bert_model': BERT_MODEL,
           'full_finetuning': True,
           'output_attentions': False,
           'output_hidden_states': True,
           'n_epochs': 4,
           'device': 'cuda' if torch.cuda.is_available() else 'cpu'}

# A first, throw-away data module is set up only to count the training batches:
# `n_batches` is needed by configure_optimizers to size the learning-rate schedule.
data_module = DataModuleABSA_A(train_dataframe['targets'], train_dataframe['text'], valid_dataframe['targets'], valid_dataframe['text'])
data_module.setup()
n_batches = len(data_module.train_dataloader())
hparams['n_batches'] = n_batches
data_module = None
# Fresh data module that is actually handed to the trainer.
data_module = DataModuleABSA_A(train_dataframe['targets'], train_dataframe['text'], valid_dataframe['targets'], valid_dataframe['text'])
# The dataframe names are cleared here, so this cell cannot be re-run
# without first re-running the cell that builds the dataframes.
train_dataframe = None
valid_dataframe = None

# NOTE(review): the trainer runs for n_epochs - 2 epochs while the scheduler in
# configure_optimizers is sized with n_epochs — confirm the mismatch is intentional.
trainer = pl.Trainer(gpus=1 if torch.cuda.is_available() else 0,
                     val_check_interval=1.0,
                     deterministic=True,
                     max_epochs=hparams['n_epochs'] - 2,
                     gradient_clip_val=MAX_GRAD_NORM,
                     callbacks=[check_point_callback] # the callback we want our trainer to use.
)

model = AspectTermModule(hparams)
trainer.fit(model, datamodule=data_module)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


{'bert_model': 'bert-large-cased',
 'device': 'cuda',
 'full_finetuning': True,
 'n_batches': 157,
 'n_epochs': 4,
 'num_labels': 2,
 'output_attentions': False,
 'output_hidden_states': True}


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=762.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1338740706.0, style=ProgressStyle(descr…




Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForTokenClassificationConcat: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassificationConcat from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassificationConcat from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassificationConcat were not initialized from the model ch

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

Global seed set to 1234




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 0, global step 156: valid_loss reached 0.07295 (best 0.07295), saving model to "/content/experiments/Aspect_Term_Classifier/epoch=0-valid_loss=0.0729.ckpt" as top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 1, global step 313: valid_loss reached 0.06361 (best 0.06361), saving model to "/content/experiments/Aspect_Term_Classifier/epoch=1-valid_loss=0.0636.ckpt" as top 1





In [None]:
# Run the test loop on the data module's test dataloader and show the metrics
# returned by the trainer.
test_set_results = trainer.test(
    model,
    test_dataloaders=data_module.test_dataloader(),
)
print("test set results: {}".format(test_set_results))

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_f1': 78.39517211914062,
 'test_f1_on_NE_labels': 0.9244254231452942,
 'test_fn': 101.5,
 'test_fp': 127.0,
 'test_loss': 0.06361022591590881,
 'test_precision': 76.53558349609375,
 'test_recall': 80.35930633544922,
 'test_tp': 439.5}
--------------------------------------------------------------------------------
test set results: [{'test_loss': 0.06361022591590881, 'test_f1_on_NE_labels': 0.9244254231452942, 'test_tp': 439.5, 'test_fp': 127.0, 'test_fn': 101.5, 'test_precision': 76.53558349609375, 'test_recall': 80.35930633544922, 'test_f1': 78.39517211914062}]


In [None]:
# Scores computed from the TP/FP/FN counters that the module accumulated during testing.
(test_tp, test_fp, test_fn,
 test_precision, test_recall, test_f1) = model.get_test_results()
print(f"Test TP: {test_tp}, Test FP: {test_fp}, Test FN: {test_fn}, Test Precision: {test_precision}, Test Recall: {test_recall}, Test F1: {test_f1},")

Test TP: 879, Test FP: 254, Test FN: 203, Test Precision: 77.58164165931156, Test Recall: 81.23844731977819, Test F1: 79.36794582392776,
