<a href="https://colab.research.google.com/github/simpleParadox/Private-RE/blob/main/CMPUT_622_Project_semeval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers --quiet

[K     |████████████████████████████████| 5.5 MB 12.3 MB/s 
[K     |████████████████████████████████| 7.6 MB 12.8 MB/s 
[K     |████████████████████████████████| 182 kB 14.3 MB/s 
[?25h

In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
#from torchvision import datasets
#from torchvision.transforms import ToTensor
import torch.nn.functional as F
import torch.optim as optim

# Import the transformers library for the retrieving the BERT embeddings.
import transformers
from transformers import BertModel, BertTokenizer


# Import pyvacy for privacy preserving optimizers.
#from pyvacy import optim as private_optim, analysis

# Import scikit-learn packages.
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.utils import gen_batches


# Import scientific computing python packages.
import pandas as pd
import numpy as np      
import matplotlib.pyplot as plt

# Additional packages.
from google.colab import drive
from tqdm import tqdm
import csv
from typing import List


# Using gpu if available.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [4]:
device

'cpu'

## Read in Erin's tabular data and preprocess it.

In [5]:
drive.mount('/content/drive')

Mounted at /content/drive


## **Read Sententence-level Data**

In [6]:
train_directory_path = '/content/drive/MyDrive/CMPUT 622 project/data/semeval/train.txt'
test_directory_path = '/content/drive/MyDrive/CMPUT 622 project/data/semeval/test.txt'

In [7]:
relation_to_id = [
    "other", 
    "Entity-Destination(e1,e2)",
    "Cause-Effect(e2,e1)",        
    "Member-Collection(e2,e1)",      
    "Entity-Origin(e1,e2)",        
    "Message-Topic(e1,e2)",        
    "Component-Whole(e2,e1)",       
    "Component-Whole(e1,e2)",       
    "Instrument-Agency(e2,e1)",     
    "Product-Producer(e2,e1)",     
    "Content-Container(e1,e2)",     
    "Cause-Effect(e1,e2)",          
    "Product-Producer(e1,e2)",       
    "Content-Container(e2,e1)",    
    "Entity-Origin(e2,e1)",          
    "Message-Topic(e2,e1)",        
    "Instrument-Agency(e1,e2)",       
    "Member-Collection(e1,e2)",      
    "Entity-Destination(e2,e1)"]    

In [8]:
def convertText_csv(path):
  output: List[List[str]] = []

  with open(path) as file:
    lines = file.read()
    lines =  lines.splitlines()

  for line in lines:
    line = line.strip()
    input = line.split(sep="\t")
    entity1 = input[0]
    entity2 = input[1]
    relation = input[2]
    sentence = input[3]

    #sentence = sentence.replace('<e1>', '')
    #sentence = sentence.replace('<e2>', '')
    #sentence = sentence.replace('</e1>', '')
    #sentence = sentence.replace('</e2>', '')
    
    output.append([sentence, entity1, entity2, relation])
  sentence[:2]
  return output

In [9]:
def writeOutput(output, path):
  with open(path, 'w', newline='') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerow(["sentence", "entity1", "entity2", "relation"])
    for i in output:
      writer.writerow(i)

In [26]:
def loadSemEvalDateset(inputFilename, outputFilename):
  writeOutput(convertText_csv(inputFilename), outputFilename)
  data = pd.read_csv(outputFilename, encoding='utf-8', sep = '\t')
  # print(data.head())
  # print(data[:2])

  data = shuffle(data, random_state = 1) 
  

  labels = data.iloc[:,-1].values
  features = data.iloc[:,:1].values.tolist()
  sentences = [' '.join(i).strip() for i in features]

  label = preprocessing.LabelEncoder()
  y = label.fit_transform(data['relation'])
  label_mappings = integer_mapping = {i: l for i, l in enumerate(label.classes_)}
  return sentences, y, label_mappings

In [27]:
x_train, y_train_classes, train_map = loadSemEvalDateset(train_directory_path, "train.tsv")

In [28]:
x_test, y_test, test_map = loadSemEvalDateset(test_directory_path, "test.tsv")

In [29]:
all_data = pd.read_csv('/content/train.tsv', encoding='utf-8', sep = '\t')
all_data.head()

Unnamed: 0,sentence,entity1,entity2,relation
0,The system as described above has its greatest...,configuration,elements,"Component-Whole(e2,e1)"
1,The <e1> child </e1> was carefully wrapped and...,child,cradle,other
2,The <e1> author </e1> of a keygen uses a <e2> ...,author,disassembler,"Instrument-Agency(e2,e1)"
3,A misty <e1> ridge </e1> uprises from the <e2>...,ridge,surge,other
4,The <e1> student </e1> <e2> association </e2> ...,student,association,"Member-Collection(e1,e2)"


In [30]:
len(y_train_classes)

8000

In [31]:
x_train[0]

'An <e1> invoice </e1> is a commercial document indicating the <e2> products </e2> , quantities , and agreed prices for products or services the seller has provided the buyer .'

In [32]:
x_test[0] # probably test and train files have same data

'Police found in front of one of the bombed establishments an improvised <e1> bomb </e1> inside a papaya <e2> fruit </e2> left under a table at Delecta Bakeshop Friday by two teenage suspects .'

In [18]:
y_test.shape

(2717,)

In [None]:
# y_train = torch.tensor(y_train)
# y_train

tensor([14, 13, 15,  ...,  0, 16, 13])

###IMPORT tokenization.py

In [22]:
#@title Default title text
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# coding=utf-8
"""Tokenization classes implementation.

The file is forked from:
https://github.com/google-research/bert/blob/master/tokenization.py.
"""

import collections
import re
import unicodedata

import six
import tensorflow as tf

#import sentencepiece as spm

SPIECE_UNDERLINE = "▁"


def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
  """Checks whether the casing config is consistent with the checkpoint name."""

  # The casing has to be passed in by the user and there is no explicit check
  # as to whether it matches the checkpoint. The casing information probably
  # should have been stored in the bert_config.json file, but it's not, so
  # we have to heuristically detect it to validate.

  if not init_checkpoint:
    return

  m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
  if m is None:
    return

  model_name = m.group(1)

  lower_models = [
      "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
      "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
  ]

  cased_models = [
      "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
      "multi_cased_L-12_H-768_A-12"
  ]

  is_bad_config = False
  if model_name in lower_models and not do_lower_case:
    is_bad_config = True
    actual_flag = "False"
    case_name = "lowercased"
    opposite_flag = "True"

  if model_name in cased_models and do_lower_case:
    is_bad_config = True
    actual_flag = "True"
    case_name = "cased"
    opposite_flag = "False"

  if is_bad_config:
    raise ValueError(
        "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
        "However, `%s` seems to be a %s model, so you "
        "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
        "how the model was pre-training. If this error is wrong, please "
        "just comment out this check." %
        (actual_flag, init_checkpoint, model_name, case_name, opposite_flag))


def convert_to_unicode(text):
  """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
  if six.PY3:
    if isinstance(text, str):
      return text
    elif isinstance(text, bytes):
      return text.decode("utf-8", "ignore")
    else:
      raise ValueError("Unsupported string type: %s" % (type(text)))
  elif six.PY2:
    if isinstance(text, str):
      return text.decode("utf-8", "ignore")
    elif isinstance(text, unicode):
      return text
    else:
      raise ValueError("Unsupported string type: %s" % (type(text)))
  else:
    raise ValueError("Not running on Python2 or Python 3?")


def printable_text(text):
  """Returns text encoded in a way suitable for print or `tf.logging`."""

  # These functions want `str` for both Python2 and Python3, but in one case
  # it's a Unicode string and in the other it's a byte string.
  if six.PY3:
    if isinstance(text, str):
      return text
    elif isinstance(text, bytes):
      return text.decode("utf-8", "ignore")
    else:
      raise ValueError("Unsupported string type: %s" % (type(text)))
  elif six.PY2:
    if isinstance(text, str):
      return text
    elif isinstance(text, unicode):
      return text.encode("utf-8")
    else:
      raise ValueError("Unsupported string type: %s" % (type(text)))
  else:
    raise ValueError("Not running on Python2 or Python 3?")


def load_vocab(vocab_file):
  """Loads a vocabulary file into a dictionary."""
  vocab = collections.OrderedDict()
  index = 0
  with tf.io.gfile.GFile(vocab_file, "r") as reader:
    while True:
      token = convert_to_unicode(reader.readline())
      if not token:
        break
      token = token.strip()
      vocab[token] = index
      index += 1
  return vocab


def convert_by_vocab(vocab, items):
  """Converts a sequence of [tokens|ids] using the vocab."""
  output = []
  for item in items:
    output.append(vocab[item])
  return output


def convert_tokens_to_ids(vocab, tokens):
  return convert_by_vocab(vocab, tokens)


def convert_ids_to_tokens(inv_vocab, ids):
  return convert_by_vocab(inv_vocab, ids)


def whitespace_tokenize(text):
  """Runs basic whitespace cleaning and splitting on a piece of text."""
  text = text.strip()
  if not text:
    return []
  tokens = text.split()
  return tokens


class FullTokenizer(object):
  """Runs end-to-end tokenziation."""

  def __init__(self, vocab_file, do_lower_case=True, split_on_punc=True):
    self.vocab = load_vocab(vocab_file)
    self.inv_vocab = {v: k for k, v in self.vocab.items()}
    self.basic_tokenizer = BasicTokenizer(
        do_lower_case=do_lower_case, split_on_punc=split_on_punc)
    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

  def tokenize(self, text):
    split_tokens = []
    for token in self.basic_tokenizer.tokenize(text):
      for sub_token in self.wordpiece_tokenizer.tokenize(token):
        split_tokens.append(sub_token)

    return split_tokens

  def convert_tokens_to_ids(self, tokens):
    return convert_by_vocab(self.vocab, tokens)

  def convert_ids_to_tokens(self, ids):
    return convert_by_vocab(self.inv_vocab, ids)


class BasicTokenizer(object):
  """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

  def __init__(self, do_lower_case=True, split_on_punc=True):
    """Constructs a BasicTokenizer.

    Args:
      do_lower_case: Whether to lower case the input.
      split_on_punc: Whether to apply split on punctuations. By default BERT
        starts a new token for punctuations. This makes detokenization difficult
        for tasks like seq2seq decoding.
    """
    self.do_lower_case = do_lower_case
    self.split_on_punc = split_on_punc

  def tokenize(self, text):
    """Tokenizes a piece of text."""
    text = convert_to_unicode(text)
    text = self._clean_text(text)

    # This was added on November 1st, 2018 for the multilingual and Chinese
    # models. This is also applied to the English models now, but it doesn't
    # matter since the English models were not trained on any Chinese data
    # and generally don't have any Chinese data in them (there are Chinese
    # characters in the vocabulary because Wikipedia does have some Chinese
    # words in the English Wikipedia.).
    text = self._tokenize_chinese_chars(text)

    orig_tokens = whitespace_tokenize(text)
    split_tokens = []
    for token in orig_tokens:
      if self.do_lower_case:
        token = token.lower()
        token = self._run_strip_accents(token)
      if self.split_on_punc:
        split_tokens.extend(self._run_split_on_punc(token))
      else:
        split_tokens.append(token)

    output_tokens = whitespace_tokenize(" ".join(split_tokens))
    return output_tokens

  def _run_strip_accents(self, text):
    """Strips accents from a piece of text."""
    text = unicodedata.normalize("NFD", text)
    output = []
    for char in text:
      cat = unicodedata.category(char)
      if cat == "Mn":
        continue
      output.append(char)
    return "".join(output)

  def _run_split_on_punc(self, text):
    """Splits punctuation on a piece of text."""
    chars = list(text)
    i = 0
    start_new_word = True
    output = []
    while i < len(chars):
      char = chars[i]
      if _is_punctuation(char):
        output.append([char])
        start_new_word = True
      else:
        if start_new_word:
          output.append([])
        start_new_word = False
        output[-1].append(char)
      i += 1

    return ["".join(x) for x in output]

  def _tokenize_chinese_chars(self, text):
    """Adds whitespace around any CJK character."""
    output = []
    for char in text:
      cp = ord(char)
      if self._is_chinese_char(cp):
        output.append(" ")
        output.append(char)
        output.append(" ")
      else:
        output.append(char)
    return "".join(output)

  def _is_chinese_char(self, cp):
    """Checks whether CP is the codepoint of a CJK character."""
    # This defines a "chinese character" as anything in the CJK Unicode block:
    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
    #
    # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
    # despite its name. The modern Korean Hangul alphabet is a different block,
    # as is Japanese Hiragana and Katakana. Those alphabets are used to write
    # space-separated words, so they are not treated specially and handled
    # like the all of the other languages.
    if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
        (cp >= 0x3400 and cp <= 0x4DBF) or  #
        (cp >= 0x20000 and cp <= 0x2A6DF) or  #
        (cp >= 0x2A700 and cp <= 0x2B73F) or  #
        (cp >= 0x2B740 and cp <= 0x2B81F) or  #
        (cp >= 0x2B820 and cp <= 0x2CEAF) or
        (cp >= 0xF900 and cp <= 0xFAFF) or  #
        (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
      return True

    return False

  def _clean_text(self, text):
    """Performs invalid character removal and whitespace cleanup on text."""
    output = []
    for char in text:
      cp = ord(char)
      if cp == 0 or cp == 0xfffd or _is_control(char):
        continue
      if _is_whitespace(char):
        output.append(" ")
      else:
        output.append(char)
    return "".join(output)


class WordpieceTokenizer(object):
  """Runs WordPiece tokenziation."""

  def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=400):
    self.vocab = vocab
    self.unk_token = unk_token
    self.max_input_chars_per_word = max_input_chars_per_word

  def tokenize(self, text):
    """Tokenizes a piece of text into its word pieces.

    This uses a greedy longest-match-first algorithm to perform tokenization
    using the given vocabulary.

    For example:
      input = "unaffable"
      output = ["un", "##aff", "##able"]

    Args:
      text: A single token or whitespace separated tokens. This should have
        already been passed through `BasicTokenizer.

    Returns:
      A list of wordpiece tokens.
    """

    text = convert_to_unicode(text)

    output_tokens = []
    for token in whitespace_tokenize(text):
      chars = list(token)
      if len(chars) > self.max_input_chars_per_word:
        output_tokens.append(self.unk_token)
        continue

      is_bad = False
      start = 0
      sub_tokens = []
      while start < len(chars):
        end = len(chars)
        cur_substr = None
        while start < end:
          substr = "".join(chars[start:end])
          if start > 0:
            substr = "##" + substr
          if substr in self.vocab:
            cur_substr = substr
            break
          end -= 1
        if cur_substr is None:
          is_bad = True
          break
        sub_tokens.append(cur_substr)
        start = end

      if is_bad:
        output_tokens.append(self.unk_token)
      else:
        output_tokens.extend(sub_tokens)
    return output_tokens


def _is_whitespace(char):
  """Checks whether `chars` is a whitespace character."""
  # \t, \n, and \r are technically control characters but we treat them
  # as whitespace since they are generally considered as such.
  if char == " " or char == "\t" or char == "\n" or char == "\r":
    return True
  cat = unicodedata.category(char)
  if cat == "Zs":
    return True
  return False


def _is_control(char):
  """Checks whether `chars` is a control character."""
  # These are technically control characters but we count them as whitespace
  # characters.
  if char == "\t" or char == "\n" or char == "\r":
    return False
  cat = unicodedata.category(char)
  if cat in ("Cc", "Cf"):
    return True
  return False


def _is_punctuation(char):
  """Checks whether `chars` is a punctuation character."""
  cp = ord(char)
  # We treat all non-letter/number ASCII as punctuation.
  # Characters such as "^", "$", and "`" are not in the Unicode
  # Punctuation class but we treat them as punctuation anyways, for
  # consistency.
  if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
      (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
    return True
  cat = unicodedata.category(char)
  if cat.startswith("P"):
    return True
  return False


def preprocess_text(inputs, remove_space=True, lower=False):
  """Preprocesses data by removing extra space and normalize data.

  This method is used together with sentence piece tokenizer and is forked from:
  https://github.com/google-research/google-research/blob/e1f6fa00/albert/tokenization.py

  Args:
    inputs: The input text.
    remove_space: Whether to remove the extra space.
    lower: Whether to lowercase the text.

  Returns:
    The preprocessed text.

  """
  outputs = inputs
  if remove_space:
    outputs = " ".join(inputs.strip().split())

  if six.PY2 and isinstance(outputs, str):
    try:
      outputs = six.ensure_text(outputs, "utf-8")
    except UnicodeDecodeError:
      outputs = six.ensure_text(outputs, "latin-1")

  outputs = unicodedata.normalize("NFKD", outputs)
  outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
  if lower:
    outputs = outputs.lower()

  return outputs


def encode_pieces(sp_model, text, sample=False):
  """Segements text into pieces.

  This method is used together with sentence piece tokenizer and is forked from:
  https://github.com/google-research/google-research/blob/e1f6fa00/albert/tokenization.py


  Args:
    sp_model: A spm.SentencePieceProcessor object.
    text: The input text to be segemented.
    sample: Whether to randomly sample a segmentation output or return a
      deterministic one.

  Returns:
    A list of token pieces.
  """
  if six.PY2 and isinstance(text, six.text_type):
    text = six.ensure_binary(text, "utf-8")

  if not sample:
    pieces = sp_model.EncodeAsPieces(text)
  else:
    pieces = sp_model.SampleEncodeAsPieces(text, 64, 0.1)
  new_pieces = []
  for piece in pieces:
    piece = printable_text(piece)
    if len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit():
      cur_pieces = sp_model.EncodeAsPieces(piece[:-1].replace(
          SPIECE_UNDERLINE, ""))
      if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
        if len(cur_pieces[0]) == 1:
          cur_pieces = cur_pieces[1:]
        else:
          cur_pieces[0] = cur_pieces[0][1:]
      cur_pieces.append(piece[-1])
      new_pieces.extend(cur_pieces)
    else:
      new_pieces.append(piece)

  return new_pieces


def encode_ids(sp_model, text, sample=False):
  """Segments text and return token ids.

  This method is used together with sentence piece tokenizer and is forked from:
  https://github.com/google-research/google-research/blob/e1f6fa00/albert/tokenization.py

  Args:
    sp_model: A spm.SentencePieceProcessor object.
    text: The input text to be segemented.
    sample: Whether to randomly sample a segmentation output or return a
      deterministic one.

  Returns:
    A list of token ids.
  """
  pieces = encode_pieces(sp_model, text, sample=sample)
  ids = [sp_model.PieceToId(piece) for piece in pieces]
  return ids

## Initialize the pretrained BERT model (uncased) and the respective tokenizer.

### NOTE: We might need to tokenize and encode everything before running the model.

### Get BERT embeddings

In [23]:
#import tokenization
import tensorflow_hub as hub

def tf_tokenizer():
	m_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"
	bert_layer = hub.KerasLayer(m_url, trainable=False)

	vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
	do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
	tokenizer = FullTokenizer(vocab_file, do_lower_case)
	return tokenizer

In [24]:
from transformers import BatchEncoding

def tf_bert_tokenize(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    all_tokenized_data = []
    
    for text in texts:
      current_tokenized_data = {}
      special_tokens=True
      
      text = tokenizer.tokenize(text)
        
      text = text[:max_len-2]
      input_sequence = "[CLS] " + text + " [SEP]"
      pad_len = max_len-len(input_sequence)
      
      tokens = tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len
      pad_masks = [1] * len(input_sequence) + [0] * pad_len
      segment_ids = [0] * max_len
      
      current_tokenized_data['input_ids'] = torch.Tensor([tokens]).long()
      current_tokenized_data['attention_mask'] = torch.Tensor([pad_masks]).long()
      current_tokenized_data['token_type_ids'] = torch.Tensor([segment_ids]).long()
      
      all_tokenized_data.append(BatchEncoding(current_tokenized_data))
      
      # all_tokens.append(tokens)
      # all_masks.append(pad_masks)
      # all_segments.append(segment_ids)
        
        
    # return np.array(all_tokens), np.array(all_masks), np.array(all_segments)
    return all_tokenized_data

### Define the BertTokenizer and the BertModel from the transformers library.

In [25]:
# --- Subject & object markup ---
SUB_START_CHAR = "<e1>"
SUB_END_CHAR = "</e1>"
OBJ_START_CHAR = "<e2>"
OBJ_END_CHAR = "</e2>"

added_special_token = [SUB_START_CHAR, SUB_END_CHAR, OBJ_START_CHAR, OBJ_END_CHAR]

In [33]:
# Define the BertModel and the BertTokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', model_max_length=50, padding_side='right', additional_special_tokens = added_special_token)
bert_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
bert_model.resize_token_embeddings(len(bert_tokenizer))
#bert_model = bert_model.to(device)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Encode the inputs and store them so that we don't have re-encode everytime we run the model.

In [34]:
X_train_subset = x_train[:500]
y_train_subset = y_train_classes[:500]

In [60]:
print(X_train_subset[0])

An <e1> invoice </e1> is a commercial document indicating the <e2> products </e2> , quantities , and agreed prices for products or services the seller has provided the buyer .


In [41]:
tf_bert_tokenizer = tf_tokenizer()

In [39]:
# Define model parameters.
seeds = [0]   # Change the actual seed value here.
batch_size = 16
epochs = 10
optimizer_name = "Adam" # DP-SGD, DP-Adam, Adam, SGD
learning_rate = 0.001
load_epochs = epochs - 5
make_private = False
EPSILON = 4
DELTA = (1/500)
#DELTA = 1e-5
MAX_GRAD_NORM = 1.0
NOISE_MULTIPLIER = 1.5
sequence_max_length = 50

In [38]:
class SemevalDataset(Dataset):
    def __init__(self, tokenized_data, labels):
        self.tokenized_data = tokenized_data
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        class_label = self.labels[idx]
        tokens_info = self.tokenized_data[idx]
        return [tokens_info, class_label]

In [42]:
print("Encoding training data.")
all_train_tokens = []
for batch in tqdm(range(0, len(X_train_subset), batch_size)):
  sentence_batch = X_train_subset[batch:batch+batch_size]
  
  # Tokenize the data.
  # train_tokens = bert_tokenize(sentence_batch, bert_tokenizer)
  train_tokens = tf_bert_tokenize(sentence_batch, tf_bert_tokenizer, max_len=sequence_max_length)
  #print("Train tokens: ", train_tokens)
  #print("Type train tokens: ", type(train_tokens))
  all_train_tokens.extend(train_tokens)
  # Get bert embeddings for the data.
print("Training data encoding complete.", flush=True)

Encoding training data.


 12%|█▎        | 4/32 [00:00<00:00, 37.14it/s]

Train tokens:  [{'input_ids': tensor([[  101,  2019,  1026,  1041,  2487,  1028,  1999,  6767,  6610,  1026,
          1013,  1041,  2487,  1028,  2003,  1037,  3293,  6254,  8131,  1996,
          1026,  1041,  2475,  1028,  3688,  1026,  1013,  1041,  2475,  1028,
          1010, 12450,  1010,  1998,  3530,  7597,  2005,  3688,  2030,  2578,
          1996, 14939,  2038,  3024,  1996, 17634,  1012,   102,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         0, 0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]])}, {'input_ids': tensor([[  101,  2023,  2738, 13576,  3861,  2003,  1997,  1037,  1026,  1041,
          2487,  1028,  1059, 11961,  2140,  1026,  1013,  1041,  2487,  1028,
   

 38%|███▊      | 12/32 [00:00<00:00, 33.84it/s]

[{'input_ids': tensor([[  101,  2009,  2001,  1996,  1026,  1041,  2487,  1028,  6215,  1026,
          1013,  1041,  2487,  1028,  1997,  1996,  2751,  3115,  2011,  1996,
          2231,  2008,  3303,  1996, 15741,  1026,  1041,  2475,  1028,  7859,
          1026,  1013,  1041,  2475,  1028,  1999,  1996,  2088,  4610,  1012,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]])}, {'input_ids': tensor([[  101, 10390,  2024,  9720,  2000,  4638,  2037, 18097,  2015,  1998,
         13184,  2869,  2005, 12486,  1026,  1041,  2487,  1028,  3688,  1026,
          1013,  1

 62%|██████▎   | 20/32 [00:00<00:00, 32.52it/s]

[{'input_ids': tensor([[  101,  6623,  1026,  1041,  2487,  1028,  2162,  3215,  1026,  1013,
          1041,  2487,  1028,  2006,  1996,  2398,  1998,  2519,  2024,  3303,
          2011,  1026,  1041,  2475,  1028, 18191,  1026,  1013,  1041,  2475,
          1028,  1010,  2029,  2069,  7461,  4286,  1012,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]])}, {'input_ids': tensor([[  101,  1996,  3145,  2000,  7093,  1005,  1055,  2709,  2000,  2433,
          2003,  2383,  6266,  4153,  1006, 18856,  7828,  3240,  1007,  1998,
          2010, 12

 75%|███████▌  | 24/32 [00:00<00:00, 31.92it/s]

Train tokens:  [{'input_ids': tensor([[ 101, 1996, 1026, 1041, 2487, 1028, 7267, 1026, 1013, 1041, 2487, 1028,
         2038, 2042, 2207, 2046, 2014, 2047, 1026, 1041, 2475, 1028, 2155, 1026,
         1013, 1041, 2475, 1028, 1012,  102,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]])}, {'input_ids': tensor([[  101,  1996,  1026,  1041,  2487,  1028,  4306,  1026,  1013,  1041,
          2487,  1028, 18280,  2015,  2013,  1996,  1026,  1041,  2475,  1028,
          3007,  1026,  1013,  1041,  2475,  1028,  2

100%|██████████| 32/32 [00:00<00:00, 33.73it/s]

[{'input_ids': tensor([[ 101, 1996, 1023, 1013, 2340, 1026, 1041, 2487, 1028, 9738, 1026, 1013,
         1041, 2487, 1028, 3495, 5520, 2046, 1996, 3274, 1026, 1041, 2475, 1028,
         4734, 1026, 1013, 1041, 2475, 1028, 1012,  102,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]])}, {'input_ids': tensor([[  101,  1996,  1026,  1041,  2487,  1028,  8312,  1026,  1013,  1041,
          2487,  1028,  1997,  1996,  2951,  2003, 15685,  1999,  2019,  1026,
          1041,  2475,  1028,  4767,  1026,  1013,  1041,  2475,  10




In [61]:
print(all_train_tokens[0])

{'input_ids': tensor([[  101,  2019,  1026,  1041,  2487,  1028,  1999,  6767,  6610,  1026,
          1013,  1041,  2487,  1028,  2003,  1037,  3293,  6254,  8131,  1996,
          1026,  1041,  2475,  1028,  3688,  1026,  1013,  1041,  2475,  1028,
          1010, 12450,  1010,  1998,  3530,  7597,  2005,  3688,  2030,  2578,
          1996, 14939,  2038,  3024,  1996, 17634,  1012,   102,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         0, 0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]])}


In [43]:
print("Creating custom dataset", flush=True)
train_dataset = SemevalDataset(all_train_tokens, y_train_subset)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

Creating custom dataset


## Model definition and training




### Implement the model

In [44]:
!pip install opacus

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting opacus
  Using cached opacus-1.3.0-py3-none-any.whl (216 kB)
Collecting functorch
  Using cached functorch-1.13.0-py2.py3-none-any.whl (2.1 kB)
Collecting torch>=1.8
  Downloading torch-1.13.0-cp37-cp37m-manylinux1_x86_64.whl (890.2 MB)
[K     |██████████████████████████████  | 834.1 MB 1.3 MB/s eta 0:00:44tcmalloc: large alloc 1147494400 bytes == 0x3abf6000 @  0x7f6aa0f8a615 0x58ead6 0x4f355e 0x4d222f 0x51041f 0x5b4ee6 0x58ff2e 0x510325 0x5b4ee6 0x58ff2e 0x50d482 0x4d00fb 0x50cb8d 0x4d00fb 0x50cb8d 0x4d00fb 0x50cb8d 0x4bac0a 0x538a76 0x590ae5 0x510280 0x5b4ee6 0x58ff2e 0x50d482 0x5b4ee6 0x58ff2e 0x50c4fc 0x58fd37 0x50ca37 0x5b4ee6 0x58ff2e
[K     |████████████████████████████████| 890.2 MB 5.1 kB/s 
[?25hCollecting nvidia-cuda-runtime-cu11==11.7.99
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)
[K     |███████████████████████████████

In [45]:
from opacus import PrivacyEngine
from opacus.layers import dp_rnn

In [46]:
class erin_model(nn.Module):
    def __init__(self, in_size=768, hidden_size: int = 1, num_relations: int = 19, sequence_length:int = 50, private=False):
        super(erin_model,self).__init__()
        
        # Just add one LSTM unit as the model followed by a fully connected layer and then a softmax.
        if private:
            self.lstm = dp_rnn.DPLSTM(input_size=in_size, hidden_size=hidden_size, num_layers=1, batch_first=True)
        else:
            self.lstm = nn.LSTM(input_size=in_size, hidden_size=hidden_size, num_layers=1, batch_first=True)
        self.fc = nn.Linear(sequence_length*hidden_size, num_relations)
        print("Private or non-private....: ", private)

    def forward(self, x):
        # First get the bert embeddings.
        # Then do the forward pass.
        x, (h_n, c_n) = self.lstm(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        output = F.softmax(x, 1)
        return output

In [47]:
def reformat(data, batch_size):
    reformated_data = []
    for i in range(batch_size):
        temp_formated_data_dict = {}
        temp_formated_data_dict['input_ids'] = torch.Tensor(data['input_ids'].numpy()[i]).long()
        temp_formated_data_dict['attention_mask'] = torch.Tensor(data['attention_mask'].numpy()[i]).long()
        temp_formated_data_dict['token_type_ids'] = torch.Tensor(data['token_type_ids'].numpy()[i]).long()
        reformated_data.append(BatchEncoding(temp_formated_data_dict))
    return reformated_data

def get_bert_embeds_from_tokens(bert_model, encoded_inputs):
    all_bert_embeds = []
    bert_model = bert_model.to(device)  # Put the bert_model on the GPU.
    for i in tqdm(range(len(encoded_inputs))):
        encoded_input = encoded_inputs[i]
        encoded_input = encoded_input.to(device)  # Put the encoded input on the GPU.
        # print("encoded input: ", type(encoded_input))
        with torch.no_grad():
            outputs = bert_model(**encoded_input)

            # Getting embeddings from the final BERT layer
            #print(outputs.keys())
            token_embeddings = outputs[0]
            token_embeddings = torch.squeeze(token_embeddings, dim=0).cpu().detach()
        all_bert_embeds.append(token_embeddings) 
        encoded_input.to('cpu')
    all_bert_embeds = [t.numpy() for t in all_bert_embeds]
    return all_bert_embeds

**testing semeval**

In [48]:
device

'cpu'

### Convert numpy to PyTorch TensorDataset and then into DataLoader

In [49]:
model = erin_model(sequence_length=sequence_max_length, private=True) # Using default model dimensions.

model = model.to(device)  # Make sure you have this before loading an existing model.

if optimizer_name == 'RMSProp':
    optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)
elif optimizer_name == 'Adam':
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
elif optimizer_name == 'SGD':
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

criterion = nn.CrossEntropyLoss()

Private or non-private....:  True


In [50]:
print("Starting model training.", flush=True)
epoch_losses = []
for epoch in range(load_epochs, epochs):
    print("Epoch: ", epoch)
    running_loss = 0.0
    for batch_index, data in enumerate(train_dataloader):
        inputs, batch_y_train_classes = data
        # print("Inputs from private dataloader: ", inputs)
        inputs_size = inputs['input_ids'].size(0)
        # print("Inputs batch size", inputs_size)
        inputs = reformat(inputs, inputs_size)  # Reformat data for the custom dataset.
        last_hidden_states_train = get_bert_embeds_from_tokens(bert_model, inputs)

        inputs_tensor = torch.Tensor(last_hidden_states_train)
        batch_labels_tensor = torch.Tensor(batch_y_train_classes)
        
        # Put the batched data on the gpu.
        inputs_tensor = inputs_tensor.to(device)
        batch_labels_tensor = batch_labels_tensor.type(torch.LongTensor)
        batch_labels_tensor = batch_labels_tensor.to(device)
        
        optimizer.zero_grad()

        # # Forward pass.
        outputs = model(inputs_tensor)            
        # print("outputs size: ", outputs.size())
        
        # Calculate loss.
        loss = criterion(outputs, batch_labels_tensor)
        
        # Calculate gradients.
        loss.backward()

        # Update weights.
        optimizer.step()

        # Calculate loss for debugging.
        training_loss = loss.item()
        running_loss += training_loss
        # if i % 1000 == 999:
        # print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / len(inputs) :.3f}')
        if batch_index % 100 == 0:
            batch_loss = training_loss / inputs_size
            print(f"Batch loss at batch {batch_index}: ", training_loss / inputs_size, flush=True)
            # log_metric("Batch losses", batch_loss)
    epoch_loss = running_loss / len(all_train_tokens)
    epoch_losses.append(epoch_loss)
    print("Epoch loss", epoch_loss)
    if make_private:
        # Log the epsilon value.
        #log_metric("Epsilon budget per epoch", privacy_engine.get_epsilon(DELTA))
        print(f"Epoch {epoch + 1} loss : ", epoch_loss, flush=True)
print("All epoch losses: ", epoch_losses)
print("Finished model training.")

Starting model training.
Epoch:  5


100%|██████████| 16/16 [00:05<00:00,  3.11it/s]

Batch loss at batch 0:  0.18393051624298096



  
100%|██████████| 16/16 [00:03<00:00,  4.48it/s]
100%|██████████| 16/16 [00:07<00:00,  2.27it/s]
100%|██████████| 16/16 [00:05<00:00,  2.93it/s]
100%|██████████| 16/16 [00:03<00:00,  4.44it/s]
100%|██████████| 16/16 [00:03<00:00,  4.36it/s]
100%|██████████| 16/16 [00:05<00:00,  3.05it/s]
100%|██████████| 16/16 [00:04<00:00,  3.33it/s]
100%|██████████| 16/16 [00:03<00:00,  4.41it/s]
100%|██████████| 16/16 [00:03<00:00,  4.36it/s]
100%|██████████| 16/16 [00:03<00:00,  4.45it/s]
100%|██████████| 16/16 [00:03<00:00,  4.43it/s]
100%|██████████| 16/16 [00:03<00:00,  4.51it/s]
100%|██████████| 16/16 [00:04<00:00,  3.66it/s]
100%|██████████| 16/16 [00:03<00:00,  4.47it/s]
100%|██████████| 16/16 [00:03<00:00,  4.54it/s]
100%|██████████| 16/16 [00:03<00:00,  4.45it/s]
100%|██████████| 16/16 [00:03<00:00,  4.45it/s]
100%|██████████| 16/16 [00:03<00:00,  4.42it/s]
100%|██████████| 16/16 [00:04<00:00,  3.97it/s]
100%|██████████| 16/16 [00:05<00:00,  2.81it/s]
100%|██████████| 16/16 [00:03<00:00,

Epoch loss 0.18834571933746339
Epoch:  6


100%|██████████| 16/16 [00:03<00:00,  4.49it/s]

Batch loss at batch 0:  0.1837039291858673



100%|██████████| 16/16 [00:03<00:00,  4.54it/s]
100%|██████████| 16/16 [00:04<00:00,  3.44it/s]
100%|██████████| 16/16 [00:05<00:00,  3.14it/s]
100%|██████████| 16/16 [00:04<00:00,  3.64it/s]
100%|██████████| 16/16 [00:07<00:00,  2.17it/s]
100%|██████████| 16/16 [00:07<00:00,  2.28it/s]
100%|██████████| 16/16 [00:07<00:00,  2.16it/s]
100%|██████████| 16/16 [00:03<00:00,  4.23it/s]
100%|██████████| 16/16 [00:03<00:00,  4.55it/s]
100%|██████████| 16/16 [00:03<00:00,  4.55it/s]
100%|██████████| 16/16 [00:03<00:00,  4.56it/s]
100%|██████████| 16/16 [00:03<00:00,  4.10it/s]
100%|██████████| 16/16 [00:05<00:00,  2.86it/s]
100%|██████████| 16/16 [00:04<00:00,  3.45it/s]
100%|██████████| 16/16 [00:05<00:00,  3.03it/s]
100%|██████████| 16/16 [00:04<00:00,  3.81it/s]
100%|██████████| 16/16 [00:03<00:00,  4.47it/s]
100%|██████████| 16/16 [00:03<00:00,  4.49it/s]
100%|██████████| 16/16 [00:03<00:00,  4.55it/s]
100%|██████████| 16/16 [00:03<00:00,  4.50it/s]
100%|██████████| 16/16 [00:03<00:00,  4

Epoch loss 0.18809181308746337
Epoch:  7


100%|██████████| 16/16 [00:03<00:00,  4.59it/s]

Batch loss at batch 0:  0.1835424154996872



100%|██████████| 16/16 [00:03<00:00,  4.59it/s]
100%|██████████| 16/16 [00:03<00:00,  4.58it/s]
100%|██████████| 16/16 [00:03<00:00,  4.58it/s]
100%|██████████| 16/16 [00:03<00:00,  4.50it/s]
100%|██████████| 16/16 [00:03<00:00,  4.61it/s]
100%|██████████| 16/16 [00:03<00:00,  4.53it/s]
100%|██████████| 16/16 [00:03<00:00,  4.55it/s]
100%|██████████| 16/16 [00:04<00:00,  3.47it/s]
100%|██████████| 16/16 [00:05<00:00,  3.10it/s]
100%|██████████| 16/16 [00:03<00:00,  4.55it/s]
100%|██████████| 16/16 [00:03<00:00,  4.46it/s]
100%|██████████| 16/16 [00:03<00:00,  4.45it/s]
100%|██████████| 16/16 [00:03<00:00,  4.58it/s]
100%|██████████| 16/16 [00:03<00:00,  4.55it/s]
100%|██████████| 16/16 [00:03<00:00,  4.47it/s]
100%|██████████| 16/16 [00:03<00:00,  4.49it/s]
100%|██████████| 16/16 [00:03<00:00,  4.51it/s]
100%|██████████| 16/16 [00:03<00:00,  4.53it/s]
100%|██████████| 16/16 [00:03<00:00,  4.50it/s]
100%|██████████| 16/16 [00:03<00:00,  4.54it/s]
100%|██████████| 16/16 [00:03<00:00,  4

Epoch loss 0.18747095012664794
Epoch:  8


100%|██████████| 16/16 [00:03<00:00,  4.46it/s]

Batch loss at batch 0:  0.18316638469696045



100%|██████████| 16/16 [00:03<00:00,  4.46it/s]
100%|██████████| 16/16 [00:03<00:00,  4.50it/s]
100%|██████████| 16/16 [00:03<00:00,  4.45it/s]
100%|██████████| 16/16 [00:03<00:00,  4.46it/s]
100%|██████████| 16/16 [00:05<00:00,  3.08it/s]
100%|██████████| 16/16 [00:04<00:00,  3.45it/s]
100%|██████████| 16/16 [00:03<00:00,  4.46it/s]
100%|██████████| 16/16 [00:04<00:00,  3.70it/s]
100%|██████████| 16/16 [00:05<00:00,  2.88it/s]
100%|██████████| 16/16 [00:03<00:00,  4.48it/s]
100%|██████████| 16/16 [00:03<00:00,  4.48it/s]
100%|██████████| 16/16 [00:03<00:00,  4.45it/s]
100%|██████████| 16/16 [00:03<00:00,  4.45it/s]
100%|██████████| 16/16 [00:03<00:00,  4.56it/s]
100%|██████████| 16/16 [00:03<00:00,  4.51it/s]
100%|██████████| 16/16 [00:03<00:00,  4.57it/s]
100%|██████████| 16/16 [00:03<00:00,  4.47it/s]
100%|██████████| 16/16 [00:04<00:00,  3.63it/s]
100%|██████████| 16/16 [00:05<00:00,  3.00it/s]
100%|██████████| 16/16 [00:03<00:00,  4.52it/s]
100%|██████████| 16/16 [00:03<00:00,  4

Epoch loss 0.1859609694480896
Epoch:  9


100%|██████████| 16/16 [00:03<00:00,  4.39it/s]

Batch loss at batch 0:  0.18444733321666718



100%|██████████| 16/16 [00:04<00:00,  3.52it/s]
100%|██████████| 16/16 [00:05<00:00,  3.12it/s]
100%|██████████| 16/16 [00:03<00:00,  4.57it/s]
100%|██████████| 16/16 [00:03<00:00,  4.49it/s]
100%|██████████| 16/16 [00:03<00:00,  4.52it/s]
100%|██████████| 16/16 [00:03<00:00,  4.56it/s]
100%|██████████| 16/16 [00:03<00:00,  4.48it/s]
100%|██████████| 16/16 [00:03<00:00,  4.56it/s]
100%|██████████| 16/16 [00:03<00:00,  4.55it/s]
100%|██████████| 16/16 [00:03<00:00,  4.39it/s]
100%|██████████| 16/16 [00:03<00:00,  4.48it/s]
100%|██████████| 16/16 [00:03<00:00,  4.40it/s]
100%|██████████| 16/16 [00:03<00:00,  4.44it/s]
100%|██████████| 16/16 [00:03<00:00,  4.60it/s]
100%|██████████| 16/16 [00:04<00:00,  3.52it/s]
100%|██████████| 16/16 [00:05<00:00,  3.16it/s]
100%|██████████| 16/16 [00:03<00:00,  4.52it/s]
100%|██████████| 16/16 [00:03<00:00,  4.49it/s]
100%|██████████| 16/16 [00:03<00:00,  4.60it/s]
100%|██████████| 16/16 [00:03<00:00,  4.57it/s]
100%|██████████| 16/16 [00:03<00:00,  4

Epoch loss 0.1838971586227417
All epoch losses:  [0.18834571933746339, 0.18809181308746337, 0.18747095012664794, 0.1859609694480896, 0.1838971586227417]
Finished model training.





### Evaluating model performance on test data.

In [51]:
print("Encoding test data.")
all_test_tokens = []
for batch in tqdm(range(0, len(x_test), batch_size)):
  sentence_batch = x_test[batch:batch+batch_size]
  # Tokenize the data.
  # test_tokens = bert_tokenize(sentence_batch, bert_tokenizer)
  test_tokens = tf_bert_tokenize(sentence_batch, tf_bert_tokenizer, max_len=sequence_max_length)
  all_test_tokens.extend(test_tokens)
  # Get bert embeddings for the data.
print("Test data encoding complete.")

Encoding test data.


100%|██████████| 170/170 [00:01<00:00, 102.84it/s]

Test data encoding complete.





In [52]:
test_dataset = SemevalDataset(all_test_tokens, y_test)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [53]:
print("Testing on test data.")
all_predictions = []
all_test_labels = []
with torch.no_grad():
    total = 0.0
    correct = 0.0
    for batch_index in enumerate(test_dataloader):
        test_inputs, batch_y_test_classes = data
        test_inputs_size = test_inputs['input_ids'].size(0)
        test_inputs = reformat(test_inputs, test_inputs_size)  # Reformat data for the custom dataset.
        last_hidden_states_test = get_bert_embeds_from_tokens(bert_model, test_inputs)
        
        inputs_tensor_test = torch.Tensor(last_hidden_states_test)
        batch_labels_tensor_test = torch.Tensor(batch_y_test_classes)
        
        # Put the batched data on the gpu.
        inputs_tensor_test = inputs_tensor_test.to(device)
        batch_labels_tensor_test = batch_labels_tensor_test.type(torch.LongTensor)
        batch_labels_tensor_test = batch_labels_tensor_test.to(device)
        
        test_outputs = model(inputs_tensor_test)
        # The class with the highest energy is what we choose as prediction
        _, predicted = torch.max(test_outputs.data, 1)
        total += batch_labels_tensor_test.size(0)
        correct += (predicted == batch_labels_tensor_test).sum().item()
    
        all_predictions.extend(predicted.cpu().int().numpy())
        all_test_labels.extend(batch_labels_tensor_test.cpu().int().numpy())
    # print("All predictions: ", all_predictions)
    # print("All test labels: ", all_test_labels)
    # Calculate test accuracy and F1 here.
    f1 = f1_score(all_predictions, all_test_labels, average='macro')
    test_accuracy = 100 * correct / total
   

Testing on test data.


100%|██████████| 4/4 [00:01<00:00,  3.28it/s]
100%|██████████| 4/4 [00:01<00:00,  2.30it/s]
100%|██████████| 4/4 [00:01<00:00,  2.73it/s]
100%|██████████| 4/4 [00:01<00:00,  2.78it/s]
100%|██████████| 4/4 [00:01<00:00,  2.75it/s]
100%|██████████| 4/4 [00:01<00:00,  2.80it/s]
100%|██████████| 4/4 [00:01<00:00,  2.78it/s]
100%|██████████| 4/4 [00:01<00:00,  3.04it/s]
100%|██████████| 4/4 [00:00<00:00,  4.62it/s]
100%|██████████| 4/4 [00:00<00:00,  4.52it/s]
100%|██████████| 4/4 [00:00<00:00,  4.57it/s]
100%|██████████| 4/4 [00:00<00:00,  4.51it/s]
100%|██████████| 4/4 [00:00<00:00,  4.51it/s]
100%|██████████| 4/4 [00:00<00:00,  4.60it/s]
100%|██████████| 4/4 [00:00<00:00,  4.42it/s]
100%|██████████| 4/4 [00:00<00:00,  4.50it/s]
100%|██████████| 4/4 [00:00<00:00,  4.51it/s]
100%|██████████| 4/4 [00:00<00:00,  4.54it/s]
100%|██████████| 4/4 [00:00<00:00,  4.57it/s]
100%|██████████| 4/4 [00:00<00:00,  4.55it/s]
100%|██████████| 4/4 [00:00<00:00,  4.55it/s]
100%|██████████| 4/4 [00:00<00:00,

NameError: ignored

In [56]:
print(all_predictions)

[18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18, 18, 18, 6, 18

In [57]:
print(all_test_labels)

[0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17, 8, 18, 0, 17

In [58]:
correct

170.0

In [59]:
total

680.0

In [54]:
print(f'Test accuracy for seed {0}: {100 * correct / total} %')
print(f"Test f1 is: {f1}")

Test accuracy for seed 0: 25.0 %
Test f1 is: 0.1




Accuracy of the network on the 10000 test images: 22 %
