In [3]:
from collections import defaultdict
from urllib import request
import json
import pandas as pd
from math import floor, ceil, log10
import os
from glob import glob
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertModel
import random

In [4]:
#Seed every random number generator the notebook relies on so runs are repeatable
seed = 28

for _seed_rng in (
    random.seed,                 #Python standard library
    np.random.seed,              #NumPy global generator
    torch.manual_seed,           #PyTorch
    torch.cuda.manual_seed,      #PyTorch, current CUDA device
    torch.cuda.manual_seed_all,  #PyTorch, every CUDA device
):
    _seed_rng(seed)

In [5]:
def parse_conllu_using_pandas(block):
    """Parse the text of one sentence block into a DataFrame of token rows.

    Args:
        block: string with one token per line, fields separated by tabs in
            the order ID, FORM, TAG, Misc1, Misc2. Lines starting with '#'
            are treated as comments and skipped.

    Returns:
        pandas.DataFrame with the columns ID, FORM, TAG, Misc1 and Misc2,
        one row per token line.
    """
    records = []
    for line in block.splitlines():
        if line.startswith('#'):
            continue
        fields = line.strip()
        #a blank or whitespace-only line (stray "\r", extra newline) is not a
        #token; without this guard it would become a row whose FORM/TAG are missing
        if not fields:
            continue
        records.append(fields.split('\t'))
    return pd.DataFrame.from_records(
        records,
        columns=['ID', 'FORM', 'TAG', 'Misc1', 'Misc2'])

In [6]:
def tokens_to_labels(df):
    """Return the FORM column and the TAG column of df as two parallel lists."""
    tokens = df['FORM'].tolist()
    tags = df['TAG'].tolist()
    return tokens, tags

In [7]:
#Common prefix of every download URL; the suffixes below are appended to it
PREFIX = "https://raw.githubusercontent.com/UniversalNER/"

#corpus name -> split name -> URL suffix of the .iob2 file
DATA_URLS = dict(
    en_ewt=dict(
        train="UNER_English-EWT/master/en_ewt-ud-train.iob2",
        dev="UNER_English-EWT/master/en_ewt-ud-dev.iob2",
        test="UNER_English-EWT/master/en_ewt-ud-test.iob2",
    ),
    en_pud=dict(
        test="UNER_English-PUD/master/en_pud-ud-test.iob2",
    ),
)

In [8]:
# en_ewt is the main train-dev-test split
# en_pud is the OOD test set
#
# Splits already present in ner_data_dict.json (the file this notebook writes
# after downloading) are read from disk; only missing splits are downloaded.
# Delete that file to force a fresh download.
cache_path = 'ner_data_dict.json'
cached_data = {}
if os.path.exists(cache_path):
    try:
        with open(cache_path, encoding='utf-8') as cache_file:
            cached_data = json.load(cache_file)
    except ValueError:
        # unreadable / half-written cache: ignore it and download everything
        cached_data = {}

data_dict = defaultdict(dict)
for corpus, split_dict in DATA_URLS.items():
    for split, url_suffix in split_dict.items():
        cached_split = cached_data.get(corpus, {}).get(split)
        if cached_split is not None:
            # JSON stores the (tokens, labels) tuples as lists; restore tuples
            data_dict[corpus][split] = [tuple(pair) for pair in cached_split]
            continue

        url = PREFIX + url_suffix
        with request.urlopen(url) as response:
            txt = response.read().decode('utf-8')
        # parse after the response is closed: sentences are separated by blank lines
        data_frames = map(parse_conllu_using_pandas,
                          txt.strip().split('\n\n'))
        token_label_alignments = list(map(tokens_to_labels,
                                          data_frames))
        data_dict[corpus][split] = token_label_alignments


In [9]:
# Write the parsed corpora to ner_data_dict.json so the data is available on disk.
with open('ner_data_dict.json', mode='w', encoding='utf-8') as cache_file:
    json.dump(
        data_dict,
        cache_file,
        indent=2,
        ensure_ascii=False,
    )

In [None]:
# Each split of each corpus is a list of (tokens, labels) pairs: a list of
# token strings and the list of tags of the same sentence.

# Train on data_dict['en_ewt']['train'], validate on data_dict['en_ewt']['dev']
# and test on data_dict['en_ewt']['test'] and data_dict['en_pud']['test'].
# Displayed here: the first en_ewt training pair and the second en_pud test pair.
data_dict['en_ewt']['train'][0], data_dict['en_pud']['test'][1]

In [None]:
#Show only the first few training pairs; displaying the whole split floods the notebook output
data_dict['en_ewt']['train'][:5]

In [12]:
#Separate every split into two parallel lists: token sequences and tag sequences.
#Each element of a split is a (tokens, labels) pair, so index 0 is the tokens
#and index 1 is the labels.

#training split
train_sentences = [pair[0] for pair in data_dict['en_ewt']['train']]
train_labels = [pair[1] for pair in data_dict['en_ewt']['train']]

#dev split
dev_sentences = [pair[0] for pair in data_dict['en_ewt']['dev']]
dev_labels = [pair[1] for pair in data_dict['en_ewt']['dev']]

#test split of en_ewt
test_sentences1 = [pair[0] for pair in data_dict['en_ewt']['test']]
test_labels1 = [pair[1] for pair in data_dict['en_ewt']['test']]

#test split of en_pud
test_sentences2 = [pair[0] for pair in data_dict['en_pud']['test']]
test_labels2 = [pair[1] for pair in data_dict['en_pud']['test']]

#one combined test set: en_ewt examples first, en_pud examples after them
test_sentences = [*test_sentences1, *test_sentences2]
test_labels = [*test_labels1, *test_labels2]

In [13]:
#the label dictionaries for tag conversion

#mapping BIO tag strings to numeric ids for training
label_to_number = {'B-LOC': 1,
                   'I-LOC': 2,
                   'B-PER': 3,
                   'I-PER': 4,
                   'B-ORG': 5,
                   'I-ORG': 6,
                   'O': 0}

#reverse mapping: numeric id back to a tag string.
#NOTE(review): id 0 maps to the digit string '0' here, while label_to_number uses
#the letter 'O'. get_spans and the label_names lists used for the classification
#reports compare against '0', so this value must not be changed in isolation.
number_to_label = {1: 'B-LOC',
                   2: 'I-LOC',
                   3: 'B-PER',
                   4: 'I-PER',
                   5: 'B-ORG',
                   6: 'I-ORG',
                   0: '0'}


In [38]:
#Turn every tag string into its numeric id (lookup in label_to_number).
#A tag that is missing from label_to_number raises KeyError.

#training tag sequences
train_label_to_number = [
    [label_to_number[tag] for tag in tag_sequence]
    for tag_sequence in train_labels
]

#dev tag sequences
dev_label_to_number = [
    [label_to_number[tag] for tag in tag_sequence]
    for tag_sequence in dev_labels
]

#test tag sequences
test_label_to_number = [
    [label_to_number[tag] for tag in tag_sequence]
    for tag_sequence in test_labels
]

In [15]:
#Build the vocabulary and convert sentences to token ids

#id reserved for padding
pad_token = 0

#real words are numbered from 1 because 0 is taken by the pad token
idx = 1

#word -> id, filled from the train, test and dev sentences in that order;
#a word keeps the id it received the first time it was seen
word_vocabulary = {'<PAD>': pad_token}
for sentence in train_sentences + test_sentences + dev_sentences:
  for token in sentence:
    if token in word_vocabulary:
      continue
    word_vocabulary[token] = idx
    idx += 1

#training sentences as lists of word ids
train_word_to_number = [
    [word_vocabulary[token] for token in sentence]
    for sentence in train_sentences
]

#dev sentences as lists of word ids; a word missing from the vocabulary gets the pad id
dev_word_to_number = [
    [word_vocabulary.get(token, pad_token) for token in sentence]
    for sentence in dev_sentences
]

#test sentences, same treatment as dev
test_word_to_number = [
    [word_vocabulary.get(token, pad_token) for token in sentence]
    for sentence in test_sentences
]

In [None]:
#Names of the different lists for clarity.
#Only their sizes are displayed: evaluating the bare names would dump every
#sentence of the last list into the notebook output.

#each labels-to-numbers list must stay parallel to its words-to-numbers list
assert len(train_label_to_number) == len(train_word_to_number)
assert len(dev_label_to_number) == len(dev_word_to_number)
assert len(test_label_to_number) == len(test_word_to_number)

{
    #labels to numbers
    'train_label_to_number': len(train_label_to_number),
    'dev_label_to_number': len(dev_label_to_number),
    'test_label_to_number': len(test_label_to_number),
    #words to numbers
    'train_word_to_number': len(train_word_to_number),
    'dev_word_to_number': len(dev_word_to_number),
    'test_word_to_number': len(test_word_to_number),
}

In [39]:
#Bring every sequence to exactly max_length items: shorter ones are padded at
#the end, longer ones are cut. Labels are padded with -100 and word ids with 0.

#Length every sequence is padded or cut to
max_length = 128


def pad_or_truncate(sequence, length, pad_value):
  """Return a new list with exactly `length` items.

  Keeps the first `length` items of `sequence` and, when it is shorter,
  appends `pad_value` until the list is `length` long.
  """
  missing = max(0, length - len(sequence))
  return sequence[:length] + [pad_value] * missing


#PADDING THE LABELS

padded_train_labels = [pad_or_truncate(ids, max_length, -100) for ids in train_label_to_number]
padded_dev_labels = [pad_or_truncate(ids, max_length, -100) for ids in dev_label_to_number]
padded_test_labels = [pad_or_truncate(ids, max_length, -100) for ids in test_label_to_number]


#PADDING THE SENTENCES

padded_train_sentences = [pad_or_truncate(ids, max_length, 0) for ids in train_word_to_number]
padded_dev_sentences = [pad_or_truncate(ids, max_length, 0) for ids in dev_word_to_number]
padded_test_sentences = [pad_or_truncate(ids, max_length, 0) for ids in test_word_to_number]

In [None]:
#All the padded lists.
#Only their sizes are displayed: evaluating the bare names would dump every
#padded sentence of the last list into the notebook output.

_padded_lists = {
    #padded labels
    'padded_train_labels': padded_train_labels,
    'padded_dev_labels': padded_dev_labels,
    'padded_test_labels': padded_test_labels,
    #padded words
    'padded_train_sentences': padded_train_sentences,
    'padded_dev_sentences': padded_dev_sentences,
    'padded_test_sentences': padded_test_sentences,
}

#every padded row must be exactly max_length long
for _name, _rows in _padded_lists.items():
    assert all(len(_row) == max_length for _row in _rows), _name

{_name: len(_rows) for _name, _rows in _padded_lists.items()}

In [17]:
#Rebuild the token strings from the padded id sequences before the training loop

label_pad_id = -100 #id used for padded label positions
token_pad_id = word_vocabulary['<PAD>'] #id used for padded token positions

#inverse vocabulary: token id -> token string
id2word = {token_id: token for token, token_id in word_vocabulary.items()}

#one list of token strings per sentence for train, dev and test (in that order);
#positions holding the pad id are dropped
raw_train_sentences, raw_dev_sentences, raw_test_sentences = (
    [
        [id2word[token_id] for token_id in padded_ids if token_id != token_pad_id]
        for padded_ids in padded_split
    ]
    for padded_split in (padded_train_sentences,
                         padded_dev_sentences,
                         padded_test_sentences)
)


In [18]:
#ENCODER

hidden_dim = 256


class SineEncoder(nn.Module):
    """Token classifier: frozen BERT, a small Transformer encoder, a linear head.

    Args:
        vocab_size: accepted for interface compatibility; not used by the model.
        num_labels: size of the output label space (width of the logits).
        sequence_length: accepted for interface compatibility; not used.
        d_model: stored as `self.d_model` and used as the feed-forward width
            of the added Transformer encoder layers.
        nhead: number of attention heads in the added encoder layers.
        n_layers: number of added encoder layers.
        dropout: dropout rate of the added encoder layers and of the dropout
            applied before the classifier.

    Attributes:
        bertmodel: pretrained 'bert-base-uncased' model with all weights frozen.
        tokenizer: the matching 'bert-base-uncased' tokenizer.
    """

    def __init__(self,
                 vocab_size,
                 num_labels,
                 sequence_length=15,
                 d_model=hidden_dim,
                 nhead=2,
                 n_layers=2,
                 dropout=0.1):
        super().__init__()
        self.d_model = d_model

        #load the pretrained BERT model and its tokenizer
        self.bertmodel = BertModel.from_pretrained('bert-base-uncased')
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        #freeze BERT: only the layers added below receive gradient updates
        for param in self.bertmodel.parameters():
            param.requires_grad = False

        bert_dim = self.bertmodel.config.hidden_size

        #the added layers work directly on BERT's hidden size; head count and
        #feed-forward width come from the constructor arguments
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=bert_dim,
            nhead=nhead,
            dim_feedforward=d_model,
            dropout=dropout,
            batch_first=True)

        #stack the encoder layers; nested tensors are disabled so the output
        #always keeps the padded sequence length of the input
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=n_layers,
            enable_nested_tensor=False)

        self.dropout = nn.Dropout(dropout)

        #linear layer mapping each token embedding to label logits
        self.classifier = nn.Linear(bert_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return label logits for every input position.

        Args:
            input_ids: token ids passed to BERT.
            attention_mask: mask passed to BERT; positions equal to 0 are
                also hidden from the added encoder layers.

        Returns:
            Tensor of logits whose last dimension is `num_labels`.
        """
        outputs = self.bertmodel(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state

        #True marks padding, which the added encoder layers must not attend to
        padding_mask = None if attention_mask is None else (attention_mask == 0)

        contextual_embeddings = self.transformer_encoder(
            embeddings, src_key_padding_mask=padding_mask)
        contextual_embeddings = self.dropout(contextual_embeddings)

        #project to label space
        return self.classifier(contextual_embeddings)



In [19]:
#TRAINING LOOP FOR TRANSFORMER


def train_transformer_epoch(raw_texts, train_labels, encoder, optimiser, loss_fn, device, batch_size=32, max_length=128):
  """Run one training epoch over `raw_texts` in consecutive batches.

  Args:
    raw_texts: list of sentences, each a list of word strings.
    train_labels: list of label-id lists, one per sentence, all padded to the
      same length; that length must equal `max_length` for the loss to work.
    encoder: model called as `encoder(input_ids, attention_mask)` that also
      exposes the tokenizer as `encoder.tokenizer`.
    optimiser: optimiser stepped once per batch.
    loss_fn: loss called with (flattened logits, flattened labels).
    device: device the batch tensors are moved to.
    batch_size: number of sentences per batch.
    max_length: length the tokenizer pads/truncates every sentence to.

  Returns:
    1-D tensor with the loss of every batch.

  NOTE(review): the labels are per word, while the tokenizer output is per
  word piece and presumably starts with a special token, so position i of the
  logits may not correspond to word i -- confirm and align labels to word
  pieces before trusting the loss.
  """
  encoder.train() #model to training mode

  n_steps = ceil(len(raw_texts)/batch_size) #batches in one epoch
  train_losses = torch.zeros(n_steps) #loss of every batch

  for step_n in tqdm(range(n_steps), leave=False):

    #slice out the current batch
    low = step_n * batch_size
    high = low + batch_size
    texts = raw_texts[low:high]

    #tokenise the pre-split words with the BERT tokeniser
    encoding = encoder.tokenizer(texts, is_split_into_words=True, max_length=max_length, padding='max_length', truncation=True, return_tensors='pt')

    #move inputs and labels to the device
    input_ids = encoding.input_ids.to(device)
    attention_mask = encoding.attention_mask.to(device)
    batch_labels = torch.tensor(train_labels[low:high]).to(device)

    optimiser.zero_grad()

    #forward pass
    predictions = encoder(input_ids, attention_mask)

    #flatten batch and sequence dimensions so logits and labels line up for the loss
    predictions = predictions.reshape(-1, predictions.shape[-1])
    batch_labels = batch_labels.reshape(-1)

    loss = loss_fn(predictions, batch_labels)

    #backward pass and parameter update
    loss.backward()
    optimiser.step()

    train_losses[step_n] = loss.item()

  return train_losses




In [20]:
#VALIDATION FOR TRAINING LOOP


def validate_transformer(raw_texts, dev_labels, encoder, loss_fn, device, batch_size=32, max_length=128, verbose=True):
  """Evaluate `encoder` on `raw_texts` without updating it.

  Args:
    raw_texts: list of sentences, each a list of word strings.
    dev_labels: list of label-id lists, one per sentence, all padded with
      -100 to the same length; that length must equal `max_length`.
    encoder: model called as `encoder(input_ids, attention_mask)` that also
      exposes the tokenizer as `encoder.tokenizer`.
    loss_fn: loss called with (flattened logits, flattened labels).
    device: device the batch tensors are moved to.
    batch_size: number of sentences per batch.
    max_length: length the tokenizer pads/truncates every sentence to.
    verbose: when True, print predicted and gold ids of every sentence whose
      gold labels contain an entity id (1-6).

  Returns:
    (all_gold, all_predictions, dev_losses): gold label ids and predicted
    label ids per sentence with the -100 positions removed, and a 1-D tensor
    with the loss of every batch.

  NOTE(review): the labels are per word, while the tokenizer output is per
  word piece and presumably starts with a special token, so position i of the
  logits may not correspond to word i -- confirm before trusting the scores.
  """
  encoder.eval() #model to evaluation mode

  n_steps_dev = ceil(len(raw_texts) / batch_size)
  dev_losses = torch.zeros(n_steps_dev) #loss of every batch

  #lists for final outputs
  all_predictions = []
  all_gold = []

  with torch.no_grad(): #no gradients needed for evaluation
    for step_n in tqdm(range(n_steps_dev), leave=False, desc='Validation'):

      #slice out the current batch
      low = step_n * batch_size
      high = low + batch_size
      texts = raw_texts[low:high]

      encoding = encoder.tokenizer(texts, is_split_into_words=True, max_length=max_length, padding='max_length', truncation=True, return_tensors='pt')

      input_ids = encoding.input_ids.to(device)
      attention_mask = encoding.attention_mask.to(device)
      batch_labels = torch.tensor(dev_labels[low:high]).to(device)

      #forward pass
      predictions = encoder(input_ids, attention_mask)

      #flatten batch and sequence dimensions for the loss
      loss = loss_fn(predictions.reshape(-1, predictions.shape[-1]),
                     batch_labels.reshape(-1))
      dev_losses[step_n] = loss.item()

      #most likely label id at every position
      preds_indices = predictions.argmax(dim=-1).cpu().numpy()
      gold_indices = batch_labels.cpu().numpy()

      for pred_seq, gold_seq in zip(preds_indices, gold_indices):
        #keep only positions that carry a real label (drop the -100 padding)
        keep = gold_seq != -100
        sentence_preds = list(pred_seq[keep])
        sentence_golds = list(gold_seq[keep])

        all_predictions.append(sentence_preds)
        all_gold.append(sentence_golds)

        #print examples with named entities to track learning
        if verbose and any(1 <= int(g) <= 6 for g in sentence_golds):
          print([int(x) for x in sentence_preds])
          print([int(x) for x in sentence_golds])
          print('')

  return all_gold, all_predictions, dev_losses




In [34]:
#Span Level Evaluation Functions

def get_spans(tags, simplified = False):
  """Extract entity spans from a sequence of BIO tag strings.

  Args:
    tags: sequence of tag strings such as 'B-PER', 'I-PER', or the bare
      'B' / 'I' of the simplified tag set. Both the letter 'O' and the digit
      '0' are accepted as the outside tag, because both spellings occur in
      this notebook.
    simplified: when True, every span gets the empty label '' instead of the
      text after the two-character tag prefix.

  Returns:
    List of (start, end, label) tuples with `end` exclusive.
  """
  spans = []
  start = None
  label = None

  for i, tag in enumerate(tags):
    if tag in ('O', '0'):
      #an outside tag closes the open span, if there is one
      if start is not None:
        spans.append((start, i, label))
        start = None
        label = None

    elif tag.startswith('B'):
      #a new span starts here; close the previous one if it is still open
      if start is not None:
        spans.append((start, i, label))

      start = i
      label = '' if simplified else tag[2:] #text after the 'B-' prefix

    #any other tag (the I- tags) leaves the state untouched: it extends the
    #open span, and an I- tag with no open span is ignored

  if start is not None: #close a span that reaches the end of the sequence
    spans.append((start, len(tags), label))

  return spans



#Strip the entity type (LOC, PER, ORG) from BIO tags, keeping only the first character
def simplify_bio_sequences(seqs):
    """Return the sequences with 'O' replaced by '0' and every other tag cut to its first character."""
    simplified = []
    for seq in seqs:
        simplified.append(['0' if tag == 'O' else tag[0] for tag in seq])
    return simplified



#Span matching score: the share of gold spans that the prediction reproduces exactly
def span_match_score(gold_seqs, pred_seqs):
  """Return the fraction of gold spans that also occur in the prediction.

  A span counts as matched when the (start, end, label) tuple returned by
  get_spans is identical in the gold and the predicted sequence. Sequences
  are paired with zip. Returns 0.0 when there are no gold spans.
  """
  n_gold = 0 #gold spans seen so far
  n_matched = 0 #gold spans also found in the prediction

  for gold_tags, pred_tags in zip(gold_seqs, pred_seqs):
    gold_spans = set(get_spans(gold_tags))
    pred_spans = set(get_spans(pred_tags))
    n_gold += len(gold_spans)
    n_matched += len(gold_spans.intersection(pred_spans))

  if not n_gold:
    return 0.0
  return n_matched / n_gold

In [None]:
#TRAINING SETUP AND LOOP

#use the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

#constant values for the model
vocab_size = len(word_vocabulary)
num_labels = 7 #number of BIO labels
sequence_length = 128

#define the encoder model
encoder = SineEncoder(
    vocab_size=vocab_size,
    num_labels=num_labels,
    sequence_length=sequence_length,
    d_model=256, #hidden dimensions
    nhead=2, #number of attention heads
    n_layers=2, #transformer layers
    dropout=0.1).to(device) #dropout rate

#define the optimiser and weight decay
optimiser = optim.Adam(encoder.parameters(), lr=1e-4, weight_decay=1e-5)

#custom per-class loss weights, indexed by label id; placed on the same device
#as the model so the loss also works without a GPU
class_weights = torch.tensor([1.0,4.0,4.5,1.0,1.5,1.0,1.0]).to(device)

#define loss; positions labelled -100 (padding) are ignored
loss_fn = nn.CrossEntropyLoss(ignore_index=-100,weight=class_weights)

#number of training epochs
n_epochs = 30

#TRAINING LOOP
for epoch in range(n_epochs):

  #one pass over the training data
  train_losses = train_transformer_epoch(raw_train_sentences, padded_train_labels, encoder, optimiser, loss_fn, device, batch_size=32)

  #evaluate on the dev set
  all_gold, all_predictions, val_losses = validate_transformer(raw_dev_sentences, padded_dev_labels, encoder, loss_fn, device, batch_size=32)

  print("Train loss", train_losses.mean())
  print("Val loss", val_losses.mean())



In [23]:
#Map the numeric label ids of the last validation run back to tag strings
#through the reverse dictionary number_to_label

gold_labels = [
    [number_to_label[label_id] for label_id in id_sequence]
    for id_sequence in all_gold
]
pred_labels = [
    [number_to_label[label_id] for label_id in id_sequence]
    for id_sequence in all_predictions
]

In [None]:
#Compute Evaluation Metrics

#classification_report expects one flat list of tags per side, so the
#per-sentence lists are concatenated: every element is the tag of one token
flat_gold = [tag for sentence in gold_labels for tag in sentence]
flat_pred = [tag for sentence in pred_labels for tag in sentence]

#tags listed in the report, in this order
label_names = ['0','B-LOC','I-LOC','B-ORG','I-ORG','B-PER','I-PER']

print(classification_report(
    flat_gold,
    flat_pred,
    labels=label_names,
    zero_division=0))

In [30]:
#Function for converting full BIO to simplified format

def simplify_bio_sequences(seqs):
    """Reduce full BIO tags to the simplified tag set '0' / 'B' / 'I'.

    Args:
        seqs: list of tag sequences, e.g. [['B-PER', 'I-PER', '0'], ...].
            Both the letter 'O' and the digit '0' are accepted as the
            outside tag.

    Returns:
        A new list of sequences where every outside tag is the digit '0'
        and every other tag is cut to its first character ('B' or 'I').
        The digit is used because get_spans and the report label list
        ['0', 'B', 'I'] in this notebook compare against '0'.
    """
    simple = []
    for seq in seqs:
        simple_seq = []
        for tag in seq:
            if tag in ('O', '0'):
                simple_seq.append('0')
            else:
                simple_seq.append(tag[0]) #only take B or I
        simple.append(simple_seq) #append the list of simple tags
    return simple


In [None]:
#SPAN MATCH ACCURACY

#Labelled: span boundaries and entity type both have to match
labelled_score = span_match_score(gold_seqs=gold_labels, pred_seqs=pred_labels)

#Unlabelled: entity types are stripped first, so only the boundaries have to match
gold_simple, pred_simple = (
    simplify_bio_sequences(gold_labels),
    simplify_bio_sequences(pred_labels),
)
unlabelled_score = span_match_score(gold_seqs=gold_simple, pred_seqs=pred_simple)

print("Labelled span match", labelled_score)
print("Unlabelled span match", unlabelled_score)


In [None]:
#Evaluate Model on Test Set

#The test data goes through the same routine that is used for validation
all_gold_test, all_pred_test, test_losses = validate_transformer(
    raw_texts=raw_test_sentences,
    dev_labels=padded_test_labels,
    encoder=encoder,
    loss_fn=loss_fn,
    device=device,
    batch_size=32,
)

print("Test loss", test_losses.mean()) #mean of the per-batch losses

#Numeric ids back to tag strings
gold_labels_test = []
for id_sequence in all_gold_test:
    gold_labels_test.append([number_to_label[label_id] for label_id in id_sequence])

pred_labels_test = []
for id_sequence in all_pred_test:
    pred_labels_test.append([number_to_label[label_id] for label_id in id_sequence])


# Classification Report

#one flat list of tags per side
flat_gold = []
for tag_sequence in gold_labels_test:
    flat_gold.extend(tag_sequence)

flat_pred = []
for tag_sequence in pred_labels_test:
    flat_pred.extend(tag_sequence)

label_names = ['0','B-LOC','I-LOC','B-ORG','I-ORG','B-PER','I-PER']

print(classification_report(flat_gold, flat_pred,
                            labels=label_names, zero_division=0))


# Span Evaluation

#labelled: boundaries and entity type must match
labelled_score = span_match_score(gold_labels_test, pred_labels_test)

#unlabelled: entity types are stripped before the spans are compared
gold_simple = simplify_bio_sequences(gold_labels_test)
pred_simple = simplify_bio_sequences(pred_labels_test)
unlabelled_score = span_match_score(gold_simple, pred_simple)

print("Labelled span match", labelled_score)
print("Unlabelled span match", unlabelled_score)


In [None]:
#Simplified Label Set Evaluation

#strip the entity types from the gold and predicted test tags
gold_simple = simplify_bio_sequences(gold_labels_test)
pred_simple = simplify_bio_sequences(pred_labels_test)

#one flat list of simplified tags per side
flat_gold_simple = []
for tag_sequence in gold_simple:
    flat_gold_simple.extend(tag_sequence)

flat_pred_simple = []
for tag_sequence in pred_simple:
    flat_pred_simple.extend(tag_sequence)

print(classification_report(flat_gold_simple, flat_pred_simple,
                            labels=['0', 'B', 'I'], zero_division=0))


#span match on the simplified tags
labelled_simple_score = span_match_score(gold_simple, pred_simple)

#with the entity types removed, labelled and unlabelled matching are the same number
unlabelled_simple_score = labelled_simple_score

print("Simplified labelled span match", labelled_simple_score)
print("Simplified unlabelled span match", unlabelled_simple_score)

