In [1]:
import conllu
import tqdm
import torch
import json
from sklearn.metrics import confusion_matrix, classification_report
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn import functional
from functools import partial
from transformers import BertModel
from torch import nn
from transformers import BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


Much of this notebook is adapted from the NorBERT benchmarking experiments: https://github.com/ltgoslo/NorBERT/tree/main/benchmarking/experiments

In [2]:
""" Could tidy up this one to just use device instead of calling gpu=true etc."""
# `gpu` is kept as a boolean flag because later cells (the collate_fn partials
# and the predict calls) still read it.
gpu = torch.cuda.is_available()
if gpu:
    print("Using GPU")

# Fall back to the CPU when CUDA is unavailable instead of hard-coding "cuda",
# which would make every later `.to(device)` call fail on a CPU-only machine.
device = torch.device("cuda" if gpu else "cpu")
print(device)

Using GPU
cuda


In [3]:
## Conllu stuff
def filter_tags(x):
    """Return the NER tag ``x`` unchanged.

    This is an identity hook: ``convert_to_list_dict`` routes every tag
    through it, so tag filtering/remapping can be added here later.
    """
    tag = x
    return tag

def convert_to_list_dict(path, file):
    """Parse one CoNLL-U file into a list of per-sentence dictionaries.

    Args:
        path: Path template with one ``str.format`` placeholder.
        file: Value substituted into ``path``; also used in the progress message.

    Returns:
        A list with one dict per sentence, holding the keys ``idx``, ``text``,
        ``tokens``, ``lemmas``, ``pos_tags`` and ``ner_tags``. ``text`` and
        ``tokens`` are lower-cased; the NER tag is read from the ``name`` entry
        of the MISC column and defaults to ``"O"``.
    """
    path = path.format(file)
    lst = []
    with open(path, encoding="UTF-8") as infile:
        # parse_incr yields sentences lazily, so consume it directly instead of
        # first materialising a second full list of TokenLists.
        for sent in conllu.parse_incr(infile):
            lst.append({
                "idx": sent.metadata["sent_id"],
                "text": sent.metadata["text"].lower(),
                "tokens": [token["form"].lower() for token in sent],
                "lemmas": [token["lemma"] for token in sent],
                "pos_tags": [token["upos"] for token in sent],
                # An empty MISC column ("_") is parsed as None; treat it as
                # "no entity" instead of crashing on None.get(...).
                "ner_tags": [
                    filter_tags((token["misc"] or {}).get("name", "O"))
                    for token in sent
                ],
            })
        print("Converting {} to list of dictionaries\n     {} elements converted..".format(file, len(lst)))
    return lst

In [4]:
# https://github.com/ltgoslo/NorBERT/blob/main/benchmarking/experiments/dataset.py



class CoNLLDataset(Dataset):
    """Token/label dataset for sequence tagging.

    Each item is a pair ``(tokens, label_ids)`` where ``tokens`` is the list of
    word strings of one sentence and ``label_ids`` is a ``torch.LongTensor``
    with the vocabulary index of every NER label of that sentence.

    Args:
        x_tokens: Iterable of sentences, each an iterable of tokens.
        y_labels: Iterable of label sequences, parallel to ``x_tokens``.
        ner_vocab: Ordered list of label strings; the position of a label is
            its class index. Pass a fixed list to keep indices stable between
            runs. If omitted, a deterministic vocabulary is derived from
            ``y_labels`` (sorted labels followed by ``'@UNK'``).
    """

    def __init__(self, x_tokens, y_labels, ner_vocab=None):
        self.tokens = [list(entry) for entry in x_tokens]
        self.ner_labels = [list(entry) for entry in y_labels]

        if ner_vocab is None:
            # The default used to crash in enumerate(None). Build a sorted
            # vocabulary so the ordering does not depend on set iteration order.
            seen = {label for entry in self.ner_labels for label in entry}
            seen.discard('@UNK')
            ner_vocab = sorted(seen) + ['@UNK']

        self.ner_vocab = ner_vocab
        self.ner_indexer = {label: idx for idx, label in enumerate(self.ner_vocab)}

    def __getitem__(self, index):
        """Return ``(tokens, LongTensor of label indices)`` for one sentence."""
        indices = []
        for label in self.ner_labels[index]:
            # Dict lookup instead of scanning the vocabulary list per label.
            position = self.ner_indexer.get(label)
            if position is None:
                # Labels outside the vocabulary map to the '@UNK' class.
                position = self.ner_indexer['@UNK']
            indices.append(position)
        return self.tokens[index], torch.LongTensor(indices)

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.tokens)



In [5]:
""" Dynamic padding. Takes the longest sentence in batch and pads other sentences to its length (if im not mistaken)"""
# Function borrowed from https://github.com/ltgoslo/NorBERT/blob/main/benchmarking/experiments/bert_ner.py

def collate_fn(batch, gpu=False):
    """Collate ``(tokens, label_tensor)`` pairs into one batch.

    Label tensors are right-padded with ``-1`` up to the longest label tensor
    in the batch and stacked into a single 2-D tensor; the token lists are
    returned untouched.

    Args:
        batch: Sequence of ``(X, y)`` pairs, ``y`` being a 1-D tensor.
        gpu: When true, the stacked label tensor is moved to ``"cuda"``.

    Returns:
        ``(list_of_token_lists, padded_label_tensor)``.
    """
    sentences = []
    label_rows = []
    for tokens, labels in batch:
        sentences.append(tokens)
        label_rows.append(labels)

    target_len = max(row.size(0) for row in label_rows)
    padded_rows = [
        functional.pad(row, (0, target_len - row.size(0)), value=-1)
        for row in label_rows
    ]
    y = torch.stack(padded_rows)
    if gpu:
        y = y.to("cuda")
    return sentences, y

In [6]:
class NorBert(nn.Module):
    """BERT encoder followed by a linear layer that scores NER labels.

    Args:
        ner_vocab: Label vocabulary; its length is the size of the output layer.
        model_path: Name or path handed to ``BertModel.from_pretrained``.
        freeze: When true, gradients are disabled for all encoder parameters,
            so only the linear layer is trained.
    """

    def __init__(self, ner_vocab, model_path=None, freeze=False):
        super().__init__()
        self._bert = BertModel.from_pretrained(
            model_path
        )
        hidden_size = self._bert.config.hidden_size
        self._linear = nn.Linear(hidden_size, len(ner_vocab))

        if freeze:
            for param in self._bert.parameters():
                param.requires_grad = False  # Freezing bert layer

    def forward(self, batch, mask):
        """Score labels at the sub-token positions listed in ``mask``.

        Args:
            batch: Mapping with ``"input_ids"`` and ``"attention_mask"``.
            mask: Integer tensor; row ``i`` holds the sequence positions to
                read from sentence ``i`` (as produced by ``build_mask``).

        Returns:
            Tensor with one score vector per (sentence, selected position).
        """
        hidden = self._bert(
            input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]
        ).last_hidden_state
        # Pick row i of `mask` from sentence i directly. The previous
        # `hidden[:, mask].diagonal().permute(2, 0, 1)` produced the same values
        # but first built a [batch, batch, positions, hidden] tensor, i.e. memory
        # and compute quadratic in the batch size.
        mask = mask.to(hidden.device)
        rows = torch.arange(hidden.size(0), device=hidden.device).unsqueeze(1)
        selected = hidden[rows, mask]
        return self._linear(selected)

In [7]:
# https://github.com/ltgoslo/NorBERT/blob/main/benchmarking/experiments/bert_ner.py 

def build_mask(tokenizer, ids):
    """Collect, per sentence, the positions of word-initial sub-tokens.

    A position is kept unless its token starts with ``"##"`` (a word-piece
    continuation) or is one of ``tokenizer.all_special_tokens[1:]``. The first
    special token is deliberately not excluded, exactly as before (presumably
    the unknown-word token for the BERT tokenizer used here - TODO confirm).

    Args:
        tokenizer: Tokenizer providing ``convert_ids_to_tokens``,
            ``all_special_tokens`` and ``pad``.
        ids: Iterable of input-id sequences, one per sentence.

    Returns:
        The ``"input_ids"`` entry returned by ``tokenizer.pad`` for the kept
        position lists (a padded tensor, since ``return_tensors="pt"``).
    """
    # Loop-invariant: look the special tokens up once and use a set, instead of
    # re-reading the property and slicing the list for every single token.
    skipped_specials = set(tokenizer.all_special_tokens[1:])

    mask = []
    for sentence_ids in ids:
        sentence = tokenizer.convert_ids_to_tokens(sentence_ids)
        mask.append([
            position
            for position, token in enumerate(sentence)
            if token not in skipped_specials and not token.startswith("##")
        ])

    mask = tokenizer.pad({"input_ids": mask}, return_tensors="pt")["input_ids"]
    return mask

In [8]:
def predict(input_data, tokenizer, model, gpu=False):
    """Predict one label index per selected sub-token position.

    Args:
        input_data: Batch of pre-split sentences (lists of word strings).
        tokenizer: Tokenizer used both for encoding and for ``build_mask``.
        model: Model called as ``model(encoded_batch, position_mask)``.
        gpu: When true, the encoded batch is moved to ``"cuda"``.

    Returns:
        Tensor of arg-max label indices over the model's last dimension.
    """
    encoded = tokenizer(
        input_data, is_split_into_words=True, return_tensors="pt", padding=True
    )
    if gpu:
        encoded = encoded.to("cuda")
    batch_mask = build_mask(tokenizer, encoded["input_ids"])
    # Pure inference: do not record an autograd graph for the forward pass.
    with torch.no_grad():
        scores = model(encoded, batch_mask)
    # Same result as .permute(0, 2, 1).argmax(dim=1): arg-max over the label axis.
    y_pred = scores.argmax(dim=-1)
    return y_pred

In [9]:
def train(device, train_loader, dev_loader, tokenizer, model, optimiser, scheduler, criterion, no_ne_index, MAX_EPOCHS, OUTPUT_PATH, PATIENCE):
    """Train ``model`` with early stopping on dev-set accuracy.

    Accuracy is counted only over positions whose gold label is neither
    ``no_ne_index`` nor the padding value ``-1``.

    Args:
        device: Device (or device string) the batches are moved to.
        train_loader: Loader yielding ``(sentences, padded_labels)`` batches.
        dev_loader: Loader with the same batch format, used for validation.
        tokenizer: Tokenizer used for encoding and for ``build_mask``.
        model: Model called as ``model(encoded_batch, position_mask)``.
        optimiser: Optimiser stepped once per training batch.
        scheduler: Scheduler stepped once per epoch with the mean dev loss.
        criterion: Loss called as ``criterion(scores, labels)``.
        no_ne_index: Label index excluded from the accuracy count.
        MAX_EPOCHS: Upper bound on the number of epochs.
        OUTPUT_PATH: File the best model's ``state_dict`` is saved to.
        PATIENCE: Number of consecutive non-improving epochs tolerated.

    Returns:
        ``(loss_history_train, loss_history_dev, val_accuracy, epoch)``: three
        dicts keyed by ``"epoch: N"`` (summed train loss, summed dev loss,
        dev accuracy) and the index of the last epoch that ran (``-1`` if
        ``MAX_EPOCHS`` is 0).
    """
    device = torch.device(device)

    accuracy_lst = []
    loss_history_train, loss_history_dev, val_accuracy = {}, {}, {}
    inpatience = 0
    epoch = -1  # keeps the return statement valid when no epoch runs

    for epoch in range(MAX_EPOCHS):
        # TRAIN
        loss_accumulation = 0
        model.train()
        train_iter = tqdm.tqdm(train_loader)
        for x, y in train_iter:
            # Use the `device` argument rather than the notebook-global `gpu`.
            x = tokenizer(
                x, is_split_into_words=True, return_tensors="pt", padding=True
            ).to(device)
            y = y.to(device)
            batch_mask = build_mask(tokenizer, x["input_ids"])
            optimiser.zero_grad()
            y_pred = model(x, batch_mask).permute(0, 2, 1)
            loss = criterion(y_pred, y)
            loss_accumulation += loss.item()
            loss.backward()
            optimiser.step()
            train_iter.set_postfix_str(f"loss: {loss.item()}")

        # EVAL
        model.eval()
        dev_iter = tqdm.tqdm(dev_loader)
        correct, total = 0, 0
        dev_loss_accumulation, dev_batches = 0.0, 0
        # One forward pass per dev batch yields both loss and predictions
        # (previously the model was run twice), without building a graph.
        with torch.no_grad():
            for x, y in dev_iter:
                x = tokenizer(
                    x, is_split_into_words=True, return_tensors="pt", padding=True
                ).to(device)
                y = y.to(device)
                batch_mask = build_mask(tokenizer, x["input_ids"])
                scores = model(x, batch_mask).permute(0, 2, 1)
                dev_loss_accumulation += criterion(scores, y).item()
                dev_batches += 1
                y_pred = scores.argmax(dim=1)
                # Padding (-1) never equals a predicted index, so it is never "correct".
                correct += int(((y_pred == y) & (y != no_ne_index)).sum())
                total += int(((y != no_ne_index) & (y != -1)).sum())

        # Step the plateau scheduler once per epoch on the epoch-level dev loss.
        # Stepping it per batch made its `patience` count batches, not epochs.
        if dev_batches:
            scheduler.step(dev_loss_accumulation / dev_batches)

        print("Epoch number: {}".format(epoch))
        accuracy = correct / total if total else 0.0  # no scorable positions
        accuracy_lst.append(accuracy)
        current_acc_best = max(accuracy_lst)
        print(accuracy)
        print(f"List of accuracies: {accuracy_lst}")
        print(f"Best accuracy so far: {current_acc_best}")
        print(f"Validation accuracy = {correct} / {total} = {accuracy}")

        # Record the epoch before the early-stopping check so the final epoch
        # is not lost on `break`; the dev history now holds the dev loss.
        loss_history_train["epoch: {}".format(epoch)] = loss_accumulation
        loss_history_dev["epoch: {}".format(epoch)] = dev_loss_accumulation
        val_accuracy["epoch: {}".format(epoch)] = accuracy

        if accuracy < current_acc_best:
            print("No improvement. Inpatience counter increased..")
            print(f"Patience set at {PATIENCE}")
            inpatience += 1
            print(f"Inpatience counter reached {inpatience}")
            # `>=` also stops correctly when PATIENCE is 0 or negative.
            if inpatience >= PATIENCE:
                print(f"Patience tolerance reached. Early stopping at epoch {epoch}!")
                break
        else:
            # Current accuracy equals the running maximum: new (or tied) best.
            print("New best!")
            print("Setting inpatience counter to 0")
            inpatience = 0
            torch.save(model.state_dict(), OUTPUT_PATH)

    return loss_history_train, loss_history_dev, val_accuracy, epoch

# """ TODO: legg til tekst fil med loss og andre stats"""
# TODO: integrate PATIENCE


In [10]:
def predict_test(test_set, ner_vocab, tokenizer, model, gpu=None):
    """Predict label strings for every sentence yielded by ``test_set``.

    Args:
        test_set: Iterable of ``(sentences, padded_labels)`` batches; labels
            are only used to find each sentence's unpadded length (``-1`` is
            padding).
        ner_vocab: List mapping a label index to its label string.
        tokenizer: Tokenizer forwarded to ``predict``.
        model: Model forwarded to ``predict``; switched to eval mode.
        gpu: Forwarded to ``predict``. When ``None`` it is inferred from the
            device of the model's parameters instead of a notebook global.

    Returns:
        One flat list of predicted label strings, in iteration order.
    """
    if gpu is None:
        first_param = next(model.parameters(), None)
        gpu = first_param is not None and first_param.is_cuda

    model.eval()
    predicted_labels = []
    with torch.no_grad():
        for x, y in tqdm.tqdm(test_set):
            y_pred = predict(x, tokenizer, model, gpu=gpu)
            # Handle every sentence of the batch (previously only y_pred[0]
            # was read, silently dropping the rest when batch_size > 1) and
            # cut predictions at the sentence's real length to skip padding.
            for row_pred, row_gold in zip(y_pred, y):
                real_length = int((row_gold != -1).sum())
                predicted_labels.extend(
                    ner_vocab[index] for index in row_pred[:real_length].tolist()
                )
    return predicted_labels

In [11]:
def is_freeze(freeze, model):
    """Create the Adam optimiser with a learning rate chosen by ``freeze``.

    The rate is ``0.001`` when ``freeze`` is truthy and ``2e-5`` otherwise.
    The chosen rate is printed.

    Returns:
        ``(optimiser, lr)``.
    """
    lr = 0.001 if freeze else 2e-5
    optimiser = torch.optim.Adam(model.parameters(), lr=lr)
    print("Learn rate {}".format(lr))
    return optimiser, lr

In [12]:
def model_info_json(model_name, loss_history_train, loss_history_dev, val_accuracy, MAX_EPOCHS, stop_epoch, lr, seed, PATIENCE):
    """Write the settings and histories of one training run to a JSON file.

    The file is created in the current working directory and is named
    ``<model_name>_seed_<seed>.json``. Nothing is returned.
    """
    labelled_values = (
        ("Seed", seed),
        ("Max epochs set", MAX_EPOCHS),
        ("Train stop at epoch (begins at 0)", stop_epoch),
        ("Learning rate", lr),
        ("Validation accuracy history", val_accuracy),
        ("Train loss history", loss_history_train),
        ("Val loss history", loss_history_dev),
        ("Patience set", PATIENCE),
    )
    info = dict(labelled_values)

    file_name = "{}_seed_{}.json".format(model_name, seed)
    serialised = json.dumps(info, indent=2)
    with open(file_name, "w") as write:
        write.write(serialised)

In [13]:
# Path template for the CoNLL-U files; the placeholder receives one name from file_list.
path = "all_conllu/{0}.conllu"
file_list = [
    "no_bokmaal-ud-dev",
    "no_bokmaal-ud-test",
    "no_bokmaal-ud-train",
    "no_nynorsk-ud-dev",
    "no_nynorsk-ud-test",
    "no_nynorsk-ud-train",
]

# Parse the six files in list order: the three "no_bokmaal" splits (*_no)
# followed by the three "no_nynorsk" splits (*_ny).
(dev_split_no, test_split_no, train_split_no,
 dev_split_ny, test_split_ny, train_split_ny) = [
    convert_to_list_dict(path, name) for name in file_list
]

# Concatenate the two corpora split by split.
print("Combining train, dev and test sets..")
dev_split = dev_split_no + dev_split_ny
test_split = test_split_no + test_split_ny
train_split = train_split_no + train_split_ny
print("Success!")

Converting no_bokmaal-ud-dev to list of dictionaries
     2410 elements converted..
Converting no_bokmaal-ud-test to list of dictionaries
     1939 elements converted..
Converting no_bokmaal-ud-train to list of dictionaries
     15696 elements converted..
Converting no_nynorsk-ud-dev to list of dictionaries
     1890 elements converted..
Converting no_nynorsk-ud-test to list of dictionaries
     1511 elements converted..
Converting no_nynorsk-ud-train to list of dictionaries
     14174 elements converted..
Combining train, dev and test sets..
Success!


In [14]:
# Fixed label inventory: the position of a label in this list is its class index.
ner_vocab = [
    'B-GPE_LOC', 'I-DRV', 'I-LOC', 'B-PER', 'I-PER',
    'B-PROD', 'I-GPE_ORG', 'B-GPE_ORG', 'B-EVT', 'B-DRV',
    'I-PROD', 'B-ORG', 'B-MISC', 'I-MISC', 'I-GPE_LOC',
    'B-LOC', 'I-ORG', 'I-EVT', 'O', '@UNK',
]


def _tokens_and_tags(split):
    """Return the parallel lists of token sequences and NER-tag sequences of a split."""
    return (
        [sentence["tokens"] for sentence in split],
        [sentence["ner_tags"] for sentence in split],
    )


x_train_tokens, y_train_labels = _tokens_and_tags(train_split)
train_dataset = CoNLLDataset(x_train_tokens, y_train_labels, ner_vocab)

x_dev_tokens, y_dev_labels = _tokens_and_tags(dev_split)
dev_dataset = CoNLLDataset(x_dev_tokens, y_dev_labels, ner_vocab)

x_test_tokens, y_test_labels = _tokens_and_tags(test_split)
test_dataset = CoNLLDataset(x_test_tokens, y_test_labels, ner_vocab)

In [15]:
# Prepping data for train and predict
# All loaders share one collate function bound to the notebook-level `gpu` flag.
_collate = partial(collate_fn, gpu=gpu)

# Only the training loader shuffles; dev and test keep the dataset order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=_collate)
dev_loader = DataLoader(dev_dataset, batch_size=32, shuffle=False, collate_fn=_collate)
# The test loader yields one sentence per batch.
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=_collate)

In [16]:
# Display the label vocabulary stored on the training dataset (list position = class index).
train_dataset.ner_vocab

['B-GPE_LOC',
 'I-DRV',
 'I-LOC',
 'B-PER',
 'I-PER',
 'B-PROD',
 'I-GPE_ORG',
 'B-GPE_ORG',
 'B-EVT',
 'B-DRV',
 'I-PROD',
 'B-ORG',
 'B-MISC',
 'I-MISC',
 'I-GPE_LOC',
 'B-LOC',
 'I-ORG',
 'I-EVT',
 'O',
 '@UNK']

In [None]:
""" Training block """


torch.backends.cudnn.deterministic = True

# Settings shared by every run.
no_ne_index = train_dataset.ner_indexer["O"]  # index of the "O" tag, passed to train()
model_path = "ltgoslo/norbert2"
tokenizer = BertTokenizer.from_pretrained(model_path, do_basic_tokenize=False)
MAX_EPOCHS = 100
PATIENCE = 10
seeds = [8, 37, 42, 101, 1024]
freeze = False
model_name = "norbert2"

# One full training run per seed; each run writes its own checkpoint and JSON summary.
for seed in seeds:
    # Seeding torch fixes the model initialisation and the shuffle order of
    # train_loader (created with shuffle=True) for this run.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Fresh model for every seed.
    norbert2 = NorBert(ner_vocab, model_path, freeze).to(device)

    # Label positions padded with -1 by collate_fn are ignored by the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=-1)
    optimiser, lr = is_freeze(freeze, norbert2)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimiser,
        mode='min',
        factor=0.1,
        patience=2,
        threshold=1e-7,
        threshold_mode='abs',
    )

    OUTPUT_PATH = "norbert2_model_adam_lr_{}__maxEpochs_{}__seed_{}".format(lr, MAX_EPOCHS, seed)
    loss_history_train, loss_history_dev, val_accuracy, stop_epoch = train(
        device=device,
        train_loader=train_loader,
        dev_loader=dev_loader,
        tokenizer=tokenizer,
        model=norbert2,
        optimiser=optimiser,
        scheduler=scheduler,
        criterion=criterion,
        no_ne_index=no_ne_index,
        MAX_EPOCHS=MAX_EPOCHS,
        OUTPUT_PATH=OUTPUT_PATH,
        PATIENCE=PATIENCE,
    )

    info = model_info_json(
        model_name=model_name,
        loss_history_train=loss_history_train,
        loss_history_dev=loss_history_dev,
        val_accuracy=val_accuracy,
        MAX_EPOCHS=MAX_EPOCHS,
        stop_epoch=stop_epoch,
        lr=lr,
        seed=seed,
        PATIENCE=PATIENCE,
    )

In [18]:
""" Gold labels for NorNE test_dataset"""

# Flatten the per-sentence gold labels of the test set into one list.
gold_labels = [
    label
    for sentence_labels in test_dataset.ner_labels
    for label in sentence_labels
]

In [20]:
model_path = "ltgoslo/norbert2"
tokenizer = BertTokenizer.from_pretrained(model_path, do_basic_tokenize=False)

# Path of the checkpoint to evaluate (train() saves one per seed to its
# OUTPUT_PATH). Must be filled in before this cell is run.
MODEL_LOAD_PATH = ""

# `freeze` only toggles requires_grad on the encoder, which is irrelevant for
# inference. Passing False explicitly removes the dependency on a variable that
# is defined only inside the training cell.
bertbert = NorBert(ner_vocab, model_path, freeze=False).to(device)
# map_location lets a checkpoint saved on a GPU be loaded on the active device.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
bertbert.load_state_dict(torch.load(MODEL_LOAD_PATH, map_location=device))

Some weights of the model checkpoint at ltgoslo/norbert2 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [21]:
# Tag the test set with the loaded model; predict_test returns one flat list of label strings.
preds = predict_test(test_loader, ner_vocab, tokenizer, bertbert) # ner_vocab maps predicted indices back to label strings

100%|██████████| 3450/3450 [00:33<00:00, 104.22it/s]


In [22]:
# NOTE: despite its name, `cr` holds the confusion matrix, not a classification report.
cr = confusion_matrix(gold_labels, preds)
# ner_vocab[:-2] drops the last two vocabulary entries ('O' and '@UNK'),
# so the report covers the entity tags only.
print(classification_report(gold_labels, preds, labels = ner_vocab[:-2]))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

   B-GPE_LOC       0.84      0.93      0.88       428
       I-DRV       0.78      0.54      0.64        13
       I-LOC       0.78      0.62      0.69        85
       B-PER       0.92      0.93      0.92       961
       I-PER       0.92      0.98      0.95       510
      B-PROD       0.69      0.31      0.43       131
   I-GPE_ORG       1.00      0.57      0.73         7
   B-GPE_ORG       0.82      0.54      0.65        61
       B-EVT       0.36      0.36      0.36        14
       B-DRV       0.77      0.76      0.76        78
      I-PROD       0.70      0.36      0.47       126
       B-ORG       0.77      0.83      0.80       521
      B-MISC       0.00      0.00      0.00        14
      I-MISC       0.00      0.00      0.00         3
   I-GPE_LOC       0.97      0.91      0.94        79
       B-LOC       0.74      0.66      0.70       185
       I-ORG       0.69      0.74      0.71       248
       I-EVT       0.07    

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# from seqeval.scheme import IOB2
# from seqeval.metrics import classification_report as cr

# classification_rep = cr(gold_labels, preds)
# print(classification_rep)

In [None]:
# import matplotlib.pyplot as plt
# import itertools
# import numpy as np

# def plot_confusion_matrix(cm, classes,
#                           normalize=False,
#                           title='Confusion matrix',
#                           cmap=plt.cm.Blues):

#     plt.imshow(cm, interpolation='nearest', cmap=cmap)
#     plt.title(title)
#     plt.colorbar()
#     tick_marks = np.arange(len(classes))
#     plt.xticks(tick_marks, classes, rotation=45)
#     plt.yticks(tick_marks, classes)

#     if normalize:
#         cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
#         print("Normalized confusion matrix")
#     else:
#         print('Confusion matrix, without normalization')

#     thresh = cm.max() / 2.
#     for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
#         plt.text(j, i, cm[i, j],
#                  horizontalalignment="center",
#                  color="white" if cm[i, j] > thresh else "black")

#     plt.tight_layout()
#     plt.ylabel('True label')
#     plt.xlabel('Predicted label')

# from sklearn import metrics
# import itertools


# score = metrics.accuracy_score(gold_labels, predicted_tags)
# print("accuracy:   %0.3f" % score)

# cm = metrics.confusion_matrix(gold_labels, predicted_tags)
# plot_confusion_matrix(cm, classes=set(gold_labels))

In [23]:
from utils.get_parl_corpus_to_dict import ParlamentaryCorpus
from sklearn.metrics import confusion_matrix, accuracy_score,f1_score, precision_score, recall_score, classification_report
from nltk.tokenize import word_tokenize

In [24]:
def tokenize_parl(corpus):
    """Tokenize every value of every mapping in ``corpus``.

    Args:
        corpus: Iterable of dict-like items whose values are strings.

    Returns:
        A list with one ``word_tokenize`` result per value, in iteration order.
    """
    return [
        word_tokenize(value)
        for item in corpus
        for value in item.values()
    ]

In [61]:
def binary_parl_from_iob2(array):
    """Map a sequence of tags to binary flags: 0 for ``"O"``, 1 for anything else."""
    return [0 if tag == "O" else 1 for tag in array]

In [26]:
# Forward slashes work on Windows as well as on Linux/macOS; the previous
# backslash-separated raw strings only resolved on Windows.
path1 = "parl_corpus/20170207_sentence_data.json"
path2 = "parl_corpus/20170110_sentence_data.json"

# Load each file twice: as-is (reference casing) and lower-cased (model input).
c1 = ParlamentaryCorpus(path1)
c2 = ParlamentaryCorpus(path2)
c1_gold = c1.load_data()
c2_gold = c2.load_data()
lowered_corpus1 = c1.load_data(lower=True)
lowered_corpus2 = c2.load_data(lower=True)

lowered_corpus = lowered_corpus1 + lowered_corpus2
gold_corpus = c1_gold + c2_gold

# Tokenize both versions the same way.
tokens_lower_corpus = tokenize_parl(lowered_corpus)
token_gold_corpus = tokenize_parl(gold_corpus)


In [32]:
""" Provides dummy y-data so CoNLLDataset class can be used """
""" Not elegant but works perfectly fine :)                 """

# One placeholder "O" label per token, so CoNLLDataset gets a label sequence
# of the right length for every sentence.
dummy_y_parl = [["O"] * len(sentence) for sentence in tokens_lower_corpus]

len(dummy_y_parl)

2589

In [35]:
# Wrap the lower-cased parliamentary tokens and their placeholder labels in the
# same Dataset class (and label vocabulary) used for the NorNE data.
parl_lower_corpus = CoNLLDataset(tokens_lower_corpus, dummy_y_parl, ner_vocab)

In [36]:
# One sentence per batch, unshuffled, so predictions come out in corpus order.
parl_loader = DataLoader(
        parl_lower_corpus, batch_size=1, shuffle=False, collate_fn=partial(collate_fn, gpu=gpu))


In [52]:
# Tag the parliamentary corpus with the loaded model; the result is one flat list of label strings.
prediction_parl = predict_test(parl_loader, ner_vocab, tokenizer, bertbert)

100%|██████████| 2589/2589 [00:25<00:00, 100.24it/s]


In [62]:
# Collapse the predicted tags to binary flags: 0 for "O", 1 for any other tag.
prl = binary_parl_from_iob2(prediction_parl)

In [64]:
"""Annotating binary gold labels for parl corpus"""

# A token is labelled 1 when its first character is upper-case, otherwise 0.
gold_annotations = [
    [1 if token[0].isupper() else 0 for token in sentences]
    for sentences in token_gold_corpus
]
# Total number of tokens across all sentences.
length = sum(len(sentences) for sentences in token_gold_corpus)

In [65]:
# Flatten the per-sentence annotations into one list so it can be compared element-wise with `prl`.
y_true_parl = [item for sublist  in gold_annotations for item in sublist]

In [66]:
# Binary evaluation: capitalisation-derived labels (y_true_parl) against the tagger's binarised predictions (prl).
print(confusion_matrix(y_true_parl, prl))
print(classification_report(y_true_parl, prl))

[[44912   582]
 [   98  1995]]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     45494
           1       0.77      0.95      0.85      2093

    accuracy                           0.99     47587
   macro avg       0.89      0.97      0.92     47587
weighted avg       0.99      0.99      0.99     47587



In [82]:
# Example predictions
# Flatten the tokens of the first two sentences so they line up with the flat prediction list.
some_tokens = [
    word
    for sentence_tokens in parl_lower_corpus.tokens[:2]
    for word in sentence_tokens
]
print(some_tokens[:15])
print(prediction_parl[:15])

# Capitalise every token the model tagged as part of an entity (anything but "O").
cap_sent = [
    word.capitalize() if tag != "O" else word
    for word, tag in zip(some_tokens, prediction_parl)
]

print("\nSentence with capitalization prediction:")
print(" ".join(cap_sent))

['stortingets', 'møte', 'er', 'lovlig', 'satt', 'representantene', 'fredric', 'helen', 'fredric', 'holen', 'bjørdal', 'og', 'trond', 'giske', 'som']
['B-ORG', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'B-PER', 'I-PER', 'I-PER', 'O', 'B-PER', 'I-PER', 'O']

Sentence with capitalization prediction:
Stortingets møte er lovlig satt representantene Fredric Helen Fredric Holen Bjørdal og Trond Giske som har vært permitterte har igjen tatt sete
