In [1]:
import conllu
import tqdm
import torch
import json
from sklearn.metrics import confusion_matrix, classification_report
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn import functional
from functools import partial
from transformers import BertModel
from torch import nn
from transformers import BertTokenizer
import os
import pandas as pd
from get_parl_corpus_token_data import ParlamentaryCorpus
from seqeval.scheme import IOB2
from seqeval.metrics import classification_report as cr
from seqeval.metrics import performance_measure
from seqeval.scheme import IOB2
from seqeval.metrics import performance_measure


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
""" Could tidy up this one to just use device instead of calling gpu=true etc."""
# Decide once whether CUDA can be used. `gpu` is kept as a plain bool because
# later cells pass it to collate_fn / predict; `device` is what modules are
# moved to. Both are derived from the same check so they can never disagree
# (previously `device` was always "cuda", even when no GPU was available).
gpu = torch.cuda.is_available()
if gpu:
    print("Using GPU")

device = torch.device("cuda" if gpu else "cpu")
print(device)

Using GPU
cuda


In [3]:
## Conllu stuff
def filter_tags(x):
    """Return the NER tag unchanged.

    Acts as a pass-through hook: it is applied to every tag read from the
    CoNLL-U files, so tag remapping can be added here in one place.
    """
    return x

def convert_to_list_dict(path, file):
    """Read one CoNLL-U file into a list of per-sentence dictionaries.

    Args:
        path: Format string with one placeholder, e.g. "all_conllu/{0}.conllu".
        file: Value substituted into ``path`` (the file name without extension).

    Returns:
        A list with one dict per sentence holding the keys "idx", "text",
        "tokens", "lemmas", "pos_tags" and "ner_tags". The text and the token
        forms are lower-cased; lemmas and tags are kept as they are.
    """
    path = path.format(file)
    lst = []
    with open(path, encoding="UTF-8") as infile:
        # parse_incr yields sentences lazily, so iterate it directly instead
        # of first materialising the whole file as a list.
        for sent in conllu.parse_incr(infile):
            lst.append({
                "idx": sent.metadata["sent_id"],
                "text": sent.metadata["text"].lower(),
                "tokens": [token["form"].lower() for token in sent],
                "lemmas": [token["lemma"] for token in sent],
                "pos_tags": [token["upos"] for token in sent],
                # An empty MISC column ("_") is parsed to None rather than a
                # dict; treat such tokens as outside any entity ("O").
                "ner_tags": [
                    filter_tags((token["misc"] or {}).get("name", "O"))
                    for token in sent
                ],
            })
    print("Converting {} to list of dictionaries\n     {} elements converted..".format(file, len(lst)))
    return lst

In [4]:
# https://github.com/ltgoslo/NorBERT/blob/main/benchmarking/experiments/dataset.py



class CoNLLDataset(Dataset):
    """Token/label dataset for NER.

    Each item is a pair ``(tokens, label_ids)`` where ``tokens`` is the list
    of token strings of one sentence and ``label_ids`` is a LongTensor with
    the index of every label in ``ner_vocab``.

    Args:
        x_tokens: Iterable of sentences, each an iterable of token strings.
        y_labels: Iterable of label sequences, parallel to ``x_tokens``.
        ner_vocab: Ordered label vocabulary. It is supplied by the caller
            (rather than built here) so the label -> index mapping is stable.
            Labels missing from it are mapped to the index of '@UNK'.

    Raises:
        TypeError: If ``ner_vocab`` is None.
    """

    def __init__(self, x_tokens, y_labels, ner_vocab=None):
        if ner_vocab is None:
            # Fail with a clear message instead of the opaque error that
            # enumerate(None) would give further down.
            raise TypeError("ner_vocab must be an ordered sequence of labels, got None")

        self.tokens = [list(entry) for entry in x_tokens]
        self.ner_labels = [list(entry) for entry in y_labels]

        # hard coded ner_vocab to avoid randomly shuffled ordering of the labels
        self.ner_vocab = ner_vocab
        self.ner_indexer = {label: idx for idx, label in enumerate(self.ner_vocab)}

    def __getitem__(self, index):
        """Return ``(tokens, label_ids)`` for the sentence at ``index``."""
        tokens = self.tokens[index]
        ner_labels = self.ner_labels[index]

        # Membership is tested against the dict (constant time) instead of
        # scanning the vocabulary list for every token. The '@UNK' entry is
        # only looked up when an unknown label is actually met.
        label_ids = [
            self.ner_indexer[label] if label in self.ner_indexer
            else self.ner_indexer['@UNK']
            for label in ner_labels
        ]
        return tokens, torch.LongTensor(label_ids)

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.tokens)



In [5]:
""" Dynamic padding. Takes the longest sentence in batch and pads other sentences to its length (if im not mistaken)"""
# Function borrowed from https://github.com/ltgoslo/NorBERT/blob/main/benchmarking/experiments/bert_ner.py

def collate_fn(batch, gpu=False):
    """Collate ``(tokens, label_ids)`` pairs into one batch.

    The token lists are returned untouched. The label tensors are
    right-padded with -1 up to the longest label tensor in the batch and
    stacked into a single 2-D tensor.

    Args:
        batch: List of ``(tokens, label_tensor)`` pairs.
        gpu: When true, the stacked label tensor is moved to "cuda".

    Returns:
        ``(x, y)``: the list of token lists and the padded label tensor.
    """
    label_tensors = [labels for _, labels in batch]
    target_len = max([labels.size(0) for labels in label_tensors])
    x = [tokens for tokens, _ in batch]

    padded = []
    for labels in label_tensors:
        missing = target_len - labels.size(0)
        # https://pytorch.org/docs/stable/generated/torch.nn.functional.pad.html
        padded.append(functional.pad(labels, (0, missing), value=-1))
    y = torch.stack(padded)

    if gpu:
        y = y.to("cuda")
    return x, y

In [6]:
class Bert(nn.Module):
    """BERT encoder followed by a linear layer that scores every NER label.

    Args:
        ner_vocab: Label vocabulary; its length sets the output size of the
            linear layer.
        model_path: Name or path handed to ``BertModel.from_pretrained``.
        freeze: When true, all BERT parameters get ``requires_grad = False``
            so only the linear layer is trained.
    """

    def __init__(self, ner_vocab, model_path=None, freeze=False):
        super().__init__()
        self._bert = BertModel.from_pretrained(
            model_path
        )
        hidden_size = self._bert.config.hidden_size
        self._linear = nn.Linear(hidden_size, len(ner_vocab))

        if freeze:
            for param in self._bert.parameters():
                param.requires_grad = False  # Freezing bert layer

    def forward(self, batch, mask):
        """Return label scores for the positions selected by ``mask``.

        Args:
            batch: Mapping with "input_ids" and "attention_mask".
            mask: Integer index tensor with one row per sentence; row ``b``
                lists the sequence positions to read from sentence ``b``
                (assumed to be what ``build_mask`` produces -- TODO confirm
                for other callers).

        Returns:
            Tensor of shape (batch, positions, n_labels).
        """
        encoded = self._bert(
            input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]
        )
        hidden = encoded.last_hidden_state
        mask = mask.to(hidden.device)

        # For every sentence b pick hidden[b, mask[b, t], :]. This yields the
        # same values as hidden[:, mask].diagonal().permute(2, 0, 1) without
        # first building the (batch, batch, positions, hidden) intermediate.
        rows = torch.arange(hidden.size(0), device=hidden.device).unsqueeze(1)
        selected = hidden[rows, mask]
        return self._linear(selected)

In [7]:
# https://github.com/ltgoslo/NorBERT/blob/main/benchmarking/experiments/bert_ner.py 

def build_mask(tokenizer, ids):
    """Collect, per sentence, the positions of the sub-tokens to keep.

    A position is kept unless its token is one of
    ``tokenizer.all_special_tokens[1:]`` or starts with "##" (a word-piece
    continuation). The per-sentence position lists are then padded to equal
    length with ``tokenizer.pad``.

    Args:
        tokenizer: Tokenizer providing ``convert_ids_to_tokens``,
            ``all_special_tokens`` and ``pad``.
        ids: Iterable of token-id sequences, one per sentence.

    Returns:
        The padded "input_ids" entry returned by ``tokenizer.pad`` (requested
        as PyTorch tensors).
    """
    # The list of special tokens does not change inside the loop, so build it
    # once. The first special token is deliberately left out of the skip set
    # (presumably the unknown-word token, which still stands for a real word
    # -- TODO confirm for tokenizers other than BERT).
    skipped = frozenset(tokenizer.all_special_tokens[1:])

    mask = []
    for id_row in ids:
        pieces = tokenizer.convert_ids_to_tokens(id_row)
        mask.append([
            position for position, piece in enumerate(pieces)
            if piece not in skipped and not piece.startswith("##")
        ])

    # NOTE(review): tokenizer.pad fills with the tokenizer's padding value;
    # the padded entries are position indices here, not token ids.
    mask = tokenizer.pad({"input_ids": mask}, return_tensors="pt")["input_ids"]
    return mask

In [8]:
def predict(input_data, tokenizer, model, gpu=False):
    """Predict a label index for every kept position of every sentence.

    Args:
        input_data: Batch of sentences already split into words.
        tokenizer: Tokenizer used both to encode the batch and to build the
            position mask.
        model: Model called as ``model(encoded_batch, mask)`` that returns
            scores with the labels on the last axis.
        gpu: When true, the encoded batch is moved to "cuda".

    Returns:
        Integer tensor with the highest-scoring label index per position.
    """
    encoded = tokenizer(
        input_data, is_split_into_words=True, return_tensors="pt", padding=True
    )
    if gpu:
        encoded = encoded.to("cuda")
    batch_mask = build_mask(tokenizer, encoded["input_ids"])

    # Pure inference: argmax has no gradient, so skip building the autograd
    # graph for the forward pass.
    with torch.no_grad():
        scores = model(encoded, batch_mask)

    # argmax over the label axis; same result as the former
    # permute(0, 2, 1).argmax(dim=1).
    return scores.argmax(dim=-1)

In [9]:
def predict_test(test_set, ner_vocab, tokenizer, model):
    """Run the model over a data loader and return one flat list of labels.

    Args:
        test_set: Iterable of ``(x, y)`` batches, where ``y`` holds the label
            ids padded with -1 (as produced by ``collate_fn``).
        ner_vocab: Label vocabulary used to turn indices back into labels.
        tokenizer: Tokenizer forwarded to ``predict``.
        model: Model forwarded to ``predict``; switched to eval mode here.

    Returns:
        The predicted label strings of all sentences, concatenated in order.

    Note:
        Relies on the notebook-level ``gpu`` flag.
    """
    model.eval()
    predicted_labels = []
    for x, y in tqdm.tqdm(test_set):
        y_pred = predict(x, tokenizer, model, gpu=gpu)
        # Walk over every sentence of the batch (previously only y_pred[0]
        # was read, dropping the rest when batch_size > 1) and cut each row
        # at the number of real labels so that padded positions are ignored.
        for pred_row, gold_row in zip(y_pred, y):
            n_tokens = int((gold_row != -1).sum())
            predicted_labels.extend(
                ner_vocab[index] for index in pred_row[:n_tokens].tolist()
            )
    return predicted_labels

In [10]:
def is_freeze(freeze, model):
    """Build the Adam optimiser, with a learning rate that depends on ``freeze``.

    Args:
        freeze: Truthy selects a learning rate of 0.001, otherwise 2e-5.
        model: Module whose parameters are handed to the optimiser.

    Returns:
        ``(optimiser, lr)``.
    """
    lr = 0.001 if freeze else 2e-5
    optimiser = torch.optim.Adam(model.parameters(), lr=lr)
    return optimiser, lr

In [11]:
def load_parl_corpus(rootdir_parl_corpus, lower=False):
    """Load every parliamentary corpus file found below a root folder.

    All files whose name contains "normalized_token_data.json" are loaded
    with ``ParlamentaryCorpus``. While loading, the key of every empty entry
    is printed together with the path of its file.

    Args:
        rootdir_parl_corpus: Folder that is walked recursively.
        lower: When True, every file is loaded a second time with
            ``load_data(lower=True)``.

    Returns:
        ``(corpora_normal_cap, corpora_lower)``; the second list stays empty
        unless ``lower`` is True.
    """
    paths = [
        os.path.join(subdir, file)
        for subdir, dirs, files in os.walk(rootdir_parl_corpus)
        for file in files
        if "normalized_token_data.json" in file
    ]

    corpora_normal_cap = []
    for path in paths:
        loaded = ParlamentaryCorpus(path).load_data()
        corpora_normal_cap.append(loaded)
        # Report entries that came back empty.
        for k, v in loaded.items():
            if v == []:
                print(k)
                print(path)

    corpora_lower = []
    if lower == True:
        for path in paths:
            corpora_lower.append(ParlamentaryCorpus(path).load_data(lower=True))

    return corpora_normal_cap, corpora_lower

In [12]:
def parl_sentences_only(dictionary_corpus):
    """Flatten a list of dictionaries into one list of their values.

    The values are kept in order: dictionary by dictionary, and inside each
    dictionary in its own iteration order.
    """
    return [
        sentence
        for diction in dictionary_corpus
        for sentence in diction.values()
    ]

In [13]:
""" Provides dummy y-data so CoNLLDataset class can be used """
""" Not elegant but works perfectly fine :)                 """

def make_dummy_y(test_corpus):
    """Return placeholder labels: one "O" for every token of every sentence.

    The result has the same nesting as ``test_corpus`` so it can stand in for
    real labels where a label sequence is required.
    """
    return [["O"] * len(sentence) for sentence in test_corpus]

In [14]:
def binary_parl_from_iob2(array):
    """Map a flat sequence of tags to 0/1: "O" becomes 0, anything else 1."""
    return [0 if tag == "O" else 1 for tag in array]

In [15]:
def gold_labels_unlabled_data(unlabeled):
    """Derive 0/1 labels from capitalisation.

    A token gets label 1 when its first character is upper-case and 0
    otherwise. An empty token has no first character and gets 0.

    Args:
        unlabeled: Iterable of sentences, each an iterable of token strings.

    Returns:
        A list of label lists with the same nesting as ``unlabeled``.
    """
    # token[:1] is "" for an empty token (and "".isupper() is False), whereas
    # token[0] would raise IndexError.
    return [
        [1 if token[:1].isupper() else 0 for token in sentence]
        for sentence in unlabeled
    ]

In [16]:
def transform_to_binary(set):
    """Turn a sequence of labels into 0/1 flags (0 for "O", 1 otherwise).

    The parameter name shadows the builtin ``set``; it is left as is because
    it is part of the function's signature.
    """
    def as_flag(label):
        return 0 if label == "O" else 1

    return list(map(as_flag, set))

In [17]:
"""Making list of lists preds, same len as NorNE gold labels"""
""" test_dataset_ner_labels should be test_dataset.ner_labels """

def split_list_preds(preds, test_dataset_ner_labels):
    """Cut a flat sequence into consecutive pieces shaped like a reference.

    Args:
        preds: Flat sequence; anything that supports slicing.
        test_dataset_ner_labels: Nested reference; the length of each of its
            sublists gives the length of the matching slice of ``preds``.

    Returns:
        One slice of ``preds`` per reference sublist, in order.
    """
    chunks = []
    offset = 0
    for reference in test_dataset_ner_labels:
        size = len(reference)
        chunks.append(preds[offset:offset + size])
        offset += size
    return chunks

In [18]:
# Assumes that it is the same vocab in dev and test
from itertools import chain
def get_labels(train_split, add_UNK=True):
    """Build the label vocabulary in a reproducible order.

    Labels that contain "-" come first, ordered by the part after the first
    hyphen and then by the part before it (so "B-X" directly precedes "I-X").
    The labels without a hyphen follow in alphabetical order, and "@UNK" is
    appended last when requested.

    Args:
        train_split: Iterable of sentence dicts that have a "ner_tags" list.
        add_UNK: Append the "@UNK" label at the end of the vocabulary.

    Returns:
        The ordered list of labels.
    """
    # Sort AFTER removing duplicates. Sorting first and then going through a
    # set throws the order away again, which left labels without a hyphen in
    # hash-dependent order and could change label indices between runs.
    label_vocab = sorted({tag for sentence in train_split for tag in sentence["ner_tags"]})

    if add_UNK:
        label_vocab.append("@UNK")

    prefixed = [label for label in label_vocab if "-" in label]
    plain = [label for label in label_vocab if "-" not in label]
    prefixed.sort(key=lambda x: (x.split("-")[1], x.split("-")[0]))
    return prefixed + plain

In [19]:
""" Norne Data loading stuff"""

# Template for the CoNLL-U files; the placeholder is filled with a name from file_list.
path = "all_conllu/{0}.conllu"
# Index 0-2: Bokmaal dev/test/train, index 3-5: Nynorsk dev/test/train.
file_list = ["no_bokmaal-ud-dev", "no_bokmaal-ud-test", "no_bokmaal-ud-train", "no_nynorsk-ud-dev", "no_nynorsk-ud-test", "no_nynorsk-ud-train"]


# Bokmaal test and train splits (the dev split is not loaded in this cell).
test_split_no = convert_to_list_dict(path, file_list[1])
train_split_no = convert_to_list_dict(path, file_list[2])


# Nynorsk test and train splits.
test_split_ny = convert_to_list_dict(path, file_list[4])
train_split_ny = convert_to_list_dict(path, file_list[5])



# The label vocabulary is derived from the combined training data of both written standards.
train_split = train_split_no + train_split_ny
ner_vocab = get_labels(train_split)


print("Combining test set..")
test_split = test_split_no + test_split_ny
print("Success!")

# Parallel lists: tokens and NER tags of every test sentence.
x_test_tokens = [x["tokens"] for x in test_split]
y_test_labels = [y["ner_tags"] for y in test_split]
test_dataset = CoNLLDataset(x_test_tokens, y_test_labels, ner_vocab)

# One sentence per batch, in the original order.
test_loader = DataLoader(
        test_dataset, batch_size=1, shuffle=False, collate_fn=partial(collate_fn, gpu=gpu))

""" Gold labels for NorNE test_dataset"""
# Flatten the per-sentence label lists into one list of labels.
gold_labels = []
for sentence_labels in test_dataset.ner_labels:
    for label in sentence_labels:
        gold_labels.append(label)

Converting no_bokmaal-ud-test to list of dictionaries
     1939 elements converted..
Converting no_bokmaal-ud-train to list of dictionaries
     15696 elements converted..
Converting no_nynorsk-ud-test to list of dictionaries
     1511 elements converted..
Converting no_nynorsk-ud-train to list of dictionaries
     14174 elements converted..
Combining test set..
Success!


In [20]:
""" Parliamentary corpus data loading stuff"""

# Root folder of the parliamentary corpus. The original absolute path is kept
# as the default; set the PARL_CORPUS_ROOT environment variable to run the
# notebook on another machine without editing this cell.
rootdir = os.environ.get(
    "PARL_CORPUS_ROOT",
    r"C:\Users\Aarne\Desktop\Ferdig_code_folder\parl_corpus_full",
)
corpora_normal_cap, corpora_lower = load_parl_corpus(rootdir, lower=True)

# Original-case sentences are the source of the gold labels; the lower-cased
# copies are what the model gets to see.
gold_corp = parl_sentences_only(corpora_normal_cap)
test_parl = parl_sentences_only(corpora_lower)

# 1 for a token that starts with a capital letter, 0 otherwise.
_gold_labels_parl = gold_labels_unlabled_data(gold_corp)

# Placeholder labels, needed only because CoNLLDataset expects a label
# sequence for every sentence.
dummy_y_parl = make_dummy_y(test_parl)
parl_lower_corpus = CoNLLDataset(test_parl, dummy_y_parl, ner_vocab)

parl_loader = DataLoader(
    parl_lower_corpus, batch_size=1, shuffle=False, collate_fn=partial(collate_fn, gpu=gpu))

# Flat list of gold labels, one entry per token.
gold_labels_parl = [item for sublist in _gold_labels_parl for item in sublist]

In [196]:
""" sentence_numbers generated using random
random_sentence_numbers = []
for sen in range(1, 31):
    r = randint(0, len(gold_corp))
    if r not in random_sentence_numbers:
        random_sentence_numbers.append(r)
        """

# Fixed indices into gold_corp / test_parl so the same sample is inspected on every run.
# NOTE(review): the indices only point at the same sentences as long as the corpus
# files are loaded in the same order -- confirm that the loading order is stable.
sentence_numbers = [17238, 31177, 36326, 56785, 41297, 6916, 18332, 22270, 18040, 15062, 41225, 23693, 35717, 47842, 18370, 53265, 49502, 21644, 53000, 15387, 47433, 30984, 4540, 43388, 42241, 37460, 51399, 3828, 52673, 15665]


# Sampled sentences with their original capitalisation.
sentences = []
for i in sentence_numbers:
    sentences.append(gold_corp[i])
sample_sentences = sentences

# The same sentences, taken from the lower-cased corpus.
sentences_lower = []
for i in sentence_numbers:
    sentences_lower.append(test_parl[i])
sample_sentences_lower = sentences_lower

In [197]:
# Build the model around the pre-trained NB-BERT encoder, then overwrite all
# weights with the fine-tuned checkpoint.
bert_model = Bert(ner_vocab, "NbAiLab/nb-bert-base", freeze=False).to(device)
tokenizer = BertTokenizer.from_pretrained("NbAiLab/nb-bert-base", do_basic_tokenize=False)
# map_location lets a checkpoint that was saved from a GPU be loaded on
# whatever device this notebook runs on.
bert_model.load_state_dict(
    torch.load(
        r"C:\Users\Aarne\OneDrive - University of Bergen\Dokumenter\MSTR-PY\TRAIN ALL BERTS\trained\NB-BERT\NB-BERT_adam_lr_2e-05__maxEpochs_50__seed_42",
        map_location=device,
    )
)


Some weights of the model checkpoint at NbAiLab/nb-bert-base were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [198]:
# Capitalisation-based gold labels for the sampled sentences: nested, then flattened,
# then split again into one list per sentence.
_gold_sample_sentences = gold_labels_unlabled_data(sample_sentences)
gold_sentences_samples = [item for sublist  in _gold_sample_sentences for item in sublist]
gold_sentences_samples_splits = split_list_preds(gold_sentences_samples, sample_sentences)


# Placeholder labels so the lower-cased sample can be wrapped in a CoNLLDataset.
sample_sentence_lower_dummy = make_dummy_y(sample_sentences_lower)
sample_sentence_lower_corpus = CoNLLDataset(sample_sentences_lower, sample_sentence_lower_dummy, ner_vocab)


sample_loader = DataLoader(sample_sentence_lower_corpus, batch_size=1, shuffle=False, collate_fn=partial(collate_fn, gpu=gpu))
# Flat list of predicted labels for all sampled sentences.
sample_sentence_preds = predict_test(sample_loader, ner_vocab, tokenizer, bert_model)
# The same predictions regrouped per sentence.
sample_sentence_preds_splits = split_list_preds(sample_sentence_preds, sample_sentence_lower_dummy)
# Predictions reduced to 0 ("O") / 1 (any other label), flat and then regrouped per sentence.
sample_sentence_preds_BINARY = binary_parl_from_iob2(sample_sentence_preds)
sample_sentence_preds_BINARY_splits = split_list_preds(sample_sentence_preds_BINARY, sample_sentence_lower_dummy)


100%|██████████| 30/30 [00:00<00:00, 67.89it/s]


In [199]:
# Error tallies, keyed by lower-cased token. Each value is a dict with a
# "Count" and the list of "Sentence"s in which the error occurred.
false_negatives = {}

false_positives = {}

In [204]:
gold_sentences_samples_splits

[[1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0, 0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0, 0, 0, 0, 0, 0, 0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1],
 [0, 0],
 [0, 0, 0],
 [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  

In [200]:


def _record_error(tally, token, sentence):
    """Count one error for ``token`` and remember the sentence it occurred in."""
    entry = tally.setdefault(token, {"Count": 0, "Sentence": []})
    entry["Count"] += 1
    entry["Sentence"].append(sentence)


# Start from empty tallies so that running this cell again does not count
# every error a second time.
false_negatives, false_positives = {}, {}

for i, (gold, pred) in enumerate(zip(gold_sentences_samples_splits, sample_sentence_preds_BINARY_splits)):
    if gold == pred:
        continue
    for word_index, (word_g, word_p) in enumerate(zip(gold, pred)):
        if word_g == word_p:
            continue
        # Keyed by the lower-cased token; the stored sentence keeps its
        # original capitalisation.
        if word_p == 0:
            # Predicted 0 where gold differs: false negative.
            _record_error(false_negatives, sample_sentences_lower[i][word_index], sample_sentences[i])
        elif word_p == 1:
            # Predicted 1 where gold differs: false positive.
            _record_error(false_positives, sample_sentences_lower[i][word_index], sample_sentences[i])




In [205]:
gold_sentences_samples_splits

[[1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0, 0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0, 0, 0, 0, 0, 0, 0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1],
 [0, 0],
 [0, 0, 0],
 [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  

In [202]:
"sykehuspartners" in false_negatives.keys()


True

In [126]:
# Scratch example: a token ("ord") mapped to a count and a list of sentences.
fn = {
    "ord": {
        "count": 1,
        "sen": [ ]
    },
}

In [130]:
# Scratch example: read the nested count, add one, and write it back.
cn = fn["ord"]["count"] 
cn +=1
fn["ord"]["count"] = cn

In [131]:
fn

{'ord': {'count': 2, 'sen': []}}

In [25]:
# relative paths only works half of the time on my windows machine :(
# Hugging Face model name -> folder holding the fine-tuned checkpoints.
# NOTE(review): "ltgoslo/norbert2" points at the NB-BERT folder -- confirm
# that this is intended and not a copy-paste slip.
model_types = {
    "bert-base-multilingual-cased": r"C:\Users\Aarne\OneDrive - University of Bergen\Dokumenter\MSTR-PY\TRAIN ALL BERTS\trained\mBERT",
    "ltgoslo/norbert": r"C:\Users\Aarne\OneDrive - University of Bergen\Dokumenter\MSTR-PY\TRAIN ALL BERTS\trained\norBERT",
    "ltgoslo/norbert2": r"C:\Users\Aarne\OneDrive - University of Bergen\Dokumenter\MSTR-PY\TRAIN ALL BERTS\trained\NB-BERT",
    "NbAiLab/nb-bert-base": r"C:\Users\Aarne\Desktop\Ferdig_code_folder\BERT models\NB-BERT\trained models nb-bert",
    "saattrupdan/nbailab-base-ner-scandi": r"C:\Users\Aarne\Desktop\Ferdig_code_folder\BERT models\scandi-bert\trained models scandi_bert"
}

# Print the short model name, i.e. the part after the organisation prefix.
# Only the keys are iterated: the former loop variable `type` replaced the
# builtin of the same name for every cell that ran afterwards.
for name in model_types:
    # rsplit(...)[-1] returns the whole name when it contains no "/".
    model_type_name = name.rsplit('/', 1)[-1]
    print(model_type_name)


bert-base-multilingual-cased
norbert
norbert2
nb-bert-base
nbailab-base-ner-scandi
