In [1]:
import conllu
import tqdm
import torch
import json
from sklearn.metrics import confusion_matrix, classification_report
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn import functional
from functools import partial
from transformers import BertModel
from torch import nn
from transformers import BertTokenizer
import os
import pandas as pd
from get_parl_corpus_token_data import ParlamentaryCorpus


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
""" Could tidy up this one to just use device instead of calling gpu=true etc."""
# `gpu` is kept as a plain boolean because later cells pass it on
# (e.g. to collate_fn and predict).
gpu = torch.cuda.is_available()
if gpu:
    print("Using GPU")

# Fall back to the CPU instead of unconditionally selecting CUDA, so that the
# notebook still runs on machines without a GPU.
device = torch.device("cuda" if gpu else "cpu")
print(device)

Using GPU
cuda


In [3]:
## Conllu stuff
def filter_tags(x):
    """Return the NER tag ``x`` unchanged.

    This is an identity hook: it is applied to every tag read from the
    CoNLL-U files, so tag filtering/remapping can be added here later.
    """
    return x

def convert_to_list_dict(path, file):
    """Read a CoNLL-U file into a list of per-sentence dictionaries.

    Args:
        path: Path template containing one ``str.format`` placeholder that is
            filled with ``file`` (e.g. ``"all_conllu/{0}.conllu"``).
        file: Name inserted into ``path``; also used in the progress message.

    Returns:
        A list with one dict per sentence holding the keys ``idx``, ``text``,
        ``tokens``, ``lemmas``, ``pos_tags`` and ``ner_tags``. ``text`` and
        ``tokens`` are lower-cased; a token without a ``name`` entry in its
        MISC column gets the tag ``"O"``.
    """
    path = path.format(file)
    lst = []
    with open(path, encoding="UTF-8") as infile:
        # parse_incr yields sentences lazily, so there is no need to
        # materialise the whole file before converting it.
        for sent in conllu.parse_incr(infile):
            lst.append({
                "idx": sent.metadata["sent_id"],
                "text": sent.metadata["text"].lower(),
                "tokens": [token["form"].lower() for token in sent],
                "lemmas": [token["lemma"] for token in sent],
                "pos_tags": [token["upos"] for token in sent],
                # An empty MISC column ("_") is parsed as None rather than a
                # dict, so guard the lookup instead of crashing on .get().
                "ner_tags": [
                    filter_tags((token["misc"] or {}).get("name", "O"))
                    for token in sent
                ],
            })
    print("Converting {} to list of dictionaries\n     {} elements converted..".format(file, len(lst)))
    return lst

In [4]:
# https://github.com/ltgoslo/NorBERT/blob/main/benchmarking/experiments/dataset.py



class CoNLLDataset(Dataset):
    """Token/label dataset for NER evaluation.

    Each item is a pair ``(tokens, label_ids)`` where ``tokens`` is the list
    of tokens of one sentence and ``label_ids`` is a ``torch.LongTensor`` with
    the vocabulary index of every label. Labels that are not in the
    vocabulary are mapped to the index of ``'@UNK'``.

    Args:
        x_tokens: Iterable of sentences, each an iterable of tokens.
        y_labels: Iterable of label sequences, parallel to ``x_tokens``.
        ner_vocab: Ordered label vocabulary; the position of a label is its
            id. Pass the vocabulary explicitly whenever the ids must match a
            trained model. If omitted, a deterministic vocabulary is built
            from the sorted labels seen in ``y_labels`` with ``'@UNK'`` last
            (previously ``None`` raised a ``TypeError``).
    """

    def __init__(self, x_tokens, y_labels, ner_vocab=None):
        # Copy the nested sequences so the dataset owns its own lists.
        self.tokens = [list(entry) for entry in x_tokens]
        self.ner_labels = [list(entry) for entry in y_labels]

        if ner_vocab is None:
            # Sorted (not set-ordered) so the label ids are reproducible.
            seen = sorted({label for entry in self.ner_labels for label in entry})
            ner_vocab = [label for label in seen if label != '@UNK'] + ['@UNK']

        self.ner_vocab = ner_vocab
        self.ner_indexer = {label: index for index, label in enumerate(self.ner_vocab)}

    def __getitem__(self, index):
        indexer = self.ner_indexer
        x = self.tokens[index]
        # Dict membership replaces the linear scan of the vocabulary list;
        # '@UNK' is only looked up when an unknown label actually occurs.
        y = torch.LongTensor([
            indexer[label] if label in indexer else indexer['@UNK']
            for label in self.ner_labels[index]
        ])
        return x, y

    def __len__(self):
        return len(self.tokens)



In [5]:
""" Dynamic padding. Takes the longest sentence in batch and pads other sentences to its length (if im not mistaken)"""
# Function borrowed from https://github.com/ltgoslo/NorBERT/blob/main/benchmarking/experiments/bert_ner.py

def collate_fn(batch, gpu=False):
    """Collate ``(tokens, label_tensor)`` pairs into one batch.

    The label tensors are right-padded with ``-1`` up to the length of the
    longest one in the batch and stacked; the token lists are returned
    unpadded, in batch order.

    Args:
        batch: Sequence of ``(tokens, labels)`` pairs, ``labels`` being a 1-D
            tensor.
        gpu: If true, the stacked label tensor is moved to ``"cuda"``.

    Returns:
        ``(list_of_token_lists, stacked_label_tensor)``.
    """
    sentences = []
    label_rows = []
    for sentence, labels in batch:
        sentences.append(sentence)
        label_rows.append(labels)

    width = max([labels.size(0) for labels in label_rows])
    padded_rows = [
        functional.pad(labels, (0, width - labels.size(0)), value=-1)
        for labels in label_rows
    ]
    stacked = torch.stack(padded_rows)
    if gpu:
        stacked = stacked.to("cuda")
    return sentences, stacked

In [6]:
class Bert(nn.Module):
    """BERT encoder followed by a linear token-classification layer.

    Args:
        ner_vocab: Label vocabulary; its length is the number of output
            classes.
        model_path: Name or path handed to ``BertModel.from_pretrained``.
        freeze: If true, all encoder parameters get ``requires_grad = False``
            so that only the linear layer is trainable.
    """

    def __init__(self, ner_vocab, model_path=None, freeze=False):
        super().__init__()
        self._bert = BertModel.from_pretrained(model_path)
        hidden_size = self._bert.config.hidden_size
        self._linear = nn.Linear(hidden_size, len(ner_vocab))

        if freeze:
            for param in self._bert.parameters():
                param.requires_grad = False  # Freezing bert layer

    def forward(self, batch, mask):
        """Return class scores for the selected sub-token positions.

        Args:
            batch: Mapping with ``"input_ids"`` and ``"attention_mask"``.
            mask: Integer indices of shape ``(batch, n_positions)``; row ``i``
                lists the positions of sentence ``i`` whose hidden states are
                classified.

        Returns:
            Tensor of shape ``(batch, n_positions, len(ner_vocab))``.
        """
        encoded = self._bert(
            input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]
        )
        hidden = encoded.last_hidden_state
        positions = torch.as_tensor(mask, device=hidden.device)
        # Pick hidden[i, positions[i, t]] directly. This yields the same
        # values as `hidden[:, mask].diagonal().permute(2, 0, 1)` without
        # first building the (batch x batch) cross product of all rows.
        rows = torch.arange(hidden.size(0), device=hidden.device).unsqueeze(1)
        return self._linear(hidden[rows, positions])

In [7]:
# https://github.com/ltgoslo/NorBERT/blob/main/benchmarking/experiments/bert_ner.py 

def build_mask(tokenizer, ids):
    """Collect, per sentence, the positions of word-initial sub-tokens.

    A position is skipped when its token starts with ``"##"`` (word-piece
    continuation) or is one of ``tokenizer.all_special_tokens[1:]``. The first
    special token is deliberately not skipped -- presumably the unknown token,
    so that unknown words still get a position (TODO confirm for the
    tokenizers in use).

    Args:
        tokenizer: Tokenizer providing ``all_special_tokens``,
            ``convert_ids_to_tokens`` and ``pad``.
        ids: Iterable of token-id sequences, one per sentence.

    Returns:
        The ``"input_ids"`` entry of ``tokenizer.pad`` applied to the position
        lists, i.e. the positions padded to a common length.
    """
    # Hoisted out of the loops: the original re-read and re-sliced the
    # special-token list for every single token.
    skipped = set(tokenizer.all_special_tokens[1:])

    mask = []
    for sentence_ids in ids:
        pieces = tokenizer.convert_ids_to_tokens(sentence_ids)
        mask.append([
            position
            for position, piece in enumerate(pieces)
            if piece not in skipped and not piece.startswith("##")
        ])

    return tokenizer.pad({"input_ids": mask}, return_tensors="pt")["input_ids"]

In [8]:
def predict(input_data, tokenizer, model, gpu=False):
    """Predict a label id for every word of a batch of pre-split sentences.

    Args:
        input_data: Batch of sentences, each given as a list of words.
        tokenizer: Tokenizer used to encode the words and to build the
            word-position mask.
        model: Callable taking ``(encoded_batch, mask)`` and returning scores
            of shape ``(batch, positions, classes)``.
        gpu: If true, the encoded batch is moved to ``"cuda"``.

    Returns:
        Tensor of shape ``(batch, positions)`` with the arg-max class index.
    """
    encoded = tokenizer(
        input_data, is_split_into_words=True, return_tensors="pt", padding=True
    )
    if gpu:
        encoded = encoded.to("cuda")
    batch_mask = build_mask(tokenizer, encoded["input_ids"])
    # Only the arg-max is returned, so no autograd graph is needed; skipping
    # it saves memory and time during inference.
    with torch.no_grad():
        scores = model(encoded, batch_mask)
    # Arg-max over the class axis (same result as permute(0, 2, 1).argmax(dim=1)).
    return scores.argmax(dim=-1)

In [9]:
def predict_test(test_set, ner_vocab, tokenizer, model):
    """Predict labels for a whole data loader and return them as a flat list.

    The model is switched to evaluation mode. Only row 0 of each prediction
    batch is read, so ``test_set`` is expected to yield one sentence per batch
    (``batch_size=1``). Relies on the notebook-level ``gpu`` flag.

    Args:
        test_set: Iterable of ``(tokens, labels)`` batches; the labels are not
            used here.
        ner_vocab: Sequence mapping a predicted class index to its label.
        tokenizer: Tokenizer forwarded to ``predict``.
        model: Model forwarded to ``predict``.

    Returns:
        Flat list with the predicted label of every word, in data order.
    """
    model.eval()
    predicted_labels = []
    # Evaluation only: disable gradient tracking for the whole loop.
    with torch.no_grad():
        for x, y in tqdm.tqdm(test_set):
            y_pred = predict(x, tokenizer, model, gpu=gpu)
            # One tolist() transfer instead of indexing the vocabulary with a
            # 0-dim (possibly GPU) tensor per word.
            predicted_labels.extend(ner_vocab[index] for index in y_pred[0].tolist())
    return predicted_labels

In [10]:
def is_freeze(freeze, model):
    """Create the Adam optimiser matching the freezing mode.

    Args:
        freeze: If true, a learning rate of 0.001 is used, otherwise 2e-5.
        model: Module whose ``parameters()`` are handed to the optimiser.

    Returns:
        ``(optimiser, lr)``.
    """
    lr = 0.001 if freeze else 2e-5
    return torch.optim.Adam(model.parameters(), lr=lr), lr

In [11]:
def load_parl_corpus(rootdir_parl_corpus, lower=False):
    """Load every parliamentary corpus file found below a directory.

    All files whose name contains ``"normalized_token_data.json"`` are loaded
    through ``ParlamentaryCorpus(path).load_data()``. For every entry whose
    value is an empty list, its key and the file path are printed.

    Args:
        rootdir_parl_corpus: Directory that is walked recursively.
        lower: If ``True``, each file is additionally loaded with
            ``load_data(lower=True)``.

    Returns:
        ``(corpora_normal_cap, corpora_lower)``: two lists in the same file
        order; the second one is empty unless ``lower`` is ``True``.
    """
    paths = [
        os.path.join(subdir, file)
        for subdir, _dirs, files in os.walk(rootdir_parl_corpus)
        for file in files
        if "normalized_token_data.json" in file
    ]

    corpora_normal_cap = []
    for path in paths:
        loaded = ParlamentaryCorpus(path).load_data()
        corpora_normal_cap.append(loaded)
        for k, v in loaded.items():
            if v == []:
                print(k)
                print(path)

    corpora_lower = []
    if lower == True:
        for path in paths:
            corpora_lower.append(ParlamentaryCorpus(path).load_data(lower=True))

    return corpora_normal_cap, corpora_lower

In [12]:
def parl_sentences_only(dictionary_corpus):
    """Flatten a list of corpus dictionaries into one list of their values.

    The values are concatenated in dictionary order, corpus by corpus; the
    keys are dropped.
    """
    return [
        sentence
        for diction in dictionary_corpus
        for sentence in diction.values()
    ]

In [13]:
""" Provides dummy y-data so CoNLLDataset class can be used """
""" Not elegant but works perfectly fine :)                 """

def make_dummy_y(test_corpus):
    """Build placeholder labels: one ``"O"`` per token of every sentence.

    Returns a list parallel to ``test_corpus`` whose entries are lists of
    ``"O"`` with the same length as the corresponding sentence.
    """
    return [["O"] * len(sentence) for sentence in test_corpus]

In [14]:
def binary_parl_from_iob2(array):
    """Collapse IOB2 tags to binary labels: ``"O"`` -> 0, anything else -> 1."""
    return [0 if tag == "O" else 1 for tag in array]

In [15]:
def gold_labels_unlabled_data(unlabeled):
    """Derive binary labels from capitalisation.

    Args:
        unlabeled: Iterable of sentences, each an iterable of token strings.

    Returns:
        A list parallel to ``unlabeled``; every token is labelled 1 if its
        first character is upper-case and 0 otherwise. An empty token is
        labelled 0 (it used to raise ``IndexError``).
    """
    # token[:1] is "" for an empty token, and "".isupper() is False.
    return [
        [1 if token[:1].isupper() else 0 for token in sentence]
        for sentence in unlabeled
    ]

In [16]:
# Label vocabulary. The position of a label in this list is its class index,
# so the order must not be changed. The last two entries ('O' and '@UNK') are
# the ones sliced off with ner_vocab[:-2] when the entity labels are reported.
ner_vocab = [
    'B-GPE_LOC', 'I-DRV', 'I-LOC', 'B-PER',
    'I-PER', 'B-PROD', 'I-GPE_ORG', 'B-GPE_ORG',
    'B-EVT', 'B-DRV', 'I-PROD', 'B-ORG',
    'B-MISC', 'I-MISC', 'I-GPE_LOC', 'B-LOC',
    'I-ORG', 'I-EVT',
    'O',
    '@UNK',
]

In [17]:
""" Norne Data loading stuff"""

# Path template and the two NorNE test files (Bokmaal and Nynorsk).
path = "all_conllu/{0}.conllu"
file_list = ["no_bokmaal-ud-test", "no_nynorsk-ud-test"]

bokmaal_file, nynorsk_file = file_list
test_split_no = convert_to_list_dict(path, bokmaal_file)
test_split_ny = convert_to_list_dict(path, nynorsk_file)

print("Combining test set..")
test_split = [*test_split_no, *test_split_ny]
print("Success!")

# Tokens and NER tags per sentence, wrapped in the dataset used for evaluation.
x_test_tokens = []
y_test_labels = []
for sentence in test_split:
    x_test_tokens.append(sentence["tokens"])
    y_test_labels.append(sentence["ner_tags"])
test_dataset = CoNLLDataset(x_test_tokens, y_test_labels, ner_vocab)

# One sentence per batch, in file order.
test_loader = DataLoader(
    test_dataset,
    batch_size=1,
    shuffle=False,
    collate_fn=partial(collate_fn, gpu=gpu),
)

""" Gold labels for NorNE test_dataset"""
# Flatten the per-sentence tag lists into one list of gold tags.
gold_labels = [
    tag
    for sentence_tags in test_dataset.ner_labels
    for tag in sentence_tags
]

Converting no_bokmaal-ud-test to list of dictionaries
     1939 elements converted..
Converting no_nynorsk-ud-test to list of dictionaries
     1511 elements converted..
Combining test set..
Success!


In [18]:
""" Parliamentary corpus data loading stuff"""

# Location of the parliamentary corpus. It can be overridden with the
# PARL_CORPUS_DIR environment variable; the default is the original local path.
rootdir = os.environ.get(
    "PARL_CORPUS_DIR",
    r"C:\Users\Aarne\Desktop\Ferdig_code_folder\parl_corpus_full",
)
corpora_normal_cap, corpora_lower = load_parl_corpus(rootdir, lower=True)
gold_corp = parl_sentences_only(corpora_normal_cap)
test_parl = parl_sentences_only(corpora_lower)

# os.walk silently yields nothing for a missing directory, so fail early with
# a clear message instead of evaluating on an empty data set.
if not gold_corp:
    raise FileNotFoundError(f"No parliamentary corpus sentences were loaded from {rootdir!r}")

# Labels derived from the original casing: 1 for a capitalised token, else 0.
_gold_labels_parl = gold_labels_unlabled_data(gold_corp)

# The lower-cased sentences are the model input; the dataset class needs
# labels, so placeholder "O" labels are supplied.
dummy_y_parl = make_dummy_y(test_parl)
parl_lower_corpus = CoNLLDataset(test_parl, dummy_y_parl, ner_vocab)

parl_loader = DataLoader(
        parl_lower_corpus, batch_size=1, shuffle=False, collate_fn=partial(collate_fn, gpu=gpu))

# Flat list of labels, one per token, in corpus order.
gold_labels_parl = [item for sublist in _gold_labels_parl for item in sublist]

In [19]:


# Relative paths only work half of the time on my Windows machine :(
# Maps each Hugging Face model id (used for the tokenizer and the encoder
# architecture) to the directory that holds its fine-tuned checkpoints.
model_types = {
    "ltgoslo/norbert2": r"C:\Users\Aarne\Desktop\Ferdig_code_folder\BERT models\NORBERT2\trained models",
    "NbAiLab/nb-bert-base": r"C:\Users\Aarne\Desktop\Ferdig_code_folder\BERT models\NB-BERT\trained models nb-bert",
    "saattrupdan/nbailab-base-ner-scandi": r"C:\Users\Aarne\Desktop\Ferdig_code_folder\BERT models\scandi-bert\trained models scandi_bert"
}


def _append_report(frame, out_dir, file_name, title_prefix, seed):
    """Append a titled CSV dump of ``frame`` to ``out_dir/file_name``.

    A blank line and a title line are written before the table so that the
    results of several checkpoints can be told apart within one file.
    """
    # os.path.join replaces the f"{dir}\{name}" concatenation, which relied on
    # an invalid escape sequence and only produced a real path on Windows.
    out_path = os.path.join(out_dir, file_name)
    with open(out_path, "a") as f:
        # NOTE(review): the [3:-4] slice leaves part of the file prefix in the
        # title (e.g. "NorNE_ne_CR_..."); kept unchanged so that titles stay
        # identical to those already written by earlier runs.
        f.write(f"\n{title_prefix}_{file_name[3:-4]}_seed_{seed}\n")
    frame.to_csv(out_path, mode="a")


for model_type, trained_models_path in model_types.items():
    tokenizer = BertTokenizer.from_pretrained(model_type, do_basic_tokenize=False)
    model_type_name = model_type.rsplit('/', 1)[1]

    for subdir, dirs, files in os.walk(trained_models_path):
        for file in files:
            # Every non-JSON file in the directory is treated as a checkpoint.
            if ".json" in file:
                continue

            bert_train_path = os.path.join(subdir, file)
            bert_model = Bert(ner_vocab, model_type, freeze=False).to(device)
            # map_location lets checkpoints saved on a GPU load on `device`,
            # whatever it is on this machine.
            bert_model.load_state_dict(torch.load(bert_train_path, map_location=device))

            # The text after the last underscore of the checkpoint path is
            # used as the seed in the report titles.
            seed = bert_train_path.rsplit('_')[-1]

            # Results go to ./<model name>/ below the current directory.
            final_directory = os.path.join(os.getcwd(), model_type_name)
            os.makedirs(final_directory, exist_ok=True)

            # Writing stuff for norne data
            norne_labels = ner_vocab[:-2]  # reported labels: all but 'O' and '@UNK'
            y_pred_norne = predict_test(test_loader, ner_vocab, tokenizer, bert_model)
            norne_cr = classification_report(
                gold_labels, y_pred_norne, labels=norne_labels, digits=5, output_dict=True)
            norne_cf_matrix = confusion_matrix(gold_labels, y_pred_norne, labels=norne_labels)

            norne_cr_df = pd.DataFrame(norne_cr).transpose()
            norne_cf_df = pd.DataFrame(norne_cf_matrix, index=norne_labels, columns=norne_labels)

            _append_report(norne_cr_df, final_directory,
                           f"norne_CR_{model_type_name}.csv", "NorNE", seed)
            _append_report(norne_cf_df, final_directory,
                           f"norne_CF_MATRIX_{model_type_name}.csv", "NorNE", seed)

            # Parl test
            y_pred_parl = predict_test(parl_loader, ner_vocab, tokenizer, bert_model)
            y_pred_parl_binary = binary_parl_from_iob2(y_pred_parl)

            parl_cr = classification_report(
                gold_labels_parl, y_pred_parl_binary, digits=5, output_dict=True)
            # Explicit labels keep the matrix 2x2 even if one class never
            # occurs, so the [0, 1] axes below always fit.
            parl_cf_matrix = confusion_matrix(gold_labels_parl, y_pred_parl_binary, labels=[0, 1])

            parl_cr_df = pd.DataFrame(parl_cr).transpose()
            parl_cf_df = pd.DataFrame(parl_cf_matrix, index=[0, 1], columns=[0, 1])

            _append_report(parl_cr_df, final_directory,
                           f"parl_CR_{model_type_name}.csv", "parl", seed)
            _append_report(parl_cf_df, final_directory,
                           f"parl_CF_MATRIX_{model_type_name}.csv", "parl", seed)

                


Some weights of the model checkpoint at ltgoslo/norbert2 were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 3450/3450 [00:37<00:00, 92.05it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(avera

In [None]:




# rootdir=r"C:\Users\Aarne\Desktop\Ferdig_code_folder\parl_corpus_full\20170215"

# path_list_parl = []

# for subdir, dirs, files in os.walk(rootdir):
#     for file in files:
#         if "normalized_token_data.json" in file:
#             path = (os.path.join(subdir, file))
#             path_list_parl.append(path)


# # relative paths only works half of the time on my windows machine :(
# model_types = {
#     "ltgoslo/norbert2": r"C:\Users\Aarne\Desktop\Ferdig_code_folder\BERT models\NORBERT2\trained models",
#     "NbAiLab/nb-bert-base": r"C:\Users\Aarne\Desktop\Ferdig_code_folder\BERT models\NB-BERT\trained models nb-bert",
#     "saattrupdan/nbailab-base-ner-scandi": r"C:\Users\Aarne\Desktop\Ferdig_code_folder\BERT models\scandi-bert\trained models scandi_bert"
# }


# for model_type, trained_models_path in model_types.items():
#     TP = 0
#     FP = 0
#     FN = 0
#     TN = 0
#     tokenizer = BertTokenizer.from_pretrained(model_type, do_basic_tokenize=False)
#     for subdir, dirs, files in os.walk(trained_models_path):
#         for file in files:
#             if ".json" not in file:
#                 for path in path_list_parl:
#                     bert_model = Bert(ner_vocab, model_type, freeze=False).to(device)
#                     bert_train_path = os.path.join(subdir, file)
#                     bert_model.load_state_dict(torch.load(bert_train_path))
                
#                     #LOADING CORPUS

#                     cn = ParlamentaryCorpus(path)
#                     cl = ParlamentaryCorpus(path)

#                     _gold_corp = cn.load_data(lower=False)
#                     _test_parl = cl.load_data(lower=True)

#                     gold_corp = list(_gold_corp.values())
#                     test_parl = list(_test_parl.values())

#                     gold_corp = gold_corp[1170:1171]
#                     test_parl = test_parl[1170:1171]


                    
#                     gold_corp = gold_corp[1170:]
#                     test_parl = test_parl[1170:]
                    
#                     length = 0
#                     gold_labels = []
#                     for sentences in gold_corp:
#                         annotation = []
#                         length += len(sentences)
#                         for token in sentences:
#                             if token[0].isupper() == True: # Checks if token has captial letter
#                                 annotation.append(1)
                                
#                             else:
#                                 annotation.append(0) # Adds 0 if token has lower 
#                         gold_labels.append(annotation)


#                     dummy_y_parl = make_dummy_y(test_parl)
#                     parl_lower_corpus = CoNLLDataset(test_parl, dummy_y_parl, ner_vocab)

#                     parl_loader = DataLoader(
#                             parl_lower_corpus, batch_size=1, shuffle=False, collate_fn=partial(collate_fn, gpu=gpu))

#                     gold_labels_parl = [item for sublist  in gold_labels for item in sublist]


#                     # Parl test 
#                     y_pred_parl = predict_test(parl_loader, ner_vocab, tokenizer, bert_model)
#                     y_pred_parl_binary = binary_parl_from_iob2(y_pred_parl)

#                     parl_cr = classification_report(gold_labels_parl, y_pred_parl_binary, digits=5, output_dict=True)
#                     parl_cf_matrix = confusion_matrix(gold_labels_parl, y_pred_parl_binary)
                    
#                     TP += parl_cf_matrix[0][0]
#                     FP += parl_cf_matrix[0][1]
#                     FN += parl_cf_matrix[1][0]
#                     TN += parl_cf_matrix[1][1]

#         confusion_matrix_dictionary = {
#             "TP": TP,
#             "FP": FP,
#             "FN": FN,
#             "TN": TN
#         }

#                     # parl_cr_df = pd.DataFrame(parl_cr).transpose()
#                     # parl_cf_df = pd.DataFrame(parl_cf_matrix)

#                     # parl_cf_df.columns = [0, 1]
#                     # parl_cf_df.index = [0, 1]

#                     # model_type_name = model_type.rsplit('/',1)[1]
#                     # file_name_cr = f"parl_CR_{model_type_name}.csv" 
#                     # file_name_cf = f"parl_CF_MATRIX_{model_type_name}.csv"

#                     # current_directory = os.getcwd()
#                     # final_directory = os.path.join(current_directory, rf"{model_type_name}")
#                     # if not os.path.exists(final_directory):
#                     #     os.makedirs(final_directory)


#                     # with open(f"{final_directory}\{file_name_cr}", "a") as f:
#                     #     f.write("\n")
#                     #     f.write(f"parl_{file_name_cr[3:-4]}_seed_{bert_train_path.rsplit('_')[-1]}")
#                     #     f.write("\n")
#                     # with open(f"{final_directory}\{file_name_cf}", "a") as f:
#                     #     f.write("\n")
#                     #     f.write(f"parl_{file_name_cf[3:-4]}_seed_{bert_train_path.rsplit('_')[-1]}")
#                     #     f.write("\n")

#                     # parl_cr_df.to_csv(f"{final_directory}\{file_name_cr}", mode="a")
#                     # parl_cf_df.to_csv(f"{final_directory}\{file_name_cf}", mode="a")

                    
