In [89]:
import conllu
import tqdm
import torch
import json
from sklearn.metrics import confusion_matrix, classification_report
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn import functional
from functools import partial
from transformers import BertModel
from torch import nn
from transformers import BertTokenizer
import os
import pandas as pd
from get_parl_corpus_token_data import ParlamentaryCorpus
from seqeval.scheme import IOB2
from seqeval.metrics import classification_report as cr
from seqeval.metrics import performance_measure
from seqeval.scheme import IOB2
from seqeval.metrics import performance_measure


In [90]:
""" Could tidy up this one to just use device instead of calling gpu=true etc."""
if torch.cuda.is_available():
    gpu = True
    print("Using GPU")
else:
    gpu = False

device = torch.device("cuda")
print(device)

cuda


In [91]:
## Conllu stuff
def filter_tags(x):
    return x        

def convert_to_list_dict(path, file):
    path = path.format(file)
    with open(path, encoding="UTF-8") as infile:
        lst = []
        tokens = list(conllu.parse_incr(infile))
        for sent in tokens:
                dic = {
                "idx": sent.metadata["sent_id"],
                "text": sent.metadata["text"].lower(),
                "tokens": [token["form"].lower() for token in sent],
                "lemmas": [token["lemma"] for token in sent],
                "pos_tags": [token["upos"] for token in sent],
                "ner_tags": [filter_tags(token["misc"].get("name", "O")) for token in sent],
            }
                lst.append(dic) 
        print("Converting {} to list of dictionaries\n     {} elements converted..".format(file, len(lst)))
    return lst

In [92]:
# https://github.com/ltgoslo/NorBERT/blob/main/benchmarking/experiments/dataset.py



class CoNLLDataset(Dataset):
    def __init__(self, x_tokens, y_labels, ner_vocab=None):
        self.tokens = [[x for x in entry] for entry in x_tokens]
        self.ner_labels = [[y for y in entry] for entry in y_labels]

        # hard coded ner_vocab to avoid random shuffled instanciation of ordering of ner_vocab
        self.ner_vocab = ner_vocab
        self.ner_indexer = {i: n for n, i in enumerate(self.ner_vocab)}
    
    def __getitem__(self, index):
        tokens = self.tokens[index]
        ner_labels = self.ner_labels[index]

        x = tokens
        y = torch.LongTensor([self.ner_indexer[i] if i in self.ner_vocab
                              else self.ner_indexer['@UNK'] for i in ner_labels])
        return x, y

    def __len__(self):
        return len(self.tokens)



In [93]:
""" Dynamic padding. Takes the longest sentence in batch and pads other sentences to its length (if im not mistaken)"""
# Function borrowed from https://github.com/ltgoslo/NorBERT/blob/main/benchmarking/experiments/bert_ner.py

def collate_fn(batch, gpu=False):
    longest_y = max([y.size(0) for X, y in batch])
    x = [X for X, y in batch]
    y = torch.stack(
        [functional.pad(y, (0, longest_y - y.size(0)), value=-1) for X, y in batch]) #https://pytorch.org/docs/stable/generated/torch.nn.functional.pad.html
    if gpu:
        y = y.to("cuda")
    return x, y

In [94]:
class Bert(nn.Module):
    def __init__(self, ner_vocab, model_path=None, freeze=False):
        super().__init__()
        self._bert = BertModel.from_pretrained(
            model_path
        )
        hidden_size = self._bert.config.hidden_size
        self._linear = nn.Linear(hidden_size, len(ner_vocab))

        if freeze:
            for param in self._bert.parameters():
                param.requires_grad = False #Freezing bert layer

    def forward(self, batch, mask):
        b = self._bert(
            input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]
        )
        pooler = b.last_hidden_state[:, mask].diagonal().permute(2, 0, 1) #https://pytorch.org/docs/stable/generated/torch.permute.html
        return self._linear(pooler)                                     #https://pytorch.org/docs/stable/generated/torch.diagonal.html 

In [95]:
# https://github.com/ltgoslo/NorBERT/blob/main/benchmarking/experiments/bert_ner.py 

def build_mask(tokenizer, ids):
    tok_sents = [tokenizer.convert_ids_to_tokens(i) for i in ids]
    mask = []
    for sentence in tok_sents:
        current = []
        for n, token in enumerate(sentence):
            if token in tokenizer.all_special_tokens[1:] or token.startswith("##"): # ## masked
                continue
            else:
                current.append(n)
        mask.append(current)

    mask = tokenizer.pad({"input_ids": mask}, return_tensors="pt")["input_ids"]
    return mask

In [96]:
def predict(input_data, tokenizer, model, gpu=False):
    input_data = tokenizer(
        input_data, is_split_into_words=True, return_tensors="pt", padding=True
    )
    if gpu:
        input_data = input_data.to("cuda")
    batch_mask = build_mask(tokenizer, input_data["input_ids"])
    y_pred = model(input_data, batch_mask).permute(0, 2, 1).argmax(dim=1)
    return y_pred

In [97]:
def predict_test(test_set, ner_vocab, tokenizer, model):
    model.eval()
    predicted_labels = []
    test_set = tqdm.tqdm(test_set)
    for x, y in test_set:
        y_pred = predict(x, tokenizer, model, gpu=gpu)
        predicted = [ner_vocab[element] for element in y_pred[0]]
        predicted_labels += predicted
    return predicted_labels

In [98]:
def is_freeze(freeze, model):
    if freeze:
        lr=0.001
        optimiser = torch.optim.Adam(model.parameters(), lr=lr)
    else:
        lr = 2e-5
        optimiser = torch.optim.Adam(model.parameters(), lr=lr)
    return optimiser, lr

In [99]:
def load_parl_corpus(rootdir_parl_corpus, lower=False):
    corpora_normal_cap, corpora_lower, paths = [], [], []
    
    for subdir, dirs, files in os.walk(rootdir_parl_corpus):
        for file in files:
            if "normalized_token_data.json" in file:
                path = (os.path.join(subdir, file))
                paths.append(path)

    for corpus, path in enumerate(paths):
        corpus = ParlamentaryCorpus(path)
        corpus = corpus.load_data()
        corpora_normal_cap.append(corpus)
        for k, v in corpus.items():
            if v == []:
                print(k)
                print(path)

    if lower==True:
        for corpus, path in enumerate(paths):
            corpus = ParlamentaryCorpus(path)
            corpus = corpus.load_data(lower=True)
            corpora_lower.append(corpus)


    return corpora_normal_cap, corpora_lower

In [100]:
def parl_sentences_only(dictionary_corpus):
    corp = []
    for diction in dictionary_corpus:
        corp += list(diction.values())
    return corp

In [101]:
""" Provides dummy y-data so CoNLLDataset class can be used """
""" Not elegant but works perfectly fine :)                 """

def make_dummy_y(test_corpus):
    y_dummy_data = []
    for sentence in test_corpus:
        dummy = []
        dummy.extend(len(sentence)*"O")
        y_dummy_data.append(dummy)
    return y_dummy_data

In [102]:
def binary_parl_from_iob2(array):
    temp_lst = []
    for tag in array:
        if tag == "O":
            temp_lst.append(0)
        else:
            temp_lst.append(1)
    return temp_lst

In [103]:
def gold_labels_unlabled_data(unlabeled):
    length = 0
    gold_labels = []
    for sentences in unlabeled:
        label = []
        length += len(sentences)
        for token in sentences:
            if token[0].isupper() == True: # Checks if token has captial letter
                label.append(1)
                
            else:
                label.append(0) # Adds 0 if token has lower 
        gold_labels.append(label)
    
    return gold_labels

In [104]:
def transform_to_binary(set):
    binary_labels = []
    for label in set:
        if label == "O":
            binary_labels.append(0)
        else:
            binary_labels.append(1)
    return binary_labels

In [105]:
"""Making list of lists preds, same len as NorNE gold labels"""
""" test_dataset_ner_labels should be test_dataset.ner_labels """

def split_list_preds(preds, test_dataset_ner_labels):
    split_list_preds = []
    start = 0

    for sublist in test_dataset_ner_labels:
        end = start + len(sublist)
        split_list_preds.append(preds[start:end])
        start = end
    return split_list_preds

In [106]:
# Assumes that it is the same vocab in dev and test
from itertools import chain
def get_labels(train_split, add_UNK=True):
    label_vocab = [set(y["ner_tags"]) for y in train_split]
    label_vocab = list(chain(*[d for d in label_vocab]))
    label_vocab = list(set(sorted(label_vocab)))
    
    if add_UNK==True:
        label_vocab.append("@UNK")

    tmp1, tmp2 = [], []
    for label in label_vocab:
        if "-" in label:
            tmp1.append(label)
        else:
            tmp2.append(label)
    tmp1.sort(key=lambda x: (x.split("-")[1], x.split("-")[0]))
    tmp1 = [x for x in tmp1]
    tmp1.extend(tmp2)
    label_vocab = tmp1
    return label_vocab

In [107]:
""" Norne Data loading stuff"""

path = "/Users/aarnes/Library/CloudStorage/OneDrive-UniversityofBergen/Dokumenter/MSTR-PY/Random tests/annotate_sentences_files_parl/all_conllu/{0}.conllu"
file_list = ["no_bokmaal-ud-dev", "no_bokmaal-ud-test", "no_bokmaal-ud-train", "no_nynorsk-ud-dev", "no_nynorsk-ud-test", "no_nynorsk-ud-train"]


test_split_no = convert_to_list_dict(path, file_list[1])
train_split_no = convert_to_list_dict(path, file_list[2])


test_split_ny = convert_to_list_dict(path, file_list[4])
train_split_ny = convert_to_list_dict(path, file_list[5])



train_split = train_split_no + train_split_ny
ner_vocab = get_labels(train_split)


print("Combining test set..")
test_split = test_split_no + test_split_ny
print("Success!")

x_test_tokens = [x["tokens"] for x in test_split]
y_test_labels = [y["ner_tags"] for y in test_split]
test_dataset = CoNLLDataset(x_test_tokens, y_test_labels, ner_vocab)

test_loader = DataLoader(
        test_dataset, batch_size=1, shuffle=False, collate_fn=partial(collate_fn, gpu=gpu))

""" Gold labels for NorNE test_dataset"""
gold_labels = []
for sentence_labels in test_dataset.ner_labels:
    for label in sentence_labels:
        gold_labels.append(label)

Converting no_bokmaal-ud-test to list of dictionaries
     1939 elements converted..
Converting no_bokmaal-ud-train to list of dictionaries
     15696 elements converted..
Converting no_nynorsk-ud-test to list of dictionaries
     1511 elements converted..
Converting no_nynorsk-ud-train to list of dictionaries
     14174 elements converted..
Combining test set..
Success!


In [108]:
""" Parliamentary corpus data loading stuff"""

rootdir=r"/Users/aarnes/Documents/GitHub/MA_Peter-R-ysland-Aarnes/parl_corpus_full"
corpora_normal_cap, corpora_lower = load_parl_corpus(rootdir, lower=True)
gold_corp = parl_sentences_only(corpora_normal_cap)
test_parl = parl_sentences_only(corpora_lower)

_gold_labels_parl = gold_labels_unlabled_data(gold_corp)

dummy_y_parl = make_dummy_y(test_parl)
parl_lower_corpus = CoNLLDataset(test_parl, dummy_y_parl, ner_vocab)

parl_loader = DataLoader(
        parl_lower_corpus, batch_size=1, shuffle=False, collate_fn=partial(collate_fn, gpu=gpu))

gold_labels_parl = [item for sublist  in _gold_labels_parl for item in sublist]

In [109]:
random_sentences_number_first_list = [15516,
 35867,
 4098,
 35691,
 30349,
 51855,
 41168,
 55913,
 20602,
 17634,
 3459,
 28645,
 47616,
 44675,
 41509,
 23383,
 29885,
 3768,
 3407,
 42391,
 49739,
 13990,
 27924,
 46773,
 4588,
 52669,
 14987,
 50668,
 15770,
 55929,
 45534,
 52134,
 21369,
 30458,
 18766,
 15745,
 21375,
 35099,
 29015,
 1647,
 26161,
 9417,
 29623,
 35631,
 17871,
 6668,
 56582,
 18865,
 19871,
 19663,
 48089,
 16483,
 27978,
 43277,
 37444,
 16326,
 57881,
 38439,
 19526,
 37972,
 36700,
 12681,
 20807,
 10321,
 759,
 21558,
 41279,
 15268,
 10943,
 17395,
 49074,
 12391,
 50319,
 45247,
 7540,
 21236,
 20084,
 37092,
 21904,
 26111,
 24916,
 14647,
 47516,
 3944,
 52878,
 35787,
 39016,
 32064,
 48844,
 12676,
 7245,
 35081,
 56289,
 50570,
 40816,
 29578,
 58584,
 56622,
 23381,
 12325]

In [178]:
from random import randint

random_sentence_numbers = []
for sen in range(0, 1):
    r = randint(0, len(gold_corp))
    if r not in random_sentence_numbers:
        random_sentence_numbers.append(r)

In [179]:
random_sentence_numbers

[35523]

In [180]:
tohundre_mer_nummer = [45193,
 23709,
 11240,
 14310,
 6316,
 31020,
 558,
 22315,
 17907,
 10647,
 58011,
 7891,
 6236,
 41963,
 13035,
 994,
 42979,
 31883,
 6426,
 12543,
 10588,
 5958,
 26005,
 5879,
 23795,
 37913,
 10632,
 22800,
 8548,
 57678,
 29583,
 4783,
 13724,
 14204,
 33813,
 48356,
 39557,
 44661,
 50100,
 21860,
 11191,
 58060,
 35762,
 37512,
 19690,
 10025,
 6951,
 32806,
 52842,
 40553,
 4375,
 30527,
 4907,
 52189,
 53941,
 12811,
 27278,
 39350,
 46410,
 24437,
 36304,
 9358,
 22249,
 3337,
 15554,
 11055,
 9316,
 22913,
 44816,
 41010,
 45319,
 54350,
 17896,
 1111,
 54466,
 25743,
 29489,
 54364,
 22364,
 47872,
 20154,
 36720,
 57119,
 46320,
 24869,
 54826,
 38514,
 56646,
 19574,
 38830,
 978,
 15996,
 26085,
 11102,
 12138,
 22504,
 15823,
 46094,
 40680,
 21033,
 15641,
 23328,
 43515,
 52708,
 45150,
 12172,
 19295,
 29460,
 21487,
 24253,
 53761,
 42385,
 24243,
 35875,
 20669,
 41326,
 48274,
 52113,
 631,
 18880,
 24099,
 32887,
 5264,
 52173,
 53831,
 33094,
 15154,
 49301,
 57154,
 22299,
 26574,
 39028,
 35937,
 40325,
 24197,
 2505,
 27495,
 50492,
 38859,
 41926,
 34258,
 57944,
 5598,
 54285,
 39280,
 57968,
 3736,
 19217,
 21190,
 14819,
 40257,
 32626,
 38656,
 40331,
 40176,
 8611,
 43059,
 5895,
 2917,
 37386,
 38378,
 11201,
 54972,
 18575,
 6418,
 42845,
 3931,
 24303,
 14368,
 28922,
 21862,
 30376,
 9452,
 46629,
 19001,
 14463,
 10118,
 45420,
 11483,
 8390,
 32336,
 2589,
 33430,
 7317,
 40265,
 46189,
 25362,
 51401,
 26740,
 1299,
 40565,
 9284,
 26639,
 34838,
 39220,
 4925,
 42948,
 53682,
 18611,
 38180]

In [181]:
random_sentence_numbers

[35523]

In [182]:
sentences = []
for i in random_sentence_numbers:
    sentences.append(gold_corp[i])
sample_sentences = sentences

sentences_lower = []
for i in random_sentence_numbers:
    sentences_lower.append(test_parl[i])
sample_sentences_lower = sentences_lower

In [183]:
random_sentence_numbers in tohundre_mer_nummer

False

In [170]:
import csv
afwawfafw crash

with open('annotate_this.csv', 'w', newline='', encoding="UTF-8") as f:
    wr = csv.writer(f, delimiter=",")
    for sentence in sentences:
        wr.writerow("")
        for word in sentence:
            if not word[0].isupper():
                wr.writerow([word, "O"])
            else:
                wr.writerow([word])

In [263]:
import csv

with open('200_annoterte_setninger.csv', newline='') as csvfile:
    csvreader = csv.reader(csvfile, delimiter=',')
    left_col = []
    right_col = []
    for row in csvreader:
        left_col.append(row[0])
        right_col.append(row[1])


In [264]:
print(len(right_col))
print(len(left_col))

4067
4067


In [265]:

def create_sublists(lst):
    sublists = []
    sublist = []
    for element in lst:
        if element == '':
            if sublist:
                sublists.append(sublist)
                sublist = []
        else:
            sublist.append(element)

    if sublist:
        sublists.append(sublist)

    return sublists


In [266]:
y200 = create_sublists(right_col)
x200 = create_sublists(left_col)

0


200

In [260]:
import csv

with open("annotated sents.csv", "r") as f_input:
    reader = csv.reader(f_input)
    with open("parl_annotation_comma_fixed.csv", "w") as f_output:
        writer = csv.writer(f_output)
        for row in reader:
            row = [item for item in row if item or item == ","]
            if row:
                writer.writerow(row)

In [261]:
x, y = [], []

with open("parl_annotation_comma_fixed.csv", "r") as f_input:
    reader = csv.reader(f_input, delimiter=",")
    for i, row in enumerate(reader):
        if i%2:
            y.append(row)
        else:
            x.append(row)

In [277]:
bert_model = Bert(ner_vocab, "ltgoslo/norbert2", freeze=False).to("cpu")
tokenizer = BertTokenizer.from_pretrained("ltgoslo/norbert2", do_basic_tokenize=False)
bert_model.load_state_dict(torch.load(r"/Users/aarnes/Downloads/norbert2_model_adam_lr_2e-05__maxEpochs_50__seed_42", map_location="cpu"))

OSError: Can't load config for 'ltgoslo/norbert2'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'ltgoslo/norbert2' is the correct path to a directory containing a config.json file

In [271]:
x.extend(x200)
y.extend(y200)

In [None]:

x = [word for sublist in sentences for word in sublist]
print(len(x))

In [270]:
len(x)

300

In [273]:
def lower_list_of_lists(lists):
    return [[word.lower() for word in sublist] for sublist in lists]


In [274]:
lower_x=lower_list_of_lists(x)

In [276]:
xt = CoNLLDataset(lower_x, y, ner_vocab)
fullset_loader = DataLoader(xt, batch_size=1, shuffle=False, collate_fn=partial(collate_fn))
preds = predict_test(fullset_loader, ner_vocab, tokenizer, bert_model)



NameError: name 'tokenizer' is not defined

In [None]:
asd = split_list_preds(preds, lower_x)
for true, pred, in zip(y, asd):
    print(true)
    print(pred)
    print()

['O', 'B-GPE_LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-GPE_LOC']
['O', 'B-DRV', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DRV']

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PROD', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-GPE_LOC', 'O']

['O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O']

['O', 'O', 'O', 'O', 'B-ORG', 'og', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'I-MISC', 'O', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

['O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 

In [None]:
print(asd[0])
y[0]

12
['O', 'B-GPE_LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-GPE_LOC']


In [None]:
asd = [word for sublist in lower_x for word in sublist]
dsa = [word for sublist in y for word in sublist]
print(len(asd))
print(len(dsa))

1616
1616


In [None]:
for sen1, sen2 in zip(lower_x, y):
    if len(sen1)!=len(sen2):
        print(sen1)
        print(sen2)
    if sen1==[] or sen1==[]:
        print("fuck") 

In [None]:
with open("sentences_to_annotate.csv", "w", newline="") as f:
    writer = csv.writer(f)
    for row in sentences:
        writer.writerow(row)
        f.write("\n\n")

In [None]:
_gold_sample_sentences = gold_labels_unlabled_data(sample_sentences)
gold_sentences_samples = [item for sublist  in _gold_sample_sentences for item in sublist]
gold_sentences_samples_splits = split_list_preds(gold_sentences_samples, sample_sentences)


sample_sentence_lower_dummy = make_dummy_y(sample_sentences_lower)
sample_sentence_lower_corpus = CoNLLDataset(sample_sentences_lower, sample_sentence_lower_dummy, ner_vocab)


sample_loader = DataLoader(sample_sentence_lower_corpus, batch_size=1, shuffle=False, collate_fn=partial(collate_fn, gpu=gpu))
sample_sentence_preds = predict_test(sample_loader, ner_vocab, tokenizer, bert_model)
sample_sentence_preds_splits = split_list_preds(sample_sentence_preds, sample_sentence_lower_dummy)
sample_sentence_preds_BINARY = binary_parl_from_iob2(sample_sentence_preds)
sample_sentence_preds_BINARY_splits = split_list_preds(sample_sentence_preds_BINARY, sample_sentence_lower_dummy)


NameError: name 'sample_sentences' is not defined

In [None]:
false_negatives = { 

}

false_positives = {

}

In [None]:


for i, (gold, pred) in enumerate(zip(gold_sentences_samples_splits, sample_sentence_preds_BINARY_splits)):
    if gold != pred:
        for word_index, (word_g, word_p) in enumerate(zip(gold,pred)):
            if word_g != word_p:
                if word_p == 0:
                        if sample_sentences_lower[i][word_index] not in false_negatives.keys():
                            sentence_list = [sample_sentences[i]]
                            false_negatives[sample_sentences_lower[i][word_index]] = {
                                "Count": 1,
                                "Sentence": sentence_list
                                }
                        
                        elif sample_sentences_lower[i][word_index] in false_negatives.keys():
                            count = false_negatives[sample_sentences_lower[i][word_index]]["Count"]
                            count += 1
                            false_negatives[sample_sentences_lower[i][word_index]]["Count"] = count
                            false_negatives[sample_sentences_lower[i][word_index]]["Sentence"].append(sample_sentences[i])



                elif word_p == 1:
                        if sample_sentences_lower[i][word_index] not in false_positives.keys():
                            sentence_list = [sample_sentences[i]]
                            false_positives[sample_sentences_lower[i][word_index]] = {
                                "Count": 1,
                                "Sentence": sentence_list
                                }
                        
                        elif sample_sentences_lower[i][word_index] in false_positives.keys():
                            count = false_positives[sample_sentences_lower[i][word_index]]["Count"]
                            count += 1
                            false_positives[sample_sentences_lower[i][word_index]]["Count"] = count
                            false_positives[sample_sentences_lower[i][word_index]]["Sentence"].append(sample_sentences[i])




In [None]:
"sykehuspartners" in false_negatives.keys()


In [None]:
fn = {
    "ord": {
        "count": 1,
        "sen": [ ]
    },
}

In [None]:
cn = fn["ord"]["count"] 
cn +=1
fn["ord"]["count"] = cn

In [None]:
fn

In [None]:
# relative paths only works half of the time on my windows machine :(
model_types = {
    "bert-base-multilingual-cased": r"C:\Users\Aarne\OneDrive - University of Bergen\Dokumenter\MSTR-PY\TRAIN ALL BERTS\trained\mBERT",
    "ltgoslo/norbert": r"C:\Users\Aarne\OneDrive - University of Bergen\Dokumenter\MSTR-PY\TRAIN ALL BERTS\trained\norBERT",
    "ltgoslo/norbert2": r"C:\Users\Aarne\OneDrive - University of Bergen\Dokumenter\MSTR-PY\TRAIN ALL BERTS\trained\NB-BERT",
    "NbAiLab/nb-bert-base": r"C:\Users\Aarne\Desktop\Ferdig_code_folder\BERT models\NB-BERT\trained models nb-bert",
    "saattrupdan/nbailab-base-ner-scandi": r"C:\Users\Aarne\Desktop\Ferdig_code_folder\BERT models\scandi-bert\trained models scandi_bert"
}

for name, type in model_types.items():
    if "/" in name:
        model_type_name = name.rsplit('/',1)[1]
        print(model_type_name)
    else:
        model_type_name = name
        print(model_type_name)
