In [None]:
import conllu
import tqdm
import torch
import json
from sklearn.metrics import confusion_matrix, classification_report
from torch.utils.data import DataLoader
import torch
from torch.nn import functional
from functools import partial
from transformers import BertModel
from torch import nn
from transformers import BertTokenizer
from sklearn.metrics import confusion_matrix, accuracy_score,f1_score, precision_score, recall_score, classification_report
from utils.get_conllu_dataset import CoNLLDataset
from utils.get_parl_corpus_token_data import ParlamentaryCorpus

Much of this notebook is adapted from the NorBERT benchmarking experiments: https://github.com/ltgoslo/NorBERT/tree/main/benchmarking/experiments

In [None]:
""" Could tidy up this one to just use device instead of calling gpu=true etc."""
# `gpu` is kept as a plain boolean because later cells pass it on as a flag
# (collate_fn, predict); `device` is what models and batches are moved to.
gpu = torch.cuda.is_available()
if gpu:
    print("Using GPU")

# Fall back to the CPU instead of requesting CUDA unconditionally: creating a
# "cuda" device on a machine without CUDA only fails later, when the model or
# a batch is actually moved to it.
device = torch.device("cuda" if gpu else "cpu")
print(device)

In [None]:
## Conllu stuff
def filter_tags(x):
    """Return the NER tag exactly as given (identity hook for tag filtering)."""
    tag = x
    return tag

def convert_to_list_dict(path, file):
    """Parse one CoNLL-U file into a list of per-sentence dictionaries.

    Args:
        path: Format string with one positional placeholder for the file name,
            e.g. ``"all_conllu/{0}.conllu"``.
        file: Value substituted into ``path``; also shown in the progress message.

    Returns:
        A list with one dict per sentence, holding the keys ``idx``, ``text``,
        ``tokens``, ``lemmas``, ``pos_tags`` and ``ner_tags``. ``text`` and
        ``tokens`` are lower-cased. A token whose MISC column has no ``name``
        entry is tagged ``"O"``.

    Raises:
        KeyError: If a sentence lacks the ``sent_id`` or ``text`` metadata.
    """
    file_path = path.format(file)
    sentences = []
    with open(file_path, encoding="UTF-8") as infile:
        # parse_incr yields sentences one at a time, so the file does not have
        # to be materialised as a list before it is converted.
        for sent in conllu.parse_incr(infile):
            sentences.append({
                "idx": sent.metadata["sent_id"],
                "text": sent.metadata["text"].lower(),
                "tokens": [token["form"].lower() for token in sent],
                "lemmas": [token["lemma"] for token in sent],
                "pos_tags": [token["upos"] for token in sent],
                # An empty MISC column is parsed as None rather than a dict;
                # treat it like a missing "name" entry instead of crashing.
                "ner_tags": [
                    filter_tags((token["misc"] or {}).get("name", "O"))
                    for token in sent
                ],
            })
    print("Converting {} to list of dictionaries\n     {} elements converted..".format(file, len(sentences)))
    return sentences

In [None]:
class NorBert(nn.Module):
    """BERT encoder followed by a linear layer that scores NER labels.

    Args:
        ner_vocab: Sized collection of NER labels; its length is the number of
            output classes of the linear layer.
        model_path: Name or path handed to ``BertModel.from_pretrained``.
        freeze: If true, gradients are disabled for every BERT parameter, so
            only the linear layer is trained.
    """

    def __init__(self, ner_vocab, model_path=None, freeze=False):
        super().__init__()
        self._bert = BertModel.from_pretrained(model_path)
        hidden_size = self._bert.config.hidden_size
        self._linear = nn.Linear(hidden_size, len(ner_vocab))

        if freeze:
            # Freeze the BERT layers.
            for param in self._bert.parameters():
                param.requires_grad = False

    def forward(self, batch, mask):
        """Score the labels at the selected sub-token positions.

        Args:
            batch: Mapping with ``input_ids`` and ``attention_mask`` tensors.
            mask: Integer index tensor of shape (batch, n_positions), as
                produced by ``build_mask``; row ``b`` lists the positions of
                sentence ``b`` whose hidden states are classified.

        Returns:
            Tensor of shape (batch, n_positions, len(ner_vocab)).
        """
        hidden = self._bert(
            input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]
        ).last_hidden_state

        # Gather hidden[b, mask[b, t]] directly. The former
        # `hidden[:, mask].diagonal().permute(2, 0, 1)` produced the same values
        # but first built a (batch, batch, n_positions, hidden) intermediate.
        positions = torch.as_tensor(mask, device=hidden.device)
        rows = torch.arange(hidden.size(0), device=hidden.device).unsqueeze(1)
        selected = hidden[rows, positions]
        return self._linear(selected)

In [None]:
""" Dynamic padding. Takes the longest sentence in batch and pads other sentences to its length (if im not mistaken)"""
# Function borrowed from https://github.com/ltgoslo/NorBERT/blob/main/benchmarking/experiments/bert_ner.py

def collate_fn(batch, gpu=False):
    """Collate (tokens, labels) pairs into a token list and one label tensor.

    Args:
        batch: Iterable of ``(X, y)`` pairs where ``y`` is a 1-D label tensor.
        gpu: If true, the stacked label tensor is moved to ``"cuda"``.

    Returns:
        ``(x, y)``: ``x`` is the list of the ``X`` items in batch order and
        ``y`` stacks the label tensors, each right-padded with -1 up to the
        length of the longest label tensor in the batch.
    """
    sentences, labels = [], []
    for tokens, tags in batch:
        sentences.append(tokens)
        labels.append(tags)

    target_len = max(tags.size(0) for tags in labels)
    padded = [
        functional.pad(tags, (0, target_len - tags.size(0)), value=-1)
        for tags in labels
    ]
    stacked = torch.stack(padded)
    if gpu:
        stacked = stacked.to("cuda")
    return sentences, stacked

In [None]:
# https://github.com/ltgoslo/NorBERT/blob/main/benchmarking/experiments/bert_ner.py 

def build_mask(tokenizer, ids):
    """Build, per sentence, the indices of the sub-tokens that get a label.

    A position is dropped when its token is one of
    ``tokenizer.all_special_tokens[1:]`` or starts with ``"##"`` (a word-piece
    continuation). The first entry of ``all_special_tokens`` is deliberately
    not dropped (presumably the unknown-word token, which still stands for a
    real word; confirm for the tokenizer in use).

    Args:
        tokenizer: Object providing ``all_special_tokens``,
            ``convert_ids_to_tokens`` and ``pad``.
        ids: Iterable of per-sentence token-id sequences.

    Returns:
        The ``"input_ids"`` entry of ``tokenizer.pad`` applied to the kept
        positions, i.e. the position lists padded to a common length.
    """
    # Looked up once: all_special_tokens was previously re-evaluated (and
    # re-sliced) for every single token of every sentence.
    skipped_specials = set(tokenizer.all_special_tokens[1:])

    kept_positions = []
    for sentence_ids in ids:
        tokens = tokenizer.convert_ids_to_tokens(sentence_ids)
        kept_positions.append([
            position
            for position, token in enumerate(tokens)
            if token not in skipped_specials and not token.startswith("##")
        ])

    return tokenizer.pad({"input_ids": kept_positions}, return_tensors="pt")["input_ids"]

In [None]:
def predict(input_data, tokenizer, model, gpu=False):
    """Predict a label index for every masked position of a batch.

    Args:
        input_data: Batch of pre-split sentences (lists of words).
        tokenizer: Tokenizer used to encode the batch and to build the mask.
        model: Model called as ``model(encoded_batch, mask)`` that returns
            per-position label scores with the labels on the last axis.
        gpu: If true, the encoded batch is moved to ``"cuda"``.

    Returns:
        Tensor of predicted label indices (argmax over the label axis).
    """
    encoded = tokenizer(
        input_data, is_split_into_words=True, return_tensors="pt", padding=True
    )
    if gpu:
        encoded = encoded.to("cuda")
    batch_mask = build_mask(tokenizer, encoded["input_ids"])

    # Only argmax indices are returned, so no gradient can ever flow back from
    # the result; disabling autograd avoids recording a graph for every batch.
    with torch.no_grad():
        scores = model(encoded, batch_mask)
    return scores.argmax(dim=-1)

In [None]:
def train(device, train_loader, dev_loader, tokenizer, model, optimiser, scheduler, criterion, no_ne_index, MAX_EPOCHS, OUTPUT_PATH, PATIENCE):
    """Train ``model`` with per-epoch validation, LR scheduling and early stopping.

    Args:
        device: Device the encoded batches and label tensors are moved to.
        train_loader: Iterable of ``(sentences, labels)`` training batches.
        dev_loader: Iterable of ``(sentences, labels)`` validation batches.
        tokenizer: Tokenizer used to encode sentences and to build the mask.
        model: Model called as ``model(encoded_batch, mask)``.
        optimiser: Optimiser stepped once per training batch.
        scheduler: Scheduler stepped once per epoch with the validation loss.
        criterion: Loss called as ``criterion(scores, labels)`` with the label
            axis of ``scores`` moved to dimension 1.
        no_ne_index: Label index excluded from the accuracy (the non-entity label).
        MAX_EPOCHS: Maximum number of epochs.
        OUTPUT_PATH: File the model state dict is saved to on every new best.
        PATIENCE: Number of consecutive non-improving epochs after which
            training stops.

    Returns:
        ``(loss_history_train, loss_history_dev, val_accuracy, epoch,
        learn_rate_history)``. The histories are dicts keyed ``"epoch: N"``;
        losses are summed over the batches of the epoch. ``epoch`` is the index
        of the last epoch that ran (-1 if none ran).

    Fixes compared with the previous version:
        * the validation-loss history stored the training loss;
        * the scheduler was stepped with the loss of the last validation batch
          only, and every validation batch was run through the model twice
          with autograd enabled;
        * the function depended on a global ``gpu`` flag instead of ``device``;
        * the epoch that triggered early stopping was missing from the histories;
        * with ``PATIENCE <= 0`` a non-improving model was saved as "New best!";
        * an epoch without any entity label in the validation data divided by zero.
    """
    accuracy_lst = []
    loss_history_train, loss_history_dev, val_accuracy, learn_rate_history = {}, {}, {}, {}
    inpatience = 0
    epoch = -1  # keeps the return value defined when MAX_EPOCHS is 0

    for epoch in range(MAX_EPOCHS):
        print("Learn rate: ", end="")
        print(optimiser.param_groups[0]['lr'])

        # TRAIN
        loss_accumulation = 0
        model.train()
        train_iter = tqdm.tqdm(train_loader)
        for x, y in train_iter:
            x = tokenizer(
                x, is_split_into_words=True, return_tensors="pt", padding=True
            ).to(device)
            y = y.to(device)
            batch_mask = build_mask(tokenizer, x["input_ids"])
            optimiser.zero_grad()
            # Move the label axis to dim 1, as the loss expects.
            y_pred = model(x, batch_mask).permute(0, 2, 1)
            loss = criterion(y_pred, y)
            batch_loss = loss.item()
            loss_accumulation += batch_loss
            loss.backward()
            optimiser.step()
            train_iter.set_postfix_str(f"loss: {batch_loss}")

        # EVAL: one forward pass per batch yields both predictions and loss.
        model.eval()
        dev_loss_accumulation = 0
        correct, total = 0, 0
        with torch.no_grad():
            for x, y in tqdm.tqdm(dev_loader):
                x = tokenizer(
                    x, is_split_into_words=True, return_tensors="pt", padding=True
                ).to(device)
                y = y.to(device)
                batch_mask = build_mask(tokenizer, x["input_ids"])
                scores = model(x, batch_mask).permute(0, 2, 1)
                dev_loss_accumulation += criterion(scores, y).item()
                y_pred = scores.argmax(dim=1)
                # Accuracy is measured on entity labels only; -1 marks padding.
                correct += (y_pred == y).logical_and(y != no_ne_index).sum().item()
                total += (y != no_ne_index).logical_and(y != -1).sum().item()

        scheduler.step(dev_loss_accumulation)

        print("Epoch number: {}".format(epoch))
        accuracy = correct / total if total else 0.0
        accuracy_lst.append(accuracy)
        current_acc_best = max(accuracy_lst)
        print(accuracy)
        print(f"List of accuracies: {accuracy_lst}")
        print(f"Best accuracy so far: {current_acc_best}")
        print(f"Validation accuracy = {correct} / {total} = {accuracy}")

        # Record the statistics before the early-stopping check so that the
        # epoch that triggers the stop is part of the histories as well.
        epoch_key = "epoch: {}".format(epoch)
        loss_history_train[epoch_key] = loss_accumulation
        loss_history_dev[epoch_key] = dev_loss_accumulation
        val_accuracy[epoch_key] = accuracy
        learn_rate_history[epoch_key] = optimiser.param_groups[0]['lr']

        if accuracy < current_acc_best:
            print(f"No improvement. Inpatience counter increased..")
            print(f"Patience set at {PATIENCE}")
            inpatience += 1
            print(f"Inpatience counter reached {inpatience}")
            if inpatience >= PATIENCE:
                print(f"Patience tolerance reached. Early stopping at epoch {epoch}!")
                break
        else:
            # Ties with the previous best also count as a new best.
            print("New best!")
            print("Setting inpatience counter to 0")
            inpatience = 0
            torch.save(model.state_dict(), OUTPUT_PATH)

    return loss_history_train, loss_history_dev, val_accuracy, epoch, learn_rate_history

# TODO: add a text file with the loss and other statistics.
# TODO: integrate patience.


In [None]:
def predict_test(test_set, ner_vocab, tokenizer, model):
    """Predict labels for a test loader and return them as one flat list.

    Args:
        test_set: Iterable of ``(sentences, labels)`` batches; the labels are
            ignored here.
        ner_vocab: Sequence mapping a label index to its label string.
        tokenizer: Tokenizer handed on to ``predict``.
        model: Model handed on to ``predict``; switched to eval mode first.

    Returns:
        Flat list of predicted label strings in iteration order.

    NOTE(review): only row 0 of each batch's predictions is used, so this
    presumably expects a loader with batch_size=1; confirm before reuse.
    Reads the module-level ``gpu`` flag.
    """
    model.eval()
    predicted_labels = []
    for x, _ in tqdm.tqdm(test_set):
        first_row = predict(x, tokenizer, model, gpu=gpu)[0]
        predicted_labels.extend([ner_vocab[index] for index in first_row])
    return predicted_labels

In [None]:
def is_freeze(freeze, model):
    """Create the Adam optimiser with the learning rate for the freeze setting.

    Args:
        freeze: Truthy selects a learning rate of 0.001, otherwise 2e-5.
        model: Module whose parameters are handed to the optimiser.

    Returns:
        ``(optimiser, lr)``.
    """
    lr = 0.001 if freeze else 2e-5
    return torch.optim.Adam(model.parameters(), lr=lr), lr

In [None]:
def model_info_json(model_name, loss_history_train, loss_history_dev, val_accuracy, learn_rate_history, MAX_EPOCHS, stop_epoch, lr, seed, PATIENCE):
    """Write the statistics of one training run to ``<model_name>_seed_<seed>.json``.

    The file is created in the current working directory and holds the seed,
    the epoch settings, the learning rate, the patience and the per-epoch
    histories, indented by two spaces. Nothing is returned.
    """
    file_name = "{}_seed_{}.json".format(model_name, seed)

    info = {}
    info["Seed"] = seed
    info["Max epochs set"] = MAX_EPOCHS
    info["Train stop at epoch (begins at 0)"] = stop_epoch
    info["Learning rate"] = lr
    info["Validation accuracy history"] = val_accuracy
    info["Train loss history"] = loss_history_train
    info["Val loss history"] = loss_history_dev
    info["Learn rate history"] = learn_rate_history
    info["Patience set"] = PATIENCE

    serialised = json.dumps(info, indent=2)
    with open(file_name, "w") as out_file:
        out_file.write(serialised)

In [None]:
# Path template and names (without extension) of the six CoNLL-U files:
# three "no_bokmaal" files followed by three "no_nynorsk" files, each group
# ordered dev / test / train.
path = "all_conllu/{0}.conllu"
file_list = [
    "no_bokmaal-ud-dev",
    "no_bokmaal-ud-test",
    "no_bokmaal-ud-train",
    "no_nynorsk-ud-dev",
    "no_nynorsk-ud-test",
    "no_nynorsk-ud-train",
]

# Files are converted in list order, so the progress messages keep their order.
dev_split_no, test_split_no, train_split_no = [
    convert_to_list_dict(path, name) for name in file_list[:3]
]
dev_split_ny, test_split_ny, train_split_ny = [
    convert_to_list_dict(path, name) for name in file_list[3:]
]

print("Combining train, dev and test sets..")
dev_split, test_split, train_split = (
    dev_split_no + dev_split_ny,
    test_split_no + test_split_ny,
    train_split_no + train_split_ny,
)
print("Success!")

In [None]:
# Assumes that it is the same vocab in dev and test
from itertools import chain
def get_labels(train_split, add_UNK=True):
    """Collect the NER label vocabulary of a split in a reproducible order.

    Args:
        train_split: Iterable of sentence dicts with a ``"ner_tags"`` list.
        add_UNK: If true, ``"@UNK"`` is appended as the very last label.

    Returns:
        List of labels: first the labels containing ``"-"`` sorted by the part
        after the first dash and then by the part before it (e.g. ``B-LOC``,
        ``I-LOC``, ``B-PER``), then the remaining labels in alphabetical
        order, then ``"@UNK"`` if requested.
    """
    # Sort AFTER de-duplicating. The previous list(set(sorted(...))) threw the
    # sort away again, so the order of labels without a dash depended on
    # string hashing and could differ between interpreter runs, which would
    # change the label indices a saved model was trained with.
    label_vocab = sorted({tag for sentence in train_split for tag in sentence["ner_tags"]})

    prefixed = [label for label in label_vocab if "-" in label]
    plain = [label for label in label_vocab if "-" not in label]
    prefixed.sort(key=lambda label: (label.split("-")[1], label.split("-")[0]))

    if add_UNK:
        plain.append("@UNK")
    return prefixed + plain

In [None]:
# Label order/index that was in effect when the earlier models were trained:
#
# ner_vocab =     ['B-GPE_LOC', 'I-DRV', 'I-LOC', 'B-PER', 'I-PER', 'B-PROD', 
#                 'I-GPE_ORG', 'B-GPE_ORG', 'B-EVT', 'B-DRV', 'I-PROD', 'B-ORG', 'B-MISC',
#                 'I-MISC', 'I-GPE_LOC', 'B-LOC', 'I-ORG', 'I-EVT', 'O', '@UNK']


# The label vocabulary comes from the training split only and is shared by
# all three datasets.
ner_vocab = get_labels(train_split)

# Each split is unpacked into parallel lists (tokens per sentence, NER tags
# per sentence) and wrapped in a CoNLLDataset.
x_train_tokens = [sentence["tokens"] for sentence in train_split]
y_train_labels = [sentence["ner_tags"] for sentence in train_split]
train_dataset = CoNLLDataset(x_train_tokens, y_train_labels, ner_vocab)

x_dev_tokens = [sentence["tokens"] for sentence in dev_split]
y_dev_labels = [sentence["ner_tags"] for sentence in dev_split]
dev_dataset = CoNLLDataset(x_dev_tokens, y_dev_labels, ner_vocab)

x_test_tokens = [sentence["tokens"] for sentence in test_split]
y_test_labels = [sentence["ner_tags"] for sentence in test_split]
test_dataset = CoNLLDataset(x_test_tokens, y_test_labels, ner_vocab)

In [None]:
# Prepping data for train and predict
# All loaders pad their label tensors through the same collate function.
# Training batches are shuffled; the test loader yields one sentence per batch.
_collate = partial(collate_fn, gpu=gpu)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=_collate)
dev_loader = DataLoader(dev_dataset, batch_size=64, shuffle=False, collate_fn=_collate)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=_collate)

In [None]:
""" Training block """


""" SET BERT MODEL TYPE """
model_path = "ltgoslo/norbert2"

# Tokenizer for the checkpoint above, with basic tokenisation switched off.
tokenizer = BertTokenizer.from_pretrained(model_path, do_basic_tokenize=False)

# Maximum number of training epochs.
MAX_EPOCHS = 1

# Early-stopping patience handed to train().
PATIENCE = 10

# Deterministic cuDNN plus the seeds of the repeated training runs.
torch.backends.cudnn.deterministic = True
seeds = [8, 37, 42, 101, 1024]

# Whether the BERT weights are frozen.
freeze = False

# Whether the training cell actually trains.
do_training = False

# Index of the label that is not a named entity; handed to train().
no_ne_index = train_dataset.ner_indexer["O"]

# Model name used in the file name of the JSON statistics.
model_name = "norbert2"

# Prefix of the path the best model of each run is saved to.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
output_path = r"C:\Users\Aarne\Desktop\Ferdig_code_folder\BERT models\norbert_dyno_labels\best_model"



In [None]:
if do_training == True:
    for seed in seeds:
        # One full training run per seed. The seed also drives the shuffling
        # of the training loader, so averaging over the seeds shows how stable
        # the results are under a different batch order.
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        # Fresh model for every run.
        norbert2 = NorBert(ner_vocab=ner_vocab, model_path=model_path, freeze=freeze).to(device)

        # Loss (label -1 marks padding and is ignored), optimiser and scheduler.
        criterion = nn.CrossEntropyLoss(ignore_index=-1)
        optimiser, lr = is_freeze(freeze, norbert2)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimiser,
            mode='min',
            factor=0.93,
            patience=1,
            threshold=1e-7,
            threshold_mode='abs',
        )

        OUTPUT_PATH = r"{}_adam_lr_{}__maxEpochs_{}__seed_{}".format(output_path, lr, MAX_EPOCHS, seed)

        loss_history_train, loss_history_dev, val_accuracy, stop_epoch, learn_rate_history = train(
            device=device,
            train_loader=train_loader,
            dev_loader=dev_loader,
            tokenizer=tokenizer,
            model=norbert2,
            optimiser=optimiser,
            scheduler=scheduler,
            criterion=criterion,
            no_ne_index=no_ne_index,
            MAX_EPOCHS=MAX_EPOCHS,
            OUTPUT_PATH=OUTPUT_PATH,
            PATIENCE=PATIENCE,
        )

        # Writes the JSON statistics file of this run.
        info = model_info_json(
            model_name=model_name,
            loss_history_train=loss_history_train,
            loss_history_dev=loss_history_dev,
            val_accuracy=val_accuracy,
            learn_rate_history=learn_rate_history,
            MAX_EPOCHS=MAX_EPOCHS,
            stop_epoch=stop_epoch,
            lr=lr,
            seed=seed,
            PATIENCE=PATIENCE,
        )

In [None]:
""" Gold labels for NorNE test_dataset"""

# Flatten the per-sentence label lists of the test set into one list.
gold_labels = []
for sentence_labels in test_dataset.ner_labels:
    gold_labels.extend(sentence_labels)

In [None]:
""" Test model output path """
# Path of the saved state dict to evaluate; fill this in before running.
MODEL_LOAD_PATH = r""

bertbert = NorBert(ner_vocab, model_path, freeze).to(device)
# map_location remaps the stored tensors onto the active device, so a
# checkpoint written on a GPU machine also loads on a CPU-only one.
bertbert.load_state_dict(torch.load(MODEL_LOAD_PATH, map_location=device))

In [None]:
# ner_vocab was built from the training data.
preds = predict_test(
    test_set=test_loader,
    ner_vocab=ner_vocab,
    tokenizer=tokenizer,
    model=bertbert,
)

In [None]:
""" Uses index [:-2] so it will not use "O" or "@UNK" """

# The last two vocabulary entries are left out of the report.
report_labels = ner_vocab[:-2]
report = classification_report(gold_labels, preds, labels=report_labels)
print(report)
