In [None]:
import pandas as pd
import importlib
import utilities as utils

# Name of the configuration profile; selects ./config/<profile>.csv
activeMode = "prod"

# Pick up edits made to the utilities module since it was first imported
importlib.reload(utils)

# All columns are read as strings from a ';'-separated file
configuration_csv = pd.read_csv(
    f"./config/{activeMode}.csv",
    dtype=str,
    sep=";",
)
config = utils.configureParameters(configuration_csv)

# Echo the active settings, one per line; attributes are looked up lazily so
# the output order matches the table below.
for _caption, _attribute in (
    ("Task", "task"),
    ("Model path", "saved_model_path"),
    ("Data path", "data_path"),
    ("Tokenizer", "tokenizer"),
    ("Batch size", "batch_size"),
    ("Epochs", "epochs"),
    ("Learning rate", "learning_rate"),
    ("Sequence length", "sequence_length"),
    ("Training", "train_model"),
    ("Num Threads", "num_threads"),
    ("Num Sentences", "num_sentences"),
):
    print(f"{_caption}: {getattr(config, _attribute)}")

In [None]:
import os

# Cap the thread pools of the numeric backends through their environment
# variables; every variable receives the same configured value.
num_threads = str(config.num_threads)
for _thread_variable in (
    "OMP_NUM_THREADS",
    "OPENBLAS_NUM_THREADS",
    "MKL_NUM_THREADS",
    "VECLIB_MAXIMUM_THREADS",
    "NUMEXPR_NUM_THREADS",
):
    os.environ[_thread_variable] = num_threads

# Hide all GPUs so that only the CPU is used
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [None]:
from transformers import BertTokenizer, BertForMaskedLM, BertForTokenClassification, BertConfig, BertModel
import torch
import torch.nn as nn
import torch.nn.functional as F
#Import SummaryWriter for Tensorboard logging
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import (DataLoader, TensorDataset)
# Load Pytorch Geometric
from torch_geometric.data import Data
import torch_geometric.data as tg_data
import torch_geometric.utils as tg_utils
import torch_geometric.nn as tg_nn
import evaluate
# Evaluation metrics for NER task
from seqeval.metrics import classification_report
# Support for IOBES style NER labels
from seqeval.scheme import IOBES
import numpy as np
# Progress bar
from tqdm import tqdm
# Easy file reading
import glob
import random
import pickle
import matplotlib.pyplot as plt
import math

In [None]:
# Report the id of this process and of its process group.
PID = os.getpid()
PGID = os.getpgid(PID)
print(f"PID: {PID}, PGID: {PGID}")

In [None]:
# Limit no. of threads used by Pytorch.
# torch.set_num_threads is a function and has to be called; assigning to it
# only replaces the function object and leaves the thread count untouched.
torch.set_num_threads(int(num_threads))
# Seed every random number generator used in this notebook
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

In [None]:
# NER tag vocabulary. The position of a tag in this list is its label id
# (createNERInputFeatures and evaluateModel enumerate the list from 0), so the
# order must not change: 'X' is id 0, which the model losses ignore via
# CrossEntropyLoss(ignore_index=0).
ner_tags_list = ['X','O','<unk>', 'B-CARDINAL', 'E-CARDINAL', 'S-PERSON', 'S-CARDINAL', 'S-PRODUCT', 'B-PRODUCT', 'I-PRODUCT', 'E-PRODUCT', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'E-WORK_OF_ART', 'B-PERSON', 'E-PERSON', 'S-GPE', 'B-DATE', 'I-DATE', 'E-DATE', 'S-ORDINAL', 'S-LANGUAGE', 'I-PERSON', 'S-EVENT', 'S-DATE', 'B-QUANTITY', 'E-QUANTITY', 'S-TIME', 'B-TIME', 'I-TIME', 'E-TIME', 'B-GPE', 'E-GPE', 'S-ORG', 'I-GPE', 'S-NORP', 'B-FAC', 'I-FAC', 'E-FAC', 'B-NORP', 'E-NORP', 'S-PERCENT', 'B-ORG', 'E-ORG', 'B-LANGUAGE', 'E-LANGUAGE', 'I-CARDINAL', 'I-ORG', 'S-WORK_OF_ART', 'I-QUANTITY', 'B-MONEY', 'I-MONEY', 'E-MONEY', 'B-LOC', 'E-LOC', 'I-LOC', 'B-PERCENT', 'I-PERCENT', 'E-PERCENT', 'S-LOC', 'S-FAC', 'B-EVENT', 'E-EVENT', 'I-EVENT', 'S-MONEY', 'B-LAW', 'I-LAW', 'E-LAW', 'I-NORP', 'I-LANGUAGE', 'S-LAW', 'S-QUANTITY', 'B-ORDINAL', 'I-ORDINAL', 'E-ORDINAL', '<START>', '<STOP>', "[CLS]", "[SEP]"]
# Size of the classification layer
num_labels = len(ner_tags_list)

In [None]:
# Glob patterns for the data files. Each pattern is the configured data path
# followed by a '**/' wildcard part; the loaders expand them with
# glob.iglob(..., recursive=True).
_data_root = config.data_path

# Plain sentence files
filepath_train_data = _data_root + '**/*-train.txt'
filepath_validation_data = _data_root + '**/*-dev.txt'
filepath_test_data = _data_root + '**/*-test.txt'

# Pickled syntax tree files
filepath_train_syntrees = _data_root + '**/*-train.syntree'
filepath_validation_syntrees = _data_root + '**/*-dev.syntree'
filepath_test_syntrees = _data_root + '**/*-test.syntree'

# NER label files
filepath_train_ner_labels = _data_root + '**/*-train-orig.ner'
filepath_validation_ner_labels = _data_root + '**/*-dev-orig.ner'
filepath_test_ner_labels = _data_root + '**/*-test-orig.ner'

In [None]:
class BertForNer(BertForTokenClassification):
    """
    BERT token classifier for NER, adapted from Huggingface BertForTokenClassification.

    Before classification, the encoder outputs of the positions marked in
    ``valid_ids`` are moved to the front of each sequence, so that logit ``j``
    belongs to the ``j``-th valid position. The remaining rows are zero.
    """

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,valid_ids=None,attention_mask_label=None):
        """
        Classify the valid positions of each sequence.

        :param input_ids: token ids, indexed as [batch, position]
        :param token_type_ids: segment ids handed to the BERT encoder
        :param attention_mask: attention mask handed to the BERT encoder
        :param labels: target label ids; when given, a loss is computed
        :param valid_ids: 1 marks a position whose encoder output is kept;
            ``None`` keeps every position in place
        :param attention_mask_label: 1 marks label positions that take part in the loss
        :return: ``(loss, logits)`` when ``labels`` is given, otherwise ``logits``
        """
        # Pass the masks by keyword: transformers' BertModel.forward takes
        # attention_mask before token_type_ids, so positional passing in the
        # order used by this method's own signature would swap the two.
        sequence_output = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            head_mask=None,
        )[0]
        batch_size, max_len, feat_dim = sequence_output.shape

        if valid_ids is None:
            # No validity information: keep every position where it is
            valid_output = sequence_output
        else:
            # Allocate on the same device/dtype as the encoder output
            valid_output = torch.zeros_like(sequence_output)
            # Ignore non-valid tokens, e.g. subtokens of words: copy the valid
            # rows, in order, to the front of the sequence
            for batch_idx in range(batch_size):
                kept_rows = sequence_output[batch_idx][valid_ids[batch_idx] == 1]
                valid_output[batch_idx, :kept_rows.size(0)] = kept_rows

        sequence_output = self.dropout(valid_output)
        logits = self.classifier(sequence_output)

        if labels is not None:
            # Label id 0 does not contribute to the loss
            loss_fct = nn.CrossEntropyLoss(ignore_index=0)
            # Only keep active parts of the loss
            if attention_mask_label is not None:
                active_loss = attention_mask_label.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)[active_loss]
                active_labels = labels.view(-1)[active_loss]
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return loss, logits
        else:
            return logits

In [None]:
class SynGNN(nn.Module):
    """
    SynGNN Pytorch module
    based on Pytorch TransformerEncoderLayer implementing the architecture in paper “Attention Is All You Need”.

    The self-attention sublayer is replaced by a graph attention layer
    (GATv2Conv); it is followed by the usual feed-forward sublayer. Both
    sublayers have a residual connection followed by layer normalisation.
    """
    def __init__(self, dim_in, dim_hdn, dim_out, num_heads, dim_feedforward=2048, dropout=0.1, activation="relu"):
        """
        :param dim_in: input dimension
        :param dim_hdn: hidden nodes dimension
        :param dim_out: output dimension
        :param num_heads: number of graph attention heads
        :param dim_feedforward: width of the feed-forward sublayer
        :param dropout: dropout probability used in both sublayers
        :param activation: "relu" or "gelu"
        :raises ValueError: if the three dimensions differ (the residual
            connections add tensors of these sizes to each other)
        :raises RuntimeError: if the activation name is not supported
        """
        # SynGNN derives from nn.Module, so the plain super() call is the
        # right one; naming nn.TransformerEncoderLayer here raises TypeError.
        super().__init__()
        # x + attention output needs dim_in == dim_hdn, and
        # src + feed-forward output needs dim_hdn == dim_out.
        if not (dim_in == dim_hdn == dim_out):
            raise ValueError(
                "SynGNN uses residual connections and therefore needs "
                "dim_in == dim_hdn == dim_out, got {}, {}, {}".format(dim_in, dim_hdn, dim_out)
            )
        # Graph attention sublayer. concat=False averages the heads so the
        # output width is dim_hdn, which is what norm1 expects.
        self.graph_attn = tg_nn.GATv2Conv(dim_in, dim_hdn, heads=num_heads, concat=False)
        self.linear1 = nn.Linear(dim_hdn, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, dim_out)
        self.norm0 = tg_nn.LayerNorm(dim_in)
        self.norm1 = nn.LayerNorm(dim_hdn)
        self.norm2 = nn.LayerNorm(dim_hdn)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = self._get_activation_fn(activation)

    def __setstate__(self, state):
        # Older pickled states may lack the activation; default to ReLU
        if 'activation' not in state:
            state['activation'] = F.relu
        super().__setstate__(state)

    def forward(self, x, edge_index, batch):
        r"""Pass the input through the encoder layer.
        Args:
            x: node features
            edge_index: graph edges
            batch: current batch
        Returns:
            tuple of the updated node features and the attention information
            returned by GATv2Conv for ``return_attention_weights=True``
        """
        # Graph attention sublayer
        x_norm = self.norm0(x, batch)
        # The attention weights are only returned when explicitly requested
        src2, att_weight = self.graph_attn(x_norm, edge_index, return_attention_weights=True)
        # Residual connection from the layer input
        # NOTE(review): the residual starts from x rather than x_norm - confirm
        src = x + self.dropout1(src2)
        src = self.norm1(src)

        # Feed-Forward-Network sublayer
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src, att_weight

    @staticmethod
    def _get_activation_fn(activation):
        """Map an activation name ("relu" or "gelu") to its torch function."""
        if activation == "relu":
            return F.relu
        elif activation == "gelu":
            return F.gelu

        raise RuntimeError("activation should be relu/gelu, not {}".format(activation))

In [None]:
class SynBertForNer(BertForTokenClassification):
    """
    BERT token classifier for NER intended to be combined with a syntactic
    graph network; the graph part is not wired in yet (see the placeholders).
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        # self.syngnn = SynGNN()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,valid_ids=None,attention_mask_label=None):
        """
        Classify the valid positions of each sequence.

        :param input_ids: token ids, indexed as [batch, position]
        :param token_type_ids: segment ids handed to the BERT encoder
        :param attention_mask: attention mask handed to the BERT encoder
        :param labels: target label ids; when given, a loss is computed
        :param valid_ids: 1 marks a position whose encoder output is kept;
            ``None`` keeps every position in place
        :param attention_mask_label: 1 marks label positions that take part in the loss
        :return: the loss when ``labels`` is given, otherwise the logits
        """
        # Pass the masks by keyword: transformers' BertModel.forward takes
        # attention_mask before token_type_ids, so positional passing in the
        # order used by this method's own signature would swap the two.
        sequence_output = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            head_mask=None,
        )[0]
        batch_size, max_len, feat_dim = sequence_output.shape

        if valid_ids is None:
            # No validity information: keep every position where it is
            valid_output = sequence_output
        else:
            # Allocate on the same device/dtype as the encoder output
            valid_output = torch.zeros_like(sequence_output)
            # Calculate sequence output: ignore non-valid tokens, e.g. subtokens
            # of words, by copying the valid rows to the front of the sequence
            for batch_idx in range(batch_size):
                kept_rows = sequence_output[batch_idx][valid_ids[batch_idx] == 1]
                valid_output[batch_idx, :kept_rows.size(0)] = kept_rows

        # Pipe Bert embeddings into syntactic GAN

        sequence_output = self.dropout(valid_output)
        logits = self.classifier(sequence_output)

        if labels is not None:
            # Label id 0 does not contribute to the loss
            loss_fct = nn.CrossEntropyLoss(ignore_index=0)
            # Only keep active parts of the loss
            if attention_mask_label is not None:
                active_loss = attention_mask_label.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)[active_loss]
                active_labels = labels.view(-1)[active_loss]
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            # NOTE(review): only the loss is returned here; code that unpacks
            # a (loss, logits) pair cannot use this class as is - confirm intent.
            return loss
        else:
            return logits

In [None]:
# Load the tokenizer and the model that matches the configured task
tokenizer = BertTokenizer.from_pretrained(config.tokenizer)
if(config.task == 'mlm'):
    model = BertForMaskedLM.from_pretrained(config.saved_model_path)
elif(config.task == 'ner'):
    # NOTE(review): the tokenizer object is passed into the model config as an
    # extra keyword - confirm this is intended.
    BERTconfig = BertConfig.from_pretrained(config.saved_model_path, num_labels=num_labels, tokenizer = tokenizer)
    model = BertForNer.from_pretrained(config.saved_model_path, from_tf = False, config = BERTconfig)
else:
    # Fail here with a clear message instead of leaving `model` undefined
    raise ValueError(f"Unsupported task {config.task!r}: expected 'mlm' or 'ner'")

In [None]:
# All computation runs on the CPU
device =  torch.device('cpu')
# Move model to device
model.to(device)

In [None]:
class InputFeatures(object):
    """Container holding the feature lists of one example."""

    def __init__(self, input_ids, input_mask, segment_ids, label_ids, valid_ids=None, label_mask=None):
        """Store the given values unchanged as attributes of the same name."""
        # Model inputs
        self.input_ids, self.input_mask, self.segment_ids = input_ids, input_mask, segment_ids
        # Label information; the last two are optional
        self.label_ids, self.valid_ids, self.label_mask = label_ids, valid_ids, label_mask

In [None]:
def createMaskedInputs(inputs, mask_prob=0.15, mask_token_id=103, special_token_ids=(101, 102, 0)):
    """
    creates masked input embeddings and labels from tokenized text

    The unmasked token ids are copied to ``inputs['labels']``; afterwards
    randomly chosen positions of ``inputs.input_ids`` are overwritten in place
    with the mask token id.

    :param inputs: tokenized text
    :param mask_prob: probability with which a non-special token is masked
    :param mask_token_id: id written at masked positions (default 103)
    :param special_token_ids: ids that are never masked (defaults 101, 102, 0)
    :return: masked input embeddings and new column labels
    """
    token_ids = inputs.input_ids
    # Clone input ids (tokens) to create labels
    inputs['labels'] = token_ids.detach().clone()
    # create random array of floats with equal dimensions to input_ids tensor
    rand = torch.rand(token_ids.shape, device=token_ids.device)
    # Candidate positions: drawn below the masking rate ...
    mask_arr = rand < mask_prob
    # ... and not holding one of the protected ids
    for protected_id in special_token_ids:
        mask_arr = mask_arr & (token_ids != protected_id)
    # Mask selected tokens: one boolean-mask assignment covers all rows
    token_ids[mask_arr] = mask_token_id

    return inputs

In [None]:
def createNERInputFeatures(sentence_labels_list, label_list, max_seq_length, tokenizer):
    """Loads a list of sentences into a list of input features for the transformer

        Every list of a feature has exactly ``max_seq_length`` entries.
        ``input_ids``, ``input_mask``, ``segment_ids`` and ``valid_ids`` are
        indexed by token position. ``label_ids`` and ``label_mask`` are indexed
        by *valid* position: entry ``j`` belongs to the ``j``-th token whose
        ``valid_ids`` entry is 1. This is the layout of the logits of the NER
        models in this file, which move the outputs of valid tokens to the
        front of the sequence. Subword continuation tokens therefore get no
        label entry of their own.

        :param sentence_labels_list: list of sentences; each sentence is a list
            of pairs whose first item is the word and second item its label
        :param label_list: list of all labels; the position is the label id
        :param max_seq_length: length to which every feature list is cut or padded
        :param tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``
        :return: list of inpt features objects
    """

    # Map NER labels to indices (position in label_list)
    label_map = {label : i for i, label in enumerate(label_list,0)}

    features = []
    for sentence_label_pair in sentence_labels_list:
        # Tokenized text of sentence
        tokens = []
        # One label per word, i.e. per valid token
        labels = []
        # 1 for the first token of a word, 0 for subword continuation tokens
        valid = []
        # 1 for every label that takes part in the loss
        label_mask = []
        for word_label_pair in sentence_label_pair:
            word_tokens = tokenizer.tokenize(word_label_pair[0])
            tokens.extend(word_tokens)

            label_word = word_label_pair[1]
            for token_idx in range(len(word_tokens)):
                if token_idx == 0:
                    # First token of the word carries the word label
                    labels.append(label_word)
                    valid.append(1)
                    label_mask.append(1)
                else:
                    # Subword token: not valid and without a label entry, so
                    # that labels stay aligned with the valid positions
                    valid.append(0)
        # Sentence exceeds max sequence length: cut to sequence length
        if len(tokens) >= max_seq_length - 1:
            tokens = tokens[0:(max_seq_length - 2)]
            valid = valid[0:(max_seq_length - 2)]
            # Keep exactly the labels of the words that still have a first token
            num_kept_words = sum(valid)
            labels = labels[0:num_kept_words]
            label_mask = label_mask[0:num_kept_words]

        # Tokens with BERT [CLS] and [SEP] tokens
        ntokens = ["[CLS]"] + tokens + ["[SEP]"]
        # Segment ids for BERT: single segment
        segment_ids = [0] * len(ntokens)
        # Label ids; [CLS] and [SEP] are labelled and valid as well
        label_ids = [label_map["[CLS]"]] + [label_map[label] for label in labels] + [label_map["[SEP]"]]
        valid = [1] + valid + [1]
        label_mask = [1] + label_mask + [1]

        # Convert tokens to ids
        input_ids = tokenizer.convert_tokens_to_ids(ntokens)
        input_mask = [1] * len(input_ids)

        # Pad sentence to sequence length; padding positions are marked valid
        num_token_padding = max_seq_length - len(input_ids)
        if num_token_padding > 0:
            input_ids.extend([0] * num_token_padding)
            input_mask.extend([0] * num_token_padding)
            segment_ids.extend([0] * num_token_padding)
            valid.extend([1] * num_token_padding)

        # Pad labels to sequence length with the ignored label id 0
        num_label_padding = max_seq_length - len(label_ids)
        if num_label_padding > 0:
            label_ids.extend([0] * num_label_padding)
            label_mask.extend([0] * num_label_padding)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(label_ids) == max_seq_length
        assert len(valid) == max_seq_length
        assert len(label_mask) == max_seq_length

        features.append(
        InputFeatures(input_ids=input_ids,
                        input_mask=input_mask,
                        segment_ids=segment_ids,
                        label_ids=label_ids,
                        valid_ids=valid,
                        label_mask=label_mask))
    return features


In [None]:
class SyntransDataset(torch.utils.data.Dataset):
    """Dataset view over tokenizer encodings: item ``idx`` is a dict with the
    ``idx``-th row of every field as a tensor."""

    def __init__(self, encodings):
        # Mapping of field name to rows; must also expose ``input_ids`` as attribute
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {}
        for key, val in self.encodings.items():
            row = val[idx]
            if torch.is_tensor(row):
                # Copy without going through torch.tensor(), which warns when
                # it is handed a tensor
                item[key] = row.clone().detach()
            else:
                item[key] = torch.tensor(row)
        return item

    def __len__(self):
        return len(self.encodings.input_ids)

In [None]:
def train_ud_tokenizer(tokenizer, tokenizer_name):
    """
    Train a tokenizer on the UD_English-Pronouns text files and save it.

    :param tokenizer: trainable tokenizer providing ``train`` and ``save_model``
    :param tokenizer_name: name of the directory below ./tokenizers/ to save to
    """
    tokenizer_path = "./tokenizers/" + tokenizer_name
    special_tokens = [
        "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "<S>", "<T>"
    ]
    # 30,522 vocab is BERT's default vocab size, feel free to tweak
    vocab_size = 30_522
    # Collect all training files first and train once on the whole set, so
    # that every file contributes to the vocabulary.
    # NOTE(review): assumes train() accepts a list of paths, as the Hugging
    # Face `tokenizers` implementations do - confirm for the tokenizer in use.
    training_files = []
    for ud_file in sorted(glob.iglob(config.data_path + '**/UD_English-Pronouns/en_*.txt', recursive=True)):
        ud_file = os.path.abspath(ud_file)
        filename = os.path.basename(ud_file)
        print(filename, flush = True)
        training_files.append(ud_file)
    if training_files:
        tokenizer.train(files=training_files, vocab_size=vocab_size, special_tokens=special_tokens)
    else:
        print("No training files found, tokenizer is saved untrained", flush = True)
    # make the directory (and missing parents) if not already there
    os.makedirs(tokenizer_path, exist_ok=True)
    # save the tokenizer
    tokenizer.save_model(tokenizer_path)

In [None]:
def loadSentencesFromFiles(filepath):
    """
    Read every matching file and collect its lines as sentences.

    Files are visited in sorted path order and each file name is printed.
    The content is split at newline characters, so a file that ends with a
    newline contributes a final empty string.

    :param filepath: glob pattern of the files to read ('**' is recursive)
    :return: list of sentences
    """
    collected = []
    matches = sorted(glob.iglob(filepath, recursive=True))
    for match in matches:
        absolute_path = os.path.abspath(match)
        print(os.path.basename(absolute_path), flush = True)
        with open(absolute_path, 'r') as handle:
            content = handle.read()
        collected += content.split('\n')
    return collected
        


In [None]:
def loadNERLabelsFromFiles(filepath):
    """
    Load token/NER-label pairs from files.

    Every line of a file is one sentence. Within a line the entries are
    separated by tabs, and each entry is split at spaces into its parts
    (token and NER tag). Empty entries, such as the one after a trailing tab,
    are dropped. An empty line yields an empty sentence.

    :param filepath: path to files (supports glob regex)
    :return: list of sentences, each a list of [token, label] lists
    """
    all_token_label_pairs = []
    for ud_file in sorted(glob.iglob(filepath, recursive=True)):

        ud_file = os.path.abspath(ud_file)
        filename = os.path.basename(ud_file)
        print(filename, flush = True)
        with open(ud_file, 'r') as fp:
            # Split labels file by sentences
            sentences = fp.read().split('\n')
        for sentence in sentences:
            # Split sentence by tokens. Filtering, unlike list.remove(''),
            # neither fails on a line without an empty entry nor leaves
            # further empty entries behind.
            entries = [entry for entry in sentence.split("\t") if entry != '']
            # Split token and NER tags
            all_token_label_pairs.append([entry.split(" ") for entry in entries])

    return all_token_label_pairs

In [None]:
def loadSyntaxTreesFromFiles(filepath):
    """
    Unpickle every matching syntax tree file (*.syntree).

    Files are visited in sorted path order and each file name is printed.
    The result holds one entry per file.

    :param filepath: glob pattern of the files to read ('**' is recursive)
    :return: list with the unpickled content of each file
    """
    trees_per_file = []
    for match in sorted(glob.iglob(filepath, recursive=True)):
        absolute_path = os.path.abspath(match)
        print(os.path.basename(absolute_path), flush = True)
        with open(absolute_path, 'rb') as handle:
            # NOTE(review): unpickling runs arbitrary code from the file -
            # only load syntree files from a trusted source.
            loaded = pd.read_pickle(handle)
        trees_per_file.append(loaded)
    return trees_per_file

In [None]:
#all_syntrees = loadSyntaxTreesFromFiles(filepath_test_syntrees)
#print(all_syntrees[0][5])

In [None]:
# all_syntrees = loadSyntaxTreesFromFiles(filepath_test_syntrees)
# print(all_syntrees[0][0:2])

# syntree_train_loader = tg_data.DataLoader(all_syntrees[0], batch_size=config.batch_size, shuffle=True)


In [None]:
# Disabled scratch cell: the code below is wrapped in a string literal, so it
# is never executed. It tokenizes and masks sentences one by one and prints
# the ids, tokens and labels of the second sentence.
"""
# Print example of tokenized text
sentences = []
for ud_file in glob.iglob(config.data_path + '**/UD_English-Atiien_*.txt', recursive=True):

    ud_file = os.path.abspath(ud_file)
    filename = os.path.basename(ud_file)
    print(filename, flush = True)
    with open(ud_file, 'r') as fp:
        sentences.extend(fp.read().split('\n'))
count = 0
for sentence in sentences:
    # Tokenize data
    inputs = tokenizer(sentence, return_tensors='pt', max_length=config.sequence_length, truncation=True, padding='max_length')
    inputs = createMaskedInputs(inputs)

    # Create dataset from tokenized data
    dataset = SyntransDataset(inputs)
    loader = torch.utils.data.DataLoader(dataset, batch_size=config.batch_size, shuffle=True)
    if(count==1):
        print(inputs['input_ids'])
        tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
        print(tokens)
        print(inputs['labels'])
        break
    count=count+1"""

In [None]:
def getMaxSequenceLength(sentences, cutoff_limit_percent=0.9999, tokenize_fn=None):
    """
    Calculate maximum sequence length for given data.

    :param sentences: list of sentences
    :param cutoff_limit_percent: fraction (0..1) of all samples to accommodate
        with the max sequence length
    :param tokenize_fn: optional callable used instead of the notebook-level
        ``tokenizer``; it is called as ``tokenize_fn(sentence, return_tensors='pt')``
        and its result must provide ``input_ids``
    :return: smallest token count such that at least ``cutoff_limit_percent``
        of the samples are not longer; 0 for an empty list
    """
    # The global tokenizer is only looked up when no replacement is given
    tokenize = tokenizer if tokenize_fn is None else tokenize_fn

    # Histogram: token count -> number of sentences with that count
    max_sentence_tokens = 0
    sentence_tokens = {}
    print(f"Amount of samples: {len(sentences)}")
    # Tokenize data
    for sentence in sentences:
        inputs = tokenize(sentence, return_tensors='pt')

        token_count = inputs.input_ids.size(dim=1)
        sentence_tokens[token_count] = sentence_tokens.get(token_count, 0) + 1
        max_sentence_tokens = max(max_sentence_tokens, token_count)

    # Number of samples that have to fit into max_sequence_length
    cutoff = cutoff_limit_percent * len(sentences)
    covered_samples = 0
    max_sequence_length = 0
    print(max_sentence_tokens)
    for length in sorted(sentence_tokens):
        # Stop as soon as enough samples are covered. A `<=` comparison here
        # would pull in one more length bucket whenever the running count
        # hits the cutoff exactly.
        if covered_samples >= cutoff:
            break
        covered_samples += sentence_tokens[length]
        max_sequence_length = length

    # The cutoff is a fraction, so format it as a percentage
    print(f"Max sequence length: {max_sequence_length} with {cutoff_limit_percent:.2%} of samples covered")
    return max_sequence_length

In [None]:
#sentences = loadSentencesFromFiles(config.data_path + '**/en_*.txt')

In [None]:
#print(getMaxSequenceLength(sentences))

In [None]:
def createDataloader(filepath, shuffle_data=False):
    """
    Build a DataLoader for the configured task from the files matching filepath.

    For task 'ner' the label files are loaded, cut to the configured number of
    sentences and converted into tensors of input features. For task 'mlm' the
    sentences are tokenized and masked.

    :param filepath: glob pattern of the data files
    :param shuffle_data: whether the loader shuffles the samples
    :return: DataLoader with the configured batch size
    :raises ValueError: if the configured task is neither 'ner' nor 'mlm'
    """
    # Load NER labels
    if(config.task == 'ner'):
        print(filepath)

        sentence_labels_list = loadNERLabelsFromFiles(filepath)
        # Counts refer to everything that was loaded, before the cut below
        num_sentences = len(sentence_labels_list)
        num_batches = math.ceil(num_sentences / config.batch_size)
        print(f"{num_sentences} sentences, {num_batches} batches of size {config.batch_size}")
        print(sentence_labels_list[0:2])

        sentence_labels_list = sentence_labels_list[0:config.num_sentences]
        features = createNERInputFeatures(sentence_labels_list, ner_tags_list, config.sequence_length, tokenizer)
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in features], dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in features], dtype=torch.long)
        data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids,all_valid_ids,all_lmask_ids)
        print(data.__sizeof__())
        # Print control example of InputFeatures (second sentence); skipped
        # when fewer than two sentences are available
        if len(features) > 1:
            control_example = features[1]
            print("Control example of InputFeatures")
            print(str(control_example.input_ids))
            print(str(control_example.input_mask))
            print(str(control_example.label_ids))
            print(str(control_example.valid_ids))
            print(str(control_example.label_mask))
            print(str(control_example.segment_ids))

    elif(config.task == 'mlm'):
        # Load data
        sentences = loadSentencesFromFiles(filepath)
        # Tokenize data
        inputs = tokenizer(sentences, return_tensors='pt', max_length=config.sequence_length, truncation=True, padding='max_length')
        inputs = createMaskedInputs(inputs)

        # Create dataset from tokenized data
        data = SyntransDataset(inputs)

    else:
        # Without this, the DataLoader line below fails with a NameError
        raise ValueError(f"Unsupported task {config.task!r}: expected 'ner' or 'mlm'")

    loader = DataLoader(data, batch_size=config.batch_size, shuffle=shuffle_data)
    return loader

In [None]:
def loadTrainData():
    """
    Build the shuffled training DataLoader for the configured task.

    :return: DataLoader for task 'ner' or 'mlm', None for any other task
    """
    print("Loading Training Data")
    task = config.task
    if task == 'ner':
        source_pattern = filepath_train_ner_labels
    elif task == 'mlm':
        source_pattern = filepath_train_data
    else:
        return None
    return createDataloader(source_pattern, shuffle_data=True)

In [None]:
def loadValidationData():
    """
    Build the validation DataLoader for the configured task.

    :return: DataLoader for task 'ner' or 'mlm', None for any other task
    """
    print("Loading Validation Data")
    task = config.task
    if task == 'ner':
        source_pattern = filepath_validation_ner_labels
    elif task == 'mlm':
        source_pattern = filepath_validation_data
    else:
        return None
    return createDataloader(source_pattern)

In [None]:
def loadTestData():
    """
    Build the test DataLoader for the configured task.

    :return: DataLoader for task 'ner' or 'mlm', None for any other task
    """
    print("Test Data")
    task = config.task
    if task == 'ner':
        source_pattern = filepath_test_ner_labels
    elif task == 'mlm':
        source_pattern = filepath_test_data
    else:
        return None
    return createDataloader(source_pattern)

In [None]:
def trainModel(epoch, trainLoader, writer):
    """
    Train the global model for one pass over trainLoader.

    :param epoch: epoch number, used for the progress bar and the log entry
    :param trainLoader: DataLoader with the training batches
    :param writer: Tensorboard writer; receives the mean loss under "Loss"
    :return: tuple of the mean batch loss and the logits of the last batch
        (None if the loader is empty)
    :raises ValueError: if the configured task is neither 'ner' nor 'mlm'
    """
    # activate training mode
    model.train()

    from torch.optim import AdamW
    # initialize optimizer
    # NOTE(review): a new optimizer is created on every call, so optimizer
    # state does not carry over between calls - confirm this is intended.
    optim = AdamW(model.parameters(), lr=config.learning_rate)
    epoch_loss = 0
    # Stays None when the loader yields no batch
    logits = None
    # setup loop with TQDM and dataloader
    loop = tqdm(trainLoader, leave=True, mininterval=40,maxinterval=120)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        if (config.task == 'ner'):
            # pull all tensor batches required for training
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids, valid_ids,l_mask = batch
            # Keep the loss tensor itself: backward() below needs it
            loss, logits = model(input_ids, segment_ids, input_mask, label_ids,valid_ids,l_mask)
        elif (config.task == 'mlm'):
            # pull all tensor batches required for training
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            # process
            outputs = model(input_ids, attention_mask=attention_mask,
                            labels=labels)
            # extract loss and logits
            loss = outputs.loss
            logits = outputs.logits
        else:
            raise ValueError(f"Unsupported task {config.task!r}: expected 'ner' or 'mlm'")
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # Plain number for display and averaging, for both tasks
        batch_loss = loss.item()
        # print relevant info to progress bar
        loop.set_description(f'Train Epoch {epoch}')
        loop.set_postfix(loss=batch_loss)
        epoch_loss = epoch_loss + batch_loss
    # Calculate epoch loss; an empty loader keeps the loss at 0
    num_batches = len(trainLoader)
    if num_batches > 0:
        epoch_loss = epoch_loss / num_batches
    # Print info to Tensorboard
    writer.add_scalar("Loss", epoch_loss, epoch)
    return epoch_loss, logits

In [None]:
def validateModel(epoch, validationLoader, writer):
    """
    Compute the mean loss of the global model over validationLoader.

    :param epoch: epoch number, used for the progress bar and the log entry
    :param validationLoader: DataLoader with the validation batches
    :param writer: Tensorboard writer; receives the mean loss under "Loss"
    :return: mean batch loss (0 if the loader is empty)
    :raises ValueError: if the configured task is neither 'ner' nor 'mlm'
    """
    # activate eval mode
    model.eval()

    epoch_loss = 0
    # setup loop with TQDM and dataloader
    with torch.no_grad():
        loop = tqdm(validationLoader, leave=True, mininterval=40,maxinterval=120)
        for batch in loop:
            if (config.task == 'ner'):
                # pull all tensor batches required for validation
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, valid_ids,l_mask = batch
                batch_loss, logits = model(input_ids, segment_ids, input_mask, label_ids,valid_ids,l_mask)
                batch_loss = batch_loss.item()
            elif (config.task == 'mlm'):
                # pull all tensor batches required for validation
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                # process; read the loss by attribute as trainModel does,
                # rather than tuple-unpacking the model output object
                outputs = model(input_ids, attention_mask=attention_mask,
                                labels=labels)
                batch_loss = outputs.loss.item()
            else:
                raise ValueError(f"Unsupported task {config.task!r}: expected 'ner' or 'mlm'")

            # print relevant info to progress bar
            loop.set_description(f'Validation Epoch {epoch}')

            loop.set_postfix(loss=batch_loss)
            epoch_loss = epoch_loss + batch_loss

    # Average, log and return only after ALL batches have been processed;
    # inside the loop this would end validation after the first batch.
    num_batches = len(validationLoader)
    if num_batches > 0:
        epoch_loss = epoch_loss / num_batches
    # Print info to Tensorboard
    writer.add_scalar("Loss", epoch_loss, epoch)
    return epoch_loss

In [None]:
from sys import stderr


def evaluateModel(data_loader, mode=None, writer = None, results_dir = None,  epoch = None):
    """Run one pass over ``data_loader`` and report metrics for ``config.task``.

    * Task ``'mlm'``: predicts token ids with the global ``model`` and writes
      macro and weighted recall / precision / F1 / ROC-AUC to a results file
      under ``./logs/``. Nothing is returned.
    * Task ``'ner'``: in ``'Train'`` mode the model is optimised with a freshly
      created AdamW optimizer while predictions are collected; in
      ``'Validation'`` and ``'Test'`` mode the forward pass runs without
      gradient tracking. Train/Validation log the seqeval report to ``writer``
      and return the metrics; Test writes the report to
      ``results_dir + "results.txt"`` and returns nothing.

    Args:
        data_loader: Sized iterable of batches for the selected task.
        mode: One of ``'Train'``, ``'Validation'`` or ``'Test'``.
        writer: Optional object with ``add_scalar(tag, value, step)``; used in
            NER Train/Validation mode only.
        results_dir: Directory for the NER test report. The file name is
            appended directly, so the path must end with a separator.
        epoch: Epoch index used for progress text and as Tensorboard step.

    Returns:
        For NER in Train/Validation mode the tuple ``(epoch_loss,
        macro_precision, macro_recall, macro_f1, weighted_precision,
        weighted_recall, weighted_f1)``; otherwise ``None``.

    Raises:
        ValueError: If ``mode`` is not one of the three supported values.
    """
    # Fail loudly instead of calling the non-callable ``sys.stderr`` and exit(),
    # which would take the notebook kernel down.
    if mode not in ('Train', 'Validation', 'Test'):
        raise ValueError("Mode must be Train, Validation or Test")

    if mode == 'Train':
        model.train()
        from torch.optim import AdamW
        # initialize optimizer
        # NOTE(review): a new optimizer is created on every call, so AdamW state
        # does not carry over between epochs - confirm this is intended.
        optim = AdamW(model.parameters(), lr=config.learning_rate)
    else:
        model.eval()

    if(config.task == 'mlm'):
        references_all = []
        predictions_all = []
        references_roc_all = []
        predictions_roc_all = []

        # Load the metrics once instead of once per batch
        recall_metric = evaluate.load('recall')
        precision_metric = evaluate.load('precision')
        f1_metric = evaluate.load('f1')
        roc_auc_metric = evaluate.load("roc_auc", "multiclass")

        softmax = nn.Softmax(dim = -1)

        # Setup loop with TQDM and dataloader
        loop = tqdm(data_loader, leave=True, mininterval=20,maxinterval=120)
        for batch in loop:
            # Pull all tensor batches required for the forward pass
            # NOTE(review): the batch's attention mask is not passed to the model
            # here (unchanged from the previous behaviour) - confirm.
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device).tolist()

            if mode == 'Train':
                # initialize calculated gradients (from prev step)
                # NOTE(review): no backward()/step() follows in the MLM branch,
                # so 'Train' mode does not update any weights here.
                optim.zero_grad()
                predictions = model(input_ids)
            else:
                with torch.no_grad():
                    predictions = model(input_ids)
            predictions = predictions['logits']

            # Change type to double to prevent floating point rounding errors
            predictions = predictions.type(torch.float64)
            predictions_sm = softmax(predictions)

            # Index of the highest scoring token per position. Only the trailing
            # k=1 axis is squeezed so that a batch of one keeps its batch axis.
            y = torch.topk(predictions, k=1, dim = 2)[1].squeeze(-1)
            y = y.tolist()

            # Go through all samples in batch and add to computation batch
            for idx, pred_batch in enumerate(y):
                references_all.extend(labels[idx])
                predictions_all.extend(pred_batch)

            # Collect class probabilities for ROC
            for batch_idx, pred_batch in enumerate(predictions_sm):
                predictions_roc_all.extend(pred_batch.tolist())
                references_roc_all.extend(labels[batch_idx])
                # NOTE(review): only the first sample of the batch is used for ROC
                break
            # NOTE(review): evaluation stops after the first batch; kept as-is
            # because lifting it changes runtime and memory use substantially.
            break

        # List all possible labels
        all_labels = np.arange(tokenizer.vocab_size)
        report_lines = [f"Results: {config.tokenizer}, Train={config.train_model} {config.tokenizer}_E{config.epochs}_batches{config.batch_size}_LR{config.learning_rate}_SL{config.sequence_length}"]
        for averaging in ('macro', 'weighted'):
            report_lines.append(f"{averaging} averaging")
            report_lines.append(str(recall_metric.compute(references = references_all, predictions = predictions_all, average = averaging)))
            report_lines.append(str(precision_metric.compute(references = references_all, predictions = predictions_all, average = averaging, zero_division = 0)))
            report_lines.append(str(f1_metric.compute(references = references_all, predictions = predictions_all, average = averaging)))
            report_lines.append(str(roc_auc_metric.compute(references = references_roc_all, prediction_scores = predictions_roc_all, average = averaging, multi_class = 'ovo', labels = all_labels, max_fpr = 1.0)))
        # The context manager closes the file; no explicit close() needed
        with open(f"./logs/Results_{config.task}_{config.tokenizer}_E{config.epochs}_batches{config.batch_size}_LR{config.learning_rate}_SL{config.sequence_length}.txt", "w") as output:
            output.write("\n".join(report_lines))

    elif (config.task == 'ner'):
        y_true = []
        y_pred = []
        label_map = {i : label for i, label in enumerate(ner_tags_list,0)}

        sep_token_id = int(ner_tags_list.index("[SEP]"))
        cls_token_id = int(ner_tags_list.index("[CLS]"))
        O_token_id = int(ner_tags_list.index("O"))

        special_token_predictions = 0
        O_token_predictions = 0
        epoch_loss = 0
        softmax = nn.Softmax(dim=2)

        # setup loop with TQDM and dataloader
        loop = tqdm(data_loader, leave=True, mininterval=20,maxinterval=120)
        # Loop over all batches
        for batch in loop:
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids, valid_ids,l_mask = batch

            if(mode == 'Train'):
                # initialize calculated gradients (from prev step)
                optim.zero_grad()
                loss, logits = model(input_ids, segment_ids, input_mask, label_ids,valid_ids,l_mask)
                # calculate loss for every parameter that needs grad update
                loss.backward()
                # update parameters
                optim.step()
            elif(mode == 'Validation'):
                with torch.no_grad():
                    loss, logits = model(input_ids, segment_ids, input_mask, label_ids,valid_ids,l_mask)
            else:
                # Test: no labels are passed, and no gradients are needed either
                with torch.no_grad():
                    logits = model(input_ids, segment_ids, input_mask,valid_ids=valid_ids,attention_mask_label=l_mask)

            if(mode == 'Validation' or mode == 'Train'):
                # print relevant info to progress bar
                loop.set_description(f'{mode} Epoch {epoch}')
                batch_loss = loss.item()
                loop.set_postfix(loss=batch_loss)
                epoch_loss = epoch_loss + batch_loss

            # Get highest NER label prediction for all sentences
            predicted_ids = torch.argmax(softmax(logits),dim=2)
            predicted_ids = predicted_ids.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()

            # Go through true labels
            for sentence_idx, true_sentence_labels in enumerate(label_ids):
                y_true_temp = []
                y_pred_temp = []

                for token_idx, label_id in enumerate(true_sentence_labels):
                    # Skip label id 0 and the [CLS] label at sequence beginning
                    if label_id == 0 or label_id == cls_token_id:
                        continue

                    # [SEP] marks the sentence end: store the sentence and ignore
                    # [SEP] itself plus everything after it (sequence padding).
                    # A sentence without a [SEP] label is never stored.
                    if label_id == sep_token_id:
                        y_true.append(y_true_temp)
                        y_pred.append(y_pred_temp)
                        break

                    predicted_id = predicted_ids[sentence_idx][token_idx]
                    # Predicted NER label is special token: count preds
                    if predicted_id == 0:
                        special_token_predictions = special_token_predictions +1
                    # Predicted NER label is O: count preds
                    elif predicted_id == O_token_id:
                        O_token_predictions = O_token_predictions +1

                    # Append label and prediction to list
                    y_true_temp.append(label_map[label_id])
                    y_pred_temp.append(label_map[predicted_id])

        if (mode == 'Train' or mode == 'Validation'):
            print(f"True: {y_true[0:1]}, Predicted: {y_pred[0:1]}")
            report = classification_report(y_true, y_pred, digits=6, output_dict=True, zero_division = 0)
            # Calculate epoch loss
            epoch_loss = epoch_loss / len(data_loader)

            macro_precision = report['macro avg']['precision']
            macro_recall = report['macro avg']['recall']
            macro_f1 = report['macro avg']['f1-score']
            weighted_precision = report['weighted avg']['precision']
            weighted_recall = report['weighted avg']['recall']
            weighted_f1 = report['weighted avg']['f1-score']

            # Print info to Tensorboard (skipped when no writer was supplied)
            if writer is not None:
                scalars = (
                    ("Loss", epoch_loss),
                    ("macro_avg/precision", macro_precision),
                    ("macro_avg/recall", macro_recall),
                    ("macro_avg/f1", macro_f1),
                    ("weighted_avg/precision", weighted_precision),
                    ("weighted_avg/recall", weighted_recall),
                    ("weighted_avg/f1", weighted_f1),
                )
                for tag, value in scalars:
                    writer.add_scalar(tag, value, epoch)

            print(f"O Token Predictions: {O_token_predictions}")
            print(f"loss: {epoch_loss} w prec: {weighted_precision} w recall: {weighted_recall} w f1: {weighted_f1}")
            return epoch_loss, macro_precision, macro_recall, macro_f1, weighted_precision, weighted_recall, weighted_f1

        else:
            report = classification_report(y_true, y_pred, digits=4, output_dict=False)
            with open(results_dir +"results.txt", "w") as output:
                # The run summary goes to stdout; only the report is written to file
                print("***** Test results *****")
                print(f"Task: {config.task}")
                print(f"Model path: {config.saved_model_path}")
                print(f"Data path: {config.data_path}")
                print(f"Tokenizer: {config.tokenizer}")
                print(f"Batch size: {config.batch_size}")
                print(f"Epochs: {config.epochs}")
                print(f"Learning rate: {config.learning_rate}")
                print(f"Sequence length: {config.sequence_length}")
                print(f"Training: {config.train_model}")
                print(f"Num Threads: {config.num_threads}")
                print(f"Num Sentences: {config.num_sentences}")
                print(f"{report}\n Special token predictions: {special_token_predictions}")
                output.write(report)







In [None]:
def _reserveRunDirectory(path_prefix, first_idx):
    """Choose ``<path_prefix>_<idx>/`` starting at ``first_idx`` and create it if missing.

    The index moves past directories that already exist and contain entries,
    but never beyond 30; an existing empty directory is reused.

    Returns:
        Tuple of the chosen directory path (with trailing slash) and its index.
    """
    idx = first_idx
    while os.path.exists(f"{path_prefix}_{idx}"):
        entries = os.listdir(f"{path_prefix}_{idx}")
        # Stop at the first empty directory, or once the index cap is reached
        if not entries or idx >= 30:
            break
        idx = idx + 1
    run_dir = f"{path_prefix}_{idx}/"
    if not os.path.isdir(run_dir):
        os.makedirs(run_dir)
    return run_dir, idx


# Shared suffix describing the run configuration
run_tag = f"{config.tokenizer}_E{config.epochs}_batches{config.batch_size}_LR{config.learning_rate}_SL{config.sequence_length}"

# Tensorboard logging directory
tensorboard_dir, log_idx = _reserveRunDirectory(f"./runs/{config.task}/{run_tag}", 0)

# Results directory; the search continues from the index used for Tensorboard
results_dir, log_idx = _reserveRunDirectory(f"./logs/{config.task}/Results/{config.task}_{run_tag}", log_idx)


In [None]:
def _saveMetricPlot(num_epochs, train_values, validation_values, title, ylabel,
                    file_path, train_label='Training', validation_label='Validation',
                    show=False):
    """Plot a training and a validation curve over the epochs and save the figure.

    Args:
        num_epochs: Number of epochs; the x axis is ``range(0, num_epochs)``.
        train_values: Per-epoch values of the training curve (drawn in blue).
        validation_values: Per-epoch values of the validation curve (drawn in green).
        title: Figure title.
        ylabel: Label of the y axis.
        file_path: Path the figure is saved to.
        train_label: Legend label of the training curve.
        validation_label: Legend label of the validation curve.
        show: When true, ``plt.show()`` is called after saving.
    """
    plt.figure()
    plt.plot(range(0, num_epochs), train_values, 'b', label=train_label)
    plt.plot(range(0, num_epochs), validation_values, 'g', label=validation_label)
    plt.title(title)
    plt.xlabel('Epochs')
    plt.ylabel(ylabel)
    plt.legend()
    plt.savefig(file_path, facecolor='white', transparent=False)
    if show:
        plt.show()


if (config.train_model):
    print("Training model", flush=True)

    trainLoader = loadTrainData()
    validationLoader = loadValidationData()

    epochs = config.epochs

    train_writer = SummaryWriter(log_dir=tensorboard_dir+"training")
    validation_writer = SummaryWriter(log_dir=tensorboard_dir+"validation")

    # Per-epoch metric histories for the training split
    epoch_losses_train = []
    macro_precisions_train = []
    macro_recalls_train = []
    macro_f1s_train = []
    weighted_precisions_train = []
    weighted_recalls_train = []
    weighted_f1s_train = []

    # Per-epoch metric histories for the validation split
    epoch_losses_validation = []
    macro_precisions_val = []
    macro_recalls_val = []
    macro_f1s_val = []
    weighted_precisions_val = []
    weighted_recalls_val = []
    weighted_f1s_val = []

    # Same order as the tuple returned by evaluateModel
    train_histories = (epoch_losses_train, macro_precisions_train, macro_recalls_train, macro_f1s_train,
                       weighted_precisions_train, weighted_recalls_train, weighted_f1s_train)
    validation_histories = (epoch_losses_validation, macro_precisions_val, macro_recalls_val, macro_f1s_val,
                            weighted_precisions_val, weighted_recalls_val, weighted_f1s_val)

    for epoch in range(epochs):
        # NOTE(review): evaluateModel only returns metrics for the 'ner' task; for
        # 'mlm' it returns None and this loop fails on the first epoch.
        train_metrics = evaluateModel(trainLoader, mode = 'Train', writer = train_writer, results_dir = results_dir, epoch=epoch)
        for history, value in zip(train_histories, train_metrics):
            history.append(value)

        validation_metrics = evaluateModel(validationLoader, mode = 'Validation', writer = validation_writer, results_dir = results_dir, epoch=epoch)
        for history, value in zip(validation_histories, validation_metrics):
            history.append(value)

        if (activeMode == 'prod'):
            # Save model after each epoch
            model.save_pretrained(save_directory=f"./trained_models/{config.task}/{config.tokenizer}_E{epoch}_batches{config.batch_size}_LR{config.learning_rate}_SL{config.sequence_length}/")

    train_writer.close()
    validation_writer.close()

    # Save epoch loss plot (the only one that is explicitly shown)
    _saveMetricPlot(epochs, epoch_losses_train, epoch_losses_validation, 'Loss', 'Loss',
                    results_dir +"/loss.png", train_label='Training loss',
                    validation_label='Validation loss', show=True)

    # Save the metric plots: (train values, validation values, title, y label, file name)
    metric_plots = (
        (macro_f1s_train, macro_f1s_val, 'Macro Avg F1', 'F1', "/f1_macro.png"),
        (weighted_f1s_train, weighted_f1s_val, 'Weighted Avg F1', 'F1', "/f1_weighted.png"),
        (weighted_recalls_train, weighted_recalls_val, 'Weighted Avg Recall', 'Recall', "/recall_weighted.png"),
        (macro_recalls_train, macro_recalls_val, 'Macro Avg Recall', 'Recall', "/recall_macro.png"),
        (weighted_precisions_train, weighted_precisions_val, 'Weighted Avg Precision', 'Precision', "/precision_weighted.png"),
        (macro_precisions_train, macro_precisions_val, 'Macro Avg Precision', 'Precision', "/precision_macro.png"),
    )
    for train_values, validation_values, plot_title, plot_ylabel, file_name in metric_plots:
        _saveMetricPlot(epochs, train_values, validation_values, plot_title, plot_ylabel,
                        results_dir + file_name)

    # print(str(features[1].input_ids))
    # print(str(features[1].input_mask))
    # print(str(features[1].label_ids))
    # print(str(features[1].valid_ids))
    # print(str(features[1].label_mask))
    # print(str(features[1].segment_ids))

# Model evaluation

In [None]:
# Announce the start of the evaluation stage; flush=True forces the message out right away
print("Model evaluation\n", flush = True)

In [None]:
# Evaluate the model on the test split in 'Test' mode.
# loadTestData is defined elsewhere in the notebook - presumably it returns the
# test data loader (TODO confirm). For the 'ner' task evaluateModel writes the
# report to results_dir + "results.txt"; no writer or epoch is passed here.
test_loader = loadTestData()
evaluateModel(test_loader, mode = 'Test', results_dir=results_dir)

In [None]:
# Mark the end of the evaluation stage in the notebook output
print("Finished evaluation")

In [None]:
#confusion_matrix = metrics.confusion_matrix(references_all, predictions_all, labels=labels)
#print(confusion_matrix)
#disp = metrics.ConfusionMatrixDisplay(references_all, predictions_all, labels=labels)
#disp.plot()

In [None]:
"""
# precision recall curve
from sklearn.metrics import precision_recall_curve, roc_curve
import matplotlib.pyplot as plt
#%matplotlib inline
precision = dict()
recall = dict()
for i in labels:
    precision[i], recall[i], _ = precision_recall_curve(references_roc_all[i],
                                                        predictions_roc_all[i])
    plt.plot(recall[i], precision[i], lw=2, label='class {}'.format(i))
    break
    
plt.xlabel("recall")
plt.ylabel("precision")
plt.legend(loc="best")
plt.title("precision vs. recall curve")
plt.show()"""