In [24]:
import os

# Export env vars to limit number of threads to use
num_threads = "26"
os.environ["OMP_NUM_THREADS"] = num_threads 
os.environ["OPENBLAS_NUM_THREADS"] = num_threads
os.environ["MKL_NUM_THREADS"] = num_threads 
os.environ["VECLIB_MAXIMUM_THREADS"] = num_threads
os.environ["NUMEXPR_NUM_THREADS"] = num_threads

# Only use CPU, hide GPU
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [25]:
from transformers import BertTokenizer, BertForMaskedLM
import torch
import torch.nn as nn
#Import SummaryWriter for Tensorboard logging
from torch.utils.tensorboard import SummaryWriter
import evaluate
import numpy as np
# Progress bar
from tqdm import tqdm
# Easy file reading
import glob
import pandas as pd
import importlib
from sklearn import metrics

In [26]:
import utilities as utils
# Reload library if changed
importlib.reload(utils)

<module 'utilities' from '/home/shrdlu/cdaniel/syntrans/utilities.py'>

In [27]:
# Limit no. of threads used by Pytorch
torch.set_num_threads = int(num_threads)

In [28]:
PID = os.getpid()
PGID = os.getpgid(PID)
print(f"PID: {PID}, PGID: {PGID}")

PID: 12622, PGID: 113


In [29]:
activeMode= "develop"

In [30]:
configuration_csv = pd.read_csv(f"./config/{activeMode}.csv", dtype=str, sep=";")
config = utils.configureParameters(configuration_csv)
print(f"Model path: {config.saved_model_path}")
print(f"Data path: {config.data_path}")
print(f"Tokenizer: {config.tokenizer}")
print(f"Batch size: {config.batch_size}")
print(f"Epochs: {config.epochs}")
print(f"Learning rate: {config.learning_rate}")
print(f"Sequence length: {config.sequence_length}")
print(f"Training: {config.train_model}")



Model path: bert-base-uncased
Data path: ./data/original/ud/
Tokenizer: bert-base-uncased
Batch size: 2
Epochs: 3
Learning rate: 5e-05
Sequence length: 133
Training: False


In [31]:
writer = SummaryWriter()
tokenizer = BertTokenizer.from_pretrained(config.tokenizer)
model = BertForMaskedLM.from_pretrained(config.saved_model_path)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [32]:
device =  torch.device('cpu')
# Move model to device
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [33]:
def createMaskedInputs(inputs):
    """
    creates masked input embeddings and labels from tokenized text

    :param inputs: tokenized text
    :return: masked input embeddings and new column labels 
    """ 
    # Clone input ids (tokens) to create labels
    inputs['labels'] = inputs.input_ids.detach().clone()
    # create random array of floats with equal dimensions to input_ids tensor
    rand = torch.rand(inputs.input_ids.shape)
    # create mask array with 15% masked tokens
    mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
        (inputs.input_ids != 102) * (inputs.input_ids != 0)
    # Select indices of each nonzero (= selected) value as token to be masked
    selection = []

    for i in range(inputs.input_ids.shape[0]):
        selection.append(
            torch.flatten(mask_arr[i].nonzero()).tolist()
        )
    # Mask selected tokens: replace with [MASK] code 103 in tensor
    for i in range(inputs.input_ids.shape[0]):
        inputs.input_ids[i, selection[i]] = 103
    
    return inputs

In [34]:
class SyntransDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [35]:
def train_ud_tokenizer(tokenizer, tokenizer_name):
    tokenizer_path = "./tokenizers/" + tokenizer_name 
    special_tokens = [
  "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "<S>", "<T>"
    ]
    # 30,522 vocab is BERT's default vocab size, feel free to tweak
    vocab_size = 30_522
    # Load data
    text = []
    for ud_file in glob.iglob(config.data_path + '**/UD_English-Pronouns/en_*.txt', recursive=True):

        ud_file = os.path.abspath(ud_file)
        filename = os.path.basename(ud_file)
        print(filename, flush = True)
        tokenizer.train(files=ud_file, vocab_size=vocab_size, special_tokens=special_tokens)
    # make the directory if not already there
    if not os.path.isdir(tokenizer_path):
        os.mkdir(tokenizer_path)
    # save the tokenizer  
    tokenizer.save_model(tokenizer_path)
    


        



In [36]:
def loadSentencesFromFiles(filepath):
    """
    Load sentences from files.

    :param filepath: path to files (supports glob regex)
    :return: list of sentences
    """ 
    sentences = []
    for ud_file in glob.iglob(filepath, recursive=True):

        ud_file = os.path.abspath(ud_file)
        filename = os.path.basename(ud_file)
        print(filename, flush = True)
        with open(ud_file, 'r') as fp:
            sentences.extend(fp.read().split('\n'))
    return sentences
        


In [37]:
"""
# Print example of tokenized text
sentences = []
for ud_file in glob.iglob(config.data_path + '**/UD_English-Atiien_*.txt', recursive=True):

    ud_file = os.path.abspath(ud_file)
    filename = os.path.basename(ud_file)
    print(filename, flush = True)
    with open(ud_file, 'r') as fp:
        sentences.extend(fp.read().split('\n'))
count = 0
for sentence in sentences:
    # Tokenize data
    inputs = tokenizer(sentence, return_tensors='pt', max_length=config.sequence_length, truncation=True, padding='max_length')
    inputs = createMaskedInputs(inputs)

    # Create dataset from tokenized data
    dataset = SyntransDataset(inputs)
    loader = torch.utils.data.DataLoader(dataset, batch_size=config.batch_size, shuffle=True)
    if(count==1):
        print(inputs['input_ids'])
        tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
        print(tokens)
        print(inputs['labels'])
        break
    count=count+1"""

"\n# Print example of tokenized text\nsentences = []\nfor ud_file in glob.iglob(config.data_path + '**/UD_English-Atiien_*.txt', recursive=True):\n\n    ud_file = os.path.abspath(ud_file)\n    filename = os.path.basename(ud_file)\n    print(filename, flush = True)\n    with open(ud_file, 'r') as fp:\n        sentences.extend(fp.read().split('\n'))\ncount = 0\nfor sentence in sentences:\n    # Tokenize data\n    inputs = tokenizer(sentence, return_tensors='pt', max_length=config.sequence_length, truncation=True, padding='max_length')\n    inputs = createMaskedInputs(inputs)\n\n    # Create dataset from tokenized data\n    dataset = SyntransDataset(inputs)\n    loader = torch.utils.data.DataLoader(dataset, batch_size=config.batch_size, shuffle=True)\n    if(count==1):\n        print(inputs['input_ids'])\n        tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])\n        print(tokens)\n        print(inputs['labels'])\n        break\n    count=count+1"

In [38]:
def getMaxSequenceLength(sentences, cutoff_limit_percent=0.9999):
    """
    Calculate maximum sequence length for given data.
    param sentences: list of sentences
    param cutoff_limit_percent: percentage of all samples to accommodate with the max sequence length.
    returns: max sequence length which encompasses cutoff_limit_percent of all data samples
    """
    # Get number of tokens per sentence        
    max_sentence_tokens = 0
    sentence_tokens = {}
    print(f"Amount of samples: {len(sentences)}")
    # Tokenize data
    for sentence in sentences:

        inputs = tokenizer(sentence, return_tensors='pt')
        
        token_count = inputs.input_ids.size(dim=1)
        sentence_tokens[inputs.input_ids.size(dim=1)] = sentence_tokens.get(token_count,0) + 1
        if(token_count > max_sentence_tokens): 
            max_sentence_tokens = token_count
            
    no_tokens = 0
    # Calulate number of samples which should have a sequence length smaller than max_sequence_length
    cutoff = cutoff_limit_percent * len(sentences)
    max_sequence_length = 0
    print(max_sentence_tokens)
    for i in sorted(sentence_tokens):
        # print((i, sentence_tokens[i]), end=" ")
        if(no_tokens <= cutoff):
            no_tokens = no_tokens + sentence_tokens[i]
            max_sequence_length = i

    print(f"Max sequence length: {max_sequence_length} with {cutoff_limit_percent}% of samples smaller")
    return max_sequence_length

In [39]:
#sentences = loadSentencesFromFiles(config.data_path + '**/en_*.txt')

In [40]:
#print(getMaxSequenceLength(sentences))

In [41]:
if (config.train_model):
    print("Training model", flush=True)

    # Load data
    sentences = loadSentencesFromFiles(config.data_path + '**/*-train.txt')

    # Tokenize data
    inputs = tokenizer(sentences, return_tensors='pt', max_length=config.sequence_length, truncation=True, padding='max_length')
    inputs = createMaskedInputs(inputs)

    # Create dataset from tokenized data
    dataset = SyntransDataset(inputs)
    loader = torch.utils.data.DataLoader(dataset, batch_size=config.batch_size, shuffle=True)
    
    # activate training mode
    model.train()

    from torch.optim import AdamW
    # initialize optimizer
    optim = AdamW(model.parameters(), lr=config.learning_rate)

    epochs = config['epochs']
    writer.add_scalar("LR", config.learning_rate)
    writer.add_scalar("Batchsize", config.batch_size)

    for epoch in range(epochs):
        scalar_loss = 0
        # setup loop with TQDM and dataloader
        loop = tqdm(loader, leave=True, mininterval=40,maxinterval=120)
        for batch in loop:
            # initialize calculated gradients (from prev step)
            optim.zero_grad()
            # pull all tensor batches required for training
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            # process
            outputs = model(input_ids, attention_mask=attention_mask,
                            labels=labels)
            # extract loss
            loss = outputs.loss
            # calculate loss for every parameter that needs grad update
            loss.backward()
            # update parameters
            optim.step()
            # print relevant info to progress bar
            loop.set_description(f'Epoch {epoch}')
            scalar_loss = loss.item()
            loop.set_postfix(loss=scalar_loss)
        # Print info to Tensorboard
        writer.add_scalar("Loss/train", scalar_loss, epoch)
        # Save model after each epoch
        model.save_pretrained(save_directory=f"./trained_models/{config.tokenizer}_E{epoch}_batches{config.batch_size}_LR{config.learning_rate}_SL{config.sequence_length}/")
writer.close()

# Model evaluation

In [42]:
print("Model evaluation\n", flush = True)
# Read test files
sentences = loadSentencesFromFiles(config.data_path + '**/*-test.txt')

Model evaluation

en_gum-ud-test.txt
en_lines-ud-test.txt
en_atis-ud-test.txt
en_pud-ud-test.txt
en_pronouns-ud-test.txt
en_partut-ud-test.txt
en_esl-ud-test.txt
en_ewt-ud-test.txt
en_gumreddit-ud-test.txt


In [43]:
print(config.sequence_length)

133


In [44]:

inputsTest = tokenizer(sentences, return_tensors='pt', max_length=config.sequence_length, truncation=True, padding='max_length')
inputsTest = createMaskedInputs(inputsTest)

In [45]:
datasetTest = SyntransDataset(inputsTest)
loader = torch.utils.data.DataLoader(datasetTest, batch_size=config.batch_size, shuffle=False)

In [56]:
model.eval()

with torch.no_grad():
    references_all = []
    predictions_all = []
    references_roc_all = []
    predictions_roc_all = []
    # Setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True, mininterval=20,maxinterval=120)
    for batch in loop:
        # Pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device).tolist()

        softmax = nn.Softmax(dim = -1)
        predictions = model(input_ids)
        predictions = predictions['logits']
        #print(np.shape(np.array(predictions)))
        
        predictions_sm = softmax(predictions)
        #print(np.sum(np.array(predictions_sm[0][0])))
        # Get index of argmax
        #y = np.argmax(predictions_sm, axis = -1)
        # y = y.tolist()
        y = torch.topk(predictions, k=1, dim = 2)[1].squeeze()
        y = y.tolist()

#        words_pred = []
#        words_true = []
#        for w in y:
#            words_pred.append( tokenizer.convert_ids_to_tokens(w))
        
#        for w in labels[0]:
#            words_true.append( tokenizer.convert_ids_to_tokens(w))
        
        #print(words_pred[0:50])
        #print(words_true[0:50])
            

        recall_metric = evaluate.load('recall')
        precision_metric = evaluate.load('precision')
        f1_metric = evaluate.load('f1')
        roc_auc_metric = evaluate.load("roc_auc", "multiclass")


        # Go through all samples in batch and add to computation batch
        for idx, pred_batch in enumerate(y):
            references_all.extend(labels[idx])
            predictions_all.extend(pred_batch)
            #precision_metric.add_batch(references=labels[idx], predictions=pred_batch)
            #recall_metric.add_batch(references=labels[idx], predictions=pred_batch)
            #f1_metric.add_batch(references=labels[idx], predictions=pred_batch)
        
        # Calculate ROC
        for batch_idx, pred_batch in enumerate(predictions_sm):
            predictions_roc_all.extend(pred_batch.tolist())
            references_roc_all.extend(labels[batch_idx])
            #roc_auc_metric.add_batch(references=labels[batch_idx], prediction_scores = pred_batch.tolist())
            break
        break

    numberOfBatches = len(loop)
    # List all possible labels
    labels = np.arange(tokenizer.vocab_size)
    with open(f"./logs/Results_{config.tokenizer}_E{config.epochs}_batches{config.batch_size}_LR{config.learning_rate}_SL{config.sequence_length}", "w") as output:
        print(f"Results: {config.tokenizer}, Train={config.train_model} {config.tokenizer}_E{config.epochs}_batches{config.batch_size}_LR{config.learning_rate}_SL{config.sequence_length}", file = output)
        output.write("macro averaging\n")
        output.write(str(recall_metric.compute(references = references_all, predictions = predictions_all, average = 'macro')))
        output.write("\n")
        output.write(str(precision_metric.compute(references = references_all, predictions = predictions_all, average = 'macro', zero_division = 0)))
        output.write("\n")
        output.write(str(f1_metric.compute( references = references_all, predictions = predictions_all, average = 'macro')))
        output.write("\n")
        output.write(str(roc_auc_metric.compute( references = references_roc_all, prediction_scores = predictions_roc_all, average = 'macro', multi_class = 'ovo', labels = labels, max_fpr = 1.0)))
        output.write("\n")
        output.write("weighted averaging\n")
        output.write(str(recall_metric.compute( references = references_all, predictions = predictions_all, average = 'weighted')))
        output.write("\n")
        output.write(str(precision_metric.compute( references = references_all, predictions = predictions_all, average = 'weighted', zero_division = 0)))
        output.write("\n")
        output.write(str(f1_metric.compute( references = references_all, predictions = predictions_all, average = 'weighted')))
        output.write("\n")
        output.write(str(roc_auc_metric.compute( references = references_roc_all, prediction_scores = predictions_roc_all, average = 'weighted', multi_class = 'ovo', labels = labels, max_fpr = 1.0)))

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  0%|          | 0/93684 [00:05<?, ?it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [49]:
#confusion_matrix = metrics.confusion_matrix(references_all, predictions_all, labels=labels)
#print(confusion_matrix)
#disp = metrics.ConfusionMatrixDisplay(references_all, predictions_all, labels=labels)
#disp.plot()

TypeError: __init__() got an unexpected keyword argument 'labels'

In [48]:
"""
# precision recall curve
from sklearn.metrics import precision_recall_curve, roc_curve
import matplotlib.pyplot as plt
#%matplotlib inline
precision = dict()
recall = dict()
for i in labels:
    precision[i], recall[i], _ = precision_recall_curve(references_roc_all[i],
                                                        predictions_roc_all[i])
    plt.plot(recall[i], precision[i], lw=2, label='class {}'.format(i))
    break
    
plt.xlabel("recall")
plt.ylabel("precision")
plt.legend(loc="best")
plt.title("precision vs. recall curve")
plt.show()"""

ValueError: Expected array-like (array or non-string sequence), got 101