In [1]:
import os

# Export env vars to limit number of threads to use
num_threads = "26"
os.environ["OMP_NUM_THREADS"] = num_threads 
os.environ["OPENBLAS_NUM_THREADS"] = num_threads
os.environ["MKL_NUM_THREADS"] = num_threads 
os.environ["VECLIB_MAXIMUM_THREADS"] = num_threads
os.environ["NUMEXPR_NUM_THREADS"] = num_threads

# Only use CPU, hide GPU
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [2]:
from transformers import BertTokenizer, BertForMaskedLM
import torch
import torch.nn as nn
#Import SummaryWriter for Tensorboard logging
from torch.utils.tensorboard import SummaryWriter
import evaluate
import numpy as np
from tqdm import tqdm  # for our progress bar
import glob

In [3]:
# Limit no. of threads used by Pytorch
torch.set_num_threads = int(num_threads)

In [4]:
PID = os.getpid()
PGID = os.getpgid(PID)
print(f"PID: {PID}, PGID: {PGID}")

PID: 220, PGID: 64


In [5]:
config = {}
config['train'] = True
config['epochs'] = 3
config['batch_size'] = 128
config['LR'] = 5e-5
config['bert'] = 'bert-base-uncased'


In [6]:
data_path = "./data/original/ud/"
writer = SummaryWriter()
tokenizer = BertTokenizer.from_pretrained(config['bert'])
model = BertForMaskedLM.from_pretrained(config['bert'])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
def createMaskedInputs(inputs):
    """
    creates masked input embeddings and labels from tokenized text

    :param inputs: tokenized text
    :return: masked input embeddings and new column labels 
    """ 
    # Clone input ids (tokens) to create labels
    inputs['labels'] = inputs.input_ids.detach().clone()
    # create random array of floats with equal dimensions to input_ids tensor
    rand = torch.rand(inputs.input_ids.shape)
    # create mask array with 15% masked tokens
    mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
        (inputs.input_ids != 102) * (inputs.input_ids != 0)
    # Select indices of each nonzero (= selected) value as token to be masked
    selection = []

    for i in range(inputs.input_ids.shape[0]):
        selection.append(
            torch.flatten(mask_arr[i].nonzero()).tolist()
        )
    # Mask selected tokens: replace with [MASK] code 103 in tensor
    for i in range(inputs.input_ids.shape[0]):
        inputs.input_ids[i, selection[i]] = 103
    
    return inputs

In [8]:
class SyntransDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [9]:
text = []
for ud_file in glob.iglob(data_path + '**/*-train.txt', recursive=True):

  ud_file = os.path.abspath(ud_file)
  filename = os.path.basename(ud_file)
  print(filename, flush = True)
  # Load train data
  with open(ud_file, 'r') as fp:
    text.extend(fp.read().split('\n'))

en_gum-ud-train.txt
en_lines-ud-train.txt
en_atis-ud-train.txt
en_partut-ud-train.txt
en_esl-ud-train.txt
en_ewt-ud-train.txt
en_gumreddit-ud-train.txt


In [10]:
# Get max sentence length
max_length = 0
for sentence in text:
    length = len(sentence)
    if(length > max_length):
        max_length = length


In [11]:
# Cap max sequence length at 512
if(max_length > 512):
    max_length = 512
inputs = tokenizer(text, return_tensors='pt', max_length=max_length, truncation=True, padding='max_length')
inputs = createMaskedInputs(inputs)

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length.

In [None]:
dataset = SyntransDataset(inputs)
loader = torch.utils.data.DataLoader(dataset, batch_size=config['batch_size'], shuffle=True)


In [12]:
device =  torch.device('cpu')
# and move our model over to the selected device
model.to(device)
# activate training mode
model.train()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [13]:
from torch.optim import AdamW
# initialize optimizer
optim = AdamW(model.parameters(), lr=config['LR'])

In [None]:
if (config['train']):
    print("Training model", flush=True)
    epochs = config['epochs']
    writer.add_scalar("LR", config['LR'])
    writer.add_scalar("Batchsize", config['batch_size'])

    for epoch in range(epochs):
        scalar_loss = 0
        # setup loop with TQDM and dataloader
        loop = tqdm(loader, leave=True, mininterval=40,maxinterval=120)
        for batch in loop:
            # initialize calculated gradients (from prev step)
            optim.zero_grad()
            # pull all tensor batches required for training
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            # process
            outputs = model(input_ids, attention_mask=attention_mask,
                            labels=labels)
            # extract loss
            loss = outputs.loss
            # calculate loss for every parameter that needs grad update
            loss.backward()
            # update parameters
            optim.step()
            # print relevant info to progress bar
            loop.set_description(f'Epoch {epoch}')
            scalar_loss = loss.item()
            loop.set_postfix(loss=scalar_loss)
        # Print info to Tensorboard
        writer.add_scalar("Loss/train", scalar_loss, epoch)
        # Save model after each epoch
        model.save_pretrained(save_directory=f"./trained_models/E{epoch}_{config['bert']}_batches{config['batch_size']}_LR{config['LR']}_SL{max_length}/")
writer.close()

# Model evaluation

In [21]:
print("Model evaluation\n", flush = True)
text = []
# Read test files
for ud_file in glob.iglob(data_path + '**/*-test.txt', recursive=True):

    ud_file = os.path.abspath(ud_file)
    filename = os.path.basename(ud_file)
    print(filename, flush = True)
    # Load test data
    with open(ud_file, 'r') as fp:
        text.extend(fp.read().split('\n'))

Model evaluation



In [None]:
inputsTest = tokenizer(text, return_tensors='pt', max_length=max_length, truncation=True, padding='max_length')
inputsTest = createMaskedInputs(inputsTest)

In [None]:
datasetTest = SyntransDataset(inputsTest)
loader = torch.utils.data.DataLoader(datasetTest, batch_size=config['batch_size'], shuffle=False)

In [None]:
model.eval()

with torch.no_grad():
    # Setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True, mininterval=20,maxinterval=120)
    for batch in loop:
        # Pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device).tolist()

        softmax = nn.Softmax(dim = 2)
        predictions = model(input_ids)
        predictions = predictions['logits']
        # predictions_sm = softmax(predictions)
        # Get index of argmax
        # y = np.argmax(predictions_sm, axis = -1)
        # y = y.tolist()
        y = torch.topk(predictions, k=1, dim = 2)[1].squeeze()
        y = y.tolist()

#        words_pred = []
#        words_true = []
#        for w in y:
#            words_pred.append( tokenizer.convert_ids_to_tokens(w))
        
#        for w in labels[0]:
#            words_true.append( tokenizer.convert_ids_to_tokens(w))
        
        #print(words_pred[0:50])
        #print(words_true[0:50])
            

        recall_metric = evaluate.load('recall')
        precision_metric = evaluate.load('precision')
        f1_metric = evaluate.load('f1')
        roc_auc_metric = evaluate.load("roc_auc", "multiclass")

        #print(np.shape(np.array(y)))
        #Sprint(np.shape(np.array(labels)))

        for idx, pred_batch in enumerate(y):

            precision_metric.add_batch(references=labels[idx], predictions=pred_batch)
            recall_metric.add_batch(references=labels[idx], predictions=pred_batch)
            f1_metric.add_batch(references=labels[idx], predictions=pred_batch)
            roc_auc_metric.add_batch(references=labels[idx], predictions=pred_batch)

    numberOfBatches = len(loop)
    print(f"Results: {config['bert']}, Train={config['train']}")
    print("macro averaging")
    print(recall_metric.compute( average = 'macro'))
    print(precision_metric.compute( average = 'macro'))
    print(f1_metric.compute( average = 'macro'))
    print(roc_auc_metric.compute( average = 'macro'))
    print("weighted averaging")
    print(recall_metric.compute( average = 'weighted'))
    print(precision_metric.compute( average = 'weighted'))
    print(f1_metric.compute( average = 'weighted'))
    print(roc_auc_metric.compute( average = 'weighted'))