In [None]:
import os
# Select the number of threads each numeric backend may use. This runs in the
# first cell, ahead of the numpy/torch imports below (presumably so the
# backends see the values when they are loaded).
os.environ.update({
    "OMP_NUM_THREADS": "48",         # shell equivalent: export OMP_NUM_THREADS=48
    "OPENBLAS_NUM_THREADS": "48",    # shell equivalent: export OPENBLAS_NUM_THREADS=48
    "MKL_NUM_THREADS": "48",         # shell equivalent: export MKL_NUM_THREADS=48
    "VECLIB_MAXIMUM_THREADS": "48",  # shell equivalent: export VECLIB_MAXIMUM_THREADS=48
    "NUMEXPR_NUM_THREADS": "48",     # shell equivalent: export NUMEXPR_NUM_THREADS=48
})

In [None]:
from transformers import BertTokenizer, BertForMaskedLM
import torch
import torch.nn as nn
import evaluate
import numpy as np
from tqdm import tqdm  # for our progress bar
import glob

In [None]:
# Record and report the kernel's process id and its process-group id.
PID, PGID = os.getpid(), os.getpgid(os.getpid())
print(f"PID: {PID}, PGID: {PGID}")

In [None]:
# Run configuration:
#   train  - when True the training cell fine-tunes the model; when False it is skipped
#   epochs - number of passes over the training loader
#   bert   - checkpoint name passed to from_pretrained for both tokenizer and model
config = {
    'train': False,
    'epochs': 5,
    'bert': 'bert-base-uncased',
}

In [None]:
# Directory that the data-loading cells search recursively for *-train.txt / *-test.txt files.
data_path = "./data/original/ud/UD_English_EWT/"
# Tokenizer and masked-LM model are both loaded from the checkpoint named in
# config['bert'] (tokenizer first, then the model).
tokenizer, model = (
    loader_cls.from_pretrained(config['bert'])
    for loader_cls in (BertTokenizer, BertForMaskedLM)
)

In [None]:
def createMaskedInputs(inputs, mask_prob=0.15, mask_token_id=103,
                       special_token_ids=(101, 102, 0)):
    """
    Turn a tokenized batch into a masked-language-modelling batch, in place.

    A copy of the original token ids is stored under ``inputs['labels']``;
    afterwards a random subset of the positions in ``inputs.input_ids`` is
    overwritten with ``mask_token_id``. Positions holding one of
    ``special_token_ids`` are never selected.

    :param inputs: tokenized text; must expose the id tensor as
        ``inputs.input_ids`` and accept ``inputs['labels'] = ...``
        (the object is modified in place)
    :param mask_prob: probability with which each eligible position is masked
    :param mask_token_id: id written into the selected positions
        (default 103, the value this notebook uses as the [MASK] code)
    :param special_token_ids: ids that must never be masked
        (defaults 101, 102 and 0 -- presumably [CLS], [SEP] and [PAD] of the
        BERT vocabulary; confirm when switching to another tokenizer)
    :return: the same ``inputs`` object, now with masked ``input_ids`` and a
        new ``labels`` entry
    """
    token_ids = inputs.input_ids
    # Labels are the untouched token ids, cloned before any position is masked.
    inputs['labels'] = token_ids.detach().clone()
    # One uniform random draw per position, created on the same device as the
    # ids so the comparison below also works for non-CPU tensors.
    rand = torch.rand(token_ids.shape, device=token_ids.device)
    # A position is selected when its draw falls below mask_prob ...
    mask_arr = rand < mask_prob
    # ... and it does not hold one of the protected special tokens.
    for special_id in special_token_ids:
        mask_arr &= token_ids != special_id
    # Boolean-mask assignment overwrites every selected position at once
    # (replaces the former per-row index lists).
    token_ids[mask_arr] = mask_token_id

    return inputs

In [None]:
class SyntransDataset(torch.utils.data.Dataset):
    """
    Map-style dataset over a tokenizer encoding.

    ``encodings`` is a mapping from field name (it must contain
    ``'input_ids'``) to a per-example sequence or tensor. Item ``idx`` is a
    dict with the same keys, each holding example ``idx`` as a tensor.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {}
        for key, val in self.encodings.items():
            row = val[idx]
            if isinstance(row, torch.Tensor):
                # Copy an existing tensor with clone().detach(); wrapping it in
                # torch.tensor() emits a UserWarning for every item.
                item[key] = row.clone().detach()
            else:
                # Plain Python / numpy data still goes through torch.tensor().
                item[key] = torch.tensor(row)
        return item

    def __len__(self):
        # Key lookup works for plain dicts as well as tokenizer encodings.
        return len(self.encodings['input_ids'])

In [None]:
# Collect the training sentences: one list entry per non-blank line of every
# *-train.txt file found (recursively) below data_path.
# `text` must be a list -- a str has no append/extend, so the former
# `text = ""` raised AttributeError on the first file.
text = []
# sorted() makes the file order independent of the filesystem's listing order.
for ud_file in sorted(glob.iglob(data_path + '**/*-train.txt', recursive=True)):
    ud_file = os.path.abspath(ud_file)
    filename = os.path.basename(ud_file)
    print(filename, flush=True)
    # Load training data (files are assumed to be UTF-8 encoded).
    with open(ud_file, 'r', encoding='utf-8') as fp:
        # extend() keeps `text` a flat list of strings, the batch format handed
        # to the tokenizer in the next cell; blank lines (including the empty
        # string left after the final newline) carry no tokens and are skipped.
        text.extend(line for line in fp.read().split('\n') if line.strip())

In [None]:
# Tokenize the training text into PyTorch tensors, truncated/padded to exactly
# 512 tokens per entry, and hand the encoding straight to createMaskedInputs,
# which adds the labels and masks a random subset of the tokens.
inputs = createMaskedInputs(
    tokenizer(
        text,
        return_tensors='pt',
        max_length=512,
        truncation=True,
        padding='max_length',
    )
)

In [None]:
# Wrap the masked training encodings in a dataset and serve them in shuffled
# mini-batches of 64 examples.
dataset = SyntransDataset(inputs)
loader = torch.utils.data.DataLoader(
    dataset=dataset,
    shuffle=True,
    batch_size=64,
)


In [None]:
# All computation in this notebook runs on the CPU.
device = torch.device('cpu')
# Move the model to the selected device and activate training mode; both calls
# return the module, so they can be chained.
model.to(device).train()

In [None]:
# Initialize the optimizer.
# transformers.AdamW is deprecated and no longer shipped by recent transformers
# releases, so the PyTorch implementation (torch is imported at the top of the
# notebook) is used instead. eps and weight_decay are set explicitly to the
# defaults of the transformers class, because torch.optim.AdamW would otherwise
# default to eps=1e-8 and weight_decay=0.01.
optim = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

In [None]:
# Fine-tune the masked-LM on the training loader (only when config['train'] is set).
# NOTE: the evaluation section rebinds `loader` to the test data, so this cell
# must be run before that section, never after it.
if config['train']:
    print("Training model", flush=True)
    epochs = config['epochs']
    # Re-assert training mode so the cell still behaves correctly when it is
    # re-run after the evaluation cell has called model.eval().
    model.train()

    for epoch in range(epochs):
        # setup loop with TQDM and dataloader
        loop = tqdm(loader, leave=True)
        # the description is constant for the whole epoch, so set it once
        loop.set_description(f'Epoch {epoch}')
        for batch in loop:
            # reset the gradients accumulated in the previous step
            optim.zero_grad()
            # pull all tensor batches required for training
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            # forward pass; passing labels makes the model return the loss
            outputs = model(input_ids, attention_mask=attention_mask,
                            labels=labels)
            # extract loss
            loss = outputs.loss
            # back-propagate to get the gradient of every trainable parameter
            loss.backward()
            # update parameters
            optim.step()
            # show the current loss in the progress bar
            loop.set_postfix(loss=loss.item())

    # Save the fine-tuned weights.
    # torch.jit.script is not a supported export path for Hugging Face models
    # (their TorchScript export relies on tracing), so the former scripting
    # call could fail only after the whole training run had finished. The
    # state dict is stored instead, under the same file name; reload it with
    # model.load_state_dict(torch.load(path)).
    save_path = f"./trainedModels/model_E{config['epochs']}_{config['bert']}.pt"
    # Create the target directory first; saving into a missing directory fails.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    torch.save(model.state_dict(), save_path)
        


# Model evaluation

In [None]:
#################### Multiprocessing

'''    print("Model evaluation\n", flush = True)

    for ud_file in glob.iglob(data_path + '**/*-test.txt', recursive=True):

        ud_file = os.path.abspath(ud_file)
        filename = os.path.basename(ud_file)
        print(filename, flush = True)
        # Load test data
        with open(ud_file, 'r') as fp:
            text.append(fp.read().split('\n'))'''



In [None]:
print("Model evaluation\n", flush = True)
# Collect the test sentences: one list entry per non-blank line of every
# *-test.txt file found (recursively) below data_path.
# `text` must be a list -- a str has no append/extend, so the former
# `text = ""` raised AttributeError on the first file.
text = []
# sorted() makes the file order independent of the filesystem's listing order.
for ud_file in sorted(glob.iglob(data_path + '**/*-test.txt', recursive=True)):
    ud_file = os.path.abspath(ud_file)
    filename = os.path.basename(ud_file)
    print(filename, flush=True)
    # Load test data (files are assumed to be UTF-8 encoded).
    with open(ud_file, 'r', encoding='utf-8') as fp:
        # extend() keeps `text` a flat list of strings, the batch format handed
        # to the tokenizer in the next cell; blank lines (including the empty
        # string left after the final newline) carry no tokens and are skipped.
        text.extend(line for line in fp.read().split('\n') if line.strip())

In [None]:
# Tokenize the test text into PyTorch tensors, truncated/padded to exactly 512
# tokens per entry.
inputsTest = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
# Mask the *test* encodings. This previously passed `inputs` (the training
# encodings), which masked the training data a second time and discarded the
# freshly tokenized test set.
inputsTest = createMaskedInputs(inputsTest)

In [None]:
# Wrap the masked test encodings in a dataset and serve them in order, 64
# examples per batch. Note that this rebinds `loader`, which held the training
# data until now.
datasetTest = SyntransDataset(inputsTest)
loader = torch.utils.data.DataLoader(
    dataset=datasetTest,
    shuffle=False,
    batch_size=64,
)

In [None]:
# Score the model's predictions for the masked tokens of the test loader.
results = {}
results['recall'] = 0
results['precision'] = 0
results['f1'] = 0
model.eval()

# Load every metric exactly once, before the loop. Re-loading them per batch
# (as before) discarded everything accumulated so far, so the final numbers
# only covered the last batch.
recall_metric = evaluate.load('recall')
precision_metric = evaluate.load('precision')
f1_metric = evaluate.load('f1')

# Id that createMaskedInputs writes into the positions it masks.
MASK_TOKEN_ID = 103

with torch.no_grad():
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # pull all tensor batches required for evaluation
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Pass the attention mask so padding positions are ignored by the
        # model, exactly as during training.
        predictions = model(input_ids, attention_mask=attention_mask)['logits']
        # Most likely vocabulary id per position. argmax over the last axis
        # keeps the batch axis even when the final batch holds one example
        # (the former topk(...).squeeze() dropped it).
        y = predictions.argmax(dim=-1)

        # Only the masked positions are scored: every other position is either
        # padding or a token the model could read directly from its input, so
        # including them makes the scores meaningless for a fill-mask task.
        scored = input_ids == MASK_TOKEN_ID
        if not scored.any():
            # nothing was masked in this batch
            continue
        references = labels[scored].tolist()
        predicted = y[scored].tolist()
        for metric in (precision_metric, recall_metric, f1_metric):
            metric.add_batch(references=references, predictions=predicted)

# Micro-averaged scores over all masked test tokens.
recall = recall_metric.compute(average='micro')
precision = precision_metric.compute(average='micro')
f1 = f1_metric.compute(average='micro')
results['recall'] = recall['recall']
results['precision'] = precision['precision']
results['f1'] = f1['f1']

# The heading reflects whether the training cell was enabled for this run.
print(f"Results: {'trained' if config['train'] else 'untrained'} BERT-base")
print(recall)
print(precision)
print(f1)

In [None]:
# from transformers import TrainingArguments

# args = TrainingArguments(
#     output_dir='out',
#     per_device_train_batch_size=4,
#     num_train_epochs=1
# )

In [None]:
# from transformers import Trainer

# trainer = Trainer(
#     model=model,
#     args=args,
#     train_dataset=dataset
# )

In [None]:
# trainer.train()

