# Fine-tuning and Evaluation of BERT-based Models

## Instalando pacotes

In [None]:
!pip install --quiet -U transformers datasets seqeval evaluate accelerate

## Importando pacotes

In [None]:
import json

from datasets import load_from_disk
import evaluate
import numpy as np

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification

## Carregando Dados

In [None]:
# Base directory used to build the paths of the label file and the model folder.
root_folder = './'

# Checkpoint to fine-tune, plus a short nickname used in folder and file names.
# Bertimbau
model_name, model_nickname = "neuralmind/bert-base-portuguese-cased", "bertimbau"

# mBERT (alternative checkpoint; uncomment to switch)
#model_name, model_nickname = "bert-base-multilingual-cased", "mbert"

# Working folder of the selected model.
model_folder = "/".join([root_folder, model_nickname])

print(model_name, model_folder, sep="\n")

In [None]:
def keys_to_int(x):
    """Return a new dict with every key of ``x`` converted through ``int``.

    Values are kept as they are. ``int(key)`` raises ``ValueError`` (or
    ``TypeError``) for a key that cannot be converted.
    """
    converted = {}
    for key, value in x.items():
        converted[int(key)] = value
    return converted

# Load the id -> label table from labels.json. JSON object keys are always
# strings, so `keys_to_int` is used as object hook to turn them back into ints.
# The hook runs for every JSON object in the file, so this assumes labels.json
# is a single flat object whose keys are all integer-like.
# The encoding is given explicitly instead of relying on the platform default.
with open(f"{root_folder}/labels.json", "r", encoding="utf-8") as f:
    int2label = json.load(f, object_hook=keys_to_int)

# Inverse table: label -> id.
label2int = {label: index for index, label in int2label.items()}

print(int2label)
print(label2int)

## Treinando o modelo

In [None]:
# Load the dataset stored on disk inside the model folder; the directory name
# embeds the model nickname.
dataset = load_from_disk(f"{model_folder}/dataset-bancos-{model_nickname}")
# Bare expression so the notebook shows the dataset as the cell output.
dataset

In [None]:
# seqeval metric object used by compute_metrics below.
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Compute the overall seqeval scores for one evaluation pass.

    Args:
        p: Object that unpacks into ``(predictions, labels)``. The predicted
            label id of each token is the argmax over axis 2 of
            ``predictions``; positions whose label is ``-100`` are ignored.

    Returns:
        Dict with the keys "precision", "recall", "f1" and "accuracy", filled
        from the corresponding ``overall_*`` entries of the seqeval result.
    """
    scores, gold_ids = p

    # Pick the index with the highest score for each token.
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label (label != -100).
        kept = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        true_predictions.append([int2label[predicted] for predicted, _ in kept])
        true_labels.append([int2label[gold] for _, gold in kept])

    results = metric.compute(
        predictions=true_predictions,
        references=true_labels,
        zero_division=0,
    )

    return {
        name: results[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }

In [None]:
# Imported here because this cell needs them and the notebook has not imported
# them at this point.
import inspect

import torch

batch_size = 16
epochs     = 2

# Number of optimizer steps between log lines: one epoch's worth of batches on a
# single device. Clamped to 1 so a training split smaller than one batch does
# not yield logging_steps == 0.
logging_steps = max(1, dataset['train'].num_rows // batch_size)

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(int2label),
    id2label=int2label,
    label2id=label2int,
)
tokenizer     = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorForTokenClassification(tokenizer)

# The notebook installs transformers with `-U`, and newer releases renamed
# `evaluation_strategy` to `eval_strategy`. Use whichever name the installed
# version accepts.
_training_params = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_kwarg = (
    "eval_strategy" if "eval_strategy" in _training_params else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir                  = f"{model_folder}/results",
    num_train_epochs            = epochs,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size  = batch_size,
    disable_tqdm                = False,
    logging_steps               = logging_steps,
    # fp16 mixed precision needs a CUDA device; enable it only when one is
    # available so the cell also runs on CPU-only machines.
    fp16                        = torch.cuda.is_available(),
    save_total_limit            = 3,
    **{eval_strategy_kwarg: "epoch"},
)

# Same idea for Trainer: newer releases take the tokenizer as
# `processing_class`, older ones as `tokenizer`.
_trainer_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model           = model,
    args            = training_args,
    train_dataset   = dataset["train"],
    eval_dataset    = dataset["validation"],
    data_collator   = data_collator,
    compute_metrics = compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)

# Fine-tune the model.
trainer.train()


In [None]:
# Save the fine-tuned model under the model folder; the evaluation section
# reloads it from this same path.
trainer.save_model(f"{model_folder}/financial_ner_{model_nickname}/model")

## Avaliando o modelo com o conjunto de teste

In [None]:
import pandas as pd
import torch
import pickle

# Use the first CUDA GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
# Bare expression so the notebook shows the selected device.
device

In [None]:
# Reload the dataset from disk so the evaluation section can run without
# re-running the training cells.
dataset = load_from_disk(f"{model_folder}/dataset-bancos-{model_nickname}")

In [None]:
# Location of the fine-tuned checkpoint (same path the training section saves to).
model_path = model_folder + f"/financial_ner_{model_nickname}/model"

tokenizer = AutoTokenizer.from_pretrained(model_path, model_max_length=512)

# Load the weights, then move the model to the selected device.
model = AutoModelForTokenClassification.from_pretrained(model_path)
model = model.to(device)

# Switch to evaluation mode; as the last expression of the cell, the returned
# module is also shown as the cell output.
model.eval()


In [None]:
# Collator built on the reloaded tokenizer; used below to pad the test examples.
data_collator = DataCollatorForTokenClassification(tokenizer)
# Bare expression so the notebook shows the collator settings.
data_collator

In [None]:
from transformers import PretrainedConfig

# Load the configuration stored with the fine-tuned checkpoint.
# NOTE(review): `config` is only displayed here; no later cell in this notebook reads it.
config = PretrainedConfig.from_pretrained(model_path)

# Bare expression so the notebook shows the configuration.
config

In [None]:
# Test split without the 'texts' and 'tokens' columns.
# Presumably these are raw-text columns the collator cannot pad - TODO confirm.
dataset_test = dataset['test'].remove_columns(['texts', 'tokens'])

In [None]:
%%time

dataset_test = data_collator.torch_call(dataset_test)

predictions = []
batch_size = 16
len_dataset = len(dataset_test['input_ids'])
for i in range(0, len_dataset, batch_size):
  print(f"\r{len(predictions)}/{len_dataset}", end="")

  input_ids      = dataset_test['input_ids'][i : i + batch_size].to(device)
  attention_mask = dataset_test['attention_mask'][i : i + batch_size].to(device)

  res = model(input_ids,
             attention_mask=attention_mask)[0].argmax(dim=2)

  predictions.extend(res.tolist())


In [None]:
# Save the predictions in pickle format.
with open(f"{model_folder}/predictions-{model_nickname}.pkl", 'wb') as fp:
    pickle.dump(predictions, fp)

### Cálculo das métricas

In [None]:
# Load the saved predictions.
# NOTE: pickle must only be used on trusted files; this one is written by the cell above.
with open (f"{model_folder}/predictions-{model_nickname}.pkl", 'rb') as fp:
    predictions = pickle.load(fp)

In [None]:
%%time
ner_tags  = dataset_test['labels']

refs = []
for tags in ner_tags.tolist():
    refs.append( [ int2label[tag] for tag in tags if tag != -100 ] )

preds = []
for i in range(len(ner_tags)):
    predis = []
    for tag, pred in zip(ner_tags[i], predictions[i]):
        if tag != -100:
            predis.append(int2label[pred])

    preds.append( predis )

metric = evaluate.load("seqeval")

results = metric.compute(predictions=preds, references=refs, zero_division=0)

results

In [None]:
from seqeval.metrics import classification_report

# seqeval expects classification_report(y_true, y_pred, ...): references come
# first. Passing them the other way round swaps precision and recall per class.
print(classification_report(refs, preds, digits=4, zero_division=0))

### Análise manual das classificações

In [None]:
test = dataset['test']

# Index of the test example to inspect (156 is the alternative left by the author).
i = 1497 #156

# Raw fields of the selected example.
for caption, column in (
    ('text   \t - ', 'texts'),
    ('tokens \t - ', 'tokens'),
    ('labels_ids \t - ', 'labels'),
):
    print(caption, test[column][i])

# Gold labels as strings, skipping the ignored positions (label id -100).
y_true = []
for tag in test["labels"][i]:
    if tag == -100:
        continue
    y_true.append(int2label[tag])
print('labels \t -', y_true)

# Predicted labels for the same example, as built by the metrics cell.
y_pred = preds[i]
print('pred   \t -', y_pred)

subwords = tokenizer.convert_ids_to_tokens(test['input_ids'][i], skip_special_tokens=True)
print('subwords \t -', subwords)

# One row per sequence so subwords, gold labels and predictions line up by column.
row_names = ["subwords", "y_true", "y_pred"]
df = pd.DataFrame([subwords, y_true, y_pred], index=row_names)

# Show the columns labelled 20 and onward.
display(df.loc[ : , 20:])