# T5-based models Fine-tuning and Evaluation

## Instalando pacotes

In [None]:
!pip install --quiet transformers sentencepiece datasets seqeval evaluate accelerate -U

## Importando pacotes

In [None]:
import json

from datasets import load_from_disk
import evaluate
import numpy as np

from transformers import AutoTokenizer, T5ForConditionalGeneration, MT5Tokenizer, MT5ForConditionalGeneration
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import DataCollatorForSeq2Seq

# seqeval metric object, loaded once and shared by compute_metrics (validation
# during training) and by the test-set evaluation cell further down.
metric = evaluate.load("seqeval")

## Carregando Dados

In [None]:
# Base locations: every path used by this notebook is built from root_folder.
root_folder = './'
data_folder = f'{root_folder}/datasets/'

# Checkpoint selection: the last (model_name, model_nickname) assignment wins.
# PTT5 (active)
model_name, model_nickname = "unicamp-dl/ptt5-base-portuguese-vocab", "ptt5"

# mT5 (uncomment both lines to switch)
#model_name     = "google/mt5-small"
#model_nickname = "mt5"

# Per-model working folder: dataset, training results and predictions live here.
model_folder = f"{root_folder}/{model_nickname}"

# Echo the active configuration, one value per line.
print(model_name, model_folder, sep="\n")

In [None]:

def get_ent_label(model_output):
    """Convert a bracket-annotated sentence into a list of BIO tags.

    The text is split on whitespace and every token is mapped as follows:

    * a token containing an annotation such as ``[entity|LABEL]`` yields
      ``B-LABEL`` plus one ``I-LABEL`` for each additional ``_``-separated
      piece of the entity (``[a_b_c|X]`` -> ``B-X, I-X, I-X``);
    * a token that is exactly ``[``, ``|`` or ``]`` (a delimiter detached from
      its annotation by spaces) produces no tag at all;
    * any other token yields ``O``.

    Args:
        model_output: Annotated text, either a gold target or a generated
            sentence.

    Returns:
        A list of tag strings in token order.
    """
    # Delimiters must be filtered before the "|" test: a lone "|" contains a
    # pipe and would otherwise be parsed as an entity with an empty label.
    delimiters = ('[', '|', ']')

    tags = []
    for token in model_output.split():
        if token in delimiters:
            continue

        if "|" not in token:
            tags.append("O")
            continue

        parts = token.split("|")
        entity_pieces = parts[0].replace('[', '').split('_')
        # Anything after the closing bracket (e.g. punctuation) is dropped.
        label = parts[1].split(']')[0]

        tags.append("B-" + label)
        tags.extend(["I-" + label] * (len(entity_pieces) - 1))

    return tags


#text = 'Vemos que o saldo dessa [carteira|CARTEIRA] 123 [carteira_de_crédito|CARTEIRA] em março de 2014 é bastante superior ao dos demais bancos, permitindo ao BB encerrar o período com [27,1%|PERCENTUAL]? de participação de mercado.'
#get_ent_label(text)

## Treinando o modelo

In [None]:
# Pick the model/tokenizer classes for the configured checkpoint family:
# mT5 has dedicated classes, everything else goes through the T5 model class
# and AutoTokenizer.
if model_nickname == 'mt5':
    model_cls, tokenizer_cls = MT5ForConditionalGeneration, MT5Tokenizer
else:
    model_cls, tokenizer_cls = T5ForConditionalGeneration, AutoTokenizer

# Both families are loaded with the same arguments.
model     = model_cls.from_pretrained(model_name, max_length=512, return_dict = True)
tokenizer = tokenizer_cls.from_pretrained(model_name, legacy=False)

# Collator handed to the trainer to assemble batches.
data_collator = DataCollatorForSeq2Seq(tokenizer)

In [None]:
# Load the dataset stored on disk for the selected model; later cells use its
# 'train', 'validation' and 'test' splits.
dataset = load_from_disk(f"{model_folder}/dataset-bancos-{model_nickname}")
# Bare expression: shows the dataset summary as the cell output.
dataset

In [None]:
def _decode_in_chunks(token_ids, chunk_size=4):
    """Decode rows of token ids to text, ``chunk_size`` rows per tokenizer call."""
    texts = []
    for start in range(0, len(token_ids), chunk_size):
        chunk = token_ids[start:start + chunk_size]
        texts.extend(tokenizer.batch_decode(chunk, skip_special_tokens=True))
    return texts


def _equalize_tag_lengths(refs, preds):
    """Right-pad the shorter tag list of every (ref, pred) pair with "O", in place."""
    for idx in range(len(refs)):
        gap = len(refs[idx]) - len(preds[idx])
        if gap > 0:
            preds[idx] = preds[idx] + ['O'] * gap
        elif gap < 0:
            refs[idx] = refs[idx] + ['O'] * (-gap)


def compute_metrics(p):
    """Compute seqeval scores from generated and reference token ids.

    Both id matrices are decoded back to annotated text, converted to BIO tag
    lists with ``get_ent_label`` and scored with the module-level ``metric``.

    Args:
        p: The ``(predictions, labels)`` pair handed over by the trainer.
            ``predictions`` may itself be a tuple, in which case its first
            element is used.

    Returns:
        A dict with the overall ``precision``, ``recall``, ``f1`` and
        ``accuracy`` reported by seqeval.
    """
    predictions, labels = p
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Negative ids (the ignore index used for padding) cannot be decoded.
    # Replace them on a copy so the caller's arrays are left untouched, and do
    # it for the predictions as well, which may be padded the same way.
    pad_id = tokenizer.pad_token_id
    labels = np.where(np.asarray(labels) < 0, pad_id, labels)
    predictions = np.where(np.asarray(predictions) < 0, pad_id, predictions)

    refs = [get_ent_label(text) for text in _decode_in_chunks(labels)]
    preds = [get_ent_label(text) for text in _decode_in_chunks(predictions)]

    # seqeval requires each reference/prediction pair to have the same length.
    _equalize_tag_lengths(refs, preds)

    results = metric.compute(predictions=preds, references=refs, zero_division=0)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


In [None]:
# Training hyper-parameters. These are module-level names; later cells
# reassign batch_size for inference.
batch_size = 8
# Number of full batches in the training split, used as the logging interval.
logging_steps = dataset['train'].num_rows // batch_size
epochs = 2

# NOTE(review): `evaluation_strategy` (and `tokenizer=` on the trainer below)
# have reportedly been renamed in recent transformers releases — confirm
# against the installed version, since the install cell upgrades with -U.
training_args = Seq2SeqTrainingArguments(
    output_dir                  = f"{model_folder}/results",
    num_train_epochs            = epochs,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size  = batch_size,
    evaluation_strategy         = "epoch",
    disable_tqdm                = False,
    logging_steps               = logging_steps,
    # Mixed precision is enabled only for the PTT5 configuration.
    fp16                        = (model_nickname == 'ptt5'),
    # compute_metrics decodes token ids, so evaluation must produce generated
    # sequences.
    predict_with_generate       = True,
    save_total_limit            = 3,
    # mT5 is trained with a much larger learning rate than the default choice.
    learning_rate               = 0.001 if model_nickname == 'mt5' else 5e-5
)


# The "text" and "target" columns are dropped before the splits are handed to
# the trainer; they are only used for manual inspection later on.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'].remove_columns(["text","target"]), 
    eval_dataset=dataset['validation'].remove_columns(["text","target"]),
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Runs fine-tuning; validation metrics are computed once per epoch.
trainer.train()


In [None]:
# Persist the fine-tuned model; the evaluation section reloads both the model
# and the tokenizer from this same folder.
trainer.save_model(f"{model_folder}/financial_ner_{model_nickname}/model")

## Avaliando modelo com conjunto de Teste

In [None]:
import pandas as pd
import torch
import pickle

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Bare expression: shows the selected device as the cell output.
device

'cuda:0'

In [None]:
# Folder the fine-tuned model was saved to at the end of training.
saved_model_dir = f"{model_folder}/financial_ner_{model_nickname}/model"

# Same class selection as for training: dedicated classes for mT5, the T5
# model class plus AutoTokenizer otherwise.
if model_nickname == 'mt5':
    model_cls, tokenizer_cls = MT5ForConditionalGeneration, MT5Tokenizer
else:
    model_cls, tokenizer_cls = T5ForConditionalGeneration, AutoTokenizer

model     = model_cls.from_pretrained(saved_model_dir).to(device)
tokenizer = tokenizer_cls.from_pretrained(saved_model_dir, model_max_length=512)

# Switch to evaluation mode before running inference.
model.eval()


In [None]:
# Reload the dataset from disk so the evaluation section can run without
# re-executing the training cells.
dataset = load_from_disk(f"{model_folder}/dataset-bancos-{model_nickname}")
# Bare expression: shows the dataset summary as the cell output.
dataset

In [None]:
%%time
# Pad the whole test split so every example has the same length and any slice
# of it can be turned into a rectangular tensor.
dataset_test = dataset['test'].remove_columns(['text', 'target'])
dataset_test = tokenizer.pad(dataset_test.to_dict())

# Generated token ids, one list per test example, in dataset order.
ans = []
batch_size = 16
len_dataset = len(dataset_test['input_ids'])
for i in range(0, len_dataset, batch_size):
  # Single-line progress indicator (carriage return rewrites the same line).
  print(f"\r{i}/{len_dataset} ", end="")

  # Named batch_input_ids rather than `input`, which would shadow the builtin
  # for the rest of the notebook session.
  batch_input_ids = torch.as_tensor(dataset_test['input_ids'][i : i + batch_size]).to(device)
  # The generated tensor is converted to plain lists right away, so it does
  # not need to be moved to any device first.
  res = model.generate(batch_input_ids, max_length=512, num_beams=2)

  ans.extend(res.tolist())

# Sanity check shown as the cell output: both numbers should match.
len_dataset, len(ans)

In [None]:
# Cache the generated token ids on disk so the metrics cells can be re-run
# without repeating generation.
with open(f"{model_folder}/predictions-{model_nickname}.pkl", 'wb') as fp:
    pickle.dump(ans, fp)

### Cálculo das métricas

In [None]:
# Load the saved predictions.
# NOTE: pickle can execute arbitrary code on load; only open files written by
# this notebook.
with open (f"{model_folder}/predictions-{model_nickname}.pkl", 'rb') as fp:
    ans = pickle.load(fp)

In [None]:
%%time
from seqeval.metrics import classification_report

batch_size = 16

# Decode the gold label ids back to annotated text, then to BIO tag lists.
refs = []
for i in range(0, len(dataset_test['labels']), batch_size):
  refs.extend(tokenizer.batch_decode(dataset_test['labels'][ i : i + batch_size], skip_special_tokens=True))

refs = [ get_ent_label(t) for t in refs ]


# Same conversion for the generated sequences.
predictions = []
for i in range(0, len(ans), batch_size):
  predictions.extend(tokenizer.batch_decode(ans[ i : i + batch_size], skip_special_tokens=True))

predictions = [ get_ent_label(t) for t in predictions ]


# Pad with "O" so that each reference/prediction pair has the same length.
for idx in range(len(refs)):
  gap = len(refs[idx]) - len(predictions[idx])

  if gap > 0:
    predictions[idx] = predictions[idx] + ['O'] * gap
  elif gap < 0:
    refs[idx] = refs[idx] + ['O'] * (-gap)

# classification_report expects (y_true, y_pred). Passing the predictions first
# swaps precision with recall and reports support over predicted entities.
print(classification_report(refs, predictions, digits=4, zero_division=0))


In [None]:
# Same tag lists scored with the module-level seqeval metric; the returned
# results are shown as the cell output.
metric.compute(predictions=predictions, references=refs, zero_division=0)


### Análise manual das sentenças geradas

In [None]:
# Raw test split, still carrying the 'text' and 'target' columns.
test = dataset['test']

# NOTE(review): `df_fp` is not defined in any visible cell of this notebook.
# It is presumably a DataFrame whose 'index' column holds positions of test
# examples to inspect — confirm and define it before running this cell.
for i, row in df_fp[:2].iterrows():
  print(i)
  # Source sentence and gold annotated sentence for the selected example.
  print('text     \t - ',test['text'][row['index']])
  print('target   \t - ',test['target'][row['index']])
  # Gold tag list followed by the predicted tag list for the same example.
  print(refs[row['index']])
  print(predictions[row['index']])
