# Finetuning tarefa REN com base de dados anotados

Autora: Sabrina dos Passos Tortelli

Tarefa: REN - Reconhecimento de Entidades Nomeadas

In [2]:
# %pip install datasets
# %pip install transformers==4.29.0
# %pip install torch
# %pip install nltk
# %pip install scikit-learn
# %pip install seqeval
# %pip install evaluate
# %pip install matplotlib

In [1]:
task = "ner"

model_checkpoint = "Model-2000lm-TceBr-bert-base-portuguese-cased/model"

In [2]:
import transformers
import datasets
import torch
# import tensorflow

print(transformers.__version__) # 4.29
print(datasets.__version__) # 2.14.5
print(torch.__version__) # 2.1.0+cu121
# print(tensorflow.__version__) # 2.14

from pathlib import Path
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset, load_metric, DatasetInfo, Features, ClassLabel, Sequence, Value
import os
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import random
import re
import evaluate
import json

  from .autonotebook import tqdm as notebook_tqdm


4.34.1
2.14.6
2.1.0+cu121


[nltk_data] Downloading package punkt to /home/sabrina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
torch.cuda.empty_cache()

Carregando os datasets

In [3]:
def substitui_tags(tag):
    match tag:
        case "O":
            return 0
        case "B-AREA":
            return 1
        case "I-AREA":
            return 2
        case "B-PROCURADOR_MP":
            return 3
        case "I-PROCURADOR_MP":
            return 4
        case "B-JULGAMENTO_CONTAS":
            return 5
        case "I-JULGAMENTO_CONTAS":
            return 6
        case "B-MINISTRO":
            return 7
        case "I-MINISTRO":
            return 8
        case "B-MOTIVO_TCE":
            return 9
        case "I-MOTIVO_TCE":
            return 10
        case "B-PROCESSO":
            return 11
        case "I-PROCESSO":
            return 12
        case "B-PROCESSO_VINCULADO":
            return 13
        case "I-PROCESSO_VINCULADO":
            return 14
        case "B-PROPOSTA_ENCAMINHAMENTO":
            return 15
        case "I-PROPOSTA_ENCAMINHAMENTO":
            return 16
        case "B-RECOMENDACAO_PLENARIO":
            return 17
        case "I-RECOMENDACAO_PLENARIO":
            return 18
        case "B-RESPONSAVEL":
            return 19
        case "I-RESPONSAVEL":
            return 20
        case "B-RESPONSAVEL_CARGO":
            return 21
        case "I-RESPONSAVEL_CARGO":
            return 22
        case "B-SANSAO":
            return 23
        case "I-SANSAO":
            return 24
        case "B-TIPO_INSTRUCAO":
            return 25
        case "I-TIPO_INSTRUCAO":
            return 26
        case "B-UN_INSTAURADORA":
            return 27
        case "I-UN_INSTAURADORA":
            return 28
        case "B-UN_JURISDICIONADA":
            return 29
        case "I-UN_JURISDICIONADA":
            return 30
        case "B-VALOR_DANO_ATUALIZADO":
            return 31
        case "I-VALOR_DANO_ATUALIZADO":
            return 32
        case "B-VALOR_DANO_INDICIO":
            return 33
        case "I-VALOR_DANO_INDICIO":
            return 34
        case _:
            return 0
        


In [4]:
ner_tags_features = Sequence(
        ClassLabel(
            names= [
                "O",
                "B-AREA",
                "I-AREA",
                "B-PROCURADOR_MP",
                "I-PROCURADOR_MP",
                "B-JULGAMENTO_CONTAS",
                "I-JULGAMENTO_CONTAS",
                "B-MINISTRO",
                "I-MINISTRO",
                "B-MOTIVO_TCE",
                "I-MOTIVO_TCE",
                "B-PROCESSO",
                "I-PROCESSO",
                "B-PROCESSO_VINCULADO",
                "I-PROCESSO_VINCULADO",
                "B-PROPOSTA_ENCAMINHAMENTO",
                "I-PROPOSTA_ENCAMINHAMENTO",
                "B-RECOMENDACAO_PLENARIO",
                "I-RECOMENDACAO_PLENARIO",
                "B-RESPONSAVEL",
                "I-RESPONSAVEL",
                "B-RESPONSAVEL_CARGO",
                "I-RESPONSAVEL_CARGO",
                "B-SANSAO",
                "I-SANSAO",
                "B-TIPO_INSTRUCAO",
                "I-TIPO_INSTRUCAO",
                "B-UN_INSTAURADORA",
                "I-UN_INSTAURADORA",
                "B-UN_JURISDICIONADA",
                "I-UN_JURISDICIONADA",
                "B-VALOR_DANO_ATUALIZADO",
                "I-VALOR_DANO_ATUALIZADO",
                "B-VALOR_DANO_INDICIO",
                "I-VALOR_DANO_INDICIO",
            ],
        )
    )

In [5]:
def separa_pequeno(palavras, last_setence_length, tokens, tags):
    token_list_conll = tokens[last_setence_length:last_setence_length + len(palavras)]
    tags_list_conll = tags[last_setence_length:last_setence_length + len(palavras)]
    last_setence_length = last_setence_length + len(palavras)
    return token_list_conll, tags_list_conll, last_setence_length

In [6]:
def ler_conll(input_folder):
    list_conll = []
    for diretorio_atual, subdiretorios, arquivos in os.walk(input_folder):
        for arquivo in arquivos:
            # if arquivo == 'admin.conll':
            if arquivo.endswith('_IP.conll') or arquivo.endswith('_AC.conll') or arquivo == 'admin.conll':
                # print("Arquivo lendo CONLL: ", os.path.join(diretorio_atual, arquivo))
                with open(os.path.join(diretorio_atual, arquivo), "r", encoding="utf-8") as f:
                    conll_data = f.readlines()
                    for line in conll_data:
                        if line != "" and line != "\n":
                            splits = line.split(" ")
                            token = re.sub("\n", "", splits[1])
                            list_conll.append((diretorio_atual.split('/')[1], splits[0], token.rstrip()))
    return list_conll

In [7]:
def ler_arquivo_txt(input_folder, list_conll):
    guid = 0
    dados = []
    for diretorio_atual, subdiretorios, arquivos in os.walk(input_folder):
        valores_tokens_arquivo = []
        tokens = []
        tags = []
        for arquivo in arquivos:
            if arquivo.endswith(".txt"):
                nome_diretorio = diretorio_atual.split('/')[1]
                for item in list_conll:
                    if nome_diretorio in item:
                        valores_tokens_arquivo.append((item[1], item[2]))
                        tokens.append(item[1])
                        tags.append(substitui_tags(item[2]))
                with open(os.path.join(diretorio_atual, arquivo), "r", encoding="utf-8") as f:
                    text = f.readlines()
                    last_setence_length = 0
                    for line in text:
                        if line != "" and line != "\n":
                            line = re.sub("\n", "", line)
                            palavras = line.split(" ")
                            setence_length = len(palavras)
                            if setence_length < 50:
                                token_list_conll, tags_list_conll, last_setence_length = separa_pequeno(palavras, last_setence_length, tokens, tags)
                                dados.append((str(guid), token_list_conll, tags_list_conll))
                                guid += 1
                            else:
                                for i in range(0, setence_length, 49):
                                    palavras_slice = palavras[i:i + 49]
                                    token_list_conll, tags_list_conll, last_setence_length = separa_pequeno(palavras_slice, last_setence_length, tokens, tags)
                                    dados.append((str(guid), token_list_conll, tags_list_conll))
                                    guid += 1
    return dados

In [8]:
import pickle

def preparar_dados(input_folder, proporcao_treinamento=0.8):
    dados = []
    list_conll = []

    list_conll = ler_conll(input_folder)
    dados = ler_arquivo_txt(input_folder, list_conll)
    # print(dados)
    random.shuffle(dados)

    # # Escrever o array em um arquivo
    with open('DadosJson-2/anotacao_TESTE.pkl', 'wb') as file:
        pickle.dump(dados, file)

    # Ler o array de volta do arquivo
    with open('DadosJson-2/anotacao.pkl', 'rb') as file:
        dados = pickle.load(file)

    tamanho_treinamento = int(len(dados) * proporcao_treinamento)
    dados_treinamento = dados[:tamanho_treinamento]
    dados_validacao = dados[tamanho_treinamento:]

    data_dict_treinamento = {
        "id": [ex[0] for ex in dados_treinamento],
        "tokens": [ex[1] for ex in dados_treinamento],
        "ner_tags": [ex[2] for ex in dados_treinamento]
    }

    data_dict_validacao = {
        "id": [ex[0] for ex in dados_validacao],
        "tokens": [ex[1] for ex in dados_validacao],
        "ner_tags": [ex[2] for ex in dados_validacao]
    }

    features = Features({
        "id": Value("string"),
        "tokens": Sequence(Value("string")),
        "ner_tags": ner_tags_features
    })

    dataset_info = DatasetInfo(features=features)

    dataset_treinamento = Dataset.from_dict(data_dict_treinamento, info=dataset_info)
    dataset_validacao = Dataset.from_dict(data_dict_validacao, info=dataset_info)

    dataset_dict = DatasetDict({
        "train": dataset_treinamento,
        "validation": dataset_validacao
    })

    return dataset_dict

In [9]:
dados = preparar_dados('DataSetAnotado/annotation')
# dados = preparar_dados('DataSetAnotado/annotation1')
# dados = preparar_dados('DataSetAnotado/annotation2')
# dados = preparar_dados('DataSetAnotado/annotation3')

[('0', ['ACORDAO', 'No', '10397/2021', '-', 'TCU', '-', '2a', 'Camara', '1.'], [0, 0, 0, 0, 0, 0, 0, 0, 0]), ('1', ['Processo:', 'TC-004.897/2016-1.'], [0, 0]), ('2', ['2.'], [0]), ('3', ['Grupo:', 'II;', 'Classe', 'de', 'Assunto:', 'II', '-', 'Tomada', 'de', 'Contas', 'Especial.'], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), ('4', ['3.'], [0]), ('5', ['Responsaveis:', 'Odivar', 'Faco', '(262.322.003-49),', 'Maria', 'Valderez', 'Clemente', 'de', 'Queiroz', '(026.143.163-34)', 'e', 'AP', 'Transportes', 'Ltda.'], [0, 19, 20, 0, 19, 20, 20, 20, 20, 0, 0, 19, 20, 0]), ('6', ['(03.518.143/0001-88).'], [0]), ('7', ['4.'], [0]), ('8', ['Entidade:', 'Municipio', 'de', 'Beberibe/CE.'], [0, 29, 30, 0]), ('9', ['5.'], [0]), ('10', ['Relator:', 'Ministro-Substituto', 'Marcos', 'Bemquerer', 'Costa.'], [0, 0, 7, 8, 0]), ('11', ['6.'], [0]), ('12', ['Representante', 'do', 'Ministerio', 'Publico:', 'Procurador', 'Rodrigo', 'Medeiros', 'de', 'Lima.'], [0, 0, 0, 0, 0, 3, 4, 4, 0]), ('13', ['7.'], [0]), ('14', [

In [10]:
dados

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 6700
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1676
    })
})

In [14]:
dados["train"][50]

{'id': '15920',
 'tokens': ['(peca',
  '114),',
  'em',
  'concordancia',
  'com',
  'o',
  'relatorio',
  'do',
  'tomador',
  'de',
  'contas.',
  'O',
  'certificado',
  'de',
  'auditoria',
  '282/2019',
  'e',
  'o',
  'parecer',
  'do'],
 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [15]:
dados["train"].features[f"ner_tags"]

Sequence(feature=ClassLabel(names=['O', 'B-AREA', 'I-AREA', 'B-PROCURADOR_MP', 'I-PROCURADOR_MP', 'B-JULGAMENTO_CONTAS', 'I-JULGAMENTO_CONTAS', 'B-MINISTRO', 'I-MINISTRO', 'B-MOTIVO_TCE', 'I-MOTIVO_TCE', 'B-PROCESSO', 'I-PROCESSO', 'B-PROCESSO_VINCULADO', 'I-PROCESSO_VINCULADO', 'B-PROPOSTA_ENCAMINHAMENTO', 'I-PROPOSTA_ENCAMINHAMENTO', 'B-RECOMENDACAO_PLENARIO', 'I-RECOMENDACAO_PLENARIO', 'B-RESPONSAVEL', 'I-RESPONSAVEL', 'B-RESPONSAVEL_CARGO', 'I-RESPONSAVEL_CARGO', 'B-SANSAO', 'I-SANSAO', 'B-TIPO_INSTRUCAO', 'I-TIPO_INSTRUCAO', 'B-UN_INSTAURADORA', 'I-UN_INSTAURADORA', 'B-UN_JURISDICIONADA', 'I-UN_JURISDICIONADA', 'B-VALOR_DANO_ATUALIZADO', 'I-VALOR_DANO_ATUALIZADO', 'B-VALOR_DANO_INDICIO', 'I-VALOR_DANO_INDICIO'], id=None), length=-1, id=None)

In [16]:
label_list = dados["train"].features[f"{task}_tags"].feature.names
print(len(label_list))
label_list

35


['O',
 'B-AREA',
 'I-AREA',
 'B-PROCURADOR_MP',
 'I-PROCURADOR_MP',
 'B-JULGAMENTO_CONTAS',
 'I-JULGAMENTO_CONTAS',
 'B-MINISTRO',
 'I-MINISTRO',
 'B-MOTIVO_TCE',
 'I-MOTIVO_TCE',
 'B-PROCESSO',
 'I-PROCESSO',
 'B-PROCESSO_VINCULADO',
 'I-PROCESSO_VINCULADO',
 'B-PROPOSTA_ENCAMINHAMENTO',
 'I-PROPOSTA_ENCAMINHAMENTO',
 'B-RECOMENDACAO_PLENARIO',
 'I-RECOMENDACAO_PLENARIO',
 'B-RESPONSAVEL',
 'I-RESPONSAVEL',
 'B-RESPONSAVEL_CARGO',
 'I-RESPONSAVEL_CARGO',
 'B-SANSAO',
 'I-SANSAO',
 'B-TIPO_INSTRUCAO',
 'I-TIPO_INSTRUCAO',
 'B-UN_INSTAURADORA',
 'I-UN_INSTAURADORA',
 'B-UN_JURISDICIONADA',
 'I-UN_JURISDICIONADA',
 'B-VALOR_DANO_ATUALIZADO',
 'I-VALOR_DANO_ATUALIZADO',
 'B-VALOR_DANO_INDICIO',
 'I-VALOR_DANO_INDICIO']

In [17]:
from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))

In [18]:
show_random_elements(dados["train"])

Unnamed: 0,id,tokens,ner_tags
0,7165,[e],[O]
1,8408,"[do, Regimento, Interno),, o, recolhimento, da, divida, aos, cofres, do, Tesouro, Nacional,, atualizada, monetariamente, desde, a, data, deste, acordao, ate]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
2,10846,"[isso,, tendo, em, mente, que, o, objeto, da, presente, avenca, encontra-se, 100%, executado, (peca, 1,, p., 5, e, 76-78), e, considerando-se,, ademais,, a, sistematica, da, autorizacao, do, saque, dos, recursos, que, ficava, condicionado, ao, atesto, da, execucao, fisica, pela, Caixa]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-UN_INSTAURADORA]"
3,5711,"[que, a, fundamentam,, a, Procuradoria, da, Republica, no, Estado, do, Rio, Grande, do, Sul,, nos, termos, do, 7o, do, art., 209, do, Regimento, Interno, do, TCU,, para, adocao, das, medidas, que, entender, cabiveis,, com, a, informacao, de, que, a, decisao, esta, sujeita, a, Recurso, de, Reconsideracao, previsto, no, art.]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,637,"[das, quantias, abaixo, discriminadas,, com, a, fixacao, do, prazo, de, 15, (quinze), dias,]","[I-SANSAO, I-SANSAO, O, O, O, O, O, O, O, O, O, O, O]"
5,13381,"[9., Acordao:, VISTOS,, relatados, e, discutidos, estes, autos, de, Tomada, de, Contas, Especial, instaurada, pela, Caixa, Economica, Federal,, em, desfavor, da, Sra., Claudia]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-UN_INSTAURADORA, I-UN_INSTAURADORA, I-UN_INSTAURADORA, O, O, O, O, B-RESPONSAVEL]"
6,3175,"[da, impugnacao, total, das, despesas, realizadas]","[O, B-MOTIVO_TCE, I-MOTIVO_TCE, I-MOTIVO_TCE, I-MOTIVO_TCE, I-MOTIVO_TCE]"
7,9309,[de],[O]
8,5321,"[e, considerando-se,, outrossim,, que]","[O, O, O, O]"
9,15499,"[Social, -, AVT, (peca, 8,, p., 79-94),, atestam, a, execucao, integral, do, presente, contrato, de, repasse,, e, atestam,, ademais, disso,, a, existencia, documental, e, fisica, de]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"


Pré-processando os dados

In [19]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [20]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [21]:
tokenized_dataset= dados.map(
    tokenize_and_align_labels,
    batched=True,
    batch_size=670,
    remove_columns=dados["train"].column_names
)

Map:   0%|          | 0/14268 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map:  14%|█▍        | 2010/14268 [00:00<00:01, 9376.08 examples/s]

Map: 100%|██████████| 14268/14268 [00:00<00:00, 17443.61 examples/s]
Map: 100%|██████████| 3567/3567 [00:00<00:00, 21081.72 examples/s]


Fine tuning do modelo

In [22]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [23]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

label_names = dados["train"].features["ner_tags"].feature.names
id2label = {str(i): label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, id2label=id2label, label2id=label2id)

Some weights of the model checkpoint at Model-2000lm-TceBr-bert-base-portuguese-cased/model were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at Model

In [24]:
from torch.backends import cudnn 
print(torch.cuda.is_available())
print(cudnn.is_available())
print(torch.__version__)
print(torch.version.cuda)
print(torch.backends.cudnn.version())
torch.cuda.empty_cache()

True
True
2.1.0+cu121
12.1
8902


In [25]:
model_name = model_checkpoint.split("/")[-2]
print(model_name)

Model-2000lm-TceBr-bert-base-portuguese-cased


In [26]:
# hyperparameters

per_device_batch_size = 8 
gradient_accumulation_steps = 2

learning_rate = 1e-4
num_train_epochs = 5
weight_decay = 0.01

save_total_limit = 3
logging_steps = 290
eval_steps = logging_steps
evaluation_strategy = 'steps'
logging_strategy = 'steps'
save_strategy = 'steps'
save_steps = logging_steps
load_best_model_at_end = True

fp16 = True

# folders
folder_model = 'e' + str(num_train_epochs) + '_lr' + str(learning_rate)
# output_dir = 'ModelNerTceBr/' + 'ner-TceBr-Final-1-' + str(model_name) + '/checkpoints/' + folder_model
# logging_dir = 'ModelNerTceBr/' + 'ner-TceBr-Final-1-' + str(model_name) + '/logs/' + folder_model

# output_dir = 'ModelNerTceBr/' + 'ner-TceBr-Final-2-' + str(model_name) + '/checkpoints/' + folder_model
# logging_dir = 'ModelNerTceBr/' + 'ner-TceBr-Final-2-' + str(model_name) + '/logs/' + folder_model

output_dir = 'ModelNerTceBr/' + 'ner-TceBr-Final-3-' + str(model_name) + '/checkpoints/' + folder_model
logging_dir = 'ModelNerTceBr/' + 'ner-TceBr-Final-3-' + str(model_name) + '/logs/' + folder_model

# get best model through a metric
metric_for_best_model = 'eval_f1'
if metric_for_best_model == 'eval_f1':
    greater_is_better = True
elif metric_for_best_model == 'eval_loss':
    greater_is_better = False  

args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=learning_rate,
    per_device_train_batch_size=per_device_batch_size,
    per_device_eval_batch_size=per_device_batch_size*2,
    gradient_accumulation_steps=gradient_accumulation_steps,
    num_train_epochs=num_train_epochs,
    weight_decay=weight_decay,
    save_total_limit=save_total_limit,
    logging_steps = logging_steps,
    eval_steps = logging_steps,
    load_best_model_at_end = load_best_model_at_end,
    metric_for_best_model = metric_for_best_model,
    greater_is_better = greater_is_better,
    gradient_checkpointing = False,
    do_train = True,
    do_eval = True,
    do_predict = True,
    evaluation_strategy = evaluation_strategy,
    logging_dir=logging_dir, 
    logging_strategy = logging_strategy,
    save_strategy = save_strategy,
    save_steps = save_steps,
    fp16 = fp16,
    push_to_hub=False,
)

In [27]:
seqeval = evaluate.load('seqeval')

In [28]:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [29]:
from transformers.trainer_callback import EarlyStoppingCallback

# wait early_stopping_patience x eval_steps before to stp the training in order to get a better model
early_stopping_patience = save_total_limit

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)],
)

In [30]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
290,0.2308,0.042598,0.92063,0.91822,0.919424,0.990944
580,0.0368,0.034956,0.929465,0.948315,0.938795,0.992391
870,0.0246,0.020106,0.956832,0.964344,0.960573,0.995771
1160,0.0154,0.021436,0.957902,0.967615,0.962734,0.994185
1450,0.0224,0.015601,0.967752,0.971868,0.969806,0.996787
1740,0.0139,0.017713,0.964867,0.970232,0.967542,0.996216
2030,0.0133,0.016952,0.971541,0.971541,0.971541,0.997023
2320,0.0152,0.014899,0.983168,0.974485,0.978807,0.997426
2610,0.0081,0.013186,0.980579,0.974485,0.977523,0.997371
2900,0.0054,0.014702,0.977392,0.975793,0.976592,0.997176


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=3190, training_loss=0.03587966977989412, metrics={'train_runtime': 9719.7631, 'train_samples_per_second': 7.34, 'train_steps_per_second': 0.459, 'total_flos': 3239735506957680.0, 'train_loss': 0.03587966977989412, 'epoch': 3.58})

In [31]:
trainer.evaluate()

{'eval_loss': 0.01489872857928276,
 'eval_precision': 0.9831683168316832,
 'eval_recall': 0.9744847890088322,
 'eval_f1': 0.9788072942336126,
 'eval_accuracy': 0.9974264450163456,
 'eval_runtime': 202.839,
 'eval_samples_per_second': 17.585,
 'eval_steps_per_second': 1.099,
 'epoch': 3.58}

In [32]:
predictions, labels, _ = trainer.predict(tokenized_dataset["validation"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = seqeval.compute(predictions=true_predictions, references=true_labels)
results

{'AREA': {'precision': 0.9696969696969697,
  'recall': 0.9696969696969697,
  'f1': 0.9696969696969697,
  'number': 132},
 'JULGAMENTO_CONTAS': {'precision': 0.9484536082474226,
  'recall': 0.9387755102040817,
  'f1': 0.9435897435897437,
  'number': 98},
 'MINISTRO': {'precision': 0.9970282317979198,
  'recall': 0.9955489614243324,
  'f1': 0.9962880475129918,
  'number': 674},
 'MOTIVO_TCE': {'precision': 0.9542857142857143,
  'recall': 0.9488636363636364,
  'f1': 0.9515669515669515,
  'number': 176},
 'PROCESSO': {'precision': 1.0,
  'recall': 0.9876543209876543,
  'f1': 0.9937888198757764,
  'number': 81},
 'PROCURADOR_MP': {'precision': 0.9955357142857143,
  'recall': 0.9823788546255506,
  'f1': 0.9889135254988914,
  'number': 227},
 'PROPOSTA_ENCAMINHAMENTO': {'precision': 0.9745762711864406,
  'recall': 0.92,
  'f1': 0.9465020576131686,
  'number': 125},
 'RECOMENDACAO_PLENARIO': {'precision': 1.0,
  'recall': 0.9508196721311475,
  'f1': 0.9747899159663865,
  'number': 61},
 'RESPO

Salvar

In [33]:
# save best model
# model_dir = 'ModelNerTceBr/' + 'ner-TceBr-Final-1-' + str(model_name) + '/model/' + folder_model
# model_dir = 'ModelNerTceBr/' + 'ner-TceBr-Final-2-' + str(model_name) + '/model/' + folder_model
model_dir = 'ModelNerTceBr/' + 'ner-TceBr-Final-3-' + str(model_name) + '/model/' + folder_model
trainer.save_model(model_dir)
tokenizer.save_pretrained(model_dir)

('ModelNerTceBr/ner-TceBr-Final-3-Model-2000lm-TceBr-bert-base-portuguese-cased/model/e5_lr0.0001/tokenizer_config.json',
 'ModelNerTceBr/ner-TceBr-Final-3-Model-2000lm-TceBr-bert-base-portuguese-cased/model/e5_lr0.0001/special_tokens_map.json',
 'ModelNerTceBr/ner-TceBr-Final-3-Model-2000lm-TceBr-bert-base-portuguese-cased/model/e5_lr0.0001/vocab.txt',
 'ModelNerTceBr/ner-TceBr-Final-3-Model-2000lm-TceBr-bert-base-portuguese-cased/model/e5_lr0.0001/added_tokens.json',
 'ModelNerTceBr/ner-TceBr-Final-3-Model-2000lm-TceBr-bert-base-portuguese-cased/model/e5_lr0.0001/tokenizer.json')