# BERTimbau FineTuning Language Model

Autora: Sabrina dos Passos Tortelli

Tarefa: FineTuning Language Model 2000 arquivos

In [1]:
# %pip install datasets
# %pip install transformers==4.29.0
# %pip install torch
# %pip install nltk
# %pip install scikit-learn
# %pip install seqeval
# %pip install evaluate
# %pip install matplotlib

In [3]:
model_checkpoint = "neuralmind/bert-base-portuguese-cased"

In [4]:
import transformers
import datasets

print(transformers.__version__)
print(datasets.__version__)

import pathlib
from pathlib import Path

import pandas as pd
from datasets import Dataset, DatasetDict

import os
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

  from .autonotebook import tqdm as notebook_tqdm


4.29.0
2.14.6


[nltk_data] Downloading package punkt to /home/sabrina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
path_to_text_files = 'DadosTXT'

p = Path(path_to_text_files).glob('**/*')
files = [x for x in p if x.is_file() and x.suffix == '.txt']
files

[PosixPath('DadosTXT/023-722-2017-7_AC.txt'),
 PosixPath('DadosTXT/010-303-2019-7_AC.txt'),
 PosixPath('DadosTXT/002-908-2020-4_AC.txt'),
 PosixPath('DadosTXT/018-550-2019-3_AC.txt'),
 PosixPath('DadosTXT/006-717-2016-0_AC.txt'),
 PosixPath('DadosTXT/002-680-2020-3_IP.txt'),
 PosixPath('DadosTXT/000-092-2018-5_AC.txt'),
 PosixPath('DadosTXT/015-242-2018-8_AC.txt'),
 PosixPath('DadosTXT/016-212-2016-9_AC.txt'),
 PosixPath('DadosTXT/015-069-2021-4_IP.txt'),
 PosixPath('DadosTXT/005-744-2019-9_AC.txt'),
 PosixPath('DadosTXT/000-738-2016-6_AC.txt'),
 PosixPath('DadosTXT/009-523-2016-2_AC.txt'),
 PosixPath('DadosTXT/017-454-2017-4_AC.txt'),
 PosixPath('DadosTXT/007-715-2016-1_AC.txt'),
 PosixPath('DadosTXT/021-330-2016-6_AC.txt'),
 PosixPath('DadosTXT/025-092-2016-2_AC.txt'),
 PosixPath('DadosTXT/027-715-2018-3_AC.txt'),
 PosixPath('DadosTXT/025-877-2020-8_IP.txt'),
 PosixPath('DadosTXT/012-389-2016-1_AC.txt'),
 PosixPath('DadosTXT/040-373-2018-5_AC.txt'),
 PosixPath('DadosTXT/027-713-2018-

In [6]:
len(files)

2000

In [7]:
paragraphs_list = list()

for file in files:
  paragraphs_by_file_list = list()
  with open(file, 'r') as f:
    data = f.read()
    sentences = sent_tokenize(data)
    for sentence in sentences:
      if sentence != '':
        paragraphs_by_file_list.append(sentence)
  paragraphs_list.extend(paragraphs_by_file_list)

In [8]:
len(paragraphs_list)

182571

In [9]:
df = pd.DataFrame(paragraphs_list)
df.rename(columns={0: 'text'}, inplace=True)
df.head()

Unnamed: 0,text
0,ACORDAO No 8379/2020 - TCU - 1a Camara (246.55...
1,1.
2,Processo TC 023.722/2017-7.
3,2.
4,Grupo II - Classe II - Assunto: Tomada de Cont...


In [10]:
df['text'].str.split().apply(len).value_counts()

text
1       37661
6        6368
5        6223
4        5598
2        5252
        ...  
447         1
309         1
430         1
1018        1
470         1
Name: count, Length: 471, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split

train, validation = train_test_split(df, test_size=0.2)
train.reset_index(drop=True, inplace=True)
validation.reset_index(drop=True, inplace=True)

In [12]:
train.head()

Unnamed: 0,text
0,"202, 6o, do Regimento Interno do TCU, com a im..."
1,"Diante do exposto, submetem-se os autos a cons..."
2,"Por outro lado, a nao TRIBUNAL DE CONTAS DA UN..."
3,"6o, inciso II, e 19 da Instrucao Normativa TCU..."
4,Aurenisia Celestino Figueiredo Brandao e pela ...


In [13]:
train_dataset = Dataset.from_pandas(train)
validation_dataset = Dataset.from_pandas(validation)

In [14]:
datasets = DatasetDict()
datasets['train'] = train_dataset
datasets['validation'] = validation_dataset

datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 146056
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 36515
    })
})

In [15]:
datasets["train"][10]

{'text': '9.'}

In [16]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [17]:
show_random_elements(datasets["train"])

Unnamed: 0,text
0,"Alem disso, em linha com a jurisprudencia do TCU, chegou-se a conclusao que a execucao parcial de bem ou servico imprestavel a populacao, sem conexao com o plano de trabalho pactuado, deve levar a condenacao em debito pela totalidade dos recursos transferidos."
1,"O responsavel arrolado na fase interna foi devidamente comunicado e, diante da ausencia de justificativas suficientes para elidir a irregularidade e da nao devolucao dos recursos, instaurou-se a tomada de contas especial."
2,7.
3,38.
4,"157 do Regimento Interno do TCU, ao Banco do Brasil S/A, agencia 4404-0, para que, no prazo de quinze dias, sejam encaminhados os seguintes documentos/informacoes acerca da conta 7563-9, agencia 4404-0, relativa ao projeto cultural Pronac 07-7118 (item 28): a) extrato desde a abertura da conta ate o respectivo encerramento; e b) relacao dos seguintes dados relativos a todos os representantes habilitados para movimentar a conta 7563-9, agencia 4404-0: nome, CPF, periodo de habilitacao e relacao desse representante com a pessoa juridica; 31.2. realizar diligencia, com fundamento nos arts."
5,"Sendo assim, em razao de nao ter transcorrido mais de 10 anos entre esta data e a data do ato que ordenou a citacao (21/5/2020 - peca 26), constata-se que nao ocorreu a prescricao da pretensao punitiva."
6,"Dessa forma, identificado dano ao erario, deve se instaurar e julgar o processo de tomada de contas especial para responsabilizar seus agentes causadores, respeitando o direito ao contraditorio e a ampla defesa, independentemente de quando ocorreram os atos impugnados."
7,2.
8,"De posse dessa Certidao, eram obtidos os demais documentos, tambem ideologicamente falsos (CPF e CTPS), contando, para tanto, com a participacao de servidores publicos dos Correios, de Prefeitura e da Receita Federal."
9,Petrobras utilizam uma ou mais das tecnicas supracitadas.


In [18]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [19]:
def tokenize_function(examples):
    return tokenizer(examples["text"])

In [20]:
tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

Map (num_proc=4): 100%|██████████| 146056/146056 [00:03<00:00, 42449.22 examples/s]
Map (num_proc=4): 100%|██████████| 36515/36515 [00:00<00:00, 36833.89 examples/s]


In [21]:
tokenized_datasets["train"][2]

{'input_ids': [101,
  566,
  1342,
  1341,
  117,
  123,
  229,
  22280,
  267,
  21748,
  22318,
  11964,
  9369,
  10836,
  16273,
  5118,
  22308,
  250,
  22301,
  7281,
  5234,
  22317,
  7639,
  118,
  3479,
  125,
  18459,
  1097,
  16061,
  7639,
  125,
  18459,
  1097,
  16061,
  202,
  1477,
  125,
  14969,
  888,
  9176,
  256,
  304,
  22280,
  240,
  1423,
  125,
  2499,
  538,
  12228,
  22281,
  122,
  13961,
  3391,
  22280,
  143,
  538,
  5590,
  18833,
  15780,
  123,
  13240,
  18374,
  304,
  22280,
  366,
  14520,
  5577,
  170,
  3353,
  171,
  212,
  3549,
  16017,
  119,
  102],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
 

In [22]:
block_size = 128

In [23]:
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [24]:
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

Map (num_proc=4): 100%|██████████| 146056/146056 [00:06<00:00, 23294.35 examples/s]
Map (num_proc=4): 100%|██████████| 36515/36515 [00:01<00:00, 21638.87 examples/s]


In [25]:
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

'##RIBUNAL DE CONTAS DA UNIAO Secretaria - Geral de Controle Externo Secretaria de Controle Externo no Estado de Alagoas 6 comprovacao por meio de controle nos abastecimentos e discriminacoes nos documentos fiscais impede a devida evidenciacao das despesas realizadas com recursos do PNATE. [SEP] [CLS] 6o, inciso II, e 19 da Instrucao Normativa TCU 71 / 2012, ante o longo transcurso de prazo decorrido desde as irregularidades sem a notificacao ou citacao dos responsaveis ( mais de 10 anos ), evidenciando amplo preju'

In [25]:
from transformers import AutoModelForMaskedLM

model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [26]:
from transformers import Trainer, TrainingArguments

In [27]:
# hyperparameters

per_device_batch_size = 8
gradient_accumulation_steps = 1

learning_rate = 2e-5
num_train_epochs = 5 
weight_decay = 0.01

save_total_limit = 2
logging_steps = 100 
eval_steps = logging_steps
evaluation_strategy = 'steps'
logging_strategy = 'steps'
save_strategy = 'steps'
save_steps = logging_steps
load_best_model_at_end = True

fp16 = True

# folders
model_name = model_checkpoint.split("/")[-1]
folder_model = 'e' + str(num_train_epochs) + '_lr' + str(learning_rate)
output_dir = 'Model-2000' + 'lm-TceBr-' + str(model_name) + '/checkpoints/' + folder_model
logging_dir = 'Model-2000' + 'lm-TceBr-' + str(model_name) + '/logs/' + folder_model

# get best model through a metric
metric_for_best_model = 'eval_loss'
if metric_for_best_model == 'eval_f1':
    greater_is_better = True
elif metric_for_best_model == 'eval_loss':
    greater_is_better = False  

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=learning_rate,
    per_device_train_batch_size=per_device_batch_size,
    per_device_eval_batch_size=per_device_batch_size*2,
    gradient_accumulation_steps=gradient_accumulation_steps,
    num_train_epochs=num_train_epochs,
    weight_decay=weight_decay,
    save_total_limit=save_total_limit,
    logging_steps = logging_steps,
    eval_steps = logging_steps,
    load_best_model_at_end = load_best_model_at_end,
    metric_for_best_model = metric_for_best_model,
    greater_is_better = greater_is_better,
    gradient_checkpointing = False,
    do_train = True,
    do_eval = True,
    do_predict = True,
    evaluation_strategy = evaluation_strategy,
    logging_dir=logging_dir, 
    logging_strategy = logging_strategy,
    save_strategy = save_strategy,
    save_steps = save_steps,
    fp16 = fp16,
    push_to_hub=False,
)

In [26]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [29]:
from transformers.trainer_callback import EarlyStoppingCallback

# wait early_stopping_patience x eval_steps before to stp the training in order to get a better model
early_stopping_patience = save_total_limit

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)],
)

In [30]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
100,1.821,1.493623
200,1.5654,1.395483
300,1.4808,1.329169
400,1.4023,1.280789
500,1.3825,1.249131
600,1.2957,1.215332
700,1.2858,1.192234
800,1.2872,1.17273
900,1.2675,1.149754
1000,1.2858,1.132642


TrainOutput(global_step=4600, training_loss=1.1541053556359333, metrics={'train_runtime': 45418.8367, 'train_samples_per_second': 6.324, 'train_steps_per_second': 0.791, 'total_flos': 2421463702732800.0, 'train_loss': 1.1541053556359333, 'epoch': 0.64})

In [31]:
import math
eval_results = trainer.evaluate()
print(eval_results)
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

{'eval_loss': 0.9303459525108337, 'eval_runtime': 843.5059, 'eval_samples_per_second': 17.069, 'eval_steps_per_second': 1.067, 'epoch': 0.64}
Perplexity: 2.54


In [27]:
# save best model
# model_dir = 'Model-2000' + 'lm-TceBr-' + str(model_name) + '/model/'
model_dir = 'Model-2000lm-TceBr-bert-base-portuguese-cased/model'
# trainer.save_model(model_dir)
tokenizer.save_pretrained(model_dir)

('Model-2000lm-TceBr-bert-base-portuguese-cased/model/tokenizer_config.json',
 'Model-2000lm-TceBr-bert-base-portuguese-cased/model/special_tokens_map.json',
 'Model-2000lm-TceBr-bert-base-portuguese-cased/model/vocab.txt',
 'Model-2000lm-TceBr-bert-base-portuguese-cased/model/added_tokens.json',
 'Model-2000lm-TceBr-bert-base-portuguese-cased/model/tokenizer.json')