In [None]:
PATH_DATASET = 'XXX'
PRETRAINED_MODEL = 'raquelsilveira/legalbertpt_fp' #'neuralmind/bert-base-portuguese-cased'

## Leitura dos Dados

In [None]:
import pandas as pd

df_data = pd.read_csv(PATH_DATASET)

In [None]:
n1 = df_data.n1.unique()
for n in n1:
    print(f'{n} - {df_data[df_data.n1 == n].label.nunique()}')

## Encode Dados

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import multiprocessing

from datetime import datetime
from transformers import BertTokenizer

path_encode_saida = f'/input/'

from preprocessing.encoding_bert import encode_bert

begin = datetime.now()
print(f'Inicio: {begin}')

df_data = pd.read_csv(PATH_DATASET)

In [None]:
from transformers import BertTokenizer

# Tokenizer
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL, do_lower_case=False, use_fast = True)



In [37]:
input_token_size = 512

# Encode
encoded_data = tokenizer.batch_encode_plus(
    df_data.texto_tratado.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=input_token_size,
    return_tensors='pt'
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
process_ids = df_data['id_processo']
documentos_id = df_data['id_processo_documento']
input_ids = encoded_data['input_ids']
attention_masks = encoded_data['attention_mask']

headers_input = [f'input_{i}' for i in range(input_ids.shape[1])]
input_savein = os.path.join(path_encode_saida, f'input_ids.csv')

df_input_ids = pd.DataFrame(np.array(input_ids), columns=headers_input)
df_input_ids = pd.concat([process_ids, documentos_id, df_input_ids], axis=1)
df_input_ids.to_csv(input_savein, index=False)

headers_mask = [f'att_mask_{i}' for i in range(attention_masks.shape[1])]
att_savein = os.path.join(path_encode_saida, f'attention_masks.csv')

df_attention_masks = pd.DataFrame(np.array(attention_masks), columns=headers_mask)
df_attention_masks = pd.concat([process_ids, documentos_id, df_attention_masks], axis=1)
df_attention_masks.to_csv(att_savein, index=False)

end = datetime.now()
print(f'Final: {end}')
print(f'Tempo total: {end - begin}')

## Run BERT

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

def get_data_encodes(path_encode):
    path_encode_input = os.path.join(path_encode, 'input_ids.csv')
    path_encode_mask = os.path.join(path_encode, 'attention_masks.csv')

    df_input = pd.read_csv(path_encode_input)
    array_inputs = list(df_input.drop(columns=['id_processo', 'id_processo_documento']).values)
    df_input['input_ids'] = array_inputs
    df_input = df_input[['id_processo', 'id_processo_documento', 'input_ids']]
    df_input = df_input.rename(columns={'id_processo': 'processo_id',
                                        'id_processo_documento': 'documento_id'})

    df_mask = pd.read_csv(path_encode_mask)
    array_mask = list(df_mask.drop(columns=['id_processo', 'id_processo_documento']).values)
    df_mask['attention_masks'] = array_mask
    df_mask = df_mask[['id_processo', 'id_processo_documento', 'attention_masks']]
    df_mask = df_mask.rename(columns={'id_processo': 'processo_id',
                                      'id_processo_documento': 'documento_id'})

    df_merge = pd.merge(df_input, df_mask, how='inner', on=['processo_id', 'documento_id'])

    size = len(df_merge.iloc[0]['input_ids'])

    return df_merge, size


def get_data(path_dataset, path_encode, percent_treino=0.8):

  df_data = pd.read_csv(path_dataset, usecols=['documento_id', 'processo_id', 'assunto_id', 'label'])

  df_data.rename(columns={'cd_assunto_trf' : 'assunto_id',
                          'id_processo_documento': 'documento_id',
                          'id_processo': 'processo_id'},
                 inplace=True)
  
  print('Tamanho do dataset:', len(df_data))

  
  df_encode, size = get_data_encodes(path_encode)
  df_data = pd.merge(df_data, df_encode, how='inner', on=['processo_id', 'documento_id'])
  
  df_data = df_data[df_data.groupby(['label'])['documento_id'].transform('nunique') > 10]

  print(len(df_data))

  df_data_treino, df_data_teste, _, _ = train_test_split(df_data, df_data['label'],
                                                        test_size=1-percent_treino, stratify=df_data['label'],
                                                        random_state = 0)

  df_data_treino['split'] = ['train'] * len(df_data_treino)
  df_data_teste['split'] = ['test'] * len(df_data_teste)
  df_concat = pd.concat([df_data_treino, df_data_teste])

  print('Tamanho dataset (df_concat):', len(df_concat))
  return df_concat

def wrapper_tensor_data_bert(data):
    input_ids = []
    attention_masks = []
    processo_ids = []
    labels = []

    for _, row in data.iterrows():
      
      input_ids.append(list(row['input_ids']))
      attention_masks.append(list(row['attention_masks']))
      processo_ids.append(row['processo_id'])
      labels.append(row['id_label'])

    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)
    labels = torch.tensor(labels)
    processo_ids = torch.tensor(processo_ids)

    print('labels', labels)
    return TensorDataset(input_ids, attention_masks, labels, processo_ids)

In [44]:
def get_data_classificador(x_data, val_size, train_size, batch_size):

  x_data['documento_id'] = x_data['documento_id'].astype(int)

  x_data_train, x_data_test = x_data[x_data.split == 'train'], x_data[x_data.split == 'test']
  y_test = x_data_test['id_label']

  percent_val = val_size / (val_size + train_size)

  x_data_train, x_data_val, y_train, y_val = train_test_split(x_data_train, x_data_train['id_label'],
                                                    test_size=percent_val, stratify=x_data_train['id_label'],
                                                    random_state = 0)


  print('train:', len(x_data_train))
  print('teste:', len(x_data_test))
  print('val:', len(x_data_val))
  dataset_train = wrapper_tensor_data_bert(x_data_train)
  dataset_val = wrapper_tensor_data_bert(x_data_val)
  dataset_test = wrapper_tensor_data_bert(x_data_test)

  return dataset_train, dataset_val, dataset_test

In [None]:
def get_data_classifier(path_dataset, path_encode):

    df_data = get_data(path_dataset, path_encode)
    print('Tamanho dataset:', len(df_data))

    num_class = df_data['label'].nunique()
    print('Número de classes:', num_class)

    labels = list(df_data['label'].unique())
    dict_label = {}
    for id, value  in enumerate(labels):
      dict_label[value] = id

    df_data['id_label'] = df_data['label'].replace(dict_label)

    dataset_train, dataset_val, dataset_test = get_data_classificador(df_data, percen_val, percent_treino, batch_size)

    dataloader_train = DataLoader(dataset_train,
                              sampler=RandomSampler(dataset_train),
                              batch_size=batch_size)

    dataloader_val = DataLoader(dataset_val,
                                sampler=SequentialSampler(dataset_val),
                                batch_size=batch_size)

    dataloader_test = DataLoader(dataset_test,
                                sampler=SequentialSampler(dataset_test),
                                batch_size=batch_size)

    return dataloader_train, dataloader_val, dataloader_test, dict_label, num_class

In [None]:
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup


def get_model(modelo_id, num_class, dataloader_train):

  model = AutoModelForSequenceClassification.from_pretrained(modelo_id, num_labels=num_class, output_attentions=False, output_hidden_states=False, id2label=label_dict)

  optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

  scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

  return model, optimizer, scheduler

In [47]:
import torch

torch.cuda.empty_cache()

In [48]:
def evaluate(dataloader_test, model):

    model.eval()

    loss_test_total = 0
    predictions, true_test = [], []

    processos = []

    for batch in dataloader_test:

        batch = tuple(b.to(device) for b in batch)

        processos.extend(batch[3].cpu().numpy())

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_test_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_test.append(label_ids)

    loss_test_avg = loss_test_total/len(dataloader_test)

    predictions = np.concatenate(predictions, axis=0)
    true_test = np.concatenate(true_test, axis=0)

    return loss_test_avg, predictions, true_test, processos

In [49]:
from sklearn import metrics
from sklearn.metrics import f1_score, classification_report, confusion_matrix

def f1_score_func(preds, labels, metric):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average=metric)

def f1_score_func_average(preds, labels, average_f1):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average=average_f1)

def accuracy_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return metrics.accuracy_score(labels_flat, preds_flat)

def classification_report_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    report = metrics.classification_report(labels_flat,preds_flat)
    print(report)

def confusion_matrix_class(labels, preds):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return confusion_matrix(labels_flat,preds_flat)

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}

    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [None]:
def run_train(model, optimizer, scheduler, dataloader_train, dataloader_val):

  model.to(device)

  history = {
      'train': {
        'f1_weighted': [], 'f1_micro': [], 'f1_macro': [], 'acc': [], 'loss': []
      },
      'val': {
        'f1_weighted': [], 'f1_micro': [], 'f1_macro': [], 'acc': [], 'loss': []
      }
  }

  best_model = None
  best_metric = -1

  early_stop_count = 0

  for epoch in tqdm.tqdm(range(1, epochs+1)):

      if early_stop_count >= early_stop:
          print("Early stop!")
          break

      model.train()

      loss_train_total = 0

      for batch in dataloader_train:

          model.zero_grad()

          batch = tuple(b.to(device) for b in batch)

          inputs = {'input_ids':      batch[0],
                    'attention_mask': batch[1],
                    'labels':         batch[2],
                  }
                  
          outputs = model(**inputs)

          loss = outputs[0]
          loss_train_total += loss.item()

          loss.backward()

          torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

          optimizer.step()
          scheduler.step()

      torch.cuda.empty_cache()
      
      tqdm.tqdm.write(f'\nEpoch {epoch}')

      loss_train_avg = loss_train_total/len(dataloader_train)

      train_loss, predictions_train, true_train, _ = evaluate(dataloader_train, model)
      val_loss, predictions_val, true_val, _ = evaluate(dataloader_val, model)

      tqdm.tqdm.write(f'Train loss: {train_loss}, Val loss: {val_loss}')

      acc_train = accuracy_score_func(predictions_train, true_train)
      f1_micro_train = f1_score_func(predictions_train, true_train, 'micro')
      f1_macro_train = f1_score_func(predictions_train, true_train, 'macro')
      f1_weighted_train = f1_score_func(predictions_train, true_train, 'weighted')

      acc_val = accuracy_score_func(predictions_val, true_val)
      f1_micro_val = f1_score_func(predictions_val, true_val, 'micro')
      f1_macro_val = f1_score_func(predictions_val, true_val, 'macro')
      f1_weighted_val = f1_score_func(predictions_val, true_val, 'weighted')

      history['train']['acc'].append(acc_train)
      history['train']['loss'].append(train_loss)
      history['train']['f1_macro'].append(f1_micro_train)
      history['train']['f1_micro'].append(f1_macro_train)
      history['train']['f1_weighted'].append(f1_weighted_train)

      history['val']['acc'].append(acc_val)
      history['val']['loss'].append(val_loss)
      history['val']['f1_macro'].append(f1_micro_val)
      history['val']['f1_micro'].append(f1_macro_val)
      history['val']['f1_weighted'].append(f1_weighted_val)


      early_stop_count += 1
      if best_model is None or f1_weighted_val > best_metric:
          print(f'best model:{epoch} : {f1_weighted_val} - {best_metric}')
          best_metric = f1_weighted_val
          best_model = copy.copy(model)
          early_stop_count = 0

      torch.cuda.empty_cache()

  best_model.config.id2label = label_dict
  best_model.config.label2id = {label_dict[d] : d for d in label_dict}

  best_model.save_pretrained(f'{path_saida}/modelo_classificacao_bert/')

  return best_model, history

In [None]:
def get_metrics_train(history):
  plt.figure(figsize=(12,10))

  epochs = list(range(len(history['train']['loss'])))

  subplots = 1
  metrics = ['loss', 'acc', 'f1_macro', 'f1_micro', 'f1_weighted']

  for m in metrics:

    plt.subplot(3,2,subplots)
    plt.plot(epochs, history['train'][m], '.-', label='train')
    plt.plot(epochs, history['val'][m], '.-', label='val')
    plt.ylabel(m)

    plt.grid()
    plt.legend()

    subplots += 1
    plt.tight_layout()
  plt.show()

In [None]:
import torch
import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import copy

from sklearn.metrics import classification_report, confusion_matrix
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler


modelo_id = PRETRAINED_MODEL

path_saida = '/bracis_2025_bertimbau/'
path_encode = '/input/'

percent_treino = 0.80
percen_val = 0.05
batch_size = 4
epochs = 20
early_stop = 5
balanceamento = False

torch.cuda.empty_cache()

device = 'cuda' if torch.cuda.is_available() else 'cpu'

dataloader_train, dataloader_val, dataloader_test, dict_label, num_class = get_data_classifier(PATH_DATASET, path_encode)

In [None]:
#id2label
label_dict = {v: k for k, v in dict_label.items()}

model, optimizer, scheduler = get_model(modelo_id, num_class, dataloader_train)

best_model, history = run_train(model, optimizer, scheduler, dataloader_train, dataloader_val)
get_metrics_train(history)

In [None]:
test_loss, predictions, true_test, processos = evaluate(dataloader_test, best_model)

classification_report(true_test, predictions)