In [3]:
import os
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
import random
import numpy as np

In [4]:
# Seed every random number generator in use so runs are reproducible
seed = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)
# Also seed all GPUs explicitly when CUDA is present
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

In [5]:
# General settings: run on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Usando dispositivo: {device}')

# Task and optimisation hyperparameters
nclasses = 2          # number of classes
nepochs = 10          # maximum number of epochs
batch_size = 16       # mini-batch size
learning_rate = 2e-5  # a commonly used learning rate for BERT
max_length = 360      # maximum sequence length

# Logging, stopping and output
batch_status = 32     # how often (in batches) training status is printed
early_stop = 5        # early-stopping patience
write_path = 'model'  # directory where the model is saved

Usando dispositivo: cuda


In [6]:
# Load the data and fail fast if a column used later in the notebook is missing
# ('text' is tokenized by the dataset, 'contra' is the class label).
data = pd.read_csv("DATAFRAME.csv")
missing_columns = {'text', 'contra'} - set(data.columns)
if missing_columns:
    raise ValueError(f"DATAFRAME.csv is missing required column(s): {sorted(missing_columns)}")

In [7]:
# Two-stage stratified split on the 'contra' label: hold out 10% for test, then
# take 12.5% of the remaining 90% for validation
# (roughly 79% train / 11% validation / 10% test).
train_val_data, test_data = train_test_split(
    data, test_size=0.10, random_state=seed, stratify=data['contra']
)
train_data, val_data = train_test_split(
    train_val_data, test_size=0.125, random_state=seed, stratify=train_val_data['contra']
)

print(f"Tamanho do Treino: {len(train_data)}")
print(f"Tamanho da Validação: {len(val_data)}")
print(f"Tamanho do Teste: {len(test_data)}")

Tamanho do Treino: 1177
Tamanho da Validação: 169
Tamanho do Teste: 150


In [8]:
# Custom dataset class
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and returns its class label.

    Args:
        data: DataFrame with a 'text' column (input text) and a 'contra' column
            (class label). The index is reset so items are addressed 0..len-1.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            padding='max_length', truncation=True, max_length=max_length)`` and
            returning a mapping of tensors.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data = data.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the row at position ``idx``.

        ``features`` maps each tokenizer output name to a tensor with its first
        dimension squeezed away; ``label`` is a 0-dim ``torch.long`` tensor.
        """
        # Fetch the row once instead of once per column
        row = self.data.iloc[idx]
        encoded = self.tokenizer(row['text'], return_tensors='pt',
                                 padding='max_length', truncation=True,
                                 max_length=self.max_length)
        features = {key: val.squeeze(0) for key, val in encoded.items()}
        # CrossEntropyLoss needs integer class indices; force int64 so a label
        # column parsed as float or bool does not produce a float/bool target.
        label = torch.tensor(int(row['contra']), dtype=torch.long)
        return features, label


In [9]:
# Custom model: pretrained encoder followed by a dropout + linear classification head
class CustomBERTModel(nn.Module):
    """Sequence classifier built on a pretrained encoder loaded with ``AutoModel``.

    The whole backbone is frozen, then only the last ``unfreeze_last_n`` blocks of
    ``bert.encoder.layer`` are made trainable again. Every other backbone
    parameter stays frozen; the classification head is always trainable.

    Args:
        model_name: Checkpoint name or path passed to ``AutoModel.from_pretrained``.
        nclasses: Number of output classes (size of the logits' last dimension).
        dropout: Dropout probability applied to the pooled output (default 0.3).
        unfreeze_last_n: How many of the last encoder blocks to fine-tune
            (default 4). ``0`` keeps the whole backbone frozen; a value larger
            than the number of blocks unfreezes all of them.

    Raises:
        ValueError: If ``unfreeze_last_n`` is negative.
    """

    def __init__(self, model_name, nclasses, dropout=0.3, unfreeze_last_n=4):
        super().__init__()
        if unfreeze_last_n < 0:
            raise ValueError(f"unfreeze_last_n must be >= 0, got {unfreeze_last_n}")

        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(self.bert.config.hidden_size, nclasses)

        # Freeze every backbone parameter first
        for param in self.bert.parameters():
            param.requires_grad = False

        # Unfreeze the last N encoder blocks. The guard matters: layer[-0:]
        # would select every block instead of none.
        if unfreeze_last_n > 0:
            for param in self.bert.encoder.layer[-unfreeze_last_n:].parameters():
                param.requires_grad = True

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return unnormalized class logits computed from the backbone's pooled output."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs.pooler_output
        dropped_out = self.dropout(pooled_output)
        logits = self.classifier(dropped_out)
        return logits

In [10]:
# Initialise the tokenizer and the model from the same pretrained checkpoint
model_name = 'neuralmind/bert-base-portuguese-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False)
model = CustomBERTModel(model_name, nclasses)
model = model.to(device)


tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [11]:
# Give the optimizer only the parameters that are still trainable
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = optim.AdamW(trainable_params, lr=learning_rate)


In [12]:
# Class-weighted loss for imbalanced classes (adjust the weights as needed):
# samples labelled 1 are weighted 2.5x relative to samples labelled 0.
class_weights = torch.tensor([1.0, 2.5], device=device)
loss_fn = nn.CrossEntropyLoss(weight=class_weights)

In [13]:
# Wrap each split in a CustomDataset sharing the same tokenizer and max length
train_dataset, val_dataset, test_dataset = (
    CustomDataset(split, tokenizer, max_length)
    for split in (train_data, val_data, test_data)
)


In [14]:
# Only the training loader shuffles; validation and test keep dataset order
traindata = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valdata, testdata = (
    DataLoader(dataset, batch_size=batch_size, shuffle=False)
    for dataset in (val_dataset, test_dataset)
)


In [15]:
# Evaluation function (returns F1, accuracy and the raw label lists)
def evaluate(model, dataloader):
    """Run ``model`` over ``dataloader`` without gradients and score its predictions.

    Args:
        model: Module called as ``model(**inputs)`` and returning per-class logits.
        dataloader: Iterable yielding ``(inputs, labels)`` pairs, where ``inputs``
            is a mapping of tensors and ``labels`` is a tensor of class indices.

    Returns:
        ``(f1, acc, (y_real, y_pred))``: the weighted F1 score, the accuracy, and
        the true and predicted labels as plain Python lists.

    Note:
        The model is switched to eval mode and left in it; callers that go on
        training must call ``model.train()`` again.
    """
    model.eval()
    y_real, y_pred = [], []
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = {key: val.to(device) for key, val in inputs.items()}

            logits = model(**inputs)
            pred_labels = torch.argmax(logits, 1)

            # Labels are only compared as Python lists, so they never need to
            # be copied to the device and back.
            y_real.extend(labels.tolist())
            y_pred.extend(pred_labels.cpu().tolist())

    f1 = f1_score(y_real, y_pred, average='weighted')
    acc = accuracy_score(y_real, y_pred)
    return f1, acc, (y_real, y_pred)

# Create the checkpoint directory; exist_ok avoids the check-then-create race
# and makes the cell safe to re-run.
os.makedirs(write_path, exist_ok=True)

In [16]:
# Fine-tune with early stopping on validation F1, then score the best checkpoint
# on the held-out test set.
best_model_path = os.path.join(write_path, 'best_model.pth')

# Start below any attainable F1 so the first epoch always writes a checkpoint.
# Starting at 0 would leave no checkpoint (or a stale one from an earlier run)
# if the validation F1 never rose above 0.
max_f1, repeat = float('-inf'), 0
for epoch in range(nepochs):
    model.train()  # evaluate() switches the model to eval mode each epoch
    losses = []

    for batch_idx, (inputs, labels) in enumerate(traindata):
        inputs = {key: val.to(device) for key, val in inputs.items()}
        labels = labels.to(device)

        logits = model(**inputs)
        loss = loss_fn(logits, labels)
        losses.append(float(loss))

        # Backprop, update, then clear gradients for the next batch
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if (batch_idx + 1) % batch_status == 0:
            print(f'Epoch: {epoch} [{batch_idx + 1}/{len(traindata)}]\tLoss: {loss:.6f}')

    # Evaluate on the validation set
    f1_val, acc_val, _ = evaluate(model, valdata)
    print(f'Epoch {epoch} - Val F1: {f1_val:.4f}, Val Accuracy: {acc_val:.4f}')

    # Early stopping on validation F1: checkpoint on improvement, otherwise
    # count consecutive epochs without improvement
    if f1_val > max_f1:
        torch.save(model.state_dict(), best_model_path)
        max_f1 = f1_val
        repeat = 0
        print('Novo melhor modelo salvo.')
    else:
        repeat += 1

    if repeat == early_stop:
        print('Early stopping atingido.')
        break

# Final evaluation on the test set with the best checkpoint.
# weights_only=True restricts unpickling to tensors/plain containers (a state_dict
# needs nothing more); map_location loads onto the device in use.
model.load_state_dict(torch.load(best_model_path, map_location=device, weights_only=True))
f1_test, acc_test, (y_real, y_pred) = evaluate(model, testdata)
print("Desempenho no conjunto de teste:")
print(classification_report(y_real, y_pred, target_names=['0', '1']))
print(f"F1 (teste): {f1_test:.4f}, Accuracy (teste): {acc_test:.4f}")

Epoch: 0 [32/74]	Loss: 0.701079
Epoch: 0 [64/74]	Loss: 0.441900
Epoch 0 - Val F1: 0.4756, Val Accuracy: 0.5621
Novo melhor modelo salvo.
Epoch: 1 [32/74]	Loss: 0.445746
Epoch: 1 [64/74]	Loss: 0.378276
Epoch 1 - Val F1: 0.8574, Val Accuracy: 0.8580
Novo melhor modelo salvo.
Epoch: 2 [32/74]	Loss: 0.361751
Epoch: 2 [64/74]	Loss: 0.562474
Epoch 2 - Val F1: 0.8693, Val Accuracy: 0.8698
Novo melhor modelo salvo.
Epoch: 3 [32/74]	Loss: 0.099081
Epoch: 3 [64/74]	Loss: 0.070618
Epoch 3 - Val F1: 0.8699, Val Accuracy: 0.8698
Novo melhor modelo salvo.
Epoch: 4 [32/74]	Loss: 0.112896
Epoch: 4 [64/74]	Loss: 0.022936
Epoch 4 - Val F1: 0.8695, Val Accuracy: 0.8698
Epoch: 5 [32/74]	Loss: 0.014355
Epoch: 5 [64/74]	Loss: 0.023717
Epoch 5 - Val F1: 0.8758, Val Accuracy: 0.8757
Novo melhor modelo salvo.
Epoch: 6 [32/74]	Loss: 0.021397
Epoch: 6 [64/74]	Loss: 0.005652
Epoch 6 - Val F1: 0.8758, Val Accuracy: 0.8757
Epoch: 7 [32/74]	Loss: 0.183824
Epoch: 7 [64/74]	Loss: 0.048634
Epoch 7 - Val F1: 0.8876, Val

  model.load_state_dict(torch.load(os.path.join(write_path, 'best_model.pth')))


Desempenho no conjunto de teste:
              precision    recall  f1-score   support

           0       0.81      0.82      0.82        79
           1       0.80      0.79      0.79        71

    accuracy                           0.81       150
   macro avg       0.81      0.81      0.81       150
weighted avg       0.81      0.81      0.81       150

F1 (teste): 0.8066, Accuracy (teste): 0.8067
