In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset

# Load the training and test splits from semicolon-separated, ISO-8859-1 encoded CSV files.
# NOTE: both entries below are the same placeholder path; replace each with the real
# location of the corresponding file before running.
train_data, test_data = (
    pd.read_csv(csv_path, encoding="iso-8859-1", sep=';')
    for csv_path in ("caminho do arquivo", "caminho do arquivo")
)

# Preprocessing function for BERT
def preprocess_text_for_bert(text, tokenizer, max_length=128):
    """Tokenize a single text into fixed-length model inputs.

    Args:
        text: The raw text to encode. ``None`` and float NaN (what pandas
            yields for an empty CSV cell) are treated as an empty string;
            any other non-string value is converted with ``str()``.
        tokenizer: An object exposing ``encode_plus`` (here, the tokenizer
            loaded for the model).
        max_length: Length every encoded sequence is padded or truncated to.

    Returns:
        The object returned by ``tokenizer.encode_plus`` when called with
        ``padding="max_length"``, ``truncation=True`` and
        ``return_tensors="pt"``.
    """
    # ``text != text`` is only true for NaN; it avoids importing math/pandas here.
    if text is None or (isinstance(text, float) and text != text):
        text = ""
    elif not isinstance(text, str):
        # Numeric cells are parsed by pandas as int/float; the tokenizer needs a string.
        text = str(text)

    return tokenizer.encode_plus(
        text,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

# Load the tokenizer and the BERT sequence-classification model from the same checkpoint.
pretrained_checkpoint = "neuralmind/bert-base-portuguese-cased"
tokenizer = BertTokenizer.from_pretrained(pretrained_checkpoint)
# NOTE(review): num_labels=3 presumably has to match the number of distinct values in the
# 'code0_3' label column — confirm against the data.
model = BertForSequenceClassification.from_pretrained(
    pretrained_checkpoint,
    num_labels=3,
)

# Tokenize every response; each element of X_train / X_test is the encoding returned by
# preprocess_text_for_bert for one row.
X_train = train_data['response'].apply(preprocess_text_for_bert, tokenizer=tokenizer)
X_test = test_data['response'].apply(preprocess_text_for_bert, tokenizer=tokenizer)

# Convert the labels to tensors. The dtype is forced to long so the labels are integer class
# indices even when pandas parsed the column as floats; float labels would not be treated as
# class indices by the classification loss.
y_train = torch.tensor(train_data['code0_3'].values, dtype=torch.long)
y_test = torch.tensor(test_data['code0_3'].values, dtype=torch.long)

# Build the PyTorch datasets
def _stack_encoding_field(encodings, field):
    """Stack one field of every per-row encoding into a single tensor.

    Each encoding stores ``field`` with a leading dimension of size one
    (one text per call), so index 0 is taken before stacking.
    """
    return torch.stack([encoding[field][0] for encoding in encodings])


train_input_ids = _stack_encoding_field(X_train, 'input_ids')
train_attention_masks = _stack_encoding_field(X_train, 'attention_mask')
train_dataset = TensorDataset(train_input_ids, train_attention_masks, y_train)

test_input_ids = _stack_encoding_field(X_test, 'input_ids')
test_attention_masks = _stack_encoding_field(X_test, 'attention_mask')
# Inputs and labels come from the same rows, so no truncation is needed to align them;
# TensorDataset itself rejects tensors whose first dimensions differ, which surfaces any
# real mismatch instead of hiding it behind a slice.
test_dataset = TensorDataset(test_input_ids, test_attention_masks, y_test)


# Hyperparameters: mini-batch size, optimizer learning rate, and number of training epochs.
batch_size, learning_rate, epochs = 8, 2e-5, 3

# Cross-entropy criterion and the optimizer over all model parameters.
# NOTE: the training loop in this file takes the loss from the model outputs and does not
# call loss_fn.
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=learning_rate)

from tqdm import tqdm

# Training DataLoader, reshuffled on every pass. It is kept unwrapped: a tqdm object
# disables itself after its first complete iteration, so a single wrapper created here
# would only show progress for the first epoch.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Batches are moved to whichever device the model parameters currently live on.
device = next(model.parameters()).device

# Model training
for epoch in range(epochs):
    model.train()
    # A fresh progress bar per epoch, labelled with the epoch number.
    epoch_progress = tqdm(train_dataloader, desc=f"Época {epoch + 1}/{epochs}")
    for input_ids, attention_mask, labels in epoch_progress:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        # Passing labels makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Show the running batch loss next to the bar.
        epoch_progress.set_postfix(loss=f"{loss.item():.4f}")

# Test DataLoader, iterated in dataset order. It is wrapped in a progress bar exactly once,
# inside the evaluation loop below.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Model evaluation with a progress bar
model.eval()
# Batches are moved to whichever device the model parameters currently live on.
eval_device = next(model.parameters()).device
all_predictions = []
all_labels = []
with torch.no_grad():
    test_dataloader_with_progress = tqdm(test_dataloader, desc="Avaliação do Modelo", leave=False)

    for input_ids, attention_mask, labels in test_dataloader_with_progress:
        outputs = model(
            input_ids.to(eval_device),
            attention_mask=attention_mask.to(eval_device),
        )
        # Predicted class = index of the largest logit for each example.
        predictions = torch.argmax(outputs.logits, dim=1)
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Compute evaluation metrics (precision, recall and F1 are support-weighted averages over classes)
accuracy = accuracy_score(all_labels, all_predictions)
precision = precision_score(all_labels, all_predictions, average='weighted')
recall = recall_score(all_labels, all_predictions, average='weighted')
f1 = f1_score(all_labels, all_predictions, average='weighted')

# Print the evaluation metrics
print("Acurácia do modelo: {:.2f}%".format(100 * accuracy))
print("Precisão do modelo: {:.2f}".format(precision))
print("Revocação do modelo: {:.2f}".format(recall))
print("Medida F do modelo: {:.2f}".format(f1))

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# ConfusionMatrixDisplay.plot already needs matplotlib; import pyplot for the title and show calls.
import matplotlib.pyplot as plt

# y_true are the true labels and y_pred are the predicted labels collected during evaluation.
y_true = all_labels
y_pred = all_predictions
# Every class id seen in either list, sorted, so the matrix rows/columns and the tick labels
# are guaranteed to use the same ordering.
classes = sorted(set(y_true) | set(y_pred))

cm = confusion_matrix(y_true, y_pred, labels=classes)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
disp.plot(cmap="Blues")
plt.title("Matriz de Confusão")
plt.show()


tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1206/1206 [3:10:11<00:00,  9.46s/it]
