In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load the training and test splits (semicolon-separated, ISO-8859-1 encoded).
# NOTE: "caminho do arquivo" is a placeholder; point each call at the real CSV file.
train_data = pd.read_csv("caminho do arquivo", encoding="iso-8859-1", sep=';')
test_data = pd.read_csv("caminho do arquivo", encoding="iso-8859-1", sep=';')

# Load the pretrained checkpoint together with its matching tokenizer.
# 'neuralmind/bert-base-portuguese-cased' is a BERT checkpoint. Loading it through the
# DistilBert* classes does not match the saved parameter names, so the network ends up
# randomly initialised instead of pretrained. The Auto* classes select the architecture
# recorded in the checkpoint's own config.
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = 'neuralmind/bert-base-portuguese-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)  # 4 classes: 0, 1, 2, 9

# Função para tokenizar os textos
def tokenize_text(df, max_length=128):
    """Tokenize the ``response`` column of ``df`` into fixed-length tensors.

    Uses the module-level ``tokenizer``.

    Args:
        df: DataFrame with a ``response`` column holding the texts.
        max_length: Length every sequence is truncated or padded to.

    Returns:
        Tuple ``(input_ids, attention_masks)``; each tensor has shape
        ``(len(df), max_length)``.
    """
    # Empty CSV cells are read as NaN and the tokenizer only accepts strings,
    # so missing values become empty strings and everything else is stringified.
    texts = df['response'].fillna('').astype(str).tolist()

    if not texts:
        # Nothing to encode: return correctly shaped empty tensors rather than failing.
        return (
            torch.empty((0, max_length), dtype=torch.long),
            torch.empty((0, max_length), dtype=torch.long),
        )

    # One batched call replaces the per-row encode_plus loop followed by torch.cat.
    encoding = tokenizer(
        texts,
        max_length=max_length,
        truncation=True,
        add_special_tokens=True,
        return_token_type_ids=False,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        return_attention_mask=True,
        return_tensors='pt'
    )

    return encoding['input_ids'], encoding['attention_mask']

# Raw annotation codes mapped to contiguous class indices. CrossEntropyLoss requires
# targets in [0, num_labels), so the raw code 9 cannot be used directly with 4 logits.
LABEL_TO_ID = {0: 0, 1: 1, 2: 2, 9: 3}


def encode_labels(codes):
    """Convert a Series of raw ``code0_3`` values into a long tensor of class indices.

    Raises:
        ValueError: if a value is missing or not one of the keys of ``LABEL_TO_ID``.
    """
    class_ids = codes.map(LABEL_TO_ID)
    unmapped = class_ids.isna()
    if unmapped.any():
        unknown = sorted(set(codes[unmapped].tolist()), key=str)
        raise ValueError(f"Unexpected values in 'code0_3': {unknown}")
    return torch.tensor(class_ids.astype('int64').to_numpy(), dtype=torch.long)


# Encode the labels first so bad label values fail fast, before the slower tokenization.
train_labels = encode_labels(train_data['code0_3'])
test_labels = encode_labels(test_data['code0_3'])

# Tokenize the training and test texts
train_input_ids, train_attention_masks = tokenize_text(train_data)
test_input_ids, test_attention_masks = tokenize_text(test_data)

# Bundle inputs, masks and labels into PyTorch datasets
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)

# Training setup
# Seed the RNG so batch shuffling and dropout are repeatable between runs.
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training hyperparameters
batch_size = 16
epochs = 3
learning_rate = 2e-5

# Data loaders: shuffle only the training set
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Optimizer and loss function.
# torch.optim.AdamW is used instead of the deprecated transformers.AdamW.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()

# Training loop: each epoch runs one optimisation pass over train_loader,
# then a full evaluation pass over test_loader.
for epoch in range(epochs):
    model.train()
    train_loss = 0.0

    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}'):
        # Each batch is (input_ids, attention_mask, labels); move all three to the device.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()

        logits = model(input_ids, attention_mask=attention_mask).logits
        loss = loss_fn(logits, labels)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Mean loss per training batch
    average_train_loss = train_loss / len(train_loader)

    # Evaluation on the test set
    model.eval()
    test_loss = 0.0
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc=f'Test Epoch {epoch + 1}/{epochs}'):
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

            logits = model(input_ids, attention_mask=attention_mask).logits
            test_loss += loss_fn(logits, labels).item()

            # Predicted class = index of the largest logit in each row
            predicted_labels = torch.max(logits, 1).indices
            predictions.extend(predicted_labels.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    average_test_loss = test_loss / len(test_loader)
    test_accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels,
        predictions,
        average='weighted',
    )

    epoch_summary = (
        f'Epoch {epoch + 1}/{epochs} - Train Loss: {average_train_loss:.4f} - Test Loss: {average_test_loss:.4f} - '
        f'Test Accuracy: {test_accuracy:.4f} - Precision: {precision:.4f} - Recall: {recall:.4f} - F1 Score: {f1:.4f}'
    )
    print(epoch_summary)

FileNotFoundError: [Errno 2] No such file or directory: '/content/pt_gci_train.csv'

In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report

# ======================
# 1️⃣ Load the data
# ======================
# NOTE: "caminho do arquivo" is a placeholder; point each call at the real CSV file.
train_data = pd.read_csv("caminho do arquivo", encoding="iso-8859-1", sep=';')
test_data = pd.read_csv("caminho do arquivo", encoding="iso-8859-1", sep=';')

# Keep only the rows labelled 0, 1 or 2
train_data = train_data[train_data['code0_3'].isin([0, 1, 2])]
test_data = test_data[test_data['code0_3'].isin([0, 1, 2])]

# ======================
# 2️⃣ Model and tokenizer
# ======================
# 'neuralmind/bert-base-portuguese-cased' is a BERT checkpoint. Loading it through the
# DistilBert* classes triggers the "model of type bert to instantiate a model of type
# distilbert" warning and leaves the network randomly initialised instead of pretrained.
# The Auto* classes select the architecture recorded in the checkpoint's own config.
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = 'neuralmind/bert-base-portuguese-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)  # 3 classes

# ======================
# 3️⃣ Tokenização
# ======================
def tokenize_text(df, max_length=128):
    """Tokenize the ``response`` column of ``df`` into fixed-length tensors.

    Uses the module-level ``tokenizer``.

    Args:
        df: DataFrame with a ``response`` column holding the texts.
        max_length: Length every sequence is truncated or padded to.

    Returns:
        Tuple ``(input_ids, attention_masks)``; each tensor has shape
        ``(len(df), max_length)``.
    """
    # Empty CSV cells are read as NaN and the tokenizer only accepts strings,
    # so missing values become empty strings and everything else is stringified.
    texts = df['response'].fillna('').astype(str).tolist()

    if not texts:
        # Nothing to encode: return correctly shaped empty tensors rather than failing.
        return (
            torch.empty((0, max_length), dtype=torch.long),
            torch.empty((0, max_length), dtype=torch.long),
        )

    # One batched call replaces the per-row encode_plus loop followed by torch.cat.
    encoding = tokenizer(
        texts,
        max_length=max_length,
        truncation=True,
        add_special_tokens=True,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

    return encoding['input_ids'], encoding['attention_mask']

# Tokenize the training and test texts
train_input_ids, train_attention_masks = tokenize_text(train_data)
test_input_ids, test_attention_masks = tokenize_text(test_data)

# CrossEntropyLoss expects class indices as int64. Without an explicit dtype the tensor
# inherits whatever dtype pandas inferred for the column, which may be float.
train_labels = torch.tensor(train_data['code0_3'].to_numpy(), dtype=torch.long)
test_labels = torch.tensor(test_data['code0_3'].to_numpy(), dtype=torch.long)

# Bundle inputs, masks and labels into PyTorch datasets
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)

# ======================
# 4️⃣ Training setup
# ======================
# Seed the RNG so batch shuffling and dropout are repeatable between runs.
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training hyperparameters
batch_size = 16
epochs = 3
learning_rate = 2e-5

# Data loaders: shuffle only the training set
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Optimizer and loss function
optimizer = AdamW(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()

# ======================
# 5️⃣ Training and evaluation loop
# ======================
# Class ids and display names for the per-class report. Passing the ids explicitly keeps
# classification_report from raising when a class is absent from both the true and the
# predicted labels (it would then see fewer classes than target_names).
class_ids = [0, 1, 2]
class_names = ["Classe 0", "Classe 1", "Classe 2"]

for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    for batch in tqdm(train_loader, desc=f"Treinando Época {epoch+1}/{epochs}"):
        # Each batch is (input_ids, attention_mask, labels)
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = loss_fn(logits, labels)

        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    # Mean loss per training batch
    avg_train_loss = total_loss / len(train_loader)

    # ======================
    # Evaluation
    # ======================
    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc=f"Avaliando Época {epoch+1}/{epochs}"):
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # Predicted class = index of the largest logit in each row
            preds = torch.argmax(logits, dim=1)
            predictions.extend(preds.cpu().tolist())
            true_labels.extend(labels.cpu().tolist())

    accuracy = accuracy_score(true_labels, predictions)
    print(f"\nÉpoca {epoch+1}/{epochs}")
    print(f"Perda média de treino: {avg_train_loss:.4f}")
    print(f"Acurácia total: {accuracy*100:.2f}%")

    # Detailed per-class report (0, 1, 2). zero_division=0 reports 0 instead of
    # warning when a class is never predicted.
    print("\nRelatório de desempenho por classe:")
    print(classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=class_names,
        digits=3,
        zero_division=0,
    ))



tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizer'.
You are using a model of type bert to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.position_embeddings.weight', 'distilbert.embeddings.word_embeddings.weight', 'distilbert.transformer.layer.0.attention.k_lin.bias', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.0.attention.out_lin.bias', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.0.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.q_lin.weight', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.0.ffn.lin2.bias', 'distilbert.


Época 1/3
Perda média de treino: 0.8204
Acurácia total: 62.81%

Relatório de desempenho por classe:
              precision    recall  f1-score   support

    Classe 0      0.665     0.712     0.688       563
    Classe 1      0.820     0.454     0.585       722
    Classe 2      0.462     0.876     0.605       315

    accuracy                          0.628      1600
   macro avg      0.649     0.681     0.626      1600
weighted avg      0.695     0.628     0.625      1600



Treinando Época 2/3: 100%|██████████| 400/400 [1:40:11<00:00, 15.03s/it]
Avaliando Época 2/3: 100%|██████████| 100/100 [07:19<00:00,  4.40s/it]



Época 2/3
Perda média de treino: 0.7117
Acurácia total: 56.38%

Relatório de desempenho por classe:
              precision    recall  f1-score   support

    Classe 0      0.815     0.352     0.491       563
    Classe 1      0.519     0.929     0.666       722
    Classe 2      0.508     0.105     0.174       315

    accuracy                          0.564      1600
   macro avg      0.614     0.462     0.444      1600
weighted avg      0.621     0.564     0.508      1600



Treinando Época 3/3: 100%|██████████| 400/400 [1:37:16<00:00, 14.59s/it]
Avaliando Época 3/3: 100%|██████████| 100/100 [07:24<00:00,  4.44s/it]


Época 3/3
Perda média de treino: 0.6656
Acurácia total: 63.62%

Relatório de desempenho por classe:
              precision    recall  f1-score   support

    Classe 0      0.699     0.657     0.678       563
    Classe 1      0.675     0.626     0.649       722
    Classe 2      0.489     0.622     0.547       315

    accuracy                          0.636      1600
   macro avg      0.621     0.635     0.625      1600
weighted avg      0.647     0.636     0.639      1600




