## Carregue a base de dados e faça a divisão entre treino, validação e teste.

In [4]:
import gc

import accelerate
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


## Treine o BERT (antes, faça a tokenização e veja como estão os tokens de um documento!)


In [5]:
# Read the corpus; the code below uses its 'text' and 'class' columns.
data = pd.read_csv("./Dmoz-Science.csv")

# Turn each class value into an integer id, stored in a new 'labels' column.
encoder = LabelEncoder()
data['labels'] = encoder.fit_transform(data['class'])

texts, labels = data['text'].tolist(), data['labels'].tolist()

# 70 % of the documents are used for training. The held-out 30 % is then cut
# in half, giving validation and test sets of 15 % each. Both cuts use the
# same fixed seed so the partition is repeatable.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    texts,
    labels,
    test_size=0.3,
    random_state=42,
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    random_state=42,
)

# Tokenizer loaded from the same 'bert-base-uncased' checkpoint name that the
# classification model uses.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(examples):
    """Tokenize the ``'text'`` entry of a batch of examples.

    Long inputs are truncated and every sequence is padded with the
    ``'max_length'`` strategy, so all examples end up with the same number
    of tokens.

    Args:
        examples: Mapping with a ``'text'`` entry, as handed over by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the batch (the notebook's displayed frame
        shows ``input_ids``, ``token_type_ids`` and ``attention_mask``).
    """
    batch_texts = examples['text']
    return tokenizer(batch_texts, truncation=True, padding='max_length')

# Build one Hugging Face dataset per split. Each pipeline wraps the raw lists,
# tokenizes them in batches, and then drops the raw 'text' column so that only
# the label and the token columns remain.
dataset_train = (
    Dataset.from_dict({"text": train_texts, "label": train_labels})
    .map(tokenize_function, batched=True)
    .remove_columns(["text"])
)
dataset_val = (
    Dataset.from_dict({"text": val_texts, "label": val_labels})
    .map(tokenize_function, batched=True)
    .remove_columns(["text"])
)
dataset_test = (
    Dataset.from_dict({"text": test_texts, "label": test_labels})
    .map(tokenize_function, batched=True)
    .remove_columns(["text"])
)

# Show the tokenized test split as a DataFrame for a quick visual check.
display(dataset_test.to_pandas())


Map: 100%|██████████| 698/698 [00:00<00:00, 1212.22 examples/s]
Map: 100%|██████████| 150/150 [00:00<00:00, 1405.96 examples/s]
Map: 100%|██████████| 150/150 [00:00<00:00, 1412.21 examples/s]


Unnamed: 0,label,input_ids,token_type_ids,attention_mask
0,1,"[101, 23755, 6494, 2595, 28625, 6494, 9102, 12...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,1,"[101, 4950, 3420, 1037, 2440, 2846, 1997, 2802...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,0,"[101, 3516, 7554, 4982, 2545, 2592, 2055, 1996...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,1,"[101, 3493, 13674, 3485, 2981, 2658, 3485, 799...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,1,"[101, 10768, 28550, 1005, 1055, 20506, 12654, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ..."
...,...,...,...,...
145,1,"[101, 11358, 4195, 3720, 2013, 1037, 2338, 201...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ..."
146,0,"[101, 2888, 11515, 4710, 2470, 2523, 1012, 199...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
147,1,"[101, 21706, 2950, 1037, 2275, 1997, 3964, 201...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
148,0,"[101, 2695, 1011, 11203, 2968, 1024, 14557, 36...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [6]:
# Memory housekeeping cell: it can be re-run after any training or evaluation
# step. `gc` is imported in the notebook's import cell at the top.
# Python garbage is collected first so that objects still holding tensors are
# dropped; PyTorch is then asked to release its unused cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Pretrained BERT with a sequence-classification head sized to the number of
# distinct label ids in the dataset. The head's weights are newly initialised
# (see the loader's message below), so the model must be fine-tuned before use.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(set(labels)),
)

def compute_metrics(eval_pred):
    """Compute accuracy and F1 scores for one evaluation pass.

    Args:
        eval_pred: Two-item value that unpacks into ``(logits, labels)``:
            the model's raw scores and the reference label ids.

    Returns:
        dict with the keys ``'accuracy'``, ``'f1_micro'`` and ``'f1_macro'``.
    """
    raw_scores, gold = eval_pred
    # The predicted class is the position of the largest score on the last axis.
    guessed = torch.tensor(raw_scores).argmax(dim=-1)
    return {
        'accuracy': accuracy_score(gold, guessed),
        'f1_micro': f1_score(gold, guessed, average='micro'),
        'f1_macro': f1_score(gold, guessed, average='macro'),
    }

# Fine-tuning configuration, grouped by concern. Evaluation and checkpointing
# use the same "epoch" schedule, and `load_best_model_at_end` asks the Trainer
# to restore the best checkpoint once training finishes.
training_args = TrainingArguments(
    # --- optimisation ---
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=16,
    # --- evaluation and checkpoints ---
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # --- outputs and logging ---
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
)

# Train on the training split; the validation split is scored with
# `compute_metrics` at every evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


In [None]:
# 5. Evaluation on the held-out test split
predictions = trainer.predict(dataset_test)
logits = predictions.predictions
# The predicted class is the index of the largest logit for each example.
predicted_labels = torch.argmax(torch.tensor(logits), dim=-1).tolist()

# Final metrics
acc = accuracy_score(test_labels, predicted_labels)
f1_micro = f1_score(test_labels, predicted_labels, average='micro')
f1_macro = f1_score(test_labels, predicted_labels, average='macro')

# Label ids that occur in the references or the predictions, in sorted order.
# Passing this order explicitly lets the report rows carry the original class
# names (recovered from the fitted encoder) instead of opaque integer ids, and
# keeps the confusion-matrix rows/columns in the same order as the report.
report_label_ids = sorted(set(test_labels) | set(predicted_labels))
report_label_names = [str(name) for name in encoder.inverse_transform(report_label_ids)]

conf_matrix = confusion_matrix(test_labels, predicted_labels, labels=report_label_ids)

print("Accuracy:", acc)
print("F1 Micro:", f1_micro)
print("F1 Macro:", f1_macro)
print("Confusion Matrix:\n", conf_matrix)
print(
    "Classification Report:\n",
    classification_report(
        test_labels,
        predicted_labels,
        labels=report_label_ids,
        target_names=report_label_names,
    ),
)