In [None]:
# 1) Instalar pacotes (execute uma vez no Colab)
!pip install transformers datasets --quiet

# 2) Importar bibliotecas
import inspect

import numpy as np
import pandas as pd
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    set_seed
)




In [None]:
# 3) Load the data (numbers in the CSV use ',' as the decimal separator)
df = pd.read_csv('campanhas.csv', decimal=',')

# 5) Combine the text fields; missing values are treated as empty strings.
# NOTE(review): a commented-out variant also appended the 'CTA' column here —
# confirm whether it should be part of the model input.
df['text'] = df['H1'].fillna('') + ' ' + df['H2'].fillna('')

# 6) Train/test split (80/20), stratified on the target column
train_df, test_df = train_test_split(
    df[['text', 'ACIMA_MEDIA']],
    test_size=0.2,
    random_state=42,
    stratify=df['ACIMA_MEDIA']
)


# 7) Convert to Hugging Face Datasets
def _to_hf_dataset(frame):
    """Build a Hugging Face Dataset from a (text, ACIMA_MEDIA) frame.

    The target column is renamed to 'label'. The pandas index is dropped:
    after the split it is a shuffled, non-contiguous leftover that
    `Dataset.from_pandas` would otherwise store as an extra
    `__index_level_0__` column.
    """
    return Dataset.from_pandas(
        frame.rename(columns={'ACIMA_MEDIA': 'label'}),
        preserve_index=False,
    )


train_ds = _to_hf_dataset(train_df)
test_ds = _to_hf_dataset(test_df)

# 8) Portuguese tokenizer
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

def tokenize_fn(batch, max_length=512):
    """Tokenize the 'text' entry of a batch, truncating long inputs.

    Args:
        batch: mapping whose 'text' entry holds the strings to tokenize
            (the shape `Dataset.map(..., batched=True)` passes in).
        max_length: maximum number of tokens kept per example. The default
            of 512 is presumably the position limit of the BERT checkpoint
            used in this notebook — confirm against the model config.

    Returns:
        Whatever the module-level `tokenizer` returns for the batch.
    """
    # `truncation=True` on its own does nothing here: the tokenizer reports
    # no predefined maximum length and falls back to "no truncation", so the
    # limit has to be passed explicitly.
    return tokenizer(batch['text'], truncation=True, max_length=max_length)

# Tokenize both splits batch-wise
train_ds = train_ds.map(tokenize_fn, batched=True)
test_ds  = test_ds.map(tokenize_fn, batched=True)

# 9) BERT model for binary classification.
# The classification head is not part of the checkpoint and is initialised
# randomly, so seed the RNGs first; otherwise every run starts from different
# head weights and the reported metrics are not repeatable.
set_seed(42)
model = AutoModelForSequenceClassification.from_pretrained(
    'neuralmind/bert-base-portuguese-cased',
    num_labels=2
)
# Collator that pads the tokenized examples when a batch is assembled
data_collator = DataCollatorWithPadding(tokenizer)

# 10) Função de métricas
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for an evaluation pass.

    Args:
        eval_pred: a pair that unpacks into (logits, labels).

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
        Precision, recall and F1 are computed with `average='binary'`.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest score along the last axis.
    predicted = np.argmax(scores, axis=-1)
    prf = precision_recall_fscore_support(gold, predicted, average='binary')
    return {
        'accuracy': accuracy_score(gold, predicted),
        'precision': prf[0],
        'recall': prf[1],
        'f1': prf[2],
    }


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/715 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/179 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
!pip install --upgrade transformers --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.2/40.2 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m62.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# 11) Training configuration
from transformers import TrainingArguments

# NOTE(review): the recorded run tokenized 715 training examples; at a batch
# size of 8 that is about 90 optimizer steps per epoch (~270 for 3 epochs), so
# a 1500-step interval is never reached. `eval_steps` is presumably also only
# honoured under a step-based evaluation strategy. Confirm the intended
# evaluation/checkpoint schedule before changing this value.
STEPS_PER_EPOCH = 1500

# Log the training loss often enough to be visible in a run of a few hundred
# steps; the library default interval (500 steps) is longer than this whole
# run, which leaves the training-loss table empty.
LOGGING_STEPS = 10

training_args = TrainingArguments(
    output_dir="results",
    eval_steps=STEPS_PER_EPOCH,
    save_steps=STEPS_PER_EPOCH,
    logging_steps=LOGGING_STEPS,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="logs",
    report_to=[],
)

# Recent transformers releases deprecate Trainer's `tokenizer` argument in
# favour of `processing_class`. Since this notebook installs whatever version
# is current, pick the keyword the installed Trainer actually declares.
trainer_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)

# 12) Train, then evaluate on the test split
trainer.train()
results = trainer.evaluate()
print(results)

  trainer = Trainer(


Step,Training Loss


{'eval_loss': 0.7076356410980225, 'eval_accuracy': 0.7430167597765364, 'eval_precision': 0.7619047619047619, 'eval_recall': 0.7111111111111111, 'eval_f1': 0.735632183908046, 'eval_runtime': 17.8776, 'eval_samples_per_second': 10.013, 'eval_steps_per_second': 1.287, 'epoch': 3.0}


In [None]:
# Run the evaluation again and print the four headline metrics, 4 decimals each
metrics = trainer.evaluate()
for caption, key in (
    ("Acurácia:  ", "eval_accuracy"),
    ("Precisão:  ", "eval_precision"),
    ("Recall:    ", "eval_recall"),
    ("F1-score:  ", "eval_f1"),
):
    print(f"{caption}{metrics[key]:.4f}")

Acurácia:  0.7430
Precisão:  0.7619
Recall:    0.7111
F1-score:  0.7356


In [None]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# Predict on the test split and keep both the raw scores and the true labels
pred_output = trainer.predict(test_ds)
logits, labels = pred_output.predictions, pred_output.label_ids
# Predicted class = index of the largest score along the last axis
preds = np.argmax(logits, axis=-1)

# Per-class report; names are given in label order
# (presumably 0 = below average, 1 = above average — confirm against the data)
class_names = ["Abaixo média", "Acima média"]
print(classification_report(labels, preds, target_names=class_names))

# Confusion matrix of true labels against predicted labels
cm = confusion_matrix(labels, preds)
print("Matriz de Confusão:\n", cm)


              precision    recall  f1-score   support

Abaixo média       0.73      0.78      0.75        89
 Acima média       0.76      0.71      0.74        90

    accuracy                           0.74       179
   macro avg       0.74      0.74      0.74       179
weighted avg       0.74      0.74      0.74       179

Matriz de Confusão:
 [[69 20]
 [26 64]]
