In [None]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from tensorflow.keras.layers import Input, Dense, Dropout,Lambda
from tensorflow.keras.models import Model

# =============================================
# 1. Load the dataset
# =============================================
# Keep only columns v1 and v2 of the CSV and give them descriptive names.
df = (
    pd.read_csv("spam.csv", encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)

# Replace the string labels with integer class ids. LabelEncoder numbers the
# classes in sorted order, so with ham/spam labels this gives ham=0, spam=1.
encoder = LabelEncoder()
encoder.fit(df['label'])
df['label'] = encoder.transform(df['label'])

# =============================================
# 2. BERT tokenization
# =============================================
# Every text is padded or truncated to this many tokens.
MAX_LEN = 80

# Hugging Face checkpoint id used to load the pretrained weights; replace it
# with another BERT checkpoint id to try a different pretrained model.
MODEL_NAME = "neuralmind/bert-base-portuguese-cased"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

def tokenize_texts(texts):
    """Tokenize a batch of texts into fixed-length TensorFlow tensors.

    Args:
        texts: The strings to encode. Objects exposing ``tolist()`` (pandas
            Series, NumPy arrays) are converted through it; any other
            iterable of strings (list, tuple, generator) is materialised
            with ``list``. A single ``str`` is treated as a batch of one.

    Returns:
        The tokenizer output for the whole batch, with every sequence
        padded or truncated to ``MAX_LEN`` tokens and returned as
        TensorFlow tensors.
    """
    if isinstance(texts, str):
        # Guard against list("abc") splitting one text into characters.
        batch = [texts]
    elif hasattr(texts, "tolist"):
        batch = texts.tolist()
    else:
        batch = list(texts)

    return tokenizer(
        batch,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_tensors='tf',
    )

# Split the data: hold out 20% for testing. The split is stratified on the
# label so the train and test sets keep the same class proportions, which
# matters when one class is much rarer than the other.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

train_encodings = tokenize_texts(X_train)
test_encodings = tokenize_texts(X_test)

# =============================================
# 3. Build the TensorFlow datasets
# =============================================
# Single source of truth for the batch size of both datasets.
BATCH_SIZE = 16

# Each element is (dict of tokenizer tensors, label). The shuffle buffer
# covers the whole training set: a buffer smaller than the dataset only
# shuffles within a sliding window. prefetch overlaps input preparation with
# model execution.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train.values))
    .shuffle(len(y_train))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# The test set keeps its order; it is only batched and prefetched.
test_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test.values))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# =============================================
# 4. BERT model + dense layers
# =============================================

bert = TFBertModel.from_pretrained(MODEL_NAME)

# Freeze BERT: mark every one of its layers as non-trainable.
for bert_sublayer in bert.layers:
    bert_sublayer.trainable = False

# Two symbolic int32 inputs of length MAX_LEN, named after the tokenizer
# outputs they receive.
input_ids, attention_mask = [
    Input(shape=(MAX_LEN,), dtype=tf.int32, name=input_name)
    for input_name in ("input_ids", "attention_mask")
]

def bert_layer(inputs):
    """Run BERT and return the hidden state of the first ([CLS]) token.

    Args:
        inputs: A pair ``(input_ids, attention_mask)`` that is forwarded to
            the module-level ``bert`` model as keyword arguments.

    Returns:
        ``last_hidden_state[:, 0, :]``: for every example, the hidden state
        at sequence position 0 (the [CLS] token).
    """
    token_ids, mask = inputs
    # BERT's layers are frozen, so it only serves as a feature extractor.
    # Passing training=False pins inference mode (dropout off) instead of
    # relying on whatever training flag the surrounding Keras call provides.
    outputs = bert(input_ids=token_ids, attention_mask=mask, training=False)
    return outputs.last_hidden_state[:, 0, :]

# Wrap the BERT call in a Lambda layer so it can be applied to the symbolic
# Keras inputs; the output width is BERT's hidden size.
cls_output = Lambda(
    bert_layer,
    output_shape=(bert.config.hidden_size,),
)([input_ids, attention_mask])

# Classification head: dropout followed by one sigmoid unit that outputs the
# probability of the positive class.
drop = Dropout(0.3)(cls_output)
out = Dense(1, activation='sigmoid')(drop)

model = Model(inputs=[input_ids, attention_mask], outputs=out)

# BERT is frozen, so only the small dense head is trained. 2e-5 is a rate for
# fine-tuning the whole encoder; with it the head barely moved in 3 epochs
# (validation accuracy stayed at 0.8655). A freshly initialised head trains
# with a conventional Adam rate instead.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()

# =============================================
# 5. Train the model
# =============================================
EPOCHS = 3

# NOTE(review): the test split is also passed as validation data, so the
# per-epoch validation metrics are computed on the same examples as the final
# evaluation. Confirm that no model selection is based on them.
history = model.fit(train_dataset, epochs=EPOCHS, validation_data=test_dataset)

# =============================================
# 6. Evaluation
# =============================================
# evaluate() returns the loss followed by the compiled metric (accuracy).
eval_results = model.evaluate(test_dataset)
loss, acc = eval_results
print(f"Acurácia no teste: {acc*100:.2f}%")


Some layers from the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertModel were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['bert/pooler/dense/bias:0', 'bert/pooler/dense/kernel:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
279/279 ━━━━━━━━━━━━━━━━━━━━ 1768s 6s/step - accuracy: 0.5242 - loss: 0.6926 - val_accuracy: 0.8735 - val_loss: 0.5046
Epoch 2/3
279/279 ━━━━━━━━━━━━━━━━━━━━ 1730s 6s/step - accuracy: 0.8648 - loss: 0.4892 - val_accuracy: 0.8655 - val_loss: 0.4045
Epoch 3/3
279/279 ━━━━━━━━━━━━━━━━━━━━ 1746s 6s/step - accuracy: 0.8640 - loss: 0.4126 - val_accuracy: 0.8655 - val_loss: 0.3602
70/70 ━━━━━━━━━━━━━━━━━━━━ 346s 5s/step - accuracy: 0.8755 - loss: 0.3497
Acurácia no teste: 86.55%
