In [1]:
# Step 0: Install required packages
!pip install -q transformers keras-nlp


In [2]:
import tensorflow as tf
from tensorflow.keras.datasets import reuters
from transformers import BertTokenizerFast
import keras_nlp
import numpy as np

# Load the integer-encoded Reuters newswires, holding out 20% as the test split
reuters_splits = reuters.load_data(num_words=None, test_split=0.2)
(train_x, train_y), (test_x, test_y) = reuters_splits

# word -> id mapping shipped with the dataset, plus its inverse (id -> word)
# which is what the decoding step below looks words up in
word_index = reuters.get_word_index()
reverse_word_index = dict(zip(word_index.values(), word_index.keys()))

def decode_newswire(encoded, index_from=3, unknown="?"):
    """Turn one integer-encoded newswire back into a space-separated string.

    Args:
        encoded: Iterable of integer word ids for a single newswire.
        index_from: Offset subtracted from every id before it is looked up in
            ``reverse_word_index``. Defaults to 3, the value this notebook
            originally hard-coded; it should match the ``index_from`` that
            ``reuters.load_data`` was called with.
        unknown: Placeholder emitted for ids that have no entry in
            ``reverse_word_index`` after the offset is removed.

    Returns:
        The decoded words joined by single spaces (``""`` for empty input).
    """
    return " ".join(
        reverse_word_index.get(word_id - index_from, unknown) for word_id in encoded
    )

# Rebuild the text of every newswire in both splits
train_texts = list(map(decode_newswire, train_x))
test_texts = list(map(decode_newswire, test_x))

# Binary target: 1 when the newswire is labelled topic 3, 0 for any other topic
train_labels = np.where(np.asarray(train_y) == 3, 1, 0)
test_labels = np.where(np.asarray(test_y) == 3, 1, 0)


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters.npz
[1m2110848/2110848[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters_word_index.json
[1m550378/550378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [3]:
# Checkpoint whose tokenizer files are fetched from the Hugging Face Hub
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(BERT_CHECKPOINT)

def tokenize_texts(texts, max_length=128):
    """Tokenize raw strings into fixed-length NumPy inputs.

    Args:
        texts: The text(s) to encode, passed straight to the module-level
            ``tokenizer``.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 128, the value this notebook originally hard-coded.

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with
        ``return_tensors="np"``; the rest of the notebook reads its
        ``"input_ids"``, ``"token_type_ids"`` and ``"attention_mask"`` entries.
    """
    return tokenizer(
        texts,
        padding="max_length",  # always pad up to max_length, not just to the longest text
        truncation=True,
        max_length=max_length,
        return_tensors="np",
    )

# Encode both splits with identical tokenizer settings (train first, then test)
train_tokens, test_tokens = (
    tokenize_texts(split_texts) for split_texts in (train_texts, test_texts)
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:
# Create tf.data.Dataset
def make_dataset(tokens, labels, batch_size=32):
    """Build a batched, prefetched ``tf.data.Dataset`` of (features, label) pairs.

    Args:
        tokens: Tokenizer output providing ``"input_ids"``, ``"token_type_ids"``
            and ``"attention_mask"`` entries.
        labels: Labels aligned with the rows of ``tokens``.
        batch_size: Number of examples per batch (32, as used originally).

    Returns:
        A dataset whose features are a dict keyed ``"token_ids"``,
        ``"segment_ids"`` and ``"padding_mask"``.
    """
    # Rename the tokenizer's fields to the feature names the model is fed with.
    features = {
        "token_ids": tokens["input_ids"],
        "segment_ids": tokens["token_type_ids"],
        "padding_mask": tokens["attention_mask"],
    }
    return (
        tf.data.Dataset.from_tensor_slices((features, labels))
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )


# Both splits go through the same helper so their pipelines cannot diverge.
train_ds = make_dataset(train_tokens, train_labels)
test_ds = make_dataset(test_tokens, test_labels)


In [5]:
# Load the pretrained BERT classifier with a 2-class head.
# preprocessor=None: the model is fed already-tokenized feature dicts,
# so no built-in preprocessing layer is attached.
model = keras_nlp.models.BertClassifier.from_preset(
    "bert_base_en_uncased", num_classes=2, preprocessor=None
)

# Integer class labels + from_logits=True (outputs are treated as unnormalised scores)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(3e-5)
model.compile(loss=loss_fn, optimizer=optimizer, metrics=["accuracy"])

model.summary(line_length=100)


In [11]:
# Smoke-test sizing: keep only a handful of batches so the demo finishes quickly.
# `take(n)` counts *batches* of the already-batched datasets, not individual samples.
TRAIN_BATCHES = 2
EVAL_BATCHES = 1

train_ds = train_ds.take(TRAIN_BATCHES)  # first 2 batches only
test_ds = test_ds.take(EVAL_BATCHES)  # first batch only


In [12]:
# Fine-tune for one epoch over the training batches.
# Note: the test dataset doubles as the validation data here.
history = model.fit(
    x=train_ds,
    validation_data=test_ds,
    epochs=1,
)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 88s/step - accuracy: 0.9583 - loss: 0.2239 - val_accuracy: 0.8438 - val_loss: 0.4549


In [13]:
# Score the model on whatever `test_ds` currently holds and report both metrics
eval_results = model.evaluate(test_ds)
loss, accuracy = eval_results
for metric_name, metric_value in (("Loss", loss), ("Accuracy", accuracy)):
    print(f"Test {metric_name}: {metric_value:.4f}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 18s/step - accuracy: 0.8438 - loss: 0.4549
Test Loss: 0.4549
Test Accuracy: 0.8438


In [14]:
# Inference on custom samples
sample_texts = [
    "The stock market crashed today amid global concerns.",
    "The new processor improves performance by 20% over last year.",
    "The company reported higher earnings this quarter."
]

# Reuse the training-time helper so padding, truncation and sequence length
# cannot drift from the settings the model was fine-tuned with.
sample_tokens = tokenize_texts(sample_texts)

inputs = {
    "token_ids": sample_tokens["input_ids"],
    "segment_ids": sample_tokens["token_type_ids"],
    "padding_mask": sample_tokens["attention_mask"]
}

# The model was compiled with a from_logits loss, so its raw outputs are
# converted to per-class probabilities before being reported.
preds = model.predict(inputs)
probs = tf.nn.softmax(preds, axis=-1).numpy()

# Position in this tuple == class id (1 was assigned to topic 3 when labels were built)
CLASS_NAMES = ("Other", "Topic 3")

for text, class_probs in zip(sample_texts, probs):
    label = int(np.argmax(class_probs))
    topic = CLASS_NAMES[label]
    confidence = class_probs[label]
    print(f"Text: {text}")
    print(f"Prediction: {topic} (Confidence: {confidence:.4f})")
    print("-" * 60)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step
Text: The stock market crashed today amid global concerns.
Prediction: Other (Confidence: 0.7532)
------------------------------------------------------------
Text: The new processor improves performance by 20% over last year.
Prediction: Other (Confidence: 0.5500)
------------------------------------------------------------
Text: The company reported higher earnings this quarter.
Prediction: Other (Confidence: 0.6861)
------------------------------------------------------------
