In [1]:
import os
import tensorflow_datasets as tfds
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from torch.utils.data import Dataset

In [2]:

# ============ STEP 0: Disable WandB ============
os.environ["WANDB_DISABLED"] = "true"

In [3]:
# ============ STEP 1: Load Dataset ============
ds, info = tfds.load("ag_news_subset", with_info=True, as_supervised=True)
train_ds, test_ds = ds["train"], ds["test"]



Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/ag_news_subset/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/ag_news_subset/incomplete.X1F5I4_1.0.0/ag_news_subset-train.tfrecord*...: â€¦

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/ag_news_subset/incomplete.X1F5I4_1.0.0/ag_news_subset-test.tfrecord*...:  â€¦

Dataset ag_news_subset downloaded and prepared to /root/tensorflow_datasets/ag_news_subset/1.0.0. Subsequent calls will reuse this data.


In [4]:
# ============ STEP 2: Tokenization ============
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_batch(texts, labels):
    tokens = tokenizer(
        list(texts),
        padding="max_length",
        truncation=True,
        max_length=64,   # shorter sequences for speed
        return_tensors="pt"
    )
    tokens["labels"] = labels
    return tokens

def tfds_to_torch(dataset):
    texts, labels = [], []
    for text, label in tfds.as_numpy(dataset):
        texts.append(text.decode())
        labels.append(label)
    return tokenize_batch(texts, torch.tensor(labels))

train_encodings = tfds_to_torch(train_ds)
test_encodings = tfds_to_torch(test_ds)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [5]:
# ============ STEP 3: Custom PyTorch Dataset ============
class AGNewsDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __len__(self):
        return len(self.encodings["input_ids"])
    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

train_dataset = AGNewsDataset(train_encodings)
test_dataset = AGNewsDataset(test_encodings)

In [6]:
# ============ STEP 4: Load Model ============
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=4
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# ============ STEP 5: Training Arguments ============
training_args = TrainingArguments(
    output_dir="./results",
    do_train=True,
    do_eval=True,
    eval_steps=200,
    save_steps=200,
    per_device_train_batch_size=32,   # increase if GPU has memory
    per_device_eval_batch_size=32,
    num_train_epochs=1,               # reduce epochs for speed
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
    learning_rate=5e-5,               # AdamW default
    fp16=True,                        # mixed precision if GPU supports
)

In [8]:
# ============ STEP 6: Trainer ============
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [9]:
# ============ STEP 7: Train ============
trainer.train()


Step,Training Loss
50,0.7427
100,0.3583
150,0.3491
200,0.3342
250,0.3177
300,0.3382
350,0.3004
400,0.301
450,0.3056
500,0.3024


TrainOutput(global_step=3750, training_loss=0.23320042724609374, metrics={'train_runtime': 1034.9702, 'train_samples_per_second': 115.945, 'train_steps_per_second': 3.623, 'total_flos': 3946736701440000.0, 'train_loss': 0.23320042724609374, 'epoch': 1.0})

In [10]:
# Save fine-tuned model
model.save_pretrained("./fine-tuned-bert-agnews")
tokenizer.save_pretrained("./fine-tuned-bert-agnews")

('./fine-tuned-bert-agnews/tokenizer_config.json',
 './fine-tuned-bert-agnews/special_tokens_map.json',
 './fine-tuned-bert-agnews/vocab.txt',
 './fine-tuned-bert-agnews/added_tokens.json',
 './fine-tuned-bert-agnews/tokenizer.json')

In [11]:
# ============ STEP 8: Evaluate ============
results = trainer.evaluate()
print("Evaluation:", results)

Evaluation: {'eval_loss': 0.18448281288146973, 'eval_runtime': 7.0834, 'eval_samples_per_second': 1072.925, 'eval_steps_per_second': 33.599, 'epoch': 1.0}


In [12]:
# ============ STEP 9: Prediction Helper ============
label_map = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}

def predict(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=64)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()
    return label_map[predicted_class]

In [13]:
# Reload model + tokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_path = "./fine-tuned-bert-agnews"
model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_path)

label_map = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}

def predict(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=64)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # ðŸ”‘ Move inputs to GPU
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()
    return label_map[predicted_class]

# Test again
print("Prediction:", predict("The stock market crashed after the company announced bankruptcy."))
print("Prediction:", predict("Lionel Messi scored a hat-trick in yesterday's match."))
print("Prediction:", predict("NASA is planning a new mission to study Jupiter."))


Prediction: Business
Prediction: World
Prediction: Sci/Tech
