<a href="https://colab.research.google.com/github/shamrosewebdev/News_Topic_Classifier_Using_BERT/blob/main/News_Topic_Classifier_Using_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Libraries
!pip install -U transformers accelerate datasets evaluate


import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)
import evaluate

In [None]:
# Load the AG News dataset (all available splits)
dataset = load_dataset(path="ag_news")

# Show the splits and their columns as the cell output
dataset

In [None]:
# Load Tokenizer

# Checkpoint whose vocabulary is used to encode the news text.
tokenizer_checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)

In [None]:
# Tokenization Function

def tokenize_function(example):
    """Encode the ``"text"`` field of ``example`` with the notebook-level ``tokenizer``.

    Args:
        example: Mapping with a ``"text"`` entry (a single string or a batch
            of strings, depending on how the function is mapped).

    Returns:
        Whatever ``tokenizer`` returns for that text, with every sequence
        truncated to and padded up to 128 tokens.
    """
    encoding_options = {
        "truncation": True,
        "padding": "max_length",  # pad every sequence to the same fixed length
        "max_length": 128,
    }
    texts = example["text"]
    return tokenizer(texts, **encoding_options)


In [None]:
# Tokenize Dataset

# Batched mapping passes tokenize_function a batch of rows per call.
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)

# Show the resulting splits and columns as the cell output
tokenized_dataset


In [None]:
# Prepare Dataset for PyTorch

# Guard each schema change so this cell can be re-run safely: after the first
# run "text" is already removed and "label" already renamed, and repeating
# either call unguarded would raise.
train_columns = tokenized_dataset["train"].column_names
if "text" in train_columns:
    # Raw strings are no longer needed once the token ids exist.
    tokenized_dataset = tokenized_dataset.remove_columns(["text"])
if "label" in train_columns:
    # The model's forward pass expects the target under the name "labels".
    tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
# Return torch tensors when rows are accessed.
tokenized_dataset.set_format("torch")


In [None]:
# Load BERT Model

# Pretrained encoder plus a classification head with four output classes.
model_checkpoint = "bert-base-uncased"
model = BertForSequenceClassification.from_pretrained(model_checkpoint, num_labels=4)


In [None]:
# Evaluation Metrics

# Load both metric objects in order; compute_metrics reads them as
# notebook-level globals.
accuracy, f1 = (evaluate.load(metric_name) for metric_name in ("accuracy", "f1"))

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax over the last axis of ``logits``.

    Returns:
        Dict with keys ``"accuracy"`` and ``"f1"`` taken from the
        notebook-level ``accuracy`` and ``f1`` metric objects.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    # Both metrics score the same predictions against the same references.
    shared_inputs = {"predictions": predicted_ids, "references": labels}
    accuracy_result = accuracy.compute(**shared_inputs)
    f1_result = f1.compute(average="weighted", **shared_inputs)

    return {
        "accuracy": accuracy_result["accuracy"],
        "f1": f1_result["f1"],
    }


In [None]:
# Training Arguments

training_args = TrainingArguments(
    # Output locations; reporting to external trackers is turned off.
    output_dir="./results",
    logging_dir="./logs",
    report_to="none",
    # Optimisation hyperparameters.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then load the best checkpoint
    # when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)


In [None]:
# Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    # The "test" split is used as the per-epoch evaluation set.
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
    # `tokenizer=` is deprecated on Trainer in favour of `processing_class`;
    # the install cell pulls the latest transformers, so use the current
    # keyword to keep this cell working once the old one is removed.
    processing_class=tokenizer,
)


In [None]:
# Model Training
# Runs fine-tuning with the Trainer configured in the previous cell; the
# value returned by train() is shown as this cell's output.

trainer.train()


In [None]:
# Save the model

# Model weights and tokenizer files go to the same directory so the pair can
# be reloaded together from one path.
model_dir = "./news_bert_model"
trainer.save_model(model_dir)
tokenizer.save_pretrained(model_dir)


In [None]:
# Prediction Test

# Index i holds the display name for predicted class id i.
label_names = ["World", "Sports", "Business", "Sci/Tech"]

def classify_news(text):
    """Return the predicted topic name for a single piece of news text.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: News headline or article text to classify.

    Returns:
        The entry of ``label_names`` at the index of the highest logit.
    """
    # Make sure dropout is disabled; the model's mode depends on what ran
    # before this cell.
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # The tokenizer returns CPU tensors, but the Trainer may have moved the
    # model to an accelerator; mismatched devices raise at forward time.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    pred = torch.argmax(outputs.logits, dim=1).item()
    return label_names[pred]

# Smoke test: classify one sample headline and print the predicted topic name.
print(classify_news("Apple releases new AI powered smartphone"))
