In [None]:
!pip install transformers datasets torch scikit-learn streamlit pyngrok



In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
import torch

# BERT (uncased) tokenizer; the same checkpoint name is used for the model later.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# AG News corpus; the "train" and "test" splits are used further down.
dataset = load_dataset("ag_news")

def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the notebook-level tokenizer.

    Every sequence is padded to, and truncated at, 128 tokens so all rows
    come out with the same length.

    Args:
        examples: A batch whose ``"text"`` entry holds the raw strings.

    Returns:
        Whatever the tokenizer call returns for that batch.
    """
    encoding_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(examples["text"], **encoding_options)

# Pick the (smaller) training/evaluation subsets BEFORE tokenizing, so only the
# 12,000 rows that are actually used get tokenized instead of the full corpus.
# shuffle(seed=42) permutes row indices, so the selected rows are expected to be
# the same ones as when shuffling after tokenization.
from datasets import DatasetDict

subset = DatasetDict({
    "train": dataset["train"].shuffle(seed=42).select(range(10000)),
    "test": dataset["test"].shuffle(seed=42).select(range(2000)),
})

# Apply tokenization
tokenized_dataset = subset.map(tokenize_function, batched=True)

# Format dataset for PyTorch (Trainer expects the target column to be "labels")
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Splits consumed by the Trainer
train_dataset = tokenized_dataset["train"]
test_dataset = tokenized_dataset["test"]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import torch
import os

# Keep Weights & Biases from prompting for an API key during training.
# (transformers reports this variable as deprecated in favour of `report_to`.)
os.environ["WANDB_DISABLED"] = "true"

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# BERT with a freshly initialised 4-way classification head, placed on `device`.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4).to(device)

def compute_metrics(eval_pred):
    """Score one evaluation pass for the Trainer.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        dict with ``"accuracy"`` and the weighted-average ``"f1"`` of the
        arg-max predictions against the labels.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted, average="weighted"),
    }

# Training arguments (use local paths)
training_args = TrainingArguments(
    output_dir="/content/bert_news_classifier",
    eval_strategy="epoch",  # run evaluation (and compute_metrics) after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="/content/logs",
    logging_steps=100,
    # Supported way to switch off logging integrations such as W&B; the
    # WANDB_DISABLED environment variable is deprecated and slated for removal.
    report_to="none",
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Train model
trainer.train()

# Save the fine-tuned model and its tokenizer side by side so the directory
# can be reloaded later with from_pretrained().
model.save_pretrained("/content/bert_news_classifier")
tokenizer.save_pretrained("/content/bert_news_classifier")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2914,0.268952,0.9105,0.910335
2,0.1923,0.277295,0.92,0.920043
3,0.1105,0.302983,0.9215,0.921497


('/content/bert_news_classifier/tokenizer_config.json',
 '/content/bert_news_classifier/special_tokens_map.json',
 '/content/bert_news_classifier/vocab.txt',
 '/content/bert_news_classifier/added_tokens.json',
 '/content/bert_news_classifier/tokenizer.json')

In [None]:
# Run evaluation on the held-out subset; metric keys come back prefixed with "eval_".
eval_results = trainer.evaluate()

accuracy_line = f"Accuracy: {eval_results['eval_accuracy']:.4f}"
f1_line = f"F1-Score: {eval_results['eval_f1']:.4f}"
print(accuracy_line)
print(f1_line)

Accuracy: 0.9215
F1-Score: 0.9215


In [None]:
# Class index -> human-readable topic name, in index order.
label_map = dict(enumerate(["World", "Sports", "Business", "Sci/Tech"]))

# A few hand-written headlines for a quick sanity check of the classifier.
sample_headlines = []
sample_headlines.append("New AI breakthrough in quantum computing")
sample_headlines.append("Stock market crashes amid global uncertainty")
sample_headlines.append("Team wins championship in thrilling final")

# Load tokenizer and model from local path
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Reload the saved artifacts from disk to check that the exported directory is usable.
tokenizer = AutoTokenizer.from_pretrained("/content/bert_news_classifier")
model = AutoModelForSequenceClassification.from_pretrained("/content/bert_news_classifier")
model = model.to(device).eval()  # inference mode on the device chosen earlier

# Classify each sample headline on its own and print the predicted topic.
for headline in sample_headlines:
    encoded = tokenizer(headline, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {name: tensor.to(device) for name, tensor in encoded.items()}
    with torch.no_grad():  # no gradients needed for inference
        outputs = model(**inputs)
    prediction = outputs.logits.argmax(dim=-1).item()
    print(f"Headline: {headline}")
    print(f"Predicted Topic: {label_map[prediction]}\n")

Headline: New AI breakthrough in quantum computing
Predicted Topic: Sci/Tech

Headline: Stock market crashes amid global uncertainty
Predicted Topic: Business

Headline: Team wins championship in thrilling final
Predicted Topic: Sports



In [None]:
%%writefile app.py
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load model and tokenizer from local path
model = AutoModelForSequenceClassification.from_pretrained("/content/bert_news_classifier")
tokenizer = AutoTokenizer.from_pretrained("/content/bert_news_classifier")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Label mapping
label_map = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}

# Streamlit app
st.title("News Topic Classifier")
st.write("Enter a news headline to classify its topic.")

# Input text
headline = st.text_input("Headline", "Enter a news headline here...")

if st.button("Classify"):
    if headline:
        # Tokenize input
        inputs = tokenizer(headline, return_tensors="pt", padding=True, truncation=True, max_length=128)
        inputs = {key: val.to(device) for key, val in inputs.items()}

        # Predict
        with torch.no_grad():
            outputs = model(**inputs)
            prediction = torch.argmax(outputs.logits, dim=-1).item()

        # Display result
        st.write(f"Predicted Topic: **{label_map[prediction]}**")
    else:
        st.write("Please enter a headline.")

Writing app.py


In [None]:
from pyngrok import ngrok

# Set ngrok authtoken (replace with your actual token)
!ngrok authtoken 31onOyFv68hlV5TS5ULVJEDNJFU_5mzRkrTDsJTcWbY32XCb

# Stop any running Streamlit process
!pkill streamlit

# Start Streamlit server
!streamlit run app.py &>/dev/null&

# Create public URL
ngrok.kill()  # Close existing tunnels
public_url = ngrok.connect(8501)
print(f"Streamlit app running at: {public_url}")

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
Streamlit app running at: NgrokTunnel: "https://0ddd53f10b00.ngrok-free.app" -> "http://localhost:8501"


In [None]:
# Tear down the demo: stop the Streamlit server that was started in the
# background, then kill the ngrok process so the public tunnel is closed.
!pkill streamlit
ngrok.kill()

In [None]:
import json

# Persist the headline metrics next to the saved model as pretty-printed JSON.
results = {
    name: eval_results[key]
    for name, key in (("accuracy", "eval_accuracy"), ("f1_score", "eval_f1"))
}
serialized = json.dumps(results, indent=4)
with open("/content/bert_news_classifier/results.json", "w") as f:
    f.write(serialized)
print("Evaluation results saved to /content/bert_news_classifier/results.json")

Evaluation results saved to /content/bert_news_classifier/results.json
