In [3]:
# Install required packages
!pip install datasets transformers

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl 

In [18]:
import torch
import tensorflow

In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


 Load the AG News Dataset

In [8]:
# Load the AG News dataset
from datasets import load_dataset
ag_news_dataset = load_dataset("ag_news")

# This will give you train and test splits
train_dataset = ag_news_dataset["train"]
test_dataset = ag_news_dataset["test"]

# Print some information about the dataset
print(f"Number of training examples: {len(train_dataset)}")
print(f"Number of test examples: {len(test_dataset)}")
print(f"Sample example: {train_dataset[0]}")

# Map class indices to category names
class_mapping = {
    0: 'World',
    1: 'Sports',
    2: 'Business',
    3: 'Sci/Tech'
}

# Print the class distribution in the training set
from collections import Counter
class_counts = Counter(train_dataset['label'])
print("\nClass distribution in training set:")
for label_id, count in class_counts.items():
    print(f"{class_mapping[label_id]}: {count} examples")


# First, let's check the actual structure of the dataset
print("Dataset type:", type(train_dataset))
print("First element type:", type(train_dataset[0]))
# Display a few examples from each class

for label_id, class_name in class_mapping.items():
    examples = [ex for ex in train_dataset.select(range(100)) if ex['label'] == label_id]
    if examples:
        print(f"\n{class_name} example:")
        print(f"Text: {examples[0]['text']}")


Number of training examples: 120000
Number of test examples: 7600
Sample example: {'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", 'label': 2}

Class distribution in training set:
Business: 30000 examples
Sci/Tech: 30000 examples
Sports: 30000 examples
World: 30000 examples
Dataset type: <class 'datasets.arrow_dataset.Dataset'>
First element type: <class 'dict'>

Business example:
Text: Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.

Sci/Tech example:
Text: 'Madden,' 'ESPN' Football Score in Different Ways (Reuters) Reuters - Was absenteeism a little high\on Tuesday among the guys at the office? EA Sports would like\to think it was because "Madden NFL 2005" came out that day,\and some fans of the football simulation are rabid enough to\take a sick day to play it.


Prepare Data for Training

In [9]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer
from sklearn.model_selection import train_test_split

# Initialize tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Create a custom dataset class
class BlogClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize the text
        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )

        # Convert label to tensor
        label_tensor = torch.tensor(label)

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': label_tensor
        }

# Extract texts and labels
train_texts = [item['text'] for item in train_dataset]
train_labels = [item['label'] for item in train_dataset]
test_texts = [item['text'] for item in test_dataset]
test_labels = [item['label'] for item in test_dataset]

# Split training data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.1, random_state=42, stratify=train_labels
)

# Create datasets
train_data = BlogClassificationDataset(train_texts, train_labels, tokenizer)
val_data = BlogClassificationDataset(val_texts, val_labels, tokenizer)
test_data = BlogClassificationDataset(test_texts, test_labels, tokenizer)

# Create dataloaders
batch_size = 16
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)

print(f"Train loader size: {len(train_loader)}")
print(f"Validation loader size: {len(val_loader)}")
print(f"Test loader size: {len(test_loader)}")

# Check a sample batch
sample_batch = next(iter(train_loader))
print("\nSample batch keys:", sample_batch.keys())
print("Input IDs shape:", sample_batch['input_ids'].shape)
print("Attention mask shape:", sample_batch['attention_mask'].shape)
print("Labels shape:", sample_batch['labels'].shape)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Train loader size: 6750
Validation loader size: 750
Test loader size: 475

Sample batch keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
Input IDs shape: torch.Size([16, 128])
Attention mask shape: torch.Size([16, 128])
Labels shape: torch.Size([16])


In [24]:
# For PyTorch models
from transformers import DistilBertModel, DistilBertConfig

# Create a configuration
config = DistilBertConfig()

# Create a model instance
model = DistilBertModel(config)

# Print the model architecture
print(model)


DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

Create the Model with Modified Layers

In [11]:
import torch.nn as nn
from transformers import DistilBertModel

class BlogClassifier(nn.Module):
    def __init__(self, pretrained_model_name="distilbert-base-uncased", num_classes=4):
        super(BlogClassifier, self).__init__()
        self.base_model = DistilBertModel.from_pretrained(pretrained_model_name)
        self.dropout = nn.Dropout(0.1)

        # Custom classification head with intermediate layer
        self.classifier = nn.Sequential(
            nn.Linear(768, 256),  # Reduce dimension from BERT's hidden size
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, num_classes)  # Output layer for 4 classes
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]  # Use [CLS] token representation
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

# Initialize model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BlogClassifier().to(device)

# Freeze base model layers
for param in model.base_model.parameters():
    param.requires_grad = False

# Unfreeze the last transformer layer for fine-tuning
for param in model.base_model.transformer.layer[-1].parameters():
    param.requires_grad = True

print(f"Model initialized on device: {device}")
print("Model architecture:")
print(model)

# Count trainable parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Number of trainable parameters: {trainable_params}")


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Model initialized on device: cuda
Model architecture:
BlogClassifier(
  (base_model): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout

In [13]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
import time
import numpy as np
from sklearn.metrics import accuracy_score, classification_report

def train_model(model, train_loader, val_loader, epochs=2, lr=2e-5):
    # Initialize optimizer
    optimizer = AdamW(
        [p for p in model.parameters() if p.requires_grad],
        lr=lr,
        weight_decay=0.01
    )

    # Set up learning rate scheduler
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    # Loss function
    criterion = nn.CrossEntropyLoss()

    # Training loop
    best_val_accuracy = 0.0

    for epoch in range(epochs):
        start_time = time.time()

        # Training phase
        model.train()
        train_loss = 0.0
        train_preds = []
        train_labels = []

        for batch in train_loader:
            # Move batch to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            optimizer.zero_grad()
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            scheduler.step()

            # Track statistics
            train_loss += loss.item()
            _, predicted = torch.max(logits, 1)
            train_preds.extend(predicted.cpu().numpy())
            train_labels.extend(labels.cpu().numpy())

        train_accuracy = accuracy_score(train_labels, train_preds)
        avg_train_loss = train_loss / len(train_loader)

        # Validation phase
        model.eval()
        val_loss = 0.0
        val_preds = []
        val_labels = []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                logits = model(input_ids, attention_mask)
                loss = criterion(logits, labels)

                val_loss += loss.item()
                _, predicted = torch.max(logits, 1)
                val_preds.extend(predicted.cpu().numpy())
                val_labels.extend(labels.cpu().numpy())

        val_accuracy = accuracy_score(val_labels, val_preds)
        avg_val_loss = val_loss / len(val_loader)

        # Print statistics
        elapsed_time = time.time() - start_time
        print(f"Epoch {epoch+1}/{epochs} - Time: {elapsed_time:.2f}s")
        print(f"Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
        print(f"Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

        # Save best model
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            # Save to Google Drive
            save_path = '/content/drive/MyDrive/blog_classifier_model.pt'
            torch.save(model.state_dict(), save_path)
            print(f"Saved best model with accuracy: {val_accuracy:.4f} to {save_path}")

        print("-" * 50)

    return model

# Train the model
print(f"Starting training on {device}...")
model = train_model(model, train_loader, val_loader, epochs=3, lr=2e-5)


Starting training on cuda...
Epoch 1/3 - Time: 600.71s
Train Loss: 0.2662, Train Accuracy: 0.9088
Val Loss: 0.1990, Val Accuracy: 0.9278
Saved best model with accuracy: 0.9278 to /content/drive/MyDrive/blog_classifier_model.pt
--------------------------------------------------
Epoch 2/3 - Time: 598.68s
Train Loss: 0.1962, Train Accuracy: 0.9306
Val Loss: 0.1837, Val Accuracy: 0.9332
Saved best model with accuracy: 0.9332 to /content/drive/MyDrive/blog_classifier_model.pt
--------------------------------------------------
Epoch 3/3 - Time: 596.20s
Train Loss: 0.1757, Train Accuracy: 0.9388
Val Loss: 0.1776, Val Accuracy: 0.9365
Saved best model with accuracy: 0.9365 to /content/drive/MyDrive/blog_classifier_model.pt
--------------------------------------------------


Initial Dimensions (Before Changes)
When using a pre-trained model like DistilBERT:

Input embedding dimension: 768 (standard for DistilBERT base model)

Hidden layer dimension: 768

Output dimension: 4 (corresponding to the four categories: World, Sports, Business, Sci/Tech)

Dimensions After Changes
After modifying the layers for yur specific classification task:

Input embedding dimension: 768 (unchanged from base model)

Intermediate layer dimension: 256 (reduced from 768)

Output dimension: 4 (corresponding to  four categories)

Evaluate

In [16]:
def evaluate_model(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids, attention_mask)
            _, predicted = torch.max(logits, 1)

            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate accuracy
    accuracy = accuracy_score(all_labels, all_preds)
    report = classification_report(all_labels, all_preds, target_names=['World', 'Sports', 'Business', 'Sci/Tech'])

    print(f"Test Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(report)

    return accuracy, all_preds, all_labels

# Load the best model
best_model_path = '/content/drive/MyDrive/blog_classifier_model.pt'
model.load_state_dict(torch.load(best_model_path, map_location=device))

# Evaluate on test set

evaluate_model(model, test_loader)
print("Evaluating the model on test set:")

Test Accuracy: 0.9353

Classification Report:
              precision    recall  f1-score   support

       World       0.95      0.94      0.94      1900
      Sports       0.98      0.98      0.98      1900
    Business       0.92      0.89      0.90      1900
    Sci/Tech       0.90      0.93      0.91      1900

    accuracy                           0.94      7600
   macro avg       0.94      0.94      0.94      7600
weighted avg       0.94      0.94      0.94      7600

Evaluating the model on test set:


In [17]:
def classify_blog(text, model=None, tokenizer=None):
    # If model is not provided, load it
    if model is None:
        model_path = '/content/drive/MyDrive/blog_classifier_model.pt'
        model = BlogClassifier().to(device)
        model.load_state_dict(torch.load(model_path, map_location=device))
        model.eval()

    # If tokenizer is not provided, initialize it
    if tokenizer is None:
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    # Tokenize the input text
    encoding = tokenizer(
        text,
        truncation=True,
        max_length=128,
        padding='max_length',
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Get prediction
    with torch.no_grad():
        logits = model(input_ids, attention_mask)
        _, predicted = torch.max(logits, 1)
        pred_idx = predicted.item()

    # Map prediction index to category
    categories = ['World', 'Sports', 'Business', 'Sci/Tech']
    pred_label = categories[pred_idx]

    return pred_label

# Test the classifier on a sample blog post
sample_blog = """
The latest advancements in quantum computing have shown promising results in solving complex optimization problems.
Researchers at MIT have developed a new algorithm that can factor large numbers exponentially faster than classical computers.
This breakthrough could have significant implications for cryptography and data security in the coming years.
"""

category = classify_blog(sample_blog, model, tokenizer)
print(f"The blog post is classified as: {category}")

# Another example
sample_blog2 = """
The stock market rallied today as investors responded positively to the latest economic data.
The Dow Jones Industrial Average rose by 2.3%, while the NASDAQ composite index gained 3.1%.
Analysts attribute the gains to better-than-expected corporate earnings and reduced inflation concerns.
"""

category2 = classify_blog(sample_blog2, model, tokenizer)
print(f"The blog post is classified as: {category2}")


The blog post is classified as: Sci/Tech
The blog post is classified as: Business
