# Recurrent Neural Networks/LSTMs

This notebook demonstrates text classification by fine-tuning a pretrained DistilBERT model on the HuffPost news dataset, using modern PyTorch and Hugging Face libraries.

**Note:** We're using `transformers` and `datasets` instead of the deprecated `torchtext` library.

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from datasets import load_dataset
import numpy as np

# Seed the torch and NumPy RNGs before any stochastic work so that a
# Restart & Run All repeats the same classifier-head initialisation and the
# same training shuffle order. (Bitwise-identical results are still not
# guaranteed on every accelerator backend.)
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Report the library version and the backend that will be selected
# (CUDA first, then Apple MPS, otherwise CPU).
print(f"PyTorch version: {torch.__version__}")
print(f"Device: {torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')}")

PyTorch version: 2.9.0
Device: mps


In [2]:
# Load the preprocessed HuffPost splits and the supporting artifacts from the project
from datasets import load_from_disk
import pickle

print("Loading HuffPost dataset...")
processed_datasets = load_from_disk("huffpost_processed_milestone2")
train_ds = processed_datasets['train']
val_ds = processed_datasets['validation']
test_ds = processed_datasets['test']

# Load label encoder.
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file,
# so only load a label_encoder.pkl that this project produced itself.
with open('label_encoder.pkl', 'rb') as f:
    le = pickle.load(f)

num_labels = len(le.classes_)

# Load class weights and fail fast if they do not line up with the labels;
# otherwise the mismatch would only surface once the weighted loss is first used.
class_weights = np.load('class_weights.npy')
assert class_weights.shape == (num_labels,), (
    f"class_weights.npy has shape {class_weights.shape}, expected ({num_labels},)"
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

print(f"Number of labels: {num_labels}")
print(f"Loaded: Train={len(train_ds)}, Val={len(val_ds)}, Test={len(test_ds)}")

# Initialize tokenizer for DistilBERT (AutoTokenizer is already imported in the first cell)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize function
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the module-level ``tokenizer``.

    Every sequence is truncated and padded to exactly 128 tokens so that all
    examples end up with the same length.

    Args:
        examples: mapping that provides the raw inputs under the key ``"text"``.

    Returns:
        Whatever ``tokenizer`` returns for that batch.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    texts = examples["text"]
    return tokenizer(texts, **encode_options)

# Tokenize the three splits in batches of 1000 examples
print("Tokenizing datasets...")
train_dataset, val_dataset, test_dataset = (
    split.map(tokenize_function, batched=True, batch_size=1000)
    for split in (train_ds, val_ds, test_ds)
)

# Set format for PyTorch: expose only the model inputs and labels, as tensors
for tokenized_split in (train_dataset, val_dataset, test_dataset):
    tokenized_split.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

print("Tokenization complete!")
print(f"Vocab size: {tokenizer.vocab_size}")

Loading HuffPost dataset...
Number of labels: 37
Loaded: Train=140255, Val=30055, Test=30055
Tokenizing datasets...


Map:   0%|          | 0/140255 [00:00<?, ? examples/s]

Map:   0%|          | 0/30055 [00:00<?, ? examples/s]

Map:   0%|          | 0/30055 [00:00<?, ? examples/s]

Tokenization complete!
Vocab size: 30522


In [3]:
# Build a DistilBERT sequence classifier whose output layer has one logit per label
from transformers import AutoModelForSequenceClassification

print("Loading pretrained DistilBERT model...")
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    problem_type="single_label_classification",
    num_labels=num_labels,
)

# Pick the backend in order of preference: CUDA, then Apple MPS, otherwise CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
model = model.to(device)

# Summarise model size and placement
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(
    f"Model loaded with {total_params:,} parameters",
    f"Trainable parameters: {trainable_params:,}",
    f"Device: {device}",
    f"Output classes: {num_labels}",
    sep="\n",
)

Loading pretrained DistilBERT model...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded with 66,981,925 parameters
Trainable parameters: 66,981,925
Device: mps
Output classes: 37


In [4]:
# Training setup for DistilBERT
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

# Fine-tuning hyperparameters
BATCH_SIZE = 16  # Smaller batch size for transformer
LEARNING_RATE = 2e-5  # Standard learning rate for BERT fine-tuning
EPOCHS = 3  # 3-5 epochs typical for fine-tuning

# Data loaders: only the training split is shuffled
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader, test_loader = (
    DataLoader(split, batch_size=BATCH_SIZE) for split in (val_dataset, test_dataset)
)

# Class-weighted cross-entropy loss; the weight tensor is moved onto the model's device
class_weights_tensor = class_weights_tensor.to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)

# AdamW optimizer (standard for transformers) over every model parameter
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)

# Learning rate scheduler: the first 10% of all optimisation steps are warmup
total_steps = EPOCHS * len(train_loader)
warmup_steps = total_steps // 10
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

# Summary of the configuration
print(
    "Training setup complete!",
    f"Train batches per epoch: {len(train_loader)}",
    f"Val batches per epoch: {len(val_loader)}",
    f"Test batches per epoch: {len(test_loader)}",
    f"Learning rate: {LEARNING_RATE}",
    f"Total training steps: {total_steps}",
    f"Using weighted loss for {len(class_weights_tensor)} classes",
    sep="\n",
)

Training setup complete!
Train batches per epoch: 8766
Val batches per epoch: 1879
Test batches per epoch: 1879
Learning rate: 2e-05
Total training steps: 26298
Using weighted loss for 37 classes


In [5]:
# Training loop for DistilBERT
from tqdm import tqdm

def train_epoch(model, loader, criterion, optimizer, scheduler, device):
    """Run one optimisation pass over ``loader`` and report loss and accuracy.

    For each batch the ``input_ids``, ``attention_mask`` and ``labels`` entries
    are moved to ``device``, ``criterion`` is applied to the model's ``.logits``,
    gradients are back-propagated and clipped to a total norm of 1.0, and then
    both ``optimizer`` and ``scheduler`` take a step. A tqdm progress bar shows
    the current batch loss and the running accuracy.

    Args:
        model: callable module accepting ``input_ids`` / ``attention_mask``
            keyword arguments and returning an object with a ``.logits`` attribute.
        loader: sized iterable of batches (dicts with the three keys above).
        criterion: loss callable taking ``(logits, labels)``.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        device: device the batch tensors are moved to.

    Returns:
        ``(mean_loss, accuracy_pct)``: the sum of per-batch losses divided by
        ``len(loader)``, and the percentage of samples predicted correctly.
    """
    model.train()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    progress = tqdm(loader, desc="Training")
    for batch in progress:
        ids, mask, targets = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'labels')
        )

        optimizer.zero_grad()

        # The model output exposes the class scores as `.logits`
        logits = model(input_ids=ids, attention_mask=mask).logits
        loss = criterion(logits, targets)
        loss.backward()

        # Clip the total gradient norm before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Update running statistics
        batch_loss = loss.item()
        running_loss += batch_loss
        predicted = torch.max(logits, 1).indices
        n_seen += targets.size(0)
        n_correct += (predicted == targets).sum().item()

        progress.set_postfix({'loss': f'{batch_loss:.3f}', 'acc': f'{100*n_correct/n_seen:.1f}%'})

    mean_loss = running_loss / len(loader)
    accuracy_pct = 100 * n_correct / n_seen
    return mean_loss, accuracy_pct

def evaluate(model, loader, criterion, device):
    """Score ``model`` on ``loader`` without updating any parameters.

    The model is switched to eval mode and every batch is processed with
    gradient tracking disabled. Batches are dicts with ``input_ids``,
    ``attention_mask`` and ``labels`` entries, each moved to ``device``.

    Args:
        model: callable module accepting ``input_ids`` / ``attention_mask``
            keyword arguments and returning an object with a ``.logits`` attribute.
        loader: sized iterable of batches.
        criterion: loss callable taking ``(logits, labels)``.
        device: device the batch tensors are moved to.

    Returns:
        ``(mean_loss, accuracy_pct)``: the sum of per-batch losses divided by
        ``len(loader)``, and the percentage of samples predicted correctly.
    """
    model.eval()
    loss_sum = 0
    hits = 0
    seen = 0

    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            ids, mask, targets = (
                batch[key].to(device) for key in ('input_ids', 'attention_mask', 'labels')
            )

            logits = model(input_ids=ids, attention_mask=mask).logits

            loss_sum += criterion(logits, targets).item()
            predicted = torch.max(logits, 1).indices
            seen += targets.size(0)
            hits += (predicted == targets).sum().item()

    mean_loss = loss_sum / len(loader)
    accuracy_pct = 100 * hits / seen
    return mean_loss, accuracy_pct

# Fine-tune for EPOCHS passes, checkpointing whenever validation accuracy improves
print("Starting DistilBERT fine-tuning...\n")
best_val_acc = 0

for epoch in range(EPOCHS):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, scheduler, device)
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)

    # Per-epoch report
    print(
        f"\nEpoch {epoch+1}/{EPOCHS}:",
        f"  Train Loss: {train_loss:.3f} | Train Acc: {train_acc:.2f}%",
        f"  Val Loss: {val_loss:.3f} | Val Acc: {val_acc:.2f}%",
        sep="\n",
    )

    # Keep the checkpoint (model + tokenizer) with the highest validation accuracy so far
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        model.save_pretrained('best_distilbert_model')
        tokenizer.save_pretrained('best_distilbert_model')
        print("  ✓ New best model saved!")
    print()

print(f"Training complete! Best validation accuracy: {best_val_acc:.2f}%")

# Final test evaluation: reload the best checkpoint from disk rather than
# scoring whatever weights the last epoch left in memory
print("\nEvaluating on test set...")
model = AutoModelForSequenceClassification.from_pretrained('best_distilbert_model')
model = model.to(device)
test_loss, test_acc = evaluate(model, test_loader, criterion, device)
print(f"Test Loss: {test_loss:.3f} | Test Acc: {test_acc:.2f}%")

Starting DistilBERT fine-tuning...



Training: 100%|██████████| 8766/8766 [18:00<00:00,  8.11it/s, loss=1.821, acc=54.0%]
Evaluating: 100%|██████████| 1879/1879 [01:03<00:00, 29.76it/s]



Epoch 1/3:
  Train Loss: 1.805 | Train Acc: 54.03%
  Val Loss: 1.262 | Val Acc: 64.11%
  ✓ New best model saved!



Training: 100%|██████████| 8766/8766 [18:00<00:00,  8.11it/s, loss=1.089, acc=68.2%]
Evaluating: 100%|██████████| 1879/1879 [01:01<00:00, 30.67it/s]



Epoch 2/3:
  Train Loss: 1.047 | Train Acc: 68.25%
  Val Loss: 1.189 | Val Acc: 66.76%
  ✓ New best model saved!



Training: 100%|██████████| 8766/8766 [17:48<00:00,  8.20it/s, loss=0.130, acc=74.9%]
Evaluating: 100%|██████████| 1879/1879 [01:04<00:00, 29.24it/s]



Epoch 3/3:
  Train Loss: 0.772 | Train Acc: 74.87%
  Val Loss: 1.210 | Val Acc: 67.98%
  ✓ New best model saved!

Training complete! Best validation accuracy: 67.98%

Evaluating on test set...


Evaluating: 100%|██████████| 1879/1879 [01:02<00:00, 30.02it/s]

Test Loss: 1.238 | Test Acc: 67.79%



