## Acknowledgments:
"""
The core concepts and architectural patterns implemented here were learned from and inspired by several excellent educational resources, including Jay Alammar's "The Illustrated Transformer", Andrej Karpathy's "Let's build GPT", and Josh Starmer's course on DeepLearning.AI.
"""

In [1]:
# --- IMPORTS ---

# Core PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

# TorchText for NLP
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence

# Standard Python Libraries
import math
import os
from itertools import islice

# Utilities
import portalocker # For safe data downloading

# custom modules
import sys
import os
# Go up one level from 'notebooks' to the project root
sys.path.append(os.path.abspath('..')) 


from model import TransformerClassifier
from utils import evaluate
import config


print("✅ All libraries and modules imported successfully.")

✅ All libraries and modules imported successfully.


----

In [2]:
## Part1: Data Preparation

def getSrcdata(full_dataset=False, sample_size=1000, cache_dir="./ag_news_cache"):
    """Load AG_NEWS and build a vocabulary from the training examples.

    Args:
        full_dataset: If true, every example of both splits is materialised;
            otherwise only the first ``sample_size`` examples of each split.
        sample_size: Number of leading examples kept per split when
            ``full_dataset`` is false.
        cache_dir: Directory passed to ``AG_NEWS`` as ``root``. It is created
            when missing and also holds the ``download.lock`` file.

    Returns:
        A ``(train_sample, test_sample, vocab)`` tuple: two lists of dataset
        items and a vocab built from the training texts with the specials
        ``<unk>`` and ``<pad>``; unknown tokens map to ``<unk>``.

    Note:
        Tokenisation uses the module-level ``tokenizer``, which therefore has
        to be defined before this function is called.
    """
    # NOTE(review): this thread setting is not restored before returning, so
    # it stays in effect for whatever runs afterwards -- confirm that is wanted.
    torch.set_num_threads(1)

    os.makedirs(cache_dir, exist_ok=True)

    # Hold an exclusive file lock while the dataset is opened so concurrent
    # runs do not touch the cache directory at the same time.
    with open(os.path.join(cache_dir, "download.lock"), "w") as lock_handle:
        portalocker.lock(lock_handle, portalocker.LOCK_EX)
        try:
            train_iter, test_iter = AG_NEWS(root=cache_dir, split=('train', 'test'))
        finally:
            portalocker.unlock(lock_handle)

    if not full_dataset:
        print(f"Loading SAMPLE of {sample_size}...")
        train_sample = list(islice(train_iter, sample_size))
        test_sample = list(islice(test_iter, sample_size))
    else:
        print("Loading FULL dataset...")
        train_sample = list(train_iter)
        test_sample = list(test_iter)

    print(f"Training sample size: {len(train_sample)}")
    print(f"Test sample size: {len(test_sample)}")

    # The vocab is built from whatever training examples were kept above;
    # each item is unpacked as (label, text) and only the text is tokenised.
    token_stream = (tokenizer(text) for _, text in train_sample)
    vocab = build_vocab_from_iterator(token_stream, specials=["<unk>", "<pad>"])
    vocab.set_default_index(vocab["<unk>"])

    return train_sample, test_sample, vocab




In [3]:
## Part2: Define a collate function to process batches

def collate_batch(batch):
    """Turn a list of (label, text) items into padded tensors for one batch.

    Each text is tokenised with the module-level ``tokenizer`` and mapped to
    ids through the module-level ``vocab``. Labels are shifted down by one.
    Sequences are padded with the module-level ``PAD_IDX`` to the length of
    the longest sequence in the batch.

    Args:
        batch: Iterable of ``(label, text)`` pairs.

    Returns:
        ``(padded_text, labels_tensor)``: a ``(batch, max_len)`` long tensor of
        token ids and a long tensor holding one shifted label per item.
    """
    targets = []
    sequences = []
    for raw_label, raw_text in batch:
        # Shift the label down by one so the smallest raw label becomes 0.
        targets.append(raw_label - 1)
        token_ids = [vocab[piece] for piece in tokenizer(raw_text)]
        sequences.append(torch.tensor(token_ids, dtype=torch.long))

    labels_tensor = torch.tensor(targets, dtype=torch.long)
    padded_text = pad_sequence(sequences, batch_first=True, padding_value=PAD_IDX)
    return padded_text, labels_tensor

In [4]:
## Part3: Define the tokenizer (should be global)

# Kept as a module-level name on purpose: collate_batch and the token
# generator inside getSrcdata both look up `tokenizer` as a global when they
# run, so this cell has to be executed before either of them is called.
tokenizer = get_tokenizer("basic_english")
print("✅ Tokenizer defined globally.")

✅ Tokenizer defined globally.


In [5]:
## Part5: Load the data, persist the vocab, and build the DataLoaders

# 1. Get the raw data and vocab. config.FULL_DATASET decides whether the whole
#    dataset or only a leading sample of each split is loaded.
train_sample, test_sample, vocab = getSrcdata(full_dataset=config.FULL_DATASET)

# --- SAVE THE VOCAB ARTIFACT ---
# Create the destination folder first so torch.save does not fail when the
# folder is missing (for example on a fresh checkout).
_vocab_dir = os.path.dirname(config.VOCAB_SAVE_PATH)
if _vocab_dir:
    os.makedirs(_vocab_dir, exist_ok=True)
torch.save(vocab, config.VOCAB_SAVE_PATH)
print(f"✅ Vocab saved to {config.VOCAB_SAVE_PATH}")

# 2. Define PAD_IDX. collate_batch reads this global when a batch is built,
#    so it must exist before the DataLoaders are iterated.
PAD_IDX = vocab['<pad>']

# 3. Create the DataLoaders. Only the training data is shuffled; the test
#    order stays fixed.
train_dataloader = DataLoader(
    train_sample,
    batch_size=config.BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_batch,
)
test_dataloader = DataLoader(
    test_sample,
    batch_size=config.BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_batch,
)

print("✅ Train and Test DataLoaders created successfully.")



Loading FULL dataset...
Training sample size: 120000
Test sample size: 7600
✅ Vocab saved to ../models/newsclassification_vocab.pth
✅ Train and Test DataLoaders created successfully.


In [6]:
# Model Hyperparameters
# The vocabulary size is only known once the vocab has been built, so it is
# read here rather than taken from config.
VOCAB_SIZE = len(vocab)

# Build the classifier from the configured hyperparameters, then move it to
# the configured device as a separate step.
model = TransformerClassifier(
    vocab_size=VOCAB_SIZE,
    d_model=config.D_MODEL,
    num_heads=config.NUM_HEADS,
    num_layers=config.NUM_LAYERS,
    d_ff=config.D_FF,
    num_classes=config.NUM_CLASSES,
)
model = model.to(config.DEVICE)

# Loss and optimiser used by the training loop.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=config.LEARNING_RATE)


In [7]:
# --- DATALOADER DEFINITIVE TEST ---
# Pulls a single batch from the training DataLoader and verifies that every
# label falls inside the range the model was built for.
print("--- Inspecting the first batch from the DataLoader ---")

# Use the same class count the model is constructed with, so this check cannot
# drift away from the real model configuration. Bound outside the try block so
# the name exists even when fetching the batch fails.
NUM_CLASSES = config.NUM_CLASSES

# Get one batch of data from the loader
try:
    first_batch_tokens, first_batch_labels = next(iter(train_dataloader))

    # Perform the sanity check on THIS BATCH
    unique_labels = first_batch_labels.unique()
    min_label = first_batch_labels.min().item()
    max_label = first_batch_labels.max().item()

    print(f"Batch token shape: {first_batch_tokens.shape}")
    print(f"Batch label shape: {first_batch_labels.shape}")
    print(f"\nUnique labels in this batch: {unique_labels}")
    print(f"Minimum label in this batch: {min_label}")
    print(f"Maximum label in this batch: {max_label}")

    if max_label >= NUM_CLASSES or min_label < 0:
        print("\n❌ FATAL ERROR: The DataLoader is producing labels that are out of bounds!")
        print(f"   - Model expects labels from 0 to {NUM_CLASSES-1}.")
        print(f"   - This batch contains labels from {min_label} to {max_label}.")
    else:
        print("\n✅ SUCCESS: The labels in this batch are correct.")

# Deliberately broad: this cell is a diagnostic and should report any failure
# instead of aborting the notebook run.
except Exception as e:
    print(f"\n❌ An error occurred while trying to get a batch: {e}")

--- Inspecting the first batch from the DataLoader ---
Batch token shape: torch.Size([64, 83])
Batch label shape: torch.Size([64])

Unique labels in this batch: tensor([0, 1, 2, 3])
Minimum label in this batch: 0
Maximum label in this batch: 3

✅ SUCCESS: The labels in this batch are correct.


In [8]:
# --- Initialize the Supervisor's State for Early Stopping ---
epochs_without_improvement = 0
best_test_accuracy = 0.0

# Make sure the checkpoint folder exists before training starts, so the first
# torch.save cannot fail after a full epoch of work has already been spent.
_checkpoint_dir = os.path.dirname(config.MODEL_SAVE_PATH)
if _checkpoint_dir:
    os.makedirs(_checkpoint_dir, exist_ok=True)

# NOTE(review): the test set drives both early stopping and checkpoint
# selection below, so the reported best accuracy is chosen on the same data it
# is measured on. Consider a separate validation split for model selection.
for epoch in range(config.NUM_EPOCHS):

    # --- Training Phase ---
    model.train()
    total_loss = 0.0

    for batch_tokens, batch_labels in train_dataloader:
        batch_tokens = batch_tokens.to(config.DEVICE)
        batch_labels = batch_labels.to(config.DEVICE)

        # Clear gradients left over from the previous step before computing
        # new ones for this batch.
        optimizer.zero_grad()

        logits = model(batch_tokens)
        loss = loss_fn(logits, batch_labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Average training loss per batch for this epoch
    avg_loss = total_loss / len(train_dataloader)

    # --- Evaluation Phase ---
    test_accuracy = evaluate(model, test_dataloader, config.DEVICE)
    print(f"Epoch {epoch+1:02d}/{config.NUM_EPOCHS} | Train Loss: {avg_loss:.4f} | Test Accuracy: {test_accuracy:.4f}")

    # --- (EARLY STOPPING) ---
    # Only a strict improvement resets the patience counter and overwrites the
    # saved checkpoint; a tie counts as "no improvement".
    if test_accuracy > best_test_accuracy:
        best_test_accuracy = test_accuracy
        epochs_without_improvement = 0
        torch.save(model.state_dict(), config.MODEL_SAVE_PATH)
        print(f"  -> New best model saved with accuracy: {best_test_accuracy:.4f}")
    else:
        epochs_without_improvement += 1
        print(f"  -> No improvement. Patience: {epochs_without_improvement}/{config.PATIENCE}")

    # Check if the supervisor should stop the training
    if epochs_without_improvement >= config.PATIENCE:
        print(f"\nEARLY STOPPING after {config.PATIENCE} epochs without improvement.")
        break  # Exit the training loop



Epoch 01/20 | Train Loss: 0.5572 | Test Accuracy: 0.8674
  -> New best model saved with accuracy: 0.8674
Epoch 02/20 | Train Loss: 0.3469 | Test Accuracy: 0.8847
  -> New best model saved with accuracy: 0.8847
Epoch 03/20 | Train Loss: 0.2894 | Test Accuracy: 0.8957
  -> New best model saved with accuracy: 0.8957
Epoch 04/20 | Train Loss: 0.2527 | Test Accuracy: 0.9033
  -> New best model saved with accuracy: 0.9033
Epoch 05/20 | Train Loss: 0.2281 | Test Accuracy: 0.9125
  -> New best model saved with accuracy: 0.9125
Epoch 06/20 | Train Loss: 0.2082 | Test Accuracy: 0.9076
  -> No improvement. Patience: 1/3
Epoch 07/20 | Train Loss: 0.1911 | Test Accuracy: 0.9133
  -> New best model saved with accuracy: 0.9133
Epoch 08/20 | Train Loss: 0.1735 | Test Accuracy: 0.9120
  -> No improvement. Patience: 1/3
Epoch 09/20 | Train Loss: 0.1610 | Test Accuracy: 0.9146
  -> New best model saved with accuracy: 0.9146
Epoch 10/20 | Train Loss: 0.1486 | Test Accuracy: 0.9163
  -> New best model save