# Document Classification using Neural Networks and TorchText
This notebook demonstrates how to implement a simple document classification model using PyTorch and TorchText. We will work with the AG News dataset and build a pipeline for:
1. Text preprocessing and tokenization
2. Embedding text using `nn.EmbeddingBag`
3. Building a feedforward neural network
4. Making predictions using `argmax` over logits
5. Understanding logits, classes, and hyperparameters

---

In [None]:
# Step 1: Imports and Setup
import itertools
import time

import numpy as np
import torch
import torch.nn as nn
import torchtext
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import AG_NEWS
from torchtext.vocab import build_vocab_from_iterator

# Set seed for reproducibility: seeds PyTorch's global random number
# generator so that random operations later in the notebook (e.g. the
# uniform weight initialisation in the model) repeat across runs.
torch.manual_seed(42)


In [None]:
# Step 2: Load AG_NEWS Dataset and Tokenize
tokenizer = get_tokenizer('basic_english')
train_iter = AG_NEWS(split='train')

def yield_tokens(data_iter):
    """Lazily yield the token list for the text of each (label, text) pair.

    The first element of every pair is ignored; only the text is passed
    through the module-level ``tokenizer``.
    """
    yield from (tokenizer(text) for _label, text in data_iter)

# Build the vocabulary from the tokenized training texts. "<unk>" is the only
# special token and is also registered as the default index, so tokens that
# are not in the vocabulary are looked up as "<unk>".
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

print("Vocabulary size:", len(vocab))

In [None]:
# Step 3: Pipelines to encode raw text as token indices and labels as class ids
def text_pipeline(x):
    """Tokenize the raw string ``x`` and look up every token in ``vocab``."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Convert a dataset label to an integer shifted down by one, so labels start from 0."""
    return int(x) - 1


# Preview a sample
example_text = "Google's quantum computer achieves new milestone in speed."
print("Token indices:", text_pipeline(example_text))

In [None]:
# Step 4: Create Batch Function with Offsets (for EmbeddingBag)
def collate_batch(batch):
    """Collate (label, text) samples into the flat layout used with ``nn.EmbeddingBag``.

    Args:
        batch: Iterable of ``(label, text)`` pairs; each label goes through
            ``label_pipeline`` and each text through ``text_pipeline``.

    Returns:
        A tuple ``(label_tensor, text_tensor, offsets)`` where
        ``label_tensor`` holds one int64 label per sample, ``text_tensor``
        is the concatenation of every sample's int64 token indices, and
        ``offsets`` gives the start position of each sample inside
        ``text_tensor``.
    """
    labels = []
    encoded = []
    for raw_label, raw_text in batch:
        labels.append(label_pipeline(raw_label))
        encoded.append(torch.tensor(text_pipeline(raw_text), dtype=torch.int64))

    # Start positions are a running sum of the preceding lengths: prepend 0,
    # drop the final length (nothing starts after the last sample), cumsum.
    lengths = [0, *(tokens.size(0) for tokens in encoded)]
    label_tensor = torch.tensor(labels, dtype=torch.int64)
    offsets = torch.tensor(lengths[:-1]).cumsum(dim=0)
    text_tensor = torch.cat(encoded)
    return label_tensor, text_tensor, offsets

# Create DataLoader
# Only the first 1000 training samples are used. islice stops reading the
# dataset iterator after those samples instead of materialising the whole
# training split and slicing it afterwards.
train_iter = AG_NEWS(split='train')
dataloader = DataLoader(
    list(itertools.islice(train_iter, 1000)),
    batch_size=8,
    shuffle=True,
    collate_fn=collate_batch,
)

In [None]:
# Step 5: Define Model Architecture (EmbeddingBag + Linear Layer)
class TextClassificationModel(nn.Module):
    """Text classifier made of an ``nn.EmbeddingBag`` followed by one linear layer.

    The embedding bag pools the token embeddings of each document into a
    single ``embed_dim`` vector; the linear layer maps that vector to
    ``num_class`` logits.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        """Create the layers and apply the custom weight initialisation.

        Args:
            vocab_size: Number of rows in the embedding table.
            embed_dim: Size of each embedding vector.
            num_class: Number of output logits.
        """
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Fill both weight matrices uniformly in [-0.5, 0.5] and zero the bias."""
        bound = 0.5
        # In-place initialisation must not be tracked by autograd.
        with torch.no_grad():
            self.embedding.weight.uniform_(-bound, bound)
            self.fc.weight.uniform_(-bound, bound)
            self.fc.bias.zero_()

    def forward(self, text, offsets):
        """Return the logits for a batch given flat token indices and bag offsets."""
        pooled = self.embedding(text, offsets)
        logits = self.fc(pooled)
        return logits

# Hyperparameters: embedding width, number of target classes, and the
# embedding-table size taken from the vocabulary built earlier.
embed_dim = 64
num_classes = 4
vocab_size = len(vocab)

model = TextClassificationModel(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    num_class=num_classes,
)
print(model)

In [None]:
# Step 6: Make Predictions with Argmax
# Fetch a single batch from the DataLoader.
labels, text, offsets = next(iter(dataloader))

# This is inference only, so the forward pass runs under no_grad to avoid
# recording an autograd graph for the logits.
with torch.no_grad():
    outputs = model(text, offsets)

# The predicted class of each sample is the index of its largest logit.
predictions = torch.argmax(outputs, dim=1)
print("Logits:\n", outputs)
print("Predicted classes:", predictions)
print("True labels:", labels)

## Summary
In this notebook, you:
- Loaded the AG_NEWS dataset using TorchText
- Built a vocabulary and tokenized the data
- Used `nn.EmbeddingBag` to aggregate word embeddings efficiently
- Built a simple classifier with a linear output layer
- Used the `argmax` function to predict classes from logits

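**Next steps**: Train the model using a loss function like `CrossEntropyLoss` and an optimizer like `SGD`, and evaluate it on a test set. Note that the embedding layer is created with `sparse=True`, which produces sparse gradients; to use `Adam` instead, either set `sparse=False` or optimize the embedding with `SparseAdam`.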

---