# Learning Project: Document Classification with PyTorch and TorchText

This project builds a text classification model using the AG_NEWS dataset with PyTorch and TorchText. The model predicts the category of a news article (World, Sports, Business, or Sci/Tech) from its raw text. It uses:
- __Tokenization__ and __vocabulary building__ to convert raw text into numeric format
- A __collate function__ with EmbeddingBag for efficient text representation without padding
- A __simple feedforward neural network__ for classification
- __Cross-entropy loss__ and __stochastic gradient descent (SGD)__ for training
- A learning __rate scheduler__ to dynamically adjust training speed

The project demonstrates the full NLP pipeline: from data preprocessing to model training, evaluation, and optimization.

In [1]:
# load libraries 
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from torch.optim.lr_scheduler import StepLR

In [2]:
# Train test split 
train_iter, test_iter = AG_NEWS(split=('train','test'))

In [None]:
# Tokenizaiton and Vocab building 
tokenizer = get_tokenizer('basic_english')
# build vocabulary (with generator function to avoid memory inefficiency)
vocab = build_vocab_from_iterator(
    (tokenizer(text) for _,text in train_iter), 
    specials=['<unk>']
)
vocab.set_default_index(vocab['<unk>'])

In [4]:
# Collate function 
## collate function essentially preprocess the batch on the fly 
def collate_batch(batch): 
    # label_list -> store the true class label as int
    # text_list -> store tokenized & indexed text 
    # offset -> starting index of each sample 
    label_list, text_list, offset = [],[],[0]
    for label,text in batch: 
        label_list.append(label - 1) # -1 for 0 indexed 
        processed_text = torch.tensor(vocab(tokenizer(text)), dtype=torch.int64) # tokenized & numericalized
        text_list.append(processed_text)
        offset.append(processed_text.size(0)) # basically length of tensor .size is a tensor method to get dim
    # convert label list to label tensor 
    label_tensor = torch.tensor(label_list,dtype=torch.int64)
    # convert text list to text tensor 
    text_tensor = torch.cat(text_list)
    # cumulative offsets: start index of each sample 
    offset_tensor = torch.tensor(offset[:-1]).cumsum(dim=0) # last length no needed; dim = 0 row wised 

    return label_tensor,text_tensor,offset_tensor


In [5]:
# DataLoader 
from torch.utils.data.dataset import random_split
# 95% for training and 5% for validation 
t_len = len(list(AG_NEWS(split='train')))
num_train = int(t_len * 0.95)
num_valid = t_len - num_train
# randomly split training set into training and validation 
train_set, valid_set = random_split(list(AG_NEWS(split='train')), [num_train,num_valid])
# wrap training set into train_dataloader 
train_dataloader = DataLoader(train_set,batch_size=8,shuffle=True,collate_fn=collate_batch)
# wrap validation set into valid_dataloader 
valid_dataloader = DataLoader(valid_set,batch_size=8,collate_fn=collate_batch) # no shuffle, deterministic 
# wrap test set into test_dataloader 
test_dataloader = DataLoader(list(AG_NEWS(split='test')), batch_size=8,collate_fn=collate_batch)

In [6]:
# define the model 
class TextClassificationModel(nn.Module):  # inherits from nn.Module
    def __init__(self,vocab_size, embed_dim, num_class): 
        # super() access parent class (nn.Module) methods 
        super().__init__() # initialize the base(constructor) nn.Module 
        # voccab_size -> # of unique tokens 
        # embed_dim -> dim of word embedding (each word is represented by embed_dim dimension vector)
        self.embedding = nn.EmbeddingBag(vocab_size,embed_dim,sparse = False)
        # fully connected layer 
        ## a simple linear layer that projects the final embedding to class logits.
        self.fc = nn.Linear(embed_dim,num_class)
        # call function init_weights to initiate model weight 
        self.init_weights() 

    # probably not necessary, but I will have one just for good practice 
    def init_weights(self): 
        initrange = 0.5 
        self.embedding.weight.data.uniform_(-initrange,initrange)
        self.fc.weight.data.uniform_(-initrange,initrange)
        self.fc.bias.data.zero_() # initialize biases to zero 

    def forward(self,text,offset): 
        # embedded is a tensor shape of [batch size, embed_dim], which is pooled embedding of a doc 
        embedded = self.embedding(text,offset) # text is 1D tensor 
        return self.fc(embedded)


In [7]:
# initialize model, loss, optimizer, and scheduler 

num_class = 4 # business, sci, sports, world 
vocab_size = len(vocab)
embed_dim = 64 

# init model 
model = TextClassificationModel(vocab_size=vocab_size,embed_dim=embed_dim,num_class=num_class)

# cross entropy loss
criterion = nn.CrossEntropyLoss()
# optimizer 
optimizer = torch.optim.SGD(model.parameters(), lr = 1)
# scheduler 
## step size -> period of learning rate decay 
## gamma -> multiplicative factor of learning rate decay 
scheduler = torch.optim.lr_scheduler.StepLR(optimizer,step_size = 1,gamma = 0.9) 

In [8]:
# Model Training 
def train(dataloder): # dataloader iterate over taining batch 
    # tells the model to operate in training mode 
    model.train()
    # metrics 
    ## total_acc -> total # of correct prediction 
    ## total_loss -> total loss over batches 
    ## total_count -> total # of samples seen 
    total_acc, total_loss, total_count = 0,0,0

    # dataloader contains label, text, offset 
    for labels,text,offsets in dataloder: 
        # clear old gradients 
        optimizer.zero_grad()
        output = model(text,offsets) # nn.Module calls model.__call__() which calls forward() 
        loss = criterion(output,labels) # compute how different the predicted vs. actual probabilities are 
        # backpropagation 
        loss.backward() 
        # gradient clipping (prevents explosion)
        torch.nn.utils.clip_grad_norm_(model.parameters(),max_norm=0.5)
        # apply gradient update 
        optimizer.step() 

        # accumulate metrics 
        total_loss += loss.item() # float value from loss tensor 
        # argmax(1) find predicted label compare to true label and sum over bool tensor 
        total_acc += (output.argmax(1)==labels).sum().item() 
        total_count += labels.size(0) 

    # return avg loss and accuracy 
    return total_loss/total_count, total_acc/total_count
    
for epoch in range(15): 
    loss,acc = train(train_dataloader) 
    # scheduler step 
    scheduler.step() 
    print(f"Epoch {epoch+1}: Accuracy = {acc}, Loss = {loss}")

Epoch 1: Accuracy = 0.8535438596491228, Loss = 0.0523283961474969
Epoch 2: Accuracy = 0.9116491228070176, Loss = 0.033529493554022974
Epoch 3: Accuracy = 0.9254561403508772, Loss = 0.02869510322029594
Epoch 4: Accuracy = 0.9331754385964912, Loss = 0.02553864622307485
Epoch 5: Accuracy = 0.9401754385964912, Loss = 0.023260910315879235
Epoch 6: Accuracy = 0.9457280701754386, Loss = 0.021346841640920804
Epoch 7: Accuracy = 0.9504122807017544, Loss = 0.01976001121050279
Epoch 8: Accuracy = 0.9547543859649122, Loss = 0.018311474884890152
Epoch 9: Accuracy = 0.9576578947368422, Loss = 0.017151077284391615
Epoch 10: Accuracy = 0.9608333333333333, Loss = 0.016152021202176146
Epoch 11: Accuracy = 0.9638947368421052, Loss = 0.015269376229713392
Epoch 12: Accuracy = 0.9659824561403508, Loss = 0.014560792648488738
Epoch 13: Accuracy = 0.9681052631578947, Loss = 0.013870331654276718
Epoch 14: Accuracy = 0.9694649122807018, Loss = 0.013334292842408743
Epoch 15: Accuracy = 0.9715350877192982, Loss = 

In [10]:
# evaluation 
def evaluate(dataloader): 
    # put model in evaluation mode 
    model.eval() 
    total_acc, total_count = 0,0 
    # disable gradient computing or storing 
    with torch.no_grad(): 
        for labels,text,offsets in dataloader: 
            output = model(text,offsets)
            total_acc += (output.argmax(1)==labels).sum().item()
            total_count += labels.size(0) 
    return total_acc/total_count # accuracy 

print(f"Validation accuracy: {evaluate(valid_dataloader)}")
print(f"Test accuracy: {evaluate(test_dataloader)}")


Validation accuracy: 0.9118333333333334
Test accuracy: 0.9039473684210526


In [19]:
def predict(text, model, vocab, tokenizer):
    model.eval()  # set model to evaluation mode
    with torch.no_grad():
        # Tokenize and convert to vocab indices
        tokens = torch.tensor(vocab(tokenizer(text)), dtype=torch.int64)
        # Offsets tensor (starting index for the sentence in the batch)
        offsets = torch.tensor([0])
        # Get the model output
        output = model(tokens, offsets)
        # Get the predicted class index
        predicted_label = output.argmax(1).item()
        return predicted_label

In [20]:
test = 'I think sports like football can be dangerous.' 
label_index = predict(test,model,vocab,tokenizer)
label_map = ["World", "Sports", "Business", "Sci/Tech"]
print(f"Predicted category: {label_map[label_index]}")

Predicted category: Sports
