<a href="https://colab.research.google.com/github/shraddha-an/nlp/blob/main/pytorch_tc_agnews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Text Classification tutorial with Torch Text**

## **I. Data Preprocessing**

### **1) Importing Libraries**

In [1]:
# Importing the libraries
import torch
from torchtext.datasets import AG_NEWS

from collections import Counter

### **2) Building vocabulary**

In [2]:
# Build the vocabulary: tokenize every training text and count how often each token occurs
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer('basic_english')

# Loading the training split
train_data = AG_NEWS(split = 'train')

# Tally the tokens of every training sentence (the labels are not needed at this stage)
count = Counter()
for (label, text) in train_data:
  count.update(tokenizer(text))

# Hand the token frequencies to the Vocab class, with a minimum frequency of 1
from torchtext.vocab import Vocab
vocab = Vocab(counter = count, min_freq = 1)


In [3]:
# Quick experiment: look up the vocabulary index of each whitespace-separated word
sent = "I like the song Bolero very much. I am also interested in Delta Force.".split()

# A second sample sentence (not used in the lookup below)
s = "here is an example".split()

# NOTE(review): these words are split on whitespace only and are not passed through
# `tokenizer`, so capitalised or punctuated words ("I", "much.") presumably miss the
# vocabulary and come back as index 0 -- confirm that 0 is the unknown-token index.
[vocab[word] for word in sent]

[0, 318, 3, 3375, 0, 1168, 0, 0, 1914, 457, 4846, 8, 0, 0]

### **3) Text Preprocessing and Label Pipelines**

In [4]:
# Text and label pipelines, written as named functions so they carry a
# docstring and a readable name in tracebacks.

def text_pipeline(x):
  """Turn a raw text string into a list of integer vocabulary indices.

  The text is split with the module-level `tokenizer`, and every resulting
  token is looked up in the module-level `vocab`.

  Args:
    x: the raw text to convert.

  Returns:
    A list with one vocabulary index per token, in token order.
  """
  return [vocab[token] for token in tokenizer(x)]


def label_pipeline(x):
  """Convert a label to a zero-based integer class index.

  The label is cast with `int` and decremented by one.
  # NOTE(review): presumably the dataset labels start at 1 -- confirm.

  Args:
    x: the label; anything accepted by `int()`.

  Returns:
    `int(x) - 1`.
  """
  return int(x) - 1

In [5]:
# Sanity check of both pipelines on sample inputs.
# Only the value of the last expression is echoed as the cell output, so the
# result of text_pipeline(...) is computed here but not displayed.
text_pipeline('Delta force baby')
label_pipeline(3)

2

### **4) Creating the DataLoaders**

In [6]:
# Wrapping the iterable dataset in a DataLoader.
# The DataLoader groups samples into batches of `batch_size` and passes each batch
# to collate_fn before it is sent to the model; collate_fn processes the samples
# with the text and label pipelines declared previously.

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Defining the collate function
def collate_batch(batch):
  """Collate a batch of (label, text) samples into three tensors on `device`.

  Every text is converted to token indices with `text_pipeline`, and all texts
  are concatenated into one flat 1-D tensor. The offsets tensor records the
  position in that flat tensor at which each sample's tokens begin.

  Args:
    batch: an iterable of (label, text) pairs.

  Returns:
    A tuple (labels, tokens, offsets) of int64 tensors moved to `device`.
  """
  labels, token_tensors, lengths = [], [], []

  for raw_label, raw_text in batch:
    labels.append(label_pipeline(raw_label))
    tokens = torch.tensor(text_pipeline(raw_text), dtype = torch.int64)
    token_tensors.append(tokens)
    lengths.append(tokens.size(0))

  labels_tensor = torch.tensor(labels, dtype = torch.int64)

  # Sample i starts where samples 0..i-1 end: prepend 0, drop the last length,
  # then take the running sum.
  starts = torch.tensor(([0] + lengths)[:-1]).cumsum(dim = 0)

  flat_tokens = torch.cat(token_tensors)
  return labels_tensor.to(device), flat_tokens.to(device), starts.to(device)

# Creating the dataloader object
from torch.utils.data import DataLoader

# `train_data` has already been iterated from start to end while building the
# vocabulary. NOTE(review): torchtext's raw dataset iterators appear to be
# single-pass, in which case a loader built on `train_data` would yield no
# batches -- so the loader is built over a freshly created training iterator.
data = DataLoader(AG_NEWS(split = 'train'), batch_size = 8, shuffle = False, collate_fn = collate_batch)

## **II. Model Definition**

In [7]:
# Importing torch libraries to define the model class
import torch.nn as nn, torch.nn.functional as F

# Defining my Text Classification model
class TextClassification(nn.Module):
  """Text classifier made of an EmbeddingBag followed by one linear layer.

  The EmbeddingBag pools the token embeddings of each sample into a single
  vector (its default mode is used), and the linear layer maps that vector
  to one score per class.

  Args:
    vocab_size: number of rows in the embedding table.
    embed_dim: size of each embedding vector, and the linear layer's input size.
    num_classes: number of output scores produced per sample.
  """

  def __init__(self, vocab_size, embed_dim, num_classes):
    super().__init__()

    # Defining the layers; sparse = True is passed through to EmbeddingBag
    self.embed = nn.EmbeddingBag(num_embeddings = vocab_size, embedding_dim = embed_dim, sparse = True)
    self.fc = nn.Linear(in_features = embed_dim, out_features = num_classes)

    # Weights can only be initialized once the layers exist
    self.init_weights()

  def init_weights(self, initrange = 0.5):
    """Re-initialize the layer parameters in place.

    The embedding and linear weights are drawn from a uniform distribution
    over (-initrange, initrange); the linear bias is set to zero.

    Args:
      initrange: half-width of the uniform range (defaults to 0.5).
    """
    # nn.init works on the parameters directly without recording the
    # assignment in autograd, so there is no need to go through `.data`.
    nn.init.uniform_(self.embed.weight, -initrange, initrange)
    nn.init.uniform_(self.fc.weight, -initrange, initrange)
    nn.init.zeros_(self.fc.bias)

  def forward(self, text, offset):
    """Compute the class scores for a flattened batch of token indices.

    Args:
      text: 1-D tensor holding the token indices of every sample, concatenated.
      offset: 1-D tensor with the start position of each sample inside `text`.

    Returns:
      The output of the linear layer applied to the pooled embeddings.
    """
    pooled = self.embed(text, offset)
    return self.fc(pooled)

In [8]:
# Instantiating the TextClassification model
# A fresh training iterator is created here to collect the distinct labels
train_iter  = AG_NEWS(split = 'train')
num_classes = len({label for (label, _) in train_iter})
vocab_size = len(vocab)
embedding_size = 64

model = TextClassification(
  vocab_size = vocab_size,
  embed_dim = embedding_size,
  num_classes = num_classes,
).to(device)
model

TextClassification(
  (embed): EmbeddingBag(95812, 64, mode=mean)
  (fc): Linear(in_features=64, out_features=4, bias=True)
)

In [9]:
# Loss function and optimizer
from torch.optim import AdamW, SGD

# Criterion
criterion = nn.CrossEntropyLoss()

# Plain SGD over all model parameters
optimizer = SGD(model.parameters(), lr = 5)

# Loading both splits; each one is materialized into a list before it is wrapped
# in a shuffling DataLoader that batches through collate_batch
train_data, test_data = AG_NEWS()

train = DataLoader(
  list(train_data),
  batch_size = 64,
  shuffle = True,
  collate_fn = collate_batch,
)
test = DataLoader(
  list(test_data),
  batch_size = 64,
  shuffle = True,
  collate_fn = collate_batch,
)

### **1) Training Loop**

In [10]:
# Training loop
%time

# Putting model in train mode
model.train()

# Epochs 
epochs = 20

for epoch in range(epochs):

  # Calculating training accuracy for every epoch
  total_acc, total_count = 0, 0

  # Training batches
  for (label, text, offsets) in train:
    
    # Push variables to device
    #label, text, offsets = label.to(device), text.to(device), offsets.to(device)

    # Clear out gradients from previous training batch
    optimizer.zero_grad()

    # Forward pass; feed inputs to model & get outputs
    outputs = model(text, offsets)
  
    # Calculate loss between model's predictions & actual target
    loss = criterion(outputs, label)

    # Back propagate loss throughout the neural network
    loss.backward()

    # To prevent exploding gradients problem, clipping the norm of the gradients to 0.1
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)

    # Update parameters based on the current gradient
    optimizer.step()

    total_acc += (outputs.argmax(1) == label).sum().item()
    total_count += label.size(0)
  
  print('\nEpoch {}/{}    -  Training Accuracy: {}'.format(epoch + 1, epochs, total_acc/ total_count))

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.48 µs

Epoch 1/20    -  Training Accuracy: 0.8198

Epoch 2/20    -  Training Accuracy: 0.9016083333333333

Epoch 3/20    -  Training Accuracy: 0.9152333333333333

Epoch 4/20    -  Training Accuracy: 0.9240333333333334

Epoch 5/20    -  Training Accuracy: 0.9291083333333333

Epoch 6/20    -  Training Accuracy: 0.9348666666666666

Epoch 7/20    -  Training Accuracy: 0.9380333333333334

Epoch 8/20    -  Training Accuracy: 0.9423416666666666

Epoch 9/20    -  Training Accuracy: 0.945

Epoch 10/20    -  Training Accuracy: 0.9481666666666667

Epoch 11/20    -  Training Accuracy: 0.9507833333333333

Epoch 12/20    -  Training Accuracy: 0.9532916666666666

Epoch 13/20    -  Training Accuracy: 0.9548333333333333

Epoch 14/20    -  Training Accuracy: 0.9573416666666666

Epoch 15/20    -  Training Accuracy: 0.958825

Epoch 16/20    -  Training Accuracy: 0.9605583333333333

Epoch 17/20    -  Training Accuracy: 0.9624583333333333

Epoch 18/2

## **III. Testing**

In [11]:
# Evaluation on the test set
# Switch the model to eval mode
model.eval()

# Running totals: correctly predicted samples and samples seen
test_acc, test_count = 0, 0

# No gradients are needed while evaluating
with torch.no_grad():
  for (label, text, offset) in test:

    # Forward pass on the current test batch
    output = model(text, offset)

    # A prediction is correct when the highest-scoring class equals the label
    hits = output.argmax(1) == label
    test_acc += hits.sum().item()
    test_count += label.size(0)


# Test Accuracy
print('Test Accuracy: ', test_acc/test_count)

Test Accuracy:  0.9006578947368421
