In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import os
import re
import io

import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset
import torch.nn.functional as F

# We'll be using Pytorch's text library called torchtext! 
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import torchtext.transforms as T

from tqdm.notebook import trange, tqdm

In [None]:
# Hyperparameters and paths shared by the cells below.

# Initial learning rate handed to the Adam optimizer; the cosine scheduler
# created later decays it towards 0 over `nepochs` epochs.
learning_rate = 1e-4

# Number of full passes over the training set (also the cosine schedule's T_max)
nepochs = 20

# Number of articles per batch for both the training and the testing DataLoader
batch_size = 128

# Maximum number of tokens kept by the Truncate transform. The <sos> token is
# added before truncation and <eos> after it, so a tokenised article can be up
# to max_len + 1 tokens long.
max_len = 128

# Root directory of the dataset; the CSV files are read from
# <data_set_root>/datasets/AG_NEWS/<split>.csv
data_set_root = "../datasets"

In [None]:
# We'll be using the AG News Dataset
# Which contains a short news article and a single label to classify the "type" of article
# Note that for torchtext these datasets are NOT Pytorch Dataset classes: "AG_NEWS" is a function that
# returns a Pytorch DataPipe!

# Pytorch DataPipes vvv
# https://pytorch.org/data/main/torchdata.datapipes.iter.html

# vvv Good Blog on the difference between DataSet and DataPipe
# https://medium.com/deelvin-machine-learning/comparison-of-pytorch-dataset-and-torchdata-datapipes-486e03068c58
# Depending on the dataset sometimes the dataset doesn't download and gives an error
# and you'll have to download and extract manually 
# "The datasets supported by torchtext are datapipes from the torchdata project, which is still in Beta status"

# Un-comment to trigger the DataPipe to download the data vvv
# dataset_train = AG_NEWS(root=data_set_root, split="train")
# data = next(iter(dataset_train))

# Side-Note I've noticed that the WikiText dataset is no longer able to be downloaded :(

In [None]:
# Un-Comment to train sentence-piece model for tokenizer and vocab!

# from torchtext.data.functional import generate_sp_model

# with open(os.path.join(data_set_root, "datasets/AG_NEWS/train.csv")) as f:
#     with open(os.path.join(data_set_root, "datasets/AG_NEWS/data.txt"), "w") as f2:
#         for i, line in enumerate(f):
#             text_only = "".join(line.split(",")[1:])
#             filtered = re.sub(r'\\|\\n|;', ' ', text_only.replace('"', ' ').replace('\n', ' ')) # remove newline characters
#             filtered = filtered.replace(' #39;', "'")
#             filtered = filtered.replace(' #38;', "&")
#             filtered = filtered.replace(' #36;', "$")
#             filtered = filtered.replace(' #151;', "-")

#             f2.write(filtered.lower() + "\n")

# generate_sp_model(os.path.join(data_set_root, "datasets/AG_NEWS/data.txt"), 
#                   vocab_size=20000, model_prefix='spm_ag_news')

In [None]:
class AGNews(Dataset):
    """
    Map-style Dataset over the AG News CSV files.

    Every CSV row holds a class id, a title and a body. On construction the file is
    read into a DataFrame, title and body are merged into one "Article" string, and
    that string is scrubbed of escape sequences, double quotes and a few leftover
    HTML entity codes. Indexing returns ``(class_index, text)`` where the class id is
    shifted down by one and the text is lower-cased.

    Args:
        num_datapoints: Required by the signature but never read by this class.
        test_train (str): Basename of the CSV file to load, e.g. "train" or "test".

    Attributes:
        df (pd.DataFrame): The preprocessed data with columns "Class" and "Article".
    """

    def __init__(self, num_datapoints, test_train="train"):
        # Read the raw split; the file has no header row so column names are supplied
        csv_path = os.path.join(data_set_root, "datasets/AG_NEWS/" + test_train + ".csv")
        frame = pd.read_csv(csv_path, names=["Class", "Title", "Content"])

        # Missing titles/bodies become empty strings so they can be concatenated
        frame = frame.fillna('')

        # One text field per sample: "<title> : <content>"
        articles = frame['Title'] + " : " + frame['Content']

        # Blank out escaped newlines / carriage returns, stray backslashes,
        # real newlines and double quotes
        articles = articles.str.replace(r'\\n|\\|\\r|\\r\\n|\n|"', ' ', regex=True)

        # Map the remaining numeric HTML entity fragments back to their characters
        entity_map = {' #39;': "'",
                      ' #38;': "&",
                      ' #36;': "$",
                      ' #151;': "-"}
        articles = articles.replace(entity_map, regex=True)

        # Only the label and the merged text are kept
        self.df = pd.DataFrame({"Class": frame["Class"], "Article": articles})

    def __getitem__(self, index):
        # Fetch the row once and read both fields from it
        row = self.df.loc[index]

        # Labels in the file start at 1; shift them so they start at 0
        return int(row["Class"]) - 1, row["Article"].lower()

    def __len__(self):
        # One sample per DataFrame row
        return len(self.df)

In [None]:
# Create training and testing datasets.
# NOTE: AGNews.__init__ never reads `num_datapoints`, so the value passed here has
# no effect; it is supplied only because the constructor requires the argument.
dataset_train = AGNews(num_datapoints=data_set_root, test_train="train")
dataset_test = AGNews(num_datapoints=data_set_root, test_train="test")

# Training loader: reshuffle every epoch and drop the final partial batch so that
# every optimisation step sees exactly `batch_size` samples
data_loader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, num_workers=8, drop_last=True)

# Testing loader: evaluation does not benefit from shuffling, and a fixed order keeps
# the per-batch test losses comparable from one epoch to the next
data_loader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=False, num_workers=8)

In [None]:
def yield_tokens(file_path):
    """Lazily read a tab-separated vocab file and yield one single-token list per line.

    Args:
        file_path (str): Path to a UTF-8 text file whose first tab-separated
            column holds the token.

    Yields:
        list[str]: A one-element list with the text before the first tab of the line
        (the whole line, including its line ending, when the line has no tab).
    """
    with io.open(file_path, encoding='utf-8') as vocab_file:
        for entry in vocab_file:
            # Everything before the first tab is the token
            token, _, _ = entry.partition("\t")
            yield [token]

# Build the vocabulary from the token column of the SentencePiece vocab file.
# Four "special" tokens are placed in front of the regular tokens, in this order:
#   <pad>  padding appended so that every sequence in a batch has the same length
#   <sos>  "Start-Of-Sentence", marks the start of a sequence
#   <eos>  "End-Of-Sentence", marks the end of a sequence
#   <unk>  "unknown", stands in for any token that is not in the vocab
vocab = build_vocab_from_iterator(
    iterator=yield_tokens("spm_ag_news.vocab"),
    specials=['<pad>', '<sos>', '<eos>', '<unk>'],
    special_first=True,
)

# Looking up an out-of-vocabulary token returns the index of '<unk>'
vocab.set_default_index(vocab['<unk>'])

In [None]:
# Data transform that turns a batch of raw strings into a tensor of vocab indices
text_tranform = T.Sequential(
    # Tokenize with the pre-trained SentencePiece tokenizer
    T.SentencePieceTokenizer("spm_ag_news.model"),
    # Convert the tokens to indices based on the given vocabulary
    T.VocabTransform(vocab=vocab),
    # Add <sos> at the beginning of each sentence. 1 because <sos> is the second
    # special token placed at the front of the vocabulary in the previous section
    T.AddToken(1, begin=True),
    # Crop the sentence if it is longer than the max length
    # (this happens before <eos> is added, so <eos> is never cropped away)
    T.Truncate(max_seq_len=max_len),
    # Add <eos> at the end of each sentence. 2 because <eos> is the third
    # special token placed at the front of the vocabulary in the previous section
    T.AddToken(2, begin=False),
    # Convert the list of lists to a tensor, padding shorter sentences with the
    # <pad> token (index 0) so that all sentences in the batch are the same length.
    # NOTE(review): padding is presumably to the longest sentence in the batch rather
    # than to max_len - confirm against the torchtext documentation
    T.ToTensor(padding_value=0),
)

In [None]:
class TokenDrop(nn.Module):
    """For a batch of token indices, randomly replace non-special tokens with <pad>.

    Each position is selected independently with probability ``prob``; selected
    positions whose index is below ``num_special`` are left untouched. The drop is
    applied every time the module is called, regardless of train/eval mode.

    Args:
        prob (float): probability of dropping a token
        pad_token (int): index for the <pad> token
        num_special (int): Number of special tokens, assumed to be at the start of the vocab
    """

    def __init__(self, prob=0.1, pad_token=0, num_special=4):
        # Without this call the nn.Module bookkeeping is never created and
        # repr(), .to(), .train() and .eval() raise AttributeError
        super().__init__()
        self.prob = prob
        self.num_special = num_special
        self.pad_token = pad_token

    def forward(self, sample):
        """Return a copy of ``sample`` with randomly chosen tokens replaced by <pad>.

        Args:
            sample (Tensor): integer tensor of token indices (any shape).

        Returns:
            Tensor: same shape and dtype as ``sample``; the input is not modified.
        """
        # Sample a bernoulli distribution with p=prob per position;
        # True means that position is a candidate for replacement
        drop_prob = torch.full_like(sample, self.prob, dtype=torch.float)
        drop = torch.bernoulli(drop_prob).bool()

        # Only replace if the token is not a special token
        drop &= sample >= self.num_special

        return sample.masked_fill(drop, self.pad_token)

    def extra_repr(self):
        # Shown inside the parentheses of the module's repr
        return "prob={}, pad_token={}, num_special={}".format(
            self.prob, self.pad_token, self.num_special)

In [None]:
# Sinusoidal positional embeds
class SinusoidalPosEmb(nn.Module):
    """Fixed (parameter-free) sinusoidal embedding of positions.

    For every input value the output holds ``dim // 2`` sines followed by
    ``dim // 2`` cosines, evaluated at geometrically spaced frequencies that run
    from 1 down to 1/10000.

    Args:
        dim (int): requested embedding width; the output width is ``2 * (dim // 2)``.
    """

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        n_freqs = self.dim // 2

        # Log-spaced step between neighbouring frequencies
        log_step = math.log(10000) / (n_freqs - 1)
        freqs = torch.exp(torch.arange(n_freqs, device=x.device) * -log_step)

        # Outer product: one row of phase angles per input position
        angles = x.unsqueeze(1) * freqs.unsqueeze(0)

        # Sine half first, cosine half second
        return torch.cat((angles.sin(), angles.cos()), dim=-1)
    

class TransformerBlock(nn.Module):
    """Self-attention block with two residual branches, each normalised on its input.

    Branch 1: LayerNorm -> multi-head self-attention (attention dropout 0.25).
    Branch 2: LayerNorm -> Linear -> LayerNorm -> ELU -> Linear.
    The output of each branch is added back onto the tensor that entered it.

    Args:
        hidden_size (int): width of the token embeddings.
        num_heads (int): number of attention heads.
    """

    def __init__(self, hidden_size=128, num_heads=4):
        super().__init__()

        # Attention branch
        self.norm1 = nn.LayerNorm(hidden_size)
        self.multihead_attn = nn.MultiheadAttention(
            embed_dim=hidden_size,
            num_heads=num_heads,
            dropout=0.25,
            batch_first=True,
        )

        # Feed-forward branch
        self.norm2 = nn.LayerNorm(hidden_size)
        mlp_layers = [
            nn.Linear(hidden_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.ELU(),
            nn.Linear(hidden_size, hidden_size),
        ]
        self.mlp = nn.Sequential(*mlp_layers)

    def forward(self, x, key_padding_mask):
        """Apply the block.

        Args:
            x (Tensor): embeddings, batch dimension first.
            key_padding_mask (Tensor): boolean mask, True where a key must be ignored.

        Returns:
            Tensor: same shape as ``x``.
        """
        # Self-attention over the normalised input; the attention weights returned
        # alongside the output are discarded
        normed = self.norm1(x)
        attended, _ = self.multihead_attn(normed,
                                          normed,
                                          normed,
                                          key_padding_mask=key_padding_mask)

        # First residual connection
        x = attended + x

        # Feed-forward on the re-normalised result, then the second residual connection
        return self.mlp(self.norm2(x)) + x


# "Encoder-Only" Style Transformer with self-attention
class Transformer(nn.Module):
    """
    Encoder-style classifier: token embedding + sinusoidal positions, a stack of
    TransformerBlocks, and a linear head that reads a learnable vector prepended
    to every sequence.

    Args:
        num_emb (int): Number of embedding tokens.
        output_size (int): Dimensionality of the output.
        hidden_size (int): Dimensionality of the hidden layers.
        num_layers (int): Number of Transformer blocks.
        num_heads (int): Number of attention heads.
    """
    def __init__(self, num_emb, output_size, hidden_size=128, num_layers=3, num_heads=4):
        super().__init__()

        # Lookup table with one vector per token index
        self.embedding = nn.Embedding(num_emb, hidden_size)

        # Parameter-free positional encoding
        self.pos_emb = SinusoidalPosEmb(hidden_size)

        # The stack of attention blocks
        layers = [TransformerBlock(hidden_size, num_heads) for _ in range(num_layers)]
        self.blocks = nn.ModuleList(layers)

        # Learnable vector placed in front of every sequence; its final state
        # is what the output layer reads
        self.out_vec = nn.Parameter(torch.zeros(1, 1, hidden_size))

        self.fc_out = nn.Linear(hidden_size, output_size)

    def forward(self, input_seq):
        """
        Run the model on a batch of token indices.

        Args:
            input_seq (Tensor): Input sequence tensor with shape (batch_size, sequence_length).

        Returns:
            Tensor: Output tensor with shape (batch_size, output_size).
        """
        batch, _ = input_seq.shape
        dev = input_seq.device

        # Positions holding index 0 (padding) are hidden from attention; the
        # prepended output-vector slot is always visible
        pad_positions = input_seq == 0
        out_slot = torch.zeros(batch, 1, device=dev).bool()
        key_padding_mask = torch.cat((out_slot, pad_positions), 1)

        # Token embeddings with the learnable output vector in front
        token_embs = self.embedding(input_seq)
        token_embs = torch.cat((self.out_vec.expand(batch, 1, -1), token_embs), 1)
        batch, length, width = token_embs.shape

        # Add the encoding of each position (0 .. length-1) to its embedding
        positions = torch.arange(length, device=dev)
        pos_emb = self.pos_emb(positions).reshape(1, length, width).expand(batch, length, width)
        hidden = token_embs + pos_emb

        # Run the stack of blocks
        for block in self.blocks:
            hidden = block(hidden, key_padding_mask)

        # Classify from the first slot, i.e. the output vector
        return self.fc_out(hidden[:, 0])

In [None]:
# Run on the first CUDA device when one is present, otherwise stay on the CPU
device = torch.device(0) if torch.cuda.is_available() else torch.device('cpu')

# Width of the token embeddings / hidden layers
hidden_size = 256

# The classifier: one output per news category
tf_classifier = Transformer(
    num_emb=len(vocab),
    output_size=4,
    hidden_size=hidden_size,
    num_layers=4,
    num_heads=8,
)
tf_classifier = tf_classifier.to(device)

# Adam over all model parameters
optimizer = optim.Adam(tf_classifier.parameters(), lr=learning_rate)

# Cosine decay of the learning rate down to 0 across the training epochs
lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=nepochs,
    eta_min=0,
)

# Classification loss
loss_fn = nn.CrossEntropyLoss()

# Regulariser that swaps random tokens for <pad>
td = TokenDrop(prob=0.5)

# Per-batch losses and per-epoch accuracies collected during training
training_loss_logger, test_loss_logger = [], []
training_acc_logger, test_acc_logger = [], []

In [None]:
# Count every scalar weight held by the model
num_model_params = sum(param.numel() for param in tf_classifier.parameters())

print("-This Model Has %d (Approximately %d Million) Parameters!" % (num_model_params, num_model_params//1e6))

In [None]:
# Train for `nepochs` epochs; after every epoch evaluate on the test set.
# Per-batch losses and per-epoch accuracies are appended to the logger lists.

# Progress bar over epochs; the accuracies start at 0 until the first epoch finishes
pbar = trange(0, nepochs, leave=False, desc="Epoch")
train_acc = 0
test_acc = 0

# Loop over each epoch
for epoch in pbar:
    # Running counts of correct predictions for this epoch
    train_acc_count = 0
    test_acc_count = 0

    # Show the accuracies of the previous epoch (0 during the first epoch)
    pbar.set_postfix_str('Accuracy: Train %.2f%%, Test %.2f%%' % (train_acc * 100, test_acc * 100))

    # Set the model to training mode
    tf_classifier.train()
    # Number of samples seen so far in this pass
    steps = 0

    # Loop over each batch in the training dataset
    for label, text in tqdm(data_loader_train, desc="Training", leave=False):
        bs = label.shape[0]

        # Transform the text to token indices and move them to the selected device
        text_tokens = text_tranform(list(text)).to(device)
        label = label.to(device)

        # Randomly drop tokens to aid in regularization
        text_tokens = td(text_tokens)

        # Get the model predictions (made on the token-dropped input)
        pred = tf_classifier(text_tokens)

        # Compute the loss using cross-entropy loss
        loss = loss_fn(pred, label)

        # Clear old gradients, backpropagate and take an optimization step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Log the training loss of this batch
        training_loss_logger.append(loss.item())

        # Count correct predictions; this stays a tensor until .item() below
        train_acc_count += (pred.argmax(1) == label).sum()
        steps += bs

    # Training accuracy of this epoch: correct predictions / samples seen
    train_acc = (train_acc_count / steps).item()
    training_acc_logger.append(train_acc)

    # Update learning rate (once per epoch)
    lr_scheduler.step()

    # Set the model to evaluation mode
    tf_classifier.eval()
    steps = 0

    # Loop over each batch in the testing dataset without tracking gradients
    with torch.no_grad():
        for label, text in tqdm(data_loader_test, desc="Testing", leave=False):
            bs = label.shape[0]

            # Transform the text to token indices and move them to the selected device
            # (no token dropping at test time)
            text_tokens = text_tranform(list(text)).to(device)
            label = label.to(device)

            # Get the model predictions
            pred = tf_classifier(text_tokens)

            # Compute and log the cross-entropy loss of this batch
            loss = loss_fn(pred, label)
            test_loss_logger.append(loss.item())

            # Count correct predictions
            test_acc_count += (pred.argmax(1) == label).sum()
            steps += bs

        # Testing accuracy of this epoch: correct predictions / samples seen
        test_acc = (test_acc_count / steps).item()
        test_acc_logger.append(test_acc)

In [None]:
# Per-batch training and test losses, both stretched over the same 0..nepochs axis
_ = plt.figure(figsize=(10, 5))
_ = plt.title("Training Vs Test Loss")
_ = plt.xlabel("Epochs")
_ = plt.ylabel("Loss")

_ = plt.plot(np.linspace(0, nepochs, len(training_loss_logger)), training_loss_logger, label="Train")
_ = plt.plot(np.linspace(0, nepochs, len(test_loss_logger)), test_loss_logger, label="Test")
_ = plt.legend()

In [None]:
# Per-epoch training and test accuracy
_ = plt.figure(figsize=(10, 5))
_ = plt.title("Training Vs Test Accuracy")
_ = plt.xlabel("Epochs")
_ = plt.ylabel("Accuracy")

_ = plt.plot(np.linspace(0, nepochs, len(training_acc_logger)), training_acc_logger, label="Train")
_ = plt.plot(np.linspace(0, nepochs, len(test_acc_logger)), test_acc_logger, label="Test")
_ = plt.legend()

# Best test accuracy reached in any epoch
print("Max Test Accuracy %.2f%%" % (np.max(test_acc_logger) * 100))