In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import io
import re

import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset
import torch.nn.functional as F
from torch.distributions import Categorical

from torchtext.datasets import YahooAnswers
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import torchtext.transforms as T
from torch.hub import load_state_dict_from_url
from torchtext.data.functional import sentencepiece_tokenizer, load_sp_model

from tqdm.notebook import trange, tqdm

OSError: /home/sanele/Desktop/2025/Projects/The-Complete-Pytorch-Deep-Learning-Series-/venv/lib/python3.10/site-packages/torchtext/lib/libtorchtext.so: undefined symbol: _ZN5torch3jit17parseSchemaOrNameERKSs

In [None]:
# ---- Hyperparameters ----
learning_rate = 1e-4  # learning rate handed to the optimizer
nepochs = 10          # number of training epochs
batch_size = 32       # batch size used by the data loaders

# Maximum sequence lengths
max_len_q = 32  # input questions
max_len_a = 64  # output answers

# Root directory under which the dataset is stored
data_set_root = "../datasets"

In [None]:
# We'll be using the YahooAnswers Dataset
# Note that the torchtext datasets are NOT PyTorch Dataset classes: "YahooAnswers" is a function that
# returns a PyTorch DataPipe!

# Pytorch DataPipes vvv
# https://pytorch.org/data/main/torchdata.datapipes.iter.html

# vvv Good Blog on the difference between DataSet and DataPipe
# https://medium.com/deelvin-machine-learning/comparison-of-pytorch-dataset-and-torchdata-datapipes-486e03068c58

# Depending on the dataset, the download sometimes fails with an error,
# and you'll have to download and extract the data manually.
# "The datasets supported by torchtext are datapipes from the torchdata project, which is still in Beta status"

# Uncomment to trigger the DataPipe to download the data vvv
# dataset_train = YahooAnswers(root=data_set_root, split="train")
# data = next(iter(dataset_train))

# Side-Note I've noticed that the WikiText dataset is no longer able to be downloaded :(

In [None]:
### Uncomment to "Train" a Sentence Piece Tokenizer with the train data capping the vocab size to 20000 tokens
# from torchtext.data.functional import generate_sp_model

# with open(os.path.join(data_set_root, "datasets/YahooAnswers/train.csv")) as f:
#     with open(os.path.join(data_set_root, "datasets/YahooAnswers/data.txt"), "w") as f2:
#         for i, line in enumerate(f):
#             text_only = "".join(line.split(",")[1:])
#             filtered = re.sub(r'\\|\\n|;', ' ', text_only.replace('"', ' ').replace('\n', ' ')) # remove newline characters
#             f2.write(filtered.lower() + "\n")


# generate_sp_model(os.path.join(data_set_root, "datasets/YahooAnswers/data.txt"), 
#                   vocab_size=20000, model_prefix='spm_user_ya')

In [None]:
# YahooQA dataset class definition
class YahooQA(Dataset):
    """Yahoo Answers question/answer pairs served as lower-cased text.

    The CSV at ``<root>/datasets/YahooAnswers/<test_train>.csv`` is read without a header
    row into the columns Class, Q_Title, Q_Content and A. Title and content are joined
    into a single ``Q`` column, and escape sequences / double quotes in ``Q`` and ``A``
    are replaced with spaces.

    Args:
        num_datapoints: Optional cap on the number of rows to load. Only integer values
            are honoured; anything else (including None) loads the whole file, which
            keeps callers that pass a non-integer here working as before.
        test_train: Name of the split file to read, e.g. "train" or "test".
        root: Directory containing ``datasets/YahooAnswers``. Defaults to the
            module-level ``data_set_root``.
    """

    # Alternatives are tried left to right, so the longer escape sequences must come
    # before the bare backslash; otherwise a literal "\r" would leave a stray "r" behind.
    _CLEANUP_PATTERN = r'\\r\\n|\\n|\\r|\\|\n|"'

    # Columns that hold free text and must always be parsed as strings
    _TEXT_COLUMNS = ("Q_Title", "Q_Content", "A")

    def __init__(self, num_datapoints=None, test_train="train", root=None):
        if root is None:
            root = data_set_root

        # Only a genuine integer is treated as a row cap (bool is excluded on purpose)
        is_row_cap = isinstance(num_datapoints, int) and not isinstance(num_datapoints, bool)
        row_limit = num_datapoints if is_row_cap else None

        df = pd.read_csv(
            os.path.join(root, "datasets/YahooAnswers/" + test_train + ".csv"),
            names=["Class", "Q_Title", "Q_Content", "A"],
            # Force text columns to strings so purely numeric or empty columns still support .str
            dtype={column: str for column in self._TEXT_COLUMNS},
            # Keep real text such as "None", "null" or "NA" instead of parsing it as missing
            keep_default_na=False,
            nrows=row_limit,
        )

        # Safety net: any value that still ended up missing becomes an empty string
        df = df.fillna('')

        # Combine Q_Title and Q_Content into a single Q column, then drop the originals
        df['Q'] = df['Q_Title'] + ' ' + df['Q_Content']
        df = df.drop(columns=['Q_Title', 'Q_Content'])

        # Replace escape sequences and double quotes with whitespace in the Q and A columns
        for column in ('Q', 'A'):
            df[column] = df[column].str.replace(self._CLEANUP_PATTERN, ' ', regex=True)

        self.df = df

    def __getitem__(self, index):
        """Return the (question, answer) pair stored under row label ``index``, lower-cased."""
        # Single-step .loc lookup avoids materialising the whole row for each field
        question_text = self.df.loc[index, "Q"].lower()
        answer_text = self.df.loc[index, "A"].lower()

        return question_text, answer_text

    def __len__(self):
        """Return the number of question/answer pairs that were loaded."""
        return len(self.df)

In [None]:
# Create YahooQA dataset instances for training and testing.
# `num_datapoints` is a row-count argument, not a path, so the dataset root must not be
# passed to it; None requests the whole split. The files are located via `data_set_root`.
dataset_train = YahooQA(num_datapoints=None, test_train="train")
dataset_test = YahooQA(num_datapoints=None, test_train="test")

# Create data loaders for the training and testing datasets
# Training loader: shuffled, and the last incomplete batch is dropped
data_loader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, num_workers=4, drop_last=True)
# Testing loader: shuffled so that a different batch of questions is drawn each time
data_loader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=True, num_workers=4)

In [None]:
# Quick demonstration of the SentencePiece tokenizer
# Load the model file and wrap it in a tokenizer callable
sp_model = load_sp_model("spm_user_ya.model")
tokenizer = sentencepiece_tokenizer(sp_model)

# The tokenizer is fed a list of sentences; print each item it yields
sample_sentences = ["i am creating"]
for token in tokenizer(sample_sentences):
    print(token)

In [None]:
# Define a function to yield tokens from a file
def yield_tokens(file_path):
    """Yield, for every line of a UTF-8 file, a one-element list with the text before the first tab."""
    with io.open(file_path, encoding='utf-8') as vocab_file:
        # Each yielded item is a single-token list, the form the vocab builder iterates over
        yield from ([entry.partition("\t")[0]] for entry in vocab_file)

            
# Build the vocabulary from the tokens listed in the SentencePiece .vocab file.
# On top of those we register "special" tokens that signal something to the model:
#   <pad>  padding appended so every sequence in a batch has the same length
#   <soq>  "Start-Of-Question" - marks where the question sequence begins
#   <eoq>  "End-Of-Question"   - marks where the question sequence ends
#   <soa>  "Start-Of-Answer"   - marks where the answer sequence begins
#   <eoa>  "End-Of-Answer"     - marks where the answer sequence ends
#   <unk>  "unknown"           - stands in for any token missing from the vocab
special_tokens = ['<pad>', '<soq>', '<eoq>', '<soa>', '<eoa>', '<unk>']

# special_first=True places the special tokens at the beginning of the vocabulary
vocab = build_vocab_from_iterator(
    yield_tokens("spm_user_ya.vocab"),
    specials=special_tokens,
    special_first=True
)

# Out-of-vocabulary tokens resolve to the index of <unk>
unk_index = vocab['<unk>']
vocab.set_default_index(unk_index)

In [None]:
def build_text_transform(start_token, end_token, max_len):
    """Build the text -> padded index tensor pipeline shared by questions and answers.

    Args:
        start_token: Special token (e.g. '<soq>') prepended to every sequence.
        end_token: Special token (e.g. '<eoq>') appended to every sequence.
        max_len: Length the sequence is cropped to after the start token has been added
            and before the end token is appended, so the result holds at most
            ``max_len + 1`` tokens.

    Returns:
        A ``T.Sequential`` transform that maps a list of strings to a padded tensor.
    """
    return T.Sequential(
        # Tokenize sentences using the pre-existing SentencePiece tokenizer model
        T.SentencePieceTokenizer("spm_user_ya.model"),
        # Convert tokens to indices based on the vocabulary
        T.VocabTransform(vocab=vocab),
        # Prepend the start token; its index is looked up by name instead of being hard-coded
        T.AddToken(vocab[start_token], begin=True),
        # Crop the sequence if it is longer than the maximum length
        T.Truncate(max_seq_len=max_len),
        # Append the end token after cropping so that it is never cut off
        T.AddToken(vocab[end_token], begin=False),
        # Convert the list of lists to a tensor, padding shorter sequences with <pad>
        T.ToTensor(padding_value=vocab['<pad>'])
    )


# Questions are wrapped in <soq> ... <eoq>
q_transform = build_text_transform('<soq>', '<eoq>', max_len_q)

# Answers are wrapped in <soa> ... <eoa>
a_transform = build_text_transform('<soa>', '<eoa>', max_len_a)

In [None]:
# Define LSTM model class
class LSTM(nn.Module):
    """Token-level LSTM model: embedding -> MLP -> stacked LSTM -> MLP producing vocabulary logits.

    Args:
        num_emb: Number of rows in the embedding table; also the size of the output logits.
        num_layers: Number of stacked LSTM layers.
        emb_size: Width of the token embeddings and of the LSTM input.
        hidden_size: Width of the LSTM hidden and cell states.
    """

    def __init__(self, num_emb, num_layers=1, emb_size=128, hidden_size=128):
        super().__init__()

        # Embedding layer
        self.embedding = nn.Embedding(num_emb, emb_size)

        # MLP applied to the embeddings before the LSTM
        self.mlp_emb = nn.Sequential(
            nn.Linear(emb_size, emb_size),
            nn.LayerNorm(emb_size),
            nn.ELU(),
            nn.Linear(emb_size, emb_size)
        )

        # LSTM layer(s). nn.LSTM only applies dropout between stacked layers and warns
        # when a non-zero value is combined with a single layer, so disable it in that case.
        self.lstm = nn.LSTM(
            input_size=emb_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.25 if num_layers > 1 else 0.0
        )

        # MLP mapping the LSTM outputs to one logit per vocabulary entry
        self.mlp_out = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2),
            nn.LayerNorm(hidden_size // 2),
            nn.ELU(),
            nn.Dropout(0.5),
            nn.Linear(hidden_size // 2, num_emb)
        )

    def forward(self, input_seq, hidden_in, mem_in):
        """Run the model over a batch of token index sequences.

        Args:
            input_seq: Integer token indices, batch dimension first (batch, seq).
            hidden_in: Initial LSTM hidden state, (num_layers, batch, hidden_size).
            mem_in: Initial LSTM cell state, same shape as ``hidden_in``.

        Returns:
            Tuple of (logits with shape (batch, seq, num_emb), final hidden state,
            final cell state).
        """
        # Embed the input sequence and refine the embeddings with the MLP
        input_embs = self.embedding(input_seq)
        input_embs = self.mlp_emb(input_embs)

        # Pass through the LSTM, starting from the supplied states
        output, (hidden_out, mem_out) = self.lstm(input_embs, (hidden_in, mem_in))

        # Project every time step to vocabulary logits
        return self.mlp_out(output), hidden_out, mem_out

In [None]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Define embedding size, hidden size, and number of layers for the LSTM model
emb_size = 256
hidden_size = 1024
num_layers = 4

# Create LSTM model instance (one output logit per vocabulary entry)
lstm_qa = LSTM(num_emb=len(vocab), num_layers=num_layers,
               emb_size=emb_size, hidden_size=hidden_size).to(device)

# Initialize the Adam optimizer
optimizer = optim.Adam(lstm_qa.parameters(), lr=learning_rate, weight_decay=1e-4)

# Define the loss function (Cross Entropy Loss).
# Targets equal to <pad> are excluded so that padding added to equalise sequence
# lengths within a batch does not contribute to the loss or the gradients.
loss_fn = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])

# List to store the training loss of every batch
training_loss_logger = []

In [None]:
# Tally the number of scalar values held across all of the model's parameter tensors
num_model_params = sum(weights.flatten().shape[0] for weights in lstm_qa.parameters())

print("-This Model Has %d (Approximately %d Million) Parameters!" % (num_model_params, num_model_params//1e6))

In [None]:
# Training loop: the question is encoded first, then the resulting LSTM state is used
# to perform next-token prediction over the answer.
for epoch in trange(0, nepochs, leave=False, desc="Epoch"):
    # Set LSTM model to training mode (enables dropout)
    lstm_qa.train()

    # Iterate over batches in the training data loader
    for q_text, a_text in tqdm(data_loader_train, desc="Training", leave=False):
        # Transform both question and answer text into padded index tensors
        q_text_tokens = q_transform(list(q_text)).to(device)
        a_text_tokens = a_transform(list(a_text)).to(device)

        # Shift by one position: the model sees tokens [0, n-1) and must predict tokens [1, n)
        a_input_text = a_text_tokens[:, :-1]
        a_output_text = a_text_tokens[:, 1:]

        # Batch size
        bs = q_text_tokens.shape[0]

        # Initialise the memory buffers
        hidden = torch.zeros(num_layers, bs, hidden_size, device=device)
        memory = torch.zeros(num_layers, bs, hidden_size, device=device)

        # Encode the whole question sequence; only the final states are kept
        _, hidden, memory = lstm_qa(q_text_tokens, hidden, memory)

        # Perform a "next-token" prediction on the answer sequence,
        # providing the model with the memory buffers from the question-encoding step
        pred, hidden, memory = lstm_qa(a_input_text, hidden, memory)

        # Calculate loss; the class dimension is moved to position 1 as the loss expects
        loss = loss_fn(pred.transpose(1, 2), a_output_text)

        # Zero gradients, perform backward pass, and update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Log training loss
        training_loss_logger.append(loss.item())

In [None]:
# Plot the loss logged for every training batch; labelled axes let the figure stand alone
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(training_loss_logger)
ax.set_title("Training Loss")
ax.set_xlabel("Training step (batch)")
ax.set_ylabel("Cross-entropy loss")
plt.show()

In [None]:
# Draw one batch of questions and their corresponding answers from the test data loader
test_batches = iter(data_loader_test)
q_text, a_text = next(test_batches)

In [None]:
# Choose which question of the fetched test batch to use
index = 0

# Display the selected question
print("QUESTION:")
print(q_text[index])

# Initialize the prompt with the selected question
init_prompt = [q_text[index]]

# Transform the initial prompt into tokens and move to device
input_tokens = q_transform(init_prompt).to(device)

# Add the Start-Of-Answer token to prompt the network to start generating the answer.
# The index is looked up by name so it stays correct if the special tokens are reordered.
soa_token = vocab['<soa>'] * torch.ones(1, 1, device=device).long()
input_tokens = torch.cat((input_tokens, soa_token), 1)

print("\nINITIAL PROMPT TOKENS:")
print(input_tokens)
print("VOCABULARY TOKENS:")
print(vocab.lookup_tokens(input_tokens[0].cpu().numpy()))

# Temperature parameter for sampling
temp = 0.8

In [None]:
# Generate answer tokens one at a time
log_tokens = []

# Upper bound on the number of generated tokens, and the token that ends generation
max_new_tokens = 100
eoa_index = vocab['<eoa>']

# Set LSTM model to evaluation mode (disables dropout)
lstm_qa.eval()

# Disable gradient calculation
with torch.no_grad():
    # Initialize hidden and memory tensors
    hidden = torch.zeros(num_layers, 1, hidden_size, device=device)
    memory = torch.zeros(num_layers, 1, hidden_size, device=device)

    # The first step consumes the whole prompt. A separate variable is used so that
    # `input_tokens` keeps holding the prompt and this cell can be re-run on its own.
    step_input = input_tokens

    for step in range(max_new_tokens):
        # Forward pass through the LSTM model, carrying the state across steps
        data_pred, hidden, memory = lstm_qa(step_input, hidden, memory)

        # Sample the next token from the temperature-scaled distribution of the last position
        dist = Categorical(logits=data_pred[:, -1, :]/temp)
        step_input = dist.sample().reshape(1, 1)

        # Append sampled token to log_tokens list
        log_tokens.append(step_input.cpu())

        # Stop once the End-Of-Answer token has been sampled
        if step_input.item() == eoa_index:
            break

In [None]:
# Stitch the sampled token indices together and map them back to vocabulary entries
generated_ids = torch.cat(log_tokens, 1)[0].numpy()
generated_pieces = vocab.lookup_tokens(generated_ids)
pred_text = "".join(generated_pieces)

# Show the raw generated text
print(pred_text)

In [None]:
# Tidy the generated text: the SentencePiece word-boundary marker becomes a space,
# and the <unk> / <eoa> special tokens are removed
cleaned_text = pred_text
for marker, substitute in (("▁", " "), ("<unk>", ""), ("<eoa>", "")):
    cleaned_text = cleaned_text.replace(marker, substitute)

# Show the cleaned text
print(cleaned_text)

In [None]:
# Inspect the temperature-scaled next-token probabilities from the last generation step
next_token_probs = F.softmax(data_pred[:, -1, :]/temp, -1)
plt.plot(next_token_probs.cpu().numpy().flatten())