In [None]:
from importlib.metadata import version

# Report the installed torch and tiktoken versions so results can be tied to an environment.
print("torch version:", version("torch"))
print("tiktoken version:", version("tiktoken"))

In [None]:
import os
# `import urllib` alone does not load the `request` submodule; import it explicitly
# so `urllib.request.urlopen` works on a fresh interpreter.
import urllib.request

##############################
# Download data if necessary
##############################

file_path = "../data/the-verdict.txt"
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"

if not os.path.exists(file_path):
    # First run: fetch the text and cache it on disk for later runs.
    with urllib.request.urlopen(url) as response:
        text_data = response.read().decode('utf-8')

    # The target directory may not exist yet (e.g. on a fresh checkout).
    target_dir = os.path.dirname(file_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    with open(file_path, "w", encoding="utf-8") as file:
        file.write(text_data)
else:
    # Cached copy available: read it instead of hitting the network.
    with open(file_path, "r", encoding="utf-8") as file:
        text_data = file.read()

## Data loader for txt file

In [None]:
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors built from one text.

    Every target window is its input window shifted one token to the right.

    Args:
        txt: Text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a list of token IDs.
        max_length: Number of tokens in each window.
        stride: Offset, in tokens, between the starts of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Encode the full text in a single pass.
        ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Window start offsets; the upper bound leaves room for the target's
        # extra (shifted) token.
        starts = range(0, len(ids) - max_length, stride)

        self.input_ids = [torch.tensor(ids[s:s + max_length]) for s in starts]
        self.target_ids = [torch.tensor(ids[s + 1:s + max_length + 1]) for s in starts]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the (input, target) tensor pair stored at position ``idx``."""
        inputs = self.input_ids[idx]
        targets = self.target_ids[idx]
        return inputs, targets


def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches from raw text.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1`` using ``max_length`` and ``stride``; the remaining
    arguments are forwarded unchanged to ``DataLoader``.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )


# Read the text from the location the download cell wrote it to (`file_path`),
# rather than a bare "the-verdict.txt" that only exists if a copy happens to sit
# in the current working directory.
with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

tokenizer = tiktoken.get_encoding("gpt2")
encoded_text = tokenizer.encode(raw_text)

# Embedding-table sizes: number of token IDs, embedding width, and number of positions.
vocab_size = 50257
output_dim = 256
context_length = 1024


token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

# stride == max_length, so consecutive windows do not overlap.
max_length = 4
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=max_length, stride=max_length)

## Data loader for list of strings

Test that an `<|endoftext|>` token is inserted after each article, so that different articles stay separated in the token stream.

In [None]:
def encode_and_decode_example(list_of_strings):
    """Encode each string, append an <|endoftext|> token after it, and decode the result.

    Args:
        list_of_strings: Iterable of texts to encode in order.

    Returns:
        A tuple ``(all_tokens, decoded)``: the flat list of token IDs, with one
        <|endoftext|> ID after every string, and the text decoded from that list.
    """
    special = {"<|endoftext|>"}
    enc = tiktoken.get_encoding("gpt2")

    # ID the tokenizer assigns to the separator token.
    eot_id = enc.encode("<|endoftext|>", allowed_special=special)[0]

    all_tokens = []
    for passage in list_of_strings:
        all_tokens += enc.encode(passage, allowed_special=special)
        all_tokens.append(eot_id)

    return all_tokens, enc.decode(all_tokens)

# Try the function on two short passages: the printed token list should contain one
# <|endoftext|> ID after each passage, and the decoded text should show the separators.
string_sample = ['this is the first passage', 'this is the second']
tokens, decoded = encode_and_decode_example(string_sample)

print("Tokens:", tokens)
print("Decoded text:", decoded)

In [None]:
import torch
from torch.utils.data import Dataset

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors built from several articles.

    Each article is tokenized and followed by one <|endoftext|> token; the
    resulting token streams are concatenated and cut into windows. Every target
    window is its input window shifted one token to the right.

    Args:
        articles: Iterable of article strings. A single string is treated as
            one article.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a list of token IDs.
        max_length: Number of tokens in each window.
        stride: Offset, in tokens, between the starts of consecutive windows.
    """

    def __init__(self, articles, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # A bare string would otherwise be iterated character by character,
        # with a separator token appended after every single character.
        if isinstance(articles, str):
            articles = [articles]

        # Get the token ID for <|endoftext|>
        endoftext_token = tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"})[0]

        # Tokenize all articles, closing each one with the end-of-text token
        all_tokens = []
        for article in articles:
            all_tokens.extend(tokenizer.encode(article, allowed_special={"<|endoftext|>"}))
            all_tokens.append(endoftext_token)

        # Slide a window of max_length tokens over the stream; the range's upper
        # bound leaves room for the target's extra (shifted) token.
        for i in range(0, len(all_tokens) - max_length, stride):
            input_chunk = all_tokens[i:i + max_length]
            target_chunk = all_tokens[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the (input, target) tensor pair stored at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [None]:
import tiktoken
from torch.utils.data import DataLoader

def create_dataloader_v1(articles, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches from a list of articles.

    The articles are tokenized with tiktoken's "gpt2" encoding inside a
    ``GPTDatasetV1`` configured with ``max_length`` and ``stride``; the
    remaining arguments are forwarded unchanged to ``DataLoader``.
    """
    encoding = tiktoken.get_encoding("gpt2")
    article_windows = GPTDatasetV1(articles, encoding, max_length, stride)

    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
    return DataLoader(article_windows, **loader_options)

# Usage:
# articles = ['this is the first article', 'this is the second article']
# dataloader = create_dataloader_v1(articles)

## Load in dataset

In [None]:
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
def parse_sgm_to_dataframe(file_path: str) -> pd.DataFrame:
    """Parse an SGML file of <REUTERS> articles into a DataFrame.

    One row is produced per (article, topic) pair, so an article with several
    topics appears on several rows that share the same ``ID`` and ``Body``.

    Args:
        file_path: Path to the ``.sgm`` file.

    Returns:
        DataFrame with columns ``ID`` (the NEWID attribute), ``Topic`` and
        ``Body``. ``Topic`` is ``''`` when the <TOPICS> tag is present but
        holds no <D> entries, and ``None`` when there is no <TOPICS> tag.
        ``Body`` is ``''`` when the article has no <BODY> tag. The three
        columns are present even if the file contains no articles.
    """
    # Read as UTF-8 first; if the file contains bytes that are not valid UTF-8,
    # fall back to latin-1, which can decode any byte sequence.
    # NOTE(review): latin-1 is an assumption about such files' real encoding - confirm.
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            sgm_data = file.read()
    except UnicodeDecodeError:
        with open(file_path, 'r', encoding='latin-1') as file:
            sgm_data = file.read()

    # Parse the SGML data (html.parser lower-cases tag and attribute names)
    soup = BeautifulSoup(sgm_data, 'html.parser')

    rows = []
    for reuters in soup.find_all('reuters'):
        # The NEWID attribute serves as the article ID
        article_id = reuters.get('newid')

        body = reuters.find('body')
        body_text = body.get_text().strip() if body is not None else ''

        topics = reuters.find('topics')
        if topics is None:
            # No <topics> tag at all: a single row with None for Topic
            topic_values = [None]
        else:
            # One row per <d> entry; an empty <topics> tag yields a single '' row
            topic_values = [d.get_text().strip() for d in topics.find_all('d')] or ['']

        for topic in topic_values:
            rows.append({'ID': article_id, 'Topic': topic, 'Body': body_text})

    # Fix the column set explicitly so an empty result still has the expected columns.
    return pd.DataFrame(rows, columns=['ID', 'Topic', 'Body'])

In [None]:
# Parse one Reuters SGML file (reut2-000.sgm); the result has one row per (article, topic) pair.
df = parse_sgm_to_dataframe('../data/reuters21578/reut2-000.sgm')

In [None]:
# `df` holds one row per (article, topic) pair, so an article with several topics
# appears several times. Keep each article body once so the training text is not
# duplicated; rows for the same article share both ID and Body.
articles = df.drop_duplicates(subset=['ID', 'Body'])['Body'].tolist()

In [None]:
len(articles)

## Create data pipeline

In [None]:
# Sizes used to build the embedding tables below.
# vocab_size: number of rows in the token-embedding table (one per token ID).
vocab_size = 50257
# output_dim: width of each embedding vector.
output_dim = 256
# max_len / context_length: number of rows in the positional-embedding table.
max_len = 1024
context_length = max_len

In [None]:
import torch
import torch.nn as nn

# Two lookup tables with the same embedding width: one indexed by token ID,
# the other indexed by position within the sequence.
token_embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=output_dim)
pos_embedding_layer = nn.Embedding(num_embeddings=context_length, embedding_dim=output_dim)

In [None]:
# Short windows keep the printed samples readable; stride == max_length means
# consecutive windows do not overlap.
max_length = 4
dataloader = create_dataloader_v1(articles, batch_size=8, max_length=max_length, stride=max_length)

In [None]:
def inspect_batch(x, y, n_samples=2, tokenizer=None):
    """Print the decoded text and raw token IDs of the first few samples in a batch.

    Args:
        x: Batch of input token-ID sequences; each ``x[i]`` must support ``.tolist()``.
        y: Batch of target token-ID sequences, indexed in parallel with ``x``.
        n_samples: Maximum number of samples to print (capped at ``len(x)``).
        tokenizer: Object exposing ``decode(list_of_ids)``. Defaults to
            tiktoken's "gpt2" encoding when omitted.
    """
    # Build the tokenizer once instead of once per printed sample.
    if tokenizer is None:
        tokenizer = tiktoken.get_encoding("gpt2")

    for i in range(min(n_samples, len(x))):
        # Convert each tensor once and reuse the list for decoding and printing.
        input_ids = x[i].tolist()
        target_ids = y[i].tolist()

        print(f"\nSample {i+1}:")

        # Decode and print the input sequence
        print(f"Input text: {tokenizer.decode(input_ids)}")
        print(f"Input encoding: {input_ids}")

        # Decode and print the target sequence
        print(f"Target text: {tokenizer.decode(target_ids)}")
        print(f"Target encoding: {target_ids}")

        print("-" * 50)

In [None]:
# When True, the batch loop below prints decoded samples via inspect_batch.
INSPECT = True

In [None]:
# Embed a single batch: the loop body runs once and then breaks.
for batch in dataloader:
    # Each batch is an (inputs, targets) pair; assumes both are
    # (batch_size, max_length) tensors of token IDs - TODO confirm.
    x, y = batch

    if INSPECT:
        # Visual inspection
        inspect_batch(x, y)

    # Look up one output_dim-wide vector per token ID in x.
    token_embeddings = token_embedding_layer(x)
    # One vector per position 0..max_length-1, shared by every sample in the batch.
    pos_embeddings = pos_embedding_layer(torch.arange(max_length))

    # The positional vectors broadcast across the batch dimension.
    input_embeddings = token_embeddings + pos_embeddings

    break

In [None]:
print(input_embeddings.shape)