In [25]:
import torch


# Number of distinct token IDs and the width of each embedding vector.
vocab_size, output_dim = 50257, 256

# Lookup table holding one `output_dim`-wide weight row per token ID.
# NOTE(review): "embadding" is a typo for "embedding"; the name is kept as-is
# because later cells reference it.
token_embadding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=output_dim,
)

In [26]:
# Echo the layer's repr to confirm its (num_embeddings, embedding_dim) configuration.
token_embadding_layer

Embedding(50257, 256)

In [27]:
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensor pairs.

    The text is tokenized once. Every window of ``max_length`` token IDs
    becomes an input, and the same window shifted one position to the right
    becomes its target. A text with ``max_length`` tokens or fewer yields an
    empty dataset.

    Args:
        txt: Text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)``;
            its result is sliced and passed to ``torch.tensor``, so it is
            expected to be a sequence of integer token IDs.
        max_length: Number of token IDs in every input and target window.
        stride: Step, in tokens, between the starts of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Encode the whole text in one pass, allowing the end-of-text marker.
        ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stopping at len(ids) - max_length guarantees that every target
        # window, which reaches one token past its input, is full length.
        window_starts = range(0, len(ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensors of window ``idx``."""
        return (self.input_ids[idx], self.target_ids[idx])

In [28]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a ``DataLoader`` over sliding token windows of ``txt``.

    The text is encoded with tiktoken's ``"gpt2"`` encoding and wrapped in a
    ``GPTDatasetV1``; the remaining arguments are forwarded to ``DataLoader``.

    Args:
        txt: Text to tokenize and split into windows.
        batch_size: Number of windows per batch.
        max_length: Number of token IDs per window.
        stride: Step, in tokens, between consecutive window starts.
        shuffle: Passed through to ``DataLoader``.
        drop_last: Passed through to ``DataLoader``.
        num_workers: Passed through to ``DataLoader``.

    Returns:
        A ``DataLoader`` yielding batches from the ``GPTDatasetV1`` dataset.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windowed_dataset = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
    return DataLoader(windowed_dataset, **loader_options)

In [29]:
# Use the %pip magic (not !pip) so the package is installed into the environment of the running kernel.
%pip install tiktoken



In [30]:
import tiktoken

In [31]:
# Sample passage fed to create_dataloader_v1 below. The leading newline and the
# four-space indentation are part of the string, so they are tokenized as well.
raw_text = """
    An LLM tokenizer breaks down human text (words, characters,
    subwords) into numerical "tokens" that Large Language Models
    (LLMs) can understand, forming a crucial step to translate 
    language into numbers for processing, impacting model efficiency, 
    cost, and context handling, with common methods like BPE balancing 
    vocabulary size and flexibility for various languages.
    Converts text to numbers: LLMs only process numbers, so the tokenizer maps text chunks (tokens) to unique integer IDs.
    Breaks down text: It splits sentences into words, parts of words (subwords), or individual characters, depending on the algorithm
"""

In [32]:
# Tiny window size so the printed batch stays readable.
max_length = 4

# stride == max_length makes consecutive windows start exactly one window
# apart (no overlap); shuffle=False keeps them in text order.
dataloader = create_dataloader_v1(
    txt=raw_text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,
    shuffle=False,
)

# Pull the first (inputs, targets) batch from the loader.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [33]:
# Show the first batch of input token IDs and its (batch_size, max_length) shape.
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[  198,   220,   220,   220],
        [ 1052, 27140,    44, 11241],
        [ 7509,  9457,   866,  1692],
        [ 2420,   357, 10879,    11],
        [ 3435,    11,   198,   220],
        [  220,   220,   850, 10879],
        [    8,   656, 29052,   366],
        [   83,   482,   641,     1]])

Inputs shape:
 torch.Size([8, 4])


In [34]:
# Replace every token ID with its embedding row: (8, 4) -> (8, 4, 256).
token_embeddings = token_embadding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [35]:
# Positional lookup table: one `output_dim`-wide row per position in the window.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(
    num_embeddings=context_length,
    embedding_dim=output_dim,
)

In [36]:
# Embed the position indices 0 .. max_length - 1; the result has one row per position.
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [37]:
# Broadcasting adds the same (4, 256) position embeddings to each of the 8 sequences in the batch.
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
