We need to generate the input-target pairs required for training an LLM.

Goal: implement a data loader that fetches the input-target pairs from the training dataset using a sliding window approach

In [2]:
import tiktoken

# Load tiktoken's "gpt2" encoding; later cells use it to encode and decode text.
tokenizer = tiktoken.get_encoding("gpt2")

In [43]:
# Read the whole training corpus into memory as one string.
with open("data/fadat_noticias.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Encode the full text into token IDs and print how many tokens it produced.
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

88630


## Data sampling with a sliding window

In [47]:
# Keep only the first 10 token IDs as a small sample for the demonstrations below.
enc_sample = enc_text[:10]
enc_sample

[32, 376, 2885, 1404, 2169, 401, 78, 1296, 283, 773]

In [9]:
context_size = 4 # number of tokens in the input sequence

# x is the first context_size tokens; y is the same window shifted one position
# to the right, so y[k] is the token that follows x[k] in the sample.
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]
print(f"x: {x}")
print(f"y:      {y}")


x: [376, 2885, 1404, 16614]
y:      [2885, 1404, 16614, 68]


In [17]:
# Each prefix of the sample (length 1..context_size) is a context; the token
# immediately after that prefix is the one the model should predict.
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(context, "---->", desired)

[376] ----> 2885
[376, 2885] ----> 1404
[376, 2885, 1404] ----> 16614
[376, 2885, 1404, 16614] ----> 68


In [18]:
# Same prefix -> next-token pairs as in the ID-based loop, but decoded to text.
# decode() takes a list of token IDs, hence the single-element list [desired].
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

 F ----> AD
 FAD ----> AT
 FADAT ---->  pretend
 FADAT pretend ----> e


<div style="text-align: center;">
    <img src="static/llm_dataloader.png" alt="Diagram showing a large language model represented by interconnected neural network nodes processing a sequence of text tokens. The nodes are arranged in layers, illustrating the flow of information from input to output. The workspace is digital and organized, with a neutral and focused atmosphere. No visible text is present in the image. The tone is analytical and educational, emphasizing the complexity and structure of language model data processing." width="800"/>
</div>

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID pairs.

    The text is tokenized once. Every window of ``max_length`` consecutive
    token IDs becomes an input row, and the same window shifted one token to
    the right becomes its target row. Consecutive windows start ``stride``
    tokens apart, so ``stride < max_length`` yields overlapping rows.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object with an ``encode(text)`` method returning a
            sequence of integer token IDs.
        max_length: Number of tokens in each input (and target) row; must be
            at least 1.
        stride: Distance between the starts of consecutive windows; must be
            at least 1.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Fail fast: a negative stride would otherwise silently produce an
        # empty dataset, and a non-positive max_length meaningless windows.
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt)  # tokenize the entire text

        # Stop max_length tokens before the end so that every target window
        # (shifted by one) is still fully inside token_ids.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1:i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) rows in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at row ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [23]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
    stride=128, shuffle=True, drop_last=True, num_workers=0
    ):
    """Build a DataLoader of sliding-window (input, target) batches from text.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``; the resulting dataset is served through a PyTorch
    ``DataLoader``.

    Args:
        txt: Raw text to tokenize.
        batch_size: Number of rows per batch.
        max_length: Number of tokens per input/target row.
        stride: Distance between the starts of consecutive windows.
        shuffle: Forwarded to ``DataLoader`` (whether to reshuffle rows).
        drop_last: Forwarded to ``DataLoader`` (whether to drop a final
            incomplete batch).
        num_workers: Forwarded to ``DataLoader`` (number of loader
            subprocesses; 0 loads in the main process).

    Returns:
        A ``DataLoader`` yielding ``[inputs, targets]`` batches.
    """
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        # Previously hard-coded to 0, which silently ignored the argument.
        num_workers=num_workers
    )

    return dataloader

In [25]:
# Reload the corpus so this cell does not depend on an earlier cell's raw_text.
with open("data/fadat_noticias.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# batch_size=1, max_length=4, stride=1: one 4-token window per batch, with each
# window starting one token after the previous one. shuffle=False keeps
# the windows in text order.
dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)

data_iter = iter(dataloader) # dataloader -> iterator
first_batch = next(data_iter)
print(first_batch)


[tensor([[  32,  376, 2885, 1404]]), tensor([[ 376, 2885, 1404, 2169]])]


Note: `max_length` is set to 4 here, which is very small; it was chosen only for illustration purposes.

It is common to train LLMs with input sizes of at least 256.

In [32]:
# Fresh loader with batch_size=1, max_length=4, stride=1 and no shuffling;
# fetch and print its first [inputs, targets] batch.
dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)
data_iter = iter(dataloader)
batch = next(data_iter)
print(batch)

[tensor([[  32,  376, 2885, 1404]]), tensor([[ 376, 2885, 1404, 2169]])]


In [36]:
# stride == max_length, so consecutive windows do not overlap. shuffle is left
# at create_dataloader_v1's default (True), so the rows of this batch are not
# in text order and will differ between runs.
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[  795,    79,  2301, 14991],
        [  267,   269,  5669,  8873],
        [44349, 10034,    64,    13],
        [   78,   384,  6592,  2418],
        [ 2319, 13276,    11,  2448],
        [13161,  4533, 31215,   257],
        [33380,   283, 38251,   435],
        [  390,   755,  8836, 13370]])

Targets:
 tensor([[   79,  2301, 14991,   312],
        [  269,  5669,  8873, 31215],
        [10034,    64,    13,   198],
        [  384,  6592,  2418,  6557],
        [13276,    11,  2448, 22019],
        [ 4533, 31215,   257,  1296],
        [  283, 38251,   435,  2188],
        [  755,  8836, 13370,   292]])


## Creating token embeddings

In [37]:
# Four toy token IDs used to demonstrate the embedding lookup below.
input_ids = torch.tensor([2, 3, 5, 1])

In [39]:
# Tiny illustrative sizes: 6 possible token IDs, each mapped to a 3-dim vector.
vocab_size = 6
output_dim = 3

# Fix the RNG seed so the randomly initialized weights are reproducible.
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [41]:
# Look up the embedding vector for the single token ID 3.
print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


If we compare the embedding vector for token ID 3 to the previous embedding matrix, we see that it is identical to the 4th row. 

In other words, the embedding layer is essentially a look-up operation that retrieves rows from the embedding layer's weight matrix via a token ID.


In [42]:
# Look up all four toy token IDs at once: one output row per ID, in input order.
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


This is how embedding vectors are created from token IDs. 

Let's add a small modification to these embedding vectors to encode positional information about a token within a text.

## Encoding word positions

The self-attention mechanism has no notion of position or order for the tokens within a sequence.


In principle, the deterministic, position-independent embedding of the token ID we coded is good for reproducibility purposes. 

However, since the self-attention mechanism of LLMs itself is also position-agnostic, it is helpful to inject additional position information into the LLM.

Absolute positional embeddings are directly associated with specific positions in a sequence. 

**For each position in the input sequence, a unique embedding is added to the token's embedding to convey its exact location.**

<div style="text-align: center;">
    <img src="static/positional_embeddings.png" alt="Positional embeddings diagram showing colored vectors aligned with a sequence of text tokens. Each vector represents a unique position in the sequence, visually illustrating how positional information is added to token embeddings in a neural network. The background is clean and minimal, emphasizing the technical and educational focus. No visible text is present. The tone is analytical and instructional, supporting understanding of how language models encode order and position." width="800"/>
</div>

- Positional embeddings are added to the token embedding vector to create the input embeddings for an LLM.
- The positional vectors have the same dimension as the original token embeddings.


Instead of focusing on the absolute position of a token, the emphasis of relative positional embeddings is on the relative position or distance between tokens. 

This means the model learns the relationships in terms of "how far apart" rather than "at which exact position."

The advantage here is that the model can generalize better to sequences of varying lengths, even if it hasn't seen such lengths during training.

Both types of positional embeddings aim to **augment the capacity of LLMs to understand the order and relationships between tokens**, ensuring more accurate and context-aware predictions. 

The choice between them often depends on the specific application and the nature of the data being processed.

<u>For example:</u> OpenAI's GPT models use absolute positional embeddings that are optimized during the training process rather than being fixed or predefined like the positional encodings in the original Transformer model.

This optimization process is part of the model training itself, which we will implement later in this book.

In [48]:
# NOTE(review): 50257 is presumably the vocabulary size of the "gpt2" encoding
# used above — confirm against tokenizer.n_vocab.
vocab_size = 50257
output_dim = 256  # length of each token's embedding vector
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [49]:
max_length = 4
# stride == max_length gives non-overlapping windows; shuffle=False keeps them
# in text order, so the first batch holds the first 8 windows of the corpus.
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)


Token IDs:
 tensor([[   32,   376,  2885,  1404],
        [ 2169,   401,    78,  1296],
        [  283,   773,   452,  8836],
        [  646,   418,  1451, 36096],
        [  390,  1323,  7718,   369],
        [  258,    66,  3681,   418],
        [  304,   390, 17463,   263],
        [ 7736,   528,  6557,    12]])

Inputs shape:
 torch.Size([8, 4])


In [50]:
# Replace every token ID with its output_dim-sized vector:
# (batch, max_length) -> (batch, max_length, output_dim).
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


For a GPT model's absolute embedding approach, we just need to create another embedding layer that has the same embedding dimension as `token_embedding_layer`:

In [None]:
context_length = max_length # represents the supported input size of the LLM.
# One embedding vector per position 0..context_length-1, with the same width
# (output_dim) as the token embeddings so the two can be added together.
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
# torch.arange(context_length) yields the position indices in order, so row k
# of pos_embeddings is the vector for position k.
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [54]:
# pos_embeddings (context_length, output_dim) is broadcast across the batch
# dimension of token_embeddings, so every sequence in the batch receives the
# same per-position vectors.
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])


In [57]:
# Display the positional embedding matrix (one row per position).
pos_embeddings

tensor([[ 1.7375, -0.5620, -0.6303,  ..., -0.2277,  1.5748,  1.0345],
        [ 1.6423, -0.7201,  0.2062,  ...,  0.4118,  0.1498, -0.4628],
        [-0.4651, -0.7757,  0.5806,  ...,  1.4335, -0.4963,  0.8579],
        [-0.6754, -0.4628,  1.4323,  ...,  0.8139, -0.7088,  0.4827]],
       grad_fn=<EmbeddingBackward0>)

In [56]:
# Display the token embeddings of the first sequence in the batch; the [:1]
# slice keeps the leading batch dimension.
token_embeddings[:1]

tensor([[[ 0.0308, -0.4387, -1.0033,  ..., -2.0125,  0.3858, -0.8801],
         [ 0.0240, -0.7000, -0.2415,  ..., -1.2735, -0.5178,  1.2304],
         [-0.2507, -0.0546,  0.6687,  ...,  0.9618,  2.3737, -0.0528],
         [-0.3368,  0.9981, -0.5168,  ..., -1.4778,  0.5504, -1.5233]]],
       grad_fn=<SliceBackward0>)