In [2]:
import urllib.request

import tiktoken

# Raw GitHub URL of the sample text used throughout this notebook.
url = "/".join([
    "https://raw.githubusercontent.com/rasbt",
    "LLMs-from-scratch/main/ch02/01_main-chapter-code",
    "the-verdict.txt",
])
# Local destination, relative to the notebook's working directory.
file_path = "../../../data/the-verdict.txt"
# Kept as the last expression so the returned (filename, headers) tuple is
# shown as the cell output.
urllib.request.urlretrieve(url=url, filename=file_path)

('../the-verdict.txt', <http.client.HTTPMessage at 0x1090b2c10>)

In [3]:
# Read the whole story into memory as a single string.
with open("../../../data/the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

# Report the size of the text and preview its first 99 characters.
print(f"Total number of characters: {len(raw_text)}")
print(raw_text[0:99])

Total number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [4]:
import re
# Split on whitespace. The capturing group makes the split keep the
# separators, so every whitespace character appears as its own list item.
#
# The text is deliberately not lower-cased: capitalization helps LLMs
# distinguish proper nouns from common nouns, understand sentence structure,
# and learn to generate text with proper capitalization.
text = "Hello, world. This, is a test."
whitespace_pattern = re.compile(r'(\s)')
result = whitespace_pattern.split(text)
print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [5]:
# Commas and periods become tokens of their own, in addition to whitespace.
pieces = re.split(r'([,.]|\s)', text)

# Discard pieces that are empty or consist only of whitespace.
result = []
for piece in pieces:
    if piece.strip():
        result.append(piece)
print(result)

# Whether whitespace is encoded as separate tokens or simply removed depends
# on the application and its requirements. Keeping whitespace can be useful
# when training models that are sensitive to the exact structure of the text
# (such as Python code); removing it reduces memory and compute requirements.

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [6]:
# Extended splitter: more punctuation marks and the double dash are also
# emitted as separate tokens.
text = "Hello, world. Is this-- a test?"
result = []
for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
    stripped = piece.strip()
    # Skip empty and whitespace-only pieces.
    if stripped:
        result.append(stripped)
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [7]:
# Apply the punctuation-aware splitter to the full story and keep only the
# non-empty, whitespace-stripped pieces.
preprocessed = [
    piece.strip()
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
    if piece.strip()
]
print(len(preprocessed))
print(preprocessed[:30])

# A vocabulary defines how each unique word and special character is mapped
# to a unique integer: every unique token gets its own token ID.
# Unique tokens are added to the vocabulary in sorted order; duplicates are
# removed.

4690
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [8]:
# Unique tokens in sorted order; a token's position is its token ID.
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)
print(vocab_size)

vocab = dict(zip(all_words, range(vocab_size)))

# Preview the first 51 entries (token IDs 0 through 50).
for entry in list(vocab.items())[:51]:
    print(entry)

# Given a new sample text, we tokenize it and use the existing vocabulary to
# convert its tokens into token IDs.
# The vocabulary is built from the entire training set and can be applied to
# the training set itself as well as to any new text sample.

1130
('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


**A complete tokenizer class** that:

- Splits text into tokens.
- Carries out the string-to-integer mapping to produce token IDs via the vocabulary.
- Implements a `decode` method that carries out the reverse integer-to-string mapping to convert the token IDs back into text.

In [9]:
class SimpleTokenizerV1:
    """Minimal word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, ``--`` and a set of punctuation characters;
    every resulting piece is looked up in the vocabulary to get its token ID.
    """

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse (ID -> string) lookup.

        Args:
            vocab: Mapping from token string to integer token ID.
        """
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Split ``text`` into tokens and return their token IDs as a list.

        Raises:
            KeyError: If a token is not part of the vocabulary.
        """
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # Drop empty / whitespace-only pieces produced by the split.
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        """Convert token IDs back into text.

        Raises:
            KeyError: If an ID is not part of the vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids])
        # The join puts a space in front of every token; remove it before
        # punctuation. ':' and ';' are included because encode() splits on
        # them too, so "a: b" round-trips instead of becoming "a : b".
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

# Round trip: encode a sentence with the vocabulary-based tokenizer, then
# decode the IDs back into text.
tokenizer = SimpleTokenizerV1(vocab)
text = "It's the last he painted, you know, Mrs.Gisburn said with pardonable pride."
ids = tokenizer.encode(text)
for shown in (ids, tokenizer.decode(ids)):
    print(shown)

[56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 67, 7, 38, 851, 1108, 754, 793, 7]
It' s the last he painted, you know, Mrs. Gisburn said with pardonable pride.


In [10]:
# The word "Hello" does not occur in "The Verdict", so it is missing from the
# vocabulary and encoding fails with a KeyError. The error is caught and
# printed so the demonstration does not stop a Restart & Run All.
# Larger, more diverse training sets are needed to extend the vocabulary when
# working with LLMs.
text = "Hello, do you like tea?"
try:
    print(tokenizer.encode(text))
except KeyError as err:
    print(f"KeyError: {err}")

# We can modify the tokenizer to use an <|unk|> token whenever it encounters a
# word that is not part of the vocabulary.
# We can also add an <|endoftext|> token to separate two unrelated text
# sources. This helps the LLM understand that although these sources are
# concatenated for training, they are, in fact, unrelated.

KeyError: 'Hello'

In [11]:
# Rebuild the vocabulary with the two special tokens appended after the
# sorted regular tokens, so they receive the two largest token IDs.
all_tokens = sorted(set(preprocessed))
all_tokens += ["<|endoftext|>", "<|unk|>"]
vocab = dict(zip(all_tokens, range(len(all_tokens))))
print(len(vocab))

1132


In [12]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``."""

    def __init__(self, vocab):
        """Keep the given token -> ID vocabulary and derive the ID -> token map."""
        self.str_to_int = vocab
        self.int_to_str = dict((token_id, token) for token, token_id in vocab.items())

    def encode(self, text):
        """Return the token IDs for ``text``; unknown tokens get the ``<|unk|>`` ID."""
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            # Skip empty and whitespace-only pieces produced by the split.
            if not token:
                continue
            # Fall back to the placeholder for anything outside the vocabulary.
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Turn token IDs back into text, with no space before punctuation."""
        tokens = [self.int_to_str[i] for i in ids]
        joined = " ".join(tokens)
        # Remove the space the join inserted in front of the listed punctuation.
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', joined)

# Two unrelated samples joined by the end-of-text marker.
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = f"{text1} <|endoftext|> {text2}"
print(text)

tokenizer = SimpleTokenizerV2(vocab)
encoded = tokenizer.encode(text)
print(encoded)
# Quick sanity check: decode the IDs back into text.
print(tokenizer.decode(encoded))

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.
[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]
<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


Depending on the LLM, some researchers also consider additional special tokens such as the following:

- `[BOS]` (beginning of sequence), marks the start of a text. Signifies to the LLM where a piece of content begins.
- `[EOS]` (end of sequence), positioned at the end of a text and is especially useful when concatenating multiple unrelated texts, similar to `<|endoftext|>`.
- `[PAD]` (padding), when training LLMs with batch sizes larger than one, the batch might contain texts of varying length. The shorter texts are extended, padded, up to the length of the longest text in the batch.


In [13]:
import tiktoken

# BPE breaks words down into subword units, so an unknown-token placeholder
# is no longer used or needed.
tokenizer = tiktoken.get_encoding("gpt2")
# Note: the sample has no space between "terraces" and "of"; the printed
# output reflects that.
text = "Helo, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace"
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)
strings = tokenizer.decode(integers)
print(strings)

# The <|endoftext|> token is assigned the largest token ID, out of 50257 tokens.
# BPE encodes and decodes unknown words correctly.

[12621, 78, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271]
Helo, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace


The algorithm underlying byte-pair encoding breaks down words that aren't in its predefined vocabulary into smaller subword units or even individual characters, enabling it to handle out-of-vocab words.
BPE builds its vocabulary by iteratively merging frequent characters into subwords and frequent subwords into words. It starts by adding all individual single characters to its vocab ("a", "b", etc.). Next, it merges character combinations that frequently occur together into subwords, each merge being determined by a frequency cutoff.

**Example:**
- The text sample "Akwirw ier" would first be tokenized into individual characters or subwords.
- "Ak", "w", "ir", "w", " ", "ier", each with their corresponding token IDs (33901, 86, 343, 86, 220, 959).

The ability to break down words into individual characters ensures that the tokenizer, and the LLM that is trained with it, can process any text, even if it contains words that were not present in its training data.

In [14]:
# Encode a made-up word and decode it again to show the subword round trip.
example = "Awkir ier"
integers_ex = tokenizer.encode(example, allowed_special={"<|endoftext|>"})
string_ex = tokenizer.decode(integers_ex)
print(f"{integers_ex} {string_ex}")

[23155, 74, 343, 220, 959] Awkir ier


### Data Sampling with a Sliding Window
We now create the input-target pairs required for training the LLM. LLMs are pretrained by predicting the next word in a text, and we mask out all words that are past the target.
**Example:**

- Input: "LLMs", target: "learn"
- Input: "LLMs learn", target: "to"
- Input: "LLMs learn to", target: "predict"
- Input: "LLMs learn to predict", target: "one"
- Input: "LLMs learn to predict one", target: "word"


In [15]:
# Re-read the story and encode it with the BPE tokenizer.
with open("../../../data/the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    raw_text = story_file.read()
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

# Work on the text with its first 50 tokens removed.
enc_sample = enc_text[50:]
context_size = 4  # how many tokens are included in the input

# The targets are the inputs shifted by one position.
x = enc_sample[0:context_size]
y = enc_sample[1:1 + context_size]
print(f"x: {x}")
print(f"y:      {y}")

5145
x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [16]:
# Next-token prediction pairs as token IDs: a growing context on the left,
# the token to predict on the right.
for end in range(1, context_size + 1):
    print(enc_sample[:end], "--->", enc_sample[end])

# The same pairs, decoded back into text.
for end in range(1, context_size + 1):
    context_text = tokenizer.decode(enc_sample[:end])
    target_text = tokenizer.decode([enc_sample[end]])
    print(context_text, "--->", target_text)

[290] ---> 4920
[290, 4920] ---> 2241
[290, 4920, 2241] ---> 287
[290, 4920, 2241, 287] ---> 257
 and --->  established
 and established --->  himself
 and established himself --->  in
 and established himself in --->  a


As a last step to enable embeddings, we need an efficient data loader that iterates over the input dataset and returns the inputs and targets as PyTorch tensors. We want to return two tensors:

- an input tensor containing the text that the LLM sees.
- an output (target) tensor that includes the targets for the LLM to predict.

Each row of the input tensor `x` (of size `max_length`) represents one input context. A second tensor, `y`, contains the corresponding prediction targets, which are created by shifting the input by one position.

In [17]:
import torch
from torch.utils.data import Dataset, DataLoader
import tiktoken

class GPTDataset(Dataset):
    """Sliding-window dataset of (input, target) token-ID pairs.

    The text is tokenized once. Item ``idx`` is the window of ``max_length``
    tokens starting at ``idx * stride``, paired with the same window shifted
    one token to the right.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize ``txt`` and work out how many full windows it contains.

        Args:
            txt: Text to tokenize.
            tokenizer: Object whose ``encode(txt)`` returns a list of token IDs.
            max_length: Number of tokens in each input/target row.
            stride: Number of tokens between the starts of consecutive windows.

        Raises:
            ValueError: If ``max_length`` or ``stride`` is smaller than 1.
        """
        if max_length < 1 or stride < 1:
            raise ValueError("max_length and stride must be positive integers")

        # Tokenize the entire text into token IDs in a single step.
        token_ids = tokenizer.encode(txt)
        self.token_ids = torch.tensor(token_ids, dtype=torch.long)
        self.max_length = max_length
        self.stride = stride

        # A window needs max_length + 1 tokens (inputs plus the shifted
        # target), so the last valid start index is len - max_length - 1.
        # Counting starts 0, stride, 2*stride, ... up to that index keeps the
        # final full window even when the length is not a multiple of the
        # stride. Texts that are too short yield 0 windows, not a negative count.
        last_start = len(token_ids) - max_length - 1
        self.n_windows = max(0, last_start // stride + 1)

    def __len__(self):
        """Return the total number of rows (windows) in the dataset."""
        return self.n_windows

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensors for window ``idx``.

        Negative indices count from the end.

        Raises:
            IndexError: If ``idx`` is outside the dataset.
        """
        if idx < 0:
            idx += self.n_windows
        if not 0 <= idx < self.n_windows:
            raise IndexError("GPTDataset index out of range")
        start = idx * self.stride
        chunk = self.token_ids[start : start + self.max_length + 1]
        # Clone so callers receive independent tensors rather than views into
        # the shared token buffer.
        return chunk[:-1].clone(), chunk[1:].clone()

def create_dataloader(txt,
                      tokenizer=None,
                      enc_name='gpt2',
                      batch_size=4,
                      max_length=256,
                      stride=128,
                      shuffle=True,
                      drop_last=True,
                      num_workers=0):
    """Build a ``DataLoader`` over sliding-window samples of ``txt``.

    Args:
        txt: Text to tokenize and sample from.
        tokenizer: Tokenizer passed to ``GPTDataset``. When ``None``, the
            tiktoken encoding named ``enc_name`` is loaded.
        enc_name: Name of the tiktoken encoding used when no tokenizer is given.
        batch_size: Number of windows per batch.
        max_length: Number of tokens per input/target row.
        stride: Number of tokens between the starts of consecutive windows.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDataset`` built from ``txt``.
    """
    # Explicit None check: a caller-supplied tokenizer must be used even if
    # the object happens to evaluate as falsy.
    if tokenizer is None:
        tokenizer = tiktoken.get_encoding(enc_name)
    dataset = GPTDataset(txt, tokenizer, max_length=max_length, stride=stride)
    return DataLoader(dataset,
                      batch_size=batch_size,
                      shuffle=shuffle,
                      # drop_last=True drops the last batch if it is shorter
                      # than batch_size, to prevent loss spikes during training.
                      drop_last=drop_last,
                      # Number of CPU processes to use for preprocessing.
                      num_workers=num_workers)


We test the `dataloader` with a batch size of 1 for an LLM with a context size of 4 to develop an intuition of how the `GPTDataset` class and the `create_dataloader` work together.

In [18]:
with open("../../../data/the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

dataloader = create_dataloader(
    raw_text,
    batch_size=1,
    max_length=4,
    stride=1,
    shuffle=False,
)
# Turn the dataloader into an iterator so entries can be fetched one by one
# with next().
data_iter = iter(dataloader)
first_batch = next(data_iter)
first_inputs, first_targets = first_batch
print(first_inputs)   # the input token IDs
print(first_targets)  # the target token IDs

# Since max_length is set to 4, each of the two tensors contains four token IDs.

tensor([[  40,  367, 2885, 1464]])
tensor([[ 367, 2885, 1464, 1807]])


In [19]:
# Fetch the next batch from the same iterator.
second_batch = next(data_iter)
second_inputs, second_targets = second_batch
print(second_inputs)
print(second_targets)

tensor([[ 367, 2885, 1464, 1807]])
tensor([[2885, 1464, 1807, 3619]])


If we compare the first and second batches, we see that the second batch's token IDs are shifted by only one position. The `stride` parameter dictates the number of positions the inputs shift across batches, emulating a sliding window approach.

Note that, in the case below, we increase the stride to 4 to utilize the data set fully. This avoids overlap while using every single word. **More overlap could lead to increased overfitting**.


In [20]:
# stride equals max_length here, so consecutive windows do not overlap.
dataloader = create_dataloader(
    raw_text,
    batch_size=8,
    max_length=4,
    stride=4,
    shuffle=False,
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
for label, batch in (("Inputs: \n", inputs), ("Targets: \n", targets)):
    print(label, batch)

Inputs: 
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Targets: 
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


In [21]:
# Toy setup: four tokens with IDs 2, 3, 5 and 1.
input_ids = torch.tensor([2, 3, 5, 1])
vocab_size = 6   # for simplicity, a small vocabulary of 6 words
output_dim = 3   # embeddings of size 3 (GPT3 has an embedding size of 12288)

# Seed before creating the layer so its random initial weights are reproducible.
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(num_embeddings=vocab_size,
                                     embedding_dim=output_dim)
# One row for each word, one column for each of the three dimensions.
print(embedding_layer.weight)

# Look up a single token ID, then all four input IDs at once.
single_id = torch.tensor([3])
print(embedding_layer(single_id))
print(embedding_layer(input_ids))

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)
tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)
tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


The embedding layer is essentially a lookup operation retrieving rows from the embedding layer's weight matrix, through token IDs. **The embedding layer is just a more efficient way of implementing one-hot encoding followed by a matrix multiplication in a fully connected layer**. It therefore can be seen as a neural network that can be optimized via backpropagation.

So far, the embedding layer maps the same token ID to the same vector representation, regardless of position in the input sequence. Indeed, a minor problem with LLMs is that their self-attention mechanism does not have a notion of position. It is therefore helpful to inject additional position information into the LLM.

- **Relative Positional Embeddings**: Emphasis is on the relative position or distance between tokens. The model learns relationships in terms of 'how far apart' rather than 'at which exact position'. The model can usually generalize better to sequences of varying length.
- **Absolute Positional Embeddings**: Directly associated with specific positions in a sequence. For each position in an input sequence, a unique embedding is added to the token's embedding to convey its location. Token Embedding: [1,1,1] --> Positional Embedding: [1.1, 1.2, 1.3] --> Input Embeddings: [2.1, 2.2, 2.3].

The choice between them depends on the specific application and the nature of the data being processed. OpenAI's models use absolute positional embeddings, but they are optimized in the training process rather than being fixed or predefined.

In [23]:
vocab_size = 50257
output_dim = 256
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

max_length = 4
# Sampling from this dataloader, each token in each batch is later embedded
# into a 256-dimensional vector. With a batch of 8 rows of four tokens each,
# the result will be an 8x4x256 tensor.
dataloader = create_dataloader(
    raw_text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,
    shuffle=False,
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


In [24]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

context_length = max_length  # supported input size of the LLM
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
# The input to the positional layer is a placeholder vector holding the
# sequence 0, 1, ..., context_length - 1.
positions = torch.arange(context_length)
pos_embeddings = pos_embedding_layer(positions)
print(pos_embeddings.shape)

# The positional embeddings are added to every row of the batch.
input_embeddings = torch.add(token_embeddings, pos_embeddings)
print(input_embeddings.shape)

torch.Size([8, 4, 256])
torch.Size([4, 256])
torch.Size([8, 4, 256])


We now have input embeddings that can be passed on to the subsequent layers of the LLM for further processing, and eventually used to generate output text.
