# Download Data

In [2]:
import urllib.request

url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"
filepath = "./data/the-verdict.txt"

urllib.request.urlretrieve(url, filepath)

('./data/the-verdict.txt', <http.client.HTTPMessage at 0x7f8cd0db9c70>)

In [3]:
with open(filepath, "r", encoding="utf-8") as f:
    raw_text = f.read()
print("Total Characters:", len(raw_text))
print(raw_text[0:99])

Total Characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


## Tokenization

### Basic Tokenizer

In [6]:
import re
text = "Hello, world! This, is a test."
result = re.split(r'(\s)', text)
result

['Hello,', ' ', 'world!', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']

## handling punctuations as a separate token

In [7]:
result = re.split(r'([,.:;?_!"()\']|--|\|\s)', text)
result

['Hello', ',', ' world', '!', ' This', ',', ' is a test', '.', '']

# remove whitespaces

In [8]:
result = [item.strip() for item in result  if item.strip()]
result

['Hello', ',', 'world', '!', 'This', ',', 'is a test', '.']

## Preprocessing raw text data 

In [9]:
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed  if item.strip()]
print("Total tokens:", len(preprocessed))
preprocessed[:10]

Total tokens: 4690


['I',
 'HAD',
 'always',
 'thought',
 'Jack',
 'Gisburn',
 'rather',
 'a',
 'cheap',
 'genius']

# Convert token into token IDs

This is an intermediate step before converting the tokenID into embedding vectors.
To map the the previously generated tokens into token IDs, we have to build the vocabulary first. 
This vocabulary defines how we map each unique word and special character to a unique integer.

In [10]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)

print("Vocabulary Size:", vocab_size)

vocab = {token:integer for integer, token in enumerate(all_words)}
list(vocab.items())[:50]

Vocabulary Size: 1130


[('!', 0),
 ('"', 1),
 ("'", 2),
 ('(', 3),
 (')', 4),
 (',', 5),
 ('--', 6),
 ('.', 7),
 (':', 8),
 (';', 9),
 ('?', 10),
 ('A', 11),
 ('Ah', 12),
 ('Among', 13),
 ('And', 14),
 ('Are', 15),
 ('Arrt', 16),
 ('As', 17),
 ('At', 18),
 ('Be', 19),
 ('Begin', 20),
 ('Burlington', 21),
 ('But', 22),
 ('By', 23),
 ('Carlo', 24),
 ('Chicago', 25),
 ('Claude', 26),
 ('Come', 27),
 ('Croft', 28),
 ('Destroyed', 29),
 ('Devonshire', 30),
 ('Don', 31),
 ('Dubarry', 32),
 ('Emperors', 33),
 ('Florence', 34),
 ('For', 35),
 ('Gallery', 36),
 ('Gideon', 37),
 ('Gisburn', 38),
 ('Gisburns', 39),
 ('Grafton', 40),
 ('Greek', 41),
 ('Grindle', 42),
 ('Grindles', 43),
 ('HAD', 44),
 ('Had', 45),
 ('Hang', 46),
 ('Has', 47),
 ('He', 48),
 ('Her', 49)]

Putting it now all together into a tokenizer class

In [11]:
class SimpleTokenizerV1:

    def __init__(self, vocab: dict):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)            
        tokens = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in tokens]
        return tokens, ids
        
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

* The encode function turns text into token IDs
* The decode function turns token IDs back into text

In [12]:
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""
print(text)
tokens, ids = tokenizer.encode(text)
print("Tokens:", tokens)
print("Encoded:", ids)
print("Decoded:", tokenizer.decode(ids))

"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride.
Tokens: ['"', 'It', "'", 's', 'the', 'last', 'he', 'painted', ',', 'you', 'know', ',', '"', 'Mrs', '.', 'Gisburn', 'said', 'with', 'pardonable', 'pride', '.']
Encoded: [1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
Decoded: " It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


## Adding Special context Tokens
It's useful to add some "special" tokens for unknown words and to denote the end of a text

* Some tokenizers use special tokens to help the LLM with additional context

* Some of these special tokens are

    * [BOS] (beginning of sequence) marks the beginning of text
    * [EOS] (end of sequence) marks where the text ends (this is usually used to concatenate multiple unrelated texts, e.g., two different Wikipedia articles or two    different books, and so on)
    *   [PAD] (padding) if we train LLMs with a batch size greater than 1 (we may include multiple texts with different lengths; with the padding token we pad the  shorter texts to the longest length so that all texts have an equal length)
    * [UNK] to represent words that are not included in the vocabulary

* Note that GPT-2 does not need any of these tokens mentioned above but only uses an <|endoftext|> token to reduce complexity

* The <|endoftext|> is analogous to the [EOS] token mentioned above

* GPT also uses the <|endoftext|> for padding (since we typically use a mask when training on batched inputs, we would not attend padded tokens anyways, so it does not matter what these tokens are)

* GPT-2 does not use an <UNK> token for out-of-vocabulary words; instead, GPT-2 uses a byte-pair encoding (BPE) tokenizer, which breaks down words into subword units which we will discuss in a later section

In [13]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token:integer for integer,token in enumerate(all_tokens)}
len(vocab.items())

1132

In [14]:
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


We also need to adjust the tokenizer accordingly so that it knows when and how to use the new <unk> token

In [15]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = { i:s for s,i in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [
            item if item in self.str_to_int 
            else "<|unk|>" for item in preprocessed
        ]

        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
        
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [16]:
tokenizer = SimpleTokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

text = " <|endoftext|> ".join((text1, text2))

print(text)
print(tokenizer.encode(text))
tokenizer.decode(tokenizer.encode(text))

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.
[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]


'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'

## BytePair Encoding (BPE)

BPE tokenizers break down unknown words into subwords and individual characters:

* GPT-2 used BytePair encoding (BPE) as its tokenizer
* it allows the model to break down words that aren't in its predefined vocabulary into smaller subword units or even individual characters, enabling it to handle out-of-vocabulary words
* For instance, if GPT-2's vocabulary doesn't have the word "unfamiliarword," it might tokenize it as ["unfam", "iliar", "word"] or some other subword breakdown, depending on its trained BPE merges
* The original BPE tokenizer can be found here: https://github.com/openai/gpt-2/blob/master/src/encoder.py
* In this chapter, we are using the BPE tokenizer from OpenAI's open-source tiktoken library, which implements its core algorithms in Rust to improve computational performance
* I created a notebook in the ./bytepair_encoder that compares these two implementations side-by-side (tiktoken was about 5x faster on the sample text)

In [17]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp38-cp38-macosx_10_9_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.7.0-cp38-cp38-macosx_10_9_x86_64.whl (961 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m962.0/962.0 kB[0m [31m387.6 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.7.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [21]:
import importlib
import tiktoken

#print("tiktoken version:", importlib.metadata.version("tiktoken"))

In [22]:
tiktoken.list_encoding_names()

['gpt2', 'r50k_base', 'p50k_base', 'p50k_edit', 'cl100k_base', 'o200k_base']

In [23]:
tokenizer = tiktoken.get_encoding("gpt2")

In [24]:
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunkonwnplace."
)

integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 2954, 261, 675, 5372, 13]


In [25]:
strings = tokenizer.decode(integers)

print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunkonwnplace.


In [26]:
"Vocabulary Size:", tokenizer.n_vocab

('Vocabulary Size:', 50257)

In [27]:
tokenizer.max_token_value

50256

In [28]:
# lets try how BPE handles unknown word: "akwirwdsjfodh ier"

text = "akwirwdsjfodh ier"
token_ids = tokenizer.encode(text)
print(token_ids)
print(tokenizer.decode(token_ids))

[461, 86, 343, 86, 9310, 73, 69, 375, 71, 220, 959]
akwirwdsjfodh ier


In [29]:
tokenizer.decode(list(range(100)))

'!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~������'

# Data sampling with a sliding window

We train LLMs to generate one word at a time, so we want to prepare the training data accordingly where the next word in a sequence represents the target to predict:


In [30]:
with open(filepath, "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


* For each text chunk, we want the inputs and targets
* Since we want the model to predict the next word, the targets are the inputs shifted by one position to the right

In [31]:
enc_text[:5], tokenizer.decode(enc_text[:5])

([40, 367, 2885, 1464, 1807], 'I HAD always thought')

In [32]:
# we remove first 50 tokens for demonstration purpose

enc_sample = enc_text[50:]
context_size = 4

x = enc_sample[:context_size]
y = enc_sample[1: context_size + 1]
print(f"x: {x}")
print(f"y:      {y}")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [None]:
for i in range(1, context_size + 1):
    context = enc_sample[:i]
    target = enc_sample[i]
    print(context, "-->", target)
    print(tokenizer.decode(context), "-->", tokenizer.decode([target]))
    print('---')

[290] --> 4920
 and -->  established
---
[290, 4920] --> 2241
 and established -->  himself
---
[290, 4920, 2241] --> 287
 and established himself -->  in
---
[290, 4920, 2241, 287] --> 257
 and established himself in -->  a
---


* We will take care of the next-word prediction in a later chapter after we covered the attention mechanism
* For now, we implement a simple data loader that iterates over the input dataset and returns the inputs and targets shifted by one

In [None]:
import torch

class GPTDatasetV1(torch.utils.data.Dataset):

    def __init__(self, text, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []
        token_ids = tokenizer.encode(text)
        for i in range(0, len(token_ids) - max_length, stride):
            x = token_ids[i: i + max_length]
            y = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(x))
            self.target_ids.append(torch.tensor(y))

    def __len__(self):
        return len(self.input_ids)
    

    def __getitem__(self, index):
        return self.input_ids[index], self.target_ids[index]


In [39]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, 
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):

    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [40]:
with open("./data/the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [41]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [42]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


* We can also create batched outputs
* Note that we increase the stride here so that we don't have overlaps between the batches, since more overlap could lead to increased overfitting

In [43]:
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


## Create Token Embeddings

* The data is already almost ready for an LLM
* But lastly let us embed the tokens in a continuous vector representation using an embedding layer
* Usually, these embedding layers are part of the LLM itself and are updated (trained) during model training

In [46]:
#Suppose we have the following four input examples with input ids 2, 3, 5, and 1 (after tokenization):
input_ids = torch.tensor([2, 3, 5, 1])

#For the sake of simplicity, suppose we have a small vocabulary of only 6 words and we want to create embeddings of size 3:
vocab_size = 6
output_dim = 3

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

#This would result in a 6x3 weight matrix:
embedding_layer.weight

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)

* For those who are familiar with one-hot encoding, the embedding layer approach above is essentially just a more efficient way of implementing one-hot encoding followed by matrix multiplication in a fully-connected layer, which is described in the supplementary code in ./embedding_vs_matmul

* Because the embedding layer is just a more efficient implementation that is equivalent to the one-hot encoding and matrix-multiplication approach it can be seen as a neural network layer that can be optimized via backpropagation

In [47]:
#To convert a token with id 3 into a 3-dimensional vector, we do the following:
embedding_layer(torch.tensor([3]))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)

* An embedding layer is essentially a look-up operation.

## Encoding Word Positions

* Embedding layer convert IDs into identical vector representations regardless of where they are located in the input sequence.
* Positional embeddings are combined with the token embedding vector to form the input embeddings for a large language model.
* The BytePair encoder has a vocabulary size of 50,257
* Suppose we want to encode the input tokens into a 256-dimensional vector representation:

In [48]:
vocab_size = 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [49]:
max_length = 4
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length,
    stride=max_length, shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [50]:
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


In [53]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

# uncomment & execute the following line to see how the embeddings look like
#print(token_embeddings)

torch.Size([8, 4, 256])


* GPT-2 uses absolute position embeddings, so we just create another embedding layer:

In [54]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

# uncomment & execute the following line to see how the embedding layer weights look like
# print(pos_embedding_layer.weight)

In [57]:
torch.arange(max_length)

tensor([0, 1, 2, 3])

In [55]:
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)


torch.Size([4, 256])


* To create the input embeddings used in an LLM, we simply add the token and the positional embeddings:

In [58]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
