# Fresh start for data preparation
1. Tokenization
2. Token Embedding
3. Positional Embedding
4. Input Embedding (token + positional)


### 1. Tokenization


###### Create tokens

In [None]:
import os
import requests
import re

In [None]:
# Download the book once and cache it on disk; later runs read the cached copy.
# file_path = "the-verdict.txt"
file_path = "alice_in_wonderland.txt"
# url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"
url = "https://raw.githubusercontent.com/kuemit/txt_book/refs/heads/master/examples/alice_in_wonderland.txt"
if not os.path.exists(file_path):
  # No cached copy yet: fetch the text and fail loudly on an HTTP error
  # instead of silently caching an error page.
  response = requests.get(url, timeout=30)
  response.raise_for_status()
  text_data = response.text
  with open(file_path, "w", encoding="utf-8") as file:
    file.write(text_data)
else:
  with open(file_path, "r", encoding="utf-8") as file:
    text_data = file.read()

print(f"Length of text: {len(text_data)} characters")
# Slice exactly 100 characters so the sample matches its label (was [:99]).
print(f"First 100 characters of text: {text_data[:100]}")


Length of text: 148208 characters
First 100 characters of text: TITLE: Alice's Adventures in Wonderland
AUTHOR: Lewis Carroll


= CHAPTER I = 
=( Down the Rabbit-H


###### Word based tokenization to start with

In [None]:
# Split the text into word-level tokens. The capture group keeps the
# punctuation and "--" delimiters as tokens of their own; whitespace-only and
# empty pieces are discarded.
preprocessed = [
    piece.strip()
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text_data)
    if piece.strip()
]
print(preprocessed[:30])
print(f"Length of preprocessed: {len(preprocessed)}")

['TITLE', ':', 'Alice', "'", 's', 'Adventures', 'in', 'Wonderland', 'AUTHOR', ':', 'Lewis', 'Carroll', '=', 'CHAPTER', 'I', '=', '=', '(', 'Down', 'the', 'Rabbit-Hole', ')', '=', 'Alice', 'was', 'beginning', 'to', 'get', 'very', 'tired']
Length of preprocessed: 34158


###### Creating token IDs

In [None]:
# Build the vocabulary: every unique token, in sorted order, gets a
# consecutive integer ID. `id_to_word` is the inverse mapping.
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(f"Vocab size: {vocab_size}")
vocab = {word: idx for idx, word in enumerate(all_words)}
id_to_word = {idx: word for idx, word in enumerate(all_words)}
# Round-trip one token through both mappings. The ID is looked up rather than
# hard-coded so the printed label always matches the index actually used.
alice_id = vocab["Alice"]
print(f" word_to_id['Alice']: {alice_id}")
print(f" id_to_word[{alice_id}]: {id_to_word[alice_id]}")


Vocab size: 3189
 word_to_id['Alice']: 40
 id_to_word[1]: Alice


In [None]:
class SimpleTokenizerV1:
  """Word-level tokenizer backed by a fixed vocabulary.

  Text is split on punctuation, double dashes and whitespace; each resulting
  token is mapped to its integer ID. Tokens missing from the vocabulary are
  not handled: `encode` raises `KeyError` for them.
  """

  def __init__(self, vocab):
    """Store the token -> ID mapping and build the inverse ID -> token mapping.

    Args:
      vocab: dict mapping token strings to integer IDs.
    """
    self.str_to_int = vocab
    self.int_to_str = {i:s for s,i in vocab.items()}

  def encode(self, text):
    """Convert `text` into a list of token IDs.

    Raises:
      KeyError: if a token of `text` is not in the vocabulary.
    """
    # The capture group keeps the delimiters (punctuation, "--") as tokens.
    preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
    # Drop whitespace-only and empty pieces produced by the split.
    preprocessed = [item.strip() for item in preprocessed if item.strip()]
    ids = [self.str_to_int[s] for s in preprocessed]
    return ids

  def decode(self, ids):
    """Convert a list of token IDs back into a single string.

    Tokens are joined with single spaces, then the space in front of
    punctuation is removed. This is a readability heuristic, not an exact
    inverse of `encode`.

    Raises:
      KeyError: if an ID is not in the vocabulary.
    """
    text = " ".join([self.int_to_str[i] for i in ids])
    # Remove the space that the join inserted before punctuation. The previous
    # pattern replaced every match with itself, so it changed nothing.
    text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
    return text



In [None]:
# Encode the whole book with the word-level tokenizer and preview the first
# 100 token IDs. Every token is in `vocab` because the vocabulary was built
# from this same text.
tokenizer = SimpleTokenizerV1(vocab)
ids = tokenizer.encode(text_data)
print(ids[:100])
# print(tokenizer.decode(ids[:100]))

[377, 22, 40, 2, 2489, 35, 1772, 447, 33, 22, 222, 83, 24, 76, 184, 24, 24, 3, 110, 2848, 321, 4, 24, 40, 3066, 913, 2904, 1589, 3039, 2901, 2139, 2635, 1011, 1702, 2631, 2152, 2848, 879, 6, 805, 2139, 1680, 2121, 2904, 1289, 22, 2153, 2167, 2974, 2573, 1649, 2223, 1795, 2848, 954, 1702, 2631, 3066, 2394, 6, 1005, 1806, 1649, 2110, 2244, 2167, 1149, 1772, 1806, 6, 663, 3093, 1804, 2848, 3019, 2139, 736, 954, 6, 2, 2870, 40, 734, 2244, 2167, 1148, 25, 2, 349, 2573, 3066, 1139, 1772, 1702, 2184, 2029, 3, 846, 3087, 846]


In [None]:
# Decode the first 50 token IDs back into text as a round-trip check.
print(tokenizer.decode(ids[:50]))

TITLE : Alice ' s Adventures in Wonderland AUTHOR : Lewis Carroll = CHAPTER I = = ( Down the Rabbit-Hole ) = Alice was beginning to get very tired of sitting by her sister on the bank , and of having nothing to do : once or twice she


##### Add special tokens for handling unknown text

In [None]:
# Rebuild the vocabulary with two special tokens appended after the sorted
# words, so they receive the two highest IDs.
all_tokens = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]

vocab = dict(zip(all_tokens, range(len(all_tokens))))
len(vocab)

3191

In [None]:
# Show the last five vocabulary entries to confirm the special tokens were
# appended at the end. The index from enumerate() was never used, so iterate
# over the entries directly.
for item in list(vocab.items())[-5:]:
  print(item)

('yourself', 3186)
('youth', 3187)
('zigzag', 3188)
('<|endoftext|>', 3189)
('<|unk|>', 3190)


In [None]:
class SimpleTokenizerV2:
  """Word-level tokenizer that maps out-of-vocabulary tokens to "<|unk|>".

  Text is split on punctuation, double dashes and whitespace. The vocabulary
  must contain a "<|unk|>" entry for unknown tokens to be encodable.
  """

  def __init__(self, vocab):
    """Store the token -> ID mapping and build the inverse ID -> token mapping.

    Args:
      vocab: dict mapping token strings to integer IDs.
    """
    self.str_to_int = vocab
    self.int_to_str = {i:s for s,i in vocab.items()}

  def encode(self, text):
    """Convert `text` into a list of token IDs.

    Tokens that are not in the vocabulary are replaced by "<|unk|>".

    Raises:
      KeyError: if an unknown token is met and "<|unk|>" is not in the vocabulary.
    """
    # The capture group keeps the delimiters (punctuation, "--") as tokens.
    preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
    # Drop whitespace-only and empty pieces produced by the split.
    preprocessed = [item.strip() for item in preprocessed if item.strip()]
    # Look up "<|unk|>" only when it is needed, so a vocabulary without it
    # still works for fully known text.
    ids = [
        self.str_to_int[s if s in self.str_to_int else "<|unk|>"]
        for s in preprocessed
    ]
    return ids

  def decode(self, ids):
    """Convert a list of token IDs back into a single string.

    Tokens are joined with single spaces, then the space in front of
    punctuation is removed. This is a readability heuristic, not an exact
    inverse of `encode`.

    Raises:
      KeyError: if an ID is not in the vocabulary.
    """
    text = " ".join([self.int_to_str[i] for i in ids])
    # Remove the space that the join inserted before punctuation. The previous
    # pattern replaced every match with itself, so it changed nothing.
    text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
    return text

In [None]:
# Re-encode the book with the tokenizer that supports "<|unk|>". This rebinds
# the notebook-level `tokenizer` and `ids` names used by the earlier cells.
tokenizer = SimpleTokenizerV2(vocab)
ids = tokenizer.encode(text_data)
print(ids[:100])

[377, 22, 40, 2, 2489, 35, 1772, 447, 33, 22, 222, 83, 24, 76, 184, 24, 24, 3, 110, 2848, 321, 4, 24, 40, 3066, 913, 2904, 1589, 3039, 2901, 2139, 2635, 1011, 1702, 2631, 2152, 2848, 879, 6, 805, 2139, 1680, 2121, 2904, 1289, 22, 2153, 2167, 2974, 2573, 1649, 2223, 1795, 2848, 954, 1702, 2631, 3066, 2394, 6, 1005, 1806, 1649, 2110, 2244, 2167, 1149, 1772, 1806, 6, 663, 3093, 1804, 2848, 3019, 2139, 736, 954, 6, 2, 2870, 40, 734, 2244, 2167, 1148, 25, 2, 349, 2573, 3066, 1139, 1772, 1702, 2184, 2029, 3, 846, 3087, 846]


In [None]:
# Decode the first 50 token IDs back into text as a round-trip check.
print(tokenizer.decode(ids[:50]))

TITLE : Alice ' s Adventures in Wonderland AUTHOR : Lewis Carroll = CHAPTER I = = ( Down the Rabbit-Hole ) = Alice was beginning to get very tired of sitting by her sister on the bank , and of having nothing to do : once or twice she


#### Byte pair encoding

In [None]:
%pip install tiktoken -q

In [None]:
import tiktoken
print(f"tiktoken version: {tiktoken.__version__}")

tiktoken version: 0.12.0


In [None]:
# Byte pair encoding (BPE) tokenizer: load tiktoken's "gpt2" encoding.
tokenizerBPE = tiktoken.get_encoding("gpt2")

In [None]:
# Encode the whole book with the BPE tokenizer and report the token count.
# NOTE(review): allowed_special presumably lets a literal "<|endoftext|>" in the
# text be encoded as a special token — confirm against the tiktoken docs.
integers = tokenizerBPE.encode(text_data, allowed_special={"<|endoftext|>"})
print(len(integers))

42098


In [None]:
# Compare the vocabulary sizes of several tiktoken encodings.
# NOTE(review): the "gpt3"/"gpt4" labels are shorthand chosen in this notebook;
# confirm which models actually use p50k_base / cl100k_base in the tiktoken docs.
encodings = {
    "gpt2": tiktoken.get_encoding("gpt2"),
    "gpt3": tiktoken.get_encoding("p50k_base"),
    "gpt4": tiktoken.get_encoding("cl100k_base")
}
vocab_sizes = {model: encoding.n_vocab for model, encoding in encodings.items()}
# Use a dedicated loop name: iterating with `vocab_size` would overwrite the
# notebook-level `vocab_size` variable with the last encoding's size.
for model, n_vocab in vocab_sizes.items():
  print(f"vocabulary size for {model.upper()}: {n_vocab}")


vocabulary size for GPT2: 50257
vocabulary size for GPT3: 50281
vocabulary size for GPT4: 100277


### 2. Token Embedding

##### Input-Target pairs

In [None]:
# BPE-encode the whole book; `enc_text` is the token-ID list that the
# input/target examples below slice from.
enc_text = tokenizerBPE.encode(text_data, allowed_special={"<|endoftext|>"})
print(len(enc_text))

42098


In [None]:
# Number of tokens the model sees as input when predicting the next token.
context_size = 4

# The targets are the inputs shifted one position to the right: if the input x
# holds tokens [1, 2, 3, 4], the target y holds tokens [2, 3, 4, 5].
x, y = enc_text[:context_size], enc_text[1:1 + context_size]

print(f"x: {x}")
print(f"y:      {y}")

x: [49560, 2538, 25, 14862]
y:      [2538, 25, 14862, 338]


In [None]:
# Next-token prediction tasks contained in one window: each prefix of length i
# is the context, and the token right after it is the desired prediction.
for i in range(1, context_size+1):
    context = enc_text[:i]
    desired = enc_text[i]

    print(context, "---->", desired)

[49560] ----> 2538
[49560, 2538] ----> 25
[49560, 2538, 25] ----> 14862
[49560, 2538, 25, 14862] ----> 338


In [None]:
# Same context -> target pairs as token IDs, decoded back to text for readability.
for i in range(1, context_size+1):
    context = enc_text[:i]
    desired = enc_text[i]

    # decode() takes a list of IDs, so the single target ID is wrapped in a list.
    print(tokenizerBPE.decode(context), "---->", tokenizerBPE.decode([desired]))

TIT ----> LE
TITLE ----> :
TITLE: ---->  Alice
TITLE: Alice ----> 's


##### Data Loader

In [None]:
# `torch` is imported here because the class calls torch.tensor; relying on an
# import in a later cell only works if that cell happens to run first.
import torch
from torch.utils.data import Dataset, DataLoader
class GPTDatasetV1(Dataset):
  """Sliding-window dataset of (input, target) token-ID tensors.

  The text is tokenized once. A window of `max_length` tokens is moved over
  the token sequence in steps of `stride`; each window is an input and the
  same window shifted one token to the right is its target.
  """

  def __init__(self, txt, tokenizer, max_length, stride):
    """Tokenize `txt` and pre-compute all input/target windows.

    Args:
      txt: text to tokenize.
      tokenizer: object exposing `encode(text, allowed_special=...)` that
        returns a list of integer token IDs.
      max_length: number of tokens in each input and each target window.
      stride: number of tokens between the starts of consecutive windows.
    """
    self.input_id = []
    self.target_ids = []

    token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
    # Stop max_length tokens before the end so every target window, which is
    # shifted by one token, still fits inside the sequence.
    for i in range(0, len(token_ids)-max_length, stride):
      input_chunk = token_ids[i:i+max_length]
      target_chunk = token_ids[i+1:i+max_length+1]
      self.input_id.append(torch.tensor(input_chunk))
      self.target_ids.append(torch.tensor(target_chunk))

  def __len__(self):
    """Return the number of windows in the dataset."""
    return len(self.input_id)

  def __getitem__(self, idx):
    """Return the `(input, target)` tensor pair at position `idx`."""
    return self.input_id[idx], self.target_ids[idx]



In [None]:
def create_dataloader_v1(text, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
  """Build a DataLoader of sliding-window (input, target) batches from `text`.

  The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
  `GPTDatasetV1` using `max_length` and `stride`. The remaining arguments are
  forwarded unchanged to `torch.utils.data.DataLoader`.
  """
  bpe = tiktoken.get_encoding("gpt2")
  windows = GPTDatasetV1(text, bpe, max_length, stride)
  return DataLoader(
      windows,
      batch_size=batch_size,
      shuffle=shuffle,
      drop_last=drop_last,
      num_workers=num_workers,
  )


In [None]:
import torch
print("PyTorch version:", torch.__version__)
max_length = 4
# Pass `max_length` instead of repeating the literal 4, so the window size used
# here cannot drift from the value later reused as the context length.
# stride == max_length gives non-overlapping windows.
dataloader = create_dataloader_v1(
    text_data, batch_size=8, max_length=max_length, stride=max_length, shuffle=False
)

# Pull the first batch only; shuffle=False keeps it in text order.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

PyTorch version: 2.9.0+cpu
Inputs:
 tensor([[49560,  2538,    25, 14862],
        [  338, 15640,   287, 42713],
        [  198,    32, 24318,  1581],
        [   25, 10174, 21298,   628],
        [  198,    28,  5870, 29485],
        [  314,   796,   220,   198],
        [16193,  5588,   262, 25498],
        [   12,    39,  2305,  1267]])

Targets:
 tensor([[ 2538,    25, 14862,   338],
        [15640,   287, 42713,   198],
        [   32, 24318,  1581,    25],
        [10174, 21298,   628,   198],
        [   28,  5870, 29485,   314],
        [  796,   220,   198, 16193],
        [ 5588,   262, 25498,    12],
        [   39,  2305,  1267,    28]])


In [None]:
# 50257 is the vocabulary size of the "gpt2" BPE encoding used by the dataloader.
vocab_size = 50257
# Size of each token's embedding vector.
output_dim = 256

# Seed the RNG so the randomly initialised embedding weights, and everything
# derived from them, are the same on every Restart & Run All.
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
embedding_layer

Embedding(50257, 256)

In [None]:
# Look up an embedding vector for every token ID in the batch. The recorded
# output shape is (8, 4, 256): batch size, tokens per window, output_dim.
token_embeddings = embedding_layer(inputs)
token_embeddings.shape

torch.Size([8, 4, 256])

### 3. Positional Embedding

In [None]:
# One learnable vector per position in the input window, with the same
# width (output_dim) as the token embeddings so the two can be added.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [None]:
# Embed the position indices 0 .. max_length-1; the recorded output shape is (4, 256).
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


### 4. Input Embedding

In [None]:
# Add the positional vectors to the token vectors. `pos_embeddings` has no
# batch dimension, so it is broadcast across every sequence in the batch.
# The name is `token_embeddings` (plural): `token_embedding` is never defined
# and raised NameError on a fresh kernel.
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
