# Tokenization for LLM
1. Split text into words and subwords (tokens)
2. Convert/Assign each token to token id

* Book for data (The Verdict) - https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt


## Step 1: Creating Tokens

In [None]:
import os
import requests
import re

In [None]:
# Download the book once and cache it on disk; later runs read the cached copy
# file_path = "the-verdict.txt"
file_path = "alice_in_wonderland.txt"
# url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"
url = "https://raw.githubusercontent.com/kuemit/txt_book/refs/heads/master/examples/alice_in_wonderland.txt"
if not os.path.exists(file_path):
  response = requests.get(url, timeout=30)
  # Fail on an HTTP error status instead of caching an error page as the book
  response.raise_for_status()
  text_data = response.text
  with open(file_path, "w", encoding="utf-8") as file:
    file.write(text_data)
else:
  with open(file_path, "r", encoding="utf-8") as file:
    text_data = file.read()

print(f"Length of text: {len(text_data)} characters")
# Slice [:100] so the preview really holds the 100 characters the label promises
print(f"First 100 characters of text: {text_data[:100]}")

Length of text: 148208 characters
First 100 characters of text: TITLE: Alice's Adventures in Wonderland
AUTHOR: Lewis Carroll


= CHAPTER I = 
=( Down the Rabbit-H


In [None]:
# Split the text on punctuation, "--" and whitespace. The capture group keeps
# the delimiters as tokens; empty and whitespace-only pieces are discarded.
preprocessed = [
    stripped
    for stripped in map(str.strip, re.split(r'([,.:;?_!"()\']|--|\s)', text_data))
    if stripped
]
print(preprocessed[:30])

['TITLE', ':', 'Alice', "'", 's', 'Adventures', 'in', 'Wonderland', 'AUTHOR', ':', 'Lewis', 'Carroll', '=', 'CHAPTER', 'I', '=', '=', '(', 'Down', 'the', 'Rabbit-Hole', ')', '=', 'Alice', 'was', 'beginning', 'to', 'get', 'very', 'tired']


In [None]:
print(len(preprocessed))

34158


## Step 2: Creating Token IDs

In [None]:
# The vocabulary is the set of unique tokens, in sorted order
all_words = sorted(set(preprocessed))
vcab_size = len(all_words)
print(f"Vocab size: {vcab_size}")

Vocab size: 3189


In [None]:
# Assign every token its position in the sorted vocabulary as its token id
vcab = dict(zip(all_words, range(len(all_words))))
print(vcab)



## Tokenizer Class


In [None]:
class SimpleTokenizerV1:
  """Word-level tokenizer backed by a fixed token -> id vocabulary.

  Tokens that are missing from the vocabulary make ``encode`` raise
  ``KeyError``.
  """

  def __init__(self, vocab):
    """Store the token -> id mapping and build the inverse id -> token mapping."""
    self.str_to_int = vocab
    self.int_to_str = dict((token_id, token) for token, token_id in vocab.items())

  def encode(self, text):
    """Split ``text`` into tokens and return their ids as a list."""
    # The capture group keeps the punctuation delimiters as tokens of their own
    pieces = (chunk.strip() for chunk in re.split(r'([,.:;?_!"()\']|--|\s)', text))
    return [self.str_to_int[piece] for piece in pieces if piece]

  def decode(self, ids):
    """Join the tokens for ``ids`` with spaces and tidy spacing before punctuation."""
    joined = " ".join(map(self.int_to_str.__getitem__, ids))
    # Remove the whitespace that the join put in front of these punctuation marks
    return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)

In [None]:
# Round-trip a sample sentence through the V1 tokenizer
tokenizer = SimpleTokenizerV1(vcab)
text = """"It's the last he painted, you know,"
           Mrs. Gisburn said with pardonable pride."""
# NOTE: encode() looks every token up directly in the vocabulary, so any token
# that is not in `vcab` raises KeyError here
ids = tokenizer.encode(text)
print(f"Token IDs: {ids}")
decodes = tokenizer.decode(ids)
print(f"Word : {decodes}")

KeyError: 'painted'

In [None]:
# If a word in the sentence is not present in the vocab
# text = "Hello, do you like tea?"
# print(tokenizer.encode(text))
# Since "Hello" is not present, encode() will throw an error (KeyError)

## Special Context Tokens for Missing Words/Tokens

In [None]:
# To better handle this add two tokens, endoftext and unk
# Extend the vocabulary with two special tokens: an end-of-text marker and a
# placeholder for words that are not in the vocabulary
all_token = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]
vcab_size = len(all_token)
print(f"Vocab size: {vcab_size}")
vocab = dict(zip(all_token, range(vcab_size)))
print(len(vocab))


Vocab size: 3191
3191


In [None]:
# Show the last five vocabulary entries; the two special tokens were appended
# last, so they appear at the end. The index from enumerate() was never used,
# so iterate over the items directly.
for item in list(vocab.items())[-5:]:
  print(item)

('yourself', 3186)
('youth', 3187)
('zigzag', 3188)
('<|endoftext|>', 3189)
('<|unk|>', 3190)


In [None]:
class SimpleTokenizerV2:
  """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

  The vocabulary is expected to contain the ``<|unk|>`` entry; without it,
  encoding an unknown token raises ``KeyError``.
  """

  def __init__(self, vocab):
    """Store the token -> id mapping and build the inverse id -> token mapping."""
    self.str_to_int = vocab
    self.int_to_str = dict((token_id, token) for token, token_id in vocab.items())

  def encode(self, text):
    """Split ``text`` into tokens and return their ids, using ``<|unk|>`` for unknowns."""
    lookup = self.str_to_int
    # The capture group keeps the punctuation delimiters as tokens of their own
    pieces = (chunk.strip() for chunk in re.split(r'([,.:;?_!"()\']|--|\s)', text))
    return [
        lookup[piece] if piece in lookup else lookup["<|unk|>"]
        for piece in pieces
        if piece
    ]

  def decode(self, ids):
    """Join the tokens for ``ids`` with spaces and tidy spacing before punctuation."""
    joined = " ".join(map(self.int_to_str.__getitem__, ids))
    # Remove the whitespace that the join put in front of these punctuation marks
    return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)

In [None]:
# Two unrelated sentences, joined with the end-of-text marker between them
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join([text1, text2])

tokenizer = SimpleTokenizerV2(vocab)

print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [None]:
# Tokens that are not in the vocabulary are encoded with the id of "<|unk|>"
tokenizer.encode(text)

[3190,
 6,
 1289,
 3182,
 1920,
 2820,
 25,
 3189,
 198,
 2848,
 3190,
 3190,
 2139,
 2848,
 3190,
 8]

# Byte Pair Encoding
Not every word is a single token; a word can be broken down into multiple subword tokens.


* Word based tokenizer
* Subword based tokenizer
* Character based tokenizer

Byte Pair is subword tokenization

**The tiktoken Python library will be used.**




In [None]:
%pip install tiktoken -q

In [None]:
import tiktoken
print(f"tiktoken version: {tiktoken.__version__}")

tiktoken version: 0.12.0


In [None]:
## Byte pair tokenizer
# Load tiktoken's "gpt2" encoding. This rebinds `tokenizer`, replacing the
# SimpleTokenizerV2 instance created earlier in the notebook.
tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
## BPE encoding
# NOTE: the two adjacent string literals are concatenated with no separator, so
# the text contains "terracesof" (no space between the two words).
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)
# "<|endoftext|>" is passed in allowed_special; in the printed ids it shows up
# as a single id rather than being split into pieces
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [None]:
## BPE decoding
# Turn the token ids back into text
words = tokenizer.decode(integers)
print(words)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


In [None]:
## Another experiment: tokenize the whole downloaded text
articleTokenIds = tokenizer.encode(text_data)
print(f"articleTokenIds len : {len(articleTokenIds)}")
# Decode the ids back to text; comparing its length with len(text_data) is a
# quick sanity check of the round trip
aticleString = tokenizer.decode(articleTokenIds)
print(f"aticleString len : {len(aticleString)}")


articleTokenIds len : 42098
aticleString len : 148208


## INPUT-TARGET PAIRS

* implement data loader that fetches the input-target pairs using a sliding window approach

In [None]:
# Remove the first 200 tokens from the data set for this experiment.
# NOTE: the slice rebinds the same name, so re-running this cell drops
# another 200 tokens each time.
articleTokenIds = articleTokenIds[200:]

In [None]:
context_size = 4 # length of the input: the model looks at a sequence of 4 tokens (token ids, not words)
# x holds the input tokens; y holds the targets, i.e. the same window shifted
# one position to the right
x = articleTokenIds[:context_size]
y = articleTokenIds[1:context_size+1]
print(f"x: {x}")
print(f"y: {y}")


x: [503, 286, 262, 835]
y: [286, 262, 835, 284]


In [None]:
# Next-token prediction pairs: the first i token ids are the context and the
# token id at position i is the one to predict
for i in range(1, context_size+1):
  context = articleTokenIds[:i]
  desired = articleTokenIds[i]
  print(f"context: {context}, desired: {desired}")

context: [503], desired: 286
context: [503, 286], desired: 262
context: [503, 286, 262], desired: 835
context: [503, 286, 262, 835], desired: 284


In [None]:
# Same context/target pairs as above, decoded back to text for readability
for i in range(1, context_size+1):
  context = articleTokenIds[:i]
  desired = articleTokenIds[i]
  # decode() is given a list of ids, so the single target id is wrapped in a list
  print(f"{tokenizer.decode(context)} ----> {tokenizer.decode([desired])}")

 out ---->  of
 out of ---->  the
 out of the ---->  way
 out of the way ---->  to


#### Implement dataloader using PyTorch to convert it into tensor

In [None]:
# DataSet
import torch
from torch.utils.data import Dataset, DataLoader
class GPTDatasetV1(Dataset):
  """Sliding-window dataset of (input, target) token-id tensors.

  Every input window holds ``max_length`` consecutive token ids; its target is
  the same window shifted one token to the right. Window starts are ``stride``
  tokens apart.
  """

  def __init__(self, txt, tokenizer, max_length, stride):
    """Tokenize ``txt`` with ``tokenizer`` and cut it into windows."""
    # Tokenize the entire text
    token_id = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
    # Stop early enough that every target window still has max_length tokens
    window_starts = range(0, len(token_id) - max_length, stride)
    self.input_ids = [
        torch.tensor(token_id[start:start + max_length])
        for start in window_starts
    ]
    self.target_ids = [
        torch.tensor(token_id[start + 1:start + max_length + 1])
        for start in window_starts
    ]

  def __len__(self):
    """Return the number of windows."""
    return len(self.input_ids)

  def __getitem__(self, index):
    """Return the ``(input, target)`` tensor pair at ``index``."""
    return self.input_ids[index], self.target_ids[index]


In [None]:
# DataLoader
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                      stride=128, shuffle=True, drop_last=True,
                      num_workers=0):
  """Build a DataLoader of (input, target) token-id windows over ``txt``.

  Args:
    txt: Raw text to tokenize with the "gpt2" tiktoken encoding.
    batch_size: Number of windows per batch.
    max_length: Number of token ids in each window.
    stride: Step, in tokens, between the starts of consecutive windows.
    shuffle: Whether the DataLoader shuffles the windows.
    drop_last: Whether to drop a final batch smaller than ``batch_size``.
    num_workers: Number of worker processes passed to the DataLoader.

  Returns:
    A ``DataLoader`` wrapping a ``GPTDatasetV1`` built from ``txt``.
  """
  # Initialize the tokenizer
  tokenizer = tiktoken.get_encoding("gpt2")
  # Create the dataset
  dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

  # Forward the caller's `shuffle` value; it used to be hard-coded to True,
  # so shuffle=False was silently ignored
  dataloader = DataLoader(dataset,
                          batch_size=batch_size,
                          shuffle=shuffle,
                          drop_last=drop_last,
                          num_workers=num_workers)
  return dataloader




In [None]:
# Batches of 8 windows, each 4 tokens long. stride equals max_length, so
# consecutive input windows do not overlap.
dataloder = create_dataloader_v1(text_data, batch_size=8,
                                 max_length=4,
                                 stride=4,
                                 shuffle=False,
                                 num_workers=2)

# Each batch is a pair: the input-id tensor and the target-id tensor
dataloder_itr = iter(dataloder)
first_batch = next(dataloder_itr)
print(f"first_batch:\n {first_batch}")

first_batch:
 [tensor([[  345, 10783,   284,  1560],
        [  588,   262,   804,   286],
        [  198,    28,  5870, 29485],
        [  757, 13679,   628,   220],
        [  673,   198, 18108,   587],
        [  318,   340,  8348,   628],
        [  843,   262, 33958,   746],
        [  628,   220,  4600,  8241]]), tensor([[10783,   284,  1560,   502],
        [  262,   804,   286,   262],
        [   28,  5870, 29485,  6711],
        [13679,   628,   220,  4600],
        [  198, 18108,   587, 24504],
        [  340,  8348,   628,   220],
        [  262, 33958,   746,   261],
        [  220,  4600,  8241, 15986]])]


In [None]:
# Advance the same iterator to fetch the next batch
second_batch = next(dataloder_itr)
print(f"second_batch:\n {second_batch}")

second_batch:
 [tensor([[  220,   220,   220,   220],
        [  220,   314, 16498,   910],
        [   25,   198,    63,   270],
        [   13,   220,  4600,    40],
        [  220, 24430,   198,   220],
        [ 2540, 10868,   606,   510],
        [   40,   466,  4601,   314],
        [15986,   345,  3375,   284]]), tensor([[  220,   220,   220,   220],
        [  314, 16498,   910,   612],
        [  198,    63,   270,  6140],
        [  220,  4600,    40,  2492],
        [24430,   198,   220,   220],
        [10868,   606,   510,   757],
        [  466,  4601,   314,  8020],
        [  345,  3375,   284,  8348]])]


## Token Embeddings

* It converts words/tokens into their associated vectors
* Semantic meaning is preserved

In [None]:
# Example with a small embedding: 6 possible token ids, 3 values per token
example_input_ids = torch.tensor([[2, 3, 5, 1]])
example_vocab_size = 6
example_dim = 3

# Fixed seed so the randomly initialized weights are reproducible
torch.manual_seed(123)
# The weight matrix has one row per token id: example_vocab_size x example_dim
example_embedding_layer = torch.nn.Embedding(example_vocab_size, example_dim)
# example_output = example_embedding_layer(example_input_ids)
# print(example_output)
print(example_embedding_layer.weight)


Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [None]:
# Lookup in the embedding matrix: calling the layer with token id 3 gives the
# same values as reading row 3 of the weight matrix directly
print(example_embedding_layer(torch.tensor([3])))
print(example_embedding_layer.weight[3])

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)
tensor([-0.4015,  0.9666, -1.1481], grad_fn=<SelectBackward0>)


### Positional Encoding

<div class="alert alert-block alert-success">

Previously, we focused on very small embedding sizes in this chapter for illustration
purposes.

We now consider more realistic and useful embedding sizes and encode the input
tokens into a 256-dimensional vector representation.

This is smaller than what the original
GPT-3 model used (in GPT-3, the embedding size is 12,288 dimensions) but still reasonable
for experimentation.

Furthermore, we assume that the token IDs were created by the BPE
tokenizer that we implemented earlier, which has a vocabulary size of 50,257:

</div>

In [None]:
# Vocabulary size of the BPE tokenizer and the embedding dimension
vocab_size = 50257
output_dim = 256

# One output_dim-sized vector per token id
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [None]:
# Fetch one batch of 8 windows of max_length tokens each
max_length = 4
dataloader = create_dataloader_v1(
    text_data, batch_size=8, max_length=max_length,
    stride=max_length, shuffle=False
)
data_iter = iter(dataloader)
# Each batch unpacks into the input ids and the matching target ids
inputs, targets = next(data_iter)

In [None]:
# Inspect the input ids of the batch: one row per window, max_length ids per row
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[ 1560,   502,  4032,   531],
        [  198, 47436,   284,   262],
        [  198,    63,   403, 18049],
        [  464, 40289,     0,   220],
        [  287,   257,  1877,    11],
        [  475,    11,   706,  4964],
        [13679,   290,   262,  2677],
        [ 7898,   262,  4973,   379]])

Inputs shape:
 torch.Size([8, 4])


In [None]:
# Embed these token ids into 256-dimensional vectors: 8x4 ids -> 8x4x256
token_embeding = token_embedding_layer(inputs)
print(token_embeding.shape)
# token_embeding[0]

torch.Size([8, 4, 256])


##### create another embedding layer for positional encoding
It would be of size context_length(4)x256

In [None]:
# One learnable output_dim-sized vector per position in the context window
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)


In [None]:
# Look up the vectors for positions 0 .. max_length-1
pos_embedding = pos_embedding_layer(torch.arange(max_length))
print(pos_embedding.shape)

torch.Size([4, 256])


In [None]:
# Now add the token embeddings and the positional embeddings to get the final
# input embeddings; the 4x256 positional tensor is broadcast across the batch
# of 8 sequences
input_embeddings = token_embeding + pos_embedding
print(input_embeddings.shape)

torch.Size([8, 4, 256])
