Click [here](https://morihosseini.medium.com/from-characters-to-context-tokenization-in-llms-09b20abc42ed) to access the associated Medium article.

# Setup

In [24]:
# Install the third-party packages for this notebook with uv (shell escape).
# NOTE(review): no versions are pinned, so a fresh run may resolve newer
# releases than the ones that produced the recorded outputs below.
# `sentencepiece` is not imported directly by any visible cell — presumably an
# optional backend for `transformers`; confirm before dropping it.
!uv pip install nltk tiktoken tokenizers sentencepiece transformers

[2K[2mResolved [1m22 packages[0m in 314ms[0m                                                [0m
[2K[2mDownloaded [1m1 package[0m in 792ms[0m                                       [0m
[2K[2mInstalled [1m3 packages[0m in 219ms[0m4.40.2                                 [0m
 [32m+[39m [1mnumpy[0m[2m==1.26.4[0m
 [32m+[39m [1msafetensors[0m[2m==0.4.3[0m
 [32m+[39m [1mtransformers[0m[2m==4.40.2[0m


# Basic Tokenization Techniques

## Sentence Tokenization

In [11]:
import nltk
from nltk.tokenize import sent_tokenize

# sent_tokenize loads the Punkt sentence-boundary data at call time.
# NLTK releases up to 3.8.1 read the pickled "punkt" package; NLTK 3.9 and
# later read the "punkt_tab" package instead and raise LookupError when only
# "punkt" is present. The install cell does not pin NLTK, so fetch both to keep
# this cell working on whichever version gets resolved.
nltk.download("punkt")
nltk.download("punkt_tab", quiet=True)  # quiet: do not add to the cell's log output

text = "Tokenization is fascinating. Sentence tokenization splits text into sentences. It's crucial for NLP."
sentences = sent_tokenize(text)

# Print one detected sentence per line.
for sentence in sentences:
    print(sentence)

Tokenization is fascinating.
Sentence tokenization splits text into sentences.
It's crucial for NLP.


[nltk_data] Downloading package punkt to /Users/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Word Tokenization

In [14]:
import nltk
from nltk.tokenize import word_tokenize

# Break one sentence into word-level tokens. In the recorded output the
# trailing period comes back as a token of its own.
sentence = "Word tokenization is essential for NLP tasks."
word_tokens = word_tokenize(sentence)
print(word_tokens)

['Word', 'tokenization', 'is', 'essential', 'for', 'NLP', 'tasks', '.']


## Subword Tokenization

In [22]:
import tiktoken

text = "Subword tokenization with BPE is powerful."

# Look up the "cl100k_base" encoding and map the text to integer token ids.
enc = tiktoken.get_encoding("cl100k_base")
token_ids = enc.encode(text)

# Sanity check: decoding the ids must reproduce the input text exactly.
assert enc.decode(token_ids) == text

print(token_ids)

[3214, 1178, 4037, 2065, 449, 426, 1777, 374, 8147, 13]


# Advanced Tokenization Methods

## Byte-Level BPE (Byte-Pair Encoding)

In [18]:
from tokenizers import Tokenizer

# NOTE(review): this section is titled "Byte-Level BPE", but the checkpoint
# loaded here is "bert-base-uncased". The recorded output contains [CLS]/[SEP]
# markers and a `##e` continuation piece, i.e. BERT-style WordPiece tokens
# rather than byte-level BPE ones. Presumably a byte-level BPE checkpoint
# (e.g. "gpt2") was intended — confirm, swap the name, and re-run the cell so
# the output is regenerated.
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
encoded = tokenizer.encode("Byte-Level BPE is fascinating.")
# Map the ids back to text; the result is stored but never displayed here.
decoded = tokenizer.decode(encoded.ids)

# Show the string form of every token in the encoding.
print(encoded.tokens)

['[CLS]', 'byte', '-', 'level', 'bp', '##e', 'is', 'fascinating', '.', '[SEP]']


# Tokenization in Pretrained LLMs

In [33]:
from transformers import BertTokenizer, GPT2Tokenizer

# The same sentence is fed to both tokenizers so their outputs can be compared.
sentence = "Tokenization is fascinating."

# Pretrained vocabularies for the two models under comparison.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# In the recorded output, BERT lowercases the text and marks word-internal
# pieces with `##`, while GPT-2 keeps case and marks a leading space with `Ġ`.
tokens_by_model = {
    "BERT": bert_tokenizer.tokenize(sentence),
    "GPT-2": gpt2_tokenizer.tokenize(sentence),
}

for model_name, model_tokens in tokens_by_model.items():
    print(f"{model_name} tokens:", model_tokens)

BERT tokens: ['token', '##ization', 'is', 'fascinating', '.']
GPT-2 tokens: ['Token', 'ization', 'Ġis', 'Ġfascinating', '.']
