In [1]:
! pip3 install tiktoken



In [2]:
# Import the submodule explicitly: `import importlib` alone does not guarantee
# that `importlib.metadata` is loaded, so the attribute access below could
# raise AttributeError on a fresh interpreter.
import importlib.metadata

import tiktoken

# Report the installed tiktoken version so the recorded outputs can be reproduced.
print("tiktoken version:", importlib.metadata.version("tiktoken"))

tiktoken version: 0.12.0


In [3]:
# Load the encoding registered under the name "gpt2".
# `tokenizer` is reused by the encode/decode cells further down.
tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
# "gpt2" means: use the same tokenization rules and vocabulary that were used to train the GPT-2 model.

In [5]:
# "gpt2" encoding includes:
#  A vocabulary of 50,257 tokens (IDs 0 to 50256).
#  Byte pair encoding (BPE) merge rules that determine how text is split into subword tokens.
#  Mappings from text pieces → integer IDs and back.

### The usage of this tokenizer is similar to SimpleTokenizerV2

In [6]:
# Sample input: two text fragments separated by the <|endoftext|> marker.
# The two adjacent string literals are concatenated as-is (no separator is
# inserted between "terraces" and "of").
text = (
    "Hello,do you like tea? <|endoftext|> In the sunlit terraces"
    "of someunknownPlace."
)

# `allowed_special` names the special-token strings that encode() may map to
# their dedicated token IDs when they appear in the input text.
integer = tokenizer.encode(
    text,
    allowed_special={"<|endoftext|>"},
)

# `integer` is the list of token IDs produced for `text`.
print(integer)

[15496, 11, 4598, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


### Decode

In [7]:
# Map the token IDs from the encode cell back to text. The printed result
# matches the original input string, including the <|endoftext|> marker.
strings = tokenizer.decode(integer)

print(strings)

Hello,do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


In [None]:
# The BPE tokenizer was used to train models such as GPT-2, GPT-3, and the original model behind ChatGPT. In its vocabulary, <|endoftext|> is assigned the largest token ID (50256).

# BPE can handle unknown words without an <|unk|> token. How?

# The algorithm underlying BPE breaks down words that are not in its predefined vocabulary into smaller subword units, or even individual characters.
# This enables it to handle out-of-vocabulary (OOV) words.
# Thanks to the BPE algorithm, if the tokenizer encounters an unfamiliar word during tokenization, it can represent it as a sequence of subword tokens or characters.

# Another example: encoding and decoding a made-up string

In [11]:
# Round-trip a made-up string: encode it to token IDs, then decode the IDs
# back to text.
integer = tokenizer.encode("AKwirw ier jh")
strings = tokenizer.decode(integer)

# One print call with a newline separator: the ID list on the first line,
# the decoded text on the second.
print(integer, strings, sep="\n")

[10206, 86, 343, 86, 220, 959, 474, 71]
AKwirw ier jh
