# Using BytePair encoding from `tiktoken`

In [1]:
# !pip install tiktoken

In [2]:
import importlib.metadata

# Report which tiktoken release is installed in this environment.
print(f"tiktoken version: {importlib.metadata.version('tiktoken')}")

tiktoken version: 0.5.1


In [3]:
import tiktoken

# Load tiktoken's "gpt2" byte-pair encoding.
tik_tokenizer = tiktoken.get_encoding("gpt2")

# Sample sentence that later cells feed to the tokenizers being compared.
text = "Hello, world. Is this-- a test?"

In [4]:
# Encode the sample text into token IDs. `allowed_special` permits
# "<|endoftext|>" to be encoded as a special token should it appear in the
# input; the sample text defined earlier does not contain it.
integers = tik_tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]


In [5]:
# Decode the token IDs back into a string; the printed result should
# reproduce the original sample text.
strings = tik_tokenizer.decode(integers)

print(strings)

Hello, world. Is this-- a test?


In [6]:
# Print the size of the tokenizer's vocabulary.
print(tik_tokenizer.n_vocab)

50257


# Using the original Byte-pair encoding implementation used in GPT-2

In [7]:
from bpe_openai_gpt2 import get_encoder

In [8]:
# Build the encoder from the original GPT-2 BPE implementation.
# NOTE(review): presumably reads the vocabulary files from a "gpt2" folder
# under models_dir ("." = the current working directory) — confirm those files
# are present before running on a fresh checkout.
orig_tokenizer = get_encoder(model_name="gpt2", models_dir=".")

In [9]:
# Encode the same sample text with the original implementation. This rebinds
# `integers`; the printed IDs should match the tiktoken result.
integers = orig_tokenizer.encode(text)

print(integers)

[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]


In [10]:
# Decode the IDs back into text with the original implementation. This rebinds
# `strings`; the printed result should reproduce the sample text.
strings = orig_tokenizer.decode(integers)

print(strings)

Hello, world. Is this-- a test?


# Using the BytePair Tokenizer in HuggingFace transformers

In [11]:
# pip install transformers

In [12]:
import transformers

# Show the installed transformers version as the cell output.
transformers.__version__

  from .autonotebook import tqdm as notebook_tqdm


'4.33.2'

In [14]:
from transformers import GPT2Tokenizer

# Load the pretrained "gpt2" tokenizer through Hugging Face transformers.
hf_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [15]:
# Encode the original sample text (the same input given to the other two
# tokenizers) and show the resulting token IDs as the cell output. Using
# `text` rather than a previously decoded string keeps this cell independent
# of which decode cell ran last.
hf_tokenizer(text)["input_ids"]

[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]

# A quick performance benchmark

In [18]:
# Load the complete text file that serves as input for the timing runs.
with open('../01_main-chapter-code/the-verdict.txt', mode='r', encoding='utf-8') as verdict_file:
    raw_text = verdict_file.read()

In [19]:
# Time the original GPT-2 BPE implementation on the full text.
%timeit orig_tokenizer.encode(raw_text)

4.17 ms ± 18.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [20]:
# Time tiktoken's GPT-2 encoding on the same text.
%timeit tik_tokenizer.encode(raw_text)

1.68 ms ± 9.31 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [22]:
# Time the Hugging Face tokenizer on the same text. The sequence-length warning
# in the output appears because the text tokenizes to 5145 tokens, above the
# model's 1024-token maximum; the timed statement only runs the tokenizer.
%timeit hf_tokenizer(raw_text)["input_ids"]

Token indices sequence length is longer than the specified maximum sequence length for this model (5145 > 1024). Running this sequence through the model will result in indexing errors


8.81 ms ± 51.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [26]:
# Same measurement with truncation enabled. max_length=5145 is the token count
# reported in the warning from the untruncated run, so nothing should actually
# be cut off.
%timeit hf_tokenizer(raw_text, max_length=5145, truncation=True)["input_ids"]

8.8 ms ± 74 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
