# Understanding how a base tokenizer from Hugging Face works

In [1]:
from datasets import load_dataset
from tqdm import tqdm
from dataclasses import dataclass, field
from typing import Optional
from transformers import AutoTokenizer, HfArgumentParser
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Load the pretrained StarCoder tokenizer and dump its main settings, one per line.
tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoder")

for label, value in (
    ("Tokenizer class", type(tokenizer).__name__),
    ("Name or path", tokenizer.name_or_path),
    ("Vocab size", tokenizer.vocab_size),
    # NOTE(review): for this checkpoint the value printed is int(1e30), which looks
    # like a "no limit configured" placeholder rather than a real maximum input
    # length -- confirm against the model's own config before relying on it.
    ("Model max length", tokenizer.model_max_length),
    # getattr with a fallback so tokenizer objects without `is_fast` still print.
    ("Is fast tokenizer", getattr(tokenizer, "is_fast", "N/A")),
    ("Padding side", tokenizer.padding_side),
    ("Truncation side", tokenizer.truncation_side),
    ("Special tokens", tokenizer.special_tokens_map),
    ("Added tokens", tokenizer.added_tokens_decoder),
):
    print(f"{label}:", value)

Tokenizer class: GPT2TokenizerFast
Name or path: bigcode/starcoder
Vocab size: 49152
Model max length: 1000000000000000019884624838656
Is fast tokenizer: True
Padding side: right
Truncation side: right
Special tokens: {'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'additional_special_tokens': ['<|endoftext|>', '<fim_prefix>', '<fim_middle>', '<fim_suffix>', '<fim_pad>', '<filename>', '<gh_stars>', '<issue_start>', '<issue_comment>', '<issue_closed>', '<jupyter_start>', '<jupyter_text>', '<jupyter_code>', '<jupyter_output>', '<empty_output>', '<commit_before>', '<commit_msg>', '<commit_after>', '<reponame>']}
Added tokens: {0: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), 1: AddedToken("<fim_prefix>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), 2: AddedToken("<fim_middle>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=T

## **Encoding Example**

In [12]:
# Basic text -> token IDs -> text round trip.

text1 = "How will this be encoded?"
text2 = "[i**2 for i in range(10)]"

# encode() turns each string into a list of integer token IDs.
tks1 = tokenizer.encode(text=text1)
tks2 = tokenizer.encode(text=text2)
print(f"Tokenized text:\ntks1: {tks1}\ntks2: {tks2}")

# decode() maps the IDs back to a string. Print the *decoded* values (not the
# original inputs) so the output actually demonstrates the round trip.
back1 = tokenizer.decode(tks1)
back2 = tokenizer.decode(tks2)
print(f"\n\nDecoded text:\ntext1: {back1}\ntext2: {back2}")


Tokenized text:
tks1: [8257, 1098, 458, 526, 11830, 49]
tks2: [77, 91, 326, 36, 436, 595, 328, 2155, 26, 35, 34, 2177]


Decoded text:
text1: How will this be encoded?
text2: [i**2 for i in range(10)]


In [20]:
# Look at the string tokens (rather than the integer IDs) for both sample texts.
text1 = "How will this be encoded?"
text2 = "[i**2 for i in range(10)]"

tokens1 = tokenizer.tokenize(text1)
print(f"Text1 tokens: {tokens1}")

tokens2 = tokenizer.tokenize(text2)
print(f"Text2 tokens: {tokens2}")

# Why no standalone space tokens show up:
# this tokenizer folds a leading space into the token that follows it and
# renders that space as the marker character 'Ġ'. For example:
#   'Ġwill' stands for " will" (space + will)
#   'Ġthis' stands for " this" (space + this)


Text1 tokens: ['How', 'Ġwill', 'Ġthis', 'Ġbe', 'Ġencoded', '?']
Text2 tokens: ['[', 'i', '**', '2', 'Ġfor', 'Ġi', 'Ġin', 'Ġrange', '(', '1', '0', ')]']


In [21]:
# Show the token -> ID mapping for the first sentence.
# convert_tokens_to_ids takes a list of tokens and returns a list of IDs, so the
# whole list is converted in one call and paired back up with zip (no unused
# enumerate index, no one-element wrapper lists).
for token, token_id in zip(tokens1, tokenizer.convert_tokens_to_ids(tokens1)):
    print(f"'{token}' -> {token_id}")

'How' -> 8257
'Ġwill' -> 1098
'Ġthis' -> 458
'Ġbe' -> 526
'Ġencoded' -> 11830
'?' -> 49


In [33]:
 # Base alphabet used by GPT-2 style tokenizers: the values of the
 # bytes_to_unicode() mapping. The first entries are printable punctuation
 # characters, as the preview below shows.
 base_vocab = [*bytes_to_unicode().values()]
 print(base_vocab[:10], "....")

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*'] ....


# Data Streaming

In [34]:
from datasets import load_dataset

# streaming=True reads the dataset one example at a time.
dataset = load_dataset("smangrul/hug_stack", split="train", streaming=True)

# The iterator remembers the current position, like a bookmark in a book.
iter_dataset = iter(dataset)

# Header, blank line, then the dataset's own text representation.
print("DATASET:\n", dataset, sep="\n")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [40]:
# Each next() call on the iterator hands back exactly one record.
first_example = next(iter_dataset)

# Keys seen on a record: 'text', 'id', 'metadata', '__index_level_0__'
print(f"First example text length: {len(first_example['text'])}")


First example text length: 2195


In [50]:
# Option 1: pull individual examples by hand from a fresh stream.
dataset = load_dataset("smangrul/hug_stack", split="train", streaming=True)
iter_dataset = iter(dataset)

# Three consecutive next() calls, unpacked into three named records.
example_1, example_2, example_3 = [next(iter_dataset) for _ in range(3)]
print("Example 1:", example_1["text"][:100])

# Option 2: batch iterator. Re-create the stream first so batching starts from
# the first record again (i.e. no next() has been called on it yet).
dataset = load_dataset("smangrul/hug_stack", split="train", streaming=True)
iter_dataset = iter(dataset)

def batch_iterator(batch_size=3, num_batches=3, source=None):
    """Yield lists of ``'text'`` values pulled from a stream of records.

    Args:
        batch_size: Number of records per batch. The original version accepted
            this argument but ignored it in favour of a hard-coded 3.
        num_batches: Maximum number of batches to yield. The default of 3
            matches the original hard-coded ``range(0, 9, 3)``.
        source: Iterable or iterator of dict-like records with a ``'text'``
            key. When ``None``, the notebook-level ``iter_dataset`` is used, so
            consecutive calls keep consuming that shared iterator from where
            the previous call stopped.

    Yields:
        Lists of up to ``batch_size`` text values. If the stream runs out, the
        final batch may be shorter and iteration then stops cleanly.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (raised when iteration
            starts, because this is a generator).
    """
    from itertools import islice

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    stream = iter(iter_dataset if source is None else source)
    for _ in range(num_batches):
        # islice stops quietly at end-of-stream. A bare next() here would raise
        # StopIteration inside the generator, which Python turns into a
        # RuntimeError (PEP 479).
        batch = [record["text"] for record in islice(stream, batch_size)]
        if not batch:
            return
        yield batch

# Walk the batches, numbering them from 1, and preview the first text of each.
for batch_no, batch in enumerate(batch_iterator(), start=1):
    print(f"Batch {batch_no}: {batch[0][:100]}")

Example 1: <!--Copyright 2023 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Ve
Batch 1: <!--Copyright 2023 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Ve
Batch 2: # Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, V
Batch 3: from .constants import (
    MODEL_NAME,
    OPTIMIZER_NAME,
    RNG_STATE_NAME,
    SAFE_MODEL_NAME


# Training a New Tokenizer

The main purpose of training a new tokenizer is to create a custom tokenizer tailored to a specific language or domain (e.g., legal documents, medical texts), which often improves model performance.

In [51]:
from transformers import AutoTokenizer
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode

# `tokenizer` is the "bigcode/starcoder" tokenizer loaded earlier in the notebook.
# Its class is GPT2TokenizerFast, which is what the "GPT-2" in the banner below
# refers to.
original_tokenizer = tokenizer

# Sample code text used to compare tokenizers.
code_text = (
    "def fibonacci(n):\n"
    "    if n <= 1:\n"
    "        return n\n"
    "    return fibonacci(n-1) + fibonacci(n-2)"
)

# One print call, newline-separated; the trailing "" reproduces the blank line.
print(
    "=== ORIGINAL GPT-2 TOKENIZER ===",
    f"Text: {code_text}",
    f"Tokens: {original_tokenizer.tokenize(code_text)}",
    f"Token IDs: {original_tokenizer.encode(code_text)}",
    "",
    sep="\n",
)

# Tiny in-memory training corpus of code snippets for the new tokenizer.
# Each snippet is written one source line per string fragment for readability;
# adjacent fragments are concatenated by Python into a single string.
sample_code_texts = [
    (
        "def fibonacci(n):\n"
        "    if n <= 1:\n"
        "        return n\n"
        "    return fibonacci(n-1) + fibonacci(n-2)"
    ),
    (
        "class MyClass:\n"
        "    def __init__(self):\n"
        "        self.value = 0"
    ),
    (
        "import pandas as pd\n"
        "df = pd.read_csv('data.csv')"
    ),
    (
        "for i in range(10):\n"
        "    print(i)"
    ),
    (
        "def hello():\n"
        "    print('Hello, world!')"
    ),
]

def code_batch_iterator(texts=None, batch_size=2):
    """Yield consecutive slices of a list of code snippets.

    Args:
        texts: Sequence of strings to batch. When ``None``, the notebook-level
            ``sample_code_texts`` list is used, as in the original version.
        batch_size: Number of snippets per batch. The default of 2 matches the
            previously hard-coded value.

    Yields:
        Slices of ``texts`` of length ``batch_size``; the last slice is shorter
        when the length is not a multiple of ``batch_size``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (raised when iteration
            starts, because this is a generator).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    corpus = sample_code_texts if texts is None else texts
    for start in range(0, len(corpus), batch_size):
        yield corpus[start:start + batch_size]

# Train a new tokenizer from the original one, using the batched sample corpus.
# NOTE(review): the byte-level symbols are passed as `initial_alphabet`,
# presumably so they stay in the vocabulary even if some never occur in the
# sample texts -- confirm against the tokenizers trainer documentation.
base_vocab = [*bytes_to_unicode().values()]
new_tokenizer = original_tokenizer.train_new_from_iterator(
    code_batch_iterator(),
    initial_alphabet=base_vocab,
    vocab_size=1000,  # Small vocab for demo
)

# Report how the newly trained tokenizer splits the same snippet.
# One print call, newline-separated; the trailing "" reproduces the blank line.
print(
    "=== NEW CODE-TRAINED TOKENIZER ===",
    f"Text: {code_text}",
    f"Tokens: {new_tokenizer.tokenize(code_text)}",
    f"Token IDs: {new_tokenizer.encode(code_text)}",
    "",
    sep="\n",
)

# Side-by-side comparison: tokens first, then IDs, original before new.
# Labels are padded with spaces so the values line up in the output.
print("=== COMPARISON ===")
for label, render in (
    ("Original tokens:", original_tokenizer.tokenize),
    ("New tokens:    ", new_tokenizer.tokenize),
    ("Original IDs:  ", original_tokenizer.encode),
    ("New IDs:       ", new_tokenizer.encode),
):
    print(label, render(code_text))

=== ORIGINAL GPT-2 TOKENIZER ===
Text: def fibonacci(n):
    if n <= 1:
        return n
    return fibonacci(n-1) + fibonacci(n-2)
Tokens: ['def', 'Ġfib', 'onacci', '(', 'n', '):', 'ĊĠĠĠ', 'Ġif', 'Ġn', 'Ġ<=', 'Ġ', '1', ':', 'ĊĠĠĠĠĠĠĠ', 'Ġreturn', 'Ġn', 'ĊĠĠĠ', 'Ġreturn', 'Ġfib', 'onacci', '(', 'n', '-', '1', ')', 'Ġ+', 'Ġfib', 'onacci', '(', 'n', '-', '2', ')']
Token IDs: [589, 28176, 34682, 26, 96, 711, 284, 415, 310, 2511, 225, 35, 44, 291, 442, 310, 284, 442, 28176, 34682, 26, 96, 31, 35, 27, 474, 28176, 34682, 26, 96, 31, 36, 27]




=== NEW CODE-TRAINED TOKENIZER ===
Text: def fibonacci(n):
    if n <= 1:
        return n
    return fibonacci(n-1) + fibonacci(n-2)
Tokens: ['def', 'Ġfibonacci', '(', 'n', '):', 'ĊĠĠĠ', 'Ġif', 'Ġn', 'Ġ<=', 'Ġ', '1', ':', 'ĊĠĠĠĠĠĠĠ', 'Ġreturn', 'Ġn', 'ĊĠĠĠ', 'Ġreturn', 'Ġfibonacci', '(', 'n', '-', '1', ')', 'Ġ+', 'Ġfibonacci', '(', 'n', '-', '2', ')']
Token IDs: [293, 295, 26, 96, 279, 278, 362, 309, 353, 239, 35, 44, 313, 321, 309, 278, 321, 295, 26