# Evaluate the Tokenizer

Metrics:
-> Final vocabulary size
-> Tokenization consistency across splits
-> Average tokens per sentence
-> Compression ratio (original chars / tokens)

In [64]:
from tokenizers import Tokenizer
import tokenizers
from datasets import load_dataset
import pandas as pd
from typing import List
import statistics
import re

# Minimum sentence length in the val and test set
MIN_LENGTH = 10

# Load the my tokenizer
tokenizer = Tokenizer.from_file("my_tokenier")

# # Create a HuggingFace wrapper around your tokenizer
# hf_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)

# # Save in HuggingFace format (creates multiple files)
# hf_tokenizer.save_pretrained("my_hf_tokenizer")

In [65]:
# Sanity check!
enc = tokenizer.encode("Hello world!")
print("Tokens:", enc.tokens)
print("Encoded IDs:", enc.ids)
dec = tokenizer.decode(enc.ids)
print("Decoded: ", dec)

Tokens: ['Hell', 'o ', 'world', '!']
Encoded IDs: [9279, 304, 5811, 7]
Decoded:  Hell o  world !


In [66]:
# Load the validation and test set
dataset = load_dataset("Salesforce/wikitext", "wikitext-2-v1")
print(f"Dataset splits: {dataset.keys()}")
print(f"Validation set size: {len(dataset['validation'])}")
print(f"Test set size: {len(dataset['test'])}")
validation_texts = dataset['validation']['text'] 
test_texts = dataset['test']['text']

Dataset splits: dict_keys(['test', 'train', 'validation'])
Validation set size: 3760
Test set size: 4358


In [67]:
# apply same preprocessing steps as that done to the training data# 2.3 Clean the Data

# Remove duplicates
val_data_clean = set(validation_texts)
test_data_clean = set(test_texts)

def clean_text(text):
    # Remove <unk> tokens
    text = re.sub(r'<unk>', '', text)
    # Replace all sequences of whitespace (spaces, tabs, newlines) with a single space, and remove leading/trailing spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text
# Remove <unk> tokens and normalize white spaces
val_data_clean = [clean_text(text) for text in val_data_clean]
test_data_clean = [clean_text(text) for text in test_data_clean]
# Remove empty and very short sequences
val_data_clean = [text for text in val_data_clean if len(text) > MIN_LENGTH]
test_data_clean = [text for text in test_data_clean if len(text) > MIN_LENGTH]

# Load clean train data 
with open("wikitext2_train_cleaned.txt", "r", encoding="utf-8") as f:
    train_data_clean = f.readlines()

In [68]:
encoded = tokenizer.encode("hit me baby one more time")
tokens = encoded.tokens

In [69]:
print(encoded)
print("Number of tokens:", len(encoded), len(tokens))

Encoding(num_tokens=7, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
Number of tokens: 7 7


**Final Vocab Size**

In [70]:
# Vocab Size
print("Tokenizer vocab size: " , tokenizer.get_vocab_size())

Tokenizer vocab size:  30000


**Tokenization Consistency Across Splits**

In [None]:
def compute_tokenization_stats(texts: List[str], tokenizer: tokenizers.Tokenizer, split_name: str):
    """Compute tokenization statistics for a given split"""
    total_tokens = 0
    total_chars = 0
    total_sentences = 0
    token_lengths = []
    tokens_per_sentences = []
    token_lengths_dict = {}
    
    for text in texts:
        if text.strip():  # Skip empty texts
            # Tokenize the text
            encoded = tokenizer.encode(text)
            tokens = encoded.tokens
            
            # Count statistics
            total_tokens += len(tokens)
            total_sentences += 1
            token_lengths.extend([len(token) for token in tokens])
            tokens_per_sentences.append(len(tokens))

            # track token lengths:
            for token in tokens:
                if token not in token_lengths_dict:
                    token_lengths_dict[token] = len(token)

    
    # Calculate averages
    avg_tokens_per_sentence = total_tokens / total_sentences if total_sentences > 0 else 0
    avg_token_length = sum(token_lengths) / len(token_lengths) if token_lengths else 0
    
    return {
        'split': split_name,
        'total_sentences': total_sentences,
        'total_tokens': total_tokens,
        'avg_tokens_per_sentence': avg_tokens_per_sentence,
        'median_tokens_per_sentence': statistics.median(tokens_per_sentences),
        'avg_token_length': avg_token_length,
        'median_token_length': statistics.median(token_lengths_dict.values()),
    }

# Compute stats for all splits
train_stats = compute_tokenization_stats(train_data_clean, tokenizer, "Train")
val_stats = compute_tokenization_stats(val_data_clean, tokenizer, "Validation") 
test_stats = compute_tokenization_stats(test_data_clean, tokenizer, "Test")

# Results
stats_df = pd.DataFrame([train_stats, val_stats, test_stats])
stats_df

Unnamed: 0,split,total_sentences,total_tokens,avg_tokens_per_sentence,median_tokens_per_sentence,avg_token_length,median_token_length
0,Train,21115,1648075,78.052332,72.0,6.268695,7.0
1,Validation,2286,166765,72.950569,66.0,6.221275,7.0
2,Test,2604,185057,71.066436,64.0,6.207855,7.0


Notes about the tokenization consistency across splits metric:
- All splits have identical median_token_length = 7.0
- Stable average token length (all similar to median token length)

**Compression Ratio**

In [None]:
# ----- Original test data -----
test_byte_size = sum(len(text.encode('utf-8')) for text in test_data_clean)
print(f"Test data size in bytes: {test_byte_size}")

Test data size in bytes: 1150234
Tokenized test data size in bytes: 185057


In [None]:
def compute_compression_metrics(texts, tokenizer):
    total_chars = 0
    total_bytes = 0
    total_tokens = 0
    
    for text in texts:
        if text.strip():
            encoded = tokenizer.encode(text)
            total_chars += len(text)                   
            total_bytes += len(text.encode('utf-8'))    # Memory usage, 1 character = 8 bites = 1 byte
            total_tokens += len(encoded.tokens)         
    
    char_per_token = total_chars / total_tokens   # Characters per token
    bytes_per_token = total_bytes / total_tokens  # Bytes per token
    
    return char_per_token, bytes_per_token

char_per_token, bytes_per_token = compute_compression_metrics(test_data_clean, tokenizer)
print(f"Characters per token: {char_per_token:.2f}")
print(f"Bytes per token: {bytes_per_token:.2f}")

 

Characters per token: 6.21
Bytes per token: 6.22


**Save Tokenizer**

In [93]:
# Convert to HuggingFace format for better compatibility
from transformers import PreTrainedTokenizerFast

# Create HuggingFace wrapper around your tokenizer
hf_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    # Add special tokens mapping
    pad_token="[PAD]",
    unk_token="[UNK]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]"
)

# Save in HuggingFace format (creates multiple files)
hf_tokenizer.save_pretrained("my_bpe_tokenizer_hf")
print("Tokenizer saved in HuggingFace format!")


Tokenizer saved in HuggingFace format!


# Push to Hugging Face HUB

In [98]:
from huggingface_hub import HfApi, create_repo

# 1 - Intialize the HF API
api = HfApi()

# 2 - Create the repository 
repo_id = "Rogarcia18/wikitext2-bpe-tokenizer"
# Check if repo exists, only create if it does not exist
try:
    api.repo_info(repo_id=repo_id, repo_type="model")
    print(f"Repo '{repo_id}' already exists on Hugging Face Hub.")
except Exception:
    create_repo(
        repo_id=repo_id,
        repo_type="model",
        private=False # Set to True if you want it private
    )
    print(f"Repo '{repo_id}' created on Hugging Face Hub.")

# 3 - Upload the tokenizer's files 
api.upload_folder(
    folder_path="my_bpe_tokenizer_hf",
    repo_id=repo_id,
    commit_message="Upload BPE tokenizer trained on WikiText-2"
)

# 4 - Upload model card

model_card_content = """
---
license: apache-2.0
tags:
- tokenizer
- bpe
- wikitext2
- nlp
---

# WikiText-2 BPE Tokenizer

A Byte Pair Encoding (BPE) tokenizer trained on the WikiText-2 dataset.

## Model Details
- **Vocabulary Size**: 30,000 tokens
- **Training Data**: WikiText-2 (Salesforce/wikitext)
- **Special Tokens**: [PAD], [UNK], [CLS], [SEP], [MASK]
- **Compression Ratio**: ~6.4 characters per token

## Usage
```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Rogarcia18/wikitext2-bpe-tokenizer")
```

## Training Details
- Dataset: WikiText-2 (wikitext-2-v1)
- Preprocessing: Deduplication, <unk> removal, whitespace normalization, remove samples cases with less than 10 characters
- Architecture: BPE with HuggingFace tokenizers library
"""
# Save model card
with open("my_bpe_tokenizer_hf/README.md", "w") as f:
    f.write(model_card_content)

api.upload_file(
    path_or_fileobj="my_bpe_tokenizer_hf/README.md",
    path_in_repo="README.md",
    repo_id=repo_id,
    commit_message="Add model card"
)

Repo 'Rogarcia18/wikitext2-bpe-tokenizer' already exists on Hugging Face Hub.


No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/Rogarcia18/wikitext2-bpe-tokenizer/commit/7cd0c069509785244550797bbebb6d8591a23fa2', commit_message='Add model card', commit_description='', oid='7cd0c069509785244550797bbebb6d8591a23fa2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Rogarcia18/wikitext2-bpe-tokenizer', endpoint='https://huggingface.co', repo_type='model', repo_id='Rogarcia18/wikitext2-bpe-tokenizer'), pr_revision=None, pr_num=None)

In [99]:
# Try using the tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Rogarcia18/wikitext2-bpe-tokenizer")

In [100]:
tokenizer.encode("This is my first tokenizer")

[4644, 330, 1309, 542, 909, 80, 314, 819, 293]