# Tokenization Warmup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer, BertTokenizer, GPT2Tokenizer

# Download the NLTK 'punkt' data package; the recorded cell output shows this is a
# no-op when the package is already up-to-date locally.
# NOTE(review): presumably this is the resource word_tokenize relies on; some NLTK
# releases may look for a differently named resource — confirm against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/raphael/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Part 1: Custom Tokenizer from Corpus

In [6]:
# 1.1 Define a simple corpus
# `corpus` is a list of five short English sentences; the tokenizer and vocabulary
# examples in the following cells are run against it.
corpus = [
    "Natural language processing (NLP) is a field of AI.",
    "Tokenization is the process of breaking text into tokens.",
    "Tokens can be words, subwords, or characters.",
    "Modern NLP uses transformer models like BERT and GPT.",
    "Fine-tuning allows adapting pre-trained models to specific tasks."
]
# Single-string view of the corpus: sentences joined with one space between them.
full_text = " ".join(corpus)
# Report the number of sentences and the character length of the joined text.
print(f"Corpus size: {len(corpus)} sentences, {len(full_text)} characters")

Corpus size: 5 sentences, 275 characters


In [7]:
# 1.2 Basic Word Tokenizer
def simple_word_tokenizer(text):
    """Return the lowercase words of ``text`` with punctuation deleted.

    The text is lowercased, every character that is neither a word character
    nor whitespace is removed (not replaced by a space, so "pre-trained"
    becomes "pretrained"), and the remainder is split on whitespace.

    Args:
        text (str): Text to tokenize.

    Returns:
        list: Word tokens in order of appearance.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return without_punctuation.split()

In [8]:
# 1.3 Simple Regex Tokenizer
def regex_tokenizer(text):
    """Split lowercased ``text`` into word tokens and single-symbol tokens.

    A token is either a maximal run of word characters (letters, digits,
    underscore) or one single character that is neither a word character nor
    whitespace. Whitespace itself never becomes a token.

    Args:
        text (str): Text to tokenize.

    Returns:
        list: Tokens in order of appearance.
    """
    token_re = re.compile(r'\w+|[^\w\s]')
    return token_re.findall(text.lower())


In [9]:
# 1.4 Character Tokenizer
def char_tokenizer(text):
    """Return every character of ``text`` as its own token.

    Nothing is normalised or dropped: case is preserved and spaces and
    punctuation are emitted as tokens too.
    """
    return [character for character in text]

In [10]:
# 1.5 Build a vocabulary from our corpus
def build_vocabulary(corpus, tokenizer_func):
    """Build a token -> id vocabulary from a corpus using the given tokenizer.

    Ids 0 and 1 are reserved for the special tokens "<PAD>" and "<UNK>".
    The remaining tokens receive ids 2, 3, ... in order of decreasing
    frequency (ties keep first-seen order).

    Args:
        corpus (iterable of str): Texts to tokenize.
        tokenizer_func (callable): Maps one text to a list of tokens.

    Returns:
        tuple: ``(vocabulary, id_to_token, token_counts)`` where ``vocabulary``
        maps token -> id, ``id_to_token`` is the inverse mapping used for
        decoding, and ``token_counts`` is a ``Counter`` of every token the
        tokenizer produced (including any that collide with a special token).
    """
    token_counts = Counter()
    for text in corpus:
        token_counts.update(tokenizer_func(text))

    # Special tokens come first so their ids are fixed regardless of the corpus.
    vocabulary = {
        "<PAD>": 0,  # Padding token
        "<UNK>": 1,  # Unknown token
    }

    next_id = len(vocabulary)
    for token, _ in token_counts.most_common():
        # A corpus token spelled exactly like a special token must not steal the
        # reserved id; otherwise id 0/1 would vanish from the reverse mapping.
        if token in vocabulary:
            continue
        vocabulary[token] = next_id
        next_id += 1

    # Reverse mapping (id -> token) for decoding.
    id_to_token = {idx: token for token, idx in vocabulary.items()}

    return vocabulary, id_to_token, token_counts

In [14]:
# 1.6 Implement a simple subword tokenizer (BPE-like approach)
def _merge_pair(symbols, pair, merged):
    """Return a copy of ``symbols`` with each adjacent ``pair`` replaced by ``merged``."""
    result = []
    last = len(symbols) - 1
    i = 0
    while i < len(symbols):
        # Compare whole symbols so a match can never straddle part of a longer symbol.
        if i < last and symbols[i] == pair[0] and symbols[i + 1] == pair[1]:
            result.append(merged)
            i += 2
        else:
            result.append(symbols[i])
            i += 1
    return result


def train_subword_tokenizer(corpus, vocab_size=100, min_frequency=2):
    """Train a very simplified Byte-Pair-Encoding (BPE) style vocabulary.

    The vocabulary starts as the sorted set of characters in the corpus.
    Whitespace-separated words are split into characters, and the most
    frequent adjacent symbol pair is merged into a new symbol repeatedly.
    Training stops when the vocabulary reaches ``vocab_size``, when no pairs
    remain, or when the best pair occurs fewer than ``min_frequency`` times.

    Merges are applied on symbol lists rather than by substring replacement,
    so a pair such as ('b', 'c') is never matched inside "ab c" (where the
    left symbol is "ab"). Every symbol in the returned words is therefore a
    vocabulary entry. A merged symbol is added to the vocabulary only once,
    even if two different pairs concatenate to the same string.

    Args:
        corpus (iterable of str): Training texts.
        vocab_size (int): Target vocabulary size (upper bound).
        min_frequency (int): Minimum count a pair needs in order to be merged.

    Returns:
        tuple: ``(vocab, words)`` where ``vocab`` is the list of symbols
        (initial characters followed by merges in creation order) and
        ``words`` holds each corpus word as its symbols joined by spaces.
    """
    # Initial vocabulary: every distinct character, including spaces.
    vocab = sorted({ch for text in corpus for ch in text})
    known = set(vocab)

    # Each word starts out as a list of single-character symbols.
    words = [list(word) for sentence in corpus for word in sentence.split()]

    while len(vocab) < vocab_size:
        # Count adjacent symbol pairs across all words.
        pairs = Counter()
        for symbols in words:
            pairs.update(zip(symbols, symbols[1:]))

        # Nothing left to merge (every word is a single symbol).
        if not pairs:
            break

        # Most frequent pair; ties resolve to the pair seen first.
        best_pair = max(pairs, key=pairs.get)
        if pairs[best_pair] < min_frequency:
            break

        new_token = ''.join(best_pair)
        if new_token not in known:
            known.add(new_token)
            vocab.append(new_token)

        # Each merge shortens at least one word, so the loop always terminates.
        words = [_merge_pair(symbols, best_pair, new_token) for symbols in words]

    return vocab, [" ".join(symbols) for symbols in words]

In [15]:
# 1.7 Create Encoder and Decoder functions
def encode(text, tokenizer_func, vocab, add_special_tokens=True):
    """
    Turn text into a list of vocabulary ids.

    The text is tokenized with ``tokenizer_func`` and each token is looked up
    in ``vocab``; tokens that are not present map to the id of "<UNK>".

    Args:
        text (str): Text to encode
        tokenizer_func (callable): Tokenizer function to use
        vocab (dict): Vocabulary mapping (token -> id); must contain "<UNK>"
        add_special_tokens (bool): Accepted for interface compatibility; the
            current implementation does not read it

    Returns:
        list: List of token IDs, one per token
    """
    # Out-of-vocabulary tokens fall back to the <UNK> id.
    return [vocab.get(piece, vocab["<UNK>"]) for piece in tokenizer_func(text)]

def decode(ids, id_to_token):
    """
    Decode token IDs back to text

    Ids are mapped to tokens (ids missing from ``id_to_token`` fall back to
    the token stored under id 1, i.e. <UNK>), the special tokens "<PAD>" and
    "<UNK>" are dropped, and the remaining tokens are joined with single
    spaces.

    Special tokens are filtered token by token rather than by substring
    replacement on the joined string, so they are also removed when they are
    the only tokens present (e.g. ``[1]`` or ``[1, 1]`` decodes to ``""``).

    Args:
        ids (list): List of token IDs
        id_to_token (dict): Reverse vocabulary mapping (id -> token)

    Returns:
        str: Decoded text
    """
    # Convert ids back to tokens, defaulting to <UNK> for unknown ids
    tokens = [id_to_token.get(i, id_to_token[1]) for i in ids]

    # Drop special tokens before joining so no stray separators or literals remain
    kept = [token for token in tokens if token not in ("<PAD>", "<UNK>")]

    # Simple space-joining (this is a simplification and won't work well for all tokenizer types)
    # A real decoder would need to be aware of the tokenization scheme
    return " ".join(kept)

In [17]:
# 1.8 Apply our tokenizers to the corpus and demonstrate encode/decode
# Demo cell: build one vocabulary per tokenizer over `corpus`, then round-trip a few
# example sentences through encode/decode to show how out-of-vocabulary input behaves.
print("\nApplying different tokenizers to our corpus and demonstrating encode/decode...")

# Word tokenizer
# build_vocabulary returns (token -> id, id -> token, token counts).
word_vocab, word_id_to_token, word_counts = build_vocabulary(corpus, simple_word_tokenizer)
print(f"Word vocabulary size: {len(word_vocab)} (including special tokens)")
print(f"Example word tokens for first sentence: {simple_word_tokenizer(corpus[0])}")

# Encode/decode example
# Words that never occur in the corpus are encoded as the <UNK> id (1) and are
# removed again by decode, so the round trip is lossy.
example_text = "NLP is an exciting field."
word_ids = encode(example_text, simple_word_tokenizer, word_vocab)
print(f"Encoded word ids: {word_ids}")
decoded_text = decode(word_ids, word_id_to_token)
print(f"Decoded text: '{decoded_text}'")

# Out-of-vocabulary example
# Most words of this sentence are absent from the corpus, so most ids are <UNK>.
oov_text = "Supercalifragilisticexpialidocious is a very long word!"
word_ids_oov = encode(oov_text, simple_word_tokenizer, word_vocab)
print(f"Encoded word ids with OOV: {word_ids_oov}")
decoded_text_oov = decode(word_ids_oov, word_id_to_token)
print(f"Decoded text with OOV: '{decoded_text_oov}'")

# Regex tokenizer
# Keeps punctuation as separate tokens, so its vocabulary is larger than the word one.
regex_vocab, regex_id_to_token, regex_counts = build_vocabulary(corpus, regex_tokenizer)
print(f"\nRegex vocabulary size: {len(regex_vocab)}")
print(f"Example regex tokens for first sentence: {regex_tokenizer(corpus[0])}")

# Character tokenizer
# Case-preserving: upper- and lowercase letters are distinct vocabulary entries.
char_vocab, char_id_to_token, char_counts = build_vocabulary(corpus, char_tokenizer)
print(f"\nCharacter vocabulary size: {len(char_vocab)}")
print(f"First 10 character tokens for first sentence: {char_tokenizer(corpus[0])[:10]}")

# Character tokenizer OOV handling - OOV is rarer at character level, but characters
# that never appear in the corpus (here '!' and the emoji) still map to <UNK>.
# decode joins tokens with spaces, so the decoded characters come back space-separated.
example_with_emoji = "NLP is fun! 😊"
char_ids = encode(example_with_emoji, char_tokenizer, char_vocab)
print(f"Character encoding with emoji: {char_ids}")
decoded_emoji = decode(char_ids, char_id_to_token)
print(f"Decoded text with emoji: '{decoded_emoji}'")


Applying different tokenizers to our corpus and demonstrating encode/decode...
Word vocabulary size: 39 (including special tokens)
Example word tokens for first sentence: ['natural', 'language', 'processing', 'nlp', 'is', 'a', 'field', 'of', 'ai']
Encoded word ids: [2, 3, 1, 1, 11]
Decoded text: 'nlp is field'
Encoded word ids with OOV: [1, 3, 10, 1, 1, 1]
Decoded text with OOV: 'is a'

Regex vocabulary size: 46
Example regex tokens for first sentence: ['natural', 'language', 'processing', '(', 'nlp', ')', 'is', 'a', 'field', 'of', 'ai', '.']

Character vocabulary size: 42
First 10 character tokens for first sentence: ['N', 'a', 't', 'u', 'r', 'a', 'l', ' ', 'l', 'a']
Character encoding with emoji: [21, 26, 22, 2, 9, 4, 2, 18, 16, 7, 1, 2, 1]
Decoded text with emoji: 'N L P   i s   f u n  '


### Questions:
1. Why do different tokenization approaches produce different numbers of tokens?
   - Compare the token counts from word, character, and subword tokenizers
   - How does the vocabulary size affect the number of tokens produced?
   - Which approach results in the longest sequences? The shortest?

2. How does subword tokenization handle out-of-vocabulary words better than word tokenization?
   - Test tokenizing 'Supercalifragilisticexpialidocious' with both approaches
   - How many `<UNK>` tokens does each approach produce?
   - Which approach preserves more information about the original word?

3. What are the trade-offs between character, word, and subword tokenization?
   - Consider sequence length, vocabulary size, and information preservation
   - Which approach works better for morphologically rich languages?
   - How do these approaches handle compound words differently?

4. How does case handling affect tokenization and model performance?
   - Compare case-preserving vs case-insensitive tokenization
   - When might preserving case be critical? When might it be unnecessary?
   - How does case handling affect vocabulary size?
   - What's the impact on proper nouns and acronyms?

5. What strategies could you use to handle case in a more efficient way?
   - Consider special tokens, preprocessing techniques, or embedding modifications
   - How do modern tokenizers like WordPiece and BPE handle case?
   - What's the difference between cased and uncased models?

6. How would you handle tokenization of multilingual text?
   - What challenges arise when tokenizing languages with different writing systems?
   - How do character-based approaches compare to word-based approaches for non-Latin scripts?
   - What strategies are used in multilingual models like mBERT?
   - How would you handle code-switching (mixing languages within a single text)?

7. How do tokenization choices impact downstream tasks?
   - For which tasks might character tokenization be superior?
   - When would word tokenization be preferred?
   - How does tokenization affect sequence length and thus model efficiency?

# Part 2: Pre-trained Tokenizers

In [19]:

# 2.1 NLTK Tokenizer
# Tokenize the first corpus sentence with NLTK's word_tokenize; per the recorded output
# it preserves case and emits punctuation and parentheses as separate tokens.
print("\nUsing NLTK pre-trained tokenizer...")
nltk_tokens = word_tokenize(corpus[0])
print(f"NLTK tokens: {nltk_tokens}")

# 2.2 Hugging Face Tokenizers
# For each pre-trained tokenizer: show the subword tokens, the integer ids, and the
# text recovered by decoding those ids.
print("\nUsing pre-trained tokenizers from Hugging Face...")

# BERT tokenizer (WordPiece)
print("Loading BERT tokenizer...")
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_tokens = bert_tokenizer.tokenize(corpus[0])
# In the recorded output encode() yields two more ids than tokenize() yields tokens:
# the sequence is wrapped in [CLS] ... [SEP], which also appear in the decoded text.
bert_ids = bert_tokenizer.encode(corpus[0])
print(f"BERT tokens: {bert_tokens}")
print(f"BERT token ids: {bert_ids}")
print(f"BERT decoded: {bert_tokenizer.decode(bert_ids)}")

# GPT-2 tokenizer (BPE)
print("\nLoading GPT-2 tokenizer...")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_tokens = gpt2_tokenizer.tokenize(corpus[0])
# In the recorded output the id list has the same length as the token list, and the
# decoded text reproduces the input sentence exactly.
# NOTE(review): the 'Ġ' prefix on printed tokens appears to mark a preceding space — confirm.
gpt2_ids = gpt2_tokenizer.encode(corpus[0])
print(f"GPT-2 tokens: {gpt2_tokens}")
print(f"GPT-2 token ids: {gpt2_ids}")
print(f"GPT-2 decoded: {gpt2_tokenizer.decode(gpt2_ids)}")

# 2.3 Compare tokenization approaches on our corpus
# Run every tokenizer on the same sentence and report its tokens and token count.
print("\nComparing tokenization approaches on our corpus...")
sentence = "Fine-tuning allows adapting pre-trained models to specific tasks."

# Number of character tokens to show; the full list is too long to print usefully.
CHAR_PREVIEW_LENGTH = 10

# Store the complete token list for every tokenizer so the reported counts are real.
tokenization_results = {
    "Simple Word": simple_word_tokenizer(sentence),
    "Regex": regex_tokenizer(sentence),
    "Character": char_tokenizer(sentence),
    "NLTK": word_tokenize(sentence),
    "BERT": bert_tokenizer.tokenize(sentence),
    "GPT-2": gpt2_tokenizer.tokenize(sentence)
}

for name, tokens in tokenization_results.items():
    # Truncate only what is displayed; the count below always uses the full list
    # (previously the character count reported the truncated preview length).
    if name == "Character":
        shown_tokens = tokens[:CHAR_PREVIEW_LENGTH] + ["..."]  # Showing just first 10 chars
    else:
        shown_tokens = tokens
    print(f"{name} Tokenizer: {shown_tokens}")
    print(f"Token count: {len(tokens)}\n")

# 2.4 Out-of-vocabulary handling
# Tokenize a sentence containing a long, rare word with both pre-trained subword
# tokenizers and print the resulting token lists. This is the same sentence used for
# the custom word tokenizer in Part 1, so the outputs can be compared directly.
print("\nHandling out-of-vocabulary words...")
oov_sentence = "Supercalifragilisticexpialidocious is a very long word!"

print(f"BERT tokens for OOV: {bert_tokenizer.tokenize(oov_sentence)}")
print(f"GPT-2 tokens for OOV: {gpt2_tokenizer.tokenize(oov_sentence)}")


Using NLTK pre-trained tokenizer...
NLTK tokens: ['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'field', 'of', 'AI', '.']

Using pre-trained tokenizers from Hugging Face...
Loading BERT tokenizer...
BERT tokens: ['natural', 'language', 'processing', '(', 'nl', '##p', ')', 'is', 'a', 'field', 'of', 'ai', '.']
BERT token ids: [101, 3019, 2653, 6364, 1006, 17953, 2361, 1007, 2003, 1037, 2492, 1997, 9932, 1012, 102]
BERT decoded: [CLS] natural language processing ( nlp ) is a field of ai. [SEP]

Loading GPT-2 tokenizer...
GPT-2 tokens: ['Natural', 'Ġlanguage', 'Ġprocessing', 'Ġ(', 'N', 'LP', ')', 'Ġis', 'Ġa', 'Ġfield', 'Ġof', 'ĠAI', '.']
GPT-2 token ids: [35364, 3303, 7587, 357, 45, 19930, 8, 318, 257, 2214, 286, 9552, 13]
GPT-2 decoded: Natural language processing (NLP) is a field of AI.

Comparing tokenization approaches on our corpus...
Simple Word Tokenizer: ['finetuning', 'allows', 'adapting', 'pretrained', 'models', 'to', 'specific', 'tasks']
Token count: 8

Regex