### 1. Implement Graphene-pair encoding tokernizer

In [19]:
import grapheme
from collections import defaultdict, Counter
import json
import re

class GraphemeBPETokenizer:
    def __init__(self):
        self.grapheme_to_id = {}
        self.id_to_grapheme = {}
        self.merges = []
        self.vocab = {}
        
    def _get_graphemes(self, text):
        """Extract graphemes from text using grapheme library"""
        return list(grapheme.graphemes(text))
    
    def _get_word_tokens(self, text):
        """Split text into words and convert each word to grapheme sequence"""
        # Simple word splitting - you might want to improve this for Sinhala
        words = re.findall(r'\S+|\s+', text)
        word_tokens = []
        
        for word in words:
            if word.isspace():
                word_tokens.append([word])
            else:
                graphemes = self._get_graphemes(word)
                # Add end-of-word marker to distinguish word boundaries
                graphemes.append('</w>')
                word_tokens.append(graphemes)
        
        return word_tokens
    
    def _get_pairs(self, word_tokens):
        """Get all adjacent pairs of graphemes/tokens"""
        pairs = defaultdict(int)
        
        for word in word_tokens:
            for i in range(len(word) - 1):
                pair = (word[i], word[i + 1])
                pairs[pair] += 1
        
        return pairs
    
    def _merge_vocab(self, pair, word_tokens):
        """Merge the most frequent pair in vocabulary"""
        new_word_tokens = []
        
        for word in word_tokens:
            new_word = []
            i = 0
            
            while i < len(word):
                if i < len(word) - 1 and (word[i], word[i + 1]) == pair:
                    # Merge the pair
                    new_word.append(word[i] + word[i + 1])
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            
            new_word_tokens.append(new_word)
        
        return new_word_tokens
    
    def train(self, texts, vocab_size=1000):
        """Train BPE tokenizer starting with graphemes"""
        print("Starting grapheme-based BPE training...")
        
        # Step 1: Extract all words and convert to grapheme sequences
        all_word_tokens = []
        for text in texts:
            word_tokens = self._get_word_tokens(text)
            all_word_tokens.extend(word_tokens)
        
        # Step 2: Initialize vocabulary with all graphemes
        grapheme_freq = Counter()
        for word_tokens in all_word_tokens:
            for word in word_tokens:
                for token in word:
                    grapheme_freq[token] += 1
        
        # Create initial vocabulary
        self.vocab = dict(grapheme_freq)
        
        print(f"Initial vocabulary size: {len(self.vocab)}")
        print(f"Sample graphemes: {list(self.vocab.keys())[:20]}")
        
        # Step 3: BPE merging process
        for i in range(vocab_size - len(self.vocab)):
            pairs = self._get_pairs(all_word_tokens)
            
            if not pairs:
                break
                
            # Find most frequent pair
            best_pair = max(pairs, key=pairs.get)
            
            # Merge the pair
            all_word_tokens = self._merge_vocab(best_pair, all_word_tokens)
            
            # Update vocabulary
            merged_token = best_pair[0] + best_pair[1]
            self.vocab[merged_token] = pairs[best_pair]
            self.merges.append(best_pair)
            
            if i % 100 == 0:
                print(f"Merge {i}: {best_pair} -> {merged_token} (freq: {pairs[best_pair]})")
        
        # Create token mappings
        self.grapheme_to_id = {token: i for i, token in enumerate(self.vocab.keys())}
        self.id_to_grapheme = {i: token for token, i in self.grapheme_to_id.items()}
        
        print(f"Final vocabulary size: {len(self.vocab)}")
        print("Training completed!")
    
    def encode(self, text):
        """Encode text using trained BPE model"""
        word_tokens = self._get_word_tokens(text)
        
        # Apply merges
        for pair in self.merges:
            word_tokens = self._merge_vocab(pair, word_tokens)
        
        # Convert to IDs
        token_ids = []
        for word in word_tokens:
            for token in word:
                if token in self.grapheme_to_id:
                    token_ids.append(self.grapheme_to_id[token])
                else:
                    # Handle unknown tokens - you might want to use UNK token
                    pass
        
        return token_ids
    
    def decode(self, token_ids):
        """Decode token IDs back to text"""
        tokens = []
        for token_id in token_ids:
            if token_id in self.id_to_grapheme:
                tokens.append(self.id_to_grapheme[token_id])
        
        # Join tokens and remove end-of-word markers
        text = ''.join(tokens).replace('</w>', ' ')
        return text.strip()
    
    def save(self, filepath):
        """Save trained model"""
        model_data = {
            'vocab': self.vocab,
            'merges': self.merges,
            'grapheme_to_id': self.grapheme_to_id,
            'id_to_grapheme': self.id_to_grapheme
        }
        
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(model_data, f, ensure_ascii=False, indent=2)
    
    def load(self, filepath):
        """Load trained model"""
        with open(filepath, 'r', encoding='utf-8') as f:
            model_data = json.load(f)
        
        self.vocab = model_data['vocab']
        self.merges = [tuple(merge) for merge in model_data['merges']]
        self.grapheme_to_id = model_data['grapheme_to_id']
        self.id_to_grapheme = {int(k): v for k, v in model_data['id_to_grapheme'].items()}




### 2. Load dataset

In [3]:
from datasets import Dataset, load_dataset
from itertools import islice

  from .autonotebook import tqdm as notebook_tqdm


In [20]:

ds = load_dataset("ai4bharat/samanantar", "ta", split="train", streaming=True)

subset = list(islice(ds, 10000))
subset_ds = Dataset.from_list(subset)

print(subset_ds)
print(subset_ds[0])

# Extract Tamil texts (target side)
tamil_texts = [item['tgt'] for item in subset_ds]

# Split into train (8K) and eval (2K) sets
train_texts_ta = tamil_texts[:8000]
eval_texts_ta = tamil_texts[8000:10000]


Dataset({
    features: ['idx', 'src', 'tgt'],
    num_rows: 10000
})
{'idx': 0, 'src': 'Some 14 months later, the second calf is born.', 'tgt': 'சுமார் 14 மாதங்கள் கழித்து, இரண்டாம் கன்றை ஈனுகிறது.'}


### 3. Implement calculating compression ratio function

In [21]:
def calculate_compression_ratio(tokenizer, texts):
    """Calculate compression ratio for a list of texts"""
    total_chars = 0
    total_tokens = 0
    
    for text in texts:
        # Count characters (including spaces)
        char_count = len(text)
        
        # Count tokens
        token_ids = tokenizer.encode(text)
        token_count = len(token_ids)
        
        total_chars += char_count
        total_tokens += token_count
    
    # Compression ratio = original_size / compressed_size
    compression_ratio = total_chars / total_tokens if total_tokens > 0 else 0
    
    return {
        'total_characters': total_chars,
        'total_tokens': total_tokens,
        'compression_ratio': compression_ratio
    }

### 4. function for train Graphene-pair encoding tokernizer

In [22]:

def train_grapheme_bpe(train_texts, lang, vocab_size=1000):
    """
    Train GraphemeBPE tokenizer from your dataset
    
    Args:
        dataset: Your dataset with structure {'idx': int, 'src': str, 'tgt': str}
        vocab_size: Desired vocabulary size
    """
    
    print(f"Training on {len(train_texts)} {lang} texts")
    
    # Show sample texts and their graphemes
    print(f"\nSample training text: {train_texts[0]}")
    sample_graphemes = list(grapheme.graphemes(train_texts[0]))
    print(f"Graphemes: {sample_graphemes}")
    print(f"Number of graphemes: {len(sample_graphemes)}")
    
    # Initialize and train tokenizer
    tokenizer = GraphemeBPETokenizer()
    print(f"\nTraining tokenizer with vocab_size={vocab_size}...")
    tokenizer.train(train_texts, vocab_size=vocab_size)
    
    # Save model
    model_path = f"{lang}_grapheme_bpe.json"
    tokenizer.save(model_path)
    print(f"\nModel saved to: {model_path}")
    
    # Show vocabulary statistics
    print(f"\nVocabulary Statistics:")
    print(f"Total vocabulary size: {len(tokenizer.vocab)}")
    print(f"Number of merges performed: {len(tokenizer.merges)}")
    
    return tokenizer




### 5. function for train Byte-pair encoding tokernizer

In [23]:
def train_hf_bpe_tokenizer(train_texts, lang, vocab_size=1000):
    from tokenizers import Tokenizer
    from tokenizers.models import BPE
    from tokenizers.trainers import BpeTrainer
    from tokenizers.pre_tokenizers import Whitespace

    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()

    trainer = BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
    )

    tokenizer.train_from_iterator(train_texts, trainer)

    tokenizer.save(f"{lang}_hf_bpe.json")
    print("\nHuggingFace BPE tokenizer saved.")
    
    return tokenizer

### 6. Train Graphene-pair encoding tokernizer

In [24]:
# Train GPE tokenizer
gpe_tokenizer_ta = train_grapheme_bpe(train_texts_ta, "tamil", vocab_size=1000)

# Evaluate GPE tokenizer
gpe_stats_ta = calculate_compression_ratio(gpe_tokenizer_ta, eval_texts_ta)
print("\n🔹 GPE Compression Ratio:", gpe_stats_ta['compression_ratio'])


Training on 8000 tamil texts

Sample training text: சுமார் 14 மாதங்கள் கழித்து, இரண்டாம் கன்றை ஈனுகிறது.
Graphemes: ['சு', 'மா', 'ர்', ' ', '1', '4', ' ', 'மா', 'த', 'ங்', 'க', 'ள்', ' ', 'க', 'ழி', 'த்', 'து', ',', ' ', 'இ', 'ர', 'ண்', 'டா', 'ம்', ' ', 'க', 'ன்', 'றை', ' ', 'ஈ', 'னு', 'கி', 'ற', 'து', '.']
Number of graphemes: 35

Training tokenizer with vocab_size=1000...
Starting grapheme-based BPE training...
Initial vocabulary size: 188
Sample graphemes: ['ச', 'ு', 'ம', 'ா', 'ர', '்', '<', '/', 'w', '>', ' ', '1', '4', 'த', 'ங', 'க', 'ள', 'ழ', 'ி', ',']
Merge 0: ('ம்', '</w>') -> ம்</w> (freq: 6861)
Merge 100: ('0', '</w>') -> 0</w> (freq: 368)
Merge 200: ('செய்', 'ய') -> செய்ய (freq: 209)
Merge 300: ('வி', 'ரு') -> விரு (freq: 140)
Merge 400: ('யெ', 'கோவா') -> யெகோவா (freq: 111)
Merge 500: ('க்', 'கூ') -> க்கூ (freq: 89)
Merge 600: ('ப்பை', '</w>') -> ப்பை</w> (freq: 75)
Merge 700: ('ளா', 'ல்</w>') -> ளால்</w> (freq: 62)
Merge 800: ('வ', 'ங்கி') -> வங்கி (freq: 55)
Final vocabula

### 7. Train Byte-pair encoding tokernizer

In [25]:
# Train HF BPE tokenizer
hf_tokenizer_ta = train_hf_bpe_tokenizer(train_texts_ta, "tamil", vocab_size=1000)

# Evaluate HF tokenizer
hf_stats_ta = calculate_compression_ratio(hf_tokenizer_ta, eval_texts_ta)
print("🔹 HF BPE Compression Ratio:", hf_stats_ta['compression_ratio'])


HuggingFace BPE tokenizer saved.
🔹 HF BPE Compression Ratio: 2.9452891717743204


### 8. Overview of comparission

In [26]:
# ---- Final Comparison ----
print("\n📊 Compression Ratio Comparison")
print(f"GPE Tokenizer:      {gpe_stats_ta['compression_ratio']:.2f}")
print(f"HuggingFace BPE:    {hf_stats_ta['compression_ratio']:.2f}")


📊 Compression Ratio Comparison
GPE Tokenizer:      2.92
HuggingFace BPE:    2.95


In [11]:
# Load Sinhala Flores+ dataset
sin_ds = load_dataset("openlanguagedata/flores_plus", "sin_Sinh")
train_texts_si = [item['text'] for item in sin_ds['dev']]
eval_texts_si = [item['text'] for item in sin_ds['devtest']]

In [27]:
# Train GPE tokenizer
gpe_tokenizer_si = train_grapheme_bpe(train_texts_si, "sinhala", vocab_size=1000)

# Evaluate GPE tokenizer
gpe_stats_si = calculate_compression_ratio(gpe_tokenizer_si, eval_texts_si)
print("\n🔹 GPE Compression Ratio:", gpe_stats_si['compression_ratio'])


Training on 997 sinhala texts

Sample training text: සඳුදා දින, ස්ටැන්ෆර්ඩ් සරසවි වෛද්‍ය පාසලේ විද්‍යාඥයෝ, සෛලයක ආකාරය අනුව එය වර්ග කළ හැකි නව දෝෂ නිර්ණය කිරීමේ මෙවලමක්: එනම්, එකක් ඇමරිකානු ඩොලර් ශතයක පමණ මුදලකින් නිෂ්පාදනය කළ හැකි සාමාන්‍ය ඉන්ක්ජෙට් මුද්‍රණ යන්ත්‍රයකින් මුද්‍රණය කළ හැකි ඉතා කුඩා චිපයක් සොයාගත් බවට නිවේදනය කළහ.
Graphemes: ['ස', 'ඳු', 'දා', ' ', 'දි', 'න', ',', ' ', 'ස්', 'ටැ', 'න්', 'ෆ', 'ර්', 'ඩ්', ' ', 'ස', 'ර', 'ස', 'වි', ' ', 'වෛ', 'ද්\u200d', 'ය', ' ', 'පා', 'ස', 'ලේ', ' ', 'වි', 'ද්\u200d', 'යා', 'ඥ', 'යෝ', ',', ' ', 'සෛ', 'ල', 'ය', 'ක', ' ', 'ආ', 'කා', 'ර', 'ය', ' ', 'අ', 'නු', 'ව', ' ', 'එ', 'ය', ' ', 'ව', 'ර්', 'ග', ' ', 'ක', 'ළ', ' ', 'හැ', 'කි', ' ', 'න', 'ව', ' ', 'දෝ', 'ෂ', ' ', 'නි', 'ර්', 'ණ', 'ය', ' ', 'කි', 'රී', 'මේ', ' ', 'මෙ', 'ව', 'ල', 'ම', 'ක්', ':', ' ', 'එ', 'න', 'ම්', ',', ' ', 'එ', 'ක', 'ක්', ' ', 'ඇ', 'ම', 'රි', 'කා', 'නු', ' ', 'ඩො', 'ල', 'ර්', ' ', 'ශ', 'ත', 'ය', 'ක', ' ', 'ප', 'ම', 'ණ', ' ', 'මු', 'ද', 'ල', 'කි', 'න්', ' ', 'නි', 'ෂ්', 'පා

In [28]:
# Train HF BPE tokenizer
hf_tokenizer_si = train_hf_bpe_tokenizer(train_texts_si, "sinhala", vocab_size=1000)

# Evaluate HF tokenizer
hf_stats_si = calculate_compression_ratio(hf_tokenizer_si, eval_texts_si)
print("🔹 HF BPE Compression Ratio:", hf_stats_si['compression_ratio'])


HuggingFace BPE tokenizer saved.
🔹 HF BPE Compression Ratio: 2.6817242578612492


In [29]:
# ---- Final Comparison ----
print("\n📊 Compression Ratio Comparison")
print(f"GPE Tokenizer:      {gpe_stats_si['compression_ratio']:.2f}")
print(f"HuggingFace BPE:    {hf_stats_si['compression_ratio']:.2f}")


📊 Compression Ratio Comparison
GPE Tokenizer:      2.44
HuggingFace BPE:    2.68
