### 1. Implement grapheme-pair encoding tokenizer

In [None]:
import grapheme
from collections import defaultdict, Counter
import json
import re

class GraphemeBPETokenizer:
    """BPE-style tokenizer whose base symbols are graphemes, not bytes.

    Text is split into alternating runs of non-whitespace ("words") and
    whitespace. Each word becomes a list of graphemes followed by the
    end-of-word marker; each whitespace run is kept as one token. Training
    repeatedly merges the most frequent adjacent token pair inside words.

    Attributes:
        vocab: Maps token string -> frequency recorded during training.
        merges: Learned merges as (left, right) tuples, in training order.
        grapheme_to_id: Maps token string -> integer id.
        id_to_grapheme: Maps integer id -> token string.
    """

    # Appended to every non-whitespace word so merges can tell a word-final
    # token from the same symbols appearing in the middle of a word.
    END_OF_WORD = '</w>'

    def __init__(self):
        self.grapheme_to_id = {}
        self.id_to_grapheme = {}
        self.merges = []
        self.vocab = {}

    def _get_graphemes(self, text):
        """Return the graphemes of ``text`` as a list (via the grapheme library)."""
        return list(grapheme.graphemes(text))

    def _get_word_tokens(self, text):
        """Split ``text`` into per-word token lists.

        Returns:
            A list with one entry per regex match of ``\\S+|\\s+``:
            ``[run]`` for a whitespace run, or the word's graphemes followed
            by the end-of-word marker for a non-whitespace run.
        """
        # Simple word splitting - you might want to improve this for Sinhala
        word_tokens = []
        for chunk in re.findall(r'\S+|\s+', text):
            if chunk.isspace():
                # Whitespace is preserved verbatim as a single token.
                word_tokens.append([chunk])
            else:
                word_tokens.append(self._get_graphemes(chunk) + [self.END_OF_WORD])
        return word_tokens

    def _get_pairs(self, word_tokens):
        """Count adjacent token pairs within each word.

        Pairs never span two words. Returns a ``defaultdict(int)`` mapping
        ``(left, right)`` -> number of occurrences.
        """
        pairs = defaultdict(int)
        for word in word_tokens:
            for pair in zip(word, word[1:]):
                pairs[pair] += 1
        return pairs

    def _merge_vocab(self, pair, word_tokens):
        """Return a copy of ``word_tokens`` with ``pair`` merged everywhere.

        Each word is scanned left to right and every non-overlapping
        occurrence of ``pair`` is replaced by the concatenated token.
        The input lists are not modified.
        """
        new_word_tokens = []
        for word in word_tokens:
            new_word = []
            last = len(word) - 1
            i = 0
            while i < len(word):
                if i < last and (word[i], word[i + 1]) == pair:
                    new_word.append(word[i] + word[i + 1])
                    i += 2  # skip both halves of the merged pair
                else:
                    new_word.append(word[i])
                    i += 1
            new_word_tokens.append(new_word)
        return new_word_tokens

    def train(self, texts, vocab_size=1000):
        """Train the tokenizer on ``texts``, replacing any previous model.

        Args:
            texts: Iterable of strings to learn from.
            vocab_size: Target vocabulary size. Merging stops earlier if no
                adjacent pairs remain; no merges happen if the base
                vocabulary already reaches this size.
        """
        print("Starting grapheme-based BPE training...")

        # Retraining must not keep merges learned from an earlier corpus.
        self.merges = []

        # Step 1: Extract all words and convert to grapheme sequences
        all_word_tokens = []
        for text in texts:
            all_word_tokens.extend(self._get_word_tokens(text))

        # Step 2: Initialize vocabulary with all base tokens. Each entry of
        # all_word_tokens is already a list of tokens, so iterate two levels
        # only; a third level would split graphemes (and the end-of-word
        # marker) into individual code points.
        grapheme_freq = Counter(
            token for word in all_word_tokens for token in word
        )
        self.vocab = dict(grapheme_freq)

        print(f"Initial vocabulary size: {len(self.vocab)}")
        print(f"Sample graphemes: {list(self.vocab.keys())[:20]}")

        # Step 3: BPE merging process. Loop on the live vocabulary size so a
        # merge that reproduces an existing token string does not use up the
        # budget.
        merge_index = 0
        while len(self.vocab) < vocab_size:
            pairs = self._get_pairs(all_word_tokens)
            if not pairs:
                break

            # Most frequent pair; ties go to the pair encountered first.
            best_pair = max(pairs, key=pairs.get)
            best_freq = pairs[best_pair]

            all_word_tokens = self._merge_vocab(best_pair, all_word_tokens)

            merged_token = best_pair[0] + best_pair[1]
            self.vocab[merged_token] = best_freq
            self.merges.append(best_pair)

            if merge_index % 100 == 0:
                print(f"Merge {merge_index}: {best_pair} -> {merged_token} (freq: {best_freq})")
            merge_index += 1

        # Create token mappings (ids follow vocabulary insertion order)
        self.grapheme_to_id = {token: i for i, token in enumerate(self.vocab)}
        self.id_to_grapheme = {i: token for token, i in self.grapheme_to_id.items()}

        print(f"Final vocabulary size: {len(self.vocab)}")
        print("Training completed!")

    def encode(self, text):
        """Encode ``text`` into a list of token ids.

        The learned merges are applied in training order. Tokens that are
        not in the vocabulary are skipped (there is no UNK token), so the
        result can be lossy for text containing unseen symbols.
        """
        word_tokens = self._get_word_tokens(text)

        for pair in self.merges:
            word_tokens = self._merge_vocab(pair, word_tokens)

        token_ids = []
        for word in word_tokens:
            for token in word:
                token_id = self.grapheme_to_id.get(token)
                if token_id is not None:
                    token_ids.append(token_id)
        return token_ids

    def decode(self, token_ids):
        """Decode token ids back to text.

        End-of-word markers are removed. Whitespace comes from the
        whitespace tokens themselves, so text whose tokens are all known
        round-trips exactly. If a word-final token is followed directly by
        a non-whitespace token (the separator was not encoded), a single
        space is inserted so words do not run together. Unknown ids are
        ignored.
        """
        marker = self.END_OF_WORD
        pieces = []
        word_just_ended = False
        for token_id in token_ids:
            token = self.id_to_grapheme.get(token_id)
            if token is None:
                continue
            if word_just_ended and not token.isspace():
                pieces.append(' ')
            word_just_ended = token.endswith(marker)
            pieces.append(token[:-len(marker)] if word_just_ended else token)
        return ''.join(pieces)

    def save(self, filepath):
        """Write the trained model to ``filepath`` as UTF-8 JSON.

        JSON stores the integer keys of ``id_to_grapheme`` as strings and
        the merge tuples as lists; ``load`` converts both back.
        """
        model_data = {
            'vocab': self.vocab,
            'merges': self.merges,
            'grapheme_to_id': self.grapheme_to_id,
            'id_to_grapheme': self.id_to_grapheme
        }

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(model_data, f, ensure_ascii=False, indent=2)

    def load(self, filepath):
        """Load a model previously written by ``save`` from ``filepath``."""
        with open(filepath, 'r', encoding='utf-8') as f:
            model_data = json.load(f)

        self.vocab = model_data['vocab']
        # Restore the types JSON cannot represent: tuples and int keys.
        self.merges = [tuple(merge) for merge in model_data['merges']]
        self.grapheme_to_id = model_data['grapheme_to_id']
        self.id_to_grapheme = {int(k): v for k, v in model_data['id_to_grapheme'].items()}




### 2. Load dataset

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['idx', 'src', 'tgt'],
    num_rows: 10000
})
{'idx': 0, 'src': 'Some 14 months later, the second calf is born.', 'tgt': '‡Æö‡ØÅ‡ÆÆ‡Ææ‡Æ∞‡Øç 14 ‡ÆÆ‡Ææ‡Æ§‡Æô‡Øç‡Æï‡Æ≥‡Øç ‡Æï‡Æ¥‡Æø‡Æ§‡Øç‡Æ§‡ØÅ, ‡Æá‡Æ∞‡Æ£‡Øç‡Æü‡Ææ‡ÆÆ‡Øç ‡Æï‡Æ©‡Øç‡Æ±‡Øà ‡Æà‡Æ©‡ØÅ‡Æï‡Æø‡Æ±‡Æ§‡ØÅ.'}


### 3. Implement the compression-ratio calculation function

### 4. Function for training the grapheme-pair encoding tokenizer

### 5. Function for training the byte-pair encoding tokenizer

### 6. Train the grapheme-pair encoding tokenizer

Training on 8000 tamil texts

Sample training text: ‡Æö‡ØÅ‡ÆÆ‡Ææ‡Æ∞‡Øç 14 ‡ÆÆ‡Ææ‡Æ§‡Æô‡Øç‡Æï‡Æ≥‡Øç ‡Æï‡Æ¥‡Æø‡Æ§‡Øç‡Æ§‡ØÅ, ‡Æá‡Æ∞‡Æ£‡Øç‡Æü‡Ææ‡ÆÆ‡Øç ‡Æï‡Æ©‡Øç‡Æ±‡Øà ‡Æà‡Æ©‡ØÅ‡Æï‡Æø‡Æ±‡Æ§‡ØÅ.
Graphemes: ['‡Æö‡ØÅ', '‡ÆÆ‡Ææ', '‡Æ∞‡Øç', ' ', '1', '4', ' ', '‡ÆÆ‡Ææ', '‡Æ§', '‡Æô‡Øç', '‡Æï', '‡Æ≥‡Øç', ' ', '‡Æï', '‡Æ¥‡Æø', '‡Æ§‡Øç', '‡Æ§‡ØÅ', ',', ' ', '‡Æá', '‡Æ∞', '‡Æ£‡Øç', '‡Æü‡Ææ', '‡ÆÆ‡Øç', ' ', '‡Æï', '‡Æ©‡Øç', '‡Æ±‡Øà', ' ', '‡Æà', '‡Æ©‡ØÅ', '‡Æï‡Æø', '‡Æ±', '‡Æ§‡ØÅ', '.']
Number of graphemes: 35

Training tokenizer with vocab_size=1000...
Starting grapheme-based BPE training...
Initial vocabulary size: 188
Sample graphemes: ['‡Æö', '‡ØÅ', '‡ÆÆ', '‡Ææ', '‡Æ∞', '‡Øç', '<', '/', 'w', '>', ' ', '1', '4', '‡Æ§', '‡Æô', '‡Æï', '‡Æ≥', '‡Æ¥', '‡Æø', ',']
Merge 0: ('‡ÆÆ‡Øç', '</w>') -> ‡ÆÆ‡Øç</w> (freq: 6861)
Merge 100: ('0', '</w>') -> 0</w> (freq: 368)
Merge 200: ('‡Æö‡ØÜ‡ÆØ‡Øç', '‡ÆØ') -> ‡Æö‡ØÜ‡ÆØ‡Øç‡ÆØ (freq: 209)
Merge 300: ('‡Æµ‡Æø', '‡Æ∞‡ØÅ') -> ‡Æµ‡Æø‡Æ∞‡ØÅ (freq: 140)
Merge 400: ('‡ÆØ

### 7. Train the byte-pair encoding tokenizer


HuggingFace BPE tokenizer saved.
🔹 HF BPE Compression Ratio: 2.9452891717743204


### 8. Overview of the comparison


📊 Compression Ratio Comparison
GPE Tokenizer:      2.92
HuggingFace BPE:    2.95


Training on 997 sinhala texts

Sample training text: ‡∑É‡∂≥‡∑î‡∂Ø‡∑è ‡∂Ø‡∑í‡∂±, ‡∑É‡∑ä‡∂ß‡∑ê‡∂±‡∑ä‡∑Ü‡∂ª‡∑ä‡∂©‡∑ä ‡∑É‡∂ª‡∑É‡∑Ä‡∑í ‡∑Ä‡∑õ‡∂Ø‡∑ä‚Äç‡∂∫ ‡∂¥‡∑è‡∑É‡∂Ω‡∑ö ‡∑Ä‡∑í‡∂Ø‡∑ä‚Äç‡∂∫‡∑è‡∂•‡∂∫‡∑ù, ‡∑É‡∑õ‡∂Ω‡∂∫‡∂ö ‡∂Ü‡∂ö‡∑è‡∂ª‡∂∫ ‡∂Ö‡∂±‡∑î‡∑Ä ‡∂ë‡∂∫ ‡∑Ä‡∂ª‡∑ä‡∂ú ‡∂ö‡∑Ö ‡∑Ñ‡∑ê‡∂ö‡∑í ‡∂±‡∑Ä ‡∂Ø‡∑ù‡∑Ç ‡∂±‡∑í‡∂ª‡∑ä‡∂´‡∂∫ ‡∂ö‡∑í‡∂ª‡∑ì‡∂∏‡∑ö ‡∂∏‡∑ô‡∑Ä‡∂Ω‡∂∏‡∂ö‡∑ä: ‡∂ë‡∂±‡∂∏‡∑ä, ‡∂ë‡∂ö‡∂ö‡∑ä ‡∂á‡∂∏‡∂ª‡∑í‡∂ö‡∑è‡∂±‡∑î ‡∂©‡∑ú‡∂Ω‡∂ª‡∑ä ‡∑Å‡∂≠‡∂∫‡∂ö ‡∂¥‡∂∏‡∂´ ‡∂∏‡∑î‡∂Ø‡∂Ω‡∂ö‡∑í‡∂±‡∑ä ‡∂±‡∑í‡∑Ç‡∑ä‡∂¥‡∑è‡∂Ø‡∂±‡∂∫ ‡∂ö‡∑Ö ‡∑Ñ‡∑ê‡∂ö‡∑í ‡∑É‡∑è‡∂∏‡∑è‡∂±‡∑ä‚Äç‡∂∫ ‡∂â‡∂±‡∑ä‡∂ö‡∑ä‡∂¢‡∑ô‡∂ß‡∑ä ‡∂∏‡∑î‡∂Ø‡∑ä‚Äç‡∂ª‡∂´ ‡∂∫‡∂±‡∑ä‡∂≠‡∑ä‚Äç‡∂ª‡∂∫‡∂ö‡∑í‡∂±‡∑ä ‡∂∏‡∑î‡∂Ø‡∑ä‚Äç‡∂ª‡∂´‡∂∫ ‡∂ö‡∑Ö ‡∑Ñ‡∑ê‡∂ö‡∑í ‡∂â‡∂≠‡∑è ‡∂ö‡∑î‡∂©‡∑è ‡∂†‡∑í‡∂¥‡∂∫‡∂ö‡∑ä ‡∑É‡∑ú‡∂∫‡∑è‡∂ú‡∂≠‡∑ä ‡∂∂‡∑Ä‡∂ß ‡∂±‡∑í‡∑Ä‡∑ö‡∂Ø‡∂±‡∂∫ ‡∂ö‡∑Ö‡∑Ñ.
Graphemes: ['‡∑É', '‡∂≥‡∑î', '‡∂Ø‡∑è', ' ', '‡∂Ø‡∑í', '‡∂±', ',', ' ', '‡∑É‡∑ä', '‡∂ß‡∑ê', '‡∂±‡∑ä', '‡∑Ü', '‡∂ª‡∑ä', '‡∂©‡∑ä', ' ', '‡∑É', '‡∂ª', '‡∑É', '‡∑Ä‡∑í', ' ', '‡∑Ä‡∑õ', '‡∂Ø‡∑ä\u200d', '‡∂∫', ' ', '‡∂¥‡


HuggingFace BPE tokenizer saved.
🔹 HF BPE Compression Ratio: 2.6817242578612492



📊 Compression Ratio Comparison
GPE Tokenizer:      2.44
HuggingFace BPE:    2.68
