# Enhanced Byte Pair Encoding for Odia Text

## Input Data Structure
```
project_root/
├── odia_bpe_tokenizer_enhanced.ipynb
├── data/
│   └── odia/
│       ├── file1.txt
│       └── file2.txt
```

In [8]:
import os
import glob
import regex as re
from collections import defaultdict, Counter
import numpy as np
from typing import List, Dict, Tuple, Optional
from IPython.display import HTML, display

# Define Odia patterns
ODIA_PATTERNS = {
    'basic': re.compile(r"""
        \s*[\u0B00-\u0B7F]+  # Basic Odia characters
        |\s*[\p{N}]+         # Numbers
        |\s*[^\s\p{L}\p{N}]+ # Punctuation and symbols
        |\s+                 # Whitespace
        """, re.VERBOSE),
    
    'linguistic': re.compile(r"""
        # Consonant clusters with virama
        \s*[\u0B15-\u0B39]\u0B4D[\u0B15-\u0B39]+  
        # CV combinations
        |\s*[\u0B15-\u0B39][\u0B3E-\u0B4C]?      
        # Independent vowels
        |\s*[\u0B05-\u0B14]                       
        # Numbers and punctuation
        |\s*[\p{N}]+                              
        |\s*[^\s\p{L}\p{N}]+
        |\s+
        """, re.VERBOSE)
}

# Fix for CSS display
display(HTML('''
<style>
    pre {
        white-space: pre-wrap;
    }
</style>
'''))

In [9]:
class CompressedOdiaTokenizer:
    def __init__(self, 
                 max_vocab_size: int = 16000,
                 target_compression: float = 4.0,
                 max_token_length: int = 24,
                 pattern_type: str = 'linguistic',
                 max_tokens_per_sequence: Optional[int] = None):
        """
        Initialize the Odia BPE tokenizer
        
        Args:
            max_vocab_size: Maximum vocabulary size (default 16000)
            target_compression: Target compression ratio (default 4.0)
            max_token_length: Maximum token length in characters (default 24)
            pattern_type: Type of tokenization pattern ('basic', 'detailed', or 'linguistic')
            max_tokens_per_sequence: Maximum tokens per sequence (default None)
        """
        self.max_vocab_size = max_vocab_size
        self.target_compression = target_compression
        self.max_token_length = max_token_length
        self.pattern = ODIA_PATTERNS.get(pattern_type, ODIA_PATTERNS['basic'])
        
        # Special tokens
        self.special_tokens = {
            '<UNK>': 0,  # Unknown token
            '<S>': 1,    # Start of text
            '</S>': 2    # End of text
        }
        
        # Vocabulary mappings
        self.stoi: Dict[str, int] = {}  # String to index
        self.itos: Dict[int, str] = {}  # Index to string
        self.merges: Dict[Tuple[int, int], int] = {}  # Merge rules
        
    def _is_odia_char(self, char: str) -> bool:
        """Check if character is in Odia Unicode range"""
        return '\u0B00' <= char <= '\u0B7F'
    
    def _calculate_compression(self, text: str, tokens: List[int]) -> float:
        """
        Calculate compression ratio
        
        Args:
            text: Original text
            tokens: List of token indices
            
        Returns:
            Compression ratio (original size / tokenized size)
        """
        original_size = len(text.encode('utf-8'))
        bits_per_token = np.ceil(np.log2(len(self.stoi)))
        tokenized_size = len(tokens) * np.ceil(bits_per_token / 8)  # Convert bits to bytes
        return original_size / tokenized_size

    def _get_merge_score(self, pair: Tuple[int, int], freq: int, text_len: int) -> float:
        """Enhanced scoring with more aggressive merging"""
        def get_str(p):
            if isinstance(p, int):
                token = self.itos[p]
                if isinstance(token, tuple):
                    return ''.join(get_str(t) for t in token)
                return token
            return str(p)
        
        token_str = ''.join(get_str(p) for p in pair)
        token_len = len(token_str)
        
        # More relaxed length constraint
        if token_len > self.max_token_length:
            return 0.0
            
        # Enhanced scoring system with higher weights
        length_bonus = np.log2(token_len + 1) * 2.0  # Doubled length bonus
        freq_score = (freq / text_len) * 1.5         # Increased frequency weight
        
        # Enhanced linguistic bonuses
        is_odia_cluster = any(self._is_odia_char(c) for c in token_str)
        has_conjunct = '\u0B4D' in token_str
        has_matra = any(c in token_str for c in '\u0B3E\u0B3F\u0B40\u0B41\u0B42\u0B43\u0B47\u0B48\u0B4B\u0B4C')
        
        # More aggressive linguistic scoring
        linguistic_score = 1.0
        if is_odia_cluster:
            linguistic_score *= 2.0        # Increased from 1.5
        if has_conjunct:
            linguistic_score *= 1.8        # Increased from 1.3
        if has_matra:
            linguistic_score *= 1.5        # Increased from 1.2
            
        # Additional bonuses for common patterns
        if self._is_common_word_part(token_str):
            linguistic_score *= 1.5
            
        return freq_score * length_bonus * linguistic_score

    def _is_common_word_part(self, token_str: str) -> bool:
        """Check if token is likely to be a meaningful word part"""
        # Common Odia word endings
        common_endings = [
            'ର', 'ରେ', 'ଟି', 'ଗୁଡ଼ିକ', 'ମାନେ', 'ଙ୍କୁ', 'ଙ୍କ', 'ଙ୍କର',
            'ଟା', 'ଟାରେ', 'ଗୁଡ଼ାକ', 'ମାନଙ୍କ', 'ମାନଙ୍କୁ'
        ]
        
        # Common Odia word beginnings
        common_beginnings = [
            'ପ୍ର', 'ଅନୁ', 'ଅଧି', 'ପରି', 'ଉପ', 'ସମ', 'ବି', 'ନି', 'ସୁ',
            'ଆ', 'ଇ', 'ଉ', 'ଏ', 'ଓ'
        ]
        
        return (any(token_str.endswith(end) for end in common_endings) or
                any(token_str.startswith(begin) for begin in common_beginnings))

    def _get_stats(self, ids: List[int]) -> Dict[Tuple[int, int], int]:
        """Count token pair frequencies"""
        stats = defaultdict(int)
        for pair in zip(ids, ids[1:]):
            stats[pair] += 1
        return stats

    def _merge(self, ids: List[int], pair: Tuple[int, int], idx: int) -> List[int]:
        """
        Merge all occurrences of pair into a new token
        
        Args:
            ids: List of token indices
            pair: Pair to merge
            idx: New token index
            
        Returns:
            Updated list of token indices
        """
        newids = []
        i = 0
        while i < len(ids):
            if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
                newids.append(idx)
                i += 2
            else:
                newids.append(ids[i])
                i += 1
        return newids

    def pre_tokenize(self, text: str) -> List[str]:
        """
        Pre-tokenize text using regex pattern
        
        Args:
            text: Input text
            
        Returns:
            List of pre-tokenized strings
        """
        tokens = self.pattern.findall(text)
        return [t.strip() for t in tokens if t.strip()]

    def train(self, text: str) -> float:
        """Modified train method with more aggressive merging"""
        # Pre-tokenize with linguistic patterns
        pre_tokens = self.pre_tokenize(text)
        processed_text = ' '.join(pre_tokens)
        
        # Initialize with more combinations
        chars = sorted(list(set(processed_text)))
        initial_tokens = chars.copy()
        
        # Add more initial combinations
        for i, char in enumerate(chars):
            if self._is_odia_char(char):
                # Add all possible vowel mark combinations
                for matra in '\u0B3E\u0B3F\u0B40\u0B41\u0B42\u0B43\u0B47\u0B48\u0B4B\u0B4C':
                    combined = char + matra
                    initial_tokens.append(combined)
                
                # Add more consonant clusters
                for j, next_char in enumerate(chars):
                    if self._is_odia_char(next_char):
                        # Single conjunct
                        cluster = char + '\u0B4D' + next_char
                        initial_tokens.append(cluster)
                        
                        # Double conjuncts (common in Odia)
                        if j < len(chars) - 1:
                            for k, third_char in enumerate(chars[j+1:]):
                                if self._is_odia_char(third_char):
                                    triple = cluster + '\u0B4D' + third_char
                                    if triple in processed_text:
                                        initial_tokens.append(triple)
        
        # Add common word parts to initial vocabulary
        for token in pre_tokens:
            if len(token) <= self.max_token_length:
                initial_tokens.append(token)
        
        # Remove duplicates and initialize vocabulary
        initial_tokens = list(set(initial_tokens))
        vocab_size = len(self.special_tokens) + len(initial_tokens)
        self.stoi = {token: i+len(self.special_tokens) for i, token in enumerate(initial_tokens)}
        self.itos = {i: token for token, i in self.stoi.items()}
        
        # Add special tokens
        for token, idx in self.special_tokens.items():
            self.stoi[token] = idx
            self.itos[idx] = token

        # Continue with more aggressive BPE training
        ids = [self.stoi[ch] for ch in processed_text]
        text_len = len(processed_text)
        
        while vocab_size < self.max_vocab_size:
            # Get pair frequencies
            stats = self._get_stats(ids)
            if not stats:
                break
                
            # Score all pairs
            pair_scores = {
                pair: self._get_merge_score(pair, freq, text_len)
                for pair, freq in stats.items()
            }
            
            if not pair_scores:
                break
                
            # Select best pair
            best_pair = max(pair_scores.items(), key=lambda x: x[1])[0]
            
            # Merge tokens
            new_token_idx = len(self.stoi)
            ids = self._merge(ids, best_pair, new_token_idx)
            
            # Update vocabularies
            self.stoi[best_pair] = new_token_idx
            self.itos[new_token_idx] = best_pair
            self.merges[best_pair] = new_token_idx
            
            vocab_size += 1
            
            # Check compression ratio
            current_compression = self._calculate_compression(processed_text, ids)
            if current_compression >= self.target_compression:
                break
                
        return self._calculate_compression(processed_text, ids)

    def encode(self, text: str) -> List[int]:
        """Modified encode method with pre-tokenization"""
        if not self.stoi:
            raise ValueError("Tokenizer needs to be trained first")
            
        # Pre-tokenize the text
        pre_tokens = self.pre_tokenize(text)
        processed_text = ' '.join(pre_tokens)
        
        # Start with characters
        ids = []
        for ch in processed_text:
            if ch in self.stoi:
                ids.append(self.stoi[ch])
            else:
                ids.append(self.stoi['<UNK>'])
                
        # Apply merges in order of creation
        changes_made = True
        while changes_made:
            changes_made = False
            i = 0
            while i < len(ids) - 1:
                current_pair = (ids[i], ids[i+1])
                if current_pair in self.merges:
                    # Replace pair with merged token
                    ids[i:i+2] = [self.merges[current_pair]]
                    changes_made = True
                else:
                    i += 1
                
        return ids

    def decode(self, ids: List[int]) -> str:
        """
        Decode token indices back to text
        
        Args:
            ids: List of token indices
            
        Returns:
            Decoded text
        """
        text = ""
        for idx in ids:
            if idx in self.itos:
                token = self.itos[idx]
                if isinstance(token, tuple):
                    # Recursively decode merged pairs
                    text += self.decode([token[0], token[1]])
                else:
                    text += token
            else:
                text += self.itos[self.special_tokens['<UNK>']]
        return text

    def _is_valid_char(self, char: str) -> bool:
        """Check if character is valid for tokenization"""
        # Valid Odia range
        if '\u0B00' <= char <= '\u0B7F':
            return True
        # Common punctuation and whitespace
        if char in {' ', '.', ',', '।', '?', '!', '\n', '\t'}:
            return True
        # Latin characters and numbers
        if char.isalnum():
            return True
        return False

In [10]:
def load_odia_files(file_pattern: str) -> str:
    """Load all Odia text files matching pattern"""
    text = ""
    for filename in glob.glob(file_pattern):
        with open(filename, 'r', encoding='utf-8') as f:
            content = f.read()
            # Clean the text
            cleaned = ''.join(char for char in content 
                            if '\u0B00' <= char <= '\u0B7F'  # Odia characters
                            or char.isspace()  # Whitespace
                            or char in {'.', ',', '।', '?', '!'})  # Punctuation
            text += cleaned + "\n"
    return text

# Load input files
#input_files_pattern = "data/odia/*.txt"
input_files_pattern = "odia_texts/*.txt"
try:
    text = load_odia_files(input_files_pattern)
    print(f"Loaded text from files matching: {input_files_pattern}")
except Exception as e:
    print(f"Error loading files: {e}")
    # Fallback to sample text
    text = """
    ଓଡ଼ିଆ ଭାଷା ଏକ ପ୍ରାଚୀନ ଭାରତୀୟ ଭାଷା।
    ଏହା ଭାରତର ଓଡ଼ିଶା ରାଜ୍ୟର ସରକାରୀ ଭାଷା।
    """
    print("Using sample text instead")

print("\nText preview:")
print(text[:500], "...")

Loaded text from files matching: odia_texts/*.txt

Text preview:
ସାପ କାମୁଡ଼ା ରୋଗୀଙ୍କ ପାଇଁ ଷ୍ଟ୍ରେଚର ଜେଲ୍ ଫେରିଲେ ଅର୍ଚ୍ଚନା ଅନୁପସ୍ଥିତ ୯୬ ଶିକ୍ଷକ ବହିଷ୍କୃତ ହକି ଷ୍ଟାଡିୟମ ବିମାନବନ୍ଦର ଭିତ୍ତିଭୂମି ସମୀକ୍ଷା କଲେ ୫ ଟି ସଚିବ ଖୁବଶୀଘ୍ର ବିମାନ ସେବା ୧୩ରେ ଭାରତ ସ୍ପେନ୍ ମ୍ୟାଚ୍ ଆପ୍ କବ୍ ଜାରେ ଦିଲ୍ଲୀ ଏମ୍ ସିଡି ବିମୁଦ୍ରାୟନ ମାମଲା ସୁପ୍ରିମକୋର୍ଟରେ ରାୟ ସଂରକ୍ଷିତ ରେପୋରେଟ୍ ବୃଦ୍ଧି କଲା ଆର୍ ବିଆଇ ଇଏମ୍ ଆଇ ବଢ଼ିବ ଛତିଶା ନିଯୋଗ ବୈଠକର ନିଷ୍ପତ୍ତି ଶ୍ରୀମନ୍ଦିରରେ ସ୍ମାର୍ଟଫୋନ ନିଷିଦ୍ଧ

ସାପ କାମୁଡ଼ା ରୋଗୀଙ୍କ ପାଇଁ ଷ୍ଟ୍ରେଚର ଜେଲ୍ ଫେରିଲେ ଅର୍ଚ୍ଚନା ଅନୁପସ୍ଥିତ ୯୬ ଶିକ୍ଷକ ବହିଷ୍କୃତ ହକି ଷ୍ଟାଡିୟମ ବିମାନବନ୍ଦର ଭିତ୍ତିଭୂମି ସମୀକ୍ଷା କଲେ ୫ ଟି ସଚିବ ...


In [11]:
# Create and train tokenizer with enhanced parameters
tokenizer = CompressedOdiaTokenizer(
    max_vocab_size=16000,        # Increased from 5000
    target_compression=4.0,      # Increased from 3.2
    max_token_length=24,         # Increased from 12
    pattern_type='linguistic'    # Using linguistic patterns
)

# Train
compression = tokenizer.train(text)
print(f"Achieved compression ratio: {compression:.2f}")

# Test encoding/decoding
tokens = tokenizer.encode(text[:1000])  # Test on first 1000 chars
decoded = tokenizer.decode(tokens)

# Print statistics
print(f"\nVocabulary size: {len(tokenizer.stoi)}")
print(f"Original text length: {len(text)}")
print(f"Number of tokens: {len(tokens)}")

# Calculate token length statistics
token_lengths = [len(str(token)) for token in tokenizer.stoi.keys()]
avg_len = sum(token_lengths) / len(token_lengths)
print(f"\nAverage token length: {avg_len:.2f} characters")
print(f"Longest token length: {max(token_lengths)} characters")

Achieved compression ratio: 4.00

Vocabulary size: 8924
Original text length: 11237339
Number of tokens: 579

Average token length: 4.30 characters
Longest token length: 12 characters


In [12]:
# Analyze vocabulary composition
print("Vocabulary Analysis:")

# Count different token types
odia_tokens = 0
special_tokens = 0
merged_tokens = 0

for token in tokenizer.stoi.keys():
    if isinstance(token, tuple):
        merged_tokens += 1
    elif token in tokenizer.special_tokens:
        special_tokens += 1
    elif any('\u0B00' <= c <= '\u0B7F' for c in str(token)):
        odia_tokens += 1

print(f"\nToken Type Distribution:")
print(f"Odia tokens: {odia_tokens}")
print(f"Merged tokens: {merged_tokens}")
print(f"Special tokens: {special_tokens}")

Vocabulary Analysis:

Token Type Distribution:
Odia tokens: 7663
Merged tokens: 1257
Special tokens: 3
