# Enhanced Byte Pair Encoding for Odia Text

## Input Data Structure
```
project_root/
├── odia_bpe_tokenizer_enhanced.ipynb
├── data/
│   └── odia/
│       ├── file1.txt
│       └── file2.txt
```

In [None]:
import os
import glob
import regex as re
from collections import defaultdict, Counter
import numpy as np
from typing import List, Dict, Tuple, Optional
from IPython.display import HTML, display

# Notebook display tweak: inject a stylesheet so <pre> output blocks use
# `white-space: pre-wrap` (long lines wrap instead of running off the cell).
_PRE_WRAP_STYLE = '''
<style>
    pre {
        white-space: pre-wrap;
    }
</style>
'''
display(HTML(_PRE_WRAP_STYLE))

In [None]:
def load_odia_files(file_pattern: str) -> str:
    """Load and clean every text file matching a glob pattern.

    Matching files are read as UTF-8 in sorted path order. ``glob.glob``
    returns matches in arbitrary order, so sorting keeps the concatenated
    corpus reproducible across runs and platforms.

    Cleaning keeps only characters in the Odia Unicode block
    (U+0B00-U+0B7F), whitespace, and the punctuation marks
    ``. , । ? !``; every other character is dropped.

    Args:
        file_pattern: Glob pattern selecting the input files,
            e.g. ``"data/odia/*.txt"``.

    Returns:
        The cleaned contents of all matching files, each followed by a
        newline. An empty string when the pattern matches no files.

    Raises:
        OSError: If a matching path cannot be opened or read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    # Built once instead of once per character test.
    punctuation = {'.', ',', '।', '?', '!'}

    cleaned_parts = []
    for filename in sorted(glob.glob(file_pattern)):
        with open(filename, 'r', encoding='utf-8') as f:
            content = f.read()
        cleaned_parts.append(''.join(
            char for char in content
            if '\u0B00' <= char <= '\u0B7F'  # Odia characters
            or char.isspace()  # Whitespace
            or char in punctuation  # Punctuation
        ))

    # Single join instead of repeated string concatenation in the loop.
    return ''.join(part + "\n" for part in cleaned_parts)

# Load input files
input_files_pattern = "data/odia/*.txt"
try:
    text = load_odia_files(input_files_pattern)
    # load_odia_files returns an empty string (no exception) when the pattern
    # matches nothing, and only newlines when the files hold no kept
    # characters. Treat both as a load failure so the sample-text fallback
    # below actually runs instead of training on empty input.
    if not text.strip():
        raise FileNotFoundError(
            f"no usable Odia text in files matching {input_files_pattern}"
        )
except (OSError, UnicodeError) as e:
    # OSError covers missing/unreadable files; UnicodeError covers files
    # that are not valid UTF-8.
    print(f"Error loading files: {e}")
    # Fallback to sample text
    text = """
    ଓଡ଼ିଆ ଭାଷା ଏକ ପ୍ରାଚୀନ ଭାରତୀୟ ଭାଷା।
    ଏହା ଭାରତର ଓଡ଼ିଶା ରାଜ୍ୟର ସରକାରୀ ଭାଷା।
    """
    print("Using sample text instead")
else:
    # Only report success when real text was loaded.
    print(f"Loaded text from files matching: {input_files_pattern}")

print("\nText preview:")
print(text[:500], "...")

In [None]:
# Create and train tokenizer with enhanced parameters.
# NOTE(review): CompressedOdiaTokenizer is not defined in the cells shown
# here; it must be defined or imported before this cell runs.
tokenizer = CompressedOdiaTokenizer(
    max_vocab_size=16000,        # Increased from 5000
    target_compression=4.0,      # Increased from 3.2
    max_token_length=24,         # Increased from 12
    pattern_type='linguistic'    # Using linguistic patterns
)

# Train
compression = tokenizer.train(text)
print(f"Achieved compression ratio: {compression:.2f}")

# Test encoding/decoding on a short prefix of the corpus. The prefix is kept
# in a variable so the statistics below can report its length next to the
# token count (the token count refers to this sample, not the whole text).
sample = text[:1000]
tokens = tokenizer.encode(sample)
decoded = tokenizer.decode(tokens)

# Print statistics
print(f"\nVocabulary size: {len(tokenizer.stoi)}")
print(f"Original text length: {len(text)}")
print(f"Encoded sample length: {len(sample)}")
print(f"Number of tokens: {len(tokens)}")
# Report whether decode(encode(sample)) reproduces the sample.
print(f"Round-trip matches sample: {decoded == sample}")

# Calculate token length statistics.
# NOTE(review): len(str(token)) measures the string form of each vocabulary
# key; if keys can be tuples (the vocabulary analysis cell checks for that),
# this counts repr punctuation too -- confirm against the tokenizer.
token_lengths = [len(str(token)) for token in tokenizer.stoi.keys()]
if token_lengths:
    avg_len = sum(token_lengths) / len(token_lengths)
    print(f"\nAverage token length: {avg_len:.2f} characters")
    print(f"Longest token length: {max(token_lengths)} characters")
else:
    # Avoid ZeroDivisionError / max() on an empty vocabulary.
    print("\nVocabulary is empty; no token length statistics available")

In [None]:
# Analyze vocabulary composition
print("Vocabulary Analysis:")

# Tally every vocabulary key under the first category it matches, checked in
# this order: tuple key -> "merged", member of tokenizer.special_tokens ->
# "special", contains a character in U+0B00-U+0B7F -> "odia". Keys matching
# none of these are not counted.
token_kinds = Counter()
for token in tokenizer.stoi.keys():
    if isinstance(token, tuple):
        token_kinds["merged"] += 1
        continue
    if token in tokenizer.special_tokens:
        token_kinds["special"] += 1
        continue
    if any('\u0B00' <= c <= '\u0B7F' for c in str(token)):
        token_kinds["odia"] += 1

# Counter yields 0 for categories that never occurred.
odia_tokens = token_kinds["odia"]
special_tokens = token_kinds["special"]
merged_tokens = token_kinds["merged"]

print("\nToken Type Distribution:")
print(f"Odia tokens: {odia_tokens}")
print(f"Merged tokens: {merged_tokens}")
print(f"Special tokens: {special_tokens}")