# Bengali BPE Tokenizer Training Notebook

Steps:
1. Download Bengali Wikipedia dump
2. Extract with WikiExtractor
3. Normalize & clean
4. Train custom BPE (educational)
5. Train Hugging Face Tokenizers BPE
6. Evaluate compression metrics
7. Save & push to Hugging Face Hub
8. Prepare artifacts for Hugging Face Space

Set `HF_TOKEN` as an environment variable for upload.


In [1]:
import os, sys, subprocess, json, math, shutil, tarfile, bz2, re
from pathlib import Path
import requests
from tqdm import tqdm

BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / 'data'
RAW_DIR = DATA_DIR / 'raw'
EXTRACT_DIR = DATA_DIR / 'extracted'
CLEAN_TEXT = DATA_DIR / 'clean_corpus.txt'
MODEL_DIR = BASE_DIR / 'model'
MODEL_DIR.mkdir(exist_ok=True, parents=True)
for d in [DATA_DIR, RAW_DIR, EXTRACT_DIR]:
    d.mkdir(exist_ok=True, parents=True)

print('Directories ready.')

Directories ready.


## 1. Download Bengali Wikipedia Dump


In [None]:
dump_url = 'https://dumps.wikimedia.org/bnwiki/latest/bnwiki-latest-pages-articles.xml.bz2'
dump_path = RAW_DIR / 'bnwiki-latest-pages-articles.xml.bz2'
if not dump_path.exists():
    resp = requests.get(dump_url, stream=True)
    total = int(resp.headers.get('content-length', 0))
    with open(dump_path, 'wb') as f, tqdm(total=total, unit='B', unit_scale=True, desc='Downloading dump') as pbar:
        for chunk in resp.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)
                pbar.update(len(chunk))
else:
    print('Dump already downloaded.')


## 2. Extract with WikiExtractor


In [None]:
import bz2, re, json, shutil
from pathlib import Path

dump_path = dump_path
extract_output_dir = EXTRACT_DIR / "bnwiki_stream"
if extract_output_dir.exists():
    shutil.rmtree(extract_output_dir)
extract_output_dir.mkdir(parents=True, exist_ok=True)

title_re   = re.compile(r"<title>(.*?)</title>")
text_re    = re.compile(r"<text[^>]*>(.*?)</text>", re.DOTALL)
redirect_re= re.compile(r"<redirect")

def iter_pages(bz2_file):
    buf = []
    inside = False
    with bz2.open(bz2_file, "rt", encoding="utf-8", errors="ignore") as f:
        for line in f:
            if "<page>" in line:
                inside = True
                buf = [line]
            elif "</page>" in line and inside:
                buf.append(line)
                yield "".join(buf)
                inside = False
            elif inside:
                buf.append(line)

def clean_wikitext(txt):
    txt = re.sub(r"\{\{.*?\}\}", " ", txt, flags=re.DOTALL)
    txt = re.sub(r"\[\[File:.*?\]\]", " ", txt, flags=re.IGNORECASE | re.DOTALL)
    txt = re.sub(r"\[\[(?:[^|\]]*\|)?([^|\]]+)\]\]", r"\1", txt)
    txt = re.sub(r"<ref.*?</ref>", " ", txt, flags=re.DOTALL)
    txt = re.sub(r"<.*?>", " ", txt)
    txt = re.sub(r"\s+", " ", txt)
    return txt.strip()

batch = []
batch_size = 4000
file_index = 0
page_count = 0

for page_xml in iter_pages(dump_path):
    if redirect_re.search(page_xml):
        continue
    tm = title_re.search(page_xml)
    xm = text_re.search(page_xml)
    if not (tm and xm):
        continue
    title = tm.group(1)
    body = clean_wikitext(xm.group(1))
    if not body:
        continue
    batch.append({"title": title, "text": body})
    page_count += 1
    if len(batch) >= batch_size:
        out_file = extract_output_dir / f"wiki_{file_index:05d}.json"
        with open(out_file, "w", encoding="utf-8") as out:
            for obj in batch:
                out.write(json.dumps(obj, ensure_ascii=False) + "\n")
        batch.clear()
        file_index += 1

if batch:
    out_file = extract_output_dir / f"wiki_{file_index:05d}.json"
    with open(out_file, "w", encoding="utf-8") as out:
        for obj in batch:
            out.write(json.dumps(obj, ensure_ascii=False) + "\n")

print(f"Streaming extraction complete. Pages: {page_count}, Files: {file_index + 1}")
print("Sample file:", out_file)

## 3. Normalize & Clean


In [None]:
import json, unicodedata, gzip
from src.utils import normalize_line

SOURCE_JSON_DIR = EXTRACT_DIR / "bnwiki_stream"
print("Using source JSON directory:", SOURCE_JSON_DIR)

import json
from src.utils import normalize_line
from pathlib import Path

CLEAN_TEXT = DATA_DIR / "clean_corpus.txt"
if CLEAN_TEXT.exists():
    CLEAN_TEXT.unlink()

lines_written = 0
for jsf in SOURCE_JSON_DIR.rglob("*.json"):
    with open(jsf, "r", encoding="utf-8") as f:
        for line in f:
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue
            text = obj.get("text", "")
            if not text:
                continue
            for raw_line in text.split("\n"):
                norm = normalize_line(raw_line)
                if norm:
                    lines_written += 1
                    with open(CLEAN_TEXT, "a", encoding="utf-8") as out:
                        out.write(norm + "\n")

print("Clean corpus lines:", lines_written)

## 4. Train Custom BPE


In [4]:
from src.custom_bpe import CustomBPE, normalize_bengali

TARGET_VOCAB = 16000  # > 5000 as per assignment
MIN_FREQ = 3
CLEAN_TEXT_LINE_LIMIT = 5000  # set to int to cap lines; None uses the full clean corpus

custom_bpe = CustomBPE(
    vocab_size=TARGET_VOCAB,
    min_freq=MIN_FREQ,
    progress_every=500,
    normalize_fn=lambda s: normalize_bengali(s, map_digits=True, keep_latin=True),
    dropout=0.05,
    debug=True
)

def line_iter(limit=None):
    effective_limit = CLEAN_TEXT_LINE_LIMIT if limit is None else limit
    with open(CLEAN_TEXT, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if effective_limit is not None and i >= effective_limit:
                break
            yield line

custom_result = custom_bpe.train(line_iter())
print(f"Custom BPE merges: {len(custom_result.merges)} | vocab size (approx tokens seen): {len(custom_result.vocab)}")


[CustomBPE] Merge 500: ('গ', 'ু') | current vocab est=505
[CustomBPE] Merge 1000: ('ার', 'ে</w>') | current vocab est=1005
[CustomBPE] Merge 1000: ('ার', 'ে</w>') | current vocab est=1005
[CustomBPE] Merge 1500: ('মহ', 'িল') | current vocab est=1505
[CustomBPE] Merge 1500: ('মহ', 'িল') | current vocab est=1505
[CustomBPE] Merge 2000: ('ম', 'য়') | current vocab est=2005
[CustomBPE] Merge 2000: ('ম', 'য়') | current vocab est=2005
[CustomBPE] Merge 2500: ('ুয়', 'ারি') | current vocab est=2505
[CustomBPE] Merge 2500: ('ুয়', 'ারি') | current vocab est=2505
[CustomBPE] Merge 3000: ('থ', 'া</w>') | current vocab est=3005
[CustomBPE] Merge 3000: ('থ', 'া</w>') | current vocab est=3005
[CustomBPE] Merge 3500: ('উ', 'দ্ধ') | current vocab est=3505
[CustomBPE] Merge 3500: ('উ', 'দ্ধ') | current vocab est=3505
[CustomBPE] Merge 4000: ('ব', 'ক্') | current vocab est=4005
[CustomBPE] Merge 4000: ('ব', 'ক্') | current vocab est=4005
[CustomBPE] Merge 4500: ('প্র', 'কৃতি</w>') | current vocab est=

## 5. Train Hugging Face Tokenizers BPE (Optional, For Comparison)


In [None]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, normalizers, processors

hf_tokenizer_path = MODEL_DIR / 'hf_tokenizer.json'
if not hf_tokenizer_path.exists():
    tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
    tokenizer.normalizer = normalizers.Sequence([
        normalizers.NFC()
    ])
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    trainer = trainers.BpeTrainer(
        vocab_size=8000,
        min_frequency=5,
        special_tokens=["[PAD]","[UNK]","[CLS]","[SEP]","[MASK]"]
    )

    def batch_iterator(batch_size=1000, limit=None):
        effective_limit = CLEAN_TEXT_LINE_LIMIT if limit is None else limit
        with open(CLEAN_TEXT, 'r', encoding='utf-8') as f:
            batch = []
            used = 0
            for line in f:
                line = line.strip()
                if not line:
                    continue
                batch.append(line)
                used += 1
                if len(batch) == batch_size:
                    yield batch
                    batch = []
                if effective_limit is not None and used >= effective_limit:
                    break
            if batch:
                yield batch

    tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)
    tokenizer.post_processor = processors.TemplateProcessing(
        single='[CLS] $A [SEP]',
        pair='[CLS] $A [SEP] $B:1 [SEP]:1',
        special_tokens=[('[CLS]', tokenizer.token_to_id('[CLS]')), ('[SEP]', tokenizer.token_to_id('[SEP]'))]
    )
    tokenizer.save(str(hf_tokenizer_path))
else:
    tokenizer = Tokenizer.from_file(str(hf_tokenizer_path))
print('HF tokenizer vocab size:', tokenizer.get_vocab_size())


## 6. Evaluate Compression Metrics


In [5]:
from src.utils import compute_basic_metrics

metrics_sample_size = 5000
if 'CLEAN_TEXT_LINE_LIMIT' in globals() and CLEAN_TEXT_LINE_LIMIT is not None:
    metrics_sample_size = min(metrics_sample_size, CLEAN_TEXT_LINE_LIMIT)

metrics_custom = compute_basic_metrics(CLEAN_TEXT, custom_bpe.encode, sample_size=metrics_sample_size)
# metrics_hf = compute_basic_metrics(CLEAN_TEXT, lambda t: tokenizer.encode(t).ids, sample_size=metrics_sample_size)

print('Custom BPE Metrics:', json.dumps(metrics_custom, indent=2, ensure_ascii=False))
# print('HuggingFace BPE Metrics:', json.dumps(metrics_hf, indent=2, ensure_ascii=False))

# assert metrics_custom['assignment_compression_ratio'] >= 3 or metrics_hf['assignment_compression_ratio'] >= 3, \
#     'Need ratio >= 3 for assignment. Consider increasing vocab or cleaning.'
assert metrics_custom['assignment_compression_ratio'] >= 3, \
    'Need ratio >= 3 for assignment. Consider increasing vocab or cleaning.'


Custom BPE Metrics: {
  "lines_sampled": 5000,
  "total_chars": 26444005,
  "total_bytes": 67426621,
  "total_tokens": 5811945,
  "chars_per_token": 4.549940682508179,
  "bytes_per_token": 11.601386627024171,
  "assignment_compression_ratio": 4.549940682508179,
  "approx_byte_compression_ratio": 5.8006933135120855
}


## 7. Save Artifacts & (Optional) Push to Hugging Face Hub


In [7]:
from src.utils import save_json

ARTIFACTS_DIR = MODEL_DIR / 'artifacts'
ARTIFACTS_DIR.mkdir(exist_ok=True)

save_json(metrics_custom, ARTIFACTS_DIR / 'metrics_custom.json')
# save_json(metrics_hf, ARTIFACTS_DIR / 'metrics_hf.json')

CUSTOM_TOKENIZER_PATH = MODEL_DIR / 'custom_tokenizer.json'
custom_bpe.save(CUSTOM_TOKENIZER_PATH)

# Save custom merges
with open(ARTIFACTS_DIR / 'custom_merges.txt', 'w', encoding='utf-8') as f:
    for a,b in custom_bpe.merges:
        f.write(f'{a} {b}\n')

print('Custom tokenizer saved to:', CUSTOM_TOKENIZER_PATH)
print('Artifacts saved.')


Custom tokenizer saved to: c:\Personal Learning\ERA V4\Assignment 11\tsai_assignment_11\model\custom_tokenizer.json
Artifacts saved.


### Hugging Face Upload (Requires HF_TOKEN)


In [None]:
import os
from huggingface_hub import HfApi, HfFolder, create_repo, upload_file

hf_token = os.environ.get('HF_TOKEN')
repo_name = 'bengali-bpe-tokenizer'
user = None
if hf_token:
    api = HfApi(token=hf_token)
    user = api.whoami()['name']
    full_repo = f'{user}/{repo_name}'
    try:
        create_repo(full_repo, private=False)
    except Exception as e:
        print('Repo may already exist:', e)
    # Upload custom tokenizer artifacts
    upload_file(path_or_fileobj=str(CUSTOM_TOKENIZER_PATH), path_in_repo='custom_tokenizer.json', repo_id=full_repo)
    upload_file(path_or_fileobj=str(ARTIFACTS_DIR / 'custom_merges.txt'), path_in_repo='custom_merges.txt', repo_id=full_repo)
    upload_file(path_or_fileobj=str(ARTIFACTS_DIR / 'metrics_custom.json'), path_in_repo='metrics_custom.json', repo_id=full_repo)
    print('Uploaded custom tokenizer to:', f'https://huggingface.co/{full_repo}')
else:
    print('HF_TOKEN not set; skipping upload.')


## 8. Prepare Space Files
Place `custom_tokenizer.json` and `space/app.py` in a new Space repo.


In [None]:
SPACE_DIR = BASE_DIR / 'space'
SPACE_DIR.mkdir(exist_ok=True)

space_tokenizer_path = SPACE_DIR / 'custom_tokenizer.json'
shutil.copy(CUSTOM_TOKENIZER_PATH, space_tokenizer_path)
shutil.copy(BASE_DIR / 'src' / 'custom_bpe.py', SPACE_DIR / 'custom_bpe.py')
print('Copied custom tokenizer and supporting code to space directory.')


## 9. Quick Test Decode/Encode


In [None]:
sample_text = 'বাংলা ভাষা একটি ইন্দো-আর্য ভাষা।'
enc = tokenizer.encode(sample_text)
print(enc.tokens)
print('Decoded back:', tokenizer.decode(enc.ids))


## 10. Summary Report


In [None]:
summary = {
    'custom_bpe_vocab_estimate': len(custom_bpe.token2id),
    'hf_bpe_vocab_size': tokenizer.get_vocab_size(),
    'custom_metrics': metrics_custom,
    'hf_metrics': metrics_hf
}
print(json.dumps(summary, indent=2, ensure_ascii=False))
