In [None]:
import os
import time
import datetime

from datasets import load_dataset
from tokenizers import SentencePieceBPETokenizer
from transformers import PreTrainedTokenizerFast


In [None]:
# --- Tokenizer training configuration ---------------------------------------
vocab_size = 50_000               # target vocabulary size passed to the trainer
save_path = "./vag_tokenizers/"   # directory that receives every exported artifact
os.makedirs(save_path, exist_ok=True)

bos_tok = "<bos>"
eos_tok = "<eos>"

# Digits that are handed to the trainer together with the special tokens so
# that each one is guaranteed its own vocabulary entry: the ten ASCII digits
# (including "0") followed by the ten Devanagari digits.
# NOTE(review): tokens passed via `special_tokens` are presumably registered as
# *special* tokens, so `decode(..., skip_special_tokens=True)` may strip digits
# from the decoded text - confirm before relying on that flag.
extra_char = [
    "0","1","2","3","4","5","6","7","8","9",
    "०","१","२","३","४","५","६","७","८","९"
]

# Control / chat-role markers first, then the digit tokens.
special_tokens = [
    "<pad>", "<cls>", "<sep>", "<mask>", "<unk>",
    bos_tok, eos_tok, "<user>", "<assistant>"
] + extra_char


In [None]:
# Load the training split of the Nepali text corpus the tokenizer is trained on.
ds = load_dataset("Sakonii/nepalitext-language-model-dataset", split="train")


README.md: 0.00B [00:00, ?B/s]

dataset_infos.json: 0.00B [00:00, ?B/s]

data/train-00000-of-00014.parquet:   0%|          | 0.00/208M [00:00<?, ?B/s]

data/train-00001-of-00014.parquet:   0%|          | 0.00/208M [00:00<?, ?B/s]

data/train-00002-of-00014.parquet:   0%|          | 0.00/207M [00:00<?, ?B/s]

data/train-00003-of-00014.parquet:   0%|          | 0.00/209M [00:00<?, ?B/s]

data/train-00004-of-00014.parquet:   0%|          | 0.00/210M [00:00<?, ?B/s]

data/train-00005-of-00014.parquet:   0%|          | 0.00/208M [00:00<?, ?B/s]

data/train-00006-of-00014.parquet:   0%|          | 0.00/207M [00:00<?, ?B/s]

data/train-00007-of-00014.parquet:   0%|          | 0.00/208M [00:00<?, ?B/s]

data/train-00008-of-00014.parquet:   0%|          | 0.00/209M [00:00<?, ?B/s]

data/train-00009-of-00014.parquet:   0%|          | 0.00/207M [00:00<?, ?B/s]

data/train-00010-of-00014.parquet:   0%|          | 0.00/207M [00:00<?, ?B/s]

data/train-00011-of-00014.parquet:   0%|          | 0.00/209M [00:00<?, ?B/s]

data/train-00012-of-00014.parquet:   0%|          | 0.00/209M [00:00<?, ?B/s]

data/train-00013-of-00014.parquet:   0%|          | 0.00/208M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/58.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/13141222 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/268189 [00:00<?, ? examples/s]

In [None]:
def text_iterator(dataset, batch_size=1000):
    """Yield the "text" column of `dataset` in consecutive slices.

    Args:
        dataset: Object supporting `len()` and slice indexing, where a slice
            can be indexed with the key "text".
        batch_size: Number of rows covered by each slice (the last slice may
            be shorter).

    Yields:
        Whatever `dataset[lo:hi]["text"]` returns for each consecutive slice.
    """
    n_rows = len(dataset)
    for lo in range(0, n_rows, batch_size):
        hi = lo + batch_size
        batch = dataset[lo:hi]
        yield batch["text"]


In [None]:
def train_sentencepiece_tokenizer(dataset, vocab_size):
    """Train a SentencePiece-style BPE tokenizer and export it to disk.

    The tokenizer is trained on the batches produced by
    `text_iterator(dataset)`. Two exports are written under the notebook-level
    `save_path`: the raw model files (prefixed with a timestamped base name)
    and a `PreTrainedTokenizerFast` directory named `<base_name>_hf`.

    Relies on the notebook globals `save_path`, `special_tokens`, `bos_tok`
    and `eos_tok`.

    Args:
        dataset: Dataset accepted by `text_iterator`.
        vocab_size: Target vocabulary size handed to the trainer.

    Returns:
        str: Path of the `<base_name>_hf` directory, so callers can reload the
        tokenizer without hard-coding the timestamped name.
    """
    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # wall-clock adjustments during a long training run.
    started = time.perf_counter()

    # Make sure the output directory still exists before anything is written
    # (it may have been removed since the configuration cell ran).
    os.makedirs(save_path, exist_ok=True)

    tokenizer = SentencePieceBPETokenizer()

    print(f"Training SentencePiece BPE tokenizer (vocab={vocab_size})")

    tokenizer.train_from_iterator(
        text_iterator(dataset),
        vocab_size=vocab_size,
        min_frequency=5,
        special_tokens=special_tokens,
        show_progress=True
    )

    # A per-run timestamp keeps successive exports from overwriting each other.
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    base_name = f"vag_sentencepiece_bpe_{vocab_size}_{timestamp}"

    # Raw model files, written with `base_name` as the filename prefix.
    tokenizer.save_model(save_path, base_name)

    # Wrap the trained tokenizer for use through the transformers API and
    # declare which vocabulary entries play the standard special-token roles.
    hf_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        bos_token=bos_tok,
        eos_token=eos_tok,
        unk_token="<unk>",
        pad_token="<pad>",
        cls_token="<cls>",
        sep_token="<sep>",
        mask_token="<mask>",
        additional_special_tokens=["<user>", "<assistant>"],
        padding_side="left",
        truncation_side="right",
        clean_up_tokenization_spaces=False,
    )

    hf_dir = os.path.join(save_path, f"{base_name}_hf")
    hf_tokenizer.save_pretrained(hf_dir)

    print(f"Tokenizer saved in {(time.perf_counter() - started)/60:.2f} minutes")

    return hf_dir


In [None]:
# Run the full training + export on the loaded corpus (long-running cell).
train_sentencepiece_tokenizer(dataset=ds, vocab_size=vocab_size)


Training SentencePiece BPE tokenizer (vocab=50000)
Tokenizer saved in 16.21 minutes


In [None]:
from transformers import AutoTokenizer

# Resolve the newest export for this vocab size instead of hard-coding an
# absolute path with one particular run's timestamp. Every training run writes
# a fresh "vag_sentencepiece_bpe_<vocab>_<YYYYMMDD_HHMMSS>_hf" directory under
# save_path, and that timestamp format sorts chronologically as plain text.
hf_prefix = f"vag_sentencepiece_bpe_{vocab_size}_"
hf_dirs = sorted(
    name for name in os.listdir(save_path)
    if name.startswith(hf_prefix) and name.endswith("_hf")
)
if not hf_dirs:
    raise FileNotFoundError(
        f"No exported tokenizer matching '{hf_prefix}*_hf' found in {save_path}; "
        "run the training cell first."
    )

tokenizer = AutoTokenizer.from_pretrained(
    os.path.join(save_path, hf_dirs[-1]),
    use_fast=True
)


In [None]:
# Quick sanity check of the reloaded tokenizer: vocabulary size and the
# special-token role mapping.
print(f"Vocab size: {tokenizer.vocab_size}")
print(f"Special tokens: {tokenizer.special_tokens_map}")


Vocab size: 50000
Special tokens: {'bos_token': '<bos>', 'eos_token': '<eos>', 'unk_token': '<unk>', 'sep_token': '<sep>', 'pad_token': '<pad>', 'cls_token': '<cls>', 'mask_token': '<mask>', 'additional_special_tokens': ['<user>', '<assistant>']}


In [None]:
# Smoke test on an ordinary Nepali sentence: print the ids, then the
# corresponding token strings.
text = "श्रीगुरुभ्यो नमः। यो नेपाली वाक्य परीक्षण हो।"

out = tokenizer(text, return_tensors=None)
sample_ids = out["input_ids"]
print(sample_ids)
print(tokenizer.convert_ids_to_tokens(sample_ids))


[2114, 11887, 422, 1312, 42861, 1159, 1262, 14442, 3807, 2203]
['▁श्री', 'गुरु', 'भ', '्यो', '▁नमः।', '▁यो', '▁नेपाली', '▁वाक्य', '▁परीक्षण', '▁हो।']


In [None]:
# Round-trip check: decode the ids from the previous cell (special tokens
# skipped) and print the text for comparison with the input.
encoded_ids = out["input_ids"]
decoded = tokenizer.decode(encoded_ids, skip_special_tokens=True)
print(decoded)

श्रीगुरुभ्यो नमः। यो नेपाली वाक्य परीक्षण हो।


In [None]:
# Harder sample: Devanagari text carrying accent marks (॒ / ॑) on top of the
# base characters. Print the ids, then the corresponding token strings.
text = "अ॒ग्निमी॑ळे पु॒रोहि॑तं य॒ज्ञस्य॑ दे॒वमृ॒त्विज॑म् ।होता॑रं रत्न॒धात॑मम् ॥"

out = tokenizer(text, return_tensors=None)
accented_ids = out["input_ids"]
print(accented_ids)
print(tokenizer.convert_ids_to_tokens(accented_ids))


[1026, 455, 41983, 1563, 454, 428, 446, 1149, 455, 1209, 1093, 454, 413, 381, 1061, 455, 1541, 1535, 454, 1745, 455, 430, 423, 442, 455, 1083, 10295, 454, 1055, 1023, 1236, 1044, 454, 425, 381, 6132, 455, 1645, 413, 454, 423, 1055, 26387]
['▁अ', '॒', 'ग्नि', 'मी', '॑', 'ळ', 'े', '▁पु', '॒', 'रो', 'हि', '॑', 'त', 'ं', '▁य', '॒', 'ज्ञ', 'स्य', '॑', '▁दे', '॒', 'व', 'म', 'ृ', '॒', 'त्', 'विज', '॑', 'म्', '▁।', 'हो', 'ता', '॑', 'र', 'ं', '▁रत्न', '॒', 'धा', 'त', '॑', 'म', 'म्', '▁॥']


In [None]:
# out["input_ids"]

In [None]:
# Round-trip check for the accented sample: decode with default settings and
# print the text for comparison with the input.
roundtrip_ids = out["input_ids"]
decoded = tokenizer.decode(roundtrip_ids)
print(decoded)

अ॒ग्निमी॑ळे पु॒रोहि॑तं य॒ज्ञस्य॑ दे॒वमृ॒त्विज॑म् ।होता॑रं रत्न॒धात॑मम् ॥
