In [1]:
import os
import re
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from transformers import PreTrainedTokenizerFast
import pandas as pd

In [2]:
CURRICULUM_TXT = "telugu_curriculum.txt"
TOKENS17_TXT = "telugu_17M_tokens.txt"

In [3]:
def load_csv_text(path):
    if not os.path.exists(path):
        raise FileNotFoundError(f"CSV file not found: {path}")
    df = pd.read_csv(path)
    col = df.columns[0]
    return df[col].dropna().astype(str).tolist()

In [4]:
def load_txt(path):
    if not os.path.exists(path):
        raise FileNotFoundError(f"Text file not found: {path}")
    with open(path, "r", encoding="utf-8") as f:
        return [ln.strip() for ln in f if ln.strip()]

In [5]:
def clean(lines):
    return [re.sub(r"\s+", " ", ln).strip() for ln in lines if ln.strip()]

In [6]:
curriculum = clean(load_txt(CURRICULUM_TXT))
tokens17 = clean(load_txt(TOKENS17_TXT))

In [7]:
corpus = curriculum + tokens17
print(f"Total lines: {len(corpus)}")

Total lines: 428350


In [8]:
tokenizer = Tokenizer(BPE(unk_token="<unk>"))
tokenizer.pre_tokenizer = Whitespace()

trainer = BpeTrainer(
    vocab_size=32000,
    min_frequency=2,
    special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
)

In [9]:
os.makedirs("tokenizer_temp", exist_ok=True)
with open("tokenizer_temp/corpus.txt", "w", encoding="utf-8") as f:
    for line in corpus:
        f.write(line + "\n")

In [10]:
tokenizer.train(["tokenizer_temp/corpus.txt"], trainer)






In [11]:
os.makedirs("telugu_tokenizer", exist_ok=True)
tokenizer.save("telugu_tokenizer/tokenizer.json")

In [12]:
hf_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="telugu_tokenizer/tokenizer.json",
    unk_token="<unk>",
    pad_token="<pad>",
    bos_token="<bos>",
    eos_token="<eos>"
)

In [13]:
hf_tokenizer.save_pretrained("telugu_tokenizer")

('telugu_tokenizer/tokenizer_config.json',
 'telugu_tokenizer/special_tokens_map.json',
 'telugu_tokenizer/tokenizer.json')

In [14]:
print("Done. Saved to ./telugu_tokenizer/")

Done. Saved to ./telugu_tokenizer/
