# Train Marathi BPE Tokenizer

This notebook wraps functions from train.py so you can run the data-loading and training steps interactively. Edit the config cell as needed, then run the cells in order from top to bottom.


In [None]:
# Pull in the training helpers that live in train.py
from train import (
    TrainConfig,
    ensure_dirs,
    import_tokenizer_class,
    load_training_texts,
    save_text,
    train_and_save_tokenizer,
)

print('Imported training utilities from train.py')


In [None]:
# Training configuration: adjust the values below, then re-run this cell
# and the cells after it so they pick up the new settings.
cfg = TrainConfig(
    # --- source corpus ---
    dataset='ai4bharat/samanantar',
    subset='mr',
    split='train',
    lang_side='tgt',
    streaming=True,
    # --- run size ---
    num_examples=10_000,
    vocab_size=6_000,
    # --- output locations ---
    data_dir='data',
    model_dir='model',
    output_text_filename='marathi_samanantar.txt',
)
cfg  # bare last expression so the notebook displays the resulting config


In [None]:
# Prepare dirs and load text (this may stream and take time depending on num_examples)
# Runs in this sequence: ensure_dirs(cfg), resolve the tokenizer class, load the
# training text, report its length, then hand the text to save_text.
# tokenizer_cls and text are reused by the training cell, so keep these names.
ensure_dirs(cfg)
# NOTE(review): tokenizer_module / tokenizer_class are not set in the config cell,
# so they presumably come from TrainConfig defaults — confirm in train.py.
tokenizer_cls = import_tokenizer_class(cfg.tokenizer_module, cfg.tokenizer_class)
# assumes load_training_texts returns a single str (its len() is reported as
# characters below) — TODO confirm against train.py
text = load_training_texts(cfg)
print(f'Loaded text length: {len(text):,} characters')
save_text(text, cfg)


In [None]:
# Train the tokenizer.
# Training is opt-in so that "Restart & Run All" never starts a long run by
# accident. Flip RUN_TRAINING to True and re-run this cell to train.
RUN_TRAINING = False

if RUN_TRAINING:
    # tokenizer_cls and text come from the load cell; cfg from the config cell.
    train_and_save_tokenizer(tokenizer_cls, text, cfg)
else:
    print('Training skipped. Set RUN_TRAINING = True in this cell and re-run it to train.')


Notes:
- If training takes too long to run comfortably in the notebook, run it from a terminal instead.
- This notebook simply imports and calls functions in train.py so you can debug interactively.