# Make a Custom Tokenizer

In [22]:
from datasets import DatasetDict, load_dataset

# Fixed seed so the random train/test partition is the same on every
# Restart & Run All instead of changing between runs.
SPLIT_SEED = 42

# Load the JSONL file, requesting its "train" split.
full_dataset = load_dataset(
    "json",
    data_files="../new-data/whisper_finetune_dataset.jsonl",
    split="train",
)

# Hold out 10% of the records as a test split.
split_dataset = full_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)

# `raw_dataset` is the name later cells iterate over; it maps split name -> Dataset.
raw_dataset = DatasetDict({
    "train": split_dataset["train"],
    "test": split_dataset["test"],
})

In [23]:
from pathlib import Path

output_path = Path("transcripts.txt")

# Write one transcript per line, across every split of `raw_dataset`.
# Entries that are empty after stripping surrounding whitespace are skipped.
with output_path.open("w", encoding="utf-8") as transcript_file:
    for split_name in raw_dataset:
        stripped_texts = (row["text"].strip() for row in raw_dataset[split_name])
        transcript_file.writelines(
            f"{transcript}\n" for transcript in stripped_texts if transcript
        )

In [24]:
from tokenizers import ByteLevelBPETokenizer
from pathlib import Path

# Point to your transcripts file
paths = [str(Path("transcripts.txt"))]

# Train a byte-level BPE tokenizer on the transcripts: vocabulary of 8000
# entries, a minimum frequency of 2, and four reserved special tokens.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=paths, vocab_size=8000, min_frequency=2, special_tokens=[
    "<s>", "</s>", "<pad>", "<unk>"
])

# save_model() expects the target directory to exist, so create it first;
# otherwise this cell fails on a fresh checkout.
tokenizer_dir = Path("custom-tokenizer")
tokenizer_dir.mkdir(parents=True, exist_ok=True)

# Save in the expected format (vocab.json + merges.txt)
tokenizer.save_model(str(tokenizer_dir))






['custom-tokenizer/vocab.json', 'custom-tokenizer/merges.txt']

In [None]:
# WhisperTokenizer is the class instantiated below; it was previously not
# imported, which raised NameError on a fresh kernel. The WhisperTokenizerFast
# import is retained unchanged.
from transformers import WhisperTokenizer, WhisperTokenizerFast

# Wrap the trained vocab/merges files in a Hugging Face Whisper tokenizer,
# mapping its special-token roles onto the tokens reserved during BPE training.
hf_tokenizer = WhisperTokenizer(
    vocab_file="custom-tokenizer/vocab.json",
    merges_file="custom-tokenizer/merges.txt",
    unk_token="<unk>",
    pad_token="<pad>",
    bos_token="<s>",
    eos_token="</s>",
)

# Persist the tokenizer files alongside the vocab/merges files.
hf_tokenizer.save_pretrained("custom-tokenizer")

('custom-tokenizer/tokenizer_config.json',
 'custom-tokenizer/special_tokens_map.json',
 'custom-tokenizer/vocab.json',
 'custom-tokenizer/merges.txt',
 'custom-tokenizer/normalizer.json',
 'custom-tokenizer/added_tokens.json')