## Whisper tokenizer test

In [18]:
from pathlib import Path

# Root folder of the local Whisper checkpoints: <parent of the working directory>/models/whisper
BASE_PATH = Path().resolve().parent / "models" / "whisper"

# Checkpoint variants to inspect; MODEL_PATHS below keeps this order
MODEL_NAMES = [
    "tiny", "tiny.en",
    "small", "small.en",
    "base", "base.en",
    "medium", "medium.en",
    "large", "large-v2", "large-v3",
]

# Map every variant to its checkpoint directory.
# Dots in the key become underscores ("tiny.en" -> "tiny_en"); dashes are kept as-is.
MODEL_PATHS = {}
for name in MODEL_NAMES:
    MODEL_PATHS[name.replace('.', '_')] = BASE_PATH / f"whisper-{name}"

# Fail fast on the first checkpoint directory that is missing from disk
for path in MODEL_PATHS.values():
    if path.exists():
        continue
    raise FileNotFoundError(f"Path {path} not found. Please verify if the model was correctly downloaded.")


In [37]:
from transformers import WhisperTokenizer

# Build one tokenizer per checkpoint, keyed exactly like MODEL_PATHS,
# and report each vocabulary size as it is loaded.
TOKENIZERS = {}
for model_key, checkpoint_dir in MODEL_PATHS.items():
    print(f"Loading tokenizer for model: {model_key}")
    loaded_tokenizer = WhisperTokenizer.from_pretrained(checkpoint_dir)
    TOKENIZERS[model_key] = loaded_tokenizer
    vocab_size = len(TOKENIZERS[model_key].get_vocab())
    print(f"Tokenizer for model {model_key} has {vocab_size} tokens.")
    print()  # blank line between models


Loading tokenizer for model: tiny
Tokenizer for model tiny has 51865 tokens.

Loading tokenizer for model: tiny_en
Tokenizer for model tiny_en has 51864 tokens.

Loading tokenizer for model: small
Tokenizer for model small has 51865 tokens.

Loading tokenizer for model: small_en
Tokenizer for model small_en has 51864 tokens.

Loading tokenizer for model: base
Tokenizer for model base has 51865 tokens.

Loading tokenizer for model: base_en
Tokenizer for model base_en has 51864 tokens.

Loading tokenizer for model: medium
Tokenizer for model medium has 50364 tokens.

Loading tokenizer for model: medium_en
Tokenizer for model medium_en has 51864 tokens.

Loading tokenizer for model: large
Tokenizer for model large has 51865 tokens.

Loading tokenizer for model: large-v2


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Tokenizer for model large-v2 has 51865 tokens.

Loading tokenizer for model: large-v3
Tokenizer for model large-v3 has 51866 tokens.



In [41]:
from tabulate import tabulate

def tokenize_texts(texts: dict, tokenizers: "dict | None" = None) -> None:
    """
    Tokenize multiple texts with every tokenizer and print the token counts side by side.

    The printed table has one row per tokenizer and one column per text. Each cell is
    the length of the ``input_ids`` the tokenizer returns for that text, so it also
    counts whatever special tokens the tokenizer inserts by default.

    Args:
    texts (dict): A dictionary with keys as text descriptions and values as the texts.
    tokenizers (dict, optional): Mapping of model name to a tokenizer, i.e. a callable
        whose result can be indexed with ``'input_ids'``. When omitted, the
        notebook-level ``TOKENIZERS`` mapping is used, so existing calls are unchanged.
    """
    if tokenizers is None:
        # Fall back to the tokenizers loaded earlier in the notebook
        tokenizers = TOKENIZERS

    headers = ["Model", *texts]
    table = [
        [name, *(len(tokenizer(text)['input_ids']) for text in texts.values())]
        for name, tokenizer in tokenizers.items()
    ]

    print(tabulate(table, headers=headers, tablefmt="pretty"))


In [42]:
# The same paragraph in three forms, used to compare token counts per tokenizer.

# Hindi, Devanagari script
hi_text = "यह किसी लेख, निबंध या रचना का अंश भी हो सकता है किन्तु स्वयं में पूर्ण होना चाहिए। किसी भी शब्द, वाक्य, सूत्र से सम्बद्ध विचार एवं भावों अपने अर्जित ज्ञान, निजी अनुभूति से संजोकर प्रवाहमयी शैली के माध्यम से गद्यभाषा में अभिव्यक्त करना अनुच्छेद कहलाता है।"

# English rendering
en_text = "It can be a part of any article, essay or composition but it should be complete in itself. Expressing the thoughts and feelings related to any word, sentence, formula in prose language through a flowing style by collecting them with your acquired knowledge and personal experience is called a paragraph."

# Hindi written in Latin script
# NOTE(review): this text has "ko" after "bhaavon", which has no counterpart in hi_text — confirm whether intentional.
hinglish_text = "yah kisee lekh, nibandh ya rachana ka ansh bhee ho sakata hai kintu svayan mein poorn hona chaahie. kisee bhee shabd, vaaky, sootr se sambaddh vichaar evan bhaavon ko apane arjit gyaan, nijee anubhooti se sanjokar pravaahamayee shailee ke maadhyam se gadyabhaasha mein abhivyakt karana anuchchhed kahalaata hai."

# Column label -> text; insertion order is the column order of the printed table
texts = dict(Hindi=hi_text, English=en_text, Hinglish=hinglish_text)

tokenize_texts(texts)


+-----------+-------+---------+----------+
|   Model   | Hindi | English | Hinglish |
+-----------+-------+---------+----------+
|   tiny    |  275  |   65    |   126    |
|  tiny_en  |  398  |   59    |   137    |
|   small   |  275  |   65    |   126    |
| small_en  |  398  |   59    |   137    |
|   base    |  275  |   65    |   126    |
|  base_en  |  398  |   59    |   137    |
|  medium   |  275  |   65    |   126    |
| medium_en |  398  |   59    |   137    |
|   large   |  275  |   65    |   126    |
| large-v2  |  275  |   65    |   126    |
| large-v3  |  275  |   59    |   126    |
+-----------+-------+---------+----------+


In [35]:
# Show how the English-only and the multilingual "tiny" tokenizers split the Hindi text.
# The first listing is followed by an empty line, the second by a plain newline.
for model_key, line_end in (("tiny_en", "\n\n"), ("tiny", "\n")):
    tokenizer = TOKENIZERS[model_key]
    ids = tokenizer(hi_text)['input_ids']
    tokens = tokenizer.convert_ids_to_tokens(ids)
    print(tokens, end=line_end)


['<|startoftranscript|>', '<|notimestamps|>', 'à¤', '¯', 'à¤', '¹', 'Ġà¤', 'ķ', 'à¤', '¿', 'à¤', '¸', 'à¥', 'Ģ', 'Ġà¤', '²', 'à¥', 'ĩ', 'à¤', 'ĸ', ',', 'Ġà¤', '¨', 'à¤', '¿', 'à¤', '¬', 'à¤', 'Ĥ', 'à¤', '§', 'Ġà¤', '¯', 'à¤¾', 'Ġà¤', '°', 'à¤', 'ļ', 'à¤', '¨', 'à¤¾', 'Ġà¤', 'ķ', 'à¤¾', 'Ġà¤', 'ħ', 'à¤', 'Ĥ', 'à¤', '¶', 'Ġà¤', 'Ń', 'à¥', 'Ģ', 'Ġà¤', '¹', 'à¥', 'ĭ', 'Ġà¤', '¸', 'à¤', 'ķ', 'à¤', '¤', 'à¤¾', 'Ġà¤', '¹', 'à¥', 'Ī', 'Ġà¤', 'ķ', 'à¤', '¿', 'à¤', '¨', 'à¥', 'į', 'à¤', '¤', 'à¥', 'ģ', 'Ġà¤', '¸', 'à¥', 'į', 'à¤', 'µ', 'à¤', '¯', 'à¤', 'Ĥ', 'Ġà¤', '®', 'à¥', 'ĩ', 'à¤', 'Ĥ', 'Ġà¤', 'ª', 'à¥', 'Ĥ', 'à¤', '°', 'à¥', 'į', 'à¤', '£', 'Ġà¤', '¹', 'à¥', 'ĭ', 'à¤', '¨', 'à¤¾', 'Ġà¤', 'ļ', 'à¤¾', 'à¤', '¹', 'à¤', '¿', 'à¤', 'ı', 'à¥', '¤', 'Ġà¤', 'ķ', 'à¤', '¿', 'à¤', '¸', 'à¥', 'Ģ', 'Ġà¤', 'Ń', 'à¥', 'Ģ', 'Ġà¤', '¶', 'à¤', '¬', 'à¥', 'į', 'à¤', '¦', ',', 'Ġà¤', 'µ', 'à¤¾', 'à¤', 'ķ', 'à¥', 'į', 'à¤', '¯', ',', 'Ġà¤', '¸', 'à¥', 'Ĥ', 'à¤', '¤', 'à¥', 'į', 'à¤', '°', 'Ġà¤', '¸', 'à¥', 'ĩ'