In [None]:
!pip install -U datasets evaluate jiwer

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.4-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading fsspec-2025.3.0-py3-none-any.whl (19

In [None]:
import multiprocessing
from multiprocessing import Pool

import torch
from tqdm import tqdm
from evaluate import load
from datasets import load_dataset
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset

# Pipelines already built in this session, keyed by (model id, device, dtype),
# so a checkpoint is loaded once instead of once per evaluated dataset.
_ASR_PIPELINES = {}


def _get_asr_pipeline(model_id):
    """Return the speech-recognition pipeline for ``model_id``, building it on first use."""
    if torch.cuda.is_available():
        device = 0                 # GPU-0
        torch_dtype = torch.float16
    else:
        device = -1                # CPU
        torch_dtype = torch.float32

    cache_key = (model_id, device, torch_dtype)
    if cache_key not in _ASR_PIPELINES:
        _ASR_PIPELINES[cache_key] = pipeline(
            "automatic-speech-recognition",
            model=model_id,
            device=device,
            torch_dtype=torch_dtype,
        )
    return _ASR_PIPELINES[cache_key]


def run_eval(dataset, model_id="rbcurzon/whisper-small-ph", language="tagalog"):
    """Transcribe every example of ``dataset`` and report the word error rate.

    Args:
        dataset: dataset with an ``"audio"`` column (fed to the pipeline) and a
            ``"transcription"`` column (used as the WER reference).
        model_id: checkpoint passed to the speech-recognition pipeline.
        language: language passed to generation for the transcription task.

    Returns:
        str: ``"<dataset>: <wer>% WER"``, where ``<dataset>`` is ``str(dataset)``
        and ``<wer>`` is the word error rate in percent with two decimals.
    """
    print("Evaluating", dataset)

    # Reuses the pipeline from an earlier call when the model was already loaded.
    pipe = _get_asr_pipeline(model_id)

    all_predictions = [
        pred["text"]
        for pred in tqdm(
            pipe(
                KeyDataset(dataset, "audio"),
                return_timestamps=True,
                generate_kwargs={
                    "task": "transcribe",
                    "language": language,
                    "max_new_tokens": 128,
                },
            ),
            total=len(dataset),
        )
    ]

    wer_metric = load("wer")
    # The metric is scaled by 100 so the reported figure is a percentage.
    wer = 100 * wer_metric.compute(
        references=dataset["transcription"], predictions=all_predictions
    )
    return f"{dataset}: {wer:.2f}% WER"

In [None]:
from multiprocessing import Pool
from datasets import load_dataset
import multiprocessing
from functools import partial

def load_hf_datasets():
  """Load the ``test`` split of every subset of ``rbcurzon/ph_dialect_asr``.

  The subsets are fetched in parallel by a process pool, one ``load_dataset``
  call per subset.

  Returns:
    list: the loaded test splits in the same order as ``subsets`` below, so
    element ``i`` of the result always belongs to ``subsets[i]``.
  """
  subsets = ["bik", "ceb", "hil", "ilo", "mrw", "pag", "tgl", "war", "pam", "bisaya"]

  # Pre-apply the dataset path and the split; the pool then supplies each
  # subset name as load_dataset's next positional argument.
  load_dataset_with_split = partial(load_dataset, "rbcurzon/ph_dialect_asr", split="test")

  ds_list = []
  # No point in starting more workers than there are subsets to load.
  with Pool(min(len(subsets), multiprocessing.cpu_count())) as pool:
    # imap (unlike imap_unordered) yields results in input order, which keeps
    # every dataset paired with the subset name it was loaded for.
    for name, dataset in zip(subsets, pool.imap(load_dataset_with_split, subsets)):
      print(f"Loaded dataset for subset {name}: {dataset.num_rows} rows")
      ds_list.append(dataset)

  return ds_list

In [None]:
# Load the test split of every subset (see load_hf_datasets) for the evaluation loop below.
ds_list = load_hf_datasets()

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).
Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00006.parquet:   0%|          | 0.00/464M [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/76.1M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/439 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/110 [00:00<?, ? examples/s]

Loading dataset for subset: Dataset({
    features: ['audio', 'transcription', 'raw'],
    num_rows: 110
})


train-00000-of-00001.parquet:   0%|          | 0.00/93.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/24.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/962 [00:00<?, ? examples/s]

train-00001-of-00006.parquet:   0%|          | 0.00/466M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/241 [00:00<?, ? examples/s]

Loading dataset for subset: Dataset({
    features: ['audio', 'transcription', 'raw'],
    num_rows: 241
})


train-00000-of-00001.parquet:   0%|          | 0.00/90.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.8M [00:00<?, ?B/s]

train-00002-of-00006.parquet:   0%|          | 0.00/460M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/887 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/222 [00:00<?, ? examples/s]

Loading dataset for subset: Dataset({
    features: ['audio', 'transcription', 'raw'],
    num_rows: 222
})


train-00000-of-00001.parquet:   0%|          | 0.00/103M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/22.4M [00:00<?, ?B/s]

train-00003-of-00006.parquet:   0%|          | 0.00/451M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/563 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/141 [00:00<?, ? examples/s]

Loading dataset for subset: Dataset({
    features: ['audio', 'transcription', 'raw'],
    num_rows: 141
})


train-00000-of-00001.parquet:   0%|          | 0.00/90.2M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/21.1M [00:00<?, ?B/s]

train-00004-of-00006.parquet:   0%|          | 0.00/465M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/628 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/157 [00:00<?, ? examples/s]

Loading dataset for subset: Dataset({
    features: ['audio', 'transcription', 'raw'],
    num_rows: 157
})


train-00000-of-00004.parquet:   0%|          | 0.00/441M [00:00<?, ?B/s]

train-00005-of-00006.parquet:   0%|          | 0.00/471M [00:00<?, ?B/s]

train-00001-of-00004.parquet:   0%|          | 0.00/437M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/202M [00:00<?, ?B/s]

train-00002-of-00004.parquet:   0%|          | 0.00/443M [00:00<?, ?B/s]

test-00000-of-00002.parquet:   0%|          | 0.00/250M [00:00<?, ?B/s]

train-00003-of-00004.parquet:   0%|          | 0.00/437M [00:00<?, ?B/s]

test-00001-of-00002.parquet:   0%|          | 0.00/260M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3261 [00:00<?, ? examples/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/455M [00:00<?, ?B/s]

test-00000-of-00003.parquet:   0%|          | 0.00/359M [00:00<?, ?B/s]

test-00001-of-00003.parquet:   0%|          | 0.00/371M [00:00<?, ?B/s]

test-00002-of-00003.parquet:   0%|          | 0.00/372M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1884 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/225 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/418 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/541 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/964 [00:00<?, ? examples/s]

Loading dataset for subset: Dataset({
    features: ['id', 'num_samples', 'path', 'audio', 'transcription', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id'],
    num_rows: 541
})


train-00000-of-00001.parquet:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/22.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/420 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/105 [00:00<?, ? examples/s]

Loading dataset for subset: Dataset({
    features: ['audio', 'transcription', 'raw'],
    num_rows: 105
})


train-00000-of-00002.parquet:   0%|          | 0.00/420M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/427M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/197M [00:00<?, ?B/s]

Loading dataset for subset: Dataset({
    features: ['id', 'num_samples', 'path', 'audio', 'transcription', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id'],
    num_rows: 964
})


train-00000-of-00012.parquet:   0%|          | 0.00/141M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/875 [00:00<?, ? examples/s]

train-00001-of-00012.parquet:   0%|          | 0.00/677M [00:00<?, ?B/s]

train-00002-of-00012.parquet:   0%|          | 0.00/704M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/219 [00:00<?, ? examples/s]

Loading dataset for subset: Dataset({
    features: ['audio', 'transcription'],
    num_rows: 219
})


train-00003-of-00012.parquet:   0%|          | 0.00/690M [00:00<?, ?B/s]

train-00004-of-00012.parquet:   0%|          | 0.00/717M [00:00<?, ?B/s]

train-00005-of-00012.parquet:   0%|          | 0.00/79.9M [00:00<?, ?B/s]

train-00006-of-00012.parquet:   0%|          | 0.00/81.3M [00:00<?, ?B/s]

train-00007-of-00012.parquet:   0%|          | 0.00/135M [00:00<?, ?B/s]

train-00008-of-00012.parquet:   0%|          | 0.00/357M [00:00<?, ?B/s]

train-00009-of-00012.parquet:   0%|          | 0.00/785M [00:00<?, ?B/s]

train-00010-of-00012.parquet:   0%|          | 0.00/770M [00:00<?, ?B/s]

train-00011-of-00012.parquet:   0%|          | 0.00/774M [00:00<?, ?B/s]

test-00000-of-00004.parquet:   0%|          | 0.00/470M [00:00<?, ?B/s]

test-00001-of-00004.parquet:   0%|          | 0.00/145M [00:00<?, ?B/s]

test-00002-of-00004.parquet:   0%|          | 0.00/542M [00:00<?, ?B/s]

test-00003-of-00004.parquet:   0%|          | 0.00/781M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9938 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2705 [00:00<?, ? examples/s]

Loading dataset for subset: Dataset({
    features: ['audio', 'transcription', 'raw', 'id', 'num_samples', 'path', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id'],
    num_rows: 2705
})


In [None]:
# Evaluate the loaded test splits one after another. An explicit loop keeps the
# summaries collected so far in `results` if a later evaluation fails.
results = []

for ds in ds_list:
  summary = run_eval(ds)
  results.append(summary)

Evaluating Dataset({
    features: ['audio', 'transcription', 'raw'],
    num_rows: 110
})


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/356 [00:00<?, ?B/s]

Device set to use cuda:0
`generation_config` default values have been modified to match model-specific defaults: {'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}. If this is not desired, please set these values explicitly.
A custom logits processor of type <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> has been passed to `.generate()`, but it was also created in `.generate()`, given its parameterization. The custom <class 'transformers.generation.logits_process.

Downloading builder script: 0.00B [00:00, ?B/s]

Evaluating Dataset({
    features: ['audio', 'transcription', 'raw'],
    num_rows: 241
})


Device set to use cuda:0
  6%|▌         | 14/241 [00:14<02:44,  1.38it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 39%|███▉      | 94/241 [01:22<01:47,  1.37it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 54%|█████▎    | 129/241 [01:52<02:09,  1.16s/it]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 68%|██████▊   | 165/241 [02:20<00:36,  2.07it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
100%|██████████| 241/241 [03:18<00:00,  1.21it/s]


Evaluating Dataset({
    features: ['audio', 'transcription', 'raw'],
    num_rows: 222
})


Device set to use cuda:0
  9%|▉         | 20/222 [00:16<03:46,  1.12s/it]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 15%|█▌        | 34/222 [00:26<01:37,  1.93it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 23%|██▎       | 52/222 [00:38<01:36,  1.76it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 27%|██▋       | 61/222 [00:49<04:30,  1.68s/it]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 34%|███▍      | 75/222 [00:59<01:50,  1.33it/s]Whisper

Evaluating Dataset({
    features: ['audio', 'transcription', 'raw'],
    num_rows: 141
})


Device set to use cuda:0
  6%|▌         | 8/141 [00:06<02:16,  1.03s/it]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 13%|█▎        | 18/141 [00:14<01:49,  1.13it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 16%|█▋        | 23/141 [00:21<02:05,  1.06s/it]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 17%|█▋        | 24/141 [00:23<02:14,  1.15s/it]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 18%|█▊        | 25/141 [00:24<02:18,  1.19s/it]Whisper 

Evaluating Dataset({
    features: ['audio', 'transcription', 'raw'],
    num_rows: 157
})


Device set to use cuda:0
  3%|▎         | 5/157 [00:06<02:56,  1.16s/it]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 24%|██▎       | 37/157 [00:40<03:50,  1.92s/it]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 32%|███▏      | 51/157 [00:51<01:26,  1.23it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 35%|███▌      | 55/157 [00:54<01:35,  1.07it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 48%|████▊     | 76/157 [01:12<01:40,  1.24s/it]Whisper 

Evaluating Dataset({
    features: ['id', 'num_samples', 'path', 'audio', 'transcription', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id'],
    num_rows: 541
})


Device set to use cuda:0
  2%|▏         | 11/541 [00:13<10:53,  1.23s/it]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
  4%|▎         | 19/541 [00:24<14:04,  1.62s/it]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
  4%|▍         | 24/541 [00:34<13:19,  1.55s/it]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
  6%|▌         | 33/541 [00:47<12:34,  1.49s/it]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 13%|█▎        | 68/541 [01:31<12:10,  1.54s/it]Whisper

Evaluating Dataset({
    features: ['audio', 'transcription', 'raw'],
    num_rows: 105
})


Device set to use cuda:0
  0%|          | 0/105 [00:00<?, ?it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
  3%|▎         | 3/105 [00:04<02:18,  1.35s/it]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 12%|█▏        | 13/105 [00:22<03:46,  2.46s/it]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 18%|█▊        | 19/105 [00:37<05:16,  3.68s/it]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 22%|██▏       | 23/105 [00:44<02:28,  1.81s/it]Whisper did not p

Evaluating Dataset({
    features: ['id', 'num_samples', 'path', 'audio', 'transcription', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id'],
    num_rows: 964
})


Device set to use cuda:0
 17%|█▋        | 160/964 [03:35<26:54,  2.01s/it]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 24%|██▎       | 227/964 [05:07<14:13,  1.16s/it]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 25%|██▌       | 245/964 [05:36<14:12,  1.19s/it]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 26%|██▌       | 253/964 [05:45<13:41,  1.16s/it]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 29%|██▉       | 278/964 [06:18<14:33,  1.27s/it]Wh

Evaluating Dataset({
    features: ['audio', 'transcription'],
    num_rows: 219
})


Device set to use cuda:0
  1%|          | 2/219 [00:00<01:21,  2.66it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
  3%|▎         | 7/219 [00:04<02:09,  1.64it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
  4%|▎         | 8/219 [00:04<01:59,  1.77it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
  5%|▌         | 11/219 [00:06<02:06,  1.64it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
  9%|▉         | 20/219 [00:10<01:35,  2.09it/s]Whisper di

Evaluating Dataset({
    features: ['audio', 'transcription', 'raw', 'id', 'num_samples', 'path', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id'],
    num_rows: 2705
})


Device set to use cuda:0
  0%|          | 13/2705 [00:08<47:34,  1.06s/it]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
  1%|          | 15/2705 [00:12<54:38,  1.22s/it]  Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
  1%|          | 31/2705 [00:39<1:13:09,  1.64s/it]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
  1%|▏         | 39/2705 [00:49<1:00:27,  1.36s/it]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
  2%|▏         | 54/2705 [01:10<1:05:58,  1.4

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def _wer_percent(result):
    """Return the numeric WER (%) from one evaluation result.

    ``run_eval`` reports each score as the string ``"<dataset>: <wer>% WER"``;
    plain numbers are passed through, so either form can be plotted.
    """
    if isinstance(result, str):
        # The score sits between the last ": " and the "%" sign.
        return float(result.rsplit(": ", 1)[-1].split("%", 1)[0])
    return float(result)


fig, ax = plt.subplots()

# One bar per subset; the order of these labels must match the order of `results`.
subsets = ["bik", "ceb", "hil", "ilo", "mrw", "pag", "tgl", "war", "pam", "bisaya"]
y_pos = np.arange(len(subsets))

small_model = [_wer_percent(result) for result in results]
if len(small_model) != len(subsets):
    raise ValueError(
        f"Expected {len(subsets)} WER scores (one per subset), got {len(small_model)}"
    )

# medium_model = [16.030283080974325, 21.57816005983545, 15.73165947430365, 17.93032786885246]

ax.barh(y_pos, small_model, height=0.4, color='steelblue', label='small')

# Print each score just to the right of its bar.
for i, v in enumerate(small_model):
    ax.text(v + 0.3, i, f"{v:.2f}", va='center', fontsize=10)

ax.set_yticks(y_pos)
ax.set_yticklabels(subsets, fontsize=10)
ax.invert_yaxis()  # first subset at the top

ax.set_xlabel('Word Error Rate (%)', fontsize=12)
ax.set_title('WER by Dialect – Small Model', fontsize=14, fontweight='bold')
ax.legend(loc='upper right', fontsize=10)

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

plt.tight_layout()
plt.savefig("small_model_wer.png", dpi=300)
plt.show()
