# Fine-tune Whisper for Swahili

* using small Common Voice Data: https://huggingface.co/datasets/cdli/common_voice_swahili_small

## Settings 

--> adapt for your scenario

In [1]:
from getpass import getpass

from huggingface_hub import login

# Read the Hugging Face access token without echoing it: a plain input() would
# leave the secret visible in the saved cell output of the notebook.
HF_TOKEN = getpass("Hugging Face token: ")
login(token=HF_TOKEN)

### Directories

In [2]:
import os

# storage in Volume that will persist
LOCAL_STORAGE_DIR = '/jupyter_kernel'

# parent directory shared by all training runs; created in pure Python so the
# path is never passed through a shell
BASE_DIR = os.path.join(LOCAL_STORAGE_DIR, 'trained_models')
os.makedirs(BASE_DIR, exist_ok=True)

# directory for model training
OUTPUT_DIR = os.path.join(BASE_DIR, 'sw_cv_tune_whisper_small_1')
# OUTPUT_DIR = os.path.join(BASE_DIR, 'sw_train_with_cv_whisper_small_1')
# OUTPUT_DIR = os.path.join(BASE_DIR, 'sw_train_with_cv_whisper_largev3_1')

print(f"Will write model to: {OUTPUT_DIR}")
# refuse to reuse an existing run directory so earlier results are not clobbered
if os.path.exists(OUTPUT_DIR):
    raise ValueError("Output directory already exists - if you continue this will overwrite data and may lead to strange results...")


Will write model to: /jupyter_kernel/trained_models/sw_cv_tune_whisper_small_1


### Model settings

In [3]:
# Whisper checkpoint to fine-tune; keep exactly one of the options below active.
# WHISPER_MODEL_TYPE = "openai/whisper-tiny" 
WHISPER_MODEL_TYPE = "openai/whisper-small" 
# WHISPER_MODEL_TYPE = "openai/whisper-large-v3" 

# Language and task; both are handed to the processor and later written into
# the model's generation config.
LANGUAGE = 'sw'
TASK = "transcribe"

# which parts of the model to update
# (each flag is passed to requires_grad_() on the matching sub-module further down)
UPDATE_ENCODER = True
UPDATE_DECODER = True
UPDATE_PROJ = True

#################
## Base Model
#################

# identifier the pretrained weights are loaded from (here: same as the model type)
BASE_MODEL_NAME = WHISPER_MODEL_TYPE
print('Base model will be loaded from:', BASE_MODEL_NAME)

Base model will be loaded from: openai/whisper-small


### Trainer Settings

--> adjust as needed or keep defaults

In [4]:

# logging interval, forwarded as logging_steps
LOGGING_STEPS = 5
# if save steps is 0, only last and best model will be written
# NOTE(review): load_best_model_at_end=True is used below; per the transformers
# docs the save interval must then be a round multiple of EVAL_STEPS -- confirm
# when changing either value.
SAVE_STEPS = 50

# training duration
# NOTE(review): both values are forwarded to Seq2SeqTrainingArguments. Per the
# transformers docs a positive max_steps overrides num_train_epochs, so
# MAX_EPOCHS presumably has no effect while MAX_STEPS > 0 -- confirm.
MAX_EPOCHS = 5
MAX_STEPS = 1000  # for larger datasets, you will want to increase this

# Learning Rate and LR Scheduler (LR_END and LR_DECAY_POWER only apply to polynomial)
LEARNING_RATE = 1e-4 #@param
LR_SCHEDULER_TYPE = 'polynomial' # constant_with_warmup or polynomial
LR_WARMUP_STEPS = 100
LR_END = 1e-8
LR_DECAY_POWER = 4
# see: https://huggingface.co/docs/transformers/v4.46.2/en/main_classes/optimizer_schedules#transformers.SchedulerType
# and here: https://www.kaggle.com/code/snnclsr/learning-rate-schedulers
# constant --> 'constant_with_warmup'
# polynomial --> 'get_polynomial_decay_schedule_with_warmup'

# per-device batch sizes for training and evaluation
BATCH_SIZE = 32
EVAL_BATCH_SIZE = 16

#@markdown other settings relevant for evaluation
MAX_GEN_LEN = 128 # increase if your data has long sequences!
EVAL_ON_START = True
EVAL_STEPS = 50

# for CPU, set both to false
USE_FP16 = True
USE_BF16 = False # only some GPUs support this, eg A100, A40

# checkpoints get huge for large models (~18 GB!)
# (forwarded as save_total_limit)
NUM_CHECKPOINTS_TO_STORE = 2

## Imports and Prep

In [5]:
import datasets
from huggingface_hub import hf_hub_download
import numpy as np
import pandas as pd
import os
import torch

# more efficient dataset handling
datasets.disable_caching()
print('cache:', datasets.is_caching_enabled())

# keep torch single-threaded; a later cell notes this is needed so that the
# dataset mapping does not hang
torch.set_num_threads(1)
# last expression of the cell: displays the thread count now in effect
torch.get_num_threads()


cache: False


1

In [6]:
# report whether a CUDA device is visible to torch
gpu_status = (
    "GPU is available"
    if torch.cuda.is_available()
    else "GPU is not available, using CPU instead"
)
print(gpu_status)

GPU is available


In [7]:
# shell out to nvidia-smi to show the GPU model, driver version and current memory usage
!nvidia-smi

Wed Sep 10 20:54:45 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 575.57.08              Driver Version: 575.57.08      CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L4                      On  |   00000000:80:00.0 Off |                    0 |
| N/A   76C    P0             36W /   72W |   22562MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [8]:
from huggingface_hub import hf_hub_download

import random
import torchaudio
import librosa


import tarfile
import datasets
import matplotlib.pyplot as plt
import pandas as pd

import torch
import time


from dataclasses import dataclass
from typing import Any, Dict, List, Union

from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer

from transformers import WhisperProcessor
from transformers import WhisperForConditionalGeneration
import os
import csv
import shutil
import numpy as np


import evaluate
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

# word- and character-error-rate metrics from the `evaluate` library;
# used by get_wer() and compute_metrics() below
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

# normalizer applied to both references and predictions before scoring
transcript_normalizer = BasicTextNormalizer()

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [9]:
def count_trainable_parameters(model):
    """Count the scalar parameters of ``model`` that will receive gradient updates.

    Args:
        model: a ``torch.nn.Module`` (anything exposing ``parameters()``).

    Returns:
        int: total number of elements over all parameters whose
        ``requires_grad`` flag is set; 0 if there are none.
    """
    # numel() gives the element count directly as a Python int, so no numpy
    # product over the shape and no intermediate list is needed
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [10]:
def get_wer(references, predictions, normalize=True, verbose=True):
    """Compute the corpus-level word error rate of predictions against references.

    Args:
        references: iterable of reference transcripts.
        predictions: iterable of predicted transcripts, aligned with ``references``.
        normalize: when True, both sides go through ``transcript_normalizer`` first.
        verbose: when True, print each reference/prediction pair followed by a blank line.

    Returns:
        Whatever ``wer_metric.compute`` returns for the (optionally normalized) texts.
    """
    if normalize:
        hyp_texts = list(map(transcript_normalizer, predictions))
        ref_texts = list(map(transcript_normalizer, references))
    else:
        ref_texts, hyp_texts = references, predictions

    if verbose:
        for pair in zip(ref_texts, hyp_texts):
            print(pair[0])
            print(pair[1])
            print()

    return wer_metric.compute(references=ref_texts, predictions=hyp_texts)


def compute_metrics(pred):
    """Compute per-utterance averaged WER and CER for the trainer's evaluation loop.

    Relies on the module-level ``processor``, ``transcript_normalizer``,
    ``wer_metric`` and ``cer_metric``.

    Args:
        pred: object exposing ``predictions`` (token ids) and ``label_ids``
            (token ids where -100 marks ignored positions).

    Returns:
        dict with keys ``"wer"`` and ``"cer"``: the mean over examples of the
        per-example error rate, each capped at 1.0 before averaging.
    """
    # for training metrics
    pred_ids = pred.predictions

    # replace -100 with the pad_token_id -- on a copy, so the caller's
    # label array is not modified as a side effect
    pad_id = processor.tokenizer.pad_token_id
    label_ids = np.where(pred.label_ids == -100, pad_id, pred.label_ids)

    # we do not want to group tokens when computing the metrics
    pred_strs = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_strs = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # calculate a per-example average
    wers = []
    cers = []
    for pred_str, label_str in zip(pred_strs, label_strs):
        hyp_text = transcript_normalizer(pred_str)
        ref_text = transcript_normalizer(label_str)
        wers.append(wer_metric.compute(predictions=[hyp_text], references=[ref_text]))
        cers.append(cer_metric.compute(predictions=[hyp_text], references=[ref_text]))

    # cap each example at 1.0 so a single runaway hypothesis cannot dominate the mean
    wer = np.mean([min(1.0, x) for x in wers])
    cer = np.mean([min(1.0, x) for x in cers])
    print('adjusted:', wer, cer)
    print('un-adjusted:', np.mean(wers), np.mean(cers))
    return {"wer": wer, "cer": cer}



In [11]:
def load_dataset(dataset_name, limit_to_30_seconds=True, split='test'):
    """
    Load one split of a dataset from Hugging Face Hub (non-streaming).

    Args:
        dataset_name: Hub identifier of the dataset.
        limit_to_30_seconds: if True, keep only examples whose 'audio_length'
            column is <= 30 and print how many examples remain.
        split: name of the split to load; defaults to 'test', which is the
            split this helper previously always loaded.

    Returns:
        The loaded (and optionally filtered) dataset.
    """
    ds = datasets.load_dataset(dataset_name, split=split, streaming=False)
    if limit_to_30_seconds:
        orig_len = len(ds)
        ds = ds.filter(lambda example: example['audio_length'] <= 30)
        print(f"Filtered dataset from {orig_len} to {len(ds)} examples with audio length <= 30 seconds")
    return ds

In [12]:
# The following warning can be ignored:
# "The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results."
# See: https://discuss.huggingface.co/t/finetuning-whisper-attention-mask-not-set-and-canot-be-inferred/97456
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate examples carrying 'input_features' and 'labels' into one padded batch.

    Audio features and label token ids are padded separately: the former via
    ``processor.feature_extractor.pad``, the latter via ``processor.tokenizer.pad``.
    Padded label positions are set to -100.
    """

    # object exposing `.feature_extractor.pad(...)` and `.tokenizer.pad(...)`
    processor: Any
    # token id that is stripped from the front of the labels when every row starts with it
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # audio side: wrap each example's features and let the feature extractor build tensors
        audio_items = []
        for item in features:
            audio_items.append({"input_features": item["input_features"]})
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        # text side: pad the token id sequences to the longest one in the batch
        token_items = []
        for item in features:
            token_items.append({"input_ids": item["labels"]})
        padded_tokens = self.processor.tokenizer.pad(token_items, return_tensors="pt")

        # positions the tokenizer marked as padding become -100
        is_padding = padded_tokens.attention_mask.ne(1)
        labels = padded_tokens["input_ids"].masked_fill(is_padding, -100)

        # drop the first column only when every row begins with the decoder start token
        all_rows_start_with_it = (labels[:, 0] == self.decoder_start_token_id).all().cpu().item()
        if all_rows_start_with_it:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

## Download datasets and prepare features

### Optimizing some settings for dataset access

In [13]:
# (repeats the caching and thread settings already applied in the imports section)
datasets.disable_caching()
print('cache:', datasets.is_caching_enabled())

# use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('device is: ', device)

# IMPORTANT! need to set to 1 to avoid the mapping to hang!
torch.set_num_threads(1)
torch.get_num_threads()  # value is discarded: not the last expression of this cell

# number of worker processes for dataset.map(): at most 32.
# os.cpu_count() may return None when the count cannot be determined, which
# would make min() raise a TypeError -- fall back to a single worker then.
num_proc = min(32, os.cpu_count() or 1)
print('# processors:', num_proc)



cache: False
device is:  cuda
# processors: 17


### Load feature extractor

--> for the model type you specified above

In [14]:

# Load processor
# It is configured with the language and task chosen in the settings; later
# cells use processor.feature_extractor and processor.tokenizer.
print('Using Language: ', LANGUAGE)
print('Using model:', WHISPER_MODEL_TYPE)
processor = WhisperProcessor.from_pretrained(WHISPER_MODEL_TYPE, language=LANGUAGE, task=TASK)

# This tokenizer isn't a FastTokenizer (see: processor.tokenizer.is_fast),
# so there is no point in running the mapping in batched mode.
def prepare_features(example):
    """Add model inputs and labels to a single dataset example.

    Reads ``example["audio"]`` (its "array" and "sampling_rate") and
    ``example["transcription"]``; uses the module-level ``processor``.

    Writes three keys into ``example`` and returns the same object:
      * "input_features": first entry of the feature extractor's ``input_features``
      * "labels": token ids of the transcription
      * "token_length": number of label tokens
    """
    audio = example["audio"]
    extracted = processor.feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"])
    example["input_features"] = extracted.input_features[0]

    token_ids = processor.tokenizer(example["transcription"]).input_ids
    example["labels"] = token_ids
    # also count number of tokens
    example["token_length"] = len(token_ids)
    return example

Using Language:  sw
Using model: openai/whisper-small


### Load and prepare Swahili Common Voice dataset

In [15]:
# download all splits of the small Swahili Common Voice set (non-streaming)
cv_ds = datasets.load_dataset("cdli/common_voice_swahili_small", streaming=False)
# last expression of the cell: displays the splits with their columns and row counts
cv_ds

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00002.parquet:   0%|          | 0.00/358M [00:00<?, ?B/s]

data/train-00001-of-00002.parquet:   0%|          | 0.00/357M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/66.7M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/58.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4548 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/397 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/335 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['speaker_id', 'audio_id', 'transcription', 'sentence_domain', 'up_votes', 'down_votes', 'age', 'gender', 'accents', 'variant', 'locale', 'segment', 'audio', 'audio_length'],
        num_rows: 4548
    })
    test: Dataset({
        features: ['speaker_id', 'audio_id', 'transcription', 'sentence_domain', 'up_votes', 'down_votes', 'age', 'gender', 'accents', 'variant', 'locale', 'segment', 'audio', 'audio_length'],
        num_rows: 397
    })
    validation: Dataset({
        features: ['speaker_id', 'audio_id', 'transcription', 'sentence_domain', 'up_votes', 'down_votes', 'age', 'gender', 'accents', 'variant', 'locale', 'segment', 'audio', 'audio_length'],
        num_rows: 335
    })
})

In [None]:
# shuffle the training split with a fixed seed and flatten the resulting index mapping
ds_train = cv_ds['train'].shuffle(seed=42).flatten_indices()
# compute input features / labels per example in num_proc worker processes;
# the 'audio' column is dropped afterwards
ds_train = ds_train.map(prepare_features, remove_columns=['audio'], writer_batch_size=1, num_proc=num_proc)

print(ds_train)

In [None]:
# same preprocessing as for the training split, applied to the validation split
# (ds_dev is the set the trainer evaluates on during training)
ds_dev = cv_ds['validation'].shuffle(seed=42).flatten_indices()
ds_dev = ds_dev.map(prepare_features, remove_columns=['audio'], writer_batch_size=1, num_proc=num_proc)

print(ds_dev)

In [None]:
# same preprocessing as for the other splits, applied to the held-out test split
ds_test = cv_ds['test'].shuffle(seed=42).flatten_indices()
ds_test = ds_test.map(prepare_features, remove_columns=['audio'], writer_batch_size=1, num_proc=num_proc)

# show the test split that was just built (not ds_dev)
print(ds_test)

## Prepare Trainer

In [None]:
# load the pretrained checkpoint and move it to the selected device
base_model = WhisperForConditionalGeneration.from_pretrained(BASE_MODEL_NAME)
_ = base_model.to(device)
print('Using Language: ', LANGUAGE)
print('Using model:', WHISPER_MODEL_TYPE)

# ensure task and language for training
base_model.generation_config.language = LANGUAGE
base_model.generation_config.task = TASK
# clear any forced decoder ids on both the generation config and the model config
base_model.generation_config.forced_decoder_ids = None
base_model.config.forced_decoder_ids = None
# to use gradient checkpointing
base_model.config.use_cache = False
print('language set to:', base_model.generation_config.language)

In [None]:
# which layers to tune
# (requires_grad_ switches gradient tracking for every parameter of the sub-module)
base_model.model.encoder.requires_grad_(UPDATE_ENCODER)
base_model.model.decoder.requires_grad_(UPDATE_DECODER)
base_model.proj_out.requires_grad_(UPDATE_PROJ)

print("Overview to number of model parameters to be updated:")
print('* encoder params to update/total:', count_trainable_parameters(base_model.model.encoder), base_model.model.encoder.num_parameters())
print('* decoder params to update/total:', count_trainable_parameters(base_model.model.decoder), base_model.model.decoder.num_parameters())

# both overall figures are taken over the same module (the full model including
# proj_out), so that the trainable count and the total are directly comparable
print('* overall # trainable parameters:', count_trainable_parameters(base_model))
print('*     overall # model parameters:', base_model.num_parameters())

In [None]:
# Training Hyper Parameters
# don't change settings here, but instead at very top!
training_args = Seq2SeqTrainingArguments(
    # checkpoints go to OUTPUT_DIR, TensorBoard logs to its 'logs' sub-directory
    output_dir=OUTPUT_DIR,
    logging_dir=os.path.join(OUTPUT_DIR, 'logs'),
    logging_steps=LOGGING_STEPS,
    report_to=["tensorboard"],
    include_num_input_tokens_seen=True,
    ### on GPU, can either do fp16 or bf16 depending on specific GPU
    fp16=USE_FP16, 
    bf16=USE_BF16, 
    push_to_hub=False,
    # keep the extra dataset columns; the data collator only reads
    # 'input_features' and 'labels' from each example
    remove_unused_columns=False,
    #
    # NOTE(review): per the transformers docs a positive max_steps overrides
    # num_train_epochs -- confirm which of the two limits is intended to apply
    num_train_epochs=MAX_EPOCHS,
    max_steps=MAX_STEPS,
    #
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    #
    per_device_train_batch_size=BATCH_SIZE,
    #
    # evaluation runs generation, every EVAL_STEPS steps
    eval_on_start=EVAL_ON_START,
    predict_with_generate=True,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    eval_steps=EVAL_STEPS,
    eval_strategy="steps",
    generation_max_length=MAX_GEN_LEN,
    #
    # "wer" is one of the keys returned by compute_metrics; lower is better
    metric_for_best_model="wer",
    greater_is_better=False,
    #
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    #
    # only applies to polynomial schedule (constant ignores args)
    lr_scheduler_kwargs={
        "lr_end": LR_END, # The final LR.  Crucial for polynomial decay.
        "power": LR_DECAY_POWER, # for decay
        # we don't need to set the other arguments as they are already set in the args outside
        #"num_warmup_steps": WARMUP_STEPS, # The number of steps for the warmup phase.
        #"num_training_steps": MAX_STEPS, # The total number of training steps.
        #"lr_init": 1e-5 # we take the LR setting
    },

    learning_rate=LEARNING_RATE,
    # NOTE(review): presumably handed to the scheduler as its warmup length in
    # addition to lr_scheduler_kwargs above -- confirm
    warmup_steps=LR_WARMUP_STEPS,
    #
    # checkpoint every SAVE_STEPS steps, keeping at most NUM_CHECKPOINTS_TO_STORE
    save_steps=SAVE_STEPS,
    save_strategy="steps",
    save_total_limit=NUM_CHECKPOINTS_TO_STORE,
    load_best_model_at_end=True,
    # group_by_length=True
    # auto_find_batch_size=True
)

print('trainer args set, writing to:', OUTPUT_DIR)

In [None]:
# collator that pads audio features and labels per batch; it is given the
# model's decoder start token id so it can strip that token from the labels
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=base_model.config.decoder_start_token_id,
)

# trains on ds_train and evaluates on ds_dev with compute_metrics (WER / CER)
trainer = Seq2SeqTrainer(
    args=training_args,
    model=base_model,
    train_dataset=ds_train,
    eval_dataset=ds_dev,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=processor
)


## Run the training

Note: TensorBoard doesn't render properly in Jupyter notebooks. Instead, use the `tensorboard_server.py` tool to host a TensorBoard instance on Modal, pointing it at the model training directory printed below:

In [None]:
# the run directory; the TensorBoard logs are written to its 'logs' sub-directory
print('model training dir:', OUTPUT_DIR)

In [None]:
# start a fresh fine-tuning run of the pretrained base model (no checkpoint is resumed)
trainer.train()

# # alternatively, you can continue training if a previous job was interrupted
# # (note: the directory check in the settings cell raises if OUTPUT_DIR already exists)
# trainer.train(resume_from_checkpoint = True)


## Post-Training Evaluation

### On Swahili CV dev-set

In [None]:
# run on dev-set 
# (should give the same result shown in training progress on dev set)
trainer.evaluate(ds_dev, language=LANGUAGE)

### On Swahili CV test-set

In [None]:
# run on test-set 
# (held-out split: used neither as training data nor as the trainer's eval set)
trainer.evaluate(ds_test, language=LANGUAGE)

### On Swahili non-standard speech test set

In [None]:
# load the non-standard-speech set via the helper defined above (keeping only
# clips of at most 30 seconds), then compute features exactly as for Common Voice
ds_test_nss = load_dataset("cdli/kenyan_swahili_nonstandard_speech_v0", limit_to_30_seconds=True)
ds_test_nss = ds_test_nss.map(prepare_features, remove_columns=['audio'], writer_batch_size=1, num_proc=num_proc)
ds_test_nss  # not the last expression of the cell, so nothing is displayed here
print(f"Loaded dataset with {len(ds_test_nss)} examples")

In [None]:
# evaluate on a 100-example subset only (take(100)), not on the full set
trainer.evaluate(ds_test_nss.take(100), language=LANGUAGE)

## Store Model

--> save best model

In [None]:
# with "load_best_model_at_end=True" set in the settings (this is the default, so don't change that), after training is completed the best model is loaded and then saved
best_model_dir = os.path.join(OUTPUT_DIR, 'best_model')
print(f"Saving to: {best_model_dir}")
trainer.model.save_pretrained(best_model_dir, safe_serialization=True)
trainer.tokenizer.save_pretrained(best_model_dir)