In [8]:
import os
import re
import unicodedata

import librosa
import pandas as pd
import torch
from datasets import load_dataset, load_metric, load_from_disk, Audio
from evaluate import load
from transformers import WhisperProcessor, WhisperForConditionalGeneration

In [9]:
# Checkpoints to evaluate. "openai/whisper-small" is loaded by its hub
# identifier; every other name is looked up under `model_folder` by
# get_metrics().
models = [
    "openai/whisper-small",
    "afr_FLEURS_model",
    "afr_de_FLEURS_model",
    "afr_nl_FLEURS_model",
    "afr_NCHLT_FLEURS_model",
    "afr_de_FLEURS_model_5h",
    "afr_south_hollandic_model",
    "afr_flemish_model",
    "afr_south_hollandic_v2_model",
    "afr_flemish_v2_model",
]

# Where evaluation CSVs are written and where local checkpoints are read from.
dest_folder = 'evaluation_output_nchlt'
model_folder = 'models'

# Test split only, with columns renamed to the names get_metrics() reads
# ('transcription', 'audio') and the audio column cast to Audio with
# sampling_rate=16000.
dataset = (
    load_from_disk('dataset_afr_only')['test']
    .rename_column('text', 'transcription')
    .rename_column('file', 'audio')
    .cast_column("audio", Audio(sampling_rate=16000))
)



def get_metrics(model_name):
    """Evaluate one Whisper checkpoint on the module-level ``dataset``.

    Every example is transcribed, references and predictions are normalised
    with the tokenizer's normaliser, and corpus-level WER and CER are
    computed. The per-utterance reference/prediction pairs are also written to
    ``{dest_folder}/{model_name with '/' replaced by '_'}.csv`` for
    qualitative inspection.

    Parameters
    ----------
    model_name : str
        ``"openai/whisper-small"`` is loaded by that identifier; any other
        value is loaded from ``{model_folder}/{model_name}``.

    Returns
    -------
    tuple
        ``(wer, cer)``, each the metric value multiplied by 100. WER can
        exceed 100 when the hypothesis contains many insertions.
    """
    # Create the output directory up front: DataFrame.to_csv does not create
    # missing parent directories, and failing here is far cheaper than failing
    # after the whole inference pass.
    os.makedirs(dest_folder, exist_ok=True)

    # Use the GPU when present, otherwise fall back to CPU instead of crashing.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # The stock processor (feature extractor + tokenizer) is used for every
    # checkpoint, including the locally stored ones.
    processor = WhisperProcessor.from_pretrained("openai/whisper-small")

    if model_name == "openai/whisper-small":
        model_path = model_name
    else:
        model_path = model_folder + "/" + model_name

    model = WhisperForConditionalGeneration.from_pretrained(model_path).to(device)

    def map_to_pred(batch):
        """Add normalised ``reference`` and ``prediction`` fields to one example."""
        audio = batch["audio"]

        input_features = processor(
            audio["array"],
            sampling_rate=audio["sampling_rate"],
            return_tensors="pt",
        ).input_features

        # NOTE(review): `_normalize` is, as far as I know, the tokenizer's
        # English-oriented normaliser -- confirm it is appropriate for this
        # data. It is kept as-is so the scores stay comparable with earlier runs.
        batch["reference"] = processor.tokenizer._normalize(batch['transcription'])

        with torch.no_grad():
            # NOTE(review): no language/task is passed to generate(), so
            # decoding relies on each checkpoint's own generation settings --
            # confirm that is intended, especially for the base model.
            predicted_ids = model.generate(input_features.to(device))[0]

        # Drop special tokens explicitly instead of relying on the normaliser
        # to strip them as a side effect.
        transcription = processor.decode(predicted_ids, skip_special_tokens=True)

        batch["prediction"] = processor.tokenizer._normalize(transcription)

        return batch

    result = dataset.map(map_to_pred)

    # Materialise each column once; they are reused for both metrics and the CSV.
    references = result["reference"]
    predictions = result["prediction"]

    wer = load("wer")
    cer = load("cer")

    wer_res = 100 * wer.compute(references=references, predictions=predictions)
    cer_res = 100 * cer.compute(references=references, predictions=predictions)

    # Save the references and predictions to file for qualitative evaluation
    df = pd.DataFrame({'reference': references, 'prediction': predictions})
    df.to_csv(f"{dest_folder}/{model_name.replace('/','_')}.csv", sep=',', index=False)

    return wer_res, cer_res

In [12]:
# Run the evaluation once per checkpoint; each call yields a (wer, cer) pair.
scores = [get_metrics(name) for name in models]
wers = [pair[0] for pair in scores]
cers = [pair[1] for pair in scores]

# One row per model, persisted next to the per-model prediction CSVs.
df = pd.DataFrame({'model': models, 'wer': wers, 'cer': cers})
df.to_csv(f'{dest_folder}/model_metrics.csv', sep=',', index=False)
df

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3002 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3002 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3002 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3002 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3002 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3002 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3002 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3002 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3002 [00:00<?, ? examples/s]

Unnamed: 0,model,wer,cer
0,openai/whisper-small,171.304348,66.939708
1,afr_FLEURS_model,104.8,29.064504
2,afr_de_FLEURS_model,71.084058,15.972213
3,afr_nl_FLEURS_model,91.443478,22.35973
4,afr_NCHLT_FLEURS_model,17.37971,2.817017
5,afr_de_FLEURS_model_5h,83.304348,19.760143
6,afr_south_hollandic_model,103.257971,26.629015
7,afr_flemish_model,106.62029,32.42578
8,afr_south_hollandic_v2_model,79.930435,18.324285
9,afr_flemish_v2_model,89.310145,23.961054


In [13]:
# Rank the models from lowest to highest value in the 'wer' column.
df.sort_values('wer', ascending=True)

Unnamed: 0,model,wer,cer
4,afr_NCHLT_FLEURS_model,17.37971,2.817017
2,afr_de_FLEURS_model,71.084058,15.972213
8,afr_south_hollandic_v2_model,79.930435,18.324285
5,afr_de_FLEURS_model_5h,83.304348,19.760143
9,afr_flemish_v2_model,89.310145,23.961054
3,afr_nl_FLEURS_model,91.443478,22.35973
6,afr_south_hollandic_model,103.257971,26.629015
1,afr_FLEURS_model,104.8,29.064504
7,afr_flemish_model,106.62029,32.42578
0,openai/whisper-small,171.304348,66.939708


In [14]:
# Rank the models from lowest to highest value in the 'cer' column.
df.sort_values('cer', ascending=True)

Unnamed: 0,model,wer,cer
4,afr_NCHLT_FLEURS_model,17.37971,2.817017
2,afr_de_FLEURS_model,71.084058,15.972213
8,afr_south_hollandic_v2_model,79.930435,18.324285
5,afr_de_FLEURS_model_5h,83.304348,19.760143
3,afr_nl_FLEURS_model,91.443478,22.35973
9,afr_flemish_v2_model,89.310145,23.961054
6,afr_south_hollandic_model,103.257971,26.629015
1,afr_FLEURS_model,104.8,29.064504
7,afr_flemish_model,106.62029,32.42578
0,openai/whisper-small,171.304348,66.939708
