In [2]:
import os
import re
import unicodedata

import librosa
import pandas as pd
import torch
from datasets import load_dataset, load_metric, load_from_disk, Audio, concatenate_datasets
from evaluate import load
from transformers import WhisperProcessor, WhisperForConditionalGeneration




In [7]:
# Exploratory load of the Afrikaans ("af") configuration of Common Voice 17.0, only to
# inspect which splits exist and how large they are.
# trust_remote_code=True mirrors the evaluation cell further down: this dataset ships a
# loading script, and recent `datasets` releases want an explicit opt-in before running it.
x = load_dataset("mozilla-foundation/common_voice_17_0", "af", trust_remote_code=True)
x

Downloading builder script:   0%|          | 0.00/8.19k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.7k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.92k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/132k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/17.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.98M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.29M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.54M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/502k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.23M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.80M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/27.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/17.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/19.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/63.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/64.0k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]


Reading metadata...: 82it [00:00, 81986.40it/s]


Generating validation split: 0 examples [00:00, ? examples/s]


Reading metadata...: 54it [00:00, ?it/s][A


Generating test split: 0 examples [00:00, ? examples/s]


Reading metadata...: 62it [00:00, 61827.59it/s]


Generating other split: 0 examples [00:00, ? examples/s]


Reading metadata...: 15it [00:00, 10717.98it/s]


Generating invalidated split: 0 examples [00:00, ? examples/s]


Reading metadata...: 192it [00:00, 28588.39it/s]


Generating validated split: 0 examples [00:00, ? examples/s]


Reading metadata...: 198it [00:00, 97199.46it/s]


DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 82
    })
    validation: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 54
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 62
    })
    other: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 15
    })
    invalidated: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 192
  

In [10]:
# Look at one decoded clip. Index the row first and the column second:
# x['test']['audio'][0] would read (and decode) the entire audio column of the split
# just to return its first element.
x['test'][0]['audio']

{'path': 'C:\\Users\\sebas\\.cache\\huggingface\\datasets\\downloads\\extracted\\e94e27a43875e19030aae795fb344b16cb46a327d2eb24ae22e24e729c35b29d\\af_test_0/common_voice_af_39016673.mp3',
 'array': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -8.97276823e-06, -1.62792894e-05, -7.94395510e-06]),
 'sampling_rate': 48000}

In [4]:
# Checkpoints to evaluate. "openai/whisper-small" is loaded by its Hub id; every other
# entry is resolved by get_metrics as a sub-directory of `model_folder`.
models = [
    "openai/whisper-small",
    "afr_FLEURS_model",
    "afr_de_FLEURS_model",
    "afr_nl_FLEURS_model",
    "afr_NCHLT_FLEURS_model",
    "afr_de_FLEURS_model_5h",
    "afr_south_hollandic_model",
    "afr_flemish_model",
    "afr_south_hollandic_v2_model",
    "afr_flemish_v2_model",
]

# Where the per-model prediction CSVs and the summary table are written / where the
# fine-tuned checkpoints are read from.
dest_folder = 'evaluation_output_common_voice'
model_folder = 'models'

# DataFrame.to_csv does not create missing directories, so make sure the output folder
# exists before any evaluation result is written into it.
os.makedirs(dest_folder, exist_ok=True)

# Afrikaans Common Voice 17.0. The train, validation and test splits are merged into a
# single evaluation set.
common_voice_af = load_dataset("mozilla-foundation/common_voice_17_0", "af", trust_remote_code=True)
dataset = concatenate_datasets(
    [common_voice_af['train'], common_voice_af['validation'], common_voice_af['test']]
)

# get_metrics reads the reference text from a column called 'transcription'.
dataset = dataset.rename_column('sentence', 'transcription')

# Decode the audio at 16 kHz; that sampling rate is then forwarded to the Whisper
# processor together with each waveform.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))



def get_metrics(model_name, device=None):
    """Evaluate one Whisper checkpoint on the module-level ``dataset``.

    Every example is transcribed with the checkpoint, references and predictions are
    normalised with the tokenizer's normaliser, and word/character error rates are
    computed over the whole set. The normalised reference/prediction pairs are also
    written to ``{dest_folder}/<model_name with '/' replaced by '_'>.csv`` for
    qualitative inspection.

    Relies on the notebook globals ``dataset`` (with 'audio' and 'transcription'
    columns), ``dest_folder`` and ``model_folder``.

    Args:
        model_name: "openai/whisper-small" to load the stock model by its Hub id;
            any other value is loaded from ``model_folder + "/" + model_name``.
        device: torch device string to run on. Defaults to "cuda" when a GPU is
            available and "cpu" otherwise.

    Returns:
        A ``(wer, cer)`` tuple, both expressed as percentages.
    """
    if device is None:
        # A hard-coded "cuda" crashes on machines without a GPU; fall back to CPU there.
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # The processor always comes from the base model, also for fine-tuned checkpoints.
    processor = WhisperProcessor.from_pretrained("openai/whisper-small")

    if model_name == "openai/whisper-small":
        model_path = model_name
    else:
        model_path = model_folder + "/" + model_name

    model = WhisperForConditionalGeneration.from_pretrained(model_path).to(device)

    # Prefer the public `normalize`; `_normalize` is a private method that newer
    # transformers releases deprecate. Older releases only have the private name.
    # NOTE(review): transformers describes this as the Whisper *English* normaliser.
    # Confirm that is intended for Afrikaans text (`basic_normalize` is the
    # language-agnostic alternative); switching would change the reported scores.
    tokenizer = processor.tokenizer
    normalize = getattr(tokenizer, "normalize", None) or tokenizer._normalize

    def map_to_pred(batch):
        """Transcribe a single example and attach normalised reference/prediction."""
        audio = batch["audio"]

        input_features = processor(
            audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt"
        ).input_features

        batch["reference"] = normalize(batch['transcription'])

        # NOTE(review): no language/task is passed to generate(), so decoding follows
        # whatever each checkpoint's generation settings select - confirm this is the
        # intended comparison setup.
        with torch.no_grad():
            predicted_ids = model.generate(input_features.to(device))[0]

        # Special tokens are not skipped while decoding; the text goes through the
        # same normaliser as the reference before scoring.
        transcription = processor.decode(predicted_ids)

        batch["prediction"] = normalize(transcription)

        return batch

    result = dataset.map(map_to_pred)

    # Read each column once instead of once per consumer.
    references = result["reference"]
    predictions = result["prediction"]

    wer = load("wer")
    cer = load("cer")

    wer_res = 100 * wer.compute(references=references, predictions=predictions)
    cer_res = 100 * cer.compute(references=references, predictions=predictions)

    # Save the references and predictions to file for qualitative evaluation
    df = pd.DataFrame({'reference': references, 'prediction': predictions})
    df.to_csv(f"{dest_folder}/{model_name.replace('/','_')}.csv",sep=',', index=False)

    return wer_res, cer_res

In [5]:
# Score every checkpoint in turn, keeping one WER and one CER value per model
# (in the same order as `models`).
wers, cers = [], []
for model in models:
    w, c = get_metrics(model)
    wers += [w]
    cers += [c]

# Collect the scores into a single summary table, store it alongside the per-model
# prediction files, and display it.
summary_columns = {'model': models, 'wer': wers, 'cer': cers}
df = pd.DataFrame(summary_columns)
df.to_csv(f'{dest_folder}/model_metrics.csv', sep=',', index=False)
df

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/198 [00:00<?, ? examples/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/198 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/198 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/198 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/198 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/198 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/198 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/198 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/198 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/198 [00:00<?, ? examples/s]

Unnamed: 0,model,wer,cer
0,openai/whisper-small,110.988484,64.301059
1,afr_FLEURS_model,92.274472,50.470405
2,afr_de_FLEURS_model,35.028791,9.682199
3,afr_nl_FLEURS_model,57.677543,20.045929
4,afr_NCHLT_FLEURS_model,30.182342,8.73398
5,afr_de_FLEURS_model_5h,48.464491,15.193718
6,afr_south_hollandic_model,71.497121,30.04667
7,afr_flemish_model,81.81382,35.706349
8,afr_south_hollandic_v2_model,54.318618,18.430995
9,afr_flemish_v2_model,60.076775,24.749981


In [6]:
# Rank the models by word error rate, lowest (best) first.
df.sort_values(by='wer', ascending=True)

Unnamed: 0,model,wer,cer
4,afr_NCHLT_FLEURS_model,30.182342,8.73398
2,afr_de_FLEURS_model,35.028791,9.682199
5,afr_de_FLEURS_model_5h,48.464491,15.193718
8,afr_south_hollandic_v2_model,54.318618,18.430995
3,afr_nl_FLEURS_model,57.677543,20.045929
9,afr_flemish_v2_model,60.076775,24.749981
6,afr_south_hollandic_model,71.497121,30.04667
7,afr_flemish_model,81.81382,35.706349
1,afr_FLEURS_model,92.274472,50.470405
0,openai/whisper-small,110.988484,64.301059


In [7]:
# Rank the models by character error rate, lowest (best) first.
df.sort_values(by='cer', ascending=True)

Unnamed: 0,model,wer,cer
4,afr_NCHLT_FLEURS_model,30.182342,8.73398
2,afr_de_FLEURS_model,35.028791,9.682199
5,afr_de_FLEURS_model_5h,48.464491,15.193718
8,afr_south_hollandic_v2_model,54.318618,18.430995
3,afr_nl_FLEURS_model,57.677543,20.045929
9,afr_flemish_v2_model,60.076775,24.749981
6,afr_south_hollandic_model,71.497121,30.04667
7,afr_flemish_model,81.81382,35.706349
1,afr_FLEURS_model,92.274472,50.470405
0,openai/whisper-small,110.988484,64.301059
