<a href="https://colab.research.google.com/github/ridwanbello/accent_recognition/blob/main/Evaluation_of_different_Accents.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Downloading dataset

In [3]:
!pip install datasets==2.12.0
!pip install fsspec==2023.9.2

Collecting datasets==2.12.0
  Downloading datasets-2.12.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.7,>=0.3.0 (from datasets==2.12.0)
  Downloading dill-0.3.6-py3-none-any.whl.metadata (9.8 kB)
Collecting responses<0.19 (from datasets==2.12.0)
  Downloading responses-0.18.0-py3-none-any.whl.metadata (29 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
Collecting multiprocess (from datasets==2.12.0)
  Downloading multiprocess-0.70.18-py312-none-any.whl.metadata (7.5 kB)
  Downloading multiprocess-0.70.17-py312-none-any.whl.metadata (7.2 kB)
  Downloading multiprocess-0.70.15-py311-none-any.whl.metadata (7.2 kB)
  Downloading multiprocess-0.70.14-py310-none-any.whl.metadata (6.6 kB)
Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading d

Collecting fsspec==2023.9.2
  Downloading fsspec-2023.9.2-py3-none-any.whl.metadata (6.7 kB)
Downloading fsspec-2023.9.2-py3-none-any.whl (173 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/173.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m173.4/173.4 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.0
    Uninstalling fsspec-2025.3.0:
      Successfully uninstalled fsspec-2025.3.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.0 requires fsspec==2025.3.0, but you have fsspec 2023.9.2 which is incompatible.[0m[31m
[0mSuccessfully installed fsspec-2023.9.2


In [4]:
import os
import itertools
import pandas as pd
import soundfile as sf
from datasets import load_dataset


base_path = "/content/afrispeech"

def download_accent_csv(accent="yoruba", batch_size = 1000):
  """Stream one AfriSpeech-200 accent subset to disk and index it in a CSV.

  The first ``batch_size`` examples of the ``train`` split are written as WAV
  files under ``<base_path>/<accent>/`` and listed, together with their
  transcripts, in ``<base_path>/<accent>_audio_text.csv``.

  Args:
    accent: Dataset configuration name passed to ``load_dataset`` as ``name``.
      It also names the output folder and the CSV file.
    batch_size: Maximum number of examples taken from the stream.

  Returns:
    None. The WAV files and the CSV are written as side effects and the CSV
    path is printed.
  """
  main_path = os.path.join(base_path, accent)
  os.makedirs(main_path, exist_ok=True)

  ds = load_dataset("tobiolatunji/afrispeech-200", name=accent, split="train", streaming=True, cache_dir=None)

  data_records = []

  for i, example in enumerate(itertools.islice(ds, 0, batch_size)):
    audio = example["audio"]
    # Keep the per-clip label in its own variable: rebinding `accent` here
    # would make the CSV name below depend on whichever example came last
    # instead of on the configuration that was requested.
    clip_accent = example["accent"]
    file_path = f"{main_path}/{clip_accent}_{i}.wav"
    sf.write(file_path, audio["array"], audio["sampling_rate"])

    data_records.append({
        "audio_path": file_path,
        "reference_text": example["transcript"]
    })

  # Explicit columns keep the header row even when the stream yielded nothing.
  df = pd.DataFrame(data_records, columns=["audio_path", "reference_text"])
  csv_path = os.path.join(base_path, f"{accent}_audio_text.csv")
  df.to_csv(csv_path, index=False)
  print(f"CSV saved to {csv_path}")

Reading metadata...: 14369it [00:00, 23206.01it/s]


CSV saved to /content/afrispeech/yoruba_audio_text.csv


In [None]:
# Accent configurations to fetch; each entry is handed to download_accent_csv,
# which forwards it to load_dataset as the dataset configuration `name`.
accents = ["yoruba", "hausa", "igbo", "swahili", ]
for accent in accents:
  print(f"Dowloading {accent} accent")
  # At most 1000 clips per accent are streamed and written to disk.
  download_accent_csv(accent, 1000)
  print(f"{accent} accent finished downloaded")


## Downloading Whisper model

In [None]:
! pip install git+https://github.com/openai/whisper.git

In [1]:
! pip install faster-whisper jiwer pandas soundfile tqdm

Collecting faster-whisper
  Downloading faster_whisper-1.2.0-py3-none-any.whl.metadata (16 kB)
Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting ctranslate2<5,>=4.0 (from faster-whisper)
  Downloading ctranslate2-4.6.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting onnxruntime<2,>=1.14 (from faster-whisper)
  Downloading onnxruntime-1.23.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting av>=11 (from faster-whisper)
  Downloading av-15.1.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Collecting coloredlogs (from onnxruntime<2,>=1.14->faster-whisper)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime<2,>=1.14->faster-whisper)
  Downl

In [9]:
import os, time
import pandas as pd
import soundfile as sf
from tqdm import tqdm
from faster_whisper import WhisperModel
import jiwer

In [11]:
import os
import itertools
import pandas as pd
import soundfile as sf
from tqdm import tqdm
from faster_whisper import WhisperModel
import jiwer
from math import ceil

# Input CSV with two columns: audio_path (WAV file location) and
# reference_text (ground-truth transcript).
CSV_PATH = "/content/afrispeech/yoruba_audio_text.csv"

# (model name, keyword arguments) pairs; the keyword arguments are forwarded
# unchanged to WhisperModel(...) by evaluate_model.
MODELS = [
    ("tiny", {"compute_type": "float16"}),
    ("base", {"compute_type": "float16"}),
    ("small", {"compute_type": "float16"}),
    ("medium", {"compute_type": "float16"}),
    ("large-v2", {"compute_type": "float16"}),
    ("large-v3", {"compute_type": "float16"})
]

# Module-level evaluation table; evaluate_model below reads this global directly.
df = pd.read_csv(CSV_PATH)

def audio_duration(path):
    """Return the length, in seconds, of the audio file at ``path``.

    Only the file header is queried through ``sf.info``; the duration is the
    frame count divided by the sample rate.
    """
    meta = sf.info(path)
    frame_count = meta.frames
    rate = float(meta.samplerate)
    return frame_count / rate

def evaluate_model(model_name, model_kwargs, language=None):
    """Transcribe every clip in the module-level ``df`` with one Whisper model.

    Args:
        model_name: Model identifier passed to ``WhisperModel``.
        model_kwargs: Extra keyword arguments forwarded to ``WhisperModel``.
        language: Value passed as ``language`` to ``model.transcribe``.
            NOTE(review): ``None`` presumably lets the model auto-detect the
            language -- confirm against the faster-whisper docs.

    Returns:
        A ``(per_utterance, agg)`` tuple. ``per_utterance`` is a DataFrame with
        one row per clip (model, audio_path, reference, hypothesis,
        duration_min, decode_min). ``agg`` is a dict with the corpus-level WER,
        CER, real-time factor (decode seconds / audio seconds) and the total
        audio and decode minutes.
    """
    model = WhisperModel(model_name, **model_kwargs)
    rows = []
    total_ref, total_hyp = [], []
    total_audio_sec, total_decode_sec = 0.0, 0.0

    for _, r in tqdm(df.iterrows(), total=len(df)):
        wav = r["audio_path"]
        ref = str(r["reference_text"])

        # perf_counter is a monotonic, high-resolution clock; time.time() can
        # jump when the system clock is adjusted, which would corrupt the RTF.
        t0 = time.perf_counter()
        segments, _info = model.transcribe(wav, language=language)
        # The join stays inside the timed window.
        # NOTE(review): faster-whisper presumably yields segments lazily, so
        # decoding happens while they are consumed here -- confirm.
        hyp_text = "".join(s.text for s in segments).strip()
        t1 = time.perf_counter()

        dur = audio_duration(wav)
        dec = t1 - t0

        # Per-utterance values are reported in minutes, rounded to 2 decimals.
        dur_min = round(dur / 60, 2)
        dec_min = round(dec / 60, 2)

        # Totals accumulate the unrounded seconds so rounding does not compound.
        total_audio_sec += dur
        total_decode_sec += dec

        rows.append({
            "model": model_name,
            "audio_path": wav,
            "reference": ref,
            "hypothesis": hyp_text,
            "duration_min": dur_min,
            "decode_min": dec_min
        })
        total_ref.append(ref)
        total_hyp.append(hyp_text)

    # Corpus-level metrics over all utterances at once.
    wer = jiwer.wer(total_ref, total_hyp)
    cer = jiwer.cer(total_ref, total_hyp)
    # Guard the denominator so an empty or zero-length corpus cannot divide by zero.
    rtf = round(total_decode_sec / max(total_audio_sec, 1e-6), 2)

    agg = {
        "model": model_name,
        "language": str(language),
        "WER": round(wer, 4),
        "CER": round(cer, 4),
        "RTF": rtf,
        "total_audio_minutes": round(total_audio_sec / 60, 2),
        "total_decode_minutes": round(total_decode_sec / 60, 2)
    }
    return pd.DataFrame(rows), agg

# Accumulators: one per-utterance DataFrame and one summary dict per
# (model, language) run.
all_rows = []
aggs = []

# Benchmark every model twice: with the language fixed to "en" and with
# language=None.
for name, kwargs in MODELS:
    for lang in ["en", None]:
        print(f"Now working on {name} model size and {lang} lang")
        per_utts, agg = evaluate_model(name, kwargs, language=lang)
        all_rows.append(per_utts)
        aggs.append(agg)

# Concatenate all per-utterance results and rank the summaries by WER
per_utterance_df = pd.concat(all_rows, ignore_index=True)
summary_df = pd.DataFrame(aggs).sort_values("WER")

# Save results to the current working directory
per_utterance_df.to_csv("whisper_eval_optimized_per_utterance.csv", index=False)
summary_df.to_csv("whisper_eval_optimized_summary.csv", index=False)

print(summary_df)


tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocabulary.txt: 0.00B [00:00, ?B/s]

model.bin:   0%|          | 0.00/75.5M [00:00<?, ?B/s]

100%|██████████| 100/100 [00:37<00:00,  2.66it/s]
100%|██████████| 100/100 [00:39<00:00,  2.52it/s]


vocabulary.txt: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.bin:   0%|          | 0.00/145M [00:00<?, ?B/s]

100%|██████████| 100/100 [00:41<00:00,  2.43it/s]
100%|██████████| 100/100 [00:43<00:00,  2.29it/s]
100%|██████████| 100/100 [00:40<00:00,  2.46it/s]
100%|██████████| 100/100 [00:41<00:00,  2.42it/s]
100%|██████████| 100/100 [00:48<00:00,  2.08it/s]
100%|██████████| 100/100 [01:09<00:00,  1.43it/s]


tokenizer.json: 0.00B [00:00, ?B/s]

vocabulary.txt: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.bin:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

100%|██████████| 100/100 [00:55<00:00,  1.79it/s]
100%|██████████| 100/100 [01:09<00:00,  1.45it/s]
100%|██████████| 100/100 [00:55<00:00,  1.82it/s]
100%|██████████| 100/100 [01:01<00:00,  1.63it/s]

       model language     WER     CER   RTF  total_audio_minutes  \
8   large-v2       en  0.3757  0.1631  0.07                13.22   
9   large-v2     None  0.3764  0.1634  0.09                13.22   
11  large-v3     None  0.3826  0.1650  0.08                13.22   
10  large-v3       en  0.3840  0.1667  0.07                13.22   
6     medium       en  0.3882  0.1710  0.06                13.22   
7     medium     None  0.4062  0.1837  0.09                13.22   
4      small       en  0.4569  0.2114  0.05                13.22   
5      small     None  0.4604  0.2120  0.05                13.22   
2       base       en  0.5569  0.2749  0.05                13.22   
3       base     None  0.5611  0.2771  0.05                13.22   
1       tiny     None  0.6188  0.3335  0.05                13.22   
0       tiny       en  0.6222  0.3339  0.05                13.22   

    total_decode_minutes  
8                   0.93  
9                   1.15  
11                  1.02  
10     




In [17]:
import os
import pandas as pd
import soundfile as sf
import time
from tqdm import tqdm
from faster_whisper import WhisperModel
import jiwer
from whisper.normalizers import EnglishTextNormalizer

# (model name, keyword arguments) pairs; evaluate_whisper_csv forwards the
# keyword arguments unchanged to WhisperModel(...).
MODELS = [
    ("tiny", {"compute_type": "float16"}),
    ("base", {"compute_type": "float16"}),
    ("small", {"compute_type": "float16"}),
    ("medium", {"compute_type": "float16"}),
    ("large-v2", {"compute_type": "float16"}),
    ("large-v3", {"compute_type": "float16"}),
]

# Shared text normalizer, applied to both references and hypotheses before
# WER/CER are computed.
normalizer = EnglishTextNormalizer()

def audio_duration(path):
    """Return how long the audio file at ``path`` lasts, in seconds.

    The value is the frame count reported by ``sf.info`` divided by the
    sample rate.
    """
    header = sf.info(path)
    samples_per_second = float(header.samplerate)
    return header.frames / samples_per_second

def evaluate_whisper_csv(csv_path, output_dir="results"):
    """Benchmark every model in ``MODELS`` on the clips listed in one CSV.

    Each model is run twice, once with ``language="en"`` and once with
    ``language=None`` (labelled ``"All"`` in the summary). References and
    hypotheses are passed through the module-level ``normalizer`` before WER
    and CER are computed.

    Args:
        csv_path: CSV with ``audio_path`` and ``reference_text`` columns.
        output_dir: Folder, created if missing, that receives
            ``<csv stem>_per_utterance.csv`` and ``<csv stem>_summary.csv``.

    Returns:
        ``(per_utterance_df, summary_df)``: all per-clip rows concatenated,
        and one summary row per (model, language) run sorted by WER.
    """
    os.makedirs(output_dir, exist_ok=True)
    df = pd.read_csv(csv_path)

    def evaluate_model(model, model_name, language=None):
        """Score one already-loaded model on ``df`` for one language setting."""
        rows = []
        total_ref, total_hyp = [], []
        total_audio_sec, total_decode_sec = 0.0, 0.0

        for _, r in tqdm(df.iterrows(), total=len(df)):
            wav = r["audio_path"]
            ref_norm = normalizer(str(r["reference_text"]))

            # Time only the transcription with a monotonic clock. The join
            # stays inside the window.
            # NOTE(review): faster-whisper presumably yields segments lazily,
            # so decoding happens while they are consumed -- confirm.
            t0 = time.perf_counter()
            segments, _info = model.transcribe(wav, language=language)
            hyp_text = "".join(s.text for s in segments).strip()
            t1 = time.perf_counter()

            # Normalisation is post-processing, not decoding, so it is kept
            # out of the timed window and does not inflate the RTF.
            hyp_norm = normalizer(hyp_text)

            dur = audio_duration(wav)
            dec = t1 - t0

            rows.append({
                "model": model_name,
                "audio_path": wav,
                "reference": ref_norm,
                "hypothesis": hyp_norm,
                "duration_min": round(dur / 60, 2),
                "decode_min": round(dec / 60, 2)
            })

            # Totals accumulate unrounded seconds.
            total_audio_sec += dur
            total_decode_sec += dec
            total_ref.append(ref_norm)
            total_hyp.append(hyp_norm)

        wer = jiwer.wer(total_ref, total_hyp)
        cer = jiwer.cer(total_ref, total_hyp)
        # Guard the denominator against an empty or zero-length corpus.
        rtf = round(total_decode_sec / max(total_audio_sec, 1e-6), 2)

        lang_label = language if language is not None else "All"

        agg = {
            "model": model_name,
            "language": lang_label,
            "WER": round(wer, 4),
            "CER": round(cer, 4),
            "RTF": rtf,
            "total_audio_minutes": round(total_audio_sec / 60, 2),
            "total_decode_minutes": round(total_decode_sec / 60, 2)
        }
        return pd.DataFrame(rows), agg

    all_rows = []
    aggs = []

    for name, kwargs in MODELS:
        # Load each checkpoint once and reuse it for both language settings;
        # the language is a per-call argument of transcribe, not of the model.
        model = WhisperModel(name, **kwargs)
        for lang in ["en", None]:
            print(f"Now working on {name} model size and {lang} lang")
            per_utts, agg = evaluate_model(model, name, language=lang)
            all_rows.append(per_utts)
            aggs.append(agg)

    per_utterance_df = pd.concat(all_rows, ignore_index=True)
    summary_df = pd.DataFrame(aggs).sort_values("WER")

    # Both output files share the input CSV's base name without '.csv'.
    stem = os.path.basename(csv_path).replace('.csv','')
    per_utterance_csv = os.path.join(output_dir, f"{stem}_per_utterance.csv")
    summary_csv = os.path.join(output_dir, f"{stem}_summary.csv")

    per_utterance_df.to_csv(per_utterance_csv, index=False)
    summary_df.to_csv(summary_csv, index=False)

    print(f"Saved per-utterance results to {per_utterance_csv}")
    print(f"Saved summary results to {summary_csv}")

    return per_utterance_df, summary_df


In [21]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive so results can be written under it.
drive.mount('/content/drive')


def save_to_drive(drive_path, summary, frame=None):
  """Write a summary table as a CSV file inside ``drive_path``.

  Args:
    drive_path: Destination folder; created if it does not exist.
    summary: File name (not a DataFrame) of the CSV to create in ``drive_path``.
    frame: DataFrame to save. When omitted, the notebook-level ``summary_df``
      is used, which keeps existing two-argument calls working. Passing the
      frame explicitly avoids depending on whichever cell last assigned
      ``summary_df``.

  Raises:
    NameError: If ``frame`` is omitted and no global ``summary_df`` exists.
  """
  if frame is None:
    # Backward-compatible fallback to the global set by the evaluation cell.
    frame = summary_df

  os.makedirs(drive_path, exist_ok=True)

  frame.to_csv(os.path.join(drive_path, summary), index=False)

  print(f"CSV files saved to {drive_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
# Per-accent index CSVs, expected under /content/afrispeech/.
accent_csvs = ["yoruba_audio_text.csv", "igbo_audio_text.csv", "swahili_audio_text.csv", "hausa_audio_text.csv",]
for csv in accent_csvs:
    print(f"Processing {csv}")
    csv_path = f"/content/afrispeech/{csv}"
    # summary_df is deliberately a notebook-level name: save_to_drive reads it
    # as a global rather than receiving the DataFrame as an argument.
    _, summary_df = evaluate_whisper_csv(csv_path, output_dir="whisper_evaluation_results")
    drive_path = "/content/drive/MyDrive/Afrispeech/whisper_evaluation_results/"
    # Pass a string filename, not the DataFrame, to save_to_drive.
    # save_to_drive must already be defined when this cell runs.
    save_to_drive(drive_path, f"{os.path.basename(csv).replace('.csv','')}_summary.csv")

Processing yoruba_audio_text.csv
Now working on tiny model size and en lang


100%|██████████| 100/100 [00:38<00:00,  2.63it/s]


Now working on tiny model size and None lang


100%|██████████| 100/100 [00:39<00:00,  2.52it/s]


Now working on base model size and en lang


100%|██████████| 100/100 [00:42<00:00,  2.38it/s]


Now working on base model size and None lang


100%|██████████| 100/100 [00:42<00:00,  2.34it/s]


Now working on small model size and en lang


100%|██████████| 100/100 [00:40<00:00,  2.45it/s]


Now working on small model size and None lang


100%|██████████| 100/100 [00:41<00:00,  2.43it/s]


Now working on medium model size and en lang


100%|██████████| 100/100 [00:48<00:00,  2.08it/s]


Now working on medium model size and None lang


100%|██████████| 100/100 [01:09<00:00,  1.43it/s]


Now working on large-v2 model size and en lang


100%|██████████| 100/100 [00:56<00:00,  1.79it/s]


Now working on large-v2 model size and None lang


100%|██████████| 100/100 [01:08<00:00,  1.45it/s]


Now working on large-v3 model size and en lang


100%|██████████| 100/100 [00:55<00:00,  1.81it/s]


Now working on large-v3 model size and None lang


100%|██████████| 100/100 [01:01<00:00,  1.63it/s]

Saved per-utterance results to whisper_evaluation_results/yoruba_audio_text_per_utterance.csv
Saved summary results to whisper_evaluation_results/yoruba_audio_text_summary.csv





NameError: name 'save_to_drive' is not defined

In [25]:
# Manual re-save of the most recent summary. This relies on `csv` (the loop
# variable of the evaluation cell) and `summary_df` still holding the values
# left behind by that cell, so it only works after that cell has run.
drive_path = "/content/drive/MyDrive/Afrispeech/whisper_evaluation_results/"
save_to_drive(drive_path, f"{os.path.basename(csv).replace('.csv','')}_summary.csv")

CSV files saved to /content/drive/MyDrive/Afrispeech/whisper_evaluation_results/
