In [None]:
# Mount Google Drive at /content/drive so the notebook can read files stored there.
# force_remount=True is passed so the mount is redone even if Drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!pip install transformers datasets evaluate soundfile librosa pandas jiwer tqdm --quiet

import os
import glob
import json
import torch
import evaluate
import soundfile as sf
import pandas as pd
import librosa
from tqdm import tqdm

from transformers import WhisperProcessor, WhisperForConditionalGeneration

In [None]:
# --- Configuration -----------------------------------------------------------
# Folder on the mounted Drive that holds the .wav recordings and the metadata JSON.
FOLDER = "/content/drive/My Drive/VoiceBridge_SAP_sample"
JSON_FILE = os.path.join(FOLDER, "digital_assistant_metadata.json")
MODEL_NAME = "openai/whisper-small"
# Use the GPU when one is attached to the runtime, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# --- Metadata ----------------------------------------------------------------
# Read explicitly as UTF-8: the reference transcripts can contain non-ASCII
# symbols, and the platform default encoding is not guaranteed to be UTF-8.
with open(JSON_FILE, "r", encoding="utf-8") as f:
    json_data = json.load(f)

# Filename -> reference transcript (stripped and lower-cased, to match how the
# predictions are post-processed before scoring).
reference_dict = {entry["Filename"]: entry["Prompt"]["Transcript"].strip().lower()
                  for entry in json_data}

# Filename -> list of rating records (empty list when the entry has no "Ratings" key).
ratings_dict = {
    entry["Filename"]: entry.get("Ratings", [])
    for entry in json_data
}

# Sorted so that the processing order is deterministic across runs.
wav_files = sorted(glob.glob(os.path.join(FOLDER, "*.wav")))

# --- Model -------------------------------------------------------------------
# `model_name` is kept as an alias of MODEL_NAME for any cell that still uses it.
model_name = MODEL_NAME
processor = WhisperProcessor.from_pretrained(MODEL_NAME)
# Place the model on DEVICE (not a hard-coded "cuda") so it lives on the same
# device the input features are moved to, and so CPU-only runtimes do not crash.
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME).to(DEVICE)
model.eval()


def load_audio(path, target_sr=16000):
    """Load an audio file as a mono waveform sampled at ``target_sr`` Hz.

    Args:
        path: Path of the audio file; passed straight to ``sf.read``.
        target_sr: Desired sampling rate in Hz. The audio is resampled with
            ``librosa.resample`` only when the file's own rate differs.

    Returns:
        A 1-D array of samples at ``target_sr``.
    """
    speech_array, sr = sf.read(path)
    # Multi-channel files come back as (frames, channels). Average the channels
    # into a single mono track *before* resampling: otherwise the resampler
    # would operate along the channel axis and the caller would receive a 2-D
    # array instead of a waveform.
    if speech_array.ndim > 1:
        speech_array = speech_array.mean(axis=1)
    if sr != target_sr:
        speech_array = librosa.resample(speech_array, orig_sr=sr, target_sr=target_sr)
    return speech_array



@torch.inference_mode()
def batch_transcribe(wav_paths, batch_size=4):
    """Transcribe audio files in batches using the module-level Whisper model.

    Relies on the module-level ``processor``, ``model`` and ``DEVICE`` objects
    and on ``load_audio`` for reading each file.

    Args:
        wav_paths: Sequence of audio file paths.
        batch_size: Number of files sent through the model per step.

    Returns:
        List of decoded transcriptions (stripped and lower-cased), one per
        input path and in the same order.
    """
    all_texts = []
    batch_starts = range(0, len(wav_paths), batch_size)
    for start in tqdm(batch_starts, desc="Transcribing"):
        chunk_paths = wav_paths[start:start + batch_size]
        waveforms = []
        for audio_path in chunk_paths:
            waveforms.append(load_audio(audio_path))

        # Turn the raw waveforms into model input features on the target device.
        encoded = processor(waveforms, sampling_rate=16000, return_tensors="pt")
        input_features = encoded.input_features.to(DEVICE)

        generated_ids = model.generate(input_features)
        texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
        for text in texts:
            all_texts.append(text.strip().lower())
    return all_texts


#metric = evaluate.load("wer")
def extract_disease(ratings):
    """Return the first non-empty "Dimension Category Description" in ``ratings``.

    The notebook uses this value as the per-file disease label.

    Args:
        ratings: Iterable of rating dicts, or ``None``. Entries that are not
            dicts, and values that are missing, ``None`` or not strings, are
            skipped instead of raising.

    Returns:
        The first label that is non-empty after stripping whitespace, or
        ``None`` when there is no such label.
    """
    # `ratings or []` covers a JSON `null` (None) where a list was expected.
    for rating in ratings or []:
        if not isinstance(rating, dict):
            continue
        label = rating.get("Dimension Category Description")
        # The field may be null or a non-string in the metadata; only strings
        # can carry a usable label.
        if isinstance(label, str):
            label = label.strip()
            if label:
                return label
    return None


# Transcribe all audio recordings.
preds = batch_transcribe(wav_files)
metric = evaluate.load("wer")

# Store WER per file.
# A file with no (or an empty) reference transcript cannot be scored: the WER
# metric is undefined for an empty reference and the underlying library is
# expected to reject it. Such files are kept in the table with WER = NaN (pandas
# skips NaN in the means below) instead of aborting the whole evaluation.
results = []
scored_preds = []
scored_refs = []
for wav_path, pred in zip(wav_files, preds):
    filename = os.path.basename(wav_path)
    ref = reference_dict.get(filename, "")
    if ref:
        wer = metric.compute(predictions=[pred], references=[ref])
        scored_preds.append(pred)
        scored_refs.append(ref)
    else:
        print(f"Warning: no reference transcript for {filename}; WER not computed.")
        wer = float("nan")
    disease = extract_disease(ratings_dict.get(filename, []))
    results.append({
        "Filename": filename,
        "Prediction": pred,
        "Reference": ref,
        "WER": wer,
        "Disease": disease
    })

# Fixed column list so the frame has the expected columns even when no file was
# processed (otherwise results_df["WER"] would raise KeyError on an empty list).
results_df = pd.DataFrame(
    results,
    columns=["Filename", "Prediction", "Reference", "WER", "Disease"],
).astype({"WER": float})
print("Finished evaluating all samples.")

# Unweighted mean of the per-file WERs: every file counts equally, so very short
# prompts can dominate this number.
overall_wer = results_df["WER"].mean()
print(f"\nOverall average WER: {overall_wer:.3f}")

# Corpus-level WER over all scored files (errors pooled across files), which
# weights each file by the length of its reference.
if scored_refs:
    corpus_wer = metric.compute(predictions=scored_preds, references=scored_refs)
    print(f"Corpus-level WER: {corpus_wer:.3f}")

# Top 10 worst WER
top10 = results_df.sort_values("WER", ascending=False).head(10)
print("\nTop 10 highest WER samples:")
print(top10[["Filename", "WER", "Reference", "Prediction"]].to_string(index=False))

# Average WER per disease label
if results_df["Disease"].notna().any():
    disease_summary = results_df.groupby("Disease")["WER"].mean().sort_values(ascending=False)
    print("\nAverage WER per disease:")
    print(disease_summary)
else:
    print("\nNo disease found.")


Transcribing: 100%|██████████| 23/23 [00:41<00:00,  1.82s/it]


Finished evaluating all samples.

Overall average WER: 0.813

Top 10 highest WER samples:
                                            Filename       WER                                        Reference                                                                                                                                                                                                                    Prediction
  2dcb3ebc-59ea-4275-75a6-08dd3ba78fcd_242_13367.wav 10.571429       set the air conditioning to (se-) seventy. ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ ḍᵃ
23b27207-6d7f-4da9-a85b-08dcf77c2ab9_29853_13492.wav  4.000000                                       audiobook.                                                                                                                                   