### Run the Whisper transcriptions

In [29]:
import whisper
import os
from g2p_en import G2p
from functools import lru_cache
from typing import Dict, List, Tuple
import jiwer
import pandas as pd
from collections import defaultdict
import re
# Usually not needed: if g2p_en fails with a missing NLTK resource error, uncomment the next two lines
#import nltk
#nltk.download('averaged_perceptron_tagger_eng')

In [30]:
# Directory that is scanned for the .wav files to transcribe
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere
PARTICIPANT_AUDIOS = "/home/enchanted/Matcha-TTS/tts-outputs/"
# CSV with the reference text; the analysis cells read its `filename` and
# `transcription` columns
# NOTE(review): lives under a different home directory than the audio path — confirm
PARTICIPANT_TRANSCRIPTIONS = "/home/paige/Documents/BB_pepper-experiment/analysis/test.csv"
# Whisper "medium.en" checkpoint, loaded once and reused for every file
model = whisper.load_model("medium.en")

In [31]:
# Build a dictionary of {filename: transcription} for every .wav file in
# PARTICIPANT_AUDIOS.
asr_transcriptions = {}

# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and therefore the dict order) reproducible between runs.
for filename in sorted(os.listdir(PARTICIPANT_AUDIOS)):
    if filename.endswith(".wav"):
        # os.path.join avoids the doubled "/" that string formatting produces
        # when the directory constant already ends with a separator.
        result = model.transcribe(os.path.join(PARTICIPANT_AUDIOS, filename))
        asr_transcriptions[filename] = result["text"]

## Analysis

### Normalize the texts

In [32]:
# Single shared grapheme-to-phoneme converter, built once and reused by
# word_to_phone_token for every word.
_g2p = G2p()


def tokenize_text(text: str) -> List[str]:
    """
    Split text into lowercase word tokens.

    Punctuation and digits are dropped. Apostrophes and hyphens are kept only
    when they sit between letters, so "it's" and "mother-in-law" each stay a
    single token, while a leading or trailing quote/dash is discarded.

    Args:
        text: The sentence to tokenize.

    Returns:
        The list of lowercase tokens in order of appearance (empty if the
        text contains no ASCII letters).
    """
    # Typographic apostrophes (common in text exported from office tools)
    # are mapped to ASCII so "don’t" tokenizes the same as "don't".
    normalized = text.lower().replace("\u2019", "'").replace("\u2018", "'")
    # `*` (not `?`) so words with several internal joiners are not cut after
    # the first one, e.g. "mother-in-law" no longer becomes "mother-in" + "law".
    return re.findall(r"[a-z]+(?:['-][a-z]+)*", normalized)

@lru_cache(maxsize=100_000)
def word_to_phone_token(word: str) -> str:
    """
    Encode one word as a single phonetic token.

    The word is run through the shared g2p_en converter. Every output symbol
    that looks like an ARPAbet phone (capital letters with an optional
    trailing stress digit) is kept with the digit removed, so that stress
    differences do not prevent two pronunciations from comparing equal.

    Returns:
        "PH:" followed by the phones joined with "-" when at least one phone
        was produced; otherwise "W:" followed by the word itself, so that an
        un-phonemizable word only ever matches the identical spelling.
    """
    stripped_phones = []
    for symbol in _g2p(word):
        # Skip anything g2p emits that is not a phone symbol
        if re.match(r"^[A-Z]+[0-9]?$", symbol):
            # Drop the stress marker (AH0 -> AH)
            stripped_phones.append(re.sub(r"\d", "", symbol))

    if not stripped_phones:
        # Fall back to the literal spelling, tagged so it cannot collide
        # with a phonetic token
        return "W:" + word

    # Hyphen-joined so the whole word stays ONE whitespace-delimited token
    return "PH:" + "-".join(stripped_phones)


def phonetic_transform(s: str) -> str:
    """Turn a sentence into a space-separated string of per-word phonetic tokens."""
    phone_tokens = [word_to_phone_token(word) for word in tokenize_text(s)]
    return " ".join(phone_tokens)


### Scoring

In [33]:
def find_errors(
    data: Dict[str, List[Tuple[str, str]]]
) -> List[Dict[str, object]]:
    """
    Report every (reference, hypothesis) pair whose phonetic WER is non-zero.

    Both texts are converted with phonetic_transform before scoring, so
    differences in spelling, case, or punctuation that do not change the
    phone sequence are not counted as errors.

    Args:
        data: Mapping of file name to a list of (reference text,
            transcribed text) pairs.

    Returns:
        One dict per mismatching pair with the keys "file", "original",
        "transcription" and "error_rate" (WER rounded to 3 decimals).
        Pairs that match exactly are omitted.
    """
    rows = []
    for fname, pairs in data.items():
        for ref, hyp in pairs:
            ref_phon = phonetic_transform(ref)
            hyp_phon = phonetic_transform(hyp)

            if ref_phon:
                # Compute WER on the transformed strings (no extra kwargs)
                er = jiwer.wer(ref_phon, hyp_phon)
            else:
                # The reference has no scorable words (e.g. empty or digits
                # only). jiwer rejects empty references with a ValueError, so
                # score it here: any hypothesised word counts as a full error,
                # an equally empty hypothesis counts as a match.
                er = 1.0 if hyp_phon else 0.0

            if er > 0.0:
                rows.append({
                    "file": fname,
                    "original": ref,
                    "transcription": hyp,
                    "error_rate": round(er, 3),
                })
    return rows


In [34]:
# Align the original text and the Whisper transcriptions on file name
origin_transcriptions = pd.read_csv(PARTICIPANT_TRANSCRIPTIONS)

# Build the structure find_errors expects:
# {filename: [(original transcript, ASR transcript), ...]}
transcription_pairs = defaultdict(list)

# Walk only the two columns that are needed instead of building a Series per row
for filename, origin_transcript in zip(
    origin_transcriptions["filename"], origin_transcriptions["transcription"]
):
    # A file with no Whisper output is paired with "", so it is scored as a
    # complete miss rather than being skipped.
    asr_transcript = asr_transcriptions.get(filename, "")
    transcription_pairs[filename].append((origin_transcript, asr_transcript))

errors = find_errors(transcription_pairs)
# can export to csv if you want
print(errors)

[{'file': 'test-ja-000.wav', 'original': 'yes I see you', 'transcription': ' duh', 'error_rate': 1.0}, {'file': 'test-def-000.wav', 'original': 'Hello! How is going? Your part should be done in a couple of days!', 'transcription': " Hello, how are you? Listen to me if I'm not too old for you.", 'error_rate': 0.857}]


In [None]:



# Score every row of the transcript CSV and export the aligned table.
# NOTE(review): assumes the CSV has the `filename` and `transcription` columns
# read by the alignment cell earlier in this notebook — confirm.
origin_transcriptions = pd.read_csv(PARTICIPANT_TRANSCRIPTIONS)

# Attach the Whisper output for each file; files without one get "".
origin_transcriptions["asr"] = (
    origin_transcriptions["filename"].map(asr_transcriptions).fillna("")
)

wer_values = []
match_flags = []
for fname, ref, hyp in zip(
    origin_transcriptions["filename"],
    origin_transcriptions["transcription"],
    origin_transcriptions["asr"],
):
    # find_errors takes {filename: [(ref, hyp), ...]} and returns a row only
    # when the pair differs, so an empty result means a perfect match.
    row_errors = find_errors({fname: [(ref, hyp)]})
    wer_values.append(row_errors[0]["error_rate"] if row_errors else 0.0)
    match_flags.append("no" if row_errors else "yes")

# Assign both columns once, after the loop, instead of recomputing per row.
origin_transcriptions["wer"] = wer_values
origin_transcriptions["match"] = match_flags

origin_transcriptions.to_csv("aligned_transcriptions_with_match.csv", index=False)