In [1]:
#English
import whisper
from jiwer import wer
import re

model = whisper.load_model("base")

audio_files = [
    "Downloads/dev-clean/LibriSpeech/dev-clean/8297/275155/8297-275155-0000.flac",
    "Downloads/dev-clean/LibriSpeech/dev-clean/8297/275155/8297-275155-0002.flac",
    "Downloads/dev-clean/LibriSpeech/dev-clean/8297/275155/8297-275155-0011.flac",
    "Downloads/dev-clean/LibriSpeech/dev-clean/8297/275155/8297-275155-0032.flac"
]

original_texts = [
    "ON THE NEXT DAY BUT ONE RANDAL ARRANGED HIS DEPARTURE FOR SYDENHAM SO AS TO ARRIVE AT THE HOTEL AN HOUR BEFORE THE TIME APPOINTED FOR THE DINNER",  
    "AFTER READING ONE OR TWO OF THE POLITICAL ARTICLES HE ARRIVED AT THE COLUMNS SPECIALLY DEVOTED TO FASHIONABLE INTELLIGENCE",
    "BUT IF THESE NEWSPAPER PEOPLE WAITED TO FIND OUT WHETHER A REPORT IS TRUE OR FALSE HOW MUCH GOSSIP WOULD SOCIETY GET IN ITS FAVORITE NEWSPAPERS",
    "WHILE HE WAS WALKING UP AND DOWN THE PLATFORM WITH A MIND DOUBLY DISTRESSED BY ANXIETY ABOUT HIS BROTHER AND ANXIETY ABOUT SYDNEY THE TRAIN FROM LONDON CAME IN"
]

total_wer = 0
num_files = len(audio_files)

def normalize_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text) 
    text = text.strip() 
    return text

for audio_file, original_text in zip(audio_files, original_texts):
    result = model.transcribe(audio_file)
    transcription = result['text']

    normalized_original = normalize_text(original_text)
    normalized_transcription = normalize_text(transcription)

    print(f"Original Text: '{normalized_original}'")
    print(f"Transcribed Text: '{normalized_transcription}'")

    error_rate = wer(normalized_original, normalized_transcription)
    
    print(f"Word Error Rate (WER) Score for {audio_file} (lower is better): {error_rate:.4f}")

    total_wer += error_rate

average_wer = total_wer / num_files
print(f"Average Word Error Rate (WER) Score (lower is better): {average_wer:.4f}")


  checkpoint = torch.load(fp, map_location=device)


Original Text: 'on the next day but one randal arranged his departure for sydenham so as to arrive at the hotel an hour before the time appointed for the dinner'
Transcribed Text: 'on the next day but one, randall arranged his departure for sidonham, so as to arrive at the hotel an hour before the time appointed for the dinner.'
Word Error Rate (WER) Score for Downloads/dev-clean/LibriSpeech/dev-clean/8297/275155/8297-275155-0000.flac (lower is better): 0.1429
Original Text: 'after reading one or two of the political articles he arrived at the columns specially devoted to fashionable intelligence'
Transcribed Text: 'after reading one or two of the political articles, he arrived at the columns specially devoted to fashionable intelligence.'
Word Error Rate (WER) Score for Downloads/dev-clean/LibriSpeech/dev-clean/8297/275155/8297-275155-0002.flac (lower is better): 0.1053
Original Text: 'but if these newspaper people waited to find out whether a report is true or false how much gossip w

In [2]:
#Spanish
import whisper
from jiwer import wer
import re

model = whisper.load_model("base")

audio_files = [
    "Downloads/es_co_female/cof_07508_01601808212.wav",
    "Downloads/es_co_female/cof_03397_01983407356.wav",
    "Downloads/es_co_female/cof_09334_00430370240.wav",
    "Downloads/es_co_female/cof_01523_02106561942.wav"
]

original_texts = [
    "¿Quieres que revise tu nómina y los depósitos que han hecho en el último mes?",  
    "¿Puedes revisar si hay alguna tienda departo cerca de la casa de mis papás?",
    "También necesitamos una calcomanía de un dinosaurio para el computador",
    "¿Cómo puedo registrarme para obtener una bicicleta con el programa de la ciudad?"
]

total_wer = 0
num_files = len(audio_files)

def normalize_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text) 
    text = text.strip() 
    return text

for audio_file, original_text in zip(audio_files, original_texts):
    result = model.transcribe(audio_file)
    transcription = result['text']

    normalized_original = normalize_text(original_text)
    normalized_transcription = normalize_text(transcription)

    print(f"Original Text: '{normalized_original}'")
    print(f"Transcribed Text: '{normalized_transcription}'")

    error_rate = wer(normalized_original, normalized_transcription)
    
    print(f"Word Error Rate (WER) Score for {audio_file} (lower is better): {error_rate:.4f}")

    total_wer += error_rate

average_wer = total_wer / num_files
print(f"Average Word Error Rate (WER) Score (lower is better): {average_wer:.4f}")


Original Text: '¿quieres que revise tu nómina y los depósitos que han hecho en el último mes?'
Transcribed Text: '¿quieres que revise tu nomina y los depósitos que han hecho en el último mes?'
Word Error Rate (WER) Score for Downloads/es_co_female/cof_07508_01601808212.wav (lower is better): 0.0667
Original Text: '¿puedes revisar si hay alguna tienda departo cerca de la casa de mis papás?'
Transcribed Text: 'puedes revisarse alguna tienda de apartamento cerca la casa de mis papás'
Word Error Rate (WER) Score for Downloads/es_co_female/cof_03397_01983407356.wav (lower is better): 0.5714
Original Text: 'también necesitamos una calcomanía de un dinosaurio para el computador'
Transcribed Text: 'también necesitamos una calcul humanidad de un dinosaurio para el computador.'
Word Error Rate (WER) Score for Downloads/es_co_female/cof_09334_00430370240.wav (lower is better): 0.3000
Original Text: '¿cómo puedo registrarme para obtener una bicicleta con el programa de la ciudad?'
Transcribed Text

In [None]:
#French
import whisper
from jiwer import wer
import re

model = whisper.load_model("base")

audio_files = [
    "Downloads/FR/FR/0a0bc3af-7e7b-4396-ba6b-8c9e112cb926.flac",
    "Downloads/FR/FR/00c2529e-9705-464e-bbc2-acbe19916095.flac",
    "Downloads/FR/FR/0a65d511-9829-4643-98ff-0af0aad27c2e.flac",
    "Downloads/FR/FR/0b4ac057-0232-4669-b41f-bfd9d8b6ef16.flac"
]

original_texts = [
    "un plan massif comme lors de la crise financière seul près de deux cent cinquante quatre milliards d'euros sont débloqués dont la moitié sous forme d'emprunts d'état priorité à l'emploi et aux collectivités locales des plans de relance immédiate",  
    "le refuserais de mettre un genou à terre comme le souhaiterais manifestement le ministre de l'intérieur et il aura suffit d'une manifestation de vingt mille personnes devant le palais de la justice je rappelle que les gilets jaunes c'était deux cent cinquante mille personnes",
    "qualité tout ce qui existe il veut les entrainer derrière lui en leur donnant des responsabilités et des libertés la difficulté c'est qu'il a pris soin dans les trois années précédantes de le retirer tout le financement",
    "les plateformes de l'igpn de l'hygiène servent aussi à cela donc je le rappelle fermement et je le ferai par instructions dans les jours qui viennent le port du numéro rio est obligatoire et nous contrôlerons"
]

total_wer = 0
num_files = len(audio_files)

def normalize_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text) 
    text = text.strip() 
    return text

for audio_file, original_text in zip(audio_files, original_texts):
    result = model.transcribe(audio_file)
    transcription = result['text']

    normalized_original = normalize_text(original_text)
    normalized_transcription = normalize_text(transcription)

    print(f"Original Text: '{normalized_original}'")
    print(f"Transcribed Text: '{normalized_transcription}'")

    error_rate = wer(normalized_original, normalized_transcription)
    
    print(f"Word Error Rate (WER) Score for {audio_file} (lower is better): {error_rate:.4f}")

    total_wer += error_rate

average_wer = total_wer / num_files
print(f"Average Word Error Rate (WER) Score (lower is better): {average_wer:.4f}")


Original Text: 'un plan massif comme lors de la crise financière seul près de deux cent cinquante quatre milliards d'euros sont débloqués dont la moitié sous forme d'emprunts d'état priorité à l'emploi et aux collectivités locales des plans de relance immédiate'
Transcribed Text: 'un plan massif comme l'ordre de la crise financière. seule près de 254 milliards d'euros sont développés dont la moitié sous forme d'empreint d'etat, priorité à l'emploi et aux collectivités locales. des plans de relance inédits avaient...'
Word Error Rate (WER) Score for Downloads/FR/FR/0a0bc3af-7e7b-4396-ba6b-8c9e112cb926.flac (lower is better): 0.3333
