In [1]:
import jiwer
from jiwer import wer
import re
import pandas as pd
from sacrebleu.metrics import BLEU, CHRF, TER

In [2]:
def getWER(ground_truth, hypothesis):
    for i in range(len(ground_truth)):
        ground_truth[i] = re.sub(r'\W+', ' ', ground_truth[i])
        hypothesis[i] = re.sub(r'\W+', ' ', hypothesis[i])

    transformation = jiwer.Compose([
        jiwer.ToLowerCase(),
        jiwer.RemoveWhiteSpace(replace_by_space=True),
        jiwer.RemoveMultipleSpaces(),
        jiwer.ReduceToListOfListOfWords(word_delimiter=" ")
    ]) 

    error = jiwer.wer(
        ground_truth, 
        hypothesis, 
        truth_transform=transformation, 
        hypothesis_transform=transformation
    )
    print(error)
    return error

In [59]:
def getBLEU(ground_truth, hypothesis):
    refs = ground_truth
    sys = hypothesis
    bleu = BLEU()
    bleu_score = bleu.corpus_score(sys, refs)
    print(bleu_score)
    return bleu_score

In [54]:
paths = []
englishs = []
germans = []
lines = [line.rstrip() for line in open("covost_v2.en_de.dev.tsv")]
for i in range(len(lines)):
    if i==0:
        continue
    value = re.split(r'\t', lines[i])
    paths.append(value[0])
    englishs.append(value[1])
    germans.append(value[2])

df = pd.DataFrame()
df['path'] = paths
df['german'] = germans
df['english'] = englishs
df = df.astype(str)

In [55]:
df

Unnamed: 0,path,german,english
0,common_voice_en_18862286.mp3,Der Rhythmus wird in der Bossa-Nova-Musik typi...,The rhythm is typically played as a snare rim ...
1,common_voice_en_18854895.mp3,Der Verwaltungssitz des Countys ist Prineville.,The county seat is Prineville.
2,common_voice_en_18854899.mp3,In Barkhamsted gibt es keine öffentlichen Verk...,There is no public transportation in Barkhamsted.
3,common_voice_en_18860698.mp3,Im Laufe seiner Karriere zeichnete Berryman Ta...,"During his career, Berryman drew thousands of ..."
4,common_voice_en_18860699.mp3,Aus lauter Angst fliehen die Männer am nächste...,"Scared, the men flee the camp the next morning."
...,...,...,...
15489,common_voice_en_19789230.mp3,Es kommunizierte unter Verwendung des Microsof...,It communicated using the Microsoft Notificati...
15490,common_voice_en_20064029.mp3,Ein kleiner Doppelkrater überlagert einen Teil...,A small double-crater overlays a portion of th...
15491,common_voice_en_20064031.mp3,"Die Hauptkulturen sind Weizen, Gerste und Linsen.","Wheat, barley and lentil are the main crops."
15492,common_voice_en_20064032.mp3,Der Township-Manager ist Giovanni D. Ahmad.,Giovanni D. Ahmad is the township manager.


In [56]:
paths = []
englishs = []

lines = [line.rstrip() for line in open("whisper_transcribed_medium.txt")]

count = 0
for i in range(len(lines)-1):
    value = re.split(r'\t', lines[i])
    paths.append(value[0])
    try:
        englishs.append(value[1])
    except:
        englishs.append('')
        count += 1

df_pred = pd.DataFrame()
df_pred['path'] = paths
df_pred['english_pred'] = englishs
df_pred = df_pred.astype(str)

In [57]:
len(df_pred)

15494

In [23]:
df['path'] = df['path'].astype('|S') 
df_pred['path'] = df_pred['path'].astype('|S') 
df_joined = df.merge(df_pred, on = "path", how = "inner")

In [60]:
ground_truth = df['english'].to_list()
hypothesis = df_pred['english_pred'].to_list()
getWER(ground_truth, hypothesis)
getBLEU(ground_truth, hypothesis)

0.5063705720670965
BLEU = 1.45 5.4/1.6/0.9/0.6 (BP = 1.000 ratio = 7.400 hyp_len = 37 ref_len = 5)


BLEU = 1.45 5.4/1.6/0.9/0.6 (BP = 1.000 ratio = 7.400 hyp_len = 37 ref_len = 5)

In [61]:
paths = []
germans = []

lines = [line.rstrip() for line in open("translated_base.txt")]

count = 0
for i in range(len(lines)-1):
    value = re.split(r'\t', lines[i])
    paths.append(value[0])
    try:
        germans.append(value[1])
    except:
        germans.append('')
        count += 1

df_pred = pd.DataFrame()
df_pred['path'] = paths
df_pred['german_pred'] = germans
df_pred = df_pred.astype(str)

In [62]:
df_pred

Unnamed: 0,path,german_pred
0,b'common_voice_en_18862286.mp3',Der Rhythmus wird typischerweise als Snare-Fel...
1,b'common_voice_en_18854895.mp3',Der Verwaltungssitz (County Seat) ist Pranavir.
2,b'common_voice_en_18854899.mp3',Es gibt keine öffentlichen Verkehrsmittel in B...
3,b'common_voice_en_18860698.mp3',Während seiner Karriere Barryman zeichnete Tau...
4,b'common_voice_en_18860699.mp3',Erschreckte die Männer am nächsten Morgen aus ...
...,...,...
15489,b'common_voice_en_19789230.mp3',Es kommuniziert mit dem Microsoft-Benachrichti...
15490,b'common_voice_en_20064029.mp3',Ein kleiner Doppelkrater überlagert einen Teil...
15491,b'common_voice_en_20064031.mp3',Weizen Gerste und Linsen sind die Hauptkulturen
15492,b'common_voice_en_20064032.mp3',Giovanni D Hammett ist Mtoneship Manager


In [13]:
df

Unnamed: 0,path,german,english
0,common_voice_en_18862286.mp3,Der Rhythmus wird in der Bossa-Nova-Musik typi...,The rhythm is typically played as a snare rim ...
1,common_voice_en_18854895.mp3,Der Verwaltungssitz des Countys ist Prineville.,The county seat is Prineville.
2,common_voice_en_18854899.mp3,In Barkhamsted gibt es keine öffentlichen Verk...,There is no public transportation in Barkhamsted.
3,common_voice_en_18860698.mp3,Im Laufe seiner Karriere zeichnete Berryman Ta...,"During his career, Berryman drew thousands of ..."
4,common_voice_en_18860699.mp3,Aus lauter Angst fliehen die Männer am nächste...,"Scared, the men flee the camp the next morning."
...,...,...,...
15489,common_voice_en_19789230.mp3,Es kommunizierte unter Verwendung des Microsof...,It communicated using the Microsoft Notificati...
15490,common_voice_en_20064029.mp3,Ein kleiner Doppelkrater überlagert einen Teil...,A small double-crater overlays a portion of th...
15491,common_voice_en_20064031.mp3,"Die Hauptkulturen sind Weizen, Gerste und Linsen.","Wheat, barley and lentil are the main crops."
15492,common_voice_en_20064032.mp3,Der Township-Manager ist Giovanni D. Ahmad.,Giovanni D. Ahmad is the township manager.


In [49]:
ground_truth = df['german'].to_list()
hypothesis = df_pred['german_pred'].to_list()
getWER(ground_truth[:1], hypothesis[:1])
getBLEU(ground_truth[:1], hypothesis[:1])

getBLEU(["Der Rhythmus wird in der Bossa-Nova-Musik typischerweise als Snare-Rim-Muster gesplielt"], 
        ["Der Rhythmus wird typischerweise als Snare Felgenmuster in einer Nova Musik gespielt"])

0.7142857142857143
BLEU = 0.00 0.0 (BP = 1.000 ratio = 12.000 hyp_len = 12 ref_len = 1)
BLEU = 0.00 0.0 (BP = 1.000 ratio = 12.000 hyp_len = 12 ref_len = 1)


BLEU = 0.00 0.0 (BP = 1.000 ratio = 12.000 hyp_len = 12 ref_len = 1)

In [50]:
bleu = sacrebleu.corpus_bleu(["Der Rhythmus wird in der Bossa-Nova-Musik typischerweise als Snare-Rim-Muster gesplielt"], 
        ["Der Rhythmus wird typischerweise als Snare Felgenmuster in einer Nova Musik gespielt"])
print(bleu)

BLEU = 0.00 0.0/0.0/0.0/0.0 (BP = 1.000 ratio = 10.000 hyp_len = 10 ref_len = 1)


In [53]:
sacrebleu.sacrebleu.parse_agrs()

AttributeError: module 'sacrebleu' has no attribute 'sacrebleu'

In [48]:
print((ground_truth[0]), (hypothesis[0]))

Der Rhythmus wird in der Bossa-Nova-Musik typischerweise als Snare-Rim-Muster gespielt. Der Rhythmus wird typischerweise als Snare Felgenmuster in einer Nova Musik gespielt


In [43]:
getBLEU(['I really like the yogurt cause it has no sugar'], ['I LIKE this yogurt, it has no sugar'])

BLEU = 11.11 11.1 (BP = 1.000 ratio = 9.000 hyp_len = 9 ref_len = 1)


BLEU = 11.11 11.1 (BP = 1.000 ratio = 9.000 hyp_len = 9 ref_len = 1)

In [42]:
import sacrebleu
bleu = sacrebleu.corpus_bleu(['I really like the yogurt cause it has no sugar'], ['I LIKE this yogurt, it has no sugar'])
print(bl)

BLEU = 4.20 10.0/5.6/3.1/1.8 (BP = 1.000 ratio = 10.000 hyp_len = 10 ref_len = 1)
