In [1]:
import jiwer
from jiwer import wer
import re
import pandas as pd
from sacrebleu.metrics import BLEU, CHRF, TER

In [2]:
def getWER(ground_truth, hypothesis):
    for i in range(len(ground_truth)):
        ground_truth[i] = re.sub(r'\W+', ' ', ground_truth[i])
        hypothesis[i] = re.sub(r'\W+', ' ', hypothesis[i])

    transformation = jiwer.Compose([
        jiwer.ToLowerCase(),
        jiwer.RemoveWhiteSpace(replace_by_space=True),
        jiwer.RemoveMultipleSpaces(),
        jiwer.ReduceToListOfListOfWords(word_delimiter=" ")
    ]) 
    
#     print(ground_truth)
#     print(hypothesis)

    error = jiwer.wer(
        ground_truth, 
        hypothesis, 
        truth_transform=transformation, 
        hypothesis_transform=transformation
    )
    print(error)
    return error

In [3]:
def getBLEU(ground_truth, hypothesis):
    refs = ground_truth
    sys = hypothesis
    bleu = BLEU()
    bleu_score = bleu.corpus_score(sys, refs)
    print(bleu_score)
    return bleu_score

In [4]:
paths = []
englishs = []
germans = []
lines = [line.rstrip() for line in open("covost_v2.en_de.dev.tsv")]
for i in range(len(lines)):
    if i==0:
        continue
    value = re.split(r'\t', lines[i])
    paths.append(value[0])
    englishs.append(value[1])
    germans.append(value[2])

df = pd.DataFrame()
df['path'] = paths
df['german'] = germans
df['english'] = englishs
df = df.astype(str)

In [5]:
df

Unnamed: 0,path,german,english
0,common_voice_en_18862286.mp3,Der Rhythmus wird in der Bossa-Nova-Musik typi...,The rhythm is typically played as a snare rim ...
1,common_voice_en_18854895.mp3,Der Verwaltungssitz des Countys ist Prineville.,The county seat is Prineville.
2,common_voice_en_18854899.mp3,In Barkhamsted gibt es keine öffentlichen Verk...,There is no public transportation in Barkhamsted.
3,common_voice_en_18860698.mp3,Im Laufe seiner Karriere zeichnete Berryman Ta...,"During his career, Berryman drew thousands of ..."
4,common_voice_en_18860699.mp3,Aus lauter Angst fliehen die Männer am nächste...,"Scared, the men flee the camp the next morning."
...,...,...,...
15489,common_voice_en_19789230.mp3,Es kommunizierte unter Verwendung des Microsof...,It communicated using the Microsoft Notificati...
15490,common_voice_en_20064029.mp3,Ein kleiner Doppelkrater überlagert einen Teil...,A small double-crater overlays a portion of th...
15491,common_voice_en_20064031.mp3,"Die Hauptkulturen sind Weizen, Gerste und Linsen.","Wheat, barley and lentil are the main crops."
15492,common_voice_en_20064032.mp3,Der Township-Manager ist Giovanni D. Ahmad.,Giovanni D. Ahmad is the township manager.


In [7]:
paths = []
englishs = []

lines = [line.rstrip() for line in open("whisper_transcribed_medium.txt")]

count = 0
for i in range(len(lines)-1):
    value = re.split(r'\t', lines[i])
    paths.append(value[0])
    try:
        englishs.append(value[1])
    except:
        englishs.append('')
        count += 1

df_pred = pd.DataFrame()
df_pred['path'] = paths
df_pred['english_pred'] = englishs
df_pred = df_pred.astype(str)

In [8]:
df_pred

Unnamed: 0,path,english_pred
0,common_voice_en_19703104.mp3,The airport built a glass vault shelter to re...
1,common_voice_en_19620264.mp3,Early potato chip bags or wax paper with the ...
2,common_voice_en_19536604.mp3,It was only partially destroyed and was resto...
3,common_voice_en_19414447.mp3,Chorus was to retain its melting operations a...
4,common_voice_en_19822347.mp3,Something happened has frequently be criticiz...
...,...,...
15489,common_voice_en_20016471.mp3,Or was present for the lecture and was very i...
15490,common_voice_en_20024406.mp3,Some variants have odd trumps.
15491,common_voice_en_18843438.mp3,He later became known as Boston's favorite son.
15492,common_voice_en_19606229.mp3,"Martin Brest, Steven Spielberg and Sydney Pol..."


In [9]:
df['path'] = df['path'].astype('|S') 
df_pred['path'] = df_pred['path'].astype('|S') 
df_joined = df.merge(df_pred, on = "path", how = "inner")

In [10]:
ground_truth = df_joined['english'].to_list()
hypothesis = df_joined['english_pred'].to_list()
getWER(ground_truth, hypothesis)
getBLEU(ground_truth, hypothesis)

0.0890081944051992
BLEU = 0.84 2.0/1.1/0.6/0.4 (BP = 1.000 ratio = 10.000 hyp_len = 50 ref_len = 5)


BLEU = 0.84 2.0/1.1/0.6/0.4 (BP = 1.000 ratio = 10.000 hyp_len = 50 ref_len = 5)

In [11]:
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
helsinki_en2de = pipeline("translation_en_to_de", "Helsinki-NLP/opus-mt-en-de")

In [13]:
!pip install tqdm

Defaulting to user installation because normal site-packages is not writeable


In [14]:
from tqdm import tqdm

In [None]:
hypothesis_en2de = [] 
for i in tqdm(range(0, len(hypothesis), 32)):
    hypothesis_en2de.append(helsinki_en2de(hypothesis[i:i+32]))

 52%|█████████████████████████████████████████▏                                     | 253/485 [1:01:40<55:31, 14.36s/it]

In [None]:
ground_truth_translation = df_joined['german'].to_list()

In [None]:
# make lens match
len(hypothesis_en2de)

hypothesis_translation = []
for batch in hypothesis_en2de:
    for h in batch:
        hypothesis_translation.append(h['translation_text'])

len(hypothesis_translation)

In [117]:
# getWER(ground_truth_translation[:1], hypothesis_translation2[:1])
# getBLEU(ground_truth_translation[:1], hypothesis_translation2[:1])

['Der Rhythmus wird in der Bossa Nova Musik typischerweise als Snare Rim Muster gespielt ']
['Der Rhythmus wird typischerweise als Snare Felgenmuster in einer Nova Musik gespielt ']
0.7142857142857143
BLEU = 0.00 0.0/0.0/0.0/0.0 (BP = 1.000 ratio = 12.000 hyp_len = 12 ref_len = 1)


BLEU = 0.00 0.0/0.0/0.0/0.0 (BP = 1.000 ratio = 12.000 hyp_len = 12 ref_len = 1)

In [15]:
ground_truth_translation[:5]

['Der Rhythmus wird in der Bossa-Nova-Musik typischerweise als Snare-Rim-Muster gespielt.',
 'Der Verwaltungssitz des Countys ist Prineville.',
 'In Barkhamsted gibt es keine öffentlichen Verkehrsmittel.',
 'Im Laufe seiner Karriere zeichnete Berryman Tausende Comics mit Kommentaren zu amerikanischen Präsidenten und Politik.',
 'Aus lauter Angst fliehen die Männer am nächsten Morgen aus dem Camp.']

In [17]:
hypothesis_translation[:5]

['Der Rhythmus wird typischerweise als Snare-Felgenmuster in Bosa Nova-Musik gespielt',
 'Der Verwaltungssitz (County Seat) ist Pranavil.',
 'Es gibt keine öffentlichen Verkehrsmittel in Baukampsted',
 'Während seiner Karriere zog Berryman Tausende von Cartoons kommentieren amerikanischen Präsidenten und Politik',
 'Angst Die Männer flüchten am nächsten Morgen aus dem Lager']

In [None]:
# save translations
paths = df_joined['path'].to_list()
translated = ""
for i in range(len(hypothesis_translation)):
    translated += str(paths[i]) + '\t' + hypothesis_translation[i] + '\n' 

with open("translated_medium.txt", "w") as text_file:                                                               
    print(translated, file=text_file)