In [1]:
import pandas as pd

In [2]:
# Root directory of the "hi" split of the cv-corpus-14.0-2023-06-23 dataset
# (presumably Mozilla Common Voice - confirm). The trailing slash is required:
# the CSV path is built by appending the file name directly to this string.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
DATASET_ROOT_PATH = '/mnt/coding-nvme/datasets/audio_ds/cv-corpus-14.0-2023-06-23/hi/'

In [3]:
# Load the test split; the root path already ends with a slash.
data = pd.read_csv(f'{DATASET_ROOT_PATH}test.csv')

In [4]:
# Keep only the clips that received no down-votes.
data = data.loc[data['down_votes'].eq(0)]

In [5]:
# Drop the vote counts and the Hindi text column.
# Reassign instead of using inplace=True: `data` was produced by a boolean-mask
# selection, and mutating such a derived frame in place is discouraged in pandas.
data = data.drop(columns=['down_votes', 'up_votes', 'text_hi'])

In [7]:
import os
def _get_file_name(row):
    """Return the location of a row's clip under the dataset's ``clips`` folder.

    Args:
        row: Record whose ``'path'`` entry holds the clip's file name.

    Returns:
        ``DATASET_ROOT_PATH`` joined with ``'clips'`` and ``row['path']``.
    """
    clips_dir = os.path.join(DATASET_ROOT_PATH, 'clips')
    return os.path.join(clips_dir, row['path'])

In [8]:
import librosa
def _audio_greater_than_20_sec(row, max_seconds=20):
    """Mark clips whose duration exceeds ``max_seconds``.

    Despite the name, this does not return a boolean for every clip: over-long
    clips yield ``False`` and every other clip yields its unchanged path, so
    callers can drop the rejected rows by filtering on ``!= False``.

    Args:
        row: Record whose ``'path'`` entry is handed to ``librosa.load``.
        max_seconds: Longest duration, in seconds, that is still kept.
            Defaults to 20, matching the function name.

    Returns:
        ``False`` when the clip holds more than ``max_seconds`` seconds of
        samples, otherwise ``row['path']``.
    """
    clip_path = row['path']
    # sr=None keeps the file's native sampling rate. Only the ratio between
    # sample count and rate matters here, so resampling would be wasted work.
    samples, sample_rate = librosa.load(clip_path, sr=None)
    if len(samples) > max_seconds * sample_rate:
        return False
    return clip_path

In [9]:
# Expand each clip name to its full path, run the duration check (which marks
# over-long clips with False), then keep only the rows that were not marked.
data = (
    data
    .assign(path=lambda frame: frame.apply(_get_file_name, axis=1))
    .assign(path=lambda frame: frame.apply(_audio_greater_than_20_sec, axis=1))
)
data = data[data['path'].ne(False)]

In [10]:
# Add one placeholder column per metric, each initialised to None:
#   wer - word error rate (WER)
#   mer - match error rate (MER)
#   wil - word information lost (WIL)
#   wip - word information preserved (WIP)
#   cer - character error rate (CER)
data = data.assign(**dict.fromkeys(('wer', 'mer', 'wil', 'wip', 'cer')))

In [12]:
from models.output.whisper_medium_fp16_transformers import Model, load_audio
# Build the project's Model wrapper from the exported checkpoint path,
# targeting CUDA device "0".
model = Model(
    model_name_or_path='models/output/whisper_medium_fp16_transformers',
    cuda_visible_device="0",
    device='cuda',
)

Moving model to cuda
dtype of model acc to config:  torch.float16
dtype of loaded model:  torch.float16


In [12]:
# import whisper
# model2 = whisper.load_model(
#     name='large-v2',
#     device='cuda:0',
#     download_root='models/compiled',
# )

In [17]:
from jiwer import wer, mer, wil, wip, cer

def calculate(row):
    """Transcribe one clip and score the transcript against its reference.

    Reads ``row['text_en']`` and ``row['path']``. Both the reference and the
    model output are stripped and lower-cased before scoring.

    Args:
        row: Record with a ``'text_en'`` reference string and a ``'path'``
            that ``load_audio`` can read.

    Returns:
        The same ``row`` with ``'wer'``, ``'mer'``, ``'wil'``, ``'wip'`` and
        ``'cer'`` filled in from the corresponding jiwer functions.
    """
    expected = row['text_en'].strip().lower()

    # fp16 model: decode the clip, then transcribe it.
    waveform = load_audio(row['path'])
    predicted = model.transcribe(waveform, language='en').strip().lower()

    # Alternative using the original Whisper package (requires `model2`):
    # audio = whisper.load_audio(row['path'])
    # audio = whisper.pad_or_trim(audio)
    # mel = whisper.log_mel_spectrogram(audio, device='cuda')
    # options = whisper.DecodingOptions(language="hi")
    # result = whisper.decode(model2, mel, options)
    # hypothesis = result.text.strip()

    # Same metrics, same order, one column per jiwer function.
    scorers = (
        ('wer', wer),
        ('mer', mer),
        ('wil', wil),
        ('wip', wip),
        ('cer', cer),
    )
    for column, scorer in scorers:
        row[column] = scorer(expected, predicted)

    return row

In [18]:
# Score every remaining clip row by row.
# Uncomment the next line to evaluate only the first 1000 rows.
# data = data.head(1000)
data = data.apply(calculate, axis='columns')

In [21]:
# Report the mean of each metric column over the evaluated clips.
for label, column in (
    ('WER: ', 'wer'),
    ('MER: ', 'mer'),
    ('WIL: ', 'wil'),
    ('WIP: ', 'wip'),
    ('CER: ', 'cer'),
):
    print(label, data[column].mean())

WER:  0.7687369334800929
MER:  0.6021552292207548
WIL:  0.6926061617106999
WIP:  0.3073938382893001
CER:  0.5355428574973872


In [None]:
from jiwer import wer, mer, wil, wip, cer

# Sanity check: score a Hindi sentence against an identical prediction
# with each of the five jiwer metrics.
ref_sent = 'साइना नेहवाल भारत पहुंची, एयरपोर्ट पर भव्य स्वागत'
pred_sent = 'साइना नेहवाल भारत पहुंची, एयरपोर्ट पर भव्य स्वागत'

for metric in (wer, mer, wil, wip, cer):
    print(metric(ref_sent, pred_sent))

# print(ref_sent)