In [None]:
import os
import torch
import soundfile as sf

from transformers import pipeline
from transformers.utils import is_flash_attn_2_available
from jiwer import wer

#### Load models with flash attention

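Load each Whisper model size with flash attention (falling back to SDPA when flash attention is not available). For every model load and every inference run, compute and store the GPU memory usage, the word error rate (WER) and the real-time factor (RTF).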

In [None]:
# Whisper model identifiers handed to the transformers pipeline, tiny through large.
tiny = 'openai/whisper-tiny'
base = 'openai/whisper-base'
small = 'openai/whisper-small'
medium = 'openai/whisper-medium'
large = 'openai/whisper-large-v3'

models = [tiny, base, small, medium, large]

# os.listdir returns entries in arbitrary order and lists everything in the
# directory. Sort for a reproducible run order and keep only regular, non-hidden
# files so stray entries (e.g. .DS_Store or .ipynb_checkpoints/) are never
# handed to the ASR pipeline.
audio_files = sorted(
    entry for entry in os.listdir('audio_files')
    if not entry.startswith('.')
    and os.path.isfile(os.path.join('audio_files', entry))
)

Create a file to store test results

In [None]:
# Report destination and the two banner lines that open it. `underline` is also
# reused further down as the separator between individual result entries.
results_file = 'whisper_results.txt'
header = "******** Whisper Automatic Speech Recognition test ********\n"
underline = "___________________________________________________________\n"

# Write mode truncates any report left over from a previous run, so every
# execution of the notebook starts from a file containing only the banner.
with open(results_file, 'w') as report:
    report.writelines((header, underline))

#### Testing pipeline

In [None]:
# Benchmark every model on every audio file and append one result entry per
# (model, audio file) pair to `results_file`.

# The availability check does not change between models, so evaluate it once.
# The label written to the report must reflect the backend actually used.
attn_implementation = 'flash_attention_2' if is_flash_attn_2_available() else 'sdpa'
model_suffix = '-flash' if attn_implementation == 'flash_attention_2' else '-sdpa'

for model in models:
    # Drop the reference to the previously loaded pipeline first: empty_cache()
    # can only hand back memory that is no longer referenced.
    asr_pipeline = None
    torch.cuda.empty_cache()
    asr_pipeline = pipeline('automatic-speech-recognition',
                            model=model,
                            torch_dtype=torch.float16,
                            device='cuda:0',
                            model_kwargs={'attn_implementation': attn_implementation}
    )

    # check how much memory the model is using when loaded
    gpu_mem_loaded = torch.cuda.memory_allocated('cuda:0')

    for audio_file in audio_files:
        audio_path = os.path.join('audio_files', audio_file)

        torch.cuda.empty_cache()
        # Restart peak tracking so the figure read after inference belongs to
        # this audio file only.
        torch.cuda.reset_peak_memory_stats('cuda:0')

        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()

        output = asr_pipeline(
            audio_path,
            chunk_length_s=30,
            batch_size=24,
            return_timestamps=True
        )

        end.record()

        # Both events must have completed before elapsed_time can be read.
        torch.cuda.synchronize()
        timer = start.elapsed_time(end)/1000  # milliseconds -> seconds

        # Use the peak during inference, not what is still allocated afterwards:
        # temporary activations are already freed once the call has returned.
        gpu_mem_inf = torch.cuda.max_memory_allocated('cuda:0')
        delta_gpu_mem = gpu_mem_inf-gpu_mem_loaded

        # extract audio file duration and compute Real-Time Factor
        audio_info = sf.info(audio_path)
        rtf = timer/audio_info.duration

        # compute Word Error Rate; the reference transcript shares the audio
        # file's base name. splitext copes with extensions of any length.
        reference_name = os.path.splitext(audio_file)[0] + '.txt'
        with open(os.path.join('references', reference_name), 'r') as f:
            reference = f.read().replace('\n', ' ')

        # Remove leading whitespace from the hypothesis. lstrip() is also safe
        # when the transcription is empty, where indexing [0] would raise.
        hypothesis = output['text'].lstrip()
        wer_result = wer(reference, hypothesis)

        result = {
                  'Model': model + model_suffix,
                  'Audio file': audio_file,
                  'WER': round(wer_result, 5),
                  'RTF': round(rtf, 5),
                  'GPU memory while loaded [MB]': round(gpu_mem_loaded*1e-6, 5),
                  'GPU memory while inference [MB]': round(gpu_mem_inf*1e-6, 5),
                  'Delta GPU memory [MB]': round(delta_gpu_mem*1e-6, 5),
        }

        # Append after every file so partial results survive a crash mid-run.
        with open(results_file, 'a') as infile:
            for key, value in result.items():
                infile.write(f'{key:<32}: {value}\n')
            infile.write(underline)

    # Reported once per model, after all of its audio files have been processed.
    print(f'Model {model} finished')

Write results to file