In [1]:
import os
import torch
import soundfile as sf

from transformers import pipeline
from transformers.utils import is_flash_attn_2_available
from jiwer import wer

#### Load models

Load the different Whisper model sizes. For every model load and every inference run, compute and store the GPU memory usage, the word error rate (WER) and the real-time factor (RTF).

In [2]:
# Whisper checkpoints under test; these identifiers are passed as `model=`
# to the transformers pipeline further down.
tiny = 'openai/whisper-tiny'
base = 'openai/whisper-base'
small = 'openai/whisper-small'
medium = 'openai/whisper-medium'
large = 'openai/whisper-large-v3'

models = [tiny, base, small, medium, large]

# The test recordings are read from ~/audio_files.
home_dir = os.path.expanduser('~')
audio_dir = os.path.join(home_dir, 'audio_files')

# Keep only regular, non-hidden files: sub-directories and entries such as
# '.DS_Store' would otherwise be handed to the audio reader and the
# reference-transcript lookup. Sorting gives a deterministic processing order.
audio_files = sorted(
    name for name in os.listdir(audio_dir)
    if not name.startswith('.') and os.path.isfile(os.path.join(audio_dir, name))
)

Create a file to store test results

In [3]:
# Plain-text report that the benchmark cells append their records to.
results_file = 'whisper_results.txt'

# Banner and separator line; the separator is reused after every record.
header = "******** Whisper Automatic Speech Recognition test ********\n"
underline = "___________________________________________________________\n"

# Discard the report left over from a previous run, if there is one.
if os.path.exists(results_file):
    os.remove(results_file)

# Start the new report with the banner followed by a separator line.
with open(results_file, 'a') as f:
    f.writelines([header, underline])

#### Testing the Transformers pipeline with flash attention (falls back to SDPA when FlashAttention-2 is not available)

In [4]:
# Benchmark every checkpoint in `models` on every file in `audio_files` with
# the transformers ASR pipeline, appending one record per (model, file) pair
# to `results_file`: WER, real-time factor and GPU memory after loading.

# Choose the attention backend once; it does not change between models.
attn_implementation = 'flash_attention_2' if is_flash_attn_2_available() else 'sdpa'
# Label the records with the backend that is actually used, so that a run
# that fell back to SDPA is not reported as a flash-attention run.
attn_label = 'flash' if attn_implementation == 'flash_attention_2' else 'sdpa'
device = 'cuda:0'

for model in models:
    torch.cuda.empty_cache()
    asr_pipeline = pipeline('automatic-speech-recognition',
                            model=model,
                            torch_dtype=torch.float16,
                            device=device,
                            model_kwargs={'attn_implementation': attn_implementation}
    )

    # check how much memory the model is using when loaded
    gpu_mem_loaded = torch.cuda.memory_allocated(device)

    for audio_file in audio_files:
        audio_path = os.path.join(home_dir, 'audio_files', audio_file)
        torch.cuda.empty_cache()

        # Time the whole pipeline call with a pair of CUDA events.
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()

        output = asr_pipeline(
            audio_path,
            chunk_length_s=30,
            batch_size=24,
            return_timestamps=True
        )

        end.record()

        # Both events must have completed before elapsed_time can be read;
        # elapsed_time is in milliseconds, so convert to seconds.
        torch.cuda.synchronize()
        timer = start.elapsed_time(end)/1000

        # Snapshot of the memory still allocated once inference has returned
        # (not a peak value). Only needed by the optional, currently disabled
        # report entries below.
        gpu_mem_inf = torch.cuda.memory_allocated(device)
        delta_gpu_mem = gpu_mem_inf-gpu_mem_loaded

        # extract audio file duration and compute Real-Time Factor
        audio_info = sf.info(audio_path)
        rtf = timer/audio_info.duration

        # compute Word Error Rate against references/<file stem>.txt;
        # splitext handles extensions of any length, unlike slicing off
        # the last four characters.
        reference_name = os.path.splitext(audio_file)[0]
        with open(f'references/{reference_name}.txt', 'r') as f:
            reference = f.read().replace('\n', ' ')

        # Drop surrounding whitespace from the transcription. Unlike indexing
        # the first character, this does not raise on an empty transcription.
        hypothesis = output['text'].strip()
        wer_result = wer(reference, hypothesis)

        result = {
                  'Model': f'{model}-{attn_label}',
                  'Audio file': audio_file,
                  'WER': round(wer_result, 5),
                  'RTF': round(rtf, 5),
                  'GPU memory while loaded [MB]': round(gpu_mem_loaded*1e-6, 5),
                #   'GPU memory while inference [MB]': round(gpu_mem_inf*1e-6, 5),
                #   'Delta GPU memory [MB]': round(delta_gpu_mem*1e-6, 5),
        }

        # Append after every file so that partial results survive a crash.
        with open(results_file, 'a') as infile:
            for key, value in result.items():
                infile.write(f'{key:<32}: {value}\n')
            infile.write(underline)

        print(f'Model {model} finished inference on {audio_file}')

    # Drop the reference to the pipeline before the next model is loaded.
    del asr_pipeline

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Model openai/whisper-tiny finished inference on 01.wav
Model openai/whisper-tiny finished inference on 02.wav
Model openai/whisper-tiny finished inference on 03.wav
Model openai/whisper-tiny finished inference on 04.wav
Model openai/whisper-tiny finished inference on 05.wav
Model openai/whisper-tiny finished inference on 06.wav
Model openai/whisper-tiny finished inference on 07.wav
Model openai/whisper-tiny finished inference on 08.wav
Model openai/whisper-tiny finished inference on 09.wav
Model openai/whisper-tiny finished inference on 10.wav


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Model openai/whisper-base finished inference on 01.wav


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Model openai/whisper-base finished inference on 02.wav
Model openai/whisper-base finished inference on 03.wav
Model openai/whisper-base finished inference on 04.wav
Model openai/whisper-base finished inference on 05.wav
Model openai/whisper-base finished inference on 06.wav
Model openai/whisper-base finished inference on 07.wav
Model openai/whisper-base finished inference on 08.wav
Model openai/whisper-base finished inference on 09.wav
Model openai/whisper-base finished inference on 10.wav


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Model openai/whisper-small finished inference on 01.wav
Model openai/whisper-small finished inference on 02.wav
Model openai/whisper-small finished inference on 03.wav
Model openai/whisper-small finished inference on 04.wav
Model openai/whisper-small finished inference on 05.wav
Model openai/whisper-small finished inference on 06.wav
Model openai/whisper-small finished inference on 07.wav


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Model openai/whisper-small finished inference on 08.wav
Model openai/whisper-small finished inference on 09.wav
Model openai/whisper-small finished inference on 10.wav


model.safetensors:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Model openai/whisper-medium finished inference on 01.wav
Model openai/whisper-medium finished inference on 02.wav
Model openai/whisper-medium finished inference on 03.wav
Model openai/whisper-medium finished inference on 04.wav
Model openai/whisper-medium finished inference on 05.wav
Model openai/whisper-medium finished inference on 06.wav
Model openai/whisper-medium finished inference on 07.wav
Model openai/whisper-medium finished inference on 08.wav
Model openai/whisper-medium finished inference on 09.wav
Model openai/whisper-medium finished inference on 10.wav


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

Model openai/whisper-large-v3 finished inference on 01.wav


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Model openai/whisper-large-v3 finished inference on 02.wav
Model openai/whisper-large-v3 finished inference on 03.wav
Model openai/whisper-large-v3 finished inference on 04.wav
Model openai/whisper-large-v3 finished inference on 05.wav
Model openai/whisper-large-v3 finished inference on 06.wav


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Model openai/whisper-large-v3 finished inference on 07.wav
Model openai/whisper-large-v3 finished inference on 08.wav
Model openai/whisper-large-v3 finished inference on 09.wav
Model openai/whisper-large-v3 finished inference on 10.wav


#### Testing the `whisper` package (`load_model` / `transcribe`) without flash attention

In [5]:
from whisper import load_model

# Model names handed to `load_model` in the next cell.
# NOTE: this rebinds `models`, replacing the 'openai/whisper-*' identifiers
# that the transformers pipeline cell iterates over.
models = [
    'tiny',
    'base',
    'small',
    'medium',
    'large',
]

In [6]:
# Benchmark every model in `models` on every file in `audio_files` with the
# `whisper` package, appending one record per (model, file) pair to
# `results_file`: WER, real-time factor and GPU memory after loading.
device = 'cuda:0'

for model in models:
    torch.cuda.empty_cache()
    # load_model returns a model object (not a transformers pipeline),
    # hence the name.
    asr_model = load_model(model).to(device)

    # check how much memory the model is using when loaded
    gpu_mem_loaded = torch.cuda.memory_allocated(device)

    for audio_file in audio_files:
        audio_path = os.path.join(home_dir, 'audio_files', audio_file)
        torch.cuda.empty_cache()

        # Time the whole transcribe call with a pair of CUDA events.
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()

        output = asr_model.transcribe(audio_path, fp16=True)

        end.record()

        # Both events must have completed before elapsed_time can be read;
        # elapsed_time is in milliseconds, so convert to seconds.
        torch.cuda.synchronize()
        timer = start.elapsed_time(end)/1000

        # Snapshot of the memory still allocated once inference has returned
        # (not a peak value). Only needed by the optional, currently disabled
        # report entries below.
        gpu_mem_inf = torch.cuda.memory_allocated(device)
        delta_gpu_mem = gpu_mem_inf-gpu_mem_loaded

        # extract audio file duration and compute Real-Time Factor
        audio_info = sf.info(audio_path)
        rtf = timer/audio_info.duration

        # compute Word Error Rate against references/<file stem>.txt;
        # splitext handles extensions of any length, unlike slicing off
        # the last four characters.
        reference_name = os.path.splitext(audio_file)[0]
        with open(f'references/{reference_name}.txt', 'r') as f:
            reference = f.read().replace('\n', ' ')

        # Drop surrounding whitespace from the transcription. Unlike indexing
        # the first character, this does not raise on an empty transcription.
        hypothesis = output['text'].strip()
        wer_result = wer(reference, hypothesis)

        result = {
                  'Model': model,
                  'Audio file': audio_file,
                  'WER': round(wer_result, 5),
                  'RTF': round(rtf, 5),
                  'GPU memory while loaded [MB]': round(gpu_mem_loaded*1e-6, 5),
                #   'GPU memory while inference [MB]': round(gpu_mem_inf*1e-6, 5),
                #   'Delta GPU memory [MB]': round(delta_gpu_mem*1e-6, 5),
        }

        # Append after every file so that partial results survive a crash.
        with open(results_file, 'a') as infile:
            for key, value in result.items():
                infile.write(f'{key:<32}: {value}\n')
            infile.write(underline)

        print(f'Model {model} finished inference on {audio_file}')

    # Drop the reference to the model before the next one is loaded.
    del asr_model

Model tiny finished inference on 01.wav
Model tiny finished inference on 02.wav
Model tiny finished inference on 03.wav
Model tiny finished inference on 04.wav
Model tiny finished inference on 05.wav
Model tiny finished inference on 06.wav
Model tiny finished inference on 07.wav
Model tiny finished inference on 08.wav
Model tiny finished inference on 09.wav
Model tiny finished inference on 10.wav
Model base finished inference on 01.wav
Model base finished inference on 02.wav
Model base finished inference on 03.wav
Model base finished inference on 04.wav
Model base finished inference on 05.wav
Model base finished inference on 06.wav
Model base finished inference on 07.wav
Model base finished inference on 08.wav
Model base finished inference on 09.wav
Model base finished inference on 10.wav
Model small finished inference on 01.wav
Model small finished inference on 02.wav
Model small finished inference on 03.wav
Model small finished inference on 04.wav
Model small finished inference on 05