In [1]:
from datasets import load_dataset
from transformers import WhisperConfig, WhisperForConditionalGeneration, WhisperProcessor

from dataclasses import dataclass

import torch
from torch.utils.data import DataLoader
import numpy as np

import time
from tqdm import tqdm
import subprocess as sp
import os
import sched

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Dummy LibriSpeech data ("clean" config, validation split) used as the benchmark input.
librispeech = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy",
    "clean",
    split="validation",
)

Found cached dataset librispeech_asr_dummy (/home/sanchit_huggingface_co/.cache/huggingface/datasets/hf-internal-testing___librispeech_asr_dummy/clean/2.1.0/d3bc4c2bc2078fcde3ad0f0f635862e4c0fef78ba94c4a34c4c250a097af240b)


In [3]:
# The processor (feature extractor + tokenizer) is the same for all models
# benchmarked here, so it is loaded a single time.
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-tiny.en",
)

def preprocess(batch):
    """Convert one audio example into Whisper input features.

    Args:
        batch: A single dataset example. Its ``"audio"`` entry must expose the
            waveform under ``"array"`` and the waveform's rate under
            ``"sampling_rate"``.

    Returns:
        The same example with an added ``"input_features"`` entry holding the
        first (and only) item of the processor's ``input_features`` output.
    """
    audio = batch["audio"]
    # Forward the clip's own sampling rate instead of assuming 16 kHz, so the
    # feature extractor can reject audio at a rate it does not support rather
    # than silently featurizing it as if it were 16 kHz.
    extracted = processor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_tensors="pt",
    )
    # A single clip was passed in, so drop the leading batch dimension.
    batch["input_features"] = extracted.input_features[0]
    return batch

# Featurize every example; all original columns are removed, leaving only
# what `preprocess` adds.
dataset_processed = librispeech.map(
    preprocess,
    remove_columns=librispeech.column_names,
)

Loading cached processed dataset at /home/sanchit_huggingface_co/.cache/huggingface/datasets/hf-internal-testing___librispeech_asr_dummy/clean/2.1.0/d3bc4c2bc2078fcde3ad0f0f635862e4c0fef78ba94c4a34c4c250a097af240b/cache-471c43302a97ea10.arrow


In [4]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """
    Collate a list of examples into one padded batch of input features.

    Attributes:
        processor: Processor whose ``feature_extractor.pad`` builds the batch.
    """

    processor: WhisperProcessor

    def __call__(self, features):
        """Pad the ``"input_features"`` of every example and return the batch as PyTorch tensors."""
        # Keep only the key the feature extractor needs from each example.
        to_pad = []
        for example in features:
            to_pad.append({"input_features": example["input_features"]})

        extractor = self.processor.feature_extractor
        return extractor.pad(to_pad, return_tensors="pt")

In [5]:
# Collator that pads with the shared processor's feature extractor.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [6]:
# Serve the featurized dataset in torch format, one example per batch.
dataloader = DataLoader(
    dataset_processed.with_format("torch"),
    batch_size=1,
    collate_fn=data_collator,
)

In [7]:
def get_gpu_memory():
    """Return the used-memory figure nvidia-smi reports for each GPU.

    Python equivalent of ``nvidia-smi --query-gpu=memory.used --format=csv``,
    adapted from https://stackoverflow.com/a/67722676

    Returns:
        list[int]: one integer per data row printed by nvidia-smi, in output
        order. Only the leading number of each row is kept; the unit suffix
        is discarded.

    Raises:
        RuntimeError: if nvidia-smi exits with a non-zero status. The original
            ``CalledProcessError`` is attached as ``__cause__``.
    """
    COMMAND = "nvidia-smi --query-gpu=memory.used --format=csv"

    try:
        raw_output = sp.check_output(COMMAND.split(), stderr=sp.STDOUT)
    except sp.CalledProcessError as e:
        raise RuntimeError(
            "command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)
        ) from e

    # splitlines() keeps the final row even when the output has no trailing
    # newline and also copes with "\r\n" endings; blank lines are ignored.
    rows = [line for line in raw_output.decode('ascii').splitlines() if line.strip()]

    # The first row is the CSV header; every following row starts with the number.
    return [int(row.split()[0]) for row in rows[1:]]

In [8]:
# Checkpoint suffixes to benchmark; each is appended to "openai/whisper-" when loading.
whisper_checkpoints = ["tiny.en", "base.en"]

# Per-checkpoint result slots, each starting out as its own empty list.
runtime_results = dict((name, []) for name in whisper_checkpoints)
vram_results = dict((name, []) for name in whisper_checkpoints)

In [9]:
# Benchmark generation with the cache enabled (use_cache=True) for each checkpoint.
for checkpoint in whisper_checkpoints:
    print(50*"=", checkpoint, 50*"=")
    checkpoint_id = f"openai/whisper-{checkpoint}"

    # Load the checkpoint, move it to the GPU and cast it to half precision.
    model = WhisperForConditionalGeneration.from_pretrained(checkpoint_id)
    model.to("cuda")
    model.half()

    # CUDA work is queued asynchronously: drain anything still pending from the
    # model setup before starting the clock, and drain again before stopping it,
    # so the interval covers exactly the generation loop.
    torch.cuda.synchronize()
    start = time.perf_counter()  # monotonic clock, intended for measuring intervals
    for batch in tqdm(dataloader):
        predicted_ids = model.generate(batch["input_features"].to("cuda").half(), max_new_tokens=128, use_cache=True)
    torch.cuda.synchronize()
    runtime = time.perf_counter() - start

    runtime_results[checkpoint] = runtime
    # First value returned by get_gpu_memory(), sampled once after the loop.
    # NOTE(review): this is a point-in-time reading, not a peak - confirm that
    # is the intended metric.
    vram_results[checkpoint] = get_gpu_memory()[0]

    # Drop the model and release cached GPU blocks before the next checkpoint.
    del model
    torch.cuda.empty_cache()



100%|███████████████████████████████████████████████████████████████████████████████████| 73/73 [00:09<00:00,  7.33it/s]




100%|███████████████████████████████████████████████████████████████████████████████████| 73/73 [00:11<00:00,  6.47it/s]


In [10]:
# Elapsed seconds of the generation loop per checkpoint (run with use_cache=True).
runtime_results

{'tiny.en': 9.957613468170166, 'base.en': 11.280346632003784}

In [11]:
# First value from get_gpu_memory() per checkpoint, read right after its
# generation loop (nvidia-smi memory.used; presumably MiB - confirm).
vram_results

{'tiny.en': 2760, 'base.en': 2902}

In [12]:
# repeat experiment - this time disable cache during generation
# NOTE: the results are written to the same `runtime_results` / `vram_results`
# keys as the cached run, so the earlier numbers are overwritten.
for checkpoint in whisper_checkpoints:
    print(50*"=", checkpoint, 50*"=")
    checkpoint_id = f"openai/whisper-{checkpoint}"

    # Load the checkpoint, move it to the GPU and cast it to half precision.
    model = WhisperForConditionalGeneration.from_pretrained(checkpoint_id)
    model.to("cuda")
    model.half()

    # CUDA work is queued asynchronously: drain anything still pending from the
    # model setup before starting the clock, and drain again before stopping it,
    # so the interval covers exactly the generation loop.
    torch.cuda.synchronize()
    start = time.perf_counter()  # monotonic clock, intended for measuring intervals
    for batch in tqdm(dataloader):
        predicted_ids = model.generate(batch["input_features"].to("cuda").half(), max_new_tokens=128, use_cache=False)
    torch.cuda.synchronize()
    runtime = time.perf_counter() - start

    runtime_results[checkpoint] = runtime
    # First value returned by get_gpu_memory(), sampled once after the loop.
    # NOTE(review): this is a point-in-time reading, not a peak - confirm that
    # is the intended metric.
    vram_results[checkpoint] = get_gpu_memory()[0]

    # Drop the model and release cached GPU blocks before the next checkpoint.
    del model
    torch.cuda.empty_cache()



100%|███████████████████████████████████████████████████████████████████████████████████| 73/73 [00:12<00:00,  5.80it/s]




100%|███████████████████████████████████████████████████████████████████████████████████| 73/73 [00:18<00:00,  3.97it/s]


In [13]:
# Elapsed seconds of the generation loop per checkpoint, now from the
# use_cache=False run (the cached-run values were overwritten).
runtime_results

{'tiny.en': 12.600297212600708, 'base.en': 18.400490760803223}

In [14]:
# First value from get_gpu_memory() per checkpoint after the use_cache=False
# run (nvidia-smi memory.used; presumably MiB - confirm).
vram_results

{'tiny.en': 2760, 'base.en': 2902}