In [1]:
from datasets import load_dataset
from transformers import WhisperConfig, WhisperForConditionalGeneration, WhisperProcessor

import torch
from torch.utils.data import DataLoader
import numpy as np

import time
from tqdm import tqdm
import subprocess as sp
import os
import sched

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
checkpoint_id = "openai/whisper-tiny.en"
processor = WhisperProcessor.from_pretrained(checkpoint_id)

In [3]:
librispeech = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

def preprocess(batch):    
    batch["input_features"] = processor(batch["audio"]["array"], sampling_rate=16000, return_tensors="pt").input_features[0]
    return batch

dataset_processed = librispeech.map(preprocess, remove_columns=librispeech.column_names)

dataloader = DataLoader(dataset_processed.with_format("torch"), batch_size=1)

Found cached dataset librispeech_asr_dummy (/home/sanchit_huggingface_co/.cache/huggingface/datasets/hf-internal-testing___librispeech_asr_dummy/clean/2.1.0/d3bc4c2bc2078fcde3ad0f0f635862e4c0fef78ba94c4a34c4c250a097af240b)
Loading cached processed dataset at /home/sanchit_huggingface_co/.cache/huggingface/datasets/hf-internal-testing___librispeech_asr_dummy/clean/2.1.0/d3bc4c2bc2078fcde3ad0f0f635862e4c0fef78ba94c4a34c4c250a097af240b/cache-471c43302a97ea10.arrow


In [4]:
def get_gpu_memory():
    """
    Python equivalent of nvidia-smi, copied from https://stackoverflow.com/a/67722676
    and verified as being equivalent ✅
    """
    output_to_list = lambda x: x.decode('ascii').split('\n')[:-1]
    
    COMMAND = "nvidia-smi --query-gpu=memory.used --format=csv"
    
    try:
        memory_use_info = output_to_list(sp.check_output(COMMAND.split(),stderr=sp.STDOUT))[1:]
    
    except sp.CalledProcessError as e:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
    
    memory_use_values = [int(x.split()[0]) for i, x in enumerate(memory_use_info)]
    return memory_use_values

In [12]:
model = WhisperForConditionalGeneration.from_pretrained(checkpoint_id, use_cache=True)
model.to("cuda")
model.half()

start = time.time()
for batch in tqdm(dataloader):
    predicted_ids = model.generate(batch["input_features"].to("cuda").half(), max_new_tokens=128, use_cache=True)
runtime = time.time() - start

print("Runtime: ", runtime)
print("VRAM: ", get_gpu_memory()[0])

# if we don't delete and re-load the model the GPU use is lower the second time round: warm-up effects?
del model
torch.cuda.empty_cache()

100%|███████████████████████████████████████████████████████████████████████████████████| 73/73 [00:08<00:00,  8.12it/s]


Runtime:  8.990428924560547
VRAM:  1381


In [13]:
# repeat experiment - this time disable cache during generation
model = WhisperForConditionalGeneration.from_pretrained(checkpoint_id, use_cache=False)
model.to("cuda")
model.half()

start = time.time()
for batch in tqdm(dataloader):
    predicted_ids = model.generate(batch["input_features"].to("cuda").half(), max_new_tokens=128, use_cache=False)
runtime = time.time() - start

print("Runtime: ", runtime)
print("VRAM: ", get_gpu_memory()[0])

del model
torch.cuda.empty_cache()

100%|███████████████████████████████████████████████████████████████████████████████████| 73/73 [00:11<00:00,  6.09it/s]


Runtime:  11.993675231933594
VRAM:  1381
