In [6]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset

# Load the pretrained Whisper "tiny" checkpoint and its matching processor.
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
# NOTE(review): presumably clears a forced language/task prompt stored in the config so that
# generate() chooses it itself — confirm against the installed transformers version.
model.config.forced_decoder_ids = None

# Load a small dummy dataset and turn the first audio sample into model input features.
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
sample = ds[0]["audio"]
input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features

# Generate token ids from the input features.
predicted_ids = model.generate(input_features)

# Decode the token ids to text twice: once keeping the special tokens and once stripping them.
# Each result gets its own name so the first decode is no longer silently overwritten;
# `transcription` still ends up holding the text without special tokens, as before.
transcription_with_special_tokens = processor.batch_decode(predicted_ids, skip_special_tokens=False)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [None]:
# Exploratory inspection: list the attribute and method names available on `model`.
dir(model)

In [None]:
import torch
import onnx

# Dummy inputs used only for tracing.
# The Whisper forward pass needs decoder inputs in addition to the audio features: exporting
# with the features alone fails with
# "You have to specify either decoder_input_ids or decoder_inputs_embeds".
dummy_input_features = torch.randn(1, 80, 3000)
dummy_decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]], dtype=torch.long)

# Disable the key/value cache while tracing so cache objects are not part of the traced outputs,
# and restore the previous setting afterwards even if the export raises.
previous_use_cache = model.config.use_cache
model.config.use_cache = False
try:
    with torch.no_grad():
        torch.onnx.export(
            model,  # model being run
            # positional inputs followed by a dict, which the exporter treats as keyword arguments
            (dummy_input_features, {"decoder_input_ids": dummy_decoder_input_ids}),
            "whisper_model.onnx",  # where to save the model (can be a file or file-like object)
            export_params=True,  # store the trained parameter weights inside the model file
            opset_version=13,  # the ONNX version to export the model to
            input_names=["input_features", "decoder_input_ids"],  # graph inputs, in forward order
            output_names=["logits"],  # names the first output; any others keep generated names
        )
finally:
    model.config.use_cache = previous_use_cache

In [1]:
from typing import Dict, List

import onnx
import torch
import whisper
from torch.onnx import TrainingMode

In [None]:
# NOTE(review): absolute, machine-specific checkpoint path. Within the visible notebook it is
# only referenced by a commented-out torch.load call; prefer a configurable, relative path.
model_path = '/home/mat/Documents/whisper_triton/tiny.en.pt'

In [2]:
# Scratch placeholders; `inputs_pytorch` is rebound to a dict of dummy tensors in a later cell.
dynamic_axis = dict()
# NOTE(review): torch.Tensor([1,512]) builds a 1-D float tensor holding the two values 1 and 512,
# not a tensor of shape (1, 512) — confirm which one was intended.
inputs_pytorch = torch.Tensor([1,512])
# model = torch.load(model_path)
# Loads the "tiny" checkpoint through the openai-whisper package. The name `model` is also bound
# to the Hugging Face WhisperForConditionalGeneration instance in another cell, so which object
# later cells see depends on the order in which the cells are executed.
model = whisper.load_model("tiny")


  checkpoint = torch.load(fp, map_location=device)


In [None]:
def convert_to_onnx(
    model_pytorch: torch.nn.Module,
    output_path: str,
    inputs_pytorch: Dict[str, torch.Tensor],
    quantization: bool,
    var_output_seq: bool,
    output_names: List[str],
    load_external_data: bool = False,
) -> None:
    """
    Convert a Pytorch model to an ONNX graph by tracing the provided input inside the Pytorch code.
    Pytorch sometimes fails to infer output tensor shape of models
    In ONNX graph, some axis name may be marked like "Divoutput_dim_1" which is a generated name,
    and there may be a warning:
    ** "WARNING: The shape inference of prim::Constant type is missing, so it may result in wrong shape inference
    for the exported graph. Please consider adding it in symbolic function." **
    ex.: https://discuss.pytorch.org/t/bidirectional-lstm-and-onnx-runtime-warnings/136374

    Side effects: int64 tensors stored in `inputs_pytorch` are replaced in place by int32 copies.
    `config.use_cache` (when present) and the fake-quantization flag (when `quantization` is set)
    are switched only for the duration of the export and restored afterwards, even on failure.

    :param model_pytorch: Pytorch model (transformers)
    :param output_path: where to save ONNX file
    :param inputs_pytorch: Tensor, can be dummy data, shape is not important as we declare all axes as dynamic.
    Should be on the same device than the model (CPU or GPU)
    :param quantization: model is quantized
    :param var_output_seq: variable size sequence
    :param output_names: list of output names in ONNX model
    :param load_external_data: forwarded to `onnx.load` when the exported file is read back
    :raises ImportError: if `quantization` is True and pytorch-quantization is not installed
    """
    if quantization:
        try:
            from pytorch_quantization.nn import TensorQuantizer
        except ImportError as err:
            raise ImportError(
                "It seems that pytorch-quantization is not yet installed. "
                "It is required when you enable the quantization flag and use CUDA device. "
                "Please find installation instructions on "
                "https://github.com/NVIDIA/TensorRT/tree/main/tools/pytorch-quantization or use:\n"
                "pip3 install git+ssh://git@github.com/NVIDIA/TensorRT#egg=pytorch-quantization\\&"
                "subdirectory=tools/pytorch-quantization/"
            ) from err

    # dynamic axis == variable length axis
    dynamic_axis = dict()
    for k in inputs_pytorch.keys():
        if var_output_seq:
            # seq axis name is fixed to be matched with output seq axis name (for output shape prediction)
            dynamic_axis[k] = {0: "batch_size", 1: "sequence"}
        else:
            # if there is no specific requirement, each axis name is unique, fix some issue on T5 model
            dynamic_axis[k] = {0: "batch_size", 1: f"sequence-{k}"}
    for output_name in output_names:
        dynamic_axis[output_name] = {0: "batch_size"}
        if var_output_seq:
            dynamic_axis[output_name][1] = "sequence"

    # replace int64 input tensors by int32 -> for ONNX Runtime binding API and expected by TensorRT engine
    # (the caller's dict is updated in place, as before)
    for k, v in list(inputs_pytorch.items()):
        if isinstance(v, torch.Tensor) and v.dtype == torch.int64:
            inputs_pytorch[k] = v.type(torch.int32)

    # get input names in the same order as in the model forward;
    # co_varnames also lists local variables, so only the leading argument names are considered
    forward_code = model_pytorch.forward.__code__
    forward_args = forward_code.co_varnames[: forward_code.co_argcount + forward_code.co_kwonlyargcount]
    input_names = [arg_name for arg_name in forward_args if arg_name in inputs_pytorch]
    # keys not found in the forward signature keep their dict order after the matched ones
    # (sentence transformer model forward is kargs and kwargs, so nothing matches there)
    input_names.extend(k for k in inputs_pytorch.keys() if k not in input_names)
    # pass the tensors in exactly the order of their names, so that a dict whose key order
    # differs from the forward signature cannot attach a name to the wrong tensor
    export_args = tuple(inputs_pytorch[name] for name in input_names)

    has_use_cache = hasattr(model_pytorch, "config") and hasattr(model_pytorch.config, "use_cache")
    previous_use_cache = None
    if quantization:
        TensorQuantizer.use_fb_fake_quant = True
    if has_use_cache:
        previous_use_cache = getattr(model_pytorch.config, "use_cache")
        setattr(model_pytorch.config, "use_cache", False)
    try:
        with torch.no_grad():
            torch.onnx.export(
                model_pytorch,  # model to optimize
                args=export_args,  # tuple of multiple inputs
                f=output_path,  # output path / file object
                opset_version=13,  # the ONNX version to use, >= 13 supports channel quantized model
                do_constant_folding=True,  # simplify model (replace constant expressions)
                input_names=input_names,  # input names
                output_names=output_names,  # output names
                dynamic_axes=dynamic_axis,  # declare dynamix axis for each input / output
                training=TrainingMode.EVAL,  # always put the model in evaluation mode
                verbose=False,
            )
        proto = onnx.load(output_path, load_external_data=load_external_data)
        # NOTE(review): save_onnx is not defined in the visible part of this notebook — confirm it is in scope.
        save_onnx(proto=proto, model_path=output_path)
    finally:
        # restore global / model state even when the export fails
        if quantization:
            TensorQuantizer.use_fb_fake_quant = False
        if has_use_cache:
            setattr(model_pytorch.config, "use_cache", previous_use_cache)


In [14]:
import numpy as np

# Random dummy tensors used only to trace the model during export.
# NOTE(review): despite the key names, the first tensor is float data of shape (1, 80, 3000) and
# the second is random floats of shape (1, 3000). The recorded export with these inputs failed
# asking for decoder_input_ids, so the names and contents should be confirmed for this model.
inputs = {
    "input_ids": torch.tensor(np.random.randn(1, 80, 3000), dtype=torch.float),
    "attention_mask": torch.tensor(np.random.randn(1, 3000), dtype=torch.float),
}

# Names consumed by the export call further down.
inputs_pytorch = inputs
output_path = 'model.onnx'
model_pytorch = model



In [15]:
from typing import Dict

def convert_to_onnx(
    model_pytorch,
    output_path: str,
    inputs_pytorch: Dict[str, torch.Tensor],
    opset_version: int = 12,
) -> None:
    """
    Export a Pytorch model to an ONNX file by tracing it with two dummy inputs.

    The tensors stored under "input_ids" and "attention_mask" are passed positionally, in that
    order, to the model and give their names to the first two graph inputs. The first output is
    named "model_output". Axis 0 of every named tensor and axis 1 of both inputs are declared
    dynamic.

    :param model_pytorch: Pytorch model to export
    :param output_path: where to save the ONNX file (path or file-like object)
    :param inputs_pytorch: dummy tensors; must contain the keys "input_ids" and "attention_mask"
    :param opset_version: ONNX opset to target; defaults to 12, the value previously hard-coded
    :raises KeyError: if one of the two required keys is missing from `inputs_pytorch`
    """
    with torch.no_grad():
        torch.onnx.export(
            model_pytorch,  # model to optimize
            args=(inputs_pytorch["input_ids"], inputs_pytorch["attention_mask"]),  # tuple of multiple inputs
            f=output_path,  # output path / file object
            opset_version=opset_version,  # the ONNX version to use
            do_constant_folding=True,  # simplify model (replace constant expressions)
            input_names=["input_ids", "attention_mask"],  # input names
            output_names=["model_output"],
            dynamic_axes={  # declare dynamic axis for each input / output (dynamic axis == variable length axis)
                "input_ids": {0: "batch_size", 1: "sequence"},
                "attention_mask": {0: "batch_size", 1: "sequence"},
                "model_output": {0: "batch_size"},
            },
            verbose=False,
        )

In [16]:
# Export the model to `output_path` using the dummy inputs prepared above.
# NOTE(review): the recorded run of this cell ended with
# "ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds",
# i.e. the two positional tensors were not sufficient for this model's forward pass.
convert_to_onnx(model_pytorch, output_path,inputs_pytorch)

  if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):


ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds