From a272d1f6b34dce64e1ff957fa200c3c224ece003 Mon Sep 17 00:00:00 2001 From: Tanisha Chawada Date: Mon, 22 Sep 2025 18:27:28 +0000 Subject: [PATCH 01/11] Added support for AutoModelForCTC class Signed-off-by: Tanisha Chawada --- QEfficient/__init__.py | 2 + QEfficient/base/__init__.py | 1 + .../transformers/models/modeling_auto.py | 538 ++++++++++++++++++ QEfficient/utils/constants.py | 2 + 4 files changed, 543 insertions(+) diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index be4b86321..b5e4ef429 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -43,6 +43,7 @@ def check_qaic_sdk(): from QEfficient.base import ( QEFFAutoModel, QEFFAutoModelForCausalLM, + QEFFAutoModelForCTC, QEFFAutoModelForImageTextToText, QEFFAutoModelForSpeechSeq2Seq, QEFFCommonLoader, @@ -63,6 +64,7 @@ def check_qaic_sdk(): "cloud_ai_100_exec_kv", "QEFFAutoModel", "QEFFAutoModelForCausalLM", + "QEFFAutoModelForCTC", "QEffAutoPeftModelForCausalLM", "QEFFAutoModelForImageTextToText", "QEFFAutoModelForSpeechSeq2Seq", diff --git a/QEfficient/base/__init__.py b/QEfficient/base/__init__.py index d29ca7d29..d106a0759 100644 --- a/QEfficient/base/__init__.py +++ b/QEfficient/base/__init__.py @@ -9,6 +9,7 @@ from QEfficient.transformers.models.modeling_auto import ( # noqa: F401 QEFFAutoModel, QEFFAutoModelForCausalLM, + QEFFAutoModelForCTC, QEFFAutoModelForImageTextToText, QEFFAutoModelForSpeechSeq2Seq, ) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index cdacc7760..abcb03f2c 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -16,6 +16,7 @@ from transformers import ( AutoModel, AutoModelForCausalLM, + AutoModelForCTC, AutoModelForImageTextToText, AutoModelForSpeechSeq2Seq, PreTrainedTokenizer, @@ -3077,3 +3078,540 @@ def generate( generated_ids=generated_ids, perf_metrics=PerfMetrics(prefill_time, decode_perf, total_perf, total_time), ) + + +class QEFFAutoModelForSpeechSeq2Seq(QEFFTransformersBase, MultimodalUtilityMixin): + """ + The QEFFAutoModelForSpeechSeq2Seq class is designed for transformers models with a sequence-to-sequence speech-to-text modeling head, including Whisper and other Encoder-Decoder speech models. + Although it is possible to initialize the class directly, we highly recommend using the ``from_pretrained`` method for initialization. + + ``Mandatory`` Args: + :model (nn.Module): PyTorch model + + .. code-block:: python + + from QEfficient import QEFFAutoModelForSpeechSeq2Seq + from processors import AutoProcessor + + # Initialize the model using from_pretrained similar to transformers.AutoModelForSpeechSeq2Seq. + model = QEFFAutoModelForSpeechSeq2Seq.from_pretrained("model_name") + + # Now you can directly compile the model for Cloud AI 100 + model.compile(num_cores=16, device_group=[0]) # Considering you have a Cloud AI 100 SKU + + #prepare inputs + processor = AutoProcessor.from_pretrained(model_name) + input_audio, sample_rate = [...] 
# audio data loaded in via some external audio package, such as librosa or soundfile + input_features = ( + processor(data, sampling_rate=sample_rate, return_tensors="pt").input_features.numpy().astype(np.float32) + ) + decoder_input_ids = ( + torch.ones((batch_size, 1), dtype=torch.int64) * model.model.config.decoder_start_token_id + ).numpy() + decoder_position_ids = torch.arange(1, dtype=torch.int64).view(1, 1).repeat(batch_size, 1).numpy() + inputs = dict( + input_features=input_features, + decoder_input_ids=decoder_input_ids, + decoder_position_ids=decoder_position_ids, + ) + + # You can now execute the model + model.generate(inputs, generation_len=150) + """ + + _hf_auto_class = AutoModelForSpeechSeq2Seq + _pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, KVCacheTransform] + _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] + + def __init__(self, model: nn.Module, **kwargs): + model_class_name = model.__class__.__name__ + if not (model_class_name.endswith("ForConditionalGeneration")): + raise TypeError(f"Required pytorch module with ForConditionalGeneration, got {model_class_name}") + + model.config.use_cache = True + super().__init__(model, **kwargs) + self.num_layers = model.config.num_hidden_layers + self.hash_params["qeff_auto_class"] = self.__class__.__name__ + + @property + def get_model_config(self) -> dict: + return self.model.config.__dict__ + + def export(self, export_dir: Optional[str] = None) -> str: + """ + Exports the model to ``ONNX`` format using ``torch.onnx.export``. + + ``Optional`` Args: + :export_dir (str, optional): The directory path to store ONNX-graph. + + Returns: + :str: Path of the generated ``ONNX`` graph. + """ + inputs = self.model.get_dummy_inputs() + dynamic_axes = self.model.get_onnx_dynamic_axes() + output_names = self.model.get_output_names() + return self._export(inputs, output_names, dynamic_axes, export_dir=export_dir) + + def compile( + self, + onnx_path: Optional[str] = None, + compile_dir: Optional[str] = None, + *, + prefill_seq_len: Optional[int] = 1, + encoder_ctx_len: Optional[int] = None, + ctx_len: int = 150, + full_batch_size: Optional[int] = None, + kv_cache_batch_size: Optional[int] = None, + batch_size: int = 1, + num_devices: int = 1, + num_cores: int = 16, # FIXME: Make this mandatory arg + mxfp6_matmul: bool = False, + mxint8_kv_cache: bool = False, + num_speculative_tokens: Optional[int] = None, + **compiler_options, + ) -> str: + """ + This method compiles the exported ``ONNX`` model using the Cloud AI 100 Platform SDK compiler binary found at ``/opt/qti-aic/exec/qaic-exec`` and generates a ``qpc`` package. + If the model has not been exported yet, this method will handle the export process. + You can pass any other arguments that the `qaic-exec` takes as extra kwargs. + + ``Optional`` Args: + :onnx_path (str, optional): Path to pre-exported onnx model. + :compile_dir (str, optional): Path for saving the qpc generated. + :encoder_ctx_len (int, optional): The maximum length of context for encoder, based on the AutoProcessor output. ``Defaults to checking config, if None in config then 1500`` + :ctx_len (int, optional): The maximum length of context to keep for decoding. ``Defaults to 150``. + :batch_size (int, optional): Batch size. ``Defaults to 1``. + :num_devices (int): Number of devices the model needs to be compiled for. Defaults to 1. + :num_cores (int): Number of cores used to compile the model. 
+ :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``. + :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``. + :compiler_options (dict, optional): Additional compiler options. ``Defaults to None``. + + - aic_hw_version=ai100 -> -aic-hw-version=ai100 + - aic_hw_version=ai200 -> -aic-hw-version=ai200 + + Other args are not yet implemented for AutoModelForSpeechSeq2Seq + Returns: + :str: Path of the compiled ``qpc`` package. + """ + specializations, compiler_options = self.model.get_specializations( + batch_size, + encoder_ctx_len, + ctx_len, + **compiler_options, + ) + + if full_batch_size: + logger.warning("Continuous batching is not yet enabled for AutoModelForSpeechSeq2Seq") + + if kv_cache_batch_size: + logger.warning("Prefix caching is not yet enabled for AutoModelForSpeechSeq2Seq") + + if mxint8_kv_cache: + logger.warning("mxint8 cache is not yet enabled for AutoModelForSpeechSeq2Seq") + + if num_speculative_tokens: + logger.warning("Speculative decoding is not yet enabled for AutoModelForSpeechSeq2Seq") + + output_names = self.model.get_output_names() + + kv_cache_dtype = "float16" + custom_io = {} + + custom_io["input_features"] = kv_cache_dtype + + # Slice output_names to get input names + for output_name in output_names: + if output_name.endswith("_RetainedState"): + custom_io[output_name[: -len("_RetainedState")]] = kv_cache_dtype + + # Get output names + for output_name in output_names: + if output_name.endswith("_RetainedState"): + custom_io[output_name] = kv_cache_dtype + + return self._compile( + onnx_path=onnx_path, + compile_dir=compile_dir, + compile_only=True, + retained_state=True, + specializations=specializations, + convert_to_fp16=True, + mxfp6_matmul=mxfp6_matmul, + mdp_ts_num_devices=num_devices, + aic_num_cores=num_cores, + custom_io=custom_io, + **compiler_options, + ) + + def generate( + self, + inputs: torch.Tensor, + generation_len: int, + streamer: Optional[TextStreamer] = None, + device_ids: List[int] = None, + ) -> Union[torch.Tensor, np.ndarray]: + """ + This method generates output until ``endoftranscript`` or ``generation_len`` by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. + This is a sequential execution based on the ``batch_size`` of the compiled model and the number of audio tensor passed. + + ``Mandatory`` Args: + :processor: autoprocessor to process inputs and decode logits + :inputs (torch.Tensor): inputs to run the execution. + :generation_len (int): length upto which to generate + :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model + Returns: + :dict: Output from the ``AI_100`` or ``PyTorch`` runtime. 
+ """ + if not isinstance(self.qpc_path, Path): + raise TypeError("Please run compile API first!") + + inputs = self.auto_correct_inputs(inputs) + if self.qpc_session is None: + self.qpc_session = QAICInferenceSession(str(self.qpc_path), device_ids) + self.batch_size = self.qpc_session.bindings[0].dims[0] + + inputs["input_features"] = inputs["input_features"].numpy().astype(np.float16) + + # add start token id and initial position ids to inputs + seq_len = 1 + inputs["input_ids"] = ( + torch.ones((self.batch_size, seq_len), dtype=torch.int64) * self.model.config.decoder_start_token_id + ).numpy() + inputs["position_ids"] = ( + torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(self.batch_size, 1).numpy() + ) + + self.qpc_session.skip_buffers( + [x for x in self.qpc_session.input_names + self.qpc_session.output_names if x.startswith("past_")] + ) + + outputs = { + "logits": np.random.randn(self.batch_size, 1, self.model.config.vocab_size).astype(np.float32), + } + self.qpc_session.set_buffers(outputs) + + # encoder run + start = perf_counter() + outputs = self.qpc_session.run(inputs) + + # array to hold generated tokens + generated_ids = np.full((self.batch_size, generation_len + 1), self.model.config.eos_token_id) + generated_ids[:, 0] = [self.model.config.decoder_start_token_id] + logits = outputs["logits"] + next_token = logits.argmax(-1) + generated_ids[:, 1] = next_token.squeeze(1) + + if streamer: + streamer.put(next_token) + + inputs["input_features"] = np.zeros((self.batch_size, self.model.config.num_mel_bins, 1)).astype(np.float16) + + loop_start = perf_counter() + for num_tokens in range(generation_len): + outputs = self.qpc_session.run(inputs) + logits = outputs["logits"] + next_token = logits.argmax(-1) + generated_ids[:, num_tokens + 1] = next_token.squeeze(1) + + if next_token[0][0] == self.model.config.eos_token_id: + break + + inputs["input_ids"] = next_token + inputs["position_ids"] += 1 + + if streamer: + streamer.put(next_token) + end = perf_counter() + + prefill_time, decode_perf, total_perf, total_time = calculate_latency(num_tokens, loop_start, start, end) + + return CloudAI100ExecInfoNew( + batch_size=self.batch_size, + generated_ids=generated_ids, + perf_metrics=PerfMetrics(prefill_time, decode_perf, total_perf, total_time), + ) + + +class QEFFAutoModelForCTC(QEFFTransformersBase): + """ + The QEFFAutoModelForCTC class is designed for transformer models with a Connectionist Temporal Classification (CTC) speech-to-text head, + including Wav2Vec2 and other encoder-only speech models optimized for alignment-free transcription. + Although it is possible to initialize the class directly, we highly recommend using the ``from_pretrained`` method for initialization. + + ``Mandatory`` Args: + :model (nn.Module): PyTorch model + + .. code-block:: python + + import torchaudio + import soundfile + from QEfficient import QEFFAutoModelForCTC + from transformers import AutoProcessor + torchaudio.set_audio_backend("soundfile") + + # Initialize the model using from_pretrained similar to transformers.AutoModelForCTC. 
+ model=QEFFAutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h") + + # Now you can directly compile the model for Cloud AI 100 + model.compile(num_cores=16) # Considering you have a Cloud AI 100 SKU + + #prepare input + waveform,sample_rate=torchaudio.load("Path to .wav file.") + processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") + # Resample the waveform if necessary + if waveform.shape[0] > 1: + waveform = waveform.mean(dim=0) + if sample_rate != 16000: + resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000) + waveform = resampler(waveform) + + # You can now execute the model + out, logits = model.generate(processor,inputs=waveform) + """ + _hf_auto_class = AutoModelForCTC + _pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform] + _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] + + def __init__(self, model: nn.Module, **kwargs): + super().__init__(model, **kwargs) + self.model.base_model.config.use_cache = True + + self.hash_params["qeff_auto_class"] = self.__class__.__name__ + + @classmethod + @with_replaced_quantizers + def from_pretrained(cls, pretrained_model_name_or_path, pooling=None, *args, **kwargs): + """ + This method serves as the easiest entry point into using QEfficient. The interface is designed to be similar to transformers.AutoModelForCTC. + Once the model is initialized, you can use other methods such as export, compile, and generate on the same object. + + Args: + pretrained_model_name_or_path (str): The name or path of the pre-trained model. + + .. code-block:: python + + import torchaudio + import soundfile + from QEfficient import QEFFAutoModelForCTC + from transformers import AutoProcessor + torchaudio.set_audio_backend("soundfile") + + # Initialize the model using from_pretrained similar to transformers.AutoModelForCTC. 
+ model=QEFFAutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h") + + # Now you can directly compile the model for Cloud AI 100 + model.compile(num_cores=16) # Considering you have a Cloud AI 100 SKU + + #prepare input + waveform,sample_rate=torchaudio.load("Path to .wav file.") + processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") + # Resample the waveform if necessary + if waveform.shape[0] > 1: + waveform = waveform.mean(dim=0) + if sample_rate != 16000: + resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000) + waveform = resampler(waveform) + + # You can now execute the model + out, logits = model.generate(processor,inputs=waveform) + """ + if kwargs.get("attn_implementation", None) not in {None, "eager"}: + logger.warning('Updating attn_implementation="eager"') + + if kwargs.get("low_cpu_mem_usage", None): + logger.warning("Updating low_cpu_mem_usage=False") + + kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) + + model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + + # This is support models that should be classified to in a different auto class but transformers load them via this class + kv_offload = kwargs.pop("kv_offload", None) + if model.__class__.__name__ in MISCLASSIFIED_CAUSAL_LM_TO_QEFF_AUTO_CLASS_MAP: + return MISCLASSIFIED_CAUSAL_LM_TO_QEFF_AUTO_CLASS_MAP[model.__class__.__name__]( + model, kv_offload=kv_offload, **kwargs + ) + + return cls(model, pretrained_model_name_or_path=pretrained_model_name_or_path, pooling=pooling, **kwargs) + + @property + def get_model_config(self) -> dict: + return self.model.config.__dict__ + + def export(self, export_dir: Optional[str] = None) -> str: + """ + Exports the model to ``ONNX`` format using ``torch.onnx.export``. + + ``Optional`` Args: + :export_dir (str, optional): The directory path to store ONNX-graph. + + Returns: + :str: Path of the generated ``ONNX`` graph. + """ + bs = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE + seq_len = constants.WAV2VEC2_MAX_SEQ_LEN + + example_inputs = { + "input_values": torch.zeros((bs, seq_len), dtype=torch.float32), + } + + dynamic_axes = {"input_values": {0: "batch_size", 1: "seq_len"}} + + output_names = ["logits"] + + return self._export( + example_inputs, + output_names, + dynamic_axes, + export_dir=export_dir, + ) + + def compile( + self, + onnx_path: Optional[str] = None, + compile_dir: Optional[str] = None, + *, + seq_len: Union[int, List[int]] = 480000, + batch_size: int = 1, + num_devices: int = 1, + num_cores: int = 16, # FIXME: Make this mandatory arg + mxfp6_matmul: bool = False, + **compiler_options, + ) -> str: + """ + This method compiles the exported ``ONNX`` model using the Cloud AI 100 Platform SDK compiler binary found at ``/opt/qti-aic/exec/qaic-exec`` and generates a ``qpc`` package. + If the model has not been exported yet, this method will handle the export process. + You can pass any other arguments that the `qaic-exec` takes as extra kwargs. + + ``Optional`` Args: + :onnx_path (str, optional): Path to pre-exported onnx model. + :compile_dir (str, optional): Path for saving the qpc generated. + :seq_len (Union[int, List[int]]): The length of the prompt should be less that ``seq_len``. ``Defaults to 32``. + :batch_size (int, optional): Batch size. ``Defaults to 1``. + :num_devices (int): Number of devices the model needs to be compiled for. Defaults to 1. + :num_cores (int): Number of cores used to compile the model. 
+ :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``. + :compiler_options (dict, optional): Additional compiler options. + + For QAIC Compiler: Extra arguments for qaic-exec can be passed. + :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``. + :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.`` + + Params are converted to flags as below: + + - aic_hw_version=ai100 -> -aic-hw-version=ai100 + - aic_hw_version=ai200 -> -aic-hw-version=ai200 + + For QNN Compiler: Following arguments can be passed. + :enable_qnn (bool): Enables QNN Compilation. + :qnn_config (str): Path of QNN Config parameters file. Any extra parameters for QNN compilation can be passed via this file. + + Returns: + :str: Path of the compiled ``qpc`` package. + """ + + specializations = [ + {"batch_size": batch_size, "seq_len": sl} for sl in (seq_len if isinstance(seq_len, list) else [seq_len]) + ] + + return self._compile( + onnx_path=onnx_path, + compile_dir=compile_dir, + compile_only=True, + specializations=specializations, + convert_to_fp16=True, + mxfp6_matmul=mxfp6_matmul, + mdp_ts_num_devices=num_devices, + aic_num_cores=num_cores, + **compiler_options, + ) + + def generate( + self, + processor, + inputs: torch.Tensor, + device_ids: List[int] = None, + runtime_ai100: bool = True, + ) -> Union[torch.Tensor, np.ndarray]: + """ + This method generates output by executing PyTorch runtime or the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. + ``Mandatory`` Args: + :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution. + :processor (AutoProcessor): The Processor to use for encoding the waveform. + ``optional`` Args: + :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model + :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime. + Returns: + :dict: Output from the ``AI_100`` or ``PyTorch`` runtime. + """ + # AI_100 runtime + if runtime_ai100: + if not isinstance(self.qpc_path, Path): + raise TypeError("Please run compile API first!") + + return self.cloud_ai_100_feature_generate(processor,inputs=inputs, device_ids=device_ids) + # PyTorch runtime + else: + return self.pytorch_feature_generate(processor,model=self.model, inputs=inputs) + + def cloud_ai_100_feature_generate( + self, + processor, + inputs: torch.Tensor, + device_ids: List[int] = [0], + ) -> np.ndarray: + """ + Generates features with list of prompts using AI 100 runtime. + + ``Mandatory`` Args: + :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution. + :processor (AutoProcessor): The Processor to use for encoding the waveform. + ``Optional`` Args: + device_ids (List[int], optional): A list of device IDs to use for the session. Defaults to [0]. 
+ + """ + + if self.qpc_session is None: + self.qpc_session = QAICInferenceSession(str(self.qpc_path), device_ids) + self.batch_size = self.qpc_session.bindings[0].dims[0] + + # Dynamic switching to closest seq_Len based on input_ids_len + inputs=processor(inputs, return_tensors="pt") + input_ids_len = inputs["input_values"].shape[-1] + + for allowed_shape in self.qpc_session.allowed_shapes: + seq_len_allowed = allowed_shape[1][1][1] + + if seq_len_allowed >= input_ids_len: + self.seq_len = seq_len_allowed + break + + # To handle single seq_len as we can't fetch allowed shapes for single seq_len + self.seq_len = self.qpc_session.bindings[0].dims[1] if not hasattr(self, "seq_len") else self.seq_len + input_values = np.array( + torch.nn.functional.pad(inputs["input_values"], (0, self.seq_len - input_ids_len), "constant", 0) + ) + inputs = dict(input_values=input_values) + outputs = self.qpc_session.run(inputs) + logits=outputs['logits'] + predicted_ids = np.argmax(logits, axis=-1) + transcriptions = processor.decode(torch.tensor(predicted_ids[0])) + return transcriptions, logits + + def pytorch_feature_generate(self,processor, model, inputs: Union[torch.Tensor, np.ndarray]) -> List[torch.Tensor]: + """ + Generates features from a list of text prompts using a PyTorch model. + + ``Mandatory`` Args: + :model: The transformed PyTorch model used for generating features. + :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution. + :processor (AutoProcessor): The Processor to use for encoding the waveform. + + """ + input_values=processor(inputs[0],return_tensors="pt", max_length=self.seq_len,truncation=True,padding='max_length').input_values + logits=model(input_values[0]).logits + logits=logits.detach().numpy() + predicted_ids=np.argmax(logits,axis=-1) + transcriptions=processor.batch_decode(predicted_ids) + return transcriptions, logits \ No newline at end of file diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index f8552b169..daf9de0c0 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -122,6 +122,8 @@ def get_models_dir(): # Gemma3 Constant GEMMA3_MAX_POSITION_EMBEDDINGS = 32768 +#Wav2Vec2 Constant +WAV2VEC2_MAX_SEQ_LEN = 480000 class Constants: # Export Constants. From 366ccbff44cc2d5953bc35ddabcab551ad429c4a Mon Sep 17 00:00:00 2001 From: Tanisha Chawada Date: Mon, 22 Sep 2025 18:30:38 +0000 Subject: [PATCH 02/11] Added support for AutoModelForCTC class Signed-off-by: Tanisha Chawada --- .../transformers/models/modeling_auto.py | 253 ------------------ 1 file changed, 253 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index abcb03f2c..e98c7cd14 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -3080,259 +3080,6 @@ def generate( ) -class QEFFAutoModelForSpeechSeq2Seq(QEFFTransformersBase, MultimodalUtilityMixin): - """ - The QEFFAutoModelForSpeechSeq2Seq class is designed for transformers models with a sequence-to-sequence speech-to-text modeling head, including Whisper and other Encoder-Decoder speech models. - Although it is possible to initialize the class directly, we highly recommend using the ``from_pretrained`` method for initialization. - - ``Mandatory`` Args: - :model (nn.Module): PyTorch model - - .. 
code-block:: python - - from QEfficient import QEFFAutoModelForSpeechSeq2Seq - from processors import AutoProcessor - - # Initialize the model using from_pretrained similar to transformers.AutoModelForSpeechSeq2Seq. - model = QEFFAutoModelForSpeechSeq2Seq.from_pretrained("model_name") - - # Now you can directly compile the model for Cloud AI 100 - model.compile(num_cores=16, device_group=[0]) # Considering you have a Cloud AI 100 SKU - - #prepare inputs - processor = AutoProcessor.from_pretrained(model_name) - input_audio, sample_rate = [...] # audio data loaded in via some external audio package, such as librosa or soundfile - input_features = ( - processor(data, sampling_rate=sample_rate, return_tensors="pt").input_features.numpy().astype(np.float32) - ) - decoder_input_ids = ( - torch.ones((batch_size, 1), dtype=torch.int64) * model.model.config.decoder_start_token_id - ).numpy() - decoder_position_ids = torch.arange(1, dtype=torch.int64).view(1, 1).repeat(batch_size, 1).numpy() - inputs = dict( - input_features=input_features, - decoder_input_ids=decoder_input_ids, - decoder_position_ids=decoder_position_ids, - ) - - # You can now execute the model - model.generate(inputs, generation_len=150) - """ - - _hf_auto_class = AutoModelForSpeechSeq2Seq - _pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, KVCacheTransform] - _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] - - def __init__(self, model: nn.Module, **kwargs): - model_class_name = model.__class__.__name__ - if not (model_class_name.endswith("ForConditionalGeneration")): - raise TypeError(f"Required pytorch module with ForConditionalGeneration, got {model_class_name}") - - model.config.use_cache = True - super().__init__(model, **kwargs) - self.num_layers = model.config.num_hidden_layers - self.hash_params["qeff_auto_class"] = self.__class__.__name__ - - @property - def get_model_config(self) -> dict: - return self.model.config.__dict__ - - def export(self, export_dir: Optional[str] = None) -> str: - """ - Exports the model to ``ONNX`` format using ``torch.onnx.export``. - - ``Optional`` Args: - :export_dir (str, optional): The directory path to store ONNX-graph. - - Returns: - :str: Path of the generated ``ONNX`` graph. - """ - inputs = self.model.get_dummy_inputs() - dynamic_axes = self.model.get_onnx_dynamic_axes() - output_names = self.model.get_output_names() - return self._export(inputs, output_names, dynamic_axes, export_dir=export_dir) - - def compile( - self, - onnx_path: Optional[str] = None, - compile_dir: Optional[str] = None, - *, - prefill_seq_len: Optional[int] = 1, - encoder_ctx_len: Optional[int] = None, - ctx_len: int = 150, - full_batch_size: Optional[int] = None, - kv_cache_batch_size: Optional[int] = None, - batch_size: int = 1, - num_devices: int = 1, - num_cores: int = 16, # FIXME: Make this mandatory arg - mxfp6_matmul: bool = False, - mxint8_kv_cache: bool = False, - num_speculative_tokens: Optional[int] = None, - **compiler_options, - ) -> str: - """ - This method compiles the exported ``ONNX`` model using the Cloud AI 100 Platform SDK compiler binary found at ``/opt/qti-aic/exec/qaic-exec`` and generates a ``qpc`` package. - If the model has not been exported yet, this method will handle the export process. - You can pass any other arguments that the `qaic-exec` takes as extra kwargs. - - ``Optional`` Args: - :onnx_path (str, optional): Path to pre-exported onnx model. - :compile_dir (str, optional): Path for saving the qpc generated. 
- :encoder_ctx_len (int, optional): The maximum length of context for encoder, based on the AutoProcessor output. ``Defaults to checking config, if None in config then 1500`` - :ctx_len (int, optional): The maximum length of context to keep for decoding. ``Defaults to 150``. - :batch_size (int, optional): Batch size. ``Defaults to 1``. - :num_devices (int): Number of devices the model needs to be compiled for. Defaults to 1. - :num_cores (int): Number of cores used to compile the model. - :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``. - :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``. - :compiler_options (dict, optional): Additional compiler options. ``Defaults to None``. - - - aic_hw_version=ai100 -> -aic-hw-version=ai100 - - aic_hw_version=ai200 -> -aic-hw-version=ai200 - - Other args are not yet implemented for AutoModelForSpeechSeq2Seq - Returns: - :str: Path of the compiled ``qpc`` package. - """ - specializations, compiler_options = self.model.get_specializations( - batch_size, - encoder_ctx_len, - ctx_len, - **compiler_options, - ) - - if full_batch_size: - logger.warning("Continuous batching is not yet enabled for AutoModelForSpeechSeq2Seq") - - if kv_cache_batch_size: - logger.warning("Prefix caching is not yet enabled for AutoModelForSpeechSeq2Seq") - - if mxint8_kv_cache: - logger.warning("mxint8 cache is not yet enabled for AutoModelForSpeechSeq2Seq") - - if num_speculative_tokens: - logger.warning("Speculative decoding is not yet enabled for AutoModelForSpeechSeq2Seq") - - output_names = self.model.get_output_names() - - kv_cache_dtype = "float16" - custom_io = {} - - custom_io["input_features"] = kv_cache_dtype - - # Slice output_names to get input names - for output_name in output_names: - if output_name.endswith("_RetainedState"): - custom_io[output_name[: -len("_RetainedState")]] = kv_cache_dtype - - # Get output names - for output_name in output_names: - if output_name.endswith("_RetainedState"): - custom_io[output_name] = kv_cache_dtype - - return self._compile( - onnx_path=onnx_path, - compile_dir=compile_dir, - compile_only=True, - retained_state=True, - specializations=specializations, - convert_to_fp16=True, - mxfp6_matmul=mxfp6_matmul, - mdp_ts_num_devices=num_devices, - aic_num_cores=num_cores, - custom_io=custom_io, - **compiler_options, - ) - - def generate( - self, - inputs: torch.Tensor, - generation_len: int, - streamer: Optional[TextStreamer] = None, - device_ids: List[int] = None, - ) -> Union[torch.Tensor, np.ndarray]: - """ - This method generates output until ``endoftranscript`` or ``generation_len`` by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. - This is a sequential execution based on the ``batch_size`` of the compiled model and the number of audio tensor passed. - - ``Mandatory`` Args: - :processor: autoprocessor to process inputs and decode logits - :inputs (torch.Tensor): inputs to run the execution. - :generation_len (int): length upto which to generate - :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model - Returns: - :dict: Output from the ``AI_100`` or ``PyTorch`` runtime. 
- """ - if not isinstance(self.qpc_path, Path): - raise TypeError("Please run compile API first!") - - inputs = self.auto_correct_inputs(inputs) - if self.qpc_session is None: - self.qpc_session = QAICInferenceSession(str(self.qpc_path), device_ids) - self.batch_size = self.qpc_session.bindings[0].dims[0] - - inputs["input_features"] = inputs["input_features"].numpy().astype(np.float16) - - # add start token id and initial position ids to inputs - seq_len = 1 - inputs["input_ids"] = ( - torch.ones((self.batch_size, seq_len), dtype=torch.int64) * self.model.config.decoder_start_token_id - ).numpy() - inputs["position_ids"] = ( - torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(self.batch_size, 1).numpy() - ) - - self.qpc_session.skip_buffers( - [x for x in self.qpc_session.input_names + self.qpc_session.output_names if x.startswith("past_")] - ) - - outputs = { - "logits": np.random.randn(self.batch_size, 1, self.model.config.vocab_size).astype(np.float32), - } - self.qpc_session.set_buffers(outputs) - - # encoder run - start = perf_counter() - outputs = self.qpc_session.run(inputs) - - # array to hold generated tokens - generated_ids = np.full((self.batch_size, generation_len + 1), self.model.config.eos_token_id) - generated_ids[:, 0] = [self.model.config.decoder_start_token_id] - logits = outputs["logits"] - next_token = logits.argmax(-1) - generated_ids[:, 1] = next_token.squeeze(1) - - if streamer: - streamer.put(next_token) - - inputs["input_features"] = np.zeros((self.batch_size, self.model.config.num_mel_bins, 1)).astype(np.float16) - - loop_start = perf_counter() - for num_tokens in range(generation_len): - outputs = self.qpc_session.run(inputs) - logits = outputs["logits"] - next_token = logits.argmax(-1) - generated_ids[:, num_tokens + 1] = next_token.squeeze(1) - - if next_token[0][0] == self.model.config.eos_token_id: - break - - inputs["input_ids"] = next_token - inputs["position_ids"] += 1 - - if streamer: - streamer.put(next_token) - end = perf_counter() - - prefill_time, decode_perf, total_perf, total_time = calculate_latency(num_tokens, loop_start, start, end) - - return CloudAI100ExecInfoNew( - batch_size=self.batch_size, - generated_ids=generated_ids, - perf_metrics=PerfMetrics(prefill_time, decode_perf, total_perf, total_time), - ) - - class QEFFAutoModelForCTC(QEFFTransformersBase): """ The QEFFAutoModelForCTC class is designed for transformer models with a Connectionist Temporal Classification (CTC) speech-to-text head, From 25f3c20cbfa95d36f23940d6e610e8566fd7cac3 Mon Sep 17 00:00:00 2001 From: Tanisha Chawada Date: Mon, 22 Sep 2025 18:31:35 +0000 Subject: [PATCH 03/11] Added support for AutoModelForCTC class Signed-off-by: Tanisha Chawada --- QEfficient/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index b5e4ef429..43177d88d 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -9,14 +9,14 @@ import warnings from QEfficient.utils import custom_format_warning - +import QEfficient.utils.model_registery # noqa: F401 +from QEfficient.utils.logging_utils import logger # For faster downloads via hf_transfer # This code is put above import statements as this needs to be executed before # hf_transfer is imported (will happen on line 15 via leading imports) os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" # Placeholder for all non-transformer models registered in QEfficient -import QEfficient.utils.model_registery # noqa: F401 -from QEfficient.utils.logging_utils 
import logger + # custom warning for the better logging experience warnings.formatwarning = custom_format_warning From a3b78554fec8a111997ba30b8d14b2986b197095 Mon Sep 17 00:00:00 2001 From: Tanisha Chawada Date: Mon, 22 Sep 2025 18:32:18 +0000 Subject: [PATCH 04/11] Added support for AutoModelForCTC class Signed-off-by: Tanisha Chawada --- examples/lora_models.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/lora_models.py b/examples/lora_models.py index a578c8350..9a98f9201 100644 --- a/examples/lora_models.py +++ b/examples/lora_models.py @@ -109,7 +109,8 @@ Content: -A new study has found that the brain of an old person can still make new neurons. The study was conducted by a team of researchers from the University of California, Los Angeles. The team studied the brains that were able to make new neurons. The team found that the brains of these people were able to make new neurons in the hippocampus, which is the part of the brain that is responsible for memory and learning. The team also found that the brains of these people were able to make new neurons in the cortex, which is the part of the brain that is responsible for thinking and reasoning. The team also found that the brains of these people were able to make new neurons in the cerebellum, which +A new study has found that the brain of an old person can still make new neurons. The study was conducted by a team of researchers from the University of California, Los Angeles. The team studied the brains that were able to make new neurons. The team found that the brains of these people were able to make new neurons in the hippocampus, which is the part of the brain that is responsible for memory and learning. +The team also found that the brains of these people were able to make new neurons in the cortex, which is the part of the brain that is responsible for thinking and reasoning. The team also found that the brains of these people were able to make new neurons in the cerebellum, which <5> James slept 2/3 * 9 = <<2/3*9=6>>6 hours. From c6c94b57db5f94b45d2e7d21eeb105b0d953eca1 Mon Sep 17 00:00:00 2001 From: Tanisha Chawada Date: Mon, 22 Sep 2025 18:37:20 +0000 Subject: [PATCH 05/11] Solving lint error Signed-off-by: Tanisha Chawada --- QEfficient/__init__.py | 3 ++- pyproject.toml | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index 43177d88d..33c6f5588 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -8,9 +8,10 @@ import os import warnings -from QEfficient.utils import custom_format_warning import QEfficient.utils.model_registery # noqa: F401 +from QEfficient.utils import custom_format_warning from QEfficient.utils.logging_utils import logger + # For faster downloads via hf_transfer # This code is put above import statements as this needs to be executed before # hf_transfer is imported (will happen on line 15 via leading imports) diff --git a/pyproject.toml b/pyproject.toml index ea3c3405d..d7a95aae6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,9 +63,9 @@ namespaces = false attr = "QEfficient.__version__" [tool.ruff] -line-length = 120 +line-length = 500 # Enable the isort rules. 
-lint.extend-select = ["I"] +extend-select = ["I"] target-version = "py310" [tool.pytest.ini_options] From b170ef6ea18a7e543643c835c303e2ffcdd7f677 Mon Sep 17 00:00:00 2001 From: Tanisha Chawada Date: Tue, 23 Sep 2025 11:19:00 +0000 Subject: [PATCH 06/11] Added support for wav2vec2 model Signed-off-by: Tanisha Chawada --- .../transformers/models/modeling_auto.py | 37 ++++++++++--------- QEfficient/utils/constants.py | 3 +- pyproject.toml | 4 +- 3 files changed, 24 insertions(+), 20 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index e98c7cd14..41dc05615 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -3082,7 +3082,7 @@ def generate( class QEFFAutoModelForCTC(QEFFTransformersBase): """ - The QEFFAutoModelForCTC class is designed for transformer models with a Connectionist Temporal Classification (CTC) speech-to-text head, + The QEFFAutoModelForCTC class is designed for transformer models with a Connectionist Temporal Classification (CTC) speech-to-text head, including Wav2Vec2 and other encoder-only speech models optimized for alignment-free transcription. Although it is possible to initialize the class directly, we highly recommend using the ``from_pretrained`` method for initialization. @@ -3091,7 +3091,7 @@ class QEFFAutoModelForCTC(QEFFTransformersBase): .. code-block:: python - import torchaudio + import torchaudio import soundfile from QEfficient import QEFFAutoModelForCTC from transformers import AutoProcessor @@ -3105,7 +3105,7 @@ class QEFFAutoModelForCTC(QEFFTransformersBase): #prepare input waveform,sample_rate=torchaudio.load("Path to .wav file.") - processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") + processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") # Resample the waveform if necessary if waveform.shape[0] > 1: waveform = waveform.mean(dim=0) @@ -3116,6 +3116,7 @@ class QEFFAutoModelForCTC(QEFFTransformersBase): # You can now execute the model out, logits = model.generate(processor,inputs=waveform) """ + _hf_auto_class = AutoModelForCTC _pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform] _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] @@ -3138,7 +3139,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, pooling=None, *args, **k .. 
code-block:: python - import torchaudio + import torchaudio import soundfile from QEfficient import QEFFAutoModelForCTC from transformers import AutoProcessor @@ -3152,7 +3153,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, pooling=None, *args, **k #prepare input waveform,sample_rate=torchaudio.load("Path to .wav file.") - processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") + processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") # Resample the waveform if necessary if waveform.shape[0] > 1: waveform = waveform.mean(dim=0) @@ -3297,10 +3298,10 @@ def generate( if not isinstance(self.qpc_path, Path): raise TypeError("Please run compile API first!") - return self.cloud_ai_100_feature_generate(processor,inputs=inputs, device_ids=device_ids) + return self.cloud_ai_100_feature_generate(processor, inputs=inputs, device_ids=device_ids) # PyTorch runtime else: - return self.pytorch_feature_generate(processor,model=self.model, inputs=inputs) + return self.pytorch_feature_generate(processor, model=self.model, inputs=inputs) def cloud_ai_100_feature_generate( self, @@ -3316,7 +3317,7 @@ def cloud_ai_100_feature_generate( :processor (AutoProcessor): The Processor to use for encoding the waveform. ``Optional`` Args: device_ids (List[int], optional): A list of device IDs to use for the session. Defaults to [0]. - + """ if self.qpc_session is None: @@ -3324,7 +3325,7 @@ def cloud_ai_100_feature_generate( self.batch_size = self.qpc_session.bindings[0].dims[0] # Dynamic switching to closest seq_Len based on input_ids_len - inputs=processor(inputs, return_tensors="pt") + inputs = processor(inputs, return_tensors="pt") input_ids_len = inputs["input_values"].shape[-1] for allowed_shape in self.qpc_session.allowed_shapes: @@ -3341,12 +3342,12 @@ def cloud_ai_100_feature_generate( ) inputs = dict(input_values=input_values) outputs = self.qpc_session.run(inputs) - logits=outputs['logits'] + logits = outputs["logits"] predicted_ids = np.argmax(logits, axis=-1) transcriptions = processor.decode(torch.tensor(predicted_ids[0])) return transcriptions, logits - def pytorch_feature_generate(self,processor, model, inputs: Union[torch.Tensor, np.ndarray]) -> List[torch.Tensor]: + def pytorch_feature_generate(self, processor, model, inputs: Union[torch.Tensor, np.ndarray]) -> List[torch.Tensor]: """ Generates features from a list of text prompts using a PyTorch model. @@ -3356,9 +3357,11 @@ def pytorch_feature_generate(self,processor, model, inputs: Union[torch.Tensor, :processor (AutoProcessor): The Processor to use for encoding the waveform. 
""" - input_values=processor(inputs[0],return_tensors="pt", max_length=self.seq_len,truncation=True,padding='max_length').input_values - logits=model(input_values[0]).logits - logits=logits.detach().numpy() - predicted_ids=np.argmax(logits,axis=-1) - transcriptions=processor.batch_decode(predicted_ids) - return transcriptions, logits \ No newline at end of file + input_values = processor( + inputs[0], return_tensors="pt", max_length=self.seq_len, truncation=True, padding="max_length" + ).input_values + logits = model(input_values[0]).logits + logits = logits.detach().numpy() + predicted_ids = np.argmax(logits, axis=-1) + transcriptions = processor.batch_decode(predicted_ids) + return transcriptions, logits diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index daf9de0c0..06f4f8138 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -122,9 +122,10 @@ def get_models_dir(): # Gemma3 Constant GEMMA3_MAX_POSITION_EMBEDDINGS = 32768 -#Wav2Vec2 Constant +# Wav2Vec2 Constant WAV2VEC2_MAX_SEQ_LEN = 480000 + class Constants: # Export Constants. SEQ_LEN = 32 diff --git a/pyproject.toml b/pyproject.toml index d7a95aae6..ea3c3405d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,9 +63,9 @@ namespaces = false attr = "QEfficient.__version__" [tool.ruff] -line-length = 500 +line-length = 120 # Enable the isort rules. -extend-select = ["I"] +lint.extend-select = ["I"] target-version = "py310" [tool.pytest.ini_options] From f0e1bee51bec3035d18a49c6ad86beb74e0ad99f Mon Sep 17 00:00:00 2001 From: Tanisha Chawada Date: Mon, 29 Sep 2025 07:10:17 +0000 Subject: [PATCH 07/11] Added test for AutoModelForCTC class. Signed-off-by: Tanisha Chawada --- .../transformers/models/modeling_auto.py | 45 ++-- QEfficient/utils/constants.py | 2 +- examples/lora_models.py | 3 +- examples/wav2vec2_example/README.md | 21 ++ .../run_wav2vec2_inference.py | 24 +++ .../models/test_audio_embedding_models.py | 199 ++++++++++++++++++ 6 files changed, 267 insertions(+), 27 deletions(-) create mode 100644 examples/wav2vec2_example/README.md create mode 100644 examples/wav2vec2_example/run_wav2vec2_inference.py create mode 100644 tests/transformers/models/test_audio_embedding_models.py diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 41dc05615..3b306e256 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -3090,31 +3090,29 @@ class QEFFAutoModelForCTC(QEFFTransformersBase): :model (nn.Module): PyTorch model .. code-block:: python - import torchaudio - import soundfile from QEfficient import QEFFAutoModelForCTC from transformers import AutoProcessor - torchaudio.set_audio_backend("soundfile") # Initialize the model using from_pretrained similar to transformers.AutoModelForCTC. - model=QEFFAutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h") + model=QEFFAutoModelForCTC.from_pretrained(model_name) # Now you can directly compile the model for Cloud AI 100 model.compile(num_cores=16) # Considering you have a Cloud AI 100 SKU #prepare input - waveform,sample_rate=torchaudio.load("Path to .wav file.") - processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") - # Resample the waveform if necessary - if waveform.shape[0] > 1: - waveform = waveform.mean(dim=0) + processor = AutoProcessor.from_pretrained(model_name) + input_audio, sample_rate = [...] 
# audio data loaded in via some external audio package, such as librosa or soundfile + + # Resample the input_audio if necessary + if input_audio.shape[0] > 1: + input_audio = input_audio.mean(dim=0) if sample_rate != 16000: resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000) - waveform = resampler(waveform) + input_audio = resampler(input_audio) # You can now execute the model - out, logits = model.generate(processor,inputs=waveform) + out = model.generate(processor,inputs=input_audio) """ _hf_auto_class = AutoModelForCTC @@ -3140,29 +3138,28 @@ def from_pretrained(cls, pretrained_model_name_or_path, pooling=None, *args, **k .. code-block:: python import torchaudio - import soundfile from QEfficient import QEFFAutoModelForCTC from transformers import AutoProcessor - torchaudio.set_audio_backend("soundfile") # Initialize the model using from_pretrained similar to transformers.AutoModelForCTC. - model=QEFFAutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h") + model=QEFFAutoModelForCTC.from_pretrained(model_name) # Now you can directly compile the model for Cloud AI 100 model.compile(num_cores=16) # Considering you have a Cloud AI 100 SKU #prepare input - waveform,sample_rate=torchaudio.load("Path to .wav file.") - processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") - # Resample the waveform if necessary - if waveform.shape[0] > 1: - waveform = waveform.mean(dim=0) + processor = AutoProcessor.from_pretrained(model_name) + input_audio, sample_rate = [...] # audio data loaded in via some external audio package, such as librosa or soundfile + + # Resample the input_audio if necessary + if input_audio.shape[0] > 1: + input_audio = input_audio.mean(dim=0) if sample_rate != 16000: resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000) - waveform = resampler(waveform) + input_audio = resampler(input_audio) # You can now execute the model - out, logits = model.generate(processor,inputs=waveform) + out = model.generate(processor,inputs=input_audio) """ if kwargs.get("attn_implementation", None) not in {None, "eager"}: logger.warning('Updating attn_implementation="eager"') @@ -3344,8 +3341,8 @@ def cloud_ai_100_feature_generate( outputs = self.qpc_session.run(inputs) logits = outputs["logits"] predicted_ids = np.argmax(logits, axis=-1) - transcriptions = processor.decode(torch.tensor(predicted_ids[0])) - return transcriptions, logits + transcriptions = processor.batch_decode(torch.tensor(predicted_ids)) + return transcriptions def pytorch_feature_generate(self, processor, model, inputs: Union[torch.Tensor, np.ndarray]) -> List[torch.Tensor]: """ @@ -3364,4 +3361,4 @@ def pytorch_feature_generate(self, processor, model, inputs: Union[torch.Tensor, logits = logits.detach().numpy() predicted_ids = np.argmax(logits, axis=-1) transcriptions = processor.batch_decode(predicted_ids) - return transcriptions, logits + return transcriptions diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 06f4f8138..57fba282b 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -123,7 +123,7 @@ def get_models_dir(): GEMMA3_MAX_POSITION_EMBEDDINGS = 32768 # Wav2Vec2 Constant -WAV2VEC2_MAX_SEQ_LEN = 480000 +WAV2VEC2_MAX_SEQ_LEN = 480000 # 30 seconds of audio at 16 kHz sampling rate (16,000 samples/sec × 30 sec) class Constants: diff --git a/examples/lora_models.py b/examples/lora_models.py index 9a98f9201..a578c8350 100644 --- a/examples/lora_models.py +++ b/examples/lora_models.py @@ 
-109,8 +109,7 @@ Content: -A new study has found that the brain of an old person can still make new neurons. The study was conducted by a team of researchers from the University of California, Los Angeles. The team studied the brains that were able to make new neurons. The team found that the brains of these people were able to make new neurons in the hippocampus, which is the part of the brain that is responsible for memory and learning. -The team also found that the brains of these people were able to make new neurons in the cortex, which is the part of the brain that is responsible for thinking and reasoning. The team also found that the brains of these people were able to make new neurons in the cerebellum, which +A new study has found that the brain of an old person can still make new neurons. The study was conducted by a team of researchers from the University of California, Los Angeles. The team studied the brains that were able to make new neurons. The team found that the brains of these people were able to make new neurons in the hippocampus, which is the part of the brain that is responsible for memory and learning. The team also found that the brains of these people were able to make new neurons in the cortex, which is the part of the brain that is responsible for thinking and reasoning. The team also found that the brains of these people were able to make new neurons in the cerebellum, which <5> James slept 2/3 * 9 = <<2/3*9=6>>6 hours. diff --git a/examples/wav2vec2_example/README.md b/examples/wav2vec2_example/README.md new file mode 100644 index 000000000..fba8d9ad2 --- /dev/null +++ b/examples/wav2vec2_example/README.md @@ -0,0 +1,21 @@ +# Speech Recognition with Wav2Vec2 +This directory contains an example script of how to use the AutoModelForCTC class. (for now, Wav2Vec2 models on audio <30 seconds only has been validated) + +## Required packages: +- `librosa==0.10.2` +- `soundfile==0.13.1` + +You can install them using pip: +```sh +pip install librosa==0.10.2 soundfile==0.13.1 +``` + +To run example script after package installations: +```sh +python run_wav2vec2_inference.py +``` + +Expected output for given data sample: +```sh +MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL +``` \ No newline at end of file diff --git a/examples/wav2vec2_example/run_wav2vec2_inference.py b/examples/wav2vec2_example/run_wav2vec2_inference.py new file mode 100644 index 000000000..961aabeb8 --- /dev/null +++ b/examples/wav2vec2_example/run_wav2vec2_inference.py @@ -0,0 +1,24 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +from datasets import load_dataset +from transformers import AutoProcessor + +from QEfficient import QEFFAutoModelForCTC + +base_model_name = "facebook/wav2vec2-base-960h" + +ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") +data = ds[0]["audio"]["array"] +# reshape to so shape corresponds to data with batch size 1 +data = data.reshape(-1) +sample_rate = ds[0]["audio"]["sampling_rate"] +processor = AutoProcessor.from_pretrained(base_model_name) + +model = QEFFAutoModelForCTC.from_pretrained(base_model_name) +model.compile(num_cores=16) +print(model.generate(processor, inputs=data)) diff --git a/tests/transformers/models/test_audio_embedding_models.py b/tests/transformers/models/test_audio_embedding_models.py new file mode 100644 index 000000000..5df7c1405 --- /dev/null +++ b/tests/transformers/models/test_audio_embedding_models.py @@ -0,0 +1,199 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import os +from typing import List, Optional + +import numpy as np +import onnx +import onnxruntime +import pytest +import torch +import torchaudio +from datasets import load_dataset +from transformers import AutoModelForCTC, AutoProcessor + +from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCTC +from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers +from QEfficient.utils import hf_download +from QEfficient.utils._utils import create_json, load_hf_processor +from QEfficient.utils.constants import QnnConstants +from QEfficient.utils.device_utils import get_available_device_id + +test_models = [ + "facebook/wav2vec2-base-960h", +] + + +def load_CTC_model(model_config): + """ + Function to load model from huggingface + -------- + + :model_config: Dict + + :return model_hf, params + """ + model_path = hf_download( + repo_id=model_config["model_name"], + ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], + ) + model_hf = AutoModelForCTC.from_pretrained( + model_path, + attn_implementation="eager", + low_cpu_mem_usage=False, + ) # Run models for single layers only + params = sum(p.numel() for p in model_hf.parameters()) + model_hf.eval() + return model_hf, params + + +def run_CTC_pytorch_hf(model, processor: AutoProcessor, inputs: np.ndarray, sample_rate: int) -> List[str]: + """ + Run pytorch inference on model + + ``Mandatory`` Args: + :model: The transformed PyTorch model used for generating transcripts + :processor: autoprocessor to process inputs and decode logits + :inputs (np.ndarray): inputs to run the execution. + :sample_rate (int): the sample rate for the audio file + + + + Returns: + torch.Tensor: A list of output features generated by the model for each prompt. 
+    """
+    seq_len = 480000
+
+    # prepare inputs
+    input_features = processor(
+        inputs[0], return_tensors="pt", max_length=seq_len, truncation=True, padding="max_length"
+    ).input_values
+
+    model_inputs = dict(
+        input_values=input_features,
+    )
+    outputs = model(**model_inputs).logits
+    return outputs
+
+
+def run_CTC_ort(onnx_path, config, processor: AutoProcessor, inputs: np.ndarray, sample_rate: int) -> List[str]:
+    """
+    Run ONNX Runtime inference on the exported model
+
+    ``Mandatory`` Args:
+        :onnx_path (str): Path to the exported ONNX model used for generating transcripts
+        :processor: autoprocessor to process inputs and decode logits
+        :inputs (np.ndarray): inputs to run the execution.
+        :sample_rate (int): sampling rate at which input audio is stored in inputs (needed for processor)
+
+    Returns:
+        torch.Tensor: Logits generated by the ONNX model for the given audio input.
+    """
+    seq_len = 480000
+
+    # Replace invalid index value for INT32 max to 0 using add_initializer
+    m = onnx.load(onnx_path, load_external_data=False)
+    # NOTE: OrtValue objects should be kept around until the session is run, hence this dict is required
+    added_initializers = {}
+    for node in m.graph.node:
+        if node.op_type == "Constant":
+            np_tensor = onnx.numpy_helper.to_array(node.attribute[0].t, os.path.dirname(onnx_path))
+            if len(np_tensor.shape) == 0 and np_tensor.item() == 2147483647:
+                added_initializers[node.output[0]] = onnxruntime.OrtValue.ortvalue_from_numpy(
+                    np.array(0, np_tensor.dtype)
+                )
+
+    session_options = onnxruntime.SessionOptions()
+    for name, value in added_initializers.items():
+        session_options.add_initializer(name, value)
+
+    session = onnxruntime.InferenceSession(onnx_path, session_options)
+
+    # prepare inputs
+    input_features = processor(
+        inputs[0], return_tensors="pt", max_length=seq_len, truncation=True, padding="max_length"
+    ).input_values
+
+    model_inputs = dict(input_values=(input_features).numpy())
+    outputs = session.run(None, model_inputs)
+    logits = outputs[0]
+    return logits
+
+
+def check_CTC_pytorch_vs_kv_vs_ort_vs_ai100(
+    model_name: str,
+    n_layer: int = 1,
+    enable_qnn: Optional[bool] = False,
+    qnn_config: Optional[str] = None,
+):
+    """
+    Validate the PyTorch model, the exported ONNX model, and the Cloud AI 100 model against each other
+    ``Mandatory`` Args:
+        :model_name (str): Hugging Face Model Card name, Example: ``facebook/wav2vec2-base-960h``
+        :n_layer (int): Number of layers for the Model.
+ """ + replace_transformers_quantizers() + model_config = {"model_name": model_name} + model_config["n_layer"] = n_layer + + model_hf, _ = load_CTC_model(model_config) + + processor = load_hf_processor(pretrained_model_name_or_path=model_name) + batch_size = 1 + + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + data = ds[0]["audio"]["array"] + data = torch.tensor(data).unsqueeze(0).numpy() + sample_rate = ds[0]["audio"]["sampling_rate"] + if sample_rate != 16000: + resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000) + data = resampler(data) + run_CTC_pytorch_hf(model_hf, processor, data, sample_rate) + qeff_model = QEFFAutoModelForCTC(model_hf, pretrained_model_name_or_path=model_name) + qeff_model.export() + run_CTC_ort(qeff_model.onnx_path, qeff_model.model.config, processor, data, sample_rate) + if not get_available_device_id(): + pytest.skip("No available devices to run model on Cloud AI 100") + qeff_model.compile( + num_cores=16, + batch_size=batch_size, + enable_qnn=enable_qnn, + qnn_config=qnn_config, + ) + cloud_ai_100_tokens = qeff_model.generate(processor, data) + assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) + + +@pytest.mark.on_qaic +@pytest.mark.parametrize("model_name", test_models) +def test_CTC_pytorch_vs_kv_vs_ort_vs_ai100(model_name): + """ + Test function to validate the PyTorch model, the PyTorch model the ONNX model, and the Cloud AI 100 model. + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` + """ + check_CTC_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=4) + + +@pytest.mark.on_qaic +@pytest.mark.qnn +@pytest.mark.skip(reason="Wav2Vec2 is currently not supported on QNN") +@pytest.mark.parametrize("model_name", test_models) +def test_CTC_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name): + """ + QNN Compilation path test. + Test function to validate the PyTorch model, the PyTorch model after the ONNX model, and the Cloud AI 100 model. + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` + """ + qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") + create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) + + check_CTC_pytorch_vs_kv_vs_ort_vs_ai100( + model_name=model_name, n_layer=4, enable_qnn=True, qnn_config=qnn_config_json_path + ) From b449da860cdb0c0992501b412066b664df3eb946 Mon Sep 17 00:00:00 2001 From: Tanisha Chawada Date: Mon, 29 Sep 2025 07:54:04 +0000 Subject: [PATCH 08/11] Added test for AutoModelForCTC class. 
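
This patch drops the torchaudio resampling step (the LibriSpeech dummy clips are already sampled at 16 kHz) and turns the earlier smoke run into transcript-level checks: the PyTorch, ONNX Runtime, and Cloud AI 100 logits are reduced with argmax and decoded through processor.batch_decode, so the assertions compare recognized text rather than raw floating-point values. For reference, a minimal sketch of the greedy CTC decoding that batch_decode performs for Wav2Vec2-style models is included below; treating id 0 as the blank/pad token is an assumption that holds for facebook/wav2vec2-base-960h but should normally be read from the tokenizer.

    import torch

    def greedy_ctc_decode(logits, id_to_token, blank_id=0):
        # logits: (num_frames, vocab_size) for a single utterance.
        # id_to_token can be built as {v: k for k, v in processor.tokenizer.get_vocab().items()}
        ids = torch.argmax(logits, dim=-1).tolist()  # best token per frame
        collapsed = [i for i, prev in zip(ids, [None] + ids[:-1]) if i != prev]  # merge repeats
        chars = [id_to_token[i] for i in collapsed if i != blank_id]  # drop CTC blanks
        return "".join(chars).replace("|", " ").strip()  # "|" is the word delimiter token

    # Roughly what the test does via the processor:
    #   transcript = processor.batch_decode(torch.argmax(logits, dim=-1))[0]
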
Signed-off-by: Tanisha Chawada --- .../models/test_audio_embedding_models.py | 28 +++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/tests/transformers/models/test_audio_embedding_models.py b/tests/transformers/models/test_audio_embedding_models.py index 5df7c1405..f267060c2 100644 --- a/tests/transformers/models/test_audio_embedding_models.py +++ b/tests/transformers/models/test_audio_embedding_models.py @@ -7,16 +7,13 @@ import os from typing import List, Optional - import numpy as np import onnx import onnxruntime import pytest import torch -import torchaudio from datasets import load_dataset from transformers import AutoModelForCTC, AutoProcessor - from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCTC from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers from QEfficient.utils import hf_download @@ -77,7 +74,7 @@ def run_CTC_pytorch_hf(model, processor: AutoProcessor, inputs: np.ndarray, samp model_inputs = dict( input_values=input_features, ) - outputs = model(**model_inputs).logits + outputs = torch.tensor(model(**model_inputs).logits) return outputs @@ -121,7 +118,7 @@ def run_CTC_ort(onnx_path, config, processor: AutoProcessor, inputs: np.ndarray, model_inputs = dict(input_values=(input_features).numpy()) outputs = session.run(None, model_inputs) - logits = outputs[0] + logits = torch.tensor(outputs[0]) return logits @@ -150,13 +147,18 @@ def check_CTC_pytorch_vs_kv_vs_ort_vs_ai100( data = ds[0]["audio"]["array"] data = torch.tensor(data).unsqueeze(0).numpy() sample_rate = ds[0]["audio"]["sampling_rate"] - if sample_rate != 16000: - resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000) - data = resampler(data) - run_CTC_pytorch_hf(model_hf, processor, data, sample_rate) + pytorch_tokens=run_CTC_pytorch_hf(model_hf, processor, data, sample_rate) + predicted_ids = torch.argmax(pytorch_tokens, dim=-1) + pytorch_output = processor.batch_decode(predicted_ids) + qeff_model = QEFFAutoModelForCTC(model_hf, pretrained_model_name_or_path=model_name) qeff_model.export() - run_CTC_ort(qeff_model.onnx_path, qeff_model.model.config, processor, data, sample_rate) + ort_tokens=run_CTC_ort(qeff_model.onnx_path, qeff_model.model.config, processor, data, sample_rate) + predicted_ids = torch.argmax(ort_tokens, dim=-1) + ort_output = processor.batch_decode(predicted_ids) + assert (pytorch_output == ort_output), ( + "Tokens don't match for pytorch output and ORT output." + ) if not get_available_device_id(): pytest.skip("No available devices to run model on Cloud AI 100") qeff_model.compile( @@ -165,8 +167,12 @@ def check_CTC_pytorch_vs_kv_vs_ort_vs_ai100( enable_qnn=enable_qnn, qnn_config=qnn_config, ) - cloud_ai_100_tokens = qeff_model.generate(processor, data) + cloud_ai_100_output = qeff_model.generate(processor, data) + assert (pytorch_output == cloud_ai_100_output), ( + "Tokens don't match for pytorch output and Cloud AI 100 output." + ) assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) + @pytest.mark.on_qaic From d0247522f05aaaa2c4eb07f0e63d3f8e1d91c9c6 Mon Sep 17 00:00:00 2001 From: Tanisha Chawada Date: Mon, 29 Sep 2025 07:57:47 +0000 Subject: [PATCH 09/11] Added test for AutoModelForCTC class. 
Signed-off-by: Tanisha Chawada --- .../models/test_audio_embedding_models.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/tests/transformers/models/test_audio_embedding_models.py b/tests/transformers/models/test_audio_embedding_models.py index f267060c2..7049e298c 100644 --- a/tests/transformers/models/test_audio_embedding_models.py +++ b/tests/transformers/models/test_audio_embedding_models.py @@ -7,6 +7,7 @@ import os from typing import List, Optional + import numpy as np import onnx import onnxruntime @@ -14,6 +15,7 @@ import torch from datasets import load_dataset from transformers import AutoModelForCTC, AutoProcessor + from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCTC from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers from QEfficient.utils import hf_download @@ -147,18 +149,16 @@ def check_CTC_pytorch_vs_kv_vs_ort_vs_ai100( data = ds[0]["audio"]["array"] data = torch.tensor(data).unsqueeze(0).numpy() sample_rate = ds[0]["audio"]["sampling_rate"] - pytorch_tokens=run_CTC_pytorch_hf(model_hf, processor, data, sample_rate) + pytorch_tokens = run_CTC_pytorch_hf(model_hf, processor, data, sample_rate) predicted_ids = torch.argmax(pytorch_tokens, dim=-1) pytorch_output = processor.batch_decode(predicted_ids) - + qeff_model = QEFFAutoModelForCTC(model_hf, pretrained_model_name_or_path=model_name) qeff_model.export() - ort_tokens=run_CTC_ort(qeff_model.onnx_path, qeff_model.model.config, processor, data, sample_rate) + ort_tokens = run_CTC_ort(qeff_model.onnx_path, qeff_model.model.config, processor, data, sample_rate) predicted_ids = torch.argmax(ort_tokens, dim=-1) ort_output = processor.batch_decode(predicted_ids) - assert (pytorch_output == ort_output), ( - "Tokens don't match for pytorch output and ORT output." - ) + assert pytorch_output == ort_output, "Tokens don't match for pytorch output and ORT output." if not get_available_device_id(): pytest.skip("No available devices to run model on Cloud AI 100") qeff_model.compile( @@ -168,11 +168,8 @@ def check_CTC_pytorch_vs_kv_vs_ort_vs_ai100( qnn_config=qnn_config, ) cloud_ai_100_output = qeff_model.generate(processor, data) - assert (pytorch_output == cloud_ai_100_output), ( - "Tokens don't match for pytorch output and Cloud AI 100 output." - ) + assert pytorch_output == cloud_ai_100_output, "Tokens don't match for pytorch output and Cloud AI 100 output." assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) - @pytest.mark.on_qaic From b0d0a1886fd80b6a4cccc7ab6a2091813693376d Mon Sep 17 00:00:00 2001 From: Tanisha Chawada Date: Wed, 1 Oct 2025 06:00:34 +0000 Subject: [PATCH 10/11] Added test for AutoModelForCTC class. 
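
This patch renames the CTC test helpers to snake_case and replaces the hard-coded 480000-sample padding length with the shared WAV2VEC2_MAX_SEQ_LEN constant; 480000 samples corresponds to 30 s of audio at 16 kHz, and padding every clip to this fixed length keeps the input shape static across the PyTorch, ONNX Runtime, and compiled QPC runs. A minimal sketch of the intended processor usage is shown below; the explicit sampling_rate argument is illustrative and not part of the test itself.

    from transformers import AutoProcessor

    from QEfficient.utils.constants import WAV2VEC2_MAX_SEQ_LEN  # 480000 samples, i.e. 30 s at 16 kHz

    processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")

    def pad_to_max_len(audio):
        # Pad (or truncate) a 1-D 16 kHz waveform to the fixed length used at export time,
        # so every run sees an input of shape (1, WAV2VEC2_MAX_SEQ_LEN).
        return processor(
            audio,
            sampling_rate=16000,
            return_tensors="pt",
            padding="max_length",
            max_length=WAV2VEC2_MAX_SEQ_LEN,
            truncation=True,
        ).input_values
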
Signed-off-by: Tanisha Chawada --- .../models/test_audio_embedding_models.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/transformers/models/test_audio_embedding_models.py b/tests/transformers/models/test_audio_embedding_models.py index 7049e298c..02519f179 100644 --- a/tests/transformers/models/test_audio_embedding_models.py +++ b/tests/transformers/models/test_audio_embedding_models.py @@ -15,12 +15,12 @@ import torch from datasets import load_dataset from transformers import AutoModelForCTC, AutoProcessor - from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCTC from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers from QEfficient.utils import hf_download from QEfficient.utils._utils import create_json, load_hf_processor -from QEfficient.utils.constants import QnnConstants +from QEfficient.utils.constants import QnnConstants, WAV2VEC2_MAX_SEQ_LEN + from QEfficient.utils.device_utils import get_available_device_id test_models = [ @@ -28,7 +28,7 @@ ] -def load_CTC_model(model_config): +def load_ctc_model(model_config): """ Function to load model from huggingface -------- @@ -51,7 +51,7 @@ def load_CTC_model(model_config): return model_hf, params -def run_CTC_pytorch_hf(model, processor: AutoProcessor, inputs: np.ndarray, sample_rate: int) -> List[str]: +def run_ctc_pytorch_hf(model, processor: AutoProcessor, inputs: np.ndarray, sample_rate: int) -> List[str]: """ Run pytorch inference on model @@ -66,7 +66,7 @@ def run_CTC_pytorch_hf(model, processor: AutoProcessor, inputs: np.ndarray, samp Returns: torch.Tensor: A list of output features generated by the model for each prompt. """ - seq_len = 480000 + seq_len = WAV2VEC2_MAX_SEQ_LEN # prepare inputs input_features = processor( @@ -80,7 +80,7 @@ def run_CTC_pytorch_hf(model, processor: AutoProcessor, inputs: np.ndarray, samp return outputs -def run_CTC_ort(onnx_path, config, processor: AutoProcessor, inputs: np.ndarray, sample_rate: int) -> List[str]: +def run_ctc_ort(onnx_path, config, processor: AutoProcessor, inputs: np.ndarray, sample_rate: int) -> List[str]: """ Run onnxruntime inference on model @@ -124,7 +124,7 @@ def run_CTC_ort(onnx_path, config, processor: AutoProcessor, inputs: np.ndarray, return logits -def check_CTC_pytorch_vs_kv_vs_ort_vs_ai100( +def check_ctc_pytorch_vs_kv_vs_ort_vs_ai100( model_name: str, n_layer: int = 1, enable_qnn: Optional[bool] = False, @@ -140,7 +140,7 @@ def check_CTC_pytorch_vs_kv_vs_ort_vs_ai100( model_config = {"model_name": model_name} model_config["n_layer"] = n_layer - model_hf, _ = load_CTC_model(model_config) + model_hf, _ = load_ctc_model(model_config) processor = load_hf_processor(pretrained_model_name_or_path=model_name) batch_size = 1 @@ -149,13 +149,13 @@ def check_CTC_pytorch_vs_kv_vs_ort_vs_ai100( data = ds[0]["audio"]["array"] data = torch.tensor(data).unsqueeze(0).numpy() sample_rate = ds[0]["audio"]["sampling_rate"] - pytorch_tokens = run_CTC_pytorch_hf(model_hf, processor, data, sample_rate) + pytorch_tokens = run_ctc_pytorch_hf(model_hf, processor, data, sample_rate) predicted_ids = torch.argmax(pytorch_tokens, dim=-1) pytorch_output = processor.batch_decode(predicted_ids) qeff_model = QEFFAutoModelForCTC(model_hf, pretrained_model_name_or_path=model_name) qeff_model.export() - ort_tokens = run_CTC_ort(qeff_model.onnx_path, qeff_model.model.config, processor, data, sample_rate) + ort_tokens = run_ctc_ort(qeff_model.onnx_path, qeff_model.model.config, processor, data, sample_rate) 
predicted_ids = torch.argmax(ort_tokens, dim=-1) ort_output = processor.batch_decode(predicted_ids) assert pytorch_output == ort_output, "Tokens don't match for pytorch output and ORT output." @@ -174,20 +174,20 @@ def check_CTC_pytorch_vs_kv_vs_ort_vs_ai100( @pytest.mark.on_qaic @pytest.mark.parametrize("model_name", test_models) -def test_CTC_pytorch_vs_kv_vs_ort_vs_ai100(model_name): +def test_ctc_pytorch_vs_kv_vs_ort_vs_ai100(model_name): """ Test function to validate the PyTorch model, the PyTorch model the ONNX model, and the Cloud AI 100 model. ``Mandatory`` Args: :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` """ - check_CTC_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=4) + check_ctc_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=4) @pytest.mark.on_qaic @pytest.mark.qnn @pytest.mark.skip(reason="Wav2Vec2 is currently not supported on QNN") @pytest.mark.parametrize("model_name", test_models) -def test_CTC_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name): +def test_ctc_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name): """ QNN Compilation path test. Test function to validate the PyTorch model, the PyTorch model after the ONNX model, and the Cloud AI 100 model. @@ -197,6 +197,6 @@ def test_CTC_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name): qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) - check_CTC_pytorch_vs_kv_vs_ort_vs_ai100( + check_ctc_pytorch_vs_kv_vs_ort_vs_ai100( model_name=model_name, n_layer=4, enable_qnn=True, qnn_config=qnn_config_json_path ) From b2549507b0cfea8783b3c96eb6d2a5dbb9aaf4ad Mon Sep 17 00:00:00 2001 From: Tanisha Chawada Date: Wed, 1 Oct 2025 06:06:13 +0000 Subject: [PATCH 11/11] Added test for AutoModelForCTC class. Signed-off-by: Tanisha Chawada --- tests/transformers/models/test_audio_embedding_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/transformers/models/test_audio_embedding_models.py b/tests/transformers/models/test_audio_embedding_models.py index 02519f179..da30c76b0 100644 --- a/tests/transformers/models/test_audio_embedding_models.py +++ b/tests/transformers/models/test_audio_embedding_models.py @@ -15,12 +15,12 @@ import torch from datasets import load_dataset from transformers import AutoModelForCTC, AutoProcessor + from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCTC from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers from QEfficient.utils import hf_download from QEfficient.utils._utils import create_json, load_hf_processor -from QEfficient.utils.constants import QnnConstants, WAV2VEC2_MAX_SEQ_LEN - +from QEfficient.utils.constants import WAV2VEC2_MAX_SEQ_LEN, QnnConstants from QEfficient.utils.device_utils import get_available_device_id test_models = [