diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
index 62c6bec199f..9035a7a458c 100644
--- a/backends/qualcomm/tests/test_qnn_delegate.py
+++ b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -4979,9 +4979,9 @@ def test_static_qwen2_5(self):
         if "Error" in msg:
             self.fail(msg["Error"])
         else:
-            inference_speed_ref = {"SM8650": 110, "SM8750": 130}
+            inference_speed_ref = {"SM8650": 115, "SM8750": 155}
             self.assertLessEqual(msg["wiki_ppl"], 15)
-            self.assertLessEqual(msg["pte_size"], 800000000)  # 800mb
+            self.assertLessEqual(msg["pte_size"], 600000000)  # 600mb
             if self.model in inference_speed_ref:
                 self.assertGreaterEqual(
                     msg["inference_speed"], inference_speed_ref[self.model]
diff --git a/examples/qualcomm/oss_scripts/llama/__init__.py b/examples/qualcomm/oss_scripts/llama/__init__.py
index 0ffb168c8f9..f85f48f3de4 100644
--- a/examples/qualcomm/oss_scripts/llama/__init__.py
+++ b/examples/qualcomm/oss_scripts/llama/__init__.py
@@ -211,8 +211,8 @@ class Qwen2_5_0_5B(LLMModelConfig):
     num_sharding = 1
 
     # quant config
-    ptq = QuantDtype.use_16a8w
-    group_size = None
+    ptq = QuantDtype.use_16a4w_block
+    group_size = 16
     masked_softmax = True
     r1 = False
     r2 = False
@@ -233,13 +233,13 @@ class Qwen2_5_1_5B(LLMModelConfig):
     num_sharding = 1
 
     # quant config
-    ptq = QuantDtype.use_16a8w
-    group_size = None
+    ptq = QuantDtype.use_16a4w_block
+    group_size = 16
     masked_softmax = True
     r1 = False
     r2 = False
     r3 = True
-    custom_annotation = ()
+    custom_annotation = (annotate_output_16a8w,)
 
 
 @register_llm_model("qwen3-0_6b")
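For context: these hunks move the Qwen2.5 0.5B and 1.5B post-training quantization configs from use_16a8w (16-bit activations, 8-bit weights) to use_16a4w_block (16-bit activations, 4-bit weights quantized in blocks of group_size = 16 weights per scale), which is what permits tightening the pte_size ceiling from 800 MB to 600 MB and raising the per-SoC inference-speed floors in the test. The sketch below is a minimal numpy illustration of block-wise 4-bit weight quantization; it is not the QNN or ExecuTorch implementation, and the helper names are hypothetical.

import numpy as np

def quantize_4bit_block(w: np.ndarray, group_size: int = 16):
    """Illustrative only: quantize a flat weight tensor to signed 4-bit
    values with one float32 scale per block of `group_size` weights."""
    assert w.size % group_size == 0
    blocks = w.reshape(-1, group_size)
    # One scale per block; the signed 4-bit range is [-8, 7].
    scales = np.abs(blocks).max(axis=1, keepdims=True) / 7.0
    scales = np.where(scales == 0, 1.0, scales)  # avoid divide-by-zero
    q = np.clip(np.round(blocks / scales), -8, 7).astype(np.int8)
    return q, scales.astype(np.float32)

def dequantize_4bit_block(q: np.ndarray, scales: np.ndarray) -> np.ndarray:
    """Reconstruct float weights from 4-bit codes and per-block scales."""
    return (q.astype(np.float32) * scales).reshape(-1)

w = np.random.randn(64).astype(np.float32)
q, s = quantize_4bit_block(w, group_size=16)
w_hat = dequantize_4bit_block(q, s)
print("max abs error:", np.abs(w - w_hat).max())

With group_size = 16, each block of 16 weights shares one scale, so a local outlier only widens the quantization step for its own block rather than the whole channel; storage is roughly 4 bits per weight plus the per-block scale overhead, which is the source of the smaller .pte artifact. Judging by its name, the added (annotate_output_16a8w,) custom annotation presumably keeps the output layer at 8-bit weights, where 4-bit error would hurt perplexity most.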