diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
index 62c6bec199f..9035a7a458c 100644
--- a/backends/qualcomm/tests/test_qnn_delegate.py
+++ b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -4979,9 +4979,9 @@ def test_static_qwen2_5(self):
         if "Error" in msg:
             self.fail(msg["Error"])
         else:
-            inference_speed_ref = {"SM8650": 110, "SM8750": 130}
+            inference_speed_ref = {"SM8650": 115, "SM8750": 155}
             self.assertLessEqual(msg["wiki_ppl"], 15)
-            self.assertLessEqual(msg["pte_size"], 800000000)  # 800mb
+            self.assertLessEqual(msg["pte_size"], 600000000)  # 600mb
             if self.model in inference_speed_ref:
                 self.assertGreaterEqual(
                     msg["inference_speed"], inference_speed_ref[self.model]
diff --git a/examples/qualcomm/oss_scripts/llama/__init__.py b/examples/qualcomm/oss_scripts/llama/__init__.py
index 0ffb168c8f9..f85f48f3de4 100644
--- a/examples/qualcomm/oss_scripts/llama/__init__.py
+++ b/examples/qualcomm/oss_scripts/llama/__init__.py
@@ -211,8 +211,8 @@ class Qwen2_5_0_5B(LLMModelConfig):
     num_sharding = 1
 
     # quant config
-    ptq = QuantDtype.use_16a8w
-    group_size = None
+    ptq = QuantDtype.use_16a4w_block
+    group_size = 16
     masked_softmax = True
     r1 = False
     r2 = False
@@ -233,13 +233,13 @@ class Qwen2_5_1_5B(LLMModelConfig):
     num_sharding = 1
 
     # quant config
-    ptq = QuantDtype.use_16a8w
-    group_size = None
+    ptq = QuantDtype.use_16a4w_block
+    group_size = 16
     masked_softmax = True
     r1 = False
     r2 = False
     r3 = True
-    custom_annotation = ()
+    custom_annotation = (annotate_output_16a8w,)
 
 
 @register_llm_model("qwen3-0_6b")
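For context: these hunks move the Qwen2.5 0.5B and 1.5B post-training quantization configs from use_16a8w (16-bit activations, 8-bit weights) to use_16a4w_block (16-bit activations, 4-bit weights quantized in blocks of group_size = 16 weights per scale), which is what permits tightening the pte_size ceiling from 800 MB to 600 MB and raising the per-SoC inference-speed floors in the test. The sketch below is a minimal numpy illustration of block-wise 4-bit weight quantization; it is not the QNN or ExecuTorch implementation, and the helper names are hypothetical.

import numpy as np

def quantize_4bit_block(w: np.ndarray, group_size: int = 16):
    """Illustrative only: quantize a flat weight tensor to signed 4-bit
    values with one float32 scale per block of `group_size` weights."""
    assert w.size % group_size == 0
    blocks = w.reshape(-1, group_size)
    # One scale per block; the signed 4-bit range is [-8, 7].
    scales = np.abs(blocks).max(axis=1, keepdims=True) / 7.0
    scales = np.where(scales == 0, 1.0, scales)  # avoid divide-by-zero
    q = np.clip(np.round(blocks / scales), -8, 7).astype(np.int8)
    return q, scales.astype(np.float32)

def dequantize_4bit_block(q: np.ndarray, scales: np.ndarray) -> np.ndarray:
    """Reconstruct float weights from 4-bit codes and per-block scales."""
    return (q.astype(np.float32) * scales).reshape(-1)

w = np.random.randn(64).astype(np.float32)
q, s = quantize_4bit_block(w, group_size=16)
w_hat = dequantize_4bit_block(q, s)
print("max abs error:", np.abs(w - w_hat).max())

With group_size = 16, each block of 16 weights shares one scale, so a local outlier only widens the quantization step for its own block rather than the whole channel; storage is roughly 4 bits per weight plus the per-block scale overhead, which is the source of the smaller .pte artifact. Judging by its name, the added (annotate_output_16a8w,) custom annotation presumably keeps the output layer at 8-bit weights, where 4-bit error would hurt perplexity most.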