pytorch · cccclai · Sep 9, 2025 · Sep 8, 2025
@@ -4979,9 +4979,9 @@ def test_static_qwen2_5(self):
             if "Error" in msg:
                 self.fail(msg["Error"])
             else:
-                inference_speed_ref = {"SM8650": 110, "SM8750": 130}
+                inference_speed_ref = {"SM8650": 115, "SM8750": 155}
                 self.assertLessEqual(msg["wiki_ppl"], 15)
-                self.assertLessEqual(msg["pte_size"], 800000000)  # 800mb
+                self.assertLessEqual(msg["pte_size"], 600000000)  # 600mb
                 if self.model in inference_speed_ref:
                     self.assertGreaterEqual(
                         msg["inference_speed"], inference_speed_ref[self.model]

@@ -211,8 +211,8 @@ class Qwen2_5_0_5B(LLMModelConfig):
 
     num_sharding = 1
     # quant config
-    ptq = QuantDtype.use_16a8w
-    group_size = None
+    ptq = QuantDtype.use_16a4w_block
+    group_size = 16
     masked_softmax = True
     r1 = False
     r2 = False
@@ -233,13 +233,13 @@ class Qwen2_5_1_5B(LLMModelConfig):
 
     num_sharding = 1
     # quant config
-    ptq = QuantDtype.use_16a8w
-    group_size = None
+    ptq = QuantDtype.use_16a4w_block
+    group_size = 16
     masked_softmax = True
     r1 = False
     r2 = False
     r3 = True
-    custom_annotation = ()
+    custom_annotation = (annotate_output_16a8w,)
 
 
 @register_llm_model("qwen3-0_6b")