backends/vulkan/_passes/replace_qdq.py (16 additions, 17 deletions)
@@ -32,24 +32,23 @@ def call(self, graph_module: torch.fx.GraphModule):
                 exir_ops.edge.et_vk.conv2d_q8ta_q8csw_q8to_dw.default,
                 exir_ops.edge.et_vk.add_q8ta_q8ta_q8to.default,
             ]:
-                # Replace quantize op feeding into conv2d (first argument is the quantized input)
-                quantized_input_node = node.args[0]
-                if isinstance(
-                    quantized_input_node, torch.fx.Node
-                ) and utils.is_quant_node(quantized_input_node):
-                    # Get the arguments from the original quantize node
-                    input_tensor = quantized_input_node.args[0]
-                    scale = quantized_input_node.args[1]
-                    zero_point = quantized_input_node.args[2]
+                for quantized_input_node in node.args:
+                    if isinstance(
+                        quantized_input_node, torch.fx.Node
+                    ) and utils.is_quant_node(quantized_input_node):
+                        # Get the arguments from the original quantize node
+                        input_tensor = quantized_input_node.args[0]
+                        scale = quantized_input_node.args[1]
+                        zero_point = quantized_input_node.args[2]
 
-                    nodes_to_replace.append(
-                        {
-                            "old_node": quantized_input_node,
-                            "new_target": exir_ops.edge.et_vk.quantize_q8ta_for_conv2d.default,
-                            "args": (input_tensor, scale, zero_point),
-                            "node_type": "quantize_input",
-                        }
-                    )
+                        nodes_to_replace.append(
+                            {
+                                "old_node": quantized_input_node,
+                                "new_target": exir_ops.edge.et_vk.quantize_q8ta_for_conv2d.default,
+                                "args": (input_tensor, scale, zero_point),
+                                "node_type": "quantize_input",
+                            }
+                        )
 
             # Find dequantize ops that consume the output of this conv2d
             for user in node.users:
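Note on the change above: `add_q8ta_q8ta_q8to` takes two quantized input tensors, so replacing only `node.args[0]` would leave the quantize op feeding the second operand untouched; iterating over all of `node.args` covers both the unary conv ops and the binary add op. A minimal standalone sketch of the new pattern, with `is_quant_node` passed in as a stand-in for the `utils` helper (not the actual pass):

import torch

def collect_quantize_replacements(node: torch.fx.Node, is_quant_node) -> list:
    """Collect one replacement entry per quantized input argument of `node`."""
    replacements = []
    # Binary ops such as add_q8ta_q8ta_q8to carry two quantized inputs, so
    # every argument must be inspected rather than just node.args[0].
    for arg in node.args:
        if isinstance(arg, torch.fx.Node) and is_quant_node(arg):
            input_tensor, scale, zero_point = arg.args[0], arg.args[1], arg.args[2]
            replacements.append(
                {
                    "old_node": arg,
                    "args": (input_tensor, scale, zero_point),
                    "node_type": "quantize_input",
                }
            )
    return replacements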
backends/vulkan/_passes/tag_memory_meta_pass.py (21 additions, 9 deletions)
@@ -368,7 +368,7 @@ def constrain_op_arg_repset(self, arg_i: int, op_repsets: utils.OpRepSets) -> None:
 
         arg_repset = op_repsets.get_arg_repset(arg_i)
         if arg_repset.is_constrained():
-            return arg_repset
+            return
 
         arg_node = op_repsets.op_node.args[arg_i]
 
@@ -378,21 +378,33 @@ def constrain_op_arg_repset(self, arg_i: int, op_repsets: utils.OpRepSets) -> None:
         arg_repset = self.trace_node_users_to_constrain_repset(arg_node, arg_repset)
         op_repsets.try_constrain_with_arg_repset(arg_i, arg_repset)
 
+    def constrain_op_out_repset(self, op_repsets: utils.OpRepSets) -> None:
+        """
+        Similar to the `constrain_op_arg_repset` function, but for the output repset
+        of the operator.
+        """
+        out_repset = op_repsets.get_out_repset(0)
+        if out_repset.is_constrained():
+            return
+
+        op_node = op_repsets.op_node
+        out_repset = self.trace_node_users_to_constrain_repset(op_node, out_repset)
+
+        op_repsets.try_constrain_with_out_repset(out_repset)
+
     def constrain_op_repsets(self, op_repsets: utils.OpRepSets) -> None:
         # For most ops, constraining the argument repsets will also constrain the
         # output repset, since OpRepSets maintains synchronization rules.
         for i in range(len(op_repsets.op_node.args)):
             if utils.is_tensor_arg_node(op_repsets.op_node.args[i]):
                 self.constrain_op_arg_repset(i, op_repsets)
 
-        # TODO(ssjia): For most ops, inputs and outputs must be synchronized, so there
-        # is no need to constrain output repsets explicitly. Currently, the exceptions
-        # (i.e. choose qparams) already define constrined repsets for the output, so
-        # there is again no need to explicitly constrain the outputs. If an operator
-        # appears later on that does not sync input and output representations, and
-        # defines ambiguous repsets for the output tensor(s), then we will need to add
-        # additional logic to this function to constrain the output repsets separately
-        # from the input repsets.
+        # However, some operators do not sync input and output representations and
+        # also define ambiguous repsets for the output tensor(s). In those cases,
+        # additional logic is needed to constrain the output repsets separately from
+        # the input repsets.
+        if not op_repsets.sync_primary_io_repr and op_repsets.sync_outs_repr:
+            self.constrain_op_out_repset(op_repsets)
 
     def set_op_node_tensor_reprs(
         self, graph_module: torch.fx.GraphModule, op_node: torch.fx.Node
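For context, "constraining" a repset here means narrowing a tensor's candidate representations until only one remains; the new `constrain_op_out_repset` applies the same user-tracing narrowing to the output tensor that was already applied to the arguments. A toy illustration under assumed semantics (the real `RepSet`/`OpRepSets` classes live in the Vulkan backend's utils module and are more involved):

# Hypothetical stand-in for the real RepSet class, which is assumed to track
# a set of candidate tensor representations.
class RepSet:
    def __init__(self, candidates):
        self.candidates = set(candidates)

    def is_constrained(self) -> bool:
        # Constrained once exactly one representation remains.
        return len(self.candidates) == 1

    def try_constrain_with(self, other: "RepSet") -> None:
        # Narrow to the intersection, but never down to the empty set.
        merged = self.candidates & other.candidates
        if merged:
            self.candidates = merged

# An output that could be a texture or a buffer gets narrowed by tracing a
# consumer that only accepts contiguous buffers, mirroring what
# trace_node_users_to_constrain_repset does for real graph nodes.
out_repset = RepSet({"CHANNELS_PACKED_TEXTURE", "CONTIGUOUS_BUFFER"})
consumer_repset = RepSet({"CONTIGUOUS_BUFFER"})
out_repset.try_constrain_with(consumer_repset)
assert out_repset.is_constrained()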
backends/vulkan/op_registry.py (3 additions, 2 deletions)
@@ -636,7 +636,7 @@ def register_quantized_binary_op():
 def register_quantize_for_conv2d_op():
     return OpFeatures(
         inputs_storage=[
-            utils.CHANNELS_PACKED_TEXTURE,
+            utils.CHANNELS_PACKED_TEXTURE_OR_CONTIGUOUS_BUFFER,
         ],
         outputs_storage=[
             utils.PACKED_INT8_4W4C_BUFFER,
@@ -656,7 +656,7 @@ def register_dequantize_for_conv2d_op():
             utils.PACKED_INT8_4W4C_BUFFER,
         ],
         outputs_storage=[
-            utils.CHANNELS_PACKED_TEXTURE,
+            utils.CHANNELS_PACKED_TEXTURE_OR_CONTIGUOUS_BUFFER,
         ],
         supports_resize=False,
     )
@@ -711,6 +711,7 @@ def register_view_ops():
         exir_ops.edge.aten.unsqueeze_copy.default,
         exir_ops.edge.aten.clone.default,
         exir_ops.edge.aten.permute_copy.default,
+        exir_ops.edge.aten.gather.default,
     ]
 )
 def register_view_ops_with_buffer_meta():
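The storage changes above let the conv2d quantize/dequantize ops exchange activations through either a channels-packed texture or a contiguous buffer, rather than textures only. A rough model of the registration pattern, using illustrative stand-ins rather than the real `op_registry.py` API:

# Toy model of the registry: each op maps to an OpFeatures record describing
# the storage each input/output may use. All names here are hypothetical.
from dataclasses import dataclass, field

@dataclass
class OpFeatures:
    inputs_storage: list = field(default_factory=list)
    outputs_storage: list = field(default_factory=list)
    supports_resize: bool = False

OP_REGISTRY = {}

def update_features(op_name):
    def wrapper(fn):
        OP_REGISTRY[op_name] = fn()
        return fn
    return wrapper

@update_features("et_vk.dequantize_from_conv2d")
def register_dequantize_for_conv2d_op():
    return OpFeatures(
        inputs_storage=["PACKED_INT8_4W4C_BUFFER"],
        # Accepting a texture OR a contiguous buffer lets the memory
        # metadata pass pick whichever representation the consumer needs.
        outputs_storage=["CHANNELS_PACKED_TEXTURE_OR_CONTIGUOUS_BUFFER"],
        supports_resize=False,
    )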