From dcce9da511b8879ff1a8599eaf1f10540b30f871 Mon Sep 17 00:00:00 2001
From: Stephen Jia
Date: Mon, 11 Nov 2024 08:32:42 -0800
Subject: [PATCH] [ET-VK] Update partitioner to account for custom packed arguments

## Problem

Convolution operators, especially pointwise convolutions, may have weight tensors with sizes like

```
W=1, H=1, C=320, N=1280
```

When represented as a texture, this tensor would require a texture with extents

```
(1, 1, 320 / 4 * 1280 = 102400)
```

which would normally exceed texture limits. The new partitioner system detects this and prevents nodes with such weights from being lowered to Vulkan.

However, the partitioner does not account for the fact that the operator implementation uses a specialized prepacking algorithm, so the packed weights do in fact fit within texture limits.

## Changes

* Add a field to the `OpFeatures` class to annotate that some arguments of an op should be skipped when checking against texture limits.
* Update the metadata tagging pass to skip annotating constant tensor nodes so that they don't influence memory layout and storage type proposals. Without this change, the tagging pass will try to use buffer storage for the pointwise convolution, since the weight can only be represented as a buffer under normal circumstances.

Differential Revision: [D65759236](https://our.internmc.facebook.com/intern/diff/D65759236/)

ghstack-source-id: 252885980
Pull Request resolved: https://github.com/pytorch/executorch/pull/6753
---
 .../vulkan/_passes/insert_prepack_nodes.py    |  4 +
 .../vulkan/_passes/tag_memory_meta_pass.py    | 73 +++++++++++++------
 backends/vulkan/op_registry.py                | 10 +++
 .../vulkan/partitioner/vulkan_partitioner.py  |  9 ++-
 4 files changed, 73 insertions(+), 23 deletions(-)

diff --git a/backends/vulkan/_passes/insert_prepack_nodes.py b/backends/vulkan/_passes/insert_prepack_nodes.py
index 37665a6da8e..7876806d6d1 100644
--- a/backends/vulkan/_passes/insert_prepack_nodes.py
+++ b/backends/vulkan/_passes/insert_prepack_nodes.py
@@ -35,6 +35,10 @@ def prepack_not_required(node: torch.fx.Node) -> bool:
     if not is_param_node(program, node):
         return True
 
+    # Annotate that this node is going to be represented as a tensorref in the Vulkan
+    # compute graph. This will be useful for later graph passes.
+    node.meta["vkdg_tensorref"] = True
+
     for user in node.users:
         if user.op == "call_function" and handles_own_prepacking(
             # pyre-ignore
diff --git a/backends/vulkan/_passes/tag_memory_meta_pass.py b/backends/vulkan/_passes/tag_memory_meta_pass.py
index fd0bd3648e6..0a6a2d42d44 100644
--- a/backends/vulkan/_passes/tag_memory_meta_pass.py
+++ b/backends/vulkan/_passes/tag_memory_meta_pass.py
@@ -39,6 +39,30 @@ def set_memory_metadata(
     utils.set_node_spec_attr(node, "vk_memory_layout", layout)
 
 
+def insert_transition_node(
+    graph_module: torch.fx.GraphModule,
+    node: torch.fx.Node,
+    arg: torch.fx.Node,
+    storage: VkStorageType,
+    layout: VkMemoryLayout,
+) -> None:
+    """
+    Insert a clone node to copy the original tensor to a tensor with the desired storage
+    type and memory layout.
+    """
+    with graph_module.graph.inserting_before(node):
+        clone_node = graph_module.graph.create_node(
+            "call_function",
+            exir_ops.edge.aten.clone.default,
+            (arg,),
+        )
+        clone_node.meta["val"] = arg.meta["val"]
+        clone_node.meta["spec"] = deepcopy(arg.meta["spec"])
+        clone_node.meta["spec"].const = False
+        set_memory_metadata(clone_node, storage, layout)
+        arg.replace_all_uses_with(clone_node, lambda x, y=node: x == y)
+
+
 class TagMemoryMetaPass(ExportPass):
     """
     There are a variety of ways that tensors can be represented in Vulkan. The two main
@@ -174,14 +198,33 @@ def propose_node_layout(
         else:
             return next(iter(valid_layouts))
 
+    def should_annotate(self, node) -> bool:
+        if not isinstance(node, torch.fx.Node):
+            return False
+
+        if not isinstance(node.meta["val"], FakeTensor):
+            return False
+
+        # Storage type and memory layout for tensorref will be determined at runtime
+        # so there's no use in setting those attributes ahead of time.
+        if node.meta.get("vkdg_tensorref", False):
+            return False
+
+        return True
+
+    def should_delay_annotation(self, node: torch.fx.Node) -> bool:
+        # For prepack nodes, delay setting the storage type and memory layout as long as
+        # possible. This is to minimize the number of transitions, since it can be
+        # difficult to predict what storage type and memory layout should be used at the
+        # time the prepack node is observed.
+        return node.target == exir_ops.edge.et_vk.prepack.default
+
+    # noqa
     def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
         sorted_nodes: NodeList = topo_sort(list(graph_module.graph.nodes))
 
         for node in sorted_nodes:
-            if not isinstance(node.meta["val"], FakeTensor):
-                continue
-
-            if node.target == exir_ops.edge.et_vk.prepack.default:
+            if not self.should_annotate(node) or self.should_delay_annotation(node):
                 continue
 
             storage = self.propose_node_storage(node)
@@ -191,11 +234,11 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
             inserting_transitions_for_node = False
 
             for i, arg in enumerate(node.args):
-                if not isinstance(arg, torch.fx.Node):
-                    continue
-                if not isinstance(arg.meta["val"], FakeTensor):
+                if not self.should_annotate(arg):
                     continue
 
+                assert isinstance(arg, torch.fx.Node)
+
                 arg_storage = utils.get_node_storage_type(arg)
                 arg_layout = utils.get_node_memory_layout(arg)
 
@@ -215,22 +258,10 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
                             f"[Vulkan Delegate] Inserting transition(s) for {node.format_node()}:"
                         )
 
+                    insert_transition_node(graph_module, node, arg, storage, layout)
+
                    logger.info(
                         f"  args {i} ({arg}): ({arg_storage}, {arg_layout}) -> ({storage}, {layout})"
                     )
-
-                    # Insert a clone node to copy the original tensor to a tensor with the
-                    # desired storage type and memory layout.
-                    with graph_module.graph.inserting_before(node):
-                        clone_node = graph_module.graph.create_node(
-                            "call_function",
-                            exir_ops.edge.aten.clone.default,
-                            (arg,),
-                        )
-                        clone_node.meta["val"] = arg.meta["val"]
-                        clone_node.meta["spec"] = deepcopy(arg.meta["spec"])
-                        clone_node.meta["spec"].const = False
-                        set_memory_metadata(clone_node, storage, layout)
-                        arg.replace_all_uses_with(clone_node, lambda x, y=node: x == y)
-
         return PassResult(graph_module, True)
diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index 3a6191bccb6..eeec5ab37e6 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -90,6 +90,9 @@ class OpFeatures:
         # then the insert_prepack_nodes pass will not insert prepack nodes for the args
         # of the op.
"handles_own_prepacking", + # Optional dictionary to specify a custom function to calculate the required + # image extents for a particular argument index. + "skip_limits_check", # Optional check function used during partitioning to determine if a node's # inputs are supported by the operator implementation. "check_node_fn", @@ -103,6 +106,7 @@ def __init__( optimal_storage: Optional[VkStorageType] = None, optimal_layout: Optional[VkMemoryLayout] = None, handles_own_prepacking: bool = False, + skip_limits_check: Optional[Set[int]] = None, check_node_fn: Optional[Callable] = None, ): self.texture_impl: Optional[TextureImplFeatures] = texture_impl @@ -111,6 +115,11 @@ def __init__( self.optimal_storage: Optional[VkStorageType] = optimal_storage self.optimal_layout: Optional[VkMemoryLayout] = optimal_layout self.handles_own_prepacking: bool = handles_own_prepacking + + self.skip_limits_check: Set[int] = set() + if skip_limits_check is not None: + self.skip_limits_check = skip_limits_check + self.check_node_fn: Callable = allow_node if check_node_fn is not None: self.check_node_fn = check_node_fn @@ -433,6 +442,7 @@ def register_convolution_op(features: OpFeatures): features.optimal_storage = VkStorageType.TEXTURE_3D features.optimal_layout = VkMemoryLayout.TENSOR_CHANNELS_PACKED features.handles_own_prepacking = True + features.skip_limits_check = {1, 2} return features diff --git a/backends/vulkan/partitioner/vulkan_partitioner.py b/backends/vulkan/partitioner/vulkan_partitioner.py index 7b2ad3fdfde..64e672fd695 100644 --- a/backends/vulkan/partitioner/vulkan_partitioner.py +++ b/backends/vulkan/partitioner/vulkan_partitioner.py @@ -82,8 +82,13 @@ def op_node_is_compatible( valid_texture_layouts = utils.possible_node_memory_layouts( node, self.texture_limits ) - for arg in node.args: - if isinstance(arg, torch.fx.Node) and utils.is_tensor_node(arg): + + for i, arg in enumerate(node.args): + if ( + isinstance(arg, torch.fx.Node) + and utils.is_tensor_node(arg) + and i not in features.skip_limits_check + ): arg_texture_layouts = utils.possible_node_memory_layouts( arg, self.texture_limits )