From 3c25ae81bf97dc3f5bc85de043cbb356b493e54b Mon Sep 17 00:00:00 2001
From: ssjia
Date: Wed, 12 Nov 2025 15:22:31 -0800
Subject: [PATCH 1/7] [ET-VK] Re-implement split_with_sizes

As title. The current implementation of split_with_sizes uses functions from the `Copy.[h|cpp]` file, in particular `add_copy_channel_offset_node`. However, the shaders dispatched by this function have a critical bug where the output tensor is passed in separately with different access types, i.e.

```cpp
graph.execute_nodes().emplace_back(new DispatchNode(
    graph,
    VK_KERNEL_FROM_STR(kernel_name),
    global_size,
    local_size,
    // Inputs and Outputs
    {
        {out, vkapi::kWrite},
        {out, vkapi::kRead},
        {in, vkapi::kRead},
    },
```

This creates many validation layer errors because the memory barriers for the resource cannot be formed properly. The shader essentially relies on undefined behaviour to work correctly.

To fix, this diff re-implements the operator from scratch with a dedicated compute shader.

Differential Revision: [D86910642](https://our.internmc.facebook.com/intern/diff/D86910642/)

[ghstack-poisoned]
---
 backends/vulkan/op_registry.py | 4 +-
 .../runtime/graph/ops/glsl/common.glslh | 11 +
 .../graph/ops/glsl/copy_channel_offset.glsl | 80 -----
 .../graph/ops/glsl/copy_channel_offset.yaml | 12 -
 .../runtime/graph/ops/glsl/copy_offset.glsl | 68 ----
 .../runtime/graph/ops/glsl/copy_offset.yaml | 17 -
 .../ops/glsl/copy_packed_dim_offset.glsl | 135 --------
 .../ops/glsl/copy_packed_dim_offset.yaml | 12 -
 .../runtime/graph/ops/glsl/indexing.glslh | 49 ++-
 .../runtime/graph/ops/glsl/split_buffer.glsl | 50 +++
 .../runtime/graph/ops/glsl/split_buffer.yaml | 16 +
 .../runtime/graph/ops/glsl/split_texture.glsl | 66 ++++
 .../runtime/graph/ops/glsl/split_texture.yaml | 15 +
 .../vulkan/runtime/graph/ops/impl/Copy.cpp | 317 ------------------
 backends/vulkan/runtime/graph/ops/impl/Copy.h | 84 -----
 .../vulkan/runtime/graph/ops/impl/Repeat.cpp | 2 -
 .../vulkan/runtime/graph/ops/impl/Split.cpp | 183 +++++-----
 .../graph/ops/impl/utils/TensorUtils.h | 2 +-
 backends/vulkan/test/op_tests/cases.py | 56 +---
 19 files changed, 295 insertions(+), 884 deletions(-)
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.yaml
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/copy_offset.yaml
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.yaml
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/split_buffer.glsl
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/split_buffer.yaml
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/split_texture.glsl
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/split_texture.yaml
 delete mode 100644 backends/vulkan/runtime/graph/ops/impl/Copy.cpp
 delete mode 100644 backends/vulkan/runtime/graph/ops/impl/Copy.h

diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index 461278500a6..feba4f6f072 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -740,6 +740,7 @@ def register_cat_op():
     [
         exir_ops.edge.aten.select_copy.int,
         exir_ops.edge.aten.slice_copy.Tensor,
+        exir_ops.edge.aten.split_with_sizes_copy.default,
     ]
 )
 def register_transfer_ops():
@@ -782,10 +783,7 @@ def register_ported_op():
 # Ops ported from PyTorch Vulkan backend.
These ops are in a separate registry because they support all packed dimensions @update_features( [ - # Tensor combination exir_ops.edge.aten.repeat.default, - exir_ops.edge.aten.split_with_sizes_copy.default, - exir_ops.edge.aten.split.Tensor, ] ) def register_ported_op_all_packed_dims(): diff --git a/backends/vulkan/runtime/graph/ops/glsl/common.glslh b/backends/vulkan/runtime/graph/ops/glsl/common.glslh index 8340a8b9b2f..9ade64910f2 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/common.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/common.glslh @@ -86,4 +86,15 @@ int quantize_and_pack(const vec4 vals, const float inv_scale, const int zp) { return pack_into_int32(quantized); } +#ifdef DEBUG_MODE + +#define printf debugPrintfEXT + +void printVec4(vec4 texel) { + debugPrintfEXT( + "texel: %f, %f, %f, %f\\n", texel.x, texel.y, texel.z, texel.w); +} + +#endif // DEBUG_MODE + #endif // COMMON_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl deleted file mode 100644 index 39aa9b11a0d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "existing_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} - -layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 in_sizes; - // Operates on (x, y, z) logical extents. - // channel_range is stored in range.w - ivec4 range; - // Analogus to range variable in copy. It defines the # of channel being - // copied. - // dst channel offset is stored in dst_offset.w - ivec4 dst_offset; - int src_channel_offset; -}; - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int packed_dim = unhash_packed_dim(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); - -void main() { - // Note: Unlike other shaders, the range is often not equal to the destination - // texture extent. - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(lpos, range.xyz))) { - return; - } - - const ivec3 out_lpos = lpos + dst_offset.xyz; - - const ivec4 out_tidx = lpos_to_tidx(out_lpos, out_sizes, out_axis_map.w, packed_dim); - - // First read the existing values to make sure the boundary values stay. - VEC4_T v = load_texel_lpos(existing_out, out_lpos, out_axis_map); - - ivec4 in_tidx = out_tidx; - for (int i=0; i<4; i++) { - - in_tidx[packed_dim] = out_tidx[packed_dim] - dst_offset.w + i; - - // Handle the partial update for begining of channel in an existing tensor. - // If the source channel index is below zero or exceeds the range, we skip - // updating the element to avoid overwriting existing data. - if ((in_tidx[packed_dim] < 0) || (in_tidx[packed_dim] >= range.w)) { - continue; - } - - // Readjust for the source offset. 
- in_tidx[packed_dim] += src_channel_offset; - - ivec4 in_posi = tidx_to_posi(in_tidx, in_sizes, in_axis_map, packed_dim); - v[i] = load_texel(t_in, in_posi.xyz)[in_posi.w]; - } - - write_texel_lpos(t_out, out_lpos, v, out_axis_map); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.yaml b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.yaml deleted file mode 100644 index 984d9a09d43..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.yaml +++ /dev/null @@ -1,12 +0,0 @@ -copy_channel_offset: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: copy_channel_offset diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl deleted file mode 100644 index 178814a90c3..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -${define_active_storage_type(STORAGE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} - -layout(push_constant) uniform restrict Block { - ivec3 range; - // xyz is source offset w is channel size - ivec4 src_offset; - // xyz is destination offset w is channel size - ivec4 dst_offset; -}; - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); - -${layout_declare_spec_const(C, "int", "batch_index_function", "0")} - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, range))) { - return; - } - - ivec3 in_pos = pos + src_offset.xyz; - ivec3 out_pos = pos + dst_offset.xyz; - if (src_offset.w > 0) { - if (batch_index_function == 1) { - // batch index is calculated using source channel size - const int channel_index = pos.z % src_offset.w; - const int batch_index = pos.z / src_offset.w; - out_pos.z = channel_index + dst_offset.z + batch_index * dst_offset.w; - } else if (batch_index_function == 2) { - // batch index is calculated using destination channel size - const int channel_index = pos.z % dst_offset.w; - const int batch_index = pos.z / dst_offset.w; - in_pos.z = channel_index + src_offset.z + batch_index * src_offset.w; - } - } - - write_texel_lpos( - t_out, - out_pos, - load_texel_lpos(t_in, in_pos, in_axis_map), - out_axis_map); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.yaml b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.yaml deleted file mode 100644 index 09f5ca36ea4..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.yaml +++ /dev/null @@ -1,17 +0,0 @@ -copy_offset: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - - VALUE: int8 - - VALUE: uint8 
- STORAGE: - - VALUE: texture3d - - VALUE: texture2d - shader_variants: - - NAME: copy_offset diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl deleted file mode 100644 index 3100565d08a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "existing_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} - -layout(push_constant) uniform restrict Block { - ivec4 range; - - // xyz is source offset w is channel size - ivec4 src_offset; - - // xyz is destination offset w is channel size - ivec4 dst_offset; -}; - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int packed_dim = unhash_packed_dim(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, range.xyz))) { - return; - } - - // Position in input tensor - ivec3 in_pos = pos + src_offset.xyz; - in_pos[packed_dim] = pos[packed_dim] + (src_offset[packed_dim] >> 2); - - // Read input value mapping to this output texel - VEC4_T in_value = load_texel_lpos(t_in, in_pos, in_axis_map); - - // Starting offset to read from a texel - const int src_lane_offset = src_offset[packed_dim] & 0x3; - const bool has_src_lane_offset = src_lane_offset != 0; - - // If input lane offset is non zero i.e packed texel is composed from multiple sources - if (has_src_lane_offset) { - // Boundary values will come from next input texel in the packed dim. 
- ivec3 next_in_pos = in_pos; - next_in_pos[packed_dim] = in_pos[packed_dim] + 1; - VEC4_T next_value = load_texel_lpos(t_in, next_in_pos, in_axis_map); - - // Keep input values from the end of current input pixel based on src_lane_offset - // offset 1 means the first lane of current input texel is not a part of the output texel - // offset 2 means first 2 lanes are not and so on - // Copy next texel's values towards the end of input texel, based on lane offset - // offset 1 means the first lane from next texel is part of the input texel - // offset 2 means first 2 lanes from next texel is part of the input texel and so on - if (src_lane_offset == 1) { - in_value = ivec4(in_value.yzw, next_value.x); - } else if (src_lane_offset == 2) { - in_value = ivec4(in_value.zw, next_value.xy); - } else { - in_value = ivec4(in_value.w, next_value.xyz); - } - } - - // Starting offset to write at within a texel - const int out_lane_offset = dst_offset[packed_dim] & 0x3; - const bool has_dst_lane_offset = out_lane_offset != 0; - - ivec3 out_pos = pos + dst_offset.xyz; - out_pos[packed_dim] = pos[packed_dim] + (dst_offset[packed_dim] >> 2); - - VEC4_T out_value; - - // If lane offset is non zero i.e packed texel is composed from multiple sources - if (has_dst_lane_offset) { - // When position in packed dim is > 0 - if (pos[packed_dim] > 0) { - // Boundary values will come from previous input texel in the packed dim. - ivec3 prev_in_pos = in_pos; - prev_in_pos[packed_dim] = in_pos[packed_dim] - 1; - VEC4_T prev_value = load_texel_lpos(t_in, prev_in_pos, in_axis_map); - - // Shift values toward the beginning based on out_lane_offset - // offset 1 means the last lane from the previous texel is a part of the output texel - // offset 2 means last 2 lanes and so on - if (out_lane_offset == 1) { - out_value.x = prev_value.w; - } else if (out_lane_offset == 2) { - out_value.xy = prev_value.zw; - } else { - out_value.xyz = prev_value.yzw; - } - } else { - // When position in packed dim is == 0 - // Boundary values will be the previous texel values. 
- out_value = load_texel_lpos(existing_out, out_pos, out_axis_map); - } - - // Copy input values towards the end of output array, based on lane offset - // offset 1 means the first lane from previous texel is part of the output texel starting at offset - // offset 2 means first 2 lanes from the previous texel is part of the output texel and so on - if (out_lane_offset == 1) { - out_value.yzw = in_value.xyz; - } else if (out_lane_offset == 2) { - out_value.zw = in_value.xy; - } else { - out_value.w = in_value.x; - } - } else { - out_value = in_value; - } - - write_texel_lpos( - t_out, - out_pos, - out_value, - out_axis_map); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.yaml b/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.yaml deleted file mode 100644 index 6e55876cb28..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.yaml +++ /dev/null @@ -1,12 +0,0 @@ -copy_packed_dim_offset: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: copy_packed_dim_offset diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh index 38016547d19..b9ac0e5dace 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh @@ -259,17 +259,28 @@ void clamp_tensor_idx(const BufferMetadata meta, inout TensorIndex tidx) { tidx.data[1] = min(tidx.data[1], meta.sizes[1] - 1); } -// Does not account for axis mapping or batches +// Does not account for axis mapping TensorIndex4D texture_pos_to_tensor4d_idx_simple( const TextureMetadata meta, const ivec3 pos) { TensorIndex4D tidx; tidx.data.xyz = pos; tidx.data.w = 0; tidx.data[meta.packed_dim] *= 4; + + // Compute batch idx accounting for batch concatenation, assuming channels as + // the concatenation dim. 
+ if (meta.sizes.w > 1) { + int channels = meta.sizes.z; + if (meta.packed_dim == 2) { + channels = align_up_4(channels); + } + tidx.data.w = tidx.data.z / channels; + tidx.data.z = tidx.data.z % channels; + } return tidx; } -// Does not account for axis mapping or batches +// Does not account for axis mapping ivec3 tensor4d_idx_to_texel_pos_simple( const TextureMetadata meta, const TensorIndex4D tidx) { ivec3 texel_pos; @@ -278,10 +289,20 @@ ivec3 tensor4d_idx_to_texel_pos_simple( texel_pos = tidx.data.xyz; texel_pos[meta.packed_dim] = div_4(packed_dim_idx); + + // Account for batch concatenation, assuming channels as the concatenation dim + if (meta.sizes.w > 1) { + int channels_ntexels = meta.sizes.z; + if (meta.packed_dim == 2) { + channels_ntexels = div_up_4(channels_ntexels); + } + texel_pos.z += tidx.data.w * channels_ntexels; + } + return texel_pos; } -// Does not account for axis mapping or batches +// Does not account for axis mapping TextureElementIndex tensor4d_idx_to_texture_element_idx_simple( const TextureMetadata meta, const TensorIndex4D tidx) { const int packed_dim_idx = tidx.data[meta.packed_dim]; @@ -289,6 +310,16 @@ TextureElementIndex tensor4d_idx_to_texture_element_idx_simple( tex_idx.pos = tidx.data.xyz; tex_idx.pos[meta.packed_dim] = div_4(packed_dim_idx); tex_idx.comp = mod_4(packed_dim_idx); + + // Account for batch concatenation, assuming channels as the concatenation dim + if (meta.sizes.w > 1) { + int channels_ntexels = meta.sizes.z; + if (meta.packed_dim == 2) { + channels_ntexels = div_up_4(channels_ntexels); + } + tex_idx.pos.z += tidx.data.w * channels_ntexels; + } + return tex_idx; } @@ -316,13 +347,21 @@ void printTensorIndex(const TensorIndex tidx) { ); } -void printTensorIndex4D(const TensorIndex tidx) { +void printTensorIndex4D(const TensorIndex4D tidx) { debugPrintfEXT( "TensorIndex4D: [%u, %u, %u, %u]\\n", - tidx.data[0][0], tidx.data[0][1], tidx.data[0][2], tidx.data[0][3] + tidx.data[0], tidx.data[1], tidx.data[2], tidx.data[3] + ); +} + +void printTextureElementIndex(const TextureElementIndex tex_idx) { + debugPrintfEXT( + "TextureElementIndex: pos=[%d %d %d] comp=%d\\n", + tex_idx.pos.x, tex_idx.pos.y, tex_idx.pos.z, tex_idx.comp ); } + void printBufferMetadata(const BufferMetadata meta) { debugPrintfEXT( "BufferMetadata: ndim=%u numel=%u\\n sizes=[%u %u %u %u %u %u %u %u]\\n dim_order=[%u %u %u %u %u %u %u %u]\\n strides=[%u %u %u %u %u %u %u %u]\\n", diff --git a/backends/vulkan/runtime/graph/ops/glsl/split_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/split_buffer.glsl new file mode 100644 index 00000000000..0505c9e7bcd --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/split_buffer.glsl @@ -0,0 +1,50 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define T ${buffer_scalar_type(DTYPE)} + +${define_active_storage_type("buffer")} +${define_required_extensions(DTYPE)} + +#extension GL_EXT_control_flow_attributes : require + +layout(std430) buffer; + +#include "indexing.glslh" + +${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")} +${layout_declare_tensor(B, "r", "t_input", DTYPE, "buffer")} + +${layout_declare_ubo(B, "BufferMetadata", "outp")} +${layout_declare_ubo(B, "BufferMetadata", "inp")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int split_dim = 0; +layout(constant_id = 4) const int split_idx = 0; +layout(constant_id = 5) const int split_offset = 0; + +void main() { + const uint out_bufi = gl_GlobalInvocationID.x; + if (out_of_bounds(out_bufi, outp)) { + return; + } + + TensorIndex out_tidx = linear_idx_to_tensor_idx(outp, out_bufi); + + TensorIndex input_tidx = out_tidx; + input_tidx.data[div_4(split_dim)][mod_4(split_dim)] += split_offset; + + const uint input_bufi = tensor_idx_to_linear_idx(inp, input_tidx); + + t_out[out_bufi] = t_input[input_bufi]; +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/split_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/split_buffer.yaml new file mode 100644 index 00000000000..fd52c0ac721 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/split_buffer.yaml @@ -0,0 +1,16 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +split_buffer: + parameter_names_with_default_values: + DTYPE: float + STORAGE: buffer + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: split_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/split_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/split_texture.glsl new file mode 100644 index 00000000000..92d7ce548e2 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/split_texture.glsl @@ -0,0 +1,66 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_load_type(DTYPE, "texture3d")} +#define T ${texel_load_component_type(DTYPE, "texture3d")} + +${define_active_storage_type("texture3d")} +${define_required_extensions(DTYPE)} + +#extension GL_EXT_control_flow_attributes : require + +layout(std430) buffer; + +#include "common.glslh" +#include "indexing.glslh" + +${layout_declare_tensor(B, "w", "t_output", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "t_input", DTYPE, "texture3d")} + +${layout_declare_ubo(B, "TextureMetadata", "outp")} +${layout_declare_ubo(B, "TextureMetadata", "inp")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int split_dim = 0; +layout(constant_id = 4) const int split_idx = 0; +layout(constant_id = 5) const int split_offset = 0; + +void main() { + const ivec3 out_pos = ivec3(gl_GlobalInvocationID); + + if (out_of_bounds(out_pos, outp)) { + return; + } + + TensorIndex4D out_tidx = texture_pos_to_tensor4d_idx_simple(outp, out_pos); + + VEC4_T out_texel = VEC4_T(0); + + int limit = min( + 4, outp.sizes[outp.packed_dim] - out_tidx.data[outp.packed_dim]); + + TensorIndex4D input_tidx = out_tidx; + input_tidx.data[split_dim] += split_offset; + + for (int comp = 0; comp < limit; comp++) { + TextureElementIndex input_elem_pos = tensor4d_idx_to_texture_element_idx_simple( + inp, input_tidx); + + VEC4_T input_texel = texelFetch(t_input, input_elem_pos.pos, 0); + out_texel[comp] = input_texel[input_elem_pos.comp]; + + input_tidx.data[outp.packed_dim]++; + } + + imageStore(t_output, out_pos, out_texel); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/split_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/split_texture.yaml new file mode 100644 index 00000000000..89446df831b --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/split_texture.yaml @@ -0,0 +1,15 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +split_texture: + parameter_names_with_default_values: + DTYPE: float + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: split_texture3d diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp deleted file mode 100644 index bd648dbae2d..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp +++ /dev/null @@ -1,317 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include -#include -#include -#include - -namespace vkcompute { - -using utils::ivec3; -using utils::ivec4; -using utils::uvec3; - -void add_copy_offset_node( - ComputeGraph& graph, - const ValueRef in, - const ivec3& range, - const ivec4& src_offset, - const ivec4& dst_offset, - const ValueRef out, - bool calc_out_pos_using_src_chnl, - bool calc_in_pos_using_dst_chnl) { - std::string kernel_name = "copy_offset"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - - auto shader = VK_KERNEL_FROM_STR(kernel_name); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - { - {out, vkapi::kWrite}, - {in, vkapi::kRead}, - }, - // Parameter buffers - {}, - // Push Constants - { - PushConstantDataInfo(&range, sizeof(range), sizeof(ivec4)), - PushConstantDataInfo(&src_offset, sizeof(src_offset), sizeof(ivec4)), - PushConstantDataInfo(&dst_offset, sizeof(dst_offset), sizeof(ivec4)), - }, - // Specialization Constants - {graph.hashed_layout_of(out), - graph.hashed_layout_of(in), - (calc_out_pos_using_src_chnl ? 1 - : calc_in_pos_using_dst_chnl ? 2 - : 0)}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -void add_copy_packed_dim_offset_node( - ComputeGraph& graph, - const ValueRef in, - const ivec3& range, - const ivec4& src_offset, - const ivec4& dst_offset, - const ValueRef out) { - // Check the packed dimension is same for both tensors, also check if the - // packed dimension is Width or Height. Since the function does not support - // channel packing. - VK_CHECK_COND( - graph.packed_dim_of(in) == graph.packed_dim_of(out) && - (graph.packed_dim_of(in) == WHCN::kWidthDim || - graph.packed_dim_of(in) == WHCN::kHeightDim)); - - std::string kernel_name = "copy_packed_dim_offset"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - const std::vector in_sizes = graph.sizes_of(in); - const std::vector out_sizes = graph.sizes_of(out); - - // A copy of range with the last element set to batch size of the input tensor - ivec4 final_range = { - range[0], range[1], range[2], dim_at(in_sizes, kBatch4D)}; - ivec3 global_wg_size = graph.logical_limits_of(out); - - const auto packed_dim = graph.packed_dim_of(in); - // The starting offset in a texel where this tensor will start copying from - const auto src_lane_offset = src_offset[packed_dim] & 0x3; - // The starting offset in a texel where this tensor will start copying to - const auto dst_lane_offset = dst_offset[packed_dim] & 0x3; - - // The total packed texels this tensor will be copied from - // The first texel of tensor data in packed dimension will be copied from - // remaining lanes from current source Hence (4 - src_lane_offset) is added - // to tensor size in packed dimension - const auto src_packed_size = utils::div_up_4( - (4 - src_lane_offset) + utils::val_at(-packed_dim, out_sizes)); - - // The total packed texels this tensor will be copied to - // The first texel of tensor data in packed dimension will be copied to - // remaining lanes from previous write Hence (4 - dst_lane_offset) is added - // to tensor size in packed dimension - const auto dst_packed_size = utils::div_up_4( - (4 - dst_lane_offset) + utils::val_at(-packed_dim, in_sizes)); - - // If the starting src offset is not 0, and the total packed 
texels is - // greater than the source texel range - const bool has_additional_src_work = - src_lane_offset != 0 && src_packed_size > final_range[packed_dim]; - // If the starting dst offset is not 0, and the total packed texels is - // greater than the source texel range - const bool has_additional_dst_work = - dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim]; - - if (has_additional_src_work || has_additional_dst_work) { - global_wg_size[packed_dim]++; // Increase the global work group size in - // packed dimension - final_range[packed_dim]++; // Increase the range in packed dimension - } - - auto shader = VK_KERNEL_FROM_STR(kernel_name); - - graph.execute_nodes().emplace_back(new DispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - global_wg_size, - graph.create_local_wg_size(global_wg_size), - // Inputs and Outputs - { - {out, vkapi::kWrite}, - {out, vkapi::kRead}, - {in, vkapi::kRead}, - }, - // Parameter buffers - {}, - // Push Constants - { - PushConstantDataInfo( - &final_range, sizeof(final_range), sizeof(ivec4)), - PushConstantDataInfo(&src_offset, sizeof(src_offset), sizeof(ivec4)), - PushConstantDataInfo(&dst_offset, sizeof(dst_offset), sizeof(ivec4)), - }, - // Specialization Constants - {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -void add_copy_channel_offset_node( - ComputeGraph& graph, - const ValueRef in, - int32_t channel_range, - int32_t src_channel_offset, - int32_t dst_channel_offset, - const ValueRef out) { - // Likely need to prepad these numbers. - const std::vector in_sizes = graph.sizes_of(in); - const std::vector out_sizes = graph.sizes_of(out); - - VK_CHECK_COND(graph.packed_dim_of(in) == WHCN::kChannelsDim); - VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kChannelsDim); - - // NOTE: This function should be able to support 1d and 2d tensors when - // range=1, src_offset=dst_offset=1. - VK_CHECK_COND(graph.dim_of(in) >= 3, "Src dim should be at least 3"); - VK_CHECK_COND(graph.dim_of(out) >= 3, "Dst dim should be at least 3"); - - VK_CHECK_COND( - dim_at(in_sizes) >= src_channel_offset + channel_range, - "Src channel (", - src_channel_offset, - ") and range (", - channel_range, - ") should be less than or equal to input tensor's channel size (", - dim_at(in_sizes), - ")"); - - VK_CHECK_COND( - dim_at(out_sizes) >= dst_channel_offset + channel_range, - "Dst channel (", - dst_channel_offset, - ") and range (", - channel_range, - ") should be less than or equal to input tensor's channel size (", - dim_at(out_sizes), - ")"); - - VK_CHECK_COND(channel_range >= 0, "Channel range must be non-negative"); - VK_CHECK_COND( - src_channel_offset >= 0, "Src channel offset must be non-negative"); - VK_CHECK_COND( - dst_channel_offset >= 0, "Dst channel offset must be non-negative"); - - std::string kernel_name = "copy_channel_offset"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - int32_t out_channels = dim_at(out_sizes); - - // Copy one batch at a time. - for (int batch_idx = 0; batch_idx < dim_at(in_sizes); batch_idx++) { - // Mapping the tensor NCHW coordinates into texture XYZ coordinates - int32_t dst_first_z = dst_channel_offset / 4; - int32_t dst_last_z = (dst_channel_offset + channel_range - 1) / 4; - - // We copy the entire width and height dimension. For the channel dimension, - // we use the z-dimension of the global_size to specify the texture range. 
- // The shader combines the global invocation id and the dst_offset to get - // the actual coordinate. - - const ivec3 dst_offset{ - 0, 0, dst_first_z + batch_idx * utils::div_up_4(out_channels)}; - - const uvec3 global_size{ - utils::safe_downcast(dim_at(in_sizes)), - utils::safe_downcast(dim_at(in_sizes)), - utils::safe_downcast(dst_last_z - dst_first_z + 1)}; - const uvec3 local_size = graph.create_local_wg_size(global_size); - - const utils::ivec4 range_params = { - static_cast(global_size[0]), - static_cast(global_size[1]), - static_cast(global_size[2]), - channel_range}; - - const ivec4 offset_params = { - dst_offset[0], dst_offset[1], dst_offset[2], dst_channel_offset}; - - auto shader = VK_KERNEL_FROM_STR(kernel_name); - - graph.execute_nodes().emplace_back(new DispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - global_size, - local_size, - // Inputs and Outputs - { - {out, vkapi::kWrite}, - {out, vkapi::kRead}, - {in, vkapi::kRead}, - }, - // Parameter buffers - {}, - // Push Constants - {graph.sizes_pc_of(out), - graph.sizes_pc_of(in), - PushConstantDataInfo(&range_params, sizeof(range_params)), - PushConstantDataInfo(&offset_params, sizeof(offset_params)), - PushConstantDataInfo(&src_channel_offset, sizeof(src_channel_offset))}, - // Specialization Constants - {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}, - // Resize Args - {}, - // Resizing Logic - nullptr)); - } -} - -void add_copy_offset_node( - ComputeGraph& graph, - ValueRef in, - ValueRef range_ref, - ValueRef src_offset_ref, - ValueRef dst_offset_ref, - ValueRef out) { - ivec3 range = utils::make_ivec3(*graph.get_int_list(range_ref)); - ivec3 src = utils::make_ivec3(*graph.get_int_list(src_offset_ref)); - ivec3 dst = utils::make_ivec3(*graph.get_int_list(dst_offset_ref)); - - ivec4 src_offset = {src[0], src[1], src[2], 0}; - ivec4 dst_offset = {dst[0], dst[1], dst[2], 0}; - - add_copy_offset_node( - graph, in, range, src_offset, dst_offset, out, false, false); -} - -void copy_offset(ComputeGraph& graph, const std::vector& args) { - add_copy_offset_node(graph, args[0], args[1], args[2], args[3], args[4]); -} - -void copy_channel_offset( - ComputeGraph& graph, - const std::vector& args) { - ValueRef in = args[0]; - ValueRef channel_range_ref = args[1]; - ValueRef src_channel_offset_ref = args[2]; - ValueRef dst_channel_offset_ref = args[3]; - ValueRef out = args[4]; - - auto channel_range = graph.extract_scalar(channel_range_ref); - auto src_channel_offset = - graph.extract_scalar(src_channel_offset_ref); - auto dst_channel_offset = - graph.extract_scalar(dst_channel_offset_ref); - - add_copy_channel_offset_node( - graph, in, channel_range, src_channel_offset, dst_channel_offset, out); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(etvk.copy_offset, copy_offset); - VK_REGISTER_OP(etvk.copy_channel_offset, copy_channel_offset); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.h b/backends/vulkan/runtime/graph/ops/impl/Copy.h deleted file mode 100644 index 41956d482d9..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include - -namespace vkcompute { - -// add_copy_offset_node resumes the vkCmdCopyImage command. 
It copies the -// texture extents specified by the range, src_offset, and dst_offset (all are -// in texture coordinate (x, y, z) from the input image to the output image. -// src_offset.w and dst_offset.w may contain channel size information. -// -// It is possible to have input and output to point to the same image -// object. But when the source range and destination range overlap, the behavior -// is undefined. -// -// boolean flags calc_out_pos_using_src_chnl and calc_in_pos_using_dst_chnl -// can be used to specify an indexing function in the shader -// If calc_out_pos_using_src_chnl is set to true channel and batch index will be -// calculated based on source channel size and will be used to determine -// destination texel position. -// -// If calc_in_pos_using_dst_chnl is set to truechannel and batch index will be -// calculated based on destination channel size and will be used to determine -// source texel position. -// -// If both are true calc_out_pos_using_src_chnl is picked. If both are false no -// index calculation happens. -void add_copy_offset_node( - ComputeGraph& graph, - const ValueRef in, - const utils::ivec3& range, - const utils::ivec4& src_offset, - const utils::ivec4& dst_offset, - const ValueRef out, - bool calc_out_pos_using_src_chnl, - bool calc_in_pos_using_dst_chnl); - -// add_copy_packed_dim_offset_node behaves similar to add_copy_node, except that -// its used when copying packed dimension, if tensor is width or height packed. -// src_offset.w and dst_offset.w may contain channel size information. -// -// It copies the texture extents specified by the range, src_offset, and -// dst_offset (all are in texture coordinate (x, y, z) from the input image to -// the output image. -void add_copy_packed_dim_offset_node( - ComputeGraph& graph, - const ValueRef in, - const utils::ivec3& range, - const utils::ivec4& src_offset, - const utils::ivec4& dst_offset, - const ValueRef out); - -// add_copy_channel_offset_node behaves similar to add_copy_node, except that it -// works on the channel dimensions of the tensor (up to 4 dimensions in NCHW). -// The range and offset arguments are in the tensor coordinate. It assumes the -// underlying texture is channel-packed. -// -// This function is specialized implementation for copying -// channel packed values. The complication comes from when reading / writing the -// channel dimension on indices that are not aligned to packing, we will need -// be careful about the boundaries. 
-// -// It achieves the following: -// out[:, dst_channel_offset:dst_channel_offset + channel_range, :, :] = -// in [:, src_channel_offset:src_channel_offset + channel_range, :, :] -void add_copy_channel_offset_node( - ComputeGraph& graph, - const ValueRef in, - int32_t channel_range, - int32_t src_channel_offset, - int32_t dst_channel_offset, - const ValueRef out); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp index 72c1637a2c9..2b42c0bd150 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp @@ -14,8 +14,6 @@ #include #include -#include - namespace vkcompute { namespace { diff --git a/backends/vulkan/runtime/graph/ops/impl/Split.cpp b/backends/vulkan/runtime/graph/ops/impl/Split.cpp index f87af08ee69..4e62ae8806d 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Split.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Split.cpp @@ -8,134 +8,131 @@ #include -#include +#include +#include #include -#include #include + #include -namespace vkcompute { +#include -void add_split_with_sizes_default_node( - ComputeGraph& graph, - ValueRef in, - const std::vector& split_sizes, - int64_t dim, - ValueRef out_list_ref) { - const ValueListPtr out_list = graph.get_value_list(out_list_ref); +namespace vkcompute { - const int64_t input_ndim = graph.dim_of(in); +using utils::GPUMemoryLayout; +using utils::StorageType; + +void resize_split_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& resize_args) { + (void)resize_args; + const ValueRef input = args.at(0).refs.at(0); + const ValueRef split_sizes_ref = args.at(1).refs.at(0); + const ValueRef dim_ref = args.at(2).refs.at(0); + const ValueRef out_list_ref = args.at(3).refs.at(0); + + const ValueListPtr out_list = graph->get_value_list(out_list_ref); + const std::vector split_sizes = + *(graph->get_int_list(split_sizes_ref)); + const int64_t dim = graph->extract_scalar(dim_ref); + + const int64_t input_ndim = graph->dim_of(input); const DimIndex dim_index = dim < 0 ? static_cast(dim) : static_cast(dim - input_ndim); - VK_CHECK_COND(out_list->size() == split_sizes.size()); + std::vector input_sizes = graph->sizes_of(input); for (int split_idx = 0; split_idx < split_sizes.size(); split_idx++) { const int64_t split_size = split_sizes.at(split_idx); const ValueRef out_ref = out_list->at(split_idx); - VK_CHECK_COND(dim_at(graph.sizes_of(out_ref), dim_index) == split_size); - } - - const auto packed_dim = graph.packed_dim_of(in); - const auto packed_dim_index = static_cast(kWidth4D - packed_dim); + std::vector out_sizes = input_sizes; + out_sizes.at(dim_index) = split_size; - // Index of dimension to be concatenated in (w, h, c * b) coordinate system - const auto dim_xyz_index = std::min(2, -dim_index - 1); - - utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false); - utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false); - - const bool is_splitting_channel = (dim_index == kChannel4D); - - // if splitting channels - if (is_splitting_channel) { - // set source offset w as channel size of the input tensor - src_offset[3] = dim_at(graph.sizes_of(in), kChannel4D); + graph->virtual_resize(out_ref, out_sizes); } +} - for (ValueRef out_ref : *out_list) { - // Doesn't need to use split_size since we have already verified that the - // output tensor's size matches with the split_size. 
- const auto out_channel_size = dim_at(graph.sizes_of(out_ref), kChannel4D); - const utils::ivec3 range = graph.logical_limits_of(out_ref); - - if (dim_index == packed_dim_index) { - // if splitting channels, use add_copy_channel_offset_node function as - // add_copy_packed_dim_offset_node does not support channel packing - if (is_splitting_channel) { - add_copy_channel_offset_node( - graph, in, out_channel_size, src_offset[2], dst_offset[2], out_ref); - src_offset[dim_xyz_index] += out_channel_size; - } else { - // dst_offset[3] is not used now but will be used in the future when - // add_copy_packed_dim_offset_node will support channel packing - // - // set destination offset w as channel size of the output tensor if - // splitting channel - dst_offset[3] = is_splitting_channel ? out_channel_size : 0; - add_copy_packed_dim_offset_node( - graph, in, range, src_offset, dst_offset, out_ref); - src_offset[dim_xyz_index] += - dim_at(graph.sizes_of(out_ref), packed_dim_index); - } - } else { - // set destination offset w as channel size of the output tensor if - // splitting channels - dst_offset[3] = is_splitting_channel ? out_channel_size : 0; - add_copy_offset_node( - graph, in, range, src_offset, dst_offset, out_ref, false, true); - src_offset[dim_xyz_index] += - is_splitting_channel ? out_channel_size : range[dim_xyz_index]; - } +void add_split_node( + ComputeGraph& graph, + const ValueRef input, + const std::vector& split_sizes, + const int64_t dim, + const ValueRef out, + const int split_idx) { + std::string kernel_name = "split"; + kernel_name.reserve(kShaderNameReserve); + add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + + vkapi::ParamsBindList param_ubos = { + graph.meta_ubo(out), graph.meta_ubo(input)}; + + int64_t dim_whcn = nchw_dim_to_whcn_dim(dim, graph.dim_of(input)); + + // Calculate the offset for this split by summing previous split sizes + int64_t split_offset = 0; + for (int i = 0; i < split_idx; i++) { + split_offset += split_sizes[i]; } + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + default_pick_global_wg_size, + default_pick_local_wg_size, + // Inputs and Outputs + {{out, vkapi::kWrite}, {input, vkapi::kRead}}, + // Shader params buffers + param_ubos, + // Push Constants + {}, + // Specialization Constants + {utils::safe_downcast(dim_whcn), + static_cast(split_idx), + static_cast(split_offset)}, + // Resize Args + {}, + // Resizing Logic + nullptr)); } -void add_split_with_sizes_default_node( +void add_split_with_sizes_node( ComputeGraph& graph, - ValueRef in, - ValueRef split_sizes_ref, - ValueRef dim_ref, - ValueRef out) { - int64_t dim = graph.extract_scalar(dim_ref); - std::vector split_sizes = *(graph.get_int_list(split_sizes_ref)); + const ValueRef input, + const std::vector& split_sizes, + const int64_t dim, + const ValueRef out_list_ref) { + const ValueListPtr out_list = graph.get_value_list(out_list_ref); + + VK_CHECK_COND(out_list->size() == split_sizes.size()); - add_split_with_sizes_default_node(graph, in, split_sizes, dim, out); + // Dispatch a shader for each output tensor + for (int split_idx = 0; split_idx < split_sizes.size(); split_idx++) { + const ValueRef out_ref = out_list->at(split_idx); + add_split_node(graph, input, split_sizes, dim, out_ref, split_idx); + } } void split_with_sizes_copy_default( ComputeGraph& graph, const std::vector& args) { - add_split_with_sizes_default_node(graph, args[0], args[1], args[2], 
args[3]); -} - -void add_split_tensor_node( - ComputeGraph& graph, - ValueRef in, - ValueRef split_size_ref, - ValueRef dim_ref, - ValueRef out) { - const int64_t split_size = graph.extract_scalar(split_size_ref); - const int64_t dim = graph.extract_scalar(dim_ref); - - const int64_t input_ndim = graph.dim_of(in); - const DimIndex dim_index = dim < 0 ? static_cast(dim) - : static_cast(dim - input_ndim); - const int64_t size = dim_at(graph.sizes_of(in), dim_index); - const std::vector split_sizes(size / split_size, split_size); + ValueRef input = args[0]; + ValueRef split_sizes_ref = args[1]; + ValueRef dim_ref = args[2]; + ValueRef out_list_ref = args[3]; - add_split_with_sizes_default_node(graph, in, split_sizes, dim, out); -} + int64_t dim = graph.extract_scalar(dim_ref); + std::vector split_sizes = *(graph.get_int_list(split_sizes_ref)); -void split_tensor(ComputeGraph& graph, const std::vector& args) { - add_split_tensor_node(graph, args[0], args[1], args[2], args[3]); + add_split_with_sizes_node(graph, input, split_sizes, dim, out_list_ref); } REGISTER_OPERATORS { VK_REGISTER_OP( aten.split_with_sizes_copy.default, split_with_sizes_copy_default); - VK_REGISTER_OP(aten.split.Tensor, split_tensor); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h index b62bf661995..05234c7790f 100644 --- a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h +++ b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h @@ -69,7 +69,7 @@ template < std::is_integral::value && std::is_signed::value, int>::type = 0> T nchw_dim_to_whcn_dim(const T& nchw_dim, const int64_t ndim) { - return ndim - 1 - nchw_dim; + return ndim - 1 - normalize(nchw_dim, ndim); } } // namespace vkcompute diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index f59c3e30aeb..b21a8458a89 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -6,7 +6,6 @@ import itertools - from collections import namedtuple from typing import Callable @@ -1519,64 +1518,11 @@ def get_split_with_sizes_inputs(): test_suite.layouts = [ "utils::kWidthPacked", - "utils::kHeightPacked", - "utils::kChannelsPacked", - ] - test_suite.data_gen = "make_seq_tensor" - test_suite.dtypes = ["at::kFloat"] - return test_suite - - -@register_test_suite("aten.split.Tensor") -def get_split_tensor_inputs(): - test_suite = VkTestSuite( - [ - # Split on Width - ((S1, 7, 10, 12), 12, 3), - ((S1, 7, 10, 12), 3, 3), - ((S1, 7, 10, 12), 1, 3), - ((7, 10, 12), 12, 2), - ((7, 10, 12), 3, 2), - ((7, 10, 12), 1, 2), - ((10, 12), 12, 1), - ((10, 12), 3, 1), - ((10, 12), 1, 1), - ((12,), 12, 0), - ((12,), 3, 0), - ((12,), 1, 0), - # Split on Height - ((S1, 7, 12, 8), 12, 2), - ((S1, 7, 12, 8), 3, 2), - ((S1, 7, 12, 8), 1, 2), - ((7, 12, 8), 12, 1), - ((7, 12, 8), 3, 1), - ((7, 12, 8), 1, 1), - ((12, 8), 12, 0), - ((12, 8), 3, 0), - ((12, 8), 1, 0), - # Split on Batch - ((12, 7, 10, 10), 12, 0), - ((12, 7, 10, 10), 3, 0), - ((12, 7, 10, 10), 1, 0), - # Split on Channel - ((7, 15, 10, 10), 15, 1), - ((7, 15, 10, 10), 5, 1), - ((7, 15, 10, 10), 3, 1), - ((7, 15, 10, 10), 1, 1), - ((15, 10, 10), 15, 0), - ((15, 10, 10), 5, 0), - ((15, 10, 10), 3, 0), - ((15, 10, 10), 1, 0), - ] - ) - - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kHeightPacked", "utils::kChannelsPacked", ] test_suite.data_gen = "make_seq_tensor" test_suite.dtypes = ["at::kFloat"] + 
test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"] return test_suite From ae55647bd014605473862ccd7f5eca571a772b1f Mon Sep 17 00:00:00 2001 From: ssjia Date: Wed, 12 Nov 2025 15:22:34 -0800 Subject: [PATCH 2/7] [ET-VK] Better separation of quantized vs non-quantized memory layouts As title. Make sure that ops that do not support quantized tensors do not get assigned memory layouts that are intended for quantized tensors. Differential Revision: [D86910639](https://our.internmc.facebook.com/intern/diff/D86910639/) [ghstack-poisoned] --- .../vulkan/_passes/tag_memory_meta_pass.py | 5 +- backends/vulkan/utils.py | 48 ++++++++++++------- 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/backends/vulkan/_passes/tag_memory_meta_pass.py b/backends/vulkan/_passes/tag_memory_meta_pass.py index 43796c043c8..00b6c62d5d2 100644 --- a/backends/vulkan/_passes/tag_memory_meta_pass.py +++ b/backends/vulkan/_passes/tag_memory_meta_pass.py @@ -226,9 +226,10 @@ def get_arg_tensor_source_repset( """ arg_node = op_node.args[arg_i] - # For non-tensor arguments, return ANY_STORAGE + # For non-tensor arguments, return ALL_STORAGES_REPSET so that the respset does + # not appear to be empty. if not utils.is_tensor_arg_node(arg_node): - return utils.ANY_STORAGE + return utils.ALL_STORAGES_REPSET # Special case for cat - use the first tensor in the list as representative if isinstance(arg_node, list): diff --git a/backends/vulkan/utils.py b/backends/vulkan/utils.py index fca8173ffb7..2ca2ddf19b7 100644 --- a/backends/vulkan/utils.py +++ b/backends/vulkan/utils.py @@ -8,26 +8,18 @@ from typing import Any, List, Optional, Set, Tuple, Union import torch - from executorch.backends.vulkan.serialization.vulkan_graph_schema import ( VkMemoryLayout, VkStorageType, ) - from executorch.exir.backend.canonical_partitioners.config_partitioner import ( format_target_name, ) - from executorch.exir.dialects.edge._ops import EdgeOpOverload - from executorch.exir.tensor import TensorSpec - from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param - from torch._subclasses.fake_tensor import FakeTensor, FakeTensorConverter - from torch.export import ExportedProgram - from torch.export.exported_program import InputKind from torch.export.graph_signature import TensorArgument @@ -399,10 +391,23 @@ def node_has_target(node: Any, target: str): VkStorageType.TEXTURE_3D, } +# Memory layouts available to non-quantized tensors all_memory_layouts: Set[VkMemoryLayout] = { VkMemoryLayout.TENSOR_WIDTH_PACKED, VkMemoryLayout.TENSOR_HEIGHT_PACKED, VkMemoryLayout.TENSOR_CHANNELS_PACKED, +} + +# Memory layouts available to quantized tensors +all_quantized_memory_layouts: Set[VkMemoryLayout] = { + VkMemoryLayout.PACKED_INT8_4W4C, + VkMemoryLayout.PACKED_INT8_4H4W, +} + +universal_memory_layout_set: Set[VkMemoryLayout] = { + VkMemoryLayout.TENSOR_WIDTH_PACKED, + VkMemoryLayout.TENSOR_HEIGHT_PACKED, + VkMemoryLayout.TENSOR_CHANNELS_PACKED, VkMemoryLayout.PACKED_INT8_4W4C, VkMemoryLayout.PACKED_INT8_4H4W, } @@ -761,7 +766,7 @@ def make_filtered_tensor_repset( ## Convenience TensorRepSet definitions -PACKED_INT8_4W4C_BUFFER = TensorRepSet({VkMemoryLayout.PACKED_INT8_4W4C}, set()) +# Only includes memory layouts that can be used by non-quantized tensors CONTIGUOUS_ANY = TensorRepSet( {VkMemoryLayout.TENSOR_WIDTH_PACKED}, {VkMemoryLayout.TENSOR_WIDTH_PACKED} @@ -782,9 +787,18 @@ def make_filtered_tensor_repset( ANY_TEXTURE = TensorRepSet(set(), all_memory_layouts) ANY_BUFFER = TensorRepSet(all_memory_layouts, 
set()) - ANY_STORAGE = TensorRepSet(all_memory_layouts, all_memory_layouts) + +# Only includes memory layouts that can be used by quantized tensors + +PACKED_INT8_4W4C_BUFFER = TensorRepSet({VkMemoryLayout.PACKED_INT8_4W4C}, set()) + +# Special use RepSets + NO_STORAGE = TensorRepSet(set(), set()) +ALL_STORAGES_REPSET = TensorRepSet( + universal_memory_layout_set, universal_memory_layout_set +) class TensorRepSetList: @@ -908,19 +922,19 @@ def __init__( # noqa: C901 # Now, go through the arguments of the operator and create a filtered repset # for each based on the actual tensor value. args_repset_list = TensorRepSetList([]) - common_arg_repset = ANY_STORAGE + common_arg_repset = ALL_STORAGES_REPSET for i, arg_node in enumerate(op_node.args): arg_repset = inputs_repsets[i] - # Use ANY_STORAGE for non-tensor nodes so they don't cause the op repsets to - # appear empty + # Use ALL_STORAGES_REPSET for non-tensor nodes so they don't cause the op + # repsets to appear empty if not is_tensor_arg_node(arg_node): - args_repset_list.append(ANY_STORAGE) + args_repset_list.append(ALL_STORAGES_REPSET) # NO_STORAGE is used to denote that an input is either a non tensor arg or # a weight tensor that is not prepacked. Similar to the above, use - # ANY_STORAGE in this case. + # ALL_STORAGES_REPSET in this case. elif arg_repset.is_empty(): - args_repset_list.append(ANY_STORAGE) + args_repset_list.append(ALL_STORAGES_REPSET) else: assert not arg_repset.is_empty() @@ -933,7 +947,7 @@ def __init__( # noqa: C901 # Repeat for output tensors. outs_repset_list = TensorRepSetList([]) - common_out_repset = ANY_STORAGE + common_out_repset = ALL_STORAGES_REPSET if num_tensors_in_node(op_node) == 1: common_out_repset = make_filtered_tensor_repset( op_node.meta["val"], outputs_repsets[0], texture_limits From 4c881d3c4dc55a43c73a0242d7b8c255eef9c2a2 Mon Sep 17 00:00:00 2001 From: ssjia Date: Wed, 12 Nov 2025 15:22:38 -0800 Subject: [PATCH 3/7] [ET-VK] Add some utility compile options + improve export script Title says it all! Add two additional export options: 1. `skip_memory_planning` - skips the memory planning pass which can be useful for debugging. 2. `small_texture_limits` - sets the default texture limit to be (2048, 2048, 2048) which is compatible with more devices (i.e. 
desktop/laptop GPUs) compared to the default (16384, 16384, 2048) which is more targeted for mobile GPUs Also adds some improvements to the export script that were made while debugging the `YOLO_NAS` model (https://github.com/pytorch/executorch/issues/15700) Differential Revision: [D86910640](https://our.internmc.facebook.com/intern/diff/D86910640/) [ghstack-poisoned] --- backends/vulkan/test/utils.py | 57 ++++++++++++++----- backends/vulkan/vulkan_preprocess.py | 61 +++++++++++--------- examples/vulkan/export.py | 83 +++++++++++++++++++++++++--- 3 files changed, 154 insertions(+), 47 deletions(-) diff --git a/backends/vulkan/test/utils.py b/backends/vulkan/test/utils.py index 90edc094ec7..bece35a2e35 100644 --- a/backends/vulkan/test/utils.py +++ b/backends/vulkan/test/utils.py @@ -8,18 +8,14 @@ import logging from collections import OrderedDict from copy import deepcopy - from enum import auto, Enum from typing import Any, List, Optional, Tuple import executorch.backends.vulkan.utils as utils - import torch - from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner from executorch.backends.vulkan.vulkan_preprocess import VulkanBackend from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner - from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import ( get_symmetric_quantization_config, XNNPACKQuantizer, @@ -36,7 +32,6 @@ ) from executorch.extension.pytree import tree_flatten from torch.export import export - from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e @@ -275,16 +270,25 @@ def check_outputs_equal( ) return result else: + result = True for i in range(len(ref_output)): - if not torch.allclose( - model_output[i], ref_output[i], atol=atol, rtol=rtol - ): - print(f"\n=== Output {i} comparison failed ===") - print_tensor_comparison_errors( - model_output[i], ref_output[i], atol, rtol - ) - return False - return True + if isinstance(ref_output[i], torch.Tensor): + if not torch.allclose( + model_output[i], ref_output[i], atol=atol, rtol=rtol + ): + print(f"\n=== Output {i} comparison failed ===") + print_tensor_comparison_errors( + model_output[i], ref_output[i], atol, rtol + ) + result = False + elif isinstance(ref_output[i], int): + if not model_output[i] == ref_output[i]: + print(f"\n=== Output {i} comparison failed ===") + print(f"{model_output[i]} vs {ref_output[[i]]}") + result = False + else: + print(f"WARNING: Output {i} has type {type(ref_output[i])}") + return result else: # If one output, eager returns tensor while executor tuple of size 1 result = torch.allclose(model_output[0], ref_output, atol=atol, rtol=rtol) @@ -326,7 +330,7 @@ def run_and_check_output( model_output = executorch_module.run_method("forward", tuple(inputs_flattened)) # Generate reference outputs using the reference model - ref_output = reference_model(*sample_inputs) + ref_output, _ = tree_flatten(reference_model(*sample_inputs)) # Check if outputs are equal return check_outputs_equal( @@ -805,3 +809,26 @@ def find_bad_operators( "all_operators": all_operators, "test_count": test_count, } + + +def make_indent(indent_level): + indent_str = "" + for i in range(indent_level): + indent_str += " " + return indent_str + + +def print_output(outputs, n: int = 0, indent_level: int = 0): + if isinstance(outputs, (list, tuple)): + print(f"{make_indent(indent_level)}output_{n} = {type(outputs)}") + new_indent_level = indent_level + 2 + for n, test_out in enumerate(outputs): + print_output(test_out, n, new_indent_level) + 
elif isinstance(outputs, torch.Tensor): + print( + f"{make_indent(indent_level)}output_{n} = test_utils.random_uniform_tensor({outputs.shape}, low={outputs.min().item()}, high={outputs.max().item()}, dtype={outputs.dtype})" + ) + elif isinstance(outputs, int): + print(f"{make_indent(indent_level)}output_{n} = {outputs}") + else: + print(f"{make_indent(indent_level)}output_{n} = {type(outputs)}") diff --git a/backends/vulkan/vulkan_preprocess.py b/backends/vulkan/vulkan_preprocess.py index 3a3f6cdf4fe..3ccbdc8ab85 100644 --- a/backends/vulkan/vulkan_preprocess.py +++ b/backends/vulkan/vulkan_preprocess.py @@ -6,6 +6,7 @@ # pyre-strict +import copy from functools import partial from typing import Any, Callable, Dict, final, List @@ -127,15 +128,21 @@ def preprocess( # noqa: C901 module_compile_spec: List[CompileSpec], ) -> PreprocessResult: compile_options = parse_compile_spec(module_compile_spec) - limits_x = compile_options.get( - "texture_limits_x", utils.DEFAULT_TEXTURE_LIMITS[0] - ) - limits_y = compile_options.get( - "texture_limits_y", utils.DEFAULT_TEXTURE_LIMITS[1] - ) - limits_z = compile_options.get( - "texture_limits_z", utils.DEFAULT_TEXTURE_LIMITS[2] - ) + + default_texture_limits = copy.deepcopy(utils.DEFAULT_TEXTURE_LIMITS) + # 2048 is the typical limit value for 3D textures, but mobile GPUs often support + # 16384. Since the Vulkan delegate primarily targets mobile GPUs at the moment, + # 16394 is the default texture limit used. This option is provided as a + # convenient way to switch to using a limit of 2048 for image textures which + # will be compatible with most GPUs. + if compile_options.get("small_texture_limits", False): + default_texture_limits[0] = 2048 + default_texture_limits[1] = 2048 + default_texture_limits[2] = 2048 + + limits_x = compile_options.get("texture_limits_x", default_texture_limits[0]) + limits_y = compile_options.get("texture_limits_y", default_texture_limits[1]) + limits_z = compile_options.get("texture_limits_z", default_texture_limits[2]) texture_limits = (limits_x, limits_y, limits_z) default_storage_type = compile_options.get( @@ -204,22 +211,26 @@ def preprocess( # noqa: C901 # Finally, apply dynamic shape passes and memory planning pass. These passes # must be applied only when the graph structure is finalized. - greedy_memory_planning = partial(greedy, allow_overlapping_allocations=False) - mem_planning_suite = MemoryPlanningAlgorithmSuite( - algo_list=[greedy_memory_planning] - ) - # This is a workaround to allow the memory planning pass to work without having - # to first apply ToOutVarPass(). See the `greedy()` function in - # `exir.memory_planning`; if this attribute isn't set, assertions in - # `collect_spec_from_nodes()` will fail. - program.graph_module.encounter_to_out_var_failure = True - program = apply_passes( - program, - [ - ConstraintBasedSymShapeEvalPass(), - MemoryPlanningPass(memory_planning_algo=mem_planning_suite), - ], - ) + final_passes = [ + ConstraintBasedSymShapeEvalPass(), + ] + if not compile_options.get("skip_memory_planning", False): + greedy_memory_planning = partial( + greedy, allow_overlapping_allocations=False + ) + mem_planning_suite = MemoryPlanningAlgorithmSuite( + algo_list=[greedy_memory_planning] + ) + # This is a workaround to allow the memory planning pass to work without having + # to first apply ToOutVarPass(). See the `greedy()` function in + # `exir.memory_planning`; if this attribute isn't set, assertions in + # `collect_spec_from_nodes()` will fail. 
+ program.graph_module.encounter_to_out_var_failure = True + final_passes.append( + MemoryPlanningPass(memory_planning_algo=mem_planning_suite) + ) + + program = apply_passes(program, final_passes) graph_builder = VkGraphBuilder( program, diff --git a/examples/vulkan/export.py b/examples/vulkan/export.py index dace37e5473..c3043a97bac 100644 --- a/examples/vulkan/export.py +++ b/examples/vulkan/export.py @@ -11,28 +11,27 @@ import argparse import logging -import backends.vulkan.test.utils as test_utils - +import executorch.backends.vulkan.test.utils as test_utils import torch import torchvision - from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner from executorch.devtools import BundledProgram from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite from executorch.devtools.bundled_program.serialize import ( serialize_from_bundled_program_to_flatbuffer, ) +from executorch.examples.models import MODEL_NAME_TO_MODEL +from executorch.examples.models.model_factory import EagerModelFactory from executorch.exir import to_edge_transform_and_lower from executorch.extension.export_util.utils import save_pte_program from executorch.extension.pytree import tree_flatten from torch.export import Dim, export -from ..models import MODEL_NAME_TO_MODEL -from ..models.model_factory import EagerModelFactory - FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" logging.basicConfig(level=logging.INFO, format=FORMAT) +import urllib + def is_vision_model(model_name): if model_name in [ @@ -70,6 +69,38 @@ def get_vision_model_dynamic_shapes(): ) +def get_dog_image_tensor(image_size=224, normalization="imagenet"): + url, filename = ( + "https://github.com/pytorch/hub/raw/master/images/dog.jpg", + "dog.jpg", + ) + try: + urllib.URLopener().retrieve(url, filename) + except: + urllib.request.urlretrieve(url, filename) + + from PIL import Image + from torchvision import transforms + + input_image = Image.open(filename).convert("RGB") + + transforms_list = [ + transforms.Resize((image_size, image_size)), + transforms.ToTensor(), + ] + if normalization == "imagenet": + transforms_list.append( + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ) + + preprocess = transforms.Compose(transforms_list) + + input_tensor = preprocess(input_image) + input_batch = input_tensor.unsqueeze(0) + input_batch = (input_batch,) + return input_batch + + def init_model(model_name): if model_name == "convnext_small": return torchvision.models.convnext_small() @@ -77,13 +108,29 @@ def init_model(model_name): return torchvision.models.densenet161() if model_name == "shufflenet_v2_x1_0": return torchvision.models.shufflenet_v2_x1_0() + if model_name == "YOLO_NAS_S": + try: + from super_gradients.common.object_names import Models + from super_gradients.training import models + except ImportError: + raise ImportError( + "Please install super-gradients to use the YOLO_NAS_S model." 
+ ) + + return models.get(Models.YOLO_NAS_S, pretrained_weights="coco") return None def get_sample_inputs(model_name): + # Lock the random seed for reproducibility + torch.manual_seed(42) + if is_vision_model(model_name): - return get_vision_model_sample_input() + return (get_vision_model_sample_input(),) + if model_name == "YOLO_NAS_S": + input_batch = get_dog_image_tensor(640) + return input_batch return None @@ -117,6 +164,24 @@ def main() -> None: "False", ) + parser.add_argument( + "--small_texture_limits", + action=argparse.BooleanOptionalAction, + default=False, + help="sets the default texture limit to be (2048, 2048, 2048) which is " + "compatible with more devices (i.e. desktop/laptop GPUs) compared to the " + "default (16384, 16384, 2048) which is more targeted for mobile GPUs. Default " + "is False.", + ) + + parser.add_argument( + "--skip_memory_planning", + action=argparse.BooleanOptionalAction, + default=False, + help="Skips memory planning pass while lowering, which can be used for " + "debugging. Default is False.", + ) + parser.add_argument( "-s", "--strict", @@ -189,6 +254,10 @@ def main() -> None: if args.force_fp16: compile_options["force_fp16"] = True + if args.skip_memory_planning: + compile_options["skip_memory_planning"] = True + if args.small_texture_limits: + compile_options["small_texture_limits"] = True logging.info(f"Exporting model {args.model_name} with Vulkan delegate") From 63f76ddf003bb52205b22ed2b12ede668ce6737c Mon Sep 17 00:00:00 2001 From: ssjia Date: Wed, 12 Nov 2025 15:22:42 -0800 Subject: [PATCH 4/7] [ET-VK][ez] Migrate slice/select shaders to use BufferMetadata/TextureMetadata Title says it all! Motivation: code simplification and allows these ops to handle high dim tensors. Differential Revision: [D86910641](https://our.internmc.facebook.com/intern/diff/D86910641/) [ghstack-poisoned] --- .../runtime/graph/ops/glsl/select.glslh | 81 +++++++++++-------- .../vulkan/runtime/graph/ops/glsl/slice.glslh | 58 +++++++------ .../graph/ops/glsl/transfer_buffer.glsl | 28 +++---- .../graph/ops/glsl/transfer_texture.glsl | 59 ++++++-------- .../runtime/graph/ops/impl/Transfer.cpp | 34 +++----- 5 files changed, 129 insertions(+), 131 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/select.glslh b/backends/vulkan/runtime/graph/ops/glsl/select.glslh index 6509015b4b6..5390e2a4bb2 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/select.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/select.glslh @@ -9,70 +9,87 @@ #ifndef SELECT_GLSLH #define SELECT_GLSLH -#ifndef USING_BUFFER +#ifdef USING_BUFFER /* - * Enable the fast path if a texel loaded from the input texture can be used as - * is to store to the output texture. The following conditions must be met: + * Converts output tensor indices to input tensor indices for the select operation + * on buffer storage. * - * 1. The input and output textures have the same packed dimension. - * 2. The selected_dim must not be the packed dimension of the input. - * 3. The packed dimension of the input must "map" to the packed dimension of - * the output. This occurs if selected_dim is greater than the packed dimension - * of the input. + * This is done by "inserting" the select index at the selected_dim in the input + * tensor index. 
+ * + * Parameters assumed to be defined: + * - inp: BufferMetadata + * - selected_dim + * - index */ -bool can_use_fast_path() { - if (out_packed_dim != in_packed_dim) { - return false; +TensorIndex out_tidx_to_in_tidx(const TensorIndex out_tidx) { + TensorIndex in_tidx; + initialize(in_tidx); + + int in_size = int(size_at(inp, selected_dim)); + int adjusted_index = index; + if (index < 0) { + adjusted_index = index + in_size; } - if (selected_dim <= in_packed_dim) { - return false; + + // Copy indices before selected_dim + for (int d = 0; d < selected_dim; d++) { + in_tidx.data[div_4(d)][mod_4(d)] = idx_at(out_tidx, d); } - return true; + + // Insert the selected index + in_tidx.data[div_4(selected_dim)][mod_4(selected_dim)] = adjusted_index; + + // Copy indices after selected_dim (shifted by 1) + for (int d = selected_dim; d < int_ndim(inp) - 1; d++) { + in_tidx.data[div_4(d + 1)][mod_4(d + 1)] = idx_at(out_tidx, d); + } + + return in_tidx; } -#endif // USING_BUFFER +#else // texture storage /* - * Given an output tensor index, return the corresponding input tensor index for - * the select operator. This is done by "inserting" the select index at the - * selected_dim in the input tensor index. + * Converts output tensor indices to input tensor indices for the select operation + * on texture storage. * - * A simple example is (note all tensor index are in WHCN order): - * out_tidx = [7, 5, 9] - * selected_dim = 2 - * index = 3 - * in_tidx = [7, 3, 5, 9] + * This is done by "inserting" the select index at the selected_dim in the input + * tensor index. * - * This function assumes that the following variables are defined in the layout: - * - in_sizes + * Parameters assumed to be defined: + * - inp: TextureMetadata * - selected_dim * - index */ -ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) { - ivec4 in_tidx = ivec4(0); +TensorIndex4D out_tidx_to_in_tidx(const TensorIndex4D out_tidx) { + TensorIndex4D in_tidx; + in_tidx.data = ivec4(0); int adjusted_index = index; if (index < 0) { - adjusted_index = index + in_sizes[selected_dim]; + adjusted_index = index + inp.sizes[selected_dim]; } // Handle different dimensions for selection if (selected_dim == 0) { // Select from width dimension - in_tidx = ivec4(adjusted_index, out_tidx.x, out_tidx.y, out_tidx.z); + in_tidx.data = ivec4(adjusted_index, out_tidx.data.x, out_tidx.data.y, out_tidx.data.z); } else if (selected_dim == 1) { // Select from height dimension - in_tidx = ivec4(out_tidx.x, adjusted_index, out_tidx.y, out_tidx.z); + in_tidx.data = ivec4(out_tidx.data.x, adjusted_index, out_tidx.data.y, out_tidx.data.z); } else if (selected_dim == 2) { // Select from channel dimension - in_tidx = ivec4(out_tidx.x, out_tidx.y, adjusted_index, out_tidx.z); + in_tidx.data = ivec4(out_tidx.data.x, out_tidx.data.y, adjusted_index, out_tidx.data.z); } else if (selected_dim == 3) { // Select from batch dimension - in_tidx = ivec4(out_tidx.x, out_tidx.y, out_tidx.z, adjusted_index); + in_tidx.data = ivec4(out_tidx.data.x, out_tidx.data.y, out_tidx.data.z, adjusted_index); } return in_tidx; } +#endif // USING_BUFFER + #endif // SELECT_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh index 87325754f4d..0a815c85d66 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh @@ -9,49 +9,61 @@ #ifndef SLICE_GLSLH #define SLICE_GLSLH -#ifndef USING_BUFFER +#include "indexing.glslh" -/** - * Enable the fast path if a texel 
loaded from the input texture can be used as - * is to store to the output texture. The following conditions must be met: +#ifdef USING_BUFFER + +/* + * Converts output tensor indices to input tensor indices for the slice operation + * on buffer storage. * - * 1. The input and output textures have the same packed dimension. - * 2. The select_dim must not be the packed dimension of the input. + * Parameters assumed to be defined: + * - inp: BufferMetadata + * - selected_dim + * - start + * - step */ -bool can_use_fast_path() { - if (out_packed_dim != in_packed_dim) { - return false; - } - if (in_packed_dim == selected_dim) { - return false; +TensorIndex out_tidx_to_in_tidx(const TensorIndex out_tidx) { + TensorIndex in_tidx = out_tidx; + + int in_size = int(size_at(inp, selected_dim)); + int adjusted_start = start; + if (start < 0) { + adjusted_start = start + in_size; } - return true; + + uint out_idx = idx_at(out_tidx, selected_dim); + in_tidx.data[div_4(selected_dim)][mod_4(selected_dim)] = + adjusted_start + int(out_idx) * step; + + return in_tidx; } -#endif // USING_BUFFER +#else // texture storage /* - * Converts output tensor indices to input tensor indices for the slice operation. - * This function maps the output indices to the corresponding input indices based on - * the slice parameters (start, step, selected_dim). + * Converts output tensor indices to input tensor indices for the slice operation + * on texture storage. * - * Parameters assumed to be defined in the layout specifier: - * - in_sizes + * Parameters assumed to be defined: + * - inp: TextureMetadata * - selected_dim * - start * - step */ -ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) { - ivec4 in_tidx = out_tidx; +TensorIndex4D out_tidx_to_in_tidx(const TensorIndex4D out_tidx) { + TensorIndex4D in_tidx = out_tidx; int adjusted_start = start; if (start < 0) { - adjusted_start = start + in_sizes[selected_dim]; + adjusted_start = start + inp.sizes[selected_dim]; } - in_tidx[selected_dim] = adjusted_start + out_tidx[selected_dim] * step; + in_tidx.data[selected_dim] = adjusted_start + out_tidx.data[selected_dim] * step; return in_tidx; } +#endif // USING_BUFFER + #endif // SLICE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl index 7605c59c72f..73b753ccc0b 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl @@ -11,18 +11,23 @@ #define PRECISION ${PRECISION} #define UBO_PARAMS ${UBO_PARAMS} -#define VEC4_T ${texel_type(DTYPE)} #define T ${buffer_scalar_type(DTYPE)} ${define_active_storage_type("buffer")} ${define_required_extensions(DTYPE)} +#extension GL_EXT_control_flow_attributes : require + layout(std430) buffer; -#include "indexing_utils.h" +#include "indexing.glslh" + ${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")} ${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer")} +${layout_declare_ubo(B, "BufferMetadata", "outp")} +${layout_declare_ubo(B, "BufferMetadata", "inp")} + $if UBO_PARAMS: $if OP_NAME == "slice": ${layout_declare_ubo(B, "int", "start")} @@ -32,10 +37,6 @@ $if UBO_PARAMS: ${layout_declare_ubo(B, "int", "index")} layout(push_constant) uniform restrict Block { - ivec4 in_sizes; - ivec4 out_strides; - ivec4 in_strides; - int out_numel; int selected_dim; $if not UBO_PARAMS: $if OP_NAME == "slice": @@ -46,24 +47,19 @@ layout(push_constant) uniform restrict Block { int index; }; -${layout_declare_spec_const(C, "int", 
"out_layout", "DEFAULT_LAYOUT")} -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} - -const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); - layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; #include "${OP_NAME}.glslh" void main() { - const int out_bufi = ivec3(gl_GlobalInvocationID).x; - if (out_bufi >= out_numel) { + const uint out_bufi = gl_GlobalInvocationID.x; + if (out_of_bounds(out_bufi, outp)) { return; } - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); - ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); + TensorIndex out_tidx = linear_idx_to_tensor_idx(outp, out_bufi); + TensorIndex in_tidx = out_tidx_to_in_tidx(out_tidx); - const int in_bufi = tidx_to_bufi(in_tidx, in_strides); + const uint in_bufi = tensor_idx_to_linear_idx(inp, in_tidx); t_out[out_bufi] = t_in[in_bufi]; } diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl index 0f34713cb43..d2c9c025242 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl @@ -11,19 +11,25 @@ #define PRECISION ${PRECISION} #define UBO_PARAMS ${UBO_PARAMS} -#define VEC4_T ${texel_type(DTYPE)} -#define T ${buffer_scalar_type(DTYPE)} +#define VEC4_T ${texel_load_type(DTYPE, "texture3d")} +#define T ${texel_load_component_type(DTYPE, "texture3d")} ${define_active_storage_type("texture3d")} ${define_required_extensions(DTYPE)} +#extension GL_EXT_control_flow_attributes : require + layout(std430) buffer; -#include "indexing_utils.h" +#include "common.glslh" +#include "indexing.glslh" ${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")} ${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")} +${layout_declare_ubo(B, "TextureMetadata", "outp")} +${layout_declare_ubo(B, "TextureMetadata", "inp")} + $if UBO_PARAMS: $if OP_NAME == "slice": ${layout_declare_ubo(B, "int", "start")} @@ -33,8 +39,6 @@ $if UBO_PARAMS: ${layout_declare_ubo(B, "int", "index")} layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 in_sizes; int selected_dim; $if not UBO_PARAMS: $if OP_NAME == "slice": @@ -45,48 +49,33 @@ layout(push_constant) uniform restrict Block { int index; }; -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int out_packed_dim = unhash_packed_dim(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); -const lowp int in_packed_dim = unhash_packed_dim(in_layout); - layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; #include "${OP_NAME}.glslh" void main() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - ivec4 out_tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, out_packed_dim); + const ivec3 out_pos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(out_tidx, out_sizes))) { + if (out_of_bounds(out_pos, outp)) { return; } - if (can_use_fast_path()) { - ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); - ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim); - VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos)); + TensorIndex4D out_tidx = texture_pos_to_tensor4d_idx_simple(outp, out_pos); + VEC4_T out_texel = VEC4_T(0); - write_texel_lpos(t_out, lpos, in_texel, out_axis_map); - } - else { - VEC4_T out_texel = VEC4_T(0); - for (int texel_i = 0; 
texel_i < 4; ++texel_i) { - ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); - ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim); - int element_idx = in_tidx[in_packed_dim] % 4; - - VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos)); - T selected_value = T(in_texel[element_idx]); + int limit = min( + 4, outp.sizes[outp.packed_dim] - out_tidx.data[outp.packed_dim]); + for (int comp = 0; comp < limit; comp++) { + TensorIndex4D in_tidx = out_tidx_to_in_tidx(out_tidx); - out_texel[texel_i] = selected_value; + TextureElementIndex in_elem_pos = tensor4d_idx_to_texture_element_idx_simple( + inp, in_tidx); - out_tidx[out_packed_dim]++; - } + VEC4_T in_texel = texelFetch(t_in, in_elem_pos.pos, 0); + out_texel[comp] = in_texel[in_elem_pos.comp]; - write_texel_lpos(t_out, lpos, out_texel, out_axis_map); + out_tidx.data[outp.packed_dim]++; } + + imageStore(t_out, out_pos, out_texel); } diff --git a/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp b/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp index 60127ecf9bd..1823271824a 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp @@ -50,15 +50,16 @@ void add_transfer_copy_node( (transfer_type == TransferType::SELECT || graph.is_scalar_or_none(step_ref)); - vkapi::ParamsBindList param_buffers; + vkapi::ParamsBindList param_ubos = {graph.meta_ubo(out), graph.meta_ubo(in)}; + if (!param_is_scalar) { if (transfer_type == TransferType::SELECT) { - param_buffers = { - graph.get_or_create_int_param_buffer(index_or_start_ref, 0)}; + param_ubos.append( + graph.get_or_create_int_param_buffer(index_or_start_ref, 0)); } else { // TransferType::SLICE - param_buffers = { - graph.get_or_create_int_param_buffer(index_or_start_ref, 0), - graph.get_or_create_int_param_buffer(step_ref, 1)}; + param_ubos.append( + graph.get_or_create_int_param_buffer(index_or_start_ref, 0)); + param_ubos.append(graph.get_or_create_int_param_buffer(step_ref, 1)); } } else { transfer_params.index_or_start_ref = @@ -69,18 +70,6 @@ void add_transfer_copy_node( } std::vector push_constants; - push_constants.reserve(graph.is_buffer_storage(out) ? 5 : 3); - - if (graph.is_buffer_storage(out)) { - push_constants.emplace_back(graph.sizes_pc_of(in)); - push_constants.emplace_back(graph.strides_pc_of(out)); - push_constants.emplace_back(graph.strides_pc_of(in)); - push_constants.emplace_back(graph.numel_pc_of(out)); - } else { - push_constants.emplace_back(graph.sizes_pc_of(out)); - push_constants.emplace_back(graph.sizes_pc_of(in)); - } - if (param_is_scalar) { push_constants.emplace_back(&transfer_params, sizeof(transfer_params)); } else { @@ -88,11 +77,6 @@ void add_transfer_copy_node( &transfer_params.dim, sizeof(transfer_params.dim)); } - vkapi::SpecVarList spec_vars = { - graph.hashed_layout_of(out), - graph.hashed_layout_of(in), - }; - // Determine the shader directly std::string kernel_name; if (transfer_type == TransferType::SELECT) { @@ -115,11 +99,11 @@ void add_transfer_copy_node( // Inputs and Outputs {{out, vkapi::kWrite}, {in, vkapi::kRead}}, // Parameter buffers - param_buffers, + param_ubos, // Push Constants push_constants, // Specialization Constants - spec_vars, + {}, // Resize Args resize_args, // Resizing Logic From 75785d8472bf2b3cd7df50fed4f7b35e6da67e57 Mon Sep 17 00:00:00 2001 From: ssjia Date: Thu, 13 Nov 2025 08:51:55 -0800 Subject: [PATCH 5/7] Update base for Update on "[ET-VK][ez] Migrate slice/select shaders to use BufferMetadata/TextureMetadata" Title says it all! 
Motivation: code simplification and allows these ops to handle high dim tensors. Differential Revision: [D86910641](https://our.internmc.facebook.com/intern/diff/D86910641/) [ghstack-poisoned] --- examples/vulkan/export.py | 61 ++++++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 23 deletions(-) diff --git a/examples/vulkan/export.py b/examples/vulkan/export.py index c3043a97bac..b91f382f431 100644 --- a/examples/vulkan/export.py +++ b/examples/vulkan/export.py @@ -10,6 +10,7 @@ import argparse import logging +import os import executorch.backends.vulkan.test.utils as test_utils import torch @@ -224,6 +225,13 @@ def main() -> None: help="Execute lower_module_and_test_output to validate the model. Default is False", ) + parser.add_argument( + "--save_inputs", + action=argparse.BooleanOptionalAction, + default=False, + help="Whether to save the inputs to the model. Default is False", + ) + args = parser.parse_args() if args.model_name in MODEL_NAME_TO_MODEL: @@ -299,25 +307,18 @@ def main() -> None: atol = 2e-2 rtol = 1e-1 - # Test the model if --test flag is provided - if args.test: - test_result = test_utils.run_and_check_output( - reference_model=model, - executorch_program=exec_prog, - sample_inputs=example_inputs, - atol=atol, - rtol=rtol, - ) + # Save regular program + save_pte_program(exec_prog, output_filename, args.output_dir) + logging.info( + f"Model exported and saved as {output_filename}.pte in {args.output_dir}" + ) - if test_result: - logging.info( - "✓ Model test PASSED - outputs match reference within tolerance" - ) - else: - logging.error("✗ Model test FAILED - outputs do not match reference") - raise RuntimeError( - "Model validation failed: ExecuTorch outputs do not match reference model outputs" - ) + if args.save_inputs: + inputs_flattened, _ = tree_flatten(example_inputs) + for i, input_tensor in enumerate(inputs_flattened): + input_filename = os.path.join(args.output_dir, f"input{i}.bin") + input_tensor.numpy().tofile(input_filename) + f"Model input saved as {input_filename} in {args.output_dir}" if args.bundled: # Create bundled program @@ -356,13 +357,27 @@ def main() -> None: logging.info( f"Bundled program exported and saved as {output_filename}.bpte in {args.output_dir}" ) - else: - # Save regular program - save_pte_program(exec_prog, output_filename, args.output_dir) - logging.info( - f"Model exported and saved as {output_filename}.pte in {args.output_dir}" + + # Test the model if --test flag is provided + if args.test: + test_result = test_utils.run_and_check_output( + reference_model=model, + executorch_program=exec_prog, + sample_inputs=example_inputs, + atol=atol, + rtol=rtol, ) + if test_result: + logging.info( + "✓ Model test PASSED - outputs match reference within tolerance" + ) + else: + logging.error("✗ Model test FAILED - outputs do not match reference") + raise RuntimeError( + "Model validation failed: ExecuTorch outputs do not match reference model outputs" + ) + if __name__ == "__main__": with torch.no_grad(): From 45a049a27fd1137dfb5bec9297c24ceabca1e9da Mon Sep 17 00:00:00 2001 From: ssjia Date: Thu, 13 Nov 2025 19:33:34 -0800 Subject: [PATCH 6/7] Update base for Update on "[ET-VK][ez] Migrate slice/select shaders to use BufferMetadata/TextureMetadata" Title says it all! Motivation: code simplification and allows these ops to handle high dim tensors. 
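For reference, the index remapping that the migrated shaders perform can be summarized in plain Python. This is only an illustrative sketch of the logic in `select.glslh` and `slice.glslh`; the function names and the plain-list representation of tensor indices are assumptions made for this example, not part of the shader or graph API.

```python
# Illustrative sketch (not shader code): how an output tensor index maps back
# to an input tensor index for select and slice. Indices are plain Python
# lists here, in the same dimension order the shaders use for TensorIndex.


def select_out_tidx_to_in_tidx(out_tidx, in_sizes, selected_dim, index):
    # Negative indices wrap around the size of the selected dimension.
    if index < 0:
        index += in_sizes[selected_dim]
    # The select index is "inserted" at selected_dim; later dims shift up by 1.
    return out_tidx[:selected_dim] + [index] + out_tidx[selected_dim:]


def slice_out_tidx_to_in_tidx(out_tidx, in_sizes, selected_dim, start, step):
    # Negative starts wrap around the size of the selected dimension.
    if start < 0:
        start += in_sizes[selected_dim]
    in_tidx = list(out_tidx)
    in_tidx[selected_dim] = start + out_tidx[selected_dim] * step
    return in_tidx


# select: [7, 5, 9] with selected_dim=2, index=3 -> [7, 5, 3, 9]
# slice:  [7, 5, 9] with selected_dim=1, start=2, step=3 -> [7, 17, 9]
```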
Differential Revision: [D86910641](https://our.internmc.facebook.com/intern/diff/D86910641/) [ghstack-poisoned] --- .../vulkan/test/vulkan_compute_api_test.cpp | 475 ++---------------- 1 file changed, 32 insertions(+), 443 deletions(-) diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 7e3d957afdb..6be5014f248 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -1911,413 +1911,6 @@ TEST(VulkanComputeGraphTest, test_clone) { } } -TEST(VulkanComputeGraphTest, test_etvk_copy_offset_node) { - GraphConfig config; - ComputeGraph graph(config); - - int64_t n = 6; - int64_t c = 12; - int64_t h = 4; - int64_t w = 8; - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; - - std::vector size = {n, c, h, w}; - - IOValueRef a = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - - IOValueRef out = {}; - out.value = graph.add_tensor(size, vkapi::kFloat, memory_layout); - - // Notice that copy_node operates on in texture's x, y, z dimension. In the - // comment, we provide the cooresponding coordinate in nchw. - - // src_offset is (n=0, c=4, h=1, w=1) - ValueRef src_offset_ref = graph.add_scalar_list({1, 1, 1}); - - // dst_offset is (n=1, c=8, h=2, w=0) in nchw coordinate - // Argument is {x, y, z}. - // x = 0 since w = 0 - // y = 2 since h = 2 - // z = c / 4 + 2 since - // 1. there c/4 planes per batch, n=1 means we are on the first batch; - // 2. +2 because c = 8, with channel packing it means two texels. - ValueRef dst_offset_ref = graph.add_scalar_list({0, 2, c / 4 + 2}); - - // range is (n=1, c=8, h=2, w=4) - // Argument is {x, y, z}. - // x = 4 since w = 4 - // y = 2 since h = 2 - // z = 2 since we are only copying 8 channels, hence 2 texel. n = 1 can be a - // bit misleading here, since it gives the impression that we are copying the - // entire channel. However, remember when we copy, we are trying to - // dst[dst_offset:dst_offset + range] = src[src_offset:src_offset + range], - // range must be non zero. - ValueRef range_ref = graph.add_scalar_list({4, 2, 2}); - - auto copyFn = VK_GET_OP_FN("etvk.copy_offset"); - copyFn( - graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - fill_vtensor(graph, a, 0.0f, /*iota = */ true); - - graph.execute(); - - EXTRACT_TENSOR(out); - EXTRACT_TENSOR(a); - - // We will examine the results in the dst_range - // The value in the cooresponding coordinate should match between the source - // and destination tensor. We loop thru the range, calculate both the src and - // dst index using the offsets, and compare the values in the extracted - // vector. They should match. 
- int n_idx = 0; - // at each nested loop, index range from dst_offset to dst_offset + range - - for (int c_idx = 0; c_idx < 8; c_idx++) { - for (int h_idx = 0; h_idx < 2; h_idx++) { - for (int w_idx = 0; w_idx < 4; w_idx++) { - auto dst_idx = - get_buf_idx(graph, out, {n_idx + 1, c_idx + 8, h_idx + 2, w_idx}); - auto src_idx = - get_buf_idx(graph, a, {n_idx, c_idx + 4, h_idx + 1, w_idx + 1}); - - EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]); - } - } - } -} - -TEST(VulkanComputeGraphTest, DISABLED_test_etvk_copy_channel_offset_node) { - GraphConfig config; - ComputeGraph graph(config); - - int64_t n = 2; - int64_t c = 12; - int64_t h = 4; - int64_t w = 8; - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; - - std::vector size = {n, c, h, w}; - - IOValueRef a = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - - IOValueRef out = {}; - out.value = graph.add_tensor(size, vkapi::kFloat, memory_layout); - - int64_t src_offset = 2; - int64_t dst_offset = 3; - int64_t range = 7; - - ValueRef src_offset_ref = graph.add_scalar(src_offset); - ValueRef dst_offset_ref = graph.add_scalar(dst_offset); - ValueRef range_ref = graph.add_scalar(range); - - auto copyFn = VK_GET_OP_FN("etvk.copy_channel_offset"); - copyFn( - graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - fill_vtensor(graph, a, 0.0f, true); - - graph.execute(); - - EXTRACT_TENSOR(out); - EXTRACT_TENSOR(a); - - for (int n_idx = 0; n_idx < n; n_idx++) { - for (int c_idx = 0; c_idx < range; c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto src_idx = - get_buf_idx(graph, a, {n_idx, c_idx + src_offset, h_idx, w_idx}); - auto dst_idx = get_buf_idx( - graph, out, {n_idx, c_idx + dst_offset, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]); - } - } - } - } -} - -TEST( - VulkanComputeGraphTest, - DISABLED_test_etvk_copy_channel_offset_node_clean_boundary) { - // Tricky part for channel copy is handling the boundary across multiple copy. - // For example, when we concat two [3, 1, 1] nchw-tensors along the channel - // dimension, due to channel packing, elements from different source texel - // will be packed into same destination texel at the boundaries. - GraphConfig config; - ComputeGraph graph(config); - - int64_t n = 2; - int64_t c = 12; - int64_t h = 4; - int64_t w = 8; - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; - - std::vector size = {n, c, h, w}; - - IOValueRef zero = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - IOValueRef a = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - IOValueRef b = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - - IOValueRef out = {}; - out.value = graph.add_tensor(size, vkapi::kFloat, memory_layout); - - auto copyFn = VK_GET_OP_FN("etvk.copy_channel_offset"); - - // Make sure entire out tensor is zeroed. The zero tensor will be filled with - // zero later. 
- copyFn( - graph, - {zero.value, - graph.add_scalar(c), - graph.add_scalar(0), - graph.add_scalar(0), - out.value}); - - int64_t a_src_offset = 0; - int64_t a_dst_offset = 2; - int64_t a_range = 5; - // a will write to channge [2, 7) - copyFn( - graph, - {a.value, - graph.add_scalar(a_range), - graph.add_scalar(a_src_offset), - graph.add_scalar(a_dst_offset), - out.value}); - - // b will write to channel [6, 11) - // Intentional for b to override channel=6 - int64_t b_src_offset = 0; - int64_t b_dst_offset = 6; - int64_t b_range = 5; - - copyFn( - graph, - {b.value, - graph.add_scalar(b_range), - graph.add_scalar(b_src_offset), - graph.add_scalar(b_dst_offset), - out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - float a_value = 1.0f; - float b_value = 2.0f; - float zero_value = 0.0f; - fill_vtensor(graph, a, a_value); - fill_vtensor(graph, b, b_value); - fill_vtensor(graph, zero, zero_value); - - graph.execute(); - - EXTRACT_TENSOR(out); - - for (int n_idx = 0; n_idx < n; n_idx++) { - // c_idx only up to a_range-1 because the expected overwrite by b - for (int c_idx = a_dst_offset; c_idx < a_dst_offset + a_range - 1; - c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == a_value); - } - } - } - } - - for (int n_idx = 0; n_idx < n; n_idx++) { - for (int c_idx = b_dst_offset; c_idx < b_dst_offset + b_range; c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == b_value); - } - } - } - } - - // Also verify that data before a_dst_offset and after b_dst_offset + b_range - // are untouched. - for (int n_idx = 0; n_idx < n; n_idx++) { - for (int c_idx = 0; c_idx < a_dst_offset; c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == zero_value); - } - } - } - } - - for (int n_idx = 0; n_idx < n; n_idx++) { - for (int c_idx = b_dst_offset + b_range; c_idx < c; c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == zero_value); - } - } - } - } -} - -TEST(VulkanComputeGraphTest, test_etvk_copy_offset_int_node) { - GraphConfig config; - ComputeGraph graph(config); - - int64_t n = 6; - int64_t c = 12; - int64_t h = 4; - int64_t w = 8; - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; - - std::vector size = {n, c, h, w}; - - IOValueRef a = graph.add_input_tensor(size, vkapi::kInt, memory_layout); - - IOValueRef out = {}; - out.value = graph.add_tensor(size, vkapi::kInt, memory_layout); - - // Notice that copy_node operates on in texture's x, y, z dimension. In the - // comment, we provide the cooresponding coordinate in nchw. - - // src_offset is (n=0, c=4, h=1, w=1) - ValueRef src_offset_ref = graph.add_scalar_list({1, 1, 1}); - - // dst_offset is (n=1, c=8, h=2, w=0) in nchw coordinate - // Argument is {x, y, z}. - // x = 0 since w = 0 - // y = 2 since h = 2 - // z = c / 4 + 2 since - // 1. there c/4 planes per batch, n=1 means we are on the first batch; - // 2. 
+2 because c = 8, with channel packing it means two texels. - ValueRef dst_offset_ref = graph.add_scalar_list({0, 2, c / 4 + 2}); - - // range is (n=1, c=8, h=2, w=4) - // Argument is {x, y, z}. - // x = 4 since w = 4 - // y = 2 since h = 2 - // z = 2 since we are only copying 8 channels, hence 2 texel. n = 1 can be a - // bit misleading here, since it gives the impression that we are copying the - // entire channel. However, remember when we copy, we are trying to - // dst[dst_offset:dst_offset + range] = src[src_offset:src_offset + range], - // range must be non zero. - ValueRef range_ref = graph.add_scalar_list({4, 2, 2}); - - auto copyFn = VK_GET_OP_FN("etvk.copy_offset"); - copyFn( - graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - fill_vtensor(graph, a, 0, /*iota = */ true); - - graph.execute(); - - EXTRACT_TENSOR(out); - EXTRACT_TENSOR(a); - - // We will examine the results in the dst_range - // The value in the cooresponding coordinate should match between the source - // and destination tensor. We loop thru the range, calculate both the src and - // dst index using the offsets, and compare the values in the extracted - // vector. They should match. - int n_idx = 0; - // at each nested loop, index range from dst_offset to dst_offset + range - - for (int c_idx = 0; c_idx < 8; c_idx++) { - for (int h_idx = 0; h_idx < 2; h_idx++) { - for (int w_idx = 0; w_idx < 4; w_idx++) { - auto dst_idx = - get_buf_idx(graph, out, {n_idx + 1, c_idx + 8, h_idx + 2, w_idx}); - auto src_idx = - get_buf_idx(graph, a, {n_idx, c_idx + 4, h_idx + 1, w_idx + 1}); - - EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]); - } - } - } -} - -TEST(VulkanComputeGraphTest, DISABLED_test_etvk_copy_channel_offset_int_node) { - GraphConfig config; - ComputeGraph graph(config); - - int64_t n = 2; - int64_t c = 12; - int64_t h = 4; - int64_t w = 8; - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; - - std::vector size = {n, c, h, w}; - - IOValueRef a = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - - IOValueRef out = {}; - out.value = graph.add_tensor(size, vkapi::kFloat, memory_layout); - - int64_t src_offset = 2; - int64_t dst_offset = 3; - int64_t range = 7; - - ValueRef src_offset_ref = graph.add_scalar(src_offset); - ValueRef dst_offset_ref = graph.add_scalar(dst_offset); - ValueRef range_ref = graph.add_scalar(range); - - auto copyFn = VK_GET_OP_FN("etvk.copy_channel_offset"); - copyFn( - graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - fill_vtensor(graph, a, 0.0f, true); - - graph.execute(); - - EXTRACT_TENSOR(out); - EXTRACT_TENSOR(a); - - for (int n_idx = 0; n_idx < n; n_idx++) { - for (int c_idx = 0; c_idx < range; c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto src_idx = - get_buf_idx(graph, a, {n_idx, c_idx + src_offset, h_idx, w_idx}); - auto dst_idx = get_buf_idx( - graph, out, {n_idx, c_idx + dst_offset, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]); - } - } - } - } -} - TEST(VulkanComputeGraphTest, test_view_change_packing) { std::vector> layout_pairs = { @@ -2722,42 +2315,38 @@ void test_mm( TEST(VulkanComputeGraphOpsTest, mm_smoke_test) { #define RUN_TESTS(dtype, storage_type, layout, prepack) \ - test_mm( \ - /*B = */ 1, \ - /*M 
= */ 31, \ - /*K = */ 127, \ - /*N = */ 23, \ - dtype, \ - storage_type, \ - layout, \ - prepack); \ - test_mm( \ - /*B = */ 5, \ - /*M = */ 31, \ - /*K = */ 127, \ - /*N = */ 23, \ - dtype, \ - storage_type, \ - layout, \ - prepack); \ - test_mm( \ - /*B = */ 7, \ - /*M = */ 13, \ - /*K = */ 89, \ - /*N = */ 17, \ - dtype, \ - storage_type, \ - layout, \ - prepack); \ - test_mm( \ - /*B = */ 1, \ - /*M = */ 13, \ - /*K = */ 89, \ - /*N = */ 17, \ - dtype, \ - storage_type, \ - layout, \ - prepack); + test_mm(/*B = */ 1, \ + /*M = */ 31, \ + /*K = */ 127, \ + /*N = */ 23, \ + dtype, \ + storage_type, \ + layout, \ + prepack); \ + test_mm(/*B = */ 5, \ + /*M = */ 31, \ + /*K = */ 127, \ + /*N = */ 23, \ + dtype, \ + storage_type, \ + layout, \ + prepack); \ + test_mm(/*B = */ 7, \ + /*M = */ 13, \ + /*K = */ 89, \ + /*N = */ 17, \ + dtype, \ + storage_type, \ + layout, \ + prepack); \ + test_mm(/*B = */ 1, \ + /*M = */ 13, \ + /*K = */ 89, \ + /*N = */ 17, \ + dtype, \ + storage_type, \ + layout, \ + prepack); CALL_TEST_FN_FOR_W_PACKED(RUN_TESTS); CALL_TEST_FN_FOR_C_PACKED(RUN_TESTS); From fa6210c9373e90ea84e29305004b5135547e59a9 Mon Sep 17 00:00:00 2001 From: ssjia Date: Fri, 14 Nov 2025 07:31:41 -0800 Subject: [PATCH 7/7] Update base for Update on "[ET-VK][ez] Migrate slice/select shaders to use BufferMetadata/TextureMetadata" Title says it all! Motivation: code simplification and allows these ops to handle high dim tensors. Differential Revision: [D86910641](https://our.internmc.facebook.com/intern/diff/D86910641/) [ghstack-poisoned] --- backends/vulkan/test/utils.py | 2 +- .../vulkan/test/vulkan_compute_api_test.cpp | 68 ++++++++++--------- examples/vulkan/export.py | 4 +- 3 files changed, 39 insertions(+), 35 deletions(-) diff --git a/backends/vulkan/test/utils.py b/backends/vulkan/test/utils.py index bece35a2e35..6d3fff452f8 100644 --- a/backends/vulkan/test/utils.py +++ b/backends/vulkan/test/utils.py @@ -813,7 +813,7 @@ def find_bad_operators( def make_indent(indent_level): indent_str = "" - for i in range(indent_level): + for _ in range(indent_level): indent_str += " " return indent_str diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 6be5014f248..7dd3bb84588 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -2315,38 +2315,42 @@ void test_mm( TEST(VulkanComputeGraphOpsTest, mm_smoke_test) { #define RUN_TESTS(dtype, storage_type, layout, prepack) \ - test_mm(/*B = */ 1, \ - /*M = */ 31, \ - /*K = */ 127, \ - /*N = */ 23, \ - dtype, \ - storage_type, \ - layout, \ - prepack); \ - test_mm(/*B = */ 5, \ - /*M = */ 31, \ - /*K = */ 127, \ - /*N = */ 23, \ - dtype, \ - storage_type, \ - layout, \ - prepack); \ - test_mm(/*B = */ 7, \ - /*M = */ 13, \ - /*K = */ 89, \ - /*N = */ 17, \ - dtype, \ - storage_type, \ - layout, \ - prepack); \ - test_mm(/*B = */ 1, \ - /*M = */ 13, \ - /*K = */ 89, \ - /*N = */ 17, \ - dtype, \ - storage_type, \ - layout, \ - prepack); + test_mm( \ + /*B = */ 1, \ + /*M = */ 31, \ + /*K = */ 127, \ + /*N = */ 23, \ + dtype, \ + storage_type, \ + layout, \ + prepack); \ + test_mm( \ + /*B = */ 5, \ + /*M = */ 31, \ + /*K = */ 127, \ + /*N = */ 23, \ + dtype, \ + storage_type, \ + layout, \ + prepack); \ + test_mm( \ + /*B = */ 7, \ + /*M = */ 13, \ + /*K = */ 89, \ + /*N = */ 17, \ + dtype, \ + storage_type, \ + layout, \ + prepack); \ + test_mm( \ + /*B = */ 1, \ + /*M = */ 13, \ + /*K = */ 89, \ + /*N = */ 17, \ + dtype, 
\ + storage_type, \ + layout, \ + prepack); CALL_TEST_FN_FOR_W_PACKED(RUN_TESTS); CALL_TEST_FN_FOR_C_PACKED(RUN_TESTS); diff --git a/examples/vulkan/export.py b/examples/vulkan/export.py index b91f382f431..58f2ccf1001 100644 --- a/examples/vulkan/export.py +++ b/examples/vulkan/export.py @@ -128,7 +128,7 @@ def get_sample_inputs(model_name): torch.manual_seed(42) if is_vision_model(model_name): - return (get_vision_model_sample_input(),) + return get_vision_model_sample_input() if model_name == "YOLO_NAS_S": input_batch = get_dog_image_tensor(640) return input_batch @@ -143,7 +143,7 @@ def get_dynamic_shapes(model_name): return None -def main() -> None: +def main() -> None: # noqa: C901 logger = logging.getLogger("") logger.setLevel(logging.INFO)
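Taken together, the export-script and preprocess changes in this stack add two new compile options (`small_texture_limits` and `skip_memory_planning`) plus a way to save model inputs for debugging. As a quick reference, the texture-limit resolution behavior can be sketched on its own. This is a minimal standalone illustration, not the actual `preprocess()` code: the `resolve_texture_limits` helper and the hard-coded default list below are assumptions made for the example (the real implementation reads `utils.DEFAULT_TEXTURE_LIMITS` and the options parsed from the module's `CompileSpec` list).

```python
# Standalone sketch of how texture limits are resolved from compile options.
# The defaults below are illustrative stand-ins for utils.DEFAULT_TEXTURE_LIMITS.
from copy import deepcopy

DEFAULT_TEXTURE_LIMITS = [16384, 16384, 2048]  # mobile-oriented defaults


def resolve_texture_limits(compile_options: dict) -> tuple:
    limits = deepcopy(DEFAULT_TEXTURE_LIMITS)
    # small_texture_limits clamps every extent to 2048, the typical 3D texture
    # limit that most GPUs (including desktop/laptop GPUs) can satisfy.
    if compile_options.get("small_texture_limits", False):
        limits = [2048, 2048, 2048]
    # Explicit per-axis overrides still take precedence over either default.
    return (
        compile_options.get("texture_limits_x", limits[0]),
        compile_options.get("texture_limits_y", limits[1]),
        compile_options.get("texture_limits_z", limits[2]),
    )


if __name__ == "__main__":
    print(resolve_texture_limits({}))                              # (16384, 16384, 2048)
    print(resolve_texture_limits({"small_texture_limits": True}))  # (2048, 2048, 2048)
    print(
        resolve_texture_limits(
            {"small_texture_limits": True, "texture_limits_z": 4096}
        )
    )  # (2048, 2048, 4096)
```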