diff --git a/backends/vulkan/_passes/tag_memory_meta_pass.py b/backends/vulkan/_passes/tag_memory_meta_pass.py index 43796c043c8..00b6c62d5d2 100644 --- a/backends/vulkan/_passes/tag_memory_meta_pass.py +++ b/backends/vulkan/_passes/tag_memory_meta_pass.py @@ -226,9 +226,10 @@ def get_arg_tensor_source_repset( """ arg_node = op_node.args[arg_i] - # For non-tensor arguments, return ANY_STORAGE + # For non-tensor arguments, return ALL_STORAGES_REPSET so that the respset does + # not appear to be empty. if not utils.is_tensor_arg_node(arg_node): - return utils.ANY_STORAGE + return utils.ALL_STORAGES_REPSET # Special case for cat - use the first tensor in the list as representative if isinstance(arg_node, list): diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py index 461278500a6..feba4f6f072 100644 --- a/backends/vulkan/op_registry.py +++ b/backends/vulkan/op_registry.py @@ -740,6 +740,7 @@ def register_cat_op(): [ exir_ops.edge.aten.select_copy.int, exir_ops.edge.aten.slice_copy.Tensor, + exir_ops.edge.aten.split_with_sizes_copy.default, ] ) def register_transfer_ops(): @@ -782,10 +783,7 @@ def register_ported_op(): # Ops ported from PyTorch Vulkan backend. These ops are in a separate registry because they support all packed dimensions @update_features( [ - # Tensor combination exir_ops.edge.aten.repeat.default, - exir_ops.edge.aten.split_with_sizes_copy.default, - exir_ops.edge.aten.split.Tensor, ] ) def register_ported_op_all_packed_dims(): diff --git a/backends/vulkan/runtime/graph/ops/glsl/common.glslh b/backends/vulkan/runtime/graph/ops/glsl/common.glslh index 8340a8b9b2f..9ade64910f2 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/common.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/common.glslh @@ -86,4 +86,15 @@ int quantize_and_pack(const vec4 vals, const float inv_scale, const int zp) { return pack_into_int32(quantized); } +#ifdef DEBUG_MODE + +#define printf debugPrintfEXT + +void printVec4(vec4 texel) { + debugPrintfEXT( + "texel: %f, %f, %f, %f\\n", texel.x, texel.y, texel.z, texel.w); +} + +#endif // DEBUG_MODE + #endif // COMMON_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl deleted file mode 100644 index 39aa9b11a0d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "existing_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} - -layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 in_sizes; - // Operates on (x, y, z) logical extents. - // channel_range is stored in range.w - ivec4 range; - // Analogus to range variable in copy. It defines the # of channel being - // copied. 
- // dst channel offset is stored in dst_offset.w - ivec4 dst_offset; - int src_channel_offset; -}; - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int packed_dim = unhash_packed_dim(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); - -void main() { - // Note: Unlike other shaders, the range is often not equal to the destination - // texture extent. - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(lpos, range.xyz))) { - return; - } - - const ivec3 out_lpos = lpos + dst_offset.xyz; - - const ivec4 out_tidx = lpos_to_tidx(out_lpos, out_sizes, out_axis_map.w, packed_dim); - - // First read the existing values to make sure the boundary values stay. - VEC4_T v = load_texel_lpos(existing_out, out_lpos, out_axis_map); - - ivec4 in_tidx = out_tidx; - for (int i=0; i<4; i++) { - - in_tidx[packed_dim] = out_tidx[packed_dim] - dst_offset.w + i; - - // Handle the partial update for begining of channel in an existing tensor. - // If the source channel index is below zero or exceeds the range, we skip - // updating the element to avoid overwriting existing data. - if ((in_tidx[packed_dim] < 0) || (in_tidx[packed_dim] >= range.w)) { - continue; - } - - // Readjust for the source offset. - in_tidx[packed_dim] += src_channel_offset; - - ivec4 in_posi = tidx_to_posi(in_tidx, in_sizes, in_axis_map, packed_dim); - v[i] = load_texel(t_in, in_posi.xyz)[in_posi.w]; - } - - write_texel_lpos(t_out, out_lpos, v, out_axis_map); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.yaml b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.yaml deleted file mode 100644 index 984d9a09d43..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.yaml +++ /dev/null @@ -1,12 +0,0 @@ -copy_channel_offset: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: copy_channel_offset diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl deleted file mode 100644 index 178814a90c3..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -${define_active_storage_type(STORAGE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} - -layout(push_constant) uniform restrict Block { - ivec3 range; - // xyz is source offset w is channel size - ivec4 src_offset; - // xyz is destination offset w is channel size - ivec4 dst_offset; -}; - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); - -${layout_declare_spec_const(C, "int", "batch_index_function", "0")} - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, range))) { - return; - } - - ivec3 in_pos = pos + src_offset.xyz; - ivec3 out_pos = pos + dst_offset.xyz; - if (src_offset.w > 0) { - if (batch_index_function == 1) { - // batch index is calculated using source channel size - const int channel_index = pos.z % src_offset.w; - const int batch_index = pos.z / src_offset.w; - out_pos.z = channel_index + dst_offset.z + batch_index * dst_offset.w; - } else if (batch_index_function == 2) { - // batch index is calculated using destination channel size - const int channel_index = pos.z % dst_offset.w; - const int batch_index = pos.z / dst_offset.w; - in_pos.z = channel_index + src_offset.z + batch_index * src_offset.w; - } - } - - write_texel_lpos( - t_out, - out_pos, - load_texel_lpos(t_in, in_pos, in_axis_map), - out_axis_map); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.yaml b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.yaml deleted file mode 100644 index 09f5ca36ea4..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.yaml +++ /dev/null @@ -1,17 +0,0 @@ -copy_offset: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - - VALUE: int8 - - VALUE: uint8 - STORAGE: - - VALUE: texture3d - - VALUE: texture2d - shader_variants: - - NAME: copy_offset diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl deleted file mode 100644 index 3100565d08a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "existing_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} - -layout(push_constant) uniform restrict Block { - ivec4 range; - - // xyz is source offset w is channel size - ivec4 src_offset; - - // xyz is destination offset w is channel size - ivec4 dst_offset; -}; - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int packed_dim = unhash_packed_dim(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, range.xyz))) { - return; - } - - // Position in input tensor - ivec3 in_pos = pos + src_offset.xyz; - in_pos[packed_dim] = pos[packed_dim] + (src_offset[packed_dim] >> 2); - - // Read input value mapping to this output texel - VEC4_T in_value = load_texel_lpos(t_in, in_pos, in_axis_map); - - // Starting offset to read from a texel - const int src_lane_offset = src_offset[packed_dim] & 0x3; - const bool has_src_lane_offset = src_lane_offset != 0; - - // If input lane offset is non zero i.e packed texel is composed from multiple sources - if (has_src_lane_offset) { - // Boundary values will come from next input texel in the packed dim. - ivec3 next_in_pos = in_pos; - next_in_pos[packed_dim] = in_pos[packed_dim] + 1; - VEC4_T next_value = load_texel_lpos(t_in, next_in_pos, in_axis_map); - - // Keep input values from the end of current input pixel based on src_lane_offset - // offset 1 means the first lane of current input texel is not a part of the output texel - // offset 2 means first 2 lanes are not and so on - // Copy next texel's values towards the end of input texel, based on lane offset - // offset 1 means the first lane from next texel is part of the input texel - // offset 2 means first 2 lanes from next texel is part of the input texel and so on - if (src_lane_offset == 1) { - in_value = ivec4(in_value.yzw, next_value.x); - } else if (src_lane_offset == 2) { - in_value = ivec4(in_value.zw, next_value.xy); - } else { - in_value = ivec4(in_value.w, next_value.xyz); - } - } - - // Starting offset to write at within a texel - const int out_lane_offset = dst_offset[packed_dim] & 0x3; - const bool has_dst_lane_offset = out_lane_offset != 0; - - ivec3 out_pos = pos + dst_offset.xyz; - out_pos[packed_dim] = pos[packed_dim] + (dst_offset[packed_dim] >> 2); - - VEC4_T out_value; - - // If lane offset is non zero i.e packed texel is composed from multiple sources - if (has_dst_lane_offset) { - // When position in packed dim is > 0 - if (pos[packed_dim] > 0) { - // Boundary values will come from previous input texel in the packed dim. 
- ivec3 prev_in_pos = in_pos; - prev_in_pos[packed_dim] = in_pos[packed_dim] - 1; - VEC4_T prev_value = load_texel_lpos(t_in, prev_in_pos, in_axis_map); - - // Shift values toward the beginning based on out_lane_offset - // offset 1 means the last lane from the previous texel is a part of the output texel - // offset 2 means last 2 lanes and so on - if (out_lane_offset == 1) { - out_value.x = prev_value.w; - } else if (out_lane_offset == 2) { - out_value.xy = prev_value.zw; - } else { - out_value.xyz = prev_value.yzw; - } - } else { - // When position in packed dim is == 0 - // Boundary values will be the previous texel values. - out_value = load_texel_lpos(existing_out, out_pos, out_axis_map); - } - - // Copy input values towards the end of output array, based on lane offset - // offset 1 means the first lane from previous texel is part of the output texel starting at offset - // offset 2 means first 2 lanes from the previous texel is part of the output texel and so on - if (out_lane_offset == 1) { - out_value.yzw = in_value.xyz; - } else if (out_lane_offset == 2) { - out_value.zw = in_value.xy; - } else { - out_value.w = in_value.x; - } - } else { - out_value = in_value; - } - - write_texel_lpos( - t_out, - out_pos, - out_value, - out_axis_map); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.yaml b/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.yaml deleted file mode 100644 index 6e55876cb28..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.yaml +++ /dev/null @@ -1,12 +0,0 @@ -copy_packed_dim_offset: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: copy_packed_dim_offset diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh index 38016547d19..b9ac0e5dace 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh @@ -259,17 +259,28 @@ void clamp_tensor_idx(const BufferMetadata meta, inout TensorIndex tidx) { tidx.data[1] = min(tidx.data[1], meta.sizes[1] - 1); } -// Does not account for axis mapping or batches +// Does not account for axis mapping TensorIndex4D texture_pos_to_tensor4d_idx_simple( const TextureMetadata meta, const ivec3 pos) { TensorIndex4D tidx; tidx.data.xyz = pos; tidx.data.w = 0; tidx.data[meta.packed_dim] *= 4; + + // Compute batch idx accounting for batch concatenation, assuming channels as + // the concatenation dim. 
+ if (meta.sizes.w > 1) { + int channels = meta.sizes.z; + if (meta.packed_dim == 2) { + channels = align_up_4(channels); + } + tidx.data.w = tidx.data.z / channels; + tidx.data.z = tidx.data.z % channels; + } return tidx; } -// Does not account for axis mapping or batches +// Does not account for axis mapping ivec3 tensor4d_idx_to_texel_pos_simple( const TextureMetadata meta, const TensorIndex4D tidx) { ivec3 texel_pos; @@ -278,10 +289,20 @@ ivec3 tensor4d_idx_to_texel_pos_simple( texel_pos = tidx.data.xyz; texel_pos[meta.packed_dim] = div_4(packed_dim_idx); + + // Account for batch concatenation, assuming channels as the concatenation dim + if (meta.sizes.w > 1) { + int channels_ntexels = meta.sizes.z; + if (meta.packed_dim == 2) { + channels_ntexels = div_up_4(channels_ntexels); + } + texel_pos.z += tidx.data.w * channels_ntexels; + } + return texel_pos; } -// Does not account for axis mapping or batches +// Does not account for axis mapping TextureElementIndex tensor4d_idx_to_texture_element_idx_simple( const TextureMetadata meta, const TensorIndex4D tidx) { const int packed_dim_idx = tidx.data[meta.packed_dim]; @@ -289,6 +310,16 @@ TextureElementIndex tensor4d_idx_to_texture_element_idx_simple( tex_idx.pos = tidx.data.xyz; tex_idx.pos[meta.packed_dim] = div_4(packed_dim_idx); tex_idx.comp = mod_4(packed_dim_idx); + + // Account for batch concatenation, assuming channels as the concatenation dim + if (meta.sizes.w > 1) { + int channels_ntexels = meta.sizes.z; + if (meta.packed_dim == 2) { + channels_ntexels = div_up_4(channels_ntexels); + } + tex_idx.pos.z += tidx.data.w * channels_ntexels; + } + return tex_idx; } @@ -316,13 +347,21 @@ void printTensorIndex(const TensorIndex tidx) { ); } -void printTensorIndex4D(const TensorIndex tidx) { +void printTensorIndex4D(const TensorIndex4D tidx) { debugPrintfEXT( "TensorIndex4D: [%u, %u, %u, %u]\\n", - tidx.data[0][0], tidx.data[0][1], tidx.data[0][2], tidx.data[0][3] + tidx.data[0], tidx.data[1], tidx.data[2], tidx.data[3] + ); +} + +void printTextureElementIndex(const TextureElementIndex tex_idx) { + debugPrintfEXT( + "TextureElementIndex: pos=[%d %d %d] comp=%d\\n", + tex_idx.pos.x, tex_idx.pos.y, tex_idx.pos.z, tex_idx.comp ); } + void printBufferMetadata(const BufferMetadata meta) { debugPrintfEXT( "BufferMetadata: ndim=%u numel=%u\\n sizes=[%u %u %u %u %u %u %u %u]\\n dim_order=[%u %u %u %u %u %u %u %u]\\n strides=[%u %u %u %u %u %u %u %u]\\n", diff --git a/backends/vulkan/runtime/graph/ops/glsl/select.glslh b/backends/vulkan/runtime/graph/ops/glsl/select.glslh index 6509015b4b6..5390e2a4bb2 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/select.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/select.glslh @@ -9,70 +9,87 @@ #ifndef SELECT_GLSLH #define SELECT_GLSLH -#ifndef USING_BUFFER +#ifdef USING_BUFFER /* - * Enable the fast path if a texel loaded from the input texture can be used as - * is to store to the output texture. The following conditions must be met: + * Converts output tensor indices to input tensor indices for the select operation + * on buffer storage. * - * 1. The input and output textures have the same packed dimension. - * 2. The selected_dim must not be the packed dimension of the input. - * 3. The packed dimension of the input must "map" to the packed dimension of - * the output. This occurs if selected_dim is greater than the packed dimension - * of the input. + * This is done by "inserting" the select index at the selected_dim in the input + * tensor index. 
+ * + * Parameters assumed to be defined: + * - inp: BufferMetadata + * - selected_dim + * - index */ -bool can_use_fast_path() { - if (out_packed_dim != in_packed_dim) { - return false; +TensorIndex out_tidx_to_in_tidx(const TensorIndex out_tidx) { + TensorIndex in_tidx; + initialize(in_tidx); + + int in_size = int(size_at(inp, selected_dim)); + int adjusted_index = index; + if (index < 0) { + adjusted_index = index + in_size; } - if (selected_dim <= in_packed_dim) { - return false; + + // Copy indices before selected_dim + for (int d = 0; d < selected_dim; d++) { + in_tidx.data[div_4(d)][mod_4(d)] = idx_at(out_tidx, d); } - return true; + + // Insert the selected index + in_tidx.data[div_4(selected_dim)][mod_4(selected_dim)] = adjusted_index; + + // Copy indices after selected_dim (shifted by 1) + for (int d = selected_dim; d < int_ndim(inp) - 1; d++) { + in_tidx.data[div_4(d + 1)][mod_4(d + 1)] = idx_at(out_tidx, d); + } + + return in_tidx; } -#endif // USING_BUFFER +#else // texture storage /* - * Given an output tensor index, return the corresponding input tensor index for - * the select operator. This is done by "inserting" the select index at the - * selected_dim in the input tensor index. + * Converts output tensor indices to input tensor indices for the select operation + * on texture storage. * - * A simple example is (note all tensor index are in WHCN order): - * out_tidx = [7, 5, 9] - * selected_dim = 2 - * index = 3 - * in_tidx = [7, 3, 5, 9] + * This is done by "inserting" the select index at the selected_dim in the input + * tensor index. * - * This function assumes that the following variables are defined in the layout: - * - in_sizes + * Parameters assumed to be defined: + * - inp: TextureMetadata * - selected_dim * - index */ -ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) { - ivec4 in_tidx = ivec4(0); +TensorIndex4D out_tidx_to_in_tidx(const TensorIndex4D out_tidx) { + TensorIndex4D in_tidx; + in_tidx.data = ivec4(0); int adjusted_index = index; if (index < 0) { - adjusted_index = index + in_sizes[selected_dim]; + adjusted_index = index + inp.sizes[selected_dim]; } // Handle different dimensions for selection if (selected_dim == 0) { // Select from width dimension - in_tidx = ivec4(adjusted_index, out_tidx.x, out_tidx.y, out_tidx.z); + in_tidx.data = ivec4(adjusted_index, out_tidx.data.x, out_tidx.data.y, out_tidx.data.z); } else if (selected_dim == 1) { // Select from height dimension - in_tidx = ivec4(out_tidx.x, adjusted_index, out_tidx.y, out_tidx.z); + in_tidx.data = ivec4(out_tidx.data.x, adjusted_index, out_tidx.data.y, out_tidx.data.z); } else if (selected_dim == 2) { // Select from channel dimension - in_tidx = ivec4(out_tidx.x, out_tidx.y, adjusted_index, out_tidx.z); + in_tidx.data = ivec4(out_tidx.data.x, out_tidx.data.y, adjusted_index, out_tidx.data.z); } else if (selected_dim == 3) { // Select from batch dimension - in_tidx = ivec4(out_tidx.x, out_tidx.y, out_tidx.z, adjusted_index); + in_tidx.data = ivec4(out_tidx.data.x, out_tidx.data.y, out_tidx.data.z, adjusted_index); } return in_tidx; } +#endif // USING_BUFFER + #endif // SELECT_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh index 87325754f4d..0a815c85d66 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh @@ -9,49 +9,61 @@ #ifndef SLICE_GLSLH #define SLICE_GLSLH -#ifndef USING_BUFFER +#include "indexing.glslh" -/** - * Enable the fast path if a texel 
loaded from the input texture can be used as - * is to store to the output texture. The following conditions must be met: +#ifdef USING_BUFFER + +/* + * Converts output tensor indices to input tensor indices for the slice operation + * on buffer storage. * - * 1. The input and output textures have the same packed dimension. - * 2. The select_dim must not be the packed dimension of the input. + * Parameters assumed to be defined: + * - inp: BufferMetadata + * - selected_dim + * - start + * - step */ -bool can_use_fast_path() { - if (out_packed_dim != in_packed_dim) { - return false; - } - if (in_packed_dim == selected_dim) { - return false; +TensorIndex out_tidx_to_in_tidx(const TensorIndex out_tidx) { + TensorIndex in_tidx = out_tidx; + + int in_size = int(size_at(inp, selected_dim)); + int adjusted_start = start; + if (start < 0) { + adjusted_start = start + in_size; } - return true; + + uint out_idx = idx_at(out_tidx, selected_dim); + in_tidx.data[div_4(selected_dim)][mod_4(selected_dim)] = + adjusted_start + int(out_idx) * step; + + return in_tidx; } -#endif // USING_BUFFER +#else // texture storage /* - * Converts output tensor indices to input tensor indices for the slice operation. - * This function maps the output indices to the corresponding input indices based on - * the slice parameters (start, step, selected_dim). + * Converts output tensor indices to input tensor indices for the slice operation + * on texture storage. * - * Parameters assumed to be defined in the layout specifier: - * - in_sizes + * Parameters assumed to be defined: + * - inp: TextureMetadata * - selected_dim * - start * - step */ -ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) { - ivec4 in_tidx = out_tidx; +TensorIndex4D out_tidx_to_in_tidx(const TensorIndex4D out_tidx) { + TensorIndex4D in_tidx = out_tidx; int adjusted_start = start; if (start < 0) { - adjusted_start = start + in_sizes[selected_dim]; + adjusted_start = start + inp.sizes[selected_dim]; } - in_tidx[selected_dim] = adjusted_start + out_tidx[selected_dim] * step; + in_tidx.data[selected_dim] = adjusted_start + out_tidx.data[selected_dim] * step; return in_tidx; } +#endif // USING_BUFFER + #endif // SLICE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/split_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/split_buffer.glsl new file mode 100644 index 00000000000..0505c9e7bcd --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/split_buffer.glsl @@ -0,0 +1,50 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define T ${buffer_scalar_type(DTYPE)} + +${define_active_storage_type("buffer")} +${define_required_extensions(DTYPE)} + +#extension GL_EXT_control_flow_attributes : require + +layout(std430) buffer; + +#include "indexing.glslh" + +${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")} +${layout_declare_tensor(B, "r", "t_input", DTYPE, "buffer")} + +${layout_declare_ubo(B, "BufferMetadata", "outp")} +${layout_declare_ubo(B, "BufferMetadata", "inp")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int split_dim = 0; +layout(constant_id = 4) const int split_idx = 0; +layout(constant_id = 5) const int split_offset = 0; + +void main() { + const uint out_bufi = gl_GlobalInvocationID.x; + if (out_of_bounds(out_bufi, outp)) { + return; + } + + TensorIndex out_tidx = linear_idx_to_tensor_idx(outp, out_bufi); + + TensorIndex input_tidx = out_tidx; + input_tidx.data[div_4(split_dim)][mod_4(split_dim)] += split_offset; + + const uint input_bufi = tensor_idx_to_linear_idx(inp, input_tidx); + + t_out[out_bufi] = t_input[input_bufi]; +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/split_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/split_buffer.yaml new file mode 100644 index 00000000000..fd52c0ac721 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/split_buffer.yaml @@ -0,0 +1,16 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +split_buffer: + parameter_names_with_default_values: + DTYPE: float + STORAGE: buffer + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: split_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/split_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/split_texture.glsl new file mode 100644 index 00000000000..92d7ce548e2 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/split_texture.glsl @@ -0,0 +1,66 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_load_type(DTYPE, "texture3d")} +#define T ${texel_load_component_type(DTYPE, "texture3d")} + +${define_active_storage_type("texture3d")} +${define_required_extensions(DTYPE)} + +#extension GL_EXT_control_flow_attributes : require + +layout(std430) buffer; + +#include "common.glslh" +#include "indexing.glslh" + +${layout_declare_tensor(B, "w", "t_output", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "t_input", DTYPE, "texture3d")} + +${layout_declare_ubo(B, "TextureMetadata", "outp")} +${layout_declare_ubo(B, "TextureMetadata", "inp")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int split_dim = 0; +layout(constant_id = 4) const int split_idx = 0; +layout(constant_id = 5) const int split_offset = 0; + +void main() { + const ivec3 out_pos = ivec3(gl_GlobalInvocationID); + + if (out_of_bounds(out_pos, outp)) { + return; + } + + TensorIndex4D out_tidx = texture_pos_to_tensor4d_idx_simple(outp, out_pos); + + VEC4_T out_texel = VEC4_T(0); + + int limit = min( + 4, outp.sizes[outp.packed_dim] - out_tidx.data[outp.packed_dim]); + + TensorIndex4D input_tidx = out_tidx; + input_tidx.data[split_dim] += split_offset; + + for (int comp = 0; comp < limit; comp++) { + TextureElementIndex input_elem_pos = tensor4d_idx_to_texture_element_idx_simple( + inp, input_tidx); + + VEC4_T input_texel = texelFetch(t_input, input_elem_pos.pos, 0); + out_texel[comp] = input_texel[input_elem_pos.comp]; + + input_tidx.data[outp.packed_dim]++; + } + + imageStore(t_output, out_pos, out_texel); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/split_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/split_texture.yaml new file mode 100644 index 00000000000..89446df831b --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/split_texture.yaml @@ -0,0 +1,15 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +split_texture: + parameter_names_with_default_values: + DTYPE: float + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: split_texture3d diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl index 7605c59c72f..73b753ccc0b 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl @@ -11,18 +11,23 @@ #define PRECISION ${PRECISION} #define UBO_PARAMS ${UBO_PARAMS} -#define VEC4_T ${texel_type(DTYPE)} #define T ${buffer_scalar_type(DTYPE)} ${define_active_storage_type("buffer")} ${define_required_extensions(DTYPE)} +#extension GL_EXT_control_flow_attributes : require + layout(std430) buffer; -#include "indexing_utils.h" +#include "indexing.glslh" + ${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")} ${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer")} +${layout_declare_ubo(B, "BufferMetadata", "outp")} +${layout_declare_ubo(B, "BufferMetadata", "inp")} + $if UBO_PARAMS: $if OP_NAME == "slice": ${layout_declare_ubo(B, "int", "start")} @@ -32,10 +37,6 @@ $if UBO_PARAMS: ${layout_declare_ubo(B, "int", "index")} layout(push_constant) uniform restrict Block { - ivec4 in_sizes; - ivec4 out_strides; - ivec4 in_strides; - int out_numel; int selected_dim; $if not UBO_PARAMS: $if OP_NAME == "slice": @@ -46,24 +47,19 @@ layout(push_constant) uniform restrict Block { int index; }; -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} - -const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); - layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; #include "${OP_NAME}.glslh" void main() { - const int out_bufi = ivec3(gl_GlobalInvocationID).x; - if (out_bufi >= out_numel) { + const uint out_bufi = gl_GlobalInvocationID.x; + if (out_of_bounds(out_bufi, outp)) { return; } - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); - ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); + TensorIndex out_tidx = linear_idx_to_tensor_idx(outp, out_bufi); + TensorIndex in_tidx = out_tidx_to_in_tidx(out_tidx); - const int in_bufi = tidx_to_bufi(in_tidx, in_strides); + const uint in_bufi = tensor_idx_to_linear_idx(inp, in_tidx); t_out[out_bufi] = t_in[in_bufi]; } diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl index 0f34713cb43..d2c9c025242 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl @@ -11,19 +11,25 @@ #define PRECISION ${PRECISION} #define UBO_PARAMS ${UBO_PARAMS} -#define VEC4_T ${texel_type(DTYPE)} -#define T ${buffer_scalar_type(DTYPE)} +#define VEC4_T ${texel_load_type(DTYPE, "texture3d")} +#define T ${texel_load_component_type(DTYPE, "texture3d")} ${define_active_storage_type("texture3d")} ${define_required_extensions(DTYPE)} +#extension GL_EXT_control_flow_attributes : require + layout(std430) buffer; -#include "indexing_utils.h" +#include "common.glslh" +#include "indexing.glslh" ${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")} ${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")} +${layout_declare_ubo(B, "TextureMetadata", "outp")} +${layout_declare_ubo(B, "TextureMetadata", "inp")} + $if UBO_PARAMS: $if OP_NAME == "slice": ${layout_declare_ubo(B, "int", 
"start")} @@ -33,8 +39,6 @@ $if UBO_PARAMS: ${layout_declare_ubo(B, "int", "index")} layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 in_sizes; int selected_dim; $if not UBO_PARAMS: $if OP_NAME == "slice": @@ -45,48 +49,33 @@ layout(push_constant) uniform restrict Block { int index; }; -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int out_packed_dim = unhash_packed_dim(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); -const lowp int in_packed_dim = unhash_packed_dim(in_layout); - layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; #include "${OP_NAME}.glslh" void main() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - ivec4 out_tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, out_packed_dim); + const ivec3 out_pos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(out_tidx, out_sizes))) { + if (out_of_bounds(out_pos, outp)) { return; } - if (can_use_fast_path()) { - ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); - ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim); - VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos)); + TensorIndex4D out_tidx = texture_pos_to_tensor4d_idx_simple(outp, out_pos); + VEC4_T out_texel = VEC4_T(0); - write_texel_lpos(t_out, lpos, in_texel, out_axis_map); - } - else { - VEC4_T out_texel = VEC4_T(0); - for (int texel_i = 0; texel_i < 4; ++texel_i) { - ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); - ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim); - int element_idx = in_tidx[in_packed_dim] % 4; - - VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos)); - T selected_value = T(in_texel[element_idx]); + int limit = min( + 4, outp.sizes[outp.packed_dim] - out_tidx.data[outp.packed_dim]); + for (int comp = 0; comp < limit; comp++) { + TensorIndex4D in_tidx = out_tidx_to_in_tidx(out_tidx); - out_texel[texel_i] = selected_value; + TextureElementIndex in_elem_pos = tensor4d_idx_to_texture_element_idx_simple( + inp, in_tidx); - out_tidx[out_packed_dim]++; - } + VEC4_T in_texel = texelFetch(t_in, in_elem_pos.pos, 0); + out_texel[comp] = in_texel[in_elem_pos.comp]; - write_texel_lpos(t_out, lpos, out_texel, out_axis_map); + out_tidx.data[outp.packed_dim]++; } + + imageStore(t_out, out_pos, out_texel); } diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp deleted file mode 100644 index bd648dbae2d..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp +++ /dev/null @@ -1,317 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include -#include -#include -#include - -namespace vkcompute { - -using utils::ivec3; -using utils::ivec4; -using utils::uvec3; - -void add_copy_offset_node( - ComputeGraph& graph, - const ValueRef in, - const ivec3& range, - const ivec4& src_offset, - const ivec4& dst_offset, - const ValueRef out, - bool calc_out_pos_using_src_chnl, - bool calc_in_pos_using_dst_chnl) { - std::string kernel_name = "copy_offset"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - - auto shader = VK_KERNEL_FROM_STR(kernel_name); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - { - {out, vkapi::kWrite}, - {in, vkapi::kRead}, - }, - // Parameter buffers - {}, - // Push Constants - { - PushConstantDataInfo(&range, sizeof(range), sizeof(ivec4)), - PushConstantDataInfo(&src_offset, sizeof(src_offset), sizeof(ivec4)), - PushConstantDataInfo(&dst_offset, sizeof(dst_offset), sizeof(ivec4)), - }, - // Specialization Constants - {graph.hashed_layout_of(out), - graph.hashed_layout_of(in), - (calc_out_pos_using_src_chnl ? 1 - : calc_in_pos_using_dst_chnl ? 2 - : 0)}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -void add_copy_packed_dim_offset_node( - ComputeGraph& graph, - const ValueRef in, - const ivec3& range, - const ivec4& src_offset, - const ivec4& dst_offset, - const ValueRef out) { - // Check the packed dimension is same for both tensors, also check if the - // packed dimension is Width or Height. Since the function does not support - // channel packing. - VK_CHECK_COND( - graph.packed_dim_of(in) == graph.packed_dim_of(out) && - (graph.packed_dim_of(in) == WHCN::kWidthDim || - graph.packed_dim_of(in) == WHCN::kHeightDim)); - - std::string kernel_name = "copy_packed_dim_offset"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - const std::vector in_sizes = graph.sizes_of(in); - const std::vector out_sizes = graph.sizes_of(out); - - // A copy of range with the last element set to batch size of the input tensor - ivec4 final_range = { - range[0], range[1], range[2], dim_at(in_sizes, kBatch4D)}; - ivec3 global_wg_size = graph.logical_limits_of(out); - - const auto packed_dim = graph.packed_dim_of(in); - // The starting offset in a texel where this tensor will start copying from - const auto src_lane_offset = src_offset[packed_dim] & 0x3; - // The starting offset in a texel where this tensor will start copying to - const auto dst_lane_offset = dst_offset[packed_dim] & 0x3; - - // The total packed texels this tensor will be copied from - // The first texel of tensor data in packed dimension will be copied from - // remaining lanes from current source Hence (4 - src_lane_offset) is added - // to tensor size in packed dimension - const auto src_packed_size = utils::div_up_4( - (4 - src_lane_offset) + utils::val_at(-packed_dim, out_sizes)); - - // The total packed texels this tensor will be copied to - // The first texel of tensor data in packed dimension will be copied to - // remaining lanes from previous write Hence (4 - dst_lane_offset) is added - // to tensor size in packed dimension - const auto dst_packed_size = utils::div_up_4( - (4 - dst_lane_offset) + utils::val_at(-packed_dim, in_sizes)); - - // If the starting src offset is not 0, and the total packed 
texels is - // greater than the source texel range - const bool has_additional_src_work = - src_lane_offset != 0 && src_packed_size > final_range[packed_dim]; - // If the starting dst offset is not 0, and the total packed texels is - // greater than the source texel range - const bool has_additional_dst_work = - dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim]; - - if (has_additional_src_work || has_additional_dst_work) { - global_wg_size[packed_dim]++; // Increase the global work group size in - // packed dimension - final_range[packed_dim]++; // Increase the range in packed dimension - } - - auto shader = VK_KERNEL_FROM_STR(kernel_name); - - graph.execute_nodes().emplace_back(new DispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - global_wg_size, - graph.create_local_wg_size(global_wg_size), - // Inputs and Outputs - { - {out, vkapi::kWrite}, - {out, vkapi::kRead}, - {in, vkapi::kRead}, - }, - // Parameter buffers - {}, - // Push Constants - { - PushConstantDataInfo( - &final_range, sizeof(final_range), sizeof(ivec4)), - PushConstantDataInfo(&src_offset, sizeof(src_offset), sizeof(ivec4)), - PushConstantDataInfo(&dst_offset, sizeof(dst_offset), sizeof(ivec4)), - }, - // Specialization Constants - {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -void add_copy_channel_offset_node( - ComputeGraph& graph, - const ValueRef in, - int32_t channel_range, - int32_t src_channel_offset, - int32_t dst_channel_offset, - const ValueRef out) { - // Likely need to prepad these numbers. - const std::vector in_sizes = graph.sizes_of(in); - const std::vector out_sizes = graph.sizes_of(out); - - VK_CHECK_COND(graph.packed_dim_of(in) == WHCN::kChannelsDim); - VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kChannelsDim); - - // NOTE: This function should be able to support 1d and 2d tensors when - // range=1, src_offset=dst_offset=1. - VK_CHECK_COND(graph.dim_of(in) >= 3, "Src dim should be at least 3"); - VK_CHECK_COND(graph.dim_of(out) >= 3, "Dst dim should be at least 3"); - - VK_CHECK_COND( - dim_at(in_sizes) >= src_channel_offset + channel_range, - "Src channel (", - src_channel_offset, - ") and range (", - channel_range, - ") should be less than or equal to input tensor's channel size (", - dim_at(in_sizes), - ")"); - - VK_CHECK_COND( - dim_at(out_sizes) >= dst_channel_offset + channel_range, - "Dst channel (", - dst_channel_offset, - ") and range (", - channel_range, - ") should be less than or equal to input tensor's channel size (", - dim_at(out_sizes), - ")"); - - VK_CHECK_COND(channel_range >= 0, "Channel range must be non-negative"); - VK_CHECK_COND( - src_channel_offset >= 0, "Src channel offset must be non-negative"); - VK_CHECK_COND( - dst_channel_offset >= 0, "Dst channel offset must be non-negative"); - - std::string kernel_name = "copy_channel_offset"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - int32_t out_channels = dim_at(out_sizes); - - // Copy one batch at a time. - for (int batch_idx = 0; batch_idx < dim_at(in_sizes); batch_idx++) { - // Mapping the tensor NCHW coordinates into texture XYZ coordinates - int32_t dst_first_z = dst_channel_offset / 4; - int32_t dst_last_z = (dst_channel_offset + channel_range - 1) / 4; - - // We copy the entire width and height dimension. For the channel dimension, - // we use the z-dimension of the global_size to specify the texture range. 
- // The shader combines the global invocation id and the dst_offset to get - // the actual coordinate. - - const ivec3 dst_offset{ - 0, 0, dst_first_z + batch_idx * utils::div_up_4(out_channels)}; - - const uvec3 global_size{ - utils::safe_downcast(dim_at(in_sizes)), - utils::safe_downcast(dim_at(in_sizes)), - utils::safe_downcast(dst_last_z - dst_first_z + 1)}; - const uvec3 local_size = graph.create_local_wg_size(global_size); - - const utils::ivec4 range_params = { - static_cast(global_size[0]), - static_cast(global_size[1]), - static_cast(global_size[2]), - channel_range}; - - const ivec4 offset_params = { - dst_offset[0], dst_offset[1], dst_offset[2], dst_channel_offset}; - - auto shader = VK_KERNEL_FROM_STR(kernel_name); - - graph.execute_nodes().emplace_back(new DispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - global_size, - local_size, - // Inputs and Outputs - { - {out, vkapi::kWrite}, - {out, vkapi::kRead}, - {in, vkapi::kRead}, - }, - // Parameter buffers - {}, - // Push Constants - {graph.sizes_pc_of(out), - graph.sizes_pc_of(in), - PushConstantDataInfo(&range_params, sizeof(range_params)), - PushConstantDataInfo(&offset_params, sizeof(offset_params)), - PushConstantDataInfo(&src_channel_offset, sizeof(src_channel_offset))}, - // Specialization Constants - {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}, - // Resize Args - {}, - // Resizing Logic - nullptr)); - } -} - -void add_copy_offset_node( - ComputeGraph& graph, - ValueRef in, - ValueRef range_ref, - ValueRef src_offset_ref, - ValueRef dst_offset_ref, - ValueRef out) { - ivec3 range = utils::make_ivec3(*graph.get_int_list(range_ref)); - ivec3 src = utils::make_ivec3(*graph.get_int_list(src_offset_ref)); - ivec3 dst = utils::make_ivec3(*graph.get_int_list(dst_offset_ref)); - - ivec4 src_offset = {src[0], src[1], src[2], 0}; - ivec4 dst_offset = {dst[0], dst[1], dst[2], 0}; - - add_copy_offset_node( - graph, in, range, src_offset, dst_offset, out, false, false); -} - -void copy_offset(ComputeGraph& graph, const std::vector& args) { - add_copy_offset_node(graph, args[0], args[1], args[2], args[3], args[4]); -} - -void copy_channel_offset( - ComputeGraph& graph, - const std::vector& args) { - ValueRef in = args[0]; - ValueRef channel_range_ref = args[1]; - ValueRef src_channel_offset_ref = args[2]; - ValueRef dst_channel_offset_ref = args[3]; - ValueRef out = args[4]; - - auto channel_range = graph.extract_scalar(channel_range_ref); - auto src_channel_offset = - graph.extract_scalar(src_channel_offset_ref); - auto dst_channel_offset = - graph.extract_scalar(dst_channel_offset_ref); - - add_copy_channel_offset_node( - graph, in, channel_range, src_channel_offset, dst_channel_offset, out); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(etvk.copy_offset, copy_offset); - VK_REGISTER_OP(etvk.copy_channel_offset, copy_channel_offset); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.h b/backends/vulkan/runtime/graph/ops/impl/Copy.h deleted file mode 100644 index 41956d482d9..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include - -namespace vkcompute { - -// add_copy_offset_node resumes the vkCmdCopyImage command. 
It copies the -// texture extents specified by the range, src_offset, and dst_offset (all are -// in texture coordinate (x, y, z) from the input image to the output image. -// src_offset.w and dst_offset.w may contain channel size information. -// -// It is possible to have input and output to point to the same image -// object. But when the source range and destination range overlap, the behavior -// is undefined. -// -// boolean flags calc_out_pos_using_src_chnl and calc_in_pos_using_dst_chnl -// can be used to specify an indexing function in the shader -// If calc_out_pos_using_src_chnl is set to true channel and batch index will be -// calculated based on source channel size and will be used to determine -// destination texel position. -// -// If calc_in_pos_using_dst_chnl is set to truechannel and batch index will be -// calculated based on destination channel size and will be used to determine -// source texel position. -// -// If both are true calc_out_pos_using_src_chnl is picked. If both are false no -// index calculation happens. -void add_copy_offset_node( - ComputeGraph& graph, - const ValueRef in, - const utils::ivec3& range, - const utils::ivec4& src_offset, - const utils::ivec4& dst_offset, - const ValueRef out, - bool calc_out_pos_using_src_chnl, - bool calc_in_pos_using_dst_chnl); - -// add_copy_packed_dim_offset_node behaves similar to add_copy_node, except that -// its used when copying packed dimension, if tensor is width or height packed. -// src_offset.w and dst_offset.w may contain channel size information. -// -// It copies the texture extents specified by the range, src_offset, and -// dst_offset (all are in texture coordinate (x, y, z) from the input image to -// the output image. -void add_copy_packed_dim_offset_node( - ComputeGraph& graph, - const ValueRef in, - const utils::ivec3& range, - const utils::ivec4& src_offset, - const utils::ivec4& dst_offset, - const ValueRef out); - -// add_copy_channel_offset_node behaves similar to add_copy_node, except that it -// works on the channel dimensions of the tensor (up to 4 dimensions in NCHW). -// The range and offset arguments are in the tensor coordinate. It assumes the -// underlying texture is channel-packed. -// -// This function is specialized implementation for copying -// channel packed values. The complication comes from when reading / writing the -// channel dimension on indices that are not aligned to packing, we will need -// be careful about the boundaries. 
-// -// It achieves the following: -// out[:, dst_channel_offset:dst_channel_offset + channel_range, :, :] = -// in [:, src_channel_offset:src_channel_offset + channel_range, :, :] -void add_copy_channel_offset_node( - ComputeGraph& graph, - const ValueRef in, - int32_t channel_range, - int32_t src_channel_offset, - int32_t dst_channel_offset, - const ValueRef out); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp index 72c1637a2c9..2b42c0bd150 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp @@ -14,8 +14,6 @@ #include #include -#include - namespace vkcompute { namespace { diff --git a/backends/vulkan/runtime/graph/ops/impl/Split.cpp b/backends/vulkan/runtime/graph/ops/impl/Split.cpp index f87af08ee69..4e62ae8806d 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Split.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Split.cpp @@ -8,134 +8,131 @@ #include -#include +#include +#include #include -#include #include + #include -namespace vkcompute { +#include -void add_split_with_sizes_default_node( - ComputeGraph& graph, - ValueRef in, - const std::vector& split_sizes, - int64_t dim, - ValueRef out_list_ref) { - const ValueListPtr out_list = graph.get_value_list(out_list_ref); +namespace vkcompute { - const int64_t input_ndim = graph.dim_of(in); +using utils::GPUMemoryLayout; +using utils::StorageType; + +void resize_split_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& resize_args) { + (void)resize_args; + const ValueRef input = args.at(0).refs.at(0); + const ValueRef split_sizes_ref = args.at(1).refs.at(0); + const ValueRef dim_ref = args.at(2).refs.at(0); + const ValueRef out_list_ref = args.at(3).refs.at(0); + + const ValueListPtr out_list = graph->get_value_list(out_list_ref); + const std::vector split_sizes = + *(graph->get_int_list(split_sizes_ref)); + const int64_t dim = graph->extract_scalar(dim_ref); + + const int64_t input_ndim = graph->dim_of(input); const DimIndex dim_index = dim < 0 ? static_cast(dim) : static_cast(dim - input_ndim); - VK_CHECK_COND(out_list->size() == split_sizes.size()); + std::vector input_sizes = graph->sizes_of(input); for (int split_idx = 0; split_idx < split_sizes.size(); split_idx++) { const int64_t split_size = split_sizes.at(split_idx); const ValueRef out_ref = out_list->at(split_idx); - VK_CHECK_COND(dim_at(graph.sizes_of(out_ref), dim_index) == split_size); - } - - const auto packed_dim = graph.packed_dim_of(in); - const auto packed_dim_index = static_cast(kWidth4D - packed_dim); + std::vector out_sizes = input_sizes; + out_sizes.at(dim_index) = split_size; - // Index of dimension to be concatenated in (w, h, c * b) coordinate system - const auto dim_xyz_index = std::min(2, -dim_index - 1); - - utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false); - utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false); - - const bool is_splitting_channel = (dim_index == kChannel4D); - - // if splitting channels - if (is_splitting_channel) { - // set source offset w as channel size of the input tensor - src_offset[3] = dim_at(graph.sizes_of(in), kChannel4D); + graph->virtual_resize(out_ref, out_sizes); } +} - for (ValueRef out_ref : *out_list) { - // Doesn't need to use split_size since we have already verified that the - // output tensor's size matches with the split_size. 
- const auto out_channel_size = dim_at(graph.sizes_of(out_ref), kChannel4D); - const utils::ivec3 range = graph.logical_limits_of(out_ref); - - if (dim_index == packed_dim_index) { - // if splitting channels, use add_copy_channel_offset_node function as - // add_copy_packed_dim_offset_node does not support channel packing - if (is_splitting_channel) { - add_copy_channel_offset_node( - graph, in, out_channel_size, src_offset[2], dst_offset[2], out_ref); - src_offset[dim_xyz_index] += out_channel_size; - } else { - // dst_offset[3] is not used now but will be used in the future when - // add_copy_packed_dim_offset_node will support channel packing - // - // set destination offset w as channel size of the output tensor if - // splitting channel - dst_offset[3] = is_splitting_channel ? out_channel_size : 0; - add_copy_packed_dim_offset_node( - graph, in, range, src_offset, dst_offset, out_ref); - src_offset[dim_xyz_index] += - dim_at(graph.sizes_of(out_ref), packed_dim_index); - } - } else { - // set destination offset w as channel size of the output tensor if - // splitting channels - dst_offset[3] = is_splitting_channel ? out_channel_size : 0; - add_copy_offset_node( - graph, in, range, src_offset, dst_offset, out_ref, false, true); - src_offset[dim_xyz_index] += - is_splitting_channel ? out_channel_size : range[dim_xyz_index]; - } +void add_split_node( + ComputeGraph& graph, + const ValueRef input, + const std::vector& split_sizes, + const int64_t dim, + const ValueRef out, + const int split_idx) { + std::string kernel_name = "split"; + kernel_name.reserve(kShaderNameReserve); + add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + + vkapi::ParamsBindList param_ubos = { + graph.meta_ubo(out), graph.meta_ubo(input)}; + + int64_t dim_whcn = nchw_dim_to_whcn_dim(dim, graph.dim_of(input)); + + // Calculate the offset for this split by summing previous split sizes + int64_t split_offset = 0; + for (int i = 0; i < split_idx; i++) { + split_offset += split_sizes[i]; } + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + default_pick_global_wg_size, + default_pick_local_wg_size, + // Inputs and Outputs + {{out, vkapi::kWrite}, {input, vkapi::kRead}}, + // Shader params buffers + param_ubos, + // Push Constants + {}, + // Specialization Constants + {utils::safe_downcast(dim_whcn), + static_cast(split_idx), + static_cast(split_offset)}, + // Resize Args + {}, + // Resizing Logic + nullptr)); } -void add_split_with_sizes_default_node( +void add_split_with_sizes_node( ComputeGraph& graph, - ValueRef in, - ValueRef split_sizes_ref, - ValueRef dim_ref, - ValueRef out) { - int64_t dim = graph.extract_scalar(dim_ref); - std::vector split_sizes = *(graph.get_int_list(split_sizes_ref)); + const ValueRef input, + const std::vector& split_sizes, + const int64_t dim, + const ValueRef out_list_ref) { + const ValueListPtr out_list = graph.get_value_list(out_list_ref); + + VK_CHECK_COND(out_list->size() == split_sizes.size()); - add_split_with_sizes_default_node(graph, in, split_sizes, dim, out); + // Dispatch a shader for each output tensor + for (int split_idx = 0; split_idx < split_sizes.size(); split_idx++) { + const ValueRef out_ref = out_list->at(split_idx); + add_split_node(graph, input, split_sizes, dim, out_ref, split_idx); + } } void split_with_sizes_copy_default( ComputeGraph& graph, const std::vector& args) { - add_split_with_sizes_default_node(graph, args[0], args[1], args[2], 
args[3]); -} - -void add_split_tensor_node( - ComputeGraph& graph, - ValueRef in, - ValueRef split_size_ref, - ValueRef dim_ref, - ValueRef out) { - const int64_t split_size = graph.extract_scalar(split_size_ref); - const int64_t dim = graph.extract_scalar(dim_ref); - - const int64_t input_ndim = graph.dim_of(in); - const DimIndex dim_index = dim < 0 ? static_cast(dim) - : static_cast(dim - input_ndim); - const int64_t size = dim_at(graph.sizes_of(in), dim_index); - const std::vector split_sizes(size / split_size, split_size); + ValueRef input = args[0]; + ValueRef split_sizes_ref = args[1]; + ValueRef dim_ref = args[2]; + ValueRef out_list_ref = args[3]; - add_split_with_sizes_default_node(graph, in, split_sizes, dim, out); -} + int64_t dim = graph.extract_scalar(dim_ref); + std::vector split_sizes = *(graph.get_int_list(split_sizes_ref)); -void split_tensor(ComputeGraph& graph, const std::vector& args) { - add_split_tensor_node(graph, args[0], args[1], args[2], args[3]); + add_split_with_sizes_node(graph, input, split_sizes, dim, out_list_ref); } REGISTER_OPERATORS { VK_REGISTER_OP( aten.split_with_sizes_copy.default, split_with_sizes_copy_default); - VK_REGISTER_OP(aten.split.Tensor, split_tensor); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp b/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp index 60127ecf9bd..1823271824a 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp @@ -50,15 +50,16 @@ void add_transfer_copy_node( (transfer_type == TransferType::SELECT || graph.is_scalar_or_none(step_ref)); - vkapi::ParamsBindList param_buffers; + vkapi::ParamsBindList param_ubos = {graph.meta_ubo(out), graph.meta_ubo(in)}; + if (!param_is_scalar) { if (transfer_type == TransferType::SELECT) { - param_buffers = { - graph.get_or_create_int_param_buffer(index_or_start_ref, 0)}; + param_ubos.append( + graph.get_or_create_int_param_buffer(index_or_start_ref, 0)); } else { // TransferType::SLICE - param_buffers = { - graph.get_or_create_int_param_buffer(index_or_start_ref, 0), - graph.get_or_create_int_param_buffer(step_ref, 1)}; + param_ubos.append( + graph.get_or_create_int_param_buffer(index_or_start_ref, 0)); + param_ubos.append(graph.get_or_create_int_param_buffer(step_ref, 1)); } } else { transfer_params.index_or_start_ref = @@ -69,18 +70,6 @@ void add_transfer_copy_node( } std::vector push_constants; - push_constants.reserve(graph.is_buffer_storage(out) ? 
5 : 3); - - if (graph.is_buffer_storage(out)) { - push_constants.emplace_back(graph.sizes_pc_of(in)); - push_constants.emplace_back(graph.strides_pc_of(out)); - push_constants.emplace_back(graph.strides_pc_of(in)); - push_constants.emplace_back(graph.numel_pc_of(out)); - } else { - push_constants.emplace_back(graph.sizes_pc_of(out)); - push_constants.emplace_back(graph.sizes_pc_of(in)); - } - if (param_is_scalar) { push_constants.emplace_back(&transfer_params, sizeof(transfer_params)); } else { @@ -88,11 +77,6 @@ void add_transfer_copy_node( &transfer_params.dim, sizeof(transfer_params.dim)); } - vkapi::SpecVarList spec_vars = { - graph.hashed_layout_of(out), - graph.hashed_layout_of(in), - }; - // Determine the shader directly std::string kernel_name; if (transfer_type == TransferType::SELECT) { @@ -115,11 +99,11 @@ void add_transfer_copy_node( // Inputs and Outputs {{out, vkapi::kWrite}, {in, vkapi::kRead}}, // Parameter buffers - param_buffers, + param_ubos, // Push Constants push_constants, // Specialization Constants - spec_vars, + {}, // Resize Args resize_args, // Resizing Logic diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h index b62bf661995..05234c7790f 100644 --- a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h +++ b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h @@ -69,7 +69,7 @@ template < std::is_integral::value && std::is_signed::value, int>::type = 0> T nchw_dim_to_whcn_dim(const T& nchw_dim, const int64_t ndim) { - return ndim - 1 - nchw_dim; + return ndim - 1 - normalize(nchw_dim, ndim); } } // namespace vkcompute diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index f59c3e30aeb..b21a8458a89 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -6,7 +6,6 @@ import itertools - from collections import namedtuple from typing import Callable @@ -1519,64 +1518,11 @@ def get_split_with_sizes_inputs(): test_suite.layouts = [ "utils::kWidthPacked", - "utils::kHeightPacked", - "utils::kChannelsPacked", - ] - test_suite.data_gen = "make_seq_tensor" - test_suite.dtypes = ["at::kFloat"] - return test_suite - - -@register_test_suite("aten.split.Tensor") -def get_split_tensor_inputs(): - test_suite = VkTestSuite( - [ - # Split on Width - ((S1, 7, 10, 12), 12, 3), - ((S1, 7, 10, 12), 3, 3), - ((S1, 7, 10, 12), 1, 3), - ((7, 10, 12), 12, 2), - ((7, 10, 12), 3, 2), - ((7, 10, 12), 1, 2), - ((10, 12), 12, 1), - ((10, 12), 3, 1), - ((10, 12), 1, 1), - ((12,), 12, 0), - ((12,), 3, 0), - ((12,), 1, 0), - # Split on Height - ((S1, 7, 12, 8), 12, 2), - ((S1, 7, 12, 8), 3, 2), - ((S1, 7, 12, 8), 1, 2), - ((7, 12, 8), 12, 1), - ((7, 12, 8), 3, 1), - ((7, 12, 8), 1, 1), - ((12, 8), 12, 0), - ((12, 8), 3, 0), - ((12, 8), 1, 0), - # Split on Batch - ((12, 7, 10, 10), 12, 0), - ((12, 7, 10, 10), 3, 0), - ((12, 7, 10, 10), 1, 0), - # Split on Channel - ((7, 15, 10, 10), 15, 1), - ((7, 15, 10, 10), 5, 1), - ((7, 15, 10, 10), 3, 1), - ((7, 15, 10, 10), 1, 1), - ((15, 10, 10), 15, 0), - ((15, 10, 10), 5, 0), - ((15, 10, 10), 3, 0), - ((15, 10, 10), 1, 0), - ] - ) - - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kHeightPacked", "utils::kChannelsPacked", ] test_suite.data_gen = "make_seq_tensor" test_suite.dtypes = ["at::kFloat"] + test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"] return test_suite diff --git a/backends/vulkan/test/utils.py b/backends/vulkan/test/utils.py index 
90edc094ec7..6d3fff452f8 100644 --- a/backends/vulkan/test/utils.py +++ b/backends/vulkan/test/utils.py @@ -8,18 +8,14 @@ import logging from collections import OrderedDict from copy import deepcopy - from enum import auto, Enum from typing import Any, List, Optional, Tuple import executorch.backends.vulkan.utils as utils - import torch - from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner from executorch.backends.vulkan.vulkan_preprocess import VulkanBackend from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner - from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import ( get_symmetric_quantization_config, XNNPACKQuantizer, @@ -36,7 +32,6 @@ ) from executorch.extension.pytree import tree_flatten from torch.export import export - from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e @@ -275,16 +270,25 @@ def check_outputs_equal( ) return result else: + result = True for i in range(len(ref_output)): - if not torch.allclose( - model_output[i], ref_output[i], atol=atol, rtol=rtol - ): - print(f"\n=== Output {i} comparison failed ===") - print_tensor_comparison_errors( - model_output[i], ref_output[i], atol, rtol - ) - return False - return True + if isinstance(ref_output[i], torch.Tensor): + if not torch.allclose( + model_output[i], ref_output[i], atol=atol, rtol=rtol + ): + print(f"\n=== Output {i} comparison failed ===") + print_tensor_comparison_errors( + model_output[i], ref_output[i], atol, rtol + ) + result = False + elif isinstance(ref_output[i], int): + if not model_output[i] == ref_output[i]: + print(f"\n=== Output {i} comparison failed ===") + print(f"{model_output[i]} vs {ref_output[i]}") + result = False + else: + print(f"WARNING: Output {i} has type {type(ref_output[i])}") + return result else: # If one output, eager returns tensor while executor tuple of size 1 result = torch.allclose(model_output[0], ref_output, atol=atol, rtol=rtol) @@ -326,7 +330,7 @@ def run_and_check_output( model_output = executorch_module.run_method("forward", tuple(inputs_flattened)) # Generate reference outputs using the reference model - ref_output = reference_model(*sample_inputs) + ref_output, _ = tree_flatten(reference_model(*sample_inputs)) # Check if outputs are equal return check_outputs_equal( @@ -805,3 +809,26 @@ def find_bad_operators( "all_operators": all_operators, "test_count": test_count, } + + +def make_indent(indent_level): + indent_str = "" + for _ in range(indent_level): + indent_str += " " + return indent_str + + +def print_output(outputs, n: int = 0, indent_level: int = 0): + if isinstance(outputs, (list, tuple)): + print(f"{make_indent(indent_level)}output_{n} = {type(outputs)}") + new_indent_level = indent_level + 2 + for n, test_out in enumerate(outputs): + print_output(test_out, n, new_indent_level) + elif isinstance(outputs, torch.Tensor): + print( + f"{make_indent(indent_level)}output_{n} = test_utils.random_uniform_tensor({outputs.shape}, low={outputs.min().item()}, high={outputs.max().item()}, dtype={outputs.dtype})" + ) + elif isinstance(outputs, int): + print(f"{make_indent(indent_level)}output_{n} = {outputs}") + else: + print(f"{make_indent(indent_level)}output_{n} = {type(outputs)}")
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 7e3d957afdb..7dd3bb84588 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -1911,413 +1911,6 @@
TEST(VulkanComputeGraphTest, test_clone) { } } -TEST(VulkanComputeGraphTest, test_etvk_copy_offset_node) { - GraphConfig config; - ComputeGraph graph(config); - - int64_t n = 6; - int64_t c = 12; - int64_t h = 4; - int64_t w = 8; - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; - - std::vector size = {n, c, h, w}; - - IOValueRef a = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - - IOValueRef out = {}; - out.value = graph.add_tensor(size, vkapi::kFloat, memory_layout); - - // Notice that copy_node operates on in texture's x, y, z dimension. In the - // comment, we provide the cooresponding coordinate in nchw. - - // src_offset is (n=0, c=4, h=1, w=1) - ValueRef src_offset_ref = graph.add_scalar_list({1, 1, 1}); - - // dst_offset is (n=1, c=8, h=2, w=0) in nchw coordinate - // Argument is {x, y, z}. - // x = 0 since w = 0 - // y = 2 since h = 2 - // z = c / 4 + 2 since - // 1. there c/4 planes per batch, n=1 means we are on the first batch; - // 2. +2 because c = 8, with channel packing it means two texels. - ValueRef dst_offset_ref = graph.add_scalar_list({0, 2, c / 4 + 2}); - - // range is (n=1, c=8, h=2, w=4) - // Argument is {x, y, z}. - // x = 4 since w = 4 - // y = 2 since h = 2 - // z = 2 since we are only copying 8 channels, hence 2 texel. n = 1 can be a - // bit misleading here, since it gives the impression that we are copying the - // entire channel. However, remember when we copy, we are trying to - // dst[dst_offset:dst_offset + range] = src[src_offset:src_offset + range], - // range must be non zero. - ValueRef range_ref = graph.add_scalar_list({4, 2, 2}); - - auto copyFn = VK_GET_OP_FN("etvk.copy_offset"); - copyFn( - graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - fill_vtensor(graph, a, 0.0f, /*iota = */ true); - - graph.execute(); - - EXTRACT_TENSOR(out); - EXTRACT_TENSOR(a); - - // We will examine the results in the dst_range - // The value in the cooresponding coordinate should match between the source - // and destination tensor. We loop thru the range, calculate both the src and - // dst index using the offsets, and compare the values in the extracted - // vector. They should match. 
- int n_idx = 0; - // at each nested loop, index range from dst_offset to dst_offset + range - - for (int c_idx = 0; c_idx < 8; c_idx++) { - for (int h_idx = 0; h_idx < 2; h_idx++) { - for (int w_idx = 0; w_idx < 4; w_idx++) { - auto dst_idx = - get_buf_idx(graph, out, {n_idx + 1, c_idx + 8, h_idx + 2, w_idx}); - auto src_idx = - get_buf_idx(graph, a, {n_idx, c_idx + 4, h_idx + 1, w_idx + 1}); - - EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]); - } - } - } -} - -TEST(VulkanComputeGraphTest, DISABLED_test_etvk_copy_channel_offset_node) { - GraphConfig config; - ComputeGraph graph(config); - - int64_t n = 2; - int64_t c = 12; - int64_t h = 4; - int64_t w = 8; - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; - - std::vector size = {n, c, h, w}; - - IOValueRef a = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - - IOValueRef out = {}; - out.value = graph.add_tensor(size, vkapi::kFloat, memory_layout); - - int64_t src_offset = 2; - int64_t dst_offset = 3; - int64_t range = 7; - - ValueRef src_offset_ref = graph.add_scalar(src_offset); - ValueRef dst_offset_ref = graph.add_scalar(dst_offset); - ValueRef range_ref = graph.add_scalar(range); - - auto copyFn = VK_GET_OP_FN("etvk.copy_channel_offset"); - copyFn( - graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - fill_vtensor(graph, a, 0.0f, true); - - graph.execute(); - - EXTRACT_TENSOR(out); - EXTRACT_TENSOR(a); - - for (int n_idx = 0; n_idx < n; n_idx++) { - for (int c_idx = 0; c_idx < range; c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto src_idx = - get_buf_idx(graph, a, {n_idx, c_idx + src_offset, h_idx, w_idx}); - auto dst_idx = get_buf_idx( - graph, out, {n_idx, c_idx + dst_offset, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]); - } - } - } - } -} - -TEST( - VulkanComputeGraphTest, - DISABLED_test_etvk_copy_channel_offset_node_clean_boundary) { - // Tricky part for channel copy is handling the boundary across multiple copy. - // For example, when we concat two [3, 1, 1] nchw-tensors along the channel - // dimension, due to channel packing, elements from different source texel - // will be packed into same destination texel at the boundaries. - GraphConfig config; - ComputeGraph graph(config); - - int64_t n = 2; - int64_t c = 12; - int64_t h = 4; - int64_t w = 8; - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; - - std::vector size = {n, c, h, w}; - - IOValueRef zero = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - IOValueRef a = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - IOValueRef b = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - - IOValueRef out = {}; - out.value = graph.add_tensor(size, vkapi::kFloat, memory_layout); - - auto copyFn = VK_GET_OP_FN("etvk.copy_channel_offset"); - - // Make sure entire out tensor is zeroed. The zero tensor will be filled with - // zero later. 
- copyFn( - graph, - {zero.value, - graph.add_scalar(c), - graph.add_scalar(0), - graph.add_scalar(0), - out.value}); - - int64_t a_src_offset = 0; - int64_t a_dst_offset = 2; - int64_t a_range = 5; - // a will write to channge [2, 7) - copyFn( - graph, - {a.value, - graph.add_scalar(a_range), - graph.add_scalar(a_src_offset), - graph.add_scalar(a_dst_offset), - out.value}); - - // b will write to channel [6, 11) - // Intentional for b to override channel=6 - int64_t b_src_offset = 0; - int64_t b_dst_offset = 6; - int64_t b_range = 5; - - copyFn( - graph, - {b.value, - graph.add_scalar(b_range), - graph.add_scalar(b_src_offset), - graph.add_scalar(b_dst_offset), - out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - float a_value = 1.0f; - float b_value = 2.0f; - float zero_value = 0.0f; - fill_vtensor(graph, a, a_value); - fill_vtensor(graph, b, b_value); - fill_vtensor(graph, zero, zero_value); - - graph.execute(); - - EXTRACT_TENSOR(out); - - for (int n_idx = 0; n_idx < n; n_idx++) { - // c_idx only up to a_range-1 because the expected overwrite by b - for (int c_idx = a_dst_offset; c_idx < a_dst_offset + a_range - 1; - c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == a_value); - } - } - } - } - - for (int n_idx = 0; n_idx < n; n_idx++) { - for (int c_idx = b_dst_offset; c_idx < b_dst_offset + b_range; c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == b_value); - } - } - } - } - - // Also verify that data before a_dst_offset and after b_dst_offset + b_range - // are untouched. - for (int n_idx = 0; n_idx < n; n_idx++) { - for (int c_idx = 0; c_idx < a_dst_offset; c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == zero_value); - } - } - } - } - - for (int n_idx = 0; n_idx < n; n_idx++) { - for (int c_idx = b_dst_offset + b_range; c_idx < c; c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == zero_value); - } - } - } - } -} - -TEST(VulkanComputeGraphTest, test_etvk_copy_offset_int_node) { - GraphConfig config; - ComputeGraph graph(config); - - int64_t n = 6; - int64_t c = 12; - int64_t h = 4; - int64_t w = 8; - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; - - std::vector size = {n, c, h, w}; - - IOValueRef a = graph.add_input_tensor(size, vkapi::kInt, memory_layout); - - IOValueRef out = {}; - out.value = graph.add_tensor(size, vkapi::kInt, memory_layout); - - // Notice that copy_node operates on in texture's x, y, z dimension. In the - // comment, we provide the cooresponding coordinate in nchw. - - // src_offset is (n=0, c=4, h=1, w=1) - ValueRef src_offset_ref = graph.add_scalar_list({1, 1, 1}); - - // dst_offset is (n=1, c=8, h=2, w=0) in nchw coordinate - // Argument is {x, y, z}. - // x = 0 since w = 0 - // y = 2 since h = 2 - // z = c / 4 + 2 since - // 1. there c/4 planes per batch, n=1 means we are on the first batch; - // 2. 
+2 because c = 8, with channel packing it means two texels. - ValueRef dst_offset_ref = graph.add_scalar_list({0, 2, c / 4 + 2}); - - // range is (n=1, c=8, h=2, w=4) - // Argument is {x, y, z}. - // x = 4 since w = 4 - // y = 2 since h = 2 - // z = 2 since we are only copying 8 channels, hence 2 texel. n = 1 can be a - // bit misleading here, since it gives the impression that we are copying the - // entire channel. However, remember when we copy, we are trying to - // dst[dst_offset:dst_offset + range] = src[src_offset:src_offset + range], - // range must be non zero. - ValueRef range_ref = graph.add_scalar_list({4, 2, 2}); - - auto copyFn = VK_GET_OP_FN("etvk.copy_offset"); - copyFn( - graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - fill_vtensor(graph, a, 0, /*iota = */ true); - - graph.execute(); - - EXTRACT_TENSOR(out); - EXTRACT_TENSOR(a); - - // We will examine the results in the dst_range - // The value in the cooresponding coordinate should match between the source - // and destination tensor. We loop thru the range, calculate both the src and - // dst index using the offsets, and compare the values in the extracted - // vector. They should match. - int n_idx = 0; - // at each nested loop, index range from dst_offset to dst_offset + range - - for (int c_idx = 0; c_idx < 8; c_idx++) { - for (int h_idx = 0; h_idx < 2; h_idx++) { - for (int w_idx = 0; w_idx < 4; w_idx++) { - auto dst_idx = - get_buf_idx(graph, out, {n_idx + 1, c_idx + 8, h_idx + 2, w_idx}); - auto src_idx = - get_buf_idx(graph, a, {n_idx, c_idx + 4, h_idx + 1, w_idx + 1}); - - EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]); - } - } - } -} - -TEST(VulkanComputeGraphTest, DISABLED_test_etvk_copy_channel_offset_int_node) { - GraphConfig config; - ComputeGraph graph(config); - - int64_t n = 2; - int64_t c = 12; - int64_t h = 4; - int64_t w = 8; - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; - - std::vector size = {n, c, h, w}; - - IOValueRef a = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - - IOValueRef out = {}; - out.value = graph.add_tensor(size, vkapi::kFloat, memory_layout); - - int64_t src_offset = 2; - int64_t dst_offset = 3; - int64_t range = 7; - - ValueRef src_offset_ref = graph.add_scalar(src_offset); - ValueRef dst_offset_ref = graph.add_scalar(dst_offset); - ValueRef range_ref = graph.add_scalar(range); - - auto copyFn = VK_GET_OP_FN("etvk.copy_channel_offset"); - copyFn( - graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - fill_vtensor(graph, a, 0.0f, true); - - graph.execute(); - - EXTRACT_TENSOR(out); - EXTRACT_TENSOR(a); - - for (int n_idx = 0; n_idx < n; n_idx++) { - for (int c_idx = 0; c_idx < range; c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto src_idx = - get_buf_idx(graph, a, {n_idx, c_idx + src_offset, h_idx, w_idx}); - auto dst_idx = get_buf_idx( - graph, out, {n_idx, c_idx + dst_offset, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]); - } - } - } - } -} - TEST(VulkanComputeGraphTest, test_view_change_packing) { std::vector> layout_pairs = { diff --git a/backends/vulkan/utils.py b/backends/vulkan/utils.py index fca8173ffb7..2ca2ddf19b7 100644 --- a/backends/vulkan/utils.py +++ b/backends/vulkan/utils.py @@ -8,26 +8,18 
@@ from typing import Any, List, Optional, Set, Tuple, Union import torch - from executorch.backends.vulkan.serialization.vulkan_graph_schema import ( VkMemoryLayout, VkStorageType, ) - from executorch.exir.backend.canonical_partitioners.config_partitioner import ( format_target_name, ) - from executorch.exir.dialects.edge._ops import EdgeOpOverload - from executorch.exir.tensor import TensorSpec - from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param - from torch._subclasses.fake_tensor import FakeTensor, FakeTensorConverter - from torch.export import ExportedProgram - from torch.export.exported_program import InputKind from torch.export.graph_signature import TensorArgument @@ -399,10 +391,23 @@ def node_has_target(node: Any, target: str): VkStorageType.TEXTURE_3D, } +# Memory layouts available to non-quantized tensors all_memory_layouts: Set[VkMemoryLayout] = { VkMemoryLayout.TENSOR_WIDTH_PACKED, VkMemoryLayout.TENSOR_HEIGHT_PACKED, VkMemoryLayout.TENSOR_CHANNELS_PACKED, +} + +# Memory layouts available to quantized tensors +all_quantized_memory_layouts: Set[VkMemoryLayout] = { + VkMemoryLayout.PACKED_INT8_4W4C, + VkMemoryLayout.PACKED_INT8_4H4W, +} + +universal_memory_layout_set: Set[VkMemoryLayout] = { + VkMemoryLayout.TENSOR_WIDTH_PACKED, + VkMemoryLayout.TENSOR_HEIGHT_PACKED, + VkMemoryLayout.TENSOR_CHANNELS_PACKED, VkMemoryLayout.PACKED_INT8_4W4C, VkMemoryLayout.PACKED_INT8_4H4W, } @@ -761,7 +766,7 @@ def make_filtered_tensor_repset( ## Convenience TensorRepSet definitions -PACKED_INT8_4W4C_BUFFER = TensorRepSet({VkMemoryLayout.PACKED_INT8_4W4C}, set()) +# Only includes memory layouts that can be used by non-quantized tensors CONTIGUOUS_ANY = TensorRepSet( {VkMemoryLayout.TENSOR_WIDTH_PACKED}, {VkMemoryLayout.TENSOR_WIDTH_PACKED} @@ -782,9 +787,18 @@ def make_filtered_tensor_repset( ANY_TEXTURE = TensorRepSet(set(), all_memory_layouts) ANY_BUFFER = TensorRepSet(all_memory_layouts, set()) - ANY_STORAGE = TensorRepSet(all_memory_layouts, all_memory_layouts) + +# Only includes memory layouts that can be used by quantized tensors + +PACKED_INT8_4W4C_BUFFER = TensorRepSet({VkMemoryLayout.PACKED_INT8_4W4C}, set()) + +# Special use RepSets + NO_STORAGE = TensorRepSet(set(), set()) +ALL_STORAGES_REPSET = TensorRepSet( + universal_memory_layout_set, universal_memory_layout_set +) class TensorRepSetList: @@ -908,19 +922,19 @@ def __init__( # noqa: C901 # Now, go through the arguments of the operator and create a filtered repset # for each based on the actual tensor value. args_repset_list = TensorRepSetList([]) - common_arg_repset = ANY_STORAGE + common_arg_repset = ALL_STORAGES_REPSET for i, arg_node in enumerate(op_node.args): arg_repset = inputs_repsets[i] - # Use ANY_STORAGE for non-tensor nodes so they don't cause the op repsets to - # appear empty + # Use ALL_STORAGES_REPSET for non-tensor nodes so they don't cause the op + # repsets to appear empty if not is_tensor_arg_node(arg_node): - args_repset_list.append(ANY_STORAGE) + args_repset_list.append(ALL_STORAGES_REPSET) # NO_STORAGE is used to denote that an input is either a non tensor arg or # a weight tensor that is not prepacked. Similar to the above, use - # ANY_STORAGE in this case. + # ALL_STORAGES_REPSET in this case. elif arg_repset.is_empty(): - args_repset_list.append(ANY_STORAGE) + args_repset_list.append(ALL_STORAGES_REPSET) else: assert not arg_repset.is_empty() @@ -933,7 +947,7 @@ def __init__( # noqa: C901 # Repeat for output tensors. 
outs_repset_list = TensorRepSetList([]) - common_out_repset = ANY_STORAGE + common_out_repset = ALL_STORAGES_REPSET if num_tensors_in_node(op_node) == 1: common_out_repset = make_filtered_tensor_repset( op_node.meta["val"], outputs_repsets[0], texture_limits
diff --git a/backends/vulkan/vulkan_preprocess.py b/backends/vulkan/vulkan_preprocess.py index 3a3f6cdf4fe..3ccbdc8ab85 100644 --- a/backends/vulkan/vulkan_preprocess.py +++ b/backends/vulkan/vulkan_preprocess.py @@ -6,6 +6,7 @@ # pyre-strict +import copy from functools import partial from typing import Any, Callable, Dict, final, List @@ -127,15 +128,21 @@ def preprocess( # noqa: C901 module_compile_spec: List[CompileSpec], ) -> PreprocessResult: compile_options = parse_compile_spec(module_compile_spec) - limits_x = compile_options.get( - "texture_limits_x", utils.DEFAULT_TEXTURE_LIMITS[0] - ) - limits_y = compile_options.get( - "texture_limits_y", utils.DEFAULT_TEXTURE_LIMITS[1] - ) - limits_z = compile_options.get( - "texture_limits_z", utils.DEFAULT_TEXTURE_LIMITS[2] - ) + + default_texture_limits = copy.deepcopy(utils.DEFAULT_TEXTURE_LIMITS) + # 2048 is the typical limit value for 3D textures, but mobile GPUs often support + # 16384. Since the Vulkan delegate primarily targets mobile GPUs at the moment, + # 16384 is the default texture limit used. This option is provided as a + # convenient way to switch to using a limit of 2048 for image textures, which + # is compatible with most GPUs. + if compile_options.get("small_texture_limits", False): + default_texture_limits[0] = 2048 + default_texture_limits[1] = 2048 + default_texture_limits[2] = 2048 + + limits_x = compile_options.get("texture_limits_x", default_texture_limits[0]) + limits_y = compile_options.get("texture_limits_y", default_texture_limits[1]) + limits_z = compile_options.get("texture_limits_z", default_texture_limits[2]) texture_limits = (limits_x, limits_y, limits_z) default_storage_type = compile_options.get( @@ -204,22 +211,26 @@ def preprocess( # noqa: C901 # Finally, apply dynamic shape passes and memory planning pass. These passes # must be applied only when the graph structure is finalized. - greedy_memory_planning = partial(greedy, allow_overlapping_allocations=False) - mem_planning_suite = MemoryPlanningAlgorithmSuite( - algo_list=[greedy_memory_planning] - ) - # This is a workaround to allow the memory planning pass to work without having - # to first apply ToOutVarPass(). See the `greedy()` function in - # `exir.memory_planning`; if this attribute isn't set, assertions in - # `collect_spec_from_nodes()` will fail. - program.graph_module.encounter_to_out_var_failure = True - program = apply_passes( - program, - [ - ConstraintBasedSymShapeEvalPass(), - MemoryPlanningPass(memory_planning_algo=mem_planning_suite), - ], - ) + final_passes = [ + ConstraintBasedSymShapeEvalPass(), + ] + if not compile_options.get("skip_memory_planning", False): + greedy_memory_planning = partial( + greedy, allow_overlapping_allocations=False + ) + mem_planning_suite = MemoryPlanningAlgorithmSuite( + algo_list=[greedy_memory_planning] + ) + # This is a workaround to allow the memory planning pass to work without having + # to first apply ToOutVarPass(). See the `greedy()` function in + # `exir.memory_planning`; if this attribute isn't set, assertions in + # `collect_spec_from_nodes()` will fail.
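+            # Note that both the attribute set below and the MemoryPlanningPass
+            # appended afterwards live inside this branch, so the
+            # "skip_memory_planning" compile option skips this workaround along
+            # with the pass itself.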
+ program.graph_module.encounter_to_out_var_failure = True + final_passes.append( + MemoryPlanningPass(memory_planning_algo=mem_planning_suite) + ) + + program = apply_passes(program, final_passes) graph_builder = VkGraphBuilder( program, diff --git a/examples/vulkan/export.py b/examples/vulkan/export.py index dace37e5473..58f2ccf1001 100644 --- a/examples/vulkan/export.py +++ b/examples/vulkan/export.py @@ -10,29 +10,29 @@ import argparse import logging +import os -import backends.vulkan.test.utils as test_utils - +import executorch.backends.vulkan.test.utils as test_utils import torch import torchvision - from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner from executorch.devtools import BundledProgram from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite from executorch.devtools.bundled_program.serialize import ( serialize_from_bundled_program_to_flatbuffer, ) +from executorch.examples.models import MODEL_NAME_TO_MODEL +from executorch.examples.models.model_factory import EagerModelFactory from executorch.exir import to_edge_transform_and_lower from executorch.extension.export_util.utils import save_pte_program from executorch.extension.pytree import tree_flatten from torch.export import Dim, export -from ..models import MODEL_NAME_TO_MODEL -from ..models.model_factory import EagerModelFactory - FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" logging.basicConfig(level=logging.INFO, format=FORMAT) +import urllib + def is_vision_model(model_name): if model_name in [ @@ -70,6 +70,38 @@ def get_vision_model_dynamic_shapes(): ) +def get_dog_image_tensor(image_size=224, normalization="imagenet"): + url, filename = ( + "https://github.com/pytorch/hub/raw/master/images/dog.jpg", + "dog.jpg", + ) + try: + urllib.URLopener().retrieve(url, filename) + except: + urllib.request.urlretrieve(url, filename) + + from PIL import Image + from torchvision import transforms + + input_image = Image.open(filename).convert("RGB") + + transforms_list = [ + transforms.Resize((image_size, image_size)), + transforms.ToTensor(), + ] + if normalization == "imagenet": + transforms_list.append( + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ) + + preprocess = transforms.Compose(transforms_list) + + input_tensor = preprocess(input_image) + input_batch = input_tensor.unsqueeze(0) + input_batch = (input_batch,) + return input_batch + + def init_model(model_name): if model_name == "convnext_small": return torchvision.models.convnext_small() @@ -77,13 +109,29 @@ def init_model(model_name): return torchvision.models.densenet161() if model_name == "shufflenet_v2_x1_0": return torchvision.models.shufflenet_v2_x1_0() + if model_name == "YOLO_NAS_S": + try: + from super_gradients.common.object_names import Models + from super_gradients.training import models + except ImportError: + raise ImportError( + "Please install super-gradients to use the YOLO_NAS_S model." 
+ ) + + return models.get(Models.YOLO_NAS_S, pretrained_weights="coco") return None def get_sample_inputs(model_name): + # Lock the random seed for reproducibility + torch.manual_seed(42) + if is_vision_model(model_name): return get_vision_model_sample_input() + if model_name == "YOLO_NAS_S": + input_batch = get_dog_image_tensor(640) + return input_batch return None @@ -95,7 +143,7 @@ def get_dynamic_shapes(model_name): return None -def main() -> None: +def main() -> None: # noqa: C901 logger = logging.getLogger("") logger.setLevel(logging.INFO) @@ -117,6 +165,24 @@ def main() -> None: "False", ) + parser.add_argument( + "--small_texture_limits", + action=argparse.BooleanOptionalAction, + default=False, + help="Sets the default texture limit to be (2048, 2048, 2048) which is " + "compatible with more devices (i.e. desktop/laptop GPUs) compared to the " + "default (16384, 16384, 2048) which is more targeted for mobile GPUs. Default " + "is False.", + ) + + parser.add_argument( + "--skip_memory_planning", + action=argparse.BooleanOptionalAction, + default=False, + help="Skips memory planning pass while lowering, which can be used for " + "debugging. Default is False.", + ) + parser.add_argument( "-s", "--strict", @@ -159,6 +225,13 @@ def main() -> None: help="Execute lower_module_and_test_output to validate the model. Default is False", ) + parser.add_argument( + "--save_inputs", + action=argparse.BooleanOptionalAction, + default=False, + help="Whether to save the inputs to the model. Default is False", + ) + args = parser.parse_args() if args.model_name in MODEL_NAME_TO_MODEL: @@ -189,6 +262,10 @@ def main() -> None: if args.force_fp16: compile_options["force_fp16"] = True + if args.skip_memory_planning: + compile_options["skip_memory_planning"] = True + if args.small_texture_limits: + compile_options["small_texture_limits"] = True logging.info(f"Exporting model {args.model_name} with Vulkan delegate") @@ -230,25 +307,18 @@ def main() -> None: atol = 2e-2 rtol = 1e-1 - # Test the model if --test flag is provided - if args.test: - test_result = test_utils.run_and_check_output( - reference_model=model, - executorch_program=exec_prog, - sample_inputs=example_inputs, - atol=atol, - rtol=rtol, - ) + # Save regular program + save_pte_program(exec_prog, output_filename, args.output_dir) + logging.info( + f"Model exported and saved as {output_filename}.pte in {args.output_dir}" + ) - if test_result: - logging.info( - "✓ Model test PASSED - outputs match reference within tolerance" - ) - else: - logging.error("✗ Model test FAILED - outputs do not match reference") - raise RuntimeError( - "Model validation failed: ExecuTorch outputs do not match reference model outputs" - ) + if args.save_inputs: + inputs_flattened, _ = tree_flatten(example_inputs) + for i, input_tensor in enumerate(inputs_flattened): + input_filename = os.path.join(args.output_dir, f"input{i}.bin") + input_tensor.numpy().tofile(input_filename) + logging.info( + f"Model input saved as {input_filename} in {args.output_dir}" + ) if args.bundled: # Create bundled program @@ -287,13 +357,27 @@ def main() -> None: logging.info( f"Bundled program exported and saved as {output_filename}.bpte in {args.output_dir}" ) - else: - # Save regular program - save_pte_program(exec_prog, output_filename, args.output_dir) - logging.info( - f"Model exported and saved as {output_filename}.pte in {args.output_dir}" + + # Test the model if --test flag is provided + if args.test: + test_result = test_utils.run_and_check_output( + reference_model=model,
executorch_program=exec_prog, + sample_inputs=example_inputs, + atol=atol, + rtol=rtol, ) + if test_result: + logging.info( + "✓ Model test PASSED - outputs match reference within tolerance" + ) + else: + logging.error("✗ Model test FAILED - outputs do not match reference") + raise RuntimeError( + "Model validation failed: ExecuTorch outputs do not match reference model outputs" + ) + if __name__ == "__main__": with torch.no_grad():
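
As a quick reference, the snippet below is a small, self-contained Python sketch of three behaviors introduced by this patch: how vulkan_preprocess.py resolves texture limits when the small_texture_limits compile option is set, how add_split_node derives each split's offset from the preceding split sizes, and how the corrected nchw_dim_to_whcn_dim handles negative dims. The helpers are illustrative stand-ins, not the actual ExecuTorch APIs; in particular, the (16384, 16384, 2048) default is taken from the --small_texture_limits help text above, and normalize() in TensorUtils.h is assumed to map negative dims into the [0, ndim) range.

# Illustrative sketches only; these mirror the patch rather than call into it.

def resolve_texture_limits(compile_options, default_limits=(16384, 16384, 2048)):
    # Mirrors the updated logic in vulkan_preprocess.py: "small_texture_limits"
    # lowers every default to 2048, while explicit "texture_limits_{x,y,z}"
    # entries still take precedence over the defaults.
    defaults = list(default_limits)
    if compile_options.get("small_texture_limits", False):
        defaults = [2048, 2048, 2048]
    return (
        compile_options.get("texture_limits_x", defaults[0]),
        compile_options.get("texture_limits_y", defaults[1]),
        compile_options.get("texture_limits_z", defaults[2]),
    )

def split_offsets(split_sizes):
    # Mirrors the loop in add_split_node: the offset of split i along the
    # split dimension is the sum of the sizes of splits 0..i-1.
    offsets, acc = [], 0
    for size in split_sizes:
        offsets.append(acc)
        acc += size
    return offsets

def nchw_dim_to_whcn_dim(nchw_dim, ndim):
    # Mirrors the updated C++ helper, assuming normalize() wraps negative dims
    # into [0, ndim). Previously dim=-1 with ndim=4 produced 4 (out of range);
    # it now maps to 0, the width dim in WHCN order.
    normalized = nchw_dim + ndim if nchw_dim < 0 else nchw_dim
    return ndim - 1 - normalized

assert resolve_texture_limits({"small_texture_limits": True}) == (2048, 2048, 2048)
assert split_offsets([3, 5, 4]) == [0, 3, 8]
assert nchw_dim_to_whcn_dim(-1, 4) == 0 and nchw_dim_to_whcn_dim(1, 4) == 2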