Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions backends/vulkan/op_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -522,9 +522,6 @@ def register_view_op(features: OpFeatures):
@update_features(
[
# Shape Manipulation
exir_ops.edge.aten.squeeze_copy.dims,
exir_ops.edge.aten.unsqueeze_copy.default,
exir_ops.edge.aten.permute_copy.default,
exir_ops.edge.aten.t_copy.default,
# Indexing and lookup
exir_ops.edge.aten.flip.default,
Expand Down Expand Up @@ -556,10 +553,15 @@ def register_ported_op(features: OpFeatures):
return features


# Ops ported from PyTorch Vulkan backend. These ops are in a separate registry because they support all packed dimensions
@update_features(
[
# Indexing and lookup
exir_ops.edge.aten.slice_copy.Tensor,
# Shape Manipulation
exir_ops.edge.aten.squeeze_copy.dims,
exir_ops.edge.aten.unsqueeze_copy.default,
exir_ops.edge.aten.permute_copy.default,
]
)
def register_ported_op_all_packed_dims(features: OpFeatures):
Expand Down
53 changes: 29 additions & 24 deletions backends/vulkan/runtime/graph/ops/glsl/permute.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -21,56 +21,61 @@ layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} image_i

layout(push_constant) uniform PRECISION restrict Block {
ivec4 out_limits;
ivec4 sizes;
ivec4 in_sizes;
// output dims
ivec4 out_ndims;
// x = output channels aligned to 4, y = input channels aligned to 4
ivec2 ch_info;
ivec2 channel_info;
};

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
layout(constant_id = 3) const int packed_dim = C_DIM;

#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require

void main() {
const u16vec3 pos = u16vec3(gl_GlobalInvocationID);
u16vec3 pos = u16vec3(gl_GlobalInvocationID);

if (any(greaterThanEqual(pos, out_limits.xyz))) {
return;
}

const int out_channel_4up = int(ch_info.x);
const int in_channel_4up = int(ch_info.y);
const int out_batch = int(sizes[3]);
VEC4_T outval = VEC4_T(0.0);
ivec4 v = ivec4(0); // holds b,c,h,w

v[out_ndims[2]] = pos.y;
v[out_ndims[3]] = pos.x;
// scale up output position's packed dim
pos[packed_dim] <<= 2;

const int dst_index = pos.z << 2;
int dst_out_index = dst_index / out_channel_4up;
int dst_out_lane = dst_index % out_channel_4up;
// index of packed dim in bchw format
const int in_packed_dim_bchw_index = 3 - packed_dim;

for (int j = 0; j < 4; ++j, ++dst_out_lane) {
if (dst_out_index >= out_batch) {
// out of range
for (int j = 0; j < 4; ++j, pos[packed_dim]++) {
ivec4 in_bchw_pos = ivec4(0); // holds b,c,h,w
// determine input position based on output position and permute map
// out_ndims is in BCHW format
in_bchw_pos[out_ndims[0]] = (pos.z / channel_info.x);
in_bchw_pos[out_ndims[1]] = (pos.z % channel_info.x);
in_bchw_pos[out_ndims[2]] = pos.y;
in_bchw_pos[out_ndims[3]] = pos.x;

if (any(greaterThanEqual(in_bchw_pos.wzyx, in_sizes.xyzw))) {
break;
}

if (dst_out_lane == out_channel_4up) {
dst_out_lane = 0;
dst_out_index++;
}
// input tensor's packed dim pos (in xyz format) corresponding to output tensor's pos (which is also in xyz format)
const int in_packed_dim_pos = in_bchw_pos[in_packed_dim_bchw_index];

v[out_ndims[0]] = dst_out_index;
v[out_ndims[1]] = dst_out_lane;
// calculate input position in y axis using batch and channel index which is in_bchw_pos.x and in_bchw_pos.y respectively
in_bchw_pos.y = in_bchw_pos.y + in_bchw_pos.x * channel_info.y;

int src_index = v[0] * in_channel_4up + v[1];
// scale down input tensor's packed dim pos to perform fetch
in_bchw_pos[in_packed_dim_bchw_index] >>= 2;

VEC4_T inval = VEC4_T(texelFetch(image_in, u16vec3(v[3], v[2], src_index >> 2), 0));
outval[j] = inval[src_index & 0x3];
// fetch input texel
VEC4_T inval = VEC4_T(texelFetch(image_in, u16vec3(in_bchw_pos.wzy), 0));
outval[j] = inval[in_packed_dim_pos & 0x3];
}

pos[packed_dim] = uint16_t(gl_GlobalInvocationID[packed_dim]);

imageStore(image_out, pos, outval);
}
19 changes: 11 additions & 8 deletions backends/vulkan/runtime/graph/ops/impl/Permute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,7 @@ void check_args(
const api::vTensor& in,
const std::vector<int64_t>& permute_dims,
const api::vTensor& out) {
VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim));
VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim));
VK_CHECK_COND(check_same_packed_dim(in, out));

// This implementation does not require the input tensor to have the same
// dim size as the argument. The code will work as long as the input tensor's
Expand Down Expand Up @@ -72,10 +71,14 @@ void add_permute_node(
int32_t out_channels = dim_at<kChannel4D>(t_out->sizes());
int32_t in_channels = dim_at<kChannel4D>(t_in->sizes());

int32_t out_c_aligned = utils::align_up_4(out_channels);
int32_t in_c_aligned = utils::align_up_4(in_channels);
const auto packed_dim = graph.packed_dim_of(in);
ivec2 channel_info = {out_channels, in_channels};
if (packed_dim == WHCN::kChannelsDim) {
channel_info[0] = utils::align_up_4(channel_info[0]);
channel_info[1] = utils::align_up_4(channel_info[1]);
}

const ivec2 ch_info = {out_c_aligned, in_c_aligned};
const vkapi::SpecVarList spec_vars = {packed_dim};

graph.execute_nodes().emplace_back(new DispatchNode(
graph,
Expand All @@ -86,14 +89,14 @@ void add_permute_node(
{in, vkapi::MemoryAccessType::READ}},
{},
// Specialization Constants
{},
spec_vars,
// Resizing Logic
nullptr,
{},
{{graph.logical_limits_pc_of(out),
graph.sizes_pc_of(out),
graph.sizes_pc_of(in),
PushConstantDataInfo(&out_dims, sizeof(out_dims)),
PushConstantDataInfo(&ch_info, sizeof(ch_info))}}));
PushConstantDataInfo(&channel_info, sizeof(channel_info))}}));
}

void add_permute_node(
Expand Down
34 changes: 15 additions & 19 deletions backends/vulkan/test/op_tests/cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
# LICENSE file in the root directory of this source tree.


import itertools

from collections import namedtuple
from typing import Callable

Expand Down Expand Up @@ -457,26 +459,20 @@ def get_select_int_inputs():

@register_test_suite(["aten.permute.default", "aten.permute_copy.default"])
def get_permute_inputs():
test_suite = VkTestSuite(
[
((9, 2, 9, 4), [0, 1, 2, 3]),
((9, 2, 9, 4), [0, 1, 3, 2]),
((9, 2, 9, 4), [0, 2, 1, 3]),
((9, 2, 9, 4), [0, 2, 3, 1]),
((9, 2, 9, 4), [0, 3, 1, 2]),
((9, 2, 9, 4), [0, 3, 2, 1]),
((9, 2, 9, 4), [3, 0, 1, 2]),
((9, 2, 9, 4), [3, 2, 0, 1]),
((9, 2, 9, 4), [2, 3, 0, 1]),
((9, 2, 9, 4), [2, 0, 3, 1]),
((9, 2, 9), [2, 0, 1]),
((9, 2, 9), [1, 2, 0]),
((9, 2), [0, 1]),
((9, 2), [1, 0]),
]
)
batch_tests = [
((9, 2, 5, 7), out_axis) for out_axis in itertools.permutations([0, 1, 2, 3])
]
channel_tests = [
((9, 2, 5), out_axis) for out_axis in itertools.permutations([0, 1, 2])
]
wh_tests = [((9, 2), out_axis) for out_axis in itertools.permutations([0, 1])]
test_suite = VkTestSuite(batch_tests + channel_tests + wh_tests)

test_suite.layouts = ["utils::kChannelsPacked"]
test_suite.layouts = [
"utils::kWidthPacked",
"utils::kHeightPacked",
"utils::kChannelsPacked",
]
return test_suite


Expand Down
Loading