pytorch
diff --git a/‎backends/arm/operator_support/to_copy_support.py‎
Lines changed: 1 addition & 0 deletions b/‎backends/arm/operator_support/to_copy_support.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎backends/cadence/aot/compiler.py‎
Lines changed: 5 additions & 1 deletion b/‎backends/cadence/aot/compiler.py‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎backends/cadence/aot/replace_ops.py‎
Lines changed: 73 additions & 0 deletions b/‎backends/cadence/aot/replace_ops.py‎
Lines changed: 73 additions & 0 deletions
diff --git a/‎backends/cadence/fusion_g3/operators/op_exp.cpp‎
Lines changed: 4 additions & 4 deletions b/‎backends/cadence/fusion_g3/operators/op_exp.cpp‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl‎
Lines changed: 1 addition & 1 deletion b/‎backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl‎
Lines changed: 1 addition & 1 deletion b/‎backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl‎
Lines changed: 1 addition & 1 deletion b/‎backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl‎
Lines changed: 25 additions & 23 deletions b/‎backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl‎
Lines changed: 25 additions & 23 deletions
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml‎
Lines changed: 2 additions & 1 deletion b/‎backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎backends/xnnpack/test/ops/test_cat.py‎
Lines changed: 9 additions & 0 deletions b/‎backends/xnnpack/test/ops/test_cat.py‎
Lines changed: 9 additions & 0 deletions
@@ -125,6 +125,7 @@ def is_node_supported(self, node: fx.Node, tosa_spec: TosaSpecification) -> bool
         # Check dim_order (to_dim_order_copy)
         if "dim_order" in node.kwargs:
             dim_order = node.kwargs["dim_order"]
+            # pyre-ignore[6]
             if dim_order != list(range(len(dim_order))):
                 logger.info(
                     f"Argument {dim_order=} is not supported for "
 
@@ -33,6 +33,7 @@
     ExecutorchProgramManager,
     to_edge,
 )
+from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import PassResult
 from executorch.exir.passes import ToOutVarPass
 from executorch.exir.passes.sym_shape_eval_pass import HintBasedSymShapeEvalPass
@@ -186,14 +187,17 @@ def export_to_edge(
     edge_prog_manager = to_edge(
         expo_program,
         compile_config=EdgeCompileConfig(
-            _skip_dim_order=True,
             # Allow specific non-core aten ops in the IR.
             _core_aten_ops_exception_list=[
                 torch.ops.aten._native_batch_norm_legit_functional.default,
                 torch.ops.aten.linear.default,
                 torch.ops.aten.linalg_vector_norm.default,
                 torch.ops.aten.unfold.default,
                 torch.ops.aten.angle.default,
+                # Cadence replaces to_dim_order_copy with _to_copy for performance.
+                # Exempt the _to_copy op here to get around the dim order check.
+                # We should remove this op once Cadence supports dim order.
+                exir_ops.edge.aten._to_copy.default,
             ],
         ),
         constant_methods=constant_methods,
 
@@ -11,6 +11,7 @@
 
 # pyre-unsafe
 
+import copy
 import math
 from operator import neg
 from typing import cast, Dict, Iterable, Sequence, Set, Tuple
@@ -35,7 +36,12 @@
 from executorch.backends.cadence.aot.utils import get_edge_overload_packet
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.dialects.edge._ops import EdgeOpOverload, EdgeOpOverloadPacket
+from executorch.exir.dim_order_utils import get_memory_format
 from executorch.exir.pass_base import ExportPass, NodeMetadata, PassResult, ProxyValue
+from executorch.exir.passes.dim_order_ops_registry import (
+    DimOrderOpsMap,
+    MemoryFormatOpsMap,
+)
 from torch._subclasses import FakeTensor
 from torch.fx.node import Argument
 
@@ -1799,6 +1805,72 @@ def call_operator(
         )
 
 
+@register_cadence_pass(CadencePassAttribute(opt_level=0))
+class ReplaceToDimOrderCopyWithToCopyPass(ExportPass):
+    """
+    Replace every op found in DimOrderOpsMap with the op that
+    MemoryFormatOpsMap maps it to, swapping the `dim_order` kwarg for a
+    `memory_format` kwarg.
+
+    dim_order_ops::to_dim_order_copy is not supported, so this is an opt_level=0 pass.
+    If the dim order is sequential, we don't need the extra work with strides and
+    can just use to_copy.
+    """
+
+    def call_operator(
+        self,
+        op,
+        args: Tuple[Argument, ...],
+        kwargs: Dict[str, Argument],
+        meta: NodeMetadata,
+    ) -> ProxyValue:
+        """
+        Rewrite a dim-order op into its memory-format equivalent.
+
+        Ops that are not keys of DimOrderOpsMap are forwarded unchanged. For
+        the others, the `dim_order` kwarg is removed, validated, converted
+        with `get_memory_format` and passed on as `memory_format`; positional
+        args and metadata are forwarded as-is.
+
+        Raises:
+            AssertionError: if the first argument is not a tensor-valued
+                ProxyValue, a torch.Tensor or an immutable_list, or if
+                `dim_order` is neither None, empty, nor `[0, 1, ..., ndim-1]`.
+        """
+        if op not in DimOrderOpsMap:
+            return super().call_operator(op, args, kwargs, meta)
+
+        # new kwargs with dim_order, and no memory_format for the new op
+        nkwargs = dict(copy.deepcopy(kwargs))  # orig kwargs are immutable
+
+        ndim = None
+
+        # can always get the shape, assuming rank is specialized
+
+        # pyre-ignore[16]: `None` has no attribute `to_tensor`
+        if isinstance(args[0], ProxyValue) and args[0].is_tensor():
+            # pyre-ignore[16]: `None` has no attribute `to_tensor`
+            ndim = args[0].to_tensor().dim()
+        elif isinstance(args[0], torch.Tensor):
+            # pyre-ignore[16]: `None` has no attribute `dim`
+            ndim = args[0].dim()
+        elif isinstance(args[0], torch.fx.immutable_collections.immutable_list):
+            # pyre-ignore[6]: Incompatible parameter type
+            ndim = len(args[0])
+        else:
+            assert 0, f"Expecting a Tensor or a ProxyValue but got {type(args[0])}"
+
+        # get the "to" memory format for the EdgeOp
+        contiguous_dim_order = list(range(ndim))
+        dim_order = nkwargs.pop("dim_order", None)
+
+        # Cadence only supports contiguous memory format
+        assert (
+            dim_order is None
+            # pyre-ignore[6]: Incompatible parameter type
+            or len(dim_order) == 0
+            or dim_order == contiguous_dim_order
+        ), "Expected dim order in contiguous or preserve memory format, but got {}".format(
+            dim_order
+        )
+
+        # bring back memory format
+        # NOTE(review): relies on get_memory_format accepting None and an
+        # empty dim order, both of which pass the assertion above - confirm.
+        # pyre-ignore[6]: Incompatible parameter type
+        nkwargs["memory_format"] = get_memory_format(dim_order)
+
+        memory_format_op = MemoryFormatOpsMap[op]
+
+        return super().call_operator(
+            memory_format_op,
+            args,
+            nkwargs,
+            meta,
+        )
+
+
 @register_cadence_pass(CadencePassAttribute(opt_level=0))
 class ReplaceFullLikeWithFullPass(ExportPass):
     """
@@ -2108,4 +2180,5 @@ class CadenceReplaceOpsInGraph:
         ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass,
         ReplaceAtenAvgPoolWithJarvisAvgPoolPass,
         ReplaceAtenLinalgVectorNormWithCadenceLinalgVectorNormPass,
+        ReplaceToDimOrderCopyWithToCopyPass,
     ]
@@ -49,9 +49,9 @@ Tensor& exp_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
       out);
 #endif
 
-  if (out.scalar_type() == ScalarType::Float) {
-    float* const out_data = out.mutable_data_ptr<float>();
-    const float* const in_data = in.const_data_ptr<float>();
+  if (in.scalar_type() == ScalarType::Float) {
+    float* __restrict__ out_data = out.mutable_data_ptr<float>();
+    const float* __restrict__ in_data = in.const_data_ptr<float>();
 
     XT_KERNEL_CHECK(
         ctx, out, xa_nn_elm_exp_f32_f32, out_data, in_data, out.numel());
@@ -66,4 +66,4 @@ Tensor& exp_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
 } // namespace native
 } // namespace G3
 } // namespace impl
-} // namespace cadence
+} // namespace cadence
@@ -41,7 +41,7 @@ void main() {
     div_by_x % out_limits.y,
     div_by_x / out_limits.y);
 
-  if (any(greaterThanEqual(pos, out_limits))) {
+  if (pos.z >= out_limits.z) {
     return;
   }
 
 
@@ -66,7 +66,7 @@ void main() {
   pos.y *= BATCH_SIZE_Y;
 
   // do not process if top pixel does not fit within the output range
-  if (any(greaterThanEqual(pos, out_limits))) {
+  if (pos.z >= out_limits.z) {
     return;
   }
 
 
@@ -44,7 +44,7 @@ void main() {
     div_by_x % out_limits.y,
     div_by_x / out_limits.y);
 
-  if (any(greaterThanEqual(pos, out_limits))) {
+  if (pos.z >= out_limits.z) {
     return;
   }
 
 
@@ -12,7 +12,9 @@
 
 #define VEC4_T ${texel_type(DTYPE)}
 
-#define TILE_SIZE ${TILE_SIZE}
+#define TILE_SIZE_X ${TILE_SIZE_X}
+#define TILE_SIZE_Y ${TILE_SIZE_Y}
+#define LOCAL_WG_SIZE 64
 
 #define op(X, A, B) ${OPERATOR}
 
@@ -41,19 +43,19 @@ layout(push_constant) uniform restrict Block {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-// shared memory to hold calculated positions, this would reduce register usage thus improving performance.
-// 64 is the number of threads in the local wg
-$num_shared = 64 * TILE_SIZE * TILE_SIZE
-shared ivec2 pos_shared[${num_shared}];
+// For performance improvement, reduce register usage by caching positions in shared memory.
+// Offset index by 1 every 16 points to avoid bank access conflict.
+#define offset_pos_index(index) (index + ((index) >> 4))
+shared ivec3 pos_shared[offset_pos_index(LOCAL_WG_SIZE * TILE_SIZE_X * TILE_SIZE_Y)];
 
 /*
  * Computes a 2D pointwise convolution of an NxN output tile. Calculating an
  * output tile for pointwise convolution is more efficient because the kernel
  * size is only 1x1, making it easier to re-use loaded texels from t_kernel.
  */
 void main() {
-  const ivec2 out_limits_scaled = (out_limits.xy + TILE_SIZE - 1) / TILE_SIZE;
-  const uint shared_mem_stride = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z;
+  const ivec2 out_limits_scaled = (out_limits.xy + ivec2(TILE_SIZE_X - 1, TILE_SIZE_Y - 1)) / ivec2(TILE_SIZE_X, TILE_SIZE_Y);
+  const uint shared_mem_stride = LOCAL_WG_SIZE;
 
   const uint div_by_x = gl_GlobalInvocationID.x / out_limits_scaled.x;
   const ivec3 gpos = ivec3(
@@ -67,33 +69,32 @@ void main() {
   // +--------+--------+
   // | pos[2] | pos[3] |
   // +--------+--------+
-  ivec2 pos[TILE_SIZE * TILE_SIZE];
-  for (int y = 0, i = 0; y < TILE_SIZE; ++y) {
-    for (int x = 0; x < TILE_SIZE; ++x) {
-      pos[i] = ivec2(
-          gpos.x * TILE_SIZE + x, gpos.y * TILE_SIZE + y);
-      pos_shared[(shared_mem_stride * i) + gl_LocalInvocationIndex] = pos[i];
+  ivec2 pos[TILE_SIZE_X * TILE_SIZE_Y];
+  for (int y = 0, i = 0; y < TILE_SIZE_Y; ++y) {
+    for (int x = 0; x < TILE_SIZE_X; ++x) {
+      pos[i] = ivec2(gpos.x * TILE_SIZE_X + x, gpos.y * TILE_SIZE_Y + y);
+      pos_shared[offset_pos_index((shared_mem_stride * i) + gl_LocalInvocationIndex)] = ivec3(pos[i], gpos.z);
       i++;
     }
   }
 
   // If the top left position is out of bounds, then this invocation will have
   // no work to do.
-  if (any(greaterThanEqual(ivec3(pos[0], gpos.z), out_limits.xyz))) {
+  if (gpos.z >= out_limits.z) {
     return;
   }
 
   // Compute the index of the input texture that needs to be loaded for each
   // output position. Note that negative indices can be produced indicating that
   // the top-left element is in a region added by padding.
-  ivec2 ipos[TILE_SIZE * TILE_SIZE];
-  for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
+  ivec2 ipos[TILE_SIZE_X * TILE_SIZE_Y];
+  for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) {
     ipos[i] = pos[i] * stride - padding;
   }
 
-  vec4 sum[TILE_SIZE * TILE_SIZE];
+  vec4 sum[TILE_SIZE_X * TILE_SIZE_Y];
   sum[0] = texelFetch(t_bias, ivec2(gpos.z, 0), 0);
-  for (int i = 1; i < TILE_SIZE * TILE_SIZE; ++i) {
+  for (int i = 1; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) {
     sum[i] = sum[0];
   }
 
@@ -109,7 +110,7 @@ void main() {
     const vec4 ktex_3 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(3, 0));
 
 #pragma unroll
-    for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
+    for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) {
       const vec4 in_tex = texelFetch(t_in, ivec3(ipos[i], z4), 0);
       // For 2x2 tile size algorithm works as follows.
       // To explain the calculations below, the contents of one in_tex and the
@@ -151,10 +152,11 @@ void main() {
     }
   }
 
-  for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
-    const ivec2 pos = pos_shared[(shared_mem_stride * i) + gl_LocalInvocationIndex];
-    if (all(lessThan(ivec3(pos, gpos.z), out_limits.xyz))) {
-      imageStore(t_out, ivec3(pos, gpos.z), op(sum[i], out_min, out_max));
+  for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) {
+    const uint index = (shared_mem_stride * i) + gl_LocalInvocationIndex;
+    const ivec3 pos = pos_shared[offset_pos_index(index)];
+    if (all(lessThan(pos, out_limits.xyz))) {
+      imageStore(t_out, pos, op(sum[i], out_min, out_max));
     }
   }
 }
@@ -9,7 +9,8 @@ conv2d_pw:
     OPERATOR: X
     NDIM: 3
     DTYPE: float
-    TILE_SIZE: 2
+    TILE_SIZE_X: 2
+    TILE_SIZE_Y: 2
   generate_variant_forall:
     DTYPE:
       - VALUE: half
 
@@ -187,6 +187,15 @@ def test_qs8_cat_gt_5(self):
                 inputs.append(torch.randn(1, 2, 3))
             self._test_cat(self.Cat(), tuple(inputs), cat_num=num_inputs, quant=True)
 
+    def test_qs8_cat_with_empty_tensor(self):
+        """Run the quantized cat test with empty tensors as first and last inputs."""
+        # Size of dim 0 for each input; the zeros produce empty tensors.
+        leading_dims = (0, 1, 3, 0)
+        inputs = tuple(torch.randn(dim0, 2, 3) for dim0 in leading_dims)
+        self._test_cat(self.Cat(), inputs, cat_num=4, quant=True)
+
     class CatNegativeDim(torch.nn.Module):
         def __init__(self):
             super().__init__()
Original file line number	Diff line number	Diff line change
`@@ -41,7 +41,7 @@ void main() {`
`41`	`41`	`div_by_x % out_limits.y,`
`42`	`42`	`div_by_x / out_limits.y);`
`43`	`43`
`44`		`- if (any(greaterThanEqual(pos, out_limits))) {`
	`44`	`+ if (pos.z >= out_limits.z) {`
`45`	`45`	`return;`
`46`	`46`	`}`
`47`	`47`
Original file line number	Diff line number	Diff line change
`@@ -66,7 +66,7 @@ void main() {`
`66`	`66`	`pos.y *= BATCH_SIZE_Y;`
`67`	`67`
`68`	`68`	`// do not process if top pixel does not fit within the output range`
`69`		`- if (any(greaterThanEqual(pos, out_limits))) {`
	`69`	`+ if (pos.z >= out_limits.z) {`
`70`	`70`	`return;`
`71`	`71`	`}`
`72`	`72`
Original file line number	Diff line number	Diff line change
`@@ -44,7 +44,7 @@ void main() {`
`44`	`44`	`div_by_x % out_limits.y,`
`45`	`45`	`div_by_x / out_limits.y);`
`46`	`46`
`47`		`- if (any(greaterThanEqual(pos, out_limits))) {`
	`47`	`+ if (pos.z >= out_limits.z) {`
`48`	`48`	`return;`
`49`	`49`	`}`
`50`	`50`