156 changes: 156 additions & 0 deletions backends/cadence/generic/operators/op_avg_pool2d.cpp
@@ -0,0 +1,156 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/backends/cadence/generic/operators/op_avg_pool2d.h>

#include <algorithm>
#include <cmath>
#include <limits>

#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>

namespace impl {
namespace generic {
namespace native {

using ::executorch::aten::IntArrayRef;
using ::executorch::aten::optional;
using ::executorch::aten::ScalarType;
using ::executorch::aten::Tensor;
using ::executorch::runtime::getLeadingDims;
using ::executorch::runtime::KernelRuntimeContext;

// Compute avg_pool2d for in_data in NCHW layout. IT is the input datatype,
// and AT is the accumulation datatype. 'quantized' is true when the input is
// a quantized tensor.
template <typename IT, typename AT = IT, bool quantized = false>
void avg_pool2d_nchw(
const IT* __restrict__ in_data,
const int32_t in_zero_point,
IT* __restrict__ out_data,
IntArrayRef kernel_size,
IntArrayRef stride,
IntArrayRef padding,
bool count_include_pad,
int64_t divisor,
int leading_dims,
int ih,
int iw,
int oh,
int ow) {
int kh = kernel_size[0];
int kw = kernel_size[1];
int s0 = stride[0];
int s1 = stride[1];
int p0 = padding[0];
int p1 = padding[1];

for (int _n = 0; _n < leading_dims; ++_n) {
for (int _ih = 0, _oh = 0; _oh < oh; ++_oh, _ih += s0) {
int input_offset = _n * ih * iw;
int output_offset = _n * oh * ow + _oh * ow;
for (int _iw = 0, _ow = 0; _ow < ow; ++_ow, _iw += s1) {
int kh_lo = std::max(0, _ih - p0);
int kh_hi = std::min(ih, _ih + kh - p0);
int kw_lo = std::max(0, _iw - p1);
int kw_hi = std::min(iw, _iw + kw - p1);
// Count the number of contributions sans padding
int count = (kh_hi - kh_lo) * (kw_hi - kw_lo);
// Set the accumulator
AT acc = count_include_pad ? in_zero_point * (kh * kw - count) : 0;
// Accumulate values
for (int _kh = kh_lo; _kh < kh_hi; ++_kh) {
for (int _kw = kw_lo; _kw < kw_hi; ++_kw) {
int input_addr = input_offset + _kh * iw + _kw;
acc += in_data[input_addr];
}
}
// The divisor changes depending on whether the count includes
// padded cells or not.
float inv_divisor = 1. / (count_include_pad ? divisor : count);
float val = acc * inv_divisor;
if (quantized) {
int32_t min_val =
static_cast<int32_t>(std::numeric_limits<IT>::min());
int32_t max_val =
static_cast<int32_t>(std::numeric_limits<IT>::max());
out_data[output_offset + _ow] = std::min(
std::max(int32_t(std::nearbyint(val)), min_val), max_val);
} else {
out_data[output_offset + _ow] = val;
}
}
}
}
}

Tensor& avg_pool2d_out(
ET_UNUSED KernelRuntimeContext& ctx,
const Tensor& input,
IntArrayRef kernel_size,
IntArrayRef stride,
IntArrayRef padding,
bool ceil_mode,
bool count_include_pad,
optional<int64_t> divisor_override,
const optional<Tensor>& in_zero_point_t,
bool channel_last,
Tensor& out) {
ET_DCHECK_MSG(!channel_last, "NHWC layout for avg_pool2d not yet supported");
const int32_t in_zero_point = in_zero_point_t.has_value()
? in_zero_point_t.value().const_data_ptr<int32_t>()[0]
: 0;
const int64_t divisor = divisor_override.has_value()
? divisor_override.value()
: kernel_size[0] * kernel_size[1];

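// Collapse all leading dims (batch and channels) into one loop count; the
// kernel pools over the trailing 2D (H x W) plane of each slice.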
const int odim = out.dim();
const int on = getLeadingDims(out, odim - 2);
const int oh = out.size(odim - 2);
const int ow = out.size(odim - 1);
const int ih = input.size(odim - 2);
const int iw = input.size(odim - 1);

// We instantiate the kernel for float and uint8_t. The template would also
// work for double, but other dtypes are not supported.
#define typed_avg_pool2d(btype, ctype, quantized, dtype) \
case ScalarType::dtype: { \
avg_pool2d_nchw<btype, ctype, quantized>( \
input.const_data_ptr<btype>(), \
in_zero_point, \
out.mutable_data_ptr<btype>(), \
kernel_size, \
stride, \
padding, \
count_include_pad, \
divisor, \
on, \
ih, \
iw, \
oh, \
ow); \
break; \
}

ScalarType dtype = input.scalar_type();
switch (dtype) {
typed_avg_pool2d(float, float, false, Float);
typed_avg_pool2d(uint8_t, int32_t, true, Byte);
default:
ET_DCHECK_MSG(
false,
"avg_pool2d not implemented for dtype %s",
torch::executor::toString(dtype));
}

return out;
}

} // namespace native
} // namespace generic
} // namespace impl
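To make the divisor handling above concrete, here is a small standalone sketch (an editor's illustration, not part of this PR): it averages one kernel window at a padded corner the way avg_pool2d_nchw does, once excluding and once including the padded taps. The helper name average_window and the toy 3x3 input are hypothetical; in the float path padded cells contribute zero, while the quantized path in the kernel above additionally seeds the accumulator with in_zero_point for them.

// Hypothetical, self-contained illustration of the window arithmetic in
// avg_pool2d_nchw; compiles with any C++17 compiler.
#include <algorithm>
#include <cstdio>

// Average the kh x kw window anchored at output row/col (already scaled by
// the stride), with padding p on each side of an ih x iw input.
float average_window(const float* in, int ih, int iw, int row, int col,
                     int kh, int kw, int p, bool count_include_pad) {
  int kh_lo = std::max(0, row - p);
  int kh_hi = std::min(ih, row + kh - p);
  int kw_lo = std::max(0, col - p);
  int kw_hi = std::min(iw, col + kw - p);
  int count = (kh_hi - kh_lo) * (kw_hi - kw_lo);  // taps inside the input
  float acc = 0.f;
  for (int r = kh_lo; r < kh_hi; ++r) {
    for (int c = kw_lo; c < kw_hi; ++c) {
      acc += in[r * iw + c];
    }
  }
  // Only the divisor changes between the two modes (padded taps add zero).
  return acc / static_cast<float>(count_include_pad ? kh * kw : count);
}

int main() {
  const float in[9] = {1, 1, 1, 1, 1, 1, 1, 1, 1};  // 3x3 input of ones
  // 3x3 kernel, padding 1, window at the top-left corner: only 4 of the 9
  // taps fall inside the input.
  std::printf("%f\n", average_window(in, 3, 3, 0, 0, 3, 3, 1, false));  // 1.000000
  std::printf("%f\n", average_window(in, 3, 3, 0, 0, 3, 3, 1, true));   // 0.444444
  return 0;
}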
34 changes: 34 additions & 0 deletions backends/cadence/generic/operators/op_avg_pool2d.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/kernel/kernel_runtime_context.h>

namespace impl {
namespace generic {
namespace native {

::executorch::aten::Tensor& avg_pool2d_out(
::executorch::runtime::KernelRuntimeContext& ctx,
const ::executorch::aten::Tensor& input,
::executorch::aten::IntArrayRef kernel_size,
::executorch::aten::IntArrayRef stride,
::executorch::aten::IntArrayRef padding,
bool ceil_mode,
bool count_include_pad,
::executorch::aten::optional<int64_t> divisor_override,
const ::executorch::aten::optional<::executorch::aten::Tensor>&
in_zero_point_t,
bool channel_last,
::executorch::aten::Tensor& out);

} // namespace native
} // namespace generic
} // namespace impl
104 changes: 104 additions & 0 deletions backends/cadence/generic/operators/op_conv1d.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/backends/cadence/generic/operators/op_conv1d.h>

#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>

namespace impl {
namespace generic {
namespace native {

using ::executorch::aten::IntArrayRef;
using ::executorch::aten::ScalarType;
using ::executorch::aten::Tensor;
using ::executorch::runtime::KernelRuntimeContext;

// This implements a generic 1D float32 convolution kernel.
// The input is of shape [n x c x w] (batch x channels x width).
// The weight is of shape [oc x wc x ww] (out_channels x weight_channels x
// weight_width).
// The output is of shape [n x oc x ow] (batch x out_channels x out_width).
// The bias is of shape [oc].

Tensor& conv1d_out(
ET_UNUSED KernelRuntimeContext& ctx,
const Tensor& input,
const Tensor& weight,
const Tensor& bias,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef dilation,
int64_t groups,
Tensor& out) {
// Extract dimensions
const int n = input.size(0);
const int c = input.size(1);
const int w = input.size(2);
const int oc = weight.size(0);
const int wc = weight.size(1);
const int ww = weight.size(2);
const int ow = out.size(2);

const int16_t s = static_cast<int16_t>(stride[0]);
const int16_t p = static_cast<int16_t>(padding[0]);
const int16_t d = static_cast<int16_t>(dilation[0]);
const int16_t g = static_cast<int16_t>(groups);

const float* p_in = input.const_data_ptr<float>();
const float* p_weight = weight.const_data_ptr<float>();
const float* p_bias = bias.const_data_ptr<float>();
float* p_out = out.mutable_data_ptr<float>();

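// Fast path: with no padding and unit dilation, every weight tap falls
// inside the input, so the per-tap bounds check can be skipped.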
const bool zero_pad_unit_dilation = d == 1 && p == 0;
const int ocpg = oc / g;
const int icpg = c / g;

for (int _n = 0; _n < n; ++_n) {
const float* in_batch = p_in + _n * c * w;
float* out_batch = p_out + _n * oc * ow;
for (int _g = 0; _g < g; ++_g) {
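// sic / soc: first input / output channel handled by group _g.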
int sic = _g * icpg;
int soc = _g * ocpg;
for (int _oc = soc; _oc < soc + ocpg; ++_oc) {
float* out_plane = out_batch + _oc * ow;
const float* weight_batch = p_weight + _oc * wc * ww;
for (int _w = 0, _ow = 0; _ow < ow; _w += s, ++_ow) {
float acc = p_bias[_oc];
if (zero_pad_unit_dilation) {
for (int _ic = sic; _ic < sic + icpg; ++_ic) {
const float* in_plane = in_batch + _ic * w;
const float* weight_plane = weight_batch + (_ic - sic) * ww;
for (int _ww = 0; _ww < ww; ++_ww) {
int ioff = _w + _ww;
acc += in_plane[ioff] * weight_plane[_ww];
}
}
} else {
for (int _ic = sic; _ic < sic + icpg; ++_ic) {
const float* in_plane = in_batch + _ic * w;
const float* weight_plane = weight_batch + (_ic - sic) * ww;
for (int _ww = 0; _ww < ww; ++_ww) {
int w_pos = _w + d * _ww - p;
if (w_pos >= 0 && w_pos < w) {
acc += in_plane[w_pos] * weight_plane[_ww];
}
}
}
}
out_plane[_ow] = acc;
}
}
}
}

return out;
}

} // namespace native
} // namespace generic
} // namespace impl
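As a quick, standalone check of the index arithmetic above (output position _ow reads input position _ow * s + d * _ww - p for weight tap _ww), here is an editor's sketch, not part of this PR; the toy sizes, the single-channel single-group setup, and the literal kernel values are hypothetical.

// Hypothetical single-channel, single-group 1D convolution that reuses the
// same index math as conv1d_out; compiles with any C++17 compiler.
#include <cstdio>
#include <vector>

int main() {
  const std::vector<float> in = {1, 2, 3, 4, 5};  // input width w = 5
  const std::vector<float> wt = {1, 0, -1};       // kernel width ww = 3
  const int s = 1, p = 1, d = 1;                  // stride, padding, dilation
  const int w = static_cast<int>(in.size());
  const int ww = static_cast<int>(wt.size());
  // Standard output-width formula for a strided, padded, dilated 1D conv.
  const int ow = (w + 2 * p - d * (ww - 1) - 1) / s + 1;
  for (int _ow = 0; _ow < ow; ++_ow) {
    float acc = 0.f;  // the real kernel seeds this with p_bias[_oc]
    for (int _ww = 0; _ww < ww; ++_ww) {
      const int w_pos = _ow * s + d * _ww - p;  // same formula as the kernel
      if (w_pos >= 0 && w_pos < w) {            // skip taps over the padding
        acc += in[w_pos] * wt[_ww];
      }
    }
    std::printf("out[%d] = %g\n", _ow, acc);
  }
  return 0;
}

With these values the sketch prints -2 -2 -2 -2 4, i.e. the [1, 0, -1] difference filter slid over the zero-padded input.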
35 changes: 35 additions & 0 deletions backends/cadence/generic/operators/op_conv1d.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/kernel/kernel_runtime_context.h>

namespace impl {
namespace generic {
namespace native {

using ::executorch::aten::IntArrayRef;
using ::executorch::aten::Tensor;
using ::executorch::runtime::KernelRuntimeContext;

Tensor& conv1d_out(
KernelRuntimeContext& ctx,
const Tensor& input,
const Tensor& weight,
const Tensor& bias,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef dilation,
int64_t groups,
Tensor& out);

} // namespace native
} // namespace generic
} // namespace impl