156 changes: 156 additions & 0 deletions backends/cadence/generic/operators/op_avg_pool2d.cpp
@@ -0,0 +1,156 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/backends/cadence/generic/operators/op_avg_pool2d.h>

#include <algorithm>
#include <cmath>
#include <limits>

#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>

namespace impl {
namespace generic {
namespace native {

using ::executorch::aten::IntArrayRef;
using ::executorch::aten::optional;
using ::executorch::aten::ScalarType;
using ::executorch::aten::Tensor;
using ::executorch::runtime::getLeadingDims;
using ::executorch::runtime::KernelRuntimeContext;

// Compute avg_pool2d for in_data in NCHW layout. IT is the input datatype,
// and AT is the accumulation datatype. 'quantized' is true when the input is
// a quantized tensor.
template <typename IT, typename AT = IT, bool quantized = false>
void avg_pool2d_nchw(
const IT* __restrict__ in_data,
const int32_t in_zero_point,
IT* __restrict__ out_data,
IntArrayRef kernel_size,
IntArrayRef stride,
IntArrayRef padding,
bool count_include_pad,
int64_t divisor,
int leading_dims,
int ih,
int iw,
int oh,
int ow) {
int kh = kernel_size[0];
int kw = kernel_size[1];
int s0 = stride[0];
int s1 = stride[1];
int p0 = padding[0];
int p1 = padding[1];

for (int _n = 0; _n < leading_dims; ++_n) {
for (int _ih = 0, _oh = 0; _oh < oh; ++_oh, _ih += s0) {
int input_offset = _n * ih * iw;
int output_offset = _n * oh * ow + _oh * ow;
for (int _iw = 0, _ow = 0; _ow < ow; ++_ow, _iw += s1) {
int kh_lo = std::max(0, _ih - p0);
int kh_hi = std::min(ih, _ih + kh - p0);
int kw_lo = std::max(0, _iw - p1);
int kw_hi = std::min(iw, _iw + kw - p1);
// Count the number of contributions sans padding
int count = (kh_hi - kh_lo) * (kw_hi - kw_lo);
// Set the accumulator
AT acc = count_include_pad ? in_zero_point * (kh * kw - count) : 0;
// Accumulate values
for (int _kh = kh_lo; _kh < kh_hi; ++_kh) {
for (int _kw = kw_lo; _kw < kw_hi; ++_kw) {
int input_addr = input_offset + _kh * iw + _kw;
acc += in_data[input_addr];
}
}
// The divisor changes depending on whether the count includes
// padded cells or not.
float inv_divisor = 1. / (count_include_pad ? divisor : count);
float val = acc * inv_divisor;
if (quantized) {
int32_t min_val =
static_cast<int32_t>(std::numeric_limits<IT>::min());
int32_t max_val =
static_cast<int32_t>(std::numeric_limits<IT>::max());
out_data[output_offset + _ow] = std::min(
std::max(int32_t(std::nearbyint(val)), min_val), max_val);
} else {
out_data[output_offset + _ow] = val;
}
}
}
}
}

Tensor& avg_pool2d_out(
ET_UNUSED KernelRuntimeContext& ctx,
const Tensor& input,
IntArrayRef kernel_size,
IntArrayRef stride,
IntArrayRef padding,
bool ceil_mode,
bool count_include_pad,
optional<int64_t> divisor_override,
const optional<Tensor>& in_zero_point_t,
bool channel_last,
Tensor& out) {
ET_DCHECK_MSG(!channel_last, "NHWC layout for avg_pool2d not yet supported");
const int32_t in_zero_point = in_zero_point_t.has_value()
? in_zero_point_t.value().const_data_ptr<int32_t>()[0]
: 0;
const int64_t divisor = divisor_override.has_value()
? divisor_override.value()
: kernel_size[0] * kernel_size[1];

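// Collapse all leading dims (batch and channels) into one loop count; the
// kernel pools over the trailing 2D (H x W) plane of each slice.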
const int odim = out.dim();
const int on = getLeadingDims(out, odim - 2);
const int oh = out.size(odim - 2);
const int ow = out.size(odim - 1);
const int ih = input.size(odim - 2);
const int iw = input.size(odim - 1);

// We instantiate the kernel for float and uint8_t. The template would also
// work for double, but other dtypes are not supported.
#define typed_avg_pool2d(btype, ctype, quantized, dtype) \
case ScalarType::dtype: { \
avg_pool2d_nchw<btype, ctype, quantized>( \
input.const_data_ptr<btype>(), \
in_zero_point, \
out.mutable_data_ptr<btype>(), \
kernel_size, \
stride, \
padding, \
count_include_pad, \
divisor, \
on, \
ih, \
iw, \
oh, \
ow); \
break; \
}

ScalarType dtype = input.scalar_type();
switch (dtype) {
typed_avg_pool2d(float, float, false, Float);
typed_avg_pool2d(uint8_t, int32_t, true, Byte);
default:
ET_DCHECK_MSG(
false,
"avg_pool2d not implemented for dtype %s",
torch::executor::toString(dtype));
}

return out;
}

} // namespace native
} // namespace generic
} // namespace impl
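To make the divisor handling above concrete, here is a small standalone sketch (an editor's illustration, not part of this PR): it averages one kernel window at a padded corner the way avg_pool2d_nchw does, once excluding and once including the padded taps. The helper name average_window and the toy 3x3 input are hypothetical; in the float path padded cells contribute zero, while the quantized path in the kernel above additionally seeds the accumulator with in_zero_point for them.

// Hypothetical, self-contained illustration of the window arithmetic in
// avg_pool2d_nchw; compiles with any C++17 compiler.
#include <algorithm>
#include <cstdio>

// Average the kh x kw window anchored at output row/col (already scaled by
// the stride), with padding p on each side of an ih x iw input.
float average_window(const float* in, int ih, int iw, int row, int col,
                     int kh, int kw, int p, bool count_include_pad) {
  int kh_lo = std::max(0, row - p);
  int kh_hi = std::min(ih, row + kh - p);
  int kw_lo = std::max(0, col - p);
  int kw_hi = std::min(iw, col + kw - p);
  int count = (kh_hi - kh_lo) * (kw_hi - kw_lo);  // taps inside the input
  float acc = 0.f;
  for (int r = kh_lo; r < kh_hi; ++r) {
    for (int c = kw_lo; c < kw_hi; ++c) {
      acc += in[r * iw + c];
    }
  }
  // Only the divisor changes between the two modes (padded taps add zero).
  return acc / static_cast<float>(count_include_pad ? kh * kw : count);
}

int main() {
  const float in[9] = {1, 1, 1, 1, 1, 1, 1, 1, 1};  // 3x3 input of ones
  // 3x3 kernel, padding 1, window at the top-left corner: only 4 of the 9
  // taps fall inside the input.
  std::printf("%f\n", average_window(in, 3, 3, 0, 0, 3, 3, 1, false));  // 1.000000
  std::printf("%f\n", average_window(in, 3, 3, 0, 0, 3, 3, 1, true));   // 0.444444
  return 0;
}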
34 changes: 34 additions & 0 deletions backends/cadence/generic/operators/op_avg_pool2d.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/kernel/kernel_runtime_context.h>

namespace impl {
namespace generic {
namespace native {

::executorch::aten::Tensor& avg_pool2d_out(
::executorch::runtime::KernelRuntimeContext& ctx,
const ::executorch::aten::Tensor& input,
::executorch::aten::IntArrayRef kernel_size,
::executorch::aten::IntArrayRef stride,
::executorch::aten::IntArrayRef padding,
bool ceil_mode,
bool count_include_pad,
::executorch::aten::optional<int64_t> divisor_override,
const ::executorch::aten::optional<::executorch::aten::Tensor>&
in_zero_point_t,
bool channel_last,
::executorch::aten::Tensor& out);

} // namespace native
} // namespace generic
} // namespace impl
104 changes: 104 additions & 0 deletions backends/cadence/generic/operators/op_conv1d.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/backends/cadence/generic/operators/op_conv1d.h>

#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>

namespace impl {
namespace generic {
namespace native {

using ::executorch::aten::IntArrayRef;
using ::executorch::aten::ScalarType;
using ::executorch::aten::Tensor;
using ::executorch::runtime::KernelRuntimeContext;

// This implements a generic 1D float32 convolution kernel.
// The input is of shape [n x c x w] (batch x channels x width).
// The weight is of shape [oc x wc x ww] (out_channels x weight_channels x
// weight_width).
// The output is of shape [n x oc x ow] (batch x out_channels x out_width).
// The bias is of shape [oc].

Tensor& conv1d_out(
ET_UNUSED KernelRuntimeContext& ctx,
const Tensor& input,
const Tensor& weight,
const Tensor& bias,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef dilation,
int64_t groups,
Tensor& out) {
// Extract dimensions
const int n = input.size(0);
const int c = input.size(1);
const int w = input.size(2);
const int oc = weight.size(0);
const int wc = weight.size(1);
const int ww = weight.size(2);
const int ow = out.size(2);

const int16_t s = static_cast<int16_t>(stride[0]);
const int16_t p = static_cast<int16_t>(padding[0]);
const int16_t d = static_cast<int16_t>(dilation[0]);
const int16_t g = static_cast<int16_t>(groups);

const float* p_in = input.const_data_ptr<float>();
const float* p_weight = weight.const_data_ptr<float>();
const float* p_bias = bias.const_data_ptr<float>();
float* p_out = out.mutable_data_ptr<float>();

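// Fast path: with no padding and unit dilation, every weight tap falls
// inside the input, so the per-tap bounds check can be skipped.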
const bool zero_pad_unit_dilation = d == 1 && p == 0;
const int ocpg = oc / g;
const int icpg = c / g;

for (int _n = 0; _n < n; ++_n) {
const float* in_batch = p_in + _n * c * w;
float* out_batch = p_out + _n * oc * ow;
for (int _g = 0; _g < g; ++_g) {
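// sic / soc: first input / output channel handled by group _g.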
int sic = _g * icpg;
int soc = _g * ocpg;
for (int _oc = soc; _oc < soc + ocpg; ++_oc) {
float* out_plane = out_batch + _oc * ow;
const float* weight_batch = p_weight + _oc * wc * ww;
for (int _w = 0, _ow = 0; _ow < ow; _w += s, ++_ow) {
float acc = p_bias[_oc];
if (zero_pad_unit_dilation) {
for (int _ic = sic; _ic < sic + icpg; ++_ic) {
const float* in_plane = in_batch + _ic * w;
const float* weight_plane = weight_batch + (_ic - sic) * ww;
for (int _ww = 0; _ww < ww; ++_ww) {
int ioff = _w + _ww;
acc += in_plane[ioff] * weight_plane[_ww];
}
}
} else {
for (int _ic = sic; _ic < sic + icpg; ++_ic) {
const float* in_plane = in_batch + _ic * w;
const float* weight_plane = weight_batch + (_ic - sic) * ww;
for (int _ww = 0; _ww < ww; ++_ww) {
int w_pos = _w + d * _ww - p;
if (w_pos >= 0 && w_pos < w) {
acc += in_plane[w_pos] * weight_plane[_ww];
}
}
}
}
out_plane[_ow] = acc;
}
}
}
}

return out;
}

} // namespace native
} // namespace generic
} // namespace impl
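As a quick, standalone check of the index arithmetic above (output position _ow reads input position _ow * s + d * _ww - p for weight tap _ww), here is an editor's sketch, not part of this PR; the toy sizes, the single-channel single-group setup, and the literal kernel values are hypothetical.

// Hypothetical single-channel, single-group 1D convolution that reuses the
// same index math as conv1d_out; compiles with any C++17 compiler.
#include <cstdio>
#include <vector>

int main() {
  const std::vector<float> in = {1, 2, 3, 4, 5};  // input width w = 5
  const std::vector<float> wt = {1, 0, -1};       // kernel width ww = 3
  const int s = 1, p = 1, d = 1;                  // stride, padding, dilation
  const int w = static_cast<int>(in.size());
  const int ww = static_cast<int>(wt.size());
  // Standard output-width formula for a strided, padded, dilated 1D conv.
  const int ow = (w + 2 * p - d * (ww - 1) - 1) / s + 1;
  for (int _ow = 0; _ow < ow; ++_ow) {
    float acc = 0.f;  // the real kernel seeds this with p_bias[_oc]
    for (int _ww = 0; _ww < ww; ++_ww) {
      const int w_pos = _ow * s + d * _ww - p;  // same formula as the kernel
      if (w_pos >= 0 && w_pos < w) {            // skip taps over the padding
        acc += in[w_pos] * wt[_ww];
      }
    }
    std::printf("out[%d] = %g\n", _ow, acc);
  }
  return 0;
}

With these values the sketch prints -2 -2 -2 -2 4, i.e. the [1, 0, -1] difference filter slid over the zero-padded input.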
35 changes: 35 additions & 0 deletions backends/cadence/generic/operators/op_conv1d.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/kernel/kernel_runtime_context.h>

namespace impl {
namespace generic {
namespace native {

using ::executorch::aten::IntArrayRef;
using ::executorch::aten::Tensor;
using ::executorch::runtime::KernelRuntimeContext;

Tensor& conv1d_out(
KernelRuntimeContext& ctx,
const Tensor& input,
const Tensor& weight,
const Tensor& bias,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef dilation,
int64_t groups,
Tensor& out);

} // namespace native
} // namespace generic
} // namespace impl