112 changes: 62 additions & 50 deletions kernels/optimized/cpu/op_add.cpp
@@ -10,7 +10,8 @@
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/cpu/binary_ops.h>
#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
#include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>

@@ -31,6 +32,26 @@ Tensor& opt_add_out(
ScalarType a_type = a.scalar_type();
ScalarType b_type = b.scalar_type();
ScalarType out_type = out.scalar_type();
ScalarType common_type = promoteTypes(a_type, b_type);

ET_KERNEL_CHECK(
ctx,
(canCast(common_type, out_type) &&
check_alpha_type(utils::get_scalar_dtype(alpha), common_type)),
InvalidArgument,
out);

ET_KERNEL_CHECK(
ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out);

ET_KERNEL_CHECK(
ctx,
resize_to_broadcast_target_size(a, b, out) == Error::Ok,
InvalidArgument,
out);

// @lint-ignore CLANGTIDY facebook-hte-CArray
static constexpr const char op_name[] = "add.out";

if (b.numel() == 1) {
if (executorch::runtime::isComplexType(a_type) ||
@@ -40,13 +61,8 @@ Tensor& opt_add_out(
// output tensors have the same dtype. Support mixed dtypes in the future.
ET_KERNEL_CHECK(
ctx, a_type == b_type && a_type == out_type, InvalidArgument, out);
ET_KERNEL_CHECK(
ctx,
resize_to_broadcast_target_size(a, b, out) == Error::Ok,
InvalidArgument,
out);

ET_SWITCH_COMPLEXH_TYPES(out_type, ctx, "add.out", CTYPE, [&]() {
ET_SWITCH_COMPLEXH_TYPES(out_type, ctx, op_name, CTYPE, [&]() {
CTYPE alpha_val = utils::scalar_to<CTYPE>(alpha);
CTYPE b_val = *b.const_data_ptr<CTYPE>();

@@ -61,14 +77,8 @@ Tensor& opt_add_out(
} else if (
a_type == b_type && a_type == out_type && a_type != ScalarType::Half &&
a_type != ScalarType::BFloat16) {
ET_KERNEL_CHECK(
ctx,
resize_to_broadcast_target_size(a, b, out) == Error::Ok,
InvalidArgument,
out);

ET_SWITCH_REALB_TYPES(a_type, ctx, "add.out", CTYPE, [&]() {
ET_SWITCH_REALB_TYPES(b_type, ctx, "add.out", CTYPE_B, [&]() {
ET_SWITCH_REALB_TYPES(a_type, ctx, op_name, CTYPE, [&]() {
ET_SWITCH_REALB_TYPES(b_type, ctx, op_name, CTYPE_B, [&]() {
CTYPE alpha_val;
ET_KERNEL_CHECK(
ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, );
@@ -91,7 +101,6 @@ Tensor& opt_add_out(
return opt_add_out(ctx, b, a, alpha, out);
}

static constexpr const char op_name[] = "add.out";
return torch::executor::kernels::impl::opt_add_sub_out_impl<false, op_name>(
ctx, a, b, alpha, out);
}
@@ -102,26 +111,29 @@ Tensor& opt_add_scalar_out(
const Scalar& b,
const Scalar& alpha,
Tensor& out) {
(void)ctx;

ScalarType a_type = a.scalar_type();
ScalarType common_type =
utils::promote_type_with_scalar(a_type, b, /*half_to_float*/ false);
ScalarType common_type = utils::promote_type_with_scalar(a_type, b);
ScalarType out_type = out.scalar_type();

ET_CHECK(common_type == out_type);
ET_KERNEL_CHECK(
ctx,
(common_type == a_type &&
check_alpha_type(utils::get_scalar_dtype(alpha), common_type)),
InvalidArgument,
out);

if (common_type == ScalarType::Half || common_type == ScalarType::BFloat16) {
common_type = ScalarType::Float;
}
ET_KERNEL_CHECK(
ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out);

ET_KERNEL_CHECK(
ctx, resize_tensor(out, a.sizes()) == Error::Ok, InvalidArgument, out);

// Resize for dynamic shape
auto error = resize_tensor(out, a.sizes());
ET_CHECK_MSG(error == Error::Ok, "Failed to resize output tensor.");
// @lint-ignore CLANGTIDY facebook-hte-CArray
static constexpr const char op_name[] = "add.Scalar_out";

if (a_type == common_type && a_type == out_type &&
a_type != ScalarType::Half && a_type != ScalarType::BFloat16) {
ET_SWITCH_REALB_TYPES(a_type, ctx, "add.Scalar_out", CTYPE, [&]() {
ET_SWITCH_REALB_TYPES(a_type, ctx, op_name, CTYPE, [&]() {
CTYPE b_casted = utils::scalar_to<CTYPE>(b);
CTYPE alpha_val;
ET_KERNEL_CHECK(
Expand All @@ -137,28 +149,28 @@ Tensor& opt_add_scalar_out(
out.numel());
});
} else {
ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, "add.Scalar_out", CTYPE_A, [&]() {
ET_SWITCH_REALB_TYPES(
common_type, ctx, "add.Scalar_out", CTYPE_IN, [&]() {
ET_SWITCH_REALHBBF16_TYPES(
out_type, ctx, "add.Scalar_out", CTYPE_OUT, [&]() {
CTYPE_IN b_casted = utils::scalar_to<CTYPE_IN>(b);
CTYPE_IN alpha_val;
ET_KERNEL_CHECK(
ctx,
utils::extract_scalar(alpha, &alpha_val),
InvalidArgument, );

const size_t n = a.numel();
const CTYPE_A* a_data = a.const_data_ptr<CTYPE_A>();
CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
for (auto i = 0; i < n; ++i) {
out_data[i] = static_cast<CTYPE_OUT>(
static_cast<CTYPE_IN>(a_data[i]) +
alpha_val * b_casted);
}
});
});
ScalarType compute_type = utils::internal::get_compute_type(common_type);

ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
CTYPE_COMPUTE val_b = utils::scalar_to<CTYPE_COMPUTE>(b);
CTYPE_COMPUTE val_alpha;
ET_KERNEL_CHECK(
ctx, utils::extract_scalar(alpha, &val_alpha), InvalidArgument, );
auto val_alpha_times_b = val_alpha * val_b;
utils::apply_unitensor_elementwise_fn<
CTYPE_COMPUTE,
op_name,
utils::SupportedTensorDtypes::SAME_AS_COMMON>(
[val_alpha_times_b](const auto val_a) {
// Cast here supports vectorization; either it does nothing
// or it casts from CTYPE_COMPUTE to
// Vectorized<CTYPE_COMPUTE>.
return val_a + decltype(val_a)(val_alpha_times_b);
},
ctx,
a,
utils::SupportedTensorDtypes::REALHBBF16,
out);
});
}

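Reviewer note (not part of the diff): in the scalar path above, the dtype checks, scalar promotion, and input/output conversions now move into the shared utility, and the kernel only supplies a lambda written against CTYPE_COMPUTE via utils::apply_unitensor_elementwise_fn. The standalone sketch below mimics that shape with plain std::vector buffers and a hand-rolled dispatch; switch_real_types and apply_unitensor are simplified stand-ins invented here for illustration and are not ExecuTorch APIs.

// Self-contained conceptual analogue of the new opt_add_scalar_out fast path.
// switch_real_types and apply_unitensor are illustrative stand-ins for
// ET_SWITCH_REALB_TYPES and utils::apply_unitensor_elementwise_fn.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

namespace sketch {

enum class DType { Float, Int };

// Stand-in for the ET_SWITCH_REALB_TYPES dispatch: map a runtime dtype to a
// static type and invoke the functor with a value of that type as a tag.
template <typename Fn>
void switch_real_types(DType dtype, Fn&& fn) {
  switch (dtype) {
    case DType::Float:
      fn(float{});
      break;
    case DType::Int:
      fn(std::int32_t{});
      break;
  }
}

// Stand-in for utils::apply_unitensor_elementwise_fn: the real helper walks a
// type-erased tensor and converts between its dtype and CTYPE_COMPUTE; here
// the buffers are assumed to already hold CTYPE_COMPUTE (the caller must
// guarantee this, much as the real kernel's ET_KERNEL_CHECKs do).
template <typename CTYPE_COMPUTE, typename Op>
void apply_unitensor(const void* a_data, void* out_data, std::size_t n, Op op) {
  const auto* a = static_cast<const CTYPE_COMPUTE*>(a_data);
  auto* out = static_cast<CTYPE_COMPUTE*>(out_data);
  for (std::size_t i = 0; i < n; ++i) {
    out[i] = op(a[i]);
  }
}

}  // namespace sketch

int main() {
  std::vector<float> a = {1.0f, 2.0f, 3.0f};
  std::vector<float> out(a.size());
  const double b = 10.0;     // the Scalar operand
  const double alpha = 2.0;  // the Scalar alpha

  // Mirrors the new kernel body: pick a compute type, fold alpha * b into one
  // constant, then run a unary lambda over the input.
  sketch::switch_real_types(sketch::DType::Float, [&](auto tag) {
    using CTYPE_COMPUTE = decltype(tag);
    const auto val_alpha_times_b =
        static_cast<CTYPE_COMPUTE>(alpha) * static_cast<CTYPE_COMPUTE>(b);
    sketch::apply_unitensor<CTYPE_COMPUTE>(
        a.data(), out.data(), a.size(),
        [val_alpha_times_b](CTYPE_COMPUTE val_a) {
          // In the real kernel this lambda may also be handed a
          // Vectorized<CTYPE_COMPUTE>, which is why the diff casts through
          // decltype(val_a) instead of naming the scalar type.
          return val_a + val_alpha_times_b;
        });
  });

  for (float v : out) {
    std::cout << v << ' ';  // prints: 21 22 23
  }
  std::cout << '\n';
  return 0;
}
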
118 changes: 26 additions & 92 deletions kernels/optimized/cpu/op_add_sub_impl.h
@@ -10,7 +10,7 @@
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/cpu/binary_ops.h>
#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>

@@ -19,55 +19,6 @@ namespace executor {
namespace kernels {
namespace impl {

namespace {
template <
bool can_cast,
typename CTYPE_A,
typename CTYPE_B,
typename CTYPE_IN,
typename CTYPE_OUT>
struct AddInner;

template <
typename CTYPE_A,
typename CTYPE_B,
typename CTYPE_IN,
typename CTYPE_OUT>
struct AddInner<true, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT> {
static void
run(const Tensor& a, const Tensor& b, CTYPE_IN alpha_val, Tensor& out) {
apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
// NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue)
[alpha_val](const CTYPE_A val_a, const CTYPE_B val_b) {
CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
CTYPE_IN value = a_casted + alpha_val * b_casted;

return static_cast<CTYPE_OUT>(value);
},
a,
b,
out);
}
};

template <typename CTYPE_IN>
struct ReportCanCastBug {
static void run(const Tensor&, const Tensor&, CTYPE_IN, Tensor&) {
ET_DCHECK_MSG(false, "BUG: canCast should have been checked above");
}
};

template <
typename CTYPE_A,
typename CTYPE_B,
typename CTYPE_IN,
typename CTYPE_OUT>
struct AddInner<false, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT>
: public ReportCanCastBug<CTYPE_IN> {};

} // namespace

using Tensor = executorch::aten::Tensor;
using ScalarType = executorch::aten::ScalarType;

@@ -78,8 +29,6 @@ Tensor& opt_add_sub_out_impl(
const Tensor& b,
const Scalar& alpha,
Tensor& out) {
(void)ctx;

ScalarType a_type = a.scalar_type();
ScalarType b_type = b.scalar_type();
ScalarType out_type = out.scalar_type();
@@ -115,14 +64,6 @@ Tensor& opt_add_sub_out_impl(
}

if (selected_optimized_path == ElementwiseOptimizedPath::kTreatAs1d) {
// Resize for dynamic shape
ET_KERNEL_CHECK_MSG(
ctx,
resize_to_broadcast_target_size(a, b, out) == Error::Ok,
InvalidArgument,
out,
"Failed to resize output tensor.");

ET_SWITCH_REALB_TYPES(a_type, ctx, op_name, CTYPE, [&]() {
CTYPE alpha_val;
ET_KERNEL_CHECK(
@@ -202,39 +143,32 @@ Tensor& opt_add_sub_out_impl(
}
});
} else {
ScalarType common_type =
promoteTypes(a_type, b_type, /*half_to_float*/ true);
ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out);
ScalarType common_type = promoteTypes(a_type, b_type);
ScalarType compute_type =
native::utils::internal::get_compute_type(common_type);

ET_KERNEL_CHECK(
ctx,
resize_to_broadcast_target_size(a, b, out) == Error::Ok,
InvalidArgument,
out);

ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, op_name, CTYPE_A, [&]() {
ET_SWITCH_REALHBBF16_TYPES(b_type, ctx, op_name, CTYPE_B, [&]() {
using CTYPE_IN = typename torch::executor::
promote_types<CTYPE_A, CTYPE_B, /*half_to_float*/ true>::type;
ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, op_name, CTYPE_OUT, [&]() {
CTYPE_IN alpha_val;
ET_KERNEL_CHECK(
ctx,
torch::executor::native::utils::extract_scalar(alpha, &alpha_val),
InvalidArgument, );
if constexpr (is_sub) {
alpha_val = -alpha_val;
}

AddInner<
can_cast<CTYPE_IN, CTYPE_OUT>::value,
CTYPE_A,
CTYPE_B,
CTYPE_IN,
CTYPE_OUT>::run(a, b, alpha_val, out);
});
});
ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
CTYPE_COMPUTE val_alpha;
ET_KERNEL_CHECK(
ctx,
native::utils::extract_scalar(alpha, &val_alpha),
InvalidArgument, );
if constexpr (is_sub) {
val_alpha = -val_alpha;
}
native::utils::apply_bitensor_elementwise_fn<
CTYPE_COMPUTE,
op_name,
native::utils::SupportedTensorDtypes::REALHBBF16>(
[val_alpha](const auto val_a, const auto val_b) {
return val_a + val_alpha * val_b;
},
ctx,
a,
native::utils::SupportedTensorDtypes::REALHBBF16,
b,
native::utils::SupportedTensorDtypes::REALHBBF16,
out);
});
}

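Reviewer note (not part of the diff): with the hand-written AddInner/ReportCanCastBug machinery removed, both add.out and sub.out reduce to one templated body that negates alpha when is_sub is true and hands a per-element lambda to utils::apply_bitensor_elementwise_fn. The sketch below shows only that control-flow skeleton, with float buffers and no broadcasting; apply_bitensor and add_sub_impl are illustrative stand-ins, not the actual ExecuTorch helpers.

// Self-contained sketch of the shared add/sub structure that
// opt_add_sub_out_impl<is_sub, op_name> follows; names here are illustrative.
#include <cstddef>
#include <iostream>
#include <vector>

// Stand-in for utils::apply_bitensor_elementwise_fn, restricted to
// same-size inputs of the compute type (no broadcasting, no dtype mixing).
template <typename CTYPE_COMPUTE, typename Op>
void apply_bitensor(const std::vector<CTYPE_COMPUTE>& a,
                    const std::vector<CTYPE_COMPUTE>& b,
                    std::vector<CTYPE_COMPUTE>& out,
                    Op op) {
  out.resize(a.size());
  for (std::size_t i = 0; i < a.size(); ++i) {
    out[i] = op(a[i], b[i]);
  }
}

// Mirrors opt_add_sub_out_impl: one body serves both add.out and sub.out.
template <bool is_sub>
void add_sub_impl(const std::vector<float>& a,
                  const std::vector<float>& b,
                  float alpha,
                  std::vector<float>& out) {
  float val_alpha = alpha;
  if constexpr (is_sub) {
    // Same trick as the diff: sub(a, b, alpha) == add(a, b, -alpha).
    val_alpha = -val_alpha;
  }
  apply_bitensor<float>(a, b, out, [val_alpha](float val_a, float val_b) {
    return val_a + val_alpha * val_b;
  });
}

int main() {
  std::vector<float> a = {1.f, 2.f, 3.f};
  std::vector<float> b = {10.f, 20.f, 30.f};
  std::vector<float> out;

  add_sub_impl</*is_sub=*/false>(a, b, 2.f, out);  // prints: 21 42 63
  for (float v : out) std::cout << v << ' ';
  std::cout << '\n';

  add_sub_impl</*is_sub=*/true>(a, b, 2.f, out);   // prints: -19 -38 -57
  for (float v : out) std::cout << v << ' ';
  std::cout << '\n';
  return 0;
}
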