4 changes: 4 additions & 0 deletions backends/cadence/aot/functions_hifi.yaml
@@ -125,3 +125,7 @@
kernels:
- arg_meta: null
kernel_name: cadence::impl::HiFi::quantized_linear_out
- func: cadence::quantized_linear.per_tensor_out(Tensor src, Tensor weight, Tensor bias, SymInt src_zero_point, SymInt weight_zero_point, SymInt out_multiplier, SymInt out_shift, SymInt out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
kernels:
- arg_meta: null
kernel_name: cadence::impl::HiFi::quantized_linear_per_tensor_out
3 changes: 3 additions & 0 deletions backends/cadence/aot/ops_registrations.py
@@ -43,6 +43,9 @@
lib.define(
"quantized_linear.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)"
)
lib.define(
"cadence::quantized_linear.per_tensor_out(Tensor src, Tensor weight, Tensor bias, SymInt src_zero_point, SymInt weight_zero_point, SymInt out_multiplier, SymInt out_shift, SymInt out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)"
)

lib.define(
"quantized_relu(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift) -> (Tensor Y)"
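Both the existing Tensor-based overload and the new per_tensor overload carry out_multiplier and out_shift requantization parameters. As a hedged illustration of what such parameters typically encode (the fixed-point form of in_scale * weight_scale / out_scale; the exact convention the HiFi kernel expects is not spelled out in this diff), here is a minimal encoding sketch:

// Hedged sketch: one common way to encode a float requantization scale
// (in_scale * weight_scale / out_scale) as a 32-bit multiplier plus a power-of-two
// shift. The exact convention expected by the HiFi nnlib kernel is not stated in
// this diff, so treat this as an illustration only.
#include <cmath>
#include <cstdint>
#include <cstdio>

void encode_requant_scale(float requant_scale, int32_t* out_multiplier, int32_t* out_shift) {
  int exp = 0;
  // frexp returns a mantissa in [0.5, 1); scale it to a Q31 fixed-point value.
  const double mantissa = std::frexp(static_cast<double>(requant_scale), &exp);
  *out_multiplier = static_cast<int32_t>(std::lround(mantissa * (1LL << 31)));
  *out_shift = exp;  // applied roughly as: result = (acc * multiplier) >> (31 - shift)
}

int main() {
  // Example: in_scale = 0.02, weight_scale = 0.005, out_scale = 0.1
  const float requant_scale = 0.02f * 0.005f / 0.1f;  // = 0.001
  int32_t multiplier = 0;
  int32_t shift = 0;
  encode_requant_scale(requant_scale, &multiplier, &shift);
  std::printf("multiplier=%d shift=%d\n", multiplier, shift);
  return 0;
}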
18 changes: 18 additions & 0 deletions backends/cadence/hifi/kernels/kernels.h
@@ -38,6 +38,24 @@ WORD32 matmul_asym8uxasym8u_asym8u(
WORD32 out_zero_bias,
bool per_channel_quantized = false); // per-channel quantized weight

WORD32 xa_nn_matmul_asym8uxasym8u_asym8u(
UWORD8* __restrict__ p_out,
const UWORD8* __restrict__ p_mat1,
const UWORD8* __restrict__ p_mat2,
const WORD32* __restrict__ p_bias,
WORD32 rows,
WORD32 cols,
WORD32 row_stride,
WORD32 vec_count,
WORD32 vec_offset,
WORD32 out_offset,
WORD32 out_stride,
WORD32 mat1_zero_bias,
WORD32 vec1_zero_bias,
WORD32 out_multiplier,
WORD32 out_shift,
WORD32 out_zero_bias);

template <typename T>
T quantize(const float x, float scale, int32_t zero_point);

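The new declaration mirrors the nnlib matmul entry point that the per-tensor path calls directly in quantized_linear_out.cpp. As a rough guide to the parameter semantics, the following scalar reference is inferred from how the call sites below fill these arguments; the requantize helper and the reference function are illustrative assumptions, not the nnlib implementation:

// Hedged scalar reference for the asymmetric quantized matmul declared above,
// inferred from the call sites in quantized_linear_out.cpp. Illustration only.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Assumed requantization: scale the accumulator by out_multiplier * 2^-31, then by
// 2^out_shift. The exact rounding used by the nnlib kernel may differ.
static int32_t requantize(int64_t acc, int32_t multiplier, int32_t shift) {
  double scaled = static_cast<double>(acc) * multiplier / (1LL << 31);
  return static_cast<int32_t>(std::lround(std::ldexp(scaled, shift)));
}

// mat1: rows x cols matrix (the weight), with row_stride elements between rows.
// mat2: vec_count vectors of length cols, spaced vec_offset elements apart (the input).
// out:  element for (vector v, row r) is written at v * out_offset + r * out_stride.
void matmul_asym8uxasym8u_asym8u_reference(
    uint8_t* out, const uint8_t* mat1, const uint8_t* mat2, const int32_t* bias,
    int32_t rows, int32_t cols, int32_t row_stride,
    int32_t vec_count, int32_t vec_offset, int32_t out_offset, int32_t out_stride,
    int32_t mat1_zero_bias, int32_t vec1_zero_bias,
    int32_t out_multiplier, int32_t out_shift, int32_t out_zero_bias) {
  for (int32_t v = 0; v < vec_count; ++v) {
    for (int32_t r = 0; r < rows; ++r) {
      int64_t acc = bias != nullptr ? bias[r] : 0;
      for (int32_t c = 0; c < cols; ++c) {
        const int32_t a = static_cast<int32_t>(mat1[r * row_stride + c]) + mat1_zero_bias;
        const int32_t b = static_cast<int32_t>(mat2[v * vec_offset + c]) + vec1_zero_bias;
        acc += static_cast<int64_t>(a) * b;
      }
      const int32_t q = requantize(acc, out_multiplier, out_shift) + out_zero_bias;
      out[v * out_offset + r * out_stride] = static_cast<uint8_t>(std::clamp(q, 0, 255));
    }
  }
}

int main() {
  // Tiny example: 2x2 weight, one input vector, unit requantization (2^30 * 2^1 / 2^31 = 1).
  const uint8_t weight[] = {1, 2, 3, 4};
  const uint8_t input[] = {5, 6};
  const int32_t bias[] = {0, 0};
  uint8_t out[2] = {0, 0};
  matmul_asym8uxasym8u_asym8u_reference(
      out, weight, input, bias, /*rows=*/2, /*cols=*/2, /*row_stride=*/2,
      /*vec_count=*/1, /*vec_offset=*/2, /*out_offset=*/2, /*out_stride=*/1,
      /*mat1_zero_bias=*/0, /*vec1_zero_bias=*/0,
      /*out_multiplier=*/1 << 30, /*out_shift=*/1, /*out_zero_bias=*/0);
  std::printf("out = [%d, %d]\n", out[0], out[1]);  // expect [17, 39]
  return 0;
}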
265 changes: 247 additions & 18 deletions backends/cadence/hifi/operators/quantized_linear_out.cpp
@@ -7,46 +7,51 @@
*/

#include <executorch/backends/cadence/hifi/kernels/kernels.h>
#include <executorch/backends/cadence/hifi/operators/operators.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <xa_nnlib_kernels_api.h>
#include <xtensa/tie/xt_datacache.h>
#include <algorithm>
#include <cmath>
#include <optional>

namespace cadence {
namespace impl {
namespace HiFi {
namespace native {

using executorch::aten::Tensor;
using executorch::runtime::getLeadingDims;
using executorch::runtime::KernelRuntimeContext;
using ::executorch::aten::optional;
using ::executorch::aten::ScalarType;
using ::executorch::aten::Tensor;
using ::executorch::runtime::getLeadingDims;
using ::executorch::runtime::KernelRuntimeContext;

void quantized_linear_out(
KernelRuntimeContext& ctx,
const Tensor& src,
void _quantized_linear_asym8u(
const Tensor& in,
const Tensor& weight,
const Tensor& bias,
int64_t src_zero_point,
int64_t in_zero_point,
const Tensor& weight_zero_point,
const Tensor& out_multiplier,
const Tensor& out_shift,
int64_t out_zero_point,
const executorch::aten::optional<Tensor>& offset,
__ET_UNUSED const optional<Tensor>& offset,
Tensor& out) {
// input comes in shape [leading_dims, in_dim]
// weight comes in shape [out_dim, in_dim]
// output comes in empty with shape [leading_dims, out_dim]
// Perform matrix multiply (M x N) x (N x P)' => M x P
int64_t leading_dims = getLeadingDims(src, src.dim() - 1);
int64_t out_dim = weight.size(0); // = out_dim
int64_t in_dim = weight.size(1); // = in_dim
const int64_t leading_dims = getLeadingDims(in, in.dim() - 1);
const int64_t out_dim = weight.size(0); // = out_dim
const int64_t in_dim = weight.size(1); // = in_dim

const uint8_t* __restrict__ in_data = src.const_data_ptr<uint8_t>();
const uint8_t* __restrict__ in_data = in.const_data_ptr<uint8_t>();
const uint8_t* __restrict__ weight_data = weight.const_data_ptr<uint8_t>();
const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
uint8_t* __restrict__ out_data = out.mutable_data_ptr<uint8_t>();

// The nnlib kernel to compute quantized linear via matmul.
int32_t ret = cadence::impl::HiFi::kernels::matmul_asym8uxasym8u_asym8u(
int32_t ret = xa_nn_matmul_asym8uxasym8u_asym8u(
out_data, // p_out
weight_data, // p_mat1,
in_data, // p_mat2,
@@ -59,14 +64,238 @@ void quantized_linear_out(
out_dim, // out_offset, i.e., offset of next output element written
1, // out_stride, i.e., stride to go to next output row
-weight_zero_point.const_data_ptr<int32_t>()[0], // mat1_zero_bias
-src_zero_point, // mat2_zero_bias
out_multiplier.const_data_ptr<int32_t>(), // out_multiplier
out_shift.const_data_ptr<int32_t>(), // out_shift
out_zero_point, // out_zero_bias
false); // per channel quantization
-in_zero_point, // mat2_zero_bias
out_multiplier.const_data_ptr<int32_t>()[0], // out_multiplier
out_shift.const_data_ptr<int32_t>()[0], // out_shift
out_zero_point); // out_zero_bias
ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear failed");
}

void inline _quantized_linear_asym8s(
const Tensor& in,
const Tensor& weight,
const Tensor& bias,
int64_t in_zero_point,
const Tensor& weight_zero_point,
const Tensor& out_multiplier,
const Tensor& out_shift,
int64_t out_zero_point,
__ET_UNUSED const optional<Tensor>& offset,
Tensor& out) {
// input comes in shape [leading_dims, in_dim]
// weight comes in shape [out_dim, in_dim]
// output comes in empty with shape [leading_dims, out_dim]
// Perform matrix multiply (M x N) x (N x P)' => M x P
const int64_t leading_dims = getLeadingDims(in, in.dim() - 1);
const int64_t out_dim = weight.size(0); // = out_dim
const int64_t in_dim = weight.size(1); // = in_dim

const int8_t* __restrict__ in_data = in.const_data_ptr<int8_t>();
const int8_t* __restrict__ weight_data = weight.const_data_ptr<int8_t>();
const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
int8_t* __restrict__ out_data = out.mutable_data_ptr<int8_t>();

// The nnlib kernel to compute quantized linear via matmul.
int32_t ret = xa_nn_matmul_asym8sxasym8s_asym8s(
out_data, // p_out
weight_data, // p_mat1,
in_data, // p_mat2,
bias_data, // p_bias
out_dim, // rows of p_mat1
in_dim, // cols of p_mat1
in_dim, // row_stride of p_mat1
leading_dims, // vec_count, i.e., rows of p_mat2
in_dim, // vec_offset of p_mat2.
out_dim, // out_offset, i.e., offset of next output element written
1, // out_stride, i.e., stride to go to next output row
-weight_zero_point.const_data_ptr<int32_t>()[0], // mat1_zero_bias
-in_zero_point, // mat2_zero_bias
out_multiplier.const_data_ptr<int32_t>()[0], // out_multiplier
out_shift.const_data_ptr<int32_t>()[0], // out_shift
out_zero_point); // out_zero_bias
ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear failed");
}

void inline _quantized_linear_per_tensor_asym8u(
const Tensor& in,
const Tensor& weight,
const Tensor& bias,
int64_t in_zero_point,
int64_t weight_zero_point,
int64_t out_multiplier,
int64_t out_shift,
int64_t out_zero_point,
__ET_UNUSED const optional<Tensor>& offset,
Tensor& out) {
// input comes in shape [leading_dims, in_dim]
// weight comes in shape [out_dim, in_dim]
// output comes in empty with shape [leading_dims, out_dim]
// Perform matrix multiply (M x N) x (N x P)' => M x P
const int64_t leading_dims = getLeadingDims(in, in.dim() - 1);
const int64_t out_dim = weight.size(0); // = out_dim
const int64_t in_dim = weight.size(1); // = in_dim

const uint8_t* __restrict__ in_data = in.const_data_ptr<uint8_t>();
const uint8_t* __restrict__ weight_data = weight.const_data_ptr<uint8_t>();
const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
uint8_t* __restrict__ out_data = out.mutable_data_ptr<uint8_t>();

const int32_t out_multiplier_int32 = static_cast<int32_t>(out_multiplier);
const int32_t out_shift_int32 = static_cast<int32_t>(out_shift);

// The nnlib kernel to compute quantized linear via matmul.
const int32_t ret = xa_nn_matmul_asym8uxasym8u_asym8u(
out_data, // p_out
weight_data, // p_mat1,
in_data, // p_mat2,
bias_data, // p_bias
out_dim, // rows of p_mat1
in_dim, // cols of p_mat1
in_dim, // row_stride of p_mat1
leading_dims, // vec_count, i.e., rows of p_mat2
in_dim, // vec_offset of p_mat2.
out_dim, // out_offset, i.e., offset of next output element written
1, // out_stride, i.e., stride to go to next output row
-weight_zero_point, // mat1_zero_bias
-in_zero_point, // mat2_zero_bias
out_multiplier_int32, // out_multiplier
out_shift_int32, // out_shift
out_zero_point); // out_zero_bias
ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear_per_tensor failed");
}

void inline _quantized_linear_per_tensor_asym8s(
const Tensor& in,
const Tensor& weight,
const Tensor& bias,
int64_t in_zero_point,
int64_t weight_zero_point,
int64_t out_multiplier,
int64_t out_shift,
int64_t out_zero_point,
__ET_UNUSED const optional<Tensor>& offset,
Tensor& out) {
// input comes in shape [leading_dims, in_dim]
// weight comes in shape [out_dim, in_dim]
// output comes in empty with shape [leading_dims, out_dim]
// Perform matrix multiply (M x N) x (N x P)' => M x P
const int64_t leading_dims = getLeadingDims(in, in.dim() - 1);
const int64_t out_dim = weight.size(0); // = out_dim
const int64_t in_dim = weight.size(1); // = in_dim

const int8_t* __restrict__ in_data = in.const_data_ptr<int8_t>();
const int8_t* __restrict__ weight_data = weight.const_data_ptr<int8_t>();
const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
int8_t* __restrict__ out_data = out.mutable_data_ptr<int8_t>();

const int32_t out_multiplier_int32 = static_cast<int32_t>(out_multiplier);
const int32_t out_shift_int32 = static_cast<int32_t>(out_shift);

// The nnlib kernel to compute quantized linear via matmul.
const int32_t ret = xa_nn_matmul_asym8sxasym8s_asym8s(
out_data, // p_out
weight_data, // p_mat1,
in_data, // p_mat2,
bias_data, // p_bias
out_dim, // rows of p_mat1
in_dim, // cols of p_mat1
in_dim, // row_stride of p_mat1
leading_dims, // vec_count, i.e., rows of p_mat2
in_dim, // vec_offset of p_mat2.
out_dim, // out_offset, i.e., offset of next output element written
1, // out_stride, i.e., stride to go to next output row
-weight_zero_point, // mat1_zero_bias
-in_zero_point, // mat2_zero_bias
out_multiplier_int32, // out_multiplier
out_shift_int32, // out_shift
out_zero_point); // out_zero_bias
ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear_per_tensor failed");
}

void quantized_linear_out(
__ET_UNUSED KernelRuntimeContext& ctx,
const Tensor& in,
const Tensor& weight,
const Tensor& bias,
int64_t in_zero_point,
const Tensor& weight_zero_point,
const Tensor& out_multiplier,
const Tensor& out_shift,
int64_t out_zero_point,
__ET_UNUSED const optional<Tensor>& offset,
Tensor& out) {
if (out.scalar_type() == exec_aten::ScalarType::Byte) {
_quantized_linear_asym8u(
in,
weight,
bias,
in_zero_point,
weight_zero_point,
out_multiplier,
out_shift,
out_zero_point,
offset,
out);
} else if (out.scalar_type() == exec_aten::ScalarType::Char) {
_quantized_linear_asym8s(
in,
weight,
bias,
in_zero_point,
weight_zero_point,
out_multiplier,
out_shift,
out_zero_point,
offset,
out);
} else {
ET_CHECK_MSG(
false, "quantized linear only supported for uint8 and int8 dtypes");
}
}

void quantized_linear_per_tensor_out(
__ET_UNUSED KernelRuntimeContext& ctx,
const Tensor& in,
const Tensor& weight,
const Tensor& bias,
int64_t in_zero_point,
int64_t weight_zero_point,
int64_t out_multiplier,
int64_t out_shift,
int64_t out_zero_point,
__ET_UNUSED const optional<Tensor>& offset,
Tensor& out) {
if (out.scalar_type() == exec_aten::ScalarType::Byte) {
_quantized_linear_per_tensor_asym8u(
in,
weight,
bias,
in_zero_point,
weight_zero_point,
out_multiplier,
out_shift,
out_zero_point,
offset,
out);
} else if (out.scalar_type() == exec_aten::ScalarType::Char) {
_quantized_linear_per_tensor_asym8s(
in,
weight,
bias,
in_zero_point,
weight_zero_point,
out_multiplier,
out_shift,
out_zero_point,
offset,
out);
} else {
ET_CHECK_MSG(
false, "quantized linear only supported for uint8 and int8 dtypes");
}
}

}; // namespace native
}; // namespace HiFi
}; // namespace impl
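One detail worth flagging in the per-tensor paths above: the int64_t out_multiplier and out_shift arguments are narrowed to int32_t with a plain static_cast before the nnlib call. A small caller-side guard against silent truncation could look like the sketch below; the helper name is hypothetical and not part of this change:

// Hedged sketch: verify that a 64-bit requantization parameter fits in int32 before
// the narrowing cast used in the per-tensor kernels. Helper name is hypothetical.
#include <cstdint>
#include <limits>

inline bool fits_in_int32(int64_t v) {
  return v >= std::numeric_limits<int32_t>::min() &&
         v <= std::numeric_limits<int32_t>::max();
}

// Possible usage at the top of the per-tensor kernels:
//   ET_DCHECK_MSG(fits_in_int32(out_multiplier), "out_multiplier does not fit in int32");
//   ET_DCHECK_MSG(fits_in_int32(out_shift), "out_shift does not fit in int32");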