diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml
index 5c25d89946e..b4ec3382919 100644
--- a/backends/cadence/aot/functions_hifi.yaml
+++ b/backends/cadence/aot/functions_hifi.yaml
@@ -125,3 +125,7 @@
   kernels:
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::quantized_linear_out
+- func: cadence::quantized_linear.per_tensor_out(Tensor src, Tensor weight, Tensor bias, SymInt src_zero_point, SymInt weight_zero_point, SymInt out_multiplier, SymInt out_shift, SymInt out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_linear_per_tensor_out
diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py
index fa7b7feb208..ac4417c79ae 100644
--- a/backends/cadence/aot/ops_registrations.py
+++ b/backends/cadence/aot/ops_registrations.py
@@ -43,6 +43,9 @@
 lib.define(
     "quantized_linear.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)"
 )
+lib.define(
+    "quantized_linear.per_tensor_out(Tensor src, Tensor weight, Tensor bias, SymInt src_zero_point, SymInt weight_zero_point, SymInt out_multiplier, SymInt out_shift, SymInt out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)"
+)
 
 lib.define(
     "quantized_relu(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift) -> (Tensor Y)"
diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h
index d27e8051f52..57fe0e140d8 100644
--- a/backends/cadence/hifi/kernels/kernels.h
+++ b/backends/cadence/hifi/kernels/kernels.h
@@ -38,6 +38,24 @@ WORD32 matmul_asym8uxasym8u_asym8u(
     WORD32 out_zero_bias,
     bool per_channel_quantized = false); // per-channel quantized weight
 
+WORD32 xa_nn_matmul_asym8uxasym8u_asym8u(
+    UWORD8* __restrict__ p_out,
+    const UWORD8* __restrict__ p_mat1,
+    const UWORD8* __restrict__ p_mat2,
+    const WORD32* __restrict__ p_bias,
+    WORD32 rows,
+    WORD32 cols,
+    WORD32 row_stride,
+    WORD32 vec_count,
+    WORD32 vec_offset,
+    WORD32 out_offset,
+    WORD32 out_stride,
+    WORD32 mat1_zero_bias,
+    WORD32 vec1_zero_bias,
+    WORD32 out_multiplier,
+    WORD32 out_shift,
+    WORD32 out_zero_bias);
+
 template <typename T>
 T quantize(const float x, float scale, int32_t zero_point);
diff --git a/backends/cadence/hifi/operators/quantized_linear_out.cpp b/backends/cadence/hifi/operators/quantized_linear_out.cpp
index 8944a24ddbc..0f56a1a9631 100644
--- a/backends/cadence/hifi/operators/quantized_linear_out.cpp
+++ b/backends/cadence/hifi/operators/quantized_linear_out.cpp
@@ -7,46 +7,51 @@
  */
 
 #include
+#include
 #include
+#include
+#include
 #include
 #include
+#include
 
 namespace cadence {
 namespace impl {
 namespace HiFi {
 namespace native {
 
-using executorch::aten::Tensor;
-using executorch::runtime::getLeadingDims;
-using executorch::runtime::KernelRuntimeContext;
+using ::executorch::aten::optional;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::getLeadingDims;
+using ::executorch::runtime::KernelRuntimeContext;
 
-void quantized_linear_out(
-    KernelRuntimeContext& ctx,
-    const Tensor& src,
+void _quantized_linear_asym8u(
+    const Tensor& in,
     const Tensor& weight,
     const Tensor& bias,
-    int64_t src_zero_point,
+    int64_t in_zero_point,
     const Tensor& weight_zero_point,
     const Tensor& out_multiplier,
     const Tensor& out_shift,
     int64_t out_zero_point,
-    const executorch::aten::optional<Tensor>& offset,
+    __ET_UNUSED const optional<Tensor>& offset,
     Tensor& out) {
   // input comes in shape [leading_dims, in_dim]
   // weight comes in shape [out_dim, in_dim]
   // output comes in empty with shape [leading_dims, out_dim]
   // Perform matrix multiply (M x N) x (N x P)' => M x P
-  int64_t leading_dims = getLeadingDims(src, src.dim() - 1);
-  int64_t out_dim = weight.size(0); // = out_dim
-  int64_t in_dim = weight.size(1); // = in_dim
+  const int64_t leading_dims = getLeadingDims(in, in.dim() - 1);
+  const int64_t out_dim = weight.size(0); // = out_dim
+  const int64_t in_dim = weight.size(1); // = in_dim
 
-  const uint8_t* __restrict__ in_data = src.const_data_ptr<uint8_t>();
+  const uint8_t* __restrict__ in_data = in.const_data_ptr<uint8_t>();
   const uint8_t* __restrict__ weight_data = weight.const_data_ptr<uint8_t>();
   const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
   uint8_t* __restrict__ out_data = out.mutable_data_ptr<uint8_t>();
 
   // The nnlib kernel to compute quantized linear via matmul.
-  int32_t ret = cadence::impl::HiFi::kernels::matmul_asym8uxasym8u_asym8u(
+  int32_t ret = xa_nn_matmul_asym8uxasym8u_asym8u(
       out_data, // p_out
       weight_data, // p_mat1,
       in_data, // p_mat2,
@@ -59,14 +64,238 @@ void quantized_linear_out(
       out_dim, // out_offset, i.e., offset of next output element written
       1, // out_stride, i.e., stride to go to next output row
       -weight_zero_point.const_data_ptr<int32_t>()[0], // mat1_zero_bias
-      -src_zero_point, // mat2_zero_bias
-      out_multiplier.const_data_ptr<int32_t>(), // out_multiplier
-      out_shift.const_data_ptr<int32_t>(), // out_shift
-      out_zero_point, // out_zero_bias
-      false); // per channel quantization
+      -in_zero_point, // mat2_zero_bias
+      out_multiplier.const_data_ptr<int32_t>()[0], // out_multiplier
+      out_shift.const_data_ptr<int32_t>()[0], // out_shift
+      out_zero_point); // out_zero_bias
   ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear failed");
 }
+
+void inline _quantized_linear_asym8s(
+    const Tensor& in,
+    const Tensor& weight,
+    const Tensor& bias,
+    int64_t in_zero_point,
+    const Tensor& weight_zero_point,
+    const Tensor& out_multiplier,
+    const Tensor& out_shift,
+    int64_t out_zero_point,
+    __ET_UNUSED const optional<Tensor>& offset,
+    Tensor& out) {
+  // input comes in shape [leading_dims, in_dim]
+  // weight comes in shape [out_dim, in_dim]
+  // output comes in empty with shape [leading_dims, out_dim]
+  // Perform matrix multiply (M x N) x (N x P)' => M x P
+  const int64_t leading_dims = getLeadingDims(in, in.dim() - 1);
+  const int64_t out_dim = weight.size(0); // = out_dim
+  const int64_t in_dim = weight.size(1); // = in_dim
+
+  const int8_t* __restrict__ in_data = in.const_data_ptr<int8_t>();
+  const int8_t* __restrict__ weight_data = weight.const_data_ptr<int8_t>();
+  const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
+  int8_t* __restrict__ out_data = out.mutable_data_ptr<int8_t>();
+
+  // The nnlib kernel to compute quantized linear via matmul.
+  int32_t ret = xa_nn_matmul_asym8sxasym8s_asym8s(
+      out_data, // p_out
+      weight_data, // p_mat1,
+      in_data, // p_mat2,
+      bias_data, // p_bias
+      out_dim, // rows of p_mat1
+      in_dim, // cols of p_mat1
+      in_dim, // row_stride of p_mat1
+      leading_dims, // vec_count, i.e., rows of p_mat2
+      in_dim, // vec_offset of p_mat2.
+      out_dim, // out_offset, i.e., offset of next output element written
+      1, // out_stride, i.e., stride to go to next output row
+      -weight_zero_point.const_data_ptr<int32_t>()[0], // mat1_zero_bias
+      -in_zero_point, // mat2_zero_bias
+      out_multiplier.const_data_ptr<int32_t>()[0], // out_multiplier
+      out_shift.const_data_ptr<int32_t>()[0], // out_shift
+      out_zero_point); // out_zero_bias
+  ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear failed");
+}
+
+void inline _quantized_linear_per_tensor_asym8u(
+    const Tensor& in,
+    const Tensor& weight,
+    const Tensor& bias,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    int64_t out_multiplier,
+    int64_t out_shift,
+    int64_t out_zero_point,
+    __ET_UNUSED const optional<Tensor>& offset,
+    Tensor& out) {
+  // input comes in shape [leading_dims, in_dim]
+  // weight comes in shape [out_dim, in_dim]
+  // output comes in empty with shape [leading_dims, out_dim]
+  // Perform matrix multiply (M x N) x (N x P)' => M x P
+  const int64_t leading_dims = getLeadingDims(in, in.dim() - 1);
+  const int64_t out_dim = weight.size(0); // = out_dim
+  const int64_t in_dim = weight.size(1); // = in_dim
+
+  const uint8_t* __restrict__ in_data = in.const_data_ptr<uint8_t>();
+  const uint8_t* __restrict__ weight_data = weight.const_data_ptr<uint8_t>();
+  const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
+  uint8_t* __restrict__ out_data = out.mutable_data_ptr<uint8_t>();
+
+  const int32_t out_multiplier_int32 = static_cast<int32_t>(out_multiplier);
+  const int32_t out_shift_int32 = static_cast<int32_t>(out_shift);
+
+  // The nnlib kernel to compute quantized linear via matmul.
+  const int32_t ret = xa_nn_matmul_asym8uxasym8u_asym8u(
+      out_data, // p_out
+      weight_data, // p_mat1,
+      in_data, // p_mat2,
+      bias_data, // p_bias
+      out_dim, // rows of p_mat1
+      in_dim, // cols of p_mat1
+      in_dim, // row_stride of p_mat1
+      leading_dims, // vec_count, i.e., rows of p_mat2
+      in_dim, // vec_offset of p_mat2.
+      out_dim, // out_offset, i.e., offset of next output element written
+      1, // out_stride, i.e., stride to go to next output row
+      -weight_zero_point, // mat1_zero_bias
+      -in_zero_point, // mat2_zero_bias
+      out_multiplier_int32, // out_multiplier
+      out_shift_int32, // out_shift
+      out_zero_point); // out_zero_bias
+  ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear_per_tensor failed");
+}
+
+void inline _quantized_linear_per_tensor_asym8s(
+    const Tensor& in,
+    const Tensor& weight,
+    const Tensor& bias,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    int64_t out_multiplier,
+    int64_t out_shift,
+    int64_t out_zero_point,
+    __ET_UNUSED const optional<Tensor>& offset,
+    Tensor& out) {
+  // input comes in shape [leading_dims, in_dim]
+  // weight comes in shape [out_dim, in_dim]
+  // output comes in empty with shape [leading_dims, out_dim]
+  // Perform matrix multiply (M x N) x (N x P)' => M x P
+  const int64_t leading_dims = getLeadingDims(in, in.dim() - 1);
+  const int64_t out_dim = weight.size(0); // = out_dim
+  const int64_t in_dim = weight.size(1); // = in_dim
+
+  const int8_t* __restrict__ in_data = in.const_data_ptr<int8_t>();
+  const int8_t* __restrict__ weight_data = weight.const_data_ptr<int8_t>();
+  const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
+  int8_t* __restrict__ out_data = out.mutable_data_ptr<int8_t>();
+
+  const int32_t out_multiplier_int32 = static_cast<int32_t>(out_multiplier);
+  const int32_t out_shift_int32 = static_cast<int32_t>(out_shift);
+
+  // The nnlib kernel to compute quantized linear via matmul.
+  const int32_t ret = xa_nn_matmul_asym8sxasym8s_asym8s(
+      out_data, // p_out
+      weight_data, // p_mat1,
+      in_data, // p_mat2,
+      bias_data, // p_bias
+      out_dim, // rows of p_mat1
+      in_dim, // cols of p_mat1
+      in_dim, // row_stride of p_mat1
+      leading_dims, // vec_count, i.e., rows of p_mat2
+      in_dim, // vec_offset of p_mat2.
+      out_dim, // out_offset, i.e., offset of next output element written
+      1, // out_stride, i.e., stride to go to next output row
+      -weight_zero_point, // mat1_zero_bias
+      -in_zero_point, // mat2_zero_bias
+      out_multiplier_int32, // out_multiplier
+      out_shift_int32, // out_shift
+      out_zero_point); // out_zero_bias
+  ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear_per_tensor failed");
+}
+
+void quantized_linear_out(
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& in,
+    const Tensor& weight,
+    const Tensor& bias,
+    int64_t in_zero_point,
+    const Tensor& weight_zero_point,
+    const Tensor& out_multiplier,
+    const Tensor& out_shift,
+    int64_t out_zero_point,
+    __ET_UNUSED const optional<Tensor>& offset,
+    Tensor& out) {
+  if (out.scalar_type() == ScalarType::Byte) {
+    _quantized_linear_asym8u(
+        in,
+        weight,
+        bias,
+        in_zero_point,
+        weight_zero_point,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        offset,
+        out);
+  } else if (out.scalar_type() == ScalarType::Char) {
+    _quantized_linear_asym8s(
+        in,
+        weight,
+        bias,
+        in_zero_point,
+        weight_zero_point,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        offset,
+        out);
+  } else {
+    ET_CHECK_MSG(
+        false, "quantized linear only supported for uint8 and int8 dtypes");
+  }
+}
+
+void quantized_linear_per_tensor_out(
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& in,
+    const Tensor& weight,
+    const Tensor& bias,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    int64_t out_multiplier,
+    int64_t out_shift,
+    int64_t out_zero_point,
+    __ET_UNUSED const optional<Tensor>& offset,
+    Tensor& out) {
+  if (out.scalar_type() == ScalarType::Byte) {
+    _quantized_linear_per_tensor_asym8u(
+        in,
+        weight,
+        bias,
+        in_zero_point,
+        weight_zero_point,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        offset,
+        out);
+  } else if (out.scalar_type() == ScalarType::Char) {
+    _quantized_linear_per_tensor_asym8s(
+        in,
+        weight,
+        bias,
+        in_zero_point,
+        weight_zero_point,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        offset,
+        out);
+  } else {
+    ET_CHECK_MSG(
+        false, "quantized linear only supported for uint8 and int8 dtypes");
+  }
+}
+
 }; // namespace native
 }; // namespace HiFi
 }; // namespace impl
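For reference, the per_tensor variant computes a plain affine-quantized GEMM: each output element is an int32 accumulation of zero-point-corrected products plus the int32 bias, which is then rescaled via out_multiplier/out_shift and offset by out_zero_point. The sketch below is illustrative only and is not part of the change: the function name is made up, it mirrors the shape and zero-point handling documented in the kernel comments above, and it approximates the requantization in floating point assuming the common Q31-multiplier-plus-shift encoding, whereas xa_nn_matmul_asym8sxasym8s_asym8s performs this step in fixed point and may round differently.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Hypothetical scalar reference for the int8 per-tensor path, shapes as in
// the kernel comments above:
//   in:     [leading_dims, in_dim]
//   weight: [out_dim, in_dim]
//   bias:   [out_dim] (int32)
//   out:    [leading_dims, out_dim]
void quantized_linear_per_tensor_ref(
    const int8_t* in,
    const int8_t* weight,
    const int32_t* bias,
    int64_t leading_dims,
    int64_t in_dim,
    int64_t out_dim,
    int32_t in_zero_point,
    int32_t weight_zero_point,
    int32_t out_multiplier, // assumed Q31 fixed-point multiplier
    int32_t out_shift,      // assumed power-of-two exponent
    int32_t out_zero_point,
    int8_t* out) {
  // Approximate the fixed-point requantization in float; the nnlib kernel
  // does this in integer arithmetic with its own rounding rules.
  const double requant_scale =
      static_cast<double>(out_multiplier) * std::pow(2.0, out_shift - 31);
  for (int64_t m = 0; m < leading_dims; ++m) {
    for (int64_t n = 0; n < out_dim; ++n) {
      int32_t acc = bias[n];
      for (int64_t k = 0; k < in_dim; ++k) {
        // Zero-point-corrected product, accumulated in int32.
        acc += (static_cast<int32_t>(in[m * in_dim + k]) - in_zero_point) *
            (static_cast<int32_t>(weight[n * in_dim + k]) - weight_zero_point);
      }
      const int32_t rescaled =
          static_cast<int32_t>(std::lround(acc * requant_scale)) +
          out_zero_point;
      // Clamp to the int8 output range.
      out[m * out_dim + n] = static_cast<int8_t>(
          std::min<int32_t>(127, std::max<int32_t>(-128, rescaled)));
    }
  }
}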