4 changes: 4 additions & 0 deletions backends/cadence/aot/functions_hifi.yaml
@@ -125,3 +125,7 @@
kernels:
- arg_meta: null
kernel_name: cadence::impl::HiFi::quantized_linear_out
- func: cadence::quantized_linear.per_tensor_out(Tensor src, Tensor weight, Tensor bias, SymInt src_zero_point, SymInt weight_zero_point, SymInt out_multiplier, SymInt out_shift, SymInt out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
kernels:
- arg_meta: null
kernel_name: cadence::impl::HiFi::quantized_linear_per_tensor_out
3 changes: 3 additions & 0 deletions backends/cadence/aot/ops_registrations.py
@@ -43,6 +43,9 @@
lib.define(
"quantized_linear.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)"
)
lib.define(
"cadence::quantized_linear.per_tensor_out(Tensor src, Tensor weight, Tensor bias, SymInt src_zero_point, SymInt weight_zero_point, SymInt out_multiplier, SymInt out_shift, SymInt out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)"
)

lib.define(
"quantized_relu(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift) -> (Tensor Y)"
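Both the existing Tensor-based overload and the new per_tensor overload carry out_multiplier and out_shift requantization parameters. As a hedged illustration of what such parameters typically encode (the fixed-point form of in_scale * weight_scale / out_scale; the exact convention the HiFi kernel expects is not spelled out in this diff), here is a minimal encoding sketch:

// Hedged sketch: one common way to encode a float requantization scale
// (in_scale * weight_scale / out_scale) as a 32-bit multiplier plus a power-of-two
// shift. The exact convention expected by the HiFi nnlib kernel is not stated in
// this diff, so treat this as an illustration only.
#include <cmath>
#include <cstdint>
#include <cstdio>

void encode_requant_scale(float requant_scale, int32_t* out_multiplier, int32_t* out_shift) {
  int exp = 0;
  // frexp returns a mantissa in [0.5, 1); scale it to a Q31 fixed-point value.
  const double mantissa = std::frexp(static_cast<double>(requant_scale), &exp);
  *out_multiplier = static_cast<int32_t>(std::lround(mantissa * (1LL << 31)));
  *out_shift = exp;  // applied roughly as: result = (acc * multiplier) >> (31 - shift)
}

int main() {
  // Example: in_scale = 0.02, weight_scale = 0.005, out_scale = 0.1
  const float requant_scale = 0.02f * 0.005f / 0.1f;  // = 0.001
  int32_t multiplier = 0;
  int32_t shift = 0;
  encode_requant_scale(requant_scale, &multiplier, &shift);
  std::printf("multiplier=%d shift=%d\n", multiplier, shift);
  return 0;
}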
18 changes: 18 additions & 0 deletions backends/cadence/hifi/kernels/kernels.h
@@ -38,6 +38,24 @@ WORD32 matmul_asym8uxasym8u_asym8u(
WORD32 out_zero_bias,
bool per_channel_quantized = false); // per-channel quantized weight

WORD32 xa_nn_matmul_asym8uxasym8u_asym8u(
UWORD8* __restrict__ p_out,
const UWORD8* __restrict__ p_mat1,
const UWORD8* __restrict__ p_mat2,
const WORD32* __restrict__ p_bias,
WORD32 rows,
WORD32 cols,
WORD32 row_stride,
WORD32 vec_count,
WORD32 vec_offset,
WORD32 out_offset,
WORD32 out_stride,
WORD32 mat1_zero_bias,
WORD32 vec1_zero_bias,
WORD32 out_multiplier,
WORD32 out_shift,
WORD32 out_zero_bias);

template <typename T>
T quantize(const float x, float scale, int32_t zero_point);

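The new declaration mirrors the nnlib matmul entry point that the per-tensor path calls directly in quantized_linear_out.cpp. As a rough guide to the parameter semantics, the following scalar reference is inferred from how the call sites below fill these arguments; the requantize helper and the reference function are illustrative assumptions, not the nnlib implementation:

// Hedged scalar reference for the asymmetric quantized matmul declared above,
// inferred from the call sites in quantized_linear_out.cpp. Illustration only.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Assumed requantization: scale the accumulator by out_multiplier * 2^-31, then by
// 2^out_shift. The exact rounding used by the nnlib kernel may differ.
static int32_t requantize(int64_t acc, int32_t multiplier, int32_t shift) {
  double scaled = static_cast<double>(acc) * multiplier / (1LL << 31);
  return static_cast<int32_t>(std::lround(std::ldexp(scaled, shift)));
}

// mat1: rows x cols matrix (the weight), with row_stride elements between rows.
// mat2: vec_count vectors of length cols, spaced vec_offset elements apart (the input).
// out:  element for (vector v, row r) is written at v * out_offset + r * out_stride.
void matmul_asym8uxasym8u_asym8u_reference(
    uint8_t* out, const uint8_t* mat1, const uint8_t* mat2, const int32_t* bias,
    int32_t rows, int32_t cols, int32_t row_stride,
    int32_t vec_count, int32_t vec_offset, int32_t out_offset, int32_t out_stride,
    int32_t mat1_zero_bias, int32_t vec1_zero_bias,
    int32_t out_multiplier, int32_t out_shift, int32_t out_zero_bias) {
  for (int32_t v = 0; v < vec_count; ++v) {
    for (int32_t r = 0; r < rows; ++r) {
      int64_t acc = bias != nullptr ? bias[r] : 0;
      for (int32_t c = 0; c < cols; ++c) {
        const int32_t a = static_cast<int32_t>(mat1[r * row_stride + c]) + mat1_zero_bias;
        const int32_t b = static_cast<int32_t>(mat2[v * vec_offset + c]) + vec1_zero_bias;
        acc += static_cast<int64_t>(a) * b;
      }
      const int32_t q = requantize(acc, out_multiplier, out_shift) + out_zero_bias;
      out[v * out_offset + r * out_stride] = static_cast<uint8_t>(std::clamp(q, 0, 255));
    }
  }
}

int main() {
  // Tiny example: 2x2 weight, one input vector, unit requantization (2^30 * 2^1 / 2^31 = 1).
  const uint8_t weight[] = {1, 2, 3, 4};
  const uint8_t input[] = {5, 6};
  const int32_t bias[] = {0, 0};
  uint8_t out[2] = {0, 0};
  matmul_asym8uxasym8u_asym8u_reference(
      out, weight, input, bias, /*rows=*/2, /*cols=*/2, /*row_stride=*/2,
      /*vec_count=*/1, /*vec_offset=*/2, /*out_offset=*/2, /*out_stride=*/1,
      /*mat1_zero_bias=*/0, /*vec1_zero_bias=*/0,
      /*out_multiplier=*/1 << 30, /*out_shift=*/1, /*out_zero_bias=*/0);
  std::printf("out = [%d, %d]\n", out[0], out[1]);  // expect [17, 39]
  return 0;
}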
265 changes: 247 additions & 18 deletions backends/cadence/hifi/operators/quantized_linear_out.cpp
@@ -7,46 +7,51 @@
*/

#include <executorch/backends/cadence/hifi/kernels/kernels.h>
#include <executorch/backends/cadence/hifi/operators/operators.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <xa_nnlib_kernels_api.h>
#include <xtensa/tie/xt_datacache.h>
#include <algorithm>
#include <cmath>
#include <optional>

namespace cadence {
namespace impl {
namespace HiFi {
namespace native {

using executorch::aten::Tensor;
using executorch::runtime::getLeadingDims;
using executorch::runtime::KernelRuntimeContext;
using ::executorch::aten::optional;
using ::executorch::aten::ScalarType;
using ::executorch::aten::Tensor;
using ::executorch::runtime::getLeadingDims;
using ::executorch::runtime::KernelRuntimeContext;

void quantized_linear_out(
KernelRuntimeContext& ctx,
const Tensor& src,
void _quantized_linear_asym8u(
const Tensor& in,
const Tensor& weight,
const Tensor& bias,
int64_t src_zero_point,
int64_t in_zero_point,
const Tensor& weight_zero_point,
const Tensor& out_multiplier,
const Tensor& out_shift,
int64_t out_zero_point,
const executorch::aten::optional<Tensor>& offset,
__ET_UNUSED const optional<Tensor>& offset,
Tensor& out) {
// input comes in shape [leading_dims, in_dim]
// weight comes in shape [out_dim, in_dim]
// output comes in empty with shape [leading_dims, out_dim]
// Perform matrix multiply (M x N) x (N x P)' => M x P
int64_t leading_dims = getLeadingDims(src, src.dim() - 1);
int64_t out_dim = weight.size(0); // = out_dim
int64_t in_dim = weight.size(1); // = in_dim
const int64_t leading_dims = getLeadingDims(in, in.dim() - 1);
const int64_t out_dim = weight.size(0); // = out_dim
const int64_t in_dim = weight.size(1); // = in_dim

const uint8_t* __restrict__ in_data = src.const_data_ptr<uint8_t>();
const uint8_t* __restrict__ in_data = in.const_data_ptr<uint8_t>();
const uint8_t* __restrict__ weight_data = weight.const_data_ptr<uint8_t>();
const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
uint8_t* __restrict__ out_data = out.mutable_data_ptr<uint8_t>();

// The nnlib kernel to compute quantized linear via matmul.
int32_t ret = cadence::impl::HiFi::kernels::matmul_asym8uxasym8u_asym8u(
int32_t ret = xa_nn_matmul_asym8uxasym8u_asym8u(
out_data, // p_out
weight_data, // p_mat1,
in_data, // p_mat2,
@@ -59,14 +64,238 @@ void quantized_linear_out(
out_dim, // out_offset, i.e., offset of next output element written
1, // out_stride, i.e., stride to go to next output row
-weight_zero_point.const_data_ptr<int32_t>()[0], // mat1_zero_bias
-src_zero_point, // mat2_zero_bias
out_multiplier.const_data_ptr<int32_t>(), // out_multiplier
out_shift.const_data_ptr<int32_t>(), // out_shift
out_zero_point, // out_zero_bias
false); // per channel quantization
-in_zero_point, // mat2_zero_bias
out_multiplier.const_data_ptr<int32_t>()[0], // out_multiplier
out_shift.const_data_ptr<int32_t>()[0], // out_shift
out_zero_point); // out_zero_bias
ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear failed");
}

void inline _quantized_linear_asym8s(
const Tensor& in,
const Tensor& weight,
const Tensor& bias,
int64_t in_zero_point,
const Tensor& weight_zero_point,
const Tensor& out_multiplier,
const Tensor& out_shift,
int64_t out_zero_point,
__ET_UNUSED const optional<Tensor>& offset,
Tensor& out) {
// input comes in shape [leading_dims, in_dim]
// weight comes in shape [out_dim, in_dim]
// output comes in empty with shape [leading_dims, out_dim]
// Perform matrix multiply (M x N) x (N x P)' => M x P
const int64_t leading_dims = getLeadingDims(in, in.dim() - 1);
const int64_t out_dim = weight.size(0); // = out_dim
const int64_t in_dim = weight.size(1); // = in_dim

const int8_t* __restrict__ in_data = in.const_data_ptr<int8_t>();
const int8_t* __restrict__ weight_data = weight.const_data_ptr<int8_t>();
const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
int8_t* __restrict__ out_data = out.mutable_data_ptr<int8_t>();

// The nnlib kernel to compute quantized linear via matmul.
int32_t ret = xa_nn_matmul_asym8sxasym8s_asym8s(
out_data, // p_out
weight_data, // p_mat1,
in_data, // p_mat2,
bias_data, // p_bias
out_dim, // rows of p_mat1
in_dim, // cols of p_mat1
in_dim, // row_stride of p_mat1
leading_dims, // vec_count, i.e., rows of p_mat2
in_dim, // vec_offset of p_mat2.
out_dim, // out_offset, i.e., offset of next output element written
1, // out_stride, i.e., stride to go to next output row
-weight_zero_point.const_data_ptr<int32_t>()[0], // mat1_zero_bias
-in_zero_point, // mat2_zero_bias
out_multiplier.const_data_ptr<int32_t>()[0], // out_multiplier
out_shift.const_data_ptr<int32_t>()[0], // out_shift
out_zero_point); // out_zero_bias
ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear failed");
}

void inline _quantized_linear_per_tensor_asym8u(
const Tensor& in,
const Tensor& weight,
const Tensor& bias,
int64_t in_zero_point,
int64_t weight_zero_point,
int64_t out_multiplier,
int64_t out_shift,
int64_t out_zero_point,
__ET_UNUSED const optional<Tensor>& offset,
Tensor& out) {
// input comes in shape [leading_dims, in_dim]
// weight comes in shape [out_dim, in_dim]
// output comes in empty with shape [leading_dims, out_dim]
// Perform matrix multiply (M x N) x (N x P)' => M x P
const int64_t leading_dims = getLeadingDims(in, in.dim() - 1);
const int64_t out_dim = weight.size(0); // = out_dim
const int64_t in_dim = weight.size(1); // = in_dim

const uint8_t* __restrict__ in_data = in.const_data_ptr<uint8_t>();
const uint8_t* __restrict__ weight_data = weight.const_data_ptr<uint8_t>();
const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
uint8_t* __restrict__ out_data = out.mutable_data_ptr<uint8_t>();

const int32_t out_multiplier_int32 = static_cast<int32_t>(out_multiplier);
const int32_t out_shift_int32 = static_cast<int32_t>(out_shift);

// The nnlib kernel to compute quantized linear via matmul.
const int32_t ret = xa_nn_matmul_asym8uxasym8u_asym8u(
out_data, // p_out
weight_data, // p_mat1,
in_data, // p_mat2,
bias_data, // p_bias
out_dim, // rows of p_mat1
in_dim, // cols of p_mat1
in_dim, // row_stride of p_mat1
leading_dims, // vec_count, i.e., rows of p_mat2
in_dim, // vec_offset of p_mat2.
out_dim, // out_offset, i.e., offset of next output element written
1, // out_stride, i.e., stride to go to next output row
-weight_zero_point, // mat1_zero_bias
-in_zero_point, // mat2_zero_bias
out_multiplier_int32, // out_multiplier
out_shift_int32, // out_shift
out_zero_point); // out_zero_bias
ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear_per_tensor failed");
}

void inline _quantized_linear_per_tensor_asym8s(
const Tensor& in,
const Tensor& weight,
const Tensor& bias,
int64_t in_zero_point,
int64_t weight_zero_point,
int64_t out_multiplier,
int64_t out_shift,
int64_t out_zero_point,
__ET_UNUSED const optional<Tensor>& offset,
Tensor& out) {
// input comes in shape [leading_dims, in_dim]
// weight comes in shape [out_dim, in_dim]
// output comes in empty with shape [leading_dims, out_dim]
// Perform matrix multiply (M x N) x (N x P)' => M x P
const int64_t leading_dims = getLeadingDims(in, in.dim() - 1);
const int64_t out_dim = weight.size(0); // = out_dim
const int64_t in_dim = weight.size(1); // = in_dim

const int8_t* __restrict__ in_data = in.const_data_ptr<int8_t>();
const int8_t* __restrict__ weight_data = weight.const_data_ptr<int8_t>();
const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
int8_t* __restrict__ out_data = out.mutable_data_ptr<int8_t>();

const int32_t out_multiplier_int32 = static_cast<int32_t>(out_multiplier);
const int32_t out_shift_int32 = static_cast<int32_t>(out_shift);

// The nnlib kernel to compute quantized linear via matmul.
const int32_t ret = xa_nn_matmul_asym8sxasym8s_asym8s(
out_data, // p_out
weight_data, // p_mat1,
in_data, // p_mat2,
bias_data, // p_bias
out_dim, // rows of p_mat1
in_dim, // cols of p_mat1
in_dim, // row_stride of p_mat1
leading_dims, // vec_count, i.e., rows of p_mat2
in_dim, // vec_offset of p_mat2.
out_dim, // out_offset, i.e., offset of next output element written
1, // out_stride, i.e., stride to go to next output row
-weight_zero_point, // mat1_zero_bias
-in_zero_point, // mat2_zero_bias
out_multiplier_int32, // out_multiplier
out_shift_int32, // out_shift
out_zero_point); // out_zero_bias
ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear_per_tensor failed");
}

void quantized_linear_out(
__ET_UNUSED KernelRuntimeContext& ctx,
const Tensor& in,
const Tensor& weight,
const Tensor& bias,
int64_t in_zero_point,
const Tensor& weight_zero_point,
const Tensor& out_multiplier,
const Tensor& out_shift,
int64_t out_zero_point,
__ET_UNUSED const optional<Tensor>& offset,
Tensor& out) {
if (out.scalar_type() == exec_aten::ScalarType::Byte) {
_quantized_linear_asym8u(
in,
weight,
bias,
in_zero_point,
weight_zero_point,
out_multiplier,
out_shift,
out_zero_point,
offset,
out);
} else if (out.scalar_type() == exec_aten::ScalarType::Char) {
_quantized_linear_asym8s(
in,
weight,
bias,
in_zero_point,
weight_zero_point,
out_multiplier,
out_shift,
out_zero_point,
offset,
out);
} else {
ET_CHECK_MSG(
false, "quantized linear only supported for uint8 and int8 dtypes");
}
}

void quantized_linear_per_tensor_out(
__ET_UNUSED KernelRuntimeContext& ctx,
const Tensor& in,
const Tensor& weight,
const Tensor& bias,
int64_t in_zero_point,
int64_t weight_zero_point,
int64_t out_multiplier,
int64_t out_shift,
int64_t out_zero_point,
__ET_UNUSED const optional<Tensor>& offset,
Tensor& out) {
if (out.scalar_type() == exec_aten::ScalarType::Byte) {
_quantized_linear_per_tensor_asym8u(
in,
weight,
bias,
in_zero_point,
weight_zero_point,
out_multiplier,
out_shift,
out_zero_point,
offset,
out);
} else if (out.scalar_type() == exec_aten::ScalarType::Char) {
_quantized_linear_per_tensor_asym8s(
in,
weight,
bias,
in_zero_point,
weight_zero_point,
out_multiplier,
out_shift,
out_zero_point,
offset,
out);
} else {
ET_CHECK_MSG(
false, "quantized linear only supported for uint8 and int8 dtypes");
}
}

}; // namespace native
}; // namespace HiFi
}; // namespace impl
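One detail worth flagging in the per-tensor paths above: the int64_t out_multiplier and out_shift arguments are narrowed to int32_t with a plain static_cast before the nnlib call. A small caller-side guard against silent truncation could look like the sketch below; the helper name is hypothetical and not part of this change:

// Hedged sketch: verify that a 64-bit requantization parameter fits in int32 before
// the narrowing cast used in the per-tensor kernels. Helper name is hypothetical.
#include <cstdint>
#include <limits>

inline bool fits_in_int32(int64_t v) {
  return v >= std::numeric_limits<int32_t>::min() &&
         v <= std::numeric_limits<int32_t>::max();
}

// Possible usage at the top of the per-tensor kernels:
//   ET_DCHECK_MSG(fits_in_int32(out_multiplier), "out_multiplier does not fit in int32");
//   ET_DCHECK_MSG(fits_in_int32(out_shift), "out_shift does not fit in int32");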