pytorch · hsharma35 · Apr 29, 2026 · Apr 29, 2026
diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py
@@ -2527,7 +2527,7 @@ def quantized_max_pool2d_nhwc_meta(
 def fully_connected_meta(
     src: torch.Tensor,
     weight: torch.Tensor,
-    bias: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     # src comes in shape [leading_dims, in_dim]
     # weight comes in shape [out_dim, in_dim]

diff --git a/backends/cadence/aot/ref_implementations.py b/backends/cadence/aot/ref_implementations.py
@@ -633,10 +633,8 @@ def quantized_fully_connected_asym8uxasym8u_asym8u_per_tensor() -> torch.Tensor:
 def fully_connected(
     input_tensor: torch.Tensor,
     weight: torch.Tensor,
-    bias: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,  # None: F.linear adds no bias term
 ) -> torch.Tensor:
-    if input_tensor.shape[0] != 1:
-        raise ValueError("Fully connected linear only supports batch size of 1")
     return F.linear(input_tensor, weight, bias)
 
 

diff --git a/backends/cadence/generic/operators/op_fully_connected.cpp b/backends/cadence/generic/operators/op_fully_connected.cpp
@@ -27,7 +27,8 @@ void linear(
     Tensor& output) {
   const float* __restrict__ input_data = input.const_data_ptr<float>();
   const float* __restrict__ weight_data = weight.const_data_ptr<float>();
-  const float* __restrict__ bias_data = bias.value().const_data_ptr<float>();
+  const float* __restrict__ bias_data =
+      bias.has_value() ? bias.value().const_data_ptr<float>() : nullptr;
   float* __restrict__ output_data = output.mutable_data_ptr<float>();
 
   // input comes in shape [batch_size, in_dim]
@@ -43,7 +44,7 @@ void linear(
 
   for (int i = 0; i < leading_dims; ++i) {
     for (int j = 0; j < M; ++j) {
-      float sum = bias_data[j];
+      float sum = bias_data != nullptr ? bias_data[j] : 0.0f;
       for (int k = 0; k < N; ++k) {
         sum += input_data[i * N + k] * weight_data[j * N + k];
       }

diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_ncl.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_ncl.cpp
@@ -240,7 +240,10 @@ void xa_opt_quantized_conv1d_ncl_asym8uxsym8u_asym8u(
   WORD32 x_stride = stride[0];
   WORD32 x_padding = padding[0];
   WORD32 input_zero_bias = -in_zero_point;
-  WORD32 out_multiplier32 = bias_scale * (1. / output_scale) * 2147483648;
+  const float eff_scale = bias_scale * (1.0f / output_scale);
+  WORD32 out_multiplier32 = (eff_scale >= 1.0f)
+      ? static_cast<WORD32>(2147483647)
+      : static_cast<WORD32>(eff_scale * 2147483648.0f);
   WORD32 out_shift32 = 0;
   WORD32 kernel_zero_bias = -weight_zero_point;
 
@@ -419,9 +422,9 @@ void quantized_conv1d_ncl_per_tensor_out(
           out);
     }
   } else if (dtype == ScalarType::Byte) {
-    // HiFi nnlib conv1d_std kernel does not support depthwise (groups > 1).
-    // Fall back to generic implementation.
-    if (groups > 1) {
+    // HiFi nnlib conv1d_std kernel does not support depthwise (groups > 1)
+    // or stride > 1. Fall back to generic implementation.
+    if (groups > 1 || stride[0] > 1) {
       impl::generic::native::quantized_conv1d_ncl_per_tensor_out(
           ctx,
           input,

diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp
@@ -176,7 +176,10 @@ void xa_opt_quantized_conv1d_nlc_asym8uxsym8u_asym8u(
   WORD32 x_stride = stride[stride.size() - 1];
   WORD32 x_padding = padding[padding.size() - 1];
   WORD32 input_zero_bias = -in_zero_point;
-  WORD32 out_multiplier32 = bias_scale * (1. / output_scale) * 2147483648;
+  const float eff_scale = bias_scale * (1.0f / output_scale);
+  WORD32 out_multiplier32 = (eff_scale >= 1.0f)
+      ? static_cast<WORD32>(2147483647)
+      : static_cast<WORD32>(eff_scale * 2147483648.0f);
   WORD32 out_shift32 = 0;
   WORD32 kernel_zero_bias = -weight_zero_point;
 
@@ -298,9 +301,9 @@ void quantized_conv1d_nlc_per_tensor_out(
           out);
     }
   } else if (dtype == ScalarType::Byte) {
-    // HiFi nnlib conv1d_std kernel does not support depthwise (groups > 1).
-    // Fall back to generic implementation.
-    if (groups > 1) {
+    // HiFi nnlib conv1d_std lacks depthwise (groups > 1) and stride > 1: fall
+    // back to generic. Use the last stride entry, as the xa_opt wrapper does.
+    if (groups > 1 || stride[stride.size() - 1] > 1) {
       impl::generic::native::quantized_conv1d_nlc_per_tensor_out(
           ctx,
           input,