From 0dae951760f48782dfca1e391c995ba3c3b8c611 Mon Sep 17 00:00:00 2001
From: Ethan Ng
Date: Tue, 26 Aug 2025 09:31:38 -0700
Subject: [PATCH] Clean up ET_CHECK in depthwise conv, replace with assert

Summary: Replace the trailing "if (groups == 1)" guard with an upfront
ET_CHECK_MSG assert, and unindent the now-unconditional body. Depthwise
convolutions are handled by specialized operators, so these kernels never
see groups != 1.

Reviewed By: zonglinpeng

Differential Revision: D80952627
---
 ...chw_asym8sxsym8s_asym8s_per_tensor_out.cpp | 211 +++++++++---------
 ...chw_asym8uxsym8u_asym8u_per_tensor_out.cpp | 211 +++++++++---------
 ...hwc_asym8sxsym8s_asym8s_per_tensor_out.cpp | 100 ++++-----
 ...hwc_asym8uxsym8u_asym8u_per_tensor_out.cpp | 100 ++++-----
 4 files changed, 300 insertions(+), 322 deletions(-)
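
Note for reviewers (a reduced sketch of the pattern, not part of the applied
diff; the "..." body stands in for the conv2d setup elided here):
ET_CHECK_MSG is ExecuTorch's fatal assert (declared in
runtime/platform/assert.h), so once it runs, groups == 1 is guaranteed.
Before, the check sat after an early-returning guard and only executed on
the groups != 1 failure path; after, it states the precondition up front
and the same body runs unconditionally at one less indent level:

    // Before: guarded body with early return; the trailing check is
    // reached only when groups != 1, where it always aborts.
    if (groups == 1) {
      // ... optimized groups == 1 convolution path ...
      return;
    }
    ET_CHECK_MSG(groups == 1, "Only groups=1 supported for regular convolution");

    // After: assert the precondition once, then run the body
    // unconditionally.
    ET_CHECK_MSG(groups == 1, "Only groups=1 supported for regular convolution");
    // ... optimized groups == 1 convolution path ...
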
diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp
index 6e09b995126..2788de589cf 100644
--- a/backends/cadence/hifi/operators/op_quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp
+++ b/backends/cadence/hifi/operators/op_quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp
@@ -89,67 +89,101 @@ void xa_opt_quantized_conv_nchw_asym8sxsym8s_asym8s(
 
   WORD32 scratch_size = 0;
 
-  if (groups == 1) {
-    WORD32 out_data_format = 1;
-
-    WORD8* ptr1 = (WORD8*)kernels::allocate_temp_memory(
-        ctx,
-        ((batches * input_channels * input_height * input_width) + 8) *
-            sizeof(WORD8));
-
-    WORD8* ptr2 = (WORD8*)kernels::allocate_temp_memory(
-        ctx,
-        ((out_channels * kernel_channels * kernel_height * kernel_width) + 8) *
-            sizeof(WORD8));
-
-    WORD8* pin = (WORD8*)ALIGN_PTR(ptr1, 8);
-    WORD8* pkernel = (WORD8*)ALIGN_PTR(ptr2, 8);
-
-    WORD32 p_inp_shape[kNnlibMaxDim];
-    p_inp_shape[0] = input.size(0);
-    p_inp_shape[1] = input_channels;
-    p_inp_shape[2] = input_height;
-    p_inp_shape[3] = input_width;
-
-    WORD32 p_out_shape[kNnlibMaxDim];
-    p_out_shape[0] = input.size(0);
-    p_out_shape[1] = input_height;
-    p_out_shape[2] = input_width;
-    p_out_shape[3] = input_channels;
-
-    WORD32 p_permute_vec[kNnlibMaxDim] = {0, 2, 3, 1};
-
-    xa_nn_transpose_8_8(
-        pin,
-        p_out_shape,
-        p_inp,
-        p_inp_shape,
-        p_permute_vec,
-        kNnlibMaxDim,
-        kNnlibMaxDim);
-
-    WORD32 p_inp_shape1[kNnlibMaxDim];
-    p_inp_shape1[0] = out_channels;
-    p_inp_shape1[1] = kernel_channels;
-    p_inp_shape1[2] = kernel_height;
-    p_inp_shape1[3] = kernel_width;
-
-    WORD32 p_out_shape1[kNnlibMaxDim];
-    p_out_shape1[0] = out_channels;
-    p_out_shape1[1] = kernel_height;
-    p_out_shape1[2] = kernel_width;
-    p_out_shape1[3] = kernel_channels;
-
-    xa_nn_transpose_8_8(
+  ET_CHECK_MSG(groups == 1, "Only groups=1 supported for regular convolution");
+  WORD32 out_data_format = 1;
+
+  WORD8* ptr1 = (WORD8*)kernels::allocate_temp_memory(
+      ctx,
+      ((batches * input_channels * input_height * input_width) + 8) *
+          sizeof(WORD8));
+
+  WORD8* ptr2 = (WORD8*)kernels::allocate_temp_memory(
+      ctx,
+      ((out_channels * kernel_channels * kernel_height * kernel_width) + 8) *
+          sizeof(WORD8));
+
+  WORD8* pin = (WORD8*)ALIGN_PTR(ptr1, 8);
+  WORD8* pkernel = (WORD8*)ALIGN_PTR(ptr2, 8);
+
+  WORD32 p_inp_shape[kNnlibMaxDim];
+  p_inp_shape[0] = input.size(0);
+  p_inp_shape[1] = input_channels;
+  p_inp_shape[2] = input_height;
+  p_inp_shape[3] = input_width;
+
+  WORD32 p_out_shape[kNnlibMaxDim];
+  p_out_shape[0] = input.size(0);
+  p_out_shape[1] = input_height;
+  p_out_shape[2] = input_width;
+  p_out_shape[3] = input_channels;
+
+  WORD32 p_permute_vec[kNnlibMaxDim] = {0, 2, 3, 1};
+
+  xa_nn_transpose_8_8(
+      pin,
+      p_out_shape,
+      p_inp,
+      p_inp_shape,
+      p_permute_vec,
+      kNnlibMaxDim,
+      kNnlibMaxDim);
+
+  WORD32 p_inp_shape1[kNnlibMaxDim];
+  p_inp_shape1[0] = out_channels;
+  p_inp_shape1[1] = kernel_channels;
+  p_inp_shape1[2] = kernel_height;
+  p_inp_shape1[3] = kernel_width;
+
+  WORD32 p_out_shape1[kNnlibMaxDim];
+  p_out_shape1[0] = out_channels;
+  p_out_shape1[1] = kernel_height;
+  p_out_shape1[2] = kernel_width;
+  p_out_shape1[3] = kernel_channels;
+
+  xa_nn_transpose_8_8(
+      pkernel,
+      p_out_shape1,
+      p_kernel,
+      p_inp_shape1,
+      p_permute_vec,
+      kNnlibMaxDim,
+      kNnlibMaxDim);
+
+  scratch_size = xa_nn_conv2d_getsize(
+      input_height,
+      input_width,
+      input_channels,
+      kernel_height,
+      kernel_width,
+      kernel_channels,
+      dilation_height,
+      dilation_width,
+      y_stride,
+      y_padding,
+      x_stride,
+      x_padding,
+      out_height,
+      out_width,
+      out_channels,
+      inp_precision,
+      kernel_precision,
+      out_data_format);
+
+  scratch_size = scratch_size < 0 ? 0 : scratch_size;
+
+  ptr_scratch = (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size);
+
+  p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8);
+
+  for (int _n = 0; _n < batches; _n++) {
+    WORD8* in_batch = pin + _n * input_channels * input_height * input_width;
+    WORD8* out_batch = p_out + _n * out_channels * out_height * out_width;
+
+    xa_nn_conv2d_per_chan_sym8sxasym8s(
+        out_batch,
+        in_batch,
         pkernel,
-        p_out_shape1,
-        p_kernel,
-        p_inp_shape1,
-        p_permute_vec,
-        kNnlibMaxDim,
-        kNnlibMaxDim);
-
-    scratch_size = xa_nn_conv2d_getsize(
+        p_bias,
         input_height,
         input_width,
         input_channels,
@@ -158,59 +192,20 @@ void xa_opt_quantized_conv_nchw_asym8sxsym8s_asym8s(
         kernel_channels,
         dilation_height,
         dilation_width,
-        y_stride,
-        y_padding,
+        out_channels,
         x_stride,
+        y_stride,
         x_padding,
+        y_padding,
         out_height,
         out_width,
-        out_channels,
-        inp_precision,
-        kernel_precision,
-        out_data_format);
-
-    scratch_size = scratch_size < 0 ? 0 : scratch_size;
-
-    ptr_scratch = (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size);
-
-    p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8);
-
-    for (int _n = 0; _n < batches; _n++) {
-      WORD8* in_batch = pin + _n * input_channels * input_height * input_width;
-      WORD8* out_batch = p_out + _n * out_channels * out_height * out_width;
-
-      xa_nn_conv2d_per_chan_sym8sxasym8s(
-          out_batch,
-          in_batch,
-          pkernel,
-          p_bias,
-          input_height,
-          input_width,
-          input_channels,
-          kernel_height,
-          kernel_width,
-          kernel_channels,
-          dilation_height,
-          dilation_width,
-          out_channels,
-          x_stride,
-          y_stride,
-          x_padding,
-          y_padding,
-          out_height,
-          out_width,
-          input_zero_bias,
-          out_multiplier32,
-          out_shift32,
-          out_zero_bias,
-          out_data_format,
-          p_scratch);
-    }
-    return;
+        input_zero_bias,
+        out_multiplier32,
+        out_shift32,
+        out_zero_bias,
+        out_data_format,
+        p_scratch);
   }
-
-  // Depthwise convolutions are now handled by specialized operators
-  ET_CHECK_MSG(groups == 1, "Only groups=1 supported for regular convolution");
 }
 
 void quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out(
diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp
index ccbf70e1d2d..9fd2d69dda9 100644
--- a/backends/cadence/hifi/operators/op_quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp
+++ b/backends/cadence/hifi/operators/op_quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp
@@ -89,67 +89,101 @@ void xa_opt_quantized_conv_nchw_asym8uxsym8u_asym8u(
 
   WORD32 scratch_size = 0;
 
-  if (groups == 1) {
-    WORD32 out_data_format = 1;
-
-    UWORD8* ptr1 = (UWORD8*)kernels::allocate_temp_memory(
-        ctx,
-        ((batches * input_channels * input_height * input_width) + 8) *
-            sizeof(UWORD8));
-
-    UWORD8* ptr2 = (UWORD8*)kernels::allocate_temp_memory(
-        ctx,
-        ((out_channels * kernel_channels * kernel_height * kernel_width) + 8) *
-            sizeof(UWORD8));
-
-    UWORD8* pin = (UWORD8*)ALIGN_PTR(ptr1, 8);
-    UWORD8* pkernel = (UWORD8*)ALIGN_PTR(ptr2, 8);
-
-    WORD32 p_inp_shape[kNnlibMaxDim];
-    p_inp_shape[0] = input.size(0);
-    p_inp_shape[1] = input_channels;
-    p_inp_shape[2] = input_height;
-    p_inp_shape[3] = input_width;
-
-    WORD32 p_out_shape[kNnlibMaxDim];
-    p_out_shape[0] = input.size(0);
-    p_out_shape[1] = input_height;
-    p_out_shape[2] = input_width;
-    p_out_shape[3] = input_channels;
-
-    WORD32 p_permute_vec[kNnlibMaxDim] = {0, 2, 3, 1};
-
-    xa_nn_transpose_8_8(
-        (WORD8*)pin,
-        p_out_shape,
-        (WORD8*)p_inp,
-        p_inp_shape,
-        p_permute_vec,
-        kNnlibMaxDim,
-        kNnlibMaxDim);
-
-    WORD32 p_inp_shape1[kNnlibMaxDim];
-    p_inp_shape1[0] = out_channels;
-    p_inp_shape1[1] = kernel_channels;
-    p_inp_shape1[2] = kernel_height;
-    p_inp_shape1[3] = kernel_width;
-
-    WORD32 p_out_shape1[kNnlibMaxDim];
-    p_out_shape1[0] = out_channels;
-    p_out_shape1[1] = kernel_height;
-    p_out_shape1[2] = kernel_width;
-    p_out_shape1[3] = kernel_channels;
-
-    xa_nn_transpose_8_8(
+  ET_CHECK_MSG(groups == 1, "Only groups=1 supported for regular convolution");
+  WORD32 out_data_format = 1;
+
+  UWORD8* ptr1 = (UWORD8*)kernels::allocate_temp_memory(
+      ctx,
+      ((batches * input_channels * input_height * input_width) + 8) *
+          sizeof(UWORD8));
+
+  UWORD8* ptr2 = (UWORD8*)kernels::allocate_temp_memory(
+      ctx,
+      ((out_channels * kernel_channels * kernel_height * kernel_width) + 8) *
+          sizeof(UWORD8));
+
+  UWORD8* pin = (UWORD8*)ALIGN_PTR(ptr1, 8);
+  UWORD8* pkernel = (UWORD8*)ALIGN_PTR(ptr2, 8);
+
+  WORD32 p_inp_shape[kNnlibMaxDim];
+  p_inp_shape[0] = input.size(0);
+  p_inp_shape[1] = input_channels;
+  p_inp_shape[2] = input_height;
+  p_inp_shape[3] = input_width;
+
+  WORD32 p_out_shape[kNnlibMaxDim];
+  p_out_shape[0] = input.size(0);
+  p_out_shape[1] = input_height;
+  p_out_shape[2] = input_width;
+  p_out_shape[3] = input_channels;
+
+  WORD32 p_permute_vec[kNnlibMaxDim] = {0, 2, 3, 1};
+
+  xa_nn_transpose_8_8(
+      (WORD8*)pin,
+      p_out_shape,
+      (WORD8*)p_inp,
+      p_inp_shape,
+      p_permute_vec,
+      kNnlibMaxDim,
+      kNnlibMaxDim);
+
+  WORD32 p_inp_shape1[kNnlibMaxDim];
+  p_inp_shape1[0] = out_channels;
+  p_inp_shape1[1] = kernel_channels;
+  p_inp_shape1[2] = kernel_height;
+  p_inp_shape1[3] = kernel_width;
+
+  WORD32 p_out_shape1[kNnlibMaxDim];
+  p_out_shape1[0] = out_channels;
+  p_out_shape1[1] = kernel_height;
+  p_out_shape1[2] = kernel_width;
+  p_out_shape1[3] = kernel_channels;
+
+  xa_nn_transpose_8_8(
+      (WORD8*)pkernel,
+      p_out_shape1,
+      (WORD8*)p_kernel,
+      p_inp_shape1,
+      p_permute_vec,
+      kNnlibMaxDim,
+      kNnlibMaxDim);
+
+  scratch_size = xa_nn_conv2d_getsize(
+      input_height,
+      input_width,
+      input_channels,
+      kernel_height,
+      kernel_width,
+      kernel_channels,
+      dilation_height,
+      dilation_width,
+      y_stride,
+      y_padding,
+      x_stride,
+      x_padding,
+      out_height,
+      out_width,
+      out_channels,
+      inp_precision,
+      kernel_precision,
+      out_data_format);
+
+  scratch_size = scratch_size < 0 ? 0 : scratch_size;
+
+  ptr_scratch = (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size);
+
+  p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8);
+
+  for (int _n = 0; _n < batches; _n++) {
+    UWORD8* in_batch = pin + _n * input_channels * input_height * input_width;
+    UWORD8* out_batch = p_out + _n * out_channels * out_height * out_width;
+
+    xa_nn_conv2d_per_chan_sym8sxasym8s(
+        (WORD8*)out_batch,
+        (WORD8*)in_batch,
         (WORD8*)pkernel,
-        p_out_shape1,
-        (WORD8*)p_kernel,
-        p_inp_shape1,
-        p_permute_vec,
-        kNnlibMaxDim,
-        kNnlibMaxDim);
-
-    scratch_size = xa_nn_conv2d_getsize(
+        p_bias,
         input_height,
         input_width,
         input_channels,
@@ -158,59 +192,20 @@ void xa_opt_quantized_conv_nchw_asym8uxsym8u_asym8u(
         kernel_channels,
         dilation_height,
         dilation_width,
-        y_stride,
-        y_padding,
+        out_channels,
         x_stride,
+        y_stride,
         x_padding,
+        y_padding,
         out_height,
         out_width,
-        out_channels,
-        inp_precision,
-        kernel_precision,
-        out_data_format);
-
-    scratch_size = scratch_size < 0 ? 0 : scratch_size;
-
-    ptr_scratch = (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size);
-
-    p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8);
-
-    for (int _n = 0; _n < batches; _n++) {
-      UWORD8* in_batch = pin + _n * input_channels * input_height * input_width;
-      UWORD8* out_batch = p_out + _n * out_channels * out_height * out_width;
-
-      xa_nn_conv2d_per_chan_sym8sxasym8s(
-          (WORD8*)out_batch,
-          (WORD8*)in_batch,
-          (WORD8*)pkernel,
-          p_bias,
-          input_height,
-          input_width,
-          input_channels,
-          kernel_height,
-          kernel_width,
-          kernel_channels,
-          dilation_height,
-          dilation_width,
-          out_channels,
-          x_stride,
-          y_stride,
-          x_padding,
-          y_padding,
-          out_height,
-          out_width,
-          input_zero_bias,
-          out_multiplier32,
-          out_shift32,
-          out_zero_bias,
-          out_data_format,
-          p_scratch);
-    }
-    return;
+        input_zero_bias,
+        out_multiplier32,
+        out_shift32,
+        out_zero_bias,
+        out_data_format,
+        p_scratch);
   }
-
-  // Depthwise convolutions are now handled by specialized operators
-  ET_CHECK_MSG(groups == 1, "Only groups=1 supported for regular convolution");
 }
 
 void quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out(
diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out.cpp
index 9416b8b7fd2..b1e023736cf 100644
--- a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out.cpp
+++ b/backends/cadence/hifi/operators/op_quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out.cpp
@@ -89,10 +89,44 @@ void xa_opt_quantized_conv_nhwc_asym8sxsym8s_asym8s(
 
   WORD32 scratch_size = 0;
 
-  if (groups == 1) {
-    WORD32 out_data_format = 1;
-
-    scratch_size = xa_nn_conv2d_getsize(
+  ET_CHECK_MSG(groups == 1, "Only groups=1 supported for regular convolution");
+  WORD32 out_data_format = 1;
+
+  scratch_size = xa_nn_conv2d_getsize(
+      input_height,
+      input_width,
+      input_channels,
+      kernel_height,
+      kernel_width,
+      kernel_channels,
+      dilation_height,
+      dilation_width,
+      y_stride,
+      y_padding,
+      x_stride,
+      x_padding,
+      out_height,
+      out_width,
+      out_channels,
+      inp_precision,
+      kernel_precision,
+      out_data_format);
+
+  scratch_size = scratch_size < 0 ? 0 : scratch_size;
+
+  ptr_scratch = (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size);
+
+  p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8);
+
+  for (int _n = 0; _n < batches; _n++) {
+    WORD8* in_batch = p_inp + _n * input_channels * input_height * input_width;
+    WORD8* out_batch = p_out + _n * out_channels * out_height * out_width;
+
+    xa_nn_conv2d_per_chan_sym8sxasym8s(
+        out_batch,
+        in_batch,
+        p_kernel,
+        p_bias,
         input_height,
         input_width,
         input_channels,
@@ -101,60 +135,20 @@ void xa_opt_quantized_conv_nhwc_asym8sxsym8s_asym8s(
         kernel_channels,
         dilation_height,
         dilation_width,
-        y_stride,
-        y_padding,
+        out_channels,
         x_stride,
+        y_stride,
         x_padding,
+        y_padding,
         out_height,
         out_width,
-        out_channels,
-        inp_precision,
-        kernel_precision,
-        out_data_format);
-
-    scratch_size = scratch_size < 0 ? 0 : scratch_size;
-
-    ptr_scratch = (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size);
-
-    p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8);
-
-    for (int _n = 0; _n < batches; _n++) {
-      WORD8* in_batch =
-          p_inp + _n * input_channels * input_height * input_width;
-      WORD8* out_batch = p_out + _n * out_channels * out_height * out_width;
-
-      xa_nn_conv2d_per_chan_sym8sxasym8s(
-          out_batch,
-          in_batch,
-          p_kernel,
-          p_bias,
-          input_height,
-          input_width,
-          input_channels,
-          kernel_height,
-          kernel_width,
-          kernel_channels,
-          dilation_height,
-          dilation_width,
-          out_channels,
-          x_stride,
-          y_stride,
-          x_padding,
-          y_padding,
-          out_height,
-          out_width,
-          input_zero_bias,
-          out_multiplier32,
-          out_shift32,
-          out_zero_bias,
-          out_data_format,
-          p_scratch);
-    }
-    return;
+        input_zero_bias,
+        out_multiplier32,
+        out_shift32,
+        out_zero_bias,
+        out_data_format,
+        p_scratch);
   }
-
-  // Depthwise convolutions are now handled by specialized operators
-  ET_CHECK_MSG(groups == 1, "Only groups=1 supported for regular convolution");
 }
 
 void quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out(
diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out.cpp
index 97f7967a2ba..0678cb1b821 100644
--- a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out.cpp
+++ b/backends/cadence/hifi/operators/op_quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out.cpp
@@ -89,10 +89,44 @@ void xa_opt_quantized_conv_nhwc_asym8uxsym8u_asym8u(
 
   WORD32 scratch_size = 0;
 
-  if (groups == 1) {
-    WORD32 out_data_format = 1;
-
-    scratch_size = xa_nn_conv2d_getsize(
+  ET_CHECK_MSG(groups == 1, "Only groups=1 supported for regular convolution");
+  WORD32 out_data_format = 1;
+
+  scratch_size = xa_nn_conv2d_getsize(
+      input_height,
+      input_width,
+      input_channels,
+      kernel_height,
+      kernel_width,
+      kernel_channels,
+      dilation_height,
+      dilation_width,
+      y_stride,
+      y_padding,
+      x_stride,
+      x_padding,
+      out_height,
+      out_width,
+      out_channels,
+      inp_precision,
+      kernel_precision,
+      out_data_format);
+
+  scratch_size = scratch_size < 0 ? 0 : scratch_size;
+
+  ptr_scratch = (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size);
+
+  p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8);
+
+  for (int _n = 0; _n < batches; _n++) {
+    UWORD8* in_batch = p_inp + _n * input_channels * input_height * input_width;
+    UWORD8* out_batch = p_out + _n * out_channels * out_height * out_width;
+
+    xa_nn_conv2d_per_chan_sym8sxasym8s(
+        (WORD8*)out_batch,
+        (WORD8*)in_batch,
+        (WORD8*)p_kernel,
+        p_bias,
         input_height,
         input_width,
         input_channels,
@@ -101,60 +135,20 @@ void xa_opt_quantized_conv_nhwc_asym8uxsym8u_asym8u(
         kernel_channels,
         dilation_height,
         dilation_width,
-        y_stride,
-        y_padding,
+        out_channels,
         x_stride,
+        y_stride,
         x_padding,
+        y_padding,
         out_height,
         out_width,
-        out_channels,
-        inp_precision,
-        kernel_precision,
-        out_data_format);
-
-    scratch_size = scratch_size < 0 ? 0 : scratch_size;
-
-    ptr_scratch = (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size);
-
-    p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8);
-
-    for (int _n = 0; _n < batches; _n++) {
-      UWORD8* in_batch =
-          p_inp + _n * input_channels * input_height * input_width;
-      UWORD8* out_batch = p_out + _n * out_channels * out_height * out_width;
-
-      xa_nn_conv2d_per_chan_sym8sxasym8s(
-          (WORD8*)out_batch,
-          (WORD8*)in_batch,
-          (WORD8*)p_kernel,
-          p_bias,
-          input_height,
-          input_width,
-          input_channels,
-          kernel_height,
-          kernel_width,
-          kernel_channels,
-          dilation_height,
-          dilation_width,
-          out_channels,
-          x_stride,
-          y_stride,
-          x_padding,
-          y_padding,
-          out_height,
-          out_width,
-          input_zero_bias,
-          out_multiplier32,
-          out_shift32,
-          out_zero_bias,
-          out_data_format,
-          p_scratch);
-    }
-    return;
+        input_zero_bias,
+        out_multiplier32,
+        out_shift32,
+        out_zero_bias,
+        out_data_format,
+        p_scratch);
   }
-
-  // Depthwise convolutions are now handled by specialized operators
-  ET_CHECK_MSG(groups == 1, "Only groups=1 supported for regular convolution");
 }
 
 void quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out(