diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml index 9cf6c22c6ca..87aa86154ee 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml @@ -25,3 +25,6 @@ conv2d_dw_output_tile: - NAME: conv2d_dw_output_tile_5x5_clamp OPERATOR: clamp(X, A, B) TILE_SIZE: 5 + - NAME: conv2d_dw_output_tile_3x3_b1x1 + BATCH_SIZE_X: 1 + BATCH_SIZE_Y: 1 diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_tiled.glsl new file mode 100644 index 00000000000..aee920bd84a --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_tiled.glsl @@ -0,0 +1,158 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} +#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} + +#define TILE_M4 ${TILE_M4} +#define TILE_K4 ${TILE_K4} +#define TILE_N4 ${TILE_N4} + +#define TILE_M ${TILE_M} +#define TILE_K ${TILE_K4 * 4} +#define TILE_N ${TILE_N4 * 4} + +${define_required_extensions(STORAGE, DTYPE)} + +layout(std430) buffer; + +#include "common.glslh" + +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "t_weight_packed", DTYPE, "texture2d")} +${layout_declare_tensor(B, "r", "t_bias", DTYPE, "texture2d")} + +${layout_declare_ubo(B, "ivec4", "in_sizes")} +${layout_declare_ubo(B, "ivec4", "out_sizes")} + +layout(push_constant) uniform restrict Block { + int stride_h; + int stride_w; + int padding_h; + int padding_w; + float out_min; + float out_max; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +${layout_declare_spec_const(C, "int", "stride_1_padding_0", "0")} +${layout_declare_spec_const(C, "int", "activation_type", "0")} + +#include "linear_fp_input_tile.glslh" +#include "linear_fp_packed_weight_tile_load.glslh" +#include "linear_fp_output_tile_fp_compute.glslh" + +void load_input_tile_with_checks( + out FPInputTile tile, + const int k4_start, + const int m_start, + const int K4, + const int M, + const int W_out, + const int W_in, + const int H_in) { + [[unroll]] for (int m = 0; m < TILE_M; ++m) { + [[unroll]] for (int k4 = 0; k4 < TILE_K4; ++k4) { + if (k4_start + k4 < K4 && m_start + m < M) { + if (stride_1_padding_0 != 0) { + const int spatial = m_start + m; + tile.data[m][k4] = + texelFetch(t_in, ivec3(spatial % W_out, spatial / W_out, k4_start + k4), 0); + } else { + const int out_spatial = m_start + m; + const int out_x = out_spatial % W_out; + const int out_y = out_spatial / W_out; + const int in_x = out_x * stride_w - padding_w; + const int in_y = out_y * stride_h - padding_h; + if (in_x >= 0 && in_x < W_in && in_y >= 0 && in_y < H_in) { + tile.data[m][k4] = + texelFetch(t_in, ivec3(in_x, in_y, k4_start + k4), 0); + } else { + tile.data[m][k4] = VEC4_T(0.0); + } + } + } else { + tile.data[m][k4] = VEC4_T(0.0); + } + } + } +} + +void store_output_tile_with_checks( + const FPOutTile out_tile, + const int n4_start, + const int m_start, + const int N4, + const int M, + const int W_out) { + [[unroll]] for (int m = 0; m < TILE_M; ++m) { + [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { + if (m_start + m < M && n4_start + n4 < N4) { + const int spatial = m_start + m; + VEC4_T texel = out_tile.data[m][n4]; + if (activation_type == 1) { + texel = max(texel, VEC4_T(0.0)); + } else if (activation_type == 2) { + texel = clamp(texel, VEC4_T(out_min), VEC4_T(out_max)); + } + imageStore(t_out, ivec3(spatial % W_out, spatial / W_out, n4_start + n4), texel); + } + } + } +} + +void main() { + const int tile_idx_n = int(gl_GlobalInvocationID.x); + const int tile_idx_m = int(gl_GlobalInvocationID.y); + + const int n4_start = tile_idx_n * TILE_N4; + const int m_start = tile_idx_m * TILE_M; + + const int W_in = in_sizes.x; + const int H_in = in_sizes.y; + const int K = in_sizes.z; + const int K4 = div_up_4(K); + + const int W_out = out_sizes.x; + const int H_out = out_sizes.y; + const int M = W_out * H_out; + const int N = out_sizes.z; + const int N4 = div_up_4(N); + + if (n4_start >= N4 || m_start >= M) { + return; + } + + FPOutTile out_tile; + initialize(out_tile); + + FPInputTile in_tile; + FPWeightTile w_tile; + + for (int k4 = 0; k4 < K4; k4++) { + load_input_tile_with_checks(in_tile, k4, m_start, K4, M, W_out, W_in, H_in); + load_packed_weight_tile_with_checks(w_tile, n4_start, k4, 0, N4, K4); + fp_accumulate_with_fp_weight(out_tile, in_tile, w_tile); + } + + // Apply bias + [[unroll]] for (int m = 0; m < TILE_M; ++m) { + [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { + if (n4_start + n4 < N4) { + out_tile.data[m][n4] += + texelFetch(t_bias, ivec2(n4_start + n4, 0), 0); + } + } + } + + store_output_tile_with_checks(out_tile, n4_start, m_start, N4, M, W_out); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_tiled.yaml new file mode 100644 index 00000000000..037dfc35c89 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_tiled.yaml @@ -0,0 +1,20 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +conv2d_pw_tiled: + parameter_names_with_default_values: + DTYPE: float + STORAGE: texture3d + TILE_M4: 1 + TILE_K4: 1 + TILE_N4: 1 + TILE_M: 4 + generate_variant_forall: + DTYPE: + - VALUE: float + - VALUE: half + shader_variants: + - NAME: conv2d_pw_tiled diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv2dDW.cpp b/backends/vulkan/runtime/graph/ops/impl/Conv2dDW.cpp new file mode 100644 index 00000000000..a9d8483b2e2 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Conv2dDW.cpp @@ -0,0 +1,327 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include + +#include + +#include + +namespace vkcompute { + +// +// Weight prepack +// + +ValueRef prepack_dw_weights(ComputeGraph& graph, const ValueRef vref) { + const auto original_sizes = graph.sizes_of(vref); + + int64_t out_channels_padded = + utils::align_up_4(utils::val_at(-4, original_sizes)); + int64_t height = utils::val_at(-2, original_sizes); + int64_t width = utils::val_at(-1, original_sizes); + + const std::vector final_sizes = { + 4, out_channels_padded / 4, height * width}; + + ValueRef v = graph.add_tensor( + final_sizes, + graph.dtype_of(vref), + utils::kTexture2D, + utils::kChannelsPacked); + + std::string kernel_name = "conv2d_dw_prepack_weights"; + add_dtype_suffix(kernel_name, graph.dtype_of(v)); + add_dtype_suffix(kernel_name, graph.get_staging_dtype_for(vref)); + + const auto original_sizes_pc = + utils::make_ivec4(original_sizes, /*reverse = */ true); + graph.prepack_nodes().emplace_back(new PrepackNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + graph.create_global_wg_size(v), + graph.create_local_wg_size(v), + vref, + v, + {}, + // Specialization constants + {graph.packed_dim_of(v)}, + {graph.sizes_pc_of(v), + PushConstantDataInfo(&original_sizes_pc, sizeof(original_sizes_pc))})); + + return v; +} + +// +// Shader selection +// + +std::string pick_conv2d_dw_shader( + ComputeGraph& graph, + const ValueRef weight_data, + const ValueRef out, + const bool stride_equals_dilation, + const bool clamp_out) { + std::string kernel_name = "conv2d_dw"; + kernel_name.reserve(kShaderNameReserve); + + const auto& weight_sizes = graph.get_tref(weight_data)->sizes; + const bool is_3x3 = weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3; + const bool is_5x5 = weight_sizes.at(2) == 5 && weight_sizes.at(3) == 5; + + if (!stride_equals_dilation) { + kernel_name += "_sned"; + } + + if (is_3x3) { + kernel_name += "_output_tile_3x3"; + if (stride_equals_dilation && graph.device_is_mali()) { + kernel_name += "_b1x1"; + } + } else if (is_5x5) { + kernel_name += "_output_tile_5x5"; + } + + if (clamp_out) { + kernel_name += "_clamp"; + } + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + + return kernel_name; +} + +// +// Workgroup size +// + +utils::uvec3 conv2d_dw_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + (void)resize_args; + const ValueRef out = args.at(0).refs.at(0); + + const bool uses_output_tile = + shader.kernel_name.find("_output_tile") != std::string::npos; + + if (uses_output_tile) { + const bool is_sned = shader.kernel_name.find("_sned") != std::string::npos; + + const utils::uvec3 image_extents = graph->create_global_wg_size(out); + + if (is_sned) { + // sned output_tile shaders: no batch division, just flatten W*H + return {image_extents[0] * image_extents[1], image_extents[2], 1}; + } + + // stride==dilation output_tile shaders: apply batch division + uint32_t batch_x = 4u; + uint32_t batch_y = 2u; + if (shader.kernel_name.find("_b1x1") != std::string::npos) { + batch_x = 1u; + batch_y = 1u; + } + + uint32_t scaled_x = utils::div_up(image_extents[0], batch_x); + uint32_t scaled_y = utils::div_up(image_extents[1], batch_y); + return {scaled_x * scaled_y, image_extents[2], 1}; + } + + // Base conv2d_dw shader: fully linearized dispatch + const utils::uvec3 base_extents = graph->create_global_wg_size(out); + return {base_extents[0] * base_extents[1] * base_extents[2], 1, 1}; +} + +utils::uvec3 conv2d_dw_local_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const utils::uvec3& global_workgroup_size, + const std::vector& args, + const std::vector& resize_args) { + (void)graph; + (void)shader; + (void)global_workgroup_size; + (void)args; + (void)resize_args; + return {64, 1, 1}; +} + +// +// Dispatch node +// + +struct Conv2dDWParams final { + utils::ivec2 overlay_region; + int in_group_size; +}; + +struct OutputParams final { + float out_min; + float out_max; +}; + +void add_conv2d_dw_node( + ComputeGraph& graph, + const ValueRef in, + const ValueRef arg_weight, + const ValueRef arg_bias, + const ValueRef weight_data, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef out, + const std::string& kernel_name, + const Kernel2dParams& kernel_params, + const Conv2dDWParams& extra_params, + const OutputParams& out_params) { + vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); + + vkapi::ParamsBindList param_buffers; + std::vector push_constants; + + const bool uses_output_tile = + kernel_name.find("_output_tile") != std::string::npos; + + if (uses_output_tile) { + const utils::ivec4 kernel_param_size_stride = { + kernel_params.kernel_size[0], + kernel_params.kernel_size[1], + kernel_params.stride[0], + kernel_params.stride[1]}; + + const utils::ivec4 kernel_param_pad_dial = { + kernel_params.padding[0], + kernel_params.padding[1], + kernel_params.dilation[0], + kernel_params.dilation[1]}; + + push_constants = { + graph.logical_limits_pc_of(out), + graph.sizes_pc_of(in), + PushConstantDataInfo( + &kernel_param_size_stride, sizeof(kernel_param_size_stride)), + PushConstantDataInfo( + &kernel_param_pad_dial, sizeof(kernel_param_pad_dial)), + PushConstantDataInfo( + &extra_params, sizeof(extra_params), sizeof(utils::ivec4)), + PushConstantDataInfo(&out_params, sizeof(out_params)), + }; + } else { + param_buffers = { + graph.logical_limits_ubo(out), + graph.sizes_ubo(in), + graph.create_params_buffer(kernel_params), + graph.create_params_buffer(extra_params), + graph.create_params_buffer(out_params), + }; + } + + // transposed is always false for depthwise, output_padding unused + ValueRef transposed_ref = graph.add_scalar(false); + ValueRef output_padding = graph.add_none(); + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + shader, + conv2d_dw_global_wg_size, + conv2d_dw_local_wg_size, + // Inputs and Outputs + {{out, vkapi::kWrite}, {{in, arg_weight, arg_bias}, vkapi::kRead}}, + // Shader params buffers + param_buffers, + // Push Constants + push_constants, + // Specialization Constants + {}, + // Resize Args + {weight_data, stride, padding, dilation, transposed_ref, output_padding}, + // Resizing Logic + resize_conv2d_node)); +} + +// +// High level operator impl +// + +void conv2d_dw_impl( + ComputeGraph& graph, + const ValueRef in, + const ValueRef weight_data, + const ValueRef bias, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef out, + const bool clamp_out, + const float out_min_val, + const float out_max_val) { + ValueRef arg_weight = prepack_dw_weights(graph, weight_data); + ValueRef arg_bias = prepack_biases( + graph, + bias, + weight_data, + /* transposed = */ false, + /* storage_type = */ utils::kTexture2D, + /* memory_layout = */ utils::kWidthPacked); + + const std::vector in_sizes = graph.sizes_of(in); + if (in_sizes.at(0) > 1) { + VK_THROW("conv2d: input batch size > 1 is not supported yet!"); + } + + check_conv_args(graph, in, out); + + Kernel2dParams kernel_params = create_kernel2d_params( + graph, + weight_data, + /*kernel_size_only = */ false, + stride, + padding, + dilation); + + const bool stride_equals_dilation = + (kernel_params.stride[0] == kernel_params.dilation[0] && + kernel_params.stride[1] == kernel_params.dilation[1]); + + const auto& overlay_region = utils::make_ivec2({ + kernel_params.kernel_size[0] + + (kernel_params.kernel_size[0] - 1) * (kernel_params.dilation[0] - 1), + kernel_params.kernel_size[1] + + (kernel_params.kernel_size[1] - 1) * (kernel_params.dilation[1] - 1), + }); + const auto weight_sizes = graph.sizes_of(weight_data); + const int32_t in_group_size = + utils::safe_downcast(utils::align_up_4(weight_sizes.at(1))); + Conv2dDWParams extra_params = {overlay_region, in_group_size}; + + OutputParams out_params = {out_min_val, out_max_val}; + + std::string kernel_name = pick_conv2d_dw_shader( + graph, weight_data, out, stride_equals_dilation, clamp_out); + + add_conv2d_dw_node( + graph, + in, + arg_weight, + arg_bias, + weight_data, + stride, + padding, + dilation, + out, + kernel_name, + kernel_params, + extra_params, + out_params); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv2dPW.cpp b/backends/vulkan/runtime/graph/ops/impl/Conv2dPW.cpp new file mode 100644 index 00000000000..2863d80aa0e --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Conv2dPW.cpp @@ -0,0 +1,255 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include + +#include + +namespace vkcompute { + +// +// Shader dispatch utilities +// + +void resize_conv2d_pw_tiled_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + const ValueRef out = args.at(0).refs.at(0); + const ValueRef self = args.at(1).refs.at(0); + + std::vector self_sizes = graph->sizes_of(self); + TensorRefPtr weight_ref = graph->get_tref(extra_args.at(0)); + const auto& weight_sizes = weight_ref->sizes; + + const auto stride_list = graph->get_int_list(extra_args.at(1)); + const auto padding_list = graph->get_int_list(extra_args.at(2)); + + const int64_t stride_h = stride_list->at(0); + const int64_t stride_w = stride_list->at(1); + const int64_t padding_h = padding_list->at(0); + const int64_t padding_w = padding_list->at(1); + + const int64_t in_h = self_sizes.at(self_sizes.size() - 2); + const int64_t in_w = self_sizes.at(self_sizes.size() - 1); + + // For 1x1 kernel with dilation=1: out = (in + 2*padding - 1) / stride + 1 + const int64_t out_h = (in_h + 2 * padding_h - 1) / stride_h + 1; + const int64_t out_w = (in_w + 2 * padding_w - 1) / stride_w + 1; + + std::vector new_out_sizes = self_sizes; + new_out_sizes.at(self_sizes.size() - 3) = weight_sizes.at(0); + new_out_sizes.at(self_sizes.size() - 2) = out_h; + new_out_sizes.at(self_sizes.size() - 1) = out_w; + + graph->virtual_resize(out, new_out_sizes); +} + +vkapi::ShaderInfo pick_conv2d_pw_tiled_shader( + ComputeGraph* graph, + const std::vector& args, + const std::vector& resize_args) { + (void)resize_args; + const ValueRef out = args.at(0).refs.at(0); + + std::string kernel_name = "conv2d_pw_tiled"; + kernel_name.reserve(kShaderNameReserve); + add_dtype_suffix(kernel_name, graph->dtype_of(out)); + return VK_KERNEL_FROM_STR(kernel_name); +} + +utils::uvec3 pick_conv2d_pw_tiled_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + (void)shader; + (void)resize_args; + const ValueRef out = args.at(0).refs.at(0); + uint32_t W = graph->size_at(-1, out); + uint32_t H = graph->size_at(-2, out); + uint32_t C_out = graph->size_at(-3, out); + uint32_t M = H * W; + uint32_t N4 = utils::div_up_4(C_out); + // TILE_N4=1, TILE_M=4 + return {N4, utils::div_up(M, 4u), 1}; +} + +// +// Prepack nodes +// + +struct PackParams { + int32_t N; + int32_t K; + int32_t B; + int32_t is_transposed; +}; + +ValueRef prepack_conv2d_pw_weight( + ComputeGraph& graph, + const ValueRef weight_data) { + const std::vector weight_sizes = graph.sizes_of(weight_data); + const int64_t N = weight_sizes.at(0); // C_out + const int64_t K = weight_sizes.at(1); // C_in + const int64_t N4 = utils::div_up(N, int64_t(4)); + const int64_t K4 = utils::div_up(K, int64_t(4)); + + const int64_t output_height = K4; + const int64_t output_width = N4 * 4 * 4; + + utils::StorageType weight_storage = utils::kTexture2D; + uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); + if (output_width / 4 > max_extent || + static_cast(output_height) > max_extent) { + weight_storage = utils::kBuffer; + } + + ValueRef packed_weight = graph.add_tensor( + {output_height, output_width}, + graph.dtype_of(weight_data), + weight_storage, + utils::kWidthPacked); + + utils::uvec3 global_wg_size = { + utils::safe_downcast(N4), + utils::safe_downcast(K4), + 1u}; + + PackParams pack_params{ + utils::safe_downcast(N), utils::safe_downcast(K), 1, 1}; + + std::string pack_kernel_name = "pack_fp_linear_weight"; + add_storage_type_suffix(pack_kernel_name, weight_storage); + add_dtype_suffix(pack_kernel_name, graph.dtype_of(weight_data)); + + graph.prepack_nodes().emplace_back(new PrepackNode( + graph, + VK_KERNEL_FROM_STR(pack_kernel_name), + global_wg_size, + graph.create_local_wg_size(global_wg_size), + weight_data, + packed_weight, + {}, + {}, + {PushConstantDataInfo(&pack_params, sizeof(PackParams))})); + + return packed_weight; +} + +// +// Dispatch nodes +// + +void add_conv2d_pw_tiled_node( + ComputeGraph& graph, + const ValueRef in, + const ValueRef packed_weight, + const ValueRef packed_bias, + const ValueRef stride, + const ValueRef padding, + const ValueRef out, + const ValueRef weight_data, + const bool clamp_out, + const float out_min_val, + const float out_max_val) { + int32_t stride_h, stride_w, padding_h, padding_w; + { + const auto stride_list = graph.get_int_list(stride); + const auto padding_list = graph.get_int_list(padding); + stride_h = utils::safe_downcast(stride_list->at(0)); + stride_w = utils::safe_downcast(stride_list->at(1)); + padding_h = utils::safe_downcast(padding_list->at(0)); + padding_w = utils::safe_downcast(padding_list->at(1)); + } + + bool s1p0 = + stride_h == 1 && stride_w == 1 && padding_h == 0 && padding_w == 0; + + utils::ivec4 stride_padding{stride_h, stride_w, padding_h, padding_w}; + + struct ClampParams final { + float out_min; + float out_max; + }; + ClampParams clamp_params{out_min_val, out_max_val}; + + ValueRef clamp_out_ref = graph.add_scalar(clamp_out); + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + pick_conv2d_pw_tiled_shader, + pick_conv2d_pw_tiled_global_wg_size, + pick_hw_square_wg_size, + // Inputs and Outputs + {{out, vkapi::kWrite}, {{in, packed_weight, packed_bias}, vkapi::kRead}}, + // Shader params buffers + {graph.sizes_ubo(in), graph.sizes_ubo(out)}, + // Push Constants + {PushConstantDataInfo(&stride_padding, sizeof(stride_padding)), + PushConstantDataInfo(&clamp_params, sizeof(clamp_params))}, + // Specialization Constants + // activation_type: 0=none, 1=relu, 2=clamp + {s1p0 ? 1 : 0, clamp_out ? 2 : 0}, + // Resize Args + {weight_data, stride, padding, clamp_out_ref}, + // Resizing Logic + resize_conv2d_pw_tiled_node)); +} + +// +// High level operator impl +// + +void conv2d_pw_impl( + ComputeGraph& graph, + const ValueRef in, + const ValueRef weight_data, + const ValueRef bias, + const ValueRef stride, + const ValueRef padding, + const ValueRef out, + const bool transposed_val, + const bool clamp_out, + const float out_min_val, + const float out_max_val) { + ValueRef packed_weight = prepack_conv2d_pw_weight(graph, weight_data); + + ValueRef packed_bias = prepack_biases( + graph, + bias, + weight_data, + transposed_val, + utils::kTexture2D, + utils::kWidthPacked); + + check_conv_args(graph, in, out); + + const std::vector in_sizes = graph.sizes_of(in); + if (in_sizes.at(0) > 1) { + VK_THROW("conv2d: input batch size > 1 is not supported yet!"); + } + + add_conv2d_pw_tiled_node( + graph, + in, + packed_weight, + packed_bias, + stride, + padding, + out, + weight_data, + clamp_out, + out_min_val, + out_max_val); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index 2bf3f8f726d..077ce285cfc 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -6,6 +6,8 @@ * LICENSE file in the root directory of this source tree. */ +#include + #include #include @@ -137,18 +139,6 @@ vkapi::ShaderInfo get_conv2d_shader( switch (method) { case Conv2dMethod::Depthwise: kernel_name = "conv2d_dw"; - if (!prepack_weights) { - if (!stride_equals_dilation) { - kernel_name += "_sned"; - } - const auto& weight_sizes = graph.get_tref(weight)->sizes; - if (weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3) { - kernel_name += "_output_tile_3x3"; - } - if (weight_sizes.at(2) == 5 && weight_sizes.at(3) == 5) { - kernel_name += "_output_tile_5x5"; - } - } break; case Conv2dMethod::Pointwise: if (prepack_weights) { @@ -294,17 +284,6 @@ Conv2dMethod get_conv2d_method( return Conv2dMethod::SlidingWindow; } -utils::uvec2 get_conv2d_dw_dispatch_divisor( - const std::vector& weight_sizes) { - if (weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3) { - return {4u, 2u}; - } - if (weight_sizes.at(2) == 5 && weight_sizes.at(3) == 5) { - return {4u, 2u}; - } - return {4u, 2u}; -} - utils::uvec3 create_conv2d_global_wg_size( ComputeGraph& graph, const Conv2dMethod method, @@ -317,14 +296,6 @@ utils::uvec3 create_conv2d_global_wg_size( utils::div_up(image_extents[0u], 1u), utils::div_up(image_extents[1u], 4u), image_extents[2u]}; - } else if (method == Conv2dMethod::Depthwise && stride_equals_dilation) { - const utils::uvec3 image_extents = graph.create_global_wg_size(out); - const utils::uvec2 div = - get_conv2d_dw_dispatch_divisor(graph.get_tref(weight_data)->sizes); - return { - utils::div_up(image_extents[0], div[0]), - utils::div_up(image_extents[1], div[1]), - image_extents[2]}; } else { return graph.create_global_wg_size(out); } @@ -341,10 +312,7 @@ utils::uvec3 conv2d_global_wg_size( // Determine method from shader name Conv2dMethod method; - if (shader.kernel_name.find("conv2d_dw") != std::string::npos) { - method = Conv2dMethod::Depthwise; - } else if ( - shader.kernel_name.find("conv2d_pw") != std::string::npos || + if (shader.kernel_name.find("conv2d_pw") != std::string::npos || (shader.kernel_name.find("conv2d") != std::string::npos && shader.kernel_name.find("conv_transpose2d") == std::string::npos)) { // Check if it's pointwise by examining weight sizes @@ -367,21 +335,7 @@ utils::uvec3 conv2d_global_wg_size( utils::uvec3 wg_size = create_conv2d_global_wg_size( *graph, method, out, weight_data, stride_equals_dilation); - if (method == Conv2dMethod::Depthwise) { - // The output_tile shaders (conv2d_dw_output_tile, - // conv2d_dw_sned_output_tile) use a 2D dispatch: (x_tile, y_tile) packed - // into glb_x, channel in glb_y. The base conv2d_dw shader uses a 1D - // dispatch: all (x, y, channel) packed into glb_x. For the base shader, we - // must use {W*H*C_packed, 1, 1}. - const bool uses_output_tile = - shader.kernel_name.find("_output_tile") != std::string::npos; - if (uses_output_tile) { - wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1}; - } else { - const utils::uvec3 base_extents = graph->create_global_wg_size(out); - wg_size = {base_extents[0] * base_extents[1] * base_extents[2], 1, 1}; - } - } else if (method == Conv2dMethod::Pointwise) { + if (method == Conv2dMethod::Pointwise) { wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1}; if (shader.kernel_name.find("s1p0") != std::string::npos) { @@ -404,10 +358,7 @@ utils::uvec3 conv2d_local_wg_size( // Determine method from shader name Conv2dMethod method; - if (shader.kernel_name.find("conv2d_dw") != std::string::npos) { - method = Conv2dMethod::Depthwise; - } else if ( - shader.kernel_name.find("conv2d_pw") != std::string::npos || + if (shader.kernel_name.find("conv2d_pw") != std::string::npos || (shader.kernel_name.find("conv2d") != std::string::npos && shader.kernel_name.find("conv_transpose2d") == std::string::npos)) { method = Conv2dMethod::Pointwise; @@ -425,8 +376,6 @@ utils::uvec3 conv2d_local_wg_size( local_wg_size_y = 2; } return {64 / local_wg_size_y, local_wg_size_y, 1}; - } else if (method == Conv2dMethod::Depthwise) { - return {64, 1, 1}; } else { return graph->create_local_wg_size(global_workgroup_size); } @@ -481,6 +430,37 @@ void add_conv2d_node( const Conv2dMethod method = get_conv2d_method(graph, weight_data, groups_val, transposed_val); + // Use tiled path for all pointwise conv2d + if (method == Conv2dMethod::Pointwise) { + return conv2d_pw_impl( + graph, + in, + weight_data, + bias, + stride, + padding, + out, + transposed_val, + clamp_out, + out_min_val, + out_max_val); + } + + if (method == Conv2dMethod::Depthwise) { + return conv2d_dw_impl( + graph, + in, + weight_data, + bias, + stride, + padding, + dilation, + out, + clamp_out, + out_min_val, + out_max_val); + } + ValueRef arg_weight = prepack_weights(graph, weight_data, method); ValueRef arg_bias = prepack_biases( graph, @@ -529,101 +509,13 @@ void add_conv2d_node( stride_equals_dilation, stride_1_padding_0); - utils::uvec3 wg_size = create_conv2d_global_wg_size( - graph, method, out, weight_data, stride_equals_dilation); - - utils::uvec3 local_wg_size; - if (method == Conv2dMethod::Depthwise || method == Conv2dMethod::Pointwise) { - wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1}; - } - - if (method == Conv2dMethod::Pointwise) { - uint32_t local_wg_size_y = 1; - if (wg_size[1] % 8 == 0) { - local_wg_size_y = 8; - } else if (wg_size[1] % 4 == 0) { - local_wg_size_y = 4; - } else if (wg_size[1] % 2 == 0) { - local_wg_size_y = 2; - } - local_wg_size = {64 / local_wg_size_y, local_wg_size_y, 1}; - } else if (method == Conv2dMethod::Depthwise) { - local_wg_size = {64, 1, 1}; - } else { - local_wg_size = graph.create_local_wg_size(wg_size); - } - - vkapi::ParamsBindList param_buffers; - std::vector push_constants; - if (method == Conv2dMethod::Pointwise) { - const utils::ivec4 kernel_param_stride_pad = { - kernel_params.stride[0], - kernel_params.stride[1], - kernel_params.padding[0], - kernel_params.padding[1], - }; - - struct Conv2dPWParams final { - int in_group_size; - int dummy_padding; - OutputParams out_params; - } param{extra_params.in_group_size, 0, out_params}; - - push_constants = { - graph.logical_limits_pc_of(out), - PushConstantDataInfo( - &kernel_param_stride_pad, sizeof(kernel_param_stride_pad)), - PushConstantDataInfo(¶m, sizeof(param)), - }; - } else if (method == Conv2dMethod::Depthwise) { - // output_tile variants use push constants; the base conv2d_dw shader uses - // UBOs. Distinguish by checking if "_output_tile" is in the shader name. - const bool uses_output_tile = - shader.kernel_name.find("_output_tile") != std::string::npos; - - if (uses_output_tile) { - const utils::ivec4 kernel_param_size_stride = { - kernel_params.kernel_size[0], - kernel_params.kernel_size[1], - kernel_params.stride[0], - kernel_params.stride[1]}; - - const utils::ivec4 kernel_param_pad_dial = { - kernel_params.padding[0], - kernel_params.padding[1], - kernel_params.dilation[0], - kernel_params.dilation[1]}; - - push_constants = { - graph.logical_limits_pc_of(out), - graph.sizes_pc_of(in), - PushConstantDataInfo( - &kernel_param_size_stride, sizeof(kernel_param_size_stride)), - PushConstantDataInfo( - &kernel_param_pad_dial, sizeof(kernel_param_pad_dial)), - PushConstantDataInfo( - &extra_params, sizeof(extra_params), sizeof(utils::ivec4)), - PushConstantDataInfo(&out_params, sizeof(out_params)), - }; - } else { - // Base conv2d_dw shader uses UBOs, same as SlidingWindow case - param_buffers = { - graph.logical_limits_ubo(out), - graph.sizes_ubo(in), - graph.create_params_buffer(kernel_params), - graph.create_params_buffer(extra_params), - graph.create_params_buffer(out_params), - }; - } - } else { - param_buffers = { - graph.logical_limits_ubo(out), - graph.sizes_ubo(in), - graph.create_params_buffer(kernel_params), - graph.create_params_buffer(extra_params), - graph.create_params_buffer(out_params), - }; - } + vkapi::ParamsBindList param_buffers = { + graph.logical_limits_ubo(out), + graph.sizes_ubo(in), + graph.create_params_buffer(kernel_params), + graph.create_params_buffer(extra_params), + graph.create_params_buffer(out_params), + }; graph.execute_nodes().emplace_back(new DynamicDispatchNode( graph, @@ -635,7 +527,7 @@ void add_conv2d_node( // Shader params buffers param_buffers, // Push Constants - push_constants, + {}, // Specialization Constants {utils::safe_downcast(groups_val)}, // Resize Args diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.h b/backends/vulkan/runtime/graph/ops/impl/Convolution.h new file mode 100644 index 00000000000..f49e7efcfe7 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace vkcompute { + +ValueRef prepack_biases( + ComputeGraph& graph, + const ValueRef vref, + const ValueRef weight, + const bool transposed, + const utils::StorageType storage_type, + const utils::GPUMemoryLayout memory_layout); + +void check_conv_args( + ComputeGraph& graph, + const ValueRef in, + const ValueRef out); + +void conv2d_pw_impl( + ComputeGraph& graph, + const ValueRef in, + const ValueRef weight_data, + const ValueRef bias, + const ValueRef stride, + const ValueRef padding, + const ValueRef out, + const bool transposed_val, + const bool clamp_out, + const float out_min_val, + const float out_max_val); + +void conv2d_dw_impl( + ComputeGraph& graph, + const ValueRef in, + const ValueRef weight_data, + const ValueRef bias, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef out, + const bool clamp_out, + const float out_min_val, + const float out_max_val); + +void resize_conv2d_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args); + +} // namespace vkcompute diff --git a/backends/vulkan/test/custom_ops/impl/TestConv2dDw.cpp b/backends/vulkan/test/custom_ops/impl/TestConv2dDw.cpp new file mode 100644 index 00000000000..f7454b6b93a --- /dev/null +++ b/backends/vulkan/test/custom_ops/impl/TestConv2dDw.cpp @@ -0,0 +1,348 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include +#include +#include + +namespace vkcompute { + +// +// Local copies of Conv2dDW internals, extended with impl_selector support. +// These mirror the logic in Conv2dDW.cpp but allow forcing a specific tile size +// variant via the impl_selector string. +// + +struct Conv2dDWParams final { + utils::ivec2 overlay_region; + int in_group_size; +}; + +struct OutputParams final { + float out_min; + float out_max; +}; + +static std::string pick_conv2d_dw_shader_with_selector( + ComputeGraph& graph, + const ValueRef weight_data, + const ValueRef out, + const bool stride_equals_dilation, + const bool clamp_out, + const std::string& impl_selector) { + std::string kernel_name = "conv2d_dw"; + kernel_name.reserve(40); + + const auto& weight_sizes = graph.get_tref(weight_data)->sizes; + const bool is_3x3 = weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3; + const bool is_5x5 = weight_sizes.at(2) == 5 && weight_sizes.at(3) == 5; + + if (!stride_equals_dilation) { + kernel_name += "_sned"; + } + + if (is_3x3) { + kernel_name += "_output_tile_3x3"; + if (impl_selector == "b1x1") { + kernel_name += "_b1x1"; + } else if (impl_selector == "b4x2") { + // b4x2 is the default (no suffix) + } else { + // Auto-selection: use b1x1 on Mali + if (stride_equals_dilation && graph.device_is_mali()) { + kernel_name += "_b1x1"; + } + } + } else if (is_5x5) { + kernel_name += "_output_tile_5x5"; + // No b1x1 variant for 5x5; impl_selector is ignored for batch size + } + + if (clamp_out) { + kernel_name += "_clamp"; + } + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + + return kernel_name; +} + +static utils::uvec3 conv2d_dw_global_wg_size_fn( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + (void)resize_args; + const ValueRef out = args.at(0).refs.at(0); + + const bool uses_output_tile = + shader.kernel_name.find("_output_tile") != std::string::npos; + + if (uses_output_tile) { + const bool is_sned = shader.kernel_name.find("_sned") != std::string::npos; + const utils::uvec3 image_extents = graph->create_global_wg_size(out); + + if (is_sned) { + return {image_extents[0] * image_extents[1], image_extents[2], 1}; + } + + uint32_t batch_x = 4u; + uint32_t batch_y = 2u; + if (shader.kernel_name.find("_b1x1") != std::string::npos) { + batch_x = 1u; + batch_y = 1u; + } + + uint32_t scaled_x = utils::div_up(image_extents[0], batch_x); + uint32_t scaled_y = utils::div_up(image_extents[1], batch_y); + return {scaled_x * scaled_y, image_extents[2], 1}; + } + + const utils::uvec3 base_extents = graph->create_global_wg_size(out); + return {base_extents[0] * base_extents[1] * base_extents[2], 1, 1}; +} + +static utils::uvec3 conv2d_dw_local_wg_size_fn( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const utils::uvec3& global_workgroup_size, + const std::vector& args, + const std::vector& resize_args) { + (void)graph; + (void)shader; + (void)global_workgroup_size; + (void)args; + (void)resize_args; + return {64, 1, 1}; +} + +static ValueRef prepack_dw_weights(ComputeGraph& graph, const ValueRef vref) { + const auto original_sizes = graph.sizes_of(vref); + + int64_t out_channels_padded = + utils::align_up_4(utils::val_at(-4, original_sizes)); + int64_t height = utils::val_at(-2, original_sizes); + int64_t width = utils::val_at(-1, original_sizes); + + const std::vector final_sizes = { + 4, out_channels_padded / 4, height * width}; + + ValueRef v = graph.add_tensor( + final_sizes, + graph.dtype_of(vref), + utils::kTexture2D, + utils::kChannelsPacked); + + std::string kernel_name = "conv2d_dw_prepack_weights"; + add_dtype_suffix(kernel_name, graph.dtype_of(v)); + add_dtype_suffix(kernel_name, graph.get_staging_dtype_for(vref)); + + const auto original_sizes_pc = + utils::make_ivec4(original_sizes, /*reverse=*/true); + graph.prepack_nodes().emplace_back(new PrepackNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + graph.create_global_wg_size(v), + graph.create_local_wg_size(v), + vref, + v, + {}, + {graph.packed_dim_of(v)}, + {graph.sizes_pc_of(v), + PushConstantDataInfo(&original_sizes_pc, sizeof(original_sizes_pc))})); + + return v; +} + +static void conv2d_dw_with_selector( + ComputeGraph& graph, + const ValueRef in, + const ValueRef weight_data, + const ValueRef bias, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef out, + const std::string& impl_selector) { + ValueRef arg_weight = prepack_dw_weights(graph, weight_data); + ValueRef arg_bias = prepack_biases( + graph, + bias, + weight_data, + /*transposed=*/false, + /*storage_type=*/utils::kTexture2D, + /*memory_layout=*/utils::kWidthPacked); + + check_conv_args(graph, in, out); + + Kernel2dParams kernel_params = create_kernel2d_params( + graph, + weight_data, + /*kernel_size_only=*/false, + stride, + padding, + dilation); + + const bool stride_equals_dilation = + (kernel_params.stride[0] == kernel_params.dilation[0] && + kernel_params.stride[1] == kernel_params.dilation[1]); + + const auto& overlay_region = utils::make_ivec2({ + kernel_params.kernel_size[0] + + (kernel_params.kernel_size[0] - 1) * (kernel_params.dilation[0] - 1), + kernel_params.kernel_size[1] + + (kernel_params.kernel_size[1] - 1) * (kernel_params.dilation[1] - 1), + }); + const auto weight_sizes = graph.sizes_of(weight_data); + const int32_t in_group_size = + utils::safe_downcast(utils::align_up_4(weight_sizes.at(1))); + Conv2dDWParams extra_params = {overlay_region, in_group_size}; + + OutputParams out_params = { + std::numeric_limits::lowest(), std::numeric_limits::max()}; + + std::string kernel_name = pick_conv2d_dw_shader_with_selector( + graph, + weight_data, + out, + stride_equals_dilation, + /*clamp_out=*/false, + impl_selector); + + vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); + + vkapi::ParamsBindList param_buffers; + std::vector push_constants; + + const bool uses_output_tile = + kernel_name.find("_output_tile") != std::string::npos; + + if (uses_output_tile) { + const utils::ivec4 kernel_param_size_stride = { + kernel_params.kernel_size[0], + kernel_params.kernel_size[1], + kernel_params.stride[0], + kernel_params.stride[1]}; + + const utils::ivec4 kernel_param_pad_dial = { + kernel_params.padding[0], + kernel_params.padding[1], + kernel_params.dilation[0], + kernel_params.dilation[1]}; + + push_constants = { + graph.logical_limits_pc_of(out), + graph.sizes_pc_of(in), + PushConstantDataInfo( + &kernel_param_size_stride, sizeof(kernel_param_size_stride)), + PushConstantDataInfo( + &kernel_param_pad_dial, sizeof(kernel_param_pad_dial)), + PushConstantDataInfo( + &extra_params, sizeof(extra_params), sizeof(utils::ivec4)), + PushConstantDataInfo(&out_params, sizeof(out_params)), + }; + } else { + param_buffers = { + graph.logical_limits_ubo(out), + graph.sizes_ubo(in), + graph.create_params_buffer(kernel_params), + graph.create_params_buffer(extra_params), + graph.create_params_buffer(out_params), + }; + } + + ValueRef transposed_ref = graph.add_scalar(false); + ValueRef output_padding = graph.add_none(); + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + shader, + conv2d_dw_global_wg_size_fn, + conv2d_dw_local_wg_size_fn, + {{out, vkapi::kWrite}, {{in, arg_weight, arg_bias}, vkapi::kRead}}, + param_buffers, + push_constants, + {}, + {weight_data, stride, padding, dilation, transposed_ref, output_padding}, + resize_conv2d_node)); +} + +void test_conv2d_dw(ComputeGraph& graph, const std::vector& args) { + // args[0] = input [N, C, H, W] + // args[1] = weight [C, 1, K_h, K_w] (constant) + // args[2] = bias (constant, or none) + // args[3] = stride_h (int) + // args[4] = stride_w (int) + // args[5] = padding_h (int) + // args[6] = padding_w (int) + // args[7] = dilation_h (int) + // args[8] = dilation_w (int) + // args[9] = impl_selector (string) + // args[10] = output + const ValueRef input = args.at(0); + const ValueRef weight = args.at(1); + const ValueRef bias = args.at(2); + const int64_t stride_h = graph.extract_scalar(args.at(3)); + const int64_t stride_w = graph.extract_scalar(args.at(4)); + const int64_t padding_h = graph.extract_scalar(args.at(5)); + const int64_t padding_w = graph.extract_scalar(args.at(6)); + const int64_t dilation_h = graph.extract_scalar(args.at(7)); + const int64_t dilation_w = graph.extract_scalar(args.at(8)); + const std::string impl_selector = graph.extract_string(args.at(9)); + const ValueRef out = args.at(10); + + ValueRef stride = + graph.add_scalar_list(std::vector{stride_h, stride_w}); + ValueRef padding = graph.add_scalar_list( + std::vector{padding_h, padding_w}); + ValueRef dilation = graph.add_scalar_list( + std::vector{dilation_h, dilation_w}); + + if (impl_selector.empty()) { + // Auto-selection: delegate to aten.convolution.default + const int64_t channels = graph.sizes_of(input).at(1); + ValueRef transposed = graph.add_scalar(false); + ValueRef output_padding = + graph.add_scalar_list(std::vector{0, 0}); + ValueRef groups = graph.add_scalar(channels); + + VK_GET_OP_FN("aten.convolution.default") + (graph, + {input, + weight, + bias, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + out}); + } else { + // Forced variant: build the dispatch directly with impl_selector + conv2d_dw_with_selector( + graph, + input, + weight, + bias, + stride, + padding, + dilation, + out, + impl_selector); + } +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(test_etvk.test_conv2d_dw.default, test_conv2d_dw); +} + +} // namespace vkcompute diff --git a/backends/vulkan/test/custom_ops/impl/TestConv2dPw.cpp b/backends/vulkan/test/custom_ops/impl/TestConv2dPw.cpp new file mode 100644 index 00000000000..7c640d7a1ac --- /dev/null +++ b/backends/vulkan/test/custom_ops/impl/TestConv2dPw.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +namespace vkcompute { + +void test_conv2d_pw(ComputeGraph& graph, const std::vector& args) { + // args[0] = input [N, C_in, H, W] + // args[1] = weight [C_out, C_in, 1, 1] (constant) + // args[2] = bias (constant, or none) + // args[3] = impl_selector (string) + // args[4] = output [N, C_out, H, W] + const ValueRef input = args.at(0); + const ValueRef weight = args.at(1); + const ValueRef bias = args.at(2); + const ValueRef impl_selector_str = args.at(3); + const ValueRef out = args.at(4); + + std::string impl_selector = graph.extract_string(impl_selector_str); + (void)impl_selector; // Reserved for future use + + // Create fixed pointwise conv parameters + ValueRef stride = graph.add_scalar_list(std::vector{1, 1}); + ValueRef padding = graph.add_scalar_list(std::vector{0, 0}); + ValueRef dilation = + graph.add_scalar_list(std::vector{1, 1}); + ValueRef transposed = graph.add_scalar(false); + ValueRef output_padding = + graph.add_scalar_list(std::vector{0, 0}); + ValueRef groups = graph.add_scalar(1); + + // Call aten.convolution.default with all 10 args: + // input, weight, bias, stride, padding, dilation, transposed, + // output_padding, groups, output + VK_GET_OP_FN("aten.convolution.default") + (graph, + {input, + weight, + bias, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + out}); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(test_etvk.test_conv2d_pw.default, test_conv2d_pw); +} + +} // namespace vkcompute diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl index fef8994718f..84432bce30b 100644 --- a/backends/vulkan/test/custom_ops/targets.bzl +++ b/backends/vulkan/test/custom_ops/targets.bzl @@ -100,3 +100,5 @@ def define_common_targets(is_fbcode = False): define_custom_op_test_binary("test_q8ta_linear") define_custom_op_test_binary("test_q8ta_conv2d_transposed") define_custom_op_test_binary("test_mm") + define_custom_op_test_binary("test_conv2d_pw") + define_custom_op_test_binary("test_conv2d_dw") diff --git a/backends/vulkan/test/custom_ops/test_conv2d_dw.cpp b/backends/vulkan/test/custom_ops/test_conv2d_dw.cpp new file mode 100644 index 00000000000..9c48c320d62 --- /dev/null +++ b/backends/vulkan/test/custom_ops/test_conv2d_dw.cpp @@ -0,0 +1,435 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include + +#include +#include + +#include "conv2d_utils.h" +#include "utils.h" + +using namespace executorch::vulkan::prototyping; +using namespace vkcompute; + +static constexpr int64_t kRefDimSizeLimit = 64; + +struct InputDims { + int64_t N; + int64_t C; + int64_t H; + int64_t W; + + InputDims(int64_t n, int64_t c, int64_t h, int64_t w) + : N(n), C(c), H(h), W(w) {} +}; + +struct Conv2dDwConfig { + InputDims dims; + KernelSize kernel; + Stride stride; + Padding padding; + Dilation dilation; + bool has_bias; +}; + +static int64_t calc_out_size( + int64_t in_size, + int64_t kernel_size, + int64_t stride, + int64_t padding, + int64_t dilation) { + return (in_size + 2 * padding - dilation * (kernel_size - 1) - 1) / stride + + 1; +} + +static TestCase create_conv2d_dw_test_case( + const Conv2dDwConfig& config, + vkapi::ScalarType dtype, + utils::StorageType storage_type, + utils::GPUMemoryLayout memory_layout, + const std::string& impl_selector = "") { + TestCase test_case; + + bool is_perf = config.dims.C > kRefDimSizeLimit || + config.dims.H > kRefDimSizeLimit || config.dims.W > kRefDimSizeLimit; + + std::string prefix = is_perf ? "PERF" : "ACCU"; + std::string storage_str = storage_type_abbrev(storage_type); + std::string layout_str = layout_abbrev(memory_layout); + std::string dtype_str = (dtype == vkapi::kHalf) ? "f16" : "f32"; + std::string bias_str = config.has_bias ? "+bias" : ""; + + int64_t H_out = calc_out_size( + config.dims.H, + config.kernel.h, + config.stride.h, + config.padding.h, + config.dilation.h); + int64_t W_out = calc_out_size( + config.dims.W, + config.kernel.w, + config.stride.w, + config.padding.w, + config.dilation.w); + + std::string shape = "[" + std::to_string(config.dims.N) + "," + + std::to_string(config.dims.C) + "," + std::to_string(config.dims.H) + + "," + std::to_string(config.dims.W) + "] k" + + std::to_string(config.kernel.h) + "x" + std::to_string(config.kernel.w) + + " s" + std::to_string(config.stride.h) + " p" + + std::to_string(config.padding.h) + " d" + + std::to_string(config.dilation.h) + "->[" + + std::to_string(config.dims.N) + "," + std::to_string(config.dims.C) + + "," + std::to_string(H_out) + "," + std::to_string(W_out) + "]"; + + std::string selector_str = + impl_selector.empty() ? "" : " [" + impl_selector + "]"; + + std::string name = prefix + " conv2d_dw" + bias_str + " " + shape + " " + + storage_str + "(" + layout_str + ") " + dtype_str + selector_str; + + test_case.set_name(name); + test_case.set_operator_name("test_etvk.test_conv2d_dw.default"); + + // Input tensor [N, C, H, W] + ValueSpec input( + {config.dims.N, config.dims.C, config.dims.H, config.dims.W}, + dtype, + storage_type, + memory_layout, + DataGenType::RANDOM); + + // Weight tensor [C, 1, K_h, K_w] - constant + ValueSpec weight( + {config.dims.C, 1, config.kernel.h, config.kernel.w}, + dtype, + storage_type, + memory_layout, + DataGenType::RANDOM); + weight.set_constant(true); + + test_case.add_input_spec(input); + test_case.add_input_spec(weight); + + // Bias (or none) + if (config.has_bias) { + ValueSpec bias( + {config.dims.C}, + dtype, + storage_type, + memory_layout, + DataGenType::RANDOM); + bias.set_constant(true); + test_case.add_input_spec(bias); + } else { + ValueSpec none_bias(static_cast(0)); + none_bias.set_none(true); + test_case.add_input_spec(none_bias); + } + + // stride_h, stride_w, padding_h, padding_w, dilation_h, dilation_w + test_case.add_input_spec(ValueSpec(static_cast(config.stride.h))); + test_case.add_input_spec(ValueSpec(static_cast(config.stride.w))); + test_case.add_input_spec(ValueSpec(static_cast(config.padding.h))); + test_case.add_input_spec(ValueSpec(static_cast(config.padding.w))); + test_case.add_input_spec(ValueSpec(static_cast(config.dilation.h))); + test_case.add_input_spec(ValueSpec(static_cast(config.dilation.w))); + + // impl_selector string + test_case.add_input_spec(ValueSpec::make_string(impl_selector)); + + // Output tensor [N, C, H_out, W_out] + ValueSpec output( + {config.dims.N, config.dims.C, H_out, W_out}, + dtype, + storage_type, + memory_layout, + DataGenType::ZEROS); + test_case.add_output_spec(output); + + if (dtype == vkapi::kHalf) { + test_case.set_abs_tolerance(1e-1f); + test_case.set_rel_tolerance(1e-2f); + } else { + test_case.set_abs_tolerance(1e-3f); + test_case.set_rel_tolerance(1e-3f); + } + + test_case.set_shader_filter({"nchw_to", "to_nchw", "view_copy"}); + + return test_case; +} + +// Reference implementation for depthwise conv2d +static void conv2d_dw_reference_impl(TestCase& test_case) { + const ValueSpec& input = test_case.inputs()[0]; + const ValueSpec& weight = test_case.inputs()[1]; + const ValueSpec& bias_spec = test_case.inputs()[2]; + ValueSpec& output = test_case.outputs()[0]; + + if (input.dtype != vkapi::kFloat) { + throw std::invalid_argument("Reference only supports float"); + } + + auto input_sizes = input.get_tensor_sizes(); + auto weight_sizes = weight.get_tensor_sizes(); + auto output_sizes = output.get_tensor_sizes(); + + int64_t N = input_sizes[0]; + int64_t C = input_sizes[1]; + int64_t H_in = input_sizes[2]; + int64_t W_in = input_sizes[3]; + int64_t K_h = weight_sizes[2]; + int64_t K_w = weight_sizes[3]; + int64_t H_out = output_sizes[2]; + int64_t W_out = output_sizes[3]; + + int64_t stride_h = test_case.inputs()[3].get_int_value(); + int64_t stride_w = test_case.inputs()[4].get_int_value(); + int64_t padding_h = test_case.inputs()[5].get_int_value(); + int64_t padding_w = test_case.inputs()[6].get_int_value(); + int64_t dilation_h = test_case.inputs()[7].get_int_value(); + int64_t dilation_w = test_case.inputs()[8].get_int_value(); + + auto& input_data = input.get_float_data(); + auto& weight_data = weight.get_float_data(); + auto& ref_data = output.get_ref_float_data(); + ref_data.resize(N * C * H_out * W_out, 0.0f); + + for (int64_t n = 0; n < N; ++n) { + for (int64_t c = 0; c < C; ++c) { + for (int64_t oh = 0; oh < H_out; ++oh) { + for (int64_t ow = 0; ow < W_out; ++ow) { + float sum = 0.0f; + for (int64_t kh = 0; kh < K_h; ++kh) { + for (int64_t kw = 0; kw < K_w; ++kw) { + int64_t ih = oh * stride_h - padding_h + kh * dilation_h; + int64_t iw = ow * stride_w - padding_w + kw * dilation_w; + if (ih >= 0 && ih < H_in && iw >= 0 && iw < W_in) { + float in_val = input_data + [n * (C * H_in * W_in) + c * (H_in * W_in) + ih * W_in + + iw]; + // weight is [C, 1, K_h, K_w] + float w_val = weight_data[c * (K_h * K_w) + kh * K_w + kw]; + sum += in_val * w_val; + } + } + } + if (!bias_spec.is_none()) { + auto& bias_data = bias_spec.get_float_data(); + sum += bias_data[c]; + } + ref_data + [n * (C * H_out * W_out) + c * (H_out * W_out) + oh * W_out + + ow] = sum; + } + } + } + } +} + +static std::vector generate_conv2d_dw_test_cases() { + std::vector test_cases; + + std::vector storage_types = {utils::kTexture3D}; + utils::GPUMemoryLayout layout = utils::kChannelsPacked; + + // Accuracy shapes (small enough for float reference validation) + std::vector accuracy_configs = { + {InputDims(1, 8, 16, 16), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + false}, + {InputDims(1, 8, 16, 16), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + true}, + {InputDims(1, 8, 16, 16), + KernelSize(5, 5), + Stride(1, 1), + Padding(2, 2), + Dilation(1, 1), + false}, + {InputDims(1, 8, 16, 16), + KernelSize(3, 3), + Stride(2, 2), + Padding(1, 1), + Dilation(1, 1), + false}, + // Non-multiple-of-4 channels + {InputDims(1, 11, 16, 16), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + false}, + {InputDims(1, 3, 16, 16), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + false}, + }; + + // EdgeTAM depthwise shapes (from profiling data) + std::vector perf_configs = { + // Backbone stem and early stages + {InputDims(1, 24, 512, 512), + KernelSize(3, 3), + Stride(2, 2), + Padding(1, 1), + Dilation(1, 1), + false}, + {InputDims(1, 48, 256, 256), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + false}, + {InputDims(1, 48, 256, 256), + KernelSize(3, 3), + Stride(2, 2), + Padding(1, 1), + Dilation(1, 1), + false}, + {InputDims(1, 96, 128, 128), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + false}, + {InputDims(1, 96, 128, 128), + KernelSize(3, 3), + Stride(2, 2), + Padding(1, 1), + Dilation(1, 1), + false}, + {InputDims(1, 192, 64, 64), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + false}, + {InputDims(1, 192, 64, 64), + KernelSize(3, 3), + Stride(2, 2), + Padding(1, 1), + Dilation(1, 1), + false}, + {InputDims(1, 384, 32, 32), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + false}, + // 5x5 kernels + {InputDims(1, 48, 256, 256), + KernelSize(5, 5), + Stride(1, 1), + Padding(2, 2), + Dilation(1, 1), + false}, + {InputDims(1, 96, 128, 128), + KernelSize(5, 5), + Stride(1, 1), + Padding(2, 2), + Dilation(1, 1), + false}, + // FPN/Neck + {InputDims(1, 256, 256, 256), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + false}, + {InputDims(1, 256, 128, 128), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + false}, + }; + + // Generate accuracy test cases (float only) + for (const auto& config : accuracy_configs) { + for (auto st : storage_types) { + test_cases.push_back( + create_conv2d_dw_test_case(config, vkapi::kFloat, st, layout)); + } + } + + // Generate performance test cases (float and half) + for (const auto& config : perf_configs) { + std::vector dtypes = {vkapi::kFloat, vkapi::kHalf}; + for (auto dtype : dtypes) { + for (auto st : storage_types) { + // Auto-selection (empty impl_selector) + test_cases.push_back( + create_conv2d_dw_test_case(config, dtype, st, layout)); + + // Force b4x2 variant + test_cases.push_back( + create_conv2d_dw_test_case(config, dtype, st, layout, "b4x2")); + + // Force b1x1 variant (only for 3x3 kernels; for 5x5 it falls back + // to default, but we still generate it to test the fallback path) + test_cases.push_back( + create_conv2d_dw_test_case(config, dtype, st, layout, "b1x1")); + } + } + } + + return test_cases; +} + +static int64_t conv2d_dw_flop_calculator(const TestCase& test_case) { + auto input_sizes = test_case.inputs()[0].get_tensor_sizes(); + auto weight_sizes = test_case.inputs()[1].get_tensor_sizes(); + auto output_sizes = test_case.outputs()[0].get_tensor_sizes(); + + int64_t N = output_sizes[0]; + int64_t C = output_sizes[1]; + int64_t H_out = output_sizes[2]; + int64_t W_out = output_sizes[3]; + int64_t K_h = weight_sizes[2]; + int64_t K_w = weight_sizes[3]; + + // Each output element: K_h * K_w multiplies + (K_h * K_w - 1) adds + return 2 * N * C * H_out * W_out * K_h * K_w; +} + +static void reference_impl(TestCase& test_case) { + conv2d_dw_reference_impl(test_case); +} + +int main(int argc, char* argv[]) { + set_debugging(false); + set_print_output(false); + set_print_latencies(false); + set_use_gpu_timestamps(true); + + print_performance_header(); + std::cout << "Depthwise Conv2d Benchmark" << std::endl; + print_separator(); + + ReferenceComputeFunc ref_fn = reference_impl; + + auto results = execute_test_cases( + generate_conv2d_dw_test_cases, + conv2d_dw_flop_calculator, + "Conv2dDW", + 3, + 10, + ref_fn); + + return 0; +} diff --git a/backends/vulkan/test/custom_ops/test_conv2d_pw.cpp b/backends/vulkan/test/custom_ops/test_conv2d_pw.cpp new file mode 100644 index 00000000000..1e8eec2a6c2 --- /dev/null +++ b/backends/vulkan/test/custom_ops/test_conv2d_pw.cpp @@ -0,0 +1,268 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include + +#include +#include + +#include "utils.h" + +using namespace executorch::vulkan::prototyping; +using namespace vkcompute; + +static constexpr int64_t kRefDimSizeLimit = 64; + +struct Conv2dPwConfig { + int64_t N; + int64_t C_in; + int64_t C_out; + int64_t H; + int64_t W; + bool has_bias; +}; + +static TestCase create_conv2d_pw_test_case( + const Conv2dPwConfig& config, + vkapi::ScalarType dtype, + utils::StorageType storage_type, + utils::GPUMemoryLayout memory_layout) { + TestCase test_case; + + bool is_perf = config.C_in > kRefDimSizeLimit || + config.C_out > kRefDimSizeLimit || config.H > kRefDimSizeLimit || + config.W > kRefDimSizeLimit; + + std::string prefix = is_perf ? "PERF" : "ACCU"; + std::string storage_str = storage_type_abbrev(storage_type); + std::string layout_str = layout_abbrev(memory_layout); + std::string dtype_str = (dtype == vkapi::kHalf) ? "f16" : "f32"; + std::string bias_str = config.has_bias ? "+bias" : ""; + + std::string shape = "[" + std::to_string(config.N) + "," + + std::to_string(config.C_in) + "," + std::to_string(config.H) + "," + + std::to_string(config.W) + "]->[" + std::to_string(config.N) + "," + + std::to_string(config.C_out) + "," + std::to_string(config.H) + "," + + std::to_string(config.W) + "]"; + + std::string name = prefix + " conv2d_pw" + bias_str + " " + shape + " " + + storage_str + "(" + layout_str + ") " + dtype_str; + + test_case.set_name(name); + test_case.set_operator_name("test_etvk.test_conv2d_pw.default"); + + // Input tensor [N, C_in, H, W] + ValueSpec input( + {config.N, config.C_in, config.H, config.W}, + dtype, + storage_type, + memory_layout, + DataGenType::RANDOM); + + // Weight tensor [C_out, C_in, 1, 1] - constant + ValueSpec weight( + {config.C_out, config.C_in, 1, 1}, + dtype, + storage_type, + memory_layout, + DataGenType::RANDOM); + weight.set_constant(true); + + test_case.add_input_spec(input); + test_case.add_input_spec(weight); + + // Bias (or none) + if (config.has_bias) { + ValueSpec bias( + {config.C_out}, + dtype, + storage_type, + memory_layout, + DataGenType::RANDOM); + bias.set_constant(true); + test_case.add_input_spec(bias); + } else { + ValueSpec none_bias(static_cast(0)); + none_bias.set_none(true); + test_case.add_input_spec(none_bias); + } + + // impl_selector + ValueSpec impl_selector_spec = ValueSpec::make_string("default"); + test_case.add_input_spec(impl_selector_spec); + + // Output tensor [N, C_out, H, W] + ValueSpec output( + {config.N, config.C_out, config.H, config.W}, + dtype, + storage_type, + memory_layout, + DataGenType::ZEROS); + test_case.add_output_spec(output); + + if (dtype == vkapi::kHalf) { + test_case.set_abs_tolerance(1e-1f); + test_case.set_rel_tolerance(1e-2f); + } else { + test_case.set_abs_tolerance(1e-3f); + test_case.set_rel_tolerance(1e-3f); + } + + test_case.set_shader_filter({"nchw_to", "to_nchw", "view_copy"}); + + return test_case; +} + +// Reference implementation: pointwise conv2d is essentially a matmul +// output[n][c_out][h][w] = bias[c_out] + +// sum_over_c_in(input[n][c_in][h][w] * weight[c_out][c_in][0][0]) +static void conv2d_pw_reference_impl(TestCase& test_case) { + // input[0], weight[1], bias[2], impl_selector[3] + const ValueSpec& input = test_case.inputs()[0]; + const ValueSpec& weight = test_case.inputs()[1]; + const ValueSpec& bias_spec = test_case.inputs()[2]; + ValueSpec& output = test_case.outputs()[0]; + + if (input.dtype != vkapi::kFloat) { + throw std::invalid_argument("Reference only supports float"); + } + + auto input_sizes = input.get_tensor_sizes(); + auto weight_sizes = weight.get_tensor_sizes(); + + int64_t N = input_sizes[0]; + int64_t C_in = input_sizes[1]; + int64_t H = input_sizes[2]; + int64_t W = input_sizes[3]; + int64_t C_out = weight_sizes[0]; + + auto& input_data = input.get_float_data(); + auto& weight_data = weight.get_float_data(); + auto& ref_data = output.get_ref_float_data(); + ref_data.resize(N * C_out * H * W, 0.0f); + + for (int64_t n = 0; n < N; ++n) { + for (int64_t co = 0; co < C_out; ++co) { + for (int64_t h = 0; h < H; ++h) { + for (int64_t w = 0; w < W; ++w) { + float sum = 0.0f; + for (int64_t ci = 0; ci < C_in; ++ci) { + float in_val = + input_data[n * (C_in * H * W) + ci * (H * W) + h * W + w]; + // weight is [C_out, C_in, 1, 1] + float w_val = weight_data[co * C_in + ci]; + sum += in_val * w_val; + } + if (!bias_spec.is_none()) { + auto& bias_data = bias_spec.get_float_data(); + sum += bias_data[co]; + } + ref_data[n * (C_out * H * W) + co * (H * W) + h * W + w] = sum; + } + } + } + } +} + +static std::vector generate_conv2d_pw_test_cases() { + std::vector test_cases; + + // Conv2d shaders are texture-only and require channels-packed layout + std::vector storage_types = {utils::kTexture3D}; + utils::GPUMemoryLayout layout = utils::kChannelsPacked; + + // Accuracy shapes (small enough for float reference validation) + std::vector accuracy_configs = { + {1, 16, 32, 8, 8, false}, + {1, 32, 16, 8, 8, false}, + {1, 16, 32, 8, 8, true}, + {1, 48, 96, 16, 16, false}, + {1, 96, 48, 16, 16, false}, + // Non-multiple-of-4 channels + {1, 13, 27, 8, 8, false}, + {1, 33, 17, 8, 8, false}, + }; + + // EdgeTAM performance shapes + std::vector perf_configs = { + // EdgeTAM backbone stages + {1, 48, 96, 256, 256, false}, + {1, 96, 48, 256, 256, false}, + {1, 96, 192, 128, 128, false}, + {1, 192, 96, 128, 128, false}, + {1, 192, 384, 64, 64, false}, + {1, 384, 192, 64, 64, false}, + {1, 384, 768, 32, 32, false}, + {1, 768, 384, 32, 32, false}, + // EdgeTAM FPN/Neck + {1, 48, 256, 256, 256, false}, + {1, 256, 32, 256, 256, false}, + {1, 96, 256, 128, 128, false}, + {1, 256, 64, 128, 128, false}, + }; + + // Generate accuracy test cases (float only) + for (const auto& config : accuracy_configs) { + for (auto st : storage_types) { + test_cases.push_back( + create_conv2d_pw_test_case(config, vkapi::kFloat, st, layout)); + } + } + + // Generate performance test cases (float and half) + for (const auto& config : perf_configs) { + std::vector dtypes = {vkapi::kFloat, vkapi::kHalf}; + for (auto dtype : dtypes) { + for (auto st : storage_types) { + test_cases.push_back( + create_conv2d_pw_test_case(config, dtype, st, layout)); + } + } + } + + return test_cases; +} + +static int64_t conv2d_pw_flop_calculator(const TestCase& test_case) { + auto input_sizes = test_case.inputs()[0].get_tensor_sizes(); + auto weight_sizes = test_case.inputs()[1].get_tensor_sizes(); + + int64_t N = input_sizes[0]; + int64_t C_in = input_sizes[1]; + int64_t H = input_sizes[2]; + int64_t W = input_sizes[3]; + int64_t C_out = weight_sizes[0]; + + return 2 * N * C_out * C_in * H * W; +} + +static void reference_impl(TestCase& test_case) { + conv2d_pw_reference_impl(test_case); +} + +int main(int argc, char* argv[]) { + set_debugging(false); + set_print_output(false); + set_print_latencies(false); + set_use_gpu_timestamps(true); + + print_performance_header(); + std::cout << "Pointwise Conv2d (1x1) Benchmark" << std::endl; + print_separator(); + + ReferenceComputeFunc ref_fn = reference_impl; + + auto results = execute_test_cases( + generate_conv2d_pw_test_cases, + conv2d_pw_flop_calculator, + "Conv2dPW", + 3, + 10, + ref_fn); + + return 0; +}