pytorch · SS-JIA · Sep 8, 2025 · Sep 8, 2025
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
@@ -932,6 +932,7 @@ jobs:
         # Custom operator tests
         PYTHON_EXECUTABLE=python bash backends/vulkan/test/custom_ops/build_and_run.sh add
         ./cmake-out/backends/vulkan/test/custom_ops/q8csw_linear
+        ./cmake-out/backends/vulkan/test/custom_ops/q8csw_conv2d
 
   nxp-build-test:
     name: nxp-build-test

@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+#define VEC4_T ${texel_load_type(DTYPE, OUTPUT_STORAGE)}
+#define T ${texel_load_component_type(DTYPE, OUTPUT_STORAGE)}
+
+$if OUTPUT_STORAGE == "buffer":
+  #define OUTPUT_BUFFER
+$if INPUT_STORAGE == "buffer":
+  #define INPUT_BUFFER
+
+#define TILE_M4 1 // tile extents in units of 4 elements; here TILE_X == 4 * TILE_X4
+#define TILE_N4 1 // one texel column per tile: only tile.data[m][0] is written below
+#define TILE_K4 1 // not referenced in this file; presumably needed by included headers - confirm
+
+#define TILE_M 4 // number of matrix rows each invocation loads (loop bound in load_matrix_tile)
+#define TILE_N 4
+#define TILE_K 4
+
+${define_required_extensions(DTYPE)}
+
+layout(std430) buffer;
+
+#include "conv2d_common.glslh"
+
+${layout_declare_tensor(B, "w", "t_output", DTYPE, OUTPUT_STORAGE, is_scalar_array=False)}
+${layout_declare_tensor(B, "r", "t_input", DTYPE, INPUT_STORAGE, is_scalar_array=False)}
+
+// Sizes of the convolution output image
+${layout_declare_ubo(B, "ivec4", "output_sizes")}
+// Sizes of the convolution input image
+${layout_declare_ubo(B, "ivec4", "input_sizes")}
+// Sizes of the im2col matrix of the convolution output
+${layout_declare_ubo(B, "ivec4", "matrix_sizes")}
+
+${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+#include "conv2d_fp_im2col_block_store.glslh"
+
+#ifdef INPUT_BUFFER
+
+// Buffer-input variant: fills `tile` with TILE_M consecutive matrix rows
+// starting at row `m_start`, reading one texel per row from texel column `n4`.
+// `N4` is the row stride of t_input, in texels. No bounds checking is done
+// on the rows read.
+void load_matrix_tile(
+    out FPOutTile tile,
+    const int n4,
+    const int m_start,
+    const int N4) {
+  [[unroll]] for (int i = 0; i < TILE_M; ++i) {
+    const int row = m_start + i;
+    tile.data[i][0] = t_input[row * N4 + n4];
+  }
+}
+
+#else // INPUT_TEXTURE
+
+// Texture-input variant: fills `tile` with TILE_M consecutive matrix rows
+// starting at row `m_start`, fetching the texel at (x = n4, y = row, z = 0)
+// for each row. `N4` is not used here; it is kept so that both variants share
+// one signature.
+void load_matrix_tile(
+    out FPOutTile tile,
+    const int n4,
+    const int m_start,
+    const int N4) {
+  [[unroll]] for (int i = 0; i < TILE_M; ++i) {
+    const ivec3 pos = ivec3(n4, m_start + i, 0);
+    tile.data[i][0] = texelFetch(t_input, pos, 0);
+  }
+}
+
+#endif // INPUT_BUFFER
+
+// Entry point. Each invocation copies one tile of the input matrix (one texel
+// wide, TILE_M rows high) into the output image.
+void main() {
+  // Invocation x selects the texel column, y selects the group of 4 rows.
+  const int n4 = int(gl_GlobalInvocationID.x);
+  const int m4 = int(gl_GlobalInvocationID.y);
+
+  // First element column / first row covered by this invocation.
+  const int n = mul_4(n4);
+  const int m = mul_4(m4);
+
+  // Skip invocations whose tile starts outside the matrix.
+  if (n >= matrix_sizes.x || m >= matrix_sizes.y) {
+    return;
+  }
+
+  FPOutTile tile;
+
+  // Row stride of the input, in texels. Round up so that a width that is not
+  // a multiple of 4 still accounts for its partial last texel column, which
+  // the guard above lets through; rounding down would make consecutive rows
+  // alias each other. This equals div_4(matrix_sizes.x) whenever the width is
+  // already a multiple of 4 (assuming div_4 is integer division by 4).
+  const int N4 = div_4(matrix_sizes.x + 3);
+  load_matrix_tile(tile, n4, m, N4);
+  write_im2col_tile_as_image(tile, n4, m);
+}
@@ -0,0 +1,19 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+col2im: # variant settings for the col2im shader
+  parameter_names_with_default_values: # defaults, unless a variant below overrides them
+    DTYPE: float
+    OUTPUT_STORAGE: texture3d
+    INPUT_STORAGE: buffer # "buffer" makes the shader define INPUT_BUFFER
+  generate_variant_forall: # presumably each named variant is generated once per DTYPE listed - confirm
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+  shader_variants:
+    - NAME: col2im_texture3d_buffer # no overrides: texture3d output, buffer input
+    - NAME: col2im_texture3d_texture3d
+      INPUT_STORAGE: texture3d # overrides the default buffer input
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifndef CONV2D_COMMON_GLSLH
+#define CONV2D_COMMON_GLSLH
+
+#include "common.glslh"
+
+struct Conv2DParams { // convolution settings; the col2im shader binds this as the conv2d_params UBO
+  ivec2 kernel_size; // .x is the window extent along x, used to split a column into kernel x / kernel y
+  ivec2 stride; // multiplied with the output x/y position to locate the window in the input
+  ivec2 padding; // subtracted from the window position, so input coordinates can be negative
+  ivec2 dilation; // multiplied with the kernel x/y offset inside the window
+  int groups; // presumably the number of convolution groups - confirm
+  int out_channels_per_group; // presumably output channels / groups - confirm
+  int in_channels_per_group; // channels per group; fastest-varying part of an in-group column index
+  int logical_K_per_group; // NOTE(review): the K* fields are not read in this header;
+  int K_per_group; // presumably K is the im2col matrix width, "logical" the unpadded
+  int K4_per_group; // value and K4 the width in units of 4 - TODO confirm against host code
+  int logical_K;
+  int K;
+  int K4;
+};
+
+#ifdef DEBUG_MODE
+
+// Debug helper (compiled only when DEBUG_MODE is defined): prints every field
+// of `params` through debugPrintfEXT, one field per line.
+void printConv2DParams(const Conv2DParams params) {
+  debugPrintfEXT("Conv2DParams: \\n");
+  debugPrintfEXT(
+      "  kernel_size: %d, %d\\n", params.kernel_size.x, params.kernel_size.y);
+  debugPrintfEXT("  stride: %d, %d\\n", params.stride.x, params.stride.y);
+  debugPrintfEXT("  padding: %d, %d\\n", params.padding.x, params.padding.y);
+  debugPrintfEXT("  dilation: %d, %d\\n", params.dilation.x, params.dilation.y);
+  debugPrintfEXT("  groups: %d\\n", params.groups);
+  debugPrintfEXT(
+      "  out_channels_per_group: %d\\n", params.out_channels_per_group);
+  debugPrintfEXT(
+      "  in_channels_per_group: %d\\n", params.in_channels_per_group);
+  debugPrintfEXT("  logical_K_per_group: %d\\n", params.logical_K_per_group);
+  debugPrintfEXT("  K_per_group: %d\\n", params.K_per_group);
+  debugPrintfEXT("  K4_per_group: %d\\n", params.K4_per_group);
+  // The whole-matrix K fields were previously left out of the dump.
+  debugPrintfEXT("  logical_K: %d\\n", params.logical_K);
+  debugPrintfEXT("  K: %d\\n", params.K);
+  debugPrintfEXT("  K4: %d\\n", params.K4);
+}
+
+#endif // DEBUG_MODE
+
+#endif // CONV2D_COMMON_GLSLH
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifndef CONV2D_FP_IM2COL_BLOCK
+#define CONV2D_FP_IM2COL_BLOCK
+
+/*
+ * Defines utilities to convert between (col, row) indices of an im2col matrix
+ * and 4-dimensional tensor indices of image tensors.
+ *
+ * Requires:
+ * - output_sizes to be defined in the shader layout, corresponding to the sizes
+ *   of the output image of the convolution op.
+ * - input_sizes to be defined in the shader layout, corresponding to the sizes
+ *   of the input image of the convolution op.
+ * - conv2d_params to be defined in the shader layout
+ */
+
+#extension GL_EXT_control_flow_attributes : require
+
+#include "common.glslh"
+#include "conv2d_common.glslh"
+
+struct Im2ColMatrixIdx { // position of one element in the im2col matrix
+  int row; // flattened output position; unwrap_m splits it into x, y and the 4th index
+  int col; // column in the whole matrix; used directly as the output channel index
+  // Relevant for grouped convolution: the column index relative to the first
+  // column of the group; it encodes (channel in group, kernel x, kernel y).
+  int col_idx_in_group;
+  int group_idx; // group the column belongs to; scaled by in_channels_per_group for the input channel
+};
+
+// Splits the flattened im2col row index `m` into output-image coordinates:
+// x varies fastest, then y, and whatever remains goes to the 4th component
+// (presumably the batch index). The channel component is set to 0; callers
+// are expected to fill it in afterwards.
+void unwrap_m(out TensorIndex4D out_tidx_base, const int m) {
+  const int x_extent = output_sizes.x;
+  const int y_extent = output_sizes.y;
+
+  out_tidx_base.data = ivec4(
+      m % x_extent,
+      (m / x_extent) % y_extent,
+      0,
+      m / (y_extent * x_extent));
+}
+
+// Builds the output-image tensor index for an im2col matrix position: the row
+// provides x, y and the 4th component (through unwrap_m), and the column is
+// used as the channel.
+void im2col_tidx_to_output_tidx(
+    out TensorIndex4D output_tidx,
+    const Im2ColMatrixIdx im2col_tidx) {
+  TensorIndex4D tidx;
+  unwrap_m(tidx, im2col_tidx.row);
+  // unwrap_m leaves the channel component at 0; take it from the column.
+  tidx.data[2] = im2col_tidx.col;
+  output_tidx = tidx;
+}
+
+/*
+ * Maps an im2col matrix position to the 4D index of the input-image element
+ * it is gathered from, taking grouped convolution into account.
+ *
+ * The row selects the output position (through unwrap_m). Within a group, the
+ * column index is decomposed with the channel varying fastest, then the
+ * kernel x offset, then the kernel y offset. The resulting x/y can lie outside
+ * the input image (for example inside the padding); no bounds check is done
+ * here.
+ */
+void im2col_idx_to_input_tidx(
+    out TensorIndex4D input_tidx,
+    const Im2ColMatrixIdx im2col_idx) {
+  TensorIndex4D out_pos;
+  unwrap_m(out_pos, im2col_idx.row);
+
+  const int group_channels = conv2d_params.in_channels_per_group;
+  const int col_in_group = im2col_idx.col_idx_in_group;
+
+  // Split the in-group column into (channel, kernel x, kernel y).
+  const int channel_in_group = col_in_group % group_channels;
+  const ivec2 kernel_pos = ivec2(
+      (col_in_group / group_channels) % conv2d_params.kernel_size.x,
+      col_in_group / (group_channels * conv2d_params.kernel_size.x));
+
+  // Offset by the group's first channel to get the absolute input channel.
+  const int channel_idx =
+      im2col_idx.group_idx * group_channels + channel_in_group;
+
+  // Window origin (output position * stride - padding) plus the dilated
+  // offset inside the window, computed for x and y together.
+  const ivec2 input_xy = out_pos.data.xy * conv2d_params.stride -
+      conv2d_params.padding + kernel_pos * conv2d_params.dilation;
+
+  input_tidx.data = ivec4(input_xy, channel_idx, out_pos.data.w);
+}
+
+// 4x4 block of the im2col matrix, stored as four VEC4_T entries.
+struct FPIm2ColBlock {
+  VEC4_T data[4]; // VEC4_T must be #defined by the including shader before this header
+};
+
+#endif // CONV2D_FP_IM2COL_BLOCK