[pytorch] Add support for "height" and "width" dimension for the "sel…

…ect" operator on pytorch vulkan backend (#94612) Summary: Add support for "height" and "width" dimension for the "select" operator on pytorch vulkan backend. Test Plan: ``` yipjustin@yipjustin-mbp fbsource % buck run -c pt.vulkan_full_precision=1 --target-platforms ovr_config//platform/macos:arm64-fbsource //xplat/caffe2:pt_vulkan_api_test_binAppleMac\#macosx-arm64 -- --gtest_filter="*select_3d*" Downloaded 1/2 artifacts, 1.29 Mbytes, 0.0% cache miss (for updated rules) Building: finished in 3.7 sec (100%) 450/450 jobs, 2/450 updated Total time: 3.8 sec BUILD SUCCEEDED Running main() from xplat/third-party/gmock/googletest-1.12.1/googletest/src/gtest_main.cc Note: Google Test filter = *select_3d* [==========] Running 9 tests from 1 test suite. [----------] Global test environment set-up. [----------] 9 tests from VulkanAPITest [ RUN ] VulkanAPITest.select_3d_depth_small [ OK ] VulkanAPITest.select_3d_depth_small (30 ms) [ RUN ] VulkanAPITest.select_3d_depth_medium [ OK ] VulkanAPITest.select_3d_depth_medium (0 ms) [ RUN ] VulkanAPITest.select_3d_depth_large [ OK ] VulkanAPITest.select_3d_depth_large (1 ms) [ RUN ] VulkanAPITest.select_3d_height_small [ OK ] VulkanAPITest.select_3d_height_small (0 ms) [ RUN ] VulkanAPITest.select_3d_height_medium [ OK ] VulkanAPITest.select_3d_height_medium (0 ms) [ RUN ] VulkanAPITest.select_3d_height_large [ OK ] VulkanAPITest.select_3d_height_large (3 ms) [ RUN ] VulkanAPITest.select_3d_width_small [ OK ] VulkanAPITest.select_3d_width_small (0 ms) [ RUN ] VulkanAPITest.select_3d_width_medium [ OK ] VulkanAPITest.select_3d_width_medium (0 ms) [ RUN ] VulkanAPITest.select_3d_width_large [ OK ] VulkanAPITest.select_3d_width_large (1 ms) [----------] 9 tests from VulkanAPITest (40 ms total) [----------] Global test environment tear-down [==========] 9 tests from 1 test suite ran. (40 ms total) [ PASSED ] 9 tests. ``` Reviewed By: SS-JIA Differential Revision: D43020796 Pull Request resolved: #94612 Approved by: https://github.com/SS-JIA
pytorch · Feb 15, 2023 · f2c2642 · f2c2642
1 parent fa1ea9f
commit f2c2642
Show file tree

Hide file tree

Showing 4 changed files with 240 additions and 2 deletions.
diff --git a/aten/src/ATen/native/vulkan/glsl/select_height.glsl b/aten/src/ATen/native/vulkan/glsl/select_height.glsl
@@ -0,0 +1,40 @@
+#version 450 core
+#define PRECISION $precision
+#define FORMAT $format
+
+layout(std430) buffer;
+
+/* Qualifiers: layout - storage - precision - memory */
+
+layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput;
+layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
+layout(set = 0, binding = 2) uniform PRECISION restrict Block {
+  ivec3 size;
+  int index;
+} uBlock;
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  // w
+  const int src_x = pos.x;
+  // h
+  const int src_y = uBlock.index;
+  // c
+  const int src_z = pos.y;
+
+  const vec4 v = texelFetch(uInput, ivec3(src_x, src_y, src_z), 0);
+
+  for (int i = 0; i < 4; i++) {
+    ivec3 new_pos = ivec3(pos.x, pos.y * 4 + i, 0);
+
+    // When the C-channel exceeds original block size, exit early
+    if (new_pos.y >= uBlock.size.y) {
+      return;
+    }
+
+    imageStore(uOutput, new_pos, vec4(v[i], 0, 0, 0));
+  }
+}
diff --git a/aten/src/ATen/native/vulkan/glsl/select_width.glsl b/aten/src/ATen/native/vulkan/glsl/select_width.glsl
@@ -0,0 +1,40 @@
+#version 450 core
+#define PRECISION $precision
+#define FORMAT $format
+
+layout(std430) buffer;
+
+/* Qualifiers: layout - storage - precision - memory */
+
+layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput;
+layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
+layout(set = 0, binding = 2) uniform PRECISION restrict Block {
+  ivec3 size;
+  int index;
+} uBlock;
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  // w
+  const int src_x = uBlock.index;
+  // h
+  const int src_y = pos.x;
+  // c
+  const int src_z = pos.y;
+
+  const vec4 v = texelFetch(uInput, ivec3(src_x, src_y, src_z), 0);
+
+  for (int i = 0; i < 4; i++) {
+    ivec3 new_pos = ivec3(pos.x, pos.y * 4 + i, 0);
+
+    // When the C-channel exceeds original block size, exit early
+    if (new_pos.y >= uBlock.size.y) {
+      return;
+    }
+
+    imageStore(uOutput, new_pos, vec4(v[i], 0, 0, 0));
+  }
+}
diff --git a/aten/src/ATen/native/vulkan/ops/Select.cpp b/aten/src/ATen/native/vulkan/ops/Select.cpp
@@ -53,9 +53,125 @@ Tensor select_depth(const Tensor& input_arg, uint32_t index) {
   return convert(v_output);
 }
 
+Tensor select_height(const Tensor& input_arg, uint32_t index) {
+  api::Context* const context = api::context();
+
+  const Tensor input = input_arg.is_vulkan() ? input_arg : input_arg.vulkan();
+  const vTensor& v_input = convert(input);
+  const IntArrayRef v_input_sizes = v_input.sizes();
+
+  vTensor v_output{
+      context,
+      {v_input_sizes[0], v_input_sizes[2]},
+      input_arg.scalar_type(),
+  };
+
+  const struct Block final {
+    uvec3 size; // output texture size
+    uint32_t index;
+  } block{v_output.extents(), index};
+
+  // Input tensor is a (c, h, w)
+  // Output tensor is a (c, w)
+  // In shader, the input texture's coordinate is (w, h, c)
+  // In shader, the output texture's coordinate is (w, c, 1)
+  uint32_t w = v_output.extents().data[0u];
+  uint32_t c = v_output.extents().data[1u];
+
+  // Encoding of c-channel is packed into texel, hence we only call ceil(c/4)
+  // times to minimize invocation and read.
+  // For the last dimension, it is the selected height. Shader will do a direct
+  // lookup based on block.index.
+  uvec3 global_workgroup_size{w, api::utils::div_up(c, 4u), 1};
+
+  api::UniformParamsBuffer params(context, block);
+  api::PipelineBarrier pipeline_barrier{};
+
+  context->submit_compute_job(
+      // shader descriptor
+      VK_KERNEL(select_height),
+      // pipeline barrier
+      pipeline_barrier,
+      // global work group size
+      global_workgroup_size,
+      // local work group size
+      adaptive_work_group_size(global_workgroup_size),
+      // fence handle
+      VK_NULL_HANDLE,
+      // shader arguments
+      v_output.image(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      v_input.image(pipeline_barrier, api::PipelineStage::COMPUTE),
+      // params buffer
+      params.buffer());
+
+  return convert(v_output);
+}
+
+Tensor select_width(const Tensor& input_arg, uint32_t index) {
+  api::Context* const context = api::context();
+
+  const Tensor input = input_arg.is_vulkan() ? input_arg : input_arg.vulkan();
+  const vTensor& v_input = convert(input);
+  const IntArrayRef v_input_sizes = v_input.sizes();
+
+  vTensor v_output{
+      context,
+      {v_input_sizes[0], v_input_sizes[1]},
+      input_arg.scalar_type(),
+  };
+
+  const struct Block final {
+    uvec3 size; // output texture size
+    uint32_t index;
+  } block{v_output.extents(), index};
+
+  // Input tensor is a (c, h, w)
+  // Output tensor is a (c, h)
+  // In shader, the input texture's coordinate is (w, h, c)
+  // In shader, the output texture's coordinate is (h, c, 1)
+  uint32_t h = v_output.extents().data[0u];
+  uint32_t c = v_output.extents().data[1u];
+
+  // Encoding of c-channel is packed into texel, hence we only call ceil(c/4)
+  // times to minimize invocation and read.
+  // For the last dimension, it is the selected width. Shader will do a direct
+  // lookup based on block.index.
+  uvec3 global_workgroup_size{h, api::utils::div_up(c, 4u), 1};
+
+  api::UniformParamsBuffer params(context, block);
+  api::PipelineBarrier pipeline_barrier{};
+
+  context->submit_compute_job(
+      // shader descriptor
+      VK_KERNEL(select_width),
+      // pipeline barrier
+      pipeline_barrier,
+      // global work group size
+      global_workgroup_size,
+      // local work group size
+      adaptive_work_group_size(global_workgroup_size),
+      // fence handle
+      VK_NULL_HANDLE,
+      // shader arguments
+      v_output.image(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      v_input.image(pipeline_barrier, api::PipelineStage::COMPUTE),
+      // params buffer
+      params.buffer());
+
+  return convert(v_output);
+}
+
 Tensor select(const Tensor& self, int64_t dim, int64_t index) {
   TORCH_CHECK(self.dim() == 3, "Vulkan select only supports 3d tensors!");
-  TORCH_CHECK(dim == 0, "Vulkan select only supports dim = 0!");
+  TORCH_CHECK(
+      0 <= dim && dim <= 2,
+      "Vulkan select only supports one of the dim (0, 1, 2)");
 
   const int64_t size = self.size(dim);
 
@@ -73,7 +189,13 @@ Tensor select(const Tensor& self, int64_t dim, int64_t index) {
     index += size;
   }
 
-  return select_depth(self, index);
+  if (dim == 0) {
+    return select_depth(self, index);
+  } else if (dim == 1) {
+    return select_height(self, index);
+  } else {
+    return select_width(self, index);
+  }
 }
 
 #ifdef USE_VULKAN_API

diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp
@@ -2803,6 +2803,42 @@ TEST_F(VulkanAPITest, select_3d_depth_large) {
   test_select({100, 1, 144}, 0, 50);
 }
 
+TEST_F(VulkanAPITest, select_3d_height_small) {
+  test_select({1, 1, 1}, 1, 0);
+}
+
+TEST_F(VulkanAPITest, select_3d_height_medium) {
+  test_select({3, 5, 2}, 1, 2);
+}
+
+TEST_F(VulkanAPITest, select_3d_height_medium1) {
+  test_select({16, 16, 5}, 1, 6);
+}
+
+TEST_F(VulkanAPITest, select_3d_height_medium2) {
+  test_select({17, 17, 5}, 1, 6);
+}
+
+TEST_F(VulkanAPITest, select_3d_height_large) {
+  test_select({100, 144, 5}, 1, 50);
+}
+
+TEST_F(VulkanAPITest, select_3d_width_small) {
+  test_select({1, 1, 1}, 2, 0);
+}
+
+TEST_F(VulkanAPITest, select_3d_width_medium) {
+  test_select({3, 5, 3}, 2, 2);
+}
+
+TEST_F(VulkanAPITest, select_3d_width_medium2) {
+  test_select({17, 17, 8}, 2, 6);
+}
+
+TEST_F(VulkanAPITest, select_3d_width_large) {
+  test_select({100, 3, 144}, 2, 50);
+}
+
 TEST_F(VulkanAPITest, sigmoid) {
   const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
   const auto in_vulkan = in_cpu.vulkan();