[PyTorch] Add Vulkan support and tests for at::upsample_bilinear2d (#…

…98022) Summary: Pull Request resolved: #98022 Bilinear upsampling is a [4D tensor upsampling operation](https://pytorch.org/docs/stable/generated/torch.nn.Upsample.html), this adds support for the operation on the Vulkan GPU backend. Test Plan: 1. `buck run --target-platforms ovr_config//platform/macos:arm64-fbsource //xplat/caffe2:pt_vulkan_api_test_binAppleMac\#macosx-arm64 -c pt.vulkan_full_precision=1` on Apple M1 MacBook 2. Confirm all tests pass with no regression, and the added tests `*upsample_bilinear2d*` pass 2a. All tests P669847383 2b. `upsample_bilinear2d` tests P669866631 3. Overview: ``` ... [ RUN ] VulkanAPITest.upsample_bilinear2d_align_false_small [ OK ] VulkanAPITest.upsample_bilinear2d_align_false_small (1 ms) [ RUN ] VulkanAPITest.upsample_bilinear2d_align_false_large [ OK ] VulkanAPITest.upsample_bilinear2d_align_false_large (2 ms) [ RUN ] VulkanAPITest.upsample_bilinear2d_align_true_small [ OK ] VulkanAPITest.upsample_bilinear2d_align_true_small (2 ms) [ RUN ] VulkanAPITest.upsample_bilinear2d_align_true_large [ OK ] VulkanAPITest.upsample_bilinear2d_align_true_large (1 ms) ... [==========] 209 tests from 1 test suite ran. (6317 ms total) [ PASSED ] 201 tests. [ SKIPPED ] 1 test, listed below: [ SKIPPED ] VulkanAPITest.querypool_flushed_shader_log [ FAILED ] 7 tests, listed below: [ FAILED ] VulkanAPITest.cat_dim1_singledepth_success [ FAILED ] VulkanAPITest.gru_success [ FAILED ] VulkanAPITest.gru_mclareninputs_success [ FAILED ] VulkanAPITest.gru_prepack_success [ FAILED ] VulkanAPITest.lstm_success [ FAILED ] VulkanAPITest.lstm_mclareninputs_success [ FAILED ] VulkanAPITest.lstm_prepack_success ``` Reviewed By: SS-JIA Differential Revision: D43142564 fbshipit-source-id: 39931862c2700e69562565042e2f9e92a262f276
pytorch · Mar 30, 2023 · 4f73c5e · 4f73c5e
1 parent c218309
commit 4f73c5e
Show file tree

Hide file tree

Showing 4 changed files with 294 additions and 0 deletions.
diff --git a/aten/src/ATen/native/vulkan/glsl/upsample_bilinear2d_align_false.glsl b/aten/src/ATen/native/vulkan/glsl/upsample_bilinear2d_align_false.glsl
@@ -0,0 +1,76 @@
+#version 450 core
+#define PRECISION $precision
+#define FORMAT $format
+
+layout(std430) buffer;
+
+/* Qualifiers: layout - storage - precision - memory */
+
+/*
+ * Output Image
+ * Write-only 3D image that main() stores the interpolated texels into.
+ */
+layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput;
+
+/*
+ * Input Texture
+ * 3D sampler that main() reads with texelFetch (integer texel coordinates,
+ * level 0), so no hardware filtering is involved.
+ */
+layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
+
+/*
+ * Params Buffer
+ * oextents: bounds used by main() to reject invocations that fall outside
+ *           the output image (only .xyz is read).
+ * iextents: upper clamp for the interpolated input x/y coordinate; the
+ *           host-side op (Upsample.cpp) fills it with input width - 1 and
+ *           input height - 1.
+ * scale:    per-axis factor (x = width, y = height) that maps output
+ *           coordinates to input coordinates.
+ */
+layout(set = 0, binding = 2) uniform PRECISION restrict Block {
+  ivec4 oextents;
+  ivec2 iextents;
+  vec2 scale;
+}
+uBlock;
+
+/*
+ * Local Work Group Size
+ * Taken from specialization constants 0, 1 and 2 (x, y, z).
+ */
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+/*
+ * Upsamples uInput into uOutput by bilinear interpolation along the width
+ * (x) and height (y) axes, using the scale in uBlock; pos.z is passed
+ * through unchanged.
+ * align_false ~ align_corners=False: each of the 4 output corner texels is
+ * treated in interpolation as if it were offset half a texel outwards from
+ * the corresponding input corner texel, so if the two textures were
+ * overlaid the output texture would be "bigger".
+ */
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+  // Valid output texels are [0, oextents) on every axis, so an invocation
+  // with pos == oextents is already out of bounds and must not write.
+  if (any(greaterThanEqual(pos, uBlock.oextents.xyz))) {
+    return;
+  }
+  // Map the output texel centre to continuous input coordinates, then clamp
+  // to [0, iextents] so that alpha never becomes negative and the floor/ceil
+  // neighbours below never exceed iextents (expected to hold the largest
+  // valid input x/y index).
+  const vec2 pos_interp = clamp(
+      ((pos.xy + 0.5) * uBlock.scale) - 0.5,
+      vec2(0, 0),
+      vec2(uBlock.iextents.xy));
+
+  // 4 input texels used for bilinear interpolation, naming by PyTorch
+  // Tensor coordinate space where the "top" is x = 0 and "left" is y = 0,
+  // Vulkan reversed
+  const ivec2 lo = ivec2(floor(pos_interp));
+  const ivec2 hi = ivec2(ceil(pos_interp));
+  const ivec3 in_pos_topleft = ivec3(lo.x, lo.y, pos.z);
+  const ivec3 in_pos_bottomleft = ivec3(lo.x, hi.y, pos.z);
+  const ivec3 in_pos_topright = ivec3(hi.x, lo.y, pos.z);
+  const ivec3 in_pos_bottomright = ivec3(hi.x, hi.y, pos.z);
+
+  // Fractional offset from the floor texel along each axis, in [0, 1).
+  const vec2 alpha = pos_interp - vec2(lo);
+
+  // Interpolate along x for the two rows, then along y between the rows.
+  const vec4 top_val_interp =
+      (texelFetch(uInput, in_pos_topleft, 0) * (1 - alpha.x)) +
+      (texelFetch(uInput, in_pos_topright, 0) * alpha.x);
+  const vec4 bot_val_interp =
+      (texelFetch(uInput, in_pos_bottomleft, 0) * (1 - alpha.x)) +
+      (texelFetch(uInput, in_pos_bottomright, 0) * alpha.x);
+
+  imageStore(
+      uOutput,
+      pos,
+      (top_val_interp * (1 - alpha.y)) + (bot_val_interp * alpha.y));
+}
diff --git a/aten/src/ATen/native/vulkan/glsl/upsample_bilinear2d_align_true.glsl b/aten/src/ATen/native/vulkan/glsl/upsample_bilinear2d_align_true.glsl
@@ -0,0 +1,73 @@
+#version 450 core
+#define PRECISION $precision
+#define FORMAT $format
+
+layout(std430) buffer;
+
+/* Qualifiers: layout - storage - precision - memory */
+
+/*
+ * Output Image
+ * Write-only 3D image that main() stores the interpolated texels into.
+ */
+layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput;
+
+/*
+ * Input Texture
+ * 3D sampler that main() reads with texelFetch (integer texel coordinates,
+ * level 0), so no hardware filtering is involved.
+ */
+layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
+
+/*
+ * Params Buffer
+ * oextents: bounds used by main() to reject invocations that fall outside
+ *           the output image (only .xyz is read); oextents.xy - 1 is also
+ *           the divisor of the coordinate mapping.
+ * iextents: multiplier of the coordinate mapping; the host-side op
+ *           (Upsample.cpp) fills it with input width - 1 and
+ *           input height - 1.
+ * scale:    not read by main() in this shader; the member is still declared
+ *           here, matching the align_false shader's block.
+ */
+layout(set = 0, binding = 2) uniform PRECISION restrict Block {
+  ivec4 oextents;
+  ivec2 iextents;
+  vec2 scale;
+}
+uBlock;
+
+/*
+ * Local Work Group Size
+ * Taken from specialization constants 0, 1 and 2 (x, y, z).
+ */
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+/*
+ * Upsamples uInput into uOutput by bilinear interpolation along the width
+ * (x) and height (y) axes; pos.z is passed through unchanged.
+ * align_true ~ align_corners=True: each of the 4 output corner texels is
+ * treated in interpolation as if it were squarely aligned with the
+ * corresponding input corner texel, if the two textures were overlaid.
+ */
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+  // Valid output texels are [0, oextents) on every axis, so an invocation
+  // with pos == oextents is already out of bounds and must not write.
+  if (any(greaterThanEqual(pos, uBlock.oextents.xyz))) {
+    return;
+  }
+  // Output index i maps to input coordinate i * iextents / (oextents - 1),
+  // where iextents is expected to hold (input size - 1). max() keeps the
+  // divisor at least 1 when an output dimension has a single texel; a clamp
+  // with bounds (1, oextents - 1) would have minVal > maxVal in that case,
+  // which GLSL leaves undefined.
+  const vec2 out_span = max(vec2(uBlock.oextents.xy - 1), vec2(1, 1));
+  // Clamp to [0, iextents] so rounding can never push the floor/ceil
+  // neighbours below past iextents.
+  const vec2 pos_interp = clamp(
+      vec2(pos.xy) * vec2(uBlock.iextents.xy) / out_span,
+      vec2(0, 0),
+      vec2(uBlock.iextents.xy));
+
+  // 4 input texels used for bilinear interpolation, naming by PyTorch
+  // Tensor coordinate space where the "top" is x = 0 and "left" is y = 0,
+  // Vulkan reversed
+  const ivec2 lo = ivec2(floor(pos_interp));
+  const ivec2 hi = ivec2(ceil(pos_interp));
+  const ivec3 in_pos_topleft = ivec3(lo.x, lo.y, pos.z);
+  const ivec3 in_pos_bottomleft = ivec3(lo.x, hi.y, pos.z);
+  const ivec3 in_pos_topright = ivec3(hi.x, lo.y, pos.z);
+  const ivec3 in_pos_bottomright = ivec3(hi.x, hi.y, pos.z);
+
+  // Fractional offset from the floor texel along each axis, in [0, 1).
+  const vec2 alpha = pos_interp - vec2(lo);
+
+  // Interpolate along x for the two rows, then along y between the rows.
+  const vec4 top_val_interp =
+      (texelFetch(uInput, in_pos_topleft, 0) * (1 - alpha.x)) +
+      (texelFetch(uInput, in_pos_topright, 0) * alpha.x);
+  const vec4 bot_val_interp =
+      (texelFetch(uInput, in_pos_bottomleft, 0) * (1 - alpha.x)) +
+      (texelFetch(uInput, in_pos_bottomright, 0) * alpha.x);
+
+  imageStore(
+      uOutput,
+      pos,
+      (top_val_interp * (1 - alpha.y)) + (bot_val_interp * alpha.y));
+}
diff --git a/aten/src/ATen/native/vulkan/ops/Upsample.cpp b/aten/src/ATen/native/vulkan/ops/Upsample.cpp
@@ -93,12 +93,97 @@ Tensor upsample_nearest2d(
   return convert(v_output);
 }
 
+// Vulkan implementation of bilinear 2D upsampling.
+//
+// input_arg      4D tensor; converted to a Vulkan tensor first if it is not
+//                one already.
+// output_sizes   two-element target size, indexed with
+//                Layout::Parameter::height / Layout::Parameter::width.
+// align_corners  selects the align_true or the align_false shader.
+// scales_h/w     optional scale overrides, forwarded to compute_scales_value.
+//
+// Returns a tensor with the input's batch and channel sizes and the requested
+// height and width. Fails via TORCH_CHECK when the input is not 4D or
+// output_sizes does not have exactly two elements.
+Tensor upsample_bilinear2d(
+    const Tensor& input_arg,
+    const IntArrayRef output_sizes,
+    bool align_corners,
+    const c10::optional<double> scales_h,
+    const c10::optional<double> scales_w) {
+  api::Context* const ctx = api::context();
+
+  TORCH_CHECK(
+      (input_arg.sizes().size() == 4) && (output_sizes.size() == 2),
+      "Invalid input!");
+
+  const Tensor vk_input =
+      input_arg.is_vulkan() ? input_arg : input_arg.vulkan();
+  const vTensor& v_input = convert(vk_input);
+
+  // Output keeps batch/channel of the input and takes the requested H/W.
+  vTensor v_output{
+      ctx,
+      {
+          get_dim<Dim4D::Batch>(v_input),
+          get_dim<Dim4D::Channel>(v_input),
+          output_sizes[Layout::Parameter::height],
+          output_sizes[Layout::Parameter::width],
+      },
+      input_arg.scalar_type(),
+  };
+
+  const api::utils::uvec3 output_extents = v_output.extents();
+
+  const auto in_width = get_dim<Dim4D::Width>(input_arg);
+  const auto in_height = get_dim<Dim4D::Height>(input_arg);
+  const float scale_w = compute_scales_value<float>(
+      scales_w, in_width, get_dim<Dim4D::Width>(v_output));
+  const float scale_h = compute_scales_value<float>(
+      scales_h, in_height, get_dim<Dim4D::Height>(v_output));
+
+  // Host-side mirror of the shaders' uniform Block: the explicit padding
+  // word widens the uvec3 so that it lines up with the shader's ivec4.
+  const struct Block final {
+    uvec3 oextents;
+    uint32_t padding;
+    ivec2 iextents;
+    vec2 scale;
+  } uniforms{
+      output_extents, // oextents
+      0u, // padding
+      {
+          safe_downcast<int32_t>(in_width - 1),
+          safe_downcast<int32_t>(in_height - 1),
+      }, // iextents: input width - 1, input height - 1
+      {scale_w, scale_h}, // scale
+  };
+
+  api::UniformParamsBuffer params(ctx, uniforms);
+  api::PipelineBarrier barrier{};
+
+  // One shader per align_corners mode.
+  const api::ShaderInfo kernel = align_corners
+      ? VK_KERNEL(upsample_bilinear2d_align_true)
+      : VK_KERNEL(upsample_bilinear2d_align_false);
+
+  ctx->submit_compute_job(
+      // shader descriptor
+      kernel,
+      // pipeline barrier
+      barrier,
+      // global work group size
+      output_extents,
+      // local work group size
+      adaptive_work_group_size(output_extents),
+      // fence handle
+      VK_NULL_HANDLE,
+      // shader arguments
+      v_output.image(
+          barrier, api::PipelineStage::COMPUTE, api::MemoryAccessType::WRITE),
+      v_input.image(barrier, api::PipelineStage::COMPUTE),
+      // params buffer
+      params.buffer());
+
+  return convert(v_output);
+}
+
 #ifdef USE_VULKAN_API
 
 TORCH_LIBRARY_IMPL(aten, Vulkan, m) {
   m.impl(
       TORCH_SELECTIVE_NAME("aten::upsample_nearest2d"),
       TORCH_FN(upsample_nearest2d));
+  m.impl( // bind the bilinear op name to the Vulkan function defined above
+      TORCH_SELECTIVE_NAME("aten::upsample_bilinear2d"),
+      TORCH_FN(upsample_bilinear2d)); // covers both align_corners modes
 }
 
 #endif /* USE_VULKAN_API */

diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp
@@ -3310,6 +3310,66 @@ TEST_F(VulkanAPITest, upsample_nearest2d) {
   ASSERT_TRUE(check);
 }
 
+// align_corners=false: upsample a random {1, 2, 2, 3} tensor to 4x6 and
+// compare the Vulkan result against the CPU result.
+TEST_F(VulkanAPITest, upsample_bilinear2d_align_false_small) {
+  constexpr bool kAlignCorners = false;
+  const auto input =
+      at::rand({1, 2, 2, 3}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
+
+  // CPU output is the reference.
+  const auto expected = at::upsample_bilinear2d(input, {4, 6}, kAlignCorners);
+
+  // Same op on the Vulkan backend, copied back to the CPU for comparison.
+  const auto actual =
+      at::upsample_bilinear2d(input.vulkan(), {4, 6}, kAlignCorners).cpu();
+
+  const auto check = almostEqual(expected, actual);
+  if (!check) {
+    showRtol(expected, actual);
+  }
+
+  ASSERT_TRUE(check);
+}
+
+// align_corners=false: upsample a random {1, 7, 25, 25} tensor to 45x45 and
+// compare the Vulkan result against the CPU result.
+TEST_F(VulkanAPITest, upsample_bilinear2d_align_false_large) {
+  constexpr bool kAlignCorners = false;
+  const auto input =
+      at::rand({1, 7, 25, 25}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
+
+  // CPU output is the reference.
+  const auto expected =
+      at::upsample_bilinear2d(input, {45, 45}, kAlignCorners);
+
+  // Same op on the Vulkan backend, copied back to the CPU for comparison.
+  const auto actual =
+      at::upsample_bilinear2d(input.vulkan(), {45, 45}, kAlignCorners).cpu();
+
+  const auto check = almostEqual(expected, actual);
+  if (!check) {
+    showRtol(expected, actual);
+  }
+
+  ASSERT_TRUE(check);
+}
+
+// align_corners=true: upsample a random {1, 2, 2, 3} tensor to 4x6 and
+// compare the Vulkan result against the CPU result.
+TEST_F(VulkanAPITest, upsample_bilinear2d_align_true_small) {
+  constexpr bool kAlignCorners = true;
+  const auto input =
+      at::rand({1, 2, 2, 3}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
+
+  // CPU output is the reference.
+  const auto expected = at::upsample_bilinear2d(input, {4, 6}, kAlignCorners);
+
+  // Same op on the Vulkan backend, copied back to the CPU for comparison.
+  const auto actual =
+      at::upsample_bilinear2d(input.vulkan(), {4, 6}, kAlignCorners).cpu();
+
+  const auto check = almostEqual(expected, actual);
+  if (!check) {
+    showRtol(expected, actual);
+  }
+
+  ASSERT_TRUE(check);
+}
+
+// align_corners=true: upsample a random {1, 7, 25, 25} tensor to 45x45 and
+// compare the Vulkan result against the CPU result.
+TEST_F(VulkanAPITest, upsample_bilinear2d_align_true_large) {
+  constexpr bool kAlignCorners = true;
+  const auto input =
+      at::rand({1, 7, 25, 25}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
+
+  // CPU output is the reference.
+  const auto expected =
+      at::upsample_bilinear2d(input, {45, 45}, kAlignCorners);
+
+  // Same op on the Vulkan backend, copied back to the CPU for comparison.
+  const auto actual =
+      at::upsample_bilinear2d(input.vulkan(), {45, 45}, kAlignCorners).cpu();
+
+  const auto check = almostEqual(expected, actual);
+  if (!check) {
+    showRtol(expected, actual);
+  }
+
+  ASSERT_TRUE(check);
+}
+
 void test_unbind(const at::IntArrayRef input_shape, int64_t dim) {
   const auto in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
   const auto out_cpu = at::unbind(in_cpu, dim);