pytorch · SS-JIA · Nov 3, 2020 · Nov 3, 2020 · AshkanAliabadi · Nov 3, 2020
diff --git a/aten/src/ATen/native/vulkan/glsl/mean.glsl b/aten/src/ATen/native/vulkan/glsl/mean.glsl
@@ -2,36 +2,26 @@
 #define PRECISION $precision
 layout(std430) buffer;
 layout(std430) uniform;
-layout(set = 0, rgba16f, binding = 0) writeonly PRECISION uniform image3D uOutput;
-layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
-layout(set = 0, binding = 2) uniform constBlock {
+layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D   uOutput;
+layout(set = 0, binding = 1)          uniform PRECISION                    sampler3D uInput;
+layout(set = 0, binding = 2)          uniform                                        Block {
   int W;
   int H;
-  int OW;
-  int OH;
-}
-uConstBlock;
+} uBlock;
 
 layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in;
 
 void main() {
   ivec3 pos = ivec3(gl_GlobalInvocationID);
-  int W = uConstBlock.W;
-  int H = uConstBlock.H;
-  int OW = uConstBlock.OW;
-  int OH = uConstBlock.OH;
-  vec4 r = vec4(1.0) / float(W) / float(H);
+  vec4 r = vec4(1.0) / float(uBlock.W) / float(uBlock.H);  // scale 1 / (W * H), applied after the sum
   vec4 acc = vec4(0);
   int xi, yi;
-  for (xi = 0; xi < W; ++xi) {
-    for (yi = 0; yi < H; ++yi) {
+  for (xi = 0; xi < uBlock.W; ++xi) {  // accumulate every texel (xi, yi) of depth slice pos.z
+    for (yi = 0; yi < uBlock.H; ++yi) {
       acc += texelFetch(uInput, ivec3(xi, yi, pos.z), 0);
     }
   }
   vec4 outValue = r * acc;
-  for (int vi = 0; vi < 4; ++vi) {
-    int oy = (4 * pos.z + vi) / OW;
-    int ox = (4 * pos.z + vi) % OW;
-    imageStore(uOutput, ivec3(ox, oy, 0), vec4(outValue[vi], 0, 0, 0));
-  }
+  // All four components of the slice mean are written at this invocation's own position.
+  imageStore(uOutput, pos, outValue);
 }
diff --git a/aten/src/ATen/native/vulkan/glsl/mean2d.glsl b/aten/src/ATen/native/vulkan/glsl/mean2d.glsl
@@ -0,0 +1,30 @@
+#version 450 core
+#define PRECISION $precision
+layout(std430) buffer;
+layout(std430) uniform;
+layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D   uOutput;
+layout(set = 0, binding = 1)          uniform PRECISION                    sampler3D uInput;
+layout(set = 0, binding = 2)          uniform                                        Block {
+  int W;  // number of texels summed along x
+  int H;  // number of texels summed along y
+} uBlock;
+
+layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in;
+
+// Averages one component of one uInput depth slice over W x H and stores it in .x at pos.
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+  // Row-major index of this output texel; four consecutive indices share one input slice.
+  const int index = imageSize(uOutput).x * pos.y + pos.x;
+  const int zi = index / 4;  // input depth slice to reduce
+  const int zo = index % 4;  // component of that slice to keep
+  const vec4 r = vec4(1.0) / float(uBlock.W) / float(uBlock.H);
+  vec4 acc = vec4(0);
+  for (int xi = 0; xi < uBlock.W; ++xi) {
+    for (int yi = 0; yi < uBlock.H; ++yi) {
+      acc += texelFetch(uInput, ivec3(xi, yi, zi), 0);
+    }
+  }
+  const vec4 outValue = r * acc;
+  imageStore(uOutput, pos, vec4(outValue[zo], 0, 0, 0));
+}
diff --git a/aten/src/ATen/native/vulkan/ops/Pool.cpp b/aten/src/ATen/native/vulkan/ops/Pool.cpp
@@ -8,6 +8,92 @@ namespace vulkan {
 namespace ops {
 namespace {
 
+// Maps a dim index `d` in [-n, n) to [0, n); anything outside that range is rejected.
+int64_t normalize_dim(int64_t d, int64_t n) {
+  TORCH_CHECK(-n <= d && d < n, "vulkan_mean: dim ", d, " is out of range");
+  return (d % n + n) % n;
+}
+
+// Vulkan mean.dim: averages a 4-D tensor over dims 2 and 3 (negative dims allowed).
+// Output sizes are {s0, s1, 1, 1} with `keepdim`, else {s0, s1}; `dtype` is not read.
+Tensor mean(
+    const at::Tensor& input_arg,
+    const IntArrayRef dim,
+    const bool keepdim,
+    const optional<ScalarType> dtype) {
+  TORCH_INTERNAL_ASSERT(
+      input_arg.dim() == 4,
+      "vulkan_mean expects 4-dimensional input");
+  static const std::unordered_set<int64_t> expected_dims_set({2, 3});
+  std::unordered_set<int64_t> dims_set;
+  for (const auto& d : dim) {
+    dims_set.insert(normalize_dim(d, 4));
+  }
+  TORCH_INTERNAL_ASSERT(
+    dims_set == expected_dims_set,
+    "vulkan_mean currently only supported for image-wide reduction");
+
+  std::vector<int64_t> output_dims{input_arg.sizes()[0], input_arg.sizes()[1]};
+  if (keepdim) {
+    output_dims.push_back(1);
+    output_dims.push_back(1);
+  }
+
+  api::Context* const context = api::context();
+  const vTensor& v_input = convert(input_arg);
+  vTensor v_output{context, output_dims, input_arg.options()};
+
+  api::Command::Buffer command_buffer = context->command().pool.allocate();
+  command_buffer.begin();
+  {
+    if (v_input.has_image()) {
+      // Host copy of the shaders' `Block` uniform, whose W and H are GLSL `int`s.
+      // sizes() yields int64_t, so convert explicitly instead of narrowing in brace-init.
+      const struct {
+        int32_t input_width, input_height;
+      } block {
+        static_cast<int32_t>(input_arg.sizes()[3]),
+        static_cast<int32_t>(input_arg.sizes()[2]),
+      };
+
+      // Both paths bind the same resources; only the kernel differs.
+      if (keepdim) {
+        context->dispatch(
+            command_buffer,
+            {
+              VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+              VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+              VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            },
+            VK_KERNEL(mean),
+            v_output.extents(),
+            v_output.image(command_buffer, vTensor::Access::Write),
+            v_input.image(command_buffer),
+            context->resource().pool.uniform(block).object);
+      } else {
+        context->dispatch(
+            command_buffer,
+            {
+              VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+              VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+              VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            },
+            VK_KERNEL(mean2d),
+            v_output.extents(),
+            v_output.image(command_buffer, vTensor::Access::Write),
+            v_input.image(command_buffer),
+            context->resource().pool.uniform(block).object);
+      }
+    } else {
+      TORCH_CHECK(false, "Not implemented!");
+    }
+  }
+  command_buffer.end();
+  command_buffer.submit(context->gpu().queue);
+
+  return convert(v_output);
+}
+
 Tensor adaptive_avg_pool2d(const at::Tensor& input_arg, IntArrayRef output_size) {
   TORCH_INTERNAL_ASSERT(
       input_arg.dim() == 4,
@@ -155,6 +241,7 @@ Tensor avg_pool2d(
 #ifdef USE_VULKAN_API
 
 TORCH_LIBRARY_IMPL(aten, Vulkan, m) {
+  m.impl("mean.dim", TORCH_FN(mean));  // registers mean() for the aten "mean.dim" overload
   m.impl("_adaptive_avg_pool2d", TORCH_FN(adaptive_avg_pool2d));
   m.impl("avg_pool2d", TORCH_FN(avg_pool2d));
 }

diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp
@@ -169,6 +169,42 @@ TEST(VulkanTest, mm) {
   ASSERT_TRUE(check);
 }
 
+// CPU-vs-Vulkan check of at::mean over the last two dims with keepdim=false.
+TEST(VulkanTest, mean) {
+  const auto in_cpu =
+      at::rand({5,3,9,9}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
+  const auto out_expected = at::mean(in_cpu, {-1,-2}, false);
+
+  const auto in_vulkan = in_cpu.vulkan();
+  const auto out_vulkan = at::mean(in_vulkan, {-1,-2}, false);
+  const auto out_actual = out_vulkan.cpu();
+
+  const auto matched = almostEqual(out_actual, out_expected);
+  if (!matched) {
+    std::cout << "expected:\n" << out_expected << std::endl;
+    std::cout << "got:\n" << out_actual << std::endl;
+  }
+  ASSERT_TRUE(matched);
+}
+
+// CPU-vs-Vulkan check of at::mean over the last two dims with keepdim=true.
+TEST(VulkanTest, mean_keep_dim) {
+  const auto in_cpu =
+      at::rand({10, 3, 21, 21}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
+  const auto out_expected = at::mean(in_cpu, {-1, -2}, true);
+
+  const auto in_vulkan = in_cpu.vulkan();
+  const auto out_vulkan = at::mean(in_vulkan, {-1, -2}, true);
+  const auto out_actual = out_vulkan.cpu();
+
+  const auto matched = almostEqual(out_actual, out_expected);
+  if (!matched) {
+    std::cout << "expected:\n" << out_expected << std::endl;
+    std::cout << "got:\n" << out_actual << std::endl;
+  }
+  ASSERT_TRUE(matched);
+}
+
 TEST(VulkanTest, adaptive_avg_pool2d) {
   auto t_in =
       at::rand({1, 2, 7, 7}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
@@ -180,6 +216,7 @@ TEST(VulkanTest, adaptive_avg_pool2d) {
 
   const auto check = almostEqual(t_out, t_out_expected);
   if (!check) {
+    std::cout << "original:\n" << t_in << std::endl;
     std::cout << "expected:\n" << t_out_expected << std::endl;
     std::cout << "got:\n" << t_out << std::endl;
   }