109 changes: 43 additions & 66 deletions backends/vulkan/runtime/api/containers/Tensor.cpp
@@ -13,33 +13,15 @@
namespace vkcompute {
namespace api {

/*
* Given the strides of a buffer-backed tensor, estimate the equivalent memory
* layout enum value by identifying the fastest moving dimension.
*/
utils::GPUMemoryLayout estimate_memory_layout(
const std::vector<int64_t>& dim_order) {
int64_t fastest_dim_whcn = dim_order.size() - 1 - dim_order.back();
if (fastest_dim_whcn >= 0 && fastest_dim_whcn < 3) {
return utils::GPUMemoryLayout(fastest_dim_whcn);
}

// TODO(ssjia) find a way to gracefully recover from this case by i.e. adding
// a UNKOWN GPUMemoryLayout. This is not high priority though because we don't
// expect this to ever come up in practice.
VK_THROW("No compatible GPUMemoryLayout value");
}
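For reference, a minimal standalone sketch of the rule the deleted helper above encodes (and which the new `vTensor::estimate_memory_layout()` member further below relies on), assuming the WHCN convention where index 0 is width, 1 is height, and 2 is channels. The name `packed_dim_from_dim_order` is illustrative only and not part of the codebase:

```cpp
#include <cstdint>
#include <vector>

// The packed (fastest-moving) dimension, expressed as a WHCN index, follows
// directly from the last entry of the NCHW dim order.
int32_t packed_dim_from_dim_order(const std::vector<int64_t>& dim_order) {
  const int64_t fastest_dim_whcn =
      static_cast<int64_t>(dim_order.size()) - 1 - dim_order.back();
  // 0 = width, 1 = height, 2 = channels; -1 means no WHC dim is packed.
  return (fastest_dim_whcn >= 0 && fastest_dim_whcn < 3)
      ? static_cast<int32_t>(fastest_dim_whcn)
      : -1;
}

// packed_dim_from_dim_order({0, 1, 2, 3}) == 0  (contiguous NCHW, width-packed)
// packed_dim_from_dim_order({0, 2, 3, 1}) == 2  (channels-packed)
```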

std::vector<int64_t> calculate_dim_order(
const size_t ndim,
const utils::GPUMemoryLayout memory_layout) {
const int32_t packed_dim_whcn_idx) {
// Special case for zero dim tensors
if (ndim == 0) {
return {0};
}
std::vector<int64_t> dim_order(ndim);
int64_t last_dim =
ndim - utils::to_packed_dim_nchw_offset<int64_t>(memory_layout);
int64_t last_dim = ndim - 1 - packed_dim_whcn_idx;

int64_t cur_dim = 0;
for (int d = 0; d < ndim; ++d) {
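The rest of the loop body falls outside this hunk. As a rough sketch of the intended result, assuming the packed dim simply moves to the end of an otherwise ascending NCHW dim order (function name illustrative only), this reproduces the examples above in reverse:

```cpp
#include <cstdint>
#include <numeric>
#include <vector>

// Sketch only: build an NCHW dim order in which the packed dim is last,
// i.e. has a stride of 1, keeping the remaining dims in ascending order.
std::vector<int64_t> sketch_dim_order(
    const size_t ndim, const int32_t packed_dim_whcn_idx) {
  std::vector<int64_t> dim_order(ndim);
  std::iota(dim_order.begin(), dim_order.end(), 0);
  const int64_t last_dim =
      static_cast<int64_t>(ndim) - 1 - packed_dim_whcn_idx;
  dim_order.erase(dim_order.begin() + last_dim);
  dim_order.push_back(last_dim);
  return dim_order;
}

// sketch_dim_order(4, 0) -> {0, 1, 2, 3}  (width-packed)
// sketch_dim_order(4, 2) -> {0, 2, 3, 1}  (channels-packed)
```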
@@ -149,7 +131,7 @@ std::vector<int64_t> unsqueeze_strides(

std::vector<int64_t> calculate_padded_sizes(
const std::vector<int64_t>& sizes,
const utils::GPUMemoryLayout memory_layout) {
const int32_t packed_dim_whcn_idx) {
int64_t ndim = sizes.size();
if (ndim == 0) {
ndim = 1;
@@ -163,8 +145,7 @@ std::vector<int64_t> calculate_padded_sizes(
}

// Pad the packed dim to the next multiple of 4.
const int64_t dim_offset =
utils::to_packed_dim_nchw_offset<int64_t>(memory_layout);
const int64_t dim_offset = packed_dim_whcn_idx + 1;
const int64_t padded_dim_size = utils::val_at(-dim_offset, sizes);
padded_sizes.at(ndim_up4 - dim_offset) = utils::align_up_4(padded_dim_size);
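A rough standalone sketch of the padding rule, assuming sizes are first unsqueezed to four NCHW dims: `dim_offset = packed_dim_whcn_idx + 1` addresses the packed dim from the end of the sizes vector, and that dim is rounded up so groups of four elements fill whole texels (`align_up_4` written out inline; names illustrative only):

```cpp
#include <cstdint>
#include <vector>

// Sketch of the padding rule for the common 4-dim case.
std::vector<int64_t> sketch_padded_sizes(
    std::vector<int64_t> sizes, const int32_t packed_dim_whcn_idx) {
  while (sizes.size() < 4) {
    sizes.insert(sizes.begin(), 1);  // unsqueeze leading dims to NCHW
  }
  // dim_offset = packed_dim_whcn_idx + 1, counted from the back of sizes.
  const size_t packed = sizes.size() - 1 - packed_dim_whcn_idx;
  sizes[packed] = (sizes[packed] + 3) / 4 * 4;  // align_up_4
  return sizes;
}

// sketch_padded_sizes({2, 3, 5, 7}, /*channels-packed*/ 2) -> {2, 4, 5, 7}
```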

@@ -174,7 +155,7 @@
utils::uvec3 calculate_image_extents(
const std::vector<int64_t>& padded_sizes,
const std::vector<int64_t>& axis_map,
const utils::GPUMemoryLayout memory_layout) {
const int32_t packed_dim_whcn_idx) {
VK_CHECK_COND(padded_sizes.size() == 4);
VK_CHECK_COND(axis_map.size() == 4);

@@ -195,21 +176,8 @@ utils::uvec3 calculate_image_extents(
// Multiply the extents of the batch axis by the batch size.
extents[batch_axis] *= padded_sizes.at(0);

switch (memory_layout) {
case utils::kWidthPacked:
VK_CHECK_COND(extents[axis_map.at(0)] % 4 == 0);
extents[axis_map.at(0)] /= 4;
break;
case utils::kHeightPacked:
VK_CHECK_COND(extents[axis_map.at(1)] % 4 == 0);
extents[axis_map.at(1)] /= 4;
break;
case utils::kChannelsPacked:
VK_CHECK_COND(extents[axis_map.at(2)] % 4 == 0);
extents[axis_map.at(2)] /= 4;
break;
}

VK_CHECK_COND(extents[axis_map.at(packed_dim_whcn_idx)] % 4 == 0);
extents[axis_map.at(packed_dim_whcn_idx)] /= 4;
return extents;
}
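A worked sketch of the unified packed-dim handling, assuming the default axis_map `{0, 1, 2, 2}` (texture X/Y/Z correspond to W/H/C, with batches folded into the channels axis); names are illustrative only:

```cpp
#include <array>
#include <cassert>
#include <cstdint>
#include <vector>

// Sketch only: image extents for a texture-backed tensor with the default
// axis map. padded_sizes is NCHW and already padded along the packed dim.
std::array<uint32_t, 3> sketch_image_extents(
    const std::vector<int64_t>& padded_sizes,
    const int32_t packed_dim_whcn_idx) {
  const std::array<int64_t, 4> axis_map = {0, 1, 2, 2};
  std::array<uint32_t, 3> extents = {
      static_cast<uint32_t>(padded_sizes[3]),   // W
      static_cast<uint32_t>(padded_sizes[2]),   // H
      static_cast<uint32_t>(padded_sizes[1])};  // C
  extents[axis_map[3]] *= static_cast<uint32_t>(padded_sizes[0]);  // batches
  // Four elements along the packed dim share one texel.
  assert(extents[axis_map[packed_dim_whcn_idx]] % 4 == 0);
  extents[axis_map[packed_dim_whcn_idx]] /= 4;
  return extents;
}

// sketch_image_extents({1, 4, 8, 8}, /*channels-packed*/ 2) -> {8, 8, 1}
```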

@@ -285,15 +253,15 @@ vkapi::VulkanBuffer allocate_buffer(
vTensorStorage::vTensorStorage(
Context* const context,
const utils::StorageType storage_type,
const utils::GPUMemoryLayout gpu_memory_layout,
const std::vector<int64_t>& axis_map,
const int32_t packed_dim_whcn_idx,
const std::vector<int64_t>& padded_sizes,
const vkapi::ScalarType dtype,
const bool allocate_memory)
: context_(context),
storage_type_{storage_type},
image_extents_(
calculate_image_extents(padded_sizes, axis_map, gpu_memory_layout)),
calculate_image_extents(padded_sizes, axis_map, packed_dim_whcn_idx)),
buffer_length_{utils::multiply_integers(padded_sizes)},
buffer_offset_{0},
image_(allocate_image(
@@ -408,14 +376,15 @@ vTensor::vTensor(
const utils::GPUMemoryLayout memory_layout,
const bool allocate_memory)
: dtype_(dtype),
memory_layout_(memory_layout),
// Calculate tensor metadata
sizes_(sizes.begin(), sizes.end()),
dim_order_(calculate_dim_order(sizes_.size(), memory_layout_)),
packed_dim_whcn_idx_(
utils::to_packed_dim_whcn_idx<int32_t>(memory_layout)),
dim_order_(calculate_dim_order(sizes_.size(), packed_dim_whcn_idx_)),
axis_map_(default_axis_map()),
strides_(calculate_strides(sizes, dim_order_)),
numel_(utils::multiply_integers(sizes_)),
padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)},
padded_sizes_{calculate_padded_sizes(sizes, packed_dim_whcn_idx_)},
unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
padded_numel_(utils::multiply_integers(padded_sizes_)),
logical_limits_{{0, 0, 0}},
@@ -429,8 +398,8 @@ vTensor::vTensor(
storage_(
context,
storage_type,
memory_layout_,
axis_map_,
packed_dim_whcn_idx_,
padded_sizes_,
dtype_,
allocate_memory) {
@@ -451,9 +420,9 @@ vTensor::vTensor(

vTensor::vTensor(const vTensor& other)
: dtype_(other.dtype_),
memory_layout_(other.memory_layout_),
// Copy tensor size metadata
sizes_(other.sizes_.begin(), other.sizes_.end()),
packed_dim_whcn_idx_{other.packed_dim_whcn_idx_},
dim_order_(other.dim_order_.begin(), other.dim_order_.end()),
axis_map_(other.axis_map_.begin(), other.axis_map_.end()),
strides_(other.strides_.begin(), other.strides_.end()),
@@ -479,14 +448,14 @@ vTensor::vTensor(
const std::vector<int64_t>& dim_order,
const int64_t offset_numel)
: dtype_(other.dtype_),
memory_layout_(estimate_memory_layout(dim_order)),
// Copy tensor size metadata
sizes_(sizes.begin(), sizes.end()),
packed_dim_whcn_idx_(other.packed_dim_whcn_idx_),
dim_order_(dim_order.begin(), dim_order.end()),
axis_map_(default_axis_map()),
strides_(calculate_strides(sizes_, dim_order_)),
numel_(utils::multiply_integers(sizes_)),
padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)},
padded_sizes_{calculate_padded_sizes(sizes, packed_dim_whcn_idx_)},
unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
padded_numel_(utils::multiply_integers(padded_sizes_)),
logical_limits_(other.logical_limits_),
@@ -542,6 +511,19 @@ void vTensor::set_logical_limits(const utils::uvec3& image_extents) {
logical_limits_.limits[2] = image_extents[axis_map_.at(2)];
}

utils::GPUMemoryLayout vTensor::estimate_memory_layout() const {
switch (packed_dim_whcn_idx_) {
case WHCN::kWidthDim:
return utils::kWidthPacked;
case WHCN::kHeightDim:
return utils::kHeightPacked;
case WHCN::kChannelsDim:
return utils::kChannelsPacked;
default:
VK_THROW("Invalid packed dim");
}
}
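A hypothetical usage sketch, kept as commented pseudocode because the exact constructor arguments are assumed rather than taken from this diff: the estimated layout is only guaranteed to reproduce the packed dim of the original tensor, not every detail of its layout metadata (e.g. an axis_map permuted by virtual_transpose).

```cpp
// Hypothetical call site; constructor signature assumed for illustration.
//
// api::vTensor staging(
//     context,
//     tensor.sizes(),
//     tensor.dtype(),
//     tensor.storage_type(),
//     tensor.estimate_memory_layout());
//
// // The round-trip property this function guarantees:
// assert(staging.packed_dim_whcn_idx() == tensor.packed_dim_whcn_idx());
```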

const vkapi::BufferBindInfo vTensor::sizes_ubo() {
if (!sizes_uniform_.buffer()) {
sizes_uniform_ =
@@ -618,21 +600,16 @@ void vTensor::bind_allocation(const vkapi::Allocation& allocation) {

void vTensor::update_metadata() {
strides_ = calculate_strides(sizes_, dim_order_);
// Only update the memory layout for buffer-backed tensors. Strides are
// meaningless for texture-backed tensors and do not impact the memory layout.
if (storage_type() == utils::kBuffer) {
memory_layout_ = estimate_memory_layout(dim_order_);
}
numel_ = utils::multiply_integers(sizes_);

padded_sizes_ = calculate_padded_sizes(sizes_, memory_layout_);
padded_sizes_ = calculate_padded_sizes(sizes_, packed_dim_whcn_idx_);
unsqueezed_strides_ = unsqueeze_strides(strides_, numel_);
padded_numel_ = utils::multiply_integers(padded_sizes_);

// Calculate the image extents that would have been used to allocate a texture
// with the current sizes, and use that to set the logical limits.
set_logical_limits(
calculate_image_extents(padded_sizes_, axis_map_, memory_layout_));
calculate_image_extents(padded_sizes_, axis_map_, packed_dim_whcn_idx_));

if (sizes_uniform_.buffer()) {
sizes_uniform_.update(utils::make_whcn_ivec4(sizes_));
@@ -656,7 +633,7 @@ void vTensor::check_sizes(const std::vector<int64_t>& sizes) const {
// For texture storage check that the current texture is large enough for
// the new sizes of the tensor.
utils::uvec3 virtual_extents =
calculate_image_extents(padded_sizes_, axis_map_, memory_layout_);
calculate_image_extents(padded_sizes_, axis_map_, packed_dim_whcn_idx_);

bool valid_resize = virtual_extents[0] <= storage_.image_extents_[0];
valid_resize =
@@ -725,23 +702,23 @@ void transpose_dim_order_inplace(

void vTensor::virtual_transpose(const int64_t dim0, const int64_t dim1) {
std::iter_swap(sizes_.begin() + dim0, sizes_.begin() + dim1);

const int dim0_whcn = sizes_.size() - 1 - dim0;
const int dim1_whcn = sizes_.size() - 1 - dim1;
if (packed_dim_whcn_idx_ == dim0_whcn) {
packed_dim_whcn_idx_ = dim1_whcn;
} else if (packed_dim_whcn_idx_ == dim1_whcn) {
packed_dim_whcn_idx_ = dim0_whcn;
}

if (storage_type() == utils::kBuffer) {
transpose_dim_order_inplace(dim_order_, dim0, dim1);
} else {
const int dim0_whcn = sizes_.size() - 1 - dim0;
const int dim1_whcn = sizes_.size() - 1 - dim1;
// Cannot transpose batch dimension for texture storage
VK_CHECK_COND(dim0_whcn < 3 && dim1_whcn < 3);

std::iter_swap(
axis_map_.begin() + dim0_whcn, axis_map_.begin() + dim1_whcn);

if (packed_dim_whcn_idx() == dim0_whcn) {
memory_layout_ = utils::GPUMemoryLayout(dim1_whcn);
}
if (packed_dim_whcn_idx() == dim1_whcn) {
memory_layout_ = utils::GPUMemoryLayout(dim0_whcn);
}
}
update_metadata();
}
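A standalone sketch of the packed-dim bookkeeping above: when two NCHW dims are swapped, the packed dim index (in WHCN terms) follows whichever of the two dims it currently points at. Function and variable names are illustrative only:

```cpp
#include <cstdint>
#include <utility>
#include <vector>

void sketch_transpose_packed_dim(
    std::vector<int64_t>& sizes,
    int32_t& packed_dim_whcn_idx,
    const int64_t dim0,
    const int64_t dim1) {
  std::swap(sizes[dim0], sizes[dim1]);
  const int32_t dim0_whcn = static_cast<int32_t>(sizes.size() - 1 - dim0);
  const int32_t dim1_whcn = static_cast<int32_t>(sizes.size() - 1 - dim1);
  if (packed_dim_whcn_idx == dim0_whcn) {
    packed_dim_whcn_idx = dim1_whcn;
  } else if (packed_dim_whcn_idx == dim1_whcn) {
    packed_dim_whcn_idx = dim0_whcn;
  }
}

// A width-packed (idx 0) NCHW tensor of sizes {1, 3, 8, 16}, transposed over
// H and W (dim0 = 2, dim1 = 3), becomes {1, 3, 16, 8} and is height-packed
// (idx 1) afterwards.
```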
36 changes: 23 additions & 13 deletions backends/vulkan/runtime/api/containers/Tensor.h
@@ -26,7 +26,7 @@ namespace api {
*/
std::vector<int64_t> calculate_dim_order(
const size_t ndim,
const utils::GPUMemoryLayout memory_layout);
const int32_t packed_dim_whcn_idx);

/*
* Given the sizes of a tensor and the dim order of the tensor (both in NCHW)
@@ -57,15 +57,15 @@ std::vector<int64_t> unsqueeze_strides(
*/
std::vector<int64_t> calculate_padded_sizes(
const std::vector<int64_t>& sizes,
const utils::GPUMemoryLayout memory_layout);
const int32_t packed_dim_whcn_idx);

/*
* Calculate the image extents required of a texture backed tensor.
*/
utils::uvec3 calculate_image_extents(
const std::vector<int64_t>& padded_sizes,
const std::vector<int64_t>& axis_map,
const utils::GPUMemoryLayout memory_layout);
const int32_t packed_dim_whcn_idx);

struct LastAccess {
vkapi::PipelineStageFlags stage;
@@ -89,8 +89,8 @@ class vTensorStorage final {
vTensorStorage(
Context* context,
const utils::StorageType storage_type,
const utils::GPUMemoryLayout gpu_memory_layout,
const std::vector<int64_t>& axis_map,
const int32_t packed_dim_whcn_idx,
const std::vector<int64_t>& padded_sizes,
const vkapi::ScalarType dtype,
const bool allocate_memory = true);
@@ -221,13 +221,14 @@ class vTensor final {

// Whether the tensor has elements of type float, int, etc.
vkapi::ScalarType dtype_;
// Describes which dimension is "tightly packed". For texture backed tensors,
// this describes which dimension is packed along a texel. For buffer backed
// tensors, this describes which dimension has a stride of 1 (i.e. is last in
// the dim order).
utils::GPUMemoryLayout memory_layout_;
// sizes of the tensor in NCHW dimension order
std::vector<int64_t> sizes_;
// Describes which dimension is "tightly packed" using WHCN index (i.e. 0 for
// width, 1 for height, etc.). For texture backed tensors, this describes
// which dimension is packed along a texel. For buffer backed tensors, this
// describes which dimension has a stride of 1 (i.e. is last in the dim
// order).
int32_t packed_dim_whcn_idx_;

/*
* "Layout" metadata. These describe with further detail how tensor data is
@@ -371,12 +372,18 @@ class vTensor final {
return dtype_;
}

inline utils::GPUMemoryLayout gpu_memory_layout() const {
return memory_layout_;
}
/*
* Provide a "best guess" of a memory layout that can be used to construct a
* tensor with similar layout metadata (i.e. strides, axis_map, etc.) as this
* tensor. In some scenarios, the exact layout of the tensor may not be able
* to be replicated due to calling `virtual_*()` functions after construction;
* however, this function will provide a memory layout that will produce the
* same `packed_dim_whcn_idx` as this tensor.
*/
utils::GPUMemoryLayout estimate_memory_layout() const;

inline int32_t packed_dim_whcn_idx() const {
return static_cast<int32_t>(memory_layout_);
return packed_dim_whcn_idx_;
}

inline const std::vector<int64_t>& sizes() const {
@@ -496,6 +503,9 @@ class vTensor final {
*
* This function can only be used for buffer-backed tensors, since texture
* backed buffers cannot change dimensionality or memory layout.
*
* TODO(ssjia): delete this API. prefer functions such as virtual_transpose
* instead.
*/
void virtual_reconfigure(
const std::vector<int64_t>& new_sizes,
5 changes: 3 additions & 2 deletions backends/vulkan/runtime/graph/ComputeGraph.h
@@ -307,8 +307,9 @@ class ComputeGraph final {
.is_view_of(values_.at(base).toConstTensor());
}

inline utils::GPUMemoryLayout memory_layout_of(const ValueRef idx) const {
return values_.at(idx).toConstTensor().gpu_memory_layout();
inline utils::GPUMemoryLayout estimate_memory_layout_of(
const ValueRef idx) const {
return values_.at(idx).toConstTensor().estimate_memory_layout();
}

inline int32_t packed_dim_whcn_idx_of(const ValueRef idx) const {
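A hypothetical op-authoring sketch showing how these two accessors are typically combined, kept as commented pseudocode since the surrounding op boilerplate is omitted; names other than the two accessors are assumptions:

```cpp
// void add_example_op_node(ComputeGraph& graph, ValueRef in, ValueRef out) {
//   // Use the estimated layout only as a hint, e.g. when prepacking a
//   // constant so it matches the input tensor's packed dim.
//   const utils::GPUMemoryLayout layout_hint =
//       graph.estimate_memory_layout_of(in);
//
//   // Compare packed dims directly when validating arguments, since ops
//   // generally only require the packed dims to agree.
//   VK_CHECK_COND(
//       graph.packed_dim_whcn_idx_of(in) == graph.packed_dim_whcn_idx_of(out));
// }
```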
4 changes: 2 additions & 2 deletions backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp
@@ -21,7 +21,7 @@ void check_binary_op_args(
const api::vTensor& self,
const api::vTensor& other,
const api::vTensor& out) {
VK_CHECK_COND(check_same_memory_layout(self, other, out));
VK_CHECK_COND(check_same_packed_dim(self, other, out));
std::vector<int64_t> broadcasted_sizes =
calculate_broadcasted_output_size(self, other);
VK_CHECK_COND(out.sizes() == broadcasted_sizes);
@@ -53,7 +53,7 @@ void add_binary_op_node(
const std::string& op_name) {
ValueRef arg1 = prepack_if_tensor_ref(graph, in1);
ValueRef arg2 =
prepack_if_tensor_ref(graph, in2, graph.memory_layout_of(arg1));
prepack_if_tensor_ref(graph, in2, graph.estimate_memory_layout_of(arg1));

vTensorPtr t_in1 = graph.get_tensor(arg1);
vTensorPtr t_in2 = graph.get_tensor(arg2);
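The implementation of `check_same_packed_dim` is not part of this diff; below is a minimal sketch of what such a helper could look like, assuming it only compares packed dim indices (it also assumes the vTensor header from this change is available):

```cpp
// Sketch under the stated assumption; not the actual helper's implementation.
bool sketch_check_same_packed_dim(
    const api::vTensor& t1,
    const api::vTensor& t2,
    const api::vTensor& t3) {
  return t1.packed_dim_whcn_idx() == t2.packed_dim_whcn_idx() &&
      t1.packed_dim_whcn_idx() == t3.packed_dim_whcn_idx();
}
```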
2 changes: 1 addition & 1 deletion backends/vulkan/runtime/graph/ops/impl/Cat.cpp
@@ -25,7 +25,7 @@ void add_cat_default_node(

for (ValueRef input_ref : *input_list) {
vTensorPtr t_in = graph.get_tensor(input_ref);
VK_CHECK_COND(check_memory_layout_is(*t_in, utils::kChannelsPacked));
VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim));
}

int64_t dim = graph.extract_scalar<int64_t>(dim_ref);
4 changes: 2 additions & 2 deletions backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -222,8 +222,8 @@ ValueRef prepack_weights(
}

void check_conv_args(const api::vTensor& in, const api::vTensor& out) {
VK_CHECK_COND(check_memory_layout_is(in, utils::kChannelsPacked));
VK_CHECK_COND(check_memory_layout_is(out, utils::kChannelsPacked));
VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim));
VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim));
}

struct Conv2dParams final {