backends/vulkan/runtime/api/containers/Tensor.cpp (204 changes: 140 additions & 64 deletions)
@@ -13,36 +13,15 @@
 namespace vkcompute {
 namespace api {

-/*
- * Given the strides of a buffer-backed tensor, find the index of the "fastest
- * moving" dimension in WHCN dimension order. If multiple dims have the lowest
- * stride, then the "earlier" dim is assumed to be the fastest moving (width is
- * "earlier" than height).
- */
-int32_t find_fastest_whcn_dim(const std::vector<int64_t>& strides) {
-  if (strides.size() == 0) {
-    return 0;
-  }
-  int32_t fastest_dim = 0;
-  int64_t min_stride = strides.at(0);
-  for (int d = strides.size() - 1; d >= 0; --d) {
-    if (strides.at(d) < min_stride) {
-      fastest_dim = d;
-      min_stride = strides.at(d);
-    }
-  }
-  return (strides.size() - 1 - fastest_dim);
-}
-
 /*
  * Given the strides of a buffer-backed tensor, estimate the equivalent memory
  * layout enum value by identifying the fastest moving dimension.
  */
 utils::GPUMemoryLayout estimate_memory_layout(
-    const std::vector<int64_t>& strides) {
-  int32_t fastest_dim = find_fastest_whcn_dim(strides);
-  if (fastest_dim <= 3) {
-    return utils::GPUMemoryLayout(fastest_dim);
+    const std::vector<int64_t>& dim_order) {
+  int64_t fastest_dim_whcn = dim_order.size() - 1 - dim_order.back();
+  if (fastest_dim_whcn >= 0 && fastest_dim_whcn < 3) {
+    return utils::GPUMemoryLayout(fastest_dim_whcn);
   }

   // TODO(ssjia) find a way to gracefully recover from this case by i.e. adding
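Note: the rewritten `estimate_memory_layout` no longer scans strides; it reads the fastest moving dim straight off the dim order, since `dim_order.back()` is the NCHW index of the innermost dim. A minimal sketch of the conversion, assuming `GPUMemoryLayout` numbers width-, height-, and channels-packed layouts as 0, 1, 2:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Contiguous NCHW: width (NCHW index 3) moves fastest.
  std::vector<int64_t> dim_order = {0, 1, 2, 3};
  // WHCN index = ndim - 1 - NCHW index = 4 - 1 - 3 = 0 -> width-packed.
  assert(dim_order.size() - 1 - dim_order.back() == 0);

  // Channels-last: channels (NCHW index 1) moves fastest.
  dim_order = {0, 2, 3, 1};
  // 4 - 1 - 1 = 2 -> channels-packed.
  assert(dim_order.size() - 1 - dim_order.back() == 2);
  return 0;
}
```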
@@ -51,41 +30,70 @@ utils::GPUMemoryLayout estimate_memory_layout(
   VK_THROW("No compatible GPUMemoryLayout value");
 }

+std::vector<int64_t> calculate_dim_order(
+    const size_t ndim,
+    const utils::GPUMemoryLayout memory_layout) {
+  // Special case for zero dim tensors
+  if (ndim == 0) {
+    return {0};
+  }
+  std::vector<int64_t> dim_order(ndim);
+  int64_t last_dim =
+      ndim - utils::to_packed_dim_nchw_offset<int64_t>(memory_layout);
+
+  int64_t cur_dim = 0;
+  for (int d = 0; d < ndim; ++d) {
+    if (d == last_dim) {
+      cur_dim++;
+    }
+    dim_order[d] = cur_dim;
+    cur_dim++;
+  }
+  if (last_dim >= 0) {
+    dim_order[ndim - 1] = last_dim;
+  }
+
+  return dim_order;
+}
+
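Note: `calculate_dim_order` emits all dims in NCHW order except the packed dim, which is rotated to the back so it becomes the fastest moving. A worked trace, assuming `utils::to_packed_dim_nchw_offset` returns 1, 2, 3 for width-, height-, and channels-packed layouts respectively (so `last_dim` is the NCHW index of the packed dim):

```cpp
// ndim = 4, channels-packed => last_dim = 4 - 3 = 1 (the channels dim):
//   d = 0: dim_order[0] = 0
//   d = 1: hits last_dim, cur_dim skips over 1 -> dim_order[1] = 2
//   d = 2: dim_order[2] = 3
//   d = 3: dim_order[3] = 4 (placeholder)
//   fixup: dim_order[3] = last_dim = 1
// result: {0, 2, 3, 1} -- channels becomes the innermost dim.
//
// ndim = 4, width-packed => last_dim = 3, and the fixup is a no-op:
// result: {0, 1, 2, 3} (standard contiguous order).
```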
 std::vector<int64_t> calculate_strides(
     const std::vector<int64_t>& sizes,
-    const utils::GPUMemoryLayout memory_layout) {
+    const std::vector<int64_t>& dim_order) {
   // For zero dim tensors
   if (sizes.size() == 0) {
     return {1};
   }

-  const int64_t dim_offset =
-      utils::to_packed_dim_nchw_offset<int64_t>(memory_layout);
-  int64_t last_dim = sizes.size() - dim_offset;
-  if (last_dim < 0) {
-    last_dim = sizes.size() - 1;
-  }
-
   size_t ndim = sizes.size();
   std::vector<int64_t> strides(ndim);

-  const int64_t last_dim_size = sizes.at(last_dim);
-
-  for (int stride_d = ndim - 1; stride_d >= 0; stride_d--) {
-    strides.at(stride_d) = 1;
-    if (stride_d == last_dim) {
-      continue;
-    }
-    strides.at(stride_d) = last_dim_size;
-    for (int size_d = ndim - 1; size_d > stride_d; size_d--) {
-      if (size_d != last_dim) {
-        strides.at(stride_d) *= sizes.at(size_d);
-      }
-    }
-  }
+  strides[dim_order[ndim - 1]] = 1;
+  for (int32_t i = ndim - 2; i >= 0; --i) {
+    if (sizes[dim_order[i + 1]] == 0) {
+      strides[dim_order[i]] = strides[dim_order[i + 1]];
+    } else {
+      strides[dim_order[i]] =
+          strides[dim_order[i + 1]] * sizes[dim_order[i + 1]];
+    }
+  }

   return strides;
 }

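Note: the new stride computation is the standard innermost-outward running product over the dim order, with a guard that treats size-0 dims as size 1 so strides stay well-formed for empty tensors. A sketch of the expected outputs (`strides_for` is a hypothetical helper mirroring the diff's loop; the zero-dim special case is omitted):

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// dim_order lists NCHW dims from outermost (slowest) to innermost (fastest).
std::vector<int64_t> strides_for(
    const std::vector<int64_t>& sizes,
    const std::vector<int64_t>& dim_order) {
  const size_t ndim = sizes.size();
  std::vector<int64_t> strides(ndim);
  strides[dim_order[ndim - 1]] = 1;
  for (int32_t i = ndim - 2; i >= 0; --i) {
    // max(size, 1) reproduces the size-0 branch in the diff.
    strides[dim_order[i]] = strides[dim_order[i + 1]] *
        std::max<int64_t>(sizes[dim_order[i + 1]], 1);
  }
  return strides;
}

int main() {
  // NCHW sizes {2, 3, 4, 5}:
  assert((strides_for({2, 3, 4, 5}, {0, 1, 2, 3}) ==
          std::vector<int64_t>{60, 20, 5, 1})); // contiguous
  assert((strides_for({2, 3, 4, 5}, {0, 2, 3, 1}) ==
          std::vector<int64_t>{60, 1, 15, 3})); // channels-last
  return 0;
}
```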
+bool dim_order_is_valid(const std::vector<int64_t>& dim_order) {
+  int64_t sum = 0;
+  for (size_t i = 0; i < dim_order.size(); ++i) {
+    if (dim_order[i] < 0 || dim_order[i] >= dim_order.size()) {
+      return false;
+    }
+    sum += dim_order[i];
+  }
+  int64_t n = static_cast<int64_t>(dim_order.size() - 1);
+  // Sanity check that the sum of the indices in the vector is equal to the sum
+  // of 0 + 1 + 2 + ... + (ndim - 1)
+  return sum == n * (n + 1) / 2;
+}
+
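Note: the bounds check plus the triangular-number sum is a cheap sanity check rather than an exact permutation test; e.g. `{0, 2, 2, 2}` stays in bounds and sums to 0 + 1 + 2 + 3 = 6, yet is not a valid dim order. Should an exact test ever be needed, a sketch using a seen-bitmask (a hypothetical helper, fine for ndim <= 64):

```cpp
#include <cstdint>
#include <vector>

// Hypothetical strict variant of dim_order_is_valid: also rejects duplicates.
bool dim_order_is_permutation(const std::vector<int64_t>& dim_order) {
  uint64_t seen = 0;
  for (const int64_t d : dim_order) {
    if (d < 0 || d >= static_cast<int64_t>(dim_order.size()) ||
        (seen & (1ull << d))) {
      return false; // out of range, or the same dim listed twice
    }
    seen |= (1ull << d);
  }
  return true;
}
```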
 std::vector<int64_t> unsqueeze_strides(
     const std::vector<int64_t>& strides,
     const int64_t numel) {

@@ -170,7 +178,8 @@ vTensor::vTensor(
       memory_layout_(memory_layout),
       // Calculate tensor size metadata
       sizes_(sizes.begin(), sizes.end()),
-      strides_(calculate_strides(sizes, memory_layout_)),
+      dim_order_(calculate_dim_order(sizes_.size(), memory_layout_)),
+      strides_(calculate_strides(sizes, dim_order_)),
       numel_(utils::multiply_integers(sizes_)),
       padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)},
       unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
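Note: `strides_` is now computed from `dim_order_` inside the member init list, which relies on `dim_order_` being declared before `strides_` in the class definition, since C++ initializes members in declaration order rather than init-list order. A sketch of the assumed declaration order (not shown in this diff) in `Tensor.h`:

```cpp
std::vector<int64_t> sizes_;
std::vector<int64_t> dim_order_; // must be declared before strides_
std::vector<int64_t> strides_;   // initialized from dim_order_ above
```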
@@ -189,6 +198,9 @@ vTensor::vTensor(
       padded_sizes_,
       dtype_,
       allocate_memory) {
+  VK_CHECK_COND(
+      dim_order_is_valid(dim_order_), "computed dim order is invalid");
+
   if (storage_type != utils::kBuffer) {
     texture_limits_.limits = utils::ivec3{
         utils::safe_downcast<int32_t>(storage_.image_extents_[0]),
@@ -204,16 +216,39 @@
   }
 }

+vTensor::vTensor(const vTensor& other)
+    : dtype_(other.dtype_),
+      memory_layout_(other.memory_layout_),
+      // Copy tensor size metadata
+      sizes_(other.sizes_.begin(), other.sizes_.end()),
+      dim_order_(other.dim_order_.begin(), other.dim_order_.end()),
+      strides_(other.strides_.begin(), other.strides_.end()),
+      numel_(other.numel_),
+      padded_sizes_{other.padded_sizes_.begin(), other.padded_sizes_.end()},
+      unsqueezed_strides_{
+          other.unsqueezed_strides_.begin(),
+          other.unsqueezed_strides_.end()},
+      padded_numel_(other.padded_numel_),
+      texture_limits_{other.texture_limits_},
+      // Empty initialize Utility Uniform Buffers
+      sizes_uniform_(),
+      strides_uniform_(),
+      numel_uniform_(),
+      texture_limits_uniform_(),
+      // Copy Tensor storage
+      storage_(other.storage_) {}
+
 vTensor::vTensor(
     const vTensor& other,
     const std::vector<int64_t>& sizes,
-    const std::vector<int64_t>& strides,
-    const size_t offset_numel)
+    const std::vector<int64_t>& dim_order,
+    const int64_t offset_numel)
     : dtype_(other.dtype_),
-      memory_layout_(estimate_memory_layout(strides)),
+      memory_layout_(estimate_memory_layout(dim_order)),
       // Copy tensor size metadata
       sizes_(sizes.begin(), sizes.end()),
-      strides_(strides.begin(), strides.end()),
+      dim_order_(dim_order.begin(), dim_order.end()),
+      strides_(calculate_strides(sizes_, dim_order_)),
       numel_(utils::multiply_integers(sizes_)),
       padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)},
       unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
@@ -226,6 +261,8 @@ vTensor::vTensor(
       texture_limits_uniform_(),
       // Copy Tensor storage
       storage_(other.storage_, vkapi::element_size(dtype_) * offset_numel) {
+  VK_CHECK_COND(
+      dim_order_is_valid(dim_order_), "new dim order provided is invalid");
   VK_CHECK_COND(
       offset_numel + numel_ <= other.numel(),
       "Tensor alias cannot access more elements than available in the original"
@@ -339,9 +376,17 @@ void vTensor::bind_allocation(const vkapi::Allocation& allocation) {
   }
 }

-void vTensor::update_size_metadata(const std::vector<int64_t>& new_sizes) {
+void vTensor::update_metadata(
+    const std::vector<int64_t>& new_sizes,
+    const std::vector<int64_t>& new_dim_order) {
   sizes_ = new_sizes;
-  strides_ = calculate_strides(new_sizes, memory_layout_);
+  dim_order_ = new_dim_order;
+  strides_ = calculate_strides(sizes_, dim_order_);
+  // Only update the memory layout for buffer-backed tensors. Strides are
+  // meaningless for texture-backed tensors and do not impact the memory layout.
+  if (storage_type() == utils::kBuffer) {
+    memory_layout_ = estimate_memory_layout(dim_order_);
+  }
   numel_ = utils::multiply_integers(sizes_);

   padded_sizes_ = calculate_padded_sizes(sizes_, memory_layout_);
@@ -373,15 +418,7 @@ void vTensor::update_size_metadata(const std::vector<int64_t>& new_sizes) {
   }
 }

-void vTensor::reallocate(const std::vector<int64_t>& new_sizes) {
-  update_size_metadata(new_sizes);
-  storage_.discard_and_reallocate(
-      calculate_padded_sizes(new_sizes, memory_layout_),
-      memory_layout_,
-      dtype_);
-}
-
-void vTensor::virtual_resize(const std::vector<int64_t>& new_sizes) {
+void vTensor::check_sizes(const std::vector<int64_t>& sizes) const {
   if (storage_type() != utils::kBuffer) {
     // For texture storage check that the current texture is large enough for
     // the new sizes of the tensor.
@@ -394,10 +431,47 @@ void vTensor::virtual_resize(const std::vector<int64_t>& new_sizes) {

     VK_CHECK_COND(
         valid_resize,
-        "Cannot use virtual resize if new sizes requires a larger texture.");
+        "tensor sizes requires a larger texture than the current one.");
+  } else {
+    // For buffer storage check that the current buffer is large enough for the
+    // new sizes of the tensor.
+    int64_t numel = utils::multiply_integers(sizes);
+    bool valid_resize =
+        numel + storage_.buffer_offset_ <= storage_.buffer_length_;
+    VK_CHECK_COND(
+        valid_resize,
+        "tensor sizes requires a larger buffer than the current one.");
+  }
 }

+void vTensor::virtual_reconfigure(
+    const std::vector<int64_t>& new_sizes,
+    const std::vector<int64_t>& new_dim_order) {
+  VK_CHECK_COND(
+      storage_type() == utils::kBuffer,
+      "virtual_reconfigure is only applicable for buffer backed tensors");
+  VK_CHECK_COND(new_sizes.size() == new_dim_order.size());
+  VK_CHECK_COND(dim_order_is_valid(new_dim_order));
+
-  update_size_metadata(new_sizes);
+  check_sizes(new_sizes);
+  update_metadata(new_sizes, new_dim_order);
+}
+
+void vTensor::virtual_resize(const std::vector<int64_t>& new_sizes) {
+  VK_CHECK_COND(
+      new_sizes.size() == dim_order_.size(),
+      "new sizes cannot modify the dimensionality of the tensor ");
+
+  check_sizes(new_sizes);
+  update_metadata(new_sizes, dim_order_);
+}
+
+void vTensor::reallocate(const std::vector<int64_t>& new_sizes) {
+  update_metadata(new_sizes, dim_order_);
+  storage_.discard_and_reallocate(
+      calculate_padded_sizes(new_sizes, memory_layout_),
+      memory_layout_,
+      dtype_);
+}
+
 //
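Note: with this split, `check_sizes` centralizes the capacity checks while the three public entry points differ in what they are allowed to change. A usage sketch (the buffer-backed tensor `t` and its sizes are hypothetical; the signatures are the ones added in this diff):

```cpp
// Buffer-backed tensor t with sizes {2, 3} and contiguous dim order {0, 1}.

// virtual_resize: new sizes, same dim order and dimensionality. Metadata-only,
// so the existing buffer must already be large enough.
t.virtual_resize({2, 2});

// virtual_reconfigure: reinterpret the same buffer with new sizes and a new
// dim order, e.g. a zero-copy transpose: sizes {3, 2} with dim order {1, 0}
// yields strides {1, 3}.
t.virtual_reconfigure({3, 2}, {1, 0});

// reallocate: discard the old buffer and allocate one sized for the new
// sizes, keeping the current dim order.
t.reallocate({4, 8});
```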
@@ -480,6 +554,7 @@ vTensorStorage::vTensorStorage(
       storage_type_{storage_type},
       image_extents_(calculate_image_extents(padded_sizes, gpu_memory_layout)),
       buffer_length_{utils::multiply_integers(padded_sizes)},
+      buffer_offset_{0},
       image_(allocate_image(
           context_,
           image_extents_,
@@ -496,11 +571,12 @@

 vTensorStorage::vTensorStorage(
     const vTensorStorage& other,
-    const size_t buffer_offset)
+    const int64_t buffer_offset)
     : context_(other.context_),
       storage_type_{other.storage_type_},
       image_extents_(other.image_extents_),
       buffer_length_{other.buffer_length_},
+      buffer_offset_{buffer_offset},
       image_(),
       buffer_(other.buffer_, buffer_offset),
       last_access_{other.last_access_} {