From 49602cd2f97952ce4b34501b1bdb4debede2741c Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Tue, 30 Sep 2025 01:10:05 -0700 Subject: [PATCH 1/2] tensor destroy Summary: This diff introduce `aoti_torch_delete_tensor_object` for deleting tensors created during cuda backend inference. Reviewed By: [ghstack-poisoned] --- backends/cuda/runtime/shims/memory.cpp | 69 ++- backends/cuda/runtime/shims/memory.h | 10 +- backends/cuda/runtime/shims/tests/targets.bzl | 1 + .../test_aoti_torch_delete_tensor_object.cpp | 454 ++++++++++++++++++ 4 files changed, 532 insertions(+), 2 deletions(-) create mode 100644 backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object.cpp diff --git a/backends/cuda/runtime/shims/memory.cpp b/backends/cuda/runtime/shims/memory.cpp index 99d936e32ca..12a1d59e5e1 100644 --- a/backends/cuda/runtime/shims/memory.cpp +++ b/backends/cuda/runtime/shims/memory.cpp @@ -123,11 +123,78 @@ AOTITorchError aoti_torch_empty_strided( return Error::Ok; } -// TODO(gasoonjia): reuse aoti_torch_delete_tensor_object to destory tensors void clear_all_tensors() { + // Use aoti_torch_delete_tensor_object to properly delete each tensor + // Note: We need to collect tensor pointers first since deletion modifies the + // set + auto old_tensors = + std::move(tensors); // tensors is now empty and no need to copy + for (const auto& tensor_shared : old_tensors) { + aoti_torch_delete_tensor_object(tensor_shared.get()); + } + + // tensors set should now be empty, but ensure it's cleared tensors.clear(); } +AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor) { + // Handle null tensor pointer + if (tensor == nullptr) { + ET_LOG(Error, "Cannot delete null tensor"); + return Error::InvalidArgument; + } + + // Check if tensor exists in our tracking + bool found_in_tensors = false; + for (auto it = tensors.begin(); it != tensors.end(); ++it) { + if (it->get() == tensor) { + found_in_tensors = true; + break; + } + } + + // If tensor not found in our tracking, it's invalid + if (!found_in_tensors) { + ET_LOG(Error, "Didn't find tensor %p", tensor); + return Error::InvalidArgument; + } + + // Find and delete the tensor + for (auto it = tensors.begin(); it != tensors.end(); ++it) { + if (it->get() == tensor) { + // Get the tensor before erasing + auto tensor_ptr = *it; + + void* data_ptr = tensor_ptr->mutable_data_ptr(); + + // Determine if it's GPU memory + cudaPointerAttributes attributes{}; + ET_CUDA_CHECK_OR_RETURN_ERROR( + cudaPointerGetAttributes(&attributes, data_ptr)); + + // et tensor does not own data; need to free them manually. 
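+      // attributes.type distinguishes CUDA managed (unified) allocations from
+      // plain host memory, so the matching deallocation path (cudaFree vs.
+      // free) is chosen below.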
+ if (attributes.type == cudaMemoryTypeManaged) { + // This is CUDA managed memory - free with proper synchronization + ET_CUDA_CHECK_OR_RETURN_ERROR( + cudaDeviceSynchronize()); // Wait for all operations to complete + // BEFORE freeing + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaFree(data_ptr)); + } else { + // This is CPU memory - free immediately + free(data_ptr); + } + // Remove from set (this will call the destructor if it's the last + // reference) + tensors.erase(it); + return Error::Ok; + } + } + + // This should never be reached since we found it above + ET_LOG(Error, "Internal error: tensor not found after validation"); + return Error::Internal; +} + } // extern "C" } // namespace cuda diff --git a/backends/cuda/runtime/shims/memory.h b/backends/cuda/runtime/shims/memory.h index 2fdfdd8a72c..93bd9c30e70 100644 --- a/backends/cuda/runtime/shims/memory.h +++ b/backends/cuda/runtime/shims/memory.h @@ -44,8 +44,16 @@ AOTITorchError aoti_torch_empty_strided( int32_t device_index, Tensor** ret_new_tensor); +/** + * Deletes a tensor object and frees its associated memory. + * + * @param tensor Pointer to the tensor object to be deleted + * @return AOTITorchError error code (Error::Ok on success, or an error code on + * failure) + */ +AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor); + // Function to clear all tensors from internal storage -// TODO(gasoonjia): reuse aoti_torch_delete_tensor_object to destory tensors void clear_all_tensors(); } // extern "C" diff --git a/backends/cuda/runtime/shims/tests/targets.bzl b/backends/cuda/runtime/shims/tests/targets.bzl index 5737bdb00ab..1db52ce1b97 100644 --- a/backends/cuda/runtime/shims/tests/targets.bzl +++ b/backends/cuda/runtime/shims/tests/targets.bzl @@ -28,3 +28,4 @@ def define_common_targets(): TARGETS and BUCK files that call this function. """ cuda_shim_cpp_unittest("aoti_torch_empty_strided") + cuda_shim_cpp_unittest("aoti_torch_delete_tensor_object") diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object.cpp new file mode 100644 index 00000000000..eceb141e9ca --- /dev/null +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object.cpp @@ -0,0 +1,454 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace executorch::backends::aoti; +using namespace executorch::backends::cuda; +using namespace executorch::runtime; +using executorch::runtime::etensor::Tensor; + +// Test fixture for aoti_torch_delete_tensor_object tests +class AOTITorchDeleteTensorObjectTest : public ::testing::Test { + protected: + void SetUp() override { + // Initialize ExecuTorch Platform Abstraction Layer + et_pal_init(); + + // Check if CUDA is available + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + if (err != cudaSuccess || device_count == 0) { + GTEST_SKIP() << "CUDA not available, skipping CUDA tests"; + } + + // Clean up any existing cached metadata before each test + cleanup_tensor_metadata(); + + // Clear any remaining tensors from previous tests + clear_all_tensors(); + } + + void TearDown() override { + // Clean up metadata + cleanup_tensor_metadata(); + + // Clear the global tensor storage using the provided function + clear_all_tensors(); + } + + // Helper to create test tensors + Tensor* create_test_tensor( + const std::vector& sizes, + const std::vector& strides = {}, + int32_t dtype = 6, // float32 + int32_t device_type = 1, // CUDA + int32_t device_index = 0) { + Tensor* tensor; + + const int64_t* strides_ptr = strides.empty() ? nullptr : strides.data(); + + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + strides_ptr, + dtype, + device_type, + device_index, + &tensor); + + return (error == Error::Ok) ? tensor : nullptr; + } +}; + +// Test basic deletion of CUDA tensor +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteCudaTensorBasic) { + // Create a CUDA tensor + std::vector sizes = {2, 3}; + Tensor* tensor = create_test_tensor(sizes, {}, 6, 1, 0); // CUDA device + ASSERT_NE(tensor, nullptr); + + // Verify tensor properties before deletion + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 3); + + // Delete the tensor + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); +} + +// Test basic deletion of CPU tensor +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteCpuTensorBasic) { + // Create a CPU tensor + std::vector sizes = {3, 4}; + Tensor* tensor = create_test_tensor(sizes, {}, 6, 0, 0); // CPU device + ASSERT_NE(tensor, nullptr); + + // Verify tensor properties before deletion + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 3); + EXPECT_EQ(tensor->size(1), 4); + + // Delete the tensor + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); +} + +// Test deletion of null tensor pointer +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteNullTensor) { + AOTITorchError error = aoti_torch_delete_tensor_object(nullptr); + EXPECT_EQ(error, Error::InvalidArgument); +} + +// Test deletion of tensor not in tracking system +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteUntrackedTensor) { + // Create a tensor and then clear the tracking system + std::vector sizes = {2, 3}; + Tensor* tensor = create_test_tensor(sizes); + ASSERT_NE(tensor, nullptr); + + // Clear the tracking system (simulating an untracked tensor) + clear_all_tensors(); + + // Try to delete the tensor - should fail + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::InvalidArgument); +} + +// Test deletion of multiple tensors +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteMultipleTensors) { + // Create 
multiple tensors + std::vector tensors; + + for (int i = 1; i <= 5; i++) { + std::vector sizes = {i, i + 1}; + Tensor* tensor = create_test_tensor(sizes); + ASSERT_NE(tensor, nullptr); + tensors.push_back(tensor); + } + + // Delete all tensors + for (Tensor* tensor : tensors) { + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); + } +} + +// Test deletion of zero-sized tensors +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteZeroSizedTensor) { + // Create a zero-sized tensor + std::vector sizes = {0, 5}; + Tensor* tensor = create_test_tensor(sizes); + ASSERT_NE(tensor, nullptr); + + // Verify tensor properties + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 0); + EXPECT_EQ(tensor->size(1), 5); + + // Delete the tensor + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); +} + +// Test deletion of scalar (0D) tensors +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteScalarTensor) { + // Create a scalar tensor + std::vector sizes = {}; + Tensor* tensor = create_test_tensor(sizes); + ASSERT_NE(tensor, nullptr); + + // Verify tensor properties + EXPECT_EQ(tensor->dim(), 0); + + // Delete the tensor + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); +} + +// Test deletion of large multi-dimensional tensors +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteLargeTensor) { + // Create a large multi-dimensional tensor + std::vector sizes = {10, 20, 30}; + Tensor* tensor = create_test_tensor(sizes); + ASSERT_NE(tensor, nullptr); + + // Verify tensor properties + EXPECT_EQ(tensor->dim(), 3); + EXPECT_EQ(tensor->size(0), 10); + EXPECT_EQ(tensor->size(1), 20); + EXPECT_EQ(tensor->size(2), 30); + + // Delete the tensor + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); +} + +// Test deletion of tensors with custom strides +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteTensorWithCustomStrides) { + // Create tensor with custom strides + std::vector sizes = {3, 4}; + std::vector strides = {4, 1}; // Row-major strides + Tensor* tensor = create_test_tensor(sizes, strides); + ASSERT_NE(tensor, nullptr); + + // Verify tensor properties + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 3); + EXPECT_EQ(tensor->size(1), 4); + + // Delete the tensor + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); +} + +// Test deletion after accessing tensor data +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteAfterDataAccess) { + // Create a tensor + std::vector sizes = {2, 3}; + Tensor* tensor = create_test_tensor(sizes); + ASSERT_NE(tensor, nullptr); + + // Access tensor data (this should not prevent deletion) + void* data_ptr = tensor->mutable_data_ptr(); + EXPECT_NE(data_ptr, nullptr); + + // Delete the tensor + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); +} + +// Test double deletion (should fail on second attempt) +TEST_F(AOTITorchDeleteTensorObjectTest, DoubleDeletion) { + // Create a tensor + std::vector sizes = {2, 3}; + Tensor* tensor = create_test_tensor(sizes); + ASSERT_NE(tensor, nullptr); + + // First deletion should succeed + AOTITorchError error1 = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error1, Error::Ok); + + // Second deletion should fail (tensor no longer tracked) + AOTITorchError error2 = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error2, Error::InvalidArgument); +} + +// Test deletion of tensors on 
both CUDA and CPU devices +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteMixedDeviceTensors) { + // Create CUDA tensor + std::vector sizes = {2, 3}; + Tensor* cuda_tensor = create_test_tensor(sizes, {}, 6, 1, 0); + ASSERT_NE(cuda_tensor, nullptr); + + // Create CPU tensor + Tensor* cpu_tensor = create_test_tensor(sizes, {}, 6, 0, 0); + ASSERT_NE(cpu_tensor, nullptr); + + // Delete both tensors + AOTITorchError cuda_error = aoti_torch_delete_tensor_object(cuda_tensor); + EXPECT_EQ(cuda_error, Error::Ok); + + AOTITorchError cpu_error = aoti_torch_delete_tensor_object(cpu_tensor); + EXPECT_EQ(cpu_error, Error::Ok); +} + +// Test memory consistency after deletion +TEST_F(AOTITorchDeleteTensorObjectTest, MemoryConsistencyAfterDeletion) { + // Create multiple tensors + std::vector tensors; + const int num_tensors = 10; + + for (int i = 0; i < num_tensors; i++) { + std::vector sizes = {i + 1, i + 2}; + Tensor* tensor = create_test_tensor(sizes); + ASSERT_NE(tensor, nullptr); + tensors.push_back(tensor); + } + + // Delete every other tensor + for (int i = 0; i < num_tensors; i += 2) { + AOTITorchError error = aoti_torch_delete_tensor_object(tensors[i]); + EXPECT_EQ(error, Error::Ok); + } + + // Delete remaining tensors + for (int i = 1; i < num_tensors; i += 2) { + AOTITorchError error = aoti_torch_delete_tensor_object(tensors[i]); + EXPECT_EQ(error, Error::Ok); + } +} + +// Test stress deletion with many small tensors +TEST_F(AOTITorchDeleteTensorObjectTest, StressDeletionManySmallTensors) { + const int num_tensors = 100; + std::vector tensors; + + // Create many small tensors + for (int i = 0; i < num_tensors; i++) { + std::vector sizes = {1, 1}; // Minimal size + Tensor* tensor = create_test_tensor(sizes); + if (tensor != nullptr) { + tensors.push_back(tensor); + } + } + + // Delete all created tensors + for (Tensor* tensor : tensors) { + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); + } +} + +// Test CUDA synchronization during deletion +TEST_F(AOTITorchDeleteTensorObjectTest, CudaSynchronizationDuringDeletion) { + // Create a larger CUDA tensor to ensure memory allocation + std::vector sizes = {100, 100}; + Tensor* tensor = create_test_tensor(sizes, {}, 6, 1, 0); // CUDA device + ASSERT_NE(tensor, nullptr); + + // Delete the tensor (should handle synchronization internally) + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); + + // Verify CUDA state is still good + cudaError_t cuda_error = cudaGetLastError(); + EXPECT_EQ(cuda_error, cudaSuccess); +} + +// Test specific deletion of bfloat16 tensors +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteBFloat16Tensor) { + // Test 1D bfloat16 tensor deletion + std::vector sizes_1d = {10}; + Tensor* tensor_bf16_1d = create_test_tensor( + sizes_1d, + {}, + static_cast(SupportedDTypes::BFLOAT16), + 1, // CUDA device + 0); + ASSERT_NE(tensor_bf16_1d, nullptr); + + // Verify it's bfloat16 before deletion + int32_t actual_dtype; + EXPECT_EQ(aoti_torch_get_dtype(tensor_bf16_1d, &actual_dtype), Error::Ok); + EXPECT_EQ(actual_dtype, static_cast(SupportedDTypes::BFLOAT16)) + << "Expected bfloat16 dtype (" + << static_cast(SupportedDTypes::BFLOAT16) << "), got " + << actual_dtype; + + // Verify element size (bfloat16 should be 2 bytes per element) + EXPECT_EQ(tensor_bf16_1d->element_size(), 2); + + // Delete the bfloat16 tensor + AOTITorchError error = aoti_torch_delete_tensor_object(tensor_bf16_1d); + EXPECT_EQ(error, Error::Ok); + + // Test 2D bfloat16 tensor deletion 
with custom strides + std::vector sizes_2d = {4, 6}; + std::vector strides_2d = {6, 1}; // Row-major strides + Tensor* tensor_bf16_2d = create_test_tensor( + sizes_2d, + strides_2d, + static_cast(SupportedDTypes::BFLOAT16), + 1, // CUDA device + 0); + ASSERT_NE(tensor_bf16_2d, nullptr); + + // Verify tensor properties + EXPECT_EQ(tensor_bf16_2d->dim(), 2); + EXPECT_EQ(tensor_bf16_2d->size(0), 4); + EXPECT_EQ(tensor_bf16_2d->size(1), 6); + EXPECT_EQ(tensor_bf16_2d->element_size(), 2); + + // Verify it's bfloat16 + int32_t dtype_2d; + EXPECT_EQ(aoti_torch_get_dtype(tensor_bf16_2d, &dtype_2d), Error::Ok); + EXPECT_EQ(dtype_2d, static_cast(SupportedDTypes::BFLOAT16)); + + // Delete the 2D bfloat16 tensor + error = aoti_torch_delete_tensor_object(tensor_bf16_2d); + EXPECT_EQ(error, Error::Ok); + + // Test 3D bfloat16 tensor deletion + std::vector sizes_3d = {2, 3, 4}; + Tensor* tensor_bf16_3d = create_test_tensor( + sizes_3d, + {}, + static_cast(SupportedDTypes::BFLOAT16), + 1, // CUDA device + 0); + ASSERT_NE(tensor_bf16_3d, nullptr); + + // Verify tensor properties + EXPECT_EQ(tensor_bf16_3d->dim(), 3); + EXPECT_EQ(tensor_bf16_3d->size(0), 2); + EXPECT_EQ(tensor_bf16_3d->size(1), 3); + EXPECT_EQ(tensor_bf16_3d->size(2), 4); + EXPECT_EQ(tensor_bf16_3d->element_size(), 2); + + // Verify memory size (2 * 3 * 4 * 2 bytes = 48 bytes) + size_t expected_memory = 2 * 3 * 4 * 2; + size_t actual_memory = + tensor_bf16_3d->numel() * tensor_bf16_3d->element_size(); + EXPECT_EQ(actual_memory, expected_memory); + + // Delete the 3D bfloat16 tensor + error = aoti_torch_delete_tensor_object(tensor_bf16_3d); + EXPECT_EQ(error, Error::Ok); + + // Test bfloat16 scalar tensor (0D) deletion + std::vector scalar_sizes = {}; + Tensor* tensor_bf16_scalar = create_test_tensor( + scalar_sizes, + {}, + static_cast(SupportedDTypes::BFLOAT16), + 1, // CUDA device + 0); + ASSERT_NE(tensor_bf16_scalar, nullptr); + + // Verify scalar tensor properties + EXPECT_EQ(tensor_bf16_scalar->dim(), 0); + EXPECT_EQ(tensor_bf16_scalar->numel(), 1); + EXPECT_EQ(tensor_bf16_scalar->element_size(), 2); + + // Delete the scalar bfloat16 tensor + error = aoti_torch_delete_tensor_object(tensor_bf16_scalar); + EXPECT_EQ(error, Error::Ok); + + // Test zero-element bfloat16 tensor deletion + std::vector zero_sizes = {0, 5}; + Tensor* tensor_bf16_zero = create_test_tensor( + zero_sizes, + {}, + static_cast(SupportedDTypes::BFLOAT16), + 1, // CUDA device + 0); + ASSERT_NE(tensor_bf16_zero, nullptr); + + // Verify zero-element tensor properties + EXPECT_EQ(tensor_bf16_zero->dim(), 2); + EXPECT_EQ(tensor_bf16_zero->size(0), 0); + EXPECT_EQ(tensor_bf16_zero->size(1), 5); + EXPECT_EQ(tensor_bf16_zero->numel(), 0); + EXPECT_EQ(tensor_bf16_zero->element_size(), 2); + + // Delete the zero-element bfloat16 tensor + error = aoti_torch_delete_tensor_object(tensor_bf16_zero); + EXPECT_EQ(error, Error::Ok); +} + +// Test deletion of mixed dtype tensors (float32 and bfloat16) From 56b6ac6ff7762a42d6e70b8a0b30c2d584a803c5 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Tue, 30 Sep 2025 18:40:31 +0100 Subject: [PATCH 2/2] aoti_torch_create_tensor_from_blob_v2 (#14687) Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom): * #14690 * #14689 * #14688 * __->__ #14687 * #14686 Summary: This function introduce aoti_torch_create_tensor_from_blob_v2, a function that create tensor from data blob and custom stride and size. 
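A rough usage sketch follows (illustrative only; the include paths, the helper name, and the cudaMallocManaged allocation are assumptions mirroring the unit tests in this patch):

  #include <cuda_runtime.h>
  // assumed paths for the shim headers
  #include <executorch/backends/cuda/runtime/shims/memory.h>
  #include <executorch/backends/cuda/runtime/shims/utils.h>

  using namespace executorch::backends::aoti;
  using namespace executorch::backends::cuda;
  using executorch::runtime::Error;

  // Hypothetical caller: wraps an existing CUDA buffer without copying it.
  AOTITorchError example_wrap_blob() {
    const int64_t sizes[2] = {2, 3};
    const int64_t strides[2] = {3, 1};  // contiguous row-major
    void* blob = nullptr;
    cudaMallocManaged(&blob, 2 * 3 * sizeof(float));  // caller owns this

    Tensor* t = nullptr;
    AOTITorchError err = aoti_torch_create_tensor_from_blob_v2(
        blob, /*ndim=*/2, sizes, strides,
        /*storage_offset=*/0,
        static_cast<int32_t>(SupportedDTypes::FLOAT32),
        static_cast<int32_t>(SupportedDevices::CUDA),
        /*device_index=*/0, &t,
        /*layout=*/0, /*opaque_metadata=*/nullptr,
        /*opaque_metadata_size=*/0);
    if (err != Error::Ok) {
      cudaFree(blob);
      return err;
    }

    // ... use t ...

    aoti_torch_delete_tensor_object(t);  // frees the wrapper only
    cudaFree(blob);                      // blob is still owned by the caller
    return Error::Ok;
  }
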
Worth to notice that unlike aoti_torch_empty_strided, the tensor created by aoti_torch_create_tensor_from_blob_v2 will not have the control of the memory blob. Therefore when we delete it, the memory will not be freed. Reviewed By: Differential Revision: --- backends/aoti/utils.h | 17 + backends/cuda/runtime/shims/memory.cpp | 187 ++++- backends/cuda/runtime/shims/memory.h | 39 +- backends/cuda/runtime/shims/tests/targets.bzl | 1 + ..._aoti_torch_create_tensor_from_blob_v2.cpp | 754 ++++++++++++++++++ backends/cuda/runtime/shims/utils.h | 20 + 6 files changed, 981 insertions(+), 37 deletions(-) create mode 100644 backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp diff --git a/backends/aoti/utils.h b/backends/aoti/utils.h index 22734935df2..1c872e08648 100644 --- a/backends/aoti/utils.h +++ b/backends/aoti/utils.h @@ -73,6 +73,23 @@ inline AOTITorchError validate_storage_offset(int64_t storage_offset) { return Error::Ok; } +// Check if tensor is in contiguous memory format (NCHW for 4D tensors) +// Contiguous format means strides decrease from left to right: +// For NCHW: strides = [C*H*W, H*W, W, 1] +inline bool is_tensor_contiguous( + int64_t ndim, + const int64_t* sizes, + const int64_t* strides) { + int64_t expected_stride = 1; + for (int64_t i = ndim - 1; i >= 0; i--) { + if (strides[i] != expected_stride) { + return false; + } + expected_stride *= sizes[i]; + } + return true; +} + } // extern "C" } // namespace aoti diff --git a/backends/cuda/runtime/shims/memory.cpp b/backends/cuda/runtime/shims/memory.cpp index 12a1d59e5e1..94f589aece6 100644 --- a/backends/cuda/runtime/shims/memory.cpp +++ b/backends/cuda/runtime/shims/memory.cpp @@ -15,29 +15,10 @@ #include #include // For posix_memalign #include +#include #include #include -// CUDA error checking macro -#define ET_CUDA_CHECK_OR_RETURN_ERROR(EXPR) \ - do { \ - const cudaError_t err = EXPR; \ - if (err == cudaSuccess) { \ - break; \ - } \ - ET_LOG( \ - Error, \ - "%s:%d CUDA error: %s", \ - __FILE__, \ - __LINE__, \ - cudaGetErrorString(err)); \ - return Error::Internal; \ - } while (0) - -// Kernel launch check macro -#define ET_CUDA_KERNEL_LAUNCH_CHECK_OR_RETURN_ERROR() \ - ET_CUDA_CHECK_OR_RETURN_ERROR(cudaGetLastError()) - namespace executorch { namespace backends { namespace cuda { @@ -46,12 +27,122 @@ using executorch::aten::SizesType; using executorch::aten::StridesType; using executorch::backends::aoti::dtype_to_element_size; using executorch::backends::aoti::dtype_to_scalar_type; +using executorch::backends::aoti::validate_storage_offset; // Global storage for tensors and their metadata std::unordered_set> tensors; +// Reference counting for memory addresses +// Maps memory address to number of tensors using it +// Special value: NOT_OWN (-1) means tensor never owns the memory +constexpr int32_t NOT_OWN = -1; +std::unordered_map memory_to_n_tensor; + extern "C" { +AOTITorchError aoti_torch_create_tensor_from_blob_v2( + void* data, + int64_t ndim, + const int64_t* sizes_ptr, + const int64_t* strides_ptr, + int64_t storage_offset, + int32_t dtype, + int32_t device_type, + int32_t device_index, + Tensor** ret_new_tensor, + int32_t layout, + const uint8_t* opaque_metadata, + int64_t opaque_metadata_size) { + // TODO(gasoonjia): verify given data is on the target device + (void)device_type; + (void)opaque_metadata; + (void)layout; + (void)opaque_metadata_size; + + // Validate input parameters first + if (data == nullptr) { + ET_LOG( + Error, + "aoti_torch_create_tensor_from_blob_v2 failed: data 
pointer is null"); + return Error::InvalidArgument; + } + + if (sizes_ptr == nullptr && ndim > 0) { + ET_LOG( + Error, + "aoti_torch_create_tensor_from_blob_v2 failed: sizes_ptr is null"); + return Error::InvalidArgument; + } + + if (ret_new_tensor == nullptr) { + ET_LOG( + Error, + "aoti_torch_create_tensor_from_blob_v2 failed: ret_new_tensor is null"); + return Error::InvalidArgument; + } + + // Check that device_index is always 0 + if (device_index != 0) { + ET_LOG(Error, "device_index must be 0, got: %d", device_index); + return Error::InvalidArgument; + } + + // Validate dtype using SupportedDTypes from utils.h + AOTITorchError dtype_error = validate_dtype(dtype); + if (dtype_error != Error::Ok) { + return dtype_error; + } + + // Storage offset must be 0 since from_blob cannot handle different offsets + AOTITorchError storage_offset_error = validate_storage_offset(storage_offset); + if (storage_offset_error != Error::Ok) { + return storage_offset_error; + } + + // Convert sizes to the format expected by ExecutorTorch using SizesType + std::vector sizes = + convert_sizes_to_vector(ndim, sizes_ptr); + + // Convert strides using the common helper function with StridesType + std::vector strides = + convert_strides_to_vector(ndim, sizes_ptr, strides_ptr); + + // Create ExecutorTorch tensor that wraps the existing memory + // Note: We're NOT copying the data, just wrapping it + auto tensor = executorch::extension::from_blob( + data, // existing memory (don't copy!) + sizes, // tensor dimensions + strides, // tensor strides (allows different strides) + dtype_to_scalar_type(dtype) // map int32_t dtype to ScalarType + ); + + if (!tensor) { + ET_LOG(Error, "Failed to create tensor from blob"); + return Error::InvalidArgument; + } + + // Store the tensor so it doesn't get destroyed + tensors.insert(tensor); + + *ret_new_tensor = tensor.get(); + + // Check if this memory address is already being tracked + auto memory_it = memory_to_n_tensor.find(data); + if (memory_it != memory_to_n_tensor.end()) { + ET_LOG( + Error, + "Memory address %p is already being tracked by another tensor", + data); + return Error::InvalidArgument; + } + + // Mark this memory as NOT_OWN since tensor created from blob never owns + // memory + memory_to_n_tensor[data] = NOT_OWN; + + return Error::Ok; +} + AOTITorchError aoti_torch_empty_strided( int64_t ndim, const int64_t* sizes_ptr, @@ -120,6 +211,9 @@ AOTITorchError aoti_torch_empty_strided( tensors.insert(tensor); *ret_new_tensor = tensor.get(); + // This tensor owns the memory it allocated, set reference count to 1 + memory_to_n_tensor[ptr] = 1; + return Error::Ok; } @@ -164,26 +258,47 @@ AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor) { if (it->get() == tensor) { // Get the tensor before erasing auto tensor_ptr = *it; - void* data_ptr = tensor_ptr->mutable_data_ptr(); - // Determine if it's GPU memory - cudaPointerAttributes attributes{}; - ET_CUDA_CHECK_OR_RETURN_ERROR( - cudaPointerGetAttributes(&attributes, data_ptr)); - - // et tensor does not own data; need to free them manually. 
- if (attributes.type == cudaMemoryTypeManaged) { - // This is CUDA managed memory - free with proper synchronization - ET_CUDA_CHECK_OR_RETURN_ERROR( - cudaDeviceSynchronize()); // Wait for all operations to complete - // BEFORE freeing - ET_CUDA_CHECK_OR_RETURN_ERROR(cudaFree(data_ptr)); + // Find the reference count for this memory address + auto memory_it = memory_to_n_tensor.find(data_ptr); + if (memory_it != memory_to_n_tensor.end()) { + int32_t ref_count = memory_it->second; + + if (ref_count == NOT_OWN) { + // Tensor never owned the memory, skip freeing + // Just remove tensor from tracking + tensors.erase(it); + return Error::Ok; + } else if (ref_count == 1) { + // Only current tensor using this memory, free it + // Determine if it's GPU memory + cudaPointerAttributes attributes{}; + ET_CUDA_CHECK_OR_RETURN_ERROR( + cudaPointerGetAttributes(&attributes, data_ptr)); + + if (attributes.type == cudaMemoryTypeManaged) { + // This is CUDA managed memory - free with proper synchronization + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaDeviceSynchronize()); + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaFree(data_ptr)); + } else { + // This is CPU memory - free immediately + free(data_ptr); + data_ptr = nullptr; + } + + // Remove from memory tracking + memory_to_n_tensor.erase(memory_it); + } else if (ref_count > 1) { + // Other tensors still using this memory, just decrement count + memory_to_n_tensor[data_ptr] = ref_count - 1; + } } else { - // This is CPU memory - free immediately - free(data_ptr); + ET_LOG(Error, "Internal error: memory not found during deletion"); + return Error::Internal; } - // Remove from set (this will call the destructor if it's the last + + // Remove tensor from set (this will call the destructor if it's the last // reference) tensors.erase(it); return Error::Ok; diff --git a/backends/cuda/runtime/shims/memory.h b/backends/cuda/runtime/shims/memory.h index 93bd9c30e70..7f4c56a8000 100644 --- a/backends/cuda/runtime/shims/memory.h +++ b/backends/cuda/runtime/shims/memory.h @@ -21,6 +21,44 @@ using executorch::backends::aoti::Tensor; extern "C" { +/** + * Creates a tensor object from an existing memory blob without copying the + * data. The tensor will wrap the provided memory and will not take ownership of + * it. When the tensor is deleted, the original memory will remain valid and + * must be freed by the caller. 
+ * + * @param data Pointer to the memory blob to wrap (must not be null) + * @param ndim Number of dimensions in the tensor + * @param sizes_ptr Pointer to array of dimension sizes (using SizesType) + * @param strides_ptr Pointer to array of strides for each dimension (using + * StridesType, can be null for contiguous) + * @param storage_offset Storage offset (must be 0 for current implementation) + * @param dtype Data type identifier (supports FLOAT32 and BFLOAT16 from + * SupportedDTypes) + * @param device_type Device type (CPU=0, CUDA=1 from SupportedDevices) + * @param device_index Device index (must be 0 for current implementation) + * @param ret_new_tensor Output parameter for the created tensor (must not be + * null) + * @param layout Tensor layout identifier (0=strided) + * @param opaque_metadata Optional metadata pointer (can be null) + * @param opaque_metadata_size Size of opaque metadata in bytes + * @return AOTITorchError error code (Error::Ok on success, or an error code on + * failure) + */ +AOTITorchError aoti_torch_create_tensor_from_blob_v2( + void* data, + int64_t ndim, + const int64_t* sizes_ptr, + const int64_t* strides_ptr, + int64_t storage_offset, + int32_t dtype, + int32_t device_type, + int32_t device_index, + Tensor** ret_new_tensor, + int32_t layout, + const uint8_t* opaque_metadata, + int64_t opaque_metadata_size); + /** * Creates an uninitialized tensor with specified dimensions, strides, and * dtyper on either CPU or CUDA device. @@ -55,7 +93,6 @@ AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor); // Function to clear all tensors from internal storage void clear_all_tensors(); - } // extern "C" } // namespace cuda diff --git a/backends/cuda/runtime/shims/tests/targets.bzl b/backends/cuda/runtime/shims/tests/targets.bzl index 1db52ce1b97..dce7d0be39c 100644 --- a/backends/cuda/runtime/shims/tests/targets.bzl +++ b/backends/cuda/runtime/shims/tests/targets.bzl @@ -29,3 +29,4 @@ def define_common_targets(): """ cuda_shim_cpp_unittest("aoti_torch_empty_strided") cuda_shim_cpp_unittest("aoti_torch_delete_tensor_object") + cuda_shim_cpp_unittest("aoti_torch_create_tensor_from_blob_v2") diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp new file mode 100644 index 00000000000..2cb12719782 --- /dev/null +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp @@ -0,0 +1,754 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace executorch::backends::aoti; +using namespace executorch::backends::cuda; +using namespace executorch::runtime; +using executorch::runtime::etensor::Tensor; + +// Test fixture for aoti_torch_create_tensor_from_blob_v2 tests +class AOTITorchCreateTensorFromBlobV2Test : public ::testing::Test { + protected: + void SetUp() override { + // Initialize ExecuTorch Platform Abstraction Layer + et_pal_init(); + + // Check if CUDA is available + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + if (err != cudaSuccess || device_count == 0) { + GTEST_SKIP() << "CUDA not available, skipping CUDA tests"; + } + + // Clean up any existing cached metadata before each test + cleanup_tensor_metadata(); + + // Clear any remaining tensors from previous tests + clear_all_tensors(); + } + + void TearDown() override { + // Clean up metadata + cleanup_tensor_metadata(); + + // Clear the global tensor storage using the provided function + clear_all_tensors(); + + // Clean up any allocated memory buffers + for (void* ptr : cuda_memory_buffers_) { + if (ptr) { + cudaError_t cuda_err = cudaFree(ptr); + EXPECT_EQ(cuda_err, cudaSuccess) + << "Failed to free CUDA memory: " << cudaGetErrorString(cuda_err); + } + } + cuda_memory_buffers_.clear(); + + for (void* ptr : cpu_memory_buffers_) { + if (ptr) { + free(ptr); + } + } + cpu_memory_buffers_.clear(); + } + + // Helper to allocate CUDA memory and track it for cleanup + void* allocate_cuda_memory(size_t bytes) { + void* ptr; + cudaError_t err = cudaMallocManaged(&ptr, bytes); + if (err == cudaSuccess) { + cuda_memory_buffers_.push_back(ptr); + return ptr; + } + return nullptr; + } + + // Helper to allocate CPU memory and track it for cleanup + void* allocate_cpu_memory(size_t bytes) { + void* ptr; + int result = posix_memalign(&ptr, 16, bytes); // 16-byte aligned + if (result == 0 && ptr != nullptr) { + cpu_memory_buffers_.push_back(ptr); + return ptr; + } + return nullptr; + } + + // Helper to calculate number of elements from sizes + int64_t calculate_numel(const std::vector& sizes) { + int64_t numel = 1; + for (int64_t size : sizes) { + numel *= size; + } + return numel; + } + + // Helper to calculate contiguous strides from sizes + std::vector calculate_contiguous_strides( + const std::vector& sizes) { + std::vector strides(sizes.size()); + if (sizes.empty()) { + return strides; + } + + strides[sizes.size() - 1] = 1; + // Use int64_t and check for underflow to avoid unsigned integer wraparound + for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { + strides[i] = strides[i + 1] * sizes[i + 1]; + } + return strides; + } + + private: + std::vector cuda_memory_buffers_; + std::vector cpu_memory_buffers_; +}; + +// Test basic functionality with CUDA memory +TEST_F(AOTITorchCreateTensorFromBlobV2Test, BasicFunctionalityCUDA) { + // Test 1D tensor + std::vector sizes_1d = {5}; + std::vector strides_1d = calculate_contiguous_strides(sizes_1d); + + // Allocate CUDA memory + size_t bytes = calculate_numel(sizes_1d) * sizeof(float); + void* cuda_data = allocate_cuda_memory(bytes); + ASSERT_NE(cuda_data, nullptr); + + Tensor* tensor_1d; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + cuda_data, + sizes_1d.size(), + sizes_1d.data(), + strides_1d.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor_1d, + 0, // layout 
(strided) + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor_1d, nullptr); + + // Check tensor properties + EXPECT_EQ(tensor_1d->dim(), 1); + EXPECT_EQ(tensor_1d->size(0), 5); + + // Verify the tensor uses the same data pointer + void* tensor_data = tensor_1d->mutable_data_ptr(); + EXPECT_EQ(tensor_data, cuda_data); + + // Delete the tensor - this should NOT free the original memory + error = aoti_torch_delete_tensor_object(tensor_1d); + EXPECT_EQ(error, Error::Ok); + + // Test that the original memory is still accessible (proves tensor didn't own + // it) For CUDA memory, check that we can still access it (synchronously) + // after tensor deletion + float pattern_value = 42.0f; + cudaError_t cuda_err = cudaMemcpy( + cuda_data, &pattern_value, sizeof(float), cudaMemcpyHostToDevice); + EXPECT_EQ(cuda_err, cudaSuccess) + << "Should be able to write to original CUDA memory after tensor deletion"; + + float readback_value = 0.0f; + cuda_err = cudaMemcpy( + &readback_value, cuda_data, sizeof(float), cudaMemcpyDeviceToHost); + EXPECT_EQ(cuda_err, cudaSuccess) + << "Should be able to read from original CUDA memory after tensor deletion"; + EXPECT_EQ(readback_value, pattern_value) + << "Original CUDA memory should still contain our test pattern"; +} + +// Test basic functionality with CPU memory +TEST_F(AOTITorchCreateTensorFromBlobV2Test, BasicFunctionalityCPU) { + // Test 2D tensor + std::vector sizes_2d = {3, 4}; + std::vector strides_2d = calculate_contiguous_strides(sizes_2d); + + // Allocate CPU memory + size_t bytes = calculate_numel(sizes_2d) * sizeof(float); + void* cpu_data = allocate_cpu_memory(bytes); + ASSERT_NE(cpu_data, nullptr); + + Tensor* tensor_2d; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + cpu_data, + sizes_2d.size(), + sizes_2d.data(), + strides_2d.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CPU), + 0, // device index + &tensor_2d, + 0, // layout (strided) + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor_2d, nullptr); + + // Check tensor properties + EXPECT_EQ(tensor_2d->dim(), 2); + EXPECT_EQ(tensor_2d->size(0), 3); + EXPECT_EQ(tensor_2d->size(1), 4); + + // Verify the tensor uses the same data pointer + void* tensor_data = tensor_2d->mutable_data_ptr(); + EXPECT_EQ(tensor_data, cpu_data); + + // Delete the tensor - this should NOT free the original memory + error = aoti_torch_delete_tensor_object(tensor_2d); + EXPECT_EQ(error, Error::Ok); + + // Test that the original memory is still accessible (proves tensor didn't own + // it) For CPU memory, directly write and read to verify accessibility + float* float_ptr = reinterpret_cast(cpu_data); + float pattern_value = 42.0f; + *float_ptr = pattern_value; + EXPECT_EQ(*float_ptr, pattern_value) + << "Original CPU memory should still be accessible after tensor deletion"; +} + +// Test with invalid dtype +TEST_F(AOTITorchCreateTensorFromBlobV2Test, InvalidDtype) { + std::vector sizes = {2, 3}; + std::vector strides = calculate_contiguous_strides(sizes); + + size_t bytes = calculate_numel(sizes) * sizeof(float); + void* data = allocate_cuda_memory(bytes); + ASSERT_NE(data, nullptr); + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + 999, // invalid dtype + static_cast(SupportedDevices::CUDA), + 0, // device index + 
&tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::InvalidArgument); +} + +// Test with non-zero storage offset (should fail since from_blob cannot handle +// offsets) +TEST_F(AOTITorchCreateTensorFromBlobV2Test, NonZeroStorageOffset) { + std::vector sizes = {2, 3}; + std::vector strides = calculate_contiguous_strides(sizes); + + size_t bytes = calculate_numel(sizes) * sizeof(float); + void* data = allocate_cuda_memory(bytes); + ASSERT_NE(data, nullptr); + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 1, // non-zero storage_offset (should fail since from_blob cannot handle + // offsets) + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::InvalidArgument); +} + +// Test with custom strides (using stride parameter but still contiguous) +TEST_F(AOTITorchCreateTensorFromBlobV2Test, CustomContiguousStrides) { + std::vector sizes = {2, 3}; + // Use the correct contiguous strides but pass them explicitly + std::vector contiguous_strides = {3, 1}; // Proper contiguous strides + + size_t bytes = calculate_numel(sizes) * sizeof(float); + void* data = allocate_cuda_memory(bytes); + ASSERT_NE(data, nullptr); + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + contiguous_strides.data(), // Explicitly pass contiguous strides + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, nullptr); + + // Check tensor properties + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 3); + + // Verify the tensor uses the same data pointer + void* tensor_data = tensor->mutable_data_ptr(); + EXPECT_EQ(tensor_data, data); + + // Verify strides were properly set (we can check via aoti_torch_get_strides) + int64_t* tensor_strides; + error = aoti_torch_get_strides(tensor, &tensor_strides); + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(tensor_strides[0], 3); + EXPECT_EQ(tensor_strides[1], 1); + + // Delete the tensor - this should NOT free the original memory + error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); + + // Test that the original memory is still accessible (proves tensor didn't own + // it) + float pattern_value = 42.0f; + cudaError_t cuda_err = + cudaMemcpy(data, &pattern_value, sizeof(float), cudaMemcpyHostToDevice); + EXPECT_EQ(cuda_err, cudaSuccess) + << "Should be able to write to original CUDA memory after tensor deletion"; + + float readback_value = 0.0f; + cuda_err = + cudaMemcpy(&readback_value, data, sizeof(float), cudaMemcpyDeviceToHost); + EXPECT_EQ(cuda_err, cudaSuccess) + << "Should be able to read from original CUDA memory after tensor deletion"; + EXPECT_EQ(readback_value, pattern_value) + << "Original CUDA memory should still contain our test pattern"; +} + +// Test with null data pointer +TEST_F(AOTITorchCreateTensorFromBlobV2Test, NullDataPointer) { + std::vector sizes = {2, 3}; + std::vector strides = calculate_contiguous_strides(sizes); + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + nullptr, // 
null data pointer + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::InvalidArgument); +} + +// Test scalar tensor (0D) +TEST_F(AOTITorchCreateTensorFromBlobV2Test, ScalarTensor) { + std::vector sizes = {}; // 0D tensor + std::vector strides = {}; // Empty strides for scalar + + size_t bytes = sizeof(float); // Single element + void* data = allocate_cuda_memory(bytes); + ASSERT_NE(data, nullptr); + + Tensor* tensor = nullptr; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(tensor, nullptr); + + // Check tensor properties + EXPECT_EQ(tensor->dim(), 0); + + // Verify the tensor uses the same data pointer + void* tensor_data = tensor->mutable_data_ptr(); + EXPECT_EQ(tensor_data, data); + + // Delete the tensor - this should NOT free the original memory + error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); + + // Test that the original memory is still accessible (proves tensor didn't own + // it) + float pattern_value = 42.0f; + cudaError_t cuda_err = + cudaMemcpy(data, &pattern_value, sizeof(float), cudaMemcpyHostToDevice); + EXPECT_EQ(cuda_err, cudaSuccess) + << "Should be able to write to original CUDA memory after tensor deletion"; + + float readback_value = 0.0f; + cuda_err = + cudaMemcpy(&readback_value, data, sizeof(float), cudaMemcpyDeviceToHost); + EXPECT_EQ(cuda_err, cudaSuccess) + << "Should be able to read from original CUDA memory after tensor deletion"; + EXPECT_EQ(readback_value, pattern_value) + << "Original CUDA memory should still contain our test pattern"; +} + +// Test zero-sized tensor +TEST_F(AOTITorchCreateTensorFromBlobV2Test, ZeroSizedTensor) { + std::vector sizes = {0, 5}; // Zero elements + std::vector strides = calculate_contiguous_strides(sizes); + + // Even for zero-sized tensor, we need some memory allocated + size_t bytes = sizeof(float); // Minimum allocation + void* data = allocate_cuda_memory(bytes); + ASSERT_NE(data, nullptr); + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, nullptr); + + // Check tensor properties + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 0); + EXPECT_EQ(tensor->size(1), 5); + + // Verify the tensor uses the same data pointer + void* tensor_data = tensor->mutable_data_ptr(); + EXPECT_EQ(tensor_data, data); + + // Delete the tensor - this should NOT free the original memory + error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); + + // Test that the original memory is still accessible (proves tensor didn't own + // it) + float pattern_value = 42.0f; + cudaError_t cuda_err = + cudaMemcpy(data, &pattern_value, sizeof(float), cudaMemcpyHostToDevice); + 
EXPECT_EQ(cuda_err, cudaSuccess) + << "Should be able to write to original CUDA memory after tensor deletion"; + + float readback_value = 0.0f; + cuda_err = + cudaMemcpy(&readback_value, data, sizeof(float), cudaMemcpyDeviceToHost); + EXPECT_EQ(cuda_err, cudaSuccess) + << "Should be able to read from original CUDA memory after tensor deletion"; + EXPECT_EQ(readback_value, pattern_value) + << "Original CUDA memory should still contain our test pattern"; +} + +// Test multi-dimensional tensors +TEST_F(AOTITorchCreateTensorFromBlobV2Test, MultiDimensionalTensors) { + // Test 3D tensor + std::vector sizes_3d = {2, 3, 4}; + std::vector strides_3d = calculate_contiguous_strides(sizes_3d); + + size_t bytes_3d = calculate_numel(sizes_3d) * sizeof(float); + void* data_3d = allocate_cuda_memory(bytes_3d); + ASSERT_NE(data_3d, nullptr); + + Tensor* tensor_3d; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data_3d, + sizes_3d.size(), + sizes_3d.data(), + strides_3d.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor_3d, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor_3d, nullptr); + EXPECT_EQ(tensor_3d->dim(), 3); + EXPECT_EQ(tensor_3d->size(0), 2); + EXPECT_EQ(tensor_3d->size(1), 3); + EXPECT_EQ(tensor_3d->size(2), 4); + + // Test 4D tensor + std::vector sizes_4d = {2, 3, 4, 5}; + std::vector strides_4d = calculate_contiguous_strides(sizes_4d); + + size_t bytes_4d = calculate_numel(sizes_4d) * sizeof(float); + void* data_4d = allocate_cuda_memory(bytes_4d); + ASSERT_NE(data_4d, nullptr); + + Tensor* tensor_4d; + error = aoti_torch_create_tensor_from_blob_v2( + data_4d, + sizes_4d.size(), + sizes_4d.data(), + strides_4d.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor_4d, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor_4d, nullptr); + EXPECT_EQ(tensor_4d->dim(), 4); + EXPECT_EQ(tensor_4d->size(0), 2); + EXPECT_EQ(tensor_4d->size(1), 3); + EXPECT_EQ(tensor_4d->size(2), 4); + EXPECT_EQ(tensor_4d->size(3), 5); +} + +// Test tensor data pointer consistency +TEST_F(AOTITorchCreateTensorFromBlobV2Test, DataPointerConsistency) { + std::vector sizes = {2, 3}; + std::vector strides = calculate_contiguous_strides(sizes); + + size_t bytes = calculate_numel(sizes) * sizeof(float); + void* original_data = allocate_cuda_memory(bytes); + ASSERT_NE(original_data, nullptr); + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + original_data, + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, nullptr); + + // Check that the tensor uses the same data pointer + void* tensor_data = tensor->mutable_data_ptr(); + EXPECT_EQ(tensor_data, original_data); +} + +// Test creating multiple tensors from different blobs +TEST_F(AOTITorchCreateTensorFromBlobV2Test, MultipleTensorsFromBlobs) { + const int num_tensors = 5; + std::vector tensors; + std::vector data_ptrs; + + for (int i = 0; i < num_tensors; i++) { + std::vector sizes = {i + 1, i + 2}; + std::vector strides = 
calculate_contiguous_strides(sizes); + + size_t bytes = calculate_numel(sizes) * sizeof(float); + void* data = allocate_cuda_memory(bytes); + ASSERT_NE(data, nullptr); + data_ptrs.push_back(data); + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, nullptr); + tensors.push_back(tensor); + + // Verify dimensions + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), i + 1); + EXPECT_EQ(tensor->size(1), i + 2); + + // Verify the tensor uses the correct data pointer + EXPECT_EQ(tensor->mutable_data_ptr(), data); + } + + // Verify all tensors have different data pointers + for (int i = 0; i < num_tensors; i++) { + EXPECT_EQ(tensors[i]->mutable_data_ptr(), data_ptrs[i]); + for (int j = i + 1; j < num_tensors; j++) { + EXPECT_NE(tensors[i]->mutable_data_ptr(), tensors[j]->mutable_data_ptr()); + } + } +} + +// Test deletion of tensor created from blob (should not free the original +// memory) +TEST_F(AOTITorchCreateTensorFromBlobV2Test, DeletionDoesNotFreeOriginalMemory) { + std::vector sizes = {2, 3}; + std::vector strides = calculate_contiguous_strides(sizes); + + size_t bytes = calculate_numel(sizes) * sizeof(float); + void* data = allocate_cuda_memory(bytes); + ASSERT_NE(data, nullptr); + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, nullptr); + + // Delete the tensor - this should NOT free the original memory + error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); + + // The original memory should still be valid (we'll free it in teardown) + // We can't easily test if the memory is still valid without risking crashes, + // but the test should pass without issues if memory management is correct +} + +// Test with opaque metadata +TEST_F(AOTITorchCreateTensorFromBlobV2Test, WithOpaqueMetadata) { + std::vector sizes = {2, 3}; + std::vector strides = calculate_contiguous_strides(sizes); + + size_t bytes = calculate_numel(sizes) * sizeof(float); + void* data = allocate_cuda_memory(bytes); + ASSERT_NE(data, nullptr); + + // Create some opaque metadata + std::vector metadata = {0x01, 0x02, 0x03, 0x04}; + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + metadata.data(), // opaque_metadata + metadata.size()); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, nullptr); + + // Check tensor properties + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 3); +} + +// Test stress test with many small tensors from blobs +TEST_F(AOTITorchCreateTensorFromBlobV2Test, StressTestManySmallTensors) { + const int num_tensors = 50; // Reduced for reasonable test time + std::vector 
tensors; + + for (int i = 0; i < num_tensors; i++) { + std::vector sizes = {1, 1}; // Minimal size + std::vector strides = calculate_contiguous_strides(sizes); + + size_t bytes = calculate_numel(sizes) * sizeof(float); + void* data = allocate_cuda_memory(bytes); + if (data == nullptr) { + // Skip if we run out of memory + continue; + } + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + if (error == Error::Ok && tensor != nullptr) { + tensors.push_back(tensor); + + // Verify the tensor uses the correct data pointer + EXPECT_EQ(tensor->mutable_data_ptr(), data); + } + } + + // Delete all created tensors + for (Tensor* tensor : tensors) { + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); + } +} diff --git a/backends/cuda/runtime/shims/utils.h b/backends/cuda/runtime/shims/utils.h index 23943391b50..38e56ca45a1 100644 --- a/backends/cuda/runtime/shims/utils.h +++ b/backends/cuda/runtime/shims/utils.h @@ -14,6 +14,26 @@ #include #include +// CUDA error checking macro +#define ET_CUDA_CHECK_OR_RETURN_ERROR(EXPR) \ + do { \ + const cudaError_t err = EXPR; \ + if (err == cudaSuccess) { \ + break; \ + } \ + ET_LOG( \ + Error, \ + "%s:%d CUDA error: %s", \ + __FILE__, \ + __LINE__, \ + cudaGetErrorString(err)); \ + return Error::Internal; \ + } while (0) + +// Kernel launch check macro +#define ET_CUDA_KERNEL_LAUNCH_CHECK_OR_RETURN_ERROR() \ + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaGetLastError()) + namespace executorch { namespace backends { namespace cuda {
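
For reference, a minimal sketch (hypothetical helper, not part of this patch) of how ET_CUDA_CHECK_OR_RETURN_ERROR is meant to be used inside a shim returning AOTITorchError, matching its use in memory.cpp above:

  AOTITorchError example_sync_and_free(void* ptr) {
    // Any CUDA failure is logged with file/line and surfaced as Error::Internal.
    ET_CUDA_CHECK_OR_RETURN_ERROR(cudaDeviceSynchronize());
    ET_CUDA_CHECK_OR_RETURN_ERROR(cudaFree(ptr));
    return Error::Ok;
  }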