From d8e6d136a953f183551ac8d9b82ca8d15cdac366 Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Tue, 21 Oct 2025 21:15:30 -0700 Subject: [PATCH 1/3] Update [ghstack-poisoned] --- backends/apple/metal/metal_backend.py | 38 ++++++++++++++++--- .../apple/metal/runtime/metal_backend.cpp | 26 +++++++++++++ 2 files changed, 59 insertions(+), 5 deletions(-) diff --git a/backends/apple/metal/metal_backend.py b/backends/apple/metal/metal_backend.py index 13a3534004b..ffc1e6e6d7d 100644 --- a/backends/apple/metal/metal_backend.py +++ b/backends/apple/metal/metal_backend.py @@ -108,8 +108,11 @@ def preprocess( options: dict[str, typing.Any] = { # Do not link against the full PyTorch/libtorch library "aot_inductor.link_libtorch": False, - # Package model constants and other generated files directly in the shared object (.so) file - "aot_inductor.package_constants_in_so": True, + # Separate weight constants from the .so file + "aot_inductor.package": True, + "aot_inductor.package_constants_in_so": False, + # Store weight constants on disk in a binary blob + "aot_inductor.package_constants_on_disk_format": "binary_blob", # Enable maximum automatic tuning for optimal performance "max_autotune": True, # "aot_inductor.debug_compile": True, @@ -117,7 +120,7 @@ def preprocess( } with collect_unsupported_fallback_kernels(): - so_path = torch._inductor.aot_compile(edge_program_module, tuple(user_input_placeholders), options=options) # type: ignore[arg-type] + paths = torch._inductor.aot_compile(edge_program_module, tuple(user_input_placeholders), options=options) # type: ignore[arg-type] if len(missing_fallback_kernels) > 0: formatted_kernels = "\n - ".join(sorted(missing_fallback_kernels)) raise RuntimeError( @@ -125,17 +128,42 @@ def preprocess( "Please add them to the AOTI backend." ) + # Extract the .so and .blob paths from the returned list + so_path = None + blob_path = None + for path in paths: + if path.endswith(".wrapper.so"): + so_path = path + elif path.endswith(".wrapper_weights.blob"): + blob_path = path + + if so_path is None or blob_path is None: + raise RuntimeError( + f"Could not find required files in compiled paths, got {paths}" + ) + # pyre-ignorep[6]: Incompatible parameter type with open(so_path, "rb") as f: so_data = f.read() named_data_store = NamedDataStore() method_name = MetalBackend.method_name_from_compile_specs(compile_specs) + + # Keep the so file in the NamedDataStore, so that it can be packaged into the .pte file. 
+ named_data_store.add_named_data(method_name + "_so_blob", so_data, 1, None) + + # Add weights blob to named data store + with open(blob_path, "rb") as f: + blob_data = f.read() + named_data_store.add_named_data( - method_name + "_so_blob", so_data, 1, "aoti_metal_blob" + method_name + "_weights_blob", blob_data, 1, "aoti_metal_blob" ) - # Clean up the generated so file; it has been packaged into the NamdeDataStore + # Clean up the weights blob file + os.remove(blob_path) + + # Clean up the generated so file; it has been packaged into the NamedDataStore # pyre-ignorep[6]: Incompatible parameter type os.remove(so_path) diff --git a/backends/apple/metal/runtime/metal_backend.cpp b/backends/apple/metal/runtime/metal_backend.cpp index 97b273d428f..f79a2a67b6f 100644 --- a/backends/apple/metal/runtime/metal_backend.cpp +++ b/backends/apple/metal/runtime/metal_backend.cpp @@ -106,6 +106,15 @@ class ET_EXPERIMENTAL MetalBackend final Debug, "MetalBackend::load_function_pointers_into_handle - Loaded AOTInductorModelContainerRun"); + LOAD_SYMBOL( + handle, + update_constants_from_blob, + AOTInductorModelUpdateConstantsFromBlob, + so_handle); + ET_LOG( + Debug, + "MetalBackend::load_function_pointers_into_handle - Loaded AOTInductorModelUpdateConstantsFromBlob"); + ET_LOG( Debug, "MetalBackend::load_function_pointers_into_handle - All symbols loaded successfully"); @@ -203,6 +212,9 @@ class ET_EXPERIMENTAL MetalBackend final outfile.close(); ET_LOG(Info, "MetalBackend::init - File closed successfully"); + // Free the buffer immediately after writing to disk + aoti_metal_buffer->Free(); + // Load the ELF using dlopen void* so_handle = dlopen(so_path.c_str(), RTLD_LAZY | RTLD_LOCAL); ET_CHECK_OR_RETURN_ERROR( @@ -234,6 +246,20 @@ class ET_EXPERIMENTAL MetalBackend final handle->container_handle = container_handle; + // Look into named data map for constant data + std::string weights_blob_key = + method_name.empty() ? "weights_blob" : method_name + "_weights_blob"; + auto buffer_res = named_data_map->get_data(weights_blob_key.c_str()); + if (buffer_res.ok() && handle->update_constants_from_blob != nullptr) { + ET_LOG(Info, "Found %s in named data map", weights_blob_key.c_str()); + const void* weights_blob = buffer_res->data(); + // Feed the weights blob into the container. Under the hood it's copying + // weights, so we should free the buffer immediately. 
+ ET_CHECK_OK_OR_RETURN_ERROR(handle->update_constants_from_blob( + handle->container_handle, static_cast(weights_blob))); + buffer_res->Free(); + } + ET_LOG(Info, "MetalBackend::init - Initialization completed successfully"); return (DelegateHandle*)handle; // Return the handle post-processing } From 00ba52242cdd965c2b263862be67dd72f32a292a Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Tue, 21 Oct 2025 21:15:34 -0700 Subject: [PATCH 2/3] Update [ghstack-poisoned] --- backends/apple/metal/runtime/shims/et_metal.h | 1 + .../apple/metal/runtime/shims/et_metal.mm | 15 ++ .../apple/metal/runtime/shims/et_metal_ops.mm | 16 +- backends/apple/metal/runtime/shims/memory.cpp | 230 ++++++++++++------ backends/apple/metal/runtime/shims/memory.h | 2 +- 5 files changed, 181 insertions(+), 83 deletions(-) diff --git a/backends/apple/metal/runtime/shims/et_metal.h b/backends/apple/metal/runtime/shims/et_metal.h index 75f79e5139c..0e012d18c8f 100644 --- a/backends/apple/metal/runtime/shims/et_metal.h +++ b/backends/apple/metal/runtime/shims/et_metal.h @@ -354,6 +354,7 @@ extern "C" { // Memory management functions for Metal void* metal_allocate_buffer(long bytes); +void metal_deallocate_buffer(void* ptr); bool metal_is_device_pointer(void* ptr); int metal_copy_memory( void* dst, diff --git a/backends/apple/metal/runtime/shims/et_metal.mm b/backends/apple/metal/runtime/shims/et_metal.mm index fdca0a28cf3..cae8f96c0d2 100644 --- a/backends/apple/metal/runtime/shims/et_metal.mm +++ b/backends/apple/metal/runtime/shims/et_metal.mm @@ -86,6 +86,21 @@ void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)()) { } } +void metal_deallocate_buffer(void* ptr) { + @autoreleasepool { + auto it = ptr_to_mtl_buffer.find(ptr); + if (it != ptr_to_mtl_buffer.end()) { + id buffer = it->second; + [buffer release]; + ptr_to_mtl_buffer.erase(it); + ET_LOG(Debug, "Deallocated Metal buffer for pointer %p", ptr); + ptr = nullptr; + } else { + ET_LOG(Error, "Failed to find Metal buffer for pointer %p", ptr); + } + } +} + void metal_cleanup_resources() { if (!ptr_to_mtl_buffer.empty()) { @autoreleasepool { diff --git a/backends/apple/metal/runtime/shims/et_metal_ops.mm b/backends/apple/metal/runtime/shims/et_metal_ops.mm index 0aa90650a1d..94bc3219306 100644 --- a/backends/apple/metal/runtime/shims/et_metal_ops.mm +++ b/backends/apple/metal/runtime/shims/et_metal_ops.mm @@ -736,9 +736,12 @@ AOTITorchError aoti_torch_mps_convolution( throw std::runtime_error("Tensor size mismatch"); } - // Store the tensor handle - mark that we own the memory since we manually allocated it with malloc + // Store the tensor handle - mark that we own the memory since we manually allocated it *ret0 = output_tensor_handle; - is_tensor_own_memory[et_tensor] = true; // We allocated the GPU memory + // Note: memory_to_n_tensor is managed automatically in aoti_torch_create_tensor_from_blob_v2 + // The function sets it to NOT_OWN, but we need to change it to 1 since we allocated it + extern std::unordered_map memory_to_n_tensor; + memory_to_n_tensor[tensor_data] = 1; ET_LOG(Debug, "aoti_torch_mps_convolution: Created output tensor with %zu elements using MPSGraph", actual_numel); @@ -1327,10 +1330,11 @@ AOTITorchError aoti_torch_mps__scaled_dot_product_attention_math_for_mps( } // Mark that we own the memory for these tensors - auto* out_et_tensor = reinterpret_cast(out_tensor_handle); - auto* attn_et_tensor = reinterpret_cast(attn_tensor_handle); - is_tensor_own_memory[out_et_tensor] = true; - is_tensor_own_memory[attn_et_tensor] = 
true;
+  // Note: memory_to_n_tensor is managed automatically in aoti_torch_create_tensor_from_blob_v2
+  // The function sets it to NOT_OWN, but we need to change it to 1 since we allocated it
+  extern std::unordered_map<void*, int32_t> memory_to_n_tensor;
+  memory_to_n_tensor[out_contents_ptr] = 1;
+  memory_to_n_tensor[attn_contents_ptr] = 1;
 
   // Set output tensor handles
   *ret0 = out_tensor_handle;
diff --git a/backends/apple/metal/runtime/shims/memory.cpp b/backends/apple/metal/runtime/shims/memory.cpp
index 83250f308bb..50f8d4e1cd0 100644
--- a/backends/apple/metal/runtime/shims/memory.cpp
+++ b/backends/apple/metal/runtime/shims/memory.cpp
@@ -31,7 +31,12 @@ using namespace executorch::backends::aoti;
 
 // Global storage for tensors and their metadata
 std::unordered_set<std::shared_ptr<Tensor>> tensors;
-std::unordered_map<Tensor*, bool> is_tensor_own_memory;
+
+// Reference counting for memory addresses
+// Maps memory address to number of tensors using it
+// Special value: NOT_OWN (-1) means tensor never owns the memory
+constexpr int32_t NOT_OWN = -1;
+std::unordered_map<void*, int32_t> memory_to_n_tensor;
 
 extern "C" {
 
@@ -110,7 +115,18 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2(
   // Store the tensor so it doesn't get destroyed
   tensors.insert(tensor);
   *ret_new_tensor = tensor.get();
-  is_tensor_own_memory[tensor.get()] = false;
+
+  // Check if this memory address is already being tracked
+  auto memory_it = memory_to_n_tensor.find(adjusted_data);
+  ET_CHECK_OR_RETURN_ERROR(
+      memory_it == memory_to_n_tensor.end(),
+      InvalidArgument,
+      "Memory address %p is already being tracked by another tensor",
+      adjusted_data);
+
+  // Mark this memory as NOT_OWN since tensor created from blob never owns
+  // memory
+  memory_to_n_tensor[adjusted_data] = NOT_OWN;
 
   ET_LOG(Debug, "aoti_torch_create_tensor_from_blob_v2: successfull");
   return Error::Ok;
@@ -192,7 +208,9 @@ AOTITorchError aoti_torch_empty_strided(
   // Store the tensor so it doesn't get destroyed
   tensors.insert(tensor);
   *ret_new_tensor = tensor.get();
-  is_tensor_own_memory[tensor.get()] = true;
+
+  // This tensor owns the memory it allocated, set reference count to 1
+  memory_to_n_tensor[ptr] = 1;
 
   ET_LOG(Debug, "aoti_torch_empty_strided: successfull");
   return Error::Ok;
@@ -200,51 +218,81 @@ AOTITorchError aoti_torch_empty_strided(
 
 AOTITorchError aoti_torch_delete_tensor_object(AOTITensorHandle tensor) {
   ET_LOG(Debug, "aoti_torch_delete_tensor_object: entered");
-  // Find tensor in the set
+
+  // Handle null tensor pointer
+  if (tensor == nullptr) {
+    ET_LOG(Debug, "aoti_torch_delete_tensor_object: null tensor");
+    return Error::Ok;
+  }
+
+  // Check if tensor exists in our tracking
+  bool found_in_tensors = false;
   for (auto it = tensors.begin(); it != tensors.end(); ++it) {
     if (it->get() == tensor) {
-      auto tensor_ptr = *it;
+      found_in_tensors = true;
+      break;
+    }
+  }
 
-      // Check ownership before cleaning up
-      auto ownership_it = is_tensor_own_memory.find(tensor);
-      bool owns_memory = (ownership_it != is_tensor_own_memory.end())
-          ?
ownership_it->second - : false; + // If tensor not found in our tracking, it's invalid + ET_CHECK_OR_RETURN_ERROR( + found_in_tensors, InvalidArgument, "Didn't find tensor %p", tensor); - // Clean up ownership metadata - is_tensor_own_memory.erase(tensor); + // Find and delete the tensor + for (auto it = tensors.begin(); it != tensors.end(); ++it) { + if (it->get() == tensor) { + // Get the tensor before erasing + auto tensor_ptr = *it; + void* data_ptr = tensor_ptr->mutable_data_ptr(); - if (owns_memory) { - // et tensor owns the memory; need to free it manually - void* data_ptr = tensor_ptr->mutable_data_ptr(); + // Find the reference count for this memory address + auto memory_it = memory_to_n_tensor.find(data_ptr); + if (memory_it != memory_to_n_tensor.end()) { + int32_t ref_count = memory_it->second; - // Check if it's Metal GPU memory - if (metal_is_device_pointer(data_ptr)) { - // This is Metal GPU memory - the Metal helper will handle cleanup - // Metal buffers are automatically managed by ARC when the buffer is - // released + if (ref_count == NOT_OWN) { + // Tensor never owned the memory, skip freeing + // Just remove tensor from tracking tensors.erase(it); - ET_LOG( - Debug, - "aoti_torch_delete_tensor_object: successfull (Metal GPU memory)"); + ET_LOG(Debug, "aoti_torch_delete_tensor_object: tensor doesn't own memory, skipping free"); return Error::Ok; + } else if (ref_count == 1) { + // Only current tensor using this memory, free it + // Check if it's Metal GPU memory + if (metal_is_device_pointer(data_ptr)) { + metal_deallocate_buffer(data_ptr); + } else { + // This is CPU memory - free immediately + free(data_ptr); + data_ptr = nullptr; + ET_LOG(Debug, "aoti_torch_delete_tensor_object: freeing CPU memory"); + } + + // Remove from memory tracking + memory_to_n_tensor.erase(memory_it); + } else if (ref_count > 1) { + // Other tensors still using this memory, just decrement count + memory_to_n_tensor[data_ptr] = ref_count - 1; + ET_LOG(Debug, "aoti_torch_delete_tensor_object: decremented ref count from %d to %d", ref_count, ref_count - 1); } - - // This is CPU memory - free immediately - free(data_ptr); + } else { + ET_CHECK_OR_RETURN_ERROR( + false, + Internal, + "Internal error: memory not found during deletion"); } - // else: Don't free memory since the tensor doesn't own it - // Remove from set (this will call the destructor if it's the last + // Remove tensor from set (this will call the destructor if it's the last // reference) tensors.erase(it); - ET_LOG( - Debug, "aoti_torch_delete_tensor_object: successfull (CPU memory)"); + ET_LOG(Debug, "aoti_torch_delete_tensor_object: successfull"); return Error::Ok; } } - ET_LOG(Error, "Didn't find tensor %p", tensor); - return Error::InvalidArgument; + + // This should never be reached since we found it above + ET_CHECK_OR_RETURN_ERROR( + false, Internal, "Internal error: tensor not found after validation"); } AOTITorchError aoti_torch_copy_( @@ -375,6 +423,24 @@ AOTITorchError aoti_torch__reinterpret_tensor( InvalidArgument, "aoti_torch__reinterpret_tensor failed: ret_new_tensor is null"); + // Check if storage_offset is not 0 - return error if not + ET_CHECK_OK_OR_RETURN_ERROR(validate_storage_offset(storage_offset)); + + // Get the device info from the source tensor to perform device_index + // validation + int32_t device_type = 0; + int32_t device_index = 0; + ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_device_type(self, &device_type)); + + ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_device_index(self, &device_index)); + + // 
Ensure device_index is always 0
+  ET_CHECK_OR_RETURN_ERROR(
+      device_index == 0,
+      InvalidArgument,
+      "device_index must be 0, got: %d",
+      device_index);
+
   // Get the dtype from the source tensor
   int32_t dtype = 0;
   ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_dtype(self, &dtype));
@@ -382,54 +448,52 @@ AOTITorchError aoti_torch__reinterpret_tensor(
   // Validate dtype using SupportedDTypes
   ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(dtype));
 
-  int32_t device_type = 0;
-  ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_device_type(self, &device_type));
+  // Get the original data pointer from the source tensor
+  void* data_ptr = self->mutable_data_ptr();
+  ET_CHECK_OR_RETURN_ERROR(
+      data_ptr != nullptr,
+      InvalidArgument,
+      "Source tensor has null data pointer");
 
-  int32_t device_index = 0;
-  ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_device_index(self, &device_index));
+  // Check if the given memory is in the map, if not return error
+  auto memory_it = memory_to_n_tensor.find(data_ptr);
+  ET_CHECK_OR_RETURN_ERROR(
+      memory_it != memory_to_n_tensor.end(),
+      InvalidArgument,
+      "Memory address %p is not being tracked by reference counting system",
+      data_ptr);
+
+  // Convert sizes using utility function from utils.h
+  std::vector<SizesType> sizes = convert_sizes_to_vector(ndim, sizes_ptr);
+
+  // Convert strides using utility function from utils.h
+  std::vector<StridesType> strides =
+      convert_strides_to_vector(ndim, sizes_ptr, strides_ptr);
+
+  // Create new tensor view that reinterprets the same memory with different
+  // shape/strides This creates a view, not a copy - the data pointer is shared
+  std::shared_ptr<Tensor> tensor = executorch::extension::from_blob(
+      data_ptr, // Reuse the same memory from source tensor
+      sizes, // New sizes with explicit SizesType
+      strides, // New strides with explicit StridesType
+      dtype_to_scalar_type(dtype) // Convert dtype with explicit type casting
+  );
 
-  // Get the base data pointer from the source tensor
-  void* base_data_ptr = self->mutable_data_ptr();
   ET_CHECK_OR_RETURN_ERROR(
-      base_data_ptr != nullptr,
+      tensor != nullptr,
       InvalidArgument,
-      "Source tensor has null data pointer");
+      "Failed to create reinterpreted tensor view");
 
-  // Calculate new tensor size in elements for logging
-  int64_t new_numel = 1;
-  for (int64_t i = 0; i < ndim; i++) {
-    new_numel *= sizes_ptr[i];
-  }
+  // Store the tensor so it doesn't get destroyed
+  tensors.insert(tensor);
 
-  ET_LOG(
-      Debug,
-      "aoti_torch__reinterpret_tensor: base_data_ptr=%p, new_numel=%lld, storage_offset=%lld",
-      base_data_ptr,
-      new_numel,
-      storage_offset);
-
-  // Create a new tensor view that shares the same underlying storage
-  // This is the correct way to implement reinterpret_tensor - as a view, not a
-  // copy
-  AOTITorchError create_err = aoti_torch_create_tensor_from_blob_v2(
-      base_data_ptr, // Same underlying data pointer
-      ndim, // New dimensions
-      sizes_ptr, // New sizes
-      strides_ptr, // New strides
-      storage_offset, // Storage offset (will be handled properly now)
-      dtype,
-      device_type,
-      device_index,
-      ret_new_tensor,
-      0, // layout (default)
-      nullptr, // opaque_metadata
-      0 // opaque_metadata_size
-  );
+  *ret_new_tensor = tensor.get();
 
-  if (create_err != Error::Ok) {
-    ET_LOG(Error, "failed to create reinterpreted tensor view");
-    return create_err;
-  }
+  // Increment the reference count for this memory address only if it is owned
+  // by tensor
+  memory_to_n_tensor[data_ptr] = memory_to_n_tensor[data_ptr] == NOT_OWN
+      ? NOT_OWN
+      : memory_to_n_tensor[data_ptr] + 1;
 
   ET_LOG(Debug, "aoti_torch__reinterpret_tensor: successfull");
   return Error::Ok;
@@ -437,13 +501,27 @@ AOTITorchError aoti_torch__reinterpret_tensor(
 
 // Cleanup function for clearing global state
 void cleanup_memory() {
-  is_tensor_own_memory.clear();
-  if (!tensors.empty()) {
-    ET_LOG(Error, "Warning: tensors not empty during cleanup");
+  // Use aoti_torch_delete_tensor_object to properly delete each tensor
+  // Note: We need to collect tensor pointers first since deletion modifies the
+  // set
+  std::vector<Tensor*> tensor_ptrs;
+  tensor_ptrs.reserve(tensors.size());
+  for (const auto& tensor_shared : tensors) {
+    tensor_ptrs.push_back(tensor_shared.get());
+  }
+
+  // Now delete each tensor - this will modify the global tensors set
+  for (Tensor* tensor_ptr : tensor_ptrs) {
+    aoti_torch_delete_tensor_object(tensor_ptr);
   }
 
+  // tensors set should now be empty, but ensure it's cleared
+  tensors.clear();
+
   // Clean up Metal resources
   metal_cleanup_resources();
+
+  ET_LOG(Info, "Cleared all tensors and Metal resources");
 }
 
 } // extern "C"
diff --git a/backends/apple/metal/runtime/shims/memory.h b/backends/apple/metal/runtime/shims/memory.h
index 47fb6352b50..5f48fd921c6 100644
--- a/backends/apple/metal/runtime/shims/memory.h
+++ b/backends/apple/metal/runtime/shims/memory.h
@@ -22,7 +22,7 @@ namespace metal {
 extern "C" {
 
 // Global storage declarations
-extern std::unordered_map<Tensor*, bool> is_tensor_own_memory;
+extern std::unordered_map<void*, int32_t> memory_to_n_tensor;
 extern std::unordered_set<std::shared_ptr<Tensor>> tensors;
 
 // Memory-related operations

From 5bd34ee8ed29147ae2171071eb1f00109144e4cd Mon Sep 17 00:00:00 2001
From: Manuel Candales
Date: Tue, 21 Oct 2025 21:19:21 -0700
Subject: [PATCH 3/3] Update

[ghstack-poisoned]
---
 backends/apple/metal/runtime/shims/memory.cpp | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/backends/apple/metal/runtime/shims/memory.cpp b/backends/apple/metal/runtime/shims/memory.cpp
index 50f8d4e1cd0..b5d2d3161ae 100644
--- a/backends/apple/metal/runtime/shims/memory.cpp
+++ b/backends/apple/metal/runtime/shims/memory.cpp
@@ -254,7 +254,9 @@ AOTITorchError aoti_torch_delete_tensor_object(AOTITensorHandle tensor) {
           // Tensor never owned the memory, skip freeing
           // Just remove tensor from tracking
           tensors.erase(it);
-          ET_LOG(Debug, "aoti_torch_delete_tensor_object: tensor doesn't own memory, skipping free");
+          ET_LOG(
+              Debug,
+              "aoti_torch_delete_tensor_object: tensor doesn't own memory, skipping free");
           return Error::Ok;
         } else if (ref_count == 1) {
           // Only current tensor using this memory, free it
@@ -265,7 +267,8 @@ AOTITorchError aoti_torch_delete_tensor_object(AOTITensorHandle tensor) {
             // This is CPU memory - free immediately
            free(data_ptr);
            data_ptr = nullptr;
-            ET_LOG(Debug, "aoti_torch_delete_tensor_object: freeing CPU memory");
+            ET_LOG(
+                Debug, "aoti_torch_delete_tensor_object: freeing CPU memory");
          }
 
          // Remove from memory tracking
@@ -273,7 +276,11 @@ AOTITorchError aoti_torch_delete_tensor_object(AOTITensorHandle tensor) {
          memory_to_n_tensor.erase(memory_it);
        } else if (ref_count > 1) {
          // Other tensors still using this memory, just decrement count
          memory_to_n_tensor[data_ptr] = ref_count - 1;
-          ET_LOG(Debug, "aoti_torch_delete_tensor_object: decremented ref count from %d to %d", ref_count, ref_count - 1);
+          ET_LOG(
+              Debug,
+              "aoti_torch_delete_tensor_object: decremented ref count from %d to %d",
+              ref_count,
+              ref_count - 1);
        } else {
          ET_CHECK_OR_RETURN_ERROR(