diff --git a/extension/memory_allocator/CMakeLists.txt b/extension/memory_allocator/CMakeLists.txt
new file mode 100644
index 00000000000..1c3c8a0831c
--- /dev/null
+++ b/extension/memory_allocator/CMakeLists.txt
@@ -0,0 +1,48 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Please keep this file formatted by running:
+# ~~~
+# cmake-format -i CMakeLists.txt
+# ~~~
+
+cmake_minimum_required(VERSION 3.19)
+
+# Source root directory for executorch.
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
+endif()
+
+list(TRANSFORM _extension_memory_allocator__srcs PREPEND "${EXECUTORCH_ROOT}/")
+if(CMAKE_TOOLCHAIN_IOS
+   OR CMAKE_TOOLCHAIN_ANDROID
+   OR APPLE
+)
+  # Building a shared library on iOS requires code signing. On Android we see
+  # duplicated registration when using a shared lib, so build statically.
+  add_library(extension_memory_allocator STATIC ${_extension_memory_allocator__srcs})
+else()
+  add_library(extension_memory_allocator ${_extension_memory_allocator__srcs})
+endif()
+target_link_libraries(
+  extension_memory_allocator PRIVATE executorch_core)
+target_include_directories(
+  extension_memory_allocator PUBLIC ${_common_include_directories}
+)
+target_compile_options(
+  extension_memory_allocator
+  PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/wd4996>
+         $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-deprecated-declarations -fPIC>
+)
+
+# Install libraries
+install(
+  TARGETS extension_memory_allocator
+  EXPORT ExecuTorchTargets
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  INCLUDES
+  DESTINATION ${_common_include_directories}
+)
diff --git a/extension/memory_allocator/cpu_caching_malloc_allocator.cpp b/extension/memory_allocator/cpu_caching_malloc_allocator.cpp
new file mode 100644
index 00000000000..eac308a944e
--- /dev/null
+++ b/extension/memory_allocator/cpu_caching_malloc_allocator.cpp
@@ -0,0 +1,89 @@
+#include <executorch/extension/memory_allocator/cpu_caching_malloc_allocator.h>
+
+#include <algorithm> // std::max
+#include <cstdlib> // std::aligned_alloc, std::free
+namespace executorch::extension {
+
+namespace {
+size_t get_alignment_adjusted_size(size_t size, size_t alignment) {
+  alignment = std::max(alignment, kDefaultAlignment);
+  if (size % alignment != 0) {
+    // Round size up to the next multiple of alignment; std::aligned_alloc
+    // requires size to be an integral multiple of alignment.
+    return (size + alignment) & ~(alignment - 1);
+  } else {
+    return size;
+  }
+}
+} // namespace
+
+CPUCachingAllocator::CPUCachingAllocator(uint32_t max_size)
+    : MemoryAllocator(0, nullptr),
+      max_size_(max_size),
+      current_size_(0) {}
+
+void* CPUCachingAllocator::allocate(size_t size, size_t alignment) {
+  EXECUTORCH_TRACK_ALLOCATION(prof_id(), size);
+
+  if (!isPowerOf2(alignment)) {
+    ET_LOG(Error, "Alignment %zu is not a power of 2", alignment);
+    return nullptr;
+  }
+  size = get_alignment_adjusted_size(size, alignment);
+
+  std::lock_guard<std::mutex> guard(mutex_);
+  auto it = available_map_.find(size);
+  if (it == available_map_.end() || it->second.empty()) {
+    if (current_size_ + size > max_size_) {
+      // Freeing while holding the lock will cause performance issues.
+      // We probably should log how often this happens so as to allow
+      // the calling site to adjust the max_size_ parameter.
+      free_cached();
+    }
+    void* ptr = std::aligned_alloc(alignment, size);
+    if (ptr == nullptr) {
+      ET_LOG(Error, "Failed to allocate memory");
+      return nullptr;
+    }
+    // Count the bytes only after the allocation actually succeeded.
+    current_size_ += size;
+    allocation_map_[ptr] = size;
+    return ptr;
+  }
+  void* ptr = it->second.back();
+  it->second.pop_back();
+  current_size_ += size; // reset() subtracted this when caching the block.
+  allocation_map_[ptr] = size;
+  return ptr;
+}
+
+void CPUCachingAllocator::free_cached() {
+  // We don't lock mutex_ here because that would deadlock: the callers
+  // (allocate() and the destructor) already guarantee exclusive access.
+  // free_cached() is not a public API anyways.
+  for (const auto& it : available_map_) {
+    for (const auto ptr : it.second) {
+      std::free(ptr);
+    }
+  }
+  available_map_.clear();
+}
+
+void CPUCachingAllocator::reset() {
+  std::lock_guard<std::mutex> guard(mutex_);
+  for (auto& it : allocation_map_) {
+    void* ptr = it.first;
+    size_t alloc_size = it.second;
+    // Cache the memory; it no longer counts as handed out.
+    available_map_[alloc_size].push_back(ptr);
+    current_size_ -= alloc_size;
+  }
+  allocation_map_.clear();
+}
+
+CPUCachingAllocator::~CPUCachingAllocator() {
+  // Must be called in a thread-safe manner; releases everything still held.
+  reset();
+  free_cached();
+}
+
+} // namespace executorch::extension
diff --git a/extension/memory_allocator/cpu_caching_malloc_allocator.h b/extension/memory_allocator/cpu_caching_malloc_allocator.h
new file mode 100644
index 00000000000..d22f9e38396
--- /dev/null
+++ b/extension/memory_allocator/cpu_caching_malloc_allocator.h
@@ -0,0 +1,81 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <mutex>
+
+#include <executorch/runtime/core/memory_allocator.h>
+#ifdef USE_C10_SMALL_VECTOR
+#include <c10/util/SmallVector.h>
+#else
+#include <vector>
+#endif
+
+#ifdef USE_C10_FLAT_HASH_MAP
+#include <c10/util/flat_hash_map.h>
+#else
+#include <unordered_map>
+#endif
+
+/*
+ * CPUCachingAllocator:
+ * This file is adapted from c10/mobile/CPUCachingAllocator.h.
+ * It is a thread-safe caching allocator: freed blocks are cached for reuse.
+ */
+
+namespace executorch::extension {
+
+#ifdef USE_C10_SMALL_VECTOR
+template <typename T>
+using SmallVector = c10::SmallVector<T, 16>;
+#else
+template <typename T>
+using SmallVector = std::vector<T>;
+#endif
+
+#ifdef USE_C10_FLAT_HASH_MAP
+template <typename K, typename V>
+using FlatHashMap = ska::flat_hash_map<K, V>;
+#else
+template <typename K, typename V>
+using FlatHashMap = std::unordered_map<K, V>;
+#endif
+
+constexpr size_t kDefaultAlignment = 64;
+class CPUCachingAllocator : public executorch::runtime::MemoryAllocator {
+  /*
+   * What it does:
+   * Caches all the allocations carried out by this allocator.
+   * Cache key is the (alignment-adjusted) size of the allocation.
+   * If a requested size is found in the cache, the cached pointer is reused.
+   * What it does not do:
+   * No speculative allocation for any future allocations.
+   */
+ private:
+  void free_cached(); // Frees all cached blocks; callers hold exclusive access.
+
+ protected:
+  // Invariants: allocation_map_ tracks blocks currently handed out to
+  // callers; available_map_ holds cached reusable blocks keyed by size.
+  FlatHashMap<size_t, SmallVector<void*>> available_map_;
+  FlatHashMap<void*, size_t> allocation_map_;
+  // Both maps above are mutated/read via every public API, so all
+  // accesses are serialized with this mutex.
+  std::mutex mutex_;
+  size_t max_size_;
+  size_t current_size_;
+
+ public:
+  /*
+   * max_size: Maximum size of memory to cache. Never cache more than that.
+   */
+  CPUCachingAllocator(uint32_t max_size);
+  // Checks the cache to see if an allocation of `size` bytes can be found.
+  // If so, returns the cached memory; else allocates fresh memory,
+  // records it for caching and returns it.
+  void* allocate(size_t size, size_t alignment = kDefaultAlignment) override;
+  void reset() override;
+  ~CPUCachingAllocator() override;
+};
+
+} // namespace executorch::extension
diff --git a/extension/memory_allocator/targets.bzl b/extension/memory_allocator/targets.bzl
index d021a4da707..f51b084e4ee 100644
--- a/extension/memory_allocator/targets.bzl
+++ b/extension/memory_allocator/targets.bzl
@@ -20,3 +20,20 @@ def define_common_targets():
             "@EXECUTORCH_CLIENTS",
         ],
     )
+
+    runtime.cxx_library(
+        name = "cpu_caching_allocator",
+        srcs = [
+            "cpu_caching_malloc_allocator.cpp",
+        ],
+        exported_headers = [
+            "cpu_caching_malloc_allocator.h",
+        ],
+        exported_deps = [
+            "//executorch/runtime/core:memory_allocator",
+        ],
+        visibility = [
+            "//executorch/extension/memory_allocator/test/...",
+            "@EXECUTORCH_CLIENTS",
+        ],
+    )
diff --git a/extension/memory_allocator/test/cpu_caching_malloc_allocator_test.cpp b/extension/memory_allocator/test/cpu_caching_malloc_allocator_test.cpp
new file mode 100644
index 00000000000..cda60c1b6f3
--- /dev/null
+++ b/extension/memory_allocator/test/cpu_caching_malloc_allocator_test.cpp
@@ -0,0 +1,301 @@
+#include <cstdint>
+#include <cstring>
+#include <mutex>
+#include <thread>
+#include <vector>
+
+#include <executorch/extension/memory_allocator/cpu_caching_malloc_allocator.h>
+
+#include <executorch/runtime/platform/runtime.h>
+#include <gtest/gtest.h>
+
+using namespace ::testing;
+using executorch::extension::CPUCachingAllocator;
+
+constexpr auto kDefaultAlignment = executorch::extension::kDefaultAlignment;
+
+class CPUCachingAllocatorTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    // Since these tests cause ET_LOG to be called, the PAL must be initialized
+    // first.
+    executorch::runtime::runtime_init();
+  }
+};
+
+bool is_aligned(const void* ptr, size_t alignment) {
+  uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
+  return addr % alignment == 0;
+}
+
+#define EXPECT_ALIGNED(ptr, alignment)        \
+  EXPECT_TRUE(is_aligned((ptr), (alignment))) \
+      << "Pointer " << (ptr) << " is not aligned to " << (alignment)
+
+TEST_F(CPUCachingAllocatorTest, SimpleAllocateSucceeds) {
+  CPUCachingAllocator allocator(1024 * 1024); // 1MB max size
+
+  auto p = allocator.allocate(16);
+  EXPECT_NE(p, nullptr);
+  EXPECT_ALIGNED(p, kDefaultAlignment);
+
+  auto p2 = allocator.allocate(32);
+  EXPECT_NE(p2, nullptr);
+  EXPECT_ALIGNED(p2, kDefaultAlignment);
+
+  auto p3 = allocator.allocate(64);
+  EXPECT_NE(p3, nullptr);
+  EXPECT_ALIGNED(p3, kDefaultAlignment);
+}
+
+TEST_F(CPUCachingAllocatorTest, CachingReusesSameSize) {
+  CPUCachingAllocator allocator(1024 * 1024); // 1MB max size
+
+  auto p1 = allocator.allocate(256);
+  EXPECT_NE(p1, nullptr);
+  EXPECT_ALIGNED(p1, kDefaultAlignment);
+
+  // Reset to return the allocation to the cache
+  allocator.reset();
+
+  // Allocate the same size should reuse the cached pointer
+  auto p2 = allocator.allocate(256);
+  EXPECT_EQ(p1, p2);
+  EXPECT_ALIGNED(p2, kDefaultAlignment);
+}
+
+TEST_F(CPUCachingAllocatorTest, DifferentSizesAllocateDifferentPtrs) {
+  CPUCachingAllocator allocator(1024 * 1024); // 1MB max size
+
+  auto p1 = allocator.allocate(128);
+  auto p2 = allocator.allocate(256);
+  auto p3 = allocator.allocate(512);
+
+  EXPECT_NE(p1, nullptr);
+  EXPECT_NE(p2, nullptr);
+  EXPECT_NE(p3, nullptr);
+
+  // All pointers should be different
+  EXPECT_NE(p1, p2);
+  EXPECT_NE(p2, p3);
+  EXPECT_NE(p1, p3);
+
+  EXPECT_ALIGNED(p1, kDefaultAlignment);
+  EXPECT_ALIGNED(p2, kDefaultAlignment);
+  EXPECT_ALIGNED(p3, kDefaultAlignment);
+}
+
+TEST_F(CPUCachingAllocatorTest, ResetCachesAllocations) {
+  CPUCachingAllocator allocator(1024 * 1024); // 1MB max size
+
+  auto p1 = allocator.allocate(256);
+  auto p2 = allocator.allocate(256);
+  EXPECT_NE(p1, p2);
+
+  allocator.reset();
+
+  // After reset, both cached allocations should be available
+  auto p3 = allocator.allocate(256);
+  auto p4 = allocator.allocate(256);
+
+  // p3 should be one of the cached pointers (either p1 or p2)
+  EXPECT_TRUE((p3 == p1) || (p3 == p2));
+  EXPECT_TRUE((p4 == p1) || (p4 == p2));
+  EXPECT_NE(p3, p4);
+}
+
+TEST_F(CPUCachingAllocatorTest, AlignmentParameter) {
+  CPUCachingAllocator allocator(1024 * 1024); // 1MB max size
+
+  std::vector<size_t> alignments = {
+      kDefaultAlignment,
+      kDefaultAlignment * 2,
+      kDefaultAlignment * 4,
+      kDefaultAlignment * 8,
+  };
+
+  for (size_t alignment : alignments) {
+    auto p = allocator.allocate(256, alignment);
+    EXPECT_NE(p, nullptr);
+    EXPECT_ALIGNED(p, alignment);
+  }
+}
+
+TEST_F(CPUCachingAllocatorTest, InvalidAlignmentFails) {
+  CPUCachingAllocator allocator(1024 * 1024); // 1MB max size
+
+  // Should fail because alignment is not a power of 2
+  std::vector<size_t> invalid_alignments = {0, 5, 6, 12, 34};
+  for (auto alignment : invalid_alignments) {
+    auto p = allocator.allocate(256, alignment);
+    EXPECT_EQ(p, nullptr);
+  }
+}
+
+TEST_F(CPUCachingAllocatorTest, MaxSizeRespected) {
+  constexpr size_t kMaxSize = 1024; // 1KB max
+  CPUCachingAllocator allocator(kMaxSize);
+
+  // Allocate close to the max size
+  auto p1 = allocator.allocate(512);
+  EXPECT_NE(p1, nullptr);
+
+  auto p2 = allocator.allocate(512);
+  EXPECT_NE(p2, nullptr);
+
+  // This should trigger cache freeing since we would exceed max_size
+  auto p3 = allocator.allocate(512);
+  EXPECT_NE(p3, nullptr);
+}
+
+TEST_F(CPUCachingAllocatorTest, MultipleAllocationsAndResets) {
+  CPUCachingAllocator allocator(1024 * 1024); // 1MB max size
+
+  for (int i = 0; i < 5; ++i) {
+    auto p1 = allocator.allocate(256);
+    auto p2 = allocator.allocate(512);
+    auto p3 = allocator.allocate(1024);
+
+    EXPECT_NE(p1, nullptr);
+    EXPECT_NE(p2, nullptr);
+    EXPECT_NE(p3, nullptr);
+
+    allocator.reset();
+  }
+}
+
+TEST_F(CPUCachingAllocatorTest, MemoryWriteability) {
+  CPUCachingAllocator allocator(1024 * 1024); // 1MB max size
+
+  const size_t size = 1024;
+  auto p = allocator.allocate(size);
+  EXPECT_NE(p, nullptr);
+
+  // Write to allocated memory
+  memset(p, 0x55, size);
+
+  // Read back and verify
+  uint8_t* bytes = reinterpret_cast<uint8_t*>(p);
+  for (size_t i = 0; i < size; ++i) {
+    EXPECT_EQ(bytes[i], 0x55);
+  }
+
+  allocator.reset();
+}
+
+TEST_F(CPUCachingAllocatorTest, CachingWithMultipleSizes) {
+  CPUCachingAllocator allocator(1024 * 1024); // 1MB max size
+
+  // Allocate various sizes
+  auto p1 = allocator.allocate(128);
+  auto p2 = allocator.allocate(256);
+  auto p3 = allocator.allocate(512);
+  auto p4 = allocator.allocate(128);
+
+  // Reset to cache them
+  allocator.reset();
+
+  // Allocate same sizes - should reuse cached pointers
+  auto p5 = allocator.allocate(128);
+  auto p6 = allocator.allocate(256);
+  auto p7 = allocator.allocate(512);
+
+  EXPECT_TRUE((p5 == p1) || (p5 == p4));
+  EXPECT_EQ(p6, p2);
+  EXPECT_EQ(p7, p3);
+}
+
+TEST_F(CPUCachingAllocatorTest, ThreadSafety) {
+  CPUCachingAllocator allocator(4 * 1024 * 1024); // 4MB max size
+
+  std::vector<std::thread> threads;
+  std::vector<void*> allocated_ptrs;
+  std::mutex ptrs_mutex;
+
+  const int num_threads = 4;
+  const int allocations_per_thread = 10;
+
+  // Lambda function for thread work
+  auto thread_work = [&]() {
+    for (int i = 0; i < allocations_per_thread; ++i) {
+      size_t size = (i + 1) * 64;
+      auto p = allocator.allocate(size);
+      EXPECT_NE(p, nullptr);
+      EXPECT_ALIGNED(p, kDefaultAlignment);
+
+      {
+        std::lock_guard<std::mutex> guard(ptrs_mutex);
+        allocated_ptrs.push_back(p);
+      }
+    }
+
+    // Reset in each thread
+    allocator.reset();
+  };
+
+  // Create threads
+  for (int i = 0; i < num_threads; ++i) {
+    threads.emplace_back(thread_work);
+  }
+
+  // Wait for all threads to finish
+  for (auto& thread : threads) {
+    thread.join();
+  }
+
+  // Verify all allocations were valid
+  EXPECT_EQ(allocated_ptrs.size(), static_cast<size_t>(num_threads * allocations_per_thread));
+}
+
+TEST_F(CPUCachingAllocatorTest, LargeAllocation) {
+  CPUCachingAllocator allocator(10 * 1024 * 1024); // 10MB max size
+
+  const size_t large_size = 1024 * 1024; // 1MB allocation
+  auto p = allocator.allocate(large_size);
+  EXPECT_NE(p, nullptr);
+  EXPECT_ALIGNED(p, kDefaultAlignment);
+
+  // Write and verify
+  memset(p, 0xAA, large_size);
+  uint8_t* bytes = reinterpret_cast<uint8_t*>(p);
+  for (size_t i = 0; i < 1000; ++i) { // Sample check
+    EXPECT_EQ(bytes[i], 0xAA);
+  }
+
+  allocator.reset();
+
+  // Re-allocate same size should reuse cached pointer
+  auto p2 = allocator.allocate(large_size);
+  EXPECT_EQ(p, p2);
+}
+
+TEST_F(CPUCachingAllocatorTest, SizeAlignmentAdjustment) {
+  CPUCachingAllocator allocator(1024 * 1024); // 1MB max size
+
+  // Test that allocation sizes get properly aligned
+  auto p1 = allocator.allocate(100, 256); // Size not aligned to 256
+  EXPECT_NE(p1, nullptr);
+  EXPECT_ALIGNED(p1, 256);
+
+  auto p2 = allocator.allocate(100, 256);
+  // The cache is empty here, so p2 must be a fresh, distinct allocation.
+  EXPECT_NE(p1, p2);
+  allocator.reset();
+
+  auto p3 = allocator.allocate(100, 256);
+  // Both requests round to the same size bucket; the cache serves one back.
+  EXPECT_TRUE(p3 == p1 || p3 == p2);
+}
+
+TEST_F(CPUCachingAllocatorTest, ResetMultipleTimes) {
+  CPUCachingAllocator allocator(1024 * 1024); // 1MB max size
+
+  for (int i = 0; i < 3; ++i) {
+    auto p = allocator.allocate(512);
+    EXPECT_NE(p, nullptr);
+    allocator.reset();
+
+    auto p2 = allocator.allocate(512);
+    EXPECT_EQ(p, p2);
+  }
+}
diff --git a/extension/memory_allocator/test/targets.bzl b/extension/memory_allocator/test/targets.bzl
index 77fb6936a3a..5855bee5c14 100644
--- a/extension/memory_allocator/test/targets.bzl
+++ b/extension/memory_allocator/test/targets.bzl
@@ -15,3 +15,13 @@ def define_common_targets():
             "//executorch/extension/memory_allocator:malloc_memory_allocator",
         ],
     )
+
+    runtime.cxx_test(
+        name = "cpu_caching_malloc_allocator_test",
+        srcs = [
+            "cpu_caching_malloc_allocator_test.cpp",
+        ],
+        deps = [
+            "//executorch/extension/memory_allocator:cpu_caching_allocator",
+        ],
+    )
diff --git a/shim_et/xplat/executorch/build/build_variables.bzl b/shim_et/xplat/executorch/build/build_variables.bzl
index 8d8893f7454..0529c814f14 100644
--- a/shim_et/xplat/executorch/build/build_variables.bzl
+++ b/shim_et/xplat/executorch/build/build_variables.bzl
@@ -337,6 +337,10 @@ EXTENSION_FLAT_TENSOR_SRCS = [
     "extension/flat_tensor/serialize/flat_tensor_header.cpp",
 ]
 
+EXTENSION_MEMORY_ALLOCATOR_SRCS = [
+    "extension/memory_allocator/cpu_caching_malloc_allocator.cpp",
+]
+
 EXTENSION_MODULE_SRCS = [
     "extension/module/module.cpp",
 ]
diff --git a/tools/cmake/Codegen.cmake b/tools/cmake/Codegen.cmake
index 32d3d8b554f..e838e62c582 100644
--- a/tools/cmake/Codegen.cmake
+++ b/tools/cmake/Codegen.cmake
@@ -398,6 +398,7 @@ function(executorch_load_build_variables)
     EXTENSION_DATA_LOADER_SRCS
     EXTENSION_EVALUE_UTIL_SRCS
     EXTENSION_FLAT_TENSOR_SRCS
+    EXTENSION_MEMORY_ALLOCATOR_SRCS
     EXTENSION_MODULE_SRCS
     EXTENSION_NAMED_DATA_MAP_SRCS
     EXTENSION_RUNNER_UTIL_SRCS
@@ -431,6 +432,7 @@ function(executorch_load_build_variables)
     _extension_data_loader__srcs
     _extension_evalue_util__srcs
    _extension_flat_tensor__srcs
+    _extension_memory_allocator__srcs
     _extension_module__srcs
     _extension_named_data_map__srcs
     _extension_runner_util__srcs