From 66a37e393d8e4054044e92fd1ed7504bc05226e7 Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Thu, 6 Nov 2025 12:51:30 -0800 Subject: [PATCH] [Executorch][LLM] Use caching allocator for runner We observed that on iOS the caching allocator improves performance by 6% because the SDPA op makes temporary allocations. There is no significant difference on Android, though. Differential Revision: [D86120038](https://our.internmc.facebook.com/intern/diff/D86120038/) [ghstack-poisoned] --- CMakeLists.txt | 2 ++ extension/llm/runner/CMakeLists.txt | 2 +- extension/llm/runner/llm_runner_helper.cpp | 20 ++++++++++++++++++-- extension/llm/runner/targets.bzl | 1 + 4 files changed, 22 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c6d6f26b41f..3b6c84b049b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -922,6 +922,8 @@ if(EXECUTORCH_BUILD_EXTENSION_TRAINING) endif() if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/memory_allocator/runner) + list(APPEND _executorch_extensions extension_memory_allocator) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner) list(APPEND _executorch_extensions extension_llm_runner) endif() diff --git a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt index 8d280b4eaf9..d144cf5492e 100644 --- a/extension/llm/runner/CMakeLists.txt +++ b/extension/llm/runner/CMakeLists.txt @@ -34,7 +34,7 @@ list(TRANSFORM _extension_llm_runner__srcs PREPEND "${EXECUTORCH_ROOT}/") add_library(extension_llm_runner STATIC ${_extension_llm_runner__srcs}) set(runner_deps executorch_core extension_module extension_tensor - tokenizers::tokenizers + extension_memory_allocator tokenizers::tokenizers ) # depend on arange_utils diff --git a/extension/llm/runner/llm_runner_helper.cpp b/extension/llm/runner/llm_runner_helper.cpp index 674be820072..9700604f2fe 100644 --- a/extension/llm/runner/llm_runner_helper.cpp +++ b/extension/llm/runner/llm_runner_helper.cpp @@ -17,6 +17,7 @@ #include #include 
#include +#include #include #include #include @@ -209,11 +210,26 @@ std::unique_ptr create_text_llm_runner( // Create the Module std::unique_ptr module; + uint32_t max_cached_memory_size_bytes_ = 1024 * 1024 * 10; // 10MB if (data_files.size() > 0) { module = std::make_unique( - model_path, data_files, Module::LoadMode::File); + model_path, + data_files, + Module::LoadMode::File, + nullptr, + std::make_unique< + executorch::extension::CPUCachingAllocator>( // temp memory + // allocator + max_cached_memory_size_bytes_)); } else { - module = std::make_unique(model_path, Module::LoadMode::File); + module = std::make_unique( + model_path, + Module::LoadMode::File, + nullptr, + std::make_unique< + executorch::extension::CPUCachingAllocator>( // temp memory + // allocator + max_cached_memory_size_bytes_)); } // Get metadata from Module diff --git a/extension/llm/runner/targets.bzl b/extension/llm/runner/targets.bzl index e001e8fc154..418259a27e0 100644 --- a/extension/llm/runner/targets.bzl +++ b/extension/llm/runner/targets.bzl @@ -148,6 +148,7 @@ def define_common_targets(): ":text_prefiller" + aten_suffix, ":text_token_generator" + aten_suffix, "//executorch/extension/llm/runner/io_manager:io_manager" + aten_suffix, + "//executorch/extension/memory_allocator:cpu_caching_allocator", "//pytorch/tokenizers:hf_tokenizer", "//pytorch/tokenizers:llama2c_tokenizer", "//pytorch/tokenizers:sentencepiece",