From 66a37e393d8e4054044e92fd1ed7504bc05226e7 Mon Sep 17 00:00:00 2001
From: Kimish Patel <kimishpatel@fb.com>
Date: Thu, 6 Nov 2025 12:51:30 -0800
Subject: [PATCH] [Executorch][LLM] Use caching allocator for runner

On iOS, we observed that this improves performance by 6% because the SDPA op makes temporary allocations.

No significant difference was observed on Android, though.

Differential Revision: [D86120038](https://our.internmc.facebook.com/intern/diff/D86120038/)

[ghstack-poisoned]
---
 CMakeLists.txt                             |  2 ++
 extension/llm/runner/CMakeLists.txt        |  2 +-
 extension/llm/runner/llm_runner_helper.cpp | 20 ++++++++++++++++++--
 extension/llm/runner/targets.bzl           |  1 +
 4 files changed, 22 insertions(+), 3 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c6d6f26b41f..3b6c84b049b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -922,6 +922,8 @@ if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
 endif()
 
 if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/memory_allocator/runner)
+  list(APPEND _executorch_extensions extension_memory_allocator)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner)
   list(APPEND _executorch_extensions extension_llm_runner)
 endif()
diff --git a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt
index 8d280b4eaf9..d144cf5492e 100644
--- a/extension/llm/runner/CMakeLists.txt
+++ b/extension/llm/runner/CMakeLists.txt
@@ -34,7 +34,7 @@ list(TRANSFORM _extension_llm_runner__srcs PREPEND "${EXECUTORCH_ROOT}/")
 add_library(extension_llm_runner STATIC ${_extension_llm_runner__srcs})
 
 set(runner_deps executorch_core extension_module extension_tensor
-                tokenizers::tokenizers
+                extension_memory_allocator tokenizers::tokenizers
 )
 
 # depend on arange_utils
diff --git a/extension/llm/runner/llm_runner_helper.cpp b/extension/llm/runner/llm_runner_helper.cpp
index 674be820072..9700604f2fe 100644
--- a/extension/llm/runner/llm_runner_helper.cpp
+++ b/extension/llm/runner/llm_runner_helper.cpp
@@ -17,6 +17,7 @@
 #include <executorch/extension/llm/runner/text_llm_runner.h>
 #include <executorch/extension/llm/runner/text_prefiller.h>
 #include <executorch/extension/llm/runner/text_token_generator.h>
+#include <executorch/extension/memory_allocator/cpu_caching_malloc_allocator.h>
 #include <executorch/runtime/core/result.h>
 #include <executorch/runtime/platform/runtime.h>
 #include <pytorch/tokenizers/hf_tokenizer.h>
@@ -209,11 +210,26 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
 
   // Create the Module
   std::unique_ptr<Module> module;
+  uint32_t max_cached_memory_size_bytes_ = 1024 * 1024 * 10; // 10MB
   if (data_files.size() > 0) {
     module = std::make_unique<Module>(
-        model_path, data_files, Module::LoadMode::File);
+        model_path,
+        data_files,
+        Module::LoadMode::File,
+        nullptr,
+        std::make_unique<
+            executorch::extension::CPUCachingAllocator>( // temp memory
+                                                         // allocator
+            max_cached_memory_size_bytes_));
   } else {
-    module = std::make_unique<Module>(model_path, Module::LoadMode::File);
+    module = std::make_unique<Module>(
+        model_path,
+        Module::LoadMode::File,
+        nullptr,
+        std::make_unique<
+            executorch::extension::CPUCachingAllocator>( // temp memory
+                                                         // allocator
+            max_cached_memory_size_bytes_));
   }
 
   // Get metadata from Module
diff --git a/extension/llm/runner/targets.bzl b/extension/llm/runner/targets.bzl
index e001e8fc154..418259a27e0 100644
--- a/extension/llm/runner/targets.bzl
+++ b/extension/llm/runner/targets.bzl
@@ -148,6 +148,7 @@ def define_common_targets():
                 ":text_prefiller" + aten_suffix,
                 ":text_token_generator" + aten_suffix,
                 "//executorch/extension/llm/runner/io_manager:io_manager" + aten_suffix,
+                "//executorch/extension/memory_allocator:cpu_caching_allocator",
                 "//pytorch/tokenizers:hf_tokenizer",
                 "//pytorch/tokenizers:llama2c_tokenizer",
                 "//pytorch/tokenizers:sentencepiece",