diff --git a/extension/llm/runner/llm_runner_helper.cpp b/extension/llm/runner/llm_runner_helper.cpp index 25846a2c5bc..6038353fdb5 100644 --- a/extension/llm/runner/llm_runner_helper.cpp +++ b/extension/llm/runner/llm_runner_helper.cpp @@ -183,7 +183,8 @@ std::unique_ptr create_text_llm_runner( std::unique_ptr<::tokenizers::Tokenizer> tokenizer, std::optional data_path, float temperature, - const std::string& method_name) { + const std::string& method_name, + Module::LoadMode load_mode) { if (data_path.has_value()) { std::vector data_files; data_files.push_back(data_path.value()); @@ -193,7 +194,8 @@ std::unique_ptr create_text_llm_runner( std::move(data_files), temperature, nullptr, - method_name); + method_name, + load_mode); } return create_text_llm_runner( model_path, @@ -201,7 +203,8 @@ std::unique_ptr create_text_llm_runner( std::vector(), temperature, nullptr, - method_name); + method_name, + load_mode); } std::unique_ptr create_text_llm_runner( @@ -210,7 +213,8 @@ std::unique_ptr create_text_llm_runner( std::vector data_files, float temperature, std::unique_ptr<::executorch::runtime::EventTracer> event_tracer, - const std::string& method_name) { + const std::string& method_name, + Module::LoadMode load_mode) { // Sanity check tokenizer if (!tokenizer || !tokenizer->is_loaded()) { ET_LOG(Error, "Tokenizer is null or not loaded"); @@ -221,13 +225,10 @@ std::unique_ptr create_text_llm_runner( std::unique_ptr module; if (data_files.size() > 0) { module = std::make_unique( - model_path, - data_files, - Module::LoadMode::File, - std::move(event_tracer)); + model_path, data_files, load_mode, std::move(event_tracer)); } else { module = std::make_unique( - model_path, Module::LoadMode::File, std::move(event_tracer)); + model_path, load_mode, std::move(event_tracer)); } // Get metadata from Module diff --git a/extension/llm/runner/llm_runner_helper.h b/extension/llm/runner/llm_runner_helper.h index 373124d8560..b4c7c59806d 100644 --- a/extension/llm/runner/llm_runner_helper.h +++ b/extension/llm/runner/llm_runner_helper.h @@ -96,6 +96,10 @@ ET_EXPERIMENTAL std::unordered_set get_eos_ids( * @param temperature Optional temperature parameter for controlling randomness * (deprecated) * @param method_name Name of the method to execute in the model + * @param load_mode Loading strategy for the model file. Defaults to + * MmapUseMlockIgnoreErrors which uses mmap to avoid loading the entire + * model into RAM and attempts to pin pages with mlock for lower inference + * latency, gracefully falling back to standard mmap if mlock is unavailable. * @return std::unique_ptr Initialized TextLLMRunner instance, or * nullptr on failure */ @@ -104,7 +108,8 @@ ET_EXPERIMENTAL std::unique_ptr create_text_llm_runner( std::unique_ptr<::tokenizers::Tokenizer> tokenizer, std::optional data_path, float temperature = -1.0f, - const std::string& method_name = "forward"); + const std::string& method_name = "forward", + Module::LoadMode load_mode = Module::LoadMode::MmapUseMlockIgnoreErrors); /** * @brief Creates a TextLLMRunner instance with dependency injection @@ -120,6 +125,10 @@ ET_EXPERIMENTAL std::unique_ptr create_text_llm_runner( * (deprecated) * @param event_tracer Optional event tracer for profiling * @param method_name Name of the method to execute in the model + * @param load_mode Loading strategy for the model file. Defaults to + * MmapUseMlockIgnoreErrors which uses mmap to avoid loading the entire + * model into RAM and attempts to pin pages with mlock for lower inference + * latency, gracefully falling back to standard mmap if mlock is unavailable. * @return std::unique_ptr Initialized TextLLMRunner instance, or * nullptr on failure */ @@ -129,7 +138,8 @@ ET_EXPERIMENTAL std::unique_ptr create_text_llm_runner( std::vector data_files = {}, float temperature = -1.0f, std::unique_ptr<::executorch::runtime::EventTracer> event_tracer = nullptr, - const std::string& method_name = "forward"); + const std::string& method_name = "forward", + Module::LoadMode load_mode = Module::LoadMode::MmapUseMlockIgnoreErrors); /** * @brief Creates a MultimodalRunner instance with dependency injection