From b4d0cb4c35dafe39883ebdeb19b6988b3cc3a61d Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Fri, 12 Sep 2025 12:11:25 -0700
Subject: [PATCH 1/2] Remove generate_from_pos since there's no user of it

As titled
---
 .../executor_runner/mtk_llama_runner.h     |  2 ++
 .../oss_scripts/llama/runner/runner.cpp    | 14 ++---------
 .../oss_scripts/llama/runner/runner.h      |  9 ++-----
 extension/llm/runner/irunner.h             | 25 ++-----------------
 extension/llm/runner/text_llm_runner.cpp   | 13 ++--------
 extension/llm/runner/text_llm_runner.h     | 21 ++--------------
 6 files changed, 12 insertions(+), 72 deletions(-)

diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.h b/examples/mediatek/executor_runner/mtk_llama_runner.h
index 5dd8a85005e..e82b36d9373 100644
--- a/examples/mediatek/executor_runner/mtk_llama_runner.h
+++ b/examples/mediatek/executor_runner/mtk_llama_runner.h
@@ -66,6 +66,8 @@ class MTKLlamaRunner : public executorch::extension::llm::IRunner {
       std::function<void(const std::string&)> token_callback);
   std::unique_ptr<tokenizers::Tokenizer> load_tokenizer();
 
+  void reset() {}
+
  private:
   // model
   const LlamaModelOptions modeloptions_;
diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
index 0c9be4d441d..253e083a80e 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
+++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
@@ -354,17 +354,6 @@ Error Runner<T>::generate(
     const llm::GenerationConfig& config,
     std::function<void(const std::string&)> token_callback,
     std::function<void(const llm::Stats&)> stats_callback) {
-  return generate_from_pos(prompt, 0, config, token_callback, stats_callback);
-}
-
-template <typename T>
-Error Runner<T>::generate_from_pos(
-    const std::string& prompt,
-    int64_t start_pos,
-    const llm::GenerationConfig& config,
-    std::function<void(const std::string&)> token_callback,
-    std::function<void(const llm::Stats&)> stats_callback) {
-  // TODO: currently only support start_pos == 0
   return generate_from_prompt_or_file(
       prompt, false, config, token_callback, stats_callback);
 }
@@ -435,7 +424,8 @@ Error Runner<T>::generate_from_prompt_or_file(
   stats_.first_token_ms = time_in_ms();
   stats_.prompt_eval_end_ms = time_in_ms();
 
-  // print the first token from prefill. No prev_token so use cur_token for it.
+  // print the first token from prefill. No prev_token so use cur_token for
+  // it.
   if (token_callback) {
     token_callback(
         ET_UNWRAP_TOKENIZER(tokenizer_->decode(cur_token, cur_token)));
diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.h b/examples/qualcomm/oss_scripts/llama/runner/runner.h
index 30fba71ecef..9f290d79c75 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/runner.h
+++ b/examples/qualcomm/oss_scripts/llama/runner/runner.h
@@ -72,13 +72,7 @@ class Runner : public executorch::extension::llm::IRunner {
       std::function<void(const std::string&)> token_callback = {},
       std::function<void(const executorch::extension::llm::Stats&)> stats_callback = {}) override;
 
-  executorch::runtime::Error generate_from_pos(
-      const std::string& prompt,
-      int64_t start_pos,
-      const executorch::extension::llm::GenerationConfig& config,
-      std::function<void(const std::string&)> token_callback = {},
-      std::function<void(const executorch::extension::llm::Stats&)> stats_callback = {})
-      override;
+
   executorch::runtime::Error generate_from_prompt_or_file(
       const std::string& prompt,
       bool tokenized_prompt,
@@ -86,6 +80,7 @@ class Runner : public executorch::extension::llm::IRunner {
       std::function<void(const std::string&)> token_callback = {},
       std::function<void(const executorch::extension::llm::Stats&)> stats_callback = {});
   void stop() override {};
+  void reset() override {};
   executorch::runtime::Result<DecoderModelVersion> get_decoder_model_version();
 
  private:
diff --git a/extension/llm/runner/irunner.h b/extension/llm/runner/irunner.h
index 5bd5ef9d04e..ef93f32319c 100644
--- a/extension/llm/runner/irunner.h
+++ b/extension/llm/runner/irunner.h
@@ -125,39 +125,18 @@ class ET_EXPERIMENTAL IRunner {
       std::function<void(const std::string&)> token_callback,
       std::function<void(const Stats&)> stats_callback) = 0;
 
-  /**
-   * Generate text based on the provided prompt and generation config, from a
-   * given position in KV cache.
-   *
-   * @param prompt The input prompt to generate from
-   * @param start_pos The starting position in KV cache of the input. Note:
-   * Depending on the actual implementation, a runner may manage the position
-   * internally, and this may not be respected.
-   * @param config Generation configuration parameters
-   * @param token_callback Callback function called for each generated token
-   * @param stats_callback Callback function for generation statistics
-   * @return Error::Ok if successful, an error otherwise
-   */
-  virtual runtime::Error generate_from_pos(
-      const std::string& prompt,
-      int64_t start_pos,
-      const GenerationConfig& config,
-      std::function<void(const std::string&)> token_callback,
-      std::function<void(const Stats&)> stats_callback) = 0;
   /**
    * Stop the generation process.
    */
   virtual void stop() = 0;
+
   /**
    * Force remove prefilled tokens and reset KV cache start position
    *
-   * For some existing runners, overriding this method is not needed because
-   * start_pos is passed as an argument to generate_from_pos.
-   *
    * This method removes the prefilled tokens from the KV cache and resets the
    * start position to 0.
   */
-  virtual void reset() {};
+  virtual void reset() = 0;
 };
 
 } // namespace llm
diff --git a/extension/llm/runner/text_llm_runner.cpp b/extension/llm/runner/text_llm_runner.cpp
index b6f41fd7af6..338e35bcc2e 100644
--- a/extension/llm/runner/text_llm_runner.cpp
+++ b/extension/llm/runner/text_llm_runner.cpp
@@ -43,6 +43,7 @@ TextLLMRunner::TextLLMRunner(
       io_manager_(std::move(io_manager)),
       text_token_generator_(std::move(text_token_generator)),
       stats_(std::move(stats)),
+      pos_(0),
       temperature_(temperature) {
   // Note: This constructor assumes that text_prefiller and text_token_generator
   // already have references to the Module and TextDecoderRunner they need
@@ -70,9 +71,8 @@ Error TextLLMRunner::load() {
     ET_LOG(Info, format, __VA_ARGS__); \
   }
 
-Error TextLLMRunner::generate_from_pos(
+Error TextLLMRunner::generate(
     const std::string& prompt,
-    ET_UNUSED int64_t start_pos,
     const GenerationConfig& config,
     std::function<void(const std::string&)> token_callback,
     std::function<void(const Stats&)> stats_callback) {
@@ -217,15 +217,6 @@ Error TextLLMRunner::generate_from_pos(
   return Error::Ok;
 }
 
-Error TextLLMRunner::generate(
-    const std::string& prompt,
-    const GenerationConfig& config,
-    std::function<void(const std::string&)> token_callback,
-    std::function<void(const Stats&)> stats_callback) {
-  pos_ = 0;
-  return generate_from_pos(prompt, 0, config, token_callback, stats_callback);
-}
-
 Error TextLLMRunner::warmup(const std::string& prompt, int32_t max_new_tokens) {
   // Create a GenerationConfig for warmup
   GenerationConfig config{
diff --git a/extension/llm/runner/text_llm_runner.h b/extension/llm/runner/text_llm_runner.h
index 21b77fe1dfa..9dd99d82d59 100644
--- a/extension/llm/runner/text_llm_runner.h
+++ b/extension/llm/runner/text_llm_runner.h
@@ -101,25 +101,6 @@ class ET_EXPERIMENTAL TextLLMRunner : public IRunner {
       std::function<void(const std::string&)> token_callback = {},
       std::function<void(const Stats&)> stats_callback = {}) override;
 
-  /**
-   * Generate text based on the provided prompt and generation config, from a
-   * given position in KV cache.
-   *
-   * @param prompt The input prompt to generate from
-   * @param start_pos [Unused] The starting position in KV cache of the input,
-   * ignored because the runner manages the position internally.
-   * @param config Generation configuration parameters
-   * @param token_callback Callback function called for each generated token
-   * @param stats_callback Callback function for generation statistics
-   * @return Error::Ok if successful, an error otherwise
-   */
-  ET_DEPRECATED runtime::Error generate_from_pos(
-      const std::string& prompt,
-      ET_UNUSED int64_t start_pos,
-      const GenerationConfig& config,
-      std::function<void(const std::string&)> token_callback = {},
-      std::function<void(const Stats&)> stats_callback = {}) override;
-
   /**
    * @brief Warms up the model with a sample prompt
    *
@@ -133,6 +114,7 @@ class ET_EXPERIMENTAL TextLLMRunner : public IRunner {
   ::executorch::runtime::Error warmup(
       const std::string& prompt,
       int32_t max_new_tokens);
+
   /**
    * @brief Remove prefilled tokens and reset start position, and stats.
    *
    * This method removes the prefilled tokens from the KV cache and resets the
    * start position to 0. It also clears the stats for previous runs.
    */
   void reset() override;
+
   /**
    * @brief Stops the ongoing text generation process
    *

From 5766ae2b7d7a673932d7069e864b0f6f0e828b76 Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Fri, 12 Sep 2025 16:24:52 -0700
Subject: [PATCH 2/2] Reset in llama main

---
 examples/models/llama/main.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/models/llama/main.cpp b/examples/models/llama/main.cpp
index 25b840f260b..078d938ffde 100644
--- a/examples/models/llama/main.cpp
+++ b/examples/models/llama/main.cpp
@@ -105,6 +105,8 @@ int32_t main(int32_t argc, char** argv) {
       ET_LOG(Error, "Failed to warmup llama runner");
       return 1;
     }
+    // reset kv cache pos to 0
+    runner->reset();
   }
   // generate
   executorch::extension::llm::GenerationConfig config{
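
Taken together, the two patches change the caller-side contract: restarting decoding at KV cache position 0 is now done by calling reset() before the next generate() call, as main.cpp does after warmup, and IRunner implementations must provide reset() since it is now pure virtual. Below is a minimal caller-side sketch of that flow, not taken from the patches themselves: it assumes a loaded TextLLMRunner held in a std::unique_ptr named runner inside a main()-style function with <cstdio> and the ExecuTorch runner headers included, and the prompt text, token counts, and the max_new_tokens config field assignment are illustrative assumptions.

  using executorch::extension::llm::GenerationConfig;
  using executorch::runtime::Error;

  // Warm up once, then drop the prefilled tokens and move the KV cache
  // start position back to 0 before the real run.
  if (runner->warmup("Tell me a story", /*max_new_tokens=*/8) != Error::Ok) {
    return 1;
  }
  runner->reset();

  GenerationConfig config;
  config.max_new_tokens = 64;  // assumed field; any valid config works here
  Error err = runner->generate(
      "Tell me a story",
      config,
      [](const std::string& piece) { ::printf("%s", piece.c_str()); },  // token_callback
      {});  // stats_callback left empty
  if (err != Error::Ok) {
    return 1;
  }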