From 2bcc2cbbcc88ec548d3a124e8dc48f978a112ee6 Mon Sep 17 00:00:00 2001
From: Hansong Zhang <107070759+kirklandsign@users.noreply.github.com>
Date: Mon, 20 Oct 2025 22:09:11 -0700
Subject: [PATCH] Fix text_llm_runner kv cache pos count and use it for
 generate() (#15286)

pos_ should advance by prefill and generated prompt size.

(cherry picked from commit 8946d8085d338cc05b763fbcfa599c3265a3f41f)
---
 extension/llm/runner/text_llm_runner.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/extension/llm/runner/text_llm_runner.cpp b/extension/llm/runner/text_llm_runner.cpp
index 333716ac831..0106fd5c250 100644
--- a/extension/llm/runner/text_llm_runner.cpp
+++ b/extension/llm/runner/text_llm_runner.cpp
@@ -183,11 +183,13 @@ Error TextLLMRunner::generate(
   // Generate max_new_tokens - 1 because prefill already generated 1 token.
   int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate(
       prompt_tokens,
-      num_prompt_tokens,
+      pos_,
       max_new_tokens - 1,
       temperature_ == -1.0f ? config.temperature : temperature_,
       wrapped_callback));
 
+  pos_ += num_generated_tokens;
+
   stats_->inference_end_ms = time_in_ms();
   if (!config.warming) {
     printf("\n");