From 7ebb46b89a23c2467b1ab57bfa29f1beee88d569 Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Mon, 20 Oct 2025 16:34:20 -0700 Subject: [PATCH 1/2] Fix text_llm_runner kv cache pos count --- extension/llm/runner/text_llm_runner.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/extension/llm/runner/text_llm_runner.cpp b/extension/llm/runner/text_llm_runner.cpp index 4e0bccdb781..496ca5744b1 100644 --- a/extension/llm/runner/text_llm_runner.cpp +++ b/extension/llm/runner/text_llm_runner.cpp @@ -199,6 +199,8 @@ Error TextLLMRunner::generate( } int64_t num_generated_tokens = generate_result.get(); + pos_ += num_generated_tokens; + stats_->inference_end_ms = time_in_ms(); if (!config.warming) { printf("\n"); From 12545a3fcc814263b23c19238c547cdef150ad2d Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Mon, 20 Oct 2025 17:09:09 -0700 Subject: [PATCH 2/2] Should pass pos_ to text_token_generator --- extension/llm/runner/text_llm_runner.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extension/llm/runner/text_llm_runner.cpp b/extension/llm/runner/text_llm_runner.cpp index 496ca5744b1..738951ebb95 100644 --- a/extension/llm/runner/text_llm_runner.cpp +++ b/extension/llm/runner/text_llm_runner.cpp @@ -190,7 +190,7 @@ Error TextLLMRunner::generate( // Generate max_new_tokens - 1 because prefill already generated 1 token. auto generate_result = text_token_generator_->generate( prompt_tokens, - num_prompt_tokens, + pos_, max_new_tokens - 1, temperature_ == -1.0f ? config.temperature : temperature_, wrapped_callback);