From 980f44da29a1222886e483eca9ed9911f414f137 Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Mon, 14 Jul 2025 21:04:15 -0700
Subject: [PATCH] Fix cache_positions tensor size in TextLLMRunner

In the Huggingface causal LM forward convention, `cache_position` should
have the same length as `input_ids`. The previous logic allocated
`cache_position` based on method metadata, which by default equals the
maximum length of this tensor (normally the max context length). This
change aligns the size of `cache_position` with that of `input_ids`.

ghstack-source-id: d1b407c02b8e2f6ba3a7d7be777d0ac5a855e3b3
Pull Request resolved: https://github.com/pytorch/executorch/pull/12476
---
 extension/llm/runner/text_decoder_runner.cpp | 17 ++++++++++-------
 extension/llm/runner/text_prefiller.h        |  2 +-
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/extension/llm/runner/text_decoder_runner.cpp b/extension/llm/runner/text_decoder_runner.cpp
index e60a07bc50a..4293b2a08d8 100644
--- a/extension/llm/runner/text_decoder_runner.cpp
+++ b/extension/llm/runner/text_decoder_runner.cpp
@@ -52,22 +52,25 @@ ::executorch::runtime::Result<executorch::aten::Tensor> TextDecoderRunner::step(
     auto numel = sizes[0];
     std::vector<::executorch::aten::SizesType> sizes_vec = {numel};
-    // Assuming the last dimension is the one with the variable token length,
-    // for example [1, S] or [1, 1, S]
-    sizes_vec[sizes_vec.size() - 1] = numel;
     TensorPtr start_pos_tensor;
     if (numel > 1) {
-      // Assuming model is exported with cache_positions, create a tensor with
-      // the same size as cache_positions
+      // If we are here, model is exported with cache_positions, create a tensor
+      // with the same length as input_ids. Assuming the last dimension is the
+      // one with the variable token length, for example [1, S] or [1, 1, S]
+      sizes_vec[sizes_vec.size() - 1] = tokens->numel();
       start_pos_tensor =
           empty(sizes_vec, ::executorch::aten::ScalarType::Long);
       torch::executor::native::arange_out_impl(
-          start_pos, start_pos + numel, 1.0, *start_pos_tensor);
+          start_pos, start_pos + tokens->numel(), 1.0, *start_pos_tensor);
     } else {
       // Assuming model is exported with input_pos, create a tensor with size 1
       start_pos_tensor = from_blob(
           &start_pos, sizes_vec, ::executorch::aten::ScalarType::Long);
     }
-    ET_LOG(Info, "Start pos tensor numel: %zu", start_pos_tensor->numel());
+    ET_LOG(
+        Info,
+        "Start pos tensor numel: %zu, tokens numel: %zu",
+        start_pos_tensor->numel(),
+        tokens->numel());
     auto outputs_res = module_->forward({tokens, start_pos_tensor});
     ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error());
     ET_CHECK_MSG(
diff --git a/extension/llm/runner/text_prefiller.h b/extension/llm/runner/text_prefiller.h
index ce12506a05c..a02cd3d1bf4 100644
--- a/extension/llm/runner/text_prefiller.h
+++ b/extension/llm/runner/text_prefiller.h
@@ -21,7 +21,7 @@ class ET_EXPERIMENTAL TextPrefiller {
  public:
   TextPrefiller(
       TextDecoderRunner* text_decoder_runner,
-      bool use_kv_cache_,
+      bool use_kv_cache,
       bool enable_parallel_prefill,
       int64_t max_seq_len = 128);
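
Below is a minimal standalone sketch of the indexing convention this patch
enforces: the cache-position tensor must hold exactly one entry per input
token, counting up from the current start position, rather than being sized
from method metadata (the max context length). It uses plain C++ with
std::iota in place of ExecuTorch's TensorPtr and arange_out_impl APIs;
build_cache_positions and its parameters are illustrative names, not part of
the patch.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

// Hypothetical helper mirroring the patched logic: the result has the same
// length as input_ids (num_tokens) and holds start_pos, start_pos + 1, ...
std::vector<int64_t> build_cache_positions(
    int64_t start_pos,
    std::size_t num_tokens) {
  std::vector<int64_t> cache_positions(num_tokens);
  // Analogous to arange_out_impl(start_pos, start_pos + tokens->numel(), ...)
  // in the patch: fill with consecutive positions starting at start_pos.
  std::iota(cache_positions.begin(), cache_positions.end(), start_pos);
  return cache_positions;
}

int main() {
  // Prefilling 4 tokens starting at position 10 yields {10, 11, 12, 13},
  // matching the Huggingface causal LM convention that cache_position has
  // the same length as input_ids.
  for (int64_t p : build_cache_positions(/*start_pos=*/10, /*num_tokens=*/4)) {
    std::cout << p << ' ';
  }
  std::cout << '\n';
  return 0;
}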