From b4d0cb4c35dafe39883ebdeb19b6988b3cc3a61d Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Fri, 12 Sep 2025 12:11:25 -0700
Subject: [PATCH 1/2] Remove generate_from_pos since there's no user of it

As titled
---
 .../executor_runner/mtk_llama_runner.h     |  2 ++
 .../oss_scripts/llama/runner/runner.cpp    | 14 ++---------
 .../oss_scripts/llama/runner/runner.h      |  9 ++-----
 extension/llm/runner/irunner.h             | 25 ++-----------------
 extension/llm/runner/text_llm_runner.cpp   | 13 ++--------
 extension/llm/runner/text_llm_runner.h     | 21 ++--------------
 6 files changed, 12 insertions(+), 72 deletions(-)

diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.h b/examples/mediatek/executor_runner/mtk_llama_runner.h
index 5dd8a85005e..e82b36d9373 100644
--- a/examples/mediatek/executor_runner/mtk_llama_runner.h
+++ b/examples/mediatek/executor_runner/mtk_llama_runner.h
@@ -66,6 +66,8 @@ class MTKLlamaRunner : public executorch::extension::llm::IRunner {
       std::function<void(const std::string&)> token_callback);
   std::unique_ptr<tokenizers::Tokenizer> load_tokenizer();
 
+  void reset() {}
+
  private:
   // model
   const LlamaModelOptions modeloptions_;
diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
index 0c9be4d441d..253e083a80e 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
+++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
@@ -354,17 +354,6 @@ Error Runner<T>::generate(
     const llm::GenerationConfig& config,
     std::function<void(const std::string&)> token_callback,
     std::function<void(const llm::Stats&)> stats_callback) {
-  return generate_from_pos(prompt, 0, config, token_callback, stats_callback);
-}
-
-template <typename T>
-Error Runner<T>::generate_from_pos(
-    const std::string& prompt,
-    int64_t start_pos,
-    const llm::GenerationConfig& config,
-    std::function<void(const std::string&)> token_callback,
-    std::function<void(const llm::Stats&)> stats_callback) {
-  // TODO: currently only support start_pos == 0
   return generate_from_prompt_or_file(
       prompt, false, config, token_callback, stats_callback);
 }
@@ -435,7 +424,8 @@ Error Runner<T>::generate_from_prompt_or_file(
   stats_.first_token_ms = time_in_ms();
   stats_.prompt_eval_end_ms = time_in_ms();
 
-  // print the first token from prefill. No prev_token so use cur_token for it.
+  // print the first token from prefill. No prev_token so use cur_token for
+  // it.
   if (token_callback) {
     token_callback(
         ET_UNWRAP_TOKENIZER(tokenizer_->decode(cur_token, cur_token)));
diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.h b/examples/qualcomm/oss_scripts/llama/runner/runner.h
index 30fba71ecef..9f290d79c75 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/runner.h
+++ b/examples/qualcomm/oss_scripts/llama/runner/runner.h
@@ -72,13 +72,7 @@ class Runner : public executorch::extension::llm::IRunner {
       std::function<void(const std::string&)> token_callback = {},
       std::function<void(const executorch::extension::llm::Stats&)> stats_callback = {}) override;
 
-  executorch::runtime::Error generate_from_pos(
-      const std::string& prompt,
-      int64_t start_pos,
-      const executorch::extension::llm::GenerationConfig& config,
-      std::function<void(const std::string&)> token_callback = {},
-      std::function<void(const executorch::extension::llm::Stats&)> stats_callback = {})
-      override;
+
   executorch::runtime::Error generate_from_prompt_or_file(
       const std::string& prompt,
       bool tokenized_prompt,
@@ -86,6 +80,7 @@ class Runner : public executorch::extension::llm::IRunner {
       std::function<void(const std::string&)> token_callback = {},
       std::function<void(const executorch::extension::llm::Stats&)> stats_callback = {});
   void stop() override {};
+  void reset() override {};
   executorch::runtime::Result<DecoderModelVersion> get_decoder_model_version();
 
  private:
diff --git a/extension/llm/runner/irunner.h b/extension/llm/runner/irunner.h
index 5bd5ef9d04e..ef93f32319c 100644
--- a/extension/llm/runner/irunner.h
+++ b/extension/llm/runner/irunner.h
@@ -125,39 +125,18 @@ class ET_EXPERIMENTAL IRunner {
       std::function<void(const std::string&)> token_callback,
       std::function<void(const Stats&)> stats_callback) = 0;
 
-  /**
-   * Generate text based on the provided prompt and generation config, from a
-   * given position in KV cache.
-   *
-   * @param prompt The input prompt to generate from
-   * @param start_pos The starting position in KV cache of the input. Note:
-   * Depending on the actual implementation, a runner may manage the position
-   * internally, and this may not be respected.
-   * @param config Generation configuration parameters
-   * @param token_callback Callback function called for each generated token
-   * @param stats_callback Callback function for generation statistics
-   * @return Error::Ok if successful, an error otherwise
-   */
-  virtual runtime::Error generate_from_pos(
-      const std::string& prompt,
-      int64_t start_pos,
-      const GenerationConfig& config,
-      std::function<void(const std::string&)> token_callback,
-      std::function<void(const Stats&)> stats_callback) = 0;
   /**
    * Stop the generation process.
    */
   virtual void stop() = 0;
+
   /**
    * Force remove prefilled tokens and reset KV cache start position
    *
-   * For some existing runners, overriding this method is not needed because
-   * start_pos is passed as an argument to generate_from_pos.
-   *
    * This method removes the prefilled tokens from the KV cache and resets the
    * start position to 0.
   */
-  virtual void reset() {};
+  virtual void reset() = 0;
 };
 
 } // namespace llm
diff --git a/extension/llm/runner/text_llm_runner.cpp b/extension/llm/runner/text_llm_runner.cpp
index b6f41fd7af6..338e35bcc2e 100644
--- a/extension/llm/runner/text_llm_runner.cpp
+++ b/extension/llm/runner/text_llm_runner.cpp
@@ -43,6 +43,7 @@ TextLLMRunner::TextLLMRunner(
       io_manager_(std::move(io_manager)),
       text_token_generator_(std::move(text_token_generator)),
       stats_(std::move(stats)),
+      pos_(0),
       temperature_(temperature) {
   // Note: This constructor assumes that text_prefiller and text_token_generator
   // already have references to the Module and TextDecoderRunner they need
@@ -70,9 +71,8 @@ Error TextLLMRunner::load() {
     ET_LOG(Info, format, __VA_ARGS__); \
   }
 
-Error TextLLMRunner::generate_from_pos(
+Error TextLLMRunner::generate(
     const std::string& prompt,
-    ET_UNUSED int64_t start_pos,
     const GenerationConfig& config,
     std::function<void(const std::string&)> token_callback,
     std::function<void(const Stats&)> stats_callback) {
@@ -217,15 +217,6 @@ Error TextLLMRunner::generate_from_pos(
   return Error::Ok;
 }
 
-Error TextLLMRunner::generate(
-    const std::string& prompt,
-    const GenerationConfig& config,
-    std::function<void(const std::string&)> token_callback,
-    std::function<void(const Stats&)> stats_callback) {
-  pos_ = 0;
-  return generate_from_pos(prompt, 0, config, token_callback, stats_callback);
-}
-
 Error TextLLMRunner::warmup(const std::string& prompt, int32_t max_new_tokens) {
   // Create a GenerationConfig for warmup
   GenerationConfig config{
diff --git a/extension/llm/runner/text_llm_runner.h b/extension/llm/runner/text_llm_runner.h
index 21b77fe1dfa..9dd99d82d59 100644
--- a/extension/llm/runner/text_llm_runner.h
+++ b/extension/llm/runner/text_llm_runner.h
@@ -101,25 +101,6 @@ class ET_EXPERIMENTAL TextLLMRunner : public IRunner {
       std::function<void(const std::string&)> token_callback = {},
       std::function<void(const Stats&)> stats_callback = {}) override;
 
-  /**
-   * Generate text based on the provided prompt and generation config, from a
-   * given position in KV cache.
-   *
-   * @param prompt The input prompt to generate from
-   * @param start_pos [Unused] The starting position in KV cache of the input,
-   * ignored because the runner manages the position internally.
-   * @param config Generation configuration parameters
-   * @param token_callback Callback function called for each generated token
-   * @param stats_callback Callback function for generation statistics
-   * @return Error::Ok if successful, an error otherwise
-   */
-  ET_DEPRECATED runtime::Error generate_from_pos(
-      const std::string& prompt,
-      ET_UNUSED int64_t start_pos,
-      const GenerationConfig& config,
-      std::function<void(const std::string&)> token_callback = {},
-      std::function<void(const Stats&)> stats_callback = {}) override;
-
   /**
    * @brief Warms up the model with a sample prompt
    *
@@ -133,6 +114,7 @@ class ET_EXPERIMENTAL TextLLMRunner : public IRunner {
   ::executorch::runtime::Error warmup(
       const std::string& prompt,
       int32_t max_new_tokens);
+
   /**
    * @brief Remove prefilled tokens and reset start position, and stats.
    *
    * This method removes the prefilled tokens from the KV cache and resets the
    * start position to 0. It also clears the stats for previous runs.
    */
   void reset() override;
+
   /**
    * @brief Stops the ongoing text generation process
    *

From 5766ae2b7d7a673932d7069e864b0f6f0e828b76 Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Fri, 12 Sep 2025 16:24:52 -0700
Subject: [PATCH 2/2] Reset in llama main

---
 examples/models/llama/main.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/models/llama/main.cpp b/examples/models/llama/main.cpp
index 25b840f260b..078d938ffde 100644
--- a/examples/models/llama/main.cpp
+++ b/examples/models/llama/main.cpp
@@ -105,6 +105,8 @@ int32_t main(int32_t argc, char** argv) {
       ET_LOG(Error, "Failed to warmup llama runner");
       return 1;
     }
+    // reset kv cache pos to 0
+    runner->reset();
   }
   // generate
   executorch::extension::llm::GenerationConfig config{
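
Taken together, the two patches change the caller-side contract: restarting decoding at KV cache position 0 is now done by calling reset() before the next generate() call, as main.cpp does after warmup, and IRunner implementations must provide reset() since it is now pure virtual. Below is a minimal caller-side sketch of that flow, not taken from the patches themselves: it assumes a loaded TextLLMRunner held in a std::unique_ptr named runner inside a main()-style function with <cstdio> and the ExecuTorch runner headers included, and the prompt text, token counts, and the max_new_tokens config field assignment are illustrative assumptions.

  using executorch::extension::llm::GenerationConfig;
  using executorch::runtime::Error;

  // Warm up once, then drop the prefilled tokens and move the KV cache
  // start position back to 0 before the real run.
  if (runner->warmup("Tell me a story", /*max_new_tokens=*/8) != Error::Ok) {
    return 1;
  }
  runner->reset();

  GenerationConfig config;
  config.max_new_tokens = 64;  // assumed field; any valid config works here
  Error err = runner->generate(
      "Tell me a story",
      config,
      [](const std::string& piece) { ::printf("%s", piece.c_str()); },  // token_callback
      {});  // stats_callback left empty
  if (err != Error::Ok) {
    return 1;
  }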