Merged
5 changes: 0 additions & 5 deletions extension/llm/runner/text_decoder_runner.h
@@ -69,10 +69,6 @@ class ET_EXPERIMENTAL TextDecoderRunner {
    return method_name_;
  }

-  inline void stop() {
-    should_stop_ = true;
-  }
-
/**
Copilot AI Apr 2, 2026
Removing TextDecoderRunner::stop() is an API-breaking change for a public header (it’s exported and also has a torch::executor alias below). If downstream code may be calling this, consider keeping the method (even as a deprecated no-op) or providing a migration path rather than deleting it outright.

Suggested change
-  /**
+  /**
+   * Deprecated compatibility shim for older callers. TextDecoderRunner no
+   * longer requires explicit stop behavior, so this method is now a no-op.
+   */
+  [[deprecated(
+      "TextDecoderRunner::stop() is deprecated and is now a no-op; remove "
+      "calls to this method.")]] virtual void stop() {}
+  /**

* Sample the next token from the logits tensor.
* @param logits_tensor The logits tensor.
@@ -98,7 +94,6 @@ class ET_EXPERIMENTAL TextDecoderRunner {
  Module* module_;
  IOManager* io_manager_;
  std::string method_name_;
-  bool should_stop_{false};
};

} // namespace llm
1 change: 0 additions & 1 deletion extension/llm/runner/text_llm_runner.cpp
@@ -108,7 +108,6 @@ Error TextLLMRunner::generate(
  // return a response token.

  stats_->inference_start_ms = time_in_ms();
-  shouldStop_ = false;

  // Capture remaining KV cache capacity before prefill (pos_ will change)
  int64_t max_context_len = metadata_.at(kMaxContextLen) - pos_;
2 changes: 0 additions & 2 deletions extension/llm/runner/text_llm_runner.h
@@ -161,8 +161,6 @@ class ET_EXPERIMENTAL TextLLMRunner : public IRunner {
  void stop() override;

 private:
-  bool shouldStop_{false};
-
  // Components
  std::unique_ptr<::tokenizers::Tokenizer> tokenizer_;
  std::unordered_map<std::string, int64_t> metadata_;
10 changes: 6 additions & 4 deletions extension/llm/runner/text_token_generator.h
@@ -9,6 +9,8 @@
// Generate tokens in a loop.
#pragma once

+#include <atomic>
+
#include <executorch/extension/llm/runner/stats.h>
#include <executorch/extension/llm/runner/text_decoder_runner.h>
#include <executorch/extension/tensor/tensor.h>
@@ -83,7 +85,7 @@ class ET_EXPERIMENTAL TextTokenGenerator {
    auto tokens_managed = from_blob(
        token_data.data(), token_shape, executorch::aten::ScalarType::Long);

-    should_stop_ = false;
+    should_stop_.store(false, std::memory_order_relaxed);
Copilot AI Apr 2, 2026
should_stop_ is reset to false inside generate(). If stop() is called from another thread very early (e.g., immediately after generation starts), the subsequent store(false, ...) here can overwrite that stop request, making cancellation unreliable for that run. Consider resetting the flag before the operation becomes externally stoppable (or track cancellation via a generation id / use exchange with a protocol that can’t lose a concurrent stop request).

Suggested change
-    should_stop_.store(false, std::memory_order_relaxed);
+    // Clear any stale stop request from a previous run without losing a
+    // concurrent early stop for this run. If a stop was already requested,
+    // honor it immediately for this generation call.
+    if (should_stop_.exchange(false, std::memory_order_relaxed)) {
+      return 0;
+    }


// Generate our tokens
while (pos < start_pos + max_new_tokens) {
@@ -124,7 +126,7 @@ }
}
token_callback(std::move(*decode_result));

-    if (should_stop_) {
+    if (should_stop_.load(std::memory_order_relaxed)) {
break;
}

@@ -142,7 +144,7 @@
   * Stop the generation loop.
   */
  inline void stop() {
-    should_stop_ = true;
+    should_stop_.store(true, std::memory_order_relaxed);
  }
Comment on lines 146 to 148
Copilot AI Apr 2, 2026

There are existing unit tests for the runner/token generation path (e.g., test_text_llm_runner.cpp), but none appear to cover calling stop() concurrently with generate() to validate cancellation behavior and prevent regressions of this race fix. Adding a focused test (potentially in Python bindings where the GIL is released) would better exercise the cross-thread stop path.


/**
@@ -176,7 +178,7 @@ class ET_EXPERIMENTAL TextTokenGenerator {
  bool ignore_eos_ = false;

  // state machine
-  bool should_stop_ = false;
+  std::atomic<bool> should_stop_{false};

// stats
Stats* stats_;