From 8f5230b1472fe88a486229fba843ae906515dc21 Mon Sep 17 00:00:00 2001 From: Hansong Zhang <107070759+kirklandsign@users.noreply.github.com> Date: Wed, 1 Apr 2026 14:02:07 -0700 Subject: [PATCH 1/2] Fix data race on should_stop_ flag in LLM runner should_stop_ is written from the caller thread via stop() and read from the inference thread in the generate loop. A plain bool without synchronization is undefined behavior per the C++ standard and can cause the compiler to optimize away the cross-thread visibility on ARM targets. Change bool to std::atomic<bool> with relaxed memory ordering, which is sufficient for a simple cancellation flag and has negligible overhead. --- extension/llm/runner/text_decoder_runner.h | 6 ++++-- extension/llm/runner/text_llm_runner.h | 3 ++- extension/llm/runner/text_token_generator.h | 10 ++++++---- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/extension/llm/runner/text_decoder_runner.h b/extension/llm/runner/text_decoder_runner.h index 8b855e2924f..94f4120daa5 100644 --- a/extension/llm/runner/text_decoder_runner.h +++ b/extension/llm/runner/text_decoder_runner.h @@ -10,6 +10,8 @@ #pragma once +#include <atomic> + #include #include #include @@ -70,7 +72,7 @@ class ET_EXPERIMENTAL TextDecoderRunner { } inline void stop() { - should_stop_ = true; + should_stop_.store(true, std::memory_order_relaxed); } /** @@ -98,7 +100,7 @@ class ET_EXPERIMENTAL TextDecoderRunner { Module* module_; IOManager* io_manager_; std::string method_name_; - bool should_stop_{false}; + std::atomic<bool> should_stop_{false}; } // namespace llm diff --git a/extension/llm/runner/text_llm_runner.h b/extension/llm/runner/text_llm_runner.h index 1948ef446fe..e57e0af4178 100644 --- a/extension/llm/runner/text_llm_runner.h +++ b/extension/llm/runner/text_llm_runner.h @@ -11,6 +11,7 @@ #pragma once +#include <atomic> #include #include #include @@ -161,7 +162,7 @@ class ET_EXPERIMENTAL TextLLMRunner : public IRunner { void stop() override; private: - bool shouldStop_{false}; + 
std::atomic<bool> shouldStop_{false}; // Components std::unique_ptr<::tokenizers::Tokenizer> tokenizer_; diff --git a/extension/llm/runner/text_token_generator.h b/extension/llm/runner/text_token_generator.h index 128de05d1d9..bf4153739ae 100644 --- a/extension/llm/runner/text_token_generator.h +++ b/extension/llm/runner/text_token_generator.h @@ -9,6 +9,8 @@ // Generate tokens in a loop. #pragma once +#include <atomic> + #include #include #include @@ -83,7 +85,7 @@ class ET_EXPERIMENTAL TextTokenGenerator { auto tokens_managed = from_blob( token_data.data(), token_shape, executorch::aten::ScalarType::Long); - should_stop_ = false; + should_stop_.store(false, std::memory_order_relaxed); // Generate our tokens while (pos < start_pos + max_new_tokens) { @@ -124,7 +126,7 @@ class ET_EXPERIMENTAL TextTokenGenerator { } token_callback(std::move(*decode_result)); - if (should_stop_) { + if (should_stop_.load(std::memory_order_relaxed)) { break; } @@ -142,7 +144,7 @@ class ET_EXPERIMENTAL TextTokenGenerator { * Stop the generation loop. */ inline void stop() { - should_stop_ = true; + should_stop_.store(true, std::memory_order_relaxed); } /** @@ -176,7 +178,7 @@ class ET_EXPERIMENTAL TextTokenGenerator { bool ignore_eos_ = false; // state machine - bool should_stop_ = false; + std::atomic<bool> should_stop_{false}; // stats Stats* stats_; From d03f001efb0f9b1b0df1c4550441daeeb3b90667 Mon Sep 17 00:00:00 2001 From: Hansong Zhang <107070759+kirklandsign@users.noreply.github.com> Date: Wed, 1 Apr 2026 14:33:55 -0700 Subject: [PATCH 2/2] Remove unused should_stop_ flags from TextDecoderRunner and TextLLMRunner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These flags were written but never read — cancellation is handled entirely through TextTokenGenerator::should_stop_. Removing dead state and the unnecessary dependency from these two classes. 
--- extension/llm/runner/text_decoder_runner.h | 7 ------- extension/llm/runner/text_llm_runner.cpp | 1 - extension/llm/runner/text_llm_runner.h | 3 --- 3 files changed, 11 deletions(-) diff --git a/extension/llm/runner/text_decoder_runner.h b/extension/llm/runner/text_decoder_runner.h index 94f4120daa5..6762b73b7ce 100644 --- a/extension/llm/runner/text_decoder_runner.h +++ b/extension/llm/runner/text_decoder_runner.h @@ -10,8 +10,6 @@ #pragma once -#include <atomic> - #include #include #include @@ -71,10 +69,6 @@ class ET_EXPERIMENTAL TextDecoderRunner { return method_name_; } - inline void stop() { - should_stop_.store(true, std::memory_order_relaxed); - } - /** * Sample the next token from the logits tensor. * @param logits_tensor The logits tensor. @@ -100,7 +94,6 @@ class ET_EXPERIMENTAL TextDecoderRunner { Module* module_; IOManager* io_manager_; std::string method_name_; - std::atomic<bool> should_stop_{false}; } // namespace llm diff --git a/extension/llm/runner/text_llm_runner.cpp b/extension/llm/runner/text_llm_runner.cpp index 45ab6edd79c..5eee0e4cdaa 100644 --- a/extension/llm/runner/text_llm_runner.cpp +++ b/extension/llm/runner/text_llm_runner.cpp @@ -108,7 +108,6 @@ Error TextLLMRunner::generate( // return a response token. stats_->inference_start_ms = time_in_ms(); - shouldStop_ = false; // Capture remaining KV cache capacity before prefill (pos_ will change) int64_t max_context_len = metadata_.at(kMaxContextLen) - pos_; diff --git a/extension/llm/runner/text_llm_runner.h b/extension/llm/runner/text_llm_runner.h index e57e0af4178..c73b6a4bed6 100644 --- a/extension/llm/runner/text_llm_runner.h +++ b/extension/llm/runner/text_llm_runner.h @@ -11,7 +11,6 @@ #pragma once -#include <atomic> #include #include #include @@ -162,8 +161,6 @@ class ET_EXPERIMENTAL TextLLMRunner : public IRunner { void stop() override; private: - std::atomic<bool> shouldStop_{false}; - // Components std::unique_ptr<::tokenizers::Tokenizer> tokenizer_; std::unordered_map metadata_;