From 8f5230b1472fe88a486229fba843ae906515dc21 Mon Sep 17 00:00:00 2001 From: Hansong Zhang <107070759+kirklandsign@users.noreply.github.com> Date: Wed, 1 Apr 2026 14:02:07 -0700 Subject: [PATCH 1/2] Fix data race on should_stop_ flag in LLM runner should_stop_ is written from the caller thread via stop() and read from the inference thread in the generate loop. A plain bool without synchronization is undefined behavior per the C++ standard and can cause the compiler to optimize away the cross-thread visibility on ARM targets. Change bool to std::atomic<bool> with relaxed memory ordering, which is sufficient for a simple cancellation flag and has negligible overhead. --- extension/llm/runner/text_decoder_runner.h | 6 ++++-- extension/llm/runner/text_llm_runner.h | 3 ++- extension/llm/runner/text_token_generator.h | 10 ++++++---- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/extension/llm/runner/text_decoder_runner.h b/extension/llm/runner/text_decoder_runner.h index 8b855e2924f..94f4120daa5 100644 --- a/extension/llm/runner/text_decoder_runner.h +++ b/extension/llm/runner/text_decoder_runner.h @@ -10,6 +10,8 @@ #pragma once +#include <atomic> + #include #include #include @@ -70,7 +72,7 @@ class ET_EXPERIMENTAL TextDecoderRunner { } inline void stop() { - should_stop_ = true; + should_stop_.store(true, std::memory_order_relaxed); } /** @@ -98,7 +100,7 @@ class ET_EXPERIMENTAL TextDecoderRunner { Module* module_; IOManager* io_manager_; std::string method_name_; - bool should_stop_{false}; + std::atomic<bool> should_stop_{false}; } // namespace llm diff --git a/extension/llm/runner/text_llm_runner.h b/extension/llm/runner/text_llm_runner.h index 1948ef446fe..e57e0af4178 100644 --- a/extension/llm/runner/text_llm_runner.h +++ b/extension/llm/runner/text_llm_runner.h @@ -11,6 +11,7 @@ #pragma once +#include <atomic> #include #include #include @@ -161,7 +162,7 @@ class ET_EXPERIMENTAL TextLLMRunner : public IRunner { void stop() override; private: - bool shouldStop_{false}; + 
std::atomic<bool> shouldStop_{false}; // Components std::unique_ptr<::tokenizers::Tokenizer> tokenizer_; diff --git a/extension/llm/runner/text_token_generator.h b/extension/llm/runner/text_token_generator.h index 128de05d1d9..bf4153739ae 100644 --- a/extension/llm/runner/text_token_generator.h +++ b/extension/llm/runner/text_token_generator.h @@ -9,6 +9,8 @@ // Generate tokens in a loop. #pragma once +#include <atomic> + #include #include #include @@ -83,7 +85,7 @@ class ET_EXPERIMENTAL TextTokenGenerator { auto tokens_managed = from_blob( token_data.data(), token_shape, executorch::aten::ScalarType::Long); - should_stop_ = false; + should_stop_.store(false, std::memory_order_relaxed); // Generate our tokens while (pos < start_pos + max_new_tokens) { @@ -124,7 +126,7 @@ class ET_EXPERIMENTAL TextTokenGenerator { } token_callback(std::move(*decode_result)); - if (should_stop_) { + if (should_stop_.load(std::memory_order_relaxed)) { break; } @@ -142,7 +144,7 @@ class ET_EXPERIMENTAL TextTokenGenerator { * Stop the generation loop. */ inline void stop() { - should_stop_ = true; + should_stop_.store(true, std::memory_order_relaxed); } /** @@ -176,7 +178,7 @@ class ET_EXPERIMENTAL TextTokenGenerator { bool ignore_eos_ = false; // state machine - bool should_stop_ = false; + std::atomic<bool> should_stop_{false}; // stats Stats* stats_; From d03f001efb0f9b1b0df1c4550441daeeb3b90667 Mon Sep 17 00:00:00 2001 From: Hansong Zhang <107070759+kirklandsign@users.noreply.github.com> Date: Wed, 1 Apr 2026 14:33:55 -0700 Subject: [PATCH 2/2] Remove unused should_stop_ flags from TextDecoderRunner and TextLLMRunner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These flags were written but never read — cancellation is handled entirely through TextTokenGenerator::should_stop_. Removing dead state and the unnecessary dependency from these two classes. 
--- extension/llm/runner/text_decoder_runner.h | 7 ------- extension/llm/runner/text_llm_runner.cpp | 1 - extension/llm/runner/text_llm_runner.h | 3 --- 3 files changed, 11 deletions(-) diff --git a/extension/llm/runner/text_decoder_runner.h b/extension/llm/runner/text_decoder_runner.h index 94f4120daa5..6762b73b7ce 100644 --- a/extension/llm/runner/text_decoder_runner.h +++ b/extension/llm/runner/text_decoder_runner.h @@ -10,8 +10,6 @@ #pragma once -#include <atomic> - #include #include #include @@ -71,10 +69,6 @@ class ET_EXPERIMENTAL TextDecoderRunner { return method_name_; } - inline void stop() { - should_stop_.store(true, std::memory_order_relaxed); - } - /** * Sample the next token from the logits tensor. * @param logits_tensor The logits tensor. @@ -100,7 +94,6 @@ class ET_EXPERIMENTAL TextDecoderRunner { Module* module_; IOManager* io_manager_; std::string method_name_; - std::atomic<bool> should_stop_{false}; } // namespace llm diff --git a/extension/llm/runner/text_llm_runner.cpp b/extension/llm/runner/text_llm_runner.cpp index 45ab6edd79c..5eee0e4cdaa 100644 --- a/extension/llm/runner/text_llm_runner.cpp +++ b/extension/llm/runner/text_llm_runner.cpp @@ -108,7 +108,6 @@ Error TextLLMRunner::generate( // return a response token. stats_->inference_start_ms = time_in_ms(); - shouldStop_ = false; // Capture remaining KV cache capacity before prefill (pos_ will change) int64_t max_context_len = metadata_.at(kMaxContextLen) - pos_; diff --git a/extension/llm/runner/text_llm_runner.h b/extension/llm/runner/text_llm_runner.h index e57e0af4178..c73b6a4bed6 100644 --- a/extension/llm/runner/text_llm_runner.h +++ b/extension/llm/runner/text_llm_runner.h @@ -11,7 +11,6 @@ #pragma once -#include <atomic> #include #include #include @@ -162,8 +161,6 @@ class ET_EXPERIMENTAL TextLLMRunner : public IRunner { void stop() override; private: - std::atomic<bool> shouldStop_{false}; - // Components std::unique_ptr<::tokenizers::Tokenizer> tokenizer_; std::unordered_map metadata_;