pytorch · facebook-github-bot · Sep 26, 2025 · Sep 26, 2025
@@ -434,6 +434,7 @@ class StaticAttentionIOManager {
     std::vector<size_t> k_cache_output_indices;
     std::vector<size_t> v_cache_input_indices;
     std::vector<size_t> v_cache_output_indices;
+    size_t max_context_len{};
     RopeT* rope_freqs_cos;
     RopeT* rope_freqs_sin;
     StaticAttentionUpdateStyle style = StaticAttentionUpdateStyle::SMART_MASK;
@@ -604,6 +605,10 @@ class StaticAttentionIOManager {
     size_t batch_len = 0;
     for (size_t i = 0; i < tokens.size(); i += input_len) {
       batch_len = std::min(input_len, tokens.size() - i);
+      if (input_pos_ + batch_len > config_.max_context_len) {
+        ET_LOG(Error, "Maximum context size reached, stopping prefill.");
+        return input_len - 1;
+      }
       std::copy(&tokens[i], &tokens[i + batch_len], input_buffer.begin());
       prepare(method);
       ET_CHECK(method.execute() == executorch::runtime::Error::Ok);
@@ -646,6 +651,10 @@ class StaticAttentionIOManager {
 
     while (true) {
       input_buffer[0] = prev_tok;
+      if (input_pos_ + 1 > config_.max_context_len) {
+        ET_LOG(Error, "Maximum context size reached, stopping decode.");
+        break;
+      }
       prepare(method);
       ET_CHECK(method.execute() == executorch::runtime::Error::Ok);
       update(
@@ -730,6 +739,11 @@ class StaticAttentionIOManager {
       }
 
       // Setup input pointers and RoPE frequencies.
+      if (input_pos_ + ngram_size > config_.max_context_len) {
+        ET_LOG(
+            Error, "Maximum context size reached, stopping lookahead decode.");
+        break;
+      }
       prepare(
           method,
           executorch::runtime::Span(pos_offsets.data(), pos_offsets.size()));