From 76b9c2863a90551bdc3283a83b128969346be5d1 Mon Sep 17 00:00:00 2001
From: Abhinay Kukkadapu <abhinayk@meta.com>
Date: Thu, 13 Nov 2025 11:44:33 -0800
Subject: [PATCH] Introduce seq_len as inference param, and improve warnings
 (#15716)

Summary:

Changes:
1. Add an inference-time `--seq_len` param to the llama script, distinct from `max_seq_len`, which is a compile-time param.
2. Add warnings in the runner when `seq_len` is clamped to `max_seq_len` to avoid silently clamping it.
3. Add warnings in the token generator when EOS is not reached due to insufficient seq_len or max_seq_len.

Differential Revision: D86696759
---
 .../oss_scripts/llama/runner/runner.cpp       | 17 ++++++++++++-
 .../llama/runner/token_generator.cpp          | 24 +++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)
diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
index e239a2a5fe1..4c0351d4dea 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
+++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
@@ -376,7 +376,22 @@ Error Runner<T>::generate_from_prompt_or_file(
   stats_.inference_start_ms = time_in_ms();
 
   int32_t seq_len = config.seq_len;
-  seq_len = (seq_len > 0 && seq_len <= context_len_) ? seq_len : context_len_;
+  if (seq_len > context_len_) {
+    ET_LOG(
+        Info,
+        "Warning: Requested seq_len (%d) exceeds compiled max_seq_len (%d). Clamping to %d.",
+        seq_len,
+        context_len_,
+        context_len_);
+    seq_len = context_len_;
+  } else if (seq_len <= 0) {
+    ET_LOG(
+        Info,
+        "Warning: Invalid seq_len (%d). Using compiled max_seq_len (%d).",
+        seq_len,
+        context_len_);
+    seq_len = context_len_;
+  }
   int32_t n_bos = (cur_pos_ == 0) ? 1 : 0;
 
   // encode the (string) prompt into tokens sequence
diff --git a/examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp b/examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp
index 6775c08bd87..40e8fb1a82d 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp
+++ b/examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp
@@ -323,6 +323,30 @@ Result<int64_t> TokenGenerator<T>::generate(
       break;
     }
   }
+
+  // Check if generation was truncated due to seq_len limit (no EOS token)
+  if (eos_ids_->count(cur_token) == 0 && pos >= seq_len - 1) {
+    printf("\n");
+    ET_LOG(
+        Info,
+        "Warning: Generation stopped at seq_len limit (%d) without reaching EOS token. Response may be incomplete.",
+        seq_len);
+    if (seq_len >= metadata_.context_len) {
+      ET_LOG(
+          Info,
+          "- seq_len (%d) already equals compiled max_seq_len (%d). Consider recompiling with larger --max_seq_len.",
+          seq_len,
+          metadata_.context_len);
+    } else {
+      ET_LOG(
+          Info,
+          "- seq_len (%d) is less than compiled max_seq_len (%d). Consider increasing --seq_len (up to %d).",
+          seq_len,
+          metadata_.context_len,
+          metadata_.context_len);
+    }
+  }
+
   return pos - start_pos;
 }
 // Explicit instantiations