From ed8786fdc2ecb2f8c82bf7affc42d59a9ba9fee8 Mon Sep 17 00:00:00 2001
From: Young Han <younghan@meta.com>
Date: Wed, 12 Nov 2025 15:56:25 -0800
Subject: [PATCH] feat: drop --model_name flag; always use decoder_start_token_id=50258

---
 examples/models/whisper/main.cpp | 23 ++++-------------------
 1 file changed, 4 insertions(+), 19 deletions(-)
diff --git a/examples/models/whisper/main.cpp b/examples/models/whisper/main.cpp
index d4b156a413a..080106c8915 100644
--- a/examples/models/whisper/main.cpp
+++ b/examples/models/whisper/main.cpp
@@ -39,10 +39,6 @@ DEFINE_string(
     audio_path,
     "",
     "Path to input audio file. Accepts .wav or raw float .bin.");
-DEFINE_string(
-    model_name,
-    "base",
-    "Whisper model name (base, small, medium, large, large-v2, large-v3, large-v3-turbo).");
 DEFINE_double(
     temperature,
     0.0,
@@ -114,21 +110,10 @@ int main(int argc, char** argv) {
   config.max_new_tokens = FLAGS_max_new_tokens;
   config.temperature = static_cast<float>(FLAGS_temperature);
 
-  // Set decoder_start_token_id based on model version
-  if (FLAGS_model_name == "large-v2" || FLAGS_model_name == "large-v3" ||
-      FLAGS_model_name == "large-v3-turbo") {
-    config.decoder_start_token_id = 50258;
-    ET_LOG(
-        Info,
-        "Using decoder_start_token_id=50258 for model: %s",
-        FLAGS_model_name.c_str());
-  } else {
-    config.decoder_start_token_id = 50257;
-    ET_LOG(
-        Info,
-        "Using decoder_start_token_id=50257 for model: %s",
-        FLAGS_model_name.c_str());
-  }
+  // All HuggingFace Whisper models now use the v3 tokenizer layout, where
+  // 50257 = <|endoftext|> and 50258 = <|startoftranscript|> (decoder start).
+  config.decoder_start_token_id = 50258;
+  ET_LOG(Info, "Using decoder_start_token_id=50258");
 
   auto result =
       runner.transcribe(features, config, [&](const std::string& piece) {