From ed8786fdc2ecb2f8c82bf7affc42d59a9ba9fee8 Mon Sep 17 00:00:00 2001
From: Young Han
Date: Wed, 12 Nov 2025 15:56:25 -0800
Subject: [PATCH] feat: no need to specify decoder_start_token_id

---
 examples/models/whisper/main.cpp | 23 ++++-------------------
 1 file changed, 4 insertions(+), 19 deletions(-)

diff --git a/examples/models/whisper/main.cpp b/examples/models/whisper/main.cpp
index d4b156a413a..080106c8915 100644
--- a/examples/models/whisper/main.cpp
+++ b/examples/models/whisper/main.cpp
@@ -39,10 +39,6 @@ DEFINE_string(
     audio_path,
     "",
     "Path to input audio file. Accepts .wav or raw float .bin.");
-DEFINE_string(
-    model_name,
-    "base",
-    "Whisper model name (base, small, medium, large, large-v2, large-v3, large-v3-turbo).");
 DEFINE_double(
     temperature,
     0.0,
@@ -114,21 +110,10 @@ int main(int argc, char** argv) {
   config.max_new_tokens = FLAGS_max_new_tokens;
   config.temperature = static_cast<float>(FLAGS_temperature);
 
-  // Set decoder_start_token_id based on model version
-  if (FLAGS_model_name == "large-v2" || FLAGS_model_name == "large-v3" ||
-      FLAGS_model_name == "large-v3-turbo") {
-    config.decoder_start_token_id = 50258;
-    ET_LOG(
-        Info,
-        "Using decoder_start_token_id=50258 for model: %s",
-        FLAGS_model_name.c_str());
-  } else {
-    config.decoder_start_token_id = 50257;
-    ET_LOG(
-        Info,
-        "Using decoder_start_token_id=50257 for model: %s",
-        FLAGS_model_name.c_str());
-  }
+  // All Whisper models from HuggingFace now use the v3 tokenizer format
+  // where token 50257 = <|endoftext|> and token 50258 = <|startoftranscript|>
+  config.decoder_start_token_id = 50258;
+  ET_LOG(Info, "Using decoder_start_token_id=50258");
 
   auto result =
       runner.transcribe(features, config, [&](const std::string& piece) {