diff --git a/extension/llm/runner/multimodal_runner.cpp b/extension/llm/runner/multimodal_runner.cpp
index 2bc658692da..f6b29d42c09 100644
--- a/extension/llm/runner/multimodal_runner.cpp
+++ b/extension/llm/runner/multimodal_runner.cpp
@@ -65,8 +65,8 @@ Error MultimodalRunner::load() {
 Error MultimodalRunner::generate(
     const std::vector<MultimodalInput>& inputs,
     const GenerationConfig& config,
-    std::function<void(const std::string&)>& token_callback,
-    std::function<void(const Stats&)>& stats_callback) {
+    std::function<void(const std::string&)> token_callback,
+    std::function<void(const Stats&)> stats_callback) {
   if (inputs.empty()) {
     ET_LOG(Error, "MultimodalInput vector cannot be empty");
     return Error::InvalidArgument;
diff --git a/extension/llm/runner/multimodal_runner.h b/extension/llm/runner/multimodal_runner.h
index 186a5bf70e4..fe5d1d7f1d7 100644
--- a/extension/llm/runner/multimodal_runner.h
+++ b/extension/llm/runner/multimodal_runner.h
@@ -116,8 +116,8 @@ class ET_EXPERIMENTAL MultimodalRunner {
   virtual ::executorch::runtime::Error generate(
       const std::vector<MultimodalInput>& inputs,
       const GenerationConfig& config,
-      std::function<void(const std::string&)>& token_callback,
-      std::function<void(const Stats&)>& stats_callback);
+      std::function<void(const std::string&)> token_callback = {},
+      std::function<void(const Stats&)> stats_callback = {});

   inline void stop() {
     text_token_generator_->stop();
diff --git a/extension/llm/runner/text_token_generator.h b/extension/llm/runner/text_token_generator.h
index 1a05921ed3a..a57961ee1d2 100644
--- a/extension/llm/runner/text_token_generator.h
+++ b/extension/llm/runner/text_token_generator.h
@@ -36,9 +36,9 @@ class ET_EXPERIMENTAL TextTokenGenerator {

   /**
    * Token generation loop.
-   * @param tokens prompt tokens as well as the first token generated by
-   * prefill.
-   * @param start_pos the start position of the new tokens, based on how many
+   * @param tokens The first token generated by prefill, if using kv cache. Else
+   * the prompt tokens + the first token generated by prefill.
+   * @param start_pos The start position of the new tokens, based on how many
    * prompt tokens is prefilled.
    * @param max_new_tokens Maximum number of new tokens to generate.
    * @param temperature controls the randomness of predictions by scaling the
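
The functional change in the diff is that MultimodalRunner::generate now takes its two callbacks by value, with empty std::function defaults, instead of by non-const lvalue reference, so callers can pass lambda temporaries or omit the callbacks entirely. Below is a minimal, self-contained sketch of that parameter-passing pattern; the generate() stand-in, its parameter types, and the token data are illustrative, not the ExecuTorch API.

#include <functional>
#include <iostream>
#include <string>

// Illustrative stand-in for a generate()-style API: callbacks are taken by
// value with empty defaults, so they are optional at the call site.
void generate(
    const std::string& prompt,
    std::function<void(const std::string&)> token_callback = {},
    std::function<void(int)> stats_callback = {}) {
  const char* tokens[] = {"Hello", ",", " world"};
  for (const char* token : tokens) {
    if (token_callback) {  // an empty std::function evaluates to false
      token_callback(token);
    }
  }
  if (stats_callback) {
    stats_callback(3);  // e.g. number of tokens emitted
  }
}

int main() {
  // With by-reference parameters, callers had to keep named std::function
  // objects alive; by-value defaults allow temporaries or no callback at all.
  generate("prompt only");  // no callbacks supplied
  generate("with callback", [](const std::string& t) { std::cout << t; });
  std::cout << '\n';
  return 0;
}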