From bf7f343906c155a88f59b183d60d5ee52064b896 Mon Sep 17 00:00:00 2001
From: Hansong Zhang <107070759+kirklandsign@users.noreply.github.com>
Date: Mon, 15 Sep 2025 16:40:15 -0700
Subject: [PATCH] Update cache position population and arg order for multimodal runner (#14225)

For voxtral and phi-3, we construct the cache_position_tensor as before;
for llava, the model constructs it internally, so we pass in a size-1 tensor.

(cherry picked from commit ea4f0043028a4f110ee491d4da9568a19941e024)
---
 .../llm/runner/multimodal_decoder_runner.h    |  2 +-
 extension/llm/runner/multimodal_prefiller.cpp | 22 +++++----
 extension/llm/runner/text_decoder_runner.cpp  | 34 ++--------
 extension/llm/runner/util.h                   | 45 +++++++++++++++++++
 4 files changed, 60 insertions(+), 43 deletions(-)

diff --git a/extension/llm/runner/multimodal_decoder_runner.h b/extension/llm/runner/multimodal_decoder_runner.h
index f76b8c64028..c8db3e57000 100644
--- a/extension/llm/runner/multimodal_decoder_runner.h
+++ b/extension/llm/runner/multimodal_decoder_runner.h
@@ -48,7 +48,7 @@ class ET_EXPERIMENTAL MultimodalDecoderRunner
         &start_pos, {1}, executorch::aten::ScalarType::Long);
     // run text model
     auto outputs_res = ET_UNWRAP(
-        module_->execute(kTextModelMethod, {start_pos_tensor, embeddings}));
+        module_->execute(kTextModelMethod, {embeddings, start_pos_tensor}));
 
     ET_CHECK_MSG(
         outputs_res.size() == 1,
diff --git a/extension/llm/runner/multimodal_prefiller.cpp b/extension/llm/runner/multimodal_prefiller.cpp
index 1d9a0c8fdfc..2705a9eadff 100644
--- a/extension/llm/runner/multimodal_prefiller.cpp
+++ b/extension/llm/runner/multimodal_prefiller.cpp
@@ -91,24 +91,22 @@ Result<uint64_t> MultimodalPrefiller::prefill(
   }
 
   // 2. Run decoder model for prefill.
-  // `cache_position` goes from start_pos to start_pos + encoder_output.size(1).
-  // e.g. if start_pos = 2 and encoder_output.size(1) = 5,
-  // cache_position_tensor should be [2, 3, 4, 5, 6].
+
+  // Get expected shape of cache position tensor, which should be the second
+  // argument
+
   int64_t seq_len = encoder_output.toTensor().size(1);
   if (seq_len == 0) {
     ET_LOG(Error, "The encoder returned an empty output.");
     return ::executorch::runtime::Error::InvalidState;
   }
-  std::vector<int64_t> cache_positions(seq_len);
-  for (int64_t i = 0; i < seq_len; ++i) {
-    cache_positions[i] = start_pos + i;
-  }
-  auto cache_position_tensor = ::executorch::extension::from_blob(
-      cache_positions.data(),
-      {static_cast<int>(seq_len)},
-      executorch::aten::ScalarType::Long);
+  std::vector<int64_t> cache_positions;
+
+  auto cache_position_tensor = ET_UNWRAP(populate_start_pos_or_cache_position(
+      module_, start_pos, cache_positions, seq_len, kTextModelMethod));
+
   auto prefill_result = module_->execute(
-      kTextModelMethod, {cache_position_tensor, encoder_output});
+      kTextModelMethod, {encoder_output, cache_position_tensor});
   if (prefill_result.error() != ::executorch::runtime::Error::Ok) {
     return prefill_result.error();
   }
diff --git a/extension/llm/runner/text_decoder_runner.cpp b/extension/llm/runner/text_decoder_runner.cpp
index 27c00c19089..7cd7623f58f 100644
--- a/extension/llm/runner/text_decoder_runner.cpp
+++ b/extension/llm/runner/text_decoder_runner.cpp
@@ -36,37 +36,11 @@ ::executorch::runtime::Result<executorch::aten::Tensor> TextDecoderRunner::step(
   // If only 1 input, we are not using kv cache
   bool use_kv_cache = method_meta.num_inputs() > 1;
 
+  std::vector<int64_t> cache_positions;
+
   if (use_kv_cache) {
-    // Size of the second argument. This could be either input_pos or
-    // cache_positions
-
-    // Check if we are using cache positions instead of input pos.
-    auto second_input_info = ET_UNWRAP(method_meta.input_tensor_meta(1));
-    // For input_pos, numel is 1, for cache_positions, numel is max_seq_len
-    auto sizes = second_input_info.sizes();
-    // Assuming 1D tensor
-    ET_CHECK_OR_RETURN_ERROR(
-        sizes.size() == 1,
-        InvalidProgram,
-        "The second input tensor is not 1D tensor. Got dimension (%zu)",
-        sizes.size());
-    auto numel = sizes[0];
-    std::vector<::executorch::aten::SizesType> sizes_vec = {numel};
-
-    TensorPtr start_pos_tensor;
-    if (numel > 1) {
-      // If we are here, model is exported with cache_positions, create a tensor
-      // with the same length as input_ids. Assuming the last dimension is the
-      // one with the variable token length, for example [1, S] or [1, 1, S]
-      sizes_vec[sizes_vec.size() - 1] = tokens->numel();
-      start_pos_tensor = empty(sizes_vec, ::executorch::aten::ScalarType::Long);
-      torch::executor::native::arange_out_impl(
-          start_pos, start_pos + tokens->numel(), 1.0, *start_pos_tensor);
-    } else {
-      // Assuming model is exported with input_pos, create a tensor with size 1
-      start_pos_tensor = from_blob(
-          &start_pos, sizes_vec, ::executorch::aten::ScalarType::Long);
-    }
+    auto start_pos_tensor = ET_UNWRAP(populate_start_pos_or_cache_position(
+        module_, start_pos, cache_positions, tokens->numel(), "forward"));
 
     std::vector<runtime::EValue> inputs;
     auto inputs_res = io_manager_->prepare_decode(tokens, start_pos_tensor);
diff --git a/extension/llm/runner/util.h b/extension/llm/runner/util.h
index 0cb2463d163..5aff2c8a3b5 100644
--- a/extension/llm/runner/util.h
+++ b/extension/llm/runner/util.h
@@ -7,6 +7,9 @@
  */
 
 #pragma once
+#include <executorch/extension/module/module.h>
+#include <executorch/extension/tensor/tensor.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
 #include <executorch/runtime/platform/compiler.h>
 #include <stdio.h>
 #include <time.h>
@@ -99,6 +102,48 @@ ET_EXPERIMENTAL size_t inline get_rss_bytes() {
   // when this changed.
   return 0;
 }
+
+// Returns the cache position tensor, which can be either a single start_pos
+// (when the method_name [`text_decoder` or `forward`] expects a tensor with
+// size 1 because model will populate the cache position tensor underneath), or
+// a populated tensor for cache position, for the given start_pos and seq_len.
+inline runtime::Result<TensorPtr> populate_start_pos_or_cache_position(
+    Module* module,
+    int64_t& start_pos,
+    std::vector<int64_t>& cache_positions_vec,
+    int seq_len,
+    const char* method_name = "forward") {
+  // Get expected shape of cache position tensor, which should be the second
+  // argument
+  auto method_meta = ET_UNWRAP(module->method_meta(method_name));
+  auto second_input_info = ET_UNWRAP(method_meta.input_tensor_meta(1));
+  auto second_input_sizes = second_input_info.sizes();
+  auto numel = second_input_sizes[0];
+
+  for (int i = 0; i < second_input_sizes.size(); ++i) {
+    ET_LOG(Error, "second_input_sizes[%d] = %d", i, second_input_sizes[i]);
+  }
+
+  TensorPtr start_pos_tensor;
+  if (numel > 1) {
+    // `cache_position` goes from start_pos to start_pos +
+    // encoder_output.size(1). e.g. if start_pos = 2 and encoder_output.size(1)
+    // = 5, cache_position_tensor should be [2, 3, 4, 5, 6].
+    cache_positions_vec.resize(seq_len);
+    for (int64_t i = 0; i < seq_len; ++i) {
+      cache_positions_vec[i] = start_pos + i;
+    }
+    return ::executorch::extension::from_blob(
+        cache_positions_vec.data(),
+        {static_cast<int>(seq_len)},
+        executorch::aten::ScalarType::Long);
+  } else {
+    // Cache position is size 1.
+    return ::executorch::extension::from_blob(
+        &start_pos, {1}, executorch::aten::ScalarType::Long);
+  }
+}
+
 } // namespace llm
 } // namespace extension
 } // namespace executorch
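
Note (illustration, not part of the patch): the sketch below mirrors the branch
inside the new populate_start_pos_or_cache_position() helper, assuming only that
the model's second input reports its numel via method metadata. The names
make_cache_positions and second_input_numel are hypothetical and do not exist in
the ExecuTorch API.

#include <cstdint>
#include <iostream>
#include <vector>

// Build the positions passed as the decoder's second argument.
std::vector<int64_t> make_cache_positions(
    int64_t second_input_numel, // numel of the method's second input tensor
    int64_t start_pos,          // current position in the KV cache
    int64_t seq_len) {          // number of new tokens being prefilled
  if (second_input_numel > 1) {
    // Model exported with explicit cache_positions (e.g. voxtral, phi-3):
    // pass start_pos .. start_pos + seq_len - 1.
    std::vector<int64_t> positions(seq_len);
    for (int64_t i = 0; i < seq_len; ++i) {
      positions[i] = start_pos + i;
    }
    return positions;
  }
  // Model that expands cache positions internally (e.g. llava): pass only
  // the scalar start position as a size-1 tensor.
  return {start_pos};
}

int main() {
  // start_pos = 2, 5 new tokens, cache_positions-style model -> 2 3 4 5 6
  for (int64_t p : make_cache_positions(/*second_input_numel=*/128, 2, 5)) {
    std::cout << p << ' ';
  }
  std::cout << '\n';
  // Same inputs, input_pos-style model -> 2
  for (int64_t p : make_cache_positions(/*second_input_numel=*/1, 2, 5)) {
    std::cout << p << ' ';
  }
  std::cout << '\n';
  return 0;
}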