diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py
index 7e571087c1d..62ddfc5c363 100644
--- a/examples/models/llava/export_llava.py
+++ b/examples/models/llava/export_llava.py
@@ -224,12 +224,12 @@ def export_all(llava_model: LlavaModel):
 
     lowered_and_edge = to_edge_transform_and_lower(
         {
-            "image_encoder": image_encoder_ep,
+            "vision_encoder": image_encoder_ep,
             "token_embedding": token_embedding_ep,
             "text_decoder": text_model_ep,
         },
         partitioner={
-            "image_encoder": [XnnpackPartitioner()],
+            "vision_encoder": [XnnpackPartitioner()],
             "text_decoder": [
                 # First partition the DQLinear nodes, then partition the rest of the nodes,
                 # to avoid multiple DQLinear nodes in the same partition,
@@ -254,7 +254,7 @@ def export_all(llava_model: LlavaModel):
         ],
         memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
         sym_shape_eval_pass={
-            "image_encoder": ConstraintBasedSymShapeEvalPass(),
+            "vision_encoder": ConstraintBasedSymShapeEvalPass(),
             "text_decoder": ConstraintBasedSymShapeEvalPass(),
             "token_embedding": HintBasedSymShapeEvalPass(),
         },
diff --git a/examples/models/llava/test/test_llava.py b/examples/models/llava/test/test_llava.py
index 7f2b59e0116..1708cdcd516 100644
--- a/examples/models/llava/test/test_llava.py
+++ b/examples/models/llava/test/test_llava.py
@@ -105,7 +105,7 @@ def test_llava_export(self):
         start_pos += pte_embeds_before_img.shape[1]
 
         # pte prefill image
-        pte_embeds_img = llava_module.run_method("image_encoder", (resized,))[0]
+        pte_embeds_img = llava_module.run_method("vision_encoder", (resized,))[0]
         llava_module.run_method(
             "text_decoder",
             (
diff --git a/examples/models/llava/test/test_pte.py b/examples/models/llava/test/test_pte.py
index 1f4aaa9938c..4b924aed680 100644
--- a/examples/models/llava/test/test_pte.py
+++ b/examples/models/llava/test/test_pte.py
@@ -56,7 +56,7 @@ def main():
 
     # pte prefill image
     logging.warning("Image encoder started")
-    pte_embeds_img = llava_module.run_method("image_encoder", (resized,))[0]
+    pte_embeds_img = llava_module.run_method("vision_encoder", (resized,))[0]
     logging.warning("Image encoder finished")
     logging.warning("Image token prefill started")
     pte_prefill_img = llava_module.run_method(
diff --git a/extension/llm/runner/constants.h b/extension/llm/runner/constants.h
index 4ba88203c50..d7b36077757 100644
--- a/extension/llm/runner/constants.h
+++ b/extension/llm/runner/constants.h
@@ -20,7 +20,7 @@ inline constexpr auto kUseKVCache = "use_kv_cache";
 inline constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache";
 
 // Multimodal method name conventions
-inline constexpr auto kImageEncoderMethod = "image_encoder";
+inline constexpr auto kVisionEncoderMethod = "vision_encoder";
 inline constexpr auto kAudioEncoderMethod = "audio_encoder";
 inline constexpr auto kTokenEmbeddingMethod = "token_embedding";
 inline constexpr auto kTextModelMethod = "text_decoder";
diff --git a/extension/llm/runner/multimodal_prefiller.cpp b/extension/llm/runner/multimodal_prefiller.cpp
index 2705a9eadff..f9645667f24 100644
--- a/extension/llm/runner/multimodal_prefiller.cpp
+++ b/extension/llm/runner/multimodal_prefiller.cpp
@@ -41,14 +41,46 @@ Result<uint64_t> MultimodalPrefiller::prefill(
   ::executorch::runtime::EValue encoder_output;
   if (input.is_image()) {
     Image image = input.get_image();
-    auto image_tensor = executorch::extension::from_blob(
-        image.data.data(),
-        {3, image.height, image.width},
-        ::executorch::aten::ScalarType::Byte);
+
+    auto method_meta = ET_UNWRAP(
+        module_->method_meta(kVisionEncoderMethod),
+        "Failed to get method_meta for %s",
+        kVisionEncoderMethod);
+
+    ET_CHECK_MSG(
+        method_meta.num_inputs() > 0,
+        "Image encoder should have at least 1 input");
+    auto input_meta = ET_UNWRAP(
+        method_meta.input_tensor_meta(0),
+        "Cannot get input tensor meta at index 0");
+    auto expected_dtype = input_meta.scalar_type();
+
+    if (expected_dtype == ::executorch::aten::ScalarType::Float) {
+      ET_CHECK_MSG(
+          image.is_float(),
+          "Model expects float image data, but image has uint8_t data.");
+    } else if (expected_dtype == ::executorch::aten::ScalarType::Byte) {
+      ET_CHECK_MSG(
+          image.is_uint8(),
+          "Model expects uint8_t image data, but image has float data.");
+    } else {
+      ET_LOG(
+          Error,
+          "Unsupported image encoder input dtype: %s",
+          ::executorch::runtime::toString(expected_dtype));
+      return ::executorch::runtime::Error::NotSupported;
+    }
+
+    // The model might expect a 4D tensor (NCHW), but toTensor() returns a 3D
+    // tensor (CHW). Add a batch dimension of 1 if needed.
+    auto expected_dims = input_meta.sizes();
+    auto image_tensor = ET_UNWRAP(
+        image.toTensor(/*with_batch*/ expected_dims.size() == 4),
+        "Failed to convert image to tensor");
 
     // Run image encoder
     auto image_encoder_outputs =
-        ET_UNWRAP(module_->execute(kImageEncoderMethod, image_tensor));
+        ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor));
 
     encoder_output = image_encoder_outputs[0];
   } else if (input.is_audio()) {
@@ -143,8 +175,8 @@ ::executorch::runtime::Error MultimodalPrefiller::load() {
       ET_UNWRAP(module_->method_names(), "Failed to get method names");
 
   // Load image_encoder method if exists.
-  if (methods.find(kImageEncoderMethod) != methods.end()) {
-    ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kImageEncoderMethod));
+  if (methods.find(kVisionEncoderMethod) != methods.end()) {
+    ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kVisionEncoderMethod));
   }
 
   if (methods.find(kAudioEncoderMethod) != methods.end()) {
@@ -171,8 +203,8 @@ bool MultimodalPrefiller::is_method_loaded() {
     ET_CHECK_MSG(false, "Failed to get method names");
  }
   std::unordered_set<std::string> methods = methods_res.get();
-  if (methods.find(kImageEncoderMethod) != methods.end()) {
-    return module_->is_method_loaded(kImageEncoderMethod);
+  if (methods.find(kVisionEncoderMethod) != methods.end()) {
+    return module_->is_method_loaded(kVisionEncoderMethod);
   }
   return true;
 }