diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py index 7e571087c1d..62ddfc5c363 100644 --- a/examples/models/llava/export_llava.py +++ b/examples/models/llava/export_llava.py @@ -224,12 +224,12 @@ def export_all(llava_model: LlavaModel): lowered_and_edge = to_edge_transform_and_lower( { - "image_encoder": image_encoder_ep, + "vision_encoder": image_encoder_ep, "token_embedding": token_embedding_ep, "text_decoder": text_model_ep, }, partitioner={ - "image_encoder": [XnnpackPartitioner()], + "vision_encoder": [XnnpackPartitioner()], "text_decoder": [ # First partition the DQLinear nodes, then partition the rest of the nodes, # to avoid multiple DQLinear nodes in the same partition, @@ -254,7 +254,7 @@ def export_all(llava_model: LlavaModel): ], memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False), sym_shape_eval_pass={ - "image_encoder": ConstraintBasedSymShapeEvalPass(), + "vision_encoder": ConstraintBasedSymShapeEvalPass(), "text_decoder": ConstraintBasedSymShapeEvalPass(), "token_embedding": HintBasedSymShapeEvalPass(), }, diff --git a/examples/models/llava/test/test_llava.py b/examples/models/llava/test/test_llava.py index 7f2b59e0116..1708cdcd516 100644 --- a/examples/models/llava/test/test_llava.py +++ b/examples/models/llava/test/test_llava.py @@ -105,7 +105,7 @@ def test_llava_export(self): start_pos += pte_embeds_before_img.shape[1] # pte prefill image - pte_embeds_img = llava_module.run_method("image_encoder", (resized,))[0] + pte_embeds_img = llava_module.run_method("vision_encoder", (resized,))[0] llava_module.run_method( "text_decoder", ( diff --git a/examples/models/llava/test/test_pte.py b/examples/models/llava/test/test_pte.py index 1f4aaa9938c..4b924aed680 100644 --- a/examples/models/llava/test/test_pte.py +++ b/examples/models/llava/test/test_pte.py @@ -56,7 +56,7 @@ def main(): # pte prefill image logging.warning("Image encoder started") - pte_embeds_img = llava_module.run_method("image_encoder", (resized,))[0] + pte_embeds_img = llava_module.run_method("vision_encoder", (resized,))[0] logging.warning("Image encoder finished") logging.warning("Image token prefill started") pte_prefill_img = llava_module.run_method( diff --git a/extension/llm/runner/constants.h b/extension/llm/runner/constants.h index 4ba88203c50..d7b36077757 100644 --- a/extension/llm/runner/constants.h +++ b/extension/llm/runner/constants.h @@ -20,7 +20,7 @@ inline constexpr auto kUseKVCache = "use_kv_cache"; inline constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache"; // Multimodal method name conventions -inline constexpr auto kImageEncoderMethod = "image_encoder"; +inline constexpr auto kVisionEncoderMethod = "vision_encoder"; inline constexpr auto kAudioEncoderMethod = "audio_encoder"; inline constexpr auto kTokenEmbeddingMethod = "token_embedding"; inline constexpr auto kTextModelMethod = "text_decoder"; diff --git a/extension/llm/runner/multimodal_prefiller.cpp b/extension/llm/runner/multimodal_prefiller.cpp index 3f8777d4acf..f9645667f24 100644 --- a/extension/llm/runner/multimodal_prefiller.cpp +++ b/extension/llm/runner/multimodal_prefiller.cpp @@ -43,9 +43,9 @@ Result MultimodalPrefiller::prefill( Image image = input.get_image(); auto method_meta = ET_UNWRAP( - module_->method_meta(kImageEncoderMethod), + module_->method_meta(kVisionEncoderMethod), "Failed to get method_meta for %s", - kImageEncoderMethod); + kVisionEncoderMethod); ET_CHECK_MSG( method_meta.num_inputs() > 0, @@ -80,7 +80,7 @@ Result MultimodalPrefiller::prefill( // Run image encoder auto image_encoder_outputs = - ET_UNWRAP(module_->execute(kImageEncoderMethod, image_tensor)); + ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor)); encoder_output = image_encoder_outputs[0]; } else if (input.is_audio()) { @@ -175,8 +175,8 @@ ::executorch::runtime::Error MultimodalPrefiller::load() { ET_UNWRAP(module_->method_names(), "Failed to get method names"); // Load image_encoder method if exists. - if (methods.find(kImageEncoderMethod) != methods.end()) { - ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kImageEncoderMethod)); + if (methods.find(kVisionEncoderMethod) != methods.end()) { + ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kVisionEncoderMethod)); } if (methods.find(kAudioEncoderMethod) != methods.end()) { @@ -203,8 +203,8 @@ bool MultimodalPrefiller::is_method_loaded() { ET_CHECK_MSG(false, "Failed to get method names"); } std::unordered_set methods = methods_res.get(); - if (methods.find(kImageEncoderMethod) != methods.end()) { - return module_->is_method_loaded(kImageEncoderMethod); + if (methods.find(kVisionEncoderMethod) != methods.end()) { + return module_->is_method_loaded(kVisionEncoderMethod); } return true; }