From 6d559c4855eace0b7a03fc1239bb1d7d0c164891 Mon Sep 17 00:00:00 2001
From: Hansong Zhang
Date: Mon, 13 Oct 2025 23:53:41 -0700
Subject: [PATCH 1/2] Add API for normalized image input

---
Review notes (this area sits between the "---" marker and the diffstat and is
not part of the commit message):

  * This copy of the series had lost its line breaks and every
    angle-bracketed token. Line structure was rebuilt from the hunk headers
    and diffstat counts. Template arguments were restored from usage in the
    hunk: alias_ref<jfloatArray> (Java side passes float[]),
    std::vector<jfloat> / std::vector<float> (names and the element-wise
    copy), static_cast<jint> (function returns jint). The element type of
    the unused local `images` (llm::Image) and HybridClass<ExecuTorchLlmJni>
    in the hunk headers are best guesses - TODO confirm against the tree.
    The author email on the From: lines was also stripped and is not
    reconstructed here.
  * append_normalized_images_input: the local `images` is declared but never
    read in the added code.
  * append_normalized_images_input: image_size is not compared with
    width * height * channels before the llm::Image is built.
  * append_normalized_images_input: the loop index is `int` while image_size
    comes from image->size(); if size() returns an unsigned type this is a
    signed/unsigned comparison. The intermediate jfloat buffer plus
    element-wise copy may be avoidable if jfloat is float on the target -
    verify before changing.
  * append_normalized_images_input: a null array returns Error::EndOfMethod;
    an empty array returns 0 without queuing any input.
  * LlmModule.prefillImages(float[], ...): the new overload is annotated
    @Deprecated but the Javadoc has no @deprecated tag naming a replacement.

 .../executorch/extension/llm/LlmModule.java   | 22 ++++++++++++++
 extension/android/jni/jni_layer_llama.cpp     | 29 +++++++++++++++++++
 2 files changed, 51 insertions(+)

diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java
index f135731f26a..beee5e806a8 100644
--- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java
+++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java
@@ -207,6 +207,28 @@ public long prefillImages(int[] image, int width, int height, int channels) {
 
   private native int appendImagesInput(int[] image, int width, int height, int channels);
 
+  /**
+   * Prefill an LLaVA Module with the given images input.
+   *
+   * @param image Input normalized image as a float array
+   * @param width Input image width
+   * @param height Input image height
+   * @param channels Input image number of channels
+   * @return 0, as the updated starting position in KV cache of the input in the LLM is no longer
+   *     exposed to user.
+   * @throws RuntimeException if the prefill failed
+   */
+  @Deprecated
+  public long prefillImages(float[] image, int width, int height, int channels) {
+    int nativeResult = appendNormalizedImagesInput(image, width, height, channels);
+    if (nativeResult != 0) {
+      throw new RuntimeException("Prefill failed with error code: " + nativeResult);
+    }
+    return 0;
+  }
+
+  private native int appendNormalizedImagesInput(float[] image, int width, int height, int channels);
+
   /**
    * Prefill an LLaVA Module with the given text input.
    *
diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
index 22d20e99b7e..fcc69509352 100644
--- a/extension/android/jni/jni_layer_llama.cpp
+++ b/extension/android/jni/jni_layer_llama.cpp
@@ -289,6 +289,32 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
     return 0;
   }
 
+  // Returns status_code
+  jint append_normalized_images_input(
+      facebook::jni::alias_ref<jfloatArray> image,
+      jint width,
+      jint height,
+      jint channels) {
+    std::vector<llm::Image> images;
+    if (image == nullptr) {
+      return static_cast<jint>(Error::EndOfMethod);
+    }
+    auto image_size = image->size();
+    if (image_size != 0) {
+      std::vector<jfloat> image_data_jfloat(image_size);
+      std::vector<float> image_data(image_size);
+      image->getRegion(0, image_size, image_data_jfloat.data());
+      for (int i = 0; i < image_size; i++) {
+        image_data[i] = image_data_jfloat[i];
+      }
+      llm::Image image_runner{std::move(image_data), width, height, channels};
+      prefill_inputs_.emplace_back(
+          llm::MultimodalInput{std::move(image_runner)});
+    }
+
+    return 0;
+  }
+
   void stop() {
     if (model_type_category_ == MODEL_TYPE_CATEGORY_MULTIMODAL) {
       multi_modal_runner_->stop();
@@ -323,6 +349,9 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
         makeNativeMethod("load", ExecuTorchLlmJni::load),
         makeNativeMethod(
             "appendImagesInput", ExecuTorchLlmJni::append_images_input),
+        makeNativeMethod(
+            "appendNormalizedImagesInput",
+            ExecuTorchLlmJni::append_normalized_images_input),
         makeNativeMethod(
             "appendTextInput", ExecuTorchLlmJni::append_text_input),
         makeNativeMethod("resetContext", ExecuTorchLlmJni::reset_context),

From bf7e690770586e4cd246737afd986445bbbbb9a1 Mon Sep 17 00:00:00 2001
From: Hansong Zhang
Date: Tue, 14 Oct 2025 10:26:36 -0700
Subject: [PATCH 2/2] Linter

---
 .../java/org/pytorch/executorch/extension/llm/LlmModule.java   | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java
index beee5e806a8..3c586bf7577 100644
--- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java
+++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java
@@ -227,7 +227,8 @@ public long prefillImages(float[] image, int width, int height, int channels) {
     return 0;
   }
 
-  private native int appendNormalizedImagesInput(float[] image, int width, int height, int channels);
+  private native int appendNormalizedImagesInput(
+      float[] image, int width, int height, int channels);
 
   /**
    * Prefill an LLaVA Module with the given text input.