From 874987435acb3ce23652c30ededf6445db1c43e4 Mon Sep 17 00:00:00 2001
From: Hansong Zhang <107070759+kirklandsign@users.noreply.github.com>
Date: Fri, 17 Oct 2025 10:28:33 -0700
Subject: [PATCH] audio float API (#15214)

Some audio inputs are float[]

(cherry picked from commit df37d2912ce6122ef40baa75cdf168352f5f972d)
---
Note: append_audio_input_float in this patch copies the Java float[] directly
into the std::vector<float> handed to llm::Audio (jfloat is float), rather than
going through a temporary jfloat vector and an element-wise copy loop.

 .../executorch/extension/llm/LlmModule.java   | 22 ++++++++++++++++
 extension/android/jni/jni_layer_llama.cpp     | 26 +++++++++++++++++++
 2 files changed, 48 insertions(+)

diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java
index 40e38afb8b9..cbd1c474ed3 100644
--- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java
+++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java
@@ -233,6 +233,28 @@ public long prefillAudio(byte[] audio, int batch_size, int n_bins, int n_frames)
 
   private native int appendAudioInput(byte[] audio, int batch_size, int n_bins, int n_frames);
 
+  /**
+   * Prefill a multimodal Module with the given audio input.
+   *
+   * @param audio Input preprocessed audio as a float array
+   * @param batch_size Input batch size
+   * @param n_bins Input number of bins
+   * @param n_frames Input number of frames
+   * @return 0, as the updated starting position in KV cache of the input in the LLM is no longer
+   *     exposed to user.
+   * @throws RuntimeException if the prefill failed
+   */
+  @Experimental
+  public long prefillAudio(float[] audio, int batch_size, int n_bins, int n_frames) {
+    int nativeResult = appendAudioInputFloat(audio, batch_size, n_bins, n_frames);
+    if (nativeResult != 0) {
+      throw new RuntimeException("Prefill failed with error code: " + nativeResult);
+    }
+    return 0;
+  }
+
+  private native int appendAudioInputFloat(float[] audio, int batch_size, int n_bins, int n_frames);
+
   /**
    * Prefill a multimodal Module with the given raw audio input.
    *
diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
index ccb0d55dc41..aa9a6ee58c5 100644
--- a/extension/android/jni/jni_layer_llama.cpp
+++ b/extension/android/jni/jni_layer_llama.cpp
@@ -325,6 +325,29 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
     return 0;
   }
 
+  // Appends preprocessed float audio data to the pending prefill inputs.
+  // A null array is rejected with Error::EndOfMethod; an empty array is
+  // ignored. Returns status_code (0 on success).
+  jint append_audio_input_float(
+      facebook::jni::alias_ref<jfloatArray> data,
+      jint batch_size,
+      jint n_bins,
+      jint n_frames) {
+    if (data == nullptr) {
+      return static_cast<jint>(Error::EndOfMethod);
+    }
+    const auto data_size = data->size();
+    if (data_size != 0) {
+      // jfloat is a typedef of float in jni.h, so the Java array can be read
+      // straight into the vector that is moved into llm::Audio below.
+      std::vector<float> data_f(data_size);
+      data->getRegion(0, data_size, data_f.data());
+      llm::Audio audio{std::move(data_f), batch_size, n_bins, n_frames};
+      prefill_inputs_.emplace_back(llm::MultimodalInput{std::move(audio)});
+    }
+    return 0;
+  }
+
   // Returns status_code
   jint append_raw_audio_input(
       facebook::jni::alias_ref<jbyteArray> data,
@@ -388,6 +411,9 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
             ExecuTorchLlmJni::append_normalized_images_input),
         makeNativeMethod(
             "appendAudioInput", ExecuTorchLlmJni::append_audio_input),
+        makeNativeMethod(
+            "appendAudioInputFloat",
+            ExecuTorchLlmJni::append_audio_input_float),
         makeNativeMethod(
             "appendRawAudioInput", ExecuTorchLlmJni::append_raw_audio_input),
         makeNativeMethod(