From 72adf1f0acc8ae384912fff0c3879dba86edfdc5 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Thu, 18 Sep 2025 19:57:00 -0700 Subject: [PATCH 1/5] [multimodal] Let Audio take float data blob If the processed audio went through a Mel transform, the spectrogram values are floats. We should allow the `Audio` class to take float data, since the multimodal runner pybind API will have to accept processed input. Once we have the pybind API we can do something like: ```python model_id = "mistralai/Voxtral-Mini-3B-2507" processor = AutoProcessor.from_pretrained(model_id) audio_url = "https://huggingface.co/datasets/eustlb/audio-samples/resolve/main/dude_where_is_my_car.wav" conversation = [ { "role": "user", "content": [ {"type": "audio", "url": audio_url}, { "type": "text", "text": "What can you tell me about this audio?", }, ], }, ] inputs = processor.apply_chat_template(conversation, tokenize=True, return_dict=True, return_tensors="pt") inputs_combined = [ make_text_input("[INST][BEGIN_AUDIO]"), make_audio_input(inputs["input_features"]), make_text_input("\nWhat can you tell me about this audio?[/INST]"), ] runner = MultimodalRunner("voxtral.pte", "tekken.json", None) config = GenerationConfig() config.max_new_tokens = 100 runner.generate(inputs_combined, config) ``` --- examples/models/voxtral/multimodal.cpp | 47 +++----- extension/llm/runner/audio.h | 102 +++++++++++++++++- extension/llm/runner/multimodal_prefiller.cpp | 23 ++-- 3 files changed, 129 insertions(+), 43 deletions(-) diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp index 17013df96e1..081df27cd67 100644 --- a/examples/models/voxtral/multimodal.cpp +++ b/examples/models/voxtral/multimodal.cpp @@ -103,15 +103,13 @@ MultimodalInput loadPreprocessedAudio(const std::string& audio_path) { ET_LOG(Info, "audio_data len = %zu", n_floats); - // Create Audio multimodal input - auto audio = std::make_unique<::executorch::extension::llm::Audio>(); - audio->batch_size = 
batch_size; - audio->n_bins = n_bins; - audio->n_frames = n_frames; - audio->data.resize(n_floats * sizeof(float)); - f.read(reinterpret_cast(audio->data.data()), n_floats * sizeof(float)); + std::vector audio_data(n_floats); + f.read(reinterpret_cast(audio_data.data()), n_floats * sizeof(float)); f.close(); - return ::executorch::extension::llm::make_audio_input(std::move(*audio)); + + auto audio = ::executorch::extension::llm::Audio( + std::move(audio_data), batch_size, n_bins, n_frames); + return ::executorch::extension::llm::make_audio_input(std::move(audio)); } /** @@ -206,32 +204,21 @@ MultimodalInput processRawAudioFile( static_cast(sizes[2])); // Create Audio multimodal input from processed features - auto processed_audio = - std::make_unique<::executorch::extension::llm::Audio>(); - processed_audio->batch_size = - static_cast(sizes[0]); // Note: batching for s > 30 doesn't work - // yet, so this will just be = 1. - processed_audio->n_bins = static_cast(sizes[1]); - processed_audio->n_frames = - static_cast(sizes[2]); // And this will just be = 3000. 
- - size_t total_elements = processed_audio->batch_size * - processed_audio->n_bins * processed_audio->n_frames; - processed_audio->data.resize(total_elements * sizeof(float)); - std::memcpy( - processed_audio->data.data(), - processed_data, - total_elements * sizeof(float)); - + int32_t batch_size = static_cast(sizes[0]); + int32_t n_bins = static_cast(sizes[1]); + int32_t n_frames = static_cast(sizes[2]); + size_t total_elements = batch_size * n_bins * n_frames; + std::vector audio_vec(processed_data, processed_data + total_elements); + auto processed_audio = ::executorch::extension::llm::Audio( + std::move(audio_vec), batch_size, n_bins, n_frames); ET_LOG( Info, "Created processed Audio: batch_size=%d, n_bins=%d, n_frames=%d", - processed_audio->batch_size, - processed_audio->n_bins, - processed_audio->n_frames); - + batch_size, + n_bins, + n_frames); return ::executorch::extension::llm::make_audio_input( - std::move(*processed_audio)); + std::move(processed_audio)); } /** diff --git a/extension/llm/runner/audio.h b/extension/llm/runner/audio.h index 868765950af..4dec8983b02 100644 --- a/extension/llm/runner/audio.h +++ b/extension/llm/runner/audio.h @@ -13,6 +13,8 @@ #include #include +#include + namespace executorch { namespace extension { namespace llm { @@ -29,11 +31,103 @@ struct ET_EXPERIMENTAL RawAudio { }; /** - * Pre-processed audio inputs, ready to feed directly into an audio - * encoder. + * Pre-processed audio inputs, ready to feed directly into an audio encoder. + * + * The data can be either uint8_t or float. If the audio has gone through a Mel + * transform, we expect the data type to be float (i.e., std::vector), as + * Mel spectrograms are typically represented as floating point values. For raw + * or quantized audio, uint8_t may be used instead. 
*/ -struct ET_EXPERIMENTAL Audio { - std::vector data; +class ET_EXPERIMENTAL Audio { + public: + // Default constructor + Audio() : batch_size(0), n_bins(0), n_frames(0) {} + + // Constructor for uint8_t data + Audio( + std::vector&& data_, + int32_t batch_size_, + int32_t n_bins_, + int32_t n_frames_) + : data(std::move(data_)), + batch_size(batch_size_), + n_bins(n_bins_), + n_frames(n_frames_) {} + + // Constructor for float data + Audio( + std::vector&& data_, + int32_t batch_size_, + int32_t n_bins_, + int32_t n_frames_) + : data(std::move(data_)), + batch_size(batch_size_), + n_bins(n_bins_), + n_frames(n_frames_) {} + + // Type checkers + bool is_uint8() const { + return std::holds_alternative>(data); + } + + bool is_float() const { + return std::holds_alternative>(data); + } + + // Data access + const std::vector& get_uint8_data() const& { + return std::get>(data); + } + + std::vector& get_uint8_data() & { + return std::get>(data); + } + + const std::vector& get_float_data() const& { + return std::get>(data); + } + + std::vector& get_float_data() & { + return std::get>(data); + } + + int32_t get_batch_size() const { + return batch_size; + } + int32_t get_n_bins() const { + return n_bins; + } + int32_t get_n_frames() const { + return n_frames; + } + /** + * Convert the audio data to a TensorPtr, with optional batch dimension. + * The tensor will have shape (batch_size, n_bins, n_frames) or (1, + * batch_size, n_bins, n_frames) if with_batch is true. 
+ */ + executorch::runtime::Result toTensor() + const { + std::vector sizes = { + get_batch_size(), get_n_bins(), get_n_frames()}; + if (is_float()) { + return executorch::extension::from_blob( + const_cast(get_float_data().data()), + sizes, + ::executorch::aten::ScalarType::Float); + } else if (is_uint8()) { + return executorch::extension::from_blob( + const_cast(get_uint8_data().data()), + sizes, + ::executorch::aten::ScalarType::Byte); + } + ET_LOG( + Error, "Audio data is not initialized with uint8_t or float vector."); + return ::executorch::runtime::Error::NotSupported; + } + + private: + // Members + std::variant, std::vector> data; int32_t batch_size; int32_t n_bins; int32_t n_frames; diff --git a/extension/llm/runner/multimodal_prefiller.cpp b/extension/llm/runner/multimodal_prefiller.cpp index f9645667f24..1d65c2166c3 100644 --- a/extension/llm/runner/multimodal_prefiller.cpp +++ b/extension/llm/runner/multimodal_prefiller.cpp @@ -47,8 +47,9 @@ Result MultimodalPrefiller::prefill( "Failed to get method_meta for %s", kVisionEncoderMethod); - ET_CHECK_MSG( + ET_CHECK_OR_RETURN_ERROR( method_meta.num_inputs() > 0, + InvalidArgument, "Image encoder should have at least 1 input"); auto input_meta = ET_UNWRAP( method_meta.input_tensor_meta(0), @@ -56,12 +57,14 @@ Result MultimodalPrefiller::prefill( auto expected_dtype = input_meta.scalar_type(); if (expected_dtype == ::executorch::aten::ScalarType::Float) { - ET_CHECK_MSG( + ET_CHECK_OR_RETURN_ERROR( image.is_float(), + InvalidArgument, "Model expects float image data, but image has uint8_t data."); } else if (expected_dtype == ::executorch::aten::ScalarType::Byte) { - ET_CHECK_MSG( + ET_CHECK_OR_RETURN_ERROR( image.is_uint8(), + InvalidArgument, "Model expects uint8_t image data, but image has float data."); } else { ET_LOG( @@ -77,7 +80,11 @@ Result MultimodalPrefiller::prefill( auto image_tensor = ET_UNWRAP( image.toTensor(/*with_batch*/ expected_dims.size() == 4), "Failed to convert image to tensor"); - + 
ET_LOG( + Info, + "Image tensor dim: %zu, dtype: %s", + image_tensor->dim(), + ::executorch::runtime::toString(image_tensor->scalar_type())); // Run image encoder auto image_encoder_outputs = ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor)); @@ -86,11 +93,9 @@ Result MultimodalPrefiller::prefill( } else if (input.is_audio()) { Audio audio = input.get_audio(); - // Use the original tensor shape as intended - auto audio_tensor = executorch::extension::from_blob( - audio.data.data(), - {audio.batch_size, audio.n_bins, audio.n_frames}, - ::executorch::aten::ScalarType::Float); + // Use Audio::toTensor() for tensor creation + auto audio_tensor = + ET_UNWRAP(audio.toTensor(), "Failed to convert audio to tensor"); // Run audio encoder auto audio_encoder_result = From a24699837eaaee7b8ac190968761ed50081bd306 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Thu, 18 Sep 2025 20:26:42 -0700 Subject: [PATCH 2/5] Add include --- extension/llm/runner/audio.h | 1 + 1 file changed, 1 insertion(+) diff --git a/extension/llm/runner/audio.h b/extension/llm/runner/audio.h index 4dec8983b02..b8e4b9d84af 100644 --- a/extension/llm/runner/audio.h +++ b/extension/llm/runner/audio.h @@ -11,6 +11,7 @@ #pragma once #include #include +#include #include #include From 821d0854f06a5ad4dc98ea0eca3e450a30f4b47a Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Fri, 19 Sep 2025 10:52:07 -0700 Subject: [PATCH 3/5] Address comments --- extension/llm/runner/audio.h | 125 ++++++++++-------- extension/llm/runner/multimodal_prefiller.cpp | 6 +- 2 files changed, 78 insertions(+), 53 deletions(-) diff --git a/extension/llm/runner/audio.h b/extension/llm/runner/audio.h index b8e4b9d84af..170b245cfef 100644 --- a/extension/llm/runner/audio.h +++ b/extension/llm/runner/audio.h @@ -39,100 +39,121 @@ struct ET_EXPERIMENTAL RawAudio { * Mel spectrograms are typically represented as floating point values. For raw * or quantized audio, uint8_t may be used instead. 
*/ -class ET_EXPERIMENTAL Audio { +class ET_EXPERIMENTAL Audio final { public: // Default constructor - Audio() : batch_size(0), n_bins(0), n_frames(0) {} + Audio() : batch_size_(0), n_bins_(0), n_frames_(0) {} // Constructor for uint8_t data Audio( - std::vector&& data_, - int32_t batch_size_, - int32_t n_bins_, - int32_t n_frames_) - : data(std::move(data_)), - batch_size(batch_size_), - n_bins(n_bins_), - n_frames(n_frames_) {} + std::vector&& data, + int32_t batch_size, + int32_t n_bins, + int32_t n_frames) + : data_(std::move(data)), + batch_size_(batch_size), + n_bins_(n_bins), + n_frames_(n_frames) { + ET_CHECK_MSG( + data_.index() == 0 && + std::get>(data_).size() == + static_cast(batch_size * n_bins * n_frames), + "data.size() (%zu) does not match batch_size * n_bins * n_frames (%d)", + std::get>(data_).size(), + batch_size * n_bins * n_frames); + } // Constructor for float data Audio( - std::vector&& data_, - int32_t batch_size_, - int32_t n_bins_, - int32_t n_frames_) - : data(std::move(data_)), - batch_size(batch_size_), - n_bins(n_bins_), - n_frames(n_frames_) {} + std::vector&& data, + int32_t batch_size, + int32_t n_bins, + int32_t n_frames) + : data_(std::move(data)), + batch_size_(batch_size), + n_bins_(n_bins), + n_frames_(n_frames) { + ET_CHECK_MSG( + data_.index() == 1 && + std::get>(data_).size() == + static_cast(batch_size * n_bins * n_frames), + "data.size() (%zu) does not match batch_size * n_bins * n_frames (%d)", + std::get>(data_).size(), + batch_size * n_bins * n_frames); + } // Type checkers bool is_uint8() const { - return std::holds_alternative>(data); + return std::holds_alternative>(data_); } bool is_float() const { - return std::holds_alternative>(data); + return std::holds_alternative>(data_); } // Data access const std::vector& get_uint8_data() const& { - return std::get>(data); + return std::get>(data_); } std::vector& get_uint8_data() & { - return std::get>(data); + return std::get>(data_); } const std::vector& get_float_data() 
const& { - return std::get>(data); + return std::get>(data_); } std::vector& get_float_data() & { - return std::get>(data); + return std::get>(data_); } int32_t get_batch_size() const { - return batch_size; + return batch_size_; } int32_t get_n_bins() const { - return n_bins; + return n_bins_; } int32_t get_n_frames() const { - return n_frames; + return n_frames_; } /** * Convert the audio data to a TensorPtr, with optional batch dimension. * The tensor will have shape (batch_size, n_bins, n_frames) or (1, * batch_size, n_bins, n_frames) if with_batch is true. */ - executorch::runtime::Result toTensor() - const { - std::vector sizes = { - get_batch_size(), get_n_bins(), get_n_frames()}; - if (is_float()) { - return executorch::extension::from_blob( - const_cast(get_float_data().data()), - sizes, - ::executorch::aten::ScalarType::Float); - } else if (is_uint8()) { - return executorch::extension::from_blob( - const_cast(get_uint8_data().data()), - sizes, - ::executorch::aten::ScalarType::Byte); + executorch::runtime::Result toTensor( + bool with_batch = false) { + const { + std::vector sizes = { + get_batch_size(), get_n_bins(), get_n_frames()}; + if (with_batch) { + sizes.insert(sizes.begin(), 1); + } + if (is_float()) { + return executorch::extension::from_blob( + const_cast(get_float_data().data()), + sizes, + ::executorch::aten::ScalarType::Float); + } else if (is_uint8()) { + return executorch::extension::from_blob( + const_cast(get_uint8_data().data()), + sizes, + ::executorch::aten::ScalarType::Byte); + } + ET_LOG( + Error, + "Shouldn't reach here, audio data is not initialized with uint8_t or float vector."); + return ::executorch::runtime::Error::NotSupported; } - ET_LOG( - Error, "Audio data is not initialized with uint8_t or float vector."); - return ::executorch::runtime::Error::NotSupported; - } - private: - // Members - std::variant, std::vector> data; - int32_t batch_size; - int32_t n_bins; - int32_t n_frames; -}; + private: + // Members + 
std::variant, std::vector> data_; + int32_t batch_size_; + int32_t n_bins_; + int32_t n_frames_; + }; } // namespace llm } // namespace extension diff --git a/extension/llm/runner/multimodal_prefiller.cpp b/extension/llm/runner/multimodal_prefiller.cpp index 1d65c2166c3..824fdf943a9 100644 --- a/extension/llm/runner/multimodal_prefiller.cpp +++ b/extension/llm/runner/multimodal_prefiller.cpp @@ -96,7 +96,11 @@ Result MultimodalPrefiller::prefill( // Use Audio::toTensor() for tensor creation auto audio_tensor = ET_UNWRAP(audio.toTensor(), "Failed to convert audio to tensor"); - + ET_LOG( + Info, + "Audio tensor dim: %zu, dtype: %s", + audio_tensor->dim(), + ::executorch::runtime::toString(audio_tensor->scalar_type())); // Run audio encoder auto audio_encoder_result = module_->execute(kAudioEncoderMethod, audio_tensor); From 0db2588c7d820f15184b399c961bfea28f14031a Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Fri, 19 Sep 2025 13:27:25 -0700 Subject: [PATCH 4/5] Fix typo --- extension/llm/runner/audio.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/extension/llm/runner/audio.h b/extension/llm/runner/audio.h index 170b245cfef..bf04c1cc6da 100644 --- a/extension/llm/runner/audio.h +++ b/extension/llm/runner/audio.h @@ -123,8 +123,7 @@ class ET_EXPERIMENTAL Audio final { * batch_size, n_bins, n_frames) if with_batch is true. 
*/ executorch::runtime::Result toTensor( - bool with_batch = false) { - const { + bool with_batch = false) const { std::vector sizes = { get_batch_size(), get_n_bins(), get_n_frames()}; if (with_batch) { From 8b6a3d326503b9910a00878619b00463df2a06e2 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Fri, 19 Sep 2025 13:35:08 -0700 Subject: [PATCH 5/5] Lintrunner --- extension/llm/runner/audio.h | 54 ++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/extension/llm/runner/audio.h b/extension/llm/runner/audio.h index bf04c1cc6da..ce71513ed17 100644 --- a/extension/llm/runner/audio.h +++ b/extension/llm/runner/audio.h @@ -124,35 +124,35 @@ class ET_EXPERIMENTAL Audio final { */ executorch::runtime::Result toTensor( bool with_batch = false) const { - std::vector sizes = { - get_batch_size(), get_n_bins(), get_n_frames()}; - if (with_batch) { - sizes.insert(sizes.begin(), 1); - } - if (is_float()) { - return executorch::extension::from_blob( - const_cast(get_float_data().data()), - sizes, - ::executorch::aten::ScalarType::Float); - } else if (is_uint8()) { - return executorch::extension::from_blob( - const_cast(get_uint8_data().data()), - sizes, - ::executorch::aten::ScalarType::Byte); - } - ET_LOG( - Error, - "Shouldn't reach here, audio data is not initialized with uint8_t or float vector."); - return ::executorch::runtime::Error::NotSupported; + std::vector sizes = { + get_batch_size(), get_n_bins(), get_n_frames()}; + if (with_batch) { + sizes.insert(sizes.begin(), 1); } + if (is_float()) { + return executorch::extension::from_blob( + const_cast(get_float_data().data()), + sizes, + ::executorch::aten::ScalarType::Float); + } else if (is_uint8()) { + return executorch::extension::from_blob( + const_cast(get_uint8_data().data()), + sizes, + ::executorch::aten::ScalarType::Byte); + } + ET_LOG( + Error, + "Shouldn't reach here, audio data is not initialized with uint8_t or float vector."); + return 
::executorch::runtime::Error::NotSupported; + } - private: - // Members - std::variant, std::vector> data_; - int32_t batch_size_; - int32_t n_bins_; - int32_t n_frames_; - }; + private: + // Members + std::variant, std::vector> data_; + int32_t batch_size_; + int32_t n_bins_; + int32_t n_frames_; +}; } // namespace llm } // namespace extension