From af94fa5456afa1206176cb32c960eb182aa1c16a Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Fri, 19 Sep 2025 14:51:52 -0700
Subject: [PATCH] [multimodal] Let Audio take float data blob (#14427)

If the processed audio has gone through a Mel transform, the spectrogram
values are floats. The `Audio` class should be able to take this, since the
multimodal runner pybind API will have to accept preprocessed input. Once we
have the pybind API, we can do something like:

```python
model_id = "mistralai/Voxtral-Mini-3B-2507"
processor = AutoProcessor.from_pretrained(model_id)

audio_url = "https://huggingface.co/datasets/eustlb/audio-samples/resolve/main/dude_where_is_my_car.wav"

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "audio", "url": audio_url},
            {
                "type": "text",
                "text": "What can you tell me about this audio?",
            },
        ],
    },
]

inputs = processor.apply_chat_template(conversation, tokenize=True, return_dict=True, return_tensors="pt")

inputs_combined = [
    make_text_input("[INST][BEGIN_AUDIO]"),
    make_audio_input(inputs["input_features"]),
    make_text_input("\nWhat can you tell me about this audio?[/INST]"),
]

runner = MultimodalRunner("voxtral.pte", "tekken.json", None)
config = GenerationConfig()
config.max_new_tokens = 100
runner.generate(inputs_combined, config)
```
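On the C++ side, the new constructor mirrors what `examples/models/voxtral/multimodal.cpp` does below. A minimal sketch of the intended usage (the `multimodal_input.h` include path and the feature dimensions are illustrative assumptions, not part of this patch):

```cpp
// Sketch: wrapping Mel-spectrogram floats in the float-capable Audio class.
// The constructor CHECK-fails unless data.size() == batch_size * n_bins * n_frames.
#include <utility>
#include <vector>

#include <executorch/extension/llm/runner/audio.h>
#include <executorch/extension/llm/runner/multimodal_input.h> // assumed path

using ::executorch::extension::llm::Audio;
using ::executorch::extension::llm::make_audio_input;

int main() {
  // Hypothetical Voxtral-style features: 1 batch, 128 Mel bins, 3000 frames.
  std::vector<float> features(1 * 128 * 3000, 0.0f);
  Audio audio(
      std::move(features), /*batch_size=*/1, /*n_bins=*/128, /*n_frames=*/3000);

  // The prefiller consumes this as a MultimodalInput; Audio::toTensor() will
  // produce a Float tensor of shape (1, 128, 3000) for the audio encoder.
  auto input = make_audio_input(std::move(audio));
  return 0;
}
```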
(cherry picked from commit 8b114180ef143abb06b0441c0788edec5461e5ad)
---
 examples/models/voxtral/multimodal.cpp        |  47 +++----
 extension/llm/runner/audio.h                  | 129 +++++++++++++++++-
 extension/llm/runner/multimodal_prefiller.cpp |  29 ++--
 3 files changed, 158 insertions(+), 47 deletions(-)

diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp
index 17013df96e1..081df27cd67 100644
--- a/examples/models/voxtral/multimodal.cpp
+++ b/examples/models/voxtral/multimodal.cpp
@@ -103,15 +103,13 @@ MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
 
   ET_LOG(Info, "audio_data len = %zu", n_floats);
 
-  // Create Audio multimodal input
-  auto audio = std::make_unique<::executorch::extension::llm::Audio>();
-  audio->batch_size = batch_size;
-  audio->n_bins = n_bins;
-  audio->n_frames = n_frames;
-  audio->data.resize(n_floats * sizeof(float));
-  f.read(reinterpret_cast<char*>(audio->data.data()), n_floats * sizeof(float));
+  std::vector<float> audio_data(n_floats);
+  f.read(reinterpret_cast<char*>(audio_data.data()), n_floats * sizeof(float));
   f.close();
-  return ::executorch::extension::llm::make_audio_input(std::move(*audio));
+
+  auto audio = ::executorch::extension::llm::Audio(
+      std::move(audio_data), batch_size, n_bins, n_frames);
+  return ::executorch::extension::llm::make_audio_input(std::move(audio));
 }
 
 /**
@@ -206,32 +204,21 @@ MultimodalInput processRawAudioFile(
       static_cast<int32_t>(sizes[2]));
 
   // Create Audio multimodal input from processed features
-  auto processed_audio =
-      std::make_unique<::executorch::extension::llm::Audio>();
-  processed_audio->batch_size =
-      static_cast<int32_t>(sizes[0]); // Note: batching for s > 30 doesn't work
-                                      // yet, so this will just be = 1.
-  processed_audio->n_bins = static_cast<int32_t>(sizes[1]);
-  processed_audio->n_frames =
-      static_cast<int32_t>(sizes[2]); // And this will just be = 3000.
-
-  size_t total_elements = processed_audio->batch_size *
-      processed_audio->n_bins * processed_audio->n_frames;
-  processed_audio->data.resize(total_elements * sizeof(float));
-  std::memcpy(
-      processed_audio->data.data(),
-      processed_data,
-      total_elements * sizeof(float));
-
+  int32_t batch_size = static_cast<int32_t>(sizes[0]);
+  int32_t n_bins = static_cast<int32_t>(sizes[1]);
+  int32_t n_frames = static_cast<int32_t>(sizes[2]);
+  size_t total_elements = batch_size * n_bins * n_frames;
+  std::vector<float> audio_vec(processed_data, processed_data + total_elements);
+  auto processed_audio = ::executorch::extension::llm::Audio(
+      std::move(audio_vec), batch_size, n_bins, n_frames);
   ET_LOG(
       Info,
       "Created processed Audio: batch_size=%d, n_bins=%d, n_frames=%d",
-      processed_audio->batch_size,
-      processed_audio->n_bins,
-      processed_audio->n_frames);
-
+      batch_size,
+      n_bins,
+      n_frames);
   return ::executorch::extension::llm::make_audio_input(
-      std::move(*processed_audio));
+      std::move(processed_audio));
 }
 
 /**
diff --git a/extension/llm/runner/audio.h b/extension/llm/runner/audio.h
index 868765950af..ce71513ed17 100644
--- a/extension/llm/runner/audio.h
+++ b/extension/llm/runner/audio.h
@@ -11,8 +11,11 @@
 #pragma once
 #include <cstdint>
 #include <vector>
+#include <variant>
 
 #include <executorch/runtime/platform/compiler.h>
+#include <executorch/extension/tensor/tensor.h>
+
 namespace executorch {
 namespace extension {
 namespace llm {
@@ -29,14 +32,126 @@ struct ET_EXPERIMENTAL RawAudio {
 };
 
 /**
- * Pre-processed audio inputs, ready to feed directly into an audio
- * encoder.
+ * Pre-processed audio inputs, ready to feed directly into an audio encoder.
+ *
+ * The data can be either uint8_t or float.
+ * If the audio has gone through a Mel transform, we expect the data type to
+ * be float (i.e., std::vector<float>), as Mel spectrograms are typically
+ * represented as floating point values. For raw or quantized audio, uint8_t
+ * may be used instead.
  */
-struct ET_EXPERIMENTAL Audio {
-  std::vector<uint8_t> data;
-  int32_t batch_size;
-  int32_t n_bins;
-  int32_t n_frames;
+class ET_EXPERIMENTAL Audio final {
+ public:
+  // Default constructor
+  Audio() : batch_size_(0), n_bins_(0), n_frames_(0) {}
+
+  // Constructor for uint8_t data
+  Audio(
+      std::vector<uint8_t>&& data,
+      int32_t batch_size,
+      int32_t n_bins,
+      int32_t n_frames)
+      : data_(std::move(data)),
+        batch_size_(batch_size),
+        n_bins_(n_bins),
+        n_frames_(n_frames) {
+    ET_CHECK_MSG(
+        data_.index() == 0 &&
+            std::get<std::vector<uint8_t>>(data_).size() ==
+                static_cast<size_t>(batch_size * n_bins * n_frames),
+        "data.size() (%zu) does not match batch_size * n_bins * n_frames (%d)",
+        std::get<std::vector<uint8_t>>(data_).size(),
+        batch_size * n_bins * n_frames);
+  }
+
+  // Constructor for float data
+  Audio(
+      std::vector<float>&& data,
+      int32_t batch_size,
+      int32_t n_bins,
+      int32_t n_frames)
+      : data_(std::move(data)),
+        batch_size_(batch_size),
+        n_bins_(n_bins),
+        n_frames_(n_frames) {
+    ET_CHECK_MSG(
+        data_.index() == 1 &&
+            std::get<std::vector<float>>(data_).size() ==
+                static_cast<size_t>(batch_size * n_bins * n_frames),
+        "data.size() (%zu) does not match batch_size * n_bins * n_frames (%d)",
+        std::get<std::vector<float>>(data_).size(),
+        batch_size * n_bins * n_frames);
+  }
+
+  // Type checkers
+  bool is_uint8() const {
+    return std::holds_alternative<std::vector<uint8_t>>(data_);
+  }
+
+  bool is_float() const {
+    return std::holds_alternative<std::vector<float>>(data_);
+  }
+
+  // Data access
+  const std::vector<uint8_t>& get_uint8_data() const& {
+    return std::get<std::vector<uint8_t>>(data_);
+  }
+
+  std::vector<uint8_t>& get_uint8_data() & {
+    return std::get<std::vector<uint8_t>>(data_);
+  }
+
+  const std::vector<float>& get_float_data() const& {
+    return std::get<std::vector<float>>(data_);
+  }
+
+  std::vector<float>& get_float_data() & {
+    return std::get<std::vector<float>>(data_);
+  }
+
+  int32_t get_batch_size() const {
+    return batch_size_;
+  }
+  int32_t get_n_bins() const {
+    return n_bins_;
+  }
+  int32_t get_n_frames() const {
+    return n_frames_;
+  }
+  /**
+   * Convert the audio data to a TensorPtr, with optional batch dimension.
+   * The tensor will have shape (batch_size, n_bins, n_frames) or (1,
+   * batch_size, n_bins, n_frames) if with_batch is true.
+   */
+  executorch::runtime::Result<executorch::extension::TensorPtr> toTensor(
+      bool with_batch = false) const {
+    std::vector<executorch::aten::SizesType> sizes = {
+        get_batch_size(), get_n_bins(), get_n_frames()};
+    if (with_batch) {
+      sizes.insert(sizes.begin(), 1);
+    }
+    if (is_float()) {
+      return executorch::extension::from_blob(
+          const_cast<float*>(get_float_data().data()),
+          sizes,
+          ::executorch::aten::ScalarType::Float);
+    } else if (is_uint8()) {
+      return executorch::extension::from_blob(
+          const_cast<uint8_t*>(get_uint8_data().data()),
+          sizes,
+          ::executorch::aten::ScalarType::Byte);
+    }
+    ET_LOG(
+        Error,
+        "Shouldn't reach here, audio data is not initialized with uint8_t or float vector.");
+    return ::executorch::runtime::Error::NotSupported;
+  }
+
+ private:
+  // Members
+  std::variant<std::vector<uint8_t>, std::vector<float>> data_;
+  int32_t batch_size_;
+  int32_t n_bins_;
+  int32_t n_frames_;
 };
 
 } // namespace llm
diff --git a/extension/llm/runner/multimodal_prefiller.cpp b/extension/llm/runner/multimodal_prefiller.cpp
index f9645667f24..824fdf943a9 100644
--- a/extension/llm/runner/multimodal_prefiller.cpp
+++ b/extension/llm/runner/multimodal_prefiller.cpp
@@ -47,8 +47,9 @@ Result<uint64_t> MultimodalPrefiller::prefill(
         "Failed to get method_meta for %s",
         kVisionEncoderMethod);
 
-    ET_CHECK_MSG(
+    ET_CHECK_OR_RETURN_ERROR(
         method_meta.num_inputs() > 0,
+        InvalidArgument,
         "Image encoder should have at least 1 input");
     auto input_meta = ET_UNWRAP(
         method_meta.input_tensor_meta(0),
@@ -56,12 +57,14 @@ Result<uint64_t> MultimodalPrefiller::prefill(
     auto expected_dtype = input_meta.scalar_type();
 
     if (expected_dtype == ::executorch::aten::ScalarType::Float) {
-      ET_CHECK_MSG(
+      ET_CHECK_OR_RETURN_ERROR(
           image.is_float(),
+          InvalidArgument,
           "Model expects float image data, but image has uint8_t data.");
     } else if (expected_dtype == ::executorch::aten::ScalarType::Byte) {
-      ET_CHECK_MSG(
+      ET_CHECK_OR_RETURN_ERROR(
          image.is_uint8(),
+          InvalidArgument,
           "Model expects uint8_t image data, but image has float data.");
     } else {
       ET_LOG(
@@ -77,7 +80,11 @@ Result<uint64_t> MultimodalPrefiller::prefill(
     auto image_tensor = ET_UNWRAP(
         image.toTensor(/*with_batch*/ expected_dims.size() == 4),
         "Failed to convert image to tensor");
-
+    ET_LOG(
+        Info,
+        "Image tensor dim: %zu, dtype: %s",
+        image_tensor->dim(),
+        ::executorch::runtime::toString(image_tensor->scalar_type()));
     // Run image encoder
     auto image_encoder_outputs =
         ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor));
@@ -86,12 +93,14 @@ Result<uint64_t> MultimodalPrefiller::prefill(
   } else if (input.is_audio()) {
     Audio audio = input.get_audio();
 
-    // Use the original tensor shape as intended
-    auto audio_tensor = executorch::extension::from_blob(
-        audio.data.data(),
-        {audio.batch_size, audio.n_bins, audio.n_frames},
-        ::executorch::aten::ScalarType::Float);
-
+    // Use Audio::toTensor() for tensor creation
+    auto audio_tensor =
+        ET_UNWRAP(audio.toTensor(), "Failed to convert audio to tensor");
+    ET_LOG(
+        Info,
+        "Audio tensor dim: %zu, dtype: %s",
+        audio_tensor->dim(),
+        ::executorch::runtime::toString(audio_tensor->scalar_type()));
     // Run audio encoder
     auto audio_encoder_result =
         module_->execute(kAudioEncoderMethod, audio_tensor);
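For reviewers, a standalone sketch of the `toTensor()` contract introduced above (dimensions arbitrary and hypothetical; it relies only on the constructors and accessors added in `audio.h`):

```cpp
// Sketch: shape/dtype behavior of the new Audio::toTensor().
#include <cstdint>
#include <utility>
#include <vector>

#include <executorch/extension/llm/runner/audio.h>

using ::executorch::extension::llm::Audio;

int main() {
  // Float data (e.g. a Mel spectrogram): 2 x 4 x 8 elements.
  Audio mel(std::vector<float>(2 * 4 * 8, 0.5f), 2, 4, 8);
  auto t = mel.toTensor();                      // Float tensor, shape (2, 4, 8)
  auto tb = mel.toTensor(/*with_batch=*/true);  // shape (1, 2, 4, 8)

  // uint8_t data takes the Byte path instead.
  Audio raw(std::vector<uint8_t>(1 * 2 * 3, 0), 1, 2, 3);
  auto tu = raw.toTensor();                     // Byte tensor, shape (1, 2, 3)

  return (t.ok() && tb.ok() && tu.ok()) ? 0 : 1;
}
```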