diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp
index 17013df96e1..081df27cd67 100644
--- a/examples/models/voxtral/multimodal.cpp
+++ b/examples/models/voxtral/multimodal.cpp
@@ -103,15 +103,13 @@ MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
 
   ET_LOG(Info, "audio_data len = %zu", n_floats);
 
-  // Create Audio multimodal input
-  auto audio = std::make_unique<::executorch::extension::llm::Audio>();
-  audio->batch_size = batch_size;
-  audio->n_bins = n_bins;
-  audio->n_frames = n_frames;
-  audio->data.resize(n_floats * sizeof(float));
-  f.read(reinterpret_cast<char*>(audio->data.data()), n_floats * sizeof(float));
+  std::vector<float> audio_data(n_floats);
+  f.read(reinterpret_cast<char*>(audio_data.data()), n_floats * sizeof(float));
   f.close();
-  return ::executorch::extension::llm::make_audio_input(std::move(*audio));
+
+  auto audio = ::executorch::extension::llm::Audio(
+      std::move(audio_data), batch_size, n_bins, n_frames);
+  return ::executorch::extension::llm::make_audio_input(std::move(audio));
 }
 
 /**
@@ -206,32 +204,21 @@ MultimodalInput processRawAudioFile(
       static_cast<int32_t>(sizes[2]));
 
   // Create Audio multimodal input from processed features
-  auto processed_audio =
-      std::make_unique<::executorch::extension::llm::Audio>();
-  processed_audio->batch_size =
-      static_cast<int32_t>(sizes[0]); // Note: batching for s > 30 doesn't work
-                                      // yet, so this will just be = 1.
-  processed_audio->n_bins = static_cast<int32_t>(sizes[1]);
-  processed_audio->n_frames =
-      static_cast<int32_t>(sizes[2]); // And this will just be = 3000.
-
-  size_t total_elements = processed_audio->batch_size *
-      processed_audio->n_bins * processed_audio->n_frames;
-  processed_audio->data.resize(total_elements * sizeof(float));
-  std::memcpy(
-      processed_audio->data.data(),
-      processed_data,
-      total_elements * sizeof(float));
-
+  int32_t batch_size = static_cast<int32_t>(sizes[0]);
+  int32_t n_bins = static_cast<int32_t>(sizes[1]);
+  int32_t n_frames = static_cast<int32_t>(sizes[2]);
+  size_t total_elements = batch_size * n_bins * n_frames;
+  std::vector<float> audio_vec(processed_data, processed_data + total_elements);
+  auto processed_audio = ::executorch::extension::llm::Audio(
+      std::move(audio_vec), batch_size, n_bins, n_frames);
   ET_LOG(
       Info,
       "Created processed Audio: batch_size=%d, n_bins=%d, n_frames=%d",
-      processed_audio->batch_size,
-      processed_audio->n_bins,
-      processed_audio->n_frames);
-
+      batch_size,
+      n_bins,
+      n_frames);
   return ::executorch::extension::llm::make_audio_input(
-      std::move(*processed_audio));
+      std::move(processed_audio));
 }
 
 /**
diff --git a/extension/llm/runner/audio.h b/extension/llm/runner/audio.h
index 868765950af..ce71513ed17 100644
--- a/extension/llm/runner/audio.h
+++ b/extension/llm/runner/audio.h
@@ -11,8 +11,11 @@
 #pragma once
 #include <cstdint>
 #include <vector>
+#include <variant>
 #include <executorch/runtime/platform/compiler.h>
+#include <executorch/extension/tensor/tensor.h>
+
 namespace executorch {
 namespace extension {
 namespace llm {
@@ -29,14 +32,126 @@ struct ET_EXPERIMENTAL RawAudio {
 };
 
 /**
- * Pre-processed audio inputs, ready to feed directly into an audio
- * encoder.
+ * Pre-processed audio inputs, ready to feed directly into an audio encoder.
+ *
+ * The data can be either uint8_t or float. If the audio has gone through a Mel
+ * transform, we expect the data type to be float (i.e., std::vector<float>), as
+ * Mel spectrograms are typically represented as floating point values. For raw
+ * or quantized audio, uint8_t may be used instead.
 */
-struct ET_EXPERIMENTAL Audio {
-  std::vector<uint8_t> data;
-  int32_t batch_size;
-  int32_t n_bins;
-  int32_t n_frames;
+class ET_EXPERIMENTAL Audio final {
+ public:
+  // Default constructor
+  Audio() : batch_size_(0), n_bins_(0), n_frames_(0) {}
+
+  // Constructor for uint8_t data
+  Audio(
+      std::vector<uint8_t>&& data,
+      int32_t batch_size,
+      int32_t n_bins,
+      int32_t n_frames)
+      : data_(std::move(data)),
+        batch_size_(batch_size),
+        n_bins_(n_bins),
+        n_frames_(n_frames) {
+    ET_CHECK_MSG(
+        data_.index() == 0 &&
+            std::get<std::vector<uint8_t>>(data_).size() ==
+                static_cast<size_t>(batch_size * n_bins * n_frames),
+        "data.size() (%zu) does not match batch_size * n_bins * n_frames (%d)",
+        std::get<std::vector<uint8_t>>(data_).size(),
+        batch_size * n_bins * n_frames);
+  }
+
+  // Constructor for float data
+  Audio(
+      std::vector<float>&& data,
+      int32_t batch_size,
+      int32_t n_bins,
+      int32_t n_frames)
+      : data_(std::move(data)),
+        batch_size_(batch_size),
+        n_bins_(n_bins),
+        n_frames_(n_frames) {
+    ET_CHECK_MSG(
+        data_.index() == 1 &&
+            std::get<std::vector<float>>(data_).size() ==
+                static_cast<size_t>(batch_size * n_bins * n_frames),
+        "data.size() (%zu) does not match batch_size * n_bins * n_frames (%d)",
+        std::get<std::vector<float>>(data_).size(),
+        batch_size * n_bins * n_frames);
+  }
+
+  // Type checkers
+  bool is_uint8() const {
+    return std::holds_alternative<std::vector<uint8_t>>(data_);
+  }
+
+  bool is_float() const {
+    return std::holds_alternative<std::vector<float>>(data_);
+  }
+
+  // Data access
+  const std::vector<uint8_t>& get_uint8_data() const& {
+    return std::get<std::vector<uint8_t>>(data_);
+  }
+
+  std::vector<uint8_t>& get_uint8_data() & {
+    return std::get<std::vector<uint8_t>>(data_);
+  }
+
+  const std::vector<float>& get_float_data() const& {
+    return std::get<std::vector<float>>(data_);
+  }
+
+  std::vector<float>& get_float_data() & {
+    return std::get<std::vector<float>>(data_);
+  }
+
+  int32_t get_batch_size() const {
+    return batch_size_;
+  }
+  int32_t get_n_bins() const {
+    return n_bins_;
+  }
+  int32_t get_n_frames() const {
+    return n_frames_;
+  }
+  /**
+   * Convert the audio data to a TensorPtr, with optional batch dimension.
+   * The tensor will have shape (batch_size, n_bins, n_frames) or (1,
+   * batch_size, n_bins, n_frames) if with_batch is true.
+   */
+  executorch::runtime::Result<executorch::extension::TensorPtr> toTensor(
+      bool with_batch = false) const {
+    std::vector<executorch::aten::SizesType> sizes = {
+        get_batch_size(), get_n_bins(), get_n_frames()};
+    if (with_batch) {
+      sizes.insert(sizes.begin(), 1);
+    }
+    if (is_float()) {
+      return executorch::extension::from_blob(
+          const_cast<float*>(get_float_data().data()),
+          sizes,
+          ::executorch::aten::ScalarType::Float);
+    } else if (is_uint8()) {
+      return executorch::extension::from_blob(
+          const_cast<uint8_t*>(get_uint8_data().data()),
+          sizes,
+          ::executorch::aten::ScalarType::Byte);
+    }
+    ET_LOG(
+        Error,
+        "Shouldn't reach here, audio data is not initialized with uint8_t or float vector.");
+    return ::executorch::runtime::Error::NotSupported;
+  }
+
+ private:
+  // Members
+  std::variant<std::vector<uint8_t>, std::vector<float>> data_;
+  int32_t batch_size_;
+  int32_t n_bins_;
+  int32_t n_frames_;
 };
 
 }  // namespace llm
diff --git a/extension/llm/runner/multimodal_prefiller.cpp b/extension/llm/runner/multimodal_prefiller.cpp
index f9645667f24..824fdf943a9 100644
--- a/extension/llm/runner/multimodal_prefiller.cpp
+++ b/extension/llm/runner/multimodal_prefiller.cpp
@@ -47,8 +47,9 @@ Result<uint64_t> MultimodalPrefiller::prefill(
         "Failed to get method_meta for %s",
         kVisionEncoderMethod);
 
-    ET_CHECK_MSG(
+    ET_CHECK_OR_RETURN_ERROR(
         method_meta.num_inputs() > 0,
+        InvalidArgument,
         "Image encoder should have at least 1 input");
     auto input_meta = ET_UNWRAP(
         method_meta.input_tensor_meta(0),
@@ -56,12 +57,14 @@ Result<uint64_t> MultimodalPrefiller::prefill(
     auto expected_dtype = input_meta.scalar_type();
 
     if (expected_dtype == ::executorch::aten::ScalarType::Float) {
-      ET_CHECK_MSG(
+      ET_CHECK_OR_RETURN_ERROR(
           image.is_float(),
+          InvalidArgument,
           "Model expects float image data, but image has uint8_t data.");
     } else if (expected_dtype == ::executorch::aten::ScalarType::Byte) {
-      ET_CHECK_MSG(
+      ET_CHECK_OR_RETURN_ERROR(
           image.is_uint8(),
+          InvalidArgument,
"Model expects uint8_t image data, but image has float data."); } else { ET_LOG( @@ -77,7 +80,11 @@ Result MultimodalPrefiller::prefill( auto image_tensor = ET_UNWRAP( image.toTensor(/*with_batch*/ expected_dims.size() == 4), "Failed to convert image to tensor"); - + ET_LOG( + Info, + "Image tensor dim: %zu, dtype: %s", + image_tensor->dim(), + ::executorch::runtime::toString(image_tensor->scalar_type())); // Run image encoder auto image_encoder_outputs = ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor)); @@ -86,12 +93,14 @@ Result MultimodalPrefiller::prefill( } else if (input.is_audio()) { Audio audio = input.get_audio(); - // Use the original tensor shape as intended - auto audio_tensor = executorch::extension::from_blob( - audio.data.data(), - {audio.batch_size, audio.n_bins, audio.n_frames}, - ::executorch::aten::ScalarType::Float); - + // Use Audio::toTensor() for tensor creation + auto audio_tensor = + ET_UNWRAP(audio.toTensor(), "Failed to convert audio to tensor"); + ET_LOG( + Info, + "Audio tensor dim: %zu, dtype: %s", + audio_tensor->dim(), + ::executorch::runtime::toString(audio_tensor->scalar_type())); // Run audio encoder auto audio_encoder_result = module_->execute(kAudioEncoderMethod, audio_tensor);