Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 17 additions & 30 deletions examples/models/voxtral/multimodal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,15 +103,13 @@ MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {

ET_LOG(Info, "audio_data len = %zu", n_floats);

// Create Audio multimodal input
auto audio = std::make_unique<::executorch::extension::llm::Audio>();
audio->batch_size = batch_size;
audio->n_bins = n_bins;
audio->n_frames = n_frames;
audio->data.resize(n_floats * sizeof(float));
f.read(reinterpret_cast<char*>(audio->data.data()), n_floats * sizeof(float));
std::vector<float> audio_data(n_floats);
f.read(reinterpret_cast<char*>(audio_data.data()), n_floats * sizeof(float));
f.close();
return ::executorch::extension::llm::make_audio_input(std::move(*audio));

auto audio = ::executorch::extension::llm::Audio(
std::move(audio_data), batch_size, n_bins, n_frames);
return ::executorch::extension::llm::make_audio_input(std::move(audio));
}

/**
Expand Down Expand Up @@ -206,32 +204,21 @@ MultimodalInput processRawAudioFile(
static_cast<int>(sizes[2]));

// Create Audio multimodal input from processed features
auto processed_audio =
std::make_unique<::executorch::extension::llm::Audio>();
processed_audio->batch_size =
static_cast<int32_t>(sizes[0]); // Note: batching for s > 30 doesn't work
// yet, so this will just be = 1.
processed_audio->n_bins = static_cast<int32_t>(sizes[1]);
processed_audio->n_frames =
static_cast<int32_t>(sizes[2]); // And this will just be = 3000.

size_t total_elements = processed_audio->batch_size *
processed_audio->n_bins * processed_audio->n_frames;
processed_audio->data.resize(total_elements * sizeof(float));
std::memcpy(
processed_audio->data.data(),
processed_data,
total_elements * sizeof(float));

int32_t batch_size = static_cast<int32_t>(sizes[0]);
int32_t n_bins = static_cast<int32_t>(sizes[1]);
int32_t n_frames = static_cast<int32_t>(sizes[2]);
size_t total_elements = batch_size * n_bins * n_frames;
std::vector<float> audio_vec(processed_data, processed_data + total_elements);
auto processed_audio = ::executorch::extension::llm::Audio(
std::move(audio_vec), batch_size, n_bins, n_frames);
ET_LOG(
Info,
"Created processed Audio: batch_size=%d, n_bins=%d, n_frames=%d",
processed_audio->batch_size,
processed_audio->n_bins,
processed_audio->n_frames);

batch_size,
n_bins,
n_frames);
return ::executorch::extension::llm::make_audio_input(
std::move(*processed_audio));
std::move(processed_audio));
}

/**
Expand Down
129 changes: 122 additions & 7 deletions extension/llm/runner/audio.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,11 @@
#pragma once
#include <executorch/runtime/platform/compiler.h>
#include <cstdint>
#include <variant>
#include <vector>

#include <executorch/extension/tensor/tensor.h>

namespace executorch {
namespace extension {
namespace llm {
Expand All @@ -29,14 +32,126 @@ struct ET_EXPERIMENTAL RawAudio {
};

/**
* Pre-processed audio inputs, ready to feed directly into an audio
* encoder.
* Pre-processed audio inputs, ready to feed directly into an audio encoder.
*
* The data can be either uint8_t or float. If the audio has gone through a Mel
* transform, we expect the data type to be float (i.e., std::vector<float>), as
* Mel spectrograms are typically represented as floating point values. For raw
* or quantized audio, uint8_t may be used instead.
*/
struct ET_EXPERIMENTAL Audio {
std::vector<uint8_t> data;
int32_t batch_size;
int32_t n_bins;
int32_t n_frames;
class ET_EXPERIMENTAL Audio final {
 public:
  // Default constructor: empty uint8_t payload, zero-sized dimensions.
  Audio() : batch_size_(0), n_bins_(0), n_frames_(0) {}

  /**
   * Constructor for uint8_t data (raw or quantized audio).
   * Aborts via ET_CHECK_MSG if data.size() != batch_size * n_bins * n_frames.
   */
  Audio(
      std::vector<uint8_t>&& data,
      int32_t batch_size,
      int32_t n_bins,
      int32_t n_frames)
      : data_(std::move(data)),
        batch_size_(batch_size),
        n_bins_(n_bins),
        n_frames_(n_frames) {
    // Note: no need to check the variant alternative here — it was just
    // constructed from a std::vector<uint8_t>.
    check_data_size(std::get<std::vector<uint8_t>>(data_).size());
  }

  /**
   * Constructor for float data (e.g. Mel spectrogram features).
   * Aborts via ET_CHECK_MSG if data.size() != batch_size * n_bins * n_frames.
   */
  Audio(
      std::vector<float>&& data,
      int32_t batch_size,
      int32_t n_bins,
      int32_t n_frames)
      : data_(std::move(data)),
        batch_size_(batch_size),
        n_bins_(n_bins),
        n_frames_(n_frames) {
    check_data_size(std::get<std::vector<float>>(data_).size());
  }

  // Type checkers
  bool is_uint8() const {
    return std::holds_alternative<std::vector<uint8_t>>(data_);
  }

  bool is_float() const {
    return std::holds_alternative<std::vector<float>>(data_);
  }

  // Data access. These throw std::bad_variant_access if the stored
  // alternative does not match; callers should consult is_uint8()/is_float()
  // first.
  const std::vector<uint8_t>& get_uint8_data() const& {
    return std::get<std::vector<uint8_t>>(data_);
  }

  std::vector<uint8_t>& get_uint8_data() & {
    return std::get<std::vector<uint8_t>>(data_);
  }

  const std::vector<float>& get_float_data() const& {
    return std::get<std::vector<float>>(data_);
  }

  std::vector<float>& get_float_data() & {
    return std::get<std::vector<float>>(data_);
  }

  int32_t get_batch_size() const {
    return batch_size_;
  }
  int32_t get_n_bins() const {
    return n_bins_;
  }
  int32_t get_n_frames() const {
    return n_frames_;
  }

  /**
   * Convert the audio data to a TensorPtr, with optional batch dimension.
   * The tensor will have shape (batch_size, n_bins, n_frames) or
   * (1, batch_size, n_bins, n_frames) if with_batch is true.
   *
   * The returned tensor is a non-owning view (from_blob) into this Audio's
   * internal buffer: this Audio object must outlive the tensor, and the
   * tensor must not be written through.
   *
   * @param with_batch Prepend a leading dimension of size 1 when true.
   * @return TensorPtr viewing the data, or Error::NotSupported if the
   *         variant holds neither alternative (not reachable via the
   *         public constructors).
   */
  executorch::runtime::Result<executorch::extension::TensorPtr> toTensor(
      bool with_batch = false) const {
    std::vector<executorch::aten::SizesType> sizes = {
        get_batch_size(), get_n_bins(), get_n_frames()};
    if (with_batch) {
      sizes.insert(sizes.begin(), 1);
    }
    if (is_float()) {
      return executorch::extension::from_blob(
          const_cast<float*>(get_float_data().data()),
          sizes,
          ::executorch::aten::ScalarType::Float);
    } else if (is_uint8()) {
      return executorch::extension::from_blob(
          const_cast<uint8_t*>(get_uint8_data().data()),
          sizes,
          ::executorch::aten::ScalarType::Byte);
    }
    ET_LOG(
        Error,
        "Shouldn't reach here, audio data is not initialized with uint8_t or float vector.");
    return ::executorch::runtime::Error::NotSupported;
  }

 private:
  // Shared constructor validation. Computes the expected element count in
  // size_t arithmetic (casting each factor individually) so the product
  // cannot overflow int32_t before the comparison, which the previous
  // inline `batch_size * n_bins * n_frames` expression could.
  void check_data_size(size_t actual_size) const {
    const size_t expected_size = static_cast<size_t>(batch_size_) *
        static_cast<size_t>(n_bins_) * static_cast<size_t>(n_frames_);
    ET_CHECK_MSG(
        actual_size == expected_size,
        "data.size() (%zu) does not match batch_size * n_bins * n_frames (%zu)",
        actual_size,
        expected_size);
  }

  // Members
  std::variant<std::vector<uint8_t>, std::vector<float>> data_;
  int32_t batch_size_;
  int32_t n_bins_;
  int32_t n_frames_;
};

} // namespace llm
Expand Down
29 changes: 19 additions & 10 deletions extension/llm/runner/multimodal_prefiller.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,21 +47,24 @@ Result<uint64_t> MultimodalPrefiller::prefill(
"Failed to get method_meta for %s",
kVisionEncoderMethod);

ET_CHECK_MSG(
ET_CHECK_OR_RETURN_ERROR(
method_meta.num_inputs() > 0,
InvalidArgument,
"Image encoder should have at least 1 input");
auto input_meta = ET_UNWRAP(
method_meta.input_tensor_meta(0),
"Cannot get input tensor meta at index 0");
auto expected_dtype = input_meta.scalar_type();

if (expected_dtype == ::executorch::aten::ScalarType::Float) {
ET_CHECK_MSG(
ET_CHECK_OR_RETURN_ERROR(
image.is_float(),
InvalidArgument,
"Model expects float image data, but image has uint8_t data.");
} else if (expected_dtype == ::executorch::aten::ScalarType::Byte) {
ET_CHECK_MSG(
ET_CHECK_OR_RETURN_ERROR(
image.is_uint8(),
InvalidArgument,
"Model expects uint8_t image data, but image has float data.");
} else {
ET_LOG(
Expand All @@ -77,7 +80,11 @@ Result<uint64_t> MultimodalPrefiller::prefill(
auto image_tensor = ET_UNWRAP(
image.toTensor(/*with_batch*/ expected_dims.size() == 4),
"Failed to convert image to tensor");

ET_LOG(
Info,
"Image tensor dim: %zu, dtype: %s",
image_tensor->dim(),
::executorch::runtime::toString(image_tensor->scalar_type()));
// Run image encoder
auto image_encoder_outputs =
ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor));
Expand All @@ -86,12 +93,14 @@ Result<uint64_t> MultimodalPrefiller::prefill(
} else if (input.is_audio()) {
Audio audio = input.get_audio();

// Use the original tensor shape as intended
auto audio_tensor = executorch::extension::from_blob(
audio.data.data(),
{audio.batch_size, audio.n_bins, audio.n_frames},
::executorch::aten::ScalarType::Float);

// Use Audio::toTensor() for tensor creation
auto audio_tensor =
ET_UNWRAP(audio.toTensor(), "Failed to convert audio to tensor");
ET_LOG(
Info,
"Audio tensor dim: %zu, dtype: %s",
audio_tensor->dim(),
::executorch::runtime::toString(audio_tensor->scalar_type()));
// Run audio encoder
auto audio_encoder_result =
module_->execute(kAudioEncoderMethod, audio_tensor);
Expand Down
Loading