Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions examples/models/voxtral/multimodal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,11 @@ MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
ET_LOG(Info, "audio_data len = %zu", n_floats);

std::vector<float> audio_data(n_floats);
ET_LOG(Info, "audio_data size = %zu", audio_data.size());
f.read(reinterpret_cast<char*>(audio_data.data()), n_floats * sizeof(float));
ET_LOG(Info, "First 5 floats in audio_data is %f, %f, %f, %f, %f",
audio_data[10000], audio_data[10001], audio_data[10002], audio_data[10003], audio_data[10004]);

f.close();

auto audio = ::executorch::extension::llm::Audio(
Expand Down Expand Up @@ -331,6 +335,12 @@ int32_t main(int32_t argc, char** argv) {

// Generate
ET_LOG(Info, "Starting generation...");
for (const auto& input : inputs) {
ET_LOG(
Info,
"Input : %s",
input.to_string().c_str());
}
auto error = runner->generate(inputs, config);
if (error != ::executorch::runtime::Error::Ok) {
ET_LOG(Error, "Failed to generate with multimodal runner");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,28 @@ public long prefillAudio(byte[] audio, int batch_size, int n_bins, int n_frames)

private native int appendAudioInput(byte[] audio, int batch_size, int n_bins, int n_frames);

/**
 * Prefill a multimodal Module with preprocessed audio supplied as floats.
 *
 * @param audio Preprocessed audio input as a float array
 * @param batch_size Batch size of the input
 * @param n_bins Number of bins in the input
 * @param n_frames Number of frames in the input
 * @return always 0; the updated starting position in the LLM's KV cache is no longer exposed to
 *     the caller.
 * @throws RuntimeException if the native call reports a non-zero status code
 */
@Experimental
public long prefillAudio(float[] audio, int batch_size, int n_bins, int n_frames) {
  final int status = appendAudioInputFloat(audio, batch_size, n_bins, n_frames);
  if (status == 0) {
    return 0;
  }
  throw new RuntimeException("Prefill failed with error code: " + status);
}

/**
 * Native counterpart of {@link #prefillAudio(float[], int, int, int)}.
 *
 * @return a status code; 0 on success, non-zero on failure
 */
private native int appendAudioInputFloat(float[] audio, int batch_size, int n_bins, int n_frames);

/**
* Prefill a multimodal Module with the given raw audio input.
*
Expand Down
35 changes: 35 additions & 0 deletions extension/android/jni/jni_layer_llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,13 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
.seq_len = seq_len,
.temperature = temperature_,
};
for (const auto& input : inputs) {
ET_LOG(
Error,
"Prefill input: %s",
input.to_string().c_str());
}

multi_modal_runner_->generate(
std::move(inputs),
config,
Expand Down Expand Up @@ -325,6 +332,32 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
return 0;
}

// Copies a Java float[] of preprocessed audio into an llm::Audio and queues it
// on prefill_inputs_.
//
// data:       Java float array holding the audio values.
// batch_size: forwarded unchanged to llm::Audio.
// n_bins:     forwarded unchanged to llm::Audio.
// n_frames:   forwarded unchanged to llm::Audio.
//
// Returns status_code: Error::EndOfMethod when `data` is null, otherwise 0.
// An empty array is accepted and queues nothing.
jint append_audio_input_float(
    facebook::jni::alias_ref<jfloatArray> data,
    jint batch_size,
    jint n_bins,
    jint n_frames) {
  if (data == nullptr) {
    return static_cast<jint>(Error::EndOfMethod);
  }

  const auto data_size = data->size();
  if (data_size == 0) {
    // Nothing to queue; treated as a successful no-op.
    return 0;
  }

  // Pull the Java array into native memory, then convert to the float vector
  // that llm::Audio takes ownership of.
  std::vector<jfloat> data_jfloat(data_size);
  data->getRegion(0, data_size, data_jfloat.data());
  std::vector<float> data_f(data_jfloat.begin(), data_jfloat.end());

  // Diagnostic probe of a few values well inside the buffer. It is emitted
  // only when the buffer is long enough: indexing unconditionally would read
  // out of bounds for shorter inputs.
  constexpr size_t kProbeOffset = 10000;
  constexpr size_t kProbeCount = 5;
  if (data_f.size() >= kProbeOffset + kProbeCount) {
    ET_LOG(
        Info,
        "data_f[%zu..%zu]: %f, %f, %f, %f, %f",
        kProbeOffset,
        kProbeOffset + kProbeCount - 1,
        data_f[kProbeOffset],
        data_f[kProbeOffset + 1],
        data_f[kProbeOffset + 2],
        data_f[kProbeOffset + 3],
        data_f[kProbeOffset + 4]);
  }

  llm::Audio audio{std::move(data_f), batch_size, n_bins, n_frames};
  prefill_inputs_.emplace_back(llm::MultimodalInput{std::move(audio)});
  return 0;
}

// Returns status_code
jint append_raw_audio_input(
facebook::jni::alias_ref<jbyteArray> data,
Expand Down Expand Up @@ -388,6 +421,8 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
ExecuTorchLlmJni::append_normalized_images_input),
makeNativeMethod(
"appendAudioInput", ExecuTorchLlmJni::append_audio_input),
makeNativeMethod(
"appendAudioInputFloat", ExecuTorchLlmJni::append_audio_input_float),
makeNativeMethod(
"appendRawAudioInput", ExecuTorchLlmJni::append_raw_audio_input),
makeNativeMethod(
Expand Down
20 changes: 20 additions & 0 deletions extension/llm/runner/audio.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ struct ET_EXPERIMENTAL RawAudio {
int32_t batch_size;
int32_t n_channels; // For mono, use n_channels = 1.
int32_t n_samples;

// Returns a one-line summary of the form
// "RawAudio: <batch_size>x<n_channels>x<n_samples>".
std::string to_string() const {
  std::string summary = "RawAudio: ";
  summary += std::to_string(batch_size);
  summary += "x";
  summary += std::to_string(n_channels);
  summary += "x";
  summary += std::to_string(n_samples);
  return summary;
}
};

/**
Expand Down Expand Up @@ -146,6 +151,21 @@ class ET_EXPERIMENTAL Audio final {
return ::executorch::runtime::Error::NotSupported;
}

// Returns a one-line summary of the form
// "Audio: <type> data, batch_size: B, n_bins: N, n_frames: F", where <type>
// is "uint8_t", "float", or "unknown" depending on the stored element type.
std::string to_string() const {
  const char* element_type = "unknown";
  if (is_uint8()) {
    element_type = "uint8_t";
  } else if (is_float()) {
    element_type = "float";
  }

  std::string summary = "Audio: ";
  summary += element_type;
  summary += " data, batch_size: " + std::to_string(get_batch_size());
  summary += ", n_bins: " + std::to_string(get_n_bins());
  summary += ", n_frames: " + std::to_string(get_n_frames());
  return summary;
}

private:
// Members
std::variant<std::vector<uint8_t>, std::vector<float>> data_;
Expand Down
15 changes: 15 additions & 0 deletions extension/llm/runner/image.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,21 @@ class ET_EXPERIMENTAL Image {
return ::executorch::runtime::Error::NotSupported;
}

// Returns a one-line summary of the form
// "Image: <type> data, width: W, height: H, channels: C", where <type> is
// "uint8_t", "float", or "unknown" depending on the stored element type.
//
// NOTE(review): a reviewer asked for a string stream here to avoid copying.
// Appending each piece to `result` in place avoids the intermediate
// concatenation temporaries without adding a new header dependency.
std::string to_string() const {
  std::string result = "Image: ";
  if (is_uint8()) {
    result += "uint8_t";
  } else if (is_float()) {
    result += "float";
  } else {
    result += "unknown";
  }
  // Separate the element type from the dimensions; without this the output
  // ran together as e.g. "Image: uint8_twidth: ...".
  result += " data, width: ";
  result += std::to_string(width_);
  result += ", height: ";
  result += std::to_string(height_);
  result += ", channels: ";
  result += std::to_string(channels_);
  return result;
}

private:
// Assuming NCHW format
std::variant<std::vector<uint8_t>, std::vector<float>> data_;
Expand Down
14 changes: 14 additions & 0 deletions extension/llm/runner/multimodal_input.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,20 @@ class ET_EXPERIMENTAL MultimodalInput {
// Destructor
~MultimodalInput() = default;

// Returns a short human-readable description of the held input: the quoted
// text for text inputs, otherwise whatever the contained image / audio /
// raw-audio object's own to_string() reports.
//
// NOTE(review): this is declared noexcept but builds std::string values; an
// allocation failure here would call std::terminate rather than propagate.
std::string to_string() const noexcept {
  if (is_text()) {
    return "Text: \"" + get_text() + "\"";
  }
  if (is_image()) {
    return get_image().to_string();
  }
  if (is_audio()) {
    return get_audio().to_string();
  }
  if (is_raw_audio()) {
    return get_raw_audio().to_string();
  }
  return "Unsupported input type";
}

/**
* Check if this input contains text data.
* @return true if this input contains text, false otherwise.
Expand Down
Loading