31 changes: 31 additions & 0 deletions backends/cuda/tests/multimodal_benchmark.cpp
@@ -17,6 +17,9 @@
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
#include <executorch/runtime/core/portable_type/tensor.h>

#include <cuda_runtime.h>
#include <nvml.h>

namespace {

using executorch::aten::ScalarType;
@@ -201,8 +204,21 @@ TensorPtr create_fallback_text_embedding(const ModelConfig& config) {
struct MethodTiming {
double load_ms{0.0};
double run_ms{0.0};
size_t peak_gpu_memory_bytes{0};
};

size_t get_gpu_memory_used() {
size_t free_bytes = 0;
size_t total_bytes = 0;
cudaError_t status = cudaMemGetInfo(&free_bytes, &total_bytes);
if (status != cudaSuccess) {
std::cerr << "Warning: cudaMemGetInfo failed: "
<< cudaGetErrorString(status) << std::endl;
return 0;
}
return total_bytes - free_bytes;
}
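
> Note on semantics, with a minimal standalone sketch (not part of this PR): `cudaMemGetInfo` reports instantaneous free/total bytes for the whole device, so this helper returns current device-wide usage, including allocations from other processes, and the before/after deltas recorded below can miss short-lived allocations freed before the second query. The `peak_gpu_memory_bytes` field is therefore a net delta rather than a true peak. The sketch assumes a single process on device 0:

```cpp
// Minimal standalone sketch of the same cudaMemGetInfo delta pattern.
#include <cstdio>

#include <cuda_runtime.h>

static size_t device_memory_used() {
  size_t free_bytes = 0;
  size_t total_bytes = 0;
  if (cudaMemGetInfo(&free_bytes, &total_bytes) != cudaSuccess) {
    return 0;  // Unknown; callers will see a zero delta.
  }
  return total_bytes - free_bytes;
}

int main() {
  cudaFree(nullptr);  // Force context creation so it is not counted below.
  size_t before = device_memory_used();

  void* buffer = nullptr;
  cudaMalloc(&buffer, 64 << 20);  // 64 MiB; the driver may round this up.
  cudaDeviceSynchronize();

  size_t after = device_memory_used();
  std::printf("delta: %.2f MB\n", (after - before) / (1024.0 * 1024.0));

  cudaFree(buffer);
  return 0;
}
```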

enum class MethodCategory { ENCODER, TOKEN_EMBEDDING, TEXT_DECODER, UNKNOWN };

MethodCategory categorize_method(const std::string& method_name) {
@@ -306,6 +322,9 @@ Error execute_method(
std::vector<EValue> inputs = create_inputs_for_method(
method_name, category, model_type, config, token_output, owned_inputs);

cudaDeviceSynchronize();
size_t mem_before = get_gpu_memory_used();

const auto run_start = Clock::now();
ET_LOG(Info, "%s running", method_name.c_str());
Result<std::vector<EValue>> output_result =
@@ -314,6 +333,11 @@
const auto run_end = Clock::now();
timing.run_ms = DurationMs(run_end - run_start).count();

cudaDeviceSynchronize();
size_t mem_after = get_gpu_memory_used();
timing.peak_gpu_memory_bytes =
mem_after > mem_before ? (mem_after - mem_before) : 0;

if (output_result.error() != Error::Ok) {
std::cerr << method_name << " execution failed: error code "
<< static_cast<int>(output_result.error()) << std::endl;
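
> The synchronize-then-query bracketing around the run could be factored into a small RAII guard so each instrumented call site stays a single declaration. A hedged sketch (the `ScopedGpuMemDelta` name and its use are hypothetical, not part of this PR), reusing the `get_gpu_memory_used()` helper defined above:

```cpp
// Hypothetical RAII guard: records the net device-memory delta over its
// lifetime into the caller's output field.
struct ScopedGpuMemDelta {
  explicit ScopedGpuMemDelta(size_t& out) : out_(out) {
    cudaDeviceSynchronize();
    before_ = get_gpu_memory_used();
  }
  ~ScopedGpuMemDelta() {
    cudaDeviceSynchronize();
    size_t after = get_gpu_memory_used();
    out_ = after > before_ ? after - before_ : 0;
  }
  size_t& out_;
  size_t before_{0};
};

// Illustrative use around the run being measured:
//   {
//     ScopedGpuMemDelta probe(timing.peak_gpu_memory_bytes);
//     // ... execute the method ...
//   }
```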
@@ -457,6 +481,13 @@ int main(int argc, char** argv) {
std::cout << " " << name << ": " << timing.run_ms << std::endl;
}

std::cout << "\nPeak GPU memory usage:" << std::endl;
for (const auto& [name, timing] : timings) {
double memory_mb = timing.peak_gpu_memory_bytes / (1024.0 * 1024.0);
std::cout << " " << name << ": " << memory_mb << " MB ("
<< timing.peak_gpu_memory_bytes << " bytes)" << std::endl;
}

return 0;
} catch (const std::exception& ex) {
std::cerr << "Unhandled exception: " << ex.what() << std::endl;
83 changes: 83 additions & 0 deletions examples/models/gemma3/e2e_runner.cpp
@@ -23,6 +23,8 @@
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/platform/log.h>

#include <cuda_runtime.h>

#define STB_IMAGE_IMPLEMENTATION
#include <stb_image.h>
#define STB_IMAGE_RESIZE_IMPLEMENTATION
@@ -67,6 +69,20 @@ using ::executorch::extension::llm::make_text_input;
using ::executorch::extension::llm::MultimodalInput;
using ::executorch::runtime::EValue;

size_t get_gpu_memory_used() {
size_t free_bytes = 0;
size_t total_bytes = 0;
cudaError_t status = cudaMemGetInfo(&free_bytes, &total_bytes);
if (status != cudaSuccess) {
ET_LOG(
Error,
"Warning: cudaMemGetInfo failed: %s",
cudaGetErrorString(status));
return 0;
}
return total_bytes - free_bytes;
}
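
> This helper duplicates the one added to multimodal_benchmark.cpp. A hedged consolidation sketch, with the header name `cuda_memory_utils.h` assumed rather than taken from this PR:

```cpp
// Hypothetical shared header (e.g. cuda_memory_utils.h) so the benchmark
// and the runner use a single definition of the query.
#pragma once

#include <cstddef>

#include <cuda_runtime.h>

inline size_t cuda_memory_used_bytes() {
  size_t free_bytes = 0;
  size_t total_bytes = 0;
  if (cudaMemGetInfo(&free_bytes, &total_bytes) != cudaSuccess) {
    return 0;  // Unknown; callers treat zero as "no data".
  }
  return total_bytes - free_bytes;
}
```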

bool ends_with(const std::string& str, const std::string& suffix) {
return str.size() >= suffix.size() &&
str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
@@ -200,13 +216,29 @@ int32_t main(int32_t argc, char** argv) {
return 1;
}

// Measure memory before loading
cudaDeviceSynchronize();
size_t mem_before_load = get_gpu_memory_used();
ET_LOG(
Info,
"GPU memory before loading: %.2f MB",
mem_before_load / (1024.0 * 1024.0));

// Load runner
auto load_error = runner->load();
if (load_error != ::executorch::runtime::Error::Ok) {
ET_LOG(Error, "Failed to load multimodal runner");
return 1;
}

// Measure memory after loading
cudaDeviceSynchronize();
size_t mem_after_load = get_gpu_memory_used();
ET_LOG(
Info,
"GPU memory after loading: %.2f MB",
mem_after_load / (1024.0 * 1024.0));

// Prepare inputs
std::vector<MultimodalInput> inputs = {
make_text_input("<start_of_turn>user\n<start_of_image>"),
@@ -230,13 +262,64 @@
runner->reset();
}

// Measure memory before generation
cudaDeviceSynchronize();
size_t mem_before_gen = get_gpu_memory_used();

auto error = runner->generate(inputs, config);

if (error != ::executorch::runtime::Error::Ok) {
ET_LOG(Error, "Failed to generate with multimodal runner\n");
return 1;
}

// Measure memory after generation
cudaDeviceSynchronize();
size_t mem_after_gen = get_gpu_memory_used();

ET_LOG(Info, "Generated successfully");

// Calculate and print memory usage statistics
size_t load_memory = mem_after_load - mem_before_load;
size_t gen_memory =
mem_after_gen > mem_before_gen ? (mem_after_gen - mem_before_gen) : 0;
size_t total_memory = mem_after_gen - mem_before_load;
size_t peak_memory = mem_after_gen;

std::printf("\n=== CUDA Memory Usage Statistics ===\n");
std::printf(
"Memory before loading: %.2f MB (%zu bytes)\n",
mem_before_load / (1024.0 * 1024.0),
mem_before_load);
std::printf(
"Memory after loading: %.2f MB (%zu bytes)\n",
mem_after_load / (1024.0 * 1024.0),
mem_after_load);
std::printf(
"Memory consumed by loading: %.2f MB (%zu bytes)\n",
load_memory / (1024.0 * 1024.0),
load_memory);
std::printf(
"Memory before generation: %.2f MB (%zu bytes)\n",
mem_before_gen / (1024.0 * 1024.0),
mem_before_gen);
std::printf(
"Memory after generation: %.2f MB (%zu bytes)\n",
mem_after_gen / (1024.0 * 1024.0),
mem_after_gen);
std::printf(
"Memory consumed by generation: %.2f MB (%zu bytes)\n",
gen_memory / (1024.0 * 1024.0),
gen_memory);
std::printf(
"Total memory consumed: %.2f MB (%zu bytes)\n",
total_memory / (1024.0 * 1024.0),
total_memory);
std::printf(
"Peak GPU memory used: %.2f MB (%zu bytes)\n",
peak_memory / (1024.0 * 1024.0),
peak_memory);
std::printf("====================================\n\n");

return 0;
}
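
> The printed "peak" is the usage snapshot after generation completes, so a transient spike inside generation that is freed before the final query goes unrecorded. A hedged sketch of one way to approximate the true peak (class and member names hypothetical, not part of this PR): poll `cudaMemGetInfo` from a background thread while the model runs; the polling interval bounds how short-lived a spike can be and still be observed.

```cpp
// Hypothetical background sampler for approximating true peak usage.
#include <atomic>
#include <chrono>
#include <cstddef>
#include <thread>

#include <cuda_runtime.h>

class GpuPeakSampler {
 public:
  GpuPeakSampler() {
    worker_ = std::thread([this] {
      while (!stop_.load(std::memory_order_relaxed)) {
        size_t free_bytes = 0;
        size_t total_bytes = 0;
        if (cudaMemGetInfo(&free_bytes, &total_bytes) == cudaSuccess) {
          size_t used = total_bytes - free_bytes;
          size_t prev = peak_.load(std::memory_order_relaxed);
          // Lock-free max update: retry until a larger value is published.
          while (used > prev && !peak_.compare_exchange_weak(prev, used)) {
          }
        }
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
      }
    });
  }
  ~GpuPeakSampler() {
    stop_.store(true);
    worker_.join();
  }
  size_t peak_bytes() const { return peak_.load(); }

 private:
  std::atomic<bool> stop_{false};
  std::atomic<size_t> peak_{0};
  std::thread worker_;
};

// Usage: construct before runner->generate(inputs, config) and read
// sampler.peak_bytes() after it returns.
```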