From 38049a0002cad33903bedc5006fab408e32da656 Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Wed, 12 Nov 2025 09:33:15 -0800
Subject: [PATCH 1/4] Add a CUDA memory tracker and use it in voxtral runner

---
 backends/cuda/runtime/memory_tracker.h | 136 +++++++++++++++++++++++++
 examples/models/voxtral/multimodal.cpp |  11 ++
 2 files changed, 147 insertions(+)
 create mode 100644 backends/cuda/runtime/memory_tracker.h

diff --git a/backends/cuda/runtime/memory_tracker.h b/backends/cuda/runtime/memory_tracker.h
new file mode 100644
index 00000000000..1bed7da9374
--- /dev/null
+++ b/backends/cuda/runtime/memory_tracker.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <cuda_runtime.h>
+#include <limits>
+
+#include <executorch/runtime/platform/log.h>
+
+namespace executorch::backends::cuda {
+
+/**
+ * @class CudaMemoryTracker
+ * @brief Tracks CUDA memory usage and logs memory state at key points
+ *
+ * This class provides utilities to query and track CUDA memory usage,
+ * including peak memory usage and detailed memory state logging.
+ */
+class CudaMemoryTracker {
+ public:
+  /**
+   * @brief Constructor - initializes tracker and logs startup memory state
+   */
+  CudaMemoryTracker() {
+    if (!query(&last_free_bytes_, &total_bytes_)) {
+      return;
+    }
+    available_ = true;
+    min_free_bytes_ = last_free_bytes_;
+    log_state("startup", last_free_bytes_, total_bytes_);
+  }
+
+  /**
+   * @brief Logs current memory state at a tagged checkpoint
+   * @param tag Descriptive tag for this memory sample (e.g., "after_load")
+   */
+  void log_sample(const char* tag) {
+    if (!available_) {
+      return;
+    }
+    size_t free_bytes = 0;
+    size_t total_bytes = 0;
+    if (!query(&free_bytes, &total_bytes)) {
+      return;
+    }
+    min_free_bytes_ = std::min(min_free_bytes_, free_bytes);
+    total_bytes_ = total_bytes;
+    last_free_bytes_ = free_bytes;
+    log_state(tag, free_bytes, total_bytes);
+  }
+
+  /**
+   * @brief Destructor - logs final memory state and peak usage summary
+   */
+  ~CudaMemoryTracker() {
+    if (!available_) {
+      return;
+    }
+    size_t free_bytes = 0;
+    size_t total_bytes = 0;
+    if (!query(&free_bytes, &total_bytes)) {
+      return;
+    }
+    min_free_bytes_ = std::min(min_free_bytes_, free_bytes);
+    total_bytes_ = total_bytes;
+    last_free_bytes_ = free_bytes;
+    const double peak_mb =
+        static_cast<double>(total_bytes_ - min_free_bytes_) / (1024.0 * 1024.0);
+    const double total_mb =
+        static_cast<double>(total_bytes_) / (1024.0 * 1024.0);
+    ET_LOG(
+        Info,
+        "CUDA memory peak usage: %.2f MB, total: %.2f MB",
+        peak_mb,
+        total_mb);
+  }
+
+ private:
+  /**
+   * @brief Queries current CUDA memory info
+   * @param free_bytes Output parameter for free memory in bytes
+   * @param total_bytes Output parameter for total memory in bytes
+   * @return true if query succeeded, false otherwise
+   */
+  bool query(size_t* free_bytes, size_t* total_bytes) {
+    cudaError_t err = cudaMemGetInfo(free_bytes, total_bytes);
+    if (err != cudaSuccess) {
+      if (!error_logged_) {
+        error_logged_ = true;
+        ET_LOG(
+            Error,
+            "cudaMemGetInfo failed with error: %s",
+            cudaGetErrorString(err));
+      }
+      available_ = false;
+      return false;
+    }
+    return true;
+  }
+
+  /**
+   * @brief Logs the current memory state
+   * @param tag Tag describing this log point
+   * @param free_bytes Current free memory in bytes
+   * @param total_bytes Current total memory in bytes
+   */
+  void log_state(const char* tag, size_t free_bytes, size_t total_bytes)
+      const {
+    const double used_mb =
+        static_cast<double>(total_bytes - free_bytes) / (1024.0 * 1024.0);
+    const double free_mb = static_cast<double>(free_bytes) / (1024.0 * 1024.0);
+    const double total_mb =
+        static_cast<double>(total_bytes) / (1024.0 * 1024.0);
+    ET_LOG(
+        Info,
+        "CUDA memory (%s): used %.2f MB, free %.2f MB, total %.2f MB",
+        tag,
+        used_mb,
+        free_mb,
+        total_mb);
+  }
+
+  bool available_{false};
+  bool error_logged_{false};
+  size_t last_free_bytes_{0};
+  size_t total_bytes_{0};
+  size_t min_free_bytes_{std::numeric_limits<size_t>::max()};
+};
+
+} // namespace executorch::backends::cuda
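The intended call pattern for the tracker: construct it once (the constructor logs a "startup" sample), call log_sample() at checkpoints of interest, and let the destructor print the peak summary. An illustrative sketch, not part of the patch; it assumes a CUDA-capable machine with the header above on the include path:

    // Hypothetical standalone probe built around the new tracker.
    #include <executorch/backends/cuda/runtime/memory_tracker.h>

    #include <cuda_runtime.h>

    int main() {
      // Constructor queries cudaMemGetInfo and logs the "startup" sample.
      executorch::backends::cuda::CudaMemoryTracker tracker;

      // Simulate a workload allocation so the observed peak moves.
      void* buf = nullptr;
      if (cudaMalloc(&buf, 256 << 20) == cudaSuccess) { // 256 MB
        tracker.log_sample("after_alloc");
        cudaFree(buf);
      }
      tracker.log_sample("after_free");
      return 0; // Destructor takes one more reading and logs the peak summary.
    }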
diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp
index 29edf955751..ac3c7e054b0 100644
--- a/examples/models/voxtral/multimodal.cpp
+++ b/examples/models/voxtral/multimodal.cpp
@@ -25,6 +25,8 @@
 #include
 #include
 
+#include <executorch/backends/cuda/runtime/memory_tracker.h>
+
 #if defined(ET_USE_THREADPOOL)
 #include <executorch/extension/threadpool/cpuinfo_utils.h>
 #include <executorch/extension/threadpool/threadpool.h>
@@ -296,6 +298,9 @@ int32_t main(int32_t argc, char** argv) {
   int32_t cpu_threads = FLAGS_cpu_threads;
   bool warmup = FLAGS_warmup;
 
+  // Initialize memory tracker
+  ::executorch::backends::cuda::CudaMemoryTracker mem_tracker;
+
 #if defined(ET_USE_THREADPOOL)
   uint32_t num_performant_cores = cpu_threads == -1
       ? ::executorch::extension::cpuinfo::get_num_performant_cores()
@@ -332,6 +337,9 @@ int32_t main(int32_t argc, char** argv) {
     return 1;
   }
 
+  // Probe CUDA memory after loading model
+  mem_tracker.log_sample("after_load");
+
   // Prepare inputs
   std::vector<MultimodalInput> inputs = {
       make_text_input("[INST][BEGIN_AUDIO]"),
@@ -362,6 +370,9 @@ int32_t main(int32_t argc, char** argv) {
     return 1;
   }
 
+  // Probe CUDA memory after generation
+  mem_tracker.log_sample("after_generate");
+
   printf("\n");
   return 0;
 }
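The arithmetic behind these probes is simple: every sample logs used = total - free from cudaMemGetInfo, and the destructor's peak is total minus the smallest free value ever observed. A worked example with made-up readings (in MB, not real device numbers):

    #include <cstdio>

    int main() {
      // Hypothetical cudaMemGetInfo readings, converted to MB.
      const double total = 24576.0; // 24 GB device
      const double free_samples[] = {
          20480.0, // startup
          16384.0, // after_load
          14336.0, // after_generate
      };
      double min_free = free_samples[0];
      for (double f : free_samples) {
        std::printf("used %.2f MB, free %.2f MB\n", total - f, f);
        if (f < min_free) {
          min_free = f;
        }
      }
      // The destructor's summary: absolute peak across all samples.
      std::printf("peak usage: %.2f MB\n", total - min_free); // 10240.00
      return 0;
    }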
From 383b91f41bf98d9a32e2130f8ae50c1575a3e77c Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Wed, 12 Nov 2025 09:49:37 -0800
Subject: [PATCH 2/4] Define CUDA_AVAILABLE when building CUDA backend; wire
 GPU stats into runner

---
 backends/cuda/runtime/memory_tracker.h     | 58 ++++++++++++++++---
 examples/models/voxtral/multimodal.cpp     | 11 ----
 extension/llm/runner/CMakeLists.txt        | 15 +++++
 extension/llm/runner/multimodal_runner.cpp | 31 +++++++++-
 extension/llm/runner/multimodal_runner.h   |  9 +++
 extension/llm/runner/stats.h               | 66 +++++++++++++++++++++-
 6 files changed, 168 insertions(+), 22 deletions(-)

diff --git a/backends/cuda/runtime/memory_tracker.h b/backends/cuda/runtime/memory_tracker.h
index 1bed7da9374..8defa26e441 100644
--- a/backends/cuda/runtime/memory_tracker.h
+++ b/backends/cuda/runtime/memory_tracker.h
@@ -33,6 +33,11 @@ class CudaMemoryTracker {
       return;
     }
     available_ = true;
+    // Record the initial free bytes observed at startup. We'll use this as a
+    // baseline so reported "peak usage" reflects additional memory used
+    // since the tracker was created (instead of the absolute device usage,
+    // which may include other processes).
+    initial_free_bytes_ = last_free_bytes_;
     min_free_bytes_ = last_free_bytes_;
     log_state("startup", last_free_bytes_, total_bytes_);
   }
@@ -71,15 +76,22 @@
     min_free_bytes_ = std::min(min_free_bytes_, free_bytes);
     total_bytes_ = total_bytes;
     last_free_bytes_ = free_bytes;
-    const double peak_mb =
-        static_cast<double>(total_bytes_ - min_free_bytes_) / (1024.0 * 1024.0);
-    const double total_mb =
-        static_cast<double>(total_bytes_) / (1024.0 * 1024.0);
-    ET_LOG(
-        Info,
-        "CUDA memory peak usage: %.2f MB, total: %.2f MB",
-        peak_mb,
-        total_mb);
+    // Compute peak usage relative to the initial free baseline so that
+    // allocations by other processes present at startup are not attributed
+    // to this process. If for some reason initial_free_bytes_ was not set,
+    // fall back to absolute device usage.
+    double peak_mb = 0.0;
+    if (initial_free_bytes_ != std::numeric_limits<size_t>::max()) {
+      size_t used_delta = 0;
+      if (initial_free_bytes_ > min_free_bytes_) {
+        used_delta = initial_free_bytes_ - min_free_bytes_;
+      }
+      peak_mb = static_cast<double>(used_delta) / (1024.0 * 1024.0);
+    } else {
+      peak_mb = static_cast<double>(total_bytes_ - min_free_bytes_) / (1024.0 * 1024.0);
+    }
+    const double total_mb = static_cast<double>(total_bytes_) / (1024.0 * 1024.0);
+    ET_LOG(Info, "CUDA memory peak usage (since startup): %.2f MB, device total: %.2f MB", peak_mb, total_mb);
   }
 
  private:
@@ -131,6 +143,34 @@ class CudaMemoryTracker {
   size_t last_free_bytes_{0};
   size_t total_bytes_{0};
   size_t min_free_bytes_{std::numeric_limits<size_t>::max()};
+  // Baseline free bytes observed at tracker construction. Used to compute
+  // peak usage attributable to this process since the tracker started.
+  size_t initial_free_bytes_{std::numeric_limits<size_t>::max()};
+ public:
+  // Simple accessors to allow other components to read last-sampled values.
+  // These are safe to call after a successful log_sample() invocation.
+  uint64_t last_free_bytes() const { return static_cast<uint64_t>(last_free_bytes_); }
+  uint64_t total_bytes() const { return static_cast<uint64_t>(total_bytes_); }
+  uint64_t min_free_bytes() const { return static_cast<uint64_t>(min_free_bytes_); }
+  uint64_t initial_free_bytes() const { return static_cast<uint64_t>(initial_free_bytes_); }
+  double peak_usage_mb() const {
+    // Prefer peak relative to the initial free baseline; fall back to
+    // absolute device peak if baseline isn't available.
+    if (min_free_bytes_ == std::numeric_limits<size_t>::max()) {
+      return 0.0;
+    }
+    if (initial_free_bytes_ != std::numeric_limits<size_t>::max()) {
+      size_t used_delta = 0;
+      if (initial_free_bytes_ > min_free_bytes_) {
+        used_delta = initial_free_bytes_ - min_free_bytes_;
+      }
+      return static_cast<double>(used_delta) / (1024.0 * 1024.0);
+    }
+    if (total_bytes_ == 0) {
+      return 0.0;
+    }
+    return static_cast<double>(total_bytes_ - min_free_bytes_) / (1024.0 * 1024.0);
+  }
 };
 
 } // namespace executorch::backends::cuda
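To see what the baseline buys, continue the made-up numbers from the earlier sketch and suppose another process already held 4 GB when the tracker was constructed. The absolute computation from patch 1 would attribute that memory to us; the baseline-relative peak does not:

    #include <cstdio>

    int main() {
      const double total = 24576.0;        // MB; 24 GB device
      const double initial_free = 20480.0; // baseline recorded at startup
      const double min_free = 14336.0;     // lowest free value observed
      // Absolute device peak (patch 1 semantics): counts everyone.
      std::printf("absolute peak: %.2f MB\n", total - min_free); // 10240.00
      // Baseline-relative peak (this patch): growth since startup only.
      std::printf("peak since startup: %.2f MB\n",
                  initial_free - min_free); // 6144.00
      return 0;
    }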
diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp
index ac3c7e054b0..29edf955751 100644
--- a/examples/models/voxtral/multimodal.cpp
+++ b/examples/models/voxtral/multimodal.cpp
@@ -25,8 +25,6 @@
 #include
 #include
 
-#include <executorch/backends/cuda/runtime/memory_tracker.h>
-
 #if defined(ET_USE_THREADPOOL)
 #include <executorch/extension/threadpool/cpuinfo_utils.h>
 #include <executorch/extension/threadpool/threadpool.h>
@@ -298,9 +296,6 @@ int32_t main(int32_t argc, char** argv) {
   int32_t cpu_threads = FLAGS_cpu_threads;
   bool warmup = FLAGS_warmup;
 
-  // Initialize memory tracker
-  ::executorch::backends::cuda::CudaMemoryTracker mem_tracker;
-
 #if defined(ET_USE_THREADPOOL)
   uint32_t num_performant_cores = cpu_threads == -1
       ? ::executorch::extension::cpuinfo::get_num_performant_cores()
@@ -337,9 +332,6 @@ int32_t main(int32_t argc, char** argv) {
     return 1;
   }
 
-  // Probe CUDA memory after loading model
-  mem_tracker.log_sample("after_load");
-
   // Prepare inputs
   std::vector<MultimodalInput> inputs = {
       make_text_input("[INST][BEGIN_AUDIO]"),
@@ -370,9 +362,6 @@ int32_t main(int32_t argc, char** argv) {
     return 1;
   }
 
-  // Probe CUDA memory after generation
-  mem_tracker.log_sample("after_generate");
-
   printf("\n");
   return 0;
 }
diff --git a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt
index 8d280b4eaf9..00f7866483e 100644
--- a/extension/llm/runner/CMakeLists.txt
+++ b/extension/llm/runner/CMakeLists.txt
@@ -55,6 +55,21 @@ target_include_directories(
   extension_llm_runner INTERFACE ${_common_include_directories}
 )
 
+# If the project is configured to build with CUDA support, try to find a
+# CUDA runtime (prefer the CUDAToolkit package). If found, expose a
+# compile-time macro so sources can conditionally compile CUDA-aware code.
+if(EXECUTORCH_BUILD_CUDA)
+  # Prefer the modern CMake CUDAToolkit module, fall back to searching for
+  # the CUDA runtime library (cudart) if the package isn't available.
+  find_package(CUDAToolkit QUIET)
+  if(CUDAToolkit_FOUND)
+    target_compile_definitions(extension_llm_runner PUBLIC CUDA_AVAILABLE)
+    message(STATUS "CUDAToolkit found; defining CUDA_AVAILABLE")
+  else()
+    message(STATUS "CUDA requested (EXECUTORCH_BUILD_CUDA=ON) but no CUDA runtime found")
+  endif()
+endif()
+
 install(
   TARGETS extension_llm_runner
   EXPORT ExecuTorchTargets
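Because the CUDA_AVAILABLE definition is PUBLIC, every target that links extension_llm_runner inherits it, so CUDA-aware members can be compiled in or out with a plain preprocessor gate. A sketch of the pattern (the runner changes below use exactly this shape):

    // Gate enabled by the PUBLIC CUDA_AVAILABLE compile definition.
    #ifdef CUDA_AVAILABLE
    #include <executorch/backends/cuda/runtime/memory_tracker.h>
    #endif

    void probe(const char* tag) {
    #ifdef CUDA_AVAILABLE
      // CUDA build: keep one tracker alive for the process and sample it.
      static executorch::backends::cuda::CudaMemoryTracker tracker;
      tracker.log_sample(tag);
    #else
      (void)tag; // CPU-only build: this compiles away to nothing.
    #endif
    }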
diff --git a/extension/llm/runner/multimodal_runner.cpp b/extension/llm/runner/multimodal_runner.cpp
index 047ca27ee2b..92e50d21c67 100644
--- a/extension/llm/runner/multimodal_runner.cpp
+++ b/extension/llm/runner/multimodal_runner.cpp
@@ -15,6 +15,10 @@
 #include
 #include
 
+#ifdef CUDA_AVAILABLE
+#include <executorch/backends/cuda/runtime/memory_tracker.h>
+#endif
+
 namespace executorch::extension::llm {
 
 using ::executorch::extension::Module;
@@ -38,7 +42,16 @@ MultimodalRunner::MultimodalRunner(
       io_manager_(std::move(io_manager)),
       text_token_generator_(std::move(text_token_generator)),
       stats_(std::move(stats)),
-      pos_(0) {}
+      pos_(0) {
+#ifdef CUDA_AVAILABLE
+  cuda_memory_tracker_ =
+      std::make_unique<::executorch::backends::cuda::CudaMemoryTracker>();
+  // Probe immediately after creating the tracker to capture GPU state before
+  // any model loading happens.
+  stats_->gpu_total_bytes = cuda_memory_tracker_->total_bytes();
+  stats_->gpu_free_before_load_bytes = cuda_memory_tracker_->last_free_bytes();
+#endif
+}
 
 bool MultimodalRunner::is_loaded() {
   return multimodal_prefiller_->is_method_loaded() &&
@@ -51,6 +64,14 @@ Error MultimodalRunner::load() {
   }
   ET_CHECK_OK_OR_RETURN_ERROR(multimodal_prefiller_->load());
   ET_CHECK_OK_OR_RETURN_ERROR(text_token_generator_->load());
+
+#ifdef CUDA_AVAILABLE
+   cuda_memory_tracker_->log_sample("after_load");
+   stats_->gpu_total_bytes = cuda_memory_tracker_->total_bytes();
+   stats_->gpu_free_after_load_bytes = cuda_memory_tracker_->last_free_bytes();
+   stats_->gpu_peak_usage_mb = cuda_memory_tracker_->peak_usage_mb();
+#endif
+
   return Error::Ok;
 }
@@ -192,6 +213,14 @@ Error MultimodalRunner::generate(
   stats_->num_generated_tokens = num_generated_tokens;
   // Finalize stats and call callback
   stats_->inference_end_ms = time_in_ms();
+
+#ifdef CUDA_AVAILABLE
+  cuda_memory_tracker_->log_sample("after_generate");
+  stats_->gpu_free_after_generate_bytes = cuda_memory_tracker_->last_free_bytes();
+  // update peak in case it changed after generation
+  stats_->gpu_peak_usage_mb = cuda_memory_tracker_->peak_usage_mb();
+#endif
+
   if (!config.warming) {
     printf("\n");
   }
diff --git a/extension/llm/runner/multimodal_runner.h b/extension/llm/runner/multimodal_runner.h
index caf3c296038..b34b7b05ce7 100644
--- a/extension/llm/runner/multimodal_runner.h
+++ b/extension/llm/runner/multimodal_runner.h
@@ -36,6 +36,10 @@
 // These are provided for backward compatibility
 #include
+
+#ifdef CUDA_AVAILABLE
+#include <executorch/backends/cuda/runtime/memory_tracker.h>
+#endif
 
 namespace executorch {
 namespace extension {
 namespace llm {
@@ -150,6 +154,11 @@ class ET_EXPERIMENTAL MultimodalRunner {
   std::unique_ptr<TextTokenGenerator> text_token_generator_;
   std::unique_ptr<Stats> stats_;
 
+#ifdef CUDA_AVAILABLE
+  std::unique_ptr<::executorch::backends::cuda::CudaMemoryTracker>
+      cuda_memory_tracker_;
+#endif
+
   // Internal state
   int64_t pos_;
 };
diff --git a/extension/llm/runner/stats.h b/extension/llm/runner/stats.h
index 19766329ed3..03120fb8f4c 100644
--- a/extension/llm/runner/stats.h
+++ b/extension/llm/runner/stats.h
@@ -49,6 +49,14 @@ struct ET_EXPERIMENTAL Stats {
   int64_t num_prompt_tokens;
   // Token count from generated (total - prompt)
   int64_t num_generated_tokens;
+  // GPU memory stats (optional). These use sentinel values of UINT64_MAX
+  // (for the byte counters) and -1.0 (for the MB gauge) to indicate
+  // "not available".
+  uint64_t gpu_total_bytes = static_cast<uint64_t>(-1);
+  uint64_t gpu_free_before_load_bytes = static_cast<uint64_t>(-1);
+  uint64_t gpu_free_after_load_bytes = static_cast<uint64_t>(-1);
+  uint64_t gpu_free_after_generate_bytes = static_cast<uint64_t>(-1);
+  double gpu_peak_usage_mb = -1.0;
   inline void on_sampling_begin() {
     aggregate_sampling_timer_start_timestamp = time_in_ms();
   }
@@ -75,6 +83,11 @@ struct ET_EXPERIMENTAL Stats {
     aggregate_sampling_time_ms = 0;
     num_prompt_tokens = 0;
     num_generated_tokens = 0;
+    gpu_total_bytes = static_cast<uint64_t>(-1);
+    gpu_free_before_load_bytes = static_cast<uint64_t>(-1); 
+    gpu_free_after_load_bytes = static_cast<uint64_t>(-1);
+    gpu_free_after_generate_bytes = static_cast<uint64_t>(-1);
+    gpu_peak_usage_mb = -1.0;
     aggregate_sampling_timer_start_timestamp = 0;
   }
@@ -93,7 +106,29 @@ inline std::string stats_to_json_string(const Stats& stats) {
      << "\"prompt_eval_end_ms\":" << stats.prompt_eval_end_ms << ","
      << "\"first_token_ms\":" << stats.first_token_ms << ","
      << "\"aggregate_sampling_time_ms\":" << stats.aggregate_sampling_time_ms
-     << "," << "\"SCALING_FACTOR_UNITS_PER_SECOND\":"
+      << ",";
+  // Only include GPU fields in the JSON if gpu_total_bytes is valid (not
+  // equal to sentinel -1)
+  if (stats.gpu_total_bytes != static_cast<uint64_t>(-1)) {
+    ss << "\"gpu_total_bytes\":" << stats.gpu_total_bytes;
+    if (stats.gpu_free_before_load_bytes != static_cast<uint64_t>(-1)) {
+      ss << ",\"gpu_free_before_load_bytes\":"
+         << stats.gpu_free_before_load_bytes;
+    }
+    if (stats.gpu_free_after_load_bytes != static_cast<uint64_t>(-1)) {
+      ss << ",\"gpu_free_after_load_bytes\":"
+         << stats.gpu_free_after_load_bytes;
+    }
+    if (stats.gpu_free_after_generate_bytes != static_cast<uint64_t>(-1)) {
+      ss << ",\"gpu_free_after_generate_bytes\":"
+         << stats.gpu_free_after_generate_bytes;
+    }
+    if (stats.gpu_peak_usage_mb >= 0.0) {
+      ss << ",\"gpu_peak_usage_mb\":" << stats.gpu_peak_usage_mb;
+    }
+    ss << ",";
+  }
+  ss << "\"SCALING_FACTOR_UNITS_PER_SECOND\":"
      << stats.SCALING_FACTOR_UNITS_PER_SECOND << "}";
   return ss.str();
 }
@@ -156,6 +191,35 @@ inline void print_report(const Stats& stats) {
       stats.num_prompt_tokens + stats.num_generated_tokens,
       (double)stats.aggregate_sampling_time_ms /
           stats.SCALING_FACTOR_UNITS_PER_SECOND);
+
+  // GPU memory reporting (only meaningful if GPU fields were populated)
+  if (stats.gpu_total_bytes != static_cast<uint64_t>(-1)) {
+    ET_LOG(
+        Info,
+        "\tGPU total memory: %.2f MB",
+        stats.gpu_total_bytes / 1024.0 / 1024.0);
+    if (stats.gpu_free_before_load_bytes != static_cast<uint64_t>(-1)) {
+      ET_LOG(
+          Info,
+          "\tGPU free before load: %.2f MB",
+          stats.gpu_free_before_load_bytes / 1024.0 / 1024.0);
+    }
+    if (stats.gpu_free_after_load_bytes != static_cast<uint64_t>(-1)) {
+      ET_LOG(
+          Info,
+          "\tGPU free after load: %.2f MB",
+          stats.gpu_free_after_load_bytes / 1024.0 / 1024.0);
+    }
+    if (stats.gpu_free_after_generate_bytes != static_cast<uint64_t>(-1)) {
+      ET_LOG(
+          Info,
+          "\tGPU free after generate: %.2f MB",
+          stats.gpu_free_after_generate_bytes / 1024.0 / 1024.0);
+    }
+    if (stats.gpu_peak_usage_mb >= 0.0) {
+      ET_LOG(Info, "\tGPU peak usage: %.2f MB", stats.gpu_peak_usage_mb);
+    }
+  }
 }
 
 } // namespace llm
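Consumers of Stats should treat the GPU fields as optional and test the sentinel before formatting, which is what stats_to_json_string() and print_report() now do. A pared-down sketch of the consuming side; StatsLike is a stand-in, not the real struct from extension/llm/runner/stats.h:

    #include <cstdint>
    #include <cstdio>

    struct StatsLike {
      // Same sentinel convention as Stats: UINT64_MAX / -1.0 = unavailable.
      uint64_t gpu_total_bytes = static_cast<uint64_t>(-1);
      double gpu_peak_usage_mb = -1.0;
    };

    void report_gpu(const StatsLike& s) {
      if (s.gpu_total_bytes == static_cast<uint64_t>(-1)) {
        std::printf("GPU stats unavailable (CPU-only build or failed query)\n");
        return;
      }
      std::printf("GPU total: %.2f MB\n", s.gpu_total_bytes / 1024.0 / 1024.0);
      if (s.gpu_peak_usage_mb >= 0.0) {
        std::printf("GPU peak: %.2f MB\n", s.gpu_peak_usage_mb);
      }
    }

    int main() {
      StatsLike s; // defaults: everything unavailable
      report_gpu(s);
      s.gpu_total_bytes = 24576ull << 20; // pretend 24 GB was reported
      s.gpu_peak_usage_mb = 6144.0;
      report_gpu(s);
      return 0;
    }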
From 56f89976488b817a9a95be652cc6a7cd84279d3c Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Wed, 12 Nov 2025 11:54:49 -0800
Subject: [PATCH 3/4] Fix load time calculation

---
 backends/cuda/runtime/memory_tracker.h     | 32 ++++++++++++++++------
 extension/llm/runner/CMakeLists.txt        | 15 ++++++----
 extension/llm/runner/multimodal_runner.cpp | 15 +++++-----
 extension/llm/runner/stats.h               |  4 +--
 4 files changed, 43 insertions(+), 23 deletions(-)

diff --git a/backends/cuda/runtime/memory_tracker.h b/backends/cuda/runtime/memory_tracker.h
index 8defa26e441..e09a96da6a6 100644
--- a/backends/cuda/runtime/memory_tracker.h
+++ b/backends/cuda/runtime/memory_tracker.h
@@ -88,10 +88,16 @@ class CudaMemoryTracker {
       }
       peak_mb = static_cast<double>(used_delta) / (1024.0 * 1024.0);
     } else {
-      peak_mb = static_cast<double>(total_bytes_ - min_free_bytes_) / (1024.0 * 1024.0);
+      peak_mb = static_cast<double>(total_bytes_ - min_free_bytes_) /
+          (1024.0 * 1024.0);
     }
-    const double total_mb = static_cast<double>(total_bytes_) / (1024.0 * 1024.0);
-    ET_LOG(Info, "CUDA memory peak usage (since startup): %.2f MB, device total: %.2f MB", peak_mb, total_mb);
+    const double total_mb =
+        static_cast<double>(total_bytes_) / (1024.0 * 1024.0);
+    ET_LOG(
+        Info,
+        "CUDA memory peak usage (since startup): %.2f MB, device total: %.2f MB",
+        peak_mb,
+        total_mb);
   }
 
  private:
@@ -146,13 +152,22 @@ class CudaMemoryTracker {
   // Baseline free bytes observed at tracker construction. Used to compute
   // peak usage attributable to this process since the tracker started.
   size_t initial_free_bytes_{std::numeric_limits<size_t>::max()};
+
  public:
   // Simple accessors to allow other components to read last-sampled values.
   // These are safe to call after a successful log_sample() invocation.
-  uint64_t last_free_bytes() const { return static_cast<uint64_t>(last_free_bytes_); }
-  uint64_t total_bytes() const { return static_cast<uint64_t>(total_bytes_); }
-  uint64_t min_free_bytes() const { return static_cast<uint64_t>(min_free_bytes_); }
-  uint64_t initial_free_bytes() const { return static_cast<uint64_t>(initial_free_bytes_); }
+  uint64_t last_free_bytes() const {
+    return static_cast<uint64_t>(last_free_bytes_);
+  }
+  uint64_t total_bytes() const {
+    return static_cast<uint64_t>(total_bytes_);
+  }
+  uint64_t min_free_bytes() const {
+    return static_cast<uint64_t>(min_free_bytes_);
+  }
+  uint64_t initial_free_bytes() const {
+    return static_cast<uint64_t>(initial_free_bytes_);
+  }
   double peak_usage_mb() const {
     // Prefer peak relative to the initial free baseline; fall back to
     // absolute device peak if baseline isn't available.
@@ -169,7 +184,8 @@ class CudaMemoryTracker {
     if (total_bytes_ == 0) {
       return 0.0;
     }
-    return static_cast<double>(total_bytes_ - min_free_bytes_) / (1024.0 * 1024.0);
+    return static_cast<double>(total_bytes_ - min_free_bytes_) /
+        (1024.0 * 1024.0);
   }
 };
 
diff --git a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt
index 00f7866483e..de77c21636f 100644
--- a/extension/llm/runner/CMakeLists.txt
+++ b/extension/llm/runner/CMakeLists.txt
@@ -55,18 +55,21 @@ target_include_directories(
   extension_llm_runner INTERFACE ${_common_include_directories}
 )
 
-# If the project is configured to build with CUDA support, try to find a
-# CUDA runtime (prefer the CUDAToolkit package). If found, expose a
-# compile-time macro so sources can conditionally compile CUDA-aware code.
+# If the project is configured to build with CUDA support, try to find a CUDA
+# runtime (prefer the CUDAToolkit package). If found, expose a compile-time
+# macro so sources can conditionally compile CUDA-aware code.
 if(EXECUTORCH_BUILD_CUDA)
-  # Prefer the modern CMake CUDAToolkit module, fall back to searching for
-  # the CUDA runtime library (cudart) if the package isn't available.
+  # Prefer the modern CMake CUDAToolkit module, fall back to searching for the
+  # CUDA runtime library (cudart) if the package isn't available.
   find_package(CUDAToolkit QUIET)
   if(CUDAToolkit_FOUND)
     target_compile_definitions(extension_llm_runner PUBLIC CUDA_AVAILABLE)
     message(STATUS "CUDAToolkit found; defining CUDA_AVAILABLE")
   else()
-    message(STATUS "CUDA requested (EXECUTORCH_BUILD_CUDA=ON) but no CUDA runtime found")
+    message(
+      STATUS
+        "CUDA requested (EXECUTORCH_BUILD_CUDA=ON) but no CUDA runtime found"
+    )
   endif()
 endif()
diff --git a/extension/llm/runner/multimodal_runner.cpp b/extension/llm/runner/multimodal_runner.cpp
index 92e50d21c67..5c0c1e658a7 100644
--- a/extension/llm/runner/multimodal_runner.cpp
+++ b/extension/llm/runner/multimodal_runner.cpp
@@ -62,14 +62,16 @@ Error MultimodalRunner::load() {
   if (is_loaded()) {
     return Error::Ok;
   }
+  stats_->model_load_start_ms = time_in_ms();
   ET_CHECK_OK_OR_RETURN_ERROR(multimodal_prefiller_->load());
   ET_CHECK_OK_OR_RETURN_ERROR(text_token_generator_->load());
+  stats_->model_load_end_ms = time_in_ms();
 
 #ifdef CUDA_AVAILABLE
-   cuda_memory_tracker_->log_sample("after_load");
-   stats_->gpu_total_bytes = cuda_memory_tracker_->total_bytes();
-   stats_->gpu_free_after_load_bytes = cuda_memory_tracker_->last_free_bytes();
-   stats_->gpu_peak_usage_mb = cuda_memory_tracker_->peak_usage_mb();
+  cuda_memory_tracker_->log_sample("after_load");
+  stats_->gpu_total_bytes = cuda_memory_tracker_->total_bytes();
+  stats_->gpu_free_after_load_bytes = cuda_memory_tracker_->last_free_bytes();
+  stats_->gpu_peak_usage_mb = cuda_memory_tracker_->peak_usage_mb();
 #endif
 
   return Error::Ok;
@@ -107,9 +109,7 @@ Error MultimodalRunner::generate(
   }
 
   if (!is_loaded()) {
-    stats_->model_load_start_ms = time_in_ms();
     ET_CHECK_OK_OR_RETURN_ERROR(load());
-    stats_->model_load_end_ms = time_in_ms();
   }
 
   if (config.warming) {
@@ -216,7 +216,8 @@ Error MultimodalRunner::generate(
 
 #ifdef CUDA_AVAILABLE
   cuda_memory_tracker_->log_sample("after_generate");
-  stats_->gpu_free_after_generate_bytes = cuda_memory_tracker_->last_free_bytes();
+  stats_->gpu_free_after_generate_bytes =
+      cuda_memory_tracker_->last_free_bytes();
   // update peak in case it changed after generation
   stats_->gpu_peak_usage_mb = cuda_memory_tracker_->peak_usage_mb();
 #endif
diff --git a/extension/llm/runner/stats.h b/extension/llm/runner/stats.h
index 03120fb8f4c..e7b2af4492e 100644
--- a/extension/llm/runner/stats.h
+++ b/extension/llm/runner/stats.h
@@ -84,7 +84,7 @@ struct ET_EXPERIMENTAL Stats {
     num_prompt_tokens = 0;
     num_generated_tokens = 0;
     gpu_total_bytes = static_cast<uint64_t>(-1);
-    gpu_free_before_load_bytes = static_cast<uint64_t>(-1); 
+    gpu_free_before_load_bytes = static_cast<uint64_t>(-1);
     gpu_free_after_load_bytes = static_cast<uint64_t>(-1);
     gpu_free_after_generate_bytes = static_cast<uint64_t>(-1);
     gpu_peak_usage_mb = -1.0;
@@ -106,7 +106,7 @@ inline std::string stats_to_json_string(const Stats& stats) {
      << "\"prompt_eval_end_ms\":" << stats.prompt_eval_end_ms << ","
      << "\"first_token_ms\":" << stats.first_token_ms << ","
      << "\"aggregate_sampling_time_ms\":" << stats.aggregate_sampling_time_ms
-      << ",";
+     << ",";
   // Only include GPU fields in the JSON if gpu_total_bytes is valid (not
   // equal to sentinel -1)
   if (stats.gpu_total_bytes != static_cast<uint64_t>(-1)) {
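The load-time fix above is about covering both call paths: previously model_load_start_ms/model_load_end_ms were stamped only around the lazy load() inside generate(), so a caller that invoked load() explicitly got a zero load time. Stamping inside load() itself, after the is_loaded() early return, times both paths without double counting. A sketch of the shape, with time_in_ms() standing in for the runner's helper:

    #include <chrono>
    #include <cstdint>

    // Stand-in for the runner's timing helper.
    static int64_t time_in_ms() {
      using namespace std::chrono;
      return duration_cast<milliseconds>(
                 steady_clock::now().time_since_epoch())
          .count();
    }

    struct RunnerSketch {
      bool loaded = false;
      int64_t model_load_start_ms = 0;
      int64_t model_load_end_ms = 0;

      void load() {
        if (loaded) {
          return; // Already loaded: earlier timestamps stay intact.
        }
        model_load_start_ms = time_in_ms();
        // ... load prefiller and token generator here ...
        loaded = true;
        model_load_end_ms = time_in_ms();
      }

      void generate() {
        if (!loaded) {
          load(); // Lazy path is timed identically to an explicit load().
        }
        // ... prefill and decode ...
      }
    };

    int main() {
      RunnerSketch runner;
      runner.load();     // explicit load: now measured
      runner.generate(); // early return prevents re-stamping
      return 0;
    }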
From 59e6a655a0facab796b2a9d6a4e27b85c19ab38b Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Wed, 12 Nov 2025 14:10:03 -0800
Subject: [PATCH 4/4] Link cudart to extension_llm_runner

---
 extension/llm/runner/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt
index de77c21636f..6a2c1989922 100644
--- a/extension/llm/runner/CMakeLists.txt
+++ b/extension/llm/runner/CMakeLists.txt
@@ -64,6 +64,7 @@ if(EXECUTORCH_BUILD_CUDA)
   find_package(CUDAToolkit QUIET)
   if(CUDAToolkit_FOUND)
     target_compile_definitions(extension_llm_runner PUBLIC CUDA_AVAILABLE)
+    target_link_libraries(extension_llm_runner PUBLIC CUDA::cudart)
     message(STATUS "CUDAToolkit found; defining CUDA_AVAILABLE")
   else()
     message(
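The link line is needed because memory_tracker.h is header-only but calls into the CUDA runtime: cudaMemGetInfo and cudaGetErrorString live in libcudart, so any target that compiles the tracker with CUDA_AVAILABLE defined must link CUDA::cudart, and making the dependency PUBLIC lets dependents of extension_llm_runner inherit it. A minimal repro of the link-time requirement (assumes the CUDA toolkit is installed):

    // Compiles against the CUDA headers alone, but fails at link time
    // without libcudart, where cudaMemGetInfo is defined.
    #include <cuda_runtime.h>

    #include <cstddef>
    #include <cstdio>

    int main() {
      size_t free_bytes = 0;
      size_t total_bytes = 0;
      cudaError_t err = cudaMemGetInfo(&free_bytes, &total_bytes);
      if (err != cudaSuccess) {
        std::printf("query failed: %s\n", cudaGetErrorString(err));
        return 1;
      }
      std::printf("free: %zu bytes, total: %zu bytes\n", free_bytes, total_bytes);
      return 0;
    }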