From 38049a0002cad33903bedc5006fab408e32da656 Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Wed, 12 Nov 2025 09:33:15 -0800
Subject: [PATCH 1/4] Add a CUDA memory tracker and use it in voxtral runner

---
 backends/cuda/runtime/memory_tracker.h | 136 +++++++++++++++++++++++++
 examples/models/voxtral/multimodal.cpp |  11 ++
 2 files changed, 147 insertions(+)
 create mode 100644 backends/cuda/runtime/memory_tracker.h

diff --git a/backends/cuda/runtime/memory_tracker.h b/backends/cuda/runtime/memory_tracker.h
new file mode 100644
index 00000000000..1bed7da9374
--- /dev/null
+++ b/backends/cuda/runtime/memory_tracker.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <cuda_runtime.h>
+#include <limits>
+
+#include <executorch/runtime/platform/log.h>
+
+namespace executorch::backends::cuda {
+
+/**
+ * @class CudaMemoryTracker
+ * @brief Tracks CUDA memory usage and logs memory state at key points
+ *
+ * This class provides utilities to query and track CUDA memory usage,
+ * including peak memory usage and detailed memory state logging.
+ */
+class CudaMemoryTracker {
+ public:
+  /**
+   * @brief Constructor - initializes tracker and logs startup memory state
+   */
+  CudaMemoryTracker() {
+    if (!query(&last_free_bytes_, &total_bytes_)) {
+      return;
+    }
+    available_ = true;
+    min_free_bytes_ = last_free_bytes_;
+    log_state("startup", last_free_bytes_, total_bytes_);
+  }
+
+  /**
+   * @brief Logs current memory state at a tagged checkpoint
+   * @param tag Descriptive tag for this memory sample (e.g., "after_load")
+   */
+  void log_sample(const char* tag) {
+    if (!available_) {
+      return;
+    }
+    size_t free_bytes = 0;
+    size_t total_bytes = 0;
+    if (!query(&free_bytes, &total_bytes)) {
+      return;
+    }
+    min_free_bytes_ = std::min(min_free_bytes_, free_bytes);
+    total_bytes_ = total_bytes;
+    last_free_bytes_ = free_bytes;
+    log_state(tag, free_bytes, total_bytes);
+  }
+
+  /**
+   * @brief Destructor - logs final memory state and peak usage summary
+   */
+  ~CudaMemoryTracker() {
+    if (!available_) {
+      return;
+    }
+    size_t free_bytes = 0;
+    size_t total_bytes = 0;
+    if (!query(&free_bytes, &total_bytes)) {
+      return;
+    }
+    min_free_bytes_ = std::min(min_free_bytes_, free_bytes);
+    total_bytes_ = total_bytes;
+    last_free_bytes_ = free_bytes;
+    const double peak_mb =
+        static_cast<double>(total_bytes_ - min_free_bytes_) / (1024.0 * 1024.0);
+    const double total_mb =
+        static_cast<double>(total_bytes_) / (1024.0 * 1024.0);
+    ET_LOG(
+        Info,
+        "CUDA memory peak usage: %.2f MB, total: %.2f MB",
+        peak_mb,
+        total_mb);
+  }
+
+ private:
+  /**
+   * @brief Queries current CUDA memory info
+   * @param free_bytes Output parameter for free memory in bytes
+   * @param total_bytes Output parameter for total memory in bytes
+   * @return true if query succeeded, false otherwise
+   */
+  bool query(size_t* free_bytes, size_t* total_bytes) {
+    cudaError_t err = cudaMemGetInfo(free_bytes, total_bytes);
+    if (err != cudaSuccess) {
+      if (!error_logged_) {
+        error_logged_ = true;
+        ET_LOG(
+            Error,
+            "cudaMemGetInfo failed with error: %s",
+            cudaGetErrorString(err));
+      }
+      available_ = false;
+      return false;
+    }
+    return true;
+  }
+
+  /**
+   * @brief Logs the current memory state
+   * @param tag Tag describing this log point
+   * @param free_bytes Current free memory in bytes
+   * @param total_bytes Current total memory in bytes
+   */
+  void log_state(const char* tag, size_t free_bytes, size_t total_bytes)
+      const {
+    const double used_mb =
+        static_cast<double>(total_bytes - free_bytes) / (1024.0 * 1024.0);
+    const double free_mb = static_cast<double>(free_bytes) / (1024.0 * 1024.0);
+    const double total_mb =
+        static_cast<double>(total_bytes) / (1024.0 * 1024.0);
+    ET_LOG(
+        Info,
+        "CUDA memory (%s): used %.2f MB, free %.2f MB, total %.2f MB",
+        tag,
+        used_mb,
+        free_mb,
+        total_mb);
+  }
+
+  bool available_{false};
+  bool error_logged_{false};
+  size_t last_free_bytes_{0};
+  size_t total_bytes_{0};
+  size_t min_free_bytes_{std::numeric_limits<size_t>::max()};
+};
+
+} // namespace executorch::backends::cuda
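The intended call pattern for the tracker: construct it once (the constructor logs a "startup" sample), call log_sample() at checkpoints of interest, and let the destructor print the peak summary. An illustrative sketch, not part of the patch; it assumes a CUDA-capable machine with the header above on the include path:

    // Hypothetical standalone probe built around the new tracker.
    #include <executorch/backends/cuda/runtime/memory_tracker.h>

    #include <cuda_runtime.h>

    int main() {
      // Constructor queries cudaMemGetInfo and logs the "startup" sample.
      executorch::backends::cuda::CudaMemoryTracker tracker;

      // Simulate a workload allocation so the observed peak moves.
      void* buf = nullptr;
      if (cudaMalloc(&buf, 256 << 20) == cudaSuccess) { // 256 MB
        tracker.log_sample("after_alloc");
        cudaFree(buf);
      }
      tracker.log_sample("after_free");
      return 0; // Destructor takes one more reading and logs the peak summary.
    }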
diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp
index 29edf955751..ac3c7e054b0 100644
--- a/examples/models/voxtral/multimodal.cpp
+++ b/examples/models/voxtral/multimodal.cpp
@@ -25,6 +25,8 @@
 #include
 #include
 
+#include <executorch/backends/cuda/runtime/memory_tracker.h>
+
 #if defined(ET_USE_THREADPOOL)
 #include <executorch/extension/threadpool/cpuinfo_utils.h>
 #include <executorch/extension/threadpool/threadpool.h>
@@ -296,6 +298,9 @@ int32_t main(int32_t argc, char** argv) {
   int32_t cpu_threads = FLAGS_cpu_threads;
   bool warmup = FLAGS_warmup;
 
+  // Initialize memory tracker
+  ::executorch::backends::cuda::CudaMemoryTracker mem_tracker;
+
 #if defined(ET_USE_THREADPOOL)
   uint32_t num_performant_cores = cpu_threads == -1
       ? ::executorch::extension::cpuinfo::get_num_performant_cores()
@@ -332,6 +337,9 @@ int32_t main(int32_t argc, char** argv) {
     return 1;
   }
 
+  // Probe CUDA memory after loading model
+  mem_tracker.log_sample("after_load");
+
   // Prepare inputs
   std::vector<MultimodalInput> inputs = {
       make_text_input("[INST][BEGIN_AUDIO]"),
@@ -362,6 +370,9 @@ int32_t main(int32_t argc, char** argv) {
     return 1;
   }
 
+  // Probe CUDA memory after generation
+  mem_tracker.log_sample("after_generate");
+
   printf("\n");
   return 0;
 }
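The arithmetic behind these probes is simple: every sample logs used = total - free from cudaMemGetInfo, and the destructor's peak is total minus the smallest free value ever observed. A worked example with made-up readings (in MB, not real device numbers):

    #include <cstdio>

    int main() {
      // Hypothetical cudaMemGetInfo readings, converted to MB.
      const double total = 24576.0; // 24 GB device
      const double free_samples[] = {
          20480.0, // startup
          16384.0, // after_load
          14336.0, // after_generate
      };
      double min_free = free_samples[0];
      for (double f : free_samples) {
        std::printf("used %.2f MB, free %.2f MB\n", total - f, f);
        if (f < min_free) {
          min_free = f;
        }
      }
      // The destructor's summary: absolute peak across all samples.
      std::printf("peak usage: %.2f MB\n", total - min_free); // 10240.00
      return 0;
    }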
From 383b91f41bf98d9a32e2130f8ae50c1575a3e77c Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Wed, 12 Nov 2025 09:49:37 -0800
Subject: [PATCH 2/4] Define CUDA_AVAILABLE when building CUDA backend; wire
 GPU stats into runner

---
 backends/cuda/runtime/memory_tracker.h     | 58 ++++++++++++++++---
 examples/models/voxtral/multimodal.cpp     | 11 ----
 extension/llm/runner/CMakeLists.txt        | 15 +++++
 extension/llm/runner/multimodal_runner.cpp | 31 +++++++++-
 extension/llm/runner/multimodal_runner.h   |  9 +++
 extension/llm/runner/stats.h               | 66 +++++++++++++++++++++-
 6 files changed, 168 insertions(+), 22 deletions(-)

diff --git a/backends/cuda/runtime/memory_tracker.h b/backends/cuda/runtime/memory_tracker.h
index 1bed7da9374..8defa26e441 100644
--- a/backends/cuda/runtime/memory_tracker.h
+++ b/backends/cuda/runtime/memory_tracker.h
@@ -33,6 +33,11 @@ class CudaMemoryTracker {
       return;
     }
     available_ = true;
+    // Record the initial free bytes observed at startup. We'll use this as a
+    // baseline so reported "peak usage" reflects additional memory used
+    // since the tracker was created (instead of the absolute device usage,
+    // which may include other processes).
+    initial_free_bytes_ = last_free_bytes_;
     min_free_bytes_ = last_free_bytes_;
     log_state("startup", last_free_bytes_, total_bytes_);
   }
@@ -71,15 +76,22 @@
     min_free_bytes_ = std::min(min_free_bytes_, free_bytes);
     total_bytes_ = total_bytes;
     last_free_bytes_ = free_bytes;
-    const double peak_mb =
-        static_cast<double>(total_bytes_ - min_free_bytes_) / (1024.0 * 1024.0);
-    const double total_mb =
-        static_cast<double>(total_bytes_) / (1024.0 * 1024.0);
-    ET_LOG(
-        Info,
-        "CUDA memory peak usage: %.2f MB, total: %.2f MB",
-        peak_mb,
-        total_mb);
+    // Compute peak usage relative to the initial free baseline so that
+    // allocations by other processes present at startup are not attributed
+    // to this process. If for some reason initial_free_bytes_ was not set,
+    // fall back to absolute device usage.
+    double peak_mb = 0.0;
+    if (initial_free_bytes_ != std::numeric_limits<size_t>::max()) {
+      size_t used_delta = 0;
+      if (initial_free_bytes_ > min_free_bytes_) {
+        used_delta = initial_free_bytes_ - min_free_bytes_;
+      }
+      peak_mb = static_cast<double>(used_delta) / (1024.0 * 1024.0);
+    } else {
+      peak_mb = static_cast<double>(total_bytes_ - min_free_bytes_) / (1024.0 * 1024.0);
+    }
+    const double total_mb = static_cast<double>(total_bytes_) / (1024.0 * 1024.0);
+    ET_LOG(Info, "CUDA memory peak usage (since startup): %.2f MB, device total: %.2f MB", peak_mb, total_mb);
   }
 
  private:
@@ -131,6 +143,34 @@ class CudaMemoryTracker {
   size_t last_free_bytes_{0};
   size_t total_bytes_{0};
   size_t min_free_bytes_{std::numeric_limits<size_t>::max()};
+  // Baseline free bytes observed at tracker construction. Used to compute
+  // peak usage attributable to this process since the tracker started.
+  size_t initial_free_bytes_{std::numeric_limits<size_t>::max()};
+ public:
+  // Simple accessors to allow other components to read last-sampled values.
+  // These are safe to call after a successful log_sample() invocation.
+  uint64_t last_free_bytes() const { return static_cast<uint64_t>(last_free_bytes_); }
+  uint64_t total_bytes() const { return static_cast<uint64_t>(total_bytes_); }
+  uint64_t min_free_bytes() const { return static_cast<uint64_t>(min_free_bytes_); }
+  uint64_t initial_free_bytes() const { return static_cast<uint64_t>(initial_free_bytes_); }
+  double peak_usage_mb() const {
+    // Prefer peak relative to the initial free baseline; fall back to
+    // absolute device peak if baseline isn't available.
+    if (min_free_bytes_ == std::numeric_limits<size_t>::max()) {
+      return 0.0;
+    }
+    if (initial_free_bytes_ != std::numeric_limits<size_t>::max()) {
+      size_t used_delta = 0;
+      if (initial_free_bytes_ > min_free_bytes_) {
+        used_delta = initial_free_bytes_ - min_free_bytes_;
+      }
+      return static_cast<double>(used_delta) / (1024.0 * 1024.0);
+    }
+    if (total_bytes_ == 0) {
+      return 0.0;
+    }
+    return static_cast<double>(total_bytes_ - min_free_bytes_) / (1024.0 * 1024.0);
+  }
 };
 
 } // namespace executorch::backends::cuda
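To see what the baseline buys, continue the made-up numbers from the earlier sketch and suppose another process already held 4 GB when the tracker was constructed. The absolute computation from patch 1 would attribute that memory to us; the baseline-relative peak does not:

    #include <cstdio>

    int main() {
      const double total = 24576.0;        // MB; 24 GB device
      const double initial_free = 20480.0; // baseline recorded at startup
      const double min_free = 14336.0;     // lowest free value observed
      // Absolute device peak (patch 1 semantics): counts everyone.
      std::printf("absolute peak: %.2f MB\n", total - min_free); // 10240.00
      // Baseline-relative peak (this patch): growth since startup only.
      std::printf("peak since startup: %.2f MB\n",
                  initial_free - min_free); // 6144.00
      return 0;
    }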
diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp
index ac3c7e054b0..29edf955751 100644
--- a/examples/models/voxtral/multimodal.cpp
+++ b/examples/models/voxtral/multimodal.cpp
@@ -25,8 +25,6 @@
 #include
 #include
 
-#include <executorch/backends/cuda/runtime/memory_tracker.h>
-
 #if defined(ET_USE_THREADPOOL)
 #include <executorch/extension/threadpool/cpuinfo_utils.h>
 #include <executorch/extension/threadpool/threadpool.h>
@@ -298,9 +296,6 @@ int32_t main(int32_t argc, char** argv) {
   int32_t cpu_threads = FLAGS_cpu_threads;
   bool warmup = FLAGS_warmup;
 
-  // Initialize memory tracker
-  ::executorch::backends::cuda::CudaMemoryTracker mem_tracker;
-
 #if defined(ET_USE_THREADPOOL)
   uint32_t num_performant_cores = cpu_threads == -1
       ? ::executorch::extension::cpuinfo::get_num_performant_cores()
@@ -337,9 +332,6 @@ int32_t main(int32_t argc, char** argv) {
     return 1;
   }
 
-  // Probe CUDA memory after loading model
-  mem_tracker.log_sample("after_load");
-
   // Prepare inputs
   std::vector<MultimodalInput> inputs = {
       make_text_input("[INST][BEGIN_AUDIO]"),
@@ -370,9 +362,6 @@ int32_t main(int32_t argc, char** argv) {
     return 1;
   }
 
-  // Probe CUDA memory after generation
-  mem_tracker.log_sample("after_generate");
-
   printf("\n");
   return 0;
 }
diff --git a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt
index 8d280b4eaf9..00f7866483e 100644
--- a/extension/llm/runner/CMakeLists.txt
+++ b/extension/llm/runner/CMakeLists.txt
@@ -55,6 +55,21 @@ target_include_directories(
   extension_llm_runner INTERFACE ${_common_include_directories}
 )
 
+# If the project is configured to build with CUDA support, try to find a
+# CUDA runtime (prefer the CUDAToolkit package). If found, expose a
+# compile-time macro so sources can conditionally compile CUDA-aware code.
+if(EXECUTORCH_BUILD_CUDA)
+  # Prefer the modern CMake CUDAToolkit module, fall back to searching for
+  # the CUDA runtime library (cudart) if the package isn't available.
+  find_package(CUDAToolkit QUIET)
+  if(CUDAToolkit_FOUND)
+    target_compile_definitions(extension_llm_runner PUBLIC CUDA_AVAILABLE)
+    message(STATUS "CUDAToolkit found; defining CUDA_AVAILABLE")
+  else()
+    message(STATUS "CUDA requested (EXECUTORCH_BUILD_CUDA=ON) but no CUDA runtime found")
+  endif()
+endif()
+
 install(
   TARGETS extension_llm_runner
   EXPORT ExecuTorchTargets
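Because the CUDA_AVAILABLE definition is PUBLIC, every target that links extension_llm_runner inherits it, so CUDA-aware members can be compiled in or out with a plain preprocessor gate. A sketch of the pattern (the runner changes below use exactly this shape):

    // Gate enabled by the PUBLIC CUDA_AVAILABLE compile definition.
    #ifdef CUDA_AVAILABLE
    #include <executorch/backends/cuda/runtime/memory_tracker.h>
    #endif

    void probe(const char* tag) {
    #ifdef CUDA_AVAILABLE
      // CUDA build: keep one tracker alive for the process and sample it.
      static executorch::backends::cuda::CudaMemoryTracker tracker;
      tracker.log_sample(tag);
    #else
      (void)tag; // CPU-only build: this compiles away to nothing.
    #endif
    }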
diff --git a/extension/llm/runner/multimodal_runner.cpp b/extension/llm/runner/multimodal_runner.cpp
index 047ca27ee2b..92e50d21c67 100644
--- a/extension/llm/runner/multimodal_runner.cpp
+++ b/extension/llm/runner/multimodal_runner.cpp
@@ -15,6 +15,10 @@
 #include
 #include
 
+#ifdef CUDA_AVAILABLE
+#include <executorch/backends/cuda/runtime/memory_tracker.h>
+#endif
+
 namespace executorch::extension::llm {
 
 using ::executorch::extension::Module;
@@ -38,7 +42,16 @@ MultimodalRunner::MultimodalRunner(
       io_manager_(std::move(io_manager)),
       text_token_generator_(std::move(text_token_generator)),
       stats_(std::move(stats)),
-      pos_(0) {}
+      pos_(0) {
+#ifdef CUDA_AVAILABLE
+  cuda_memory_tracker_ =
+      std::make_unique<::executorch::backends::cuda::CudaMemoryTracker>();
+  // Probe immediately after creating the tracker to capture GPU state before
+  // any model loading happens.
+  stats_->gpu_total_bytes = cuda_memory_tracker_->total_bytes();
+  stats_->gpu_free_before_load_bytes = cuda_memory_tracker_->last_free_bytes();
+#endif
+}
 
 bool MultimodalRunner::is_loaded() {
   return multimodal_prefiller_->is_method_loaded() &&
@@ -51,6 +64,14 @@ Error MultimodalRunner::load() {
   }
   ET_CHECK_OK_OR_RETURN_ERROR(multimodal_prefiller_->load());
   ET_CHECK_OK_OR_RETURN_ERROR(text_token_generator_->load());
+
+#ifdef CUDA_AVAILABLE
+   cuda_memory_tracker_->log_sample("after_load");
+   stats_->gpu_total_bytes = cuda_memory_tracker_->total_bytes();
+   stats_->gpu_free_after_load_bytes = cuda_memory_tracker_->last_free_bytes();
+   stats_->gpu_peak_usage_mb = cuda_memory_tracker_->peak_usage_mb();
+#endif
+
   return Error::Ok;
 }
@@ -192,6 +213,14 @@ Error MultimodalRunner::generate(
   stats_->num_generated_tokens = num_generated_tokens;
   // Finalize stats and call callback
   stats_->inference_end_ms = time_in_ms();
+
+#ifdef CUDA_AVAILABLE
+  cuda_memory_tracker_->log_sample("after_generate");
+  stats_->gpu_free_after_generate_bytes = cuda_memory_tracker_->last_free_bytes();
+  // update peak in case it changed after generation
+  stats_->gpu_peak_usage_mb = cuda_memory_tracker_->peak_usage_mb();
+#endif
+
   if (!config.warming) {
     printf("\n");
   }
diff --git a/extension/llm/runner/multimodal_runner.h b/extension/llm/runner/multimodal_runner.h
index caf3c296038..b34b7b05ce7 100644
--- a/extension/llm/runner/multimodal_runner.h
+++ b/extension/llm/runner/multimodal_runner.h
@@ -36,6 +36,10 @@
 // These are provided for backward compatibility
 #include
+
+#ifdef CUDA_AVAILABLE
+#include <executorch/backends/cuda/runtime/memory_tracker.h>
+#endif
 
 namespace executorch {
 namespace extension {
 namespace llm {
@@ -150,6 +154,11 @@ class ET_EXPERIMENTAL MultimodalRunner {
   std::unique_ptr<TextTokenGenerator> text_token_generator_;
   std::unique_ptr<Stats> stats_;
 
+#ifdef CUDA_AVAILABLE
+  std::unique_ptr<::executorch::backends::cuda::CudaMemoryTracker>
+      cuda_memory_tracker_;
+#endif
+
   // Internal state
   int64_t pos_;
 };
diff --git a/extension/llm/runner/stats.h b/extension/llm/runner/stats.h
index 19766329ed3..03120fb8f4c 100644
--- a/extension/llm/runner/stats.h
+++ b/extension/llm/runner/stats.h
@@ -49,6 +49,14 @@ struct ET_EXPERIMENTAL Stats {
   int64_t num_prompt_tokens;
   // Token count from generated (total - prompt)
   int64_t num_generated_tokens;
+  // GPU memory stats (optional). These use sentinel values of UINT64_MAX
+  // (for the byte counters) and -1.0 (for the MB gauge) to indicate
+  // "not available".
+  uint64_t gpu_total_bytes = static_cast<uint64_t>(-1);
+  uint64_t gpu_free_before_load_bytes = static_cast<uint64_t>(-1);
+  uint64_t gpu_free_after_load_bytes = static_cast<uint64_t>(-1);
+  uint64_t gpu_free_after_generate_bytes = static_cast<uint64_t>(-1);
+  double gpu_peak_usage_mb = -1.0;
   inline void on_sampling_begin() {
     aggregate_sampling_timer_start_timestamp = time_in_ms();
   }
@@ -75,6 +83,11 @@ struct ET_EXPERIMENTAL Stats {
     aggregate_sampling_time_ms = 0;
     num_prompt_tokens = 0;
     num_generated_tokens = 0;
+    gpu_total_bytes = static_cast<uint64_t>(-1);
+    gpu_free_before_load_bytes = static_cast<uint64_t>(-1); 
+    gpu_free_after_load_bytes = static_cast<uint64_t>(-1);
+    gpu_free_after_generate_bytes = static_cast<uint64_t>(-1);
+    gpu_peak_usage_mb = -1.0;
     aggregate_sampling_timer_start_timestamp = 0;
   }
@@ -93,7 +106,29 @@ inline std::string stats_to_json_string(const Stats& stats) {
      << "\"prompt_eval_end_ms\":" << stats.prompt_eval_end_ms << ","
      << "\"first_token_ms\":" << stats.first_token_ms << ","
      << "\"aggregate_sampling_time_ms\":" << stats.aggregate_sampling_time_ms
-     << "," << "\"SCALING_FACTOR_UNITS_PER_SECOND\":"
+      << ",";
+  // Only include GPU fields in the JSON if gpu_total_bytes is valid (not
+  // equal to sentinel -1)
+  if (stats.gpu_total_bytes != static_cast<uint64_t>(-1)) {
+    ss << "\"gpu_total_bytes\":" << stats.gpu_total_bytes;
+    if (stats.gpu_free_before_load_bytes != static_cast<uint64_t>(-1)) {
+      ss << ",\"gpu_free_before_load_bytes\":"
+         << stats.gpu_free_before_load_bytes;
+    }
+    if (stats.gpu_free_after_load_bytes != static_cast<uint64_t>(-1)) {
+      ss << ",\"gpu_free_after_load_bytes\":"
+         << stats.gpu_free_after_load_bytes;
+    }
+    if (stats.gpu_free_after_generate_bytes != static_cast<uint64_t>(-1)) {
+      ss << ",\"gpu_free_after_generate_bytes\":"
+         << stats.gpu_free_after_generate_bytes;
+    }
+    if (stats.gpu_peak_usage_mb >= 0.0) {
+      ss << ",\"gpu_peak_usage_mb\":" << stats.gpu_peak_usage_mb;
+    }
+    ss << ",";
+  }
+  ss << "\"SCALING_FACTOR_UNITS_PER_SECOND\":"
      << stats.SCALING_FACTOR_UNITS_PER_SECOND << "}";
   return ss.str();
 }
@@ -156,6 +191,35 @@ inline void print_report(const Stats& stats) {
       stats.num_prompt_tokens + stats.num_generated_tokens,
       (double)stats.aggregate_sampling_time_ms /
           stats.SCALING_FACTOR_UNITS_PER_SECOND);
+
+  // GPU memory reporting (only meaningful if GPU fields were populated)
+  if (stats.gpu_total_bytes != static_cast<uint64_t>(-1)) {
+    ET_LOG(
+        Info,
+        "\tGPU total memory: %.2f MB",
+        stats.gpu_total_bytes / 1024.0 / 1024.0);
+    if (stats.gpu_free_before_load_bytes != static_cast<uint64_t>(-1)) {
+      ET_LOG(
+          Info,
+          "\tGPU free before load: %.2f MB",
+          stats.gpu_free_before_load_bytes / 1024.0 / 1024.0);
+    }
+    if (stats.gpu_free_after_load_bytes != static_cast<uint64_t>(-1)) {
+      ET_LOG(
+          Info,
+          "\tGPU free after load: %.2f MB",
+          stats.gpu_free_after_load_bytes / 1024.0 / 1024.0);
+    }
+    if (stats.gpu_free_after_generate_bytes != static_cast<uint64_t>(-1)) {
+      ET_LOG(
+          Info,
+          "\tGPU free after generate: %.2f MB",
+          stats.gpu_free_after_generate_bytes / 1024.0 / 1024.0);
+    }
+    if (stats.gpu_peak_usage_mb >= 0.0) {
+      ET_LOG(Info, "\tGPU peak usage: %.2f MB", stats.gpu_peak_usage_mb);
+    }
+  }
 }
 
 } // namespace llm
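Consumers of Stats should treat the GPU fields as optional and test the sentinel before formatting, which is what stats_to_json_string() and print_report() now do. A pared-down sketch of the consuming side; StatsLike is a stand-in, not the real struct from extension/llm/runner/stats.h:

    #include <cstdint>
    #include <cstdio>

    struct StatsLike {
      // Same sentinel convention as Stats: UINT64_MAX / -1.0 = unavailable.
      uint64_t gpu_total_bytes = static_cast<uint64_t>(-1);
      double gpu_peak_usage_mb = -1.0;
    };

    void report_gpu(const StatsLike& s) {
      if (s.gpu_total_bytes == static_cast<uint64_t>(-1)) {
        std::printf("GPU stats unavailable (CPU-only build or failed query)\n");
        return;
      }
      std::printf("GPU total: %.2f MB\n", s.gpu_total_bytes / 1024.0 / 1024.0);
      if (s.gpu_peak_usage_mb >= 0.0) {
        std::printf("GPU peak: %.2f MB\n", s.gpu_peak_usage_mb);
      }
    }

    int main() {
      StatsLike s; // defaults: everything unavailable
      report_gpu(s);
      s.gpu_total_bytes = 24576ull << 20; // pretend 24 GB was reported
      s.gpu_peak_usage_mb = 6144.0;
      report_gpu(s);
      return 0;
    }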
From 56f89976488b817a9a95be652cc6a7cd84279d3c Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Wed, 12 Nov 2025 11:54:49 -0800
Subject: [PATCH 3/4] Fix load time calculation

---
 backends/cuda/runtime/memory_tracker.h     | 32 ++++++++++++++++------
 extension/llm/runner/CMakeLists.txt        | 15 ++++++----
 extension/llm/runner/multimodal_runner.cpp | 15 +++++-----
 extension/llm/runner/stats.h               |  4 +--
 4 files changed, 43 insertions(+), 23 deletions(-)

diff --git a/backends/cuda/runtime/memory_tracker.h b/backends/cuda/runtime/memory_tracker.h
index 8defa26e441..e09a96da6a6 100644
--- a/backends/cuda/runtime/memory_tracker.h
+++ b/backends/cuda/runtime/memory_tracker.h
@@ -88,10 +88,16 @@ class CudaMemoryTracker {
       }
       peak_mb = static_cast<double>(used_delta) / (1024.0 * 1024.0);
     } else {
-      peak_mb = static_cast<double>(total_bytes_ - min_free_bytes_) / (1024.0 * 1024.0);
+      peak_mb = static_cast<double>(total_bytes_ - min_free_bytes_) /
+          (1024.0 * 1024.0);
     }
-    const double total_mb = static_cast<double>(total_bytes_) / (1024.0 * 1024.0);
-    ET_LOG(Info, "CUDA memory peak usage (since startup): %.2f MB, device total: %.2f MB", peak_mb, total_mb);
+    const double total_mb =
+        static_cast<double>(total_bytes_) / (1024.0 * 1024.0);
+    ET_LOG(
+        Info,
+        "CUDA memory peak usage (since startup): %.2f MB, device total: %.2f MB",
+        peak_mb,
+        total_mb);
   }
 
  private:
@@ -146,13 +152,22 @@ class CudaMemoryTracker {
   // Baseline free bytes observed at tracker construction. Used to compute
   // peak usage attributable to this process since the tracker started.
   size_t initial_free_bytes_{std::numeric_limits<size_t>::max()};
+
  public:
   // Simple accessors to allow other components to read last-sampled values.
   // These are safe to call after a successful log_sample() invocation.
-  uint64_t last_free_bytes() const { return static_cast<uint64_t>(last_free_bytes_); }
-  uint64_t total_bytes() const { return static_cast<uint64_t>(total_bytes_); }
-  uint64_t min_free_bytes() const { return static_cast<uint64_t>(min_free_bytes_); }
-  uint64_t initial_free_bytes() const { return static_cast<uint64_t>(initial_free_bytes_); }
+  uint64_t last_free_bytes() const {
+    return static_cast<uint64_t>(last_free_bytes_);
+  }
+  uint64_t total_bytes() const {
+    return static_cast<uint64_t>(total_bytes_);
+  }
+  uint64_t min_free_bytes() const {
+    return static_cast<uint64_t>(min_free_bytes_);
+  }
+  uint64_t initial_free_bytes() const {
+    return static_cast<uint64_t>(initial_free_bytes_);
+  }
   double peak_usage_mb() const {
     // Prefer peak relative to the initial free baseline; fall back to
     // absolute device peak if baseline isn't available.
@@ -169,7 +184,8 @@ class CudaMemoryTracker {
     if (total_bytes_ == 0) {
       return 0.0;
     }
-    return static_cast<double>(total_bytes_ - min_free_bytes_) / (1024.0 * 1024.0);
+    return static_cast<double>(total_bytes_ - min_free_bytes_) /
+        (1024.0 * 1024.0);
   }
 };
 
diff --git a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt
index 00f7866483e..de77c21636f 100644
--- a/extension/llm/runner/CMakeLists.txt
+++ b/extension/llm/runner/CMakeLists.txt
@@ -55,18 +55,21 @@ target_include_directories(
   extension_llm_runner INTERFACE ${_common_include_directories}
 )
 
-# If the project is configured to build with CUDA support, try to find a
-# CUDA runtime (prefer the CUDAToolkit package). If found, expose a
-# compile-time macro so sources can conditionally compile CUDA-aware code.
+# If the project is configured to build with CUDA support, try to find a CUDA
+# runtime (prefer the CUDAToolkit package). If found, expose a compile-time
+# macro so sources can conditionally compile CUDA-aware code.
 if(EXECUTORCH_BUILD_CUDA)
-  # Prefer the modern CMake CUDAToolkit module, fall back to searching for
-  # the CUDA runtime library (cudart) if the package isn't available.
+  # Prefer the modern CMake CUDAToolkit module, fall back to searching for the
+  # CUDA runtime library (cudart) if the package isn't available.
   find_package(CUDAToolkit QUIET)
   if(CUDAToolkit_FOUND)
     target_compile_definitions(extension_llm_runner PUBLIC CUDA_AVAILABLE)
     message(STATUS "CUDAToolkit found; defining CUDA_AVAILABLE")
   else()
-    message(STATUS "CUDA requested (EXECUTORCH_BUILD_CUDA=ON) but no CUDA runtime found")
+    message(
+      STATUS
+        "CUDA requested (EXECUTORCH_BUILD_CUDA=ON) but no CUDA runtime found"
+    )
   endif()
 endif()
diff --git a/extension/llm/runner/multimodal_runner.cpp b/extension/llm/runner/multimodal_runner.cpp
index 92e50d21c67..5c0c1e658a7 100644
--- a/extension/llm/runner/multimodal_runner.cpp
+++ b/extension/llm/runner/multimodal_runner.cpp
@@ -62,14 +62,16 @@ Error MultimodalRunner::load() {
   if (is_loaded()) {
     return Error::Ok;
   }
+  stats_->model_load_start_ms = time_in_ms();
   ET_CHECK_OK_OR_RETURN_ERROR(multimodal_prefiller_->load());
   ET_CHECK_OK_OR_RETURN_ERROR(text_token_generator_->load());
+  stats_->model_load_end_ms = time_in_ms();
 
 #ifdef CUDA_AVAILABLE
-   cuda_memory_tracker_->log_sample("after_load");
-   stats_->gpu_total_bytes = cuda_memory_tracker_->total_bytes();
-   stats_->gpu_free_after_load_bytes = cuda_memory_tracker_->last_free_bytes();
-   stats_->gpu_peak_usage_mb = cuda_memory_tracker_->peak_usage_mb();
+  cuda_memory_tracker_->log_sample("after_load");
+  stats_->gpu_total_bytes = cuda_memory_tracker_->total_bytes();
+  stats_->gpu_free_after_load_bytes = cuda_memory_tracker_->last_free_bytes();
+  stats_->gpu_peak_usage_mb = cuda_memory_tracker_->peak_usage_mb();
 #endif
 
   return Error::Ok;
@@ -107,9 +109,7 @@ Error MultimodalRunner::generate(
   }
 
   if (!is_loaded()) {
-    stats_->model_load_start_ms = time_in_ms();
     ET_CHECK_OK_OR_RETURN_ERROR(load());
-    stats_->model_load_end_ms = time_in_ms();
   }
 
   if (config.warming) {
@@ -216,7 +216,8 @@ Error MultimodalRunner::generate(
 
 #ifdef CUDA_AVAILABLE
   cuda_memory_tracker_->log_sample("after_generate");
-  stats_->gpu_free_after_generate_bytes = cuda_memory_tracker_->last_free_bytes();
+  stats_->gpu_free_after_generate_bytes =
+      cuda_memory_tracker_->last_free_bytes();
   // update peak in case it changed after generation
   stats_->gpu_peak_usage_mb = cuda_memory_tracker_->peak_usage_mb();
 #endif
diff --git a/extension/llm/runner/stats.h b/extension/llm/runner/stats.h
index 03120fb8f4c..e7b2af4492e 100644
--- a/extension/llm/runner/stats.h
+++ b/extension/llm/runner/stats.h
@@ -84,7 +84,7 @@ struct ET_EXPERIMENTAL Stats {
     num_prompt_tokens = 0;
     num_generated_tokens = 0;
     gpu_total_bytes = static_cast<uint64_t>(-1);
-    gpu_free_before_load_bytes = static_cast<uint64_t>(-1); 
+    gpu_free_before_load_bytes = static_cast<uint64_t>(-1);
     gpu_free_after_load_bytes = static_cast<uint64_t>(-1);
     gpu_free_after_generate_bytes = static_cast<uint64_t>(-1);
     gpu_peak_usage_mb = -1.0;
@@ -106,7 +106,7 @@ inline std::string stats_to_json_string(const Stats& stats) {
      << "\"prompt_eval_end_ms\":" << stats.prompt_eval_end_ms << ","
      << "\"first_token_ms\":" << stats.first_token_ms << ","
      << "\"aggregate_sampling_time_ms\":" << stats.aggregate_sampling_time_ms
-      << ",";
+     << ",";
   // Only include GPU fields in the JSON if gpu_total_bytes is valid (not
   // equal to sentinel -1)
   if (stats.gpu_total_bytes != static_cast<uint64_t>(-1)) {
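The load-time fix above is about covering both call paths: previously model_load_start_ms/model_load_end_ms were stamped only around the lazy load() inside generate(), so a caller that invoked load() explicitly got a zero load time. Stamping inside load() itself, after the is_loaded() early return, times both paths without double counting. A sketch of the shape, with time_in_ms() standing in for the runner's helper:

    #include <chrono>
    #include <cstdint>

    // Stand-in for the runner's timing helper.
    static int64_t time_in_ms() {
      using namespace std::chrono;
      return duration_cast<milliseconds>(
                 steady_clock::now().time_since_epoch())
          .count();
    }

    struct RunnerSketch {
      bool loaded = false;
      int64_t model_load_start_ms = 0;
      int64_t model_load_end_ms = 0;

      void load() {
        if (loaded) {
          return; // Already loaded: earlier timestamps stay intact.
        }
        model_load_start_ms = time_in_ms();
        // ... load prefiller and token generator here ...
        loaded = true;
        model_load_end_ms = time_in_ms();
      }

      void generate() {
        if (!loaded) {
          load(); // Lazy path is timed identically to an explicit load().
        }
        // ... prefill and decode ...
      }
    };

    int main() {
      RunnerSketch runner;
      runner.load();     // explicit load: now measured
      runner.generate(); // early return prevents re-stamping
      return 0;
    }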
From 59e6a655a0facab796b2a9d6a4e27b85c19ab38b Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Wed, 12 Nov 2025 14:10:03 -0800
Subject: [PATCH 4/4] Link cudart to extension_llm_runner

---
 extension/llm/runner/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt
index de77c21636f..6a2c1989922 100644
--- a/extension/llm/runner/CMakeLists.txt
+++ b/extension/llm/runner/CMakeLists.txt
@@ -64,6 +64,7 @@ if(EXECUTORCH_BUILD_CUDA)
   find_package(CUDAToolkit QUIET)
   if(CUDAToolkit_FOUND)
     target_compile_definitions(extension_llm_runner PUBLIC CUDA_AVAILABLE)
+    target_link_libraries(extension_llm_runner PUBLIC CUDA::cudart)
     message(STATUS "CUDAToolkit found; defining CUDA_AVAILABLE")
   else()
     message(
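The link line is needed because memory_tracker.h is header-only but calls into the CUDA runtime: cudaMemGetInfo and cudaGetErrorString live in libcudart, so any target that compiles the tracker with CUDA_AVAILABLE defined must link CUDA::cudart, and making the dependency PUBLIC lets dependents of extension_llm_runner inherit it. A minimal repro of the link-time requirement (assumes the CUDA toolkit is installed):

    // Compiles against the CUDA headers alone, but fails at link time
    // without libcudart, where cudaMemGetInfo is defined.
    #include <cuda_runtime.h>

    #include <cstddef>
    #include <cstdio>

    int main() {
      size_t free_bytes = 0;
      size_t total_bytes = 0;
      cudaError_t err = cudaMemGetInfo(&free_bytes, &total_bytes);
      if (err != cudaSuccess) {
        std::printf("query failed: %s\n", cudaGetErrorString(err));
        return 1;
      }
      std::printf("free: %zu bytes, total: %zu bytes\n", free_bytes, total_bytes);
      return 0;
    }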