From e7a15a3673c9ba38e3cc31966166c05d317649ae Mon Sep 17 00:00:00 2001
From: Mengwei Liu <larryliu@meta.com>
Date: Wed, 15 Oct 2025 16:56:41 -0700
Subject: [PATCH 01/17] [aoti-et] Store weights outside of .so

---
 .ci/docker/ci_commit_pins/pytorch.txt      |  2 +-
 backends/aoti/aoti_delegate_handle.h       |  6 ++++
 backends/cuda/cuda_backend.py              | 42 ++++++++++++++++++----
 backends/cuda/runtime/cuda_backend.cpp     | 22 +++++++++++-
 examples/models/voxtral/multimodal.cpp     |  2 +-
 extension/llm/runner/llm_runner_helper.cpp |  8 ++---
 extension/llm/runner/llm_runner_helper.h   |  3 +-
 torch_pin.py                               |  2 +-
 8 files changed, 71 insertions(+), 16 deletions(-)

diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt
index e3a53c8bcb5..408d456bbe4 100644
--- a/.ci/docker/ci_commit_pins/pytorch.txt
+++ b/.ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
-53a2908a10f414a2f85caa06703a26a40e873869
+12d7cc5cd3da00094c5801aff4c77550e2a59528
diff --git a/backends/aoti/aoti_delegate_handle.h b/backends/aoti/aoti_delegate_handle.h
index 2e72fc39821..82ce2521750 100644
--- a/backends/aoti/aoti_delegate_handle.h
+++ b/backends/aoti/aoti_delegate_handle.h
@@ -71,6 +71,11 @@ using AOTInductorModelContainerGetNumConstantsFunc = AOTIRuntimeError (*)(
     AOTInductorModelContainerHandle container_handle,
     size_t* num_constants);
 
+// Update the model container with the constant tensors
+using AOTInductorModelUpdateConstantsFromBlobFunc = AOTIRuntimeError (*)(
+    AOTInductorModelContainerHandle container_handle,
+    const uint8_t* weight_blob_ptr);
+
 } // extern "C"
 
 // AOTI Delegate Handle structure
@@ -87,6 +92,7 @@ struct AOTIDelegateHandle {
   AOTInductorModelContainerGetNumInputsFunc get_num_inputs;
   AOTInductorModelContainerGetNumOutputsFunc get_num_outputs;
   AOTInductorModelContainerRunFunc run;
+  AOTInductorModelUpdateConstantsFromBlobFunc update_constants_from_blob;
 };
 
 } // namespace aoti
diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
index 05d01972833..c5521cb5a0e 100644
--- a/backends/cuda/cuda_backend.py
+++ b/backends/cuda/cuda_backend.py
@@ -146,8 +146,11 @@ def preprocess(
             "aot_inductor.embed_kernel_binary": True,
             # Do not link against the full PyTorch/libtorch library
             "aot_inductor.link_libtorch": False,
-            # Package model constants and other generated files directly in the shared object (.so) file
-            "aot_inductor.package_constants_in_so": True,
+            # Separate weight constants from the .so file
+            "aot_inductor.package": True,
+            "aot_inductor.package_constants_in_so": False,
+            # Store weight constants on disk in a binary blob
+            "aot_inductor.package_constants_on_disk_format": "binary_blob",
             # Enable maximum automatic tuning for optimal performance
             "max_autotune": True,
             # Use TRITON for GEMM (General Matrix Multiply) operations tuning only to avoid using operators in libtorch
@@ -162,7 +165,8 @@ def preprocess(
             ]
         ), torch.no_grad():
             # torch._logging.set_logs(post_grad_graphs=True)
-            so_path = torch._inductor.aot_compile(edge_program_module, tuple(user_input_placeholders), options=options)  # type: ignore[arg-type]
+            # Here we should expect 1 so file and 1 weight blob in the same directory.
+            paths = torch._inductor.aot_compile(edge_program_module, tuple(user_input_placeholders), options=options)  # type: ignore[arg-type]
             if len(missing_fallback_kernels) > 0:
                 formatted_kernels = "\n  - ".join(sorted(missing_fallback_kernels))
                 raise RuntimeError(
@@ -170,17 +174,41 @@ def preprocess(
                     "Please add them to the AOTI backend."
                 )
 
+        # Extract the .so and .blob paths from the returned list
+        so_path = None
+        blob_path = None
+        for path in paths:
+            if path.endswith(".wrapper.so"):
+                so_path = path
+            elif path.endswith(".wrapper_weights.blob"):
+                blob_path = path
+
+        if so_path is None:
+            raise RuntimeError(
+                f"Could not find .wrapper.so file in compiled paths, got {paths}"
+            )
+
         # pyre-ignorep[6]: Incompatible parameter type
         with open(so_path, "rb") as f:
             so_data = f.read()
 
         named_data_store = NamedDataStore()
         method_name = CudaBackend.method_name_from_compile_specs(compile_specs)
-        named_data_store.add_named_data(
-            method_name + "_so_blob", so_data, 1, "aoti_cuda_blob"
-        )
 
-        # Clean up the generated so file; it has been packaged into the NamdeDataStore
+        # Keep the so file in the NamedDataStore, so that it can be packaged into the .pte file.
+        named_data_store.add_named_data(method_name + "_so_blob", so_data, 1, None)
+
+        # Add weights blob to named data store if it exists
+        if blob_path is not None:
+            with open(blob_path, "rb") as f:
+                blob_data = f.read()
+            named_data_store.add_named_data(
+                method_name + "_weights_blob", blob_data, 1, "aoti_cuda_blob"
+            )
+            # Clean up the weights blob file
+            os.remove(blob_path)
+
+        # Clean up the generated so file; it has been packaged into the NamedDataStore
         # pyre-ignorep[6]: Incompatible parameter type
         os.remove(so_path)
 
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index e61b03ee8e6..bde5d802191 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -84,6 +84,12 @@ class ET_EXPERIMENTAL CudaBackend final
 
     LOAD_SYMBOL(handle, run, AOTInductorModelContainerRun, so_handle);
 
+    LOAD_SYMBOL(
+        handle,
+        update_constants_from_blob,
+        AOTInductorModelUpdateConstantsFromBlob,
+        so_handle);
+
     return Error::Ok;
   }
 
@@ -145,13 +151,14 @@ class ET_EXPERIMENTAL CudaBackend final
     // Finish writing the file to disk
     outfile.close();
 
+    // Free the buffer immediately after writing to disk
+    aoti_cuda_buffer->Free();
     // Load the lib
     Result<void*> lib_handle_res = load_library(so_path);
     if (!lib_handle_res.ok()) {
       return lib_handle_res.error();
     }
     void* lib_handle = lib_handle_res.get();
-
     processed->Free();
 
     // Create handle and load function pointers into it
@@ -172,6 +179,19 @@ class ET_EXPERIMENTAL CudaBackend final
 
     handle->container_handle = container_handle;
 
+    // Look into named data map for constant data
+    std::string weights_blob_key =
+        method_name.empty() ? "weights_blob" : method_name + "_weights_blob";
+    auto buffer_res = named_data_map->get_data(weights_blob_key.c_str());
+    if (buffer_res.ok()) {
+      ET_LOG(Info, "Found %s in named data map", weights_blob_key.c_str());
+      const void* weights_blob = buffer_res->data();
+      // Feed the weights blob into the container. Under the hood it's copying
+      // weights, so we should free the buffer immediately.
+      ET_CHECK_OK_OR_RETURN_ERROR(handle->update_constants_from_blob(
+          handle->container_handle, static_cast<const uint8_t*>(weights_blob)));
+      buffer_res->Free();
+    }
     // Create a CUDA stream for asynchronous execution
     cudaStream_t cuda_stream;
     ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamCreate(&cuda_stream));
diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp
index b3dd5e3ab68..29edf955751 100644
--- a/examples/models/voxtral/multimodal.cpp
+++ b/examples/models/voxtral/multimodal.cpp
@@ -319,7 +319,7 @@ int32_t main(int32_t argc, char** argv) {
   // Create multimodal runner
   std::unique_ptr<::executorch::extension::llm::MultimodalRunner> runner =
       ::executorch::extension::llm::create_multimodal_runner(
-          model_path, std::move(tokenizer), data_path);
+          model_path, std::move(tokenizer), data_path, Module::LoadMode::Mmap);
   if (runner == nullptr) {
     ET_LOG(Error, "Failed to create multimodal runner");
     return 1;
diff --git a/extension/llm/runner/llm_runner_helper.cpp b/extension/llm/runner/llm_runner_helper.cpp
index d1e4ff2ce45..674be820072 100644
--- a/extension/llm/runner/llm_runner_helper.cpp
+++ b/extension/llm/runner/llm_runner_helper.cpp
@@ -268,7 +268,8 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
 std::unique_ptr<MultimodalRunner> create_multimodal_runner(
     const std::string& model_path,
     std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
-    std::optional<const std::string> data_path) {
+    std::optional<const std::string> data_path,
+    Module::LoadMode load_mode) {
   // Sanity check tokenizer
   if (!tokenizer || !tokenizer->is_loaded()) {
     ET_LOG(Error, "Tokenizer is null or not loaded");
@@ -278,10 +279,9 @@ std::unique_ptr<MultimodalRunner> create_multimodal_runner(
   // Create the Module
   std::unique_ptr<Module> module;
   if (data_path.has_value()) {
-    module = std::make_unique<Module>(
-        model_path, data_path.value(), Module::LoadMode::File);
+    module = std::make_unique<Module>(model_path, data_path.value(), load_mode);
   } else {
-    module = std::make_unique<Module>(model_path, Module::LoadMode::File);
+    module = std::make_unique<Module>(model_path, load_mode);
   }
 
   // Get metadata from Module
diff --git a/extension/llm/runner/llm_runner_helper.h b/extension/llm/runner/llm_runner_helper.h
index 5c109581e19..08f0efd0353 100644
--- a/extension/llm/runner/llm_runner_helper.h
+++ b/extension/llm/runner/llm_runner_helper.h
@@ -140,6 +140,7 @@ ET_EXPERIMENTAL std::unique_ptr<TextLLMRunner> create_text_llm_runner(
 ET_EXPERIMENTAL std::unique_ptr<MultimodalRunner> create_multimodal_runner(
     const std::string& model_path,
     std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
-    std::optional<const std::string> data_path = std::nullopt);
+    std::optional<const std::string> data_path = std::nullopt,
+    Module::LoadMode load_mode = Module::LoadMode::File);
 
 } // namespace executorch::extension::llm
diff --git a/torch_pin.py b/torch_pin.py
index 02040c91963..5ea6fdf97a1 100644
--- a/torch_pin.py
+++ b/torch_pin.py
@@ -1,2 +1,2 @@
 TORCH_VERSION = "2.10.0"
-NIGHTLY_VERSION = "dev20251003"
+NIGHTLY_VERSION = "dev20251011"

From 263e3b577c37ebcfea12f98174906062b21681b4 Mon Sep 17 00:00:00 2001
From: Mengwei Liu <larryliu@meta.com>
Date: Wed, 15 Oct 2025 17:16:09 -0700
Subject: [PATCH 02/17] Change torch_pin to 20251012

---
 .ci/docker/ci_commit_pins/pytorch.txt | 2 +-
 torch_pin.py                          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt
index 408d456bbe4..cbc25dfd058 100644
--- a/.ci/docker/ci_commit_pins/pytorch.txt
+++ b/.ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
-12d7cc5cd3da00094c5801aff4c77550e2a59528
+4f8a986b8feb4a171b8a68a2a3664275ec54a75f
diff --git a/torch_pin.py b/torch_pin.py
index 5ea6fdf97a1..24fdaf3766f 100644
--- a/torch_pin.py
+++ b/torch_pin.py
@@ -1,2 +1,2 @@
 TORCH_VERSION = "2.10.0"
-NIGHTLY_VERSION = "dev20251011"
+NIGHTLY_VERSION = "dev20251012"

From b13cc3fa3835bff26a4fa56d753b9989d4b14d47 Mon Sep 17 00:00:00 2001
From: Mengwei Liu <larryliu@meta.com>
Date: Wed, 15 Oct 2025 17:54:51 -0700
Subject: [PATCH 03/17] Fix c10 sync

---
 .../c10/c10/util/llvmMathExtras.h             |  4 +-
 .../c10/torch/headeronly/macros/Macros.h      | 42 +++++++++++++++++++
 .../c10/torch/headeronly/util/BFloat16.h      |  4 +-
 .../c10/torch/headeronly/util/Half.h          |  3 +-
 4 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/runtime/core/portable_type/c10/c10/util/llvmMathExtras.h b/runtime/core/portable_type/c10/c10/util/llvmMathExtras.h
index 556699be04b..6321297a61c 100644
--- a/runtime/core/portable_type/c10/c10/util/llvmMathExtras.h
+++ b/runtime/core/portable_type/c10/c10/util/llvmMathExtras.h
@@ -70,7 +70,7 @@ enum ZeroBehavior {
 namespace detail {
 template <typename T, std::size_t SizeOfT>
 struct TrailingZerosCounter {
-  static std::size_t count(T Val, ZeroBehavior) {
+  static std::size_t count(T Val, ZeroBehavior /*unused*/) {
     if (!Val)
       return std::numeric_limits<T>::digits;
     if (Val & 0x1)
@@ -147,7 +147,7 @@ std::size_t countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
 namespace detail {
 template <typename T, std::size_t SizeOfT>
 struct LeadingZerosCounter {
-  static std::size_t count(T Val, ZeroBehavior) {
+  static std::size_t count(T Val, ZeroBehavior /*unused*/) {
     if (!Val)
       return std::numeric_limits<T>::digits;
 
diff --git a/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h b/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h
index 558edb175ae..e340e7626a0 100644
--- a/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h
+++ b/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h
@@ -359,6 +359,7 @@ static inline int C10_WARP_SIZE_INTERNAL() {
 // Those platforms do not support assert()
 #define CUDA_KERNEL_ASSERT(cond)
 #define CUDA_KERNEL_ASSERT_MSG(cond, msg)
+#define CUDA_KERNEL_ASSERT_PRINTF(cond, msg, ...)
 #define SYCL_KERNEL_ASSERT(cond)
 #elif defined(_MSC_VER)
 #if defined(NDEBUG)
@@ -396,6 +397,26 @@ __host__ __device__
                static_cast<unsigned>(__LINE__)), \
            0);                                   \
   }
+#define CUDA_KERNEL_ASSERT_PRINTF(cond, msg, ...)                     \
+  if (C10_UNLIKELY(!(cond))) {                                        \
+    (void)(printf(                                                    \
+        "[CUDA_KERNEL_ASSERT] " __FILE__ ":" C10_STRINGIZE(           \
+            __LINE__) ": %s: block: [%d,%d,%d], thread: [%d,%d,%d]: " \
+                      "Assertion failed: `" #cond "`: " msg "\n",     \
+        __func__,                                                     \
+        blockIdx.x,                                                   \
+        blockIdx.y,                                                   \
+        blockIdx.z,                                                   \
+        threadIdx.x,                                                  \
+        threadIdx.y,                                                  \
+        threadIdx.z,                                                  \
+        ##__VA_ARGS__));                                              \
+    (void)(_wassert(                                                  \
+               _CRT_WIDE(#cond),                                      \
+               _CRT_WIDE(__FILE__),                                   \
+               static_cast<unsigned>(__LINE__)),                      \
+           0);                                                        \
+  }
 #define SYCL_KERNEL_ASSERT(cond)                 \
   if (C10_UNLIKELY(!(cond))) {                   \
     (void)(_wassert(                             \
@@ -455,6 +476,10 @@ __host__ __device__
   if C10_UNLIKELY (!(cond)) {             \
     abort();                              \
   }
+#define CUDA_KERNEL_ASSERT_PRINTF(cond, msg, ...) \
+  if C10_UNLIKELY (!(cond)) {                     \
+    abort();                                      \
+  }
 #define SYCL_KERNEL_ASSERT(cond) \
   if C10_UNLIKELY (!(cond)) {    \
     abort();                     \
@@ -470,6 +495,23 @@ __host__ __device__
     __assert_fail(                                                     \
         msg, __FILE__, static_cast<unsigned int>(__LINE__), __func__); \
   }
+#define CUDA_KERNEL_ASSERT_PRINTF(cond, msg, ...)                        \
+  if (C10_UNLIKELY(!(cond))) {                                           \
+    printf(                                                            \
+        "[CUDA_KERNEL_ASSERT] " __FILE__ ":" C10_STRINGIZE(            \
+            __LINE__) ": %s: block: [%d,%d,%d], thread: [%d,%d,%d]: "  \
+            "Assertion failed: `" #cond "`: " msg "\n",                \
+        __func__,                                                      \
+        blockIdx.x,                                                    \
+        blockIdx.y,                                                    \
+        blockIdx.z,                                                    \
+        threadIdx.x,                                                   \
+        threadIdx.y,                                                   \
+        threadIdx.z,                                                   \
+        ##__VA_ARGS__); \
+    __assert_fail(                                                       \
+        #cond, __FILE__, static_cast<unsigned int>(__LINE__), __func__); \
+  }
 #define SYCL_KERNEL_ASSERT(cond)                                         \
   if (C10_UNLIKELY(!(cond))) {                                           \
     __assert_fail(                                                       \
diff --git a/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h b/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h
index 2c1f805ac7b..ac47e3f844a 100644
--- a/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h
+++ b/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h
@@ -39,7 +39,9 @@ struct alignas(2) BFloat16 {
     return from_bits_t();
   }
 
-  constexpr C10_HOST_DEVICE BFloat16(unsigned short bits, from_bits_t)
+  constexpr C10_HOST_DEVICE BFloat16(
+      unsigned short bits,
+      from_bits_t /*unused*/)
       : x(bits) {}
   /* implicit */ inline C10_HOST_DEVICE BFloat16(float value);
   inline C10_HOST_DEVICE operator float() const;
diff --git a/runtime/core/portable_type/c10/torch/headeronly/util/Half.h b/runtime/core/portable_type/c10/torch/headeronly/util/Half.h
index 59a86f07e33..9673301e2de 100644
--- a/runtime/core/portable_type/c10/torch/headeronly/util/Half.h
+++ b/runtime/core/portable_type/c10/torch/headeronly/util/Half.h
@@ -80,7 +80,8 @@ struct alignas(2) Half {
   Half() = default;
 #endif
 
-  constexpr C10_HOST_DEVICE Half(unsigned short bits, from_bits_t) : x(bits) {}
+  constexpr C10_HOST_DEVICE Half(unsigned short bits, from_bits_t /*unused*/)
+      : x(bits) {}
 #if defined(__aarch64__) && !defined(__CUDACC__)
   inline Half(float16_t value);
   inline operator float16_t() const;

From 60535b989dfa19195d02d32ccb4a2568018c8ae8 Mon Sep 17 00:00:00 2001
From: Mengwei Liu <larryliu@meta.com>
Date: Wed, 15 Oct 2025 23:00:13 -0700
Subject: [PATCH 04/17] Address comments: require weights blob, make constants-update symbol optional

---
 backends/cuda/cuda_backend.py          | 21 ++++----
 backends/cuda/runtime/cuda_backend.cpp | 67 +++++++++++---------------
 2 files changed, 39 insertions(+), 49 deletions(-)

diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
index c5521cb5a0e..ba6da92b991 100644
--- a/backends/cuda/cuda_backend.py
+++ b/backends/cuda/cuda_backend.py
@@ -183,9 +183,9 @@ def preprocess(
             elif path.endswith(".wrapper_weights.blob"):
                 blob_path = path
 
-        if so_path is None:
+        if so_path is None or blob_path is None:
             raise RuntimeError(
-                f"Could not find .wrapper.so file in compiled paths, got {paths}"
+                f"Could not find required files in compiled paths, got {paths}"
             )
 
         # pyre-ignorep[6]: Incompatible parameter type
@@ -198,15 +198,14 @@ def preprocess(
         # Keep the so file in the NamedDataStore, so that it can be packaged into the .pte file.
         named_data_store.add_named_data(method_name + "_so_blob", so_data, 1, None)
 
-        # Add weights blob to named data store if it exists
-        if blob_path is not None:
-            with open(blob_path, "rb") as f:
-                blob_data = f.read()
-            named_data_store.add_named_data(
-                method_name + "_weights_blob", blob_data, 1, "aoti_cuda_blob"
-            )
-            # Clean up the weights blob file
-            os.remove(blob_path)
+        # Add weights blob to named data store
+        with open(blob_path, "rb") as f:
+            blob_data = f.read()
+        named_data_store.add_named_data(
+            method_name + "_weights_blob", blob_data, 1, "aoti_cuda_blob"
+        )
+        # Clean up the weights blob file
+        os.remove(blob_path)
 
         # Clean up the generated so file; it has been packaged into the NamedDataStore
         # pyre-ignorep[6]: Incompatible parameter type
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index bde5d802191..da8aec2ae21 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -27,15 +27,6 @@
 
 namespace executorch::backends::cuda {
 
-#define LOAD_SYMBOL(handle, member, name, so_handle)                 \
-  do {                                                               \
-    auto symbol_res = get_function(so_handle, #name);                \
-    if (!symbol_res.ok()) {                                          \
-      return symbol_res.error();                                     \
-    }                                                                \
-    handle->member = reinterpret_cast<name##Func>(symbol_res.get()); \
-  } while (0)
-
 using namespace std;
 using namespace aoti;
 
@@ -61,35 +52,35 @@ class ET_EXPERIMENTAL CudaBackend final
   Error load_function_pointers_into_handle(
       void* so_handle,
       AOTIDelegateHandle* handle) const {
-    LOAD_SYMBOL(
-        handle,
-        create_with_device,
-        AOTInductorModelContainerCreateWithDevice,
-        so_handle);
-
-    LOAD_SYMBOL(
-        handle, delete_container, AOTInductorModelContainerDelete, so_handle);
-
-    LOAD_SYMBOL(
-        handle,
-        get_num_inputs,
-        AOTInductorModelContainerGetNumInputs,
-        so_handle);
-
-    LOAD_SYMBOL(
-        handle,
-        get_num_outputs,
-        AOTInductorModelContainerGetNumOutputs,
-        so_handle);
-
-    LOAD_SYMBOL(handle, run, AOTInductorModelContainerRun, so_handle);
-
-    LOAD_SYMBOL(
-        handle,
-        update_constants_from_blob,
-        AOTInductorModelUpdateConstantsFromBlob,
-        so_handle);
 
+#define LOAD_SYMBOL(member, name)                                    \
+  do {                                                               \
+    auto symbol_res = get_function(so_handle, #name);                \
+    if (!symbol_res.ok()) {                                          \
+      return symbol_res.error();                                     \
+    }                                                                \
+    handle->member = reinterpret_cast<name##Func>(symbol_res.get()); \
+  } while (0)
+
+    LOAD_SYMBOL(create_with_device, AOTInductorModelContainerCreateWithDevice);
+
+    LOAD_SYMBOL(delete_container, AOTInductorModelContainerDelete);
+
+    LOAD_SYMBOL(get_num_inputs, AOTInductorModelContainerGetNumInputs);
+
+    LOAD_SYMBOL(get_num_outputs, AOTInductorModelContainerGetNumOutputs);
+
+    LOAD_SYMBOL(run, AOTInductorModelContainerRun);
+#undef LOAD_SYMBOL
+
+    handle->update_constants_from_blob =
+        reinterpret_cast<AOTInductorModelUpdateConstantsFromBlobFunc>(
+            dlsym(so_handle, "AOTInductorModelUpdateConstantsFromBlob"));
+    if (handle->update_constants_from_blob == nullptr) {
+      ET_LOG(
+          Info,
+          "Failed to load AOTInductorModelUpdateConstantsFromBlob. This .so is probably compiled on an old version of torch (<2.9.0)");
+    }
     return Error::Ok;
   }
 
@@ -183,7 +174,7 @@ class ET_EXPERIMENTAL CudaBackend final
     std::string weights_blob_key =
         method_name.empty() ? "weights_blob" : method_name + "_weights_blob";
     auto buffer_res = named_data_map->get_data(weights_blob_key.c_str());
-    if (buffer_res.ok()) {
+    if (buffer_res.ok() && handle->update_constants_from_blob != nullptr) {
       ET_LOG(Info, "Found %s in named data map", weights_blob_key.c_str());
       const void* weights_blob = buffer_res->data();
       // Feed the weights blob into the container. Under the hood it's copying

From a3ab05ce73cb09afae669b0c9db0c90f18c49e9a Mon Sep 17 00:00:00 2001
From: Mengwei Liu <mengweiliu600267@gmail.com>
Date: Thu, 16 Oct 2025 15:15:13 -0700
Subject: [PATCH 05/17] Try to fix loader_path issue

---
 CMakeLists.txt | 21 ---------------------
 1 file changed, 21 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f5091a2af2e..95a6cfee9b6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -99,27 +99,6 @@ announce_configured_options(CCACHE_PROGRAM)
 
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
-# Setup RPATH. See
-# https://gitlab.kitware.com/cmake/community/-/wikis/doc/cmake/RPATH-handling
-# Use separate rpaths during build and install phases
-set(CMAKE_SKIP_BUILD_RPATH OFF)
-# Don't use the install-rpath during the build phase
-set(CMAKE_BUILD_WITH_INSTALL_RPATH ON)
-# Automatically add all linked folders that are NOT in the build directory to
-# the rpath (per library?)
-#
-# TODO: Doesn't work for us right now because we are not installing .so's into
-# the correct locations. For example we have libcustom_ops_aot_lib.so depending
-# on _portable_lib.so, which was eventually put under
-# <site-packages>/executorch/extension/pybindings/ but this rpath is not
-# automatically added because at build time it seems `portable_lib` is being
-# built under the same directory, so no extra rpath is being added. To properly
-# fix this we need to install `portable_lib` into the correct path.
-set(CMAKE_INSTALL_RPATH_USE_LINK_PATH ON)
-# ------------------------------ OPTIONS -------------------------------------
-# WARNING: Please don't add example specific options in this CMakeLists.txt.
-# Instead please use `find_package(executorch REQUIRED)` in the example
-# directory and add a new executable in the example `CMakeLists.txt`.
 
 if(NOT EXECUTORCH_ENABLE_LOGGING)
   # Avoid pulling in the logging strings, which can be large. Note that this

From d05a3057b5eb94fa287c00cc7f0b735c9f81bd74 Mon Sep 17 00:00:00 2001
From: Mengwei Liu <mengweiliu600267@gmail.com>
Date: Thu, 16 Oct 2025 16:32:58 -0700
Subject: [PATCH 06/17] Retry loader_path fix; rename DSO buffer; bump mimi torchcodec pin

---
 CMakeLists.txt                                    | 13 ++++++++-----
 backends/cuda/runtime/cuda_backend.cpp            | 15 ++++++++-------
 .../models/moshi/mimi/install_requirements.sh     |  4 ++--
 examples/models/moshi/mimi/test_mimi.py           |  3 +--
 4 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 95a6cfee9b6..487dce7acae 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -99,7 +99,6 @@ announce_configured_options(CCACHE_PROGRAM)
 
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
-
 if(NOT EXECUTORCH_ENABLE_LOGGING)
   # Avoid pulling in the logging strings, which can be large. Note that this
   # will set the compiler flag for all targets in this directory, and for all
@@ -890,10 +889,14 @@ if(EXECUTORCH_BUILD_PYBIND)
   # This goes from executorch/extension/pybindings up to site-packages, then to
   # torch/lib
   if(APPLE)
-    set_target_properties(
-      portable_lib PROPERTIES BUILD_RPATH "@loader_path/../../../torch/lib"
-                              INSTALL_RPATH "@loader_path/../../../torch/lib"
-    )
+    get_target_property(existing_rpath portable_lib INSTALL_RPATH)
+    string(FIND "${existing_rpath}" "@loader_path" pos)
+    if(pos EQUAL -1)
+      set_target_properties(
+        portable_lib PROPERTIES BUILD_RPATH "@loader_path/../../../torch/lib"
+                                INSTALL_RPATH "@loader_path/../../../torch/lib"
+      )
+    endif()
   else()
     set_target_properties(
       portable_lib PROPERTIES BUILD_RPATH "$ORIGIN/../../../torch/lib"
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index da8aec2ae21..8664891f9f8 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -109,13 +109,13 @@ class ET_EXPERIMENTAL CudaBackend final
         method_name.empty() ? "so_blob" : method_name + "_so_blob";
 
     const NamedDataMap* named_data_map = context.get_named_data_map();
-    auto aoti_cuda_buffer = named_data_map->get_data(so_blob_key.c_str());
+    auto aoti_dso_buffer = named_data_map->get_data(so_blob_key.c_str());
     ET_CHECK_OR_RETURN_ERROR(
-        aoti_cuda_buffer.ok(),
+        aoti_dso_buffer.ok(),
         Internal,
         "Failed to get data for key %s: 0x%x",
         so_blob_key.c_str(),
-        static_cast<uint32_t>(aoti_cuda_buffer.error()));
+        static_cast<uint32_t>(aoti_dso_buffer.error()));
 
     // Generate dynamic temporary file path
     filesystem::path temp_dir = filesystem::temp_directory_path();
@@ -129,12 +129,12 @@ class ET_EXPERIMENTAL CudaBackend final
     ET_LOG(
         Info,
         "Writing %zu bytes to %s",
-        aoti_cuda_buffer->size(),
+        aoti_dso_buffer->size(),
         so_path.c_str());
 
     outfile.write(
-        static_cast<const char*>(aoti_cuda_buffer->data()),
-        aoti_cuda_buffer->size());
+        static_cast<const char*>(aoti_dso_buffer->data()),
+        aoti_dso_buffer->size());
 
     ET_CHECK_OR_RETURN_ERROR(
         outfile, AccessFailed, "Failed to write to file %s", so_path.c_str());
@@ -143,13 +143,14 @@ class ET_EXPERIMENTAL CudaBackend final
     outfile.close();
 
     // Free the buffer immediately after writing to disk
-    aoti_cuda_buffer->Free();
+    aoti_dso_buffer->Free();
     // Load the lib
     Result<void*> lib_handle_res = load_library(so_path);
     if (!lib_handle_res.ok()) {
       return lib_handle_res.error();
     }
     void* lib_handle = lib_handle_res.get();
+
     processed->Free();
 
     // Create handle and load function pointers into it
diff --git a/examples/models/moshi/mimi/install_requirements.sh b/examples/models/moshi/mimi/install_requirements.sh
index 6df4caf8692..d3efb04448c 100755
--- a/examples/models/moshi/mimi/install_requirements.sh
+++ b/examples/models/moshi/mimi/install_requirements.sh
@@ -8,8 +8,8 @@
 set -x
 
 conda install -c conda-forge "ffmpeg<8" -y
-pip install torchcodec==0.7.0.dev20250929 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
-pip install moshi==0.2.4
+pip install torchcodec==0.7.0.dev20251012 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+pip install moshi==0.2.4 --no-deps
 pip install bitsandbytes soundfile
 # Run llama2/install requirements for torchao deps
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
diff --git a/examples/models/moshi/mimi/test_mimi.py b/examples/models/moshi/mimi/test_mimi.py
index d0c3c2ceb15..93513c54e78 100644
--- a/examples/models/moshi/mimi/test_mimi.py
+++ b/examples/models/moshi/mimi/test_mimi.py
@@ -189,8 +189,7 @@ def forward(self, x):
                 x = self.mimi_model.upsample(x)
                 (emb,) = self.mimi_model.decoder_transformer(x)
                 emb.transpose(1, 2)
-                with self.mimi_model._context_for_encoder_decoder:
-                    out = self.mimi_model.decoder(emb)
+                out = self.mimi_model.decoder(emb)
                 return out
 
         emb_input = torch.rand(1, 1, 512, device="cpu")

From feff90f1061f10a3a93573bf5c84728418777b7c Mon Sep 17 00:00:00 2001
From: Mengwei Liu <mengweiliu600267@gmail.com>
Date: Thu, 16 Oct 2025 16:40:31 -0700
Subject: [PATCH 07/17] Test: comment out portable_lib RPATH overrides

---
 CMakeLists.txt | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 487dce7acae..47e0c20ddd0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -888,21 +888,21 @@ if(EXECUTORCH_BUILD_PYBIND)
   # Set RPATH to find PyTorch libraries relative to the installation location
   # This goes from executorch/extension/pybindings up to site-packages, then to
   # torch/lib
-  if(APPLE)
-    get_target_property(existing_rpath portable_lib INSTALL_RPATH)
-    string(FIND "${existing_rpath}" "@loader_path" pos)
-    if(pos EQUAL -1)
-      set_target_properties(
-        portable_lib PROPERTIES BUILD_RPATH "@loader_path/../../../torch/lib"
-                                INSTALL_RPATH "@loader_path/../../../torch/lib"
-      )
-    endif()
-  else()
-    set_target_properties(
-      portable_lib PROPERTIES BUILD_RPATH "$ORIGIN/../../../torch/lib"
-                              INSTALL_RPATH "$ORIGIN/../../../torch/lib"
-    )
-  endif()
+  # if(APPLE)
+  #   get_target_property(existing_rpath portable_lib INSTALL_RPATH)
+  #   string(FIND "${existing_rpath}" "@loader_path" pos)
+  #   if(pos EQUAL -1)
+  #     set_target_properties(
+  #       portable_lib PROPERTIES BUILD_RPATH "@loader_path/../../../torch/lib"
+  #                               INSTALL_RPATH "@loader_path/../../../torch/lib"
+  #     )
+  #   endif()
+  # else()
+  #   set_target_properties(
+  #     portable_lib PROPERTIES BUILD_RPATH "$ORIGIN/../../../torch/lib"
+  #                             INSTALL_RPATH "$ORIGIN/../../../torch/lib"
+  #   )
+  # endif()
 
   install(
     TARGETS portable_lib

From 89c03575761c4b711ffd3df9ff623c45df423ca6 Mon Sep 17 00:00:00 2001
From: Mengwei Liu <mengweiliu600267@gmail.com>
Date: Thu, 16 Oct 2025 20:55:34 -0700
Subject: [PATCH 08/17] Set portable_lib RPATH only when unset; bump torch nightly pin

---
 CMakeLists.txt | 38 ++++++++++++++++++++++----------------
 torch_pin.py   |  2 +-
 2 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 47e0c20ddd0..d32fe55742b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -887,22 +887,28 @@ if(EXECUTORCH_BUILD_PYBIND)
 
   # Set RPATH to find PyTorch libraries relative to the installation location
   # This goes from executorch/extension/pybindings up to site-packages, then to
-  # torch/lib
-  # if(APPLE)
-  #   get_target_property(existing_rpath portable_lib INSTALL_RPATH)
-  #   string(FIND "${existing_rpath}" "@loader_path" pos)
-  #   if(pos EQUAL -1)
-  #     set_target_properties(
-  #       portable_lib PROPERTIES BUILD_RPATH "@loader_path/../../../torch/lib"
-  #                               INSTALL_RPATH "@loader_path/../../../torch/lib"
-  #     )
-  #   endif()
-  # else()
-  #   set_target_properties(
-  #     portable_lib PROPERTIES BUILD_RPATH "$ORIGIN/../../../torch/lib"
-  #                             INSTALL_RPATH "$ORIGIN/../../../torch/lib"
-  #   )
-  # endif()
+  # torch/lib. Don't do this to APPLE, as it will error out on the following
+  # error:
+  #
+  get_target_property(_rpath portable_lib INSTALL_RPATH)
+  if(NOT _rpath)
+    if(APPLE)
+      set_target_properties(
+        portable_lib PROPERTIES BUILD_RPATH "@loader_path/../../../torch/lib"
+                                INSTALL_RPATH "@loader_path/../../../torch/lib"
+      )
+    else()
+      set_target_properties(
+        portable_lib PROPERTIES BUILD_RPATH "$ORIGIN/../../../torch/lib"
+                                INSTALL_RPATH "$ORIGIN/../../../torch/lib"
+      )
+    endif()
+  else()
+    message(
+      STATUS
+        "Skipping setting RPATH for portable_lib on Apple platforms, found: ${_rpath}"
+    )
+  endif()
 
   install(
     TARGETS portable_lib
diff --git a/torch_pin.py b/torch_pin.py
index 24fdaf3766f..5e54c848d13 100644
--- a/torch_pin.py
+++ b/torch_pin.py
@@ -1,2 +1,2 @@
 TORCH_VERSION = "2.10.0"
-NIGHTLY_VERSION = "dev20251012"
+NIGHTLY_VERSION = "dev20251015"

From 1b2200ecc059fee2fce09237f6cd41ad6655e7bc Mon Sep 17 00:00:00 2001
From: Mengwei Liu <mengweiliu600267@gmail.com>
Date: Thu, 16 Oct 2025 22:08:27 -0700
Subject: [PATCH 09/17] Fix lintrunner

---
 backends/cuda/runtime/cuda_backend.cpp             | 3 +--
 examples/models/moshi/mimi/install_requirements.sh | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index 8664891f9f8..b07fb36ea75 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -52,7 +52,6 @@ class ET_EXPERIMENTAL CudaBackend final
   Error load_function_pointers_into_handle(
       void* so_handle,
       AOTIDelegateHandle* handle) const {
-
 #define LOAD_SYMBOL(member, name)                                    \
   do {                                                               \
     auto symbol_res = get_function(so_handle, #name);                \
@@ -75,7 +74,7 @@ class ET_EXPERIMENTAL CudaBackend final
 
     handle->update_constants_from_blob =
         reinterpret_cast<AOTInductorModelUpdateConstantsFromBlobFunc>(
-            dlsym(so_handle, "AOTInductorModelUpdateConstantsFromBlob"));
+            get_function(so_handle, "AOTInductorModelUpdateConstantsFromBlob"));
     if (handle->update_constants_from_blob == nullptr) {
       ET_LOG(
           Info,
diff --git a/examples/models/moshi/mimi/install_requirements.sh b/examples/models/moshi/mimi/install_requirements.sh
index d3efb04448c..003a7af70bd 100755
--- a/examples/models/moshi/mimi/install_requirements.sh
+++ b/examples/models/moshi/mimi/install_requirements.sh
@@ -10,7 +10,7 @@ set -x
 conda install -c conda-forge "ffmpeg<8" -y
 pip install torchcodec==0.7.0.dev20251012 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
 pip install moshi==0.2.4 --no-deps
-pip install bitsandbytes soundfile
+pip install bitsandbytes soundfile einops
 # Run llama2/install requirements for torchao deps
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 bash "$SCRIPT_DIR"/../../llama/install_requirements.sh

From 426cf90d4a071b1dbaf8b324bba7eec3628dc4ea Mon Sep 17 00:00:00 2001
From: Mengwei Liu <mengweiliu600267@gmail.com>
Date: Thu, 16 Oct 2025 22:44:04 -0700
Subject: [PATCH 10/17] Fix cuda_backend

---
 backends/cuda/runtime/cuda_backend.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index b07fb36ea75..dd749040cf1 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -74,7 +74,7 @@ class ET_EXPERIMENTAL CudaBackend final
 
     handle->update_constants_from_blob =
         reinterpret_cast<AOTInductorModelUpdateConstantsFromBlobFunc>(
-            get_function(so_handle, "AOTInductorModelUpdateConstantsFromBlob"));
+            get_function(so_handle, "AOTInductorModelUpdateConstantsFromBlob").get());
     if (handle->update_constants_from_blob == nullptr) {
       ET_LOG(
           Info,

From a51d12e33b40892c464e8a0ee74a0c340f7ceba7 Mon Sep 17 00:00:00 2001
From: Mengwei Liu <mengweiliu600267@gmail.com>
Date: Thu, 16 Oct 2025 23:31:06 -0700
Subject: [PATCH 11/17] Skip portable_lib RPATH on Apple; check get_function result before use

---
 .github/workflows/pull.yml             |  1 +
 CMakeLists.txt                         | 23 +++++++----------------
 backends/cuda/runtime/cuda_backend.cpp | 11 +++++++----
 3 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 8f0d8f6e571..c96b85740bc 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -351,6 +351,7 @@ jobs:
 
         # reinstall executorch
         bash ./install_executorch.sh --minimal
+        pip list
 
         # run python unittest
         python -m unittest examples.models.moshi.mimi.test_mimi
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d32fe55742b..1b96c12fbf3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -890,23 +890,14 @@ if(EXECUTORCH_BUILD_PYBIND)
   # torch/lib. Don't do this to APPLE, as it will error out on the following
   # error:
   #
-  get_target_property(_rpath portable_lib INSTALL_RPATH)
-  if(NOT _rpath)
-    if(APPLE)
-      set_target_properties(
-        portable_lib PROPERTIES BUILD_RPATH "@loader_path/../../../torch/lib"
-                                INSTALL_RPATH "@loader_path/../../../torch/lib"
-      )
-    else()
-      set_target_properties(
-        portable_lib PROPERTIES BUILD_RPATH "$ORIGIN/../../../torch/lib"
-                                INSTALL_RPATH "$ORIGIN/../../../torch/lib"
-      )
-    endif()
+  if(APPLE)
+    # Skip setting @loader_path for APPLE, since it causes error like ld:
+    # duplicate LC_RPATH '@loader_path' in '<site-packages>/torch/lib/
+    # libtorch_cpu.dylib'
   else()
-    message(
-      STATUS
-        "Skipping setting RPATH for portable_lib on Apple platforms, found: ${_rpath}"
+    set_target_properties(
+      portable_lib PROPERTIES BUILD_RPATH "$ORIGIN/../../../torch/lib"
+                              INSTALL_RPATH "$ORIGIN/../../../torch/lib"
     )
   endif()
 
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index dd749040cf1..0cef859ddfb 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -72,10 +72,13 @@ class ET_EXPERIMENTAL CudaBackend final
     LOAD_SYMBOL(run, AOTInductorModelContainerRun);
 #undef LOAD_SYMBOL
 
-    handle->update_constants_from_blob =
-        reinterpret_cast<AOTInductorModelUpdateConstantsFromBlobFunc>(
-            get_function(so_handle, "AOTInductorModelUpdateConstantsFromBlob").get());
-    if (handle->update_constants_from_blob == nullptr) {
+    auto symbol_res =
+        get_function(so_handle, "AOTInductorModelUpdateConstantsFromBlob");
+    if (symbol_res.ok()) {
+      handle->update_constants_from_blob =
+          reinterpret_cast<AOTInductorModelUpdateConstantsFromBlobFunc>(
+              symbol_res.get());
+    } else {
       ET_LOG(
           Info,
           "Failed to load AOTInductorModelUpdateConstantsFromBlob. This .so is probably compiled on an old version of torch (<2.9.0)");

From fdd8333637593f1437e6e75a33cc58bef6d2cd5d Mon Sep 17 00:00:00 2001
From: Mengwei Liu <mengweiliu600267@gmail.com>
Date: Thu, 16 Oct 2025 23:48:39 -0700
Subject: [PATCH 12/17] Update pytorch pin

---
 .ci/docker/ci_commit_pins/pytorch.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt
index cbc25dfd058..a8de771a69d 100644
--- a/.ci/docker/ci_commit_pins/pytorch.txt
+++ b/.ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
-4f8a986b8feb4a171b8a68a2a3664275ec54a75f
+e6f766c7d750d40603eee3f66c5915bac606b3ea

From a5a9a543150dc021ddcef9d091f4453c2884a98b Mon Sep 17 00:00:00 2001
From: Mengwei Liu <mengweiliu600267@gmail.com>
Date: Fri, 17 Oct 2025 01:00:32 -0700
Subject: [PATCH 13/17] Fix moshi

---
 examples/models/moshi/mimi/install_requirements.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/models/moshi/mimi/install_requirements.sh b/examples/models/moshi/mimi/install_requirements.sh
index 003a7af70bd..5f2f3ac04c7 100755
--- a/examples/models/moshi/mimi/install_requirements.sh
+++ b/examples/models/moshi/mimi/install_requirements.sh
@@ -9,7 +9,7 @@ set -x
 
 conda install -c conda-forge "ffmpeg<8" -y
 pip install torchcodec==0.7.0.dev20251012 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
-pip install moshi==0.2.4 --no-deps
+pip install moshi==0.2.11 --no-deps
 pip install bitsandbytes soundfile einops
 # Run llama2/install requirements for torchao deps
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

From 952e548329aff20609c5837860b6239c7532eb6f Mon Sep 17 00:00:00 2001
From: Mengwei Liu <mengweiliu600267@gmail.com>
Date: Fri, 17 Oct 2025 10:25:30 -0700
Subject: [PATCH 14/17] Comment out cmake rpath tweak on macOS; install moshi with its deps

---
 .ci/scripts/setup-macos.sh                         | 2 +-
 examples/models/moshi/mimi/install_requirements.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.ci/scripts/setup-macos.sh b/.ci/scripts/setup-macos.sh
index 4b43a730710..09c0ee8fe69 100755
--- a/.ci/scripts/setup-macos.sh
+++ b/.ci/scripts/setup-macos.sh
@@ -99,7 +99,7 @@ print_cmake_info() {
   # Explicitly add conda env lib folder to cmake rpath to address the flaky issue
   # where cmake dependencies couldn't be found. This seems to point to how conda
   # links $CMAKE_EXEC to its package cache when cloning a new environment
-  install_name_tool -add_rpath @executable_path/../lib "${CMAKE_EXEC}" || true
+  # install_name_tool -add_rpath @executable_path/../lib "${CMAKE_EXEC}" || true
   # Adding the rpath will invalidate cmake signature, so signing it again here
   # to trust the executable. EXC_BAD_ACCESS (SIGKILL (Code Signature Invalid))
   # with an exit code 137 otherwise
diff --git a/examples/models/moshi/mimi/install_requirements.sh b/examples/models/moshi/mimi/install_requirements.sh
index 5f2f3ac04c7..bddd960f8a7 100755
--- a/examples/models/moshi/mimi/install_requirements.sh
+++ b/examples/models/moshi/mimi/install_requirements.sh
@@ -9,7 +9,7 @@ set -x
 
 conda install -c conda-forge "ffmpeg<8" -y
 pip install torchcodec==0.7.0.dev20251012 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
-pip install moshi==0.2.11 --no-deps
+pip install moshi==0.2.11
 pip install bitsandbytes soundfile einops
 # Run llama2/install requirements for torchao deps
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

From 79ff7c98df8348ab49bfaf1e47cfb2b5d729aaa4 Mon Sep 17 00:00:00 2001
From: Mengwei Liu <mengweiliu600267@gmail.com>
Date: Fri, 17 Oct 2025 10:47:29 -0700
Subject: [PATCH 15/17] Try removing duplicate @loader_path rpaths on macOS

---
 .ci/scripts/setup-macos.sh |  2 +-
 .ci/scripts/utils.sh       | 46 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/.ci/scripts/setup-macos.sh b/.ci/scripts/setup-macos.sh
index 09c0ee8fe69..4b43a730710 100755
--- a/.ci/scripts/setup-macos.sh
+++ b/.ci/scripts/setup-macos.sh
@@ -99,7 +99,7 @@ print_cmake_info() {
   # Explicitly add conda env lib folder to cmake rpath to address the flaky issue
   # where cmake dependencies couldn't be found. This seems to point to how conda
   # links $CMAKE_EXEC to its package cache when cloning a new environment
-  # install_name_tool -add_rpath @executable_path/../lib "${CMAKE_EXEC}" || true
+  install_name_tool -add_rpath @executable_path/../lib "${CMAKE_EXEC}" || true
   # Adding the rpath will invalidate cmake signature, so signing it again here
   # to trust the executable. EXC_BAD_ACCESS (SIGKILL (Code Signature Invalid))
   # with an exit code 137 otherwise
diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh
index f896d3f1d40..d53c46ad459 100644
--- a/.ci/scripts/utils.sh
+++ b/.ci/scripts/utils.sh
@@ -44,6 +44,51 @@ install_pip_dependencies() {
   popd || return
 }
 
+dedupe_macos_loader_path_rpaths() {
+  if [[ "$(uname)" != "Darwin" ]]; then
+    return
+  fi
+
+  local torch_lib_dir
+  torch_lib_dir=$(python - <<'PY'
+import os
+try:
+    import torch
+except Exception:
+    raise SystemExit(0)
+
+print(os.path.join(os.path.dirname(torch.__file__), "lib"))
+PY
+)
+
+  if [[ -z "${torch_lib_dir}" || ! -d "${torch_lib_dir}" ]]; then
+    return
+  fi
+
+  local torch_libs=(
+    "libtorch_cpu.dylib"
+    "libtorch.dylib"
+    "libc10.dylib"
+  )
+
+  for lib_name in "${torch_libs[@]}"; do
+    local lib_path="${torch_lib_dir}/${lib_name}"
+    if [[ ! -f "${lib_path}" ]]; then
+      continue
+    fi
+
+    local removed=0
+    # Repeatedly remove the @loader_path rpath entries until none remain.
+    while install_name_tool -delete_rpath @loader_path "${lib_path}" 2>/dev/null; do
+      removed=1
+    done
+
+    if [[ "${removed}" == "1" ]]; then
+      install_name_tool -add_rpath @loader_path "${lib_path}" || true
+    fi
+  done
+}
+
 install_domains() {
   echo "Install torchvision and torchaudio"
   pip install --no-use-pep517 --user "git+https://github.com/pytorch/audio.git@${TORCHAUDIO_VERSION}"
@@ -101,6 +146,7 @@ install_pytorch_and_domains() {
     echo "Use cached wheel at ${cached_torch_wheel}"
   fi
 
+  dedupe_macos_loader_path_rpaths
   # Grab the pinned audio and vision commits from PyTorch
   TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt)
   export TORCHAUDIO_VERSION

From d62a7017969d95fe6512550722e72605d42f2310 Mon Sep 17 00:00:00 2001
From: Mengwei Liu <mengweiliu600267@gmail.com>
Date: Fri, 17 Oct 2025 11:05:29 -0700
Subject: [PATCH 16/17] Simplify torch lib dir lookup in dedupe_macos_loader_path_rpaths

---
 .ci/scripts/utils.sh | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh
index d53c46ad459..bc35aec2b4d 100644
--- a/.ci/scripts/utils.sh
+++ b/.ci/scripts/utils.sh
@@ -50,16 +50,7 @@ dedupe_macos_loader_path_rpaths() {
   fi
 
   local torch_lib_dir
-  torch_lib_dir=$(python - <<'PY'
-import os
-try:
-    import torch
-except Exception:
-    raise SystemExit(0)
-
-print(os.path.join(os.path.dirname(torch.__file__), "lib"))
-PY
-)
+  torch_lib_dir=$(python -c "import importlib.util; print(importlib.util.find_spec('torch').submodule_search_locations[0])")/lib
 
   if [[ -z "${torch_lib_dir}" || ! -d "${torch_lib_dir}" ]]; then
     return

From 989def38e726b3f3066d741684c59967b85d85c0 Mon Sep 17 00:00:00 2001
From: Mengwei Liu <mengweiliu600267@gmail.com>
Date: Fri, 17 Oct 2025 11:29:59 -0700
Subject: [PATCH 17/17] Resolve torch lib dir from the parent directory

---
 .ci/scripts/utils.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh
index bc35aec2b4d..8f48e75e712 100644
--- a/.ci/scripts/utils.sh
+++ b/.ci/scripts/utils.sh
@@ -50,8 +50,10 @@ dedupe_macos_loader_path_rpaths() {
   fi
 
   local torch_lib_dir
+  pushd .. || return
   torch_lib_dir=$(python -c "import importlib.util; print(importlib.util.find_spec('torch').submodule_search_locations[0])")/lib
-
+  popd || return
+
   if [[ -z "${torch_lib_dir}" || ! -d "${torch_lib_dir}" ]]; then
     return
   fi