1 change: 1 addition & 0 deletions backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp
@@ -207,6 +207,7 @@ PYBIND11_MODULE(PyQnnManagerAdaptor, m) {
.def("Init", &PyQnnManager::Init)
.def("InitBackend", &PyQnnManager::InitBackend)
.def("InitContext", &PyQnnManager::InitContext)
.def("InitContextCache", &PyQnnManager::InitContextCache)
.def("IsNodeSupportedByBackend", &PyQnnManager::IsNodeSupportedByBackend)
.def(
"Compile",
4 changes: 4 additions & 0 deletions backends/qualcomm/aot/python/PyQnnManagerAdaptor.h
@@ -236,6 +236,10 @@ class PyQnnManager {
return qnn_manager_->InitContext(std::optional{graph_names});
}

executorch::runtime::Error InitContextCache() {
return qnn_manager_->InitContextCache();
}

bool IsNodeSupportedByBackend(
std::vector<std::shared_ptr<OpWrapper>>& op_wrappers) {
return qnn_manager_->IsNodeSupportedByBackend(op_wrappers);
23 changes: 15 additions & 8 deletions backends/qualcomm/quantizer/qconfig.py
@@ -114,14 +114,21 @@ def get_8a8w_qnn_ptq_config(
# the smallest scale defaults to DEFAULT_EPS_8BIT
extra_args: Dict[str, Any] = {"eps": eps if eps else DEFAULT_EPS_8BIT}

act_quantization_spec = QuantizationSpec(
dtype=torch.uint8,
qscheme=(
torch.per_tensor_symmetric if act_symmetric else torch.per_tensor_affine
),
ch_axis=0,
observer_or_fake_quant_ctr=act_observer.with_args(**extra_args),
)
if act_symmetric:
act_quantization_spec = QuantizationSpec(
dtype=torch.uint8,
qscheme=(torch.per_tensor_symmetric),
ch_axis=0,
observer_or_fake_quant_ctr=act_observer.with_args(**extra_args),
)
else:
act_quantization_spec = QuantizationSpec(
dtype=torch.uint8,
quant_min=torch.iinfo(torch.uint8).min,
quant_max=torch.iinfo(torch.uint8).max,
qscheme=(torch.per_tensor_affine),
observer_or_fake_quant_ctr=act_observer.with_args(**extra_args),
)

weight_quantization_spec = QuantizationSpec(
dtype=torch.int8,
31 changes: 31 additions & 0 deletions backends/qualcomm/runtime/QnnManager.cpp
@@ -296,6 +296,37 @@ Error QnnManager::InitContext(
return Error::Ok;
}

Error QnnManager::InitContextCache() {
if (backend_params_ptr_->backend_init_state_ ==
BackendInitializeState::UNINITIALIZED) {
QNN_EXECUTORCH_LOG_INFO(
"Initialize Qnn backend "
"parameters for Qnn executorch backend type %d",
options_->backend_options()->backend_type());
backend_params_ptr_ = QnnBackendFactory().Create(
backend_bundle_ptr_->implementation.get(),
backend_bundle_ptr_->qnn_backend_ptr.get(),
backend_bundle_ptr_->qnn_device_ptr.get(),
qnn_context_blob_,
options_,
qnn_dlc_manager_.get());
ET_CHECK_OR_RETURN_ERROR(
backend_params_ptr_ != nullptr,
Internal,
"Failed to load Qnn backend.");
// Note: For online_prepare or deserialization, the graph name will be
// obtained from the binary.
ET_CHECK_OR_RETURN_ERROR(
backend_params_ptr_->qnn_backend_cache_ptr_->Configure({}) == Error::Ok,
Internal,
"Fail to configure Qnn backend cache");

backend_params_ptr_->backend_init_state_ =
BackendInitializeState::INITIALIZED;
}
return Error::Ok;
}

Error QnnManager::AllocateTensor(const std::string& graph_name) {
std::vector<Qnn_Tensor_t> input_tensors =
backend_params_ptr_->qnn_context_ptr_->GetGraphInputs(graph_name);
3 changes: 3 additions & 0 deletions backends/qualcomm/runtime/QnnManager.h
@@ -38,6 +38,9 @@ class QnnManager {
// graph name will be obtained from the binary.
executorch::runtime::Error InitContext(
std::optional<std::vector<std::string>> graph_names = std::nullopt);
// This function only initializes the context cache to get the spill fill
// buffer size.
executorch::runtime::Error InitContextCache();
executorch::runtime::Error AllocateTensor(const std::string& graph_name);
executorch::runtime::Error AllocateTensor(
const std::string& graph_name,
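For readers, a minimal Python sketch of the call sequence this new API enables for querying the spill-fill buffer size ahead of time. It mirrors the utils.py hunk later in this diff; the import path and the compile_spec_value/processed_bytes inputs are illustrative, not part of this PR.

```python
# Illustrative only: mirrors the call sequence adopted in
# backends/qualcomm/utils/utils.py below. `compile_spec_value` and
# `processed_bytes` come from an already lowered QNN module, and the
# import path is an assumption.
from executorch.backends.qualcomm.python import PyQnnManagerAdaptor

qnn_mgr = PyQnnManagerAdaptor.QnnManager(compile_spec_value, processed_bytes)
assert qnn_mgr.InitBackend().value == 0, "failed to initialize backend"
# InitContextCache only configures the backend cache, which is enough to
# read the spill-fill buffer size without fully initializing the context.
assert qnn_mgr.InitContextCache().value == 0, "failed to init context cache"
spill_fill_size = qnn_mgr.GetSpillFillBufferSize()
qnn_mgr.Destroy()
```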
3 changes: 3 additions & 0 deletions backends/qualcomm/runtime/backends/QnnBackendFactory.cpp
@@ -51,6 +51,9 @@ std::unique_ptr<BackendConfigParameters> QnnBackendFactory::Create(
"use_dlbc in htp_options: %d", htp_options->use_dlbc());
QNN_EXECUTORCH_LOG_INFO(
"use_fold_relu in htp_options: %d", htp_options->use_fold_relu());
QNN_EXECUTORCH_LOG_INFO(
"use_slc_allocator in htp_options: %d",
htp_options->use_slc_allocator());
QNN_EXECUTORCH_LOG_INFO(
"use_multi_contexts in htp_options: %d",
htp_options->use_multi_contexts());
32 changes: 24 additions & 8 deletions backends/qualcomm/runtime/backends/htp/HtpBackendCache.cpp
@@ -18,7 +18,9 @@ Error HtpBackendCache::RetrieveBackendBinaryInfo(
const QnnSystemContext_BinaryInfo_t* binaryinfo) {
QnnHtpSystemContext_HwBlobInfo_t* htp_hwblobinfo = nullptr;
#if (QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 21)
QnnHtpSystemContext_GraphBlobInfo_t* htp_graphblobinfo = nullptr;
std::vector<QnnHtpSystemContext_GraphBlobInfo_t*> htp_graphblobinfos;
std::uint32_t num_graphs;

#endif

if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_1) {
@@ -29,8 +31,13 @@ Error HtpBackendCache::RetrieveBackendBinaryInfo(
binaryinfo->contextBinaryInfoV2.hwInfoBlob);
#if (QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 21)
} else if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_3) {
htp_graphblobinfo = static_cast<QnnHtpSystemContext_GraphBlobInfo_t*>(
binaryinfo->contextBinaryInfoV3.graphs->graphInfoV3.graphBlobInfo);
num_graphs = binaryinfo->contextBinaryInfoV3.numGraphs;
for (size_t i = 0; i < num_graphs; ++i) {
htp_graphblobinfos.push_back(
static_cast<QnnHtpSystemContext_GraphBlobInfo_t*>(
binaryinfo->contextBinaryInfoV3.graphs[i]
.graphInfoV3.graphBlobInfo));
}
#endif
} else {
QNN_EXECUTORCH_LOG_WARN(
@@ -51,15 +58,24 @@
}

#if (QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 21)
if (htp_graphblobinfo) {
if (htp_graphblobinfo->version ==
if (htp_graphblobinfos.size() > 0) {
// After version 2.21, we need to get spill fill buffer size from graph
// blob info instead of hw blob info. If there are multiple graphs, we
// should use the max value among all graphs.
if (htp_graphblobinfos[0]->version ==
QNN_SYSTEM_CONTEXT_HTP_GRAPH_INFO_BLOB_VERSION_V1) {
spill_fill_buf_ =
(*htp_graphblobinfo).contextBinaryGraphBlobInfoV1.spillFillBufferSize;
for (size_t i = 0; i < num_graphs; ++i) {
uint64_t spill_fill_buf =
(*htp_graphblobinfos[i])
.contextBinaryGraphBlobInfoV1.spillFillBufferSize;
if (spill_fill_buf > spill_fill_buf_) {
spill_fill_buf_ = spill_fill_buf;
}
}
} else {
QNN_EXECUTORCH_LOG_WARN(
"Unknown QNN Htp graph blob info version %d.",
htp_graphblobinfo->version);
htp_graphblobinfos[0]->version);
return Error::Internal;
}
}
@@ -70,6 +70,14 @@ HtpGraphCustomConfig::CreateGraphCustomConfigCommon(
htp_options_->use_dlbc() ? 1.0 : 0.0;
ret.push_back(static_cast<QnnGraph_CustomConfig_t>(p_custom_config));

p_custom_config = AllocGraphCustomConfig();
p_custom_config->option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
p_custom_config->optimizationOption.type =
QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_SLC_ALLOCATOR;
p_custom_config->optimizationOption.floatValue =
htp_options_->use_slc_allocator() ? 1.0 : 0.0;
ret.push_back(static_cast<QnnGraph_CustomConfig_t>(p_custom_config));

return ret;
}
} // namespace qnn
5 changes: 5 additions & 0 deletions backends/qualcomm/serialization/qc_compiler_spec.fbs
@@ -181,6 +181,11 @@ table QnnExecuTorchHtpBackendOptions {
/// When multiple graphs appear inside the same context,
/// weights could be reused across all graphs.
use_weight_sharing:bool;

/// Allows the user to enable the System Level Cache Allocator for a given graph.
/// It can help by reducing overall bandwidth for the use case.
/// The feature is only supported by specific SoCs.
use_slc_allocator:bool;
Review thread on use_slc_allocator (a usage sketch follows at the end of this file's diff):

Contributor: Looks like we're adding a new feature, can we update the README regarding how to use it?

Contributor: Possibly a bit more explanation on System Level Cache Allocator.

Collaborator (Author): Yes, this is a new feature. Users just need to set use_slc_allocator=True in compile_spec to enable it.
https://github.com/pytorch/executorch/pull/17302/changes#diff-0439f6a7c1a3a3cfb222cd6409b6754f17a1ce782dd231de1d12bbf957d588f7R1000

System Level Cache Allocator is a shared cache at the system level of a SoC, serving as the last caching layer before external DDR memory. Its primary purpose is to optimize memory bandwidth, thereby potentially improving performance and reducing power consumption. However, it is model-dependent, so it is not guaranteed to be effective in all cases.

}

/// Logging level of the delegate and QNN backend.
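As context for the review thread above, here is a rough sketch of how the new flag is enabled from Python. Only use_slc_allocator is introduced by this PR; the generate_qnn_executorch_compiler_spec helper, the QcomChipset enum, the import paths, and the chosen SoC are assumptions based on the existing backends/qualcomm utilities.

```python
# Rough sketch, not part of this PR. Only use_slc_allocator is new here;
# helper names, import paths, and the target SoC are assumptions.
from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset
from executorch.backends.qualcomm.utils.utils import (
    generate_htp_compiler_spec,
    generate_qnn_executorch_compiler_spec,
)

backend_options = generate_htp_compiler_spec(
    use_fp16=True,           # existing option, shown for completeness
    use_slc_allocator=True,  # new flag; only takes effect on SoCs with SLC support
)
compile_spec = generate_qnn_executorch_compiler_spec(
    soc_model=QcomChipset.SM8650,    # illustrative target SoC
    backend_options=backend_options,
)
# compile_spec is then passed to the QNN partitioner / lowering flow as usual.
```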
1 change: 1 addition & 0 deletions backends/qualcomm/serialization/qc_schema.py
@@ -166,6 +166,7 @@ class QnnExecuTorchHtpBackendOptions:
use_fold_relu: bool = True
use_multi_contexts: bool = False
use_weight_sharing: bool = False
use_slc_allocator: bool = False


@unique
15 changes: 13 additions & 2 deletions backends/qualcomm/utils/utils.py
@@ -244,7 +244,12 @@ def process_exported_program(prog):
qnn_mgr = PyQnnManagerAdaptor.QnnManager(
m.compile_specs[0].value, m.processed_bytes
)
assert qnn_mgr.Init().value == 0, "failed to load context binary"
assert (
qnn_mgr.InitBackend().value == 0
), "failed to initialize backend"
assert (
qnn_mgr.InitContextCache().value == 0
), "failed to init context cache"
max_sf_buf_size = max(
max_sf_buf_size, qnn_mgr.GetSpillFillBufferSize()
)
@@ -256,7 +261,8 @@ def process_lowered_module(module):
qnn_mgr = PyQnnManagerAdaptor.QnnManager(
module.compile_specs[0].value, module.processed_bytes
)
assert qnn_mgr.Init().value == 0, "failed to load context binary"
assert qnn_mgr.InitBackend().value == 0, "failed to initialize backend"
assert qnn_mgr.InitContextCache().value == 0, "failed to init context cache"
spill_fill_size = qnn_mgr.GetSpillFillBufferSize()
qnn_mgr.Destroy()
return spill_fill_size, {
@@ -991,6 +997,7 @@ def generate_htp_compiler_spec(
use_dlbc: bool = False,
use_multi_contexts: bool = False,
use_weight_sharing: bool = False,
use_slc_allocator: bool = False,
) -> QnnExecuTorchBackendOptions:
"""
Helper function generating backend options for QNN HTP
@@ -1006,6 +1013,9 @@
could be re-used across all the splits.
use_weight_sharing: Used with multiple_graphs, where model size will be
reduced when operations have the same weights across multiple graphs.
use_slc_allocator: Allows the user to enable the System Level Cache Allocator for a given graph.
It can help by reducing overall bandwidth for the use case.
The feature is only supported by specific SoCs.

Returns:
QnnExecuTorchHtpBackendOptions: backend options for QNN HTP.
@@ -1023,6 +1033,7 @@
htp_options.use_multi_contexts = use_multi_contexts
htp_options.use_weight_sharing = use_weight_sharing
htp_options.use_dlbc = use_dlbc
htp_options.use_slc_allocator = use_slc_allocator
return QnnExecuTorchBackendOptions(
backend_type=QnnExecuTorchBackendType.kHtpBackend,
htp_options=htp_options,
20 changes: 20 additions & 0 deletions examples/qualcomm/oss_scripts/llama/decoder_utils.py
@@ -373,6 +373,26 @@ def smart_mask_updater(
return pos, k_caches, v_caches


def evict_tokens(
ar_len: int,
atten_mask: AttentionMask,
pos,
k_caches,
v_caches,
rope_module,
position_shift,
):
max_cache_len = k_caches[0].size(-1)
shifted_pos = pos + position_shift
if shifted_pos + ar_len > max_cache_len:
num_to_evict = rope_module.eviction_batch_size
k_caches, v_caches = rope_module(k_caches, v_caches)
position_shift -= num_to_evict
shifted_pos -= num_to_evict
atten_mask.smart_mask_init(shifted_pos)
return k_caches, v_caches, position_shift


def _prefill_chunking(
inputs: DecoderInputs,
module: torch.fx.GraphModule,
2 changes: 1 addition & 1 deletion examples/qualcomm/oss_scripts/llama/eval_llama_qnn.py
@@ -418,7 +418,7 @@ def eval_llama_with_attention_sink(args):
layer.feed_forward.prepare_feedfoward_conv()
model = convert_linear_to_conv2d(model)

_, atten_mask, _, k_caches, v_caches = model.get_example_inputs(use_kv_cache=True)
_, atten_mask, _, k_caches, v_caches = model.get_example_inputs()
eval_data = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

neg_log_likelihoods = []
@@ -824,9 +824,9 @@ def compile(self, request: Request): # noqa: C901
skip_node_op_set={"llama.fallback.default"},
)

if self.config.num_sharding > 1 and self.control_args.model_mode == "kv":
# weight-sharing based context binaries cannot be opened in x86 host
update_spill_fill_size(edge_prog_mgr.exported_program("kv_forward"))
if self.config.num_sharding > 1:
for graph_name in graph_names:
update_spill_fill_size(edge_prog_mgr.exported_program(graph_name))

if self.control_args.verbose:
for ep in edge_prog_mgr._edge_programs.values():