diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp
index 1f45f062cfb..6a4757089fb 100644
--- a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp
+++ b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp
@@ -207,6 +207,7 @@ PYBIND11_MODULE(PyQnnManagerAdaptor, m) {
       .def("Init", &PyQnnManager::Init)
      .def("InitBackend", &PyQnnManager::InitBackend)
      .def("InitContext", &PyQnnManager::InitContext)
+      .def("InitContextCache", &PyQnnManager::InitContextCache)
      .def("IsNodeSupportedByBackend", &PyQnnManager::IsNodeSupportedByBackend)
      .def(
          "Compile",
diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h
index c1434db5573..e1850bc4fa8 100644
--- a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h
+++ b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h
@@ -236,6 +236,10 @@ class PyQnnManager {
     return qnn_manager_->InitContext(std::optional{graph_names});
   }

+  executorch::runtime::Error InitContextCache() {
+    return qnn_manager_->InitContextCache();
+  }
+
   bool IsNodeSupportedByBackend(
       std::vector>& op_wrappers) {
     return qnn_manager_->IsNodeSupportedByBackend(op_wrappers);
diff --git a/backends/qualcomm/quantizer/qconfig.py b/backends/qualcomm/quantizer/qconfig.py
index 7b46f9d0d9b..f281692a2d4 100644
--- a/backends/qualcomm/quantizer/qconfig.py
+++ b/backends/qualcomm/quantizer/qconfig.py
@@ -114,14 +114,21 @@ def get_8a8w_qnn_ptq_config(
     # the smallest scale defaults to DEFAULT_EPS_8BIT
     extra_args: Dict[str, Any] = {"eps": eps if eps else DEFAULT_EPS_8BIT}

-    act_quantization_spec = QuantizationSpec(
-        dtype=torch.uint8,
-        qscheme=(
-            torch.per_tensor_symmetric if act_symmetric else torch.per_tensor_affine
-        ),
-        ch_axis=0,
-        observer_or_fake_quant_ctr=act_observer.with_args(**extra_args),
-    )
+    if act_symmetric:
+        act_quantization_spec = QuantizationSpec(
+            dtype=torch.uint8,
+            qscheme=(torch.per_tensor_symmetric),
+            ch_axis=0,
+            observer_or_fake_quant_ctr=act_observer.with_args(**extra_args),
+        )
+    else:
+        act_quantization_spec = QuantizationSpec(
+            dtype=torch.uint8,
+            quant_min=torch.iinfo(torch.uint8).min,
+            quant_max=torch.iinfo(torch.uint8).max,
+            qscheme=(torch.per_tensor_affine),
+            observer_or_fake_quant_ctr=act_observer.with_args(**extra_args),
+        )

     weight_quantization_spec = QuantizationSpec(
         dtype=torch.int8,
diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp
index e64d58a8971..871ae1925e4 100644
--- a/backends/qualcomm/runtime/QnnManager.cpp
+++ b/backends/qualcomm/runtime/QnnManager.cpp
@@ -296,6 +296,37 @@ Error QnnManager::InitContext(
   return Error::Ok;
 }

+Error QnnManager::InitContextCache() {
+  if (backend_params_ptr_->backend_init_state_ ==
+      BackendInitializeState::UNINITIALIZED) {
+    QNN_EXECUTORCH_LOG_INFO(
+        "Initialize Qnn backend "
+        "parameters for Qnn executorch backend type %d",
+        options_->backend_options()->backend_type());
+    backend_params_ptr_ = QnnBackendFactory().Create(
+        backend_bundle_ptr_->implementation.get(),
+        backend_bundle_ptr_->qnn_backend_ptr.get(),
+        backend_bundle_ptr_->qnn_device_ptr.get(),
+        qnn_context_blob_,
+        options_,
+        qnn_dlc_manager_.get());
+    ET_CHECK_OR_RETURN_ERROR(
+        backend_params_ptr_ != nullptr,
+        Internal,
+        "Failed to load Qnn backend.");
+    // Note: For online_prepare or deserialization, the graph name will be
+    // obtained from the binary.
+    ET_CHECK_OR_RETURN_ERROR(
+        backend_params_ptr_->qnn_backend_cache_ptr_->Configure({}) == Error::Ok,
+        Internal,
+        "Failed to configure Qnn backend cache");
+
+    backend_params_ptr_->backend_init_state_ =
+        BackendInitializeState::INITIALIZED;
+  }
+  return Error::Ok;
+}
+
 Error QnnManager::AllocateTensor(const std::string& graph_name) {
   std::vector input_tensors =
       backend_params_ptr_->qnn_context_ptr_->GetGraphInputs(graph_name);
diff --git a/backends/qualcomm/runtime/QnnManager.h b/backends/qualcomm/runtime/QnnManager.h
index 866a4edbad6..ef0385c517b 100644
--- a/backends/qualcomm/runtime/QnnManager.h
+++ b/backends/qualcomm/runtime/QnnManager.h
@@ -38,6 +38,9 @@ class QnnManager {
   // graph name will be obtained from the binary.
   executorch::runtime::Error InitContext(
       std::optional> graph_names = std::nullopt);
+  // This function only initializes the context cache to get the spill fill
+  // buffer size.
+  executorch::runtime::Error InitContextCache();
   executorch::runtime::Error AllocateTensor(const std::string& graph_name);
   executorch::runtime::Error AllocateTensor(
       const std::string& graph_name,
diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp
index 9c559d83fcc..404ebff88af 100644
--- a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp
+++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp
@@ -51,6 +51,9 @@ std::unique_ptr QnnBackendFactory::Create(
           "use_dlbc in htp_options: %d", htp_options->use_dlbc());
      QNN_EXECUTORCH_LOG_INFO(
          "use_fold_relu in htp_options: %d", htp_options->use_fold_relu());
+      QNN_EXECUTORCH_LOG_INFO(
+          "use_slc_allocator in htp_options: %d",
+          htp_options->use_slc_allocator());
      QNN_EXECUTORCH_LOG_INFO(
          "use_multi_contexts in htp_options: %d",
          htp_options->use_multi_contexts());
diff --git a/backends/qualcomm/runtime/backends/htp/HtpBackendCache.cpp b/backends/qualcomm/runtime/backends/htp/HtpBackendCache.cpp
index 3038a100d03..be95c9cfdbf 100644
--- a/backends/qualcomm/runtime/backends/htp/HtpBackendCache.cpp
+++ b/backends/qualcomm/runtime/backends/htp/HtpBackendCache.cpp
@@ -18,7 +18,9 @@ Error HtpBackendCache::RetrieveBackendBinaryInfo(
     const QnnSystemContext_BinaryInfo_t* binaryinfo) {
   QnnHtpSystemContext_HwBlobInfo_t* htp_hwblobinfo = nullptr;
 #if (QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 21)
-  QnnHtpSystemContext_GraphBlobInfo_t* htp_graphblobinfo = nullptr;
+  std::vector htp_graphblobinfos;
+  std::uint32_t num_graphs = 0;
+
 #endif

   if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_1) {
@@ -29,8 +31,13 @@ Error HtpBackendCache::RetrieveBackendBinaryInfo(
         binaryinfo->contextBinaryInfoV2.hwInfoBlob);
 #if (QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 21)
   } else if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_3) {
-    htp_graphblobinfo = static_cast(
-        binaryinfo->contextBinaryInfoV3.graphs->graphInfoV3.graphBlobInfo);
+    num_graphs = binaryinfo->contextBinaryInfoV3.numGraphs;
+    for (size_t i = 0; i < num_graphs; ++i) {
+      htp_graphblobinfos.push_back(
+          static_cast(
+              binaryinfo->contextBinaryInfoV3.graphs[i]
+                  .graphInfoV3.graphBlobInfo));
+    }
 #endif
   } else {
     QNN_EXECUTORCH_LOG_WARN(
@@ -51,15 +58,24 @@ Error HtpBackendCache::RetrieveBackendBinaryInfo(
   }

 #if (QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 21)
-  if (htp_graphblobinfo) {
-    if (htp_graphblobinfo->version ==
+  if (htp_graphblobinfos.size() > 0) {
+    // After version 2.21, we need to get the spill fill buffer size from the
+    // graph blob info instead of the hw blob info. If there are multiple
+    // graphs, we should use the max value among all graphs.
+    if (htp_graphblobinfos[0]->version ==
         QNN_SYSTEM_CONTEXT_HTP_GRAPH_INFO_BLOB_VERSION_V1) {
-      spill_fill_buf_ =
-          (*htp_graphblobinfo).contextBinaryGraphBlobInfoV1.spillFillBufferSize;
+      for (size_t i = 0; i < num_graphs; ++i) {
+        uint64_t spill_fill_buf =
+            (*htp_graphblobinfos[i])
+                .contextBinaryGraphBlobInfoV1.spillFillBufferSize;
+        if (spill_fill_buf > spill_fill_buf_) {
+          spill_fill_buf_ = spill_fill_buf;
+        }
+      }
     } else {
       QNN_EXECUTORCH_LOG_WARN(
           "Unknown QNN Htp graph blob info version %d.",
-          htp_graphblobinfo->version);
+          htp_graphblobinfos[0]->version);
       return Error::Internal;
     }
   }
diff --git a/backends/qualcomm/runtime/backends/htp/HtpGraphCustomConfig.cpp b/backends/qualcomm/runtime/backends/htp/HtpGraphCustomConfig.cpp
index 17b8438880d..3e8dd5201c6 100644
--- a/backends/qualcomm/runtime/backends/htp/HtpGraphCustomConfig.cpp
+++ b/backends/qualcomm/runtime/backends/htp/HtpGraphCustomConfig.cpp
@@ -70,6 +70,14 @@ HtpGraphCustomConfig::CreateGraphCustomConfigCommon(
       htp_options_->use_dlbc() ? 1.0 : 0.0;
   ret.push_back(static_cast(p_custom_config));

+  p_custom_config = AllocGraphCustomConfig();
+  p_custom_config->option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
+  p_custom_config->optimizationOption.type =
+      QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_SLC_ALLOCATOR;
+  p_custom_config->optimizationOption.floatValue =
+      htp_options_->use_slc_allocator() ? 1.0 : 0.0;
+  ret.push_back(static_cast(p_custom_config));
+
   return ret;
 }
 } // namespace qnn
diff --git a/backends/qualcomm/serialization/qc_compiler_spec.fbs b/backends/qualcomm/serialization/qc_compiler_spec.fbs
index 4a42cfd6c57..ff224f932ed 100644
--- a/backends/qualcomm/serialization/qc_compiler_spec.fbs
+++ b/backends/qualcomm/serialization/qc_compiler_spec.fbs
@@ -181,6 +181,11 @@ table QnnExecuTorchHtpBackendOptions {
   /// When multiple graphs appear inside the same context,
   /// weights could be reused across all graphs.
   use_weight_sharing:bool;
+
+  /// Allows the user to enable the System Level Cache (SLC) allocator for a given graph.
+  /// It helps by reducing overall bandwidth for the use case.
+  /// The feature is only supported on specific SoCs.
+  use_slc_allocator:bool;
 }

 /// Logging level of the delegate and QNN backend.
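For context, the new HTP option flows from the flatbuffer schema above into the graph custom config at delegation time. A minimal usage sketch from the Python side, assuming the pre-existing use_fp16 parameter and compile-spec helpers of backends/qualcomm/utils/utils.py (only use_slc_allocator is added by this patch):

    # Hypothetical example: build HTP backend options with the SLC allocator enabled.
    from executorch.backends.qualcomm.utils.utils import generate_htp_compiler_spec

    backend_options = generate_htp_compiler_spec(
        use_fp16=True,  # assumed pre-existing parameter, not part of this patch
        use_dlbc=False,
        use_multi_contexts=False,
        use_weight_sharing=False,
        use_slc_allocator=True,  # new flag; only honored on SoCs that support the SLC allocator
    )
    # backend_options is then passed to the usual QNN compile-spec generation unchanged.
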
diff --git a/backends/qualcomm/serialization/qc_schema.py b/backends/qualcomm/serialization/qc_schema.py
index 3f8962c8eca..626522f461c 100644
--- a/backends/qualcomm/serialization/qc_schema.py
+++ b/backends/qualcomm/serialization/qc_schema.py
@@ -166,6 +166,7 @@ class QnnExecuTorchHtpBackendOptions:
     use_fold_relu: bool = True
     use_multi_contexts: bool = False
     use_weight_sharing: bool = False
+    use_slc_allocator: bool = False


 @unique
diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py
index 9d38c0f8867..62a8a55c9f5 100644
--- a/backends/qualcomm/utils/utils.py
+++ b/backends/qualcomm/utils/utils.py
@@ -244,7 +244,12 @@ def process_exported_program(prog):
                 qnn_mgr = PyQnnManagerAdaptor.QnnManager(
                     m.compile_specs[0].value, m.processed_bytes
                 )
-                assert qnn_mgr.Init().value == 0, "failed to load context binary"
+                assert (
+                    qnn_mgr.InitBackend().value == 0
+                ), "failed to initialize backend"
+                assert (
+                    qnn_mgr.InitContextCache().value == 0
+                ), "failed to init context cache"
                 max_sf_buf_size = max(
                     max_sf_buf_size, qnn_mgr.GetSpillFillBufferSize()
                 )
@@ -256,7 +261,8 @@ def process_lowered_module(module):
         qnn_mgr = PyQnnManagerAdaptor.QnnManager(
             module.compile_specs[0].value, module.processed_bytes
         )
-        assert qnn_mgr.Init().value == 0, "failed to load context binary"
+        assert qnn_mgr.InitBackend().value == 0, "failed to initialize backend"
+        assert qnn_mgr.InitContextCache().value == 0, "failed to init context cache"
         spill_fill_size = qnn_mgr.GetSpillFillBufferSize()
         qnn_mgr.Destroy()
         return spill_fill_size, {
@@ -991,6 +997,7 @@ def generate_htp_compiler_spec(
     use_dlbc: bool = False,
     use_multi_contexts: bool = False,
     use_weight_sharing: bool = False,
+    use_slc_allocator: bool = False,
 ) -> QnnExecuTorchBackendOptions:
     """
     Helper function generating backend options for QNN HTP
@@ -1006,6 +1013,9 @@ def generate_htp_compiler_spec(
             could be re-used across all the splits.
         use_weight_sharing: Used with multiple_graphs, where model size will be
             reduced when operations have the same weights across multiple graphs.
+        use_slc_allocator: Allows the user to enable the System Level Cache (SLC)
+            allocator for a given graph. It helps by reducing overall bandwidth
+            for the use case. The feature is only supported on specific SoCs.

     Returns:
         QnnExecuTorchHtpBackendOptions: backend options for QNN HTP.
@@ -1023,6 +1033,7 @@ def generate_htp_compiler_spec(
     htp_options.use_multi_contexts = use_multi_contexts
     htp_options.use_weight_sharing = use_weight_sharing
     htp_options.use_dlbc = use_dlbc
+    htp_options.use_slc_allocator = use_slc_allocator
     return QnnExecuTorchBackendOptions(
         backend_type=QnnExecuTorchBackendType.kHtpBackend,
         htp_options=htp_options,
diff --git a/examples/qualcomm/oss_scripts/llama/decoder_utils.py b/examples/qualcomm/oss_scripts/llama/decoder_utils.py
index c50cd455e75..20a7ab99c8d 100644
--- a/examples/qualcomm/oss_scripts/llama/decoder_utils.py
+++ b/examples/qualcomm/oss_scripts/llama/decoder_utils.py
@@ -373,6 +373,26 @@ def smart_mask_updater(
     return pos, k_caches, v_caches


+def evict_tokens(
+    ar_len: int,
+    atten_mask: AttentionMask,
+    pos,
+    k_caches,
+    v_caches,
+    rope_module,
+    position_shift,
+):
+    max_cache_len = k_caches[0].size(-1)
+    shifted_pos = pos + position_shift
+    if shifted_pos + ar_len > max_cache_len:
+        num_to_evict = rope_module.eviction_batch_size
+        k_caches, v_caches = rope_module(k_caches, v_caches)
+        position_shift -= num_to_evict
+        shifted_pos -= num_to_evict
+        atten_mask.smart_mask_init(shifted_pos)
+    return k_caches, v_caches, position_shift
+
+
 def _prefill_chunking(
     inputs: DecoderInputs,
     module: torch.fx.GraphModule,
diff --git a/examples/qualcomm/oss_scripts/llama/eval_llama_qnn.py b/examples/qualcomm/oss_scripts/llama/eval_llama_qnn.py
index 68fbb86f7cb..608e727c3b2 100644
--- a/examples/qualcomm/oss_scripts/llama/eval_llama_qnn.py
+++ b/examples/qualcomm/oss_scripts/llama/eval_llama_qnn.py
@@ -418,7 +418,7 @@ def eval_llama_with_attention_sink(args):
         layer.feed_forward.prepare_feedfoward_conv()

     model = convert_linear_to_conv2d(model)
-    _, atten_mask, _, k_caches, v_caches = model.get_example_inputs(use_kv_cache=True)
+    _, atten_mask, _, k_caches, v_caches = model.get_example_inputs()

     eval_data = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
     neg_log_likelihoods = []
diff --git a/examples/qualcomm/oss_scripts/llama/wrappers/llm_wrappers.py b/examples/qualcomm/oss_scripts/llama/wrappers/llm_wrappers.py
index 0f5076ca94a..44917e0bd5a 100644
--- a/examples/qualcomm/oss_scripts/llama/wrappers/llm_wrappers.py
+++ b/examples/qualcomm/oss_scripts/llama/wrappers/llm_wrappers.py
@@ -824,9 +824,9 @@ def compile(self, request: Request):  # noqa: C901
             skip_node_op_set={"llama.fallback.default"},
         )

-        if self.config.num_sharding > 1 and self.control_args.model_mode == "kv":
-            # weight-sharing based context binaries cannot be opened in x86 host
-            update_spill_fill_size(edge_prog_mgr.exported_program("kv_forward"))
+        if self.config.num_sharding > 1:
+            for graph_name in graph_names:
+                update_spill_fill_size(edge_prog_mgr.exported_program(graph_name))

         if self.control_args.verbose:
             for ep in edge_prog_mgr._edge_programs.values():
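For reference, splitting the former single Init() call into InitBackend() plus InitContextCache() lets update_spill_fill_size read the spill-fill buffer size without fully initializing a context. A minimal sketch mirroring process_lowered_module above, assuming the PyQnnManagerAdaptor extension is importable from executorch.backends.qualcomm.python as in utils.py, and that compile_spec_value/processed_bytes come from a lowered module:

    # Hypothetical helper: query the spill-fill buffer size from a lowered module's payload.
    from executorch.backends.qualcomm.python import PyQnnManagerAdaptor


    def query_spill_fill_size(compile_spec_value: bytes, processed_bytes: bytes) -> int:
        qnn_mgr = PyQnnManagerAdaptor.QnnManager(compile_spec_value, processed_bytes)
        assert qnn_mgr.InitBackend().value == 0, "failed to initialize backend"
        assert qnn_mgr.InitContextCache().value == 0, "failed to init context cache"
        spill_fill_size = qnn_mgr.GetSpillFillBufferSize()
        qnn_mgr.Destroy()
        return spill_fill_size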