From cd8de1448dabc6e51f94dfccb58355286f80d030 Mon Sep 17 00:00:00 2001 From: Sheng Feng Wu Date: Tue, 8 Oct 2024 18:13:36 -0700 Subject: [PATCH] Qualcomm AI Engine Direct - Refine max spill fill buffer setting (#5989) Summary: - Get the required spillFillBufferSize from the context binary and set it in compiler_spec - Quantize embedding op in qnn. - If multi-contexts is enabled, maxSpillFillBuffer cannot be set to zero. Pull Request resolved: https://github.com/pytorch/executorch/pull/5989 Reviewed By: kirklandsign Differential Revision: D64056107 Pulled By: cccclai fbshipit-source-id: 9f9846e6ac7b4a27d734d2812ac3bbad32fb194f (cherry picked from commit 01fcdf420fef23b4ee0348c37abcab74bcea1449) --- .../aot/python/PyQnnManagerAdaptor.cpp | 3 +- .../qualcomm/aot/python/PyQnnManagerAdaptor.h | 4 ++ backends/qualcomm/runtime/QnnManager.cpp | 4 ++ backends/qualcomm/runtime/QnnManager.h | 6 +++ .../qualcomm/runtime/backends/CMakeLists.txt | 3 ++ .../runtime/backends/QnnBackendCache.cpp | 23 +++++---- .../runtime/backends/QnnBackendCache.h | 14 +++-- .../runtime/backends/QnnBackendFactory.cpp | 5 +- .../runtime/backends/QnnBackendFactory.h | 7 ++- .../runtime/backends/QnnContextCommon.h | 9 ++-- .../backends/htpbackend/HtpBackendCache.cpp | 51 +++++++++++++++++++ .../backends/htpbackend/HtpBackendCache.h | 33 ++++++++++++ .../runtime/backends/htpbackend/HtpContext.h | 4 +- .../aarch64/HtpContextCustomConfig.cpp | 3 +- backends/qualcomm/utils/utils.py | 16 +++++- examples/models/llama2/export_llama_lib.py | 10 +--- extension/llm/export/partitioner_lib.py | 2 +- extension/llm/export/quantizer_lib.py | 5 -- 18 files changed, 163 insertions(+), 39 deletions(-) create mode 100644 backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.cpp create mode 100644 backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp index c785fd0219e..b8d567718af 100644 --- 
a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp +++ b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp @@ -35,7 +35,8 @@ PYBIND11_MODULE(PyQnnManagerAdaptor, m) { .def("IsTensorDump", &PyQnnManager::IsTensorDump) .def("AllocateTensor", &PyQnnManager::AllocateTensor) .def("GetGraphInputs", &PyQnnManager::GetGraphInputs) - .def("GetGraphOutputs", &PyQnnManager::GetGraphOutputs); + .def("GetGraphOutputs", &PyQnnManager::GetGraphOutputs) + .def("GetSpillFillBufferSize", &PyQnnManager::GetSpillFillBufferSize); } } // namespace qnn } // namespace executor diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h index 4a675067f3e..9907b87c55f 100644 --- a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h +++ b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h @@ -177,6 +177,10 @@ class PyQnnManager { return ret; } + uint64_t GetSpillFillBufferSize() { + return qnn_manager_->GetSpillFillBufferSize(); + } + private: // Store the bytes object instead of a raw pointer so that this module will // keep the bytes alive. 
diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp index f4275f0ab3d..5bfb8aa7898 100644 --- a/backends/qualcomm/runtime/QnnManager.cpp +++ b/backends/qualcomm/runtime/QnnManager.cpp @@ -283,6 +283,10 @@ Error QnnManager::Init() { qnn_loaded_backend_, logger_.get(), qnn_context_blob_, options_); ET_CHECK_OR_RETURN_ERROR( backend_params_ptr_ != nullptr, Internal, "Failed to load Qnn backend.") + ET_CHECK_OR_RETURN_ERROR( + backend_params_ptr_->qnn_backend_cache_ptr_->Configure() == Error::Ok, + Internal, + "Fail to configure Qnn backend cache"); ET_CHECK_OR_RETURN_ERROR( backend_params_ptr_->qnn_backend_ptr_->Configure() == Error::Ok, Internal, diff --git a/backends/qualcomm/runtime/QnnManager.h b/backends/qualcomm/runtime/QnnManager.h index 3d1cc3863aa..89d47c78c94 100644 --- a/backends/qualcomm/runtime/QnnManager.h +++ b/backends/qualcomm/runtime/QnnManager.h @@ -70,6 +70,12 @@ class QnnManager { // Pre-register custom memory handle from the SharedBuffer before execution Error PreRegisterMem(); + uint64_t GetSpillFillBufferSize() { + auto* htp_backend_cache_ptr = static_cast( + backend_params_ptr_->qnn_backend_cache_ptr_.get()); + return htp_backend_cache_ptr->GetSpillFillBufferSize(); + } + std::vector> GetGraphInputs() { return input_tensors_; } diff --git a/backends/qualcomm/runtime/backends/CMakeLists.txt b/backends/qualcomm/runtime/backends/CMakeLists.txt index ed61d7545a9..9147d4f32a9 100644 --- a/backends/qualcomm/runtime/backends/CMakeLists.txt +++ b/backends/qualcomm/runtime/backends/CMakeLists.txt @@ -77,7 +77,9 @@ target_sources( target_sources( qnn_backend_cache PUBLIC ${CMAKE_CURRENT_LIST_DIR}/QnnBackendCache.h + ${CMAKE_CURRENT_LIST_DIR}/htpbackend/HtpBackendCache.h PRIVATE ${CMAKE_CURRENT_LIST_DIR}/QnnBackendCache.cpp + ${CMAKE_CURRENT_LIST_DIR}/htpbackend/HtpBackendCache.cpp ) # qnn_graph @@ -130,6 +132,7 @@ set(qnn_header_basenames HTP/QnnHtpPerfInfrastructure.h HTP/QnnHtpProfile.h HTP/QnnHtpProperty.h 
+ HTP/QnnHtpSystemContext.h QnnInterface.h QnnLog.h QnnMem.h diff --git a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp index 1ed51ed14f6..6a568b8bc59 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp @@ -28,13 +28,20 @@ Error QnnBackendCache::GetQnnGraphInfoFromBinary() { if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_WARN( - "Failed to interpret QNN Context " + "Failed to interpret QNN context " "binary. Error code %d. " "Try verifying binary with online-prepare format.", QNN_GET_ERROR_CODE(error)); return Error::Internal; } + Error status = RetrieveBackendBinaryInfo(binaryinfo); + if (status == Error::Internal) { + QNN_EXECUTORCH_LOG_ERROR( + "Failed to retrieve backend binary info from QNN context binary."); + return Error::Internal; + } + if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_1) { num_graphs = binaryinfo->contextBinaryInfoV1.numGraphs; graph = binaryinfo->contextBinaryInfoV1.graphs; @@ -81,20 +88,18 @@ Error QnnBackendCache::GetQnnGraphInfoFromBinary() { return Error::Ok; } -QnnBackendCache::QnnBackendCache( - const QnnExecuTorchContextBinary& qnn_context_blob) - : qnn_context_blob_(qnn_context_blob) { +Error QnnBackendCache::Configure() { if (qnn_context_blob_.buffer == nullptr) { state_ = SERIALIZE; QNN_EXECUTORCH_LOG_INFO("Caching: Caching is in SAVE MODE."); - return; + return Error::Ok; } if (qnn_sys_impl_.Load() != Error::Ok) { QNN_EXECUTORCH_LOG_ERROR( "Failed to Load QnnSystem " "APIs. Caching mechanism is being disabled."); - return; + return Error::Internal; } Qnn_ErrorHandle_t error = QNN_SUCCESS; @@ -109,7 +114,7 @@ QnnBackendCache::QnnBackendCache( "Failed to create Qnn " "SystemContext. Caching mechanism will be disabled. 
Error code %d", QNN_GET_ERROR_CODE(error)); - return; + return Error::Internal; } // DO DESERIALIZE @@ -125,7 +130,7 @@ QnnBackendCache::QnnBackendCache( if (qcir::VerifyGraphBuffer(verifier)) { state_ = ONLINE_PREPARE; - return; + return Error::Ok; } QNN_EXECUTORCH_LOG_ERROR( @@ -133,8 +138,8 @@ QnnBackendCache::QnnBackendCache( "might be broken. Please consider to re-generate the " "cache."); InvalidateCache(); - return; } + return Error::Ok; } QnnBackendCache::~QnnBackendCache() { diff --git a/backends/qualcomm/runtime/backends/QnnBackendCache.h b/backends/qualcomm/runtime/backends/QnnBackendCache.h index ad6d3d0bd7b..6b1f5863a15 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCache.h +++ b/backends/qualcomm/runtime/backends/QnnBackendCache.h @@ -23,9 +23,9 @@ class QnnBackendCache { DESERIALIZE = 2, ONLINE_PREPARE = 3, }; - explicit QnnBackendCache(const QnnExecuTorchContextBinary& qnn_context_blob); - - ~QnnBackendCache(); + explicit QnnBackendCache(const QnnExecuTorchContextBinary& qnn_context_blob) + : qnn_context_blob_(qnn_context_blob) {} + virtual ~QnnBackendCache(); QnnBackendCache(const QnnBackendCache&) = delete; QnnBackendCache(QnnBackendCache&&) = delete; QnnBackendCache& operator=(const QnnBackendCache&) = delete; @@ -51,6 +51,14 @@ class QnnBackendCache { return graph_name_; } + Error Configure(); + + protected: + virtual Error RetrieveBackendBinaryInfo( + __ET_UNUSED const QnnSystemContext_BinaryInfo_t* binaryinfo) { + return Error::Ok; + } + private: Error GetQnnGraphInfoFromBinary(); diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp index 9fb292613a3..52128a8b496 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp @@ -56,11 +56,14 @@ std::unique_ptr QnnBackendFactory::Create( backend_params->qnn_device_ptr_ = std::make_unique( implementation, logger, options->soc_info(), 
htp_options); + backend_params->qnn_backend_cache_ptr_ = + std::make_unique(qnn_context_blob); + backend_params->qnn_context_ptr_ = std::make_unique( implementation, backend_params->qnn_backend_ptr_.get(), backend_params->qnn_device_ptr_.get(), - qnn_context_blob, + backend_params->qnn_backend_cache_ptr_.get(), htp_options); backend_params->qnn_graph_ptr_ = std::make_unique( diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.h b/backends/qualcomm/runtime/backends/QnnBackendFactory.h index ab47113a538..dfa6b825088 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendFactory.h +++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.h @@ -8,6 +8,7 @@ #pragma once #include +#include #include #include #include @@ -16,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -35,6 +37,7 @@ typedef struct BackendConfigParameters { std::unique_ptr qnn_device_ptr_; std::unique_ptr qnn_graph_ptr_; std::unique_ptr qnn_mem_manager_ptr_; + std::unique_ptr qnn_backend_cache_ptr_; // Default ctor BackendConfigParameters() @@ -43,10 +46,12 @@ typedef struct BackendConfigParameters { qnn_context_ptr_(nullptr), qnn_device_ptr_(nullptr), qnn_graph_ptr_(nullptr), - qnn_mem_manager_ptr_(nullptr) {} + qnn_mem_manager_ptr_(nullptr), + qnn_backend_cache_ptr_(nullptr) {} // Default dtor ~BackendConfigParameters() { qnn_graph_ptr_.reset(); + qnn_backend_cache_ptr_.reset(); qnn_mem_manager_ptr_.reset(); qnn_context_ptr_.reset(); qnn_device_ptr_.reset(); diff --git a/backends/qualcomm/runtime/backends/QnnContextCommon.h b/backends/qualcomm/runtime/backends/QnnContextCommon.h index e706a5a870f..1246e1c2f83 100644 --- a/backends/qualcomm/runtime/backends/QnnContextCommon.h +++ b/backends/qualcomm/runtime/backends/QnnContextCommon.h @@ -22,13 +22,12 @@ class QnnContext { const QnnImplementation& implementation, QnnBackend* backend, QnnDevice* device, - const QnnExecuTorchContextBinary& qnn_context_blob) + QnnBackendCache* cache) : handle_(nullptr), 
implementation_(implementation), backend_(backend), - device_(device) { - cache_ = std::make_unique(qnn_context_blob); - } + device_(device), + cache_(cache) {} virtual ~QnnContext(); Error Configure(); @@ -67,7 +66,7 @@ class QnnContext { const QnnImplementation& implementation_; QnnBackend* backend_; QnnDevice* device_; - std::unique_ptr cache_; + QnnBackendCache* cache_; std::vector binary_buffer_; }; } // namespace qnn diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.cpp b/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.cpp new file mode 100644 index 00000000000..8cd9b69aa67 --- /dev/null +++ b/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.cpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include +#include "HTP/QnnHtpSystemContext.h" + +namespace torch { +namespace executor { +namespace qnn { +Error HtpBackendCache::RetrieveBackendBinaryInfo( + const QnnSystemContext_BinaryInfo_t* binaryinfo) { + QnnHtpSystemContext_HwBlobInfo_t* htp_hwblobinfo = nullptr; + + if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_1) { + htp_hwblobinfo = static_cast( + binaryinfo->contextBinaryInfoV1.hwInfoBlob); + } else if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_2) { + htp_hwblobinfo = static_cast( + binaryinfo->contextBinaryInfoV2.hwInfoBlob); + } else { + QNN_EXECUTORCH_LOG_WARN( + "Unknown QNN BinaryInfo version %d.", binaryinfo->version); + return Error::Internal; + } + + if (htp_hwblobinfo == nullptr) { + QNN_EXECUTORCH_LOG_WARN( + "Htp hardware blob information is not found in binary information."); + return Error::Ok; + } + + if (htp_hwblobinfo->version == + QNN_SYSTEM_CONTEXT_HTP_HW_INFO_BLOB_VERSION_V1) { + spill_fill_buf_ = + 
(*htp_hwblobinfo).contextBinaryHwInfoBlobV1_t.spillFillBufferSize; + } else { + QNN_EXECUTORCH_LOG_WARN( + "Unknown QNN Htp hw blob info version %d.", htp_hwblobinfo->version); + return Error::Internal; + } + + return Error::Ok; +} + +} // namespace qnn +} // namespace executor +} // namespace torch diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h b/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h new file mode 100644 index 00000000000..b97fce18c51 --- /dev/null +++ b/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#pragma once +#include + +namespace torch { +namespace executor { +namespace qnn { +class HtpBackendCache : public QnnBackendCache { + public: + explicit HtpBackendCache(const QnnExecuTorchContextBinary& qnn_context_blob) + : QnnBackendCache(qnn_context_blob), spill_fill_buf_(0) {} + ~HtpBackendCache() override = default; + + uint64_t GetSpillFillBufferSize() { + return spill_fill_buf_; + } + + protected: + Error RetrieveBackendBinaryInfo( + const QnnSystemContext_BinaryInfo_t* binaryinfo) override; + + private: + uint64_t spill_fill_buf_; +}; +} // namespace qnn +} // namespace executor +} // namespace torch diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpContext.h b/backends/qualcomm/runtime/backends/htpbackend/HtpContext.h index 14d06824a12..f3487ad05bc 100644 --- a/backends/qualcomm/runtime/backends/htpbackend/HtpContext.h +++ b/backends/qualcomm/runtime/backends/htpbackend/HtpContext.h @@ -22,9 +22,9 @@ class HtpContext : public QnnContext { const QnnImplementation& implementation, QnnBackend* backend, QnnDevice* device, - const QnnExecuTorchContextBinary& qnn_context_blob, + QnnBackendCache* cache, const QnnExecuTorchHtpBackendOptions* 
htp_options) - : QnnContext(implementation, backend, device, qnn_context_blob) { + : QnnContext(implementation, backend, device, cache) { htp_context_custom_config_ = std::make_unique(this, htp_options); } diff --git a/backends/qualcomm/runtime/backends/htpbackend/aarch64/HtpContextCustomConfig.cpp b/backends/qualcomm/runtime/backends/htpbackend/aarch64/HtpContextCustomConfig.cpp index 88f09c3cf4e..dfc5a3e2766 100644 --- a/backends/qualcomm/runtime/backends/htpbackend/aarch64/HtpContextCustomConfig.cpp +++ b/backends/qualcomm/runtime/backends/htpbackend/aarch64/HtpContextCustomConfig.cpp @@ -19,7 +19,8 @@ HtpContextCustomConfig::CreateContextCustomConfig() { QnnHtpContext_CustomConfig_t* p_custom_config = nullptr; const HtpContext* htp_ctx = static_cast(context_); - if (htp_options_->use_multi_contexts()) { + if (htp_options_->use_multi_contexts() && + htp_options_->max_sf_buf_size() != 0) { p_custom_config = AllocContextCustomConfig(); p_custom_config->option = QNN_HTP_CONTEXT_CONFIG_OPTION_REGISTER_MULTI_CONTEXTS; diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index a0c0abf7295..e1be24d0d64 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -208,16 +208,28 @@ def process_exported_program(prog): == QnnExecuTorchBackendType.kHtpBackend and options.backend_options.htp_options.use_multi_contexts ): - max_sf_buf_size = max(max_sf_buf_size, len(m.processed_bytes)) + qnn_mgr = PyQnnManagerAdaptor.QnnManager( + m.compile_specs[0].value, m.processed_bytes + ) + assert qnn_mgr.Init().value == 0, "failed to load context binary" + max_sf_buf_size = max( + max_sf_buf_size, qnn_mgr.GetSpillFillBufferSize() + ) module_map[m] = options + qnn_mgr.Destroy() return max_sf_buf_size, module_map def process_lowered_module(module): + qnn_mgr = PyQnnManagerAdaptor.QnnManager( + module.compile_specs[0].value, module.processed_bytes + ) + assert qnn_mgr.Init().value == 0, "failed to load context binary" spill_fill_size 
= ( - len(module.processed_bytes) + qnn_mgr.GetSpillFillBufferSize() if custom_buffer_size is None else custom_buffer_size ) + qnn_mgr.Destroy() return spill_fill_size, { module: convert_to_option(module.compile_specs[0].value) } diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index 5cef72c1e6e..70b83989625 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -580,10 +580,7 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901 if args.num_sharding > 0 and args.qnn: from executorch.backends.qualcomm.utils.utils import canonicalize_program - # TODO: Need to remove this once we have better way to handle buffer size - canonicalize_program( - builder.edge_manager.exported_program(), custom_buffer_size=542048256 - ) + canonicalize_program(builder.edge_manager.exported_program()) builder = builder.to_executorch() @@ -600,10 +597,7 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901 if args.num_sharding > 0 and args.qnn: from executorch.backends.qualcomm.utils.utils import canonicalize_program - # TODO: Need to remove this once we have better way to handle buffer size - canonicalize_program( - builder.edge_manager.exported_program(), custom_buffer_size=542048256 - ) + canonicalize_program(builder.edge_manager.exported_program()) builder = builder.to_executorch() diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py index bba16dd8a4d..8de4e1a00c4 100644 --- a/extension/llm/export/partitioner_lib.py +++ b/extension/llm/export/partitioner_lib.py @@ -167,7 +167,7 @@ def get_qnn_partitioner( ) use_fp16 = True - skip_node_op_set = {"llama.fallback.default", "aten.embedding.default"} + skip_node_op_set = {"llama.fallback.default"} if pt2e_quantize is not None: use_fp16 = False diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index 45d9932724e..30701e4fa54 
100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -209,11 +209,6 @@ def get_qnn_quantizer( quantization_mode is None ), "Currently qnn backend only supports QnnQuantizer via pt2e flow" qnn_quantizer.add_custom_quant_annotations(custom_annotations) - qnn_quantizer.add_discard_ops( - [ - torch.ops.aten.embedding.default, - ] - ) return qnn_quantizer, quant_dtype