1 change: 1 addition & 0 deletions backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp
@@ -207,6 +207,7 @@ PYBIND11_MODULE(PyQnnManagerAdaptor, m) {
.def("Init", &PyQnnManager::Init)
.def("InitBackend", &PyQnnManager::InitBackend)
.def("InitContext", &PyQnnManager::InitContext)
.def("InitContextCache", &PyQnnManager::InitContextCache)
.def("IsNodeSupportedByBackend", &PyQnnManager::IsNodeSupportedByBackend)
.def(
"Compile",
4 changes: 4 additions & 0 deletions backends/qualcomm/aot/python/PyQnnManagerAdaptor.h
@@ -236,6 +236,10 @@ class PyQnnManager {
return qnn_manager_->InitContext(std::optional{graph_names});
}

executorch::runtime::Error InitContextCache() {
return qnn_manager_->InitContextCache();
}

bool IsNodeSupportedByBackend(
std::vector<std::shared_ptr<OpWrapper>>& op_wrappers) {
return qnn_manager_->IsNodeSupportedByBackend(op_wrappers);
23 changes: 15 additions & 8 deletions backends/qualcomm/quantizer/qconfig.py
@@ -114,14 +114,21 @@ def get_8a8w_qnn_ptq_config(
# the smallest scale defaults to DEFAULT_EPS_8BIT
extra_args: Dict[str, Any] = {"eps": eps if eps else DEFAULT_EPS_8BIT}

act_quantization_spec = QuantizationSpec(
dtype=torch.uint8,
qscheme=(
torch.per_tensor_symmetric if act_symmetric else torch.per_tensor_affine
),
ch_axis=0,
observer_or_fake_quant_ctr=act_observer.with_args(**extra_args),
)
if act_symmetric:
act_quantization_spec = QuantizationSpec(
dtype=torch.uint8,
qscheme=(torch.per_tensor_symmetric),
ch_axis=0,
observer_or_fake_quant_ctr=act_observer.with_args(**extra_args),
)
else:
act_quantization_spec = QuantizationSpec(
dtype=torch.uint8,
quant_min=torch.iinfo(torch.uint8).min,
quant_max=torch.iinfo(torch.uint8).max,
qscheme=(torch.per_tensor_affine),
observer_or_fake_quant_ctr=act_observer.with_args(**extra_args),
)

weight_quantization_spec = QuantizationSpec(
dtype=torch.int8,
31 changes: 31 additions & 0 deletions backends/qualcomm/runtime/QnnManager.cpp
@@ -296,6 +296,37 @@ Error QnnManager::InitContext(
return Error::Ok;
}

Error QnnManager::InitContextCache() {
if (backend_params_ptr_->backend_init_state_ ==
BackendInitializeState::UNINITIALIZED) {
QNN_EXECUTORCH_LOG_INFO(
"Initialize Qnn backend "
"parameters for Qnn executorch backend type %d",
options_->backend_options()->backend_type());
backend_params_ptr_ = QnnBackendFactory().Create(
backend_bundle_ptr_->implementation.get(),
backend_bundle_ptr_->qnn_backend_ptr.get(),
backend_bundle_ptr_->qnn_device_ptr.get(),
qnn_context_blob_,
options_,
qnn_dlc_manager_.get());
ET_CHECK_OR_RETURN_ERROR(
backend_params_ptr_ != nullptr,
Internal,
"Failed to load Qnn backend.");
// Note: For online_prepare or deserialization, the graph name will be
// obtained from the binary.
ET_CHECK_OR_RETURN_ERROR(
backend_params_ptr_->qnn_backend_cache_ptr_->Configure({}) == Error::Ok,
Internal,
"Fail to configure Qnn backend cache");

backend_params_ptr_->backend_init_state_ =
BackendInitializeState::INITIALIZED;
}
return Error::Ok;
}

Error QnnManager::AllocateTensor(const std::string& graph_name) {
std::vector<Qnn_Tensor_t> input_tensors =
backend_params_ptr_->qnn_context_ptr_->GetGraphInputs(graph_name);
3 changes: 3 additions & 0 deletions backends/qualcomm/runtime/QnnManager.h
@@ -38,6 +38,9 @@ class QnnManager {
// graph name will be obtained from the binary.
executorch::runtime::Error InitContext(
std::optional<std::vector<std::string>> graph_names = std::nullopt);
// This function only initializes the context cache to get the spill fill
// buffer size.
executorch::runtime::Error InitContextCache();
executorch::runtime::Error AllocateTensor(const std::string& graph_name);
executorch::runtime::Error AllocateTensor(
const std::string& graph_name,
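For readers, a minimal Python sketch of the call sequence this new API enables for querying the spill-fill buffer size ahead of time. It mirrors the utils.py hunk later in this diff; the import path and the compile_spec_value/processed_bytes inputs are illustrative, not part of this PR.

```python
# Illustrative only: mirrors the call sequence adopted in
# backends/qualcomm/utils/utils.py below. `compile_spec_value` and
# `processed_bytes` come from an already lowered QNN module, and the
# import path is an assumption.
from executorch.backends.qualcomm.python import PyQnnManagerAdaptor

qnn_mgr = PyQnnManagerAdaptor.QnnManager(compile_spec_value, processed_bytes)
assert qnn_mgr.InitBackend().value == 0, "failed to initialize backend"
# InitContextCache only configures the backend cache, which is enough to
# read the spill-fill buffer size without fully initializing the context.
assert qnn_mgr.InitContextCache().value == 0, "failed to init context cache"
spill_fill_size = qnn_mgr.GetSpillFillBufferSize()
qnn_mgr.Destroy()
```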
3 changes: 3 additions & 0 deletions backends/qualcomm/runtime/backends/QnnBackendFactory.cpp
@@ -51,6 +51,9 @@ std::unique_ptr<BackendConfigParameters> QnnBackendFactory::Create(
"use_dlbc in htp_options: %d", htp_options->use_dlbc());
QNN_EXECUTORCH_LOG_INFO(
"use_fold_relu in htp_options: %d", htp_options->use_fold_relu());
QNN_EXECUTORCH_LOG_INFO(
"use_slc_allocator in htp_options: %d",
htp_options->use_slc_allocator());
QNN_EXECUTORCH_LOG_INFO(
"use_multi_contexts in htp_options: %d",
htp_options->use_multi_contexts());
32 changes: 24 additions & 8 deletions backends/qualcomm/runtime/backends/htp/HtpBackendCache.cpp
@@ -18,7 +18,9 @@ Error HtpBackendCache::RetrieveBackendBinaryInfo(
const QnnSystemContext_BinaryInfo_t* binaryinfo) {
QnnHtpSystemContext_HwBlobInfo_t* htp_hwblobinfo = nullptr;
#if (QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 21)
QnnHtpSystemContext_GraphBlobInfo_t* htp_graphblobinfo = nullptr;
std::vector<QnnHtpSystemContext_GraphBlobInfo_t*> htp_graphblobinfos;
std::uint32_t num_graphs;

#endif

if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_1) {
@@ -29,8 +31,13 @@ Error HtpBackendCache::RetrieveBackendBinaryInfo(
binaryinfo->contextBinaryInfoV2.hwInfoBlob);
#if (QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 21)
} else if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_3) {
htp_graphblobinfo = static_cast<QnnHtpSystemContext_GraphBlobInfo_t*>(
binaryinfo->contextBinaryInfoV3.graphs->graphInfoV3.graphBlobInfo);
num_graphs = binaryinfo->contextBinaryInfoV3.numGraphs;
for (size_t i = 0; i < num_graphs; ++i) {
htp_graphblobinfos.push_back(
static_cast<QnnHtpSystemContext_GraphBlobInfo_t*>(
binaryinfo->contextBinaryInfoV3.graphs[i]
.graphInfoV3.graphBlobInfo));
}
#endif
} else {
QNN_EXECUTORCH_LOG_WARN(
@@ -51,15 +58,24 @@
}

#if (QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 21)
if (htp_graphblobinfo) {
if (htp_graphblobinfo->version ==
if (htp_graphblobinfos.size() > 0) {
// After version 2.21, we need to get spill fill buffer size from graph
// blob info instead of hw blob info. If there are multiple graphs, we
// should use the max value among all graphs.
if (htp_graphblobinfos[0]->version ==
QNN_SYSTEM_CONTEXT_HTP_GRAPH_INFO_BLOB_VERSION_V1) {
spill_fill_buf_ =
(*htp_graphblobinfo).contextBinaryGraphBlobInfoV1.spillFillBufferSize;
for (size_t i = 0; i < num_graphs; ++i) {
uint64_t spill_fill_buf =
(*htp_graphblobinfos[i])
.contextBinaryGraphBlobInfoV1.spillFillBufferSize;
if (spill_fill_buf > spill_fill_buf_) {
spill_fill_buf_ = spill_fill_buf;
}
}
} else {
QNN_EXECUTORCH_LOG_WARN(
"Unknown QNN Htp graph blob info version %d.",
htp_graphblobinfo->version);
htp_graphblobinfos[0]->version);
return Error::Internal;
}
}
@@ -70,6 +70,14 @@ HtpGraphCustomConfig::CreateGraphCustomConfigCommon(
htp_options_->use_dlbc() ? 1.0 : 0.0;
ret.push_back(static_cast<QnnGraph_CustomConfig_t>(p_custom_config));

p_custom_config = AllocGraphCustomConfig();
p_custom_config->option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
p_custom_config->optimizationOption.type =
QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_SLC_ALLOCATOR;
p_custom_config->optimizationOption.floatValue =
htp_options_->use_slc_allocator() ? 1.0 : 0.0;
ret.push_back(static_cast<QnnGraph_CustomConfig_t>(p_custom_config));

return ret;
}
} // namespace qnn
5 changes: 5 additions & 0 deletions backends/qualcomm/serialization/qc_compiler_spec.fbs
@@ -181,6 +181,11 @@ table QnnExecuTorchHtpBackendOptions {
/// When multiple graphs appear inside the same context,
/// weights could be reused across all graphs.
use_weight_sharing:bool;

/// Allows the user to enable the System Level Cache Allocator for a given graph.
/// It can help by reducing overall bandwidth for the use case.
/// The feature is only supported by specific SoCs.
use_slc_allocator:bool;
Review thread on use_slc_allocator (a usage sketch follows at the end of this file's diff):

Contributor: Looks like we're adding a new feature, can we update the README regarding how to use it?

Contributor: Possibly a bit more explanation on System Level Cache Allocator.

Collaborator (Author): Yes, this is a new feature. Users just need to set use_slc_allocator=True in compile_spec to enable it.
https://github.com/pytorch/executorch/pull/17302/changes#diff-0439f6a7c1a3a3cfb222cd6409b6754f17a1ce782dd231de1d12bbf957d588f7R1000

System Level Cache Allocator is a shared cache at the system level of a SoC, serving as the last caching layer before external DDR memory. Its primary purpose is to optimize memory bandwidth, thereby potentially improving performance and reducing power consumption. However, it is model-dependent, so it is not guaranteed to be effective in all cases.

}

/// Logging level of the delegate and QNN backend.
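As context for the review thread above, here is a rough sketch of how the new flag is enabled from Python. Only use_slc_allocator is introduced by this PR; the generate_qnn_executorch_compiler_spec helper, the QcomChipset enum, the import paths, and the chosen SoC are assumptions based on the existing backends/qualcomm utilities.

```python
# Rough sketch, not part of this PR. Only use_slc_allocator is new here;
# helper names, import paths, and the target SoC are assumptions.
from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset
from executorch.backends.qualcomm.utils.utils import (
    generate_htp_compiler_spec,
    generate_qnn_executorch_compiler_spec,
)

backend_options = generate_htp_compiler_spec(
    use_fp16=True,           # existing option, shown for completeness
    use_slc_allocator=True,  # new flag; only takes effect on SoCs with SLC support
)
compile_spec = generate_qnn_executorch_compiler_spec(
    soc_model=QcomChipset.SM8650,    # illustrative target SoC
    backend_options=backend_options,
)
# compile_spec is then passed to the QNN partitioner / lowering flow as usual.
```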
1 change: 1 addition & 0 deletions backends/qualcomm/serialization/qc_schema.py
@@ -166,6 +166,7 @@ class QnnExecuTorchHtpBackendOptions:
use_fold_relu: bool = True
use_multi_contexts: bool = False
use_weight_sharing: bool = False
use_slc_allocator: bool = False


@unique
15 changes: 13 additions & 2 deletions backends/qualcomm/utils/utils.py
@@ -244,7 +244,12 @@ def process_exported_program(prog):
qnn_mgr = PyQnnManagerAdaptor.QnnManager(
m.compile_specs[0].value, m.processed_bytes
)
assert qnn_mgr.Init().value == 0, "failed to load context binary"
assert (
qnn_mgr.InitBackend().value == 0
), "failed to initialize backend"
assert (
qnn_mgr.InitContextCache().value == 0
), "failed to init context cache"
max_sf_buf_size = max(
max_sf_buf_size, qnn_mgr.GetSpillFillBufferSize()
)
@@ -256,7 +261,8 @@ def process_lowered_module(module):
qnn_mgr = PyQnnManagerAdaptor.QnnManager(
module.compile_specs[0].value, module.processed_bytes
)
assert qnn_mgr.Init().value == 0, "failed to load context binary"
assert qnn_mgr.InitBackend().value == 0, "failed to initialize backend"
assert qnn_mgr.InitContextCache().value == 0, "failed to init context cache"
spill_fill_size = qnn_mgr.GetSpillFillBufferSize()
qnn_mgr.Destroy()
return spill_fill_size, {
@@ -991,6 +997,7 @@ def generate_htp_compiler_spec(
use_dlbc: bool = False,
use_multi_contexts: bool = False,
use_weight_sharing: bool = False,
use_slc_allocator: bool = False,
) -> QnnExecuTorchBackendOptions:
"""
Helper function generating backend options for QNN HTP
@@ -1006,6 +1013,9 @@
could be re-used across all the splits.
use_weight_sharing: Used with multiple_graphs, where model size will be
reduced when operations have the same weights across multiple graphs.
use_slc_allocator: Allows the user to enable the System Level Cache Allocator for a given graph.
It can help by reducing overall bandwidth for the use case.
The feature is only supported by specific SoCs.

Returns:
QnnExecuTorchHtpBackendOptions: backend options for QNN HTP.
@@ -1023,6 +1033,7 @@
htp_options.use_multi_contexts = use_multi_contexts
htp_options.use_weight_sharing = use_weight_sharing
htp_options.use_dlbc = use_dlbc
htp_options.use_slc_allocator = use_slc_allocator
return QnnExecuTorchBackendOptions(
backend_type=QnnExecuTorchBackendType.kHtpBackend,
htp_options=htp_options,
20 changes: 20 additions & 0 deletions examples/qualcomm/oss_scripts/llama/decoder_utils.py
@@ -373,6 +373,26 @@ def smart_mask_updater(
return pos, k_caches, v_caches


def evict_tokens(
ar_len: int,
atten_mask: AttentionMask,
pos,
k_caches,
v_caches,
rope_module,
position_shift,
):
max_cache_len = k_caches[0].size(-1)
shifted_pos = pos + position_shift
if shifted_pos + ar_len > max_cache_len:
num_to_evict = rope_module.eviction_batch_size
k_caches, v_caches = rope_module(k_caches, v_caches)
position_shift -= num_to_evict
shifted_pos -= num_to_evict
atten_mask.smart_mask_init(shifted_pos)
return k_caches, v_caches, position_shift


def _prefill_chunking(
inputs: DecoderInputs,
module: torch.fx.GraphModule,
2 changes: 1 addition & 1 deletion examples/qualcomm/oss_scripts/llama/eval_llama_qnn.py
@@ -418,7 +418,7 @@ def eval_llama_with_attention_sink(args):
layer.feed_forward.prepare_feedfoward_conv()
model = convert_linear_to_conv2d(model)

_, atten_mask, _, k_caches, v_caches = model.get_example_inputs(use_kv_cache=True)
_, atten_mask, _, k_caches, v_caches = model.get_example_inputs()
eval_data = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

neg_log_likelihoods = []
@@ -824,9 +824,9 @@ def compile(self, request: Request): # noqa: C901
skip_node_op_set={"llama.fallback.default"},
)

if self.config.num_sharding > 1 and self.control_args.model_mode == "kv":
# weight-sharing based context binaries cannot be opened in x86 host
update_spill_fill_size(edge_prog_mgr.exported_program("kv_forward"))
if self.config.num_sharding > 1:
for graph_name in graph_names:
update_spill_fill_size(edge_prog_mgr.exported_program(graph_name))

if self.control_args.verbose:
for ep in edge_prog_mgr._edge_programs.values():