From cd8de1448dabc6e51f94dfccb58355286f80d030 Mon Sep 17 00:00:00 2001
From: Sheng Feng Wu <shewu@qti.qualcomm.com>
Date: Tue, 8 Oct 2024 18:13:36 -0700
Subject: [PATCH] Qualcomm AI Engine Direct - Refine max spill fill buffer
 setting (#5989)

Summary:
- Get the required spillFillBufferSize from the context binary and set it in compiler_spec.
- Quantize the embedding op in QNN.
- If multi-contexts is enabled, maxSpillFillBuffer cannot be set to zero.

Pull Request resolved: https://github.com/pytorch/executorch/pull/5989

Reviewed By: kirklandsign

Differential Revision: D64056107

Pulled By: cccclai

fbshipit-source-id: 9f9846e6ac7b4a27d734d2812ac3bbad32fb194f
(cherry picked from commit 01fcdf420fef23b4ee0348c37abcab74bcea1449)
---
 .../aot/python/PyQnnManagerAdaptor.cpp        |  3 +-
 .../qualcomm/aot/python/PyQnnManagerAdaptor.h |  4 ++
 backends/qualcomm/runtime/QnnManager.cpp      |  4 ++
 backends/qualcomm/runtime/QnnManager.h        |  6 +++
 .../qualcomm/runtime/backends/CMakeLists.txt  |  3 ++
 .../runtime/backends/QnnBackendCache.cpp      | 23 +++++----
 .../runtime/backends/QnnBackendCache.h        | 14 +++--
 .../runtime/backends/QnnBackendFactory.cpp    |  5 +-
 .../runtime/backends/QnnBackendFactory.h      |  7 ++-
 .../runtime/backends/QnnContextCommon.h       |  9 ++--
 .../backends/htpbackend/HtpBackendCache.cpp   | 51 +++++++++++++++++++
 .../backends/htpbackend/HtpBackendCache.h     | 33 ++++++++++++
 .../runtime/backends/htpbackend/HtpContext.h  |  4 +-
 .../aarch64/HtpContextCustomConfig.cpp        |  3 +-
 backends/qualcomm/utils/utils.py              | 16 +++++-
 examples/models/llama2/export_llama_lib.py    | 10 +---
 extension/llm/export/partitioner_lib.py       |  2 +-
 extension/llm/export/quantizer_lib.py         |  5 --
 18 files changed, 163 insertions(+), 39 deletions(-)
 create mode 100644 backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.cpp
 create mode 100644 backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h

diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp
index c785fd0219e..b8d567718af 100644
--- a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp
+++ b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp
@@ -35,7 +35,8 @@ PYBIND11_MODULE(PyQnnManagerAdaptor, m) {
       .def("IsTensorDump", &PyQnnManager::IsTensorDump)
       .def("AllocateTensor", &PyQnnManager::AllocateTensor)
       .def("GetGraphInputs", &PyQnnManager::GetGraphInputs)
-      .def("GetGraphOutputs", &PyQnnManager::GetGraphOutputs);
+      .def("GetGraphOutputs", &PyQnnManager::GetGraphOutputs)
+      .def("GetSpillFillBufferSize", &PyQnnManager::GetSpillFillBufferSize);
 }
 } // namespace qnn
 } // namespace executor
diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h
index 4a675067f3e..9907b87c55f 100644
--- a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h
+++ b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h
@@ -177,6 +177,10 @@ class PyQnnManager {
     return ret;
   }
 
+  uint64_t GetSpillFillBufferSize() {
+    return qnn_manager_->GetSpillFillBufferSize();
+  }
+
  private:
   // Store the bytes object instead of a raw pointer so that this module will
   // keep the bytes alive.
diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp
index f4275f0ab3d..5bfb8aa7898 100644
--- a/backends/qualcomm/runtime/QnnManager.cpp
+++ b/backends/qualcomm/runtime/QnnManager.cpp
@@ -283,6 +283,10 @@ Error QnnManager::Init() {
         qnn_loaded_backend_, logger_.get(), qnn_context_blob_, options_);
     ET_CHECK_OR_RETURN_ERROR(
         backend_params_ptr_ != nullptr, Internal, "Failed to load Qnn backend.")
+    ET_CHECK_OR_RETURN_ERROR(
+        backend_params_ptr_->qnn_backend_cache_ptr_->Configure() == Error::Ok,
+        Internal,
+        "Fail to configure Qnn backend cache");
     ET_CHECK_OR_RETURN_ERROR(
         backend_params_ptr_->qnn_backend_ptr_->Configure() == Error::Ok,
         Internal,
diff --git a/backends/qualcomm/runtime/QnnManager.h b/backends/qualcomm/runtime/QnnManager.h
index 3d1cc3863aa..89d47c78c94 100644
--- a/backends/qualcomm/runtime/QnnManager.h
+++ b/backends/qualcomm/runtime/QnnManager.h
@@ -70,6 +70,21 @@ class QnnManager {
   // Pre-register custom memory handle from the SharedBuffer before execution
   Error PreRegisterMem();
 
+  // Returns the spill-fill buffer size held by the backend cache, or 0 when
+  // no backend cache has been created yet. The cache is downcast to
+  // HtpBackendCache without a runtime type check, so the result is only
+  // meaningful when the cache was created as an HtpBackendCache.
+  uint64_t GetSpillFillBufferSize() {
+    // Guard against being called when the backend parameters are absent.
+    if (backend_params_ptr_ == nullptr ||
+        backend_params_ptr_->qnn_backend_cache_ptr_ == nullptr) {
+      return 0;
+    }
+    auto* htp_backend_cache_ptr = static_cast<HtpBackendCache*>(
+        backend_params_ptr_->qnn_backend_cache_ptr_.get());
+    return htp_backend_cache_ptr->GetSpillFillBufferSize();
+  }
+
   std::vector<std::shared_ptr<TensorWrapper>> GetGraphInputs() {
     return input_tensors_;
   }
diff --git a/backends/qualcomm/runtime/backends/CMakeLists.txt b/backends/qualcomm/runtime/backends/CMakeLists.txt
index ed61d7545a9..9147d4f32a9 100644
--- a/backends/qualcomm/runtime/backends/CMakeLists.txt
+++ b/backends/qualcomm/runtime/backends/CMakeLists.txt
@@ -77,7 +77,9 @@ target_sources(
 target_sources(
   qnn_backend_cache
   PUBLIC ${CMAKE_CURRENT_LIST_DIR}/QnnBackendCache.h
+         ${CMAKE_CURRENT_LIST_DIR}/htpbackend/HtpBackendCache.h
   PRIVATE ${CMAKE_CURRENT_LIST_DIR}/QnnBackendCache.cpp
+          ${CMAKE_CURRENT_LIST_DIR}/htpbackend/HtpBackendCache.cpp
 )
 
 # qnn_graph
@@ -130,6 +132,7 @@ set(qnn_header_basenames
     HTP/QnnHtpPerfInfrastructure.h
     HTP/QnnHtpProfile.h
     HTP/QnnHtpProperty.h
+    HTP/QnnHtpSystemContext.h
     QnnInterface.h
     QnnLog.h
     QnnMem.h
diff --git a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp
index 1ed51ed14f6..6a568b8bc59 100644
--- a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp
+++ b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp
@@ -28,13 +28,22 @@ Error QnnBackendCache::GetQnnGraphInfoFromBinary() {
 
   if (error != QNN_SUCCESS) {
     QNN_EXECUTORCH_LOG_WARN(
-        "Failed to interpret QNN Context "
+        "Failed to interpret QNN context "
         "binary. Error code %d. "
         "Try verifying binary with online-prepare format.",
         QNN_GET_ERROR_CODE(error));
     return Error::Internal;
   }
 
+  // Treat any non-Ok status as a failure rather than only Error::Internal, so
+  // a backend override returning a different error code is not ignored.
+  Error status = RetrieveBackendBinaryInfo(binaryinfo);
+  if (status != Error::Ok) {
+    QNN_EXECUTORCH_LOG_ERROR(
+        "Failed to retrieve backend binary info from QNN context binary.");
+    return Error::Internal;
+  }
+
   if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_1) {
     num_graphs = binaryinfo->contextBinaryInfoV1.numGraphs;
     graph = binaryinfo->contextBinaryInfoV1.graphs;
@@ -81,20 +88,18 @@ Error QnnBackendCache::GetQnnGraphInfoFromBinary() {
   return Error::Ok;
 }
 
-QnnBackendCache::QnnBackendCache(
-    const QnnExecuTorchContextBinary& qnn_context_blob)
-    : qnn_context_blob_(qnn_context_blob) {
+Error QnnBackendCache::Configure() {
   if (qnn_context_blob_.buffer == nullptr) {
     state_ = SERIALIZE;
     QNN_EXECUTORCH_LOG_INFO("Caching: Caching is in SAVE MODE.");
-    return;
+    return Error::Ok;
   }
 
   if (qnn_sys_impl_.Load() != Error::Ok) {
     QNN_EXECUTORCH_LOG_ERROR(
         "Failed to Load QnnSystem "
         "APIs. Caching mechanism is being disabled.");
-    return;
+    return Error::Internal;
   }
 
   Qnn_ErrorHandle_t error = QNN_SUCCESS;
@@ -109,7 +114,7 @@ QnnBackendCache::QnnBackendCache(
         "Failed to create Qnn "
         "SystemContext. Caching mechanism will be disabled. Error code %d",
         QNN_GET_ERROR_CODE(error));
-    return;
+    return Error::Internal;
   }
 
   // DO DESERIALIZE
@@ -125,7 +130,7 @@ QnnBackendCache::QnnBackendCache(
 
     if (qcir::VerifyGraphBuffer(verifier)) {
       state_ = ONLINE_PREPARE;
-      return;
+      return Error::Ok;
     }
 
     QNN_EXECUTORCH_LOG_ERROR(
@@ -133,8 +138,8 @@ QnnBackendCache::QnnBackendCache(
         "might be broken. Please consider to re-generate the "
         "cache.");
     InvalidateCache();
-    return;
   }
+  return Error::Ok;
 }
 
 QnnBackendCache::~QnnBackendCache() {
diff --git a/backends/qualcomm/runtime/backends/QnnBackendCache.h b/backends/qualcomm/runtime/backends/QnnBackendCache.h
index ad6d3d0bd7b..6b1f5863a15 100644
--- a/backends/qualcomm/runtime/backends/QnnBackendCache.h
+++ b/backends/qualcomm/runtime/backends/QnnBackendCache.h
@@ -23,9 +23,9 @@ class QnnBackendCache {
     DESERIALIZE = 2,
     ONLINE_PREPARE = 3,
   };
-  explicit QnnBackendCache(const QnnExecuTorchContextBinary& qnn_context_blob);
-
-  ~QnnBackendCache();
+  explicit QnnBackendCache(const QnnExecuTorchContextBinary& qnn_context_blob)
+      : qnn_context_blob_(qnn_context_blob) {}
+  virtual ~QnnBackendCache();
   QnnBackendCache(const QnnBackendCache&) = delete;
   QnnBackendCache(QnnBackendCache&&) = delete;
   QnnBackendCache& operator=(const QnnBackendCache&) = delete;
@@ -51,6 +51,14 @@ class QnnBackendCache {
     return graph_name_;
   }
 
+  Error Configure();
+
+ protected:
+  virtual Error RetrieveBackendBinaryInfo(
+      __ET_UNUSED const QnnSystemContext_BinaryInfo_t* binaryinfo) {
+    return Error::Ok;
+  }
+
  private:
   Error GetQnnGraphInfoFromBinary();
 
diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp
index 9fb292613a3..52128a8b496 100644
--- a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp
+++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp
@@ -56,11 +56,14 @@ std::unique_ptr<BackendConfigParameters> QnnBackendFactory::Create(
       backend_params->qnn_device_ptr_ = std::make_unique<HtpDevice>(
           implementation, logger, options->soc_info(), htp_options);
 
+      backend_params->qnn_backend_cache_ptr_ =
+          std::make_unique<HtpBackendCache>(qnn_context_blob);
+
       backend_params->qnn_context_ptr_ = std::make_unique<HtpContext>(
           implementation,
           backend_params->qnn_backend_ptr_.get(),
           backend_params->qnn_device_ptr_.get(),
-          qnn_context_blob,
+          backend_params->qnn_backend_cache_ptr_.get(),
           htp_options);
 
       backend_params->qnn_graph_ptr_ = std::make_unique<HtpGraph>(
diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.h b/backends/qualcomm/runtime/backends/QnnBackendFactory.h
index ab47113a538..dfa6b825088 100644
--- a/backends/qualcomm/runtime/backends/QnnBackendFactory.h
+++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.h
@@ -8,6 +8,7 @@
 #pragma once
 
 #include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>
+#include <executorch/backends/qualcomm/runtime/backends/QnnBackendCache.h>
 #include <executorch/backends/qualcomm/runtime/backends/QnnBackendCommon.h>
 #include <executorch/backends/qualcomm/runtime/backends/QnnContextCommon.h>
 #include <executorch/backends/qualcomm/runtime/backends/QnnDeviceCommon.h>
@@ -16,6 +17,7 @@
 #include <executorch/backends/qualcomm/runtime/backends/QnnLogger.h>
 #include <executorch/backends/qualcomm/runtime/backends/QnnMemManager.h>
 #include <executorch/backends/qualcomm/runtime/backends/htpbackend/HtpBackend.h>
+#include <executorch/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h>
 #include <executorch/backends/qualcomm/runtime/backends/htpbackend/HtpContext.h>
 #include <executorch/backends/qualcomm/runtime/backends/htpbackend/HtpDevice.h>
 #include <executorch/backends/qualcomm/runtime/backends/htpbackend/HtpGraph.h>
@@ -35,6 +37,7 @@ typedef struct BackendConfigParameters {
   std::unique_ptr<QnnDevice> qnn_device_ptr_;
   std::unique_ptr<QnnGraph> qnn_graph_ptr_;
   std::unique_ptr<QnnMemManager> qnn_mem_manager_ptr_;
+  std::unique_ptr<QnnBackendCache> qnn_backend_cache_ptr_;
 
   // Default ctor
   BackendConfigParameters()
@@ -43,10 +46,12 @@ typedef struct BackendConfigParameters {
         qnn_context_ptr_(nullptr),
         qnn_device_ptr_(nullptr),
         qnn_graph_ptr_(nullptr),
-        qnn_mem_manager_ptr_(nullptr) {}
+        qnn_mem_manager_ptr_(nullptr),
+        qnn_backend_cache_ptr_(nullptr) {}
   // Default dtor
   ~BackendConfigParameters() {
     qnn_graph_ptr_.reset();
+    qnn_backend_cache_ptr_.reset();
     qnn_mem_manager_ptr_.reset();
     qnn_context_ptr_.reset();
     qnn_device_ptr_.reset();
diff --git a/backends/qualcomm/runtime/backends/QnnContextCommon.h b/backends/qualcomm/runtime/backends/QnnContextCommon.h
index e706a5a870f..1246e1c2f83 100644
--- a/backends/qualcomm/runtime/backends/QnnContextCommon.h
+++ b/backends/qualcomm/runtime/backends/QnnContextCommon.h
@@ -22,13 +22,12 @@ class QnnContext {
       const QnnImplementation& implementation,
       QnnBackend* backend,
       QnnDevice* device,
-      const QnnExecuTorchContextBinary& qnn_context_blob)
+      QnnBackendCache* cache)
       : handle_(nullptr),
         implementation_(implementation),
         backend_(backend),
-        device_(device) {
-    cache_ = std::make_unique<QnnBackendCache>(qnn_context_blob);
-  }
+        device_(device),
+        cache_(cache) {}
 
   virtual ~QnnContext();
   Error Configure();
@@ -67,7 +66,7 @@ class QnnContext {
   const QnnImplementation& implementation_;
   QnnBackend* backend_;
   QnnDevice* device_;
-  std::unique_ptr<QnnBackendCache> cache_;
+  QnnBackendCache* cache_;
   std::vector<char> binary_buffer_;
 };
 } // namespace qnn
diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.cpp b/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.cpp
new file mode 100644
index 00000000000..8cd9b69aa67
--- /dev/null
+++ b/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.cpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) Qualcomm Innovation Center, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <executorch/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h>
+#include "HTP/QnnHtpSystemContext.h"
+
+namespace torch {
+namespace executor {
+namespace qnn {
+Error HtpBackendCache::RetrieveBackendBinaryInfo(
+    const QnnSystemContext_BinaryInfo_t* binaryinfo) {
+  QnnHtpSystemContext_HwBlobInfo_t* htp_hwblobinfo = nullptr;
+
+  if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_1) {
+    htp_hwblobinfo = static_cast<QnnHtpSystemContext_HwBlobInfo_t*>(
+        binaryinfo->contextBinaryInfoV1.hwInfoBlob);
+  } else if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_2) {
+    htp_hwblobinfo = static_cast<QnnHtpSystemContext_HwBlobInfo_t*>(
+        binaryinfo->contextBinaryInfoV2.hwInfoBlob);
+  } else {
+    QNN_EXECUTORCH_LOG_WARN(
+        "Unknown QNN BinaryInfo version %d.", binaryinfo->version);
+    return Error::Internal;
+  }
+
+  if (htp_hwblobinfo == nullptr) {
+    QNN_EXECUTORCH_LOG_WARN(
+        "Htp hardware blob information is not found in binary information.");
+    return Error::Ok;
+  }
+
+  if (htp_hwblobinfo->version ==
+      QNN_SYSTEM_CONTEXT_HTP_HW_INFO_BLOB_VERSION_V1) {
+    spill_fill_buf_ =
+        (*htp_hwblobinfo).contextBinaryHwInfoBlobV1_t.spillFillBufferSize;
+  } else {
+    QNN_EXECUTORCH_LOG_WARN(
+        "Unknown QNN Htp hw blob info version %d.", htp_hwblobinfo->version);
+    return Error::Internal;
+  }
+
+  return Error::Ok;
+}
+
+} // namespace qnn
+} // namespace executor
+} // namespace torch
diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h b/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h
new file mode 100644
index 00000000000..b97fce18c51
--- /dev/null
+++ b/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) Qualcomm Innovation Center, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#pragma once
+#include <executorch/backends/qualcomm/runtime/backends/QnnBackendCache.h>
+
+namespace torch {
+namespace executor {
+namespace qnn {
+class HtpBackendCache : public QnnBackendCache {
+ public:
+  explicit HtpBackendCache(const QnnExecuTorchContextBinary& qnn_context_blob)
+      : QnnBackendCache(qnn_context_blob), spill_fill_buf_(0) {}
+  ~HtpBackendCache() override = default;
+
+  uint64_t GetSpillFillBufferSize() {
+    return spill_fill_buf_;
+  }
+
+ protected:
+  Error RetrieveBackendBinaryInfo(
+      const QnnSystemContext_BinaryInfo_t* binaryinfo) override;
+
+ private:
+  uint64_t spill_fill_buf_;
+};
+} // namespace qnn
+} // namespace executor
+} // namespace torch
diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpContext.h b/backends/qualcomm/runtime/backends/htpbackend/HtpContext.h
index 14d06824a12..f3487ad05bc 100644
--- a/backends/qualcomm/runtime/backends/htpbackend/HtpContext.h
+++ b/backends/qualcomm/runtime/backends/htpbackend/HtpContext.h
@@ -22,9 +22,9 @@ class HtpContext : public QnnContext {
       const QnnImplementation& implementation,
       QnnBackend* backend,
       QnnDevice* device,
-      const QnnExecuTorchContextBinary& qnn_context_blob,
+      QnnBackendCache* cache,
       const QnnExecuTorchHtpBackendOptions* htp_options)
-      : QnnContext(implementation, backend, device, qnn_context_blob) {
+      : QnnContext(implementation, backend, device, cache) {
     htp_context_custom_config_ =
         std::make_unique<HtpContextCustomConfig>(this, htp_options);
   }
diff --git a/backends/qualcomm/runtime/backends/htpbackend/aarch64/HtpContextCustomConfig.cpp b/backends/qualcomm/runtime/backends/htpbackend/aarch64/HtpContextCustomConfig.cpp
index 88f09c3cf4e..dfc5a3e2766 100644
--- a/backends/qualcomm/runtime/backends/htpbackend/aarch64/HtpContextCustomConfig.cpp
+++ b/backends/qualcomm/runtime/backends/htpbackend/aarch64/HtpContextCustomConfig.cpp
@@ -19,7 +19,8 @@ HtpContextCustomConfig::CreateContextCustomConfig() {
   QnnHtpContext_CustomConfig_t* p_custom_config = nullptr;
   const HtpContext* htp_ctx = static_cast<const HtpContext*>(context_);
 
-  if (htp_options_->use_multi_contexts()) {
+  if (htp_options_->use_multi_contexts() &&
+      htp_options_->max_sf_buf_size() != 0) {
     p_custom_config = AllocContextCustomConfig();
     p_custom_config->option =
         QNN_HTP_CONTEXT_CONFIG_OPTION_REGISTER_MULTI_CONTEXTS;
diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py
index a0c0abf7295..e1be24d0d64 100644
--- a/backends/qualcomm/utils/utils.py
+++ b/backends/qualcomm/utils/utils.py
@@ -208,16 +208,34 @@ def process_exported_program(prog):
                     == QnnExecuTorchBackendType.kHtpBackend
                     and options.backend_options.htp_options.use_multi_contexts
                 ):
-                    max_sf_buf_size = max(max_sf_buf_size, len(m.processed_bytes))
+                    qnn_mgr = PyQnnManagerAdaptor.QnnManager(
+                        m.compile_specs[0].value, m.processed_bytes
+                    )
+                    # Call Init() outside of an `assert` statement: asserts are
+                    # stripped under `python -O`, which would skip the load.
+                    if qnn_mgr.Init().value != 0:
+                        raise AssertionError("failed to load context binary")
+                    max_sf_buf_size = max(
+                        max_sf_buf_size, qnn_mgr.GetSpillFillBufferSize()
+                    )
                     module_map[m] = options
+                    qnn_mgr.Destroy()
             return max_sf_buf_size, module_map
 
         def process_lowered_module(module):
+            qnn_mgr = PyQnnManagerAdaptor.QnnManager(
+                module.compile_specs[0].value, module.processed_bytes
+            )
+            # Call Init() outside of an `assert` statement: asserts are stripped
+            # under `python -O`, which would skip loading the context binary.
+            if qnn_mgr.Init().value != 0:
+                raise AssertionError("failed to load context binary")
             spill_fill_size = (
-                len(module.processed_bytes)
+                qnn_mgr.GetSpillFillBufferSize()
                 if custom_buffer_size is None
                 else custom_buffer_size
             )
+            qnn_mgr.Destroy()
             return spill_fill_size, {
                 module: convert_to_option(module.compile_specs[0].value)
             }
diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py
index 5cef72c1e6e..70b83989625 100644
--- a/examples/models/llama2/export_llama_lib.py
+++ b/examples/models/llama2/export_llama_lib.py
@@ -580,10 +580,7 @@ def _export_llama(modelname, args) -> LLMEdgeManager:  # noqa: C901
         if args.num_sharding > 0 and args.qnn:
             from executorch.backends.qualcomm.utils.utils import canonicalize_program
 
-            # TODO: Need to remove this once we have better way to handle buffer size
-            canonicalize_program(
-                builder.edge_manager.exported_program(), custom_buffer_size=542048256
-            )
+            canonicalize_program(builder.edge_manager.exported_program())
 
         builder = builder.to_executorch()
 
@@ -600,10 +597,7 @@ def _export_llama(modelname, args) -> LLMEdgeManager:  # noqa: C901
         if args.num_sharding > 0 and args.qnn:
             from executorch.backends.qualcomm.utils.utils import canonicalize_program
 
-            # TODO: Need to remove this once we have better way to handle buffer size
-            canonicalize_program(
-                builder.edge_manager.exported_program(), custom_buffer_size=542048256
-            )
+            canonicalize_program(builder.edge_manager.exported_program())
 
         builder = builder.to_executorch()
 
diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py
index bba16dd8a4d..8de4e1a00c4 100644
--- a/extension/llm/export/partitioner_lib.py
+++ b/extension/llm/export/partitioner_lib.py
@@ -167,7 +167,7 @@ def get_qnn_partitioner(
         )
 
     use_fp16 = True
-    skip_node_op_set = {"llama.fallback.default", "aten.embedding.default"}
+    skip_node_op_set = {"llama.fallback.default"}
     if pt2e_quantize is not None:
         use_fp16 = False
 
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index 45d9932724e..30701e4fa54 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -209,11 +209,6 @@ def get_qnn_quantizer(
         quantization_mode is None
     ), "Currently qnn backend only supports QnnQuantizer via pt2e flow"
     qnn_quantizer.add_custom_quant_annotations(custom_annotations)
-    qnn_quantizer.add_discard_ops(
-        [
-            torch.ops.aten.embedding.default,
-        ]
-    )
 
     return qnn_quantizer, quant_dtype