diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp
index 1f45f062cfb..6a4757089fb 100644
--- a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp
+++ b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp
@@ -207,6 +207,7 @@ PYBIND11_MODULE(PyQnnManagerAdaptor, m) {
       .def("Init", &PyQnnManager::Init)
      .def("InitBackend", &PyQnnManager::InitBackend)
      .def("InitContext", &PyQnnManager::InitContext)
+      .def("InitContextCache", &PyQnnManager::InitContextCache)
      .def("IsNodeSupportedByBackend", &PyQnnManager::IsNodeSupportedByBackend)
      .def(
          "Compile",
diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h
index c1434db5573..e1850bc4fa8 100644
--- a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h
+++ b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h
@@ -236,6 +236,10 @@ class PyQnnManager {
     return qnn_manager_->InitContext(std::optional{graph_names});
   }

+  executorch::runtime::Error InitContextCache() {
+    return qnn_manager_->InitContextCache();
+  }
+
   bool IsNodeSupportedByBackend(
       std::vector>& op_wrappers) {
     return qnn_manager_->IsNodeSupportedByBackend(op_wrappers);
diff --git a/backends/qualcomm/quantizer/qconfig.py b/backends/qualcomm/quantizer/qconfig.py
index 7b46f9d0d9b..f281692a2d4 100644
--- a/backends/qualcomm/quantizer/qconfig.py
+++ b/backends/qualcomm/quantizer/qconfig.py
@@ -114,14 +114,21 @@ def get_8a8w_qnn_ptq_config(
     # the smallest scale defaults to DEFAULT_EPS_8BIT
     extra_args: Dict[str, Any] = {"eps": eps if eps else DEFAULT_EPS_8BIT}

-    act_quantization_spec = QuantizationSpec(
-        dtype=torch.uint8,
-        qscheme=(
-            torch.per_tensor_symmetric if act_symmetric else torch.per_tensor_affine
-        ),
-        ch_axis=0,
-        observer_or_fake_quant_ctr=act_observer.with_args(**extra_args),
-    )
+    if act_symmetric:
+        act_quantization_spec = QuantizationSpec(
+            dtype=torch.uint8,
+            qscheme=(torch.per_tensor_symmetric),
+            ch_axis=0,
+            observer_or_fake_quant_ctr=act_observer.with_args(**extra_args),
+        )
+    else:
+        act_quantization_spec = QuantizationSpec(
+            dtype=torch.uint8,
+            quant_min=torch.iinfo(torch.uint8).min,
+            quant_max=torch.iinfo(torch.uint8).max,
+            qscheme=(torch.per_tensor_affine),
+            observer_or_fake_quant_ctr=act_observer.with_args(**extra_args),
+        )

     weight_quantization_spec = QuantizationSpec(
         dtype=torch.int8,
diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp
index e64d58a8971..871ae1925e4 100644
--- a/backends/qualcomm/runtime/QnnManager.cpp
+++ b/backends/qualcomm/runtime/QnnManager.cpp
@@ -296,6 +296,37 @@ Error QnnManager::InitContext(
   return Error::Ok;
 }

+Error QnnManager::InitContextCache() {
+  if (backend_params_ptr_->backend_init_state_ ==
+      BackendInitializeState::UNINITIALIZED) {
+    QNN_EXECUTORCH_LOG_INFO(
+        "Initialize Qnn backend "
+        "parameters for Qnn executorch backend type %d",
+        options_->backend_options()->backend_type());
+    backend_params_ptr_ = QnnBackendFactory().Create(
+        backend_bundle_ptr_->implementation.get(),
+        backend_bundle_ptr_->qnn_backend_ptr.get(),
+        backend_bundle_ptr_->qnn_device_ptr.get(),
+        qnn_context_blob_,
+        options_,
+        qnn_dlc_manager_.get());
+    ET_CHECK_OR_RETURN_ERROR(
+        backend_params_ptr_ != nullptr,
+        Internal,
+        "Failed to load Qnn backend.");
+    // Note: For online_prepare or deserialization, the graph name will be
+    // obtained from the binary.
+    ET_CHECK_OR_RETURN_ERROR(
+        backend_params_ptr_->qnn_backend_cache_ptr_->Configure({}) == Error::Ok,
+        Internal,
+        "Failed to configure Qnn backend cache");
+
+    backend_params_ptr_->backend_init_state_ =
+        BackendInitializeState::INITIALIZED;
+  }
+  return Error::Ok;
+}
+
 Error QnnManager::AllocateTensor(const std::string& graph_name) {
   std::vector input_tensors =
       backend_params_ptr_->qnn_context_ptr_->GetGraphInputs(graph_name);
diff --git a/backends/qualcomm/runtime/QnnManager.h b/backends/qualcomm/runtime/QnnManager.h
index 866a4edbad6..ef0385c517b 100644
--- a/backends/qualcomm/runtime/QnnManager.h
+++ b/backends/qualcomm/runtime/QnnManager.h
@@ -38,6 +38,9 @@ class QnnManager {
   // graph name will be obtained from the binary.
   executorch::runtime::Error InitContext(
       std::optional> graph_names = std::nullopt);
+  // This function only initializes the context cache to get the spill fill
+  // buffer size.
+  executorch::runtime::Error InitContextCache();
   executorch::runtime::Error AllocateTensor(const std::string& graph_name);
   executorch::runtime::Error AllocateTensor(
       const std::string& graph_name,
diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp
index 9c559d83fcc..404ebff88af 100644
--- a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp
+++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp
@@ -51,6 +51,9 @@ std::unique_ptr QnnBackendFactory::Create(
           "use_dlbc in htp_options: %d", htp_options->use_dlbc());
      QNN_EXECUTORCH_LOG_INFO(
          "use_fold_relu in htp_options: %d", htp_options->use_fold_relu());
+      QNN_EXECUTORCH_LOG_INFO(
+          "use_slc_allocator in htp_options: %d",
+          htp_options->use_slc_allocator());
      QNN_EXECUTORCH_LOG_INFO(
          "use_multi_contexts in htp_options: %d",
          htp_options->use_multi_contexts());
diff --git a/backends/qualcomm/runtime/backends/htp/HtpBackendCache.cpp b/backends/qualcomm/runtime/backends/htp/HtpBackendCache.cpp
index 3038a100d03..be95c9cfdbf 100644
--- a/backends/qualcomm/runtime/backends/htp/HtpBackendCache.cpp
+++ b/backends/qualcomm/runtime/backends/htp/HtpBackendCache.cpp
@@ -18,7 +18,9 @@ Error HtpBackendCache::RetrieveBackendBinaryInfo(
     const QnnSystemContext_BinaryInfo_t* binaryinfo) {
   QnnHtpSystemContext_HwBlobInfo_t* htp_hwblobinfo = nullptr;
 #if (QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 21)
-  QnnHtpSystemContext_GraphBlobInfo_t* htp_graphblobinfo = nullptr;
+  std::vector htp_graphblobinfos;
+  std::uint32_t num_graphs = 0;
+
 #endif

   if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_1) {
@@ -29,8 +31,13 @@ Error HtpBackendCache::RetrieveBackendBinaryInfo(
         binaryinfo->contextBinaryInfoV2.hwInfoBlob);
 #if (QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 21)
   } else if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_3) {
-    htp_graphblobinfo = static_cast(
-        binaryinfo->contextBinaryInfoV3.graphs->graphInfoV3.graphBlobInfo);
+    num_graphs = binaryinfo->contextBinaryInfoV3.numGraphs;
+    for (size_t i = 0; i < num_graphs; ++i) {
+      htp_graphblobinfos.push_back(
+          static_cast(
+              binaryinfo->contextBinaryInfoV3.graphs[i]
+                  .graphInfoV3.graphBlobInfo));
+    }
 #endif
   } else {
     QNN_EXECUTORCH_LOG_WARN(
@@ -51,15 +58,24 @@ Error HtpBackendCache::RetrieveBackendBinaryInfo(
   }

 #if (QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 21)
-  if (htp_graphblobinfo) {
-    if (htp_graphblobinfo->version ==
+  if (htp_graphblobinfos.size() > 0) {
+    // After version 2.21, we need to get the spill fill buffer size from the
+    // graph blob info instead of the hw blob info. If there are multiple
+    // graphs, we should use the max value among all graphs.
+    if (htp_graphblobinfos[0]->version ==
         QNN_SYSTEM_CONTEXT_HTP_GRAPH_INFO_BLOB_VERSION_V1) {
-      spill_fill_buf_ =
-          (*htp_graphblobinfo).contextBinaryGraphBlobInfoV1.spillFillBufferSize;
+      for (size_t i = 0; i < num_graphs; ++i) {
+        uint64_t spill_fill_buf =
+            (*htp_graphblobinfos[i])
+                .contextBinaryGraphBlobInfoV1.spillFillBufferSize;
+        if (spill_fill_buf > spill_fill_buf_) {
+          spill_fill_buf_ = spill_fill_buf;
+        }
+      }
     } else {
       QNN_EXECUTORCH_LOG_WARN(
           "Unknown QNN Htp graph blob info version %d.",
-          htp_graphblobinfo->version);
+          htp_graphblobinfos[0]->version);
       return Error::Internal;
     }
   }
diff --git a/backends/qualcomm/runtime/backends/htp/HtpGraphCustomConfig.cpp b/backends/qualcomm/runtime/backends/htp/HtpGraphCustomConfig.cpp
index 17b8438880d..3e8dd5201c6 100644
--- a/backends/qualcomm/runtime/backends/htp/HtpGraphCustomConfig.cpp
+++ b/backends/qualcomm/runtime/backends/htp/HtpGraphCustomConfig.cpp
@@ -70,6 +70,14 @@ HtpGraphCustomConfig::CreateGraphCustomConfigCommon(
       htp_options_->use_dlbc() ? 1.0 : 0.0;
   ret.push_back(static_cast(p_custom_config));

+  p_custom_config = AllocGraphCustomConfig();
+  p_custom_config->option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
+  p_custom_config->optimizationOption.type =
+      QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_SLC_ALLOCATOR;
+  p_custom_config->optimizationOption.floatValue =
+      htp_options_->use_slc_allocator() ? 1.0 : 0.0;
+  ret.push_back(static_cast(p_custom_config));
+
   return ret;
 }
 } // namespace qnn
diff --git a/backends/qualcomm/serialization/qc_compiler_spec.fbs b/backends/qualcomm/serialization/qc_compiler_spec.fbs
index 4a42cfd6c57..ff224f932ed 100644
--- a/backends/qualcomm/serialization/qc_compiler_spec.fbs
+++ b/backends/qualcomm/serialization/qc_compiler_spec.fbs
@@ -181,6 +181,11 @@ table QnnExecuTorchHtpBackendOptions {
   /// When multiple graphs appear inside the same context,
   /// weights could be reused across all graphs.
   use_weight_sharing:bool;
+
+  /// Allows the user to enable the System Level Cache (SLC) allocator for a given graph.
+  /// It helps by reducing overall bandwidth for the use case.
+  /// The feature is only supported on specific SoCs.
+  use_slc_allocator:bool;
 }

 /// Logging level of the delegate and QNN backend.
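For context, the new HTP option flows from the flatbuffer schema above into the graph custom config at delegation time. A minimal usage sketch from the Python side, assuming the pre-existing use_fp16 parameter and compile-spec helpers of backends/qualcomm/utils/utils.py (only use_slc_allocator is added by this patch):

    # Hypothetical example: build HTP backend options with the SLC allocator enabled.
    from executorch.backends.qualcomm.utils.utils import generate_htp_compiler_spec

    backend_options = generate_htp_compiler_spec(
        use_fp16=True,  # assumed pre-existing parameter, not part of this patch
        use_dlbc=False,
        use_multi_contexts=False,
        use_weight_sharing=False,
        use_slc_allocator=True,  # new flag; only honored on SoCs that support the SLC allocator
    )
    # backend_options is then passed to the usual QNN compile-spec generation unchanged.
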
diff --git a/backends/qualcomm/serialization/qc_schema.py b/backends/qualcomm/serialization/qc_schema.py
index 3f8962c8eca..626522f461c 100644
--- a/backends/qualcomm/serialization/qc_schema.py
+++ b/backends/qualcomm/serialization/qc_schema.py
@@ -166,6 +166,7 @@ class QnnExecuTorchHtpBackendOptions:
     use_fold_relu: bool = True
     use_multi_contexts: bool = False
     use_weight_sharing: bool = False
+    use_slc_allocator: bool = False


 @unique
diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py
index 9d38c0f8867..62a8a55c9f5 100644
--- a/backends/qualcomm/utils/utils.py
+++ b/backends/qualcomm/utils/utils.py
@@ -244,7 +244,12 @@ def process_exported_program(prog):
                 qnn_mgr = PyQnnManagerAdaptor.QnnManager(
                     m.compile_specs[0].value, m.processed_bytes
                 )
-                assert qnn_mgr.Init().value == 0, "failed to load context binary"
+                assert (
+                    qnn_mgr.InitBackend().value == 0
+                ), "failed to initialize backend"
+                assert (
+                    qnn_mgr.InitContextCache().value == 0
+                ), "failed to init context cache"
                 max_sf_buf_size = max(
                     max_sf_buf_size, qnn_mgr.GetSpillFillBufferSize()
                 )
@@ -256,7 +261,8 @@ def process_lowered_module(module):
         qnn_mgr = PyQnnManagerAdaptor.QnnManager(
             module.compile_specs[0].value, module.processed_bytes
         )
-        assert qnn_mgr.Init().value == 0, "failed to load context binary"
+        assert qnn_mgr.InitBackend().value == 0, "failed to initialize backend"
+        assert qnn_mgr.InitContextCache().value == 0, "failed to init context cache"
         spill_fill_size = qnn_mgr.GetSpillFillBufferSize()
         qnn_mgr.Destroy()
         return spill_fill_size, {
@@ -991,6 +997,7 @@ def generate_htp_compiler_spec(
     use_dlbc: bool = False,
     use_multi_contexts: bool = False,
     use_weight_sharing: bool = False,
+    use_slc_allocator: bool = False,
 ) -> QnnExecuTorchBackendOptions:
     """
     Helper function generating backend options for QNN HTP
@@ -1006,6 +1013,9 @@ def generate_htp_compiler_spec(
             could be re-used across all the splits.
         use_weight_sharing: Used with multiple_graphs, where model size will be
             reduced when operations have the same weights across multiple graphs.
+        use_slc_allocator: Allows the user to enable the System Level Cache (SLC)
+            allocator for a given graph. It helps by reducing overall bandwidth
+            for the use case. The feature is only supported on specific SoCs.

     Returns:
         QnnExecuTorchHtpBackendOptions: backend options for QNN HTP.
@@ -1023,6 +1033,7 @@ def generate_htp_compiler_spec(
     htp_options.use_multi_contexts = use_multi_contexts
     htp_options.use_weight_sharing = use_weight_sharing
     htp_options.use_dlbc = use_dlbc
+    htp_options.use_slc_allocator = use_slc_allocator
     return QnnExecuTorchBackendOptions(
         backend_type=QnnExecuTorchBackendType.kHtpBackend,
         htp_options=htp_options,
diff --git a/examples/qualcomm/oss_scripts/llama/decoder_utils.py b/examples/qualcomm/oss_scripts/llama/decoder_utils.py
index c50cd455e75..20a7ab99c8d 100644
--- a/examples/qualcomm/oss_scripts/llama/decoder_utils.py
+++ b/examples/qualcomm/oss_scripts/llama/decoder_utils.py
@@ -373,6 +373,26 @@ def smart_mask_updater(
     return pos, k_caches, v_caches


+def evict_tokens(
+    ar_len: int,
+    atten_mask: AttentionMask,
+    pos,
+    k_caches,
+    v_caches,
+    rope_module,
+    position_shift,
+):
+    max_cache_len = k_caches[0].size(-1)
+    shifted_pos = pos + position_shift
+    if shifted_pos + ar_len > max_cache_len:
+        num_to_evict = rope_module.eviction_batch_size
+        k_caches, v_caches = rope_module(k_caches, v_caches)
+        position_shift -= num_to_evict
+        shifted_pos -= num_to_evict
+        atten_mask.smart_mask_init(shifted_pos)
+    return k_caches, v_caches, position_shift
+
+
 def _prefill_chunking(
     inputs: DecoderInputs,
     module: torch.fx.GraphModule,
diff --git a/examples/qualcomm/oss_scripts/llama/eval_llama_qnn.py b/examples/qualcomm/oss_scripts/llama/eval_llama_qnn.py
index 68fbb86f7cb..608e727c3b2 100644
--- a/examples/qualcomm/oss_scripts/llama/eval_llama_qnn.py
+++ b/examples/qualcomm/oss_scripts/llama/eval_llama_qnn.py
@@ -418,7 +418,7 @@ def eval_llama_with_attention_sink(args):
         layer.feed_forward.prepare_feedfoward_conv()

     model = convert_linear_to_conv2d(model)
-    _, atten_mask, _, k_caches, v_caches = model.get_example_inputs(use_kv_cache=True)
+    _, atten_mask, _, k_caches, v_caches = model.get_example_inputs()

     eval_data = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
     neg_log_likelihoods = []
diff --git a/examples/qualcomm/oss_scripts/llama/wrappers/llm_wrappers.py b/examples/qualcomm/oss_scripts/llama/wrappers/llm_wrappers.py
index 0f5076ca94a..44917e0bd5a 100644
--- a/examples/qualcomm/oss_scripts/llama/wrappers/llm_wrappers.py
+++ b/examples/qualcomm/oss_scripts/llama/wrappers/llm_wrappers.py
@@ -824,9 +824,9 @@ def compile(self, request: Request):  # noqa: C901
             skip_node_op_set={"llama.fallback.default"},
         )

-        if self.config.num_sharding > 1 and self.control_args.model_mode == "kv":
-            # weight-sharing based context binaries cannot be opened in x86 host
-            update_spill_fill_size(edge_prog_mgr.exported_program("kv_forward"))
+        if self.config.num_sharding > 1:
+            for graph_name in graph_names:
+                update_spill_fill_size(edge_prog_mgr.exported_program(graph_name))

         if self.control_args.verbose:
             for ep in edge_prog_mgr._edge_programs.values():
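For reference, splitting the former single Init() call into InitBackend() plus InitContextCache() lets update_spill_fill_size read the spill-fill buffer size without fully initializing a context. A minimal sketch mirroring process_lowered_module above, assuming the PyQnnManagerAdaptor extension is importable from executorch.backends.qualcomm.python as in utils.py, and that compile_spec_value/processed_bytes come from a lowered module:

    # Hypothetical helper: query the spill-fill buffer size from a lowered module's payload.
    from executorch.backends.qualcomm.python import PyQnnManagerAdaptor


    def query_spill_fill_size(compile_spec_value: bytes, processed_bytes: bytes) -> int:
        qnn_mgr = PyQnnManagerAdaptor.QnnManager(compile_spec_value, processed_bytes)
        assert qnn_mgr.InitBackend().value == 0, "failed to initialize backend"
        assert qnn_mgr.InitContextCache().value == 0, "failed to init context cache"
        spill_fill_size = qnn_mgr.GetSpillFillBufferSize()
        qnn_mgr.Destroy()
        return spill_fill_size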