diff --git a/.ci/scripts/build-qnn-sdk.sh b/.ci/scripts/build-qnn-sdk.sh index c48ac2056aa..2492b1fd3d6 100644 --- a/.ci/scripts/build-qnn-sdk.sh +++ b/.ci/scripts/build-qnn-sdk.sh @@ -11,7 +11,7 @@ set -o xtrace build_qnn_backend() { echo "Start building qnn backend." export ANDROID_NDK_ROOT=/opt/ndk - export QNN_SDK_ROOT=/tmp/qnn/2.23.0.240531 + export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728 export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)" bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number 2 --release diff --git a/.ci/scripts/setup-qnn-deps.sh b/.ci/scripts/setup-qnn-deps.sh index 3b39e1aafe3..92ffd07bccc 100644 --- a/.ci/scripts/setup-qnn-deps.sh +++ b/.ci/scripts/setup-qnn-deps.sh @@ -7,14 +7,18 @@ set -ex +verify_pkg_installed() { + echo $(dpkg-query -W --showformat='${Status}\n' $1|grep "install ok installed") +} + install_qnn() { echo "Start installing qnn." QNN_INSTALLATION_DIR=/tmp/qnn mkdir -p "${QNN_INSTALLATION_DIR}" - curl -Lo /tmp/v2.23.0.24.06.24.zip "https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.23.0.24.06.24.zip" + curl -Lo /tmp/v2.25.0.24.07.28.zip "https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.25.0.240728.zip" echo "Finishing downloading qnn sdk." - unzip -qo /tmp/v2.23.0.24.06.24.zip -d /tmp + unzip -qo /tmp/v2.25.0.24.07.28.zip -d /tmp echo "Finishing unzip qnn sdk." @@ -26,4 +30,22 @@ install_qnn() { ls -lah "${QNN_INSTALLATION_DIR}" } +setup_libc++() { + sudo apt-get update + pkgs_to_check=('libc++-dev') + j=0 + while [ $j -lt ${#pkgs_to_check[*]} ]; do + install_status=$(verify_pkg_installed ${pkgs_to_check[$j]}) + if [ "$install_status" == "" ]; then + sudo apt-get install -y ${pkgs_to_check[$j]} + if [[ $? -ne 0 ]]; then + echo "ERROR: Failed to install required packages for libc++" + exit 1 + fi + fi + j=$(( $j +1)); + done +} + +setup_libc++ install_qnn diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index 290ece7b8e6..5721b7fd607 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -75,7 +75,7 @@ echo "COREML option ${COREML}" if [[ "${MODE}" =~ .*qnn.* ]]; then QNN=ON export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)" - export QNN_SDK_ROOT=/tmp/qnn/2.23.0.240531 + export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728 export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang" export PYTHONPATH=".." cp schema/program.fbs exir/_serialize/program.fbs diff --git a/.ci/scripts/test_llava.sh b/.ci/scripts/test_llava.sh index 7dc6d15e407..8ac87b2302d 100644 --- a/.ci/scripts/test_llava.sh +++ b/.ci/scripts/test_llava.sh @@ -33,6 +33,7 @@ if hash nproc &> /dev/null; then NPROC=$(nproc); fi EXECUTORCH_COMMON_CMAKE_ARGS=" \ -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh index e589337666d..0b8574573fb 100755 --- a/.ci/scripts/test_model.sh +++ b/.ci/scripts/test_model.sh @@ -209,7 +209,13 @@ elif [[ "${BACKEND}" == "coreml" ]]; then fi elif [[ "${BACKEND}" == "xnnpack" ]]; then echo "Testing ${MODEL_NAME} with xnnpack..." 
- test_model_with_xnnpack true true + WITH_QUANTIZATION=true + WITH_DELEGATION=true + if [[ "$MODEL_NAME" == "mobilebert" ]]; then + # TODO(T197452682) + WITH_QUANTIZATION=false + fi + test_model_with_xnnpack "${WITH_QUANTIZATION}" "${WITH_DELEGATION}" if [[ $? -eq 0 ]]; then prepare_artifacts_upload fi diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index c98fa98bb26..ba58435c69a 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -178,6 +178,7 @@ jobs: upload-models: needs: export-models runs-on: linux.2xlarge + if: always() # Continue this job regardless of previous job outcome steps: - name: Download the models from GitHub uses: actions/download-artifact@v3 diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml index 416d1ca805e..cb1b2b6a1b2 100644 --- a/.github/workflows/apple-perf.yml +++ b/.github/workflows/apple-perf.yml @@ -165,6 +165,8 @@ jobs: # Test llama2 if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then DELEGATE_CONFIG="xnnpack+custom+qe" + elif [[ ${{ matrix.delegate }} == "coreml" ]]; then + DELEGATE_CONFIG="coreml" fi PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ bash .ci/scripts/test_llama.sh "${{ matrix.model }}" "${BUILD_MODE}" "${DTYPE}" "${DELEGATE_CONFIG}" "${ARTIFACTS_DIR_NAME}" @@ -177,6 +179,7 @@ jobs: upload-models: needs: export-models runs-on: linux.2xlarge + if: always() # Continue this job regardless of previous job outcome steps: - name: Download the models from GitHub uses: actions/download-artifact@v3 diff --git a/.lintrunner.toml b/.lintrunner.toml index c28512c5986..eca965bb1e6 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -74,6 +74,8 @@ exclude_patterns = [ # NB: Objective-C is not supported 'examples/apple/**', 'examples/demo-apps/apple_ios/**', + # File contains @generated + 'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h', ] command = [ 'python', @@ -177,6 +179,8 @@ exclude_patterns = [ '**/*.bat', '**/*.jpg', '**/*.jar', + # File contains @generated + 'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h', ] command = [ 'python', diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2ad23f84d17..d434c1fe198 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -131,9 +131,7 @@ for detailed advice. #### C++ language version -**C++11.** - -NOTE: The code does not yet fully conform to this, and some files require C++17. +**C++17.** Rationale: This is a compromise between being compatible with older, proprietary toolchains, and having access to relatively modern C++ features. diff --git a/backends/apple/coreml/compiler/coreml_preprocess.py b/backends/apple/coreml/compiler/coreml_preprocess.py index 375fdf406b2..5084405c468 100644 --- a/backends/apple/coreml/compiler/coreml_preprocess.py +++ b/backends/apple/coreml/compiler/coreml_preprocess.py @@ -3,6 +3,7 @@ # CoreML backend for delegating a EdgeProgram to CoreML. 
import json +import logging import shutil import uuid @@ -14,6 +15,7 @@ from typing import Any, Dict, final, List, Optional, Tuple import coremltools as ct +import coremltools.optimize as cto import executorchcoreml from executorch.exir.backend.backend_details import ( @@ -23,12 +25,16 @@ ) from executorch.exir.backend.compile_spec_schema import CompileSpec +logger = logging.getLogger(__name__) +logger.setLevel(logging.WARNING) + class COMPILE_SPEC_KEYS(Enum): COMPUTE_UNITS = "compute_units" MODEL_TYPE = "model_type" MIN_DEPLOYMENT_TARGET = "min_deployment_target" MODEL_COMPUTE_PRECISION = "model_compute_precision" + OP_LINEAR_QUANTIZER_CONFIG = "op_linear_quantizer_config" class MODEL_PATHS(Enum): @@ -169,12 +175,44 @@ def generate_compute_unit_compile_spec( compute_unit.name.lower().encode("utf-8"), ) + @staticmethod + def generate_op_linear_quantizer_config_compile_spec( + op_linear_quantizer_config: Dict, + ) -> CompileSpec: + """ + Returns the compile spec representing the model post conversion quantization, + which is a dict that will construct cto.coreml.OpLinearQuantizerConfig + """ + str_representation = json.dumps(op_linear_quantizer_config) + byte_representation = str_representation.encode("utf-8") + return CompileSpec( + COMPILE_SPEC_KEYS.OP_LINEAR_QUANTIZER_CONFIG.value, + byte_representation, + ) + + @staticmethod + def op_linear_quantizer_config_from_compile_specs( + compile_specs: List[CompileSpec], + ) -> cto.coreml.OpLinearQuantizerConfig: + """ + Returns the model's post conversion quantization by parsing the list of compile specs. + """ + for compile_spec in compile_specs: + if compile_spec.key == COMPILE_SPEC_KEYS.OP_LINEAR_QUANTIZER_CONFIG.value: + config_dict_str = compile_spec.value.decode("utf-8") + config_dict = json.loads(config_dict_str) + config = cto.coreml.OpLinearQuantizerConfig._from_dict(config_dict) + return config + + return None + @staticmethod def generate_compile_specs( compute_unit: ct.ComputeUnit = ct.ComputeUnit.ALL, minimum_deployment_target: ct.target = ct.target.iOS15, compute_precision: ct.precision = ct.precision.FLOAT16, model_type: MODEL_TYPE = MODEL_TYPE.MODEL, + op_linear_quantizer_config: Optional[Dict] = None, ) -> List[CompileSpec]: """ Returns the list of compile specs that's used by CoreMLBackend to lower the module. 
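For reference, a hedged sketch of how a caller might exercise the new `op_linear_quantizer_config` path end to end. The dict keys mirror `cto.coreml.OpLinearQuantizerConfig` fields; the concrete values (iOS17 target, int4 per-block quantization) are illustrative assumptions, not part of this change:

```python
import coremltools as ct

from executorch.backends.apple.coreml.compiler import CoreMLBackend

# Illustrative config only; keys follow cto.coreml.OpLinearQuantizerConfig,
# values are assumptions for this sketch.
compile_specs = CoreMLBackend.generate_compile_specs(
    minimum_deployment_target=ct.target.iOS17,
    op_linear_quantizer_config={
        "mode": "linear_symmetric",
        "dtype": "int4",
        "granularity": "per_block",
        "block_size": 32,
    },
)
```

The spec is serialized as JSON and reconstructed inside `preprocess()` via `OpLinearQuantizerConfig._from_dict`, as the surrounding hunks show.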
@@ -192,6 +230,12 @@ def generate_compile_specs( CoreMLBackend.generate_compute_precision_compile_spec(compute_precision) ) compile_specs.append(CoreMLBackend.generate_model_type_compile_spec(model_type)) + if op_linear_quantizer_config is not None: + compile_specs.append( + CoreMLBackend.generate_op_linear_quantizer_config_compile_spec( + op_linear_quantizer_config + ) + ) return compile_specs @@ -368,18 +412,18 @@ def preprocess( compile_specs, ) ) - model_compute_precision: ct.precision = ( CoreMLBackend.model_compute_precision_from_compile_specs(compile_specs) ) - minimum_deployment_target: ct.target = ( CoreMLBackend.min_deployment_target_from_compile_specs(compile_specs) ) - compute_units: ct.ComputeUnit = CoreMLBackend.compute_unit_from_compile_specs( compile_specs ) + op_linear_quantizer_config = ( + CoreMLBackend.op_linear_quantizer_config_from_compile_specs(compile_specs) + ) mlmodel = ct.convert( model=edge_program, @@ -392,4 +436,15 @@ def preprocess( compute_units=compute_units, ) + if op_linear_quantizer_config is not None: + logger.warning( + "Core ML Backend op_linear_quantizer_config API is experimental" + ) + config = cto.coreml.OptimizationConfig( + global_config=op_linear_quantizer_config, + # skip embedding + op_type_configs={"gather": None}, + ) + mlmodel = cto.coreml.linear_quantize_weights(mlmodel, config=config) + return CoreMLBackend.preprocess_model(mlmodel, model_type=model_type) diff --git a/backends/apple/coreml/partition/coreml_partitioner.py b/backends/apple/coreml/partition/coreml_partitioner.py index ecf6d44b19c..c0b6663f729 100644 --- a/backends/apple/coreml/partition/coreml_partitioner.py +++ b/backends/apple/coreml/partition/coreml_partitioner.py @@ -17,7 +17,7 @@ Partitioner, PartitionResult, ) -from executorch.exir.backend.utils import tag_constant_data +from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer from torch.export.exported_program import ExportedProgram from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner from torch.fx.passes.operator_support import OperatorSupportBase @@ -61,6 +61,7 @@ def __init__( self, skip_ops_for_coreml_delegation: Optional[List[str]] = None, compile_specs: Optional[List[CompileSpec]] = None, + take_over_mutable_buffer: Optional[bool] = True, ) -> None: if skip_ops_for_coreml_delegation is None: skip_ops_for_coreml_delegation = [] @@ -69,6 +70,7 @@ def __init__( backend_id=CoreMLBackend.__name__, compile_specs=compile_specs if compile_specs is not None else [], ) + self.take_over_mutable_buffer = take_over_mutable_buffer def partition(self, exported_program: ExportedProgram) -> PartitionResult: # Run the CapabilityBasedPartitioner to return the largest possible @@ -89,6 +91,15 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: partition_tags[tag] = self.delegation_spec tag_constant_data(exported_program) + if self.take_over_mutable_buffer: + logger.info( + "Core ML partitioner will take over torch mutable buffer as Core ML state, " + "so if your model contains mutable buffer, " + "then you will need MacOS15+/iOS18+ to execute. 
" + "If you want your mutable buffer model to be compatible with older OS, " + "then please set `take_over_mutable_buffer=False`" + ) + tag_mutated_buffer(exported_program) return PartitionResult( tagged_exported_program=exported_program, partition_tags=partition_tags diff --git a/backends/apple/coreml/scripts/install_requirements.sh b/backends/apple/coreml/scripts/install_requirements.sh index 0018b5ffc2d..b6c9a073e08 100755 --- a/backends/apple/coreml/scripts/install_requirements.sh +++ b/backends/apple/coreml/scripts/install_requirements.sh @@ -24,7 +24,7 @@ rm -rf "$COREML_DIR_PATH/third-party" mkdir "$COREML_DIR_PATH/third-party" echo "${green}ExecuTorch: Cloning coremltools." -git clone --depth 1 --branch 8.0b1 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH +git clone --depth 1 --branch 8.0b2 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH cd $COREMLTOOLS_DIR_PATH STATUS=$? @@ -47,6 +47,11 @@ cmake --build "$COREMLTOOLS_DIR_PATH/build" --parallel echo "${green}ExecuTorch: Installing coremltools." pip install "$COREMLTOOLS_DIR_PATH" +# CoreMLTools have started supporting numpy 2.0, +# but ExecuTorch example model test env is still using older transformers, +# so for now we will need to downgrade numpy to 1.x +# TODO: Remove this numpy downgrade once later transformers starts to be used +pip install numpy==1.26.4 STATUS=$? if [ $STATUS -ne 0 ]; then echo "${red}ExecuTorch: Failed to install coremltools." diff --git a/backends/apple/coreml/test/test_coreml_partitioner.py b/backends/apple/coreml/test/test_coreml_partitioner.py index 34cf531b261..72a7fbf0932 100644 --- a/backends/apple/coreml/test/test_coreml_partitioner.py +++ b/backends/apple/coreml/test/test_coreml_partitioner.py @@ -4,11 +4,14 @@ import unittest +import coremltools as ct + import executorch.exir import torch import torchvision +from executorch.backends.apple.coreml.compiler import CoreMLBackend from executorch.backends.apple.coreml.partition import CoreMLPartitioner @@ -86,8 +89,54 @@ def test_vit_skip_conv(self): if node.op == "call_function" ] == total + def test_buffer(self): + embedding_dim = 3 + max_seq_len = 2 + + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.register_buffer( + "cache", + torch.zeros((max_seq_len, embedding_dim), dtype=torch.float32), + ) + + def forward(self, q, k_val, input_pos): + q_T = q.transpose(0, 1) + k = torch.ops.aten.index_put_(self.cache, [input_pos, None], k_val) + attn = k.mm(q_T) + return attn + + model = Model() + model.eval() + + q = torch.randn((1, embedding_dim)) + k_val = torch.randn((1, embedding_dim)) + input_pos = torch.tensor([0]) + example_inputs = (q, k_val, input_pos) + exir_program_aten = torch.export.export(model, example_inputs) + + compile_specs = CoreMLBackend.generate_compile_specs( + minimum_deployment_target=ct.target.iOS18 + ) + partitioner = CoreMLPartitioner(compile_specs=compile_specs) + edge_program_manager = executorch.exir.to_edge( + exir_program_aten, compile_config=self.edge_compile_config + ) + delegated_program_manager = edge_program_manager.to_backend(partitioner) + + assert [ + node.target.__name__ + for node in delegated_program_manager.exported_program().graph.nodes + if node.op == "call_function" + ] == [ + "executorch_call_delegate", + "getitem", + ] + if __name__ == "__main__": test_runner = TestCoreMLPartitioner() test_runner.test_add_sub_skip_mm() test_runner.test_vit_skip_conv() + test_runner.test_buffer() diff --git a/backends/arm/operators/op_mean_dim.py 
b/backends/arm/operators/op_mean_dim.py index 20e1b2b8d76..339aa62719f 100644 --- a/backends/arm/operators/op_mean_dim.py +++ b/backends/arm/operators/op_mean_dim.py @@ -11,7 +11,6 @@ register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from executorch.backends.arm.tosa_utils import build_avg_pool_2d_common @register_node_visitor @@ -30,29 +29,4 @@ def define_node( is_quant_node: bool, ) -> None: - input_tensor = inputs[0] - dim = node.args[1] - keep_dim = node.args[2] - - # mean.dim(-1, -2) is the same as avg_pool2d when just computing mean over HW dimensions. - # Since tosa doesn't have mean.dim operation, lowers it to average pooling instead. - if dim == [-1, -2]: - if keep_dim is True: - # Given the shape format of input is (N, C, H, W) - kernel_size = [input_tensor.shape[2], input_tensor.shape[3]] - stride = [1, 1] - padding = [0, 0, 0, 0] - - build_avg_pool_2d_common( - node, - tosa_graph, - input_tensor, - kernel_size, - stride, - padding, - is_quant_node, - output, - ) - return - raise AssertionError("unsupported") diff --git a/backends/arm/passes/arm_pass_manager.py b/backends/arm/passes/arm_pass_manager.py index 914bf57aabc..db8511df613 100644 --- a/backends/arm/passes/arm_pass_manager.py +++ b/backends/arm/passes/arm_pass_manager.py @@ -15,6 +15,9 @@ from executorch.backends.arm.passes.convert_split_to_slice import ( ConvertSplitToSlicePass, ) +from executorch.backends.arm.passes.meandim_to_averagepool_pass import ( + ConvertMeanDimToAveragePool, +) from executorch.backends.arm.passes.remove_clone_pass import RemoveClonePass from executorch.backends.arm.passes.size_adjust_conv2d_pass import SizeAdjustConv2DPass from executorch.exir.backend.compile_spec_schema import CompileSpec @@ -33,6 +36,7 @@ def transform_to_backend_pipeline( self.add_pass(SizeAdjustConv2DPass()) self.add_pass(RemoveClonePass()) self.add_pass(ConvertExpandCopyToRepeatPass()) + self.add_pass(ConvertMeanDimToAveragePool()) self.add_pass(ConvertSplitToSlicePass()) for spec in compile_spec: if spec.key == "permute_memory_format": diff --git a/backends/arm/passes/meandim_to_averagepool_pass.py b/backends/arm/passes/meandim_to_averagepool_pass.py new file mode 100644 index 00000000000..3f57e8023ca --- /dev/null +++ b/backends/arm/passes/meandim_to_averagepool_pass.py @@ -0,0 +1,52 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Any, cast, Dict, Tuple + +import torch.fx + +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, NodeMetadata, ProxyValue + +Argument = Any + + +class ConvertMeanDimToAveragePool(ExportPass): + """ + Replace a mean operation with dim = [-1, -2] and keep_dim = True with an average pool operation. 
+ """ + + def call_operator( + self, + op: torch.fx.node.Target, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + meta: NodeMetadata, + ) -> ProxyValue: + if op != exir_ops.edge.aten.mean.dim: + return super().call_operator(op, args, kwargs, meta) + + input_value = cast(ProxyValue, args[0]) + dim = cast(list, args[1]) + keep_dim = cast(bool, args[2]) if len(args) > 2 else False + + # averagepool2d gets converted to a mean operation with dim = [-1, -2] and keep_dim = True + # so check the dim argument for this case + if dim == [-1, -2] and keep_dim is True: + # Given the shape format of input is (N, C, H, W) + kernel_size = [ + input_value.to_tensor().size()[2], + input_value.to_tensor().size()[3], + ] + stride = [1, 1] + return super().call_operator( + exir_ops.edge.aten.avg_pool2d.default, + (input_value, kernel_size, stride), + {}, + meta, + ) + else: + return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/test/ops/test_mean_dim.py b/backends/arm/test/ops/test_mean_dim.py index e0db958f743..e48d749c194 100644 --- a/backends/arm/test/ops/test_mean_dim.py +++ b/backends/arm/test/ops/test_mean_dim.py @@ -106,7 +106,12 @@ def _test_meandim_tosa_u55_BI_pipeline( .check(["torch.ops.quantized_decomposed"]) .to_edge() .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) + .check_not( + [ + "executorch_exir_dialects_edge__ops_aten_mean_dim", + "executorch_exir_dialects_edge__ops_aten_avg_pool2d_default", + ] + ) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() ) diff --git a/backends/arm/test/passes/test_meandim_to_averagepool2d.py b/backends/arm/test/passes/test_meandim_to_averagepool2d.py new file mode 100644 index 00000000000..1cd63e6e52e --- /dev/null +++ b/backends/arm/test/passes/test_meandim_to_averagepool2d.py @@ -0,0 +1,75 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch +from executorch.backends.arm.passes.meandim_to_averagepool_pass import ( + ConvertMeanDimToAveragePool, +) + +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester + +from executorch.backends.xnnpack.test.tester.tester import RunPasses + + +class MeanDim(torch.nn.Module): + def forward(self, x): + return torch.mean(x, dim=[-1, -2], keepdim=True) + + def get_inputs(self): + return (torch.rand(1, 1280, 7, 7),) + + +class MeanDim2(torch.nn.Module): + def forward(self, x): + return torch.mean(x, dim=1) + + def get_inputs(self): + return (torch.rand(1, 1280, 7, 7),) + + +class TestMeandimToAveragePool2dPass(unittest.TestCase): + """ + Tests the MeanDimToAveragePool2dPass which converts mean.dim to average_pool2d + for the special case where dim is [-1, -2] and keepdim is True. 
+ """ + + def test_tosa_BI_meandim_to_averagepool(self): + module = MeanDim() + test_pass_stage = RunPasses([ConvertMeanDimToAveragePool]) + ( + ArmTester( + module, + example_inputs=module.get_inputs(), + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .to_edge() + .check(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) + .run_passes(test_pass_stage) + .check(["executorch_exir_dialects_edge__ops_aten_avg_pool2d_default"]) + ) + + def test_tosa_BI_meandim_no_modification(self): + module = MeanDim2() + test_pass_stage = RunPasses([ConvertMeanDimToAveragePool]) + ( + ArmTester( + module, + example_inputs=module.get_inputs(), + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .to_edge() + .check(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) + .run_passes(test_pass_stage) + .check(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) + .check_not(["executorch_exir_dialects_edge__ops_aten_avg_pool2d_default"]) + ) diff --git a/backends/cadence/aot/TARGETS b/backends/cadence/aot/TARGETS index d077169022a..08093efe317 100644 --- a/backends/cadence/aot/TARGETS +++ b/backends/cadence/aot/TARGETS @@ -60,6 +60,17 @@ python_library( ], ) +python_library( + name = "ops_registrations", + srcs = [ + "ops_registrations.py", + ], + deps = [ + "fbcode//caffe2:torch", + "fbcode//executorch/backends/cadence/aot:utils", + ], +) + export_file(name = "functions.yaml") executorch_generated_lib( diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py index a4d856ebed2..e73de6ab7ce 100644 --- a/backends/cadence/aot/ops_registrations.py +++ b/backends/cadence/aot/ops_registrations.py @@ -4,12 +4,13 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-strict + from math import prod from typing import Optional, Tuple import torch -from executorch.exir.scalar_type import ScalarType -from torch.library import impl, Library +from torch.library import Library, register_fake from .utils import get_conv1d_output_size, get_conv2d_output_size @@ -67,31 +68,31 @@ m = Library("cadence", "IMPL", "Meta") -@impl(m, "quantize_per_tensor") +@register_fake("cadence::quantize_per_tensor") def quantize_per_tensor_meta( input: torch.Tensor, scale: float, zero_point: int, quant_min: int, quant_max: int, - dtype: ScalarType, -): + dtype: torch.dtype, +) -> torch.Tensor: return input.new_empty(input.size(), dtype=dtype) -@impl(m, "dequantize_per_tensor") +@register_fake("cadence::dequantize_per_tensor") def dequantize_per_tensor_meta( input: torch.Tensor, scale: float, zero_point: int, quant_min: int, quant_max: int, - dtype: ScalarType, -): + dtype: torch.dtype, +) -> torch.Tensor: return input.new_empty(input.size(), dtype=torch.float) -@impl(m, "quantized_linear") +@register_fake("cadence::quantized_linear") def quantized_linear_meta( src: torch.Tensor, weight: torch.Tensor, @@ -102,7 +103,7 @@ def quantized_linear_meta( out_shift: torch.Tensor, out_zero_point: int, offset: Optional[torch.Tensor], -): +) -> torch.Tensor: # src comes in shape [leading_dims, in_dim] # weight comes in shape [out_dim, in_dim] # output comes in empty with shape [leading_dims, out_dim] @@ -113,7 +114,7 @@ def quantized_linear_meta( return src.new_empty(out_size, dtype=torch.uint8) -@impl(m, "quantized_conv") +@register_fake("cadence::quantized_conv") def quantized_conv_meta( input: torch.Tensor, weight: torch.Tensor, @@ -151,7 +152,7 @@ def quantized_conv_meta( return input.new_empty(output_size, dtype=input.dtype) -@impl(m, "quantized_layer_norm") +@register_fake("cadence::quantized_layer_norm") def quantized_layer_norm_meta( input: torch.Tensor, X_scale: torch.Tensor, @@ -162,22 +163,22 @@ def quantized_layer_norm_meta( eps: float, output_scale: float, output_zero_point: int, -): +) -> torch.Tensor: return input.new_empty(input.size(), dtype=torch.uint8) -@impl(m, "quantized_relu") +@register_fake("cadence::quantized_relu") def quantized_relu_meta( X: torch.Tensor, X_zero_point: torch.Tensor, out_zero_point: int, out_multiplier: torch.Tensor, out_shift: torch.Tensor, -): +) -> torch.Tensor: return X.new_empty(X.size(), dtype=torch.uint8) -@impl(m, "quantized_matmul") +@register_fake("cadence::quantized_matmul") def quantized_matmul_meta( X: torch.Tensor, X_zero_point: int, diff --git a/backends/example/test_example_delegate.py b/backends/example/test_example_delegate.py index 973b457bade..d830c1bb312 100644 --- a/backends/example/test_example_delegate.py +++ b/backends/example/test_example_delegate.py @@ -46,7 +46,7 @@ def get_example_inputs(): ) m = model.eval() - m = torch._export.capture_pre_autograd_graph(m, copy.deepcopy(example_inputs)) + m = torch.export.export_for_training(m, copy.deepcopy(example_inputs)).module() # print("original model:", m) quantizer = ExampleQuantizer() # quantizer = XNNPACKQuantizer() @@ -82,7 +82,7 @@ def test_delegate_mobilenet_v2(self): ) m = model.eval() - m = torch._export.capture_pre_autograd_graph(m, copy.deepcopy(example_inputs)) + m = torch.export.export_for_training(m, copy.deepcopy(example_inputs)).module() quantizer = ExampleQuantizer() m = prepare_pt2e(m, quantizer) diff --git a/backends/mediatek/CMakeLists.txt b/backends/mediatek/CMakeLists.txt index 4b233d94f04..744b1193d5a 100644 --- a/backends/mediatek/CMakeLists.txt +++ 
b/backends/mediatek/CMakeLists.txt @@ -25,9 +25,13 @@ include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/runtime/include) # targets add_library(neuron_backend SHARED) -target_link_libraries( - neuron_backend PRIVATE executorch_no_prim_ops portable_ops_lib android log - ${NEURON_BUFFER_ALLOCATOR_LIB} +target_link_libraries(neuron_backend + PRIVATE + executorch_no_prim_ops + portable_ops_lib + android + log + ${NEURON_BUFFER_ALLOCATOR_LIB} ) target_sources( neuron_backend diff --git a/backends/qualcomm/builders/__init__.py b/backends/qualcomm/builders/__init__.py index d3bf98bae72..79c02e22072 100644 --- a/backends/qualcomm/builders/__init__.py +++ b/backends/qualcomm/builders/__init__.py @@ -38,6 +38,7 @@ op_quantize, op_relu, op_reshape, + op_rms_norm, op_rsqrt, op_select_copy, op_sigmoid, @@ -92,6 +93,7 @@ op_quantize, op_relu, op_reshape, + op_rms_norm, op_rsqrt, op_select_copy, op_sigmoid, diff --git a/backends/qualcomm/builders/node_visitor.py b/backends/qualcomm/builders/node_visitor.py index e07a745df5f..514bc6efd78 100644 --- a/backends/qualcomm/builders/node_visitor.py +++ b/backends/qualcomm/builders/node_visitor.py @@ -202,7 +202,7 @@ def get_quant_tensor_value( dtype = quant_configs[QCOM_DTYPE] - tensor = tensor.div(scale + 1e-6).add(zero_point).round().to(dtype) + tensor = tensor.div(scale).add(zero_point).round().to(dtype) # Make the backends access data correctly if quant_configs.get(QCOM_BITWIDTH) == 4: mask = torch.full(tensor.size(), 0x0F, dtype=torch.int8) diff --git a/backends/qualcomm/builders/op_batch_norm.py b/backends/qualcomm/builders/op_batch_norm.py index 13b24c0d722..6b2e9ab91d8 100644 --- a/backends/qualcomm/builders/op_batch_norm.py +++ b/backends/qualcomm/builders/op_batch_norm.py @@ -8,6 +8,11 @@ import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper import torch +from executorch.backends.qualcomm.utils.constants import ( + QCOM_QUANT_ATTRS, + QCOM_QUANT_MAX, + QCOM_SCALE, +) from .node_visitor import NodeVisitor, register_node_visitor from .qnn_constants import OpBatchnorm, QNN_OP_PACKAGE_NAME_QTI_AISW @@ -21,6 +26,14 @@ class BatchNorm(NodeVisitor): def __init__(self, *args) -> None: super().__init__(*args) + def update_encoding(self, node: torch.fx.Node, tensor: torch.Tensor): + if isinstance(tensor, torch._subclasses.FakeTensor): + return + + if quant_attrs := node.meta.get(QCOM_QUANT_ATTRS): + diff = max(abs(tensor.max()), abs(tensor.min())) + quant_attrs[QCOM_SCALE] = diff / quant_attrs[QCOM_QUANT_MAX] + def define_node( self, node: torch.fx.Node, @@ -48,6 +61,7 @@ def define_node( amount = (filter_tensor * mean_tensor) / torch.sqrt(var_tensor + eps) bias_tensor = bias_tensor - amount + self.update_encoding(bias_node, bias_tensor) bias_tensor_wrapper = self.define_tensor( bias_node, bias_tensor, @@ -57,6 +71,7 @@ def define_node( ) filter_tensor = filter_tensor / torch.sqrt(var_tensor + eps) + self.update_encoding(filter_node, filter_tensor) filter_tensor_wrapper = self.define_tensor( filter_node, filter_tensor, diff --git a/backends/qualcomm/builders/op_conv2d.py b/backends/qualcomm/builders/op_conv2d.py index 909cc6a21f6..4b58edbac63 100644 --- a/backends/qualcomm/builders/op_conv2d.py +++ b/backends/qualcomm/builders/op_conv2d.py @@ -10,16 +10,7 @@ import numpy as np import torch -from executorch.backends.qualcomm.utils.constants import ( - QCOM_DATA, - QCOM_DTYPE, - QCOM_QUANT_ATTRS, - QCOM_QUANT_MAX, - QCOM_QUANT_MIN, - QCOM_SCALE, - QCOM_ZERO_POINT, -) -from executorch.exir.dialects._ops import ops as exir_ops +from 
executorch.backends.qualcomm.utils.constants import QCOM_DATA from .node_visitor import NodeVisitor, register_node_visitor from .qnn_constants import ( @@ -94,52 +85,6 @@ def _add_conv_op_parameter( return conv_op - def _get_bias_tensor( - self, - node: torch.fx.Node, - nodes_to_wrappers: Dict[str, PyQnnWrapper.TensorWrapper], - num_output_channel: int, - ) -> PyQnnWrapper.PyQnnOpWrapper: - # build dummy node if bias is not given - bias_node = ( - node.args[2] - if node.args[2] is not None - else torch.fx.Node( - node.graph, - node.name + "_runtime_bias", - "call_function", - exir_ops.edge.aten.full.default, - (), # args - {}, # kwargs - ) - ) - # zeros tensor to meet HTP constraint if bias is not given - bias_tensor = ( - get_parameter(bias_node, self.edge_program) - if node.args[2] is not None - else torch.zeros(num_output_channel) - ) - # insert quant attribute to meet HTP constraint if bias is not given - if ( - node.args[2] is None - and (bias_quant_attrs := node.meta.get(QCOM_QUANT_ATTRS)) is not None - ): - quant_attrs = bias_quant_attrs.copy() - quant_attrs[QCOM_ZERO_POINT] = 0 - quant_attrs[QCOM_SCALE] = 0 - quant_attrs[QCOM_DTYPE] = torch.int32 - quant_attrs[QCOM_QUANT_MAX] = torch.iinfo(torch.int32).max - quant_attrs[QCOM_QUANT_MIN] = torch.iinfo(torch.int32).min + 1 - bias_node.meta[QCOM_QUANT_ATTRS] = quant_attrs - - return self.define_tensor( - bias_node, - bias_tensor, - PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, - nodes_to_wrappers, - is_input_tensor=False, - ) - def _define_conv1d( self, node: torch.fx.Node, @@ -204,9 +149,17 @@ def _define_conv1d( is_input_tensor=False, ) conv_input_tensors = [unsqueeze_output_tensor_wrapper, filter_tensor_wrapper] - conv_input_tensors.append( - self._get_bias_tensor(node, nodes_to_wrappers, filter_tensor.shape[-1]) - ) + if node.args[2] is not None: + bias_node = node.args[2] + bias_tensor = get_parameter(bias_node, self.edge_program) + bias_tensor_wrapper = self.define_tensor( + bias_node, + bias_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, + nodes_to_wrappers, + is_input_tensor=False, + ) + conv_input_tensors.append(bias_tensor_wrapper) stride = [1] + cast(List[int], node.args[3]) padding = [0] + cast(List[int], node.args[4]) @@ -312,9 +265,18 @@ def define_node( is_input_tensor=False, ) conv_input_tensors = [input_tensor_wrapper, filter_tensor_wrapper] - conv_input_tensors.append( - self._get_bias_tensor(node, nodes_to_wrappers, filter_tensor.shape[-1]) - ) + + if node.args[2] is not None: + bias_node = node.args[2] + bias_tensor = get_parameter(bias_node, self.edge_program) + bias_tensor_wrapper = self.define_tensor( + bias_node, + bias_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, + nodes_to_wrappers, + is_input_tensor=False, + ) + conv_input_tensors.append(bias_tensor_wrapper) output_tensor = self.get_tensor(node, node) output_tensor_wrapper = self.define_tensor( diff --git a/backends/qualcomm/builders/op_rms_norm.py b/backends/qualcomm/builders/op_rms_norm.py new file mode 100644 index 00000000000..e99b1f47ba1 --- /dev/null +++ b/backends/qualcomm/builders/op_rms_norm.py @@ -0,0 +1,127 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+
+from typing import Dict
+
+import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
+import numpy as np
+
+import torch
+from executorch.backends.qualcomm.builders.utils import get_parameter
+from executorch.backends.qualcomm.utils.constants import QCOM_DATA, QCOM_QUANT_ATTRS
+from executorch.exir.dialects._ops import ops as exir_ops
+
+from .node_visitor import NodeVisitor, register_node_visitor
+from .qnn_constants import OpRmsNorm, QNN_OP_PACKAGE_NAME_QTI_AISW
+
+
+@register_node_visitor
+class RmsNormVisitor(NodeVisitor):
+    target = ["aten.rms_norm.default"]
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper],
+    ) -> PyQnnWrapper.PyQnnOpWrapper:
+        # args of node : ['input', 'normalized_shape', 'weight', 'eps']
+        input_node = node.args[0]
+        input_tensor = self.get_tensor(input_node, node)
+        input_tensor_wrapper = self.define_tensor(
+            input_node,
+            input_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+            is_input_tensor=True,
+        )
+
+        # should be an immutable list
+        normalized_shapes = node.args[1]
+        if (
+            len(normalized_shapes) != 1
+            or normalized_shapes[0] != input_tensor.shape[-1]
+        ):
+            print("Only supports normalization over the last input dimension")
+            return
+        axes = [node.args[0].meta["val"].dim() - 1]
+        axes_shape = [len(axes)]
+
+        weight_node = node.args[2]
+        weight_tensor = get_parameter(weight_node, self.edge_program)
+        weight_tensor_wrapper = self.define_tensor(
+            weight_node,
+            weight_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC,
+            nodes_to_wrappers,
+            is_input_tensor=False,
+        )
+
+        # Fake node: the nn module seems to be inconsistent with the documentation
+        bias_tensor = torch.zeros(weight_tensor.shape)
+        bias_node = torch.fx.Node(
+            node.graph,
+            node.name + "_runtime_bias",
+            "call_function",
+            exir_ops.edge.aten.tensor.default,
+            (),  # args
+            {},  # kwargs
+        )
+        if quant_attrs := node.meta.get(QCOM_QUANT_ATTRS):
+            bias_node.meta[QCOM_QUANT_ATTRS] = quant_attrs
+        bias_tensor_wrapper = self.define_tensor(
+            bias_node,
+            bias_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC,
+            nodes_to_wrappers,
+            is_input_tensor=False,
+        )
+
+        epsilon = node.args[3]
+        if isinstance(epsilon, torch.fx.Node):
+            epsilon = get_parameter(epsilon, self.edge_program)
+            epsilon = (
+                epsilon
+                if isinstance(epsilon, float)
+                else torch.finfo(epsilon.dtype).eps
+            )
+
+        output_tensor = self.get_tensor(node, node)
+        output_tensor_wrapper = self.define_tensor(
+            node,
+            output_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+            is_input_tensor=False,
+        )
+
+        rms_norm_op = PyQnnWrapper.PyQnnOpWrapper(
+            node.name,
+            QNN_OP_PACKAGE_NAME_QTI_AISW,
+            OpRmsNorm.op_name,
+        )
+
+        rms_norm_op.AddInputTensors(
+            [input_tensor_wrapper, weight_tensor_wrapper, bias_tensor_wrapper]
+        )
+        rms_norm_op.AddOutputTensors([output_tensor_wrapper])
+        rms_norm_op.AddScalarParam(
+            OpRmsNorm.param_epsilon,
+            PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_FLOAT_32,
+            {QCOM_DATA: np.float32(epsilon)},
+        )
+        rms_norm_op.AddTensorParam(
+            OpRmsNorm.param_axes,
+            PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32,
+            len(axes_shape),
+            axes_shape,
+            np.array(axes, dtype=np.uint32),
+            True,
+        )
+
+        return rms_norm_op
diff --git a/backends/qualcomm/builders/op_softmax.py b/backends/qualcomm/builders/op_softmax.py
index ae4c89bbb96..cda40aed458 100644
--- a/backends/qualcomm/builders/op_softmax.py
+++ b/backends/qualcomm/builders/op_softmax.py @@ -17,7 +17,7 @@ @register_node_visitor class Softmax(NodeVisitor): - target = ["aten._softmax.default"] + target = ["aten._softmax.default", "aten._safe_softmax.default"] def __init__(self, *args) -> None: super().__init__(*args) diff --git a/backends/qualcomm/builders/qnn_constants.py b/backends/qualcomm/builders/qnn_constants.py index 4a87e5dbbb3..8ac702f2ad5 100644 --- a/backends/qualcomm/builders/qnn_constants.py +++ b/backends/qualcomm/builders/qnn_constants.py @@ -278,6 +278,13 @@ class OpResizeNearestNeighbor: param_half_pixel_centers: str = "half_pixel_centers" +@dataclass(init=False, frozen=True) +class OpRmsNorm: + op_name: str = "RmsNorm" + param_epsilon: str = "epsilon" + param_axes: str = "axes" + + @dataclass(init=False, frozen=True) class OpScatterNd: op_name: str = "ScatterNd" diff --git a/backends/qualcomm/passes/annotate_and_quant_scalar.py b/backends/qualcomm/passes/annotate_and_quant_scalar.py index 5f111ee9c8b..1db50694ece 100644 --- a/backends/qualcomm/passes/annotate_and_quant_scalar.py +++ b/backends/qualcomm/passes/annotate_and_quant_scalar.py @@ -14,7 +14,7 @@ from executorch.exir.passes import dead_code_elimination_pass from torch.fx.passes.utils.source_matcher_utils import get_source_partitions -from .utils import get_quant_attrs +from .utils import dq_ops, get_quant_attrs class AnnotateAndQuantScalar(ExportPass): @@ -78,6 +78,7 @@ def _annotate_scalar_node( float, torch.float32, torch.int32, + torch.int64, ]: return @@ -88,30 +89,43 @@ def _traverse_binary_node(self, graph_module: torch.fx.GraphModule): graph_module.graph, self.binary_op_sources ) src_partitions = list(itertools.chain(*src_partitions.values())) + processed = set() for src_partition in src_partitions: - output = src_partition.output_nodes[0] - if ( - output.meta.get(QCOM_QUANT_ATTRS) - and len(src_partition.input_nodes) == 1 - ): - dq_node = src_partition.input_nodes[0] - q_node = dq_node.args[0] - q_node_attrs = get_quant_attrs(graph_module, q_node) - - scalar_nodes = [n for n in output.args if n != dq_node] - if len(scalar_nodes) == 0: + # need post process here to identify partitioned nodes: + src_fn_dict = {} + for n in src_partition.nodes: + # e.g. 
+ # meta["source_fn_stack"]: [('mul', )] + # we'll use as grouping key + node_list = src_fn_dict.setdefault(n.meta["source_fn_stack"][-1][1], []) + node_list.append(n) + + for nodes in src_fn_dict.values(): + output = [n for n in nodes if n in src_partition.output_nodes][0] + # if all args have been annotated, it shouldn't be a scalar operation + if all(arg.target in dq_ops for arg in output.args): continue - scalar_node = scalar_nodes[0] - source_scalar_node = self._get_source_scalar_node(scalar_node) - # we'll abandon cast op here, since the constant scalar will - # be pre-loaded into QNN context binary - output.replace_input_with(scalar_node, source_scalar_node) + if output not in processed and QCOM_QUANT_ATTRS in output.meta: + dq_node = [n for n in output.args if n.target in dq_ops][0] + q_node = dq_node.args[0] + q_node_attrs = get_quant_attrs(graph_module, q_node) + + scalar_nodes = [n for n in output.args if n != dq_node] + if len(scalar_nodes) == 0: + continue + + scalar_node = scalar_nodes[0] + source_scalar_node = self._get_source_scalar_node(scalar_node) + # we'll abandon cast op here, since the constant scalar will + # be pre-loaded into QNN context binary + output.replace_input_with(scalar_node, source_scalar_node) - scalar_quant_attrs = self._update_scalar_node_attrs( - source_scalar_node, q_node_attrs - ) - self._annotate_scalar_node(source_scalar_node, scalar_quant_attrs) + scalar_quant_attrs = self._update_scalar_node_attrs( + source_scalar_node, q_node_attrs + ) + self._annotate_scalar_node(source_scalar_node, scalar_quant_attrs) + processed.add(output) def call(self, graph_module: torch.fx.GraphModule): self._traverse_binary_node(graph_module) diff --git a/backends/qualcomm/passes/i64_to_i32.py b/backends/qualcomm/passes/i64_to_i32.py index 7814a3ff0d6..1d2171cc37a 100644 --- a/backends/qualcomm/passes/i64_to_i32.py +++ b/backends/qualcomm/passes/i64_to_i32.py @@ -5,7 +5,9 @@ # LICENSE file in the root directory of this source tree. 
import torch from executorch.backends.qualcomm.builders.utils import get_parameter, is_constant +from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult +from torch._subclasses.fake_tensor import FakeTensor class I64toI32(ExportPass): @@ -16,6 +18,8 @@ class I64toI32(ExportPass): def __init__(self, edge_program: torch.export.ExportedProgram): super(I64toI32, self).__init__() self.edge_program = edge_program + # pyre-ignore[4] + self.copy_op = exir_ops.edge.aten._to_copy.default def _update_meta(self, node: torch.fx.node) -> None: meta_val = node.meta["val"] @@ -32,6 +36,10 @@ def _update_meta(self, node: torch.fx.node) -> None: if meta_val.dtype == torch.int64: node.meta["val"] = meta_val.to(torch.float) + # pyre-ignore[2] + def _is_tensor_of_dtype(self, node_val, dtype: torch.dtype) -> bool: + return isinstance(node_val, FakeTensor) and node_val.dtype == dtype + def _cast_to_int32(self, graph_module: torch.fx.GraphModule): for n in graph_module.graph.nodes: if is_constant(n, self.edge_program): @@ -39,6 +47,22 @@ def _cast_to_int32(self, graph_module: torch.fx.GraphModule): if param.dtype == torch.int64: # QNN does not support int64 self._update_meta(n) + elif n.op == "placeholder": + node_val = n.meta["val"] + if self._is_tensor_of_dtype(node_val, torch.int64): + with graph_module.graph.inserting_after(n): + args = (n,) + to_dst_node = graph_module.graph.create_node( + "call_function", + self.copy_op, + args, + {"dtype": torch.int32}, + ) + to_dst_node.meta["val"] = node_val.to(torch.int32) + + # Replace usage of the src dtype result with the dst dtype result. + n.replace_all_uses_with(to_dst_node) + to_dst_node.args = (n,) def call(self, graph_module: torch.fx.GraphModule): self._cast_to_int32(graph_module) diff --git a/backends/qualcomm/passes/recompose_pixel_shuffle.py b/backends/qualcomm/passes/recompose_pixel_shuffle.py deleted file mode 100644 index 9eec6bfa264..00000000000 --- a/backends/qualcomm/passes/recompose_pixel_shuffle.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) Qualcomm Innovation Center, Inc. -# All rights reserved -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. -import torch -from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import ExportPass, PassResult -from torch.fx.passes.utils.source_matcher_utils import get_source_partitions - - -class RecomposePixelShuffle(ExportPass): - """ - Merge decomposed operators back to one super node. 
- """ - - def __init__(self): - super().__init__() - - def call(self, graph_module: torch.fx.GraphModule): - graph = graph_module.graph - # decomposed core aten ops - partitions = get_source_partitions(graph, [torch.nn.PixelShuffle]) - for _, src_partitions in partitions.items(): - for src_partition in src_partitions: - input_node = src_partition.input_nodes[0] - output_node = src_partition.output_nodes[0] - with graph.inserting_after(input_node): - h_in_shape = input_node.meta["val"].shape[2] - h_out_shape = output_node.meta["val"].shape[2] - upscale_factor = h_out_shape / h_in_shape - - pixel_shuffle_node = graph.create_node( - "call_function", - exir_ops.edge.aten.pixel_shuffle.default, - (input_node, int(upscale_factor)), - ) - users = output_node.users.copy() - for user in users: - user.replace_input_with(output_node, pixel_shuffle_node) - # copy metadata - pixel_shuffle_node.meta = output_node.meta - - graph.eliminate_dead_code() - graph_module.recompile() - return PassResult(graph_module, True) diff --git a/backends/qualcomm/passes/recompose_pixel_unshuffle.py b/backends/qualcomm/passes/recompose_pixel_unshuffle.py index a47f3d119a5..00d46639089 100644 --- a/backends/qualcomm/passes/recompose_pixel_unshuffle.py +++ b/backends/qualcomm/passes/recompose_pixel_unshuffle.py @@ -6,7 +6,6 @@ import torch from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult -from torch.fx.passes.utils.source_matcher_utils import get_source_partitions class RecomposePixelUnshuffle(ExportPass): @@ -85,30 +84,6 @@ def call(self, graph_module: torch.fx.GraphModule): # copy metadata pixel_unshuffle_node.meta = node.meta - # decomposed core aten ops - if not self.quantization_capture: - partitions = get_source_partitions(graph, [torch.nn.PixelUnshuffle]) - for _, src_partitions in partitions.items(): - for src_partition in src_partitions: - input_node = src_partition.input_nodes[0] - output_node = src_partition.output_nodes[0] - with graph.inserting_after(input_node): - h_in_shape = input_node.meta["val"].shape[2] - h_out_shape = output_node.meta["val"].shape[2] - downscale_factor = h_in_shape / h_out_shape - - op = self.op - pixel_unshuffle_node = graph.create_node( - "call_function", - op, - (input_node, int(downscale_factor)), - ) - users = output_node.users.copy() - for user in users: - user.replace_input_with(output_node, pixel_unshuffle_node) - # copy metadata - pixel_unshuffle_node.meta = output_node.meta - graph.eliminate_dead_code() graph_module.recompile() return PassResult(graph_module, True) diff --git a/backends/qualcomm/passes/recompose_rms_norm.py b/backends/qualcomm/passes/recompose_rms_norm.py new file mode 100644 index 00000000000..b26de8bd794 --- /dev/null +++ b/backends/qualcomm/passes/recompose_rms_norm.py @@ -0,0 +1,76 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import torch +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult +from torch.fx.passes.utils.source_matcher_utils import get_source_partitions + +from .utils import dq_ops + + +class RecomposeRmsNorm(ExportPass): + """ + Merge decomposed operators back to one super node. 
+ """ + + def __init__(self): + super().__init__() + + def _get_eps_node(self, nodes): + # eps: one of inputs of add node + add_node = [n for n in nodes if hasattr(n, "name") and "add" in n.name][0] + for a in add_node.args: + if isinstance(a, float) or a.op != "call_function": + return a + + def _get_gamma_node(self, output_node): + # gamma: one of inputs of output node + for a in output_node.args: + if a.op != "call_function" or a.target in dq_ops: + return a + + def call(self, graph_module: torch.fx.GraphModule): + graph = graph_module.graph + partitions = get_source_partitions(graph, [torch.nn.RMSNorm]) + for _, src_partitions in partitions.items(): + for src_partition in src_partitions: + input_len = len(src_partition.input_nodes) + if input_len == 1: + input_node = src_partition.input_nodes[0] + elif input_len == 2: + inp_0, inp_1 = src_partition.input_nodes + input_node = inp_0 if len(inp_0.users) == 2 else inp_1 + else: + raise RuntimeError( + f"Found a edge case of rms_node partitoin {src_partition}, which has {input_len} inputs" + ) + + output_node = src_partition.output_nodes[0] + eps_node = self._get_eps_node(src_partition.nodes) + gamma_node = self._get_gamma_node(output_node) + + with graph.inserting_before(output_node): + # args schema + # (Tensor input, int[] normalized_shape, Tensor? weight=None, float? eps=None) -> Tensor + rms_node = graph.create_node( + "call_function", + exir_ops.edge.aten.rms_norm.default, + ( + input_node, + list(gamma_node.meta["val"].shape), + gamma_node, + eps_node, + ), + ) + users = output_node.users.copy() + for user in users: + user.replace_input_with(output_node, rms_node) + # copy metadata + rms_node.meta = output_node.meta + + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/qualcomm/passes/replace_index_put_input.py b/backends/qualcomm/passes/replace_index_put_input.py new file mode 100644 index 00000000000..1eb210cf67e --- /dev/null +++ b/backends/qualcomm/passes/replace_index_put_input.py @@ -0,0 +1,54 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+import torch +from executorch.backends.qualcomm.utils.constants import QCOM_ENCODING, QCOM_QUANT_ATTRS +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult + + +class ReplaceIndexPutInput(ExportPass): + """ + Index put input workaround for quantized module + """ + + dq_q_map = { + # per tensor + exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, + exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor: exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor, + # per channel + exir_ops.edge.quantized_decomposed.dequantize_per_channel.default: exir_ops.edge.quantized_decomposed.quantize_per_channel.default, + } + + def __init__(self, edge_program: torch.export.ExportedProgram): + super(ReplaceIndexPutInput, self).__init__() + self.edge_program = edge_program + + def call(self, graph_module: torch.fx.GraphModule): + graph = graph_module.graph + for node in graph.nodes: + if node.target == exir_ops.edge.aten.index_put.default: + if ( + copy_node := list(node.users)[0] + ) and copy_node.target == exir_ops.edge.aten.copy.default: + m_buffer_node = copy_node.args[0] + bad_frozen_node = node.args[0] + if QCOM_QUANT_ATTRS in bad_frozen_node.meta: + m_buffer_node.meta[QCOM_QUANT_ATTRS] = bad_frozen_node.meta[ + QCOM_QUANT_ATTRS + ] + m_buffer_node.meta[QCOM_QUANT_ATTRS][QCOM_ENCODING] = ( + self.dq_q_map[ + m_buffer_node.meta[QCOM_QUANT_ATTRS][QCOM_ENCODING] + ] + ) + with graph.inserting_after(bad_frozen_node): + node.replace_input_with(bad_frozen_node, m_buffer_node) + else: + continue + + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/qualcomm/quantizer/custom_annotation.py b/backends/qualcomm/quantizer/custom_annotation.py index b2c86e50d33..9cde50b9c70 100644 --- a/backends/qualcomm/quantizer/custom_annotation.py +++ b/backends/qualcomm/quantizer/custom_annotation.py @@ -91,15 +91,17 @@ def is_edge_condition(node: Node): def annotate_matmul_input1(node: Node, quantization_config: QuantizationConfig): if is_edge_condition(node): return - if node.target == torch.ops.aten.index_put_.default: + if node.target in [ + torch.ops.aten.index_put.default, + torch.ops.aten.index_put_.default, + ]: annotate_index_put(node, quantization_config) annotate_matmul_input1(node.args[0], quantization_config) elif node.target == torch.ops.aten.cat.default: annotate_cat(node, quantization_config) # Expect that the inputs of the cat op are select ops - for arg in node.args[0][1:]: - annotate_single_in_single_out(arg, quantization_config) - annotate_matmul_input1(node.args[0][0], quantization_config) + for arg in node.args[0]: + annotate_matmul_input1(arg, quantization_config) else: annotate_single_in_single_out(node, quantization_config) annotate_matmul_input1(node.args[0], quantization_config) diff --git a/backends/qualcomm/quantizer/utils.py b/backends/qualcomm/quantizer/utils.py index d31b4753a3d..d3ae1194acd 100644 --- a/backends/qualcomm/quantizer/utils.py +++ b/backends/qualcomm/quantizer/utils.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
import numbers +import operator from dataclasses import dataclass from functools import partial from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple @@ -77,7 +78,7 @@ def _derive_bias_qparams_fn( def get_default_8bit_qnn_ptq_config( - act_symmetric: bool = False, act_observer=MinMaxObserver + act_symmetric: bool = False, act_observer=MovingAverageMinMaxObserver ) -> QuantizationConfig: extra_args: Dict[str, Any] = {"eps": 2**-12} @@ -96,7 +97,7 @@ def get_default_8bit_qnn_ptq_config( quant_max=torch.iinfo(torch.int8).max, qscheme=torch.per_tensor_symmetric, ch_axis=0, - observer_or_fake_quant_ctr=act_observer.with_args(**extra_args), + observer_or_fake_quant_ctr=MinMaxObserver.with_args(**extra_args), ) bias_quantization_spec = QuantizationSpec( @@ -104,7 +105,7 @@ def get_default_8bit_qnn_ptq_config( quant_min=torch.iinfo(torch.int32).min, quant_max=torch.iinfo(torch.int32).max, qscheme=torch.per_tensor_symmetric, - observer_or_fake_quant_ctr=act_observer.with_args(**extra_args), + observer_or_fake_quant_ctr=MinMaxObserver.with_args(**extra_args), ) quantization_config = QuantizationConfig( @@ -619,7 +620,13 @@ def annotate_upsample_nearest2d( annotate_single_in_single_out(node, quantization_config) -@register_annotator([torch.ops.aten.softmax.int, torch.ops.aten._softmax.default]) +@register_annotator( + [ + torch.ops.aten.softmax.int, + torch.ops.aten._softmax.default, + torch.ops.aten._safe_softmax.default, + ] +) def annotate_softmax(node: Node, quantization_config: QuantizationConfig) -> None: annotate_single_in_single_out(node, quantization_config) @@ -684,6 +691,31 @@ def annotate_squeeze(node: Node, quantization_config: QuantizationConfig) -> Non annotate_single_in_single_out(node, quantization_config) +@register_annotator([torch.ops.aten.rms_norm.default]) +def annotate_rms_norm(node: Node, quantization_config: QuantizationConfig) -> None: + act_node = node.args[0] + weight_node = node.args[2] + + if _is_annotated([node]): + return + + # TODO current only support 16a16w + _annotate_input_qspec_map( + node, + act_node, + quantization_config.input_activation, + ) + + _annotate_input_qspec_map( + node, + weight_node, + quantization_config.input_activation, + ) + nodes_to_mark_annotated = [node] + _annotate_output_qspec(node, quantization_config.output_activation) + _mark_nodes_as_annotated(nodes_to_mark_annotated) + + @register_annotator([torch.ops.aten.rsqrt.default]) def annotate_rsqrt(node: Node, quantization_config: QuantizationConfig) -> None: annotate_single_in_single_out(node, quantization_config) @@ -975,6 +1007,38 @@ def annotate_linear(node: Node, quantization_config: QuantizationConfig) -> None node.meta["source_fn_stack"] = [(node, torch.nn.Linear)] +@register_annotator([torch.ops.aten._native_batch_norm_legit_no_training.default]) +def annotate_batch_norm(node: Node, quantization_config: QuantizationConfig) -> None: + act, weight, bias = node.args[0:3] + if _is_annotated([node]): + return + + _annotate_input_qspec_map( + node, + act, + quantization_config.input_activation, + ) + # QNN requires uint8 instead of int8 in 'weight' config + _annotate_input_qspec_map( + node, + weight, + quantization_config.input_activation, + ) + _annotate_input_qspec_map( + node, + bias, + quantization_config.bias, + ) + _annotate_output_qspec(node, quantization_config.output_activation) + _mark_nodes_as_annotated([node, *node.args[0:3]]) + + +@register_annotator([operator.getitem]) +def annotate_getitem(node: Node, quantization_config: QuantizationConfig) -> None: + 
_annotate_output_qspec(node, quantization_config.output_activation) + _mark_nodes_as_annotated([node]) + + @register_annotator([torch.ops.aten.layer_norm.default]) def annotate_layer_norm(node: Node, quantization_config: QuantizationConfig) -> None: act_node = node.args[0] diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py index 319cc6092cd..e448a219284 100644 --- a/backends/qualcomm/tests/models.py +++ b/backends/qualcomm/tests/models.py @@ -55,6 +55,16 @@ def forward(self, x): return self.avgPool(x) +class BatchNorm(torch.nn.Module): + def __init__(self, n_features): + super().__init__() + self.native_batchnorm = torch.nn.BatchNorm2d(n_features) + self.eval() + + def forward(self, x): + return self.native_batchnorm(x) + + class Bmm(torch.nn.Module): def __init__(self): super().__init__() @@ -734,6 +744,16 @@ def forward(self, x): ) +class RmsNorm(torch.nn.Module): + def __init__(self): + super().__init__() + self.eps = 1e-5 + self.rms = torch.nn.RMSNorm([4], 1e-5) + + def forward(self, x): + return self.rms(x) + + class Rsqrt(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index cba23f935c2..d17fce2b839 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -16,6 +16,7 @@ from executorch.backends.qualcomm.tests.utils import ( generate_context_binary, QnnPartitioner, + QnnQuantizer, QuantDtype, TestQNN, to_backend, @@ -33,6 +34,7 @@ from_context_binary, generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, + skip_annotation, ) from executorch.examples.qualcomm.utils import setup_common_args_and_variables @@ -50,8 +52,8 @@ from executorch.examples.models.mobilenet_v3 import MV3Model from executorch.examples.models.torchvision_vit.model import TorchVisionViTModel from executorch.examples.models.wav2letter import Wav2LetterModel +from executorch.exir import to_edge from executorch.exir.backend.backend_api import disable_validation -from executorch.exir.program._program import EdgeCompileConfig, ExirExportedProgram class TestQNNFloatingPointOperator(TestQNN): @@ -81,6 +83,11 @@ def test_qnn_backend_avg_pool2d(self): sample_input = (torch.randn(1, 3, 2, 2),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_batch_norm(self): + module = BatchNorm(32) # noqa: F405 + sample_input = (torch.randn([4, 32, 16, 16]),) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_bmm(self): module = Bmm() # noqa: F405 torch.manual_seed(8) @@ -291,7 +298,6 @@ def test_qnn_backend_layer_norm(self): sample_input = (torch.randn(196, 768),) self.lower_module_and_test_output(module, sample_input) - @unittest.skip("only works on QNN 2.17") def test_qnn_backend_leaky_relu(self): test_comb = [ { @@ -334,7 +340,7 @@ def test_qnn_backend_mean_dim(self): with self.subTest(i=i): self.lower_module_and_test_output(module, sample_input) - @unittest.skip("it will hang in runtime") + @unittest.skip("failed to lower in QNN 2.25") def test_qnn_backend_mha(self): module = MultiheadAttention() # noqa: F405 sample_input = (torch.randn(1, 197, 96),) @@ -362,7 +368,6 @@ def test_qnn_backend_pow_tensor_scalar(self): sample_input = (torch.rand([2, 4, 3, 3]),) self.lower_module_and_test_output(module, sample_input) - @unittest.skip("only works on QNN 2.17") def test_qnn_backend_prelu(self): test_comb = [ { @@ -393,6 +398,11 @@ def test_qnn_backend_reshape(self): 
sample_input = (torch.randn([3, 4]),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_rms_norm(self): + module = RmsNorm() # noqa: F405 + sample_input = (torch.abs(torch.randn([1, 1, 1, 4])),) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_rsqrt(self): module = Rsqrt() # noqa: F405 sample_input = (torch.abs(torch.randn([3, 4])),) @@ -655,6 +665,12 @@ def test_qnn_backend_avg_pool2d(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_batch_norm(self): + module = BatchNorm(32) # noqa: F405 + sample_input = (torch.randn([4, 32, 16, 16]),) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_bmm(self): module = Bmm() # noqa: F405 torch.manual_seed(8) @@ -662,13 +678,6 @@ def test_qnn_backend_bmm(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) - @unittest.skip("not applicable") - def test_qnn_backend_cast(self): - module = Cast() # noqa: F405 - sample_input = (10 * torch.rand((9, 4, 5, 3)),) - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_cat(self): modules = [Cat2(), Cat3(), Cat4()] # noqa: F405 sample_input = (torch.randn(1, 1, 2, 2), torch.randn(1, 1, 4, 2)) @@ -1000,6 +1009,14 @@ def test_qnn_backend_reshape(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_rms_norm(self): + module = RmsNorm() # noqa: F405 + sample_input = (torch.abs(torch.randn([1, 1, 1, 4])),) + module = self.get_qdq_module( + module, sample_input, quant_dtype=QuantDtype.use_16a4w + ) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_rsqrt(self): module = Rsqrt() # noqa: F405 sample_input = (torch.abs(torch.randn([3, 4])),) @@ -1329,16 +1346,10 @@ def test_qnn_backend_multi_contexts_composite(self): lowered_method=to_backend, ) sample_input = module.get_random_input() - edge_prog = ExirExportedProgram( + edge_prog = to_edge( torch.export.export(module, sample_input), - after_to_edge_passes=False, - ).to_edge( - EdgeCompileConfig( - _check_ir_validity=False, - _skip_dim_order=True, # TODO(T182928844): Delegate dim order op to backend. 
- ) ) - canonicalize_program(edge_prog.exported_program) + canonicalize_program(edge_prog.exported_program()) exec_prog = edge_prog.to_executorch() self.verify_output(module.get_reference_module(), sample_input, exec_prog) @@ -1388,6 +1399,7 @@ def test_qnn_backend_online_prepare(self): sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) self.lower_module_and_test_output(module, sample_input) + @unittest.skip("segfault happens in recent torch.export.export") def test_qnn_backend_context_direct(self): with tempfile.TemporaryDirectory() as tmp_dir: module = ContextBinaryExample() # noqa: F405 @@ -1431,7 +1443,7 @@ def setUp(self): saver=False, ) - def test_qnn_backend_skip_node_id(self): + def test_qnn_backend_skip_node_id_partitioner(self): module = SimpleModel() # noqa: F405 sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) module = self.get_qdq_module(module, sample_input) @@ -1442,7 +1454,43 @@ def test_qnn_backend_skip_node_id(self): skip_node_id_set={"aten_add_tensor", "aten_mean_dim"}, ) - def test_qnn_backend_skip_node_op(self): + def test_qnn_backend_skip_node_id_quantizer(self): + module = SimpleModel() # noqa: F405 + sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) + + # define partitioner + backend_options = generate_htp_compiler_spec( + use_fp16=False, + ) + compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + ) + partitioner = QnnPartitioner(compiler_specs) + # define quantizer + quantizer = QnnQuantizer() + + # define calibration method + def calibrator(gm): + gm(*sample_input) + + # get partially lowered graph module + graph_module, exported_progs = skip_annotation( + nn_module=module, + quantizer=quantizer, + partitioner=partitioner, + sample_input=sample_input, + calibration_cb=calibrator, + fp_node_id_set={"conv2d"}, + ) + self.assertEqual(len(exported_progs), 1) + # lower the whole graph again; the skipped operators stay on CPU + exec_prog = to_edge( + torch.export.export(graph_module, sample_input), + ).to_executorch() + self.verify_output(module, sample_input, exec_prog) +
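+ # (The id-based variant above expects a single exported program, presumably
+ # because skipping the lone conv2d leaves one contiguous annotated region;
+ # the op-target variant below skips every aten.add.Tensor and splits the
+ # annotated graph into two partitions, hence its expected count of 2.)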
+ def test_qnn_backend_skip_node_op_partitioner(self): module = SimpleModel() # noqa: F405 sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) module = self.get_qdq_module(module, sample_input) @@ -1453,6 +1501,79 @@ def test_qnn_backend_skip_node_op(self): skip_node_op_set={"aten.add.Tensor"}, ) + def test_qnn_backend_skip_node_op_quantizer(self): + module = SimpleModel() # noqa: F405 + sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) + + # define partitioner + backend_options = generate_htp_compiler_spec( + use_fp16=False, + ) + compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + ) + partitioner = QnnPartitioner(compiler_specs) + # define quantizer + quantizer = QnnQuantizer() + + # define calibration method + def calibrator(gm): + gm(*sample_input) + + # get partially lowered graph module + graph_module, exported_progs = skip_annotation( + nn_module=module, + quantizer=quantizer, + partitioner=partitioner, + sample_input=sample_input, + calibration_cb=calibrator, + fp_node_op_set={torch.ops.aten.add.Tensor}, + ) + self.assertEqual(len(exported_progs), 2) + # lower the whole graph again; the skipped operators stay on CPU + exec_prog = to_edge( + torch.export.export(graph_module, sample_input), + ).to_executorch() + self.verify_output(module, sample_input, exec_prog) + + def test_qnn_backend_graph_level_mixed_precision(self): + module = SimpleModel() # noqa: F405 + sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) + + # define partitioner + backend_options = generate_htp_compiler_spec( + use_fp16=False, + ) + compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + ) + partitioner = QnnPartitioner(compiler_specs) + # define quantizer + quantizer = QnnQuantizer() + + # define calibration method + def calibrator(gm): + gm(*sample_input) + + # get partially lowered graph module + graph_module, exported_progs = skip_annotation( + nn_module=module, + quantizer=quantizer, + partitioner=partitioner, + sample_input=sample_input, + calibration_cb=calibrator, + fp_node_id_set={"add", "mean"}, + fallback_to_cpu=False, + ) + self.assertEqual(len(exported_progs), 5) + # lower the whole graph again; the skipped operators will be delegated with fp16 + exec_prog = to_edge( + torch.export.export(graph_module, sample_input), + ).to_executorch() + self.verify_output(module, sample_input, exec_prog) + def test_qnn_backend_multi_contexts(self): module = SimpleModel() # noqa: F405 sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) @@ -1493,16 +1614,10 @@ def test_qnn_backend_multi_contexts_composite(self): quantize_method=self.get_qdq_module, ) sample_input = module.get_random_input() - edge_prog = ExirExportedProgram( + edge_prog = to_edge( torch.export.export(module, sample_input), - after_to_edge_passes=False, - ).to_edge( - EdgeCompileConfig( - _check_ir_validity=False, - _skip_dim_order=True, # TODO(T182928844): Delegate dim order op to backend. - ) ) - canonicalize_program(edge_prog.exported_program) + canonicalize_program(edge_prog.exported_program()) exec_prog = edge_prog.to_executorch() self.verify_output(module.get_reference_module(), sample_input, exec_prog) @@ -1555,6 +1670,7 @@ def test_qnn_backend_online_prepare(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + @unittest.skip("segfault happens in recent torch.export.export") def test_qnn_backend_context_direct(self): with tempfile.TemporaryDirectory() as tmp_dir: module = ContextBinaryExample() # noqa: F405 @@ -2418,6 +2534,7 @@ def test_stories_single_llama(self): model_out = msg["result"][0] self.assertTrue(model_out.startswith(golden_start_with)) + @unittest.skip("dynamic shape inputs appear in recent torch.export.export") def test_mobilebert(self): if not self.required_envs([self.pretrained_weight]): self.skipTest("missing required envs") @@ -2458,13 +2575,8 @@ def test_mobilebert(self): for k, v in cpu.items(): self.assertLessEqual(abs(v[0] - htp[k][0]), 2) - @unittest.skip("will be enabled after TODOs got resolved") + @unittest.skip("eager mode fake quant works well; needs further investigation") def test_ptq_mobilebert(self): - # TODO: 2 approaches to resolve accuracy issue - # 1. fallback embedding layers: - # - skip annotation in quantizer (need PR to provide helper funciton) - # - skip operators in partitioner (use existent "skip_node_op_set") - # 2.
investigate different quantization configurations / mechanisms if not self.required_envs([self.pretrained_weight]): self.skipTest("missing required envs") @@ -2481,6 +2593,8 @@ def test_ptq_mobilebert(self): self.model, "--pretrained_weight", self.pretrained_weight, + "--ptq", + "16a16w", "--ip", self.ip, "--port", diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py index b206a7e1330..0d9e1a69679 100644 --- a/backends/qualcomm/tests/utils.py +++ b/backends/qualcomm/tests/utils.py @@ -41,7 +41,7 @@ from executorch.exir.lowered_backend_module import LoweredBackendModule from executorch.exir.pass_base import ExportPass from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass -from executorch.exir.program._program import ExecutorchProgram +from executorch.exir.program import ExecutorchProgram, ExecutorchProgramManager from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e @@ -192,7 +192,9 @@ def verify_output( with tempfile.TemporaryDirectory() as tmp_dir: buffer = ( executorch_prog.buffer - if isinstance(executorch_prog, ExecutorchProgram) + if isinstance( + executorch_prog, (ExecutorchProgram, ExecutorchProgramManager) + ) else executorch_prog.buffer() ) ( diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index 6dc0c4c3c8d..2a954f90d24 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import operator from collections import OrderedDict from typing import Callable, Dict, List, Tuple @@ -38,7 +39,11 @@ from executorch.backends.qualcomm.passes.recompose_pixel_unshuffle import ( RecomposePixelUnshuffle, ) +from executorch.backends.qualcomm.passes.recompose_rms_norm import RecomposeRmsNorm from executorch.backends.qualcomm.passes.remove_redundancy import RemoveRedundancy +from executorch.backends.qualcomm.passes.replace_index_put_input import ( + ReplaceIndexPutInput, +) from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( _soc_info_table, QcomChipset, @@ -56,6 +61,7 @@ convert_to_option, ) from executorch.backends.qualcomm.utils.constants import QCOM_QNN_COMPILE_SPEC + from executorch.exir import ExirExportedProgram from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.lowered_backend_module import LoweredBackendModule @@ -63,9 +69,74 @@ from torch._decomp import core_aten_decompositions as torch_core_aten_decompositions from torch.export.exported_program import ExportedProgram from torch.fx import passes +from torch.fx.passes.operator_support import OperatorSupportBase from torch.library import Library +class _AnnotationSkipper(OperatorSupportBase): + """ + Class used to partition out unwanted graph nodes. + e.g. 
- nodes to be excluded from quantization annotation + - nodes that have been grouped together as a submodule + + Attributes + ---------- + fp_node_id_set : set + a set of node names to be kept in fp precision + fp_node_op_set : set + a set of node targets (aten dialect) to be kept in fp precision + skip_annotated_submodule : bool + flag indicating whether annotated submodules should be skipped + + Methods + ------- + should_delegate(n: torch.fx.Node) + identify residual nodes that have not been lowered with fixed precision + should_skip(n: torch.fx.Node) + identify nodes that should be kept out of fixed-precision lowering + is_node_supported(_, node: torch.fx.Node) + overridden method for graph partitioning + """ + + def __init__( + self, + fp_node_id_set: set = None, + fp_node_op_set: set = None, + skip_annotated_submodule: bool = False, + ): + self.fp_node_id_set = fp_node_id_set + self.fp_node_op_set = fp_node_op_set + self.skip_annotated_submodule = skip_annotated_submodule + + def should_delegate(self, n: torch.fx.Node): + return n.op == "call_function" and n.target != operator.getitem + + def should_skip(self, n: torch.fx.Node): + return n.name in self.fp_node_id_set or n.target in self.fp_node_op_set + + def is_node_supported(self, _, node: torch.fx.Node) -> bool: + if self.skip_annotated_submodule: + if node.op == "get_attr": + return all(self.should_delegate(user) for user in node.users) + return self.should_delegate(node) + + if any( + [ + node.op in ("placeholder", "output"), + self.should_skip(node), + # check if parameters belong to a fallback operator + ( + node.op == "get_attr" + and all(self.should_skip(user) for user in node.users) + ), + ] + ): + print(f"[QNN Quantizer Annotation]: {node.name} | Skipped") + return False + + return True + + def qnn_capture_config(): return exir.CaptureConfig(enable_aot=True) @@ -184,8 +255,10 @@ def get_decomp_table() -> Dict[torch._ops.OperatorBase, Callable]: # The below super ops are supported by QNN remove_decompositions = [ torch.ops.aten.pixel_shuffle.default, + torch.ops.aten.pixel_unshuffle.default, torch.ops.aten.hardsigmoid.default, torch.ops.aten.hardswish.default, + torch.ops.aten._safe_softmax.default,
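+ # (presumably emitted by recent torch.export, e.g. via scaled_dot_product_attention;
+ # keeping it un-decomposed lets the new annotate_softmax registration lower it to QNN directly)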
"") == i + ] + # fuse group nodes into submodule + sorted_nodes = topo_sort(node_list) + submodule_name = f"{subgm_tag}_{i}" + subgm, orig_inputs, orig_outputs = fuse_as_graphmodule( + gm, sorted_nodes, submodule_name + ) + # insert submodule & trim group nodes + gm = insert_subgm( + gm, + subgm_cb(subgm, submodule_name), + orig_inputs, + orig_outputs, + ) + erase_nodes(gm, sorted_nodes) + legalize_graph(gm) + + gm.recompile() + return gm + + +def _canonicalize_graph_with_lowered_module(gm, subgm_tag, ptn): + from executorch.exir.backend.backend_api import to_backend + + # return lowered program for user to debug + exported_progs = [] + # partition each submodule which went through convert_pt2e + for node in gm.graph.nodes: + if node.op == "call_module" and subgm_tag in node.name: + # obtain sample inputs through meta + subgm_input = [ + torch.ones(arg.meta["val"].shape, dtype=arg.meta["val"].dtype) + for arg in node.args + ] + # program meets QNN backend requirement + sub_prog = capture_program(gm.get_submodule(node.name), tuple(subgm_input)) + # start lowering with given partitioner + exported_progs.append(to_backend(sub_prog.exported_program, ptn)) + # replace submodule with lowered module + gm.set_submodule( + node.name, + exported_progs[-1].graph_module, + ) + # if node has multiple outputs, getitems will be default generated + if all(n.target != operator.getitem for n in node.users): + with gm.graph.inserting_after(node): + getitem_node = gm.graph.call_function( + operator.getitem, + (node, 0), + ) + getitem_node.meta = node.meta + node.replace_all_uses_with( + replace_with=getitem_node, + delete_user_cb=lambda user: user.target != operator.getitem, + ) + + gm.recompile() + return gm, exported_progs + + +def skip_annotation( + nn_module: torch.nn.Module, + quantizer, + partitioner, + sample_input: Tuple[torch.Tensor, ...], + calibration_cb: Callable[[torch.fx.GraphModule], None], + fp_node_id_set: set = None, + fp_node_op_set: set = None, + fallback_to_cpu: bool = True, +): + r""" + Exclude speific operators from quantizer annotation. + Skipped operators will defaultly stay in CPU, set 'fallback_to_cpu' + to False for trying to delegate them with FP16 precision. + + e.g.: consider following graph: + bias_1 weight_1 input_1 bias_2 weight_2 input_2 + | (placeholder) | | (placeholder) | + \ | / \ | / + \ | / \ | / + \ | / \ | / + conv2d_1 conv2d_2 + (torch.ops.aten.conv2d.default) + \ / + \ / + \_______ _______/ + add_1 + (torch.ops.aten.add.default) + | + output + + If user wants to skip convolution op by names with + 'skip_node_id_set' = {"conv2d_1"} + "bias_1 / weight_1 / input_1 / input_2 / conv2d_1" + will be partitioned out and not annotated / lowered with QNN. + + [Generated graph] + bias_1 weight_1 input_1 input_2 + | (placeholder) | | + \ | / | + \ | / | + \ | / | + conv2d_1 | + \ / + \ / + \ / + lowered_module_1 + (QNN fixed precision) + | + output + + If user wants to skip convolution op by target with + 'skip_node_op_set' = {torch.ops.aten.conv2d.default} + "bias_1 / weight_1 / input_1 / conv2d_1, + bias_2 / weight_2 / input_2 / conv2d_2" + will be partitioned out and not annotated / lowered with QNN. 
+def skip_annotation( + nn_module: torch.nn.Module, + quantizer, + partitioner, + sample_input: Tuple[torch.Tensor, ...], + calibration_cb: Callable[[torch.fx.GraphModule], None], + fp_node_id_set: set = None, + fp_node_op_set: set = None, + fallback_to_cpu: bool = True, +): + r""" + Exclude specific operators from quantizer annotation. + Skipped operators stay on CPU by default; set 'fallback_to_cpu' + to False to try delegating them with FP16 precision. + + e.g. consider the following graph: + bias_1 weight_1 input_1 bias_2 weight_2 input_2 + | (placeholder) | | (placeholder) | + \ | / \ | / + \ | / \ | / + \ | / \ | / + conv2d_1 conv2d_2 + (torch.ops.aten.conv2d.default) + \ / + \ / + \_______ _______/ + add_1 + (torch.ops.aten.add.default) + | + output + + If the user wants to skip a convolution op by name with + 'fp_node_id_set' = {"conv2d_1"} + "bias_1 / weight_1 / input_1 / input_2 / conv2d_1" + will be partitioned out and not annotated / lowered with QNN. + + [Generated graph] + bias_1 weight_1 input_1 input_2 + | (placeholder) | | + \ | / | + \ | / | + \ | / | + conv2d_1 | + \ / + \ / + \ / + lowered_module_1 + (QNN fixed precision) + | + output + + If the user wants to skip convolution ops by target with + 'fp_node_op_set' = {torch.ops.aten.conv2d.default} + "bias_1 / weight_1 / input_1 / conv2d_1, + bias_2 / weight_2 / input_2 / conv2d_2" + will be partitioned out and not annotated / lowered with QNN. + + [Generated graph] + bias_1 weight_1 input_1 bias_2 weight_2 input_2 + | (placeholder) | | (placeholder) | + \ | / \ | / + \ | / \ | / + \ | / \ | / + conv2d_1 conv2d_2 + (torch.ops.aten.conv2d.default) + \ / + \ / + \__ __/ + lowered_module_1 + (QNN fixed precision) + | + output + + If the user wants to delegate the skipped conv2d from the above graph + with 'fallback_to_cpu' = False: + + [Generated graph] + input_1 input_2 + (placeholder) (placeholder) + | | + \ / + lowered_module_2 + (QNN fp16 precision) + | + | + lowered_module_1 + (QNN fixed precision) + | + output + + Args: + nn_module (torch.nn.Module): The module to be lowered. + quantizer (QnnQuantizer): Instance of QnnQuantizer. + partitioner (QnnPartitioner): Instance of QnnPartitioner. + sample_input ((torch.Tensor, ...)): Sample input tensors for graph exporting. + calibration_cb (callable): Callback function for user-defined calibration. + fp_node_id_set ({str, ...}): Set of operator names to be left in fp precision. + fp_node_op_set ({torch.ops.aten.xxx, ...}): Set of operator targets to be left in fp precision. + fallback_to_cpu (bool): Whether skipped nodes stay on CPU (True) or are delegated with fp16 (False). + + Returns: + graph_module: The partially lowered graph module, ready to be lowered again. + exported_programs: List of programs lowered to QnnBackend (quantized graphs, plus fp16 graphs when fallback_to_cpu is False). + """ + from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( + QnnExecuTorchHtpPrecision, + ) + from executorch.backends.qualcomm.serialization.qnn_compile_spec_serialize import ( + convert_to_option, + ) + from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e + from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner + + def prepare_subgm(subgm, subgm_name): + # prepare current submodule for quantization annotation + subgm_prepared = prepare_pt2e(subgm, quantizer) + # overwrite this attribute, or the name will be set to "GraphModule"; + # we could not tell the submodules apart if this is not done + subgm_prepared.__class__.__name__ = subgm_name + return subgm_prepared + + fp_node_id_set = fp_node_id_set if fp_node_id_set is not None else set() + fp_node_op_set = fp_node_op_set if fp_node_op_set is not None else set() + graph_module = torch.export.export(nn_module, sample_input).module() + # define node support type + capability_partitioner = CapabilityBasedPartitioner( + graph_module, + _AnnotationSkipper(fp_node_id_set, fp_node_op_set), + allows_single_node_partition=True, + ) + subgm_tag = "annotated_group" + graph_module = _partition_graph_into_submodules( + gm=graph_module, + subgm_tag=subgm_tag, + subgm_cb=prepare_subgm, + ptn=capability_partitioner, + ) + # perform calibration + calibration_cb(graph_module) + # convert submodules which went through prepare_pt2e + for node in graph_module.graph.nodes: + if node.op == "call_module": + graph_module.set_submodule( + node.name, convert_pt2e(graph_module.get_submodule(node.name)) + ) + # canonicalize graph for lowering again + graph_module, exported_progs = _canonicalize_graph_with_lowered_module( + gm=graph_module, + subgm_tag=subgm_tag, + ptn=partitioner, + ) + + if not fallback_to_cpu: + try: + from executorch.exir.backend.partitioner import DelegationSpec + + # change HTP compiler spec for hardware to enable fp16 + qnn_option = generate_qnn_executorch_option( + partitioner.compiler_specs_snapshot + ) + compile_option = convert_to_option(qnn_option) + htp_options = compile_option.backend_options.htp_options + htp_options.precision = QnnExecuTorchHtpPrecision.kHtpFp16 + partitioner.delegation_spec = DelegationSpec( + "QnnBackend", + [ +
CompileSpec( + QCOM_QNN_COMPILE_SPEC, convert_to_flatbuffer(compile_option) + ) + ], + ) + except Exception: + print( + "Failed to change HTP compiler spec with 'use_fp16' as True;" + " skipped operators will fall back to CPU." + ) + return graph_module, exported_progs + + # try lowering skipped operators into fp16 + capability_partitioner = CapabilityBasedPartitioner( + graph_module, + _AnnotationSkipper(skip_annotated_submodule=True), + allows_single_node_partition=True, + ) + subgm_tag = "skipped_group" + graph_module = _partition_graph_into_submodules( + gm=graph_module, + subgm_tag=subgm_tag, + subgm_cb=lambda subgm, _: subgm, + ptn=capability_partitioner, + ) + graph_module, exported_progs_fp = _canonicalize_graph_with_lowered_module( + gm=graph_module, + subgm_tag=subgm_tag, + ptn=partitioner, + ) + exported_progs.extend(exported_progs_fp) + + return graph_module, exported_progs + + +
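A minimal usage sketch for skip_annotation, mirroring the new tests earlier in this diff (`model`, `sample_input`, and the `compiler_specs` built via generate_htp_compiler_spec / generate_qnn_executorch_compiler_spec are assumed from there):

    quantizer = QnnQuantizer()
    partitioner = QnnPartitioner(compiler_specs)
    graph_module, exported_progs = skip_annotation(
        nn_module=model,
        quantizer=quantizer,
        partitioner=partitioner,
        sample_input=sample_input,
        calibration_cb=lambda gm: gm(*sample_input),
        fp_node_op_set={torch.ops.aten.add.Tensor},
    )
    # lower the mixed graph once more; with fallback_to_cpu=True (the default)
    # the skipped adds stay on CPU
    exec_prog = to_edge(
        torch.export.export(graph_module, sample_input)
    ).to_executorch()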
def from_context_binary( ctx_path: str, op_name: str, soc_model: QcomChipset = QcomChipset.SM8650 ): diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 6fe6746ec0d..dc507f91626 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -356,6 +356,14 @@ vkapi::VulkanBuffer& vTensor::buffer( return storage_.buffer_; } +utils::uvec3 vTensor::mapped_extents() const { + utils::uvec3 m_extents; + m_extents[0] = storage_.image_extents_[axis_mapping_.at(0)]; + m_extents[1] = storage_.image_extents_[axis_mapping_.at(1)]; + m_extents[2] = storage_.image_extents_[axis_mapping_.at(2)]; + return m_extents; +} + const vkapi::BufferBindInfo vTensor::sizes_ubo() { if (!sizes_uniform_.buffer()) { sizes_uniform_ = diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 70f363796fd..31052b351de 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -347,10 +347,25 @@ class vTensor final { return storage_.storage_type_ == utils::kBuffer; } + /* + * Returns the raw image extents of the underlying image texture used to store + * the tensor's data. Note that due to axis mapping, the X, Y, and Z extents + * may not correspond to the width, height, or channels dimension of the + * tensor. + */ inline const utils::uvec3& image_extents() const { return storage_.image_extents_; } + /* + * Returns the image extents of the underlying image texture, but re-ordered + * such that the first element is the extent of the axis used to represent the + * tensor's width dimension, the second element is the extent of the axis used + * to represent the tensor's height dimension, and the third element is the + * extent of the axis used to represent the tensor's channels dimension. + */ + utils::uvec3 mapped_extents() const; + /* * Extract an `vkapi::ScalarType` from the TensorOptions member */ diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index afdc8290cdd..46787955336 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -288,6 +288,10 @@ class ComputeGraph final { return values_.at(idx).toConstTensor().image_extents(); } + inline utils::uvec3 mapped_extents_of(const ValueRef idx) const { + return values_.at(idx).toConstTensor().mapped_extents(); + } + inline int32_t numel_of(const ValueRef idx) const { return values_.at(idx).toConstTensor().numel(); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl index 1698efb0b15..6e964c745e3 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl @@ -16,90 +16,219 @@ $if MAT2_IS_TRANSPOSED: $if BATCH_MODE: #define BATCH_MODE -$if TILE_ROW == "tile_row_2": - #define TILE_ROW_2 +$if HAS_BIAS: + #define HAS_BIAS #include "indexing_utils.h" -#include "matmul.h" -// addmm will have additional arguments compared to regular mm -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out; -layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1; -layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2; -layout(set = 0, binding = 3) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_self; +${layout_declare_tensor(B, "w", "out_tensor", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "mat1_tensor", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "mat2_tensor", DTYPE, "texture3d")} +$if HAS_BIAS: + ${layout_declare_tensor(B, "r", "bias_tensor", DTYPE, "texture3d")} +${layout_declare_ubo(B, "ivec4", "out_sizes")} +${layout_declare_ubo(B, "ivec4", "out_axis_mapping")} +${layout_declare_ubo(B, "ivec4", "mat1_sizes")} +${layout_declare_ubo(B, "ivec4", "mat1_axis_mapping")} +${layout_declare_ubo(B, "ivec4", "mat2_sizes")} +${layout_declare_ubo(B, "ivec4", "mat2_axis_mapping")} +$if HAS_BIAS: + ${layout_declare_ubo(B, "ivec4", "bias_sizes")} + ${layout_declare_ubo(B, "ivec4", "bias_axis_mapping")} + ${layout_declare_ubo(B, "float", "alpha", "float", "beta")} -layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -layout(set = 0, binding = 5) uniform PRECISION restrict OutSizes { - ivec4 out_sizes; -}; +layout(constant_id = 3) const int out_packed_dim = C_DIM; -layout(set = 0, binding = 6) uniform PRECISION restrict SelfSizes { - ivec4 self_sizes; -}; +// To convince the SPIR-V compiler to unroll the loops optimally, we need this +// macro +#define FOUR 4 -layout(set = 0, binding = 7) uniform PRECISION restrict InLimits { - ivec3 in_limits; +#define TILE_ROWS ${TILE_ROWS} + +// we avoid mat4 and vec4 usage here as they compile to much less efficient +// SPIR-V +struct FloatMatrix_2d { + float data[TILE_ROWS][FOUR]; }; -layout(set = 0, binding = 8) uniform PRECISION restrict Params { - float alpha; - float beta; +struct FloatMatrix_3d { + float data[TILE_ROWS][FOUR][FOUR]; }; -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; +#ifdef BATCH_MODE + #define FloatMatrix FloatMatrix_3d +#else + #define FloatMatrix FloatMatrix_2d +#endif
// BATCH_MODE + +#ifdef HAS_BIAS +// get texel from self tensor (channel_packed) in addmm +vec4 get_texel_C_packed(const ivec2 idx) { + ivec3 bias_pos = ivec3(0); + if (bias_sizes.x > 1) { + bias_pos[bias_axis_mapping.x] = idx.x; + } + if (bias_sizes.y > 1) { + bias_pos[bias_axis_mapping.y] = idx.y; + } -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); + return texelFetch(bias_tensor, bias_pos, 0); +} +#endif // HAS_BIAS + +FloatMatrix matmul_partial(const ivec4 out_idx_tl) { + FloatMatrix results; + for (int i = 0; i < TILE_ROWS; i++) { + for (int j = 0; j < FOUR; j++) { +#ifdef BATCH_MODE + for (int k = 0; k < FOUR; k++) { + results.data[i][j][k] = 0.0f; + } +#else + results.data[i][j] = 0.0f; +#endif // BATCH_MODE + } + } + vec4 mat1_tensor_partial_load[TILE_ROWS]; + vec4 mat2_tensor_partial_load[FOUR]; + +#ifdef MAT2_IS_TRANSPOSED + const int mat2_k_axis = mat2_axis_mapping.x; + const int mat2_row_axis = mat2_axis_mapping.y; +#else + const int mat2_k_axis = mat2_axis_mapping.y; + const int mat2_row_axis = mat2_axis_mapping.x; +#endif // MAT2_IS_TRANSPOSED + +#ifdef BATCH_MODE + for (int batch_idx = 0; batch_idx < FOUR; batch_idx++) { + if (out_idx_tl.z + batch_idx >= out_sizes.z) { + break; + } +#endif // BATCH_MODE + for (int k = 0; k < mat1_sizes.x; k+=4) { + const int k_div4 = k >> 2; + // read and cache (4 x TILE_ROWS) tile of mat1 + for (int r = 0; r < TILE_ROWS; r++) { + ivec3 mat1_pos = ivec3(0); + mat1_pos[mat1_axis_mapping.x] = k_div4; + mat1_pos[mat1_axis_mapping.y] = out_idx_tl.y + r; +#ifdef BATCH_MODE + mat1_pos[mat1_axis_mapping.z] = out_idx_tl.z + batch_idx; +#endif // BATCH_MODE + + mat1_tensor_partial_load[r] = texelFetch(mat1_tensor, mat1_pos, 0); + } - if (any(greaterThanEqual(pos, out_limits))) { - return; + // read and cache (4 x 4) tile of mat2 + for (int r = 0; r < FOUR; ++r) { + ivec3 mat2_pos = ivec3(0); + mat2_pos[mat2_k_axis] = k_div4; + mat2_pos[mat2_row_axis] = out_idx_tl.x + r; +#if defined(BATCH_MODE) && !defined(MAT2_IS_TRANSPOSED) + mat2_pos[mat2_axis_mapping.z] = out_idx_tl.z + batch_idx; +#endif // BATCH_MODE + + mat2_tensor_partial_load[r] = texelFetch(mat2_tensor, mat2_pos, 0); + } + + // perform partial dot products and add partial result to results + for (int out_row = 0; out_row < TILE_ROWS; out_row++) { + for (int out_col = 0; out_col < FOUR; out_col++) { +#ifdef BATCH_MODE + results.data[out_row][out_col][batch_idx] += +#else + results.data[out_row][out_col] += +#endif // BATCH_MODE + dot(mat1_tensor_partial_load[out_row], mat2_tensor_partial_load[out_col]); + } + } } +#ifdef BATCH_MODE + } +#endif // BATCH_MODE + + return results; +} - $if BATCH_MODE: - FloatMatrix_3d results = matmul_partial_3d( - im_mat1, - im_mat2, - pos, - out_sizes[2], - in_limits[0]); - $else: - FloatMatrix_2d results = matmul_partial_2d( - im_mat1, - im_mat2, - pos, - out_sizes[2], - in_limits[0]); - - for (int idx_c = 0; idx_c < TILE_ROWS; idx_c++) { - for (int idx_r = 0; idx_r < FOUR; idx_r++) { - const ivec3 out_pos = - ivec3(idx_r + FOUR * pos.x, idx_c + TILE_ROWS * pos.y, pos.z); - - vec4 self_texel = get_texel_C_packed( - im_self, - out_pos, - self_sizes.x == 1, - self_sizes.y == 1); - - // results is in transposed order w.r.t. 
the desired output - $if BATCH_MODE: - imageStore( - im_out, - out_pos, - vec4( - beta * self_texel.x + alpha * results.data[idx_c][idx_r][0], - beta * self_texel.x + alpha * results.data[idx_c][idx_r][1], - beta * self_texel.x + alpha * results.data[idx_c][idx_r][2], - beta * self_texel.x + alpha * results.data[idx_c][idx_r][3])); - $else: - imageStore( - im_out, - out_pos, - vec4( - beta * self_texel.x + alpha * results.data[idx_c][idx_r], 0.0, 0.0, 0.0)); +// +// Write result matrix to output (3D matmul) +// + +void write_results_C_packed(const ivec4 out_idx_tl, FloatMatrix results) { + ivec3 out_pos = to_texture_pos( + out_idx_tl, out_sizes, out_axis_mapping, out_packed_dim); + + for (int tile_c = 0; + tile_c < TILE_ROWS; + tile_c++, out_pos[out_axis_mapping.y]++) { + out_pos[out_axis_mapping.x] = out_idx_tl.x; + + for (int tile_r = 0; + tile_r < FOUR; + tile_r++, out_pos[out_axis_mapping.x]++) { + +#ifdef HAS_BIAS + ivec2 bias_idx; + bias_idx[bias_axis_mapping.x] = out_pos[out_axis_mapping.x]; + bias_idx[bias_axis_mapping.y] = out_pos[out_axis_mapping.y]; + float bias_val = get_texel_C_packed(bias_idx).x; +#ifdef BATCH_MODE + vec4 bias_texel = vec4(bias_val); +#else + vec4 bias_texel = vec4(bias_val, 0, 0, 0); +#endif // BATCH_MODE +#endif // HAS_BIAS + +#ifdef BATCH_MODE + vec4 out_texel = vec4( + results.data[tile_c][tile_r][0], + results.data[tile_c][tile_r][1], + results.data[tile_c][tile_r][2], + results.data[tile_c][tile_r][3]); +#else + vec4 out_texel = vec4( + results.data[tile_c][tile_r], + 0.0, + 0.0, + 0.0); +#endif // BATCH_MODE + +#ifdef HAS_BIAS + imageStore(out_tensor, out_pos, beta * bias_texel + alpha * out_texel); +#else + imageStore(out_tensor, out_pos, out_texel); +#endif // HAS_BIAS } } } + +void main() { + // Each thread is responsible for calculating a (4 x TILE_ROWS x 1) tile of + // output elements. If the input matrices are 3D, then a (4 x TILE_ROWS x 4) + // tile of output elements will be computed. Note the sizes are written in + // (W x H x C) format. 
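+ // For example (illustrative numbers): with TILE_ROWS = 4 and BATCH_MODE
+ // defined, the thread with tile_idx = (1, 2, 3) computes the tile whose
+ // top-left tensor index is (W=4, H=8, C=12, N=0).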
+ const ivec3 tile_idx = ivec3(gl_GlobalInvocationID); + + // Calculate the tensor index of the top left element in the output tile + const ivec4 out_idx_topleft = ivec4( + tile_idx.x * 4, + tile_idx.y * TILE_ROWS, +#ifdef BATCH_MODE + tile_idx.z * 4, +#else + tile_idx.z, +#endif // BATCH_MODE + 0); + + // If the top left element is already out of range, then skip + if (any(greaterThanEqual(out_idx_topleft, out_sizes))) { + return; + } + + FloatMatrix results = matmul_partial(out_idx_topleft); + + write_results_C_packed(out_idx_topleft, results); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml index b958d3b9543..c82c2003d20 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml @@ -7,24 +7,37 @@ addmm_optimized: parameter_names_with_default_values: DTYPE: float - NDIM: 3 - PACKING: C_packed MAT2_IS_TRANSPOSED: false BATCH_MODE: false - TILE_ROW: tile_row_4 + TILE_ROWS: 4 + HAS_BIAS: true generate_variant_forall: - TILE_ROW: - - VALUE: tile_row_4 - - VALUE: tile_row_2 + TILE_ROWS: + - VALUE: 4 + SUFFIX: tile_row_4 + - VALUE: 2 + SUFFIX: tile_row_2 DTYPE: - VALUE: float - VALUE: half shader_variants: - NAME: addmm_optimized + - NAME: matmul_optimized + HAS_BIAS: false - NAME: linear_optimized MAT2_IS_TRANSPOSED: true + - NAME: matmul_transposed_optimized + MAT2_IS_TRANSPOSED: true + HAS_BIAS: false - NAME: batch_addmm_optimized BATCH_MODE: true + - NAME: batch_matmul_optimized + BATCH_MODE: true + HAS_BIAS: false - NAME: batch_linear_optimized MAT2_IS_TRANSPOSED: true BATCH_MODE: true + - NAME: batch_matmul_transposed_optimized + MAT2_IS_TRANSPOSED: true + BATCH_MODE: true + HAS_BIAS: false diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl deleted file mode 100644 index 8634371a7b4..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -$if MAT2_IS_TRANSPOSED: - #define MAT2_IS_TRANSPOSED - -$if BATCH_MODE: - #define BATCH_MODE - -$if TILE_ROW == "tile_row_2": - #define TILE_ROW_2 - -#include "indexing_utils.h" -#include "matmul.h" - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out; -layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1; -layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2; - -layout(set = 0, binding = 3) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; - -layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes { - ivec4 out_sizes; -}; - -layout(set = 0, binding = 5) uniform PRECISION restrict InLimits { - ivec3 in_limits; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - $if BATCH_MODE: - FloatMatrix_3d results = matmul_partial_3d( - im_mat1, - im_mat2, - pos, - out_sizes[2], - in_limits[0]); - $else: - FloatMatrix_2d results = matmul_partial_2d( - im_mat1, - im_mat2, - pos, - out_sizes[2], - in_limits[0]); - - for (int idx_c = 0; idx_c < TILE_ROWS; idx_c++) { - for (int idx_r = 0; idx_r < FOUR; idx_r++) { - const ivec3 out_pos = - ivec3(idx_r + FOUR * pos.x, idx_c + TILE_ROWS * pos.y, pos.z); - - // results is in transposed order w.r.t. the desired output - $if BATCH_MODE: - imageStore( - im_out, - out_pos, - vec4( - results.data[idx_c][idx_r][0], - results.data[idx_c][idx_r][1], - results.data[idx_c][idx_r][2], - results.data[idx_c][idx_r][3])); - $else: - imageStore( - im_out, - out_pos, - vec4(results.data[idx_c][idx_r], 0.0, 0.0, 0.0)); - } - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml b/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml deleted file mode 100644 index 9268d5a25aa..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -matmul_optimized: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - PACKING: C_packed - MAT2_IS_TRANSPOSED: false - BATCH_MODE: false - TILE_ROW: tile_row_4 - generate_variant_forall: - TILE_ROW: - - VALUE: tile_row_4 - - VALUE: tile_row_2 - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: matmul_optimized - - NAME: matmul_transposed_optimized - MAT2_IS_TRANSPOSED: true - - NAME: batch_matmul_optimized - BATCH_MODE: true - - NAME: batch_matmul_transposed_optimized - MAT2_IS_TRANSPOSED: true - BATCH_MODE: true diff --git a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp index 63b60bf52f7..14c814b084a 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp @@ -174,10 +174,19 @@ void add_addmm_optimized_node( add_dtype_suffix(kernel_name, graph.dtype_of(out)); utils::uvec3 global_size; + + // Each thread computes a W=4 x H=(2/4) x C=(1/4) output tile. Therefore, the + // total number of threads is W/4 x H/(2 or 4) x C/1. Since the out tensor is + // channels packed, C does not need to be divided by 4.
The "identity" of each + // thread is the (x, y, z) coordinate of the output tile it is computing, and + // this identity can be used to compute the tensor index of the top left + // element in the tile, which will be [W=x*(2 or 4), H=y*4, C=z*(1 or 4), N=0] if (mat1_sizes.at(mat1_dims - 2) < 8) { - global_size = utils::divup_vec(graph.image_extents_of(out), {4, 2, 1}); + // Use `mapped_extents` instead of `image_extents` because the workgroup + // axes need to correspond to tensor dimensions. + global_size = utils::divup_vec(graph.mapped_extents_of(out), {4, 2, 1}); } else { - global_size = utils::divup_vec(graph.image_extents_of(out), {4, 4, 1}); + global_size = utils::divup_vec(graph.mapped_extents_of(out), {4, 4, 1}); } utils::uvec3 local_size = adaptive_work_group_size(global_size); @@ -191,14 +200,18 @@ void add_addmm_optimized_node( {{mat1_W_packed, mat2_packed, self}, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - graph.texture_limits_ubo(out), graph.sizes_ubo(out), + graph.axis_mapping_ubo(out), + graph.sizes_ubo(mat1_W_packed), + graph.axis_mapping_ubo(mat1_W_packed), + graph.sizes_ubo(mat2_packed), + graph.axis_mapping_ubo(mat2_packed), graph.sizes_ubo(self), - graph.texture_limits_ubo(mat1_W_packed), + graph.axis_mapping_ubo(self), graph.create_params_buffer(params), }, // Specialization Constants - {}, + {graph.packed_dim_whcn_idx_of(out)}, // Resizing Logic resize_addmm_node, {mat2_is_transposed})); diff --git a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp index a25a602e38f..07618239a65 100644 --- a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp @@ -181,12 +181,21 @@ void add_matmul_optimized_node( add_dtype_suffix(kernel_name, graph.dtype_of(out)); + // Each thread computes a W=(2/4) x H=4 x C=(1/4) output tile. Therefore, the + // total number of threads is W/(2 or 4) x H/4 x C/1. Since the out tensor is + // channels packed, C does not need to be divided by 4. The "identity" of each + // thread is the (x, y, z) coordinate of the output tile it is computing, and + // this identity can be used to compute the tensor index of the top left + // element in the tile, which will be [W=x*(2 or 4), H=y*4, C=z*(1 or 4), N=0] utils::uvec3 global_size; if (mat1_sizes.at(mat1_dims - 2) < 8) { - global_size = utils::divup_vec(graph.image_extents_of(out), {4, 2, 1}); + // Use `mapped_extents` instead of `image_extents` because the workgroup + // axes need to correspond to tensor dimensions. 
+ global_size = utils::divup_vec(graph.mapped_extents_of(out), {4, 2, 1}); } else { - global_size = utils::divup_vec(graph.image_extents_of(out), {4, 4, 1}); + global_size = utils::divup_vec(graph.mapped_extents_of(out), {4, 4, 1}); } + utils::uvec3 local_size = adaptive_work_group_size(global_size); graph.execute_nodes().emplace_back(new ExecuteNode( @@ -199,12 +208,15 @@ void add_matmul_optimized_node( {{mat1_W_packed, mat2_packed}, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - graph.texture_limits_ubo(out), graph.sizes_ubo(out), - graph.texture_limits_ubo(mat1_W_packed), + graph.axis_mapping_ubo(out), + graph.sizes_ubo(mat1_W_packed), + graph.axis_mapping_ubo(mat1_W_packed), + graph.sizes_ubo(mat2_packed), + graph.axis_mapping_ubo(mat2_packed), }, // Specialization Constants - {}, + {graph.packed_dim_whcn_idx_of(out)}, // Resizing Logic resize_matmul_node, {mat2_is_transposed})); diff --git a/backends/vulkan/tools/gpuinfo/include/architecture.h b/backends/vulkan/tools/gpuinfo/include/architecture.h index 20c6254e1a0..9af908eb170 100644 --- a/backends/vulkan/tools/gpuinfo/include/architecture.h +++ b/backends/vulkan/tools/gpuinfo/include/architecture.h @@ -242,7 +242,7 @@ void warp_size(const App& app, const bool verbose = false) { }); std::vector data(app.nthread_logic); - copy_staging_to_ptr(out_buf, data.data(), out_buf.nbytes()); + out_buf.copy_to(data.data(), out_buf.nbytes()); if (verbose) { std::stringstream ss; diff --git a/backends/xnnpack/passes/convert_to_linear.py b/backends/xnnpack/passes/convert_to_linear.py index 69f882523c8..2cef71bf927 100644 --- a/backends/xnnpack/passes/convert_to_linear.py +++ b/backends/xnnpack/passes/convert_to_linear.py @@ -13,9 +13,8 @@ from executorch.backends.transforms.addmm_mm_to_linear import ( apply_addmm_mm_to_linear_transform, ) -from executorch.backends.xnnpack.passes.xnnpack_pass import XNNPACKPass -from executorch.backends.xnnpack.utils.utils import is_param_node from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass from torch.fx.passes.infra.pass_base import PassResult from torch.fx.passes.utils.source_matcher_utils import ( @@ -27,7 +26,7 @@ logger.setLevel(logging.WARNING) -class ConvertToLinearPass(XNNPACKPass): +class ConvertToLinearPass(ExportPass): linear_modules = [ torch.nn.Linear, torch.nn.functional.linear, @@ -71,28 +70,24 @@ def get_arg(node: torch.fx.Node, arg: str): map_ = {"input": 0, "weight": 1} return None if arg == "bias" else node.args[map_[arg]] - def find_bias_for_mm(self, src_partition: SourcePartition, weight: torch.fx.Node): + def find_bias_for_mm(self, src_partition: SourcePartition, mm_node: torch.fx.Node): """ For linear decomposed with mm + add, find bias in src partition """ - out_channels = get_shape(weight)[0] - bias = None - - # Try to find bias node in all nodes - for node in src_partition.nodes: - if is_param_node(self.exported_program, node) and node != weight: - bias = node - - if bias is not None: - assert get_shape(bias) == [ - out_channels - ], f"Expected bias shape {[out_channels]} but got {get_shape(bias)}" - else: - assert exir_ops.edge.aten.add.Tensor not in [ - node.target for node in src_partition.nodes - ], f"Expecting to find bias for Linear module: {src_partition} but could not find it" - return bias + mm_users = list(mm_node.users.keys()) + if len(mm_users) != 1: + return None + + add_node = mm_users[0] + if add_node.target != exir_ops.edge.aten.add.Tensor: + return None + + for arg in add_node.all_input_nodes: + if 
arg != mm_node and arg in src_partition.input_nodes: + return arg + + return None def create_linear( self, @@ -119,7 +114,7 @@ def create_linear( src_partition.input_nodes + src_partition.params, # bias can be in params ) if linear_bias is None and node.target == exir_ops.edge.aten.mm.default: - linear_bias = self.find_bias_for_mm(src_partition, linear_weight) + linear_bias = self.find_bias_for_mm(src_partition, node) logger.debug(f"Found bias(?): {linear_bias} from node {node}") diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 3c076cc5bdf..917512d71b6 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -54,20 +54,6 @@ build_android_native_library() { fi cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config Release - cmake examples/models/llama2 \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI="$ANDROID_ABI" \ - -DANDROID_PLATFORM=android-23 \ - -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DCMAKE_BUILD_TYPE=Release \ - -B"${CMAKE_OUT}"/examples/models/llama2 - - cmake --build "${CMAKE_OUT}"/examples/models/llama2 -j "${CMAKE_JOBS}" --config Release - - cmake extension/android \ -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="${ANDROID_ABI}" \ @@ -75,6 +61,7 @@ build_android_native_library() { -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_LOG_LEVEL=Info \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_LLAMA_JNI=ON \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/extension/android @@ -110,7 +97,7 @@ build_aar() { find jni -type f -name "libexecutorch_jni.so" -exec bash -c 'mv "$1" "${1/_jni/}"' bash {} \; # Zip all necessary files into the AAR file zip -r executorch.aar libs jni/*/libexecutorch.so jni/*/libqnn*.so jni/*/libQnn*.so AndroidManifest.xml - zip -r executorch-llama.aar libs jni/*/libexecutorch_llama_jni.so jni/*/libqnn*.so jni/*/libQnn*.so AndroidManifest.xml + zip -r executorch-llama.aar libs jni/*/libexecutorch.so jni/*/libqnn*.so jni/*/libQnn*.so AndroidManifest.xml popd } diff --git a/codegen/templates/RegisterCodegenUnboxedKernels.cpp b/codegen/templates/RegisterCodegenUnboxedKernels.cpp index a7790be7fed..3076cde1a99 100644 --- a/codegen/templates/RegisterCodegenUnboxedKernels.cpp +++ b/codegen/templates/RegisterCodegenUnboxedKernels.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include #include "${fn_header}" // Generated Function import headers @@ -21,7 +22,8 @@ // JIT op registry instead of c10 dispatcher. JIT op registry only takes boxed // kernels, so we are calling unboxing functions in UnboxingFunctions.h to cast // arguments into C++ types (instead of IValue) and delegate to unboxed kernels. -using KernelArrayRef = ::torch::executor::ArrayRef<::torch::executor::Kernel>; +using KernelSpan = + ::executorch::runtime::Span; namespace torch { namespace executor { namespace function { @@ -31,15 +33,15 @@ static Kernel kernels_to_register[] = { ${unboxed_kernels} // Generated kernels }; -// Explicitly convert to ArrayRef, so that the API can take an empty C array of +// Explicitly convert to Span, so that the API can take an empty C array of // Kernels. 
-static KernelArrayRef kernel_array_ref( +static KernelSpan kernel_span( kernels_to_register, kernels_to_register + sizeof(kernels_to_register) / sizeof(Kernel)); // Return value not used. Keep the static variable assignment to register // kernels in static initialization time. -static auto success_with_kernel_reg = register_kernels(kernel_array_ref); +static auto success_with_kernel_reg = register_kernels(kernel_span); } // namespace } // namespace function } // namespace executor } // namespace torch diff --git a/codegen/templates/RegisterKernels.cpp b/codegen/templates/RegisterKernels.cpp index 2313a30a307..91eac200222 100644 --- a/codegen/templates/RegisterKernels.cpp +++ b/codegen/templates/RegisterKernels.cpp @@ -19,7 +19,8 @@ Error register_all_kernels() { Kernel kernels_to_register[] = { ${unboxed_kernels} // Generated kernels }; - Error success_with_kernel_reg = register_kernels(kernels_to_register); + Error success_with_kernel_reg = + ::executorch::runtime::register_kernels({kernels_to_register}); if (success_with_kernel_reg != Error::Ok) { ET_LOG(Error, "Failed register all kernels"); return success_with_kernel_reg; diff --git a/devtools/bundled_program/bundled_program.cpp b/devtools/bundled_program/bundled_program.cpp index d174cbdcdad..54f84f6fef1 100644 --- a/devtools/bundled_program/bundled_program.cpp +++ b/devtools/bundled_program/bundled_program.cpp @@ -23,13 +23,21 @@ #include #include -namespace torch { -namespace executor { +using exec_aten::ArrayRef; +using exec_aten::Half; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using ::executorch::runtime::Error; +using ::executorch::runtime::EValue; +using ::executorch::runtime::Method; +using ::executorch::runtime::Result; + +namespace executorch { namespace bundled_program { namespace { -#define kMaxDim 16 +constexpr size_t kMaxDim = 16; #ifdef USE_ATEN_LIB @@ -53,6 +61,7 @@ at::Tensor tensor_like(bundled_program_flatbuffer::Tensor* bundled_tensor) { } #else // !USE_ATEN_LIB +using torch::executor::TensorImpl; // Create a tensorimpl with same content using bundled tensor TensorImpl impl_like(bundled_program_flatbuffer::Tensor* bundled_tensor) { ScalarType scalar_type = @@ -234,9 +243,9 @@ get_method_test_suite( } // namespace // Load testset_idx-th bundled data into the Method -ET_NODISCARD Error LoadBundledInput( +ET_NODISCARD Error load_bundled_input( Method& method, - serialized_bundled_program* bundled_program_ptr, + SerializedBundledProgram* bundled_program_ptr, size_t testset_idx) { ET_CHECK_OR_RETURN_ERROR( bundled_program_flatbuffer::BundledProgramBufferHasIdentifier( @@ -319,19 +328,19 @@ ET_NODISCARD Error LoadBundledInput( ET_CHECK_OR_RETURN_ERROR( status == Error::Ok, NotSupported, - "set_input failed during load bundled inputs with status %" PRIu32, - static_cast<uint32_t>(status)); + "set_input failed during load bundled inputs with status 0x%" PRIx32, + static_cast<uint32_t>(status)); } - internal::event_tracer_set_bundled_input_index( + ::executorch::runtime::internal::event_tracer_set_bundled_input_index( method.get_event_tracer(), testset_idx); return Error::Ok; } -ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( +ET_NODISCARD Error verify_method_outputs( Method& method, - serialized_bundled_program* bundled_program_ptr, + SerializedBundledProgram* bundled_program_ptr, size_t testset_idx, double rtol, double atol) { @@ -390,12 +399,12 @@ ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( return Error::Ok; } -ET_NODISCARD Error GetProgramData( +ET_NODISCARD Error get_program_data( void* file_data, size_t file_data_len,
const void** out_program_data, size_t* out_program_data_len) { - if (IsBundledProgram(file_data)) { + if (is_bundled_program(file_data, file_data_len)) { auto program_bundled = bundled_program_flatbuffer::GetBundledProgram(file_data); *out_program_data = program_bundled->program()->data(); @@ -410,11 +419,13 @@ ET_NODISCARD Error GetProgramData( return Error::Ok; } -bool IsBundledProgram(void* file_data) { +bool is_bundled_program(void* file_data, ET_UNUSED size_t file_data_len) { + // Even though the flatbuffer API doesn't accept a length, it's important to + // require one so that we could change the internal representation, or use a + // future API that does require a length. return bundled_program_flatbuffer::BundledProgramBufferHasIdentifier( file_data); } } // namespace bundled_program -} // namespace executor -} // namespace torch +} // namespace executorch diff --git a/devtools/bundled_program/bundled_program.h b/devtools/bundled_program/bundled_program.h index 8b42923866e..884ca6f21bc 100644 --- a/devtools/bundled_program/bundled_program.h +++ b/devtools/bundled_program/bundled_program.h @@ -11,14 +11,13 @@ #include #include -namespace torch { -namespace executor { +namespace executorch { namespace bundled_program { /** * An opaque pointer to a serialized bundled program. */ -using serialized_bundled_program = const void; +using SerializedBundledProgram = const void; /** * Load testset_idx-th bundled input of method_idx-th Method test in @@ -31,9 +30,9 @@ using serialized_bundled_program = const void; * @returns Return Error::Ok if load successfully, or the error happens during * execution. */ -ET_NODISCARD Error LoadBundledInput( - Method& method, - serialized_bundled_program* bundled_program_ptr, +ET_NODISCARD ::executorch::runtime::Error load_bundled_input( + ::executorch::runtime::Method& method, + SerializedBundledProgram* bundled_program_ptr, size_t testset_idx); /** @@ -49,9 +48,9 @@ ET_NODISCARD Error LoadBundledInput( * @returns Return Error::Ok if two outputs match, or the error happens during * execution. */ -ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( - Method& method, - serialized_bundled_program* bundled_program_ptr, +ET_NODISCARD ::executorch::runtime::Error verify_method_outputs( + ::executorch::runtime::Method& method, + SerializedBundledProgram* bundled_program_ptr, size_t testset_idx, double rtol = 1e-5, double atol = 1e-8); @@ -73,7 +72,7 @@ ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( * in it, and out_program_data/out_program_data_len point to the data. Other * values on failure. */ -ET_NODISCARD Error GetProgramData( +ET_NODISCARD ::executorch::runtime::Error get_program_data( void* file_data, size_t file_data_len, const void** out_program_data, @@ -83,11 +82,61 @@ ET_NODISCARD Error GetProgramData( * Checks whether the given file is a bundled program. * * @param[in] file_data The contents of the given file. + * @param[in] file_data_len The length of file_data, in bytes. * * @returns true if the given file is a bundled program, false otherwise */ -bool IsBundledProgram(void* file_data); +bool is_bundled_program(void* file_data, size_t file_data_len); + +/// DEPRECATED: Use the version with the file_data_len parameter. +ET_DEPRECATED inline bool is_bundled_program(void* file_data) { + // 128 is enough data to contain the identifier in the flatbuffer header. 
+ return is_bundled_program(file_data, 128); +} + +} // namespace bundled_program +} // namespace executorch + +namespace torch { +namespace executor { +namespace bundled_program { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using serialized_bundled_program = + ::executorch::bundled_program::SerializedBundledProgram; + +ET_NODISCARD inline ::executorch::runtime::Error LoadBundledInput( + ::executorch::runtime::Method& method, + serialized_bundled_program* bundled_program_ptr, + size_t testset_idx) { + return ::executorch::bundled_program::load_bundled_input( + method, bundled_program_ptr, testset_idx); +} + +ET_NODISCARD inline ::executorch::runtime::Error +VerifyResultWithBundledExpectedOutput( + ::executorch::runtime::Method& method, + serialized_bundled_program* bundled_program_ptr, + size_t testset_idx, + double rtol = 1e-5, + double atol = 1e-8) { + return ::executorch::bundled_program::verify_method_outputs( + method, bundled_program_ptr, testset_idx, rtol, atol); +} + +ET_NODISCARD inline ::executorch::runtime::Error GetProgramData( + void* file_data, + size_t file_data_len, + const void** out_program_data, + size_t* out_program_data_len) { + return ::executorch::bundled_program::get_program_data( + file_data, file_data_len, out_program_data, out_program_data_len); +} +inline bool IsBundledProgram(void* file_data) { + // 128 is enough data to contain the identifier in the flatbuffer header. + return ::executorch::bundled_program::is_bundled_program(file_data, 128); +} } // namespace bundled_program } // namespace executor } // namespace torch diff --git a/devtools/etdump/emitter.cpp b/devtools/etdump/emitter.cpp index dfca6295306..653c75cb084 100644 --- a/devtools/etdump/emitter.cpp +++ b/devtools/etdump/emitter.cpp @@ -6,16 +6,25 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include + #include +#include + +#include +#include + +#include -#include "executorch/devtools/etdump/emitter.h" -#include "executorch/runtime/platform/assert.h" +using executorch::etdump::internal::ETDumpStaticAllocator; -namespace torch { -namespace executor { +namespace executorch { +namespace etdump { +namespace internal { -static int _allocator_fn( +namespace { + +int allocator_fn( void* alloc_context, flatcc_iovec_t* b, size_t request, @@ -24,8 +33,8 @@ static int _allocator_fn( void* p; size_t n; - struct etdump_static_allocator* state = - (struct etdump_static_allocator*)alloc_context; + ETDumpStaticAllocator* state = + reinterpret_cast(alloc_context); // This allocator doesn't support freeing memory. if (request == 0) { @@ -113,14 +122,14 @@ static int _allocator_fn( // This emitter implementation emits to a fixed size buffer and will fail if it // runs out of room on either end. 
-static int _emitter_fn( +int emitter_fn( void* emit_context, const flatcc_iovec_t* iov, int iov_count, flatbuffers_soffset_t offset, size_t len) { - struct etdump_static_allocator* E = - (struct etdump_static_allocator*)emit_context; + ETDumpStaticAllocator* E = + reinterpret_cast(emit_context); uint8_t* p; if (offset < 0) { @@ -144,40 +153,15 @@ static int _emitter_fn( return 0; } -/******************************************************************************* - * Public Functions - ******************************************************************************/ - -int etdump_static_allocator_builder_init( - flatcc_builder_t* builder, - struct etdump_static_allocator* alloc) { - ET_CHECK(builder != nullptr); - ET_CHECK(alloc != nullptr); - - // Ensure data size is multiple of 32 (minimum allocation size). - ET_CHECK((alloc->data_size & 0x1F) == 0); - // Ensure out_size is divisable by 2 to ensure front/back sizes are equal for - // emitter.. - ET_CHECK((alloc->out_size & 0x1) == 0); - - return flatcc_builder_custom_init( - builder, _emitter_fn, alloc, _allocator_fn, alloc); -} - -void etdump_static_allocator_reset(struct etdump_static_allocator* alloc) { - ET_CHECK(alloc != nullptr); - alloc->allocated = 0; - size_t n = alloc->out_size / 2; - alloc->front_cursor = &alloc->data[alloc->data_size + n]; - alloc->front_left = n; -} +} // namespace -int et_flatcc_custom_init( +int etdump_flatcc_custom_init( flatcc_builder_t* builder, - struct etdump_static_allocator* alloc) { + struct ETDumpStaticAllocator* alloc) { return flatcc_builder_custom_init( - builder, _emitter_fn, alloc, _allocator_fn, alloc); + builder, emitter_fn, alloc, allocator_fn, alloc); } -} // namespace executor -} // namespace torch +} // namespace internal +} // namespace etdump +} // namespace executorch diff --git a/devtools/etdump/emitter.h b/devtools/etdump/emitter.h index bf8ab0b1e1c..09c1b56aa56 100644 --- a/devtools/etdump/emitter.h +++ b/devtools/etdump/emitter.h @@ -6,26 +6,23 @@ * LICENSE file in the root directory of this source tree. */ -#include -#include +#pragma once -#include -#include +#include +#include -#pragma once +#include -namespace torch { -namespace executor { +typedef struct flatcc_builder flatcc_builder_t; -int et_flatcc_custom_init( - flatcc_builder_t* builder, - struct etdump_static_allocator* alloc); +namespace executorch { +namespace etdump { +namespace internal { -int etdump_static_allocator_builder_init( +int etdump_flatcc_custom_init( flatcc_builder_t* builder, - struct etdump_static_allocator* alloc); - -void etdump_static_allocator_reset(struct etdump_static_allocator* alloc); + internal::ETDumpStaticAllocator* alloc); -} // namespace executor -} // namespace torch +} // namespace internal +} // namespace etdump +} // namespace executorch diff --git a/devtools/etdump/etdump_flatcc.cpp b/devtools/etdump/etdump_flatcc.cpp index ca46c12f51c..4c05bb5acee 100644 --- a/devtools/etdump/etdump_flatcc.cpp +++ b/devtools/etdump/etdump_flatcc.cpp @@ -6,19 +6,33 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include "executorch/devtools/etdump/etdump_flatcc.h" +#include + +#include + +#include #include #include +#include +#include +#include + #include -#include -#include -#include "executorch/devtools/etdump/emitter.h" -#include "executorch/runtime/core/exec_aten/exec_aten.h" -#include "executorch/runtime/core/exec_aten/util/scalar_type_util.h" -#include "executorch/runtime/platform/assert.h" -namespace torch { -namespace executor { +using ::exec_aten::Tensor; +using ::executorch::runtime::AllocatorID; +using ::executorch::runtime::ArrayRef; +using ::executorch::runtime::ChainID; +using ::executorch::runtime::DebugHandle; +using ::executorch::runtime::DelegateDebugIdType; +using ::executorch::runtime::EValue; +using ::executorch::runtime::EventTracerEntry; +using ::executorch::runtime::LoggedEValueType; +using ::executorch::runtime::Span; +using ::executorch::runtime::Tag; + +namespace executorch { +namespace etdump { namespace { @@ -50,30 +64,30 @@ executorch_flatbuffer_ScalarType_enum_t get_flatbuffer_scalar_type( } etdump_Tensor_ref_t add_tensor_entry( - flatcc_builder_t* builder, + flatcc_builder_t* builder_, const exec_aten::Tensor& tensor, long offset) { - etdump_Tensor_start(builder); + etdump_Tensor_start(builder_); etdump_Tensor_scalar_type_add( - builder, get_flatbuffer_scalar_type(tensor.scalar_type())); - etdump_Tensor_sizes_start(builder); + builder_, get_flatbuffer_scalar_type(tensor.scalar_type())); + etdump_Tensor_sizes_start(builder_); for (auto dim : tensor.sizes()) { int64_t cast_dim = static_cast(dim); - etdump_Tensor_sizes_push(builder, &cast_dim); + etdump_Tensor_sizes_push(builder_, &cast_dim); } - etdump_Tensor_sizes_end(builder); + etdump_Tensor_sizes_end(builder_); - etdump_Tensor_strides_start(builder); + etdump_Tensor_strides_start(builder_); for (auto dim : tensor.strides()) { int64_t cast_dim = static_cast(dim); - etdump_Tensor_strides_push(builder, &cast_dim); + etdump_Tensor_strides_push(builder_, &cast_dim); } - etdump_Tensor_strides_end(builder); - etdump_Tensor_offset_add(builder, offset); + etdump_Tensor_strides_end(builder_); + etdump_Tensor_offset_add(builder_, offset); - return etdump_Tensor_end(builder); + return etdump_Tensor_end(builder_); } static uint8_t* alignPointer(void* ptr, size_t alignment) { @@ -88,71 +102,71 @@ static uint8_t* alignPointer(void* ptr, size_t alignment) { } // namespace -constexpr size_t max_alloc_buf_size = 128 * 1024; - // Constructor implementation ETDumpGen::ETDumpGen(Span buffer) { - // Initialize the flatcc builder using the buffer and buffer size. + constexpr size_t max_alloc_buf_size = 128 * 1024; + + // Initialize the flatcc builder_ using the buffer and buffer size. if (buffer.data() != nullptr) { - builder = (struct flatcc_builder*)alignPointer(buffer.data(), 64); + builder_ = (struct flatcc_builder*)alignPointer(buffer.data(), 64); uintptr_t buffer_with_builder = - (uintptr_t)alignPointer(builder + sizeof(struct flatcc_builder), 64); + (uintptr_t)alignPointer(builder_ + sizeof(struct flatcc_builder), 64); size_t buffer_size = buffer.size() - (size_t)(buffer_with_builder - (uintptr_t)buffer.data()); - alloc.set_buffer( + alloc_.set_buffer( (uint8_t*)buffer_with_builder, buffer_size, (size_t)((buffer_size / 4 > max_alloc_buf_size) ? 
max_alloc_buf_size : buffer_size / 4)); - et_flatcc_custom_init(builder, &alloc); + internal::etdump_flatcc_custom_init(builder_, &alloc_); } else { - builder = (struct flatcc_builder*)malloc(sizeof(struct flatcc_builder)); + builder_ = (struct flatcc_builder*)malloc(sizeof(struct flatcc_builder)); ET_CHECK_MSG( - builder != nullptr, "Failed to allocate memory for flatcc builder."); - flatcc_builder_init(builder); + builder_ != nullptr, "Failed to allocate memory for flatcc builder_."); + flatcc_builder_init(builder_); } reset(); } ETDumpGen::~ETDumpGen() { - flatcc_builder_clear(builder); + flatcc_builder_clear(builder_); if (!is_static_etdump()) { - free(builder); + free(builder_); } } void ETDumpGen::reset() { - etdump_gen_state = ETDumpGen_Init; - num_blocks = 0; - flatcc_builder_reset(builder); - flatbuffers_buffer_start(builder, etdump_ETDump_file_identifier); - etdump_ETDump_start_as_root_with_size(builder); - etdump_ETDump_version_add(builder, ETDUMP_VERSION); - etdump_ETDump_run_data_start(builder); - etdump_ETDump_run_data_push_start(builder); + state_ = State::Init; + num_blocks_ = 0; + flatcc_builder_reset(builder_); + flatbuffers_buffer_start(builder_, etdump_ETDump_file_identifier); + etdump_ETDump_start_as_root_with_size(builder_); + etdump_ETDump_version_add(builder_, ETDUMP_VERSION); + etdump_ETDump_run_data_start(builder_); + etdump_ETDump_run_data_push_start(builder_); } void ETDumpGen::create_event_block(const char* name) { - if (etdump_gen_state == ETDumpGen_Adding_Events) { - etdump_RunData_events_end(builder); - } else if (etdump_gen_state == ETDumpGen_Done) { + if (state_ == State::AddingEvents) { + etdump_RunData_events_end(builder_); + } else if (state_ == State::Done) { reset(); } - if (num_blocks > 0) { - etdump_ETDump_run_data_push_end(builder); - etdump_ETDump_run_data_push_start(builder); + if (num_blocks_ > 0) { + etdump_ETDump_run_data_push_end(builder_); + etdump_ETDump_run_data_push_start(builder_); } - ++num_blocks; - etdump_RunData_name_create_strn(builder, name, strlen(name)); - if (bundled_input_index != -1) { - etdump_RunData_bundled_input_index_add(builder, bundled_input_index); + ++num_blocks_; + etdump_RunData_name_create_strn(builder_, name, strlen(name)); + if (bundled_input_index_ != -1) { + etdump_RunData_bundled_input_index_add(builder_, bundled_input_index_); } - etdump_gen_state = ETDumpGen_Block_Created; + state_ = State::BlockCreated; } int64_t ETDumpGen::create_string_entry(const char* name) { - return flatbuffers_string_create_str(builder, name); + return flatbuffers_string_create_str(builder_, name); } // ETDumpGen has the following possible states, ETDumpGen_Init, @@ -169,16 +183,15 @@ int64_t ETDumpGen::create_string_entry(const char* name) { // type again. In this case once we close the allocators table and start pushing // to the events table we cannot push to the allocators table again. void ETDumpGen::check_ready_to_add_events() { - if (etdump_gen_state != ETDumpGen_Adding_Events) { + if (state_ != State::AddingEvents) { ET_CHECK_MSG( - (etdump_gen_state == ETDumpGen_Adding_Allocators || - etdump_gen_state == ETDumpGen_Block_Created), + (state_ == State::AddingAllocators || state_ == State::BlockCreated), "ETDumpGen in an invalid state. 
Cannot add new events now."); - if (etdump_gen_state == ETDumpGen_Adding_Allocators) { - etdump_RunData_allocators_end(builder); + if (state_ == State::AddingAllocators) { + etdump_RunData_allocators_end(builder_); } - etdump_RunData_events_start(builder); - etdump_gen_state = ETDumpGen_Adding_Events; + etdump_RunData_events_start(builder_); + state_ = State::AddingEvents; } } @@ -231,29 +244,29 @@ void ETDumpGen::end_profiling_delegate( check_ready_to_add_events(); // Start building the ProfileEvent entry. - etdump_ProfileEvent_start(builder); - etdump_ProfileEvent_start_time_add(builder, event_tracer_entry.start_time); - etdump_ProfileEvent_end_time_add(builder, end_time); - etdump_ProfileEvent_chain_index_add(builder, chain_id_); - etdump_ProfileEvent_instruction_id_add(builder, debug_handle_); + etdump_ProfileEvent_start(builder_); + etdump_ProfileEvent_start_time_add(builder_, event_tracer_entry.start_time); + etdump_ProfileEvent_end_time_add(builder_, end_time); + etdump_ProfileEvent_chain_index_add(builder_, chain_id_); + etdump_ProfileEvent_instruction_id_add(builder_, debug_handle_); // Delegate debug identifier can either be of a string type or an integer // type. If it's a string type then it's a value of type // flatbuffers_string_ref_t type, whereas if it's an integer type then we // write the integer value directly. if (event_tracer_entry.delegate_event_id_type == DelegateDebugIdType::kInt) { etdump_ProfileEvent_delegate_debug_id_int_add( - builder, event_tracer_entry.event_id); + builder_, event_tracer_entry.event_id); } else { etdump_ProfileEvent_delegate_debug_id_str_add( - builder, event_tracer_entry.event_id); + builder_, event_tracer_entry.event_id); } flatbuffers_uint8_vec_ref_t vec_ref = flatbuffers_uint8_vec_create_pe( - builder, (const uint8_t*)metadata, metadata_len); - etdump_ProfileEvent_delegate_debug_metadata_add(builder, vec_ref); - etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder); - etdump_RunData_events_push_start(builder); - etdump_Event_profile_event_add(builder, id); - etdump_RunData_events_push_end(builder); + builder_, (const uint8_t*)metadata, metadata_len); + etdump_ProfileEvent_delegate_debug_metadata_add(builder_, vec_ref); + etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder_); + etdump_RunData_events_push_start(builder_); + etdump_Event_profile_event_add(builder_, id); + etdump_RunData_events_push_end(builder_); } void ETDumpGen::log_profiling_delegate( @@ -268,24 +281,24 @@ void ETDumpGen::log_profiling_delegate( "Only name or delegate_debug_index can be valid. Check DelegateMappingBuilder documentation for more details."); check_ready_to_add_events(); int64_t string_id = name != nullptr ? 
create_string_entry(name) : -1; - etdump_ProfileEvent_start(builder); - etdump_ProfileEvent_start_time_add(builder, start_time); - etdump_ProfileEvent_end_time_add(builder, end_time); - etdump_ProfileEvent_chain_index_add(builder, chain_id_); - etdump_ProfileEvent_instruction_id_add(builder, debug_handle_); + etdump_ProfileEvent_start(builder_); + etdump_ProfileEvent_start_time_add(builder_, start_time); + etdump_ProfileEvent_end_time_add(builder_, end_time); + etdump_ProfileEvent_chain_index_add(builder_, chain_id_); + etdump_ProfileEvent_instruction_id_add(builder_, debug_handle_); if (string_id == -1) { etdump_ProfileEvent_delegate_debug_id_int_add( - builder, delegate_debug_index); + builder_, delegate_debug_index); } else { - etdump_ProfileEvent_delegate_debug_id_str_add(builder, string_id); + etdump_ProfileEvent_delegate_debug_id_str_add(builder_, string_id); } flatbuffers_uint8_vec_ref_t vec_ref = flatbuffers_uint8_vec_create_pe( - builder, (const uint8_t*)metadata, metadata_len); - etdump_ProfileEvent_delegate_debug_metadata_add(builder, vec_ref); - etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder); - etdump_RunData_events_push_start(builder); - etdump_Event_profile_event_add(builder, id); - etdump_RunData_events_push_end(builder); + builder_, (const uint8_t*)metadata, metadata_len); + etdump_ProfileEvent_delegate_debug_metadata_add(builder_, vec_ref); + etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder_); + etdump_RunData_events_push_start(builder_); + etdump_Event_profile_event_add(builder_, id); + etdump_RunData_events_push_end(builder_); } void ETDumpGen::log_intermediate_output_delegate( @@ -331,7 +344,7 @@ void ETDumpGen::log_intermediate_output_delegate_helper( ET_CHECK_MSG( (name == nullptr) ^ (delegate_debug_index == -1), "Only name or delegate_debug_index can be valid. Check DelegateMappingBuilder documentation for more details."); - if (debug_buffer.empty()) { + if (debug_buffer_.empty()) { ET_CHECK_MSG(0, "Must pre-set debug buffer with set_debug_buffer()\n"); return; } @@ -339,71 +352,71 @@ void ETDumpGen::log_intermediate_output_delegate_helper( check_ready_to_add_events(); int64_t string_id = name != nullptr ? 
create_string_entry(name) : -1; - etdump_DebugEvent_start(builder); + etdump_DebugEvent_start(builder_); - etdump_DebugEvent_chain_index_add(builder, chain_id_); - etdump_DebugEvent_instruction_id_add(builder, debug_handle_); + etdump_DebugEvent_chain_index_add(builder_, chain_id_); + etdump_DebugEvent_instruction_id_add(builder_, debug_handle_); if (string_id == -1) { - etdump_DebugEvent_delegate_debug_id_int_add(builder, delegate_debug_index); + etdump_DebugEvent_delegate_debug_id_int_add(builder_, delegate_debug_index); } else { - etdump_DebugEvent_delegate_debug_id_str_add(builder, string_id); + etdump_DebugEvent_delegate_debug_id_str_add(builder_, string_id); } // Check the type of `output` then call the corresponding logging functions if constexpr (std::is_same::value) { long offset = copy_tensor_to_debug_buffer(output); - etdump_Tensor_ref_t tensor_ref = add_tensor_entry(builder, output, offset); + etdump_Tensor_ref_t tensor_ref = add_tensor_entry(builder_, output, offset); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_Tensor); - etdump_Value_tensor_add(builder, tensor_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_Tensor); + etdump_Value_tensor_add(builder_, tensor_ref); } else if constexpr (std::is_same>::value) { - etdump_Tensor_vec_start(builder); + etdump_Tensor_vec_start(builder_); for (size_t i = 0; i < output.size(); ++i) { long offset = copy_tensor_to_debug_buffer(output[i]); etdump_Tensor_vec_push( - builder, add_tensor_entry(builder, output[i], offset)); + builder_, add_tensor_entry(builder_, output[i], offset)); } - etdump_Tensor_vec_ref_t tensor_vec_ref = etdump_Tensor_vec_end(builder); + etdump_Tensor_vec_ref_t tensor_vec_ref = etdump_Tensor_vec_end(builder_); etdump_TensorList_ref_t tensor_list_ref = - etdump_TensorList_create(builder, tensor_vec_ref); + etdump_TensorList_create(builder_, tensor_vec_ref); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_TensorList); - etdump_Value_tensor_list_add(builder, tensor_list_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_TensorList); + etdump_Value_tensor_list_add(builder_, tensor_list_ref); } else if constexpr (std::is_same::value) { - auto int_ref = etdump_Int_create(builder, output); + auto int_ref = etdump_Int_create(builder_, output); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_Int); - etdump_Value_int_value_add(builder, int_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_Int); + etdump_Value_int_value_add(builder_, int_ref); } else if constexpr (std::is_same::value) { - auto double_ref = etdump_Double_create(builder, output); + auto double_ref = etdump_Double_create(builder_, output); - etdump_Value_start(builder); - etdump_Value_double_value_add(builder, double_ref); - etdump_Value_val_add(builder, etdump_ValueType_Double); + etdump_Value_start(builder_); + etdump_Value_double_value_add(builder_, double_ref); + etdump_Value_val_add(builder_, etdump_ValueType_Double); } else if constexpr (std::is_same::value) { flatbuffers_bool_t flatbuffer_bool_val = output ? 
FLATBUFFERS_TRUE : FLATBUFFERS_FALSE; - auto bool_ref = etdump_Bool_create(builder, flatbuffer_bool_val); + auto bool_ref = etdump_Bool_create(builder_, flatbuffer_bool_val); - etdump_Value_start(builder); - etdump_Value_bool_value_add(builder, bool_ref); - etdump_Value_val_add(builder, etdump_ValueType_Bool); + etdump_Value_start(builder_); + etdump_Value_bool_value_add(builder_, bool_ref); + etdump_Value_val_add(builder_, etdump_ValueType_Bool); } else { ET_CHECK_MSG(0, "Unsupported output type for intermediate logging\n"); } - auto value_ref = etdump_Value_end(builder); - etdump_DebugEvent_debug_entry_add(builder, value_ref); + auto value_ref = etdump_Value_end(builder_); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); - etdump_DebugEvent_ref_t debug_event = etdump_DebugEvent_end(builder); + etdump_DebugEvent_ref_t debug_event = etdump_DebugEvent_end(builder_); - etdump_RunData_events_push_start(builder); - etdump_Event_debug_event_add(builder, debug_event); - etdump_RunData_events_push_end(builder); + etdump_RunData_events_push_start(builder_); + etdump_Event_debug_event_add(builder_, debug_event); + etdump_RunData_events_push_end(builder_); } void ETDumpGen::end_profiling(EventTracerEntry prof_entry) { @@ -413,32 +426,31 @@ void ETDumpGen::end_profiling(EventTracerEntry prof_entry) { "Delegate events must use end_profiling_delegate to mark the end of a delegate profiling event."); check_ready_to_add_events(); - etdump_ProfileEvent_start(builder); - etdump_ProfileEvent_start_time_add(builder, prof_entry.start_time); - etdump_ProfileEvent_end_time_add(builder, end_time); - etdump_ProfileEvent_chain_index_add(builder, prof_entry.chain_id); - etdump_ProfileEvent_instruction_id_add(builder, prof_entry.debug_handle); + etdump_ProfileEvent_start(builder_); + etdump_ProfileEvent_start_time_add(builder_, prof_entry.start_time); + etdump_ProfileEvent_end_time_add(builder_, end_time); + etdump_ProfileEvent_chain_index_add(builder_, prof_entry.chain_id); + etdump_ProfileEvent_instruction_id_add(builder_, prof_entry.debug_handle); if (prof_entry.event_id != -1) { - etdump_ProfileEvent_name_add(builder, prof_entry.event_id); + etdump_ProfileEvent_name_add(builder_, prof_entry.event_id); } - etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder); - etdump_RunData_events_push_start(builder); - etdump_Event_profile_event_add(builder, id); - etdump_RunData_events_push_end(builder); + etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder_); + etdump_RunData_events_push_start(builder_); + etdump_Event_profile_event_add(builder_, id); + etdump_RunData_events_push_end(builder_); } AllocatorID ETDumpGen::track_allocator(const char* name) { ET_CHECK_MSG( - (etdump_gen_state == ETDumpGen_Block_Created || - etdump_gen_state == ETDumpGen_Adding_Allocators), + (state_ == State::BlockCreated || state_ == State::AddingAllocators), "Allocators can only be added immediately after a new block is created and before any events are added."); - if (etdump_gen_state != ETDumpGen_Adding_Allocators) { - etdump_RunData_allocators_start(builder); - etdump_gen_state = ETDumpGen_Adding_Allocators; + if (state_ != State::AddingAllocators) { + etdump_RunData_allocators_start(builder_); + state_ = State::AddingAllocators; } flatbuffers_string_ref_t ref = create_string_entry(name); - etdump_RunData_allocators_push_create(builder, ref); - return etdump_RunData_allocators_reserved_len(builder); + etdump_RunData_allocators_push_create(builder_, ref); + return 
etdump_RunData_allocators_reserved_len(builder_); } void ETDumpGen::track_allocation( @@ -446,43 +458,43 @@ void ETDumpGen::track_allocation( size_t allocation_size) { check_ready_to_add_events(); - etdump_RunData_events_push_start(builder); - etdump_Event_allocation_event_create(builder, allocator_id, allocation_size); - etdump_RunData_events_push_end(builder); + etdump_RunData_events_push_start(builder_); + etdump_Event_allocation_event_create(builder_, allocator_id, allocation_size); + etdump_RunData_events_push_end(builder_); } -etdump_result ETDumpGen::get_etdump_data() { - etdump_result result; - if (etdump_gen_state == ETDumpGen_Adding_Events) { - etdump_RunData_events_end(builder); - } else if (etdump_gen_state == ETDumpGen_Adding_Allocators) { - etdump_RunData_allocators_end(builder); - } else if (etdump_gen_state == ETDumpGen_Init) { +ETDumpResult ETDumpGen::get_etdump_data() { + ETDumpResult result; + if (state_ == State::AddingEvents) { + etdump_RunData_events_end(builder_); + } else if (state_ == State::AddingAllocators) { + etdump_RunData_allocators_end(builder_); + } else if (state_ == State::Init) { result.buf = nullptr; result.size = 0; return result; } - etdump_ETDump_run_data_push_end(builder); - etdump_ETDump_run_data_end(builder); - etdump_ETDump_ref_t root = etdump_ETDump_end(builder); - flatbuffers_buffer_end(builder, root); - if (num_blocks == 0) { + etdump_ETDump_run_data_push_end(builder_); + etdump_ETDump_run_data_end(builder_); + etdump_ETDump_ref_t root = etdump_ETDump_end(builder_); + flatbuffers_buffer_end(builder_, root); + if (num_blocks_ == 0) { result = {nullptr, 0}; } else { - if (alloc.data) { - result.buf = alloc.front_cursor; - result.size = alloc.out_size - alloc.front_left; + if (alloc_.data) { + result.buf = alloc_.front_cursor; + result.size = alloc_.out_size - alloc_.front_left; } else { result.buf = - flatcc_builder_finalize_aligned_buffer(builder, &result.size); + flatcc_builder_finalize_aligned_buffer(builder_, &result.size); } } - etdump_gen_state = ETDumpGen_Done; + state_ = State::Done; return result; } void ETDumpGen::set_debug_buffer(Span buffer) { - debug_buffer = buffer; + debug_buffer_ = buffer; } size_t ETDumpGen::copy_tensor_to_debug_buffer(exec_aten::Tensor tensor) { @@ -490,94 +502,94 @@ size_t ETDumpGen::copy_tensor_to_debug_buffer(exec_aten::Tensor tensor) { return static_cast(-1); } uint8_t* offset_ptr = - alignPointer(debug_buffer.data() + debug_buffer_offset, 64); - debug_buffer_offset = (offset_ptr - debug_buffer.data()) + tensor.nbytes(); + alignPointer(debug_buffer_.data() + debug_buffer_offset_, 64); + debug_buffer_offset_ = (offset_ptr - debug_buffer_.data()) + tensor.nbytes(); ET_CHECK_MSG( - debug_buffer_offset <= debug_buffer.size(), + debug_buffer_offset_ <= debug_buffer_.size(), "Ran out of space to store intermediate outputs."); memcpy(offset_ptr, tensor.const_data_ptr(), tensor.nbytes()); - return (size_t)(offset_ptr - debug_buffer.data()); + return (size_t)(offset_ptr - debug_buffer_.data()); } void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) { - if (debug_buffer.empty()) { + if (debug_buffer_.empty()) { return; } check_ready_to_add_events(); - etdump_DebugEvent_start(builder); + etdump_DebugEvent_start(builder_); - etdump_DebugEvent_chain_index_add(builder, chain_id_); - etdump_DebugEvent_instruction_id_add(builder, debug_handle_); + etdump_DebugEvent_chain_index_add(builder_, chain_id_); + etdump_DebugEvent_instruction_id_add(builder_, debug_handle_); switch (evalue.tag) { case 
Tag::Tensor: { exec_aten::Tensor tensor = evalue.toTensor(); long offset = copy_tensor_to_debug_buffer(tensor); etdump_Tensor_ref_t tensor_ref = - add_tensor_entry(builder, tensor, offset); + add_tensor_entry(builder_, tensor, offset); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_Tensor); - etdump_Value_tensor_add(builder, tensor_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_Tensor); + etdump_Value_tensor_add(builder_, tensor_ref); if (evalue_type == LoggedEValueType::kProgramOutput) { - auto bool_ref = etdump_Bool_create(builder, FLATBUFFERS_TRUE); - etdump_Value_output_add(builder, bool_ref); + auto bool_ref = etdump_Bool_create(builder_, FLATBUFFERS_TRUE); + etdump_Value_output_add(builder_, bool_ref); } - auto value_ref = etdump_Value_end(builder); + auto value_ref = etdump_Value_end(builder_); - etdump_DebugEvent_debug_entry_add(builder, value_ref); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); break; } case Tag::ListTensor: { exec_aten::ArrayRef tensors = evalue.toTensorList(); - etdump_Tensor_vec_start(builder); + etdump_Tensor_vec_start(builder_); for (size_t i = 0; i < tensors.size(); ++i) { long offset = copy_tensor_to_debug_buffer(tensors[i]); etdump_Tensor_vec_push( - builder, add_tensor_entry(builder, tensors[i], offset)); + builder_, add_tensor_entry(builder_, tensors[i], offset)); } - etdump_Tensor_vec_ref_t tensor_vec_ref = etdump_Tensor_vec_end(builder); + etdump_Tensor_vec_ref_t tensor_vec_ref = etdump_Tensor_vec_end(builder_); etdump_TensorList_ref_t tensor_list_ref = - etdump_TensorList_create(builder, tensor_vec_ref); + etdump_TensorList_create(builder_, tensor_vec_ref); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_TensorList); - etdump_Value_tensor_list_add(builder, tensor_list_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_TensorList); + etdump_Value_tensor_list_add(builder_, tensor_list_ref); if (evalue_type == LoggedEValueType::kProgramOutput) { - auto bool_ref = etdump_Bool_create(builder, FLATBUFFERS_TRUE); - etdump_Value_output_add(builder, bool_ref); + auto bool_ref = etdump_Bool_create(builder_, FLATBUFFERS_TRUE); + etdump_Value_output_add(builder_, bool_ref); } - auto value_ref = etdump_Value_end(builder); + auto value_ref = etdump_Value_end(builder_); - etdump_DebugEvent_debug_entry_add(builder, value_ref); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); break; } case Tag::Int: { int64_t val = evalue.toInt(); - auto int_ref = etdump_Int_create(builder, val); + auto int_ref = etdump_Int_create(builder_, val); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_Int); - etdump_Value_int_value_add(builder, int_ref); - auto value_ref = etdump_Value_end(builder); - etdump_DebugEvent_debug_entry_add(builder, value_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_Int); + etdump_Value_int_value_add(builder_, int_ref); + auto value_ref = etdump_Value_end(builder_); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); break; } case Tag::Double: { double val = evalue.toDouble(); - auto double_ref = etdump_Double_create(builder, val); + auto double_ref = etdump_Double_create(builder_, val); - etdump_Value_start(builder); - etdump_Value_double_value_add(builder, double_ref); - etdump_Value_val_add(builder, etdump_ValueType_Double); - auto value_ref = etdump_Value_end(builder); - etdump_DebugEvent_debug_entry_add(builder, 
value_ref); + etdump_Value_start(builder_); + etdump_Value_double_value_add(builder_, double_ref); + etdump_Value_val_add(builder_, etdump_ValueType_Double); + auto value_ref = etdump_Value_end(builder_); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); break; } @@ -585,13 +597,13 @@ void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) { case Tag::Bool: { flatbuffers_bool_t flatbuffer_bool_val = evalue.toBool() ? FLATBUFFERS_TRUE : FLATBUFFERS_FALSE; - auto bool_ref = etdump_Bool_create(builder, flatbuffer_bool_val); + auto bool_ref = etdump_Bool_create(builder_, flatbuffer_bool_val); - etdump_Value_start(builder); - etdump_Value_bool_value_add(builder, bool_ref); - etdump_Value_val_add(builder, etdump_ValueType_Bool); - auto value_ref = etdump_Value_end(builder); - etdump_DebugEvent_debug_entry_add(builder, value_ref); + etdump_Value_start(builder_); + etdump_Value_bool_value_add(builder_, bool_ref); + etdump_Value_val_add(builder_, etdump_ValueType_Bool); + auto value_ref = etdump_Value_end(builder_); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); break; } @@ -604,20 +616,20 @@ void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) { break; } - etdump_DebugEvent_ref_t debug_event = etdump_DebugEvent_end(builder); + etdump_DebugEvent_ref_t debug_event = etdump_DebugEvent_end(builder_); - etdump_RunData_events_push_start(builder); - etdump_Event_debug_event_add(builder, debug_event); - etdump_RunData_events_push_end(builder); + etdump_RunData_events_push_start(builder_); + etdump_Event_debug_event_add(builder_, debug_event); + etdump_RunData_events_push_end(builder_); } size_t ETDumpGen::get_num_blocks() { - return num_blocks; + return num_blocks_; } bool ETDumpGen::is_static_etdump() { - return alloc.data != nullptr; + return alloc_.data != nullptr; } -} // namespace executor -} // namespace torch +} // namespace etdump +} // namespace executorch diff --git a/devtools/etdump/etdump_flatcc.h b/devtools/etdump/etdump_flatcc.h index e56d09f8107..0bd891a0970 100644 --- a/devtools/etdump/etdump_flatcc.h +++ b/devtools/etdump/etdump_flatcc.h @@ -8,33 +8,22 @@ #pragma once -#include #include -#include "executorch/runtime/core/event_tracer.h" -#include "executorch/runtime/platform/platform.h" + +#include +#include +#include #define ETDUMP_VERSION 0 struct flatcc_builder; -namespace torch { -namespace executor { - -enum ETDumpGen_State { - ETDumpGen_Init, - ETDumpGen_Block_Created, - ETDumpGen_Adding_Allocators, - ETDumpGen_Adding_Events, - ETDumpGen_Done, -}; +namespace executorch { +namespace etdump { -struct etdump_result { - void* buf; - size_t size; -}; - -struct etdump_static_allocator { - etdump_static_allocator() {} +namespace internal { +struct ETDumpStaticAllocator { + ETDumpStaticAllocator() = default; void set_buffer(uint8_t* buffer, size_t total_buf_size, size_t alloc_buf_size) { @@ -64,61 +53,72 @@ struct etdump_static_allocator { // Bytes left in front of front_cursor. 
size_t front_left{0}; }; +} // namespace internal + +struct ETDumpResult { + void* buf; + size_t size; +}; -class ETDumpGen : public EventTracer { +class ETDumpGen : public ::executorch::runtime::EventTracer { public: - ETDumpGen(Span buffer = {nullptr, (size_t)0}); + ETDumpGen(::executorch::runtime::Span buffer = {nullptr, (size_t)0}); ~ETDumpGen() override; void clear_builder(); void create_event_block(const char* name) override; - virtual EventTracerEntry start_profiling( + virtual ::executorch::runtime::EventTracerEntry start_profiling( const char* name, - ChainID chain_id = -1, - DebugHandle debug_handle = 0) override; - virtual void end_profiling(EventTracerEntry prof_entry) override; - virtual EventTracerEntry start_profiling_delegate( + ::executorch::runtime::ChainID chain_id = -1, + ::executorch::runtime::DebugHandle debug_handle = 0) override; + virtual void end_profiling( + ::executorch::runtime::EventTracerEntry prof_entry) override; + virtual ::executorch::runtime::EventTracerEntry start_profiling_delegate( const char* name, - DebugHandle delegate_debug_index) override; + ::executorch::runtime::DebugHandle delegate_debug_index) override; virtual void end_profiling_delegate( - EventTracerEntry prof_entry, + ::executorch::runtime::EventTracerEntry prof_entry, const void* metadata, size_t metadata_len) override; virtual void log_profiling_delegate( const char* name, - DebugHandle delegate_debug_index, + ::executorch::runtime::DebugHandle delegate_debug_index, et_timestamp_t start_time, et_timestamp_t end_time, const void* metadata, size_t metadata_len) override; - virtual void track_allocation(AllocatorID id, size_t size) override; - virtual AllocatorID track_allocator(const char* name) override; + virtual void track_allocation( + ::executorch::runtime::AllocatorID id, + size_t size) override; + virtual ::executorch::runtime::AllocatorID track_allocator( + const char* name) override; virtual void log_evalue( - const EValue& evalue, - LoggedEValueType evalue_type = - LoggedEValueType::kIntermediateOutput) override; + const ::executorch::runtime::EValue& evalue, + ::executorch::runtime::LoggedEValueType evalue_type = + ::executorch::runtime::LoggedEValueType::kIntermediateOutput) + override; /** * Log an intermediate tensor output from a delegate. */ virtual void log_intermediate_output_delegate( const char* name, - DebugHandle delegate_debug_index, - const Tensor& output) override; + ::executorch::runtime::DebugHandle delegate_debug_index, + const exec_aten::Tensor& output) override; /** * Log an intermediate tensor array output from a delegate. */ virtual void log_intermediate_output_delegate( const char* name, - DebugHandle delegate_debug_index, - const ArrayRef output) override; + ::executorch::runtime::DebugHandle delegate_debug_index, + const ::executorch::runtime::ArrayRef output) override; /** * Log an intermediate int output from a delegate. 
*/ virtual void log_intermediate_output_delegate( const char* name, - DebugHandle delegate_debug_index, + ::executorch::runtime::DebugHandle delegate_debug_index, const int& output) override; /** @@ -126,7 +126,7 @@ class ETDumpGen : public EventTracer { */ virtual void log_intermediate_output_delegate( const char* name, - DebugHandle delegate_debug_index, + ::executorch::runtime::DebugHandle delegate_debug_index, const bool& output) override; /** @@ -134,22 +134,22 @@ class ETDumpGen : public EventTracer { */ virtual void log_intermediate_output_delegate( const char* name, - DebugHandle delegate_debug_index, + ::executorch::runtime::DebugHandle delegate_debug_index, const double& output) override; - void set_debug_buffer(Span buffer); - etdump_result get_etdump_data(); + void set_debug_buffer(::executorch::runtime::Span buffer); + ETDumpResult get_etdump_data(); size_t get_num_blocks(); bool is_static_etdump(); void reset(); private: - struct flatcc_builder* builder; - size_t num_blocks = 0; - Span debug_buffer; - size_t debug_buffer_offset = 0; - int bundled_input_index = -1; - ETDumpGen_State etdump_gen_state = ETDumpGen_Init; - struct etdump_static_allocator alloc; + enum class State { + Init, + BlockCreated, + AddingAllocators, + AddingEvents, + Done, + }; void check_ready_to_add_events(); int64_t create_string_entry(const char* name); @@ -162,9 +162,26 @@ class ETDumpGen : public EventTracer { template void log_intermediate_output_delegate_helper( const char* name, - DebugHandle delegate_debug_index, + ::executorch::runtime::DebugHandle delegate_debug_index, const T& output); + + struct flatcc_builder* builder_; + size_t num_blocks_ = 0; + ::executorch::runtime::Span debug_buffer_; + size_t debug_buffer_offset_ = 0; + int bundled_input_index_ = -1; + State state_ = State::Init; + struct internal::ETDumpStaticAllocator alloc_; }; +} // namespace etdump +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using etdump_result = ::executorch::etdump::ETDumpResult; +using ::executorch::etdump::ETDumpGen; } // namespace executor } // namespace torch diff --git a/devtools/etdump/etdump_schema_flatcc.fbs b/devtools/etdump/etdump_schema_flatcc.fbs index d90d278f5fc..1244ebd4aeb 100644 --- a/devtools/etdump/etdump_schema_flatcc.fbs +++ b/devtools/etdump/etdump_schema_flatcc.fbs @@ -76,6 +76,10 @@ table DebugEvent { // String based delegate debug identifier. delegate_debug_id_str:string; + + // Name assigned to this debug event by the runtime. If it is an operator + // call this will just be the name of the operator that was executed. + name:string; } // All the details pertaining to an allocation done in the runtime. The main diff --git a/devtools/etdump/scalar_type.fbs b/devtools/etdump/scalar_type.fbs index fdfe550e9e3..a8da080c679 100644 --- a/devtools/etdump/scalar_type.fbs +++ b/devtools/etdump/scalar_type.fbs @@ -14,6 +14,7 @@ enum ScalarType : byte { SHORT = 2, INT = 3, LONG = 4, + HALF = 5, FLOAT = 6, DOUBLE = 7, BOOL = 11, @@ -24,7 +25,6 @@ enum ScalarType : byte { QUINT4X2 = 16, QUINT2X4 = 17, // Types currently not implemented. 
- // Half = 5, // COMPLEXHALF = 8, // COMPLEXFLOAT = 9, // COMPLEXDOUBLE = 10, diff --git a/devtools/etdump/schema_flatcc.py b/devtools/etdump/schema_flatcc.py index f19f328d3fa..404fa1c9758 100644 --- a/devtools/etdump/schema_flatcc.py +++ b/devtools/etdump/schema_flatcc.py @@ -93,6 +93,7 @@ class Value: @dataclass class DebugEvent: + name: Optional[str] chain_index: int instruction_id: int delegate_debug_id_int: Optional[int] diff --git a/devtools/etdump/targets.bzl b/devtools/etdump/targets.bzl index 6d548ce650f..ddbb35eab74 100644 --- a/devtools/etdump/targets.bzl +++ b/devtools/etdump/targets.bzl @@ -95,9 +95,11 @@ def define_common_targets(): "etdump_flatcc.cpp", "emitter.cpp", ], + headers = [ + "emitter.h", + ], exported_headers = [ "etdump_flatcc.h", - "emitter.h", ], deps = [ "//executorch/runtime/platform:platform", diff --git a/devtools/etdump/tests/etdump_test.cpp b/devtools/etdump/tests/etdump_test.cpp index de8c0abc39d..b750e21eb07 100644 --- a/devtools/etdump/tests/etdump_test.cpp +++ b/devtools/etdump/tests/etdump_test.cpp @@ -20,8 +20,20 @@ #include #include -namespace torch { -namespace executor { +using ::exec_aten::ScalarType; +using ::exec_aten::Tensor; +using ::executorch::etdump::ETDumpGen; +using ::executorch::etdump::ETDumpResult; +using ::executorch::runtime::AllocatorID; +using ::executorch::runtime::ArrayRef; +using ::executorch::runtime::BoxedEvalueList; +using ::executorch::runtime::DelegateDebugIdType; +using ::executorch::runtime::EValue; +using ::executorch::runtime::EventTracerEntry; +using ::executorch::runtime::LoggedEValueType; +using ::executorch::runtime::Span; +using ::executorch::runtime::Tag; +using ::executorch::runtime::testing::TensorFactory; class ProfilerETDumpTest : public ::testing::Test { protected: @@ -49,7 +61,7 @@ TEST_F(ProfilerETDumpTest, SingleProfileEvent) { EventTracerEntry entry = etdump_gen[i]->start_profiling("test_event", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -105,7 +117,7 @@ TEST_F(ProfilerETDumpTest, EmptyBlocks) { etdump_gen[i]->start_profiling("test_event_1", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -160,7 +172,7 @@ TEST_F(ProfilerETDumpTest, AllocationEvents) { TEST_F(ProfilerETDumpTest, DebugEvent) { for (size_t i = 0; i < 2; i++) { - testing::TensorFactory tf; + TensorFactory tf; EValue evalue(tf.ones({3, 2})); etdump_gen[i]->create_event_block("test_block"); @@ -189,7 +201,7 @@ TEST_F(ProfilerETDumpTest, DebugEvent) { TEST_F(ProfilerETDumpTest, DebugEventTensorList) { for (size_t i = 0; i < 2; i++) { - testing::TensorFactory tf; + TensorFactory tf; exec_aten::Tensor storage[2] = {tf.ones({3, 2}), tf.ones({3, 2})}; EValue evalue_1(storage[0]); EValue evalue_2(storage[1]); @@ -212,7 +224,7 @@ TEST_F(ProfilerETDumpTest, DebugEventTensorList) { } TEST_F(ProfilerETDumpTest, VerifyLogging) { - testing::TensorFactory tf; + TensorFactory tf; EValue evalue(tf.ones({3, 2})); for (size_t i = 0; i < 2; i++) { @@ -225,7 +237,7 @@ TEST_F(ProfilerETDumpTest, VerifyLogging) { etdump_gen[i]->log_evalue(evalue); etdump_gen[i]->log_evalue(evalue, LoggedEValueType::kProgramOutput); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = 
etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -297,7 +309,7 @@ TEST_F(ProfilerETDumpTest, MultipleBlocksWithEvents) { entry = etdump_gen[i]->start_profiling("test_event", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -363,7 +375,7 @@ TEST_F(ProfilerETDumpTest, VerifyData) { entry = etdump_gen[i]->start_profiling("test_event2", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -421,7 +433,7 @@ TEST_F(ProfilerETDumpTest, LogDelegateIntermediateOutput) { Span buffer((uint8_t*)ptr, 2048); etdump_gen[i]->create_event_block("test_block"); - testing::TensorFactory tf; + TensorFactory tf; ET_EXPECT_DEATH( etdump_gen[i]->log_intermediate_output_delegate( @@ -462,7 +474,7 @@ TEST_F(ProfilerETDumpTest, LogDelegateIntermediateOutput) { static_cast(-1), true); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -474,7 +486,7 @@ TEST_F(ProfilerETDumpTest, LogDelegateIntermediateOutput) { } TEST_F(ProfilerETDumpTest, VerifyDelegateIntermediateLogging) { - testing::TensorFactory tf; + TensorFactory tf; EValue evalue(tf.ones({3, 2})); for (size_t i = 0; i < 2; i++) { @@ -492,7 +504,7 @@ TEST_F(ProfilerETDumpTest, VerifyDelegateIntermediateLogging) { etdump_gen[i]->log_intermediate_output_delegate( nullptr, 258, tf.ones({5, 6})); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -603,7 +615,7 @@ TEST_F(ProfilerETDumpTest, LogDelegateEvents) { etdump_gen[i]->end_profiling(entry), "Delegate events must use end_profiling_delegate to mark the end of a delegate profiling event."); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -681,7 +693,7 @@ TEST_F(ProfilerETDumpTest, WriteAfterGetETDumpData) { etdump_gen[i]->start_profiling("test_event", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -712,6 +724,3 @@ TEST_F(ProfilerETDumpTest, WriteAfterGetETDumpData) { } } } - -} // namespace executor -} // namespace torch diff --git a/devtools/etdump/tests/serialize_test.py b/devtools/etdump/tests/serialize_test.py index 1a7f3bd93f5..5cab3e5b2ba 100644 --- a/devtools/etdump/tests/serialize_test.py +++ b/devtools/etdump/tests/serialize_test.py @@ -83,6 +83,7 @@ def get_sample_etdump_flatcc() -> flatcc.ETDumpFlatCC: profile_event=None, allocation_event=None, debug_event=flatcc.DebugEvent( + name="test_debug_event", chain_index=1, instruction_id=0, delegate_debug_id_str="56", diff --git a/devtools/inspector/_inspector.py b/devtools/inspector/_inspector.py index f98e3cd3a56..0539d4f5e4b 100644 --- a/devtools/inspector/_inspector.py +++ b/devtools/inspector/_inspector.py @@ -4,6 +4,8 @@ # This source code is licensed under the 
BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import dataclasses import logging import sys @@ -39,6 +41,7 @@ ) from executorch.devtools.etrecord import ETRecord, parse_etrecord from executorch.devtools.inspector._inspector_utils import ( + calculate_time_scale_factor, create_debug_handle_to_op_node_mapping, EDGE_DIALECT_GRAPH_KEY, EXCLUDED_COLUMNS_WHEN_PRINTING, @@ -52,7 +55,6 @@ is_inference_output_equal, ProgramOutput, RESERVED_FRAMEWORK_EVENT_NAMES, - TIME_SCALE_DICT, TimeScale, verify_debug_data_equivalence, ) @@ -150,6 +152,7 @@ def _gen_from_event(event: ProfileEvent) -> "ProfileEventSignature": # Signature of a DebugEvent @dataclass(frozen=True, order=True) class DebugEventSignature: + name: str = "" instruction_id: Optional[int] = -1 delegate_id: Optional[int] = None delegate_id_str: Optional[str] = None @@ -163,6 +166,7 @@ def _gen_from_event(event: DebugEvent) -> "DebugEventSignature": The Signature will convert these back to the intended None value """ return DebugEventSignature( + event.name or "", event.instruction_id if event.instruction_id != -1 else None, event.delegate_debug_id_int if event.delegate_debug_id_int != -1 else None, event.delegate_debug_id_str if event.delegate_debug_id_str != "" else None, @@ -468,46 +472,63 @@ def _calculate_elapsed_time(start_time, end_time): return elapsed_time @staticmethod - def _populate_profiling_related_fields( + def _populate_event_signature_fields( ret_event: "Event", - profile_event_signature: Optional[ProfileEventSignature], - events: List[InstructionEvent], - scale_factor: float, + event_signature: Optional[Union[ProfileEventSignature, DebugEventSignature]], ) -> None: """ Given a partially constructed Event, populate the fields related to - the profile events + the profile event signature or debug event signature Fields Updated: name delegate_debug_identifier is_delegated_op - perf_data - delegate_debug_metadatas """ - - # Fill out fields from profile event signature - if profile_event_signature is not None: - if profile_event_signature.delegate_id is not None: # 0 is a valid value - delegate_debug_identifier = profile_event_signature.delegate_id + # TODO: T201347372 Push the None check to earlier in the stack. 
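        # Resolution below: an integer delegate_id wins (0 is a valid value),
        # then delegate_id_str; if neither is set, the event is treated as
        # non-delegated and keeps the signature's own name.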
+ if event_signature is not None: + if event_signature.delegate_id is not None: # 0 is a valid value + delegate_debug_identifier = event_signature.delegate_id else: - delegate_debug_identifier = ( - profile_event_signature.delegate_id_str or None - ) + delegate_debug_identifier = event_signature.delegate_id_str or None # Use the delegate identifier as the event name if delegated is_delegated_op = delegate_debug_identifier is not None name = ( - profile_event_signature.name + event_signature.name if not is_delegated_op else str(delegate_debug_identifier) ) # Update fields - ret_event.name = name + # This is for older versions of etdump that don't have the name field for debug events; in that case we don't update the name field + if name: + ret_event.name = name ret_event.delegate_debug_identifier = delegate_debug_identifier ret_event.is_delegated_op = is_delegated_op + @staticmethod + def _populate_profiling_related_fields( + ret_event: "Event", + profile_event_signature: Optional[ProfileEventSignature], + events: List[InstructionEvent], + scale_factor: float, + ) -> None: + """ + Given a partially constructed Event, populate the fields related to + the profile events + + Fields Updated: + name + delegate_debug_identifier + is_delegated_op + perf_data + delegate_debug_metadatas + """ + + # Fill out fields from profile event signature + Event._populate_event_signature_fields(ret_event, profile_event_signature) + # Fill out fields from profile event data = [] delegate_debug_metadatas = [] @@ -575,9 +596,15 @@ def _populate_debugging_related_fields( the debug events Fields Updated: + name + delegate_debug_identifier + is_delegated_op debug_data """ + # Fill out fields from debug event signature + Event._populate_event_signature_fields(ret_event, debug_event_signature) + debug_data: List[flatcc.Value] = [] for event in events: if (debug_events := event.debug_events) is None: @@ -799,9 +826,7 @@ class GroupedRunInstances: # Construct the EventBlocks event_blocks = [] - scale_factor = ( - TIME_SCALE_DICT[source_time_scale] / TIME_SCALE_DICT[target_time_scale] - ) + scale_factor = calculate_time_scale_factor(source_time_scale, target_time_scale) for run_signature, grouped_run_instance in run_groups.items(): run_group: OrderedDict[EventSignature, List[InstructionEvent]] = ( grouped_run_instance.events @@ -966,6 +991,9 @@ def __init__( debug_buffer_path: Debug buffer file path that contains the debug data referenced by ETDump for intermediate and program outputs. delegate_metadata_parser: Optional function to parse delegate metadata from a Profiling Event. Expected signature of the function is: (delegate_metadata_list: List[bytes]) -> Union[List[str], Dict[str, Any]] + delegate_time_scale_converter: Optional function to convert the time scale of delegate profiling data. If not given, use the conversion ratio of + target_time_scale/source_time_scale. + enable_module_hierarchy: Enable submodules in the operator graph. Defaults to False. 
Returns: None @@ -980,6 +1008,14 @@ def __init__( self._source_time_scale = source_time_scale self._target_time_scale = target_time_scale + if delegate_time_scale_converter is None: + scale_factor = calculate_time_scale_factor( + source_time_scale, target_time_scale + ) + delegate_time_scale_converter = ( + lambda event_name, input_time: input_time / scale_factor + ) + if etrecord is None: self._etrecord = None elif isinstance(etrecord, ETRecord): @@ -1002,10 +1038,10 @@ def __init__( ) self.event_blocks = EventBlock._gen_from_etdump( - etdump, - self._source_time_scale, - self._target_time_scale, - output_buffer, + etdump=etdump, + source_time_scale=self._source_time_scale, + target_time_scale=self._target_time_scale, + output_buffer=output_buffer, delegate_metadata_parser=delegate_metadata_parser, delegate_time_scale_converter=delegate_time_scale_converter, ) diff --git a/devtools/inspector/_inspector_utils.py b/devtools/inspector/_inspector_utils.py index 98b5fdc722f..5f04e2d0413 100644 --- a/devtools/inspector/_inspector_utils.py +++ b/devtools/inspector/_inspector_utils.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import math from enum import Enum from typing import Dict, List, Mapping, Optional, Tuple, TypeAlias, Union @@ -63,6 +65,15 @@ class TimeScale(Enum): } +def calculate_time_scale_factor( + source_time_scale: TimeScale, target_time_scale: TimeScale +) -> float: + """ + Calculate the factor (source divided by target) between two time scales + """ + return TIME_SCALE_DICT[source_time_scale] / TIME_SCALE_DICT[target_time_scale] + + # Model Debug Output InferenceOutput: TypeAlias = Union[ torch.Tensor, List[torch.Tensor], int, float, str, bool, None diff --git a/devtools/inspector/tests/event_blocks_test.py b/devtools/inspector/tests/event_blocks_test.py index 4101035f99b..85b65aa5f34 100644 --- a/devtools/inspector/tests/event_blocks_test.py +++ b/devtools/inspector/tests/event_blocks_test.py @@ -62,6 +62,7 @@ def _gen_sample_profile_event( def _gen_sample_debug_event( instruction_id: int, delegate_debug_id: Optional[Union[int, str]] = None, + name: str = "test_debug_event", ) -> flatcc.DebugEvent: """ Helper for generating test DebugEvents @@ -77,6 +78,7 @@ def _gen_sample_debug_event( ) return flatcc.DebugEvent( + name=name, chain_index=0, instruction_id=instruction_id, delegate_debug_id_int=delegate_debug_id_int, @@ -299,6 +301,42 @@ def _get_sample_etdump_flatcc_profiling_and_debugging() -> flatcc.ETDumpFlatCC: return ETDumpFlatCC(version=0, run_data=[run_data_1, run_data_2, run_data_3]) + @staticmethod + def _get_sample_etdump_flatcc_debug_events_only( + event_name: str, + delegate_debug_id: str, + ) -> flatcc.ETDumpFlatCC: + """ + Helper for getting a sample ETDumpFlatCC object with RunData signature_a + and (debug_event_delegated, debug_event_non_delegated, no profile event) + """ + + debug_event_delegated = TestEventBlock._gen_sample_debug_event( + instruction_id=1, delegate_debug_id=delegate_debug_id, name=event_name + ) + debug_event_non_delegated = TestEventBlock._gen_sample_debug_event( + instruction_id=1, name=event_name + ) + run_data_1 = flatcc.RunData( + name="signature_a", + bundled_input_index=-1, + allocators=[], + events=[ + flatcc.Event( + allocation_event=None, + debug_event=debug_event_delegated, + profile_event=None, + ), + flatcc.Event( + allocation_event=None, + debug_event=debug_event_non_delegated, + profile_event=None, + ), + ], + ) 
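        # run_data_1 holds one delegated and one non-delegated debug event;
        # the Inspector is expected to name the first by its delegate_debug_id
        # and the second by event_name (asserted in
        # test_gen_from_etdump_debug_events_only below).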
+ + return ETDumpFlatCC(version=0, run_data=[run_data_1]) + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tests ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ def test_gen_from_etdump(self) -> None: @@ -370,6 +408,30 @@ def test_gen_from_etdump_inconsistent_debug_data(self) -> None: with self.assertRaises(AssertionError): EventBlock._gen_from_etdump(etdump) + def test_gen_from_etdump_debug_events_only(self) -> None: + """ + Test generation of EventBlocks given an ETDump with only debugging events + + Specifically it tests: + - Correct number of EventBlocks and Events + - Correct name of each Event + """ + event_name = "test_debug_event_only" + delegate_debug_id = "debug_id" + etdump: ETDumpFlatCC = ( + TestEventBlock._get_sample_etdump_flatcc_debug_events_only( + event_name=event_name, + delegate_debug_id=delegate_debug_id, + ) + ) + event_blocks = EventBlock._gen_from_etdump(etdump) + self.assertEqual(len(event_blocks), 1) + self.assertEqual(len(event_blocks[0].events), 2) + # Delegated event uses delegate_debug_id as event name + self.assertEqual(event_blocks[0].events[0].name, delegate_debug_id) + # Non delegated event uses event_name as event name + self.assertEqual(event_blocks[0].events[1].name, event_name) + def test_inspector_event_generation(self) -> None: """ Test Inspector.Event derivation from various ProfileEvent cases diff --git a/devtools/inspector/tests/inspector_test.py b/devtools/inspector/tests/inspector_test.py index 55f0cd10ae9..34c96eef534 100644 --- a/devtools/inspector/tests/inspector_test.py +++ b/devtools/inspector/tests/inspector_test.py @@ -4,13 +4,15 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import random import statistics import tempfile import unittest from contextlib import redirect_stdout -from typing import List +from typing import Callable, List from unittest.mock import patch @@ -32,6 +34,7 @@ InstructionEvent, InstructionEventSignature, ProfileEventSignature, + TimeScale, ) from executorch.exir import ExportedProgram @@ -88,6 +91,33 @@ def test_inspector_constructor(self): # Because we mocked parse_etrecord() to return None, this method shouldn't be called mock_gen_graphs_from_etrecord.assert_not_called() + def test_default_delegate_time_scale_converter(self): + # Create a context manager to patch functions called by Inspector.__init__ + with patch.object( + _inspector, "parse_etrecord", return_value=None + ), patch.object( + _inspector, "gen_etdump_object", return_value=None + ), patch.object( + EventBlock, "_gen_from_etdump" + ) as mock_gen_from_etdump, patch.object( + _inspector, "gen_graphs_from_etrecord" + ), patch.object( + _inspector, "create_debug_handle_to_op_node_mapping" + ): + # Call the constructor of Inspector + Inspector( + etdump_path=ETDUMP_PATH, + etrecord=ETRECORD_PATH, + source_time_scale=TimeScale.US, + target_time_scale=TimeScale.S, + ) + + # Verify delegate_time_scale_converter is set to be a callable + self.assertIsInstance( + mock_gen_from_etdump.call_args.get("delegate_time_scale_converter"), + Callable, + ) + def test_inspector_print_data_tabular(self): # Create a context manager to patch functions called by Inspector.__init__ with patch.object( @@ -288,6 +318,7 @@ def test_populate_debugging_related_fields_raises_for_inconsistent_events(self): ) debug_event_0 = flatcc.DebugEvent( + name="event", chain_index=1, instruction_id=0, delegate_debug_id_int=1, @@ -311,6 +342,7 @@ def test_populate_debugging_related_fields_raises_for_inconsistent_events(self): 
# Note the sizes of this tensor are different from the previous one debug_event_1 = flatcc.DebugEvent( + name="event", chain_index=1, instruction_id=0, delegate_debug_id_int=1, @@ -355,6 +387,7 @@ def test_populate_debugging_related_fields_passes_for_consistent_events(self): ) debug_event_0 = flatcc.DebugEvent( + name="event", chain_index=1, instruction_id=0, delegate_debug_id_int=1, @@ -378,6 +411,7 @@ def test_populate_debugging_related_fields_passes_for_consistent_events(self): # Same as the event above except for offset debug_event_1 = flatcc.DebugEvent( + name="event", chain_index=1, instruction_id=0, delegate_debug_id_int=1, diff --git a/devtools/inspector/tests/inspector_utils_test.py b/devtools/inspector/tests/inspector_utils_test.py index d853732fcc7..73511f5fcd7 100644 --- a/devtools/inspector/tests/inspector_utils_test.py +++ b/devtools/inspector/tests/inspector_utils_test.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import tempfile import unittest from typing import Dict, Tuple @@ -23,11 +25,13 @@ from executorch.devtools.etrecord.tests.etrecord_test import TestETRecord from executorch.devtools.inspector._inspector_utils import ( + calculate_time_scale_factor, create_debug_handle_to_op_node_mapping, EDGE_DIALECT_GRAPH_KEY, find_populated_event, gen_graphs_from_etrecord, is_inference_output_equal, + TimeScale, ) @@ -74,6 +78,7 @@ def test_find_populated_event(self): end_time=2002, ) debug_event = flatcc.DebugEvent( + name="test_debug_event", chain_index=1, instruction_id=0, delegate_debug_id_str="56", @@ -170,6 +175,19 @@ def test_is_inference_output_equal_returns_true_for_same_strs(self): ) ) + def test_calculate_time_scale_factor_second_based(self): + self.assertEqual( + calculate_time_scale_factor(TimeScale.NS, TimeScale.MS), 1000000 + ) + self.assertEqual( + calculate_time_scale_factor(TimeScale.MS, TimeScale.NS), 1 / 1000000 + ) + + def test_calculate_time_scale_factor_cycles(self): + self.assertEqual( + calculate_time_scale_factor(TimeScale.CYCLES, TimeScale.CYCLES), 1 + ) + def gen_mock_operator_graph_with_expected_map() -> ( Tuple[OperatorGraph, Dict[int, OperatorNode]] diff --git a/docs/source/getting-started-setup.md b/docs/source/getting-started-setup.md index d610f020ef2..1fbe35c72bc 100644 --- a/docs/source/getting-started-setup.md +++ b/docs/source/getting-started-setup.md @@ -59,13 +59,11 @@ also work in similar environments. - We recommend `conda` as it provides cross-language support and integrates smoothly with `pip` (Python's built-in package manager) - Otherwise, Python's built-in virtual environment manager `python venv` is a good alternative. -* `g++` version 8 or higher, `clang++` version 8 or higher, or another - C++17-compatible toolchain that supports GNU C-style [statement - expressions](https://gcc.gnu.org/onlinedocs/gcc/Statement-Exprs.html) (`({ ... - })` syntax). +* `g++` version 7 or higher, `clang++` version 5 or higher, or another + C++17-compatible toolchain. Note that the cross-compilable core runtime code supports a wider range of -toolchains, down to C++11. See the [Runtime Overview](./runtime-overview.md) for +toolchains, down to C++17. See the [Runtime Overview](./runtime-overview.md) for portability details. 
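A quick aside on the `calculate_time_scale_factor` assertions exercised above: the factor is the ratio between the two units, so converting a duration means dividing by it. Below is a minimal sketch consistent with those tests; the enum values and the nanoseconds-per-unit table are assumptions for illustration, not the actual `_inspector_utils` internals:

```python
from enum import Enum


class TimeScale(Enum):
    CYCLES = "cycles"
    NS = "ns"
    US = "us"
    MS = "ms"
    S = "s"


# Hypothetical lookup table: how many nanoseconds one unit of each scale spans.
# CYCLES has no fixed wall-clock size, so it is treated as an opaque unit of 1.
_NS_PER_UNIT = {
    TimeScale.CYCLES: 1,
    TimeScale.NS: 1,
    TimeScale.US: 1_000,
    TimeScale.MS: 1_000_000,
    TimeScale.S: 1_000_000_000,
}


def calculate_time_scale_factor(source: TimeScale, target: TimeScale) -> float:
    """Divide a source-scale duration by this factor to express it in target scale."""
    return _NS_PER_UNIT[target] / _NS_PER_UNIT[source]


assert calculate_time_scale_factor(TimeScale.NS, TimeScale.MS) == 1_000_000
assert calculate_time_scale_factor(TimeScale.MS, TimeScale.NS) == 1 / 1_000_000
assert calculate_time_scale_factor(TimeScale.CYCLES, TimeScale.CYCLES) == 1
```

With this shape, NS to MS yields 1000000 and the inverse yields exactly 1 / 1000000, matching the test expectations, while CYCLES only converts to itself.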
## Quick Setup: Colab/Jupyter Notebook Prototype diff --git a/docs/source/runtime-overview.md b/docs/source/runtime-overview.md index 7bc8b4dd8b4..6766e678e0e 100644 --- a/docs/source/runtime-overview.md +++ b/docs/source/runtime-overview.md @@ -96,7 +96,7 @@ can build it for a wide variety of target systems. #### C++ Language Considerations -* The code is C++11-compatible to work with older toolchains. +* The code is C++17-compatible to work with older toolchains. * The runtime does not use exceptions or RTTI, although it is not antagonistic to them. * The code is compatible with GCC and Clang, and has also been built with diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh index 272ddcfc0c5..9cef98e6227 100755 --- a/examples/arm/setup.sh +++ b/examples/arm/setup.sh @@ -91,6 +91,7 @@ fi ### Optional user args ######## root_dir=${1:-"${script_dir}/ethos-u-scratch"} +mkdir -p ${root_dir} root_dir=$(realpath ${root_dir}) ######## @@ -246,7 +247,6 @@ fi cd "${script_dir}" # Setup the root dir -mkdir -p "${root_dir}" cd "${root_dir}" echo "[main] Using root dir ${root_dir}" diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java index 7ed9c9ec979..ac14270ed51 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java @@ -19,6 +19,7 @@ import android.os.Bundle; import android.os.Handler; import android.os.Looper; +import android.os.Process; import android.provider.MediaStore; import android.system.ErrnoException; import android.system.Os; @@ -44,6 +45,8 @@ import java.lang.reflect.Type; import java.util.ArrayList; import java.util.List; +import java.util.concurrent.Executor; +import java.util.concurrent.Executors; import org.pytorch.executorch.LlamaCallback; import org.pytorch.executorch.LlamaModule; @@ -70,13 +73,17 @@ public class MainActivity extends AppCompatActivity implements Runnable, LlamaCa private SettingsFields mCurrentSettingsFields; private Handler mMemoryUpdateHandler; private Runnable memoryUpdater; + private int promptID = 0; + private long startPos = 0; + private static final int CONVERSATION_HISTORY_MESSAGE_LOOKBACK = 2; + private Executor executor; @Override public void onResult(String result) { if (result.equals(PromptFormat.getStopToken(mCurrentSettingsFields.getModelType()))) { return; } - if (result.equals("\n\n")) { + if (result.equals("\n\n") || result.equals("\n")) { if (!mResultMessage.getText().isEmpty()) { mResultMessage.appendText(result); run(); @@ -147,6 +154,12 @@ private void setLocalModel(String modelPath, String tokenizerPath, float tempera + (float) loadDuration / 1000 + " sec." 
+ " You can send text or image for inference"; + + if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) { + ETLogging.getInstance().log("Llava start prefill prompt"); + startPos = mModule.prefillPrompt(PromptFormat.getLlavaPresetPrompt(), 0, 1, 0); + ETLogging.getInstance().log("Llava completes prefill prompt"); + } } Message modelLoadedMessage = new Message(modelInfo, false, MessageType.SYSTEM, 0); @@ -195,6 +208,11 @@ private void populateExistingMessages(String existingMsgJSON) { mMessageAdapter.notifyDataSetChanged(); } + private int setPromptID() { + + return mMessageAdapter.getMaxPromptID() + 1; + } + @Override protected void onCreate(Bundle savedInstanceState) { super.onCreate(savedInstanceState); @@ -216,6 +234,7 @@ protected void onCreate(Bundle savedInstanceState) { String existingMsgJSON = mDemoSharedPreferences.getSavedMessages(); if (!existingMsgJSON.isEmpty()) { populateExistingMessages(existingMsgJSON); + promptID = setPromptID(); } mSettingsButton = requireViewById(R.id.settings); mSettingsButton.setOnClickListener( @@ -232,6 +251,7 @@ protected void onCreate(Bundle savedInstanceState) { setupCameraRoll(); startMemoryUpdate(); setupShowLogsButton(); + executor = Executors.newSingleThreadExecutor(); } @Override @@ -537,6 +557,32 @@ private void showMediaPreview(List uris) { imageViews.get(i).setVisibility(View.VISIBLE); imageViews.get(i).setImageURI(mSelectedImageUri.get(i)); } + + // For LLava, we want to call prefill_image as soon as an image is selected + // Llava only support 1 image for now + if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) { + List processedImageList = getProcessedImagesForModel(mSelectedImageUri); + if (!processedImageList.isEmpty()) { + mMessageAdapter.add( + new Message("Llava - Starting image Prefill.", false, MessageType.SYSTEM, 0)); + mMessageAdapter.notifyDataSetChanged(); + Runnable runnable = + () -> { + Process.setThreadPriority(Process.THREAD_PRIORITY_MORE_FAVORABLE); + ETLogging.getInstance().log("Starting runnable prefill image"); + ETImage img = processedImageList.get(0); + ETLogging.getInstance().log("Llava start prefill image"); + startPos = + mModule.prefillImages( + img.getInts(), + img.getWidth(), + img.getHeight(), + ModelUtils.VISION_MODEL_IMAGE_CHANNELS, + startPos); + }; + executor.execute(runnable); + } + } } private void addSelectedImagesToChatThread(List selectedImageUri) { @@ -552,6 +598,48 @@ private void addSelectedImagesToChatThread(List selectedImageUri) { mMessageAdapter.notifyDataSetChanged(); } + private String getConversationHistory() { + String conversationHistory = ""; + + ArrayList conversations = + mMessageAdapter.getRecentSavedTextMessages(CONVERSATION_HISTORY_MESSAGE_LOOKBACK); + if (conversations.isEmpty()) { + return conversationHistory; + } + + int prevPromptID = conversations.get(0).getPromptID(); + String conversationFormat = + PromptFormat.getConversationFormat(mCurrentSettingsFields.getModelType()); + String format = conversationFormat; + for (int i = 0; i < conversations.size(); i++) { + Message conversation = conversations.get(i); + int currentPromptID = conversation.getPromptID(); + if (currentPromptID != prevPromptID) { + conversationHistory = conversationHistory + format; + format = conversationFormat; + prevPromptID = currentPromptID; + } + if (conversation.getIsSent()) { + format = format.replace(PromptFormat.USER_PLACEHOLDER, conversation.getText()); + } else { + format = format.replace(PromptFormat.ASSISTANT_PLACEHOLDER, conversation.getText()); + } + } + 
conversationHistory = conversationHistory + format; + + return conversationHistory; + } + + private String getTotalFormattedPrompt(String conversationHistory, String rawPrompt) { + if (conversationHistory.isEmpty()) { + return mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt); + } + + return mCurrentSettingsFields.getFormattedSystemPrompt() + + conversationHistory + + mCurrentSettingsFields.getFormattedUserPrompt(rawPrompt); + } + private void onModelRunStarted() { mSendButton.setClickable(false); mSendButton.setImageResource(R.drawable.baseline_stop_24); @@ -567,42 +655,26 @@ private void onModelRunStopped() { mSendButton.setOnClickListener( view -> { addSelectedImagesToChatThread(mSelectedImageUri); - List processedImageList = getProcessedImagesForModel(mSelectedImageUri); - processedImageList.forEach( - image -> { - ETLogging.getInstance() - .log( - "Image preprocessed:" - + " uri = " - + image.getUri().getLastPathSegment() - + "," - + " width = " - + image.getWidth() - + "," - + " height = " - + image.getHeight() - + "," - + " bytes size = " - + image.getBytes().length); - }); String rawPrompt = mEditTextMessage.getText().toString(); - String prompt = mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt); // We store raw prompt into message adapter, because we don't want to show the extra // tokens from system prompt - mMessageAdapter.add(new Message(rawPrompt, true, MessageType.TEXT, 0)); + mMessageAdapter.add(new Message(rawPrompt, true, MessageType.TEXT, promptID)); mMessageAdapter.notifyDataSetChanged(); mEditTextMessage.setText(""); - mResultMessage = new Message("", false, MessageType.TEXT, 0); + mResultMessage = new Message("", false, MessageType.TEXT, promptID); mMessageAdapter.add(mResultMessage); // Scroll to bottom of the list mMessagesView.smoothScrollToPosition(mMessageAdapter.getCount() - 1); // After images are added to prompt and chat thread, we clear the imageURI list // Note: This has to be done after imageURIs are no longer needed by LlamaModule mSelectedImageUri = null; + promptID++; Runnable runnable = new Runnable() { @Override public void run() { + Process.setThreadPriority(Process.THREAD_PRIORITY_MORE_FAVORABLE); + ETLogging.getInstance().log("starting runnable generate()"); runOnUiThread( new Runnable() { @Override @@ -610,37 +682,24 @@ public void run() { onModelRunStarted(); } }); - ETLogging.getInstance().log("Running inference.. prompt=" + prompt); long generateStartTime = System.currentTimeMillis(); if (ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()) == ModelUtils.VISION_MODEL) { - if (!processedImageList.isEmpty()) { - // For now, Llava only support 1 image. - ETImage img = processedImageList.get(0); - mModule.generate( - processedImageList.get(0).getInts(), - img.getWidth(), - img.getHeight(), - ModelUtils.VISION_MODEL_IMAGE_CHANNELS, - prompt, - ModelUtils.VISION_MODEL_SEQ_LEN, - false, - MainActivity.this); - } else { - // no image selected, we pass in empty int array - mModule.generate( - new int[0], - 0, - 0, - ModelUtils.VISION_MODEL_IMAGE_CHANNELS, - prompt, - ModelUtils.VISION_MODEL_SEQ_LEN, - false, - MainActivity.this); - } + mModule.generateFromPos( + mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt), + ModelUtils.VISION_MODEL_SEQ_LEN, + startPos, + MainActivity.this, + false); } else { + String finalPrompt = + getTotalFormattedPrompt(getConversationHistory(), rawPrompt); + ETLogging.getInstance().log("Running inference.. 
prompt=" + finalPrompt); mModule.generate( - prompt, ModelUtils.TEXT_MODEL_SEQ_LEN, false, MainActivity.this); + finalPrompt, + (int) (finalPrompt.length() * 0.75) + 64, + MainActivity.this, + false); } long generateDuration = System.currentTimeMillis() - generateStartTime; @@ -655,7 +714,7 @@ public void run() { ETLogging.getInstance().log("Inference completed"); } }; - new Thread(runnable).start(); + executor.execute(runnable); }); mMessageAdapter.notifyDataSetChanged(); } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java index d9cbd95a1a7..2538c852e48 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java @@ -16,6 +16,7 @@ import android.widget.ImageView; import android.widget.TextView; import java.util.ArrayList; +import java.util.Collections; public class MessageAdapter extends ArrayAdapter { @@ -90,4 +91,41 @@ public void clear() { public ArrayList getSavedMessages() { return savedMessages; } + + public ArrayList getRecentSavedTextMessages(int numOfLatestPromptMessages) { + ArrayList recentMessages = new ArrayList(); + int lastIndex = savedMessages.size() - 1; + Message messageToAdd = savedMessages.get(lastIndex); + int oldPromptID = messageToAdd.getPromptID(); + + for (int i = 0; i < savedMessages.size(); i++) { + messageToAdd = savedMessages.get(lastIndex - i); + if (messageToAdd.getMessageType() != MessageType.SYSTEM) { + if (messageToAdd.getPromptID() != oldPromptID) { + numOfLatestPromptMessages--; + oldPromptID = messageToAdd.getPromptID(); + } + if (numOfLatestPromptMessages > 0) { + if (messageToAdd.getMessageType() == MessageType.TEXT) { + recentMessages.add(messageToAdd); + } + } else { + break; + } + } + } + + // To place the order in [input1, output1, input2, output2...] 
+ Collections.reverse(recentMessages); + return recentMessages; + } + + public int getMaxPromptID() { + int maxPromptID = -1; + for (Message msg : savedMessages) { + + maxPromptID = Math.max(msg.getPromptID(), maxPromptID); + } + return maxPromptID; + } } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java index 7342b4ab00c..36e738c3d0e 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java @@ -12,6 +12,8 @@ public class PromptFormat { public static final String SYSTEM_PLACEHOLDER = "{{ system_prompt }}"; public static final String USER_PLACEHOLDER = "{{ user_prompt }}"; + public static final String ASSISTANT_PLACEHOLDER = "{{ assistant_response }}"; + public static final String DEFAULT_SYSTEM_PROMPT = "Answer the questions in a few sentences"; public static String getSystemPromptTemplate(ModelType modelType) { switch (modelType) { @@ -33,8 +35,20 @@ public static String getUserPromptTemplate(ModelType modelType) { case LLAMA_3_1: return "<|start_header_id|>user<|end_header_id|>\n" + USER_PLACEHOLDER - + "<|eot_id|>\n" + + "<|eot_id|>" + "<|start_header_id|>assistant<|end_header_id|>"; + + case LLAVA_1_5: + default: + return USER_PLACEHOLDER; + } + } + + public static String getConversationFormat(ModelType modelType) { + switch (modelType) { + case LLAMA_3: + case LLAMA_3_1: + return getUserPromptTemplate(modelType) + "\n" + ASSISTANT_PLACEHOLDER + "<|eot_id|>"; case LLAVA_1_5: return USER_PLACEHOLDER + " ASSISTANT:"; default: @@ -53,4 +67,9 @@ public static String getStopToken(ModelType modelType) { return ""; } } + + public static String getLlavaPresetPrompt() { + return "A chat between a curious human and an artificial intelligence assistant. The assistant" + + " gives helpful, detailed, and polite answers to the human's questions. 
USER: "; + } } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java index 5f1fc96e1ac..0736c8cda94 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java @@ -43,7 +43,7 @@ public class SettingsActivity extends AppCompatActivity { public SettingsFields mSettingsFields; private DemoSharedPreferences mDemoSharedPreferences; - public static double TEMPERATURE_MIN_VALUE = 0.1; + public static double TEMPERATURE_MIN_VALUE = 0.0; @Override protected void onCreate(Bundle savedInstanceState) { @@ -120,6 +120,7 @@ private void setupLoadModelButton() { public void onClick(DialogInterface dialog, int whichButton) { mSettingsFields.saveLoadModelAction(true); mLoadModelButton.setEnabled(false); + onBackPressed(); } }) .setNegativeButton(android.R.string.no, null) @@ -208,8 +209,7 @@ public void afterTextChanged(Editable s) { new DialogInterface.OnClickListener() { public void onClick(DialogInterface dialog, int whichButton) { // Clear the messageAdapter and sharedPreference - mSystemPromptEditText.setText( - PromptFormat.getSystemPromptTemplate(mModelType)); + mSystemPromptEditText.setText(PromptFormat.DEFAULT_SYSTEM_PROMPT); } }) .setNegativeButton(android.R.string.no, null) diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java index 466d3303e28..b71799981b2 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java @@ -38,12 +38,12 @@ public String getFormattedSystemAndUserPrompt(String prompt) { return getFormattedSystemPrompt() + getFormattedUserPrompt(prompt); } - private String getFormattedSystemPrompt() { + public String getFormattedSystemPrompt() { return PromptFormat.getSystemPromptTemplate(modelType) .replace(PromptFormat.SYSTEM_PLACEHOLDER, systemPrompt); } - private String getFormattedUserPrompt(String prompt) { + public String getFormattedUserPrompt(String prompt) { return userPrompt.replace(PromptFormat.USER_PLACEHOLDER, prompt); } diff --git a/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh b/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh index 87d0f47c956..68d191685d3 100644 --- a/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh +++ b/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh @@ -16,6 +16,7 @@ cmake . 
-DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ @@ -37,6 +38,8 @@ cmake examples/models/llama2 \ -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/examples/models/llama2 @@ -47,7 +50,9 @@ cmake extension/android \ -DANDROID_ABI="${ANDROID_ABI}" \ -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_BUILD_LLAMA_JNI=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/extension/android @@ -59,7 +64,7 @@ mkdir -p "${JNI_LIBS_PATH}/${ANDROID_ABI}" BUILD_AAR_DIR="$(mktemp -d)" mkdir -p "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}" "${BUILD_AAR_DIR}/libs" JNI_LIBS_PATH="${BUILD_AAR_DIR}/jni" -cp "${CMAKE_OUT}"/extension/android/libexecutorch_llama_jni.so "${JNI_LIBS_PATH}/${ANDROID_ABI}/" +cp "${CMAKE_OUT}"/extension/android/libexecutorch_jni.so "${JNI_LIBS_PATH}/${ANDROID_ABI}/libexecutorch_jni.so" cp "${CMAKE_OUT}"/lib/libqnn_executorch_backend.so "${JNI_LIBS_PATH}/${ANDROID_ABI}/" cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtp.so "${JNI_LIBS_PATH}/${ANDROID_ABI}/" cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnSystem.so "${JNI_LIBS_PATH}/${ANDROID_ABI}/" diff --git a/examples/demo-apps/android/LlamaDemo/setup.sh b/examples/demo-apps/android/LlamaDemo/setup.sh index 91a68d4b88b..5e65929426b 100644 --- a/examples/demo-apps/android/LlamaDemo/setup.sh +++ b/examples/demo-apps/android/LlamaDemo/setup.sh @@ -16,6 +16,7 @@ cmake . 
 -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \
 -DEXECUTORCH_BUILD_XNNPACK=ON \
 -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
 -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
 -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
 -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
 -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
@@ -37,6 +38,7 @@ cmake examples/models/llama2 \
 -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \
 -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
 -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
 -DEXECUTORCH_BUILD_XNNPACK=ON \
 -DCMAKE_BUILD_TYPE=Release \
 -B"${CMAKE_OUT}"/examples/models/llama2
@@ -48,6 +50,7 @@ cmake extension/android \
 -DANDROID_ABI="${ANDROID_ABI}" \
 -DANDROID_PLATFORM=android-23 \
 -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \
+ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
 -DEXECUTORCH_BUILD_LLAMA_JNI=ON \
 -DCMAKE_BUILD_TYPE=Release \
 -B"${CMAKE_OUT}"/extension/android
@@ -56,7 +59,7 @@ cmake --build "${CMAKE_OUT}"/extension/android -j "${CMAKE_JOBS}" --config Relea
 BUILD_AAR_DIR="$(mktemp -d)"
 mkdir -p "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}" "${BUILD_AAR_DIR}/libs"
-cp "${CMAKE_OUT}"/extension/android/libexecutorch_llama_jni.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}"
+cp "${CMAKE_OUT}"/extension/android/libexecutorch_jni.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/libexecutorch.so"
 cp extension/android/build/libs/executorch.jar "${BUILD_AAR_DIR}/libs"
 echo \
 \
diff --git a/examples/mediatek/CMakeLists.txt b/examples/mediatek/CMakeLists.txt
index 2abee59759f..1d411f07ca7 100644
--- a/examples/mediatek/CMakeLists.txt
+++ b/examples/mediatek/CMakeLists.txt
@@ -75,6 +75,44 @@ if(${ANDROID})
   )
   target_compile_options(mtk_executor_runner PUBLIC ${_common_compile_options})
 
+  set(_mtk_oss_executor_runner__srcs ${_executor_runner__srcs})
+  list(
+    TRANSFORM
+    _mtk_oss_executor_runner__srcs
+    PREPEND
+    "${EXECUTORCH_SOURCE_DIR}/"
+  )
+  list(
+    FILTER
+    _mtk_oss_executor_runner__srcs
+    EXCLUDE REGEX
+    ".*executor_runner.cpp$"
+  )
+  list(
+    PREPEND
+    _mtk_oss_executor_runner__srcs
+    ${CMAKE_CURRENT_LIST_DIR}/executor_runner/mtk_oss_executor_runner.cpp
+  )
+
+  add_executable(mtk_oss_executor_runner ${_mtk_oss_executor_runner__srcs})
+
+  target_include_directories(mtk_oss_executor_runner
+    PUBLIC
+    ${_common_include_directories}
+    ${EXECUTORCH_ROOT}/cmake-android-out/third-party/gflags/include
+  )
+
+  target_link_libraries(mtk_oss_executor_runner
+    ${_executor_runner_libs}
+    executorch
+    neuron_backend
+    gflags
+  )
+  target_compile_options(mtk_oss_executor_runner
+    PUBLIC
+    ${_common_compile_options}
+  )
+
   set(_mtk_llama_executor_runner__srcs ${_mtk_executor_runner__srcs})
   list(FILTER _mtk_llama_executor_runner__srcs EXCLUDE REGEX
     ".*executor_runner.cpp$"
diff --git a/examples/mediatek/README.md b/examples/mediatek/README.md
index faca42fb50c..9727f2587fd 100644
--- a/examples/mediatek/README.md
+++ b/examples/mediatek/README.md
@@ -9,6 +9,8 @@ examples/mediatek
 ├── preformatter_templates # Model specific prompt preformatter templates
 ├── prompts # Calibration Prompts
 ├── tokenizers_ # Model tokenizer scripts
+├── oss_utils # Utils for oss models
+├── eval_utils # Utils for evaluating oss models
 ├── model_export_scripts # Model specific export scripts
 ├── models # Model definitions
 ├── llm_models # LLM model definitions
@@ -44,6 +46,7 @@ pip3 install mtk_converter-8.8.0.dev20240723+public.d1467db9-cp310-cp310-manylin
 ```
 
 ## AoT Flow
+### llama
 ##### Note: Verify that localhost connection is available before running AoT Flow
 1. Exporting Models to `.pte`
 - In the `examples/mediatek` directory, run:
@@ -72,6 +75,14 @@ source shell_scripts/export_llama.sh
+```
+- Argument Options:
+  - `model_name`: deeplabv3/edsr/inceptionv3/inceptionv4/mobilenetv2/mobilenetv3/resnet18/resnet50
+
 # Runtime
 ## Supported Chips
@@ -100,6 +111,13 @@ adb push <model_name>.pte <phone_path>
 
 Make sure to replace `<model_name>` with the actual name of your model file. And, replace `<phone_path>` with the desired destination on the device.
+##### Note: For oss models, please push additional files to your Android device
+```bash
+adb push mtk_oss_executor_runner <phone_path>
+adb push input_list.txt <phone_path>
+for i in input*bin; do adb push "$i" <phone_path>; done;
+```
+
 ### Executing the Model
 Execute the model on your Android device by running:
@@ -111,3 +129,21 @@ adb shell "/data/local/tmp/mtk_executor_runner --model_path /data/local/tmp/
 Replace `<model_name>` with the name of your model file and `<num_iter>` with the desired number of iterations to run the model.
 ##### Note: For llama models, please use `mtk_llama_executor_runner`. Refer to `examples/mediatek/executor_runner/run_llama3_sample.sh` for reference.
+##### Note: For oss models, please use `mtk_oss_executor_runner`.
+```bash
+adb shell "/data/local/tmp/mtk_oss_executor_runner --model_path /data/local/tmp/<model_name>.pte --input_list /data/local/tmp/input_list.txt --output_folder /data/local/tmp/output_<model_name>"
+adb pull "/data/local/tmp/output_<model_name>" ./
+```
+
+### Check oss results on PC
+```bash
+python3 eval_utils/eval_oss_result.py --eval_type <eval_type> --target_f <golden_folder> --out_f <output_folder>
+```
+For example:
+```
+python3 eval_utils/eval_oss_result.py --eval_type piq --target_f edsr --out_f output_edsr
+```
+- Argument Options:
+  - `eval_type`: topk/piq/segmentation
+  - `target_f`: folder containing golden data files, named `golden_<data_index>_0.bin`
+  - `out_f`: folder containing model output data files, named `output_<data_index>_0.bin`
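Before the new MediaTek sources, a note on the data layout the README above relies on: the runner consumes `input_list.txt` (one line per inference, space-separated input `.bin` paths) and writes `output_<sampleId>_<outId>.bin` files, which `eval_utils/eval_oss_result.py` compares against matching `golden_<sampleId>_<outId>.bin` files. A small illustrative sketch of producing that layout; the `save_artifacts` helper and the toy tensors are invented for this example, while the export scripts further down write the same files inline:

```python
import os

import torch


def save_artifacts(artifact_dir, inputs, goldens):
    """Write input bins, golden bins, and input_list.txt in the expected layout."""
    os.makedirs(artifact_dir, exist_ok=True)
    lines = []
    for idx, sample in enumerate(inputs):  # sample: tuple of input tensors
        names = []
        for i, tensor in enumerate(sample):
            name = f"input_{idx}_{i}.bin"
            tensor.detach().numpy().tofile(os.path.join(artifact_dir, name))
            names.append(name)
        # One line per inference; multiple inputs are space-separated.
        lines.append(" ".join(names))
    with open(os.path.join(artifact_dir, "input_list.txt"), "w") as f:
        f.write("\n".join(lines) + "\n")
    for idx, golden in enumerate(goldens):
        golden.detach().numpy().tofile(
            os.path.join(artifact_dir, f"golden_{idx}_0.bin")
        )


# Toy single-sample example (shapes are illustrative only).
save_artifacts(
    "./demo",
    [(torch.randn(1, 224, 224, 3),)],
    [torch.randn(1, 224, 224, 21)],
)
```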
diff --git a/examples/mediatek/aot_utils/oss_utils/utils.py b/examples/mediatek/aot_utils/oss_utils/utils.py
new file mode 100755
index 00000000000..f447b2ac68f
--- /dev/null
+++ b/examples/mediatek/aot_utils/oss_utils/utils.py
@@ -0,0 +1,73 @@
+# Copyright (c) MediaTek Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+from typing import Optional
+
+import torch
+from executorch import exir
+from executorch.backends.mediatek import (
+    NeuropilotPartitioner,
+    NeuropilotQuantizer,
+    Precision,
+)
+from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
+
+
+def build_executorch_binary(
+    model,
+    inputs,
+    file_name,
+    dataset,
+    quant_dtype: Optional[Precision] = None,
+):
+    if quant_dtype is not None:
+        if quant_dtype not in Precision:
+            raise AssertionError(f"No support for Precision {quant_dtype}.")
+        quantizer = NeuropilotQuantizer()
+        quantizer.setup_precision(quant_dtype)
+
+        captured_model = torch._export.capture_pre_autograd_graph(model, inputs)
+        annotated_model = prepare_pt2e(captured_model, quantizer)
+        print("Quantizing the model...")
+        # calibration
+        for data in dataset:
+            annotated_model(*data)
+        quantized_model = convert_pt2e(annotated_model, fold_quantize=False)
+        aten_dialect = torch.export.export(quantized_model, inputs)
+    else:
+        aten_dialect = torch.export.export(model, inputs)
+
+    from executorch.exir.program._program import to_edge_transform_and_lower
+
+    edge_compile_config = exir.EdgeCompileConfig(_check_ir_validity=False)
+    # skipped op names are used for the deeplabV3 model
+    neuro_partitioner = NeuropilotPartitioner(
+        [],
+        op_names_to_skip={
+            "aten_convolution_default_106",
+            "aten_convolution_default_107",
+        },
+    )
+    edge_prog = to_edge_transform_and_lower(
+        aten_dialect,
+        compile_config=edge_compile_config,
+        partitioner=[neuro_partitioner],
+    )
+
+    exec_prog = edge_prog.to_executorch(
+        config=exir.ExecutorchBackendConfig(extract_constant_segment=False)
+    )
+    with open(f"{file_name}.pte", "wb") as file:
+        file.write(exec_prog.buffer)
+
+
+def make_output_dir(path: str):
+    if os.path.exists(path):
+        for f in os.listdir(path):
+            os.remove(os.path.join(path, f))
+        os.removedirs(path)
+    os.makedirs(path)
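For context, `build_executorch_binary` above chains PT2E quantization (capture, `prepare_pt2e`, calibration, `convert_pt2e`) with `to_edge_transform_and_lower` and `.pte` serialization. A hypothetical minimal invocation follows; the toy `Linear` model, shapes, and output name are assumptions for the sketch, while `Precision.A8W8` mirrors what the export scripts below use:

```python
import torch
from executorch.backends.mediatek import Precision
from executorch.examples.mediatek.aot_utils.oss_utils.utils import (
    build_executorch_binary,
)

# Hypothetical toy model standing in for a real vision model.
model = torch.nn.Linear(16, 4).eval()
example_inputs = (torch.randn(1, 16),)

# dataset: iterable of input tuples used to calibrate the quantizer.
calibration_data = [(torch.randn(1, 16),) for _ in range(4)]

# Writes ./linear_mtk.pte, quantized to 8-bit activations and weights.
build_executorch_binary(
    model,
    example_inputs,
    "linear_mtk",
    calibration_data,
    quant_dtype=Precision.A8W8,
)
```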
diff --git a/examples/mediatek/eval_utils/eval_oss_result.py b/examples/mediatek/eval_utils/eval_oss_result.py
new file mode 100755
index 00000000000..3e599330b66
--- /dev/null
+++ b/examples/mediatek/eval_utils/eval_oss_result.py
@@ -0,0 +1,200 @@
+# Copyright (c) MediaTek Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import json
+import os
+
+import numpy as np
+import piq
+import torch
+
+
+def check_data(target_f, predict_f):
+    target_files = os.listdir(target_f)
+    predict_files = os.listdir(predict_f)
+    if len(target_files) != len(predict_files):
+        raise RuntimeError(
+            "The number of files in the target folder and the prediction folder must be the same"
+        )
+
+    predict_set = set(predict_files)
+    for f in target_files:
+        # target file naming rule is golden_sampleId_outId.bin
+        # predict file naming rule is output_sampleId_outId.bin
+        pred_name = f.replace("golden", "output")
+        try:
+            predict_set.remove(pred_name)
+        except KeyError:
+            raise RuntimeError(f"Cannot find {pred_name} in {predict_f}")
+
+    if predict_set:
+        # sets are not iterators, so draw an arbitrary leftover via iter()
+        target_name = next(iter(predict_set)).replace("output", "golden")
+        raise RuntimeError(f"Cannot find {target_name} in {target_f}")
+
+
+def eval_topk(target_f, predict_f):
+    def solve(prob, target, k):
+        _, indices = torch.topk(prob, k=k, sorted=True)
+        golden = torch.reshape(target, [-1, 1])
+        correct = golden == indices
+        if torch.any(correct):
+            return 1
+        else:
+            return 0
+
+    target_files = os.listdir(target_f)
+
+    cnt10 = 0
+    cnt50 = 0
+    for target_name in target_files:
+        pred_name = target_name.replace("golden", "output")
+
+        pred_npy = np.fromfile(os.path.join(predict_f, pred_name), dtype=np.float32)
+        target_npy = np.fromfile(os.path.join(target_f, target_name), dtype=np.int64)[0]
+        # wrap the scalar label in a 0-d tensor; torch.from_numpy needs an ndarray
+        cnt10 += solve(torch.from_numpy(pred_npy), torch.tensor(target_npy), 10)
+        cnt50 += solve(torch.from_numpy(pred_npy), torch.tensor(target_npy), 50)
+
+    print("Top10 acc:", cnt10 * 100.0 / len(target_files))
+    print("Top50 acc:", cnt50 * 100.0 / len(target_files))
+
+
+def eval_piq(target_f, predict_f):
+    target_files = os.listdir(target_f)
+
+    psnr_list = []
+    ssim_list = []
+    for target_name in target_files:
+        pred_name = target_name.replace("golden", "output")
+        hr = np.fromfile(os.path.join(target_f, target_name), dtype=np.float32)
+        hr = hr.reshape((1, 448, 448, 3))
+        hr = np.moveaxis(hr, 3, 1)
+        hr = torch.from_numpy(hr)
+
+        sr = np.fromfile(os.path.join(predict_f, pred_name), dtype=np.float32)
+        sr = sr.reshape((1, 448, 448, 3))
+        sr = np.moveaxis(sr, 3, 1)
+        sr = torch.from_numpy(sr).clamp(0, 1)
+
+        psnr_list.append(piq.psnr(hr, sr))
+        ssim_list.append(piq.ssim(hr, sr))
+
+    avg_psnr = sum(psnr_list).item() / len(psnr_list)
+    avg_ssim = sum(ssim_list).item() / len(ssim_list)
+
+    print(f"Avg of PSNR is: {avg_psnr}")
+    print(f"Avg of SSIM is: {avg_ssim}")
+
+
+def eval_segmentation(target_f, predict_f):
+    classes = [
+        "Background",
+        "Aeroplane",
+        "Bicycle",
+        "Bird",
+        "Boat",
+        "Bottle",
+        "Bus",
+        "Car",
+        "Cat",
+        "Chair",
+        "Cow",
+        "DiningTable",
+        "Dog",
+        "Horse",
+        "MotorBike",
+        "Person",
+        "PottedPlant",
+        "Sheep",
+        "Sofa",
+        "Train",
+        "TvMonitor",
+    ]
+
+    target_files = os.listdir(target_f)
+
+    def make_confusion(goldens, predictions, num_classes):
+        def histogram(golden, predict):
+            mask = golden < num_classes
+            hist = np.bincount(
+                num_classes * golden[mask].astype(int) + predict[mask],
+                minlength=num_classes**2,
+            ).reshape(num_classes, num_classes)
+            return hist
+
+        confusion = np.zeros((num_classes, num_classes))
+        for g, p in zip(goldens, predictions):
+            confusion += histogram(g.flatten(), p.flatten())
+
+        return confusion
+
+    pred_list = []
+    target_list = []
+    for target_name in target_files:
+        pred_name = target_name.replace("golden", "output")
+        target_npy = np.fromfile(os.path.join(target_f, target_name), dtype=np.uint8)
+        target_npy = target_npy.reshape((224, 224))
+        target_list.append(target_npy)
+
+        pred_npy = np.fromfile(os.path.join(predict_f, pred_name), dtype=np.float32)
+        pred_npy = pred_npy.reshape((224, 224, len(classes)))
+        pred_npy = pred_npy.argmax(2).astype(np.uint8)
+        pred_list.append(pred_npy)
+
+    eps = 1e-6
+    confusion = make_confusion(target_list, pred_list, len(classes))
+
+    pa = np.diag(confusion).sum() / (confusion.sum() + eps)
+    mpa = np.mean(np.diag(confusion) / (confusion.sum(axis=1) + eps))
+    iou = np.diag(confusion) / (
+        confusion.sum(axis=1) + confusion.sum(axis=0) - np.diag(confusion) + eps
+    )
+    miou = np.mean(iou)
+    cls_iou = dict(zip(classes, iou))
+
+    print(f"PA : {pa}")
+    print(f"MPA : {mpa}")
+    print(f"MIoU : {miou}")
+    print(f"CIoU : \n{json.dumps(cls_iou, indent=2)}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--target_f",
+        help="folder of target data",
+        type=str,
+        required=True,
+    )
+
+    parser.add_argument(
+        "--out_f",
+        help="folder of model prediction data",
+        type=str,
+        required=True,
+    )
+
+    parser.add_argument(
+        "--eval_type",
+        help="Choose eval type from: topk, piq, segmentation",
+        type=str,
+        choices=["topk", "piq", "segmentation"],
+        required=True,
+    )
+
+    args = parser.parse_args()
+
+    check_data(args.target_f, args.out_f)
+
+    if args.eval_type == "topk":
+        eval_topk(args.target_f, args.out_f)
+    elif args.eval_type == "piq":
+        eval_piq(args.target_f, args.out_f)
+    elif args.eval_type == "segmentation":
+        eval_segmentation(args.target_f, args.out_f)
diff --git a/examples/mediatek/executor_runner/mtk_oss_executor_runner.cpp b/examples/mediatek/executor_runner/mtk_oss_executor_runner.cpp
new file mode 100755
index 00000000000..3a1ad1d863b
--- /dev/null
+++ b/examples/mediatek/executor_runner/mtk_oss_executor_runner.cpp
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2024 MediaTek Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @file
+ *
+ * This tool can run ExecuTorch model files that only use operators that
+ * are covered by the portable kernels, with possible delegate to the
+ * test_backend_compiler_lib.
+ *
+ * It reads input tensor data from the .bin files listed in an input list
+ * file, and writes each output tensor to a .bin file in the output folder.
+ */
+
+#include <chrono>
+#include <filesystem>
+#include <fstream>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <gflags/gflags.h>
+
+#include <executorch/extension/data_loader/file_data_loader.h>
+#include <executorch/extension/runner_util/inputs.h>
+#include <executorch/runtime/executor/method.h>
+#include <executorch/runtime/executor/method_meta.h>
+#include <executorch/runtime/executor/program.h>
+#include <executorch/runtime/platform/log.h>
+#include <executorch/runtime/platform/runtime.h>
+
+static uint8_t method_allocator_pool[8 * 1024U * 1024U]; // 8 MB
+
+// Model Path
+DEFINE_string(
+    model_path,
+    "model.pte",
+    "Model serialized in flatbuffer format. Default to 'model.pte'");
+DEFINE_string(
+    input_list,
+    "input_list.txt",
+    "Model input list. Default to 'input_list.txt'");
+DEFINE_string(
+    output_folder,
+    "outputs",
+    "Model output folder. Default to 'outputs'");
+
+using namespace torch::executor;
+using torch::executor::MemoryAllocator;
+using torch::executor::util::BufferCleanup;
+using torch::executor::util::FileDataLoader;
+using namespace std::filesystem;
+
+int main(int argc, char** argv) {
+  runtime_init();
+
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+  if (argc != 1) {
+    std::string msg = "Extra commandline args:";
+    for (int i = 1 /* skip argv[0] (program name) */; i < argc; i++) {
+      msg += std::string(" ") + argv[i];
+    }
+    ET_LOG(Error, "%s", msg.c_str());
+    return 1;
+  }
+
+  // Create output folder
+  create_directories(FLAGS_output_folder);
+
+  // Create a loader to get the data of the program file. There are other
+  // DataLoaders that use mmap() or point to data that's already in memory, and
+  // users can create their own DataLoaders to load from arbitrary sources.
+  const char* model_path = FLAGS_model_path.c_str();
+  Result<FileDataLoader> loader = FileDataLoader::from(model_path);
+  ET_CHECK_MSG(
+      loader.ok(),
+      "FileDataLoader::from() failed: 0x%" PRIx32,
+      (uint32_t)loader.error());
+
+  // Parse the program file. This is immutable, and can also be reused between
+  // multiple execution invocations across multiple threads.
+  Result<Program> program = Program::load(&loader.get());
+  if (!program.ok()) {
+    ET_LOG(Error, "Failed to parse model file %s", model_path);
+    return 1;
+  }
+  ET_LOG(Info, "Model file %s is loaded.", model_path);
+
+  // Use the first method in the program.
+  const char* method_name = nullptr;
+  {
+    const auto method_name_result = program->get_method_name(0);
+    ET_CHECK_MSG(method_name_result.ok(), "Program has no methods");
+    method_name = *method_name_result;
+  }
+  ET_LOG(Info, "Using method %s", method_name);
+
+  // MethodMeta describes the memory requirements of the method.
+  Result<MethodMeta> method_meta_result = program->method_meta(method_name);
+  ET_CHECK_MSG(
+      method_meta_result.ok(),
+      "Failed to get method_meta for %s: 0x%" PRIx32,
+      method_name,
+      (uint32_t)method_meta_result.error());
+
+  //
+  // The runtime does not use malloc/new; it allocates all memory using the
+  // MemoryManager provided by the client. Clients are responsible for allocating
+  // the memory ahead of time, or providing MemoryAllocator subclasses that can
+  // do it dynamically.
+  //
+
+  // The method allocator is used to allocate all dynamic C++ metadata/objects
+  // used to represent the loaded method. This allocator is only used during
+  // loading a method of the program, which will return an error if there was
+  // not enough memory.
+  //
+  // The amount of memory required depends on the loaded method and the runtime
+  // code itself. The amount of memory here is usually determined by running the
+  // method and seeing how much memory is actually used, though it's possible to
+  // subclass MemoryAllocator so that it calls malloc() under the hood (see
+  // MallocMemoryAllocator).
+  //
+  // In this example we use a statically allocated memory pool.
+  MemoryAllocator method_allocator{
+      MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool)};
+
+  // The memory-planned buffers will back the mutable tensors used by the
+  // method. The sizes of these buffers were determined ahead of time during the
+  // memory-planning passes.
+  //
+  // Each buffer typically corresponds to a different hardware memory bank. Most
+  // mobile environments will only have a single buffer. Some embedded
+  // environments may have more than one for, e.g., slow/large DRAM and
+  // fast/small SRAM, or for memory associated with particular cores.
+  std::vector<std::unique_ptr<uint8_t[]>> planned_buffers; // Owns the memory
+  std::vector<Span<uint8_t>> planned_spans; // Passed to the allocator
+  size_t num_memory_planned_buffers =
+      method_meta_result->num_memory_planned_buffers();
+  for (size_t id = 0; id < num_memory_planned_buffers; ++id) {
+    // .get() will always succeed because id < num_memory_planned_buffers.
+    size_t buffer_size = static_cast<size_t>(
+        method_meta_result->memory_planned_buffer_size(id).get());
+    ET_LOG(Info, "Setting up planned buffer %zu, size %zu.", id, buffer_size);
+    planned_buffers.push_back(std::make_unique<uint8_t[]>(buffer_size));
+    planned_spans.push_back({planned_buffers.back().get(), buffer_size});
+  }
+  HierarchicalAllocator planned_memory(
+      {planned_spans.data(), planned_spans.size()});
+
+  // Assemble all of the allocators into the MemoryManager that the Executor
+  // will use.
+  MemoryManager memory_manager(&method_allocator, &planned_memory);
+
+  //
+  // Load the method from the program, using the provided allocators. Running
+  // the method can mutate the memory-planned buffers, so the method should only
+  // be used by a single thread at a time, but it can be reused.
+  //
+  Result<Method> method = program->load_method(method_name, &memory_manager);
+  ET_CHECK_MSG(
+      method.ok(),
+      "Loading of method %s failed with status 0x%" PRIx32,
+      method_name,
+      (uint32_t)method.error());
+  ET_LOG(Info, "Method loaded.");
+
+  std::ifstream input_list(FLAGS_input_list);
+  ET_CHECK_MSG(
+      input_list.is_open(),
+      "Error: cannot open input file %s",
+      FLAGS_input_list.c_str());
+
+  auto split = [](std::string s, std::string delimiter) {
+    size_t pos_start = 0, pos_end, delim_len = delimiter.length();
+    std::string token;
+    std::vector<std::string> res;
+
+    while ((pos_end = s.find(delimiter, pos_start)) != std::string::npos) {
+      token = s.substr(pos_start, pos_end - pos_start);
+      pos_start = pos_end + delim_len;
+      res.push_back(token);
+    }
+    res.push_back(s.substr(pos_start));
+    return res;
+  };
+
+  MethodMeta method_meta = method->method_meta();
+  size_t num_inputs = method_meta.num_inputs();
+  std::string file_path;
+  int inference_index = 0;
+  while (std::getline(input_list, file_path)) {
+    auto input_files = split(file_path, " ");
+    if (input_files.size() == 0) {
+      break;
+    }
+    ET_CHECK_MSG(
+        input_files.size() == num_inputs,
+        "Model expects %zu inputs but got %zu from input files",
+        num_inputs,
+        input_files.size());
+
+    // Prepare the inputs.
+    size_t num_allocated = 0;
+    ET_LOG(Info, "Number of inputs: %zu", num_inputs);
+    void** inputs = (void**)malloc(num_inputs * sizeof(void*));
+
+    for (size_t i = 0; i < num_inputs; i++) {
+      auto tag = method_meta.input_tag(i);
+      if (tag.get() != Tag::Tensor) {
+        ET_LOG(Debug, "Skipping malloc non-tensor input %zu", i);
+        continue;
+      }
+      Result<TensorInfo> tensor_meta = method_meta.input_tensor_meta(i);
+      const auto nbytes = tensor_meta->nbytes();
+      // This input is a tensor. Allocate a buffer for it.
+      void* data_ptr = malloc(nbytes);
+
+      // Read data from file
+      std::ifstream fin(input_files[i], std::ios::binary);
+      fin.seekg(0, fin.end);
+      size_t file_size = fin.tellg();
+
+      ET_CHECK_MSG(
+          file_size == nbytes,
+          "Input %zu size mismatch. file bytes: %zu, tensor bytes: %zu",
+          i,
+          file_size,
+          nbytes);
+
+      fin.seekg(0, fin.beg);
+      fin.read(static_cast<char*>(data_ptr), file_size);
+      fin.close();
+      inputs[num_allocated++] = data_ptr;
+
+      // Set backend input
+      auto scalar_type = tensor_meta->scalar_type();
+      auto sizes_raw = tensor_meta->sizes();
+      auto dim = sizes_raw.size();
+      auto dim_order_raw = tensor_meta->dim_order();
+      std::vector<TensorImpl::SizesType> sizes(sizes_raw.begin(), sizes_raw.end());
+      std::vector<TensorImpl::DimOrderType> dim_order(
+          dim_order_raw.begin(), dim_order_raw.end());
+
+      TensorImpl impl = TensorImpl(
+          scalar_type, dim, sizes.data(), data_ptr, dim_order.data());
+
+      Tensor tensor(&impl);
+      Error ret = method->set_input(tensor, i);
+      if (ret != Error::Ok) {
+        ET_LOG(Error, "Failed to set input %zu: 0x%" PRIx32, i, (uint32_t)ret);
+        // The BufferCleanup will free the inputs when it goes out of scope.
+        BufferCleanup cleanup({inputs, num_allocated});
+        return 1;
+      }
+    }
+    BufferCleanup({inputs, num_allocated});
+    ET_LOG(Info, "Inputs prepared.");
+
+    // Run the model.
+    auto before_exec = std::chrono::high_resolution_clock::now();
+    Error status = Error::Ok;
+    status = method->execute();
+    auto after_exec = std::chrono::high_resolution_clock::now();
+    double elapsed_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(
+            after_exec - before_exec)
+            .count() /
+        1000.0;
+
+    ET_LOG(Info, "Inference took %f ms", elapsed_time);
+    ET_CHECK_MSG(
+        status == Error::Ok,
+        "Execution of method %s failed with status 0x%" PRIx32,
+        method_name,
+        (uint32_t)status);
+    ET_LOG(Info, "Model executed successfully.");
+
+    // Get output data
+    size_t output_size = method->outputs_size();
+    ET_LOG(Info, "Number of outputs: %zu", output_size);
+    std::vector<EValue> outputs(output_size);
+    status = method->get_outputs(outputs.data(), output_size);
+    ET_CHECK(status == Error::Ok);
+    for (size_t i = 0; i < output_size; i++) {
+      auto output_tensor = outputs[i].toTensor();
+      auto output_file_name = FLAGS_output_folder + "/output_" +
+          std::to_string(inference_index) + "_" + std::to_string(i) + ".bin";
+      std::ofstream fout(output_file_name.c_str(), std::ios::binary);
+      fout.write(output_tensor.const_data_ptr<char>(), output_tensor.nbytes());
+      fout.close();
+    }
+
+    inference_index++;
+  }
+
+  return 0;
+}
diff --git a/examples/mediatek/model_export_scripts/deeplab_v3.py b/examples/mediatek/model_export_scripts/deeplab_v3.py
new file mode 100755
index 00000000000..da6766c0f54
--- /dev/null
+++ b/examples/mediatek/model_export_scripts/deeplab_v3.py
@@ -0,0 +1,124 @@
+# Copyright (c) MediaTek Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+ +import argparse +import os +import random + +import numpy as np + +import torch +from executorch.backends.mediatek import Precision +from executorch.examples.mediatek.aot_utils.oss_utils.utils import ( + build_executorch_binary, +) +from executorch.examples.models.deeplab_v3 import DeepLabV3ResNet101Model + + +class NhwcWrappedModel(torch.nn.Module): + def __init__(self): + super(NhwcWrappedModel, self).__init__() + self.deeplabv3 = DeepLabV3ResNet101Model().get_eager_model() + + def forward(self, input1): + nchw_input1 = input1.permute(0, 3, 1, 2) + nchw_output = self.deeplabv3(nchw_input1) + return nchw_output.permute(0, 2, 3, 1) + + +def get_dataset(data_size, dataset_dir, download): + from torchvision import datasets, transforms + + input_size = (224, 224) + preprocess = transforms.Compose( + [ + transforms.Resize(input_size), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] + ) + dataset = list( + datasets.VOCSegmentation( + root=os.path.join(dataset_dir, "voc_image"), + year="2009", + image_set="val", + transform=preprocess, + download=download, + ) + ) + + # prepare input data + random.shuffle(dataset) + inputs, targets, input_list = [], [], "" + for index, data in enumerate(dataset): + if index >= data_size: + break + image, target = data + inputs.append((image.unsqueeze(0).permute(0, 2, 3, 1),)) + targets.append(np.array(target.resize(input_size))) + input_list += f"input_{index}_0.bin\n" + + return inputs, targets, input_list + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. Default ./deeplab_v3", + default="./deeplab_v3", + type=str, + ) + + parser.add_argument( + "-d", + "--download", + help="If specified, download VOCSegmentation dataset by torchvision API", + action="store_true", + default=False, + ) + + args = parser.parse_args() + + # ensure the working directory exist. + os.makedirs(args.artifact, exist_ok=True) + + data_num = 100 + inputs, targets, input_list = get_dataset( + data_size=data_num, dataset_dir=args.artifact, download=args.download + ) + + # save data to inference on device + input_list_file = f"{args.artifact}/input_list.txt" + with open(input_list_file, "w") as f: + f.write(input_list) + f.flush() + for idx, data in enumerate(inputs): + for i, d in enumerate(data): + file_name = f"{args.artifact}/input_{idx}_{i}.bin" + d.detach().numpy().tofile(file_name) + if idx == 0: + print("inp shape: ", d.detach().numpy().shape) + print("inp type: ", d.detach().numpy().dtype) + for idx, data in enumerate(targets): + file_name = f"{args.artifact}/golden_{idx}_0.bin" + data.tofile(file_name) + if idx == 0: + print("golden shape: ", data.shape) + print("golden type: ", data.dtype) + + # build pte + pte_filename = "deeplabV3Resnet101_mtk" + instance = NhwcWrappedModel() + build_executorch_binary( + instance.eval(), + (torch.randn(1, 224, 224, 3),), + f"{args.artifact}/{pte_filename}", + inputs, + quant_dtype=Precision.A8W8, + ) diff --git a/examples/mediatek/model_export_scripts/edsr.py b/examples/mediatek/model_export_scripts/edsr.py new file mode 100755 index 00000000000..4192d67e569 --- /dev/null +++ b/examples/mediatek/model_export_scripts/edsr.py @@ -0,0 +1,170 @@ +# Copyright (c) MediaTek Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import argparse +import os + +import numpy as np + +import torch +from executorch.backends.mediatek import Precision +from executorch.examples.mediatek.aot_utils.oss_utils.utils import ( + build_executorch_binary, +) +from executorch.examples.models.edsr import EdsrModel + +from PIL import Image +from torch.utils.data import Dataset +from torchsr.datasets import B100 +from torchvision.transforms.functional import to_tensor + + +class NhwcWrappedModel(torch.nn.Module): + def __init__(self): + super(NhwcWrappedModel, self).__init__() + self.edsr = EdsrModel().get_eager_model() + + def forward(self, input1): + nchw_input1 = input1.permute(0, 3, 1, 2) + nchw_output = self.edsr(nchw_input1) + return nchw_output.permute(0, 2, 3, 1) + + +class SrDataset(Dataset): + def __init__(self, hr_dir: str, lr_dir: str): + self.input_size = np.asanyarray([224, 224]) + self.hr = [] + self.lr = [] + + for file in sorted(os.listdir(hr_dir)): + self.hr.append(self._resize_img(os.path.join(hr_dir, file), 2)) + + for file in sorted(os.listdir(lr_dir)): + self.lr.append(self._resize_img(os.path.join(lr_dir, file), 1)) + + if len(self.hr) != len(self.lr): + raise AssertionError( + "The number of high resolution pics is not equal to low " + "resolution pics" + ) + + def __getitem__(self, idx: int): + return self.hr[idx], self.lr[idx] + + def __len__(self): + return len(self.lr) + + def _resize_img(self, file: str, scale: int): + with Image.open(file) as img: + return ( + to_tensor(img.resize(tuple(self.input_size * scale))) + .unsqueeze(0) + .permute(0, 2, 3, 1) + ) + + def get_input_list(self): + input_list = "" + for i in range(len(self.lr)): + input_list += f"input_{i}_0.bin\n" + return input_list + + +def get_b100( + dataset_dir: str, +): + hr_dir = f"{dataset_dir}/sr_bm_dataset/SRBenchmarks/benchmark/B100/HR" + lr_dir = f"{dataset_dir}/sr_bm_dataset/SRBenchmarks/benchmark/B100/LR_bicubic/X2" + + if not os.path.exists(hr_dir) or not os.path.exists(lr_dir): + B100(root=f"{dataset_dir}/sr_bm_dataset", scale=2, download=True) + + return SrDataset(hr_dir, lr_dir) + + +def get_dataset(hr_dir: str, lr_dir: str, default_dataset: str, dataset_dir: str): + if not (lr_dir and hr_dir) and not default_dataset: + raise RuntimeError( + "Nither custom dataset is provided nor using default dataset." + ) + + if (lr_dir and hr_dir) and default_dataset: + raise RuntimeError("Either use custom dataset, or use default dataset.") + + if default_dataset: + return get_b100(dataset_dir) + + return SrDataset(hr_dir, lr_dir) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. Default ./edsr", + default="./edsr", + type=str, + ) + + parser.add_argument( + "-r", + "--hr_ref_dir", + help="Path to the high resolution images", + default="", + type=str, + ) + + parser.add_argument( + "-l", + "--lr_dir", + help="Path to the low resolution image inputs", + default="", + type=str, + ) + + parser.add_argument( + "-d", + "--default_dataset", + help="If specified, download and use B100 dataset by torchSR API", + action="store_true", + default=False, + ) + + args = parser.parse_args() + + # ensure the working directory exist. 
+ os.makedirs(args.artifact, exist_ok=True) + + dataset = get_dataset( + args.hr_ref_dir, args.lr_dir, args.default_dataset, args.artifact + ) + + inputs, targets, input_list = dataset.lr, dataset.hr, dataset.get_input_list() + + # save data to inference on device + input_list_file = f"{args.artifact}/input_list.txt" + with open(input_list_file, "w") as f: + f.write(input_list) + f.flush() + for idx, data in enumerate(inputs): + for i, d in enumerate(data): + file_name = f"{args.artifact}/input_{idx}_{i}.bin" + d.detach().numpy().tofile(file_name) + for idx, data in enumerate(targets): + file_name = f"{args.artifact}/golden_{idx}_0.bin" + data.detach().numpy().tofile(file_name) + + # build pte + pte_filename = "edsr_mtk" + instance = NhwcWrappedModel() + build_executorch_binary( + instance.eval(), + (inputs[0],), + f"{args.artifact}/{pte_filename}", + [(input,) for input in inputs], + quant_dtype=Precision.A8W8, + ) diff --git a/examples/mediatek/model_export_scripts/inception_v3.py b/examples/mediatek/model_export_scripts/inception_v3.py new file mode 100755 index 00000000000..c28bd85b402 --- /dev/null +++ b/examples/mediatek/model_export_scripts/inception_v3.py @@ -0,0 +1,120 @@ +# Copyright (c) MediaTek Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os + +import torch +from executorch.backends.mediatek import Precision +from executorch.examples.mediatek.aot_utils.oss_utils.utils import ( + build_executorch_binary, +) +from executorch.examples.models.inception_v3 import InceptionV3Model + + +class NhwcWrappedModel(torch.nn.Module): + def __init__(self): + super(NhwcWrappedModel, self).__init__() + self.inception = InceptionV3Model().get_eager_model() + + def forward(self, input1): + nchw_input1 = input1.permute(0, 3, 1, 2) + output = self.inception(nchw_input1) + return output + + +def get_dataset(dataset_path, data_size): + from torchvision import datasets, transforms + + def get_data_loader(): + preprocess = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) + return torch.utils.data.DataLoader( + imagenet_data, + shuffle=True, + ) + + # prepare input data + inputs, targets, input_list = [], [], "" + data_loader = get_data_loader() + for index, data in enumerate(data_loader): + if index >= data_size: + break + feature, target = data + feature = feature.permute(0, 2, 3, 1) # NHWC + inputs.append((feature,)) + targets.append(target) + input_list += f"input_{index}_0.bin\n" + + return inputs, targets, input_list + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. " + "Default ./inceptionV3", + default="./inceptionV3", + type=str, + ) + + args = parser.parse_args() + + # ensure the working directory exist. 
+ os.makedirs(args.artifact, exist_ok=True) + + data_num = 100 + inputs, targets, input_list = get_dataset( + dataset_path=f"{args.dataset}", + data_size=data_num, + ) + + # save data to inference on device + input_list_file = f"{args.artifact}/input_list.txt" + with open(input_list_file, "w") as f: + f.write(input_list) + f.flush() + for idx, data in enumerate(inputs): + for i, d in enumerate(data): + file_name = f"{args.artifact}/input_{idx}_{i}.bin" + d.detach().numpy().tofile(file_name) + for idx, data in enumerate(targets): + file_name = f"{args.artifact}/golden_{idx}_0.bin" + data.detach().numpy().tofile(file_name) + + pte_filename = "inceptionV3_mtk" + instance = NhwcWrappedModel() + build_executorch_binary( + instance.eval(), + (torch.randn(1, 224, 224, 3),), + f"{args.artifact}/{pte_filename}", + inputs, + quant_dtype=Precision.A8W8, + ) diff --git a/examples/mediatek/model_export_scripts/inception_v4.py b/examples/mediatek/model_export_scripts/inception_v4.py new file mode 100755 index 00000000000..ccb2ce16f22 --- /dev/null +++ b/examples/mediatek/model_export_scripts/inception_v4.py @@ -0,0 +1,120 @@ +# Copyright (c) MediaTek Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os + +import torch +from executorch.backends.mediatek import Precision +from executorch.examples.mediatek.aot_utils.oss_utils.utils import ( + build_executorch_binary, +) +from executorch.examples.models.inception_v4 import InceptionV4Model + + +class NhwcWrappedModel(torch.nn.Module): + def __init__(self): + super(NhwcWrappedModel, self).__init__() + self.inception = InceptionV4Model().get_eager_model() + + def forward(self, input1): + nchw_input1 = input1.permute(0, 3, 1, 2) + output = self.inception(nchw_input1) + return output + + +def get_dataset(dataset_path, data_size): + from torchvision import datasets, transforms + + def get_data_loader(): + preprocess = transforms.Compose( + [ + transforms.Resize((299, 299)), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) + return torch.utils.data.DataLoader( + imagenet_data, + shuffle=True, + ) + + # prepare input data + inputs, targets, input_list = [], [], "" + data_loader = get_data_loader() + for index, data in enumerate(data_loader): + if index >= data_size: + break + feature, target = data + feature = feature.permute(0, 2, 3, 1) # NHWC + inputs.append((feature,)) + targets.append(target) + input_list += f"input_{index}_0.bin\n" + + return inputs, targets, input_list + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. " + "Default ./inceptionV4", + default="./inceptionV4", + type=str, + ) + + args = parser.parse_args() + + # ensure the working directory exist. 
+ os.makedirs(args.artifact, exist_ok=True) + + data_num = 100 + inputs, targets, input_list = get_dataset( + dataset_path=f"{args.dataset}", + data_size=data_num, + ) + + # save data to inference on device + input_list_file = f"{args.artifact}/input_list.txt" + with open(input_list_file, "w") as f: + f.write(input_list) + f.flush() + for idx, data in enumerate(inputs): + for i, d in enumerate(data): + file_name = f"{args.artifact}/input_{idx}_{i}.bin" + d.detach().numpy().tofile(file_name) + for idx, data in enumerate(targets): + file_name = f"{args.artifact}/golden_{idx}_0.bin" + data.detach().numpy().tofile(file_name) + + # build pte + pte_filename = "inceptionV4_mtk" + instance = NhwcWrappedModel() + build_executorch_binary( + instance.eval(), + (torch.randn(1, 299, 299, 3),), + f"{args.artifact}/{pte_filename}", + inputs, + quant_dtype=Precision.A8W8, + ) diff --git a/examples/mediatek/model_export_scripts/mobilenet_v2.py b/examples/mediatek/model_export_scripts/mobilenet_v2.py new file mode 100755 index 00000000000..97f2ed884eb --- /dev/null +++ b/examples/mediatek/model_export_scripts/mobilenet_v2.py @@ -0,0 +1,121 @@ +# Copyright (c) MediaTek Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os + +import torch +from executorch.backends.mediatek import Precision +from executorch.examples.mediatek.aot_utils.oss_utils.utils import ( + build_executorch_binary, +) +from executorch.examples.models.mobilenet_v2 import MV2Model + + +class NhwcWrappedModel(torch.nn.Module): + def __init__(self): + super(NhwcWrappedModel, self).__init__() + self.mobilenet = MV2Model().get_eager_model() + + def forward(self, input1): + nchw_input1 = input1.permute(0, 3, 1, 2) + output = self.mobilenet(nchw_input1) + return output + + +def get_dataset(dataset_path, data_size): + from torchvision import datasets, transforms + + def get_data_loader(): + preprocess = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) + return torch.utils.data.DataLoader( + imagenet_data, + shuffle=True, + ) + + # prepare input data + inputs, targets, input_list = [], [], "" + data_loader = get_data_loader() + for index, data in enumerate(data_loader): + if index >= data_size: + break + feature, target = data + feature = feature.permute(0, 2, 3, 1) # NHWC + inputs.append((feature,)) + targets.append(target) + input_list += f"input_{index}_0.bin\n" + + return inputs, targets, input_list + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. " + "Default ./mobilenetV2", + default="./mobilenetV2", + type=str, + ) + + args = parser.parse_args() + + # ensure the working directory exist. 
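The calibration dumps these scripts write are raw `tofile()` bytes with no header, so dtype and shape travel out-of-band; `input_list.txt` records only file names. A hedged round-trip check, with the shape taken from the 224x224 examples above:

```python
import numpy as np
import torch

t = torch.randn(1, 224, 224, 3)
t.detach().numpy().tofile("input_0_0.bin")

# Reading back requires supplying dtype and shape explicitly.
restored = np.fromfile("input_0_0.bin", dtype=np.float32).reshape(1, 224, 224, 3)
assert np.allclose(restored, t.numpy())
```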
+ os.makedirs(args.artifact, exist_ok=True) + + data_num = 100 + inputs, targets, input_list = get_dataset( + dataset_path=f"{args.dataset}", + data_size=data_num, + ) + + # save data to inference on device + input_list_file = f"{args.artifact}/input_list.txt" + with open(input_list_file, "w") as f: + f.write(input_list) + f.flush() + for idx, data in enumerate(inputs): + for i, d in enumerate(data): + file_name = f"{args.artifact}/input_{idx}_{i}.bin" + d.detach().numpy().tofile(file_name) + for idx, data in enumerate(targets): + file_name = f"{args.artifact}/golden_{idx}_0.bin" + data.detach().numpy().tofile(file_name) + + # build pte + pte_filename = "mobilenetV2_mtk" + instance = NhwcWrappedModel() + build_executorch_binary( + instance.eval(), + (torch.randn(1, 224, 224, 3),), + f"{args.artifact}/{pte_filename}", + inputs, + quant_dtype=Precision.A8W8, + ) diff --git a/examples/mediatek/model_export_scripts/mobilenet_v3.py b/examples/mediatek/model_export_scripts/mobilenet_v3.py new file mode 100755 index 00000000000..fed2497ca26 --- /dev/null +++ b/examples/mediatek/model_export_scripts/mobilenet_v3.py @@ -0,0 +1,121 @@ +# Copyright (c) MediaTek Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os + +import torch +from executorch.backends.mediatek import Precision +from executorch.examples.mediatek.aot_utils.oss_utils.utils import ( + build_executorch_binary, +) +from executorch.examples.models.mobilenet_v3 import MV3Model + + +class NhwcWrappedModel(torch.nn.Module): + def __init__(self): + super(NhwcWrappedModel, self).__init__() + self.mobilenet = MV3Model().get_eager_model() + + def forward(self, input1): + nchw_input1 = input1.permute(0, 3, 1, 2) + output = self.mobilenet(nchw_input1) + return output + + +def get_dataset(dataset_path, data_size): + from torchvision import datasets, transforms + + def get_data_loader(): + preprocess = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) + return torch.utils.data.DataLoader( + imagenet_data, + shuffle=True, + ) + + # prepare input data + inputs, targets, input_list = [], [], "" + data_loader = get_data_loader() + for index, data in enumerate(data_loader): + if index >= data_size: + break + feature, target = data + feature = feature.permute(0, 2, 3, 1) # NHWC + inputs.append((feature,)) + targets.append(target) + input_list += f"input_{index}_0.bin\n" + + return inputs, targets, input_list + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. " + "Default ./mobilenetV3", + default="./mobilenetV3", + type=str, + ) + + args = parser.parse_args() + + # ensure the working directory exist. 
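The `golden_{idx}_0.bin` files hold the int64 ImageFolder labels, which makes a host-side top-1 check straightforward once the device writes its own dumps. A sketch under assumptions: the output path and 1000-class logits layout are hypothetical and not produced by anything in this diff:

```python
import numpy as np


def top1_matches(golden_path: str, output_path: str, num_classes: int = 1000) -> bool:
    # golden_{idx}_0.bin stores the int64 label written by the scripts above;
    # output_path is a hypothetical device-side float32 logits dump.
    golden = np.fromfile(golden_path, dtype=np.int64)
    logits = np.fromfile(output_path, dtype=np.float32).reshape(-1, num_classes)
    return int(logits.argmax(axis=-1)[0]) == int(golden[0])


# e.g. top1_matches("mobilenetV2/golden_0_0.bin", "outputs/output_0_0.bin")
```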
+ os.makedirs(args.artifact, exist_ok=True) + + data_num = 100 + inputs, targets, input_list = get_dataset( + dataset_path=f"{args.dataset}", + data_size=data_num, + ) + + # save data to inference on device + input_list_file = f"{args.artifact}/input_list.txt" + with open(input_list_file, "w") as f: + f.write(input_list) + f.flush() + for idx, data in enumerate(inputs): + for i, d in enumerate(data): + file_name = f"{args.artifact}/input_{idx}_{i}.bin" + d.detach().numpy().tofile(file_name) + for idx, data in enumerate(targets): + file_name = f"{args.artifact}/golden_{idx}_0.bin" + data.detach().numpy().tofile(file_name) + + # build pte + pte_filename = "mobilenetV3_mtk" + instance = NhwcWrappedModel() + build_executorch_binary( + instance.eval(), + (torch.randn(1, 224, 224, 3),), + f"{args.artifact}/{pte_filename}", + inputs, + quant_dtype=Precision.A8W8, + ) diff --git a/examples/mediatek/model_export_scripts/resnet18.py b/examples/mediatek/model_export_scripts/resnet18.py new file mode 100755 index 00000000000..2f3af57e7f3 --- /dev/null +++ b/examples/mediatek/model_export_scripts/resnet18.py @@ -0,0 +1,122 @@ +# Copyright (c) MediaTek Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os + +import torch +from executorch.backends.mediatek import Precision +from executorch.examples.mediatek.aot_utils.oss_utils.utils import ( + build_executorch_binary, +) +from executorch.examples.models.resnet import ResNet18Model + + +class NhwcWrappedModel(torch.nn.Module): + def __init__(self): + super(NhwcWrappedModel, self).__init__() + self.resnet = ResNet18Model().get_eager_model() + + def forward(self, input1): + nchw_input1 = input1.permute(0, 3, 1, 2) + output = self.resnet(nchw_input1) + return output + + +def get_dataset(dataset_path, data_size): + from torchvision import datasets, transforms + + def get_data_loader(): + preprocess = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) + return torch.utils.data.DataLoader( + imagenet_data, + shuffle=True, + ) + + # prepare input data + inputs, targets, input_list = [], [], "" + data_loader = get_data_loader() + for index, data in enumerate(data_loader): + if index >= data_size: + break + feature, target = data + feature = feature.permute(0, 2, 3, 1) # NHWC + inputs.append((feature,)) + targets.append(target) + input_list += f"input_{index}_0.bin\n" + + return inputs, targets, input_list + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. " + "Default ./resnet18", + default="./resnet18", + type=str, + ) + + args = parser.parse_args() + + # ensure the working directory exist. 
+    os.makedirs(args.artifact, exist_ok=True)
+
+    data_num = 100
+    inputs, targets, input_list = get_dataset(
+        dataset_path=f"{args.dataset}",
+        data_size=data_num,
+    )
+
+    # save data to inference on device
+    input_list_file = f"{args.artifact}/input_list.txt"
+    with open(input_list_file, "w") as f:
+        f.write(input_list)
+        f.flush()
+    for idx, data in enumerate(inputs):
+        for i, d in enumerate(data):
+            file_name = f"{args.artifact}/input_{idx}_{i}.bin"
+            d.detach().numpy().tofile(file_name)
+    for idx, data in enumerate(targets):
+        file_name = f"{args.artifact}/golden_{idx}_0.bin"
+        data.detach().numpy().tofile(file_name)
+
+    # build pte
+    pte_filename = "resnet18_mtk"
+    instance = NhwcWrappedModel()
+    build_executorch_binary(
+        instance.eval(),
+        (torch.randn(1, 224, 224, 3),),
+        f"{args.artifact}/{pte_filename}",
+        inputs,
+        quant_dtype=Precision.A8W8,
+    )
diff --git a/examples/mediatek/model_export_scripts/resnet50.py b/examples/mediatek/model_export_scripts/resnet50.py
new file mode 100755
index 00000000000..ce23842447b
--- /dev/null
+++ b/examples/mediatek/model_export_scripts/resnet50.py
@@ -0,0 +1,121 @@
+# Copyright (c) MediaTek Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import os
+
+import torch
+from executorch.backends.mediatek import Precision
+from executorch.examples.mediatek.aot_utils.oss_utils.utils import (
+    build_executorch_binary,
+)
+from executorch.examples.models.resnet import ResNet50Model
+
+
+class NhwcWrappedModel(torch.nn.Module):
+    def __init__(self):
+        super(NhwcWrappedModel, self).__init__()
+        self.resnet = ResNet50Model().get_eager_model()
+
+    def forward(self, input1):
+        nchw_input1 = input1.permute(0, 3, 1, 2)
+        output = self.resnet(nchw_input1)
+        return output
+
+
+def get_dataset(dataset_path, data_size):
+    from torchvision import datasets, transforms
+
+    def get_data_loader():
+        preprocess = transforms.Compose(
+            [
+                transforms.Resize(256),
+                transforms.CenterCrop(224),
+                transforms.ToTensor(),
+                transforms.Normalize(
+                    mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
+                ),
+            ]
+        )
+        imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess)
+        return torch.utils.data.DataLoader(
+            imagenet_data,
+            shuffle=True,
+        )
+
+    # prepare input data
+    inputs, targets, input_list = [], [], ""
+    data_loader = get_data_loader()
+    for index, data in enumerate(data_loader):
+        if index >= data_size:
+            break
+        feature, target = data
+        feature = feature.permute(0, 2, 3, 1)  # NHWC
+        inputs.append((feature,))
+        targets.append(target)
+        input_list += f"input_{index}_0.bin\n"
+
+    return inputs, targets, input_list
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "-d",
+        "--dataset",
+        help=(
+            "path to the validation folder of ImageNet dataset. "
+            "e.g. --dataset imagenet-mini/val "
+            "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)"
+        ),
+        type=str,
+        required=True,
+    )
+
+    parser.add_argument(
+        "-a",
+        "--artifact",
+        help="path for storing generated artifacts by this example. "
+        "Default ./resnet50",
+        default="./resnet50",
+        type=str,
+    )
+
+    args = parser.parse_args()
+
+    # ensure the working directory exists.
+ os.makedirs(args.artifact, exist_ok=True) + + data_num = 100 + inputs, targets, input_list = get_dataset( + dataset_path=f"{args.dataset}", + data_size=data_num, + ) + + # save data to inference on device + input_list_file = f"{args.artifact}/input_list.txt" + with open(input_list_file, "w") as f: + f.write(input_list) + f.flush() + for idx, data in enumerate(inputs): + for i, d in enumerate(data): + file_name = f"{args.artifact}/input_{idx}_{i}.bin" + d.detach().numpy().tofile(file_name) + for idx, data in enumerate(targets): + file_name = f"{args.artifact}/golden_{idx}_0.bin" + data.detach().numpy().tofile(file_name) + + # compile to pte + pte_filename = "resnet50_mtk" + instance = NhwcWrappedModel() + build_executorch_binary( + instance.eval(), + (torch.randn(1, 224, 224, 3),), + f"{args.artifact}/{pte_filename}", + inputs, + quant_dtype=Precision.A8W8, + ) diff --git a/examples/mediatek/requirements.txt b/examples/mediatek/requirements.txt index 038700059ba..7c3de886e27 100644 --- a/examples/mediatek/requirements.txt +++ b/examples/mediatek/requirements.txt @@ -4,3 +4,5 @@ safetensors sentencepiece tokenizers transformers +piq +pillow diff --git a/examples/mediatek/shell_scripts/export_oss.sh b/examples/mediatek/shell_scripts/export_oss.sh new file mode 100755 index 00000000000..3da5dc41f94 --- /dev/null +++ b/examples/mediatek/shell_scripts/export_oss.sh @@ -0,0 +1,29 @@ +model=$1 + +echo "Export model: $model" + +if [ $model = "deeplabv3" ] +then + python3 model_export_scripts/deeplab_v3.py -d +elif [ $model = "edsr" ] +then + python3 model_export_scripts/edsr.py -d +elif [ $model = "inceptionv3" ] +then + python3 model_export_scripts/inception_v3.py -d PATH_TO_DATASET +elif [ $model = "inceptionv4" ] +then + python3 model_export_scripts/inception_v4.py -d PATH_TO_DATASET +elif [ $model = "mobilenetv2" ] +then + python3 model_export_scripts/mobilenet_v2.py -d PATH_TO_DATASET +elif [ $model = "mobilenetv3" ] +then + python3 model_export_scripts/mobilenet_v3.py -d PATH_TO_DATASET +elif [ $model = "resnet18" ] +then + python3 model_export_scripts/resnet18.py -d PATH_TO_DATASET +elif [ $model = "resnet50" ] +then + python3 model_export_scripts/resnet50.py -d PATH_TO_DATASET +fi diff --git a/examples/models/flamingo/preprocess/export_preprocess_lib.py b/examples/models/flamingo/preprocess/export_preprocess_lib.py index 358b1f2149a..366f5989222 100644 --- a/examples/models/flamingo/preprocess/export_preprocess_lib.py +++ b/examples/models/flamingo/preprocess/export_preprocess_lib.py @@ -14,7 +14,7 @@ from executorch.extension.llm.custom_ops import preprocess_custom_ops # noqa from torch.export import Dim, ExportedProgram -from torchtune.models.clip.inference._transforms import _CLIPImageTransform +from torchtune.models.clip.inference._transform import _CLIPImageTransform def get_example_inputs() -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: diff --git a/examples/models/flamingo/preprocess/test_preprocess.py b/examples/models/flamingo/preprocess/test_preprocess.py index 34ad0ab8ed1..b990f44ca1b 100644 --- a/examples/models/flamingo/preprocess/test_preprocess.py +++ b/examples/models/flamingo/preprocess/test_preprocess.py @@ -22,7 +22,7 @@ from parameterized import parameterized from PIL import Image -from torchtune.models.clip.inference._transforms import ( +from torchtune.models.clip.inference._transform import ( _CLIPImageTransform, CLIPImageTransform, ) diff --git a/examples/models/llama2/TARGETS b/examples/models/llama2/TARGETS index 467949a5ebf..f1c56a5bda3 100644 --- 
a/examples/models/llama2/TARGETS +++ b/examples/models/llama2/TARGETS @@ -70,9 +70,12 @@ runtime.python_library( "export_llama.py", "export_llama_lib.py", "model.py", + "source_transformation/apply_spin_quant_r1_r2.py", "source_transformation/quantize.py", + "source_transformation/rms_norm.py", "source_transformation/rope.py", "source_transformation/sdpa.py", + "source_transformation/spin_quant.py", ], _is_external_target = True, base_module = "executorch.examples.models.llama2", @@ -83,6 +86,7 @@ runtime.python_library( "@EXECUTORCH_CLIENTS", ], deps = [ + "//ai_codesign/gen_ai/fast_hadamard_transform:fast_hadamard_transform", "//caffe2:torch", "//executorch/examples/models:model_base", "//executorch/examples/models:models", diff --git a/examples/models/llama2/eval_llama_lib.py b/examples/models/llama2/eval_llama_lib.py index 2d10f5edc0a..b8987ac5d49 100644 --- a/examples/models/llama2/eval_llama_lib.py +++ b/examples/models/llama2/eval_llama_lib.py @@ -41,6 +41,7 @@ def __init__( tokenizer: Union[SentencePieceTokenizer, Tiktoken], max_seq_length: Optional[int] = None, use_kv_cache: bool = False, + generate_full_logits: bool = False, enable_dynamic_shape: bool = True, ): super().__init__( @@ -48,6 +49,7 @@ def __init__( ) self._model = model.to(self.device) self._use_kv_cache = use_kv_cache + self._generate_full_logits = generate_full_logits self._enable_dynamic_shape = enable_dynamic_shape def _model_call(self, inps): @@ -60,7 +62,10 @@ def _model_call(self, inps): pos_tensor = torch.tensor([pos], dtype=torch.int64) logits = self._model(inps[:, pos : pos + 1], pos_tensor) result_logits.append(logits) - return torch.cat(result_logits, dim=1) + if self._generate_full_logits: + return torch.cat(result_logits, dim=1) + else: + return torch.stack(result_logits, dim=1) else: pos_tensor = torch.tensor([0], dtype=torch.int64, device=self.device) # Batch process the whole sequence. diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index f6abc3aaf4e..97228bb5c5d 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -16,7 +16,7 @@ from enum import Enum from json import JSONDecodeError from pathlib import Path -from typing import List, Optional, Union +from typing import Callable, List, Optional, Union import pkg_resources @@ -45,10 +45,15 @@ from executorch.util.activation_memory_profiler import generate_memory_trace from ..model_factory import EagerModelFactory +from .source_transformation.apply_spin_quant_r1_r2 import ( + fuse_layer_norms, + get_model_with_r1_r2, +) from .source_transformation.quantize import ( get_quant_embedding_transform, get_quant_weight_transform, ) +from .source_transformation.rms_norm import replace_rms_norm_with_native_rms_norm from .source_transformation.rope import materialze_broadcast_of_rope_freq_cis from .source_transformation.sdpa import ( replace_causal_mask, @@ -224,6 +229,13 @@ def build_args_parser() -> argparse.ArgumentParser: default=f"{ckpt_dir}/params/demo_config.json", help="config.json", ) + parser.add_argument( + "--optimized_rotation_path", + default=None, + required=False, + help="[QNN backend] Optimized rotation checkpoint path. Just apply R1/R2 here." 
+ "You can download the optimized rotation matrices from https://github.com/facebookresearch/SpinQuant/tree/main", + ) parser.add_argument( "-m", "--metadata", @@ -287,6 +299,17 @@ def build_args_parser() -> argparse.ArgumentParser: parser.add_argument("-V", "--vulkan", action="store_true") parser.add_argument("--mps", action="store_true") parser.add_argument("--coreml", action="store_true") + parser.add_argument( + "--coreml-enable-state", + action="store_true", + help="This option is only for coreml, and is only supported for MacOS15+/iOS18+", + ) + parser.add_argument( + "--coreml-quantize", + default=None, + choices=["b4w"], + help="This option is only for coreml: Use coreml quantization, e.g. b4w (for blockwise 4 bit weight)", + ) parser.add_argument( "--qnn", action="store_true", @@ -315,6 +338,23 @@ def build_args_parser() -> argparse.ArgumentParser: default=False, help="Generate logits for all inputs.", ) + + parser.add_argument( + "--soc_model", + help="[QNN backend] SoC model of current device. e.g. 'SM8650' for Snapdragon 8 Gen 3.", + type=str, + required=False, + default="SM8650", + ) + + parser.add_argument( + "-sq", + "--use_spin_quant", + type=str, + default=None, + choices=["cuda", "native"], + help="Use SpinQuant for better quantization performance. Only support cuda and native.", + ) return parser @@ -386,35 +426,6 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager: else: dtype_override = None - # source transforms - transforms = [] - if args.quantization_mode: - modelname = f"{modelname}_q" - transforms.append( - get_quant_weight_transform(args, dtype_override, verbose_export()) - ) - - if args.embedding_quantize: - modelname = f"{modelname}_e" - transforms.append(get_quant_embedding_transform(args)) - - if args.expand_rope_table: - transforms.append(materialze_broadcast_of_rope_freq_cis) - - if args.use_sdpa_with_kv_cache: - transforms.append(replace_sdpa_with_custom_op) - - if args.use_kv_cache: - if args.qnn: - transforms.append(replace_kv_cache_with_simple_kv_cache) - transforms.append(replace_sdpa_with_flex_sdpa) - transforms.append(replace_causal_mask) - - elif args.coreml or args.mps: - # Currently qnn/coreml/mps doesn't support sdpa op, use the simpler decomposition - # to get free perf gain. 
- transforms.append(replace_sdpa_with_simple_sdpa) - transforms.append(replace_causal_mask) return ( _load_llama_model( modelname=modelname, @@ -438,7 +449,7 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager: ) .set_output_dir(output_dir_path) .to_dtype(dtype_override) - .source_transform(transforms) + .source_transform(_get_source_transforms(modelname, dtype_override, args)) ) @@ -515,7 +526,10 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901 if args.coreml: coreml_partitioner = get_coreml_partitioner( - args.use_kv_cache, args.pt2e_quantize + args.use_kv_cache and args.coreml_enable_state, + args.embedding_quantize, + args.pt2e_quantize, + args.coreml_quantize, ) partitioners.append(coreml_partitioner) modelname = f"coreml_{modelname}" @@ -525,7 +539,7 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901 partitioners.append( get_qnn_partitioner( - args.use_kv_cache, args.pt2e_quantize, args.num_sharding + args.use_kv_cache, args.pt2e_quantize, args.num_sharding, args.soc_model ) ) # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.utils.utils` @@ -552,7 +566,10 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901 if args.num_sharding > 0 and args.qnn: from executorch.backends.qualcomm.utils.utils import canonicalize_program - canonicalize_program(builder.edge_manager.exported_program()) + # TODO: Need to remove this once we have better way to handle buffer size + canonicalize_program( + builder.edge_manager.exported_program(), custom_buffer_size=542048256 + ) builder = builder.to_executorch() @@ -569,7 +586,10 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901 if args.num_sharding > 0 and args.qnn: from executorch.backends.qualcomm.utils.utils import canonicalize_program - canonicalize_program(builder.edge_manager.exported_program()) + # TODO: Need to remove this once we have better way to handle buffer size + canonicalize_program( + builder.edge_manager.exported_program(), custom_buffer_size=542048256 + ) builder = builder.to_executorch() @@ -700,6 +720,7 @@ def _load_llama_model( max_seq_len=model.params.max_seq_len, dtype=dtype, use_kv_cache=use_kv_cache, + generate_full_logits=generate_full_logits, example_inputs=example_inputs, enable_dynamic_shape=enable_dynamic_shape, calibration_tasks=calibration_tasks, @@ -718,3 +739,59 @@ def _load_llama_model( ), args=args, ) + + +def _get_source_transforms( + modelname: str, dtype_override: Optional[DType], args +) -> List[Callable[[torch.nn.Module], torch.nn.Module]]: + transforms = [] + if args.quantization_mode: + modelname = f"{modelname}_q" + transforms.append( + get_quant_weight_transform(args, dtype_override, verbose_export()) + ) + + if args.embedding_quantize: + modelname = f"{modelname}_e" + transforms.append(get_quant_embedding_transform(args)) + + if args.expand_rope_table: + transforms.append(materialze_broadcast_of_rope_freq_cis) + + if args.use_sdpa_with_kv_cache: + transforms.append(replace_sdpa_with_custom_op) + + if args.use_kv_cache: + if args.qnn: + # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.utils.utils` + from executorch.backends.qualcomm.utils.utils import ( + convert_linear_to_conv2d, + ) + + transforms.append(replace_kv_cache_with_simple_kv_cache) + transforms.append(replace_sdpa_with_flex_sdpa) + transforms.append(replace_causal_mask) + 
transforms.append(replace_rms_norm_with_native_rms_norm) + if args.optimized_rotation_path: + transforms.append(fuse_layer_norms) + transforms.append(get_model_with_r1_r2(args.optimized_rotation_path)) + transforms.append(convert_linear_to_conv2d) + + elif args.coreml or args.mps: + # Currently qnn/coreml/mps doesn't support sdpa op, use the simpler decomposition + # to get free perf gain. + transforms.append(replace_sdpa_with_simple_sdpa) + transforms.append(replace_causal_mask) + + if args.use_spin_quant: + if args.use_spin_quant == "cuda": + from .source_transformation.spin_quant import ( + inject_fast_hadamard_transform_cuda_for_spin_quant, + ) + + transforms.append(inject_fast_hadamard_transform_cuda_for_spin_quant) + + elif args.use_spin_quant == "native": + raise NotImplementedError("native SpinQuant is not implemented yet.") + + return transforms diff --git a/examples/models/llama2/llama_transformer.py b/examples/models/llama2/llama_transformer.py index 0c93115ee3b..534d90c6ed9 100644 --- a/examples/models/llama2/llama_transformer.py +++ b/examples/models/llama2/llama_transformer.py @@ -39,6 +39,7 @@ def __init__(self, dim: int, eps: float = 1e-6): """ super().__init__() + self.dim = dim self.eps = eps self.weight = nn.Parameter(torch.ones(dim)) diff --git a/examples/models/llama2/source_transformation/apply_spin_quant_r1_r2.py b/examples/models/llama2/source_transformation/apply_spin_quant_r1_r2.py new file mode 100644 index 00000000000..e71007b1958 --- /dev/null +++ b/examples/models/llama2/source_transformation/apply_spin_quant_r1_r2.py @@ -0,0 +1,179 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import typing + +import torch + + +def rotate_embeddings(model, R1: torch.Tensor) -> None: + # Rotate the embeddings. + for W in [model.tok_embeddings]: + dtype = W.weight.data.dtype + W_ = W.weight.data.to(device="cpu", dtype=torch.float32) + W.weight.data = torch.matmul(W_, R1).to(device="cpu", dtype=dtype) + + +def rotate_attention_inputs(layer, R1) -> None: + # Rotate the WQ, WK and WV matrices of the self-attention layer. + for W in [layer.attention.wq, layer.attention.wk, layer.attention.wv]: + dtype = W.weight.dtype + W_ = W.weight.to(device="cpu", dtype=torch.float32) + W.weight.data = torch.matmul(W_, R1).to(device="cpu", dtype=dtype) + + +def rotate_attention_output(layer, R1) -> None: + # Rotate output matrix of the self-attention layer. + W = layer.attention.wo + dtype = W.weight.data.dtype + W_ = W.weight.data.to(device="cpu", dtype=torch.float32) + W.weight.data = torch.matmul(R1.T, W_).to(device="cpu", dtype=dtype) + if W.bias is not None: + b = W.bias.data.to(device="cpu", dtype=torch.float32) + W.bias.data = torch.matmul(R1.T, b).to(device="cpu", dtype=dtype) + + +def rotate_mlp_input(layer, R1): + # Rotate the MLP input weights. + mlp_inputs = [layer.feed_forward.w3, layer.feed_forward.w1] + for W in mlp_inputs: + dtype = W.weight.dtype + W_ = W.weight.data.to(device="cpu", dtype=torch.float32) + W.weight.data = torch.matmul(W_, R1).to(device="cpu", dtype=dtype) + + +def rotate_mlp_output(layer, R1): + # Rotate the MLP output weights and bias. 
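The rotate_* helpers above all lean on one identity: for an orthogonal R1, rotating the activations by R1 (via the embedding weights) while replacing each consuming weight W with W @ R1 cancels exactly, so the network computes the same function while its activation distributions become friendlier to quantize. A quick numerical check of that identity:

```python
import torch

dim, out_features = 8, 4
# A random orthogonal matrix via QR decomposition stands in for R1.
R1, _ = torch.linalg.qr(torch.randn(dim, dim))

x = torch.randn(2, dim)              # activations
W = torch.randn(out_features, dim)   # linear weight (out_features, in_features)

y_ref = x @ W.T                      # original layer
y_rot = (x @ R1) @ (W @ R1).T        # rotated activations + rotated weight

# R1 @ R1.T == I, so the rotation is lossless.
assert torch.allclose(y_ref, y_rot, atol=1e-5)
```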
+ W = layer.feed_forward.w2 + dtype = W.weight.data.dtype + W_ = W.weight.data.to(device="cpu", dtype=torch.float32) + W.weight.data = torch.matmul(R1.T, W_).to(device="cpu", dtype=dtype) + + if W.bias is not None: + b = W.bias.data.to(device="cpu", dtype=torch.float32) + W.bias.data = torch.matmul(R1.T, b).to(device="cpu", dtype=dtype) + + +def rotate_head(model, R1: torch.Tensor) -> None: + # Rotate the head. + W = model.output + dtype = W.weight.data.dtype + W_ = W.weight.data.to(device="cpu", dtype=torch.float32) + W.weight.data = torch.matmul(W_, R1).to(device="cpu", dtype=dtype) + + +def rotate_ov_proj(layer, head_dim, R2=None): + W = layer.attention.wv + dtype = W.weight.data.dtype + W_ = W.weight.data.to(device="cpu", dtype=torch.float32).t() + transposed_shape = W_.shape + temp = W_.reshape(-1, transposed_shape[-1] // head_dim, head_dim) + temp = temp.to(torch.float32) @ R2 + W_ = temp.reshape(transposed_shape).t() + W.weight.data = W_.to(device="cpu", dtype=dtype) + + W = layer.attention.wo + dtype = W.weight.data.dtype + W_ = W.weight.data.to(device="cpu", dtype=torch.float32) + init_shape = W_.shape + temp = W_.reshape(-1, init_shape[-1] // head_dim, head_dim) + temp = temp.to(torch.float32) @ R2 + W_ = temp.reshape(init_shape) + W.weight.data = W_.to(device="cpu", dtype=dtype) + + +def cleanup_memory() -> None: + """Run GC and clear GPU memory.""" + import gc + + # gc.collect and empty cache are necessary to clean up GPU memory if the model was distributed + gc.collect() + + +def get_model_with_r1_r2(optimized_rotation_path: str): + return lambda model: apply_spin_quant_r1_r2(model, optimized_rotation_path) + + +def apply_spin_quant_r1_r2(model: torch.nn.Module, optimized_rotation_path: str): + optimized_rotation = torch.load(optimized_rotation_path, weights_only=True) + R1 = optimized_rotation["R1"].to(torch.float32) + config = model.params + num_heads = config.n_heads + head_dim = config.dim // num_heads + + rotate_embeddings(model, R1) + rotate_head(model, R1) + cleanup_memory() + + for idx, layer in enumerate(model.layers): + key = f"model.layers.{idx}.self_attn.R2" + R2 = optimized_rotation[key].to(torch.float32) + rotate_attention_inputs(layer, R1) + rotate_attention_output(layer, R1) + rotate_mlp_input(layer, R1) + rotate_mlp_output(layer, R1) + rotate_ov_proj(layer, head_dim, R2=R2) + return model + + +def fuse_ln_linear( + layernorm: torch.nn.Module, linear_layers: typing.Iterable[torch.nn.Linear] +) -> None: + """ + fuse the linear operations in Layernorm into the adjacent linear blocks. + """ + for linear in linear_layers: + linear_dtype = linear.weight.dtype + + # Calculating new weight and bias + W_ = linear.weight.data.to(dtype=torch.float32) + linear.weight.data = (W_ * layernorm.weight.to(dtype=torch.float32)).to( + linear_dtype + ) + + if hasattr(layernorm, "bias"): + if linear.bias is None: + linear.bias = torch.nn.Parameter( + torch.zeros(linear.out_features, dtype=torch.float32) + ) + linear.bias.data = linear.bias.data.to(dtype=torch.float32) + torch.matmul( + W_, layernorm.bias.to(dtype=torch.float32) + ) + linear.bias.data = linear.bias.data.to(linear_dtype) + + +def fuse_layer_norms(model: torch.nn.Module): + # Embedding fusion + for W in [model.tok_embeddings]: + W_ = W.weight.data.to(dtype=torch.float32) + W.weight.data = (W_ - W_.mean(dim=-1, keepdim=True)).to(W.weight.data.dtype) + + # Fuse the linear operations in Layernorm into the adjacent linear blocks. 
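fuse_ln_linear above relies on the standard norm-folding identity: an elementwise scale gamma applied before a linear equals scaling the linear's input columns by gamma, which is why fuse_layer_norms can afterwards reset each norm weight to ones. A small numerical check:

```python
import torch

dim, out_features = 6, 3
gamma = torch.randn(dim)            # the norm's learned scale
W = torch.randn(out_features, dim)  # the adjacent linear's weight
x = torch.randn(2, dim)             # stands in for normalized activations

y_ref = (x * gamma) @ W.T           # scale, then linear
y_fused = x @ (W * gamma).T         # folded weight, gamma now implicit

assert torch.allclose(y_ref, y_fused, atol=1e-5)
```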
+    for layer in model.layers:
+        # fuse the input layernorms into the linear layers
+        fuse_ln_linear(layer.ffn_norm, [layer.feed_forward.w3, layer.feed_forward.w1])
+        fuse_ln_linear(
+            layer.attention_norm,
+            [
+                layer.attention.wq,
+                layer.attention.wk,
+                layer.attention.wv,
+            ],
+        )
+
+        W_norm = layer.ffn_norm.weight.data
+        layer.ffn_norm.weight.data = torch.ones_like(W_norm, dtype=torch.float32)
+        W_norm = layer.attention_norm.weight.data
+        layer.attention_norm.weight.data = torch.ones_like(W_norm, dtype=torch.float32)
+
+    fuse_ln_linear(
+        model.norm,
+        [model.output],
+    )
+    W_norm = model.norm.weight.data
+    model.norm.weight.data = torch.ones_like(W_norm, dtype=torch.float32)
+
+    return model
diff --git a/examples/models/llama2/source_transformation/rms_norm.py b/examples/models/llama2/source_transformation/rms_norm.py
new file mode 100644
index 00000000000..ff7e8b67457
--- /dev/null
+++ b/examples/models/llama2/source_transformation/rms_norm.py
@@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.examples.models.llama2.llama_transformer import RMSNorm
+
+
+def replace_rms_norm_with_native_rms_norm(module: torch.nn.Module):
+    for name, child in module.named_children():
+        if isinstance(child, RMSNorm):
+            rms_norm = torch.nn.RMSNorm(child.dim, eps=child.eps)
+            rms_norm.weight = child.weight
+            setattr(
+                module,
+                name,
+                rms_norm,
+            )
+        else:
+            replace_rms_norm_with_native_rms_norm(child)
+    return module
diff --git a/examples/models/llama2/source_transformation/sdpa.py b/examples/models/llama2/source_transformation/sdpa.py
index 8e5de7d97ae..c48fdf0ae58 100644
--- a/examples/models/llama2/source_transformation/sdpa.py
+++ b/examples/models/llama2/source_transformation/sdpa.py
@@ -118,8 +118,9 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
     num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
     """
-    if n_rep == 1:
-        return hidden_states
+    # TODO: Encountered a bug with source partitioning; needs more investigation.
+    # if n_rep == 1:
+    #     return hidden_states
 
     new_kv = []
     batch, n_heads, seqlen, head_dim = hidden_states.shape
diff --git a/examples/models/llama2/source_transformation/spin_quant.py b/examples/models/llama2/source_transformation/spin_quant.py
new file mode 100644
index 00000000000..7b38312c182
--- /dev/null
+++ b/examples/models/llama2/source_transformation/spin_quant.py
@@ -0,0 +1,55 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+# Helper functions for transforming the model to be able to run SpinQuant.
+# See https://github.com/facebookresearch/SpinQuant for more details about SpinQuant.
+
+import torch
+
+import torch.nn.functional as F
+
+from executorch.examples.models.llama2.llama_transformer import FeedForward
+from torch import nn
+
+
+def _inject_fast_hadamard_transform_cuda_for_spin_quant(module: torch.nn.Module):
+    """
+    SpinQuant needs two Hadamard matrices: R3 and R4. Here we are only injecting R4 in the feed forward layer.
+    R3 needs to be injected as well when KV cache quantization is enabled.
+ """ + try: + from fast_hadamard_transform import hadamard_transform + except ImportError: + raise ImportError( + "Please install fast-hadamard-transform: pip install fast-hadamard-transform" + ) + + class FeedForwardCustom(nn.Module): + def __init__(self, w1, w2, w3): + super().__init__() + self.w1 = w1 + self.w2 = w2 + self.w3 = w3 + + def forward(self, x): + w = F.silu(self.w1(x)) * self.w3(x) + n = w.shape[-1] + return self.w2(hadamard_transform(w.contiguous()) / torch.tensor(n).sqrt()) + + for name, child in module.named_children(): + if isinstance(child, FeedForward): + setattr(module, name, FeedForwardCustom(child.w1, child.w2, child.w3)) + else: + _inject_fast_hadamard_transform_cuda_for_spin_quant(child) + + +def inject_fast_hadamard_transform_cuda_for_spin_quant( + module: torch.nn.Module, +) -> torch.nn.Module: + _inject_fast_hadamard_transform_cuda_for_spin_quant(module) + return module diff --git a/examples/models/llava/runner/llava_runner.cpp b/examples/models/llava/runner/llava_runner.cpp index 64763c72576..1924b057ec4 100644 --- a/examples/models/llava/runner/llava_runner.cpp +++ b/examples/models/llava/runner/llava_runner.cpp @@ -99,12 +99,17 @@ Error LlavaRunner::generate_from_pos( int64_t start_pos, std::function token_callback, std::function - stats_callback) { + stats_callback, + bool echo) { // prefill user prompt. No BOS because preset prompt already has it. - token_callback(prompt); + if (echo) { + token_callback(prompt); + } uint64_t prefill_next_token = ET_UNWRAP(prefill_prompt(prompt, start_pos, /*bos=*/0, /*eos*/ 0)); + stats_.first_token_ms = util::time_in_ms(); + stats_.prompt_eval_end_ms = util::time_in_ms(); stats_.num_prompt_tokens = start_pos; // Generate tokens @@ -113,7 +118,6 @@ Error LlavaRunner::generate_from_pos( // Bookkeeping stats_.num_generated_tokens = num_generated_tokens; - ::executorch::llm::print_report(stats_); if (stats_callback) { stats_callback(stats_); } @@ -125,7 +129,8 @@ Error LlavaRunner::generate( const std::string& prompt, int32_t seq_len, std::function token_callback, - std::function stats_callback) { + std::function stats_callback, + bool echo) { ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null"); if (!is_loaded()) { ET_CHECK_OK_OR_RETURN_ERROR(load()); @@ -147,6 +152,7 @@ Error LlavaRunner::generate( }; int64_t pos = 0; + stats_.inference_start_ms = util::time_in_ms(); // prefill preset prompt prefill_prompt(kPresetPrompt, pos, /*bos=*/1, /*eos*/ 0); @@ -160,8 +166,11 @@ Error LlavaRunner::generate( util::get_rss_bytes() / 1024.0 / 1024.0); // Generate tokens - Error err = - generate_from_pos(prompt, seq_len, pos, wrapped_callback, stats_callback); + Error err = generate_from_pos( + prompt, seq_len, pos, wrapped_callback, stats_callback, echo); + + stats_.inference_end_ms = util::time_in_ms(); + ::executorch::llm::print_report(stats_); ET_LOG( Info, diff --git a/examples/models/llava/runner/llava_runner.h b/examples/models/llava/runner/llava_runner.h index 923f8180a83..e671718ae5e 100644 --- a/examples/models/llava/runner/llava_runner.h +++ b/examples/models/llava/runner/llava_runner.h @@ -36,7 +36,8 @@ class LlavaRunner : public MultimodalRunner { int32_t seq_len = 1024, std::function token_callback = {}, std::function - stats_callback = {}); + stats_callback = {}, + bool echo = true); /** * Prefill an LLaVA Module with the given images input. @@ -70,6 +71,7 @@ class LlavaRunner : public MultimodalRunner { * @param start_pos The starting position in KV cache of the input in the LLM. 
* @param token_callback What to do after a token is generated. * @param stats_callback What to do with Stats. + * @param echo Whether to echo the input prompt or not. * @return The error code. */ Error generate_from_pos( @@ -78,7 +80,8 @@ class LlavaRunner : public MultimodalRunner { int64_t start_pos = 0, std::function token_callback = {}, std::function - stats_callback = {}); + stats_callback = {}, + bool echo = true); private: inline static const std::string kPresetPrompt = diff --git a/examples/qualcomm/oss_scripts/llama2/llama.py b/examples/qualcomm/oss_scripts/llama2/llama.py index f7fda3b9849..df8c876abf2 100644 --- a/examples/qualcomm/oss_scripts/llama2/llama.py +++ b/examples/qualcomm/oss_scripts/llama2/llama.py @@ -16,8 +16,7 @@ from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner from executorch.backends.qualcomm.passes.build_quant_io import BuildQuantIo -from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer, QuantDtype -from executorch.backends.qualcomm.quantizer.utils import get_16a4w_qnn_ptq_config +from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( QcomChipset, ) @@ -34,13 +33,13 @@ ) from executorch.examples.qualcomm.utils import ( make_output_dir, + make_quantizer, setup_common_args_and_variables, SimpleADB, ) from executorch.exir import EdgeCompileConfig, EdgeProgramManager from executorch.exir.capture._config import ExecutorchBackendConfig from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass -from executorch.exir.program._program import _get_updated_graph_signature from executorch.extension.llm.export.builder import DType from sentencepiece import SentencePieceProcessor @@ -274,20 +273,12 @@ def _tag_kv_ios(self, gm: torch.fx.GraphModule, kv_type): def quantize(self, quant_dtype, custom_annotations=()): self.quant_dtype = quant_dtype - quantizer = QnnQuantizer() - quantizer.set_per_channel_linear_quant(True) - quantizer.set_per_channel_conv_quant(True) - - if quant_dtype == QuantDtype.use_8a8w: - pass # default setting - elif quant_dtype == QuantDtype.use_16a4w: - quantizer.add_16bit_quant_ops(quantizer.SUPPORTED_OPS) - quantizer.set_bit16_op_quant_config( - get_16a4w_qnn_ptq_config(act_observer=MinMaxObserver) - ) - quantizer.set_per_channel_weight_dtype(weight_dtype_for_16bit_act="int4") - else: - raise AssertionError(f"No support for QuantDtype {quant_dtype}.") + quantizer = make_quantizer( + quant_dtype=quant_dtype, + per_channel_conv=True, + per_channel_linear=True, + act_observer=MinMaxObserver, + ) quantizer.add_custom_quant_annotations(custom_annotations) self.has_quant_io = True @@ -367,6 +358,7 @@ def compile(args): ) end_load_ts = time.time() print("torch.load checkpoint", end_load_ts - start_ts) + llama_instance = None with torch.device("meta"): llama_instance = LlamaModel(config, output_new_cache_only=True) @@ -383,16 +375,13 @@ def compile(args): for layer in llama_instance.layers: if getattr(layer.attention, "prepare_sha", None): layer.attention.prepare_sha() - kv_type = torch.uint8 - if args.ptq == "8a8w": - quant_dtype = QuantDtype.use_8a8w - elif args.ptq == "16a4w": - quant_dtype = QuantDtype.use_16a4w - else: - raise AssertionError( - f"No support for quant type {args.ptq}. Support 8a8w and 16a4w." - ) + kv_type = torch.uint8 + assert args.ptq in [ + "8a8w", + "16a4w", + ], f"No support for quant type {args.ptq}. Support 8a8w and 16a4w." 
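The assert-plus-getattr pattern here (and the try/except variant in mobilebert_fine_tune.py below) resolves the --ptq string to a QuantDtype member by name instead of walking an if/elif ladder. The pattern in isolation, with a hypothetical stand-in enum so the sketch needs no executorch install:

```python
from enum import Enum, auto


class QuantDtypeSketch(Enum):  # hypothetical stand-in for QuantDtype
    use_8a8w = auto()
    use_16a4w = auto()


ptq = "16a4w"
assert ptq in ["8a8w", "16a4w"], f"No support for quant type {ptq}."
quant_dtype = getattr(QuantDtypeSketch, f"use_{ptq}")
```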
+    quant_dtype = getattr(QuantDtype, f"use_{args.ptq}")
 
     assert args.tokenizer_model is not None, "Need tokenizer model for calibration"
 
     if args.dtype_override is not None:
diff --git a/examples/qualcomm/scripts/mobilebert_fine_tune.py b/examples/qualcomm/scripts/mobilebert_fine_tune.py
index 278ab8e8c02..605bb27d330 100755
--- a/examples/qualcomm/scripts/mobilebert_fine_tune.py
+++ b/examples/qualcomm/scripts/mobilebert_fine_tune.py
@@ -13,13 +13,24 @@
 import torch
 from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype
+from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import (
+    QcomChipset,
+)
+from executorch.backends.qualcomm.utils.utils import (
+    generate_htp_compiler_spec,
+    generate_qnn_executorch_compiler_spec,
+    skip_annotation,
+)
 from executorch.examples.qualcomm.utils import (
     build_executorch_binary,
     make_output_dir,
+    make_quantizer,
     parse_skip_delegation_node,
+    QnnPartitioner,
     setup_common_args_and_variables,
     SimpleADB,
 )
+from executorch.exir import to_edge
 from transformers import BertTokenizer, MobileBertForSequenceClassification
 
 
@@ -204,8 +215,6 @@ def get_fine_tuned_mobilebert(artifacts_dir, pretrained_weight, batch_size):
     )
 
     model.load_state_dict(
-        # TODO: If possible, it's better to set weights_only to True
-        # https://pytorch.org/docs/stable/generated/torch.load.html
         torch.load(
             (
                 f"{artifacts_dir}/finetuned_mobilebert_epoch_{epochs}.model"
@@ -213,7 +222,7 @@
             else pretrained_weight
             ),
             map_location=torch.device("cpu"),
-            weights_only=False,
+            weights_only=True,
        ),
    )
 
@@ -232,38 +241,65 @@ def main(args):
             "Please specify a device serial by -s/--device argument."
         )
 
-    pte_filename = "ptq_mb_qnn" if args.ptq else "mb_qnn"
-    batch_size = 1 if args.ptq else 3
+    batch_size, pte_filename = 1, "ptq_mb_qnn"
     model, data_val, labels = get_fine_tuned_mobilebert(
         args.artifact, args.pretrained_weight, batch_size
     )
     inputs, input_list = get_dataset(data_val)
 
-    if args.ptq == "8a8w":
-        quant_dtype = QuantDtype.use_8a8w
-    elif args.ptq == "16a16w":
-        quant_dtype = QuantDtype.use_16a16w
-    elif args.ptq == "16a4w":
-        quant_dtype = QuantDtype.use_16a4w
-    else:
+    try:
+        quant_dtype = getattr(QuantDtype, f"use_{args.ptq}")
+    except AttributeError:
         raise AssertionError(
             f"No support for quant type {args.ptq}. Support 8a8w, 16a16w and 16a4w."
) if args.use_fp16: quant_dtype = None + pte_filename = "mb_qnn" + build_executorch_binary( + model, + inputs[0], + args.model, + f"{args.artifact}/{pte_filename}", + inputs, + skip_node_id_set=skip_node_id_set, + skip_node_op_set=skip_node_op_set, + quant_dtype=quant_dtype, + shared_buffer=args.shared_buffer, + ) + else: - build_executorch_binary( - model, - inputs[0], - args.model, - f"{args.artifact}/{pte_filename}", - inputs, - skip_node_id_set=skip_node_id_set, - skip_node_op_set=skip_node_op_set, - quant_dtype=quant_dtype, - shared_buffer=args.shared_buffer, - ) + def calibrator(gm): + for input in inputs: + gm(*input) + + quantizer = make_quantizer(quant_dtype=quant_dtype) + backend_options = generate_htp_compiler_spec(quant_dtype is not None) + partitioner = QnnPartitioner( + generate_qnn_executorch_compiler_spec( + soc_model=getattr(QcomChipset, args.model), + backend_options=backend_options, + ), + skip_node_id_set=skip_node_id_set, + skip_node_op_set=skip_node_op_set, + ) + # skip embedding layer cause it's quantization sensitive + graph_module, _ = skip_annotation( + nn_module=model, + quantizer=quantizer, + partitioner=partitioner, + sample_input=inputs[0], + calibration_cb=calibrator, + fp_node_op_set={torch.ops.aten.embedding.default}, + ) + # lower all graph again, the skipped operators will be left in CPU + exec_prog = to_edge( + torch.export.export(graph_module, inputs[0]), + ).to_executorch() + + with open(f"{args.artifact}/{pte_filename}.pte", "wb") as file: + file.write(exec_prog.buffer) if args.compile_only: sys.exit(0) diff --git a/examples/qualcomm/utils.py b/examples/qualcomm/utils.py index 1a748bb45e1..5d9a3aef262 100755 --- a/examples/qualcomm/utils.py +++ b/examples/qualcomm/utils.py @@ -19,6 +19,7 @@ from executorch.backends.qualcomm.quantizer.quantizer import ( get_16a4w_qnn_ptq_config, get_default_16bit_qnn_ptq_config, + get_default_8bit_qnn_ptq_config, QnnQuantizer, QuantDtype, ) @@ -30,7 +31,7 @@ generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, ) -from executorch.exir import EdgeCompileConfig, EdgeProgramManager +from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge from executorch.exir.backend.backend_api import to_backend from executorch.exir.capture._config import ExecutorchBackendConfig from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass @@ -178,6 +179,39 @@ def pull_etdump(self, output_path, callback=None): callback() +def make_quantizer( + quant_dtype: Optional[QuantDtype], + custom_annotations=(), + per_channel_conv=True, + per_channel_linear=False, + act_observer=MovingAverageMinMaxObserver, +): + quantizer = QnnQuantizer() + quantizer.add_custom_quant_annotations(custom_annotations) + quantizer.set_per_channel_conv_quant(per_channel_conv) + quantizer.set_per_channel_linear_quant(per_channel_linear) + + if quant_dtype == QuantDtype.use_8a8w: + quantizer.set_bit8_op_quant_config( + get_default_8bit_qnn_ptq_config(act_observer=act_observer) + ) + elif quant_dtype == QuantDtype.use_16a16w: + quantizer.add_16bit_quant_ops(quantizer.SUPPORTED_OPS) + quantizer.set_bit16_op_quant_config( + get_default_16bit_qnn_ptq_config(act_observer=act_observer) + ) + elif quant_dtype == QuantDtype.use_16a4w: + quantizer.add_16bit_quant_ops(quantizer.SUPPORTED_OPS) + quantizer.set_bit16_op_quant_config( + get_16a4w_qnn_ptq_config(act_observer=act_observer) + ) + quantizer.set_per_channel_weight_dtype(weight_dtype_for_16bit_act="int4") + else: + raise AssertionError(f"No support for QuantDtype {quant_dtype}.") 
+ + return quantizer + + # TODO: refactor to support different backends def build_executorch_binary( model, # noqa: B006 @@ -195,27 +229,13 @@ def build_executorch_binary( act_observer=MovingAverageMinMaxObserver, ): if quant_dtype is not None: - quantizer = QnnQuantizer() - quantizer.add_custom_quant_annotations(custom_annotations) - quantizer.set_per_channel_linear_quant(per_channel_linear) - quantizer.set_per_channel_conv_quant(True) - - if quant_dtype == QuantDtype.use_8a8w: - pass # default setting - elif quant_dtype == QuantDtype.use_16a16w: - quantizer.add_16bit_quant_ops(quantizer.SUPPORTED_OPS) - quantizer.set_bit16_op_quant_config( - get_default_16bit_qnn_ptq_config(act_observer=act_observer) - ) - elif quant_dtype == QuantDtype.use_16a4w: - quantizer.add_16bit_quant_ops(quantizer.SUPPORTED_OPS) - quantizer.set_bit16_op_quant_config( - get_16a4w_qnn_ptq_config(act_observer=act_observer) - ) - quantizer.set_per_channel_weight_dtype(weight_dtype_for_16bit_act="int4") - else: - raise AssertionError(f"No support for QuantDtype {quant_dtype}.") - + quantizer = make_quantizer( + quant_dtype=quant_dtype, + custom_annotations=custom_annotations, + per_channel_conv=True, + per_channel_linear=per_channel_linear, + act_observer=act_observer, + ) captured_model = torch.export.export(model, inputs).module() annotated_model = prepare_pt2e(captured_model, quantizer) print("Quantizing the model...") @@ -225,29 +245,20 @@ def build_executorch_binary( else: for data in dataset: annotated_model(*data) + quantized_model = convert_pt2e(annotated_model) edge_prog = capture_program(quantized_model, inputs) else: edge_prog = capture_program(model, inputs) - arch_table = { - "SM8650": QcomChipset.SM8650, - "SM8550": QcomChipset.SM8550, - "SM8475": QcomChipset.SM8475, - "SM8450": QcomChipset.SM8450, - } - backend_options = generate_htp_compiler_spec( use_fp16=False if quant_dtype else True ) qnn_partitioner = QnnPartitioner( generate_qnn_executorch_compiler_spec( - soc_model=arch_table[soc_model], + soc_model=getattr(QcomChipset, soc_model), backend_options=backend_options, - debug=False, - saver=False, shared_buffer=shared_buffer, - profile=False, ), skip_node_id_set, skip_node_op_set, @@ -263,15 +274,12 @@ def build_executorch_binary( alloc_graph_input=not shared_buffer, alloc_graph_output=not shared_buffer, ), - extract_delegate_segments=True, ) if metadata is None: - edge_prog.exported_program = to_backend( - edge_prog.exported_program, qnn_partitioner - ) - edge_prog.exported_program.graph_module.graph.print_tabular() - exec_prog = edge_prog.to_executorch(config=executorch_config) + exported_program = to_backend(edge_prog.exported_program, qnn_partitioner) + exported_program.graph_module.graph.print_tabular() + exec_prog = to_edge(exported_program).to_executorch(config=executorch_config) with open(f"{file_name}.pte", "wb") as file: file.write(exec_prog.buffer) else: diff --git a/exir/_serialize/_dataclass.py b/exir/_serialize/_dataclass.py index 8f6ef1c172b..013d733bcda 100644 --- a/exir/_serialize/_dataclass.py +++ b/exir/_serialize/_dataclass.py @@ -129,6 +129,13 @@ class Example data[key] = [_json_to_dataclass(e, T) for e in value] continue + # If T is a Union, then check which type in the Union it is and initialize. + # eg. 
Double type in schema.py
+        if get_origin(T) is Union:
+            res = [x for x in get_args(get_type_hints(cls)[key]) if x == type(value)]
+            data[key] = res[0](value)
+            continue
+
         # If T is an enum then lookup the value in the enum otherwise try to
         # cast value to whatever type is required
         if isinstance(T, enum.EnumMeta):
diff --git a/exir/_serialize/_flatbuffer.py b/exir/_serialize/_flatbuffer.py
index 93006612c73..4599249f00c 100644
--- a/exir/_serialize/_flatbuffer.py
+++ b/exir/_serialize/_flatbuffer.py
@@ -29,14 +29,6 @@ def _is_valid_alignment(alignment: int) -> bool:
     return alignment > 0 and (alignment & (alignment - 1)) == 0
 
 
-# TODO(T182299196): Replace this hack with a proper flatc binary.
-def _replace_infinity_in_json_file(content: str) -> str:
-    content = re.sub(
-        r'"double_val"\s*:\s*(-)?Infinity', r'"double_val": "\g<1>inf"', content
-    )
-    return content
-
-
 def _patch_schema_alignment(
     schema: bytes,
     constant_tensor_alignment: Optional[int],
@@ -291,11 +283,8 @@ def _program_json_to_flatbuffer(
     json_path = os.path.join(temp_dir, file_stem + ".json")
     output_path = os.path.join(temp_dir, file_stem + ".pte")
 
-    # TODO(T182299196): Replace this hack with a proper flatc binary.
-    replaced_program_json = _replace_infinity_in_json_file(program_json)
-
     with open(json_path, "wb") as json_file:
-        json_file.write(replaced_program_json.encode("ascii"))
+        json_file.write(program_json.encode("ascii"))
 
     try:
         _flatc_compile(temp_dir, schema_info.root_path, json_path)
@@ -330,6 +319,19 @@
     )
 
 
+def _replace_infinity_in_json_file(content: bytes) -> bytes:
+    """Replace inf and -inf float values with the strings "inf" and "-inf" in
+    the given JSON. program.fbs is used to convert from flatbuffer to JSON, but
+    JSON has no literal for +/-infinity, so the float values are replaced with
+    their string equivalents. When converting from JSON to python dataclasses,
+    the string is read as a Union of float and string (see schema.py).
+    """
+    content = re.sub(
+        rb'"double_val"\s*:\s*(-)?inf', rb'"double_val": "\g<1>inf"', content
+    )
+    return content
+
+
 def _program_flatbuffer_to_json(program_flatbuffer: bytes) -> bytes:
     """Converts binary flatbuffer data into Program-compatible JSON.
 
@@ -348,4 +350,5 @@
     _flatc_decompile(temp_dir, schema_info.root_path, bin_path)
     with open(json_path, "rb") as output_file:
-        return output_file.read()
+        json_data = output_file.read()
+        return _replace_infinity_in_json_file(json_data)
diff --git a/exir/_serialize/_program.py b/exir/_serialize/_program.py
index 2256d5fcc99..00a3d4700f0 100644
--- a/exir/_serialize/_program.py
+++ b/exir/_serialize/_program.py
@@ -553,6 +553,24 @@ def _restore_segments(program: Program, segment_data: bytes) -> Program:
             location=DataLocation.INLINE, index=data_index
         )
 
+    # Replace constants from constant_segment into constant_buffer.
+    if program.constant_segment and len(program.constant_segment.offsets) > 0:
+        buffers: List[Buffer] = []
+        constant_segment = segments[program.constant_segment.segment_index]
+        for i in range(len(program.constant_segment.offsets)):
+            start_offset = program.constant_segment.offsets[i]
+            # Note: this is the original end offset plus any padding between
+            # it and the next start offset.
+ end_offset = ( + program.constant_segment.offsets[i + 1] + if i < len(program.constant_segment.offsets) - 1 + else len(constant_segment) + ) + buffers.append(Buffer(storage=constant_segment[start_offset:end_offset])) + program.constant_buffer = buffers + program.constant_segment.segment_index = 0 + program.constant_segment.offsets = [] + # Clear out the segments list since the original Program didn't have one. program.segments = [] return program diff --git a/exir/_serialize/test/test_program.py b/exir/_serialize/test/test_program.py index afd8e3d282e..f20c0b39798 100644 --- a/exir/_serialize/test/test_program.py +++ b/exir/_serialize/test/test_program.py @@ -272,6 +272,15 @@ def constant_segment_with_tensor_alignment( f"{segment_table}", ) + # Convert back. + program2 = deserialize_pte_binary(pte_data) + # Programs are the same besides constant_buffer, as deserialization + # does not preserve constant segment; padding may be added + # during serialization. + self.assertEqual(program2.execution_plan, program.execution_plan) + # Number of constant tensors should be the same. + self.assertEqual(len(program2.constant_buffer), len(program.constant_buffer)) + def test_canonicalize_delegate_indices(self) -> None: def make_execution_plan( name: str, delegates: List[BackendDelegate] @@ -462,7 +471,6 @@ def gen_blob_data(size: int, pattern: bytes) -> bytes: assert len(ret) == size return ret - @unittest.skip("TODO(T181362263): Update restore segments to restore cords") def test_round_trip_with_segments(self) -> None: # Create a program with some delegate data blobs. program = get_test_program() @@ -803,6 +811,15 @@ def test_constant_segment_and_delegate_segment(self) -> None: + b"\x40\x44\x44", ) + # Convert back. + program2 = deserialize_pte_binary(pte_data) + # Programs are the same besides constant_buffer, as deserialization + # does not preserve constant segment; padding may be added + # during serialization. + self.assertEqual(program2.execution_plan, program.execution_plan) + # Number of constant tensors should be the same. + self.assertEqual(len(program2.constant_buffer), len(program.constant_buffer)) + # Common data for extended header tests. The two example values should produce # the example data. 
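The restore loop above recovers each constant's bytes by slicing between consecutive offsets, with the last tensor running to the end of the segment; inter-tensor alignment padding therefore stays attached to the preceding buffer, which is why the new round-trip tests compare buffer counts rather than exact bytes. In miniature:

```python
# Three constants packed into one segment, padded to 4-byte boundaries.
segment = b"\x11\x11\x00\x00" + b"\x22\x22\x22\x00" + b"\x33"
offsets = [0, 4, 8]

buffers = []
for i, start in enumerate(offsets):
    end = offsets[i + 1] if i < len(offsets) - 1 else len(segment)
    buffers.append(segment[start:end])

assert buffers == [b"\x11\x11\x00\x00", b"\x22\x22\x22\x00", b"\x33"]
```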
diff --git a/exir/backend/test/TARGETS b/exir/backend/test/TARGETS index b99f374d83c..5c3a5e3eb32 100644 --- a/exir/backend/test/TARGETS +++ b/exir/backend/test/TARGETS @@ -82,15 +82,14 @@ python_library( "//executorch/test/...", ], deps = [ - ":backend_with_compiler_demo", - "//caffe2:torch", - "//executorch/exir:graph_module", - "//executorch/exir/backend:compile_spec_schema", - "//executorch/exir/backend:partitioner", - "//executorch/exir/backend/canonical_partitioners:canonical_partitioner_lib", - "//executorch/exir/backend/test/demos/rpc:executor_backend_partitioner", - "//executorch/exir/backend/test/demos/rpc:executor_backend_preprocess", - "//executorch/exir/dialects:lib", + "fbcode//caffe2:torch", + "fbcode//executorch/exir:graph_module", + "fbcode//executorch/exir/backend:compile_spec_schema", + "fbcode//executorch/exir/backend:partitioner", + "fbcode//executorch/exir/backend/canonical_partitioners:canonical_partitioner_lib", + "fbcode//executorch/exir/backend/test:backend_with_compiler_demo", + "fbcode//executorch/exir/backend/test/demos/rpc:executor_backend_preprocess", + "fbcode//executorch/exir/dialects:lib", ], ) diff --git a/exir/backend/test/test_partitioner.py b/exir/backend/test/test_partitioner.py index 3973011a269..da1ae0444dd 100644 --- a/exir/backend/test/test_partitioner.py +++ b/exir/backend/test/test_partitioner.py @@ -39,9 +39,8 @@ _load_for_executorch_from_buffer, ) from executorch.extension.pytree import tree_flatten -from torch._export import capture_pre_autograd_graph from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param -from torch.export import export +from torch.export import export, export_for_training from torch.fx.passes.operator_support import any_chain @@ -77,7 +76,7 @@ def partition( mlp = MLP() example_inputs = mlp.get_random_inputs() - model = capture_pre_autograd_graph(mlp, example_inputs) + model = export_for_training(mlp, example_inputs).module() aten = export(model, example_inputs) spec_key = "path" spec_value = "/a/b/c/d" @@ -138,7 +137,7 @@ def partition( mlp = MLP() example_inputs = mlp.get_random_inputs() - model = capture_pre_autograd_graph(mlp, example_inputs) + model = export_for_training(mlp, example_inputs).module() aten = export(model, example_inputs) edge = exir.to_edge(aten) @@ -178,7 +177,7 @@ def partition( mlp = MLP() example_inputs = mlp.get_random_inputs() - model = capture_pre_autograd_graph(mlp, example_inputs) + model = export_for_training(mlp, example_inputs).module() edge = exir.to_edge(export(model, example_inputs)) with self.assertRaisesRegex( @@ -230,7 +229,7 @@ def partition( partition_tags=partition_tags, ) - model = capture_pre_autograd_graph(self.AddConst(), (torch.ones(2, 2),)) + model = export_for_training(self.AddConst(), (torch.ones(2, 2),)).module() edge = exir.to_edge(export(model, (torch.ones(2, 2),))) delegated = edge.to_backend(PartitionerNoTagData()) @@ -309,7 +308,7 @@ def partition( partition_tags=partition_tags, ) - model = capture_pre_autograd_graph(self.AddConst(), (torch.ones(2, 2),)) + model = export_for_training(self.AddConst(), (torch.ones(2, 2),)).module() edge = exir.to_edge(export(model, (torch.ones(2, 2),))) delegated = edge.to_backend(PartitionerTagData()) @@ -384,7 +383,7 @@ def partition( partition_tags=partition_tags, ) - model = capture_pre_autograd_graph(self.AddConst(), (torch.ones(2, 2),)) + model = export_for_training(self.AddConst(), (torch.ones(2, 2),)).module() edge = exir.to_edge(export(model, (torch.ones(2, 2),))) delegated = 
edge.to_backend(PartitionerTagData())
@@ -472,7 +471,7 @@ def partition(
         )

         inputs = (torch.ones(2, 2),)
-        model = capture_pre_autograd_graph(ReuseConstData(), (torch.ones(2, 2),))
+        model = export_for_training(ReuseConstData(), (torch.ones(2, 2),)).module()
         edge = exir.to_edge(export(model, (torch.ones(2, 2),)))
         exec_prog = edge.to_backend(PartitionerTagData()).to_executorch()
         executorch_module = _load_for_executorch_from_buffer(exec_prog.buffer)
@@ -532,7 +531,7 @@ def partition(
                 partition_tags=partition_tags,
             )

-        model = capture_pre_autograd_graph(ReuseConstData(), (torch.ones(2, 2),))
+        model = export_for_training(ReuseConstData(), (torch.ones(2, 2),)).module()
         edge = exir.to_edge(export(model, (torch.ones(2, 2),)))
         with self.assertRaises(RuntimeError) as error:
             _ = edge.to_backend(PartitionerTagData())
diff --git a/exir/backend/test/test_passes.py b/exir/backend/test/test_passes.py
index 8a43431520d..4dcc7757faa 100644
--- a/exir/backend/test/test_passes.py
+++ b/exir/backend/test/test_passes.py
@@ -11,8 +11,8 @@
 from executorch.exir.backend.canonical_partitioners.duplicate_constant_node_pass import (
     duplicate_constant_node,
 )
-from torch._export import capture_pre_autograd_graph
 from torch._export.utils import is_buffer
+from torch.export import export_for_training
 from torch.testing import FileCheck


@@ -29,7 +29,7 @@ def forward(self, x):
                 z = x - self.const
                 return y, z

-        model = capture_pre_autograd_graph(ReuseConstData(), (torch.ones(2, 2),))
+        model = export_for_training(ReuseConstData(), (torch.ones(2, 2),)).module()
         edge = exir.to_edge(torch.export.export(model, (torch.ones(2, 2),)))
         const_nodes = [
diff --git a/exir/backend/utils.py b/exir/backend/utils.py
index 2b768fe7c23..fb5e16c6bd0 100644
--- a/exir/backend/utils.py
+++ b/exir/backend/utils.py
@@ -383,6 +383,40 @@ def tag_constant_data(edge_program: ExportedProgram) -> None:
             node.meta["delegation_tag"] = user_tags.pop()


+def tag_mutated_buffer(edge_program: ExportedProgram) -> None:
+    """
+    Util function for partitioners. This function tags the mutated buffer nodes
+    whose users all belong within the same partition. It should be called after
+    tagging all other nodes. Any buffer which is used as input to a subgraph
+    will be tagged with the same tag as that subgraph. Throws an error when a
+    buffer is used across different partitions, that is, when the underlying
+    data would be owned by multiple delegates.
+    """
+    for node in edge_program.graph.nodes:
+        # Determine whether this node is a mutated buffer
+        is_mutated_buffer_node = False
+        if node.op == "placeholder" and is_buffer(edge_program, node):
+            for node_user in node.users:
+                if node_user.name in edge_program.graph_signature.buffers_to_mutate:
+                    is_mutated_buffer_node = True
+                    break
+        # This node is a mutated buffer, tag it
+        if is_mutated_buffer_node:
+            user_tags = set()
+            for user in node.users:
+                user_tag = user.meta.get("delegation_tag", None)
+                if user_tag is not None:
+                    user_tags.add(user_tag)
+            if len(user_tags) > 1:
+                logging.info(
+                    f"The data node is used across multiple partitions, including {user_tags}. "
+                    "If the data is too large and should not be copied, please tag the "
+                    "constant node with node.meta['no_copy'] = True and it won't be copied."
+                )
+            # tag the data node with the same tag as the last user
+            if len(user_tags) > 0:
+                node.meta["delegation_tag"] = user_tags.pop()
+
+
 # TODO - style: use templated types
 class DelegateMappingBuilder:
     """
diff --git a/exir/capture/_config.py b/exir/capture/_config.py
index 2d0a6c4ca80..11a0d6d069d 100644
--- a/exir/capture/_config.py
+++ b/exir/capture/_config.py
@@ -5,10 +5,11 @@
 # LICENSE file in the root directory of this source tree.

 # pyre-unsafe
-
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Union

+import torch
+
 from executorch.exir.dynamic_shape import DynamicMemoryPlanningMode
 from executorch.exir.pass_manager import PassType
 from executorch.exir.passes import MemoryPlanningPass, ToOutVarPass
@@ -38,6 +39,10 @@ class EdgeCompileConfig:
     _check_ir_validity: bool = True
     # TODO(larryliu): remove this
     _use_edge_ops: bool = True
+    # Allow the core ATen ops check to be skipped for certain ops, but continue with the rest of the checks.
+    _core_aten_ops_exception_list: List[torch._ops.OpOverload] = field(
+        default_factory=list
+    )
     _skip_type_promotion: bool = False
     # TODO(gasoonjia): remove this
     # TODO(T192537614): reenable dim order as default
diff --git a/exir/emit/test/test_emit.py b/exir/emit/test/test_emit.py
index f1b980a9aea..123896ecdba 100644
--- a/exir/emit/test/test_emit.py
+++ b/exir/emit/test/test_emit.py
@@ -23,6 +23,7 @@
     ExecutorchProgramManager,
     to_edge,
 )
+from executorch.exir._serialize._program import deserialize_pte_binary
 from executorch.exir.backend.backend_api import to_backend
 from executorch.exir.backend.backend_details import BackendDetails, PreprocessResult
 from executorch.exir.dialects._ops import ops as exir_ops
@@ -35,6 +36,7 @@
 from executorch.exir.schema import (
     Bool,
     DelegateCall,
+    Double,
     EValue,
     ExecutionPlan,
     Int,
@@ -1620,3 +1622,33 @@ def forward(self, x):
         executorch_module = _load_for_executorch_from_buffer(model.buffer)
         self.assertEqual(executorch_module(torch.zeros(1))[0], torch.zeros(1))
         self.assertEqual(executorch_module(torch.zeros(1))[0], torch.zeros(1) + 1)
+
+    def test_infinity_in_model(self) -> None:
+        class InfinityMaskModel(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.mask = torch.tensor([[1, 0], [0, 1]], dtype=torch.float32)
+
+            def forward(self, x):
+                masked_weights = x.masked_fill(self.mask == 0, float("-inf"))
+                return masked_weights
+
+        model = to_edge(
+            export(
+                InfinityMaskModel(),
+                (torch.randn(2, 2),),
+            )
+        )
+
+        # Confirm that we can serialize the model with infinity in it.
+        model = model.to_executorch()
+
+        # Assert that the infinity is stored as a string "-inf".
+        values = model.executorch_program.execution_plan[0].values
+        self.assertEqual(values[5].val, Double(double_val=float("-inf")))
+
+        # Confirm that we can also deserialize the model with infinity in it.
+        pte_data = deserialize_pte_binary(model.buffer)
+        self.assertEqual(
+            pte_data.execution_plan, model.executorch_program.execution_plan
+        )
diff --git a/exir/program/_program.py b/exir/program/_program.py
index 1339760f215..6b72d190f9d 100644
--- a/exir/program/_program.py
+++ b/exir/program/_program.py
@@ -573,6 +573,9 @@ def _to_edge(ep, config: EdgeCompileConfig) -> "ExirExportedProgram":
             EXIRATenDialectVerifier()(ep.exported_program.graph_module)
         except ExportError:
             logging.info(
+                "If a particular operator fails the core ATen IR check, please consider adding it to the exception list. "
+                "Add the operator to _core_aten_ops_exception_list in EdgeCompileConfig. 
This is the recommended way " + "to resolve this type of failure, so that the rest of the IR validation check can still be performed.\n" "If you'd like to disable IR validation checking, please set _check_ir_validity in EdgeCompileConfig, " "like *.to_edge(exir.EdgeCompileConfig(_check_ir_validity=False))." ) @@ -590,7 +593,11 @@ def _to_edge(ep, config: EdgeCompileConfig) -> "ExirExportedProgram": module_call_graph=ep.exported_program.module_call_graph, example_inputs=ep.exported_program.example_inputs, constants=ep.exported_program.constants, - verifiers=[get_aten_verifier(enable=config._check_ir_validity)], + verifiers=[ + get_aten_verifier( + config=config, + ) + ], ), False, ) @@ -698,10 +705,13 @@ def _generate_edge_program( program: ExportedProgram, ops_set_to_not_decompose: Optional[List[torch._ops.OpOverload]] = None, ) -> ExportedProgram: - if config._check_ir_validity: try: - EXIRATenDialectVerifier(ops_set_to_not_decompose)(program.graph_module) + EXIRATenDialectVerifier( + edge_compile_config=config, + class_only=False, + exception_list=ops_set_to_not_decompose, + )(program.graph_module) except ExportError as e: logging.info(f"Input program {name} is not in ATen dialect.") raise e @@ -1020,13 +1030,8 @@ def to_edge_transform_and_lower( edge_manager = edge_manager.to_backend({name: curr_partitioner}) for name, program in edge_manager._edge_programs.items(): - if config._check_ir_validity: - EXIREdgeDialectVerifier( - edge_compile_config=config, - class_only=True, - )()(program.graph_module) - ops_set_to_not_decompose = set() + ops_set_to_not_decompose: Set[torch._ops.OpOverload] = set() partitioners = partitioner.get(name, []) for curr_partitioner in partitioners: curr_op_set, check_op_support = curr_partitioner.ops_to_not_decompose( @@ -1042,6 +1047,13 @@ def to_edge_transform_and_lower( generate_error=True, ) + if config._check_ir_validity: + EXIREdgeDialectVerifier( + edge_compile_config=config, + class_only=True, + exception_list=list(ops_set_to_not_decompose), + )()(program.graph_module) + return edge_manager @@ -1107,6 +1119,7 @@ def __init__( self.compile_config = compile_config or EdgeCompileConfig() if not isinstance(edge_programs, dict): edge_programs = {"forward": edge_programs} + for name, program in edge_programs.items(): try: EXIREdgeDialectVerifier( diff --git a/exir/program/test/test_program.py b/exir/program/test/test_program.py index 4d2f5dfd699..73f023e778b 100644 --- a/exir/program/test/test_program.py +++ b/exir/program/test/test_program.py @@ -531,11 +531,14 @@ def test_edge_manager_dialect(self): ) self.assertTrue(edge_manager.exported_program().dialect == "EDGE") - def _test_edge_dialect_verifier(self, callable, validate_ir=True): + def _test_edge_dialect_verifier( + self, callable, validate_ir=True, exception_list=None + ): from executorch.exir import EdgeCompileConfig edge_compile_config = EdgeCompileConfig( _check_ir_validity=validate_ir, + _core_aten_ops_exception_list=exception_list, ) # pre-autograd export. 
eventually this will become torch.export one = torch.ones(1, dtype=torch.float) @@ -681,3 +684,35 @@ def count_nodes(graph_module, target): ), 1, ) + + def test_edge_dialect_non_core_aten_ops(self): + class LinalgNorm(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.linalg.norm(x) + + from torch._export.verifier import SpecViolationError + + input = torch.arange(9, dtype=torch.float) - 4 + ep = torch.export.export(LinalgNorm(), (input,)) + + # aten::linalg_norm is not a core op, so it should error out + with self.assertRaises(SpecViolationError): + _ = to_edge(ep, compile_config=EdgeCompileConfig(_check_ir_validity=True)) + + # with exception list, it should not error out + try: + # This should not raise error + _ = to_edge( + ep, + compile_config=EdgeCompileConfig( + _check_ir_validity=True, + _core_aten_ops_exception_list=[ + torch.ops.aten.linalg_vector_norm.default + ], + ), + ) + except SpecViolationError: + self.fail("Should not error out on linalg_vector_norm op") diff --git a/exir/schema.py b/exir/schema.py index 706bc611403..9436465459a 100644 --- a/exir/schema.py +++ b/exir/schema.py @@ -75,7 +75,23 @@ class Bool: @dataclass class Double: - double_val: float + double_val: Union[float, str] + + def __init__(self, double_val: float) -> None: + if double_val == float("inf"): + self.double_val = "inf" + elif double_val == float("-inf"): + self.double_val = "-inf" + else: + self.double_val = double_val + + def __post_init__(self) -> None: + if isinstance(self.double_val, str): + assert self.double_val in ["inf", "-inf"] + else: + assert isinstance(self.double_val, float) + assert not self.double_val == float("inf") + assert not self.double_val == float("-inf") @dataclass diff --git a/exir/verification/verifier.py b/exir/verification/verifier.py index 8b6ec91dd3b..b519e20393a 100644 --- a/exir/verification/verifier.py +++ b/exir/verification/verifier.py @@ -52,12 +52,6 @@ def _check_valid_dim_order_ops(op, use_dim_order) -> None: class EXIRATenDialectVerifierBase(Verifier): dialect = "OLD_EXIR_ATEN_DISABLED" - def __init__( - self, exception_list: Optional[List[torch._ops.OpOverload]] = None - ) -> None: - super().__init__() - self._exception_list = exception_list if exception_list else [] - def allowed_getattr_types(self) -> Tuple[Type[Any], ...]: return ( torch.fx.GraphModule, @@ -78,38 +72,68 @@ def __call__(self, *args, **kwargs): raise RuntimeError("") -class EXIRATenDialectVerifier(EXIRATenDialectVerifierBase): - dialect = "OLD_EXIR_ATEN" +def EXIRATenDialectVerifier( # noqa: C901 + edge_compile_config: Optional[EdgeCompileConfig] = None, + class_only: bool = False, + exception_list: Optional[List[torch._ops.OpOverload]] = None, +): + """ + Returns a verifier class that runs ATen dialect specific checks on the graph module. 
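+
+    Illustrative usage (example only; the variable names are not part of this patch):
+
+        EXIRATenDialectVerifier(
+            exception_list=[torch.ops.aten.linalg_vector_norm.default]
+        )(exported_program.graph_module)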
+ """ + # merge the exception list from edge_compile_config and exception_list + if edge_compile_config and edge_compile_config._core_aten_ops_exception_list: + exception_list = edge_compile_config._core_aten_ops_exception_list + ( + exception_list or [] + ) - def _get_exception_list(self) -> List[torch._ops.OpOverload]: - exception_list = [ - torch.ops.aten.mkldnn_rnn_layer.default, - torch.ops.aten._upsample_bilinear2d_aa.default, - torch.ops.aten.quantize_per_tensor.default, - torch.ops.aten.dequantize.self, - torch.ops.aten.max.default, # TODO(T188268054) - torch.ops.aten.min.default, # TODO(T188268054) - torch.ops.aten.full_like.default, # TODO(T183507359) - ] - exception_list += self._exception_list + class _EXIRATenDialectVerifier(EXIRATenDialectVerifierBase): + dialect = "OLD_EXIR_ATEN" - return exception_list + def __init__(self) -> None: + super().__init__() + # Note: here we are using the exception list passed from EXIRATenDialectVerifier function! + self._exception_list = exception_list if exception_list else [] - def check_valid_op(self, op): - if isinstance(op, OpOverload): - # TODO These special ops should be removable easily. - if op.namespace != "aten" or op in self._get_exception_list(): - return - if torch.Tag.core not in op.tags and torch.Tag.view_copy not in op.tags: - # NOTE(qihan): whether view_copy operators are marked as canonical is still under - # discussion. - raise SpecViolationError( - f"Operator {op.__module__}.{op.__name__} is not Aten Canonical." - ) + def _get_exception_list(self) -> List[torch._ops.OpOverload]: + exception_list = [ + torch.ops.aten.mkldnn_rnn_layer.default, + torch.ops.aten._upsample_bilinear2d_aa.default, + torch.ops.aten.quantize_per_tensor.default, + torch.ops.aten.dequantize.self, + torch.ops.aten.max.default, # TODO(T188268054) + torch.ops.aten.min.default, # TODO(T188268054) + torch.ops.aten.full_like.default, # TODO(T183507359) + ] + exception_list += self._exception_list + return exception_list -def get_aten_verifier(enable: bool = True): - return EXIRATenDialectVerifier if enable else EXIRATenDialectVerifierBase + def check_valid_op(self, op): + if isinstance(op, OpOverload): + # TODO These special ops should be removable easily. + if op.namespace != "aten" or op in self._get_exception_list(): + return + if torch.Tag.core not in op.tags and torch.Tag.view_copy not in op.tags: + # NOTE(qihan): whether view_copy operators are marked as canonical is still under + # discussion. + raise SpecViolationError( + f"Operator {op.__module__}.{op.__name__} is not Aten Canonical." 
+ ) + + ret = _EXIRATenDialectVerifier + if not class_only: + ret = ret() + return ret + + +def get_aten_verifier(config: EdgeCompileConfig): + return ( + EXIRATenDialectVerifier( + class_only=True, exception_list=config._core_aten_ops_exception_list + ) + if config._check_ir_validity + else EXIRATenDialectVerifierBase + ) def _get_inputs(graph_module: GraphModule) -> List[Optional[FakeTensor]]: @@ -160,6 +184,12 @@ def EXIREdgeDialectVerifier( # noqa: C901 class_only: bool = False, exception_list: Optional[List[torch._ops.OpOverload]] = None, ): + # merge the exception list from edge_compile_config and exception_list + if edge_compile_config and edge_compile_config._core_aten_ops_exception_list: + exception_list = edge_compile_config._core_aten_ops_exception_list + ( + exception_list or [] + ) + class _EXIREdgeDialectVerifier(Verifier): dialect = "EDGE" @@ -170,7 +200,9 @@ def __init__(self) -> None: self.check_edge_ops = _edge_compile_config._use_edge_ops self.use_dim_order = not _edge_compile_config._skip_dim_order - self.aten_op_verifier = EXIRATenDialectVerifier(exception_list) + self.aten_op_verifier = EXIRATenDialectVerifier( + exception_list=exception_list + ) self.check_valid_aten_op = self.aten_op_verifier.check_valid_op if self.check_edge_ops: diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 74f98960002..ab1f3650102 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -10,7 +10,6 @@ project(executorch_jni) if(NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) - # Can't set to 11 due to executor_runner.cpp make_unique endif() if(NOT ANDROID) @@ -71,78 +70,55 @@ if(TARGET vulkan_backend) list(APPEND link_libraries vulkan_backend) endif() +if(EXECUTORCH_BUILD_KERNELS_CUSTOM) + add_subdirectory( + ${EXECUTORCH_ROOT}/extension/llm/custom_ops + ${CMAKE_CURRENT_BINARY_DIR}/../../extension/llm/custom_ops + ) + list(APPEND link_libraries custom_ops) + target_link_options_shared_lib(custom_ops) +endif() + add_library(executorch_jni SHARED jni/jni_layer.cpp) -target_link_libraries(executorch_jni ${link_libraries}) -target_include_directories( - executorch_jni PRIVATE ${_common_include_directories} -) -target_compile_options(executorch_jni PUBLIC ${_common_compile_options}) if(EXECUTORCH_BUILD_LLAMA_JNI) - set(LLAMA_RUNNER_PATH - ${CMAKE_CURRENT_BINARY_DIR}/../../examples/models/llama2/runner/libllama_runner.a - ) - add_library(llama_runner STATIC IMPORTED) - set_property( - TARGET llama_runner PROPERTY IMPORTED_LOCATION ${LLAMA_RUNNER_PATH} - ) - + target_sources(executorch_jni PRIVATE jni/jni_layer_llama.cpp) + list(APPEND link_libraries llama_runner llava_runner) + target_compile_definitions(executorch_jni PUBLIC EXECUTORCH_BUILD_LLAMA_JNI=1) add_subdirectory( ${EXECUTORCH_ROOT}/examples/models/llava/runner ${CMAKE_CURRENT_BINARY_DIR}/../../examples/models/llava/runner ) - set(CUSTOM_OPS_PATH - ${CMAKE_CURRENT_BINARY_DIR}/../../extension/llm/custom_ops/libcustom_ops.a + add_subdirectory( + ${EXECUTORCH_ROOT}/examples/models/llama2/runner + ${CMAKE_CURRENT_BINARY_DIR}/../../examples/models/llama2/runner ) - add_library(custom_ops STATIC IMPORTED) - set_property(TARGET custom_ops PROPERTY IMPORTED_LOCATION ${CUSTOM_OPS_PATH}) - target_link_options_shared_lib(custom_ops) +endif() +if(TARGET quantized_kernels) + list(APPEND link_libraries quantized_kernels quantized_ops_lib) target_link_options_shared_lib(quantized_ops_lib) +endif() + +target_include_directories( + executorch_jni PRIVATE ${_common_include_directories} 
+) + +target_compile_options(executorch_jni PUBLIC ${_common_compile_options}) + +target_link_libraries(executorch_jni ${link_libraries}) - set(LLAMA_JNI_SRCS jni/jni_layer_llama.cpp) - add_library(executorch_llama_jni SHARED ${LLAMA_JNI_SRCS}) - if(TARGET pthreadpool) - target_compile_definitions(executorch_llama_jni PRIVATE ET_USE_THREADPOOL=1) - target_include_directories( - executorch_llama_jni - PUBLIC - ${CMAKE_CURRENT_SOURCE_DIR}/../../backends/xnnpack/third-party/cpuinfo/include - ) - target_include_directories( - executorch_llama_jni - PUBLIC - ${CMAKE_CURRENT_SOURCE_DIR}/../../backends/xnnpack/third-party/pthreadpool/include - ) - endif() +if(TARGET pthreadpool) + target_compile_definitions(executorch_jni PRIVATE ET_USE_THREADPOOL=1) target_include_directories( - executorch_llama_jni PRIVATE ${_common_include_directories} - ) - target_link_libraries( - executorch_llama_jni - ${link_libraries} - llama_runner - llava_runner - custom_ops - cpublas - eigen_blas - quantized_kernels - quantized_ops_lib + executorch_jni + PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/../../backends/xnnpack/third-party/cpuinfo/include ) - target_compile_options(executorch_llama_jni PUBLIC ${_common_compile_options}) - # link re2 - set(ABSL_ENABLE_INSTALL ON) - set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE}) - set(CMAKE_POSITION_INDEPENDENT_CODE ON) - add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/third-party/abseil-cpp - ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp - ) - add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/third-party/re2 - ${CMAKE_CURRENT_BINARY_DIR}/re2 + target_include_directories( + executorch_jni + PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/../../backends/xnnpack/third-party/pthreadpool/include ) - set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) - target_link_libraries(executorch_llama_jni re2::re2) endif() diff --git a/extension/android/benchmark/app/build.gradle.kts b/extension/android/benchmark/app/build.gradle.kts index b716f2e8bd0..dcf99ca9cd0 100644 --- a/extension/android/benchmark/app/build.gradle.kts +++ b/extension/android/benchmark/app/build.gradle.kts @@ -38,6 +38,7 @@ dependencies { implementation(files("libs/executorch.aar")) implementation("com.facebook.soloader:soloader:0.10.5") implementation("com.facebook.fbjni:fbjni:0.5.1") + implementation("com.google.code.gson:gson:2.8.6") testImplementation("junit:junit:4.13.2") androidTestImplementation("androidx.test.ext:junit:1.2.1") androidTestImplementation("androidx.test.espresso:espresso-core:3.6.1") diff --git a/extension/android/benchmark/app/src/main/AndroidManifest.xml b/extension/android/benchmark/app/src/main/AndroidManifest.xml index 49711b6830e..098905c052c 100644 --- a/extension/android/benchmark/app/src/main/AndroidManifest.xml +++ b/extension/android/benchmark/app/src/main/AndroidManifest.xml @@ -16,6 +16,14 @@ + + + + + + diff --git a/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java index e9599dd3518..a79f668f80b 100644 --- a/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java +++ b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java @@ -11,8 +11,10 @@ import android.app.Activity; import android.content.Intent; import android.os.Bundle; +import java.io.File; import java.io.FileWriter; import java.io.IOException; +import java.util.Arrays; import org.pytorch.executorch.Module; public 
class BenchmarkActivity extends Activity {
@@ -20,13 +22,19 @@ public class BenchmarkActivity extends Activity {
   protected void onCreate(Bundle savedInstanceState) {
     super.onCreate(savedInstanceState);
     Intent intent = getIntent();
-    String modelPath = intent.getStringExtra("model_path");
+    File modelDir = new File(intent.getStringExtra("model_dir"));
+    File model =
+        Arrays.stream(modelDir.listFiles())
+            .filter(file -> file.getName().endsWith(".pte"))
+            .findFirst()
+            .get();
+
     int numIter = intent.getIntExtra("num_iter", 10);

     // TODO: Format the string with a parsable format
     StringBuilder resultText = new StringBuilder();

-    Module module = Module.load(modelPath);
+    Module module = Module.load(model.getPath());
     for (int i = 0; i < numIter; i++) {
       long start = System.currentTimeMillis();
       module.forward();
diff --git a/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmarkActivity.java b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmarkActivity.java
new file mode 100644
index 00000000000..496cbde53d6
--- /dev/null
+++ b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmarkActivity.java
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+package org.pytorch.minibench;
+
+import android.app.Activity;
+import android.content.Intent;
+import android.os.Bundle;
+import android.util.Log;
+import com.google.gson.Gson;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.Arrays;
+
+public class LlmBenchmarkActivity extends Activity implements ModelRunnerCallback {
+  ModelRunner mModelRunner;
+
+  String mPrompt;
+  StatsInfo mStatsInfo;
+
+  @Override
+  protected void onCreate(Bundle savedInstanceState) {
+    super.onCreate(savedInstanceState);
+
+    Intent intent = getIntent();
+
+    File modelDir = new File(intent.getStringExtra("model_dir"));
+    File model =
+        Arrays.stream(modelDir.listFiles())
+            .filter(file -> file.getName().endsWith(".pte"))
+            .findFirst()
+            .get();
+    String tokenizerPath = intent.getStringExtra("tokenizer_path");
+
+    float temperature = intent.getFloatExtra("temperature", 0.8f);
+    mPrompt = intent.getStringExtra("prompt");
+    if (mPrompt == null) {
+      mPrompt = "The ultimate answer";
+    }
+
+    mStatsInfo = new StatsInfo();
+    mModelRunner = new ModelRunner(model.getPath(), tokenizerPath, temperature, this);
+    mStatsInfo.loadStart = System.currentTimeMillis();
+  }
+
+  @Override
+  public void onModelLoaded(int status) {
+    mStatsInfo.loadEnd = System.currentTimeMillis();
+    if (status != 0) {
+      Log.e("LlmBenchmarkRunner", "Load failed: " + status);
+      onGenerationStopped();
+      return;
+    }
+    mStatsInfo.generateStart = System.currentTimeMillis();
+    mModelRunner.generate(mPrompt);
+  }
+
+  @Override
+  public void onTokenGenerated(String token) {}
+
+  @Override
+  public void onStats(String stats) {
+    mStatsInfo.tokens = stats;
+  }
+
+  @Override
+  public void onGenerationStopped() {
+    mStatsInfo.generateEnd = System.currentTimeMillis();
+
+    // TODO (huydhn): Remove txt files here once the JSON format is ready
+    try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.txt")) {
+      writer.write(mStatsInfo.toString());
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+
+    // TODO (huydhn): Figure out what the final JSON results look like; we need something
+    // with the same number of fields as https://github.com/pytorch/pytorch/pull/135042
+    try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.json")) {
+      Gson gson = new Gson();
+      writer.write(gson.toJson(mStatsInfo));
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+  }
+}
+
+class StatsInfo {
+  long loadStart;
+  long loadEnd;
+  long generateStart;
+  long generateEnd;
+  String tokens;
+
+  @Override
+  public String toString() {
+    return "loadStart: "
+        + loadStart
+        + "\nloadEnd: "
+        + loadEnd
+        + "\ngenerateStart: "
+        + generateStart
+        + "\ngenerateEnd: "
+        + generateEnd
+        + "\n"
+        + tokens;
+  }
+}
diff --git a/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.java b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.java
new file mode 100644
index 00000000000..9e9b9e003d8
--- /dev/null
+++ b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.java
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+package org.pytorch.minibench;
+
+import android.os.Handler;
+import android.os.HandlerThread;
+import android.os.Looper;
+import android.os.Message;
+import org.pytorch.executorch.LlamaCallback;
+import org.pytorch.executorch.LlamaModule;
+
+/** A helper class to handle all model running logic. */
+public class ModelRunner implements LlamaCallback {
+  LlamaModule mModule = null;
+
+  String mModelFilePath = "";
+  String mTokenizerFilePath = "";
+
+  ModelRunnerCallback mCallback = null;
+
+  HandlerThread mHandlerThread = null;
+  Handler mHandler = null;
+
+  /**
+   * Helper class to separate UI logic from model runner logic. Automatically handles
+   * generate() requests on a worker thread.
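+   *
+   * <p>Illustrative flow (not part of this patch): construct the runner, then call
+   * {@code generate(prompt)} once {@code onModelLoaded} reports status 0, as
+   * LlmBenchmarkActivity above does.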
+   *
+   * @param modelFilePath
+   * @param tokenizerFilePath
+   * @param temperature
+   * @param callback
+   */
+  ModelRunner(
+      String modelFilePath,
+      String tokenizerFilePath,
+      float temperature,
+      ModelRunnerCallback callback) {
+    mModelFilePath = modelFilePath;
+    mTokenizerFilePath = tokenizerFilePath;
+    mCallback = callback;
+
+    mModule = new LlamaModule(mModelFilePath, mTokenizerFilePath, temperature);
+    mHandlerThread = new HandlerThread("ModelRunner");
+    mHandlerThread.start();
+    mHandler = new ModelRunnerHandler(mHandlerThread.getLooper(), this);
+
+    mHandler.sendEmptyMessage(ModelRunnerHandler.MESSAGE_LOAD_MODEL);
+  }
+
+  int generate(String prompt) {
+    Message msg = Message.obtain(mHandler, ModelRunnerHandler.MESSAGE_GENERATE, prompt);
+    msg.sendToTarget();
+    return 0;
+  }
+
+  void stop() {
+    mModule.stop();
+  }
+
+  @Override
+  public void onResult(String result) {
+    mCallback.onTokenGenerated(result);
+  }
+
+  @Override
+  public void onStats(float tps) {
+    mCallback.onStats("tokens/second: " + tps);
+  }
+}
+
+class ModelRunnerHandler extends Handler {
+  public static final int MESSAGE_LOAD_MODEL = 1;
+  public static final int MESSAGE_GENERATE = 2;
+
+  private final ModelRunner mModelRunner;
+
+  public ModelRunnerHandler(Looper looper, ModelRunner modelRunner) {
+    super(looper);
+    mModelRunner = modelRunner;
+  }
+
+  @Override
+  public void handleMessage(android.os.Message msg) {
+    if (msg.what == MESSAGE_LOAD_MODEL) {
+      int status = mModelRunner.mModule.load();
+      mModelRunner.mCallback.onModelLoaded(status);
+    } else if (msg.what == MESSAGE_GENERATE) {
+      mModelRunner.mModule.generate((String) msg.obj, mModelRunner);
+      mModelRunner.mCallback.onGenerationStopped();
+    }
+  }
+}
diff --git a/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunnerCallback.java b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunnerCallback.java
new file mode 100644
index 00000000000..63701a7bbc6
--- /dev/null
+++ b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunnerCallback.java
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+package org.pytorch.minibench;
+
+/**
+ * A helper interface within the app for MainActivity and benchmarking to handle callbacks from
+ * ModelRunner.
+ */ +public interface ModelRunnerCallback { + + void onModelLoaded(int status); + + void onTokenGenerated(String token); + + void onStats(String token); + + void onGenerationStopped(); +} diff --git a/extension/android/jni/BUCK b/extension/android/jni/BUCK index 7cdf8ef7ec4..3c8f00b2bdc 100644 --- a/extension/android/jni/BUCK +++ b/extension/android/jni/BUCK @@ -70,21 +70,30 @@ fb_android_cxx_library( fb_android_cxx_library( name = "executorch_llama_jni", - srcs = ["jni_layer_llama.cpp"], + srcs = [ + "jni_layer.cpp", + "jni_layer_llama.cpp", + ], + headers = ["jni_layer_constants.h"], allow_jni_merging = False, compiler_flags = [ "-frtti", "-fexceptions", + "-DEXECUTORCH_BUILD_LLAMA_JNI", "-Wno-format", ], - soname = "libexecutorch_llama_jni.$(ext)", + soname = "libexecutorch.$(ext)", visibility = ["PUBLIC"], deps = [ "//fbandroid/libraries/fbjni:fbjni", "//fbandroid/native/fb:fb", "//third-party/glog:glog", + "//xplat/executorch/backends/xnnpack:xnnpack_backend_static", "//xplat/executorch/examples/models/llama2/runner:runner_static", "//xplat/executorch/examples/models/llava/runner:runner_static", + "//xplat/executorch/extension/module:module_static", + "//xplat/executorch/extension/runner_util:inputs_static", + "//xplat/executorch/extension/tensor:tensor_static", "//xplat/executorch/extension/threadpool:cpuinfo_utils_static", "//xplat/executorch/extension/threadpool:threadpool_static", ], diff --git a/extension/android/jni/jni_layer.cpp b/extension/android/jni/jni_layer.cpp index f2cfc4a5cff..1ef81b20b08 100644 --- a/extension/android/jni/jni_layer.cpp +++ b/extension/android/jni/jni_layer.cpp @@ -386,7 +386,15 @@ class ExecuTorchJni : public facebook::jni::HybridClass { }; } // namespace executorch::extension +#ifdef EXECUTORCH_BUILD_LLAMA_JNI +extern void register_natives_for_llama(); +#else +// No op if we don't build llama +void register_natives_for_llama() {} +#endif JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM* vm, void*) { - return facebook::jni::initialize( - vm, [] { executorch::extension::ExecuTorchJni::registerNatives(); }); + return facebook::jni::initialize(vm, [] { + executorch::extension::ExecuTorchJni::registerNatives(); + register_natives_for_llama(); + }); } diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index 0d43317c3ca..e6a9b5de58c 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -30,33 +30,6 @@ #include #include -#ifdef __ANDROID__ -#include - -// For Android, write to logcat -void et_pal_emit_log_message( - et_timestamp_t timestamp, - et_pal_log_level_t level, - const char* filename, - const char* function, - size_t line, - const char* message, - size_t length) { - int android_log_level = ANDROID_LOG_UNKNOWN; - if (level == 'D') { - android_log_level = ANDROID_LOG_DEBUG; - } else if (level == 'I') { - android_log_level = ANDROID_LOG_INFO; - } else if (level == 'E') { - android_log_level = ANDROID_LOG_ERROR; - } else if (level == 'F') { - android_log_level = ANDROID_LOG_FATAL; - } - - __android_log_print(android_log_level, "LLAMA", "%s", message); -} -#endif - using namespace torch::executor; namespace executorch_jni { @@ -150,8 +123,8 @@ class ExecuTorchLlamaJni jint channels, facebook::jni::alias_ref prompt, jint seq_len, - jboolean echo, - facebook::jni::alias_ref callback) { + facebook::jni::alias_ref callback, + jboolean echo) { if (model_type_category_ == MODEL_TYPE_CATEGORY_MULTIMODAL) { auto image_size = image->size(); std::vector images; @@ -170,7 
+143,8 @@ class ExecuTorchLlamaJni prompt->toStdString(), seq_len, [callback](std::string result) { callback->onResult(result); }, - [callback](const Stats& result) { callback->onStats(result); }); + [callback](const Stats& result) { callback->onStats(result); }, + echo); } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) { runner_->generate( prompt->toStdString(), @@ -248,7 +222,8 @@ class ExecuTorchLlamaJni facebook::jni::alias_ref prompt, jint seq_len, jlong start_pos, - facebook::jni::alias_ref callback) { + facebook::jni::alias_ref callback, + jboolean echo) { if (model_type_category_ != MODEL_TYPE_CATEGORY_MULTIMODAL) { return static_cast(Error::NotSupported); } @@ -259,7 +234,8 @@ class ExecuTorchLlamaJni [callback](const std::string& result) { callback->onResult(result); }, [callback](const ::executorch::extension::llm::Stats& stats) { callback->onStats(stats); - })); + }, + echo)); } void stop() { @@ -285,13 +261,18 @@ class ExecuTorchLlamaJni makeNativeMethod("generate", ExecuTorchLlamaJni::generate), makeNativeMethod("stop", ExecuTorchLlamaJni::stop), makeNativeMethod("load", ExecuTorchLlamaJni::load), + makeNativeMethod( + "prefillImagesNative", ExecuTorchLlamaJni::prefill_images), + makeNativeMethod( + "prefillPromptNative", ExecuTorchLlamaJni::prefill_prompt), + makeNativeMethod( + "generateFromPos", ExecuTorchLlamaJni::generate_from_pos), }); } }; } // namespace executorch_jni -JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM* vm, void*) { - return facebook::jni::initialize( - vm, [] { executorch_jni::ExecuTorchLlamaJni::registerNatives(); }); +void register_natives_for_llama() { + executorch_jni::ExecuTorchLlamaJni::registerNatives(); } diff --git a/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java b/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java index c4de23df0ee..7c77dbae08f 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java +++ b/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java @@ -28,7 +28,7 @@ public class LlamaModule { if (!NativeLoader.isInitialized()) { NativeLoader.init(new SystemDelegate()); } - NativeLoader.loadLibrary("executorch_llama_jni"); + NativeLoader.loadLibrary("executorch"); } private final HybridData mHybridData; @@ -60,7 +60,7 @@ public void resetNative() { * @param llamaCallback callback object to receive results. */ public int generate(String prompt, LlamaCallback llamaCallback) { - return generate(prompt, DEFAULT_SEQ_LEN, DEFAULT_ECHO, llamaCallback); + return generate(prompt, DEFAULT_SEQ_LEN, llamaCallback, DEFAULT_ECHO); } /** @@ -71,18 +71,18 @@ public int generate(String prompt, LlamaCallback llamaCallback) { * @param llamaCallback callback object to receive results. */ public int generate(String prompt, int seqLen, LlamaCallback llamaCallback) { - return generate(null, 0, 0, 0, prompt, seqLen, DEFAULT_ECHO, llamaCallback); + return generate(null, 0, 0, 0, prompt, seqLen, llamaCallback, DEFAULT_ECHO); } /** * Start generating tokens from the module. * * @param prompt Input prompt + * @param llamaCallback callback object to receive results * @param echo indicate whether to echo the input prompt or not (text completion vs chat) - * @param llamaCallback callback object to receive results. 
*/ - public int generate(String prompt, boolean echo, LlamaCallback llamaCallback) { - return generate(null, 0, 0, 0, prompt, DEFAULT_SEQ_LEN, echo, llamaCallback); + public int generate(String prompt, LlamaCallback llamaCallback, boolean echo) { + return generate(null, 0, 0, 0, prompt, DEFAULT_SEQ_LEN, llamaCallback, echo); } /** @@ -90,11 +90,11 @@ public int generate(String prompt, boolean echo, LlamaCallback llamaCallback) { * * @param prompt Input prompt * @param seqLen sequence length + * @param llamaCallback callback object to receive results * @param echo indicate whether to echo the input prompt or not (text completion vs chat) - * @param llamaCallback callback object to receive results. */ - public int generate(String prompt, int seqLen, boolean echo, LlamaCallback llamaCallback) { - return generate(null, 0, 0, 0, prompt, seqLen, echo, llamaCallback); + public int generate(String prompt, int seqLen, LlamaCallback llamaCallback, boolean echo) { + return generate(null, 0, 0, 0, prompt, seqLen, llamaCallback, echo); } /** @@ -106,8 +106,8 @@ public int generate(String prompt, int seqLen, boolean echo, LlamaCallback llama * @param channels Input image number of channels * @param prompt Input prompt * @param seqLen sequence length - * @param echo indicate whether to echo the input prompt or not (text completion vs chat) * @param llamaCallback callback object to receive results. + * @param echo indicate whether to echo the input prompt or not (text completion vs chat) */ @DoNotStrip public native int generate( @@ -117,8 +117,8 @@ public native int generate( int channels, String prompt, int seqLen, - boolean echo, - LlamaCallback llamaCallback); + LlamaCallback llamaCallback, + boolean echo); /** * Prefill an LLaVA Module with the given images input. @@ -172,10 +172,11 @@ public long prefillPrompt(String prompt, long startPos, int bos, int eos) { * @param seqLen The total sequence length, including the prompt tokens and new tokens. * @param startPos The starting position in KV cache of the input in the LLM. * @param llamaCallback callback object to receive results. + * @param echo indicate whether to echo the input prompt or not. * @return The error code. */ public native int generateFromPos( - String prompt, int seqLen, long startPos, LlamaCallback callback); + String prompt, int seqLen, long startPos, LlamaCallback callback, boolean echo); /** Stop current generate() before it finishes. 
*/ @DoNotStrip diff --git a/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj b/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj index 4dcffaffbf6..1bc3188fe17 100644 --- a/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj +++ b/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj @@ -10,14 +10,14 @@ 03B2D3682C8A515A0046936E /* App.swift in Sources */ = {isa = PBXBuildFile; fileRef = 03B2D3672C8A515A0046936E /* App.swift */; }; 03B2D37A2C8A515C0046936E /* Tests.mm in Sources */ = {isa = PBXBuildFile; fileRef = 03B2D3792C8A515C0046936E /* Tests.mm */; }; 03C7FA382C8AA3EC00E6E9AE /* Models in Resources */ = {isa = PBXBuildFile; fileRef = 03C7FA322C8AA24200E6E9AE /* Models */; }; - 03ED6CFF2C8AAFB300F2D6EE /* backend_coreml.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6CFE2C8AAFB300F2D6EE /* backend_coreml.xcframework */; }; - 03ED6D012C8AAFB300F2D6EE /* backend_mps.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D002C8AAFB300F2D6EE /* backend_mps.xcframework */; }; - 03ED6D032C8AAFB300F2D6EE /* backend_xnnpack.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D022C8AAFB300F2D6EE /* backend_xnnpack.xcframework */; }; - 03ED6D052C8AAFB300F2D6EE /* executorch.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D042C8AAFB300F2D6EE /* executorch.xcframework */; }; - 03ED6D072C8AAFB300F2D6EE /* kernels_custom.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D062C8AAFB300F2D6EE /* kernels_custom.xcframework */; }; - 03ED6D092C8AAFB300F2D6EE /* kernels_optimized.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D082C8AAFB300F2D6EE /* kernels_optimized.xcframework */; }; - 03ED6D0B2C8AAFB300F2D6EE /* kernels_portable.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D0A2C8AAFB300F2D6EE /* kernels_portable.xcframework */; }; - 03ED6D0D2C8AAFB300F2D6EE /* kernels_quantized.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D0C2C8AAFB300F2D6EE /* kernels_quantized.xcframework */; }; + 03DD00A92C8FE44600FE4619 /* backend_coreml.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD00992C8FE44600FE4619 /* backend_coreml.xcframework */; }; + 03DD00AA2C8FE44600FE4619 /* kernels_custom.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD009A2C8FE44600FE4619 /* kernels_custom.xcframework */; }; + 03DD00AF2C8FE44600FE4619 /* kernels_portable.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD009F2C8FE44600FE4619 /* kernels_portable.xcframework */; }; + 03DD00B02C8FE44600FE4619 /* kernels_optimized.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD00A02C8FE44600FE4619 /* kernels_optimized.xcframework */; }; + 03DD00B12C8FE44600FE4619 /* backend_xnnpack.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD00A12C8FE44600FE4619 /* backend_xnnpack.xcframework */; }; + 03DD00B22C8FE44600FE4619 /* backend_mps.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD00A22C8FE44600FE4619 /* backend_mps.xcframework */; }; + 03DD00B32C8FE44600FE4619 /* executorch.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD00A32C8FE44600FE4619 /* executorch.xcframework */; settings = {ATTRIBUTES = (Required, ); }; }; + 03DD00B52C8FE44600FE4619 /* kernels_quantized.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD00A52C8FE44600FE4619 /* kernels_quantized.xcframework */; }; 03ED6D0F2C8AAFE900F2D6EE /* libsqlite3.0.tbd in Frameworks */ = {isa 
= PBXBuildFile; fileRef = 03ED6D0E2C8AAFE900F2D6EE /* libsqlite3.0.tbd */; }; 03ED6D112C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D102C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework */; }; 03ED6D132C8AAFF700F2D6EE /* MetalPerformanceShaders.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D122C8AAFF700F2D6EE /* MetalPerformanceShaders.framework */; }; @@ -45,14 +45,14 @@ 03B2D3752C8A515C0046936E /* Tests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = Tests.xctest; sourceTree = BUILT_PRODUCTS_DIR; }; 03B2D3792C8A515C0046936E /* Tests.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = Tests.mm; sourceTree = ""; }; 03C7FA322C8AA24200E6E9AE /* Models */ = {isa = PBXFileReference; lastKnownFileType = folder; path = Models; sourceTree = SOURCE_ROOT; }; - 03ED6CFE2C8AAFB300F2D6EE /* backend_coreml.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_coreml.xcframework; path = Frameworks/backend_coreml.xcframework; sourceTree = ""; }; - 03ED6D002C8AAFB300F2D6EE /* backend_mps.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_mps.xcframework; path = Frameworks/backend_mps.xcframework; sourceTree = ""; }; - 03ED6D022C8AAFB300F2D6EE /* backend_xnnpack.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_xnnpack.xcframework; path = Frameworks/backend_xnnpack.xcframework; sourceTree = ""; }; - 03ED6D042C8AAFB300F2D6EE /* executorch.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = executorch.xcframework; path = Frameworks/executorch.xcframework; sourceTree = ""; }; - 03ED6D062C8AAFB300F2D6EE /* kernels_custom.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_custom.xcframework; path = Frameworks/kernels_custom.xcframework; sourceTree = ""; }; - 03ED6D082C8AAFB300F2D6EE /* kernels_optimized.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_optimized.xcframework; path = Frameworks/kernels_optimized.xcframework; sourceTree = ""; }; - 03ED6D0A2C8AAFB300F2D6EE /* kernels_portable.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_portable.xcframework; path = Frameworks/kernels_portable.xcframework; sourceTree = ""; }; - 03ED6D0C2C8AAFB300F2D6EE /* kernels_quantized.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_quantized.xcframework; path = Frameworks/kernels_quantized.xcframework; sourceTree = ""; }; + 03DD00992C8FE44600FE4619 /* backend_coreml.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_coreml.xcframework; path = Frameworks/backend_coreml.xcframework; sourceTree = ""; }; + 03DD009A2C8FE44600FE4619 /* kernels_custom.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_custom.xcframework; path = Frameworks/kernels_custom.xcframework; sourceTree = ""; }; + 03DD009F2C8FE44600FE4619 /* kernels_portable.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_portable.xcframework; path = Frameworks/kernels_portable.xcframework; sourceTree = ""; }; + 03DD00A02C8FE44600FE4619 /* kernels_optimized.xcframework */ = {isa = 
PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_optimized.xcframework; path = Frameworks/kernels_optimized.xcframework; sourceTree = ""; }; + 03DD00A12C8FE44600FE4619 /* backend_xnnpack.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_xnnpack.xcframework; path = Frameworks/backend_xnnpack.xcframework; sourceTree = ""; }; + 03DD00A22C8FE44600FE4619 /* backend_mps.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_mps.xcframework; path = Frameworks/backend_mps.xcframework; sourceTree = ""; }; + 03DD00A32C8FE44600FE4619 /* executorch.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = executorch.xcframework; path = Frameworks/executorch.xcframework; sourceTree = ""; }; + 03DD00A52C8FE44600FE4619 /* kernels_quantized.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_quantized.xcframework; path = Frameworks/kernels_quantized.xcframework; sourceTree = ""; }; 03ED6D0E2C8AAFE900F2D6EE /* libsqlite3.0.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = libsqlite3.0.tbd; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/usr/lib/libsqlite3.0.tbd; sourceTree = DEVELOPER_DIR; }; 03ED6D102C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = MetalPerformanceShadersGraph.framework; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/System/Library/Frameworks/MetalPerformanceShadersGraph.framework; sourceTree = DEVELOPER_DIR; }; 03ED6D122C8AAFF700F2D6EE /* MetalPerformanceShaders.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = MetalPerformanceShaders.framework; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/System/Library/Frameworks/MetalPerformanceShaders.framework; sourceTree = DEVELOPER_DIR; }; @@ -79,14 +79,14 @@ 03ED6D132C8AAFF700F2D6EE /* MetalPerformanceShaders.framework in Frameworks */, 03ED6D112C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework in Frameworks */, 03ED6D0F2C8AAFE900F2D6EE /* libsqlite3.0.tbd in Frameworks */, - 03ED6CFF2C8AAFB300F2D6EE /* backend_coreml.xcframework in Frameworks */, - 03ED6D032C8AAFB300F2D6EE /* backend_xnnpack.xcframework in Frameworks */, - 03ED6D092C8AAFB300F2D6EE /* kernels_optimized.xcframework in Frameworks */, - 03ED6D012C8AAFB300F2D6EE /* backend_mps.xcframework in Frameworks */, - 03ED6D0D2C8AAFB300F2D6EE /* kernels_quantized.xcframework in Frameworks */, - 03ED6D0B2C8AAFB300F2D6EE /* kernels_portable.xcframework in Frameworks */, - 03ED6D052C8AAFB300F2D6EE /* executorch.xcframework in Frameworks */, - 03ED6D072C8AAFB300F2D6EE /* kernels_custom.xcframework in Frameworks */, + 03DD00A92C8FE44600FE4619 /* backend_coreml.xcframework in Frameworks */, + 03DD00B22C8FE44600FE4619 /* backend_mps.xcframework in Frameworks */, + 03DD00B12C8FE44600FE4619 /* backend_xnnpack.xcframework in Frameworks */, + 03DD00B32C8FE44600FE4619 /* executorch.xcframework in Frameworks */, + 03DD00AA2C8FE44600FE4619 /* kernels_custom.xcframework in Frameworks */, + 03DD00B02C8FE44600FE4619 /* kernels_optimized.xcframework in Frameworks */, + 03DD00AF2C8FE44600FE4619 /* kernels_portable.xcframework in Frameworks */, + 03DD00B52C8FE44600FE4619 /* kernels_quantized.xcframework in Frameworks */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -141,14 +141,14 
@@ 03ED6D122C8AAFF700F2D6EE /* MetalPerformanceShaders.framework */, 03ED6D102C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework */, 03ED6D0E2C8AAFE900F2D6EE /* libsqlite3.0.tbd */, - 03ED6CFE2C8AAFB300F2D6EE /* backend_coreml.xcframework */, - 03ED6D002C8AAFB300F2D6EE /* backend_mps.xcframework */, - 03ED6D022C8AAFB300F2D6EE /* backend_xnnpack.xcframework */, - 03ED6D042C8AAFB300F2D6EE /* executorch.xcframework */, - 03ED6D062C8AAFB300F2D6EE /* kernels_custom.xcframework */, - 03ED6D082C8AAFB300F2D6EE /* kernels_optimized.xcframework */, - 03ED6D0A2C8AAFB300F2D6EE /* kernels_portable.xcframework */, - 03ED6D0C2C8AAFB300F2D6EE /* kernels_quantized.xcframework */, + 03DD00992C8FE44600FE4619 /* backend_coreml.xcframework */, + 03DD00A22C8FE44600FE4619 /* backend_mps.xcframework */, + 03DD00A12C8FE44600FE4619 /* backend_xnnpack.xcframework */, + 03DD00A32C8FE44600FE4619 /* executorch.xcframework */, + 03DD009A2C8FE44600FE4619 /* kernels_custom.xcframework */, + 03DD00A02C8FE44600FE4619 /* kernels_optimized.xcframework */, + 03DD009F2C8FE44600FE4619 /* kernels_portable.xcframework */, + 03DD00A52C8FE44600FE4619 /* kernels_quantized.xcframework */, ); name = Frameworks; sourceTree = SOURCE_ROOT; diff --git a/extension/apple/Benchmark/Tests/Tests.mm b/extension/apple/Benchmark/Tests/Tests.mm index 5cf958765d3..dd85cb69542 100644 --- a/extension/apple/Benchmark/Tests/Tests.mm +++ b/extension/apple/Benchmark/Tests/Tests.mm @@ -22,82 +22,105 @@ @interface Tests : XCTestCase @implementation Tests + (void)initialize { - if (self == [Tests class]) { - NSString *modelsDir = [[NSBundle bundleForClass:[self class]].resourcePath - stringByAppendingPathComponent:@"Models"]; - NSArray *models = - [NSFileManager.defaultManager contentsOfDirectoryAtPath:modelsDir - error:nil]; - for (NSString *model in models) { - NSString *modelName = model.stringByDeletingPathExtension; - NSString *modelPath = [modelsDir stringByAppendingPathComponent:model]; - XCTAssertGreaterThan(modelPath.length, 0); - - SEL testLoadSelector = NSSelectorFromString( - [NSString stringWithFormat:@"test_load_%@", modelName]); - IMP testLoadImplementation = imp_implementationWithBlock(^(id _self) { - auto __block module = std::make_unique(modelPath.UTF8String); - [_self - measureWithMetrics:@[ [XCTClockMetric new], [XCTMemoryMetric new] ] - options:XCTMeasureOptions.defaultOptions - block:^{ - XCTAssertEqual(module->load_method("forward"), - Error::Ok); - }]; - }); - class_addMethod( - [self class], testLoadSelector, testLoadImplementation, "v@:"); - - SEL testForwardSelector = NSSelectorFromString( - [NSString stringWithFormat:@"test_forward_%@", modelName]); - IMP testForwardImplementation = imp_implementationWithBlock(^(id _self) { - auto __block module = std::make_unique(modelPath.UTF8String); - XCTAssertEqual(module->load_method("forward"), Error::Ok); - - const auto method_meta = module->method_meta("forward"); - XCTAssertEqual(method_meta.error(), Error::Ok); - - const auto num_inputs = method_meta->num_inputs(); - XCTAssertGreaterThan(num_inputs, 0); - - std::vector> buffers; - buffers.reserve(num_inputs); - std::vector tensors; - tensors.reserve(num_inputs); - std::vector __block inputs; - inputs.reserve(num_inputs); - - for (auto index = 0; index < num_inputs; ++index) { - auto input_tag = method_meta->input_tag(index); - XCTAssertEqual(input_tag.error(), Error::Ok); - - switch (*input_tag) { - case Tag::Tensor: { - const auto tensor_meta = method_meta->input_tensor_meta(index); - XCTAssertEqual(tensor_meta.error(), 
Error::Ok); - - const auto sizes = tensor_meta->sizes(); - buffers.emplace_back(tensor_meta->nbytes(), - 0b01010101); // Set all bytes to be non-zero. - tensors.emplace_back(from_blob(buffers.rbegin()->data(), - {sizes.begin(), sizes.end()}, - tensor_meta->scalar_type())); - inputs.emplace_back(tensors.back()); - } break; - default: - XCTFail("Unsupported tag %i at input %d", *input_tag, index); - } + if (self != [self class]) { + return; + } + for (NSBundle *bundle in @[ + [NSBundle mainBundle], + [NSBundle bundleForClass:[self class]], + ]) { + for (NSString *directory in @[ + @"Models", + @"aatp/data", + ]) { + NSString *directoryPath = + [bundle.resourcePath stringByAppendingPathComponent:directory]; + NSArray *filePaths = + [NSFileManager.defaultManager contentsOfDirectoryAtPath:directoryPath + error:nil]; + for (NSString *filePath in filePaths) { + if (![filePath hasSuffix:@".pte"]) { + continue; } - [_self - measureWithMetrics:@[ [XCTClockMetric new], [XCTMemoryMetric new] ] - options:XCTMeasureOptions.defaultOptions - block:^{ - XCTAssertEqual(module->forward(inputs).error(), - Error::Ok); - }]; - }); - class_addMethod( - [self class], testForwardSelector, testForwardImplementation, "v@:"); + NSString *modelPath = + [directoryPath stringByAppendingPathComponent:filePath]; + NSString *directoryName = + [directory stringByReplacingOccurrencesOfString:@"/" + withString:@"_"] + .lowercaseString; + NSString *modelName = + modelPath.lastPathComponent.stringByDeletingPathExtension; + + SEL testLoadSelector = NSSelectorFromString([NSString + stringWithFormat:@"test_load_%@_%@", directoryName, modelName]); + IMP testLoadImplementation = imp_implementationWithBlock(^(id _self) { + auto __block module = std::make_unique<Module>(modelPath.UTF8String); + [_self measureWithMetrics:@[ + [XCTClockMetric new], + [XCTMemoryMetric new], + ] + options:XCTMeasureOptions.defaultOptions + block:^{ + XCTAssertEqual(module->load_method("forward"), + Error::Ok); + }]; + }); + class_addMethod( + [self class], testLoadSelector, testLoadImplementation, "v@:"); + + SEL testForwardSelector = NSSelectorFromString([NSString + stringWithFormat:@"test_forward_%@_%@", directoryName, modelName]); + IMP testForwardImplementation = imp_implementationWithBlock(^( + id _self) { + auto __block module = std::make_unique<Module>(modelPath.UTF8String); + XCTAssertEqual(module->load_method("forward"), Error::Ok); + + const auto method_meta = module->method_meta("forward"); + XCTAssertEqual(method_meta.error(), Error::Ok); + + const auto num_inputs = method_meta->num_inputs(); + XCTAssertGreaterThan(num_inputs, 0); + + std::vector<TensorPtr> __block tensors; + tensors.reserve(num_inputs); + std::vector<EValue> __block inputs; + inputs.reserve(num_inputs); + + for (auto index = 0; index < num_inputs; ++index) { + const auto input_tag = method_meta->input_tag(index); + XCTAssertEqual(input_tag.error(), Error::Ok); + + switch (*input_tag) { + case Tag::Tensor: { + const auto tensor_meta = method_meta->input_tensor_meta(index); + XCTAssertEqual(tensor_meta.error(), Error::Ok); + + const auto sizes = tensor_meta->sizes(); + tensors.emplace_back(make_tensor_ptr( + tensor_meta->scalar_type(), + {sizes.begin(), sizes.end()}, + std::vector<uint8_t>(tensor_meta->nbytes(), 0b01010101))); + inputs.emplace_back(tensors.back()); + } break; + default: + XCTFail("Unsupported tag %i at input %d", *input_tag, index); + } + } + [_self measureWithMetrics:@[ + [XCTClockMetric new], + [XCTMemoryMetric new], + ] + options:XCTMeasureOptions.defaultOptions + block:^{ +
XCTAssertEqual(module->forward(inputs).error(), + Error::Ok); + }]; + }); + class_addMethod([self class], + testForwardSelector, + testForwardImplementation, + "v@:"); + } } } } diff --git a/extension/kernel_util/make_boxed_from_unboxed_functor.h b/extension/kernel_util/make_boxed_from_unboxed_functor.h index 2b21914f49b..409c981cbb1 100644 --- a/extension/kernel_util/make_boxed_from_unboxed_functor.h +++ b/extension/kernel_util/make_boxed_from_unboxed_functor.h @@ -173,9 +173,9 @@ static executorch::runtime::Kernel make_boxed_kernel( } // namespace extension } // namespace executorch -#define EXECUTORCH_LIBRARY(ns, op_name, func) \ - static auto res_##ns = ::executorch::runtime::register_kernels( \ - ::executorch::extension::make_boxed_kernel( \ +#define EXECUTORCH_LIBRARY(ns, op_name, func) \ + static auto res_##ns = ::executorch::runtime::register_kernel( \ + ::executorch::extension::make_boxed_kernel( \ #ns "::" op_name, EXECUTORCH_FN(func))) namespace torch { diff --git a/extension/kernel_util/test/make_boxed_from_unboxed_functor_test.cpp b/extension/kernel_util/test/make_boxed_from_unboxed_functor_test.cpp index da9596def70..dce3694d517 100644 --- a/extension/kernel_util/test/make_boxed_from_unboxed_functor_test.cpp +++ b/extension/kernel_util/test/make_boxed_from_unboxed_functor_test.cpp @@ -21,10 +21,11 @@ using exec_aten::ScalarType; using exec_aten::Tensor; using exec_aten::TensorImpl; using executorch::runtime::BoxedEvalueList; +using executorch::runtime::Error; using executorch::runtime::EValue; -using executorch::runtime::getOpsFn; -using executorch::runtime::hasOpsFn; +using executorch::runtime::get_op_function_from_registry; using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::registry_has_op_function; Tensor& my_op_out(KernelRuntimeContext& ctx, const Tensor& a, Tensor& out) { (void)ctx; @@ -91,12 +92,12 @@ class MakeBoxedFromUnboxedFunctorTest : public ::testing::Test { TEST_F(MakeBoxedFromUnboxedFunctorTest, Basic) { EXECUTORCH_LIBRARY(my_ns, "my_op.out", my_op_out); - EXPECT_TRUE(hasOpsFn("my_ns::my_op.out")); + EXPECT_TRUE(registry_has_op_function("my_ns::my_op.out")); } TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxLogicWorks) { EXECUTORCH_LIBRARY(my_ns, "set_1.out", set_1_out); - EXPECT_TRUE(hasOpsFn("my_ns::set_1.out")); + EXPECT_TRUE(registry_has_op_function("my_ns::set_1.out")); // prepare out tensor TensorImpl::SizesType sizes[1] = {5}; @@ -106,7 +107,8 @@ TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxLogicWorks) { auto a = Tensor(&a_impl); // get boxed callable - auto fn = getOpsFn("my_ns::set_1.out"); + auto fn = get_op_function_from_registry("my_ns::set_1.out"); + ASSERT_EQ(fn.error(), Error::Ok); // run it KernelRuntimeContext context; @@ -115,7 +117,7 @@ EValue* stack[1]; stack[0] = &values[0]; - fn(context, stack); + (*fn)(context, stack); // check result EXPECT_EQ(a.const_data_ptr<int32_t>()[0], 1); @@ -123,7 +125,7 @@ TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxArrayRef) { EXECUTORCH_LIBRARY(my_ns, "add_tensor.out", add_tensor_out); - EXPECT_TRUE(hasOpsFn("my_ns::add_tensor.out")); + EXPECT_TRUE(registry_has_op_function("my_ns::add_tensor.out")); // prepare ArrayRef input. torch::executor::testing::TensorFactory<ScalarType::Int> tf; @@ -135,13 +137,14 @@ TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxArrayRef) { // prepare out tensor.
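For orientation, the rewritten `Tests.mm` above reduces to one input-synthesis pattern: read the `MethodMeta` of `forward`, allocate a non-zero byte buffer per tensor input, and wrap it with `make_tensor_ptr`. The following is a minimal C++ sketch of that pattern, assuming the `Module`/`TensorPtr` APIs used in the hunk; the helper name `run_forward_once` and the header paths are illustrative, not part of the PR.

```cpp
#include <cstdint>
#include <vector>

#include <executorch/extension/module/module.h> // assumed header paths
#include <executorch/extension/tensor/tensor.h>

using executorch::extension::Module;
using executorch::extension::TensorPtr;
using executorch::extension::make_tensor_ptr;
using executorch::runtime::Error;
using executorch::runtime::EValue;
using executorch::runtime::Tag;

// Load a .pte file and run "forward" once on synthesized inputs,
// mirroring the test bodies registered above.
Error run_forward_once(const char* model_path) {
  Module module(model_path);
  if (const auto error = module.load_method("forward"); error != Error::Ok) {
    return error;
  }
  const auto method_meta = module.method_meta("forward");
  if (method_meta.error() != Error::Ok) {
    return method_meta.error();
  }
  std::vector<TensorPtr> tensors;
  std::vector<EValue> inputs;
  for (size_t index = 0; index < method_meta->num_inputs(); ++index) {
    const auto input_tag = method_meta->input_tag(index);
    if (input_tag.error() != Error::Ok || *input_tag != Tag::Tensor) {
      continue; // only tensor inputs are synthesized in this sketch
    }
    const auto tensor_meta = method_meta->input_tensor_meta(index);
    const auto sizes = tensor_meta->sizes();
    // Fill every byte with a non-zero pattern, as the test above does.
    tensors.emplace_back(make_tensor_ptr(
        tensor_meta->scalar_type(),
        {sizes.begin(), sizes.end()},
        std::vector<uint8_t>(tensor_meta->nbytes(), 0b01010101)));
    inputs.emplace_back(tensors.back());
  }
  return module.forward(inputs).error();
}
```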
EValue out(tf.zeros({5})); - auto fn = getOpsFn("my_ns::add_tensor.out"); + auto fn = get_op_function_from_registry("my_ns::add_tensor.out"); + ASSERT_EQ(fn.error(), Error::Ok); // run it. KernelRuntimeContext context; EValue values[2] = {boxed_array_ref, out}; EValue* stack[2] = {&values[0], &values[1]}; - fn(context, stack); + (*fn)(context, stack); // check result. for (int i = 0; i < 5; i++) { @@ -151,7 +154,7 @@ TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxOptional) { EXECUTORCH_LIBRARY(my_ns, "add_optional_scalar.out", add_optional_scalar_out); - EXPECT_TRUE(hasOpsFn("my_ns::add_optional_scalar.out")); + EXPECT_TRUE(registry_has_op_function("my_ns::add_optional_scalar.out")); // prepare optional input. EValue scalar((int64_t)3); @@ -160,13 +163,14 @@ // prepare out tensor. torch::executor::testing::TensorFactory<ScalarType::Int> tf; EValue out(tf.ones({1})); - auto fn = getOpsFn("my_ns::add_optional_scalar.out"); + auto fn = get_op_function_from_registry("my_ns::add_optional_scalar.out"); + ASSERT_EQ(fn.error(), Error::Ok); // run it. KernelRuntimeContext context; EValue values[3] = {scalar, scalar_none, out}; EValue* stack[3] = {&values[0], &values[1], &values[2]}; - fn(context, stack); + (*fn)(context, stack); // check result. EXPECT_EQ(stack[2]->toTensor().const_data_ptr<int32_t>()[0], 4); @@ -174,7 +178,7 @@ TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxOptionalArrayRef) { EXECUTORCH_LIBRARY(my_ns, "add_optional_tensor.out", add_optional_tensor_out); - EXPECT_TRUE(hasOpsFn("my_ns::add_optional_tensor.out")); + EXPECT_TRUE(registry_has_op_function("my_ns::add_optional_tensor.out")); // prepare optional tensors. torch::executor::testing::TensorFactory<ScalarType::Int> tf; @@ -186,13 +190,14 @@ TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxOptionalArrayRef) { // prepare out tensor. EValue out(tf.zeros({5})); - auto fn = getOpsFn("my_ns::add_optional_tensor.out"); + auto fn = get_op_function_from_registry("my_ns::add_optional_tensor.out"); + ASSERT_EQ(fn.error(), Error::Ok); // run it. KernelRuntimeContext context; EValue values[2] = {boxed_array_ref, out}; EValue* stack[2] = {&values[0], &values[1]}; - fn(context, stack); + (*fn)(context, stack); // check result.
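The test migration above follows one pattern throughout: `hasOpsFn`/`getOpsFn` become `registry_has_op_function`/`get_op_function_from_registry`, and the lookup now returns a `Result` that must be checked before the boxed function is invoked through `operator*`. A condensed sketch of that calling convention; the helper name `invoke_registered_op` is illustrative, and the header path is an assumption:

```cpp
#include <executorch/runtime/kernel/operator_registry.h> // assumed header

using executorch::runtime::Error;
using executorch::runtime::EValue;
using executorch::runtime::KernelRuntimeContext;

// Look up and invoke a boxed kernel by its fully qualified name,
// e.g. "my_ns::my_op.out". Returns false if the op cannot be resolved.
bool invoke_registered_op(
    const char* name,
    KernelRuntimeContext& context,
    EValue** stack) {
  if (!executorch::runtime::registry_has_op_function(name)) {
    return false; // never registered, e.g. via EXECUTORCH_LIBRARY
  }
  auto fn = executorch::runtime::get_op_function_from_registry(name);
  if (fn.error() != Error::Ok) {
    return false; // the Result must be checked before dereferencing
  }
  (*fn)(context, stack); // dereference the Result to reach the callable
  return true;
}
```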
for (int i = 0; i < 5; i++) { diff --git a/extension/llm/custom_ops/op_sdpa.cpp b/extension/llm/custom_ops/op_sdpa.cpp index 56db1c208ea..c5ac365825b 100644 --- a/extension/llm/custom_ops/op_sdpa.cpp +++ b/extension/llm/custom_ops/op_sdpa.cpp @@ -158,7 +158,7 @@ static inline scalar_t* conditional_data_ptr(scalar_t* ptr, scalar_t* ptr2) { template < typename scalar_t, typename std::enable_if_t< - ::executorch::runtime::is_reduced_floating_point<scalar_t>::value, + ::executorch::runtime::is_reduced_floating_point_v<scalar_t>, int> = 0> static inline scalar_t* conditional_data_ptr(float* ptr, scalar_t* ptr2) { (void)ptr; @@ -247,7 +247,7 @@ void cpu_flash_attention( "KV_split_size must be greater than q_split_size"); constexpr bool is_reduced_type = - ::executorch::runtime::is_reduced_floating_point<scalar_t>::value; + ::executorch::runtime::is_reduced_floating_point_v<scalar_t>; ET_CHECK_MSG( !is_reduced_type, "FlashAttention does not support reduced types."); diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/.clang-format b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/.clang-format deleted file mode 100644 index 4b3f13fa55e..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/.clang-format +++ /dev/null @@ -1,5 +0,0 @@ ---- -Language: Cpp -BasedOnStyle: Google -... - diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/.gitignore b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/.gitignore deleted file mode 100644 index 3c1b4f2183e..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/.gitignore +++ /dev/null @@ -1,46 +0,0 @@ -*.a -*.so -*.so.?* -*.dll -*.exe -*.dylib -*.cmake -!/cmake/*.cmake -*~ -*.pyc -__pycache__ - -# lcov -*.lcov -/lcov - -# cmake files. -/Testing -CMakeCache.txt -CMakeFiles/ -cmake_install.cmake - -# makefiles. -Makefile - -# in-source build. -bin/ -lib/ -/test/*_test - -# exuberant ctags. -tags - -# YouCompleteMe configuration. -.ycm_extra_conf.pyc - -# ninja generated files. -.ninja_deps -.ninja_log -build.ninja -install_manifest.txt -rules.ninja - -# out-of-source build top-level folders.
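The `op_sdpa.cpp` hunk above swaps the `::value` spelling for the `_v` variable template. For readers unfamiliar with the idiom, this is the conventional C++17 shorthand; a generic sketch, assuming a trait shaped like the one in `executorch/runtime` (the real trait is specialized for the reduced floating-point types such as `Half` and `BFloat16`):

```cpp
#include <type_traits>

// The trait itself; specializations mark the reduced floating-point types.
template <typename T>
struct is_reduced_floating_point : std::false_type {};

// C++17 variable template: is_reduced_floating_point_v<T> is shorthand
// for is_reduced_floating_point<T>::value, as used in op_sdpa.cpp above.
template <typename T>
constexpr bool is_reduced_floating_point_v =
    is_reduced_floating_point<T>::value;
```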
-build/ -_build/ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/.travis-libcxx-setup.sh b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/.travis-libcxx-setup.sh deleted file mode 100644 index a591743c6a6..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/.travis-libcxx-setup.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env bash - -# Install a newer CMake version -curl -sSL https://cmake.org/files/v3.6/cmake-3.6.1-Linux-x86_64.sh -o install-cmake.sh -chmod +x install-cmake.sh -sudo ./install-cmake.sh --prefix=/usr/local --skip-license - -# Checkout LLVM sources -git clone --depth=1 https://github.com/llvm-mirror/llvm.git llvm-source -git clone --depth=1 https://github.com/llvm-mirror/libcxx.git llvm-source/projects/libcxx -git clone --depth=1 https://github.com/llvm-mirror/libcxxabi.git llvm-source/projects/libcxxabi - -# Setup libc++ options -if [ -z "$BUILD_32_BITS" ]; then - export BUILD_32_BITS=OFF && echo disabling 32 bit build -fi - -# Build and install libc++ (Use unstable ABI for better sanitizer coverage) -mkdir llvm-build && cd llvm-build -cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=/usr \ - -DLIBCXX_ABI_UNSTABLE=ON \ - -DLLVM_USE_SANITIZER=${LIBCXX_SANITIZER} \ - -DLLVM_BUILD_32_BITS=${BUILD_32_BITS} \ - ../llvm-source -make cxx -j2 -sudo make install-cxxabi install-cxx -cd ../ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/.travis.yml b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/.travis.yml deleted file mode 100644 index 36df088446c..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/.travis.yml +++ /dev/null @@ -1,157 +0,0 @@ -sudo: required -dist: trusty -language: cpp - -env: - global: - - /usr/local/bin:$PATH - -matrix: - include: - - compiler: gcc - addons: - apt: - packages: - - lcov - env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Coverage - - compiler: gcc - env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Debug - - compiler: gcc - env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Release - - compiler: gcc - addons: - apt: - packages: - - g++-multilib - env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Debug BUILD_32_BITS=ON - - compiler: gcc - addons: - apt: - packages: - - g++-multilib - env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Release BUILD_32_BITS=ON - - compiler: gcc - addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - g++-6 - env: - - COMPILER=g++-6 C_COMPILER=gcc-6 BUILD_TYPE=Debug - - EXTRA_FLAGS="-fno-omit-frame-pointer -g -O2 -fsanitize=undefined,address -fuse-ld=gold" - - compiler: clang - env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Debug - - compiler: clang - env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Release - # Clang w/ libc++ - - compiler: clang - addons: - apt: - packages: - clang-3.8 - env: - - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug - - LIBCXX_BUILD=1 - - EXTRA_FLAGS="-stdlib=libc++" - - compiler: clang - addons: - apt: - packages: - clang-3.8 - env: - - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release - - LIBCXX_BUILD=1 - - EXTRA_FLAGS="-stdlib=libc++" - # Clang w/ 32bit libc++ - - compiler: clang - addons: - apt: - packages: - - clang-3.8 - - g++-multilib - env: - - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug - - LIBCXX_BUILD=1 - - BUILD_32_BITS=ON - - EXTRA_FLAGS="-stdlib=libc++ -m32" - # Clang w/ 32bit libc++ - - compiler: clang - addons: - apt: - packages: - - clang-3.8 - 
- g++-multilib - env: - - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release - - LIBCXX_BUILD=1 - - BUILD_32_BITS=ON - - EXTRA_FLAGS="-stdlib=libc++ -m32" - # Clang w/ libc++, ASAN, UBSAN - - compiler: clang - addons: - apt: - packages: - clang-3.8 - env: - - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug - - LIBCXX_BUILD=1 LIBCXX_SANITIZER="Undefined;Address" - - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=undefined,address -fno-sanitize-recover=all" - - UBSAN_OPTIONS=print_stacktrace=1 - # Clang w/ libc++ and MSAN - - compiler: clang - addons: - apt: - packages: - clang-3.8 - env: - - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug - - LIBCXX_BUILD=1 LIBCXX_SANITIZER=MemoryWithOrigins - - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins" - # Clang w/ libc++ and MSAN - - compiler: clang - addons: - apt: - packages: - clang-3.8 - env: - - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=RelWithDebInfo - - LIBCXX_BUILD=1 LIBCXX_SANITIZER=Thread - - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all" - - - os: osx - osx_image: xcode8.3 - compiler: clang - env: - - COMPILER=clang++ BUILD_TYPE=Debug - - os: osx - osx_image: xcode8.3 - compiler: clang - env: - - COMPILER=clang++ BUILD_TYPE=Release - -before_script: - - if [ -z "$BUILD_32_BITS" ]; then - export BUILD_32_BITS=OFF && echo disabling 32 bit build; - fi - - if [ -n "${LIBCXX_BUILD}" ]; then - source .travis-libcxx-setup.sh; - fi - - mkdir build && cd build - -install: - - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then - PATH=~/.local/bin:${PATH}; - pip install --user --upgrade pip; - pip install --user cpp-coveralls; - fi - -script: - - cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_CXX_FLAGS="${EXTRA_FLAGS}" -DBENCHMARK_BUILD_32_BITS=${BUILD_32_BITS} .. - - make - - ctest -C ${BUILD_TYPE} --output-on-failure - -after_success: - - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then - coveralls --include src --include include --gcov-options '\-lp' --root .. --build-root .; - fi diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/AUTHORS b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/AUTHORS deleted file mode 100644 index ae278df4046..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/AUTHORS +++ /dev/null @@ -1,40 +0,0 @@ -# This is the official list of benchmark authors for copyright purposes. -# This file is distinct from the CONTRIBUTORS files. -# See the latter for an explanation. -# -# Names should be added to this file as: -# Name or Organization -# The email address is not required for organizations. -# -# Please keep the list sorted. - -Albert Pretorius -Arne Beer -Christopher Seymour -David Coeurjolly -Dominic Hamon -Eric Fiselier -Eugene Zhuk -Evgeny Safronov -Felix Homann -Google Inc. 
-International Business Machines Corporation -Ismael Jimenez Martinez -Jern-Kuan Leong -Joao Paulo Magalhaes -JianXiong Zhou -Jussi Knuuttila -Kaito Udagawa -Lei Xu -Matt Clarkson -Maxim Vafin -Nick Hutchinson -Oleksandr Sochka -Paul Redmond -Radoslav Yovchev -Shuo Chen -Yixuan Qiu -Yusuke Suzuki -Dirac Research -Zbigniew Skowron -Dominik Czarnota diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/CMakeLists.txt b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/CMakeLists.txt deleted file mode 100644 index f7f1566f569..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/CMakeLists.txt +++ /dev/null @@ -1,202 +0,0 @@ -cmake_minimum_required (VERSION 2.8.12) - -project (benchmark) - -foreach(p - CMP0054 # CMake 3.1 - CMP0056 # export EXE_LINKER_FLAGS to try_run - ) - if(POLICY ${p}) - cmake_policy(SET ${p} NEW) - endif() -endforeach() - -option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON) -option(BENCHMARK_ENABLE_EXCEPTIONS "Enable the use of exceptions in the benchmark library." ON) -option(BENCHMARK_ENABLE_LTO "Enable link time optimisation of the benchmark library." OFF) -option(BENCHMARK_USE_LIBCXX "Build and test using libc++ as the standard library." OFF) -option(BENCHMARK_BUILD_32_BITS "Build a 32 bit version of the library" OFF) - -# Make sure we can import out CMake functions -list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") - -# Read the git tags to determine the project version -include(GetGitVersion) -get_git_version(GIT_VERSION) - -# Tell the user what versions we are using -string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" VERSION ${GIT_VERSION}) -message("-- Version: ${VERSION}") - -# The version of the libraries -set(GENERIC_LIB_VERSION ${VERSION}) -string(SUBSTRING ${VERSION} 0 1 GENERIC_LIB_SOVERSION) - -# Import our CMake modules -include(CheckCXXCompilerFlag) -include(AddCXXCompilerFlag) -include(CXXFeatureCheck) - -if (BENCHMARK_BUILD_32_BITS) - add_required_cxx_compiler_flag(-m32) -endif() - -if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") - # Turn compiler warnings up to 11 - string(REGEX REPLACE "[-/]W[1-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4") - add_definitions(-D_CRT_SECURE_NO_WARNINGS) - - if (NOT BENCHMARK_ENABLE_EXCEPTIONS) - add_cxx_compiler_flag(-EHs-) - add_cxx_compiler_flag(-EHa-) - endif() - # Link time optimisation - if (BENCHMARK_ENABLE_LTO) - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /GL") - set(CMAKE_STATIC_LINKER_FLAGS_RELEASE "${CMAKE_STATIC_LINKER_FLAGS_RELEASE} /LTCG") - set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /LTCG") - set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /LTCG") - - set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /GL") - string(REGEX REPLACE "[-/]INCREMENTAL" "/INCREMENTAL:NO" CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO}") - set(CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO} /LTCG") - string(REGEX REPLACE "[-/]INCREMENTAL" "/INCREMENTAL:NO" CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO}") - set(CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO} /LTCG") - string(REGEX REPLACE "[-/]INCREMENTAL" "/INCREMENTAL:NO" CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO}") - set(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO 
"${CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO} /LTCG") - - set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /GL") - set(CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL "${CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL} /LTCG") - set(CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL "${CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL} /LTCG") - set(CMAKE_EXE_LINKER_FLAGS_MINSIZEREL "${CMAKE_EXE_LINKER_FLAGS_MINSIZEREL} /LTCG") - endif() -else() - # Try and enable C++11. Don't use C++14 because it doesn't work in some - # configurations. - add_cxx_compiler_flag(-std=c++11) - if (NOT HAVE_CXX_FLAG_STD_CXX11) - add_cxx_compiler_flag(-std=c++0x) - endif() - - # Turn compiler warnings up to 11 - add_cxx_compiler_flag(-Wall) - - add_cxx_compiler_flag(-Wextra) - add_cxx_compiler_flag(-Wshadow) - add_cxx_compiler_flag(-Werror RELEASE) - add_cxx_compiler_flag(-Werror RELWITHDEBINFO) - add_cxx_compiler_flag(-Werror MINSIZEREL) - add_cxx_compiler_flag(-pedantic) - add_cxx_compiler_flag(-pedantic-errors) - add_cxx_compiler_flag(-Wshorten-64-to-32) - add_cxx_compiler_flag(-Wfloat-equal) - add_cxx_compiler_flag(-fstrict-aliasing) - if (NOT BENCHMARK_ENABLE_EXCEPTIONS) - add_cxx_compiler_flag(-fno-exceptions) - endif() - if (NOT BENCHMARK_USE_LIBCXX) - add_cxx_compiler_flag(-Wzero-as-null-pointer-constant) - endif() - if (HAVE_CXX_FLAG_FSTRICT_ALIASING) - if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel") #ICC17u2: Many false positives for Wstrict-aliasing - add_cxx_compiler_flag(-Wstrict-aliasing) - endif() - endif() - # ICC17u2: overloaded virtual function "benchmark::Fixture::SetUp" is only partially overridden - # (because of deprecated overload) - add_cxx_compiler_flag(-wd654) - add_cxx_compiler_flag(-Wthread-safety) - if (HAVE_CXX_FLAG_WTHREAD_SAFETY) - cxx_feature_check(THREAD_SAFETY_ATTRIBUTES) - endif() - - # On most UNIX like platforms g++ and clang++ define _GNU_SOURCE as a - # predefined macro, which turns on all of the wonderful libc extensions. - # However g++ doesn't do this in Cygwin so we have to define it ourselfs - # since we depend on GNU/POSIX/BSD extensions. - if (CYGWIN) - add_definitions(-D_GNU_SOURCE=1) - endif() - - # Link time optimisation - if (BENCHMARK_ENABLE_LTO) - add_cxx_compiler_flag(-flto) - if ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU") - find_program(GCC_AR gcc-ar) - if (GCC_AR) - set(CMAKE_AR ${GCC_AR}) - endif() - find_program(GCC_RANLIB gcc-ranlib) - if (GCC_RANLIB) - set(CMAKE_RANLIB ${GCC_RANLIB}) - endif() - endif() - endif() - - # Coverage build type - set(CMAKE_CXX_FLAGS_COVERAGE "${CMAKE_CXX_FLAGS_DEBUG}" CACHE STRING - "Flags used by the C++ compiler during coverage builds." - FORCE) - set(CMAKE_EXE_LINKER_FLAGS_COVERAGE - "${CMAKE_EXE_LINKER_FLAGS_DEBUG}" CACHE STRING - "Flags used for linking binaries during coverage builds." - FORCE) - set(CMAKE_SHARED_LINKER_FLAGS_COVERAGE - "${CMAKE_SHARED_LINKER_FLAGS_DEBUG}" CACHE STRING - "Flags used by the shared libraries linker during coverage builds." - FORCE) - mark_as_advanced( - CMAKE_CXX_FLAGS_COVERAGE - CMAKE_EXE_LINKER_FLAGS_COVERAGE - CMAKE_SHARED_LINKER_FLAGS_COVERAGE) - set(CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}" CACHE STRING - "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel Coverage." 
- FORCE) - add_cxx_compiler_flag(--coverage COVERAGE) -endif() - -if (BENCHMARK_USE_LIBCXX) - if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - add_cxx_compiler_flag(-stdlib=libc++) - elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR - "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") - add_cxx_compiler_flag(-nostdinc++) - message("libc++ header path must be manually specified using CMAKE_CXX_FLAGS") - # Adding -nodefaultlibs directly to CMAKE__LINKER_FLAGS will break - # configuration checks such as 'find_package(Threads)' - list(APPEND BENCHMARK_CXX_LINKER_FLAGS -nodefaultlibs) - # -lc++ cannot be added directly to CMAKE__LINKER_FLAGS because - # linker flags appear before all linker inputs and -lc++ must appear after. - list(APPEND BENCHMARK_CXX_LIBRARIES c++) - else() - message(FATAL "-DBENCHMARK_USE_LIBCXX:BOOL=ON is not supported for compiler") - endif() -endif(BENCHMARK_USE_LIBCXX) - -# C++ feature checks -# Determine the correct regular expression engine to use -cxx_feature_check(STD_REGEX) -cxx_feature_check(GNU_POSIX_REGEX) -cxx_feature_check(POSIX_REGEX) -if(NOT HAVE_STD_REGEX AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX) - message(FATAL_ERROR "Failed to determine the source files for the regular expression backend") -endif() -if (NOT BENCHMARK_ENABLE_EXCEPTIONS AND HAVE_STD_REGEX - AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX) - message(WARNING "Using std::regex with exceptions disabled is not fully supported") -endif() -cxx_feature_check(STEADY_CLOCK) -# Ensure we have pthreads -find_package(Threads REQUIRED) - -# Set up directories -include_directories(${PROJECT_SOURCE_DIR}/include) - -# Build the targets -add_subdirectory(src) - -if (BENCHMARK_ENABLE_TESTING) - enable_testing() - add_subdirectory(test) -endif() diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/CONTRIBUTING.md b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/CONTRIBUTING.md deleted file mode 100644 index 43de4c9d470..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/CONTRIBUTING.md +++ /dev/null @@ -1,58 +0,0 @@ -# How to contribute # - -We'd love to accept your patches and contributions to this project. There are -a just a few small guidelines you need to follow. - - -## Contributor License Agreement ## - -Contributions to any Google project must be accompanied by a Contributor -License Agreement. This is not a copyright **assignment**, it simply gives -Google permission to use and redistribute your contributions as part of the -project. - - * If you are an individual writing original source code and you're sure you - own the intellectual property, then you'll need to sign an [individual - CLA][]. - - * If you work for a company that wants to allow you to contribute your work, - then you'll need to sign a [corporate CLA][]. - -You generally only need to submit a CLA once, so if you've already submitted -one (even if it was for a different project), you probably don't need to do it -again. - -[individual CLA]: https://developers.google.com/open-source/cla/individual -[corporate CLA]: https://developers.google.com/open-source/cla/corporate - -Once your CLA is submitted (or if you already submitted one for -another Google project), make a commit adding yourself to the -[AUTHORS][] and [CONTRIBUTORS][] files. This commit can be part -of your first [pull request][]. - -[AUTHORS]: AUTHORS -[CONTRIBUTORS]: CONTRIBUTORS - - -## Submitting a patch ## - - 1. 
It's generally best to start by opening a new issue describing the bug or - feature you're intending to fix. Even if you think it's relatively minor, - it's helpful to know what people are working on. Mention in the initial - issue that you are planning to work on that bug or feature so that it can - be assigned to you. - - 1. Follow the normal process of [forking][] the project, and setup a new - branch to work in. It's important that each group of changes be done in - separate branches in order to ensure that a pull request only includes the - commits related to that bug or feature. - - 1. Do your best to have [well-formed commit messages][] for each change. - This provides consistency throughout the project, and ensures that commit - messages are able to be formatted properly by various git tools. - - 1. Finally, push the commits to your fork and submit a [pull request][]. - -[forking]: https://help.github.com/articles/fork-a-repo -[well-formed commit messages]: http://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html -[pull request]: https://help.github.com/articles/creating-a-pull-request diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/CONTRIBUTORS b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/CONTRIBUTORS deleted file mode 100644 index 9abb60865eb..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/CONTRIBUTORS +++ /dev/null @@ -1,59 +0,0 @@ -# People who have agreed to one of the CLAs and can contribute patches. -# The AUTHORS file lists the copyright holders; this file -# lists people. For example, Google employees are listed here -# but not in AUTHORS, because Google holds the copyright. -# -# Names should be added to this file only after verifying that -# the individual or the individual's organization has agreed to -# the appropriate Contributor License Agreement, found here: -# -# https://developers.google.com/open-source/cla/individual -# https://developers.google.com/open-source/cla/corporate -# -# The agreement for individuals can be filled out on the web. -# -# When adding J Random Contributor's name to this file, -# either J's name or J's organization's name should be -# added to the AUTHORS file, depending on whether the -# individual or corporate CLA was used. -# -# Names should be added to this file as: -# Name -# -# Please keep the list sorted. - -Albert Pretorius -Arne Beer -Billy Robert O'Neal III -Chris Kennelly -Christopher Seymour -David Coeurjolly -Dominic Hamon -Eric Fiselier -Eugene Zhuk -Evgeny Safronov -Felix Homann -Ismael Jimenez Martinez -Jern-Kuan Leong -Joao Paulo Magalhaes -JianXiong Zhou -Jussi Knuuttila -Kaito Udagawa -Kai Wolf -Lei Xu -Matt Clarkson -Maxim Vafin -Nick Hutchinson -Oleksandr Sochka -Pascal Leroy -Paul Redmond -Pierre Phaneuf -Radoslav Yovchev -Ray Glover -Shuo Chen -Tom Madams -Yixuan Qiu -Yusuke Suzuki -Tobias Ulvgård -Zbigniew Skowron -Dominik Czarnota diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/LICENSE b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/LICENSE deleted file mode 100644 index d6456956733..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/LICENSE +++ /dev/null @@ -1,202 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. 
- - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. 
This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/README.md b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/README.md deleted file mode 100644 index 2430d93bf9c..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/README.md +++ /dev/null @@ -1,726 +0,0 @@ -# benchmark -[![Build Status](https://travis-ci.org/google/benchmark.svg?branch=master)](https://travis-ci.org/google/benchmark) -[![Build status](https://ci.appveyor.com/api/projects/status/u0qsyp7t1tk7cpxs/branch/master?svg=true)](https://ci.appveyor.com/project/google/benchmark/branch/master) -[![Coverage Status](https://coveralls.io/repos/google/benchmark/badge.svg)](https://coveralls.io/r/google/benchmark) - -A library to support the benchmarking of functions, similar to unit-tests. - -Discussion group: https://groups.google.com/d/forum/benchmark-discuss - -IRC channel: https://freenode.net #googlebenchmark - -[Known issues and common problems](#known-issues) - -[Additional Tooling Documentation](docs/tools.md) - -## Example usage -### Basic usage -Define a function that executes the code to be measured. - -```c++ -static void BM_StringCreation(benchmark::State& state) { - while (state.KeepRunning()) - std::string empty_string; -} -// Register the function as a benchmark -BENCHMARK(BM_StringCreation); - -// Define another benchmark -static void BM_StringCopy(benchmark::State& state) { - std::string x = "hello"; - while (state.KeepRunning()) - std::string copy(x); -} -BENCHMARK(BM_StringCopy); - -BENCHMARK_MAIN(); -``` - -### Passing arguments -Sometimes a family of benchmarks can be implemented with just one routine that -takes an extra argument to specify which one of the family of benchmarks to -run. For example, the following code defines a family of benchmarks for -measuring the speed of `memcpy()` calls of different lengths: - -```c++ -static void BM_memcpy(benchmark::State& state) { - char* src = new char[state.range(0)]; - char* dst = new char[state.range(0)]; - memset(src, 'x', state.range(0)); - while (state.KeepRunning()) - memcpy(dst, src, state.range(0)); - state.SetBytesProcessed(int64_t(state.iterations()) * - int64_t(state.range(0))); - delete[] src; - delete[] dst; -} -BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10); -``` - -The preceding code is quite repetitive, and can be replaced with the following -short-hand. The following invocation will pick a few appropriate arguments in -the specified range and will generate a benchmark for each such argument. - -```c++ -BENCHMARK(BM_memcpy)->Range(8, 8<<10); -``` - -By default the arguments in the range are generated in multiples of eight and -the command above selects [ 8, 64, 512, 4k, 8k ]. In the following code the -range multiplier is changed to multiples of two. - -```c++ -BENCHMARK(BM_memcpy)->RangeMultiplier(2)->Range(8, 8<<10); -``` -Now arguments generated are [ 8, 16, 32, 64, 128, 256, 512, 1024, 2k, 4k, 8k ]. - -You might have a benchmark that depends on two or more inputs. For example, the -following code defines a family of benchmarks for measuring the speed of set -insertion. 
- -```c++ -static void BM_SetInsert(benchmark::State& state) { - while (state.KeepRunning()) { - state.PauseTiming(); - std::set<int> data = ConstructRandomSet(state.range(0)); - state.ResumeTiming(); - for (int j = 0; j < state.range(1); ++j) - data.insert(RandomNumber()); - } -} -BENCHMARK(BM_SetInsert) - ->Args({1<<10, 1}) - ->Args({1<<10, 8}) - ->Args({1<<10, 64}) - ->Args({1<<10, 512}) - ->Args({8<<10, 1}) - ->Args({8<<10, 8}) - ->Args({8<<10, 64}) - ->Args({8<<10, 512}); -``` - -The preceding code is quite repetitive, and can be replaced with the following -short-hand. The following macro will pick a few appropriate arguments in the -product of the two specified ranges and will generate a benchmark for each such -pair. - -```c++ -BENCHMARK(BM_SetInsert)->Ranges({{1<<10, 8<<10}, {1, 512}}); -``` - -For more complex patterns of inputs, passing a custom function to `Apply` allows -programmatic specification of an arbitrary set of arguments on which to run the -benchmark. The following example enumerates a dense range on one parameter, -and a sparse range on the second. - -```c++ -static void CustomArguments(benchmark::internal::Benchmark* b) { - for (int i = 0; i <= 10; ++i) - for (int j = 32; j <= 1024*1024; j *= 8) - b->Args({i, j}); -} -BENCHMARK(BM_SetInsert)->Apply(CustomArguments); -``` - -### Calculate asymptotic complexity (Big O) -Asymptotic complexity might be calculated for a family of benchmarks. The -following code will calculate the coefficient for the high-order term in the -running time and the normalized root-mean square error of string comparison. - -```c++ -static void BM_StringCompare(benchmark::State& state) { - std::string s1(state.range(0), '-'); - std::string s2(state.range(0), '-'); - while (state.KeepRunning()) { - benchmark::DoNotOptimize(s1.compare(s2)); - } - state.SetComplexityN(state.range(0)); -} -BENCHMARK(BM_StringCompare) - ->RangeMultiplier(2)->Range(1<<10, 1<<18)->Complexity(benchmark::oN); -``` - -As shown in the following invocation, asymptotic complexity might also be -calculated automatically. - -```c++ -BENCHMARK(BM_StringCompare) - ->RangeMultiplier(2)->Range(1<<10, 1<<18)->Complexity(); -``` - -The following code will specify asymptotic complexity with a lambda function, -that might be used to customize high-order term calculation. - -```c++ -BENCHMARK(BM_StringCompare)->RangeMultiplier(2) - ->Range(1<<10, 1<<18)->Complexity([](int n)->double{return n; }); -``` - -### Templated benchmarks -Templated benchmarks work the same way: This example produces and consumes -messages of size `sizeof(v)` `range_x` times. It also outputs throughput in the -absence of multiprogramming. - -```c++ -template <class Q> int BM_Sequential(benchmark::State& state) { - Q q; - typename Q::value_type v; - while (state.KeepRunning()) { - for (int i = state.range(0); i--; ) - q.push(v); - for (int e = state.range(0); e--; ) - q.Wait(&v); - } - // actually messages, not bytes: - state.SetBytesProcessed( - static_cast<int64_t>(state.iterations())*state.range(0)); -} -BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10); -``` - -Three macros are provided for adding benchmark templates. - -```c++ -#if __cplusplus >= 201103L // C++11 and greater. -#define BENCHMARK_TEMPLATE(func, ...) // Takes any number of parameters.
-#else // C++ < C++11 -#define BENCHMARK_TEMPLATE(func, arg1) -#endif -#define BENCHMARK_TEMPLATE1(func, arg1) -#define BENCHMARK_TEMPLATE2(func, arg1, arg2) -``` - -## Passing arbitrary arguments to a benchmark -In C++11 it is possible to define a benchmark that takes an arbitrary number -of extra arguments. The `BENCHMARK_CAPTURE(func, test_case_name, ...args)` -macro creates a benchmark that invokes `func` with the `benchmark::State` as -the first argument followed by the specified `args...`. -The `test_case_name` is appended to the name of the benchmark and -should describe the values passed. - -```c++ -template <class ...ExtraArgs> -void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) { - [...] -} -// Registers a benchmark named "BM_takes_args/int_string_test" that passes -// the specified values to `extra_args`. -BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc")); -``` -Note that elements of `...args` may refer to global variables. Users should -avoid modifying global state inside of a benchmark. - -## Using RegisterBenchmark(name, fn, args...) - -The `RegisterBenchmark(name, func, args...)` function provides an alternative -way to create and register benchmarks. -`RegisterBenchmark(name, func, args...)` creates, registers, and returns a -pointer to a new benchmark with the specified `name` that invokes -`func(st, args...)` where `st` is a `benchmark::State` object. - -Unlike the `BENCHMARK` registration macros, which can only be used at the global -scope, the `RegisterBenchmark` can be called anywhere. This allows for -benchmark tests to be registered programmatically. - -Additionally `RegisterBenchmark` allows any callable object to be registered -as a benchmark. Including capturing lambdas and function objects. This -allows the creation - -For Example: -```c++ -auto BM_test = [](benchmark::State& st, auto Inputs) { /* ... */ }; - -int main(int argc, char** argv) { - for (auto& test_input : { /* ... */ }) - benchmark::RegisterBenchmark(test_input.name(), BM_test, test_input); - benchmark::Initialize(&argc, argv); - benchmark::RunSpecifiedBenchmarks(); -} -``` - -### Multithreaded benchmarks -In a multithreaded test (benchmark invoked by multiple threads simultaneously), -it is guaranteed that none of the threads will start until all have called -`KeepRunning`, and all will have finished before KeepRunning returns false. As -such, any global setup or teardown can be wrapped in a check against the thread -index: - -```c++ -static void BM_MultiThreaded(benchmark::State& state) { - if (state.thread_index == 0) { - // Setup code here. - } - while (state.KeepRunning()) { - // Run the test as normal. - } - if (state.thread_index == 0) { - // Teardown code here. - } -} -BENCHMARK(BM_MultiThreaded)->Threads(2); -``` - -If the benchmarked code itself uses threads and you want to compare it to -single-threaded code, you may want to use real-time ("wallclock") measurements -for latency comparisons: - -```c++ -BENCHMARK(BM_test)->Range(8, 8<<10)->UseRealTime(); -``` - -Without `UseRealTime`, CPU time is used by default. - - -## Manual timing -For benchmarking something for which neither CPU time nor real-time are -correct or accurate enough, completely manual timing is supported using -the `UseManualTime` function. - -When `UseManualTime` is used, the benchmarked code must call -`SetIterationTime` once per iteration of the `KeepRunning` loop to -report the manually measured time. - -An example use case for this is benchmarking GPU execution (e.g.
OpenCL -or CUDA kernels, OpenGL or Vulkan or Direct3D draw calls), which cannot -be accurately measured using CPU time or real-time. Instead, they can be -measured accurately using a dedicated API, and these measurement results -can be reported back with `SetIterationTime`. - -```c++ -static void BM_ManualTiming(benchmark::State& state) { - int microseconds = state.range(0); - std::chrono::duration<double, std::micro> sleep_duration { - static_cast<double>(microseconds) - }; - - while (state.KeepRunning()) { - auto start = std::chrono::high_resolution_clock::now(); - // Simulate some useful workload with a sleep - std::this_thread::sleep_for(sleep_duration); - auto end = std::chrono::high_resolution_clock::now(); - - auto elapsed_seconds = - std::chrono::duration_cast<std::chrono::duration<double>>( - end - start); - - state.SetIterationTime(elapsed_seconds.count()); - } -} -BENCHMARK(BM_ManualTiming)->Range(1, 1<<17)->UseManualTime(); -``` - -### Preventing optimisation -To prevent a value or expression from being optimized away by the compiler -the `benchmark::DoNotOptimize(...)` and `benchmark::ClobberMemory()` -functions can be used. - -```c++ -static void BM_test(benchmark::State& state) { - while (state.KeepRunning()) { - int x = 0; - for (int i=0; i < 64; ++i) { - benchmark::DoNotOptimize(x += i); - } - } -} -``` - -`DoNotOptimize(<expr>)` forces the *result* of `<expr>` to be stored in either -memory or a register. For GNU based compilers it acts as read/write barrier -for global memory. More specifically it forces the compiler to flush pending -writes to memory and reload any other values as necessary. - -Note that `DoNotOptimize(<expr>)` does not prevent optimizations on `<expr>` -in any way. `<expr>` may even be removed entirely when the result is already -known. For example: - -```c++ - /* Example 1: `<expr>` is removed entirely. */ - int foo(int x) { return x + 42; } - while (...) DoNotOptimize(foo(0)); // Optimized to DoNotOptimize(42); - - /* Example 2: Result of '<expr>' is only reused */ - int bar(int) __attribute__((const)); - while (...) DoNotOptimize(bar(0)); // Optimized to: - // int __result__ = bar(0); - // while (...) DoNotOptimize(__result__); -``` - -The second tool for preventing optimizations is `ClobberMemory()`. In essence -`ClobberMemory()` forces the compiler to perform all pending writes to global -memory. Memory managed by block scope objects must be "escaped" using -`DoNotOptimize(...)` before it can be clobbered. In the below example -`ClobberMemory()` prevents the call to `v.push_back(42)` from being optimized -away. - -```c++ -static void BM_vector_push_back(benchmark::State& state) { - while (state.KeepRunning()) { - std::vector<int> v; - v.reserve(1); - benchmark::DoNotOptimize(v.data()); // Allow v.data() to be clobbered. - v.push_back(42); - benchmark::ClobberMemory(); // Force 42 to be written to memory. - } -} -``` - -Note that `ClobberMemory()` is only available for GNU or MSVC based compilers. - -### Set time unit manually -If a benchmark runs a few milliseconds it may be hard to visually compare the -measured times, since the output data is given in nanoseconds per default. In -order to manually set the time unit, you can specify it manually: - -```c++ -BENCHMARK(BM_test)->Unit(benchmark::kMillisecond); -``` - -## Controlling number of iterations -In all cases, the number of iterations for which the benchmark is run is -governed by the amount of time the benchmark takes. Concretely, the number of -iterations is at least one, not more than 1e9, until CPU time is greater than -the minimum time, or the wallclock time is 5x minimum time.
The minimum time is -set as a flag `--benchmark_min_time` or per benchmark by calling `MinTime` on -the registered benchmark object. - -## Reporting the mean and standard devation by repeated benchmarks -By default each benchmark is run once and that single result is reported. -However benchmarks are often noisy and a single result may not be representative -of the overall behavior. For this reason it's possible to repeatedly rerun the -benchmark. - -The number of runs of each benchmark is specified globally by the -`--benchmark_repetitions` flag or on a per benchmark basis by calling -`Repetitions` on the registered benchmark object. When a benchmark is run -more than once the mean and standard deviation of the runs will be reported. - -Additionally the `--benchmark_report_aggregates_only={true|false}` flag or -`ReportAggregatesOnly(bool)` function can be used to change how repeated tests -are reported. By default the result of each repeated run is reported. When this -option is 'true' only the mean and standard deviation of the runs is reported. -Calling `ReportAggregatesOnly(bool)` on a registered benchmark object overrides -the value of the flag for that benchmark. - -## Fixtures -Fixture tests are created by -first defining a type that derives from ::benchmark::Fixture and then -creating/registering the tests using the following macros: - -* `BENCHMARK_F(ClassName, Method)` -* `BENCHMARK_DEFINE_F(ClassName, Method)` -* `BENCHMARK_REGISTER_F(ClassName, Method)` - -For Example: - -```c++ -class MyFixture : public benchmark::Fixture {}; - -BENCHMARK_F(MyFixture, FooTest)(benchmark::State& st) { - while (st.KeepRunning()) { - ... - } -} - -BENCHMARK_DEFINE_F(MyFixture, BarTest)(benchmark::State& st) { - while (st.KeepRunning()) { - ... - } -} -/* BarTest is NOT registered */ -BENCHMARK_REGISTER_F(MyFixture, BarTest)->Threads(2); -/* BarTest is now registered */ -``` - - -## User-defined counters - -You can add your own counters with user-defined names. The example below -will add columns "Foo", "Bar" and "Baz" in its output: - -```c++ -static void UserCountersExample1(benchmark::State& state) { - double numFoos = 0, numBars = 0, numBazs = 0; - while (state.KeepRunning()) { - // ... count Foo,Bar,Baz events - } - state.counters["Foo"] = numFoos; - state.counters["Bar"] = numBars; - state.counters["Baz"] = numBazs; -} -``` - -The `state.counters` object is a `std::map` with `std::string` keys -and `Counter` values. The latter is a `double`-like class, via an implicit -conversion to `double&`. Thus you can use all of the standard arithmetic -assignment operators (`=,+=,-=,*=,/=`) to change the value of each counter. - -In multithreaded benchmarks, each counter is set on the calling thread only. -When the benchmark finishes, the counters from each thread will be summed; -the resulting sum is the value which will be shown for the benchmark. - -The `Counter` constructor accepts two parameters: the value as a `double` -and a bit flag which allows you to show counters as rates and/or as -per-thread averages: - -```c++ - // sets a simple counter - state.counters["Foo"] = numFoos; - - // Set the counter as a rate. It will be presented divided - // by the duration of the benchmark. - state.counters["FooRate"] = Counter(numFoos, benchmark::Counter::kIsRate); - - // Set the counter as a thread-average quantity. It will - // be presented divided by the number of threads. 
-
-## Fixtures
-Fixture tests are created by first defining a type that derives from
-`::benchmark::Fixture` and then creating/registering the tests using the
-following macros:
-
-* `BENCHMARK_F(ClassName, Method)`
-* `BENCHMARK_DEFINE_F(ClassName, Method)`
-* `BENCHMARK_REGISTER_F(ClassName, Method)`
-
-For example:
-
-```c++
-class MyFixture : public benchmark::Fixture {};
-
-BENCHMARK_F(MyFixture, FooTest)(benchmark::State& st) {
-  while (st.KeepRunning()) {
-    ...
-  }
-}
-
-BENCHMARK_DEFINE_F(MyFixture, BarTest)(benchmark::State& st) {
-  while (st.KeepRunning()) {
-    ...
-  }
-}
-/* BarTest is NOT registered */
-BENCHMARK_REGISTER_F(MyFixture, BarTest)->Threads(2);
-/* BarTest is now registered */
-```
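-
-As a more concrete (hypothetical) sketch, a fixture can override `SetUp` and
-`TearDown` so that expensive preparation happens outside the timed loop. The
-names `VectorFixture` and `Sum` below are purely illustrative:
-
-```c++
-class VectorFixture : public benchmark::Fixture {
- public:
-  void SetUp(const ::benchmark::State& state) {
-    // Build the input once per benchmark run, not inside the timed loop.
-    data.resize(state.range(0), 1);
-  }
-  void TearDown(const ::benchmark::State&) { data.clear(); }
-  std::vector<int> data;
-};
-
-BENCHMARK_DEFINE_F(VectorFixture, Sum)(benchmark::State& st) {
-  while (st.KeepRunning()) {
-    long sum = 0;
-    for (int v : data) sum += v;
-    benchmark::DoNotOptimize(sum);  // Keep the loop from being elided.
-  }
-}
-BENCHMARK_REGISTER_F(VectorFixture, Sum)->Arg(1024);
-```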
-
-## User-defined counters
-
-You can add your own counters with user-defined names. The example below
-will add columns "Foo", "Bar" and "Baz" in its output:
-
-```c++
-static void UserCountersExample1(benchmark::State& state) {
-  double numFoos = 0, numBars = 0, numBazs = 0;
-  while (state.KeepRunning()) {
-    // ... count Foo,Bar,Baz events
-  }
-  state.counters["Foo"] = numFoos;
-  state.counters["Bar"] = numBars;
-  state.counters["Baz"] = numBazs;
-}
-```
-
-The `state.counters` object is a `std::map` with `std::string` keys
-and `Counter` values. The latter is a `double`-like class, via an implicit
-conversion to `double&`. Thus you can use all of the standard arithmetic
-assignment operators (`=,+=,-=,*=,/=`) to change the value of each counter.
-
-In multithreaded benchmarks, each counter is set on the calling thread only.
-When the benchmark finishes, the counters from each thread will be summed;
-the resulting sum is the value which will be shown for the benchmark.
-
-The `Counter` constructor accepts two parameters: the value as a `double`
-and a bit flag which allows you to show counters as rates and/or as
-per-thread averages:
-
-```c++
-  // sets a simple counter
-  state.counters["Foo"] = numFoos;
-
-  // Set the counter as a rate. It will be presented divided
-  // by the duration of the benchmark.
-  state.counters["FooRate"] = Counter(numFoos, benchmark::Counter::kIsRate);
-
-  // Set the counter as a thread-average quantity. It will
-  // be presented divided by the number of threads.
-  state.counters["FooAvg"] = Counter(numFoos, benchmark::Counter::kAvgThreads);
-
-  // There's also a combined flag:
-  state.counters["FooAvgRate"] = Counter(numFoos, benchmark::Counter::kAvgThreadsRate);
-```
-
-When you're compiling in C++11 mode or later you can use `insert()` with
-`std::initializer_list`:
-
-```c++
-  // With C++11, this can be done:
-  state.counters.insert({{"Foo", numFoos}, {"Bar", numBars}, {"Baz", numBazs}});
-  // ... instead of:
-  state.counters["Foo"] = numFoos;
-  state.counters["Bar"] = numBars;
-  state.counters["Baz"] = numBazs;
-```
-
-### Counter reporting
-
-When using the console reporter, by default, user counters are printed at
-the end after the table, the same way as ``bytes_processed`` and
-``items_processed``. This is best for cases in which there are few counters,
-or where there are only a couple of lines per benchmark. Here's an example of
-the default output:
-
-```
-------------------------------------------------------------------------------
-Benchmark                        Time           CPU Iterations UserCounters...
-------------------------------------------------------------------------------
-BM_UserCounter/threads:8      2248 ns      10277 ns      68808 Bar=16 Bat=40 Baz=24 Foo=8
-BM_UserCounter/threads:1      9797 ns       9788 ns      71523 Bar=2 Bat=5 Baz=3 Foo=1024m
-BM_UserCounter/threads:2      4924 ns       9842 ns      71036 Bar=4 Bat=10 Baz=6 Foo=2
-BM_UserCounter/threads:4      2589 ns      10284 ns      68012 Bar=8 Bat=20 Baz=12 Foo=4
-BM_UserCounter/threads:8      2212 ns      10287 ns      68040 Bar=16 Bat=40 Baz=24 Foo=8
-BM_UserCounter/threads:16     1782 ns      10278 ns      68144 Bar=32 Bat=80 Baz=48 Foo=16
-BM_UserCounter/threads:32     1291 ns      10296 ns      68256 Bar=64 Bat=160 Baz=96 Foo=32
-BM_UserCounter/threads:4      2615 ns      10307 ns      68040 Bar=8 Bat=20 Baz=12 Foo=4
-BM_Factorial                    26 ns         26 ns   26608979 40320
-BM_Factorial/real_time          26 ns         26 ns   26587936 40320
-BM_CalculatePiRange/1           16 ns         16 ns   45704255 0
-BM_CalculatePiRange/8           73 ns         73 ns    9520927 3.28374
-BM_CalculatePiRange/64         609 ns        609 ns    1140647 3.15746
-BM_CalculatePiRange/512       4900 ns       4901 ns     142696 3.14355
-```
-
-If this doesn't suit you, you can print each counter as a table column by
-passing the flag `--benchmark_counters_tabular=true` to the benchmark
-application. This is best for cases in which there are a lot of counters, or
-a lot of lines per individual benchmark. Note that this will trigger a
-reprinting of the table header any time the counter set changes between
-individual benchmarks. Here's an example of corresponding output when
-`--benchmark_counters_tabular=true` is passed:
-
-```
----------------------------------------------------------------------------------------
-Benchmark                        Time           CPU Iterations    Bar   Bat   Baz   Foo
----------------------------------------------------------------------------------------
-BM_UserCounter/threads:8      2198 ns       9953 ns      70688     16    40    24     8
-BM_UserCounter/threads:1      9504 ns       9504 ns      73787      2     5     3     1
-BM_UserCounter/threads:2      4775 ns       9550 ns      72606      4    10     6     2
-BM_UserCounter/threads:4      2508 ns       9951 ns      70332      8    20    12     4
-BM_UserCounter/threads:8      2055 ns       9933 ns      70344     16    40    24     8
-BM_UserCounter/threads:16     1610 ns       9946 ns      70720     32    80    48    16
-BM_UserCounter/threads:32     1192 ns       9948 ns      70496     64   160    96    32
-BM_UserCounter/threads:4      2506 ns       9949 ns      70332      8    20    12     4
---------------------------------------------------------------
-Benchmark                        Time           CPU Iterations
---------------------------------------------------------------
-BM_Factorial                    26 ns         26 ns   26392245 40320
-BM_Factorial/real_time          26 ns         26 ns   26494107 40320
-BM_CalculatePiRange/1           15 ns         15 ns   45571597 0
-BM_CalculatePiRange/8           74 ns         74 ns    9450212 3.28374
-BM_CalculatePiRange/64         595 ns        595 ns    1173901 3.15746
-BM_CalculatePiRange/512       4752 ns       4752 ns     147380 3.14355
-BM_CalculatePiRange/4k       37970 ns      37972 ns      18453 3.14184
-BM_CalculatePiRange/32k     303733 ns     303744 ns       2305 3.14162
-BM_CalculatePiRange/256k   2434095 ns    2434186 ns        288 3.1416
-BM_CalculatePiRange/1024k  9721140 ns    9721413 ns         71 3.14159
-BM_CalculatePi/threads:8      2255 ns       9943 ns      70936
-```
-
-Note above the additional header printed when the benchmark changes from
-``BM_UserCounter`` to ``BM_Factorial``. This is because ``BM_Factorial`` does
-not have the same counter set as ``BM_UserCounter``.
-
-## Exiting Benchmarks in Error
-
-When errors caused by external influences, such as file I/O and network
-communication, occur within a benchmark, the
-`State::SkipWithError(const char* msg)` function can be used to skip that run
-of the benchmark and report the error. Note that only future iterations of the
-`KeepRunning()` loop are skipped. Users may explicitly return to exit the
-benchmark immediately.
-
-The `SkipWithError(...)` function may be used at any point within the benchmark,
-including before and after the `KeepRunning()` loop.
-
-For example:
-
-```c++
-static void BM_test(benchmark::State& state) {
-  auto resource = GetResource();
-  if (!resource.good()) {
-    state.SkipWithError("Resource is not good!");
-    // KeepRunning() loop will not be entered.
-  }
-  while (state.KeepRunning()) {
-    auto data = resource.read_data();
-    if (!resource.good()) {
-      state.SkipWithError("Failed to read data!");
-      break; // Needed to skip the rest of the iteration.
-    }
-    do_stuff(data);
-  }
-}
-```
-
-## Running a subset of the benchmarks
-
-The `--benchmark_filter=<regex>` option can be used to only run the benchmarks
-which match the specified `<regex>`. For example:
-
-```bash
-$ ./run_benchmarks.x --benchmark_filter=BM_memcpy/32
-Run on (1 X 2300 MHz CPU )
-2016-06-25 19:34:24
-Benchmark              Time           CPU Iterations
-----------------------------------------------------
-BM_memcpy/32          11 ns         11 ns   79545455
-BM_memcpy/32k       2181 ns       2185 ns     324074
-BM_memcpy/32          12 ns         12 ns   54687500
-BM_memcpy/32k       1834 ns       1837 ns     357143
-```
-
-
-## Output Formats
-The library supports multiple output formats. Use the
-`--benchmark_format=<console|json|csv>` flag to set the format type. `console`
-is the default format.
-
-The Console format is intended to be a human readable format. By default
-the format generates color output. Context is output on stderr and the
-tabular data on stdout. Example tabular output looks like:
-```
-Benchmark                               Time(ns)    CPU(ns) Iterations
-----------------------------------------------------------------------
-BM_SetInsert/1024/1                        28928      29349      23853  133.097kB/s  33.2742k items/s
-BM_SetInsert/1024/8                        32065      32913      21375  949.487kB/s  237.372k items/s
-BM_SetInsert/1024/10                       33157      33648      21431  1.13369MB/s  290.225k items/s
-```
-
-The JSON format outputs human readable json split into two top level attributes.
-The `context` attribute contains information about the run in general, including
-information about the CPU and the date.
-The `benchmarks` attribute contains a list of every benchmark run. Example json
-output looks like:
-```json
-{
-  "context": {
-    "date": "2015/03/17-18:40:25",
-    "num_cpus": 40,
-    "mhz_per_cpu": 2801,
-    "cpu_scaling_enabled": false,
-    "build_type": "debug"
-  },
-  "benchmarks": [
-    {
-      "name": "BM_SetInsert/1024/1",
-      "iterations": 94877,
-      "real_time": 29275,
-      "cpu_time": 29836,
-      "bytes_per_second": 134066,
-      "items_per_second": 33516
-    },
-    {
-      "name": "BM_SetInsert/1024/8",
-      "iterations": 21609,
-      "real_time": 32317,
-      "cpu_time": 32429,
-      "bytes_per_second": 986770,
-      "items_per_second": 246693
-    },
-    {
-      "name": "BM_SetInsert/1024/10",
-      "iterations": 21393,
-      "real_time": 32724,
-      "cpu_time": 33355,
-      "bytes_per_second": 1199226,
-      "items_per_second": 299807
-    }
-  ]
-}
-```
-
-The CSV format outputs comma-separated values. The `context` is output on stderr
-and the CSV itself on stdout. Example CSV output looks like:
-```
-name,iterations,real_time,cpu_time,bytes_per_second,items_per_second,label
-"BM_SetInsert/1024/1",65465,17890.7,8407.45,475768,118942,
-"BM_SetInsert/1024/8",116606,18810.1,9766.64,3.27646e+06,819115,
-"BM_SetInsert/1024/10",106365,17238.4,8421.53,4.74973e+06,1.18743e+06,
-```
-
-## Output Files
-The library supports writing the output of the benchmark to a file specified
-by `--benchmark_out=<filename>`. The format of the output can be specified
-using `--benchmark_out_format={json|console|csv}`. Specifying
-`--benchmark_out` does not suppress the console output.
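-
-For example (assuming a benchmark binary named `mybench`, which is purely
-illustrative), the following writes JSON results to a file while still
-printing the console report:
-
-```bash
-$ ./mybench --benchmark_out=results.json --benchmark_out_format=json
-```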
-
-## Debug vs Release
-By default, benchmark builds as a debug library. You will see a warning in the
-output when this is the case. To build it as a release library instead, use:
-
-```
-cmake -DCMAKE_BUILD_TYPE=Release
-```
-
-To enable link-time optimisation, use:
-
-```
-cmake -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_LTO=true
-```
-
-## Linking against the library
-When using gcc, it is necessary to link against pthread to avoid runtime
-exceptions. This is due to how gcc implements std::thread. See
-[issue #67](https://github.com/google/benchmark/issues/67) for more details.
-
-## Compiler Support
-
-Google Benchmark uses C++11 when building the library. As such we require
-a modern C++ toolchain, both compiler and standard library.
-
-The following minimum versions are strongly recommended to build the library:
-
-* GCC 4.8
-* Clang 3.4
-* Visual Studio 2013
-* Intel 2015 Update 1
-
-Anything older *may* work.
-
-Note: Using the library and its headers in C++03 is supported. C++11 is only
-required to build the library.
-
-# Known Issues
-
-### Windows
-
-* Users must manually link `shlwapi.lib`. Failure to do so may result
-in unresolved symbols.
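-
-Putting the pieces above together, a minimal, self-contained benchmark
-program looks like the sketch below (the benchmark body is illustrative, and
-it assumes the headers are installed as `<benchmark/benchmark.h>`); on Linux
-with gcc it must also be linked against pthread, as noted above:
-
-```c++
-#include <benchmark/benchmark.h>
-
-#include <string>
-
-static void BM_StringCopy(benchmark::State& state) {
-  std::string x = "hello";
-  while (state.KeepRunning()) {
-    std::string copy(x);
-    benchmark::DoNotOptimize(copy);  // Keep the copy from being elided.
-  }
-}
-BENCHMARK(BM_StringCopy);
-
-BENCHMARK_MAIN();
-```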
- diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/appveyor.yml b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/appveyor.yml deleted file mode 100644 index e084f386b77..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/appveyor.yml +++ /dev/null @@ -1,56 +0,0 @@ -version: '{build}' - -image: Visual Studio 2017 - -configuration: - - Debug - - Release - -environment: - matrix: - - compiler: msvc-15-seh - generator: "Visual Studio 15 2017" - - - compiler: msvc-15-seh - generator: "Visual Studio 15 2017 Win64" - - - compiler: msvc-14-seh - generator: "Visual Studio 14 2015" - - - compiler: msvc-14-seh - generator: "Visual Studio 14 2015 Win64" - - - compiler: msvc-12-seh - generator: "Visual Studio 12 2013" - - - compiler: msvc-12-seh - generator: "Visual Studio 12 2013 Win64" - - - compiler: gcc-5.3.0-posix - generator: "MinGW Makefiles" - cxx_path: 'C:\mingw-w64\i686-5.3.0-posix-dwarf-rt_v4-rev0\mingw32\bin' - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 - -matrix: - fast_finish: true - -install: - # git bash conflicts with MinGW makefiles - - if "%generator%"=="MinGW Makefiles" (set "PATH=%PATH:C:\Program Files\Git\usr\bin;=%") - - if not "%cxx_path%"=="" (set "PATH=%PATH%;%cxx_path%") - -build_script: - - md _build -Force - - cd _build - - echo %configuration% - - cmake -G "%generator%" "-DCMAKE_BUILD_TYPE=%configuration%" .. - - cmake --build . --config %configuration% - -test_script: - - ctest -c %configuration% --timeout 300 --output-on-failure - -artifacts: - - path: '_build/CMakeFiles/*.log' - name: logs - - path: '_build/Testing/**/*.xml' - name: test_results diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/AddCXXCompilerFlag.cmake b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/AddCXXCompilerFlag.cmake deleted file mode 100644 index 0b176ba27f1..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/AddCXXCompilerFlag.cmake +++ /dev/null @@ -1,64 +0,0 @@ -# - Adds a compiler flag if it is supported by the compiler -# -# This function checks that the supplied compiler flag is supported and then -# adds it to the corresponding compiler flags -# -# add_cxx_compiler_flag( []) -# -# - Example -# -# include(AddCXXCompilerFlag) -# add_cxx_compiler_flag(-Wall) -# add_cxx_compiler_flag(-no-strict-aliasing RELEASE) -# Requires CMake 2.6+ - -if(__add_cxx_compiler_flag) - return() -endif() -set(__add_cxx_compiler_flag INCLUDED) - -include(CheckCXXCompilerFlag) - -function(mangle_compiler_flag FLAG OUTPUT) - string(TOUPPER "HAVE_CXX_FLAG_${FLAG}" SANITIZED_FLAG) - string(REPLACE "+" "X" SANITIZED_FLAG ${SANITIZED_FLAG}) - string(REGEX REPLACE "[^A-Za-z_0-9]" "_" SANITIZED_FLAG ${SANITIZED_FLAG}) - string(REGEX REPLACE "_+" "_" SANITIZED_FLAG ${SANITIZED_FLAG}) - set(${OUTPUT} "${SANITIZED_FLAG}" PARENT_SCOPE) -endfunction(mangle_compiler_flag) - -function(add_cxx_compiler_flag FLAG) - mangle_compiler_flag("${FLAG}" MANGLED_FLAG) - set(OLD_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}") - set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${FLAG}") - check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG}) - set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}") - if(${MANGLED_FLAG}) - set(VARIANT ${ARGV1}) - if(ARGV1) - string(TOUPPER "_${VARIANT}" VARIANT) - endif() - set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE) - endif() -endfunction() - -function(add_required_cxx_compiler_flag FLAG) - mangle_compiler_flag("${FLAG}" 
MANGLED_FLAG) - set(OLD_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}") - set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${FLAG}") - check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG}) - set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}") - if(${MANGLED_FLAG}) - set(VARIANT ${ARGV1}) - if(ARGV1) - string(TOUPPER "_${VARIANT}" VARIANT) - endif() - set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE) - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE) - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE) - set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE) - set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${FLAG}" PARENT_SCOPE) - else() - message(FATAL_ERROR "Required flag '${FLAG}' is not supported by the compiler") - endif() -endfunction() diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/CXXFeatureCheck.cmake b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/CXXFeatureCheck.cmake deleted file mode 100644 index 2c4460f0e30..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/CXXFeatureCheck.cmake +++ /dev/null @@ -1,46 +0,0 @@ -# - Compile and run code to check for C++ features -# -# This functions compiles a source file under the `cmake` folder -# and adds the corresponding `HAVE_[FILENAME]` flag to the CMake -# environment -# -# cxx_feature_check( []) -# -# - Example -# -# include(CXXFeatureCheck) -# cxx_feature_check(STD_REGEX) -# Requires CMake 2.8.12+ - -if(__cxx_feature_check) - return() -endif() -set(__cxx_feature_check INCLUDED) - -function(cxx_feature_check FILE) - string(TOLOWER ${FILE} FILE) - string(TOUPPER ${FILE} VAR) - string(TOUPPER "HAVE_${VAR}" FEATURE) - if (DEFINED HAVE_${VAR}) - set(HAVE_${VAR} 1 CACHE INTERNAL "Feature test for ${FILE}" PARENT_SCOPE) - add_definitions(-DHAVE_${VAR}) - return() - endif() - message("-- Performing Test ${FEATURE}") - try_run(RUN_${FEATURE} COMPILE_${FEATURE} - ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp - CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS} - LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES}) - if(RUN_${FEATURE} EQUAL 0) - message("-- Performing Test ${FEATURE} -- success") - set(HAVE_${VAR} 1 CACHE INTERNAL "Feature test for ${FILE}" PARENT_SCOPE) - add_definitions(-DHAVE_${VAR}) - else() - if(NOT COMPILE_${FEATURE}) - message("-- Performing Test ${FEATURE} -- failed to compile") - else() - message("-- Performing Test ${FEATURE} -- compiled but failed to run") - endif() - endif() -endfunction() - diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/Config.cmake.in b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/Config.cmake.in deleted file mode 100644 index 6e9256eea8a..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/Config.cmake.in +++ /dev/null @@ -1 +0,0 @@ -include("${CMAKE_CURRENT_LIST_DIR}/@targets_export_name@.cmake") diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/GetGitVersion.cmake b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/GetGitVersion.cmake deleted file mode 100644 index 8dd94800459..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/GetGitVersion.cmake +++ /dev/null @@ -1,51 +0,0 @@ -# - Returns a version string from Git tags -# -# This function inspects the annotated git tags for the project and returns a string -# into a CMake variable -# 
-# get_git_version(<var>)
-#
-# - Example
-#
-# include(GetGitVersion)
-# get_git_version(GIT_VERSION)
-#
-# Requires CMake 2.8.11+
-find_package(Git)
-
-if(__get_git_version)
-  return()
-endif()
-set(__get_git_version INCLUDED)
-
-function(get_git_version var)
-  if(GIT_EXECUTABLE)
-    execute_process(COMMAND ${GIT_EXECUTABLE} describe --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8
-      RESULT_VARIABLE status
-      OUTPUT_VARIABLE GIT_VERSION
-      ERROR_QUIET)
-    if(${status})
-      set(GIT_VERSION "v0.0.0")
-    else()
-      string(STRIP ${GIT_VERSION} GIT_VERSION)
-      string(REGEX REPLACE "-[0-9]+-g" "-" GIT_VERSION ${GIT_VERSION})
-    endif()
-
-    # Work out if the repository is dirty
-    execute_process(COMMAND ${GIT_EXECUTABLE} update-index -q --refresh
-      OUTPUT_QUIET
-      ERROR_QUIET)
-    execute_process(COMMAND ${GIT_EXECUTABLE} diff-index --name-only HEAD --
-      OUTPUT_VARIABLE GIT_DIFF_INDEX
-      ERROR_QUIET)
-    string(COMPARE NOTEQUAL "${GIT_DIFF_INDEX}" "" GIT_DIRTY)
-    if (${GIT_DIRTY})
-      set(GIT_VERSION "${GIT_VERSION}-dirty")
-    endif()
-  else()
-    set(GIT_VERSION "v0.0.0")
-  endif()
-
-  message("-- git Version: ${GIT_VERSION}")
-  set(${var} ${GIT_VERSION} PARENT_SCOPE)
-endfunction()
diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/gnu_posix_regex.cpp b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/gnu_posix_regex.cpp
deleted file mode 100644
index b5b91cdab7c..00000000000
--- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/gnu_posix_regex.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-#include <regex.h>
-#include <string>
-int main() {
-  std::string str = "test0159";
-  regex_t re;
-  int ec = regcomp(&re, "^[a-z]+[0-9]+$", REG_EXTENDED | REG_NOSUB);
-  if (ec != 0) {
-    return ec;
-  }
-  return regexec(&re, str.c_str(), 0, nullptr, 0) ? -1 : 0;
-}
-
diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/posix_regex.cpp b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/posix_regex.cpp
deleted file mode 100644
index 466dc62560a..00000000000
--- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/posix_regex.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <regex.h>
-#include <string>
-int main() {
-  std::string str = "test0159";
-  regex_t re;
-  int ec = regcomp(&re, "^[a-z]+[0-9]+$", REG_EXTENDED | REG_NOSUB);
-  if (ec != 0) {
-    return ec;
-  }
-  int ret = regexec(&re, str.c_str(), 0, nullptr, 0) ? -1 : 0;
-  regfree(&re);
-  return ret;
-}
-
diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/std_regex.cpp b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/std_regex.cpp
deleted file mode 100644
index 696f2a26bce..00000000000
--- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/std_regex.cpp
+++ /dev/null
@@ -1,10 +0,0 @@
-#include <regex>
-#include <string>
-int main() {
-  const std::string str = "test0159";
-  std::regex re;
-  re = std::regex("^[a-z]+[0-9]+$",
-                  std::regex_constants::extended | std::regex_constants::nosubs);
-  return std::regex_search(str, re) ? 0 : -1;
-}
-
diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/steady_clock.cpp b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/steady_clock.cpp
deleted file mode 100644
index 66d50d17e9e..00000000000
--- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/steady_clock.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <chrono>
-
-int main() {
-  typedef std::chrono::steady_clock Clock;
-  Clock::time_point tp = Clock::now();
-  ((void)tp);
-}
diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/thread_safety_attributes.cpp b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/thread_safety_attributes.cpp
deleted file mode 100644
index 46161babdb1..00000000000
--- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/thread_safety_attributes.cpp
+++ /dev/null
@@ -1,4 +0,0 @@
-#define HAVE_THREAD_SAFETY_ATTRIBUTES
-#include "../src/mutex.h"
-
-int main() {}
diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/docs/tools.md b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/docs/tools.md
deleted file mode 100644
index f176f74a48f..00000000000
--- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/docs/tools.md
+++ /dev/null
@@ -1,59 +0,0 @@
-# Benchmark Tools
-
-## compare_bench.py
-
-The `compare_bench.py` utility can be used to compare the results of benchmarks.
-The program is invoked like:
-
-``` bash
-$ compare_bench.py <old-benchmark> <new-benchmark> [benchmark options]...
-```
-
-Where `<old-benchmark>` and `<new-benchmark>` either specify a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file.
-
-The sample output using the JSON test files under `Inputs/` gives:
-
-``` bash
-$ ./compare_bench.py ./gbench/Inputs/test1_run1.json ./gbench/Inputs/test1_run2.json
-Comparing ./gbench/Inputs/test1_run1.json to ./gbench/Inputs/test1_run2.json
-Benchmark                   Time           CPU
-----------------------------------------------
-BM_SameTimes               +0.00          +0.00
-BM_2xFaster                -0.50          -0.50
-BM_2xSlower                +1.00          +1.00
-BM_10PercentFaster         -0.10          -0.10
-BM_10PercentSlower         +0.10          +0.10
-```
-
-When a benchmark executable is run, the raw output from the benchmark is printed in real time to stdout.
The sample output using `benchmark/basic_test` for both arguments looks like: - -``` -./compare_bench.py test/basic_test test/basic_test --benchmark_filter=BM_empty.* -RUNNING: test/basic_test --benchmark_filter=BM_empty.* -Run on (4 X 4228.32 MHz CPU s) -2016-08-02 19:21:33 -Benchmark Time CPU Iterations --------------------------------------------------------------------- -BM_empty 9 ns 9 ns 79545455 -BM_empty/threads:4 4 ns 9 ns 75268816 -BM_empty_stop_start 8 ns 8 ns 83333333 -BM_empty_stop_start/threads:4 3 ns 8 ns 83333332 -RUNNING: test/basic_test --benchmark_filter=BM_empty.* -Run on (4 X 4228.32 MHz CPU s) -2016-08-02 19:21:35 -Benchmark Time CPU Iterations --------------------------------------------------------------------- -BM_empty 9 ns 9 ns 76086957 -BM_empty/threads:4 4 ns 9 ns 76086956 -BM_empty_stop_start 8 ns 8 ns 87500000 -BM_empty_stop_start/threads:4 3 ns 8 ns 88607596 -Comparing test/basic_test to test/basic_test -Benchmark Time CPU ---------------------------------------------------------- -BM_empty +0.00 +0.00 -BM_empty/threads:4 +0.00 +0.00 -BM_empty_stop_start +0.00 +0.00 -BM_empty_stop_start/threads:4 +0.00 +0.00 -``` - -Obviously this example doesn't give any useful output, but it's intended to show the output format when 'compare_bench.py' needs to run benchmarks. diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/include/benchmark/benchmark.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/include/benchmark/benchmark.h deleted file mode 100644 index bd3b0ffb4cb..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/include/benchmark/benchmark.h +++ /dev/null @@ -1,1210 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Support for registering benchmarks for functions. - -/* Example usage: -// Define a function that executes the code to be measured a -// specified number of times: -static void BM_StringCreation(benchmark::State& state) { - while (state.KeepRunning()) - std::string empty_string; -} - -// Register the function as a benchmark -BENCHMARK(BM_StringCreation); - -// Define another benchmark -static void BM_StringCopy(benchmark::State& state) { - std::string x = "hello"; - while (state.KeepRunning()) - std::string copy(x); -} -BENCHMARK(BM_StringCopy); - -// Augment the main() program to invoke benchmarks if specified -// via the --benchmarks command line flag. E.g., -// my_unittest --benchmark_filter=all -// my_unittest --benchmark_filter=BM_StringCreation -// my_unittest --benchmark_filter=String -// my_unittest --benchmark_filter='Copy|Creation' -int main(int argc, char** argv) { - benchmark::Initialize(&argc, argv); - benchmark::RunSpecifiedBenchmarks(); - return 0; -} - -// Sometimes a family of microbenchmarks can be implemented with -// just one routine that takes an extra argument to specify which -// one of the family of benchmarks to run. 
For example, the following
-// code defines a family of microbenchmarks for measuring the speed
-// of memcpy() calls of different lengths:
-
-static void BM_memcpy(benchmark::State& state) {
-  char* src = new char[state.range(0)]; char* dst = new char[state.range(0)];
-  memset(src, 'x', state.range(0));
-  while (state.KeepRunning())
-    memcpy(dst, src, state.range(0));
-  state.SetBytesProcessed(int64_t(state.iterations()) *
-                          int64_t(state.range(0)));
-  delete[] src; delete[] dst;
-}
-BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10);
-
-// The preceding code is quite repetitive, and can be replaced with the
-// following short-hand. The following invocation will pick a few
-// appropriate arguments in the specified range and will generate a
-// microbenchmark for each such argument.
-BENCHMARK(BM_memcpy)->Range(8, 8<<10);
-
-// You might have a microbenchmark that depends on two inputs. For
-// example, the following code defines a family of microbenchmarks for
-// measuring the speed of set insertion.
-static void BM_SetInsert(benchmark::State& state) {
-  while (state.KeepRunning()) {
-    state.PauseTiming();
-    std::set<int> data = ConstructRandomSet(state.range(0));
-    state.ResumeTiming();
-    for (int j = 0; j < state.range(1); ++j)
-      data.insert(RandomNumber());
-  }
-}
-BENCHMARK(BM_SetInsert)
-    ->Args({1<<10, 1})
-    ->Args({1<<10, 8})
-    ->Args({1<<10, 64})
-    ->Args({1<<10, 512})
-    ->Args({8<<10, 1})
-    ->Args({8<<10, 8})
-    ->Args({8<<10, 64})
-    ->Args({8<<10, 512});
-
-// The preceding code is quite repetitive, and can be replaced with
-// the following short-hand. The following macro will pick a few
-// appropriate arguments in the product of the two specified ranges
-// and will generate a microbenchmark for each such pair.
-BENCHMARK(BM_SetInsert)->Ranges({{1<<10, 8<<10}, {1, 512}});
-
-// For more complex patterns of inputs, passing a custom function
-// to Apply allows programmatic specification of an
-// arbitrary set of arguments to run the microbenchmark on.
-// The following example enumerates a dense range on
-// one parameter, and a sparse range on the second.
-static void CustomArguments(benchmark::internal::Benchmark* b) {
-  for (int i = 0; i <= 10; ++i)
-    for (int j = 32; j <= 1024*1024; j *= 8)
-      b->Args({i, j});
-}
-BENCHMARK(BM_SetInsert)->Apply(CustomArguments);
-
-// Templated microbenchmarks work the same way:
-// Produce then consume 'size' messages 'iters' times
-// Measures throughput in the absence of multiprogramming.
-template <class Q> int BM_Sequential(benchmark::State& state) {
-  Q q;
-  typename Q::value_type v;
-  while (state.KeepRunning()) {
-    for (int i = state.range(0); i--; )
-      q.push(v);
-    for (int e = state.range(0); e--; )
-      q.Wait(&v);
-  }
-  // actually messages, not bytes:
-  state.SetBytesProcessed(
-      static_cast<int64_t>(state.iterations())*state.range(0));
-}
-BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
-
-Use `Benchmark::MinTime(double t)` to set the minimum time used to run the
-benchmark. This option overrides the `benchmark_min_time` flag.
-
-void BM_test(benchmark::State& state) {
-  ... body ...
-}
-BENCHMARK(BM_test)->MinTime(2.0); // Run for at least 2 seconds.
-
-In a multithreaded test, it is guaranteed that none of the threads will start
-until all have called KeepRunning, and all will have finished before KeepRunning
-returns false.
As such, any global setup or teardown you want to do can be -wrapped in a check against the thread index: - -static void BM_MultiThreaded(benchmark::State& state) { - if (state.thread_index == 0) { - // Setup code here. - } - while (state.KeepRunning()) { - // Run the test as normal. - } - if (state.thread_index == 0) { - // Teardown code here. - } -} -BENCHMARK(BM_MultiThreaded)->Threads(4); - - -If a benchmark runs a few milliseconds it may be hard to visually compare the -measured times, since the output data is given in nanoseconds per default. In -order to manually set the time unit, you can specify it manually: - -BENCHMARK(BM_test)->Unit(benchmark::kMillisecond); -*/ - -#ifndef BENCHMARK_BENCHMARK_H_ -#define BENCHMARK_BENCHMARK_H_ - - -#if __cplusplus >= 201103L -#define BENCHMARK_HAS_CXX11 -#endif - -#include - -#include -#include -#include -#include -#include -#include -#include - -#if defined(BENCHMARK_HAS_CXX11) -#include -#include -#include -#endif - -#if defined(_MSC_VER) -#include // for _ReadWriteBarrier -#endif - -#ifndef BENCHMARK_HAS_CXX11 -#define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName) \ - TypeName(const TypeName&); \ - TypeName& operator=(const TypeName&) -#else -#define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName) \ - TypeName(const TypeName&) = delete; \ - TypeName& operator=(const TypeName&) = delete -#endif - -#if defined(__GNUC__) -#define BENCHMARK_UNUSED __attribute__((unused)) -#define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline)) -#define BENCHMARK_NOEXCEPT noexcept -#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x) -#elif defined(_MSC_VER) && !defined(__clang__) -#define BENCHMARK_UNUSED -#define BENCHMARK_ALWAYS_INLINE __forceinline -#if _MSC_VER >= 1900 -#define BENCHMARK_NOEXCEPT noexcept -#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x) -#else -#define BENCHMARK_NOEXCEPT -#define BENCHMARK_NOEXCEPT_OP(x) -#endif -#define __func__ __FUNCTION__ -#else -#define BENCHMARK_UNUSED -#define BENCHMARK_ALWAYS_INLINE -#define BENCHMARK_NOEXCEPT -#define BENCHMARK_NOEXCEPT_OP(x) -#endif - -#define BENCHMARK_INTERNAL_TOSTRING2(x) #x -#define BENCHMARK_INTERNAL_TOSTRING(x) BENCHMARK_INTERNAL_TOSTRING2(x) - -#if defined(__GNUC__) -#define BENCHMARK_BUILTIN_EXPECT(x, y) __builtin_expect(x, y) -#define BENCHMARK_DEPRECATED_MSG(msg) __attribute__((deprecated(msg))) -#else -#define BENCHMARK_BUILTIN_EXPECT(x, y) x -#define BENCHMARK_DEPRECATED_MSG(msg) -#define BENCHMARK_WARNING_MSG(msg) __pragma(message(__FILE__ "(" BENCHMARK_INTERNAL_TOSTRING(__LINE__) ") : warning note: " msg)) -#endif - -#if defined(__GNUC__) && !defined(__clang__) -#define BENCHMARK_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) -#endif - - -namespace benchmark { -class BenchmarkReporter; - -void Initialize(int* argc, char** argv); - -// Report to stdout all arguments in 'argv' as unrecognized except the first. -// Returns true there is at least on unrecognized argument (i.e. 'argc' > 1). -bool ReportUnrecognizedArguments(int argc, char** argv); - -// Generate a list of benchmarks matching the specified --benchmark_filter flag -// and if --benchmark_list_tests is specified return after printing the name -// of each matching benchmark. Otherwise run each matching benchmark and -// report the results. -// -// The second and third overload use the specified 'console_reporter' and -// 'file_reporter' respectively. 'file_reporter' will write to the file -// specified -// by '--benchmark_output'. If '--benchmark_output' is not given the -// 'file_reporter' is ignored. 
-//
-// RETURNS: The number of matching benchmarks.
-size_t RunSpecifiedBenchmarks();
-size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter);
-size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter,
-                              BenchmarkReporter* file_reporter);
-
-// If this routine is called, peak memory allocation past this point in the
-// benchmark is reported at the end of the benchmark report line. (It is
-// computed by running the benchmark once with a single iteration and a memory
-// tracer.)
-// TODO(dominic)
-// void MemoryUsage();
-
-namespace internal {
-class Benchmark;
-class BenchmarkImp;
-class BenchmarkFamilies;
-
-void UseCharPointer(char const volatile*);
-
-// Take ownership of the pointer and register the benchmark. Return the
-// registered benchmark.
-Benchmark* RegisterBenchmarkInternal(Benchmark*);
-
-// Ensure that the standard streams are properly initialized in every TU.
-int InitializeStreams();
-BENCHMARK_UNUSED static int stream_init_anchor = InitializeStreams();
-
-}  // namespace internal
-
-
-#if !defined(__GNUC__) || defined(__pnacl__) || defined(EMSCRIPTEN)
-# define BENCHMARK_HAS_NO_INLINE_ASSEMBLY
-#endif
-
-// The DoNotOptimize(...) function can be used to prevent a value or
-// expression from being optimized away by the compiler. This function is
-// intended to add little to no overhead.
-// See: https://youtu.be/nXaxk27zwlk?t=2441
-#ifndef BENCHMARK_HAS_NO_INLINE_ASSEMBLY
-template <class Tp>
-inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
-  // Clang doesn't like the 'X' constraint on `value` and certain GCC versions
-  // don't like the 'g' constraint. Attempt to placate them both.
-#if defined(__clang__)
-  asm volatile("" : : "g"(value) : "memory");
-#else
-  asm volatile("" : : "i,r,m"(value) : "memory");
-#endif
-}
-// Force the compiler to flush pending writes to global memory. Acts as an
-// effective read/write barrier
-inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
-  asm volatile("" : : : "memory");
-}
-#elif defined(_MSC_VER)
-template <class Tp>
-inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
-  internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
-  _ReadWriteBarrier();
-}
-
-inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
-  _ReadWriteBarrier();
-}
-#else
-template <class Tp>
-inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
-  internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
-}
-// FIXME Add ClobberMemory() for non-gnu and non-msvc compilers
-#endif
-
-
-
-// This class is used for user-defined counters.
-class Counter {
-public:
-
-  enum Flags {
-    kDefaults = 0,
-    // Mark the counter as a rate. It will be presented divided
-    // by the duration of the benchmark.
-    kIsRate = 1,
-    // Mark the counter as a thread-average quantity. It will be
-    // presented divided by the number of threads.
-    kAvgThreads = 2,
-    // Mark the counter as a thread-average rate. See above.
-    kAvgThreadsRate = kIsRate|kAvgThreads
-  };
-
-  double value;
-  Flags flags;
-
-  BENCHMARK_ALWAYS_INLINE
-  Counter(double v = 0., Flags f = kDefaults) : value(v), flags(f) {}
-
-  BENCHMARK_ALWAYS_INLINE operator double const& () const { return value; }
-  BENCHMARK_ALWAYS_INLINE operator double& () { return value; }
-
-};
-
-// This is the container for the user-defined counters.
-typedef std::map<std::string, Counter> UserCounters;
-
-
-// TimeUnit is passed to a benchmark in order to specify the order of magnitude
-// for the measured time.
-enum TimeUnit { kNanosecond, kMicrosecond, kMillisecond }; - -// BigO is passed to a benchmark in order to specify the asymptotic -// computational -// complexity for the benchmark. In case oAuto is selected, complexity will be -// calculated automatically to the best fit. -enum BigO { oNone, o1, oN, oNSquared, oNCubed, oLogN, oNLogN, oAuto, oLambda }; - -// BigOFunc is passed to a benchmark in order to specify the asymptotic -// computational complexity for the benchmark. -typedef double(BigOFunc)(int); - -namespace internal { -class ThreadTimer; -class ThreadManager; - -enum ReportMode -#if defined(BENCHMARK_HAS_CXX11) - : unsigned -#else -#endif - { - RM_Unspecified, // The mode has not been manually specified - RM_Default, // The mode is user-specified as default. - RM_ReportAggregatesOnly -}; -} // namespace internal - -// State is passed to a running Benchmark and contains state for the -// benchmark to use. -class State { - public: - // Returns true if the benchmark should continue through another iteration. - // NOTE: A benchmark may not return from the test until KeepRunning() has - // returned false. - bool KeepRunning() { - if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) { - StartKeepRunning(); - } - bool const res = total_iterations_++ < max_iterations; - if (BENCHMARK_BUILTIN_EXPECT(!res, false)) { - FinishKeepRunning(); - } - return res; - } - - // REQUIRES: timer is running and 'SkipWithError(...)' has not been called - // by the current thread. - // Stop the benchmark timer. If not called, the timer will be - // automatically stopped after KeepRunning() returns false for the first time. - // - // For threaded benchmarks the PauseTiming() function only pauses the timing - // for the current thread. - // - // NOTE: The "real time" measurement is per-thread. If different threads - // report different measurements the largest one is reported. - // - // NOTE: PauseTiming()/ResumeTiming() are relatively - // heavyweight, and so their use should generally be avoided - // within each benchmark iteration, if possible. - void PauseTiming(); - - // REQUIRES: timer is not running and 'SkipWithError(...)' has not been called - // by the current thread. - // Start the benchmark timer. The timer is NOT running on entrance to the - // benchmark function. It begins running after the first call to KeepRunning() - // - // NOTE: PauseTiming()/ResumeTiming() are relatively - // heavyweight, and so their use should generally be avoided - // within each benchmark iteration, if possible. - void ResumeTiming(); - - // REQUIRES: 'SkipWithError(...)' has not been called previously by the - // current thread. - // Skip any future iterations of the 'KeepRunning()' loop in the current - // thread and report an error with the specified 'msg'. After this call - // the user may explicitly 'return' from the benchmark. - // - // For threaded benchmarks only the current thread stops executing and future - // calls to `KeepRunning()` will block until all threads have completed - // the `KeepRunning()` loop. If multiple threads report an error only the - // first error message is used. - // - // NOTE: Calling 'SkipWithError(...)' does not cause the benchmark to exit - // the current scope immediately. If the function is called from within - // the 'KeepRunning()' loop the current iteration will finish. It is the users - // responsibility to exit the scope as needed. - void SkipWithError(const char* msg); - - // REQUIRES: called exactly once per iteration of the KeepRunning loop. 
- // Set the manually measured time for this benchmark iteration, which - // is used instead of automatically measured time if UseManualTime() was - // specified. - // - // For threaded benchmarks the final value will be set to the largest - // reported values. - void SetIterationTime(double seconds); - - // Set the number of bytes processed by the current benchmark - // execution. This routine is typically called once at the end of a - // throughput oriented benchmark. If this routine is called with a - // value > 0, the report is printed in MB/sec instead of nanoseconds - // per iteration. - // - // REQUIRES: a benchmark has exited its KeepRunning loop. - BENCHMARK_ALWAYS_INLINE - void SetBytesProcessed(size_t bytes) { bytes_processed_ = bytes; } - - BENCHMARK_ALWAYS_INLINE - size_t bytes_processed() const { return bytes_processed_; } - - // If this routine is called with complexity_n > 0 and complexity report is - // requested for the - // family benchmark, then current benchmark will be part of the computation - // and complexity_n will - // represent the length of N. - BENCHMARK_ALWAYS_INLINE - void SetComplexityN(int complexity_n) { complexity_n_ = complexity_n; } - - BENCHMARK_ALWAYS_INLINE - int complexity_length_n() { return complexity_n_; } - - // If this routine is called with items > 0, then an items/s - // label is printed on the benchmark report line for the currently - // executing benchmark. It is typically called at the end of a processing - // benchmark where a processing items/second output is desired. - // - // REQUIRES: a benchmark has exited its KeepRunning loop. - BENCHMARK_ALWAYS_INLINE - void SetItemsProcessed(size_t items) { items_processed_ = items; } - - BENCHMARK_ALWAYS_INLINE - size_t items_processed() const { return items_processed_; } - - // If this routine is called, the specified label is printed at the - // end of the benchmark report line for the currently executing - // benchmark. Example: - // static void BM_Compress(benchmark::State& state) { - // ... - // double compress = input_size / output_size; - // state.SetLabel(StringPrintf("compress:%.1f%%", 100.0*compression)); - // } - // Produces output that looks like: - // BM_Compress 50 50 14115038 compress:27.3% - // - // REQUIRES: a benchmark has exited its KeepRunning loop. - void SetLabel(const char* label); - - void BENCHMARK_ALWAYS_INLINE SetLabel(const std::string& str) { - this->SetLabel(str.c_str()); - } - - // Range arguments for this run. CHECKs if the argument has been set. - BENCHMARK_ALWAYS_INLINE - int range(std::size_t pos = 0) const { - assert(range_.size() > pos); - return range_[pos]; - } - - BENCHMARK_DEPRECATED_MSG("use 'range(0)' instead") - int range_x() const { return range(0); } - - BENCHMARK_DEPRECATED_MSG("use 'range(1)' instead") - int range_y() const { return range(1); } - - BENCHMARK_ALWAYS_INLINE - size_t iterations() const { return total_iterations_; } - - private: - bool started_; - bool finished_; - size_t total_iterations_; - - std::vector range_; - - size_t bytes_processed_; - size_t items_processed_; - - int complexity_n_; - - bool error_occurred_; - - public: - // Container for user-defined counters. - UserCounters counters; - // Index of the executing thread. Values from [0, threads). - const int thread_index; - // Number of threads concurrently executing the benchmark. 
- const int threads; - const size_t max_iterations; - - // TODO(EricWF) make me private - State(size_t max_iters, const std::vector& ranges, int thread_i, - int n_threads, internal::ThreadTimer* timer, - internal::ThreadManager* manager); - - private: - void StartKeepRunning(); - void FinishKeepRunning(); - internal::ThreadTimer* timer_; - internal::ThreadManager* manager_; - BENCHMARK_DISALLOW_COPY_AND_ASSIGN(State); -}; - -namespace internal { - -typedef void(Function)(State&); - -// ------------------------------------------------------ -// Benchmark registration object. The BENCHMARK() macro expands -// into an internal::Benchmark* object. Various methods can -// be called on this object to change the properties of the benchmark. -// Each method returns "this" so that multiple method calls can -// chained into one expression. -class Benchmark { - public: - virtual ~Benchmark(); - - // Note: the following methods all return "this" so that multiple - // method calls can be chained together in one expression. - - // Run this benchmark once with "x" as the extra argument passed - // to the function. - // REQUIRES: The function passed to the constructor must accept an arg1. - Benchmark* Arg(int x); - - // Run this benchmark with the given time unit for the generated output report - Benchmark* Unit(TimeUnit unit); - - // Run this benchmark once for a number of values picked from the - // range [start..limit]. (start and limit are always picked.) - // REQUIRES: The function passed to the constructor must accept an arg1. - Benchmark* Range(int start, int limit); - - // Run this benchmark once for all values in the range [start..limit] with - // specific step - // REQUIRES: The function passed to the constructor must accept an arg1. - Benchmark* DenseRange(int start, int limit, int step = 1); - - // Run this benchmark once with "args" as the extra arguments passed - // to the function. - // REQUIRES: The function passed to the constructor must accept arg1, arg2 ... - Benchmark* Args(const std::vector& args); - - // Equivalent to Args({x, y}) - // NOTE: This is a legacy C++03 interface provided for compatibility only. - // New code should use 'Args'. - Benchmark* ArgPair(int x, int y) { - std::vector args; - args.push_back(x); - args.push_back(y); - return Args(args); - } - - // Run this benchmark once for a number of values picked from the - // ranges [start..limit]. (starts and limits are always picked.) - // REQUIRES: The function passed to the constructor must accept arg1, arg2 ... - Benchmark* Ranges(const std::vector >& ranges); - - // Equivalent to ArgNames({name}) - Benchmark* ArgName(const std::string& name); - - // Set the argument names to display in the benchmark name. If not called, - // only argument values will be shown. - Benchmark* ArgNames(const std::vector& names); - - // Equivalent to Ranges({{lo1, hi1}, {lo2, hi2}}). - // NOTE: This is a legacy C++03 interface provided for compatibility only. - // New code should use 'Ranges'. - Benchmark* RangePair(int lo1, int hi1, int lo2, int hi2) { - std::vector > ranges; - ranges.push_back(std::make_pair(lo1, hi1)); - ranges.push_back(std::make_pair(lo2, hi2)); - return Ranges(ranges); - } - - // Pass this benchmark object to *func, which can customize - // the benchmark by calling various methods like Arg, Args, - // Threads, etc. - Benchmark* Apply(void (*func)(Benchmark* benchmark)); - - // Set the range multiplier for non-dense range. If not called, the range - // multiplier kRangeMultiplier will be used. 
- Benchmark* RangeMultiplier(int multiplier); - - // Set the minimum amount of time to use when running this benchmark. This - // option overrides the `benchmark_min_time` flag. - // REQUIRES: `t > 0` and `Iterations` has not been called on this benchmark. - Benchmark* MinTime(double t); - - // Specify the amount of iterations that should be run by this benchmark. - // REQUIRES: 'n > 0' and `MinTime` has not been called on this benchmark. - // - // NOTE: This function should only be used when *exact* iteration control is - // needed and never to control or limit how long a benchmark runs, where - // `--benchmark_min_time=N` or `MinTime(...)` should be used instead. - Benchmark* Iterations(size_t n); - - // Specify the amount of times to repeat this benchmark. This option overrides - // the `benchmark_repetitions` flag. - // REQUIRES: `n > 0` - Benchmark* Repetitions(int n); - - // Specify if each repetition of the benchmark should be reported separately - // or if only the final statistics should be reported. If the benchmark - // is not repeated then the single result is always reported. - Benchmark* ReportAggregatesOnly(bool value = true); - - // If a particular benchmark is I/O bound, runs multiple threads internally or - // if for some reason CPU timings are not representative, call this method. If - // called, the elapsed time will be used to control how many iterations are - // run, and in the printing of items/second or MB/seconds values. If not - // called, the cpu time used by the benchmark will be used. - Benchmark* UseRealTime(); - - // If a benchmark must measure time manually (e.g. if GPU execution time is - // being - // measured), call this method. If called, each benchmark iteration should - // call - // SetIterationTime(seconds) to report the measured time, which will be used - // to control how many iterations are run, and in the printing of items/second - // or MB/second values. - Benchmark* UseManualTime(); - - // Set the asymptotic computational complexity for the benchmark. If called - // the asymptotic computational complexity will be shown on the output. - Benchmark* Complexity(BigO complexity = benchmark::oAuto); - - // Set the asymptotic computational complexity for the benchmark. If called - // the asymptotic computational complexity will be shown on the output. - Benchmark* Complexity(BigOFunc* complexity); - - // Support for running multiple copies of the same benchmark concurrently - // in multiple threads. This may be useful when measuring the scaling - // of some piece of code. - - // Run one instance of this benchmark concurrently in t threads. - Benchmark* Threads(int t); - - // Pick a set of values T from [min_threads,max_threads]. - // min_threads and max_threads are always included in T. Run this - // benchmark once for each value in T. The benchmark run for a - // particular value t consists of t threads running the benchmark - // function concurrently. For example, consider: - // BENCHMARK(Foo)->ThreadRange(1,16); - // This will run the following benchmarks: - // Foo in 1 thread - // Foo in 2 threads - // Foo in 4 threads - // Foo in 8 threads - // Foo in 16 threads - Benchmark* ThreadRange(int min_threads, int max_threads); - - // For each value n in the range, run this benchmark once using n threads. - // min_threads and max_threads are always included in the range. - // stride specifies the increment. E.g. DenseThreadRange(1, 8, 3) starts - // a benchmark with 1, 4, 7 and 8 threads. 
- Benchmark* DenseThreadRange(int min_threads, int max_threads, int stride = 1); - - // Equivalent to ThreadRange(NumCPUs(), NumCPUs()) - Benchmark* ThreadPerCpu(); - - virtual void Run(State& state) = 0; - - // Used inside the benchmark implementation - struct Instance; - - protected: - explicit Benchmark(const char* name); - Benchmark(Benchmark const&); - void SetName(const char* name); - - int ArgsCnt() const; - - static void AddRange(std::vector* dst, int lo, int hi, int mult); - - private: - friend class BenchmarkFamilies; - - std::string name_; - ReportMode report_mode_; - std::vector arg_names_; // Args for all benchmark runs - std::vector > args_; // Args for all benchmark runs - TimeUnit time_unit_; - int range_multiplier_; - double min_time_; - size_t iterations_; - int repetitions_; - bool use_real_time_; - bool use_manual_time_; - BigO complexity_; - BigOFunc* complexity_lambda_; - std::vector thread_counts_; - - Benchmark& operator=(Benchmark const&); -}; - -} // namespace internal - -// Create and register a benchmark with the specified 'name' that invokes -// the specified functor 'fn'. -// -// RETURNS: A pointer to the registered benchmark. -internal::Benchmark* RegisterBenchmark(const char* name, - internal::Function* fn); - -#if defined(BENCHMARK_HAS_CXX11) -template -internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn); -#endif - -// Remove all registered benchmarks. All pointers to previously registered -// benchmarks are invalidated. -void ClearRegisteredBenchmarks(); - -namespace internal { -// The class used to hold all Benchmarks created from static function. -// (ie those created using the BENCHMARK(...) macros. -class FunctionBenchmark : public Benchmark { - public: - FunctionBenchmark(const char* name, Function* func) - : Benchmark(name), func_(func) {} - - virtual void Run(State& st); - - private: - Function* func_; -}; - -#ifdef BENCHMARK_HAS_CXX11 -template -class LambdaBenchmark : public Benchmark { - public: - virtual void Run(State& st) { lambda_(st); } - - private: - template - LambdaBenchmark(const char* name, OLambda&& lam) - : Benchmark(name), lambda_(std::forward(lam)) {} - - LambdaBenchmark(LambdaBenchmark const&) = delete; - - private: - template - friend Benchmark* ::benchmark::RegisterBenchmark(const char*, Lam&&); - - Lambda lambda_; -}; -#endif - -} // namespace internal - -inline internal::Benchmark* RegisterBenchmark(const char* name, - internal::Function* fn) { - return internal::RegisterBenchmarkInternal( - ::new internal::FunctionBenchmark(name, fn)); -} - -#ifdef BENCHMARK_HAS_CXX11 -template -internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn) { - using BenchType = - internal::LambdaBenchmark::type>; - return internal::RegisterBenchmarkInternal( - ::new BenchType(name, std::forward(fn))); -} -#endif - -#if defined(BENCHMARK_HAS_CXX11) && \ - (!defined(BENCHMARK_GCC_VERSION) || BENCHMARK_GCC_VERSION >= 409) -template -internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn, - Args&&... args) { - return benchmark::RegisterBenchmark( - name, [=](benchmark::State& st) { fn(st, args...); }); -} -#else -#define BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK -#endif - -// The base class for all fixture tests. -class Fixture : public internal::Benchmark { - public: - Fixture() : internal::Benchmark("") {} - - virtual void Run(State& st) { - this->SetUp(st); - this->BenchmarkCase(st); - this->TearDown(st); - } - - // These will be deprecated ... 
- virtual void SetUp(const State&) {} - virtual void TearDown(const State&) {} - // ... In favor of these. - virtual void SetUp(State& st) { SetUp(const_cast(st)); } - virtual void TearDown(State& st) { TearDown(const_cast(st)); } - - protected: - virtual void BenchmarkCase(State&) = 0; -}; - -} // namespace benchmark - -// ------------------------------------------------------ -// Macro to register benchmarks - -// Check that __COUNTER__ is defined and that __COUNTER__ increases by 1 -// every time it is expanded. X + 1 == X + 0 is used in case X is defined to be -// empty. If X is empty the expression becomes (+1 == +0). -#if defined(__COUNTER__) && (__COUNTER__ + 1 == __COUNTER__ + 0) -#define BENCHMARK_PRIVATE_UNIQUE_ID __COUNTER__ -#else -#define BENCHMARK_PRIVATE_UNIQUE_ID __LINE__ -#endif - -// Helpers for generating unique variable names -#define BENCHMARK_PRIVATE_NAME(n) \ - BENCHMARK_PRIVATE_CONCAT(_benchmark_, BENCHMARK_PRIVATE_UNIQUE_ID, n) -#define BENCHMARK_PRIVATE_CONCAT(a, b, c) BENCHMARK_PRIVATE_CONCAT2(a, b, c) -#define BENCHMARK_PRIVATE_CONCAT2(a, b, c) a##b##c - -#define BENCHMARK_PRIVATE_DECLARE(n) \ - static ::benchmark::internal::Benchmark* BENCHMARK_PRIVATE_NAME(n) \ - BENCHMARK_UNUSED - -#define BENCHMARK(n) \ - BENCHMARK_PRIVATE_DECLARE(n) = \ - (::benchmark::internal::RegisterBenchmarkInternal( \ - new ::benchmark::internal::FunctionBenchmark(#n, n))) - -// Old-style macros -#define BENCHMARK_WITH_ARG(n, a) BENCHMARK(n)->Arg((a)) -#define BENCHMARK_WITH_ARG2(n, a1, a2) BENCHMARK(n)->Args({(a1), (a2)}) -#define BENCHMARK_WITH_UNIT(n, t) BENCHMARK(n)->Unit((t)) -#define BENCHMARK_RANGE(n, lo, hi) BENCHMARK(n)->Range((lo), (hi)) -#define BENCHMARK_RANGE2(n, l1, h1, l2, h2) \ - BENCHMARK(n)->RangePair({{(l1), (h1)}, {(l2), (h2)}}) - -#if __cplusplus >= 201103L - -// Register a benchmark which invokes the function specified by `func` -// with the additional arguments specified by `...`. -// -// For example: -// -// template ` -// void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) { -// [...] -//} -// /* Registers a benchmark named "BM_takes_args/int_string_test` */ -// BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc")); -#define BENCHMARK_CAPTURE(func, test_case_name, ...) \ - BENCHMARK_PRIVATE_DECLARE(func) = \ - (::benchmark::internal::RegisterBenchmarkInternal( \ - new ::benchmark::internal::FunctionBenchmark( \ - #func "/" #test_case_name, \ - [](::benchmark::State& st) { func(st, __VA_ARGS__); }))) - -#endif // __cplusplus >= 11 - -// This will register a benchmark for a templatized function. For example: -// -// template -// void BM_Foo(int iters); -// -// BENCHMARK_TEMPLATE(BM_Foo, 1); -// -// will register BM_Foo<1> as a benchmark. -#define BENCHMARK_TEMPLATE1(n, a) \ - BENCHMARK_PRIVATE_DECLARE(n) = \ - (::benchmark::internal::RegisterBenchmarkInternal( \ - new ::benchmark::internal::FunctionBenchmark(#n "<" #a ">", n))) - -#define BENCHMARK_TEMPLATE2(n, a, b) \ - BENCHMARK_PRIVATE_DECLARE(n) = \ - (::benchmark::internal::RegisterBenchmarkInternal( \ - new ::benchmark::internal::FunctionBenchmark(#n "<" #a "," #b ">", \ - n))) - -#if __cplusplus >= 201103L -#define BENCHMARK_TEMPLATE(n, ...) 
\ - BENCHMARK_PRIVATE_DECLARE(n) = \ - (::benchmark::internal::RegisterBenchmarkInternal( \ - new ::benchmark::internal::FunctionBenchmark( \ - #n "<" #__VA_ARGS__ ">", n<__VA_ARGS__>))) -#else -#define BENCHMARK_TEMPLATE(n, a) BENCHMARK_TEMPLATE1(n, a) -#endif - -#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \ - class BaseClass##_##Method##_Benchmark : public BaseClass { \ - public: \ - BaseClass##_##Method##_Benchmark() : BaseClass() { \ - this->SetName(#BaseClass "/" #Method); \ - } \ - \ - protected: \ - virtual void BenchmarkCase(::benchmark::State&); \ - }; - -#define BENCHMARK_DEFINE_F(BaseClass, Method) \ - BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \ - void BaseClass##_##Method##_Benchmark::BenchmarkCase - -#define BENCHMARK_REGISTER_F(BaseClass, Method) \ - BENCHMARK_PRIVATE_REGISTER_F(BaseClass##_##Method##_Benchmark) - -#define BENCHMARK_PRIVATE_REGISTER_F(TestName) \ - BENCHMARK_PRIVATE_DECLARE(TestName) = \ - (::benchmark::internal::RegisterBenchmarkInternal(new TestName())) - -// This macro will define and register a benchmark within a fixture class. -#define BENCHMARK_F(BaseClass, Method) \ - BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \ - BENCHMARK_REGISTER_F(BaseClass, Method); \ - void BaseClass##_##Method##_Benchmark::BenchmarkCase - -// Helper macro to create a main routine in a test that runs the benchmarks -#define BENCHMARK_MAIN() \ - int main(int argc, char** argv) { \ - ::benchmark::Initialize(&argc, argv); \ - if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1; \ - ::benchmark::RunSpecifiedBenchmarks(); \ - } - - -// ------------------------------------------------------ -// Benchmark Reporters - -namespace benchmark { - -// Interface for custom benchmark result printers. -// By default, benchmark reports are printed to stdout. However an application -// can control the destination of the reports by calling -// RunSpecifiedBenchmarks and passing it a custom reporter object. -// The reporter object must implement the following interface. -class BenchmarkReporter { - public: - struct Context { - int num_cpus; - double mhz_per_cpu; - bool cpu_scaling_enabled; - - // The number of chars in the longest benchmark name. - size_t name_field_width; - }; - - struct Run { - Run() - : error_occurred(false), - iterations(1), - time_unit(kNanosecond), - real_accumulated_time(0), - cpu_accumulated_time(0), - bytes_per_second(0), - items_per_second(0), - max_heapbytes_used(0), - complexity(oNone), - complexity_lambda(), - complexity_n(0), - report_big_o(false), - report_rms(false), - counters() {} - - std::string benchmark_name; - std::string report_label; // Empty if not set by benchmark. - bool error_occurred; - std::string error_message; - - int64_t iterations; - TimeUnit time_unit; - double real_accumulated_time; - double cpu_accumulated_time; - - // Return a value representing the real time per iteration in the unit - // specified by 'time_unit'. - // NOTE: If 'iterations' is zero the returned value represents the - // accumulated time. - double GetAdjustedRealTime() const; - - // Return a value representing the cpu time per iteration in the unit - // specified by 'time_unit'. - // NOTE: If 'iterations' is zero the returned value represents the - // accumulated time. - double GetAdjustedCPUTime() const; - - // Zero if not set by benchmark. - double bytes_per_second; - double items_per_second; - - // This is set to 0.0 if memory tracing is not enabled. 
- double max_heapbytes_used;
-
- // Keep track of arguments to compute asymptotic complexity
- BigO complexity;
- BigOFunc* complexity_lambda;
- int complexity_n;
-
- // Inform print function whether the current run is a complexity report
- bool report_big_o;
- bool report_rms;
-
- UserCounters counters;
- };
-
- // Construct a BenchmarkReporter with the output stream set to 'std::cout'
- // and the error stream set to 'std::cerr'
- BenchmarkReporter();
-
- // Called once for every suite of benchmarks run.
- // The parameter "context" contains information that the
- // reporter may wish to use when generating its report, for example the
- // platform under which the benchmarks are running. The benchmark run is
- // never started if this function returns false, allowing the reporter
- // to skip runs based on the context information.
- virtual bool ReportContext(const Context& context) = 0;
-
- // Called once for each group of benchmark runs, gives information about
- // cpu-time and heap memory usage during the benchmark run. If the group
- // of runs contained more than two entries then 'report' contains additional
- // elements representing the mean and standard deviation of those runs.
- // Additionally if this group of runs was the last in a family of benchmarks
- // 'reports' contains additional entries representing the asymptotic
- // complexity and RMS of that benchmark family.
- virtual void ReportRuns(const std::vector<Run>& report) = 0;
-
- // Called once and only once after every group of benchmarks is run and
- // reported.
- virtual void Finalize() {}
-
- // REQUIRES: The object referenced by 'out' is valid for the lifetime
- // of the reporter.
- void SetOutputStream(std::ostream* out) {
-   assert(out);
-   output_stream_ = out;
- }
-
- // REQUIRES: The object referenced by 'err' is valid for the lifetime
- // of the reporter.
- void SetErrorStream(std::ostream* err) {
-   assert(err);
-   error_stream_ = err;
- }
-
- std::ostream& GetOutputStream() const { return *output_stream_; }
-
- std::ostream& GetErrorStream() const { return *error_stream_; }
-
- virtual ~BenchmarkReporter();
-
- // Write a human readable string to 'out' representing the specified
- // 'context'.
- // REQUIRES: 'out' is non-null.
- static void PrintBasicContext(std::ostream* out, Context const& context);
-
- private:
- std::ostream* output_stream_;
- std::ostream* error_stream_;
-};
-
-// Simple reporter that outputs benchmark data to the console. This is the
-// default reporter used by RunSpecifiedBenchmarks().
-class ConsoleReporter : public BenchmarkReporter {
-public:
-  enum OutputOptions {
-    OO_None = 0,
-    OO_Color = 1,
-    OO_Tabular = 2,
-    OO_ColorTabular = OO_Color|OO_Tabular,
-    OO_Defaults = OO_ColorTabular
-  };
-  explicit ConsoleReporter(OutputOptions opts_ = OO_Defaults)
-      : output_options_(opts_), name_field_width_(0),
-        prev_counters_(), printed_header_(false) {}
-
-  virtual bool ReportContext(const Context& context);
-  virtual void ReportRuns(const std::vector<Run>& reports);
-
- protected:
-  virtual void PrintRunData(const Run& report);
-  virtual void PrintHeader(const Run& report);
-
-  OutputOptions output_options_;
-  size_t name_field_width_;
-  UserCounters prev_counters_;
-  bool printed_header_;
-};
-
-class JSONReporter : public BenchmarkReporter {
- public:
-  JSONReporter() : first_report_(true) {}
-  virtual bool ReportContext(const Context& context);
-  virtual void ReportRuns(const std::vector<Run>& reports);
-  virtual void Finalize();
-
- private:
-  void PrintRunData(const Run& report);
-
-  bool first_report_;
-};
-
-class CSVReporter : public BenchmarkReporter {
- public:
-  CSVReporter() : printed_header_(false) {}
-  virtual bool ReportContext(const Context& context);
-  virtual void ReportRuns(const std::vector<Run>& reports);
-
- private:
-  void PrintRunData(const Run& report);
-
-  bool printed_header_;
-  std::set< std::string > user_counter_names_;
-};
-
-inline const char* GetTimeUnitString(TimeUnit unit) {
-  switch (unit) {
-    case kMillisecond:
-      return "ms";
-    case kMicrosecond:
-      return "us";
-    case kNanosecond:
-    default:
-      return "ns";
-  }
-}
-
-inline double GetTimeUnitMultiplier(TimeUnit unit) {
-  switch (unit) {
-    case kMillisecond:
-      return 1e3;
-    case kMicrosecond:
-      return 1e6;
-    case kNanosecond:
-    default:
-      return 1e9;
-  }
-}
-
-} // namespace benchmark
-
-#endif // BENCHMARK_BENCHMARK_H_
diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/include/benchmark/benchmark_api.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/include/benchmark/benchmark_api.h
deleted file mode 100644
index a9ae67147c5..00000000000
--- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/include/benchmark/benchmark_api.h
+++ /dev/null
@@ -1,27 +0,0 @@
-// Copyright 2015 Google Inc. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
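Editor's note: benchmark_api.h here and reporter.h just below are deprecation
shims that warn and forward to benchmark.h. A minimal sketch of the intended
migration (BM_noop is a hypothetical benchmark name, not from this repo):

// Before (deprecated):
//   #include "benchmark/benchmark_api.h"
// After:
#include "benchmark/benchmark.h"

static void BM_noop(benchmark::State& state) {
  while (state.KeepRunning()) {
    // empty body: measures only the benchmark loop overhead
  }
}
BENCHMARK(BM_noop);
BENCHMARK_MAIN();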
-#ifndef BENCHMARK_BENCHMARK_API_H_ -#define BENCHMARK_BENCHMARK_API_H_ - -#ifdef __DEPRECATED -# ifndef BENCHMARK_WARNING_MSG -# warning the benchmark_api.h header has been deprecated and will be removed, please include benchmark.h instead -# else - BENCHMARK_WARNING_MSG("the benchmark_api.h header has been deprecated and will be removed, please include benchmark.h instead") -# endif -#endif - -#include "benchmark.h" // For forward declaration of BenchmarkReporter - -#endif // BENCHMARK_BENCHMARK_API_H_ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/include/benchmark/reporter.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/include/benchmark/reporter.h deleted file mode 100644 index 5baca1a740a..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/include/benchmark/reporter.h +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#ifndef BENCHMARK_REPORTER_H_ -#define BENCHMARK_REPORTER_H_ - -#ifdef __DEPRECATED -# ifndef BENCHMARK_WARNING_MSG -# warning the reporter.h header has been deprecated and will be removed, please include benchmark.h instead -# else - BENCHMARK_WARNING_MSG("the reporter.h header has been deprecated and will be removed, please include benchmark.h instead") -# endif -#endif - -#include "benchmark.h" // For forward declaration of BenchmarkReporter - -#endif // BENCHMARK_REPORTER_H_ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/mingw.py b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/mingw.py deleted file mode 100644 index 706ad559db9..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/mingw.py +++ /dev/null @@ -1,320 +0,0 @@ -#! 
/usr/bin/env python -# encoding: utf-8 - -import argparse -import errno -import logging -import os -import platform -import re -import sys -import subprocess -import tempfile - -try: - import winreg -except ImportError: - import _winreg as winreg -try: - import urllib.request as request -except ImportError: - import urllib as request -try: - import urllib.parse as parse -except ImportError: - import urlparse as parse - -class EmptyLogger(object): - ''' - Provides an implementation that performs no logging - ''' - def debug(self, *k, **kw): - pass - def info(self, *k, **kw): - pass - def warn(self, *k, **kw): - pass - def error(self, *k, **kw): - pass - def critical(self, *k, **kw): - pass - def setLevel(self, *k, **kw): - pass - -urls = ( - 'http://downloads.sourceforge.net/project/mingw-w64/Toolchains%20' - 'targetting%20Win32/Personal%20Builds/mingw-builds/installer/' - 'repository.txt', - 'http://downloads.sourceforge.net/project/mingwbuilds/host-windows/' - 'repository.txt' -) -''' -A list of mingw-build repositories -''' - -def repository(urls = urls, log = EmptyLogger()): - ''' - Downloads and parse mingw-build repository files and parses them - ''' - log.info('getting mingw-builds repository') - versions = {} - re_sourceforge = re.compile(r'http://sourceforge.net/projects/([^/]+)/files') - re_sub = r'http://downloads.sourceforge.net/project/\1' - for url in urls: - log.debug(' - requesting: %s', url) - socket = request.urlopen(url) - repo = socket.read() - if not isinstance(repo, str): - repo = repo.decode(); - socket.close() - for entry in repo.split('\n')[:-1]: - value = entry.split('|') - version = tuple([int(n) for n in value[0].strip().split('.')]) - version = versions.setdefault(version, {}) - arch = value[1].strip() - if arch == 'x32': - arch = 'i686' - elif arch == 'x64': - arch = 'x86_64' - arch = version.setdefault(arch, {}) - threading = arch.setdefault(value[2].strip(), {}) - exceptions = threading.setdefault(value[3].strip(), {}) - revision = exceptions.setdefault(int(value[4].strip()[3:]), - re_sourceforge.sub(re_sub, value[5].strip())) - return versions - -def find_in_path(file, path=None): - ''' - Attempts to find an executable in the path - ''' - if platform.system() == 'Windows': - file += '.exe' - if path is None: - path = os.environ.get('PATH', '') - if type(path) is type(''): - path = path.split(os.pathsep) - return list(filter(os.path.exists, - map(lambda dir, file=file: os.path.join(dir, file), path))) - -def find_7zip(log = EmptyLogger()): - ''' - Attempts to find 7zip for unpacking the mingw-build archives - ''' - log.info('finding 7zip') - path = find_in_path('7z') - if not path: - key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r'SOFTWARE\7-Zip') - path, _ = winreg.QueryValueEx(key, 'Path') - path = [os.path.join(path, '7z.exe')] - log.debug('found \'%s\'', path[0]) - return path[0] - -find_7zip() - -def unpack(archive, location, log = EmptyLogger()): - ''' - Unpacks a mingw-builds archive - ''' - sevenzip = find_7zip(log) - log.info('unpacking %s', os.path.basename(archive)) - cmd = [sevenzip, 'x', archive, '-o' + location, '-y'] - log.debug(' - %r', cmd) - with open(os.devnull, 'w') as devnull: - subprocess.check_call(cmd, stdout = devnull) - -def download(url, location, log = EmptyLogger()): - ''' - Downloads and unpacks a mingw-builds archive - ''' - log.info('downloading MinGW') - log.debug(' - url: %s', url) - log.debug(' - location: %s', location) - - re_content = re.compile(r'attachment;[ \t]*filename=(")?([^"]*)(")?[\r\n]*') - - stream = 
request.urlopen(url) - try: - content = stream.getheader('Content-Disposition') or '' - except AttributeError: - content = stream.headers.getheader('Content-Disposition') or '' - matches = re_content.match(content) - if matches: - filename = matches.group(2) - else: - parsed = parse.urlparse(stream.geturl()) - filename = os.path.basename(parsed.path) - - try: - os.makedirs(location) - except OSError as e: - if e.errno == errno.EEXIST and os.path.isdir(location): - pass - else: - raise - - archive = os.path.join(location, filename) - with open(archive, 'wb') as out: - while True: - buf = stream.read(1024) - if not buf: - break - out.write(buf) - unpack(archive, location, log = log) - os.remove(archive) - - possible = os.path.join(location, 'mingw64') - if not os.path.exists(possible): - possible = os.path.join(location, 'mingw32') - if not os.path.exists(possible): - raise ValueError('Failed to find unpacked MinGW: ' + possible) - return possible - -def root(location = None, arch = None, version = None, threading = None, - exceptions = None, revision = None, log = EmptyLogger()): - ''' - Returns the root folder of a specific version of the mingw-builds variant - of gcc. Will download the compiler if needed - ''' - - # Get the repository if we don't have all the information - if not (arch and version and threading and exceptions and revision): - versions = repository(log = log) - - # Determine some defaults - version = version or max(versions.keys()) - if not arch: - arch = platform.machine().lower() - if arch == 'x86': - arch = 'i686' - elif arch == 'amd64': - arch = 'x86_64' - if not threading: - keys = versions[version][arch].keys() - if 'posix' in keys: - threading = 'posix' - elif 'win32' in keys: - threading = 'win32' - else: - threading = keys[0] - if not exceptions: - keys = versions[version][arch][threading].keys() - if 'seh' in keys: - exceptions = 'seh' - elif 'sjlj' in keys: - exceptions = 'sjlj' - else: - exceptions = keys[0] - if revision == None: - revision = max(versions[version][arch][threading][exceptions].keys()) - if not location: - location = os.path.join(tempfile.gettempdir(), 'mingw-builds') - - # Get the download url - url = versions[version][arch][threading][exceptions][revision] - - # Tell the user whatzzup - log.info('finding MinGW %s', '.'.join(str(v) for v in version)) - log.debug(' - arch: %s', arch) - log.debug(' - threading: %s', threading) - log.debug(' - exceptions: %s', exceptions) - log.debug(' - revision: %s', revision) - log.debug(' - url: %s', url) - - # Store each specific revision differently - slug = '{version}-{arch}-{threading}-{exceptions}-rev{revision}' - slug = slug.format( - version = '.'.join(str(v) for v in version), - arch = arch, - threading = threading, - exceptions = exceptions, - revision = revision - ) - if arch == 'x86_64': - root_dir = os.path.join(location, slug, 'mingw64') - elif arch == 'i686': - root_dir = os.path.join(location, slug, 'mingw32') - else: - raise ValueError('Unknown MinGW arch: ' + arch) - - # Download if needed - if not os.path.exists(root_dir): - downloaded = download(url, os.path.join(location, slug), log = log) - if downloaded != root_dir: - raise ValueError('The location of mingw did not match\n%s\n%s' - % (downloaded, root_dir)) - - return root_dir - -def str2ver(string): - ''' - Converts a version string into a tuple - ''' - try: - version = tuple(int(v) for v in string.split('.')) - if len(version) is not 3: - raise ValueError() - except ValueError: - raise argparse.ArgumentTypeError( - 'please provide a 
three digit version string') - return version - -def main(): - ''' - Invoked when the script is run directly by the python interpreter - ''' - parser = argparse.ArgumentParser( - description = 'Downloads a specific version of MinGW', - formatter_class = argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument('--location', - help = 'the location to download the compiler to', - default = os.path.join(tempfile.gettempdir(), 'mingw-builds')) - parser.add_argument('--arch', required = True, choices = ['i686', 'x86_64'], - help = 'the target MinGW architecture string') - parser.add_argument('--version', type = str2ver, - help = 'the version of GCC to download') - parser.add_argument('--threading', choices = ['posix', 'win32'], - help = 'the threading type of the compiler') - parser.add_argument('--exceptions', choices = ['sjlj', 'seh', 'dwarf'], - help = 'the method to throw exceptions') - parser.add_argument('--revision', type=int, - help = 'the revision of the MinGW release') - group = parser.add_mutually_exclusive_group() - group.add_argument('-v', '--verbose', action='store_true', - help='increase the script output verbosity') - group.add_argument('-q', '--quiet', action='store_true', - help='only print errors and warning') - args = parser.parse_args() - - # Create the logger - logger = logging.getLogger('mingw') - handler = logging.StreamHandler() - formatter = logging.Formatter('%(message)s') - handler.setFormatter(formatter) - logger.addHandler(handler) - logger.setLevel(logging.INFO) - if args.quiet: - logger.setLevel(logging.WARN) - if args.verbose: - logger.setLevel(logging.DEBUG) - - # Get MinGW - root_dir = root(location = args.location, arch = args.arch, - version = args.version, threading = args.threading, - exceptions = args.exceptions, revision = args.revision, - log = logger) - - sys.stdout.write('%s\n' % os.path.join(root_dir, 'bin')) - -if __name__ == '__main__': - try: - main() - except IOError as e: - sys.stderr.write('IO error: %s\n' % e) - sys.exit(1) - except OSError as e: - sys.stderr.write('OS error: %s\n' % e) - sys.exit(1) - except KeyboardInterrupt as e: - sys.stderr.write('Killed\n') - sys.exit(1) diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/CMakeLists.txt b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/CMakeLists.txt deleted file mode 100644 index 244484b8b05..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/CMakeLists.txt +++ /dev/null @@ -1,78 +0,0 @@ -# Allow the source files to find headers in src/ -include_directories(${PROJECT_SOURCE_DIR}/src) - -if (DEFINED BENCHMARK_CXX_LINKER_FLAGS) - list(APPEND CMAKE_SHARED_LINKER_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}) - list(APPEND CMAKE_MODULE_LINKER_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}) -endif() - -file(GLOB - SOURCE_FILES - *.cc - ${PROJECT_SOURCE_DIR}/include/benchmark/*.h - ${CMAKE_CURRENT_SOURCE_DIR}/*.h) - -add_library(benchmark ${SOURCE_FILES}) -set_target_properties(benchmark PROPERTIES - OUTPUT_NAME "benchmark" - VERSION ${GENERIC_LIB_VERSION} - SOVERSION ${GENERIC_LIB_SOVERSION} -) -target_include_directories(benchmark PUBLIC - $ - ) - -# Link threads. 
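# (Editor's note, illustrative only: on modern CMake one would write
#  find_package(Threads REQUIRED) and link Threads::Threads rather than
#  relying on ${CMAKE_THREAD_LIBS_INIT}; this deleted file predates that.)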
-target_link_libraries(benchmark ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
-find_library(LIBRT rt)
-if(LIBRT)
-  target_link_libraries(benchmark ${LIBRT})
-endif()
-
-# We need extra libraries on Windows
-if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
-  target_link_libraries(benchmark Shlwapi)
-endif()
-
-set(include_install_dir "include")
-set(lib_install_dir "lib/")
-set(bin_install_dir "bin/")
-set(config_install_dir "lib/cmake/${PROJECT_NAME}")
-
-set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated")
-
-set(version_config "${generated_dir}/${PROJECT_NAME}ConfigVersion.cmake")
-set(project_config "${generated_dir}/${PROJECT_NAME}Config.cmake")
-set(targets_export_name "${PROJECT_NAME}Targets")
-
-set(namespace "${PROJECT_NAME}::")
-
-include(CMakePackageConfigHelpers)
-write_basic_package_version_file(
-  "${version_config}" VERSION ${GIT_VERSION} COMPATIBILITY SameMajorVersion
-)
-
-configure_file("${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in" "${project_config}" @ONLY)
-
-# Install target (will install the library to specified CMAKE_INSTALL_PREFIX variable)
-install(
-  TARGETS benchmark
-  EXPORT ${targets_export_name}
-  ARCHIVE DESTINATION ${lib_install_dir}
-  LIBRARY DESTINATION ${lib_install_dir}
-  RUNTIME DESTINATION ${bin_install_dir}
-  INCLUDES DESTINATION ${include_install_dir})
-
-install(
-  DIRECTORY "${PROJECT_SOURCE_DIR}/include/benchmark"
-  DESTINATION ${include_install_dir}
-  FILES_MATCHING PATTERN "*.*h")
-
-install(
-  FILES "${project_config}" "${version_config}"
-  DESTINATION "${config_install_dir}")
-
-install(
-  EXPORT "${targets_export_name}"
-  NAMESPACE "${namespace}"
-  DESTINATION "${config_install_dir}")
diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/arraysize.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/arraysize.h
deleted file mode 100644
index 51a50f2dff2..00000000000
--- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/arraysize.h
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifndef BENCHMARK_ARRAYSIZE_H_
-#define BENCHMARK_ARRAYSIZE_H_
-
-#include "internal_macros.h"
-
-namespace benchmark {
-namespace internal {
-// The arraysize(arr) macro returns the # of elements in an array arr.
-// The expression is a compile-time constant, and therefore can be
-// used in defining new arrays, for example. If you use arraysize on
-// a pointer by mistake, you will get a compile-time error.
-//
-
-// This template function declaration is used in defining arraysize.
-// Note that the function doesn't need an implementation, as we only
-// use its type.
-template <typename T, size_t N>
-char (&ArraySizeHelper(T (&array)[N]))[N];
-
-// That gcc wants both of these prototypes seems mysterious. VC, for
-// its part, can't decide which to use (another mystery). Matching of
-// template overloads: the final frontier.
-#ifndef COMPILER_MSVC
-template <typename T, size_t N>
-char (&ArraySizeHelper(const T (&array)[N]))[N];
-#endif
-
-#define arraysize(array) (sizeof(::benchmark::internal::ArraySizeHelper(array)))
-
-} // end namespace internal
-} // end namespace benchmark
-
-#endif // BENCHMARK_ARRAYSIZE_H_
diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/benchmark.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/benchmark.cc
deleted file mode 100644
index 1ba0a50adf8..00000000000
--- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/benchmark.cc
+++ /dev/null
@@ -1,715 +0,0 @@
-// Copyright 2015 Google Inc. All rights reserved.
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "benchmark/benchmark.h" -#include "benchmark_api_internal.h" -#include "internal_macros.h" - -#ifndef BENCHMARK_OS_WINDOWS -#include -#include -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "check.h" -#include "colorprint.h" -#include "commandlineflags.h" -#include "complexity.h" -#include "counter.h" -#include "log.h" -#include "mutex.h" -#include "re.h" -#include "stat.h" -#include "string_util.h" -#include "sysinfo.h" -#include "timers.h" - -DEFINE_bool(benchmark_list_tests, false, - "Print a list of benchmarks. This option overrides all other " - "options."); - -DEFINE_string(benchmark_filter, ".", - "A regular expression that specifies the set of benchmarks " - "to execute. If this flag is empty, no benchmarks are run. " - "If this flag is the string \"all\", all benchmarks linked " - "into the process are run."); - -DEFINE_double(benchmark_min_time, 0.5, - "Minimum number of seconds we should run benchmark before " - "results are considered significant. For cpu-time based " - "tests, this is the lower bound on the total cpu time " - "used by all threads that make up the test. For real-time " - "based tests, this is the lower bound on the elapsed time " - "of the benchmark execution, regardless of number of " - "threads."); - -DEFINE_int32(benchmark_repetitions, 1, - "The number of runs of each benchmark. If greater than 1, the " - "mean and standard deviation of the runs will be reported."); - -DEFINE_bool(benchmark_report_aggregates_only, false, - "Report the result of each benchmark repetitions. When 'true' is " - "specified only the mean, standard deviation, and other statistics " - "are reported for repeated benchmarks."); - -DEFINE_string(benchmark_format, "console", - "The format to use for console output. Valid values are " - "'console', 'json', or 'csv'."); - -DEFINE_string(benchmark_out_format, "json", - "The format to use for file output. Valid values are " - "'console', 'json', or 'csv'."); - -DEFINE_string(benchmark_out, "", "The file to write additonal output to"); - -DEFINE_string(benchmark_color, "auto", - "Whether to use colors in the output. Valid values: " - "'true'/'yes'/1, 'false'/'no'/0, and 'auto'. 'auto' means to use " - "colors if the output is being sent to a terminal and the TERM " - "environment variable is set to a terminal type that supports " - "colors."); - -DEFINE_bool(benchmark_counters_tabular, false, - "Whether to use tabular format when printing user counters to " - "the console. Valid values: 'true'/'yes'/1, 'false'/'no'/0." 
- "Defaults to false."); - -DEFINE_int32(v, 0, "The level of verbose logging to output"); - -namespace benchmark { -namespace internal { - -void UseCharPointer(char const volatile*) {} - -} // end namespace internal - -namespace { - -static const size_t kMaxIterations = 1000000000; - -} // end namespace - -namespace internal { - -class ThreadManager { - public: - ThreadManager(int num_threads) - : alive_threads_(num_threads), start_stop_barrier_(num_threads) {} - - Mutex& GetBenchmarkMutex() const RETURN_CAPABILITY(benchmark_mutex_) { - return benchmark_mutex_; - } - - bool StartStopBarrier() EXCLUDES(end_cond_mutex_) { - return start_stop_barrier_.wait(); - } - - void NotifyThreadComplete() EXCLUDES(end_cond_mutex_) { - start_stop_barrier_.removeThread(); - if (--alive_threads_ == 0) { - MutexLock lock(end_cond_mutex_); - end_condition_.notify_all(); - } - } - - void WaitForAllThreads() EXCLUDES(end_cond_mutex_) { - MutexLock lock(end_cond_mutex_); - end_condition_.wait(lock.native_handle(), - [this]() { return alive_threads_ == 0; }); - } - - public: - struct Result { - double real_time_used = 0; - double cpu_time_used = 0; - double manual_time_used = 0; - int64_t bytes_processed = 0; - int64_t items_processed = 0; - int complexity_n = 0; - std::string report_label_; - std::string error_message_; - bool has_error_ = false; - UserCounters counters; - }; - GUARDED_BY(GetBenchmarkMutex()) Result results; - - private: - mutable Mutex benchmark_mutex_; - std::atomic alive_threads_; - Barrier start_stop_barrier_; - Mutex end_cond_mutex_; - Condition end_condition_; -}; - -// Timer management class -class ThreadTimer { - public: - ThreadTimer() = default; - - // Called by each thread - void StartTimer() { - running_ = true; - start_real_time_ = ChronoClockNow(); - start_cpu_time_ = ThreadCPUUsage(); - } - - // Called by each thread - void StopTimer() { - CHECK(running_); - running_ = false; - real_time_used_ += ChronoClockNow() - start_real_time_; - cpu_time_used_ += ThreadCPUUsage() - start_cpu_time_; - } - - // Called by each thread - void SetIterationTime(double seconds) { manual_time_used_ += seconds; } - - bool running() const { return running_; } - - // REQUIRES: timer is not running - double real_time_used() { - CHECK(!running_); - return real_time_used_; - } - - // REQUIRES: timer is not running - double cpu_time_used() { - CHECK(!running_); - return cpu_time_used_; - } - - // REQUIRES: timer is not running - double manual_time_used() { - CHECK(!running_); - return manual_time_used_; - } - - private: - bool running_ = false; // Is the timer running - double start_real_time_ = 0; // If running_ - double start_cpu_time_ = 0; // If running_ - - // Accumulated time so far (does not contain current slice if running_) - double real_time_used_ = 0; - double cpu_time_used_ = 0; - // Manually set iteration time. User sets this with SetIterationTime(seconds). - double manual_time_used_ = 0; -}; - -namespace { - -BenchmarkReporter::Run CreateRunReport( - const benchmark::internal::Benchmark::Instance& b, - const internal::ThreadManager::Result& results, size_t iters, - double seconds) { - // Create report about this benchmark run. - BenchmarkReporter::Run report; - - report.benchmark_name = b.name; - report.error_occurred = results.has_error_; - report.error_message = results.error_message_; - report.report_label = results.report_label_; - // Report the total iterations across all threads. 
- report.iterations = static_cast(iters) * b.threads; - report.time_unit = b.time_unit; - - if (!report.error_occurred) { - double bytes_per_second = 0; - if (results.bytes_processed > 0 && seconds > 0.0) { - bytes_per_second = (results.bytes_processed / seconds); - } - double items_per_second = 0; - if (results.items_processed > 0 && seconds > 0.0) { - items_per_second = (results.items_processed / seconds); - } - - if (b.use_manual_time) { - report.real_accumulated_time = results.manual_time_used; - } else { - report.real_accumulated_time = results.real_time_used; - } - report.cpu_accumulated_time = results.cpu_time_used; - report.bytes_per_second = bytes_per_second; - report.items_per_second = items_per_second; - report.complexity_n = results.complexity_n; - report.complexity = b.complexity; - report.complexity_lambda = b.complexity_lambda; - report.counters = results.counters; - internal::Finish(&report.counters, seconds, b.threads); - } - return report; -} - -// Execute one thread of benchmark b for the specified number of iterations. -// Adds the stats collected for the thread into *total. -void RunInThread(const benchmark::internal::Benchmark::Instance* b, - size_t iters, int thread_id, - internal::ThreadManager* manager) { - internal::ThreadTimer timer; - State st(iters, b->arg, thread_id, b->threads, &timer, manager); - b->benchmark->Run(st); - CHECK(st.iterations() == st.max_iterations) - << "Benchmark returned before State::KeepRunning() returned false!"; - { - MutexLock l(manager->GetBenchmarkMutex()); - internal::ThreadManager::Result& results = manager->results; - results.cpu_time_used += timer.cpu_time_used(); - results.real_time_used += timer.real_time_used(); - results.manual_time_used += timer.manual_time_used(); - results.bytes_processed += st.bytes_processed(); - results.items_processed += st.items_processed(); - results.complexity_n += st.complexity_length_n(); - internal::Increment(&results.counters, st.counters); - } - manager->NotifyThreadComplete(); -} - -std::vector RunBenchmark( - const benchmark::internal::Benchmark::Instance& b, - std::vector* complexity_reports) { - std::vector reports; // return value - - const bool has_explicit_iteration_count = b.iterations != 0; - size_t iters = has_explicit_iteration_count ? b.iterations : 1; - std::unique_ptr manager; - std::vector pool(b.threads - 1); - const int repeats = - b.repetitions != 0 ? b.repetitions : FLAGS_benchmark_repetitions; - const bool report_aggregates_only = - repeats != 1 && - (b.report_mode == internal::RM_Unspecified - ? FLAGS_benchmark_report_aggregates_only - : b.report_mode == internal::RM_ReportAggregatesOnly); - for (int repetition_num = 0; repetition_num < repeats; repetition_num++) { - for (;;) { - // Try benchmark - VLOG(2) << "Running " << b.name << " for " << iters << "\n"; - - manager.reset(new internal::ThreadManager(b.threads)); - for (std::size_t ti = 0; ti < pool.size(); ++ti) { - pool[ti] = std::thread(&RunInThread, &b, iters, - static_cast(ti + 1), manager.get()); - } - RunInThread(&b, iters, 0, manager.get()); - manager->WaitForAllThreads(); - for (std::thread& thread : pool) thread.join(); - internal::ThreadManager::Result results; - { - MutexLock l(manager->GetBenchmarkMutex()); - results = manager->results; - } - manager.reset(); - // Adjust real/manual time stats since they were reported per thread. 
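// (Editor's note, illustrative: each thread accumulates its own wall-clock
// time, so 4 threads running concurrently for 1.0 s report 4.0 s in total;
// dividing by b.threads below recovers the 1.0 s of elapsed real time.)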
- results.real_time_used /= b.threads;
- results.manual_time_used /= b.threads;
-
- VLOG(2) << "Ran in " << results.cpu_time_used << "/"
-         << results.real_time_used << "\n";
-
- // Base decisions off of real time if requested by this benchmark.
- double seconds = results.cpu_time_used;
- if (b.use_manual_time) {
-   seconds = results.manual_time_used;
- } else if (b.use_real_time) {
-   seconds = results.real_time_used;
- }
-
- const double min_time =
-     !IsZero(b.min_time) ? b.min_time : FLAGS_benchmark_min_time;
-
- // Determine if this run should be reported; either it has
- // run for a sufficient amount of time or an error was reported.
- const bool should_report = repetition_num > 0
-     || has_explicit_iteration_count // An exact iteration count was requested
-     || results.has_error_
-     || iters >= kMaxIterations
-     || seconds >= min_time // the elapsed time is large enough
-     // CPU time is specified but the elapsed real time greatly exceeds the
-     // minimum time. Note that user-provided timers are exempt from this
-     // sanity check.
-     || ((results.real_time_used >= 5 * min_time) && !b.use_manual_time);
-
- if (should_report) {
-   BenchmarkReporter::Run report =
-       CreateRunReport(b, results, iters, seconds);
-   if (!report.error_occurred && b.complexity != oNone)
-     complexity_reports->push_back(report);
-   reports.push_back(report);
-   break;
- }
-
- // Decide how much the iteration count should be increased by.
- // Note: Avoid division by zero with max(seconds, 1ns).
- double multiplier = min_time * 1.4 / std::max(seconds, 1e-9);
- // If our last run was at least 10% of FLAGS_benchmark_min_time then we
- // use the multiplier directly. Otherwise we use at most 10 times
- // expansion.
- // NOTE: When the last run was at least 10% of the min time the max
- // expansion should be 14x.
- bool is_significant = (seconds / min_time) > 0.1;
- multiplier = is_significant ?
multiplier : std::min(10.0, multiplier); - if (multiplier <= 1.0) multiplier = 2.0; - double next_iters = std::max(multiplier * iters, iters + 1.0); - if (next_iters > kMaxIterations) { - next_iters = kMaxIterations; - } - VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n"; - iters = static_cast(next_iters + 0.5); - } - } - // Calculate additional statistics - auto stat_reports = ComputeStats(reports); - if ((b.complexity != oNone) && b.last_benchmark_instance) { - auto additional_run_stats = ComputeBigO(*complexity_reports); - stat_reports.insert(stat_reports.end(), additional_run_stats.begin(), - additional_run_stats.end()); - complexity_reports->clear(); - } - - if (report_aggregates_only) reports.clear(); - reports.insert(reports.end(), stat_reports.begin(), stat_reports.end()); - return reports; -} - -} // namespace -} // namespace internal - -State::State(size_t max_iters, const std::vector& ranges, int thread_i, - int n_threads, internal::ThreadTimer* timer, - internal::ThreadManager* manager) - : started_(false), - finished_(false), - total_iterations_(0), - range_(ranges), - bytes_processed_(0), - items_processed_(0), - complexity_n_(0), - error_occurred_(false), - counters(), - thread_index(thread_i), - threads(n_threads), - max_iterations(max_iters), - timer_(timer), - manager_(manager) { - CHECK(max_iterations != 0) << "At least one iteration must be run"; - CHECK_LT(thread_index, threads) << "thread_index must be less than threads"; -} - -void State::PauseTiming() { - // Add in time accumulated so far - CHECK(started_ && !finished_ && !error_occurred_); - timer_->StopTimer(); -} - -void State::ResumeTiming() { - CHECK(started_ && !finished_ && !error_occurred_); - timer_->StartTimer(); -} - -void State::SkipWithError(const char* msg) { - CHECK(msg); - error_occurred_ = true; - { - MutexLock l(manager_->GetBenchmarkMutex()); - if (manager_->results.has_error_ == false) { - manager_->results.error_message_ = msg; - manager_->results.has_error_ = true; - } - } - total_iterations_ = max_iterations; - if (timer_->running()) timer_->StopTimer(); -} - -void State::SetIterationTime(double seconds) { - timer_->SetIterationTime(seconds); -} - -void State::SetLabel(const char* label) { - MutexLock l(manager_->GetBenchmarkMutex()); - manager_->results.report_label_ = label; -} - -void State::StartKeepRunning() { - CHECK(!started_ && !finished_); - started_ = true; - manager_->StartStopBarrier(); - if (!error_occurred_) ResumeTiming(); -} - -void State::FinishKeepRunning() { - CHECK(started_ && (!finished_ || error_occurred_)); - if (!error_occurred_) { - PauseTiming(); - } - // Total iterations now is one greater than max iterations. Fix this. - total_iterations_ = max_iterations; - finished_ = true; - manager_->StartStopBarrier(); -} - -namespace internal { -namespace { - -void RunBenchmarks(const std::vector& benchmarks, - BenchmarkReporter* console_reporter, - BenchmarkReporter* file_reporter) { - // Note the file_reporter can be null. - CHECK(console_reporter != nullptr); - - // Determine the width of the name field using a minimum width of 10. 
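// (Editor's note, illustrative: with a longest benchmark name of
// "BM_Foo/1024/threads:8" the width becomes 21; if any benchmark repeats,
// strlen("_stddev") == 7 is added so the aggregate rows still line up.)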
- bool has_repetitions = FLAGS_benchmark_repetitions > 1; - size_t name_field_width = 10; - for (const Benchmark::Instance& benchmark : benchmarks) { - name_field_width = - std::max(name_field_width, benchmark.name.size()); - has_repetitions |= benchmark.repetitions > 1; - } - if (has_repetitions) name_field_width += std::strlen("_stddev"); - - // Print header here - BenchmarkReporter::Context context; - context.num_cpus = NumCPUs(); - context.mhz_per_cpu = CyclesPerSecond() / 1000000.0f; - - context.cpu_scaling_enabled = CpuScalingEnabled(); - context.name_field_width = name_field_width; - - // Keep track of runing times of all instances of current benchmark - std::vector complexity_reports; - - // We flush streams after invoking reporter methods that write to them. This - // ensures users get timely updates even when streams are not line-buffered. - auto flushStreams = [](BenchmarkReporter* reporter) { - if (!reporter) return; - std::flush(reporter->GetOutputStream()); - std::flush(reporter->GetErrorStream()); - }; - - if (console_reporter->ReportContext(context) && - (!file_reporter || file_reporter->ReportContext(context))) { - flushStreams(console_reporter); - flushStreams(file_reporter); - for (const auto& benchmark : benchmarks) { - std::vector reports = - RunBenchmark(benchmark, &complexity_reports); - console_reporter->ReportRuns(reports); - if (file_reporter) file_reporter->ReportRuns(reports); - flushStreams(console_reporter); - flushStreams(file_reporter); - } - } - console_reporter->Finalize(); - if (file_reporter) file_reporter->Finalize(); - flushStreams(console_reporter); - flushStreams(file_reporter); -} - -std::unique_ptr CreateReporter( - std::string const& name, ConsoleReporter::OutputOptions output_opts) { - typedef std::unique_ptr PtrType; - if (name == "console") { - return PtrType(new ConsoleReporter(output_opts)); - } else if (name == "json") { - return PtrType(new JSONReporter); - } else if (name == "csv") { - return PtrType(new CSVReporter); - } else { - std::cerr << "Unexpected format: '" << name << "'\n"; - std::exit(1); - } -} - -} // end namespace - -bool IsZero(double n) { - return std::abs(n) < std::numeric_limits::epsilon(); -} - -ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color) { - int output_opts = ConsoleReporter::OO_Defaults; - if ((FLAGS_benchmark_color == "auto" && IsColorTerminal()) || - IsTruthyFlagValue(FLAGS_benchmark_color)) { - output_opts |= ConsoleReporter::OO_Color; - } else { - output_opts &= ~ConsoleReporter::OO_Color; - } - if(force_no_color) { - output_opts &= ~ConsoleReporter::OO_Color; - } - if(FLAGS_benchmark_counters_tabular) { - output_opts |= ConsoleReporter::OO_Tabular; - } else { - output_opts &= ~ConsoleReporter::OO_Tabular; - } - return static_cast< ConsoleReporter::OutputOptions >(output_opts); -} - -} // end namespace internal - -size_t RunSpecifiedBenchmarks() { - return RunSpecifiedBenchmarks(nullptr, nullptr); -} - -size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter) { - return RunSpecifiedBenchmarks(console_reporter, nullptr); -} - -size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter, - BenchmarkReporter* file_reporter) { - std::string spec = FLAGS_benchmark_filter; - if (spec.empty() || spec == "all") - spec = "."; // Regexp that matches all benchmarks - - // Setup the reporters - std::ofstream output_file; - std::unique_ptr default_console_reporter; - std::unique_ptr default_file_reporter; - if (!console_reporter) { - default_console_reporter = internal::CreateReporter( - 
FLAGS_benchmark_format, internal::GetOutputOptions());
-    console_reporter = default_console_reporter.get();
-  }
-  auto& Out = console_reporter->GetOutputStream();
-  auto& Err = console_reporter->GetErrorStream();
-
-  std::string const& fname = FLAGS_benchmark_out;
-  if (fname.empty() && file_reporter) {
-    Err << "A custom file reporter was provided but "
-           "--benchmark_out=<file> was not specified."
-        << std::endl;
-    std::exit(1);
-  }
-  if (!fname.empty()) {
-    output_file.open(fname);
-    if (!output_file.is_open()) {
-      Err << "invalid file name: '" << fname << "'" << std::endl;
-      std::exit(1);
-    }
-    if (!file_reporter) {
-      default_file_reporter = internal::CreateReporter(
-          FLAGS_benchmark_out_format, ConsoleReporter::OO_None);
-      file_reporter = default_file_reporter.get();
-    }
-    file_reporter->SetOutputStream(&output_file);
-    file_reporter->SetErrorStream(&output_file);
-  }
-
-  std::vector<internal::Benchmark::Instance> benchmarks;
-  if (!FindBenchmarksInternal(spec, &benchmarks, &Err)) return 0;
-
-  if (benchmarks.empty()) {
-    Err << "Failed to match any benchmarks against regex: " << spec << "\n";
-    return 0;
-  }
-
-  if (FLAGS_benchmark_list_tests) {
-    for (auto const& benchmark : benchmarks) Out << benchmark.name << "\n";
-  } else {
-    internal::RunBenchmarks(benchmarks, console_reporter, file_reporter);
-  }
-
-  return benchmarks.size();
-}
-
-namespace internal {
-
-void PrintUsageAndExit() {
-  fprintf(stdout,
-          "benchmark"
-          " [--benchmark_list_tests={true|false}]\n"
-          "          [--benchmark_filter=<regex>]\n"
-          "          [--benchmark_min_time=<min_time>]\n"
-          "          [--benchmark_repetitions=<num_repetitions>]\n"
-          "          [--benchmark_report_aggregates_only={true|false}]\n"
-          "          [--benchmark_format=<console|json|csv>]\n"
-          "          [--benchmark_out=<filename>]\n"
-          "          [--benchmark_out_format=<json|console|csv>]\n"
-          "          [--benchmark_color={auto|true|false}]\n"
-          "          [--benchmark_counters_tabular={true|false}]\n"
-          "          [--v=<verbosity>]\n");
-  exit(0);
-}
-
-void ParseCommandLineFlags(int* argc, char** argv) {
-  using namespace benchmark;
-  for (int i = 1; i < *argc; ++i) {
-    if (ParseBoolFlag(argv[i], "benchmark_list_tests",
-                      &FLAGS_benchmark_list_tests) ||
-        ParseStringFlag(argv[i], "benchmark_filter", &FLAGS_benchmark_filter) ||
-        ParseDoubleFlag(argv[i], "benchmark_min_time",
-                        &FLAGS_benchmark_min_time) ||
-        ParseInt32Flag(argv[i], "benchmark_repetitions",
-                       &FLAGS_benchmark_repetitions) ||
-        ParseBoolFlag(argv[i], "benchmark_report_aggregates_only",
-                      &FLAGS_benchmark_report_aggregates_only) ||
-        ParseStringFlag(argv[i], "benchmark_format", &FLAGS_benchmark_format) ||
-        ParseStringFlag(argv[i], "benchmark_out", &FLAGS_benchmark_out) ||
-        ParseStringFlag(argv[i], "benchmark_out_format",
-                        &FLAGS_benchmark_out_format) ||
-        ParseStringFlag(argv[i], "benchmark_color", &FLAGS_benchmark_color) ||
-        // "color_print" is the deprecated name for "benchmark_color".
-        // TODO: Remove this.
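// (Editor's note: --color_print=<value> is accepted as a deprecated alias
// and stored into FLAGS_benchmark_color; every recognized flag is then
// removed from argv by shifting the remaining arguments left and
// decrementing argc, so later parsers never see it.)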
- ParseStringFlag(argv[i], "color_print", &FLAGS_benchmark_color) || - ParseBoolFlag(argv[i], "benchmark_counters_tabular", - &FLAGS_benchmark_counters_tabular) || - ParseInt32Flag(argv[i], "v", &FLAGS_v)) { - for (int j = i; j != *argc - 1; ++j) argv[j] = argv[j + 1]; - - --(*argc); - --i; - } else if (IsFlag(argv[i], "help")) { - PrintUsageAndExit(); - } - } - for (auto const* flag : - {&FLAGS_benchmark_format, &FLAGS_benchmark_out_format}) - if (*flag != "console" && *flag != "json" && *flag != "csv") { - PrintUsageAndExit(); - } - if (FLAGS_benchmark_color.empty()) { - PrintUsageAndExit(); - } -} - -int InitializeStreams() { - static std::ios_base::Init init; - return 0; -} - -} // end namespace internal - -void Initialize(int* argc, char** argv) { - internal::ParseCommandLineFlags(argc, argv); - internal::LogLevel() = FLAGS_v; -} - -bool ReportUnrecognizedArguments(int argc, char** argv) { - for (int i = 1; i < argc; ++i) { - fprintf(stderr, "%s: error: unrecognized command-line flag: %s\n", argv[0], argv[i]); - } - return argc > 1; -} - -} // end namespace benchmark diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/benchmark_api_internal.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/benchmark_api_internal.h deleted file mode 100644 index 36d23404717..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/benchmark_api_internal.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef BENCHMARK_API_INTERNAL_H -#define BENCHMARK_API_INTERNAL_H - -#include "benchmark/benchmark.h" - -#include -#include -#include -#include -#include - -namespace benchmark { -namespace internal { - -// Information kept per benchmark we may want to run -struct Benchmark::Instance { - std::string name; - Benchmark* benchmark; - ReportMode report_mode; - std::vector arg; - TimeUnit time_unit; - int range_multiplier; - bool use_real_time; - bool use_manual_time; - BigO complexity; - BigOFunc* complexity_lambda; - UserCounters counters; - bool last_benchmark_instance; - int repetitions; - double min_time; - size_t iterations; - int threads; // Number of concurrent threads to us -}; - -bool FindBenchmarksInternal(const std::string& re, - std::vector* benchmarks, - std::ostream* Err); - -bool IsZero(double n); - -ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color = false); - -} // end namespace internal -} // end namespace benchmark - -#endif // BENCHMARK_API_INTERNAL_H diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/benchmark_register.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/benchmark_register.cc deleted file mode 100644 index ed70d820dee..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/benchmark_register.cc +++ /dev/null @@ -1,467 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
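Editor's note: an illustrative sketch of the registration API that
benchmark_register.cc (deleted below) implements, assuming the v1.x
KeepRunning/range() interface; BM_memcpy and its sizes are hypothetical:

#include <cstdint>
#include <cstring>
#include <vector>
#include "benchmark/benchmark.h"

static void BM_memcpy(benchmark::State& state) {
  std::vector<char> src(state.range(0)), dst(state.range(0));
  while (state.KeepRunning()) {
    std::memcpy(dst.data(), src.data(), state.range(0));
  }
  state.SetBytesProcessed(int64_t(state.iterations()) * state.range(0));
}
// One family, expanded into one instance per argument/thread combination:
// BM_memcpy/8/threads:2, BM_memcpy/64/threads:2, BM_memcpy/512/threads:2.
BENCHMARK(BM_memcpy)->RangeMultiplier(8)->Range(8, 512)->Threads(2);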
- -#include "benchmark/benchmark.h" -#include "benchmark_api_internal.h" -#include "internal_macros.h" - -#ifndef BENCHMARK_OS_WINDOWS -#include -#include -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "check.h" -#include "commandlineflags.h" -#include "complexity.h" -#include "log.h" -#include "mutex.h" -#include "re.h" -#include "stat.h" -#include "string_util.h" -#include "sysinfo.h" -#include "timers.h" - -namespace benchmark { - -namespace { -// For non-dense Range, intermediate values are powers of kRangeMultiplier. -static const int kRangeMultiplier = 8; -// The size of a benchmark family determines is the number of inputs to repeat -// the benchmark on. If this is "large" then warn the user during configuration. -static const size_t kMaxFamilySize = 100; -} // end namespace - -namespace internal { - -//=============================================================================// -// BenchmarkFamilies -//=============================================================================// - -// Class for managing registered benchmarks. Note that each registered -// benchmark identifies a family of related benchmarks to run. -class BenchmarkFamilies { - public: - static BenchmarkFamilies* GetInstance(); - - // Registers a benchmark family and returns the index assigned to it. - size_t AddBenchmark(std::unique_ptr family); - - // Clear all registered benchmark families. - void ClearBenchmarks(); - - // Extract the list of benchmark instances that match the specified - // regular expression. - bool FindBenchmarks(const std::string& re, - std::vector* benchmarks, - std::ostream* Err); - - private: - BenchmarkFamilies() {} - - std::vector> families_; - Mutex mutex_; -}; - -BenchmarkFamilies* BenchmarkFamilies::GetInstance() { - static BenchmarkFamilies instance; - return &instance; -} - -size_t BenchmarkFamilies::AddBenchmark(std::unique_ptr family) { - MutexLock l(mutex_); - size_t index = families_.size(); - families_.push_back(std::move(family)); - return index; -} - -void BenchmarkFamilies::ClearBenchmarks() { - MutexLock l(mutex_); - families_.clear(); - families_.shrink_to_fit(); -} - -bool BenchmarkFamilies::FindBenchmarks( - const std::string& spec, std::vector* benchmarks, - std::ostream* ErrStream) { - CHECK(ErrStream); - auto& Err = *ErrStream; - // Make regular expression out of command-line flag - std::string error_msg; - Regex re; - if (!re.Init(spec, &error_msg)) { - Err << "Could not compile benchmark re: " << error_msg << std::endl; - return false; - } - - // Special list of thread counts to use when none are specified - const std::vector one_thread = {1}; - - MutexLock l(mutex_); - for (std::unique_ptr& family : families_) { - // Family was deleted or benchmark doesn't match - if (!family) continue; - - if (family->ArgsCnt() == -1) { - family->Args({}); - } - const std::vector* thread_counts = - (family->thread_counts_.empty() - ? &one_thread - : &static_cast&>(family->thread_counts_)); - const size_t family_size = family->args_.size() * thread_counts->size(); - // The benchmark will be run at least 'family_size' different inputs. - // If 'family_size' is very large warn the user. - if (family_size > kMaxFamilySize) { - Err << "The number of inputs is very large. " << family->name_ - << " will be repeated at least " << family_size << " times.\n"; - } - // reserve in the special case the regex ".", since we know the final - // family size. 
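// (Editor's note, illustrative: a family with args_ = {{8}, {64}, {512}}
// and thread_counts_ = {1, 2} yields family_size = 3 * 2 = 6 instances.)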
- if (spec == ".") benchmarks->reserve(family_size); - - for (auto const& args : family->args_) { - for (int num_threads : *thread_counts) { - Benchmark::Instance instance; - instance.name = family->name_; - instance.benchmark = family.get(); - instance.report_mode = family->report_mode_; - instance.arg = args; - instance.time_unit = family->time_unit_; - instance.range_multiplier = family->range_multiplier_; - instance.min_time = family->min_time_; - instance.iterations = family->iterations_; - instance.repetitions = family->repetitions_; - instance.use_real_time = family->use_real_time_; - instance.use_manual_time = family->use_manual_time_; - instance.complexity = family->complexity_; - instance.complexity_lambda = family->complexity_lambda_; - instance.threads = num_threads; - - // Add arguments to instance name - size_t arg_i = 0; - for (auto const& arg : args) { - instance.name += "/"; - - if (arg_i < family->arg_names_.size()) { - const auto& arg_name = family->arg_names_[arg_i]; - if (!arg_name.empty()) { - instance.name += - StringPrintF("%s:", family->arg_names_[arg_i].c_str()); - } - } - - instance.name += StringPrintF("%d", arg); - ++arg_i; - } - - if (!IsZero(family->min_time_)) - instance.name += StringPrintF("/min_time:%0.3f", family->min_time_); - if (family->iterations_ != 0) - instance.name += StringPrintF("/iterations:%d", family->iterations_); - if (family->repetitions_ != 0) - instance.name += StringPrintF("/repeats:%d", family->repetitions_); - - if (family->use_manual_time_) { - instance.name += "/manual_time"; - } else if (family->use_real_time_) { - instance.name += "/real_time"; - } - - // Add the number of threads used to the name - if (!family->thread_counts_.empty()) { - instance.name += StringPrintF("/threads:%d", instance.threads); - } - - if (re.Match(instance.name)) { - instance.last_benchmark_instance = (&args == &family->args_.back()); - benchmarks->push_back(std::move(instance)); - } - } - } - } - return true; -} - -Benchmark* RegisterBenchmarkInternal(Benchmark* bench) { - std::unique_ptr bench_ptr(bench); - BenchmarkFamilies* families = BenchmarkFamilies::GetInstance(); - families->AddBenchmark(std::move(bench_ptr)); - return bench; -} - -// FIXME: This function is a hack so that benchmark.cc can access -// `BenchmarkFamilies` -bool FindBenchmarksInternal(const std::string& re, - std::vector* benchmarks, - std::ostream* Err) { - return BenchmarkFamilies::GetInstance()->FindBenchmarks(re, benchmarks, Err); -} - -//=============================================================================// -// Benchmark -//=============================================================================// - -Benchmark::Benchmark(const char* name) - : name_(name), - report_mode_(RM_Unspecified), - time_unit_(kNanosecond), - range_multiplier_(kRangeMultiplier), - min_time_(0), - iterations_(0), - repetitions_(0), - use_real_time_(false), - use_manual_time_(false), - complexity_(oNone), - complexity_lambda_(nullptr) {} - -Benchmark::~Benchmark() {} - -void Benchmark::AddRange(std::vector* dst, int lo, int hi, int mult) { - CHECK_GE(lo, 0); - CHECK_GE(hi, lo); - CHECK_GE(mult, 2); - - // Add "lo" - dst->push_back(lo); - - static const int kint32max = std::numeric_limits::max(); - - // Now space out the benchmarks in multiples of "mult" - for (int32_t i = 1; i < kint32max / mult; i *= mult) { - if (i >= hi) break; - if (i > lo) { - dst->push_back(i); - } - } - // Add "hi" (if different from "lo") - if (hi != lo) { - dst->push_back(hi); - } -} - -Benchmark* Benchmark::Arg(int 
x) { - CHECK(ArgsCnt() == -1 || ArgsCnt() == 1); - args_.push_back({x}); - return this; -} - -Benchmark* Benchmark::Unit(TimeUnit unit) { - time_unit_ = unit; - return this; -} - -Benchmark* Benchmark::Range(int start, int limit) { - CHECK(ArgsCnt() == -1 || ArgsCnt() == 1); - std::vector arglist; - AddRange(&arglist, start, limit, range_multiplier_); - - for (int i : arglist) { - args_.push_back({i}); - } - return this; -} - -Benchmark* Benchmark::Ranges(const std::vector>& ranges) { - CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast(ranges.size())); - std::vector> arglists(ranges.size()); - std::size_t total = 1; - for (std::size_t i = 0; i < ranges.size(); i++) { - AddRange(&arglists[i], ranges[i].first, ranges[i].second, - range_multiplier_); - total *= arglists[i].size(); - } - - std::vector ctr(arglists.size(), 0); - - for (std::size_t i = 0; i < total; i++) { - std::vector tmp; - tmp.reserve(arglists.size()); - - for (std::size_t j = 0; j < arglists.size(); j++) { - tmp.push_back(arglists[j].at(ctr[j])); - } - - args_.push_back(std::move(tmp)); - - for (std::size_t j = 0; j < arglists.size(); j++) { - if (ctr[j] + 1 < arglists[j].size()) { - ++ctr[j]; - break; - } - ctr[j] = 0; - } - } - return this; -} - -Benchmark* Benchmark::ArgName(const std::string& name) { - CHECK(ArgsCnt() == -1 || ArgsCnt() == 1); - arg_names_ = {name}; - return this; -} - -Benchmark* Benchmark::ArgNames(const std::vector& names) { - CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast(names.size())); - arg_names_ = names; - return this; -} - -Benchmark* Benchmark::DenseRange(int start, int limit, int step) { - CHECK(ArgsCnt() == -1 || ArgsCnt() == 1); - CHECK_GE(start, 0); - CHECK_LE(start, limit); - for (int arg = start; arg <= limit; arg += step) { - args_.push_back({arg}); - } - return this; -} - -Benchmark* Benchmark::Args(const std::vector& args) { - CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast(args.size())); - args_.push_back(args); - return this; -} - -Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) { - custom_arguments(this); - return this; -} - -Benchmark* Benchmark::RangeMultiplier(int multiplier) { - CHECK(multiplier > 1); - range_multiplier_ = multiplier; - return this; -} - - -Benchmark* Benchmark::MinTime(double t) { - CHECK(t > 0.0); - CHECK(iterations_ == 0); - min_time_ = t; - return this; -} - - -Benchmark* Benchmark::Iterations(size_t n) { - CHECK(n > 0); - CHECK(IsZero(min_time_)); - iterations_ = n; - return this; -} - -Benchmark* Benchmark::Repetitions(int n) { - CHECK(n > 0); - repetitions_ = n; - return this; -} - -Benchmark* Benchmark::ReportAggregatesOnly(bool value) { - report_mode_ = value ? 
RM_ReportAggregatesOnly : RM_Default; - return this; -} - -Benchmark* Benchmark::UseRealTime() { - CHECK(!use_manual_time_) - << "Cannot set UseRealTime and UseManualTime simultaneously."; - use_real_time_ = true; - return this; -} - -Benchmark* Benchmark::UseManualTime() { - CHECK(!use_real_time_) - << "Cannot set UseRealTime and UseManualTime simultaneously."; - use_manual_time_ = true; - return this; -} - -Benchmark* Benchmark::Complexity(BigO complexity) { - complexity_ = complexity; - return this; -} - -Benchmark* Benchmark::Complexity(BigOFunc* complexity) { - complexity_lambda_ = complexity; - complexity_ = oLambda; - return this; -} - -Benchmark* Benchmark::Threads(int t) { - CHECK_GT(t, 0); - thread_counts_.push_back(t); - return this; -} - -Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) { - CHECK_GT(min_threads, 0); - CHECK_GE(max_threads, min_threads); - - AddRange(&thread_counts_, min_threads, max_threads, 2); - return this; -} - -Benchmark* Benchmark::DenseThreadRange(int min_threads, int max_threads, - int stride) { - CHECK_GT(min_threads, 0); - CHECK_GE(max_threads, min_threads); - CHECK_GE(stride, 1); - - for (auto i = min_threads; i < max_threads; i += stride) { - thread_counts_.push_back(i); - } - thread_counts_.push_back(max_threads); - return this; -} - -Benchmark* Benchmark::ThreadPerCpu() { - static int num_cpus = NumCPUs(); - thread_counts_.push_back(num_cpus); - return this; -} - -void Benchmark::SetName(const char* name) { name_ = name; } - -int Benchmark::ArgsCnt() const { - if (args_.empty()) { - if (arg_names_.empty()) return -1; - return static_cast(arg_names_.size()); - } - return static_cast(args_.front().size()); -} - -//=============================================================================// -// FunctionBenchmark -//=============================================================================// - -void FunctionBenchmark::Run(State& st) { func_(st); } - -} // end namespace internal - -void ClearRegisteredBenchmarks() { - internal::BenchmarkFamilies::GetInstance()->ClearBenchmarks(); -} - -} // end namespace benchmark diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/check.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/check.h deleted file mode 100644 index 73bead2fb55..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/check.h +++ /dev/null @@ -1,79 +0,0 @@ -#ifndef CHECK_H_ -#define CHECK_H_ - -#include -#include -#include - -#include "internal_macros.h" -#include "log.h" - -namespace benchmark { -namespace internal { - -typedef void(AbortHandlerT)(); - -inline AbortHandlerT*& GetAbortHandler() { - static AbortHandlerT* handler = &std::abort; - return handler; -} - -BENCHMARK_NORETURN inline void CallAbortHandler() { - GetAbortHandler()(); - std::abort(); // fallback to enforce noreturn -} - -// CheckHandler is the class constructed by failing CHECK macros. CheckHandler -// will log information about the failures and abort when it is destructed. -class CheckHandler { - public: - CheckHandler(const char* check, const char* file, const char* func, int line) - : log_(GetErrorLogInstance()) { - log_ << file << ":" << line << ": " << func << ": Check `" << check - << "' failed. 
"; - } - - LogType& GetLog() { return log_; } - - BENCHMARK_NORETURN ~CheckHandler() BENCHMARK_NOEXCEPT_OP(false) { - log_ << std::endl; - CallAbortHandler(); - } - - CheckHandler& operator=(const CheckHandler&) = delete; - CheckHandler(const CheckHandler&) = delete; - CheckHandler() = delete; - - private: - LogType& log_; -}; - -} // end namespace internal -} // end namespace benchmark - -// The CHECK macro returns a std::ostream object that can have extra information -// written to it. -#ifndef NDEBUG -#define CHECK(b) \ - (b ? ::benchmark::internal::GetNullLogInstance() \ - : ::benchmark::internal::CheckHandler(#b, __FILE__, __func__, __LINE__) \ - .GetLog()) -#else -#define CHECK(b) ::benchmark::internal::GetNullLogInstance() -#endif - -#define CHECK_EQ(a, b) CHECK((a) == (b)) -#define CHECK_NE(a, b) CHECK((a) != (b)) -#define CHECK_GE(a, b) CHECK((a) >= (b)) -#define CHECK_LE(a, b) CHECK((a) <= (b)) -#define CHECK_GT(a, b) CHECK((a) > (b)) -#define CHECK_LT(a, b) CHECK((a) < (b)) - -#define CHECK_FLOAT_EQ(a, b, eps) CHECK(std::fabs((a) - (b)) < (eps)) -#define CHECK_FLOAT_NE(a, b, eps) CHECK(std::fabs((a) - (b)) >= (eps)) -#define CHECK_FLOAT_GE(a, b, eps) CHECK((a) - (b) > -(eps)) -#define CHECK_FLOAT_LE(a, b, eps) CHECK((b) - (a) > -(eps)) -#define CHECK_FLOAT_GT(a, b, eps) CHECK((a) - (b) > (eps)) -#define CHECK_FLOAT_LT(a, b, eps) CHECK((b) - (a) > (eps)) - -#endif // CHECK_H_ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/colorprint.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/colorprint.cc deleted file mode 100644 index 2dec4a8b28b..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/colorprint.cc +++ /dev/null @@ -1,188 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "colorprint.h" - -#include -#include -#include -#include -#include -#include - -#include "check.h" -#include "internal_macros.h" - -#ifdef BENCHMARK_OS_WINDOWS -#include -#include -#else -#include -#endif // BENCHMARK_OS_WINDOWS - -namespace benchmark { -namespace { -#ifdef BENCHMARK_OS_WINDOWS -typedef WORD PlatformColorCode; -#else -typedef const char* PlatformColorCode; -#endif - -PlatformColorCode GetPlatformColorCode(LogColor color) { -#ifdef BENCHMARK_OS_WINDOWS - switch (color) { - case COLOR_RED: - return FOREGROUND_RED; - case COLOR_GREEN: - return FOREGROUND_GREEN; - case COLOR_YELLOW: - return FOREGROUND_RED | FOREGROUND_GREEN; - case COLOR_BLUE: - return FOREGROUND_BLUE; - case COLOR_MAGENTA: - return FOREGROUND_BLUE | FOREGROUND_RED; - case COLOR_CYAN: - return FOREGROUND_BLUE | FOREGROUND_GREEN; - case COLOR_WHITE: // fall through to default - default: - return 0; - } -#else - switch (color) { - case COLOR_RED: - return "1"; - case COLOR_GREEN: - return "2"; - case COLOR_YELLOW: - return "3"; - case COLOR_BLUE: - return "4"; - case COLOR_MAGENTA: - return "5"; - case COLOR_CYAN: - return "6"; - case COLOR_WHITE: - return "7"; - default: - return nullptr; - }; -#endif -} - -} // end namespace - -std::string FormatString(const char* msg, va_list args) { - // we might need a second shot at this, so pre-emptivly make a copy - va_list args_cp; - va_copy(args_cp, args); - - std::size_t size = 256; - char local_buff[256]; - auto ret = vsnprintf(local_buff, size, msg, args_cp); - - va_end(args_cp); - - // currently there is no error handling for failure, so this is hack. - CHECK(ret >= 0); - - if (ret == 0) // handle empty expansion - return {}; - else if (static_cast(ret) < size) - return local_buff; - else { - // we did not provide a long enough buffer on our first attempt. - size = (size_t)ret + 1; // + 1 for the null byte - std::unique_ptr buff(new char[size]); - ret = vsnprintf(buff.get(), size, msg, args); - CHECK(ret > 0 && ((size_t)ret) < size); - return buff.get(); - } -} - -std::string FormatString(const char* msg, ...) { - va_list args; - va_start(args, msg); - auto tmp = FormatString(msg, args); - va_end(args); - return tmp; -} - -void ColorPrintf(std::ostream& out, LogColor color, const char* fmt, ...) { - va_list args; - va_start(args, fmt); - ColorPrintf(out, color, fmt, args); - va_end(args); -} - -void ColorPrintf(std::ostream& out, LogColor color, const char* fmt, - va_list args) { -#ifdef BENCHMARK_OS_WINDOWS - ((void)out); // suppress unused warning - - const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE); - - // Gets the current text color. - CONSOLE_SCREEN_BUFFER_INFO buffer_info; - GetConsoleScreenBufferInfo(stdout_handle, &buffer_info); - const WORD old_color_attrs = buffer_info.wAttributes; - - // We need to flush the stream buffers into the console before each - // SetConsoleTextAttribute call lest it affect the text that is already - // printed but has not yet reached the console. - fflush(stdout); - SetConsoleTextAttribute(stdout_handle, - GetPlatformColorCode(color) | FOREGROUND_INTENSITY); - vprintf(fmt, args); - - fflush(stdout); - // Restores the text color. 
- SetConsoleTextAttribute(stdout_handle, old_color_attrs); -#else - const char* color_code = GetPlatformColorCode(color); - if (color_code) out << FormatString("\033[0;3%sm", color_code); - out << FormatString(fmt, args) << "\033[m"; -#endif -} - -bool IsColorTerminal() { -#if BENCHMARK_OS_WINDOWS - // On Windows the TERM variable is usually not set, but the - // console there does support colors. - return 0 != _isatty(_fileno(stdout)); -#else - // On non-Windows platforms, we rely on the TERM variable. This list of - // supported TERM values is copied from Google Test: - // . - const char* const SUPPORTED_TERM_VALUES[] = { - "xterm", "xterm-color", "xterm-256color", - "screen", "screen-256color", "tmux", - "tmux-256color", "rxvt-unicode", "rxvt-unicode-256color", - "linux", "cygwin", - }; - - const char* const term = getenv("TERM"); - - bool term_supports_color = false; - for (const char* candidate : SUPPORTED_TERM_VALUES) { - if (term && 0 == strcmp(term, candidate)) { - term_supports_color = true; - break; - } - } - - return 0 != isatty(fileno(stdout)) && term_supports_color; -#endif // BENCHMARK_OS_WINDOWS -} - -} // end namespace benchmark diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/colorprint.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/colorprint.h deleted file mode 100644 index 9f6fab9b342..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/colorprint.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef BENCHMARK_COLORPRINT_H_ -#define BENCHMARK_COLORPRINT_H_ - -#include -#include -#include - -namespace benchmark { -enum LogColor { - COLOR_DEFAULT, - COLOR_RED, - COLOR_GREEN, - COLOR_YELLOW, - COLOR_BLUE, - COLOR_MAGENTA, - COLOR_CYAN, - COLOR_WHITE -}; - -std::string FormatString(const char* msg, va_list args); -std::string FormatString(const char* msg, ...); - -void ColorPrintf(std::ostream& out, LogColor color, const char* fmt, - va_list args); -void ColorPrintf(std::ostream& out, LogColor color, const char* fmt, ...); - -// Returns true if stdout appears to be a terminal that supports colored -// output, false otherwise. -bool IsColorTerminal(); - -} // end namespace benchmark - -#endif // BENCHMARK_COLORPRINT_H_ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/commandlineflags.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/commandlineflags.cc deleted file mode 100644 index 2fc92517a32..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/commandlineflags.cc +++ /dev/null @@ -1,218 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "commandlineflags.h" - -#include -#include -#include -#include -#include - -namespace benchmark { -// Parses 'str' for a 32-bit signed integer. If successful, writes -// the result to *value and returns true; otherwise leaves *value -// unchanged and returns false. 
-bool ParseInt32(const std::string& src_text, const char* str, int32_t* value) { - // Parses the environment variable as a decimal integer. - char* end = nullptr; - const long long_value = strtol(str, &end, 10); // NOLINT - - // Has strtol() consumed all characters in the string? - if (*end != '\0') { - // No - an invalid character was encountered. - std::cerr << src_text << " is expected to be a 32-bit integer, " - << "but actually has value \"" << str << "\".\n"; - return false; - } - - // Is the parsed value in the range of an Int32? - const int32_t result = static_cast<int32_t>(long_value); - if (long_value == std::numeric_limits<long>::max() || - long_value == std::numeric_limits<long>::min() || - // The parsed value overflows as a long. (strtol() returns - // LONG_MAX or LONG_MIN when the input overflows.) - result != long_value - // The parsed value overflows as an Int32. - ) { - std::cerr << src_text << " is expected to be a 32-bit integer, " - << "but actually has value \"" << str << "\", " - << "which overflows.\n"; - return false; - } - - *value = result; - return true; -} - -// Parses 'str' for a double. If successful, writes the result to *value and -// returns true; otherwise leaves *value unchanged and returns false. -bool ParseDouble(const std::string& src_text, const char* str, double* value) { - // Parses the environment variable as a decimal integer. - char* end = nullptr; - const double double_value = strtod(str, &end); // NOLINT - - // Has strtol() consumed all characters in the string? - if (*end != '\0') { - // No - an invalid character was encountered. - std::cerr << src_text << " is expected to be a double, " - << "but actually has value \"" << str << "\".\n"; - return false; - } - - *value = double_value; - return true; -} - -// Returns the name of the environment variable corresponding to the -// given flag. For example, FlagToEnvVar("foo") will return -// "BENCHMARK_FOO" in the open-source version. -static std::string FlagToEnvVar(const char* flag) { - const std::string flag_str(flag); - - std::string env_var; - for (size_t i = 0; i != flag_str.length(); ++i) - env_var += static_cast<char>(::toupper(flag_str.c_str()[i])); - - return "BENCHMARK_" + env_var; -} - -// Reads and returns the Boolean environment variable corresponding to -// the given flag; if it's not set, returns default_value. -// -// The value is considered true iff it's not "0". -bool BoolFromEnv(const char* flag, bool default_value) { - const std::string env_var = FlagToEnvVar(flag); - const char* const string_value = getenv(env_var.c_str()); - return string_value == nullptr ? default_value - : strcmp(string_value, "0") != 0; -} - -// Reads and returns a 32-bit integer stored in the environment -// variable corresponding to the given flag; if it isn't set or -// doesn't represent a valid 32-bit integer, returns default_value. -int32_t Int32FromEnv(const char* flag, int32_t default_value) { - const std::string env_var = FlagToEnvVar(flag); - const char* const string_value = getenv(env_var.c_str()); - if (string_value == nullptr) { - // The environment variable is not set. - return default_value; - } - - int32_t result = default_value; - if (!ParseInt32(std::string("Environment variable ") + env_var, string_value, - &result)) { - std::cout << "The default value " << default_value << " is used.\n"; - return default_value; - } - - return result; -} - -// Reads and returns the string environment variable corresponding to -// the given flag; if it's not set, returns default_value.
-const char* StringFromEnv(const char* flag, const char* default_value) { - const std::string env_var = FlagToEnvVar(flag); - const char* const value = getenv(env_var.c_str()); - return value == nullptr ? default_value : value; -} - -// Parses a string as a command line flag. The string should have -// the format "--flag=value". When def_optional is true, the "=value" -// part can be omitted. -// -// Returns the value of the flag, or nullptr if the parsing failed. -const char* ParseFlagValue(const char* str, const char* flag, - bool def_optional) { - // str and flag must not be nullptr. - if (str == nullptr || flag == nullptr) return nullptr; - - // The flag must start with "--". - const std::string flag_str = std::string("--") + std::string(flag); - const size_t flag_len = flag_str.length(); - if (strncmp(str, flag_str.c_str(), flag_len) != 0) return nullptr; - - // Skips the flag name. - const char* flag_end = str + flag_len; - - // When def_optional is true, it's OK to not have a "=value" part. - if (def_optional && (flag_end[0] == '\0')) return flag_end; - - // If def_optional is true and there are more characters after the - // flag name, or if def_optional is false, there must be a '=' after - // the flag name. - if (flag_end[0] != '=') return nullptr; - - // Returns the string after "=". - return flag_end + 1; -} - -bool ParseBoolFlag(const char* str, const char* flag, bool* value) { - // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, true); - - // Aborts if the parsing failed. - if (value_str == nullptr) return false; - - // Converts the string value to a bool. - *value = IsTruthyFlagValue(value_str); - return true; -} - -bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) { - // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, false); - - // Aborts if the parsing failed. - if (value_str == nullptr) return false; - - // Sets *value to the value of the flag. - return ParseInt32(std::string("The value of flag --") + flag, value_str, - value); -} - -bool ParseDoubleFlag(const char* str, const char* flag, double* value) { - // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, false); - - // Aborts if the parsing failed. - if (value_str == nullptr) return false; - - // Sets *value to the value of the flag. - return ParseDouble(std::string("The value of flag --") + flag, value_str, - value); -} - -bool ParseStringFlag(const char* str, const char* flag, std::string* value) { - // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, false); - - // Aborts if the parsing failed. 
- if (value_str == nullptr) return false; - - *value = value_str; - return true; -} - -bool IsFlag(const char* str, const char* flag) { - return (ParseFlagValue(str, flag, true) != nullptr); -} - -bool IsTruthyFlagValue(const std::string& value) { - if (value.empty()) return true; - char ch = value[0]; - return isalnum(ch) && - !(ch == '0' || ch == 'f' || ch == 'F' || ch == 'n' || ch == 'N'); -} -} // end namespace benchmark diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/commandlineflags.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/commandlineflags.h deleted file mode 100644 index 945c9a9fc4a..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/commandlineflags.h +++ /dev/null @@ -1,79 +0,0 @@ -#ifndef BENCHMARK_COMMANDLINEFLAGS_H_ -#define BENCHMARK_COMMANDLINEFLAGS_H_ - -#include -#include - -// Macro for referencing flags. -#define FLAG(name) FLAGS_##name - -// Macros for declaring flags. -#define DECLARE_bool(name) extern bool FLAG(name) -#define DECLARE_int32(name) extern int32_t FLAG(name) -#define DECLARE_int64(name) extern int64_t FLAG(name) -#define DECLARE_double(name) extern double FLAG(name) -#define DECLARE_string(name) extern std::string FLAG(name) - -// Macros for defining flags. -#define DEFINE_bool(name, default_val, doc) bool FLAG(name) = (default_val) -#define DEFINE_int32(name, default_val, doc) int32_t FLAG(name) = (default_val) -#define DEFINE_int64(name, default_val, doc) int64_t FLAG(name) = (default_val) -#define DEFINE_double(name, default_val, doc) double FLAG(name) = (default_val) -#define DEFINE_string(name, default_val, doc) \ - std::string FLAG(name) = (default_val) - -namespace benchmark { -// Parses 'str' for a 32-bit signed integer. If successful, writes the result -// to *value and returns true; otherwise leaves *value unchanged and returns -// false. -bool ParseInt32(const std::string& src_text, const char* str, int32_t* value); - -// Parses a bool/Int32/string from the environment variable -// corresponding to the given Google Test flag. -bool BoolFromEnv(const char* flag, bool default_val); -int32_t Int32FromEnv(const char* flag, int32_t default_val); -double DoubleFromEnv(const char* flag, double default_val); -const char* StringFromEnv(const char* flag, const char* default_val); - -// Parses a string for a bool flag, in the form of either -// "--flag=value" or "--flag". -// -// In the former case, the value is taken as true if it passes IsTruthyValue(). -// -// In the latter case, the value is taken as true. -// -// On success, stores the value of the flag in *value, and returns -// true. On failure, returns false without changing *value. -bool ParseBoolFlag(const char* str, const char* flag, bool* value); - -// Parses a string for an Int32 flag, in the form of -// "--flag=value". -// -// On success, stores the value of the flag in *value, and returns -// true. On failure, returns false without changing *value. -bool ParseInt32Flag(const char* str, const char* flag, int32_t* value); - -// Parses a string for a Double flag, in the form of -// "--flag=value". -// -// On success, stores the value of the flag in *value, and returns -// true. On failure, returns false without changing *value. -bool ParseDoubleFlag(const char* str, const char* flag, double* value); - -// Parses a string for a string flag, in the form of -// "--flag=value". -// -// On success, stores the value of the flag in *value, and returns -// true. On failure, returns false without changing *value. 
-bool ParseStringFlag(const char* str, const char* flag, std::string* value); - -// Returns true if the string matches the flag. -bool IsFlag(const char* str, const char* flag); - -// Returns true unless value starts with one of: '0', 'f', 'F', 'n' or 'N', or -// some non-alphanumeric character. As a special case, also returns true if -// value is the empty string. -bool IsTruthyFlagValue(const std::string& value); -} // end namespace benchmark - -#endif // BENCHMARK_COMMANDLINEFLAGS_H_ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/complexity.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/complexity.cc deleted file mode 100644 index 33975be55ec..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/complexity.cc +++ /dev/null @@ -1,324 +0,0 @@ -// Copyright 2016 Ismael Jimenez Martinez. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Source project : https://github.com/ismaelJimenez/cpp.leastsq -// Adapted to be used with google benchmark - -#include "benchmark/benchmark.h" - -#include -#include -#include "check.h" -#include "complexity.h" -#include "stat.h" - -namespace benchmark { - -// Internal function to calculate the different scalability forms -BigOFunc* FittingCurve(BigO complexity) { - switch (complexity) { - case oN: - return [](int n) -> double { return n; }; - case oNSquared: - return [](int n) -> double { return std::pow(n, 2); }; - case oNCubed: - return [](int n) -> double { return std::pow(n, 3); }; - case oLogN: - return [](int n) { return log2(n); }; - case oNLogN: - return [](int n) { return n * log2(n); }; - case o1: - default: - return [](int) { return 1.0; }; - } -} - -// Function to return an string for the calculated complexity -std::string GetBigOString(BigO complexity) { - switch (complexity) { - case oN: - return "N"; - case oNSquared: - return "N^2"; - case oNCubed: - return "N^3"; - case oLogN: - return "lgN"; - case oNLogN: - return "NlgN"; - case o1: - return "(1)"; - default: - return "f(N)"; - } -} - -// Find the coefficient for the high-order term in the running time, by -// minimizing the sum of squares of relative error, for the fitting curve -// given by the lambda expresion. -// - n : Vector containing the size of the benchmark tests. -// - time : Vector containing the times for the benchmark tests. -// - fitting_curve : lambda expresion (e.g. [](int n) {return n; };). 
- -// For a deeper explanation on the algorithm logic, look the README file at -// http://github.com/ismaelJimenez/Minimal-Cpp-Least-Squared-Fit - -LeastSq MinimalLeastSq(const std::vector<int>& n, - const std::vector<double>& time, - BigOFunc* fitting_curve) { - double sigma_gn = 0.0; - double sigma_gn_squared = 0.0; - double sigma_time = 0.0; - double sigma_time_gn = 0.0; - - // Calculate least square fitting parameter - for (size_t i = 0; i < n.size(); ++i) { - double gn_i = fitting_curve(n[i]); - sigma_gn += gn_i; - sigma_gn_squared += gn_i * gn_i; - sigma_time += time[i]; - sigma_time_gn += time[i] * gn_i; - } - - LeastSq result; - result.complexity = oLambda; - - // Calculate complexity. - result.coef = sigma_time_gn / sigma_gn_squared; - - // Calculate RMS - double rms = 0.0; - for (size_t i = 0; i < n.size(); ++i) { - double fit = result.coef * fitting_curve(n[i]); - rms += pow((time[i] - fit), 2); - } - - // Normalized RMS by the mean of the observed values - double mean = sigma_time / n.size(); - result.rms = sqrt(rms / n.size()) / mean; - - return result; -} - -// Find the coefficient for the high-order term in the running time, by -// minimizing the sum of squares of relative error. -// - n : Vector containing the size of the benchmark tests. -// - time : Vector containing the times for the benchmark tests. -// - complexity : If different than oAuto, the fitting curve will stick to -// this one. If it is oAuto, it will be calculated the best -// fitting curve. -LeastSq MinimalLeastSq(const std::vector<int>& n, - const std::vector<double>& time, const BigO complexity) { - CHECK_EQ(n.size(), time.size()); - CHECK_GE(n.size(), 2); // Do not compute fitting curve is less than two - // benchmark runs are given - CHECK_NE(complexity, oNone); - - LeastSq best_fit; - - if (complexity == oAuto) { - std::vector<BigO> fit_curves = {oLogN, oN, oNLogN, oNSquared, oNCubed}; - - // Take o1 as default best fitting curve - best_fit = MinimalLeastSq(n, time, FittingCurve(o1)); - best_fit.complexity = o1; - - // Compute all possible fitting curves and stick to the best one - for (const auto& fit : fit_curves) { - LeastSq current_fit = MinimalLeastSq(n, time, FittingCurve(fit)); - if (current_fit.rms < best_fit.rms) { - best_fit = current_fit; - best_fit.complexity = fit; - } - } - } else { - best_fit = MinimalLeastSq(n, time, FittingCurve(complexity)); - best_fit.complexity = complexity; - } - - return best_fit; -} - -std::vector<BenchmarkReporter::Run> ComputeStats( - const std::vector<BenchmarkReporter::Run>& reports) { - typedef BenchmarkReporter::Run Run; - std::vector<Run> results; - - auto error_count = - std::count_if(reports.begin(), reports.end(), - [](Run const& run) { return run.error_occurred; }); - - if (reports.size() - error_count < 2) { - // We don't report aggregated data if there was a single run. - return results; - } - // Accumulators. - Stat1_d real_accumulated_time_stat; - Stat1_d cpu_accumulated_time_stat; - Stat1_d bytes_per_second_stat; - Stat1_d items_per_second_stat; - // All repetitions should be run with the same number of iterations so we - // can take this information from the first benchmark.
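For context on MinimalLeastSq above: the single accumulation loop is the closed-form solution of the one-parameter fit, minimizing the squared error of t_i ≈ coef · g(n_i). In the file's own variable names:

$$\text{coef} = \frac{\sum_i t_i\, g(n_i)}{\sum_i g(n_i)^2}, \qquad \text{rms} = \frac{1}{\bar t}\,\sqrt{\frac{1}{N}\sum_i \bigl(t_i - \text{coef}\, g(n_i)\bigr)^2}$$

where g is the candidate curve returned by FittingCurve and t̄ is the mean observed time, so the reported rms is relative to the magnitude of the measurements rather than absolute.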
- int64_t const run_iterations = reports.front().iterations; - // create stats for user counters - struct CounterStat { - Counter c; - Stat1_d s; - }; - std::map< std::string, CounterStat > counter_stats; - for(Run const& r : reports) { - for(auto const& cnt : r.counters) { - auto it = counter_stats.find(cnt.first); - if(it == counter_stats.end()) { - counter_stats.insert({cnt.first, {cnt.second, Stat1_d{}}}); - } else { - CHECK_EQ(counter_stats[cnt.first].c.flags, cnt.second.flags); - } - } - } - - // Populate the accumulators. - for (Run const& run : reports) { - CHECK_EQ(reports[0].benchmark_name, run.benchmark_name); - CHECK_EQ(run_iterations, run.iterations); - if (run.error_occurred) continue; - real_accumulated_time_stat += - Stat1_d(run.real_accumulated_time / run.iterations); - cpu_accumulated_time_stat += - Stat1_d(run.cpu_accumulated_time / run.iterations); - items_per_second_stat += Stat1_d(run.items_per_second); - bytes_per_second_stat += Stat1_d(run.bytes_per_second); - // user counters - for(auto const& cnt : run.counters) { - auto it = counter_stats.find(cnt.first); - CHECK_NE(it, counter_stats.end()); - it->second.s += Stat1_d(cnt.second); - } - } - - // Get the data from the accumulator to BenchmarkReporter::Run's. - Run mean_data; - mean_data.benchmark_name = reports[0].benchmark_name + "_mean"; - mean_data.iterations = run_iterations; - mean_data.real_accumulated_time = - real_accumulated_time_stat.Mean() * run_iterations; - mean_data.cpu_accumulated_time = - cpu_accumulated_time_stat.Mean() * run_iterations; - mean_data.bytes_per_second = bytes_per_second_stat.Mean(); - mean_data.items_per_second = items_per_second_stat.Mean(); - mean_data.time_unit = reports[0].time_unit; - // user counters - for(auto const& kv : counter_stats) { - auto c = Counter(kv.second.s.Mean(), counter_stats[kv.first].c.flags); - mean_data.counters[kv.first] = c; - } - - // Only add label to mean/stddev if it is same for all runs - mean_data.report_label = reports[0].report_label; - for (std::size_t i = 1; i < reports.size(); i++) { - if (reports[i].report_label != reports[0].report_label) { - mean_data.report_label = ""; - break; - } - } - - Run stddev_data; - stddev_data.benchmark_name = reports[0].benchmark_name + "_stddev"; - stddev_data.report_label = mean_data.report_label; - stddev_data.iterations = 0; - stddev_data.real_accumulated_time = real_accumulated_time_stat.StdDev(); - stddev_data.cpu_accumulated_time = cpu_accumulated_time_stat.StdDev(); - stddev_data.bytes_per_second = bytes_per_second_stat.StdDev(); - stddev_data.items_per_second = items_per_second_stat.StdDev(); - stddev_data.time_unit = reports[0].time_unit; - // user counters - for(auto const& kv : counter_stats) { - auto c = Counter(kv.second.s.StdDev(), counter_stats[kv.first].c.flags); - stddev_data.counters[kv.first] = c; - } - - results.push_back(mean_data); - results.push_back(stddev_data); - return results; -} - -std::vector ComputeBigO( - const std::vector& reports) { - typedef BenchmarkReporter::Run Run; - std::vector results; - - if (reports.size() < 2) return results; - - // Accumulators. - std::vector n; - std::vector real_time; - std::vector cpu_time; - - // Populate the accumulators. 
- for (const Run& run : reports) { - CHECK_GT(run.complexity_n, 0) << "Did you forget to call SetComplexityN?"; - n.push_back(run.complexity_n); - real_time.push_back(run.real_accumulated_time / run.iterations); - cpu_time.push_back(run.cpu_accumulated_time / run.iterations); - } - - LeastSq result_cpu; - LeastSq result_real; - - if (reports[0].complexity == oLambda) { - result_cpu = MinimalLeastSq(n, cpu_time, reports[0].complexity_lambda); - result_real = MinimalLeastSq(n, real_time, reports[0].complexity_lambda); - } else { - result_cpu = MinimalLeastSq(n, cpu_time, reports[0].complexity); - result_real = MinimalLeastSq(n, real_time, result_cpu.complexity); - } - std::string benchmark_name = - reports[0].benchmark_name.substr(0, reports[0].benchmark_name.find('/')); - - // Get the data from the accumulator to BenchmarkReporter::Run's. - Run big_o; - big_o.benchmark_name = benchmark_name + "_BigO"; - big_o.iterations = 0; - big_o.real_accumulated_time = result_real.coef; - big_o.cpu_accumulated_time = result_cpu.coef; - big_o.report_big_o = true; - big_o.complexity = result_cpu.complexity; - - // All the time results are reported after being multiplied by the - // time unit multiplier. But since RMS is a relative quantity it - // should not be multiplied at all. So, here, we _divide_ it by the - // multiplier so that when it is multiplied later the result is the - // correct one. - double multiplier = GetTimeUnitMultiplier(reports[0].time_unit); - - // Only add label to mean/stddev if it is same for all runs - Run rms; - big_o.report_label = reports[0].report_label; - rms.benchmark_name = benchmark_name + "_RMS"; - rms.report_label = big_o.report_label; - rms.iterations = 0; - rms.real_accumulated_time = result_real.rms / multiplier; - rms.cpu_accumulated_time = result_cpu.rms / multiplier; - rms.report_rms = true; - rms.complexity = result_cpu.complexity; - // don't forget to keep the time unit, or we won't be able to - // recover the correct value. - rms.time_unit = reports[0].time_unit; - - results.push_back(big_o); - results.push_back(rms); - return results; -} - -} // end namespace benchmark diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/complexity.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/complexity.h deleted file mode 100644 index c0ca60e6bec..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/complexity.h +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2016 Ismael Jimenez Martinez. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Source project : https://github.com/ismaelJimenez/cpp.leastsq -// Adapted to be used with google benchmark - -#ifndef COMPLEXITY_H_ -#define COMPLEXITY_H_ - -#include -#include - -#include "benchmark/benchmark.h" - -namespace benchmark { - -// Return a vector containing the mean and standard devation information for -// the specified list of reports. 
If 'reports' contains less than two -// non-errored runs an empty vector is returned -std::vector ComputeStats( - const std::vector& reports); - -// Return a vector containing the bigO and RMS information for the specified -// list of reports. If 'reports.size() < 2' an empty vector is returned. -std::vector ComputeBigO( - const std::vector& reports); - -// This data structure will contain the result returned by MinimalLeastSq -// - coef : Estimated coeficient for the high-order term as -// interpolated from data. -// - rms : Normalized Root Mean Squared Error. -// - complexity : Scalability form (e.g. oN, oNLogN). In case a scalability -// form has been provided to MinimalLeastSq this will return -// the same value. In case BigO::oAuto has been selected, this -// parameter will return the best fitting curve detected. - -struct LeastSq { - LeastSq() : coef(0.0), rms(0.0), complexity(oNone) {} - - double coef; - double rms; - BigO complexity; -}; - -// Function to return an string for the calculated complexity -std::string GetBigOString(BigO complexity); - -} // end namespace benchmark -#endif // COMPLEXITY_H_ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/console_reporter.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/console_reporter.cc deleted file mode 100644 index 4bb6f71271c..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/console_reporter.cc +++ /dev/null @@ -1,180 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "benchmark/benchmark.h" -#include "complexity.h" -#include "counter.h" - -#include -#include -#include -#include -#include -#include -#include - -#include "check.h" -#include "colorprint.h" -#include "commandlineflags.h" -#include "internal_macros.h" -#include "string_util.h" -#include "timers.h" - -namespace benchmark { - -bool ConsoleReporter::ReportContext(const Context& context) { - name_field_width_ = context.name_field_width; - printed_header_ = false; - prev_counters_.clear(); - - PrintBasicContext(&GetErrorStream(), context); - -#ifdef BENCHMARK_OS_WINDOWS - if ((output_options_ & OO_Color) && &std::cout != &GetOutputStream()) { - GetErrorStream() - << "Color printing is only supported for stdout on windows." 
- " Disabling color printing\n"; - output_options_ = static_cast< OutputOptions >(output_options_ & ~OO_Color); - } -#endif - - return true; -} - -void ConsoleReporter::PrintHeader(const Run& run) { - std::string str = FormatString("%-*s %13s %13s %10s", static_cast(name_field_width_), - "Benchmark", "Time", "CPU", "Iterations"); - if(!run.counters.empty()) { - if(output_options_ & OO_Tabular) { - for(auto const& c : run.counters) { - str += FormatString(" %10s", c.first.c_str()); - } - } else { - str += " UserCounters..."; - } - } - str += "\n"; - std::string line = std::string(str.length(), '-'); - GetOutputStream() << line << "\n" << str << line << "\n"; -} - -void ConsoleReporter::ReportRuns(const std::vector& reports) { - for (const auto& run : reports) { - // print the header: - // --- if none was printed yet - bool print_header = !printed_header_; - // --- or if the format is tabular and this run - // has different fields from the prev header - print_header |= (output_options_ & OO_Tabular) && - (!internal::SameNames(run.counters, prev_counters_)); - if (print_header) { - printed_header_ = true; - prev_counters_ = run.counters; - PrintHeader(run); - } - // As an alternative to printing the headers like this, we could sort - // the benchmarks by header and then print. But this would require - // waiting for the full results before printing, or printing twice. - PrintRunData(run); - } -} - -static void IgnoreColorPrint(std::ostream& out, LogColor, const char* fmt, - ...) { - va_list args; - va_start(args, fmt); - out << FormatString(fmt, args); - va_end(args); -} - -void ConsoleReporter::PrintRunData(const Run& result) { - typedef void(PrinterFn)(std::ostream&, LogColor, const char*, ...); - auto& Out = GetOutputStream(); - PrinterFn* printer = (output_options_ & OO_Color) ? - (PrinterFn*)ColorPrintf : IgnoreColorPrint; - auto name_color = - (result.report_big_o || result.report_rms) ? 
COLOR_BLUE : COLOR_GREEN; - printer(Out, name_color, "%-*s ", name_field_width_, - result.benchmark_name.c_str()); - - if (result.error_occurred) { - printer(Out, COLOR_RED, "ERROR OCCURRED: \'%s\'", - result.error_message.c_str()); - printer(Out, COLOR_DEFAULT, "\n"); - return; - } - // Format bytes per second - std::string rate; - if (result.bytes_per_second > 0) { - rate = StrCat(" ", HumanReadableNumber(result.bytes_per_second), "B/s"); - } - - // Format items per second - std::string items; - if (result.items_per_second > 0) { - items = - StrCat(" ", HumanReadableNumber(result.items_per_second), " items/s"); - } - - const double real_time = result.GetAdjustedRealTime(); - const double cpu_time = result.GetAdjustedCPUTime(); - - if (result.report_big_o) { - std::string big_o = GetBigOString(result.complexity); - printer(Out, COLOR_YELLOW, "%10.2f %s %10.2f %s ", real_time, big_o.c_str(), - cpu_time, big_o.c_str()); - } else if (result.report_rms) { - printer(Out, COLOR_YELLOW, "%10.0f %% %10.0f %% ", real_time * 100, - cpu_time * 100); - } else { - const char* timeLabel = GetTimeUnitString(result.time_unit); - printer(Out, COLOR_YELLOW, "%10.0f %s %10.0f %s ", real_time, timeLabel, - cpu_time, timeLabel); - } - - if (!result.report_big_o && !result.report_rms) { - printer(Out, COLOR_CYAN, "%10lld", result.iterations); - } - - for (auto& c : result.counters) { - auto const& s = HumanReadableNumber(c.second.value); - if (output_options_ & OO_Tabular) { - if (c.second.flags & Counter::kIsRate) { - printer(Out, COLOR_DEFAULT, " %8s/s", s.c_str()); - } else { - printer(Out, COLOR_DEFAULT, " %10s", s.c_str()); - } - } else { - const char* unit = (c.second.flags & Counter::kIsRate) ? "/s" : ""; - printer(Out, COLOR_DEFAULT, " %s=%s%s", c.first.c_str(), s.c_str(), - unit); - } - } - - if (!rate.empty()) { - printer(Out, COLOR_DEFAULT, " %*s", 13, rate.c_str()); - } - - if (!items.empty()) { - printer(Out, COLOR_DEFAULT, " %*s", 18, items.c_str()); - } - - if (!result.report_label.empty()) { - printer(Out, COLOR_DEFAULT, " %s", result.report_label.c_str()); - } - - printer(Out, COLOR_DEFAULT, "\n"); -} - -} // end namespace benchmark diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/counter.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/counter.cc deleted file mode 100644 index ed1aa044ee7..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/counter.cc +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
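For context on the console reporter removed above: PrintHeader and PrintRunData together emit one fixed-width row per run — name in green, times in yellow, iteration count in cyan when color is enabled. An illustrative rendering with made-up numbers, not captured output:

```
Benchmark                  Time           CPU Iterations
--------------------------------------------------------
BM_example/64             51 ns         50 ns   13684896
```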
- -#include "counter.h" - -namespace benchmark { -namespace internal { - -double Finish(Counter const& c, double cpu_time, double num_threads) { - double v = c.value; - if (c.flags & Counter::kIsRate) { - v /= cpu_time; - } - if (c.flags & Counter::kAvgThreads) { - v /= num_threads; - } - return v; -} - -void Finish(UserCounters *l, double cpu_time, double num_threads) { - for (auto &c : *l) { - c.second.value = Finish(c.second, cpu_time, num_threads); - } -} - -void Increment(UserCounters *l, UserCounters const& r) { - // add counters present in both or just in *l - for (auto &c : *l) { - auto it = r.find(c.first); - if (it != r.end()) { - c.second.value = c.second + it->second; - } - } - // add counters present in r, but not in *l - for (auto const &tc : r) { - auto it = l->find(tc.first); - if (it == l->end()) { - (*l)[tc.first] = tc.second; - } - } -} - -bool SameNames(UserCounters const& l, UserCounters const& r) { - if (&l == &r) return true; - if (l.size() != r.size()) { - return false; - } - for (auto const& c : l) { - if (r.find(c.first) == r.end()) { - return false; - } - } - return true; -} - -} // end namespace internal -} // end namespace benchmark diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/counter.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/counter.h deleted file mode 100644 index dd6865a31d7..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/counter.h +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "benchmark/benchmark.h" - -namespace benchmark { - -// these counter-related functions are hidden to reduce API surface. -namespace internal { -void Finish(UserCounters *l, double time, double num_threads); -void Increment(UserCounters *l, UserCounters const& r); -bool SameNames(UserCounters const& l, UserCounters const& r); -} // end namespace internal - -} //end namespace benchmark diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/csv_reporter.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/csv_reporter.cc deleted file mode 100644 index 35510645b08..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/csv_reporter.cc +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "benchmark/benchmark.h" -#include "complexity.h" - -#include -#include -#include -#include -#include -#include - -#include "string_util.h" -#include "timers.h" -#include "check.h" - -// File format reference: http://edoceo.com/utilitas/csv-file-format. - -namespace benchmark { - -namespace { -std::vector elements = { - "name", "iterations", "real_time", "cpu_time", - "time_unit", "bytes_per_second", "items_per_second", "label", - "error_occurred", "error_message"}; -} // namespace - -bool CSVReporter::ReportContext(const Context& context) { - PrintBasicContext(&GetErrorStream(), context); - return true; -} - -void CSVReporter::ReportRuns(const std::vector & reports) { - std::ostream& Out = GetOutputStream(); - - if (!printed_header_) { - // save the names of all the user counters - for (const auto& run : reports) { - for (const auto& cnt : run.counters) { - user_counter_names_.insert(cnt.first); - } - } - - // print the header - for (auto B = elements.begin(); B != elements.end();) { - Out << *B++; - if (B != elements.end()) Out << ","; - } - for (auto B = user_counter_names_.begin(); B != user_counter_names_.end();) { - Out << ",\"" << *B++ << "\""; - } - Out << "\n"; - - printed_header_ = true; - } else { - // check that all the current counters are saved in the name set - for (const auto& run : reports) { - for (const auto& cnt : run.counters) { - CHECK(user_counter_names_.find(cnt.first) != user_counter_names_.end()) - << "All counters must be present in each run. " - << "Counter named \"" << cnt.first - << "\" was not in a run after being added to the header"; - } - } - } - - // print results for each run - for (const auto& run : reports) { - PrintRunData(run); - } - -} - -void CSVReporter::PrintRunData(const Run & run) { - std::ostream& Out = GetOutputStream(); - - // Field with embedded double-quote characters must be doubled and the field - // delimited with double-quotes. - std::string name = run.benchmark_name; - ReplaceAll(&name, "\"", "\"\""); - Out << '"' << name << "\","; - if (run.error_occurred) { - Out << std::string(elements.size() - 3, ','); - Out << "true,"; - std::string msg = run.error_message; - ReplaceAll(&msg, "\"", "\"\""); - Out << '"' << msg << "\"\n"; - return; - } - - // Do not print iteration on bigO and RMS report - if (!run.report_big_o && !run.report_rms) { - Out << run.iterations; - } - Out << ","; - - Out << run.GetAdjustedRealTime() << ","; - Out << run.GetAdjustedCPUTime() << ","; - - // Do not print timeLabel on bigO and RMS report - if (run.report_big_o) { - Out << GetBigOString(run.complexity); - } else if (!run.report_rms) { - Out << GetTimeUnitString(run.time_unit); - } - Out << ","; - - if (run.bytes_per_second > 0.0) { - Out << run.bytes_per_second; - } - Out << ","; - if (run.items_per_second > 0.0) { - Out << run.items_per_second; - } - Out << ","; - if (!run.report_label.empty()) { - // Field with embedded double-quote characters must be doubled and the field - // delimited with double-quotes. 
- std::string label = run.report_label; - ReplaceAll(&label, "\"", "\"\""); - Out << "\"" << label << "\""; - } - Out << ",,"; // for error_occurred and error_message - - // Print user counters - for (const auto &ucn : user_counter_names_) { - auto it = run.counters.find(ucn); - if(it == run.counters.end()) { - Out << ","; - } else { - Out << "," << it->second; - } - } - Out << '\n'; -} - -} // end namespace benchmark diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/cycleclock.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/cycleclock.h deleted file mode 100644 index 4251fe4c32a..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/cycleclock.h +++ /dev/null @@ -1,172 +0,0 @@ -// ---------------------------------------------------------------------- -// CycleClock -// A CycleClock tells you the current time in Cycles. The "time" -// is actually time since power-on. This is like time() but doesn't -// involve a system call and is much more precise. -// -// NOTE: Not all cpu/platform/kernel combinations guarantee that this -// clock increments at a constant rate or is synchronized across all logical -// cpus in a system. -// -// If you need the above guarantees, please consider using a different -// API. There are efforts to provide an interface which provides a millisecond -// granularity and implemented as a memory read. A memory read is generally -// cheaper than the CycleClock for many architectures. -// -// Also, in some out of order CPU implementations, the CycleClock is not -// serializing. So if you're trying to count at cycles granularity, your -// data might be inaccurate due to out of order instruction execution. -// ---------------------------------------------------------------------- - -#ifndef BENCHMARK_CYCLECLOCK_H_ -#define BENCHMARK_CYCLECLOCK_H_ - -#include - -#include "benchmark/benchmark.h" -#include "internal_macros.h" - -#if defined(BENCHMARK_OS_MACOSX) -#include -#endif -// For MSVC, we want to use '_asm rdtsc' when possible (since it works -// with even ancient MSVC compilers), and when not possible the -// __rdtsc intrinsic, declared in . Unfortunately, in some -// environments, and have conflicting -// declarations of some other intrinsics, breaking compilation. -// Therefore, we simply declare __rdtsc ourselves. See also -// http://connect.microsoft.com/VisualStudio/feedback/details/262047 -#if defined(COMPILER_MSVC) && !defined(_M_IX86) -extern "C" uint64_t __rdtsc(); -#pragma intrinsic(__rdtsc) -#endif - -#ifndef BENCHMARK_OS_WINDOWS -#include -#include -#endif - -#ifdef BENCHMARK_OS_EMSCRIPTEN -#include -#endif - -namespace benchmark { -// NOTE: only i386 and x86_64 have been well tested. -// PPC, sparc, alpha, and ia64 are based on -// http://peter.kuscsik.com/wordpress/?p=14 -// with modifications by m3b. See also -// https://setisvn.ssl.berkeley.edu/svn/lib/fftw-3.0.1/kernel/cycle.h -namespace cycleclock { -// This should return the number of cycles since power-on. Thread-safe. -inline BENCHMARK_ALWAYS_INLINE int64_t Now() { -#if defined(BENCHMARK_OS_MACOSX) - // this goes at the top because we need ALL Macs, regardless of - // architecture, to return the number of "mach time units" that - // have passed since startup. See sysinfo.cc where - // InitializeSystemInfo() sets the supposed cpu clock frequency of - // macs to the number of mach time units per second, not actual - // CPU clock frequency (which can change in the face of CPU - // frequency scaling). 
Also note that when the Mac sleeps, this - // counter pauses; it does not continue counting, nor does it - // reset to zero. - return mach_absolute_time(); -#elif defined(BENCHMARK_OS_EMSCRIPTEN) - // this goes above x86-specific code because old versions of Emscripten - // define __x86_64__, although they have nothing to do with it. - return static_cast(emscripten_get_now() * 1e+6); -#elif defined(__i386__) - int64_t ret; - __asm__ volatile("rdtsc" : "=A"(ret)); - return ret; -#elif defined(__x86_64__) || defined(__amd64__) - uint64_t low, high; - __asm__ volatile("rdtsc" : "=a"(low), "=d"(high)); - return (high << 32) | low; -#elif defined(__powerpc__) || defined(__ppc__) - // This returns a time-base, which is not always precisely a cycle-count. - int64_t tbl, tbu0, tbu1; - asm("mftbu %0" : "=r"(tbu0)); - asm("mftb %0" : "=r"(tbl)); - asm("mftbu %0" : "=r"(tbu1)); - tbl &= -static_cast(tbu0 == tbu1); - // high 32 bits in tbu1; low 32 bits in tbl (tbu0 is garbage) - return (tbu1 << 32) | tbl; -#elif defined(__sparc__) - int64_t tick; - asm(".byte 0x83, 0x41, 0x00, 0x00"); - asm("mov %%g1, %0" : "=r"(tick)); - return tick; -#elif defined(__ia64__) - int64_t itc; - asm("mov %0 = ar.itc" : "=r"(itc)); - return itc; -#elif defined(COMPILER_MSVC) && defined(_M_IX86) - // Older MSVC compilers (like 7.x) don't seem to support the - // __rdtsc intrinsic properly, so I prefer to use _asm instead - // when I know it will work. Otherwise, I'll use __rdtsc and hope - // the code is being compiled with a non-ancient compiler. - _asm rdtsc -#elif defined(COMPILER_MSVC) - return __rdtsc(); -#elif defined(BENCHMARK_OS_NACL) - // Native Client validator on x86/x86-64 allows RDTSC instructions, - // and this case is handled above. Native Client validator on ARM - // rejects MRC instructions (used in the ARM-specific sequence below), - // so we handle it here. Portable Native Client compiles to - // architecture-agnostic bytecode, which doesn't provide any - // cycle counter access mnemonics. - - // Native Client does not provide any API to access cycle counter. - // Use clock_gettime(CLOCK_MONOTONIC, ...) instead of gettimeofday - // because is provides nanosecond resolution (which is noticable at - // least for PNaCl modules running on x86 Mac & Linux). - // Initialize to always return 0 if clock_gettime fails. - struct timespec ts = { 0, 0 }; - clock_gettime(CLOCK_MONOTONIC, &ts); - return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; -#elif defined(__aarch64__) - // System timer of ARMv8 runs at a different frequency than the CPU's. - // The frequency is fixed, typically in the range 1-50MHz. It can be - // read at CNTFRQ special register. We assume the OS has set up - // the virtual timer properly. - int64_t virtual_timer_value; - asm volatile("mrs %0, cntvct_el0" : "=r"(virtual_timer_value)); - return virtual_timer_value; -#elif defined(__ARM_ARCH) - // V6 is the earliest arch that has a standard cyclecount - // Native Client validator doesn't allow MRC instructions. -#if (__ARM_ARCH >= 6) - uint32_t pmccntr; - uint32_t pmuseren; - uint32_t pmcntenset; - // Read the user mode perf monitor counter access permissions. - asm volatile("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren)); - if (pmuseren & 1) { // Allows reading perfmon counters for user mode code. - asm volatile("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset)); - if (pmcntenset & 0x80000000ul) { // Is it counting? 
- asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr)); - // The counter is set up to count every 64th cycle - return static_cast(pmccntr) * 64; // Should optimize to << 6 - } - } -#endif - struct timeval tv; - gettimeofday(&tv, nullptr); - return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; -#elif defined(__mips__) - // mips apparently only allows rdtsc for superusers, so we fall - // back to gettimeofday. It's possible clock_gettime would be better. - struct timeval tv; - gettimeofday(&tv, nullptr); - return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; -#else -// The soft failover to a generic implementation is automatic only for ARM. -// For other platforms the developer is expected to make an attempt to create -// a fast implementation and use generic version if nothing better is available. -#error You need to define CycleTimer for your OS and CPU -#endif -} -} // end namespace cycleclock -} // end namespace benchmark - -#endif // BENCHMARK_CYCLECLOCK_H_ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/internal_macros.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/internal_macros.h deleted file mode 100644 index 942887457f1..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/internal_macros.h +++ /dev/null @@ -1,57 +0,0 @@ -#ifndef BENCHMARK_INTERNAL_MACROS_H_ -#define BENCHMARK_INTERNAL_MACROS_H_ - -#include "benchmark/benchmark.h" - -#ifndef __has_feature -#define __has_feature(x) 0 -#endif - -#if defined(__clang__) -#define COMPILER_CLANG -#elif defined(_MSC_VER) -#define COMPILER_MSVC -#elif defined(__GNUC__) -#define COMPILER_GCC -#endif - -#if __has_feature(cxx_attributes) -#define BENCHMARK_NORETURN [[noreturn]] -#elif defined(__GNUC__) -#define BENCHMARK_NORETURN __attribute__((noreturn)) -#elif defined(COMPILER_MSVC) -#define BENCHMARK_NORETURN __declspec(noreturn) -#else -#define BENCHMARK_NORETURN -#endif - -#if defined(__CYGWIN__) -#define BENCHMARK_OS_CYGWIN 1 -#elif defined(_WIN32) -#define BENCHMARK_OS_WINDOWS 1 -#elif defined(__APPLE__) -#include "TargetConditionals.h" - #if defined(TARGET_OS_MAC) - #define BENCHMARK_OS_MACOSX 1 - #if defined(TARGET_OS_IPHONE) - #define BENCHMARK_OS_IOS 1 - #endif - #endif -#elif defined(__FreeBSD__) -#define BENCHMARK_OS_FREEBSD 1 -#elif defined(__linux__) -#define BENCHMARK_OS_LINUX 1 -#elif defined(__native_client__) -#define BENCHMARK_OS_NACL 1 -#elif defined(EMSCRIPTEN) -#define BENCHMARK_OS_EMSCRIPTEN 1 -#elif defined(__rtems__) -#define BENCHMARK_OS_RTEMS 1 -#endif - -#if !__has_feature(cxx_exceptions) && !defined(__cpp_exceptions) \ - && !defined(__EXCEPTIONS) -#define BENCHMARK_HAS_NO_EXCEPTIONS -#endif - -#endif // BENCHMARK_INTERNAL_MACROS_H_ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/json_reporter.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/json_reporter.cc deleted file mode 100644 index edf6ecc836c..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/json_reporter.cc +++ /dev/null @@ -1,168 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "benchmark/benchmark.h" -#include "complexity.h" - -#include -#include -#include -#include -#include -#include - -#include "string_util.h" -#include "timers.h" - -namespace benchmark { - -namespace { - -std::string FormatKV(std::string const& key, std::string const& value) { - return StringPrintF("\"%s\": \"%s\"", key.c_str(), value.c_str()); -} - -std::string FormatKV(std::string const& key, const char* value) { - return StringPrintF("\"%s\": \"%s\"", key.c_str(), value); -} - -std::string FormatKV(std::string const& key, bool value) { - return StringPrintF("\"%s\": %s", key.c_str(), value ? "true" : "false"); -} - -std::string FormatKV(std::string const& key, int64_t value) { - std::stringstream ss; - ss << '"' << key << "\": " << value; - return ss.str(); -} - -std::string FormatKV(std::string const& key, double value) { - return StringPrintF("\"%s\": %.2f", key.c_str(), value); -} - -int64_t RoundDouble(double v) { return static_cast(v + 0.5); } - -} // end namespace - -bool JSONReporter::ReportContext(const Context& context) { - std::ostream& out = GetOutputStream(); - - out << "{\n"; - std::string inner_indent(2, ' '); - - // Open context block and print context information. - out << inner_indent << "\"context\": {\n"; - std::string indent(4, ' '); - - std::string walltime_value = LocalDateTimeString(); - out << indent << FormatKV("date", walltime_value) << ",\n"; - - out << indent << FormatKV("num_cpus", static_cast(context.num_cpus)) - << ",\n"; - out << indent << FormatKV("mhz_per_cpu", RoundDouble(context.mhz_per_cpu)) - << ",\n"; - out << indent << FormatKV("cpu_scaling_enabled", context.cpu_scaling_enabled) - << ",\n"; - -#if defined(NDEBUG) - const char build_type[] = "release"; -#else - const char build_type[] = "debug"; -#endif - out << indent << FormatKV("library_build_type", build_type) << "\n"; - // Close context block and open the list of benchmarks. - out << inner_indent << "},\n"; - out << inner_indent << "\"benchmarks\": [\n"; - return true; -} - -void JSONReporter::ReportRuns(std::vector const& reports) { - if (reports.empty()) { - return; - } - std::string indent(4, ' '); - std::ostream& out = GetOutputStream(); - if (!first_report_) { - out << ",\n"; - } - first_report_ = false; - - for (auto it = reports.begin(); it != reports.end(); ++it) { - out << indent << "{\n"; - PrintRunData(*it); - out << indent << '}'; - auto it_cp = it; - if (++it_cp != reports.end()) { - out << ",\n"; - } - } -} - -void JSONReporter::Finalize() { - // Close the list of benchmarks and the top level object. 
- GetOutputStream() << "\n ]\n}\n"; -} - -void JSONReporter::PrintRunData(Run const& run) { - std::string indent(6, ' '); - std::ostream& out = GetOutputStream(); - out << indent << FormatKV("name", run.benchmark_name) << ",\n"; - if (run.error_occurred) { - out << indent << FormatKV("error_occurred", run.error_occurred) << ",\n"; - out << indent << FormatKV("error_message", run.error_message) << ",\n"; - } - if (!run.report_big_o && !run.report_rms) { - out << indent << FormatKV("iterations", run.iterations) << ",\n"; - out << indent - << FormatKV("real_time", RoundDouble(run.GetAdjustedRealTime())) - << ",\n"; - out << indent - << FormatKV("cpu_time", RoundDouble(run.GetAdjustedCPUTime())); - out << ",\n" - << indent << FormatKV("time_unit", GetTimeUnitString(run.time_unit)); - } else if (run.report_big_o) { - out << indent - << FormatKV("cpu_coefficient", RoundDouble(run.GetAdjustedCPUTime())) - << ",\n"; - out << indent - << FormatKV("real_coefficient", RoundDouble(run.GetAdjustedRealTime())) - << ",\n"; - out << indent << FormatKV("big_o", GetBigOString(run.complexity)) << ",\n"; - out << indent << FormatKV("time_unit", GetTimeUnitString(run.time_unit)); - } else if (run.report_rms) { - out << indent - << FormatKV("rms", run.GetAdjustedCPUTime()); - } - if (run.bytes_per_second > 0.0) { - out << ",\n" - << indent - << FormatKV("bytes_per_second", RoundDouble(run.bytes_per_second)); - } - if (run.items_per_second > 0.0) { - out << ",\n" - << indent - << FormatKV("items_per_second", RoundDouble(run.items_per_second)); - } - for(auto &c : run.counters) { - out << ",\n" - << indent - << FormatKV(c.first, RoundDouble(c.second)); - } - if (!run.report_label.empty()) { - out << ",\n" << indent << FormatKV("label", run.report_label); - } - out << '\n'; -} - -} // end namespace benchmark diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/log.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/log.h deleted file mode 100644 index d06e1031db1..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/log.h +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef BENCHMARK_LOG_H_ -#define BENCHMARK_LOG_H_ - -#include -#include - -#include "benchmark/benchmark.h" - -namespace benchmark { -namespace internal { - -typedef std::basic_ostream&(EndLType)(std::basic_ostream&); - -class LogType { - friend LogType& GetNullLogInstance(); - friend LogType& GetErrorLogInstance(); - - // FIXME: Add locking to output. 
- template <class Tp>
- friend LogType& operator<<(LogType&, Tp const&);
- friend LogType& operator<<(LogType&, EndLType*);
-
- private:
- LogType(std::ostream* out) : out_(out) {}
- std::ostream* out_;
- BENCHMARK_DISALLOW_COPY_AND_ASSIGN(LogType);
-};
-
-template <class Tp>
-LogType& operator<<(LogType& log, Tp const& value) {
- if (log.out_) {
- *log.out_ << value;
- }
- return log;
-}
-
-inline LogType& operator<<(LogType& log, EndLType* m) {
- if (log.out_) {
- *log.out_ << m;
- }
- return log;
-}
-
-inline int& LogLevel() {
- static int log_level = 0;
- return log_level;
-}
-
-inline LogType& GetNullLogInstance() {
- static LogType log(nullptr);
- return log;
-}
-
-inline LogType& GetErrorLogInstance() {
- static LogType log(&std::clog);
- return log;
-}
-
-inline LogType& GetLogInstanceForLevel(int level) {
- if (level <= LogLevel()) {
- return GetErrorLogInstance();
- }
- return GetNullLogInstance();
-}
-
-} // end namespace internal
-} // end namespace benchmark
-
-#define VLOG(x) \
- (::benchmark::internal::GetLogInstanceForLevel(x) << "-- LOG(" << x << "):" \
- " ")
-
-#endif
diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/mutex.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/mutex.h
deleted file mode 100644
index 5f461d05a0c..00000000000
--- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/mutex.h
+++ /dev/null
@@ -1,155 +0,0 @@
-#ifndef BENCHMARK_MUTEX_H_
-#define BENCHMARK_MUTEX_H_
-
-#include <condition_variable>
-#include <mutex>
-
-#include "check.h"
-
-// Enable thread safety attributes only with clang.
-// The attributes can be safely erased when compiling with other compilers.
-#if defined(HAVE_THREAD_SAFETY_ATTRIBUTES)
-#define THREAD_ANNOTATION_ATTRIBUTE__(x) __attribute__((x))
-#else
-#define THREAD_ANNOTATION_ATTRIBUTE__(x) // no-op
-#endif
-
-#define CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(capability(x))
-
-#define SCOPED_CAPABILITY THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable)
-
-#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x))
-
-#define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded_by(x))
-
-#define ACQUIRED_BEFORE(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(__VA_ARGS__))
-
-#define ACQUIRED_AFTER(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(acquired_after(__VA_ARGS__))
-
-#define REQUIRES(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(requires_capability(__VA_ARGS__))
-
-#define REQUIRES_SHARED(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(requires_shared_capability(__VA_ARGS__))
-
-#define ACQUIRE(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(acquire_capability(__VA_ARGS__))
-
-#define ACQUIRE_SHARED(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(acquire_shared_capability(__VA_ARGS__))
-
-#define RELEASE(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(release_capability(__VA_ARGS__))
-
-#define RELEASE_SHARED(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(release_shared_capability(__VA_ARGS__))
-
-#define TRY_ACQUIRE(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_capability(__VA_ARGS__))
-
-#define TRY_ACQUIRE_SHARED(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_shared_capability(__VA_ARGS__))
-
-#define EXCLUDES(...) \
THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(__VA_ARGS__)) - -#define ASSERT_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(assert_capability(x)) - -#define ASSERT_SHARED_CAPABILITY(x) \ - THREAD_ANNOTATION_ATTRIBUTE__(assert_shared_capability(x)) - -#define RETURN_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x)) - -#define NO_THREAD_SAFETY_ANALYSIS \ - THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis) - -namespace benchmark { - -typedef std::condition_variable Condition; - -// NOTE: Wrappers for std::mutex and std::unique_lock are provided so that -// we can annotate them with thread safety attributes and use the -// -Wthread-safety warning with clang. The standard library types cannot be -// used directly because they do not provided the required annotations. -class CAPABILITY("mutex") Mutex { - public: - Mutex() {} - - void lock() ACQUIRE() { mut_.lock(); } - void unlock() RELEASE() { mut_.unlock(); } - std::mutex& native_handle() { return mut_; } - - private: - std::mutex mut_; -}; - -class SCOPED_CAPABILITY MutexLock { - typedef std::unique_lock MutexLockImp; - - public: - MutexLock(Mutex& m) ACQUIRE(m) : ml_(m.native_handle()) {} - ~MutexLock() RELEASE() {} - MutexLockImp& native_handle() { return ml_; } - - private: - MutexLockImp ml_; -}; - -class Barrier { - public: - Barrier(int num_threads) : running_threads_(num_threads) {} - - // Called by each thread - bool wait() EXCLUDES(lock_) { - bool last_thread = false; - { - MutexLock ml(lock_); - last_thread = createBarrier(ml); - } - if (last_thread) phase_condition_.notify_all(); - return last_thread; - } - - void removeThread() EXCLUDES(lock_) { - MutexLock ml(lock_); - --running_threads_; - if (entered_ != 0) phase_condition_.notify_all(); - } - - private: - Mutex lock_; - Condition phase_condition_; - int running_threads_; - - // State for barrier management - int phase_number_ = 0; - int entered_ = 0; // Number of threads that have entered this barrier - - // Enter the barrier and wait until all other threads have also - // entered the barrier. Returns iff this is the last thread to - // enter the barrier. - bool createBarrier(MutexLock& ml) REQUIRES(lock_) { - CHECK_LT(entered_, running_threads_); - entered_++; - if (entered_ < running_threads_) { - // Wait for all threads to enter - int phase_number_cp = phase_number_; - auto cb = [this, phase_number_cp]() { - return this->phase_number_ > phase_number_cp || - entered_ == running_threads_; // A thread has aborted in error - }; - phase_condition_.wait(ml.native_handle(), cb); - if (phase_number_ > phase_number_cp) return false; - // else (running_threads_ == entered_) and we are the last thread. - } - // Last thread has reached the barrier - phase_number_++; - entered_ = 0; - return true; - } -}; - -} // end namespace benchmark - -#endif // BENCHMARK_MUTEX_H_ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/re.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/re.h deleted file mode 100644 index 01e9736505e..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/re.h +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef BENCHMARK_RE_H_ -#define BENCHMARK_RE_H_ - -#include "internal_macros.h" - -// Prefer C regex libraries when compiling w/o exceptions so that we can -// correctly report errors. -#if defined(BENCHMARK_HAS_NO_EXCEPTIONS) && defined(HAVE_STD_REGEX) && \ - (defined(HAVE_GNU_POSIX_REGEX) || defined(HAVE_POSIX_REGEX)) -#undef HAVE_STD_REGEX -#endif - -#if defined(HAVE_STD_REGEX) -#include -#elif defined(HAVE_GNU_POSIX_REGEX) -#include -#elif defined(HAVE_POSIX_REGEX) -#include -#else -#error No regular expression backend was found! -#endif -#include - -#include "check.h" - -namespace benchmark { - -// A wrapper around the POSIX regular expression API that provides automatic -// cleanup -class Regex { - public: - Regex() : init_(false) {} - - ~Regex(); - - // Compile a regular expression matcher from spec. Returns true on success. - // - // On failure (and if error is not nullptr), error is populated with a human - // readable error message if an error occurs. - bool Init(const std::string& spec, std::string* error); - - // Returns whether str matches the compiled regular expression. - bool Match(const std::string& str); - - private: - bool init_; -// Underlying regular expression object -#if defined(HAVE_STD_REGEX) - std::regex re_; -#elif defined(HAVE_POSIX_REGEX) || defined(HAVE_GNU_POSIX_REGEX) - regex_t re_; -#else -#error No regular expression backend implementation available -#endif -}; - -#if defined(HAVE_STD_REGEX) - -inline bool Regex::Init(const std::string& spec, std::string* error) { -#ifdef BENCHMARK_HAS_NO_EXCEPTIONS - ((void)error); // suppress unused warning -#else - try { -#endif - re_ = std::regex(spec, std::regex_constants::extended); - init_ = true; -#ifndef BENCHMARK_HAS_NO_EXCEPTIONS - } catch (const std::regex_error& e) { - if (error) { - *error = e.what(); - } - } -#endif - return init_; -} - -inline Regex::~Regex() {} - -inline bool Regex::Match(const std::string& str) { - if (!init_) { - return false; - } - return std::regex_search(str, re_); -} - -#else -inline bool Regex::Init(const std::string& spec, std::string* error) { - int ec = regcomp(&re_, spec.c_str(), REG_EXTENDED | REG_NOSUB); - if (ec != 0) { - if (error) { - size_t needed = regerror(ec, &re_, nullptr, 0); - char* errbuf = new char[needed]; - regerror(ec, &re_, errbuf, needed); - - // regerror returns the number of bytes necessary to null terminate - // the string, so we move that when assigning to error. 
- CHECK_NE(needed, 0); - error->assign(errbuf, needed - 1); - - delete[] errbuf; - } - - return false; - } - - init_ = true; - return true; -} - -inline Regex::~Regex() { - if (init_) { - regfree(&re_); - } -} - -inline bool Regex::Match(const std::string& str) { - if (!init_) { - return false; - } - return regexec(&re_, str.c_str(), 0, nullptr, 0) == 0; -} -#endif - -} // end namespace benchmark - -#endif // BENCHMARK_RE_H_ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/reporter.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/reporter.cc deleted file mode 100644 index aacd453179d..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/reporter.cc +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "benchmark/benchmark.h" -#include "timers.h" - -#include - -#include -#include -#include - -#include "check.h" -#include "stat.h" - -namespace benchmark { - -BenchmarkReporter::BenchmarkReporter() - : output_stream_(&std::cout), error_stream_(&std::cerr) {} - -BenchmarkReporter::~BenchmarkReporter() {} - -void BenchmarkReporter::PrintBasicContext(std::ostream *out, - Context const &context) { - CHECK(out) << "cannot be null"; - auto &Out = *out; - - Out << "Run on (" << context.num_cpus << " X " << context.mhz_per_cpu - << " MHz CPU " << ((context.num_cpus > 1) ? "s" : "") << ")\n"; - - Out << LocalDateTimeString() << "\n"; - - if (context.cpu_scaling_enabled) { - Out << "***WARNING*** CPU scaling is enabled, the benchmark " - "real time measurements may be noisy and will incur extra " - "overhead.\n"; - } - -#ifndef NDEBUG - Out << "***WARNING*** Library was built as DEBUG. Timings may be " - "affected.\n"; -#endif -} - -double BenchmarkReporter::Run::GetAdjustedRealTime() const { - double new_time = real_accumulated_time * GetTimeUnitMultiplier(time_unit); - if (iterations != 0) new_time /= static_cast(iterations); - return new_time; -} - -double BenchmarkReporter::Run::GetAdjustedCPUTime() const { - double new_time = cpu_accumulated_time * GetTimeUnitMultiplier(time_unit); - if (iterations != 0) new_time /= static_cast(iterations); - return new_time; -} - -} // end namespace benchmark diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/sleep.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/sleep.cc deleted file mode 100644 index 54aa04a4224..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/sleep.cc +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "sleep.h" - -#include -#include -#include - -#include "internal_macros.h" - -#ifdef BENCHMARK_OS_WINDOWS -#include -#endif - -namespace benchmark { -#ifdef BENCHMARK_OS_WINDOWS -// Window's Sleep takes milliseconds argument. -void SleepForMilliseconds(int milliseconds) { Sleep(milliseconds); } -void SleepForSeconds(double seconds) { - SleepForMilliseconds(static_cast(kNumMillisPerSecond * seconds)); -} -#else // BENCHMARK_OS_WINDOWS -void SleepForMicroseconds(int microseconds) { - struct timespec sleep_time; - sleep_time.tv_sec = microseconds / kNumMicrosPerSecond; - sleep_time.tv_nsec = (microseconds % kNumMicrosPerSecond) * kNumNanosPerMicro; - while (nanosleep(&sleep_time, &sleep_time) != 0 && errno == EINTR) - ; // Ignore signals and wait for the full interval to elapse. -} - -void SleepForMilliseconds(int milliseconds) { - SleepForMicroseconds(milliseconds * kNumMicrosPerMilli); -} - -void SleepForSeconds(double seconds) { - SleepForMicroseconds(static_cast(seconds * kNumMicrosPerSecond)); -} -#endif // BENCHMARK_OS_WINDOWS -} // end namespace benchmark diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/sleep.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/sleep.h deleted file mode 100644 index f98551afe28..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/sleep.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef BENCHMARK_SLEEP_H_ -#define BENCHMARK_SLEEP_H_ - -namespace benchmark { -const int kNumMillisPerSecond = 1000; -const int kNumMicrosPerMilli = 1000; -const int kNumMicrosPerSecond = kNumMillisPerSecond * 1000; -const int kNumNanosPerMicro = 1000; -const int kNumNanosPerSecond = kNumNanosPerMicro * kNumMicrosPerSecond; - -void SleepForMilliseconds(int milliseconds); -void SleepForSeconds(double seconds); -} // end namespace benchmark - -#endif // BENCHMARK_SLEEP_H_ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/stat.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/stat.h deleted file mode 100644 index d356875b632..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/stat.h +++ /dev/null @@ -1,310 +0,0 @@ -#ifndef BENCHMARK_STAT_H_ -#define BENCHMARK_STAT_H_ - -#include -#include -#include -#include - -namespace benchmark { - -template -class Stat1; - -template -class Stat1MinMax; - -typedef Stat1 Stat1_f; -typedef Stat1 Stat1_d; -typedef Stat1MinMax Stat1MinMax_f; -typedef Stat1MinMax Stat1MinMax_d; - -template -class Vector2; -template -class Vector3; -template -class Vector4; - -template -class Stat1 { - public: - typedef Stat1 Self; - - Stat1() { Clear(); } - // Create a sample of value dat and weight 1 - explicit Stat1(const VType &dat) { - sum_ = dat; - sum_squares_ = Sqr(dat); - numsamples_ = 1; - } - // Create statistics for all the samples between begin (included) - // and end(excluded) - explicit Stat1(const VType *begin, const VType *end) { - Clear(); - for (const VType *item = begin; item < end; ++item) { - (*this) += Stat1(*item); - } - } - // Create a sample of value dat and weight w - Stat1(const 
VType &dat, const NumType &w) { - sum_ = w * dat; - sum_squares_ = w * Sqr(dat); - numsamples_ = w; - } - // Copy operator - Stat1(const Self &stat) { - sum_ = stat.sum_; - sum_squares_ = stat.sum_squares_; - numsamples_ = stat.numsamples_; - } - - void Clear() { - numsamples_ = NumType(); - sum_squares_ = sum_ = VType(); - } - - Self &operator=(const Self &stat) { - sum_ = stat.sum_; - sum_squares_ = stat.sum_squares_; - numsamples_ = stat.numsamples_; - return (*this); - } - // Merge statistics from two sample sets. - Self &operator+=(const Self &stat) { - sum_ += stat.sum_; - sum_squares_ += stat.sum_squares_; - numsamples_ += stat.numsamples_; - return (*this); - } - // The operation opposite to += - Self &operator-=(const Self &stat) { - sum_ -= stat.sum_; - sum_squares_ -= stat.sum_squares_; - numsamples_ -= stat.numsamples_; - return (*this); - } - // Multiply the weight of the set of samples by a factor k - Self &operator*=(const VType &k) { - sum_ *= k; - sum_squares_ *= k; - numsamples_ *= k; - return (*this); - } - - // Merge statistics from two sample sets. - Self operator+(const Self &stat) const { return Self(*this) += stat; } - - // The operation opposite to + - Self operator-(const Self &stat) const { return Self(*this) -= stat; } - - // Multiply the weight of the set of samples by a factor k - Self operator*(const VType &k) const { return Self(*this) *= k; } - - // Return the total weight of this sample set - NumType numSamples() const { return numsamples_; } - - // Return the sum of this sample set - VType Sum() const { return sum_; } - - // Return the mean of this sample set - VType Mean() const { - if (numsamples_ == 0) return VType(); - return sum_ * (1.0 / numsamples_); - } - - // Return the mean of this sample set and compute the standard deviation at - // the same time. - VType Mean(VType *stddev) const { - if (numsamples_ == 0) return VType(); - VType mean = sum_ * (1.0 / numsamples_); - if (stddev) { - // Sample standard deviation is undefined for n = 1 - if (numsamples_ == 1) { - *stddev = VType(); - } else { - VType avg_squares = sum_squares_ * (1.0 / numsamples_); - *stddev = Sqrt(numsamples_ / (numsamples_ - 1.0) * (avg_squares - Sqr(mean))); - } - } - return mean; - } - - // Return the standard deviation of the sample set - VType StdDev() const { - VType stddev = VType(); - Mean(&stddev); - return stddev; - } - - private: - static_assert(std::is_integral::value && - !std::is_same::value, - "NumType must be an integral type that is not bool."); - // Let i be the index of the samples provided (using +=) - // and weight[i],value[i] be the data of sample #i - // then the variables have the following meaning: - NumType numsamples_; // sum of weight[i]; - VType sum_; // sum of weight[i]*value[i]; - VType sum_squares_; // sum of weight[i]*value[i]^2; - - // Template function used to square a number. - // For a vector we square all components - template - static inline SType Sqr(const SType &dat) { - return dat * dat; - } - - template - static inline Vector2 Sqr(const Vector2 &dat) { - return dat.MulComponents(dat); - } - - template - static inline Vector3 Sqr(const Vector3 &dat) { - return dat.MulComponents(dat); - } - - template - static inline Vector4 Sqr(const Vector4 &dat) { - return dat.MulComponents(dat); - } - - // Template function used to take the square root of a number. 
- // For a vector we square all components - template - static inline SType Sqrt(const SType &dat) { - // Avoid NaN due to imprecision in the calculations - if (dat < 0) return 0; - return sqrt(dat); - } - - template - static inline Vector2 Sqrt(const Vector2 &dat) { - // Avoid NaN due to imprecision in the calculations - return Max(dat, Vector2()).Sqrt(); - } - - template - static inline Vector3 Sqrt(const Vector3 &dat) { - // Avoid NaN due to imprecision in the calculations - return Max(dat, Vector3()).Sqrt(); - } - - template - static inline Vector4 Sqrt(const Vector4 &dat) { - // Avoid NaN due to imprecision in the calculations - return Max(dat, Vector4()).Sqrt(); - } -}; - -// Useful printing function -template -std::ostream &operator<<(std::ostream &out, const Stat1 &s) { - out << "{ avg = " << s.Mean() << " std = " << s.StdDev() - << " nsamples = " << s.NumSamples() << "}"; - return out; -} - -// Stat1MinMax: same as Stat1, but it also -// keeps the Min and Max values; the "-" -// operator is disabled because it cannot be implemented -// efficiently -template -class Stat1MinMax : public Stat1 { - public: - typedef Stat1MinMax Self; - - Stat1MinMax() { Clear(); } - // Create a sample of value dat and weight 1 - explicit Stat1MinMax(const VType &dat) : Stat1(dat) { - max_ = dat; - min_ = dat; - } - // Create statistics for all the samples between begin (included) - // and end(excluded) - explicit Stat1MinMax(const VType *begin, const VType *end) { - Clear(); - for (const VType *item = begin; item < end; ++item) { - (*this) += Stat1MinMax(*item); - } - } - // Create a sample of value dat and weight w - Stat1MinMax(const VType &dat, const NumType &w) - : Stat1(dat, w) { - max_ = dat; - min_ = dat; - } - // Copy operator - Stat1MinMax(const Self &stat) : Stat1(stat) { - max_ = stat.max_; - min_ = stat.min_; - } - - void Clear() { - Stat1::Clear(); - if (std::numeric_limits::has_infinity) { - min_ = std::numeric_limits::infinity(); - max_ = -std::numeric_limits::infinity(); - } else { - min_ = std::numeric_limits::max(); - max_ = std::numeric_limits::min(); - } - } - - Self &operator=(const Self &stat) { - this->Stat1::operator=(stat); - max_ = stat.max_; - min_ = stat.min_; - return (*this); - } - // Merge statistics from two sample sets. - Self &operator+=(const Self &stat) { - this->Stat1::operator+=(stat); - if (stat.max_ > max_) max_ = stat.max_; - if (stat.min_ < min_) min_ = stat.min_; - return (*this); - } - // Multiply the weight of the set of samples by a factor k - Self &operator*=(const VType &stat) { - this->Stat1::operator*=(stat); - return (*this); - } - // Merge statistics from two sample sets. - Self operator+(const Self &stat) const { return Self(*this) += stat; } - // Multiply the weight of the set of samples by a factor k - Self operator*(const VType &k) const { return Self(*this) *= k; } - - // Return the maximal value in this sample set - VType Max() const { return max_; } - // Return the minimal value in this sample set - VType Min() const { return min_; } - - private: - // The - operation makes no sense with Min/Max - // unless we keep the full list of values (but we don't) - // make it private, and let it undefined so nobody can call it - Self &operator-=(const Self &stat); // senseless. let it undefined. - - // The operation opposite to - - Self operator-(const Self &stat) const; // senseless. let it undefined. 
- - // Let i be the index of the samples provided (using +=) - // and weight[i],value[i] be the data of sample #i - // then the variables have the following meaning: - VType max_; // max of value[i] - VType min_; // min of value[i] -}; - -// Useful printing function -template -std::ostream &operator<<(std::ostream &out, - const Stat1MinMax &s) { - out << "{ avg = " << s.Mean() << " std = " << s.StdDev() - << " nsamples = " << s.NumSamples() << " min = " << s.Min() - << " max = " << s.Max() << "}"; - return out; -} -} // end namespace benchmark - -#endif // BENCHMARK_STAT_H_ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/string_util.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/string_util.cc deleted file mode 100644 index cd4e7cfde57..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/string_util.cc +++ /dev/null @@ -1,172 +0,0 @@ -#include "string_util.h" - -#include -#include -#include -#include -#include -#include - -#include "arraysize.h" - -namespace benchmark { -namespace { - -// kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta. -const char kBigSIUnits[] = "kMGTPEZY"; -// Kibi, Mebi, Gibi, Tebi, Pebi, Exbi, Zebi, Yobi. -const char kBigIECUnits[] = "KMGTPEZY"; -// milli, micro, nano, pico, femto, atto, zepto, yocto. -const char kSmallSIUnits[] = "munpfazy"; - -// We require that all three arrays have the same size. -static_assert(arraysize(kBigSIUnits) == arraysize(kBigIECUnits), - "SI and IEC unit arrays must be the same size"); -static_assert(arraysize(kSmallSIUnits) == arraysize(kBigSIUnits), - "Small SI and Big SI unit arrays must be the same size"); - -static const int64_t kUnitsSize = arraysize(kBigSIUnits); - -} // end anonymous namespace - -void ToExponentAndMantissa(double val, double thresh, int precision, - double one_k, std::string* mantissa, - int64_t* exponent) { - std::stringstream mantissa_stream; - - if (val < 0) { - mantissa_stream << "-"; - val = -val; - } - - // Adjust threshold so that it never excludes things which can't be rendered - // in 'precision' digits. - const double adjusted_threshold = - std::max(thresh, 1.0 / std::pow(10.0, precision)); - const double big_threshold = adjusted_threshold * one_k; - const double small_threshold = adjusted_threshold; - // Values in ]simple_threshold,small_threshold[ will be printed as-is - const double simple_threshold = 0.01; - - if (val > big_threshold) { - // Positive powers - double scaled = val; - for (size_t i = 0; i < arraysize(kBigSIUnits); ++i) { - scaled /= one_k; - if (scaled <= big_threshold) { - mantissa_stream << scaled; - *exponent = i + 1; - *mantissa = mantissa_stream.str(); - return; - } - } - mantissa_stream << val; - *exponent = 0; - } else if (val < small_threshold) { - // Negative powers - if (val < simple_threshold) { - double scaled = val; - for (size_t i = 0; i < arraysize(kSmallSIUnits); ++i) { - scaled *= one_k; - if (scaled >= small_threshold) { - mantissa_stream << scaled; - *exponent = -static_cast(i + 1); - *mantissa = mantissa_stream.str(); - return; - } - } - } - mantissa_stream << val; - *exponent = 0; - } else { - mantissa_stream << val; - *exponent = 0; - } - *mantissa = mantissa_stream.str(); -} - -std::string ExponentToPrefix(int64_t exponent, bool iec) { - if (exponent == 0) return ""; - - const int64_t index = (exponent > 0 ? exponent - 1 : -exponent - 1); - if (index >= kUnitsSize) return ""; - - const char* array = - (exponent > 0 ? (iec ? 
kBigIECUnits : kBigSIUnits) : kSmallSIUnits);
- if (iec)
- return array[index] + std::string("i");
- else
- return std::string(1, array[index]);
-}
-
-std::string ToBinaryStringFullySpecified(double value, double threshold,
- int precision) {
- std::string mantissa;
- int64_t exponent;
- ToExponentAndMantissa(value, threshold, precision, 1024.0, &mantissa,
- &exponent);
- return mantissa + ExponentToPrefix(exponent, false);
-}
-
-void AppendHumanReadable(int n, std::string* str) {
- std::stringstream ss;
- // Round down to the nearest SI prefix.
- ss << ToBinaryStringFullySpecified(n, 1.0, 0);
- *str += ss.str();
-}
-
-std::string HumanReadableNumber(double n) {
- // 1.1 means that figures up to 1.1k should be shown with the next unit down;
- // this softens edge effects.
- // 1 means that we should show one decimal place of precision.
- return ToBinaryStringFullySpecified(n, 1.1, 1);
-}
-
-std::string StringPrintFImp(const char* msg, va_list args) {
- // we might need a second shot at this, so pre-emptivly make a copy
- va_list args_cp;
- va_copy(args_cp, args);
-
- // TODO(ericwf): use std::array for first attempt to avoid one memory
- // allocation guess what the size might be
- std::array<char, 256> local_buff;
- std::size_t size = local_buff.size();
- // 2015-10-08: vsnprintf is used instead of snd::vsnprintf due to a limitation
- // in the android-ndk
- auto ret = vsnprintf(local_buff.data(), size, msg, args_cp);
-
- va_end(args_cp);
-
- // handle empty expansion
- if (ret == 0) return std::string{};
- if (static_cast<std::size_t>(ret) < size)
- return std::string(local_buff.data());
-
- // we did not provide a long enough buffer on our first attempt.
- // add 1 to size to account for null-byte in size cast to prevent overflow
- size = static_cast<std::size_t>(ret) + 1;
- auto buff_ptr = std::unique_ptr<char[]>(new char[size]);
- // 2015-10-08: vsnprintf is used instead of snd::vsnprintf due to a limitation
- // in the android-ndk
- ret = vsnprintf(buff_ptr.get(), size, msg, args);
- return std::string(buff_ptr.get());
-}
-
-std::string StringPrintF(const char* format, ...) {
- va_list args;
- va_start(args, format);
- std::string tmp = StringPrintFImp(format, args);
- va_end(args);
- return tmp;
-}
-
-void ReplaceAll(std::string* str, const std::string& from,
- const std::string& to) {
- std::size_t start = 0;
- while ((start = str->find(from, start)) != std::string::npos) {
- str->replace(start, from.length(), to);
- start += to.length();
- }
-}
-
-} // end namespace benchmark
diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/string_util.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/string_util.h
deleted file mode 100644
index 0b190b91a16..00000000000
--- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/string_util.h
+++ /dev/null
@@ -1,40 +0,0 @@
-#ifndef BENCHMARK_STRING_UTIL_H_
-#define BENCHMARK_STRING_UTIL_H_
-
-#include <sstream>
-#include <string>
-#include <utility>
-#include "internal_macros.h"
-
-namespace benchmark {
-
-void AppendHumanReadable(int n, std::string* str);
-
-std::string HumanReadableNumber(double n);
-
-std::string StringPrintF(const char* format, ...);
-
-inline std::ostream& StringCatImp(std::ostream& out) BENCHMARK_NOEXCEPT {
- return out;
-}
-
-template <class First, class... Rest>
-inline std::ostream& StringCatImp(std::ostream& out, First&& f,
- Rest&&... rest) {
- out << std::forward<First>(f);
- return StringCatImp(out, std::forward<Rest>(rest)...);
-}
-
-template <class... Args>
-inline std::string StrCat(Args&&...
args) { - std::ostringstream ss; - StringCatImp(ss, std::forward(args)...); - return ss.str(); -} - -void ReplaceAll(std::string* str, const std::string& from, - const std::string& to); - -} // end namespace benchmark - -#endif // BENCHMARK_STRING_UTIL_H_ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/sysinfo.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/sysinfo.cc deleted file mode 100644 index 7feb79e65f2..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/sysinfo.cc +++ /dev/null @@ -1,355 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "sysinfo.h" -#include "internal_macros.h" - -#ifdef BENCHMARK_OS_WINDOWS -#include -#include -#include -#else -#include -#include -#include -#include // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD -#include -#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX -#include -#endif -#endif - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "arraysize.h" -#include "check.h" -#include "cycleclock.h" -#include "internal_macros.h" -#include "log.h" -#include "sleep.h" -#include "string_util.h" - -namespace benchmark { -namespace { -std::once_flag cpuinfo_init; -double cpuinfo_cycles_per_second = 1.0; -int cpuinfo_num_cpus = 1; // Conservative guess - -#if !defined BENCHMARK_OS_MACOSX -const int64_t estimate_time_ms = 1000; - -// Helper function estimates cycles/sec by observing cycles elapsed during -// sleep(). Using small sleep time decreases accuracy significantly. -int64_t EstimateCyclesPerSecond() { - const int64_t start_ticks = cycleclock::Now(); - SleepForMilliseconds(estimate_time_ms); - return cycleclock::Now() - start_ticks; -} -#endif - -#if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN -// Helper function for reading an int from a file. Returns true if successful -// and the memory location pointed to by value is set to the value read. 
-bool ReadIntFromFile(const char* file, long* value) { - bool ret = false; - int fd = open(file, O_RDONLY); - if (fd != -1) { - char line[1024]; - char* err; - memset(line, '\0', sizeof(line)); - ssize_t read_err = read(fd, line, sizeof(line) - 1); - ((void)read_err); // prevent unused warning - CHECK(read_err >= 0); - const long temp_value = strtol(line, &err, 10); - if (line[0] != '\0' && (*err == '\n' || *err == '\0')) { - *value = temp_value; - ret = true; - } - close(fd); - } - return ret; -} -#endif - -#if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN -static std::string convertToLowerCase(std::string s) { - for (auto& ch : s) - ch = std::tolower(ch); - return s; -} -static bool startsWithKey(std::string Value, std::string Key, - bool IgnoreCase = true) { - if (IgnoreCase) { - Key = convertToLowerCase(std::move(Key)); - Value = convertToLowerCase(std::move(Value)); - } - return Value.compare(0, Key.size(), Key) == 0; -} -#endif - -void InitializeSystemInfo() { -#if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN - char line[1024]; - char* err; - long freq; - - bool saw_mhz = false; - - // If the kernel is exporting the tsc frequency use that. There are issues - // where cpuinfo_max_freq cannot be relied on because the BIOS may be - // exporintg an invalid p-state (on x86) or p-states may be used to put the - // processor in a new mode (turbo mode). Essentially, those frequencies - // cannot always be relied upon. The same reasons apply to /proc/cpuinfo as - // well. - if (!saw_mhz && - ReadIntFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq)) { - // The value is in kHz (as the file name suggests). For example, on a - // 2GHz warpstation, the file contains the value "2000000". - cpuinfo_cycles_per_second = freq * 1000.0; - saw_mhz = true; - } - - // If CPU scaling is in effect, we want to use the *maximum* frequency, - // not whatever CPU speed some random processor happens to be using now. - if (!saw_mhz && - ReadIntFromFile("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", - &freq)) { - // The value is in kHz. For example, on a 2GHz warpstation, the file - // contains the value "2000000". - cpuinfo_cycles_per_second = freq * 1000.0; - saw_mhz = true; - } - - // Read /proc/cpuinfo for other values, and if there is no cpuinfo_max_freq. 
- const char* pname = "/proc/cpuinfo"; - int fd = open(pname, O_RDONLY); - if (fd == -1) { - perror(pname); - if (!saw_mhz) { - cpuinfo_cycles_per_second = - static_cast(EstimateCyclesPerSecond()); - } - return; - } - - double bogo_clock = 1.0; - bool saw_bogo = false; - long max_cpu_id = 0; - int num_cpus = 0; - line[0] = line[1] = '\0'; - size_t chars_read = 0; - do { // we'll exit when the last read didn't read anything - // Move the next line to the beginning of the buffer - const size_t oldlinelen = strlen(line); - if (sizeof(line) == oldlinelen + 1) // oldlinelen took up entire line - line[0] = '\0'; - else // still other lines left to save - memmove(line, line + oldlinelen + 1, sizeof(line) - (oldlinelen + 1)); - // Terminate the new line, reading more if we can't find the newline - char* newline = strchr(line, '\n'); - if (newline == nullptr) { - const size_t linelen = strlen(line); - const size_t bytes_to_read = sizeof(line) - 1 - linelen; - CHECK(bytes_to_read > 0); // because the memmove recovered >=1 bytes - chars_read = read(fd, line + linelen, bytes_to_read); - line[linelen + chars_read] = '\0'; - newline = strchr(line, '\n'); - } - if (newline != nullptr) *newline = '\0'; - - // When parsing the "cpu MHz" and "bogomips" (fallback) entries, we only - // accept postive values. Some environments (virtual machines) report zero, - // which would cause infinite looping in WallTime_Init. - if (!saw_mhz && startsWithKey(line, "cpu MHz")) { - const char* freqstr = strchr(line, ':'); - if (freqstr) { - cpuinfo_cycles_per_second = strtod(freqstr + 1, &err) * 1000000.0; - if (freqstr[1] != '\0' && *err == '\0' && cpuinfo_cycles_per_second > 0) - saw_mhz = true; - } - } else if (startsWithKey(line, "bogomips")) { - const char* freqstr = strchr(line, ':'); - if (freqstr) { - bogo_clock = strtod(freqstr + 1, &err) * 1000000.0; - if (freqstr[1] != '\0' && *err == '\0' && bogo_clock > 0) - saw_bogo = true; - } - } else if (startsWithKey(line, "processor", /*IgnoreCase*/false)) { - // The above comparison is case-sensitive because ARM kernels often - // include a "Processor" line that tells you about the CPU, distinct - // from the usual "processor" lines that give you CPU ids. No current - // Linux architecture is using "Processor" for CPU ids. - num_cpus++; // count up every time we see an "processor :" entry - const char* id_str = strchr(line, ':'); - if (id_str) { - const long cpu_id = strtol(id_str + 1, &err, 10); - if (id_str[1] != '\0' && *err == '\0' && max_cpu_id < cpu_id) - max_cpu_id = cpu_id; - } - } - } while (chars_read > 0); - close(fd); - - if (!saw_mhz) { - if (saw_bogo) { - // If we didn't find anything better, we'll use bogomips, but - // we're not happy about it. - cpuinfo_cycles_per_second = bogo_clock; - } else { - // If we don't even have bogomips, we'll use the slow estimation. - cpuinfo_cycles_per_second = - static_cast(EstimateCyclesPerSecond()); - } - } - if (num_cpus == 0) { - fprintf(stderr, "Failed to read num. CPUs correctly from /proc/cpuinfo\n"); - } else { - if ((max_cpu_id + 1) != num_cpus) { - fprintf(stderr, - "CPU ID assignments in /proc/cpuinfo seem messed up." - " This is usually caused by a bad BIOS.\n"); - } - cpuinfo_num_cpus = num_cpus; - } - -#elif defined BENCHMARK_OS_FREEBSD -// For this sysctl to work, the machine must be configured without -// SMP, APIC, or APM support. hz should be 64-bit in freebsd 7.0 -// and later. Before that, it's a 32-bit quantity (and gives the -// wrong answer on machines faster than 2^32 Hz). 
See -// http://lists.freebsd.org/pipermail/freebsd-i386/2004-November/001846.html -// But also compare FreeBSD 7.0: -// http://fxr.watson.org/fxr/source/i386/i386/tsc.c?v=RELENG70#L223 -// 231 error = sysctl_handle_quad(oidp, &freq, 0, req); -// To FreeBSD 6.3 (it's the same in 6-STABLE): -// http://fxr.watson.org/fxr/source/i386/i386/tsc.c?v=RELENG6#L131 -// 139 error = sysctl_handle_int(oidp, &freq, sizeof(freq), req); -#if __FreeBSD__ >= 7 - uint64_t hz = 0; -#else - unsigned int hz = 0; -#endif - size_t sz = sizeof(hz); - const char* sysctl_path = "machdep.tsc_freq"; - if (sysctlbyname(sysctl_path, &hz, &sz, nullptr, 0) != 0) { - fprintf(stderr, "Unable to determine clock rate from sysctl: %s: %s\n", - sysctl_path, strerror(errno)); - cpuinfo_cycles_per_second = static_cast(EstimateCyclesPerSecond()); - } else { - cpuinfo_cycles_per_second = hz; - } -// TODO: also figure out cpuinfo_num_cpus - -#elif defined BENCHMARK_OS_WINDOWS - // In NT, read MHz from the registry. If we fail to do so or we're in win9x - // then make a crude estimate. - DWORD data, data_size = sizeof(data); - if (IsWindowsXPOrGreater() && - SUCCEEDED( - SHGetValueA(HKEY_LOCAL_MACHINE, - "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", - "~MHz", nullptr, &data, &data_size))) - cpuinfo_cycles_per_second = - static_cast((int64_t)data * (int64_t)(1000 * 1000)); // was mhz - else - cpuinfo_cycles_per_second = static_cast(EstimateCyclesPerSecond()); - - SYSTEM_INFO sysinfo; - // Use memset as opposed to = {} to avoid GCC missing initializer false - // positives. - std::memset(&sysinfo, 0, sizeof(SYSTEM_INFO)); - GetSystemInfo(&sysinfo); - cpuinfo_num_cpus = sysinfo.dwNumberOfProcessors; // number of logical - // processors in the current - // group - -#elif defined BENCHMARK_OS_MACOSX - int32_t num_cpus = 0; - size_t size = sizeof(num_cpus); - if (::sysctlbyname("hw.ncpu", &num_cpus, &size, nullptr, 0) == 0 && - (size == sizeof(num_cpus))) { - cpuinfo_num_cpus = num_cpus; - } else { - fprintf(stderr, "%s\n", strerror(errno)); - std::exit(EXIT_FAILURE); - } - int64_t cpu_freq = 0; - size = sizeof(cpu_freq); - if (::sysctlbyname("hw.cpufrequency", &cpu_freq, &size, nullptr, 0) == 0 && - (size == sizeof(cpu_freq))) { - cpuinfo_cycles_per_second = cpu_freq; - } else { - #if defined BENCHMARK_OS_IOS - fprintf(stderr, "CPU frequency cannot be detected. \n"); - cpuinfo_cycles_per_second = 0; - #else - fprintf(stderr, "%s\n", strerror(errno)); - std::exit(EXIT_FAILURE); - #endif - } -#else - // Generic cycles per second counter - cpuinfo_cycles_per_second = static_cast(EstimateCyclesPerSecond()); -#endif -} - -} // end namespace - -double CyclesPerSecond(void) { - std::call_once(cpuinfo_init, InitializeSystemInfo); - return cpuinfo_cycles_per_second; -} - -int NumCPUs(void) { - std::call_once(cpuinfo_init, InitializeSystemInfo); - return cpuinfo_num_cpus; -} - -// The ""'s catch people who don't pass in a literal for "str" -#define strliterallen(str) (sizeof("" str "") - 1) - -// Must use a string literal for prefix. -#define memprefix(str, len, prefix) \ - ((((len) >= strliterallen(prefix)) && \ - std::memcmp(str, prefix, strliterallen(prefix)) == 0) \ - ? str + strliterallen(prefix) \ - : nullptr) - -bool CpuScalingEnabled() { -#ifndef BENCHMARK_OS_WINDOWS - // On Linux, the CPUfreq subsystem exposes CPU information as files on the - // local file system. If reading the exported files fails, then we may not be - // running on Linux, so we silently ignore all the read errors. 
- for (int cpu = 0, num_cpus = NumCPUs(); cpu < num_cpus; ++cpu) { - std::string governor_file = - StrCat("/sys/devices/system/cpu/cpu", cpu, "/cpufreq/scaling_governor"); - FILE* file = fopen(governor_file.c_str(), "r"); - if (!file) break; - char buff[16]; - size_t bytes_read = fread(buff, 1, sizeof(buff), file); - fclose(file); - if (memprefix(buff, bytes_read, "performance") == nullptr) return true; - } -#endif - return false; -} - -} // end namespace benchmark diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/sysinfo.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/sysinfo.h deleted file mode 100644 index c5d9916d2dd..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/sysinfo.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef BENCHMARK_SYSINFO_H_ -#define BENCHMARK_SYSINFO_H_ - -namespace benchmark { -int NumCPUs(); -double CyclesPerSecond(); -bool CpuScalingEnabled(); -} // end namespace benchmark - -#endif // BENCHMARK_SYSINFO_H_ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/timers.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/timers.cc deleted file mode 100644 index 817272d00bc..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/timers.cc +++ /dev/null @@ -1,212 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "timers.h" -#include "internal_macros.h" - -#ifdef BENCHMARK_OS_WINDOWS -#include -#include -#include -#else -#include -#include -#include -#include // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD -#include -#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX -#include -#endif -#if defined(BENCHMARK_OS_MACOSX) -#include -#include -#include -#endif -#endif - -#ifdef BENCHMARK_OS_EMSCRIPTEN -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "check.h" -#include "log.h" -#include "sleep.h" -#include "string_util.h" - -namespace benchmark { - -// Suppress unused warnings on helper functions. 
-#if defined(__GNUC__) -#pragma GCC diagnostic ignored "-Wunused-function" -#endif - -namespace { -#if defined(BENCHMARK_OS_WINDOWS) -double MakeTime(FILETIME const& kernel_time, FILETIME const& user_time) { - ULARGE_INTEGER kernel; - ULARGE_INTEGER user; - kernel.HighPart = kernel_time.dwHighDateTime; - kernel.LowPart = kernel_time.dwLowDateTime; - user.HighPart = user_time.dwHighDateTime; - user.LowPart = user_time.dwLowDateTime; - return (static_cast(kernel.QuadPart) + - static_cast(user.QuadPart)) * - 1e-7; -} -#else -double MakeTime(struct rusage const& ru) { - return (static_cast(ru.ru_utime.tv_sec) + - static_cast(ru.ru_utime.tv_usec) * 1e-6 + - static_cast(ru.ru_stime.tv_sec) + - static_cast(ru.ru_stime.tv_usec) * 1e-6); -} -#endif -#if defined(BENCHMARK_OS_MACOSX) -double MakeTime(thread_basic_info_data_t const& info) { - return (static_cast(info.user_time.seconds) + - static_cast(info.user_time.microseconds) * 1e-6 + - static_cast(info.system_time.seconds) + - static_cast(info.system_time.microseconds) * 1e-6); -} -#endif -#if defined(CLOCK_PROCESS_CPUTIME_ID) || defined(CLOCK_THREAD_CPUTIME_ID) -double MakeTime(struct timespec const& ts) { - return ts.tv_sec + (static_cast(ts.tv_nsec) * 1e-9); -} -#endif - -BENCHMARK_NORETURN static void DiagnoseAndExit(const char* msg) { - std::cerr << "ERROR: " << msg << std::endl; - std::exit(EXIT_FAILURE); -} - -} // end namespace - -double ProcessCPUUsage() { -#if defined(BENCHMARK_OS_WINDOWS) - HANDLE proc = GetCurrentProcess(); - FILETIME creation_time; - FILETIME exit_time; - FILETIME kernel_time; - FILETIME user_time; - if (GetProcessTimes(proc, &creation_time, &exit_time, &kernel_time, - &user_time)) - return MakeTime(kernel_time, user_time); - DiagnoseAndExit("GetProccessTimes() failed"); -#elif defined(BENCHMARK_OS_EMSCRIPTEN) - // clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) returns 0 on Emscripten. - // Use Emscripten-specific API. Reported CPU time would be exactly the - // same as total time, but this is ok because there aren't long-latency - // syncronous system calls in Emscripten. - return emscripten_get_now() * 1e-3; -#elif defined(CLOCK_PROCESS_CPUTIME_ID) && !defined(BENCHMARK_OS_MACOSX) - // FIXME We want to use clock_gettime, but its not available in MacOS 10.11. See - // https://github.com/google/benchmark/pull/292 - struct timespec spec; - if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &spec) == 0) - return MakeTime(spec); - DiagnoseAndExit("clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) failed"); -#else - struct rusage ru; - if (getrusage(RUSAGE_SELF, &ru) == 0) return MakeTime(ru); - DiagnoseAndExit("getrusage(RUSAGE_SELF, ...) failed"); -#endif -} - -double ThreadCPUUsage() { -#if defined(BENCHMARK_OS_WINDOWS) - HANDLE this_thread = GetCurrentThread(); - FILETIME creation_time; - FILETIME exit_time; - FILETIME kernel_time; - FILETIME user_time; - GetThreadTimes(this_thread, &creation_time, &exit_time, &kernel_time, - &user_time); - return MakeTime(kernel_time, user_time); -#elif defined(BENCHMARK_OS_MACOSX) - // FIXME We want to use clock_gettime, but its not available in MacOS 10.11. 
See - // https://github.com/google/benchmark/pull/292 - mach_msg_type_number_t count = THREAD_BASIC_INFO_COUNT; - thread_basic_info_data_t info; - mach_port_t thread = pthread_mach_thread_np(pthread_self()); - if (thread_info(thread, THREAD_BASIC_INFO, (thread_info_t)&info, &count) == - KERN_SUCCESS) { - return MakeTime(info); - } - DiagnoseAndExit("ThreadCPUUsage() failed when evaluating thread_info"); -#elif defined(BENCHMARK_OS_EMSCRIPTEN) - // Emscripten doesn't support traditional threads - return ProcessCPUUsage(); -#elif defined(BENCHMARK_OS_RTEMS) - // RTEMS doesn't support CLOCK_THREAD_CPUTIME_ID. See - // https://github.com/RTEMS/rtems/blob/master/cpukit/posix/src/clockgettime.c - return ProcessCPUUsage(); -#elif defined(CLOCK_THREAD_CPUTIME_ID) - struct timespec ts; - if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0) return MakeTime(ts); - DiagnoseAndExit("clock_gettime(CLOCK_THREAD_CPUTIME_ID, ...) failed"); -#else -#error Per-thread timing is not available on your system. -#endif -} - -namespace { - -std::string DateTimeString(bool local) { - typedef std::chrono::system_clock Clock; - std::time_t now = Clock::to_time_t(Clock::now()); - const std::size_t kStorageSize = 128; - char storage[kStorageSize]; - std::size_t written; - - if (local) { -#if defined(BENCHMARK_OS_WINDOWS) - written = - std::strftime(storage, sizeof(storage), "%x %X", ::localtime(&now)); -#else - std::tm timeinfo; - std::memset(&timeinfo, 0, sizeof(std::tm)); - ::localtime_r(&now, &timeinfo); - written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo); -#endif - } else { -#if defined(BENCHMARK_OS_WINDOWS) - written = std::strftime(storage, sizeof(storage), "%x %X", ::gmtime(&now)); -#else - std::tm timeinfo; - std::memset(&timeinfo, 0, sizeof(std::tm)); - ::gmtime_r(&now, &timeinfo); - written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo); -#endif - } - CHECK(written < kStorageSize); - ((void)written); // prevent unused variable in optimized mode. 
- return std::string(storage);
-}
-
-} // end namespace
-
-std::string LocalDateTimeString() { return DateTimeString(true); }
-
-} // end namespace benchmark
diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/timers.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/timers.h
deleted file mode 100644
index 65606ccd93d..00000000000
--- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/timers.h
+++ /dev/null
@@ -1,48 +0,0 @@
-#ifndef BENCHMARK_TIMERS_H
-#define BENCHMARK_TIMERS_H
-
-#include <chrono>
-#include <string>
-
-namespace benchmark {
-
-// Return the CPU usage of the current process
-double ProcessCPUUsage();
-
-// Return the CPU usage of the children of the current process
-double ChildrenCPUUsage();
-
-// Return the CPU usage of the current thread
-double ThreadCPUUsage();
-
-#if defined(HAVE_STEADY_CLOCK)
-template <bool HighResIsSteady = std::chrono::high_resolution_clock::is_steady>
-struct ChooseSteadyClock {
- typedef std::chrono::high_resolution_clock type;
-};
-
-template <>
-struct ChooseSteadyClock<false> {
- typedef std::chrono::steady_clock type;
-};
-#endif
-
-struct ChooseClockType {
-#if defined(HAVE_STEADY_CLOCK)
- typedef ChooseSteadyClock<>::type type;
-#else
- typedef std::chrono::high_resolution_clock type;
-#endif
-};
-
-inline double ChronoClockNow() {
- typedef ChooseClockType::type ClockType;
- using FpSeconds = std::chrono::duration<double, std::chrono::seconds::period>;
- return FpSeconds(ClockType::now().time_since_epoch()).count();
-}
-
-std::string LocalDateTimeString();
-
-} // end namespace benchmark
-
-#endif // BENCHMARK_TIMERS_H
diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/CMakeLists.txt b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/CMakeLists.txt
deleted file mode 100644
index b55612b4655..00000000000
--- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/CMakeLists.txt
+++ /dev/null
@@ -1,170 +0,0 @@
-# Enable the tests
-
-find_package(Threads REQUIRED)
-include(CheckCXXCompilerFlag)
-
-# NOTE: Some tests use `<cassert>` to perform the test. Therefore we must
-# strip -DNDEBUG from the default CMake flags in DEBUG mode.
-string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE)
-if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" )
- add_definitions( -UNDEBUG )
- add_definitions(-DTEST_BENCHMARK_LIBRARY_HAS_NO_ASSERTIONS)
- # Also remove /D NDEBUG to avoid MSVC warnings about conflicting defines.
- foreach (flags_var_to_scrub
- CMAKE_CXX_FLAGS_RELEASE
- CMAKE_CXX_FLAGS_RELWITHDEBINFO
- CMAKE_CXX_FLAGS_MINSIZEREL
- CMAKE_C_FLAGS_RELEASE
- CMAKE_C_FLAGS_RELWITHDEBINFO
- CMAKE_C_FLAGS_MINSIZEREL)
- string (REGEX REPLACE "(^| )[/-]D *NDEBUG($| )" " "
- "${flags_var_to_scrub}" "${${flags_var_to_scrub}}")
- endforeach()
-endif()
-
-# NOTE: These flags must be added after find_package(Threads REQUIRED) otherwise
-# they will break the configuration check.
-if (DEFINED BENCHMARK_CXX_LINKER_FLAGS) - list(APPEND CMAKE_EXE_LINKER_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}) -endif() - -add_library(output_test_helper STATIC output_test_helper.cc output_test.h) - -macro(compile_benchmark_test name) - add_executable(${name} "${name}.cc") - target_link_libraries(${name} benchmark ${CMAKE_THREAD_LIBS_INIT}) -endmacro(compile_benchmark_test) - - -macro(compile_output_test name) - add_executable(${name} "${name}.cc" output_test.h) - target_link_libraries(${name} output_test_helper benchmark - ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) -endmacro(compile_output_test) - - -# Demonstration executable -compile_benchmark_test(benchmark_test) -add_test(benchmark benchmark_test --benchmark_min_time=0.01) - -compile_benchmark_test(filter_test) -macro(add_filter_test name filter expect) - add_test(${name} filter_test --benchmark_min_time=0.01 --benchmark_filter=${filter} ${expect}) - add_test(${name}_list_only filter_test --benchmark_list_tests --benchmark_filter=${filter} ${expect}) -endmacro(add_filter_test) - -add_filter_test(filter_simple "Foo" 3) -add_filter_test(filter_suffix "BM_.*" 4) -add_filter_test(filter_regex_all ".*" 5) -add_filter_test(filter_regex_blank "" 5) -add_filter_test(filter_regex_none "monkey" 0) -add_filter_test(filter_regex_wildcard ".*Foo.*" 3) -add_filter_test(filter_regex_begin "^BM_.*" 4) -add_filter_test(filter_regex_begin2 "^N" 1) -add_filter_test(filter_regex_end ".*Ba$" 1) - -compile_benchmark_test(options_test) -add_test(options_benchmarks options_test --benchmark_min_time=0.01) - -compile_benchmark_test(basic_test) -add_test(basic_benchmark basic_test --benchmark_min_time=0.01) - -compile_benchmark_test(diagnostics_test) -add_test(diagnostics_test diagnostics_test --benchmark_min_time=0.01) - -compile_benchmark_test(skip_with_error_test) -add_test(skip_with_error_test skip_with_error_test --benchmark_min_time=0.01) - -compile_benchmark_test(donotoptimize_test) -# Some of the issues with DoNotOptimize only occur when optimization is enabled -check_cxx_compiler_flag(-O3 BENCHMARK_HAS_O3_FLAG) -if (BENCHMARK_HAS_O3_FLAG) - set_target_properties(donotoptimize_test PROPERTIES COMPILE_FLAGS "-O3") -endif() -add_test(donotoptimize_test donotoptimize_test --benchmark_min_time=0.01) - -compile_benchmark_test(fixture_test) -add_test(fixture_test fixture_test --benchmark_min_time=0.01) - -compile_benchmark_test(register_benchmark_test) -add_test(register_benchmark_test register_benchmark_test --benchmark_min_time=0.01) - -compile_benchmark_test(map_test) -add_test(map_test map_test --benchmark_min_time=0.01) - -compile_benchmark_test(multiple_ranges_test) -add_test(multiple_ranges_test multiple_ranges_test --benchmark_min_time=0.01) - -compile_output_test(reporter_output_test) -add_test(reporter_output_test reporter_output_test --benchmark_min_time=0.01) - -compile_output_test(user_counters_test) -add_test(user_counters_test user_counters_test --benchmark_min_time=0.01) - -compile_output_test(user_counters_tabular_test) -add_test(user_counters_tabular_test user_counters_tabular_test --benchmark_counters_tabular=true --benchmark_min_time=0.01) - -check_cxx_compiler_flag(-std=c++03 BENCHMARK_HAS_CXX03_FLAG) -if (BENCHMARK_HAS_CXX03_FLAG) - set(CXX03_FLAGS "${CMAKE_CXX_FLAGS}") - string(REPLACE "-std=c++11" "-std=c++03" CXX03_FLAGS "${CXX03_FLAGS}") - string(REPLACE "-std=c++0x" "-std=c++03" CXX03_FLAGS "${CXX03_FLAGS}") - - compile_benchmark_test(cxx03_test) - set_target_properties(cxx03_test - PROPERTIES COMPILE_FLAGS 
"${CXX03_FLAGS}") - add_test(cxx03 cxx03_test --benchmark_min_time=0.01) -endif() - -# Attempt to work around flaky test failures when running on Appveyor servers. -if (DEFINED ENV{APPVEYOR}) - set(COMPLEXITY_MIN_TIME "0.5") -else() - set(COMPLEXITY_MIN_TIME "0.01") -endif() -compile_output_test(complexity_test) -add_test(complexity_benchmark complexity_test --benchmark_min_time=${COMPLEXITY_MIN_TIME}) - -# Add the coverage command(s) -if(CMAKE_BUILD_TYPE) - string(TOLOWER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_LOWER) -endif() -if (${CMAKE_BUILD_TYPE_LOWER} MATCHES "coverage") - find_program(GCOV gcov) - find_program(LCOV lcov) - find_program(GENHTML genhtml) - find_program(CTEST ctest) - if (GCOV AND LCOV AND GENHTML AND CTEST AND HAVE_CXX_FLAG_COVERAGE) - add_custom_command( - OUTPUT ${CMAKE_BINARY_DIR}/lcov/index.html - COMMAND ${LCOV} -q -z -d . - COMMAND ${LCOV} -q --no-external -c -b "${CMAKE_SOURCE_DIR}" -d . -o before.lcov -i - COMMAND ${CTEST} --force-new-ctest-process - COMMAND ${LCOV} -q --no-external -c -b "${CMAKE_SOURCE_DIR}" -d . -o after.lcov - COMMAND ${LCOV} -q -a before.lcov -a after.lcov --output-file final.lcov - COMMAND ${LCOV} -q -r final.lcov "'${CMAKE_SOURCE_DIR}/test/*'" -o final.lcov - COMMAND ${GENHTML} final.lcov -o lcov --demangle-cpp --sort -p "${CMAKE_BINARY_DIR}" -t benchmark - DEPENDS filter_test benchmark_test options_test basic_test fixture_test cxx03_test complexity_test - WORKING_DIRECTORY ${CMAKE_BINARY_DIR} - COMMENT "Running LCOV" - ) - add_custom_target(coverage - DEPENDS ${CMAKE_BINARY_DIR}/lcov/index.html - COMMENT "LCOV report at lcov/index.html" - ) - message(STATUS "Coverage command added") - else() - if (HAVE_CXX_FLAG_COVERAGE) - set(CXX_FLAG_COVERAGE_MESSAGE supported) - else() - set(CXX_FLAG_COVERAGE_MESSAGE unavailable) - endif() - message(WARNING - "Coverage not available:\n" - " gcov: ${GCOV}\n" - " lcov: ${LCOV}\n" - " genhtml: ${GENHTML}\n" - " ctest: ${CTEST}\n" - " --coverage flag: ${CXX_FLAG_COVERAGE_MESSAGE}") - endif() -endif() diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/basic_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/basic_test.cc deleted file mode 100644 index bc1f96d9315..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/basic_test.cc +++ /dev/null @@ -1,99 +0,0 @@ - -#include "benchmark/benchmark.h" - -#define BASIC_BENCHMARK_TEST(x) BENCHMARK(x)->Arg(8)->Arg(512)->Arg(8192) - -void BM_empty(benchmark::State& state) { - while (state.KeepRunning()) { - benchmark::DoNotOptimize(state.iterations()); - } -} -BENCHMARK(BM_empty); -BENCHMARK(BM_empty)->ThreadPerCpu(); - -void BM_spin_empty(benchmark::State& state) { - while (state.KeepRunning()) { - for (int x = 0; x < state.range(0); ++x) { - benchmark::DoNotOptimize(x); - } - } -} -BASIC_BENCHMARK_TEST(BM_spin_empty); -BASIC_BENCHMARK_TEST(BM_spin_empty)->ThreadPerCpu(); - -void BM_spin_pause_before(benchmark::State& state) { - for (int i = 0; i < state.range(0); ++i) { - benchmark::DoNotOptimize(i); - } - while (state.KeepRunning()) { - for (int i = 0; i < state.range(0); ++i) { - benchmark::DoNotOptimize(i); - } - } -} -BASIC_BENCHMARK_TEST(BM_spin_pause_before); -BASIC_BENCHMARK_TEST(BM_spin_pause_before)->ThreadPerCpu(); - -void BM_spin_pause_during(benchmark::State& state) { - while (state.KeepRunning()) { - state.PauseTiming(); - for (int i = 0; i < state.range(0); ++i) { - benchmark::DoNotOptimize(i); - } - state.ResumeTiming(); - for (int i = 0; i < state.range(0); ++i) { - 
benchmark::DoNotOptimize(i); - } - } -} -BASIC_BENCHMARK_TEST(BM_spin_pause_during); -BASIC_BENCHMARK_TEST(BM_spin_pause_during)->ThreadPerCpu(); - -void BM_pause_during(benchmark::State& state) { - while (state.KeepRunning()) { - state.PauseTiming(); - state.ResumeTiming(); - } -} -BENCHMARK(BM_pause_during); -BENCHMARK(BM_pause_during)->ThreadPerCpu(); -BENCHMARK(BM_pause_during)->UseRealTime(); -BENCHMARK(BM_pause_during)->UseRealTime()->ThreadPerCpu(); - -void BM_spin_pause_after(benchmark::State& state) { - while (state.KeepRunning()) { - for (int i = 0; i < state.range(0); ++i) { - benchmark::DoNotOptimize(i); - } - } - for (int i = 0; i < state.range(0); ++i) { - benchmark::DoNotOptimize(i); - } -} -BASIC_BENCHMARK_TEST(BM_spin_pause_after); -BASIC_BENCHMARK_TEST(BM_spin_pause_after)->ThreadPerCpu(); - -void BM_spin_pause_before_and_after(benchmark::State& state) { - for (int i = 0; i < state.range(0); ++i) { - benchmark::DoNotOptimize(i); - } - while (state.KeepRunning()) { - for (int i = 0; i < state.range(0); ++i) { - benchmark::DoNotOptimize(i); - } - } - for (int i = 0; i < state.range(0); ++i) { - benchmark::DoNotOptimize(i); - } -} -BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after); -BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after)->ThreadPerCpu(); - -void BM_empty_stop_start(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_empty_stop_start); -BENCHMARK(BM_empty_stop_start)->ThreadPerCpu(); - -BENCHMARK_MAIN() diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/benchmark_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/benchmark_test.cc deleted file mode 100644 index 7a16466e208..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/benchmark_test.cc +++ /dev/null @@ -1,240 +0,0 @@ -#include "benchmark/benchmark.h" - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(__GNUC__) -#define BENCHMARK_NOINLINE __attribute__((noinline)) -#else -#define BENCHMARK_NOINLINE -#endif - -namespace { - -int BENCHMARK_NOINLINE Factorial(uint32_t n) { - return (n == 1) ? 
1 : n * Factorial(n - 1);
-}
-
-double CalculatePi(int depth) {
-  double pi = 0.0;
-  for (int i = 0; i < depth; ++i) {
-    double numerator = static_cast<double>(((i % 2) * 2) - 1);
-    double denominator = static_cast<double>((2 * i) - 1);
-    pi += numerator / denominator;
-  }
-  return (pi - 1.0) * 4;
-}
-
-std::set<int> ConstructRandomSet(int size) {
-  std::set<int> s;
-  for (int i = 0; i < size; ++i) s.insert(i);
-  return s;
-}
-
-std::mutex test_vector_mu;
-std::vector<int>* test_vector = nullptr;
-
-}  // end namespace
-
-static void BM_Factorial(benchmark::State& state) {
-  int fac_42 = 0;
-  while (state.KeepRunning()) fac_42 = Factorial(8);
-  // Prevent compiler optimizations
-  std::stringstream ss;
-  ss << fac_42;
-  state.SetLabel(ss.str());
-}
-BENCHMARK(BM_Factorial);
-BENCHMARK(BM_Factorial)->UseRealTime();
-
-static void BM_CalculatePiRange(benchmark::State& state) {
-  double pi = 0.0;
-  while (state.KeepRunning()) pi = CalculatePi(state.range(0));
-  std::stringstream ss;
-  ss << pi;
-  state.SetLabel(ss.str());
-}
-BENCHMARK_RANGE(BM_CalculatePiRange, 1, 1024 * 1024);
-
-static void BM_CalculatePi(benchmark::State& state) {
-  static const int depth = 1024;
-  while (state.KeepRunning()) {
-    benchmark::DoNotOptimize(CalculatePi(depth));
-  }
-}
-BENCHMARK(BM_CalculatePi)->Threads(8);
-BENCHMARK(BM_CalculatePi)->ThreadRange(1, 32);
-BENCHMARK(BM_CalculatePi)->ThreadPerCpu();
-
-static void BM_SetInsert(benchmark::State& state) {
-  while (state.KeepRunning()) {
-    state.PauseTiming();
-    std::set<int> data = ConstructRandomSet(state.range(0));
-    state.ResumeTiming();
-    for (int j = 0; j < state.range(1); ++j) data.insert(rand());
-  }
-  state.SetItemsProcessed(state.iterations() * state.range(1));
-  state.SetBytesProcessed(state.iterations() * state.range(1) * sizeof(int));
-}
-BENCHMARK(BM_SetInsert)->Ranges({{1 << 10, 8 << 10}, {1, 10}});
-
-template <typename Container, typename ValueType = typename Container::value_type>
-static void BM_Sequential(benchmark::State& state) {
-  ValueType v = 42;
-  while (state.KeepRunning()) {
-    Container c;
-    for (int i = state.range(0); --i;) c.push_back(v);
-  }
-  const size_t items_processed = state.iterations() * state.range(0);
-  state.SetItemsProcessed(items_processed);
-  state.SetBytesProcessed(items_processed * sizeof(v));
-}
-BENCHMARK_TEMPLATE2(BM_Sequential, std::vector<int>, int)
-    ->Range(1 << 0, 1 << 10);
-BENCHMARK_TEMPLATE(BM_Sequential, std::list<int>)->Range(1 << 0, 1 << 10);
-// Test the variadic version of BENCHMARK_TEMPLATE in C++11 and beyond.
-#if __cplusplus >= 201103L
-BENCHMARK_TEMPLATE(BM_Sequential, std::vector<int>, int)->Arg(512);
-#endif
-
-static void BM_StringCompare(benchmark::State& state) {
-  std::string s1(state.range(0), '-');
-  std::string s2(state.range(0), '-');
-  while (state.KeepRunning()) benchmark::DoNotOptimize(s1.compare(s2));
-}
-BENCHMARK(BM_StringCompare)->Range(1, 1 << 20);
-
-static void BM_SetupTeardown(benchmark::State& state) {
-  if (state.thread_index == 0) {
-    // No need to lock test_vector_mu here as this is running single-threaded.
-    test_vector = new std::vector<int>();
-  }
-  int i = 0;
-  while (state.KeepRunning()) {
-    std::lock_guard<std::mutex> l(test_vector_mu);
-    if (i % 2 == 0)
-      test_vector->push_back(i);
-    else
-      test_vector->pop_back();
-    ++i;
-  }
-  if (state.thread_index == 0) {
-    delete test_vector;
-  }
-}
-BENCHMARK(BM_SetupTeardown)->ThreadPerCpu();
-
-static void BM_LongTest(benchmark::State& state) {
-  double tracker = 0.0;
-  while (state.KeepRunning()) {
-    for (int i = 0; i < state.range(0); ++i)
-      benchmark::DoNotOptimize(tracker += i);
-  }
-}
-BENCHMARK(BM_LongTest)->Range(1 << 16, 1 << 28);
-
-static void BM_ParallelMemset(benchmark::State& state) {
-  int size = state.range(0) / static_cast<int>(sizeof(int));
-  int thread_size = size / state.threads;
-  int from = thread_size * state.thread_index;
-  int to = from + thread_size;
-
-  if (state.thread_index == 0) {
-    test_vector = new std::vector<int>(size);
-  }
-
-  while (state.KeepRunning()) {
-    for (int i = from; i < to; i++) {
-      // No need to lock test_vector_mu as ranges
-      // do not overlap between threads.
-      benchmark::DoNotOptimize(test_vector->at(i) = 1);
-    }
-  }
-
-  if (state.thread_index == 0) {
-    delete test_vector;
-  }
-}
-BENCHMARK(BM_ParallelMemset)->Arg(10 << 20)->ThreadRange(1, 4);
-
-static void BM_ManualTiming(benchmark::State& state) {
-  size_t slept_for = 0;
-  int microseconds = state.range(0);
-  std::chrono::duration<double, std::micro> sleep_duration{
-      static_cast<double>(microseconds)};
-
-  while (state.KeepRunning()) {
-    auto start = std::chrono::high_resolution_clock::now();
-    // Simulate some useful workload with a sleep
-    std::this_thread::sleep_for(
-        std::chrono::duration_cast<std::chrono::nanoseconds>(sleep_duration));
-    auto end = std::chrono::high_resolution_clock::now();
-
-    auto elapsed =
-        std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
-
-    state.SetIterationTime(elapsed.count());
-    slept_for += microseconds;
-  }
-  state.SetItemsProcessed(slept_for);
}
-BENCHMARK(BM_ManualTiming)->Range(1, 1 << 14)->UseRealTime();
-BENCHMARK(BM_ManualTiming)->Range(1, 1 << 14)->UseManualTime();
-
-#if __cplusplus >= 201103L
-
-template <class... Args>
-void BM_with_args(benchmark::State& state, Args&&...)
{ - while (state.KeepRunning()) { - } -} -BENCHMARK_CAPTURE(BM_with_args, int_test, 42, 43, 44); -BENCHMARK_CAPTURE(BM_with_args, string_and_pair_test, std::string("abc"), - std::pair(42, 3.8)); - -void BM_non_template_args(benchmark::State& state, int, double) { - while(state.KeepRunning()) {} -} -BENCHMARK_CAPTURE(BM_non_template_args, basic_test, 0, 0); - -#endif // __cplusplus >= 201103L - -static void BM_DenseThreadRanges(benchmark::State& st) { - switch (st.range(0)) { - case 1: - assert(st.threads == 1 || st.threads == 2 || st.threads == 3); - break; - case 2: - assert(st.threads == 1 || st.threads == 3 || st.threads == 4); - break; - case 3: - assert(st.threads == 5 || st.threads == 8 || st.threads == 11 || - st.threads == 14); - break; - default: - assert(false && "Invalid test case number"); - } - while (st.KeepRunning()) { - } -} -BENCHMARK(BM_DenseThreadRanges)->Arg(1)->DenseThreadRange(1, 3); -BENCHMARK(BM_DenseThreadRanges)->Arg(2)->DenseThreadRange(1, 4, 2); -BENCHMARK(BM_DenseThreadRanges)->Arg(3)->DenseThreadRange(5, 14, 3); - -BENCHMARK_MAIN() diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/complexity_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/complexity_test.cc deleted file mode 100644 index 62d1154df0e..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/complexity_test.cc +++ /dev/null @@ -1,167 +0,0 @@ -#undef NDEBUG -#include -#include -#include -#include -#include -#include "benchmark/benchmark.h" -#include "output_test.h" - -namespace { - -#define ADD_COMPLEXITY_CASES(...) \ - int CONCAT(dummy, __LINE__) = AddComplexityTest(__VA_ARGS__) - -int AddComplexityTest(std::string big_o_test_name, std::string rms_test_name, - std::string big_o) { - SetSubstitutions({{"%bigo_name", big_o_test_name}, - {"%rms_name", rms_test_name}, - {"%bigo_str", "[ ]* %float " + big_o}, - {"%bigo", big_o}, - {"%rms", "[ ]*[0-9]+ %"}}); - AddCases( - TC_ConsoleOut, - {{"^%bigo_name %bigo_str %bigo_str[ ]*$"}, - {"^%bigo_name", MR_Not}, // Assert we we didn't only matched a name. 
- {"^%rms_name %rms %rms[ ]*$", MR_Next}}); - AddCases(TC_JSONOut, {{"\"name\": \"%bigo_name\",$"}, - {"\"cpu_coefficient\": [0-9]+,$", MR_Next}, - {"\"real_coefficient\": [0-9]{1,5},$", MR_Next}, - {"\"big_o\": \"%bigo\",$", MR_Next}, - {"\"time_unit\": \"ns\"$", MR_Next}, - {"}", MR_Next}, - {"\"name\": \"%rms_name\",$"}, - {"\"rms\": %float$", MR_Next}, - {"}", MR_Next}}); - AddCases(TC_CSVOut, {{"^\"%bigo_name\",,%float,%float,%bigo,,,,,$"}, - {"^\"%bigo_name\"", MR_Not}, - {"^\"%rms_name\",,%float,%float,,,,,,$", MR_Next}}); - return 0; -} - -} // end namespace - -// ========================================================================= // -// --------------------------- Testing BigO O(1) --------------------------- // -// ========================================================================= // - -void BM_Complexity_O1(benchmark::State& state) { - while (state.KeepRunning()) { - for (int i = 0; i < 1024; ++i) { - benchmark::DoNotOptimize(&i); - } - } - state.SetComplexityN(state.range(0)); -} -BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity(benchmark::o1); -BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity(); -BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity([](int) { - return 1.0; -}); - -const char *big_o_1_test_name = "BM_Complexity_O1_BigO"; -const char *rms_o_1_test_name = "BM_Complexity_O1_RMS"; -const char *enum_big_o_1 = "\\([0-9]+\\)"; -// FIXME: Tolerate both '(1)' and 'lgN' as output when the complexity is auto -// deduced. -// See https://github.com/google/benchmark/issues/272 -const char *auto_big_o_1 = "(\\([0-9]+\\))|(lgN)"; -const char *lambda_big_o_1 = "f\\(N\\)"; - -// Add enum tests -ADD_COMPLEXITY_CASES(big_o_1_test_name, rms_o_1_test_name, enum_big_o_1); - -// Add auto enum tests -ADD_COMPLEXITY_CASES(big_o_1_test_name, rms_o_1_test_name, auto_big_o_1); - -// Add lambda tests -ADD_COMPLEXITY_CASES(big_o_1_test_name, rms_o_1_test_name, lambda_big_o_1); - -// ========================================================================= // -// --------------------------- Testing BigO O(N) --------------------------- // -// ========================================================================= // - -std::vector ConstructRandomVector(int size) { - std::vector v; - v.reserve(size); - for (int i = 0; i < size; ++i) { - v.push_back(std::rand() % size); - } - return v; -} - -void BM_Complexity_O_N(benchmark::State& state) { - auto v = ConstructRandomVector(state.range(0)); - const int item_not_in_vector = - state.range(0) * 2; // Test worst case scenario (item not in vector) - while (state.KeepRunning()) { - benchmark::DoNotOptimize(std::find(v.begin(), v.end(), item_not_in_vector)); - } - state.SetComplexityN(state.range(0)); -} -BENCHMARK(BM_Complexity_O_N) - ->RangeMultiplier(2) - ->Range(1 << 10, 1 << 16) - ->Complexity(benchmark::oN); -BENCHMARK(BM_Complexity_O_N) - ->RangeMultiplier(2) - ->Range(1 << 10, 1 << 16) - ->Complexity([](int n) -> double { return n; }); -BENCHMARK(BM_Complexity_O_N) - ->RangeMultiplier(2) - ->Range(1 << 10, 1 << 16) - ->Complexity(); - -const char *big_o_n_test_name = "BM_Complexity_O_N_BigO"; -const char *rms_o_n_test_name = "BM_Complexity_O_N_RMS"; -const char *enum_auto_big_o_n = "N"; -const char *lambda_big_o_n = "f\\(N\\)"; - -// Add enum tests -ADD_COMPLEXITY_CASES(big_o_n_test_name, rms_o_n_test_name, enum_auto_big_o_n); - -// Add lambda tests -ADD_COMPLEXITY_CASES(big_o_n_test_name, rms_o_n_test_name, lambda_big_o_n); - -// ========================================================================= 
// -// ------------------------- Testing BigO O(N*lgN) ------------------------- // -// ========================================================================= // - -static void BM_Complexity_O_N_log_N(benchmark::State& state) { - auto v = ConstructRandomVector(state.range(0)); - while (state.KeepRunning()) { - std::sort(v.begin(), v.end()); - } - state.SetComplexityN(state.range(0)); -} -BENCHMARK(BM_Complexity_O_N_log_N) - ->RangeMultiplier(2) - ->Range(1 << 10, 1 << 16) - ->Complexity(benchmark::oNLogN); -BENCHMARK(BM_Complexity_O_N_log_N) - ->RangeMultiplier(2) - ->Range(1 << 10, 1 << 16) - ->Complexity([](int n) { return n * log2(n); }); -BENCHMARK(BM_Complexity_O_N_log_N) - ->RangeMultiplier(2) - ->Range(1 << 10, 1 << 16) - ->Complexity(); - -const char *big_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N_BigO"; -const char *rms_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N_RMS"; -const char *enum_auto_big_o_n_lg_n = "NlgN"; -const char *lambda_big_o_n_lg_n = "f\\(N\\)"; - -// Add enum tests -ADD_COMPLEXITY_CASES(big_o_n_lg_n_test_name, rms_o_n_lg_n_test_name, - enum_auto_big_o_n_lg_n); - -// Add lambda tests -ADD_COMPLEXITY_CASES(big_o_n_lg_n_test_name, rms_o_n_lg_n_test_name, - lambda_big_o_n_lg_n); - -// ========================================================================= // -// --------------------------- TEST CASES END ------------------------------ // -// ========================================================================= // - -int main(int argc, char *argv[]) { RunOutputTests(argc, argv); } diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/cxx03_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/cxx03_test.cc deleted file mode 100644 index a79d964e17b..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/cxx03_test.cc +++ /dev/null @@ -1,48 +0,0 @@ -#undef NDEBUG -#include -#include - -#include "benchmark/benchmark.h" - -#if __cplusplus >= 201103L -#error C++11 or greater detected. Should be C++03. -#endif - -void BM_empty(benchmark::State& state) { - while (state.KeepRunning()) { - volatile std::size_t x = state.iterations(); - ((void)x); - } -} -BENCHMARK(BM_empty); - -// The new C++11 interface for args/ranges requires initializer list support. -// Therefore we provide the old interface to support C++03. 
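// (Illustrative sketch, not part of the deleted file.) In the C++11 API, the
// ArgPair/RangePair registration below would instead be written with
// initializer lists:
//
//   BENCHMARK(BM_old_arg_range_interface)->Args({1, 2})->Ranges({{5, 5}, {6, 6}});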
-void BM_old_arg_range_interface(benchmark::State& state) { - assert((state.range(0) == 1 && state.range(1) == 2) || - (state.range(0) == 5 && state.range(1) == 6)); - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_old_arg_range_interface)->ArgPair(1, 2)->RangePair(5, 5, 6, 6); - -template -void BM_template2(benchmark::State& state) { - BM_empty(state); -} -BENCHMARK_TEMPLATE2(BM_template2, int, long); - -template -void BM_template1(benchmark::State& state) { - BM_empty(state); -} -BENCHMARK_TEMPLATE(BM_template1, long); -BENCHMARK_TEMPLATE1(BM_template1, int); - -void BM_counters(benchmark::State& state) { - BM_empty(state); - state.counters["Foo"] = 2; -} -BENCHMARK(BM_counters); - -BENCHMARK_MAIN() diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/diagnostics_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/diagnostics_test.cc deleted file mode 100644 index 7aac8069e59..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/diagnostics_test.cc +++ /dev/null @@ -1,64 +0,0 @@ -// Testing: -// State::PauseTiming() -// State::ResumeTiming() -// Test that CHECK's within these function diagnose when they are called -// outside of the KeepRunning() loop. -// -// NOTE: Users should NOT include or use src/check.h. This is only done in -// order to test library internals. - -#include -#include - -#include "../src/check.h" -#include "benchmark/benchmark.h" - -#if defined(__GNUC__) && !defined(__EXCEPTIONS) -#define TEST_HAS_NO_EXCEPTIONS -#endif - -void TestHandler() { -#ifndef TEST_HAS_NO_EXCEPTIONS - throw std::logic_error(""); -#else - std::abort(); -#endif -} - -void try_invalid_pause_resume(benchmark::State& state) { -#if !defined(TEST_BENCHMARK_LIBRARY_HAS_NO_ASSERTIONS) && !defined(TEST_HAS_NO_EXCEPTIONS) - try { - state.PauseTiming(); - std::abort(); - } catch (std::logic_error const&) { - } - try { - state.ResumeTiming(); - std::abort(); - } catch (std::logic_error const&) { - } -#else - (void)state; // avoid unused warning -#endif -} - -void BM_diagnostic_test(benchmark::State& state) { - static bool called_once = false; - - if (called_once == false) try_invalid_pause_resume(state); - - while (state.KeepRunning()) { - benchmark::DoNotOptimize(state.iterations()); - } - - if (called_once == false) try_invalid_pause_resume(state); - - called_once = true; -} -BENCHMARK(BM_diagnostic_test); - -int main(int argc, char* argv[]) { - benchmark::internal::GetAbortHandler() = &TestHandler; - benchmark::Initialize(&argc, argv); - benchmark::RunSpecifiedBenchmarks(); -} diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/donotoptimize_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/donotoptimize_test.cc deleted file mode 100644 index a705654a269..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/donotoptimize_test.cc +++ /dev/null @@ -1,52 +0,0 @@ -#include "benchmark/benchmark.h" - -#include - -namespace { -#if defined(__GNUC__) -std::uint64_t double_up(const std::uint64_t x) __attribute__((const)); -#endif -std::uint64_t double_up(const std::uint64_t x) { return x * 2; } -} - -// Using DoNotOptimize on types like BitRef seem to cause a lot of problems -// with the inline assembly on both GCC and Clang. 
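// (Illustrative sketch, not part of the deleted file.) The basic contract
// being compile-tested below: DoNotOptimize forces a value to be materialized,
// so the compiler cannot delete the computation that produced it, e.g.
//
//   int acc = 0;
//   for (int i = 0; i < 1000; ++i) acc += i;
//   benchmark::DoNotOptimize(acc);  // keeps `acc` (and the loop) observable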
-struct BitRef { - int index; - unsigned char &byte; - -public: - static BitRef Make() { - static unsigned char arr[2] = {}; - BitRef b(1, arr[0]); - return b; - } -private: - BitRef(int i, unsigned char& b) : index(i), byte(b) {} -}; - -int main(int, char*[]) { - // this test verifies compilation of DoNotOptimize() for some types - - char buffer8[8]; - benchmark::DoNotOptimize(buffer8); - - char buffer20[20]; - benchmark::DoNotOptimize(buffer20); - - char buffer1024[1024]; - benchmark::DoNotOptimize(buffer1024); - benchmark::DoNotOptimize(&buffer1024[0]); - - int x = 123; - benchmark::DoNotOptimize(x); - benchmark::DoNotOptimize(&x); - benchmark::DoNotOptimize(x += 42); - - benchmark::DoNotOptimize(double_up(x)); - - // These tests are to e - benchmark::DoNotOptimize(BitRef::Make()); - BitRef lval = BitRef::Make(); - benchmark::DoNotOptimize(lval); -} diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/filter_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/filter_test.cc deleted file mode 100644 index 3a205295f09..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/filter_test.cc +++ /dev/null @@ -1,104 +0,0 @@ -#include "benchmark/benchmark.h" - -#include -#include -#include -#include - -#include -#include -#include -#include - -namespace { - -class TestReporter : public benchmark::ConsoleReporter { - public: - virtual bool ReportContext(const Context& context) { - return ConsoleReporter::ReportContext(context); - }; - - virtual void ReportRuns(const std::vector& report) { - ++count_; - ConsoleReporter::ReportRuns(report); - }; - - TestReporter() : count_(0) {} - - virtual ~TestReporter() {} - - size_t GetCount() const { return count_; } - - private: - mutable size_t count_; -}; - -} // end namespace - -static void NoPrefix(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(NoPrefix); - -static void BM_Foo(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_Foo); - -static void BM_Bar(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_Bar); - -static void BM_FooBar(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_FooBar); - -static void BM_FooBa(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_FooBa); - -int main(int argc, char **argv) { - bool list_only = false; - for (int i = 0; i < argc; ++i) - list_only |= std::string(argv[i]).find("--benchmark_list_tests") != - std::string::npos; - - benchmark::Initialize(&argc, argv); - - TestReporter test_reporter; - const size_t returned_count = - benchmark::RunSpecifiedBenchmarks(&test_reporter); - - if (argc == 2) { - // Make sure we ran all of the tests - std::stringstream ss(argv[1]); - size_t expected_return; - ss >> expected_return; - - if (returned_count != expected_return) { - std::cerr << "ERROR: Expected " << expected_return - << " tests to match the filter but returned_count = " - << returned_count << std::endl; - return -1; - } - - const size_t expected_reports = list_only ? 
0 : expected_return; - const size_t reports_count = test_reporter.GetCount(); - if (reports_count != expected_reports) { - std::cerr << "ERROR: Expected " << expected_reports - << " tests to be run but reported_count = " << reports_count - << std::endl; - return -1; - } - } - - return 0; -} diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/fixture_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/fixture_test.cc deleted file mode 100644 index bbc2f957902..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/fixture_test.cc +++ /dev/null @@ -1,49 +0,0 @@ - -#include "benchmark/benchmark.h" - -#include -#include - -class MyFixture : public ::benchmark::Fixture { - public: - void SetUp(const ::benchmark::State& state) { - if (state.thread_index == 0) { - assert(data.get() == nullptr); - data.reset(new int(42)); - } - } - - void TearDown(const ::benchmark::State& state) { - if (state.thread_index == 0) { - assert(data.get() != nullptr); - data.reset(); - } - } - - ~MyFixture() { assert(data == nullptr); } - - std::unique_ptr data; -}; - -BENCHMARK_F(MyFixture, Foo)(benchmark::State &st) { - assert(data.get() != nullptr); - assert(*data == 42); - while (st.KeepRunning()) { - } -} - -BENCHMARK_DEFINE_F(MyFixture, Bar)(benchmark::State& st) { - if (st.thread_index == 0) { - assert(data.get() != nullptr); - assert(*data == 42); - } - while (st.KeepRunning()) { - assert(data.get() != nullptr); - assert(*data == 42); - } - st.SetItemsProcessed(st.range(0)); -} -BENCHMARK_REGISTER_F(MyFixture, Bar)->Arg(42); -BENCHMARK_REGISTER_F(MyFixture, Bar)->Arg(42)->ThreadPerCpu(); - -BENCHMARK_MAIN() diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/map_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/map_test.cc deleted file mode 100644 index 83457c9981c..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/map_test.cc +++ /dev/null @@ -1,56 +0,0 @@ -#include "benchmark/benchmark.h" - -#include -#include - -namespace { - -std::map ConstructRandomMap(int size) { - std::map m; - for (int i = 0; i < size; ++i) { - m.insert(std::make_pair(rand() % size, rand() % size)); - } - return m; -} - -} // namespace - -// Basic version. -static void BM_MapLookup(benchmark::State& state) { - const int size = state.range(0); - while (state.KeepRunning()) { - state.PauseTiming(); - std::map m = ConstructRandomMap(size); - state.ResumeTiming(); - for (int i = 0; i < size; ++i) { - benchmark::DoNotOptimize(m.find(rand() % size)); - } - } - state.SetItemsProcessed(state.iterations() * size); -} -BENCHMARK(BM_MapLookup)->Range(1 << 3, 1 << 12); - -// Using fixtures. 
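// (Annotation, not part of the deleted file.) BENCHMARK_F defines and
// registers a fixture benchmark in one step with default settings; the
// BENCHMARK_DEFINE_F / BENCHMARK_REGISTER_F pair used below is the form that
// allows further configuration, e.g. ->Range(...) or ->ThreadPerCpu().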
-class MapFixture : public ::benchmark::Fixture { - public: - void SetUp(const ::benchmark::State& st) { - m = ConstructRandomMap(st.range(0)); - } - - void TearDown(const ::benchmark::State&) { m.clear(); } - - std::map m; -}; - -BENCHMARK_DEFINE_F(MapFixture, Lookup)(benchmark::State& state) { - const int size = state.range(0); - while (state.KeepRunning()) { - for (int i = 0; i < size; ++i) { - benchmark::DoNotOptimize(m.find(rand() % size)); - } - } - state.SetItemsProcessed(state.iterations() * size); -} -BENCHMARK_REGISTER_F(MapFixture, Lookup)->Range(1 << 3, 1 << 12); - -BENCHMARK_MAIN() diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/multiple_ranges_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/multiple_ranges_test.cc deleted file mode 100644 index 8e67b3b2a99..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/multiple_ranges_test.cc +++ /dev/null @@ -1,74 +0,0 @@ -#include "benchmark/benchmark.h" - -#include -#include - -class MultipleRangesFixture : public ::benchmark::Fixture { - public: - MultipleRangesFixture() - : expectedValues({{1, 3, 5}, - {1, 3, 8}, - {1, 3, 15}, - {2, 3, 5}, - {2, 3, 8}, - {2, 3, 15}, - {1, 4, 5}, - {1, 4, 8}, - {1, 4, 15}, - {2, 4, 5}, - {2, 4, 8}, - {2, 4, 15}, - {1, 7, 5}, - {1, 7, 8}, - {1, 7, 15}, - {2, 7, 5}, - {2, 7, 8}, - {2, 7, 15}, - {7, 6, 3}}) {} - - void SetUp(const ::benchmark::State& state) { - std::vector ranges = {state.range(0), state.range(1), state.range(2)}; - - assert(expectedValues.find(ranges) != expectedValues.end()); - - actualValues.insert(ranges); - } - - virtual ~MultipleRangesFixture() { - assert(actualValues.size() == expectedValues.size()); - } - - std::set> expectedValues; - std::set> actualValues; -}; - -BENCHMARK_DEFINE_F(MultipleRangesFixture, Empty)(benchmark::State& state) { - while (state.KeepRunning()) { - int product = state.range(0) * state.range(1) * state.range(2); - for (int x = 0; x < product; x++) { - benchmark::DoNotOptimize(x); - } - } -} - -BENCHMARK_REGISTER_F(MultipleRangesFixture, Empty) - ->RangeMultiplier(2) - ->Ranges({{1, 2}, {3, 7}, {5, 15}}) - ->Args({7, 6, 3}); - -void BM_CheckDefaultArgument(benchmark::State& state) { - // Test that the 'range()' without an argument is the same as 'range(0)'. 
- assert(state.range() == state.range(0)); - assert(state.range() != state.range(1)); - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_CheckDefaultArgument)->Ranges({{1, 5}, {6, 10}}); - -static void BM_MultipleRanges(benchmark::State& st) { - while (st.KeepRunning()) { - } -} -BENCHMARK(BM_MultipleRanges)->Ranges({{5, 5}, {6, 6}}); - -BENCHMARK_MAIN() diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/options_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/options_test.cc deleted file mode 100644 index 8eac068b977..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/options_test.cc +++ /dev/null @@ -1,65 +0,0 @@ -#include "benchmark/benchmark.h" -#include -#include - -#if defined(NDEBUG) -#undef NDEBUG -#endif -#include - -void BM_basic(benchmark::State& state) { - while (state.KeepRunning()) { - } -} - -void BM_basic_slow(benchmark::State& state) { - std::chrono::milliseconds sleep_duration(state.range(0)); - while (state.KeepRunning()) { - std::this_thread::sleep_for( - std::chrono::duration_cast(sleep_duration)); - } -} - -BENCHMARK(BM_basic); -BENCHMARK(BM_basic)->Arg(42); -BENCHMARK(BM_basic_slow)->Arg(10)->Unit(benchmark::kNanosecond); -BENCHMARK(BM_basic_slow)->Arg(100)->Unit(benchmark::kMicrosecond); -BENCHMARK(BM_basic_slow)->Arg(1000)->Unit(benchmark::kMillisecond); -BENCHMARK(BM_basic)->Range(1, 8); -BENCHMARK(BM_basic)->RangeMultiplier(2)->Range(1, 8); -BENCHMARK(BM_basic)->DenseRange(10, 15); -BENCHMARK(BM_basic)->Args({42, 42}); -BENCHMARK(BM_basic)->Ranges({{64, 512}, {64, 512}}); -BENCHMARK(BM_basic)->MinTime(0.7); -BENCHMARK(BM_basic)->UseRealTime(); -BENCHMARK(BM_basic)->ThreadRange(2, 4); -BENCHMARK(BM_basic)->ThreadPerCpu(); -BENCHMARK(BM_basic)->Repetitions(3); - -void CustomArgs(benchmark::internal::Benchmark* b) { - for (int i = 0; i < 10; ++i) { - b->Arg(i); - } -} - -BENCHMARK(BM_basic)->Apply(CustomArgs); - -void BM_explicit_iteration_count(benchmark::State& st) { - // Test that benchmarks specified with an explicit iteration count are - // only run once. - static bool invoked_before = false; - assert(!invoked_before); - invoked_before = true; - - // Test that the requested iteration count is respected. - assert(st.max_iterations == 42); - size_t actual_iterations = 0; - while (st.KeepRunning()) - ++actual_iterations; - assert(st.iterations() == st.max_iterations); - assert(st.iterations() == 42); - -} -BENCHMARK(BM_explicit_iteration_count)->Iterations(42); - -BENCHMARK_MAIN() diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/output_test.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/output_test.h deleted file mode 100644 index 897a13866ba..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/output_test.h +++ /dev/null @@ -1,201 +0,0 @@ -#ifndef TEST_OUTPUT_TEST_H -#define TEST_OUTPUT_TEST_H - -#undef NDEBUG -#include -#include -#include -#include -#include -#include -#include - -#include "../src/re.h" -#include "benchmark/benchmark.h" - -#define CONCAT2(x, y) x##y -#define CONCAT(x, y) CONCAT2(x, y) - -#define ADD_CASES(...) int CONCAT(dummy, __LINE__) = ::AddCases(__VA_ARGS__) - -#define SET_SUBSTITUTIONS(...) \ - int CONCAT(dummy, __LINE__) = ::SetSubstitutions(__VA_ARGS__) - -enum MatchRules { - MR_Default, // Skip non-matching lines until a match is found. - MR_Next, // Match must occur on the next line. 
- MR_Not // No line between the current position and the next match matches - // the regex -}; - -struct TestCase { - TestCase(std::string re, int rule = MR_Default); - - std::string regex_str; - int match_rule; - std::string substituted_regex; - std::shared_ptr regex; -}; - -enum TestCaseID { - TC_ConsoleOut, - TC_ConsoleErr, - TC_JSONOut, - TC_JSONErr, - TC_CSVOut, - TC_CSVErr, - - TC_NumID // PRIVATE -}; - -// Add a list of test cases to be run against the output specified by -// 'ID' -int AddCases(TestCaseID ID, std::initializer_list il); - -// Add or set a list of substitutions to be performed on constructed regex's -// See 'output_test_helper.cc' for a list of default substitutions. -int SetSubstitutions( - std::initializer_list> il); - -// Run all output tests. -void RunOutputTests(int argc, char* argv[]); - -// ========================================================================= // -// ------------------------- Results checking ------------------------------ // -// ========================================================================= // - -// Call this macro to register a benchmark for checking its results. This -// should be all that's needed. It subscribes a function to check the (CSV) -// results of a benchmark. This is done only after verifying that the output -// strings are really as expected. -// bm_name_pattern: a name or a regex pattern which will be matched against -// all the benchmark names. Matching benchmarks -// will be the subject of a call to checker_function -// checker_function: should be of type ResultsCheckFn (see below) -#define CHECK_BENCHMARK_RESULTS(bm_name_pattern, checker_function) \ - size_t CONCAT(dummy, __LINE__) = AddChecker(bm_name_pattern, checker_function) - -struct Results; -typedef std::function< void(Results const&) > ResultsCheckFn; - -size_t AddChecker(const char* bm_name_pattern, ResultsCheckFn fn); - -// Class holding the results of a benchmark. -// It is passed in calls to checker functions. -struct Results { - - // the benchmark name - std::string name; - // the benchmark fields - std::map< std::string, std::string > values; - - Results(const std::string& n) : name(n) {} - - int NumThreads() const; - - typedef enum { kCpuTime, kRealTime } BenchmarkTime; - - // get cpu_time or real_time in seconds - double GetTime(BenchmarkTime which) const; - - // get the real_time duration of the benchmark in seconds. - // it is better to use fuzzy float checks for this, as the float - // ASCII formatting is lossy. - double DurationRealTime() const { - return GetAs< double >("iterations") * GetTime(kRealTime); - } - // get the cpu_time duration of the benchmark in seconds - double DurationCPUTime() const { - return GetAs< double >("iterations") * GetTime(kCpuTime); - } - - // get the string for a result by name, or nullptr if the name - // is not found - const std::string* Get(const char* entry_name) const { - auto it = values.find(entry_name); - if(it == values.end()) return nullptr; - return &it->second; - } - - // get a result by name, parsed as a specific type. - // NOTE: for counters, use GetCounterAs instead. - template - T GetAs(const char* entry_name) const; - - // counters are written as doubles, so they have to be read first - // as a double, and only then converted to the asked type. 
- template - T GetCounterAs(const char* entry_name) const { - double dval = GetAs< double >(entry_name); - T tval = static_cast< T >(dval); - return tval; - } -}; - -template -T Results::GetAs(const char* entry_name) const { - auto *sv = Get(entry_name); - CHECK(sv != nullptr && !sv->empty()); - std::stringstream ss; - ss << *sv; - T out; - ss >> out; - CHECK(!ss.fail()); - return out; -} - -//---------------------------------- -// Macros to help in result checking. Do not use them with arguments causing -// side-effects. - -#define _CHECK_RESULT_VALUE(entry, getfn, var_type, var_name, relationship, value) \ - CONCAT(CHECK_, relationship) \ - (entry.getfn< var_type >(var_name), (value)) << "\n" \ - << __FILE__ << ":" << __LINE__ << ": " << (entry).name << ":\n" \ - << __FILE__ << ":" << __LINE__ << ": " \ - << "expected (" << #var_type << ")" << (var_name) \ - << "=" << (entry).getfn< var_type >(var_name) \ - << " to be " #relationship " to " << (value) << "\n" - -// check with tolerance. eps_factor is the tolerance window, which is -// interpreted relative to value (eg, 0.1 means 10% of value). -#define _CHECK_FLOAT_RESULT_VALUE(entry, getfn, var_type, var_name, relationship, value, eps_factor) \ - CONCAT(CHECK_FLOAT_, relationship) \ - (entry.getfn< var_type >(var_name), (value), (eps_factor) * (value)) << "\n" \ - << __FILE__ << ":" << __LINE__ << ": " << (entry).name << ":\n" \ - << __FILE__ << ":" << __LINE__ << ": " \ - << "expected (" << #var_type << ")" << (var_name) \ - << "=" << (entry).getfn< var_type >(var_name) \ - << " to be " #relationship " to " << (value) << "\n" \ - << __FILE__ << ":" << __LINE__ << ": " \ - << "with tolerance of " << (eps_factor) * (value) \ - << " (" << (eps_factor)*100. << "%), " \ - << "but delta was " << ((entry).getfn< var_type >(var_name) - (value)) \ - << " (" << (((entry).getfn< var_type >(var_name) - (value)) \ - / \ - ((value) > 1.e-5 || value < -1.e-5 ? value : 1.e-5)*100.) \ - << "%)" - -#define CHECK_RESULT_VALUE(entry, var_type, var_name, relationship, value) \ - _CHECK_RESULT_VALUE(entry, GetAs, var_type, var_name, relationship, value) - -#define CHECK_COUNTER_VALUE(entry, var_type, var_name, relationship, value) \ - _CHECK_RESULT_VALUE(entry, GetCounterAs, var_type, var_name, relationship, value) - -#define CHECK_FLOAT_RESULT_VALUE(entry, var_name, relationship, value, eps_factor) \ - _CHECK_FLOAT_RESULT_VALUE(entry, GetAs, double, var_name, relationship, value, eps_factor) - -#define CHECK_FLOAT_COUNTER_VALUE(entry, var_name, relationship, value, eps_factor) \ - _CHECK_FLOAT_RESULT_VALUE(entry, GetCounterAs, double, var_name, relationship, value, eps_factor) - -// ========================================================================= // -// --------------------------- Misc Utilities ------------------------------ // -// ========================================================================= // - -namespace { - -const char* const dec_re = "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?"; - -} // end namespace - -#endif // TEST_OUTPUT_TEST_H diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/output_test_helper.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/output_test_helper.cc deleted file mode 100644 index 24746f6d27f..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/output_test_helper.cc +++ /dev/null @@ -1,423 +0,0 @@ -#include -#include -#include -#include -#include - -#include "../src/check.h" // NOTE: check.h is for internal use only! 
-#include "../src/re.h" // NOTE: re.h is for internal use only -#include "output_test.h" -#include "../src/benchmark_api_internal.h" - -// ========================================================================= // -// ------------------------------ Internals -------------------------------- // -// ========================================================================= // -namespace internal { -namespace { - -using TestCaseList = std::vector; - -// Use a vector because the order elements are added matters during iteration. -// std::map/unordered_map don't guarantee that. -// For example: -// SetSubstitutions({{"%HelloWorld", "Hello"}, {"%Hello", "Hi"}}); -// Substitute("%HelloWorld") // Always expands to Hello. -using SubMap = std::vector>; - -TestCaseList& GetTestCaseList(TestCaseID ID) { - // Uses function-local statics to ensure initialization occurs - // before first use. - static TestCaseList lists[TC_NumID]; - return lists[ID]; -} - -SubMap& GetSubstitutions() { - // Don't use 'dec_re' from header because it may not yet be initialized. - static std::string safe_dec_re = "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?"; - static SubMap map = { - {"%float", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?"}, - // human-readable float - {"%hrfloat", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?[kMGTPEZYmunpfazy]?"}, - {"%int", "[ ]*[0-9]+"}, - {" %s ", "[ ]+"}, - {"%time", "[ ]*[0-9]{1,5} ns"}, - {"%console_report", "[ ]*[0-9]{1,5} ns [ ]*[0-9]{1,5} ns [ ]*[0-9]+"}, - {"%console_us_report", "[ ]*[0-9] us [ ]*[0-9] us [ ]*[0-9]+"}, - {"%csv_header", - "name,iterations,real_time,cpu_time,time_unit,bytes_per_second," - "items_per_second,label,error_occurred,error_message"}, - {"%csv_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns,,,,,"}, - {"%csv_us_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",us,,,,,"}, - {"%csv_bytes_report", - "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns," + safe_dec_re + ",,,,"}, - {"%csv_items_report", - "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns,," + safe_dec_re + ",,,"}, - {"%csv_bytes_items_report", - "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns," + safe_dec_re + - "," + safe_dec_re + ",,,"}, - {"%csv_label_report_begin", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns,,,"}, - {"%csv_label_report_end", ",,"}}; - return map; -} - -std::string PerformSubstitutions(std::string source) { - SubMap const& subs = GetSubstitutions(); - using SizeT = std::string::size_type; - for (auto const& KV : subs) { - SizeT pos; - SizeT next_start = 0; - while ((pos = source.find(KV.first, next_start)) != std::string::npos) { - next_start = pos + KV.second.size(); - source.replace(pos, KV.first.size(), KV.second); - } - } - return source; -} - -void CheckCase(std::stringstream& remaining_output, TestCase const& TC, - TestCaseList const& not_checks) { - std::string first_line; - bool on_first = true; - std::string line; - while (remaining_output.eof() == false) { - CHECK(remaining_output.good()); - std::getline(remaining_output, line); - if (on_first) { - first_line = line; - on_first = false; - } - for (const auto& NC : not_checks) { - CHECK(!NC.regex->Match(line)) - << "Unexpected match for line \"" << line << "\" for MR_Not regex \"" - << NC.regex_str << "\"" - << "\n actual regex string \"" << TC.substituted_regex << "\"" - << "\n started matching near: " << first_line; - } - if (TC.regex->Match(line)) return; - CHECK(TC.match_rule != MR_Next) - << "Expected line \"" << line << "\" to match regex \"" << TC.regex_str - << "\"" - << "\n actual regex string \"" << 
TC.substituted_regex << "\"" - << "\n started matching near: " << first_line; - } - CHECK(remaining_output.eof() == false) - << "End of output reached before match for regex \"" << TC.regex_str - << "\" was found" - << "\n actual regex string \"" << TC.substituted_regex << "\"" - << "\n started matching near: " << first_line; -} - -void CheckCases(TestCaseList const& checks, std::stringstream& output) { - std::vector not_checks; - for (size_t i = 0; i < checks.size(); ++i) { - const auto& TC = checks[i]; - if (TC.match_rule == MR_Not) { - not_checks.push_back(TC); - continue; - } - CheckCase(output, TC, not_checks); - not_checks.clear(); - } -} - -class TestReporter : public benchmark::BenchmarkReporter { - public: - TestReporter(std::vector reps) - : reporters_(reps) {} - - virtual bool ReportContext(const Context& context) { - bool last_ret = false; - bool first = true; - for (auto rep : reporters_) { - bool new_ret = rep->ReportContext(context); - CHECK(first || new_ret == last_ret) - << "Reports return different values for ReportContext"; - first = false; - last_ret = new_ret; - } - (void)first; - return last_ret; - } - - void ReportRuns(const std::vector& report) { - for (auto rep : reporters_) rep->ReportRuns(report); - } - void Finalize() { - for (auto rep : reporters_) rep->Finalize(); - } - - private: - std::vector reporters_; -}; -} - -} // end namespace internal - -// ========================================================================= // -// -------------------------- Results checking ----------------------------- // -// ========================================================================= // - -namespace internal { - -// Utility class to manage subscribers for checking benchmark results. -// It works by parsing the CSV output to read the results. 
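// (Illustrative sketch, not part of the deleted file.) A typical subscriber,
// registered through the macro declared in output_test.h; the benchmark name
// and counter are hypothetical:
//
//   void CheckFooRun(Results const& r) {
//     CHECK_COUNTER_VALUE(r, int, "Foo", EQ, 2);
//   }
//   CHECK_BENCHMARK_RESULTS("BM_Counters_Simple", &CheckFooRun);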
-class ResultsChecker { - public: - - struct PatternAndFn : public TestCase { // reusing TestCase for its regexes - PatternAndFn(const std::string& rx, ResultsCheckFn fn_) - : TestCase(rx), fn(fn_) {} - ResultsCheckFn fn; - }; - - std::vector< PatternAndFn > check_patterns; - std::vector< Results > results; - std::vector< std::string > field_names; - - void Add(const std::string& entry_pattern, ResultsCheckFn fn); - - void CheckResults(std::stringstream& output); - - private: - - void SetHeader_(const std::string& csv_header); - void SetValues_(const std::string& entry_csv_line); - - std::vector< std::string > SplitCsv_(const std::string& line); - -}; - -// store the static ResultsChecker in a function to prevent initialization -// order problems -ResultsChecker& GetResultsChecker() { - static ResultsChecker rc; - return rc; -} - -// add a results checker for a benchmark -void ResultsChecker::Add(const std::string& entry_pattern, ResultsCheckFn fn) { - check_patterns.emplace_back(entry_pattern, fn); -} - -// check the results of all subscribed benchmarks -void ResultsChecker::CheckResults(std::stringstream& output) { - // first reset the stream to the start - { - auto start = std::ios::streampos(0); - // clear before calling tellg() - output.clear(); - // seek to zero only when needed - if(output.tellg() > start) output.seekg(start); - // and just in case - output.clear(); - } - // now go over every line and publish it to the ResultsChecker - std::string line; - bool on_first = true; - while (output.eof() == false) { - CHECK(output.good()); - std::getline(output, line); - if (on_first) { - SetHeader_(line); // this is important - on_first = false; - continue; - } - SetValues_(line); - } - // finally we can call the subscribed check functions - for(const auto& p : check_patterns) { - VLOG(2) << "--------------------------------\n"; - VLOG(2) << "checking for benchmarks matching " << p.regex_str << "...\n"; - for(const auto& r : results) { - if(!p.regex->Match(r.name)) { - VLOG(2) << p.regex_str << " is not matched by " << r.name << "\n"; - continue; - } else { - VLOG(2) << p.regex_str << " is matched by " << r.name << "\n"; - } - VLOG(1) << "Checking results of " << r.name << ": ... 
\n"; - p.fn(r); - VLOG(1) << "Checking results of " << r.name << ": OK.\n"; - } - } -} - -// prepare for the names in this header -void ResultsChecker::SetHeader_(const std::string& csv_header) { - field_names = SplitCsv_(csv_header); -} - -// set the values for a benchmark -void ResultsChecker::SetValues_(const std::string& entry_csv_line) { - if(entry_csv_line.empty()) return; // some lines are empty - CHECK(!field_names.empty()); - auto vals = SplitCsv_(entry_csv_line); - CHECK_EQ(vals.size(), field_names.size()); - results.emplace_back(vals[0]); // vals[0] is the benchmark name - auto &entry = results.back(); - for (size_t i = 1, e = vals.size(); i < e; ++i) { - entry.values[field_names[i]] = vals[i]; - } -} - -// a quick'n'dirty csv splitter (eliminating quotes) -std::vector< std::string > ResultsChecker::SplitCsv_(const std::string& line) { - std::vector< std::string > out; - if(line.empty()) return out; - if(!field_names.empty()) out.reserve(field_names.size()); - size_t prev = 0, pos = line.find_first_of(','), curr = pos; - while(pos != line.npos) { - CHECK(curr > 0); - if(line[prev] == '"') ++prev; - if(line[curr-1] == '"') --curr; - out.push_back(line.substr(prev, curr-prev)); - prev = pos + 1; - pos = line.find_first_of(',', pos + 1); - curr = pos; - } - curr = line.size(); - if(line[prev] == '"') ++prev; - if(line[curr-1] == '"') --curr; - out.push_back(line.substr(prev, curr-prev)); - return out; -} - -} // end namespace internal - -size_t AddChecker(const char* bm_name, ResultsCheckFn fn) -{ - auto &rc = internal::GetResultsChecker(); - rc.Add(bm_name, fn); - return rc.results.size(); -} - -int Results::NumThreads() const { - auto pos = name.find("/threads:"); - if(pos == name.npos) return 1; - auto end = name.find('/', pos + 9); - std::stringstream ss; - ss << name.substr(pos + 9, end); - int num = 1; - ss >> num; - CHECK(!ss.fail()); - return num; -} - -double Results::GetTime(BenchmarkTime which) const { - CHECK(which == kCpuTime || which == kRealTime); - const char *which_str = which == kCpuTime ? 
"cpu_time" : "real_time"; - double val = GetAs< double >(which_str); - auto unit = Get("time_unit"); - CHECK(unit); - if(*unit == "ns") { - return val * 1.e-9; - } else if(*unit == "us") { - return val * 1.e-6; - } else if(*unit == "ms") { - return val * 1.e-3; - } else if(*unit == "s") { - return val; - } else { - CHECK(1 == 0) << "unknown time unit: " << *unit; - return 0; - } -} - -// ========================================================================= // -// -------------------------- Public API Definitions------------------------ // -// ========================================================================= // - -TestCase::TestCase(std::string re, int rule) - : regex_str(std::move(re)), - match_rule(rule), - substituted_regex(internal::PerformSubstitutions(regex_str)), - regex(std::make_shared()) { - std::string err_str; - regex->Init(substituted_regex,& err_str); - CHECK(err_str.empty()) << "Could not construct regex \"" << substituted_regex - << "\"" - << "\n originally \"" << regex_str << "\"" - << "\n got error: " << err_str; -} - -int AddCases(TestCaseID ID, std::initializer_list il) { - auto& L = internal::GetTestCaseList(ID); - L.insert(L.end(), il); - return 0; -} - -int SetSubstitutions( - std::initializer_list> il) { - auto& subs = internal::GetSubstitutions(); - for (auto KV : il) { - bool exists = false; - KV.second = internal::PerformSubstitutions(KV.second); - for (auto& EKV : subs) { - if (EKV.first == KV.first) { - EKV.second = std::move(KV.second); - exists = true; - break; - } - } - if (!exists) subs.push_back(std::move(KV)); - } - return 0; -} - -void RunOutputTests(int argc, char* argv[]) { - using internal::GetTestCaseList; - benchmark::Initialize(&argc, argv); - auto options = benchmark::internal::GetOutputOptions(/*force_no_color*/true); - benchmark::ConsoleReporter CR(options); - benchmark::JSONReporter JR; - benchmark::CSVReporter CSVR; - struct ReporterTest { - const char* name; - std::vector& output_cases; - std::vector& error_cases; - benchmark::BenchmarkReporter& reporter; - std::stringstream out_stream; - std::stringstream err_stream; - - ReporterTest(const char* n, std::vector& out_tc, - std::vector& err_tc, - benchmark::BenchmarkReporter& br) - : name(n), output_cases(out_tc), error_cases(err_tc), reporter(br) { - reporter.SetOutputStream(&out_stream); - reporter.SetErrorStream(&err_stream); - } - } TestCases[] = { - {"ConsoleReporter", GetTestCaseList(TC_ConsoleOut), - GetTestCaseList(TC_ConsoleErr), CR}, - {"JSONReporter", GetTestCaseList(TC_JSONOut), GetTestCaseList(TC_JSONErr), - JR}, - {"CSVReporter", GetTestCaseList(TC_CSVOut), GetTestCaseList(TC_CSVErr), - CSVR}, - }; - - // Create the test reporter and run the benchmarks. - std::cout << "Running benchmarks...\n"; - internal::TestReporter test_rep({&CR, &JR, &CSVR}); - benchmark::RunSpecifiedBenchmarks(&test_rep); - - for (auto& rep_test : TestCases) { - std::string msg = std::string("\nTesting ") + rep_test.name + " Output\n"; - std::string banner(msg.size() - 1, '-'); - std::cout << banner << msg << banner << "\n"; - - std::cerr << rep_test.err_stream.str(); - std::cout << rep_test.out_stream.str(); - - internal::CheckCases(rep_test.error_cases, rep_test.err_stream); - internal::CheckCases(rep_test.output_cases, rep_test.out_stream); - - std::cout << "\n"; - } - - // now that we know the output is as expected, we can dispatch - // the checks to subscribees. 
- auto &csv = TestCases[2]; - // would use == but gcc spits a warning - CHECK(std::strcmp(csv.name, "CSVReporter") == 0); - internal::GetResultsChecker().CheckResults(csv.out_stream); -} diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/register_benchmark_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/register_benchmark_test.cc deleted file mode 100644 index 2769b7a6b63..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/register_benchmark_test.cc +++ /dev/null @@ -1,182 +0,0 @@ - -#undef NDEBUG -#include -#include - -#include "../src/check.h" // NOTE: check.h is for internal use only! -#include "benchmark/benchmark.h" - -namespace { - -class TestReporter : public benchmark::ConsoleReporter { - public: - virtual void ReportRuns(const std::vector& report) { - all_runs_.insert(all_runs_.end(), begin(report), end(report)); - ConsoleReporter::ReportRuns(report); - } - - std::vector all_runs_; -}; - -struct TestCase { - std::string name; - const char* label; - // Note: not explicit as we rely on it being converted through ADD_CASES. - TestCase(const char* xname) : TestCase(xname, nullptr) {} - TestCase(const char* xname, const char* xlabel) - : name(xname), label(xlabel) {} - - typedef benchmark::BenchmarkReporter::Run Run; - - void CheckRun(Run const& run) const { - CHECK(name == run.benchmark_name) << "expected " << name << " got " - << run.benchmark_name; - if (label) { - CHECK(run.report_label == label) << "expected " << label << " got " - << run.report_label; - } else { - CHECK(run.report_label == ""); - } - } -}; - -std::vector ExpectedResults; - -int AddCases(std::initializer_list const& v) { - for (auto N : v) { - ExpectedResults.push_back(N); - } - return 0; -} - -#define CONCAT(x, y) CONCAT2(x, y) -#define CONCAT2(x, y) x##y -#define ADD_CASES(...) int CONCAT(dummy, __LINE__) = AddCases({__VA_ARGS__}) - -} // end namespace - -typedef benchmark::internal::Benchmark* ReturnVal; - -//----------------------------------------------------------------------------// -// Test RegisterBenchmark with no additional arguments -//----------------------------------------------------------------------------// -void BM_function(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_function); -ReturnVal dummy = benchmark::RegisterBenchmark( - "BM_function_manual_registration", BM_function); -ADD_CASES({"BM_function"}, {"BM_function_manual_registration"}); - -//----------------------------------------------------------------------------// -// Test RegisterBenchmark with additional arguments -// Note: GCC <= 4.8 do not support this form of RegisterBenchmark because they -// reject the variadic pack expansion of lambda captures. 
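// (Illustrative sketch, not part of the deleted file.) The variadic overload
// forwards trailing arguments to the benchmark function at run time, e.g.:
//
//   benchmark::RegisterBenchmark("test4", &BM_extra_args, "Four");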
-//----------------------------------------------------------------------------// -#ifndef BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK - -void BM_extra_args(benchmark::State& st, const char* label) { - while (st.KeepRunning()) { - } - st.SetLabel(label); -} -int RegisterFromFunction() { - std::pair cases[] = { - {"test1", "One"}, {"test2", "Two"}, {"test3", "Three"}}; - for (auto const& c : cases) - benchmark::RegisterBenchmark(c.first, &BM_extra_args, c.second); - return 0; -} -int dummy2 = RegisterFromFunction(); -ADD_CASES({"test1", "One"}, {"test2", "Two"}, {"test3", "Three"}); - -#endif // BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK - -//----------------------------------------------------------------------------// -// Test RegisterBenchmark with different callable types -//----------------------------------------------------------------------------// - -struct CustomFixture { - void operator()(benchmark::State& st) { - while (st.KeepRunning()) { - } - } -}; - -void TestRegistrationAtRuntime() { -#ifdef BENCHMARK_HAS_CXX11 - { - CustomFixture fx; - benchmark::RegisterBenchmark("custom_fixture", fx); - AddCases({"custom_fixture"}); - } -#endif -#ifndef BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK - { - const char* x = "42"; - auto capturing_lam = [=](benchmark::State& st) { - while (st.KeepRunning()) { - } - st.SetLabel(x); - }; - benchmark::RegisterBenchmark("lambda_benchmark", capturing_lam); - AddCases({{"lambda_benchmark", x}}); - } -#endif -} - -// Test that all benchmarks, registered at either during static init or runtime, -// are run and the results are passed to the reported. -void RunTestOne() { - TestRegistrationAtRuntime(); - - TestReporter test_reporter; - benchmark::RunSpecifiedBenchmarks(&test_reporter); - - typedef benchmark::BenchmarkReporter::Run Run; - auto EB = ExpectedResults.begin(); - - for (Run const& run : test_reporter.all_runs_) { - assert(EB != ExpectedResults.end()); - EB->CheckRun(run); - ++EB; - } - assert(EB == ExpectedResults.end()); -} - -// Test that ClearRegisteredBenchmarks() clears all previously registered -// benchmarks. -// Also test that new benchmarks can be registered and ran afterwards. 
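// (Illustrative sketch, not part of the deleted file.) The behavior exercised
// below, in miniature:
//
//   benchmark::ClearRegisteredBenchmarks();
//   assert(benchmark::RunSpecifiedBenchmarks() == 0);  // nothing left to run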
-void RunTestTwo() { - assert(ExpectedResults.size() != 0 && - "must have at least one registered benchmark"); - ExpectedResults.clear(); - benchmark::ClearRegisteredBenchmarks(); - - TestReporter test_reporter; - size_t num_ran = benchmark::RunSpecifiedBenchmarks(&test_reporter); - assert(num_ran == 0); - assert(test_reporter.all_runs_.begin() == test_reporter.all_runs_.end()); - - TestRegistrationAtRuntime(); - num_ran = benchmark::RunSpecifiedBenchmarks(&test_reporter); - assert(num_ran == ExpectedResults.size()); - - typedef benchmark::BenchmarkReporter::Run Run; - auto EB = ExpectedResults.begin(); - - for (Run const& run : test_reporter.all_runs_) { - assert(EB != ExpectedResults.end()); - EB->CheckRun(run); - ++EB; - } - assert(EB == ExpectedResults.end()); -} - -int main(int argc, char* argv[]) { - benchmark::Initialize(&argc, argv); - - RunTestOne(); - RunTestTwo(); -} diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/reporter_output_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/reporter_output_test.cc deleted file mode 100644 index 4a481433485..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/reporter_output_test.cc +++ /dev/null @@ -1,256 +0,0 @@ - -#undef NDEBUG -#include <utility> - -#include "benchmark/benchmark.h" -#include "output_test.h" - -// ========================================================================= // -// ---------------------- Testing Prologue Output -------------------------- // -// ========================================================================= // - -ADD_CASES(TC_ConsoleOut, - {{"^[-]+$", MR_Next}, - {"^Benchmark %s Time %s CPU %s Iterations$", MR_Next}, - {"^[-]+$", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"%csv_header"}}); - -// ========================================================================= // -// ------------------------ Testing Basic Output --------------------------- // -// ========================================================================= // - -void BM_basic(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_basic); - -ADD_CASES(TC_ConsoleOut, {{"^BM_basic %console_report$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_basic\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\"$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_basic\",%csv_report$"}}); - -// ========================================================================= // -// ------------------------ Testing Bytes per Second Output ---------------- // -// ========================================================================= // - -void BM_bytes_per_second(benchmark::State& state) { - while (state.KeepRunning()) { - } - state.SetBytesProcessed(1); -} -BENCHMARK(BM_bytes_per_second); - -ADD_CASES(TC_ConsoleOut, - {{"^BM_bytes_per_second %console_report +%floatB/s$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_bytes_per_second\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"bytes_per_second\": %int$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_bytes_per_second\",%csv_bytes_report$"}}); - -// ========================================================================= // -// ------------------------ Testing Items per Second Output ---------------- // -// 
========================================================================= // - -void BM_items_per_second(benchmark::State& state) { - while (state.KeepRunning()) { - } - state.SetItemsProcessed(1); -} -BENCHMARK(BM_items_per_second); - -ADD_CASES(TC_ConsoleOut, - {{"^BM_items_per_second %console_report +%float items/s$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_items_per_second\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"items_per_second\": %int$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_items_per_second\",%csv_items_report$"}}); - -// ========================================================================= // -// ------------------------ Testing Label Output --------------------------- // -// ========================================================================= // - -void BM_label(benchmark::State& state) { - while (state.KeepRunning()) { - } - state.SetLabel("some label"); -} -BENCHMARK(BM_label); - -ADD_CASES(TC_ConsoleOut, {{"^BM_label %console_report some label$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_label\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"label\": \"some label\"$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_label\",%csv_label_report_begin\"some " - "label\"%csv_label_report_end$"}}); - -// ========================================================================= // -// ------------------------ Testing Error Output --------------------------- // -// ========================================================================= // - -void BM_error(benchmark::State& state) { - state.SkipWithError("message"); - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_error); -ADD_CASES(TC_ConsoleOut, {{"^BM_error[ ]+ERROR OCCURRED: 'message'$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_error\",$"}, - {"\"error_occurred\": true,$", MR_Next}, - {"\"error_message\": \"message\",$", MR_Next}}); - -ADD_CASES(TC_CSVOut, {{"^\"BM_error\",,,,,,,,true,\"message\"$"}}); - -// ========================================================================= // -// ------------------------ Testing No Arg Name Output ----------------------- -// // -// ========================================================================= // - -void BM_no_arg_name(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_no_arg_name)->Arg(3); -ADD_CASES(TC_ConsoleOut, {{"^BM_no_arg_name/3 %console_report$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_no_arg_name/3\",$"}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_no_arg_name/3\",%csv_report$"}}); - -// ========================================================================= // -// ------------------------ Testing Arg Name Output ----------------------- // -// ========================================================================= // - -void BM_arg_name(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_arg_name)->ArgName("first")->Arg(3); -ADD_CASES(TC_ConsoleOut, {{"^BM_arg_name/first:3 %console_report$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_arg_name/first:3\",$"}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_arg_name/first:3\",%csv_report$"}}); - -// ========================================================================= // -// ------------------------ Testing Arg Names Output ----------------------- // -// 
========================================================================= // - -void BM_arg_names(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_arg_names)->Args({2, 5, 4})->ArgNames({"first", "", "third"}); -ADD_CASES(TC_ConsoleOut, - {{"^BM_arg_names/first:2/5/third:4 %console_report$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_arg_names/first:2/5/third:4\",$"}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_arg_names/first:2/5/third:4\",%csv_report$"}}); - -// ========================================================================= // -// ----------------------- Testing Complexity Output ----------------------- // -// ========================================================================= // - -void BM_Complexity_O1(benchmark::State& state) { - while (state.KeepRunning()) { - } - state.SetComplexityN(state.range(0)); -} -BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity(benchmark::o1); -SET_SUBSTITUTIONS({{"%bigOStr", "[ ]* %float \\([0-9]+\\)"}, - {"%RMS", "[ ]*[0-9]+ %"}}); -ADD_CASES(TC_ConsoleOut, {{"^BM_Complexity_O1_BigO %bigOStr %bigOStr[ ]*$"}, - {"^BM_Complexity_O1_RMS %RMS %RMS[ ]*$"}}); - -// ========================================================================= // -// ----------------------- Testing Aggregate Output ------------------------ // -// ========================================================================= // - -// Test that non-aggregate data is printed by default -void BM_Repeat(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_Repeat)->Repetitions(3); -ADD_CASES(TC_ConsoleOut, {{"^BM_Repeat/repeats:3 %console_report$"}, - {"^BM_Repeat/repeats:3 %console_report$"}, - {"^BM_Repeat/repeats:3 %console_report$"}, - {"^BM_Repeat/repeats:3_mean %console_report$"}, - {"^BM_Repeat/repeats:3_stddev %console_report$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:3\",$"}, - {"\"name\": \"BM_Repeat/repeats:3\",$"}, - {"\"name\": \"BM_Repeat/repeats:3\",$"}, - {"\"name\": \"BM_Repeat/repeats:3_mean\",$"}, - {"\"name\": \"BM_Repeat/repeats:3_stddev\",$"}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:3\",%csv_report$"}, - {"^\"BM_Repeat/repeats:3\",%csv_report$"}, - {"^\"BM_Repeat/repeats:3\",%csv_report$"}, - {"^\"BM_Repeat/repeats:3_mean\",%csv_report$"}, - {"^\"BM_Repeat/repeats:3_stddev\",%csv_report$"}}); - -// Test that a non-repeated test still prints non-aggregate results even when -// only-aggregate reports have been requested -void BM_RepeatOnce(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_RepeatOnce)->Repetitions(1)->ReportAggregatesOnly(); -ADD_CASES(TC_ConsoleOut, {{"^BM_RepeatOnce/repeats:1 %console_report$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_RepeatOnce/repeats:1\",$"}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_RepeatOnce/repeats:1\",%csv_report$"}}); - -// Test that non-aggregate data is not reported -void BM_SummaryRepeat(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_SummaryRepeat)->Repetitions(3)->ReportAggregatesOnly(); -ADD_CASES(TC_ConsoleOut, - {{".*BM_SummaryRepeat/repeats:3 ", MR_Not}, - {"^BM_SummaryRepeat/repeats:3_mean %console_report$"}, - {"^BM_SummaryRepeat/repeats:3_stddev %console_report$"}}); -ADD_CASES(TC_JSONOut, {{".*BM_SummaryRepeat/repeats:3 ", MR_Not}, - {"\"name\": \"BM_SummaryRepeat/repeats:3_mean\",$"}, - {"\"name\": \"BM_SummaryRepeat/repeats:3_stddev\",$"}}); -ADD_CASES(TC_CSVOut, {{".*BM_SummaryRepeat/repeats:3 ", MR_Not}, - {"^\"BM_SummaryRepeat/repeats:3_mean\",%csv_report$"}, - 
{"^\"BM_SummaryRepeat/repeats:3_stddev\",%csv_report$"}}); - -void BM_RepeatTimeUnit(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_RepeatTimeUnit) - ->Repetitions(3) - ->ReportAggregatesOnly() - ->Unit(benchmark::kMicrosecond); -ADD_CASES(TC_ConsoleOut, - {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not}, - {"^BM_RepeatTimeUnit/repeats:3_mean %console_us_report$"}, - {"^BM_RepeatTimeUnit/repeats:3_stddev %console_us_report$"}}); -ADD_CASES(TC_JSONOut, {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not}, - {"\"name\": \"BM_RepeatTimeUnit/repeats:3_mean\",$"}, - {"\"time_unit\": \"us\",?$"}, - {"\"name\": \"BM_RepeatTimeUnit/repeats:3_stddev\",$"}, - {"\"time_unit\": \"us\",?$"}}); -ADD_CASES(TC_CSVOut, - {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not}, - {"^\"BM_RepeatTimeUnit/repeats:3_mean\",%csv_us_report$"}, - {"^\"BM_RepeatTimeUnit/repeats:3_stddev\",%csv_us_report$"}}); - -// ========================================================================= // -// --------------------------- TEST CASES END ------------------------------ // -// ========================================================================= // - -int main(int argc, char* argv[]) { RunOutputTests(argc, argv); } diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/skip_with_error_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/skip_with_error_test.cc deleted file mode 100644 index b74d33c5899..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/skip_with_error_test.cc +++ /dev/null @@ -1,150 +0,0 @@ - -#undef NDEBUG -#include -#include - -#include "../src/check.h" // NOTE: check.h is for internal use only! -#include "benchmark/benchmark.h" - -namespace { - -class TestReporter : public benchmark::ConsoleReporter { - public: - virtual bool ReportContext(const Context& context) { - return ConsoleReporter::ReportContext(context); - }; - - virtual void ReportRuns(const std::vector& report) { - all_runs_.insert(all_runs_.end(), begin(report), end(report)); - ConsoleReporter::ReportRuns(report); - } - - TestReporter() {} - virtual ~TestReporter() {} - - mutable std::vector all_runs_; -}; - -struct TestCase { - std::string name; - bool error_occurred; - std::string error_message; - - typedef benchmark::BenchmarkReporter::Run Run; - - void CheckRun(Run const& run) const { - CHECK(name == run.benchmark_name) << "expected " << name << " got " - << run.benchmark_name; - CHECK(error_occurred == run.error_occurred); - CHECK(error_message == run.error_message); - if (error_occurred) { - // CHECK(run.iterations == 0); - } else { - CHECK(run.iterations != 0); - } - } -}; - -std::vector ExpectedResults; - -int AddCases(const char* base_name, std::initializer_list const& v) { - for (auto TC : v) { - TC.name = base_name + TC.name; - ExpectedResults.push_back(std::move(TC)); - } - return 0; -} - -#define CONCAT(x, y) CONCAT2(x, y) -#define CONCAT2(x, y) x##y -#define ADD_CASES(...) 
int CONCAT(dummy, __LINE__) = AddCases(__VA_ARGS__) - -} // end namespace - -void BM_error_before_running(benchmark::State& state) { - state.SkipWithError("error message"); - while (state.KeepRunning()) { - assert(false); - } -} -BENCHMARK(BM_error_before_running); -ADD_CASES("BM_error_before_running", {{"", true, "error message"}}); - -void BM_error_during_running(benchmark::State& state) { - int first_iter = true; - while (state.KeepRunning()) { - if (state.range(0) == 1 && state.thread_index <= (state.threads / 2)) { - assert(first_iter); - first_iter = false; - state.SkipWithError("error message"); - } else { - state.PauseTiming(); - state.ResumeTiming(); - } - } -} -BENCHMARK(BM_error_during_running)->Arg(1)->Arg(2)->ThreadRange(1, 8); -ADD_CASES("BM_error_during_running", {{"/1/threads:1", true, "error message"}, - {"/1/threads:2", true, "error message"}, - {"/1/threads:4", true, "error message"}, - {"/1/threads:8", true, "error message"}, - {"/2/threads:1", false, ""}, - {"/2/threads:2", false, ""}, - {"/2/threads:4", false, ""}, - {"/2/threads:8", false, ""}}); - -void BM_error_after_running(benchmark::State& state) { - while (state.KeepRunning()) { - benchmark::DoNotOptimize(state.iterations()); - } - if (state.thread_index <= (state.threads / 2)) - state.SkipWithError("error message"); -} -BENCHMARK(BM_error_after_running)->ThreadRange(1, 8); -ADD_CASES("BM_error_after_running", {{"/threads:1", true, "error message"}, - {"/threads:2", true, "error message"}, - {"/threads:4", true, "error message"}, - {"/threads:8", true, "error message"}}); - -void BM_error_while_paused(benchmark::State& state) { - bool first_iter = true; - while (state.KeepRunning()) { - if (state.range(0) == 1 && state.thread_index <= (state.threads / 2)) { - assert(first_iter); - first_iter = false; - state.PauseTiming(); - state.SkipWithError("error message"); - } else { - state.PauseTiming(); - state.ResumeTiming(); - } - } -} -BENCHMARK(BM_error_while_paused)->Arg(1)->Arg(2)->ThreadRange(1, 8); -ADD_CASES("BM_error_while_paused", {{"/1/threads:1", true, "error message"}, - {"/1/threads:2", true, "error message"}, - {"/1/threads:4", true, "error message"}, - {"/1/threads:8", true, "error message"}, - {"/2/threads:1", false, ""}, - {"/2/threads:2", false, ""}, - {"/2/threads:4", false, ""}, - {"/2/threads:8", false, ""}}); - -int main(int argc, char* argv[]) { - benchmark::Initialize(&argc, argv); - - TestReporter test_reporter; - benchmark::RunSpecifiedBenchmarks(&test_reporter); - - typedef benchmark::BenchmarkReporter::Run Run; - auto EB = ExpectedResults.begin(); - - for (Run const& run : test_reporter.all_runs_) { - assert(EB != ExpectedResults.end()); - EB->CheckRun(run); - ++EB; - } - assert(EB == ExpectedResults.end()); - - return 0; -} diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/user_counters_tabular_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/user_counters_tabular_test.cc deleted file mode 100644 index 5fc5b4d9b88..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/user_counters_tabular_test.cc +++ /dev/null @@ -1,250 +0,0 @@ - -#undef NDEBUG - -#include "benchmark/benchmark.h" -#include "output_test.h" - -// @todo: this checks the full output at once; the rule for -// CounterSet1 was failing because it was not matching "^[-]+$". -// @todo: check that the counters are vertically aligned. 
-ADD_CASES(TC_ConsoleOut, { -// keeping these lines long improves readability, so: -// clang-format off - {"^[-]+$", MR_Next}, - {"^Benchmark %s Time %s CPU %s Iterations %s Bar %s Bat %s Baz %s Foo %s Frob %s Lob$", MR_Next}, - {"^[-]+$", MR_Next}, - {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next}, - {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next}, - {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next}, - {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next}, - {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next}, - {"^[-]+$", MR_Next}, - {"^Benchmark %s Time %s CPU %s Iterations %s Bar %s Baz %s Foo$", MR_Next}, - {"^[-]+$", MR_Next}, - {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^[-]+$", MR_Next}, - {"^Benchmark %s Time %s CPU %s Iterations %s Bat %s Baz %s Foo$", MR_Next}, - {"^[-]+$", MR_Next}, - {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, 
- {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$"}, -// clang-format on -}); -ADD_CASES(TC_CSVOut, {{"%csv_header," - "\"Bar\",\"Bat\",\"Baz\",\"Foo\",\"Frob\",\"Lob\""}}); - -// ========================================================================= // -// ------------------------- Tabular Counters Output ----------------------- // -// ========================================================================= // - -void BM_Counters_Tabular(benchmark::State& state) { - while (state.KeepRunning()) { - } - namespace bm = benchmark; - state.counters.insert({ - {"Foo", { 1, bm::Counter::kAvgThreads}}, - {"Bar", { 2, bm::Counter::kAvgThreads}}, - {"Baz", { 4, bm::Counter::kAvgThreads}}, - {"Bat", { 8, bm::Counter::kAvgThreads}}, - {"Frob", {16, bm::Counter::kAvgThreads}}, - {"Lob", {32, bm::Counter::kAvgThreads}}, - }); -} -BENCHMARK(BM_Counters_Tabular)->ThreadRange(1, 16); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Tabular/threads:%int\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"Bar\": %float,$", MR_Next}, - {"\"Bat\": %float,$", MR_Next}, - {"\"Baz\": %float,$", MR_Next}, - {"\"Foo\": %float,$", MR_Next}, - {"\"Frob\": %float,$", MR_Next}, - {"\"Lob\": %float$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Tabular/threads:%int\",%csv_report," - "%float,%float,%float,%float,%float,%float$"}}); -// VS2013 does not allow this function to be passed as a lambda argument -// to CHECK_BENCHMARK_RESULTS() -void CheckTabular(Results const& e) { - CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 1); - CHECK_COUNTER_VALUE(e, int, "Bar", EQ, 2); - CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 4); - CHECK_COUNTER_VALUE(e, int, "Bat", EQ, 8); - CHECK_COUNTER_VALUE(e, int, "Frob", EQ, 16); - CHECK_COUNTER_VALUE(e, int, "Lob", EQ, 32); -} -CHECK_BENCHMARK_RESULTS("BM_Counters_Tabular/threads:%int", &CheckTabular); - -// ========================================================================= // -// -------------------- Tabular+Rate Counters Output ----------------------- // -// ========================================================================= // - -void BM_CounterRates_Tabular(benchmark::State& state) { - while (state.KeepRunning()) { - } - namespace bm = benchmark; - state.counters.insert({ - {"Foo", { 1, bm::Counter::kAvgThreadsRate}}, - {"Bar", { 2, bm::Counter::kAvgThreadsRate}}, - {"Baz", { 4, bm::Counter::kAvgThreadsRate}}, - {"Bat", { 8, bm::Counter::kAvgThreadsRate}}, - {"Frob", {16, bm::Counter::kAvgThreadsRate}}, - {"Lob", {32, bm::Counter::kAvgThreadsRate}}, - }); -} -BENCHMARK(BM_CounterRates_Tabular)->ThreadRange(1, 16); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_CounterRates_Tabular/threads:%int\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"Bar\": %float,$", MR_Next}, - {"\"Bat\": %float,$", MR_Next}, - {"\"Baz\": %float,$", MR_Next}, - {"\"Foo\": %float,$", MR_Next}, - {"\"Frob\": %float,$", MR_Next}, - {"\"Lob\": %float$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_CounterRates_Tabular/threads:%int\",%csv_report," - "%float,%float,%float,%float,%float,%float$"}}); -// VS2013 does not allow this function to be passed as a lambda argument -// to CHECK_BENCHMARK_RESULTS() -void CheckTabularRate(Results const& e) { - double t = e.DurationCPUTime(); - 
CHECK_FLOAT_COUNTER_VALUE(e, "Foo", EQ, 1./t, 0.001); - CHECK_FLOAT_COUNTER_VALUE(e, "Bar", EQ, 2./t, 0.001); - CHECK_FLOAT_COUNTER_VALUE(e, "Baz", EQ, 4./t, 0.001); - CHECK_FLOAT_COUNTER_VALUE(e, "Bat", EQ, 8./t, 0.001); - CHECK_FLOAT_COUNTER_VALUE(e, "Frob", EQ, 16./t, 0.001); - CHECK_FLOAT_COUNTER_VALUE(e, "Lob", EQ, 32./t, 0.001); -} -CHECK_BENCHMARK_RESULTS("BM_CounterRates_Tabular/threads:%int", - &CheckTabularRate); - -// ========================================================================= // -// ------------------------- Tabular Counters Output ----------------------- // -// ========================================================================= // - -// set only some of the counters -void BM_CounterSet0_Tabular(benchmark::State& state) { - while (state.KeepRunning()) { - } - namespace bm = benchmark; - state.counters.insert({ - {"Foo", {10, bm::Counter::kAvgThreads}}, - {"Bar", {20, bm::Counter::kAvgThreads}}, - {"Baz", {40, bm::Counter::kAvgThreads}}, - }); -} -BENCHMARK(BM_CounterSet0_Tabular)->ThreadRange(1, 16); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_CounterSet0_Tabular/threads:%int\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"Bar\": %float,$", MR_Next}, - {"\"Baz\": %float,$", MR_Next}, - {"\"Foo\": %float$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet0_Tabular/threads:%int\",%csv_report," - "%float,,%float,%float,,"}}); -// VS2013 does not allow this function to be passed as a lambda argument -// to CHECK_BENCHMARK_RESULTS() -void CheckSet0(Results const& e) { - CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 10); - CHECK_COUNTER_VALUE(e, int, "Bar", EQ, 20); - CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 40); -} -CHECK_BENCHMARK_RESULTS("BM_CounterSet0_Tabular", &CheckSet0); - -// again. -void BM_CounterSet1_Tabular(benchmark::State& state) { - while (state.KeepRunning()) { - } - namespace bm = benchmark; - state.counters.insert({ - {"Foo", {15, bm::Counter::kAvgThreads}}, - {"Bar", {25, bm::Counter::kAvgThreads}}, - {"Baz", {45, bm::Counter::kAvgThreads}}, - }); -} -BENCHMARK(BM_CounterSet1_Tabular)->ThreadRange(1, 16); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_CounterSet1_Tabular/threads:%int\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"Bar\": %float,$", MR_Next}, - {"\"Baz\": %float,$", MR_Next}, - {"\"Foo\": %float$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet1_Tabular/threads:%int\",%csv_report," - "%float,,%float,%float,,"}}); -// VS2013 does not allow this function to be passed as a lambda argument -// to CHECK_BENCHMARK_RESULTS() -void CheckSet1(Results const& e) { - CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 15); - CHECK_COUNTER_VALUE(e, int, "Bar", EQ, 25); - CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 45); -} -CHECK_BENCHMARK_RESULTS("BM_CounterSet1_Tabular/threads:%int", &CheckSet1); - -// ========================================================================= // -// ------------------------- Tabular Counters Output ----------------------- // -// ========================================================================= // - -// set only some of the counters, different set now. 
-void BM_CounterSet2_Tabular(benchmark::State& state) { - while (state.KeepRunning()) { - } - namespace bm = benchmark; - state.counters.insert({ - {"Foo", {10, bm::Counter::kAvgThreads}}, - {"Bat", {30, bm::Counter::kAvgThreads}}, - {"Baz", {40, bm::Counter::kAvgThreads}}, - }); -} -BENCHMARK(BM_CounterSet2_Tabular)->ThreadRange(1, 16); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_CounterSet2_Tabular/threads:%int\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"Bat\": %float,$", MR_Next}, - {"\"Baz\": %float,$", MR_Next}, - {"\"Foo\": %float$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet2_Tabular/threads:%int\",%csv_report," - ",%float,%float,%float,,"}}); -// VS2013 does not allow this function to be passed as a lambda argument -// to CHECK_BENCHMARK_RESULTS() -void CheckSet2(Results const& e) { - CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 10); - CHECK_COUNTER_VALUE(e, int, "Bat", EQ, 30); - CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 40); -} -CHECK_BENCHMARK_RESULTS("BM_CounterSet2_Tabular", &CheckSet2); - -// ========================================================================= // -// --------------------------- TEST CASES END ------------------------------ // -// ========================================================================= // - -int main(int argc, char* argv[]) { RunOutputTests(argc, argv); } diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/user_counters_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/user_counters_test.cc deleted file mode 100644 index 66df48b31f8..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/user_counters_test.cc +++ /dev/null @@ -1,217 +0,0 @@ - -#undef NDEBUG - -#include "benchmark/benchmark.h" -#include "output_test.h" - -// ========================================================================= // -// ---------------------- Testing Prologue Output -------------------------- // -// ========================================================================= // - -ADD_CASES(TC_ConsoleOut, - {{"^[-]+$", MR_Next}, - {"^Benchmark %s Time %s CPU %s Iterations UserCounters...$", MR_Next}, - {"^[-]+$", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"%csv_header,\"bar\",\"foo\""}}); - -// ========================================================================= // -// ------------------------- Simple Counters Output ------------------------ // -// ========================================================================= // - -void BM_Counters_Simple(benchmark::State& state) { - while (state.KeepRunning()) { - } - state.counters["foo"] = 1; - state.counters["bar"] = 2 * (double)state.iterations(); -} -BENCHMARK(BM_Counters_Simple); -ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Simple %console_report bar=%hrfloat foo=%hrfloat$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Simple\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"bar\": %float,$", MR_Next}, - {"\"foo\": %float$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Simple\",%csv_report,%float,%float$"}}); -// VS2013 does not allow this function to be passed as a lambda argument -// to CHECK_BENCHMARK_RESULTS() -void CheckSimple(Results const& e) { - double its = e.GetAs< double >("iterations"); - CHECK_COUNTER_VALUE(e, int, "foo", EQ, 1); - // 
check that the value of bar is within 0.1% of the expected value - CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2.*its, 0.001); -} -CHECK_BENCHMARK_RESULTS("BM_Counters_Simple", &CheckSimple); - -// ========================================================================= // -// --------------------- Counters+Items+Bytes/s Output --------------------- // -// ========================================================================= // - -namespace { int num_calls1 = 0; } -void BM_Counters_WithBytesAndItemsPSec(benchmark::State& state) { - while (state.KeepRunning()) { - } - state.counters["foo"] = 1; - state.counters["bar"] = ++num_calls1; - state.SetBytesProcessed(364); - state.SetItemsProcessed(150); -} -BENCHMARK(BM_Counters_WithBytesAndItemsPSec); -ADD_CASES(TC_ConsoleOut, - {{"^BM_Counters_WithBytesAndItemsPSec %console_report " - "bar=%hrfloat foo=%hrfloat +%hrfloatB/s +%hrfloat items/s$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_WithBytesAndItemsPSec\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"bytes_per_second\": %int,$", MR_Next}, - {"\"items_per_second\": %int,$", MR_Next}, - {"\"bar\": %float,$", MR_Next}, - {"\"foo\": %float$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_WithBytesAndItemsPSec\"," - "%csv_bytes_items_report,%float,%float$"}}); -// VS2013 does not allow this function to be passed as a lambda argument -// to CHECK_BENCHMARK_RESULTS() -void CheckBytesAndItemsPSec(Results const& e) { - double t = e.DurationCPUTime(); // this (and not real time) is the time used - CHECK_COUNTER_VALUE(e, int, "foo", EQ, 1); - CHECK_COUNTER_VALUE(e, int, "bar", EQ, num_calls1); - // check that the values are within 0.1% of the expected values - CHECK_FLOAT_RESULT_VALUE(e, "bytes_per_second", EQ, 364./t, 0.001); - CHECK_FLOAT_RESULT_VALUE(e, "items_per_second", EQ, 150./t, 0.001); -} -CHECK_BENCHMARK_RESULTS("BM_Counters_WithBytesAndItemsPSec", - &CheckBytesAndItemsPSec); - -// ========================================================================= // -// ------------------------- Rate Counters Output -------------------------- // -// ========================================================================= // - -void BM_Counters_Rate(benchmark::State& state) { - while (state.KeepRunning()) { - } - namespace bm = benchmark; - state.counters["foo"] = bm::Counter{1, bm::Counter::kIsRate}; - state.counters["bar"] = bm::Counter{2, bm::Counter::kIsRate}; -} -BENCHMARK(BM_Counters_Rate); -ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Rate %console_report bar=%hrfloat/s foo=%hrfloat/s$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Rate\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"bar\": %float,$", MR_Next}, - {"\"foo\": %float$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Rate\",%csv_report,%float,%float$"}}); -// VS2013 does not allow this function to be passed as a lambda argument -// to CHECK_BENCHMARK_RESULTS() -void CheckRate(Results const& e) { - double t = e.DurationCPUTime(); // this (and not real time) is the time used - // check that the values are within 0.1% of the expected values - CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1./t, 0.001); - CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2./t, 0.001); -} -CHECK_BENCHMARK_RESULTS("BM_Counters_Rate", &CheckRate); - -// 
========================================================================= // -// ------------------------- Thread Counters Output ------------------------ // -// ========================================================================= // - -void BM_Counters_Threads(benchmark::State& state) { - while (state.KeepRunning()) { - } - state.counters["foo"] = 1; - state.counters["bar"] = 2; -} -BENCHMARK(BM_Counters_Threads)->ThreadRange(1, 8); -ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Threads/threads:%int %console_report bar=%hrfloat foo=%hrfloat$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Threads/threads:%int\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"bar\": %float,$", MR_Next}, - {"\"foo\": %float$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Threads/threads:%int\",%csv_report,%float,%float$"}}); -// VS2013 does not allow this function to be passed as a lambda argument -// to CHECK_BENCHMARK_RESULTS() -void CheckThreads(Results const& e) { - CHECK_COUNTER_VALUE(e, int, "foo", EQ, e.NumThreads()); - CHECK_COUNTER_VALUE(e, int, "bar", EQ, 2 * e.NumThreads()); -} -CHECK_BENCHMARK_RESULTS("BM_Counters_Threads/threads:%int", &CheckThreads); - -// ========================================================================= // -// ---------------------- ThreadAvg Counters Output ------------------------ // -// ========================================================================= // - -void BM_Counters_AvgThreads(benchmark::State& state) { - while (state.KeepRunning()) { - } - namespace bm = benchmark; - state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgThreads}; - state.counters["bar"] = bm::Counter{2, bm::Counter::kAvgThreads}; -} -BENCHMARK(BM_Counters_AvgThreads)->ThreadRange(1, 8); -ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreads/threads:%int %console_report bar=%hrfloat foo=%hrfloat$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_AvgThreads/threads:%int\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"bar\": %float,$", MR_Next}, - {"\"foo\": %float$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_AvgThreads/threads:%int\",%csv_report,%float,%float$"}}); -// VS2013 does not allow this function to be passed as a lambda argument -// to CHECK_BENCHMARK_RESULTS() -void CheckAvgThreads(Results const& e) { - CHECK_COUNTER_VALUE(e, int, "foo", EQ, 1); - CHECK_COUNTER_VALUE(e, int, "bar", EQ, 2); -} -CHECK_BENCHMARK_RESULTS("BM_Counters_AvgThreads/threads:%int", - &CheckAvgThreads); - -// ========================================================================= // -// ---------------------- ThreadAvg Counters Output ------------------------ // -// ========================================================================= // - -void BM_Counters_AvgThreadsRate(benchmark::State& state) { - while (state.KeepRunning()) { - } - namespace bm = benchmark; - state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgThreadsRate}; - state.counters["bar"] = bm::Counter{2, bm::Counter::kAvgThreadsRate}; -} -BENCHMARK(BM_Counters_AvgThreadsRate)->ThreadRange(1, 8); -ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreadsRate/threads:%int %console_report bar=%hrfloat/s foo=%hrfloat/s$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_AvgThreadsRate/threads:%int\",$"}, - {"\"iterations\": %int,$", 
MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"bar\": %float,$", MR_Next}, - {"\"foo\": %float$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_AvgThreadsRate/threads:%int\",%csv_report,%float,%float$"}}); -// VS2013 does not allow this function to be passed as a lambda argument -// to CHECK_BENCHMARK_RESULTS() -void CheckAvgThreadsRate(Results const& e) { - CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1./e.DurationCPUTime(), 0.001); - CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2./e.DurationCPUTime(), 0.001); -} -CHECK_BENCHMARK_RESULTS("BM_Counters_AvgThreadsRate/threads:%int", - &CheckAvgThreadsRate); - -// ========================================================================= // -// --------------------------- TEST CASES END ------------------------------ // -// ========================================================================= // - -int main(int argc, char* argv[]) { RunOutputTests(argc, argv); } diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/compare_bench.py b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/compare_bench.py deleted file mode 100755 index d54baaa0e8f..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/compare_bench.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env python -""" -compare_bench.py - Compare two benchmarks or their results and report the - difference. -""" -import argparse -from argparse import ArgumentParser -import sys -import gbench -from gbench import util, report -from gbench.util import * - -def check_inputs(in1, in2, flags): - """ - Perform checking on the user provided inputs and diagnose any abnormalities - """ - in1_kind, in1_err = classify_input_file(in1) - in2_kind, in2_err = classify_input_file(in2) - output_file = find_benchmark_flag('--benchmark_out=', flags) - output_type = find_benchmark_flag('--benchmark_out_format=', flags) - if in1_kind == IT_Executable and in2_kind == IT_Executable and output_file: - print(("WARNING: '--benchmark_out=%s' will be passed to both " - "benchmarks causing it to be overwritten") % output_file) - if in1_kind == IT_JSON and in2_kind == IT_JSON and len(flags) > 0: - print("WARNING: passing --benchmark flags has no effect since both " - "inputs are JSON") - if output_type is not None and output_type != 'json': - print(("ERROR: passing '--benchmark_out_format=%s' to 'compare_bench.py`" - " is not supported.") % output_type) - sys.exit(1) - - -def main(): - parser = ArgumentParser( - description='compare the results of two benchmarks') - parser.add_argument( - 'test1', metavar='test1', type=str, nargs=1, - help='A benchmark executable or JSON output file') - parser.add_argument( - 'test2', metavar='test2', type=str, nargs=1, - help='A benchmark executable or JSON output file') - # FIXME this is a dummy argument which will never actually match - # any --benchmark flags but it helps generate a better usage message - parser.add_argument( - 'benchmark_options', metavar='benchmark_option', nargs='*', - help='Arguments to pass when running benchmark executables' - ) - args, unknown_args = parser.parse_known_args() - # Parse the command line flags - test1 = args.test1[0] - test2 = args.test2[0] - if args.benchmark_options: - print("Unrecognized positional argument arguments: '%s'" - % args.benchmark_options) - exit(1) - benchmark_options = unknown_args - check_inputs(test1, test2, benchmark_options) - # Run the benchmarks and report 
the results - json1 = gbench.util.run_or_load_benchmark(test1, benchmark_options) - json2 = gbench.util.run_or_load_benchmark(test2, benchmark_options) - output_lines = gbench.report.generate_difference_report(json1, json2) - print('Comparing %s to %s' % (test1, test2)) - for ln in output_lines: - print(ln) - - -if __name__ == '__main__': - main() diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/Inputs/test1_run1.json b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/Inputs/test1_run1.json deleted file mode 100644 index 37faed46d13..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/Inputs/test1_run1.json +++ /dev/null @@ -1,60 +0,0 @@ -{ - "context": { - "date": "2016-08-02 17:44:46", - "num_cpus": 4, - "mhz_per_cpu": 4228, - "cpu_scaling_enabled": false, - "library_build_type": "release" - }, - "benchmarks": [ - { - "name": "BM_SameTimes", - "iterations": 1000, - "real_time": 10, - "cpu_time": 10, - "time_unit": "ns" - }, - { - "name": "BM_2xFaster", - "iterations": 1000, - "real_time": 50, - "cpu_time": 50, - "time_unit": "ns" - }, - { - "name": "BM_2xSlower", - "iterations": 1000, - "real_time": 50, - "cpu_time": 50, - "time_unit": "ns" - }, - { - "name": "BM_10PercentFaster", - "iterations": 1000, - "real_time": 100, - "cpu_time": 100, - "time_unit": "ns" - }, - { - "name": "BM_10PercentSlower", - "iterations": 1000, - "real_time": 100, - "cpu_time": 100, - "time_unit": "ns" - }, - { - "name": "BM_100xSlower", - "iterations": 1000, - "real_time": 100, - "cpu_time": 100, - "time_unit": "ns" - }, - { - "name": "BM_100xFaster", - "iterations": 1000, - "real_time": 10000, - "cpu_time": 10000, - "time_unit": "ns" - } - ] -} \ No newline at end of file diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/Inputs/test1_run2.json b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/Inputs/test1_run2.json deleted file mode 100644 index aed5151d392..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/Inputs/test1_run2.json +++ /dev/null @@ -1,60 +0,0 @@ -{ - "context": { - "date": "2016-08-02 17:44:46", - "num_cpus": 4, - "mhz_per_cpu": 4228, - "cpu_scaling_enabled": false, - "library_build_type": "release" - }, - "benchmarks": [ - { - "name": "BM_SameTimes", - "iterations": 1000, - "real_time": 10, - "cpu_time": 10, - "time_unit": "ns" - }, - { - "name": "BM_2xFaster", - "iterations": 1000, - "real_time": 25, - "cpu_time": 25, - "time_unit": "ns" - }, - { - "name": "BM_2xSlower", - "iterations": 20833333, - "real_time": 100, - "cpu_time": 100, - "time_unit": "ns" - }, - { - "name": "BM_10PercentFaster", - "iterations": 1000, - "real_time": 90, - "cpu_time": 90, - "time_unit": "ns" - }, - { - "name": "BM_10PercentSlower", - "iterations": 1000, - "real_time": 110, - "cpu_time": 110, - "time_unit": "ns" - }, - { - "name": "BM_100xSlower", - "iterations": 1000, - "real_time": 10000, - "cpu_time": 10000, - "time_unit": "ns" - }, - { - "name": "BM_100xFaster", - "iterations": 1000, - "real_time": 100, - "cpu_time": 100, - "time_unit": "ns" - } - ] -} \ No newline at end of file diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/__init__.py b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/__init__.py deleted file mode 100644 index fce1a1acfbb..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/__init__.py +++ /dev/null 
@@ -1,8 +0,0 @@ -"""Google Benchmark tooling""" - -__author__ = 'Eric Fiselier' -__email__ = 'eric@efcs.ca' -__versioninfo__ = (0, 5, 0) -__version__ = '.'.join(str(v) for v in __versioninfo__) + 'dev' - -__all__ = [] diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/report.py b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/report.py deleted file mode 100644 index 015d33d9e49..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/report.py +++ /dev/null @@ -1,146 +0,0 @@ -"""report.py - Utilities for reporting statistics about benchmark results -""" -import os - -class BenchmarkColor(object): - def __init__(self, name, code): - self.name = name - self.code = code - - def __repr__(self): - return '%s%r' % (self.__class__.__name__, - (self.name, self.code)) - - def __format__(self, format): - return self.code - -# Benchmark Colors Enumeration -BC_NONE = BenchmarkColor('NONE', '') -BC_MAGENTA = BenchmarkColor('MAGENTA', '\033[95m') -BC_CYAN = BenchmarkColor('CYAN', '\033[96m') -BC_OKBLUE = BenchmarkColor('OKBLUE', '\033[94m') -BC_HEADER = BenchmarkColor('HEADER', '\033[92m') -BC_WARNING = BenchmarkColor('WARNING', '\033[93m') -BC_WHITE = BenchmarkColor('WHITE', '\033[97m') -BC_FAIL = BenchmarkColor('FAIL', '\033[91m') -BC_ENDC = BenchmarkColor('ENDC', '\033[0m') -BC_BOLD = BenchmarkColor('BOLD', '\033[1m') -BC_UNDERLINE = BenchmarkColor('UNDERLINE', '\033[4m') - -def color_format(use_color, fmt_str, *args, **kwargs): - """ - Return the result of 'fmt_str.format(*args, **kwargs)' after transforming - 'args' and 'kwargs' according to the value of 'use_color'. If 'use_color' - is False then all color codes in 'args' and 'kwargs' are replaced with - the empty string. - """ - assert use_color is True or use_color is False - if not use_color: - args = [arg if not isinstance(arg, BenchmarkColor) else BC_NONE - for arg in args] - kwargs = {key: arg if not isinstance(arg, BenchmarkColor) else BC_NONE - for key, arg in kwargs.items()} - return fmt_str.format(*args, **kwargs) - - -def find_longest_name(benchmark_list): - """ - Return the length of the longest benchmark name in a given list of - benchmark JSON objects - """ - longest_name = 1 - for bc in benchmark_list: - if len(bc['name']) > longest_name: - longest_name = len(bc['name']) - return longest_name - - -def calculate_change(old_val, new_val): - """ - Return a float representing the decimal change between old_val and new_val. - """ - if old_val == 0 and new_val == 0: - return 0.0 - if old_val == 0: - return float(new_val - old_val) / (float(old_val + new_val) / 2) - return float(new_val - old_val) / abs(old_val) - - -def generate_difference_report(json1, json2, use_color=True): - """ - Calculate and report the difference between each test of two benchmarks - runs specified as 'json1' and 'json2'. 
- """ - first_col_width = find_longest_name(json1['benchmarks']) + 5 - def find_test(name): - for b in json2['benchmarks']: - if b['name'] == name: - return b - return None - first_line = "{:<{}s} Time CPU Old New".format( - 'Benchmark', first_col_width) - output_strs = [first_line, '-' * len(first_line)] - - gen = (bn for bn in json1['benchmarks'] if 'real_time' in bn and 'cpu_time' in bn) - for bn in gen: - other_bench = find_test(bn['name']) - if not other_bench: - continue - - def get_color(res): - if res > 0.05: - return BC_FAIL - elif res > -0.07: - return BC_WHITE - else: - return BC_CYAN - fmt_str = "{}{:<{}s}{endc}{}{:+9.2f}{endc}{}{:+14.2f}{endc}{:14d}{:14d}" - tres = calculate_change(bn['real_time'], other_bench['real_time']) - cpures = calculate_change(bn['cpu_time'], other_bench['cpu_time']) - output_strs += [color_format(use_color, fmt_str, - BC_HEADER, bn['name'], first_col_width, - get_color(tres), tres, get_color(cpures), cpures, - bn['cpu_time'], other_bench['cpu_time'], - endc=BC_ENDC)] - return output_strs - -############################################################################### -# Unit tests - -import unittest - -class TestReportDifference(unittest.TestCase): - def load_results(self): - import json - testInputs = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Inputs') - testOutput1 = os.path.join(testInputs, 'test1_run1.json') - testOutput2 = os.path.join(testInputs, 'test1_run2.json') - with open(testOutput1, 'r') as f: - json1 = json.load(f) - with open(testOutput2, 'r') as f: - json2 = json.load(f) - return json1, json2 - - def test_basic(self): - expect_lines = [ - ['BM_SameTimes', '+0.00', '+0.00', '10', '10'], - ['BM_2xFaster', '-0.50', '-0.50', '50', '25'], - ['BM_2xSlower', '+1.00', '+1.00', '50', '100'], - ['BM_10PercentFaster', '-0.10', '-0.10', '100', '90'], - ['BM_10PercentSlower', '+0.10', '+0.10', '100', '110'], - ['BM_100xSlower', '+99.00', '+99.00', '100', '10000'], - ['BM_100xFaster', '-0.99', '-0.99', '10000', '100'], - ] - json1, json2 = self.load_results() - output_lines_with_header = generate_difference_report(json1, json2, use_color=False) - output_lines = output_lines_with_header[2:] - print("\n".join(output_lines_with_header)) - self.assertEqual(len(output_lines), len(expect_lines)) - for i in xrange(0, len(output_lines)): - parts = [x for x in output_lines[i].split(' ') if x] - self.assertEqual(len(parts), 5) - self.assertEqual(parts, expect_lines[i]) - - -if __name__ == '__main__': - unittest.main() diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/util.py b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/util.py deleted file mode 100644 index 07c23772754..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/util.py +++ /dev/null @@ -1,159 +0,0 @@ -"""util.py - General utilities for running, loading, and processing benchmarks -""" -import json -import os -import tempfile -import subprocess -import sys - -# Input file type enumeration -IT_Invalid = 0 -IT_JSON = 1 -IT_Executable = 2 - -_num_magic_bytes = 2 if sys.platform.startswith('win') else 4 -def is_executable_file(filename): - """ - Return 'True' if 'filename' names a valid file which is likely - an executable. A file is considered an executable if it starts with the - magic bytes for a EXE, Mach O, or ELF file. 
- """ - if not os.path.isfile(filename): - return False - with open(filename, mode='rb') as f: - magic_bytes = f.read(_num_magic_bytes) - if sys.platform == 'darwin': - return magic_bytes in [ - b'\xfe\xed\xfa\xce', # MH_MAGIC - b'\xce\xfa\xed\xfe', # MH_CIGAM - b'\xfe\xed\xfa\xcf', # MH_MAGIC_64 - b'\xcf\xfa\xed\xfe', # MH_CIGAM_64 - b'\xca\xfe\xba\xbe', # FAT_MAGIC - b'\xbe\xba\xfe\xca' # FAT_CIGAM - ] - elif sys.platform.startswith('win'): - return magic_bytes == b'MZ' - else: - return magic_bytes == b'\x7FELF' - - -def is_json_file(filename): - """ - Returns 'True' if 'filename' names a valid JSON output file. - 'False' otherwise. - """ - try: - with open(filename, 'r') as f: - json.load(f) - return True - except: - pass - return False - - -def classify_input_file(filename): - """ - Return a tuple (type, msg) where 'type' specifies the classified type - of 'filename'. If 'type' is 'IT_Invalid' then 'msg' is a human readable - string represeting the error. - """ - ftype = IT_Invalid - err_msg = None - if not os.path.exists(filename): - err_msg = "'%s' does not exist" % filename - elif not os.path.isfile(filename): - err_msg = "'%s' does not name a file" % filename - elif is_executable_file(filename): - ftype = IT_Executable - elif is_json_file(filename): - ftype = IT_JSON - else: - err_msg = "'%s' does not name a valid benchmark executable or JSON file" % filename - return ftype, err_msg - - -def check_input_file(filename): - """ - Classify the file named by 'filename' and return the classification. - If the file is classified as 'IT_Invalid' print an error message and exit - the program. - """ - ftype, msg = classify_input_file(filename) - if ftype == IT_Invalid: - print("Invalid input file: %s" % msg) - sys.exit(1) - return ftype - -def find_benchmark_flag(prefix, benchmark_flags): - """ - Search the specified list of flags for a flag matching `` and - if it is found return the arg it specifies. If specified more than once the - last value is returned. If the flag is not found None is returned. - """ - assert prefix.startswith('--') and prefix.endswith('=') - result = None - for f in benchmark_flags: - if f.startswith(prefix): - result = f[len(prefix):] - return result - -def remove_benchmark_flags(prefix, benchmark_flags): - """ - Return a new list containing the specified benchmark_flags except those - with the specified prefix. - """ - assert prefix.startswith('--') and prefix.endswith('=') - return [f for f in benchmark_flags if not f.startswith(prefix)] - -def load_benchmark_results(fname): - """ - Read benchmark output from a file and return the JSON object. - REQUIRES: 'fname' names a file containing JSON benchmark output. - """ - with open(fname, 'r') as f: - return json.load(f) - - -def run_benchmark(exe_name, benchmark_flags): - """ - Run a benchmark specified by 'exe_name' with the specified - 'benchmark_flags'. The benchmark is run directly as a subprocess to preserve - real time console output. 
- RETURNS: A JSON object representing the benchmark output - """ - output_name = find_benchmark_flag('--benchmark_out=', - benchmark_flags) - is_temp_output = False - if output_name is None: - is_temp_output = True - thandle, output_name = tempfile.mkstemp() - os.close(thandle) - benchmark_flags = list(benchmark_flags) + \ - ['--benchmark_out=%s' % output_name] - - cmd = [exe_name] + benchmark_flags - print("RUNNING: %s" % ' '.join(cmd)) - exitCode = subprocess.call(cmd) - if exitCode != 0: - print('TEST FAILED...') - sys.exit(exitCode) - json_res = load_benchmark_results(output_name) - if is_temp_output: - os.unlink(output_name) - return json_res - - -def run_or_load_benchmark(filename, benchmark_flags): - """ - Get the results for a specified benchmark. If 'filename' specifies - an executable benchmark then the results are generated by running the - benchmark. Otherwise 'filename' must name a valid JSON output file, - which is loaded and the result returned. - """ - ftype = check_input_file(filename) - if ftype == IT_JSON: - return load_benchmark_results(filename) - elif ftype == IT_Executable: - return run_benchmark(filename, benchmark_flags) - else: - assert False # This branch is unreachable \ No newline at end of file diff --git a/extension/llm/custom_ops/spinquant/fast_hadamard_transform.h b/extension/llm/custom_ops/spinquant/fast_hadamard_transform.h index 1084dcc3dee..3f00fe5cda2 100644 --- a/extension/llm/custom_ops/spinquant/fast_hadamard_transform.h +++ b/extension/llm/custom_ops/spinquant/fast_hadamard_transform.h @@ -1,3 +1,11 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + // (c) Meta Platforms, Inc. and affiliates. #pragma once @@ -25,9 +33,7 @@ T fast_sqrt_of_power_of_2(int log2_n) { } template <typename T> -void normalize_after_fht( - T* out, - int log2_vec_size) { +void normalize_after_fht(T* out, int log2_vec_size) { const T inv_sqrt = T(1) / fast_sqrt_of_power_of_2(log2_vec_size); const int vec_size = 1 << log2_vec_size; for (int ii = 0; ii < vec_size; ++ii) { @@ -35,7 +41,6 @@ void normalize_after_fht( } } - // Normalization step: divide by sqrt(1 << log2_vec_size). Similar // to fast_sqrt above, if N is even, then the maximum-precision way // to do this is right-shift by log2_vec_size / 2. If N is odd, we // function to tend to increase the magnitude of the elements of // vec, which would result in clipping and therefore accuracy // loss, especially compounded over 30+ transformer layers. 
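// Worked example of the normalization arithmetic described above
// (illustrative only, not part of the original source; the input value
// 1024 is hypothetical, and 408/577 is the 1/sqrt(2) approximation used
// in quantized_normalize_after_fht below):
//
//   log2_vec_size = 4 (even): shift right by 4 / 2 = 2:
//     1024 >> 2 = 256, which is exactly 1024 / sqrt(1 << 4).
//
//   log2_vec_size = 5 (odd): multiply by 408/577 first, then shift
//   right by 5 / 2 = 2:
//     (1024 * 408) / 577 = 724 (integer division), and 724 >> 2 = 181,
//     close to the exact 1024 / sqrt(1 << 5) ~= 181.02.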
-void quantized_normalize_after_fht(const int32_t* tmp, int16_t* out, int log2_vec_size, int vec_size) {
+void quantized_normalize_after_fht(
+    const int32_t* tmp,
+    int16_t* out,
+    int log2_vec_size,
+    int vec_size) {
   const int log2_sqrt_vec_size = log2_vec_size / 2;
   constexpr int32_t qmin = -(1 << 15) + 1;
   constexpr int32_t qmax = -qmin;
@@ -55,8 +64,9 @@ void quantized_normalize_after_fht(const int32_t* tmp, int16_t* out, int log2_ve
     static const int32_t inv_sqrt_2_numerator = 408;
     static const int32_t inv_sqrt_2_denominator = 577;
     for (int ii = 0; ii < vec_size; ++ii) {
-      const auto val_over_sqrt_vec_size = (tmp[ii] * inv_sqrt_2_numerator / inv_sqrt_2_denominator)
-          >> log2_sqrt_vec_size;
+      const auto val_over_sqrt_vec_size =
+          (tmp[ii] * inv_sqrt_2_numerator / inv_sqrt_2_denominator) >>
+          log2_sqrt_vec_size;
       out[ii] = std::clamp(val_over_sqrt_vec_size, qmin, qmax);
     }
   } else {
@@ -90,9 +100,7 @@ void fast_hadamard_transform_unnormalized_simple_impl(
 }
 
 template <typename T>
-void fast_hadamard_transform_simple_impl(
-    T* vec,
-    int log2_vec_size) {
+void fast_hadamard_transform_simple_impl(T* vec, int log2_vec_size) {
   fast_hadamard_transform_unnormalized_simple_impl(vec, log2_vec_size);
   normalize_after_fht(vec, log2_vec_size);
 }
@@ -104,7 +112,7 @@
 // of vec, which must be of length (1 << log2_vec_size).
 template <typename T>
 void fast_hadamard_transform(T* vec, int log2_vec_size) {
-  internal::fast_hadamard_transform_simple_impl<T>(vec, log2_vec_size);
+  internal::fast_hadamard_transform_simple_impl(vec, log2_vec_size);
 }
 
 // Compute a quantized fast Walsh-Hadamard transform of vec, which
@@ -116,8 +124,11 @@ void fast_hadamard_transform(T* vec, int log2_vec_size) {
 // following trivial identities:
 //
 // scale * a + scale * b = scale * (a + b) (addition doesn't need the scale)
-// alpha * (scale * a) = scale * (alpha * a) (multiplication doesn't need the scale)
+// alpha * (scale * a) = scale * (alpha * a) (multiplication doesn't need the
+// scale)
-void fast_hadamard_transform_symmetric_quantized_s16(int16_t* vec, int log2_vec_size) {
+void fast_hadamard_transform_symmetric_quantized_s16(
+    int16_t* vec,
+    int log2_vec_size) {
   if (log2_vec_size == 0) {
     return;
   }
@@ -136,9 +147,11 @@ void fast_hadamard_transform_symmetric_quantized_s16(int16_t* vec, int log2_vec_
   // implementation.
   // NOTE: if we need this to be fast on CPU, we can use FFHT to
   // generate fht_uint32 similar to fht_float.
-  internal::fast_hadamard_transform_unnormalized_simple_impl(tmp.get(), log2_vec_size);
+  internal::fast_hadamard_transform_unnormalized_simple_impl(
+      tmp.get(), log2_vec_size);
 
-  internal::quantized_normalize_after_fht(tmp.get(), vec, log2_vec_size, vec_size);
+  internal::quantized_normalize_after_fht(
+      tmp.get(), vec, log2_vec_size, vec_size);
 }
 
 // Like fast_hadamard_transform, but vec must be of length 28 * (1 <<
@@ -161,7 +174,9 @@ void fast_hadamard_transform_28N(T* vec, int log2_vec_size) {
 // We don't need the quantization scale; see the function-level
 // comment on fast_hadamard_transform_symmetric_quantized_s16 for
 // details.
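// --- Editor's aside (illustration only; not part of the patch) ---
// Why the quantization scale can be ignored: the transform is built purely
// from additions, subtractions, and a uniform 1/sqrt(N) rescale, so a
// symmetric per-tensor scale factors straight through it. A hedged usage
// sketch (fht_then_dequantize, qvec, out, and scale are our names; this
// assumes the surrounding header is included):
#include <cstdint>

void fht_then_dequantize(int16_t* qvec, float* out, float scale, int log2_vec_size) {
  // In-place quantized FHT; qvec[i] ~= real[i] / scale, no zero point.
  executorch::fast_hadamard_transform_symmetric_quantized_s16(qvec, log2_vec_size);
  // The pre-transform scale is still the correct scale afterwards.
  for (int ii = 0; ii < (1 << log2_vec_size); ++ii) {
    out[ii] = qvec[ii] * scale;
  }
}
// --- End aside ---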
-void fast_hadamard_transform_symmetric_quantized_s16_28N(int16_t* vec, int log2_vec_size) { +void fast_hadamard_transform_symmetric_quantized_s16_28N( + int16_t* vec, + int log2_vec_size) { if (log2_vec_size == 0) { return; } @@ -171,14 +186,16 @@ void fast_hadamard_transform_symmetric_quantized_s16_28N(int16_t* vec, int log2_ std::copy(vec, vec + vec_size * 28, tmp.get()); for (int ii = 0; ii < 28; ++ii) { - internal::fast_hadamard_transform_unnormalized_simple_impl(&tmp[ii * vec_size], log2_vec_size); + internal::fast_hadamard_transform_unnormalized_simple_impl( + &tmp[ii * vec_size], log2_vec_size); } for (int ii = 0; ii < vec_size; ++ii) { hadamard_mult_28_strided(&tmp[ii], vec_size); } - internal::quantized_normalize_after_fht(tmp.get(), vec, log2_vec_size, vec_size * 28); + internal::quantized_normalize_after_fht( + tmp.get(), vec, log2_vec_size, vec_size * 28); } } // namespace executorch diff --git a/extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h b/extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h index edc62b9667a..ca5a8d61e73 100644 --- a/extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h +++ b/extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h @@ -1,5 +1,4 @@ - -// This file is auto-generated. See "special_hadamard_code_gen.py" +// @generated by special_hadamard_code_gen.py strided_cpu #pragma once diff --git a/extension/llm/custom_ops/spinquant/special_hadamard_code_gen.py b/extension/llm/custom_ops/spinquant/special_hadamard_code_gen.py index 1dc57166c6d..a8b9feb0785 100644 --- a/extension/llm/custom_ops/spinquant/special_hadamard_code_gen.py +++ b/extension/llm/custom_ops/spinquant/special_hadamard_code_gen.py @@ -32,8 +32,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import math -import re from pathlib import Path import numpy as np @@ -176,12 +174,12 @@ had_strings = [had_12, had_20_will, had_28_will, had_40_tpal] header = """ -// This file is auto-generated. 
See "special_hadamard_code_gen.py"\n #pragma once """ + TEMPLATE = """ __device__ __forceinline__ void hadamard_mult_thread_{N}(float x[{N}]) {{ float out[{N}]; @@ -220,8 +218,13 @@ def string_to_array(string): # Convert strings of + and - to bool arrays - string = string.strip().replace('+', '1').replace('-', '-1').split() - return np.stack([np.fromstring(" ".join(string[i]), dtype=np.int32, sep=' ') for i in range(len(string))]) + string = string.strip().replace("+", "1").replace("-", "-1").split() + return np.stack( + [ + np.fromstring(" ".join(string[i]), dtype=np.int32, sep=" ") + for i in range(len(string)) + ] + ) def strided_load_code_gen(N): @@ -233,28 +236,44 @@ def array_code_gen(arr, template): assert arr.shape[0] == arr.shape[1] out = [] for i in range(N): - out.append(f"out[{i}] = " + " ".join([f"{'+' if arr[i, j] == 1 else '-'} x[{j}]" for j in range(N)]) + ";") - return template.format(N=str(N), code='\n '.join(out), strided_load_code = strided_load_code_gen(N)) - - -def main(template = TEMPLATE): - output_dir = Path(__file__).parent / "fast_hadamard_transform_special.h" - output_dir.write_text(header + ''.join(array_code_gen(string_to_array(s), template) for s in had_strings)) + out.append( + f"out[{i}] = " + + " ".join([f"{'+' if arr[i, j] == 1 else '-'} x[{j}]" for j in range(N)]) + + ";" + ) + return template.format( + N=str(N), code="\n ".join(out), strided_load_code=strided_load_code_gen(N) + ) OPTION_TO_TEMPLATE = { - 'cuda': TEMPLATE, - 'cpu': CPU_TEMPLATE, - 'strided_cpu': STRIDED_CPU_TEMPLATE, + "cuda": TEMPLATE, + "cpu": CPU_TEMPLATE, + "strided_cpu": STRIDED_CPU_TEMPLATE, } -if __name__ == '__main__': +def main(option="cuda"): + try: + template = OPTION_TO_TEMPLATE[option] + except KeyError: + raise Exception( + f"bad target option {option}; options are {', '.join(OPTION_TO_TEMPLATE.keys())}" + ) + output_dir = Path(__file__).parent / "fast_hadamard_transform_special.h" + generated_line = f"// @{'generated'} by special_hadamard_code_gen.py {option}\n" + + output_dir.write_text( + generated_line + + header + + "".join(array_code_gen(string_to_array(s), template) for s in had_strings) + ) + + +if __name__ == "__main__": import sys - template = TEMPLATE + + option = "cuda" if len(sys.argv) > 1: option = sys.argv[1] - if option not in OPTION_TO_TEMPLATE: - raise Exception(f"bad target option {option}; options are {', '.join(OPTION_TO_TEMPLATE.keys())}") - template = OPTION_TO_TEMPLATE[option] - main(template) + main(option) diff --git a/extension/llm/custom_ops/spinquant/targets.bzl b/extension/llm/custom_ops/spinquant/targets.bzl index 8cf7827f9e2..42fa472548b 100644 --- a/extension/llm/custom_ops/spinquant/targets.bzl +++ b/extension/llm/custom_ops/spinquant/targets.bzl @@ -8,8 +8,9 @@ def define_common_targets(): """ runtime.cxx_library( name = "fast_hadamard_transform", - headers = [ + exported_headers = [ "fast_hadamard_transform.h", "fast_hadamard_transform_special.h", ], + visibility = ["@EXECUTORCH_CLIENTS"], ) diff --git a/extension/llm/custom_ops/spinquant/FFHT/LICENSE.md b/extension/llm/custom_ops/spinquant/third-party/FFHT/LICENSE.md similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/LICENSE.md rename to extension/llm/custom_ops/spinquant/third-party/FFHT/LICENSE.md diff --git a/extension/llm/custom_ops/spinquant/FFHT/Makefile b/extension/llm/custom_ops/spinquant/third-party/FFHT/Makefile similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/Makefile rename to 
extension/llm/custom_ops/spinquant/third-party/FFHT/Makefile diff --git a/extension/llm/custom_ops/spinquant/FFHT/README.md b/extension/llm/custom_ops/spinquant/third-party/FFHT/README.md similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/README.md rename to extension/llm/custom_ops/spinquant/third-party/FFHT/README.md diff --git a/extension/llm/custom_ops/spinquant/FFHT/example.py b/extension/llm/custom_ops/spinquant/third-party/FFHT/example.py similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/example.py rename to extension/llm/custom_ops/spinquant/third-party/FFHT/example.py diff --git a/extension/llm/custom_ops/spinquant/FFHT/fast_copy.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/fast_copy.c similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/fast_copy.c rename to extension/llm/custom_ops/spinquant/third-party/FFHT/fast_copy.c diff --git a/extension/llm/custom_ops/spinquant/FFHT/fast_copy.h b/extension/llm/custom_ops/spinquant/third-party/FFHT/fast_copy.h similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/fast_copy.h rename to extension/llm/custom_ops/spinquant/third-party/FFHT/fast_copy.h diff --git a/extension/llm/custom_ops/spinquant/FFHT/fht.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht.c similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/fht.c rename to extension/llm/custom_ops/spinquant/third-party/FFHT/fht.c diff --git a/extension/llm/custom_ops/spinquant/FFHT/fht.h b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht.h similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/fht.h rename to extension/llm/custom_ops/spinquant/third-party/FFHT/fht.h diff --git a/extension/llm/custom_ops/spinquant/FFHT/fht_avx.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_avx.c similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/fht_avx.c rename to extension/llm/custom_ops/spinquant/third-party/FFHT/fht_avx.c diff --git a/extension/llm/custom_ops/spinquant/FFHT/fht_impl.h b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_impl.h similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/fht_impl.h rename to extension/llm/custom_ops/spinquant/third-party/FFHT/fht_impl.h diff --git a/extension/llm/custom_ops/spinquant/FFHT/fht_sse.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_sse.c similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/fht_sse.c rename to extension/llm/custom_ops/spinquant/third-party/FFHT/fht_sse.c diff --git a/extension/llm/custom_ops/spinquant/FFHT/gen.py b/extension/llm/custom_ops/spinquant/third-party/FFHT/gen.py similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/gen.py rename to extension/llm/custom_ops/spinquant/third-party/FFHT/gen.py diff --git a/extension/llm/custom_ops/spinquant/FFHT/hall_of_fame_avx.txt b/extension/llm/custom_ops/spinquant/third-party/FFHT/hall_of_fame_avx.txt similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/hall_of_fame_avx.txt rename to extension/llm/custom_ops/spinquant/third-party/FFHT/hall_of_fame_avx.txt diff --git a/extension/llm/custom_ops/spinquant/FFHT/hall_of_fame_sse.txt b/extension/llm/custom_ops/spinquant/third-party/FFHT/hall_of_fame_sse.txt similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/hall_of_fame_sse.txt rename to extension/llm/custom_ops/spinquant/third-party/FFHT/hall_of_fame_sse.txt diff --git 
a/extension/llm/custom_ops/spinquant/FFHT/measurements/Makefile b/extension/llm/custom_ops/spinquant/third-party/FFHT/measurements/Makefile similarity index 61% rename from extension/llm/custom_ops/spinquant/FFHT/measurements/Makefile rename to extension/llm/custom_ops/spinquant/third-party/FFHT/measurements/Makefile index c0b6daff716..807d5fe626b 100644 --- a/extension/llm/custom_ops/spinquant/FFHT/measurements/Makefile +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/measurements/Makefile @@ -1,5 +1,5 @@ CXX=g++ -CXX_FLAGS=-O3 -Wall -march=native -std=c++11 -I../external/benchmark/include -L../external/benchmark/src -lbenchmark -lpthread +CXX_FLAGS=-O3 -Wall -march=native -std=c++11 `pkg-config benchmark --cflags --libs` -lpthread .PHONY: run_float run_double clean diff --git a/extension/llm/custom_ops/spinquant/FFHT/measurements/run_double.cpp b/extension/llm/custom_ops/spinquant/third-party/FFHT/measurements/run_double.cpp similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/measurements/run_double.cpp rename to extension/llm/custom_ops/spinquant/third-party/FFHT/measurements/run_double.cpp diff --git a/extension/llm/custom_ops/spinquant/FFHT/measurements/run_float.cpp b/extension/llm/custom_ops/spinquant/third-party/FFHT/measurements/run_float.cpp similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/measurements/run_float.cpp rename to extension/llm/custom_ops/spinquant/third-party/FFHT/measurements/run_float.cpp diff --git a/extension/llm/custom_ops/spinquant/FFHT/test_double.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/test_double.c similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/test_double.c rename to extension/llm/custom_ops/spinquant/third-party/FFHT/test_double.c diff --git a/extension/llm/custom_ops/spinquant/FFHT/test_float.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/test_float.c similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/test_float.c rename to extension/llm/custom_ops/spinquant/third-party/FFHT/test_float.c diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index bc64ae869fc..4237ae7b3a7 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -69,6 +69,7 @@ def __init__( example_inputs, args: Optional[Any] = None, enable_dynamic_shape: bool = False, + generate_full_logits: bool = False, calibration_tasks: Optional[List[str]] = None, calibration_limit: Optional[int] = None, calibration_seq_length: Optional[int] = None, @@ -86,6 +87,7 @@ def __init__( self.dtype = dtype self.example_inputs = example_inputs self.use_kv_cache = use_kv_cache + self.generate_full_logits = generate_full_logits self.enable_dynamic_shape = enable_dynamic_shape self.verbose = verbose self.metadata = metadata @@ -229,7 +231,12 @@ def calibrate_template( ) pos += 1 if pos >= len(token_list): - token_list.append(torch.argmax(logits[:], dim=-1).item()) + if self.generate_full_logits: + token_list.append( + torch.argmax(logits[:, -1], dim=-1).item() + ) + else: + token_list.append(torch.argmax(logits[:], dim=-1).item()) calibrate_template( module=prepared_module, @@ -243,6 +250,7 @@ def calibrate_template( tokenizer=tokenizer, max_seq_length=calibration_seq_length, use_kv_cache=self.use_kv_cache, + generate_full_logits=self.generate_full_logits, enable_dynamic_shape=self.enable_dynamic_shape, ) eval_results = evaluate_model( diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py 
index e75d5bef3fb..eca78bc9346 100644 --- a/extension/llm/export/partitioner_lib.py +++ b/extension/llm/export/partitioner_lib.py @@ -56,11 +56,11 @@ def get_mps_partitioner(use_kv_cache: bool = False): def get_coreml_partitioner( - use_kv_cache: bool = False, pt2e_quantize: Optional[str] = None + enable_state: bool = False, + embedding_quantize: Optional[str] = None, + pt2e_quantize: Optional[str] = None, + coreml_quantize: Optional[str] = None, ): - assert ( - use_kv_cache is True - ), "CoreML backend currently only supports static shape and use_kv_cache=True is the only way to support it at the moment" try: import coremltools as ct from executorch.backends.apple.coreml.compiler import ( # pyre-ignore @@ -75,22 +75,34 @@ def get_coreml_partitioner( ) minimum_deployment_target = ct.target.iOS15 - # In Core ML, quantization in introduced in iOS 16 - if pt2e_quantize is not None: + # In Core ML, stateful execution is introduced in iOS 18 + if enable_state: + minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS18) + # In Core ML, quantization is introduced in iOS 16 + if embedding_quantize is not None or pt2e_quantize is not None: minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS16) # In Core ML, 8-bit activation quantization is introduced in iOS 17 - if pt2e_quantize in ("coreml_8a_c8w", "coreml_baseline_8a_c8w"): + if ( + embedding_quantize is not None and int(embedding_quantize.split(",")[0]) == 8 + ) or pt2e_quantize in ("coreml_8a_c8w", "coreml_baseline_8a_c8w"): minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS17) # In Core ML, 4-bit weight compression is introduced in iOS 18 - if pt2e_quantize in ("coreml_c4w", "coreml_8a_c4w", "coreml_baseline_8a_c4w"): + if ( + (embedding_quantize is not None and int(embedding_quantize.split(",")[0]) == 4) + or pt2e_quantize in ("coreml_c4w", "coreml_8a_c4w", "coreml_baseline_8a_c4w") + or coreml_quantize == "b4w" + ): minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS18) - # In Core ML, stateful execution is introduced in iOS 18 - # TODO (https://github.com/pytorch/executorch/issues/4209) - # For now, since mutable buffer is kept in executorch runtime, - # state is out of place and can be handled by older iOS. - # Once mutable buffer can be handed over to delegate, i.e. 
state becomes in-place, we will have - # if use_kv_cache: - # minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS18) + + op_linear_quantizer_config = None + if coreml_quantize == "b4w": + op_linear_quantizer_config = { + "mode": "linear_symmetric", + "dtype": "int4", + "granularity": "per_block", + "block_size": 32, + "weight_threshold": 512, + } compile_specs = CoreMLBackend.generate_compile_specs( # pyre-fixme[16] minimum_deployment_target=minimum_deployment_target, @@ -98,9 +110,11 @@ def get_coreml_partitioner( # using `ComputeUnit.ALL` can increase the model load time, default to `ComputeUnit.CPU_AND_GPU` compute_unit=ct.ComputeUnit[ct.ComputeUnit.CPU_AND_GPU.name.upper()], model_type=CoreMLBackend.MODEL_TYPE.MODEL, # pyre-fixme[16] + op_linear_quantizer_config=op_linear_quantizer_config, ) return CoreMLPartitioner( # pyre-fixme[16] compile_specs=compile_specs, + take_over_mutable_buffer=enable_state, ) @@ -108,6 +122,7 @@ def get_qnn_partitioner( use_kv_cache: bool = False, pt2e_quantize: Optional[str] = None, num_sharding: int = 0, + soc_model: str = "SM8650", # default to SM8650 ): assert ( use_kv_cache is True @@ -130,17 +145,17 @@ def get_qnn_partitioner( ) except ImportError: raise ImportError( - "Please install the Qualcomm backend follwing https://pytorch.org/executorch/main/build-run-qualcomm-ai-engine-direct-backend.html" + "Please install the Qualcomm backend following https://pytorch.org/executorch/main/build-run-qualcomm-ai-engine-direct-backend.html" ) use_fp16 = True - skip_node_op_set = {"llama.fallback.default"} + skip_node_op_set = {"llama.fallback.default", "aten.embedding.default"} if pt2e_quantize is not None: use_fp16 = False return QnnPartitioner( # pyre-fixme[16] generate_qnn_executorch_compiler_spec( # pyre-fixme[16] - soc_model=QcomChipset.SM8650, # default to SM8650 # pyre-fixme[16] + soc_model=getattr(QcomChipset, soc_model), # pyre-fixme[16] # pyre-fixme[16] backend_options=generate_htp_compiler_spec( use_fp16=use_fp16, diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index 7fc53358c50..45d9932724e 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -180,8 +180,9 @@ def get_qnn_quantizer( # Due to the error with 16a16w in Qnn Htp, we need to disable per channel linear quantization when use 16a16w # TODO: enable it after the issue is fixed logging.warning( - "Disable per channel quantization for linear due to the error with QNN HTP 16a16w." + "Disable per channel quantization for linear and conv due to the error with QNN HTP 16a16w." 
) + qnn_quantizer.set_per_channel_conv_quant(enable=False) qnn_quantizer.set_per_channel_linear_quant(enable=False) qnn_quantizer.add_16bit_quant_ops(qnn_quantizer.SUPPORTED_OPS) qnn_quantizer.set_bit16_op_quant_config( @@ -208,6 +209,12 @@ def get_qnn_quantizer( quantization_mode is None ), "Currently qnn backend only supports QnnQuantizer via pt2e flow" qnn_quantizer.add_custom_quant_annotations(custom_annotations) + qnn_quantizer.add_discard_ops( + [ + torch.ops.aten.embedding.default, + ] + ) + return qnn_quantizer, quant_dtype diff --git a/extension/llm/runner/multimodal_runner.h b/extension/llm/runner/multimodal_runner.h index 70ecafee810..6798f648a0c 100644 --- a/extension/llm/runner/multimodal_runner.h +++ b/extension/llm/runner/multimodal_runner.h @@ -59,7 +59,8 @@ class MultimodalRunner { const std::string& prompt, int32_t seq_len = 1024, std::function token_callback = {}, - std::function stats_callback = {}) = 0; + std::function stats_callback = {}, + bool echo = true) = 0; /** * Prefill an LLaVA Module with the given images input. @@ -95,6 +96,7 @@ class MultimodalRunner { * @param start_pos The starting position in KV cache of the input in the LLM. * @param token_callback What to do after a token is generated. * @param stats_callback What to do with Stats. + * @param echo Whether to echo the input prompt or not. * @return The error code. */ virtual runtime::Error generate_from_pos( @@ -103,7 +105,8 @@ class MultimodalRunner { int64_t start_pos = 0, std::function token_callback = {}, std::function - stats_callback = {}) = 0; + stats_callback = {}, + bool echo = true) = 0; inline void stop() { text_token_generator_->stop(); diff --git a/extension/llm/runner/text_token_generator.h b/extension/llm/runner/text_token_generator.h index 01887e75600..1726750ece5 100644 --- a/extension/llm/runner/text_token_generator.h +++ b/extension/llm/runner/text_token_generator.h @@ -70,11 +70,8 @@ class TextTokenGenerator { } // initialize tensor wrappers - auto tokens_managed = from_blob( - token_data.data(), - token_shape, - exec_aten::ScalarType::Long, - exec_aten::TensorShapeDynamism::DYNAMIC_BOUND); + auto tokens_managed = + from_blob(token_data.data(), token_shape, exec_aten::ScalarType::Long); auto start_pos_managed = from_blob(&pos, {1}, exec_aten::ScalarType::Long); diff --git a/extension/module/test/module_test.cpp b/extension/module/test/module_test.cpp index 75cead25a72..7db4784dc93 100644 --- a/extension/module/test/module_test.cpp +++ b/extension/module/test/module_test.cpp @@ -15,9 +15,8 @@ #include -using namespace ::testing; - -namespace torch::executor { +using namespace ::executorch::extension; +using namespace ::executorch::runtime; class ModuleTest : public ::testing::Test { protected: @@ -102,13 +101,13 @@ TEST_F(ModuleTest, TestMethodMeta) { const auto input_meta = meta->input_tensor_meta(0); EXPECT_TRUE(input_meta.ok()); - EXPECT_EQ(input_meta->scalar_type(), ScalarType::Float); + EXPECT_EQ(input_meta->scalar_type(), exec_aten::ScalarType::Float); EXPECT_EQ(input_meta->sizes().size(), 1); EXPECT_EQ(input_meta->sizes()[0], 1); const auto output_meta = meta->output_tensor_meta(0); EXPECT_TRUE(output_meta.ok()); - EXPECT_EQ(output_meta->scalar_type(), ScalarType::Float); + EXPECT_EQ(output_meta->scalar_type(), exec_aten::ScalarType::Float); EXPECT_EQ(output_meta->sizes().size(), 1); EXPECT_EQ(output_meta->sizes()[0], 1); } @@ -125,11 +124,11 @@ TEST_F(ModuleTest, TestExecute) { std::array input{1}; std::array sizes{1}; - TensorImpl tensor( - ScalarType::Float, sizes.size(), 
sizes.data(), input.data()); + exec_aten::TensorImpl tensor( + exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = - module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); + const auto result = module.execute( + "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result.ok()); EXPECT_TRUE(result.ok()); @@ -149,11 +148,11 @@ TEST_F(ModuleTest, TestExecutePreload) { std::array input{1}; std::array sizes{1}; - TensorImpl tensor( - ScalarType::Float, sizes.size(), sizes.data(), input.data()); + exec_aten::TensorImpl tensor( + exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = - module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); + const auto result = module.execute( + "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); @@ -169,11 +168,11 @@ TEST_F(ModuleTest, TestExecutePreload_method) { std::array input{1}; std::array sizes{1}; - TensorImpl tensor( - ScalarType::Float, sizes.size(), sizes.data(), input.data()); + exec_aten::TensorImpl tensor( + exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = - module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); + const auto result = module.execute( + "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); @@ -192,11 +191,11 @@ TEST_F(ModuleTest, TestExecutePreloadProgramAndMethod) { std::array input{1}; std::array sizes{1}; - TensorImpl tensor( - ScalarType::Float, sizes.size(), sizes.data(), input.data()); + exec_aten::TensorImpl tensor( + exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = - module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); + const auto result = module.execute( + "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); @@ -225,10 +224,11 @@ TEST_F(ModuleTest, TestGet) { std::array input{1}; std::array sizes{1}; - TensorImpl tensor( - ScalarType::Float, sizes.size(), sizes.data(), input.data()); + exec_aten::TensorImpl tensor( + exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module.get("forward", {Tensor(&tensor), Tensor(&tensor)}); + const auto result = module.get( + "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->toTensor().const_data_ptr(); @@ -240,10 +240,11 @@ TEST_F(ModuleTest, TestForward) { std::array input{1}; std::array sizes{1}; - TensorImpl tensor( - ScalarType::Float, sizes.size(), sizes.data(), input.data()); + exec_aten::TensorImpl tensor( + exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module->forward({Tensor(&tensor), Tensor(&tensor)}); + const auto result = + module->forward({exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); @@ -251,9 +252,10 @@ TEST_F(ModuleTest, TestForward) { EXPECT_NEAR(data[0], 2, 1e-5); std::array input2{2, 3}; - TensorImpl tensor2( - ScalarType::Float, sizes.size(), sizes.data(), input2.data()); - const auto result2 = module->forward({Tensor(&tensor2), Tensor(&tensor2)}); + 
exec_aten::TensorImpl tensor2( + exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input2.data()); + const auto result2 = module->forward( + {exec_aten::Tensor(&tensor2), exec_aten::Tensor(&tensor2)}); EXPECT_TRUE(result2.ok()); const auto data2 = result->at(0).toTensor().const_data_ptr(); @@ -298,10 +300,9 @@ TEST_F(ModuleTest, TestProgramSharingBetweenModules) { } TEST_F(ModuleTest, TestProgramSharingAndDataLoaderManagement) { - auto loader = util::FileDataLoader::from(model_path_.c_str()); + auto loader = FileDataLoader::from(model_path_.c_str()); EXPECT_TRUE(loader.ok()); - auto data_loader = - std::make_unique(std::move(loader.get())); + auto data_loader = std::make_unique(std::move(loader.get())); auto module1 = std::make_unique(std::move(data_loader)); @@ -311,24 +312,24 @@ TEST_F(ModuleTest, TestProgramSharingAndDataLoaderManagement) { std::array input{1}; std::array sizes{1}; - TensorImpl tensor( - ScalarType::Float, sizes.size(), sizes.data(), input.data()); + exec_aten::TensorImpl tensor( + exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); - auto result1 = - module1->execute("forward", {Tensor(&tensor), Tensor(&tensor)}); + auto result1 = module1->execute( + "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result1.ok()); auto module2 = std::make_unique(module1->program()); - auto result2 = - module2->execute("forward", {Tensor(&tensor), Tensor(&tensor)}); + auto result2 = module2->execute( + "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result2.ok()); module1 = std::make_unique("/path/to/nonexistent/file.pte"); EXPECT_FALSE(module1->is_loaded()); - auto result3 = - module2->execute("forward", {Tensor(&tensor), Tensor(&tensor)}); + auto result3 = module2->execute( + "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result3.ok()); } @@ -336,10 +337,10 @@ TEST_F(ModuleTest, TestProgramPersistenceAndReuseAfterModuleDestruction) { std::shared_ptr shared_program; { - auto loader = util::FileDataLoader::from(model_path_.c_str()); + auto loader = FileDataLoader::from(model_path_.c_str()); EXPECT_TRUE(loader.ok()); auto data_loader = - std::make_unique(std::move(loader.get())); + std::make_unique(std::move(loader.get())); auto* data_loader_ptr = data_loader.get(); Module module(std::move(data_loader)); @@ -362,10 +363,11 @@ TEST_F(ModuleTest, TestProgramPersistenceAndReuseAfterModuleDestruction) { std::array input{1}; std::array sizes{1}; - TensorImpl tensor( - ScalarType::Float, sizes.size(), sizes.data(), input.data()); + exec_aten::TensorImpl tensor( + exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); - auto result = module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); + auto result = module.execute( + "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result.ok()); auto data = result->at(0).toTensor().const_data_ptr(); @@ -391,10 +393,14 @@ TEST_F(ModuleTest, TestConcurrentExecutionWithSharedProgram) { const std::array& input) { Module module(program); std::array sizes{1}; - TensorImpl tensor( - ScalarType::Float, sizes.size(), sizes.data(), (void*)input.data()); - - const auto result = module.forward({Tensor(&tensor), Tensor(&tensor)}); + exec_aten::TensorImpl tensor( + exec_aten::ScalarType::Float, + sizes.size(), + sizes.data(), + (void*)input.data()); + + const auto result = module.forward( + {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto 
data = result->at(0).toTensor().const_data_ptr(); @@ -413,5 +419,3 @@ TEST_F(ModuleTest, TestConcurrentExecutionWithSharedProgram) { t4.join(); t5.join(); } - -} // namespace torch::executor diff --git a/extension/pybindings/pybindings.cpp b/extension/pybindings/pybindings.cpp index c605c48c582..57bc44d1394 100644 --- a/extension/pybindings/pybindings.cpp +++ b/extension/pybindings/pybindings.cpp @@ -71,6 +71,7 @@ void et_pal_emit_log_message( } namespace py = pybind11; +using executorch::bundled_program::verify_method_outputs; using ::executorch::extension::BufferDataLoader; using ::executorch::extension::MallocMemoryAllocator; using ::executorch::extension::MmapDataLoader; @@ -79,7 +80,7 @@ using ::executorch::runtime::DataLoader; using ::executorch::runtime::Error; using ::executorch::runtime::EValue; using ::executorch::runtime::EventTracerDebugLogLevel; -using ::executorch::runtime::get_kernels; +using ::executorch::runtime::get_registered_kernels; using ::executorch::runtime::HierarchicalAllocator; using ::executorch::runtime::Kernel; using ::executorch::runtime::MemoryAllocator; @@ -92,8 +93,6 @@ using ::executorch::runtime::Span; using ::executorch::runtime::Tag; using torch::executor::etdump_result; using torch::executor::ETDumpGen; -using torch::executor::bundled_program::LoadBundledInput; -using torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput; #ifndef USE_ATEN_LIB using ::executorch::extension::alias_attensor_to_etensor; @@ -655,11 +654,11 @@ struct PyModule final { const std::string method_name, size_t testset_idx) { const void* bundled_program_ptr = m.get_bundled_program_ptr(); - Error status = LoadBundledInput( + Error status = executorch::bundled_program::load_bundled_input( module_->get_method(method_name), bundled_program_ptr, testset_idx); THROW_IF_ERROR( status, - "LoadBundledInput failed with status %" PRIu32, + "load_bundled_input failed with status 0x%" PRIx32, static_cast(status)); } @@ -671,13 +670,14 @@ struct PyModule final { double atol = 1e-8) { const void* bundled_program_ptr = m.get_bundled_program_ptr(); auto& method = module_->get_method(method_name); - Error status = LoadBundledInput(method, bundled_program_ptr, testset_idx); + Error status = executorch::bundled_program::load_bundled_input( + method, bundled_program_ptr, testset_idx); THROW_IF_ERROR( status, - "LoadBundledInput failed with status %" PRIu32, + "load_bundled_input failed with status 0x%" PRIx32, static_cast(status)); py::list outputs = plan_execute(method_name); - status = VerifyResultWithBundledExpectedOutput( + status = executorch::bundled_program::verify_method_outputs( method, bundled_program_ptr, testset_idx, rtol, atol); THROW_IF_ERROR( status, @@ -774,7 +774,7 @@ void create_profile_block(const std::string& name) { } py::list get_operator_names() { - ArrayRef kernels = get_kernels(); + Span kernels = get_registered_kernels(); py::list res; for (const Kernel& k : kernels) { if (k.name_ != nullptr) { diff --git a/extension/tensor/targets.bzl b/extension/tensor/targets.bzl index 4998b5cf15b..8493d093fa1 100644 --- a/extension/tensor/targets.bzl +++ b/extension/tensor/targets.bzl @@ -15,6 +15,7 @@ def define_common_targets(): srcs = [ "tensor_impl_ptr.cpp", "tensor_ptr.cpp", + "tensor_ptr_maker.cpp", ], exported_headers = [ "tensor.h", diff --git a/extension/tensor/tensor_impl_ptr.h b/extension/tensor/tensor_impl_ptr.h index 3ccede79b1d..5f34f929b96 100644 --- a/extension/tensor/tensor_impl_ptr.h +++ b/extension/tensor/tensor_impl_ptr.h @@ -66,7 +66,7 @@ 
TensorImplPtr make_tensor_impl_ptr(
     std::vector<exec_aten::DimOrderType> dim_order = {},
     std::vector<exec_aten::StridesType> strides = {},
     exec_aten::TensorShapeDynamism dynamism =
-        exec_aten::TensorShapeDynamism::STATIC,
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND,
     std::function<void(void*)> deleter = nullptr);
 
 /**
@@ -93,10 +93,10 @@ TensorImplPtr make_tensor_impl_ptr(
     std::vector<exec_aten::DimOrderType> dim_order = {},
     std::vector<exec_aten::StridesType> strides = {},
     exec_aten::TensorShapeDynamism dynamism =
-        exec_aten::TensorShapeDynamism::STATIC) {
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
   constexpr exec_aten::ScalarType scalar_type =
       runtime::CppTypeToScalarType<T>::value;
-  auto raw_data_ptr = data.data();
+  const auto raw_data_ptr = data.data();
   auto data_ptr = std::make_shared<std::vector<T>>(std::move(data));
   return make_tensor_impl_ptr(
       scalar_type,
@@ -108,6 +108,40 @@ TensorImplPtr make_tensor_impl_ptr(
       [data_ptr = std::move(data_ptr)](void*) {});
 }
 
+/**
+ * Creates a TensorImplPtr that manages a newly created TensorImpl with the
+ * specified properties.
+ *
+ * This template overload is specialized for cases where the tensor data is
+ * provided as a vector. The scalar type is automatically deduced from the
+ * vector's data type. The deleter ensures that the data vector is properly
+ * managed and its lifetime is tied to the TensorImpl.
+ *
+ * @tparam T The C++ type of the tensor elements, deduced from the vector.
+ * @param data A vector containing the tensor's data.
+ * @param dynamism Specifies the mutability of the tensor's shape.
+ * @return A TensorImplPtr that manages the newly created TensorImpl.
+ */
+template <typename T = float>
+TensorImplPtr make_tensor_impl_ptr(
+    std::vector<T> data,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  constexpr exec_aten::ScalarType scalar_type =
+      runtime::CppTypeToScalarType<T>::value;
+  std::vector<exec_aten::SizesType> sizes{exec_aten::SizesType(data.size())};
+  const auto raw_data_ptr = data.data();
+  auto data_ptr = std::make_shared<std::vector<T>>(std::move(data));
+  return make_tensor_impl_ptr(
+      scalar_type,
+      std::move(sizes),
+      raw_data_ptr,
+      {0},
+      {1},
+      dynamism,
+      [data_ptr = std::move(data_ptr)](void*) {});
+}
+
 /**
  * Creates a TensorImplPtr that manages a newly created TensorImpl with the
  * specified properties.
@@ -131,7 +165,7 @@ TensorImplPtr make_tensor_impl_ptr(
     std::vector<exec_aten::DimOrderType> dim_order = {},
     std::vector<exec_aten::StridesType> strides = {},
     exec_aten::TensorShapeDynamism dynamism =
-        exec_aten::TensorShapeDynamism::STATIC);
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND);
 
 } // namespace extension
 } // namespace executorch
diff --git a/extension/tensor/tensor_ptr.h b/extension/tensor/tensor_ptr.h
index 18568876607..f477199a3e1 100644
--- a/extension/tensor/tensor_ptr.h
+++ b/extension/tensor/tensor_ptr.h
@@ -125,7 +125,7 @@ inline TensorPtr make_tensor_ptr(
     std::vector<exec_aten::DimOrderType> dim_order = {},
     std::vector<exec_aten::StridesType> strides = {},
     const exec_aten::TensorShapeDynamism dynamism =
-        exec_aten::TensorShapeDynamism::STATIC,
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND,
     std::function<void(void*)> deleter = nullptr) {
   return make_tensor_ptr(make_tensor_impl_ptr(
       type,
@@ -142,8 +142,7 @@ inline TensorPtr make_tensor_ptr(
  *
  * This template overload is specialized for cases where the tensor data is
  * provided as a vector. The scalar type is automatically deduced from the
- * vector's data type. The deleter ensures that the data vector is properly
- * managed and its lifetime is tied to the TensorImpl.
+ * vector's data type.
  *
  * @tparam T The C++ type of the tensor elements, deduced from the vector.
  * @param sizes A vector specifying the size of each dimension.
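// --- Editor's aside (illustration only; not part of the patch) ---
// Net effect of this hunk and the ones below: a tensor can now be created
// directly from a vector (and, further down, an initializer list) with the
// scalar type deduced, and DYNAMIC_BOUND becomes the default dynamism, so a
// shape may later be resized within its initial bound. A hedged sketch,
// assuming the usual executorch include prefix:
#include <executorch/extension/tensor/tensor_ptr.h>

#include <cstdint>
#include <vector>

void example() {
  // 1-D float tensor with sizes {3}, deduced from the initializer list.
  auto a = executorch::extension::make_tensor_ptr({1.0f, 2.0f, 3.0f});
  // 1-D int tensor with sizes {4}, deduced from the vector's element type.
  auto b = executorch::extension::make_tensor_ptr(std::vector<int32_t>{1, 2, 3, 4});
  // Pass exec_aten::TensorShapeDynamism::STATIC explicitly to retain the
  // previous fixed-shape default.
}
// --- End aside ---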
@@ -160,7 +159,7 @@ TensorPtr make_tensor_ptr( std::vector dim_order = {}, std::vector strides = {}, exec_aten::TensorShapeDynamism dynamism = - exec_aten::TensorShapeDynamism::STATIC) { + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { return make_tensor_ptr(make_tensor_impl_ptr( std::move(sizes), std::move(data), @@ -169,6 +168,47 @@ TensorPtr make_tensor_ptr( dynamism)); } +/** + * Creates a TensorPtr that manages a Tensor with the specified properties. + * + * This template overload is specialized for cases where the tensor data is + * provided as a vector. The scalar type is automatically deduced from the + * vector's data type. + * + * @tparam T The C++ type of the tensor elements, deduced from the vector. + * @param data A vector containing the tensor's data. + * @param dynamism Specifies the mutability of the tensor's shape. + * @return A TensorPtr that manages the newly created TensorImpl. + */ +template +TensorPtr make_tensor_ptr( + std::vector data, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { + return make_tensor_ptr(make_tensor_impl_ptr(std::move(data), dynamism)); +} + +/** + * Creates a TensorPtr that manages a Tensor with the specified properties. + * + * This template overload allows creating a Tensor from an initializer list + * of data. The scalar type is automatically deduced from the type of the + * initializer list's elements. + * + * @tparam T The C++ type of the tensor elements, deduced from the initializer + * list. + * @param data An initializer list containing the tensor's data. + * @param dynamism Specifies the mutability of the tensor's shape. + * @return A TensorPtr that manages the newly created TensorImpl. + */ +template +TensorPtr make_tensor_ptr( + std::initializer_list data, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { + return make_tensor_ptr(std::vector(data), dynamism); +} + /** * Creates a TensorPtr that manages a Tensor with the specified properties. * @@ -191,7 +231,7 @@ inline TensorPtr make_tensor_ptr( std::vector dim_order = {}, std::vector strides = {}, exec_aten::TensorShapeDynamism dynamism = - exec_aten::TensorShapeDynamism::STATIC) { + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { return make_tensor_ptr(make_tensor_impl_ptr( scalar_type, std::move(sizes), diff --git a/extension/tensor/tensor_ptr_maker.cpp b/extension/tensor/tensor_ptr_maker.cpp new file mode 100644 index 00000000000..1a09fea4cac --- /dev/null +++ b/extension/tensor/tensor_ptr_maker.cpp @@ -0,0 +1,177 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +namespace executorch { +namespace extension { +namespace { + +template < + typename INT_T, + typename std::enable_if< + std::is_integral::value && !std::is_same::value, + bool>::type = true> +bool extract_scalar(exec_aten::Scalar scalar, INT_T* out_val) { + if (!scalar.isIntegral(/*includeBool=*/false)) { + return false; + } + int64_t val = scalar.to(); + if (val < std::numeric_limits::lowest() || + val > std::numeric_limits::max()) { + return false; + } + *out_val = static_cast(val); + return true; +} + +template < + typename FLOAT_T, + typename std::enable_if::value, bool>:: + type = true> +bool extract_scalar(exec_aten::Scalar scalar, FLOAT_T* out_val) { + double val; + if (scalar.isFloatingPoint()) { + val = scalar.to(); + if (std::isfinite(val) && + (val < std::numeric_limits::lowest() || + val > std::numeric_limits::max())) { + return false; + } + } else if (scalar.isIntegral(/*includeBool=*/false)) { + val = static_cast(scalar.to()); + } else { + return false; + } + *out_val = static_cast(val); + return true; +} + +template < + typename BOOL_T, + typename std::enable_if::value, bool>::type = + true> +bool extract_scalar(exec_aten::Scalar scalar, BOOL_T* out_val) { + if (scalar.isIntegral(false)) { + *out_val = static_cast(scalar.to()); + return true; + } + if (scalar.isBoolean()) { + *out_val = scalar.to(); + return true; + } + return false; +} + +#define ET_EXTRACT_SCALAR(scalar, out_val) \ + ET_CHECK_MSG( \ + extract_scalar(scalar, &out_val), \ + #scalar " could not be extracted: wrong type or out of range"); + +template +TensorPtr random_strided( + std::vector sizes, + std::vector strides, + exec_aten::ScalarType type, + exec_aten::TensorShapeDynamism dynamism, + Distribution&& distribution) { + auto tensor = + empty_strided(std::move(sizes), std::move(strides), type, dynamism); + std::default_random_engine gen{std::random_device{}()}; + + ET_SWITCH_REALB_TYPES(type, nullptr, "random_strided", CTYPE, [&] { + std::generate_n(tensor->mutable_data_ptr(), tensor->numel(), [&]() { + return static_cast(distribution(gen)); + }); + }); + return tensor; +} + +} // namespace + +TensorPtr empty_strided( + std::vector sizes, + std::vector strides, + exec_aten::ScalarType type, + exec_aten::TensorShapeDynamism dynamism) { + std::vector data( + exec_aten::compute_numel(sizes.data(), sizes.size()) * + exec_aten::elementSize(type)); + return make_tensor_ptr( + type, + std::move(sizes), + std::move(data), + {}, + std::move(strides), + dynamism); +} + +TensorPtr full_strided( + std::vector sizes, + std::vector strides, + exec_aten::Scalar fill_value, + exec_aten::ScalarType type, + exec_aten::TensorShapeDynamism dynamism) { + auto tensor = + empty_strided(std::move(sizes), std::move(strides), type, dynamism); + ET_SWITCH_REALB_TYPES(type, nullptr, "full_strided", CTYPE, [&] { + CTYPE value; + ET_EXTRACT_SCALAR(fill_value, value); + std::fill( + tensor->mutable_data_ptr(), + tensor->mutable_data_ptr() + tensor->numel(), + value); + }); + return tensor; +} + +TensorPtr rand_strided( + std::vector sizes, + std::vector strides, + exec_aten::ScalarType type, + exec_aten::TensorShapeDynamism dynamism) { + return random_strided( + std::move(sizes), + std::move(strides), + type, + dynamism, + std::uniform_real_distribution(0.0f, 1.0f)); +} + +TensorPtr randn_strided( + std::vector sizes, + std::vector strides, + exec_aten::ScalarType type, + exec_aten::TensorShapeDynamism dynamism) { + return random_strided( + std::move(sizes), + std::move(strides), + type, + dynamism, + 
std::normal_distribution(0.0f, 1.0f)); +} + +TensorPtr randint_strided( + int64_t low, + int64_t high, + std::vector sizes, + std::vector strides, + exec_aten::ScalarType type, + exec_aten::TensorShapeDynamism dynamism) { + return random_strided( + std::move(sizes), + std::move(strides), + type, + dynamism, + std::uniform_int_distribution(low, high - 1)); +} + +} // namespace extension +} // namespace executorch diff --git a/extension/tensor/tensor_ptr_maker.h b/extension/tensor/tensor_ptr_maker.h index a08f04c2101..4e65480b7fd 100644 --- a/extension/tensor/tensor_ptr_maker.h +++ b/extension/tensor/tensor_ptr_maker.h @@ -15,7 +15,7 @@ namespace extension { /** * A helper class for creating TensorPtr instances from raw data and tensor - * properties. Note the the TensorPtr created by this class will not own the + * properties. Note that the TensorPtr created by this class will not own the * data, so it must outlive the TensorPtr. * * TensorPtrMaker provides a fluent interface for specifying various properties @@ -31,6 +31,7 @@ class TensorPtrMaker final { // But it is movable. TensorPtrMaker(TensorPtrMaker&&) = default; TensorPtrMaker& operator=(TensorPtrMaker&&) = default; + /** * Sets the scalar type of the tensor elements. * @@ -138,7 +139,7 @@ class TensorPtrMaker final { void* data_ = nullptr; exec_aten::ScalarType type_ = exec_aten::ScalarType::Float; exec_aten::TensorShapeDynamism dynamism_ = - exec_aten::TensorShapeDynamism::STATIC; + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND; }; /** @@ -182,7 +183,7 @@ inline TensorPtr from_blob( std::vector sizes, exec_aten::ScalarType type = exec_aten::ScalarType::Float, exec_aten::TensorShapeDynamism dynamism = - exec_aten::TensorShapeDynamism::STATIC) { + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { return for_blob(data, std::move(sizes), type) .dynamism(dynamism) .make_tensor_ptr(); @@ -210,7 +211,7 @@ inline TensorPtr from_blob( std::vector strides, exec_aten::ScalarType type = exec_aten::ScalarType::Float, exec_aten::TensorShapeDynamism dynamism = - exec_aten::TensorShapeDynamism::STATIC) { + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { return for_blob(data, std::move(sizes), type) .strides(std::move(strides)) .dynamism(dynamism) @@ -239,7 +240,7 @@ inline TensorPtr from_blob( exec_aten::ScalarType type, std::function&& deleter, exec_aten::TensorShapeDynamism dynamism = - exec_aten::TensorShapeDynamism::STATIC) { + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { return for_blob(data, std::move(sizes), type) .deleter(std::move(deleter)) .dynamism(dynamism) @@ -270,7 +271,7 @@ inline TensorPtr from_blob( exec_aten::ScalarType type, std::function&& deleter, exec_aten::TensorShapeDynamism dynamism = - exec_aten::TensorShapeDynamism::STATIC) { + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { return for_blob(data, std::move(sizes), type) .strides(std::move(strides)) .deleter(std::move(deleter)) @@ -278,5 +279,408 @@ inline TensorPtr from_blob( .make_tensor_ptr(); } +/** + * Creates a TensorPtr with the specified sizes, strides, and properties. + * + * This function allocates memory for the tensor elements but does not + * initialize them with any specific values. The tensor is created with the + * specified strides. + * + * @param sizes A vector specifying the size of each dimension. + * @param strides A vector specifying the stride for each dimension. + * @param type The scalar type of the tensor elements. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. 
+ * @return A TensorPtr instance managing the newly created Tensor. + */ +TensorPtr empty_strided( + std::vector sizes, + std::vector strides, + exec_aten::ScalarType type = exec_aten::ScalarType::Float, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND); + +/** + * Creates an empty TensorPtr with the same size and properties as the given + * tensor. + * + * This function allocates memory for the tensor elements but does not + * initialize them with any specific values. + * + * @param other A reference to another tensor, whose size and properties will be + * used. + * @param type The scalar type of the tensor elements. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance managing the newly created Tensor. + */ +inline TensorPtr empty_like( + const TensorPtr& other, + exec_aten::ScalarType type = exec_aten::ScalarType::Undefined, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { + if (type == exec_aten::ScalarType::Undefined) { + type = other->scalar_type(); + } + return empty_strided( + {other->sizes().begin(), other->sizes().end()}, + {other->strides().begin(), other->strides().end()}, + type, + dynamism); +} + +/** + * Creates an empty TensorPtr with the specified sizes and properties. + * + * This function allocates memory for the tensor elements but does not + * initialize them with any specific values. + * + * @param sizes A vector specifying the size of each dimension. + * @param type The scalar type of the tensor elements. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance managing the newly created Tensor. + */ +inline TensorPtr empty( + std::vector sizes, + exec_aten::ScalarType type = exec_aten::ScalarType::Float, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { + return empty_strided(std::move(sizes), {}, type, dynamism); +} + +/** + * Creates a TensorPtr filled with the specified value. + * + * @param sizes A vector specifying the size of each dimension. + * @param strides A vector specifying the stride for each dimension. + * @param fill_value The value to fill the tensor with. + * @param type The scalar type of the tensor elements. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance managing the newly created Tensor. + */ +TensorPtr full_strided( + std::vector sizes, + std::vector strides, + exec_aten::Scalar fill_value, + exec_aten::ScalarType type = exec_aten::ScalarType::Float, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND); + +/** + * Creates a TensorPtr filled with the specified value, with the same size and + * properties as another tensor. + * + * @param other A reference to another tensor, whose size and properties will be + * used. + * @param fill_value The value to fill the tensor with. + * @param type The scalar type of the tensor elements. If not specified, the + * scalar type of the other tensor is used. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance managing the newly created Tensor. 
+ */
+inline TensorPtr full_like(
+    const TensorPtr& other,
+    exec_aten::Scalar fill_value,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Undefined,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  if (type == exec_aten::ScalarType::Undefined) {
+    type = other->scalar_type();
+  }
+  return full_strided(
+      {other->sizes().begin(), other->sizes().end()},
+      {other->strides().begin(), other->strides().end()},
+      fill_value,
+      type,
+      dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with the specified value.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param fill_value The value to fill the tensor with.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr full(
+    std::vector<exec_aten::SizesType> sizes,
+    exec_aten::Scalar fill_value,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Float,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return full_strided(std::move(sizes), {}, fill_value, type, dynamism);
+}
+
+/**
+ * Creates a TensorPtr that holds a scalar value.
+ *
+ * @param value The scalar value to create the tensor with.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created scalar Tensor.
+ */
+inline TensorPtr scalar_tensor(
+    exec_aten::Scalar value,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Float,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return full({}, value, type, dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with ones, with the same size and properties as
+ * another tensor.
+ *
+ * @param other A reference to another tensor, whose size and properties will
+ * be used.
+ * @param type The scalar type of the tensor elements. If not specified, the
+ * scalar type of the `other` tensor is used.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr ones_like(
+    const TensorPtr& other,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Undefined,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return full_like(other, 1, type, dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with ones.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr ones(
+    std::vector<exec_aten::SizesType> sizes,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Float,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return full(std::move(sizes), 1, type, dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with zeros, with the same size and properties as
+ * another tensor.
+ *
+ * @param other A reference to another tensor, whose size and properties will
+ * be used.
+ * @param type The scalar type of the tensor elements. If not specified, the
+ * scalar type of the `other` tensor is used.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr zeros_like(
+    const TensorPtr& other,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Undefined,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return full_like(other, 0, type, dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with zeros.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr zeros(
+    std::vector<exec_aten::SizesType> sizes,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Float,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return full(std::move(sizes), 0, type, dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with random values between 0 and 1.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param strides A vector specifying the stride for each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+TensorPtr rand_strided(
+    std::vector<exec_aten::SizesType> sizes,
+    std::vector<exec_aten::StridesType> strides,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Float,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND);
+
+/**
+ * Creates a TensorPtr filled with random values between 0 and 1.
+ *
+ * @param other A reference to another tensor, whose size and properties will
+ * be used.
+ * @param type The scalar type of the tensor elements. If not specified, the
+ * scalar type of the other tensor is used.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr rand_like(
+    const TensorPtr& other,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Undefined,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  if (type == exec_aten::ScalarType::Undefined) {
+    type = other->scalar_type();
+  }
+  return rand_strided(
+      {other->sizes().begin(), other->sizes().end()},
+      {other->strides().begin(), other->strides().end()},
+      type,
+      dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with random values between 0 and 1.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr rand(
+    std::vector<exec_aten::SizesType> sizes,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Float,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return rand_strided(std::move(sizes), {}, type, dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with random values from a normal distribution.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param strides A vector specifying the stride for each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+TensorPtr randn_strided(
+    std::vector<exec_aten::SizesType> sizes,
+    std::vector<exec_aten::StridesType> strides,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Float,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND);
+
+/**
+ * Creates a TensorPtr filled with random values from a normal distribution.
+ *
+ * @param other A reference to another tensor, whose size and properties will
+ * be used.
+ * @param type The scalar type of the tensor elements. If not specified, the
+ * scalar type of the other tensor is used.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr randn_like(
+    const TensorPtr& other,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Undefined,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  if (type == exec_aten::ScalarType::Undefined) {
+    type = other->scalar_type();
+  }
+  return randn_strided(
+      {other->sizes().begin(), other->sizes().end()},
+      {other->strides().begin(), other->strides().end()},
+      type,
+      dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with random values from a normal distribution.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr randn(
+    std::vector<exec_aten::SizesType> sizes,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Float,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return randn_strided(std::move(sizes), {}, type, dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with random integer values in the given range.
+ *
+ * @param low The lower bound (inclusive) of the random values.
+ * @param high The upper bound (exclusive) of the random values.
+ * @param sizes A vector specifying the size of each dimension.
+ * @param strides A vector specifying the stride for each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+TensorPtr randint_strided(
+    int64_t low,
+    int64_t high,
+    std::vector<exec_aten::SizesType> sizes,
+    std::vector<exec_aten::StridesType> strides,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Int,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND);
+
+/**
+ * Creates a TensorPtr filled with random integer values in the given range.
+ *
+ * @param other A reference to another tensor, whose size and properties will
+ * be used.
+ * @param low The lower bound (inclusive) of the random values.
+ * @param high The upper bound (exclusive) of the random values.
+ * @param type The scalar type of the tensor elements. If not specified, the
+ * scalar type of the other tensor is used.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr randint_like(
+    const TensorPtr& other,
+    int64_t low,
+    int64_t high,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Undefined,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  if (type == exec_aten::ScalarType::Undefined) {
+    type = other->scalar_type();
+  }
+  return randint_strided(
+      low,
+      high,
+      {other->sizes().begin(), other->sizes().end()},
+      {other->strides().begin(), other->strides().end()},
+      type,
+      dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with random integer values in the given range.
+ *
+ * @param low The lower bound (inclusive) of the random values.
+ * @param high The upper bound (exclusive) of the random values.
+ * @param sizes A vector specifying the size of each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr randint(
+    int64_t low,
+    int64_t high,
+    std::vector<exec_aten::SizesType> sizes,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Int,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return randint_strided(low, high, std::move(sizes), {}, type, dynamism);
+}
+
 } // namespace extension
 } // namespace executorch
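For orientation, the helpers added above compose into one-liners for common tensor setups. A minimal usage sketch (not part of the patch; the include path and namespace are assumptions based on where these files live):

#include <executorch/extension/tensor/tensor_ptr_maker.h>

using namespace executorch::extension;

void creation_examples() {
  auto a = ones({2, 3});                                 // 2x3 Float, all 1.0f
  auto b = zeros({4, 4}, exec_aten::ScalarType::Long);   // 4x4 int64, all 0
  auto c = full_like(a, 7);                              // dtype inherited from `a`
  auto s = scalar_tensor(3.14f);                         // 0-dim, numel() == 1
  auto r = randint(10, 20, {3, 3});                      // Int values in [10, 20)
}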
diff --git a/extension/tensor/test/tensor_impl_ptr_test.cpp b/extension/tensor/test/tensor_impl_ptr_test.cpp
index 45d79f240af..f7fd062c462 100644
--- a/extension/tensor/test/tensor_impl_ptr_test.cpp
+++ b/extension/tensor/test/tensor_impl_ptr_test.cpp
@@ -23,6 +23,29 @@ class TensorImplPtrTest : public ::testing::Test {
   }
 };
 
+TEST_F(TensorImplPtrTest, ScalarTensorCreation) {
+  float scalar_data = 3.14f;
+  auto tensor_impl =
+      make_tensor_impl_ptr(exec_aten::ScalarType::Float, {}, &scalar_data);
+
+  EXPECT_EQ(tensor_impl->numel(), 1);
+  EXPECT_EQ(tensor_impl->dim(), 0);
+  EXPECT_EQ(tensor_impl->sizes().size(), 0);
+  EXPECT_EQ(tensor_impl->strides().size(), 0);
+  EXPECT_EQ((float*)tensor_impl->data(), &scalar_data);
+  EXPECT_EQ(((float*)tensor_impl->data())[0], 3.14f);
+}
+
+TEST_F(TensorImplPtrTest, ScalarTensorOwningData) {
+  auto tensor_impl = make_tensor_impl_ptr({}, {3.14f});
+
+  EXPECT_EQ(tensor_impl->numel(), 1);
+  EXPECT_EQ(tensor_impl->dim(), 0);
+  EXPECT_EQ(tensor_impl->sizes().size(), 0);
+  EXPECT_EQ(tensor_impl->strides().size(), 0);
+  EXPECT_EQ(((float*)tensor_impl->data())[0], 3.14f);
+}
+
 TEST_F(TensorImplPtrTest, TensorImplCreation) {
   float data[20] = {2};
   auto tensor_impl = make_tensor_impl_ptr(
@@ -34,8 +57,8 @@ TEST_F(TensorImplPtrTest, TensorImplCreation) {
   EXPECT_EQ(tensor_impl->strides()[0], 5);
   EXPECT_EQ(tensor_impl->strides()[1], 1);
   EXPECT_EQ(tensor_impl->data(), data);
-  EXPECT_EQ(tensor_impl->mutable_data(), data);
-  EXPECT_EQ(((float*)tensor_impl->mutable_data())[0], 2);
+  EXPECT_EQ(tensor_impl->data(), data);
+  EXPECT_EQ(((float*)tensor_impl->data())[0], 2);
 }
 
 TEST_F(TensorImplPtrTest, TensorImplSharedOwnership) {
@@ -145,7 +168,7 @@ TEST_F(TensorImplPtrTest, TensorImplDataDeleterReleasesCapturedSharedPtr) {
       data_ptr.get(),
       {},
       {},
-      exec_aten::TensorShapeDynamism::STATIC,
+      exec_aten::TensorShapeDynamism::DYNAMIC_BOUND,
       [data_ptr, &deleter_called](void*) mutable { deleter_called = true; });
 
   EXPECT_EQ(data_ptr.use_count(), 2);
@@ -172,7 +195,7 @@ TEST_F(TensorImplPtrTest, TensorImplOwningData) {
 }
 
 TEST_F(TensorImplPtrTest, TensorImplOwningEmptyData) {
-  auto tensor_impl = make_tensor_impl_ptr({0, 5}, {});
+  auto tensor_impl = make_tensor_impl_ptr({0, 5}, std::vector<float>());
 
   EXPECT_EQ(tensor_impl->dim(), 2);
   EXPECT_EQ(tensor_impl->size(0), 0);
@@ -182,6 +205,74 @@ TEST_F(TensorImplPtrTest, TensorImplOwningEmptyData) {
   EXPECT_EQ(tensor_impl->data(), nullptr);
 }
 
+TEST_F(TensorImplPtrTest, TensorImplDataOnlyDoubleType) {
+  std::vector<double> data = {1.0, 2.0, 3.0, 4.0};
+  auto tensor_impl = make_tensor_impl_ptr(std::move(data));
+
+  EXPECT_EQ(tensor_impl->dim(), 1);
+  EXPECT_EQ(tensor_impl->size(0), 4);
+  EXPECT_EQ(tensor_impl->strides()[0], 1);
+  EXPECT_EQ(((double*)tensor_impl->data())[0], 1.0);
+  EXPECT_EQ(((double*)tensor_impl->data())[3], 4.0);
+}
+
+TEST_F(TensorImplPtrTest, TensorImplDataOnlyInt32Type) {
+  std::vector<int32_t> data = {10, 20, 30, 40};
+  auto tensor_impl = make_tensor_impl_ptr(std::move(data));
+
+  EXPECT_EQ(tensor_impl->dim(), 1);
+  EXPECT_EQ(tensor_impl->size(0), 4);
+  EXPECT_EQ(tensor_impl->strides()[0], 1);
+  EXPECT_EQ(((int32_t*)tensor_impl->data())[0], 10);
+  EXPECT_EQ(((int32_t*)tensor_impl->data())[3], 40);
+}
+
+TEST_F(TensorImplPtrTest, TensorImplDataOnlyInt64Type) {
+  std::vector<int64_t> data = {100, 200, 300, 400};
+  auto tensor_impl = make_tensor_impl_ptr(std::move(data));
+
+  EXPECT_EQ(tensor_impl->dim(), 1);
+  EXPECT_EQ(tensor_impl->size(0), 4);
+  EXPECT_EQ(tensor_impl->strides()[0], 1);
+  EXPECT_EQ(((int64_t*)tensor_impl->data())[0], 100);
+  EXPECT_EQ(((int64_t*)tensor_impl->data())[3], 400);
+}
+
+TEST_F(TensorImplPtrTest, TensorImplDataOnlyUint8Type) {
+  std::vector<uint8_t> data = {10, 20, 30, 40};
+  auto tensor_impl = make_tensor_impl_ptr(std::move(data));
+
+  EXPECT_EQ(tensor_impl->dim(), 1);
+  EXPECT_EQ(tensor_impl->size(0), 4);
+  EXPECT_EQ(tensor_impl->strides()[0], 1);
+  EXPECT_EQ(((uint8_t*)tensor_impl->data())[0], 10);
+  EXPECT_EQ(((uint8_t*)tensor_impl->data())[3], 40);
+}
+
+TEST_F(TensorImplPtrTest, TensorImplAmbiguityWithMixedVectors) {
+  std::vector<exec_aten::SizesType> sizes = {2, 2};
+  std::vector<float> data = {1.0f, 2.0f, 3.0f, 4.0f};
+  auto tensor_impl = make_tensor_impl_ptr(std::move(sizes), std::move(data));
+
+  EXPECT_EQ(tensor_impl->dim(), 2);
+  EXPECT_EQ(tensor_impl->size(0), 2);
+  EXPECT_EQ(tensor_impl->size(1), 2);
+  EXPECT_EQ(tensor_impl->strides()[0], 2);
+  EXPECT_EQ(tensor_impl->strides()[1], 1);
+  EXPECT_EQ(((float*)tensor_impl->data())[0], 1.0f);
+  EXPECT_EQ(((float*)tensor_impl->data())[3], 4.0f);
+
+  auto tensor_impl2 = make_tensor_impl_ptr({2, 2}, {1.0f, 2.0f, 3.0f, 4.0f});
+
+  EXPECT_EQ(tensor_impl2->dim(), 2);
+  EXPECT_EQ(tensor_impl2->size(0), 2);
+  EXPECT_EQ(tensor_impl2->size(1), 2);
+  EXPECT_EQ(tensor_impl2->strides()[0], 2);
+  EXPECT_EQ(tensor_impl2->strides()[1], 1);
+  EXPECT_EQ(((float*)tensor_impl2->data())[0], 1.0f);
+  EXPECT_EQ(((float*)tensor_impl2->data())[3], 4.0f);
+}
+
 TEST_F(TensorImplPtrTest, SharedDataManagement) {
   auto data = std::make_shared<std::vector<float>>(100, 1.0f);
   auto tensor_impl1 = make_tensor_impl_ptr(
@@ -212,7 +303,7 @@ TEST_F(TensorImplPtrTest, CustomDeleterWithSharedData) {
       data->data(),
       {},
       {},
-      exec_aten::TensorShapeDynamism::STATIC,
+      exec_aten::TensorShapeDynamism::DYNAMIC_BOUND,
       [data, &deleter_called](void*) mutable {
         deleter_called = true;
         data.reset();
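Two behaviors these tests pin down are worth spelling out: an empty sizes list produces a zero-dimensional tensor that still holds exactly one element, and the data-only overload deduces the scalar type from the vector's element type. A standalone sketch mirroring the assertions above:

// Zero-dim: dim() == 0 but numel() == 1.
auto s = make_tensor_impl_ptr({}, {3.14f});
// Dtype deduction: the element type selects the ScalarType.
auto d = make_tensor_impl_ptr(std::vector<double>{1.0, 2.0}); // ScalarType::Double
auto i = make_tensor_impl_ptr(std::vector<int32_t>{1, 2});    // ScalarType::Int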
diff --git a/extension/tensor/test/tensor_ptr_maker_test.cpp b/extension/tensor/test/tensor_ptr_maker_test.cpp
index d1b4179a260..41f3fa21439 100644
--- a/extension/tensor/test/tensor_ptr_maker_test.cpp
+++ b/extension/tensor/test/tensor_ptr_maker_test.cpp
@@ -178,3 +178,262 @@ TEST_F(TensorPtrMakerTest, TensorDeleterReleasesCapturedSharedPtr) {
   EXPECT_TRUE(deleter_called);
   EXPECT_EQ(data_ptr.use_count(), 1);
 }
+
+TEST_F(TensorPtrMakerTest, CreateEmpty) {
+  auto tensor = empty({4, 5});
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->size(1), 5);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float);
+
+  auto tensor2 = empty({4, 5}, exec_aten::ScalarType::Int);
+  EXPECT_EQ(tensor2->dim(), 2);
+  EXPECT_EQ(tensor2->size(0), 4);
+  EXPECT_EQ(tensor2->size(1), 5);
+  EXPECT_EQ(tensor2->scalar_type(), exec_aten::ScalarType::Int);
+
+  auto tensor3 = empty({4, 5}, exec_aten::ScalarType::Long);
+  EXPECT_EQ(tensor3->dim(), 2);
+  EXPECT_EQ(tensor3->size(0), 4);
+  EXPECT_EQ(tensor3->size(1), 5);
+  EXPECT_EQ(tensor3->scalar_type(), exec_aten::ScalarType::Long);
+
+  auto tensor4 = empty({4, 5}, exec_aten::ScalarType::Double);
+  EXPECT_EQ(tensor4->dim(), 2);
+  EXPECT_EQ(tensor4->size(0), 4);
+  EXPECT_EQ(tensor4->size(1), 5);
+  EXPECT_EQ(tensor4->scalar_type(), exec_aten::ScalarType::Double);
+}
+
+TEST_F(TensorPtrMakerTest, CreateFull) {
+  auto tensor = full({4, 5}, 7);
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->size(1), 5);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float);
+  EXPECT_EQ(tensor->const_data_ptr<float>()[0], 7);
+
+  auto tensor2 = full({4, 5}, 3, exec_aten::ScalarType::Int);
+  EXPECT_EQ(tensor2->dim(), 2);
+  EXPECT_EQ(tensor2->size(0), 4);
+  EXPECT_EQ(tensor2->size(1), 5);
+  EXPECT_EQ(tensor2->scalar_type(), exec_aten::ScalarType::Int);
+  EXPECT_EQ(tensor2->const_data_ptr<int32_t>()[0], 3);
+
+  auto tensor3 = full({4, 5}, 9, exec_aten::ScalarType::Long);
+  EXPECT_EQ(tensor3->dim(), 2);
+  EXPECT_EQ(tensor3->size(0), 4);
+  EXPECT_EQ(tensor3->size(1), 5);
+  EXPECT_EQ(tensor3->scalar_type(), exec_aten::ScalarType::Long);
+  EXPECT_EQ(tensor3->const_data_ptr<int64_t>()[0], 9);
+
+  auto tensor4 = full({4, 5}, 11, exec_aten::ScalarType::Double);
+  EXPECT_EQ(tensor4->dim(), 2);
+  EXPECT_EQ(tensor4->size(0), 4);
+  EXPECT_EQ(tensor4->size(1), 5);
+  EXPECT_EQ(tensor4->scalar_type(), exec_aten::ScalarType::Double);
+  EXPECT_EQ(tensor4->const_data_ptr<double>()[0], 11);
+}
+
+TEST_F(TensorPtrMakerTest, CreateScalar) {
+  auto tensor = scalar_tensor(3.14f);
+
+  EXPECT_EQ(tensor->dim(), 0);
+  EXPECT_EQ(tensor->numel(), 1);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float);
+  EXPECT_EQ(tensor->const_data_ptr<float>()[0], 3.14f);
+
+  auto tensor2 = scalar_tensor(5, exec_aten::ScalarType::Int);
+
+  EXPECT_EQ(tensor2->dim(), 0);
+  EXPECT_EQ(tensor2->numel(), 1);
+  EXPECT_EQ(tensor2->scalar_type(), exec_aten::ScalarType::Int);
+  EXPECT_EQ(tensor2->const_data_ptr<int32_t>()[0], 5);
+
+  auto tensor3 = scalar_tensor(7.0, exec_aten::ScalarType::Double);
+
+  EXPECT_EQ(tensor3->dim(), 0);
+  EXPECT_EQ(tensor3->numel(), 1);
+  EXPECT_EQ(tensor3->scalar_type(), exec_aten::ScalarType::Double);
+  EXPECT_EQ(tensor3->const_data_ptr<double>()[0], 7.0);
+}
+
+TEST_F(TensorPtrMakerTest, CreateOnes) {
+  auto tensor = ones({4, 5});
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->size(1), 5);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float);
+  EXPECT_EQ(tensor->const_data_ptr<float>()[0], 1);
+
+  auto tensor2 = ones({4, 5}, exec_aten::ScalarType::Int);
+  EXPECT_EQ(tensor2->dim(), 2);
+  EXPECT_EQ(tensor2->size(0), 4);
+  EXPECT_EQ(tensor2->size(1), 5);
+  EXPECT_EQ(tensor2->scalar_type(), exec_aten::ScalarType::Int);
+  EXPECT_EQ(tensor2->const_data_ptr<int32_t>()[0], 1);
+
+  auto tensor3 = ones({4, 5}, exec_aten::ScalarType::Long);
+  EXPECT_EQ(tensor3->dim(), 2);
+  EXPECT_EQ(tensor3->size(0), 4);
+  EXPECT_EQ(tensor3->size(1), 5);
+  EXPECT_EQ(tensor3->scalar_type(), exec_aten::ScalarType::Long);
+  EXPECT_EQ(tensor3->const_data_ptr<int64_t>()[0], 1);
+
+  auto tensor4 = ones({4, 5}, exec_aten::ScalarType::Double);
+  EXPECT_EQ(tensor4->dim(), 2);
+  EXPECT_EQ(tensor4->size(0), 4);
+  EXPECT_EQ(tensor4->size(1), 5);
+  EXPECT_EQ(tensor4->scalar_type(), exec_aten::ScalarType::Double);
+  EXPECT_EQ(tensor4->const_data_ptr<double>()[0], 1);
+}
+
+TEST_F(TensorPtrMakerTest, CreateZeros) {
+  auto tensor = zeros({4, 5});
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->size(1), 5);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float);
+  EXPECT_EQ(tensor->const_data_ptr<float>()[0], 0);
+
+  auto tensor2 = zeros({4, 5}, exec_aten::ScalarType::Int);
+  EXPECT_EQ(tensor2->dim(), 2);
+  EXPECT_EQ(tensor2->size(0), 4);
+  EXPECT_EQ(tensor2->size(1), 5);
+  EXPECT_EQ(tensor2->scalar_type(), exec_aten::ScalarType::Int);
+  EXPECT_EQ(tensor2->const_data_ptr<int32_t>()[0], 0);
+
+  auto tensor3 = zeros({4, 5}, exec_aten::ScalarType::Long);
+  EXPECT_EQ(tensor3->dim(), 2);
+  EXPECT_EQ(tensor3->size(0), 4);
+  EXPECT_EQ(tensor3->size(1), 5);
+  EXPECT_EQ(tensor3->scalar_type(), exec_aten::ScalarType::Long);
+  EXPECT_EQ(tensor3->const_data_ptr<int64_t>()[0], 0);
+
+  auto tensor4 = zeros({4, 5}, exec_aten::ScalarType::Double);
+  EXPECT_EQ(tensor4->dim(), 2);
+  EXPECT_EQ(tensor4->size(0), 4);
+  EXPECT_EQ(tensor4->size(1), 5);
+  EXPECT_EQ(tensor4->scalar_type(), exec_aten::ScalarType::Double);
+  EXPECT_EQ(tensor4->const_data_ptr<double>()[0], 0);
+}
+
+TEST_F(TensorPtrMakerTest, CreateRandTensor) {
+  auto tensor = rand({4, 5});
+
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->size(1), 5);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float);
+
+  for (auto i = 0; i < tensor->numel(); ++i) {
+    auto val = tensor->const_data_ptr<float>()[i];
+    EXPECT_GE(val, 0.0f);
+    EXPECT_LT(val, 1.0f);
+  }
+}
+
+TEST_F(TensorPtrMakerTest, CreateRandTensorWithIntType) {
+  auto tensor = rand({4, 5}, exec_aten::ScalarType::Int);
+
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->size(1), 5);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Int);
+
+  for (auto i = 0; i < tensor->numel(); ++i) {
+    auto val = tensor->const_data_ptr<int32_t>()[i];
+    EXPECT_EQ(val, 0);
+  }
+}
+
+TEST_F(TensorPtrMakerTest, CreateRandTensorWithDoubleType) {
+  auto tensor = rand({4, 5}, exec_aten::ScalarType::Double);
+
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->size(1), 5);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Double);
+
+  for (auto i = 0; i < tensor->numel(); ++i) {
+    auto val = tensor->const_data_ptr<double>()[i];
+    EXPECT_GE(val, 0.0);
+    EXPECT_LT(val, 1.0);
+  }
+}
+
+TEST_F(TensorPtrMakerTest, CreateRandnTensor) {
+  auto tensor = randn({4, 5});
+
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->size(1), 5);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float);
+
+  auto sum = 0.0f;
+  for (auto i = 0; i < tensor->numel(); ++i) {
+    sum += tensor->const_data_ptr<float>()[i];
+  }
+  const auto average = sum / tensor->numel();
+  EXPECT_NEAR(average, 0.0f, 0.5f);
+}
+
+TEST_F(TensorPtrMakerTest, CreateRandnTensorWithDoubleType) {
+  auto tensor = randn({4, 5}, exec_aten::ScalarType::Double);
+
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->size(1), 5);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Double);
+
+  auto sum = 0.0;
+  for (auto i = 0; i < tensor->numel(); ++i) {
+    sum += tensor->const_data_ptr<double>()[i];
+  }
+  const auto average = sum / tensor->numel();
+  EXPECT_NEAR(average, 0.0, 0.5);
+}
+
+TEST_F(TensorPtrMakerTest, CreateRandIntTensorWithIntType) {
+  auto tensor = randint(10, 20, {4, 5}, exec_aten::ScalarType::Int);
+
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->size(1), 5);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Int);
+
+  for (auto i = 0; i < tensor->numel(); ++i) {
+    auto val = tensor->const_data_ptr<int32_t>()[i];
+    EXPECT_GE(val, 10);
+    EXPECT_LT(val, 20);
+  }
+}
+
+TEST_F(TensorPtrMakerTest, CreateRandIntTensorWithLongType) {
+  auto tensor = randint(10, 20, {4, 5}, exec_aten::ScalarType::Long);
+
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->size(1), 5);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Long);
+
+  for (auto i = 0; i < tensor->numel(); ++i) {
+    auto val = tensor->const_data_ptr<int64_t>()[i];
+    EXPECT_GE(val, 10);
+    EXPECT_LT(val, 20);
+  }
+}
+
+TEST_F(TensorPtrMakerTest, CreateRandnTensorWithIntType) {
+  auto tensor = rand({4, 5}, exec_aten::ScalarType::Int);
+
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->size(1), 5);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Int);
+
+  for (auto i = 0; i < tensor->numel(); ++i) {
+    auto val = tensor->const_data_ptr<int32_t>()[i];
+    EXPECT_EQ(val, 0);
+  }
+}
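A note on the randn checks above: with 20 i.i.d. samples from N(0, 1), the sample mean has standard error 1/sqrt(20) ≈ 0.224, so the 0.5 tolerance is roughly a 2.2-sigma band and a correct implementation should only rarely land outside it. The rand-with-Int tests, by contrast, assert all zeros: uniform values in [0, 1) truncate to 0 for integral dtypes, and the tests document that behavior rather than a random integer fill.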
diff --git a/extension/tensor/test/tensor_ptr_test.cpp b/extension/tensor/test/tensor_ptr_test.cpp
index 1542824fb73..653e2ef98d7 100644
--- a/extension/tensor/test/tensor_ptr_test.cpp
+++ b/extension/tensor/test/tensor_ptr_test.cpp
@@ -22,6 +22,28 @@ class TensorPtrTest : public ::testing::Test {
   }
 };
 
+TEST_F(TensorPtrTest, ScalarTensorCreation) {
+  float scalar_data = 3.14f;
+  auto tensor = make_tensor_ptr(exec_aten::ScalarType::Float, {}, &scalar_data);
+
+  EXPECT_EQ(tensor->numel(), 1);
+  EXPECT_EQ(tensor->dim(), 0);
+  EXPECT_EQ(tensor->sizes().size(), 0);
+  EXPECT_EQ(tensor->strides().size(), 0);
+  EXPECT_EQ(tensor->const_data_ptr<float>(), &scalar_data);
+  EXPECT_EQ(tensor->const_data_ptr<float>()[0], 3.14f);
+}
+
+TEST_F(TensorPtrTest, ScalarTensorOwningData) {
+  auto tensor = make_tensor_ptr({}, {3.14f});
+
+  EXPECT_EQ(tensor->numel(), 1);
+  EXPECT_EQ(tensor->dim(), 0);
+  EXPECT_EQ(tensor->sizes().size(), 0);
+  EXPECT_EQ(tensor->strides().size(), 0);
+  EXPECT_EQ(tensor->const_data_ptr<float>()[0], 3.14f);
+}
+
 TEST_F(TensorPtrTest, CreateTensorWithStridesAndDimOrder) {
   float data[20] = {2};
   auto tensor = make_tensor_ptr(
@@ -98,7 +120,7 @@ TEST_F(TensorPtrTest, TensorWithCustomDataDeleter) {
       data,
       {},
       {},
-      exec_aten::TensorShapeDynamism::STATIC,
+      exec_aten::TensorShapeDynamism::DYNAMIC_BOUND,
       [&deleter_called](void* ptr) {
         deleter_called = true;
         delete[] static_cast<float*>(ptr);
@@ -118,7 +140,7 @@ TEST_F(TensorPtrTest, TensorManagesMovedVector) {
       data_ptr,
       {},
       {},
-      exec_aten::TensorShapeDynamism::STATIC,
+      exec_aten::TensorShapeDynamism::DYNAMIC_BOUND,
       [moved_data = std::move(data), &deleter_called](void*) mutable {
         deleter_called = true;
       });
@@ -140,7 +162,7 @@ TEST_F(TensorPtrTest, TensorDeleterReleasesCapturedSharedPtr) {
       data_ptr.get(),
       {},
       {},
-      exec_aten::TensorShapeDynamism::STATIC,
+      exec_aten::TensorShapeDynamism::DYNAMIC_BOUND,
       [data_ptr, &deleter_called](void*) mutable { deleter_called = true; });
 
   EXPECT_EQ(data_ptr.use_count(), 2);
@@ -167,7 +189,7 @@ TEST_F(TensorPtrTest, TensorOwningData) {
 }
 
 TEST_F(TensorPtrTest, TensorOwningEmptyData) {
-  auto tensor = make_tensor_ptr({0, 5}, {});
+  auto tensor = make_tensor_ptr({0, 5}, std::vector<float>());
 
   EXPECT_EQ(tensor->dim(), 2);
   EXPECT_EQ(tensor->size(0), 0);
@@ -175,6 +197,90 @@ TEST_F(TensorPtrTest, TensorOwningEmptyData) {
   EXPECT_EQ(tensor->strides()[0], 5);
   EXPECT_EQ(tensor->strides()[1], 1);
   EXPECT_EQ(tensor->data_ptr(), nullptr);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float);
 }
 
+TEST_F(TensorPtrTest, TensorImplDataOnly) {
+  auto tensor = make_tensor_ptr({1.0f, 2.0f, 3.0f, 4.0f});
+
+  EXPECT_EQ(tensor->dim(), 1);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->strides()[0], 1);
+  EXPECT_EQ(tensor->const_data_ptr<float>()[0], 1.0);
+  EXPECT_EQ(tensor->const_data_ptr<float>()[3], 4.0);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float);
+}
+
+TEST_F(TensorPtrTest, TensorImplDataOnlyDoubleType) {
+  std::vector<double> data = {1.0, 2.0, 3.0, 4.0};
+  auto tensor = make_tensor_ptr(std::move(data));
+
+  EXPECT_EQ(tensor->dim(), 1);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->strides()[0], 1);
+  EXPECT_EQ(tensor->const_data_ptr<double>()[0], 1.0);
+  EXPECT_EQ(tensor->const_data_ptr<double>()[3], 4.0);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Double);
+}
+
+TEST_F(TensorPtrTest, TensorImplDataOnlyInt32Type) {
+  std::vector<int32_t> data = {10, 20, 30, 40};
+  auto tensor = make_tensor_ptr(std::move(data));
+
+  EXPECT_EQ(tensor->dim(), 1);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->strides()[0], 1);
+  EXPECT_EQ(tensor->const_data_ptr<int32_t>()[0], 10);
+  EXPECT_EQ(tensor->const_data_ptr<int32_t>()[3], 40);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Int);
+}
+
+TEST_F(TensorPtrTest, TensorImplDataOnlyInt64Type) {
+  std::vector<int64_t> data = {100, 200, 300, 400};
+  auto tensor = make_tensor_ptr(std::move(data));
+
+  EXPECT_EQ(tensor->dim(), 1);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->strides()[0], 1);
+  EXPECT_EQ(tensor->const_data_ptr<int64_t>()[0], 100);
+  EXPECT_EQ(tensor->const_data_ptr<int64_t>()[3], 400);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Long);
+}
+
+TEST_F(TensorPtrTest, TensorImplDataOnlyUint8Type) {
+  std::vector<uint8_t> data = {10, 20, 30, 40};
+  auto tensor = make_tensor_ptr(std::move(data));
+
+  EXPECT_EQ(tensor->dim(), 1);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->strides()[0], 1);
+  EXPECT_EQ(tensor->const_data_ptr<uint8_t>()[0], 10);
+  EXPECT_EQ(tensor->const_data_ptr<uint8_t>()[3], 40);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Byte);
+}
+
+TEST_F(TensorPtrTest, TensorImplAmbiguityWithMixedVectors) {
+  std::vector<exec_aten::SizesType> sizes = {2, 2};
+  std::vector<float> data = {1.0f, 2.0f, 3.0f, 4.0f};
+  auto tensor = make_tensor_ptr(std::move(sizes), std::move(data));
+
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 2);
+  EXPECT_EQ(tensor->size(1), 2);
+  EXPECT_EQ(tensor->strides()[0], 2);
+  EXPECT_EQ(tensor->strides()[1], 1);
+  EXPECT_EQ(tensor->const_data_ptr<float>()[0], 1.0f);
+  EXPECT_EQ(tensor->const_data_ptr<float>()[3], 4.0f);
+
+  auto tensor2 = make_tensor_ptr({2, 2}, {1.0f, 2.0f, 3.0f, 4.0f});
+
+  EXPECT_EQ(tensor2->dim(), 2);
+  EXPECT_EQ(tensor2->size(0), 2);
+  EXPECT_EQ(tensor2->size(1), 2);
+  EXPECT_EQ(tensor2->strides()[0], 2);
+  EXPECT_EQ(tensor2->strides()[1], 1);
+  EXPECT_EQ(tensor2->const_data_ptr<float>()[0], 1.0f);
+  EXPECT_EQ(tensor2->const_data_ptr<float>()[3], 4.0f);
+}
+
 TEST_F(TensorPtrTest, TensorSharingImplModifiesSharedDataVector) {
diff --git a/kernels/aten/functions.yaml b/kernels/aten/functions.yaml
index 1350fc090b0..e63863fc048 100644
--- a/kernels/aten/functions.yaml
+++ b/kernels/aten/functions.yaml
@@ -215,6 +215,8 @@
 
 - op: linalg_vector_norm.out
 
+- op: linear.out
+
 - op: log.out
 
 - op: log10.out
diff --git a/kernels/optimized/blas/CPUBlas.cpp b/kernels/optimized/blas/CPUBlas.cpp
index 35b208d30fc..99003f8f0ea 100644
--- a/kernels/optimized/blas/CPUBlas.cpp
+++ b/kernels/optimized/blas/CPUBlas.cpp
@@ -173,5 +173,28 @@ void gemm(
 }
 // clang-format on
 
+// clang-format off
+void gemm(
+    TransposeType transa, TransposeType transb,
+    int64_t m, int64_t n, int64_t k,
+    const BFloat16 alpha,
+    const BFloat16 *a, int64_t lda,
+    const BFloat16 *b, int64_t ldb,
+    const BFloat16 beta,
+    BFloat16 *c, int64_t ldc) {
+  normalize_last_dims(transa, transb, m, n, k, &lda, &ldb, &ldc);
+
+  using acc_type = utils::compute_dtype<BFloat16>;
+  gemm_impl(
+      transa, transb,
+      m, n, k,
+      static_cast<acc_type>(alpha),
+      a, lda,
+      b, ldb,
+      static_cast<acc_type>(beta),
+      c, ldc);
+}
+// clang-format on
+
 } // namespace cpublas
 } // namespace executorch
diff --git a/kernels/optimized/blas/CPUBlas.h b/kernels/optimized/blas/CPUBlas.h
index dd4a24cbce0..71e50601238 100644
--- a/kernels/optimized/blas/CPUBlas.h
+++ b/kernels/optimized/blas/CPUBlas.h
@@ -17,6 +17,7 @@
 namespace executorch {
 namespace cpublas {
 
+using BFloat16 = torch::executor::BFloat16;
 using Half = torch::executor::Half;
 
 enum class TransposeType {
@@ -104,6 +105,15 @@ void gemm(
     const Half *b, int64_t ldb,
     const Half beta,
     Half *c, int64_t ldc);
+
+void gemm(
+    TransposeType transa, TransposeType transb,
+    int64_t m, int64_t n, int64_t k,
+    const BFloat16 alpha,
+    const BFloat16 *a, int64_t lda,
+    const BFloat16 *b, int64_t ldb,
+    const BFloat16 beta,
+    BFloat16 *c, int64_t ldc);
 // clang-format on
 
 // clang-format off
diff --git a/kernels/optimized/cpu/op_linear.cpp b/kernels/optimized/cpu/op_linear.cpp
new file mode 100644
index 00000000000..56634d326f2
--- /dev/null
+++ b/kernels/optimized/cpu/op_linear.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/kernels/optimized/blas/CPUBlas.h>
+#include <executorch/kernels/portable/cpu/util/matmul_ops_util.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+#include <array>
+
+namespace torch {
+namespace executor {
+namespace native {
+
+using Tensor = exec_aten::Tensor;
+
+Tensor& opt_linear_out(
+    RuntimeContext& ctx,
+    const Tensor& in,
+    const Tensor& mat2,
+    const optional<Tensor>& bias,
+    Tensor& out) {
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      !bias.has_value(),
+      InvalidArgument,
+      out,
+      "bias not supported yet in linear");
+  ET_KERNEL_CHECK(ctx, check_linear_args(in, mat2, out), InvalidArgument, out);
+
+  size_t output_ndim = 0;
+  std::array<exec_aten::SizesType, kTensorDimensionLimit> output_sizes;
+  get_linear_out_target_size(in, mat2, output_sizes.data(), &output_ndim);
+  ET_KERNEL_CHECK(
+      ctx,
+      resize_tensor(out, {output_sizes.data(), output_ndim}) == Error::Ok,
+      InvalidArgument,
+      out);
+
+  // gemm on some platforms doesn't tolerate empty input.
+  if (out.numel() == 0) {
+    return out;
+  }
+
+  int flattened_input_dim = 1;
+  for (int ii = 0; ii < in.dim() - 1; ++ii) {
+    flattened_input_dim *= in.sizes()[ii];
+  }
+  ET_SWITCH_REAL_TYPES_AND2(
+      Half, BFloat16, in.scalar_type(), ctx, "mm.out", CTYPE, [&]() {
+        size_t n = flattened_input_dim;
+        size_t k = in.sizes()[in.dim() - 1];
+        size_t m = mat2.size(0);
+
+        executorch::cpublas::gemm(
+            executorch::cpublas::TransposeType::Transpose,
+            executorch::cpublas::TransposeType::NoTranspose,
+            m,
+            n,
+            k,
+            static_cast<CTYPE>(1),
+            mat2.const_data_ptr<CTYPE>(),
+            k,
+            in.const_data_ptr<CTYPE>(),
+            k,
+            static_cast<CTYPE>(0),
+            out.mutable_data_ptr<CTYPE>(),
+            m);
+      });
+
+  return out;
+}
+
+} // namespace native
+} // namespace executor
+} // namespace torch
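The kernel above lowers linear as out = in @ mat2^T without materializing the transpose: all leading input dimensions are flattened into n rows, and mat2 (stored [M, K]) is handed to the column-major gemm with TransposeType::Transpose so it acts as [K, M]. A shape-only sketch with hypothetical sizes:

// in:   [2, 3, K]  -> flattened to n = 2 * 3 = 6 rows of length k = K
// mat2: [M, K]     -> passed transposed, contributing m = M output columns
// out:  [2, 3, M]  -> written as one n x m = 6 x M block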
diff --git a/kernels/optimized/cpu/op_mm.cpp b/kernels/optimized/cpu/op_mm.cpp
new file mode 100644
index 00000000000..9131356aeb6
--- /dev/null
+++ b/kernels/optimized/cpu/op_mm.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/kernels/optimized/blas/CPUBlas.h>
+#include <executorch/kernels/portable/cpu/util/matmul_ops_util.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+#include <array>
+
+namespace torch {
+namespace executor {
+namespace native {
+
+using Tensor = exec_aten::Tensor;
+
+Tensor& opt_mm_out(
+    RuntimeContext& ctx,
+    const Tensor& in,
+    const Tensor& mat2,
+    Tensor& out) {
+  ET_KERNEL_CHECK(ctx, check_mm_args(in, mat2, out), InvalidArgument, out);
+
+  size_t output_ndim = 0;
+  std::array<exec_aten::SizesType, kTensorDimensionLimit> output_sizes;
+  get_mm_out_target_size(in, mat2, output_sizes.data(), &output_ndim);
+  ET_KERNEL_CHECK(
+      ctx,
+      resize_tensor(out, {output_sizes.data(), output_ndim}) == Error::Ok,
+      InvalidArgument,
+      out);
+
+  if (out.numel() == 0) {
+    return out;
+  }
+  ET_SWITCH_REAL_TYPES_AND2(
+      Half, BFloat16, in.scalar_type(), ctx, "mm.out", CTYPE, [&]() {
+        size_t n = in.size(0);
+        size_t k = in.size(1);
+        size_t m = mat2.size(1);
+
+        // gemm expects column-major inputs and produces column-major
+        // output. So, we take advantage of the identity (A @ B).t()
+        // = B.t() @ A.t() here; row-major B is B.t() from gemm's
+        // column-major perspective, etc.
+        executorch::cpublas::gemm(
+            executorch::cpublas::TransposeType::NoTranspose,
+            executorch::cpublas::TransposeType::NoTranspose,
+            m,
+            n,
+            k,
+            static_cast<CTYPE>(1),
+            mat2.const_data_ptr<CTYPE>(),
+            m,
+            in.const_data_ptr<CTYPE>(),
+            k,
+            static_cast<CTYPE>(0),
+            out.mutable_data_ptr<CTYPE>(),
+            m);
+      });
+
+  return out;
+}
+
+} // namespace native
+} // namespace executor
+} // namespace torch
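The comment in opt_mm_out is doing real work, so here is the identity spelled out on a 2x2 example: a row-major matrix reinterpreted as column-major is its transpose, so asking the column-major gemm for B^T @ A^T = (A @ B)^T and letting it store the result column-major leaves exactly the row-major A @ B in out. A self-contained numeric check (sketch, plain C++):

#include <cstdio>

int main() {
  // Row-major A and B.
  float A[4] = {1, 2, 3, 4};
  float B[4] = {5, 6, 7, 8};
  float C[4] = {};
  // Row-major A @ B; expected {19, 22, 43, 50}.
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 2; ++j)
      for (int k = 0; k < 2; ++k)
        C[2 * i + j] += A[2 * i + k] * B[2 * k + j];
  // These bytes are identical to the column-major storage of (A @ B)^T,
  // which is what gemm produces when asked for B^T @ A^T.
  std::printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]);
  return 0;
}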
diff --git a/kernels/optimized/cpu/targets.bzl b/kernels/optimized/cpu/targets.bzl
index e7bb2d36bf4..488d2af7fa1 100644
--- a/kernels/optimized/cpu/targets.bzl
+++ b/kernels/optimized/cpu/targets.bzl
@@ -40,6 +40,13 @@ _OPTIMIZED_ATEN_OPS = (
             "//executorch/kernels/portable/cpu:scalar_utils",
         ],
     ),
+    op_target(
+        name = "op_linear",
+        deps = [
+            "//executorch/kernels/optimized:libblas",
+            "//executorch/kernels/portable/cpu/util:matmul_ops_util",
+        ],
+    ),
     op_target(
         name = "op_log_softmax",
         deps = select({
@@ -52,6 +59,13 @@ _OPTIMIZED_ATEN_OPS = (
             ],
         }),
     ),
+    op_target(
+        name = "op_mm",
+        deps = [
+            "//executorch/kernels/optimized:libblas",
+            "//executorch/kernels/portable/cpu/util:matmul_ops_util",
+        ],
+    ),
     op_target(
         name = "op_mul",
         deps = [
diff --git a/kernels/optimized/optimized-oss.yaml b/kernels/optimized/optimized-oss.yaml
index f79d652b91d..797744f3bd4 100644
--- a/kernels/optimized/optimized-oss.yaml
+++ b/kernels/optimized/optimized-oss.yaml
@@ -45,6 +45,11 @@
   - arg_meta: null
     kernel_name: torch::executor::opt_le_tensor_out
 
+- op: linear.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_linear_out
+
 - op: mul.out
   kernels:
     - arg_meta: null
diff --git a/kernels/optimized/optimized.yaml b/kernels/optimized/optimized.yaml
index 0d445deb3e8..2421673f8a7 100644
--- a/kernels/optimized/optimized.yaml
+++ b/kernels/optimized/optimized.yaml
@@ -52,6 +52,16 @@
   - arg_meta: null
     kernel_name: torch::executor::opt_le_tensor_out
 
+- op: linear.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_linear_out
+
+- op: mm.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_mm_out
+
 - op: mul.out
   kernels:
     - arg_meta: null
diff --git a/kernels/optimized/test/libblas_test.cpp b/kernels/optimized/test/libblas_test.cpp
index 8f30a357e1a..24aeaba776a 100644
--- a/kernels/optimized/test/libblas_test.cpp
+++ b/kernels/optimized/test/libblas_test.cpp
@@ -9,6 +9,7 @@
 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -17,7 +18,8 @@
   _(); \
   _(); \
   _(); \
-  _();
+  _(); \
+  _();
 
 namespace {
diff --git a/kernels/portable/cpu/op_mul.cpp b/kernels/portable/cpu/op_mul.cpp
index 8fc4f9d4593..34e7e085687 100644
--- a/kernels/portable/cpu/op_mul.cpp
+++ b/kernels/portable/cpu/op_mul.cpp
@@ -123,7 +123,11 @@ Tensor& mul_scalar_out(
   ET_KERNEL_CHECK(
       ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out);
 
-  ET_KERNEL_CHECK(ctx, tensor_is_realhb_type(out), InvalidArgument, out);
+  ET_KERNEL_CHECK(
+      ctx,
+      executorch::runtime::tensor_is_realhbbf16_type(out),
+      InvalidArgument,
+      out);
 
   ScalarType a_type = a.scalar_type();
   ScalarType b_type = utils::get_scalar_dtype(b);
diff --git a/kernels/portable/cpu/op_reflection_pad1d.cpp b/kernels/portable/cpu/op_reflection_pad1d.cpp
index 66a2333619f..53fbbc9c56a 100644
--- a/kernels/portable/cpu/op_reflection_pad1d.cpp
+++ b/kernels/portable/cpu/op_reflection_pad1d.cpp
@@ -28,6 +28,11 @@ Tensor& reflection_pad1d_out(
       InvalidArgument,
       out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out);
+
   Tensor::SizesType target_sizes[kTensorDimensionLimit];
   size_t target_ndim = 0;
   get_padding_out_target_size(1, in, padding, target_sizes, &target_ndim);
diff --git a/kernels/portable/cpu/op_reflection_pad2d.cpp b/kernels/portable/cpu/op_reflection_pad2d.cpp
index a16d92ff1ce..8de0baba43b 100644
--- a/kernels/portable/cpu/op_reflection_pad2d.cpp
+++ b/kernels/portable/cpu/op_reflection_pad2d.cpp
@@ -28,6 +28,11 @@ Tensor& reflection_pad2d_out(
       InvalidArgument,
       out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out);
+
   Tensor::SizesType target_sizes[kTensorDimensionLimit];
   size_t target_ndim = 0;
   get_padding_out_target_size(2, in, padding, target_sizes, &target_ndim);
diff --git a/kernels/portable/cpu/op_reflection_pad3d.cpp b/kernels/portable/cpu/op_reflection_pad3d.cpp
index 9629b9e4c4e..4ba78733046 100644
--- a/kernels/portable/cpu/op_reflection_pad3d.cpp
+++ b/kernels/portable/cpu/op_reflection_pad3d.cpp
@@ -28,6 +28,11 @@ Tensor& reflection_pad3d_out(
       InvalidArgument,
       out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out);
+
   Tensor::SizesType target_sizes[kTensorDimensionLimit];
   size_t target_ndim = 0;
   get_padding_out_target_size(3, in, padding, target_sizes, &target_ndim);
diff --git a/kernels/portable/cpu/op_relu.cpp b/kernels/portable/cpu/op_relu.cpp
index b9136cb3392..e59aec3ae64 100644
--- a/kernels/portable/cpu/op_relu.cpp
+++ b/kernels/portable/cpu/op_relu.cpp
@@ -35,6 +35,9 @@ Tensor& relu_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) {
 
   ET_KERNEL_CHECK(ctx, tensor_is_real_type(out), InvalidArgument, out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
   ET_SWITCH_REAL_TYPES(in.scalar_type(), ctx, "relu.out", CTYPE, [&]() {
     apply_unary_map_fn(
         [](const CTYPE val_in) {
diff --git a/kernels/portable/cpu/op_remainder.cpp b/kernels/portable/cpu/op_remainder.cpp
index 7c858c1c08a..3a641829773 100644
--- a/kernels/portable/cpu/op_remainder.cpp
+++ b/kernels/portable/cpu/op_remainder.cpp
@@ -80,6 +80,9 @@ Tensor& remainder_Tensor_out(
       InvalidArgument,
       out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out);
+
   ScalarType a_type = a.scalar_type();
   ScalarType b_type = b.scalar_type();
   ScalarType common_type = promoteTypes(a_type, b_type);
@@ -124,6 +127,9 @@ Tensor& remainder_Scalar_out(
       out,
       "Failed to resize output tensor.");
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out);
+
   ScalarType a_type = a.scalar_type();
   ScalarType b_type = utils::get_scalar_dtype(b);
   ScalarType common_type = utils::promote_type_with_scalar(a_type, b);
diff --git a/kernels/portable/cpu/op_repeat.cpp b/kernels/portable/cpu/op_repeat.cpp
index 644ebc98420..3b5596b2163 100644
--- a/kernels/portable/cpu/op_repeat.cpp
+++ b/kernels/portable/cpu/op_repeat.cpp
@@ -62,6 +62,11 @@ Tensor& repeat_out(
       InvalidArgument,
       out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(self, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(self), InvalidArgument, out);
+
   // Resize for dynamic shape
   ET_KERNEL_CHECK_MSG(
       ctx,
diff --git a/kernels/portable/cpu/op_roll.cpp b/kernels/portable/cpu/op_roll.cpp
index 4eff081eec4..09c7667c812 100644
--- a/kernels/portable/cpu/op_roll.cpp
+++ b/kernels/portable/cpu/op_roll.cpp
@@ -60,6 +60,9 @@ Tensor& roll_out(
   ET_KERNEL_CHECK(
       ctx, check_roll_args(in, shifts, dims, out), InvalidArgument, out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
   if (in.numel() == 0) {
     return out;
   }
diff --git a/kernels/portable/cpu/op_round.cpp b/kernels/portable/cpu/op_round.cpp
index 0b28ba41887..33af6508be2 100644
--- a/kernels/portable/cpu/op_round.cpp
+++ b/kernels/portable/cpu/op_round.cpp
@@ -45,6 +45,9 @@ Tensor& round_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) {
       ctx, tensors_have_same_shape_and_dtype(in, out), InvalidArgument, out);
   ET_KERNEL_CHECK(ctx, tensor_is_real_type(out), InvalidArgument, out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
   auto in_scalar_type = in.scalar_type();
 
   ET_SWITCH_REAL_TYPES(in.scalar_type(), ctx, "round.out", CTYPE, [&] {
diff --git a/kernels/portable/cpu/op_rsub.cpp b/kernels/portable/cpu/op_rsub.cpp
index 6a5ef598ef4..442221d6693 100644
--- a/kernels/portable/cpu/op_rsub.cpp
+++ b/kernels/portable/cpu/op_rsub.cpp
@@ -31,6 +31,9 @@ Tensor& rsub_scalar_out(
       out,
       "Failed to resize output tensor.");
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out);
+
   ET_KERNEL_CHECK(ctx, tensor_is_realhb_type(out), InvalidArgument, out);
 
   ScalarType a_type = a.scalar_type();
diff --git a/kernels/portable/cpu/op_scatter_add.cpp b/kernels/portable/cpu/op_scatter_add.cpp
index e10d87f9193..b4cf0d84f04 100644
--- a/kernels/portable/cpu/op_scatter_add.cpp
+++ b/kernels/portable/cpu/op_scatter_add.cpp
@@ -65,6 +65,15 @@ Tensor& scatter_add_out(
       InvalidArgument,
       out);
 
+  ET_KERNEL_CHECK(
+      context,
+      tensors_have_same_dim_order(self, src, out),
+      InvalidArgument,
+      out);
+
+  ET_KERNEL_CHECK(
+      context, tensor_is_default_dim_order(index), InvalidArgument, out);
+
   if (dim < 0) {
     dim += nonzero_dim(self);
   }
diff --git a/kernels/portable/cpu/op_select_scatter.cpp b/kernels/portable/cpu/op_select_scatter.cpp
index 71e7d9dfefd..db3ef8b1d29 100644
--- a/kernels/portable/cpu/op_select_scatter.cpp
+++ b/kernels/portable/cpu/op_select_scatter.cpp
@@ -33,6 +33,9 @@ Tensor& select_scatter_out(
   ET_KERNEL_CHECK(
       ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, src, out), InvalidArgument, out);
+
   // Account for negative indices
   if (dim < 0) {
     dim += in.dim();
diff --git a/kernels/portable/cpu/op_sigmoid.cpp b/kernels/portable/cpu/op_sigmoid.cpp
index b696c29518b..919d42a721a 100644
--- a/kernels/portable/cpu/op_sigmoid.cpp
+++ b/kernels/portable/cpu/op_sigmoid.cpp
@@ -24,6 +24,9 @@ Tensor& sigmoid_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) {
       ctx, in.scalar_type() != ScalarType::Bool, InvalidArgument, out);
   ET_KERNEL_CHECK(ctx, tensor_is_floating_type(out), InvalidArgument, out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
   // Resize for dynamic shape
   ET_KERNEL_CHECK_MSG(
       ctx,
diff --git a/kernels/portable/cpu/op_sign.cpp b/kernels/portable/cpu/op_sign.cpp
index 6dc6f3d015e..1c18788404d 100644
--- a/kernels/portable/cpu/op_sign.cpp
+++ b/kernels/portable/cpu/op_sign.cpp
@@ -30,6 +30,9 @@ Tensor& sign_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) {
       out,
       "Failed to resize output tensor.");
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
   ET_KERNEL_CHECK(
       ctx, tensors_have_same_shape_and_dtype(in, out), InvalidArgument, out);
diff --git a/kernels/portable/cpu/op_slice_copy.cpp b/kernels/portable/cpu/op_slice_copy.cpp
index 41a76567906..2b5c48737d6 100644
--- a/kernels/portable/cpu/op_slice_copy.cpp
+++ b/kernels/portable/cpu/op_slice_copy.cpp
@@ -33,6 +33,9 @@ Tensor& slice_copy_Tensor_out(
     dim += in.dim();
   }
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
   // If user do not set value to end_val, set end to in.size(dim) (largest
   // value available)
   int64_t end = end_val.has_value() ? end_val.value() : in.size(dim);
diff --git a/kernels/portable/cpu/op_slice_scatter.cpp b/kernels/portable/cpu/op_slice_scatter.cpp
index 47374716b4e..97f75553c1d 100644
--- a/kernels/portable/cpu/op_slice_scatter.cpp
+++ b/kernels/portable/cpu/op_slice_scatter.cpp
@@ -40,6 +40,9 @@ Tensor& slice_scatter_out(
       InvalidArgument,
       out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(input, out), InvalidArgument, out);
+
   if (input.numel() == 0) {
     return out;
   }
diff --git a/kernels/portable/cpu/op_softmax.cpp b/kernels/portable/cpu/op_softmax.cpp
index 9f1565ff161..544887bed62 100644
--- a/kernels/portable/cpu/op_softmax.cpp
+++ b/kernels/portable/cpu/op_softmax.cpp
@@ -36,6 +36,9 @@ Tensor& softmax_out(
   ET_KERNEL_CHECK(
       ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
   // Adjust for negative dim
   dim = dim < 0 ? dim + nonzero_dim(in) : dim;
diff --git a/kernels/portable/cpu/op_split_copy.cpp b/kernels/portable/cpu/op_split_copy.cpp
index a604e76b51c..1829b356ff2 100644
--- a/kernels/portable/cpu/op_split_copy.cpp
+++ b/kernels/portable/cpu/op_split_copy.cpp
@@ -46,6 +46,11 @@ void split_copy_Tensor_out(
       check_split_copy_args(input, split_size, dim, out),
       InvalidArgument, );
 
+  for (size_t i = 0; i < out.size(); ++i) {
+    ET_KERNEL_CHECK(
+        ctx, tensors_have_same_dim_order(input, out[i]), InvalidArgument, );
+  }
+
   const size_t leading_dims = getLeadingDims(input, dim);
   const size_t trailing_dims = getTrailingDims(input, dim);
   const size_t step = input.size(dim) * trailing_dims;
diff --git a/kernels/portable/cpu/op_split_with_sizes_copy.cpp b/kernels/portable/cpu/op_split_with_sizes_copy.cpp
index 7d1b485e7a4..623394e8013 100644
--- a/kernels/portable/cpu/op_split_with_sizes_copy.cpp
+++ b/kernels/portable/cpu/op_split_with_sizes_copy.cpp
@@ -38,6 +38,11 @@ void split_with_sizes_copy_out(
       check_split_with_sizes_copy_args(in, split_sizes, dim, out),
       InvalidArgument, );
 
+  for (size_t i = 0; i < out.size(); ++i) {
+    ET_KERNEL_CHECK(
+        ctx, tensors_have_same_dim_order(in, out[i]), InvalidArgument, );
+  }
+
   // If out is empty, then nothing needs to be done after checking the args.
   // Valid args implies that in.size(dim) == 0 and split_sizes is also empty.
   if (out.size() == 0) {
diff --git a/kernels/portable/cpu/op_squeeze_copy.cpp b/kernels/portable/cpu/op_squeeze_copy.cpp
index 5be91ff827d..11489e31729 100644
--- a/kernels/portable/cpu/op_squeeze_copy.cpp
+++ b/kernels/portable/cpu/op_squeeze_copy.cpp
@@ -29,6 +29,11 @@ Tensor& squeeze_copy_dim_out(
   ET_KERNEL_CHECK(
       ctx, check_squeeze_copy_dim_args(in, dim, out), InvalidArgument, out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out);
+
   if (dim < 0) {
     dim += nonzero_dim(in);
   }
@@ -62,6 +67,11 @@ Tensor& squeeze_copy_dims_out(
   ET_KERNEL_CHECK(
       ctx, check_squeeze_copy_dims_args(in, dims, out), InvalidArgument, out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out);
+
   Tensor::SizesType expected_out_size[kTensorDimensionLimit];
   size_t expected_out_dim = 0;
   get_squeeze_copy_dims_out_target_size(
diff --git a/kernels/portable/cpu/op_stack.cpp b/kernels/portable/cpu/op_stack.cpp
index f241120ae2f..6859f2a8746 100644
--- a/kernels/portable/cpu/op_stack.cpp
+++ b/kernels/portable/cpu/op_stack.cpp
@@ -31,6 +31,16 @@ Tensor& stack_out(
   ET_KERNEL_CHECK(
       ctx, check_stack_args(tensors, dim, out), InvalidArgument, out);
 
+  for (size_t i = 0; i < tensors.size(); ++i) {
+    ET_KERNEL_CHECK(
+        ctx,
+        tensors_have_same_dim_order(tensors[i], out),
+        InvalidArgument,
+        out);
+  }
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(out), InvalidArgument, out);
+
   Tensor::SizesType expected_out_size[kTensorDimensionLimit];
   size_t expected_out_dim = 0;
   get_stack_out_target_size(tensors, dim, expected_out_size, &expected_out_dim);
diff --git a/kernels/portable/cpu/op_sub.cpp b/kernels/portable/cpu/op_sub.cpp
index 04254653a43..b97b7b490f3 100644
--- a/kernels/portable/cpu/op_sub.cpp
+++ b/kernels/portable/cpu/op_sub.cpp
@@ -78,6 +78,9 @@ Tensor& sub_out(
       InvalidArgument,
       out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out);
+
   ET_KERNEL_CHECK(ctx, tensor_is_realh_type(out), InvalidArgument, out);
 
   ScalarType a_type = a.scalar_type();
@@ -131,6 +134,9 @@ Tensor& sub_scalar_out(
 
   ET_KERNEL_CHECK(ctx, tensor_is_realh_type(out), InvalidArgument, out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out);
+
   ScalarType a_type = a.scalar_type();
   ScalarType b_type = utils::get_scalar_dtype(b);
   ScalarType alpha_type = utils::get_scalar_dtype(alpha);
diff --git a/kernels/portable/cpu/op_sum.cpp b/kernels/portable/cpu/op_sum.cpp
index dfa897206a9..c9a4260344e 100644
--- a/kernels/portable/cpu/op_sum.cpp
+++ b/kernels/portable/cpu/op_sum.cpp
@@ -38,6 +38,11 @@ Tensor& sum_dim_out(
       InvalidArgument,
       out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out);
+
   ET_SWITCH_REAL_TYPES_AND(
       Bool, in.scalar_type(), ctx, "sum.IntList_out", CTYPE_IN, [&] {
         ET_SWITCH_REAL_TYPES_AND(
diff --git a/kernels/portable/cpu/op_t_copy.cpp b/kernels/portable/cpu/op_t_copy.cpp
index c6a2ad5fdb5..46807a42f22 100644
--- a/kernels/portable/cpu/op_t_copy.cpp
+++ b/kernels/portable/cpu/op_t_copy.cpp
@@ -47,6 +47,11 @@ Tensor& t_copy_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) {
     return out;
   }
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out);
+
   Tensor::SizesType expected_out_size[kTensorDimensionLimit];
   size_t expected_out_dim = 0;
   get_transpose_out_target_size(in, 1, 0, expected_out_size, &expected_out_dim);
diff --git a/kernels/portable/cpu/op_to_copy.cpp b/kernels/portable/cpu/op_to_copy.cpp
index c0c04e65e93..46bd0bf987e 100644
--- a/kernels/portable/cpu/op_to_copy.cpp
+++ b/kernels/portable/cpu/op_to_copy.cpp
@@ -46,6 +46,11 @@ Tensor& to_copy_out(
       InvalidArgument,
       out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(self, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(self), InvalidArgument, out);
+
   ET_SWITCH_REALHBBF16_TYPES(self.scalar_type(), ctx, "to_copy", CTYPE_IN, [&] {
     ET_SWITCH_REALHBBF16_TYPES(
         out.scalar_type(), ctx, "to_copy", CTYPE_OUT, [&] {
diff --git a/kernels/portable/cpu/op_transpose_copy.cpp b/kernels/portable/cpu/op_transpose_copy.cpp
index 79c04646a73..d2456b8592e 100644
--- a/kernels/portable/cpu/op_transpose_copy.cpp
+++ b/kernels/portable/cpu/op_transpose_copy.cpp
@@ -57,6 +57,9 @@ Tensor& transpose_copy_int_out(
       InvalidArgument,
       out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
   ET_SWITCH_ALL_TYPES(in.scalar_type(), ctx, __func__, CTYPE, [&] {
     transpose_tensors(in, dim0, dim1, out);
   });
diff --git a/kernels/portable/cpu/op_tril.cpp b/kernels/portable/cpu/op_tril.cpp
index cdf87bea4ba..46a91e8c627 100644
--- a/kernels/portable/cpu/op_tril.cpp
+++ b/kernels/portable/cpu/op_tril.cpp
@@ -145,6 +145,11 @@ Tensor& tril_out(
       InvalidArgument,
       out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(self, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(self), InvalidArgument, out);
+
   if (self.numel() == 0) {
     return out;
   }
diff --git a/kernels/portable/cpu/op_unbind_copy.cpp b/kernels/portable/cpu/op_unbind_copy.cpp
index da5a73d624c..cea4ccce345 100644
--- a/kernels/portable/cpu/op_unbind_copy.cpp
+++ b/kernels/portable/cpu/op_unbind_copy.cpp
@@ -36,6 +36,13 @@ void unbind_copy_int_out(
   ET_KERNEL_CHECK(
       ctx, check_unbind_copy_args(input, dim, out), InvalidArgument, );
 
+  for (int i = 0; i < out.size(); ++i) {
+    ET_KERNEL_CHECK(
+        ctx, tensors_have_same_dim_order(input, out[i]), InvalidArgument, );
+  }
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(input), InvalidArgument, );
+
   if (input.numel() == 0) {
     return;
   }
diff --git a/kernels/portable/cpu/op_unsqueeze_copy.cpp b/kernels/portable/cpu/op_unsqueeze_copy.cpp
index f6d25a04983..1c0a5c79990 100644
--- a/kernels/portable/cpu/op_unsqueeze_copy.cpp
+++ b/kernels/portable/cpu/op_unsqueeze_copy.cpp
@@ -38,6 +38,11 @@ Tensor& unsqueeze_copy_out(
   ET_KERNEL_CHECK(ctx, self.dim() + 1 == out.dim(), InvalidArgument, out);
   ET_KERNEL_CHECK(ctx, dim <= self.dim(), InvalidArgument, out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(self, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(self), InvalidArgument, out);
+
   for (size_t i = 0; i < out.dim(); ++i) {
     if (i < dim) {
       expected_output_size[i] = self.size(i);
diff --git a/kernels/portable/cpu/op_var.cpp b/kernels/portable/cpu/op_var.cpp
index 52019e381c0..fa49269196e 100644
--- a/kernels/portable/cpu/op_var.cpp
+++ b/kernels/portable/cpu/op_var.cpp
@@ -74,6 +74,11 @@ Tensor& var_out(
   ET_KERNEL_CHECK(ctx, tensor_is_floating_type(in), InvalidArgument, out);
   ET_KERNEL_CHECK(ctx, tensor_is_floating_type(out), InvalidArgument, out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out);
+
   ET_KERNEL_CHECK(
       ctx,
       resize_reduction_out(in, dim_list, keepdim, out) == Error::Ok,
diff --git a/kernels/portable/cpu/op_view_copy.cpp b/kernels/portable/cpu/op_view_copy.cpp
index f7174caac1e..ba72396b44f 100644
--- a/kernels/portable/cpu/op_view_copy.cpp
+++ b/kernels/portable/cpu/op_view_copy.cpp
@@ -44,6 +44,11 @@ Tensor& view_copy_out(
       out,
       "Failed to resize output tensor.");
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(self, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(self), InvalidArgument, out);
+
   ET_KERNEL_CHECK(
       ctx, check_view_copy_args(self, size_int64_t, out), InvalidArgument, out);
diff --git a/kernels/portable/cpu/op_where.cpp b/kernels/portable/cpu/op_where.cpp
index 6ff4cb85fb3..90f6e3df92b 100644
--- a/kernels/portable/cpu/op_where.cpp
+++ b/kernels/portable/cpu/op_where.cpp
@@ -35,6 +35,9 @@ Tensor& where_out(
       InvalidArgument,
       out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(cond, a, b, out), InvalidArgument, out);
+
   constexpr auto name = "where.self_out";
 
   ET_CHECK_MSG(
diff --git a/kernels/portable/cpu/util/matmul_ops_util.cpp b/kernels/portable/cpu/util/matmul_ops_util.cpp
index d7e49d64958..3d4f2e5e9ba 100644
--- a/kernels/portable/cpu/util/matmul_ops_util.cpp
+++ b/kernels/portable/cpu/util/matmul_ops_util.cpp
@@ -71,6 +71,19 @@ bool check_mm_args(const Tensor& in, const Tensor& mat2, Tensor& out) {
   return true;
 }
 
+bool check_linear_args(const Tensor& in, const Tensor& mat2, Tensor& out) {
+  ET_LOG_AND_RETURN_IF_FALSE(in.dim() == out.dim());
+  ET_LOG_AND_RETURN_IF_FALSE(in.dim() >= 2);
+  ET_LOG_AND_RETURN_IF_FALSE(tensor_is_rank(mat2, 2));
+
+  ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, mat2, out));
+
+  ET_LOG_AND_RETURN_IF_FALSE(
+      tensors_have_same_size_at_dims(in, in.dim() - 1, mat2, 1));
+
+  return true;
+}
+
 void get_mm_out_target_size(
     const Tensor& mat1,
     const Tensor& mat2,
@@ -81,5 +94,17 @@ void get_mm_out_target_size(
   out_sizes[1] = mat2.size(1);
 }
 
+void get_linear_out_target_size(
+    const Tensor& mat1,
+    const Tensor& mat2,
+    Tensor::SizesType* out_sizes,
+    size_t* out_ndim) {
+  *out_ndim = mat1.dim();
+  for (int ii = 0; ii < mat1.dim() - 1; ++ii) {
+    out_sizes[ii] = mat1.sizes()[ii];
+  }
+  out_sizes[mat1.dim() - 1] = mat2.size(0);
+}
+
 } // namespace executor
 } // namespace torch
diff --git a/kernels/portable/cpu/util/matmul_ops_util.h b/kernels/portable/cpu/util/matmul_ops_util.h
index 91e27ff2cc9..d2991868e95 100644
--- a/kernels/portable/cpu/util/matmul_ops_util.h
+++ b/kernels/portable/cpu/util/matmul_ops_util.h
@@ -37,5 +37,13 @@ void get_mm_out_target_size(
     Tensor::SizesType* out_sizes,
     size_t* out_ndim);
 
+bool check_linear_args(const Tensor& in, const Tensor& mat2, Tensor& out);
+
+void get_linear_out_target_size(
+    const Tensor& mat1,
+    const Tensor& mat2,
+    Tensor::SizesType* out_sizes,
+    size_t* out_ndim);
+
 } // namespace executor
 } // namespace torch
diff --git a/kernels/portable/cpu/util/select_copy_util.cpp b/kernels/portable/cpu/util/select_copy_util.cpp
index cf56b3e4ca2..2564317b043 100644
--- a/kernels/portable/cpu/util/select_copy_util.cpp
+++ b/kernels/portable/cpu/util/select_copy_util.cpp
@@ -38,6 +38,10 @@ Error select_copy_util(
     return Error::InvalidArgument;
   }
 
+  if (!tensors_have_same_dim_order(in, out)) {
+    return Error::InvalidArgument;
+  }
+
   // If the input is a empty tensor, no other operation could be done. We just
   // return the output.
   if (in.numel() == 0) {
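The long run of hunks above installs the same two guards across the portable kernels, so one explanation covers all of them. Dim order describes a tensor's physical layout as a permutation of its dimensions: a contiguous 4-D (NCHW) tensor has dim order {0, 1, 2, 3}, the default, while the same logical tensor laid out channels-last is {0, 2, 3, 1}. tensors_have_same_dim_order rejects calls that mix layouts between inputs and outputs, and tensor_is_default_dim_order additionally restricts kernels whose indexing assumes contiguous strides; both fail the kernel with InvalidArgument instead of silently reading memory in the wrong order.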
diff --git a/kernels/test/op_linear_test.cpp b/kernels/test/op_linear_test.cpp
new file mode 100644
index 00000000000..96875cc6f77
--- /dev/null
+++ b/kernels/test/op_linear_test.cpp
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/kernels/test/FunctionHeaderWrapper.h> // Declares the operator
+#include <executorch/kernels/test/TestUtil.h>
+#include <executorch/kernels/test/supported_features.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
+#include <executorch/runtime/platform/runtime.h>
+
+#include <gtest/gtest.h>
+#include <limits>
+
+using namespace ::testing;
+using exec_aten::ArrayRef;
+using exec_aten::Scalar;
+using exec_aten::ScalarType;
+using exec_aten::Tensor;
+using torch::executor::testing::TensorFactory;
+
+class OpLinearOutTest : public OperatorTest {
+ protected:
+  Tensor& op_linear_out(const Tensor& self, const Tensor& mat2, Tensor& out) {
+    return torch::executor::aten::linear_outf(context_, self, mat2, {}, out);
+  }
+
+  template <class CTYPE, exec_aten::ScalarType DTYPE>
+  void test_dtype() {
+    TensorFactory<DTYPE> tf;
+
+    if (torch::executor::testing::SupportedFeatures::get()->is_aten) {
+      if (DTYPE == ScalarType::Half) {
+        GTEST_SKIP()
+            << "skip Half because torch::executor::aten::mm_out does not support Half";
+        return;
+      }
+    }
+
+    // matmul gives 4 * 2 * 3 = 24
+    Tensor x = tf.full({3, 4}, 2);
+    Tensor y = tf.full({5, 4}, 3);
+
+    // Output shape should be (3, 5)
+    Tensor out = tf.zeros({3, 5});
+
+    op_linear_out(x, y, out);
+
+    Tensor expected = tf.full({3, 5}, 24);
+
+    EXPECT_TENSOR_EQ(out, expected);
+  }
+};
+
+TEST_F(OpLinearOutTest, OutputDim) {
+  TensorFactory<ScalarType::Int> tf;
+
+  // 3 tensors with compatible dimensions: (3, 4), (5, 4) and (3, 5).
+  Tensor x = tf.ones({3, 4});
+  Tensor y = tf.ones({5, 4});
+  Tensor out = tf.zeros({3, 5});
+
+  Tensor ret = op_linear_out(x, y, out);
+
+  // Should always return the provided out Tensor.
+  EXPECT_TENSOR_EQ(ret, out);
+
+  // Expected tensor, filled with 4.
+  Tensor expected = tf.full({3, 5}, 4);
+
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+/// A generic smoke test that works for any dtype that supports ones() and
+/// zeros().
+TEST_F(OpLinearOutTest, AllDtypesSupported) {
+#define TEST_ENTRY(ctype, dtype) test_dtype<ctype, ScalarType::dtype>();
+  ET_FORALL_REALHBF16_TYPES(TEST_ENTRY);
+#undef TEST_ENTRY
+  // TODO: Also add tests for half, complex, quantized, and other types. Easiest
+  // way to do that would be to make TensorFactory support zeros() and ones()
+  // for those types.
+}
+
+TEST_F(OpLinearOutTest, EmptyInputWithEmptyOutTensorPasses) {
+  TensorFactory<ScalarType::Float> tf;
+
+  // Empty input matrices
+  Tensor x = tf.make({0, 3}, {});
+  Tensor y = tf.make({0, 3}, {});
+
+  // Output matrix is also empty
+  Tensor out = tf.make({0, 0}, {});
+
+  Tensor expected = tf.make({0, 0}, {});
+
+  EXPECT_TENSOR_EQ(op_linear_out(x, y, out), expected);
+}
+
+TEST_F(OpLinearOutTest, InfinityTensorPasses) {
+  TensorFactory<ScalarType::Float> tff;
+
+  Tensor x = tff.full({3, 4}, std::numeric_limits<float>::infinity());
+  Tensor y = tff.full({5, 4}, 3);
+
+  // Output shape should be (3, 5)
+  Tensor out = tff.zeros({3, 5});
+
+  Tensor expected = tff.full({3, 5}, std::numeric_limits<float>::infinity());
+
+  EXPECT_TENSOR_EQ(op_linear_out(x, y, out), expected);
+}
+
+TEST_F(OpLinearOutTest, MismatchedDimensionsDies) {
+  TensorFactory<ScalarType::Int> tf;
+
+  Tensor x = tf.full({2, 2}, 3);
+
+  Tensor wrong_y = tf.full({1, 3}, 1);
+  Tensor right_y = tf.full({2, 2}, 1);
+
+  // Make an empty out tensor and demonstrate that it's empty.
+  Tensor out = tf.full({2, 2}, 0);
+
+  Tensor expected = tf.full({2, 2}, 6);
+  ET_EXPECT_KERNEL_FAILURE(context_, op_linear_out(x, wrong_y, out));
+
+  EXPECT_TENSOR_EQ(op_linear_out(x, right_y, out), expected);
+}
+
+TEST_F(OpLinearOutTest, MismatchedDimensionSizeDies) {
+  if (torch::executor::testing::SupportedFeatures::get()->is_aten) {
+    GTEST_SKIP() << "ATen kernel can handle mismatched dimension size";
+  }
+  TensorFactory<ScalarType::Int> tf;
+  Tensor x = tf.full({2, 2}, 3);
+
+  // wrong_y has incompatible dim
+  Tensor wrong_y = tf.full({2, 2, 2}, 1);
+  Tensor right_y = tf.full({2, 2}, 1);
+
+  // wrong_out has incompatible dim
+  Tensor right_out = tf.ones({2, 2});
+  Tensor wrong_out = tf.ones({2, 2, 3});
+
+  ET_EXPECT_KERNEL_FAILURE(context_, op_linear_out(x, right_y, wrong_out));
+  ET_EXPECT_KERNEL_FAILURE(context_, op_linear_out(x, wrong_y, right_out));
+}
+
+TEST_F(OpLinearOutTest, WrongOutShapeDies) {
+  if (torch::executor::testing::SupportedFeatures::get()->is_aten) {
+    GTEST_SKIP() << "ATen kernel can handle wrong out shape";
+  }
+  TensorFactory<ScalarType::Int> tf;
+  Tensor x = tf.ones({10, 3});
+
+  Tensor y = tf.ones({4, 3});
+
+  // wrong_out has incompatible shape
+  Tensor right_out = tf.ones({10, 4});
+  Tensor wrong_out = tf.ones({7, 5});
+
+  ET_EXPECT_KERNEL_FAILURE(context_, op_linear_out(x, y, wrong_out));
+
+  EXPECT_TENSOR_EQ(op_linear_out(x, y, right_out), tf.full({10, 4}, 3));
+}
+
+TEST_F(OpLinearOutTest, DynamicShapeUpperBoundSameAsExpected) {
+  TensorFactory<ScalarType::Float> tf;
+
+  Tensor x = tf.make(
+      {3, 2},
+      {0.17412060499191284,
+       0.34793388843536377,
+       0.8187907934188843,
+       0.9979893565177917,
+       0.7049332857131958,
+       0.4255824089050293});
+  Tensor y = tf.make(
+      {4, 2},
+      {0.8071839213371277,
+       0.31638312339782715,
+       0.13667285442352295,
+       0.3691965937614441,
+       0.9002121090888977,
+       0.09420186281204224,
+       0.9070476293563843,
+       0.9310881495475769});
+  Tensor expected_result = tf.make(
+      {3, 4},
+      {0.2506277561187744,
+       0.15225356817245483,
+       0.18952149152755737,
+       0.48189279437065125,
+       0.976661741733551,
+       0.480360746383667,
+       0.8310978412628174,
+       1.6718982458114624,
+       0.703657865524292,
+       0.2534688115119934,
+       0.6746801733970642,
+       1.0356627702713013});
+
+  Tensor out =
+      tf.zeros({3, 4}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND);
+  Tensor ret = op_linear_out(x, y, out);
+  EXPECT_TENSOR_CLOSE(out, expected_result);
+}
+
+TEST_F(OpLinearOutTest, DynamicShapeUpperBoundLargerThanExpected) {
+  TensorFactory<ScalarType::Float> tf;
+
+  Tensor x = tf.make(
+      {3, 2},
+      {0.17412060499191284,
+       0.34793388843536377,
+       0.8187907934188843,
+       0.9979893565177917,
+       0.7049332857131958,
+       0.4255824089050293});
+  Tensor y = tf.make(
+      {4, 2},
+      {0.8071839213371277,
+       0.31638312339782715,
+       0.13667285442352295,
+       0.3691965937614441,
+       0.9002121090888977,
+       0.09420186281204224,
+       0.9070476293563843,
+       0.9310881495475769});
+  Tensor expected_result = tf.make(
+      {3, 4},
+      {0.2506277561187744,
+       0.15225356817245483,
+       0.18952149152755737,
+       0.48189279437065125,
+       0.976661741733551,
+       0.480360746383667,
+       0.8310978412628174,
+       1.6718982458114624,
+       0.703657865524292,
+       0.2534688115119934,
+       0.6746801733970642,
+       1.0356627702713013});
+
+  Tensor out =
+      tf.zeros({10, 10}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND);
+  Tensor ret = op_linear_out(x, y, out);
+  EXPECT_TENSOR_CLOSE(out, expected_result);
+}
+
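The two upper-bound tests work because the out tensor is created with DYNAMIC_BOUND shape dynamism: its initial sizes act as capacity, and the kernel's internal resize may shrink the shape to the computed [3, 4] result. Sketch of that contract (same TensorFactory API as above):

// Capacity 10x10; DYNAMIC_BOUND lets the kernel resize down to 3x4.
Tensor out =
    tf.zeros({10, 10}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND);
op_linear_out(x, y, out); // internally calls resize_tensor(out, {3, 4})
// out.sizes() is now {3, 4}; with STATIC dynamism the resize would fail,
// and growing past the original capacity (the skipped DYNAMIC_UNBOUND test
// below) is not yet supported.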
+       0.9979893565177917,
+       0.7049332857131958,
+       0.4255824089050293});
+  Tensor y = tf.make(
+      {4, 2},
+      {0.8071839213371277,
+       0.31638312339782715,
+       0.13667285442352295,
+       0.3691965937614441,
+       0.9002121090888977,
+       0.09420186281204224,
+       0.9070476293563843,
+       0.9310881495475769});
+  Tensor expected_result = tf.make(
+      {3, 4},
+      {0.2506277561187744,
+       0.15225356817245483,
+       0.18952149152755737,
+       0.48189279437065125,
+       0.976661741733551,
+       0.480360746383667,
+       0.8310978412628174,
+       1.6718982458114624,
+       0.703657865524292,
+       0.2534688115119934,
+       0.6746801733970642,
+       1.0356627702713013});
+
+  Tensor out =
+      tf.zeros({1, 1}, torch::executor::TensorShapeDynamism::DYNAMIC_UNBOUND);
+  Tensor ret = op_linear_out(x, y, out);
+  EXPECT_TENSOR_CLOSE(out, expected_result);
+}
+
+// TODO: support and test bias
diff --git a/kernels/test/op_mul_test.cpp b/kernels/test/op_mul_test.cpp
index 84a7e8dedc4..f8205ea601e 100644
--- a/kernels/test/op_mul_test.cpp
+++ b/kernels/test/op_mul_test.cpp
@@ -586,3 +586,29 @@ TEST_F(OpMulScalarOutTest, OptimizedSanityCheck) {
   // Check that it matches the expected output.
   EXPECT_TENSOR_CLOSE(out, tf.make(sizes, {2.6, 4.2, 9.2, 16.4}));
 }
+
+TEST_F(OpMulScalarOutTest, HalfSanityCheck) {
+  TensorFactory<ScalarType::Half> tf;
+
+  const std::vector<int32_t> sizes = {2, 2};
+
+  Tensor out = tf.zeros(sizes);
+
+  op_mul_scalar_out(tf.make(sizes, {1.3, 2.1, 4.6, 8.2}), 2.0, out);
+
+  // Check that it matches the expected output.
+  EXPECT_TENSOR_CLOSE(out, tf.make(sizes, {2.6, 4.2, 9.2, 16.4}));
+}
+
+TEST_F(OpMulScalarOutTest, BFloat16SanityCheck) {
+  TensorFactory<ScalarType::BFloat16> tf;
+
+  const std::vector<int32_t> sizes = {2, 2};
+
+  Tensor out = tf.zeros(sizes);
+
+  op_mul_scalar_out(tf.make(sizes, {1.3, 2.1, 4.6, 8.2}), 2.0, out);
+
+  // Check that it matches the expected output.
+  EXPECT_TENSOR_CLOSE(out, tf.make(sizes, {2.6, 4.2, 9.2, 16.4}));
+}
diff --git a/kernels/test/op_slice_scatter_test.cpp b/kernels/test/op_slice_scatter_test.cpp
index 1d5c8a43b10..1d5e972ef2e 100644
--- a/kernels/test/op_slice_scatter_test.cpp
+++ b/kernels/test/op_slice_scatter_test.cpp
@@ -863,3 +863,24 @@ TEST_F(OpSliceScatterTensorOutTest, DynamicShapeTest) {
   EXPECT_TENSOR_EQ(ret_default_end, out);
   EXPECT_TENSOR_EQ(ret_default_end, expected);
 }
+
+TEST_F(OpSliceScatterTensorOutTest, LargeEndValue) {
+  TensorFactory<ScalarType::Float> tf;
+
+  Tensor input = tf.zeros({1, 1, 2, 5, 3, 3});
+  Tensor src = tf.ones({1, 1, 2, 5, 3, 3});
+
+  Tensor out = tf.zeros({1, 1, 2, 5, 3, 3});
+  Tensor expected = tf.ones({1, 1, 2, 5, 3, 3});
+
+  Tensor ret = op_slice_scatter_out(
+      input,
+      src,
+      /*dim=*/1,
+      /*start=*/0,
+      /*end=*/9223372036854775807,
+      /*step=*/1,
+      out);
+  EXPECT_TENSOR_EQ(ret, out);
+  EXPECT_TENSOR_EQ(ret, expected);
+}
diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl
index 7ae17c5237a..f8ea484435a 100644
--- a/kernels/test/targets.bzl
+++ b/kernels/test/targets.bzl
@@ -226,6 +226,7 @@ def define_common_targets():
     _common_op_test("op_le_test", ["aten", "portable", "optimized"])
     _common_op_test("op_leaky_relu_test", ["aten", "portable"])
     _common_op_test("op_lift_fresh_copy_test", ["aten", "portable"])
+    _common_op_test("op_linear_test", ["aten", "optimized"])
     _common_op_test("op_log_softmax_test", ["aten", "portable", "optimized"])
     _common_op_test("op_log_test", ["aten", "portable"])
     _common_op_test("op_log10_test", ["aten", "portable"])
@@ -244,7 +245,7 @@ def define_common_targets():
     _common_op_test("op_mean_test", ["aten", "portable"])
     _common_op_test("op_min_test", ["aten", "portable"])
     _common_op_test("op_minimum_test", ["aten", "portable"])
-    _common_op_test("op_mm_test", ["aten", "portable"])
+    _common_op_test("op_mm_test", ["aten", "portable", "optimized"])
     _common_op_test("op_mul_test", ["aten", "portable", "optimized"])
     _common_op_test("op_narrow_copy_test", ["aten", "portable"])
     _common_op_test("op_native_batch_norm_test", ["aten", "portable"])
diff --git a/runtime/core/exec_aten/util/scalar_type_util.h b/runtime/core/exec_aten/util/scalar_type_util.h
index 4d8712c1590..7c576f889fb 100644
--- a/runtime/core/exec_aten/util/scalar_type_util.h
+++ b/runtime/core/exec_aten/util/scalar_type_util.h
@@ -73,6 +73,10 @@ struct is_reduced_floating_point
         bool,
         std::is_same<T, torch::executor::Half>::value ||
             std::is_same<T, torch::executor::BFloat16>::value> {};
+
+template <typename T>
+constexpr bool is_reduced_floating_point_v =
+    is_reduced_floating_point<T>::value;
 #endif
 
 /// Maps ScalarTypes to C++ types.
diff --git a/runtime/core/portable_type/half.h b/runtime/core/portable_type/half.h
index 5aded68270b..8987d82804b 100644
--- a/runtime/core/portable_type/half.h
+++ b/runtime/core/portable_type/half.h
@@ -62,7 +62,7 @@ struct alignas(2) Half {
 namespace internal {
 
 inline float fp32_from_bits(uint32_t w) {
-  static_assert(sizeof(float) == sizeof(uint32_t), "");
+  static_assert(sizeof(float) == sizeof(uint32_t));
   union {
     uint32_t as_bits;
     float as_value;
@@ -71,7 +71,7 @@ inline float fp32_from_bits(uint32_t w) {
 }
 
 inline uint32_t fp32_to_bits(float f) {
-  static_assert(sizeof(float) == sizeof(uint32_t), "");
+  static_assert(sizeof(float) == sizeof(uint32_t));
   union {
     float as_value;
     uint32_t as_bits;
diff --git a/runtime/core/portable_type/string_view.h b/runtime/core/portable_type/string_view.h
index 4036539ccc5..47a9f335eb5 100644
--- a/runtime/core/portable_type/string_view.h
+++ b/runtime/core/portable_type/string_view.h
@@ -79,13 +79,10 @@ class basic_string_view final {
   }
 
   constexpr const_reference at(size_type pos) const {
-    return (pos >= size_)
-        ? (ET_ASSERT_MESSAGE_EMIT(
-               " (%s): "
-               "string_view::operator[] or string_view::at() out of range",
-               pos >= size_),
-           torch::executor::runtime_abort())
-        : at_(pos);
+    ET_CHECK_MSG(
+        pos < size_,
+        "string_view::operator[] or string_view::at() out of range");
+    return at_(pos);
   }
 
   constexpr const_reference front() const {
@@ -140,13 +137,9 @@ class basic_string_view final {
   constexpr basic_string_view substr(size_type pos = 0, size_type count = npos)
       const {
-    return (pos > size_)
-        ? (ET_ASSERT_MESSAGE_EMIT(
-               " (%s): "
-               "basic_string_view::substr parameter out of bounds.",
-               pos > size_),
-           torch::executor::runtime_abort())
-        : substr_(pos, count);
+    ET_CHECK_MSG(
+        pos <= size_, "basic_string_view::substr parameter out of bounds.");
+    return substr_(pos, count);
   }
 
   constexpr int compare(basic_string_view rhs) const noexcept {
diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp
index d39ba875531..a6ed7e354a9 100644
--- a/runtime/executor/method.cpp
+++ b/runtime/executor/method.cpp
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include <executorch/runtime/executor/platform_memory_allocator.h>
 #include
 #include
 #include
@@ -29,6 +30,8 @@
 namespace executorch {
 namespace runtime {
 
+using internal::PlatformMemoryAllocator;
+
 /**
  * Runtime state for a backend delegate.
  */
@@ -527,19 +530,20 @@ Error Method::resolve_operator(
           i,
           static_cast<uint32_t>(err));
       meta[count].dim_order_ =
-          ArrayRef<exec_aten::DimOrderType>(dim_order_ptr, size);
+          Span<exec_aten::DimOrderType>(dim_order_ptr, size);
       count++;
     }
   }
-  // search kernel
-  if (hasOpsFn(operator_name, ArrayRef<TensorMeta>(meta, count))) {
-    kernels[kernel_index] =
-        getOpsFn(operator_name, ArrayRef<TensorMeta>(meta, count));
-    return Error::Ok;
-  } else {
+
+  // Find a kernel with the matching name and tensor meta.
+  Result<OpFunction> op_function =
+      get_op_function_from_registry(operator_name, {meta, count});
+  if (!op_function.ok()) {
     ET_LOG(Error, "Missing operator: [%d] %s", op_index, operator_name);
-    return Error::OperatorMissing;
+    return op_function.error();
   }
+  kernels[kernel_index] = op_function.get();
+  return Error::Ok;
 }
 
 Result<Method> Method::load(
@@ -547,7 +551,16 @@
     const Program* program,
     MemoryManager* memory_manager,
     EventTracer* event_tracer) {
-  Method method(program, memory_manager, event_tracer);
+  MemoryAllocator* temp_allocator = memory_manager->temp_allocator();
+  if (temp_allocator == nullptr) {
+    PlatformMemoryAllocator* platform_allocator =
+        ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(
+            memory_manager->method_allocator(), PlatformMemoryAllocator);
+    new (platform_allocator) PlatformMemoryAllocator();
+    temp_allocator = platform_allocator;
+  }
+  Method method(program, memory_manager, event_tracer, temp_allocator);
+
   Error err = method.init(s_plan);
   if (err != Error::Ok) {
     return err;
@@ -1038,16 +1051,14 @@ Error Method::execute_instruction() {
   auto instruction = instructions->Get(step_state_.instr_idx);
   size_t next_instr_idx = step_state_.instr_idx + 1;
   Error err = Error::Ok;
+
   switch (instruction->instr_args_type()) {
     case executorch_flatbuffer::InstructionArguments::KernelCall: {
       EXECUTORCH_SCOPE_PROF("OPERATOR_CALL");
       internal::EventTracerProfileScope event_tracer_scope =
           internal::EventTracerProfileScope(event_tracer_, "OPERATOR_CALL");
       // TODO(T147221312): Also expose tensor resizer via the context.
-      // The temp_allocator passed can be null, but calling allocate_temp will
-      // fail
-      KernelRuntimeContext context(
-          event_tracer_, memory_manager_->temp_allocator());
+      KernelRuntimeContext context(event_tracer_, temp_allocator_);
       auto args = chain.argument_lists_[step_state_.instr_idx];
       chain.kernels_[step_state_.instr_idx](context, args.data());
       // We reset the temp_allocator after the switch statement
@@ -1095,7 +1106,7 @@
           step_state_.instr_idx);
       BackendExecutionContext backend_execution_context(
           /*event_tracer*/ event_tracer_,
-          /*temp_allocator*/ memory_manager_->temp_allocator());
+          /*temp_allocator*/ temp_allocator_);
       err = delegates_[delegate_idx].Execute(
           backend_execution_context,
           chain.argument_lists_[step_state_.instr_idx].data());
@@ -1167,8 +1178,8 @@
       err = Error::InvalidProgram;
   }
   // Reset the temp allocator for every instruction.
-  if (memory_manager_->temp_allocator() != nullptr) {
-    memory_manager_->temp_allocator()->reset();
+  if (temp_allocator_ != nullptr) {
+    temp_allocator_->reset();
   }
   if (err == Error::Ok) {
     step_state_.instr_idx = next_instr_idx;
diff --git a/runtime/executor/method.h b/runtime/executor/method.h
index 7d96096accf..0a35d6b9282 100644
--- a/runtime/executor/method.h
+++ b/runtime/executor/method.h
@@ -53,6 +53,7 @@ class Method final {
       : step_state_(rhs.step_state_),
         program_(rhs.program_),
         memory_manager_(rhs.memory_manager_),
+        temp_allocator_(rhs.temp_allocator_),
         serialization_plan_(rhs.serialization_plan_),
         event_tracer_(rhs.event_tracer_),
         n_value_(rhs.n_value_),
@@ -273,10 +274,12 @@ class Method final {
   Method(
       const Program* program,
       MemoryManager* memory_manager,
-      EventTracer* event_tracer)
+      EventTracer* event_tracer,
+      MemoryAllocator* temp_allocator)
       : step_state_(),
         program_(program),
         memory_manager_(memory_manager),
+        temp_allocator_(temp_allocator),
         serialization_plan_(nullptr),
         event_tracer_(event_tracer),
         n_value_(0),
@@ -319,6 +322,7 @@ class Method final {
   StepState step_state_;
   const Program* program_;
   MemoryManager* memory_manager_;
+  MemoryAllocator* temp_allocator_;
   executorch_flatbuffer::ExecutionPlan* serialization_plan_;
   EventTracer* event_tracer_;
diff --git a/runtime/executor/platform_memory_allocator.h b/runtime/executor/platform_memory_allocator.h
new file mode 100644
index 00000000000..09195a460ac
--- /dev/null
+++ b/runtime/executor/platform_memory_allocator.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <stdint.h>
+#include <cstddef>
+#include <cstdlib>
+
+#include <executorch/runtime/core/memory_allocator.h>
+#include <executorch/runtime/platform/compiler.h>
+#include <executorch/runtime/platform/platform.h>
+
+namespace executorch {
+namespace runtime {
+namespace internal {
+
+/**
+ * PlatformMemoryAllocator is a memory allocator that uses a linked list to
+ * manage allocated nodes. It overrides the allocate method of MemoryAllocator
+ * using the PAL fallback allocator method `et_pal_allocate`.
+ */
+class PlatformMemoryAllocator final : public MemoryAllocator {
+ private:
+  // We allocate a little more than requested and use that memory as a node in
+  // a linked list, pushing the allocated buffers onto a list that's iterated
+  // and freed when the allocator is reset or destroyed.
+  struct AllocationNode {
+    void* data;
+    AllocationNode* next;
+  };
+
+  AllocationNode* head_ = nullptr;
+
+ public:
+  PlatformMemoryAllocator() : MemoryAllocator(0, nullptr) {}
+
+  void* allocate(size_t size, size_t alignment = kDefaultAlignment) override {
+    if (!isPowerOf2(alignment)) {
+      ET_LOG(Error, "Alignment %zu is not a power of 2", alignment);
+      return nullptr;
+    }
+
+    // Allocate enough memory for the node, the data and the alignment bump.
+    size_t alloc_size = sizeof(AllocationNode) + size + alignment;
+    void* node_memory = et_pal_allocate(alloc_size);
+
+    // If allocation failed, log message and return nullptr.
+    if (node_memory == nullptr) {
+      ET_LOG(Error, "Failed to allocate %zu bytes", alloc_size);
+      return nullptr;
+    }
+
+    // Compute data pointer.
+    uint8_t* data_ptr =
+        reinterpret_cast<uint8_t*>(node_memory) + sizeof(AllocationNode);
+
+    // Align the data pointer.
+    void* aligned_data_ptr = alignPointer(data_ptr, alignment);
+
+    // Assert that the alignment didn't overflow the allocated memory.
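+    // Worked example of this bound (illustrative numbers, not from the diff):
+    // on a 64-bit build sizeof(AllocationNode) == 16, so for size = 16 and
+    // alignment = 8 we get alloc_size = 16 + 16 + 8 = 40 and data_ptr at
+    // node_memory + 16. alignPointer() advances data_ptr by at most
+    // alignment - 1 = 7 bytes, so aligned_data_ptr + size <= node_memory + 39,
+    // which stays within the 40 allocated bytes and the check always holds.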
+    ET_DCHECK_MSG(
+        reinterpret_cast<uintptr_t>(aligned_data_ptr) + size <=
+            reinterpret_cast<uintptr_t>(node_memory) + alloc_size,
+        "aligned_data_ptr %p + size %zu > node_memory %p + alloc_size %zu",
+        aligned_data_ptr,
+        size,
+        node_memory,
+        alloc_size);
+
+    // Construct the node.
+    AllocationNode* new_node = reinterpret_cast<AllocationNode*>(node_memory);
+    new_node->data = aligned_data_ptr;
+    new_node->next = head_;
+    head_ = new_node;
+
+    // Return the aligned data pointer.
+    return head_->data;
+  }
+
+  void reset() override {
+    AllocationNode* current = head_;
+    while (current != nullptr) {
+      AllocationNode* next = current->next;
+      et_pal_free(current);
+      current = next;
+    }
+    head_ = nullptr;
+  }
+
+  ~PlatformMemoryAllocator() override {
+    reset();
+  }
+
+ private:
+  // Disable copy and move.
+  PlatformMemoryAllocator(const PlatformMemoryAllocator&) = delete;
+  PlatformMemoryAllocator& operator=(const PlatformMemoryAllocator&) = delete;
+  PlatformMemoryAllocator(PlatformMemoryAllocator&&) noexcept = delete;
+  PlatformMemoryAllocator& operator=(PlatformMemoryAllocator&&) noexcept =
+      delete;
+};
+
+} // namespace internal
+} // namespace runtime
+} // namespace executorch
diff --git a/runtime/executor/program.h b/runtime/executor/program.h
index a599cc958e0..f7469eb2192 100644
--- a/runtime/executor/program.h
+++ b/runtime/executor/program.h
@@ -123,7 +123,8 @@ class Program final {
    *
    * @param[in] method_name The name of the method to load.
    * @param[in] memory_manager The allocators to use during initialization and
-   *     execution of the loaded method.
+   *     execution of the loaded method. If `memory_manager.temp_allocator()` is
+   *     null, the runtime will allocate temp memory using `et_pal_allocate()`.
    * @param[in] event_tracer The event tracer to use for this method run.
    *
    * @returns The loaded method on success, or an error on failure.
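Not part of the patch, but a minimal sketch of what this fallback enables. It assumes a method that needs no memory-planned buffers; the pool size and the `"forward"` method name are placeholders:

```cpp
#include <executorch/runtime/executor/memory_manager.h>
#include <executorch/runtime/executor/program.h>

using namespace executorch::runtime;

// Method-lifetime allocations (kernel tables, delegate state, etc.).
static uint8_t method_pool[8 * 1024];

Result<Method> load_without_temp_allocator(Program& program) {
  static MemoryAllocator method_allocator(sizeof(method_pool), method_pool);
  // No planned buffers and, crucially, no temp allocator: the second and
  // third MemoryManager arguments default to nullptr.
  static MemoryManager memory_manager(&method_allocator);
  // Before this change a kernel calling context.allocate_temp() would fail;
  // now Method::load() substitutes an internal PlatformMemoryAllocator
  // backed by et_pal_allocate()/et_pal_free().
  return program.load_method("forward", &memory_manager);
}
```

The temp memory is released after every executed instruction (see the `temp_allocator_->reset()` call above), so pointers returned by `allocate_temp()` must not be cached across instructions.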
diff --git a/runtime/executor/targets.bzl b/runtime/executor/targets.bzl index 46f997a80ad..cc91255d7b5 100644 --- a/runtime/executor/targets.bzl +++ b/runtime/executor/targets.bzl @@ -65,6 +65,9 @@ def define_common_targets(): "tensor_parser_exec_aten.cpp", "tensor_parser{}.cpp".format(aten_suffix if aten_mode else "_portable"), ], + headers = [ + "platform_memory_allocator.h", + ], exported_headers = [ "method.h", "method_meta.h", diff --git a/runtime/executor/test/executor_test.cpp b/runtime/executor/test/executor_test.cpp index da0d53374f1..15b3982297c 100644 --- a/runtime/executor/test/executor_test.cpp +++ b/runtime/executor/test/executor_test.cpp @@ -24,11 +24,13 @@ using exec_aten::SizesType; using exec_aten::Tensor; using executorch::runtime::Error; using executorch::runtime::EValue; -using executorch::runtime::getOpsFn; -using executorch::runtime::hasOpsFn; +using executorch::runtime::get_op_function_from_registry; using executorch::runtime::Kernel; using executorch::runtime::KernelRuntimeContext; -using executorch::runtime::register_kernels; +using executorch::runtime::OpFunction; +using executorch::runtime::register_kernel; +using executorch::runtime::registry_has_op_function; +using executorch::runtime::Result; using executorch::runtime::testing::TensorFactory; namespace pytree = ::executorch::extension::pytree; @@ -87,9 +89,9 @@ TEST_F(ExecutorTest, TensorHalf) { TEST_F(ExecutorTest, RegistryLookupAndCall) { const char* op_name = "aten::add.out"; - ASSERT_TRUE(hasOpsFn(op_name)); - auto func = getOpsFn(op_name); - ASSERT_TRUE(func); + Result func = get_op_function_from_registry(op_name); + ASSERT_EQ(func.error(), Error::Ok); + ASSERT_NE(*func, nullptr); TensorFactory tf; constexpr size_t num_evalues = 4; @@ -108,7 +110,7 @@ TEST_F(ExecutorTest, RegistryLookupAndCall) { kernel_args[4] = &evalues[3]; KernelRuntimeContext context{}; - func(context, kernel_args); + (*func)(context, kernel_args); auto c_ptr = evalues[3].toTensor().const_data_ptr(); ASSERT_EQ(c_ptr[3], 12); } @@ -166,15 +168,15 @@ TEST_F(ExecutorTest, EValueToScalar) { void test_op(KernelRuntimeContext& /*unused*/, EValue** /*unused*/) {} TEST_F(ExecutorTest, OpRegistration) { - auto s1 = register_kernels({Kernel("test", test_op)}); - auto s2 = register_kernels({Kernel("test_2", test_op)}); + auto s1 = register_kernel(Kernel("test", test_op)); + auto s2 = register_kernel(Kernel("test_2", test_op)); ASSERT_EQ(Error::Ok, s1); ASSERT_EQ(Error::Ok, s2); ET_EXPECT_DEATH( - []() { (void)register_kernels({Kernel("test", test_op)}); }(), ""); + []() { (void)register_kernel(Kernel("test", test_op)); }(), ""); - ASSERT_TRUE(hasOpsFn("test")); - ASSERT_TRUE(hasOpsFn("test_2")); + ASSERT_TRUE(registry_has_op_function("test")); + ASSERT_TRUE(registry_has_op_function("test_2")); } TEST_F(ExecutorTest, OpRegistrationWithContext) { @@ -184,25 +186,27 @@ TEST_F(ExecutorTest, OpRegistrationWithContext) { (void)context; *(values[0]) = Scalar(100); }); - auto s1 = register_kernels({op}); + auto s1 = register_kernel(op); ASSERT_EQ(Error::Ok, s1); - ASSERT_TRUE(hasOpsFn("test_op_with_context")); - auto func = getOpsFn("test_op_with_context"); + Result func = + get_op_function_from_registry("test_op_with_context"); + ASSERT_EQ(func.error(), Error::Ok); + EValue values[1]; values[0] = Scalar(0); EValue* kernels[1]; kernels[0] = &values[0]; KernelRuntimeContext context{}; - func(context, kernels); + (*func)(context, kernels); auto val = values[0].toScalar().to(); ASSERT_EQ(val, 100); } TEST_F(ExecutorTest, AddMulAlreadyRegistered) { - 
ASSERT_TRUE(hasOpsFn("aten::add.out")); - ASSERT_TRUE(hasOpsFn("aten::mul.out")); + ASSERT_TRUE(registry_has_op_function("aten::add.out")); + ASSERT_TRUE(registry_has_op_function("aten::mul.out")); } TEST(PyTreeEValue, List) { diff --git a/runtime/executor/test/kernel_integration_test.cpp b/runtime/executor/test/kernel_integration_test.cpp index 3e7da810933..4f1ac0240b9 100644 --- a/runtime/executor/test/kernel_integration_test.cpp +++ b/runtime/executor/test/kernel_integration_test.cpp @@ -34,6 +34,7 @@ using executorch::runtime::FreeableBuffer; using executorch::runtime::Kernel; using executorch::runtime::KernelKey; using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::MemoryAllocator; using executorch::runtime::Method; using executorch::runtime::Program; using executorch::runtime::Result; @@ -59,10 +60,26 @@ struct KernelControl { // returning. Error fail_value = Error::Ok; + // If true, the kernel should allocate temporary memory. + bool allocate_temp_memory = false; + + // If true, the kernel should simulate allocating temporary memory. + bool simulate_temp_memory_allocation = false; + + // The size of the temporary memory to allocate. + int temp_memory_size = 0; + + // The total size of all allocations. + int total_allocated_size = 0; + void reset() { call_count = 0; call_context_fail = false; fail_value = Error::Ok; + allocate_temp_memory = false; + simulate_temp_memory_allocation = false; + temp_memory_size = 0; + total_allocated_size = 0; } /** @@ -94,7 +111,7 @@ struct KernelControl { executorch::runtime::KernelKey("v1/6;0,1|6;0,1|6;0,1|6;0,1"); Kernel kernel = executorch::runtime::Kernel( "aten::add.out", key, KernelControl::kernel_hook); - Error err = executorch::runtime::register_kernels({kernel}); + Error err = executorch::runtime::register_kernel(kernel); EXPECT_EQ(err, Error::Ok); registered_ = true; @@ -117,6 +134,33 @@ struct KernelControl { if (control->call_context_fail) { context.fail(control->fail_value); } + + // Allocate temporary memory. + if (control->allocate_temp_memory) { + Result temp_mem_res = + context.allocate_temp(control->temp_memory_size); + if (temp_mem_res.ok()) { + control->total_allocated_size += control->temp_memory_size; + // We actually use the memory, to test default memory allocation was + // successful. + uint8_t* array = (uint8_t*)(temp_mem_res.get()); + for (int i = 0; i < control->temp_memory_size; i++) { + array[i] = i % 256; + } + } + } + + // Simulate allocating temporary memory. We use this, for testing that when + // a temp allocator is provided, the kernel will use it, instead of + // allocating memory with the default platform memory allocator. + // The provided TempMemoryAllocator class in this file, simulates allocating + // memory instead of actually allocating anything. + if (control->simulate_temp_memory_allocation) { + Result temp_mem_res = + context.allocate_temp(control->temp_memory_size); + control->total_allocated_size += control->temp_memory_size; + EXPECT_EQ(temp_mem_res.error(), Error::Ok); + } } static bool registered_; @@ -126,6 +170,44 @@ struct KernelControl { bool KernelControl::registered_ = false; KernelControl KernelControl::singleton_; +/** + * MemoryAllocator that keeps track of the number/sizes of its allocations, + * to test the case where the user provides a temp allocator. + */ +class TempMemoryAllocator final : public MemoryAllocator { + public: + TempMemoryAllocator() : MemoryAllocator(0, nullptr) {} + + // The number of times allocate() has been called. 
+ int number_of_allocations = 0; + + // The number of times reset() has been called. + int number_of_resets = 0; + + // The amount of memory currently allocated (should go to 0 when reset is + // called). + int currently_allocated_size = 0; + + // The total size of all allocations. + int total_allocated_size = 0; + + void* allocate(size_t size, ET_UNUSED size_t alignment = kDefaultAlignment) + override { + number_of_allocations += 1; + currently_allocated_size += size; + total_allocated_size += size; + // This is a simulation, we don't actually allocate memory. But we need to + // return a non-null pointer, so we return a bad, non-zero address that will + // crash if anyone tries to dereference it. + return (void*)1; + } + + void reset() override { + number_of_resets += 1; + currently_allocated_size = 0; + } +}; + class KernelIntegrationTest : public ::testing::Test { protected: void SetUp() override { @@ -152,7 +234,9 @@ class KernelIntegrationTest : public ::testing::Test { // Load the forward method. mmm_ = std::make_unique( - kDefaultNonConstMemBytes, kDefaultRuntimeMemBytes); + kDefaultNonConstMemBytes, + kDefaultRuntimeMemBytes, + temp_allocator_.get()); Result method = program_->load_method("forward", &mmm_->get()); ASSERT_EQ(method.error(), Error::Ok); method_ = std::make_unique(std::move(method.get())); @@ -185,6 +269,19 @@ class KernelIntegrationTest : public ::testing::Test { // The KernelControl associated with method_. KernelControl* control_; + + // The temp memory allocator provided by the user. By default, none is + // provided. + std::unique_ptr temp_allocator_ = nullptr; +}; + +class KernelTempMemoryAllocatorIntegrationTest : public KernelIntegrationTest { + protected: + void SetUp() override { + // Create a temp allocator for the test before calling the parent SetUp. + temp_allocator_ = std::make_unique(); + KernelIntegrationTest::SetUp(); + } }; TEST_F(KernelIntegrationTest, KernelHookIsCalled) { @@ -222,3 +319,63 @@ TEST_F(KernelIntegrationTest, FailurePropagates) { EXPECT_EQ(err, Error::Ok); EXPECT_EQ(control_->call_count, 3); } + +TEST_F(KernelIntegrationTest, DefaultPlatformMemoryAllocator) { + // Tell the kernel to allocate memory. Since no temp allocator is provided, + // this will allocate memory using the default platform memory allocator. + control_->allocate_temp_memory = true; + + control_->temp_memory_size = 4; + // This is not a simulation. This actually allocates memory, using the + // default platform memory allocator. + Error err = method_->execute(); + EXPECT_EQ(err, Error::Ok); + EXPECT_EQ(control_->call_count, 1); + EXPECT_EQ(control_->total_allocated_size, 4); + + control_->temp_memory_size = 8; + // This is not a simulation. This actually allocates memory, using the + // default platform memory allocator. + err = method_->execute(); + EXPECT_EQ(err, Error::Ok); + EXPECT_EQ(control_->call_count, 2); + EXPECT_EQ(control_->total_allocated_size, 12); +} + +TEST_F(KernelTempMemoryAllocatorIntegrationTest, UsingTempMemoryAllocator) { + // In this test we provide a temp allocator to the method, and tell the kernel + // to allocate memory using it. We want to make sure that the kernel uses the + // temp allocator, and that the temp allocator is reset after the execution. + // Since we are testing that the kernel uses the temp allocator, and not the + // temp allocator itself, we don't need to test the actual allocation of + // memory. 
Therefore, we set simulate_temp_memory_allocation to true, so that + // the kernel will not actually allocate memory, but will instead simulate + // allocating memory. + // The provided TempMemoryAllocator, simulates allocating memory by increasing + // total_allocated_size and currently_allocated_size by the requested size. + // We simulate resetting the allocator by setting currently_allocated_size + // back to 0. + control_->simulate_temp_memory_allocation = true; + + control_->temp_memory_size = 4; + Error err = method_->execute(); + EXPECT_EQ(err, Error::Ok); + EXPECT_EQ(control_->call_count, 1); + EXPECT_EQ(control_->total_allocated_size, 4); + EXPECT_EQ(temp_allocator_->number_of_allocations, 1); + EXPECT_EQ(temp_allocator_->total_allocated_size, 4); + // The temp allocator should have been reset after the execution. + EXPECT_EQ(temp_allocator_->number_of_resets, 1); + EXPECT_EQ(temp_allocator_->currently_allocated_size, 0); + + control_->temp_memory_size = 8; + err = method_->execute(); + EXPECT_EQ(err, Error::Ok); + EXPECT_EQ(control_->call_count, 2); + EXPECT_EQ(control_->total_allocated_size, 12); + EXPECT_EQ(temp_allocator_->number_of_allocations, 2); + EXPECT_EQ(temp_allocator_->total_allocated_size, 12); + // The temp allocator should have been reset after the execution. + EXPECT_EQ(temp_allocator_->number_of_resets, 2); + EXPECT_EQ(temp_allocator_->currently_allocated_size, 0); +} diff --git a/runtime/executor/test/kernel_resolution_test.cpp b/runtime/executor/test/kernel_resolution_test.cpp index 7ce16a8e9f3..aae0ff9b7ea 100644 --- a/runtime/executor/test/kernel_resolution_test.cpp +++ b/runtime/executor/test/kernel_resolution_test.cpp @@ -34,7 +34,7 @@ using executorch::runtime::KernelKey; using executorch::runtime::KernelRuntimeContext; using executorch::runtime::Method; using executorch::runtime::Program; -using executorch::runtime::register_kernels; +using executorch::runtime::register_kernel; using executorch::runtime::Result; using executorch::runtime::TensorMeta; using executorch::runtime::testing::ManagedMemoryManager; @@ -77,7 +77,7 @@ TEST_F(KernelResolutionTest, InitExecutionPlanSuccess) { (void)context; *(stack[0]) = Scalar(100); }); - auto s1 = register_kernels({kernel_1}); + auto s1 = register_kernel(kernel_1); EXPECT_EQ(s1, executorch::runtime::Error::Ok); ManagedMemoryManager mmm(kDefaultNonConstMemBytes, kDefaultRuntimeMemBytes); @@ -109,7 +109,7 @@ TEST_F(KernelResolutionTest, ResolveKernelKeySuccess) { (void)context; *(stack[0]) = Scalar(100); }); - auto s1 = register_kernels({kernel_1}); + auto s1 = register_kernel(kernel_1); EXPECT_EQ(s1, executorch::runtime::Error::Ok); ManagedMemoryManager mmm(kDefaultNonConstMemBytes, kDefaultRuntimeMemBytes); diff --git a/runtime/executor/test/managed_memory_manager.h b/runtime/executor/test/managed_memory_manager.h index 667aa35ca24..a01091527b0 100644 --- a/runtime/executor/test/managed_memory_manager.h +++ b/runtime/executor/test/managed_memory_manager.h @@ -27,7 +27,8 @@ class ManagedMemoryManager { public: ManagedMemoryManager( size_t planned_memory_bytes, - size_t method_allocator_bytes) + size_t method_allocator_bytes, + MemoryAllocator* temp_allocator = nullptr) : planned_memory_buffer_(new uint8_t[planned_memory_bytes]), planned_memory_span_( planned_memory_buffer_.get(), @@ -35,7 +36,7 @@ class ManagedMemoryManager { planned_memory_({&planned_memory_span_, 1}), method_allocator_pool_(new uint8_t[method_allocator_bytes]), method_allocator_(method_allocator_bytes, method_allocator_pool_.get()), - 
memory_manager_(&method_allocator_, &planned_memory_) {} + memory_manager_(&method_allocator_, &planned_memory_, temp_allocator) {} MemoryManager& get() { return memory_manager_; diff --git a/runtime/kernel/operator_registry.cpp b/runtime/kernel/operator_registry.cpp index a8fd50d7b91..78aa0a51732 100644 --- a/runtime/kernel/operator_registry.cpp +++ b/runtime/kernel/operator_registry.cpp @@ -8,53 +8,63 @@ #include -#include -#include #include #include +#include +#include namespace executorch { namespace runtime { -OperatorRegistry& getOperatorRegistry(); -OperatorRegistry& getOperatorRegistry() { - static OperatorRegistry operator_registry; - return operator_registry; -} - -Error register_kernels(const ArrayRef& kernels) { - Error success = getOperatorRegistry().register_kernels(kernels); - if (success == Error::InvalidArgument || success == Error::Internal) { - ET_CHECK_MSG( - false, - "Kernel registration failed with error %" PRIu32 - ", see error log for details.", - static_cast(success)); - } - return success; -} - -Error OperatorRegistry::register_kernels(const ArrayRef& kernels) { - // Operator registration happens in static initialization time when PAL init - // may or may not happen already. Here we are assuming et_pal_init() doesn't - // have any side effect even if falled multiple times. +namespace { + +// Maximum number of operators and their associated kernels that can be +// registered. +#ifdef MAX_KERNEL_NUM +constexpr uint32_t kMaxRegisteredKernels = MAX_KERNEL_NUM; +#else +constexpr uint32_t kMaxOperators = 250; +constexpr uint32_t kMaxKernelsPerOp = 8; +constexpr uint32_t kMaxRegisteredKernels = kMaxOperators * kMaxKernelsPerOp; +#endif + +// Data that backs the kernel table. Since Kernel has a custom default +// constructor (implicitly, because it contains KernelKey, which has a custom +// ctor), some toolchains don't like having a global array of them: it would +// require constructing them at init time. Since we don't care about the values +// until we add each entry to the table, allocate static zeroed memory instead +// and point the table at it. +// @lint-ignore CLANGTIDY facebook-hte-CArray +alignas(sizeof(Kernel)) uint8_t + registered_kernels_data[kMaxRegisteredKernels * sizeof(Kernel)]; + +/// Global table of registered kernels. +Kernel* registered_kernels = reinterpret_cast(registered_kernels_data); + +/// The number of kernels registered in the table. +size_t num_registered_kernels = 0; + +// Registers the kernels, but may return an error. +Error register_kernels_internal(const Span kernels) { + // Operator registration happens in static initialization time before or after + // PAL init, so call it here. It is safe to call multiple times. ::et_pal_init(); - if (kernels.size() + this->num_kernels_ > kMaxNumOfKernels) { + if (kernels.size() + num_registered_kernels > kMaxRegisteredKernels) { ET_LOG( Error, - "The total number of kernels to be registered is larger than the limit %" PRIu32 - ". %" PRIu32 - " kernels are already registered and we're trying to register another %" PRIu32 - " kernels.", - kMaxNumOfKernels, - (uint32_t)this->num_kernels_, + "The total number of kernels to be registered is larger than the limit " + "%" PRIu32 ". 
%" PRIu32 + " kernels are already registered and we're trying to register another " + "%" PRIu32 " kernels.", + kMaxRegisteredKernels, + (uint32_t)num_registered_kernels, (uint32_t)kernels.size()); ET_LOG(Error, "======== Kernels already in the registry: ========"); - for (size_t i = 0; i < this->num_kernels_; i++) { - ET_LOG(Error, "%s", this->kernels_[i].name_); - ET_LOG_KERNEL_KEY(this->kernels_[i].kernel_key_); + for (size_t i = 0; i < num_registered_kernels; i++) { + ET_LOG(Error, "%s", registered_kernels[i].name_); + ET_LOG_KERNEL_KEY(registered_kernels[i].kernel_key_); } ET_LOG(Error, "======== Kernels being registered: ========"); for (size_t i = 0; i < kernels.size(); i++) { @@ -67,9 +77,9 @@ Error OperatorRegistry::register_kernels(const ArrayRef& kernels) { const char* lib_name = et_pal_get_shared_library_name(kernels.data()); for (const auto& kernel : kernels) { - // linear search. This is fine if the number of kernels are small. - for (int32_t i = 0; i < this->num_kernels_; i++) { - Kernel k = this->kernels_[i]; + // Linear search. This is fine if the number of kernels is small. + for (int32_t i = 0; i < num_registered_kernels; i++) { + Kernel k = registered_kernels[i]; if (strcmp(kernel.name_, k.name_) == 0 && kernel.kernel_key_ == k.kernel_key_) { ET_LOG(Error, "Re-registering %s, from %s", k.name_, lib_name); @@ -77,7 +87,7 @@ Error OperatorRegistry::register_kernels(const ArrayRef& kernels) { return Error::InvalidArgument; } } - this->kernels_[this->num_kernels_++] = kernel; + registered_kernels[num_registered_kernels++] = kernel; } ET_LOG( Debug, @@ -87,11 +97,23 @@ Error OperatorRegistry::register_kernels(const ArrayRef& kernels) { return Error::Ok; } -bool hasOpsFn(const char* name, ArrayRef kernel_key) { - return getOperatorRegistry().hasOpsFn(name, kernel_key); +} // namespace + +// Registers the kernels, but panics if an error occurs. Always returns Ok. 
+Error register_kernels(const Span kernels) { + Error success = register_kernels_internal(kernels); + if (success == Error::InvalidArgument || success == Error::Internal) { + ET_CHECK_MSG( + false, + "Kernel registration failed with error %" PRIu32 + ", see error log for details.", + static_cast(success)); + } + return success; } -static int copy_char_as_number_to_buf(char num, char* buf) { +namespace { +int copy_char_as_number_to_buf(char num, char* buf) { if ((char)num < 10) { *buf = '0' + (char)num; buf += 1; @@ -104,10 +126,10 @@ static int copy_char_as_number_to_buf(char num, char* buf) { return 2; } } +} // namespace -void make_kernel_key_string(ArrayRef key, char* buf); - -void make_kernel_key_string(ArrayRef key, char* buf) { +namespace internal { +void make_kernel_key_string(Span key, char* buf) { if (key.empty()) { // If no tensor is present in an op, kernel key does not apply return; @@ -130,61 +152,43 @@ void make_kernel_key_string(ArrayRef key, char* buf) { buf += 1; } } +} // namespace internal -bool OperatorRegistry::hasOpsFn( +bool registry_has_op_function( const char* name, - ArrayRef meta_list) { - char buf[KernelKey::MAX_SIZE] = {0}; - make_kernel_key_string(meta_list, buf); - KernelKey kernel_key = KernelKey(buf); - - for (size_t idx = 0; idx < this->num_kernels_; idx++) { - if (strcmp(this->kernels_[idx].name_, name) == 0) { - if (this->kernels_[idx].kernel_key_.is_fallback() || - this->kernels_[idx].kernel_key_ == kernel_key) { - return true; - } - } - } - - return false; + Span meta_list) { + return get_op_function_from_registry(name, meta_list).ok(); } -const OpFunction& getOpsFn(const char* name, ArrayRef kernel_key) { - return getOperatorRegistry().getOpsFn(name, kernel_key); -} - -const OpFunction& OperatorRegistry::getOpsFn( +Result get_op_function_from_registry( const char* name, - ArrayRef meta_list) { + Span meta_list) { + // @lint-ignore CLANGTIDY facebook-hte-CArray char buf[KernelKey::MAX_SIZE] = {0}; - make_kernel_key_string(meta_list, buf); + internal::make_kernel_key_string(meta_list, buf); KernelKey kernel_key = KernelKey(buf); int32_t fallback_idx = -1; - for (size_t idx = 0; idx < this->num_kernels_; idx++) { - if (strcmp(this->kernels_[idx].name_, name) == 0) { - if (this->kernels_[idx].kernel_key_ == kernel_key) { - return this->kernels_[idx].op_; + for (size_t idx = 0; idx < num_registered_kernels; idx++) { + if (strcmp(registered_kernels[idx].name_, name) == 0) { + if (registered_kernels[idx].kernel_key_ == kernel_key) { + return registered_kernels[idx].op_; } - if (this->kernels_[idx].kernel_key_.is_fallback()) { + if (registered_kernels[idx].kernel_key_.is_fallback()) { fallback_idx = idx; } } } if (fallback_idx != -1) { - return this->kernels_[fallback_idx].op_; + return registered_kernels[fallback_idx].op_; } - ET_CHECK_MSG(false, "kernel '%s' not found.", name); + ET_LOG(Error, "kernel '%s' not found.", name); ET_LOG_TENSOR_META(meta_list); + return Error::OperatorMissing; } -ArrayRef get_kernels() { - return getOperatorRegistry().get_kernels(); -} - -ArrayRef OperatorRegistry::get_kernels() { - return ArrayRef(this->kernels_, this->num_kernels_); +Span get_registered_kernels() { + return {registered_kernels, num_registered_kernels}; } } // namespace runtime diff --git a/runtime/kernel/operator_registry.h b/runtime/kernel/operator_registry.h index f1be83306f8..4b71f436d41 100644 --- a/runtime/kernel/operator_registry.h +++ b/runtime/kernel/operator_registry.h @@ -14,8 +14,11 @@ #include #include #include +#include +#include #include #include 
+ // Debug switch for operator registry #if defined(ET_OP_REGISTRY_DEBUG) #include @@ -48,12 +51,10 @@ using OpFunction = void (*)(KernelRuntimeContext&, EValue**); */ struct TensorMeta { exec_aten::ScalarType dtype_; - ArrayRef dim_order_; + Span dim_order_; TensorMeta() = default; - TensorMeta( - exec_aten::ScalarType dtype, - ArrayRef order) + TensorMeta(exec_aten::ScalarType dtype, Span order) : dtype_(dtype), dim_order_(order) {} bool operator==(const TensorMeta& other) const { @@ -190,73 +191,49 @@ struct Kernel { Kernel() {} }; -// Maximum number of operators and their associated kernels that can be -// registered. -constexpr uint32_t kOperatorTableMaxSize = 250; -constexpr uint32_t kMaxNumOfKernelPerOp = 8; -#ifdef MAX_KERNEL_NUM -constexpr uint32_t kMaxNumOfKernels = MAX_KERNEL_NUM; -#else -constexpr uint32_t kMaxNumOfKernels = - kOperatorTableMaxSize * kMaxNumOfKernelPerOp; -#endif +namespace internal { +void make_kernel_key_string(Span key, char* buf); +} // namespace internal + /** - * See OperatorRegistry::hasOpsFn() + * Checks whether an operator exists with a given name and TensorMeta list. When + * TensorMeta is empty, it means this op does not have specialized kernels, so + * it checks whether it has any fallback kernels. */ -bool hasOpsFn(const char* name, ArrayRef meta_list = {}); +bool registry_has_op_function( + const char* name, + Span meta_list = {}); /** - * See OperatorRegistry::getOpsFn() + * Returns the operator with a given name and TensorMeta list, if present. */ -const OpFunction& getOpsFn( +::executorch::runtime::Result get_op_function_from_registry( const char* name, - ArrayRef meta_list = {}); + Span meta_list = {}); /** - * See OperatorRegistry::get_kernels() + * Returns all registered kernels. */ -ArrayRef get_kernels(); +Span get_registered_kernels(); /** - * See OperatorRegistry::register_kernels(). Notice that the returned Error - * object should be handled internally and the reason for keep returning is to - * satisfy the requirement to run this in static initialization time. + * Registers the provided kernels. + * + * @param[in] kernels Kernel objects to register. + * @retval Error::Ok always. Panics on error. This function needs to return a + * non-void type to run at static initialization time. */ -ET_NODISCARD Error register_kernels(const ArrayRef&); - -struct OperatorRegistry { - public: - OperatorRegistry() : num_kernels_(0) {} - - /** - * Registers the Kernels object (i.e. string name and function reference - * pair). The kernels will be merged into Operators based on the op name. - * - * @param[in] kernels Kernel object - * @retval Error code representing whether registration was successful. - */ - ET_NODISCARD Error register_kernels(const ArrayRef&); - - /** - * Checks whether an operator with a given name and TensorMeta list. - * When TensorMeta is empty, it means this op does not have specialized - * kernels, so it checks whether it has any fallback kernels. - */ - bool hasOpsFn(const char* name, ArrayRef meta_list); +ET_NODISCARD Error register_kernels(const Span); - /** - * Get the operator with a given name and TensorMeta list - */ - const OpFunction& getOpsFn(const char* name, ArrayRef meta_list); - - /** - * Return all registered operators. - */ - ArrayRef get_kernels(); - - private: - Kernel kernels_[kMaxNumOfKernels]; - uint32_t num_kernels_; +/** + * Registers a single kernel. + * + * @param[in] kernel Kernel object to register. + * @retval Error::Ok always. Panics on error. 
This function needs to return a + * non-void type to run at static initialization time. + */ +ET_NODISCARD inline Error register_kernel(const Kernel& kernel) { + return register_kernels({&kernel, 1}); }; } // namespace runtime @@ -266,16 +243,32 @@ namespace torch { namespace executor { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. -using ::executorch::runtime::get_kernels; -using ::executorch::runtime::getOpsFn; -using ::executorch::runtime::hasOpsFn; using ::executorch::runtime::Kernel; using ::executorch::runtime::KernelKey; using ::executorch::runtime::KernelRuntimeContext; -using ::executorch::runtime::OperatorRegistry; using ::executorch::runtime::OpFunction; -using ::executorch::runtime::register_kernels; using ::executorch::runtime::TensorMeta; using RuntimeContext = ::executorch::runtime::KernelRuntimeContext; + +inline ::executorch::runtime::Error register_kernels(ArrayRef kernels) { + return ::executorch::runtime::register_kernels( + {kernels.data(), kernels.size()}); +} +inline OpFunction getOpsFn( + const char* name, + ArrayRef meta_list = {}) { + auto result = ::executorch::runtime::get_op_function_from_registry( + name, {meta_list.data(), meta_list.size()}); + ET_CHECK(result.ok()); // get_op_function_from_registry() logs details. + return *result; +} +inline bool hasOpsFn(const char* name, ArrayRef meta_list = {}) { + return ::executorch::runtime::registry_has_op_function( + name, {meta_list.data(), meta_list.size()}); +} +inline ArrayRef get_kernels() { + Span kernels = ::executorch::runtime::get_registered_kernels(); + return ArrayRef(kernels.data(), kernels.size()); +} } // namespace executor } // namespace torch diff --git a/runtime/kernel/test/kernel_double_registration_test.cpp b/runtime/kernel/test/kernel_double_registration_test.cpp index bef3b46f46b..1739dffd31b 100644 --- a/runtime/kernel/test/kernel_double_registration_test.cpp +++ b/runtime/kernel/test/kernel_double_registration_test.cpp @@ -20,6 +20,7 @@ using executorch::runtime::Error; using executorch::runtime::EValue; using executorch::runtime::Kernel; using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::register_kernels; class KernelDoubleRegistrationTest : public ::testing::Test { public: @@ -33,10 +34,9 @@ TEST_F(KernelDoubleRegistrationTest, Basic) { "aten::add.out", "v1/7;0,1,2,3|7;0,1,2,3|7;0,1,2,3", [](KernelRuntimeContext&, EValue**) {})}; - ArrayRef kernels_array = ArrayRef(kernels); Error err = Error::InvalidArgument; ET_EXPECT_DEATH( - { auto res = register_kernels(kernels_array); }, + { (void)register_kernels({kernels}); }, std::to_string(static_cast(err))); } diff --git a/runtime/kernel/test/operator_registry_max_kernel_num_test.cpp b/runtime/kernel/test/operator_registry_max_kernel_num_test.cpp index 16520358c75..6f6fe4b9e1b 100644 --- a/runtime/kernel/test/operator_registry_max_kernel_num_test.cpp +++ b/runtime/kernel/test/operator_registry_max_kernel_num_test.cpp @@ -19,9 +19,10 @@ using namespace ::testing; using executorch::runtime::ArrayRef; using executorch::runtime::Error; using executorch::runtime::EValue; -using executorch::runtime::hasOpsFn; using executorch::runtime::Kernel; using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::register_kernels; +using executorch::runtime::registry_has_op_function; class OperatorRegistryMaxKernelNumTest : public ::testing::Test { public: @@ -33,11 +34,10 @@ class OperatorRegistryMaxKernelNumTest : public ::testing::Test { // 
Register one kernel when max_kernel_num=1; success TEST_F(OperatorRegistryMaxKernelNumTest, RegisterOneOp) { Kernel kernels[] = {Kernel("foo", [](KernelRuntimeContext&, EValue**) {})}; - ArrayRef kernels_array = ArrayRef(kernels); - auto s1 = register_kernels(kernels_array); + auto s1 = register_kernels({kernels}); EXPECT_EQ(s1, Error::Ok); - EXPECT_FALSE(hasOpsFn("fpp")); - EXPECT_TRUE(hasOpsFn("foo")); + EXPECT_FALSE(registry_has_op_function("fpp")); + EXPECT_TRUE(registry_has_op_function("foo")); } // Register two kernels when max_kernel_num=1; fail @@ -45,8 +45,7 @@ TEST_F(OperatorRegistryMaxKernelNumTest, RegisterTwoOpsFail) { Kernel kernels[] = { Kernel("foo1", [](KernelRuntimeContext&, EValue**) {}), Kernel("foo2", [](KernelRuntimeContext&, EValue**) {})}; - ArrayRef kernels_array = ArrayRef(kernels); ET_EXPECT_DEATH( - { (void)register_kernels(kernels_array); }, + { (void)register_kernels({kernels}); }, "The total number of kernels to be registered is larger than the limit 1"); } diff --git a/runtime/kernel/test/operator_registry_test.cpp b/runtime/kernel/test/operator_registry_test.cpp index 60cd5723cd0..57439a2bd0f 100644 --- a/runtime/kernel/test/operator_registry_test.cpp +++ b/runtime/kernel/test/operator_registry_test.cpp @@ -10,6 +10,8 @@ #include #include +#include +#include #include #include #include @@ -20,15 +22,17 @@ using namespace ::testing; using exec_aten::Scalar; using exec_aten::ScalarType; using exec_aten::Tensor; -using executorch::runtime::ArrayRef; using executorch::runtime::Error; using executorch::runtime::EValue; -using executorch::runtime::hasOpsFn; +using executorch::runtime::get_op_function_from_registry; using executorch::runtime::Kernel; using executorch::runtime::KernelKey; using executorch::runtime::KernelRuntimeContext; using executorch::runtime::OpFunction; using executorch::runtime::register_kernels; +using executorch::runtime::registry_has_op_function; +using executorch::runtime::Result; +using executorch::runtime::Span; using executorch::runtime::TensorMeta; using executorch::runtime::testing::make_kernel_key; @@ -41,18 +45,18 @@ class OperatorRegistryTest : public ::testing::Test { TEST_F(OperatorRegistryTest, Basic) { Kernel kernels[] = {Kernel("foo", [](KernelRuntimeContext&, EValue**) {})}; - ArrayRef kernels_array = ArrayRef(kernels); - auto s1 = register_kernels(kernels_array); - EXPECT_FALSE(hasOpsFn("fpp")); - EXPECT_TRUE(hasOpsFn("foo")); + Span kernels_span(kernels); + (void)register_kernels(kernels_span); + EXPECT_FALSE(registry_has_op_function("fpp")); + EXPECT_TRUE(registry_has_op_function("foo")); } TEST_F(OperatorRegistryTest, RegisterOpsMoreThanOnceDie) { Kernel kernels[] = { Kernel("foo", [](KernelRuntimeContext&, EValue**) {}), Kernel("foo", [](KernelRuntimeContext&, EValue**) {})}; - ArrayRef kernels_array = ArrayRef(kernels); - ET_EXPECT_DEATH({ auto res = register_kernels(kernels_array); }, ""); + Span kernels_span = Span(kernels); + ET_EXPECT_DEATH({ (void)register_kernels(kernels_span); }, ""); } constexpr int BUF_SIZE = KernelKey::MAX_SIZE; @@ -91,24 +95,31 @@ TEST_F(OperatorRegistryTest, RegisterKernels) { (void)context; *(stack[0]) = Scalar(100); }); - auto s1 = register_kernels({kernel_1}); + auto s1 = register_kernels({&kernel_1, 1}); EXPECT_EQ(s1, Error::Ok); Tensor::DimOrderType dims[] = {0, 1, 2, 3}; - auto dim_order_type = ArrayRef(dims, 4); + auto dim_order_type = Span(dims, 4); TensorMeta meta[] = {TensorMeta(ScalarType::Long, dim_order_type)}; - ArrayRef user_kernel_key = ArrayRef(meta, 1); - 
EXPECT_TRUE(hasOpsFn("test::boo", user_kernel_key)); + Span user_kernel_key(meta); + // no fallback kernel is registered - EXPECT_FALSE(hasOpsFn("test::boo", {})); - OpFunction func = getOpsFn("test::boo", user_kernel_key); + EXPECT_FALSE(registry_has_op_function("test::boo", {})); + Result fallback_func = + get_op_function_from_registry("test::boo", {}); + EXPECT_NE(fallback_func.error(), Error::Ok); + + EXPECT_TRUE(registry_has_op_function("test::boo", user_kernel_key)); + Result func = + get_op_function_from_registry("test::boo", user_kernel_key); + EXPECT_EQ(func.error(), Error::Ok); EValue values[1]; values[0] = Scalar(0); EValue* kernels[1]; kernels[0] = &values[0]; KernelRuntimeContext context{}; - func(context, kernels); + (*func)(context, kernels); auto val = values[0].toScalar().to(); ASSERT_EQ(val, 100); @@ -136,18 +147,18 @@ TEST_F(OperatorRegistryTest, RegisterTwoKernels) { auto s1 = register_kernels(kernels); // has both kernels Tensor::DimOrderType dims[] = {0, 1, 2, 3}; - auto dim_order_type = ArrayRef(dims, 4); + auto dim_order_type = Span(dims, 4); TensorMeta meta[] = {TensorMeta(ScalarType::Long, dim_order_type)}; - ArrayRef user_kernel_key_1 = ArrayRef(meta, 1); + Span user_kernel_key_1(meta); TensorMeta meta_2[] = {TensorMeta(ScalarType::Float, dim_order_type)}; - ArrayRef user_kernel_key_2 = ArrayRef(meta_2, 1); - - EXPECT_TRUE(hasOpsFn("test::bar", user_kernel_key_1)); - EXPECT_TRUE(hasOpsFn("test::bar", user_kernel_key_2)); + Span user_kernel_key_2(meta_2); // no fallback kernel is registered - EXPECT_FALSE(hasOpsFn("test::bar", {})); + EXPECT_FALSE(registry_has_op_function("test::bar", {})); + Result fallback_func = + get_op_function_from_registry("test::bar", {}); + EXPECT_NE(fallback_func.error(), Error::Ok); EValue values[1]; values[0] = Scalar(0); @@ -156,16 +167,22 @@ TEST_F(OperatorRegistryTest, RegisterTwoKernels) { KernelRuntimeContext context{}; // test kernel_1 - OpFunction func_1 = getOpsFn("test::bar", user_kernel_key_1); - func_1(context, evalues); + EXPECT_TRUE(registry_has_op_function("test::bar", user_kernel_key_1)); + Result func_1 = + get_op_function_from_registry("test::bar", user_kernel_key_1); + EXPECT_EQ(func_1.error(), Error::Ok); + (*func_1)(context, evalues); auto val_1 = values[0].toScalar().to(); ASSERT_EQ(val_1, 100); // test kernel_2 + EXPECT_TRUE(registry_has_op_function("test::bar", user_kernel_key_2)); + Result func_2 = + get_op_function_from_registry("test::bar", user_kernel_key_2); + EXPECT_EQ(func_2.error(), Error::Ok); values[0] = Scalar(0); - OpFunction func_2 = getOpsFn("test::bar", user_kernel_key_2); - func_2(context, evalues); + (*func_2)(context, evalues); auto val_2 = values[0].toScalar().to(); ASSERT_EQ(val_2, 50); @@ -202,27 +219,26 @@ TEST_F(OperatorRegistryTest, ExecutorChecksKernel) { (void)context; *(stack[0]) = Scalar(100); }); - auto s1 = register_kernels({kernel_1}); + auto s1 = register_kernels({&kernel_1, 1}); EXPECT_EQ(s1, Error::Ok); Tensor::DimOrderType dims[] = {0, 1, 2, 3}; - auto dim_order_type = ArrayRef(dims, 4); + auto dim_order_type = Span(dims, 4); TensorMeta meta[] = {TensorMeta(ScalarType::Long, dim_order_type)}; - ArrayRef user_kernel_key_1 = ArrayRef(meta, 1); - EXPECT_TRUE(hasOpsFn("test::qux", user_kernel_key_1)); + Span user_kernel_key_1(meta); + EXPECT_TRUE(registry_has_op_function("test::qux", user_kernel_key_1)); Tensor::DimOrderType dims_channel_first[] = {0, 3, 1, 2}; auto dim_order_type_channel_first = - ArrayRef(dims_channel_first, 4); + Span(dims_channel_first, 4); TensorMeta 
meta_channel_first[] = { TensorMeta(ScalarType::Long, dim_order_type_channel_first)}; - ArrayRef user_kernel_key_2 = - ArrayRef(meta_channel_first, 1); - EXPECT_FALSE(hasOpsFn("test::qux", user_kernel_key_2)); + Span user_kernel_key_2(meta_channel_first); + EXPECT_FALSE(registry_has_op_function("test::qux", user_kernel_key_2)); TensorMeta meta_float[] = {TensorMeta(ScalarType::Float, dim_order_type)}; - ArrayRef user_kernel_key_3 = ArrayRef(meta_float, 1); - EXPECT_FALSE(hasOpsFn("test::qux", ArrayRef(user_kernel_key_3))); + Span user_kernel_key_3(meta_float); + EXPECT_FALSE(registry_has_op_function("test::qux", user_kernel_key_3)); } TEST_F(OperatorRegistryTest, ExecutorUsesKernel) { @@ -235,23 +251,25 @@ TEST_F(OperatorRegistryTest, ExecutorUsesKernel) { (void)context; *(stack[0]) = Scalar(100); }); - auto s1 = register_kernels({kernel_1}); + auto s1 = register_kernels({&kernel_1, 1}); EXPECT_EQ(s1, Error::Ok); Tensor::DimOrderType dims[] = {0, 1, 2, 3}; - auto dim_order_type = ArrayRef(dims, 4); + auto dim_order_type = Span(dims, 4); TensorMeta meta[] = {TensorMeta(ScalarType::Long, dim_order_type)}; - ArrayRef user_kernel_key_1 = ArrayRef(meta, 1); - EXPECT_TRUE(hasOpsFn("test::quux", ArrayRef(meta))); + Span user_kernel_key_1(meta); - OpFunction func = getOpsFn("test::quux", ArrayRef(meta)); + EXPECT_TRUE(registry_has_op_function("test::quux", user_kernel_key_1)); + Result func = + get_op_function_from_registry("test::quux", user_kernel_key_1); + EXPECT_EQ(func.error(), Error::Ok); EValue values[1]; values[0] = Scalar(0); EValue* kernels[1]; kernels[0] = &values[0]; KernelRuntimeContext context{}; - func(context, kernels); + (*func)(context, kernels); auto val = values[0].toScalar().to(); ASSERT_EQ(val, 100); @@ -265,20 +283,21 @@ TEST_F(OperatorRegistryTest, ExecutorUsesFallbackKernel) { (void)context; *(stack[0]) = Scalar(100); }); - auto s1 = register_kernels({kernel_1}); + auto s1 = register_kernels({&kernel_1, 1}); EXPECT_EQ(s1, Error::Ok); - EXPECT_TRUE(hasOpsFn("test::corge")); - EXPECT_TRUE(hasOpsFn("test::corge", ArrayRef())); + EXPECT_TRUE(registry_has_op_function("test::corge")); + EXPECT_TRUE(registry_has_op_function("test::corge", {})); - OpFunction func = getOpsFn("test::corge", ArrayRef()); + Result func = get_op_function_from_registry("test::corge", {}); + EXPECT_EQ(func.error(), Error::Ok); EValue values[1]; values[0] = Scalar(0); EValue* kernels[1]; kernels[0] = &values[0]; KernelRuntimeContext context{}; - func(context, kernels); + (*func)(context, kernels); auto val = values[0].toScalar().to(); ASSERT_EQ(val, 100); diff --git a/runtime/kernel/test/test_kernel_manual_registration.cpp b/runtime/kernel/test/test_kernel_manual_registration.cpp index c150b61ad73..de8853c7813 100644 --- a/runtime/kernel/test/test_kernel_manual_registration.cpp +++ b/runtime/kernel/test/test_kernel_manual_registration.cpp @@ -15,7 +15,7 @@ using namespace ::testing; using executorch::runtime::Error; -using executorch::runtime::hasOpsFn; +using executorch::runtime::registry_has_op_function; class KernelManualRegistrationTest : public ::testing::Test { public: @@ -26,15 +26,15 @@ class KernelManualRegistrationTest : public ::testing::Test { TEST_F(KernelManualRegistrationTest, ManualRegister) { // Before registering, we can't find the add operator. - EXPECT_FALSE(hasOpsFn("aten::add.out")); + EXPECT_FALSE(registry_has_op_function("aten::add.out")); // Call the generated registration function. 
Error result = torch::executor::register_all_kernels(); EXPECT_EQ(result, Error::Ok); // We can now find the registered add operator. - EXPECT_TRUE(hasOpsFn("aten::add.out")); + EXPECT_TRUE(registry_has_op_function("aten::add.out")); // We can't find a random other operator. - EXPECT_FALSE(hasOpsFn("fpp")); + EXPECT_FALSE(registry_has_op_function("fpp")); } diff --git a/runtime/kernel/test/test_util.h b/runtime/kernel/test/test_util.h index 23993fd39d6..0c6c651af32 100644 --- a/runtime/kernel/test/test_util.h +++ b/runtime/kernel/test/test_util.h @@ -16,9 +16,6 @@ namespace executorch { namespace runtime { -// Defined in //executorch/runtime/kernel/operator_registry.cpp. -void make_kernel_key_string(ArrayRef key, char* buf); - namespace testing { inline void make_kernel_key( @@ -28,12 +25,11 @@ inline void make_kernel_key( char* buf) { std::vector meta; for (auto& t : tensors) { - ArrayRef dim_order( - t.second.data(), t.second.size()); + Span dim_order(t.second.data(), t.second.size()); meta.emplace_back(t.first, dim_order); } - auto meatadata = ArrayRef(meta.data(), meta.size()); - make_kernel_key_string(meatadata, buf); + Span metadata(meta.data(), meta.size()); + internal::make_kernel_key_string(metadata, buf); } } // namespace testing diff --git a/runtime/platform/compiler.h b/runtime/platform/compiler.h index c7f603756c8..9a8e18c0f1e 100644 --- a/runtime/platform/compiler.h +++ b/runtime/platform/compiler.h @@ -13,17 +13,32 @@ #pragma once -// Compiler support checks. +/* + * Compiler support checks. Follows the logic used by pytorch/c10/util/C++17.h + * but may support older versions. + */ + +// https://gcc.gnu.org/projects/cxx-status.html#cxx17 +#if !defined(__clang__) && !defined(_MSC_VER) && defined(__GNUC__) && \ + __GNUC__ < 7 +#error \ + "You're trying to build ExecuTorch with a too old version of GCC. We need GCC 7 or later." +#endif + +// https://clang.llvm.org/cxx_status.html#cxx17 +#if defined(__clang__) && __clang_major__ < 5 +#error \ + "You're trying to build ExecuTorch with a too old version of Clang. We need Clang 5 or later." +#endif -#if !defined(__cplusplus) -#error ExecuTorch must be compiled using a C++ compiler. +#if (defined(_MSC_VER) && (!defined(_MSVC_LANG) || _MSVC_LANG < 201703L)) || \ + (!defined(_MSC_VER) && __cplusplus < 201703L) +#error "You need C++17 to compile ExecuTorch" #endif -#if __cplusplus < 201103L && (!defined(_MSC_VER) || _MSC_VER < 1600) && \ - (!defined(__GNUC__) || \ - (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__ < 40400)) -#error ExecuTorch must use a compiler supporting at least the C++11 standard. 
-#error __cplusplus _MSC_VER __GNUC__ __GNUC_MINOR__ __GNUC_PATCHLEVEL__
+#if defined(_WIN32) && (defined(min) || defined(max))
+#error \
+    "Macro clash with min and max -- define NOMINMAX when compiling your program on Windows"
 #endif
 
 /*
diff --git a/runtime/platform/default/minimal.cpp b/runtime/platform/default/minimal.cpp
index e1db2083f4a..8236f993188 100644
--- a/runtime/platform/default/minimal.cpp
+++ b/runtime/platform/default/minimal.cpp
@@ -47,3 +47,9 @@ void et_pal_emit_log_message(
     ET_UNUSED size_t line,
     ET_UNUSED const char* message,
     ET_UNUSED size_t length) {}
+
+void* et_pal_allocate(ET_UNUSED size_t size) {
+  return nullptr;
+}
+
+void et_pal_free(ET_UNUSED void* ptr) {}
diff --git a/runtime/platform/default/posix.cpp b/runtime/platform/default/posix.cpp
index cfc8cafc491..aba504f53e0 100644
--- a/runtime/platform/default/posix.cpp
+++ b/runtime/platform/default/posix.cpp
@@ -170,3 +170,26 @@ void et_pal_emit_log_message(
       message);
   fflush(ET_LOG_OUTPUT_FILE);
 }
+
+/**
+ * NOTE: Core runtime code must not call this directly. It may only be called by
+ * a MemoryAllocator wrapper.
+ *
+ * Allocates size bytes of memory via malloc.
+ *
+ * @param[in] size Number of bytes to allocate.
+ * @returns the allocated memory, or nullptr on failure. Must be freed using
+ *     et_pal_free().
+ */
+void* et_pal_allocate(size_t size) {
+  return malloc(size);
+}
+
+/**
+ * Frees memory allocated by et_pal_allocate().
+ *
+ * @param[in] ptr Pointer to memory to free. May be nullptr.
+ */
+void et_pal_free(void* ptr) {
+  free(ptr);
+}
diff --git a/runtime/platform/platform.h b/runtime/platform/platform.h
index e29dad8e9a8..03cdef8eb2f 100644
--- a/runtime/platform/platform.h
+++ b/runtime/platform/platform.h
@@ -115,4 +115,23 @@ void et_pal_emit_log_message(
     const char* message,
     size_t length) ET_INTERNAL_PLATFORM_WEAKNESS;
 
+/**
+ * NOTE: Core runtime code must not call this directly. It may only be called by
+ * a MemoryAllocator wrapper.
+ *
+ * Allocates size bytes of memory.
+ *
+ * @param[in] size Number of bytes to allocate.
+ * @returns the allocated memory, or nullptr on failure. Must be freed using
+ *     et_pal_free().
+ */
+void* et_pal_allocate(size_t size) ET_INTERNAL_PLATFORM_WEAKNESS;
+
+/**
+ * Frees memory allocated by et_pal_allocate().
+ *
+ * @param[in] ptr Pointer to memory to free. May be nullptr.
+ */
+void et_pal_free(void* ptr) ET_INTERNAL_PLATFORM_WEAKNESS;
+
 } // extern "C"
diff --git a/runtime/platform/test/executor_pal_override_test.cpp b/runtime/platform/test/executor_pal_override_test.cpp
index bb9ea2ce589..9bc500e652e 100644
--- a/runtime/platform/test/executor_pal_override_test.cpp
+++ b/runtime/platform/test/executor_pal_override_test.cpp
@@ -53,12 +53,29 @@ class PalSpy : public PlatformIntercept {
     last_log_message_args.length = length;
   }
 
+  void* allocate(size_t size) override {
+    ++allocate_call_count;
+    last_allocated_size = size;
+    last_allocated_ptr = (void*)0x1234;
+    return nullptr;
+  }
+
+  void free(void* ptr) override {
+    ++free_call_count;
+    last_freed_ptr = ptr;
+  }
+
   virtual ~PalSpy() = default;
 
   size_t init_call_count = 0;
   size_t current_ticks_call_count = 0;
   size_t emit_log_message_call_count = 0;
   et_tick_ratio_t tick_ns_multiplier = {1, 1};
+  size_t allocate_call_count = 0;
+  size_t free_call_count = 0;
+  size_t last_allocated_size = 0;
+  void* last_allocated_ptr = nullptr;
+  void* last_freed_ptr = nullptr;
 
   /// The args that were passed to the most recent call to emit_log_message().
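   /// Captured by the spy so tests can assert on the exact values the runtime
   /// handed to the PAL logging hook.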
   struct {
@@ -158,4 +175,33 @@ TEST(ExecutorPalOverrideTest, TickToNsMultiplier) {
   EXPECT_EQ(et_pal_ticks_to_ns_multiplier().denominator, 1);
 }
 
+TEST(ExecutorPalOverrideTest, AllocateSmokeTest) {
+  PalSpy spy;
+  InterceptWith iw(spy);
+
+  // Validate that et_pal_allocate is overridden.
+  EXPECT_EQ(spy.allocate_call_count, 0);
+  EXPECT_EQ(spy.last_allocated_ptr, nullptr);
+  et_pal_allocate(4);
+  EXPECT_EQ(spy.allocate_call_count, 1);
+  EXPECT_EQ(spy.last_allocated_size, 4);
+  EXPECT_EQ(spy.last_allocated_ptr, (void*)0x1234);
+}
+
+TEST(ExecutorPalOverrideTest, FreeSmokeTest) {
+  PalSpy spy;
+  InterceptWith iw(spy);
+
+  et_pal_allocate(4);
+  EXPECT_EQ(spy.last_allocated_size, 4);
+  EXPECT_EQ(spy.last_allocated_ptr, (void*)0x1234);
+
+  // Validate that et_pal_free is overridden.
+  EXPECT_EQ(spy.free_call_count, 0);
+  EXPECT_EQ(spy.last_freed_ptr, nullptr);
+  et_pal_free(spy.last_allocated_ptr);
+  EXPECT_EQ(spy.free_call_count, 1);
+  EXPECT_EQ(spy.last_freed_ptr, (void*)0x1234);
+}
+
 #endif
diff --git a/runtime/platform/test/stub_platform.cpp b/runtime/platform/test/stub_platform.cpp
index f7ad2f9ee63..8cee404e4e1 100644
--- a/runtime/platform/test/stub_platform.cpp
+++ b/runtime/platform/test/stub_platform.cpp
@@ -75,6 +75,16 @@ void et_pal_emit_log_message(
       timestamp, level, filename, function, line, message, length);
 }
 
+void* et_pal_allocate(size_t size) {
+  ASSERT_INTERCEPT_INSTALLED();
+  return platform_intercept->allocate(size);
+}
+
+void et_pal_free(void* ptr) {
+  ASSERT_INTERCEPT_INSTALLED();
+  platform_intercept->free(ptr);
+}
+
 } // extern "C"
 
 #include
diff --git a/runtime/platform/test/stub_platform.h b/runtime/platform/test/stub_platform.h
index af3756f3136..de5599b53b0 100644
--- a/runtime/platform/test/stub_platform.h
+++ b/runtime/platform/test/stub_platform.h
@@ -45,6 +45,12 @@
       ET_UNUSED const char* message,
       ET_UNUSED size_t length) {}
 
+  virtual void* allocate(ET_UNUSED size_t size) {
+    return nullptr;
+  }
+
+  virtual void free(ET_UNUSED void* ptr) {}
+
   virtual ~PlatformIntercept() = default;
 };
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 5dbe47c8671..b651bd2dd93 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -19,8 +19,7 @@ cmake_minimum_required(VERSION 3.19)
 
 project(size_test)
 
-# Use C++11 for size test.
-set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD 17)
 
 set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..)
diff --git a/test/build_size_test.sh b/test/build_size_test.sh
index 540b78e9f05..428e351cf08 100644
--- a/test/build_size_test.sh
+++ b/test/build_size_test.sh
@@ -11,29 +11,12 @@
 set -e
 
 # shellcheck source=/dev/null
 source "$(dirname "${BASH_SOURCE[0]}")/../.ci/scripts/utils.sh"
 
-# Set compile flags for Clang and GCC.
-# -Wno-gnu allows us to use gnu statement-expressions.
-# -Werror -Wc++17* ensure we do not use features from C++17.
-CXX_FLAGS="-Wno-gnu"
-compiler=$(cc --version)
-if [[ $compiler == *"clang"* ]]; then
-  CXX_FLAGS="$CXX_FLAGS -Werror -Wc++17-extensions -Wc++14-extensions"
-elif [[ $compiler == *"cc"* ]]; then
-  CXX_FLAGS="$CXX_FLAGS -Werror -Wc++17-compat -Wc++14-compat"
-else
-  echo "Unknown compiler: $compiler"
-  exit 1
-fi
-echo "Using compiler $compiler with flags $CXX_FLAGS"
-
 cmake_install_executorch_lib() {
   echo "Installing libexecutorch.a"
   rm -rf cmake-out
   retry cmake -DBUCK2="$BUCK2" \
-          -DCMAKE_CXX_STANDARD=11 \
           -DCMAKE_CXX_STANDARD_REQUIRED=ON \
-          -DCMAKE_CXX_FLAGS="$CXX_FLAGS" \
           -DCMAKE_INSTALL_PREFIX=cmake-out \
           -DCMAKE_BUILD_TYPE=Release \
           -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \
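Note on the new PAL hooks above: et_pal_allocate() and et_pal_free() are declared with ET_INTERNAL_PLATFORM_WEAKNESS, so an embedded port can supply strong definitions of its own, just as stub_platform.cpp does for the tests. A minimal sketch of such an override, assuming a bare-metal target without malloc(); the 16 KiB pool size and the bump-allocation policy are illustrative assumptions, not something this change prescribes:

// Illustrative sketch only: a bump allocator over a static pool, standing in
// for the malloc-based default in posix.cpp on a target without a heap.
#include <cstddef>
#include <cstdint>

namespace {
// Backing storage for the bump allocator. Max-aligned so every block handed
// out below is suitably aligned for any object type.
alignas(alignof(std::max_align_t)) uint8_t pool[16 * 1024];
size_t used = 0;
} // namespace

extern "C" {

void* et_pal_allocate(size_t size) {
  // Round the request up so the next block stays max-aligned.
  const size_t aligned = (size + alignof(std::max_align_t) - 1) &
      ~(alignof(std::max_align_t) - 1);
  if (used + aligned > sizeof(pool)) {
    return nullptr; // Pool exhausted; the contract allows returning nullptr.
  }
  void* ptr = &pool[used];
  used += aligned;
  return ptr;
}

void et_pal_free(void* ptr) {
  (void)ptr; // A bump allocator cannot reclaim individual blocks.
}

} // extern "C"

Because the default definitions are weak, linking strong definitions like these should replace them with no other build changes, mirroring how et_pal_emit_log_message is already overridden.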