diff --git a/.ci/scripts/build-qnn-sdk.sh b/.ci/scripts/build-qnn-sdk.sh index c48ac2056aa..2492b1fd3d6 100644 --- a/.ci/scripts/build-qnn-sdk.sh +++ b/.ci/scripts/build-qnn-sdk.sh @@ -11,7 +11,7 @@ set -o xtrace build_qnn_backend() { echo "Start building qnn backend." export ANDROID_NDK_ROOT=/opt/ndk - export QNN_SDK_ROOT=/tmp/qnn/2.23.0.240531 + export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728 export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)" bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number 2 --release diff --git a/.ci/scripts/setup-qnn-deps.sh b/.ci/scripts/setup-qnn-deps.sh index 3b39e1aafe3..92ffd07bccc 100644 --- a/.ci/scripts/setup-qnn-deps.sh +++ b/.ci/scripts/setup-qnn-deps.sh @@ -7,14 +7,18 @@ set -ex +verify_pkg_installed() { + echo $(dpkg-query -W --showformat='${Status}\n' $1|grep "install ok installed") +} + install_qnn() { echo "Start installing qnn." QNN_INSTALLATION_DIR=/tmp/qnn mkdir -p "${QNN_INSTALLATION_DIR}" - curl -Lo /tmp/v2.23.0.24.06.24.zip "https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.23.0.24.06.24.zip" + curl -Lo /tmp/v2.25.0.24.07.28.zip "https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.25.0.240728.zip" echo "Finishing downloading qnn sdk." - unzip -qo /tmp/v2.23.0.24.06.24.zip -d /tmp + unzip -qo /tmp/v2.25.0.24.07.28.zip -d /tmp echo "Finishing unzip qnn sdk." @@ -26,4 +30,22 @@ install_qnn() { ls -lah "${QNN_INSTALLATION_DIR}" } +setup_libc++() { + sudo apt-get update + pkgs_to_check=('libc++-dev') + j=0 + while [ $j -lt ${#pkgs_to_check[*]} ]; do + install_status=$(verify_pkg_installed ${pkgs_to_check[$j]}) + if [ "$install_status" == "" ]; then + sudo apt-get install -y ${pkgs_to_check[$j]} + if [[ $? -ne 0 ]]; then + echo "ERROR: Failed to install required packages for libc++" + exit 1 + fi + fi + j=$(( $j +1)); + done +} + +setup_libc++ install_qnn diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index 290ece7b8e6..5721b7fd607 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -75,7 +75,7 @@ echo "COREML option ${COREML}" if [[ "${MODE}" =~ .*qnn.* ]]; then QNN=ON export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)" - export QNN_SDK_ROOT=/tmp/qnn/2.23.0.240531 + export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728 export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang" export PYTHONPATH=".." cp schema/program.fbs exir/_serialize/program.fbs diff --git a/.ci/scripts/test_llava.sh b/.ci/scripts/test_llava.sh index 7dc6d15e407..8ac87b2302d 100644 --- a/.ci/scripts/test_llava.sh +++ b/.ci/scripts/test_llava.sh @@ -33,6 +33,7 @@ if hash nproc &> /dev/null; then NPROC=$(nproc); fi EXECUTORCH_COMMON_CMAKE_ARGS=" \ -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh index e589337666d..0b8574573fb 100755 --- a/.ci/scripts/test_model.sh +++ b/.ci/scripts/test_model.sh @@ -209,7 +209,13 @@ elif [[ "${BACKEND}" == "coreml" ]]; then fi elif [[ "${BACKEND}" == "xnnpack" ]]; then echo "Testing ${MODEL_NAME} with xnnpack..." 
- test_model_with_xnnpack true true + WITH_QUANTIZATION=true + WITH_DELEGATION=true + if [[ "$MODEL_NAME" == "mobilebert" ]]; then + # TODO(T197452682) + WITH_QUANTIZATION=false + fi + test_model_with_xnnpack "${WITH_QUANTIZATION}" "${WITH_DELEGATION}" if [[ $? -eq 0 ]]; then prepare_artifacts_upload fi diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index c98fa98bb26..ba58435c69a 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -178,6 +178,7 @@ jobs: upload-models: needs: export-models runs-on: linux.2xlarge + if: always() # Continue this job regardless of previous job outcome steps: - name: Download the models from GitHub uses: actions/download-artifact@v3 diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml index 416d1ca805e..cb1b2b6a1b2 100644 --- a/.github/workflows/apple-perf.yml +++ b/.github/workflows/apple-perf.yml @@ -165,6 +165,8 @@ jobs: # Test llama2 if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then DELEGATE_CONFIG="xnnpack+custom+qe" + elif [[ ${{ matrix.delegate }} == "coreml" ]]; then + DELEGATE_CONFIG="coreml" fi PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ bash .ci/scripts/test_llama.sh "${{ matrix.model }}" "${BUILD_MODE}" "${DTYPE}" "${DELEGATE_CONFIG}" "${ARTIFACTS_DIR_NAME}" @@ -177,6 +179,7 @@ jobs: upload-models: needs: export-models runs-on: linux.2xlarge + if: always() # Continue this job regardless of previous job outcome steps: - name: Download the models from GitHub uses: actions/download-artifact@v3 diff --git a/.lintrunner.toml b/.lintrunner.toml index c28512c5986..eca965bb1e6 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -74,6 +74,8 @@ exclude_patterns = [ # NB: Objective-C is not supported 'examples/apple/**', 'examples/demo-apps/apple_ios/**', + # File contains @generated + 'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h', ] command = [ 'python', @@ -177,6 +179,8 @@ exclude_patterns = [ '**/*.bat', '**/*.jpg', '**/*.jar', + # File contains @generated + 'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h', ] command = [ 'python', diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2ad23f84d17..d434c1fe198 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -131,9 +131,7 @@ for detailed advice. #### C++ language version -**C++11.** - -NOTE: The code does not yet fully conform to this, and some files require C++17. +**C++17.** Rationale: This is a compromise between being compatible with older, proprietary toolchains, and having access to relatively modern C++ features. diff --git a/backends/apple/coreml/compiler/coreml_preprocess.py b/backends/apple/coreml/compiler/coreml_preprocess.py index 375fdf406b2..5084405c468 100644 --- a/backends/apple/coreml/compiler/coreml_preprocess.py +++ b/backends/apple/coreml/compiler/coreml_preprocess.py @@ -3,6 +3,7 @@ # CoreML backend for delegating a EdgeProgram to CoreML. 
import json +import logging import shutil import uuid @@ -14,6 +15,7 @@ from typing import Any, Dict, final, List, Optional, Tuple import coremltools as ct +import coremltools.optimize as cto import executorchcoreml from executorch.exir.backend.backend_details import ( @@ -23,12 +25,16 @@ ) from executorch.exir.backend.compile_spec_schema import CompileSpec +logger = logging.getLogger(__name__) +logger.setLevel(logging.WARNING) + class COMPILE_SPEC_KEYS(Enum): COMPUTE_UNITS = "compute_units" MODEL_TYPE = "model_type" MIN_DEPLOYMENT_TARGET = "min_deployment_target" MODEL_COMPUTE_PRECISION = "model_compute_precision" + OP_LINEAR_QUANTIZER_CONFIG = "op_linear_quantizer_config" class MODEL_PATHS(Enum): @@ -169,12 +175,44 @@ def generate_compute_unit_compile_spec( compute_unit.name.lower().encode("utf-8"), ) + @staticmethod + def generate_op_linear_quantizer_config_compile_spec( + op_linear_quantizer_config: Dict, + ) -> CompileSpec: + """ + Returns the compile spec representing the model post conversion quantization, + which is a dict that will construct cto.coreml.OpLinearQuantizerConfig + """ + str_representation = json.dumps(op_linear_quantizer_config) + byte_representation = str_representation.encode("utf-8") + return CompileSpec( + COMPILE_SPEC_KEYS.OP_LINEAR_QUANTIZER_CONFIG.value, + byte_representation, + ) + + @staticmethod + def op_linear_quantizer_config_from_compile_specs( + compile_specs: List[CompileSpec], + ) -> cto.coreml.OpLinearQuantizerConfig: + """ + Returns the model's post conversion quantization by parsing the list of compile specs. + """ + for compile_spec in compile_specs: + if compile_spec.key == COMPILE_SPEC_KEYS.OP_LINEAR_QUANTIZER_CONFIG.value: + config_dict_str = compile_spec.value.decode("utf-8") + config_dict = json.loads(config_dict_str) + config = cto.coreml.OpLinearQuantizerConfig._from_dict(config_dict) + return config + + return None + @staticmethod def generate_compile_specs( compute_unit: ct.ComputeUnit = ct.ComputeUnit.ALL, minimum_deployment_target: ct.target = ct.target.iOS15, compute_precision: ct.precision = ct.precision.FLOAT16, model_type: MODEL_TYPE = MODEL_TYPE.MODEL, + op_linear_quantizer_config: Optional[Dict] = None, ) -> List[CompileSpec]: """ Returns the list of compile specs that's used by CoreMLBackend to lower the module. 
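For reference, a hedged sketch of how a caller might exercise the new `op_linear_quantizer_config` path end to end. The dict keys mirror `cto.coreml.OpLinearQuantizerConfig` fields; the concrete values (iOS17 target, int4 per-block quantization) are illustrative assumptions, not part of this change:

```python
import coremltools as ct

from executorch.backends.apple.coreml.compiler import CoreMLBackend

# Illustrative config only; keys follow cto.coreml.OpLinearQuantizerConfig,
# values are assumptions for this sketch.
compile_specs = CoreMLBackend.generate_compile_specs(
    minimum_deployment_target=ct.target.iOS17,
    op_linear_quantizer_config={
        "mode": "linear_symmetric",
        "dtype": "int4",
        "granularity": "per_block",
        "block_size": 32,
    },
)
```

The spec is serialized as JSON and reconstructed inside `preprocess()` via `OpLinearQuantizerConfig._from_dict`, as the surrounding hunks show.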
@@ -192,6 +230,12 @@ def generate_compile_specs( CoreMLBackend.generate_compute_precision_compile_spec(compute_precision) ) compile_specs.append(CoreMLBackend.generate_model_type_compile_spec(model_type)) + if op_linear_quantizer_config is not None: + compile_specs.append( + CoreMLBackend.generate_op_linear_quantizer_config_compile_spec( + op_linear_quantizer_config + ) + ) return compile_specs @@ -368,18 +412,18 @@ def preprocess( compile_specs, ) ) - model_compute_precision: ct.precision = ( CoreMLBackend.model_compute_precision_from_compile_specs(compile_specs) ) - minimum_deployment_target: ct.target = ( CoreMLBackend.min_deployment_target_from_compile_specs(compile_specs) ) - compute_units: ct.ComputeUnit = CoreMLBackend.compute_unit_from_compile_specs( compile_specs ) + op_linear_quantizer_config = ( + CoreMLBackend.op_linear_quantizer_config_from_compile_specs(compile_specs) + ) mlmodel = ct.convert( model=edge_program, @@ -392,4 +436,15 @@ def preprocess( compute_units=compute_units, ) + if op_linear_quantizer_config is not None: + logger.warning( + "Core ML Backend op_linear_quantizer_config API is experimental" + ) + config = cto.coreml.OptimizationConfig( + global_config=op_linear_quantizer_config, + # skip embedding + op_type_configs={"gather": None}, + ) + mlmodel = cto.coreml.linear_quantize_weights(mlmodel, config=config) + return CoreMLBackend.preprocess_model(mlmodel, model_type=model_type) diff --git a/backends/apple/coreml/partition/coreml_partitioner.py b/backends/apple/coreml/partition/coreml_partitioner.py index ecf6d44b19c..c0b6663f729 100644 --- a/backends/apple/coreml/partition/coreml_partitioner.py +++ b/backends/apple/coreml/partition/coreml_partitioner.py @@ -17,7 +17,7 @@ Partitioner, PartitionResult, ) -from executorch.exir.backend.utils import tag_constant_data +from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer from torch.export.exported_program import ExportedProgram from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner from torch.fx.passes.operator_support import OperatorSupportBase @@ -61,6 +61,7 @@ def __init__( self, skip_ops_for_coreml_delegation: Optional[List[str]] = None, compile_specs: Optional[List[CompileSpec]] = None, + take_over_mutable_buffer: Optional[bool] = True, ) -> None: if skip_ops_for_coreml_delegation is None: skip_ops_for_coreml_delegation = [] @@ -69,6 +70,7 @@ def __init__( backend_id=CoreMLBackend.__name__, compile_specs=compile_specs if compile_specs is not None else [], ) + self.take_over_mutable_buffer = take_over_mutable_buffer def partition(self, exported_program: ExportedProgram) -> PartitionResult: # Run the CapabilityBasedPartitioner to return the largest possible @@ -89,6 +91,15 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: partition_tags[tag] = self.delegation_spec tag_constant_data(exported_program) + if self.take_over_mutable_buffer: + logger.info( + "Core ML partitioner will take over torch mutable buffer as Core ML state, " + "so if your model contains mutable buffer, " + "then you will need MacOS15+/iOS18+ to execute. 
" + "If you want your mutable buffer model to be compatible with older OS, " + "then please set `take_over_mutable_buffer=False`" + ) + tag_mutated_buffer(exported_program) return PartitionResult( tagged_exported_program=exported_program, partition_tags=partition_tags diff --git a/backends/apple/coreml/scripts/install_requirements.sh b/backends/apple/coreml/scripts/install_requirements.sh index 0018b5ffc2d..b6c9a073e08 100755 --- a/backends/apple/coreml/scripts/install_requirements.sh +++ b/backends/apple/coreml/scripts/install_requirements.sh @@ -24,7 +24,7 @@ rm -rf "$COREML_DIR_PATH/third-party" mkdir "$COREML_DIR_PATH/third-party" echo "${green}ExecuTorch: Cloning coremltools." -git clone --depth 1 --branch 8.0b1 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH +git clone --depth 1 --branch 8.0b2 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH cd $COREMLTOOLS_DIR_PATH STATUS=$? @@ -47,6 +47,11 @@ cmake --build "$COREMLTOOLS_DIR_PATH/build" --parallel echo "${green}ExecuTorch: Installing coremltools." pip install "$COREMLTOOLS_DIR_PATH" +# CoreMLTools have started supporting numpy 2.0, +# but ExecuTorch example model test env is still using older transformers, +# so for now we will need to downgrade numpy to 1.x +# TODO: Remove this numpy downgrade once later transformers starts to be used +pip install numpy==1.26.4 STATUS=$? if [ $STATUS -ne 0 ]; then echo "${red}ExecuTorch: Failed to install coremltools." diff --git a/backends/apple/coreml/test/test_coreml_partitioner.py b/backends/apple/coreml/test/test_coreml_partitioner.py index 34cf531b261..72a7fbf0932 100644 --- a/backends/apple/coreml/test/test_coreml_partitioner.py +++ b/backends/apple/coreml/test/test_coreml_partitioner.py @@ -4,11 +4,14 @@ import unittest +import coremltools as ct + import executorch.exir import torch import torchvision +from executorch.backends.apple.coreml.compiler import CoreMLBackend from executorch.backends.apple.coreml.partition import CoreMLPartitioner @@ -86,8 +89,54 @@ def test_vit_skip_conv(self): if node.op == "call_function" ] == total + def test_buffer(self): + embedding_dim = 3 + max_seq_len = 2 + + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.register_buffer( + "cache", + torch.zeros((max_seq_len, embedding_dim), dtype=torch.float32), + ) + + def forward(self, q, k_val, input_pos): + q_T = q.transpose(0, 1) + k = torch.ops.aten.index_put_(self.cache, [input_pos, None], k_val) + attn = k.mm(q_T) + return attn + + model = Model() + model.eval() + + q = torch.randn((1, embedding_dim)) + k_val = torch.randn((1, embedding_dim)) + input_pos = torch.tensor([0]) + example_inputs = (q, k_val, input_pos) + exir_program_aten = torch.export.export(model, example_inputs) + + compile_specs = CoreMLBackend.generate_compile_specs( + minimum_deployment_target=ct.target.iOS18 + ) + partitioner = CoreMLPartitioner(compile_specs=compile_specs) + edge_program_manager = executorch.exir.to_edge( + exir_program_aten, compile_config=self.edge_compile_config + ) + delegated_program_manager = edge_program_manager.to_backend(partitioner) + + assert [ + node.target.__name__ + for node in delegated_program_manager.exported_program().graph.nodes + if node.op == "call_function" + ] == [ + "executorch_call_delegate", + "getitem", + ] + if __name__ == "__main__": test_runner = TestCoreMLPartitioner() test_runner.test_add_sub_skip_mm() test_runner.test_vit_skip_conv() + test_runner.test_buffer() diff --git a/backends/arm/operators/op_mean_dim.py 
b/backends/arm/operators/op_mean_dim.py index 20e1b2b8d76..339aa62719f 100644 --- a/backends/arm/operators/op_mean_dim.py +++ b/backends/arm/operators/op_mean_dim.py @@ -11,7 +11,6 @@ register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from executorch.backends.arm.tosa_utils import build_avg_pool_2d_common @register_node_visitor @@ -30,29 +29,4 @@ def define_node( is_quant_node: bool, ) -> None: - input_tensor = inputs[0] - dim = node.args[1] - keep_dim = node.args[2] - - # mean.dim(-1, -2) is the same as avg_pool2d when just computing mean over HW dimensions. - # Since tosa doesn't have mean.dim operation, lowers it to average pooling instead. - if dim == [-1, -2]: - if keep_dim is True: - # Given the shape format of input is (N, C, H, W) - kernel_size = [input_tensor.shape[2], input_tensor.shape[3]] - stride = [1, 1] - padding = [0, 0, 0, 0] - - build_avg_pool_2d_common( - node, - tosa_graph, - input_tensor, - kernel_size, - stride, - padding, - is_quant_node, - output, - ) - return - raise AssertionError("unsupported") diff --git a/backends/arm/passes/arm_pass_manager.py b/backends/arm/passes/arm_pass_manager.py index 914bf57aabc..db8511df613 100644 --- a/backends/arm/passes/arm_pass_manager.py +++ b/backends/arm/passes/arm_pass_manager.py @@ -15,6 +15,9 @@ from executorch.backends.arm.passes.convert_split_to_slice import ( ConvertSplitToSlicePass, ) +from executorch.backends.arm.passes.meandim_to_averagepool_pass import ( + ConvertMeanDimToAveragePool, +) from executorch.backends.arm.passes.remove_clone_pass import RemoveClonePass from executorch.backends.arm.passes.size_adjust_conv2d_pass import SizeAdjustConv2DPass from executorch.exir.backend.compile_spec_schema import CompileSpec @@ -33,6 +36,7 @@ def transform_to_backend_pipeline( self.add_pass(SizeAdjustConv2DPass()) self.add_pass(RemoveClonePass()) self.add_pass(ConvertExpandCopyToRepeatPass()) + self.add_pass(ConvertMeanDimToAveragePool()) self.add_pass(ConvertSplitToSlicePass()) for spec in compile_spec: if spec.key == "permute_memory_format": diff --git a/backends/arm/passes/meandim_to_averagepool_pass.py b/backends/arm/passes/meandim_to_averagepool_pass.py new file mode 100644 index 00000000000..3f57e8023ca --- /dev/null +++ b/backends/arm/passes/meandim_to_averagepool_pass.py @@ -0,0 +1,52 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Any, cast, Dict, Tuple + +import torch.fx + +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, NodeMetadata, ProxyValue + +Argument = Any + + +class ConvertMeanDimToAveragePool(ExportPass): + """ + Replace a mean operation with dim = [-1, -2] and keep_dim = True with an average pool operation. 
+ """ + + def call_operator( + self, + op: torch.fx.node.Target, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + meta: NodeMetadata, + ) -> ProxyValue: + if op != exir_ops.edge.aten.mean.dim: + return super().call_operator(op, args, kwargs, meta) + + input_value = cast(ProxyValue, args[0]) + dim = cast(list, args[1]) + keep_dim = cast(bool, args[2]) if len(args) > 2 else False + + # averagepool2d gets converted to a mean operation with dim = [-1, -2] and keep_dim = True + # so check the dim argument for this case + if dim == [-1, -2] and keep_dim is True: + # Given the shape format of input is (N, C, H, W) + kernel_size = [ + input_value.to_tensor().size()[2], + input_value.to_tensor().size()[3], + ] + stride = [1, 1] + return super().call_operator( + exir_ops.edge.aten.avg_pool2d.default, + (input_value, kernel_size, stride), + {}, + meta, + ) + else: + return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/test/ops/test_mean_dim.py b/backends/arm/test/ops/test_mean_dim.py index e0db958f743..e48d749c194 100644 --- a/backends/arm/test/ops/test_mean_dim.py +++ b/backends/arm/test/ops/test_mean_dim.py @@ -106,7 +106,12 @@ def _test_meandim_tosa_u55_BI_pipeline( .check(["torch.ops.quantized_decomposed"]) .to_edge() .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) + .check_not( + [ + "executorch_exir_dialects_edge__ops_aten_mean_dim", + "executorch_exir_dialects_edge__ops_aten_avg_pool2d_default", + ] + ) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() ) diff --git a/backends/arm/test/passes/test_meandim_to_averagepool2d.py b/backends/arm/test/passes/test_meandim_to_averagepool2d.py new file mode 100644 index 00000000000..1cd63e6e52e --- /dev/null +++ b/backends/arm/test/passes/test_meandim_to_averagepool2d.py @@ -0,0 +1,75 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch +from executorch.backends.arm.passes.meandim_to_averagepool_pass import ( + ConvertMeanDimToAveragePool, +) + +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester + +from executorch.backends.xnnpack.test.tester.tester import RunPasses + + +class MeanDim(torch.nn.Module): + def forward(self, x): + return torch.mean(x, dim=[-1, -2], keepdim=True) + + def get_inputs(self): + return (torch.rand(1, 1280, 7, 7),) + + +class MeanDim2(torch.nn.Module): + def forward(self, x): + return torch.mean(x, dim=1) + + def get_inputs(self): + return (torch.rand(1, 1280, 7, 7),) + + +class TestMeandimToAveragePool2dPass(unittest.TestCase): + """ + Tests the MeanDimToAveragePool2dPass which converts mean.dim to average_pool2d + for the special case where dim is [-1, -2] and keepdim is True. 
+ """ + + def test_tosa_BI_meandim_to_averagepool(self): + module = MeanDim() + test_pass_stage = RunPasses([ConvertMeanDimToAveragePool]) + ( + ArmTester( + module, + example_inputs=module.get_inputs(), + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .to_edge() + .check(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) + .run_passes(test_pass_stage) + .check(["executorch_exir_dialects_edge__ops_aten_avg_pool2d_default"]) + ) + + def test_tosa_BI_meandim_no_modification(self): + module = MeanDim2() + test_pass_stage = RunPasses([ConvertMeanDimToAveragePool]) + ( + ArmTester( + module, + example_inputs=module.get_inputs(), + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .to_edge() + .check(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) + .run_passes(test_pass_stage) + .check(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) + .check_not(["executorch_exir_dialects_edge__ops_aten_avg_pool2d_default"]) + ) diff --git a/backends/cadence/aot/TARGETS b/backends/cadence/aot/TARGETS index d077169022a..08093efe317 100644 --- a/backends/cadence/aot/TARGETS +++ b/backends/cadence/aot/TARGETS @@ -60,6 +60,17 @@ python_library( ], ) +python_library( + name = "ops_registrations", + srcs = [ + "ops_registrations.py", + ], + deps = [ + "fbcode//caffe2:torch", + "fbcode//executorch/backends/cadence/aot:utils", + ], +) + export_file(name = "functions.yaml") executorch_generated_lib( diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py index a4d856ebed2..e73de6ab7ce 100644 --- a/backends/cadence/aot/ops_registrations.py +++ b/backends/cadence/aot/ops_registrations.py @@ -4,12 +4,13 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-strict + from math import prod from typing import Optional, Tuple import torch -from executorch.exir.scalar_type import ScalarType -from torch.library import impl, Library +from torch.library import Library, register_fake from .utils import get_conv1d_output_size, get_conv2d_output_size @@ -67,31 +68,31 @@ m = Library("cadence", "IMPL", "Meta") -@impl(m, "quantize_per_tensor") +@register_fake("cadence::quantize_per_tensor") def quantize_per_tensor_meta( input: torch.Tensor, scale: float, zero_point: int, quant_min: int, quant_max: int, - dtype: ScalarType, -): + dtype: torch.dtype, +) -> torch.Tensor: return input.new_empty(input.size(), dtype=dtype) -@impl(m, "dequantize_per_tensor") +@register_fake("cadence::dequantize_per_tensor") def dequantize_per_tensor_meta( input: torch.Tensor, scale: float, zero_point: int, quant_min: int, quant_max: int, - dtype: ScalarType, -): + dtype: torch.dtype, +) -> torch.Tensor: return input.new_empty(input.size(), dtype=torch.float) -@impl(m, "quantized_linear") +@register_fake("cadence::quantized_linear") def quantized_linear_meta( src: torch.Tensor, weight: torch.Tensor, @@ -102,7 +103,7 @@ def quantized_linear_meta( out_shift: torch.Tensor, out_zero_point: int, offset: Optional[torch.Tensor], -): +) -> torch.Tensor: # src comes in shape [leading_dims, in_dim] # weight comes in shape [out_dim, in_dim] # output comes in empty with shape [leading_dims, out_dim] @@ -113,7 +114,7 @@ def quantized_linear_meta( return src.new_empty(out_size, dtype=torch.uint8) -@impl(m, "quantized_conv") +@register_fake("cadence::quantized_conv") def quantized_conv_meta( input: torch.Tensor, weight: torch.Tensor, @@ -151,7 +152,7 @@ def quantized_conv_meta( return input.new_empty(output_size, dtype=input.dtype) -@impl(m, "quantized_layer_norm") +@register_fake("cadence::quantized_layer_norm") def quantized_layer_norm_meta( input: torch.Tensor, X_scale: torch.Tensor, @@ -162,22 +163,22 @@ def quantized_layer_norm_meta( eps: float, output_scale: float, output_zero_point: int, -): +) -> torch.Tensor: return input.new_empty(input.size(), dtype=torch.uint8) -@impl(m, "quantized_relu") +@register_fake("cadence::quantized_relu") def quantized_relu_meta( X: torch.Tensor, X_zero_point: torch.Tensor, out_zero_point: int, out_multiplier: torch.Tensor, out_shift: torch.Tensor, -): +) -> torch.Tensor: return X.new_empty(X.size(), dtype=torch.uint8) -@impl(m, "quantized_matmul") +@register_fake("cadence::quantized_matmul") def quantized_matmul_meta( X: torch.Tensor, X_zero_point: int, diff --git a/backends/example/test_example_delegate.py b/backends/example/test_example_delegate.py index 973b457bade..d830c1bb312 100644 --- a/backends/example/test_example_delegate.py +++ b/backends/example/test_example_delegate.py @@ -46,7 +46,7 @@ def get_example_inputs(): ) m = model.eval() - m = torch._export.capture_pre_autograd_graph(m, copy.deepcopy(example_inputs)) + m = torch.export.export_for_training(m, copy.deepcopy(example_inputs)).module() # print("original model:", m) quantizer = ExampleQuantizer() # quantizer = XNNPACKQuantizer() @@ -82,7 +82,7 @@ def test_delegate_mobilenet_v2(self): ) m = model.eval() - m = torch._export.capture_pre_autograd_graph(m, copy.deepcopy(example_inputs)) + m = torch.export.export_for_training(m, copy.deepcopy(example_inputs)).module() quantizer = ExampleQuantizer() m = prepare_pt2e(m, quantizer) diff --git a/backends/mediatek/CMakeLists.txt b/backends/mediatek/CMakeLists.txt index 4b233d94f04..744b1193d5a 100644 --- a/backends/mediatek/CMakeLists.txt +++ 
b/backends/mediatek/CMakeLists.txt @@ -25,9 +25,13 @@ include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/runtime/include) # targets add_library(neuron_backend SHARED) -target_link_libraries( - neuron_backend PRIVATE executorch_no_prim_ops portable_ops_lib android log - ${NEURON_BUFFER_ALLOCATOR_LIB} +target_link_libraries(neuron_backend + PRIVATE + executorch_no_prim_ops + portable_ops_lib + android + log + ${NEURON_BUFFER_ALLOCATOR_LIB} ) target_sources( neuron_backend diff --git a/backends/qualcomm/builders/__init__.py b/backends/qualcomm/builders/__init__.py index d3bf98bae72..79c02e22072 100644 --- a/backends/qualcomm/builders/__init__.py +++ b/backends/qualcomm/builders/__init__.py @@ -38,6 +38,7 @@ op_quantize, op_relu, op_reshape, + op_rms_norm, op_rsqrt, op_select_copy, op_sigmoid, @@ -92,6 +93,7 @@ op_quantize, op_relu, op_reshape, + op_rms_norm, op_rsqrt, op_select_copy, op_sigmoid, diff --git a/backends/qualcomm/builders/node_visitor.py b/backends/qualcomm/builders/node_visitor.py index e07a745df5f..514bc6efd78 100644 --- a/backends/qualcomm/builders/node_visitor.py +++ b/backends/qualcomm/builders/node_visitor.py @@ -202,7 +202,7 @@ def get_quant_tensor_value( dtype = quant_configs[QCOM_DTYPE] - tensor = tensor.div(scale + 1e-6).add(zero_point).round().to(dtype) + tensor = tensor.div(scale).add(zero_point).round().to(dtype) # Make the backends access data correctly if quant_configs.get(QCOM_BITWIDTH) == 4: mask = torch.full(tensor.size(), 0x0F, dtype=torch.int8) diff --git a/backends/qualcomm/builders/op_batch_norm.py b/backends/qualcomm/builders/op_batch_norm.py index 13b24c0d722..6b2e9ab91d8 100644 --- a/backends/qualcomm/builders/op_batch_norm.py +++ b/backends/qualcomm/builders/op_batch_norm.py @@ -8,6 +8,11 @@ import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper import torch +from executorch.backends.qualcomm.utils.constants import ( + QCOM_QUANT_ATTRS, + QCOM_QUANT_MAX, + QCOM_SCALE, +) from .node_visitor import NodeVisitor, register_node_visitor from .qnn_constants import OpBatchnorm, QNN_OP_PACKAGE_NAME_QTI_AISW @@ -21,6 +26,14 @@ class BatchNorm(NodeVisitor): def __init__(self, *args) -> None: super().__init__(*args) + def update_encoding(self, node: torch.fx.Node, tensor: torch.Tensor): + if isinstance(tensor, torch._subclasses.FakeTensor): + return + + if quant_attrs := node.meta.get(QCOM_QUANT_ATTRS): + diff = max(abs(tensor.max()), abs(tensor.min())) + quant_attrs[QCOM_SCALE] = diff / quant_attrs[QCOM_QUANT_MAX] + def define_node( self, node: torch.fx.Node, @@ -48,6 +61,7 @@ def define_node( amount = (filter_tensor * mean_tensor) / torch.sqrt(var_tensor + eps) bias_tensor = bias_tensor - amount + self.update_encoding(bias_node, bias_tensor) bias_tensor_wrapper = self.define_tensor( bias_node, bias_tensor, @@ -57,6 +71,7 @@ def define_node( ) filter_tensor = filter_tensor / torch.sqrt(var_tensor + eps) + self.update_encoding(filter_node, filter_tensor) filter_tensor_wrapper = self.define_tensor( filter_node, filter_tensor, diff --git a/backends/qualcomm/builders/op_conv2d.py b/backends/qualcomm/builders/op_conv2d.py index 909cc6a21f6..4b58edbac63 100644 --- a/backends/qualcomm/builders/op_conv2d.py +++ b/backends/qualcomm/builders/op_conv2d.py @@ -10,16 +10,7 @@ import numpy as np import torch -from executorch.backends.qualcomm.utils.constants import ( - QCOM_DATA, - QCOM_DTYPE, - QCOM_QUANT_ATTRS, - QCOM_QUANT_MAX, - QCOM_QUANT_MIN, - QCOM_SCALE, - QCOM_ZERO_POINT, -) -from executorch.exir.dialects._ops import ops as exir_ops +from 
executorch.backends.qualcomm.utils.constants import QCOM_DATA from .node_visitor import NodeVisitor, register_node_visitor from .qnn_constants import ( @@ -94,52 +85,6 @@ def _add_conv_op_parameter( return conv_op - def _get_bias_tensor( - self, - node: torch.fx.Node, - nodes_to_wrappers: Dict[str, PyQnnWrapper.TensorWrapper], - num_output_channel: int, - ) -> PyQnnWrapper.PyQnnOpWrapper: - # build dummy node if bias is not given - bias_node = ( - node.args[2] - if node.args[2] is not None - else torch.fx.Node( - node.graph, - node.name + "_runtime_bias", - "call_function", - exir_ops.edge.aten.full.default, - (), # args - {}, # kwargs - ) - ) - # zeros tensor to meet HTP constraint if bias is not given - bias_tensor = ( - get_parameter(bias_node, self.edge_program) - if node.args[2] is not None - else torch.zeros(num_output_channel) - ) - # insert quant attribute to meet HTP constraint if bias is not given - if ( - node.args[2] is None - and (bias_quant_attrs := node.meta.get(QCOM_QUANT_ATTRS)) is not None - ): - quant_attrs = bias_quant_attrs.copy() - quant_attrs[QCOM_ZERO_POINT] = 0 - quant_attrs[QCOM_SCALE] = 0 - quant_attrs[QCOM_DTYPE] = torch.int32 - quant_attrs[QCOM_QUANT_MAX] = torch.iinfo(torch.int32).max - quant_attrs[QCOM_QUANT_MIN] = torch.iinfo(torch.int32).min + 1 - bias_node.meta[QCOM_QUANT_ATTRS] = quant_attrs - - return self.define_tensor( - bias_node, - bias_tensor, - PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, - nodes_to_wrappers, - is_input_tensor=False, - ) - def _define_conv1d( self, node: torch.fx.Node, @@ -204,9 +149,17 @@ def _define_conv1d( is_input_tensor=False, ) conv_input_tensors = [unsqueeze_output_tensor_wrapper, filter_tensor_wrapper] - conv_input_tensors.append( - self._get_bias_tensor(node, nodes_to_wrappers, filter_tensor.shape[-1]) - ) + if node.args[2] is not None: + bias_node = node.args[2] + bias_tensor = get_parameter(bias_node, self.edge_program) + bias_tensor_wrapper = self.define_tensor( + bias_node, + bias_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, + nodes_to_wrappers, + is_input_tensor=False, + ) + conv_input_tensors.append(bias_tensor_wrapper) stride = [1] + cast(List[int], node.args[3]) padding = [0] + cast(List[int], node.args[4]) @@ -312,9 +265,18 @@ def define_node( is_input_tensor=False, ) conv_input_tensors = [input_tensor_wrapper, filter_tensor_wrapper] - conv_input_tensors.append( - self._get_bias_tensor(node, nodes_to_wrappers, filter_tensor.shape[-1]) - ) + + if node.args[2] is not None: + bias_node = node.args[2] + bias_tensor = get_parameter(bias_node, self.edge_program) + bias_tensor_wrapper = self.define_tensor( + bias_node, + bias_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, + nodes_to_wrappers, + is_input_tensor=False, + ) + conv_input_tensors.append(bias_tensor_wrapper) output_tensor = self.get_tensor(node, node) output_tensor_wrapper = self.define_tensor( diff --git a/backends/qualcomm/builders/op_rms_norm.py b/backends/qualcomm/builders/op_rms_norm.py new file mode 100644 index 00000000000..e99b1f47ba1 --- /dev/null +++ b/backends/qualcomm/builders/op_rms_norm.py @@ -0,0 +1,127 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+
+from typing import Dict
+
+import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
+import numpy as np
+
+import torch
+from executorch.backends.qualcomm.builders.utils import get_parameter
+from executorch.backends.qualcomm.utils.constants import QCOM_DATA, QCOM_QUANT_ATTRS
+from executorch.exir.dialects._ops import ops as exir_ops
+
+from .node_visitor import NodeVisitor, register_node_visitor
+from .qnn_constants import OpRmsNorm, QNN_OP_PACKAGE_NAME_QTI_AISW
+
+
+@register_node_visitor
+class RmsNormVisitor(NodeVisitor):
+    target = ["aten.rms_norm.default"]
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper],
+    ) -> PyQnnWrapper.PyQnnOpWrapper:
+        # args of node : ['input', 'normalized_shape', 'weight', 'eps']
+        input_node = node.args[0]
+        input_tensor = self.get_tensor(input_node, node)
+        input_tensor_wrapper = self.define_tensor(
+            input_node,
+            input_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+            is_input_tensor=True,
+        )
+
+        # should be an immutable list
+        normalized_shapes = node.args[1]
+        if (
+            len(normalized_shapes) != 1
+            or normalized_shapes[0] != input_tensor.shape[-1]
+        ):
+            print("Only supports normalization over the last input dimension")
+            return
+        axes = [node.args[0].meta["val"].dim() - 1]
+        axes_shape = [len(axes)]
+
+        weight_node = node.args[2]
+        weight_tensor = get_parameter(weight_node, self.edge_program)
+        weight_tensor_wrapper = self.define_tensor(
+            weight_node,
+            weight_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC,
+            nodes_to_wrappers,
+            is_input_tensor=False,
+        )
+
+        # Fake node: the nn module seems to be inconsistent with the documentation
+        bias_tensor = torch.zeros(weight_tensor.shape)
+        bias_node = torch.fx.Node(
+            node.graph,
+            node.name + "_runtime_bias",
+            "call_function",
+            exir_ops.edge.aten.tensor.default,
+            (),  # args
+            {},  # kwargs
+        )
+        if quant_attrs := node.meta.get(QCOM_QUANT_ATTRS):
+            bias_node.meta[QCOM_QUANT_ATTRS] = quant_attrs
+        bias_tensor_wrapper = self.define_tensor(
+            bias_node,
+            bias_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC,
+            nodes_to_wrappers,
+            is_input_tensor=False,
+        )
+
+        epsilon = node.args[3]
+        if isinstance(epsilon, torch.fx.Node):
+            epsilon = get_parameter(epsilon, self.edge_program)
+            epsilon = (
+                epsilon
+                if isinstance(epsilon, float)
+                else torch.finfo(epsilon.dtype).eps
+            )
+
+        output_tensor = self.get_tensor(node, node)
+        output_tensor_wrapper = self.define_tensor(
+            node,
+            output_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+            is_input_tensor=False,
+        )
+
+        rms_norm_op = PyQnnWrapper.PyQnnOpWrapper(
+            node.name,
+            QNN_OP_PACKAGE_NAME_QTI_AISW,
+            OpRmsNorm.op_name,
+        )
+
+        rms_norm_op.AddInputTensors(
+            [input_tensor_wrapper, weight_tensor_wrapper, bias_tensor_wrapper]
+        )
+        rms_norm_op.AddOutputTensors([output_tensor_wrapper])
+        rms_norm_op.AddScalarParam(
+            OpRmsNorm.param_epsilon,
+            PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_FLOAT_32,
+            {QCOM_DATA: np.float32(epsilon)},
+        )
+        rms_norm_op.AddTensorParam(
+            OpRmsNorm.param_axes,
+            PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32,
+            len(axes_shape),
+            axes_shape,
+            np.array(axes, dtype=np.uint32),
+            True,
+        )
+
+        return rms_norm_op
diff --git a/backends/qualcomm/builders/op_softmax.py b/backends/qualcomm/builders/op_softmax.py
index ae4c89bbb96..cda40aed458 100644
--- a/backends/qualcomm/builders/op_softmax.py
+++ b/backends/qualcomm/builders/op_softmax.py @@ -17,7 +17,7 @@ @register_node_visitor class Softmax(NodeVisitor): - target = ["aten._softmax.default"] + target = ["aten._softmax.default", "aten._safe_softmax.default"] def __init__(self, *args) -> None: super().__init__(*args) diff --git a/backends/qualcomm/builders/qnn_constants.py b/backends/qualcomm/builders/qnn_constants.py index 4a87e5dbbb3..8ac702f2ad5 100644 --- a/backends/qualcomm/builders/qnn_constants.py +++ b/backends/qualcomm/builders/qnn_constants.py @@ -278,6 +278,13 @@ class OpResizeNearestNeighbor: param_half_pixel_centers: str = "half_pixel_centers" +@dataclass(init=False, frozen=True) +class OpRmsNorm: + op_name: str = "RmsNorm" + param_epsilon: str = "epsilon" + param_axes: str = "axes" + + @dataclass(init=False, frozen=True) class OpScatterNd: op_name: str = "ScatterNd" diff --git a/backends/qualcomm/passes/annotate_and_quant_scalar.py b/backends/qualcomm/passes/annotate_and_quant_scalar.py index 5f111ee9c8b..1db50694ece 100644 --- a/backends/qualcomm/passes/annotate_and_quant_scalar.py +++ b/backends/qualcomm/passes/annotate_and_quant_scalar.py @@ -14,7 +14,7 @@ from executorch.exir.passes import dead_code_elimination_pass from torch.fx.passes.utils.source_matcher_utils import get_source_partitions -from .utils import get_quant_attrs +from .utils import dq_ops, get_quant_attrs class AnnotateAndQuantScalar(ExportPass): @@ -78,6 +78,7 @@ def _annotate_scalar_node( float, torch.float32, torch.int32, + torch.int64, ]: return @@ -88,30 +89,43 @@ def _traverse_binary_node(self, graph_module: torch.fx.GraphModule): graph_module.graph, self.binary_op_sources ) src_partitions = list(itertools.chain(*src_partitions.values())) + processed = set() for src_partition in src_partitions: - output = src_partition.output_nodes[0] - if ( - output.meta.get(QCOM_QUANT_ATTRS) - and len(src_partition.input_nodes) == 1 - ): - dq_node = src_partition.input_nodes[0] - q_node = dq_node.args[0] - q_node_attrs = get_quant_attrs(graph_module, q_node) - - scalar_nodes = [n for n in output.args if n != dq_node] - if len(scalar_nodes) == 0: + # need post process here to identify partitioned nodes: + src_fn_dict = {} + for n in src_partition.nodes: + # e.g. 
+ # meta["source_fn_stack"]: [('mul', )] + # we'll use as grouping key + node_list = src_fn_dict.setdefault(n.meta["source_fn_stack"][-1][1], []) + node_list.append(n) + + for nodes in src_fn_dict.values(): + output = [n for n in nodes if n in src_partition.output_nodes][0] + # if all args have been annotated, it shouldn't be a scalar operation + if all(arg.target in dq_ops for arg in output.args): continue - scalar_node = scalar_nodes[0] - source_scalar_node = self._get_source_scalar_node(scalar_node) - # we'll abandon cast op here, since the constant scalar will - # be pre-loaded into QNN context binary - output.replace_input_with(scalar_node, source_scalar_node) + if output not in processed and QCOM_QUANT_ATTRS in output.meta: + dq_node = [n for n in output.args if n.target in dq_ops][0] + q_node = dq_node.args[0] + q_node_attrs = get_quant_attrs(graph_module, q_node) + + scalar_nodes = [n for n in output.args if n != dq_node] + if len(scalar_nodes) == 0: + continue + + scalar_node = scalar_nodes[0] + source_scalar_node = self._get_source_scalar_node(scalar_node) + # we'll abandon cast op here, since the constant scalar will + # be pre-loaded into QNN context binary + output.replace_input_with(scalar_node, source_scalar_node) - scalar_quant_attrs = self._update_scalar_node_attrs( - source_scalar_node, q_node_attrs - ) - self._annotate_scalar_node(source_scalar_node, scalar_quant_attrs) + scalar_quant_attrs = self._update_scalar_node_attrs( + source_scalar_node, q_node_attrs + ) + self._annotate_scalar_node(source_scalar_node, scalar_quant_attrs) + processed.add(output) def call(self, graph_module: torch.fx.GraphModule): self._traverse_binary_node(graph_module) diff --git a/backends/qualcomm/passes/i64_to_i32.py b/backends/qualcomm/passes/i64_to_i32.py index 7814a3ff0d6..1d2171cc37a 100644 --- a/backends/qualcomm/passes/i64_to_i32.py +++ b/backends/qualcomm/passes/i64_to_i32.py @@ -5,7 +5,9 @@ # LICENSE file in the root directory of this source tree. 
import torch from executorch.backends.qualcomm.builders.utils import get_parameter, is_constant +from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult +from torch._subclasses.fake_tensor import FakeTensor class I64toI32(ExportPass): @@ -16,6 +18,8 @@ class I64toI32(ExportPass): def __init__(self, edge_program: torch.export.ExportedProgram): super(I64toI32, self).__init__() self.edge_program = edge_program + # pyre-ignore[4] + self.copy_op = exir_ops.edge.aten._to_copy.default def _update_meta(self, node: torch.fx.node) -> None: meta_val = node.meta["val"] @@ -32,6 +36,10 @@ def _update_meta(self, node: torch.fx.node) -> None: if meta_val.dtype == torch.int64: node.meta["val"] = meta_val.to(torch.float) + # pyre-ignore[2] + def _is_tensor_of_dtype(self, node_val, dtype: torch.dtype) -> bool: + return isinstance(node_val, FakeTensor) and node_val.dtype == dtype + def _cast_to_int32(self, graph_module: torch.fx.GraphModule): for n in graph_module.graph.nodes: if is_constant(n, self.edge_program): @@ -39,6 +47,22 @@ def _cast_to_int32(self, graph_module: torch.fx.GraphModule): if param.dtype == torch.int64: # QNN does not support int64 self._update_meta(n) + elif n.op == "placeholder": + node_val = n.meta["val"] + if self._is_tensor_of_dtype(node_val, torch.int64): + with graph_module.graph.inserting_after(n): + args = (n,) + to_dst_node = graph_module.graph.create_node( + "call_function", + self.copy_op, + args, + {"dtype": torch.int32}, + ) + to_dst_node.meta["val"] = node_val.to(torch.int32) + + # Replace usage of the src dtype result with the dst dtype result. + n.replace_all_uses_with(to_dst_node) + to_dst_node.args = (n,) def call(self, graph_module: torch.fx.GraphModule): self._cast_to_int32(graph_module) diff --git a/backends/qualcomm/passes/recompose_pixel_shuffle.py b/backends/qualcomm/passes/recompose_pixel_shuffle.py deleted file mode 100644 index 9eec6bfa264..00000000000 --- a/backends/qualcomm/passes/recompose_pixel_shuffle.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) Qualcomm Innovation Center, Inc. -# All rights reserved -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. -import torch -from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import ExportPass, PassResult -from torch.fx.passes.utils.source_matcher_utils import get_source_partitions - - -class RecomposePixelShuffle(ExportPass): - """ - Merge decomposed operators back to one super node. 
- """ - - def __init__(self): - super().__init__() - - def call(self, graph_module: torch.fx.GraphModule): - graph = graph_module.graph - # decomposed core aten ops - partitions = get_source_partitions(graph, [torch.nn.PixelShuffle]) - for _, src_partitions in partitions.items(): - for src_partition in src_partitions: - input_node = src_partition.input_nodes[0] - output_node = src_partition.output_nodes[0] - with graph.inserting_after(input_node): - h_in_shape = input_node.meta["val"].shape[2] - h_out_shape = output_node.meta["val"].shape[2] - upscale_factor = h_out_shape / h_in_shape - - pixel_shuffle_node = graph.create_node( - "call_function", - exir_ops.edge.aten.pixel_shuffle.default, - (input_node, int(upscale_factor)), - ) - users = output_node.users.copy() - for user in users: - user.replace_input_with(output_node, pixel_shuffle_node) - # copy metadata - pixel_shuffle_node.meta = output_node.meta - - graph.eliminate_dead_code() - graph_module.recompile() - return PassResult(graph_module, True) diff --git a/backends/qualcomm/passes/recompose_pixel_unshuffle.py b/backends/qualcomm/passes/recompose_pixel_unshuffle.py index a47f3d119a5..00d46639089 100644 --- a/backends/qualcomm/passes/recompose_pixel_unshuffle.py +++ b/backends/qualcomm/passes/recompose_pixel_unshuffle.py @@ -6,7 +6,6 @@ import torch from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult -from torch.fx.passes.utils.source_matcher_utils import get_source_partitions class RecomposePixelUnshuffle(ExportPass): @@ -85,30 +84,6 @@ def call(self, graph_module: torch.fx.GraphModule): # copy metadata pixel_unshuffle_node.meta = node.meta - # decomposed core aten ops - if not self.quantization_capture: - partitions = get_source_partitions(graph, [torch.nn.PixelUnshuffle]) - for _, src_partitions in partitions.items(): - for src_partition in src_partitions: - input_node = src_partition.input_nodes[0] - output_node = src_partition.output_nodes[0] - with graph.inserting_after(input_node): - h_in_shape = input_node.meta["val"].shape[2] - h_out_shape = output_node.meta["val"].shape[2] - downscale_factor = h_in_shape / h_out_shape - - op = self.op - pixel_unshuffle_node = graph.create_node( - "call_function", - op, - (input_node, int(downscale_factor)), - ) - users = output_node.users.copy() - for user in users: - user.replace_input_with(output_node, pixel_unshuffle_node) - # copy metadata - pixel_unshuffle_node.meta = output_node.meta - graph.eliminate_dead_code() graph_module.recompile() return PassResult(graph_module, True) diff --git a/backends/qualcomm/passes/recompose_rms_norm.py b/backends/qualcomm/passes/recompose_rms_norm.py new file mode 100644 index 00000000000..b26de8bd794 --- /dev/null +++ b/backends/qualcomm/passes/recompose_rms_norm.py @@ -0,0 +1,76 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import torch +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult +from torch.fx.passes.utils.source_matcher_utils import get_source_partitions + +from .utils import dq_ops + + +class RecomposeRmsNorm(ExportPass): + """ + Merge decomposed operators back to one super node. 
+ """ + + def __init__(self): + super().__init__() + + def _get_eps_node(self, nodes): + # eps: one of inputs of add node + add_node = [n for n in nodes if hasattr(n, "name") and "add" in n.name][0] + for a in add_node.args: + if isinstance(a, float) or a.op != "call_function": + return a + + def _get_gamma_node(self, output_node): + # gamma: one of inputs of output node + for a in output_node.args: + if a.op != "call_function" or a.target in dq_ops: + return a + + def call(self, graph_module: torch.fx.GraphModule): + graph = graph_module.graph + partitions = get_source_partitions(graph, [torch.nn.RMSNorm]) + for _, src_partitions in partitions.items(): + for src_partition in src_partitions: + input_len = len(src_partition.input_nodes) + if input_len == 1: + input_node = src_partition.input_nodes[0] + elif input_len == 2: + inp_0, inp_1 = src_partition.input_nodes + input_node = inp_0 if len(inp_0.users) == 2 else inp_1 + else: + raise RuntimeError( + f"Found a edge case of rms_node partitoin {src_partition}, which has {input_len} inputs" + ) + + output_node = src_partition.output_nodes[0] + eps_node = self._get_eps_node(src_partition.nodes) + gamma_node = self._get_gamma_node(output_node) + + with graph.inserting_before(output_node): + # args schema + # (Tensor input, int[] normalized_shape, Tensor? weight=None, float? eps=None) -> Tensor + rms_node = graph.create_node( + "call_function", + exir_ops.edge.aten.rms_norm.default, + ( + input_node, + list(gamma_node.meta["val"].shape), + gamma_node, + eps_node, + ), + ) + users = output_node.users.copy() + for user in users: + user.replace_input_with(output_node, rms_node) + # copy metadata + rms_node.meta = output_node.meta + + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/qualcomm/passes/replace_index_put_input.py b/backends/qualcomm/passes/replace_index_put_input.py new file mode 100644 index 00000000000..1eb210cf67e --- /dev/null +++ b/backends/qualcomm/passes/replace_index_put_input.py @@ -0,0 +1,54 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+import torch +from executorch.backends.qualcomm.utils.constants import QCOM_ENCODING, QCOM_QUANT_ATTRS +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult + + +class ReplaceIndexPutInput(ExportPass): + """ + Index put input workaround for quantized module + """ + + dq_q_map = { + # per tensor + exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, + exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor: exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor, + # per channel + exir_ops.edge.quantized_decomposed.dequantize_per_channel.default: exir_ops.edge.quantized_decomposed.quantize_per_channel.default, + } + + def __init__(self, edge_program: torch.export.ExportedProgram): + super(ReplaceIndexPutInput, self).__init__() + self.edge_program = edge_program + + def call(self, graph_module: torch.fx.GraphModule): + graph = graph_module.graph + for node in graph.nodes: + if node.target == exir_ops.edge.aten.index_put.default: + if ( + copy_node := list(node.users)[0] + ) and copy_node.target == exir_ops.edge.aten.copy.default: + m_buffer_node = copy_node.args[0] + bad_frozen_node = node.args[0] + if QCOM_QUANT_ATTRS in bad_frozen_node.meta: + m_buffer_node.meta[QCOM_QUANT_ATTRS] = bad_frozen_node.meta[ + QCOM_QUANT_ATTRS + ] + m_buffer_node.meta[QCOM_QUANT_ATTRS][QCOM_ENCODING] = ( + self.dq_q_map[ + m_buffer_node.meta[QCOM_QUANT_ATTRS][QCOM_ENCODING] + ] + ) + with graph.inserting_after(bad_frozen_node): + node.replace_input_with(bad_frozen_node, m_buffer_node) + else: + continue + + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/qualcomm/quantizer/custom_annotation.py b/backends/qualcomm/quantizer/custom_annotation.py index b2c86e50d33..9cde50b9c70 100644 --- a/backends/qualcomm/quantizer/custom_annotation.py +++ b/backends/qualcomm/quantizer/custom_annotation.py @@ -91,15 +91,17 @@ def is_edge_condition(node: Node): def annotate_matmul_input1(node: Node, quantization_config: QuantizationConfig): if is_edge_condition(node): return - if node.target == torch.ops.aten.index_put_.default: + if node.target in [ + torch.ops.aten.index_put.default, + torch.ops.aten.index_put_.default, + ]: annotate_index_put(node, quantization_config) annotate_matmul_input1(node.args[0], quantization_config) elif node.target == torch.ops.aten.cat.default: annotate_cat(node, quantization_config) # Expect that the inputs of the cat op are select ops - for arg in node.args[0][1:]: - annotate_single_in_single_out(arg, quantization_config) - annotate_matmul_input1(node.args[0][0], quantization_config) + for arg in node.args[0]: + annotate_matmul_input1(arg, quantization_config) else: annotate_single_in_single_out(node, quantization_config) annotate_matmul_input1(node.args[0], quantization_config) diff --git a/backends/qualcomm/quantizer/utils.py b/backends/qualcomm/quantizer/utils.py index d31b4753a3d..d3ae1194acd 100644 --- a/backends/qualcomm/quantizer/utils.py +++ b/backends/qualcomm/quantizer/utils.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
import numbers +import operator from dataclasses import dataclass from functools import partial from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple @@ -77,7 +78,7 @@ def _derive_bias_qparams_fn( def get_default_8bit_qnn_ptq_config( - act_symmetric: bool = False, act_observer=MinMaxObserver + act_symmetric: bool = False, act_observer=MovingAverageMinMaxObserver ) -> QuantizationConfig: extra_args: Dict[str, Any] = {"eps": 2**-12} @@ -96,7 +97,7 @@ def get_default_8bit_qnn_ptq_config( quant_max=torch.iinfo(torch.int8).max, qscheme=torch.per_tensor_symmetric, ch_axis=0, - observer_or_fake_quant_ctr=act_observer.with_args(**extra_args), + observer_or_fake_quant_ctr=MinMaxObserver.with_args(**extra_args), ) bias_quantization_spec = QuantizationSpec( @@ -104,7 +105,7 @@ def get_default_8bit_qnn_ptq_config( quant_min=torch.iinfo(torch.int32).min, quant_max=torch.iinfo(torch.int32).max, qscheme=torch.per_tensor_symmetric, - observer_or_fake_quant_ctr=act_observer.with_args(**extra_args), + observer_or_fake_quant_ctr=MinMaxObserver.with_args(**extra_args), ) quantization_config = QuantizationConfig( @@ -619,7 +620,13 @@ def annotate_upsample_nearest2d( annotate_single_in_single_out(node, quantization_config) -@register_annotator([torch.ops.aten.softmax.int, torch.ops.aten._softmax.default]) +@register_annotator( + [ + torch.ops.aten.softmax.int, + torch.ops.aten._softmax.default, + torch.ops.aten._safe_softmax.default, + ] +) def annotate_softmax(node: Node, quantization_config: QuantizationConfig) -> None: annotate_single_in_single_out(node, quantization_config) @@ -684,6 +691,31 @@ def annotate_squeeze(node: Node, quantization_config: QuantizationConfig) -> Non annotate_single_in_single_out(node, quantization_config) +@register_annotator([torch.ops.aten.rms_norm.default]) +def annotate_rms_norm(node: Node, quantization_config: QuantizationConfig) -> None: + act_node = node.args[0] + weight_node = node.args[2] + + if _is_annotated([node]): + return + + # TODO current only support 16a16w + _annotate_input_qspec_map( + node, + act_node, + quantization_config.input_activation, + ) + + _annotate_input_qspec_map( + node, + weight_node, + quantization_config.input_activation, + ) + nodes_to_mark_annotated = [node] + _annotate_output_qspec(node, quantization_config.output_activation) + _mark_nodes_as_annotated(nodes_to_mark_annotated) + + @register_annotator([torch.ops.aten.rsqrt.default]) def annotate_rsqrt(node: Node, quantization_config: QuantizationConfig) -> None: annotate_single_in_single_out(node, quantization_config) @@ -975,6 +1007,38 @@ def annotate_linear(node: Node, quantization_config: QuantizationConfig) -> None node.meta["source_fn_stack"] = [(node, torch.nn.Linear)] +@register_annotator([torch.ops.aten._native_batch_norm_legit_no_training.default]) +def annotate_batch_norm(node: Node, quantization_config: QuantizationConfig) -> None: + act, weight, bias = node.args[0:3] + if _is_annotated([node]): + return + + _annotate_input_qspec_map( + node, + act, + quantization_config.input_activation, + ) + # QNN requires uint8 instead of int8 in 'weight' config + _annotate_input_qspec_map( + node, + weight, + quantization_config.input_activation, + ) + _annotate_input_qspec_map( + node, + bias, + quantization_config.bias, + ) + _annotate_output_qspec(node, quantization_config.output_activation) + _mark_nodes_as_annotated([node, *node.args[0:3]]) + + +@register_annotator([operator.getitem]) +def annotate_getitem(node: Node, quantization_config: QuantizationConfig) -> None: + 
_annotate_output_qspec(node, quantization_config.output_activation) + _mark_nodes_as_annotated([node]) + + @register_annotator([torch.ops.aten.layer_norm.default]) def annotate_layer_norm(node: Node, quantization_config: QuantizationConfig) -> None: act_node = node.args[0] diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py index 319cc6092cd..e448a219284 100644 --- a/backends/qualcomm/tests/models.py +++ b/backends/qualcomm/tests/models.py @@ -55,6 +55,16 @@ def forward(self, x): return self.avgPool(x) +class BatchNorm(torch.nn.Module): + def __init__(self, n_features): + super().__init__() + self.native_batchnorm = torch.nn.BatchNorm2d(n_features) + self.eval() + + def forward(self, x): + return self.native_batchnorm(x) + + class Bmm(torch.nn.Module): def __init__(self): super().__init__() @@ -734,6 +744,16 @@ def forward(self, x): ) +class RmsNorm(torch.nn.Module): + def __init__(self): + super().__init__() + self.eps = 1e-5 + self.rms = torch.nn.RMSNorm([4], 1e-5) + + def forward(self, x): + return self.rms(x) + + class Rsqrt(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index cba23f935c2..d17fce2b839 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -16,6 +16,7 @@ from executorch.backends.qualcomm.tests.utils import ( generate_context_binary, QnnPartitioner, + QnnQuantizer, QuantDtype, TestQNN, to_backend, @@ -33,6 +34,7 @@ from_context_binary, generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, + skip_annotation, ) from executorch.examples.qualcomm.utils import setup_common_args_and_variables @@ -50,8 +52,8 @@ from executorch.examples.models.mobilenet_v3 import MV3Model from executorch.examples.models.torchvision_vit.model import TorchVisionViTModel from executorch.examples.models.wav2letter import Wav2LetterModel +from executorch.exir import to_edge from executorch.exir.backend.backend_api import disable_validation -from executorch.exir.program._program import EdgeCompileConfig, ExirExportedProgram class TestQNNFloatingPointOperator(TestQNN): @@ -81,6 +83,11 @@ def test_qnn_backend_avg_pool2d(self): sample_input = (torch.randn(1, 3, 2, 2),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_batch_norm(self): + module = BatchNorm(32) # noqa: F405 + sample_input = (torch.randn([4, 32, 16, 16]),) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_bmm(self): module = Bmm() # noqa: F405 torch.manual_seed(8) @@ -291,7 +298,6 @@ def test_qnn_backend_layer_norm(self): sample_input = (torch.randn(196, 768),) self.lower_module_and_test_output(module, sample_input) - @unittest.skip("only works on QNN 2.17") def test_qnn_backend_leaky_relu(self): test_comb = [ { @@ -334,7 +340,7 @@ def test_qnn_backend_mean_dim(self): with self.subTest(i=i): self.lower_module_and_test_output(module, sample_input) - @unittest.skip("it will hang in runtime") + @unittest.skip("failed to lower in QNN 2.25") def test_qnn_backend_mha(self): module = MultiheadAttention() # noqa: F405 sample_input = (torch.randn(1, 197, 96),) @@ -362,7 +368,6 @@ def test_qnn_backend_pow_tensor_scalar(self): sample_input = (torch.rand([2, 4, 3, 3]),) self.lower_module_and_test_output(module, sample_input) - @unittest.skip("only works on QNN 2.17") def test_qnn_backend_prelu(self): test_comb = [ { @@ -393,6 +398,11 @@ def test_qnn_backend_reshape(self): 
sample_input = (torch.randn([3, 4]),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_rms_norm(self): + module = RmsNorm() # noqa: F405 + sample_input = (torch.abs(torch.randn([1, 1, 1, 4])),) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_rsqrt(self): module = Rsqrt() # noqa: F405 sample_input = (torch.abs(torch.randn([3, 4])),) @@ -655,6 +665,12 @@ def test_qnn_backend_avg_pool2d(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_batch_norm(self): + module = BatchNorm(32) # noqa: F405 + sample_input = (torch.randn([4, 32, 16, 16]),) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_bmm(self): module = Bmm() # noqa: F405 torch.manual_seed(8) @@ -662,13 +678,6 @@ def test_qnn_backend_bmm(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) - @unittest.skip("not applicable") - def test_qnn_backend_cast(self): - module = Cast() # noqa: F405 - sample_input = (10 * torch.rand((9, 4, 5, 3)),) - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_cat(self): modules = [Cat2(), Cat3(), Cat4()] # noqa: F405 sample_input = (torch.randn(1, 1, 2, 2), torch.randn(1, 1, 4, 2)) @@ -1000,6 +1009,14 @@ def test_qnn_backend_reshape(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_rms_norm(self): + module = RmsNorm() # noqa: F405 + sample_input = (torch.abs(torch.randn([1, 1, 1, 4])),) + module = self.get_qdq_module( + module, sample_input, quant_dtype=QuantDtype.use_16a4w + ) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_rsqrt(self): module = Rsqrt() # noqa: F405 sample_input = (torch.abs(torch.randn([3, 4])),) @@ -1329,16 +1346,10 @@ def test_qnn_backend_multi_contexts_composite(self): lowered_method=to_backend, ) sample_input = module.get_random_input() - edge_prog = ExirExportedProgram( + edge_prog = to_edge( torch.export.export(module, sample_input), - after_to_edge_passes=False, - ).to_edge( - EdgeCompileConfig( - _check_ir_validity=False, - _skip_dim_order=True, # TODO(T182928844): Delegate dim order op to backend. 
- ) ) - canonicalize_program(edge_prog.exported_program) + canonicalize_program(edge_prog.exported_program()) exec_prog = edge_prog.to_executorch() self.verify_output(module.get_reference_module(), sample_input, exec_prog) @@ -1388,6 +1399,7 @@ def test_qnn_backend_online_prepare(self): sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) self.lower_module_and_test_output(module, sample_input) + @unittest.skip("segfault happens in recent torch.export.export") def test_qnn_backend_context_direct(self): with tempfile.TemporaryDirectory() as tmp_dir: module = ContextBinaryExample() # noqa: F405 @@ -1431,7 +1443,7 @@ def setUp(self): saver=False, ) - def test_qnn_backend_skip_node_id(self): + def test_qnn_backend_skip_node_id_partitioner(self): module = SimpleModel() # noqa: F405 sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) module = self.get_qdq_module(module, sample_input) @@ -1442,7 +1454,43 @@ def test_qnn_backend_skip_node_id(self): skip_node_id_set={"aten_add_tensor", "aten_mean_dim"}, ) - def test_qnn_backend_skip_node_op(self): + def test_qnn_backend_skip_node_id_quantizer(self): + module = SimpleModel() # noqa: F405 + sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) + + # define partitioner + backend_options = generate_htp_compiler_spec( + use_fp16=False, + ) + compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + ) + partitioner = QnnPartitioner(compiler_specs) + # define quantizer + quantizer = QnnQuantizer() + + # define calibration method + def calibrator(gm): + gm(*sample_input) + + # get partially lowered graph module + graph_module, exported_progs = skip_annotation( + nn_module=module, + quantizer=quantizer, + partitioner=partitioner, + sample_input=sample_input, + calibration_cb=calibrator, + fp_node_id_set={"conv2d"}, + ) + self.assertEqual(len(exported_progs), 1) + # lower the whole graph again; the skipped operators stay on CPU + exec_prog = to_edge( + torch.export.export(graph_module, sample_input), + ).to_executorch() + self.verify_output(module, sample_input, exec_prog) +
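+ # (The id-based variant above expects a single exported program, presumably
+ # because skipping the lone conv2d leaves one contiguous annotated region;
+ # the op-target variant below skips every aten.add.Tensor and splits the
+ # annotated graph into two partitions, hence its expected count of 2.)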
+ def test_qnn_backend_skip_node_op_partitioner(self): module = SimpleModel() # noqa: F405 sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) module = self.get_qdq_module(module, sample_input) @@ -1453,6 +1501,79 @@ def test_qnn_backend_skip_node_op(self): skip_node_op_set={"aten.add.Tensor"}, ) + def test_qnn_backend_skip_node_op_quantizer(self): + module = SimpleModel() # noqa: F405 + sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) + + # define partitioner + backend_options = generate_htp_compiler_spec( + use_fp16=False, + ) + compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + ) + partitioner = QnnPartitioner(compiler_specs) + # define quantizer + quantizer = QnnQuantizer() + + # define calibration method + def calibrator(gm): + gm(*sample_input) + + # get partially lowered graph module + graph_module, exported_progs = skip_annotation( + nn_module=module, + quantizer=quantizer, + partitioner=partitioner, + sample_input=sample_input, + calibration_cb=calibrator, + fp_node_op_set={torch.ops.aten.add.Tensor}, + ) + self.assertEqual(len(exported_progs), 2) + # lower the whole graph again; the skipped operators stay on CPU + exec_prog = to_edge( + torch.export.export(graph_module, sample_input), + ).to_executorch() + self.verify_output(module, sample_input, exec_prog) + + def test_qnn_backend_graph_level_mixed_precision(self): + module = SimpleModel() # noqa: F405 + sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) + + # define partitioner + backend_options = generate_htp_compiler_spec( + use_fp16=False, + ) + compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + ) + partitioner = QnnPartitioner(compiler_specs) + # define quantizer + quantizer = QnnQuantizer() + + # define calibration method + def calibrator(gm): + gm(*sample_input) + + # get partially lowered graph module + graph_module, exported_progs = skip_annotation( + nn_module=module, + quantizer=quantizer, + partitioner=partitioner, + sample_input=sample_input, + calibration_cb=calibrator, + fp_node_id_set={"add", "mean"}, + fallback_to_cpu=False, + ) + self.assertEqual(len(exported_progs), 5) + # lower the whole graph again; the skipped operators will be delegated with fp16 + exec_prog = to_edge( + torch.export.export(graph_module, sample_input), + ).to_executorch() + self.verify_output(module, sample_input, exec_prog) + def test_qnn_backend_multi_contexts(self): module = SimpleModel() # noqa: F405 sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) @@ -1493,16 +1614,10 @@ def test_qnn_backend_multi_contexts_composite(self): quantize_method=self.get_qdq_module, ) sample_input = module.get_random_input() - edge_prog = ExirExportedProgram( + edge_prog = to_edge( torch.export.export(module, sample_input), - after_to_edge_passes=False, - ).to_edge( - EdgeCompileConfig( - _check_ir_validity=False, - _skip_dim_order=True, # TODO(T182928844): Delegate dim order op to backend. - ) ) - canonicalize_program(edge_prog.exported_program) + canonicalize_program(edge_prog.exported_program()) exec_prog = edge_prog.to_executorch() self.verify_output(module.get_reference_module(), sample_input, exec_prog) @@ -1555,6 +1670,7 @@ def test_qnn_backend_online_prepare(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + @unittest.skip("segfault happens in recent torch.export.export") def test_qnn_backend_context_direct(self): with tempfile.TemporaryDirectory() as tmp_dir: module = ContextBinaryExample() # noqa: F405 @@ -2418,6 +2534,7 @@ def test_stories_single_llama(self): model_out = msg["result"][0] self.assertTrue(model_out.startswith(golden_start_with)) + @unittest.skip("dynamic shape inputs appear in recent torch.export.export") def test_mobilebert(self): if not self.required_envs([self.pretrained_weight]): self.skipTest("missing required envs") @@ -2458,13 +2575,8 @@ def test_mobilebert(self): for k, v in cpu.items(): self.assertLessEqual(abs(v[0] - htp[k][0]), 2) - @unittest.skip("will be enabled after TODOs got resolved") + @unittest.skip("eager mode fake quant works well; needs further investigation") def test_ptq_mobilebert(self): - # TODO: 2 approaches to resolve accuracy issue - # 1. fallback embedding layers: - # - skip annotation in quantizer (need PR to provide helper funciton) - # - skip operators in partitioner (use existent "skip_node_op_set") - # 2.
investigate different quantization configurations / mechanisms if not self.required_envs([self.pretrained_weight]): self.skipTest("missing required envs") @@ -2481,6 +2593,8 @@ def test_ptq_mobilebert(self): self.model, "--pretrained_weight", self.pretrained_weight, + "--ptq", + "16a16w", "--ip", self.ip, "--port", diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py index b206a7e1330..0d9e1a69679 100644 --- a/backends/qualcomm/tests/utils.py +++ b/backends/qualcomm/tests/utils.py @@ -41,7 +41,7 @@ from executorch.exir.lowered_backend_module import LoweredBackendModule from executorch.exir.pass_base import ExportPass from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass -from executorch.exir.program._program import ExecutorchProgram +from executorch.exir.program import ExecutorchProgram, ExecutorchProgramManager from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e @@ -192,7 +192,9 @@ def verify_output( with tempfile.TemporaryDirectory() as tmp_dir: buffer = ( executorch_prog.buffer - if isinstance(executorch_prog, ExecutorchProgram) + if isinstance( + executorch_prog, (ExecutorchProgram, ExecutorchProgramManager) + ) else executorch_prog.buffer() ) ( diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index 6dc0c4c3c8d..2a954f90d24 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import operator from collections import OrderedDict from typing import Callable, Dict, List, Tuple @@ -38,7 +39,11 @@ from executorch.backends.qualcomm.passes.recompose_pixel_unshuffle import ( RecomposePixelUnshuffle, ) +from executorch.backends.qualcomm.passes.recompose_rms_norm import RecomposeRmsNorm from executorch.backends.qualcomm.passes.remove_redundancy import RemoveRedundancy +from executorch.backends.qualcomm.passes.replace_index_put_input import ( + ReplaceIndexPutInput, +) from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( _soc_info_table, QcomChipset, @@ -56,6 +61,7 @@ convert_to_option, ) from executorch.backends.qualcomm.utils.constants import QCOM_QNN_COMPILE_SPEC + from executorch.exir import ExirExportedProgram from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.lowered_backend_module import LoweredBackendModule @@ -63,9 +69,74 @@ from torch._decomp import core_aten_decompositions as torch_core_aten_decompositions from torch.export.exported_program import ExportedProgram from torch.fx import passes +from torch.fx.passes.operator_support import OperatorSupportBase from torch.library import Library +class _AnnotationSkipper(OperatorSupportBase): + """ + Class used to partition out unwanted graph nodes. + e.g. 
- nodes to be excluded from quantization annotation + - nodes that have been grouped together as a submodule + + Attributes + ---------- + fp_node_id_set : set + a set of node names to be kept in fp precision + fp_node_op_set : set + a set of node targets (aten dialect) to be kept in fp precision + skip_annotated_submodule : bool + flag indicating whether annotated submodules should be skipped + + Methods + ------- + should_delegate(n: torch.fx.Node) + identify residual nodes that have not been lowered with fixed precision + should_skip(n: torch.fx.Node) + identify nodes that should be kept out of fixed-precision lowering + is_node_supported(_, node: torch.fx.Node) + overridden method for graph partitioning + """ + + def __init__( + self, + fp_node_id_set: set = None, + fp_node_op_set: set = None, + skip_annotated_submodule: bool = False, + ): + self.fp_node_id_set = fp_node_id_set + self.fp_node_op_set = fp_node_op_set + self.skip_annotated_submodule = skip_annotated_submodule + + def should_delegate(self, n: torch.fx.Node): + return n.op == "call_function" and n.target != operator.getitem + + def should_skip(self, n: torch.fx.Node): + return n.name in self.fp_node_id_set or n.target in self.fp_node_op_set + + def is_node_supported(self, _, node: torch.fx.Node) -> bool: + if self.skip_annotated_submodule: + if node.op == "get_attr": + return all(self.should_delegate(user) for user in node.users) + return self.should_delegate(node) + + if any( + [ + node.op in ("placeholder", "output"), + self.should_skip(node), + # check if parameters belong to a fallback operator + ( + node.op == "get_attr" + and all(self.should_skip(user) for user in node.users) + ), + ] + ): + print(f"[QNN Quantizer Annotation]: {node.name} | Skipped") + return False + + return True + + def qnn_capture_config(): return exir.CaptureConfig(enable_aot=True) @@ -184,8 +255,10 @@ def get_decomp_table() -> Dict[torch._ops.OperatorBase, Callable]: # The below super ops are supported by QNN remove_decompositions = [ torch.ops.aten.pixel_shuffle.default, + torch.ops.aten.pixel_unshuffle.default, torch.ops.aten.hardsigmoid.default, torch.ops.aten.hardswish.default, + torch.ops.aten._safe_softmax.default,
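+ # (presumably emitted by recent torch.export, e.g. via scaled_dot_product_attention;
+ # keeping it un-decomposed lets the new annotate_softmax registration lower it to QNN directly)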
"") == i + ] + # fuse group nodes into submodule + sorted_nodes = topo_sort(node_list) + submodule_name = f"{subgm_tag}_{i}" + subgm, orig_inputs, orig_outputs = fuse_as_graphmodule( + gm, sorted_nodes, submodule_name + ) + # insert submodule & trim group nodes + gm = insert_subgm( + gm, + subgm_cb(subgm, submodule_name), + orig_inputs, + orig_outputs, + ) + erase_nodes(gm, sorted_nodes) + legalize_graph(gm) + + gm.recompile() + return gm + + +def _canonicalize_graph_with_lowered_module(gm, subgm_tag, ptn): + from executorch.exir.backend.backend_api import to_backend + + # return lowered program for user to debug + exported_progs = [] + # partition each submodule which went through convert_pt2e + for node in gm.graph.nodes: + if node.op == "call_module" and subgm_tag in node.name: + # obtain sample inputs through meta + subgm_input = [ + torch.ones(arg.meta["val"].shape, dtype=arg.meta["val"].dtype) + for arg in node.args + ] + # program meets QNN backend requirement + sub_prog = capture_program(gm.get_submodule(node.name), tuple(subgm_input)) + # start lowering with given partitioner + exported_progs.append(to_backend(sub_prog.exported_program, ptn)) + # replace submodule with lowered module + gm.set_submodule( + node.name, + exported_progs[-1].graph_module, + ) + # if node has multiple outputs, getitems will be default generated + if all(n.target != operator.getitem for n in node.users): + with gm.graph.inserting_after(node): + getitem_node = gm.graph.call_function( + operator.getitem, + (node, 0), + ) + getitem_node.meta = node.meta + node.replace_all_uses_with( + replace_with=getitem_node, + delete_user_cb=lambda user: user.target != operator.getitem, + ) + + gm.recompile() + return gm, exported_progs + + +def skip_annotation( + nn_module: torch.nn.Module, + quantizer, + partitioner, + sample_input: Tuple[torch.Tensor, ...], + calibration_cb: Callable[[torch.fx.GraphModule], None], + fp_node_id_set: set = None, + fp_node_op_set: set = None, + fallback_to_cpu: bool = True, +): + r""" + Exclude speific operators from quantizer annotation. + Skipped operators will defaultly stay in CPU, set 'fallback_to_cpu' + to False for trying to delegate them with FP16 precision. + + e.g.: consider following graph: + bias_1 weight_1 input_1 bias_2 weight_2 input_2 + | (placeholder) | | (placeholder) | + \ | / \ | / + \ | / \ | / + \ | / \ | / + conv2d_1 conv2d_2 + (torch.ops.aten.conv2d.default) + \ / + \ / + \_______ _______/ + add_1 + (torch.ops.aten.add.default) + | + output + + If user wants to skip convolution op by names with + 'skip_node_id_set' = {"conv2d_1"} + "bias_1 / weight_1 / input_1 / input_2 / conv2d_1" + will be partitioned out and not annotated / lowered with QNN. + + [Generated graph] + bias_1 weight_1 input_1 input_2 + | (placeholder) | | + \ | / | + \ | / | + \ | / | + conv2d_1 | + \ / + \ / + \ / + lowered_module_1 + (QNN fixed precision) + | + output + + If user wants to skip convolution op by target with + 'skip_node_op_set' = {torch.ops.aten.conv2d.default} + "bias_1 / weight_1 / input_1 / conv2d_1, + bias_2 / weight_2 / input_2 / conv2d_2" + will be partitioned out and not annotated / lowered with QNN. 
+def skip_annotation( + nn_module: torch.nn.Module, + quantizer, + partitioner, + sample_input: Tuple[torch.Tensor, ...], + calibration_cb: Callable[[torch.fx.GraphModule], None], + fp_node_id_set: set = None, + fp_node_op_set: set = None, + fallback_to_cpu: bool = True, +): + r""" + Exclude specific operators from quantizer annotation. + Skipped operators stay on CPU by default; set 'fallback_to_cpu' + to False to try delegating them with FP16 precision. + + e.g. consider the following graph: + bias_1 weight_1 input_1 bias_2 weight_2 input_2 + | (placeholder) | | (placeholder) | + \ | / \ | / + \ | / \ | / + \ | / \ | / + conv2d_1 conv2d_2 + (torch.ops.aten.conv2d.default) + \ / + \ / + \_______ _______/ + add_1 + (torch.ops.aten.add.default) + | + output + + If the user wants to skip a convolution op by name with + 'fp_node_id_set' = {"conv2d_1"} + "bias_1 / weight_1 / input_1 / input_2 / conv2d_1" + will be partitioned out and not annotated / lowered with QNN. + + [Generated graph] + bias_1 weight_1 input_1 input_2 + | (placeholder) | | + \ | / | + \ | / | + \ | / | + conv2d_1 | + \ / + \ / + \ / + lowered_module_1 + (QNN fixed precision) + | + output + + If the user wants to skip convolution ops by target with + 'fp_node_op_set' = {torch.ops.aten.conv2d.default} + "bias_1 / weight_1 / input_1 / conv2d_1, + bias_2 / weight_2 / input_2 / conv2d_2" + will be partitioned out and not annotated / lowered with QNN. + + [Generated graph] + bias_1 weight_1 input_1 bias_2 weight_2 input_2 + | (placeholder) | | (placeholder) | + \ | / \ | / + \ | / \ | / + \ | / \ | / + conv2d_1 conv2d_2 + (torch.ops.aten.conv2d.default) + \ / + \ / + \__ __/ + lowered_module_1 + (QNN fixed precision) + | + output + + If the user wants to delegate the skipped conv2d from the above graph + with 'fallback_to_cpu' = False: + + [Generated graph] + input_1 input_2 + (placeholder) (placeholder) + | | + \ / + lowered_module_2 + (QNN fp16 precision) + | + | + lowered_module_1 + (QNN fixed precision) + | + output + + Args: + nn_module (torch.nn.Module): The module to be lowered. + quantizer (QnnQuantizer): Instance of QnnQuantizer. + partitioner (QnnPartitioner): Instance of QnnPartitioner. + sample_input ((torch.Tensor, ...)): Sample input tensors for graph exporting. + calibration_cb (callable): Callback function for user-defined calibration. + fp_node_id_set ({str, ...}): Set of operator names to be left in fp precision. + fp_node_op_set ({torch.ops.aten.xxx, ...}): Set of operator targets to be left in fp precision. + fallback_to_cpu (bool): Whether skipped nodes stay on CPU (True) or are delegated with fp16 (False). + + Returns: + graph_module: The partially lowered graph module, ready to be lowered again. + exported_programs: List of programs lowered to QnnBackend (quantized graphs, plus fp16 graphs when fallback_to_cpu is False). + """ + from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( + QnnExecuTorchHtpPrecision, + ) + from executorch.backends.qualcomm.serialization.qnn_compile_spec_serialize import ( + convert_to_option, + ) + from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e + from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner + + def prepare_subgm(subgm, subgm_name): + # prepare current submodule for quantization annotation + subgm_prepared = prepare_pt2e(subgm, quantizer) + # overwrite this attribute, or the name will be set to "GraphModule"; + # we could not tell the submodules apart if this is not done + subgm_prepared.__class__.__name__ = subgm_name + return subgm_prepared + + fp_node_id_set = fp_node_id_set if fp_node_id_set is not None else set() + fp_node_op_set = fp_node_op_set if fp_node_op_set is not None else set() + graph_module = torch.export.export(nn_module, sample_input).module() + # define node support type + capability_partitioner = CapabilityBasedPartitioner( + graph_module, + _AnnotationSkipper(fp_node_id_set, fp_node_op_set), + allows_single_node_partition=True, + ) + subgm_tag = "annotated_group" + graph_module = _partition_graph_into_submodules( + gm=graph_module, + subgm_tag=subgm_tag, + subgm_cb=prepare_subgm, + ptn=capability_partitioner, + ) + # perform calibration + calibration_cb(graph_module) + # convert submodules which went through prepare_pt2e + for node in graph_module.graph.nodes: + if node.op == "call_module": + graph_module.set_submodule( + node.name, convert_pt2e(graph_module.get_submodule(node.name)) + ) + # canonicalize graph for lowering again + graph_module, exported_progs = _canonicalize_graph_with_lowered_module( + gm=graph_module, + subgm_tag=subgm_tag, + ptn=partitioner, + ) + + if not fallback_to_cpu: + try: + from executorch.exir.backend.partitioner import DelegationSpec + + # change HTP compiler spec for hardware to enable fp16 + qnn_option = generate_qnn_executorch_option( + partitioner.compiler_specs_snapshot + ) + compile_option = convert_to_option(qnn_option) + htp_options = compile_option.backend_options.htp_options + htp_options.precision = QnnExecuTorchHtpPrecision.kHtpFp16 + partitioner.delegation_spec = DelegationSpec( + "QnnBackend", + [ +
CompileSpec( + QCOM_QNN_COMPILE_SPEC, convert_to_flatbuffer(compile_option) + ) + ], + ) + except Exception: + print( + "Failed to change HTP compiler spec with 'use_fp16' as True;" + " skipped operators will fall back to CPU." + ) + return graph_module, exported_progs + + # try lowering skipped operators into fp16 + capability_partitioner = CapabilityBasedPartitioner( + graph_module, + _AnnotationSkipper(skip_annotated_submodule=True), + allows_single_node_partition=True, + ) + subgm_tag = "skipped_group" + graph_module = _partition_graph_into_submodules( + gm=graph_module, + subgm_tag=subgm_tag, + subgm_cb=lambda subgm, _: subgm, + ptn=capability_partitioner, + ) + graph_module, exported_progs_fp = _canonicalize_graph_with_lowered_module( + gm=graph_module, + subgm_tag=subgm_tag, + ptn=partitioner, + ) + exported_progs.extend(exported_progs_fp) + + return graph_module, exported_progs + + +
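A minimal usage sketch for skip_annotation, mirroring the new tests earlier in this diff (`model`, `sample_input`, and the `compiler_specs` built via generate_htp_compiler_spec / generate_qnn_executorch_compiler_spec are assumed from there):

    quantizer = QnnQuantizer()
    partitioner = QnnPartitioner(compiler_specs)
    graph_module, exported_progs = skip_annotation(
        nn_module=model,
        quantizer=quantizer,
        partitioner=partitioner,
        sample_input=sample_input,
        calibration_cb=lambda gm: gm(*sample_input),
        fp_node_op_set={torch.ops.aten.add.Tensor},
    )
    # lower the mixed graph once more; with fallback_to_cpu=True (the default)
    # the skipped adds stay on CPU
    exec_prog = to_edge(
        torch.export.export(graph_module, sample_input)
    ).to_executorch()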
def from_context_binary( ctx_path: str, op_name: str, soc_model: QcomChipset = QcomChipset.SM8650 ): diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 6fe6746ec0d..dc507f91626 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -356,6 +356,14 @@ vkapi::VulkanBuffer& vTensor::buffer( return storage_.buffer_; } +utils::uvec3 vTensor::mapped_extents() const { + utils::uvec3 m_extents; + m_extents[0] = storage_.image_extents_[axis_mapping_.at(0)]; + m_extents[1] = storage_.image_extents_[axis_mapping_.at(1)]; + m_extents[2] = storage_.image_extents_[axis_mapping_.at(2)]; + return m_extents; +} + const vkapi::BufferBindInfo vTensor::sizes_ubo() { if (!sizes_uniform_.buffer()) { sizes_uniform_ = diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 70f363796fd..31052b351de 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -347,10 +347,25 @@ class vTensor final { return storage_.storage_type_ == utils::kBuffer; } + /* + * Returns the raw image extents of the underlying image texture used to store + * the tensor's data. Note that due to axis mapping, the X, Y, and Z extents + * may not correspond to the width, height, or channels dimension of the + * tensor. + */ inline const utils::uvec3& image_extents() const { return storage_.image_extents_; } + /* + * Returns the image extents of the underlying image texture, but re-ordered + * such that the first element is the extent of the axis used to represent the + * tensor's width dimension, the second element is the extent of the axis used + * to represent the tensor's height dimension, and the third element is the + * extent of the axis used to represent the tensor's channels dimension. + */ + utils::uvec3 mapped_extents() const; + /* * Extract an `vkapi::ScalarType` from the TensorOptions member */ diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index afdc8290cdd..46787955336 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -288,6 +288,10 @@ class ComputeGraph final { return values_.at(idx).toConstTensor().image_extents(); } + inline utils::uvec3 mapped_extents_of(const ValueRef idx) const { + return values_.at(idx).toConstTensor().mapped_extents(); + } + inline int32_t numel_of(const ValueRef idx) const { return values_.at(idx).toConstTensor().numel(); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl index 1698efb0b15..6e964c745e3 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl @@ -16,90 +16,219 @@ $if MAT2_IS_TRANSPOSED: $if BATCH_MODE: #define BATCH_MODE -$if TILE_ROW == "tile_row_2": - #define TILE_ROW_2 +$if HAS_BIAS: + #define HAS_BIAS #include "indexing_utils.h" -#include "matmul.h" -// addmm will have additional arguments compared to regular mm -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out; -layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1; -layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2; -layout(set = 0, binding = 3) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_self; +${layout_declare_tensor(B, "w", "out_tensor", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "mat1_tensor", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "mat2_tensor", DTYPE, "texture3d")} +$if HAS_BIAS: + ${layout_declare_tensor(B, "r", "bias_tensor", DTYPE, "texture3d")} +${layout_declare_ubo(B, "ivec4", "out_sizes")} +${layout_declare_ubo(B, "ivec4", "out_axis_mapping")} +${layout_declare_ubo(B, "ivec4", "mat1_sizes")} +${layout_declare_ubo(B, "ivec4", "mat1_axis_mapping")} +${layout_declare_ubo(B, "ivec4", "mat2_sizes")} +${layout_declare_ubo(B, "ivec4", "mat2_axis_mapping")} +$if HAS_BIAS: + ${layout_declare_ubo(B, "ivec4", "bias_sizes")} + ${layout_declare_ubo(B, "ivec4", "bias_axis_mapping")} + ${layout_declare_ubo(B, "float", "alpha", "float", "beta")} -layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -layout(set = 0, binding = 5) uniform PRECISION restrict OutSizes { - ivec4 out_sizes; -}; +layout(constant_id = 3) const int out_packed_dim = C_DIM; -layout(set = 0, binding = 6) uniform PRECISION restrict SelfSizes { - ivec4 self_sizes; -}; +// To convince the SPIR-V compiler to unroll the loops optimally, we need this +// macro +#define FOUR 4 -layout(set = 0, binding = 7) uniform PRECISION restrict InLimits { - ivec3 in_limits; +#define TILE_ROWS ${TILE_ROWS} + +// we avoid mat4 and vec4 usage here as they compile to much less efficient +// SPIR-V +struct FloatMatrix_2d { + float data[TILE_ROWS][FOUR]; }; -layout(set = 0, binding = 8) uniform PRECISION restrict Params { - float alpha; - float beta; +struct FloatMatrix_3d { + float data[TILE_ROWS][FOUR][FOUR]; }; -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; +#ifdef BATCH_MODE + #define FloatMatrix FloatMatrix_3d +#else + #define FloatMatrix FloatMatrix_2d +#endif
// BATCH_MODE + +#ifdef HAS_BIAS +// get texel from self tensor (channel_packed) in addmm +vec4 get_texel_C_packed(const ivec2 idx) { + ivec3 bias_pos = ivec3(0); + if (bias_sizes.x > 1) { + bias_pos[bias_axis_mapping.x] = idx.x; + } + if (bias_sizes.y > 1) { + bias_pos[bias_axis_mapping.y] = idx.y; + } -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); + return texelFetch(bias_tensor, bias_pos, 0); +} +#endif // HAS_BIAS + +FloatMatrix matmul_partial(const ivec4 out_idx_tl) { + FloatMatrix results; + for (int i = 0; i < TILE_ROWS; i++) { + for (int j = 0; j < FOUR; j++) { +#ifdef BATCH_MODE + for (int k = 0; k < FOUR; k++) { + results.data[i][j][k] = 0.0f; + } +#else + results.data[i][j] = 0.0f; +#endif // BATCH_MODE + } + } + vec4 mat1_tensor_partial_load[TILE_ROWS]; + vec4 mat2_tensor_partial_load[FOUR]; + +#ifdef MAT2_IS_TRANSPOSED + const int mat2_k_axis = mat2_axis_mapping.x; + const int mat2_row_axis = mat2_axis_mapping.y; +#else + const int mat2_k_axis = mat2_axis_mapping.y; + const int mat2_row_axis = mat2_axis_mapping.x; +#endif // MAT2_IS_TRANSPOSED + +#ifdef BATCH_MODE + for (int batch_idx = 0; batch_idx < FOUR; batch_idx++) { + if (out_idx_tl.z + batch_idx >= out_sizes.z) { + break; + } +#endif // BATCH_MODE + for (int k = 0; k < mat1_sizes.x; k+=4) { + const int k_div4 = k >> 2; + // read and cache (4 x TILE_ROWS) tile of mat1 + for (int r = 0; r < TILE_ROWS; r++) { + ivec3 mat1_pos = ivec3(0); + mat1_pos[mat1_axis_mapping.x] = k_div4; + mat1_pos[mat1_axis_mapping.y] = out_idx_tl.y + r; +#ifdef BATCH_MODE + mat1_pos[mat1_axis_mapping.z] = out_idx_tl.z + batch_idx; +#endif // BATCH_MODE + + mat1_tensor_partial_load[r] = texelFetch(mat1_tensor, mat1_pos, 0); + } - if (any(greaterThanEqual(pos, out_limits))) { - return; + // read and cache (4 x 4) tile of mat2 + for (int r = 0; r < FOUR; ++r) { + ivec3 mat2_pos = ivec3(0); + mat2_pos[mat2_k_axis] = k_div4; + mat2_pos[mat2_row_axis] = out_idx_tl.x + r; +#if defined(BATCH_MODE) && !defined(MAT2_IS_TRANSPOSED) + mat2_pos[mat2_axis_mapping.z] = out_idx_tl.z + batch_idx; +#endif // BATCH_MODE + + mat2_tensor_partial_load[r] = texelFetch(mat2_tensor, mat2_pos, 0); + } + + // perform partial dot products and add partial result to results + for (int out_row = 0; out_row < TILE_ROWS; out_row++) { + for (int out_col = 0; out_col < FOUR; out_col++) { +#ifdef BATCH_MODE + results.data[out_row][out_col][batch_idx] += +#else + results.data[out_row][out_col] += +#endif // BATCH_MODE + dot(mat1_tensor_partial_load[out_row], mat2_tensor_partial_load[out_col]); + } + } } +#ifdef BATCH_MODE + } +#endif // BATCH_MODE + + return results; +} - $if BATCH_MODE: - FloatMatrix_3d results = matmul_partial_3d( - im_mat1, - im_mat2, - pos, - out_sizes[2], - in_limits[0]); - $else: - FloatMatrix_2d results = matmul_partial_2d( - im_mat1, - im_mat2, - pos, - out_sizes[2], - in_limits[0]); - - for (int idx_c = 0; idx_c < TILE_ROWS; idx_c++) { - for (int idx_r = 0; idx_r < FOUR; idx_r++) { - const ivec3 out_pos = - ivec3(idx_r + FOUR * pos.x, idx_c + TILE_ROWS * pos.y, pos.z); - - vec4 self_texel = get_texel_C_packed( - im_self, - out_pos, - self_sizes.x == 1, - self_sizes.y == 1); - - // results is in transposed order w.r.t. 
the desired output - $if BATCH_MODE: - imageStore( - im_out, - out_pos, - vec4( - beta * self_texel.x + alpha * results.data[idx_c][idx_r][0], - beta * self_texel.x + alpha * results.data[idx_c][idx_r][1], - beta * self_texel.x + alpha * results.data[idx_c][idx_r][2], - beta * self_texel.x + alpha * results.data[idx_c][idx_r][3])); - $else: - imageStore( - im_out, - out_pos, - vec4( - beta * self_texel.x + alpha * results.data[idx_c][idx_r], 0.0, 0.0, 0.0)); +// +// Write result matrix to output (3D matmul) +// + +void write_results_C_packed(const ivec4 out_idx_tl, FloatMatrix results) { + ivec3 out_pos = to_texture_pos( + out_idx_tl, out_sizes, out_axis_mapping, out_packed_dim); + + for (int tile_c = 0; + tile_c < TILE_ROWS; + tile_c++, out_pos[out_axis_mapping.y]++) { + out_pos[out_axis_mapping.x] = out_idx_tl.x; + + for (int tile_r = 0; + tile_r < FOUR; + tile_r++, out_pos[out_axis_mapping.x]++) { + +#ifdef HAS_BIAS + ivec2 bias_idx; + bias_idx[bias_axis_mapping.x] = out_pos[out_axis_mapping.x]; + bias_idx[bias_axis_mapping.y] = out_pos[out_axis_mapping.y]; + float bias_val = get_texel_C_packed(bias_idx).x; +#ifdef BATCH_MODE + vec4 bias_texel = vec4(bias_val); +#else + vec4 bias_texel = vec4(bias_val, 0, 0, 0); +#endif // BATCH_MODE +#endif // HAS_BIAS + +#ifdef BATCH_MODE + vec4 out_texel = vec4( + results.data[tile_c][tile_r][0], + results.data[tile_c][tile_r][1], + results.data[tile_c][tile_r][2], + results.data[tile_c][tile_r][3]); +#else + vec4 out_texel = vec4( + results.data[tile_c][tile_r], + 0.0, + 0.0, + 0.0); +#endif // BATCH_MODE + +#ifdef HAS_BIAS + imageStore(out_tensor, out_pos, beta * bias_texel + alpha * out_texel); +#else + imageStore(out_tensor, out_pos, out_texel); +#endif // HAS_BIAS } } } + +void main() { + // Each thread is responsible for calculating a (4 x TILE_ROWS x 1) tile of + // output elements. If the input matrices are 3D, then a (4 x TILE_ROWS x 4) + // tile of output elements will be computed. Note the sizes are written in + // (W x H x C) format. 
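+ // For example (illustrative numbers): with TILE_ROWS = 4 and BATCH_MODE
+ // defined, the thread with tile_idx = (1, 2, 3) computes the tile whose
+ // top-left tensor index is (W=4, H=8, C=12, N=0).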
+ const ivec3 tile_idx = ivec3(gl_GlobalInvocationID); + + // Calculate the tensor index of the top left element in the output tile + const ivec4 out_idx_topleft = ivec4( + tile_idx.x * 4, + tile_idx.y * TILE_ROWS, +#ifdef BATCH_MODE + tile_idx.z * 4, +#else + tile_idx.z, +#endif // BATCH_MODE + 0); + + // If the top left element is already out of range, then skip + if (any(greaterThanEqual(out_idx_topleft, out_sizes))) { + return; + } + + FloatMatrix results = matmul_partial(out_idx_topleft); + + write_results_C_packed(out_idx_topleft, results); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml index b958d3b9543..c82c2003d20 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml @@ -7,24 +7,37 @@ addmm_optimized: parameter_names_with_default_values: DTYPE: float - NDIM: 3 - PACKING: C_packed MAT2_IS_TRANSPOSED: false BATCH_MODE: false - TILE_ROW: tile_row_4 + TILE_ROWS: 4 + HAS_BIAS: true generate_variant_forall: - TILE_ROW: - - VALUE: tile_row_4 - - VALUE: tile_row_2 + TILE_ROWS: + - VALUE: 4 + SUFFIX: tile_row_4 + - VALUE: 2 + SUFFIX: tile_row_2 DTYPE: - VALUE: float - VALUE: half shader_variants: - NAME: addmm_optimized + - NAME: matmul_optimized + HAS_BIAS: false - NAME: linear_optimized MAT2_IS_TRANSPOSED: true + - NAME: matmul_transposed_optimized + MAT2_IS_TRANSPOSED: true + HAS_BIAS: false - NAME: batch_addmm_optimized BATCH_MODE: true + - NAME: batch_matmul_optimized + BATCH_MODE: true + HAS_BIAS: false - NAME: batch_linear_optimized MAT2_IS_TRANSPOSED: true BATCH_MODE: true + - NAME: batch_matmul_transposed_optimized + MAT2_IS_TRANSPOSED: true + BATCH_MODE: true + HAS_BIAS: false diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl deleted file mode 100644 index 8634371a7b4..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -$if MAT2_IS_TRANSPOSED: - #define MAT2_IS_TRANSPOSED - -$if BATCH_MODE: - #define BATCH_MODE - -$if TILE_ROW == "tile_row_2": - #define TILE_ROW_2 - -#include "indexing_utils.h" -#include "matmul.h" - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out; -layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1; -layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2; - -layout(set = 0, binding = 3) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; - -layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes { - ivec4 out_sizes; -}; - -layout(set = 0, binding = 5) uniform PRECISION restrict InLimits { - ivec3 in_limits; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - $if BATCH_MODE: - FloatMatrix_3d results = matmul_partial_3d( - im_mat1, - im_mat2, - pos, - out_sizes[2], - in_limits[0]); - $else: - FloatMatrix_2d results = matmul_partial_2d( - im_mat1, - im_mat2, - pos, - out_sizes[2], - in_limits[0]); - - for (int idx_c = 0; idx_c < TILE_ROWS; idx_c++) { - for (int idx_r = 0; idx_r < FOUR; idx_r++) { - const ivec3 out_pos = - ivec3(idx_r + FOUR * pos.x, idx_c + TILE_ROWS * pos.y, pos.z); - - // results is in transposed order w.r.t. the desired output - $if BATCH_MODE: - imageStore( - im_out, - out_pos, - vec4( - results.data[idx_c][idx_r][0], - results.data[idx_c][idx_r][1], - results.data[idx_c][idx_r][2], - results.data[idx_c][idx_r][3])); - $else: - imageStore( - im_out, - out_pos, - vec4(results.data[idx_c][idx_r], 0.0, 0.0, 0.0)); - } - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml b/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml deleted file mode 100644 index 9268d5a25aa..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -matmul_optimized: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - PACKING: C_packed - MAT2_IS_TRANSPOSED: false - BATCH_MODE: false - TILE_ROW: tile_row_4 - generate_variant_forall: - TILE_ROW: - - VALUE: tile_row_4 - - VALUE: tile_row_2 - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: matmul_optimized - - NAME: matmul_transposed_optimized - MAT2_IS_TRANSPOSED: true - - NAME: batch_matmul_optimized - BATCH_MODE: true - - NAME: batch_matmul_transposed_optimized - MAT2_IS_TRANSPOSED: true - BATCH_MODE: true diff --git a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp index 63b60bf52f7..14c814b084a 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp @@ -174,10 +174,19 @@ void add_addmm_optimized_node( add_dtype_suffix(kernel_name, graph.dtype_of(out)); utils::uvec3 global_size; + + // Each thread computes a W=4 x H=(2/4) x C=(1/4) output tile. Therefore, the + // total number of threads is W/4 x H/(2 or 4) x C/1. Since the out tensor is + // channels packed, C does not need to be divided by 4.
The "identity" of each + // thread is the (x, y, z) coordinate of the output tile it is computing, and + // this identity can be used to compute the tensor index of the top left + // element in the tile, which will be [W=x*(2 or 4), H=y*4, C=z*(1 or 4), N=0] if (mat1_sizes.at(mat1_dims - 2) < 8) { - global_size = utils::divup_vec(graph.image_extents_of(out), {4, 2, 1}); + // Use `mapped_extents` instead of `image_extents` because the workgroup + // axes need to correspond to tensor dimensions. + global_size = utils::divup_vec(graph.mapped_extents_of(out), {4, 2, 1}); } else { - global_size = utils::divup_vec(graph.image_extents_of(out), {4, 4, 1}); + global_size = utils::divup_vec(graph.mapped_extents_of(out), {4, 4, 1}); } utils::uvec3 local_size = adaptive_work_group_size(global_size); @@ -191,14 +200,18 @@ void add_addmm_optimized_node( {{mat1_W_packed, mat2_packed, self}, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - graph.texture_limits_ubo(out), graph.sizes_ubo(out), + graph.axis_mapping_ubo(out), + graph.sizes_ubo(mat1_W_packed), + graph.axis_mapping_ubo(mat1_W_packed), + graph.sizes_ubo(mat2_packed), + graph.axis_mapping_ubo(mat2_packed), graph.sizes_ubo(self), - graph.texture_limits_ubo(mat1_W_packed), + graph.axis_mapping_ubo(self), graph.create_params_buffer(params), }, // Specialization Constants - {}, + {graph.packed_dim_whcn_idx_of(out)}, // Resizing Logic resize_addmm_node, {mat2_is_transposed})); diff --git a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp index a25a602e38f..07618239a65 100644 --- a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp @@ -181,12 +181,21 @@ void add_matmul_optimized_node( add_dtype_suffix(kernel_name, graph.dtype_of(out)); + // Each thread computes a W=(2/4) x H=4 x C=(1/4) output tile. Therefore, the + // total number of threads is W/(2 or 4) x H/4 x C/1. Since the out tensor is + // channels packed, C does not need to be divided by 4. The "identity" of each + // thread is the (x, y, z) coordinate of the output tile it is computing, and + // this identity can be used to compute the tensor index of the top left + // element in the tile, which will be [W=x*(2 or 4), H=y*4, C=z*(1 or 4), N=0] utils::uvec3 global_size; if (mat1_sizes.at(mat1_dims - 2) < 8) { - global_size = utils::divup_vec(graph.image_extents_of(out), {4, 2, 1}); + // Use `mapped_extents` instead of `image_extents` because the workgroup + // axes need to correspond to tensor dimensions. 
+ global_size = utils::divup_vec(graph.mapped_extents_of(out), {4, 2, 1}); } else { - global_size = utils::divup_vec(graph.image_extents_of(out), {4, 4, 1}); + global_size = utils::divup_vec(graph.mapped_extents_of(out), {4, 4, 1}); } + utils::uvec3 local_size = adaptive_work_group_size(global_size); graph.execute_nodes().emplace_back(new ExecuteNode( @@ -199,12 +208,15 @@ void add_matmul_optimized_node( {{mat1_W_packed, mat2_packed}, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - graph.texture_limits_ubo(out), graph.sizes_ubo(out), - graph.texture_limits_ubo(mat1_W_packed), + graph.axis_mapping_ubo(out), + graph.sizes_ubo(mat1_W_packed), + graph.axis_mapping_ubo(mat1_W_packed), + graph.sizes_ubo(mat2_packed), + graph.axis_mapping_ubo(mat2_packed), }, // Specialization Constants - {}, + {graph.packed_dim_whcn_idx_of(out)}, // Resizing Logic resize_matmul_node, {mat2_is_transposed})); diff --git a/backends/vulkan/tools/gpuinfo/include/architecture.h b/backends/vulkan/tools/gpuinfo/include/architecture.h index 20c6254e1a0..9af908eb170 100644 --- a/backends/vulkan/tools/gpuinfo/include/architecture.h +++ b/backends/vulkan/tools/gpuinfo/include/architecture.h @@ -242,7 +242,7 @@ void warp_size(const App& app, const bool verbose = false) { }); std::vector data(app.nthread_logic); - copy_staging_to_ptr(out_buf, data.data(), out_buf.nbytes()); + out_buf.copy_to(data.data(), out_buf.nbytes()); if (verbose) { std::stringstream ss; diff --git a/backends/xnnpack/passes/convert_to_linear.py b/backends/xnnpack/passes/convert_to_linear.py index 69f882523c8..2cef71bf927 100644 --- a/backends/xnnpack/passes/convert_to_linear.py +++ b/backends/xnnpack/passes/convert_to_linear.py @@ -13,9 +13,8 @@ from executorch.backends.transforms.addmm_mm_to_linear import ( apply_addmm_mm_to_linear_transform, ) -from executorch.backends.xnnpack.passes.xnnpack_pass import XNNPACKPass -from executorch.backends.xnnpack.utils.utils import is_param_node from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass from torch.fx.passes.infra.pass_base import PassResult from torch.fx.passes.utils.source_matcher_utils import ( @@ -27,7 +26,7 @@ logger.setLevel(logging.WARNING) -class ConvertToLinearPass(XNNPACKPass): +class ConvertToLinearPass(ExportPass): linear_modules = [ torch.nn.Linear, torch.nn.functional.linear, @@ -71,28 +70,24 @@ def get_arg(node: torch.fx.Node, arg: str): map_ = {"input": 0, "weight": 1} return None if arg == "bias" else node.args[map_[arg]] - def find_bias_for_mm(self, src_partition: SourcePartition, weight: torch.fx.Node): + def find_bias_for_mm(self, src_partition: SourcePartition, mm_node: torch.fx.Node): """ For linear decomposed with mm + add, find bias in src partition """ - out_channels = get_shape(weight)[0] - bias = None - - # Try to find bias node in all nodes - for node in src_partition.nodes: - if is_param_node(self.exported_program, node) and node != weight: - bias = node - - if bias is not None: - assert get_shape(bias) == [ - out_channels - ], f"Expected bias shape {[out_channels]} but got {get_shape(bias)}" - else: - assert exir_ops.edge.aten.add.Tensor not in [ - node.target for node in src_partition.nodes - ], f"Expecting to find bias for Linear module: {src_partition} but could not find it" - return bias + mm_users = list(mm_node.users.keys()) + if len(mm_users) != 1: + return None + + add_node = mm_users[0] + if add_node.target != exir_ops.edge.aten.add.Tensor: + return None + + for arg in add_node.all_input_nodes: + if 
arg != mm_node and arg in src_partition.input_nodes: + return arg + + return None def create_linear( self, @@ -119,7 +114,7 @@ def create_linear( src_partition.input_nodes + src_partition.params, # bias can be in params ) if linear_bias is None and node.target == exir_ops.edge.aten.mm.default: - linear_bias = self.find_bias_for_mm(src_partition, linear_weight) + linear_bias = self.find_bias_for_mm(src_partition, node) logger.debug(f"Found bias(?): {linear_bias} from node {node}") diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 3c076cc5bdf..917512d71b6 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -54,20 +54,6 @@ build_android_native_library() { fi cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config Release - cmake examples/models/llama2 \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI="$ANDROID_ABI" \ - -DANDROID_PLATFORM=android-23 \ - -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DCMAKE_BUILD_TYPE=Release \ - -B"${CMAKE_OUT}"/examples/models/llama2 - - cmake --build "${CMAKE_OUT}"/examples/models/llama2 -j "${CMAKE_JOBS}" --config Release - - cmake extension/android \ -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="${ANDROID_ABI}" \ @@ -75,6 +61,7 @@ build_android_native_library() { -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_LOG_LEVEL=Info \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_LLAMA_JNI=ON \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/extension/android @@ -110,7 +97,7 @@ build_aar() { find jni -type f -name "libexecutorch_jni.so" -exec bash -c 'mv "$1" "${1/_jni/}"' bash {} \; # Zip all necessary files into the AAR file zip -r executorch.aar libs jni/*/libexecutorch.so jni/*/libqnn*.so jni/*/libQnn*.so AndroidManifest.xml - zip -r executorch-llama.aar libs jni/*/libexecutorch_llama_jni.so jni/*/libqnn*.so jni/*/libQnn*.so AndroidManifest.xml + zip -r executorch-llama.aar libs jni/*/libexecutorch.so jni/*/libqnn*.so jni/*/libQnn*.so AndroidManifest.xml popd } diff --git a/codegen/templates/RegisterCodegenUnboxedKernels.cpp b/codegen/templates/RegisterCodegenUnboxedKernels.cpp index a7790be7fed..3076cde1a99 100644 --- a/codegen/templates/RegisterCodegenUnboxedKernels.cpp +++ b/codegen/templates/RegisterCodegenUnboxedKernels.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include #include "${fn_header}" // Generated Function import headers @@ -21,7 +22,8 @@ // JIT op registry instead of c10 dispatcher. JIT op registry only takes boxed // kernels, so we are calling unboxing functions in UnboxingFunctions.h to cast // arguments into C++ types (instead of IValue) and delegate to unboxed kernels. -using KernelArrayRef = ::torch::executor::ArrayRef<::torch::executor::Kernel>; +using KernelSpan = + ::executorch::runtime::Span; namespace torch { namespace executor { namespace function { @@ -31,15 +33,15 @@ static Kernel kernels_to_register[] = { ${unboxed_kernels} // Generated kernels }; -// Explicitly convert to ArrayRef, so that the API can take an empty C array of +// Explicitly convert to Span, so that the API can take an empty C array of // Kernels. 
-static KernelArrayRef kernel_array_ref( +static KernelSpan kernel_span( kernels_to_register, kernels_to_register + sizeof(kernels_to_register) / sizeof(Kernel)); // Return value not used. Keep the static variable assignment to register // kernels in static initialization time. -static auto success_with_kernel_reg = register_kernels(kernel_array_ref); +static auto success_with_kernel_reg = register_kernels(kernel_span); } // namespace } // namespace function } // namespace executor } // namespace torch diff --git a/codegen/templates/RegisterKernels.cpp b/codegen/templates/RegisterKernels.cpp index 2313a30a307..91eac200222 100644 --- a/codegen/templates/RegisterKernels.cpp +++ b/codegen/templates/RegisterKernels.cpp @@ -19,7 +19,8 @@ Error register_all_kernels() { Kernel kernels_to_register[] = { ${unboxed_kernels} // Generated kernels }; - Error success_with_kernel_reg = register_kernels(kernels_to_register); + Error success_with_kernel_reg = + ::executorch::runtime::register_kernels({kernels_to_register}); if (success_with_kernel_reg != Error::Ok) { ET_LOG(Error, "Failed register all kernels"); return success_with_kernel_reg; diff --git a/devtools/bundled_program/bundled_program.cpp b/devtools/bundled_program/bundled_program.cpp index d174cbdcdad..54f84f6fef1 100644 --- a/devtools/bundled_program/bundled_program.cpp +++ b/devtools/bundled_program/bundled_program.cpp @@ -23,13 +23,21 @@ #include #include -namespace torch { -namespace executor { +using exec_aten::ArrayRef; +using exec_aten::Half; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using ::executorch::runtime::Error; +using ::executorch::runtime::EValue; +using ::executorch::runtime::Method; +using ::executorch::runtime::Result; + +namespace executorch { namespace bundled_program { namespace { -#define kMaxDim 16 +constexpr size_t kMaxDim = 16; #ifdef USE_ATEN_LIB @@ -53,6 +61,7 @@ at::Tensor tensor_like(bundled_program_flatbuffer::Tensor* bundled_tensor) { } #else // !USE_ATEN_LIB +using torch::executor::TensorImpl; // Create a tensorimpl with same content using bundled tensor TensorImpl impl_like(bundled_program_flatbuffer::Tensor* bundled_tensor) { ScalarType scalar_type = @@ -234,9 +243,9 @@ get_method_test_suite( } // namespace // Load testset_idx-th bundled data into the Method -ET_NODISCARD Error LoadBundledInput( +ET_NODISCARD Error load_bundled_input( Method& method, - serialized_bundled_program* bundled_program_ptr, + SerializedBundledProgram* bundled_program_ptr, size_t testset_idx) { ET_CHECK_OR_RETURN_ERROR( bundled_program_flatbuffer::BundledProgramBufferHasIdentifier( @@ -319,19 +328,19 @@ ET_NODISCARD Error LoadBundledInput( ET_CHECK_OR_RETURN_ERROR( status == Error::Ok, NotSupported, - "set_input failed during load bundled inputs with status %" PRIu32, - static_cast<uint32_t>(status)); + "set_input failed during load bundled inputs with status 0x%" PRIx32, + static_cast<uint32_t>(status)); } - internal::event_tracer_set_bundled_input_index( + ::executorch::runtime::internal::event_tracer_set_bundled_input_index( method.get_event_tracer(), testset_idx); return Error::Ok; } -ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( +ET_NODISCARD Error verify_method_outputs( Method& method, - serialized_bundled_program* bundled_program_ptr, + SerializedBundledProgram* bundled_program_ptr, size_t testset_idx, double rtol, double atol) { @@ -390,12 +399,12 @@ ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( return Error::Ok; } -ET_NODISCARD Error GetProgramData( +ET_NODISCARD Error get_program_data( void* file_data, size_t file_data_len,
const void** out_program_data, size_t* out_program_data_len) { - if (IsBundledProgram(file_data)) { + if (is_bundled_program(file_data, file_data_len)) { auto program_bundled = bundled_program_flatbuffer::GetBundledProgram(file_data); *out_program_data = program_bundled->program()->data(); @@ -410,11 +419,13 @@ ET_NODISCARD Error GetProgramData( return Error::Ok; } -bool IsBundledProgram(void* file_data) { +bool is_bundled_program(void* file_data, ET_UNUSED size_t file_data_len) { + // Even though the flatbuffer API doesn't accept a length, it's important to + // require one so that we could change the internal representation, or use a + // future API that does require a length. return bundled_program_flatbuffer::BundledProgramBufferHasIdentifier( file_data); } } // namespace bundled_program -} // namespace executor -} // namespace torch +} // namespace executorch diff --git a/devtools/bundled_program/bundled_program.h b/devtools/bundled_program/bundled_program.h index 8b42923866e..884ca6f21bc 100644 --- a/devtools/bundled_program/bundled_program.h +++ b/devtools/bundled_program/bundled_program.h @@ -11,14 +11,13 @@ #include #include -namespace torch { -namespace executor { +namespace executorch { namespace bundled_program { /** * An opaque pointer to a serialized bundled program. */ -using serialized_bundled_program = const void; +using SerializedBundledProgram = const void; /** * Load testset_idx-th bundled input of method_idx-th Method test in @@ -31,9 +30,9 @@ using serialized_bundled_program = const void; * @returns Return Error::Ok if load successfully, or the error happens during * execution. */ -ET_NODISCARD Error LoadBundledInput( - Method& method, - serialized_bundled_program* bundled_program_ptr, +ET_NODISCARD ::executorch::runtime::Error load_bundled_input( + ::executorch::runtime::Method& method, + SerializedBundledProgram* bundled_program_ptr, size_t testset_idx); /** @@ -49,9 +48,9 @@ ET_NODISCARD Error LoadBundledInput( * @returns Return Error::Ok if two outputs match, or the error happens during * execution. */ -ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( - Method& method, - serialized_bundled_program* bundled_program_ptr, +ET_NODISCARD ::executorch::runtime::Error verify_method_outputs( + ::executorch::runtime::Method& method, + SerializedBundledProgram* bundled_program_ptr, size_t testset_idx, double rtol = 1e-5, double atol = 1e-8); @@ -73,7 +72,7 @@ ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( * in it, and out_program_data/out_program_data_len point to the data. Other * values on failure. */ -ET_NODISCARD Error GetProgramData( +ET_NODISCARD ::executorch::runtime::Error get_program_data( void* file_data, size_t file_data_len, const void** out_program_data, @@ -83,11 +82,61 @@ ET_NODISCARD Error GetProgramData( * Checks whether the given file is a bundled program. * * @param[in] file_data The contents of the given file. + * @param[in] file_data_len The length of file_data, in bytes. * * @returns true if the given file is a bundled program, false otherwise */ -bool IsBundledProgram(void* file_data); +bool is_bundled_program(void* file_data, size_t file_data_len); + +/// DEPRECATED: Use the version with the file_data_len parameter. +ET_DEPRECATED inline bool is_bundled_program(void* file_data) { + // 128 is enough data to contain the identifier in the flatbuffer header. 
+ return is_bundled_program(file_data, 128); +} + +} // namespace bundled_program +} // namespace executorch + +namespace torch { +namespace executor { +namespace bundled_program { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using serialized_bundled_program = + ::executorch::bundled_program::SerializedBundledProgram; + +ET_NODISCARD inline ::executorch::runtime::Error LoadBundledInput( + ::executorch::runtime::Method& method, + serialized_bundled_program* bundled_program_ptr, + size_t testset_idx) { + return ::executorch::bundled_program::load_bundled_input( + method, bundled_program_ptr, testset_idx); +} + +ET_NODISCARD inline ::executorch::runtime::Error +VerifyResultWithBundledExpectedOutput( + ::executorch::runtime::Method& method, + serialized_bundled_program* bundled_program_ptr, + size_t testset_idx, + double rtol = 1e-5, + double atol = 1e-8) { + return ::executorch::bundled_program::verify_method_outputs( + method, bundled_program_ptr, testset_idx, rtol, atol); +} + +ET_NODISCARD inline ::executorch::runtime::Error GetProgramData( + void* file_data, + size_t file_data_len, + const void** out_program_data, + size_t* out_program_data_len) { + return ::executorch::bundled_program::get_program_data( + file_data, file_data_len, out_program_data, out_program_data_len); +} +inline bool IsBundledProgram(void* file_data) { + // 128 is enough data to contain the identifier in the flatbuffer header. + return ::executorch::bundled_program::is_bundled_program(file_data, 128); +} } // namespace bundled_program } // namespace executor } // namespace torch diff --git a/devtools/etdump/emitter.cpp b/devtools/etdump/emitter.cpp index dfca6295306..653c75cb084 100644 --- a/devtools/etdump/emitter.cpp +++ b/devtools/etdump/emitter.cpp @@ -6,16 +6,25 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include + #include +#include + +#include +#include + +#include -#include "executorch/devtools/etdump/emitter.h" -#include "executorch/runtime/platform/assert.h" +using executorch::etdump::internal::ETDumpStaticAllocator; -namespace torch { -namespace executor { +namespace executorch { +namespace etdump { +namespace internal { -static int _allocator_fn( +namespace { + +int allocator_fn( void* alloc_context, flatcc_iovec_t* b, size_t request, @@ -24,8 +33,8 @@ static int _allocator_fn( void* p; size_t n; - struct etdump_static_allocator* state = - (struct etdump_static_allocator*)alloc_context; + ETDumpStaticAllocator* state = + reinterpret_cast(alloc_context); // This allocator doesn't support freeing memory. if (request == 0) { @@ -113,14 +122,14 @@ static int _allocator_fn( // This emitter implementation emits to a fixed size buffer and will fail if it // runs out of room on either end. 
-static int _emitter_fn( +int emitter_fn( void* emit_context, const flatcc_iovec_t* iov, int iov_count, flatbuffers_soffset_t offset, size_t len) { - struct etdump_static_allocator* E = - (struct etdump_static_allocator*)emit_context; + ETDumpStaticAllocator* E = + reinterpret_cast(emit_context); uint8_t* p; if (offset < 0) { @@ -144,40 +153,15 @@ static int _emitter_fn( return 0; } -/******************************************************************************* - * Public Functions - ******************************************************************************/ - -int etdump_static_allocator_builder_init( - flatcc_builder_t* builder, - struct etdump_static_allocator* alloc) { - ET_CHECK(builder != nullptr); - ET_CHECK(alloc != nullptr); - - // Ensure data size is multiple of 32 (minimum allocation size). - ET_CHECK((alloc->data_size & 0x1F) == 0); - // Ensure out_size is divisable by 2 to ensure front/back sizes are equal for - // emitter.. - ET_CHECK((alloc->out_size & 0x1) == 0); - - return flatcc_builder_custom_init( - builder, _emitter_fn, alloc, _allocator_fn, alloc); -} - -void etdump_static_allocator_reset(struct etdump_static_allocator* alloc) { - ET_CHECK(alloc != nullptr); - alloc->allocated = 0; - size_t n = alloc->out_size / 2; - alloc->front_cursor = &alloc->data[alloc->data_size + n]; - alloc->front_left = n; -} +} // namespace -int et_flatcc_custom_init( +int etdump_flatcc_custom_init( flatcc_builder_t* builder, - struct etdump_static_allocator* alloc) { + struct ETDumpStaticAllocator* alloc) { return flatcc_builder_custom_init( - builder, _emitter_fn, alloc, _allocator_fn, alloc); + builder, emitter_fn, alloc, allocator_fn, alloc); } -} // namespace executor -} // namespace torch +} // namespace internal +} // namespace etdump +} // namespace executorch diff --git a/devtools/etdump/emitter.h b/devtools/etdump/emitter.h index bf8ab0b1e1c..09c1b56aa56 100644 --- a/devtools/etdump/emitter.h +++ b/devtools/etdump/emitter.h @@ -6,26 +6,23 @@ * LICENSE file in the root directory of this source tree. */ -#include -#include +#pragma once -#include -#include +#include +#include -#pragma once +#include -namespace torch { -namespace executor { +typedef struct flatcc_builder flatcc_builder_t; -int et_flatcc_custom_init( - flatcc_builder_t* builder, - struct etdump_static_allocator* alloc); +namespace executorch { +namespace etdump { +namespace internal { -int etdump_static_allocator_builder_init( +int etdump_flatcc_custom_init( flatcc_builder_t* builder, - struct etdump_static_allocator* alloc); - -void etdump_static_allocator_reset(struct etdump_static_allocator* alloc); + internal::ETDumpStaticAllocator* alloc); -} // namespace executor -} // namespace torch +} // namespace internal +} // namespace etdump +} // namespace executorch diff --git a/devtools/etdump/etdump_flatcc.cpp b/devtools/etdump/etdump_flatcc.cpp index ca46c12f51c..4c05bb5acee 100644 --- a/devtools/etdump/etdump_flatcc.cpp +++ b/devtools/etdump/etdump_flatcc.cpp @@ -6,19 +6,33 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include "executorch/devtools/etdump/etdump_flatcc.h" +#include + +#include + +#include #include #include +#include +#include +#include + #include -#include -#include -#include "executorch/devtools/etdump/emitter.h" -#include "executorch/runtime/core/exec_aten/exec_aten.h" -#include "executorch/runtime/core/exec_aten/util/scalar_type_util.h" -#include "executorch/runtime/platform/assert.h" -namespace torch { -namespace executor { +using ::exec_aten::Tensor; +using ::executorch::runtime::AllocatorID; +using ::executorch::runtime::ArrayRef; +using ::executorch::runtime::ChainID; +using ::executorch::runtime::DebugHandle; +using ::executorch::runtime::DelegateDebugIdType; +using ::executorch::runtime::EValue; +using ::executorch::runtime::EventTracerEntry; +using ::executorch::runtime::LoggedEValueType; +using ::executorch::runtime::Span; +using ::executorch::runtime::Tag; + +namespace executorch { +namespace etdump { namespace { @@ -50,30 +64,30 @@ executorch_flatbuffer_ScalarType_enum_t get_flatbuffer_scalar_type( } etdump_Tensor_ref_t add_tensor_entry( - flatcc_builder_t* builder, + flatcc_builder_t* builder_, const exec_aten::Tensor& tensor, long offset) { - etdump_Tensor_start(builder); + etdump_Tensor_start(builder_); etdump_Tensor_scalar_type_add( - builder, get_flatbuffer_scalar_type(tensor.scalar_type())); - etdump_Tensor_sizes_start(builder); + builder_, get_flatbuffer_scalar_type(tensor.scalar_type())); + etdump_Tensor_sizes_start(builder_); for (auto dim : tensor.sizes()) { int64_t cast_dim = static_cast(dim); - etdump_Tensor_sizes_push(builder, &cast_dim); + etdump_Tensor_sizes_push(builder_, &cast_dim); } - etdump_Tensor_sizes_end(builder); + etdump_Tensor_sizes_end(builder_); - etdump_Tensor_strides_start(builder); + etdump_Tensor_strides_start(builder_); for (auto dim : tensor.strides()) { int64_t cast_dim = static_cast(dim); - etdump_Tensor_strides_push(builder, &cast_dim); + etdump_Tensor_strides_push(builder_, &cast_dim); } - etdump_Tensor_strides_end(builder); - etdump_Tensor_offset_add(builder, offset); + etdump_Tensor_strides_end(builder_); + etdump_Tensor_offset_add(builder_, offset); - return etdump_Tensor_end(builder); + return etdump_Tensor_end(builder_); } static uint8_t* alignPointer(void* ptr, size_t alignment) { @@ -88,71 +102,71 @@ static uint8_t* alignPointer(void* ptr, size_t alignment) { } // namespace -constexpr size_t max_alloc_buf_size = 128 * 1024; - // Constructor implementation ETDumpGen::ETDumpGen(Span buffer) { - // Initialize the flatcc builder using the buffer and buffer size. + constexpr size_t max_alloc_buf_size = 128 * 1024; + + // Initialize the flatcc builder_ using the buffer and buffer size. if (buffer.data() != nullptr) { - builder = (struct flatcc_builder*)alignPointer(buffer.data(), 64); + builder_ = (struct flatcc_builder*)alignPointer(buffer.data(), 64); uintptr_t buffer_with_builder = - (uintptr_t)alignPointer(builder + sizeof(struct flatcc_builder), 64); + (uintptr_t)alignPointer(builder_ + sizeof(struct flatcc_builder), 64); size_t buffer_size = buffer.size() - (size_t)(buffer_with_builder - (uintptr_t)buffer.data()); - alloc.set_buffer( + alloc_.set_buffer( (uint8_t*)buffer_with_builder, buffer_size, (size_t)((buffer_size / 4 > max_alloc_buf_size) ? 
max_alloc_buf_size : buffer_size / 4)); - et_flatcc_custom_init(builder, &alloc); + internal::etdump_flatcc_custom_init(builder_, &alloc_); } else { - builder = (struct flatcc_builder*)malloc(sizeof(struct flatcc_builder)); + builder_ = (struct flatcc_builder*)malloc(sizeof(struct flatcc_builder)); ET_CHECK_MSG( - builder != nullptr, "Failed to allocate memory for flatcc builder."); - flatcc_builder_init(builder); + builder_ != nullptr, "Failed to allocate memory for flatcc builder_."); + flatcc_builder_init(builder_); } reset(); } ETDumpGen::~ETDumpGen() { - flatcc_builder_clear(builder); + flatcc_builder_clear(builder_); if (!is_static_etdump()) { - free(builder); + free(builder_); } } void ETDumpGen::reset() { - etdump_gen_state = ETDumpGen_Init; - num_blocks = 0; - flatcc_builder_reset(builder); - flatbuffers_buffer_start(builder, etdump_ETDump_file_identifier); - etdump_ETDump_start_as_root_with_size(builder); - etdump_ETDump_version_add(builder, ETDUMP_VERSION); - etdump_ETDump_run_data_start(builder); - etdump_ETDump_run_data_push_start(builder); + state_ = State::Init; + num_blocks_ = 0; + flatcc_builder_reset(builder_); + flatbuffers_buffer_start(builder_, etdump_ETDump_file_identifier); + etdump_ETDump_start_as_root_with_size(builder_); + etdump_ETDump_version_add(builder_, ETDUMP_VERSION); + etdump_ETDump_run_data_start(builder_); + etdump_ETDump_run_data_push_start(builder_); } void ETDumpGen::create_event_block(const char* name) { - if (etdump_gen_state == ETDumpGen_Adding_Events) { - etdump_RunData_events_end(builder); - } else if (etdump_gen_state == ETDumpGen_Done) { + if (state_ == State::AddingEvents) { + etdump_RunData_events_end(builder_); + } else if (state_ == State::Done) { reset(); } - if (num_blocks > 0) { - etdump_ETDump_run_data_push_end(builder); - etdump_ETDump_run_data_push_start(builder); + if (num_blocks_ > 0) { + etdump_ETDump_run_data_push_end(builder_); + etdump_ETDump_run_data_push_start(builder_); } - ++num_blocks; - etdump_RunData_name_create_strn(builder, name, strlen(name)); - if (bundled_input_index != -1) { - etdump_RunData_bundled_input_index_add(builder, bundled_input_index); + ++num_blocks_; + etdump_RunData_name_create_strn(builder_, name, strlen(name)); + if (bundled_input_index_ != -1) { + etdump_RunData_bundled_input_index_add(builder_, bundled_input_index_); } - etdump_gen_state = ETDumpGen_Block_Created; + state_ = State::BlockCreated; } int64_t ETDumpGen::create_string_entry(const char* name) { - return flatbuffers_string_create_str(builder, name); + return flatbuffers_string_create_str(builder_, name); } // ETDumpGen has the following possible states, ETDumpGen_Init, @@ -169,16 +183,15 @@ int64_t ETDumpGen::create_string_entry(const char* name) { // type again. In this case once we close the allocators table and start pushing // to the events table we cannot push to the allocators table again. void ETDumpGen::check_ready_to_add_events() { - if (etdump_gen_state != ETDumpGen_Adding_Events) { + if (state_ != State::AddingEvents) { ET_CHECK_MSG( - (etdump_gen_state == ETDumpGen_Adding_Allocators || - etdump_gen_state == ETDumpGen_Block_Created), + (state_ == State::AddingAllocators || state_ == State::BlockCreated), "ETDumpGen in an invalid state. 
Cannot add new events now."); - if (etdump_gen_state == ETDumpGen_Adding_Allocators) { - etdump_RunData_allocators_end(builder); + if (state_ == State::AddingAllocators) { + etdump_RunData_allocators_end(builder_); } - etdump_RunData_events_start(builder); - etdump_gen_state = ETDumpGen_Adding_Events; + etdump_RunData_events_start(builder_); + state_ = State::AddingEvents; } } @@ -231,29 +244,29 @@ void ETDumpGen::end_profiling_delegate( check_ready_to_add_events(); // Start building the ProfileEvent entry. - etdump_ProfileEvent_start(builder); - etdump_ProfileEvent_start_time_add(builder, event_tracer_entry.start_time); - etdump_ProfileEvent_end_time_add(builder, end_time); - etdump_ProfileEvent_chain_index_add(builder, chain_id_); - etdump_ProfileEvent_instruction_id_add(builder, debug_handle_); + etdump_ProfileEvent_start(builder_); + etdump_ProfileEvent_start_time_add(builder_, event_tracer_entry.start_time); + etdump_ProfileEvent_end_time_add(builder_, end_time); + etdump_ProfileEvent_chain_index_add(builder_, chain_id_); + etdump_ProfileEvent_instruction_id_add(builder_, debug_handle_); // Delegate debug identifier can either be of a string type or an integer // type. If it's a string type then it's a value of type // flatbuffers_string_ref_t type, whereas if it's an integer type then we // write the integer value directly. if (event_tracer_entry.delegate_event_id_type == DelegateDebugIdType::kInt) { etdump_ProfileEvent_delegate_debug_id_int_add( - builder, event_tracer_entry.event_id); + builder_, event_tracer_entry.event_id); } else { etdump_ProfileEvent_delegate_debug_id_str_add( - builder, event_tracer_entry.event_id); + builder_, event_tracer_entry.event_id); } flatbuffers_uint8_vec_ref_t vec_ref = flatbuffers_uint8_vec_create_pe( - builder, (const uint8_t*)metadata, metadata_len); - etdump_ProfileEvent_delegate_debug_metadata_add(builder, vec_ref); - etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder); - etdump_RunData_events_push_start(builder); - etdump_Event_profile_event_add(builder, id); - etdump_RunData_events_push_end(builder); + builder_, (const uint8_t*)metadata, metadata_len); + etdump_ProfileEvent_delegate_debug_metadata_add(builder_, vec_ref); + etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder_); + etdump_RunData_events_push_start(builder_); + etdump_Event_profile_event_add(builder_, id); + etdump_RunData_events_push_end(builder_); } void ETDumpGen::log_profiling_delegate( @@ -268,24 +281,24 @@ void ETDumpGen::log_profiling_delegate( "Only name or delegate_debug_index can be valid. Check DelegateMappingBuilder documentation for more details."); check_ready_to_add_events(); int64_t string_id = name != nullptr ? 
create_string_entry(name) : -1; - etdump_ProfileEvent_start(builder); - etdump_ProfileEvent_start_time_add(builder, start_time); - etdump_ProfileEvent_end_time_add(builder, end_time); - etdump_ProfileEvent_chain_index_add(builder, chain_id_); - etdump_ProfileEvent_instruction_id_add(builder, debug_handle_); + etdump_ProfileEvent_start(builder_); + etdump_ProfileEvent_start_time_add(builder_, start_time); + etdump_ProfileEvent_end_time_add(builder_, end_time); + etdump_ProfileEvent_chain_index_add(builder_, chain_id_); + etdump_ProfileEvent_instruction_id_add(builder_, debug_handle_); if (string_id == -1) { etdump_ProfileEvent_delegate_debug_id_int_add( - builder, delegate_debug_index); + builder_, delegate_debug_index); } else { - etdump_ProfileEvent_delegate_debug_id_str_add(builder, string_id); + etdump_ProfileEvent_delegate_debug_id_str_add(builder_, string_id); } flatbuffers_uint8_vec_ref_t vec_ref = flatbuffers_uint8_vec_create_pe( - builder, (const uint8_t*)metadata, metadata_len); - etdump_ProfileEvent_delegate_debug_metadata_add(builder, vec_ref); - etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder); - etdump_RunData_events_push_start(builder); - etdump_Event_profile_event_add(builder, id); - etdump_RunData_events_push_end(builder); + builder_, (const uint8_t*)metadata, metadata_len); + etdump_ProfileEvent_delegate_debug_metadata_add(builder_, vec_ref); + etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder_); + etdump_RunData_events_push_start(builder_); + etdump_Event_profile_event_add(builder_, id); + etdump_RunData_events_push_end(builder_); } void ETDumpGen::log_intermediate_output_delegate( @@ -331,7 +344,7 @@ void ETDumpGen::log_intermediate_output_delegate_helper( ET_CHECK_MSG( (name == nullptr) ^ (delegate_debug_index == -1), "Only name or delegate_debug_index can be valid. Check DelegateMappingBuilder documentation for more details."); - if (debug_buffer.empty()) { + if (debug_buffer_.empty()) { ET_CHECK_MSG(0, "Must pre-set debug buffer with set_debug_buffer()\n"); return; } @@ -339,71 +352,71 @@ void ETDumpGen::log_intermediate_output_delegate_helper( check_ready_to_add_events(); int64_t string_id = name != nullptr ? 
create_string_entry(name) : -1; - etdump_DebugEvent_start(builder); + etdump_DebugEvent_start(builder_); - etdump_DebugEvent_chain_index_add(builder, chain_id_); - etdump_DebugEvent_instruction_id_add(builder, debug_handle_); + etdump_DebugEvent_chain_index_add(builder_, chain_id_); + etdump_DebugEvent_instruction_id_add(builder_, debug_handle_); if (string_id == -1) { - etdump_DebugEvent_delegate_debug_id_int_add(builder, delegate_debug_index); + etdump_DebugEvent_delegate_debug_id_int_add(builder_, delegate_debug_index); } else { - etdump_DebugEvent_delegate_debug_id_str_add(builder, string_id); + etdump_DebugEvent_delegate_debug_id_str_add(builder_, string_id); } // Check the type of `output` then call the corresponding logging functions if constexpr (std::is_same::value) { long offset = copy_tensor_to_debug_buffer(output); - etdump_Tensor_ref_t tensor_ref = add_tensor_entry(builder, output, offset); + etdump_Tensor_ref_t tensor_ref = add_tensor_entry(builder_, output, offset); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_Tensor); - etdump_Value_tensor_add(builder, tensor_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_Tensor); + etdump_Value_tensor_add(builder_, tensor_ref); } else if constexpr (std::is_same>::value) { - etdump_Tensor_vec_start(builder); + etdump_Tensor_vec_start(builder_); for (size_t i = 0; i < output.size(); ++i) { long offset = copy_tensor_to_debug_buffer(output[i]); etdump_Tensor_vec_push( - builder, add_tensor_entry(builder, output[i], offset)); + builder_, add_tensor_entry(builder_, output[i], offset)); } - etdump_Tensor_vec_ref_t tensor_vec_ref = etdump_Tensor_vec_end(builder); + etdump_Tensor_vec_ref_t tensor_vec_ref = etdump_Tensor_vec_end(builder_); etdump_TensorList_ref_t tensor_list_ref = - etdump_TensorList_create(builder, tensor_vec_ref); + etdump_TensorList_create(builder_, tensor_vec_ref); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_TensorList); - etdump_Value_tensor_list_add(builder, tensor_list_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_TensorList); + etdump_Value_tensor_list_add(builder_, tensor_list_ref); } else if constexpr (std::is_same::value) { - auto int_ref = etdump_Int_create(builder, output); + auto int_ref = etdump_Int_create(builder_, output); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_Int); - etdump_Value_int_value_add(builder, int_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_Int); + etdump_Value_int_value_add(builder_, int_ref); } else if constexpr (std::is_same::value) { - auto double_ref = etdump_Double_create(builder, output); + auto double_ref = etdump_Double_create(builder_, output); - etdump_Value_start(builder); - etdump_Value_double_value_add(builder, double_ref); - etdump_Value_val_add(builder, etdump_ValueType_Double); + etdump_Value_start(builder_); + etdump_Value_double_value_add(builder_, double_ref); + etdump_Value_val_add(builder_, etdump_ValueType_Double); } else if constexpr (std::is_same::value) { flatbuffers_bool_t flatbuffer_bool_val = output ? 
FLATBUFFERS_TRUE : FLATBUFFERS_FALSE; - auto bool_ref = etdump_Bool_create(builder, flatbuffer_bool_val); + auto bool_ref = etdump_Bool_create(builder_, flatbuffer_bool_val); - etdump_Value_start(builder); - etdump_Value_bool_value_add(builder, bool_ref); - etdump_Value_val_add(builder, etdump_ValueType_Bool); + etdump_Value_start(builder_); + etdump_Value_bool_value_add(builder_, bool_ref); + etdump_Value_val_add(builder_, etdump_ValueType_Bool); } else { ET_CHECK_MSG(0, "Unsupported output type for intermediate logging\n"); } - auto value_ref = etdump_Value_end(builder); - etdump_DebugEvent_debug_entry_add(builder, value_ref); + auto value_ref = etdump_Value_end(builder_); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); - etdump_DebugEvent_ref_t debug_event = etdump_DebugEvent_end(builder); + etdump_DebugEvent_ref_t debug_event = etdump_DebugEvent_end(builder_); - etdump_RunData_events_push_start(builder); - etdump_Event_debug_event_add(builder, debug_event); - etdump_RunData_events_push_end(builder); + etdump_RunData_events_push_start(builder_); + etdump_Event_debug_event_add(builder_, debug_event); + etdump_RunData_events_push_end(builder_); } void ETDumpGen::end_profiling(EventTracerEntry prof_entry) { @@ -413,32 +426,31 @@ void ETDumpGen::end_profiling(EventTracerEntry prof_entry) { "Delegate events must use end_profiling_delegate to mark the end of a delegate profiling event."); check_ready_to_add_events(); - etdump_ProfileEvent_start(builder); - etdump_ProfileEvent_start_time_add(builder, prof_entry.start_time); - etdump_ProfileEvent_end_time_add(builder, end_time); - etdump_ProfileEvent_chain_index_add(builder, prof_entry.chain_id); - etdump_ProfileEvent_instruction_id_add(builder, prof_entry.debug_handle); + etdump_ProfileEvent_start(builder_); + etdump_ProfileEvent_start_time_add(builder_, prof_entry.start_time); + etdump_ProfileEvent_end_time_add(builder_, end_time); + etdump_ProfileEvent_chain_index_add(builder_, prof_entry.chain_id); + etdump_ProfileEvent_instruction_id_add(builder_, prof_entry.debug_handle); if (prof_entry.event_id != -1) { - etdump_ProfileEvent_name_add(builder, prof_entry.event_id); + etdump_ProfileEvent_name_add(builder_, prof_entry.event_id); } - etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder); - etdump_RunData_events_push_start(builder); - etdump_Event_profile_event_add(builder, id); - etdump_RunData_events_push_end(builder); + etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder_); + etdump_RunData_events_push_start(builder_); + etdump_Event_profile_event_add(builder_, id); + etdump_RunData_events_push_end(builder_); } AllocatorID ETDumpGen::track_allocator(const char* name) { ET_CHECK_MSG( - (etdump_gen_state == ETDumpGen_Block_Created || - etdump_gen_state == ETDumpGen_Adding_Allocators), + (state_ == State::BlockCreated || state_ == State::AddingAllocators), "Allocators can only be added immediately after a new block is created and before any events are added."); - if (etdump_gen_state != ETDumpGen_Adding_Allocators) { - etdump_RunData_allocators_start(builder); - etdump_gen_state = ETDumpGen_Adding_Allocators; + if (state_ != State::AddingAllocators) { + etdump_RunData_allocators_start(builder_); + state_ = State::AddingAllocators; } flatbuffers_string_ref_t ref = create_string_entry(name); - etdump_RunData_allocators_push_create(builder, ref); - return etdump_RunData_allocators_reserved_len(builder); + etdump_RunData_allocators_push_create(builder_, ref); + return 
etdump_RunData_allocators_reserved_len(builder_); } void ETDumpGen::track_allocation( @@ -446,43 +458,43 @@ void ETDumpGen::track_allocation( size_t allocation_size) { check_ready_to_add_events(); - etdump_RunData_events_push_start(builder); - etdump_Event_allocation_event_create(builder, allocator_id, allocation_size); - etdump_RunData_events_push_end(builder); + etdump_RunData_events_push_start(builder_); + etdump_Event_allocation_event_create(builder_, allocator_id, allocation_size); + etdump_RunData_events_push_end(builder_); } -etdump_result ETDumpGen::get_etdump_data() { - etdump_result result; - if (etdump_gen_state == ETDumpGen_Adding_Events) { - etdump_RunData_events_end(builder); - } else if (etdump_gen_state == ETDumpGen_Adding_Allocators) { - etdump_RunData_allocators_end(builder); - } else if (etdump_gen_state == ETDumpGen_Init) { +ETDumpResult ETDumpGen::get_etdump_data() { + ETDumpResult result; + if (state_ == State::AddingEvents) { + etdump_RunData_events_end(builder_); + } else if (state_ == State::AddingAllocators) { + etdump_RunData_allocators_end(builder_); + } else if (state_ == State::Init) { result.buf = nullptr; result.size = 0; return result; } - etdump_ETDump_run_data_push_end(builder); - etdump_ETDump_run_data_end(builder); - etdump_ETDump_ref_t root = etdump_ETDump_end(builder); - flatbuffers_buffer_end(builder, root); - if (num_blocks == 0) { + etdump_ETDump_run_data_push_end(builder_); + etdump_ETDump_run_data_end(builder_); + etdump_ETDump_ref_t root = etdump_ETDump_end(builder_); + flatbuffers_buffer_end(builder_, root); + if (num_blocks_ == 0) { result = {nullptr, 0}; } else { - if (alloc.data) { - result.buf = alloc.front_cursor; - result.size = alloc.out_size - alloc.front_left; + if (alloc_.data) { + result.buf = alloc_.front_cursor; + result.size = alloc_.out_size - alloc_.front_left; } else { result.buf = - flatcc_builder_finalize_aligned_buffer(builder, &result.size); + flatcc_builder_finalize_aligned_buffer(builder_, &result.size); } } - etdump_gen_state = ETDumpGen_Done; + state_ = State::Done; return result; } void ETDumpGen::set_debug_buffer(Span buffer) { - debug_buffer = buffer; + debug_buffer_ = buffer; } size_t ETDumpGen::copy_tensor_to_debug_buffer(exec_aten::Tensor tensor) { @@ -490,94 +502,94 @@ size_t ETDumpGen::copy_tensor_to_debug_buffer(exec_aten::Tensor tensor) { return static_cast(-1); } uint8_t* offset_ptr = - alignPointer(debug_buffer.data() + debug_buffer_offset, 64); - debug_buffer_offset = (offset_ptr - debug_buffer.data()) + tensor.nbytes(); + alignPointer(debug_buffer_.data() + debug_buffer_offset_, 64); + debug_buffer_offset_ = (offset_ptr - debug_buffer_.data()) + tensor.nbytes(); ET_CHECK_MSG( - debug_buffer_offset <= debug_buffer.size(), + debug_buffer_offset_ <= debug_buffer_.size(), "Ran out of space to store intermediate outputs."); memcpy(offset_ptr, tensor.const_data_ptr(), tensor.nbytes()); - return (size_t)(offset_ptr - debug_buffer.data()); + return (size_t)(offset_ptr - debug_buffer_.data()); } void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) { - if (debug_buffer.empty()) { + if (debug_buffer_.empty()) { return; } check_ready_to_add_events(); - etdump_DebugEvent_start(builder); + etdump_DebugEvent_start(builder_); - etdump_DebugEvent_chain_index_add(builder, chain_id_); - etdump_DebugEvent_instruction_id_add(builder, debug_handle_); + etdump_DebugEvent_chain_index_add(builder_, chain_id_); + etdump_DebugEvent_instruction_id_add(builder_, debug_handle_); switch (evalue.tag) { case 
Tag::Tensor: { exec_aten::Tensor tensor = evalue.toTensor(); long offset = copy_tensor_to_debug_buffer(tensor); etdump_Tensor_ref_t tensor_ref = - add_tensor_entry(builder, tensor, offset); + add_tensor_entry(builder_, tensor, offset); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_Tensor); - etdump_Value_tensor_add(builder, tensor_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_Tensor); + etdump_Value_tensor_add(builder_, tensor_ref); if (evalue_type == LoggedEValueType::kProgramOutput) { - auto bool_ref = etdump_Bool_create(builder, FLATBUFFERS_TRUE); - etdump_Value_output_add(builder, bool_ref); + auto bool_ref = etdump_Bool_create(builder_, FLATBUFFERS_TRUE); + etdump_Value_output_add(builder_, bool_ref); } - auto value_ref = etdump_Value_end(builder); + auto value_ref = etdump_Value_end(builder_); - etdump_DebugEvent_debug_entry_add(builder, value_ref); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); break; } case Tag::ListTensor: { exec_aten::ArrayRef tensors = evalue.toTensorList(); - etdump_Tensor_vec_start(builder); + etdump_Tensor_vec_start(builder_); for (size_t i = 0; i < tensors.size(); ++i) { long offset = copy_tensor_to_debug_buffer(tensors[i]); etdump_Tensor_vec_push( - builder, add_tensor_entry(builder, tensors[i], offset)); + builder_, add_tensor_entry(builder_, tensors[i], offset)); } - etdump_Tensor_vec_ref_t tensor_vec_ref = etdump_Tensor_vec_end(builder); + etdump_Tensor_vec_ref_t tensor_vec_ref = etdump_Tensor_vec_end(builder_); etdump_TensorList_ref_t tensor_list_ref = - etdump_TensorList_create(builder, tensor_vec_ref); + etdump_TensorList_create(builder_, tensor_vec_ref); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_TensorList); - etdump_Value_tensor_list_add(builder, tensor_list_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_TensorList); + etdump_Value_tensor_list_add(builder_, tensor_list_ref); if (evalue_type == LoggedEValueType::kProgramOutput) { - auto bool_ref = etdump_Bool_create(builder, FLATBUFFERS_TRUE); - etdump_Value_output_add(builder, bool_ref); + auto bool_ref = etdump_Bool_create(builder_, FLATBUFFERS_TRUE); + etdump_Value_output_add(builder_, bool_ref); } - auto value_ref = etdump_Value_end(builder); + auto value_ref = etdump_Value_end(builder_); - etdump_DebugEvent_debug_entry_add(builder, value_ref); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); break; } case Tag::Int: { int64_t val = evalue.toInt(); - auto int_ref = etdump_Int_create(builder, val); + auto int_ref = etdump_Int_create(builder_, val); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_Int); - etdump_Value_int_value_add(builder, int_ref); - auto value_ref = etdump_Value_end(builder); - etdump_DebugEvent_debug_entry_add(builder, value_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_Int); + etdump_Value_int_value_add(builder_, int_ref); + auto value_ref = etdump_Value_end(builder_); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); break; } case Tag::Double: { double val = evalue.toDouble(); - auto double_ref = etdump_Double_create(builder, val); + auto double_ref = etdump_Double_create(builder_, val); - etdump_Value_start(builder); - etdump_Value_double_value_add(builder, double_ref); - etdump_Value_val_add(builder, etdump_ValueType_Double); - auto value_ref = etdump_Value_end(builder); - etdump_DebugEvent_debug_entry_add(builder, 
value_ref); + etdump_Value_start(builder_); + etdump_Value_double_value_add(builder_, double_ref); + etdump_Value_val_add(builder_, etdump_ValueType_Double); + auto value_ref = etdump_Value_end(builder_); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); break; } @@ -585,13 +597,13 @@ void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) { case Tag::Bool: { flatbuffers_bool_t flatbuffer_bool_val = evalue.toBool() ? FLATBUFFERS_TRUE : FLATBUFFERS_FALSE; - auto bool_ref = etdump_Bool_create(builder, flatbuffer_bool_val); + auto bool_ref = etdump_Bool_create(builder_, flatbuffer_bool_val); - etdump_Value_start(builder); - etdump_Value_bool_value_add(builder, bool_ref); - etdump_Value_val_add(builder, etdump_ValueType_Bool); - auto value_ref = etdump_Value_end(builder); - etdump_DebugEvent_debug_entry_add(builder, value_ref); + etdump_Value_start(builder_); + etdump_Value_bool_value_add(builder_, bool_ref); + etdump_Value_val_add(builder_, etdump_ValueType_Bool); + auto value_ref = etdump_Value_end(builder_); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); break; } @@ -604,20 +616,20 @@ void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) { break; } - etdump_DebugEvent_ref_t debug_event = etdump_DebugEvent_end(builder); + etdump_DebugEvent_ref_t debug_event = etdump_DebugEvent_end(builder_); - etdump_RunData_events_push_start(builder); - etdump_Event_debug_event_add(builder, debug_event); - etdump_RunData_events_push_end(builder); + etdump_RunData_events_push_start(builder_); + etdump_Event_debug_event_add(builder_, debug_event); + etdump_RunData_events_push_end(builder_); } size_t ETDumpGen::get_num_blocks() { - return num_blocks; + return num_blocks_; } bool ETDumpGen::is_static_etdump() { - return alloc.data != nullptr; + return alloc_.data != nullptr; } -} // namespace executor -} // namespace torch +} // namespace etdump +} // namespace executorch diff --git a/devtools/etdump/etdump_flatcc.h b/devtools/etdump/etdump_flatcc.h index e56d09f8107..0bd891a0970 100644 --- a/devtools/etdump/etdump_flatcc.h +++ b/devtools/etdump/etdump_flatcc.h @@ -8,33 +8,22 @@ #pragma once -#include #include -#include "executorch/runtime/core/event_tracer.h" -#include "executorch/runtime/platform/platform.h" + +#include +#include +#include #define ETDUMP_VERSION 0 struct flatcc_builder; -namespace torch { -namespace executor { - -enum ETDumpGen_State { - ETDumpGen_Init, - ETDumpGen_Block_Created, - ETDumpGen_Adding_Allocators, - ETDumpGen_Adding_Events, - ETDumpGen_Done, -}; +namespace executorch { +namespace etdump { -struct etdump_result { - void* buf; - size_t size; -}; - -struct etdump_static_allocator { - etdump_static_allocator() {} +namespace internal { +struct ETDumpStaticAllocator { + ETDumpStaticAllocator() = default; void set_buffer(uint8_t* buffer, size_t total_buf_size, size_t alloc_buf_size) { @@ -64,61 +53,72 @@ struct etdump_static_allocator { // Bytes left in front of front_cursor. 
size_t front_left{0}; }; +} // namespace internal + +struct ETDumpResult { + void* buf; + size_t size; +}; -class ETDumpGen : public EventTracer { +class ETDumpGen : public ::executorch::runtime::EventTracer { public: - ETDumpGen(Span buffer = {nullptr, (size_t)0}); + ETDumpGen(::executorch::runtime::Span buffer = {nullptr, (size_t)0}); ~ETDumpGen() override; void clear_builder(); void create_event_block(const char* name) override; - virtual EventTracerEntry start_profiling( + virtual ::executorch::runtime::EventTracerEntry start_profiling( const char* name, - ChainID chain_id = -1, - DebugHandle debug_handle = 0) override; - virtual void end_profiling(EventTracerEntry prof_entry) override; - virtual EventTracerEntry start_profiling_delegate( + ::executorch::runtime::ChainID chain_id = -1, + ::executorch::runtime::DebugHandle debug_handle = 0) override; + virtual void end_profiling( + ::executorch::runtime::EventTracerEntry prof_entry) override; + virtual ::executorch::runtime::EventTracerEntry start_profiling_delegate( const char* name, - DebugHandle delegate_debug_index) override; + ::executorch::runtime::DebugHandle delegate_debug_index) override; virtual void end_profiling_delegate( - EventTracerEntry prof_entry, + ::executorch::runtime::EventTracerEntry prof_entry, const void* metadata, size_t metadata_len) override; virtual void log_profiling_delegate( const char* name, - DebugHandle delegate_debug_index, + ::executorch::runtime::DebugHandle delegate_debug_index, et_timestamp_t start_time, et_timestamp_t end_time, const void* metadata, size_t metadata_len) override; - virtual void track_allocation(AllocatorID id, size_t size) override; - virtual AllocatorID track_allocator(const char* name) override; + virtual void track_allocation( + ::executorch::runtime::AllocatorID id, + size_t size) override; + virtual ::executorch::runtime::AllocatorID track_allocator( + const char* name) override; virtual void log_evalue( - const EValue& evalue, - LoggedEValueType evalue_type = - LoggedEValueType::kIntermediateOutput) override; + const ::executorch::runtime::EValue& evalue, + ::executorch::runtime::LoggedEValueType evalue_type = + ::executorch::runtime::LoggedEValueType::kIntermediateOutput) + override; /** * Log an intermediate tensor output from a delegate. */ virtual void log_intermediate_output_delegate( const char* name, - DebugHandle delegate_debug_index, - const Tensor& output) override; + ::executorch::runtime::DebugHandle delegate_debug_index, + const exec_aten::Tensor& output) override; /** * Log an intermediate tensor array output from a delegate. */ virtual void log_intermediate_output_delegate( const char* name, - DebugHandle delegate_debug_index, - const ArrayRef output) override; + ::executorch::runtime::DebugHandle delegate_debug_index, + const ::executorch::runtime::ArrayRef output) override; /** * Log an intermediate int output from a delegate. 
*/ virtual void log_intermediate_output_delegate( const char* name, - DebugHandle delegate_debug_index, + ::executorch::runtime::DebugHandle delegate_debug_index, const int& output) override; /** @@ -126,7 +126,7 @@ class ETDumpGen : public EventTracer { */ virtual void log_intermediate_output_delegate( const char* name, - DebugHandle delegate_debug_index, + ::executorch::runtime::DebugHandle delegate_debug_index, const bool& output) override; /** @@ -134,22 +134,22 @@ class ETDumpGen : public EventTracer { */ virtual void log_intermediate_output_delegate( const char* name, - DebugHandle delegate_debug_index, + ::executorch::runtime::DebugHandle delegate_debug_index, const double& output) override; - void set_debug_buffer(Span buffer); - etdump_result get_etdump_data(); + void set_debug_buffer(::executorch::runtime::Span buffer); + ETDumpResult get_etdump_data(); size_t get_num_blocks(); bool is_static_etdump(); void reset(); private: - struct flatcc_builder* builder; - size_t num_blocks = 0; - Span debug_buffer; - size_t debug_buffer_offset = 0; - int bundled_input_index = -1; - ETDumpGen_State etdump_gen_state = ETDumpGen_Init; - struct etdump_static_allocator alloc; + enum class State { + Init, + BlockCreated, + AddingAllocators, + AddingEvents, + Done, + }; void check_ready_to_add_events(); int64_t create_string_entry(const char* name); @@ -162,9 +162,26 @@ class ETDumpGen : public EventTracer { template void log_intermediate_output_delegate_helper( const char* name, - DebugHandle delegate_debug_index, + ::executorch::runtime::DebugHandle delegate_debug_index, const T& output); + + struct flatcc_builder* builder_; + size_t num_blocks_ = 0; + ::executorch::runtime::Span debug_buffer_; + size_t debug_buffer_offset_ = 0; + int bundled_input_index_ = -1; + State state_ = State::Init; + struct internal::ETDumpStaticAllocator alloc_; }; +} // namespace etdump +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using etdump_result = ::executorch::etdump::ETDumpResult; +using ::executorch::etdump::ETDumpGen; } // namespace executor } // namespace torch diff --git a/devtools/etdump/etdump_schema_flatcc.fbs b/devtools/etdump/etdump_schema_flatcc.fbs index d90d278f5fc..1244ebd4aeb 100644 --- a/devtools/etdump/etdump_schema_flatcc.fbs +++ b/devtools/etdump/etdump_schema_flatcc.fbs @@ -76,6 +76,10 @@ table DebugEvent { // String based delegate debug identifier. delegate_debug_id_str:string; + + // Name assigned to this debug event by the runtime. If it is an operator + // call this will just be the name of the operator that was executed. + name:string; } // All the details pertaining to an allocation done in the runtime. The main diff --git a/devtools/etdump/scalar_type.fbs b/devtools/etdump/scalar_type.fbs index fdfe550e9e3..a8da080c679 100644 --- a/devtools/etdump/scalar_type.fbs +++ b/devtools/etdump/scalar_type.fbs @@ -14,6 +14,7 @@ enum ScalarType : byte { SHORT = 2, INT = 3, LONG = 4, + HALF = 5, FLOAT = 6, DOUBLE = 7, BOOL = 11, @@ -24,7 +25,6 @@ enum ScalarType : byte { QUINT4X2 = 16, QUINT2X4 = 17, // Types currently not implemented. 
- // Half = 5, // COMPLEXHALF = 8, // COMPLEXFLOAT = 9, // COMPLEXDOUBLE = 10, diff --git a/devtools/etdump/schema_flatcc.py b/devtools/etdump/schema_flatcc.py index f19f328d3fa..404fa1c9758 100644 --- a/devtools/etdump/schema_flatcc.py +++ b/devtools/etdump/schema_flatcc.py @@ -93,6 +93,7 @@ class Value: @dataclass class DebugEvent: + name: Optional[str] chain_index: int instruction_id: int delegate_debug_id_int: Optional[int] diff --git a/devtools/etdump/targets.bzl b/devtools/etdump/targets.bzl index 6d548ce650f..ddbb35eab74 100644 --- a/devtools/etdump/targets.bzl +++ b/devtools/etdump/targets.bzl @@ -95,9 +95,11 @@ def define_common_targets(): "etdump_flatcc.cpp", "emitter.cpp", ], + headers = [ + "emitter.h", + ], exported_headers = [ "etdump_flatcc.h", - "emitter.h", ], deps = [ "//executorch/runtime/platform:platform", diff --git a/devtools/etdump/tests/etdump_test.cpp b/devtools/etdump/tests/etdump_test.cpp index de8c0abc39d..b750e21eb07 100644 --- a/devtools/etdump/tests/etdump_test.cpp +++ b/devtools/etdump/tests/etdump_test.cpp @@ -20,8 +20,20 @@ #include #include -namespace torch { -namespace executor { +using ::exec_aten::ScalarType; +using ::exec_aten::Tensor; +using ::executorch::etdump::ETDumpGen; +using ::executorch::etdump::ETDumpResult; +using ::executorch::runtime::AllocatorID; +using ::executorch::runtime::ArrayRef; +using ::executorch::runtime::BoxedEvalueList; +using ::executorch::runtime::DelegateDebugIdType; +using ::executorch::runtime::EValue; +using ::executorch::runtime::EventTracerEntry; +using ::executorch::runtime::LoggedEValueType; +using ::executorch::runtime::Span; +using ::executorch::runtime::Tag; +using ::executorch::runtime::testing::TensorFactory; class ProfilerETDumpTest : public ::testing::Test { protected: @@ -49,7 +61,7 @@ TEST_F(ProfilerETDumpTest, SingleProfileEvent) { EventTracerEntry entry = etdump_gen[i]->start_profiling("test_event", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -105,7 +117,7 @@ TEST_F(ProfilerETDumpTest, EmptyBlocks) { etdump_gen[i]->start_profiling("test_event_1", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -160,7 +172,7 @@ TEST_F(ProfilerETDumpTest, AllocationEvents) { TEST_F(ProfilerETDumpTest, DebugEvent) { for (size_t i = 0; i < 2; i++) { - testing::TensorFactory tf; + TensorFactory tf; EValue evalue(tf.ones({3, 2})); etdump_gen[i]->create_event_block("test_block"); @@ -189,7 +201,7 @@ TEST_F(ProfilerETDumpTest, DebugEvent) { TEST_F(ProfilerETDumpTest, DebugEventTensorList) { for (size_t i = 0; i < 2; i++) { - testing::TensorFactory tf; + TensorFactory tf; exec_aten::Tensor storage[2] = {tf.ones({3, 2}), tf.ones({3, 2})}; EValue evalue_1(storage[0]); EValue evalue_2(storage[1]); @@ -212,7 +224,7 @@ TEST_F(ProfilerETDumpTest, DebugEventTensorList) { } TEST_F(ProfilerETDumpTest, VerifyLogging) { - testing::TensorFactory tf; + TensorFactory tf; EValue evalue(tf.ones({3, 2})); for (size_t i = 0; i < 2; i++) { @@ -225,7 +237,7 @@ TEST_F(ProfilerETDumpTest, VerifyLogging) { etdump_gen[i]->log_evalue(evalue); etdump_gen[i]->log_evalue(evalue, LoggedEValueType::kProgramOutput); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = 
etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -297,7 +309,7 @@ TEST_F(ProfilerETDumpTest, MultipleBlocksWithEvents) { entry = etdump_gen[i]->start_profiling("test_event", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -363,7 +375,7 @@ TEST_F(ProfilerETDumpTest, VerifyData) { entry = etdump_gen[i]->start_profiling("test_event2", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -421,7 +433,7 @@ TEST_F(ProfilerETDumpTest, LogDelegateIntermediateOutput) { Span buffer((uint8_t*)ptr, 2048); etdump_gen[i]->create_event_block("test_block"); - testing::TensorFactory tf; + TensorFactory tf; ET_EXPECT_DEATH( etdump_gen[i]->log_intermediate_output_delegate( @@ -462,7 +474,7 @@ TEST_F(ProfilerETDumpTest, LogDelegateIntermediateOutput) { static_cast(-1), true); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -474,7 +486,7 @@ TEST_F(ProfilerETDumpTest, LogDelegateIntermediateOutput) { } TEST_F(ProfilerETDumpTest, VerifyDelegateIntermediateLogging) { - testing::TensorFactory tf; + TensorFactory tf; EValue evalue(tf.ones({3, 2})); for (size_t i = 0; i < 2; i++) { @@ -492,7 +504,7 @@ TEST_F(ProfilerETDumpTest, VerifyDelegateIntermediateLogging) { etdump_gen[i]->log_intermediate_output_delegate( nullptr, 258, tf.ones({5, 6})); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -603,7 +615,7 @@ TEST_F(ProfilerETDumpTest, LogDelegateEvents) { etdump_gen[i]->end_profiling(entry), "Delegate events must use end_profiling_delegate to mark the end of a delegate profiling event."); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -681,7 +693,7 @@ TEST_F(ProfilerETDumpTest, WriteAfterGetETDumpData) { etdump_gen[i]->start_profiling("test_event", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -712,6 +724,3 @@ TEST_F(ProfilerETDumpTest, WriteAfterGetETDumpData) { } } } - -} // namespace executor -} // namespace torch diff --git a/devtools/etdump/tests/serialize_test.py b/devtools/etdump/tests/serialize_test.py index 1a7f3bd93f5..5cab3e5b2ba 100644 --- a/devtools/etdump/tests/serialize_test.py +++ b/devtools/etdump/tests/serialize_test.py @@ -83,6 +83,7 @@ def get_sample_etdump_flatcc() -> flatcc.ETDumpFlatCC: profile_event=None, allocation_event=None, debug_event=flatcc.DebugEvent( + name="test_debug_event", chain_index=1, instruction_id=0, delegate_debug_id_str="56", diff --git a/devtools/inspector/_inspector.py b/devtools/inspector/_inspector.py index f98e3cd3a56..0539d4f5e4b 100644 --- a/devtools/inspector/_inspector.py +++ b/devtools/inspector/_inspector.py @@ -4,6 +4,8 @@ # This source code is licensed under the 
BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import dataclasses import logging import sys @@ -39,6 +41,7 @@ ) from executorch.devtools.etrecord import ETRecord, parse_etrecord from executorch.devtools.inspector._inspector_utils import ( + calculate_time_scale_factor, create_debug_handle_to_op_node_mapping, EDGE_DIALECT_GRAPH_KEY, EXCLUDED_COLUMNS_WHEN_PRINTING, @@ -52,7 +55,6 @@ is_inference_output_equal, ProgramOutput, RESERVED_FRAMEWORK_EVENT_NAMES, - TIME_SCALE_DICT, TimeScale, verify_debug_data_equivalence, ) @@ -150,6 +152,7 @@ def _gen_from_event(event: ProfileEvent) -> "ProfileEventSignature": # Signature of a DebugEvent @dataclass(frozen=True, order=True) class DebugEventSignature: + name: str = "" instruction_id: Optional[int] = -1 delegate_id: Optional[int] = None delegate_id_str: Optional[str] = None @@ -163,6 +166,7 @@ def _gen_from_event(event: DebugEvent) -> "DebugEventSignature": The Signature will convert these back to the intended None value """ return DebugEventSignature( + event.name or "", event.instruction_id if event.instruction_id != -1 else None, event.delegate_debug_id_int if event.delegate_debug_id_int != -1 else None, event.delegate_debug_id_str if event.delegate_debug_id_str != "" else None, @@ -468,46 +472,63 @@ def _calculate_elapsed_time(start_time, end_time): return elapsed_time @staticmethod - def _populate_profiling_related_fields( + def _populate_event_signature_fields( ret_event: "Event", - profile_event_signature: Optional[ProfileEventSignature], - events: List[InstructionEvent], - scale_factor: float, + event_signature: Optional[Union[ProfileEventSignature, DebugEventSignature]], ) -> None: """ Given a partially constructed Event, populate the fields related to - the profile events + the profile event signature or debug event signature Fields Updated: name delegate_debug_identifier is_delegated_op - perf_data - delegate_debug_metadatas """ - - # Fill out fields from profile event signature - if profile_event_signature is not None: - if profile_event_signature.delegate_id is not None: # 0 is a valid value - delegate_debug_identifier = profile_event_signature.delegate_id + # TODO: T201347372 Push the None check to earlier in the stack. 
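        # Resolution below: an integer delegate_id wins (0 is a valid value),
        # then delegate_id_str; if neither is set, the event is treated as
        # non-delegated and keeps the signature's own name.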
+ if event_signature is not None: + if event_signature.delegate_id is not None: # 0 is a valid value + delegate_debug_identifier = event_signature.delegate_id else: - delegate_debug_identifier = ( - profile_event_signature.delegate_id_str or None - ) + delegate_debug_identifier = event_signature.delegate_id_str or None # Use the delegate identifier as the event name if delegated is_delegated_op = delegate_debug_identifier is not None name = ( - profile_event_signature.name + event_signature.name if not is_delegated_op else str(delegate_debug_identifier) ) # Update fields - ret_event.name = name + # This is for older versions of etdump that don't have the name field for debug events; in that case we don't update the name field + if name: + ret_event.name = name ret_event.delegate_debug_identifier = delegate_debug_identifier ret_event.is_delegated_op = is_delegated_op + @staticmethod + def _populate_profiling_related_fields( + ret_event: "Event", + profile_event_signature: Optional[ProfileEventSignature], + events: List[InstructionEvent], + scale_factor: float, + ) -> None: + """ + Given a partially constructed Event, populate the fields related to + the profile events + + Fields Updated: + name + delegate_debug_identifier + is_delegated_op + perf_data + delegate_debug_metadatas + """ + + # Fill out fields from profile event signature + Event._populate_event_signature_fields(ret_event, profile_event_signature) + # Fill out fields from profile event data = [] delegate_debug_metadatas = [] @@ -575,9 +596,15 @@ def _populate_debugging_related_fields( the debug events Fields Updated: + name + delegate_debug_identifier + is_delegated_op debug_data """ + # Fill out fields from debug event signature + Event._populate_event_signature_fields(ret_event, debug_event_signature) + debug_data: List[flatcc.Value] = [] for event in events: if (debug_events := event.debug_events) is None: @@ -799,9 +826,7 @@ class GroupedRunInstances: # Construct the EventBlocks event_blocks = [] - scale_factor = ( - TIME_SCALE_DICT[source_time_scale] / TIME_SCALE_DICT[target_time_scale] - ) + scale_factor = calculate_time_scale_factor(source_time_scale, target_time_scale) for run_signature, grouped_run_instance in run_groups.items(): run_group: OrderedDict[EventSignature, List[InstructionEvent]] = ( grouped_run_instance.events @@ -966,6 +991,9 @@ def __init__( debug_buffer_path: Debug buffer file path that contains the debug data referenced by ETDump for intermediate and program outputs. delegate_metadata_parser: Optional function to parse delegate metadata from a Profiling Event. Expected signature of the function is: (delegate_metadata_list: List[bytes]) -> Union[List[str], Dict[str, Any]] + delegate_time_scale_converter: Optional function to convert the time scale of delegate profiling data. If not given, use the conversion ratio of + target_time_scale/source_time_scale. + enable_module_hierarchy: Enable submodules in the operator graph. Defaults to False. 
Returns: None @@ -980,6 +1008,14 @@ def __init__( self._source_time_scale = source_time_scale self._target_time_scale = target_time_scale + if delegate_time_scale_converter is None: + scale_factor = calculate_time_scale_factor( + source_time_scale, target_time_scale + ) + delegate_time_scale_converter = ( + lambda event_name, input_time: input_time / scale_factor + ) + if etrecord is None: self._etrecord = None elif isinstance(etrecord, ETRecord): @@ -1002,10 +1038,10 @@ def __init__( ) self.event_blocks = EventBlock._gen_from_etdump( - etdump, - self._source_time_scale, - self._target_time_scale, - output_buffer, + etdump=etdump, + source_time_scale=self._source_time_scale, + target_time_scale=self._target_time_scale, + output_buffer=output_buffer, delegate_metadata_parser=delegate_metadata_parser, delegate_time_scale_converter=delegate_time_scale_converter, ) diff --git a/devtools/inspector/_inspector_utils.py b/devtools/inspector/_inspector_utils.py index 98b5fdc722f..5f04e2d0413 100644 --- a/devtools/inspector/_inspector_utils.py +++ b/devtools/inspector/_inspector_utils.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import math from enum import Enum from typing import Dict, List, Mapping, Optional, Tuple, TypeAlias, Union @@ -63,6 +65,15 @@ class TimeScale(Enum): } +def calculate_time_scale_factor( + source_time_scale: TimeScale, target_time_scale: TimeScale +) -> float: + """ + Calculate the factor (source divided by target) between two time scales + """ + return TIME_SCALE_DICT[source_time_scale] / TIME_SCALE_DICT[target_time_scale] + + # Model Debug Output InferenceOutput: TypeAlias = Union[ torch.Tensor, List[torch.Tensor], int, float, str, bool, None diff --git a/devtools/inspector/tests/event_blocks_test.py b/devtools/inspector/tests/event_blocks_test.py index 4101035f99b..85b65aa5f34 100644 --- a/devtools/inspector/tests/event_blocks_test.py +++ b/devtools/inspector/tests/event_blocks_test.py @@ -62,6 +62,7 @@ def _gen_sample_profile_event( def _gen_sample_debug_event( instruction_id: int, delegate_debug_id: Optional[Union[int, str]] = None, + name: str = "test_debug_event", ) -> flatcc.DebugEvent: """ Helper for generating test DebugEvents @@ -77,6 +78,7 @@ def _gen_sample_debug_event( ) return flatcc.DebugEvent( + name=name, chain_index=0, instruction_id=instruction_id, delegate_debug_id_int=delegate_debug_id_int, @@ -299,6 +301,42 @@ def _get_sample_etdump_flatcc_profiling_and_debugging() -> flatcc.ETDumpFlatCC: return ETDumpFlatCC(version=0, run_data=[run_data_1, run_data_2, run_data_3]) + @staticmethod + def _get_sample_etdump_flatcc_debug_events_only( + event_name: str, + delegate_debug_id: str, + ) -> flatcc.ETDumpFlatCC: + """ + Helper for getting a sample ETDumpFlatCC object with RunData signature_a + and (debug_event_delegated, debug_event_non_delegated, no profile event) + """ + + debug_event_delegated = TestEventBlock._gen_sample_debug_event( + instruction_id=1, delegate_debug_id=delegate_debug_id, name=event_name + ) + debug_event_non_delegated = TestEventBlock._gen_sample_debug_event( + instruction_id=1, name=event_name + ) + run_data_1 = flatcc.RunData( + name="signature_a", + bundled_input_index=-1, + allocators=[], + events=[ + flatcc.Event( + allocation_event=None, + debug_event=debug_event_delegated, + profile_event=None, + ), + flatcc.Event( + allocation_event=None, + debug_event=debug_event_non_delegated, + profile_event=None, + ), + ], + ) 
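        # run_data_1 holds one delegated and one non-delegated debug event;
        # the Inspector is expected to name the first by its delegate_debug_id
        # and the second by event_name (asserted in
        # test_gen_from_etdump_debug_events_only below).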
+ + return ETDumpFlatCC(version=0, run_data=[run_data_1]) + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tests ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ def test_gen_from_etdump(self) -> None: @@ -370,6 +408,30 @@ def test_gen_from_etdump_inconsistent_debug_data(self) -> None: with self.assertRaises(AssertionError): EventBlock._gen_from_etdump(etdump) + def test_gen_from_etdump_debug_events_only(self) -> None: + """ + Test generation of EventBlocks given an ETDump with only debugging events + + Specifically it tests: + - Correct number of EventBlocks and Events + - Correct name of each Event + """ + event_name = "test_debug_event_only" + delegate_debug_id = "debug_id" + etdump: ETDumpFlatCC = ( + TestEventBlock._get_sample_etdump_flatcc_debug_events_only( + event_name=event_name, + delegate_debug_id=delegate_debug_id, + ) + ) + event_blocks = EventBlock._gen_from_etdump(etdump) + self.assertEqual(len(event_blocks), 1) + self.assertEqual(len(event_blocks[0].events), 2) + # Delegated event uses delegate_debug_id as event name + self.assertEqual(event_blocks[0].events[0].name, delegate_debug_id) + # Non delegated event uses event_name as event name + self.assertEqual(event_blocks[0].events[1].name, event_name) + def test_inspector_event_generation(self) -> None: """ Test Inspector.Event derivation from various ProfileEvent cases diff --git a/devtools/inspector/tests/inspector_test.py b/devtools/inspector/tests/inspector_test.py index 55f0cd10ae9..34c96eef534 100644 --- a/devtools/inspector/tests/inspector_test.py +++ b/devtools/inspector/tests/inspector_test.py @@ -4,13 +4,15 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import random import statistics import tempfile import unittest from contextlib import redirect_stdout -from typing import List +from typing import Callable, List from unittest.mock import patch @@ -32,6 +34,7 @@ InstructionEvent, InstructionEventSignature, ProfileEventSignature, + TimeScale, ) from executorch.exir import ExportedProgram @@ -88,6 +91,33 @@ def test_inspector_constructor(self): # Because we mocked parse_etrecord() to return None, this method shouldn't be called mock_gen_graphs_from_etrecord.assert_not_called() + def test_default_delegate_time_scale_converter(self): + # Create a context manager to patch functions called by Inspector.__init__ + with patch.object( + _inspector, "parse_etrecord", return_value=None + ), patch.object( + _inspector, "gen_etdump_object", return_value=None + ), patch.object( + EventBlock, "_gen_from_etdump" + ) as mock_gen_from_etdump, patch.object( + _inspector, "gen_graphs_from_etrecord" + ), patch.object( + _inspector, "create_debug_handle_to_op_node_mapping" + ): + # Call the constructor of Inspector + Inspector( + etdump_path=ETDUMP_PATH, + etrecord=ETRECORD_PATH, + source_time_scale=TimeScale.US, + target_time_scale=TimeScale.S, + ) + + # Verify delegate_time_scale_converter is set to be a callable + self.assertIsInstance( + mock_gen_from_etdump.call_args.get("delegate_time_scale_converter"), + Callable, + ) + def test_inspector_print_data_tabular(self): # Create a context manager to patch functions called by Inspector.__init__ with patch.object( @@ -288,6 +318,7 @@ def test_populate_debugging_related_fields_raises_for_inconsistent_events(self): ) debug_event_0 = flatcc.DebugEvent( + name="event", chain_index=1, instruction_id=0, delegate_debug_id_int=1, @@ -311,6 +342,7 @@ def test_populate_debugging_related_fields_raises_for_inconsistent_events(self): 
# Note the sizes of this tensor are different from the previous one debug_event_1 = flatcc.DebugEvent( + name="event", chain_index=1, instruction_id=0, delegate_debug_id_int=1, @@ -355,6 +387,7 @@ def test_populate_debugging_related_fields_passes_for_consistent_events(self): ) debug_event_0 = flatcc.DebugEvent( + name="event", chain_index=1, instruction_id=0, delegate_debug_id_int=1, @@ -378,6 +411,7 @@ def test_populate_debugging_related_fields_passes_for_consistent_events(self): # Same as the event above except for offset debug_event_1 = flatcc.DebugEvent( + name="event", chain_index=1, instruction_id=0, delegate_debug_id_int=1, diff --git a/devtools/inspector/tests/inspector_utils_test.py b/devtools/inspector/tests/inspector_utils_test.py index d853732fcc7..73511f5fcd7 100644 --- a/devtools/inspector/tests/inspector_utils_test.py +++ b/devtools/inspector/tests/inspector_utils_test.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import tempfile import unittest from typing import Dict, Tuple @@ -23,11 +25,13 @@ from executorch.devtools.etrecord.tests.etrecord_test import TestETRecord from executorch.devtools.inspector._inspector_utils import ( + calculate_time_scale_factor, create_debug_handle_to_op_node_mapping, EDGE_DIALECT_GRAPH_KEY, find_populated_event, gen_graphs_from_etrecord, is_inference_output_equal, + TimeScale, ) @@ -74,6 +78,7 @@ def test_find_populated_event(self): end_time=2002, ) debug_event = flatcc.DebugEvent( + name="test_debug_event", chain_index=1, instruction_id=0, delegate_debug_id_str="56", @@ -170,6 +175,19 @@ def test_is_inference_output_equal_returns_true_for_same_strs(self): ) ) + def test_calculate_time_scale_factor_second_based(self): + self.assertEqual( + calculate_time_scale_factor(TimeScale.NS, TimeScale.MS), 1000000 + ) + self.assertEqual( + calculate_time_scale_factor(TimeScale.MS, TimeScale.NS), 1 / 1000000 + ) + + def test_calculate_time_scale_factor_cycles(self): + self.assertEqual( + calculate_time_scale_factor(TimeScale.CYCLES, TimeScale.CYCLES), 1 + ) + def gen_mock_operator_graph_with_expected_map() -> ( Tuple[OperatorGraph, Dict[int, OperatorNode]] diff --git a/docs/source/getting-started-setup.md b/docs/source/getting-started-setup.md index d610f020ef2..1fbe35c72bc 100644 --- a/docs/source/getting-started-setup.md +++ b/docs/source/getting-started-setup.md @@ -59,13 +59,11 @@ also work in similar environments. - We recommend `conda` as it provides cross-language support and integrates smoothly with `pip` (Python's built-in package manager) - Otherwise, Python's built-in virtual environment manager `python venv` is a good alternative. -* `g++` version 8 or higher, `clang++` version 8 or higher, or another - C++17-compatible toolchain that supports GNU C-style [statement - expressions](https://gcc.gnu.org/onlinedocs/gcc/Statement-Exprs.html) (`({ ... - })` syntax). +* `g++` version 7 or higher, `clang++` version 5 or higher, or another + C++17-compatible toolchain. Note that the cross-compilable core runtime code supports a wider range of -toolchains, down to C++11. See the [Runtime Overview](./runtime-overview.md) for +toolchains, down to C++17. See the [Runtime Overview](./runtime-overview.md) for portability details. 
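A quick aside on the `calculate_time_scale_factor` assertions exercised above: the factor is the ratio between the two units, so converting a duration means dividing by it. Below is a minimal sketch consistent with those tests; the enum values and the nanoseconds-per-unit table are assumptions for illustration, not the actual `_inspector_utils` internals:

```python
from enum import Enum


class TimeScale(Enum):
    CYCLES = "cycles"
    NS = "ns"
    US = "us"
    MS = "ms"
    S = "s"


# Hypothetical lookup table: how many nanoseconds one unit of each scale spans.
# CYCLES has no fixed wall-clock size, so it is treated as an opaque unit of 1.
_NS_PER_UNIT = {
    TimeScale.CYCLES: 1,
    TimeScale.NS: 1,
    TimeScale.US: 1_000,
    TimeScale.MS: 1_000_000,
    TimeScale.S: 1_000_000_000,
}


def calculate_time_scale_factor(source: TimeScale, target: TimeScale) -> float:
    """Divide a source-scale duration by this factor to express it in target scale."""
    return _NS_PER_UNIT[target] / _NS_PER_UNIT[source]


assert calculate_time_scale_factor(TimeScale.NS, TimeScale.MS) == 1_000_000
assert calculate_time_scale_factor(TimeScale.MS, TimeScale.NS) == 1 / 1_000_000
assert calculate_time_scale_factor(TimeScale.CYCLES, TimeScale.CYCLES) == 1
```

With this shape, NS to MS yields 1000000 and the inverse yields exactly 1 / 1000000, matching the test expectations, while CYCLES only converts to itself.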
## Quick Setup: Colab/Jupyter Notebook Prototype diff --git a/docs/source/runtime-overview.md b/docs/source/runtime-overview.md index 7bc8b4dd8b4..6766e678e0e 100644 --- a/docs/source/runtime-overview.md +++ b/docs/source/runtime-overview.md @@ -96,7 +96,7 @@ can build it for a wide variety of target systems. #### C++ Language Considerations -* The code is C++11-compatible to work with older toolchains. +* The code is C++17-compatible to work with older toolchains. * The runtime does not use exceptions or RTTI, although it is not antagonistic to them. * The code is compatible with GCC and Clang, and has also been built with diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh index 272ddcfc0c5..9cef98e6227 100755 --- a/examples/arm/setup.sh +++ b/examples/arm/setup.sh @@ -91,6 +91,7 @@ fi ### Optional user args ######## root_dir=${1:-"${script_dir}/ethos-u-scratch"} +mkdir -p ${root_dir} root_dir=$(realpath ${root_dir}) ######## @@ -246,7 +247,6 @@ fi cd "${script_dir}" # Setup the root dir -mkdir -p "${root_dir}" cd "${root_dir}" echo "[main] Using root dir ${root_dir}" diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java index 7ed9c9ec979..ac14270ed51 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java @@ -19,6 +19,7 @@ import android.os.Bundle; import android.os.Handler; import android.os.Looper; +import android.os.Process; import android.provider.MediaStore; import android.system.ErrnoException; import android.system.Os; @@ -44,6 +45,8 @@ import java.lang.reflect.Type; import java.util.ArrayList; import java.util.List; +import java.util.concurrent.Executor; +import java.util.concurrent.Executors; import org.pytorch.executorch.LlamaCallback; import org.pytorch.executorch.LlamaModule; @@ -70,13 +73,17 @@ public class MainActivity extends AppCompatActivity implements Runnable, LlamaCa private SettingsFields mCurrentSettingsFields; private Handler mMemoryUpdateHandler; private Runnable memoryUpdater; + private int promptID = 0; + private long startPos = 0; + private static final int CONVERSATION_HISTORY_MESSAGE_LOOKBACK = 2; + private Executor executor; @Override public void onResult(String result) { if (result.equals(PromptFormat.getStopToken(mCurrentSettingsFields.getModelType()))) { return; } - if (result.equals("\n\n")) { + if (result.equals("\n\n") || result.equals("\n")) { if (!mResultMessage.getText().isEmpty()) { mResultMessage.appendText(result); run(); @@ -147,6 +154,12 @@ private void setLocalModel(String modelPath, String tokenizerPath, float tempera + (float) loadDuration / 1000 + " sec." 
+ " You can send text or image for inference"; + + if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) { + ETLogging.getInstance().log("Llava start prefill prompt"); + startPos = mModule.prefillPrompt(PromptFormat.getLlavaPresetPrompt(), 0, 1, 0); + ETLogging.getInstance().log("Llava completes prefill prompt"); + } } Message modelLoadedMessage = new Message(modelInfo, false, MessageType.SYSTEM, 0); @@ -195,6 +208,11 @@ private void populateExistingMessages(String existingMsgJSON) { mMessageAdapter.notifyDataSetChanged(); } + private int setPromptID() { + + return mMessageAdapter.getMaxPromptID() + 1; + } + @Override protected void onCreate(Bundle savedInstanceState) { super.onCreate(savedInstanceState); @@ -216,6 +234,7 @@ protected void onCreate(Bundle savedInstanceState) { String existingMsgJSON = mDemoSharedPreferences.getSavedMessages(); if (!existingMsgJSON.isEmpty()) { populateExistingMessages(existingMsgJSON); + promptID = setPromptID(); } mSettingsButton = requireViewById(R.id.settings); mSettingsButton.setOnClickListener( @@ -232,6 +251,7 @@ protected void onCreate(Bundle savedInstanceState) { setupCameraRoll(); startMemoryUpdate(); setupShowLogsButton(); + executor = Executors.newSingleThreadExecutor(); } @Override @@ -537,6 +557,32 @@ private void showMediaPreview(List uris) { imageViews.get(i).setVisibility(View.VISIBLE); imageViews.get(i).setImageURI(mSelectedImageUri.get(i)); } + + // For LLava, we want to call prefill_image as soon as an image is selected + // Llava only support 1 image for now + if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) { + List processedImageList = getProcessedImagesForModel(mSelectedImageUri); + if (!processedImageList.isEmpty()) { + mMessageAdapter.add( + new Message("Llava - Starting image Prefill.", false, MessageType.SYSTEM, 0)); + mMessageAdapter.notifyDataSetChanged(); + Runnable runnable = + () -> { + Process.setThreadPriority(Process.THREAD_PRIORITY_MORE_FAVORABLE); + ETLogging.getInstance().log("Starting runnable prefill image"); + ETImage img = processedImageList.get(0); + ETLogging.getInstance().log("Llava start prefill image"); + startPos = + mModule.prefillImages( + img.getInts(), + img.getWidth(), + img.getHeight(), + ModelUtils.VISION_MODEL_IMAGE_CHANNELS, + startPos); + }; + executor.execute(runnable); + } + } } private void addSelectedImagesToChatThread(List selectedImageUri) { @@ -552,6 +598,48 @@ private void addSelectedImagesToChatThread(List selectedImageUri) { mMessageAdapter.notifyDataSetChanged(); } + private String getConversationHistory() { + String conversationHistory = ""; + + ArrayList conversations = + mMessageAdapter.getRecentSavedTextMessages(CONVERSATION_HISTORY_MESSAGE_LOOKBACK); + if (conversations.isEmpty()) { + return conversationHistory; + } + + int prevPromptID = conversations.get(0).getPromptID(); + String conversationFormat = + PromptFormat.getConversationFormat(mCurrentSettingsFields.getModelType()); + String format = conversationFormat; + for (int i = 0; i < conversations.size(); i++) { + Message conversation = conversations.get(i); + int currentPromptID = conversation.getPromptID(); + if (currentPromptID != prevPromptID) { + conversationHistory = conversationHistory + format; + format = conversationFormat; + prevPromptID = currentPromptID; + } + if (conversation.getIsSent()) { + format = format.replace(PromptFormat.USER_PLACEHOLDER, conversation.getText()); + } else { + format = format.replace(PromptFormat.ASSISTANT_PLACEHOLDER, conversation.getText()); + } + } + 
conversationHistory = conversationHistory + format; + + return conversationHistory; + } + + private String getTotalFormattedPrompt(String conversationHistory, String rawPrompt) { + if (conversationHistory.isEmpty()) { + return mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt); + } + + return mCurrentSettingsFields.getFormattedSystemPrompt() + + conversationHistory + + mCurrentSettingsFields.getFormattedUserPrompt(rawPrompt); + } + private void onModelRunStarted() { mSendButton.setClickable(false); mSendButton.setImageResource(R.drawable.baseline_stop_24); @@ -567,42 +655,26 @@ private void onModelRunStopped() { mSendButton.setOnClickListener( view -> { addSelectedImagesToChatThread(mSelectedImageUri); - List processedImageList = getProcessedImagesForModel(mSelectedImageUri); - processedImageList.forEach( - image -> { - ETLogging.getInstance() - .log( - "Image preprocessed:" - + " uri = " - + image.getUri().getLastPathSegment() - + "," - + " width = " - + image.getWidth() - + "," - + " height = " - + image.getHeight() - + "," - + " bytes size = " - + image.getBytes().length); - }); String rawPrompt = mEditTextMessage.getText().toString(); - String prompt = mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt); // We store raw prompt into message adapter, because we don't want to show the extra // tokens from system prompt - mMessageAdapter.add(new Message(rawPrompt, true, MessageType.TEXT, 0)); + mMessageAdapter.add(new Message(rawPrompt, true, MessageType.TEXT, promptID)); mMessageAdapter.notifyDataSetChanged(); mEditTextMessage.setText(""); - mResultMessage = new Message("", false, MessageType.TEXT, 0); + mResultMessage = new Message("", false, MessageType.TEXT, promptID); mMessageAdapter.add(mResultMessage); // Scroll to bottom of the list mMessagesView.smoothScrollToPosition(mMessageAdapter.getCount() - 1); // After images are added to prompt and chat thread, we clear the imageURI list // Note: This has to be done after imageURIs are no longer needed by LlamaModule mSelectedImageUri = null; + promptID++; Runnable runnable = new Runnable() { @Override public void run() { + Process.setThreadPriority(Process.THREAD_PRIORITY_MORE_FAVORABLE); + ETLogging.getInstance().log("starting runnable generate()"); runOnUiThread( new Runnable() { @Override @@ -610,37 +682,24 @@ public void run() { onModelRunStarted(); } }); - ETLogging.getInstance().log("Running inference.. prompt=" + prompt); long generateStartTime = System.currentTimeMillis(); if (ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()) == ModelUtils.VISION_MODEL) { - if (!processedImageList.isEmpty()) { - // For now, Llava only support 1 image. - ETImage img = processedImageList.get(0); - mModule.generate( - processedImageList.get(0).getInts(), - img.getWidth(), - img.getHeight(), - ModelUtils.VISION_MODEL_IMAGE_CHANNELS, - prompt, - ModelUtils.VISION_MODEL_SEQ_LEN, - false, - MainActivity.this); - } else { - // no image selected, we pass in empty int array - mModule.generate( - new int[0], - 0, - 0, - ModelUtils.VISION_MODEL_IMAGE_CHANNELS, - prompt, - ModelUtils.VISION_MODEL_SEQ_LEN, - false, - MainActivity.this); - } + mModule.generateFromPos( + mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt), + ModelUtils.VISION_MODEL_SEQ_LEN, + startPos, + MainActivity.this, + false); } else { + String finalPrompt = + getTotalFormattedPrompt(getConversationHistory(), rawPrompt); + ETLogging.getInstance().log("Running inference.. 
prompt=" + finalPrompt); mModule.generate( - prompt, ModelUtils.TEXT_MODEL_SEQ_LEN, false, MainActivity.this); + finalPrompt, + (int) (finalPrompt.length() * 0.75) + 64, + MainActivity.this, + false); } long generateDuration = System.currentTimeMillis() - generateStartTime; @@ -655,7 +714,7 @@ public void run() { ETLogging.getInstance().log("Inference completed"); } }; - new Thread(runnable).start(); + executor.execute(runnable); }); mMessageAdapter.notifyDataSetChanged(); } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java index d9cbd95a1a7..2538c852e48 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java @@ -16,6 +16,7 @@ import android.widget.ImageView; import android.widget.TextView; import java.util.ArrayList; +import java.util.Collections; public class MessageAdapter extends ArrayAdapter { @@ -90,4 +91,41 @@ public void clear() { public ArrayList getSavedMessages() { return savedMessages; } + + public ArrayList getRecentSavedTextMessages(int numOfLatestPromptMessages) { + ArrayList recentMessages = new ArrayList(); + int lastIndex = savedMessages.size() - 1; + Message messageToAdd = savedMessages.get(lastIndex); + int oldPromptID = messageToAdd.getPromptID(); + + for (int i = 0; i < savedMessages.size(); i++) { + messageToAdd = savedMessages.get(lastIndex - i); + if (messageToAdd.getMessageType() != MessageType.SYSTEM) { + if (messageToAdd.getPromptID() != oldPromptID) { + numOfLatestPromptMessages--; + oldPromptID = messageToAdd.getPromptID(); + } + if (numOfLatestPromptMessages > 0) { + if (messageToAdd.getMessageType() == MessageType.TEXT) { + recentMessages.add(messageToAdd); + } + } else { + break; + } + } + } + + // To place the order in [input1, output1, input2, output2...] 
+ Collections.reverse(recentMessages); + return recentMessages; + } + + public int getMaxPromptID() { + int maxPromptID = -1; + for (Message msg : savedMessages) { + + maxPromptID = Math.max(msg.getPromptID(), maxPromptID); + } + return maxPromptID; + } } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java index 7342b4ab00c..36e738c3d0e 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java @@ -12,6 +12,8 @@ public class PromptFormat { public static final String SYSTEM_PLACEHOLDER = "{{ system_prompt }}"; public static final String USER_PLACEHOLDER = "{{ user_prompt }}"; + public static final String ASSISTANT_PLACEHOLDER = "{{ assistant_response }}"; + public static final String DEFAULT_SYSTEM_PROMPT = "Answer the questions in a few sentences"; public static String getSystemPromptTemplate(ModelType modelType) { switch (modelType) { @@ -33,8 +35,20 @@ public static String getUserPromptTemplate(ModelType modelType) { case LLAMA_3_1: return "<|start_header_id|>user<|end_header_id|>\n" + USER_PLACEHOLDER - + "<|eot_id|>\n" + + "<|eot_id|>" + "<|start_header_id|>assistant<|end_header_id|>"; + + case LLAVA_1_5: + default: + return USER_PLACEHOLDER; + } + } + + public static String getConversationFormat(ModelType modelType) { + switch (modelType) { + case LLAMA_3: + case LLAMA_3_1: + return getUserPromptTemplate(modelType) + "\n" + ASSISTANT_PLACEHOLDER + "<|eot_id|>"; case LLAVA_1_5: return USER_PLACEHOLDER + " ASSISTANT:"; default: @@ -53,4 +67,9 @@ public static String getStopToken(ModelType modelType) { return ""; } } + + public static String getLlavaPresetPrompt() { + return "A chat between a curious human and an artificial intelligence assistant. The assistant" + + " gives helpful, detailed, and polite answers to the human's questions. 
USER: "; + } } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java index 5f1fc96e1ac..0736c8cda94 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java @@ -43,7 +43,7 @@ public class SettingsActivity extends AppCompatActivity { public SettingsFields mSettingsFields; private DemoSharedPreferences mDemoSharedPreferences; - public static double TEMPERATURE_MIN_VALUE = 0.1; + public static double TEMPERATURE_MIN_VALUE = 0.0; @Override protected void onCreate(Bundle savedInstanceState) { @@ -120,6 +120,7 @@ private void setupLoadModelButton() { public void onClick(DialogInterface dialog, int whichButton) { mSettingsFields.saveLoadModelAction(true); mLoadModelButton.setEnabled(false); + onBackPressed(); } }) .setNegativeButton(android.R.string.no, null) @@ -208,8 +209,7 @@ public void afterTextChanged(Editable s) { new DialogInterface.OnClickListener() { public void onClick(DialogInterface dialog, int whichButton) { // Clear the messageAdapter and sharedPreference - mSystemPromptEditText.setText( - PromptFormat.getSystemPromptTemplate(mModelType)); + mSystemPromptEditText.setText(PromptFormat.DEFAULT_SYSTEM_PROMPT); } }) .setNegativeButton(android.R.string.no, null) diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java index 466d3303e28..b71799981b2 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java @@ -38,12 +38,12 @@ public String getFormattedSystemAndUserPrompt(String prompt) { return getFormattedSystemPrompt() + getFormattedUserPrompt(prompt); } - private String getFormattedSystemPrompt() { + public String getFormattedSystemPrompt() { return PromptFormat.getSystemPromptTemplate(modelType) .replace(PromptFormat.SYSTEM_PLACEHOLDER, systemPrompt); } - private String getFormattedUserPrompt(String prompt) { + public String getFormattedUserPrompt(String prompt) { return userPrompt.replace(PromptFormat.USER_PLACEHOLDER, prompt); } diff --git a/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh b/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh index 87d0f47c956..68d191685d3 100644 --- a/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh +++ b/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh @@ -16,6 +16,7 @@ cmake . 
-DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ @@ -37,6 +38,8 @@ cmake examples/models/llama2 \ -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/examples/models/llama2 @@ -47,7 +50,9 @@ cmake extension/android \ -DANDROID_ABI="${ANDROID_ABI}" \ -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_BUILD_LLAMA_JNI=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/extension/android @@ -59,7 +64,7 @@ mkdir -p "${JNI_LIBS_PATH}/${ANDROID_ABI}" BUILD_AAR_DIR="$(mktemp -d)" mkdir -p "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}" "${BUILD_AAR_DIR}/libs" JNI_LIBS_PATH="${BUILD_AAR_DIR}/jni" -cp "${CMAKE_OUT}"/extension/android/libexecutorch_llama_jni.so "${JNI_LIBS_PATH}/${ANDROID_ABI}/" +cp "${CMAKE_OUT}"/extension/android/libexecutorch_jni.so "${JNI_LIBS_PATH}/${ANDROID_ABI}/libexecutorch_jni.so" cp "${CMAKE_OUT}"/lib/libqnn_executorch_backend.so "${JNI_LIBS_PATH}/${ANDROID_ABI}/" cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtp.so "${JNI_LIBS_PATH}/${ANDROID_ABI}/" cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnSystem.so "${JNI_LIBS_PATH}/${ANDROID_ABI}/" diff --git a/examples/demo-apps/android/LlamaDemo/setup.sh b/examples/demo-apps/android/LlamaDemo/setup.sh index 91a68d4b88b..5e65929426b 100644 --- a/examples/demo-apps/android/LlamaDemo/setup.sh +++ b/examples/demo-apps/android/LlamaDemo/setup.sh @@ -16,6 +16,7 @@ cmake . 
 -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \
 -DEXECUTORCH_BUILD_XNNPACK=ON \
 -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
 -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
 -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
 -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
 -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
@@ -37,6 +38,7 @@ cmake examples/models/llama2 \
 -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \
 -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
 -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
 -DEXECUTORCH_BUILD_XNNPACK=ON \
 -DCMAKE_BUILD_TYPE=Release \
 -B"${CMAKE_OUT}"/examples/models/llama2
@@ -48,6 +50,7 @@ cmake extension/android \
 -DANDROID_ABI="${ANDROID_ABI}" \
 -DANDROID_PLATFORM=android-23 \
 -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \
+ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
 -DEXECUTORCH_BUILD_LLAMA_JNI=ON \
 -DCMAKE_BUILD_TYPE=Release \
 -B"${CMAKE_OUT}"/extension/android
@@ -56,7 +59,7 @@ cmake --build "${CMAKE_OUT}"/extension/android -j "${CMAKE_JOBS}" --config Relea
 BUILD_AAR_DIR="$(mktemp -d)"
 mkdir -p "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}" "${BUILD_AAR_DIR}/libs"
-cp "${CMAKE_OUT}"/extension/android/libexecutorch_llama_jni.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}"
+cp "${CMAKE_OUT}"/extension/android/libexecutorch_jni.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/libexecutorch.so"
 cp extension/android/build/libs/executorch.jar "${BUILD_AAR_DIR}/libs"
 echo \
 \
diff --git a/examples/mediatek/CMakeLists.txt b/examples/mediatek/CMakeLists.txt
index 2abee59759f..1d411f07ca7 100644
--- a/examples/mediatek/CMakeLists.txt
+++ b/examples/mediatek/CMakeLists.txt
@@ -75,6 +75,44 @@ if(${ANDROID})
   )
   target_compile_options(mtk_executor_runner PUBLIC ${_common_compile_options})
 
+  set(_mtk_oss_executor_runner__srcs ${_executor_runner__srcs})
+  list(
+    TRANSFORM
+    _mtk_oss_executor_runner__srcs
+    PREPEND
+    "${EXECUTORCH_SOURCE_DIR}/"
+  )
+  list(
+    FILTER
+    _mtk_oss_executor_runner__srcs
+    EXCLUDE REGEX
+    ".*executor_runner.cpp$"
+  )
+  list(
+    PREPEND
+    _mtk_oss_executor_runner__srcs
+    ${CMAKE_CURRENT_LIST_DIR}/executor_runner/mtk_oss_executor_runner.cpp
+  )
+
+  add_executable(mtk_oss_executor_runner ${_mtk_oss_executor_runner__srcs})
+
+  target_include_directories(mtk_oss_executor_runner
+    PUBLIC
+    ${_common_include_directories}
+    ${EXECUTORCH_ROOT}/cmake-android-out/third-party/gflags/include
+  )
+
+  target_link_libraries(mtk_oss_executor_runner
+    ${_executor_runner_libs}
+    executorch
+    neuron_backend
+    gflags
+  )
+  target_compile_options(mtk_oss_executor_runner
+    PUBLIC
+    ${_common_compile_options}
+  )
+
   set(_mtk_llama_executor_runner__srcs ${_mtk_executor_runner__srcs})
   list(FILTER _mtk_llama_executor_runner__srcs EXCLUDE REGEX
     ".*executor_runner.cpp$"
diff --git a/examples/mediatek/README.md b/examples/mediatek/README.md
index faca42fb50c..9727f2587fd 100644
--- a/examples/mediatek/README.md
+++ b/examples/mediatek/README.md
@@ -9,6 +9,8 @@ examples/mediatek
 ├── preformatter_templates # Model specific prompt preformatter templates
 ├── prompts # Calibration Prompts
 ├── tokenizers_ # Model tokenizer scripts
+├── oss_utils # Utils for oss models
+├── eval_utils # Utils for evaluating oss models
 ├── model_export_scripts # Model specific export scripts
 ├── models # Model definitions
 ├── llm_models # LLM model definitions
@@ -44,6 +46,7 @@ pip3 install mtk_converter-8.8.0.dev20240723+public.d1467db9-cp310-cp310-manylin
 ```
 
 ## AoT Flow
+### llama
 ##### Note: Verify that localhost connection is available before running AoT Flow
 1. Exporting Models to `.pte`
 - In the `examples/mediatek` directory, run:
@@ -72,6 +75,14 @@ source shell_scripts/export_llama.sh
+```
+- Argument Options:
+  - `model_name`: deeplabv3/edsr/inceptionv3/inceptionv4/mobilenetv2/mobilenetv3/resnet18/resnet50
+
 # Runtime
 ## Supported Chips
@@ -100,6 +111,13 @@ adb push <model_name>.pte <phone_path>
 
 Make sure to replace `<model_name>` with the actual name of your model file. And, replace `<phone_path>` with the desired destination on the device.
+##### Note: For oss models, please push additional files to your Android device
+```bash
+adb push mtk_oss_executor_runner <phone_path>
+adb push input_list.txt <phone_path>
+for i in input*bin; do adb push "$i" <phone_path>; done;
+```
+
 ### Executing the Model
 Execute the model on your Android device by running:
@@ -111,3 +129,21 @@ adb shell "/data/local/tmp/mtk_executor_runner --model_path /data/local/tmp/
 Replace `<model_name>` with the name of your model file and `<num_iter>` with the desired number of iterations to run the model.
 ##### Note: For llama models, please use `mtk_llama_executor_runner`. Refer to `examples/mediatek/executor_runner/run_llama3_sample.sh` for reference.
+##### Note: For oss models, please use `mtk_oss_executor_runner`.
+```bash
+adb shell "/data/local/tmp/mtk_oss_executor_runner --model_path /data/local/tmp/<model_name>.pte --input_list /data/local/tmp/input_list.txt --output_folder /data/local/tmp/output_<model_name>"
+adb pull "/data/local/tmp/output_<model_name>" ./
+```
+
+### Check oss results on PC
+```bash
+python3 eval_utils/eval_oss_result.py --eval_type <eval_type> --target_f <golden_folder> --out_f <output_folder>
+```
+For example:
+```
+python3 eval_utils/eval_oss_result.py --eval_type piq --target_f edsr --out_f output_edsr
+```
+- Argument Options:
+  - `eval_type`: topk/piq/segmentation
+  - `target_f`: folder containing golden data files, named `golden_<data_index>_0.bin`
+  - `out_f`: folder containing model output data files, named `output_<data_index>_0.bin`
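Before the new MediaTek sources, a note on the data layout the README above relies on: the runner consumes `input_list.txt` (one line per inference, space-separated input `.bin` paths) and writes `output_<sampleId>_<outId>.bin` files, which `eval_utils/eval_oss_result.py` compares against matching `golden_<sampleId>_<outId>.bin` files. A small illustrative sketch of producing that layout; the `save_artifacts` helper and the toy tensors are invented for this example, while the export scripts further down write the same files inline:

```python
import os

import torch


def save_artifacts(artifact_dir, inputs, goldens):
    """Write input bins, golden bins, and input_list.txt in the expected layout."""
    os.makedirs(artifact_dir, exist_ok=True)
    lines = []
    for idx, sample in enumerate(inputs):  # sample: tuple of input tensors
        names = []
        for i, tensor in enumerate(sample):
            name = f"input_{idx}_{i}.bin"
            tensor.detach().numpy().tofile(os.path.join(artifact_dir, name))
            names.append(name)
        # One line per inference; multiple inputs are space-separated.
        lines.append(" ".join(names))
    with open(os.path.join(artifact_dir, "input_list.txt"), "w") as f:
        f.write("\n".join(lines) + "\n")
    for idx, golden in enumerate(goldens):
        golden.detach().numpy().tofile(
            os.path.join(artifact_dir, f"golden_{idx}_0.bin")
        )


# Toy single-sample example (shapes are illustrative only).
save_artifacts(
    "./demo",
    [(torch.randn(1, 224, 224, 3),)],
    [torch.randn(1, 224, 224, 21)],
)
```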
diff --git a/examples/mediatek/aot_utils/oss_utils/utils.py b/examples/mediatek/aot_utils/oss_utils/utils.py
new file mode 100755
index 00000000000..f447b2ac68f
--- /dev/null
+++ b/examples/mediatek/aot_utils/oss_utils/utils.py
@@ -0,0 +1,73 @@
+# Copyright (c) MediaTek Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+from typing import Optional
+
+import torch
+from executorch import exir
+from executorch.backends.mediatek import (
+    NeuropilotPartitioner,
+    NeuropilotQuantizer,
+    Precision,
+)
+from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
+
+
+def build_executorch_binary(
+    model,
+    inputs,
+    file_name,
+    dataset,
+    quant_dtype: Optional[Precision] = None,
+):
+    if quant_dtype is not None:
+        if quant_dtype not in Precision:
+            raise AssertionError(f"No support for Precision {quant_dtype}.")
+        quantizer = NeuropilotQuantizer()
+        quantizer.setup_precision(quant_dtype)
+
+        captured_model = torch._export.capture_pre_autograd_graph(model, inputs)
+        annotated_model = prepare_pt2e(captured_model, quantizer)
+        print("Quantizing the model...")
+        # calibration
+        for data in dataset:
+            annotated_model(*data)
+        quantized_model = convert_pt2e(annotated_model, fold_quantize=False)
+        aten_dialect = torch.export.export(quantized_model, inputs)
+    else:
+        aten_dialect = torch.export.export(model, inputs)
+
+    from executorch.exir.program._program import to_edge_transform_and_lower
+
+    edge_compile_config = exir.EdgeCompileConfig(_check_ir_validity=False)
+    # skipped op names are used for the deeplabV3 model
+    neuro_partitioner = NeuropilotPartitioner(
+        [],
+        op_names_to_skip={
+            "aten_convolution_default_106",
+            "aten_convolution_default_107",
+        },
+    )
+    edge_prog = to_edge_transform_and_lower(
+        aten_dialect,
+        compile_config=edge_compile_config,
+        partitioner=[neuro_partitioner],
+    )
+
+    exec_prog = edge_prog.to_executorch(
+        config=exir.ExecutorchBackendConfig(extract_constant_segment=False)
+    )
+    with open(f"{file_name}.pte", "wb") as file:
+        file.write(exec_prog.buffer)
+
+
+def make_output_dir(path: str):
+    if os.path.exists(path):
+        for f in os.listdir(path):
+            os.remove(os.path.join(path, f))
+        os.removedirs(path)
+    os.makedirs(path)
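For context, `build_executorch_binary` above chains PT2E quantization (capture, `prepare_pt2e`, calibration, `convert_pt2e`) with `to_edge_transform_and_lower` and `.pte` serialization. A hypothetical minimal invocation follows; the toy `Linear` model, shapes, and output name are assumptions for the sketch, while `Precision.A8W8` mirrors what the export scripts below use:

```python
import torch
from executorch.backends.mediatek import Precision
from executorch.examples.mediatek.aot_utils.oss_utils.utils import (
    build_executorch_binary,
)

# Hypothetical toy model standing in for a real vision model.
model = torch.nn.Linear(16, 4).eval()
example_inputs = (torch.randn(1, 16),)

# dataset: iterable of input tuples used to calibrate the quantizer.
calibration_data = [(torch.randn(1, 16),) for _ in range(4)]

# Writes ./linear_mtk.pte, quantized to 8-bit activations and weights.
build_executorch_binary(
    model,
    example_inputs,
    "linear_mtk",
    calibration_data,
    quant_dtype=Precision.A8W8,
)
```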
diff --git a/examples/mediatek/eval_utils/eval_oss_result.py b/examples/mediatek/eval_utils/eval_oss_result.py
new file mode 100755
index 00000000000..3e599330b66
--- /dev/null
+++ b/examples/mediatek/eval_utils/eval_oss_result.py
@@ -0,0 +1,200 @@
+# Copyright (c) MediaTek Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import json
+import os
+
+import numpy as np
+import piq
+import torch
+
+
+def check_data(target_f, predict_f):
+    target_files = os.listdir(target_f)
+    predict_files = os.listdir(predict_f)
+    if len(target_files) != len(predict_files):
+        raise RuntimeError(
+            "The number of files in the target folder and the prediction folder must be the same"
+        )
+
+    predict_set = set(predict_files)
+    for f in target_files:
+        # target file naming rule is golden_sampleId_outId.bin
+        # predict file naming rule is output_sampleId_outId.bin
+        pred_name = f.replace("golden", "output")
+        try:
+            predict_set.remove(pred_name)
+        except KeyError:
+            raise RuntimeError(f"Cannot find {pred_name} in {predict_f}")
+
+    if predict_set:
+        # sets are not iterators, so draw an arbitrary leftover via iter()
+        target_name = next(iter(predict_set)).replace("output", "golden")
+        raise RuntimeError(f"Cannot find {target_name} in {target_f}")
+
+
+def eval_topk(target_f, predict_f):
+    def solve(prob, target, k):
+        _, indices = torch.topk(prob, k=k, sorted=True)
+        golden = torch.reshape(target, [-1, 1])
+        correct = golden == indices
+        if torch.any(correct):
+            return 1
+        else:
+            return 0
+
+    target_files = os.listdir(target_f)
+
+    cnt10 = 0
+    cnt50 = 0
+    for target_name in target_files:
+        pred_name = target_name.replace("golden", "output")
+
+        pred_npy = np.fromfile(os.path.join(predict_f, pred_name), dtype=np.float32)
+        target_npy = np.fromfile(os.path.join(target_f, target_name), dtype=np.int64)[0]
+        # wrap the scalar label in a 0-d tensor; torch.from_numpy needs an ndarray
+        cnt10 += solve(torch.from_numpy(pred_npy), torch.tensor(target_npy), 10)
+        cnt50 += solve(torch.from_numpy(pred_npy), torch.tensor(target_npy), 50)
+
+    print("Top10 acc:", cnt10 * 100.0 / len(target_files))
+    print("Top50 acc:", cnt50 * 100.0 / len(target_files))
+
+
+def eval_piq(target_f, predict_f):
+    target_files = os.listdir(target_f)
+
+    psnr_list = []
+    ssim_list = []
+    for target_name in target_files:
+        pred_name = target_name.replace("golden", "output")
+        hr = np.fromfile(os.path.join(target_f, target_name), dtype=np.float32)
+        hr = hr.reshape((1, 448, 448, 3))
+        hr = np.moveaxis(hr, 3, 1)
+        hr = torch.from_numpy(hr)
+
+        sr = np.fromfile(os.path.join(predict_f, pred_name), dtype=np.float32)
+        sr = sr.reshape((1, 448, 448, 3))
+        sr = np.moveaxis(sr, 3, 1)
+        sr = torch.from_numpy(sr).clamp(0, 1)
+
+        psnr_list.append(piq.psnr(hr, sr))
+        ssim_list.append(piq.ssim(hr, sr))
+
+    avg_psnr = sum(psnr_list).item() / len(psnr_list)
+    avg_ssim = sum(ssim_list).item() / len(ssim_list)
+
+    print(f"Avg of PSNR is: {avg_psnr}")
+    print(f"Avg of SSIM is: {avg_ssim}")
+
+
+def eval_segmentation(target_f, predict_f):
+    classes = [
+        "Background",
+        "Aeroplane",
+        "Bicycle",
+        "Bird",
+        "Boat",
+        "Bottle",
+        "Bus",
+        "Car",
+        "Cat",
+        "Chair",
+        "Cow",
+        "DiningTable",
+        "Dog",
+        "Horse",
+        "MotorBike",
+        "Person",
+        "PottedPlant",
+        "Sheep",
+        "Sofa",
+        "Train",
+        "TvMonitor",
+    ]
+
+    target_files = os.listdir(target_f)
+
+    def make_confusion(goldens, predictions, num_classes):
+        def histogram(golden, predict):
+            mask = golden < num_classes
+            hist = np.bincount(
+                num_classes * golden[mask].astype(int) + predict[mask],
+                minlength=num_classes**2,
+            ).reshape(num_classes, num_classes)
+            return hist
+
+        confusion = np.zeros((num_classes, num_classes))
+        for g, p in zip(goldens, predictions):
+            confusion += histogram(g.flatten(), p.flatten())
+
+        return confusion
+
+    pred_list = []
+    target_list = []
+    for target_name in target_files:
+        pred_name = target_name.replace("golden", "output")
+        target_npy = np.fromfile(os.path.join(target_f, target_name), dtype=np.uint8)
+        target_npy = target_npy.reshape((224, 224))
+        target_list.append(target_npy)
+
+        pred_npy = np.fromfile(os.path.join(predict_f, pred_name), dtype=np.float32)
+        pred_npy = pred_npy.reshape((224, 224, len(classes)))
+        pred_npy = pred_npy.argmax(2).astype(np.uint8)
+        pred_list.append(pred_npy)
+
+    eps = 1e-6
+    confusion = make_confusion(target_list, pred_list, len(classes))
+
+    pa = np.diag(confusion).sum() / (confusion.sum() + eps)
+    mpa = np.mean(np.diag(confusion) / (confusion.sum(axis=1) + eps))
+    iou = np.diag(confusion) / (
+        confusion.sum(axis=1) + confusion.sum(axis=0) - np.diag(confusion) + eps
+    )
+    miou = np.mean(iou)
+    cls_iou = dict(zip(classes, iou))
+
+    print(f"PA : {pa}")
+    print(f"MPA : {mpa}")
+    print(f"MIoU : {miou}")
+    print(f"CIoU : \n{json.dumps(cls_iou, indent=2)}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--target_f",
+        help="folder of target data",
+        type=str,
+        required=True,
+    )
+
+    parser.add_argument(
+        "--out_f",
+        help="folder of model prediction data",
+        type=str,
+        required=True,
+    )
+
+    parser.add_argument(
+        "--eval_type",
+        help="Choose eval type from: topk, piq, segmentation",
+        type=str,
+        choices=["topk", "piq", "segmentation"],
+        required=True,
+    )
+
+    args = parser.parse_args()
+
+    check_data(args.target_f, args.out_f)
+
+    if args.eval_type == "topk":
+        eval_topk(args.target_f, args.out_f)
+    elif args.eval_type == "piq":
+        eval_piq(args.target_f, args.out_f)
+    elif args.eval_type == "segmentation":
+        eval_segmentation(args.target_f, args.out_f)
diff --git a/examples/mediatek/executor_runner/mtk_oss_executor_runner.cpp b/examples/mediatek/executor_runner/mtk_oss_executor_runner.cpp
new file mode 100755
index 00000000000..3a1ad1d863b
--- /dev/null
+++ b/examples/mediatek/executor_runner/mtk_oss_executor_runner.cpp
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2024 MediaTek Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @file
+ *
+ * This tool can run ExecuTorch model files that only use operators that
+ * are covered by the portable kernels, with possible delegate to the
+ * test_backend_compiler_lib.
+ *
+ * It reads input tensor data from the .bin files listed in an input list
+ * file, and writes each output tensor to a .bin file in the output folder.
+ */
+
+#include <chrono>
+#include <filesystem>
+#include <fstream>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <gflags/gflags.h>
+
+#include <executorch/extension/data_loader/file_data_loader.h>
+#include <executorch/extension/runner_util/inputs.h>
+#include <executorch/runtime/executor/method.h>
+#include <executorch/runtime/executor/method_meta.h>
+#include <executorch/runtime/executor/program.h>
+#include <executorch/runtime/platform/log.h>
+#include <executorch/runtime/platform/runtime.h>
+
+static uint8_t method_allocator_pool[8 * 1024U * 1024U]; // 8 MB
+
+// Model Path
+DEFINE_string(
+    model_path,
+    "model.pte",
+    "Model serialized in flatbuffer format. Default to 'model.pte'");
+DEFINE_string(
+    input_list,
+    "input_list.txt",
+    "Model input list. Default to 'input_list.txt'");
+DEFINE_string(
+    output_folder,
+    "outputs",
+    "Model output folder. Default to 'outputs'");
+
+using namespace torch::executor;
+using torch::executor::MemoryAllocator;
+using torch::executor::util::BufferCleanup;
+using torch::executor::util::FileDataLoader;
+using namespace std::filesystem;
+
+int main(int argc, char** argv) {
+  runtime_init();
+
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+  if (argc != 1) {
+    std::string msg = "Extra commandline args:";
+    for (int i = 1 /* skip argv[0] (program name) */; i < argc; i++) {
+      msg += std::string(" ") + argv[i];
+    }
+    ET_LOG(Error, "%s", msg.c_str());
+    return 1;
+  }
+
+  // Create output folder
+  create_directories(FLAGS_output_folder);
+
+  // Create a loader to get the data of the program file. There are other
+  // DataLoaders that use mmap() or point to data that's already in memory, and
+  // users can create their own DataLoaders to load from arbitrary sources.
+  const char* model_path = FLAGS_model_path.c_str();
+  Result<FileDataLoader> loader = FileDataLoader::from(model_path);
+  ET_CHECK_MSG(
+      loader.ok(),
+      "FileDataLoader::from() failed: 0x%" PRIx32,
+      (uint32_t)loader.error());
+
+  // Parse the program file. This is immutable, and can also be reused between
+  // multiple execution invocations across multiple threads.
+  Result<Program> program = Program::load(&loader.get());
+  if (!program.ok()) {
+    ET_LOG(Error, "Failed to parse model file %s", model_path);
+    return 1;
+  }
+  ET_LOG(Info, "Model file %s is loaded.", model_path);
+
+  // Use the first method in the program.
+  const char* method_name = nullptr;
+  {
+    const auto method_name_result = program->get_method_name(0);
+    ET_CHECK_MSG(method_name_result.ok(), "Program has no methods");
+    method_name = *method_name_result;
+  }
+  ET_LOG(Info, "Using method %s", method_name);
+
+  // MethodMeta describes the memory requirements of the method.
+  Result<MethodMeta> method_meta_result = program->method_meta(method_name);
+  ET_CHECK_MSG(
+      method_meta_result.ok(),
+      "Failed to get method_meta for %s: 0x%" PRIx32,
+      method_name,
+      (uint32_t)method_meta_result.error());
+
+  //
+  // The runtime does not use malloc/new; it allocates all memory using the
+  // MemoryManager provided by the client. Clients are responsible for allocating
+  // the memory ahead of time, or providing MemoryAllocator subclasses that can
+  // do it dynamically.
+  //
+
+  // The method allocator is used to allocate all dynamic C++ metadata/objects
+  // used to represent the loaded method. This allocator is only used during
+  // loading a method of the program, which will return an error if there was
+  // not enough memory.
+  //
+  // The amount of memory required depends on the loaded method and the runtime
+  // code itself. The amount of memory here is usually determined by running the
+  // method and seeing how much memory is actually used, though it's possible to
+  // subclass MemoryAllocator so that it calls malloc() under the hood (see
+  // MallocMemoryAllocator).
+  //
+  // In this example we use a statically allocated memory pool.
+  MemoryAllocator method_allocator{
+      MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool)};
+
+  // The memory-planned buffers will back the mutable tensors used by the
+  // method. The sizes of these buffers were determined ahead of time during the
+  // memory-planning passes.
+  //
+  // Each buffer typically corresponds to a different hardware memory bank. Most
+  // mobile environments will only have a single buffer. Some embedded
+  // environments may have more than one for, e.g., slow/large DRAM and
+  // fast/small SRAM, or for memory associated with particular cores.
+  std::vector<std::unique_ptr<uint8_t[]>> planned_buffers; // Owns the memory
+  std::vector<Span<uint8_t>> planned_spans; // Passed to the allocator
+  size_t num_memory_planned_buffers =
+      method_meta_result->num_memory_planned_buffers();
+  for (size_t id = 0; id < num_memory_planned_buffers; ++id) {
+    // .get() will always succeed because id < num_memory_planned_buffers.
+    size_t buffer_size = static_cast<size_t>(
+        method_meta_result->memory_planned_buffer_size(id).get());
+    ET_LOG(Info, "Setting up planned buffer %zu, size %zu.", id, buffer_size);
+    planned_buffers.push_back(std::make_unique<uint8_t[]>(buffer_size));
+    planned_spans.push_back({planned_buffers.back().get(), buffer_size});
+  }
+  HierarchicalAllocator planned_memory(
+      {planned_spans.data(), planned_spans.size()});
+
+  // Assemble all of the allocators into the MemoryManager that the Executor
+  // will use.
+  MemoryManager memory_manager(&method_allocator, &planned_memory);
+
+  //
+  // Load the method from the program, using the provided allocators. Running
+  // the method can mutate the memory-planned buffers, so the method should only
+  // be used by a single thread at a time, but it can be reused.
+  //
+  Result<Method> method = program->load_method(method_name, &memory_manager);
+  ET_CHECK_MSG(
+      method.ok(),
+      "Loading of method %s failed with status 0x%" PRIx32,
+      method_name,
+      (uint32_t)method.error());
+  ET_LOG(Info, "Method loaded.");
+
+  std::ifstream input_list(FLAGS_input_list);
+  ET_CHECK_MSG(
+      input_list.is_open(),
+      "Error: cannot open input file %s",
+      FLAGS_input_list.c_str());
+
+  auto split = [](std::string s, std::string delimiter) {
+    size_t pos_start = 0, pos_end, delim_len = delimiter.length();
+    std::string token;
+    std::vector<std::string> res;
+
+    while ((pos_end = s.find(delimiter, pos_start)) != std::string::npos) {
+      token = s.substr(pos_start, pos_end - pos_start);
+      pos_start = pos_end + delim_len;
+      res.push_back(token);
+    }
+    res.push_back(s.substr(pos_start));
+    return res;
+  };
+
+  MethodMeta method_meta = method->method_meta();
+  size_t num_inputs = method_meta.num_inputs();
+  std::string file_path;
+  int inference_index = 0;
+  while (std::getline(input_list, file_path)) {
+    auto input_files = split(file_path, " ");
+    if (input_files.size() == 0) {
+      break;
+    }
+    ET_CHECK_MSG(
+        input_files.size() == num_inputs,
+        "Model expects %zu inputs but got %zu from input files",
+        num_inputs,
+        input_files.size());
+
+    // Prepare the inputs.
+    size_t num_allocated = 0;
+    ET_LOG(Info, "Number of inputs: %zu", num_inputs);
+    void** inputs = (void**)malloc(num_inputs * sizeof(void*));
+
+    for (size_t i = 0; i < num_inputs; i++) {
+      auto tag = method_meta.input_tag(i);
+      if (tag.get() != Tag::Tensor) {
+        ET_LOG(Debug, "Skipping malloc non-tensor input %zu", i);
+        continue;
+      }
+      Result<TensorInfo> tensor_meta = method_meta.input_tensor_meta(i);
+      const auto nbytes = tensor_meta->nbytes();
+      // This input is a tensor. Allocate a buffer for it.
+      void* data_ptr = malloc(nbytes);
+
+      // Read data from file
+      std::ifstream fin(input_files[i], std::ios::binary);
+      fin.seekg(0, fin.end);
+      size_t file_size = fin.tellg();
+
+      ET_CHECK_MSG(
+          file_size == nbytes,
+          "Input %zu size mismatch. file bytes: %zu, tensor bytes: %zu",
+          i,
+          file_size,
+          nbytes);
+
+      fin.seekg(0, fin.beg);
+      fin.read(static_cast<char*>(data_ptr), file_size);
+      fin.close();
+      inputs[num_allocated++] = data_ptr;
+
+      // Set backend input
+      auto scalar_type = tensor_meta->scalar_type();
+      auto sizes_raw = tensor_meta->sizes();
+      auto dim = sizes_raw.size();
+      auto dim_order_raw = tensor_meta->dim_order();
+      std::vector<TensorImpl::SizesType> sizes(sizes_raw.begin(), sizes_raw.end());
+      std::vector<TensorImpl::DimOrderType> dim_order(
+          dim_order_raw.begin(), dim_order_raw.end());
+
+      TensorImpl impl = TensorImpl(
+          scalar_type, dim, sizes.data(), data_ptr, dim_order.data());
+
+      Tensor tensor(&impl);
+      Error ret = method->set_input(tensor, i);
+      if (ret != Error::Ok) {
+        ET_LOG(Error, "Failed to set input %zu: 0x%" PRIx32, i, (uint32_t)ret);
+        // The BufferCleanup will free the inputs when it goes out of scope.
+        BufferCleanup cleanup({inputs, num_allocated});
+        return 1;
+      }
+    }
+    BufferCleanup({inputs, num_allocated});
+    ET_LOG(Info, "Inputs prepared.");
+
+    // Run the model.
+    auto before_exec = std::chrono::high_resolution_clock::now();
+    Error status = Error::Ok;
+    status = method->execute();
+    auto after_exec = std::chrono::high_resolution_clock::now();
+    double elapsed_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(
+            after_exec - before_exec)
+            .count() /
+        1000.0;
+
+    ET_LOG(Info, "Inference took %f ms", elapsed_time);
+    ET_CHECK_MSG(
+        status == Error::Ok,
+        "Execution of method %s failed with status 0x%" PRIx32,
+        method_name,
+        (uint32_t)status);
+    ET_LOG(Info, "Model executed successfully.");
+
+    // Get output data
+    size_t output_size = method->outputs_size();
+    ET_LOG(Info, "Number of outputs: %zu", output_size);
+    std::vector<EValue> outputs(output_size);
+    status = method->get_outputs(outputs.data(), output_size);
+    ET_CHECK(status == Error::Ok);
+    for (size_t i = 0; i < output_size; i++) {
+      auto output_tensor = outputs[i].toTensor();
+      auto output_file_name = FLAGS_output_folder + "/output_" +
+          std::to_string(inference_index) + "_" + std::to_string(i) + ".bin";
+      std::ofstream fout(output_file_name.c_str(), std::ios::binary);
+      fout.write(output_tensor.const_data_ptr<char>(), output_tensor.nbytes());
+      fout.close();
+    }
+
+    inference_index++;
+  }
+
+  return 0;
+}
diff --git a/examples/mediatek/model_export_scripts/deeplab_v3.py b/examples/mediatek/model_export_scripts/deeplab_v3.py
new file mode 100755
index 00000000000..da6766c0f54
--- /dev/null
+++ b/examples/mediatek/model_export_scripts/deeplab_v3.py
@@ -0,0 +1,124 @@
+# Copyright (c) MediaTek Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+ +import argparse +import os +import random + +import numpy as np + +import torch +from executorch.backends.mediatek import Precision +from executorch.examples.mediatek.aot_utils.oss_utils.utils import ( + build_executorch_binary, +) +from executorch.examples.models.deeplab_v3 import DeepLabV3ResNet101Model + + +class NhwcWrappedModel(torch.nn.Module): + def __init__(self): + super(NhwcWrappedModel, self).__init__() + self.deeplabv3 = DeepLabV3ResNet101Model().get_eager_model() + + def forward(self, input1): + nchw_input1 = input1.permute(0, 3, 1, 2) + nchw_output = self.deeplabv3(nchw_input1) + return nchw_output.permute(0, 2, 3, 1) + + +def get_dataset(data_size, dataset_dir, download): + from torchvision import datasets, transforms + + input_size = (224, 224) + preprocess = transforms.Compose( + [ + transforms.Resize(input_size), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] + ) + dataset = list( + datasets.VOCSegmentation( + root=os.path.join(dataset_dir, "voc_image"), + year="2009", + image_set="val", + transform=preprocess, + download=download, + ) + ) + + # prepare input data + random.shuffle(dataset) + inputs, targets, input_list = [], [], "" + for index, data in enumerate(dataset): + if index >= data_size: + break + image, target = data + inputs.append((image.unsqueeze(0).permute(0, 2, 3, 1),)) + targets.append(np.array(target.resize(input_size))) + input_list += f"input_{index}_0.bin\n" + + return inputs, targets, input_list + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. Default ./deeplab_v3", + default="./deeplab_v3", + type=str, + ) + + parser.add_argument( + "-d", + "--download", + help="If specified, download VOCSegmentation dataset by torchvision API", + action="store_true", + default=False, + ) + + args = parser.parse_args() + + # ensure the working directory exist. + os.makedirs(args.artifact, exist_ok=True) + + data_num = 100 + inputs, targets, input_list = get_dataset( + data_size=data_num, dataset_dir=args.artifact, download=args.download + ) + + # save data to inference on device + input_list_file = f"{args.artifact}/input_list.txt" + with open(input_list_file, "w") as f: + f.write(input_list) + f.flush() + for idx, data in enumerate(inputs): + for i, d in enumerate(data): + file_name = f"{args.artifact}/input_{idx}_{i}.bin" + d.detach().numpy().tofile(file_name) + if idx == 0: + print("inp shape: ", d.detach().numpy().shape) + print("inp type: ", d.detach().numpy().dtype) + for idx, data in enumerate(targets): + file_name = f"{args.artifact}/golden_{idx}_0.bin" + data.tofile(file_name) + if idx == 0: + print("golden shape: ", data.shape) + print("golden type: ", data.dtype) + + # build pte + pte_filename = "deeplabV3Resnet101_mtk" + instance = NhwcWrappedModel() + build_executorch_binary( + instance.eval(), + (torch.randn(1, 224, 224, 3),), + f"{args.artifact}/{pte_filename}", + inputs, + quant_dtype=Precision.A8W8, + ) diff --git a/examples/mediatek/model_export_scripts/edsr.py b/examples/mediatek/model_export_scripts/edsr.py new file mode 100755 index 00000000000..4192d67e569 --- /dev/null +++ b/examples/mediatek/model_export_scripts/edsr.py @@ -0,0 +1,170 @@ +# Copyright (c) MediaTek Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import argparse +import os + +import numpy as np + +import torch +from executorch.backends.mediatek import Precision +from executorch.examples.mediatek.aot_utils.oss_utils.utils import ( + build_executorch_binary, +) +from executorch.examples.models.edsr import EdsrModel + +from PIL import Image +from torch.utils.data import Dataset +from torchsr.datasets import B100 +from torchvision.transforms.functional import to_tensor + + +class NhwcWrappedModel(torch.nn.Module): + def __init__(self): + super(NhwcWrappedModel, self).__init__() + self.edsr = EdsrModel().get_eager_model() + + def forward(self, input1): + nchw_input1 = input1.permute(0, 3, 1, 2) + nchw_output = self.edsr(nchw_input1) + return nchw_output.permute(0, 2, 3, 1) + + +class SrDataset(Dataset): + def __init__(self, hr_dir: str, lr_dir: str): + self.input_size = np.asanyarray([224, 224]) + self.hr = [] + self.lr = [] + + for file in sorted(os.listdir(hr_dir)): + self.hr.append(self._resize_img(os.path.join(hr_dir, file), 2)) + + for file in sorted(os.listdir(lr_dir)): + self.lr.append(self._resize_img(os.path.join(lr_dir, file), 1)) + + if len(self.hr) != len(self.lr): + raise AssertionError( + "The number of high resolution pics is not equal to low " + "resolution pics" + ) + + def __getitem__(self, idx: int): + return self.hr[idx], self.lr[idx] + + def __len__(self): + return len(self.lr) + + def _resize_img(self, file: str, scale: int): + with Image.open(file) as img: + return ( + to_tensor(img.resize(tuple(self.input_size * scale))) + .unsqueeze(0) + .permute(0, 2, 3, 1) + ) + + def get_input_list(self): + input_list = "" + for i in range(len(self.lr)): + input_list += f"input_{i}_0.bin\n" + return input_list + + +def get_b100( + dataset_dir: str, +): + hr_dir = f"{dataset_dir}/sr_bm_dataset/SRBenchmarks/benchmark/B100/HR" + lr_dir = f"{dataset_dir}/sr_bm_dataset/SRBenchmarks/benchmark/B100/LR_bicubic/X2" + + if not os.path.exists(hr_dir) or not os.path.exists(lr_dir): + B100(root=f"{dataset_dir}/sr_bm_dataset", scale=2, download=True) + + return SrDataset(hr_dir, lr_dir) + + +def get_dataset(hr_dir: str, lr_dir: str, default_dataset: str, dataset_dir: str): + if not (lr_dir and hr_dir) and not default_dataset: + raise RuntimeError( + "Nither custom dataset is provided nor using default dataset." + ) + + if (lr_dir and hr_dir) and default_dataset: + raise RuntimeError("Either use custom dataset, or use default dataset.") + + if default_dataset: + return get_b100(dataset_dir) + + return SrDataset(hr_dir, lr_dir) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. Default ./edsr", + default="./edsr", + type=str, + ) + + parser.add_argument( + "-r", + "--hr_ref_dir", + help="Path to the high resolution images", + default="", + type=str, + ) + + parser.add_argument( + "-l", + "--lr_dir", + help="Path to the low resolution image inputs", + default="", + type=str, + ) + + parser.add_argument( + "-d", + "--default_dataset", + help="If specified, download and use B100 dataset by torchSR API", + action="store_true", + default=False, + ) + + args = parser.parse_args() + + # ensure the working directory exist. 
+ os.makedirs(args.artifact, exist_ok=True) + + dataset = get_dataset( + args.hr_ref_dir, args.lr_dir, args.default_dataset, args.artifact + ) + + inputs, targets, input_list = dataset.lr, dataset.hr, dataset.get_input_list() + + # save data to inference on device + input_list_file = f"{args.artifact}/input_list.txt" + with open(input_list_file, "w") as f: + f.write(input_list) + f.flush() + for idx, data in enumerate(inputs): + for i, d in enumerate(data): + file_name = f"{args.artifact}/input_{idx}_{i}.bin" + d.detach().numpy().tofile(file_name) + for idx, data in enumerate(targets): + file_name = f"{args.artifact}/golden_{idx}_0.bin" + data.detach().numpy().tofile(file_name) + + # build pte + pte_filename = "edsr_mtk" + instance = NhwcWrappedModel() + build_executorch_binary( + instance.eval(), + (inputs[0],), + f"{args.artifact}/{pte_filename}", + [(input,) for input in inputs], + quant_dtype=Precision.A8W8, + ) diff --git a/examples/mediatek/model_export_scripts/inception_v3.py b/examples/mediatek/model_export_scripts/inception_v3.py new file mode 100755 index 00000000000..c28bd85b402 --- /dev/null +++ b/examples/mediatek/model_export_scripts/inception_v3.py @@ -0,0 +1,120 @@ +# Copyright (c) MediaTek Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os + +import torch +from executorch.backends.mediatek import Precision +from executorch.examples.mediatek.aot_utils.oss_utils.utils import ( + build_executorch_binary, +) +from executorch.examples.models.inception_v3 import InceptionV3Model + + +class NhwcWrappedModel(torch.nn.Module): + def __init__(self): + super(NhwcWrappedModel, self).__init__() + self.inception = InceptionV3Model().get_eager_model() + + def forward(self, input1): + nchw_input1 = input1.permute(0, 3, 1, 2) + output = self.inception(nchw_input1) + return output + + +def get_dataset(dataset_path, data_size): + from torchvision import datasets, transforms + + def get_data_loader(): + preprocess = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) + return torch.utils.data.DataLoader( + imagenet_data, + shuffle=True, + ) + + # prepare input data + inputs, targets, input_list = [], [], "" + data_loader = get_data_loader() + for index, data in enumerate(data_loader): + if index >= data_size: + break + feature, target = data + feature = feature.permute(0, 2, 3, 1) # NHWC + inputs.append((feature,)) + targets.append(target) + input_list += f"input_{index}_0.bin\n" + + return inputs, targets, input_list + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. " + "Default ./inceptionV3", + default="./inceptionV3", + type=str, + ) + + args = parser.parse_args() + + # ensure the working directory exist. 
+ os.makedirs(args.artifact, exist_ok=True) + + data_num = 100 + inputs, targets, input_list = get_dataset( + dataset_path=f"{args.dataset}", + data_size=data_num, + ) + + # save data to inference on device + input_list_file = f"{args.artifact}/input_list.txt" + with open(input_list_file, "w") as f: + f.write(input_list) + f.flush() + for idx, data in enumerate(inputs): + for i, d in enumerate(data): + file_name = f"{args.artifact}/input_{idx}_{i}.bin" + d.detach().numpy().tofile(file_name) + for idx, data in enumerate(targets): + file_name = f"{args.artifact}/golden_{idx}_0.bin" + data.detach().numpy().tofile(file_name) + + pte_filename = "inceptionV3_mtk" + instance = NhwcWrappedModel() + build_executorch_binary( + instance.eval(), + (torch.randn(1, 224, 224, 3),), + f"{args.artifact}/{pte_filename}", + inputs, + quant_dtype=Precision.A8W8, + ) diff --git a/examples/mediatek/model_export_scripts/inception_v4.py b/examples/mediatek/model_export_scripts/inception_v4.py new file mode 100755 index 00000000000..ccb2ce16f22 --- /dev/null +++ b/examples/mediatek/model_export_scripts/inception_v4.py @@ -0,0 +1,120 @@ +# Copyright (c) MediaTek Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os + +import torch +from executorch.backends.mediatek import Precision +from executorch.examples.mediatek.aot_utils.oss_utils.utils import ( + build_executorch_binary, +) +from executorch.examples.models.inception_v4 import InceptionV4Model + + +class NhwcWrappedModel(torch.nn.Module): + def __init__(self): + super(NhwcWrappedModel, self).__init__() + self.inception = InceptionV4Model().get_eager_model() + + def forward(self, input1): + nchw_input1 = input1.permute(0, 3, 1, 2) + output = self.inception(nchw_input1) + return output + + +def get_dataset(dataset_path, data_size): + from torchvision import datasets, transforms + + def get_data_loader(): + preprocess = transforms.Compose( + [ + transforms.Resize((299, 299)), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) + return torch.utils.data.DataLoader( + imagenet_data, + shuffle=True, + ) + + # prepare input data + inputs, targets, input_list = [], [], "" + data_loader = get_data_loader() + for index, data in enumerate(data_loader): + if index >= data_size: + break + feature, target = data + feature = feature.permute(0, 2, 3, 1) # NHWC + inputs.append((feature,)) + targets.append(target) + input_list += f"input_{index}_0.bin\n" + + return inputs, targets, input_list + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. " + "Default ./inceptionV4", + default="./inceptionV4", + type=str, + ) + + args = parser.parse_args() + + # ensure the working directory exist. 
+ os.makedirs(args.artifact, exist_ok=True) + + data_num = 100 + inputs, targets, input_list = get_dataset( + dataset_path=f"{args.dataset}", + data_size=data_num, + ) + + # save data to inference on device + input_list_file = f"{args.artifact}/input_list.txt" + with open(input_list_file, "w") as f: + f.write(input_list) + f.flush() + for idx, data in enumerate(inputs): + for i, d in enumerate(data): + file_name = f"{args.artifact}/input_{idx}_{i}.bin" + d.detach().numpy().tofile(file_name) + for idx, data in enumerate(targets): + file_name = f"{args.artifact}/golden_{idx}_0.bin" + data.detach().numpy().tofile(file_name) + + # build pte + pte_filename = "inceptionV4_mtk" + instance = NhwcWrappedModel() + build_executorch_binary( + instance.eval(), + (torch.randn(1, 299, 299, 3),), + f"{args.artifact}/{pte_filename}", + inputs, + quant_dtype=Precision.A8W8, + ) diff --git a/examples/mediatek/model_export_scripts/mobilenet_v2.py b/examples/mediatek/model_export_scripts/mobilenet_v2.py new file mode 100755 index 00000000000..97f2ed884eb --- /dev/null +++ b/examples/mediatek/model_export_scripts/mobilenet_v2.py @@ -0,0 +1,121 @@ +# Copyright (c) MediaTek Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os + +import torch +from executorch.backends.mediatek import Precision +from executorch.examples.mediatek.aot_utils.oss_utils.utils import ( + build_executorch_binary, +) +from executorch.examples.models.mobilenet_v2 import MV2Model + + +class NhwcWrappedModel(torch.nn.Module): + def __init__(self): + super(NhwcWrappedModel, self).__init__() + self.mobilenet = MV2Model().get_eager_model() + + def forward(self, input1): + nchw_input1 = input1.permute(0, 3, 1, 2) + output = self.mobilenet(nchw_input1) + return output + + +def get_dataset(dataset_path, data_size): + from torchvision import datasets, transforms + + def get_data_loader(): + preprocess = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) + return torch.utils.data.DataLoader( + imagenet_data, + shuffle=True, + ) + + # prepare input data + inputs, targets, input_list = [], [], "" + data_loader = get_data_loader() + for index, data in enumerate(data_loader): + if index >= data_size: + break + feature, target = data + feature = feature.permute(0, 2, 3, 1) # NHWC + inputs.append((feature,)) + targets.append(target) + input_list += f"input_{index}_0.bin\n" + + return inputs, targets, input_list + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. " + "Default ./mobilenetV2", + default="./mobilenetV2", + type=str, + ) + + args = parser.parse_args() + + # ensure the working directory exist. 
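The calibration dumps these scripts write are raw `tofile()` bytes with no header, so dtype and shape travel out-of-band; `input_list.txt` records only file names. A hedged round-trip check, with the shape taken from the 224x224 examples above:

```python
import numpy as np
import torch

t = torch.randn(1, 224, 224, 3)
t.detach().numpy().tofile("input_0_0.bin")

# Reading back requires supplying dtype and shape explicitly.
restored = np.fromfile("input_0_0.bin", dtype=np.float32).reshape(1, 224, 224, 3)
assert np.allclose(restored, t.numpy())
```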
+ os.makedirs(args.artifact, exist_ok=True) + + data_num = 100 + inputs, targets, input_list = get_dataset( + dataset_path=f"{args.dataset}", + data_size=data_num, + ) + + # save data to inference on device + input_list_file = f"{args.artifact}/input_list.txt" + with open(input_list_file, "w") as f: + f.write(input_list) + f.flush() + for idx, data in enumerate(inputs): + for i, d in enumerate(data): + file_name = f"{args.artifact}/input_{idx}_{i}.bin" + d.detach().numpy().tofile(file_name) + for idx, data in enumerate(targets): + file_name = f"{args.artifact}/golden_{idx}_0.bin" + data.detach().numpy().tofile(file_name) + + # build pte + pte_filename = "mobilenetV2_mtk" + instance = NhwcWrappedModel() + build_executorch_binary( + instance.eval(), + (torch.randn(1, 224, 224, 3),), + f"{args.artifact}/{pte_filename}", + inputs, + quant_dtype=Precision.A8W8, + ) diff --git a/examples/mediatek/model_export_scripts/mobilenet_v3.py b/examples/mediatek/model_export_scripts/mobilenet_v3.py new file mode 100755 index 00000000000..fed2497ca26 --- /dev/null +++ b/examples/mediatek/model_export_scripts/mobilenet_v3.py @@ -0,0 +1,121 @@ +# Copyright (c) MediaTek Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os + +import torch +from executorch.backends.mediatek import Precision +from executorch.examples.mediatek.aot_utils.oss_utils.utils import ( + build_executorch_binary, +) +from executorch.examples.models.mobilenet_v3 import MV3Model + + +class NhwcWrappedModel(torch.nn.Module): + def __init__(self): + super(NhwcWrappedModel, self).__init__() + self.mobilenet = MV3Model().get_eager_model() + + def forward(self, input1): + nchw_input1 = input1.permute(0, 3, 1, 2) + output = self.mobilenet(nchw_input1) + return output + + +def get_dataset(dataset_path, data_size): + from torchvision import datasets, transforms + + def get_data_loader(): + preprocess = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) + return torch.utils.data.DataLoader( + imagenet_data, + shuffle=True, + ) + + # prepare input data + inputs, targets, input_list = [], [], "" + data_loader = get_data_loader() + for index, data in enumerate(data_loader): + if index >= data_size: + break + feature, target = data + feature = feature.permute(0, 2, 3, 1) # NHWC + inputs.append((feature,)) + targets.append(target) + input_list += f"input_{index}_0.bin\n" + + return inputs, targets, input_list + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. " + "Default ./mobilenetV3", + default="./mobilenetV3", + type=str, + ) + + args = parser.parse_args() + + # ensure the working directory exist. 
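The `golden_{idx}_0.bin` files hold the int64 ImageFolder labels, which makes a host-side top-1 check straightforward once the device writes its own dumps. A sketch under assumptions: the output path and 1000-class logits layout are hypothetical and not produced by anything in this diff:

```python
import numpy as np


def top1_matches(golden_path: str, output_path: str, num_classes: int = 1000) -> bool:
    # golden_{idx}_0.bin stores the int64 label written by the scripts above;
    # output_path is a hypothetical device-side float32 logits dump.
    golden = np.fromfile(golden_path, dtype=np.int64)
    logits = np.fromfile(output_path, dtype=np.float32).reshape(-1, num_classes)
    return int(logits.argmax(axis=-1)[0]) == int(golden[0])


# e.g. top1_matches("mobilenetV2/golden_0_0.bin", "outputs/output_0_0.bin")
```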
+ os.makedirs(args.artifact, exist_ok=True) + + data_num = 100 + inputs, targets, input_list = get_dataset( + dataset_path=f"{args.dataset}", + data_size=data_num, + ) + + # save data to inference on device + input_list_file = f"{args.artifact}/input_list.txt" + with open(input_list_file, "w") as f: + f.write(input_list) + f.flush() + for idx, data in enumerate(inputs): + for i, d in enumerate(data): + file_name = f"{args.artifact}/input_{idx}_{i}.bin" + d.detach().numpy().tofile(file_name) + for idx, data in enumerate(targets): + file_name = f"{args.artifact}/golden_{idx}_0.bin" + data.detach().numpy().tofile(file_name) + + # build pte + pte_filename = "mobilenetV3_mtk" + instance = NhwcWrappedModel() + build_executorch_binary( + instance.eval(), + (torch.randn(1, 224, 224, 3),), + f"{args.artifact}/{pte_filename}", + inputs, + quant_dtype=Precision.A8W8, + ) diff --git a/examples/mediatek/model_export_scripts/resnet18.py b/examples/mediatek/model_export_scripts/resnet18.py new file mode 100755 index 00000000000..2f3af57e7f3 --- /dev/null +++ b/examples/mediatek/model_export_scripts/resnet18.py @@ -0,0 +1,122 @@ +# Copyright (c) MediaTek Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os + +import torch +from executorch.backends.mediatek import Precision +from executorch.examples.mediatek.aot_utils.oss_utils.utils import ( + build_executorch_binary, +) +from executorch.examples.models.resnet import ResNet18Model + + +class NhwcWrappedModel(torch.nn.Module): + def __init__(self): + super(NhwcWrappedModel, self).__init__() + self.resnet = ResNet18Model().get_eager_model() + + def forward(self, input1): + nchw_input1 = input1.permute(0, 3, 1, 2) + output = self.resnet(nchw_input1) + return output + + +def get_dataset(dataset_path, data_size): + from torchvision import datasets, transforms + + def get_data_loader(): + preprocess = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) + return torch.utils.data.DataLoader( + imagenet_data, + shuffle=True, + ) + + # prepare input data + inputs, targets, input_list = [], [], "" + data_loader = get_data_loader() + for index, data in enumerate(data_loader): + if index >= data_size: + break + feature, target = data + feature = feature.permute(0, 2, 3, 1) # NHWC + inputs.append((feature,)) + targets.append(target) + input_list += f"input_{index}_0.bin\n" + + return inputs, targets, input_list + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. " + "Default ./resnet18", + default="./resnet18", + type=str, + ) + + args = parser.parse_args() + + # ensure the working directory exist. 
+    os.makedirs(args.artifact, exist_ok=True)
+
+    data_num = 100
+    inputs, targets, input_list = get_dataset(
+        dataset_path=f"{args.dataset}",
+        data_size=data_num,
+    )
+
+    # save data to inference on device
+    input_list_file = f"{args.artifact}/input_list.txt"
+    with open(input_list_file, "w") as f:
+        f.write(input_list)
+        f.flush()
+    for idx, data in enumerate(inputs):
+        for i, d in enumerate(data):
+            file_name = f"{args.artifact}/input_{idx}_{i}.bin"
+            d.detach().numpy().tofile(file_name)
+    for idx, data in enumerate(targets):
+        file_name = f"{args.artifact}/golden_{idx}_0.bin"
+        data.detach().numpy().tofile(file_name)
+
+    # build pte
+    pte_filename = "resnet18_mtk"
+    instance = NhwcWrappedModel()
+    build_executorch_binary(
+        instance.eval(),
+        (torch.randn(1, 224, 224, 3),),
+        f"{args.artifact}/{pte_filename}",
+        inputs,
+        quant_dtype=Precision.A8W8,
+    )
diff --git a/examples/mediatek/model_export_scripts/resnet50.py b/examples/mediatek/model_export_scripts/resnet50.py
new file mode 100755
index 00000000000..ce23842447b
--- /dev/null
+++ b/examples/mediatek/model_export_scripts/resnet50.py
@@ -0,0 +1,121 @@
+# Copyright (c) MediaTek Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import os
+
+import torch
+from executorch.backends.mediatek import Precision
+from executorch.examples.mediatek.aot_utils.oss_utils.utils import (
+    build_executorch_binary,
+)
+from executorch.examples.models.resnet import ResNet50Model
+
+
+class NhwcWrappedModel(torch.nn.Module):
+    def __init__(self):
+        super(NhwcWrappedModel, self).__init__()
+        self.resnet = ResNet50Model().get_eager_model()
+
+    def forward(self, input1):
+        nchw_input1 = input1.permute(0, 3, 1, 2)
+        output = self.resnet(nchw_input1)
+        return output
+
+
+def get_dataset(dataset_path, data_size):
+    from torchvision import datasets, transforms
+
+    def get_data_loader():
+        preprocess = transforms.Compose(
+            [
+                transforms.Resize(256),
+                transforms.CenterCrop(224),
+                transforms.ToTensor(),
+                transforms.Normalize(
+                    mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
+                ),
+            ]
+        )
+        imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess)
+        return torch.utils.data.DataLoader(
+            imagenet_data,
+            shuffle=True,
+        )
+
+    # prepare input data
+    inputs, targets, input_list = [], [], ""
+    data_loader = get_data_loader()
+    for index, data in enumerate(data_loader):
+        if index >= data_size:
+            break
+        feature, target = data
+        feature = feature.permute(0, 2, 3, 1)  # NHWC
+        inputs.append((feature,))
+        targets.append(target)
+        input_list += f"input_{index}_0.bin\n"
+
+    return inputs, targets, input_list
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "-d",
+        "--dataset",
+        help=(
+            "path to the validation folder of ImageNet dataset. "
+            "e.g. --dataset imagenet-mini/val "
+            "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)"
+        ),
+        type=str,
+        required=True,
+    )
+
+    parser.add_argument(
+        "-a",
+        "--artifact",
+        help="path for storing generated artifacts by this example. "
+        "Default ./resnet50",
+        default="./resnet50",
+        type=str,
+    )
+
+    args = parser.parse_args()
+
+    # ensure the working directory exists.
+ os.makedirs(args.artifact, exist_ok=True) + + data_num = 100 + inputs, targets, input_list = get_dataset( + dataset_path=f"{args.dataset}", + data_size=data_num, + ) + + # save data to inference on device + input_list_file = f"{args.artifact}/input_list.txt" + with open(input_list_file, "w") as f: + f.write(input_list) + f.flush() + for idx, data in enumerate(inputs): + for i, d in enumerate(data): + file_name = f"{args.artifact}/input_{idx}_{i}.bin" + d.detach().numpy().tofile(file_name) + for idx, data in enumerate(targets): + file_name = f"{args.artifact}/golden_{idx}_0.bin" + data.detach().numpy().tofile(file_name) + + # compile to pte + pte_filename = "resnet50_mtk" + instance = NhwcWrappedModel() + build_executorch_binary( + instance.eval(), + (torch.randn(1, 224, 224, 3),), + f"{args.artifact}/{pte_filename}", + inputs, + quant_dtype=Precision.A8W8, + ) diff --git a/examples/mediatek/requirements.txt b/examples/mediatek/requirements.txt index 038700059ba..7c3de886e27 100644 --- a/examples/mediatek/requirements.txt +++ b/examples/mediatek/requirements.txt @@ -4,3 +4,5 @@ safetensors sentencepiece tokenizers transformers +piq +pillow diff --git a/examples/mediatek/shell_scripts/export_oss.sh b/examples/mediatek/shell_scripts/export_oss.sh new file mode 100755 index 00000000000..3da5dc41f94 --- /dev/null +++ b/examples/mediatek/shell_scripts/export_oss.sh @@ -0,0 +1,29 @@ +model=$1 + +echo "Export model: $model" + +if [ $model = "deeplabv3" ] +then + python3 model_export_scripts/deeplab_v3.py -d +elif [ $model = "edsr" ] +then + python3 model_export_scripts/edsr.py -d +elif [ $model = "inceptionv3" ] +then + python3 model_export_scripts/inception_v3.py -d PATH_TO_DATASET +elif [ $model = "inceptionv4" ] +then + python3 model_export_scripts/inception_v4.py -d PATH_TO_DATASET +elif [ $model = "mobilenetv2" ] +then + python3 model_export_scripts/mobilenet_v2.py -d PATH_TO_DATASET +elif [ $model = "mobilenetv3" ] +then + python3 model_export_scripts/mobilenet_v3.py -d PATH_TO_DATASET +elif [ $model = "resnet18" ] +then + python3 model_export_scripts/resnet18.py -d PATH_TO_DATASET +elif [ $model = "resnet50" ] +then + python3 model_export_scripts/resnet50.py -d PATH_TO_DATASET +fi diff --git a/examples/models/flamingo/preprocess/export_preprocess_lib.py b/examples/models/flamingo/preprocess/export_preprocess_lib.py index 358b1f2149a..366f5989222 100644 --- a/examples/models/flamingo/preprocess/export_preprocess_lib.py +++ b/examples/models/flamingo/preprocess/export_preprocess_lib.py @@ -14,7 +14,7 @@ from executorch.extension.llm.custom_ops import preprocess_custom_ops # noqa from torch.export import Dim, ExportedProgram -from torchtune.models.clip.inference._transforms import _CLIPImageTransform +from torchtune.models.clip.inference._transform import _CLIPImageTransform def get_example_inputs() -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: diff --git a/examples/models/flamingo/preprocess/test_preprocess.py b/examples/models/flamingo/preprocess/test_preprocess.py index 34ad0ab8ed1..b990f44ca1b 100644 --- a/examples/models/flamingo/preprocess/test_preprocess.py +++ b/examples/models/flamingo/preprocess/test_preprocess.py @@ -22,7 +22,7 @@ from parameterized import parameterized from PIL import Image -from torchtune.models.clip.inference._transforms import ( +from torchtune.models.clip.inference._transform import ( _CLIPImageTransform, CLIPImageTransform, ) diff --git a/examples/models/llama2/TARGETS b/examples/models/llama2/TARGETS index 467949a5ebf..f1c56a5bda3 100644 --- 
a/examples/models/llama2/TARGETS +++ b/examples/models/llama2/TARGETS @@ -70,9 +70,12 @@ runtime.python_library( "export_llama.py", "export_llama_lib.py", "model.py", + "source_transformation/apply_spin_quant_r1_r2.py", "source_transformation/quantize.py", + "source_transformation/rms_norm.py", "source_transformation/rope.py", "source_transformation/sdpa.py", + "source_transformation/spin_quant.py", ], _is_external_target = True, base_module = "executorch.examples.models.llama2", @@ -83,6 +86,7 @@ runtime.python_library( "@EXECUTORCH_CLIENTS", ], deps = [ + "//ai_codesign/gen_ai/fast_hadamard_transform:fast_hadamard_transform", "//caffe2:torch", "//executorch/examples/models:model_base", "//executorch/examples/models:models", diff --git a/examples/models/llama2/eval_llama_lib.py b/examples/models/llama2/eval_llama_lib.py index 2d10f5edc0a..b8987ac5d49 100644 --- a/examples/models/llama2/eval_llama_lib.py +++ b/examples/models/llama2/eval_llama_lib.py @@ -41,6 +41,7 @@ def __init__( tokenizer: Union[SentencePieceTokenizer, Tiktoken], max_seq_length: Optional[int] = None, use_kv_cache: bool = False, + generate_full_logits: bool = False, enable_dynamic_shape: bool = True, ): super().__init__( @@ -48,6 +49,7 @@ def __init__( ) self._model = model.to(self.device) self._use_kv_cache = use_kv_cache + self._generate_full_logits = generate_full_logits self._enable_dynamic_shape = enable_dynamic_shape def _model_call(self, inps): @@ -60,7 +62,10 @@ def _model_call(self, inps): pos_tensor = torch.tensor([pos], dtype=torch.int64) logits = self._model(inps[:, pos : pos + 1], pos_tensor) result_logits.append(logits) - return torch.cat(result_logits, dim=1) + if self._generate_full_logits: + return torch.cat(result_logits, dim=1) + else: + return torch.stack(result_logits, dim=1) else: pos_tensor = torch.tensor([0], dtype=torch.int64, device=self.device) # Batch process the whole sequence. diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index f6abc3aaf4e..97228bb5c5d 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -16,7 +16,7 @@ from enum import Enum from json import JSONDecodeError from pathlib import Path -from typing import List, Optional, Union +from typing import Callable, List, Optional, Union import pkg_resources @@ -45,10 +45,15 @@ from executorch.util.activation_memory_profiler import generate_memory_trace from ..model_factory import EagerModelFactory +from .source_transformation.apply_spin_quant_r1_r2 import ( + fuse_layer_norms, + get_model_with_r1_r2, +) from .source_transformation.quantize import ( get_quant_embedding_transform, get_quant_weight_transform, ) +from .source_transformation.rms_norm import replace_rms_norm_with_native_rms_norm from .source_transformation.rope import materialze_broadcast_of_rope_freq_cis from .source_transformation.sdpa import ( replace_causal_mask, @@ -224,6 +229,13 @@ def build_args_parser() -> argparse.ArgumentParser: default=f"{ckpt_dir}/params/demo_config.json", help="config.json", ) + parser.add_argument( + "--optimized_rotation_path", + default=None, + required=False, + help="[QNN backend] Optimized rotation checkpoint path. Just apply R1/R2 here." 
+ "You can download the optimized rotation matrices from https://github.com/facebookresearch/SpinQuant/tree/main", + ) parser.add_argument( "-m", "--metadata", @@ -287,6 +299,17 @@ def build_args_parser() -> argparse.ArgumentParser: parser.add_argument("-V", "--vulkan", action="store_true") parser.add_argument("--mps", action="store_true") parser.add_argument("--coreml", action="store_true") + parser.add_argument( + "--coreml-enable-state", + action="store_true", + help="This option is only for coreml, and is only supported for MacOS15+/iOS18+", + ) + parser.add_argument( + "--coreml-quantize", + default=None, + choices=["b4w"], + help="This option is only for coreml: Use coreml quantization, e.g. b4w (for blockwise 4 bit weight)", + ) parser.add_argument( "--qnn", action="store_true", @@ -315,6 +338,23 @@ def build_args_parser() -> argparse.ArgumentParser: default=False, help="Generate logits for all inputs.", ) + + parser.add_argument( + "--soc_model", + help="[QNN backend] SoC model of current device. e.g. 'SM8650' for Snapdragon 8 Gen 3.", + type=str, + required=False, + default="SM8650", + ) + + parser.add_argument( + "-sq", + "--use_spin_quant", + type=str, + default=None, + choices=["cuda", "native"], + help="Use SpinQuant for better quantization performance. Only support cuda and native.", + ) return parser @@ -386,35 +426,6 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager: else: dtype_override = None - # source transforms - transforms = [] - if args.quantization_mode: - modelname = f"{modelname}_q" - transforms.append( - get_quant_weight_transform(args, dtype_override, verbose_export()) - ) - - if args.embedding_quantize: - modelname = f"{modelname}_e" - transforms.append(get_quant_embedding_transform(args)) - - if args.expand_rope_table: - transforms.append(materialze_broadcast_of_rope_freq_cis) - - if args.use_sdpa_with_kv_cache: - transforms.append(replace_sdpa_with_custom_op) - - if args.use_kv_cache: - if args.qnn: - transforms.append(replace_kv_cache_with_simple_kv_cache) - transforms.append(replace_sdpa_with_flex_sdpa) - transforms.append(replace_causal_mask) - - elif args.coreml or args.mps: - # Currently qnn/coreml/mps doesn't support sdpa op, use the simpler decomposition - # to get free perf gain. 
- transforms.append(replace_sdpa_with_simple_sdpa) - transforms.append(replace_causal_mask) return ( _load_llama_model( modelname=modelname, @@ -438,7 +449,7 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager: ) .set_output_dir(output_dir_path) .to_dtype(dtype_override) - .source_transform(transforms) + .source_transform(_get_source_transforms(modelname, dtype_override, args)) ) @@ -515,7 +526,10 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901 if args.coreml: coreml_partitioner = get_coreml_partitioner( - args.use_kv_cache, args.pt2e_quantize + args.use_kv_cache and args.coreml_enable_state, + args.embedding_quantize, + args.pt2e_quantize, + args.coreml_quantize, ) partitioners.append(coreml_partitioner) modelname = f"coreml_{modelname}" @@ -525,7 +539,7 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901 partitioners.append( get_qnn_partitioner( - args.use_kv_cache, args.pt2e_quantize, args.num_sharding + args.use_kv_cache, args.pt2e_quantize, args.num_sharding, args.soc_model ) ) # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.utils.utils` @@ -552,7 +566,10 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901 if args.num_sharding > 0 and args.qnn: from executorch.backends.qualcomm.utils.utils import canonicalize_program - canonicalize_program(builder.edge_manager.exported_program()) + # TODO: Need to remove this once we have better way to handle buffer size + canonicalize_program( + builder.edge_manager.exported_program(), custom_buffer_size=542048256 + ) builder = builder.to_executorch() @@ -569,7 +586,10 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901 if args.num_sharding > 0 and args.qnn: from executorch.backends.qualcomm.utils.utils import canonicalize_program - canonicalize_program(builder.edge_manager.exported_program()) + # TODO: Need to remove this once we have better way to handle buffer size + canonicalize_program( + builder.edge_manager.exported_program(), custom_buffer_size=542048256 + ) builder = builder.to_executorch() @@ -700,6 +720,7 @@ def _load_llama_model( max_seq_len=model.params.max_seq_len, dtype=dtype, use_kv_cache=use_kv_cache, + generate_full_logits=generate_full_logits, example_inputs=example_inputs, enable_dynamic_shape=enable_dynamic_shape, calibration_tasks=calibration_tasks, @@ -718,3 +739,59 @@ def _load_llama_model( ), args=args, ) + + +def _get_source_transforms( + modelname: str, dtype_override: Optional[DType], args +) -> List[Callable[[torch.nn.Module], torch.nn.Module]]: + transforms = [] + if args.quantization_mode: + modelname = f"{modelname}_q" + transforms.append( + get_quant_weight_transform(args, dtype_override, verbose_export()) + ) + + if args.embedding_quantize: + modelname = f"{modelname}_e" + transforms.append(get_quant_embedding_transform(args)) + + if args.expand_rope_table: + transforms.append(materialze_broadcast_of_rope_freq_cis) + + if args.use_sdpa_with_kv_cache: + transforms.append(replace_sdpa_with_custom_op) + + if args.use_kv_cache: + if args.qnn: + # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.utils.utils` + from executorch.backends.qualcomm.utils.utils import ( + convert_linear_to_conv2d, + ) + + transforms.append(replace_kv_cache_with_simple_kv_cache) + transforms.append(replace_sdpa_with_flex_sdpa) + transforms.append(replace_causal_mask) + 
transforms.append(replace_rms_norm_with_native_rms_norm) + if args.optimized_rotation_path: + transforms.append(fuse_layer_norms) + transforms.append(get_model_with_r1_r2(args.optimized_rotation_path)) + transforms.append(convert_linear_to_conv2d) + + elif args.coreml or args.mps: + # Currently qnn/coreml/mps doesn't support sdpa op, use the simpler decomposition + # to get free perf gain. + transforms.append(replace_sdpa_with_simple_sdpa) + transforms.append(replace_causal_mask) + + if args.use_spin_quant: + if args.use_spin_quant == "cuda": + from .source_transformation.spin_quant import ( + inject_fast_hadamard_transform_cuda_for_spin_quant, + ) + + transforms.append(inject_fast_hadamard_transform_cuda_for_spin_quant) + + elif args.use_spin_quant == "native": + raise NotImplementedError("native SpinQuant is not implemented yet.") + + return transforms diff --git a/examples/models/llama2/llama_transformer.py b/examples/models/llama2/llama_transformer.py index 0c93115ee3b..534d90c6ed9 100644 --- a/examples/models/llama2/llama_transformer.py +++ b/examples/models/llama2/llama_transformer.py @@ -39,6 +39,7 @@ def __init__(self, dim: int, eps: float = 1e-6): """ super().__init__() + self.dim = dim self.eps = eps self.weight = nn.Parameter(torch.ones(dim)) diff --git a/examples/models/llama2/source_transformation/apply_spin_quant_r1_r2.py b/examples/models/llama2/source_transformation/apply_spin_quant_r1_r2.py new file mode 100644 index 00000000000..e71007b1958 --- /dev/null +++ b/examples/models/llama2/source_transformation/apply_spin_quant_r1_r2.py @@ -0,0 +1,179 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import typing + +import torch + + +def rotate_embeddings(model, R1: torch.Tensor) -> None: + # Rotate the embeddings. + for W in [model.tok_embeddings]: + dtype = W.weight.data.dtype + W_ = W.weight.data.to(device="cpu", dtype=torch.float32) + W.weight.data = torch.matmul(W_, R1).to(device="cpu", dtype=dtype) + + +def rotate_attention_inputs(layer, R1) -> None: + # Rotate the WQ, WK and WV matrices of the self-attention layer. + for W in [layer.attention.wq, layer.attention.wk, layer.attention.wv]: + dtype = W.weight.dtype + W_ = W.weight.to(device="cpu", dtype=torch.float32) + W.weight.data = torch.matmul(W_, R1).to(device="cpu", dtype=dtype) + + +def rotate_attention_output(layer, R1) -> None: + # Rotate output matrix of the self-attention layer. + W = layer.attention.wo + dtype = W.weight.data.dtype + W_ = W.weight.data.to(device="cpu", dtype=torch.float32) + W.weight.data = torch.matmul(R1.T, W_).to(device="cpu", dtype=dtype) + if W.bias is not None: + b = W.bias.data.to(device="cpu", dtype=torch.float32) + W.bias.data = torch.matmul(R1.T, b).to(device="cpu", dtype=dtype) + + +def rotate_mlp_input(layer, R1): + # Rotate the MLP input weights. + mlp_inputs = [layer.feed_forward.w3, layer.feed_forward.w1] + for W in mlp_inputs: + dtype = W.weight.dtype + W_ = W.weight.data.to(device="cpu", dtype=torch.float32) + W.weight.data = torch.matmul(W_, R1).to(device="cpu", dtype=dtype) + + +def rotate_mlp_output(layer, R1): + # Rotate the MLP output weights and bias. 
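The rotate_* helpers above all lean on one identity: for an orthogonal R1, rotating the activations by R1 (via the embedding weights) while replacing each consuming weight W with W @ R1 cancels exactly, so the network computes the same function while its activation distributions become friendlier to quantize. A quick numerical check of that identity:

```python
import torch

dim, out_features = 8, 4
# A random orthogonal matrix via QR decomposition stands in for R1.
R1, _ = torch.linalg.qr(torch.randn(dim, dim))

x = torch.randn(2, dim)              # activations
W = torch.randn(out_features, dim)   # linear weight (out_features, in_features)

y_ref = x @ W.T                      # original layer
y_rot = (x @ R1) @ (W @ R1).T        # rotated activations + rotated weight

# R1 @ R1.T == I, so the rotation is lossless.
assert torch.allclose(y_ref, y_rot, atol=1e-5)
```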
+ W = layer.feed_forward.w2 + dtype = W.weight.data.dtype + W_ = W.weight.data.to(device="cpu", dtype=torch.float32) + W.weight.data = torch.matmul(R1.T, W_).to(device="cpu", dtype=dtype) + + if W.bias is not None: + b = W.bias.data.to(device="cpu", dtype=torch.float32) + W.bias.data = torch.matmul(R1.T, b).to(device="cpu", dtype=dtype) + + +def rotate_head(model, R1: torch.Tensor) -> None: + # Rotate the head. + W = model.output + dtype = W.weight.data.dtype + W_ = W.weight.data.to(device="cpu", dtype=torch.float32) + W.weight.data = torch.matmul(W_, R1).to(device="cpu", dtype=dtype) + + +def rotate_ov_proj(layer, head_dim, R2=None): + W = layer.attention.wv + dtype = W.weight.data.dtype + W_ = W.weight.data.to(device="cpu", dtype=torch.float32).t() + transposed_shape = W_.shape + temp = W_.reshape(-1, transposed_shape[-1] // head_dim, head_dim) + temp = temp.to(torch.float32) @ R2 + W_ = temp.reshape(transposed_shape).t() + W.weight.data = W_.to(device="cpu", dtype=dtype) + + W = layer.attention.wo + dtype = W.weight.data.dtype + W_ = W.weight.data.to(device="cpu", dtype=torch.float32) + init_shape = W_.shape + temp = W_.reshape(-1, init_shape[-1] // head_dim, head_dim) + temp = temp.to(torch.float32) @ R2 + W_ = temp.reshape(init_shape) + W.weight.data = W_.to(device="cpu", dtype=dtype) + + +def cleanup_memory() -> None: + """Run GC and clear GPU memory.""" + import gc + + # gc.collect and empty cache are necessary to clean up GPU memory if the model was distributed + gc.collect() + + +def get_model_with_r1_r2(optimized_rotation_path: str): + return lambda model: apply_spin_quant_r1_r2(model, optimized_rotation_path) + + +def apply_spin_quant_r1_r2(model: torch.nn.Module, optimized_rotation_path: str): + optimized_rotation = torch.load(optimized_rotation_path, weights_only=True) + R1 = optimized_rotation["R1"].to(torch.float32) + config = model.params + num_heads = config.n_heads + head_dim = config.dim // num_heads + + rotate_embeddings(model, R1) + rotate_head(model, R1) + cleanup_memory() + + for idx, layer in enumerate(model.layers): + key = f"model.layers.{idx}.self_attn.R2" + R2 = optimized_rotation[key].to(torch.float32) + rotate_attention_inputs(layer, R1) + rotate_attention_output(layer, R1) + rotate_mlp_input(layer, R1) + rotate_mlp_output(layer, R1) + rotate_ov_proj(layer, head_dim, R2=R2) + return model + + +def fuse_ln_linear( + layernorm: torch.nn.Module, linear_layers: typing.Iterable[torch.nn.Linear] +) -> None: + """ + fuse the linear operations in Layernorm into the adjacent linear blocks. + """ + for linear in linear_layers: + linear_dtype = linear.weight.dtype + + # Calculating new weight and bias + W_ = linear.weight.data.to(dtype=torch.float32) + linear.weight.data = (W_ * layernorm.weight.to(dtype=torch.float32)).to( + linear_dtype + ) + + if hasattr(layernorm, "bias"): + if linear.bias is None: + linear.bias = torch.nn.Parameter( + torch.zeros(linear.out_features, dtype=torch.float32) + ) + linear.bias.data = linear.bias.data.to(dtype=torch.float32) + torch.matmul( + W_, layernorm.bias.to(dtype=torch.float32) + ) + linear.bias.data = linear.bias.data.to(linear_dtype) + + +def fuse_layer_norms(model: torch.nn.Module): + # Embedding fusion + for W in [model.tok_embeddings]: + W_ = W.weight.data.to(dtype=torch.float32) + W.weight.data = (W_ - W_.mean(dim=-1, keepdim=True)).to(W.weight.data.dtype) + + # Fuse the linear operations in Layernorm into the adjacent linear blocks. 
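fuse_ln_linear above relies on the standard norm-folding identity: an elementwise scale gamma applied before a linear equals scaling the linear's input columns by gamma, which is why fuse_layer_norms can afterwards reset each norm weight to ones. A small numerical check:

```python
import torch

dim, out_features = 6, 3
gamma = torch.randn(dim)            # the norm's learned scale
W = torch.randn(out_features, dim)  # the adjacent linear's weight
x = torch.randn(2, dim)             # stands in for normalized activations

y_ref = (x * gamma) @ W.T           # scale, then linear
y_fused = x @ (W * gamma).T         # folded weight, gamma now implicit

assert torch.allclose(y_ref, y_fused, atol=1e-5)
```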
+    for layer in model.layers:
+        # fuse the input layernorms into the linear layers
+        fuse_ln_linear(layer.ffn_norm, [layer.feed_forward.w3, layer.feed_forward.w1])
+        fuse_ln_linear(
+            layer.attention_norm,
+            [
+                layer.attention.wq,
+                layer.attention.wk,
+                layer.attention.wv,
+            ],
+        )
+
+        W_norm = layer.ffn_norm.weight.data
+        layer.ffn_norm.weight.data = torch.ones_like(W_norm, dtype=torch.float32)
+        W_norm = layer.attention_norm.weight.data
+        layer.attention_norm.weight.data = torch.ones_like(W_norm, dtype=torch.float32)
+
+    fuse_ln_linear(
+        model.norm,
+        [model.output],
+    )
+    W_norm = model.norm.weight.data
+    model.norm.weight.data = torch.ones_like(W_norm, dtype=torch.float32)
+
+    return model
diff --git a/examples/models/llama2/source_transformation/rms_norm.py b/examples/models/llama2/source_transformation/rms_norm.py
new file mode 100644
index 00000000000..ff7e8b67457
--- /dev/null
+++ b/examples/models/llama2/source_transformation/rms_norm.py
@@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.examples.models.llama2.llama_transformer import RMSNorm
+
+
+def replace_rms_norm_with_native_rms_norm(module: torch.nn.Module):
+    for name, child in module.named_children():
+        if isinstance(child, RMSNorm):
+            rms_norm = torch.nn.RMSNorm(child.dim, eps=child.eps)
+            rms_norm.weight = child.weight
+            setattr(
+                module,
+                name,
+                rms_norm,
+            )
+        else:
+            replace_rms_norm_with_native_rms_norm(child)
+    return module
diff --git a/examples/models/llama2/source_transformation/sdpa.py b/examples/models/llama2/source_transformation/sdpa.py
index 8e5de7d97ae..c48fdf0ae58 100644
--- a/examples/models/llama2/source_transformation/sdpa.py
+++ b/examples/models/llama2/source_transformation/sdpa.py
@@ -118,8 +118,9 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
     num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
     """
-    if n_rep == 1:
-        return hidden_states
+    # TODO: Encountered a bug with source partitioning; needs more investigation.
+    # if n_rep == 1:
+    #     return hidden_states
 
     new_kv = []
     batch, n_heads, seqlen, head_dim = hidden_states.shape
diff --git a/examples/models/llama2/source_transformation/spin_quant.py b/examples/models/llama2/source_transformation/spin_quant.py
new file mode 100644
index 00000000000..7b38312c182
--- /dev/null
+++ b/examples/models/llama2/source_transformation/spin_quant.py
@@ -0,0 +1,55 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+# Helper functions for transforming the model to be able to run SpinQuant.
+# See https://github.com/facebookresearch/SpinQuant for more details about SpinQuant.
+
+import torch
+
+import torch.nn.functional as F
+
+from executorch.examples.models.llama2.llama_transformer import FeedForward
+from torch import nn
+
+
+def _inject_fast_hadamard_transform_cuda_for_spin_quant(module: torch.nn.Module):
+    """
+    SpinQuant needs two Hadamard matrices: R3 and R4. Here we are only injecting R4 in the feed forward layer.
+    R3 needs to be injected as well when KV cache quantization is enabled.
+ """ + try: + from fast_hadamard_transform import hadamard_transform + except ImportError: + raise ImportError( + "Please install fast-hadamard-transform: pip install fast-hadamard-transform" + ) + + class FeedForwardCustom(nn.Module): + def __init__(self, w1, w2, w3): + super().__init__() + self.w1 = w1 + self.w2 = w2 + self.w3 = w3 + + def forward(self, x): + w = F.silu(self.w1(x)) * self.w3(x) + n = w.shape[-1] + return self.w2(hadamard_transform(w.contiguous()) / torch.tensor(n).sqrt()) + + for name, child in module.named_children(): + if isinstance(child, FeedForward): + setattr(module, name, FeedForwardCustom(child.w1, child.w2, child.w3)) + else: + _inject_fast_hadamard_transform_cuda_for_spin_quant(child) + + +def inject_fast_hadamard_transform_cuda_for_spin_quant( + module: torch.nn.Module, +) -> torch.nn.Module: + _inject_fast_hadamard_transform_cuda_for_spin_quant(module) + return module diff --git a/examples/models/llava/runner/llava_runner.cpp b/examples/models/llava/runner/llava_runner.cpp index 64763c72576..1924b057ec4 100644 --- a/examples/models/llava/runner/llava_runner.cpp +++ b/examples/models/llava/runner/llava_runner.cpp @@ -99,12 +99,17 @@ Error LlavaRunner::generate_from_pos( int64_t start_pos, std::function token_callback, std::function - stats_callback) { + stats_callback, + bool echo) { // prefill user prompt. No BOS because preset prompt already has it. - token_callback(prompt); + if (echo) { + token_callback(prompt); + } uint64_t prefill_next_token = ET_UNWRAP(prefill_prompt(prompt, start_pos, /*bos=*/0, /*eos*/ 0)); + stats_.first_token_ms = util::time_in_ms(); + stats_.prompt_eval_end_ms = util::time_in_ms(); stats_.num_prompt_tokens = start_pos; // Generate tokens @@ -113,7 +118,6 @@ Error LlavaRunner::generate_from_pos( // Bookkeeping stats_.num_generated_tokens = num_generated_tokens; - ::executorch::llm::print_report(stats_); if (stats_callback) { stats_callback(stats_); } @@ -125,7 +129,8 @@ Error LlavaRunner::generate( const std::string& prompt, int32_t seq_len, std::function token_callback, - std::function stats_callback) { + std::function stats_callback, + bool echo) { ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null"); if (!is_loaded()) { ET_CHECK_OK_OR_RETURN_ERROR(load()); @@ -147,6 +152,7 @@ Error LlavaRunner::generate( }; int64_t pos = 0; + stats_.inference_start_ms = util::time_in_ms(); // prefill preset prompt prefill_prompt(kPresetPrompt, pos, /*bos=*/1, /*eos*/ 0); @@ -160,8 +166,11 @@ Error LlavaRunner::generate( util::get_rss_bytes() / 1024.0 / 1024.0); // Generate tokens - Error err = - generate_from_pos(prompt, seq_len, pos, wrapped_callback, stats_callback); + Error err = generate_from_pos( + prompt, seq_len, pos, wrapped_callback, stats_callback, echo); + + stats_.inference_end_ms = util::time_in_ms(); + ::executorch::llm::print_report(stats_); ET_LOG( Info, diff --git a/examples/models/llava/runner/llava_runner.h b/examples/models/llava/runner/llava_runner.h index 923f8180a83..e671718ae5e 100644 --- a/examples/models/llava/runner/llava_runner.h +++ b/examples/models/llava/runner/llava_runner.h @@ -36,7 +36,8 @@ class LlavaRunner : public MultimodalRunner { int32_t seq_len = 1024, std::function token_callback = {}, std::function - stats_callback = {}); + stats_callback = {}, + bool echo = true); /** * Prefill an LLaVA Module with the given images input. @@ -70,6 +71,7 @@ class LlavaRunner : public MultimodalRunner { * @param start_pos The starting position in KV cache of the input in the LLM. 
* @param token_callback What to do after a token is generated. * @param stats_callback What to do with Stats. + * @param echo Whether to echo the input prompt or not. * @return The error code. */ Error generate_from_pos( @@ -78,7 +80,8 @@ class LlavaRunner : public MultimodalRunner { int64_t start_pos = 0, std::function token_callback = {}, std::function - stats_callback = {}); + stats_callback = {}, + bool echo = true); private: inline static const std::string kPresetPrompt = diff --git a/examples/qualcomm/oss_scripts/llama2/llama.py b/examples/qualcomm/oss_scripts/llama2/llama.py index f7fda3b9849..df8c876abf2 100644 --- a/examples/qualcomm/oss_scripts/llama2/llama.py +++ b/examples/qualcomm/oss_scripts/llama2/llama.py @@ -16,8 +16,7 @@ from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner from executorch.backends.qualcomm.passes.build_quant_io import BuildQuantIo -from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer, QuantDtype -from executorch.backends.qualcomm.quantizer.utils import get_16a4w_qnn_ptq_config +from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( QcomChipset, ) @@ -34,13 +33,13 @@ ) from executorch.examples.qualcomm.utils import ( make_output_dir, + make_quantizer, setup_common_args_and_variables, SimpleADB, ) from executorch.exir import EdgeCompileConfig, EdgeProgramManager from executorch.exir.capture._config import ExecutorchBackendConfig from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass -from executorch.exir.program._program import _get_updated_graph_signature from executorch.extension.llm.export.builder import DType from sentencepiece import SentencePieceProcessor @@ -274,20 +273,12 @@ def _tag_kv_ios(self, gm: torch.fx.GraphModule, kv_type): def quantize(self, quant_dtype, custom_annotations=()): self.quant_dtype = quant_dtype - quantizer = QnnQuantizer() - quantizer.set_per_channel_linear_quant(True) - quantizer.set_per_channel_conv_quant(True) - - if quant_dtype == QuantDtype.use_8a8w: - pass # default setting - elif quant_dtype == QuantDtype.use_16a4w: - quantizer.add_16bit_quant_ops(quantizer.SUPPORTED_OPS) - quantizer.set_bit16_op_quant_config( - get_16a4w_qnn_ptq_config(act_observer=MinMaxObserver) - ) - quantizer.set_per_channel_weight_dtype(weight_dtype_for_16bit_act="int4") - else: - raise AssertionError(f"No support for QuantDtype {quant_dtype}.") + quantizer = make_quantizer( + quant_dtype=quant_dtype, + per_channel_conv=True, + per_channel_linear=True, + act_observer=MinMaxObserver, + ) quantizer.add_custom_quant_annotations(custom_annotations) self.has_quant_io = True @@ -367,6 +358,7 @@ def compile(args): ) end_load_ts = time.time() print("torch.load checkpoint", end_load_ts - start_ts) + llama_instance = None with torch.device("meta"): llama_instance = LlamaModel(config, output_new_cache_only=True) @@ -383,16 +375,13 @@ def compile(args): for layer in llama_instance.layers: if getattr(layer.attention, "prepare_sha", None): layer.attention.prepare_sha() - kv_type = torch.uint8 - if args.ptq == "8a8w": - quant_dtype = QuantDtype.use_8a8w - elif args.ptq == "16a4w": - quant_dtype = QuantDtype.use_16a4w - else: - raise AssertionError( - f"No support for quant type {args.ptq}. Support 8a8w and 16a4w." - ) + kv_type = torch.uint8 + assert args.ptq in [ + "8a8w", + "16a4w", + ], f"No support for quant type {args.ptq}. Support 8a8w and 16a4w." 
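The assert-plus-getattr pattern here (and the try/except variant in mobilebert_fine_tune.py below) resolves the --ptq string to a QuantDtype member by name instead of walking an if/elif ladder. The pattern in isolation, with a hypothetical stand-in enum so the sketch needs no executorch install:

```python
from enum import Enum, auto


class QuantDtypeSketch(Enum):  # hypothetical stand-in for QuantDtype
    use_8a8w = auto()
    use_16a4w = auto()


ptq = "16a4w"
assert ptq in ["8a8w", "16a4w"], f"No support for quant type {ptq}."
quant_dtype = getattr(QuantDtypeSketch, f"use_{ptq}")
```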
+    quant_dtype = getattr(QuantDtype, f"use_{args.ptq}")
 
     assert args.tokenizer_model is not None, "Need tokenizer model for calibration"
 
     if args.dtype_override is not None:
diff --git a/examples/qualcomm/scripts/mobilebert_fine_tune.py b/examples/qualcomm/scripts/mobilebert_fine_tune.py
index 278ab8e8c02..605bb27d330 100755
--- a/examples/qualcomm/scripts/mobilebert_fine_tune.py
+++ b/examples/qualcomm/scripts/mobilebert_fine_tune.py
@@ -13,13 +13,24 @@
 import torch
 from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype
+from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import (
+    QcomChipset,
+)
+from executorch.backends.qualcomm.utils.utils import (
+    generate_htp_compiler_spec,
+    generate_qnn_executorch_compiler_spec,
+    skip_annotation,
+)
 from executorch.examples.qualcomm.utils import (
     build_executorch_binary,
     make_output_dir,
+    make_quantizer,
     parse_skip_delegation_node,
+    QnnPartitioner,
     setup_common_args_and_variables,
     SimpleADB,
 )
+from executorch.exir import to_edge
 from transformers import BertTokenizer, MobileBertForSequenceClassification
 
 
@@ -204,8 +215,6 @@ def get_fine_tuned_mobilebert(artifacts_dir, pretrained_weight, batch_size):
     )
 
     model.load_state_dict(
-        # TODO: If possible, it's better to set weights_only to True
-        # https://pytorch.org/docs/stable/generated/torch.load.html
         torch.load(
             (
                 f"{artifacts_dir}/finetuned_mobilebert_epoch_{epochs}.model"
@@ -213,7 +222,7 @@
             else pretrained_weight
             ),
             map_location=torch.device("cpu"),
-            weights_only=False,
+            weights_only=True,
        ),
    )
 
@@ -232,38 +241,65 @@ def main(args):
             "Please specify a device serial by -s/--device argument."
         )
 
-    pte_filename = "ptq_mb_qnn" if args.ptq else "mb_qnn"
-    batch_size = 1 if args.ptq else 3
+    batch_size, pte_filename = 1, "ptq_mb_qnn"
     model, data_val, labels = get_fine_tuned_mobilebert(
         args.artifact, args.pretrained_weight, batch_size
     )
     inputs, input_list = get_dataset(data_val)
 
-    if args.ptq == "8a8w":
-        quant_dtype = QuantDtype.use_8a8w
-    elif args.ptq == "16a16w":
-        quant_dtype = QuantDtype.use_16a16w
-    elif args.ptq == "16a4w":
-        quant_dtype = QuantDtype.use_16a4w
-    else:
+    try:
+        quant_dtype = getattr(QuantDtype, f"use_{args.ptq}")
+    except AttributeError:
         raise AssertionError(
             f"No support for quant type {args.ptq}. Support 8a8w, 16a16w and 16a4w."
) if args.use_fp16: quant_dtype = None + pte_filename = "mb_qnn" + build_executorch_binary( + model, + inputs[0], + args.model, + f"{args.artifact}/{pte_filename}", + inputs, + skip_node_id_set=skip_node_id_set, + skip_node_op_set=skip_node_op_set, + quant_dtype=quant_dtype, + shared_buffer=args.shared_buffer, + ) + else: - build_executorch_binary( - model, - inputs[0], - args.model, - f"{args.artifact}/{pte_filename}", - inputs, - skip_node_id_set=skip_node_id_set, - skip_node_op_set=skip_node_op_set, - quant_dtype=quant_dtype, - shared_buffer=args.shared_buffer, - ) + def calibrator(gm): + for input in inputs: + gm(*input) + + quantizer = make_quantizer(quant_dtype=quant_dtype) + backend_options = generate_htp_compiler_spec(quant_dtype is not None) + partitioner = QnnPartitioner( + generate_qnn_executorch_compiler_spec( + soc_model=getattr(QcomChipset, args.model), + backend_options=backend_options, + ), + skip_node_id_set=skip_node_id_set, + skip_node_op_set=skip_node_op_set, + ) + # skip embedding layer cause it's quantization sensitive + graph_module, _ = skip_annotation( + nn_module=model, + quantizer=quantizer, + partitioner=partitioner, + sample_input=inputs[0], + calibration_cb=calibrator, + fp_node_op_set={torch.ops.aten.embedding.default}, + ) + # lower all graph again, the skipped operators will be left in CPU + exec_prog = to_edge( + torch.export.export(graph_module, inputs[0]), + ).to_executorch() + + with open(f"{args.artifact}/{pte_filename}.pte", "wb") as file: + file.write(exec_prog.buffer) if args.compile_only: sys.exit(0) diff --git a/examples/qualcomm/utils.py b/examples/qualcomm/utils.py index 1a748bb45e1..5d9a3aef262 100755 --- a/examples/qualcomm/utils.py +++ b/examples/qualcomm/utils.py @@ -19,6 +19,7 @@ from executorch.backends.qualcomm.quantizer.quantizer import ( get_16a4w_qnn_ptq_config, get_default_16bit_qnn_ptq_config, + get_default_8bit_qnn_ptq_config, QnnQuantizer, QuantDtype, ) @@ -30,7 +31,7 @@ generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, ) -from executorch.exir import EdgeCompileConfig, EdgeProgramManager +from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge from executorch.exir.backend.backend_api import to_backend from executorch.exir.capture._config import ExecutorchBackendConfig from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass @@ -178,6 +179,39 @@ def pull_etdump(self, output_path, callback=None): callback() +def make_quantizer( + quant_dtype: Optional[QuantDtype], + custom_annotations=(), + per_channel_conv=True, + per_channel_linear=False, + act_observer=MovingAverageMinMaxObserver, +): + quantizer = QnnQuantizer() + quantizer.add_custom_quant_annotations(custom_annotations) + quantizer.set_per_channel_conv_quant(per_channel_conv) + quantizer.set_per_channel_linear_quant(per_channel_linear) + + if quant_dtype == QuantDtype.use_8a8w: + quantizer.set_bit8_op_quant_config( + get_default_8bit_qnn_ptq_config(act_observer=act_observer) + ) + elif quant_dtype == QuantDtype.use_16a16w: + quantizer.add_16bit_quant_ops(quantizer.SUPPORTED_OPS) + quantizer.set_bit16_op_quant_config( + get_default_16bit_qnn_ptq_config(act_observer=act_observer) + ) + elif quant_dtype == QuantDtype.use_16a4w: + quantizer.add_16bit_quant_ops(quantizer.SUPPORTED_OPS) + quantizer.set_bit16_op_quant_config( + get_16a4w_qnn_ptq_config(act_observer=act_observer) + ) + quantizer.set_per_channel_weight_dtype(weight_dtype_for_16bit_act="int4") + else: + raise AssertionError(f"No support for QuantDtype {quant_dtype}.") 
+ + return quantizer + + # TODO: refactor to support different backends def build_executorch_binary( model, # noqa: B006 @@ -195,27 +229,13 @@ def build_executorch_binary( act_observer=MovingAverageMinMaxObserver, ): if quant_dtype is not None: - quantizer = QnnQuantizer() - quantizer.add_custom_quant_annotations(custom_annotations) - quantizer.set_per_channel_linear_quant(per_channel_linear) - quantizer.set_per_channel_conv_quant(True) - - if quant_dtype == QuantDtype.use_8a8w: - pass # default setting - elif quant_dtype == QuantDtype.use_16a16w: - quantizer.add_16bit_quant_ops(quantizer.SUPPORTED_OPS) - quantizer.set_bit16_op_quant_config( - get_default_16bit_qnn_ptq_config(act_observer=act_observer) - ) - elif quant_dtype == QuantDtype.use_16a4w: - quantizer.add_16bit_quant_ops(quantizer.SUPPORTED_OPS) - quantizer.set_bit16_op_quant_config( - get_16a4w_qnn_ptq_config(act_observer=act_observer) - ) - quantizer.set_per_channel_weight_dtype(weight_dtype_for_16bit_act="int4") - else: - raise AssertionError(f"No support for QuantDtype {quant_dtype}.") - + quantizer = make_quantizer( + quant_dtype=quant_dtype, + custom_annotations=custom_annotations, + per_channel_conv=True, + per_channel_linear=per_channel_linear, + act_observer=act_observer, + ) captured_model = torch.export.export(model, inputs).module() annotated_model = prepare_pt2e(captured_model, quantizer) print("Quantizing the model...") @@ -225,29 +245,20 @@ def build_executorch_binary( else: for data in dataset: annotated_model(*data) + quantized_model = convert_pt2e(annotated_model) edge_prog = capture_program(quantized_model, inputs) else: edge_prog = capture_program(model, inputs) - arch_table = { - "SM8650": QcomChipset.SM8650, - "SM8550": QcomChipset.SM8550, - "SM8475": QcomChipset.SM8475, - "SM8450": QcomChipset.SM8450, - } - backend_options = generate_htp_compiler_spec( use_fp16=False if quant_dtype else True ) qnn_partitioner = QnnPartitioner( generate_qnn_executorch_compiler_spec( - soc_model=arch_table[soc_model], + soc_model=getattr(QcomChipset, soc_model), backend_options=backend_options, - debug=False, - saver=False, shared_buffer=shared_buffer, - profile=False, ), skip_node_id_set, skip_node_op_set, @@ -263,15 +274,12 @@ def build_executorch_binary( alloc_graph_input=not shared_buffer, alloc_graph_output=not shared_buffer, ), - extract_delegate_segments=True, ) if metadata is None: - edge_prog.exported_program = to_backend( - edge_prog.exported_program, qnn_partitioner - ) - edge_prog.exported_program.graph_module.graph.print_tabular() - exec_prog = edge_prog.to_executorch(config=executorch_config) + exported_program = to_backend(edge_prog.exported_program, qnn_partitioner) + exported_program.graph_module.graph.print_tabular() + exec_prog = to_edge(exported_program).to_executorch(config=executorch_config) with open(f"{file_name}.pte", "wb") as file: file.write(exec_prog.buffer) else: diff --git a/exir/_serialize/_dataclass.py b/exir/_serialize/_dataclass.py index 8f6ef1c172b..013d733bcda 100644 --- a/exir/_serialize/_dataclass.py +++ b/exir/_serialize/_dataclass.py @@ -129,6 +129,13 @@ class Example data[key] = [_json_to_dataclass(e, T) for e in value] continue + # If T is a Union, then check which type in the Union it is and initialize. + # eg. 
Double type in schema.py
+        if get_origin(T) is Union:
+            res = [x for x in get_args(get_type_hints(cls)[key]) if x == type(value)]
+            data[key] = res[0](value)
+            continue
+
         # If T is an enum then lookup the value in the enum otherwise try to
         # cast value to whatever type is required
         if isinstance(T, enum.EnumMeta):
diff --git a/exir/_serialize/_flatbuffer.py b/exir/_serialize/_flatbuffer.py
index 93006612c73..4599249f00c 100644
--- a/exir/_serialize/_flatbuffer.py
+++ b/exir/_serialize/_flatbuffer.py
@@ -29,14 +29,6 @@ def _is_valid_alignment(alignment: int) -> bool:
     return alignment > 0 and (alignment & (alignment - 1)) == 0
 
 
-# TODO(T182299196): Replace this hack with a proper flatc binary.
-def _replace_infinity_in_json_file(content: str) -> str:
-    content = re.sub(
-        r'"double_val"\s*:\s*(-)?Infinity', r'"double_val": "\g<1>inf"', content
-    )
-    return content
-
-
 def _patch_schema_alignment(
     schema: bytes,
     constant_tensor_alignment: Optional[int],
@@ -291,11 +283,8 @@ def _program_json_to_flatbuffer(
     json_path = os.path.join(temp_dir, file_stem + ".json")
     output_path = os.path.join(temp_dir, file_stem + ".pte")
 
-    # TODO(T182299196): Replace this hack with a proper flatc binary.
-    replaced_program_json = _replace_infinity_in_json_file(program_json)
-
     with open(json_path, "wb") as json_file:
-        json_file.write(replaced_program_json.encode("ascii"))
+        json_file.write(program_json.encode("ascii"))
 
     try:
         _flatc_compile(temp_dir, schema_info.root_path, json_path)
@@ -330,6 +319,19 @@
     )
 
 
+def _replace_infinity_in_json_file(content: bytes) -> bytes:
+    """Replace inf and -inf float values with the strings "inf" and "-inf" in
+    the given JSON. program.fbs is used to convert from flatbuffer to JSON, but
+    JSON has no literal for +/-infinity, so the float values are replaced with
+    their string equivalents. When converting from JSON to python dataclasses,
+    the string is read as a Union of float and string (see schema.py).
+    """
+    content = re.sub(
+        rb'"double_val"\s*:\s*(-)?inf', rb'"double_val": "\g<1>inf"', content
+    )
+    return content
+
+
 def _program_flatbuffer_to_json(program_flatbuffer: bytes) -> bytes:
     """Converts binary flatbuffer data into Program-compatible JSON.
 
@@ -348,4 +350,5 @@
     _flatc_decompile(temp_dir, schema_info.root_path, bin_path)
     with open(json_path, "rb") as output_file:
-        return output_file.read()
+        json_data = output_file.read()
+        return _replace_infinity_in_json_file(json_data)
diff --git a/exir/_serialize/_program.py b/exir/_serialize/_program.py
index 2256d5fcc99..00a3d4700f0 100644
--- a/exir/_serialize/_program.py
+++ b/exir/_serialize/_program.py
@@ -553,6 +553,24 @@ def _restore_segments(program: Program, segment_data: bytes) -> Program:
             location=DataLocation.INLINE, index=data_index
         )
 
+    # Replace constants from constant_segment into constant_buffer.
+    if program.constant_segment and len(program.constant_segment.offsets) > 0:
+        buffers: List[Buffer] = []
+        constant_segment = segments[program.constant_segment.segment_index]
+        for i in range(len(program.constant_segment.offsets)):
+            start_offset = program.constant_segment.offsets[i]
+            # Note: this is the original end offset plus any padding between
+            # it and the next start offset.
+ end_offset = ( + program.constant_segment.offsets[i + 1] + if i < len(program.constant_segment.offsets) - 1 + else len(constant_segment) + ) + buffers.append(Buffer(storage=constant_segment[start_offset:end_offset])) + program.constant_buffer = buffers + program.constant_segment.segment_index = 0 + program.constant_segment.offsets = [] + # Clear out the segments list since the original Program didn't have one. program.segments = [] return program diff --git a/exir/_serialize/test/test_program.py b/exir/_serialize/test/test_program.py index afd8e3d282e..f20c0b39798 100644 --- a/exir/_serialize/test/test_program.py +++ b/exir/_serialize/test/test_program.py @@ -272,6 +272,15 @@ def constant_segment_with_tensor_alignment( f"{segment_table}", ) + # Convert back. + program2 = deserialize_pte_binary(pte_data) + # Programs are the same besides constant_buffer, as deserialization + # does not preserve constant segment; padding may be added + # during serialization. + self.assertEqual(program2.execution_plan, program.execution_plan) + # Number of constant tensors should be the same. + self.assertEqual(len(program2.constant_buffer), len(program.constant_buffer)) + def test_canonicalize_delegate_indices(self) -> None: def make_execution_plan( name: str, delegates: List[BackendDelegate] @@ -462,7 +471,6 @@ def gen_blob_data(size: int, pattern: bytes) -> bytes: assert len(ret) == size return ret - @unittest.skip("TODO(T181362263): Update restore segments to restore cords") def test_round_trip_with_segments(self) -> None: # Create a program with some delegate data blobs. program = get_test_program() @@ -803,6 +811,15 @@ def test_constant_segment_and_delegate_segment(self) -> None: + b"\x40\x44\x44", ) + # Convert back. + program2 = deserialize_pte_binary(pte_data) + # Programs are the same besides constant_buffer, as deserialization + # does not preserve constant segment; padding may be added + # during serialization. + self.assertEqual(program2.execution_plan, program.execution_plan) + # Number of constant tensors should be the same. + self.assertEqual(len(program2.constant_buffer), len(program.constant_buffer)) + # Common data for extended header tests. The two example values should produce # the example data. 
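The restore loop above recovers each constant's bytes by slicing between consecutive offsets, with the last tensor running to the end of the segment; inter-tensor alignment padding therefore stays attached to the preceding buffer, which is why the new round-trip tests compare buffer counts rather than exact bytes. In miniature:

```python
# Three constants packed into one segment, padded to 4-byte boundaries.
segment = b"\x11\x11\x00\x00" + b"\x22\x22\x22\x00" + b"\x33"
offsets = [0, 4, 8]

buffers = []
for i, start in enumerate(offsets):
    end = offsets[i + 1] if i < len(offsets) - 1 else len(segment)
    buffers.append(segment[start:end])

assert buffers == [b"\x11\x11\x00\x00", b"\x22\x22\x22\x00", b"\x33"]
```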
diff --git a/exir/backend/test/TARGETS b/exir/backend/test/TARGETS index b99f374d83c..5c3a5e3eb32 100644 --- a/exir/backend/test/TARGETS +++ b/exir/backend/test/TARGETS @@ -82,15 +82,14 @@ python_library( "//executorch/test/...", ], deps = [ - ":backend_with_compiler_demo", - "//caffe2:torch", - "//executorch/exir:graph_module", - "//executorch/exir/backend:compile_spec_schema", - "//executorch/exir/backend:partitioner", - "//executorch/exir/backend/canonical_partitioners:canonical_partitioner_lib", - "//executorch/exir/backend/test/demos/rpc:executor_backend_partitioner", - "//executorch/exir/backend/test/demos/rpc:executor_backend_preprocess", - "//executorch/exir/dialects:lib", + "fbcode//caffe2:torch", + "fbcode//executorch/exir:graph_module", + "fbcode//executorch/exir/backend:compile_spec_schema", + "fbcode//executorch/exir/backend:partitioner", + "fbcode//executorch/exir/backend/canonical_partitioners:canonical_partitioner_lib", + "fbcode//executorch/exir/backend/test:backend_with_compiler_demo", + "fbcode//executorch/exir/backend/test/demos/rpc:executor_backend_preprocess", + "fbcode//executorch/exir/dialects:lib", ], ) diff --git a/exir/backend/test/test_partitioner.py b/exir/backend/test/test_partitioner.py index 3973011a269..da1ae0444dd 100644 --- a/exir/backend/test/test_partitioner.py +++ b/exir/backend/test/test_partitioner.py @@ -39,9 +39,8 @@ _load_for_executorch_from_buffer, ) from executorch.extension.pytree import tree_flatten -from torch._export import capture_pre_autograd_graph from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param -from torch.export import export +from torch.export import export, export_for_training from torch.fx.passes.operator_support import any_chain @@ -77,7 +76,7 @@ def partition( mlp = MLP() example_inputs = mlp.get_random_inputs() - model = capture_pre_autograd_graph(mlp, example_inputs) + model = export_for_training(mlp, example_inputs).module() aten = export(model, example_inputs) spec_key = "path" spec_value = "/a/b/c/d" @@ -138,7 +137,7 @@ def partition( mlp = MLP() example_inputs = mlp.get_random_inputs() - model = capture_pre_autograd_graph(mlp, example_inputs) + model = export_for_training(mlp, example_inputs).module() aten = export(model, example_inputs) edge = exir.to_edge(aten) @@ -178,7 +177,7 @@ def partition( mlp = MLP() example_inputs = mlp.get_random_inputs() - model = capture_pre_autograd_graph(mlp, example_inputs) + model = export_for_training(mlp, example_inputs).module() edge = exir.to_edge(export(model, example_inputs)) with self.assertRaisesRegex( @@ -230,7 +229,7 @@ def partition( partition_tags=partition_tags, ) - model = capture_pre_autograd_graph(self.AddConst(), (torch.ones(2, 2),)) + model = export_for_training(self.AddConst(), (torch.ones(2, 2),)).module() edge = exir.to_edge(export(model, (torch.ones(2, 2),))) delegated = edge.to_backend(PartitionerNoTagData()) @@ -309,7 +308,7 @@ def partition( partition_tags=partition_tags, ) - model = capture_pre_autograd_graph(self.AddConst(), (torch.ones(2, 2),)) + model = export_for_training(self.AddConst(), (torch.ones(2, 2),)).module() edge = exir.to_edge(export(model, (torch.ones(2, 2),))) delegated = edge.to_backend(PartitionerTagData()) @@ -384,7 +383,7 @@ def partition( partition_tags=partition_tags, ) - model = capture_pre_autograd_graph(self.AddConst(), (torch.ones(2, 2),)) + model = export_for_training(self.AddConst(), (torch.ones(2, 2),)).module() edge = exir.to_edge(export(model, (torch.ones(2, 2),))) delegated = 
edge.to_backend(PartitionerTagData())
@@ -472,7 +471,7 @@ def partition(
         )

         inputs = (torch.ones(2, 2),)
-        model = capture_pre_autograd_graph(ReuseConstData(), (torch.ones(2, 2),))
+        model = export_for_training(ReuseConstData(), (torch.ones(2, 2),)).module()
         edge = exir.to_edge(export(model, (torch.ones(2, 2),)))
         exec_prog = edge.to_backend(PartitionerTagData()).to_executorch()
         executorch_module = _load_for_executorch_from_buffer(exec_prog.buffer)
@@ -532,7 +531,7 @@ def partition(
                 partition_tags=partition_tags,
             )

-        model = capture_pre_autograd_graph(ReuseConstData(), (torch.ones(2, 2),))
+        model = export_for_training(ReuseConstData(), (torch.ones(2, 2),)).module()
         edge = exir.to_edge(export(model, (torch.ones(2, 2),)))
         with self.assertRaises(RuntimeError) as error:
             _ = edge.to_backend(PartitionerTagData())
diff --git a/exir/backend/test/test_passes.py b/exir/backend/test/test_passes.py
index 8a43431520d..4dcc7757faa 100644
--- a/exir/backend/test/test_passes.py
+++ b/exir/backend/test/test_passes.py
@@ -11,8 +11,8 @@
 from executorch.exir.backend.canonical_partitioners.duplicate_constant_node_pass import (
     duplicate_constant_node,
 )
-from torch._export import capture_pre_autograd_graph
 from torch._export.utils import is_buffer
+from torch.export import export_for_training
 from torch.testing import FileCheck


@@ -29,7 +29,7 @@ def forward(self, x):
                 z = x - self.const
                 return y, z

-        model = capture_pre_autograd_graph(ReuseConstData(), (torch.ones(2, 2),))
+        model = export_for_training(ReuseConstData(), (torch.ones(2, 2),)).module()
         edge = exir.to_edge(torch.export.export(model, (torch.ones(2, 2),)))
         const_nodes = [
diff --git a/exir/backend/utils.py b/exir/backend/utils.py
index 2b768fe7c23..fb5e16c6bd0 100644
--- a/exir/backend/utils.py
+++ b/exir/backend/utils.py
@@ -383,6 +383,40 @@ def tag_constant_data(edge_program: ExportedProgram) -> None:
             node.meta["delegation_tag"] = user_tags.pop()


+def tag_mutated_buffer(edge_program: ExportedProgram) -> None:
+    """
+    Util function for partitioners. This function tags the mutated buffer nodes
+    whose users all belong within the same partition. It should be called after
+    tagging all other nodes. Any buffer which is used as input to a subgraph
+    will be tagged with the same tag as that subgraph. Throws an error when a
+    buffer is used across different partitions, that is, when the underlying
+    data would be owned by multiple delegates.
+    """
+    for node in edge_program.graph.nodes:
+        # Determine whether this node is a mutated buffer
+        is_mutated_buffer_node = False
+        if node.op == "placeholder" and is_buffer(edge_program, node):
+            for node_user in node.users:
+                if node_user.name in edge_program.graph_signature.buffers_to_mutate:
+                    is_mutated_buffer_node = True
+                    break
+        # This node is a mutated buffer, tag it
+        if is_mutated_buffer_node:
+            user_tags = set()
+            for user in node.users:
+                user_tag = user.meta.get("delegation_tag", None)
+                if user_tag is not None:
+                    user_tags.add(user_tag)
+            if len(user_tags) > 1:
+                logging.info(
+                    f"The data node is used across multiple partitions, including {user_tags}. "
+                    "If the data is too large and should not be copied, please tag the "
+                    "constant node with node.meta['no_copy'] = True and it won't be copied."
+                )
+            # tag the data node with the same tag as the last user
+            if len(user_tags) > 0:
+                node.meta["delegation_tag"] = user_tags.pop()
+
+
 # TODO - style: use templated types
 class DelegateMappingBuilder:
     """
diff --git a/exir/capture/_config.py b/exir/capture/_config.py
index 2d0a6c4ca80..11a0d6d069d 100644
--- a/exir/capture/_config.py
+++ b/exir/capture/_config.py
@@ -5,10 +5,11 @@
 # LICENSE file in the root directory of this source tree.

 # pyre-unsafe
-
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Union

+import torch
+
 from executorch.exir.dynamic_shape import DynamicMemoryPlanningMode
 from executorch.exir.pass_manager import PassType
 from executorch.exir.passes import MemoryPlanningPass, ToOutVarPass
@@ -38,6 +39,10 @@ class EdgeCompileConfig:
     _check_ir_validity: bool = True
     # TODO(larryliu): remove this
     _use_edge_ops: bool = True
+    # Allow the core ATen ops check to be skipped for certain ops, but continue with the rest of the checks.
+    _core_aten_ops_exception_list: List[torch._ops.OpOverload] = field(
+        default_factory=list
+    )
     _skip_type_promotion: bool = False
     # TODO(gasoonjia): remove this
     # TODO(T192537614): reenable dim order as default
diff --git a/exir/emit/test/test_emit.py b/exir/emit/test/test_emit.py
index f1b980a9aea..123896ecdba 100644
--- a/exir/emit/test/test_emit.py
+++ b/exir/emit/test/test_emit.py
@@ -23,6 +23,7 @@
     ExecutorchProgramManager,
     to_edge,
 )
+from executorch.exir._serialize._program import deserialize_pte_binary
 from executorch.exir.backend.backend_api import to_backend
 from executorch.exir.backend.backend_details import BackendDetails, PreprocessResult
 from executorch.exir.dialects._ops import ops as exir_ops
@@ -35,6 +36,7 @@
 from executorch.exir.schema import (
     Bool,
     DelegateCall,
+    Double,
     EValue,
     ExecutionPlan,
     Int,
@@ -1620,3 +1622,33 @@ def forward(self, x):
         executorch_module = _load_for_executorch_from_buffer(model.buffer)
         self.assertEqual(executorch_module(torch.zeros(1))[0], torch.zeros(1))
         self.assertEqual(executorch_module(torch.zeros(1))[0], torch.zeros(1) + 1)
+
+    def test_infinity_in_model(self) -> None:
+        class InfinityMaskModel(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.mask = torch.tensor([[1, 0], [0, 1]], dtype=torch.float32)
+
+            def forward(self, x):
+                masked_weights = x.masked_fill(self.mask == 0, float("-inf"))
+                return masked_weights
+
+        model = to_edge(
+            export(
+                InfinityMaskModel(),
+                (torch.randn(2, 2),),
+            )
+        )
+
+        # Confirm that we can serialize the model with infinity in it.
+        model = model.to_executorch()
+
+        # Assert that the infinity is stored as a string "-inf".
+        values = model.executorch_program.execution_plan[0].values
+        self.assertEqual(values[5].val, Double(double_val=float("-inf")))
+
+        # Confirm that we can also deserialize the model with infinity in it.
+        pte_data = deserialize_pte_binary(model.buffer)
+        self.assertEqual(
+            pte_data.execution_plan, model.executorch_program.execution_plan
+        )
diff --git a/exir/program/_program.py b/exir/program/_program.py
index 1339760f215..6b72d190f9d 100644
--- a/exir/program/_program.py
+++ b/exir/program/_program.py
@@ -573,6 +573,9 @@ def _to_edge(ep, config: EdgeCompileConfig) -> "ExirExportedProgram":
             EXIRATenDialectVerifier()(ep.exported_program.graph_module)
         except ExportError:
             logging.info(
+                "If a particular operator fails the core ATen IR check, please consider adding it to the exception list. "
+                "Add the operator to _core_aten_ops_exception_list in EdgeCompileConfig. 
This is the recommended way " + "to resolve this type of failure, so that the rest of the IR validation check can still be performed.\n" "If you'd like to disable IR validation checking, please set _check_ir_validity in EdgeCompileConfig, " "like *.to_edge(exir.EdgeCompileConfig(_check_ir_validity=False))." ) @@ -590,7 +593,11 @@ def _to_edge(ep, config: EdgeCompileConfig) -> "ExirExportedProgram": module_call_graph=ep.exported_program.module_call_graph, example_inputs=ep.exported_program.example_inputs, constants=ep.exported_program.constants, - verifiers=[get_aten_verifier(enable=config._check_ir_validity)], + verifiers=[ + get_aten_verifier( + config=config, + ) + ], ), False, ) @@ -698,10 +705,13 @@ def _generate_edge_program( program: ExportedProgram, ops_set_to_not_decompose: Optional[List[torch._ops.OpOverload]] = None, ) -> ExportedProgram: - if config._check_ir_validity: try: - EXIRATenDialectVerifier(ops_set_to_not_decompose)(program.graph_module) + EXIRATenDialectVerifier( + edge_compile_config=config, + class_only=False, + exception_list=ops_set_to_not_decompose, + )(program.graph_module) except ExportError as e: logging.info(f"Input program {name} is not in ATen dialect.") raise e @@ -1020,13 +1030,8 @@ def to_edge_transform_and_lower( edge_manager = edge_manager.to_backend({name: curr_partitioner}) for name, program in edge_manager._edge_programs.items(): - if config._check_ir_validity: - EXIREdgeDialectVerifier( - edge_compile_config=config, - class_only=True, - )()(program.graph_module) - ops_set_to_not_decompose = set() + ops_set_to_not_decompose: Set[torch._ops.OpOverload] = set() partitioners = partitioner.get(name, []) for curr_partitioner in partitioners: curr_op_set, check_op_support = curr_partitioner.ops_to_not_decompose( @@ -1042,6 +1047,13 @@ def to_edge_transform_and_lower( generate_error=True, ) + if config._check_ir_validity: + EXIREdgeDialectVerifier( + edge_compile_config=config, + class_only=True, + exception_list=list(ops_set_to_not_decompose), + )()(program.graph_module) + return edge_manager @@ -1107,6 +1119,7 @@ def __init__( self.compile_config = compile_config or EdgeCompileConfig() if not isinstance(edge_programs, dict): edge_programs = {"forward": edge_programs} + for name, program in edge_programs.items(): try: EXIREdgeDialectVerifier( diff --git a/exir/program/test/test_program.py b/exir/program/test/test_program.py index 4d2f5dfd699..73f023e778b 100644 --- a/exir/program/test/test_program.py +++ b/exir/program/test/test_program.py @@ -531,11 +531,14 @@ def test_edge_manager_dialect(self): ) self.assertTrue(edge_manager.exported_program().dialect == "EDGE") - def _test_edge_dialect_verifier(self, callable, validate_ir=True): + def _test_edge_dialect_verifier( + self, callable, validate_ir=True, exception_list=None + ): from executorch.exir import EdgeCompileConfig edge_compile_config = EdgeCompileConfig( _check_ir_validity=validate_ir, + _core_aten_ops_exception_list=exception_list, ) # pre-autograd export. 
eventually this will become torch.export one = torch.ones(1, dtype=torch.float) @@ -681,3 +684,35 @@ def count_nodes(graph_module, target): ), 1, ) + + def test_edge_dialect_non_core_aten_ops(self): + class LinalgNorm(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.linalg.norm(x) + + from torch._export.verifier import SpecViolationError + + input = torch.arange(9, dtype=torch.float) - 4 + ep = torch.export.export(LinalgNorm(), (input,)) + + # aten::linalg_norm is not a core op, so it should error out + with self.assertRaises(SpecViolationError): + _ = to_edge(ep, compile_config=EdgeCompileConfig(_check_ir_validity=True)) + + # with exception list, it should not error out + try: + # This should not raise error + _ = to_edge( + ep, + compile_config=EdgeCompileConfig( + _check_ir_validity=True, + _core_aten_ops_exception_list=[ + torch.ops.aten.linalg_vector_norm.default + ], + ), + ) + except SpecViolationError: + self.fail("Should not error out on linalg_vector_norm op") diff --git a/exir/schema.py b/exir/schema.py index 706bc611403..9436465459a 100644 --- a/exir/schema.py +++ b/exir/schema.py @@ -75,7 +75,23 @@ class Bool: @dataclass class Double: - double_val: float + double_val: Union[float, str] + + def __init__(self, double_val: float) -> None: + if double_val == float("inf"): + self.double_val = "inf" + elif double_val == float("-inf"): + self.double_val = "-inf" + else: + self.double_val = double_val + + def __post_init__(self) -> None: + if isinstance(self.double_val, str): + assert self.double_val in ["inf", "-inf"] + else: + assert isinstance(self.double_val, float) + assert not self.double_val == float("inf") + assert not self.double_val == float("-inf") @dataclass diff --git a/exir/verification/verifier.py b/exir/verification/verifier.py index 8b6ec91dd3b..b519e20393a 100644 --- a/exir/verification/verifier.py +++ b/exir/verification/verifier.py @@ -52,12 +52,6 @@ def _check_valid_dim_order_ops(op, use_dim_order) -> None: class EXIRATenDialectVerifierBase(Verifier): dialect = "OLD_EXIR_ATEN_DISABLED" - def __init__( - self, exception_list: Optional[List[torch._ops.OpOverload]] = None - ) -> None: - super().__init__() - self._exception_list = exception_list if exception_list else [] - def allowed_getattr_types(self) -> Tuple[Type[Any], ...]: return ( torch.fx.GraphModule, @@ -78,38 +72,68 @@ def __call__(self, *args, **kwargs): raise RuntimeError("") -class EXIRATenDialectVerifier(EXIRATenDialectVerifierBase): - dialect = "OLD_EXIR_ATEN" +def EXIRATenDialectVerifier( # noqa: C901 + edge_compile_config: Optional[EdgeCompileConfig] = None, + class_only: bool = False, + exception_list: Optional[List[torch._ops.OpOverload]] = None, +): + """ + Returns a verifier class that runs ATen dialect specific checks on the graph module. 
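+
+    Illustrative usage (example only; the variable names are not part of this patch):
+
+        EXIRATenDialectVerifier(
+            exception_list=[torch.ops.aten.linalg_vector_norm.default]
+        )(exported_program.graph_module)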
+ """ + # merge the exception list from edge_compile_config and exception_list + if edge_compile_config and edge_compile_config._core_aten_ops_exception_list: + exception_list = edge_compile_config._core_aten_ops_exception_list + ( + exception_list or [] + ) - def _get_exception_list(self) -> List[torch._ops.OpOverload]: - exception_list = [ - torch.ops.aten.mkldnn_rnn_layer.default, - torch.ops.aten._upsample_bilinear2d_aa.default, - torch.ops.aten.quantize_per_tensor.default, - torch.ops.aten.dequantize.self, - torch.ops.aten.max.default, # TODO(T188268054) - torch.ops.aten.min.default, # TODO(T188268054) - torch.ops.aten.full_like.default, # TODO(T183507359) - ] - exception_list += self._exception_list + class _EXIRATenDialectVerifier(EXIRATenDialectVerifierBase): + dialect = "OLD_EXIR_ATEN" - return exception_list + def __init__(self) -> None: + super().__init__() + # Note: here we are using the exception list passed from EXIRATenDialectVerifier function! + self._exception_list = exception_list if exception_list else [] - def check_valid_op(self, op): - if isinstance(op, OpOverload): - # TODO These special ops should be removable easily. - if op.namespace != "aten" or op in self._get_exception_list(): - return - if torch.Tag.core not in op.tags and torch.Tag.view_copy not in op.tags: - # NOTE(qihan): whether view_copy operators are marked as canonical is still under - # discussion. - raise SpecViolationError( - f"Operator {op.__module__}.{op.__name__} is not Aten Canonical." - ) + def _get_exception_list(self) -> List[torch._ops.OpOverload]: + exception_list = [ + torch.ops.aten.mkldnn_rnn_layer.default, + torch.ops.aten._upsample_bilinear2d_aa.default, + torch.ops.aten.quantize_per_tensor.default, + torch.ops.aten.dequantize.self, + torch.ops.aten.max.default, # TODO(T188268054) + torch.ops.aten.min.default, # TODO(T188268054) + torch.ops.aten.full_like.default, # TODO(T183507359) + ] + exception_list += self._exception_list + return exception_list -def get_aten_verifier(enable: bool = True): - return EXIRATenDialectVerifier if enable else EXIRATenDialectVerifierBase + def check_valid_op(self, op): + if isinstance(op, OpOverload): + # TODO These special ops should be removable easily. + if op.namespace != "aten" or op in self._get_exception_list(): + return + if torch.Tag.core not in op.tags and torch.Tag.view_copy not in op.tags: + # NOTE(qihan): whether view_copy operators are marked as canonical is still under + # discussion. + raise SpecViolationError( + f"Operator {op.__module__}.{op.__name__} is not Aten Canonical." 
+ ) + + ret = _EXIRATenDialectVerifier + if not class_only: + ret = ret() + return ret + + +def get_aten_verifier(config: EdgeCompileConfig): + return ( + EXIRATenDialectVerifier( + class_only=True, exception_list=config._core_aten_ops_exception_list + ) + if config._check_ir_validity + else EXIRATenDialectVerifierBase + ) def _get_inputs(graph_module: GraphModule) -> List[Optional[FakeTensor]]: @@ -160,6 +184,12 @@ def EXIREdgeDialectVerifier( # noqa: C901 class_only: bool = False, exception_list: Optional[List[torch._ops.OpOverload]] = None, ): + # merge the exception list from edge_compile_config and exception_list + if edge_compile_config and edge_compile_config._core_aten_ops_exception_list: + exception_list = edge_compile_config._core_aten_ops_exception_list + ( + exception_list or [] + ) + class _EXIREdgeDialectVerifier(Verifier): dialect = "EDGE" @@ -170,7 +200,9 @@ def __init__(self) -> None: self.check_edge_ops = _edge_compile_config._use_edge_ops self.use_dim_order = not _edge_compile_config._skip_dim_order - self.aten_op_verifier = EXIRATenDialectVerifier(exception_list) + self.aten_op_verifier = EXIRATenDialectVerifier( + exception_list=exception_list + ) self.check_valid_aten_op = self.aten_op_verifier.check_valid_op if self.check_edge_ops: diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 74f98960002..ab1f3650102 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -10,7 +10,6 @@ project(executorch_jni) if(NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) - # Can't set to 11 due to executor_runner.cpp make_unique endif() if(NOT ANDROID) @@ -71,78 +70,55 @@ if(TARGET vulkan_backend) list(APPEND link_libraries vulkan_backend) endif() +if(EXECUTORCH_BUILD_KERNELS_CUSTOM) + add_subdirectory( + ${EXECUTORCH_ROOT}/extension/llm/custom_ops + ${CMAKE_CURRENT_BINARY_DIR}/../../extension/llm/custom_ops + ) + list(APPEND link_libraries custom_ops) + target_link_options_shared_lib(custom_ops) +endif() + add_library(executorch_jni SHARED jni/jni_layer.cpp) -target_link_libraries(executorch_jni ${link_libraries}) -target_include_directories( - executorch_jni PRIVATE ${_common_include_directories} -) -target_compile_options(executorch_jni PUBLIC ${_common_compile_options}) if(EXECUTORCH_BUILD_LLAMA_JNI) - set(LLAMA_RUNNER_PATH - ${CMAKE_CURRENT_BINARY_DIR}/../../examples/models/llama2/runner/libllama_runner.a - ) - add_library(llama_runner STATIC IMPORTED) - set_property( - TARGET llama_runner PROPERTY IMPORTED_LOCATION ${LLAMA_RUNNER_PATH} - ) - + target_sources(executorch_jni PRIVATE jni/jni_layer_llama.cpp) + list(APPEND link_libraries llama_runner llava_runner) + target_compile_definitions(executorch_jni PUBLIC EXECUTORCH_BUILD_LLAMA_JNI=1) add_subdirectory( ${EXECUTORCH_ROOT}/examples/models/llava/runner ${CMAKE_CURRENT_BINARY_DIR}/../../examples/models/llava/runner ) - set(CUSTOM_OPS_PATH - ${CMAKE_CURRENT_BINARY_DIR}/../../extension/llm/custom_ops/libcustom_ops.a + add_subdirectory( + ${EXECUTORCH_ROOT}/examples/models/llama2/runner + ${CMAKE_CURRENT_BINARY_DIR}/../../examples/models/llama2/runner ) - add_library(custom_ops STATIC IMPORTED) - set_property(TARGET custom_ops PROPERTY IMPORTED_LOCATION ${CUSTOM_OPS_PATH}) - target_link_options_shared_lib(custom_ops) +endif() +if(TARGET quantized_kernels) + list(APPEND link_libraries quantized_kernels quantized_ops_lib) target_link_options_shared_lib(quantized_ops_lib) +endif() + +target_include_directories( + executorch_jni PRIVATE ${_common_include_directories} 
+) + +target_compile_options(executorch_jni PUBLIC ${_common_compile_options}) + +target_link_libraries(executorch_jni ${link_libraries}) - set(LLAMA_JNI_SRCS jni/jni_layer_llama.cpp) - add_library(executorch_llama_jni SHARED ${LLAMA_JNI_SRCS}) - if(TARGET pthreadpool) - target_compile_definitions(executorch_llama_jni PRIVATE ET_USE_THREADPOOL=1) - target_include_directories( - executorch_llama_jni - PUBLIC - ${CMAKE_CURRENT_SOURCE_DIR}/../../backends/xnnpack/third-party/cpuinfo/include - ) - target_include_directories( - executorch_llama_jni - PUBLIC - ${CMAKE_CURRENT_SOURCE_DIR}/../../backends/xnnpack/third-party/pthreadpool/include - ) - endif() +if(TARGET pthreadpool) + target_compile_definitions(executorch_jni PRIVATE ET_USE_THREADPOOL=1) target_include_directories( - executorch_llama_jni PRIVATE ${_common_include_directories} - ) - target_link_libraries( - executorch_llama_jni - ${link_libraries} - llama_runner - llava_runner - custom_ops - cpublas - eigen_blas - quantized_kernels - quantized_ops_lib + executorch_jni + PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/../../backends/xnnpack/third-party/cpuinfo/include ) - target_compile_options(executorch_llama_jni PUBLIC ${_common_compile_options}) - # link re2 - set(ABSL_ENABLE_INSTALL ON) - set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE}) - set(CMAKE_POSITION_INDEPENDENT_CODE ON) - add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/third-party/abseil-cpp - ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp - ) - add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/third-party/re2 - ${CMAKE_CURRENT_BINARY_DIR}/re2 + target_include_directories( + executorch_jni + PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/../../backends/xnnpack/third-party/pthreadpool/include ) - set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) - target_link_libraries(executorch_llama_jni re2::re2) endif() diff --git a/extension/android/benchmark/app/build.gradle.kts b/extension/android/benchmark/app/build.gradle.kts index b716f2e8bd0..dcf99ca9cd0 100644 --- a/extension/android/benchmark/app/build.gradle.kts +++ b/extension/android/benchmark/app/build.gradle.kts @@ -38,6 +38,7 @@ dependencies { implementation(files("libs/executorch.aar")) implementation("com.facebook.soloader:soloader:0.10.5") implementation("com.facebook.fbjni:fbjni:0.5.1") + implementation("com.google.code.gson:gson:2.8.6") testImplementation("junit:junit:4.13.2") androidTestImplementation("androidx.test.ext:junit:1.2.1") androidTestImplementation("androidx.test.espresso:espresso-core:3.6.1") diff --git a/extension/android/benchmark/app/src/main/AndroidManifest.xml b/extension/android/benchmark/app/src/main/AndroidManifest.xml index 49711b6830e..098905c052c 100644 --- a/extension/android/benchmark/app/src/main/AndroidManifest.xml +++ b/extension/android/benchmark/app/src/main/AndroidManifest.xml @@ -16,6 +16,14 @@ + + + + + + diff --git a/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java index e9599dd3518..a79f668f80b 100644 --- a/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java +++ b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java @@ -11,8 +11,10 @@ import android.app.Activity; import android.content.Intent; import android.os.Bundle; +import java.io.File; import java.io.FileWriter; import java.io.IOException; +import java.util.Arrays; import org.pytorch.executorch.Module; public 
class BenchmarkActivity extends Activity {
@@ -20,13 +22,19 @@ public class BenchmarkActivity extends Activity {
   protected void onCreate(Bundle savedInstanceState) {
     super.onCreate(savedInstanceState);
     Intent intent = getIntent();
-    String modelPath = intent.getStringExtra("model_path");
+    File modelDir = new File(intent.getStringExtra("model_dir"));
+    File model =
+        Arrays.stream(modelDir.listFiles())
+            .filter(file -> file.getName().endsWith(".pte"))
+            .findFirst()
+            .get();
+
     int numIter = intent.getIntExtra("num_iter", 10);

     // TODO: Format the string with a parsable format
     StringBuilder resultText = new StringBuilder();

-    Module module = Module.load(modelPath);
+    Module module = Module.load(model.getPath());
     for (int i = 0; i < numIter; i++) {
       long start = System.currentTimeMillis();
       module.forward();
diff --git a/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmarkActivity.java b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmarkActivity.java
new file mode 100644
index 00000000000..496cbde53d6
--- /dev/null
+++ b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmarkActivity.java
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+package org.pytorch.minibench;
+
+import android.app.Activity;
+import android.content.Intent;
+import android.os.Bundle;
+import android.util.Log;
+import com.google.gson.Gson;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.Arrays;
+
+public class LlmBenchmarkActivity extends Activity implements ModelRunnerCallback {
+  ModelRunner mModelRunner;
+
+  String mPrompt;
+  StatsInfo mStatsInfo;
+
+  @Override
+  protected void onCreate(Bundle savedInstanceState) {
+    super.onCreate(savedInstanceState);
+
+    Intent intent = getIntent();
+
+    File modelDir = new File(intent.getStringExtra("model_dir"));
+    File model =
+        Arrays.stream(modelDir.listFiles())
+            .filter(file -> file.getName().endsWith(".pte"))
+            .findFirst()
+            .get();
+    String tokenizerPath = intent.getStringExtra("tokenizer_path");
+
+    float temperature = intent.getFloatExtra("temperature", 0.8f);
+    mPrompt = intent.getStringExtra("prompt");
+    if (mPrompt == null) {
+      mPrompt = "The ultimate answer";
+    }
+
+    mStatsInfo = new StatsInfo();
+    mModelRunner = new ModelRunner(model.getPath(), tokenizerPath, temperature, this);
+    mStatsInfo.loadStart = System.currentTimeMillis();
+  }
+
+  @Override
+  public void onModelLoaded(int status) {
+    mStatsInfo.loadEnd = System.currentTimeMillis();
+    if (status != 0) {
+      Log.e("LlmBenchmarkRunner", "Load failed: " + status);
+      onGenerationStopped();
+      return;
+    }
+    mStatsInfo.generateStart = System.currentTimeMillis();
+    mModelRunner.generate(mPrompt);
+  }
+
+  @Override
+  public void onTokenGenerated(String token) {}
+
+  @Override
+  public void onStats(String stats) {
+    mStatsInfo.tokens = stats;
+  }
+
+  @Override
+  public void onGenerationStopped() {
+    mStatsInfo.generateEnd = System.currentTimeMillis();
+
+    // TODO (huydhn): Remove txt files here once the JSON format is ready
+    try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.txt")) {
+      writer.write(mStatsInfo.toString());
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+
+    // TODO (huydhn): Figure out what the final JSON results look like; we need something
+    // with the same number of fields as https://github.com/pytorch/pytorch/pull/135042
+    try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.json")) {
+      Gson gson = new Gson();
+      writer.write(gson.toJson(mStatsInfo));
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+  }
+}
+
+class StatsInfo {
+  long loadStart;
+  long loadEnd;
+  long generateStart;
+  long generateEnd;
+  String tokens;
+
+  @Override
+  public String toString() {
+    return "loadStart: "
+        + loadStart
+        + "\nloadEnd: "
+        + loadEnd
+        + "\ngenerateStart: "
+        + generateStart
+        + "\ngenerateEnd: "
+        + generateEnd
+        + "\n"
+        + tokens;
+  }
+}
diff --git a/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.java b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.java
new file mode 100644
index 00000000000..9e9b9e003d8
--- /dev/null
+++ b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.java
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+package org.pytorch.minibench;
+
+import android.os.Handler;
+import android.os.HandlerThread;
+import android.os.Looper;
+import android.os.Message;
+import org.pytorch.executorch.LlamaCallback;
+import org.pytorch.executorch.LlamaModule;
+
+/** A helper class to handle all model running logic. */
+public class ModelRunner implements LlamaCallback {
+  LlamaModule mModule = null;
+
+  String mModelFilePath = "";
+  String mTokenizerFilePath = "";
+
+  ModelRunnerCallback mCallback = null;
+
+  HandlerThread mHandlerThread = null;
+  Handler mHandler = null;
+
+  /**
+   * Helper class to separate UI logic from model runner logic. Automatically handles
+   * generate() requests on a worker thread.
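+   *
+   * <p>Illustrative flow (not part of this patch): construct the runner, then call
+   * {@code generate(prompt)} once {@code onModelLoaded} reports status 0, as
+   * LlmBenchmarkActivity above does.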
+   *
+   * @param modelFilePath
+   * @param tokenizerFilePath
+   * @param temperature
+   * @param callback
+   */
+  ModelRunner(
+      String modelFilePath,
+      String tokenizerFilePath,
+      float temperature,
+      ModelRunnerCallback callback) {
+    mModelFilePath = modelFilePath;
+    mTokenizerFilePath = tokenizerFilePath;
+    mCallback = callback;
+
+    mModule = new LlamaModule(mModelFilePath, mTokenizerFilePath, temperature);
+    mHandlerThread = new HandlerThread("ModelRunner");
+    mHandlerThread.start();
+    mHandler = new ModelRunnerHandler(mHandlerThread.getLooper(), this);
+
+    mHandler.sendEmptyMessage(ModelRunnerHandler.MESSAGE_LOAD_MODEL);
+  }
+
+  int generate(String prompt) {
+    Message msg = Message.obtain(mHandler, ModelRunnerHandler.MESSAGE_GENERATE, prompt);
+    msg.sendToTarget();
+    return 0;
+  }
+
+  void stop() {
+    mModule.stop();
+  }
+
+  @Override
+  public void onResult(String result) {
+    mCallback.onTokenGenerated(result);
+  }
+
+  @Override
+  public void onStats(float tps) {
+    mCallback.onStats("tokens/second: " + tps);
+  }
+}
+
+class ModelRunnerHandler extends Handler {
+  public static final int MESSAGE_LOAD_MODEL = 1;
+  public static final int MESSAGE_GENERATE = 2;
+
+  private final ModelRunner mModelRunner;
+
+  public ModelRunnerHandler(Looper looper, ModelRunner modelRunner) {
+    super(looper);
+    mModelRunner = modelRunner;
+  }
+
+  @Override
+  public void handleMessage(android.os.Message msg) {
+    if (msg.what == MESSAGE_LOAD_MODEL) {
+      int status = mModelRunner.mModule.load();
+      mModelRunner.mCallback.onModelLoaded(status);
+    } else if (msg.what == MESSAGE_GENERATE) {
+      mModelRunner.mModule.generate((String) msg.obj, mModelRunner);
+      mModelRunner.mCallback.onGenerationStopped();
+    }
+  }
+}
diff --git a/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunnerCallback.java b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunnerCallback.java
new file mode 100644
index 00000000000..63701a7bbc6
--- /dev/null
+++ b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunnerCallback.java
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+package org.pytorch.minibench;
+
+/**
+ * A helper interface within the app for MainActivity and benchmarking to handle callbacks from
+ * ModelRunner.
+ */ +public interface ModelRunnerCallback { + + void onModelLoaded(int status); + + void onTokenGenerated(String token); + + void onStats(String token); + + void onGenerationStopped(); +} diff --git a/extension/android/jni/BUCK b/extension/android/jni/BUCK index 7cdf8ef7ec4..3c8f00b2bdc 100644 --- a/extension/android/jni/BUCK +++ b/extension/android/jni/BUCK @@ -70,21 +70,30 @@ fb_android_cxx_library( fb_android_cxx_library( name = "executorch_llama_jni", - srcs = ["jni_layer_llama.cpp"], + srcs = [ + "jni_layer.cpp", + "jni_layer_llama.cpp", + ], + headers = ["jni_layer_constants.h"], allow_jni_merging = False, compiler_flags = [ "-frtti", "-fexceptions", + "-DEXECUTORCH_BUILD_LLAMA_JNI", "-Wno-format", ], - soname = "libexecutorch_llama_jni.$(ext)", + soname = "libexecutorch.$(ext)", visibility = ["PUBLIC"], deps = [ "//fbandroid/libraries/fbjni:fbjni", "//fbandroid/native/fb:fb", "//third-party/glog:glog", + "//xplat/executorch/backends/xnnpack:xnnpack_backend_static", "//xplat/executorch/examples/models/llama2/runner:runner_static", "//xplat/executorch/examples/models/llava/runner:runner_static", + "//xplat/executorch/extension/module:module_static", + "//xplat/executorch/extension/runner_util:inputs_static", + "//xplat/executorch/extension/tensor:tensor_static", "//xplat/executorch/extension/threadpool:cpuinfo_utils_static", "//xplat/executorch/extension/threadpool:threadpool_static", ], diff --git a/extension/android/jni/jni_layer.cpp b/extension/android/jni/jni_layer.cpp index f2cfc4a5cff..1ef81b20b08 100644 --- a/extension/android/jni/jni_layer.cpp +++ b/extension/android/jni/jni_layer.cpp @@ -386,7 +386,15 @@ class ExecuTorchJni : public facebook::jni::HybridClass { }; } // namespace executorch::extension +#ifdef EXECUTORCH_BUILD_LLAMA_JNI +extern void register_natives_for_llama(); +#else +// No op if we don't build llama +void register_natives_for_llama() {} +#endif JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM* vm, void*) { - return facebook::jni::initialize( - vm, [] { executorch::extension::ExecuTorchJni::registerNatives(); }); + return facebook::jni::initialize(vm, [] { + executorch::extension::ExecuTorchJni::registerNatives(); + register_natives_for_llama(); + }); } diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index 0d43317c3ca..e6a9b5de58c 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -30,33 +30,6 @@ #include #include -#ifdef __ANDROID__ -#include - -// For Android, write to logcat -void et_pal_emit_log_message( - et_timestamp_t timestamp, - et_pal_log_level_t level, - const char* filename, - const char* function, - size_t line, - const char* message, - size_t length) { - int android_log_level = ANDROID_LOG_UNKNOWN; - if (level == 'D') { - android_log_level = ANDROID_LOG_DEBUG; - } else if (level == 'I') { - android_log_level = ANDROID_LOG_INFO; - } else if (level == 'E') { - android_log_level = ANDROID_LOG_ERROR; - } else if (level == 'F') { - android_log_level = ANDROID_LOG_FATAL; - } - - __android_log_print(android_log_level, "LLAMA", "%s", message); -} -#endif - using namespace torch::executor; namespace executorch_jni { @@ -150,8 +123,8 @@ class ExecuTorchLlamaJni jint channels, facebook::jni::alias_ref prompt, jint seq_len, - jboolean echo, - facebook::jni::alias_ref callback) { + facebook::jni::alias_ref callback, + jboolean echo) { if (model_type_category_ == MODEL_TYPE_CATEGORY_MULTIMODAL) { auto image_size = image->size(); std::vector images; @@ -170,7 
+143,8 @@ class ExecuTorchLlamaJni prompt->toStdString(), seq_len, [callback](std::string result) { callback->onResult(result); }, - [callback](const Stats& result) { callback->onStats(result); }); + [callback](const Stats& result) { callback->onStats(result); }, + echo); } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) { runner_->generate( prompt->toStdString(), @@ -248,7 +222,8 @@ class ExecuTorchLlamaJni facebook::jni::alias_ref prompt, jint seq_len, jlong start_pos, - facebook::jni::alias_ref callback) { + facebook::jni::alias_ref callback, + jboolean echo) { if (model_type_category_ != MODEL_TYPE_CATEGORY_MULTIMODAL) { return static_cast(Error::NotSupported); } @@ -259,7 +234,8 @@ class ExecuTorchLlamaJni [callback](const std::string& result) { callback->onResult(result); }, [callback](const ::executorch::extension::llm::Stats& stats) { callback->onStats(stats); - })); + }, + echo)); } void stop() { @@ -285,13 +261,18 @@ class ExecuTorchLlamaJni makeNativeMethod("generate", ExecuTorchLlamaJni::generate), makeNativeMethod("stop", ExecuTorchLlamaJni::stop), makeNativeMethod("load", ExecuTorchLlamaJni::load), + makeNativeMethod( + "prefillImagesNative", ExecuTorchLlamaJni::prefill_images), + makeNativeMethod( + "prefillPromptNative", ExecuTorchLlamaJni::prefill_prompt), + makeNativeMethod( + "generateFromPos", ExecuTorchLlamaJni::generate_from_pos), }); } }; } // namespace executorch_jni -JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM* vm, void*) { - return facebook::jni::initialize( - vm, [] { executorch_jni::ExecuTorchLlamaJni::registerNatives(); }); +void register_natives_for_llama() { + executorch_jni::ExecuTorchLlamaJni::registerNatives(); } diff --git a/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java b/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java index c4de23df0ee..7c77dbae08f 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java +++ b/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java @@ -28,7 +28,7 @@ public class LlamaModule { if (!NativeLoader.isInitialized()) { NativeLoader.init(new SystemDelegate()); } - NativeLoader.loadLibrary("executorch_llama_jni"); + NativeLoader.loadLibrary("executorch"); } private final HybridData mHybridData; @@ -60,7 +60,7 @@ public void resetNative() { * @param llamaCallback callback object to receive results. */ public int generate(String prompt, LlamaCallback llamaCallback) { - return generate(prompt, DEFAULT_SEQ_LEN, DEFAULT_ECHO, llamaCallback); + return generate(prompt, DEFAULT_SEQ_LEN, llamaCallback, DEFAULT_ECHO); } /** @@ -71,18 +71,18 @@ public int generate(String prompt, LlamaCallback llamaCallback) { * @param llamaCallback callback object to receive results. */ public int generate(String prompt, int seqLen, LlamaCallback llamaCallback) { - return generate(null, 0, 0, 0, prompt, seqLen, DEFAULT_ECHO, llamaCallback); + return generate(null, 0, 0, 0, prompt, seqLen, llamaCallback, DEFAULT_ECHO); } /** * Start generating tokens from the module. * * @param prompt Input prompt + * @param llamaCallback callback object to receive results * @param echo indicate whether to echo the input prompt or not (text completion vs chat) - * @param llamaCallback callback object to receive results. 
*/ - public int generate(String prompt, boolean echo, LlamaCallback llamaCallback) { - return generate(null, 0, 0, 0, prompt, DEFAULT_SEQ_LEN, echo, llamaCallback); + public int generate(String prompt, LlamaCallback llamaCallback, boolean echo) { + return generate(null, 0, 0, 0, prompt, DEFAULT_SEQ_LEN, llamaCallback, echo); } /** @@ -90,11 +90,11 @@ public int generate(String prompt, boolean echo, LlamaCallback llamaCallback) { * * @param prompt Input prompt * @param seqLen sequence length + * @param llamaCallback callback object to receive results * @param echo indicate whether to echo the input prompt or not (text completion vs chat) - * @param llamaCallback callback object to receive results. */ - public int generate(String prompt, int seqLen, boolean echo, LlamaCallback llamaCallback) { - return generate(null, 0, 0, 0, prompt, seqLen, echo, llamaCallback); + public int generate(String prompt, int seqLen, LlamaCallback llamaCallback, boolean echo) { + return generate(null, 0, 0, 0, prompt, seqLen, llamaCallback, echo); } /** @@ -106,8 +106,8 @@ public int generate(String prompt, int seqLen, boolean echo, LlamaCallback llama * @param channels Input image number of channels * @param prompt Input prompt * @param seqLen sequence length - * @param echo indicate whether to echo the input prompt or not (text completion vs chat) * @param llamaCallback callback object to receive results. + * @param echo indicate whether to echo the input prompt or not (text completion vs chat) */ @DoNotStrip public native int generate( @@ -117,8 +117,8 @@ public native int generate( int channels, String prompt, int seqLen, - boolean echo, - LlamaCallback llamaCallback); + LlamaCallback llamaCallback, + boolean echo); /** * Prefill an LLaVA Module with the given images input. @@ -172,10 +172,11 @@ public long prefillPrompt(String prompt, long startPos, int bos, int eos) { * @param seqLen The total sequence length, including the prompt tokens and new tokens. * @param startPos The starting position in KV cache of the input in the LLM. * @param llamaCallback callback object to receive results. + * @param echo indicate whether to echo the input prompt or not. * @return The error code. */ public native int generateFromPos( - String prompt, int seqLen, long startPos, LlamaCallback callback); + String prompt, int seqLen, long startPos, LlamaCallback callback, boolean echo); /** Stop current generate() before it finishes. 
*/ @DoNotStrip diff --git a/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj b/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj index 4dcffaffbf6..1bc3188fe17 100644 --- a/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj +++ b/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj @@ -10,14 +10,14 @@ 03B2D3682C8A515A0046936E /* App.swift in Sources */ = {isa = PBXBuildFile; fileRef = 03B2D3672C8A515A0046936E /* App.swift */; }; 03B2D37A2C8A515C0046936E /* Tests.mm in Sources */ = {isa = PBXBuildFile; fileRef = 03B2D3792C8A515C0046936E /* Tests.mm */; }; 03C7FA382C8AA3EC00E6E9AE /* Models in Resources */ = {isa = PBXBuildFile; fileRef = 03C7FA322C8AA24200E6E9AE /* Models */; }; - 03ED6CFF2C8AAFB300F2D6EE /* backend_coreml.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6CFE2C8AAFB300F2D6EE /* backend_coreml.xcframework */; }; - 03ED6D012C8AAFB300F2D6EE /* backend_mps.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D002C8AAFB300F2D6EE /* backend_mps.xcframework */; }; - 03ED6D032C8AAFB300F2D6EE /* backend_xnnpack.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D022C8AAFB300F2D6EE /* backend_xnnpack.xcframework */; }; - 03ED6D052C8AAFB300F2D6EE /* executorch.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D042C8AAFB300F2D6EE /* executorch.xcframework */; }; - 03ED6D072C8AAFB300F2D6EE /* kernels_custom.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D062C8AAFB300F2D6EE /* kernels_custom.xcframework */; }; - 03ED6D092C8AAFB300F2D6EE /* kernels_optimized.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D082C8AAFB300F2D6EE /* kernels_optimized.xcframework */; }; - 03ED6D0B2C8AAFB300F2D6EE /* kernels_portable.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D0A2C8AAFB300F2D6EE /* kernels_portable.xcframework */; }; - 03ED6D0D2C8AAFB300F2D6EE /* kernels_quantized.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D0C2C8AAFB300F2D6EE /* kernels_quantized.xcframework */; }; + 03DD00A92C8FE44600FE4619 /* backend_coreml.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD00992C8FE44600FE4619 /* backend_coreml.xcframework */; }; + 03DD00AA2C8FE44600FE4619 /* kernels_custom.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD009A2C8FE44600FE4619 /* kernels_custom.xcframework */; }; + 03DD00AF2C8FE44600FE4619 /* kernels_portable.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD009F2C8FE44600FE4619 /* kernels_portable.xcframework */; }; + 03DD00B02C8FE44600FE4619 /* kernels_optimized.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD00A02C8FE44600FE4619 /* kernels_optimized.xcframework */; }; + 03DD00B12C8FE44600FE4619 /* backend_xnnpack.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD00A12C8FE44600FE4619 /* backend_xnnpack.xcframework */; }; + 03DD00B22C8FE44600FE4619 /* backend_mps.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD00A22C8FE44600FE4619 /* backend_mps.xcframework */; }; + 03DD00B32C8FE44600FE4619 /* executorch.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD00A32C8FE44600FE4619 /* executorch.xcframework */; settings = {ATTRIBUTES = (Required, ); }; }; + 03DD00B52C8FE44600FE4619 /* kernels_quantized.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD00A52C8FE44600FE4619 /* kernels_quantized.xcframework */; }; 03ED6D0F2C8AAFE900F2D6EE /* libsqlite3.0.tbd in Frameworks */ = {isa 
= PBXBuildFile; fileRef = 03ED6D0E2C8AAFE900F2D6EE /* libsqlite3.0.tbd */; }; 03ED6D112C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D102C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework */; }; 03ED6D132C8AAFF700F2D6EE /* MetalPerformanceShaders.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D122C8AAFF700F2D6EE /* MetalPerformanceShaders.framework */; }; @@ -45,14 +45,14 @@ 03B2D3752C8A515C0046936E /* Tests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = Tests.xctest; sourceTree = BUILT_PRODUCTS_DIR; }; 03B2D3792C8A515C0046936E /* Tests.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = Tests.mm; sourceTree = ""; }; 03C7FA322C8AA24200E6E9AE /* Models */ = {isa = PBXFileReference; lastKnownFileType = folder; path = Models; sourceTree = SOURCE_ROOT; }; - 03ED6CFE2C8AAFB300F2D6EE /* backend_coreml.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_coreml.xcframework; path = Frameworks/backend_coreml.xcframework; sourceTree = ""; }; - 03ED6D002C8AAFB300F2D6EE /* backend_mps.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_mps.xcframework; path = Frameworks/backend_mps.xcframework; sourceTree = ""; }; - 03ED6D022C8AAFB300F2D6EE /* backend_xnnpack.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_xnnpack.xcframework; path = Frameworks/backend_xnnpack.xcframework; sourceTree = ""; }; - 03ED6D042C8AAFB300F2D6EE /* executorch.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = executorch.xcframework; path = Frameworks/executorch.xcframework; sourceTree = ""; }; - 03ED6D062C8AAFB300F2D6EE /* kernels_custom.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_custom.xcframework; path = Frameworks/kernels_custom.xcframework; sourceTree = ""; }; - 03ED6D082C8AAFB300F2D6EE /* kernels_optimized.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_optimized.xcframework; path = Frameworks/kernels_optimized.xcframework; sourceTree = ""; }; - 03ED6D0A2C8AAFB300F2D6EE /* kernels_portable.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_portable.xcframework; path = Frameworks/kernels_portable.xcframework; sourceTree = ""; }; - 03ED6D0C2C8AAFB300F2D6EE /* kernels_quantized.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_quantized.xcframework; path = Frameworks/kernels_quantized.xcframework; sourceTree = ""; }; + 03DD00992C8FE44600FE4619 /* backend_coreml.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_coreml.xcframework; path = Frameworks/backend_coreml.xcframework; sourceTree = ""; }; + 03DD009A2C8FE44600FE4619 /* kernels_custom.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_custom.xcframework; path = Frameworks/kernels_custom.xcframework; sourceTree = ""; }; + 03DD009F2C8FE44600FE4619 /* kernels_portable.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_portable.xcframework; path = Frameworks/kernels_portable.xcframework; sourceTree = ""; }; + 03DD00A02C8FE44600FE4619 /* kernels_optimized.xcframework */ = {isa = 
PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_optimized.xcframework; path = Frameworks/kernels_optimized.xcframework; sourceTree = ""; }; + 03DD00A12C8FE44600FE4619 /* backend_xnnpack.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_xnnpack.xcframework; path = Frameworks/backend_xnnpack.xcframework; sourceTree = ""; }; + 03DD00A22C8FE44600FE4619 /* backend_mps.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_mps.xcframework; path = Frameworks/backend_mps.xcframework; sourceTree = ""; }; + 03DD00A32C8FE44600FE4619 /* executorch.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = executorch.xcframework; path = Frameworks/executorch.xcframework; sourceTree = ""; }; + 03DD00A52C8FE44600FE4619 /* kernels_quantized.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_quantized.xcframework; path = Frameworks/kernels_quantized.xcframework; sourceTree = ""; }; 03ED6D0E2C8AAFE900F2D6EE /* libsqlite3.0.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = libsqlite3.0.tbd; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/usr/lib/libsqlite3.0.tbd; sourceTree = DEVELOPER_DIR; }; 03ED6D102C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = MetalPerformanceShadersGraph.framework; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/System/Library/Frameworks/MetalPerformanceShadersGraph.framework; sourceTree = DEVELOPER_DIR; }; 03ED6D122C8AAFF700F2D6EE /* MetalPerformanceShaders.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = MetalPerformanceShaders.framework; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/System/Library/Frameworks/MetalPerformanceShaders.framework; sourceTree = DEVELOPER_DIR; }; @@ -79,14 +79,14 @@ 03ED6D132C8AAFF700F2D6EE /* MetalPerformanceShaders.framework in Frameworks */, 03ED6D112C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework in Frameworks */, 03ED6D0F2C8AAFE900F2D6EE /* libsqlite3.0.tbd in Frameworks */, - 03ED6CFF2C8AAFB300F2D6EE /* backend_coreml.xcframework in Frameworks */, - 03ED6D032C8AAFB300F2D6EE /* backend_xnnpack.xcframework in Frameworks */, - 03ED6D092C8AAFB300F2D6EE /* kernels_optimized.xcframework in Frameworks */, - 03ED6D012C8AAFB300F2D6EE /* backend_mps.xcframework in Frameworks */, - 03ED6D0D2C8AAFB300F2D6EE /* kernels_quantized.xcframework in Frameworks */, - 03ED6D0B2C8AAFB300F2D6EE /* kernels_portable.xcframework in Frameworks */, - 03ED6D052C8AAFB300F2D6EE /* executorch.xcframework in Frameworks */, - 03ED6D072C8AAFB300F2D6EE /* kernels_custom.xcframework in Frameworks */, + 03DD00A92C8FE44600FE4619 /* backend_coreml.xcframework in Frameworks */, + 03DD00B22C8FE44600FE4619 /* backend_mps.xcframework in Frameworks */, + 03DD00B12C8FE44600FE4619 /* backend_xnnpack.xcframework in Frameworks */, + 03DD00B32C8FE44600FE4619 /* executorch.xcframework in Frameworks */, + 03DD00AA2C8FE44600FE4619 /* kernels_custom.xcframework in Frameworks */, + 03DD00B02C8FE44600FE4619 /* kernels_optimized.xcframework in Frameworks */, + 03DD00AF2C8FE44600FE4619 /* kernels_portable.xcframework in Frameworks */, + 03DD00B52C8FE44600FE4619 /* kernels_quantized.xcframework in Frameworks */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -141,14 +141,14 
@@ 03ED6D122C8AAFF700F2D6EE /* MetalPerformanceShaders.framework */, 03ED6D102C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework */, 03ED6D0E2C8AAFE900F2D6EE /* libsqlite3.0.tbd */, - 03ED6CFE2C8AAFB300F2D6EE /* backend_coreml.xcframework */, - 03ED6D002C8AAFB300F2D6EE /* backend_mps.xcframework */, - 03ED6D022C8AAFB300F2D6EE /* backend_xnnpack.xcframework */, - 03ED6D042C8AAFB300F2D6EE /* executorch.xcframework */, - 03ED6D062C8AAFB300F2D6EE /* kernels_custom.xcframework */, - 03ED6D082C8AAFB300F2D6EE /* kernels_optimized.xcframework */, - 03ED6D0A2C8AAFB300F2D6EE /* kernels_portable.xcframework */, - 03ED6D0C2C8AAFB300F2D6EE /* kernels_quantized.xcframework */, + 03DD00992C8FE44600FE4619 /* backend_coreml.xcframework */, + 03DD00A22C8FE44600FE4619 /* backend_mps.xcframework */, + 03DD00A12C8FE44600FE4619 /* backend_xnnpack.xcframework */, + 03DD00A32C8FE44600FE4619 /* executorch.xcframework */, + 03DD009A2C8FE44600FE4619 /* kernels_custom.xcframework */, + 03DD00A02C8FE44600FE4619 /* kernels_optimized.xcframework */, + 03DD009F2C8FE44600FE4619 /* kernels_portable.xcframework */, + 03DD00A52C8FE44600FE4619 /* kernels_quantized.xcframework */, ); name = Frameworks; sourceTree = SOURCE_ROOT; diff --git a/extension/apple/Benchmark/Tests/Tests.mm b/extension/apple/Benchmark/Tests/Tests.mm index 5cf958765d3..dd85cb69542 100644 --- a/extension/apple/Benchmark/Tests/Tests.mm +++ b/extension/apple/Benchmark/Tests/Tests.mm @@ -22,82 +22,105 @@ @interface Tests : XCTestCase @implementation Tests + (void)initialize { - if (self == [Tests class]) { - NSString *modelsDir = [[NSBundle bundleForClass:[self class]].resourcePath - stringByAppendingPathComponent:@"Models"]; - NSArray *models = - [NSFileManager.defaultManager contentsOfDirectoryAtPath:modelsDir - error:nil]; - for (NSString *model in models) { - NSString *modelName = model.stringByDeletingPathExtension; - NSString *modelPath = [modelsDir stringByAppendingPathComponent:model]; - XCTAssertGreaterThan(modelPath.length, 0); - - SEL testLoadSelector = NSSelectorFromString( - [NSString stringWithFormat:@"test_load_%@", modelName]); - IMP testLoadImplementation = imp_implementationWithBlock(^(id _self) { - auto __block module = std::make_unique(modelPath.UTF8String); - [_self - measureWithMetrics:@[ [XCTClockMetric new], [XCTMemoryMetric new] ] - options:XCTMeasureOptions.defaultOptions - block:^{ - XCTAssertEqual(module->load_method("forward"), - Error::Ok); - }]; - }); - class_addMethod( - [self class], testLoadSelector, testLoadImplementation, "v@:"); - - SEL testForwardSelector = NSSelectorFromString( - [NSString stringWithFormat:@"test_forward_%@", modelName]); - IMP testForwardImplementation = imp_implementationWithBlock(^(id _self) { - auto __block module = std::make_unique(modelPath.UTF8String); - XCTAssertEqual(module->load_method("forward"), Error::Ok); - - const auto method_meta = module->method_meta("forward"); - XCTAssertEqual(method_meta.error(), Error::Ok); - - const auto num_inputs = method_meta->num_inputs(); - XCTAssertGreaterThan(num_inputs, 0); - - std::vector> buffers; - buffers.reserve(num_inputs); - std::vector tensors; - tensors.reserve(num_inputs); - std::vector __block inputs; - inputs.reserve(num_inputs); - - for (auto index = 0; index < num_inputs; ++index) { - auto input_tag = method_meta->input_tag(index); - XCTAssertEqual(input_tag.error(), Error::Ok); - - switch (*input_tag) { - case Tag::Tensor: { - const auto tensor_meta = method_meta->input_tensor_meta(index); - XCTAssertEqual(tensor_meta.error(), 
Error::Ok); - - const auto sizes = tensor_meta->sizes(); - buffers.emplace_back(tensor_meta->nbytes(), - 0b01010101); // Set all bytes to be non-zero. - tensors.emplace_back(from_blob(buffers.rbegin()->data(), - {sizes.begin(), sizes.end()}, - tensor_meta->scalar_type())); - inputs.emplace_back(tensors.back()); - } break; - default: - XCTFail("Unsupported tag %i at input %d", *input_tag, index); - } + if (self != [self class]) { + return; + } + for (NSBundle *bundle in @[ + [NSBundle mainBundle], + [NSBundle bundleForClass:[self class]], + ]) { + for (NSString *directory in @[ + @"Models", + @"aatp/data", + ]) { + NSString *directoryPath = + [bundle.resourcePath stringByAppendingPathComponent:directory]; + NSArray *filePaths = + [NSFileManager.defaultManager contentsOfDirectoryAtPath:directoryPath + error:nil]; + for (NSString *filePath in filePaths) { + if (![filePath hasSuffix:@".pte"]) { + continue; } - [_self - measureWithMetrics:@[ [XCTClockMetric new], [XCTMemoryMetric new] ] - options:XCTMeasureOptions.defaultOptions - block:^{ - XCTAssertEqual(module->forward(inputs).error(), - Error::Ok); - }]; - }); - class_addMethod( - [self class], testForwardSelector, testForwardImplementation, "v@:"); + NSString *modelPath = + [directoryPath stringByAppendingPathComponent:filePath]; + NSString *directoryName = + [directory stringByReplacingOccurrencesOfString:@"/" + withString:@"_"] + .lowercaseString; + NSString *modelName = + modelPath.lastPathComponent.stringByDeletingPathExtension; + + SEL testLoadSelector = NSSelectorFromString([NSString + stringWithFormat:@"test_load_%@_%@", directoryName, modelName]); + IMP testLoadImplementation = imp_implementationWithBlock(^(id _self) { + auto __block module = std::make_unique<Module>(modelPath.UTF8String); + [_self measureWithMetrics:@[ + [XCTClockMetric new], + [XCTMemoryMetric new], + ] + options:XCTMeasureOptions.defaultOptions + block:^{ + XCTAssertEqual(module->load_method("forward"), + Error::Ok); + }]; + }); + class_addMethod( + [self class], testLoadSelector, testLoadImplementation, "v@:"); + + SEL testForwardSelector = NSSelectorFromString([NSString + stringWithFormat:@"test_forward_%@_%@", directoryName, modelName]); + IMP testForwardImplementation = imp_implementationWithBlock(^( + id _self) { + auto __block module = std::make_unique<Module>(modelPath.UTF8String); + XCTAssertEqual(module->load_method("forward"), Error::Ok); + + const auto method_meta = module->method_meta("forward"); + XCTAssertEqual(method_meta.error(), Error::Ok); + + const auto num_inputs = method_meta->num_inputs(); + XCTAssertGreaterThan(num_inputs, 0); + + std::vector<TensorPtr> __block tensors; + tensors.reserve(num_inputs); + std::vector<EValue> __block inputs; + inputs.reserve(num_inputs); + + for (auto index = 0; index < num_inputs; ++index) { + const auto input_tag = method_meta->input_tag(index); + XCTAssertEqual(input_tag.error(), Error::Ok); + + switch (*input_tag) { + case Tag::Tensor: { + const auto tensor_meta = method_meta->input_tensor_meta(index); + XCTAssertEqual(tensor_meta.error(), Error::Ok); + + const auto sizes = tensor_meta->sizes(); + tensors.emplace_back(make_tensor_ptr( + tensor_meta->scalar_type(), + {sizes.begin(), sizes.end()}, + std::vector<uint8_t>(tensor_meta->nbytes(), 0b01010101))); + inputs.emplace_back(tensors.back()); + } break; + default: + XCTFail("Unsupported tag %i at input %d", *input_tag, index); + } + } + [_self measureWithMetrics:@[ + [XCTClockMetric new], + [XCTMemoryMetric new], + ] + options:XCTMeasureOptions.defaultOptions + block:^{ +
XCTAssertEqual(module->forward(inputs).error(), + Error::Ok); + }]; + }); + class_addMethod([self class], + testForwardSelector, + testForwardImplementation, + "v@:"); + } } } } diff --git a/extension/kernel_util/make_boxed_from_unboxed_functor.h b/extension/kernel_util/make_boxed_from_unboxed_functor.h index 2b21914f49b..409c981cbb1 100644 --- a/extension/kernel_util/make_boxed_from_unboxed_functor.h +++ b/extension/kernel_util/make_boxed_from_unboxed_functor.h @@ -173,9 +173,9 @@ static executorch::runtime::Kernel make_boxed_kernel( } // namespace extension } // namespace executorch -#define EXECUTORCH_LIBRARY(ns, op_name, func) \ - static auto res_##ns = ::executorch::runtime::register_kernels( \ - ::executorch::extension::make_boxed_kernel( \ +#define EXECUTORCH_LIBRARY(ns, op_name, func) \ + static auto res_##ns = ::executorch::runtime::register_kernel( \ + ::executorch::extension::make_boxed_kernel( \ #ns "::" op_name, EXECUTORCH_FN(func))) namespace torch { diff --git a/extension/kernel_util/test/make_boxed_from_unboxed_functor_test.cpp b/extension/kernel_util/test/make_boxed_from_unboxed_functor_test.cpp index da9596def70..dce3694d517 100644 --- a/extension/kernel_util/test/make_boxed_from_unboxed_functor_test.cpp +++ b/extension/kernel_util/test/make_boxed_from_unboxed_functor_test.cpp @@ -21,10 +21,11 @@ using exec_aten::ScalarType; using exec_aten::Tensor; using exec_aten::TensorImpl; using executorch::runtime::BoxedEvalueList; +using executorch::runtime::Error; using executorch::runtime::EValue; -using executorch::runtime::getOpsFn; -using executorch::runtime::hasOpsFn; +using executorch::runtime::get_op_function_from_registry; using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::registry_has_op_function; Tensor& my_op_out(KernelRuntimeContext& ctx, const Tensor& a, Tensor& out) { (void)ctx; @@ -91,12 +92,12 @@ class MakeBoxedFromUnboxedFunctorTest : public ::testing::Test { TEST_F(MakeBoxedFromUnboxedFunctorTest, Basic) { EXECUTORCH_LIBRARY(my_ns, "my_op.out", my_op_out); - EXPECT_TRUE(hasOpsFn("my_ns::my_op.out")); + EXPECT_TRUE(registry_has_op_function("my_ns::my_op.out")); } TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxLogicWorks) { EXECUTORCH_LIBRARY(my_ns, "set_1.out", set_1_out); - EXPECT_TRUE(hasOpsFn("my_ns::set_1.out")); + EXPECT_TRUE(registry_has_op_function("my_ns::set_1.out")); // prepare out tensor TensorImpl::SizesType sizes[1] = {5}; @@ -106,7 +107,8 @@ TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxLogicWorks) { auto a = Tensor(&a_impl); // get boxed callable - auto fn = getOpsFn("my_ns::set_1.out"); + auto fn = get_op_function_from_registry("my_ns::set_1.out"); + ASSERT_EQ(fn.error(), Error::Ok); // run it KernelRuntimeContext context; @@ -115,7 +117,7 @@ EValue* stack[1]; stack[0] = &values[0]; - fn(context, stack); + (*fn)(context, stack); // check result EXPECT_EQ(a.const_data_ptr<int32_t>()[0], 1); @@ -123,7 +125,7 @@ TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxArrayRef) { EXECUTORCH_LIBRARY(my_ns, "add_tensor.out", add_tensor_out); - EXPECT_TRUE(hasOpsFn("my_ns::add_tensor.out")); + EXPECT_TRUE(registry_has_op_function("my_ns::add_tensor.out")); // prepare ArrayRef input. torch::executor::testing::TensorFactory<ScalarType::Int> tf; @@ -135,13 +137,14 @@ TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxArrayRef) { // prepare out tensor.
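For orientation, the rewritten `Tests.mm` above reduces to one input-synthesis pattern: read the `MethodMeta` of `forward`, allocate a non-zero byte buffer per tensor input, and wrap it with `make_tensor_ptr`. The following is a minimal C++ sketch of that pattern, assuming the `Module`/`TensorPtr` APIs used in the hunk; the helper name `run_forward_once` and the header paths are illustrative, not part of the PR.

```cpp
#include <cstdint>
#include <vector>

#include <executorch/extension/module/module.h> // assumed header paths
#include <executorch/extension/tensor/tensor.h>

using executorch::extension::Module;
using executorch::extension::TensorPtr;
using executorch::extension::make_tensor_ptr;
using executorch::runtime::Error;
using executorch::runtime::EValue;
using executorch::runtime::Tag;

// Load a .pte file and run "forward" once on synthesized inputs,
// mirroring the test bodies registered above.
Error run_forward_once(const char* model_path) {
  Module module(model_path);
  if (const auto error = module.load_method("forward"); error != Error::Ok) {
    return error;
  }
  const auto method_meta = module.method_meta("forward");
  if (method_meta.error() != Error::Ok) {
    return method_meta.error();
  }
  std::vector<TensorPtr> tensors;
  std::vector<EValue> inputs;
  for (size_t index = 0; index < method_meta->num_inputs(); ++index) {
    const auto input_tag = method_meta->input_tag(index);
    if (input_tag.error() != Error::Ok || *input_tag != Tag::Tensor) {
      continue; // only tensor inputs are synthesized in this sketch
    }
    const auto tensor_meta = method_meta->input_tensor_meta(index);
    const auto sizes = tensor_meta->sizes();
    // Fill every byte with a non-zero pattern, as the test above does.
    tensors.emplace_back(make_tensor_ptr(
        tensor_meta->scalar_type(),
        {sizes.begin(), sizes.end()},
        std::vector<uint8_t>(tensor_meta->nbytes(), 0b01010101)));
    inputs.emplace_back(tensors.back());
  }
  return module.forward(inputs).error();
}
```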
EValue out(tf.zeros({5})); - auto fn = getOpsFn("my_ns::add_tensor.out"); + auto fn = get_op_function_from_registry("my_ns::add_tensor.out"); + ASSERT_EQ(fn.error(), Error::Ok); // run it. KernelRuntimeContext context; EValue values[2] = {boxed_array_ref, out}; EValue* stack[2] = {&values[0], &values[1]}; - fn(context, stack); + (*fn)(context, stack); // check result. for (int i = 0; i < 5; i++) { @@ -151,7 +154,7 @@ TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxOptional) { EXECUTORCH_LIBRARY(my_ns, "add_optional_scalar.out", add_optional_scalar_out); - EXPECT_TRUE(hasOpsFn("my_ns::add_optional_scalar.out")); + EXPECT_TRUE(registry_has_op_function("my_ns::add_optional_scalar.out")); // prepare optional input. EValue scalar((int64_t)3); @@ -160,13 +163,14 @@ // prepare out tensor. torch::executor::testing::TensorFactory<ScalarType::Int> tf; EValue out(tf.ones({1})); - auto fn = getOpsFn("my_ns::add_optional_scalar.out"); + auto fn = get_op_function_from_registry("my_ns::add_optional_scalar.out"); + ASSERT_EQ(fn.error(), Error::Ok); // run it. KernelRuntimeContext context; EValue values[3] = {scalar, scalar_none, out}; EValue* stack[3] = {&values[0], &values[1], &values[2]}; - fn(context, stack); + (*fn)(context, stack); // check result. EXPECT_EQ(stack[2]->toTensor().const_data_ptr<int32_t>()[0], 4); @@ -174,7 +178,7 @@ TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxOptionalArrayRef) { EXECUTORCH_LIBRARY(my_ns, "add_optional_tensor.out", add_optional_tensor_out); - EXPECT_TRUE(hasOpsFn("my_ns::add_optional_tensor.out")); + EXPECT_TRUE(registry_has_op_function("my_ns::add_optional_tensor.out")); // prepare optional tensors. torch::executor::testing::TensorFactory<ScalarType::Int> tf; @@ -186,13 +190,14 @@ TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxOptionalArrayRef) { // prepare out tensor. EValue out(tf.zeros({5})); - auto fn = getOpsFn("my_ns::add_optional_tensor.out"); + auto fn = get_op_function_from_registry("my_ns::add_optional_tensor.out"); + ASSERT_EQ(fn.error(), Error::Ok); // run it. KernelRuntimeContext context; EValue values[2] = {boxed_array_ref, out}; EValue* stack[2] = {&values[0], &values[1]}; - fn(context, stack); + (*fn)(context, stack); // check result.
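The test migration above follows one pattern throughout: `hasOpsFn`/`getOpsFn` become `registry_has_op_function`/`get_op_function_from_registry`, and the lookup now returns a `Result` that must be checked before the boxed function is invoked through `operator*`. A condensed sketch of that calling convention; the helper name `invoke_registered_op` is illustrative, and the header path is an assumption:

```cpp
#include <executorch/runtime/kernel/operator_registry.h> // assumed header

using executorch::runtime::Error;
using executorch::runtime::EValue;
using executorch::runtime::KernelRuntimeContext;

// Look up and invoke a boxed kernel by its fully qualified name,
// e.g. "my_ns::my_op.out". Returns false if the op cannot be resolved.
bool invoke_registered_op(
    const char* name,
    KernelRuntimeContext& context,
    EValue** stack) {
  if (!executorch::runtime::registry_has_op_function(name)) {
    return false; // never registered, e.g. via EXECUTORCH_LIBRARY
  }
  auto fn = executorch::runtime::get_op_function_from_registry(name);
  if (fn.error() != Error::Ok) {
    return false; // the Result must be checked before dereferencing
  }
  (*fn)(context, stack); // dereference the Result to reach the callable
  return true;
}
```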
for (int i = 0; i < 5; i++) { diff --git a/extension/llm/custom_ops/op_sdpa.cpp b/extension/llm/custom_ops/op_sdpa.cpp index 56db1c208ea..c5ac365825b 100644 --- a/extension/llm/custom_ops/op_sdpa.cpp +++ b/extension/llm/custom_ops/op_sdpa.cpp @@ -158,7 +158,7 @@ static inline scalar_t* conditional_data_ptr(scalar_t* ptr, scalar_t* ptr2) { template < typename scalar_t, typename std::enable_if_t< - ::executorch::runtime::is_reduced_floating_point<scalar_t>::value, + ::executorch::runtime::is_reduced_floating_point_v<scalar_t>, int> = 0> static inline scalar_t* conditional_data_ptr(float* ptr, scalar_t* ptr2) { (void)ptr; @@ -247,7 +247,7 @@ void cpu_flash_attention( "KV_split_size must be greater than q_split_size"); constexpr bool is_reduced_type = - ::executorch::runtime::is_reduced_floating_point<scalar_t>::value; + ::executorch::runtime::is_reduced_floating_point_v<scalar_t>; ET_CHECK_MSG( !is_reduced_type, "FlashAttention does not support reduced types."); diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/.clang-format b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/.clang-format deleted file mode 100644 index 4b3f13fa55e..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/.clang-format +++ /dev/null @@ -1,5 +0,0 @@ ---- -Language: Cpp -BasedOnStyle: Google -... - diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/.gitignore b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/.gitignore deleted file mode 100644 index 3c1b4f2183e..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/.gitignore +++ /dev/null @@ -1,46 +0,0 @@ -*.a -*.so -*.so.?* -*.dll -*.exe -*.dylib -*.cmake -!/cmake/*.cmake -*~ -*.pyc -__pycache__ - -# lcov -*.lcov -/lcov - -# cmake files. -/Testing -CMakeCache.txt -CMakeFiles/ -cmake_install.cmake - -# makefiles. -Makefile - -# in-source build. -bin/ -lib/ -/test/*_test - -# exuberant ctags. -tags - -# YouCompleteMe configuration. -.ycm_extra_conf.pyc - -# ninja generated files. -.ninja_deps -.ninja_log -build.ninja -install_manifest.txt -rules.ninja - -# out-of-source build top-level folders.
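The `op_sdpa.cpp` hunk above swaps the `::value` spelling for the `_v` variable template. For readers unfamiliar with the idiom, this is the conventional C++17 shorthand; a generic sketch, assuming a trait shaped like the one in `executorch/runtime` (the real trait is specialized for the reduced floating-point types such as `Half` and `BFloat16`):

```cpp
#include <type_traits>

// The trait itself; specializations mark the reduced floating-point types.
template <typename T>
struct is_reduced_floating_point : std::false_type {};

// C++17 variable template: is_reduced_floating_point_v<T> is shorthand
// for is_reduced_floating_point<T>::value, as used in op_sdpa.cpp above.
template <typename T>
constexpr bool is_reduced_floating_point_v =
    is_reduced_floating_point<T>::value;
```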
-build/ -_build/ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/.travis-libcxx-setup.sh b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/.travis-libcxx-setup.sh deleted file mode 100644 index a591743c6a6..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/.travis-libcxx-setup.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env bash - -# Install a newer CMake version -curl -sSL https://cmake.org/files/v3.6/cmake-3.6.1-Linux-x86_64.sh -o install-cmake.sh -chmod +x install-cmake.sh -sudo ./install-cmake.sh --prefix=/usr/local --skip-license - -# Checkout LLVM sources -git clone --depth=1 https://github.com/llvm-mirror/llvm.git llvm-source -git clone --depth=1 https://github.com/llvm-mirror/libcxx.git llvm-source/projects/libcxx -git clone --depth=1 https://github.com/llvm-mirror/libcxxabi.git llvm-source/projects/libcxxabi - -# Setup libc++ options -if [ -z "$BUILD_32_BITS" ]; then - export BUILD_32_BITS=OFF && echo disabling 32 bit build -fi - -# Build and install libc++ (Use unstable ABI for better sanitizer coverage) -mkdir llvm-build && cd llvm-build -cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=/usr \ - -DLIBCXX_ABI_UNSTABLE=ON \ - -DLLVM_USE_SANITIZER=${LIBCXX_SANITIZER} \ - -DLLVM_BUILD_32_BITS=${BUILD_32_BITS} \ - ../llvm-source -make cxx -j2 -sudo make install-cxxabi install-cxx -cd ../ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/.travis.yml b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/.travis.yml deleted file mode 100644 index 36df088446c..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/.travis.yml +++ /dev/null @@ -1,157 +0,0 @@ -sudo: required -dist: trusty -language: cpp - -env: - global: - - /usr/local/bin:$PATH - -matrix: - include: - - compiler: gcc - addons: - apt: - packages: - - lcov - env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Coverage - - compiler: gcc - env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Debug - - compiler: gcc - env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Release - - compiler: gcc - addons: - apt: - packages: - - g++-multilib - env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Debug BUILD_32_BITS=ON - - compiler: gcc - addons: - apt: - packages: - - g++-multilib - env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Release BUILD_32_BITS=ON - - compiler: gcc - addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - g++-6 - env: - - COMPILER=g++-6 C_COMPILER=gcc-6 BUILD_TYPE=Debug - - EXTRA_FLAGS="-fno-omit-frame-pointer -g -O2 -fsanitize=undefined,address -fuse-ld=gold" - - compiler: clang - env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Debug - - compiler: clang - env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Release - # Clang w/ libc++ - - compiler: clang - addons: - apt: - packages: - clang-3.8 - env: - - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug - - LIBCXX_BUILD=1 - - EXTRA_FLAGS="-stdlib=libc++" - - compiler: clang - addons: - apt: - packages: - clang-3.8 - env: - - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release - - LIBCXX_BUILD=1 - - EXTRA_FLAGS="-stdlib=libc++" - # Clang w/ 32bit libc++ - - compiler: clang - addons: - apt: - packages: - - clang-3.8 - - g++-multilib - env: - - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug - - LIBCXX_BUILD=1 - - BUILD_32_BITS=ON - - EXTRA_FLAGS="-stdlib=libc++ -m32" - # Clang w/ 32bit libc++ - - compiler: clang - addons: - apt: - packages: - - clang-3.8 - 
- g++-multilib - env: - - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release - - LIBCXX_BUILD=1 - - BUILD_32_BITS=ON - - EXTRA_FLAGS="-stdlib=libc++ -m32" - # Clang w/ libc++, ASAN, UBSAN - - compiler: clang - addons: - apt: - packages: - clang-3.8 - env: - - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug - - LIBCXX_BUILD=1 LIBCXX_SANITIZER="Undefined;Address" - - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=undefined,address -fno-sanitize-recover=all" - - UBSAN_OPTIONS=print_stacktrace=1 - # Clang w/ libc++ and MSAN - - compiler: clang - addons: - apt: - packages: - clang-3.8 - env: - - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug - - LIBCXX_BUILD=1 LIBCXX_SANITIZER=MemoryWithOrigins - - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins" - # Clang w/ libc++ and MSAN - - compiler: clang - addons: - apt: - packages: - clang-3.8 - env: - - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=RelWithDebInfo - - LIBCXX_BUILD=1 LIBCXX_SANITIZER=Thread - - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all" - - - os: osx - osx_image: xcode8.3 - compiler: clang - env: - - COMPILER=clang++ BUILD_TYPE=Debug - - os: osx - osx_image: xcode8.3 - compiler: clang - env: - - COMPILER=clang++ BUILD_TYPE=Release - -before_script: - - if [ -z "$BUILD_32_BITS" ]; then - export BUILD_32_BITS=OFF && echo disabling 32 bit build; - fi - - if [ -n "${LIBCXX_BUILD}" ]; then - source .travis-libcxx-setup.sh; - fi - - mkdir build && cd build - -install: - - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then - PATH=~/.local/bin:${PATH}; - pip install --user --upgrade pip; - pip install --user cpp-coveralls; - fi - -script: - - cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_CXX_FLAGS="${EXTRA_FLAGS}" -DBENCHMARK_BUILD_32_BITS=${BUILD_32_BITS} .. - - make - - ctest -C ${BUILD_TYPE} --output-on-failure - -after_success: - - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then - coveralls --include src --include include --gcov-options '\-lp' --root .. --build-root .; - fi diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/AUTHORS b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/AUTHORS deleted file mode 100644 index ae278df4046..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/AUTHORS +++ /dev/null @@ -1,40 +0,0 @@ -# This is the official list of benchmark authors for copyright purposes. -# This file is distinct from the CONTRIBUTORS files. -# See the latter for an explanation. -# -# Names should be added to this file as: -# Name or Organization -# The email address is not required for organizations. -# -# Please keep the list sorted. - -Albert Pretorius -Arne Beer -Christopher Seymour -David Coeurjolly -Dominic Hamon -Eric Fiselier -Eugene Zhuk -Evgeny Safronov -Felix Homann -Google Inc. 
-International Business Machines Corporation -Ismael Jimenez Martinez -Jern-Kuan Leong -Joao Paulo Magalhaes -JianXiong Zhou -Jussi Knuuttila -Kaito Udagawa -Lei Xu -Matt Clarkson -Maxim Vafin -Nick Hutchinson -Oleksandr Sochka -Paul Redmond -Radoslav Yovchev -Shuo Chen -Yixuan Qiu -Yusuke Suzuki -Dirac Research -Zbigniew Skowron -Dominik Czarnota diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/CMakeLists.txt b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/CMakeLists.txt deleted file mode 100644 index f7f1566f569..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/CMakeLists.txt +++ /dev/null @@ -1,202 +0,0 @@ -cmake_minimum_required (VERSION 2.8.12) - -project (benchmark) - -foreach(p - CMP0054 # CMake 3.1 - CMP0056 # export EXE_LINKER_FLAGS to try_run - ) - if(POLICY ${p}) - cmake_policy(SET ${p} NEW) - endif() -endforeach() - -option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON) -option(BENCHMARK_ENABLE_EXCEPTIONS "Enable the use of exceptions in the benchmark library." ON) -option(BENCHMARK_ENABLE_LTO "Enable link time optimisation of the benchmark library." OFF) -option(BENCHMARK_USE_LIBCXX "Build and test using libc++ as the standard library." OFF) -option(BENCHMARK_BUILD_32_BITS "Build a 32 bit version of the library" OFF) - -# Make sure we can import out CMake functions -list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") - -# Read the git tags to determine the project version -include(GetGitVersion) -get_git_version(GIT_VERSION) - -# Tell the user what versions we are using -string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" VERSION ${GIT_VERSION}) -message("-- Version: ${VERSION}") - -# The version of the libraries -set(GENERIC_LIB_VERSION ${VERSION}) -string(SUBSTRING ${VERSION} 0 1 GENERIC_LIB_SOVERSION) - -# Import our CMake modules -include(CheckCXXCompilerFlag) -include(AddCXXCompilerFlag) -include(CXXFeatureCheck) - -if (BENCHMARK_BUILD_32_BITS) - add_required_cxx_compiler_flag(-m32) -endif() - -if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") - # Turn compiler warnings up to 11 - string(REGEX REPLACE "[-/]W[1-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4") - add_definitions(-D_CRT_SECURE_NO_WARNINGS) - - if (NOT BENCHMARK_ENABLE_EXCEPTIONS) - add_cxx_compiler_flag(-EHs-) - add_cxx_compiler_flag(-EHa-) - endif() - # Link time optimisation - if (BENCHMARK_ENABLE_LTO) - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /GL") - set(CMAKE_STATIC_LINKER_FLAGS_RELEASE "${CMAKE_STATIC_LINKER_FLAGS_RELEASE} /LTCG") - set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /LTCG") - set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /LTCG") - - set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /GL") - string(REGEX REPLACE "[-/]INCREMENTAL" "/INCREMENTAL:NO" CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO}") - set(CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO} /LTCG") - string(REGEX REPLACE "[-/]INCREMENTAL" "/INCREMENTAL:NO" CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO}") - set(CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO} /LTCG") - string(REGEX REPLACE "[-/]INCREMENTAL" "/INCREMENTAL:NO" CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO}") - set(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO 
"${CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO} /LTCG") - - set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /GL") - set(CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL "${CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL} /LTCG") - set(CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL "${CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL} /LTCG") - set(CMAKE_EXE_LINKER_FLAGS_MINSIZEREL "${CMAKE_EXE_LINKER_FLAGS_MINSIZEREL} /LTCG") - endif() -else() - # Try and enable C++11. Don't use C++14 because it doesn't work in some - # configurations. - add_cxx_compiler_flag(-std=c++11) - if (NOT HAVE_CXX_FLAG_STD_CXX11) - add_cxx_compiler_flag(-std=c++0x) - endif() - - # Turn compiler warnings up to 11 - add_cxx_compiler_flag(-Wall) - - add_cxx_compiler_flag(-Wextra) - add_cxx_compiler_flag(-Wshadow) - add_cxx_compiler_flag(-Werror RELEASE) - add_cxx_compiler_flag(-Werror RELWITHDEBINFO) - add_cxx_compiler_flag(-Werror MINSIZEREL) - add_cxx_compiler_flag(-pedantic) - add_cxx_compiler_flag(-pedantic-errors) - add_cxx_compiler_flag(-Wshorten-64-to-32) - add_cxx_compiler_flag(-Wfloat-equal) - add_cxx_compiler_flag(-fstrict-aliasing) - if (NOT BENCHMARK_ENABLE_EXCEPTIONS) - add_cxx_compiler_flag(-fno-exceptions) - endif() - if (NOT BENCHMARK_USE_LIBCXX) - add_cxx_compiler_flag(-Wzero-as-null-pointer-constant) - endif() - if (HAVE_CXX_FLAG_FSTRICT_ALIASING) - if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel") #ICC17u2: Many false positives for Wstrict-aliasing - add_cxx_compiler_flag(-Wstrict-aliasing) - endif() - endif() - # ICC17u2: overloaded virtual function "benchmark::Fixture::SetUp" is only partially overridden - # (because of deprecated overload) - add_cxx_compiler_flag(-wd654) - add_cxx_compiler_flag(-Wthread-safety) - if (HAVE_CXX_FLAG_WTHREAD_SAFETY) - cxx_feature_check(THREAD_SAFETY_ATTRIBUTES) - endif() - - # On most UNIX like platforms g++ and clang++ define _GNU_SOURCE as a - # predefined macro, which turns on all of the wonderful libc extensions. - # However g++ doesn't do this in Cygwin so we have to define it ourselfs - # since we depend on GNU/POSIX/BSD extensions. - if (CYGWIN) - add_definitions(-D_GNU_SOURCE=1) - endif() - - # Link time optimisation - if (BENCHMARK_ENABLE_LTO) - add_cxx_compiler_flag(-flto) - if ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU") - find_program(GCC_AR gcc-ar) - if (GCC_AR) - set(CMAKE_AR ${GCC_AR}) - endif() - find_program(GCC_RANLIB gcc-ranlib) - if (GCC_RANLIB) - set(CMAKE_RANLIB ${GCC_RANLIB}) - endif() - endif() - endif() - - # Coverage build type - set(CMAKE_CXX_FLAGS_COVERAGE "${CMAKE_CXX_FLAGS_DEBUG}" CACHE STRING - "Flags used by the C++ compiler during coverage builds." - FORCE) - set(CMAKE_EXE_LINKER_FLAGS_COVERAGE - "${CMAKE_EXE_LINKER_FLAGS_DEBUG}" CACHE STRING - "Flags used for linking binaries during coverage builds." - FORCE) - set(CMAKE_SHARED_LINKER_FLAGS_COVERAGE - "${CMAKE_SHARED_LINKER_FLAGS_DEBUG}" CACHE STRING - "Flags used by the shared libraries linker during coverage builds." - FORCE) - mark_as_advanced( - CMAKE_CXX_FLAGS_COVERAGE - CMAKE_EXE_LINKER_FLAGS_COVERAGE - CMAKE_SHARED_LINKER_FLAGS_COVERAGE) - set(CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}" CACHE STRING - "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel Coverage." 
- FORCE) - add_cxx_compiler_flag(--coverage COVERAGE) -endif() - -if (BENCHMARK_USE_LIBCXX) - if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - add_cxx_compiler_flag(-stdlib=libc++) - elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR - "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") - add_cxx_compiler_flag(-nostdinc++) - message("libc++ header path must be manually specified using CMAKE_CXX_FLAGS") - # Adding -nodefaultlibs directly to CMAKE__LINKER_FLAGS will break - # configuration checks such as 'find_package(Threads)' - list(APPEND BENCHMARK_CXX_LINKER_FLAGS -nodefaultlibs) - # -lc++ cannot be added directly to CMAKE__LINKER_FLAGS because - # linker flags appear before all linker inputs and -lc++ must appear after. - list(APPEND BENCHMARK_CXX_LIBRARIES c++) - else() - message(FATAL "-DBENCHMARK_USE_LIBCXX:BOOL=ON is not supported for compiler") - endif() -endif(BENCHMARK_USE_LIBCXX) - -# C++ feature checks -# Determine the correct regular expression engine to use -cxx_feature_check(STD_REGEX) -cxx_feature_check(GNU_POSIX_REGEX) -cxx_feature_check(POSIX_REGEX) -if(NOT HAVE_STD_REGEX AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX) - message(FATAL_ERROR "Failed to determine the source files for the regular expression backend") -endif() -if (NOT BENCHMARK_ENABLE_EXCEPTIONS AND HAVE_STD_REGEX - AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX) - message(WARNING "Using std::regex with exceptions disabled is not fully supported") -endif() -cxx_feature_check(STEADY_CLOCK) -# Ensure we have pthreads -find_package(Threads REQUIRED) - -# Set up directories -include_directories(${PROJECT_SOURCE_DIR}/include) - -# Build the targets -add_subdirectory(src) - -if (BENCHMARK_ENABLE_TESTING) - enable_testing() - add_subdirectory(test) -endif() diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/CONTRIBUTING.md b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/CONTRIBUTING.md deleted file mode 100644 index 43de4c9d470..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/CONTRIBUTING.md +++ /dev/null @@ -1,58 +0,0 @@ -# How to contribute # - -We'd love to accept your patches and contributions to this project. There are -a just a few small guidelines you need to follow. - - -## Contributor License Agreement ## - -Contributions to any Google project must be accompanied by a Contributor -License Agreement. This is not a copyright **assignment**, it simply gives -Google permission to use and redistribute your contributions as part of the -project. - - * If you are an individual writing original source code and you're sure you - own the intellectual property, then you'll need to sign an [individual - CLA][]. - - * If you work for a company that wants to allow you to contribute your work, - then you'll need to sign a [corporate CLA][]. - -You generally only need to submit a CLA once, so if you've already submitted -one (even if it was for a different project), you probably don't need to do it -again. - -[individual CLA]: https://developers.google.com/open-source/cla/individual -[corporate CLA]: https://developers.google.com/open-source/cla/corporate - -Once your CLA is submitted (or if you already submitted one for -another Google project), make a commit adding yourself to the -[AUTHORS][] and [CONTRIBUTORS][] files. This commit can be part -of your first [pull request][]. - -[AUTHORS]: AUTHORS -[CONTRIBUTORS]: CONTRIBUTORS - - -## Submitting a patch ## - - 1. 
It's generally best to start by opening a new issue describing the bug or - feature you're intending to fix. Even if you think it's relatively minor, - it's helpful to know what people are working on. Mention in the initial - issue that you are planning to work on that bug or feature so that it can - be assigned to you. - - 1. Follow the normal process of [forking][] the project, and setup a new - branch to work in. It's important that each group of changes be done in - separate branches in order to ensure that a pull request only includes the - commits related to that bug or feature. - - 1. Do your best to have [well-formed commit messages][] for each change. - This provides consistency throughout the project, and ensures that commit - messages are able to be formatted properly by various git tools. - - 1. Finally, push the commits to your fork and submit a [pull request][]. - -[forking]: https://help.github.com/articles/fork-a-repo -[well-formed commit messages]: http://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html -[pull request]: https://help.github.com/articles/creating-a-pull-request diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/CONTRIBUTORS b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/CONTRIBUTORS deleted file mode 100644 index 9abb60865eb..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/CONTRIBUTORS +++ /dev/null @@ -1,59 +0,0 @@ -# People who have agreed to one of the CLAs and can contribute patches. -# The AUTHORS file lists the copyright holders; this file -# lists people. For example, Google employees are listed here -# but not in AUTHORS, because Google holds the copyright. -# -# Names should be added to this file only after verifying that -# the individual or the individual's organization has agreed to -# the appropriate Contributor License Agreement, found here: -# -# https://developers.google.com/open-source/cla/individual -# https://developers.google.com/open-source/cla/corporate -# -# The agreement for individuals can be filled out on the web. -# -# When adding J Random Contributor's name to this file, -# either J's name or J's organization's name should be -# added to the AUTHORS file, depending on whether the -# individual or corporate CLA was used. -# -# Names should be added to this file as: -# Name -# -# Please keep the list sorted. - -Albert Pretorius -Arne Beer -Billy Robert O'Neal III -Chris Kennelly -Christopher Seymour -David Coeurjolly -Dominic Hamon -Eric Fiselier -Eugene Zhuk -Evgeny Safronov -Felix Homann -Ismael Jimenez Martinez -Jern-Kuan Leong -Joao Paulo Magalhaes -JianXiong Zhou -Jussi Knuuttila -Kaito Udagawa -Kai Wolf -Lei Xu -Matt Clarkson -Maxim Vafin -Nick Hutchinson -Oleksandr Sochka -Pascal Leroy -Paul Redmond -Pierre Phaneuf -Radoslav Yovchev -Ray Glover -Shuo Chen -Tom Madams -Yixuan Qiu -Yusuke Suzuki -Tobias Ulvgård -Zbigniew Skowron -Dominik Czarnota diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/LICENSE b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/LICENSE deleted file mode 100644 index d6456956733..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/LICENSE +++ /dev/null @@ -1,202 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. 
- - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. 
This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/README.md b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/README.md deleted file mode 100644 index 2430d93bf9c..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/README.md +++ /dev/null @@ -1,726 +0,0 @@ -# benchmark -[![Build Status](https://travis-ci.org/google/benchmark.svg?branch=master)](https://travis-ci.org/google/benchmark) -[![Build status](https://ci.appveyor.com/api/projects/status/u0qsyp7t1tk7cpxs/branch/master?svg=true)](https://ci.appveyor.com/project/google/benchmark/branch/master) -[![Coverage Status](https://coveralls.io/repos/google/benchmark/badge.svg)](https://coveralls.io/r/google/benchmark) - -A library to support the benchmarking of functions, similar to unit-tests. - -Discussion group: https://groups.google.com/d/forum/benchmark-discuss - -IRC channel: https://freenode.net #googlebenchmark - -[Known issues and common problems](#known-issues) - -[Additional Tooling Documentation](docs/tools.md) - -## Example usage -### Basic usage -Define a function that executes the code to be measured. - -```c++ -static void BM_StringCreation(benchmark::State& state) { - while (state.KeepRunning()) - std::string empty_string; -} -// Register the function as a benchmark -BENCHMARK(BM_StringCreation); - -// Define another benchmark -static void BM_StringCopy(benchmark::State& state) { - std::string x = "hello"; - while (state.KeepRunning()) - std::string copy(x); -} -BENCHMARK(BM_StringCopy); - -BENCHMARK_MAIN(); -``` - -### Passing arguments -Sometimes a family of benchmarks can be implemented with just one routine that -takes an extra argument to specify which one of the family of benchmarks to -run. For example, the following code defines a family of benchmarks for -measuring the speed of `memcpy()` calls of different lengths: - -```c++ -static void BM_memcpy(benchmark::State& state) { - char* src = new char[state.range(0)]; - char* dst = new char[state.range(0)]; - memset(src, 'x', state.range(0)); - while (state.KeepRunning()) - memcpy(dst, src, state.range(0)); - state.SetBytesProcessed(int64_t(state.iterations()) * - int64_t(state.range(0))); - delete[] src; - delete[] dst; -} -BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10); -``` - -The preceding code is quite repetitive, and can be replaced with the following -short-hand. The following invocation will pick a few appropriate arguments in -the specified range and will generate a benchmark for each such argument. - -```c++ -BENCHMARK(BM_memcpy)->Range(8, 8<<10); -``` - -By default the arguments in the range are generated in multiples of eight and -the command above selects [ 8, 64, 512, 4k, 8k ]. In the following code the -range multiplier is changed to multiples of two. - -```c++ -BENCHMARK(BM_memcpy)->RangeMultiplier(2)->Range(8, 8<<10); -``` -Now arguments generated are [ 8, 16, 32, 64, 128, 256, 512, 1024, 2k, 4k, 8k ]. - -You might have a benchmark that depends on two or more inputs. For example, the -following code defines a family of benchmarks for measuring the speed of set -insertion. 
- -```c++ -static void BM_SetInsert(benchmark::State& state) { - while (state.KeepRunning()) { - state.PauseTiming(); - std::set<int> data = ConstructRandomSet(state.range(0)); - state.ResumeTiming(); - for (int j = 0; j < state.range(1); ++j) - data.insert(RandomNumber()); - } -} -BENCHMARK(BM_SetInsert) - ->Args({1<<10, 1}) - ->Args({1<<10, 8}) - ->Args({1<<10, 64}) - ->Args({1<<10, 512}) - ->Args({8<<10, 1}) - ->Args({8<<10, 8}) - ->Args({8<<10, 64}) - ->Args({8<<10, 512}); -``` - -The preceding code is quite repetitive, and can be replaced with the following -short-hand. The following macro will pick a few appropriate arguments in the -product of the two specified ranges and will generate a benchmark for each such -pair. - -```c++ -BENCHMARK(BM_SetInsert)->Ranges({{1<<10, 8<<10}, {1, 512}}); -``` - -For more complex patterns of inputs, passing a custom function to `Apply` allows -programmatic specification of an arbitrary set of arguments on which to run the -benchmark. The following example enumerates a dense range on one parameter, -and a sparse range on the second. - -```c++ -static void CustomArguments(benchmark::internal::Benchmark* b) { - for (int i = 0; i <= 10; ++i) - for (int j = 32; j <= 1024*1024; j *= 8) - b->Args({i, j}); -} -BENCHMARK(BM_SetInsert)->Apply(CustomArguments); -``` - -### Calculate asymptotic complexity (Big O) -Asymptotic complexity might be calculated for a family of benchmarks. The -following code will calculate the coefficient for the high-order term in the -running time and the normalized root-mean square error of string comparison. - -```c++ -static void BM_StringCompare(benchmark::State& state) { - std::string s1(state.range(0), '-'); - std::string s2(state.range(0), '-'); - while (state.KeepRunning()) { - benchmark::DoNotOptimize(s1.compare(s2)); - } - state.SetComplexityN(state.range(0)); -} -BENCHMARK(BM_StringCompare) - ->RangeMultiplier(2)->Range(1<<10, 1<<18)->Complexity(benchmark::oN); -``` - -As shown in the following invocation, asymptotic complexity might also be -calculated automatically. - -```c++ -BENCHMARK(BM_StringCompare) - ->RangeMultiplier(2)->Range(1<<10, 1<<18)->Complexity(); -``` - -The following code will specify asymptotic complexity with a lambda function, -that might be used to customize high-order term calculation. - -```c++ -BENCHMARK(BM_StringCompare)->RangeMultiplier(2) - ->Range(1<<10, 1<<18)->Complexity([](int n)->double{return n; }); -``` - -### Templated benchmarks -Templated benchmarks work the same way: This example produces and consumes -messages of size `sizeof(v)` `range_x` times. It also outputs throughput in the -absence of multiprogramming. - -```c++ -template <class Q> int BM_Sequential(benchmark::State& state) { - Q q; - typename Q::value_type v; - while (state.KeepRunning()) { - for (int i = state.range(0); i--; ) - q.push(v); - for (int e = state.range(0); e--; ) - q.Wait(&v); - } - // actually messages, not bytes: - state.SetBytesProcessed( - static_cast<int64_t>(state.iterations())*state.range(0)); -} -BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10); -``` - -Three macros are provided for adding benchmark templates. - -```c++ -#if __cplusplus >= 201103L // C++11 and greater. -#define BENCHMARK_TEMPLATE(func, ...) // Takes any number of parameters.
-#else // C++ < C++11 -#define BENCHMARK_TEMPLATE(func, arg1) -#endif -#define BENCHMARK_TEMPLATE1(func, arg1) -#define BENCHMARK_TEMPLATE2(func, arg1, arg2) -``` - -## Passing arbitrary arguments to a benchmark -In C++11 it is possible to define a benchmark that takes an arbitrary number -of extra arguments. The `BENCHMARK_CAPTURE(func, test_case_name, ...args)` -macro creates a benchmark that invokes `func` with the `benchmark::State` as -the first argument followed by the specified `args...`. -The `test_case_name` is appended to the name of the benchmark and -should describe the values passed. - -```c++ -template <class ...ExtraArgs> -void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) { - [...] -} -// Registers a benchmark named "BM_takes_args/int_string_test" that passes -// the specified values to `extra_args`. -BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc")); -``` -Note that elements of `...args` may refer to global variables. Users should -avoid modifying global state inside of a benchmark. - -## Using RegisterBenchmark(name, fn, args...) - -The `RegisterBenchmark(name, func, args...)` function provides an alternative -way to create and register benchmarks. -`RegisterBenchmark(name, func, args...)` creates, registers, and returns a -pointer to a new benchmark with the specified `name` that invokes -`func(st, args...)` where `st` is a `benchmark::State` object. - -Unlike the `BENCHMARK` registration macros, which can only be used at the global -scope, the `RegisterBenchmark` can be called anywhere. This allows for -benchmark tests to be registered programmatically. - -Additionally `RegisterBenchmark` allows any callable object to be registered -as a benchmark. Including capturing lambdas and function objects. This -allows the creation - -For Example: -```c++ -auto BM_test = [](benchmark::State& st, auto Inputs) { /* ... */ }; - -int main(int argc, char** argv) { - for (auto& test_input : { /* ... */ }) - benchmark::RegisterBenchmark(test_input.name(), BM_test, test_input); - benchmark::Initialize(&argc, argv); - benchmark::RunSpecifiedBenchmarks(); -} -``` - -### Multithreaded benchmarks -In a multithreaded test (benchmark invoked by multiple threads simultaneously), -it is guaranteed that none of the threads will start until all have called -`KeepRunning`, and all will have finished before KeepRunning returns false. As -such, any global setup or teardown can be wrapped in a check against the thread -index: - -```c++ -static void BM_MultiThreaded(benchmark::State& state) { - if (state.thread_index == 0) { - // Setup code here. - } - while (state.KeepRunning()) { - // Run the test as normal. - } - if (state.thread_index == 0) { - // Teardown code here. - } -} -BENCHMARK(BM_MultiThreaded)->Threads(2); -``` - -If the benchmarked code itself uses threads and you want to compare it to -single-threaded code, you may want to use real-time ("wallclock") measurements -for latency comparisons: - -```c++ -BENCHMARK(BM_test)->Range(8, 8<<10)->UseRealTime(); -``` - -Without `UseRealTime`, CPU time is used by default. - - -## Manual timing -For benchmarking something for which neither CPU time nor real-time are -correct or accurate enough, completely manual timing is supported using -the `UseManualTime` function. - -When `UseManualTime` is used, the benchmarked code must call -`SetIterationTime` once per iteration of the `KeepRunning` loop to -report the manually measured time. - -An example use case for this is benchmarking GPU execution (e.g.
OpenCL -or CUDA kernels, OpenGL or Vulkan or Direct3D draw calls), which cannot -be accurately measured using CPU time or real-time. Instead, they can be -measured accurately using a dedicated API, and these measurement results -can be reported back with `SetIterationTime`. - -```c++ -static void BM_ManualTiming(benchmark::State& state) { - int microseconds = state.range(0); - std::chrono::duration<double, std::micro> sleep_duration { - static_cast<double>(microseconds) - }; - - while (state.KeepRunning()) { - auto start = std::chrono::high_resolution_clock::now(); - // Simulate some useful workload with a sleep - std::this_thread::sleep_for(sleep_duration); - auto end = std::chrono::high_resolution_clock::now(); - - auto elapsed_seconds = - std::chrono::duration_cast<std::chrono::duration<double>>( - end - start); - - state.SetIterationTime(elapsed_seconds.count()); - } -} -BENCHMARK(BM_ManualTiming)->Range(1, 1<<17)->UseManualTime(); -``` - -### Preventing optimisation -To prevent a value or expression from being optimized away by the compiler -the `benchmark::DoNotOptimize(...)` and `benchmark::ClobberMemory()` -functions can be used. - -```c++ -static void BM_test(benchmark::State& state) { - while (state.KeepRunning()) { - int x = 0; - for (int i=0; i < 64; ++i) { - benchmark::DoNotOptimize(x += i); - } - } -} -``` - -`DoNotOptimize(<expr>)` forces the *result* of `<expr>` to be stored in either -memory or a register. For GNU based compilers it acts as read/write barrier -for global memory. More specifically it forces the compiler to flush pending -writes to memory and reload any other values as necessary. - -Note that `DoNotOptimize(<expr>)` does not prevent optimizations on `<expr>` -in any way. `<expr>` may even be removed entirely when the result is already -known. For example: - -```c++ - /* Example 1: `<expr>` is removed entirely. */ - int foo(int x) { return x + 42; } - while (...) DoNotOptimize(foo(0)); // Optimized to DoNotOptimize(42); - - /* Example 2: Result of '<expr>' is only reused */ - int bar(int) __attribute__((const)); - while (...) DoNotOptimize(bar(0)); // Optimized to: - // int __result__ = bar(0); - // while (...) DoNotOptimize(__result__); -``` - -The second tool for preventing optimizations is `ClobberMemory()`. In essence -`ClobberMemory()` forces the compiler to perform all pending writes to global -memory. Memory managed by block scope objects must be "escaped" using -`DoNotOptimize(...)` before it can be clobbered. In the below example -`ClobberMemory()` prevents the call to `v.push_back(42)` from being optimized -away. - -```c++ -static void BM_vector_push_back(benchmark::State& state) { - while (state.KeepRunning()) { - std::vector<int> v; - v.reserve(1); - benchmark::DoNotOptimize(v.data()); // Allow v.data() to be clobbered. - v.push_back(42); - benchmark::ClobberMemory(); // Force 42 to be written to memory. - } -} -``` - -Note that `ClobberMemory()` is only available for GNU or MSVC based compilers. - -### Set time unit manually -If a benchmark runs a few milliseconds it may be hard to visually compare the -measured times, since the output data is given in nanoseconds per default. In -order to manually set the time unit, you can specify it manually: - -```c++ -BENCHMARK(BM_test)->Unit(benchmark::kMillisecond); -``` - -## Controlling number of iterations -In all cases, the number of iterations for which the benchmark is run is -governed by the amount of time the benchmark takes. Concretely, the number of -iterations is at least one, not more than 1e9, until CPU time is greater than -the minimum time, or the wallclock time is 5x minimum time.
The minimum time is -set as a flag `--benchmark_min_time` or per benchmark by calling `MinTime` on -the registered benchmark object. - -## Reporting the mean and standard devation by repeated benchmarks -By default each benchmark is run once and that single result is reported. -However benchmarks are often noisy and a single result may not be representative -of the overall behavior. For this reason it's possible to repeatedly rerun the -benchmark. - -The number of runs of each benchmark is specified globally by the -`--benchmark_repetitions` flag or on a per benchmark basis by calling -`Repetitions` on the registered benchmark object. When a benchmark is run -more than once the mean and standard deviation of the runs will be reported. - -Additionally the `--benchmark_report_aggregates_only={true|false}` flag or -`ReportAggregatesOnly(bool)` function can be used to change how repeated tests -are reported. By default the result of each repeated run is reported. When this -option is 'true' only the mean and standard deviation of the runs is reported. -Calling `ReportAggregatesOnly(bool)` on a registered benchmark object overrides -the value of the flag for that benchmark. - -## Fixtures -Fixture tests are created by -first defining a type that derives from ::benchmark::Fixture and then -creating/registering the tests using the following macros: - -* `BENCHMARK_F(ClassName, Method)` -* `BENCHMARK_DEFINE_F(ClassName, Method)` -* `BENCHMARK_REGISTER_F(ClassName, Method)` - -For Example: - -```c++ -class MyFixture : public benchmark::Fixture {}; - -BENCHMARK_F(MyFixture, FooTest)(benchmark::State& st) { - while (st.KeepRunning()) { - ... - } -} - -BENCHMARK_DEFINE_F(MyFixture, BarTest)(benchmark::State& st) { - while (st.KeepRunning()) { - ... - } -} -/* BarTest is NOT registered */ -BENCHMARK_REGISTER_F(MyFixture, BarTest)->Threads(2); -/* BarTest is now registered */ -``` - - -## User-defined counters - -You can add your own counters with user-defined names. The example below -will add columns "Foo", "Bar" and "Baz" in its output: - -```c++ -static void UserCountersExample1(benchmark::State& state) { - double numFoos = 0, numBars = 0, numBazs = 0; - while (state.KeepRunning()) { - // ... count Foo,Bar,Baz events - } - state.counters["Foo"] = numFoos; - state.counters["Bar"] = numBars; - state.counters["Baz"] = numBazs; -} -``` - -The `state.counters` object is a `std::map` with `std::string` keys -and `Counter` values. The latter is a `double`-like class, via an implicit -conversion to `double&`. Thus you can use all of the standard arithmetic -assignment operators (`=,+=,-=,*=,/=`) to change the value of each counter. - -In multithreaded benchmarks, each counter is set on the calling thread only. -When the benchmark finishes, the counters from each thread will be summed; -the resulting sum is the value which will be shown for the benchmark. - -The `Counter` constructor accepts two parameters: the value as a `double` -and a bit flag which allows you to show counters as rates and/or as -per-thread averages: - -```c++ - // sets a simple counter - state.counters["Foo"] = numFoos; - - // Set the counter as a rate. It will be presented divided - // by the duration of the benchmark. - state.counters["FooRate"] = Counter(numFoos, benchmark::Counter::kIsRate); - - // Set the counter as a thread-average quantity. It will - // be presented divided by the number of threads. 
-
-## Fixtures
-Fixture tests are created by first defining a type that derives from
-`::benchmark::Fixture` and then creating/registering the tests using the
-following macros:
-
-* `BENCHMARK_F(ClassName, Method)`
-* `BENCHMARK_DEFINE_F(ClassName, Method)`
-* `BENCHMARK_REGISTER_F(ClassName, Method)`
-
-For example:
-
-```c++
-class MyFixture : public benchmark::Fixture {};
-
-BENCHMARK_F(MyFixture, FooTest)(benchmark::State& st) {
-  while (st.KeepRunning()) {
-    ...
-  }
-}
-
-BENCHMARK_DEFINE_F(MyFixture, BarTest)(benchmark::State& st) {
-  while (st.KeepRunning()) {
-    ...
-  }
-}
-/* BarTest is NOT registered */
-BENCHMARK_REGISTER_F(MyFixture, BarTest)->Threads(2);
-/* BarTest is now registered */
-```
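-
-As a more concrete (hypothetical) sketch, a fixture can override `SetUp` and
-`TearDown` so that expensive preparation happens outside the timed loop. The
-names `VectorFixture` and `Sum` below are purely illustrative:
-
-```c++
-class VectorFixture : public benchmark::Fixture {
- public:
-  void SetUp(const ::benchmark::State& state) {
-    // Build the input once per benchmark run, not inside the timed loop.
-    data.resize(state.range(0), 1);
-  }
-  void TearDown(const ::benchmark::State&) { data.clear(); }
-  std::vector<int> data;
-};
-
-BENCHMARK_DEFINE_F(VectorFixture, Sum)(benchmark::State& st) {
-  while (st.KeepRunning()) {
-    long sum = 0;
-    for (int v : data) sum += v;
-    benchmark::DoNotOptimize(sum);  // Keep the loop from being elided.
-  }
-}
-BENCHMARK_REGISTER_F(VectorFixture, Sum)->Arg(1024);
-```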
-
-## User-defined counters
-
-You can add your own counters with user-defined names. The example below
-will add columns "Foo", "Bar" and "Baz" in its output:
-
-```c++
-static void UserCountersExample1(benchmark::State& state) {
-  double numFoos = 0, numBars = 0, numBazs = 0;
-  while (state.KeepRunning()) {
-    // ... count Foo,Bar,Baz events
-  }
-  state.counters["Foo"] = numFoos;
-  state.counters["Bar"] = numBars;
-  state.counters["Baz"] = numBazs;
-}
-```
-
-The `state.counters` object is a `std::map` with `std::string` keys
-and `Counter` values. The latter is a `double`-like class, via an implicit
-conversion to `double&`. Thus you can use all of the standard arithmetic
-assignment operators (`=,+=,-=,*=,/=`) to change the value of each counter.
-
-In multithreaded benchmarks, each counter is set on the calling thread only.
-When the benchmark finishes, the counters from each thread will be summed;
-the resulting sum is the value which will be shown for the benchmark.
-
-The `Counter` constructor accepts two parameters: the value as a `double`
-and a bit flag which allows you to show counters as rates and/or as
-per-thread averages:
-
-```c++
-  // sets a simple counter
-  state.counters["Foo"] = numFoos;
-
-  // Set the counter as a rate. It will be presented divided
-  // by the duration of the benchmark.
-  state.counters["FooRate"] = Counter(numFoos, benchmark::Counter::kIsRate);
-
-  // Set the counter as a thread-average quantity. It will
-  // be presented divided by the number of threads.
-  state.counters["FooAvg"] = Counter(numFoos, benchmark::Counter::kAvgThreads);
-
-  // There's also a combined flag:
-  state.counters["FooAvgRate"] = Counter(numFoos, benchmark::Counter::kAvgThreadsRate);
-```
-
-When you're compiling in C++11 mode or later you can use `insert()` with
-`std::initializer_list`:
-
-```c++
-  // With C++11, this can be done:
-  state.counters.insert({{"Foo", numFoos}, {"Bar", numBars}, {"Baz", numBazs}});
-  // ... instead of:
-  state.counters["Foo"] = numFoos;
-  state.counters["Bar"] = numBars;
-  state.counters["Baz"] = numBazs;
-```
-
-### Counter reporting
-
-When using the console reporter, by default, user counters are printed at
-the end after the table, the same way as ``bytes_processed`` and
-``items_processed``. This is best for cases in which there are few counters,
-or where there are only a couple of lines per benchmark. Here's an example of
-the default output:
-
-```
-------------------------------------------------------------------------------
-Benchmark                        Time           CPU Iterations UserCounters...
-------------------------------------------------------------------------------
-BM_UserCounter/threads:8      2248 ns      10277 ns      68808 Bar=16 Bat=40 Baz=24 Foo=8
-BM_UserCounter/threads:1      9797 ns       9788 ns      71523 Bar=2 Bat=5 Baz=3 Foo=1024m
-BM_UserCounter/threads:2      4924 ns       9842 ns      71036 Bar=4 Bat=10 Baz=6 Foo=2
-BM_UserCounter/threads:4      2589 ns      10284 ns      68012 Bar=8 Bat=20 Baz=12 Foo=4
-BM_UserCounter/threads:8      2212 ns      10287 ns      68040 Bar=16 Bat=40 Baz=24 Foo=8
-BM_UserCounter/threads:16     1782 ns      10278 ns      68144 Bar=32 Bat=80 Baz=48 Foo=16
-BM_UserCounter/threads:32     1291 ns      10296 ns      68256 Bar=64 Bat=160 Baz=96 Foo=32
-BM_UserCounter/threads:4      2615 ns      10307 ns      68040 Bar=8 Bat=20 Baz=12 Foo=4
-BM_Factorial                    26 ns         26 ns   26608979 40320
-BM_Factorial/real_time          26 ns         26 ns   26587936 40320
-BM_CalculatePiRange/1           16 ns         16 ns   45704255 0
-BM_CalculatePiRange/8           73 ns         73 ns    9520927 3.28374
-BM_CalculatePiRange/64         609 ns        609 ns    1140647 3.15746
-BM_CalculatePiRange/512       4900 ns       4901 ns     142696 3.14355
-```
-
-If this doesn't suit you, you can print each counter as a table column by
-passing the flag `--benchmark_counters_tabular=true` to the benchmark
-application. This is best for cases in which there are a lot of counters, or
-a lot of lines per individual benchmark. Note that this will trigger a
-reprinting of the table header any time the counter set changes between
-individual benchmarks. Here's an example of corresponding output when
-`--benchmark_counters_tabular=true` is passed:
-
-```
----------------------------------------------------------------------------------------
-Benchmark                        Time           CPU Iterations    Bar   Bat   Baz   Foo
----------------------------------------------------------------------------------------
-BM_UserCounter/threads:8      2198 ns       9953 ns      70688     16    40    24     8
-BM_UserCounter/threads:1      9504 ns       9504 ns      73787      2     5     3     1
-BM_UserCounter/threads:2      4775 ns       9550 ns      72606      4    10     6     2
-BM_UserCounter/threads:4      2508 ns       9951 ns      70332      8    20    12     4
-BM_UserCounter/threads:8      2055 ns       9933 ns      70344     16    40    24     8
-BM_UserCounter/threads:16     1610 ns       9946 ns      70720     32    80    48    16
-BM_UserCounter/threads:32     1192 ns       9948 ns      70496     64   160    96    32
-BM_UserCounter/threads:4      2506 ns       9949 ns      70332      8    20    12     4
---------------------------------------------------------------
-Benchmark                        Time           CPU Iterations
---------------------------------------------------------------
-BM_Factorial                    26 ns         26 ns   26392245 40320
-BM_Factorial/real_time          26 ns         26 ns   26494107 40320
-BM_CalculatePiRange/1           15 ns         15 ns   45571597 0
-BM_CalculatePiRange/8           74 ns         74 ns    9450212 3.28374
-BM_CalculatePiRange/64         595 ns        595 ns    1173901 3.15746
-BM_CalculatePiRange/512       4752 ns       4752 ns     147380 3.14355
-BM_CalculatePiRange/4k       37970 ns      37972 ns      18453 3.14184
-BM_CalculatePiRange/32k     303733 ns     303744 ns       2305 3.14162
-BM_CalculatePiRange/256k   2434095 ns    2434186 ns        288 3.1416
-BM_CalculatePiRange/1024k  9721140 ns    9721413 ns         71 3.14159
-BM_CalculatePi/threads:8      2255 ns       9943 ns      70936
-```
-
-Note above the additional header printed when the benchmark changes from
-``BM_UserCounter`` to ``BM_Factorial``. This is because ``BM_Factorial`` does
-not have the same counter set as ``BM_UserCounter``.
-
-## Exiting Benchmarks in Error
-
-When errors caused by external influences, such as file I/O and network
-communication, occur within a benchmark, the
-`State::SkipWithError(const char* msg)` function can be used to skip that run
-of the benchmark and report the error. Note that only future iterations of the
-`KeepRunning()` loop are skipped. Users may explicitly return to exit the
-benchmark immediately.
-
-The `SkipWithError(...)` function may be used at any point within the benchmark,
-including before and after the `KeepRunning()` loop.
-
-For example:
-
-```c++
-static void BM_test(benchmark::State& state) {
-  auto resource = GetResource();
-  if (!resource.good()) {
-    state.SkipWithError("Resource is not good!");
-    // KeepRunning() loop will not be entered.
-  }
-  while (state.KeepRunning()) {
-    auto data = resource.read_data();
-    if (!resource.good()) {
-      state.SkipWithError("Failed to read data!");
-      break; // Needed to skip the rest of the iteration.
-    }
-    do_stuff(data);
-  }
-}
-```
-
-## Running a subset of the benchmarks
-
-The `--benchmark_filter=<regex>` option can be used to only run the benchmarks
-which match the specified `<regex>`. For example:
-
-```bash
-$ ./run_benchmarks.x --benchmark_filter=BM_memcpy/32
-Run on (1 X 2300 MHz CPU )
-2016-06-25 19:34:24
-Benchmark              Time           CPU Iterations
-----------------------------------------------------
-BM_memcpy/32          11 ns         11 ns   79545455
-BM_memcpy/32k       2181 ns       2185 ns     324074
-BM_memcpy/32          12 ns         12 ns   54687500
-BM_memcpy/32k       1834 ns       1837 ns     357143
-```
-
-
-## Output Formats
-The library supports multiple output formats. Use the
-`--benchmark_format=<console|json|csv>` flag to set the format type. `console`
-is the default format.
-
-The Console format is intended to be a human readable format. By default
-the format generates color output. Context is output on stderr and the
-tabular data on stdout. Example tabular output looks like:
-```
-Benchmark                               Time(ns)    CPU(ns) Iterations
-----------------------------------------------------------------------
-BM_SetInsert/1024/1                        28928      29349      23853  133.097kB/s  33.2742k items/s
-BM_SetInsert/1024/8                        32065      32913      21375  949.487kB/s  237.372k items/s
-BM_SetInsert/1024/10                       33157      33648      21431  1.13369MB/s  290.225k items/s
-```
-
-The JSON format outputs human readable json split into two top level attributes.
-The `context` attribute contains information about the run in general, including
-information about the CPU and the date.
-The `benchmarks` attribute contains a list of every benchmark run. Example json
-output looks like:
-```json
-{
-  "context": {
-    "date": "2015/03/17-18:40:25",
-    "num_cpus": 40,
-    "mhz_per_cpu": 2801,
-    "cpu_scaling_enabled": false,
-    "build_type": "debug"
-  },
-  "benchmarks": [
-    {
-      "name": "BM_SetInsert/1024/1",
-      "iterations": 94877,
-      "real_time": 29275,
-      "cpu_time": 29836,
-      "bytes_per_second": 134066,
-      "items_per_second": 33516
-    },
-    {
-      "name": "BM_SetInsert/1024/8",
-      "iterations": 21609,
-      "real_time": 32317,
-      "cpu_time": 32429,
-      "bytes_per_second": 986770,
-      "items_per_second": 246693
-    },
-    {
-      "name": "BM_SetInsert/1024/10",
-      "iterations": 21393,
-      "real_time": 32724,
-      "cpu_time": 33355,
-      "bytes_per_second": 1199226,
-      "items_per_second": 299807
-    }
-  ]
-}
-```
-
-The CSV format outputs comma-separated values. The `context` is output on stderr
-and the CSV itself on stdout. Example CSV output looks like:
-```
-name,iterations,real_time,cpu_time,bytes_per_second,items_per_second,label
-"BM_SetInsert/1024/1",65465,17890.7,8407.45,475768,118942,
-"BM_SetInsert/1024/8",116606,18810.1,9766.64,3.27646e+06,819115,
-"BM_SetInsert/1024/10",106365,17238.4,8421.53,4.74973e+06,1.18743e+06,
-```
-
-## Output Files
-The library supports writing the output of the benchmark to a file specified
-by `--benchmark_out=<filename>`. The format of the output can be specified
-using `--benchmark_out_format={json|console|csv}`. Specifying
-`--benchmark_out` does not suppress the console output.
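-
-For example (assuming a benchmark binary named `mybench`, which is purely
-illustrative), the following writes JSON results to a file while still
-printing the console report:
-
-```bash
-$ ./mybench --benchmark_out=results.json --benchmark_out_format=json
-```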
-
-## Debug vs Release
-By default, benchmark builds as a debug library. You will see a warning in the
-output when this is the case. To build it as a release library instead, use:
-
-```
-cmake -DCMAKE_BUILD_TYPE=Release
-```
-
-To enable link-time optimisation, use:
-
-```
-cmake -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_LTO=true
-```
-
-## Linking against the library
-When using gcc, it is necessary to link against pthread to avoid runtime
-exceptions. This is due to how gcc implements std::thread. See
-[issue #67](https://github.com/google/benchmark/issues/67) for more details.
-
-## Compiler Support
-
-Google Benchmark uses C++11 when building the library. As such we require
-a modern C++ toolchain, both compiler and standard library.
-
-The following minimum versions are strongly recommended to build the library:
-
-* GCC 4.8
-* Clang 3.4
-* Visual Studio 2013
-* Intel 2015 Update 1
-
-Anything older *may* work.
-
-Note: Using the library and its headers in C++03 is supported. C++11 is only
-required to build the library.
-
-# Known Issues
-
-### Windows
-
-* Users must manually link `shlwapi.lib`. Failure to do so may result
-in unresolved symbols.
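-
-Putting the pieces above together, a minimal, self-contained benchmark
-program looks like the sketch below (the benchmark body is illustrative, and
-it assumes the headers are installed as `<benchmark/benchmark.h>`); on Linux
-with gcc it must also be linked against pthread, as noted above:
-
-```c++
-#include <benchmark/benchmark.h>
-
-#include <string>
-
-static void BM_StringCopy(benchmark::State& state) {
-  std::string x = "hello";
-  while (state.KeepRunning()) {
-    std::string copy(x);
-    benchmark::DoNotOptimize(copy);  // Keep the copy from being elided.
-  }
-}
-BENCHMARK(BM_StringCopy);
-
-BENCHMARK_MAIN();
-```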
- diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/appveyor.yml b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/appveyor.yml deleted file mode 100644 index e084f386b77..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/appveyor.yml +++ /dev/null @@ -1,56 +0,0 @@ -version: '{build}' - -image: Visual Studio 2017 - -configuration: - - Debug - - Release - -environment: - matrix: - - compiler: msvc-15-seh - generator: "Visual Studio 15 2017" - - - compiler: msvc-15-seh - generator: "Visual Studio 15 2017 Win64" - - - compiler: msvc-14-seh - generator: "Visual Studio 14 2015" - - - compiler: msvc-14-seh - generator: "Visual Studio 14 2015 Win64" - - - compiler: msvc-12-seh - generator: "Visual Studio 12 2013" - - - compiler: msvc-12-seh - generator: "Visual Studio 12 2013 Win64" - - - compiler: gcc-5.3.0-posix - generator: "MinGW Makefiles" - cxx_path: 'C:\mingw-w64\i686-5.3.0-posix-dwarf-rt_v4-rev0\mingw32\bin' - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 - -matrix: - fast_finish: true - -install: - # git bash conflicts with MinGW makefiles - - if "%generator%"=="MinGW Makefiles" (set "PATH=%PATH:C:\Program Files\Git\usr\bin;=%") - - if not "%cxx_path%"=="" (set "PATH=%PATH%;%cxx_path%") - -build_script: - - md _build -Force - - cd _build - - echo %configuration% - - cmake -G "%generator%" "-DCMAKE_BUILD_TYPE=%configuration%" .. - - cmake --build . --config %configuration% - -test_script: - - ctest -c %configuration% --timeout 300 --output-on-failure - -artifacts: - - path: '_build/CMakeFiles/*.log' - name: logs - - path: '_build/Testing/**/*.xml' - name: test_results diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/AddCXXCompilerFlag.cmake b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/AddCXXCompilerFlag.cmake deleted file mode 100644 index 0b176ba27f1..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/AddCXXCompilerFlag.cmake +++ /dev/null @@ -1,64 +0,0 @@ -# - Adds a compiler flag if it is supported by the compiler -# -# This function checks that the supplied compiler flag is supported and then -# adds it to the corresponding compiler flags -# -# add_cxx_compiler_flag( []) -# -# - Example -# -# include(AddCXXCompilerFlag) -# add_cxx_compiler_flag(-Wall) -# add_cxx_compiler_flag(-no-strict-aliasing RELEASE) -# Requires CMake 2.6+ - -if(__add_cxx_compiler_flag) - return() -endif() -set(__add_cxx_compiler_flag INCLUDED) - -include(CheckCXXCompilerFlag) - -function(mangle_compiler_flag FLAG OUTPUT) - string(TOUPPER "HAVE_CXX_FLAG_${FLAG}" SANITIZED_FLAG) - string(REPLACE "+" "X" SANITIZED_FLAG ${SANITIZED_FLAG}) - string(REGEX REPLACE "[^A-Za-z_0-9]" "_" SANITIZED_FLAG ${SANITIZED_FLAG}) - string(REGEX REPLACE "_+" "_" SANITIZED_FLAG ${SANITIZED_FLAG}) - set(${OUTPUT} "${SANITIZED_FLAG}" PARENT_SCOPE) -endfunction(mangle_compiler_flag) - -function(add_cxx_compiler_flag FLAG) - mangle_compiler_flag("${FLAG}" MANGLED_FLAG) - set(OLD_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}") - set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${FLAG}") - check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG}) - set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}") - if(${MANGLED_FLAG}) - set(VARIANT ${ARGV1}) - if(ARGV1) - string(TOUPPER "_${VARIANT}" VARIANT) - endif() - set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE) - endif() -endfunction() - -function(add_required_cxx_compiler_flag FLAG) - mangle_compiler_flag("${FLAG}" 
MANGLED_FLAG) - set(OLD_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}") - set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${FLAG}") - check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG}) - set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}") - if(${MANGLED_FLAG}) - set(VARIANT ${ARGV1}) - if(ARGV1) - string(TOUPPER "_${VARIANT}" VARIANT) - endif() - set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE) - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE) - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE) - set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE) - set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${FLAG}" PARENT_SCOPE) - else() - message(FATAL_ERROR "Required flag '${FLAG}' is not supported by the compiler") - endif() -endfunction() diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/CXXFeatureCheck.cmake b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/CXXFeatureCheck.cmake deleted file mode 100644 index 2c4460f0e30..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/CXXFeatureCheck.cmake +++ /dev/null @@ -1,46 +0,0 @@ -# - Compile and run code to check for C++ features -# -# This functions compiles a source file under the `cmake` folder -# and adds the corresponding `HAVE_[FILENAME]` flag to the CMake -# environment -# -# cxx_feature_check( []) -# -# - Example -# -# include(CXXFeatureCheck) -# cxx_feature_check(STD_REGEX) -# Requires CMake 2.8.12+ - -if(__cxx_feature_check) - return() -endif() -set(__cxx_feature_check INCLUDED) - -function(cxx_feature_check FILE) - string(TOLOWER ${FILE} FILE) - string(TOUPPER ${FILE} VAR) - string(TOUPPER "HAVE_${VAR}" FEATURE) - if (DEFINED HAVE_${VAR}) - set(HAVE_${VAR} 1 CACHE INTERNAL "Feature test for ${FILE}" PARENT_SCOPE) - add_definitions(-DHAVE_${VAR}) - return() - endif() - message("-- Performing Test ${FEATURE}") - try_run(RUN_${FEATURE} COMPILE_${FEATURE} - ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp - CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS} - LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES}) - if(RUN_${FEATURE} EQUAL 0) - message("-- Performing Test ${FEATURE} -- success") - set(HAVE_${VAR} 1 CACHE INTERNAL "Feature test for ${FILE}" PARENT_SCOPE) - add_definitions(-DHAVE_${VAR}) - else() - if(NOT COMPILE_${FEATURE}) - message("-- Performing Test ${FEATURE} -- failed to compile") - else() - message("-- Performing Test ${FEATURE} -- compiled but failed to run") - endif() - endif() -endfunction() - diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/Config.cmake.in b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/Config.cmake.in deleted file mode 100644 index 6e9256eea8a..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/Config.cmake.in +++ /dev/null @@ -1 +0,0 @@ -include("${CMAKE_CURRENT_LIST_DIR}/@targets_export_name@.cmake") diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/GetGitVersion.cmake b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/GetGitVersion.cmake deleted file mode 100644 index 8dd94800459..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/GetGitVersion.cmake +++ /dev/null @@ -1,51 +0,0 @@ -# - Returns a version string from Git tags -# -# This function inspects the annotated git tags for the project and returns a string -# into a CMake variable -# 
-# get_git_version(<var>)
-#
-# - Example
-#
-# include(GetGitVersion)
-# get_git_version(GIT_VERSION)
-#
-# Requires CMake 2.8.11+
-find_package(Git)
-
-if(__get_git_version)
-  return()
-endif()
-set(__get_git_version INCLUDED)
-
-function(get_git_version var)
-  if(GIT_EXECUTABLE)
-    execute_process(COMMAND ${GIT_EXECUTABLE} describe --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8
-      RESULT_VARIABLE status
-      OUTPUT_VARIABLE GIT_VERSION
-      ERROR_QUIET)
-    if(${status})
-      set(GIT_VERSION "v0.0.0")
-    else()
-      string(STRIP ${GIT_VERSION} GIT_VERSION)
-      string(REGEX REPLACE "-[0-9]+-g" "-" GIT_VERSION ${GIT_VERSION})
-    endif()
-
-    # Work out if the repository is dirty
-    execute_process(COMMAND ${GIT_EXECUTABLE} update-index -q --refresh
-      OUTPUT_QUIET
-      ERROR_QUIET)
-    execute_process(COMMAND ${GIT_EXECUTABLE} diff-index --name-only HEAD --
-      OUTPUT_VARIABLE GIT_DIFF_INDEX
-      ERROR_QUIET)
-    string(COMPARE NOTEQUAL "${GIT_DIFF_INDEX}" "" GIT_DIRTY)
-    if (${GIT_DIRTY})
-      set(GIT_VERSION "${GIT_VERSION}-dirty")
-    endif()
-  else()
-    set(GIT_VERSION "v0.0.0")
-  endif()
-
-  message("-- git Version: ${GIT_VERSION}")
-  set(${var} ${GIT_VERSION} PARENT_SCOPE)
-endfunction()
diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/gnu_posix_regex.cpp b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/gnu_posix_regex.cpp
deleted file mode 100644
index b5b91cdab7c..00000000000
--- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/gnu_posix_regex.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-#include <regex.h>
-#include <string>
-int main() {
-  std::string str = "test0159";
-  regex_t re;
-  int ec = regcomp(&re, "^[a-z]+[0-9]+$", REG_EXTENDED | REG_NOSUB);
-  if (ec != 0) {
-    return ec;
-  }
-  return regexec(&re, str.c_str(), 0, nullptr, 0) ? -1 : 0;
-}
-
diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/posix_regex.cpp b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/posix_regex.cpp
deleted file mode 100644
index 466dc62560a..00000000000
--- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/posix_regex.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <regex.h>
-#include <string>
-int main() {
-  std::string str = "test0159";
-  regex_t re;
-  int ec = regcomp(&re, "^[a-z]+[0-9]+$", REG_EXTENDED | REG_NOSUB);
-  if (ec != 0) {
-    return ec;
-  }
-  int ret = regexec(&re, str.c_str(), 0, nullptr, 0) ? -1 : 0;
-  regfree(&re);
-  return ret;
-}
-
diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/std_regex.cpp b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/std_regex.cpp
deleted file mode 100644
index 696f2a26bce..00000000000
--- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/std_regex.cpp
+++ /dev/null
@@ -1,10 +0,0 @@
-#include <regex>
-#include <string>
-int main() {
-  const std::string str = "test0159";
-  std::regex re;
-  re = std::regex("^[a-z]+[0-9]+$",
-                  std::regex_constants::extended | std::regex_constants::nosubs);
-  return std::regex_search(str, re) ? 0 : -1;
-}
-
diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/steady_clock.cpp b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/steady_clock.cpp
deleted file mode 100644
index 66d50d17e9e..00000000000
--- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/steady_clock.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <chrono>
-
-int main() {
-  typedef std::chrono::steady_clock Clock;
-  Clock::time_point tp = Clock::now();
-  ((void)tp);
-}
diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/thread_safety_attributes.cpp b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/thread_safety_attributes.cpp
deleted file mode 100644
index 46161babdb1..00000000000
--- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/cmake/thread_safety_attributes.cpp
+++ /dev/null
@@ -1,4 +0,0 @@
-#define HAVE_THREAD_SAFETY_ATTRIBUTES
-#include "../src/mutex.h"
-
-int main() {}
diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/docs/tools.md b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/docs/tools.md
deleted file mode 100644
index f176f74a48f..00000000000
--- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/docs/tools.md
+++ /dev/null
@@ -1,59 +0,0 @@
-# Benchmark Tools
-
-## compare_bench.py
-
-The `compare_bench.py` utility can be used to compare the results of benchmarks.
-The program is invoked like:
-
-``` bash
-$ compare_bench.py <old-benchmark> <new-benchmark> [benchmark options]...
-```
-
-Where `<old-benchmark>` and `<new-benchmark>` either specify a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file.
-
-The sample output using the JSON test files under `Inputs/` gives:
-
-``` bash
-$ ./compare_bench.py ./gbench/Inputs/test1_run1.json ./gbench/Inputs/test1_run2.json
-Comparing ./gbench/Inputs/test1_run1.json to ./gbench/Inputs/test1_run2.json
-Benchmark                   Time           CPU
-----------------------------------------------
-BM_SameTimes               +0.00          +0.00
-BM_2xFaster                -0.50          -0.50
-BM_2xSlower                +1.00          +1.00
-BM_10PercentFaster         -0.10          -0.10
-BM_10PercentSlower         +0.10          +0.10
-```
-
-When a benchmark executable is run, the raw output from the benchmark is printed in real time to stdout.
The sample output using `benchmark/basic_test` for both arguments looks like: - -``` -./compare_bench.py test/basic_test test/basic_test --benchmark_filter=BM_empty.* -RUNNING: test/basic_test --benchmark_filter=BM_empty.* -Run on (4 X 4228.32 MHz CPU s) -2016-08-02 19:21:33 -Benchmark Time CPU Iterations --------------------------------------------------------------------- -BM_empty 9 ns 9 ns 79545455 -BM_empty/threads:4 4 ns 9 ns 75268816 -BM_empty_stop_start 8 ns 8 ns 83333333 -BM_empty_stop_start/threads:4 3 ns 8 ns 83333332 -RUNNING: test/basic_test --benchmark_filter=BM_empty.* -Run on (4 X 4228.32 MHz CPU s) -2016-08-02 19:21:35 -Benchmark Time CPU Iterations --------------------------------------------------------------------- -BM_empty 9 ns 9 ns 76086957 -BM_empty/threads:4 4 ns 9 ns 76086956 -BM_empty_stop_start 8 ns 8 ns 87500000 -BM_empty_stop_start/threads:4 3 ns 8 ns 88607596 -Comparing test/basic_test to test/basic_test -Benchmark Time CPU ---------------------------------------------------------- -BM_empty +0.00 +0.00 -BM_empty/threads:4 +0.00 +0.00 -BM_empty_stop_start +0.00 +0.00 -BM_empty_stop_start/threads:4 +0.00 +0.00 -``` - -Obviously this example doesn't give any useful output, but it's intended to show the output format when 'compare_bench.py' needs to run benchmarks. diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/include/benchmark/benchmark.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/include/benchmark/benchmark.h deleted file mode 100644 index bd3b0ffb4cb..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/include/benchmark/benchmark.h +++ /dev/null @@ -1,1210 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Support for registering benchmarks for functions. - -/* Example usage: -// Define a function that executes the code to be measured a -// specified number of times: -static void BM_StringCreation(benchmark::State& state) { - while (state.KeepRunning()) - std::string empty_string; -} - -// Register the function as a benchmark -BENCHMARK(BM_StringCreation); - -// Define another benchmark -static void BM_StringCopy(benchmark::State& state) { - std::string x = "hello"; - while (state.KeepRunning()) - std::string copy(x); -} -BENCHMARK(BM_StringCopy); - -// Augment the main() program to invoke benchmarks if specified -// via the --benchmarks command line flag. E.g., -// my_unittest --benchmark_filter=all -// my_unittest --benchmark_filter=BM_StringCreation -// my_unittest --benchmark_filter=String -// my_unittest --benchmark_filter='Copy|Creation' -int main(int argc, char** argv) { - benchmark::Initialize(&argc, argv); - benchmark::RunSpecifiedBenchmarks(); - return 0; -} - -// Sometimes a family of microbenchmarks can be implemented with -// just one routine that takes an extra argument to specify which -// one of the family of benchmarks to run. 
For example, the following
-// code defines a family of microbenchmarks for measuring the speed
-// of memcpy() calls of different lengths:
-
-static void BM_memcpy(benchmark::State& state) {
-  char* src = new char[state.range(0)]; char* dst = new char[state.range(0)];
-  memset(src, 'x', state.range(0));
-  while (state.KeepRunning())
-    memcpy(dst, src, state.range(0));
-  state.SetBytesProcessed(int64_t(state.iterations()) *
-                          int64_t(state.range(0)));
-  delete[] src; delete[] dst;
-}
-BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10);
-
-// The preceding code is quite repetitive, and can be replaced with the
-// following short-hand. The following invocation will pick a few
-// appropriate arguments in the specified range and will generate a
-// microbenchmark for each such argument.
-BENCHMARK(BM_memcpy)->Range(8, 8<<10);
-
-// You might have a microbenchmark that depends on two inputs. For
-// example, the following code defines a family of microbenchmarks for
-// measuring the speed of set insertion.
-static void BM_SetInsert(benchmark::State& state) {
-  while (state.KeepRunning()) {
-    state.PauseTiming();
-    std::set<int> data = ConstructRandomSet(state.range(0));
-    state.ResumeTiming();
-    for (int j = 0; j < state.range(1); ++j)
-      data.insert(RandomNumber());
-  }
-}
-BENCHMARK(BM_SetInsert)
-    ->Args({1<<10, 1})
-    ->Args({1<<10, 8})
-    ->Args({1<<10, 64})
-    ->Args({1<<10, 512})
-    ->Args({8<<10, 1})
-    ->Args({8<<10, 8})
-    ->Args({8<<10, 64})
-    ->Args({8<<10, 512});
-
-// The preceding code is quite repetitive, and can be replaced with
-// the following short-hand. The following macro will pick a few
-// appropriate arguments in the product of the two specified ranges
-// and will generate a microbenchmark for each such pair.
-BENCHMARK(BM_SetInsert)->Ranges({{1<<10, 8<<10}, {1, 512}});
-
-// For more complex patterns of inputs, passing a custom function
-// to Apply allows programmatic specification of an
-// arbitrary set of arguments to run the microbenchmark on.
-// The following example enumerates a dense range on
-// one parameter, and a sparse range on the second.
-static void CustomArguments(benchmark::internal::Benchmark* b) {
-  for (int i = 0; i <= 10; ++i)
-    for (int j = 32; j <= 1024*1024; j *= 8)
-      b->Args({i, j});
-}
-BENCHMARK(BM_SetInsert)->Apply(CustomArguments);
-
-// Templated microbenchmarks work the same way:
-// Produce then consume 'size' messages 'iters' times
-// Measures throughput in the absence of multiprogramming.
-template <class Q> int BM_Sequential(benchmark::State& state) {
-  Q q;
-  typename Q::value_type v;
-  while (state.KeepRunning()) {
-    for (int i = state.range(0); i--; )
-      q.push(v);
-    for (int e = state.range(0); e--; )
-      q.Wait(&v);
-  }
-  // actually messages, not bytes:
-  state.SetBytesProcessed(
-      static_cast<int64_t>(state.iterations())*state.range(0));
-}
-BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
-
-Use `Benchmark::MinTime(double t)` to set the minimum time used to run the
-benchmark. This option overrides the `benchmark_min_time` flag.
-
-void BM_test(benchmark::State& state) {
-  ... body ...
-}
-BENCHMARK(BM_test)->MinTime(2.0); // Run for at least 2 seconds.
-
-In a multithreaded test, it is guaranteed that none of the threads will start
-until all have called KeepRunning, and all will have finished before KeepRunning
-returns false.
As such, any global setup or teardown you want to do can be -wrapped in a check against the thread index: - -static void BM_MultiThreaded(benchmark::State& state) { - if (state.thread_index == 0) { - // Setup code here. - } - while (state.KeepRunning()) { - // Run the test as normal. - } - if (state.thread_index == 0) { - // Teardown code here. - } -} -BENCHMARK(BM_MultiThreaded)->Threads(4); - - -If a benchmark runs a few milliseconds it may be hard to visually compare the -measured times, since the output data is given in nanoseconds per default. In -order to manually set the time unit, you can specify it manually: - -BENCHMARK(BM_test)->Unit(benchmark::kMillisecond); -*/ - -#ifndef BENCHMARK_BENCHMARK_H_ -#define BENCHMARK_BENCHMARK_H_ - - -#if __cplusplus >= 201103L -#define BENCHMARK_HAS_CXX11 -#endif - -#include - -#include -#include -#include -#include -#include -#include -#include - -#if defined(BENCHMARK_HAS_CXX11) -#include -#include -#include -#endif - -#if defined(_MSC_VER) -#include // for _ReadWriteBarrier -#endif - -#ifndef BENCHMARK_HAS_CXX11 -#define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName) \ - TypeName(const TypeName&); \ - TypeName& operator=(const TypeName&) -#else -#define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName) \ - TypeName(const TypeName&) = delete; \ - TypeName& operator=(const TypeName&) = delete -#endif - -#if defined(__GNUC__) -#define BENCHMARK_UNUSED __attribute__((unused)) -#define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline)) -#define BENCHMARK_NOEXCEPT noexcept -#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x) -#elif defined(_MSC_VER) && !defined(__clang__) -#define BENCHMARK_UNUSED -#define BENCHMARK_ALWAYS_INLINE __forceinline -#if _MSC_VER >= 1900 -#define BENCHMARK_NOEXCEPT noexcept -#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x) -#else -#define BENCHMARK_NOEXCEPT -#define BENCHMARK_NOEXCEPT_OP(x) -#endif -#define __func__ __FUNCTION__ -#else -#define BENCHMARK_UNUSED -#define BENCHMARK_ALWAYS_INLINE -#define BENCHMARK_NOEXCEPT -#define BENCHMARK_NOEXCEPT_OP(x) -#endif - -#define BENCHMARK_INTERNAL_TOSTRING2(x) #x -#define BENCHMARK_INTERNAL_TOSTRING(x) BENCHMARK_INTERNAL_TOSTRING2(x) - -#if defined(__GNUC__) -#define BENCHMARK_BUILTIN_EXPECT(x, y) __builtin_expect(x, y) -#define BENCHMARK_DEPRECATED_MSG(msg) __attribute__((deprecated(msg))) -#else -#define BENCHMARK_BUILTIN_EXPECT(x, y) x -#define BENCHMARK_DEPRECATED_MSG(msg) -#define BENCHMARK_WARNING_MSG(msg) __pragma(message(__FILE__ "(" BENCHMARK_INTERNAL_TOSTRING(__LINE__) ") : warning note: " msg)) -#endif - -#if defined(__GNUC__) && !defined(__clang__) -#define BENCHMARK_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) -#endif - - -namespace benchmark { -class BenchmarkReporter; - -void Initialize(int* argc, char** argv); - -// Report to stdout all arguments in 'argv' as unrecognized except the first. -// Returns true there is at least on unrecognized argument (i.e. 'argc' > 1). -bool ReportUnrecognizedArguments(int argc, char** argv); - -// Generate a list of benchmarks matching the specified --benchmark_filter flag -// and if --benchmark_list_tests is specified return after printing the name -// of each matching benchmark. Otherwise run each matching benchmark and -// report the results. -// -// The second and third overload use the specified 'console_reporter' and -// 'file_reporter' respectively. 'file_reporter' will write to the file -// specified -// by '--benchmark_output'. If '--benchmark_output' is not given the -// 'file_reporter' is ignored. 
-//
-// RETURNS: The number of matching benchmarks.
-size_t RunSpecifiedBenchmarks();
-size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter);
-size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter,
-                              BenchmarkReporter* file_reporter);
-
-// If this routine is called, peak memory allocation past this point in the
-// benchmark is reported at the end of the benchmark report line. (It is
-// computed by running the benchmark once with a single iteration and a memory
-// tracer.)
-// TODO(dominic)
-// void MemoryUsage();
-
-namespace internal {
-class Benchmark;
-class BenchmarkImp;
-class BenchmarkFamilies;
-
-void UseCharPointer(char const volatile*);
-
-// Take ownership of the pointer and register the benchmark. Return the
-// registered benchmark.
-Benchmark* RegisterBenchmarkInternal(Benchmark*);
-
-// Ensure that the standard streams are properly initialized in every TU.
-int InitializeStreams();
-BENCHMARK_UNUSED static int stream_init_anchor = InitializeStreams();
-
-}  // namespace internal
-
-
-#if !defined(__GNUC__) || defined(__pnacl__) || defined(EMSCRIPTEN)
-# define BENCHMARK_HAS_NO_INLINE_ASSEMBLY
-#endif
-
-// The DoNotOptimize(...) function can be used to prevent a value or
-// expression from being optimized away by the compiler. This function is
-// intended to add little to no overhead.
-// See: https://youtu.be/nXaxk27zwlk?t=2441
-#ifndef BENCHMARK_HAS_NO_INLINE_ASSEMBLY
-template <class Tp>
-inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
-  // Clang doesn't like the 'X' constraint on `value` and certain GCC versions
-  // don't like the 'g' constraint. Attempt to placate them both.
-#if defined(__clang__)
-  asm volatile("" : : "g"(value) : "memory");
-#else
-  asm volatile("" : : "i,r,m"(value) : "memory");
-#endif
-}
-// Force the compiler to flush pending writes to global memory. Acts as an
-// effective read/write barrier
-inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
-  asm volatile("" : : : "memory");
-}
-#elif defined(_MSC_VER)
-template <class Tp>
-inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
-  internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
-  _ReadWriteBarrier();
-}
-
-inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
-  _ReadWriteBarrier();
-}
-#else
-template <class Tp>
-inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
-  internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
-}
-// FIXME Add ClobberMemory() for non-gnu and non-msvc compilers
-#endif
-
-
-
-// This class is used for user-defined counters.
-class Counter {
-public:
-
-  enum Flags {
-    kDefaults = 0,
-    // Mark the counter as a rate. It will be presented divided
-    // by the duration of the benchmark.
-    kIsRate = 1,
-    // Mark the counter as a thread-average quantity. It will be
-    // presented divided by the number of threads.
-    kAvgThreads = 2,
-    // Mark the counter as a thread-average rate. See above.
-    kAvgThreadsRate = kIsRate|kAvgThreads
-  };
-
-  double value;
-  Flags flags;
-
-  BENCHMARK_ALWAYS_INLINE
-  Counter(double v = 0., Flags f = kDefaults) : value(v), flags(f) {}
-
-  BENCHMARK_ALWAYS_INLINE operator double const& () const { return value; }
-  BENCHMARK_ALWAYS_INLINE operator double& () { return value; }
-
-};
-
-// This is the container for the user-defined counters.
-typedef std::map<std::string, Counter> UserCounters;
-
-
-// TimeUnit is passed to a benchmark in order to specify the order of magnitude
-// for the measured time.
-enum TimeUnit { kNanosecond, kMicrosecond, kMillisecond }; - -// BigO is passed to a benchmark in order to specify the asymptotic -// computational -// complexity for the benchmark. In case oAuto is selected, complexity will be -// calculated automatically to the best fit. -enum BigO { oNone, o1, oN, oNSquared, oNCubed, oLogN, oNLogN, oAuto, oLambda }; - -// BigOFunc is passed to a benchmark in order to specify the asymptotic -// computational complexity for the benchmark. -typedef double(BigOFunc)(int); - -namespace internal { -class ThreadTimer; -class ThreadManager; - -enum ReportMode -#if defined(BENCHMARK_HAS_CXX11) - : unsigned -#else -#endif - { - RM_Unspecified, // The mode has not been manually specified - RM_Default, // The mode is user-specified as default. - RM_ReportAggregatesOnly -}; -} // namespace internal - -// State is passed to a running Benchmark and contains state for the -// benchmark to use. -class State { - public: - // Returns true if the benchmark should continue through another iteration. - // NOTE: A benchmark may not return from the test until KeepRunning() has - // returned false. - bool KeepRunning() { - if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) { - StartKeepRunning(); - } - bool const res = total_iterations_++ < max_iterations; - if (BENCHMARK_BUILTIN_EXPECT(!res, false)) { - FinishKeepRunning(); - } - return res; - } - - // REQUIRES: timer is running and 'SkipWithError(...)' has not been called - // by the current thread. - // Stop the benchmark timer. If not called, the timer will be - // automatically stopped after KeepRunning() returns false for the first time. - // - // For threaded benchmarks the PauseTiming() function only pauses the timing - // for the current thread. - // - // NOTE: The "real time" measurement is per-thread. If different threads - // report different measurements the largest one is reported. - // - // NOTE: PauseTiming()/ResumeTiming() are relatively - // heavyweight, and so their use should generally be avoided - // within each benchmark iteration, if possible. - void PauseTiming(); - - // REQUIRES: timer is not running and 'SkipWithError(...)' has not been called - // by the current thread. - // Start the benchmark timer. The timer is NOT running on entrance to the - // benchmark function. It begins running after the first call to KeepRunning() - // - // NOTE: PauseTiming()/ResumeTiming() are relatively - // heavyweight, and so their use should generally be avoided - // within each benchmark iteration, if possible. - void ResumeTiming(); - - // REQUIRES: 'SkipWithError(...)' has not been called previously by the - // current thread. - // Skip any future iterations of the 'KeepRunning()' loop in the current - // thread and report an error with the specified 'msg'. After this call - // the user may explicitly 'return' from the benchmark. - // - // For threaded benchmarks only the current thread stops executing and future - // calls to `KeepRunning()` will block until all threads have completed - // the `KeepRunning()` loop. If multiple threads report an error only the - // first error message is used. - // - // NOTE: Calling 'SkipWithError(...)' does not cause the benchmark to exit - // the current scope immediately. If the function is called from within - // the 'KeepRunning()' loop the current iteration will finish. It is the users - // responsibility to exit the scope as needed. - void SkipWithError(const char* msg); - - // REQUIRES: called exactly once per iteration of the KeepRunning loop. 
- // Set the manually measured time for this benchmark iteration, which - // is used instead of automatically measured time if UseManualTime() was - // specified. - // - // For threaded benchmarks the final value will be set to the largest - // reported values. - void SetIterationTime(double seconds); - - // Set the number of bytes processed by the current benchmark - // execution. This routine is typically called once at the end of a - // throughput oriented benchmark. If this routine is called with a - // value > 0, the report is printed in MB/sec instead of nanoseconds - // per iteration. - // - // REQUIRES: a benchmark has exited its KeepRunning loop. - BENCHMARK_ALWAYS_INLINE - void SetBytesProcessed(size_t bytes) { bytes_processed_ = bytes; } - - BENCHMARK_ALWAYS_INLINE - size_t bytes_processed() const { return bytes_processed_; } - - // If this routine is called with complexity_n > 0 and complexity report is - // requested for the - // family benchmark, then current benchmark will be part of the computation - // and complexity_n will - // represent the length of N. - BENCHMARK_ALWAYS_INLINE - void SetComplexityN(int complexity_n) { complexity_n_ = complexity_n; } - - BENCHMARK_ALWAYS_INLINE - int complexity_length_n() { return complexity_n_; } - - // If this routine is called with items > 0, then an items/s - // label is printed on the benchmark report line for the currently - // executing benchmark. It is typically called at the end of a processing - // benchmark where a processing items/second output is desired. - // - // REQUIRES: a benchmark has exited its KeepRunning loop. - BENCHMARK_ALWAYS_INLINE - void SetItemsProcessed(size_t items) { items_processed_ = items; } - - BENCHMARK_ALWAYS_INLINE - size_t items_processed() const { return items_processed_; } - - // If this routine is called, the specified label is printed at the - // end of the benchmark report line for the currently executing - // benchmark. Example: - // static void BM_Compress(benchmark::State& state) { - // ... - // double compress = input_size / output_size; - // state.SetLabel(StringPrintf("compress:%.1f%%", 100.0*compression)); - // } - // Produces output that looks like: - // BM_Compress 50 50 14115038 compress:27.3% - // - // REQUIRES: a benchmark has exited its KeepRunning loop. - void SetLabel(const char* label); - - void BENCHMARK_ALWAYS_INLINE SetLabel(const std::string& str) { - this->SetLabel(str.c_str()); - } - - // Range arguments for this run. CHECKs if the argument has been set. - BENCHMARK_ALWAYS_INLINE - int range(std::size_t pos = 0) const { - assert(range_.size() > pos); - return range_[pos]; - } - - BENCHMARK_DEPRECATED_MSG("use 'range(0)' instead") - int range_x() const { return range(0); } - - BENCHMARK_DEPRECATED_MSG("use 'range(1)' instead") - int range_y() const { return range(1); } - - BENCHMARK_ALWAYS_INLINE - size_t iterations() const { return total_iterations_; } - - private: - bool started_; - bool finished_; - size_t total_iterations_; - - std::vector range_; - - size_t bytes_processed_; - size_t items_processed_; - - int complexity_n_; - - bool error_occurred_; - - public: - // Container for user-defined counters. - UserCounters counters; - // Index of the executing thread. Values from [0, threads). - const int thread_index; - // Number of threads concurrently executing the benchmark. 
- const int threads; - const size_t max_iterations; - - // TODO(EricWF) make me private - State(size_t max_iters, const std::vector& ranges, int thread_i, - int n_threads, internal::ThreadTimer* timer, - internal::ThreadManager* manager); - - private: - void StartKeepRunning(); - void FinishKeepRunning(); - internal::ThreadTimer* timer_; - internal::ThreadManager* manager_; - BENCHMARK_DISALLOW_COPY_AND_ASSIGN(State); -}; - -namespace internal { - -typedef void(Function)(State&); - -// ------------------------------------------------------ -// Benchmark registration object. The BENCHMARK() macro expands -// into an internal::Benchmark* object. Various methods can -// be called on this object to change the properties of the benchmark. -// Each method returns "this" so that multiple method calls can -// chained into one expression. -class Benchmark { - public: - virtual ~Benchmark(); - - // Note: the following methods all return "this" so that multiple - // method calls can be chained together in one expression. - - // Run this benchmark once with "x" as the extra argument passed - // to the function. - // REQUIRES: The function passed to the constructor must accept an arg1. - Benchmark* Arg(int x); - - // Run this benchmark with the given time unit for the generated output report - Benchmark* Unit(TimeUnit unit); - - // Run this benchmark once for a number of values picked from the - // range [start..limit]. (start and limit are always picked.) - // REQUIRES: The function passed to the constructor must accept an arg1. - Benchmark* Range(int start, int limit); - - // Run this benchmark once for all values in the range [start..limit] with - // specific step - // REQUIRES: The function passed to the constructor must accept an arg1. - Benchmark* DenseRange(int start, int limit, int step = 1); - - // Run this benchmark once with "args" as the extra arguments passed - // to the function. - // REQUIRES: The function passed to the constructor must accept arg1, arg2 ... - Benchmark* Args(const std::vector& args); - - // Equivalent to Args({x, y}) - // NOTE: This is a legacy C++03 interface provided for compatibility only. - // New code should use 'Args'. - Benchmark* ArgPair(int x, int y) { - std::vector args; - args.push_back(x); - args.push_back(y); - return Args(args); - } - - // Run this benchmark once for a number of values picked from the - // ranges [start..limit]. (starts and limits are always picked.) - // REQUIRES: The function passed to the constructor must accept arg1, arg2 ... - Benchmark* Ranges(const std::vector >& ranges); - - // Equivalent to ArgNames({name}) - Benchmark* ArgName(const std::string& name); - - // Set the argument names to display in the benchmark name. If not called, - // only argument values will be shown. - Benchmark* ArgNames(const std::vector& names); - - // Equivalent to Ranges({{lo1, hi1}, {lo2, hi2}}). - // NOTE: This is a legacy C++03 interface provided for compatibility only. - // New code should use 'Ranges'. - Benchmark* RangePair(int lo1, int hi1, int lo2, int hi2) { - std::vector > ranges; - ranges.push_back(std::make_pair(lo1, hi1)); - ranges.push_back(std::make_pair(lo2, hi2)); - return Ranges(ranges); - } - - // Pass this benchmark object to *func, which can customize - // the benchmark by calling various methods like Arg, Args, - // Threads, etc. - Benchmark* Apply(void (*func)(Benchmark* benchmark)); - - // Set the range multiplier for non-dense range. If not called, the range - // multiplier kRangeMultiplier will be used. 
- Benchmark* RangeMultiplier(int multiplier); - - // Set the minimum amount of time to use when running this benchmark. This - // option overrides the `benchmark_min_time` flag. - // REQUIRES: `t > 0` and `Iterations` has not been called on this benchmark. - Benchmark* MinTime(double t); - - // Specify the amount of iterations that should be run by this benchmark. - // REQUIRES: 'n > 0' and `MinTime` has not been called on this benchmark. - // - // NOTE: This function should only be used when *exact* iteration control is - // needed and never to control or limit how long a benchmark runs, where - // `--benchmark_min_time=N` or `MinTime(...)` should be used instead. - Benchmark* Iterations(size_t n); - - // Specify the amount of times to repeat this benchmark. This option overrides - // the `benchmark_repetitions` flag. - // REQUIRES: `n > 0` - Benchmark* Repetitions(int n); - - // Specify if each repetition of the benchmark should be reported separately - // or if only the final statistics should be reported. If the benchmark - // is not repeated then the single result is always reported. - Benchmark* ReportAggregatesOnly(bool value = true); - - // If a particular benchmark is I/O bound, runs multiple threads internally or - // if for some reason CPU timings are not representative, call this method. If - // called, the elapsed time will be used to control how many iterations are - // run, and in the printing of items/second or MB/seconds values. If not - // called, the cpu time used by the benchmark will be used. - Benchmark* UseRealTime(); - - // If a benchmark must measure time manually (e.g. if GPU execution time is - // being - // measured), call this method. If called, each benchmark iteration should - // call - // SetIterationTime(seconds) to report the measured time, which will be used - // to control how many iterations are run, and in the printing of items/second - // or MB/second values. - Benchmark* UseManualTime(); - - // Set the asymptotic computational complexity for the benchmark. If called - // the asymptotic computational complexity will be shown on the output. - Benchmark* Complexity(BigO complexity = benchmark::oAuto); - - // Set the asymptotic computational complexity for the benchmark. If called - // the asymptotic computational complexity will be shown on the output. - Benchmark* Complexity(BigOFunc* complexity); - - // Support for running multiple copies of the same benchmark concurrently - // in multiple threads. This may be useful when measuring the scaling - // of some piece of code. - - // Run one instance of this benchmark concurrently in t threads. - Benchmark* Threads(int t); - - // Pick a set of values T from [min_threads,max_threads]. - // min_threads and max_threads are always included in T. Run this - // benchmark once for each value in T. The benchmark run for a - // particular value t consists of t threads running the benchmark - // function concurrently. For example, consider: - // BENCHMARK(Foo)->ThreadRange(1,16); - // This will run the following benchmarks: - // Foo in 1 thread - // Foo in 2 threads - // Foo in 4 threads - // Foo in 8 threads - // Foo in 16 threads - Benchmark* ThreadRange(int min_threads, int max_threads); - - // For each value n in the range, run this benchmark once using n threads. - // min_threads and max_threads are always included in the range. - // stride specifies the increment. E.g. DenseThreadRange(1, 8, 3) starts - // a benchmark with 1, 4, 7 and 8 threads. 
- Benchmark* DenseThreadRange(int min_threads, int max_threads, int stride = 1); - - // Equivalent to ThreadRange(NumCPUs(), NumCPUs()) - Benchmark* ThreadPerCpu(); - - virtual void Run(State& state) = 0; - - // Used inside the benchmark implementation - struct Instance; - - protected: - explicit Benchmark(const char* name); - Benchmark(Benchmark const&); - void SetName(const char* name); - - int ArgsCnt() const; - - static void AddRange(std::vector* dst, int lo, int hi, int mult); - - private: - friend class BenchmarkFamilies; - - std::string name_; - ReportMode report_mode_; - std::vector arg_names_; // Args for all benchmark runs - std::vector > args_; // Args for all benchmark runs - TimeUnit time_unit_; - int range_multiplier_; - double min_time_; - size_t iterations_; - int repetitions_; - bool use_real_time_; - bool use_manual_time_; - BigO complexity_; - BigOFunc* complexity_lambda_; - std::vector thread_counts_; - - Benchmark& operator=(Benchmark const&); -}; - -} // namespace internal - -// Create and register a benchmark with the specified 'name' that invokes -// the specified functor 'fn'. -// -// RETURNS: A pointer to the registered benchmark. -internal::Benchmark* RegisterBenchmark(const char* name, - internal::Function* fn); - -#if defined(BENCHMARK_HAS_CXX11) -template -internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn); -#endif - -// Remove all registered benchmarks. All pointers to previously registered -// benchmarks are invalidated. -void ClearRegisteredBenchmarks(); - -namespace internal { -// The class used to hold all Benchmarks created from static function. -// (ie those created using the BENCHMARK(...) macros. -class FunctionBenchmark : public Benchmark { - public: - FunctionBenchmark(const char* name, Function* func) - : Benchmark(name), func_(func) {} - - virtual void Run(State& st); - - private: - Function* func_; -}; - -#ifdef BENCHMARK_HAS_CXX11 -template -class LambdaBenchmark : public Benchmark { - public: - virtual void Run(State& st) { lambda_(st); } - - private: - template - LambdaBenchmark(const char* name, OLambda&& lam) - : Benchmark(name), lambda_(std::forward(lam)) {} - - LambdaBenchmark(LambdaBenchmark const&) = delete; - - private: - template - friend Benchmark* ::benchmark::RegisterBenchmark(const char*, Lam&&); - - Lambda lambda_; -}; -#endif - -} // namespace internal - -inline internal::Benchmark* RegisterBenchmark(const char* name, - internal::Function* fn) { - return internal::RegisterBenchmarkInternal( - ::new internal::FunctionBenchmark(name, fn)); -} - -#ifdef BENCHMARK_HAS_CXX11 -template -internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn) { - using BenchType = - internal::LambdaBenchmark::type>; - return internal::RegisterBenchmarkInternal( - ::new BenchType(name, std::forward(fn))); -} -#endif - -#if defined(BENCHMARK_HAS_CXX11) && \ - (!defined(BENCHMARK_GCC_VERSION) || BENCHMARK_GCC_VERSION >= 409) -template -internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn, - Args&&... args) { - return benchmark::RegisterBenchmark( - name, [=](benchmark::State& st) { fn(st, args...); }); -} -#else -#define BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK -#endif - -// The base class for all fixture tests. -class Fixture : public internal::Benchmark { - public: - Fixture() : internal::Benchmark("") {} - - virtual void Run(State& st) { - this->SetUp(st); - this->BenchmarkCase(st); - this->TearDown(st); - } - - // These will be deprecated ... 
- virtual void SetUp(const State&) {} - virtual void TearDown(const State&) {} - // ... In favor of these. - virtual void SetUp(State& st) { SetUp(const_cast(st)); } - virtual void TearDown(State& st) { TearDown(const_cast(st)); } - - protected: - virtual void BenchmarkCase(State&) = 0; -}; - -} // namespace benchmark - -// ------------------------------------------------------ -// Macro to register benchmarks - -// Check that __COUNTER__ is defined and that __COUNTER__ increases by 1 -// every time it is expanded. X + 1 == X + 0 is used in case X is defined to be -// empty. If X is empty the expression becomes (+1 == +0). -#if defined(__COUNTER__) && (__COUNTER__ + 1 == __COUNTER__ + 0) -#define BENCHMARK_PRIVATE_UNIQUE_ID __COUNTER__ -#else -#define BENCHMARK_PRIVATE_UNIQUE_ID __LINE__ -#endif - -// Helpers for generating unique variable names -#define BENCHMARK_PRIVATE_NAME(n) \ - BENCHMARK_PRIVATE_CONCAT(_benchmark_, BENCHMARK_PRIVATE_UNIQUE_ID, n) -#define BENCHMARK_PRIVATE_CONCAT(a, b, c) BENCHMARK_PRIVATE_CONCAT2(a, b, c) -#define BENCHMARK_PRIVATE_CONCAT2(a, b, c) a##b##c - -#define BENCHMARK_PRIVATE_DECLARE(n) \ - static ::benchmark::internal::Benchmark* BENCHMARK_PRIVATE_NAME(n) \ - BENCHMARK_UNUSED - -#define BENCHMARK(n) \ - BENCHMARK_PRIVATE_DECLARE(n) = \ - (::benchmark::internal::RegisterBenchmarkInternal( \ - new ::benchmark::internal::FunctionBenchmark(#n, n))) - -// Old-style macros -#define BENCHMARK_WITH_ARG(n, a) BENCHMARK(n)->Arg((a)) -#define BENCHMARK_WITH_ARG2(n, a1, a2) BENCHMARK(n)->Args({(a1), (a2)}) -#define BENCHMARK_WITH_UNIT(n, t) BENCHMARK(n)->Unit((t)) -#define BENCHMARK_RANGE(n, lo, hi) BENCHMARK(n)->Range((lo), (hi)) -#define BENCHMARK_RANGE2(n, l1, h1, l2, h2) \ - BENCHMARK(n)->RangePair({{(l1), (h1)}, {(l2), (h2)}}) - -#if __cplusplus >= 201103L - -// Register a benchmark which invokes the function specified by `func` -// with the additional arguments specified by `...`. -// -// For example: -// -// template ` -// void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) { -// [...] -//} -// /* Registers a benchmark named "BM_takes_args/int_string_test` */ -// BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc")); -#define BENCHMARK_CAPTURE(func, test_case_name, ...) \ - BENCHMARK_PRIVATE_DECLARE(func) = \ - (::benchmark::internal::RegisterBenchmarkInternal( \ - new ::benchmark::internal::FunctionBenchmark( \ - #func "/" #test_case_name, \ - [](::benchmark::State& st) { func(st, __VA_ARGS__); }))) - -#endif // __cplusplus >= 11 - -// This will register a benchmark for a templatized function. For example: -// -// template -// void BM_Foo(int iters); -// -// BENCHMARK_TEMPLATE(BM_Foo, 1); -// -// will register BM_Foo<1> as a benchmark. -#define BENCHMARK_TEMPLATE1(n, a) \ - BENCHMARK_PRIVATE_DECLARE(n) = \ - (::benchmark::internal::RegisterBenchmarkInternal( \ - new ::benchmark::internal::FunctionBenchmark(#n "<" #a ">", n))) - -#define BENCHMARK_TEMPLATE2(n, a, b) \ - BENCHMARK_PRIVATE_DECLARE(n) = \ - (::benchmark::internal::RegisterBenchmarkInternal( \ - new ::benchmark::internal::FunctionBenchmark(#n "<" #a "," #b ">", \ - n))) - -#if __cplusplus >= 201103L -#define BENCHMARK_TEMPLATE(n, ...) 
\ - BENCHMARK_PRIVATE_DECLARE(n) = \ - (::benchmark::internal::RegisterBenchmarkInternal( \ - new ::benchmark::internal::FunctionBenchmark( \ - #n "<" #__VA_ARGS__ ">", n<__VA_ARGS__>))) -#else -#define BENCHMARK_TEMPLATE(n, a) BENCHMARK_TEMPLATE1(n, a) -#endif - -#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \ - class BaseClass##_##Method##_Benchmark : public BaseClass { \ - public: \ - BaseClass##_##Method##_Benchmark() : BaseClass() { \ - this->SetName(#BaseClass "/" #Method); \ - } \ - \ - protected: \ - virtual void BenchmarkCase(::benchmark::State&); \ - }; - -#define BENCHMARK_DEFINE_F(BaseClass, Method) \ - BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \ - void BaseClass##_##Method##_Benchmark::BenchmarkCase - -#define BENCHMARK_REGISTER_F(BaseClass, Method) \ - BENCHMARK_PRIVATE_REGISTER_F(BaseClass##_##Method##_Benchmark) - -#define BENCHMARK_PRIVATE_REGISTER_F(TestName) \ - BENCHMARK_PRIVATE_DECLARE(TestName) = \ - (::benchmark::internal::RegisterBenchmarkInternal(new TestName())) - -// This macro will define and register a benchmark within a fixture class. -#define BENCHMARK_F(BaseClass, Method) \ - BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \ - BENCHMARK_REGISTER_F(BaseClass, Method); \ - void BaseClass##_##Method##_Benchmark::BenchmarkCase - -// Helper macro to create a main routine in a test that runs the benchmarks -#define BENCHMARK_MAIN() \ - int main(int argc, char** argv) { \ - ::benchmark::Initialize(&argc, argv); \ - if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1; \ - ::benchmark::RunSpecifiedBenchmarks(); \ - } - - -// ------------------------------------------------------ -// Benchmark Reporters - -namespace benchmark { - -// Interface for custom benchmark result printers. -// By default, benchmark reports are printed to stdout. However an application -// can control the destination of the reports by calling -// RunSpecifiedBenchmarks and passing it a custom reporter object. -// The reporter object must implement the following interface. -class BenchmarkReporter { - public: - struct Context { - int num_cpus; - double mhz_per_cpu; - bool cpu_scaling_enabled; - - // The number of chars in the longest benchmark name. - size_t name_field_width; - }; - - struct Run { - Run() - : error_occurred(false), - iterations(1), - time_unit(kNanosecond), - real_accumulated_time(0), - cpu_accumulated_time(0), - bytes_per_second(0), - items_per_second(0), - max_heapbytes_used(0), - complexity(oNone), - complexity_lambda(), - complexity_n(0), - report_big_o(false), - report_rms(false), - counters() {} - - std::string benchmark_name; - std::string report_label; // Empty if not set by benchmark. - bool error_occurred; - std::string error_message; - - int64_t iterations; - TimeUnit time_unit; - double real_accumulated_time; - double cpu_accumulated_time; - - // Return a value representing the real time per iteration in the unit - // specified by 'time_unit'. - // NOTE: If 'iterations' is zero the returned value represents the - // accumulated time. - double GetAdjustedRealTime() const; - - // Return a value representing the cpu time per iteration in the unit - // specified by 'time_unit'. - // NOTE: If 'iterations' is zero the returned value represents the - // accumulated time. - double GetAdjustedCPUTime() const; - - // Zero if not set by benchmark. - double bytes_per_second; - double items_per_second; - - // This is set to 0.0 if memory tracing is not enabled. 
- double max_heapbytes_used;
-
- // Keep track of arguments to compute asymptotic complexity
- BigO complexity;
- BigOFunc* complexity_lambda;
- int complexity_n;
-
- // Inform print function whether the current run is a complexity report
- bool report_big_o;
- bool report_rms;
-
- UserCounters counters;
- };
-
- // Construct a BenchmarkReporter with the output stream set to 'std::cout'
- // and the error stream set to 'std::cerr'
- BenchmarkReporter();
-
- // Called once for every suite of benchmarks run.
- // The parameter "context" contains information that the
- // reporter may wish to use when generating its report, for example the
- // platform under which the benchmarks are running. The benchmark run is
- // never started if this function returns false, allowing the reporter
- // to skip runs based on the context information.
- virtual bool ReportContext(const Context& context) = 0;
-
- // Called once for each group of benchmark runs, gives information about
- // cpu-time and heap memory usage during the benchmark run. If the group
- // of runs contained more than two entries then 'report' contains additional
- // elements representing the mean and standard deviation of those runs.
- // Additionally if this group of runs was the last in a family of benchmarks
- // 'reports' contains additional entries representing the asymptotic
- // complexity and RMS of that benchmark family.
- virtual void ReportRuns(const std::vector<Run>& report) = 0;
-
- // Called once and only once after every group of benchmarks is run and
- // reported.
- virtual void Finalize() {}
-
- // REQUIRES: The object referenced by 'out' is valid for the lifetime
- // of the reporter.
- void SetOutputStream(std::ostream* out) {
-   assert(out);
-   output_stream_ = out;
- }
-
- // REQUIRES: The object referenced by 'err' is valid for the lifetime
- // of the reporter.
- void SetErrorStream(std::ostream* err) {
-   assert(err);
-   error_stream_ = err;
- }
-
- std::ostream& GetOutputStream() const { return *output_stream_; }
-
- std::ostream& GetErrorStream() const { return *error_stream_; }
-
- virtual ~BenchmarkReporter();
-
- // Write a human readable string to 'out' representing the specified
- // 'context'.
- // REQUIRES: 'out' is non-null.
- static void PrintBasicContext(std::ostream* out, Context const& context);
-
- private:
- std::ostream* output_stream_;
- std::ostream* error_stream_;
-};
-
-// Simple reporter that outputs benchmark data to the console. This is the
-// default reporter used by RunSpecifiedBenchmarks().
-class ConsoleReporter : public BenchmarkReporter {
-public:
-  enum OutputOptions {
-    OO_None = 0,
-    OO_Color = 1,
-    OO_Tabular = 2,
-    OO_ColorTabular = OO_Color|OO_Tabular,
-    OO_Defaults = OO_ColorTabular
-  };
-  explicit ConsoleReporter(OutputOptions opts_ = OO_Defaults)
-      : output_options_(opts_), name_field_width_(0),
-        prev_counters_(), printed_header_(false) {}
-
-  virtual bool ReportContext(const Context& context);
-  virtual void ReportRuns(const std::vector<Run>& reports);
-
- protected:
-  virtual void PrintRunData(const Run& report);
-  virtual void PrintHeader(const Run& report);
-
-  OutputOptions output_options_;
-  size_t name_field_width_;
-  UserCounters prev_counters_;
-  bool printed_header_;
-};
-
-class JSONReporter : public BenchmarkReporter {
- public:
-  JSONReporter() : first_report_(true) {}
-  virtual bool ReportContext(const Context& context);
-  virtual void ReportRuns(const std::vector<Run>& reports);
-  virtual void Finalize();
-
- private:
-  void PrintRunData(const Run& report);
-
-  bool first_report_;
-};
-
-class CSVReporter : public BenchmarkReporter {
- public:
-  CSVReporter() : printed_header_(false) {}
-  virtual bool ReportContext(const Context& context);
-  virtual void ReportRuns(const std::vector<Run>& reports);
-
- private:
-  void PrintRunData(const Run& report);
-
-  bool printed_header_;
-  std::set< std::string > user_counter_names_;
-};
-
-inline const char* GetTimeUnitString(TimeUnit unit) {
-  switch (unit) {
-    case kMillisecond:
-      return "ms";
-    case kMicrosecond:
-      return "us";
-    case kNanosecond:
-    default:
-      return "ns";
-  }
-}
-
-inline double GetTimeUnitMultiplier(TimeUnit unit) {
-  switch (unit) {
-    case kMillisecond:
-      return 1e3;
-    case kMicrosecond:
-      return 1e6;
-    case kNanosecond:
-    default:
-      return 1e9;
-  }
-}
-
-} // namespace benchmark
-
-#endif // BENCHMARK_BENCHMARK_H_
diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/include/benchmark/benchmark_api.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/include/benchmark/benchmark_api.h
deleted file mode 100644
index a9ae67147c5..00000000000
--- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/include/benchmark/benchmark_api.h
+++ /dev/null
@@ -1,27 +0,0 @@
-// Copyright 2015 Google Inc. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
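Editor's note: benchmark_api.h here and reporter.h just below are deprecation
shims that warn and forward to benchmark.h. A minimal sketch of the intended
migration (BM_noop is a hypothetical benchmark name, not from this repo):

// Before (deprecated):
//   #include "benchmark/benchmark_api.h"
// After:
#include "benchmark/benchmark.h"

static void BM_noop(benchmark::State& state) {
  while (state.KeepRunning()) {
    // empty body: measures only the benchmark loop overhead
  }
}
BENCHMARK(BM_noop);
BENCHMARK_MAIN();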
-#ifndef BENCHMARK_BENCHMARK_API_H_ -#define BENCHMARK_BENCHMARK_API_H_ - -#ifdef __DEPRECATED -# ifndef BENCHMARK_WARNING_MSG -# warning the benchmark_api.h header has been deprecated and will be removed, please include benchmark.h instead -# else - BENCHMARK_WARNING_MSG("the benchmark_api.h header has been deprecated and will be removed, please include benchmark.h instead") -# endif -#endif - -#include "benchmark.h" // For forward declaration of BenchmarkReporter - -#endif // BENCHMARK_BENCHMARK_API_H_ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/include/benchmark/reporter.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/include/benchmark/reporter.h deleted file mode 100644 index 5baca1a740a..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/include/benchmark/reporter.h +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#ifndef BENCHMARK_REPORTER_H_ -#define BENCHMARK_REPORTER_H_ - -#ifdef __DEPRECATED -# ifndef BENCHMARK_WARNING_MSG -# warning the reporter.h header has been deprecated and will be removed, please include benchmark.h instead -# else - BENCHMARK_WARNING_MSG("the reporter.h header has been deprecated and will be removed, please include benchmark.h instead") -# endif -#endif - -#include "benchmark.h" // For forward declaration of BenchmarkReporter - -#endif // BENCHMARK_REPORTER_H_ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/mingw.py b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/mingw.py deleted file mode 100644 index 706ad559db9..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/mingw.py +++ /dev/null @@ -1,320 +0,0 @@ -#! 
/usr/bin/env python -# encoding: utf-8 - -import argparse -import errno -import logging -import os -import platform -import re -import sys -import subprocess -import tempfile - -try: - import winreg -except ImportError: - import _winreg as winreg -try: - import urllib.request as request -except ImportError: - import urllib as request -try: - import urllib.parse as parse -except ImportError: - import urlparse as parse - -class EmptyLogger(object): - ''' - Provides an implementation that performs no logging - ''' - def debug(self, *k, **kw): - pass - def info(self, *k, **kw): - pass - def warn(self, *k, **kw): - pass - def error(self, *k, **kw): - pass - def critical(self, *k, **kw): - pass - def setLevel(self, *k, **kw): - pass - -urls = ( - 'http://downloads.sourceforge.net/project/mingw-w64/Toolchains%20' - 'targetting%20Win32/Personal%20Builds/mingw-builds/installer/' - 'repository.txt', - 'http://downloads.sourceforge.net/project/mingwbuilds/host-windows/' - 'repository.txt' -) -''' -A list of mingw-build repositories -''' - -def repository(urls = urls, log = EmptyLogger()): - ''' - Downloads and parse mingw-build repository files and parses them - ''' - log.info('getting mingw-builds repository') - versions = {} - re_sourceforge = re.compile(r'http://sourceforge.net/projects/([^/]+)/files') - re_sub = r'http://downloads.sourceforge.net/project/\1' - for url in urls: - log.debug(' - requesting: %s', url) - socket = request.urlopen(url) - repo = socket.read() - if not isinstance(repo, str): - repo = repo.decode(); - socket.close() - for entry in repo.split('\n')[:-1]: - value = entry.split('|') - version = tuple([int(n) for n in value[0].strip().split('.')]) - version = versions.setdefault(version, {}) - arch = value[1].strip() - if arch == 'x32': - arch = 'i686' - elif arch == 'x64': - arch = 'x86_64' - arch = version.setdefault(arch, {}) - threading = arch.setdefault(value[2].strip(), {}) - exceptions = threading.setdefault(value[3].strip(), {}) - revision = exceptions.setdefault(int(value[4].strip()[3:]), - re_sourceforge.sub(re_sub, value[5].strip())) - return versions - -def find_in_path(file, path=None): - ''' - Attempts to find an executable in the path - ''' - if platform.system() == 'Windows': - file += '.exe' - if path is None: - path = os.environ.get('PATH', '') - if type(path) is type(''): - path = path.split(os.pathsep) - return list(filter(os.path.exists, - map(lambda dir, file=file: os.path.join(dir, file), path))) - -def find_7zip(log = EmptyLogger()): - ''' - Attempts to find 7zip for unpacking the mingw-build archives - ''' - log.info('finding 7zip') - path = find_in_path('7z') - if not path: - key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r'SOFTWARE\7-Zip') - path, _ = winreg.QueryValueEx(key, 'Path') - path = [os.path.join(path, '7z.exe')] - log.debug('found \'%s\'', path[0]) - return path[0] - -find_7zip() - -def unpack(archive, location, log = EmptyLogger()): - ''' - Unpacks a mingw-builds archive - ''' - sevenzip = find_7zip(log) - log.info('unpacking %s', os.path.basename(archive)) - cmd = [sevenzip, 'x', archive, '-o' + location, '-y'] - log.debug(' - %r', cmd) - with open(os.devnull, 'w') as devnull: - subprocess.check_call(cmd, stdout = devnull) - -def download(url, location, log = EmptyLogger()): - ''' - Downloads and unpacks a mingw-builds archive - ''' - log.info('downloading MinGW') - log.debug(' - url: %s', url) - log.debug(' - location: %s', location) - - re_content = re.compile(r'attachment;[ \t]*filename=(")?([^"]*)(")?[\r\n]*') - - stream = 
request.urlopen(url) - try: - content = stream.getheader('Content-Disposition') or '' - except AttributeError: - content = stream.headers.getheader('Content-Disposition') or '' - matches = re_content.match(content) - if matches: - filename = matches.group(2) - else: - parsed = parse.urlparse(stream.geturl()) - filename = os.path.basename(parsed.path) - - try: - os.makedirs(location) - except OSError as e: - if e.errno == errno.EEXIST and os.path.isdir(location): - pass - else: - raise - - archive = os.path.join(location, filename) - with open(archive, 'wb') as out: - while True: - buf = stream.read(1024) - if not buf: - break - out.write(buf) - unpack(archive, location, log = log) - os.remove(archive) - - possible = os.path.join(location, 'mingw64') - if not os.path.exists(possible): - possible = os.path.join(location, 'mingw32') - if not os.path.exists(possible): - raise ValueError('Failed to find unpacked MinGW: ' + possible) - return possible - -def root(location = None, arch = None, version = None, threading = None, - exceptions = None, revision = None, log = EmptyLogger()): - ''' - Returns the root folder of a specific version of the mingw-builds variant - of gcc. Will download the compiler if needed - ''' - - # Get the repository if we don't have all the information - if not (arch and version and threading and exceptions and revision): - versions = repository(log = log) - - # Determine some defaults - version = version or max(versions.keys()) - if not arch: - arch = platform.machine().lower() - if arch == 'x86': - arch = 'i686' - elif arch == 'amd64': - arch = 'x86_64' - if not threading: - keys = versions[version][arch].keys() - if 'posix' in keys: - threading = 'posix' - elif 'win32' in keys: - threading = 'win32' - else: - threading = keys[0] - if not exceptions: - keys = versions[version][arch][threading].keys() - if 'seh' in keys: - exceptions = 'seh' - elif 'sjlj' in keys: - exceptions = 'sjlj' - else: - exceptions = keys[0] - if revision == None: - revision = max(versions[version][arch][threading][exceptions].keys()) - if not location: - location = os.path.join(tempfile.gettempdir(), 'mingw-builds') - - # Get the download url - url = versions[version][arch][threading][exceptions][revision] - - # Tell the user whatzzup - log.info('finding MinGW %s', '.'.join(str(v) for v in version)) - log.debug(' - arch: %s', arch) - log.debug(' - threading: %s', threading) - log.debug(' - exceptions: %s', exceptions) - log.debug(' - revision: %s', revision) - log.debug(' - url: %s', url) - - # Store each specific revision differently - slug = '{version}-{arch}-{threading}-{exceptions}-rev{revision}' - slug = slug.format( - version = '.'.join(str(v) for v in version), - arch = arch, - threading = threading, - exceptions = exceptions, - revision = revision - ) - if arch == 'x86_64': - root_dir = os.path.join(location, slug, 'mingw64') - elif arch == 'i686': - root_dir = os.path.join(location, slug, 'mingw32') - else: - raise ValueError('Unknown MinGW arch: ' + arch) - - # Download if needed - if not os.path.exists(root_dir): - downloaded = download(url, os.path.join(location, slug), log = log) - if downloaded != root_dir: - raise ValueError('The location of mingw did not match\n%s\n%s' - % (downloaded, root_dir)) - - return root_dir - -def str2ver(string): - ''' - Converts a version string into a tuple - ''' - try: - version = tuple(int(v) for v in string.split('.')) - if len(version) is not 3: - raise ValueError() - except ValueError: - raise argparse.ArgumentTypeError( - 'please provide a 
three digit version string') - return version - -def main(): - ''' - Invoked when the script is run directly by the python interpreter - ''' - parser = argparse.ArgumentParser( - description = 'Downloads a specific version of MinGW', - formatter_class = argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument('--location', - help = 'the location to download the compiler to', - default = os.path.join(tempfile.gettempdir(), 'mingw-builds')) - parser.add_argument('--arch', required = True, choices = ['i686', 'x86_64'], - help = 'the target MinGW architecture string') - parser.add_argument('--version', type = str2ver, - help = 'the version of GCC to download') - parser.add_argument('--threading', choices = ['posix', 'win32'], - help = 'the threading type of the compiler') - parser.add_argument('--exceptions', choices = ['sjlj', 'seh', 'dwarf'], - help = 'the method to throw exceptions') - parser.add_argument('--revision', type=int, - help = 'the revision of the MinGW release') - group = parser.add_mutually_exclusive_group() - group.add_argument('-v', '--verbose', action='store_true', - help='increase the script output verbosity') - group.add_argument('-q', '--quiet', action='store_true', - help='only print errors and warning') - args = parser.parse_args() - - # Create the logger - logger = logging.getLogger('mingw') - handler = logging.StreamHandler() - formatter = logging.Formatter('%(message)s') - handler.setFormatter(formatter) - logger.addHandler(handler) - logger.setLevel(logging.INFO) - if args.quiet: - logger.setLevel(logging.WARN) - if args.verbose: - logger.setLevel(logging.DEBUG) - - # Get MinGW - root_dir = root(location = args.location, arch = args.arch, - version = args.version, threading = args.threading, - exceptions = args.exceptions, revision = args.revision, - log = logger) - - sys.stdout.write('%s\n' % os.path.join(root_dir, 'bin')) - -if __name__ == '__main__': - try: - main() - except IOError as e: - sys.stderr.write('IO error: %s\n' % e) - sys.exit(1) - except OSError as e: - sys.stderr.write('OS error: %s\n' % e) - sys.exit(1) - except KeyboardInterrupt as e: - sys.stderr.write('Killed\n') - sys.exit(1) diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/CMakeLists.txt b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/CMakeLists.txt deleted file mode 100644 index 244484b8b05..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/CMakeLists.txt +++ /dev/null @@ -1,78 +0,0 @@ -# Allow the source files to find headers in src/ -include_directories(${PROJECT_SOURCE_DIR}/src) - -if (DEFINED BENCHMARK_CXX_LINKER_FLAGS) - list(APPEND CMAKE_SHARED_LINKER_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}) - list(APPEND CMAKE_MODULE_LINKER_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}) -endif() - -file(GLOB - SOURCE_FILES - *.cc - ${PROJECT_SOURCE_DIR}/include/benchmark/*.h - ${CMAKE_CURRENT_SOURCE_DIR}/*.h) - -add_library(benchmark ${SOURCE_FILES}) -set_target_properties(benchmark PROPERTIES - OUTPUT_NAME "benchmark" - VERSION ${GENERIC_LIB_VERSION} - SOVERSION ${GENERIC_LIB_SOVERSION} -) -target_include_directories(benchmark PUBLIC - $ - ) - -# Link threads. 
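# (Editor's note, illustrative only: on modern CMake one would write
#  find_package(Threads REQUIRED) and link Threads::Threads rather than
#  relying on ${CMAKE_THREAD_LIBS_INIT}; this deleted file predates that.)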
-target_link_libraries(benchmark ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
-find_library(LIBRT rt)
-if(LIBRT)
-  target_link_libraries(benchmark ${LIBRT})
-endif()
-
-# We need extra libraries on Windows
-if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
-  target_link_libraries(benchmark Shlwapi)
-endif()
-
-set(include_install_dir "include")
-set(lib_install_dir "lib/")
-set(bin_install_dir "bin/")
-set(config_install_dir "lib/cmake/${PROJECT_NAME}")
-
-set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated")
-
-set(version_config "${generated_dir}/${PROJECT_NAME}ConfigVersion.cmake")
-set(project_config "${generated_dir}/${PROJECT_NAME}Config.cmake")
-set(targets_export_name "${PROJECT_NAME}Targets")
-
-set(namespace "${PROJECT_NAME}::")
-
-include(CMakePackageConfigHelpers)
-write_basic_package_version_file(
-  "${version_config}" VERSION ${GIT_VERSION} COMPATIBILITY SameMajorVersion
-)
-
-configure_file("${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in" "${project_config}" @ONLY)
-
-# Install target (will install the library to specified CMAKE_INSTALL_PREFIX variable)
-install(
-  TARGETS benchmark
-  EXPORT ${targets_export_name}
-  ARCHIVE DESTINATION ${lib_install_dir}
-  LIBRARY DESTINATION ${lib_install_dir}
-  RUNTIME DESTINATION ${bin_install_dir}
-  INCLUDES DESTINATION ${include_install_dir})
-
-install(
-  DIRECTORY "${PROJECT_SOURCE_DIR}/include/benchmark"
-  DESTINATION ${include_install_dir}
-  FILES_MATCHING PATTERN "*.*h")
-
-install(
-  FILES "${project_config}" "${version_config}"
-  DESTINATION "${config_install_dir}")
-
-install(
-  EXPORT "${targets_export_name}"
-  NAMESPACE "${namespace}"
-  DESTINATION "${config_install_dir}")
diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/arraysize.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/arraysize.h
deleted file mode 100644
index 51a50f2dff2..00000000000
--- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/arraysize.h
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifndef BENCHMARK_ARRAYSIZE_H_
-#define BENCHMARK_ARRAYSIZE_H_
-
-#include "internal_macros.h"
-
-namespace benchmark {
-namespace internal {
-// The arraysize(arr) macro returns the # of elements in an array arr.
-// The expression is a compile-time constant, and therefore can be
-// used in defining new arrays, for example. If you use arraysize on
-// a pointer by mistake, you will get a compile-time error.
-//
-
-// This template function declaration is used in defining arraysize.
-// Note that the function doesn't need an implementation, as we only
-// use its type.
-template <typename T, size_t N>
-char (&ArraySizeHelper(T (&array)[N]))[N];
-
-// That gcc wants both of these prototypes seems mysterious. VC, for
-// its part, can't decide which to use (another mystery). Matching of
-// template overloads: the final frontier.
-#ifndef COMPILER_MSVC
-template <typename T, size_t N>
-char (&ArraySizeHelper(const T (&array)[N]))[N];
-#endif
-
-#define arraysize(array) (sizeof(::benchmark::internal::ArraySizeHelper(array)))
-
-} // end namespace internal
-} // end namespace benchmark
-
-#endif // BENCHMARK_ARRAYSIZE_H_
diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/benchmark.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/benchmark.cc
deleted file mode 100644
index 1ba0a50adf8..00000000000
--- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/benchmark.cc
+++ /dev/null
@@ -1,715 +0,0 @@
-// Copyright 2015 Google Inc. All rights reserved.
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "benchmark/benchmark.h" -#include "benchmark_api_internal.h" -#include "internal_macros.h" - -#ifndef BENCHMARK_OS_WINDOWS -#include -#include -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "check.h" -#include "colorprint.h" -#include "commandlineflags.h" -#include "complexity.h" -#include "counter.h" -#include "log.h" -#include "mutex.h" -#include "re.h" -#include "stat.h" -#include "string_util.h" -#include "sysinfo.h" -#include "timers.h" - -DEFINE_bool(benchmark_list_tests, false, - "Print a list of benchmarks. This option overrides all other " - "options."); - -DEFINE_string(benchmark_filter, ".", - "A regular expression that specifies the set of benchmarks " - "to execute. If this flag is empty, no benchmarks are run. " - "If this flag is the string \"all\", all benchmarks linked " - "into the process are run."); - -DEFINE_double(benchmark_min_time, 0.5, - "Minimum number of seconds we should run benchmark before " - "results are considered significant. For cpu-time based " - "tests, this is the lower bound on the total cpu time " - "used by all threads that make up the test. For real-time " - "based tests, this is the lower bound on the elapsed time " - "of the benchmark execution, regardless of number of " - "threads."); - -DEFINE_int32(benchmark_repetitions, 1, - "The number of runs of each benchmark. If greater than 1, the " - "mean and standard deviation of the runs will be reported."); - -DEFINE_bool(benchmark_report_aggregates_only, false, - "Report the result of each benchmark repetitions. When 'true' is " - "specified only the mean, standard deviation, and other statistics " - "are reported for repeated benchmarks."); - -DEFINE_string(benchmark_format, "console", - "The format to use for console output. Valid values are " - "'console', 'json', or 'csv'."); - -DEFINE_string(benchmark_out_format, "json", - "The format to use for file output. Valid values are " - "'console', 'json', or 'csv'."); - -DEFINE_string(benchmark_out, "", "The file to write additonal output to"); - -DEFINE_string(benchmark_color, "auto", - "Whether to use colors in the output. Valid values: " - "'true'/'yes'/1, 'false'/'no'/0, and 'auto'. 'auto' means to use " - "colors if the output is being sent to a terminal and the TERM " - "environment variable is set to a terminal type that supports " - "colors."); - -DEFINE_bool(benchmark_counters_tabular, false, - "Whether to use tabular format when printing user counters to " - "the console. Valid values: 'true'/'yes'/1, 'false'/'no'/0." 
- "Defaults to false."); - -DEFINE_int32(v, 0, "The level of verbose logging to output"); - -namespace benchmark { -namespace internal { - -void UseCharPointer(char const volatile*) {} - -} // end namespace internal - -namespace { - -static const size_t kMaxIterations = 1000000000; - -} // end namespace - -namespace internal { - -class ThreadManager { - public: - ThreadManager(int num_threads) - : alive_threads_(num_threads), start_stop_barrier_(num_threads) {} - - Mutex& GetBenchmarkMutex() const RETURN_CAPABILITY(benchmark_mutex_) { - return benchmark_mutex_; - } - - bool StartStopBarrier() EXCLUDES(end_cond_mutex_) { - return start_stop_barrier_.wait(); - } - - void NotifyThreadComplete() EXCLUDES(end_cond_mutex_) { - start_stop_barrier_.removeThread(); - if (--alive_threads_ == 0) { - MutexLock lock(end_cond_mutex_); - end_condition_.notify_all(); - } - } - - void WaitForAllThreads() EXCLUDES(end_cond_mutex_) { - MutexLock lock(end_cond_mutex_); - end_condition_.wait(lock.native_handle(), - [this]() { return alive_threads_ == 0; }); - } - - public: - struct Result { - double real_time_used = 0; - double cpu_time_used = 0; - double manual_time_used = 0; - int64_t bytes_processed = 0; - int64_t items_processed = 0; - int complexity_n = 0; - std::string report_label_; - std::string error_message_; - bool has_error_ = false; - UserCounters counters; - }; - GUARDED_BY(GetBenchmarkMutex()) Result results; - - private: - mutable Mutex benchmark_mutex_; - std::atomic alive_threads_; - Barrier start_stop_barrier_; - Mutex end_cond_mutex_; - Condition end_condition_; -}; - -// Timer management class -class ThreadTimer { - public: - ThreadTimer() = default; - - // Called by each thread - void StartTimer() { - running_ = true; - start_real_time_ = ChronoClockNow(); - start_cpu_time_ = ThreadCPUUsage(); - } - - // Called by each thread - void StopTimer() { - CHECK(running_); - running_ = false; - real_time_used_ += ChronoClockNow() - start_real_time_; - cpu_time_used_ += ThreadCPUUsage() - start_cpu_time_; - } - - // Called by each thread - void SetIterationTime(double seconds) { manual_time_used_ += seconds; } - - bool running() const { return running_; } - - // REQUIRES: timer is not running - double real_time_used() { - CHECK(!running_); - return real_time_used_; - } - - // REQUIRES: timer is not running - double cpu_time_used() { - CHECK(!running_); - return cpu_time_used_; - } - - // REQUIRES: timer is not running - double manual_time_used() { - CHECK(!running_); - return manual_time_used_; - } - - private: - bool running_ = false; // Is the timer running - double start_real_time_ = 0; // If running_ - double start_cpu_time_ = 0; // If running_ - - // Accumulated time so far (does not contain current slice if running_) - double real_time_used_ = 0; - double cpu_time_used_ = 0; - // Manually set iteration time. User sets this with SetIterationTime(seconds). - double manual_time_used_ = 0; -}; - -namespace { - -BenchmarkReporter::Run CreateRunReport( - const benchmark::internal::Benchmark::Instance& b, - const internal::ThreadManager::Result& results, size_t iters, - double seconds) { - // Create report about this benchmark run. - BenchmarkReporter::Run report; - - report.benchmark_name = b.name; - report.error_occurred = results.has_error_; - report.error_message = results.error_message_; - report.report_label = results.report_label_; - // Report the total iterations across all threads. 
- report.iterations = static_cast(iters) * b.threads; - report.time_unit = b.time_unit; - - if (!report.error_occurred) { - double bytes_per_second = 0; - if (results.bytes_processed > 0 && seconds > 0.0) { - bytes_per_second = (results.bytes_processed / seconds); - } - double items_per_second = 0; - if (results.items_processed > 0 && seconds > 0.0) { - items_per_second = (results.items_processed / seconds); - } - - if (b.use_manual_time) { - report.real_accumulated_time = results.manual_time_used; - } else { - report.real_accumulated_time = results.real_time_used; - } - report.cpu_accumulated_time = results.cpu_time_used; - report.bytes_per_second = bytes_per_second; - report.items_per_second = items_per_second; - report.complexity_n = results.complexity_n; - report.complexity = b.complexity; - report.complexity_lambda = b.complexity_lambda; - report.counters = results.counters; - internal::Finish(&report.counters, seconds, b.threads); - } - return report; -} - -// Execute one thread of benchmark b for the specified number of iterations. -// Adds the stats collected for the thread into *total. -void RunInThread(const benchmark::internal::Benchmark::Instance* b, - size_t iters, int thread_id, - internal::ThreadManager* manager) { - internal::ThreadTimer timer; - State st(iters, b->arg, thread_id, b->threads, &timer, manager); - b->benchmark->Run(st); - CHECK(st.iterations() == st.max_iterations) - << "Benchmark returned before State::KeepRunning() returned false!"; - { - MutexLock l(manager->GetBenchmarkMutex()); - internal::ThreadManager::Result& results = manager->results; - results.cpu_time_used += timer.cpu_time_used(); - results.real_time_used += timer.real_time_used(); - results.manual_time_used += timer.manual_time_used(); - results.bytes_processed += st.bytes_processed(); - results.items_processed += st.items_processed(); - results.complexity_n += st.complexity_length_n(); - internal::Increment(&results.counters, st.counters); - } - manager->NotifyThreadComplete(); -} - -std::vector RunBenchmark( - const benchmark::internal::Benchmark::Instance& b, - std::vector* complexity_reports) { - std::vector reports; // return value - - const bool has_explicit_iteration_count = b.iterations != 0; - size_t iters = has_explicit_iteration_count ? b.iterations : 1; - std::unique_ptr manager; - std::vector pool(b.threads - 1); - const int repeats = - b.repetitions != 0 ? b.repetitions : FLAGS_benchmark_repetitions; - const bool report_aggregates_only = - repeats != 1 && - (b.report_mode == internal::RM_Unspecified - ? FLAGS_benchmark_report_aggregates_only - : b.report_mode == internal::RM_ReportAggregatesOnly); - for (int repetition_num = 0; repetition_num < repeats; repetition_num++) { - for (;;) { - // Try benchmark - VLOG(2) << "Running " << b.name << " for " << iters << "\n"; - - manager.reset(new internal::ThreadManager(b.threads)); - for (std::size_t ti = 0; ti < pool.size(); ++ti) { - pool[ti] = std::thread(&RunInThread, &b, iters, - static_cast(ti + 1), manager.get()); - } - RunInThread(&b, iters, 0, manager.get()); - manager->WaitForAllThreads(); - for (std::thread& thread : pool) thread.join(); - internal::ThreadManager::Result results; - { - MutexLock l(manager->GetBenchmarkMutex()); - results = manager->results; - } - manager.reset(); - // Adjust real/manual time stats since they were reported per thread. 
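// (Editor's note, illustrative: each thread accumulates its own wall-clock
// time, so 4 threads running concurrently for 1.0 s report 4.0 s in total;
// dividing by b.threads below recovers the 1.0 s of elapsed real time.)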
- results.real_time_used /= b.threads;
- results.manual_time_used /= b.threads;
-
- VLOG(2) << "Ran in " << results.cpu_time_used << "/"
-         << results.real_time_used << "\n";
-
- // Base decisions off of real time if requested by this benchmark.
- double seconds = results.cpu_time_used;
- if (b.use_manual_time) {
-   seconds = results.manual_time_used;
- } else if (b.use_real_time) {
-   seconds = results.real_time_used;
- }
-
- const double min_time =
-     !IsZero(b.min_time) ? b.min_time : FLAGS_benchmark_min_time;
-
- // Determine if this run should be reported; either it has
- // run for a sufficient amount of time or an error was reported.
- const bool should_report = repetition_num > 0
-     || has_explicit_iteration_count // An exact iteration count was requested
-     || results.has_error_
-     || iters >= kMaxIterations
-     || seconds >= min_time // the elapsed time is large enough
-     // CPU time is specified but the elapsed real time greatly exceeds the
-     // minimum time. Note that user-provided timers are exempt from this
-     // sanity check.
-     || ((results.real_time_used >= 5 * min_time) && !b.use_manual_time);
-
- if (should_report) {
-   BenchmarkReporter::Run report =
-       CreateRunReport(b, results, iters, seconds);
-   if (!report.error_occurred && b.complexity != oNone)
-     complexity_reports->push_back(report);
-   reports.push_back(report);
-   break;
- }
-
- // Decide how much the iteration count should be increased by.
- // Note: Avoid division by zero with max(seconds, 1ns).
- double multiplier = min_time * 1.4 / std::max(seconds, 1e-9);
- // If our last run was at least 10% of FLAGS_benchmark_min_time then we
- // use the multiplier directly. Otherwise we use at most 10 times
- // expansion.
- // NOTE: When the last run was at least 10% of the min time the max
- // expansion should be 14x.
- bool is_significant = (seconds / min_time) > 0.1;
- multiplier = is_significant ?
multiplier : std::min(10.0, multiplier); - if (multiplier <= 1.0) multiplier = 2.0; - double next_iters = std::max(multiplier * iters, iters + 1.0); - if (next_iters > kMaxIterations) { - next_iters = kMaxIterations; - } - VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n"; - iters = static_cast(next_iters + 0.5); - } - } - // Calculate additional statistics - auto stat_reports = ComputeStats(reports); - if ((b.complexity != oNone) && b.last_benchmark_instance) { - auto additional_run_stats = ComputeBigO(*complexity_reports); - stat_reports.insert(stat_reports.end(), additional_run_stats.begin(), - additional_run_stats.end()); - complexity_reports->clear(); - } - - if (report_aggregates_only) reports.clear(); - reports.insert(reports.end(), stat_reports.begin(), stat_reports.end()); - return reports; -} - -} // namespace -} // namespace internal - -State::State(size_t max_iters, const std::vector& ranges, int thread_i, - int n_threads, internal::ThreadTimer* timer, - internal::ThreadManager* manager) - : started_(false), - finished_(false), - total_iterations_(0), - range_(ranges), - bytes_processed_(0), - items_processed_(0), - complexity_n_(0), - error_occurred_(false), - counters(), - thread_index(thread_i), - threads(n_threads), - max_iterations(max_iters), - timer_(timer), - manager_(manager) { - CHECK(max_iterations != 0) << "At least one iteration must be run"; - CHECK_LT(thread_index, threads) << "thread_index must be less than threads"; -} - -void State::PauseTiming() { - // Add in time accumulated so far - CHECK(started_ && !finished_ && !error_occurred_); - timer_->StopTimer(); -} - -void State::ResumeTiming() { - CHECK(started_ && !finished_ && !error_occurred_); - timer_->StartTimer(); -} - -void State::SkipWithError(const char* msg) { - CHECK(msg); - error_occurred_ = true; - { - MutexLock l(manager_->GetBenchmarkMutex()); - if (manager_->results.has_error_ == false) { - manager_->results.error_message_ = msg; - manager_->results.has_error_ = true; - } - } - total_iterations_ = max_iterations; - if (timer_->running()) timer_->StopTimer(); -} - -void State::SetIterationTime(double seconds) { - timer_->SetIterationTime(seconds); -} - -void State::SetLabel(const char* label) { - MutexLock l(manager_->GetBenchmarkMutex()); - manager_->results.report_label_ = label; -} - -void State::StartKeepRunning() { - CHECK(!started_ && !finished_); - started_ = true; - manager_->StartStopBarrier(); - if (!error_occurred_) ResumeTiming(); -} - -void State::FinishKeepRunning() { - CHECK(started_ && (!finished_ || error_occurred_)); - if (!error_occurred_) { - PauseTiming(); - } - // Total iterations now is one greater than max iterations. Fix this. - total_iterations_ = max_iterations; - finished_ = true; - manager_->StartStopBarrier(); -} - -namespace internal { -namespace { - -void RunBenchmarks(const std::vector& benchmarks, - BenchmarkReporter* console_reporter, - BenchmarkReporter* file_reporter) { - // Note the file_reporter can be null. - CHECK(console_reporter != nullptr); - - // Determine the width of the name field using a minimum width of 10. 
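// (Editor's note, illustrative: with a longest benchmark name of
// "BM_Foo/1024/threads:8" the width becomes 21; if any benchmark repeats,
// strlen("_stddev") == 7 is added so the aggregate rows still line up.)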
- bool has_repetitions = FLAGS_benchmark_repetitions > 1; - size_t name_field_width = 10; - for (const Benchmark::Instance& benchmark : benchmarks) { - name_field_width = - std::max(name_field_width, benchmark.name.size()); - has_repetitions |= benchmark.repetitions > 1; - } - if (has_repetitions) name_field_width += std::strlen("_stddev"); - - // Print header here - BenchmarkReporter::Context context; - context.num_cpus = NumCPUs(); - context.mhz_per_cpu = CyclesPerSecond() / 1000000.0f; - - context.cpu_scaling_enabled = CpuScalingEnabled(); - context.name_field_width = name_field_width; - - // Keep track of runing times of all instances of current benchmark - std::vector complexity_reports; - - // We flush streams after invoking reporter methods that write to them. This - // ensures users get timely updates even when streams are not line-buffered. - auto flushStreams = [](BenchmarkReporter* reporter) { - if (!reporter) return; - std::flush(reporter->GetOutputStream()); - std::flush(reporter->GetErrorStream()); - }; - - if (console_reporter->ReportContext(context) && - (!file_reporter || file_reporter->ReportContext(context))) { - flushStreams(console_reporter); - flushStreams(file_reporter); - for (const auto& benchmark : benchmarks) { - std::vector reports = - RunBenchmark(benchmark, &complexity_reports); - console_reporter->ReportRuns(reports); - if (file_reporter) file_reporter->ReportRuns(reports); - flushStreams(console_reporter); - flushStreams(file_reporter); - } - } - console_reporter->Finalize(); - if (file_reporter) file_reporter->Finalize(); - flushStreams(console_reporter); - flushStreams(file_reporter); -} - -std::unique_ptr CreateReporter( - std::string const& name, ConsoleReporter::OutputOptions output_opts) { - typedef std::unique_ptr PtrType; - if (name == "console") { - return PtrType(new ConsoleReporter(output_opts)); - } else if (name == "json") { - return PtrType(new JSONReporter); - } else if (name == "csv") { - return PtrType(new CSVReporter); - } else { - std::cerr << "Unexpected format: '" << name << "'\n"; - std::exit(1); - } -} - -} // end namespace - -bool IsZero(double n) { - return std::abs(n) < std::numeric_limits::epsilon(); -} - -ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color) { - int output_opts = ConsoleReporter::OO_Defaults; - if ((FLAGS_benchmark_color == "auto" && IsColorTerminal()) || - IsTruthyFlagValue(FLAGS_benchmark_color)) { - output_opts |= ConsoleReporter::OO_Color; - } else { - output_opts &= ~ConsoleReporter::OO_Color; - } - if(force_no_color) { - output_opts &= ~ConsoleReporter::OO_Color; - } - if(FLAGS_benchmark_counters_tabular) { - output_opts |= ConsoleReporter::OO_Tabular; - } else { - output_opts &= ~ConsoleReporter::OO_Tabular; - } - return static_cast< ConsoleReporter::OutputOptions >(output_opts); -} - -} // end namespace internal - -size_t RunSpecifiedBenchmarks() { - return RunSpecifiedBenchmarks(nullptr, nullptr); -} - -size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter) { - return RunSpecifiedBenchmarks(console_reporter, nullptr); -} - -size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter, - BenchmarkReporter* file_reporter) { - std::string spec = FLAGS_benchmark_filter; - if (spec.empty() || spec == "all") - spec = "."; // Regexp that matches all benchmarks - - // Setup the reporters - std::ofstream output_file; - std::unique_ptr default_console_reporter; - std::unique_ptr default_file_reporter; - if (!console_reporter) { - default_console_reporter = internal::CreateReporter( - 
FLAGS_benchmark_format, internal::GetOutputOptions());
-    console_reporter = default_console_reporter.get();
-  }
-  auto& Out = console_reporter->GetOutputStream();
-  auto& Err = console_reporter->GetErrorStream();
-
-  std::string const& fname = FLAGS_benchmark_out;
-  if (fname.empty() && file_reporter) {
-    Err << "A custom file reporter was provided but "
-           "--benchmark_out=<file> was not specified."
-        << std::endl;
-    std::exit(1);
-  }
-  if (!fname.empty()) {
-    output_file.open(fname);
-    if (!output_file.is_open()) {
-      Err << "invalid file name: '" << fname << "'" << std::endl;
-      std::exit(1);
-    }
-    if (!file_reporter) {
-      default_file_reporter = internal::CreateReporter(
-          FLAGS_benchmark_out_format, ConsoleReporter::OO_None);
-      file_reporter = default_file_reporter.get();
-    }
-    file_reporter->SetOutputStream(&output_file);
-    file_reporter->SetErrorStream(&output_file);
-  }
-
-  std::vector<internal::Benchmark::Instance> benchmarks;
-  if (!FindBenchmarksInternal(spec, &benchmarks, &Err)) return 0;
-
-  if (benchmarks.empty()) {
-    Err << "Failed to match any benchmarks against regex: " << spec << "\n";
-    return 0;
-  }
-
-  if (FLAGS_benchmark_list_tests) {
-    for (auto const& benchmark : benchmarks) Out << benchmark.name << "\n";
-  } else {
-    internal::RunBenchmarks(benchmarks, console_reporter, file_reporter);
-  }
-
-  return benchmarks.size();
-}
-
-namespace internal {
-
-void PrintUsageAndExit() {
-  fprintf(stdout,
-          "benchmark"
-          " [--benchmark_list_tests={true|false}]\n"
-          "          [--benchmark_filter=<regex>]\n"
-          "          [--benchmark_min_time=<min_time>]\n"
-          "          [--benchmark_repetitions=<num_repetitions>]\n"
-          "          [--benchmark_report_aggregates_only={true|false}]\n"
-          "          [--benchmark_format=<console|json|csv>]\n"
-          "          [--benchmark_out=<filename>]\n"
-          "          [--benchmark_out_format=<json|console|csv>]\n"
-          "          [--benchmark_color={auto|true|false}]\n"
-          "          [--benchmark_counters_tabular={true|false}]\n"
-          "          [--v=<verbosity>]\n");
-  exit(0);
-}
-
-void ParseCommandLineFlags(int* argc, char** argv) {
-  using namespace benchmark;
-  for (int i = 1; i < *argc; ++i) {
-    if (ParseBoolFlag(argv[i], "benchmark_list_tests",
-                      &FLAGS_benchmark_list_tests) ||
-        ParseStringFlag(argv[i], "benchmark_filter", &FLAGS_benchmark_filter) ||
-        ParseDoubleFlag(argv[i], "benchmark_min_time",
-                        &FLAGS_benchmark_min_time) ||
-        ParseInt32Flag(argv[i], "benchmark_repetitions",
-                       &FLAGS_benchmark_repetitions) ||
-        ParseBoolFlag(argv[i], "benchmark_report_aggregates_only",
-                      &FLAGS_benchmark_report_aggregates_only) ||
-        ParseStringFlag(argv[i], "benchmark_format", &FLAGS_benchmark_format) ||
-        ParseStringFlag(argv[i], "benchmark_out", &FLAGS_benchmark_out) ||
-        ParseStringFlag(argv[i], "benchmark_out_format",
-                        &FLAGS_benchmark_out_format) ||
-        ParseStringFlag(argv[i], "benchmark_color", &FLAGS_benchmark_color) ||
-        // "color_print" is the deprecated name for "benchmark_color".
-        // TODO: Remove this.
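// (Editor's note: --color_print=<value> is accepted as a deprecated alias
// and stored into FLAGS_benchmark_color; every recognized flag is then
// removed from argv by shifting the remaining arguments left and
// decrementing argc, so later parsers never see it.)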
- ParseStringFlag(argv[i], "color_print", &FLAGS_benchmark_color) || - ParseBoolFlag(argv[i], "benchmark_counters_tabular", - &FLAGS_benchmark_counters_tabular) || - ParseInt32Flag(argv[i], "v", &FLAGS_v)) { - for (int j = i; j != *argc - 1; ++j) argv[j] = argv[j + 1]; - - --(*argc); - --i; - } else if (IsFlag(argv[i], "help")) { - PrintUsageAndExit(); - } - } - for (auto const* flag : - {&FLAGS_benchmark_format, &FLAGS_benchmark_out_format}) - if (*flag != "console" && *flag != "json" && *flag != "csv") { - PrintUsageAndExit(); - } - if (FLAGS_benchmark_color.empty()) { - PrintUsageAndExit(); - } -} - -int InitializeStreams() { - static std::ios_base::Init init; - return 0; -} - -} // end namespace internal - -void Initialize(int* argc, char** argv) { - internal::ParseCommandLineFlags(argc, argv); - internal::LogLevel() = FLAGS_v; -} - -bool ReportUnrecognizedArguments(int argc, char** argv) { - for (int i = 1; i < argc; ++i) { - fprintf(stderr, "%s: error: unrecognized command-line flag: %s\n", argv[0], argv[i]); - } - return argc > 1; -} - -} // end namespace benchmark diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/benchmark_api_internal.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/benchmark_api_internal.h deleted file mode 100644 index 36d23404717..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/benchmark_api_internal.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef BENCHMARK_API_INTERNAL_H -#define BENCHMARK_API_INTERNAL_H - -#include "benchmark/benchmark.h" - -#include -#include -#include -#include -#include - -namespace benchmark { -namespace internal { - -// Information kept per benchmark we may want to run -struct Benchmark::Instance { - std::string name; - Benchmark* benchmark; - ReportMode report_mode; - std::vector arg; - TimeUnit time_unit; - int range_multiplier; - bool use_real_time; - bool use_manual_time; - BigO complexity; - BigOFunc* complexity_lambda; - UserCounters counters; - bool last_benchmark_instance; - int repetitions; - double min_time; - size_t iterations; - int threads; // Number of concurrent threads to us -}; - -bool FindBenchmarksInternal(const std::string& re, - std::vector* benchmarks, - std::ostream* Err); - -bool IsZero(double n); - -ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color = false); - -} // end namespace internal -} // end namespace benchmark - -#endif // BENCHMARK_API_INTERNAL_H diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/benchmark_register.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/benchmark_register.cc deleted file mode 100644 index ed70d820dee..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/benchmark_register.cc +++ /dev/null @@ -1,467 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
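Editor's note: an illustrative sketch of the registration API that
benchmark_register.cc (deleted below) implements, assuming the v1.x
KeepRunning/range() interface; BM_memcpy and its sizes are hypothetical:

#include <cstdint>
#include <cstring>
#include <vector>
#include "benchmark/benchmark.h"

static void BM_memcpy(benchmark::State& state) {
  std::vector<char> src(state.range(0)), dst(state.range(0));
  while (state.KeepRunning()) {
    std::memcpy(dst.data(), src.data(), state.range(0));
  }
  state.SetBytesProcessed(int64_t(state.iterations()) * state.range(0));
}
// One family, expanded into one instance per argument/thread combination:
// BM_memcpy/8/threads:2, BM_memcpy/64/threads:2, BM_memcpy/512/threads:2.
BENCHMARK(BM_memcpy)->RangeMultiplier(8)->Range(8, 512)->Threads(2);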
- -#include "benchmark/benchmark.h" -#include "benchmark_api_internal.h" -#include "internal_macros.h" - -#ifndef BENCHMARK_OS_WINDOWS -#include -#include -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "check.h" -#include "commandlineflags.h" -#include "complexity.h" -#include "log.h" -#include "mutex.h" -#include "re.h" -#include "stat.h" -#include "string_util.h" -#include "sysinfo.h" -#include "timers.h" - -namespace benchmark { - -namespace { -// For non-dense Range, intermediate values are powers of kRangeMultiplier. -static const int kRangeMultiplier = 8; -// The size of a benchmark family determines is the number of inputs to repeat -// the benchmark on. If this is "large" then warn the user during configuration. -static const size_t kMaxFamilySize = 100; -} // end namespace - -namespace internal { - -//=============================================================================// -// BenchmarkFamilies -//=============================================================================// - -// Class for managing registered benchmarks. Note that each registered -// benchmark identifies a family of related benchmarks to run. -class BenchmarkFamilies { - public: - static BenchmarkFamilies* GetInstance(); - - // Registers a benchmark family and returns the index assigned to it. - size_t AddBenchmark(std::unique_ptr family); - - // Clear all registered benchmark families. - void ClearBenchmarks(); - - // Extract the list of benchmark instances that match the specified - // regular expression. - bool FindBenchmarks(const std::string& re, - std::vector* benchmarks, - std::ostream* Err); - - private: - BenchmarkFamilies() {} - - std::vector> families_; - Mutex mutex_; -}; - -BenchmarkFamilies* BenchmarkFamilies::GetInstance() { - static BenchmarkFamilies instance; - return &instance; -} - -size_t BenchmarkFamilies::AddBenchmark(std::unique_ptr family) { - MutexLock l(mutex_); - size_t index = families_.size(); - families_.push_back(std::move(family)); - return index; -} - -void BenchmarkFamilies::ClearBenchmarks() { - MutexLock l(mutex_); - families_.clear(); - families_.shrink_to_fit(); -} - -bool BenchmarkFamilies::FindBenchmarks( - const std::string& spec, std::vector* benchmarks, - std::ostream* ErrStream) { - CHECK(ErrStream); - auto& Err = *ErrStream; - // Make regular expression out of command-line flag - std::string error_msg; - Regex re; - if (!re.Init(spec, &error_msg)) { - Err << "Could not compile benchmark re: " << error_msg << std::endl; - return false; - } - - // Special list of thread counts to use when none are specified - const std::vector one_thread = {1}; - - MutexLock l(mutex_); - for (std::unique_ptr& family : families_) { - // Family was deleted or benchmark doesn't match - if (!family) continue; - - if (family->ArgsCnt() == -1) { - family->Args({}); - } - const std::vector* thread_counts = - (family->thread_counts_.empty() - ? &one_thread - : &static_cast&>(family->thread_counts_)); - const size_t family_size = family->args_.size() * thread_counts->size(); - // The benchmark will be run at least 'family_size' different inputs. - // If 'family_size' is very large warn the user. - if (family_size > kMaxFamilySize) { - Err << "The number of inputs is very large. " << family->name_ - << " will be repeated at least " << family_size << " times.\n"; - } - // reserve in the special case the regex ".", since we know the final - // family size. 
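// (Editor's note, illustrative: a family with args_ = {{8}, {64}, {512}}
// and thread_counts_ = {1, 2} yields family_size = 3 * 2 = 6 instances.)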
- if (spec == ".") benchmarks->reserve(family_size); - - for (auto const& args : family->args_) { - for (int num_threads : *thread_counts) { - Benchmark::Instance instance; - instance.name = family->name_; - instance.benchmark = family.get(); - instance.report_mode = family->report_mode_; - instance.arg = args; - instance.time_unit = family->time_unit_; - instance.range_multiplier = family->range_multiplier_; - instance.min_time = family->min_time_; - instance.iterations = family->iterations_; - instance.repetitions = family->repetitions_; - instance.use_real_time = family->use_real_time_; - instance.use_manual_time = family->use_manual_time_; - instance.complexity = family->complexity_; - instance.complexity_lambda = family->complexity_lambda_; - instance.threads = num_threads; - - // Add arguments to instance name - size_t arg_i = 0; - for (auto const& arg : args) { - instance.name += "/"; - - if (arg_i < family->arg_names_.size()) { - const auto& arg_name = family->arg_names_[arg_i]; - if (!arg_name.empty()) { - instance.name += - StringPrintF("%s:", family->arg_names_[arg_i].c_str()); - } - } - - instance.name += StringPrintF("%d", arg); - ++arg_i; - } - - if (!IsZero(family->min_time_)) - instance.name += StringPrintF("/min_time:%0.3f", family->min_time_); - if (family->iterations_ != 0) - instance.name += StringPrintF("/iterations:%d", family->iterations_); - if (family->repetitions_ != 0) - instance.name += StringPrintF("/repeats:%d", family->repetitions_); - - if (family->use_manual_time_) { - instance.name += "/manual_time"; - } else if (family->use_real_time_) { - instance.name += "/real_time"; - } - - // Add the number of threads used to the name - if (!family->thread_counts_.empty()) { - instance.name += StringPrintF("/threads:%d", instance.threads); - } - - if (re.Match(instance.name)) { - instance.last_benchmark_instance = (&args == &family->args_.back()); - benchmarks->push_back(std::move(instance)); - } - } - } - } - return true; -} - -Benchmark* RegisterBenchmarkInternal(Benchmark* bench) { - std::unique_ptr bench_ptr(bench); - BenchmarkFamilies* families = BenchmarkFamilies::GetInstance(); - families->AddBenchmark(std::move(bench_ptr)); - return bench; -} - -// FIXME: This function is a hack so that benchmark.cc can access -// `BenchmarkFamilies` -bool FindBenchmarksInternal(const std::string& re, - std::vector* benchmarks, - std::ostream* Err) { - return BenchmarkFamilies::GetInstance()->FindBenchmarks(re, benchmarks, Err); -} - -//=============================================================================// -// Benchmark -//=============================================================================// - -Benchmark::Benchmark(const char* name) - : name_(name), - report_mode_(RM_Unspecified), - time_unit_(kNanosecond), - range_multiplier_(kRangeMultiplier), - min_time_(0), - iterations_(0), - repetitions_(0), - use_real_time_(false), - use_manual_time_(false), - complexity_(oNone), - complexity_lambda_(nullptr) {} - -Benchmark::~Benchmark() {} - -void Benchmark::AddRange(std::vector* dst, int lo, int hi, int mult) { - CHECK_GE(lo, 0); - CHECK_GE(hi, lo); - CHECK_GE(mult, 2); - - // Add "lo" - dst->push_back(lo); - - static const int kint32max = std::numeric_limits::max(); - - // Now space out the benchmarks in multiples of "mult" - for (int32_t i = 1; i < kint32max / mult; i *= mult) { - if (i >= hi) break; - if (i > lo) { - dst->push_back(i); - } - } - // Add "hi" (if different from "lo") - if (hi != lo) { - dst->push_back(hi); - } -} - -Benchmark* Benchmark::Arg(int 
x) { - CHECK(ArgsCnt() == -1 || ArgsCnt() == 1); - args_.push_back({x}); - return this; -} - -Benchmark* Benchmark::Unit(TimeUnit unit) { - time_unit_ = unit; - return this; -} - -Benchmark* Benchmark::Range(int start, int limit) { - CHECK(ArgsCnt() == -1 || ArgsCnt() == 1); - std::vector arglist; - AddRange(&arglist, start, limit, range_multiplier_); - - for (int i : arglist) { - args_.push_back({i}); - } - return this; -} - -Benchmark* Benchmark::Ranges(const std::vector>& ranges) { - CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast(ranges.size())); - std::vector> arglists(ranges.size()); - std::size_t total = 1; - for (std::size_t i = 0; i < ranges.size(); i++) { - AddRange(&arglists[i], ranges[i].first, ranges[i].second, - range_multiplier_); - total *= arglists[i].size(); - } - - std::vector ctr(arglists.size(), 0); - - for (std::size_t i = 0; i < total; i++) { - std::vector tmp; - tmp.reserve(arglists.size()); - - for (std::size_t j = 0; j < arglists.size(); j++) { - tmp.push_back(arglists[j].at(ctr[j])); - } - - args_.push_back(std::move(tmp)); - - for (std::size_t j = 0; j < arglists.size(); j++) { - if (ctr[j] + 1 < arglists[j].size()) { - ++ctr[j]; - break; - } - ctr[j] = 0; - } - } - return this; -} - -Benchmark* Benchmark::ArgName(const std::string& name) { - CHECK(ArgsCnt() == -1 || ArgsCnt() == 1); - arg_names_ = {name}; - return this; -} - -Benchmark* Benchmark::ArgNames(const std::vector& names) { - CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast(names.size())); - arg_names_ = names; - return this; -} - -Benchmark* Benchmark::DenseRange(int start, int limit, int step) { - CHECK(ArgsCnt() == -1 || ArgsCnt() == 1); - CHECK_GE(start, 0); - CHECK_LE(start, limit); - for (int arg = start; arg <= limit; arg += step) { - args_.push_back({arg}); - } - return this; -} - -Benchmark* Benchmark::Args(const std::vector& args) { - CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast(args.size())); - args_.push_back(args); - return this; -} - -Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) { - custom_arguments(this); - return this; -} - -Benchmark* Benchmark::RangeMultiplier(int multiplier) { - CHECK(multiplier > 1); - range_multiplier_ = multiplier; - return this; -} - - -Benchmark* Benchmark::MinTime(double t) { - CHECK(t > 0.0); - CHECK(iterations_ == 0); - min_time_ = t; - return this; -} - - -Benchmark* Benchmark::Iterations(size_t n) { - CHECK(n > 0); - CHECK(IsZero(min_time_)); - iterations_ = n; - return this; -} - -Benchmark* Benchmark::Repetitions(int n) { - CHECK(n > 0); - repetitions_ = n; - return this; -} - -Benchmark* Benchmark::ReportAggregatesOnly(bool value) { - report_mode_ = value ? 
RM_ReportAggregatesOnly : RM_Default; - return this; -} - -Benchmark* Benchmark::UseRealTime() { - CHECK(!use_manual_time_) - << "Cannot set UseRealTime and UseManualTime simultaneously."; - use_real_time_ = true; - return this; -} - -Benchmark* Benchmark::UseManualTime() { - CHECK(!use_real_time_) - << "Cannot set UseRealTime and UseManualTime simultaneously."; - use_manual_time_ = true; - return this; -} - -Benchmark* Benchmark::Complexity(BigO complexity) { - complexity_ = complexity; - return this; -} - -Benchmark* Benchmark::Complexity(BigOFunc* complexity) { - complexity_lambda_ = complexity; - complexity_ = oLambda; - return this; -} - -Benchmark* Benchmark::Threads(int t) { - CHECK_GT(t, 0); - thread_counts_.push_back(t); - return this; -} - -Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) { - CHECK_GT(min_threads, 0); - CHECK_GE(max_threads, min_threads); - - AddRange(&thread_counts_, min_threads, max_threads, 2); - return this; -} - -Benchmark* Benchmark::DenseThreadRange(int min_threads, int max_threads, - int stride) { - CHECK_GT(min_threads, 0); - CHECK_GE(max_threads, min_threads); - CHECK_GE(stride, 1); - - for (auto i = min_threads; i < max_threads; i += stride) { - thread_counts_.push_back(i); - } - thread_counts_.push_back(max_threads); - return this; -} - -Benchmark* Benchmark::ThreadPerCpu() { - static int num_cpus = NumCPUs(); - thread_counts_.push_back(num_cpus); - return this; -} - -void Benchmark::SetName(const char* name) { name_ = name; } - -int Benchmark::ArgsCnt() const { - if (args_.empty()) { - if (arg_names_.empty()) return -1; - return static_cast(arg_names_.size()); - } - return static_cast(args_.front().size()); -} - -//=============================================================================// -// FunctionBenchmark -//=============================================================================// - -void FunctionBenchmark::Run(State& st) { func_(st); } - -} // end namespace internal - -void ClearRegisteredBenchmarks() { - internal::BenchmarkFamilies::GetInstance()->ClearBenchmarks(); -} - -} // end namespace benchmark diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/check.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/check.h deleted file mode 100644 index 73bead2fb55..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/check.h +++ /dev/null @@ -1,79 +0,0 @@ -#ifndef CHECK_H_ -#define CHECK_H_ - -#include -#include -#include - -#include "internal_macros.h" -#include "log.h" - -namespace benchmark { -namespace internal { - -typedef void(AbortHandlerT)(); - -inline AbortHandlerT*& GetAbortHandler() { - static AbortHandlerT* handler = &std::abort; - return handler; -} - -BENCHMARK_NORETURN inline void CallAbortHandler() { - GetAbortHandler()(); - std::abort(); // fallback to enforce noreturn -} - -// CheckHandler is the class constructed by failing CHECK macros. CheckHandler -// will log information about the failures and abort when it is destructed. -class CheckHandler { - public: - CheckHandler(const char* check, const char* file, const char* func, int line) - : log_(GetErrorLogInstance()) { - log_ << file << ":" << line << ": " << func << ": Check `" << check - << "' failed. 
"; - } - - LogType& GetLog() { return log_; } - - BENCHMARK_NORETURN ~CheckHandler() BENCHMARK_NOEXCEPT_OP(false) { - log_ << std::endl; - CallAbortHandler(); - } - - CheckHandler& operator=(const CheckHandler&) = delete; - CheckHandler(const CheckHandler&) = delete; - CheckHandler() = delete; - - private: - LogType& log_; -}; - -} // end namespace internal -} // end namespace benchmark - -// The CHECK macro returns a std::ostream object that can have extra information -// written to it. -#ifndef NDEBUG -#define CHECK(b) \ - (b ? ::benchmark::internal::GetNullLogInstance() \ - : ::benchmark::internal::CheckHandler(#b, __FILE__, __func__, __LINE__) \ - .GetLog()) -#else -#define CHECK(b) ::benchmark::internal::GetNullLogInstance() -#endif - -#define CHECK_EQ(a, b) CHECK((a) == (b)) -#define CHECK_NE(a, b) CHECK((a) != (b)) -#define CHECK_GE(a, b) CHECK((a) >= (b)) -#define CHECK_LE(a, b) CHECK((a) <= (b)) -#define CHECK_GT(a, b) CHECK((a) > (b)) -#define CHECK_LT(a, b) CHECK((a) < (b)) - -#define CHECK_FLOAT_EQ(a, b, eps) CHECK(std::fabs((a) - (b)) < (eps)) -#define CHECK_FLOAT_NE(a, b, eps) CHECK(std::fabs((a) - (b)) >= (eps)) -#define CHECK_FLOAT_GE(a, b, eps) CHECK((a) - (b) > -(eps)) -#define CHECK_FLOAT_LE(a, b, eps) CHECK((b) - (a) > -(eps)) -#define CHECK_FLOAT_GT(a, b, eps) CHECK((a) - (b) > (eps)) -#define CHECK_FLOAT_LT(a, b, eps) CHECK((b) - (a) > (eps)) - -#endif // CHECK_H_ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/colorprint.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/colorprint.cc deleted file mode 100644 index 2dec4a8b28b..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/colorprint.cc +++ /dev/null @@ -1,188 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "colorprint.h" - -#include -#include -#include -#include -#include -#include - -#include "check.h" -#include "internal_macros.h" - -#ifdef BENCHMARK_OS_WINDOWS -#include -#include -#else -#include -#endif // BENCHMARK_OS_WINDOWS - -namespace benchmark { -namespace { -#ifdef BENCHMARK_OS_WINDOWS -typedef WORD PlatformColorCode; -#else -typedef const char* PlatformColorCode; -#endif - -PlatformColorCode GetPlatformColorCode(LogColor color) { -#ifdef BENCHMARK_OS_WINDOWS - switch (color) { - case COLOR_RED: - return FOREGROUND_RED; - case COLOR_GREEN: - return FOREGROUND_GREEN; - case COLOR_YELLOW: - return FOREGROUND_RED | FOREGROUND_GREEN; - case COLOR_BLUE: - return FOREGROUND_BLUE; - case COLOR_MAGENTA: - return FOREGROUND_BLUE | FOREGROUND_RED; - case COLOR_CYAN: - return FOREGROUND_BLUE | FOREGROUND_GREEN; - case COLOR_WHITE: // fall through to default - default: - return 0; - } -#else - switch (color) { - case COLOR_RED: - return "1"; - case COLOR_GREEN: - return "2"; - case COLOR_YELLOW: - return "3"; - case COLOR_BLUE: - return "4"; - case COLOR_MAGENTA: - return "5"; - case COLOR_CYAN: - return "6"; - case COLOR_WHITE: - return "7"; - default: - return nullptr; - }; -#endif -} - -} // end namespace - -std::string FormatString(const char* msg, va_list args) { - // we might need a second shot at this, so pre-emptivly make a copy - va_list args_cp; - va_copy(args_cp, args); - - std::size_t size = 256; - char local_buff[256]; - auto ret = vsnprintf(local_buff, size, msg, args_cp); - - va_end(args_cp); - - // currently there is no error handling for failure, so this is hack. - CHECK(ret >= 0); - - if (ret == 0) // handle empty expansion - return {}; - else if (static_cast(ret) < size) - return local_buff; - else { - // we did not provide a long enough buffer on our first attempt. - size = (size_t)ret + 1; // + 1 for the null byte - std::unique_ptr buff(new char[size]); - ret = vsnprintf(buff.get(), size, msg, args); - CHECK(ret > 0 && ((size_t)ret) < size); - return buff.get(); - } -} - -std::string FormatString(const char* msg, ...) { - va_list args; - va_start(args, msg); - auto tmp = FormatString(msg, args); - va_end(args); - return tmp; -} - -void ColorPrintf(std::ostream& out, LogColor color, const char* fmt, ...) { - va_list args; - va_start(args, fmt); - ColorPrintf(out, color, fmt, args); - va_end(args); -} - -void ColorPrintf(std::ostream& out, LogColor color, const char* fmt, - va_list args) { -#ifdef BENCHMARK_OS_WINDOWS - ((void)out); // suppress unused warning - - const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE); - - // Gets the current text color. - CONSOLE_SCREEN_BUFFER_INFO buffer_info; - GetConsoleScreenBufferInfo(stdout_handle, &buffer_info); - const WORD old_color_attrs = buffer_info.wAttributes; - - // We need to flush the stream buffers into the console before each - // SetConsoleTextAttribute call lest it affect the text that is already - // printed but has not yet reached the console. - fflush(stdout); - SetConsoleTextAttribute(stdout_handle, - GetPlatformColorCode(color) | FOREGROUND_INTENSITY); - vprintf(fmt, args); - - fflush(stdout); - // Restores the text color. 
- SetConsoleTextAttribute(stdout_handle, old_color_attrs); -#else - const char* color_code = GetPlatformColorCode(color); - if (color_code) out << FormatString("\033[0;3%sm", color_code); - out << FormatString(fmt, args) << "\033[m"; -#endif -} - -bool IsColorTerminal() { -#if BENCHMARK_OS_WINDOWS - // On Windows the TERM variable is usually not set, but the - // console there does support colors. - return 0 != _isatty(_fileno(stdout)); -#else - // On non-Windows platforms, we rely on the TERM variable. This list of - // supported TERM values is copied from Google Test: - // . - const char* const SUPPORTED_TERM_VALUES[] = { - "xterm", "xterm-color", "xterm-256color", - "screen", "screen-256color", "tmux", - "tmux-256color", "rxvt-unicode", "rxvt-unicode-256color", - "linux", "cygwin", - }; - - const char* const term = getenv("TERM"); - - bool term_supports_color = false; - for (const char* candidate : SUPPORTED_TERM_VALUES) { - if (term && 0 == strcmp(term, candidate)) { - term_supports_color = true; - break; - } - } - - return 0 != isatty(fileno(stdout)) && term_supports_color; -#endif // BENCHMARK_OS_WINDOWS -} - -} // end namespace benchmark diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/colorprint.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/colorprint.h deleted file mode 100644 index 9f6fab9b342..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/colorprint.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef BENCHMARK_COLORPRINT_H_ -#define BENCHMARK_COLORPRINT_H_ - -#include -#include -#include - -namespace benchmark { -enum LogColor { - COLOR_DEFAULT, - COLOR_RED, - COLOR_GREEN, - COLOR_YELLOW, - COLOR_BLUE, - COLOR_MAGENTA, - COLOR_CYAN, - COLOR_WHITE -}; - -std::string FormatString(const char* msg, va_list args); -std::string FormatString(const char* msg, ...); - -void ColorPrintf(std::ostream& out, LogColor color, const char* fmt, - va_list args); -void ColorPrintf(std::ostream& out, LogColor color, const char* fmt, ...); - -// Returns true if stdout appears to be a terminal that supports colored -// output, false otherwise. -bool IsColorTerminal(); - -} // end namespace benchmark - -#endif // BENCHMARK_COLORPRINT_H_ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/commandlineflags.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/commandlineflags.cc deleted file mode 100644 index 2fc92517a32..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/commandlineflags.cc +++ /dev/null @@ -1,218 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "commandlineflags.h" - -#include -#include -#include -#include -#include - -namespace benchmark { -// Parses 'str' for a 32-bit signed integer. If successful, writes -// the result to *value and returns true; otherwise leaves *value -// unchanged and returns false. 
-bool ParseInt32(const std::string& src_text, const char* str, int32_t* value) { - // Parses the environment variable as a decimal integer. - char* end = nullptr; - const long long_value = strtol(str, &end, 10); // NOLINT - - // Has strtol() consumed all characters in the string? - if (*end != '\0') { - // No - an invalid character was encountered. - std::cerr << src_text << " is expected to be a 32-bit integer, " - << "but actually has value \"" << str << "\".\n"; - return false; - } - - // Is the parsed value in the range of an Int32? - const int32_t result = static_cast<int32_t>(long_value); - if (long_value == std::numeric_limits<long>::max() || - long_value == std::numeric_limits<long>::min() || - // The parsed value overflows as a long. (strtol() returns - // LONG_MAX or LONG_MIN when the input overflows.) - result != long_value - // The parsed value overflows as an Int32. - ) { - std::cerr << src_text << " is expected to be a 32-bit integer, " - << "but actually has value \"" << str << "\", " - << "which overflows.\n"; - return false; - } - - *value = result; - return true; -} - -// Parses 'str' for a double. If successful, writes the result to *value and -// returns true; otherwise leaves *value unchanged and returns false. -bool ParseDouble(const std::string& src_text, const char* str, double* value) { - // Parses the environment variable as a decimal integer. - char* end = nullptr; - const double double_value = strtod(str, &end); // NOLINT - - // Has strtol() consumed all characters in the string? - if (*end != '\0') { - // No - an invalid character was encountered. - std::cerr << src_text << " is expected to be a double, " - << "but actually has value \"" << str << "\".\n"; - return false; - } - - *value = double_value; - return true; -} - -// Returns the name of the environment variable corresponding to the -// given flag. For example, FlagToEnvVar("foo") will return -// "BENCHMARK_FOO" in the open-source version. -static std::string FlagToEnvVar(const char* flag) { - const std::string flag_str(flag); - - std::string env_var; - for (size_t i = 0; i != flag_str.length(); ++i) - env_var += static_cast<char>(::toupper(flag_str.c_str()[i])); - - return "BENCHMARK_" + env_var; -} - -// Reads and returns the Boolean environment variable corresponding to -// the given flag; if it's not set, returns default_value. -// -// The value is considered true iff it's not "0". -bool BoolFromEnv(const char* flag, bool default_value) { - const std::string env_var = FlagToEnvVar(flag); - const char* const string_value = getenv(env_var.c_str()); - return string_value == nullptr ? default_value - : strcmp(string_value, "0") != 0; -} - -// Reads and returns a 32-bit integer stored in the environment -// variable corresponding to the given flag; if it isn't set or -// doesn't represent a valid 32-bit integer, returns default_value. -int32_t Int32FromEnv(const char* flag, int32_t default_value) { - const std::string env_var = FlagToEnvVar(flag); - const char* const string_value = getenv(env_var.c_str()); - if (string_value == nullptr) { - // The environment variable is not set. - return default_value; - } - - int32_t result = default_value; - if (!ParseInt32(std::string("Environment variable ") + env_var, string_value, - &result)) { - std::cout << "The default value " << default_value << " is used.\n"; - return default_value; - } - - return result; -} - -// Reads and returns the string environment variable corresponding to -// the given flag; if it's not set, returns default_value.
-const char* StringFromEnv(const char* flag, const char* default_value) { - const std::string env_var = FlagToEnvVar(flag); - const char* const value = getenv(env_var.c_str()); - return value == nullptr ? default_value : value; -} - -// Parses a string as a command line flag. The string should have -// the format "--flag=value". When def_optional is true, the "=value" -// part can be omitted. -// -// Returns the value of the flag, or nullptr if the parsing failed. -const char* ParseFlagValue(const char* str, const char* flag, - bool def_optional) { - // str and flag must not be nullptr. - if (str == nullptr || flag == nullptr) return nullptr; - - // The flag must start with "--". - const std::string flag_str = std::string("--") + std::string(flag); - const size_t flag_len = flag_str.length(); - if (strncmp(str, flag_str.c_str(), flag_len) != 0) return nullptr; - - // Skips the flag name. - const char* flag_end = str + flag_len; - - // When def_optional is true, it's OK to not have a "=value" part. - if (def_optional && (flag_end[0] == '\0')) return flag_end; - - // If def_optional is true and there are more characters after the - // flag name, or if def_optional is false, there must be a '=' after - // the flag name. - if (flag_end[0] != '=') return nullptr; - - // Returns the string after "=". - return flag_end + 1; -} - -bool ParseBoolFlag(const char* str, const char* flag, bool* value) { - // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, true); - - // Aborts if the parsing failed. - if (value_str == nullptr) return false; - - // Converts the string value to a bool. - *value = IsTruthyFlagValue(value_str); - return true; -} - -bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) { - // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, false); - - // Aborts if the parsing failed. - if (value_str == nullptr) return false; - - // Sets *value to the value of the flag. - return ParseInt32(std::string("The value of flag --") + flag, value_str, - value); -} - -bool ParseDoubleFlag(const char* str, const char* flag, double* value) { - // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, false); - - // Aborts if the parsing failed. - if (value_str == nullptr) return false; - - // Sets *value to the value of the flag. - return ParseDouble(std::string("The value of flag --") + flag, value_str, - value); -} - -bool ParseStringFlag(const char* str, const char* flag, std::string* value) { - // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, false); - - // Aborts if the parsing failed. 
- if (value_str == nullptr) return false; - - *value = value_str; - return true; -} - -bool IsFlag(const char* str, const char* flag) { - return (ParseFlagValue(str, flag, true) != nullptr); -} - -bool IsTruthyFlagValue(const std::string& value) { - if (value.empty()) return true; - char ch = value[0]; - return isalnum(ch) && - !(ch == '0' || ch == 'f' || ch == 'F' || ch == 'n' || ch == 'N'); -} -} // end namespace benchmark diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/commandlineflags.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/commandlineflags.h deleted file mode 100644 index 945c9a9fc4a..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/commandlineflags.h +++ /dev/null @@ -1,79 +0,0 @@ -#ifndef BENCHMARK_COMMANDLINEFLAGS_H_ -#define BENCHMARK_COMMANDLINEFLAGS_H_ - -#include -#include - -// Macro for referencing flags. -#define FLAG(name) FLAGS_##name - -// Macros for declaring flags. -#define DECLARE_bool(name) extern bool FLAG(name) -#define DECLARE_int32(name) extern int32_t FLAG(name) -#define DECLARE_int64(name) extern int64_t FLAG(name) -#define DECLARE_double(name) extern double FLAG(name) -#define DECLARE_string(name) extern std::string FLAG(name) - -// Macros for defining flags. -#define DEFINE_bool(name, default_val, doc) bool FLAG(name) = (default_val) -#define DEFINE_int32(name, default_val, doc) int32_t FLAG(name) = (default_val) -#define DEFINE_int64(name, default_val, doc) int64_t FLAG(name) = (default_val) -#define DEFINE_double(name, default_val, doc) double FLAG(name) = (default_val) -#define DEFINE_string(name, default_val, doc) \ - std::string FLAG(name) = (default_val) - -namespace benchmark { -// Parses 'str' for a 32-bit signed integer. If successful, writes the result -// to *value and returns true; otherwise leaves *value unchanged and returns -// false. -bool ParseInt32(const std::string& src_text, const char* str, int32_t* value); - -// Parses a bool/Int32/string from the environment variable -// corresponding to the given Google Test flag. -bool BoolFromEnv(const char* flag, bool default_val); -int32_t Int32FromEnv(const char* flag, int32_t default_val); -double DoubleFromEnv(const char* flag, double default_val); -const char* StringFromEnv(const char* flag, const char* default_val); - -// Parses a string for a bool flag, in the form of either -// "--flag=value" or "--flag". -// -// In the former case, the value is taken as true if it passes IsTruthyValue(). -// -// In the latter case, the value is taken as true. -// -// On success, stores the value of the flag in *value, and returns -// true. On failure, returns false without changing *value. -bool ParseBoolFlag(const char* str, const char* flag, bool* value); - -// Parses a string for an Int32 flag, in the form of -// "--flag=value". -// -// On success, stores the value of the flag in *value, and returns -// true. On failure, returns false without changing *value. -bool ParseInt32Flag(const char* str, const char* flag, int32_t* value); - -// Parses a string for a Double flag, in the form of -// "--flag=value". -// -// On success, stores the value of the flag in *value, and returns -// true. On failure, returns false without changing *value. -bool ParseDoubleFlag(const char* str, const char* flag, double* value); - -// Parses a string for a string flag, in the form of -// "--flag=value". -// -// On success, stores the value of the flag in *value, and returns -// true. On failure, returns false without changing *value. 
-bool ParseStringFlag(const char* str, const char* flag, std::string* value); - -// Returns true if the string matches the flag. -bool IsFlag(const char* str, const char* flag); - -// Returns true unless value starts with one of: '0', 'f', 'F', 'n' or 'N', or -// some non-alphanumeric character. As a special case, also returns true if -// value is the empty string. -bool IsTruthyFlagValue(const std::string& value); -} // end namespace benchmark - -#endif // BENCHMARK_COMMANDLINEFLAGS_H_ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/complexity.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/complexity.cc deleted file mode 100644 index 33975be55ec..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/complexity.cc +++ /dev/null @@ -1,324 +0,0 @@ -// Copyright 2016 Ismael Jimenez Martinez. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Source project : https://github.com/ismaelJimenez/cpp.leastsq -// Adapted to be used with google benchmark - -#include "benchmark/benchmark.h" - -#include -#include -#include "check.h" -#include "complexity.h" -#include "stat.h" - -namespace benchmark { - -// Internal function to calculate the different scalability forms -BigOFunc* FittingCurve(BigO complexity) { - switch (complexity) { - case oN: - return [](int n) -> double { return n; }; - case oNSquared: - return [](int n) -> double { return std::pow(n, 2); }; - case oNCubed: - return [](int n) -> double { return std::pow(n, 3); }; - case oLogN: - return [](int n) { return log2(n); }; - case oNLogN: - return [](int n) { return n * log2(n); }; - case o1: - default: - return [](int) { return 1.0; }; - } -} - -// Function to return an string for the calculated complexity -std::string GetBigOString(BigO complexity) { - switch (complexity) { - case oN: - return "N"; - case oNSquared: - return "N^2"; - case oNCubed: - return "N^3"; - case oLogN: - return "lgN"; - case oNLogN: - return "NlgN"; - case o1: - return "(1)"; - default: - return "f(N)"; - } -} - -// Find the coefficient for the high-order term in the running time, by -// minimizing the sum of squares of relative error, for the fitting curve -// given by the lambda expresion. -// - n : Vector containing the size of the benchmark tests. -// - time : Vector containing the times for the benchmark tests. -// - fitting_curve : lambda expresion (e.g. [](int n) {return n; };). 
- -// For a deeper explanation on the algorithm logic, look the README file at -// http://github.com/ismaelJimenez/Minimal-Cpp-Least-Squared-Fit - -LeastSq MinimalLeastSq(const std::vector<int>& n, - const std::vector<double>& time, - BigOFunc* fitting_curve) { - double sigma_gn = 0.0; - double sigma_gn_squared = 0.0; - double sigma_time = 0.0; - double sigma_time_gn = 0.0; - - // Calculate least square fitting parameter - for (size_t i = 0; i < n.size(); ++i) { - double gn_i = fitting_curve(n[i]); - sigma_gn += gn_i; - sigma_gn_squared += gn_i * gn_i; - sigma_time += time[i]; - sigma_time_gn += time[i] * gn_i; - } - - LeastSq result; - result.complexity = oLambda; - - // Calculate complexity. - result.coef = sigma_time_gn / sigma_gn_squared; - - // Calculate RMS - double rms = 0.0; - for (size_t i = 0; i < n.size(); ++i) { - double fit = result.coef * fitting_curve(n[i]); - rms += pow((time[i] - fit), 2); - } - - // Normalized RMS by the mean of the observed values - double mean = sigma_time / n.size(); - result.rms = sqrt(rms / n.size()) / mean; - - return result; -} - -// Find the coefficient for the high-order term in the running time, by -// minimizing the sum of squares of relative error. -// - n : Vector containing the size of the benchmark tests. -// - time : Vector containing the times for the benchmark tests. -// - complexity : If different than oAuto, the fitting curve will stick to -// this one. If it is oAuto, it will be calculated the best -// fitting curve. -LeastSq MinimalLeastSq(const std::vector<int>& n, - const std::vector<double>& time, const BigO complexity) { - CHECK_EQ(n.size(), time.size()); - CHECK_GE(n.size(), 2); // Do not compute fitting curve is less than two - // benchmark runs are given - CHECK_NE(complexity, oNone); - - LeastSq best_fit; - - if (complexity == oAuto) { - std::vector<BigO> fit_curves = {oLogN, oN, oNLogN, oNSquared, oNCubed}; - - // Take o1 as default best fitting curve - best_fit = MinimalLeastSq(n, time, FittingCurve(o1)); - best_fit.complexity = o1; - - // Compute all possible fitting curves and stick to the best one - for (const auto& fit : fit_curves) { - LeastSq current_fit = MinimalLeastSq(n, time, FittingCurve(fit)); - if (current_fit.rms < best_fit.rms) { - best_fit = current_fit; - best_fit.complexity = fit; - } - } - } else { - best_fit = MinimalLeastSq(n, time, FittingCurve(complexity)); - best_fit.complexity = complexity; - } - - return best_fit; -} - -std::vector<BenchmarkReporter::Run> ComputeStats( - const std::vector<BenchmarkReporter::Run>& reports) { - typedef BenchmarkReporter::Run Run; - std::vector<Run> results; - - auto error_count = - std::count_if(reports.begin(), reports.end(), - [](Run const& run) { return run.error_occurred; }); - - if (reports.size() - error_count < 2) { - // We don't report aggregated data if there was a single run. - return results; - } - // Accumulators. - Stat1_d real_accumulated_time_stat; - Stat1_d cpu_accumulated_time_stat; - Stat1_d bytes_per_second_stat; - Stat1_d items_per_second_stat; - // All repetitions should be run with the same number of iterations so we - // can take this information from the first benchmark.
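For context on MinimalLeastSq above: the single accumulation loop is the closed-form solution of the one-parameter fit, minimizing the squared error of t_i ≈ coef · g(n_i). In the file's own variable names:

$$\text{coef} = \frac{\sum_i t_i\, g(n_i)}{\sum_i g(n_i)^2}, \qquad \text{rms} = \frac{1}{\bar t}\,\sqrt{\frac{1}{N}\sum_i \bigl(t_i - \text{coef}\, g(n_i)\bigr)^2}$$

where g is the candidate curve returned by FittingCurve and t̄ is the mean observed time, so the reported rms is relative to the magnitude of the measurements rather than absolute.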
- int64_t const run_iterations = reports.front().iterations; - // create stats for user counters - struct CounterStat { - Counter c; - Stat1_d s; - }; - std::map< std::string, CounterStat > counter_stats; - for(Run const& r : reports) { - for(auto const& cnt : r.counters) { - auto it = counter_stats.find(cnt.first); - if(it == counter_stats.end()) { - counter_stats.insert({cnt.first, {cnt.second, Stat1_d{}}}); - } else { - CHECK_EQ(counter_stats[cnt.first].c.flags, cnt.second.flags); - } - } - } - - // Populate the accumulators. - for (Run const& run : reports) { - CHECK_EQ(reports[0].benchmark_name, run.benchmark_name); - CHECK_EQ(run_iterations, run.iterations); - if (run.error_occurred) continue; - real_accumulated_time_stat += - Stat1_d(run.real_accumulated_time / run.iterations); - cpu_accumulated_time_stat += - Stat1_d(run.cpu_accumulated_time / run.iterations); - items_per_second_stat += Stat1_d(run.items_per_second); - bytes_per_second_stat += Stat1_d(run.bytes_per_second); - // user counters - for(auto const& cnt : run.counters) { - auto it = counter_stats.find(cnt.first); - CHECK_NE(it, counter_stats.end()); - it->second.s += Stat1_d(cnt.second); - } - } - - // Get the data from the accumulator to BenchmarkReporter::Run's. - Run mean_data; - mean_data.benchmark_name = reports[0].benchmark_name + "_mean"; - mean_data.iterations = run_iterations; - mean_data.real_accumulated_time = - real_accumulated_time_stat.Mean() * run_iterations; - mean_data.cpu_accumulated_time = - cpu_accumulated_time_stat.Mean() * run_iterations; - mean_data.bytes_per_second = bytes_per_second_stat.Mean(); - mean_data.items_per_second = items_per_second_stat.Mean(); - mean_data.time_unit = reports[0].time_unit; - // user counters - for(auto const& kv : counter_stats) { - auto c = Counter(kv.second.s.Mean(), counter_stats[kv.first].c.flags); - mean_data.counters[kv.first] = c; - } - - // Only add label to mean/stddev if it is same for all runs - mean_data.report_label = reports[0].report_label; - for (std::size_t i = 1; i < reports.size(); i++) { - if (reports[i].report_label != reports[0].report_label) { - mean_data.report_label = ""; - break; - } - } - - Run stddev_data; - stddev_data.benchmark_name = reports[0].benchmark_name + "_stddev"; - stddev_data.report_label = mean_data.report_label; - stddev_data.iterations = 0; - stddev_data.real_accumulated_time = real_accumulated_time_stat.StdDev(); - stddev_data.cpu_accumulated_time = cpu_accumulated_time_stat.StdDev(); - stddev_data.bytes_per_second = bytes_per_second_stat.StdDev(); - stddev_data.items_per_second = items_per_second_stat.StdDev(); - stddev_data.time_unit = reports[0].time_unit; - // user counters - for(auto const& kv : counter_stats) { - auto c = Counter(kv.second.s.StdDev(), counter_stats[kv.first].c.flags); - stddev_data.counters[kv.first] = c; - } - - results.push_back(mean_data); - results.push_back(stddev_data); - return results; -} - -std::vector ComputeBigO( - const std::vector& reports) { - typedef BenchmarkReporter::Run Run; - std::vector results; - - if (reports.size() < 2) return results; - - // Accumulators. - std::vector n; - std::vector real_time; - std::vector cpu_time; - - // Populate the accumulators. 
- for (const Run& run : reports) { - CHECK_GT(run.complexity_n, 0) << "Did you forget to call SetComplexityN?"; - n.push_back(run.complexity_n); - real_time.push_back(run.real_accumulated_time / run.iterations); - cpu_time.push_back(run.cpu_accumulated_time / run.iterations); - } - - LeastSq result_cpu; - LeastSq result_real; - - if (reports[0].complexity == oLambda) { - result_cpu = MinimalLeastSq(n, cpu_time, reports[0].complexity_lambda); - result_real = MinimalLeastSq(n, real_time, reports[0].complexity_lambda); - } else { - result_cpu = MinimalLeastSq(n, cpu_time, reports[0].complexity); - result_real = MinimalLeastSq(n, real_time, result_cpu.complexity); - } - std::string benchmark_name = - reports[0].benchmark_name.substr(0, reports[0].benchmark_name.find('/')); - - // Get the data from the accumulator to BenchmarkReporter::Run's. - Run big_o; - big_o.benchmark_name = benchmark_name + "_BigO"; - big_o.iterations = 0; - big_o.real_accumulated_time = result_real.coef; - big_o.cpu_accumulated_time = result_cpu.coef; - big_o.report_big_o = true; - big_o.complexity = result_cpu.complexity; - - // All the time results are reported after being multiplied by the - // time unit multiplier. But since RMS is a relative quantity it - // should not be multiplied at all. So, here, we _divide_ it by the - // multiplier so that when it is multiplied later the result is the - // correct one. - double multiplier = GetTimeUnitMultiplier(reports[0].time_unit); - - // Only add label to mean/stddev if it is same for all runs - Run rms; - big_o.report_label = reports[0].report_label; - rms.benchmark_name = benchmark_name + "_RMS"; - rms.report_label = big_o.report_label; - rms.iterations = 0; - rms.real_accumulated_time = result_real.rms / multiplier; - rms.cpu_accumulated_time = result_cpu.rms / multiplier; - rms.report_rms = true; - rms.complexity = result_cpu.complexity; - // don't forget to keep the time unit, or we won't be able to - // recover the correct value. - rms.time_unit = reports[0].time_unit; - - results.push_back(big_o); - results.push_back(rms); - return results; -} - -} // end namespace benchmark diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/complexity.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/complexity.h deleted file mode 100644 index c0ca60e6bec..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/complexity.h +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2016 Ismael Jimenez Martinez. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Source project : https://github.com/ismaelJimenez/cpp.leastsq -// Adapted to be used with google benchmark - -#ifndef COMPLEXITY_H_ -#define COMPLEXITY_H_ - -#include -#include - -#include "benchmark/benchmark.h" - -namespace benchmark { - -// Return a vector containing the mean and standard devation information for -// the specified list of reports. 
If 'reports' contains less than two -// non-errored runs an empty vector is returned -std::vector ComputeStats( - const std::vector& reports); - -// Return a vector containing the bigO and RMS information for the specified -// list of reports. If 'reports.size() < 2' an empty vector is returned. -std::vector ComputeBigO( - const std::vector& reports); - -// This data structure will contain the result returned by MinimalLeastSq -// - coef : Estimated coeficient for the high-order term as -// interpolated from data. -// - rms : Normalized Root Mean Squared Error. -// - complexity : Scalability form (e.g. oN, oNLogN). In case a scalability -// form has been provided to MinimalLeastSq this will return -// the same value. In case BigO::oAuto has been selected, this -// parameter will return the best fitting curve detected. - -struct LeastSq { - LeastSq() : coef(0.0), rms(0.0), complexity(oNone) {} - - double coef; - double rms; - BigO complexity; -}; - -// Function to return an string for the calculated complexity -std::string GetBigOString(BigO complexity); - -} // end namespace benchmark -#endif // COMPLEXITY_H_ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/console_reporter.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/console_reporter.cc deleted file mode 100644 index 4bb6f71271c..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/console_reporter.cc +++ /dev/null @@ -1,180 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "benchmark/benchmark.h" -#include "complexity.h" -#include "counter.h" - -#include -#include -#include -#include -#include -#include -#include - -#include "check.h" -#include "colorprint.h" -#include "commandlineflags.h" -#include "internal_macros.h" -#include "string_util.h" -#include "timers.h" - -namespace benchmark { - -bool ConsoleReporter::ReportContext(const Context& context) { - name_field_width_ = context.name_field_width; - printed_header_ = false; - prev_counters_.clear(); - - PrintBasicContext(&GetErrorStream(), context); - -#ifdef BENCHMARK_OS_WINDOWS - if ((output_options_ & OO_Color) && &std::cout != &GetOutputStream()) { - GetErrorStream() - << "Color printing is only supported for stdout on windows." 
- " Disabling color printing\n"; - output_options_ = static_cast< OutputOptions >(output_options_ & ~OO_Color); - } -#endif - - return true; -} - -void ConsoleReporter::PrintHeader(const Run& run) { - std::string str = FormatString("%-*s %13s %13s %10s", static_cast(name_field_width_), - "Benchmark", "Time", "CPU", "Iterations"); - if(!run.counters.empty()) { - if(output_options_ & OO_Tabular) { - for(auto const& c : run.counters) { - str += FormatString(" %10s", c.first.c_str()); - } - } else { - str += " UserCounters..."; - } - } - str += "\n"; - std::string line = std::string(str.length(), '-'); - GetOutputStream() << line << "\n" << str << line << "\n"; -} - -void ConsoleReporter::ReportRuns(const std::vector& reports) { - for (const auto& run : reports) { - // print the header: - // --- if none was printed yet - bool print_header = !printed_header_; - // --- or if the format is tabular and this run - // has different fields from the prev header - print_header |= (output_options_ & OO_Tabular) && - (!internal::SameNames(run.counters, prev_counters_)); - if (print_header) { - printed_header_ = true; - prev_counters_ = run.counters; - PrintHeader(run); - } - // As an alternative to printing the headers like this, we could sort - // the benchmarks by header and then print. But this would require - // waiting for the full results before printing, or printing twice. - PrintRunData(run); - } -} - -static void IgnoreColorPrint(std::ostream& out, LogColor, const char* fmt, - ...) { - va_list args; - va_start(args, fmt); - out << FormatString(fmt, args); - va_end(args); -} - -void ConsoleReporter::PrintRunData(const Run& result) { - typedef void(PrinterFn)(std::ostream&, LogColor, const char*, ...); - auto& Out = GetOutputStream(); - PrinterFn* printer = (output_options_ & OO_Color) ? - (PrinterFn*)ColorPrintf : IgnoreColorPrint; - auto name_color = - (result.report_big_o || result.report_rms) ? 
COLOR_BLUE : COLOR_GREEN; - printer(Out, name_color, "%-*s ", name_field_width_, - result.benchmark_name.c_str()); - - if (result.error_occurred) { - printer(Out, COLOR_RED, "ERROR OCCURRED: \'%s\'", - result.error_message.c_str()); - printer(Out, COLOR_DEFAULT, "\n"); - return; - } - // Format bytes per second - std::string rate; - if (result.bytes_per_second > 0) { - rate = StrCat(" ", HumanReadableNumber(result.bytes_per_second), "B/s"); - } - - // Format items per second - std::string items; - if (result.items_per_second > 0) { - items = - StrCat(" ", HumanReadableNumber(result.items_per_second), " items/s"); - } - - const double real_time = result.GetAdjustedRealTime(); - const double cpu_time = result.GetAdjustedCPUTime(); - - if (result.report_big_o) { - std::string big_o = GetBigOString(result.complexity); - printer(Out, COLOR_YELLOW, "%10.2f %s %10.2f %s ", real_time, big_o.c_str(), - cpu_time, big_o.c_str()); - } else if (result.report_rms) { - printer(Out, COLOR_YELLOW, "%10.0f %% %10.0f %% ", real_time * 100, - cpu_time * 100); - } else { - const char* timeLabel = GetTimeUnitString(result.time_unit); - printer(Out, COLOR_YELLOW, "%10.0f %s %10.0f %s ", real_time, timeLabel, - cpu_time, timeLabel); - } - - if (!result.report_big_o && !result.report_rms) { - printer(Out, COLOR_CYAN, "%10lld", result.iterations); - } - - for (auto& c : result.counters) { - auto const& s = HumanReadableNumber(c.second.value); - if (output_options_ & OO_Tabular) { - if (c.second.flags & Counter::kIsRate) { - printer(Out, COLOR_DEFAULT, " %8s/s", s.c_str()); - } else { - printer(Out, COLOR_DEFAULT, " %10s", s.c_str()); - } - } else { - const char* unit = (c.second.flags & Counter::kIsRate) ? "/s" : ""; - printer(Out, COLOR_DEFAULT, " %s=%s%s", c.first.c_str(), s.c_str(), - unit); - } - } - - if (!rate.empty()) { - printer(Out, COLOR_DEFAULT, " %*s", 13, rate.c_str()); - } - - if (!items.empty()) { - printer(Out, COLOR_DEFAULT, " %*s", 18, items.c_str()); - } - - if (!result.report_label.empty()) { - printer(Out, COLOR_DEFAULT, " %s", result.report_label.c_str()); - } - - printer(Out, COLOR_DEFAULT, "\n"); -} - -} // end namespace benchmark diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/counter.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/counter.cc deleted file mode 100644 index ed1aa044ee7..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/counter.cc +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
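For context on the console reporter removed above: PrintHeader and PrintRunData together emit one fixed-width row per run — name in green, times in yellow, iteration count in cyan when color is enabled. An illustrative rendering with made-up numbers, not captured output:

```
Benchmark                  Time           CPU Iterations
--------------------------------------------------------
BM_example/64             51 ns         50 ns   13684896
```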
- -#include "counter.h" - -namespace benchmark { -namespace internal { - -double Finish(Counter const& c, double cpu_time, double num_threads) { - double v = c.value; - if (c.flags & Counter::kIsRate) { - v /= cpu_time; - } - if (c.flags & Counter::kAvgThreads) { - v /= num_threads; - } - return v; -} - -void Finish(UserCounters *l, double cpu_time, double num_threads) { - for (auto &c : *l) { - c.second.value = Finish(c.second, cpu_time, num_threads); - } -} - -void Increment(UserCounters *l, UserCounters const& r) { - // add counters present in both or just in *l - for (auto &c : *l) { - auto it = r.find(c.first); - if (it != r.end()) { - c.second.value = c.second + it->second; - } - } - // add counters present in r, but not in *l - for (auto const &tc : r) { - auto it = l->find(tc.first); - if (it == l->end()) { - (*l)[tc.first] = tc.second; - } - } -} - -bool SameNames(UserCounters const& l, UserCounters const& r) { - if (&l == &r) return true; - if (l.size() != r.size()) { - return false; - } - for (auto const& c : l) { - if (r.find(c.first) == r.end()) { - return false; - } - } - return true; -} - -} // end namespace internal -} // end namespace benchmark diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/counter.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/counter.h deleted file mode 100644 index dd6865a31d7..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/counter.h +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "benchmark/benchmark.h" - -namespace benchmark { - -// these counter-related functions are hidden to reduce API surface. -namespace internal { -void Finish(UserCounters *l, double time, double num_threads); -void Increment(UserCounters *l, UserCounters const& r); -bool SameNames(UserCounters const& l, UserCounters const& r); -} // end namespace internal - -} //end namespace benchmark diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/csv_reporter.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/csv_reporter.cc deleted file mode 100644 index 35510645b08..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/csv_reporter.cc +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "benchmark/benchmark.h" -#include "complexity.h" - -#include -#include -#include -#include -#include -#include - -#include "string_util.h" -#include "timers.h" -#include "check.h" - -// File format reference: http://edoceo.com/utilitas/csv-file-format. - -namespace benchmark { - -namespace { -std::vector elements = { - "name", "iterations", "real_time", "cpu_time", - "time_unit", "bytes_per_second", "items_per_second", "label", - "error_occurred", "error_message"}; -} // namespace - -bool CSVReporter::ReportContext(const Context& context) { - PrintBasicContext(&GetErrorStream(), context); - return true; -} - -void CSVReporter::ReportRuns(const std::vector & reports) { - std::ostream& Out = GetOutputStream(); - - if (!printed_header_) { - // save the names of all the user counters - for (const auto& run : reports) { - for (const auto& cnt : run.counters) { - user_counter_names_.insert(cnt.first); - } - } - - // print the header - for (auto B = elements.begin(); B != elements.end();) { - Out << *B++; - if (B != elements.end()) Out << ","; - } - for (auto B = user_counter_names_.begin(); B != user_counter_names_.end();) { - Out << ",\"" << *B++ << "\""; - } - Out << "\n"; - - printed_header_ = true; - } else { - // check that all the current counters are saved in the name set - for (const auto& run : reports) { - for (const auto& cnt : run.counters) { - CHECK(user_counter_names_.find(cnt.first) != user_counter_names_.end()) - << "All counters must be present in each run. " - << "Counter named \"" << cnt.first - << "\" was not in a run after being added to the header"; - } - } - } - - // print results for each run - for (const auto& run : reports) { - PrintRunData(run); - } - -} - -void CSVReporter::PrintRunData(const Run & run) { - std::ostream& Out = GetOutputStream(); - - // Field with embedded double-quote characters must be doubled and the field - // delimited with double-quotes. - std::string name = run.benchmark_name; - ReplaceAll(&name, "\"", "\"\""); - Out << '"' << name << "\","; - if (run.error_occurred) { - Out << std::string(elements.size() - 3, ','); - Out << "true,"; - std::string msg = run.error_message; - ReplaceAll(&msg, "\"", "\"\""); - Out << '"' << msg << "\"\n"; - return; - } - - // Do not print iteration on bigO and RMS report - if (!run.report_big_o && !run.report_rms) { - Out << run.iterations; - } - Out << ","; - - Out << run.GetAdjustedRealTime() << ","; - Out << run.GetAdjustedCPUTime() << ","; - - // Do not print timeLabel on bigO and RMS report - if (run.report_big_o) { - Out << GetBigOString(run.complexity); - } else if (!run.report_rms) { - Out << GetTimeUnitString(run.time_unit); - } - Out << ","; - - if (run.bytes_per_second > 0.0) { - Out << run.bytes_per_second; - } - Out << ","; - if (run.items_per_second > 0.0) { - Out << run.items_per_second; - } - Out << ","; - if (!run.report_label.empty()) { - // Field with embedded double-quote characters must be doubled and the field - // delimited with double-quotes. 
- std::string label = run.report_label; - ReplaceAll(&label, "\"", "\"\""); - Out << "\"" << label << "\""; - } - Out << ",,"; // for error_occurred and error_message - - // Print user counters - for (const auto &ucn : user_counter_names_) { - auto it = run.counters.find(ucn); - if(it == run.counters.end()) { - Out << ","; - } else { - Out << "," << it->second; - } - } - Out << '\n'; -} - -} // end namespace benchmark diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/cycleclock.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/cycleclock.h deleted file mode 100644 index 4251fe4c32a..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/cycleclock.h +++ /dev/null @@ -1,172 +0,0 @@ -// ---------------------------------------------------------------------- -// CycleClock -// A CycleClock tells you the current time in Cycles. The "time" -// is actually time since power-on. This is like time() but doesn't -// involve a system call and is much more precise. -// -// NOTE: Not all cpu/platform/kernel combinations guarantee that this -// clock increments at a constant rate or is synchronized across all logical -// cpus in a system. -// -// If you need the above guarantees, please consider using a different -// API. There are efforts to provide an interface which provides a millisecond -// granularity and implemented as a memory read. A memory read is generally -// cheaper than the CycleClock for many architectures. -// -// Also, in some out of order CPU implementations, the CycleClock is not -// serializing. So if you're trying to count at cycles granularity, your -// data might be inaccurate due to out of order instruction execution. -// ---------------------------------------------------------------------- - -#ifndef BENCHMARK_CYCLECLOCK_H_ -#define BENCHMARK_CYCLECLOCK_H_ - -#include - -#include "benchmark/benchmark.h" -#include "internal_macros.h" - -#if defined(BENCHMARK_OS_MACOSX) -#include -#endif -// For MSVC, we want to use '_asm rdtsc' when possible (since it works -// with even ancient MSVC compilers), and when not possible the -// __rdtsc intrinsic, declared in . Unfortunately, in some -// environments, and have conflicting -// declarations of some other intrinsics, breaking compilation. -// Therefore, we simply declare __rdtsc ourselves. See also -// http://connect.microsoft.com/VisualStudio/feedback/details/262047 -#if defined(COMPILER_MSVC) && !defined(_M_IX86) -extern "C" uint64_t __rdtsc(); -#pragma intrinsic(__rdtsc) -#endif - -#ifndef BENCHMARK_OS_WINDOWS -#include -#include -#endif - -#ifdef BENCHMARK_OS_EMSCRIPTEN -#include -#endif - -namespace benchmark { -// NOTE: only i386 and x86_64 have been well tested. -// PPC, sparc, alpha, and ia64 are based on -// http://peter.kuscsik.com/wordpress/?p=14 -// with modifications by m3b. See also -// https://setisvn.ssl.berkeley.edu/svn/lib/fftw-3.0.1/kernel/cycle.h -namespace cycleclock { -// This should return the number of cycles since power-on. Thread-safe. -inline BENCHMARK_ALWAYS_INLINE int64_t Now() { -#if defined(BENCHMARK_OS_MACOSX) - // this goes at the top because we need ALL Macs, regardless of - // architecture, to return the number of "mach time units" that - // have passed since startup. See sysinfo.cc where - // InitializeSystemInfo() sets the supposed cpu clock frequency of - // macs to the number of mach time units per second, not actual - // CPU clock frequency (which can change in the face of CPU - // frequency scaling). 
Also note that when the Mac sleeps, this - // counter pauses; it does not continue counting, nor does it - // reset to zero. - return mach_absolute_time(); -#elif defined(BENCHMARK_OS_EMSCRIPTEN) - // this goes above x86-specific code because old versions of Emscripten - // define __x86_64__, although they have nothing to do with it. - return static_cast(emscripten_get_now() * 1e+6); -#elif defined(__i386__) - int64_t ret; - __asm__ volatile("rdtsc" : "=A"(ret)); - return ret; -#elif defined(__x86_64__) || defined(__amd64__) - uint64_t low, high; - __asm__ volatile("rdtsc" : "=a"(low), "=d"(high)); - return (high << 32) | low; -#elif defined(__powerpc__) || defined(__ppc__) - // This returns a time-base, which is not always precisely a cycle-count. - int64_t tbl, tbu0, tbu1; - asm("mftbu %0" : "=r"(tbu0)); - asm("mftb %0" : "=r"(tbl)); - asm("mftbu %0" : "=r"(tbu1)); - tbl &= -static_cast(tbu0 == tbu1); - // high 32 bits in tbu1; low 32 bits in tbl (tbu0 is garbage) - return (tbu1 << 32) | tbl; -#elif defined(__sparc__) - int64_t tick; - asm(".byte 0x83, 0x41, 0x00, 0x00"); - asm("mov %%g1, %0" : "=r"(tick)); - return tick; -#elif defined(__ia64__) - int64_t itc; - asm("mov %0 = ar.itc" : "=r"(itc)); - return itc; -#elif defined(COMPILER_MSVC) && defined(_M_IX86) - // Older MSVC compilers (like 7.x) don't seem to support the - // __rdtsc intrinsic properly, so I prefer to use _asm instead - // when I know it will work. Otherwise, I'll use __rdtsc and hope - // the code is being compiled with a non-ancient compiler. - _asm rdtsc -#elif defined(COMPILER_MSVC) - return __rdtsc(); -#elif defined(BENCHMARK_OS_NACL) - // Native Client validator on x86/x86-64 allows RDTSC instructions, - // and this case is handled above. Native Client validator on ARM - // rejects MRC instructions (used in the ARM-specific sequence below), - // so we handle it here. Portable Native Client compiles to - // architecture-agnostic bytecode, which doesn't provide any - // cycle counter access mnemonics. - - // Native Client does not provide any API to access cycle counter. - // Use clock_gettime(CLOCK_MONOTONIC, ...) instead of gettimeofday - // because is provides nanosecond resolution (which is noticable at - // least for PNaCl modules running on x86 Mac & Linux). - // Initialize to always return 0 if clock_gettime fails. - struct timespec ts = { 0, 0 }; - clock_gettime(CLOCK_MONOTONIC, &ts); - return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; -#elif defined(__aarch64__) - // System timer of ARMv8 runs at a different frequency than the CPU's. - // The frequency is fixed, typically in the range 1-50MHz. It can be - // read at CNTFRQ special register. We assume the OS has set up - // the virtual timer properly. - int64_t virtual_timer_value; - asm volatile("mrs %0, cntvct_el0" : "=r"(virtual_timer_value)); - return virtual_timer_value; -#elif defined(__ARM_ARCH) - // V6 is the earliest arch that has a standard cyclecount - // Native Client validator doesn't allow MRC instructions. -#if (__ARM_ARCH >= 6) - uint32_t pmccntr; - uint32_t pmuseren; - uint32_t pmcntenset; - // Read the user mode perf monitor counter access permissions. - asm volatile("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren)); - if (pmuseren & 1) { // Allows reading perfmon counters for user mode code. - asm volatile("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset)); - if (pmcntenset & 0x80000000ul) { // Is it counting? 
- asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr)); - // The counter is set up to count every 64th cycle - return static_cast(pmccntr) * 64; // Should optimize to << 6 - } - } -#endif - struct timeval tv; - gettimeofday(&tv, nullptr); - return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; -#elif defined(__mips__) - // mips apparently only allows rdtsc for superusers, so we fall - // back to gettimeofday. It's possible clock_gettime would be better. - struct timeval tv; - gettimeofday(&tv, nullptr); - return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; -#else -// The soft failover to a generic implementation is automatic only for ARM. -// For other platforms the developer is expected to make an attempt to create -// a fast implementation and use generic version if nothing better is available. -#error You need to define CycleTimer for your OS and CPU -#endif -} -} // end namespace cycleclock -} // end namespace benchmark - -#endif // BENCHMARK_CYCLECLOCK_H_ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/internal_macros.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/internal_macros.h deleted file mode 100644 index 942887457f1..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/internal_macros.h +++ /dev/null @@ -1,57 +0,0 @@ -#ifndef BENCHMARK_INTERNAL_MACROS_H_ -#define BENCHMARK_INTERNAL_MACROS_H_ - -#include "benchmark/benchmark.h" - -#ifndef __has_feature -#define __has_feature(x) 0 -#endif - -#if defined(__clang__) -#define COMPILER_CLANG -#elif defined(_MSC_VER) -#define COMPILER_MSVC -#elif defined(__GNUC__) -#define COMPILER_GCC -#endif - -#if __has_feature(cxx_attributes) -#define BENCHMARK_NORETURN [[noreturn]] -#elif defined(__GNUC__) -#define BENCHMARK_NORETURN __attribute__((noreturn)) -#elif defined(COMPILER_MSVC) -#define BENCHMARK_NORETURN __declspec(noreturn) -#else -#define BENCHMARK_NORETURN -#endif - -#if defined(__CYGWIN__) -#define BENCHMARK_OS_CYGWIN 1 -#elif defined(_WIN32) -#define BENCHMARK_OS_WINDOWS 1 -#elif defined(__APPLE__) -#include "TargetConditionals.h" - #if defined(TARGET_OS_MAC) - #define BENCHMARK_OS_MACOSX 1 - #if defined(TARGET_OS_IPHONE) - #define BENCHMARK_OS_IOS 1 - #endif - #endif -#elif defined(__FreeBSD__) -#define BENCHMARK_OS_FREEBSD 1 -#elif defined(__linux__) -#define BENCHMARK_OS_LINUX 1 -#elif defined(__native_client__) -#define BENCHMARK_OS_NACL 1 -#elif defined(EMSCRIPTEN) -#define BENCHMARK_OS_EMSCRIPTEN 1 -#elif defined(__rtems__) -#define BENCHMARK_OS_RTEMS 1 -#endif - -#if !__has_feature(cxx_exceptions) && !defined(__cpp_exceptions) \ - && !defined(__EXCEPTIONS) -#define BENCHMARK_HAS_NO_EXCEPTIONS -#endif - -#endif // BENCHMARK_INTERNAL_MACROS_H_ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/json_reporter.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/json_reporter.cc deleted file mode 100644 index edf6ecc836c..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/json_reporter.cc +++ /dev/null @@ -1,168 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "benchmark/benchmark.h" -#include "complexity.h" - -#include -#include -#include -#include -#include -#include - -#include "string_util.h" -#include "timers.h" - -namespace benchmark { - -namespace { - -std::string FormatKV(std::string const& key, std::string const& value) { - return StringPrintF("\"%s\": \"%s\"", key.c_str(), value.c_str()); -} - -std::string FormatKV(std::string const& key, const char* value) { - return StringPrintF("\"%s\": \"%s\"", key.c_str(), value); -} - -std::string FormatKV(std::string const& key, bool value) { - return StringPrintF("\"%s\": %s", key.c_str(), value ? "true" : "false"); -} - -std::string FormatKV(std::string const& key, int64_t value) { - std::stringstream ss; - ss << '"' << key << "\": " << value; - return ss.str(); -} - -std::string FormatKV(std::string const& key, double value) { - return StringPrintF("\"%s\": %.2f", key.c_str(), value); -} - -int64_t RoundDouble(double v) { return static_cast(v + 0.5); } - -} // end namespace - -bool JSONReporter::ReportContext(const Context& context) { - std::ostream& out = GetOutputStream(); - - out << "{\n"; - std::string inner_indent(2, ' '); - - // Open context block and print context information. - out << inner_indent << "\"context\": {\n"; - std::string indent(4, ' '); - - std::string walltime_value = LocalDateTimeString(); - out << indent << FormatKV("date", walltime_value) << ",\n"; - - out << indent << FormatKV("num_cpus", static_cast(context.num_cpus)) - << ",\n"; - out << indent << FormatKV("mhz_per_cpu", RoundDouble(context.mhz_per_cpu)) - << ",\n"; - out << indent << FormatKV("cpu_scaling_enabled", context.cpu_scaling_enabled) - << ",\n"; - -#if defined(NDEBUG) - const char build_type[] = "release"; -#else - const char build_type[] = "debug"; -#endif - out << indent << FormatKV("library_build_type", build_type) << "\n"; - // Close context block and open the list of benchmarks. - out << inner_indent << "},\n"; - out << inner_indent << "\"benchmarks\": [\n"; - return true; -} - -void JSONReporter::ReportRuns(std::vector const& reports) { - if (reports.empty()) { - return; - } - std::string indent(4, ' '); - std::ostream& out = GetOutputStream(); - if (!first_report_) { - out << ",\n"; - } - first_report_ = false; - - for (auto it = reports.begin(); it != reports.end(); ++it) { - out << indent << "{\n"; - PrintRunData(*it); - out << indent << '}'; - auto it_cp = it; - if (++it_cp != reports.end()) { - out << ",\n"; - } - } -} - -void JSONReporter::Finalize() { - // Close the list of benchmarks and the top level object. 
- GetOutputStream() << "\n ]\n}\n"; -} - -void JSONReporter::PrintRunData(Run const& run) { - std::string indent(6, ' '); - std::ostream& out = GetOutputStream(); - out << indent << FormatKV("name", run.benchmark_name) << ",\n"; - if (run.error_occurred) { - out << indent << FormatKV("error_occurred", run.error_occurred) << ",\n"; - out << indent << FormatKV("error_message", run.error_message) << ",\n"; - } - if (!run.report_big_o && !run.report_rms) { - out << indent << FormatKV("iterations", run.iterations) << ",\n"; - out << indent - << FormatKV("real_time", RoundDouble(run.GetAdjustedRealTime())) - << ",\n"; - out << indent - << FormatKV("cpu_time", RoundDouble(run.GetAdjustedCPUTime())); - out << ",\n" - << indent << FormatKV("time_unit", GetTimeUnitString(run.time_unit)); - } else if (run.report_big_o) { - out << indent - << FormatKV("cpu_coefficient", RoundDouble(run.GetAdjustedCPUTime())) - << ",\n"; - out << indent - << FormatKV("real_coefficient", RoundDouble(run.GetAdjustedRealTime())) - << ",\n"; - out << indent << FormatKV("big_o", GetBigOString(run.complexity)) << ",\n"; - out << indent << FormatKV("time_unit", GetTimeUnitString(run.time_unit)); - } else if (run.report_rms) { - out << indent - << FormatKV("rms", run.GetAdjustedCPUTime()); - } - if (run.bytes_per_second > 0.0) { - out << ",\n" - << indent - << FormatKV("bytes_per_second", RoundDouble(run.bytes_per_second)); - } - if (run.items_per_second > 0.0) { - out << ",\n" - << indent - << FormatKV("items_per_second", RoundDouble(run.items_per_second)); - } - for(auto &c : run.counters) { - out << ",\n" - << indent - << FormatKV(c.first, RoundDouble(c.second)); - } - if (!run.report_label.empty()) { - out << ",\n" << indent << FormatKV("label", run.report_label); - } - out << '\n'; -} - -} // end namespace benchmark diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/log.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/log.h deleted file mode 100644 index d06e1031db1..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/log.h +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef BENCHMARK_LOG_H_ -#define BENCHMARK_LOG_H_ - -#include -#include - -#include "benchmark/benchmark.h" - -namespace benchmark { -namespace internal { - -typedef std::basic_ostream&(EndLType)(std::basic_ostream&); - -class LogType { - friend LogType& GetNullLogInstance(); - friend LogType& GetErrorLogInstance(); - - // FIXME: Add locking to output. 
- template <class Tp>
- friend LogType& operator<<(LogType&, Tp const&);
- friend LogType& operator<<(LogType&, EndLType*);
-
- private:
- LogType(std::ostream* out) : out_(out) {}
- std::ostream* out_;
- BENCHMARK_DISALLOW_COPY_AND_ASSIGN(LogType);
-};
-
-template <class Tp>
-LogType& operator<<(LogType& log, Tp const& value) {
- if (log.out_) {
- *log.out_ << value;
- }
- return log;
-}
-
-inline LogType& operator<<(LogType& log, EndLType* m) {
- if (log.out_) {
- *log.out_ << m;
- }
- return log;
-}
-
-inline int& LogLevel() {
- static int log_level = 0;
- return log_level;
-}
-
-inline LogType& GetNullLogInstance() {
- static LogType log(nullptr);
- return log;
-}
-
-inline LogType& GetErrorLogInstance() {
- static LogType log(&std::clog);
- return log;
-}
-
-inline LogType& GetLogInstanceForLevel(int level) {
- if (level <= LogLevel()) {
- return GetErrorLogInstance();
- }
- return GetNullLogInstance();
-}
-
-} // end namespace internal
-} // end namespace benchmark
-
-#define VLOG(x) \
- (::benchmark::internal::GetLogInstanceForLevel(x) << "-- LOG(" << x << "):" \
- " ")
-
-#endif
diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/mutex.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/mutex.h
deleted file mode 100644
index 5f461d05a0c..00000000000
--- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/mutex.h
+++ /dev/null
@@ -1,155 +0,0 @@
-#ifndef BENCHMARK_MUTEX_H_
-#define BENCHMARK_MUTEX_H_
-
-#include <condition_variable>
-#include <mutex>
-
-#include "check.h"
-
-// Enable thread safety attributes only with clang.
-// The attributes can be safely erased when compiling with other compilers.
-#if defined(HAVE_THREAD_SAFETY_ATTRIBUTES)
-#define THREAD_ANNOTATION_ATTRIBUTE__(x) __attribute__((x))
-#else
-#define THREAD_ANNOTATION_ATTRIBUTE__(x) // no-op
-#endif
-
-#define CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(capability(x))
-
-#define SCOPED_CAPABILITY THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable)
-
-#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x))
-
-#define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded_by(x))
-
-#define ACQUIRED_BEFORE(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(__VA_ARGS__))
-
-#define ACQUIRED_AFTER(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(acquired_after(__VA_ARGS__))
-
-#define REQUIRES(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(requires_capability(__VA_ARGS__))
-
-#define REQUIRES_SHARED(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(requires_shared_capability(__VA_ARGS__))
-
-#define ACQUIRE(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(acquire_capability(__VA_ARGS__))
-
-#define ACQUIRE_SHARED(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(acquire_shared_capability(__VA_ARGS__))
-
-#define RELEASE(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(release_capability(__VA_ARGS__))
-
-#define RELEASE_SHARED(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(release_shared_capability(__VA_ARGS__))
-
-#define TRY_ACQUIRE(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_capability(__VA_ARGS__))
-
-#define TRY_ACQUIRE_SHARED(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_shared_capability(__VA_ARGS__))
-
-#define EXCLUDES(...) \
THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(__VA_ARGS__)) - -#define ASSERT_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(assert_capability(x)) - -#define ASSERT_SHARED_CAPABILITY(x) \ - THREAD_ANNOTATION_ATTRIBUTE__(assert_shared_capability(x)) - -#define RETURN_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x)) - -#define NO_THREAD_SAFETY_ANALYSIS \ - THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis) - -namespace benchmark { - -typedef std::condition_variable Condition; - -// NOTE: Wrappers for std::mutex and std::unique_lock are provided so that -// we can annotate them with thread safety attributes and use the -// -Wthread-safety warning with clang. The standard library types cannot be -// used directly because they do not provided the required annotations. -class CAPABILITY("mutex") Mutex { - public: - Mutex() {} - - void lock() ACQUIRE() { mut_.lock(); } - void unlock() RELEASE() { mut_.unlock(); } - std::mutex& native_handle() { return mut_; } - - private: - std::mutex mut_; -}; - -class SCOPED_CAPABILITY MutexLock { - typedef std::unique_lock MutexLockImp; - - public: - MutexLock(Mutex& m) ACQUIRE(m) : ml_(m.native_handle()) {} - ~MutexLock() RELEASE() {} - MutexLockImp& native_handle() { return ml_; } - - private: - MutexLockImp ml_; -}; - -class Barrier { - public: - Barrier(int num_threads) : running_threads_(num_threads) {} - - // Called by each thread - bool wait() EXCLUDES(lock_) { - bool last_thread = false; - { - MutexLock ml(lock_); - last_thread = createBarrier(ml); - } - if (last_thread) phase_condition_.notify_all(); - return last_thread; - } - - void removeThread() EXCLUDES(lock_) { - MutexLock ml(lock_); - --running_threads_; - if (entered_ != 0) phase_condition_.notify_all(); - } - - private: - Mutex lock_; - Condition phase_condition_; - int running_threads_; - - // State for barrier management - int phase_number_ = 0; - int entered_ = 0; // Number of threads that have entered this barrier - - // Enter the barrier and wait until all other threads have also - // entered the barrier. Returns iff this is the last thread to - // enter the barrier. - bool createBarrier(MutexLock& ml) REQUIRES(lock_) { - CHECK_LT(entered_, running_threads_); - entered_++; - if (entered_ < running_threads_) { - // Wait for all threads to enter - int phase_number_cp = phase_number_; - auto cb = [this, phase_number_cp]() { - return this->phase_number_ > phase_number_cp || - entered_ == running_threads_; // A thread has aborted in error - }; - phase_condition_.wait(ml.native_handle(), cb); - if (phase_number_ > phase_number_cp) return false; - // else (running_threads_ == entered_) and we are the last thread. - } - // Last thread has reached the barrier - phase_number_++; - entered_ = 0; - return true; - } -}; - -} // end namespace benchmark - -#endif // BENCHMARK_MUTEX_H_ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/re.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/re.h deleted file mode 100644 index 01e9736505e..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/re.h +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef BENCHMARK_RE_H_ -#define BENCHMARK_RE_H_ - -#include "internal_macros.h" - -// Prefer C regex libraries when compiling w/o exceptions so that we can -// correctly report errors. -#if defined(BENCHMARK_HAS_NO_EXCEPTIONS) && defined(HAVE_STD_REGEX) && \ - (defined(HAVE_GNU_POSIX_REGEX) || defined(HAVE_POSIX_REGEX)) -#undef HAVE_STD_REGEX -#endif - -#if defined(HAVE_STD_REGEX) -#include -#elif defined(HAVE_GNU_POSIX_REGEX) -#include -#elif defined(HAVE_POSIX_REGEX) -#include -#else -#error No regular expression backend was found! -#endif -#include - -#include "check.h" - -namespace benchmark { - -// A wrapper around the POSIX regular expression API that provides automatic -// cleanup -class Regex { - public: - Regex() : init_(false) {} - - ~Regex(); - - // Compile a regular expression matcher from spec. Returns true on success. - // - // On failure (and if error is not nullptr), error is populated with a human - // readable error message if an error occurs. - bool Init(const std::string& spec, std::string* error); - - // Returns whether str matches the compiled regular expression. - bool Match(const std::string& str); - - private: - bool init_; -// Underlying regular expression object -#if defined(HAVE_STD_REGEX) - std::regex re_; -#elif defined(HAVE_POSIX_REGEX) || defined(HAVE_GNU_POSIX_REGEX) - regex_t re_; -#else -#error No regular expression backend implementation available -#endif -}; - -#if defined(HAVE_STD_REGEX) - -inline bool Regex::Init(const std::string& spec, std::string* error) { -#ifdef BENCHMARK_HAS_NO_EXCEPTIONS - ((void)error); // suppress unused warning -#else - try { -#endif - re_ = std::regex(spec, std::regex_constants::extended); - init_ = true; -#ifndef BENCHMARK_HAS_NO_EXCEPTIONS - } catch (const std::regex_error& e) { - if (error) { - *error = e.what(); - } - } -#endif - return init_; -} - -inline Regex::~Regex() {} - -inline bool Regex::Match(const std::string& str) { - if (!init_) { - return false; - } - return std::regex_search(str, re_); -} - -#else -inline bool Regex::Init(const std::string& spec, std::string* error) { - int ec = regcomp(&re_, spec.c_str(), REG_EXTENDED | REG_NOSUB); - if (ec != 0) { - if (error) { - size_t needed = regerror(ec, &re_, nullptr, 0); - char* errbuf = new char[needed]; - regerror(ec, &re_, errbuf, needed); - - // regerror returns the number of bytes necessary to null terminate - // the string, so we move that when assigning to error. 
- CHECK_NE(needed, 0); - error->assign(errbuf, needed - 1); - - delete[] errbuf; - } - - return false; - } - - init_ = true; - return true; -} - -inline Regex::~Regex() { - if (init_) { - regfree(&re_); - } -} - -inline bool Regex::Match(const std::string& str) { - if (!init_) { - return false; - } - return regexec(&re_, str.c_str(), 0, nullptr, 0) == 0; -} -#endif - -} // end namespace benchmark - -#endif // BENCHMARK_RE_H_ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/reporter.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/reporter.cc deleted file mode 100644 index aacd453179d..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/reporter.cc +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "benchmark/benchmark.h" -#include "timers.h" - -#include - -#include -#include -#include - -#include "check.h" -#include "stat.h" - -namespace benchmark { - -BenchmarkReporter::BenchmarkReporter() - : output_stream_(&std::cout), error_stream_(&std::cerr) {} - -BenchmarkReporter::~BenchmarkReporter() {} - -void BenchmarkReporter::PrintBasicContext(std::ostream *out, - Context const &context) { - CHECK(out) << "cannot be null"; - auto &Out = *out; - - Out << "Run on (" << context.num_cpus << " X " << context.mhz_per_cpu - << " MHz CPU " << ((context.num_cpus > 1) ? "s" : "") << ")\n"; - - Out << LocalDateTimeString() << "\n"; - - if (context.cpu_scaling_enabled) { - Out << "***WARNING*** CPU scaling is enabled, the benchmark " - "real time measurements may be noisy and will incur extra " - "overhead.\n"; - } - -#ifndef NDEBUG - Out << "***WARNING*** Library was built as DEBUG. Timings may be " - "affected.\n"; -#endif -} - -double BenchmarkReporter::Run::GetAdjustedRealTime() const { - double new_time = real_accumulated_time * GetTimeUnitMultiplier(time_unit); - if (iterations != 0) new_time /= static_cast(iterations); - return new_time; -} - -double BenchmarkReporter::Run::GetAdjustedCPUTime() const { - double new_time = cpu_accumulated_time * GetTimeUnitMultiplier(time_unit); - if (iterations != 0) new_time /= static_cast(iterations); - return new_time; -} - -} // end namespace benchmark diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/sleep.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/sleep.cc deleted file mode 100644 index 54aa04a4224..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/sleep.cc +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "sleep.h" - -#include -#include -#include - -#include "internal_macros.h" - -#ifdef BENCHMARK_OS_WINDOWS -#include -#endif - -namespace benchmark { -#ifdef BENCHMARK_OS_WINDOWS -// Window's Sleep takes milliseconds argument. -void SleepForMilliseconds(int milliseconds) { Sleep(milliseconds); } -void SleepForSeconds(double seconds) { - SleepForMilliseconds(static_cast(kNumMillisPerSecond * seconds)); -} -#else // BENCHMARK_OS_WINDOWS -void SleepForMicroseconds(int microseconds) { - struct timespec sleep_time; - sleep_time.tv_sec = microseconds / kNumMicrosPerSecond; - sleep_time.tv_nsec = (microseconds % kNumMicrosPerSecond) * kNumNanosPerMicro; - while (nanosleep(&sleep_time, &sleep_time) != 0 && errno == EINTR) - ; // Ignore signals and wait for the full interval to elapse. -} - -void SleepForMilliseconds(int milliseconds) { - SleepForMicroseconds(milliseconds * kNumMicrosPerMilli); -} - -void SleepForSeconds(double seconds) { - SleepForMicroseconds(static_cast(seconds * kNumMicrosPerSecond)); -} -#endif // BENCHMARK_OS_WINDOWS -} // end namespace benchmark diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/sleep.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/sleep.h deleted file mode 100644 index f98551afe28..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/sleep.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef BENCHMARK_SLEEP_H_ -#define BENCHMARK_SLEEP_H_ - -namespace benchmark { -const int kNumMillisPerSecond = 1000; -const int kNumMicrosPerMilli = 1000; -const int kNumMicrosPerSecond = kNumMillisPerSecond * 1000; -const int kNumNanosPerMicro = 1000; -const int kNumNanosPerSecond = kNumNanosPerMicro * kNumMicrosPerSecond; - -void SleepForMilliseconds(int milliseconds); -void SleepForSeconds(double seconds); -} // end namespace benchmark - -#endif // BENCHMARK_SLEEP_H_ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/stat.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/stat.h deleted file mode 100644 index d356875b632..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/stat.h +++ /dev/null @@ -1,310 +0,0 @@ -#ifndef BENCHMARK_STAT_H_ -#define BENCHMARK_STAT_H_ - -#include -#include -#include -#include - -namespace benchmark { - -template -class Stat1; - -template -class Stat1MinMax; - -typedef Stat1 Stat1_f; -typedef Stat1 Stat1_d; -typedef Stat1MinMax Stat1MinMax_f; -typedef Stat1MinMax Stat1MinMax_d; - -template -class Vector2; -template -class Vector3; -template -class Vector4; - -template -class Stat1 { - public: - typedef Stat1 Self; - - Stat1() { Clear(); } - // Create a sample of value dat and weight 1 - explicit Stat1(const VType &dat) { - sum_ = dat; - sum_squares_ = Sqr(dat); - numsamples_ = 1; - } - // Create statistics for all the samples between begin (included) - // and end(excluded) - explicit Stat1(const VType *begin, const VType *end) { - Clear(); - for (const VType *item = begin; item < end; ++item) { - (*this) += Stat1(*item); - } - } - // Create a sample of value dat and weight w - Stat1(const 
VType &dat, const NumType &w) { - sum_ = w * dat; - sum_squares_ = w * Sqr(dat); - numsamples_ = w; - } - // Copy operator - Stat1(const Self &stat) { - sum_ = stat.sum_; - sum_squares_ = stat.sum_squares_; - numsamples_ = stat.numsamples_; - } - - void Clear() { - numsamples_ = NumType(); - sum_squares_ = sum_ = VType(); - } - - Self &operator=(const Self &stat) { - sum_ = stat.sum_; - sum_squares_ = stat.sum_squares_; - numsamples_ = stat.numsamples_; - return (*this); - } - // Merge statistics from two sample sets. - Self &operator+=(const Self &stat) { - sum_ += stat.sum_; - sum_squares_ += stat.sum_squares_; - numsamples_ += stat.numsamples_; - return (*this); - } - // The operation opposite to += - Self &operator-=(const Self &stat) { - sum_ -= stat.sum_; - sum_squares_ -= stat.sum_squares_; - numsamples_ -= stat.numsamples_; - return (*this); - } - // Multiply the weight of the set of samples by a factor k - Self &operator*=(const VType &k) { - sum_ *= k; - sum_squares_ *= k; - numsamples_ *= k; - return (*this); - } - - // Merge statistics from two sample sets. - Self operator+(const Self &stat) const { return Self(*this) += stat; } - - // The operation opposite to + - Self operator-(const Self &stat) const { return Self(*this) -= stat; } - - // Multiply the weight of the set of samples by a factor k - Self operator*(const VType &k) const { return Self(*this) *= k; } - - // Return the total weight of this sample set - NumType numSamples() const { return numsamples_; } - - // Return the sum of this sample set - VType Sum() const { return sum_; } - - // Return the mean of this sample set - VType Mean() const { - if (numsamples_ == 0) return VType(); - return sum_ * (1.0 / numsamples_); - } - - // Return the mean of this sample set and compute the standard deviation at - // the same time. - VType Mean(VType *stddev) const { - if (numsamples_ == 0) return VType(); - VType mean = sum_ * (1.0 / numsamples_); - if (stddev) { - // Sample standard deviation is undefined for n = 1 - if (numsamples_ == 1) { - *stddev = VType(); - } else { - VType avg_squares = sum_squares_ * (1.0 / numsamples_); - *stddev = Sqrt(numsamples_ / (numsamples_ - 1.0) * (avg_squares - Sqr(mean))); - } - } - return mean; - } - - // Return the standard deviation of the sample set - VType StdDev() const { - VType stddev = VType(); - Mean(&stddev); - return stddev; - } - - private: - static_assert(std::is_integral::value && - !std::is_same::value, - "NumType must be an integral type that is not bool."); - // Let i be the index of the samples provided (using +=) - // and weight[i],value[i] be the data of sample #i - // then the variables have the following meaning: - NumType numsamples_; // sum of weight[i]; - VType sum_; // sum of weight[i]*value[i]; - VType sum_squares_; // sum of weight[i]*value[i]^2; - - // Template function used to square a number. - // For a vector we square all components - template - static inline SType Sqr(const SType &dat) { - return dat * dat; - } - - template - static inline Vector2 Sqr(const Vector2 &dat) { - return dat.MulComponents(dat); - } - - template - static inline Vector3 Sqr(const Vector3 &dat) { - return dat.MulComponents(dat); - } - - template - static inline Vector4 Sqr(const Vector4 &dat) { - return dat.MulComponents(dat); - } - - // Template function used to take the square root of a number. 
- // For a vector we square all components - template - static inline SType Sqrt(const SType &dat) { - // Avoid NaN due to imprecision in the calculations - if (dat < 0) return 0; - return sqrt(dat); - } - - template - static inline Vector2 Sqrt(const Vector2 &dat) { - // Avoid NaN due to imprecision in the calculations - return Max(dat, Vector2()).Sqrt(); - } - - template - static inline Vector3 Sqrt(const Vector3 &dat) { - // Avoid NaN due to imprecision in the calculations - return Max(dat, Vector3()).Sqrt(); - } - - template - static inline Vector4 Sqrt(const Vector4 &dat) { - // Avoid NaN due to imprecision in the calculations - return Max(dat, Vector4()).Sqrt(); - } -}; - -// Useful printing function -template -std::ostream &operator<<(std::ostream &out, const Stat1 &s) { - out << "{ avg = " << s.Mean() << " std = " << s.StdDev() - << " nsamples = " << s.NumSamples() << "}"; - return out; -} - -// Stat1MinMax: same as Stat1, but it also -// keeps the Min and Max values; the "-" -// operator is disabled because it cannot be implemented -// efficiently -template -class Stat1MinMax : public Stat1 { - public: - typedef Stat1MinMax Self; - - Stat1MinMax() { Clear(); } - // Create a sample of value dat and weight 1 - explicit Stat1MinMax(const VType &dat) : Stat1(dat) { - max_ = dat; - min_ = dat; - } - // Create statistics for all the samples between begin (included) - // and end(excluded) - explicit Stat1MinMax(const VType *begin, const VType *end) { - Clear(); - for (const VType *item = begin; item < end; ++item) { - (*this) += Stat1MinMax(*item); - } - } - // Create a sample of value dat and weight w - Stat1MinMax(const VType &dat, const NumType &w) - : Stat1(dat, w) { - max_ = dat; - min_ = dat; - } - // Copy operator - Stat1MinMax(const Self &stat) : Stat1(stat) { - max_ = stat.max_; - min_ = stat.min_; - } - - void Clear() { - Stat1::Clear(); - if (std::numeric_limits::has_infinity) { - min_ = std::numeric_limits::infinity(); - max_ = -std::numeric_limits::infinity(); - } else { - min_ = std::numeric_limits::max(); - max_ = std::numeric_limits::min(); - } - } - - Self &operator=(const Self &stat) { - this->Stat1::operator=(stat); - max_ = stat.max_; - min_ = stat.min_; - return (*this); - } - // Merge statistics from two sample sets. - Self &operator+=(const Self &stat) { - this->Stat1::operator+=(stat); - if (stat.max_ > max_) max_ = stat.max_; - if (stat.min_ < min_) min_ = stat.min_; - return (*this); - } - // Multiply the weight of the set of samples by a factor k - Self &operator*=(const VType &stat) { - this->Stat1::operator*=(stat); - return (*this); - } - // Merge statistics from two sample sets. - Self operator+(const Self &stat) const { return Self(*this) += stat; } - // Multiply the weight of the set of samples by a factor k - Self operator*(const VType &k) const { return Self(*this) *= k; } - - // Return the maximal value in this sample set - VType Max() const { return max_; } - // Return the minimal value in this sample set - VType Min() const { return min_; } - - private: - // The - operation makes no sense with Min/Max - // unless we keep the full list of values (but we don't) - // make it private, and let it undefined so nobody can call it - Self &operator-=(const Self &stat); // senseless. let it undefined. - - // The operation opposite to - - Self operator-(const Self &stat) const; // senseless. let it undefined. 
- - // Let i be the index of the samples provided (using +=) - // and weight[i],value[i] be the data of sample #i - // then the variables have the following meaning: - VType max_; // max of value[i] - VType min_; // min of value[i] -}; - -// Useful printing function -template -std::ostream &operator<<(std::ostream &out, - const Stat1MinMax &s) { - out << "{ avg = " << s.Mean() << " std = " << s.StdDev() - << " nsamples = " << s.NumSamples() << " min = " << s.Min() - << " max = " << s.Max() << "}"; - return out; -} -} // end namespace benchmark - -#endif // BENCHMARK_STAT_H_ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/string_util.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/string_util.cc deleted file mode 100644 index cd4e7cfde57..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/string_util.cc +++ /dev/null @@ -1,172 +0,0 @@ -#include "string_util.h" - -#include -#include -#include -#include -#include -#include - -#include "arraysize.h" - -namespace benchmark { -namespace { - -// kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta. -const char kBigSIUnits[] = "kMGTPEZY"; -// Kibi, Mebi, Gibi, Tebi, Pebi, Exbi, Zebi, Yobi. -const char kBigIECUnits[] = "KMGTPEZY"; -// milli, micro, nano, pico, femto, atto, zepto, yocto. -const char kSmallSIUnits[] = "munpfazy"; - -// We require that all three arrays have the same size. -static_assert(arraysize(kBigSIUnits) == arraysize(kBigIECUnits), - "SI and IEC unit arrays must be the same size"); -static_assert(arraysize(kSmallSIUnits) == arraysize(kBigSIUnits), - "Small SI and Big SI unit arrays must be the same size"); - -static const int64_t kUnitsSize = arraysize(kBigSIUnits); - -} // end anonymous namespace - -void ToExponentAndMantissa(double val, double thresh, int precision, - double one_k, std::string* mantissa, - int64_t* exponent) { - std::stringstream mantissa_stream; - - if (val < 0) { - mantissa_stream << "-"; - val = -val; - } - - // Adjust threshold so that it never excludes things which can't be rendered - // in 'precision' digits. - const double adjusted_threshold = - std::max(thresh, 1.0 / std::pow(10.0, precision)); - const double big_threshold = adjusted_threshold * one_k; - const double small_threshold = adjusted_threshold; - // Values in ]simple_threshold,small_threshold[ will be printed as-is - const double simple_threshold = 0.01; - - if (val > big_threshold) { - // Positive powers - double scaled = val; - for (size_t i = 0; i < arraysize(kBigSIUnits); ++i) { - scaled /= one_k; - if (scaled <= big_threshold) { - mantissa_stream << scaled; - *exponent = i + 1; - *mantissa = mantissa_stream.str(); - return; - } - } - mantissa_stream << val; - *exponent = 0; - } else if (val < small_threshold) { - // Negative powers - if (val < simple_threshold) { - double scaled = val; - for (size_t i = 0; i < arraysize(kSmallSIUnits); ++i) { - scaled *= one_k; - if (scaled >= small_threshold) { - mantissa_stream << scaled; - *exponent = -static_cast(i + 1); - *mantissa = mantissa_stream.str(); - return; - } - } - } - mantissa_stream << val; - *exponent = 0; - } else { - mantissa_stream << val; - *exponent = 0; - } - *mantissa = mantissa_stream.str(); -} - -std::string ExponentToPrefix(int64_t exponent, bool iec) { - if (exponent == 0) return ""; - - const int64_t index = (exponent > 0 ? exponent - 1 : -exponent - 1); - if (index >= kUnitsSize) return ""; - - const char* array = - (exponent > 0 ? (iec ? 
kBigIECUnits : kBigSIUnits) : kSmallSIUnits);
- if (iec)
- return array[index] + std::string("i");
- else
- return std::string(1, array[index]);
-}
-
-std::string ToBinaryStringFullySpecified(double value, double threshold,
- int precision) {
- std::string mantissa;
- int64_t exponent;
- ToExponentAndMantissa(value, threshold, precision, 1024.0, &mantissa,
- &exponent);
- return mantissa + ExponentToPrefix(exponent, false);
-}
-
-void AppendHumanReadable(int n, std::string* str) {
- std::stringstream ss;
- // Round down to the nearest SI prefix.
- ss << ToBinaryStringFullySpecified(n, 1.0, 0);
- *str += ss.str();
-}
-
-std::string HumanReadableNumber(double n) {
- // 1.1 means that figures up to 1.1k should be shown with the next unit down;
- // this softens edge effects.
- // 1 means that we should show one decimal place of precision.
- return ToBinaryStringFullySpecified(n, 1.1, 1);
-}
-
-std::string StringPrintFImp(const char* msg, va_list args) {
- // we might need a second shot at this, so pre-emptivly make a copy
- va_list args_cp;
- va_copy(args_cp, args);
-
- // TODO(ericwf): use std::array for first attempt to avoid one memory
- // allocation guess what the size might be
- std::array<char, 256> local_buff;
- std::size_t size = local_buff.size();
- // 2015-10-08: vsnprintf is used instead of snd::vsnprintf due to a limitation
- // in the android-ndk
- auto ret = vsnprintf(local_buff.data(), size, msg, args_cp);
-
- va_end(args_cp);
-
- // handle empty expansion
- if (ret == 0) return std::string{};
- if (static_cast<std::size_t>(ret) < size)
- return std::string(local_buff.data());
-
- // we did not provide a long enough buffer on our first attempt.
- // add 1 to size to account for null-byte in size cast to prevent overflow
- size = static_cast<std::size_t>(ret) + 1;
- auto buff_ptr = std::unique_ptr<char[]>(new char[size]);
- // 2015-10-08: vsnprintf is used instead of snd::vsnprintf due to a limitation
- // in the android-ndk
- ret = vsnprintf(buff_ptr.get(), size, msg, args);
- return std::string(buff_ptr.get());
-}
-
-std::string StringPrintF(const char* format, ...) {
- va_list args;
- va_start(args, format);
- std::string tmp = StringPrintFImp(format, args);
- va_end(args);
- return tmp;
-}
-
-void ReplaceAll(std::string* str, const std::string& from,
- const std::string& to) {
- std::size_t start = 0;
- while ((start = str->find(from, start)) != std::string::npos) {
- str->replace(start, from.length(), to);
- start += to.length();
- }
-}
-
-} // end namespace benchmark
diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/string_util.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/string_util.h
deleted file mode 100644
index 0b190b91a16..00000000000
--- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/string_util.h
+++ /dev/null
@@ -1,40 +0,0 @@
-#ifndef BENCHMARK_STRING_UTIL_H_
-#define BENCHMARK_STRING_UTIL_H_
-
-#include <sstream>
-#include <string>
-#include <utility>
-#include "internal_macros.h"
-
-namespace benchmark {
-
-void AppendHumanReadable(int n, std::string* str);
-
-std::string HumanReadableNumber(double n);
-
-std::string StringPrintF(const char* format, ...);
-
-inline std::ostream& StringCatImp(std::ostream& out) BENCHMARK_NOEXCEPT {
- return out;
-}
-
-template <class First, class... Rest>
-inline std::ostream& StringCatImp(std::ostream& out, First&& f,
- Rest&&... rest) {
- out << std::forward<First>(f);
- return StringCatImp(out, std::forward<Rest>(rest)...);
-}
-
-template <class... Args>
-inline std::string StrCat(Args&&...
args) { - std::ostringstream ss; - StringCatImp(ss, std::forward(args)...); - return ss.str(); -} - -void ReplaceAll(std::string* str, const std::string& from, - const std::string& to); - -} // end namespace benchmark - -#endif // BENCHMARK_STRING_UTIL_H_ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/sysinfo.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/sysinfo.cc deleted file mode 100644 index 7feb79e65f2..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/sysinfo.cc +++ /dev/null @@ -1,355 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "sysinfo.h" -#include "internal_macros.h" - -#ifdef BENCHMARK_OS_WINDOWS -#include -#include -#include -#else -#include -#include -#include -#include // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD -#include -#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX -#include -#endif -#endif - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "arraysize.h" -#include "check.h" -#include "cycleclock.h" -#include "internal_macros.h" -#include "log.h" -#include "sleep.h" -#include "string_util.h" - -namespace benchmark { -namespace { -std::once_flag cpuinfo_init; -double cpuinfo_cycles_per_second = 1.0; -int cpuinfo_num_cpus = 1; // Conservative guess - -#if !defined BENCHMARK_OS_MACOSX -const int64_t estimate_time_ms = 1000; - -// Helper function estimates cycles/sec by observing cycles elapsed during -// sleep(). Using small sleep time decreases accuracy significantly. -int64_t EstimateCyclesPerSecond() { - const int64_t start_ticks = cycleclock::Now(); - SleepForMilliseconds(estimate_time_ms); - return cycleclock::Now() - start_ticks; -} -#endif - -#if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN -// Helper function for reading an int from a file. Returns true if successful -// and the memory location pointed to by value is set to the value read. 
-bool ReadIntFromFile(const char* file, long* value) { - bool ret = false; - int fd = open(file, O_RDONLY); - if (fd != -1) { - char line[1024]; - char* err; - memset(line, '\0', sizeof(line)); - ssize_t read_err = read(fd, line, sizeof(line) - 1); - ((void)read_err); // prevent unused warning - CHECK(read_err >= 0); - const long temp_value = strtol(line, &err, 10); - if (line[0] != '\0' && (*err == '\n' || *err == '\0')) { - *value = temp_value; - ret = true; - } - close(fd); - } - return ret; -} -#endif - -#if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN -static std::string convertToLowerCase(std::string s) { - for (auto& ch : s) - ch = std::tolower(ch); - return s; -} -static bool startsWithKey(std::string Value, std::string Key, - bool IgnoreCase = true) { - if (IgnoreCase) { - Key = convertToLowerCase(std::move(Key)); - Value = convertToLowerCase(std::move(Value)); - } - return Value.compare(0, Key.size(), Key) == 0; -} -#endif - -void InitializeSystemInfo() { -#if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN - char line[1024]; - char* err; - long freq; - - bool saw_mhz = false; - - // If the kernel is exporting the tsc frequency use that. There are issues - // where cpuinfo_max_freq cannot be relied on because the BIOS may be - // exporintg an invalid p-state (on x86) or p-states may be used to put the - // processor in a new mode (turbo mode). Essentially, those frequencies - // cannot always be relied upon. The same reasons apply to /proc/cpuinfo as - // well. - if (!saw_mhz && - ReadIntFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq)) { - // The value is in kHz (as the file name suggests). For example, on a - // 2GHz warpstation, the file contains the value "2000000". - cpuinfo_cycles_per_second = freq * 1000.0; - saw_mhz = true; - } - - // If CPU scaling is in effect, we want to use the *maximum* frequency, - // not whatever CPU speed some random processor happens to be using now. - if (!saw_mhz && - ReadIntFromFile("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", - &freq)) { - // The value is in kHz. For example, on a 2GHz warpstation, the file - // contains the value "2000000". - cpuinfo_cycles_per_second = freq * 1000.0; - saw_mhz = true; - } - - // Read /proc/cpuinfo for other values, and if there is no cpuinfo_max_freq. 
- const char* pname = "/proc/cpuinfo"; - int fd = open(pname, O_RDONLY); - if (fd == -1) { - perror(pname); - if (!saw_mhz) { - cpuinfo_cycles_per_second = - static_cast(EstimateCyclesPerSecond()); - } - return; - } - - double bogo_clock = 1.0; - bool saw_bogo = false; - long max_cpu_id = 0; - int num_cpus = 0; - line[0] = line[1] = '\0'; - size_t chars_read = 0; - do { // we'll exit when the last read didn't read anything - // Move the next line to the beginning of the buffer - const size_t oldlinelen = strlen(line); - if (sizeof(line) == oldlinelen + 1) // oldlinelen took up entire line - line[0] = '\0'; - else // still other lines left to save - memmove(line, line + oldlinelen + 1, sizeof(line) - (oldlinelen + 1)); - // Terminate the new line, reading more if we can't find the newline - char* newline = strchr(line, '\n'); - if (newline == nullptr) { - const size_t linelen = strlen(line); - const size_t bytes_to_read = sizeof(line) - 1 - linelen; - CHECK(bytes_to_read > 0); // because the memmove recovered >=1 bytes - chars_read = read(fd, line + linelen, bytes_to_read); - line[linelen + chars_read] = '\0'; - newline = strchr(line, '\n'); - } - if (newline != nullptr) *newline = '\0'; - - // When parsing the "cpu MHz" and "bogomips" (fallback) entries, we only - // accept postive values. Some environments (virtual machines) report zero, - // which would cause infinite looping in WallTime_Init. - if (!saw_mhz && startsWithKey(line, "cpu MHz")) { - const char* freqstr = strchr(line, ':'); - if (freqstr) { - cpuinfo_cycles_per_second = strtod(freqstr + 1, &err) * 1000000.0; - if (freqstr[1] != '\0' && *err == '\0' && cpuinfo_cycles_per_second > 0) - saw_mhz = true; - } - } else if (startsWithKey(line, "bogomips")) { - const char* freqstr = strchr(line, ':'); - if (freqstr) { - bogo_clock = strtod(freqstr + 1, &err) * 1000000.0; - if (freqstr[1] != '\0' && *err == '\0' && bogo_clock > 0) - saw_bogo = true; - } - } else if (startsWithKey(line, "processor", /*IgnoreCase*/false)) { - // The above comparison is case-sensitive because ARM kernels often - // include a "Processor" line that tells you about the CPU, distinct - // from the usual "processor" lines that give you CPU ids. No current - // Linux architecture is using "Processor" for CPU ids. - num_cpus++; // count up every time we see an "processor :" entry - const char* id_str = strchr(line, ':'); - if (id_str) { - const long cpu_id = strtol(id_str + 1, &err, 10); - if (id_str[1] != '\0' && *err == '\0' && max_cpu_id < cpu_id) - max_cpu_id = cpu_id; - } - } - } while (chars_read > 0); - close(fd); - - if (!saw_mhz) { - if (saw_bogo) { - // If we didn't find anything better, we'll use bogomips, but - // we're not happy about it. - cpuinfo_cycles_per_second = bogo_clock; - } else { - // If we don't even have bogomips, we'll use the slow estimation. - cpuinfo_cycles_per_second = - static_cast(EstimateCyclesPerSecond()); - } - } - if (num_cpus == 0) { - fprintf(stderr, "Failed to read num. CPUs correctly from /proc/cpuinfo\n"); - } else { - if ((max_cpu_id + 1) != num_cpus) { - fprintf(stderr, - "CPU ID assignments in /proc/cpuinfo seem messed up." - " This is usually caused by a bad BIOS.\n"); - } - cpuinfo_num_cpus = num_cpus; - } - -#elif defined BENCHMARK_OS_FREEBSD -// For this sysctl to work, the machine must be configured without -// SMP, APIC, or APM support. hz should be 64-bit in freebsd 7.0 -// and later. Before that, it's a 32-bit quantity (and gives the -// wrong answer on machines faster than 2^32 Hz). 
See -// http://lists.freebsd.org/pipermail/freebsd-i386/2004-November/001846.html -// But also compare FreeBSD 7.0: -// http://fxr.watson.org/fxr/source/i386/i386/tsc.c?v=RELENG70#L223 -// 231 error = sysctl_handle_quad(oidp, &freq, 0, req); -// To FreeBSD 6.3 (it's the same in 6-STABLE): -// http://fxr.watson.org/fxr/source/i386/i386/tsc.c?v=RELENG6#L131 -// 139 error = sysctl_handle_int(oidp, &freq, sizeof(freq), req); -#if __FreeBSD__ >= 7 - uint64_t hz = 0; -#else - unsigned int hz = 0; -#endif - size_t sz = sizeof(hz); - const char* sysctl_path = "machdep.tsc_freq"; - if (sysctlbyname(sysctl_path, &hz, &sz, nullptr, 0) != 0) { - fprintf(stderr, "Unable to determine clock rate from sysctl: %s: %s\n", - sysctl_path, strerror(errno)); - cpuinfo_cycles_per_second = static_cast(EstimateCyclesPerSecond()); - } else { - cpuinfo_cycles_per_second = hz; - } -// TODO: also figure out cpuinfo_num_cpus - -#elif defined BENCHMARK_OS_WINDOWS - // In NT, read MHz from the registry. If we fail to do so or we're in win9x - // then make a crude estimate. - DWORD data, data_size = sizeof(data); - if (IsWindowsXPOrGreater() && - SUCCEEDED( - SHGetValueA(HKEY_LOCAL_MACHINE, - "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", - "~MHz", nullptr, &data, &data_size))) - cpuinfo_cycles_per_second = - static_cast((int64_t)data * (int64_t)(1000 * 1000)); // was mhz - else - cpuinfo_cycles_per_second = static_cast(EstimateCyclesPerSecond()); - - SYSTEM_INFO sysinfo; - // Use memset as opposed to = {} to avoid GCC missing initializer false - // positives. - std::memset(&sysinfo, 0, sizeof(SYSTEM_INFO)); - GetSystemInfo(&sysinfo); - cpuinfo_num_cpus = sysinfo.dwNumberOfProcessors; // number of logical - // processors in the current - // group - -#elif defined BENCHMARK_OS_MACOSX - int32_t num_cpus = 0; - size_t size = sizeof(num_cpus); - if (::sysctlbyname("hw.ncpu", &num_cpus, &size, nullptr, 0) == 0 && - (size == sizeof(num_cpus))) { - cpuinfo_num_cpus = num_cpus; - } else { - fprintf(stderr, "%s\n", strerror(errno)); - std::exit(EXIT_FAILURE); - } - int64_t cpu_freq = 0; - size = sizeof(cpu_freq); - if (::sysctlbyname("hw.cpufrequency", &cpu_freq, &size, nullptr, 0) == 0 && - (size == sizeof(cpu_freq))) { - cpuinfo_cycles_per_second = cpu_freq; - } else { - #if defined BENCHMARK_OS_IOS - fprintf(stderr, "CPU frequency cannot be detected. \n"); - cpuinfo_cycles_per_second = 0; - #else - fprintf(stderr, "%s\n", strerror(errno)); - std::exit(EXIT_FAILURE); - #endif - } -#else - // Generic cycles per second counter - cpuinfo_cycles_per_second = static_cast(EstimateCyclesPerSecond()); -#endif -} - -} // end namespace - -double CyclesPerSecond(void) { - std::call_once(cpuinfo_init, InitializeSystemInfo); - return cpuinfo_cycles_per_second; -} - -int NumCPUs(void) { - std::call_once(cpuinfo_init, InitializeSystemInfo); - return cpuinfo_num_cpus; -} - -// The ""'s catch people who don't pass in a literal for "str" -#define strliterallen(str) (sizeof("" str "") - 1) - -// Must use a string literal for prefix. -#define memprefix(str, len, prefix) \ - ((((len) >= strliterallen(prefix)) && \ - std::memcmp(str, prefix, strliterallen(prefix)) == 0) \ - ? str + strliterallen(prefix) \ - : nullptr) - -bool CpuScalingEnabled() { -#ifndef BENCHMARK_OS_WINDOWS - // On Linux, the CPUfreq subsystem exposes CPU information as files on the - // local file system. If reading the exported files fails, then we may not be - // running on Linux, so we silently ignore all the read errors. 
- for (int cpu = 0, num_cpus = NumCPUs(); cpu < num_cpus; ++cpu) { - std::string governor_file = - StrCat("/sys/devices/system/cpu/cpu", cpu, "/cpufreq/scaling_governor"); - FILE* file = fopen(governor_file.c_str(), "r"); - if (!file) break; - char buff[16]; - size_t bytes_read = fread(buff, 1, sizeof(buff), file); - fclose(file); - if (memprefix(buff, bytes_read, "performance") == nullptr) return true; - } -#endif - return false; -} - -} // end namespace benchmark diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/sysinfo.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/sysinfo.h deleted file mode 100644 index c5d9916d2dd..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/sysinfo.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef BENCHMARK_SYSINFO_H_ -#define BENCHMARK_SYSINFO_H_ - -namespace benchmark { -int NumCPUs(); -double CyclesPerSecond(); -bool CpuScalingEnabled(); -} // end namespace benchmark - -#endif // BENCHMARK_SYSINFO_H_ diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/timers.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/timers.cc deleted file mode 100644 index 817272d00bc..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/timers.cc +++ /dev/null @@ -1,212 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "timers.h" -#include "internal_macros.h" - -#ifdef BENCHMARK_OS_WINDOWS -#include -#include -#include -#else -#include -#include -#include -#include // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD -#include -#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX -#include -#endif -#if defined(BENCHMARK_OS_MACOSX) -#include -#include -#include -#endif -#endif - -#ifdef BENCHMARK_OS_EMSCRIPTEN -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "check.h" -#include "log.h" -#include "sleep.h" -#include "string_util.h" - -namespace benchmark { - -// Suppress unused warnings on helper functions. 
-#if defined(__GNUC__) -#pragma GCC diagnostic ignored "-Wunused-function" -#endif - -namespace { -#if defined(BENCHMARK_OS_WINDOWS) -double MakeTime(FILETIME const& kernel_time, FILETIME const& user_time) { - ULARGE_INTEGER kernel; - ULARGE_INTEGER user; - kernel.HighPart = kernel_time.dwHighDateTime; - kernel.LowPart = kernel_time.dwLowDateTime; - user.HighPart = user_time.dwHighDateTime; - user.LowPart = user_time.dwLowDateTime; - return (static_cast(kernel.QuadPart) + - static_cast(user.QuadPart)) * - 1e-7; -} -#else -double MakeTime(struct rusage const& ru) { - return (static_cast(ru.ru_utime.tv_sec) + - static_cast(ru.ru_utime.tv_usec) * 1e-6 + - static_cast(ru.ru_stime.tv_sec) + - static_cast(ru.ru_stime.tv_usec) * 1e-6); -} -#endif -#if defined(BENCHMARK_OS_MACOSX) -double MakeTime(thread_basic_info_data_t const& info) { - return (static_cast(info.user_time.seconds) + - static_cast(info.user_time.microseconds) * 1e-6 + - static_cast(info.system_time.seconds) + - static_cast(info.system_time.microseconds) * 1e-6); -} -#endif -#if defined(CLOCK_PROCESS_CPUTIME_ID) || defined(CLOCK_THREAD_CPUTIME_ID) -double MakeTime(struct timespec const& ts) { - return ts.tv_sec + (static_cast(ts.tv_nsec) * 1e-9); -} -#endif - -BENCHMARK_NORETURN static void DiagnoseAndExit(const char* msg) { - std::cerr << "ERROR: " << msg << std::endl; - std::exit(EXIT_FAILURE); -} - -} // end namespace - -double ProcessCPUUsage() { -#if defined(BENCHMARK_OS_WINDOWS) - HANDLE proc = GetCurrentProcess(); - FILETIME creation_time; - FILETIME exit_time; - FILETIME kernel_time; - FILETIME user_time; - if (GetProcessTimes(proc, &creation_time, &exit_time, &kernel_time, - &user_time)) - return MakeTime(kernel_time, user_time); - DiagnoseAndExit("GetProccessTimes() failed"); -#elif defined(BENCHMARK_OS_EMSCRIPTEN) - // clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) returns 0 on Emscripten. - // Use Emscripten-specific API. Reported CPU time would be exactly the - // same as total time, but this is ok because there aren't long-latency - // syncronous system calls in Emscripten. - return emscripten_get_now() * 1e-3; -#elif defined(CLOCK_PROCESS_CPUTIME_ID) && !defined(BENCHMARK_OS_MACOSX) - // FIXME We want to use clock_gettime, but its not available in MacOS 10.11. See - // https://github.com/google/benchmark/pull/292 - struct timespec spec; - if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &spec) == 0) - return MakeTime(spec); - DiagnoseAndExit("clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) failed"); -#else - struct rusage ru; - if (getrusage(RUSAGE_SELF, &ru) == 0) return MakeTime(ru); - DiagnoseAndExit("getrusage(RUSAGE_SELF, ...) failed"); -#endif -} - -double ThreadCPUUsage() { -#if defined(BENCHMARK_OS_WINDOWS) - HANDLE this_thread = GetCurrentThread(); - FILETIME creation_time; - FILETIME exit_time; - FILETIME kernel_time; - FILETIME user_time; - GetThreadTimes(this_thread, &creation_time, &exit_time, &kernel_time, - &user_time); - return MakeTime(kernel_time, user_time); -#elif defined(BENCHMARK_OS_MACOSX) - // FIXME We want to use clock_gettime, but its not available in MacOS 10.11. 
See - // https://github.com/google/benchmark/pull/292 - mach_msg_type_number_t count = THREAD_BASIC_INFO_COUNT; - thread_basic_info_data_t info; - mach_port_t thread = pthread_mach_thread_np(pthread_self()); - if (thread_info(thread, THREAD_BASIC_INFO, (thread_info_t)&info, &count) == - KERN_SUCCESS) { - return MakeTime(info); - } - DiagnoseAndExit("ThreadCPUUsage() failed when evaluating thread_info"); -#elif defined(BENCHMARK_OS_EMSCRIPTEN) - // Emscripten doesn't support traditional threads - return ProcessCPUUsage(); -#elif defined(BENCHMARK_OS_RTEMS) - // RTEMS doesn't support CLOCK_THREAD_CPUTIME_ID. See - // https://github.com/RTEMS/rtems/blob/master/cpukit/posix/src/clockgettime.c - return ProcessCPUUsage(); -#elif defined(CLOCK_THREAD_CPUTIME_ID) - struct timespec ts; - if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0) return MakeTime(ts); - DiagnoseAndExit("clock_gettime(CLOCK_THREAD_CPUTIME_ID, ...) failed"); -#else -#error Per-thread timing is not available on your system. -#endif -} - -namespace { - -std::string DateTimeString(bool local) { - typedef std::chrono::system_clock Clock; - std::time_t now = Clock::to_time_t(Clock::now()); - const std::size_t kStorageSize = 128; - char storage[kStorageSize]; - std::size_t written; - - if (local) { -#if defined(BENCHMARK_OS_WINDOWS) - written = - std::strftime(storage, sizeof(storage), "%x %X", ::localtime(&now)); -#else - std::tm timeinfo; - std::memset(&timeinfo, 0, sizeof(std::tm)); - ::localtime_r(&now, &timeinfo); - written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo); -#endif - } else { -#if defined(BENCHMARK_OS_WINDOWS) - written = std::strftime(storage, sizeof(storage), "%x %X", ::gmtime(&now)); -#else - std::tm timeinfo; - std::memset(&timeinfo, 0, sizeof(std::tm)); - ::gmtime_r(&now, &timeinfo); - written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo); -#endif - } - CHECK(written < kStorageSize); - ((void)written); // prevent unused variable in optimized mode. 
- return std::string(storage);
-}
-
-} // end namespace
-
-std::string LocalDateTimeString() { return DateTimeString(true); }
-
-} // end namespace benchmark
diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/timers.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/timers.h
deleted file mode 100644
index 65606ccd93d..00000000000
--- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/src/timers.h
+++ /dev/null
@@ -1,48 +0,0 @@
-#ifndef BENCHMARK_TIMERS_H
-#define BENCHMARK_TIMERS_H
-
-#include <chrono>
-#include <string>
-
-namespace benchmark {
-
-// Return the CPU usage of the current process
-double ProcessCPUUsage();
-
-// Return the CPU usage of the children of the current process
-double ChildrenCPUUsage();
-
-// Return the CPU usage of the current thread
-double ThreadCPUUsage();
-
-#if defined(HAVE_STEADY_CLOCK)
-template <bool HighResIsSteady = std::chrono::high_resolution_clock::is_steady>
-struct ChooseSteadyClock {
- typedef std::chrono::high_resolution_clock type;
-};
-
-template <>
-struct ChooseSteadyClock<false> {
- typedef std::chrono::steady_clock type;
-};
-#endif
-
-struct ChooseClockType {
-#if defined(HAVE_STEADY_CLOCK)
- typedef ChooseSteadyClock<>::type type;
-#else
- typedef std::chrono::high_resolution_clock type;
-#endif
-};
-
-inline double ChronoClockNow() {
- typedef ChooseClockType::type ClockType;
- using FpSeconds = std::chrono::duration<double, std::chrono::seconds::period>;
- return FpSeconds(ClockType::now().time_since_epoch()).count();
-}
-
-std::string LocalDateTimeString();
-
-} // end namespace benchmark
-
-#endif // BENCHMARK_TIMERS_H
diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/CMakeLists.txt b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/CMakeLists.txt
deleted file mode 100644
index b55612b4655..00000000000
--- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/CMakeLists.txt
+++ /dev/null
@@ -1,170 +0,0 @@
-# Enable the tests
-
-find_package(Threads REQUIRED)
-include(CheckCXXCompilerFlag)
-
-# NOTE: Some tests use `<cassert>` to perform the test. Therefore we must
-# strip -DNDEBUG from the default CMake flags in DEBUG mode.
-string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE)
-if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" )
- add_definitions( -UNDEBUG )
- add_definitions(-DTEST_BENCHMARK_LIBRARY_HAS_NO_ASSERTIONS)
- # Also remove /D NDEBUG to avoid MSVC warnings about conflicting defines.
- foreach (flags_var_to_scrub
- CMAKE_CXX_FLAGS_RELEASE
- CMAKE_CXX_FLAGS_RELWITHDEBINFO
- CMAKE_CXX_FLAGS_MINSIZEREL
- CMAKE_C_FLAGS_RELEASE
- CMAKE_C_FLAGS_RELWITHDEBINFO
- CMAKE_C_FLAGS_MINSIZEREL)
- string (REGEX REPLACE "(^| )[/-]D *NDEBUG($| )" " "
- "${flags_var_to_scrub}" "${${flags_var_to_scrub}}")
- endforeach()
-endif()
-
-# NOTE: These flags must be added after find_package(Threads REQUIRED) otherwise
-# they will break the configuration check.
-if (DEFINED BENCHMARK_CXX_LINKER_FLAGS) - list(APPEND CMAKE_EXE_LINKER_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}) -endif() - -add_library(output_test_helper STATIC output_test_helper.cc output_test.h) - -macro(compile_benchmark_test name) - add_executable(${name} "${name}.cc") - target_link_libraries(${name} benchmark ${CMAKE_THREAD_LIBS_INIT}) -endmacro(compile_benchmark_test) - - -macro(compile_output_test name) - add_executable(${name} "${name}.cc" output_test.h) - target_link_libraries(${name} output_test_helper benchmark - ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) -endmacro(compile_output_test) - - -# Demonstration executable -compile_benchmark_test(benchmark_test) -add_test(benchmark benchmark_test --benchmark_min_time=0.01) - -compile_benchmark_test(filter_test) -macro(add_filter_test name filter expect) - add_test(${name} filter_test --benchmark_min_time=0.01 --benchmark_filter=${filter} ${expect}) - add_test(${name}_list_only filter_test --benchmark_list_tests --benchmark_filter=${filter} ${expect}) -endmacro(add_filter_test) - -add_filter_test(filter_simple "Foo" 3) -add_filter_test(filter_suffix "BM_.*" 4) -add_filter_test(filter_regex_all ".*" 5) -add_filter_test(filter_regex_blank "" 5) -add_filter_test(filter_regex_none "monkey" 0) -add_filter_test(filter_regex_wildcard ".*Foo.*" 3) -add_filter_test(filter_regex_begin "^BM_.*" 4) -add_filter_test(filter_regex_begin2 "^N" 1) -add_filter_test(filter_regex_end ".*Ba$" 1) - -compile_benchmark_test(options_test) -add_test(options_benchmarks options_test --benchmark_min_time=0.01) - -compile_benchmark_test(basic_test) -add_test(basic_benchmark basic_test --benchmark_min_time=0.01) - -compile_benchmark_test(diagnostics_test) -add_test(diagnostics_test diagnostics_test --benchmark_min_time=0.01) - -compile_benchmark_test(skip_with_error_test) -add_test(skip_with_error_test skip_with_error_test --benchmark_min_time=0.01) - -compile_benchmark_test(donotoptimize_test) -# Some of the issues with DoNotOptimize only occur when optimization is enabled -check_cxx_compiler_flag(-O3 BENCHMARK_HAS_O3_FLAG) -if (BENCHMARK_HAS_O3_FLAG) - set_target_properties(donotoptimize_test PROPERTIES COMPILE_FLAGS "-O3") -endif() -add_test(donotoptimize_test donotoptimize_test --benchmark_min_time=0.01) - -compile_benchmark_test(fixture_test) -add_test(fixture_test fixture_test --benchmark_min_time=0.01) - -compile_benchmark_test(register_benchmark_test) -add_test(register_benchmark_test register_benchmark_test --benchmark_min_time=0.01) - -compile_benchmark_test(map_test) -add_test(map_test map_test --benchmark_min_time=0.01) - -compile_benchmark_test(multiple_ranges_test) -add_test(multiple_ranges_test multiple_ranges_test --benchmark_min_time=0.01) - -compile_output_test(reporter_output_test) -add_test(reporter_output_test reporter_output_test --benchmark_min_time=0.01) - -compile_output_test(user_counters_test) -add_test(user_counters_test user_counters_test --benchmark_min_time=0.01) - -compile_output_test(user_counters_tabular_test) -add_test(user_counters_tabular_test user_counters_tabular_test --benchmark_counters_tabular=true --benchmark_min_time=0.01) - -check_cxx_compiler_flag(-std=c++03 BENCHMARK_HAS_CXX03_FLAG) -if (BENCHMARK_HAS_CXX03_FLAG) - set(CXX03_FLAGS "${CMAKE_CXX_FLAGS}") - string(REPLACE "-std=c++11" "-std=c++03" CXX03_FLAGS "${CXX03_FLAGS}") - string(REPLACE "-std=c++0x" "-std=c++03" CXX03_FLAGS "${CXX03_FLAGS}") - - compile_benchmark_test(cxx03_test) - set_target_properties(cxx03_test - PROPERTIES COMPILE_FLAGS 
"${CXX03_FLAGS}") - add_test(cxx03 cxx03_test --benchmark_min_time=0.01) -endif() - -# Attempt to work around flaky test failures when running on Appveyor servers. -if (DEFINED ENV{APPVEYOR}) - set(COMPLEXITY_MIN_TIME "0.5") -else() - set(COMPLEXITY_MIN_TIME "0.01") -endif() -compile_output_test(complexity_test) -add_test(complexity_benchmark complexity_test --benchmark_min_time=${COMPLEXITY_MIN_TIME}) - -# Add the coverage command(s) -if(CMAKE_BUILD_TYPE) - string(TOLOWER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_LOWER) -endif() -if (${CMAKE_BUILD_TYPE_LOWER} MATCHES "coverage") - find_program(GCOV gcov) - find_program(LCOV lcov) - find_program(GENHTML genhtml) - find_program(CTEST ctest) - if (GCOV AND LCOV AND GENHTML AND CTEST AND HAVE_CXX_FLAG_COVERAGE) - add_custom_command( - OUTPUT ${CMAKE_BINARY_DIR}/lcov/index.html - COMMAND ${LCOV} -q -z -d . - COMMAND ${LCOV} -q --no-external -c -b "${CMAKE_SOURCE_DIR}" -d . -o before.lcov -i - COMMAND ${CTEST} --force-new-ctest-process - COMMAND ${LCOV} -q --no-external -c -b "${CMAKE_SOURCE_DIR}" -d . -o after.lcov - COMMAND ${LCOV} -q -a before.lcov -a after.lcov --output-file final.lcov - COMMAND ${LCOV} -q -r final.lcov "'${CMAKE_SOURCE_DIR}/test/*'" -o final.lcov - COMMAND ${GENHTML} final.lcov -o lcov --demangle-cpp --sort -p "${CMAKE_BINARY_DIR}" -t benchmark - DEPENDS filter_test benchmark_test options_test basic_test fixture_test cxx03_test complexity_test - WORKING_DIRECTORY ${CMAKE_BINARY_DIR} - COMMENT "Running LCOV" - ) - add_custom_target(coverage - DEPENDS ${CMAKE_BINARY_DIR}/lcov/index.html - COMMENT "LCOV report at lcov/index.html" - ) - message(STATUS "Coverage command added") - else() - if (HAVE_CXX_FLAG_COVERAGE) - set(CXX_FLAG_COVERAGE_MESSAGE supported) - else() - set(CXX_FLAG_COVERAGE_MESSAGE unavailable) - endif() - message(WARNING - "Coverage not available:\n" - " gcov: ${GCOV}\n" - " lcov: ${LCOV}\n" - " genhtml: ${GENHTML}\n" - " ctest: ${CTEST}\n" - " --coverage flag: ${CXX_FLAG_COVERAGE_MESSAGE}") - endif() -endif() diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/basic_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/basic_test.cc deleted file mode 100644 index bc1f96d9315..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/basic_test.cc +++ /dev/null @@ -1,99 +0,0 @@ - -#include "benchmark/benchmark.h" - -#define BASIC_BENCHMARK_TEST(x) BENCHMARK(x)->Arg(8)->Arg(512)->Arg(8192) - -void BM_empty(benchmark::State& state) { - while (state.KeepRunning()) { - benchmark::DoNotOptimize(state.iterations()); - } -} -BENCHMARK(BM_empty); -BENCHMARK(BM_empty)->ThreadPerCpu(); - -void BM_spin_empty(benchmark::State& state) { - while (state.KeepRunning()) { - for (int x = 0; x < state.range(0); ++x) { - benchmark::DoNotOptimize(x); - } - } -} -BASIC_BENCHMARK_TEST(BM_spin_empty); -BASIC_BENCHMARK_TEST(BM_spin_empty)->ThreadPerCpu(); - -void BM_spin_pause_before(benchmark::State& state) { - for (int i = 0; i < state.range(0); ++i) { - benchmark::DoNotOptimize(i); - } - while (state.KeepRunning()) { - for (int i = 0; i < state.range(0); ++i) { - benchmark::DoNotOptimize(i); - } - } -} -BASIC_BENCHMARK_TEST(BM_spin_pause_before); -BASIC_BENCHMARK_TEST(BM_spin_pause_before)->ThreadPerCpu(); - -void BM_spin_pause_during(benchmark::State& state) { - while (state.KeepRunning()) { - state.PauseTiming(); - for (int i = 0; i < state.range(0); ++i) { - benchmark::DoNotOptimize(i); - } - state.ResumeTiming(); - for (int i = 0; i < state.range(0); ++i) { - 
benchmark::DoNotOptimize(i); - } - } -} -BASIC_BENCHMARK_TEST(BM_spin_pause_during); -BASIC_BENCHMARK_TEST(BM_spin_pause_during)->ThreadPerCpu(); - -void BM_pause_during(benchmark::State& state) { - while (state.KeepRunning()) { - state.PauseTiming(); - state.ResumeTiming(); - } -} -BENCHMARK(BM_pause_during); -BENCHMARK(BM_pause_during)->ThreadPerCpu(); -BENCHMARK(BM_pause_during)->UseRealTime(); -BENCHMARK(BM_pause_during)->UseRealTime()->ThreadPerCpu(); - -void BM_spin_pause_after(benchmark::State& state) { - while (state.KeepRunning()) { - for (int i = 0; i < state.range(0); ++i) { - benchmark::DoNotOptimize(i); - } - } - for (int i = 0; i < state.range(0); ++i) { - benchmark::DoNotOptimize(i); - } -} -BASIC_BENCHMARK_TEST(BM_spin_pause_after); -BASIC_BENCHMARK_TEST(BM_spin_pause_after)->ThreadPerCpu(); - -void BM_spin_pause_before_and_after(benchmark::State& state) { - for (int i = 0; i < state.range(0); ++i) { - benchmark::DoNotOptimize(i); - } - while (state.KeepRunning()) { - for (int i = 0; i < state.range(0); ++i) { - benchmark::DoNotOptimize(i); - } - } - for (int i = 0; i < state.range(0); ++i) { - benchmark::DoNotOptimize(i); - } -} -BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after); -BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after)->ThreadPerCpu(); - -void BM_empty_stop_start(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_empty_stop_start); -BENCHMARK(BM_empty_stop_start)->ThreadPerCpu(); - -BENCHMARK_MAIN() diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/benchmark_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/benchmark_test.cc deleted file mode 100644 index 7a16466e208..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/benchmark_test.cc +++ /dev/null @@ -1,240 +0,0 @@ -#include "benchmark/benchmark.h" - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(__GNUC__) -#define BENCHMARK_NOINLINE __attribute__((noinline)) -#else -#define BENCHMARK_NOINLINE -#endif - -namespace { - -int BENCHMARK_NOINLINE Factorial(uint32_t n) { - return (n == 1) ? 
1 : n * Factorial(n - 1);
-}
-
-double CalculatePi(int depth) {
-  double pi = 0.0;
-  for (int i = 0; i < depth; ++i) {
-    double numerator = static_cast<double>(((i % 2) * 2) - 1);
-    double denominator = static_cast<double>((2 * i) - 1);
-    pi += numerator / denominator;
-  }
-  return (pi - 1.0) * 4;
-}
-
-std::set<int> ConstructRandomSet(int size) {
-  std::set<int> s;
-  for (int i = 0; i < size; ++i) s.insert(i);
-  return s;
-}
-
-std::mutex test_vector_mu;
-std::vector<int>* test_vector = nullptr;
-
-}  // end namespace
-
-static void BM_Factorial(benchmark::State& state) {
-  int fac_42 = 0;
-  while (state.KeepRunning()) fac_42 = Factorial(8);
-  // Prevent compiler optimizations
-  std::stringstream ss;
-  ss << fac_42;
-  state.SetLabel(ss.str());
-}
-BENCHMARK(BM_Factorial);
-BENCHMARK(BM_Factorial)->UseRealTime();
-
-static void BM_CalculatePiRange(benchmark::State& state) {
-  double pi = 0.0;
-  while (state.KeepRunning()) pi = CalculatePi(state.range(0));
-  std::stringstream ss;
-  ss << pi;
-  state.SetLabel(ss.str());
-}
-BENCHMARK_RANGE(BM_CalculatePiRange, 1, 1024 * 1024);
-
-static void BM_CalculatePi(benchmark::State& state) {
-  static const int depth = 1024;
-  while (state.KeepRunning()) {
-    benchmark::DoNotOptimize(CalculatePi(depth));
-  }
-}
-BENCHMARK(BM_CalculatePi)->Threads(8);
-BENCHMARK(BM_CalculatePi)->ThreadRange(1, 32);
-BENCHMARK(BM_CalculatePi)->ThreadPerCpu();
-
-static void BM_SetInsert(benchmark::State& state) {
-  while (state.KeepRunning()) {
-    state.PauseTiming();
-    std::set<int> data = ConstructRandomSet(state.range(0));
-    state.ResumeTiming();
-    for (int j = 0; j < state.range(1); ++j) data.insert(rand());
-  }
-  state.SetItemsProcessed(state.iterations() * state.range(1));
-  state.SetBytesProcessed(state.iterations() * state.range(1) * sizeof(int));
-}
-BENCHMARK(BM_SetInsert)->Ranges({{1 << 10, 8 << 10}, {1, 10}});
-
-template <typename Container, typename ValueType = typename Container::value_type>
-static void BM_Sequential(benchmark::State& state) {
-  ValueType v = 42;
-  while (state.KeepRunning()) {
-    Container c;
-    for (int i = state.range(0); --i;) c.push_back(v);
-  }
-  const size_t items_processed = state.iterations() * state.range(0);
-  state.SetItemsProcessed(items_processed);
-  state.SetBytesProcessed(items_processed * sizeof(v));
-}
-BENCHMARK_TEMPLATE2(BM_Sequential, std::vector<int>, int)
-    ->Range(1 << 0, 1 << 10);
-BENCHMARK_TEMPLATE(BM_Sequential, std::list<int>)->Range(1 << 0, 1 << 10);
-// Test the variadic version of BENCHMARK_TEMPLATE in C++11 and beyond.
-#if __cplusplus >= 201103L
-BENCHMARK_TEMPLATE(BM_Sequential, std::vector<int>, int)->Arg(512);
-#endif
-
-static void BM_StringCompare(benchmark::State& state) {
-  std::string s1(state.range(0), '-');
-  std::string s2(state.range(0), '-');
-  while (state.KeepRunning()) benchmark::DoNotOptimize(s1.compare(s2));
-}
-BENCHMARK(BM_StringCompare)->Range(1, 1 << 20);
-
-static void BM_SetupTeardown(benchmark::State& state) {
-  if (state.thread_index == 0) {
-    // No need to lock test_vector_mu here as this is running single-threaded.
-    test_vector = new std::vector<int>();
-  }
-  int i = 0;
-  while (state.KeepRunning()) {
-    std::lock_guard<std::mutex> l(test_vector_mu);
-    if (i % 2 == 0)
-      test_vector->push_back(i);
-    else
-      test_vector->pop_back();
-    ++i;
-  }
-  if (state.thread_index == 0) {
-    delete test_vector;
-  }
-}
-BENCHMARK(BM_SetupTeardown)->ThreadPerCpu();
-
-static void BM_LongTest(benchmark::State& state) {
-  double tracker = 0.0;
-  while (state.KeepRunning()) {
-    for (int i = 0; i < state.range(0); ++i)
-      benchmark::DoNotOptimize(tracker += i);
-  }
-}
-BENCHMARK(BM_LongTest)->Range(1 << 16, 1 << 28);
-
-static void BM_ParallelMemset(benchmark::State& state) {
-  int size = state.range(0) / static_cast<int>(sizeof(int));
-  int thread_size = size / state.threads;
-  int from = thread_size * state.thread_index;
-  int to = from + thread_size;
-
-  if (state.thread_index == 0) {
-    test_vector = new std::vector<int>(size);
-  }
-
-  while (state.KeepRunning()) {
-    for (int i = from; i < to; i++) {
-      // No need to lock test_vector_mu as ranges
-      // do not overlap between threads.
-      benchmark::DoNotOptimize(test_vector->at(i) = 1);
-    }
-  }
-
-  if (state.thread_index == 0) {
-    delete test_vector;
-  }
-}
-BENCHMARK(BM_ParallelMemset)->Arg(10 << 20)->ThreadRange(1, 4);
-
-static void BM_ManualTiming(benchmark::State& state) {
-  size_t slept_for = 0;
-  int microseconds = state.range(0);
-  std::chrono::duration<double, std::micro> sleep_duration{
-      static_cast<double>(microseconds)};
-
-  while (state.KeepRunning()) {
-    auto start = std::chrono::high_resolution_clock::now();
-    // Simulate some useful workload with a sleep
-    std::this_thread::sleep_for(
-        std::chrono::duration_cast<std::chrono::nanoseconds>(sleep_duration));
-    auto end = std::chrono::high_resolution_clock::now();
-
-    auto elapsed =
-        std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
-
-    state.SetIterationTime(elapsed.count());
-    slept_for += microseconds;
-  }
-  state.SetItemsProcessed(slept_for);
}
-BENCHMARK(BM_ManualTiming)->Range(1, 1 << 14)->UseRealTime();
-BENCHMARK(BM_ManualTiming)->Range(1, 1 << 14)->UseManualTime();
-
-#if __cplusplus >= 201103L
-
-template <class... Args>
-void BM_with_args(benchmark::State& state, Args&&...)
{ - while (state.KeepRunning()) { - } -} -BENCHMARK_CAPTURE(BM_with_args, int_test, 42, 43, 44); -BENCHMARK_CAPTURE(BM_with_args, string_and_pair_test, std::string("abc"), - std::pair(42, 3.8)); - -void BM_non_template_args(benchmark::State& state, int, double) { - while(state.KeepRunning()) {} -} -BENCHMARK_CAPTURE(BM_non_template_args, basic_test, 0, 0); - -#endif // __cplusplus >= 201103L - -static void BM_DenseThreadRanges(benchmark::State& st) { - switch (st.range(0)) { - case 1: - assert(st.threads == 1 || st.threads == 2 || st.threads == 3); - break; - case 2: - assert(st.threads == 1 || st.threads == 3 || st.threads == 4); - break; - case 3: - assert(st.threads == 5 || st.threads == 8 || st.threads == 11 || - st.threads == 14); - break; - default: - assert(false && "Invalid test case number"); - } - while (st.KeepRunning()) { - } -} -BENCHMARK(BM_DenseThreadRanges)->Arg(1)->DenseThreadRange(1, 3); -BENCHMARK(BM_DenseThreadRanges)->Arg(2)->DenseThreadRange(1, 4, 2); -BENCHMARK(BM_DenseThreadRanges)->Arg(3)->DenseThreadRange(5, 14, 3); - -BENCHMARK_MAIN() diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/complexity_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/complexity_test.cc deleted file mode 100644 index 62d1154df0e..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/complexity_test.cc +++ /dev/null @@ -1,167 +0,0 @@ -#undef NDEBUG -#include -#include -#include -#include -#include -#include "benchmark/benchmark.h" -#include "output_test.h" - -namespace { - -#define ADD_COMPLEXITY_CASES(...) \ - int CONCAT(dummy, __LINE__) = AddComplexityTest(__VA_ARGS__) - -int AddComplexityTest(std::string big_o_test_name, std::string rms_test_name, - std::string big_o) { - SetSubstitutions({{"%bigo_name", big_o_test_name}, - {"%rms_name", rms_test_name}, - {"%bigo_str", "[ ]* %float " + big_o}, - {"%bigo", big_o}, - {"%rms", "[ ]*[0-9]+ %"}}); - AddCases( - TC_ConsoleOut, - {{"^%bigo_name %bigo_str %bigo_str[ ]*$"}, - {"^%bigo_name", MR_Not}, // Assert we we didn't only matched a name. 
- {"^%rms_name %rms %rms[ ]*$", MR_Next}}); - AddCases(TC_JSONOut, {{"\"name\": \"%bigo_name\",$"}, - {"\"cpu_coefficient\": [0-9]+,$", MR_Next}, - {"\"real_coefficient\": [0-9]{1,5},$", MR_Next}, - {"\"big_o\": \"%bigo\",$", MR_Next}, - {"\"time_unit\": \"ns\"$", MR_Next}, - {"}", MR_Next}, - {"\"name\": \"%rms_name\",$"}, - {"\"rms\": %float$", MR_Next}, - {"}", MR_Next}}); - AddCases(TC_CSVOut, {{"^\"%bigo_name\",,%float,%float,%bigo,,,,,$"}, - {"^\"%bigo_name\"", MR_Not}, - {"^\"%rms_name\",,%float,%float,,,,,,$", MR_Next}}); - return 0; -} - -} // end namespace - -// ========================================================================= // -// --------------------------- Testing BigO O(1) --------------------------- // -// ========================================================================= // - -void BM_Complexity_O1(benchmark::State& state) { - while (state.KeepRunning()) { - for (int i = 0; i < 1024; ++i) { - benchmark::DoNotOptimize(&i); - } - } - state.SetComplexityN(state.range(0)); -} -BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity(benchmark::o1); -BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity(); -BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity([](int) { - return 1.0; -}); - -const char *big_o_1_test_name = "BM_Complexity_O1_BigO"; -const char *rms_o_1_test_name = "BM_Complexity_O1_RMS"; -const char *enum_big_o_1 = "\\([0-9]+\\)"; -// FIXME: Tolerate both '(1)' and 'lgN' as output when the complexity is auto -// deduced. -// See https://github.com/google/benchmark/issues/272 -const char *auto_big_o_1 = "(\\([0-9]+\\))|(lgN)"; -const char *lambda_big_o_1 = "f\\(N\\)"; - -// Add enum tests -ADD_COMPLEXITY_CASES(big_o_1_test_name, rms_o_1_test_name, enum_big_o_1); - -// Add auto enum tests -ADD_COMPLEXITY_CASES(big_o_1_test_name, rms_o_1_test_name, auto_big_o_1); - -// Add lambda tests -ADD_COMPLEXITY_CASES(big_o_1_test_name, rms_o_1_test_name, lambda_big_o_1); - -// ========================================================================= // -// --------------------------- Testing BigO O(N) --------------------------- // -// ========================================================================= // - -std::vector ConstructRandomVector(int size) { - std::vector v; - v.reserve(size); - for (int i = 0; i < size; ++i) { - v.push_back(std::rand() % size); - } - return v; -} - -void BM_Complexity_O_N(benchmark::State& state) { - auto v = ConstructRandomVector(state.range(0)); - const int item_not_in_vector = - state.range(0) * 2; // Test worst case scenario (item not in vector) - while (state.KeepRunning()) { - benchmark::DoNotOptimize(std::find(v.begin(), v.end(), item_not_in_vector)); - } - state.SetComplexityN(state.range(0)); -} -BENCHMARK(BM_Complexity_O_N) - ->RangeMultiplier(2) - ->Range(1 << 10, 1 << 16) - ->Complexity(benchmark::oN); -BENCHMARK(BM_Complexity_O_N) - ->RangeMultiplier(2) - ->Range(1 << 10, 1 << 16) - ->Complexity([](int n) -> double { return n; }); -BENCHMARK(BM_Complexity_O_N) - ->RangeMultiplier(2) - ->Range(1 << 10, 1 << 16) - ->Complexity(); - -const char *big_o_n_test_name = "BM_Complexity_O_N_BigO"; -const char *rms_o_n_test_name = "BM_Complexity_O_N_RMS"; -const char *enum_auto_big_o_n = "N"; -const char *lambda_big_o_n = "f\\(N\\)"; - -// Add enum tests -ADD_COMPLEXITY_CASES(big_o_n_test_name, rms_o_n_test_name, enum_auto_big_o_n); - -// Add lambda tests -ADD_COMPLEXITY_CASES(big_o_n_test_name, rms_o_n_test_name, lambda_big_o_n); - -// ========================================================================= 
// -// ------------------------- Testing BigO O(N*lgN) ------------------------- // -// ========================================================================= // - -static void BM_Complexity_O_N_log_N(benchmark::State& state) { - auto v = ConstructRandomVector(state.range(0)); - while (state.KeepRunning()) { - std::sort(v.begin(), v.end()); - } - state.SetComplexityN(state.range(0)); -} -BENCHMARK(BM_Complexity_O_N_log_N) - ->RangeMultiplier(2) - ->Range(1 << 10, 1 << 16) - ->Complexity(benchmark::oNLogN); -BENCHMARK(BM_Complexity_O_N_log_N) - ->RangeMultiplier(2) - ->Range(1 << 10, 1 << 16) - ->Complexity([](int n) { return n * log2(n); }); -BENCHMARK(BM_Complexity_O_N_log_N) - ->RangeMultiplier(2) - ->Range(1 << 10, 1 << 16) - ->Complexity(); - -const char *big_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N_BigO"; -const char *rms_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N_RMS"; -const char *enum_auto_big_o_n_lg_n = "NlgN"; -const char *lambda_big_o_n_lg_n = "f\\(N\\)"; - -// Add enum tests -ADD_COMPLEXITY_CASES(big_o_n_lg_n_test_name, rms_o_n_lg_n_test_name, - enum_auto_big_o_n_lg_n); - -// Add lambda tests -ADD_COMPLEXITY_CASES(big_o_n_lg_n_test_name, rms_o_n_lg_n_test_name, - lambda_big_o_n_lg_n); - -// ========================================================================= // -// --------------------------- TEST CASES END ------------------------------ // -// ========================================================================= // - -int main(int argc, char *argv[]) { RunOutputTests(argc, argv); } diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/cxx03_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/cxx03_test.cc deleted file mode 100644 index a79d964e17b..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/cxx03_test.cc +++ /dev/null @@ -1,48 +0,0 @@ -#undef NDEBUG -#include -#include - -#include "benchmark/benchmark.h" - -#if __cplusplus >= 201103L -#error C++11 or greater detected. Should be C++03. -#endif - -void BM_empty(benchmark::State& state) { - while (state.KeepRunning()) { - volatile std::size_t x = state.iterations(); - ((void)x); - } -} -BENCHMARK(BM_empty); - -// The new C++11 interface for args/ranges requires initializer list support. -// Therefore we provide the old interface to support C++03. 
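// (Illustrative sketch, not part of the deleted file.) In the C++11 API, the
// ArgPair/RangePair registration below would instead be written with
// initializer lists:
//
//   BENCHMARK(BM_old_arg_range_interface)->Args({1, 2})->Ranges({{5, 5}, {6, 6}});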
-void BM_old_arg_range_interface(benchmark::State& state) { - assert((state.range(0) == 1 && state.range(1) == 2) || - (state.range(0) == 5 && state.range(1) == 6)); - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_old_arg_range_interface)->ArgPair(1, 2)->RangePair(5, 5, 6, 6); - -template -void BM_template2(benchmark::State& state) { - BM_empty(state); -} -BENCHMARK_TEMPLATE2(BM_template2, int, long); - -template -void BM_template1(benchmark::State& state) { - BM_empty(state); -} -BENCHMARK_TEMPLATE(BM_template1, long); -BENCHMARK_TEMPLATE1(BM_template1, int); - -void BM_counters(benchmark::State& state) { - BM_empty(state); - state.counters["Foo"] = 2; -} -BENCHMARK(BM_counters); - -BENCHMARK_MAIN() diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/diagnostics_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/diagnostics_test.cc deleted file mode 100644 index 7aac8069e59..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/diagnostics_test.cc +++ /dev/null @@ -1,64 +0,0 @@ -// Testing: -// State::PauseTiming() -// State::ResumeTiming() -// Test that CHECK's within these function diagnose when they are called -// outside of the KeepRunning() loop. -// -// NOTE: Users should NOT include or use src/check.h. This is only done in -// order to test library internals. - -#include -#include - -#include "../src/check.h" -#include "benchmark/benchmark.h" - -#if defined(__GNUC__) && !defined(__EXCEPTIONS) -#define TEST_HAS_NO_EXCEPTIONS -#endif - -void TestHandler() { -#ifndef TEST_HAS_NO_EXCEPTIONS - throw std::logic_error(""); -#else - std::abort(); -#endif -} - -void try_invalid_pause_resume(benchmark::State& state) { -#if !defined(TEST_BENCHMARK_LIBRARY_HAS_NO_ASSERTIONS) && !defined(TEST_HAS_NO_EXCEPTIONS) - try { - state.PauseTiming(); - std::abort(); - } catch (std::logic_error const&) { - } - try { - state.ResumeTiming(); - std::abort(); - } catch (std::logic_error const&) { - } -#else - (void)state; // avoid unused warning -#endif -} - -void BM_diagnostic_test(benchmark::State& state) { - static bool called_once = false; - - if (called_once == false) try_invalid_pause_resume(state); - - while (state.KeepRunning()) { - benchmark::DoNotOptimize(state.iterations()); - } - - if (called_once == false) try_invalid_pause_resume(state); - - called_once = true; -} -BENCHMARK(BM_diagnostic_test); - -int main(int argc, char* argv[]) { - benchmark::internal::GetAbortHandler() = &TestHandler; - benchmark::Initialize(&argc, argv); - benchmark::RunSpecifiedBenchmarks(); -} diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/donotoptimize_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/donotoptimize_test.cc deleted file mode 100644 index a705654a269..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/donotoptimize_test.cc +++ /dev/null @@ -1,52 +0,0 @@ -#include "benchmark/benchmark.h" - -#include - -namespace { -#if defined(__GNUC__) -std::uint64_t double_up(const std::uint64_t x) __attribute__((const)); -#endif -std::uint64_t double_up(const std::uint64_t x) { return x * 2; } -} - -// Using DoNotOptimize on types like BitRef seem to cause a lot of problems -// with the inline assembly on both GCC and Clang. 
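// (Illustrative sketch, not part of the deleted file.) The basic contract
// being compile-tested below: DoNotOptimize forces a value to be materialized,
// so the compiler cannot delete the computation that produced it, e.g.
//
//   int acc = 0;
//   for (int i = 0; i < 1000; ++i) acc += i;
//   benchmark::DoNotOptimize(acc);  // keeps `acc` (and the loop) observable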
-struct BitRef { - int index; - unsigned char &byte; - -public: - static BitRef Make() { - static unsigned char arr[2] = {}; - BitRef b(1, arr[0]); - return b; - } -private: - BitRef(int i, unsigned char& b) : index(i), byte(b) {} -}; - -int main(int, char*[]) { - // this test verifies compilation of DoNotOptimize() for some types - - char buffer8[8]; - benchmark::DoNotOptimize(buffer8); - - char buffer20[20]; - benchmark::DoNotOptimize(buffer20); - - char buffer1024[1024]; - benchmark::DoNotOptimize(buffer1024); - benchmark::DoNotOptimize(&buffer1024[0]); - - int x = 123; - benchmark::DoNotOptimize(x); - benchmark::DoNotOptimize(&x); - benchmark::DoNotOptimize(x += 42); - - benchmark::DoNotOptimize(double_up(x)); - - // These tests are to e - benchmark::DoNotOptimize(BitRef::Make()); - BitRef lval = BitRef::Make(); - benchmark::DoNotOptimize(lval); -} diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/filter_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/filter_test.cc deleted file mode 100644 index 3a205295f09..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/filter_test.cc +++ /dev/null @@ -1,104 +0,0 @@ -#include "benchmark/benchmark.h" - -#include -#include -#include -#include - -#include -#include -#include -#include - -namespace { - -class TestReporter : public benchmark::ConsoleReporter { - public: - virtual bool ReportContext(const Context& context) { - return ConsoleReporter::ReportContext(context); - }; - - virtual void ReportRuns(const std::vector& report) { - ++count_; - ConsoleReporter::ReportRuns(report); - }; - - TestReporter() : count_(0) {} - - virtual ~TestReporter() {} - - size_t GetCount() const { return count_; } - - private: - mutable size_t count_; -}; - -} // end namespace - -static void NoPrefix(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(NoPrefix); - -static void BM_Foo(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_Foo); - -static void BM_Bar(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_Bar); - -static void BM_FooBar(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_FooBar); - -static void BM_FooBa(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_FooBa); - -int main(int argc, char **argv) { - bool list_only = false; - for (int i = 0; i < argc; ++i) - list_only |= std::string(argv[i]).find("--benchmark_list_tests") != - std::string::npos; - - benchmark::Initialize(&argc, argv); - - TestReporter test_reporter; - const size_t returned_count = - benchmark::RunSpecifiedBenchmarks(&test_reporter); - - if (argc == 2) { - // Make sure we ran all of the tests - std::stringstream ss(argv[1]); - size_t expected_return; - ss >> expected_return; - - if (returned_count != expected_return) { - std::cerr << "ERROR: Expected " << expected_return - << " tests to match the filter but returned_count = " - << returned_count << std::endl; - return -1; - } - - const size_t expected_reports = list_only ? 
0 : expected_return; - const size_t reports_count = test_reporter.GetCount(); - if (reports_count != expected_reports) { - std::cerr << "ERROR: Expected " << expected_reports - << " tests to be run but reported_count = " << reports_count - << std::endl; - return -1; - } - } - - return 0; -} diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/fixture_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/fixture_test.cc deleted file mode 100644 index bbc2f957902..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/fixture_test.cc +++ /dev/null @@ -1,49 +0,0 @@ - -#include "benchmark/benchmark.h" - -#include -#include - -class MyFixture : public ::benchmark::Fixture { - public: - void SetUp(const ::benchmark::State& state) { - if (state.thread_index == 0) { - assert(data.get() == nullptr); - data.reset(new int(42)); - } - } - - void TearDown(const ::benchmark::State& state) { - if (state.thread_index == 0) { - assert(data.get() != nullptr); - data.reset(); - } - } - - ~MyFixture() { assert(data == nullptr); } - - std::unique_ptr data; -}; - -BENCHMARK_F(MyFixture, Foo)(benchmark::State &st) { - assert(data.get() != nullptr); - assert(*data == 42); - while (st.KeepRunning()) { - } -} - -BENCHMARK_DEFINE_F(MyFixture, Bar)(benchmark::State& st) { - if (st.thread_index == 0) { - assert(data.get() != nullptr); - assert(*data == 42); - } - while (st.KeepRunning()) { - assert(data.get() != nullptr); - assert(*data == 42); - } - st.SetItemsProcessed(st.range(0)); -} -BENCHMARK_REGISTER_F(MyFixture, Bar)->Arg(42); -BENCHMARK_REGISTER_F(MyFixture, Bar)->Arg(42)->ThreadPerCpu(); - -BENCHMARK_MAIN() diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/map_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/map_test.cc deleted file mode 100644 index 83457c9981c..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/map_test.cc +++ /dev/null @@ -1,56 +0,0 @@ -#include "benchmark/benchmark.h" - -#include -#include - -namespace { - -std::map ConstructRandomMap(int size) { - std::map m; - for (int i = 0; i < size; ++i) { - m.insert(std::make_pair(rand() % size, rand() % size)); - } - return m; -} - -} // namespace - -// Basic version. -static void BM_MapLookup(benchmark::State& state) { - const int size = state.range(0); - while (state.KeepRunning()) { - state.PauseTiming(); - std::map m = ConstructRandomMap(size); - state.ResumeTiming(); - for (int i = 0; i < size; ++i) { - benchmark::DoNotOptimize(m.find(rand() % size)); - } - } - state.SetItemsProcessed(state.iterations() * size); -} -BENCHMARK(BM_MapLookup)->Range(1 << 3, 1 << 12); - -// Using fixtures. 
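// (Annotation, not part of the deleted file.) BENCHMARK_F defines and
// registers a fixture benchmark in one step with default settings; the
// BENCHMARK_DEFINE_F / BENCHMARK_REGISTER_F pair used below is the form that
// allows further configuration, e.g. ->Range(...) or ->ThreadPerCpu().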
-class MapFixture : public ::benchmark::Fixture { - public: - void SetUp(const ::benchmark::State& st) { - m = ConstructRandomMap(st.range(0)); - } - - void TearDown(const ::benchmark::State&) { m.clear(); } - - std::map m; -}; - -BENCHMARK_DEFINE_F(MapFixture, Lookup)(benchmark::State& state) { - const int size = state.range(0); - while (state.KeepRunning()) { - for (int i = 0; i < size; ++i) { - benchmark::DoNotOptimize(m.find(rand() % size)); - } - } - state.SetItemsProcessed(state.iterations() * size); -} -BENCHMARK_REGISTER_F(MapFixture, Lookup)->Range(1 << 3, 1 << 12); - -BENCHMARK_MAIN() diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/multiple_ranges_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/multiple_ranges_test.cc deleted file mode 100644 index 8e67b3b2a99..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/multiple_ranges_test.cc +++ /dev/null @@ -1,74 +0,0 @@ -#include "benchmark/benchmark.h" - -#include -#include - -class MultipleRangesFixture : public ::benchmark::Fixture { - public: - MultipleRangesFixture() - : expectedValues({{1, 3, 5}, - {1, 3, 8}, - {1, 3, 15}, - {2, 3, 5}, - {2, 3, 8}, - {2, 3, 15}, - {1, 4, 5}, - {1, 4, 8}, - {1, 4, 15}, - {2, 4, 5}, - {2, 4, 8}, - {2, 4, 15}, - {1, 7, 5}, - {1, 7, 8}, - {1, 7, 15}, - {2, 7, 5}, - {2, 7, 8}, - {2, 7, 15}, - {7, 6, 3}}) {} - - void SetUp(const ::benchmark::State& state) { - std::vector ranges = {state.range(0), state.range(1), state.range(2)}; - - assert(expectedValues.find(ranges) != expectedValues.end()); - - actualValues.insert(ranges); - } - - virtual ~MultipleRangesFixture() { - assert(actualValues.size() == expectedValues.size()); - } - - std::set> expectedValues; - std::set> actualValues; -}; - -BENCHMARK_DEFINE_F(MultipleRangesFixture, Empty)(benchmark::State& state) { - while (state.KeepRunning()) { - int product = state.range(0) * state.range(1) * state.range(2); - for (int x = 0; x < product; x++) { - benchmark::DoNotOptimize(x); - } - } -} - -BENCHMARK_REGISTER_F(MultipleRangesFixture, Empty) - ->RangeMultiplier(2) - ->Ranges({{1, 2}, {3, 7}, {5, 15}}) - ->Args({7, 6, 3}); - -void BM_CheckDefaultArgument(benchmark::State& state) { - // Test that the 'range()' without an argument is the same as 'range(0)'. 
- assert(state.range() == state.range(0)); - assert(state.range() != state.range(1)); - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_CheckDefaultArgument)->Ranges({{1, 5}, {6, 10}}); - -static void BM_MultipleRanges(benchmark::State& st) { - while (st.KeepRunning()) { - } -} -BENCHMARK(BM_MultipleRanges)->Ranges({{5, 5}, {6, 6}}); - -BENCHMARK_MAIN() diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/options_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/options_test.cc deleted file mode 100644 index 8eac068b977..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/options_test.cc +++ /dev/null @@ -1,65 +0,0 @@ -#include "benchmark/benchmark.h" -#include -#include - -#if defined(NDEBUG) -#undef NDEBUG -#endif -#include - -void BM_basic(benchmark::State& state) { - while (state.KeepRunning()) { - } -} - -void BM_basic_slow(benchmark::State& state) { - std::chrono::milliseconds sleep_duration(state.range(0)); - while (state.KeepRunning()) { - std::this_thread::sleep_for( - std::chrono::duration_cast(sleep_duration)); - } -} - -BENCHMARK(BM_basic); -BENCHMARK(BM_basic)->Arg(42); -BENCHMARK(BM_basic_slow)->Arg(10)->Unit(benchmark::kNanosecond); -BENCHMARK(BM_basic_slow)->Arg(100)->Unit(benchmark::kMicrosecond); -BENCHMARK(BM_basic_slow)->Arg(1000)->Unit(benchmark::kMillisecond); -BENCHMARK(BM_basic)->Range(1, 8); -BENCHMARK(BM_basic)->RangeMultiplier(2)->Range(1, 8); -BENCHMARK(BM_basic)->DenseRange(10, 15); -BENCHMARK(BM_basic)->Args({42, 42}); -BENCHMARK(BM_basic)->Ranges({{64, 512}, {64, 512}}); -BENCHMARK(BM_basic)->MinTime(0.7); -BENCHMARK(BM_basic)->UseRealTime(); -BENCHMARK(BM_basic)->ThreadRange(2, 4); -BENCHMARK(BM_basic)->ThreadPerCpu(); -BENCHMARK(BM_basic)->Repetitions(3); - -void CustomArgs(benchmark::internal::Benchmark* b) { - for (int i = 0; i < 10; ++i) { - b->Arg(i); - } -} - -BENCHMARK(BM_basic)->Apply(CustomArgs); - -void BM_explicit_iteration_count(benchmark::State& st) { - // Test that benchmarks specified with an explicit iteration count are - // only run once. - static bool invoked_before = false; - assert(!invoked_before); - invoked_before = true; - - // Test that the requested iteration count is respected. - assert(st.max_iterations == 42); - size_t actual_iterations = 0; - while (st.KeepRunning()) - ++actual_iterations; - assert(st.iterations() == st.max_iterations); - assert(st.iterations() == 42); - -} -BENCHMARK(BM_explicit_iteration_count)->Iterations(42); - -BENCHMARK_MAIN() diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/output_test.h b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/output_test.h deleted file mode 100644 index 897a13866ba..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/output_test.h +++ /dev/null @@ -1,201 +0,0 @@ -#ifndef TEST_OUTPUT_TEST_H -#define TEST_OUTPUT_TEST_H - -#undef NDEBUG -#include -#include -#include -#include -#include -#include -#include - -#include "../src/re.h" -#include "benchmark/benchmark.h" - -#define CONCAT2(x, y) x##y -#define CONCAT(x, y) CONCAT2(x, y) - -#define ADD_CASES(...) int CONCAT(dummy, __LINE__) = ::AddCases(__VA_ARGS__) - -#define SET_SUBSTITUTIONS(...) \ - int CONCAT(dummy, __LINE__) = ::SetSubstitutions(__VA_ARGS__) - -enum MatchRules { - MR_Default, // Skip non-matching lines until a match is found. - MR_Next, // Match must occur on the next line. 
- MR_Not // No line between the current position and the next match matches - // the regex -}; - -struct TestCase { - TestCase(std::string re, int rule = MR_Default); - - std::string regex_str; - int match_rule; - std::string substituted_regex; - std::shared_ptr regex; -}; - -enum TestCaseID { - TC_ConsoleOut, - TC_ConsoleErr, - TC_JSONOut, - TC_JSONErr, - TC_CSVOut, - TC_CSVErr, - - TC_NumID // PRIVATE -}; - -// Add a list of test cases to be run against the output specified by -// 'ID' -int AddCases(TestCaseID ID, std::initializer_list il); - -// Add or set a list of substitutions to be performed on constructed regex's -// See 'output_test_helper.cc' for a list of default substitutions. -int SetSubstitutions( - std::initializer_list> il); - -// Run all output tests. -void RunOutputTests(int argc, char* argv[]); - -// ========================================================================= // -// ------------------------- Results checking ------------------------------ // -// ========================================================================= // - -// Call this macro to register a benchmark for checking its results. This -// should be all that's needed. It subscribes a function to check the (CSV) -// results of a benchmark. This is done only after verifying that the output -// strings are really as expected. -// bm_name_pattern: a name or a regex pattern which will be matched against -// all the benchmark names. Matching benchmarks -// will be the subject of a call to checker_function -// checker_function: should be of type ResultsCheckFn (see below) -#define CHECK_BENCHMARK_RESULTS(bm_name_pattern, checker_function) \ - size_t CONCAT(dummy, __LINE__) = AddChecker(bm_name_pattern, checker_function) - -struct Results; -typedef std::function< void(Results const&) > ResultsCheckFn; - -size_t AddChecker(const char* bm_name_pattern, ResultsCheckFn fn); - -// Class holding the results of a benchmark. -// It is passed in calls to checker functions. -struct Results { - - // the benchmark name - std::string name; - // the benchmark fields - std::map< std::string, std::string > values; - - Results(const std::string& n) : name(n) {} - - int NumThreads() const; - - typedef enum { kCpuTime, kRealTime } BenchmarkTime; - - // get cpu_time or real_time in seconds - double GetTime(BenchmarkTime which) const; - - // get the real_time duration of the benchmark in seconds. - // it is better to use fuzzy float checks for this, as the float - // ASCII formatting is lossy. - double DurationRealTime() const { - return GetAs< double >("iterations") * GetTime(kRealTime); - } - // get the cpu_time duration of the benchmark in seconds - double DurationCPUTime() const { - return GetAs< double >("iterations") * GetTime(kCpuTime); - } - - // get the string for a result by name, or nullptr if the name - // is not found - const std::string* Get(const char* entry_name) const { - auto it = values.find(entry_name); - if(it == values.end()) return nullptr; - return &it->second; - } - - // get a result by name, parsed as a specific type. - // NOTE: for counters, use GetCounterAs instead. - template - T GetAs(const char* entry_name) const; - - // counters are written as doubles, so they have to be read first - // as a double, and only then converted to the asked type. 
- template - T GetCounterAs(const char* entry_name) const { - double dval = GetAs< double >(entry_name); - T tval = static_cast< T >(dval); - return tval; - } -}; - -template -T Results::GetAs(const char* entry_name) const { - auto *sv = Get(entry_name); - CHECK(sv != nullptr && !sv->empty()); - std::stringstream ss; - ss << *sv; - T out; - ss >> out; - CHECK(!ss.fail()); - return out; -} - -//---------------------------------- -// Macros to help in result checking. Do not use them with arguments causing -// side-effects. - -#define _CHECK_RESULT_VALUE(entry, getfn, var_type, var_name, relationship, value) \ - CONCAT(CHECK_, relationship) \ - (entry.getfn< var_type >(var_name), (value)) << "\n" \ - << __FILE__ << ":" << __LINE__ << ": " << (entry).name << ":\n" \ - << __FILE__ << ":" << __LINE__ << ": " \ - << "expected (" << #var_type << ")" << (var_name) \ - << "=" << (entry).getfn< var_type >(var_name) \ - << " to be " #relationship " to " << (value) << "\n" - -// check with tolerance. eps_factor is the tolerance window, which is -// interpreted relative to value (eg, 0.1 means 10% of value). -#define _CHECK_FLOAT_RESULT_VALUE(entry, getfn, var_type, var_name, relationship, value, eps_factor) \ - CONCAT(CHECK_FLOAT_, relationship) \ - (entry.getfn< var_type >(var_name), (value), (eps_factor) * (value)) << "\n" \ - << __FILE__ << ":" << __LINE__ << ": " << (entry).name << ":\n" \ - << __FILE__ << ":" << __LINE__ << ": " \ - << "expected (" << #var_type << ")" << (var_name) \ - << "=" << (entry).getfn< var_type >(var_name) \ - << " to be " #relationship " to " << (value) << "\n" \ - << __FILE__ << ":" << __LINE__ << ": " \ - << "with tolerance of " << (eps_factor) * (value) \ - << " (" << (eps_factor)*100. << "%), " \ - << "but delta was " << ((entry).getfn< var_type >(var_name) - (value)) \ - << " (" << (((entry).getfn< var_type >(var_name) - (value)) \ - / \ - ((value) > 1.e-5 || value < -1.e-5 ? value : 1.e-5)*100.) \ - << "%)" - -#define CHECK_RESULT_VALUE(entry, var_type, var_name, relationship, value) \ - _CHECK_RESULT_VALUE(entry, GetAs, var_type, var_name, relationship, value) - -#define CHECK_COUNTER_VALUE(entry, var_type, var_name, relationship, value) \ - _CHECK_RESULT_VALUE(entry, GetCounterAs, var_type, var_name, relationship, value) - -#define CHECK_FLOAT_RESULT_VALUE(entry, var_name, relationship, value, eps_factor) \ - _CHECK_FLOAT_RESULT_VALUE(entry, GetAs, double, var_name, relationship, value, eps_factor) - -#define CHECK_FLOAT_COUNTER_VALUE(entry, var_name, relationship, value, eps_factor) \ - _CHECK_FLOAT_RESULT_VALUE(entry, GetCounterAs, double, var_name, relationship, value, eps_factor) - -// ========================================================================= // -// --------------------------- Misc Utilities ------------------------------ // -// ========================================================================= // - -namespace { - -const char* const dec_re = "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?"; - -} // end namespace - -#endif // TEST_OUTPUT_TEST_H diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/output_test_helper.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/output_test_helper.cc deleted file mode 100644 index 24746f6d27f..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/output_test_helper.cc +++ /dev/null @@ -1,423 +0,0 @@ -#include -#include -#include -#include -#include - -#include "../src/check.h" // NOTE: check.h is for internal use only! 
-#include "../src/re.h" // NOTE: re.h is for internal use only -#include "output_test.h" -#include "../src/benchmark_api_internal.h" - -// ========================================================================= // -// ------------------------------ Internals -------------------------------- // -// ========================================================================= // -namespace internal { -namespace { - -using TestCaseList = std::vector; - -// Use a vector because the order elements are added matters during iteration. -// std::map/unordered_map don't guarantee that. -// For example: -// SetSubstitutions({{"%HelloWorld", "Hello"}, {"%Hello", "Hi"}}); -// Substitute("%HelloWorld") // Always expands to Hello. -using SubMap = std::vector>; - -TestCaseList& GetTestCaseList(TestCaseID ID) { - // Uses function-local statics to ensure initialization occurs - // before first use. - static TestCaseList lists[TC_NumID]; - return lists[ID]; -} - -SubMap& GetSubstitutions() { - // Don't use 'dec_re' from header because it may not yet be initialized. - static std::string safe_dec_re = "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?"; - static SubMap map = { - {"%float", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?"}, - // human-readable float - {"%hrfloat", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?[kMGTPEZYmunpfazy]?"}, - {"%int", "[ ]*[0-9]+"}, - {" %s ", "[ ]+"}, - {"%time", "[ ]*[0-9]{1,5} ns"}, - {"%console_report", "[ ]*[0-9]{1,5} ns [ ]*[0-9]{1,5} ns [ ]*[0-9]+"}, - {"%console_us_report", "[ ]*[0-9] us [ ]*[0-9] us [ ]*[0-9]+"}, - {"%csv_header", - "name,iterations,real_time,cpu_time,time_unit,bytes_per_second," - "items_per_second,label,error_occurred,error_message"}, - {"%csv_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns,,,,,"}, - {"%csv_us_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",us,,,,,"}, - {"%csv_bytes_report", - "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns," + safe_dec_re + ",,,,"}, - {"%csv_items_report", - "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns,," + safe_dec_re + ",,,"}, - {"%csv_bytes_items_report", - "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns," + safe_dec_re + - "," + safe_dec_re + ",,,"}, - {"%csv_label_report_begin", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns,,,"}, - {"%csv_label_report_end", ",,"}}; - return map; -} - -std::string PerformSubstitutions(std::string source) { - SubMap const& subs = GetSubstitutions(); - using SizeT = std::string::size_type; - for (auto const& KV : subs) { - SizeT pos; - SizeT next_start = 0; - while ((pos = source.find(KV.first, next_start)) != std::string::npos) { - next_start = pos + KV.second.size(); - source.replace(pos, KV.first.size(), KV.second); - } - } - return source; -} - -void CheckCase(std::stringstream& remaining_output, TestCase const& TC, - TestCaseList const& not_checks) { - std::string first_line; - bool on_first = true; - std::string line; - while (remaining_output.eof() == false) { - CHECK(remaining_output.good()); - std::getline(remaining_output, line); - if (on_first) { - first_line = line; - on_first = false; - } - for (const auto& NC : not_checks) { - CHECK(!NC.regex->Match(line)) - << "Unexpected match for line \"" << line << "\" for MR_Not regex \"" - << NC.regex_str << "\"" - << "\n actual regex string \"" << TC.substituted_regex << "\"" - << "\n started matching near: " << first_line; - } - if (TC.regex->Match(line)) return; - CHECK(TC.match_rule != MR_Next) - << "Expected line \"" << line << "\" to match regex \"" << TC.regex_str - << "\"" - << "\n actual regex string \"" << 
TC.substituted_regex << "\"" - << "\n started matching near: " << first_line; - } - CHECK(remaining_output.eof() == false) - << "End of output reached before match for regex \"" << TC.regex_str - << "\" was found" - << "\n actual regex string \"" << TC.substituted_regex << "\"" - << "\n started matching near: " << first_line; -} - -void CheckCases(TestCaseList const& checks, std::stringstream& output) { - std::vector not_checks; - for (size_t i = 0; i < checks.size(); ++i) { - const auto& TC = checks[i]; - if (TC.match_rule == MR_Not) { - not_checks.push_back(TC); - continue; - } - CheckCase(output, TC, not_checks); - not_checks.clear(); - } -} - -class TestReporter : public benchmark::BenchmarkReporter { - public: - TestReporter(std::vector reps) - : reporters_(reps) {} - - virtual bool ReportContext(const Context& context) { - bool last_ret = false; - bool first = true; - for (auto rep : reporters_) { - bool new_ret = rep->ReportContext(context); - CHECK(first || new_ret == last_ret) - << "Reports return different values for ReportContext"; - first = false; - last_ret = new_ret; - } - (void)first; - return last_ret; - } - - void ReportRuns(const std::vector& report) { - for (auto rep : reporters_) rep->ReportRuns(report); - } - void Finalize() { - for (auto rep : reporters_) rep->Finalize(); - } - - private: - std::vector reporters_; -}; -} - -} // end namespace internal - -// ========================================================================= // -// -------------------------- Results checking ----------------------------- // -// ========================================================================= // - -namespace internal { - -// Utility class to manage subscribers for checking benchmark results. -// It works by parsing the CSV output to read the results. 
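// (Illustrative sketch, not part of the deleted file.) A typical subscriber,
// registered through the macro declared in output_test.h; the benchmark name
// and counter are hypothetical:
//
//   void CheckFooRun(Results const& r) {
//     CHECK_COUNTER_VALUE(r, int, "Foo", EQ, 2);
//   }
//   CHECK_BENCHMARK_RESULTS("BM_Counters_Simple", &CheckFooRun);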
-class ResultsChecker { - public: - - struct PatternAndFn : public TestCase { // reusing TestCase for its regexes - PatternAndFn(const std::string& rx, ResultsCheckFn fn_) - : TestCase(rx), fn(fn_) {} - ResultsCheckFn fn; - }; - - std::vector< PatternAndFn > check_patterns; - std::vector< Results > results; - std::vector< std::string > field_names; - - void Add(const std::string& entry_pattern, ResultsCheckFn fn); - - void CheckResults(std::stringstream& output); - - private: - - void SetHeader_(const std::string& csv_header); - void SetValues_(const std::string& entry_csv_line); - - std::vector< std::string > SplitCsv_(const std::string& line); - -}; - -// store the static ResultsChecker in a function to prevent initialization -// order problems -ResultsChecker& GetResultsChecker() { - static ResultsChecker rc; - return rc; -} - -// add a results checker for a benchmark -void ResultsChecker::Add(const std::string& entry_pattern, ResultsCheckFn fn) { - check_patterns.emplace_back(entry_pattern, fn); -} - -// check the results of all subscribed benchmarks -void ResultsChecker::CheckResults(std::stringstream& output) { - // first reset the stream to the start - { - auto start = std::ios::streampos(0); - // clear before calling tellg() - output.clear(); - // seek to zero only when needed - if(output.tellg() > start) output.seekg(start); - // and just in case - output.clear(); - } - // now go over every line and publish it to the ResultsChecker - std::string line; - bool on_first = true; - while (output.eof() == false) { - CHECK(output.good()); - std::getline(output, line); - if (on_first) { - SetHeader_(line); // this is important - on_first = false; - continue; - } - SetValues_(line); - } - // finally we can call the subscribed check functions - for(const auto& p : check_patterns) { - VLOG(2) << "--------------------------------\n"; - VLOG(2) << "checking for benchmarks matching " << p.regex_str << "...\n"; - for(const auto& r : results) { - if(!p.regex->Match(r.name)) { - VLOG(2) << p.regex_str << " is not matched by " << r.name << "\n"; - continue; - } else { - VLOG(2) << p.regex_str << " is matched by " << r.name << "\n"; - } - VLOG(1) << "Checking results of " << r.name << ": ... 
\n"; - p.fn(r); - VLOG(1) << "Checking results of " << r.name << ": OK.\n"; - } - } -} - -// prepare for the names in this header -void ResultsChecker::SetHeader_(const std::string& csv_header) { - field_names = SplitCsv_(csv_header); -} - -// set the values for a benchmark -void ResultsChecker::SetValues_(const std::string& entry_csv_line) { - if(entry_csv_line.empty()) return; // some lines are empty - CHECK(!field_names.empty()); - auto vals = SplitCsv_(entry_csv_line); - CHECK_EQ(vals.size(), field_names.size()); - results.emplace_back(vals[0]); // vals[0] is the benchmark name - auto &entry = results.back(); - for (size_t i = 1, e = vals.size(); i < e; ++i) { - entry.values[field_names[i]] = vals[i]; - } -} - -// a quick'n'dirty csv splitter (eliminating quotes) -std::vector< std::string > ResultsChecker::SplitCsv_(const std::string& line) { - std::vector< std::string > out; - if(line.empty()) return out; - if(!field_names.empty()) out.reserve(field_names.size()); - size_t prev = 0, pos = line.find_first_of(','), curr = pos; - while(pos != line.npos) { - CHECK(curr > 0); - if(line[prev] == '"') ++prev; - if(line[curr-1] == '"') --curr; - out.push_back(line.substr(prev, curr-prev)); - prev = pos + 1; - pos = line.find_first_of(',', pos + 1); - curr = pos; - } - curr = line.size(); - if(line[prev] == '"') ++prev; - if(line[curr-1] == '"') --curr; - out.push_back(line.substr(prev, curr-prev)); - return out; -} - -} // end namespace internal - -size_t AddChecker(const char* bm_name, ResultsCheckFn fn) -{ - auto &rc = internal::GetResultsChecker(); - rc.Add(bm_name, fn); - return rc.results.size(); -} - -int Results::NumThreads() const { - auto pos = name.find("/threads:"); - if(pos == name.npos) return 1; - auto end = name.find('/', pos + 9); - std::stringstream ss; - ss << name.substr(pos + 9, end); - int num = 1; - ss >> num; - CHECK(!ss.fail()); - return num; -} - -double Results::GetTime(BenchmarkTime which) const { - CHECK(which == kCpuTime || which == kRealTime); - const char *which_str = which == kCpuTime ? 
"cpu_time" : "real_time"; - double val = GetAs< double >(which_str); - auto unit = Get("time_unit"); - CHECK(unit); - if(*unit == "ns") { - return val * 1.e-9; - } else if(*unit == "us") { - return val * 1.e-6; - } else if(*unit == "ms") { - return val * 1.e-3; - } else if(*unit == "s") { - return val; - } else { - CHECK(1 == 0) << "unknown time unit: " << *unit; - return 0; - } -} - -// ========================================================================= // -// -------------------------- Public API Definitions------------------------ // -// ========================================================================= // - -TestCase::TestCase(std::string re, int rule) - : regex_str(std::move(re)), - match_rule(rule), - substituted_regex(internal::PerformSubstitutions(regex_str)), - regex(std::make_shared()) { - std::string err_str; - regex->Init(substituted_regex,& err_str); - CHECK(err_str.empty()) << "Could not construct regex \"" << substituted_regex - << "\"" - << "\n originally \"" << regex_str << "\"" - << "\n got error: " << err_str; -} - -int AddCases(TestCaseID ID, std::initializer_list il) { - auto& L = internal::GetTestCaseList(ID); - L.insert(L.end(), il); - return 0; -} - -int SetSubstitutions( - std::initializer_list> il) { - auto& subs = internal::GetSubstitutions(); - for (auto KV : il) { - bool exists = false; - KV.second = internal::PerformSubstitutions(KV.second); - for (auto& EKV : subs) { - if (EKV.first == KV.first) { - EKV.second = std::move(KV.second); - exists = true; - break; - } - } - if (!exists) subs.push_back(std::move(KV)); - } - return 0; -} - -void RunOutputTests(int argc, char* argv[]) { - using internal::GetTestCaseList; - benchmark::Initialize(&argc, argv); - auto options = benchmark::internal::GetOutputOptions(/*force_no_color*/true); - benchmark::ConsoleReporter CR(options); - benchmark::JSONReporter JR; - benchmark::CSVReporter CSVR; - struct ReporterTest { - const char* name; - std::vector& output_cases; - std::vector& error_cases; - benchmark::BenchmarkReporter& reporter; - std::stringstream out_stream; - std::stringstream err_stream; - - ReporterTest(const char* n, std::vector& out_tc, - std::vector& err_tc, - benchmark::BenchmarkReporter& br) - : name(n), output_cases(out_tc), error_cases(err_tc), reporter(br) { - reporter.SetOutputStream(&out_stream); - reporter.SetErrorStream(&err_stream); - } - } TestCases[] = { - {"ConsoleReporter", GetTestCaseList(TC_ConsoleOut), - GetTestCaseList(TC_ConsoleErr), CR}, - {"JSONReporter", GetTestCaseList(TC_JSONOut), GetTestCaseList(TC_JSONErr), - JR}, - {"CSVReporter", GetTestCaseList(TC_CSVOut), GetTestCaseList(TC_CSVErr), - CSVR}, - }; - - // Create the test reporter and run the benchmarks. - std::cout << "Running benchmarks...\n"; - internal::TestReporter test_rep({&CR, &JR, &CSVR}); - benchmark::RunSpecifiedBenchmarks(&test_rep); - - for (auto& rep_test : TestCases) { - std::string msg = std::string("\nTesting ") + rep_test.name + " Output\n"; - std::string banner(msg.size() - 1, '-'); - std::cout << banner << msg << banner << "\n"; - - std::cerr << rep_test.err_stream.str(); - std::cout << rep_test.out_stream.str(); - - internal::CheckCases(rep_test.error_cases, rep_test.err_stream); - internal::CheckCases(rep_test.output_cases, rep_test.out_stream); - - std::cout << "\n"; - } - - // now that we know the output is as expected, we can dispatch - // the checks to subscribees. 
- auto &csv = TestCases[2]; - // would use == but gcc spits a warning - CHECK(std::strcmp(csv.name, "CSVReporter") == 0); - internal::GetResultsChecker().CheckResults(csv.out_stream); -} diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/register_benchmark_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/register_benchmark_test.cc deleted file mode 100644 index 2769b7a6b63..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/register_benchmark_test.cc +++ /dev/null @@ -1,182 +0,0 @@ - -#undef NDEBUG -#include -#include - -#include "../src/check.h" // NOTE: check.h is for internal use only! -#include "benchmark/benchmark.h" - -namespace { - -class TestReporter : public benchmark::ConsoleReporter { - public: - virtual void ReportRuns(const std::vector& report) { - all_runs_.insert(all_runs_.end(), begin(report), end(report)); - ConsoleReporter::ReportRuns(report); - } - - std::vector all_runs_; -}; - -struct TestCase { - std::string name; - const char* label; - // Note: not explicit as we rely on it being converted through ADD_CASES. - TestCase(const char* xname) : TestCase(xname, nullptr) {} - TestCase(const char* xname, const char* xlabel) - : name(xname), label(xlabel) {} - - typedef benchmark::BenchmarkReporter::Run Run; - - void CheckRun(Run const& run) const { - CHECK(name == run.benchmark_name) << "expected " << name << " got " - << run.benchmark_name; - if (label) { - CHECK(run.report_label == label) << "expected " << label << " got " - << run.report_label; - } else { - CHECK(run.report_label == ""); - } - } -}; - -std::vector ExpectedResults; - -int AddCases(std::initializer_list const& v) { - for (auto N : v) { - ExpectedResults.push_back(N); - } - return 0; -} - -#define CONCAT(x, y) CONCAT2(x, y) -#define CONCAT2(x, y) x##y -#define ADD_CASES(...) int CONCAT(dummy, __LINE__) = AddCases({__VA_ARGS__}) - -} // end namespace - -typedef benchmark::internal::Benchmark* ReturnVal; - -//----------------------------------------------------------------------------// -// Test RegisterBenchmark with no additional arguments -//----------------------------------------------------------------------------// -void BM_function(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_function); -ReturnVal dummy = benchmark::RegisterBenchmark( - "BM_function_manual_registration", BM_function); -ADD_CASES({"BM_function"}, {"BM_function_manual_registration"}); - -//----------------------------------------------------------------------------// -// Test RegisterBenchmark with additional arguments -// Note: GCC <= 4.8 do not support this form of RegisterBenchmark because they -// reject the variadic pack expansion of lambda captures. 
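// (Illustrative sketch, not part of the deleted file.) The variadic overload
// forwards trailing arguments to the benchmark function at run time, e.g.:
//
//   benchmark::RegisterBenchmark("test4", &BM_extra_args, "Four");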
-//----------------------------------------------------------------------------// -#ifndef BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK - -void BM_extra_args(benchmark::State& st, const char* label) { - while (st.KeepRunning()) { - } - st.SetLabel(label); -} -int RegisterFromFunction() { - std::pair cases[] = { - {"test1", "One"}, {"test2", "Two"}, {"test3", "Three"}}; - for (auto const& c : cases) - benchmark::RegisterBenchmark(c.first, &BM_extra_args, c.second); - return 0; -} -int dummy2 = RegisterFromFunction(); -ADD_CASES({"test1", "One"}, {"test2", "Two"}, {"test3", "Three"}); - -#endif // BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK - -//----------------------------------------------------------------------------// -// Test RegisterBenchmark with different callable types -//----------------------------------------------------------------------------// - -struct CustomFixture { - void operator()(benchmark::State& st) { - while (st.KeepRunning()) { - } - } -}; - -void TestRegistrationAtRuntime() { -#ifdef BENCHMARK_HAS_CXX11 - { - CustomFixture fx; - benchmark::RegisterBenchmark("custom_fixture", fx); - AddCases({"custom_fixture"}); - } -#endif -#ifndef BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK - { - const char* x = "42"; - auto capturing_lam = [=](benchmark::State& st) { - while (st.KeepRunning()) { - } - st.SetLabel(x); - }; - benchmark::RegisterBenchmark("lambda_benchmark", capturing_lam); - AddCases({{"lambda_benchmark", x}}); - } -#endif -} - -// Test that all benchmarks, registered at either during static init or runtime, -// are run and the results are passed to the reported. -void RunTestOne() { - TestRegistrationAtRuntime(); - - TestReporter test_reporter; - benchmark::RunSpecifiedBenchmarks(&test_reporter); - - typedef benchmark::BenchmarkReporter::Run Run; - auto EB = ExpectedResults.begin(); - - for (Run const& run : test_reporter.all_runs_) { - assert(EB != ExpectedResults.end()); - EB->CheckRun(run); - ++EB; - } - assert(EB == ExpectedResults.end()); -} - -// Test that ClearRegisteredBenchmarks() clears all previously registered -// benchmarks. -// Also test that new benchmarks can be registered and ran afterwards. 
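// (Illustrative sketch, not part of the deleted file.) The behavior exercised
// below, in miniature:
//
//   benchmark::ClearRegisteredBenchmarks();
//   assert(benchmark::RunSpecifiedBenchmarks() == 0);  // nothing left to run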
-void RunTestTwo() { - assert(ExpectedResults.size() != 0 && - "must have at least one registered benchmark"); - ExpectedResults.clear(); - benchmark::ClearRegisteredBenchmarks(); - - TestReporter test_reporter; - size_t num_ran = benchmark::RunSpecifiedBenchmarks(&test_reporter); - assert(num_ran == 0); - assert(test_reporter.all_runs_.begin() == test_reporter.all_runs_.end()); - - TestRegistrationAtRuntime(); - num_ran = benchmark::RunSpecifiedBenchmarks(&test_reporter); - assert(num_ran == ExpectedResults.size()); - - typedef benchmark::BenchmarkReporter::Run Run; - auto EB = ExpectedResults.begin(); - - for (Run const& run : test_reporter.all_runs_) { - assert(EB != ExpectedResults.end()); - EB->CheckRun(run); - ++EB; - } - assert(EB == ExpectedResults.end()); -} - -int main(int argc, char* argv[]) { - benchmark::Initialize(&argc, argv); - - RunTestOne(); - RunTestTwo(); -} diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/reporter_output_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/reporter_output_test.cc deleted file mode 100644 index 4a481433485..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/reporter_output_test.cc +++ /dev/null @@ -1,256 +0,0 @@ - -#undef NDEBUG -#include <utility> - -#include "benchmark/benchmark.h" -#include "output_test.h" - -// ========================================================================= // -// ---------------------- Testing Prologue Output -------------------------- // -// ========================================================================= // - -ADD_CASES(TC_ConsoleOut, - {{"^[-]+$", MR_Next}, - {"^Benchmark %s Time %s CPU %s Iterations$", MR_Next}, - {"^[-]+$", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"%csv_header"}}); - -// ========================================================================= // -// ------------------------ Testing Basic Output --------------------------- // -// ========================================================================= // - -void BM_basic(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_basic); - -ADD_CASES(TC_ConsoleOut, {{"^BM_basic %console_report$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_basic\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\"$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_basic\",%csv_report$"}}); - -// ========================================================================= // -// ------------------------ Testing Bytes per Second Output ---------------- // -// ========================================================================= // - -void BM_bytes_per_second(benchmark::State& state) { - while (state.KeepRunning()) { - } - state.SetBytesProcessed(1); -} -BENCHMARK(BM_bytes_per_second); - -ADD_CASES(TC_ConsoleOut, - {{"^BM_bytes_per_second %console_report +%floatB/s$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_bytes_per_second\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"bytes_per_second\": %int$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_bytes_per_second\",%csv_bytes_report$"}}); - -// ========================================================================= // -// ------------------------ Testing Items per Second Output ---------------- // -// 
========================================================================= // - -void BM_items_per_second(benchmark::State& state) { - while (state.KeepRunning()) { - } - state.SetItemsProcessed(1); -} -BENCHMARK(BM_items_per_second); - -ADD_CASES(TC_ConsoleOut, - {{"^BM_items_per_second %console_report +%float items/s$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_items_per_second\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"items_per_second\": %int$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_items_per_second\",%csv_items_report$"}}); - -// ========================================================================= // -// ------------------------ Testing Label Output --------------------------- // -// ========================================================================= // - -void BM_label(benchmark::State& state) { - while (state.KeepRunning()) { - } - state.SetLabel("some label"); -} -BENCHMARK(BM_label); - -ADD_CASES(TC_ConsoleOut, {{"^BM_label %console_report some label$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_label\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"label\": \"some label\"$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_label\",%csv_label_report_begin\"some " - "label\"%csv_label_report_end$"}}); - -// ========================================================================= // -// ------------------------ Testing Error Output --------------------------- // -// ========================================================================= // - -void BM_error(benchmark::State& state) { - state.SkipWithError("message"); - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_error); -ADD_CASES(TC_ConsoleOut, {{"^BM_error[ ]+ERROR OCCURRED: 'message'$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_error\",$"}, - {"\"error_occurred\": true,$", MR_Next}, - {"\"error_message\": \"message\",$", MR_Next}}); - -ADD_CASES(TC_CSVOut, {{"^\"BM_error\",,,,,,,,true,\"message\"$"}}); - -// ========================================================================= // -// ------------------------ Testing No Arg Name Output ----------------------- -// // -// ========================================================================= // - -void BM_no_arg_name(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_no_arg_name)->Arg(3); -ADD_CASES(TC_ConsoleOut, {{"^BM_no_arg_name/3 %console_report$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_no_arg_name/3\",$"}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_no_arg_name/3\",%csv_report$"}}); - -// ========================================================================= // -// ------------------------ Testing Arg Name Output ----------------------- // -// ========================================================================= // - -void BM_arg_name(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_arg_name)->ArgName("first")->Arg(3); -ADD_CASES(TC_ConsoleOut, {{"^BM_arg_name/first:3 %console_report$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_arg_name/first:3\",$"}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_arg_name/first:3\",%csv_report$"}}); - -// ========================================================================= // -// ------------------------ Testing Arg Names Output ----------------------- // -// 
========================================================================= // - -void BM_arg_names(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_arg_names)->Args({2, 5, 4})->ArgNames({"first", "", "third"}); -ADD_CASES(TC_ConsoleOut, - {{"^BM_arg_names/first:2/5/third:4 %console_report$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_arg_names/first:2/5/third:4\",$"}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_arg_names/first:2/5/third:4\",%csv_report$"}}); - -// ========================================================================= // -// ----------------------- Testing Complexity Output ----------------------- // -// ========================================================================= // - -void BM_Complexity_O1(benchmark::State& state) { - while (state.KeepRunning()) { - } - state.SetComplexityN(state.range(0)); -} -BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity(benchmark::o1); -SET_SUBSTITUTIONS({{"%bigOStr", "[ ]* %float \\([0-9]+\\)"}, - {"%RMS", "[ ]*[0-9]+ %"}}); -ADD_CASES(TC_ConsoleOut, {{"^BM_Complexity_O1_BigO %bigOStr %bigOStr[ ]*$"}, - {"^BM_Complexity_O1_RMS %RMS %RMS[ ]*$"}}); - -// ========================================================================= // -// ----------------------- Testing Aggregate Output ------------------------ // -// ========================================================================= // - -// Test that non-aggregate data is printed by default -void BM_Repeat(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_Repeat)->Repetitions(3); -ADD_CASES(TC_ConsoleOut, {{"^BM_Repeat/repeats:3 %console_report$"}, - {"^BM_Repeat/repeats:3 %console_report$"}, - {"^BM_Repeat/repeats:3 %console_report$"}, - {"^BM_Repeat/repeats:3_mean %console_report$"}, - {"^BM_Repeat/repeats:3_stddev %console_report$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:3\",$"}, - {"\"name\": \"BM_Repeat/repeats:3\",$"}, - {"\"name\": \"BM_Repeat/repeats:3\",$"}, - {"\"name\": \"BM_Repeat/repeats:3_mean\",$"}, - {"\"name\": \"BM_Repeat/repeats:3_stddev\",$"}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:3\",%csv_report$"}, - {"^\"BM_Repeat/repeats:3\",%csv_report$"}, - {"^\"BM_Repeat/repeats:3\",%csv_report$"}, - {"^\"BM_Repeat/repeats:3_mean\",%csv_report$"}, - {"^\"BM_Repeat/repeats:3_stddev\",%csv_report$"}}); - -// Test that a non-repeated test still prints non-aggregate results even when -// only-aggregate reports have been requested -void BM_RepeatOnce(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_RepeatOnce)->Repetitions(1)->ReportAggregatesOnly(); -ADD_CASES(TC_ConsoleOut, {{"^BM_RepeatOnce/repeats:1 %console_report$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_RepeatOnce/repeats:1\",$"}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_RepeatOnce/repeats:1\",%csv_report$"}}); - -// Test that non-aggregate data is not reported -void BM_SummaryRepeat(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_SummaryRepeat)->Repetitions(3)->ReportAggregatesOnly(); -ADD_CASES(TC_ConsoleOut, - {{".*BM_SummaryRepeat/repeats:3 ", MR_Not}, - {"^BM_SummaryRepeat/repeats:3_mean %console_report$"}, - {"^BM_SummaryRepeat/repeats:3_stddev %console_report$"}}); -ADD_CASES(TC_JSONOut, {{".*BM_SummaryRepeat/repeats:3 ", MR_Not}, - {"\"name\": \"BM_SummaryRepeat/repeats:3_mean\",$"}, - {"\"name\": \"BM_SummaryRepeat/repeats:3_stddev\",$"}}); -ADD_CASES(TC_CSVOut, {{".*BM_SummaryRepeat/repeats:3 ", MR_Not}, - {"^\"BM_SummaryRepeat/repeats:3_mean\",%csv_report$"}, - 
{"^\"BM_SummaryRepeat/repeats:3_stddev\",%csv_report$"}}); - -void BM_RepeatTimeUnit(benchmark::State& state) { - while (state.KeepRunning()) { - } -} -BENCHMARK(BM_RepeatTimeUnit) - ->Repetitions(3) - ->ReportAggregatesOnly() - ->Unit(benchmark::kMicrosecond); -ADD_CASES(TC_ConsoleOut, - {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not}, - {"^BM_RepeatTimeUnit/repeats:3_mean %console_us_report$"}, - {"^BM_RepeatTimeUnit/repeats:3_stddev %console_us_report$"}}); -ADD_CASES(TC_JSONOut, {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not}, - {"\"name\": \"BM_RepeatTimeUnit/repeats:3_mean\",$"}, - {"\"time_unit\": \"us\",?$"}, - {"\"name\": \"BM_RepeatTimeUnit/repeats:3_stddev\",$"}, - {"\"time_unit\": \"us\",?$"}}); -ADD_CASES(TC_CSVOut, - {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not}, - {"^\"BM_RepeatTimeUnit/repeats:3_mean\",%csv_us_report$"}, - {"^\"BM_RepeatTimeUnit/repeats:3_stddev\",%csv_us_report$"}}); - -// ========================================================================= // -// --------------------------- TEST CASES END ------------------------------ // -// ========================================================================= // - -int main(int argc, char* argv[]) { RunOutputTests(argc, argv); } diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/skip_with_error_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/skip_with_error_test.cc deleted file mode 100644 index b74d33c5899..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/skip_with_error_test.cc +++ /dev/null @@ -1,150 +0,0 @@ - -#undef NDEBUG -#include -#include - -#include "../src/check.h" // NOTE: check.h is for internal use only! -#include "benchmark/benchmark.h" - -namespace { - -class TestReporter : public benchmark::ConsoleReporter { - public: - virtual bool ReportContext(const Context& context) { - return ConsoleReporter::ReportContext(context); - }; - - virtual void ReportRuns(const std::vector& report) { - all_runs_.insert(all_runs_.end(), begin(report), end(report)); - ConsoleReporter::ReportRuns(report); - } - - TestReporter() {} - virtual ~TestReporter() {} - - mutable std::vector all_runs_; -}; - -struct TestCase { - std::string name; - bool error_occurred; - std::string error_message; - - typedef benchmark::BenchmarkReporter::Run Run; - - void CheckRun(Run const& run) const { - CHECK(name == run.benchmark_name) << "expected " << name << " got " - << run.benchmark_name; - CHECK(error_occurred == run.error_occurred); - CHECK(error_message == run.error_message); - if (error_occurred) { - // CHECK(run.iterations == 0); - } else { - CHECK(run.iterations != 0); - } - } -}; - -std::vector ExpectedResults; - -int AddCases(const char* base_name, std::initializer_list const& v) { - for (auto TC : v) { - TC.name = base_name + TC.name; - ExpectedResults.push_back(std::move(TC)); - } - return 0; -} - -#define CONCAT(x, y) CONCAT2(x, y) -#define CONCAT2(x, y) x##y -#define ADD_CASES(...) 
int CONCAT(dummy, __LINE__) = AddCases(__VA_ARGS__) - -} // end namespace - -void BM_error_before_running(benchmark::State& state) { - state.SkipWithError("error message"); - while (state.KeepRunning()) { - assert(false); - } -} -BENCHMARK(BM_error_before_running); -ADD_CASES("BM_error_before_running", {{"", true, "error message"}}); - -void BM_error_during_running(benchmark::State& state) { - int first_iter = true; - while (state.KeepRunning()) { - if (state.range(0) == 1 && state.thread_index <= (state.threads / 2)) { - assert(first_iter); - first_iter = false; - state.SkipWithError("error message"); - } else { - state.PauseTiming(); - state.ResumeTiming(); - } - } -} -BENCHMARK(BM_error_during_running)->Arg(1)->Arg(2)->ThreadRange(1, 8); -ADD_CASES("BM_error_during_running", {{"/1/threads:1", true, "error message"}, - {"/1/threads:2", true, "error message"}, - {"/1/threads:4", true, "error message"}, - {"/1/threads:8", true, "error message"}, - {"/2/threads:1", false, ""}, - {"/2/threads:2", false, ""}, - {"/2/threads:4", false, ""}, - {"/2/threads:8", false, ""}}); - -void BM_error_after_running(benchmark::State& state) { - while (state.KeepRunning()) { - benchmark::DoNotOptimize(state.iterations()); - } - if (state.thread_index <= (state.threads / 2)) - state.SkipWithError("error message"); -} -BENCHMARK(BM_error_after_running)->ThreadRange(1, 8); -ADD_CASES("BM_error_after_running", {{"/threads:1", true, "error message"}, - {"/threads:2", true, "error message"}, - {"/threads:4", true, "error message"}, - {"/threads:8", true, "error message"}}); - -void BM_error_while_paused(benchmark::State& state) { - bool first_iter = true; - while (state.KeepRunning()) { - if (state.range(0) == 1 && state.thread_index <= (state.threads / 2)) { - assert(first_iter); - first_iter = false; - state.PauseTiming(); - state.SkipWithError("error message"); - } else { - state.PauseTiming(); - state.ResumeTiming(); - } - } -} -BENCHMARK(BM_error_while_paused)->Arg(1)->Arg(2)->ThreadRange(1, 8); -ADD_CASES("BM_error_while_paused", {{"/1/threads:1", true, "error message"}, - {"/1/threads:2", true, "error message"}, - {"/1/threads:4", true, "error message"}, - {"/1/threads:8", true, "error message"}, - {"/2/threads:1", false, ""}, - {"/2/threads:2", false, ""}, - {"/2/threads:4", false, ""}, - {"/2/threads:8", false, ""}}); - -int main(int argc, char* argv[]) { - benchmark::Initialize(&argc, argv); - - TestReporter test_reporter; - benchmark::RunSpecifiedBenchmarks(&test_reporter); - - typedef benchmark::BenchmarkReporter::Run Run; - auto EB = ExpectedResults.begin(); - - for (Run const& run : test_reporter.all_runs_) { - assert(EB != ExpectedResults.end()); - EB->CheckRun(run); - ++EB; - } - assert(EB == ExpectedResults.end()); - - return 0; -} diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/user_counters_tabular_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/user_counters_tabular_test.cc deleted file mode 100644 index 5fc5b4d9b88..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/user_counters_tabular_test.cc +++ /dev/null @@ -1,250 +0,0 @@ - -#undef NDEBUG - -#include "benchmark/benchmark.h" -#include "output_test.h" - -// @todo: this checks the full output at once; the rule for -// CounterSet1 was failing because it was not matching "^[-]+$". -// @todo: check that the counters are vertically aligned. 
-ADD_CASES(TC_ConsoleOut, { -// keeping these lines long improves readability, so: -// clang-format off - {"^[-]+$", MR_Next}, - {"^Benchmark %s Time %s CPU %s Iterations %s Bar %s Bat %s Baz %s Foo %s Frob %s Lob$", MR_Next}, - {"^[-]+$", MR_Next}, - {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next}, - {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next}, - {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next}, - {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next}, - {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next}, - {"^[-]+$", MR_Next}, - {"^Benchmark %s Time %s CPU %s Iterations %s Bar %s Baz %s Foo$", MR_Next}, - {"^[-]+$", MR_Next}, - {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^[-]+$", MR_Next}, - {"^Benchmark %s Time %s CPU %s Iterations %s Bat %s Baz %s Foo$", MR_Next}, - {"^[-]+$", MR_Next}, - {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, - {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, 
- {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$"}, -// clang-format on -}); -ADD_CASES(TC_CSVOut, {{"%csv_header," - "\"Bar\",\"Bat\",\"Baz\",\"Foo\",\"Frob\",\"Lob\""}}); - -// ========================================================================= // -// ------------------------- Tabular Counters Output ----------------------- // -// ========================================================================= // - -void BM_Counters_Tabular(benchmark::State& state) { - while (state.KeepRunning()) { - } - namespace bm = benchmark; - state.counters.insert({ - {"Foo", { 1, bm::Counter::kAvgThreads}}, - {"Bar", { 2, bm::Counter::kAvgThreads}}, - {"Baz", { 4, bm::Counter::kAvgThreads}}, - {"Bat", { 8, bm::Counter::kAvgThreads}}, - {"Frob", {16, bm::Counter::kAvgThreads}}, - {"Lob", {32, bm::Counter::kAvgThreads}}, - }); -} -BENCHMARK(BM_Counters_Tabular)->ThreadRange(1, 16); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Tabular/threads:%int\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"Bar\": %float,$", MR_Next}, - {"\"Bat\": %float,$", MR_Next}, - {"\"Baz\": %float,$", MR_Next}, - {"\"Foo\": %float,$", MR_Next}, - {"\"Frob\": %float,$", MR_Next}, - {"\"Lob\": %float$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Tabular/threads:%int\",%csv_report," - "%float,%float,%float,%float,%float,%float$"}}); -// VS2013 does not allow this function to be passed as a lambda argument -// to CHECK_BENCHMARK_RESULTS() -void CheckTabular(Results const& e) { - CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 1); - CHECK_COUNTER_VALUE(e, int, "Bar", EQ, 2); - CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 4); - CHECK_COUNTER_VALUE(e, int, "Bat", EQ, 8); - CHECK_COUNTER_VALUE(e, int, "Frob", EQ, 16); - CHECK_COUNTER_VALUE(e, int, "Lob", EQ, 32); -} -CHECK_BENCHMARK_RESULTS("BM_Counters_Tabular/threads:%int", &CheckTabular); - -// ========================================================================= // -// -------------------- Tabular+Rate Counters Output ----------------------- // -// ========================================================================= // - -void BM_CounterRates_Tabular(benchmark::State& state) { - while (state.KeepRunning()) { - } - namespace bm = benchmark; - state.counters.insert({ - {"Foo", { 1, bm::Counter::kAvgThreadsRate}}, - {"Bar", { 2, bm::Counter::kAvgThreadsRate}}, - {"Baz", { 4, bm::Counter::kAvgThreadsRate}}, - {"Bat", { 8, bm::Counter::kAvgThreadsRate}}, - {"Frob", {16, bm::Counter::kAvgThreadsRate}}, - {"Lob", {32, bm::Counter::kAvgThreadsRate}}, - }); -} -BENCHMARK(BM_CounterRates_Tabular)->ThreadRange(1, 16); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_CounterRates_Tabular/threads:%int\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"Bar\": %float,$", MR_Next}, - {"\"Bat\": %float,$", MR_Next}, - {"\"Baz\": %float,$", MR_Next}, - {"\"Foo\": %float,$", MR_Next}, - {"\"Frob\": %float,$", MR_Next}, - {"\"Lob\": %float$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_CounterRates_Tabular/threads:%int\",%csv_report," - "%float,%float,%float,%float,%float,%float$"}}); -// VS2013 does not allow this function to be passed as a lambda argument -// to CHECK_BENCHMARK_RESULTS() -void CheckTabularRate(Results const& e) { - double t = e.DurationCPUTime(); - 
CHECK_FLOAT_COUNTER_VALUE(e, "Foo", EQ, 1./t, 0.001); - CHECK_FLOAT_COUNTER_VALUE(e, "Bar", EQ, 2./t, 0.001); - CHECK_FLOAT_COUNTER_VALUE(e, "Baz", EQ, 4./t, 0.001); - CHECK_FLOAT_COUNTER_VALUE(e, "Bat", EQ, 8./t, 0.001); - CHECK_FLOAT_COUNTER_VALUE(e, "Frob", EQ, 16./t, 0.001); - CHECK_FLOAT_COUNTER_VALUE(e, "Lob", EQ, 32./t, 0.001); -} -CHECK_BENCHMARK_RESULTS("BM_CounterRates_Tabular/threads:%int", - &CheckTabularRate); - -// ========================================================================= // -// ------------------------- Tabular Counters Output ----------------------- // -// ========================================================================= // - -// set only some of the counters -void BM_CounterSet0_Tabular(benchmark::State& state) { - while (state.KeepRunning()) { - } - namespace bm = benchmark; - state.counters.insert({ - {"Foo", {10, bm::Counter::kAvgThreads}}, - {"Bar", {20, bm::Counter::kAvgThreads}}, - {"Baz", {40, bm::Counter::kAvgThreads}}, - }); -} -BENCHMARK(BM_CounterSet0_Tabular)->ThreadRange(1, 16); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_CounterSet0_Tabular/threads:%int\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"Bar\": %float,$", MR_Next}, - {"\"Baz\": %float,$", MR_Next}, - {"\"Foo\": %float$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet0_Tabular/threads:%int\",%csv_report," - "%float,,%float,%float,,"}}); -// VS2013 does not allow this function to be passed as a lambda argument -// to CHECK_BENCHMARK_RESULTS() -void CheckSet0(Results const& e) { - CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 10); - CHECK_COUNTER_VALUE(e, int, "Bar", EQ, 20); - CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 40); -} -CHECK_BENCHMARK_RESULTS("BM_CounterSet0_Tabular", &CheckSet0); - -// again. -void BM_CounterSet1_Tabular(benchmark::State& state) { - while (state.KeepRunning()) { - } - namespace bm = benchmark; - state.counters.insert({ - {"Foo", {15, bm::Counter::kAvgThreads}}, - {"Bar", {25, bm::Counter::kAvgThreads}}, - {"Baz", {45, bm::Counter::kAvgThreads}}, - }); -} -BENCHMARK(BM_CounterSet1_Tabular)->ThreadRange(1, 16); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_CounterSet1_Tabular/threads:%int\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"Bar\": %float,$", MR_Next}, - {"\"Baz\": %float,$", MR_Next}, - {"\"Foo\": %float$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet1_Tabular/threads:%int\",%csv_report," - "%float,,%float,%float,,"}}); -// VS2013 does not allow this function to be passed as a lambda argument -// to CHECK_BENCHMARK_RESULTS() -void CheckSet1(Results const& e) { - CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 15); - CHECK_COUNTER_VALUE(e, int, "Bar", EQ, 25); - CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 45); -} -CHECK_BENCHMARK_RESULTS("BM_CounterSet1_Tabular/threads:%int", &CheckSet1); - -// ========================================================================= // -// ------------------------- Tabular Counters Output ----------------------- // -// ========================================================================= // - -// set only some of the counters, different set now. 
-void BM_CounterSet2_Tabular(benchmark::State& state) { - while (state.KeepRunning()) { - } - namespace bm = benchmark; - state.counters.insert({ - {"Foo", {10, bm::Counter::kAvgThreads}}, - {"Bat", {30, bm::Counter::kAvgThreads}}, - {"Baz", {40, bm::Counter::kAvgThreads}}, - }); -} -BENCHMARK(BM_CounterSet2_Tabular)->ThreadRange(1, 16); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_CounterSet2_Tabular/threads:%int\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"Bat\": %float,$", MR_Next}, - {"\"Baz\": %float,$", MR_Next}, - {"\"Foo\": %float$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet2_Tabular/threads:%int\",%csv_report," - ",%float,%float,%float,,"}}); -// VS2013 does not allow this function to be passed as a lambda argument -// to CHECK_BENCHMARK_RESULTS() -void CheckSet2(Results const& e) { - CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 10); - CHECK_COUNTER_VALUE(e, int, "Bat", EQ, 30); - CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 40); -} -CHECK_BENCHMARK_RESULTS("BM_CounterSet2_Tabular", &CheckSet2); - -// ========================================================================= // -// --------------------------- TEST CASES END ------------------------------ // -// ========================================================================= // - -int main(int argc, char* argv[]) { RunOutputTests(argc, argv); } diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/user_counters_test.cc b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/user_counters_test.cc deleted file mode 100644 index 66df48b31f8..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/test/user_counters_test.cc +++ /dev/null @@ -1,217 +0,0 @@ - -#undef NDEBUG - -#include "benchmark/benchmark.h" -#include "output_test.h" - -// ========================================================================= // -// ---------------------- Testing Prologue Output -------------------------- // -// ========================================================================= // - -ADD_CASES(TC_ConsoleOut, - {{"^[-]+$", MR_Next}, - {"^Benchmark %s Time %s CPU %s Iterations UserCounters...$", MR_Next}, - {"^[-]+$", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"%csv_header,\"bar\",\"foo\""}}); - -// ========================================================================= // -// ------------------------- Simple Counters Output ------------------------ // -// ========================================================================= // - -void BM_Counters_Simple(benchmark::State& state) { - while (state.KeepRunning()) { - } - state.counters["foo"] = 1; - state.counters["bar"] = 2 * (double)state.iterations(); -} -BENCHMARK(BM_Counters_Simple); -ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Simple %console_report bar=%hrfloat foo=%hrfloat$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Simple\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"bar\": %float,$", MR_Next}, - {"\"foo\": %float$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Simple\",%csv_report,%float,%float$"}}); -// VS2013 does not allow this function to be passed as a lambda argument -// to CHECK_BENCHMARK_RESULTS() -void CheckSimple(Results const& e) { - double its = e.GetAs< double >("iterations"); - CHECK_COUNTER_VALUE(e, int, "foo", EQ, 1); - // 
check that the value of bar is within 0.1% of the expected value - CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2.*its, 0.001); -} -CHECK_BENCHMARK_RESULTS("BM_Counters_Simple", &CheckSimple); - -// ========================================================================= // -// --------------------- Counters+Items+Bytes/s Output --------------------- // -// ========================================================================= // - -namespace { int num_calls1 = 0; } -void BM_Counters_WithBytesAndItemsPSec(benchmark::State& state) { - while (state.KeepRunning()) { - } - state.counters["foo"] = 1; - state.counters["bar"] = ++num_calls1; - state.SetBytesProcessed(364); - state.SetItemsProcessed(150); -} -BENCHMARK(BM_Counters_WithBytesAndItemsPSec); -ADD_CASES(TC_ConsoleOut, - {{"^BM_Counters_WithBytesAndItemsPSec %console_report " - "bar=%hrfloat foo=%hrfloat +%hrfloatB/s +%hrfloat items/s$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_WithBytesAndItemsPSec\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"bytes_per_second\": %int,$", MR_Next}, - {"\"items_per_second\": %int,$", MR_Next}, - {"\"bar\": %float,$", MR_Next}, - {"\"foo\": %float$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_WithBytesAndItemsPSec\"," - "%csv_bytes_items_report,%float,%float$"}}); -// VS2013 does not allow this function to be passed as a lambda argument -// to CHECK_BENCHMARK_RESULTS() -void CheckBytesAndItemsPSec(Results const& e) { - double t = e.DurationCPUTime(); // this (and not real time) is the time used - CHECK_COUNTER_VALUE(e, int, "foo", EQ, 1); - CHECK_COUNTER_VALUE(e, int, "bar", EQ, num_calls1); - // check that the values are within 0.1% of the expected values - CHECK_FLOAT_RESULT_VALUE(e, "bytes_per_second", EQ, 364./t, 0.001); - CHECK_FLOAT_RESULT_VALUE(e, "items_per_second", EQ, 150./t, 0.001); -} -CHECK_BENCHMARK_RESULTS("BM_Counters_WithBytesAndItemsPSec", - &CheckBytesAndItemsPSec); - -// ========================================================================= // -// ------------------------- Rate Counters Output -------------------------- // -// ========================================================================= // - -void BM_Counters_Rate(benchmark::State& state) { - while (state.KeepRunning()) { - } - namespace bm = benchmark; - state.counters["foo"] = bm::Counter{1, bm::Counter::kIsRate}; - state.counters["bar"] = bm::Counter{2, bm::Counter::kIsRate}; -} -BENCHMARK(BM_Counters_Rate); -ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Rate %console_report bar=%hrfloat/s foo=%hrfloat/s$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Rate\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"bar\": %float,$", MR_Next}, - {"\"foo\": %float$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Rate\",%csv_report,%float,%float$"}}); -// VS2013 does not allow this function to be passed as a lambda argument -// to CHECK_BENCHMARK_RESULTS() -void CheckRate(Results const& e) { - double t = e.DurationCPUTime(); // this (and not real time) is the time used - // check that the values are within 0.1% of the expected values - CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1./t, 0.001); - CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2./t, 0.001); -} -CHECK_BENCHMARK_RESULTS("BM_Counters_Rate", &CheckRate); - -// 
========================================================================= // -// ------------------------- Thread Counters Output ------------------------ // -// ========================================================================= // - -void BM_Counters_Threads(benchmark::State& state) { - while (state.KeepRunning()) { - } - state.counters["foo"] = 1; - state.counters["bar"] = 2; -} -BENCHMARK(BM_Counters_Threads)->ThreadRange(1, 8); -ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Threads/threads:%int %console_report bar=%hrfloat foo=%hrfloat$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Threads/threads:%int\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"bar\": %float,$", MR_Next}, - {"\"foo\": %float$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Threads/threads:%int\",%csv_report,%float,%float$"}}); -// VS2013 does not allow this function to be passed as a lambda argument -// to CHECK_BENCHMARK_RESULTS() -void CheckThreads(Results const& e) { - CHECK_COUNTER_VALUE(e, int, "foo", EQ, e.NumThreads()); - CHECK_COUNTER_VALUE(e, int, "bar", EQ, 2 * e.NumThreads()); -} -CHECK_BENCHMARK_RESULTS("BM_Counters_Threads/threads:%int", &CheckThreads); - -// ========================================================================= // -// ---------------------- ThreadAvg Counters Output ------------------------ // -// ========================================================================= // - -void BM_Counters_AvgThreads(benchmark::State& state) { - while (state.KeepRunning()) { - } - namespace bm = benchmark; - state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgThreads}; - state.counters["bar"] = bm::Counter{2, bm::Counter::kAvgThreads}; -} -BENCHMARK(BM_Counters_AvgThreads)->ThreadRange(1, 8); -ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreads/threads:%int %console_report bar=%hrfloat foo=%hrfloat$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_AvgThreads/threads:%int\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"bar\": %float,$", MR_Next}, - {"\"foo\": %float$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_AvgThreads/threads:%int\",%csv_report,%float,%float$"}}); -// VS2013 does not allow this function to be passed as a lambda argument -// to CHECK_BENCHMARK_RESULTS() -void CheckAvgThreads(Results const& e) { - CHECK_COUNTER_VALUE(e, int, "foo", EQ, 1); - CHECK_COUNTER_VALUE(e, int, "bar", EQ, 2); -} -CHECK_BENCHMARK_RESULTS("BM_Counters_AvgThreads/threads:%int", - &CheckAvgThreads); - -// ========================================================================= // -// ---------------------- ThreadAvg Counters Output ------------------------ // -// ========================================================================= // - -void BM_Counters_AvgThreadsRate(benchmark::State& state) { - while (state.KeepRunning()) { - } - namespace bm = benchmark; - state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgThreadsRate}; - state.counters["bar"] = bm::Counter{2, bm::Counter::kAvgThreadsRate}; -} -BENCHMARK(BM_Counters_AvgThreadsRate)->ThreadRange(1, 8); -ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreadsRate/threads:%int %console_report bar=%hrfloat/s foo=%hrfloat/s$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_AvgThreadsRate/threads:%int\",$"}, - {"\"iterations\": %int,$", 
MR_Next}, - {"\"real_time\": %int,$", MR_Next}, - {"\"cpu_time\": %int,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"bar\": %float,$", MR_Next}, - {"\"foo\": %float$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_AvgThreadsRate/threads:%int\",%csv_report,%float,%float$"}}); -// VS2013 does not allow this function to be passed as a lambda argument -// to CHECK_BENCHMARK_RESULTS() -void CheckAvgThreadsRate(Results const& e) { - CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1./e.DurationCPUTime(), 0.001); - CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2./e.DurationCPUTime(), 0.001); -} -CHECK_BENCHMARK_RESULTS("BM_Counters_AvgThreadsRate/threads:%int", - &CheckAvgThreadsRate); - -// ========================================================================= // -// --------------------------- TEST CASES END ------------------------------ // -// ========================================================================= // - -int main(int argc, char* argv[]) { RunOutputTests(argc, argv); } diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/compare_bench.py b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/compare_bench.py deleted file mode 100755 index d54baaa0e8f..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/compare_bench.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env python -""" -compare_bench.py - Compare two benchmarks or their results and report the - difference. -""" -import argparse -from argparse import ArgumentParser -import sys -import gbench -from gbench import util, report -from gbench.util import * - -def check_inputs(in1, in2, flags): - """ - Perform checking on the user provided inputs and diagnose any abnormalities - """ - in1_kind, in1_err = classify_input_file(in1) - in2_kind, in2_err = classify_input_file(in2) - output_file = find_benchmark_flag('--benchmark_out=', flags) - output_type = find_benchmark_flag('--benchmark_out_format=', flags) - if in1_kind == IT_Executable and in2_kind == IT_Executable and output_file: - print(("WARNING: '--benchmark_out=%s' will be passed to both " - "benchmarks causing it to be overwritten") % output_file) - if in1_kind == IT_JSON and in2_kind == IT_JSON and len(flags) > 0: - print("WARNING: passing --benchmark flags has no effect since both " - "inputs are JSON") - if output_type is not None and output_type != 'json': - print(("ERROR: passing '--benchmark_out_format=%s' to 'compare_bench.py`" - " is not supported.") % output_type) - sys.exit(1) - - -def main(): - parser = ArgumentParser( - description='compare the results of two benchmarks') - parser.add_argument( - 'test1', metavar='test1', type=str, nargs=1, - help='A benchmark executable or JSON output file') - parser.add_argument( - 'test2', metavar='test2', type=str, nargs=1, - help='A benchmark executable or JSON output file') - # FIXME this is a dummy argument which will never actually match - # any --benchmark flags but it helps generate a better usage message - parser.add_argument( - 'benchmark_options', metavar='benchmark_option', nargs='*', - help='Arguments to pass when running benchmark executables' - ) - args, unknown_args = parser.parse_known_args() - # Parse the command line flags - test1 = args.test1[0] - test2 = args.test2[0] - if args.benchmark_options: - print("Unrecognized positional argument arguments: '%s'" - % args.benchmark_options) - exit(1) - benchmark_options = unknown_args - check_inputs(test1, test2, benchmark_options) - # Run the benchmarks and report 
the results - json1 = gbench.util.run_or_load_benchmark(test1, benchmark_options) - json2 = gbench.util.run_or_load_benchmark(test2, benchmark_options) - output_lines = gbench.report.generate_difference_report(json1, json2) - print('Comparing %s to %s' % (test1, test2)) - for ln in output_lines: - print(ln) - - -if __name__ == '__main__': - main() diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/Inputs/test1_run1.json b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/Inputs/test1_run1.json deleted file mode 100644 index 37faed46d13..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/Inputs/test1_run1.json +++ /dev/null @@ -1,60 +0,0 @@ -{ - "context": { - "date": "2016-08-02 17:44:46", - "num_cpus": 4, - "mhz_per_cpu": 4228, - "cpu_scaling_enabled": false, - "library_build_type": "release" - }, - "benchmarks": [ - { - "name": "BM_SameTimes", - "iterations": 1000, - "real_time": 10, - "cpu_time": 10, - "time_unit": "ns" - }, - { - "name": "BM_2xFaster", - "iterations": 1000, - "real_time": 50, - "cpu_time": 50, - "time_unit": "ns" - }, - { - "name": "BM_2xSlower", - "iterations": 1000, - "real_time": 50, - "cpu_time": 50, - "time_unit": "ns" - }, - { - "name": "BM_10PercentFaster", - "iterations": 1000, - "real_time": 100, - "cpu_time": 100, - "time_unit": "ns" - }, - { - "name": "BM_10PercentSlower", - "iterations": 1000, - "real_time": 100, - "cpu_time": 100, - "time_unit": "ns" - }, - { - "name": "BM_100xSlower", - "iterations": 1000, - "real_time": 100, - "cpu_time": 100, - "time_unit": "ns" - }, - { - "name": "BM_100xFaster", - "iterations": 1000, - "real_time": 10000, - "cpu_time": 10000, - "time_unit": "ns" - } - ] -} \ No newline at end of file diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/Inputs/test1_run2.json b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/Inputs/test1_run2.json deleted file mode 100644 index aed5151d392..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/Inputs/test1_run2.json +++ /dev/null @@ -1,60 +0,0 @@ -{ - "context": { - "date": "2016-08-02 17:44:46", - "num_cpus": 4, - "mhz_per_cpu": 4228, - "cpu_scaling_enabled": false, - "library_build_type": "release" - }, - "benchmarks": [ - { - "name": "BM_SameTimes", - "iterations": 1000, - "real_time": 10, - "cpu_time": 10, - "time_unit": "ns" - }, - { - "name": "BM_2xFaster", - "iterations": 1000, - "real_time": 25, - "cpu_time": 25, - "time_unit": "ns" - }, - { - "name": "BM_2xSlower", - "iterations": 20833333, - "real_time": 100, - "cpu_time": 100, - "time_unit": "ns" - }, - { - "name": "BM_10PercentFaster", - "iterations": 1000, - "real_time": 90, - "cpu_time": 90, - "time_unit": "ns" - }, - { - "name": "BM_10PercentSlower", - "iterations": 1000, - "real_time": 110, - "cpu_time": 110, - "time_unit": "ns" - }, - { - "name": "BM_100xSlower", - "iterations": 1000, - "real_time": 10000, - "cpu_time": 10000, - "time_unit": "ns" - }, - { - "name": "BM_100xFaster", - "iterations": 1000, - "real_time": 100, - "cpu_time": 100, - "time_unit": "ns" - } - ] -} \ No newline at end of file diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/__init__.py b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/__init__.py deleted file mode 100644 index fce1a1acfbb..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/__init__.py +++ /dev/null 
@@ -1,8 +0,0 @@ -"""Google Benchmark tooling""" - -__author__ = 'Eric Fiselier' -__email__ = 'eric@efcs.ca' -__versioninfo__ = (0, 5, 0) -__version__ = '.'.join(str(v) for v in __versioninfo__) + 'dev' - -__all__ = [] diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/report.py b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/report.py deleted file mode 100644 index 015d33d9e49..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/report.py +++ /dev/null @@ -1,146 +0,0 @@ -"""report.py - Utilities for reporting statistics about benchmark results -""" -import os - -class BenchmarkColor(object): - def __init__(self, name, code): - self.name = name - self.code = code - - def __repr__(self): - return '%s%r' % (self.__class__.__name__, - (self.name, self.code)) - - def __format__(self, format): - return self.code - -# Benchmark Colors Enumeration -BC_NONE = BenchmarkColor('NONE', '') -BC_MAGENTA = BenchmarkColor('MAGENTA', '\033[95m') -BC_CYAN = BenchmarkColor('CYAN', '\033[96m') -BC_OKBLUE = BenchmarkColor('OKBLUE', '\033[94m') -BC_HEADER = BenchmarkColor('HEADER', '\033[92m') -BC_WARNING = BenchmarkColor('WARNING', '\033[93m') -BC_WHITE = BenchmarkColor('WHITE', '\033[97m') -BC_FAIL = BenchmarkColor('FAIL', '\033[91m') -BC_ENDC = BenchmarkColor('ENDC', '\033[0m') -BC_BOLD = BenchmarkColor('BOLD', '\033[1m') -BC_UNDERLINE = BenchmarkColor('UNDERLINE', '\033[4m') - -def color_format(use_color, fmt_str, *args, **kwargs): - """ - Return the result of 'fmt_str.format(*args, **kwargs)' after transforming - 'args' and 'kwargs' according to the value of 'use_color'. If 'use_color' - is False then all color codes in 'args' and 'kwargs' are replaced with - the empty string. - """ - assert use_color is True or use_color is False - if not use_color: - args = [arg if not isinstance(arg, BenchmarkColor) else BC_NONE - for arg in args] - kwargs = {key: arg if not isinstance(arg, BenchmarkColor) else BC_NONE - for key, arg in kwargs.items()} - return fmt_str.format(*args, **kwargs) - - -def find_longest_name(benchmark_list): - """ - Return the length of the longest benchmark name in a given list of - benchmark JSON objects - """ - longest_name = 1 - for bc in benchmark_list: - if len(bc['name']) > longest_name: - longest_name = len(bc['name']) - return longest_name - - -def calculate_change(old_val, new_val): - """ - Return a float representing the decimal change between old_val and new_val. - """ - if old_val == 0 and new_val == 0: - return 0.0 - if old_val == 0: - return float(new_val - old_val) / (float(old_val + new_val) / 2) - return float(new_val - old_val) / abs(old_val) - - -def generate_difference_report(json1, json2, use_color=True): - """ - Calculate and report the difference between each test of two benchmarks - runs specified as 'json1' and 'json2'. 
- """ - first_col_width = find_longest_name(json1['benchmarks']) + 5 - def find_test(name): - for b in json2['benchmarks']: - if b['name'] == name: - return b - return None - first_line = "{:<{}s} Time CPU Old New".format( - 'Benchmark', first_col_width) - output_strs = [first_line, '-' * len(first_line)] - - gen = (bn for bn in json1['benchmarks'] if 'real_time' in bn and 'cpu_time' in bn) - for bn in gen: - other_bench = find_test(bn['name']) - if not other_bench: - continue - - def get_color(res): - if res > 0.05: - return BC_FAIL - elif res > -0.07: - return BC_WHITE - else: - return BC_CYAN - fmt_str = "{}{:<{}s}{endc}{}{:+9.2f}{endc}{}{:+14.2f}{endc}{:14d}{:14d}" - tres = calculate_change(bn['real_time'], other_bench['real_time']) - cpures = calculate_change(bn['cpu_time'], other_bench['cpu_time']) - output_strs += [color_format(use_color, fmt_str, - BC_HEADER, bn['name'], first_col_width, - get_color(tres), tres, get_color(cpures), cpures, - bn['cpu_time'], other_bench['cpu_time'], - endc=BC_ENDC)] - return output_strs - -############################################################################### -# Unit tests - -import unittest - -class TestReportDifference(unittest.TestCase): - def load_results(self): - import json - testInputs = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Inputs') - testOutput1 = os.path.join(testInputs, 'test1_run1.json') - testOutput2 = os.path.join(testInputs, 'test1_run2.json') - with open(testOutput1, 'r') as f: - json1 = json.load(f) - with open(testOutput2, 'r') as f: - json2 = json.load(f) - return json1, json2 - - def test_basic(self): - expect_lines = [ - ['BM_SameTimes', '+0.00', '+0.00', '10', '10'], - ['BM_2xFaster', '-0.50', '-0.50', '50', '25'], - ['BM_2xSlower', '+1.00', '+1.00', '50', '100'], - ['BM_10PercentFaster', '-0.10', '-0.10', '100', '90'], - ['BM_10PercentSlower', '+0.10', '+0.10', '100', '110'], - ['BM_100xSlower', '+99.00', '+99.00', '100', '10000'], - ['BM_100xFaster', '-0.99', '-0.99', '10000', '100'], - ] - json1, json2 = self.load_results() - output_lines_with_header = generate_difference_report(json1, json2, use_color=False) - output_lines = output_lines_with_header[2:] - print("\n".join(output_lines_with_header)) - self.assertEqual(len(output_lines), len(expect_lines)) - for i in xrange(0, len(output_lines)): - parts = [x for x in output_lines[i].split(' ') if x] - self.assertEqual(len(parts), 5) - self.assertEqual(parts, expect_lines[i]) - - -if __name__ == '__main__': - unittest.main() diff --git a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/util.py b/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/util.py deleted file mode 100644 index 07c23772754..00000000000 --- a/extension/llm/custom_ops/spinquant/FFHT/external/benchmark/tools/gbench/util.py +++ /dev/null @@ -1,159 +0,0 @@ -"""util.py - General utilities for running, loading, and processing benchmarks -""" -import json -import os -import tempfile -import subprocess -import sys - -# Input file type enumeration -IT_Invalid = 0 -IT_JSON = 1 -IT_Executable = 2 - -_num_magic_bytes = 2 if sys.platform.startswith('win') else 4 -def is_executable_file(filename): - """ - Return 'True' if 'filename' names a valid file which is likely - an executable. A file is considered an executable if it starts with the - magic bytes for a EXE, Mach O, or ELF file. 
- """ - if not os.path.isfile(filename): - return False - with open(filename, mode='rb') as f: - magic_bytes = f.read(_num_magic_bytes) - if sys.platform == 'darwin': - return magic_bytes in [ - b'\xfe\xed\xfa\xce', # MH_MAGIC - b'\xce\xfa\xed\xfe', # MH_CIGAM - b'\xfe\xed\xfa\xcf', # MH_MAGIC_64 - b'\xcf\xfa\xed\xfe', # MH_CIGAM_64 - b'\xca\xfe\xba\xbe', # FAT_MAGIC - b'\xbe\xba\xfe\xca' # FAT_CIGAM - ] - elif sys.platform.startswith('win'): - return magic_bytes == b'MZ' - else: - return magic_bytes == b'\x7FELF' - - -def is_json_file(filename): - """ - Returns 'True' if 'filename' names a valid JSON output file. - 'False' otherwise. - """ - try: - with open(filename, 'r') as f: - json.load(f) - return True - except: - pass - return False - - -def classify_input_file(filename): - """ - Return a tuple (type, msg) where 'type' specifies the classified type - of 'filename'. If 'type' is 'IT_Invalid' then 'msg' is a human readable - string represeting the error. - """ - ftype = IT_Invalid - err_msg = None - if not os.path.exists(filename): - err_msg = "'%s' does not exist" % filename - elif not os.path.isfile(filename): - err_msg = "'%s' does not name a file" % filename - elif is_executable_file(filename): - ftype = IT_Executable - elif is_json_file(filename): - ftype = IT_JSON - else: - err_msg = "'%s' does not name a valid benchmark executable or JSON file" % filename - return ftype, err_msg - - -def check_input_file(filename): - """ - Classify the file named by 'filename' and return the classification. - If the file is classified as 'IT_Invalid' print an error message and exit - the program. - """ - ftype, msg = classify_input_file(filename) - if ftype == IT_Invalid: - print("Invalid input file: %s" % msg) - sys.exit(1) - return ftype - -def find_benchmark_flag(prefix, benchmark_flags): - """ - Search the specified list of flags for a flag matching `` and - if it is found return the arg it specifies. If specified more than once the - last value is returned. If the flag is not found None is returned. - """ - assert prefix.startswith('--') and prefix.endswith('=') - result = None - for f in benchmark_flags: - if f.startswith(prefix): - result = f[len(prefix):] - return result - -def remove_benchmark_flags(prefix, benchmark_flags): - """ - Return a new list containing the specified benchmark_flags except those - with the specified prefix. - """ - assert prefix.startswith('--') and prefix.endswith('=') - return [f for f in benchmark_flags if not f.startswith(prefix)] - -def load_benchmark_results(fname): - """ - Read benchmark output from a file and return the JSON object. - REQUIRES: 'fname' names a file containing JSON benchmark output. - """ - with open(fname, 'r') as f: - return json.load(f) - - -def run_benchmark(exe_name, benchmark_flags): - """ - Run a benchmark specified by 'exe_name' with the specified - 'benchmark_flags'. The benchmark is run directly as a subprocess to preserve - real time console output. 
- RETURNS: A JSON object representing the benchmark output - """ - output_name = find_benchmark_flag('--benchmark_out=', - benchmark_flags) - is_temp_output = False - if output_name is None: - is_temp_output = True - thandle, output_name = tempfile.mkstemp() - os.close(thandle) - benchmark_flags = list(benchmark_flags) + \ - ['--benchmark_out=%s' % output_name] - - cmd = [exe_name] + benchmark_flags - print("RUNNING: %s" % ' '.join(cmd)) - exitCode = subprocess.call(cmd) - if exitCode != 0: - print('TEST FAILED...') - sys.exit(exitCode) - json_res = load_benchmark_results(output_name) - if is_temp_output: - os.unlink(output_name) - return json_res - - -def run_or_load_benchmark(filename, benchmark_flags): - """ - Get the results for a specified benchmark. If 'filename' specifies - an executable benchmark then the results are generated by running the - benchmark. Otherwise 'filename' must name a valid JSON output file, - which is loaded and the result returned. - """ - ftype = check_input_file(filename) - if ftype == IT_JSON: - return load_benchmark_results(filename) - elif ftype == IT_Executable: - return run_benchmark(filename, benchmark_flags) - else: - assert False # This branch is unreachable \ No newline at end of file diff --git a/extension/llm/custom_ops/spinquant/fast_hadamard_transform.h b/extension/llm/custom_ops/spinquant/fast_hadamard_transform.h index 1084dcc3dee..3f00fe5cda2 100644 --- a/extension/llm/custom_ops/spinquant/fast_hadamard_transform.h +++ b/extension/llm/custom_ops/spinquant/fast_hadamard_transform.h @@ -1,3 +1,11 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + // (c) Meta Platforms, Inc. and affiliates. #pragma once @@ -25,9 +33,7 @@ T fast_sqrt_of_power_of_2(int log2_n) { } template <typename T> -void normalize_after_fht( - T* out, - int log2_vec_size) { +void normalize_after_fht(T* out, int log2_vec_size) { const T inv_sqrt = T(1) / fast_sqrt_of_power_of_2(log2_vec_size); const int vec_size = 1 << log2_vec_size; for (int ii = 0; ii < vec_size; ++ii) { @@ -35,7 +41,6 @@ void normalize_after_fht( } } - // Normalization step: divide by sqrt(1 << log2_vec_size). Similar // to fast_sqrt above, if N is even, then the maximum-precision way // to do this is right-shift by log2_vec_size / 2. If N is odd, we // function to tend to increase the magnitude of the elements of // vec, which would result in clipping and therefore accuracy // loss, especially compounded over 30+ transformer layers. 
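// Worked example of the normalization arithmetic described above
// (illustrative only, not part of the original source; the input value
// 1024 is hypothetical, and 408/577 is the 1/sqrt(2) approximation used
// in quantized_normalize_after_fht below):
//
//   log2_vec_size = 4 (even): shift right by 4 / 2 = 2:
//     1024 >> 2 = 256, which is exactly 1024 / sqrt(1 << 4).
//
//   log2_vec_size = 5 (odd): multiply by 408/577 first, then shift
//   right by 5 / 2 = 2:
//     (1024 * 408) / 577 = 724 (integer division), and 724 >> 2 = 181,
//     close to the exact 1024 / sqrt(1 << 5) ~= 181.02.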
-void quantized_normalize_after_fht(const int32_t* tmp, int16_t* out, int log2_vec_size, int vec_size) {
+void quantized_normalize_after_fht(
+    const int32_t* tmp,
+    int16_t* out,
+    int log2_vec_size,
+    int vec_size) {
   const int log2_sqrt_vec_size = log2_vec_size / 2;
   constexpr int32_t qmin = -(1 << 15) + 1;
   constexpr int32_t qmax = -qmin;
@@ -55,8 +64,9 @@ void quantized_normalize_after_fht(const int32_t* tmp, int16_t* out, int log2_ve
     static const int32_t inv_sqrt_2_numerator = 408;
     static const int32_t inv_sqrt_2_denominator = 577;
     for (int ii = 0; ii < vec_size; ++ii) {
-      const auto val_over_sqrt_vec_size = (tmp[ii] * inv_sqrt_2_numerator / inv_sqrt_2_denominator)
-          >> log2_sqrt_vec_size;
+      const auto val_over_sqrt_vec_size =
+          (tmp[ii] * inv_sqrt_2_numerator / inv_sqrt_2_denominator) >>
+          log2_sqrt_vec_size;
       out[ii] = std::clamp(val_over_sqrt_vec_size, qmin, qmax);
     }
   } else {
@@ -90,9 +100,7 @@ void fast_hadamard_transform_unnormalized_simple_impl(
 }
 
 template <typename T>
-void fast_hadamard_transform_simple_impl(
-    T* vec,
-    int log2_vec_size) {
+void fast_hadamard_transform_simple_impl(T* vec, int log2_vec_size) {
   fast_hadamard_transform_unnormalized_simple_impl(vec, log2_vec_size);
   normalize_after_fht(vec, log2_vec_size);
 }
@@ -104,7 +112,7 @@
 // of vec, which must be of length (1 << log2_vec_size).
 template <typename T>
 void fast_hadamard_transform(T* vec, int log2_vec_size) {
-  internal::fast_hadamard_transform_simple_impl<T>(vec, log2_vec_size);
+  internal::fast_hadamard_transform_simple_impl(vec, log2_vec_size);
 }
 
 // Compute a quantized fast Walsh-Hadamard transform of vec, which
@@ -116,8 +124,11 @@ void fast_hadamard_transform(T* vec, int log2_vec_size) {
 // following trivial identities:
 //
 // scale * a + scale * b = scale * (a + b) (addition doesn't need the scale)
-// alpha * (scale * a) = scale * (alpha * a) (multiplication doesn't need the scale)
+// alpha * (scale * a) = scale * (alpha * a) (multiplication doesn't need the
+// scale)
-void fast_hadamard_transform_symmetric_quantized_s16(int16_t* vec, int log2_vec_size) {
+void fast_hadamard_transform_symmetric_quantized_s16(
+    int16_t* vec,
+    int log2_vec_size) {
   if (log2_vec_size == 0) {
     return;
   }
@@ -136,9 +147,11 @@ void fast_hadamard_transform_symmetric_quantized_s16(int16_t* vec, int log2_vec_
   // implementation.
   // NOTE: if we need this to be fast on CPU, we can use FFHT to
   // generate fht_uint32 similar to fht_float.
-  internal::fast_hadamard_transform_unnormalized_simple_impl(tmp.get(), log2_vec_size);
+  internal::fast_hadamard_transform_unnormalized_simple_impl(
+      tmp.get(), log2_vec_size);
 
-  internal::quantized_normalize_after_fht(tmp.get(), vec, log2_vec_size, vec_size);
+  internal::quantized_normalize_after_fht(
+      tmp.get(), vec, log2_vec_size, vec_size);
 }
 
 // Like fast_hadamard_transform, but vec must be of length 28 * (1 <<
@@ -161,7 +174,9 @@ void fast_hadamard_transform_28N(T* vec, int log2_vec_size) {
 // We don't need the quantization scale; see the function-level
 // comment on fast_hadamard_transform_symmetric_quantized_s16 for
 // details.
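// --- Editor's aside (illustration only; not part of the patch) ---
// Why the quantization scale can be ignored: the transform is built purely
// from additions, subtractions, and a uniform 1/sqrt(N) rescale, so a
// symmetric per-tensor scale factors straight through it. A hedged usage
// sketch (fht_then_dequantize, qvec, out, and scale are our names; this
// assumes the surrounding header is included):
#include <cstdint>

void fht_then_dequantize(int16_t* qvec, float* out, float scale, int log2_vec_size) {
  // In-place quantized FHT; qvec[i] ~= real[i] / scale, no zero point.
  executorch::fast_hadamard_transform_symmetric_quantized_s16(qvec, log2_vec_size);
  // The pre-transform scale is still the correct scale afterwards.
  for (int ii = 0; ii < (1 << log2_vec_size); ++ii) {
    out[ii] = qvec[ii] * scale;
  }
}
// --- End aside ---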
-void fast_hadamard_transform_symmetric_quantized_s16_28N(int16_t* vec, int log2_vec_size) { +void fast_hadamard_transform_symmetric_quantized_s16_28N( + int16_t* vec, + int log2_vec_size) { if (log2_vec_size == 0) { return; } @@ -171,14 +186,16 @@ void fast_hadamard_transform_symmetric_quantized_s16_28N(int16_t* vec, int log2_ std::copy(vec, vec + vec_size * 28, tmp.get()); for (int ii = 0; ii < 28; ++ii) { - internal::fast_hadamard_transform_unnormalized_simple_impl(&tmp[ii * vec_size], log2_vec_size); + internal::fast_hadamard_transform_unnormalized_simple_impl( + &tmp[ii * vec_size], log2_vec_size); } for (int ii = 0; ii < vec_size; ++ii) { hadamard_mult_28_strided(&tmp[ii], vec_size); } - internal::quantized_normalize_after_fht(tmp.get(), vec, log2_vec_size, vec_size * 28); + internal::quantized_normalize_after_fht( + tmp.get(), vec, log2_vec_size, vec_size * 28); } } // namespace executorch diff --git a/extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h b/extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h index edc62b9667a..ca5a8d61e73 100644 --- a/extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h +++ b/extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h @@ -1,5 +1,4 @@ - -// This file is auto-generated. See "special_hadamard_code_gen.py" +// @generated by special_hadamard_code_gen.py strided_cpu #pragma once diff --git a/extension/llm/custom_ops/spinquant/special_hadamard_code_gen.py b/extension/llm/custom_ops/spinquant/special_hadamard_code_gen.py index 1dc57166c6d..a8b9feb0785 100644 --- a/extension/llm/custom_ops/spinquant/special_hadamard_code_gen.py +++ b/extension/llm/custom_ops/spinquant/special_hadamard_code_gen.py @@ -32,8 +32,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import math -import re from pathlib import Path import numpy as np @@ -176,12 +174,12 @@ had_strings = [had_12, had_20_will, had_28_will, had_40_tpal] header = """ -// This file is auto-generated. 
See "special_hadamard_code_gen.py"\n #pragma once """ + TEMPLATE = """ __device__ __forceinline__ void hadamard_mult_thread_{N}(float x[{N}]) {{ float out[{N}]; @@ -220,8 +218,13 @@ def string_to_array(string): # Convert strings of + and - to bool arrays - string = string.strip().replace('+', '1').replace('-', '-1').split() - return np.stack([np.fromstring(" ".join(string[i]), dtype=np.int32, sep=' ') for i in range(len(string))]) + string = string.strip().replace("+", "1").replace("-", "-1").split() + return np.stack( + [ + np.fromstring(" ".join(string[i]), dtype=np.int32, sep=" ") + for i in range(len(string)) + ] + ) def strided_load_code_gen(N): @@ -233,28 +236,44 @@ def array_code_gen(arr, template): assert arr.shape[0] == arr.shape[1] out = [] for i in range(N): - out.append(f"out[{i}] = " + " ".join([f"{'+' if arr[i, j] == 1 else '-'} x[{j}]" for j in range(N)]) + ";") - return template.format(N=str(N), code='\n '.join(out), strided_load_code = strided_load_code_gen(N)) - - -def main(template = TEMPLATE): - output_dir = Path(__file__).parent / "fast_hadamard_transform_special.h" - output_dir.write_text(header + ''.join(array_code_gen(string_to_array(s), template) for s in had_strings)) + out.append( + f"out[{i}] = " + + " ".join([f"{'+' if arr[i, j] == 1 else '-'} x[{j}]" for j in range(N)]) + + ";" + ) + return template.format( + N=str(N), code="\n ".join(out), strided_load_code=strided_load_code_gen(N) + ) OPTION_TO_TEMPLATE = { - 'cuda': TEMPLATE, - 'cpu': CPU_TEMPLATE, - 'strided_cpu': STRIDED_CPU_TEMPLATE, + "cuda": TEMPLATE, + "cpu": CPU_TEMPLATE, + "strided_cpu": STRIDED_CPU_TEMPLATE, } -if __name__ == '__main__': +def main(option="cuda"): + try: + template = OPTION_TO_TEMPLATE[option] + except KeyError: + raise Exception( + f"bad target option {option}; options are {', '.join(OPTION_TO_TEMPLATE.keys())}" + ) + output_dir = Path(__file__).parent / "fast_hadamard_transform_special.h" + generated_line = f"// @{'generated'} by special_hadamard_code_gen.py {option}\n" + + output_dir.write_text( + generated_line + + header + + "".join(array_code_gen(string_to_array(s), template) for s in had_strings) + ) + + +if __name__ == "__main__": import sys - template = TEMPLATE + + option = "cuda" if len(sys.argv) > 1: option = sys.argv[1] - if option not in OPTION_TO_TEMPLATE: - raise Exception(f"bad target option {option}; options are {', '.join(OPTION_TO_TEMPLATE.keys())}") - template = OPTION_TO_TEMPLATE[option] - main(template) + main(option) diff --git a/extension/llm/custom_ops/spinquant/targets.bzl b/extension/llm/custom_ops/spinquant/targets.bzl index 8cf7827f9e2..42fa472548b 100644 --- a/extension/llm/custom_ops/spinquant/targets.bzl +++ b/extension/llm/custom_ops/spinquant/targets.bzl @@ -8,8 +8,9 @@ def define_common_targets(): """ runtime.cxx_library( name = "fast_hadamard_transform", - headers = [ + exported_headers = [ "fast_hadamard_transform.h", "fast_hadamard_transform_special.h", ], + visibility = ["@EXECUTORCH_CLIENTS"], ) diff --git a/extension/llm/custom_ops/spinquant/FFHT/LICENSE.md b/extension/llm/custom_ops/spinquant/third-party/FFHT/LICENSE.md similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/LICENSE.md rename to extension/llm/custom_ops/spinquant/third-party/FFHT/LICENSE.md diff --git a/extension/llm/custom_ops/spinquant/FFHT/Makefile b/extension/llm/custom_ops/spinquant/third-party/FFHT/Makefile similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/Makefile rename to 
extension/llm/custom_ops/spinquant/third-party/FFHT/Makefile diff --git a/extension/llm/custom_ops/spinquant/FFHT/README.md b/extension/llm/custom_ops/spinquant/third-party/FFHT/README.md similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/README.md rename to extension/llm/custom_ops/spinquant/third-party/FFHT/README.md diff --git a/extension/llm/custom_ops/spinquant/FFHT/example.py b/extension/llm/custom_ops/spinquant/third-party/FFHT/example.py similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/example.py rename to extension/llm/custom_ops/spinquant/third-party/FFHT/example.py diff --git a/extension/llm/custom_ops/spinquant/FFHT/fast_copy.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/fast_copy.c similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/fast_copy.c rename to extension/llm/custom_ops/spinquant/third-party/FFHT/fast_copy.c diff --git a/extension/llm/custom_ops/spinquant/FFHT/fast_copy.h b/extension/llm/custom_ops/spinquant/third-party/FFHT/fast_copy.h similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/fast_copy.h rename to extension/llm/custom_ops/spinquant/third-party/FFHT/fast_copy.h diff --git a/extension/llm/custom_ops/spinquant/FFHT/fht.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht.c similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/fht.c rename to extension/llm/custom_ops/spinquant/third-party/FFHT/fht.c diff --git a/extension/llm/custom_ops/spinquant/FFHT/fht.h b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht.h similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/fht.h rename to extension/llm/custom_ops/spinquant/third-party/FFHT/fht.h diff --git a/extension/llm/custom_ops/spinquant/FFHT/fht_avx.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_avx.c similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/fht_avx.c rename to extension/llm/custom_ops/spinquant/third-party/FFHT/fht_avx.c diff --git a/extension/llm/custom_ops/spinquant/FFHT/fht_impl.h b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_impl.h similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/fht_impl.h rename to extension/llm/custom_ops/spinquant/third-party/FFHT/fht_impl.h diff --git a/extension/llm/custom_ops/spinquant/FFHT/fht_sse.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_sse.c similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/fht_sse.c rename to extension/llm/custom_ops/spinquant/third-party/FFHT/fht_sse.c diff --git a/extension/llm/custom_ops/spinquant/FFHT/gen.py b/extension/llm/custom_ops/spinquant/third-party/FFHT/gen.py similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/gen.py rename to extension/llm/custom_ops/spinquant/third-party/FFHT/gen.py diff --git a/extension/llm/custom_ops/spinquant/FFHT/hall_of_fame_avx.txt b/extension/llm/custom_ops/spinquant/third-party/FFHT/hall_of_fame_avx.txt similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/hall_of_fame_avx.txt rename to extension/llm/custom_ops/spinquant/third-party/FFHT/hall_of_fame_avx.txt diff --git a/extension/llm/custom_ops/spinquant/FFHT/hall_of_fame_sse.txt b/extension/llm/custom_ops/spinquant/third-party/FFHT/hall_of_fame_sse.txt similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/hall_of_fame_sse.txt rename to extension/llm/custom_ops/spinquant/third-party/FFHT/hall_of_fame_sse.txt diff --git 
a/extension/llm/custom_ops/spinquant/FFHT/measurements/Makefile b/extension/llm/custom_ops/spinquant/third-party/FFHT/measurements/Makefile similarity index 61% rename from extension/llm/custom_ops/spinquant/FFHT/measurements/Makefile rename to extension/llm/custom_ops/spinquant/third-party/FFHT/measurements/Makefile index c0b6daff716..807d5fe626b 100644 --- a/extension/llm/custom_ops/spinquant/FFHT/measurements/Makefile +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/measurements/Makefile @@ -1,5 +1,5 @@ CXX=g++ -CXX_FLAGS=-O3 -Wall -march=native -std=c++11 -I../external/benchmark/include -L../external/benchmark/src -lbenchmark -lpthread +CXX_FLAGS=-O3 -Wall -march=native -std=c++11 `pkg-config benchmark --cflags --libs` -lpthread .PHONY: run_float run_double clean diff --git a/extension/llm/custom_ops/spinquant/FFHT/measurements/run_double.cpp b/extension/llm/custom_ops/spinquant/third-party/FFHT/measurements/run_double.cpp similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/measurements/run_double.cpp rename to extension/llm/custom_ops/spinquant/third-party/FFHT/measurements/run_double.cpp diff --git a/extension/llm/custom_ops/spinquant/FFHT/measurements/run_float.cpp b/extension/llm/custom_ops/spinquant/third-party/FFHT/measurements/run_float.cpp similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/measurements/run_float.cpp rename to extension/llm/custom_ops/spinquant/third-party/FFHT/measurements/run_float.cpp diff --git a/extension/llm/custom_ops/spinquant/FFHT/test_double.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/test_double.c similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/test_double.c rename to extension/llm/custom_ops/spinquant/third-party/FFHT/test_double.c diff --git a/extension/llm/custom_ops/spinquant/FFHT/test_float.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/test_float.c similarity index 100% rename from extension/llm/custom_ops/spinquant/FFHT/test_float.c rename to extension/llm/custom_ops/spinquant/third-party/FFHT/test_float.c diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index bc64ae869fc..4237ae7b3a7 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -69,6 +69,7 @@ def __init__( example_inputs, args: Optional[Any] = None, enable_dynamic_shape: bool = False, + generate_full_logits: bool = False, calibration_tasks: Optional[List[str]] = None, calibration_limit: Optional[int] = None, calibration_seq_length: Optional[int] = None, @@ -86,6 +87,7 @@ def __init__( self.dtype = dtype self.example_inputs = example_inputs self.use_kv_cache = use_kv_cache + self.generate_full_logits = generate_full_logits self.enable_dynamic_shape = enable_dynamic_shape self.verbose = verbose self.metadata = metadata @@ -229,7 +231,12 @@ def calibrate_template( ) pos += 1 if pos >= len(token_list): - token_list.append(torch.argmax(logits[:], dim=-1).item()) + if self.generate_full_logits: + token_list.append( + torch.argmax(logits[:, -1], dim=-1).item() + ) + else: + token_list.append(torch.argmax(logits[:], dim=-1).item()) calibrate_template( module=prepared_module, @@ -243,6 +250,7 @@ def calibrate_template( tokenizer=tokenizer, max_seq_length=calibration_seq_length, use_kv_cache=self.use_kv_cache, + generate_full_logits=self.generate_full_logits, enable_dynamic_shape=self.enable_dynamic_shape, ) eval_results = evaluate_model( diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py 
index e75d5bef3fb..eca78bc9346 100644 --- a/extension/llm/export/partitioner_lib.py +++ b/extension/llm/export/partitioner_lib.py @@ -56,11 +56,11 @@ def get_mps_partitioner(use_kv_cache: bool = False): def get_coreml_partitioner( - use_kv_cache: bool = False, pt2e_quantize: Optional[str] = None + enable_state: bool = False, + embedding_quantize: Optional[str] = None, + pt2e_quantize: Optional[str] = None, + coreml_quantize: Optional[str] = None, ): - assert ( - use_kv_cache is True - ), "CoreML backend currently only supports static shape and use_kv_cache=True is the only way to support it at the moment" try: import coremltools as ct from executorch.backends.apple.coreml.compiler import ( # pyre-ignore @@ -75,22 +75,34 @@ def get_coreml_partitioner( ) minimum_deployment_target = ct.target.iOS15 - # In Core ML, quantization in introduced in iOS 16 - if pt2e_quantize is not None: + # In Core ML, stateful execution is introduced in iOS 18 + if enable_state: + minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS18) + # In Core ML, quantization is introduced in iOS 16 + if embedding_quantize is not None or pt2e_quantize is not None: minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS16) # In Core ML, 8-bit activation quantization is introduced in iOS 17 - if pt2e_quantize in ("coreml_8a_c8w", "coreml_baseline_8a_c8w"): + if ( + embedding_quantize is not None and int(embedding_quantize.split(",")[0]) == 8 + ) or pt2e_quantize in ("coreml_8a_c8w", "coreml_baseline_8a_c8w"): minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS17) # In Core ML, 4-bit weight compression is introduced in iOS 18 - if pt2e_quantize in ("coreml_c4w", "coreml_8a_c4w", "coreml_baseline_8a_c4w"): + if ( + (embedding_quantize is not None and int(embedding_quantize.split(",")[0]) == 4) + or pt2e_quantize in ("coreml_c4w", "coreml_8a_c4w", "coreml_baseline_8a_c4w") + or coreml_quantize == "b4w" + ): minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS18) - # In Core ML, stateful execution is introduced in iOS 18 - # TODO (https://github.com/pytorch/executorch/issues/4209) - # For now, since mutable buffer is kept in executorch runtime, - # state is out of place and can be handled by older iOS. - # Once mutable buffer can be handed over to delegate, i.e. 
state becomes in-place, we will have - # if use_kv_cache: - # minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS18) + + op_linear_quantizer_config = None + if coreml_quantize == "b4w": + op_linear_quantizer_config = { + "mode": "linear_symmetric", + "dtype": "int4", + "granularity": "per_block", + "block_size": 32, + "weight_threshold": 512, + } compile_specs = CoreMLBackend.generate_compile_specs( # pyre-fixme[16] minimum_deployment_target=minimum_deployment_target, @@ -98,9 +110,11 @@ def get_coreml_partitioner( # using `ComputeUnit.ALL` can increase the model load time, default to `ComputeUnit.CPU_AND_GPU` compute_unit=ct.ComputeUnit[ct.ComputeUnit.CPU_AND_GPU.name.upper()], model_type=CoreMLBackend.MODEL_TYPE.MODEL, # pyre-fixme[16] + op_linear_quantizer_config=op_linear_quantizer_config, ) return CoreMLPartitioner( # pyre-fixme[16] compile_specs=compile_specs, + take_over_mutable_buffer=enable_state, ) @@ -108,6 +122,7 @@ def get_qnn_partitioner( use_kv_cache: bool = False, pt2e_quantize: Optional[str] = None, num_sharding: int = 0, + soc_model: str = "SM8650", # default to SM8650 ): assert ( use_kv_cache is True @@ -130,17 +145,17 @@ def get_qnn_partitioner( ) except ImportError: raise ImportError( - "Please install the Qualcomm backend follwing https://pytorch.org/executorch/main/build-run-qualcomm-ai-engine-direct-backend.html" + "Please install the Qualcomm backend following https://pytorch.org/executorch/main/build-run-qualcomm-ai-engine-direct-backend.html" ) use_fp16 = True - skip_node_op_set = {"llama.fallback.default"} + skip_node_op_set = {"llama.fallback.default", "aten.embedding.default"} if pt2e_quantize is not None: use_fp16 = False return QnnPartitioner( # pyre-fixme[16] generate_qnn_executorch_compiler_spec( # pyre-fixme[16] - soc_model=QcomChipset.SM8650, # default to SM8650 # pyre-fixme[16] + soc_model=getattr(QcomChipset, soc_model), # pyre-fixme[16] # pyre-fixme[16] backend_options=generate_htp_compiler_spec( use_fp16=use_fp16, diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index 7fc53358c50..45d9932724e 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -180,8 +180,9 @@ def get_qnn_quantizer( # Due to the error with 16a16w in Qnn Htp, we need to disable per channel linear quantization when use 16a16w # TODO: enable it after the issue is fixed logging.warning( - "Disable per channel quantization for linear due to the error with QNN HTP 16a16w." + "Disable per channel quantization for linear and conv due to the error with QNN HTP 16a16w." 
) + qnn_quantizer.set_per_channel_conv_quant(enable=False) qnn_quantizer.set_per_channel_linear_quant(enable=False) qnn_quantizer.add_16bit_quant_ops(qnn_quantizer.SUPPORTED_OPS) qnn_quantizer.set_bit16_op_quant_config( @@ -208,6 +209,12 @@ def get_qnn_quantizer( quantization_mode is None ), "Currently qnn backend only supports QnnQuantizer via pt2e flow" qnn_quantizer.add_custom_quant_annotations(custom_annotations) + qnn_quantizer.add_discard_ops( + [ + torch.ops.aten.embedding.default, + ] + ) + return qnn_quantizer, quant_dtype diff --git a/extension/llm/runner/multimodal_runner.h b/extension/llm/runner/multimodal_runner.h index 70ecafee810..6798f648a0c 100644 --- a/extension/llm/runner/multimodal_runner.h +++ b/extension/llm/runner/multimodal_runner.h @@ -59,7 +59,8 @@ class MultimodalRunner { const std::string& prompt, int32_t seq_len = 1024, std::function token_callback = {}, - std::function stats_callback = {}) = 0; + std::function stats_callback = {}, + bool echo = true) = 0; /** * Prefill an LLaVA Module with the given images input. @@ -95,6 +96,7 @@ class MultimodalRunner { * @param start_pos The starting position in KV cache of the input in the LLM. * @param token_callback What to do after a token is generated. * @param stats_callback What to do with Stats. + * @param echo Whether to echo the input prompt or not. * @return The error code. */ virtual runtime::Error generate_from_pos( @@ -103,7 +105,8 @@ class MultimodalRunner { int64_t start_pos = 0, std::function token_callback = {}, std::function - stats_callback = {}) = 0; + stats_callback = {}, + bool echo = true) = 0; inline void stop() { text_token_generator_->stop(); diff --git a/extension/llm/runner/text_token_generator.h b/extension/llm/runner/text_token_generator.h index 01887e75600..1726750ece5 100644 --- a/extension/llm/runner/text_token_generator.h +++ b/extension/llm/runner/text_token_generator.h @@ -70,11 +70,8 @@ class TextTokenGenerator { } // initialize tensor wrappers - auto tokens_managed = from_blob( - token_data.data(), - token_shape, - exec_aten::ScalarType::Long, - exec_aten::TensorShapeDynamism::DYNAMIC_BOUND); + auto tokens_managed = + from_blob(token_data.data(), token_shape, exec_aten::ScalarType::Long); auto start_pos_managed = from_blob(&pos, {1}, exec_aten::ScalarType::Long); diff --git a/extension/module/test/module_test.cpp b/extension/module/test/module_test.cpp index 75cead25a72..7db4784dc93 100644 --- a/extension/module/test/module_test.cpp +++ b/extension/module/test/module_test.cpp @@ -15,9 +15,8 @@ #include -using namespace ::testing; - -namespace torch::executor { +using namespace ::executorch::extension; +using namespace ::executorch::runtime; class ModuleTest : public ::testing::Test { protected: @@ -102,13 +101,13 @@ TEST_F(ModuleTest, TestMethodMeta) { const auto input_meta = meta->input_tensor_meta(0); EXPECT_TRUE(input_meta.ok()); - EXPECT_EQ(input_meta->scalar_type(), ScalarType::Float); + EXPECT_EQ(input_meta->scalar_type(), exec_aten::ScalarType::Float); EXPECT_EQ(input_meta->sizes().size(), 1); EXPECT_EQ(input_meta->sizes()[0], 1); const auto output_meta = meta->output_tensor_meta(0); EXPECT_TRUE(output_meta.ok()); - EXPECT_EQ(output_meta->scalar_type(), ScalarType::Float); + EXPECT_EQ(output_meta->scalar_type(), exec_aten::ScalarType::Float); EXPECT_EQ(output_meta->sizes().size(), 1); EXPECT_EQ(output_meta->sizes()[0], 1); } @@ -125,11 +124,11 @@ TEST_F(ModuleTest, TestExecute) { std::array input{1}; std::array sizes{1}; - TensorImpl tensor( - ScalarType::Float, sizes.size(), 
sizes.data(), input.data()); + exec_aten::TensorImpl tensor( + exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = - module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); + const auto result = module.execute( + "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result.ok()); EXPECT_TRUE(result.ok()); @@ -149,11 +148,11 @@ TEST_F(ModuleTest, TestExecutePreload) { std::array input{1}; std::array sizes{1}; - TensorImpl tensor( - ScalarType::Float, sizes.size(), sizes.data(), input.data()); + exec_aten::TensorImpl tensor( + exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = - module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); + const auto result = module.execute( + "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); @@ -169,11 +168,11 @@ TEST_F(ModuleTest, TestExecutePreload_method) { std::array input{1}; std::array sizes{1}; - TensorImpl tensor( - ScalarType::Float, sizes.size(), sizes.data(), input.data()); + exec_aten::TensorImpl tensor( + exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = - module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); + const auto result = module.execute( + "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); @@ -192,11 +191,11 @@ TEST_F(ModuleTest, TestExecutePreloadProgramAndMethod) { std::array input{1}; std::array sizes{1}; - TensorImpl tensor( - ScalarType::Float, sizes.size(), sizes.data(), input.data()); + exec_aten::TensorImpl tensor( + exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = - module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); + const auto result = module.execute( + "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); @@ -225,10 +224,11 @@ TEST_F(ModuleTest, TestGet) { std::array input{1}; std::array sizes{1}; - TensorImpl tensor( - ScalarType::Float, sizes.size(), sizes.data(), input.data()); + exec_aten::TensorImpl tensor( + exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module.get("forward", {Tensor(&tensor), Tensor(&tensor)}); + const auto result = module.get( + "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->toTensor().const_data_ptr(); @@ -240,10 +240,11 @@ TEST_F(ModuleTest, TestForward) { std::array input{1}; std::array sizes{1}; - TensorImpl tensor( - ScalarType::Float, sizes.size(), sizes.data(), input.data()); + exec_aten::TensorImpl tensor( + exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module->forward({Tensor(&tensor), Tensor(&tensor)}); + const auto result = + module->forward({exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); @@ -251,9 +252,10 @@ TEST_F(ModuleTest, TestForward) { EXPECT_NEAR(data[0], 2, 1e-5); std::array input2{2, 3}; - TensorImpl tensor2( - ScalarType::Float, sizes.size(), sizes.data(), input2.data()); - const auto result2 = module->forward({Tensor(&tensor2), Tensor(&tensor2)}); + 
exec_aten::TensorImpl tensor2( + exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input2.data()); + const auto result2 = module->forward( + {exec_aten::Tensor(&tensor2), exec_aten::Tensor(&tensor2)}); EXPECT_TRUE(result2.ok()); const auto data2 = result->at(0).toTensor().const_data_ptr(); @@ -298,10 +300,9 @@ TEST_F(ModuleTest, TestProgramSharingBetweenModules) { } TEST_F(ModuleTest, TestProgramSharingAndDataLoaderManagement) { - auto loader = util::FileDataLoader::from(model_path_.c_str()); + auto loader = FileDataLoader::from(model_path_.c_str()); EXPECT_TRUE(loader.ok()); - auto data_loader = - std::make_unique(std::move(loader.get())); + auto data_loader = std::make_unique(std::move(loader.get())); auto module1 = std::make_unique(std::move(data_loader)); @@ -311,24 +312,24 @@ TEST_F(ModuleTest, TestProgramSharingAndDataLoaderManagement) { std::array input{1}; std::array sizes{1}; - TensorImpl tensor( - ScalarType::Float, sizes.size(), sizes.data(), input.data()); + exec_aten::TensorImpl tensor( + exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); - auto result1 = - module1->execute("forward", {Tensor(&tensor), Tensor(&tensor)}); + auto result1 = module1->execute( + "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result1.ok()); auto module2 = std::make_unique(module1->program()); - auto result2 = - module2->execute("forward", {Tensor(&tensor), Tensor(&tensor)}); + auto result2 = module2->execute( + "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result2.ok()); module1 = std::make_unique("/path/to/nonexistent/file.pte"); EXPECT_FALSE(module1->is_loaded()); - auto result3 = - module2->execute("forward", {Tensor(&tensor), Tensor(&tensor)}); + auto result3 = module2->execute( + "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result3.ok()); } @@ -336,10 +337,10 @@ TEST_F(ModuleTest, TestProgramPersistenceAndReuseAfterModuleDestruction) { std::shared_ptr shared_program; { - auto loader = util::FileDataLoader::from(model_path_.c_str()); + auto loader = FileDataLoader::from(model_path_.c_str()); EXPECT_TRUE(loader.ok()); auto data_loader = - std::make_unique(std::move(loader.get())); + std::make_unique(std::move(loader.get())); auto* data_loader_ptr = data_loader.get(); Module module(std::move(data_loader)); @@ -362,10 +363,11 @@ TEST_F(ModuleTest, TestProgramPersistenceAndReuseAfterModuleDestruction) { std::array input{1}; std::array sizes{1}; - TensorImpl tensor( - ScalarType::Float, sizes.size(), sizes.data(), input.data()); + exec_aten::TensorImpl tensor( + exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); - auto result = module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); + auto result = module.execute( + "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result.ok()); auto data = result->at(0).toTensor().const_data_ptr(); @@ -391,10 +393,14 @@ TEST_F(ModuleTest, TestConcurrentExecutionWithSharedProgram) { const std::array& input) { Module module(program); std::array sizes{1}; - TensorImpl tensor( - ScalarType::Float, sizes.size(), sizes.data(), (void*)input.data()); - - const auto result = module.forward({Tensor(&tensor), Tensor(&tensor)}); + exec_aten::TensorImpl tensor( + exec_aten::ScalarType::Float, + sizes.size(), + sizes.data(), + (void*)input.data()); + + const auto result = module.forward( + {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto 
data = result->at(0).toTensor().const_data_ptr(); @@ -413,5 +419,3 @@ TEST_F(ModuleTest, TestConcurrentExecutionWithSharedProgram) { t4.join(); t5.join(); } - -} // namespace torch::executor diff --git a/extension/pybindings/pybindings.cpp b/extension/pybindings/pybindings.cpp index c605c48c582..57bc44d1394 100644 --- a/extension/pybindings/pybindings.cpp +++ b/extension/pybindings/pybindings.cpp @@ -71,6 +71,7 @@ void et_pal_emit_log_message( } namespace py = pybind11; +using executorch::bundled_program::verify_method_outputs; using ::executorch::extension::BufferDataLoader; using ::executorch::extension::MallocMemoryAllocator; using ::executorch::extension::MmapDataLoader; @@ -79,7 +80,7 @@ using ::executorch::runtime::DataLoader; using ::executorch::runtime::Error; using ::executorch::runtime::EValue; using ::executorch::runtime::EventTracerDebugLogLevel; -using ::executorch::runtime::get_kernels; +using ::executorch::runtime::get_registered_kernels; using ::executorch::runtime::HierarchicalAllocator; using ::executorch::runtime::Kernel; using ::executorch::runtime::MemoryAllocator; @@ -92,8 +93,6 @@ using ::executorch::runtime::Span; using ::executorch::runtime::Tag; using torch::executor::etdump_result; using torch::executor::ETDumpGen; -using torch::executor::bundled_program::LoadBundledInput; -using torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput; #ifndef USE_ATEN_LIB using ::executorch::extension::alias_attensor_to_etensor; @@ -655,11 +654,11 @@ struct PyModule final { const std::string method_name, size_t testset_idx) { const void* bundled_program_ptr = m.get_bundled_program_ptr(); - Error status = LoadBundledInput( + Error status = executorch::bundled_program::load_bundled_input( module_->get_method(method_name), bundled_program_ptr, testset_idx); THROW_IF_ERROR( status, - "LoadBundledInput failed with status %" PRIu32, + "load_bundled_input failed with status 0x%" PRIx32, static_cast(status)); } @@ -671,13 +670,14 @@ struct PyModule final { double atol = 1e-8) { const void* bundled_program_ptr = m.get_bundled_program_ptr(); auto& method = module_->get_method(method_name); - Error status = LoadBundledInput(method, bundled_program_ptr, testset_idx); + Error status = executorch::bundled_program::load_bundled_input( + method, bundled_program_ptr, testset_idx); THROW_IF_ERROR( status, - "LoadBundledInput failed with status %" PRIu32, + "load_bundled_input failed with status 0x%" PRIx32, static_cast(status)); py::list outputs = plan_execute(method_name); - status = VerifyResultWithBundledExpectedOutput( + status = executorch::bundled_program::verify_method_outputs( method, bundled_program_ptr, testset_idx, rtol, atol); THROW_IF_ERROR( status, @@ -774,7 +774,7 @@ void create_profile_block(const std::string& name) { } py::list get_operator_names() { - ArrayRef kernels = get_kernels(); + Span kernels = get_registered_kernels(); py::list res; for (const Kernel& k : kernels) { if (k.name_ != nullptr) { diff --git a/extension/tensor/targets.bzl b/extension/tensor/targets.bzl index 4998b5cf15b..8493d093fa1 100644 --- a/extension/tensor/targets.bzl +++ b/extension/tensor/targets.bzl @@ -15,6 +15,7 @@ def define_common_targets(): srcs = [ "tensor_impl_ptr.cpp", "tensor_ptr.cpp", + "tensor_ptr_maker.cpp", ], exported_headers = [ "tensor.h", diff --git a/extension/tensor/tensor_impl_ptr.h b/extension/tensor/tensor_impl_ptr.h index 3ccede79b1d..5f34f929b96 100644 --- a/extension/tensor/tensor_impl_ptr.h +++ b/extension/tensor/tensor_impl_ptr.h @@ -66,7 +66,7 @@ 
TensorImplPtr make_tensor_impl_ptr(
     std::vector<exec_aten::DimOrderType> dim_order = {},
     std::vector<exec_aten::StridesType> strides = {},
     exec_aten::TensorShapeDynamism dynamism =
-        exec_aten::TensorShapeDynamism::STATIC,
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND,
     std::function<void(void*)> deleter = nullptr);
 
 /**
@@ -93,10 +93,10 @@ TensorImplPtr make_tensor_impl_ptr(
     std::vector<exec_aten::DimOrderType> dim_order = {},
     std::vector<exec_aten::StridesType> strides = {},
     exec_aten::TensorShapeDynamism dynamism =
-        exec_aten::TensorShapeDynamism::STATIC) {
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
   constexpr exec_aten::ScalarType scalar_type =
       runtime::CppTypeToScalarType<T>::value;
-  auto raw_data_ptr = data.data();
+  const auto raw_data_ptr = data.data();
   auto data_ptr = std::make_shared<std::vector<T>>(std::move(data));
   return make_tensor_impl_ptr(
       scalar_type,
@@ -108,6 +108,40 @@ TensorImplPtr make_tensor_impl_ptr(
       [data_ptr = std::move(data_ptr)](void*) {});
 }
 
+/**
+ * Creates a TensorImplPtr that manages a newly created TensorImpl with the
+ * specified properties.
+ *
+ * This template overload is specialized for cases where the tensor data is
+ * provided as a vector. The scalar type is automatically deduced from the
+ * vector's data type. The deleter ensures that the data vector is properly
+ * managed and its lifetime is tied to the TensorImpl.
+ *
+ * @tparam T The C++ type of the tensor elements, deduced from the vector.
+ * @param data A vector containing the tensor's data.
+ * @param dynamism Specifies the mutability of the tensor's shape.
+ * @return A TensorImplPtr that manages the newly created TensorImpl.
+ */
+template <typename T = float>
+TensorImplPtr make_tensor_impl_ptr(
+    std::vector<T> data,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  constexpr exec_aten::ScalarType scalar_type =
+      runtime::CppTypeToScalarType<T>::value;
+  std::vector<exec_aten::SizesType> sizes{exec_aten::SizesType(data.size())};
+  const auto raw_data_ptr = data.data();
+  auto data_ptr = std::make_shared<std::vector<T>>(std::move(data));
+  return make_tensor_impl_ptr(
+      scalar_type,
+      std::move(sizes),
+      raw_data_ptr,
+      {0},
+      {1},
+      dynamism,
+      [data_ptr = std::move(data_ptr)](void*) {});
+}
+
 /**
  * Creates a TensorImplPtr that manages a newly created TensorImpl with the
  * specified properties.
@@ -131,7 +165,7 @@ TensorImplPtr make_tensor_impl_ptr(
     std::vector<exec_aten::DimOrderType> dim_order = {},
     std::vector<exec_aten::StridesType> strides = {},
     exec_aten::TensorShapeDynamism dynamism =
-        exec_aten::TensorShapeDynamism::STATIC);
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND);
 
 } // namespace extension
 } // namespace executorch
diff --git a/extension/tensor/tensor_ptr.h b/extension/tensor/tensor_ptr.h
index 18568876607..f477199a3e1 100644
--- a/extension/tensor/tensor_ptr.h
+++ b/extension/tensor/tensor_ptr.h
@@ -125,7 +125,7 @@ inline TensorPtr make_tensor_ptr(
     std::vector<exec_aten::DimOrderType> dim_order = {},
     std::vector<exec_aten::StridesType> strides = {},
     const exec_aten::TensorShapeDynamism dynamism =
-        exec_aten::TensorShapeDynamism::STATIC,
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND,
     std::function<void(void*)> deleter = nullptr) {
   return make_tensor_ptr(make_tensor_impl_ptr(
       type,
@@ -142,8 +142,7 @@ inline TensorPtr make_tensor_ptr(
  *
  * This template overload is specialized for cases where the tensor data is
  * provided as a vector. The scalar type is automatically deduced from the
- * vector's data type. The deleter ensures that the data vector is properly
- * managed and its lifetime is tied to the TensorImpl.
+ * vector's data type.
  *
  * @tparam T The C++ type of the tensor elements, deduced from the vector.
  * @param sizes A vector specifying the size of each dimension.
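// --- Editor's aside (illustration only; not part of the patch) ---
// Net effect of this hunk and the ones below: a tensor can now be created
// directly from a vector (and, further down, an initializer list) with the
// scalar type deduced, and DYNAMIC_BOUND becomes the default dynamism, so a
// shape may later be resized within its initial bound. A hedged sketch,
// assuming the usual executorch include prefix:
#include <executorch/extension/tensor/tensor_ptr.h>

#include <cstdint>
#include <vector>

void example() {
  // 1-D float tensor with sizes {3}, deduced from the initializer list.
  auto a = executorch::extension::make_tensor_ptr({1.0f, 2.0f, 3.0f});
  // 1-D int tensor with sizes {4}, deduced from the vector's element type.
  auto b = executorch::extension::make_tensor_ptr(std::vector<int32_t>{1, 2, 3, 4});
  // Pass exec_aten::TensorShapeDynamism::STATIC explicitly to retain the
  // previous fixed-shape default.
}
// --- End aside ---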
@@ -160,7 +159,7 @@ TensorPtr make_tensor_ptr( std::vector dim_order = {}, std::vector strides = {}, exec_aten::TensorShapeDynamism dynamism = - exec_aten::TensorShapeDynamism::STATIC) { + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { return make_tensor_ptr(make_tensor_impl_ptr( std::move(sizes), std::move(data), @@ -169,6 +168,47 @@ TensorPtr make_tensor_ptr( dynamism)); } +/** + * Creates a TensorPtr that manages a Tensor with the specified properties. + * + * This template overload is specialized for cases where the tensor data is + * provided as a vector. The scalar type is automatically deduced from the + * vector's data type. + * + * @tparam T The C++ type of the tensor elements, deduced from the vector. + * @param data A vector containing the tensor's data. + * @param dynamism Specifies the mutability of the tensor's shape. + * @return A TensorPtr that manages the newly created TensorImpl. + */ +template +TensorPtr make_tensor_ptr( + std::vector data, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { + return make_tensor_ptr(make_tensor_impl_ptr(std::move(data), dynamism)); +} + +/** + * Creates a TensorPtr that manages a Tensor with the specified properties. + * + * This template overload allows creating a Tensor from an initializer list + * of data. The scalar type is automatically deduced from the type of the + * initializer list's elements. + * + * @tparam T The C++ type of the tensor elements, deduced from the initializer + * list. + * @param data An initializer list containing the tensor's data. + * @param dynamism Specifies the mutability of the tensor's shape. + * @return A TensorPtr that manages the newly created TensorImpl. + */ +template +TensorPtr make_tensor_ptr( + std::initializer_list data, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { + return make_tensor_ptr(std::vector(data), dynamism); +} + /** * Creates a TensorPtr that manages a Tensor with the specified properties. * @@ -191,7 +231,7 @@ inline TensorPtr make_tensor_ptr( std::vector dim_order = {}, std::vector strides = {}, exec_aten::TensorShapeDynamism dynamism = - exec_aten::TensorShapeDynamism::STATIC) { + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { return make_tensor_ptr(make_tensor_impl_ptr( scalar_type, std::move(sizes), diff --git a/extension/tensor/tensor_ptr_maker.cpp b/extension/tensor/tensor_ptr_maker.cpp new file mode 100644 index 00000000000..1a09fea4cac --- /dev/null +++ b/extension/tensor/tensor_ptr_maker.cpp @@ -0,0 +1,177 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +namespace executorch { +namespace extension { +namespace { + +template < + typename INT_T, + typename std::enable_if< + std::is_integral::value && !std::is_same::value, + bool>::type = true> +bool extract_scalar(exec_aten::Scalar scalar, INT_T* out_val) { + if (!scalar.isIntegral(/*includeBool=*/false)) { + return false; + } + int64_t val = scalar.to(); + if (val < std::numeric_limits::lowest() || + val > std::numeric_limits::max()) { + return false; + } + *out_val = static_cast(val); + return true; +} + +template < + typename FLOAT_T, + typename std::enable_if::value, bool>:: + type = true> +bool extract_scalar(exec_aten::Scalar scalar, FLOAT_T* out_val) { + double val; + if (scalar.isFloatingPoint()) { + val = scalar.to(); + if (std::isfinite(val) && + (val < std::numeric_limits::lowest() || + val > std::numeric_limits::max())) { + return false; + } + } else if (scalar.isIntegral(/*includeBool=*/false)) { + val = static_cast(scalar.to()); + } else { + return false; + } + *out_val = static_cast(val); + return true; +} + +template < + typename BOOL_T, + typename std::enable_if::value, bool>::type = + true> +bool extract_scalar(exec_aten::Scalar scalar, BOOL_T* out_val) { + if (scalar.isIntegral(false)) { + *out_val = static_cast(scalar.to()); + return true; + } + if (scalar.isBoolean()) { + *out_val = scalar.to(); + return true; + } + return false; +} + +#define ET_EXTRACT_SCALAR(scalar, out_val) \ + ET_CHECK_MSG( \ + extract_scalar(scalar, &out_val), \ + #scalar " could not be extracted: wrong type or out of range"); + +template +TensorPtr random_strided( + std::vector sizes, + std::vector strides, + exec_aten::ScalarType type, + exec_aten::TensorShapeDynamism dynamism, + Distribution&& distribution) { + auto tensor = + empty_strided(std::move(sizes), std::move(strides), type, dynamism); + std::default_random_engine gen{std::random_device{}()}; + + ET_SWITCH_REALB_TYPES(type, nullptr, "random_strided", CTYPE, [&] { + std::generate_n(tensor->mutable_data_ptr(), tensor->numel(), [&]() { + return static_cast(distribution(gen)); + }); + }); + return tensor; +} + +} // namespace + +TensorPtr empty_strided( + std::vector sizes, + std::vector strides, + exec_aten::ScalarType type, + exec_aten::TensorShapeDynamism dynamism) { + std::vector data( + exec_aten::compute_numel(sizes.data(), sizes.size()) * + exec_aten::elementSize(type)); + return make_tensor_ptr( + type, + std::move(sizes), + std::move(data), + {}, + std::move(strides), + dynamism); +} + +TensorPtr full_strided( + std::vector sizes, + std::vector strides, + exec_aten::Scalar fill_value, + exec_aten::ScalarType type, + exec_aten::TensorShapeDynamism dynamism) { + auto tensor = + empty_strided(std::move(sizes), std::move(strides), type, dynamism); + ET_SWITCH_REALB_TYPES(type, nullptr, "full_strided", CTYPE, [&] { + CTYPE value; + ET_EXTRACT_SCALAR(fill_value, value); + std::fill( + tensor->mutable_data_ptr(), + tensor->mutable_data_ptr() + tensor->numel(), + value); + }); + return tensor; +} + +TensorPtr rand_strided( + std::vector sizes, + std::vector strides, + exec_aten::ScalarType type, + exec_aten::TensorShapeDynamism dynamism) { + return random_strided( + std::move(sizes), + std::move(strides), + type, + dynamism, + std::uniform_real_distribution(0.0f, 1.0f)); +} + +TensorPtr randn_strided( + std::vector sizes, + std::vector strides, + exec_aten::ScalarType type, + exec_aten::TensorShapeDynamism dynamism) { + return random_strided( + std::move(sizes), + std::move(strides), + type, + dynamism, + 
std::normal_distribution(0.0f, 1.0f)); +} + +TensorPtr randint_strided( + int64_t low, + int64_t high, + std::vector sizes, + std::vector strides, + exec_aten::ScalarType type, + exec_aten::TensorShapeDynamism dynamism) { + return random_strided( + std::move(sizes), + std::move(strides), + type, + dynamism, + std::uniform_int_distribution(low, high - 1)); +} + +} // namespace extension +} // namespace executorch diff --git a/extension/tensor/tensor_ptr_maker.h b/extension/tensor/tensor_ptr_maker.h index a08f04c2101..4e65480b7fd 100644 --- a/extension/tensor/tensor_ptr_maker.h +++ b/extension/tensor/tensor_ptr_maker.h @@ -15,7 +15,7 @@ namespace extension { /** * A helper class for creating TensorPtr instances from raw data and tensor - * properties. Note the the TensorPtr created by this class will not own the + * properties. Note that the TensorPtr created by this class will not own the * data, so it must outlive the TensorPtr. * * TensorPtrMaker provides a fluent interface for specifying various properties @@ -31,6 +31,7 @@ class TensorPtrMaker final { // But it is movable. TensorPtrMaker(TensorPtrMaker&&) = default; TensorPtrMaker& operator=(TensorPtrMaker&&) = default; + /** * Sets the scalar type of the tensor elements. * @@ -138,7 +139,7 @@ class TensorPtrMaker final { void* data_ = nullptr; exec_aten::ScalarType type_ = exec_aten::ScalarType::Float; exec_aten::TensorShapeDynamism dynamism_ = - exec_aten::TensorShapeDynamism::STATIC; + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND; }; /** @@ -182,7 +183,7 @@ inline TensorPtr from_blob( std::vector sizes, exec_aten::ScalarType type = exec_aten::ScalarType::Float, exec_aten::TensorShapeDynamism dynamism = - exec_aten::TensorShapeDynamism::STATIC) { + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { return for_blob(data, std::move(sizes), type) .dynamism(dynamism) .make_tensor_ptr(); @@ -210,7 +211,7 @@ inline TensorPtr from_blob( std::vector strides, exec_aten::ScalarType type = exec_aten::ScalarType::Float, exec_aten::TensorShapeDynamism dynamism = - exec_aten::TensorShapeDynamism::STATIC) { + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { return for_blob(data, std::move(sizes), type) .strides(std::move(strides)) .dynamism(dynamism) @@ -239,7 +240,7 @@ inline TensorPtr from_blob( exec_aten::ScalarType type, std::function&& deleter, exec_aten::TensorShapeDynamism dynamism = - exec_aten::TensorShapeDynamism::STATIC) { + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { return for_blob(data, std::move(sizes), type) .deleter(std::move(deleter)) .dynamism(dynamism) @@ -270,7 +271,7 @@ inline TensorPtr from_blob( exec_aten::ScalarType type, std::function&& deleter, exec_aten::TensorShapeDynamism dynamism = - exec_aten::TensorShapeDynamism::STATIC) { + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { return for_blob(data, std::move(sizes), type) .strides(std::move(strides)) .deleter(std::move(deleter)) @@ -278,5 +279,408 @@ inline TensorPtr from_blob( .make_tensor_ptr(); } +/** + * Creates a TensorPtr with the specified sizes, strides, and properties. + * + * This function allocates memory for the tensor elements but does not + * initialize them with any specific values. The tensor is created with the + * specified strides. + * + * @param sizes A vector specifying the size of each dimension. + * @param strides A vector specifying the stride for each dimension. + * @param type The scalar type of the tensor elements. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. 
+ * @return A TensorPtr instance managing the newly created Tensor. + */ +TensorPtr empty_strided( + std::vector sizes, + std::vector strides, + exec_aten::ScalarType type = exec_aten::ScalarType::Float, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND); + +/** + * Creates an empty TensorPtr with the same size and properties as the given + * tensor. + * + * This function allocates memory for the tensor elements but does not + * initialize them with any specific values. + * + * @param other A reference to another tensor, whose size and properties will be + * used. + * @param type The scalar type of the tensor elements. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance managing the newly created Tensor. + */ +inline TensorPtr empty_like( + const TensorPtr& other, + exec_aten::ScalarType type = exec_aten::ScalarType::Undefined, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { + if (type == exec_aten::ScalarType::Undefined) { + type = other->scalar_type(); + } + return empty_strided( + {other->sizes().begin(), other->sizes().end()}, + {other->strides().begin(), other->strides().end()}, + type, + dynamism); +} + +/** + * Creates an empty TensorPtr with the specified sizes and properties. + * + * This function allocates memory for the tensor elements but does not + * initialize them with any specific values. + * + * @param sizes A vector specifying the size of each dimension. + * @param type The scalar type of the tensor elements. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance managing the newly created Tensor. + */ +inline TensorPtr empty( + std::vector sizes, + exec_aten::ScalarType type = exec_aten::ScalarType::Float, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { + return empty_strided(std::move(sizes), {}, type, dynamism); +} + +/** + * Creates a TensorPtr filled with the specified value. + * + * @param sizes A vector specifying the size of each dimension. + * @param strides A vector specifying the stride for each dimension. + * @param fill_value The value to fill the tensor with. + * @param type The scalar type of the tensor elements. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance managing the newly created Tensor. + */ +TensorPtr full_strided( + std::vector sizes, + std::vector strides, + exec_aten::Scalar fill_value, + exec_aten::ScalarType type = exec_aten::ScalarType::Float, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND); + +/** + * Creates a TensorPtr filled with the specified value, with the same size and + * properties as another tensor. + * + * @param other A reference to another tensor, whose size and properties will be + * used. + * @param fill_value The value to fill the tensor with. + * @param type The scalar type of the tensor elements. If not specified, the + * scalar type of the other tensor is used. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance managing the newly created Tensor. 
+ */
+inline TensorPtr full_like(
+    const TensorPtr& other,
+    exec_aten::Scalar fill_value,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Undefined,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  if (type == exec_aten::ScalarType::Undefined) {
+    type = other->scalar_type();
+  }
+  return full_strided(
+      {other->sizes().begin(), other->sizes().end()},
+      {other->strides().begin(), other->strides().end()},
+      fill_value,
+      type,
+      dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with the specified value.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param fill_value The value to fill the tensor with.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr full(
+    std::vector<exec_aten::SizesType> sizes,
+    exec_aten::Scalar fill_value,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Float,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return full_strided(std::move(sizes), {}, fill_value, type, dynamism);
+}
+
+/**
+ * Creates a TensorPtr that holds a scalar value.
+ *
+ * @param value The scalar value to create the tensor with.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created scalar Tensor.
+ */
+inline TensorPtr scalar_tensor(
+    exec_aten::Scalar value,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Float,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return full({}, value, type, dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with ones, with the same size and properties as
+ * another tensor.
+ *
+ * @param other A reference to another tensor, whose size and properties will
+ * be used.
+ * @param type The scalar type of the tensor elements. If not specified, the
+ * scalar type of the `other` tensor is used.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr ones_like(
+    const TensorPtr& other,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Undefined,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return full_like(other, 1, type, dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with ones.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr ones(
+    std::vector<exec_aten::SizesType> sizes,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Float,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return full(std::move(sizes), 1, type, dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with zeros, with the same size and properties as
+ * another tensor.
+ *
+ * @param other A reference to another tensor, whose size and properties will
+ * be used.
+ * @param type The scalar type of the tensor elements. If not specified, the
+ * scalar type of the `other` tensor is used.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr zeros_like(
+    const TensorPtr& other,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Undefined,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return full_like(other, 0, type, dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with zeros.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr zeros(
+    std::vector<exec_aten::SizesType> sizes,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Float,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return full(std::move(sizes), 0, type, dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with random values between 0 and 1.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param strides A vector specifying the stride for each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+TensorPtr rand_strided(
+    std::vector<exec_aten::SizesType> sizes,
+    std::vector<exec_aten::StridesType> strides,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Float,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND);
+
+/**
+ * Creates a TensorPtr filled with random values between 0 and 1.
+ *
+ * @param other A reference to another tensor, whose size and properties will
+ * be used.
+ * @param type The scalar type of the tensor elements. If not specified, the
+ * scalar type of the other tensor is used.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr rand_like(
+    const TensorPtr& other,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Undefined,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  if (type == exec_aten::ScalarType::Undefined) {
+    type = other->scalar_type();
+  }
+  return rand_strided(
+      {other->sizes().begin(), other->sizes().end()},
+      {other->strides().begin(), other->strides().end()},
+      type,
+      dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with random values between 0 and 1.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr rand(
+    std::vector<exec_aten::SizesType> sizes,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Float,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return rand_strided(std::move(sizes), {}, type, dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with random values from a normal distribution.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param strides A vector specifying the stride for each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+TensorPtr randn_strided(
+    std::vector<exec_aten::SizesType> sizes,
+    std::vector<exec_aten::StridesType> strides,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Float,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND);
+
+/**
+ * Creates a TensorPtr filled with random values from a normal distribution.
+ *
+ * @param other A reference to another tensor, whose size and properties will
+ * be used.
+ * @param type The scalar type of the tensor elements. If not specified, the
+ * scalar type of the other tensor is used.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr randn_like(
+    const TensorPtr& other,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Undefined,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  if (type == exec_aten::ScalarType::Undefined) {
+    type = other->scalar_type();
+  }
+  return randn_strided(
+      {other->sizes().begin(), other->sizes().end()},
+      {other->strides().begin(), other->strides().end()},
+      type,
+      dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with random values from a normal distribution.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr randn(
+    std::vector<exec_aten::SizesType> sizes,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Float,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return randn_strided(std::move(sizes), {}, type, dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with random integer values in the given range.
+ *
+ * @param low The lower bound (inclusive) of the random values.
+ * @param high The upper bound (exclusive) of the random values.
+ * @param sizes A vector specifying the size of each dimension.
+ * @param strides A vector specifying the stride for each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+TensorPtr randint_strided(
+    int64_t low,
+    int64_t high,
+    std::vector<exec_aten::SizesType> sizes,
+    std::vector<exec_aten::StridesType> strides,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Int,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND);
+
+/**
+ * Creates a TensorPtr filled with random integer values in the given range.
+ *
+ * @param other A reference to another tensor, whose size and properties will
+ * be used.
+ * @param low The lower bound (inclusive) of the random values.
+ * @param high The upper bound (exclusive) of the random values.
+ * @param type The scalar type of the tensor elements. If not specified, the
+ * scalar type of the other tensor is used.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr randint_like(
+    const TensorPtr& other,
+    int64_t low,
+    int64_t high,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Undefined,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  if (type == exec_aten::ScalarType::Undefined) {
+    type = other->scalar_type();
+  }
+  return randint_strided(
+      low,
+      high,
+      {other->sizes().begin(), other->sizes().end()},
+      {other->strides().begin(), other->strides().end()},
+      type,
+      dynamism);
+}
+
+/**
+ * Creates a TensorPtr filled with random integer values in the given range.
+ *
+ * @param low The lower bound (inclusive) of the random values.
+ * @param high The upper bound (exclusive) of the random values.
+ * @param sizes A vector specifying the size of each dimension.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr randint(
+    int64_t low,
+    int64_t high,
+    std::vector<exec_aten::SizesType> sizes,
+    exec_aten::ScalarType type = exec_aten::ScalarType::Int,
+    exec_aten::TensorShapeDynamism dynamism =
+        exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return randint_strided(low, high, std::move(sizes), {}, type, dynamism);
+}
+
 } // namespace extension
 } // namespace executorch
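For orientation, the helpers added above compose into one-liners for common tensor setups. A minimal usage sketch (not part of the patch; the include path and namespace are assumptions based on where these files live):

#include <executorch/extension/tensor/tensor_ptr_maker.h>

using namespace executorch::extension;

void creation_examples() {
  auto a = ones({2, 3});                                 // 2x3 Float, all 1.0f
  auto b = zeros({4, 4}, exec_aten::ScalarType::Long);   // 4x4 int64, all 0
  auto c = full_like(a, 7);                              // dtype inherited from `a`
  auto s = scalar_tensor(3.14f);                         // 0-dim, numel() == 1
  auto r = randint(10, 20, {3, 3});                      // Int values in [10, 20)
}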
diff --git a/extension/tensor/test/tensor_impl_ptr_test.cpp b/extension/tensor/test/tensor_impl_ptr_test.cpp
index 45d79f240af..f7fd062c462 100644
--- a/extension/tensor/test/tensor_impl_ptr_test.cpp
+++ b/extension/tensor/test/tensor_impl_ptr_test.cpp
@@ -23,6 +23,29 @@ class TensorImplPtrTest : public ::testing::Test {
   }
 };
 
+TEST_F(TensorImplPtrTest, ScalarTensorCreation) {
+  float scalar_data = 3.14f;
+  auto tensor_impl =
+      make_tensor_impl_ptr(exec_aten::ScalarType::Float, {}, &scalar_data);
+
+  EXPECT_EQ(tensor_impl->numel(), 1);
+  EXPECT_EQ(tensor_impl->dim(), 0);
+  EXPECT_EQ(tensor_impl->sizes().size(), 0);
+  EXPECT_EQ(tensor_impl->strides().size(), 0);
+  EXPECT_EQ((float*)tensor_impl->data(), &scalar_data);
+  EXPECT_EQ(((float*)tensor_impl->data())[0], 3.14f);
+}
+
+TEST_F(TensorImplPtrTest, ScalarTensorOwningData) {
+  auto tensor_impl = make_tensor_impl_ptr({}, {3.14f});
+
+  EXPECT_EQ(tensor_impl->numel(), 1);
+  EXPECT_EQ(tensor_impl->dim(), 0);
+  EXPECT_EQ(tensor_impl->sizes().size(), 0);
+  EXPECT_EQ(tensor_impl->strides().size(), 0);
+  EXPECT_EQ(((float*)tensor_impl->data())[0], 3.14f);
+}
+
 TEST_F(TensorImplPtrTest, TensorImplCreation) {
   float data[20] = {2};
   auto tensor_impl = make_tensor_impl_ptr(
@@ -34,8 +57,8 @@ TEST_F(TensorImplPtrTest, TensorImplCreation) {
   EXPECT_EQ(tensor_impl->strides()[0], 5);
   EXPECT_EQ(tensor_impl->strides()[1], 1);
   EXPECT_EQ(tensor_impl->data(), data);
-  EXPECT_EQ(tensor_impl->mutable_data(), data);
-  EXPECT_EQ(((float*)tensor_impl->mutable_data())[0], 2);
+  EXPECT_EQ(tensor_impl->data(), data);
+  EXPECT_EQ(((float*)tensor_impl->data())[0], 2);
 }
 
 TEST_F(TensorImplPtrTest, TensorImplSharedOwnership) {
@@ -145,7 +168,7 @@ TEST_F(TensorImplPtrTest, TensorImplDataDeleterReleasesCapturedSharedPtr) {
       data_ptr.get(),
       {},
       {},
-      exec_aten::TensorShapeDynamism::STATIC,
+      exec_aten::TensorShapeDynamism::DYNAMIC_BOUND,
       [data_ptr, &deleter_called](void*) mutable { deleter_called = true; });
 
   EXPECT_EQ(data_ptr.use_count(), 2);
@@ -172,7 +195,7 @@ TEST_F(TensorImplPtrTest, TensorImplOwningData) {
 }
 
 TEST_F(TensorImplPtrTest, TensorImplOwningEmptyData) {
-  auto tensor_impl = make_tensor_impl_ptr({0, 5}, {});
+  auto tensor_impl = make_tensor_impl_ptr({0, 5}, std::vector<float>());
 
   EXPECT_EQ(tensor_impl->dim(), 2);
   EXPECT_EQ(tensor_impl->size(0), 0);
@@ -182,6 +205,74 @@ TEST_F(TensorImplPtrTest, TensorImplOwningEmptyData) {
   EXPECT_EQ(tensor_impl->data(), nullptr);
 }
 
+TEST_F(TensorImplPtrTest, TensorImplDataOnlyDoubleType) {
+  std::vector<double> data = {1.0, 2.0, 3.0, 4.0};
+  auto tensor_impl = make_tensor_impl_ptr(std::move(data));
+
+  EXPECT_EQ(tensor_impl->dim(), 1);
+  EXPECT_EQ(tensor_impl->size(0), 4);
+  EXPECT_EQ(tensor_impl->strides()[0], 1);
+  EXPECT_EQ(((double*)tensor_impl->data())[0], 1.0);
+  EXPECT_EQ(((double*)tensor_impl->data())[3], 4.0);
+}
+
+TEST_F(TensorImplPtrTest, TensorImplDataOnlyInt32Type) {
+  std::vector<int32_t> data = {10, 20, 30, 40};
+  auto tensor_impl = make_tensor_impl_ptr(std::move(data));
+
+  EXPECT_EQ(tensor_impl->dim(), 1);
+  EXPECT_EQ(tensor_impl->size(0), 4);
+  EXPECT_EQ(tensor_impl->strides()[0], 1);
+  EXPECT_EQ(((int32_t*)tensor_impl->data())[0], 10);
+  EXPECT_EQ(((int32_t*)tensor_impl->data())[3], 40);
+}
+
+TEST_F(TensorImplPtrTest, TensorImplDataOnlyInt64Type) {
+  std::vector<int64_t> data = {100, 200, 300, 400};
+  auto tensor_impl = make_tensor_impl_ptr(std::move(data));
+
+  EXPECT_EQ(tensor_impl->dim(), 1);
+  EXPECT_EQ(tensor_impl->size(0), 4);
+  EXPECT_EQ(tensor_impl->strides()[0], 1);
+  EXPECT_EQ(((int64_t*)tensor_impl->data())[0], 100);
+  EXPECT_EQ(((int64_t*)tensor_impl->data())[3], 400);
+}
+
+TEST_F(TensorImplPtrTest, TensorImplDataOnlyUint8Type) {
+  std::vector<uint8_t> data = {10, 20, 30, 40};
+  auto tensor_impl = make_tensor_impl_ptr(std::move(data));
+
+  EXPECT_EQ(tensor_impl->dim(), 1);
+  EXPECT_EQ(tensor_impl->size(0), 4);
+  EXPECT_EQ(tensor_impl->strides()[0], 1);
+  EXPECT_EQ(((uint8_t*)tensor_impl->data())[0], 10);
+  EXPECT_EQ(((uint8_t*)tensor_impl->data())[3], 40);
+}
+
+TEST_F(TensorImplPtrTest, TensorImplAmbiguityWithMixedVectors) {
+  std::vector<exec_aten::SizesType> sizes = {2, 2};
+  std::vector<float> data = {1.0f, 2.0f, 3.0f, 4.0f};
+  auto tensor_impl = make_tensor_impl_ptr(std::move(sizes), std::move(data));
+
+  EXPECT_EQ(tensor_impl->dim(), 2);
+  EXPECT_EQ(tensor_impl->size(0), 2);
+  EXPECT_EQ(tensor_impl->size(1), 2);
+  EXPECT_EQ(tensor_impl->strides()[0], 2);
+  EXPECT_EQ(tensor_impl->strides()[1], 1);
+  EXPECT_EQ(((float*)tensor_impl->data())[0], 1.0f);
+  EXPECT_EQ(((float*)tensor_impl->data())[3], 4.0f);
+
+  auto tensor_impl2 = make_tensor_impl_ptr({2, 2}, {1.0f, 2.0f, 3.0f, 4.0f});
+
+  EXPECT_EQ(tensor_impl2->dim(), 2);
+  EXPECT_EQ(tensor_impl2->size(0), 2);
+  EXPECT_EQ(tensor_impl2->size(1), 2);
+  EXPECT_EQ(tensor_impl2->strides()[0], 2);
+  EXPECT_EQ(tensor_impl2->strides()[1], 1);
+  EXPECT_EQ(((float*)tensor_impl2->data())[0], 1.0f);
+  EXPECT_EQ(((float*)tensor_impl2->data())[3], 4.0f);
+}
+
 TEST_F(TensorImplPtrTest, SharedDataManagement) {
   auto data = std::make_shared<std::vector<float>>(100, 1.0f);
   auto tensor_impl1 = make_tensor_impl_ptr(
@@ -212,7 +303,7 @@ TEST_F(TensorImplPtrTest, CustomDeleterWithSharedData) {
       data->data(),
       {},
       {},
-      exec_aten::TensorShapeDynamism::STATIC,
+      exec_aten::TensorShapeDynamism::DYNAMIC_BOUND,
       [data, &deleter_called](void*) mutable {
         deleter_called = true;
         data.reset();
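Two behaviors these tests pin down are worth spelling out: an empty sizes list produces a zero-dimensional tensor that still holds exactly one element, and the data-only overload deduces the scalar type from the vector's element type. A standalone sketch mirroring the assertions above:

// Zero-dim: dim() == 0 but numel() == 1.
auto s = make_tensor_impl_ptr({}, {3.14f});
// Dtype deduction: the element type selects the ScalarType.
auto d = make_tensor_impl_ptr(std::vector<double>{1.0, 2.0}); // ScalarType::Double
auto i = make_tensor_impl_ptr(std::vector<int32_t>{1, 2});    // ScalarType::Int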
diff --git a/extension/tensor/test/tensor_ptr_maker_test.cpp b/extension/tensor/test/tensor_ptr_maker_test.cpp
index d1b4179a260..41f3fa21439 100644
--- a/extension/tensor/test/tensor_ptr_maker_test.cpp
+++ b/extension/tensor/test/tensor_ptr_maker_test.cpp
@@ -178,3 +178,262 @@ TEST_F(TensorPtrMakerTest, TensorDeleterReleasesCapturedSharedPtr) {
   EXPECT_TRUE(deleter_called);
   EXPECT_EQ(data_ptr.use_count(), 1);
 }
+
+TEST_F(TensorPtrMakerTest, CreateEmpty) {
+  auto tensor = empty({4, 5});
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->size(1), 5);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float);
+
+  auto tensor2 = empty({4, 5}, exec_aten::ScalarType::Int);
+  EXPECT_EQ(tensor2->dim(), 2);
+  EXPECT_EQ(tensor2->size(0), 4);
+  EXPECT_EQ(tensor2->size(1), 5);
+  EXPECT_EQ(tensor2->scalar_type(), exec_aten::ScalarType::Int);
+
+  auto tensor3 = empty({4, 5}, exec_aten::ScalarType::Long);
+  EXPECT_EQ(tensor3->dim(), 2);
+  EXPECT_EQ(tensor3->size(0), 4);
+  EXPECT_EQ(tensor3->size(1), 5);
+  EXPECT_EQ(tensor3->scalar_type(), exec_aten::ScalarType::Long);
+
+  auto tensor4 = empty({4, 5}, exec_aten::ScalarType::Double);
+  EXPECT_EQ(tensor4->dim(), 2);
+  EXPECT_EQ(tensor4->size(0), 4);
+  EXPECT_EQ(tensor4->size(1), 5);
+  EXPECT_EQ(tensor4->scalar_type(), exec_aten::ScalarType::Double);
+}
+
+TEST_F(TensorPtrMakerTest, CreateFull) {
+  auto tensor = full({4, 5}, 7);
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->size(1), 5);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float);
+  EXPECT_EQ(tensor->const_data_ptr<float>()[0], 7);
+
+  auto tensor2 = full({4, 5}, 3, exec_aten::ScalarType::Int);
+  EXPECT_EQ(tensor2->dim(), 2);
+  EXPECT_EQ(tensor2->size(0), 4);
+  EXPECT_EQ(tensor2->size(1), 5);
+  EXPECT_EQ(tensor2->scalar_type(), exec_aten::ScalarType::Int);
+  EXPECT_EQ(tensor2->const_data_ptr<int32_t>()[0], 3);
+
+  auto tensor3 = full({4, 5}, 9, exec_aten::ScalarType::Long);
+  EXPECT_EQ(tensor3->dim(), 2);
+  EXPECT_EQ(tensor3->size(0), 4);
+  EXPECT_EQ(tensor3->size(1), 5);
+  EXPECT_EQ(tensor3->scalar_type(), exec_aten::ScalarType::Long);
+  EXPECT_EQ(tensor3->const_data_ptr<int64_t>()[0], 9);
+
+  auto tensor4 = full({4, 5}, 11, exec_aten::ScalarType::Double);
+  EXPECT_EQ(tensor4->dim(), 2);
+  EXPECT_EQ(tensor4->size(0), 4);
+  EXPECT_EQ(tensor4->size(1), 5);
+  EXPECT_EQ(tensor4->scalar_type(), exec_aten::ScalarType::Double);
+  EXPECT_EQ(tensor4->const_data_ptr<double>()[0], 11);
+}
+
+TEST_F(TensorPtrMakerTest, CreateScalar) {
+  auto tensor = scalar_tensor(3.14f);
+
+  EXPECT_EQ(tensor->dim(), 0);
+  EXPECT_EQ(tensor->numel(), 1);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float);
+  EXPECT_EQ(tensor->const_data_ptr<float>()[0], 3.14f);
+
+  auto tensor2 = scalar_tensor(5, exec_aten::ScalarType::Int);
+
+  EXPECT_EQ(tensor2->dim(), 0);
+  EXPECT_EQ(tensor2->numel(), 1);
+  EXPECT_EQ(tensor2->scalar_type(), exec_aten::ScalarType::Int);
+  EXPECT_EQ(tensor2->const_data_ptr<int32_t>()[0], 5);
+
+  auto tensor3 = scalar_tensor(7.0, exec_aten::ScalarType::Double);
+
+  EXPECT_EQ(tensor3->dim(), 0);
+  EXPECT_EQ(tensor3->numel(), 1);
+  EXPECT_EQ(tensor3->scalar_type(), exec_aten::ScalarType::Double);
+  EXPECT_EQ(tensor3->const_data_ptr<double>()[0], 7.0);
+}
+
+TEST_F(TensorPtrMakerTest, CreateOnes) {
+  auto tensor = ones({4, 5});
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->size(1), 5);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float);
+  EXPECT_EQ(tensor->const_data_ptr<float>()[0], 1);
+
+  auto tensor2 = ones({4, 5}, exec_aten::ScalarType::Int);
+  EXPECT_EQ(tensor2->dim(), 2);
+  EXPECT_EQ(tensor2->size(0), 4);
+  EXPECT_EQ(tensor2->size(1), 5);
+  EXPECT_EQ(tensor2->scalar_type(), exec_aten::ScalarType::Int);
+  EXPECT_EQ(tensor2->const_data_ptr<int32_t>()[0], 1);
+
+  auto tensor3 = ones({4, 5}, exec_aten::ScalarType::Long);
+  EXPECT_EQ(tensor3->dim(), 2);
+  EXPECT_EQ(tensor3->size(0), 4);
+  EXPECT_EQ(tensor3->size(1), 5);
+  EXPECT_EQ(tensor3->scalar_type(), exec_aten::ScalarType::Long);
+  EXPECT_EQ(tensor3->const_data_ptr<int64_t>()[0], 1);
+
+  auto tensor4 = ones({4, 5}, exec_aten::ScalarType::Double);
+  EXPECT_EQ(tensor4->dim(), 2);
+  EXPECT_EQ(tensor4->size(0), 4);
+  EXPECT_EQ(tensor4->size(1), 5);
+  EXPECT_EQ(tensor4->scalar_type(), exec_aten::ScalarType::Double);
+  EXPECT_EQ(tensor4->const_data_ptr<double>()[0], 1);
+}
+
+TEST_F(TensorPtrMakerTest, CreateZeros) {
+  auto tensor = zeros({4, 5});
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->size(1), 5);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float);
+  EXPECT_EQ(tensor->const_data_ptr<float>()[0], 0);
+
+  auto tensor2 = zeros({4, 5}, exec_aten::ScalarType::Int);
+  EXPECT_EQ(tensor2->dim(), 2);
+  EXPECT_EQ(tensor2->size(0), 4);
+  EXPECT_EQ(tensor2->size(1), 5);
+  EXPECT_EQ(tensor2->scalar_type(), exec_aten::ScalarType::Int);
+  EXPECT_EQ(tensor2->const_data_ptr<int32_t>()[0], 0);
+
+  auto tensor3 = zeros({4, 5}, exec_aten::ScalarType::Long);
+  EXPECT_EQ(tensor3->dim(), 2);
+  EXPECT_EQ(tensor3->size(0), 4);
+  EXPECT_EQ(tensor3->size(1), 5);
+  EXPECT_EQ(tensor3->scalar_type(), exec_aten::ScalarType::Long);
+  EXPECT_EQ(tensor3->const_data_ptr<int64_t>()[0], 0);
+
+  auto tensor4 = zeros({4, 5}, exec_aten::ScalarType::Double);
+  EXPECT_EQ(tensor4->dim(), 2);
+  EXPECT_EQ(tensor4->size(0), 4);
+  EXPECT_EQ(tensor4->size(1), 5);
+  EXPECT_EQ(tensor4->scalar_type(), exec_aten::ScalarType::Double);
+  EXPECT_EQ(tensor4->const_data_ptr<double>()[0], 0);
+}
+
+TEST_F(TensorPtrMakerTest, CreateRandTensor) {
+  auto tensor = rand({4, 5});
+
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->size(1), 5);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float);
+
+  for (auto i = 0; i < tensor->numel(); ++i) {
+    auto val = tensor->const_data_ptr<float>()[i];
+    EXPECT_GE(val, 0.0f);
+    EXPECT_LT(val, 1.0f);
+  }
+}
+
+TEST_F(TensorPtrMakerTest, CreateRandTensorWithIntType) {
+  auto tensor = rand({4, 5}, exec_aten::ScalarType::Int);
+
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->size(1), 5);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Int);
+
+  for (auto i = 0; i < tensor->numel(); ++i) {
+    auto val = tensor->const_data_ptr<int32_t>()[i];
+    EXPECT_EQ(val, 0);
+  }
+}
+
+TEST_F(TensorPtrMakerTest, CreateRandTensorWithDoubleType) {
+  auto tensor = rand({4, 5}, exec_aten::ScalarType::Double);
+
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->size(1), 5);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Double);
+
+  for (auto i = 0; i < tensor->numel(); ++i) {
+    auto val = tensor->const_data_ptr<double>()[i];
+    EXPECT_GE(val, 0.0);
+    EXPECT_LT(val, 1.0);
+  }
+}
+
+TEST_F(TensorPtrMakerTest, CreateRandnTensor) {
+  auto tensor = randn({4, 5});
+
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->size(1), 5);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float);
+
+  auto sum = 0.0f;
+  for (auto i = 0; i < tensor->numel(); ++i) {
+    sum += tensor->const_data_ptr<float>()[i];
+  }
+  const auto average = sum / tensor->numel();
+  EXPECT_NEAR(average, 0.0f, 0.5f);
+}
+
+TEST_F(TensorPtrMakerTest, CreateRandnTensorWithDoubleType) {
+  auto tensor = randn({4, 5}, exec_aten::ScalarType::Double);
+
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->size(1), 5);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Double);
+
+  auto sum = 0.0;
+  for (auto i = 0; i < tensor->numel(); ++i) {
+    sum += tensor->const_data_ptr<double>()[i];
+  }
+  const auto average = sum / tensor->numel();
+  EXPECT_NEAR(average, 0.0, 0.5);
+}
+
+TEST_F(TensorPtrMakerTest, CreateRandIntTensorWithIntType) {
+  auto tensor = randint(10, 20, {4, 5}, exec_aten::ScalarType::Int);
+
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->size(1), 5);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Int);
+
+  for (auto i = 0; i < tensor->numel(); ++i) {
+    auto val = tensor->const_data_ptr<int32_t>()[i];
+    EXPECT_GE(val, 10);
+    EXPECT_LT(val, 20);
+  }
+}
+
+TEST_F(TensorPtrMakerTest, CreateRandIntTensorWithLongType) {
+  auto tensor = randint(10, 20, {4, 5}, exec_aten::ScalarType::Long);
+
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->size(1), 5);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Long);
+
+  for (auto i = 0; i < tensor->numel(); ++i) {
+    auto val = tensor->const_data_ptr<int64_t>()[i];
+    EXPECT_GE(val, 10);
+    EXPECT_LT(val, 20);
+  }
+}
+
+TEST_F(TensorPtrMakerTest, CreateRandnTensorWithIntType) {
+  auto tensor = rand({4, 5}, exec_aten::ScalarType::Int);
+
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->size(1), 5);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Int);
+
+  for (auto i = 0; i < tensor->numel(); ++i) {
+    auto val = tensor->const_data_ptr<int32_t>()[i];
+    EXPECT_EQ(val, 0);
+  }
+}
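A note on the randn checks above: with 20 i.i.d. samples from N(0, 1), the sample mean has standard error 1/sqrt(20) ≈ 0.224, so the 0.5 tolerance is roughly a 2.2-sigma band and a correct implementation should only rarely land outside it. The rand-with-Int tests, by contrast, assert all zeros: uniform values in [0, 1) truncate to 0 for integral dtypes, and the tests document that behavior rather than a random integer fill.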
diff --git a/extension/tensor/test/tensor_ptr_test.cpp b/extension/tensor/test/tensor_ptr_test.cpp
index 1542824fb73..653e2ef98d7 100644
--- a/extension/tensor/test/tensor_ptr_test.cpp
+++ b/extension/tensor/test/tensor_ptr_test.cpp
@@ -22,6 +22,28 @@ class TensorPtrTest : public ::testing::Test {
   }
 };
 
+TEST_F(TensorPtrTest, ScalarTensorCreation) {
+  float scalar_data = 3.14f;
+  auto tensor = make_tensor_ptr(exec_aten::ScalarType::Float, {}, &scalar_data);
+
+  EXPECT_EQ(tensor->numel(), 1);
+  EXPECT_EQ(tensor->dim(), 0);
+  EXPECT_EQ(tensor->sizes().size(), 0);
+  EXPECT_EQ(tensor->strides().size(), 0);
+  EXPECT_EQ(tensor->const_data_ptr<float>(), &scalar_data);
+  EXPECT_EQ(tensor->const_data_ptr<float>()[0], 3.14f);
+}
+
+TEST_F(TensorPtrTest, ScalarTensorOwningData) {
+  auto tensor = make_tensor_ptr({}, {3.14f});
+
+  EXPECT_EQ(tensor->numel(), 1);
+  EXPECT_EQ(tensor->dim(), 0);
+  EXPECT_EQ(tensor->sizes().size(), 0);
+  EXPECT_EQ(tensor->strides().size(), 0);
+  EXPECT_EQ(tensor->const_data_ptr<float>()[0], 3.14f);
+}
+
 TEST_F(TensorPtrTest, CreateTensorWithStridesAndDimOrder) {
   float data[20] = {2};
   auto tensor = make_tensor_ptr(
@@ -98,7 +120,7 @@ TEST_F(TensorPtrTest, TensorWithCustomDataDeleter) {
       data,
       {},
       {},
-      exec_aten::TensorShapeDynamism::STATIC,
+      exec_aten::TensorShapeDynamism::DYNAMIC_BOUND,
       [&deleter_called](void* ptr) {
         deleter_called = true;
         delete[] static_cast<float*>(ptr);
@@ -118,7 +140,7 @@ TEST_F(TensorPtrTest, TensorManagesMovedVector) {
       data_ptr,
       {},
       {},
-      exec_aten::TensorShapeDynamism::STATIC,
+      exec_aten::TensorShapeDynamism::DYNAMIC_BOUND,
       [moved_data = std::move(data), &deleter_called](void*) mutable {
         deleter_called = true;
       });
@@ -140,7 +162,7 @@ TEST_F(TensorPtrTest, TensorDeleterReleasesCapturedSharedPtr) {
       data_ptr.get(),
       {},
       {},
-      exec_aten::TensorShapeDynamism::STATIC,
+      exec_aten::TensorShapeDynamism::DYNAMIC_BOUND,
       [data_ptr, &deleter_called](void*) mutable { deleter_called = true; });
 
   EXPECT_EQ(data_ptr.use_count(), 2);
@@ -167,7 +189,7 @@ TEST_F(TensorPtrTest, TensorOwningData) {
 }
 
 TEST_F(TensorPtrTest, TensorOwningEmptyData) {
-  auto tensor = make_tensor_ptr({0, 5}, {});
+  auto tensor = make_tensor_ptr({0, 5}, std::vector<float>());
 
   EXPECT_EQ(tensor->dim(), 2);
   EXPECT_EQ(tensor->size(0), 0);
@@ -175,6 +197,90 @@ TEST_F(TensorPtrTest, TensorOwningEmptyData) {
   EXPECT_EQ(tensor->strides()[0], 5);
   EXPECT_EQ(tensor->strides()[1], 1);
   EXPECT_EQ(tensor->data_ptr(), nullptr);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float);
 }
 
+TEST_F(TensorPtrTest, TensorImplDataOnly) {
+  auto tensor = make_tensor_ptr({1.0f, 2.0f, 3.0f, 4.0f});
+
+  EXPECT_EQ(tensor->dim(), 1);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->strides()[0], 1);
+  EXPECT_EQ(tensor->const_data_ptr<float>()[0], 1.0);
+  EXPECT_EQ(tensor->const_data_ptr<float>()[3], 4.0);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float);
+}
+
+TEST_F(TensorPtrTest, TensorImplDataOnlyDoubleType) {
+  std::vector<double> data = {1.0, 2.0, 3.0, 4.0};
+  auto tensor = make_tensor_ptr(std::move(data));
+
+  EXPECT_EQ(tensor->dim(), 1);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->strides()[0], 1);
+  EXPECT_EQ(tensor->const_data_ptr<double>()[0], 1.0);
+  EXPECT_EQ(tensor->const_data_ptr<double>()[3], 4.0);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Double);
+}
+
+TEST_F(TensorPtrTest, TensorImplDataOnlyInt32Type) {
+  std::vector<int32_t> data = {10, 20, 30, 40};
+  auto tensor = make_tensor_ptr(std::move(data));
+
+  EXPECT_EQ(tensor->dim(), 1);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->strides()[0], 1);
+  EXPECT_EQ(tensor->const_data_ptr<int32_t>()[0], 10);
+  EXPECT_EQ(tensor->const_data_ptr<int32_t>()[3], 40);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Int);
+}
+
+TEST_F(TensorPtrTest, TensorImplDataOnlyInt64Type) {
+  std::vector<int64_t> data = {100, 200, 300, 400};
+  auto tensor = make_tensor_ptr(std::move(data));
+
+  EXPECT_EQ(tensor->dim(), 1);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->strides()[0], 1);
+  EXPECT_EQ(tensor->const_data_ptr<int64_t>()[0], 100);
+  EXPECT_EQ(tensor->const_data_ptr<int64_t>()[3], 400);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Long);
+}
+
+TEST_F(TensorPtrTest, TensorImplDataOnlyUint8Type) {
+  std::vector<uint8_t> data = {10, 20, 30, 40};
+  auto tensor = make_tensor_ptr(std::move(data));
+
+  EXPECT_EQ(tensor->dim(), 1);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->strides()[0], 1);
+  EXPECT_EQ(tensor->const_data_ptr<uint8_t>()[0], 10);
+  EXPECT_EQ(tensor->const_data_ptr<uint8_t>()[3], 40);
+  EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Byte);
+}
+
+TEST_F(TensorPtrTest, TensorImplAmbiguityWithMixedVectors) {
+  std::vector<exec_aten::SizesType> sizes = {2, 2};
+  std::vector<float> data = {1.0f, 2.0f, 3.0f, 4.0f};
+  auto tensor = make_tensor_ptr(std::move(sizes), std::move(data));
+
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 2);
+  EXPECT_EQ(tensor->size(1), 2);
+  EXPECT_EQ(tensor->strides()[0], 2);
+  EXPECT_EQ(tensor->strides()[1], 1);
+  EXPECT_EQ(tensor->const_data_ptr<float>()[0], 1.0f);
+  EXPECT_EQ(tensor->const_data_ptr<float>()[3], 4.0f);
+
+  auto tensor2 = make_tensor_ptr({2, 2}, {1.0f, 2.0f, 3.0f, 4.0f});
+
+  EXPECT_EQ(tensor2->dim(), 2);
+  EXPECT_EQ(tensor2->size(0), 2);
+  EXPECT_EQ(tensor2->size(1), 2);
+  EXPECT_EQ(tensor2->strides()[0], 2);
+  EXPECT_EQ(tensor2->strides()[1], 1);
+  EXPECT_EQ(tensor2->const_data_ptr<float>()[0], 1.0f);
+  EXPECT_EQ(tensor2->const_data_ptr<float>()[3], 4.0f);
+}
+
 TEST_F(TensorPtrTest, TensorSharingImplModifiesSharedDataVector) {
diff --git a/kernels/aten/functions.yaml b/kernels/aten/functions.yaml
index 1350fc090b0..e63863fc048 100644
--- a/kernels/aten/functions.yaml
+++ b/kernels/aten/functions.yaml
@@ -215,6 +215,8 @@
 
 - op: linalg_vector_norm.out
 
+- op: linear.out
+
 - op: log.out
 
 - op: log10.out
diff --git a/kernels/optimized/blas/CPUBlas.cpp b/kernels/optimized/blas/CPUBlas.cpp
index 35b208d30fc..99003f8f0ea 100644
--- a/kernels/optimized/blas/CPUBlas.cpp
+++ b/kernels/optimized/blas/CPUBlas.cpp
@@ -173,5 +173,28 @@ void gemm(
 }
 // clang-format on
 
+// clang-format off
+void gemm(
+    TransposeType transa, TransposeType transb,
+    int64_t m, int64_t n, int64_t k,
+    const BFloat16 alpha,
+    const BFloat16 *a, int64_t lda,
+    const BFloat16 *b, int64_t ldb,
+    const BFloat16 beta,
+    BFloat16 *c, int64_t ldc) {
+  normalize_last_dims(transa, transb, m, n, k, &lda, &ldb, &ldc);
+
+  using acc_type = utils::compute_dtype<BFloat16>;
+  gemm_impl(
+      transa, transb,
+      m, n, k,
+      static_cast<acc_type>(alpha),
+      a, lda,
+      b, ldb,
+      static_cast<acc_type>(beta),
+      c, ldc);
+}
+// clang-format on
+
 } // namespace cpublas
 } // namespace executorch
diff --git a/kernels/optimized/blas/CPUBlas.h b/kernels/optimized/blas/CPUBlas.h
index dd4a24cbce0..71e50601238 100644
--- a/kernels/optimized/blas/CPUBlas.h
+++ b/kernels/optimized/blas/CPUBlas.h
@@ -17,6 +17,7 @@
 namespace executorch {
 namespace cpublas {
 
+using BFloat16 = torch::executor::BFloat16;
 using Half = torch::executor::Half;
 
 enum class TransposeType {
@@ -104,6 +105,15 @@ void gemm(
     const Half *b, int64_t ldb,
     const Half beta,
     Half *c, int64_t ldc);
+
+void gemm(
+    TransposeType transa, TransposeType transb,
+    int64_t m, int64_t n, int64_t k,
+    const BFloat16 alpha,
+    const BFloat16 *a, int64_t lda,
+    const BFloat16 *b, int64_t ldb,
+    const BFloat16 beta,
+    BFloat16 *c, int64_t ldc);
 // clang-format on
 
 // clang-format off
diff --git a/kernels/optimized/cpu/op_linear.cpp b/kernels/optimized/cpu/op_linear.cpp
new file mode 100644
index 00000000000..56634d326f2
--- /dev/null
+++ b/kernels/optimized/cpu/op_linear.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/kernels/optimized/blas/CPUBlas.h>
+#include <executorch/kernels/portable/cpu/util/matmul_ops_util.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+#include <array>
+
+namespace torch {
+namespace executor {
+namespace native {
+
+using Tensor = exec_aten::Tensor;
+
+Tensor& opt_linear_out(
+    RuntimeContext& ctx,
+    const Tensor& in,
+    const Tensor& mat2,
+    const optional<Tensor>& bias,
+    Tensor& out) {
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      !bias.has_value(),
+      InvalidArgument,
+      out,
+      "bias not supported yet in linear");
+  ET_KERNEL_CHECK(ctx, check_linear_args(in, mat2, out), InvalidArgument, out);
+
+  size_t output_ndim = 0;
+  std::array<exec_aten::SizesType, kTensorDimensionLimit> output_sizes;
+  get_linear_out_target_size(in, mat2, output_sizes.data(), &output_ndim);
+  ET_KERNEL_CHECK(
+      ctx,
+      resize_tensor(out, {output_sizes.data(), output_ndim}) == Error::Ok,
+      InvalidArgument,
+      out);
+
+  // gemm on some platforms doesn't tolerate empty input.
+  if (out.numel() == 0) {
+    return out;
+  }
+
+  int flattened_input_dim = 1;
+  for (int ii = 0; ii < in.dim() - 1; ++ii) {
+    flattened_input_dim *= in.sizes()[ii];
+  }
+  ET_SWITCH_REAL_TYPES_AND2(
+      Half, BFloat16, in.scalar_type(), ctx, "mm.out", CTYPE, [&]() {
+        size_t n = flattened_input_dim;
+        size_t k = in.sizes()[in.dim() - 1];
+        size_t m = mat2.size(0);
+
+        executorch::cpublas::gemm(
+            executorch::cpublas::TransposeType::Transpose,
+            executorch::cpublas::TransposeType::NoTranspose,
+            m,
+            n,
+            k,
+            static_cast<CTYPE>(1),
+            mat2.const_data_ptr<CTYPE>(),
+            k,
+            in.const_data_ptr<CTYPE>(),
+            k,
+            static_cast<CTYPE>(0),
+            out.mutable_data_ptr<CTYPE>(),
+            m);
+      });
+
+  return out;
+}
+
+} // namespace native
+} // namespace executor
+} // namespace torch
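The kernel above lowers linear as out = in @ mat2^T without materializing the transpose: all leading input dimensions are flattened into n rows, and mat2 (stored [M, K]) is handed to the column-major gemm with TransposeType::Transpose so it acts as [K, M]. A shape-only sketch with hypothetical sizes:

// in:   [2, 3, K]  -> flattened to n = 2 * 3 = 6 rows of length k = K
// mat2: [M, K]     -> passed transposed, contributing m = M output columns
// out:  [2, 3, M]  -> written as one n x m = 6 x M block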
diff --git a/kernels/optimized/cpu/op_mm.cpp b/kernels/optimized/cpu/op_mm.cpp
new file mode 100644
index 00000000000..9131356aeb6
--- /dev/null
+++ b/kernels/optimized/cpu/op_mm.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/kernels/optimized/blas/CPUBlas.h>
+#include <executorch/kernels/portable/cpu/util/matmul_ops_util.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+#include <array>
+
+namespace torch {
+namespace executor {
+namespace native {
+
+using Tensor = exec_aten::Tensor;
+
+Tensor& opt_mm_out(
+    RuntimeContext& ctx,
+    const Tensor& in,
+    const Tensor& mat2,
+    Tensor& out) {
+  ET_KERNEL_CHECK(ctx, check_mm_args(in, mat2, out), InvalidArgument, out);
+
+  size_t output_ndim = 0;
+  std::array<exec_aten::SizesType, kTensorDimensionLimit> output_sizes;
+  get_mm_out_target_size(in, mat2, output_sizes.data(), &output_ndim);
+  ET_KERNEL_CHECK(
+      ctx,
+      resize_tensor(out, {output_sizes.data(), output_ndim}) == Error::Ok,
+      InvalidArgument,
+      out);
+
+  if (out.numel() == 0) {
+    return out;
+  }
+  ET_SWITCH_REAL_TYPES_AND2(
+      Half, BFloat16, in.scalar_type(), ctx, "mm.out", CTYPE, [&]() {
+        size_t n = in.size(0);
+        size_t k = in.size(1);
+        size_t m = mat2.size(1);
+
+        // gemm expects column-major inputs and produces column-major
+        // output. So, we take advantage of the identity (A @ B).t()
+        // = B.t() @ A.t() here; row-major B is B.t() from gemm's
+        // column-major perspective, etc.
+        executorch::cpublas::gemm(
+            executorch::cpublas::TransposeType::NoTranspose,
+            executorch::cpublas::TransposeType::NoTranspose,
+            m,
+            n,
+            k,
+            static_cast<CTYPE>(1),
+            mat2.const_data_ptr<CTYPE>(),
+            m,
+            in.const_data_ptr<CTYPE>(),
+            k,
+            static_cast<CTYPE>(0),
+            out.mutable_data_ptr<CTYPE>(),
+            m);
+      });
+
+  return out;
+}
+
+} // namespace native
+} // namespace executor
+} // namespace torch
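The comment in opt_mm_out is doing real work, so here is the identity spelled out on a 2x2 example: a row-major matrix reinterpreted as column-major is its transpose, so asking the column-major gemm for B^T @ A^T = (A @ B)^T and letting it store the result column-major leaves exactly the row-major A @ B in out. A self-contained numeric check (sketch, plain C++):

#include <cstdio>

int main() {
  // Row-major A and B.
  float A[4] = {1, 2, 3, 4};
  float B[4] = {5, 6, 7, 8};
  float C[4] = {};
  // Row-major A @ B; expected {19, 22, 43, 50}.
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 2; ++j)
      for (int k = 0; k < 2; ++k)
        C[2 * i + j] += A[2 * i + k] * B[2 * k + j];
  // These bytes are identical to the column-major storage of (A @ B)^T,
  // which is what gemm produces when asked for B^T @ A^T.
  std::printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]);
  return 0;
}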
diff --git a/kernels/optimized/cpu/targets.bzl b/kernels/optimized/cpu/targets.bzl
index e7bb2d36bf4..488d2af7fa1 100644
--- a/kernels/optimized/cpu/targets.bzl
+++ b/kernels/optimized/cpu/targets.bzl
@@ -40,6 +40,13 @@ _OPTIMIZED_ATEN_OPS = (
             "//executorch/kernels/portable/cpu:scalar_utils",
         ],
     ),
+    op_target(
+        name = "op_linear",
+        deps = [
+            "//executorch/kernels/optimized:libblas",
+            "//executorch/kernels/portable/cpu/util:matmul_ops_util",
+        ],
+    ),
     op_target(
         name = "op_log_softmax",
         deps = select({
@@ -52,6 +59,13 @@ _OPTIMIZED_ATEN_OPS = (
             ],
         }),
     ),
+    op_target(
+        name = "op_mm",
+        deps = [
+            "//executorch/kernels/optimized:libblas",
+            "//executorch/kernels/portable/cpu/util:matmul_ops_util",
+        ],
+    ),
     op_target(
         name = "op_mul",
         deps = [
diff --git a/kernels/optimized/optimized-oss.yaml b/kernels/optimized/optimized-oss.yaml
index f79d652b91d..797744f3bd4 100644
--- a/kernels/optimized/optimized-oss.yaml
+++ b/kernels/optimized/optimized-oss.yaml
@@ -45,6 +45,11 @@
   - arg_meta: null
     kernel_name: torch::executor::opt_le_tensor_out
 
+- op: linear.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_linear_out
+
 - op: mul.out
   kernels:
     - arg_meta: null
diff --git a/kernels/optimized/optimized.yaml b/kernels/optimized/optimized.yaml
index 0d445deb3e8..2421673f8a7 100644
--- a/kernels/optimized/optimized.yaml
+++ b/kernels/optimized/optimized.yaml
@@ -52,6 +52,16 @@
   - arg_meta: null
     kernel_name: torch::executor::opt_le_tensor_out
 
+- op: linear.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_linear_out
+
+- op: mm.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_mm_out
+
 - op: mul.out
   kernels:
     - arg_meta: null
diff --git a/kernels/optimized/test/libblas_test.cpp b/kernels/optimized/test/libblas_test.cpp
index 8f30a357e1a..24aeaba776a 100644
--- a/kernels/optimized/test/libblas_test.cpp
+++ b/kernels/optimized/test/libblas_test.cpp
@@ -9,6 +9,7 @@
 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -17,7 +18,8 @@
   _(); \
   _(); \
   _(); \
-  _();
+  _(); \
+  _();
 
 namespace {
diff --git a/kernels/portable/cpu/op_mul.cpp b/kernels/portable/cpu/op_mul.cpp
index 8fc4f9d4593..34e7e085687 100644
--- a/kernels/portable/cpu/op_mul.cpp
+++ b/kernels/portable/cpu/op_mul.cpp
@@ -123,7 +123,11 @@ Tensor& mul_scalar_out(
   ET_KERNEL_CHECK(
       ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out);
 
-  ET_KERNEL_CHECK(ctx, tensor_is_realhb_type(out), InvalidArgument, out);
+  ET_KERNEL_CHECK(
+      ctx,
+      executorch::runtime::tensor_is_realhbbf16_type(out),
+      InvalidArgument,
+      out);
 
   ScalarType a_type = a.scalar_type();
   ScalarType b_type = utils::get_scalar_dtype(b);
diff --git a/kernels/portable/cpu/op_reflection_pad1d.cpp b/kernels/portable/cpu/op_reflection_pad1d.cpp
index 66a2333619f..53fbbc9c56a 100644
--- a/kernels/portable/cpu/op_reflection_pad1d.cpp
+++ b/kernels/portable/cpu/op_reflection_pad1d.cpp
@@ -28,6 +28,11 @@ Tensor& reflection_pad1d_out(
       InvalidArgument,
       out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out);
+
   Tensor::SizesType target_sizes[kTensorDimensionLimit];
   size_t target_ndim = 0;
   get_padding_out_target_size(1, in, padding, target_sizes, &target_ndim);
diff --git a/kernels/portable/cpu/op_reflection_pad2d.cpp b/kernels/portable/cpu/op_reflection_pad2d.cpp
index a16d92ff1ce..8de0baba43b 100644
--- a/kernels/portable/cpu/op_reflection_pad2d.cpp
+++ b/kernels/portable/cpu/op_reflection_pad2d.cpp
@@ -28,6 +28,11 @@ Tensor& reflection_pad2d_out(
       InvalidArgument,
       out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out);
+
   Tensor::SizesType target_sizes[kTensorDimensionLimit];
   size_t target_ndim = 0;
   get_padding_out_target_size(2, in, padding, target_sizes, &target_ndim);
diff --git a/kernels/portable/cpu/op_reflection_pad3d.cpp b/kernels/portable/cpu/op_reflection_pad3d.cpp
index 9629b9e4c4e..4ba78733046 100644
--- a/kernels/portable/cpu/op_reflection_pad3d.cpp
+++ b/kernels/portable/cpu/op_reflection_pad3d.cpp
@@ -28,6 +28,11 @@ Tensor& reflection_pad3d_out(
       InvalidArgument,
       out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out);
+
   Tensor::SizesType target_sizes[kTensorDimensionLimit];
   size_t target_ndim = 0;
   get_padding_out_target_size(3, in, padding, target_sizes, &target_ndim);
diff --git a/kernels/portable/cpu/op_relu.cpp b/kernels/portable/cpu/op_relu.cpp
index b9136cb3392..e59aec3ae64 100644
--- a/kernels/portable/cpu/op_relu.cpp
+++ b/kernels/portable/cpu/op_relu.cpp
@@ -35,6 +35,9 @@ Tensor& relu_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) {
 
   ET_KERNEL_CHECK(ctx, tensor_is_real_type(out), InvalidArgument, out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
   ET_SWITCH_REAL_TYPES(in.scalar_type(), ctx, "relu.out", CTYPE, [&]() {
     apply_unary_map_fn(
         [](const CTYPE val_in) {
diff --git a/kernels/portable/cpu/op_remainder.cpp b/kernels/portable/cpu/op_remainder.cpp
index 7c858c1c08a..3a641829773 100644
--- a/kernels/portable/cpu/op_remainder.cpp
+++ b/kernels/portable/cpu/op_remainder.cpp
@@ -80,6 +80,9 @@ Tensor& remainder_Tensor_out(
       InvalidArgument,
       out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out);
+
   ScalarType a_type = a.scalar_type();
   ScalarType b_type = b.scalar_type();
   ScalarType common_type = promoteTypes(a_type, b_type);
@@ -124,6 +127,9 @@ Tensor& remainder_Scalar_out(
       out,
       "Failed to resize output tensor.");
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out);
+
   ScalarType a_type = a.scalar_type();
   ScalarType b_type = utils::get_scalar_dtype(b);
   ScalarType common_type = utils::promote_type_with_scalar(a_type, b);
diff --git a/kernels/portable/cpu/op_repeat.cpp b/kernels/portable/cpu/op_repeat.cpp
index 644ebc98420..3b5596b2163 100644
--- a/kernels/portable/cpu/op_repeat.cpp
+++ b/kernels/portable/cpu/op_repeat.cpp
@@ -62,6 +62,11 @@ Tensor& repeat_out(
       InvalidArgument,
       out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(self, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(self), InvalidArgument, out);
+
   // Resize for dynamic shape
   ET_KERNEL_CHECK_MSG(
       ctx,
diff --git a/kernels/portable/cpu/op_roll.cpp b/kernels/portable/cpu/op_roll.cpp
index 4eff081eec4..09c7667c812 100644
--- a/kernels/portable/cpu/op_roll.cpp
+++ b/kernels/portable/cpu/op_roll.cpp
@@ -60,6 +60,9 @@ Tensor& roll_out(
   ET_KERNEL_CHECK(
       ctx, check_roll_args(in, shifts, dims, out), InvalidArgument, out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
   if (in.numel() == 0) {
     return out;
   }
diff --git a/kernels/portable/cpu/op_round.cpp b/kernels/portable/cpu/op_round.cpp
index 0b28ba41887..33af6508be2 100644
--- a/kernels/portable/cpu/op_round.cpp
+++ b/kernels/portable/cpu/op_round.cpp
@@ -45,6 +45,9 @@ Tensor& round_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) {
       ctx, tensors_have_same_shape_and_dtype(in, out), InvalidArgument, out);
   ET_KERNEL_CHECK(ctx, tensor_is_real_type(out), InvalidArgument, out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
   auto in_scalar_type = in.scalar_type();
 
   ET_SWITCH_REAL_TYPES(in.scalar_type(), ctx, "round.out", CTYPE, [&] {
diff --git a/kernels/portable/cpu/op_rsub.cpp b/kernels/portable/cpu/op_rsub.cpp
index 6a5ef598ef4..442221d6693 100644
--- a/kernels/portable/cpu/op_rsub.cpp
+++ b/kernels/portable/cpu/op_rsub.cpp
@@ -31,6 +31,9 @@ Tensor& rsub_scalar_out(
       out,
       "Failed to resize output tensor.");
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out);
+
   ET_KERNEL_CHECK(ctx, tensor_is_realhb_type(out), InvalidArgument, out);
 
   ScalarType a_type = a.scalar_type();
diff --git a/kernels/portable/cpu/op_scatter_add.cpp b/kernels/portable/cpu/op_scatter_add.cpp
index e10d87f9193..b4cf0d84f04 100644
--- a/kernels/portable/cpu/op_scatter_add.cpp
+++ b/kernels/portable/cpu/op_scatter_add.cpp
@@ -65,6 +65,15 @@ Tensor& scatter_add_out(
       InvalidArgument,
       out);
 
+  ET_KERNEL_CHECK(
+      context,
+      tensors_have_same_dim_order(self, src, out),
+      InvalidArgument,
+      out);
+
+  ET_KERNEL_CHECK(
+      context, tensor_is_default_dim_order(index), InvalidArgument, out);
+
   if (dim < 0) {
     dim += nonzero_dim(self);
   }
diff --git a/kernels/portable/cpu/op_select_scatter.cpp b/kernels/portable/cpu/op_select_scatter.cpp
index 71e7d9dfefd..db3ef8b1d29 100644
--- a/kernels/portable/cpu/op_select_scatter.cpp
+++ b/kernels/portable/cpu/op_select_scatter.cpp
@@ -33,6 +33,9 @@ Tensor& select_scatter_out(
   ET_KERNEL_CHECK(
       ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, src, out), InvalidArgument, out);
+
   // Account for negative indices
   if (dim < 0) {
     dim += in.dim();
diff --git a/kernels/portable/cpu/op_sigmoid.cpp b/kernels/portable/cpu/op_sigmoid.cpp
index b696c29518b..919d42a721a 100644
--- a/kernels/portable/cpu/op_sigmoid.cpp
+++ b/kernels/portable/cpu/op_sigmoid.cpp
@@ -24,6 +24,9 @@ Tensor& sigmoid_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) {
       ctx, in.scalar_type() != ScalarType::Bool, InvalidArgument, out);
   ET_KERNEL_CHECK(ctx, tensor_is_floating_type(out), InvalidArgument, out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
   // Resize for dynamic shape
   ET_KERNEL_CHECK_MSG(
       ctx,
diff --git a/kernels/portable/cpu/op_sign.cpp b/kernels/portable/cpu/op_sign.cpp
index 6dc6f3d015e..1c18788404d 100644
--- a/kernels/portable/cpu/op_sign.cpp
+++ b/kernels/portable/cpu/op_sign.cpp
@@ -30,6 +30,9 @@ Tensor& sign_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) {
       out,
       "Failed to resize output tensor.");
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
   ET_KERNEL_CHECK(
       ctx, tensors_have_same_shape_and_dtype(in, out), InvalidArgument, out);
diff --git a/kernels/portable/cpu/op_slice_copy.cpp b/kernels/portable/cpu/op_slice_copy.cpp
index 41a76567906..2b5c48737d6 100644
--- a/kernels/portable/cpu/op_slice_copy.cpp
+++ b/kernels/portable/cpu/op_slice_copy.cpp
@@ -33,6 +33,9 @@ Tensor& slice_copy_Tensor_out(
     dim += in.dim();
   }
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
   // If user do not set value to end_val, set end to in.size(dim) (largest
   // value available)
   int64_t end = end_val.has_value() ? end_val.value() : in.size(dim);
diff --git a/kernels/portable/cpu/op_slice_scatter.cpp b/kernels/portable/cpu/op_slice_scatter.cpp
index 47374716b4e..97f75553c1d 100644
--- a/kernels/portable/cpu/op_slice_scatter.cpp
+++ b/kernels/portable/cpu/op_slice_scatter.cpp
@@ -40,6 +40,9 @@ Tensor& slice_scatter_out(
       InvalidArgument,
       out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(input, out), InvalidArgument, out);
+
   if (input.numel() == 0) {
     return out;
   }
diff --git a/kernels/portable/cpu/op_softmax.cpp b/kernels/portable/cpu/op_softmax.cpp
index 9f1565ff161..544887bed62 100644
--- a/kernels/portable/cpu/op_softmax.cpp
+++ b/kernels/portable/cpu/op_softmax.cpp
@@ -36,6 +36,9 @@ Tensor& softmax_out(
   ET_KERNEL_CHECK(
       ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
   // Adjust for negative dim
   dim = dim < 0 ? dim + nonzero_dim(in) : dim;
diff --git a/kernels/portable/cpu/op_split_copy.cpp b/kernels/portable/cpu/op_split_copy.cpp
index a604e76b51c..1829b356ff2 100644
--- a/kernels/portable/cpu/op_split_copy.cpp
+++ b/kernels/portable/cpu/op_split_copy.cpp
@@ -46,6 +46,11 @@ void split_copy_Tensor_out(
       check_split_copy_args(input, split_size, dim, out),
       InvalidArgument, );
 
+  for (size_t i = 0; i < out.size(); ++i) {
+    ET_KERNEL_CHECK(
+        ctx, tensors_have_same_dim_order(input, out[i]), InvalidArgument, );
+  }
+
   const size_t leading_dims = getLeadingDims(input, dim);
   const size_t trailing_dims = getTrailingDims(input, dim);
   const size_t step = input.size(dim) * trailing_dims;
diff --git a/kernels/portable/cpu/op_split_with_sizes_copy.cpp b/kernels/portable/cpu/op_split_with_sizes_copy.cpp
index 7d1b485e7a4..623394e8013 100644
--- a/kernels/portable/cpu/op_split_with_sizes_copy.cpp
+++ b/kernels/portable/cpu/op_split_with_sizes_copy.cpp
@@ -38,6 +38,11 @@ void split_with_sizes_copy_out(
       check_split_with_sizes_copy_args(in, split_sizes, dim, out),
       InvalidArgument, );
 
+  for (size_t i = 0; i < out.size(); ++i) {
+    ET_KERNEL_CHECK(
+        ctx, tensors_have_same_dim_order(in, out[i]), InvalidArgument, );
+  }
+
   // If out is empty, then nothing needs to be done after checking the args.
   // Valid args implies that in.size(dim) == 0 and split_sizes is also empty.
   if (out.size() == 0) {
diff --git a/kernels/portable/cpu/op_squeeze_copy.cpp b/kernels/portable/cpu/op_squeeze_copy.cpp
index 5be91ff827d..11489e31729 100644
--- a/kernels/portable/cpu/op_squeeze_copy.cpp
+++ b/kernels/portable/cpu/op_squeeze_copy.cpp
@@ -29,6 +29,11 @@ Tensor& squeeze_copy_dim_out(
   ET_KERNEL_CHECK(
       ctx, check_squeeze_copy_dim_args(in, dim, out), InvalidArgument, out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out);
+
   if (dim < 0) {
     dim += nonzero_dim(in);
   }
@@ -62,6 +67,11 @@ Tensor& squeeze_copy_dims_out(
   ET_KERNEL_CHECK(
       ctx, check_squeeze_copy_dims_args(in, dims, out), InvalidArgument, out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out);
+
   Tensor::SizesType expected_out_size[kTensorDimensionLimit];
   size_t expected_out_dim = 0;
   get_squeeze_copy_dims_out_target_size(
diff --git a/kernels/portable/cpu/op_stack.cpp b/kernels/portable/cpu/op_stack.cpp
index f241120ae2f..6859f2a8746 100644
--- a/kernels/portable/cpu/op_stack.cpp
+++ b/kernels/portable/cpu/op_stack.cpp
@@ -31,6 +31,16 @@ Tensor& stack_out(
   ET_KERNEL_CHECK(
       ctx, check_stack_args(tensors, dim, out), InvalidArgument, out);
 
+  for (size_t i = 0; i < tensors.size(); ++i) {
+    ET_KERNEL_CHECK(
+        ctx,
+        tensors_have_same_dim_order(tensors[i], out),
+        InvalidArgument,
+        out);
+  }
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(out), InvalidArgument, out);
+
   Tensor::SizesType expected_out_size[kTensorDimensionLimit];
   size_t expected_out_dim = 0;
   get_stack_out_target_size(tensors, dim, expected_out_size, &expected_out_dim);
diff --git a/kernels/portable/cpu/op_sub.cpp b/kernels/portable/cpu/op_sub.cpp
index 04254653a43..b97b7b490f3 100644
--- a/kernels/portable/cpu/op_sub.cpp
+++ b/kernels/portable/cpu/op_sub.cpp
@@ -78,6 +78,9 @@ Tensor& sub_out(
       InvalidArgument,
       out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out);
+
   ET_KERNEL_CHECK(ctx, tensor_is_realh_type(out), InvalidArgument, out);
 
   ScalarType a_type = a.scalar_type();
@@ -131,6 +134,9 @@ Tensor& sub_scalar_out(
 
   ET_KERNEL_CHECK(ctx, tensor_is_realh_type(out), InvalidArgument, out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out);
+
   ScalarType a_type = a.scalar_type();
   ScalarType b_type = utils::get_scalar_dtype(b);
   ScalarType alpha_type = utils::get_scalar_dtype(alpha);
diff --git a/kernels/portable/cpu/op_sum.cpp b/kernels/portable/cpu/op_sum.cpp
index dfa897206a9..c9a4260344e 100644
--- a/kernels/portable/cpu/op_sum.cpp
+++ b/kernels/portable/cpu/op_sum.cpp
@@ -38,6 +38,11 @@ Tensor& sum_dim_out(
       InvalidArgument,
       out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out);
+
   ET_SWITCH_REAL_TYPES_AND(
       Bool, in.scalar_type(), ctx, "sum.IntList_out", CTYPE_IN, [&] {
         ET_SWITCH_REAL_TYPES_AND(
diff --git a/kernels/portable/cpu/op_t_copy.cpp b/kernels/portable/cpu/op_t_copy.cpp
index c6a2ad5fdb5..46807a42f22 100644
--- a/kernels/portable/cpu/op_t_copy.cpp
+++ b/kernels/portable/cpu/op_t_copy.cpp
@@ -47,6 +47,11 @@ Tensor& t_copy_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) {
     return out;
   }
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out);
+
   Tensor::SizesType expected_out_size[kTensorDimensionLimit];
   size_t expected_out_dim = 0;
   get_transpose_out_target_size(in, 1, 0, expected_out_size, &expected_out_dim);
diff --git a/kernels/portable/cpu/op_to_copy.cpp b/kernels/portable/cpu/op_to_copy.cpp
index c0c04e65e93..46bd0bf987e 100644
--- a/kernels/portable/cpu/op_to_copy.cpp
+++ b/kernels/portable/cpu/op_to_copy.cpp
@@ -46,6 +46,11 @@ Tensor& to_copy_out(
       InvalidArgument,
       out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(self, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(self), InvalidArgument, out);
+
   ET_SWITCH_REALHBBF16_TYPES(self.scalar_type(), ctx, "to_copy", CTYPE_IN, [&] {
     ET_SWITCH_REALHBBF16_TYPES(
         out.scalar_type(), ctx, "to_copy", CTYPE_OUT, [&] {
diff --git a/kernels/portable/cpu/op_transpose_copy.cpp b/kernels/portable/cpu/op_transpose_copy.cpp
index 79c04646a73..d2456b8592e 100644
--- a/kernels/portable/cpu/op_transpose_copy.cpp
+++ b/kernels/portable/cpu/op_transpose_copy.cpp
@@ -57,6 +57,9 @@ Tensor& transpose_copy_int_out(
       InvalidArgument,
       out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
   ET_SWITCH_ALL_TYPES(in.scalar_type(), ctx, __func__, CTYPE, [&] {
     transpose_tensors(in, dim0, dim1, out);
   });
diff --git a/kernels/portable/cpu/op_tril.cpp b/kernels/portable/cpu/op_tril.cpp
index cdf87bea4ba..46a91e8c627 100644
--- a/kernels/portable/cpu/op_tril.cpp
+++ b/kernels/portable/cpu/op_tril.cpp
@@ -145,6 +145,11 @@ Tensor& tril_out(
       InvalidArgument,
       out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(self, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(self), InvalidArgument, out);
+
   if (self.numel() == 0) {
     return out;
   }
diff --git a/kernels/portable/cpu/op_unbind_copy.cpp b/kernels/portable/cpu/op_unbind_copy.cpp
index da5a73d624c..cea4ccce345 100644
--- a/kernels/portable/cpu/op_unbind_copy.cpp
+++ b/kernels/portable/cpu/op_unbind_copy.cpp
@@ -36,6 +36,13 @@ void unbind_copy_int_out(
   ET_KERNEL_CHECK(
       ctx, check_unbind_copy_args(input, dim, out), InvalidArgument, );
 
+  for (int i = 0; i < out.size(); ++i) {
+    ET_KERNEL_CHECK(
+        ctx, tensors_have_same_dim_order(input, out[i]), InvalidArgument, );
+  }
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(input), InvalidArgument, );
+
   if (input.numel() == 0) {
     return;
   }
diff --git a/kernels/portable/cpu/op_unsqueeze_copy.cpp b/kernels/portable/cpu/op_unsqueeze_copy.cpp
index f6d25a04983..1c0a5c79990 100644
--- a/kernels/portable/cpu/op_unsqueeze_copy.cpp
+++ b/kernels/portable/cpu/op_unsqueeze_copy.cpp
@@ -38,6 +38,11 @@ Tensor& unsqueeze_copy_out(
   ET_KERNEL_CHECK(ctx, self.dim() + 1 == out.dim(), InvalidArgument, out);
   ET_KERNEL_CHECK(ctx, dim <= self.dim(), InvalidArgument, out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(self, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(self), InvalidArgument, out);
+
   for (size_t i = 0; i < out.dim(); ++i) {
     if (i < dim) {
       expected_output_size[i] = self.size(i);
diff --git a/kernels/portable/cpu/op_var.cpp b/kernels/portable/cpu/op_var.cpp
index 52019e381c0..fa49269196e 100644
--- a/kernels/portable/cpu/op_var.cpp
+++ b/kernels/portable/cpu/op_var.cpp
@@ -74,6 +74,11 @@ Tensor& var_out(
   ET_KERNEL_CHECK(ctx, tensor_is_floating_type(in), InvalidArgument, out);
   ET_KERNEL_CHECK(ctx, tensor_is_floating_type(out), InvalidArgument, out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out);
+
   ET_KERNEL_CHECK(
       ctx,
       resize_reduction_out(in, dim_list, keepdim, out) == Error::Ok,
diff --git a/kernels/portable/cpu/op_view_copy.cpp b/kernels/portable/cpu/op_view_copy.cpp
index f7174caac1e..ba72396b44f 100644
--- a/kernels/portable/cpu/op_view_copy.cpp
+++ b/kernels/portable/cpu/op_view_copy.cpp
@@ -44,6 +44,11 @@ Tensor& view_copy_out(
       out,
       "Failed to resize output tensor.");
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(self, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(self), InvalidArgument, out);
+
   ET_KERNEL_CHECK(
       ctx, check_view_copy_args(self, size_int64_t, out), InvalidArgument, out);
diff --git a/kernels/portable/cpu/op_where.cpp b/kernels/portable/cpu/op_where.cpp
index 6ff4cb85fb3..90f6e3df92b 100644
--- a/kernels/portable/cpu/op_where.cpp
+++ b/kernels/portable/cpu/op_where.cpp
@@ -35,6 +35,9 @@ Tensor& where_out(
       InvalidArgument,
       out);
 
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(cond, a, b, out), InvalidArgument, out);
+
   constexpr auto name = "where.self_out";
 
   ET_CHECK_MSG(
diff --git a/kernels/portable/cpu/util/matmul_ops_util.cpp b/kernels/portable/cpu/util/matmul_ops_util.cpp
index d7e49d64958..3d4f2e5e9ba 100644
--- a/kernels/portable/cpu/util/matmul_ops_util.cpp
+++ b/kernels/portable/cpu/util/matmul_ops_util.cpp
@@ -71,6 +71,19 @@ bool check_mm_args(const Tensor& in, const Tensor& mat2, Tensor& out) {
   return true;
 }
 
+bool check_linear_args(const Tensor& in, const Tensor& mat2, Tensor& out) {
+  ET_LOG_AND_RETURN_IF_FALSE(in.dim() == out.dim());
+  ET_LOG_AND_RETURN_IF_FALSE(in.dim() >= 2);
+  ET_LOG_AND_RETURN_IF_FALSE(tensor_is_rank(mat2, 2));
+
+  ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, mat2, out));
+
+  ET_LOG_AND_RETURN_IF_FALSE(
+      tensors_have_same_size_at_dims(in, in.dim() - 1, mat2, 1));
+
+  return true;
+}
+
 void get_mm_out_target_size(
     const Tensor& mat1,
     const Tensor& mat2,
@@ -81,5 +94,17 @@ void get_mm_out_target_size(
   out_sizes[1] = mat2.size(1);
 }
 
+void get_linear_out_target_size(
+    const Tensor& mat1,
+    const Tensor& mat2,
+    Tensor::SizesType* out_sizes,
+    size_t* out_ndim) {
+  *out_ndim = mat1.dim();
+  for (int ii = 0; ii < mat1.dim() - 1; ++ii) {
+    out_sizes[ii] = mat1.sizes()[ii];
+  }
+  out_sizes[mat1.dim() - 1] = mat2.size(0);
+}
+
 } // namespace executor
 } // namespace torch
diff --git a/kernels/portable/cpu/util/matmul_ops_util.h b/kernels/portable/cpu/util/matmul_ops_util.h
index 91e27ff2cc9..d2991868e95 100644
--- a/kernels/portable/cpu/util/matmul_ops_util.h
+++ b/kernels/portable/cpu/util/matmul_ops_util.h
@@ -37,5 +37,13 @@ void get_mm_out_target_size(
     Tensor::SizesType* out_sizes,
     size_t* out_ndim);
 
+bool check_linear_args(const Tensor& in, const Tensor& mat2, Tensor& out);
+
+void get_linear_out_target_size(
+    const Tensor& mat1,
+    const Tensor& mat2,
+    Tensor::SizesType* out_sizes,
+    size_t* out_ndim);
+
 } // namespace executor
 } // namespace torch
diff --git a/kernels/portable/cpu/util/select_copy_util.cpp b/kernels/portable/cpu/util/select_copy_util.cpp
index cf56b3e4ca2..2564317b043 100644
--- a/kernels/portable/cpu/util/select_copy_util.cpp
+++ b/kernels/portable/cpu/util/select_copy_util.cpp
@@ -38,6 +38,10 @@ Error select_copy_util(
     return Error::InvalidArgument;
   }
 
+  if (!tensors_have_same_dim_order(in, out)) {
+    return Error::InvalidArgument;
+  }
+
   // If the input is a empty tensor, no other operation could be done. We just
   // return the output.
   if (in.numel() == 0) {
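The long run of hunks above installs the same two guards across the portable kernels, so one explanation covers all of them. Dim order describes a tensor's physical layout as a permutation of its dimensions: a contiguous 4-D (NCHW) tensor has dim order {0, 1, 2, 3}, the default, while the same logical tensor laid out channels-last is {0, 2, 3, 1}. tensors_have_same_dim_order rejects calls that mix layouts between inputs and outputs, and tensor_is_default_dim_order additionally restricts kernels whose indexing assumes contiguous strides; both fail the kernel with InvalidArgument instead of silently reading memory in the wrong order.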
diff --git a/kernels/test/op_linear_test.cpp b/kernels/test/op_linear_test.cpp
new file mode 100644
index 00000000000..96875cc6f77
--- /dev/null
+++ b/kernels/test/op_linear_test.cpp
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/kernels/test/FunctionHeaderWrapper.h> // Declares the operator
+#include <executorch/kernels/test/TestUtil.h>
+#include <executorch/kernels/test/supported_features.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
+#include <executorch/runtime/platform/runtime.h>
+
+#include <gtest/gtest.h>
+#include <limits>
+
+using namespace ::testing;
+using exec_aten::ArrayRef;
+using exec_aten::Scalar;
+using exec_aten::ScalarType;
+using exec_aten::Tensor;
+using torch::executor::testing::TensorFactory;
+
+class OpLinearOutTest : public OperatorTest {
+ protected:
+  Tensor& op_linear_out(const Tensor& self, const Tensor& mat2, Tensor& out) {
+    return torch::executor::aten::linear_outf(context_, self, mat2, {}, out);
+  }
+
+  template <class CTYPE, exec_aten::ScalarType DTYPE>
+  void test_dtype() {
+    TensorFactory<DTYPE> tf;
+
+    if (torch::executor::testing::SupportedFeatures::get()->is_aten) {
+      if (DTYPE == ScalarType::Half) {
+        GTEST_SKIP()
+            << "skip Half because torch::executor::aten::mm_out does not support Half";
+        return;
+      }
+    }
+
+    // matmul gives 4 * 2 * 3 = 24
+    Tensor x = tf.full({3, 4}, 2);
+    Tensor y = tf.full({5, 4}, 3);
+
+    // Output shape should be (3, 5)
+    Tensor out = tf.zeros({3, 5});
+
+    op_linear_out(x, y, out);
+
+    Tensor expected = tf.full({3, 5}, 24);
+
+    EXPECT_TENSOR_EQ(out, expected);
+  }
+};
+
+TEST_F(OpLinearOutTest, OutputDim) {
+  TensorFactory<ScalarType::Int> tf;
+
+  // 3 tensors with compatible dimensions: (3, 4), (5, 4) and (3, 5).
+  Tensor x = tf.ones({3, 4});
+  Tensor y = tf.ones({5, 4});
+  Tensor out = tf.zeros({3, 5});
+
+  Tensor ret = op_linear_out(x, y, out);
+
+  // Should always return the provided out Tensor.
+  EXPECT_TENSOR_EQ(ret, out);
+
+  // Expected tensor, filled with 4.
+  Tensor expected = tf.full({3, 5}, 4);
+
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+/// A generic smoke test that works for any dtype that supports ones() and
+/// zeros().
+TEST_F(OpLinearOutTest, AllDtypesSupported) {
+#define TEST_ENTRY(ctype, dtype) test_dtype<ctype, ScalarType::dtype>();
+  ET_FORALL_REALHBF16_TYPES(TEST_ENTRY);
+#undef TEST_ENTRY
+  // TODO: Also add tests for half, complex, quantized, and other types. Easiest
+  // way to do that would be to make TensorFactory support zeros() and ones()
+  // for those types.
+}
+
+TEST_F(OpLinearOutTest, EmptyInputWithEmptyOutTensorPasses) {
+  TensorFactory<ScalarType::Float> tf;
+
+  // Empty input matrices
+  Tensor x = tf.make({0, 3}, {});
+  Tensor y = tf.make({0, 3}, {});
+
+  // Output matrix is also empty
+  Tensor out = tf.make({0, 0}, {});
+
+  Tensor expected = tf.make({0, 0}, {});
+
+  EXPECT_TENSOR_EQ(op_linear_out(x, y, out), expected);
+}
+
+TEST_F(OpLinearOutTest, InfinityTensorPasses) {
+  TensorFactory<ScalarType::Float> tff;
+
+  Tensor x = tff.full({3, 4}, std::numeric_limits<float>::infinity());
+  Tensor y = tff.full({5, 4}, 3);
+
+  // Output shape should be (3, 5)
+  Tensor out = tff.zeros({3, 5});
+
+  Tensor expected = tff.full({3, 5}, std::numeric_limits<float>::infinity());
+
+  EXPECT_TENSOR_EQ(op_linear_out(x, y, out), expected);
+}
+
+TEST_F(OpLinearOutTest, MismatchedDimensionsDies) {
+  TensorFactory<ScalarType::Int> tf;
+
+  Tensor x = tf.full({2, 2}, 3);
+
+  Tensor wrong_y = tf.full({1, 3}, 1);
+  Tensor right_y = tf.full({2, 2}, 1);
+
+  // Make an empty out tensor and demonstrate that it's empty.
+  Tensor out = tf.full({2, 2}, 0);
+
+  Tensor expected = tf.full({2, 2}, 6);
+  ET_EXPECT_KERNEL_FAILURE(context_, op_linear_out(x, wrong_y, out));
+
+  EXPECT_TENSOR_EQ(op_linear_out(x, right_y, out), expected);
+}
+
+TEST_F(OpLinearOutTest, MismatchedDimensionSizeDies) {
+  if (torch::executor::testing::SupportedFeatures::get()->is_aten) {
+    GTEST_SKIP() << "ATen kernel can handle mismatched dimension size";
+  }
+  TensorFactory<ScalarType::Int> tf;
+  Tensor x = tf.full({2, 2}, 3);
+
+  // wrong_y has incompatible dim
+  Tensor wrong_y = tf.full({2, 2, 2}, 1);
+  Tensor right_y = tf.full({2, 2}, 1);
+
+  // wrong_out has incompatible dim
+  Tensor right_out = tf.ones({2, 2});
+  Tensor wrong_out = tf.ones({2, 2, 3});
+
+  ET_EXPECT_KERNEL_FAILURE(context_, op_linear_out(x, right_y, wrong_out));
+  ET_EXPECT_KERNEL_FAILURE(context_, op_linear_out(x, wrong_y, right_out));
+}
+
+TEST_F(OpLinearOutTest, WrongOutShapeDies) {
+  if (torch::executor::testing::SupportedFeatures::get()->is_aten) {
+    GTEST_SKIP() << "ATen kernel can handle wrong out shape";
+  }
+  TensorFactory<ScalarType::Int> tf;
+  Tensor x = tf.ones({10, 3});
+
+  Tensor y = tf.ones({4, 3});
+
+  // wrong_out has incompatible shape
+  Tensor right_out = tf.ones({10, 4});
+  Tensor wrong_out = tf.ones({7, 5});
+
+  ET_EXPECT_KERNEL_FAILURE(context_, op_linear_out(x, y, wrong_out));
+
+  EXPECT_TENSOR_EQ(op_linear_out(x, y, right_out), tf.full({10, 4}, 3));
+}
+
+TEST_F(OpLinearOutTest, DynamicShapeUpperBoundSameAsExpected) {
+  TensorFactory<ScalarType::Float> tf;
+
+  Tensor x = tf.make(
+      {3, 2},
+      {0.17412060499191284,
+       0.34793388843536377,
+       0.8187907934188843,
+       0.9979893565177917,
+       0.7049332857131958,
+       0.4255824089050293});
+  Tensor y = tf.make(
+      {4, 2},
+      {0.8071839213371277,
+       0.31638312339782715,
+       0.13667285442352295,
+       0.3691965937614441,
+       0.9002121090888977,
+       0.09420186281204224,
+       0.9070476293563843,
+       0.9310881495475769});
+  Tensor expected_result = tf.make(
+      {3, 4},
+      {0.2506277561187744,
+       0.15225356817245483,
+       0.18952149152755737,
+       0.48189279437065125,
+       0.976661741733551,
+       0.480360746383667,
+       0.8310978412628174,
+       1.6718982458114624,
+       0.703657865524292,
+       0.2534688115119934,
+       0.6746801733970642,
+       1.0356627702713013});
+
+  Tensor out =
+      tf.zeros({3, 4}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND);
+  Tensor ret = op_linear_out(x, y, out);
+  EXPECT_TENSOR_CLOSE(out, expected_result);
+}
+
+TEST_F(OpLinearOutTest, DynamicShapeUpperBoundLargerThanExpected) {
+  TensorFactory<ScalarType::Float> tf;
+
+  Tensor x = tf.make(
+      {3, 2},
+      {0.17412060499191284,
+       0.34793388843536377,
+       0.8187907934188843,
+       0.9979893565177917,
+       0.7049332857131958,
+       0.4255824089050293});
+  Tensor y = tf.make(
+      {4, 2},
+      {0.8071839213371277,
+       0.31638312339782715,
+       0.13667285442352295,
+       0.3691965937614441,
+       0.9002121090888977,
+       0.09420186281204224,
+       0.9070476293563843,
+       0.9310881495475769});
+  Tensor expected_result = tf.make(
+      {3, 4},
+      {0.2506277561187744,
+       0.15225356817245483,
+       0.18952149152755737,
+       0.48189279437065125,
+       0.976661741733551,
+       0.480360746383667,
+       0.8310978412628174,
+       1.6718982458114624,
+       0.703657865524292,
+       0.2534688115119934,
+       0.6746801733970642,
+       1.0356627702713013});
+
+  Tensor out =
+      tf.zeros({10, 10}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND);
+  Tensor ret = op_linear_out(x, y, out);
+  EXPECT_TENSOR_CLOSE(out, expected_result);
+}
+
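The two upper-bound tests work because the out tensor is created with DYNAMIC_BOUND shape dynamism: its initial sizes act as capacity, and the kernel's internal resize may shrink the shape to the computed [3, 4] result. Sketch of that contract (same TensorFactory API as above):

// Capacity 10x10; DYNAMIC_BOUND lets the kernel resize down to 3x4.
Tensor out =
    tf.zeros({10, 10}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND);
op_linear_out(x, y, out); // internally calls resize_tensor(out, {3, 4})
// out.sizes() is now {3, 4}; with STATIC dynamism the resize would fail,
// and growing past the original capacity (the skipped DYNAMIC_UNBOUND test
// below) is not yet supported.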
+       0.9979893565177917,
+       0.7049332857131958,
+       0.4255824089050293});
+  Tensor y = tf.make(
+      {4, 2},
+      {0.8071839213371277,
+       0.31638312339782715,
+       0.13667285442352295,
+       0.3691965937614441,
+       0.9002121090888977,
+       0.09420186281204224,
+       0.9070476293563843,
+       0.9310881495475769});
+  Tensor expected_result = tf.make(
+      {3, 4},
+      {0.2506277561187744,
+       0.15225356817245483,
+       0.18952149152755737,
+       0.48189279437065125,
+       0.976661741733551,
+       0.480360746383667,
+       0.8310978412628174,
+       1.6718982458114624,
+       0.703657865524292,
+       0.2534688115119934,
+       0.6746801733970642,
+       1.0356627702713013});
+
+  Tensor out =
+      tf.zeros({1, 1}, torch::executor::TensorShapeDynamism::DYNAMIC_UNBOUND);
+  Tensor ret = op_linear_out(x, y, out);
+  EXPECT_TENSOR_CLOSE(out, expected_result);
+}
+
+// TODO: support and test bias
diff --git a/kernels/test/op_mul_test.cpp b/kernels/test/op_mul_test.cpp
index 84a7e8dedc4..f8205ea601e 100644
--- a/kernels/test/op_mul_test.cpp
+++ b/kernels/test/op_mul_test.cpp
@@ -586,3 +586,29 @@ TEST_F(OpMulScalarOutTest, OptimizedSanityCheck) {
   // Check that it matches the expected output.
   EXPECT_TENSOR_CLOSE(out, tf.make(sizes, {2.6, 4.2, 9.2, 16.4}));
 }
+
+TEST_F(OpMulScalarOutTest, HalfSanityCheck) {
+  TensorFactory<ScalarType::Half> tf;
+
+  const std::vector<int32_t> sizes = {2, 2};
+
+  Tensor out = tf.zeros(sizes);
+
+  op_mul_scalar_out(tf.make(sizes, {1.3, 2.1, 4.6, 8.2}), 2.0, out);
+
+  // Check that it matches the expected output.
+  EXPECT_TENSOR_CLOSE(out, tf.make(sizes, {2.6, 4.2, 9.2, 16.4}));
+}
+
+TEST_F(OpMulScalarOutTest, BFloat16SanityCheck) {
+  TensorFactory<ScalarType::BFloat16> tf;
+
+  const std::vector<int32_t> sizes = {2, 2};
+
+  Tensor out = tf.zeros(sizes);
+
+  op_mul_scalar_out(tf.make(sizes, {1.3, 2.1, 4.6, 8.2}), 2.0, out);
+
+  // Check that it matches the expected output.
+  EXPECT_TENSOR_CLOSE(out, tf.make(sizes, {2.6, 4.2, 9.2, 16.4}));
+}
diff --git a/kernels/test/op_slice_scatter_test.cpp b/kernels/test/op_slice_scatter_test.cpp
index 1d5c8a43b10..1d5e972ef2e 100644
--- a/kernels/test/op_slice_scatter_test.cpp
+++ b/kernels/test/op_slice_scatter_test.cpp
@@ -863,3 +863,24 @@ TEST_F(OpSliceScatterTensorOutTest, DynamicShapeTest) {
   EXPECT_TENSOR_EQ(ret_default_end, out);
   EXPECT_TENSOR_EQ(ret_default_end, expected);
 }
+
+TEST_F(OpSliceScatterTensorOutTest, LargeEndValue) {
+  TensorFactory<ScalarType::Float> tf;
+
+  Tensor input = tf.zeros({1, 1, 2, 5, 3, 3});
+  Tensor src = tf.ones({1, 1, 2, 5, 3, 3});
+
+  Tensor out = tf.zeros({1, 1, 2, 5, 3, 3});
+  Tensor expected = tf.ones({1, 1, 2, 5, 3, 3});
+
+  Tensor ret = op_slice_scatter_out(
+      input,
+      src,
+      /*dim=*/1,
+      /*start=*/0,
+      /*end=*/9223372036854775807,
+      /*step=*/1,
+      out);
+  EXPECT_TENSOR_EQ(ret, out);
+  EXPECT_TENSOR_EQ(ret, expected);
+}
diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl
index 7ae17c5237a..f8ea484435a 100644
--- a/kernels/test/targets.bzl
+++ b/kernels/test/targets.bzl
@@ -226,6 +226,7 @@ def define_common_targets():
     _common_op_test("op_le_test", ["aten", "portable", "optimized"])
     _common_op_test("op_leaky_relu_test", ["aten", "portable"])
     _common_op_test("op_lift_fresh_copy_test", ["aten", "portable"])
+    _common_op_test("op_linear_test", ["aten", "optimized"])
     _common_op_test("op_log_softmax_test", ["aten", "portable", "optimized"])
     _common_op_test("op_log_test", ["aten", "portable"])
     _common_op_test("op_log10_test", ["aten", "portable"])
@@ -244,7 +245,7 @@ def define_common_targets():
     _common_op_test("op_mean_test", ["aten", "portable"])
     _common_op_test("op_min_test", ["aten", "portable"])
     _common_op_test("op_minimum_test", ["aten", "portable"])
-    _common_op_test("op_mm_test", ["aten", "portable"])
+    _common_op_test("op_mm_test", ["aten", "portable", "optimized"])
     _common_op_test("op_mul_test", ["aten", "portable", "optimized"])
     _common_op_test("op_narrow_copy_test", ["aten", "portable"])
     _common_op_test("op_native_batch_norm_test", ["aten", "portable"])
diff --git a/runtime/core/exec_aten/util/scalar_type_util.h b/runtime/core/exec_aten/util/scalar_type_util.h
index 4d8712c1590..7c576f889fb 100644
--- a/runtime/core/exec_aten/util/scalar_type_util.h
+++ b/runtime/core/exec_aten/util/scalar_type_util.h
@@ -73,6 +73,10 @@ struct is_reduced_floating_point
         bool,
         std::is_same<T, torch::executor::Half>::value ||
             std::is_same<T, torch::executor::BFloat16>::value> {};
+
+template <typename T>
+constexpr bool is_reduced_floating_point_v =
+    is_reduced_floating_point<T>::value;
 #endif
 
 /// Maps ScalarTypes to C++ types.
diff --git a/runtime/core/portable_type/half.h b/runtime/core/portable_type/half.h
index 5aded68270b..8987d82804b 100644
--- a/runtime/core/portable_type/half.h
+++ b/runtime/core/portable_type/half.h
@@ -62,7 +62,7 @@ struct alignas(2) Half {
 namespace internal {
 
 inline float fp32_from_bits(uint32_t w) {
-  static_assert(sizeof(float) == sizeof(uint32_t), "");
+  static_assert(sizeof(float) == sizeof(uint32_t));
   union {
     uint32_t as_bits;
     float as_value;
@@ -71,7 +71,7 @@ inline float fp32_from_bits(uint32_t w) {
 }
 
 inline uint32_t fp32_to_bits(float f) {
-  static_assert(sizeof(float) == sizeof(uint32_t), "");
+  static_assert(sizeof(float) == sizeof(uint32_t));
   union {
     float as_value;
     uint32_t as_bits;
diff --git a/runtime/core/portable_type/string_view.h b/runtime/core/portable_type/string_view.h
index 4036539ccc5..47a9f335eb5 100644
--- a/runtime/core/portable_type/string_view.h
+++ b/runtime/core/portable_type/string_view.h
@@ -79,13 +79,10 @@ class basic_string_view final {
   }
 
   constexpr const_reference at(size_type pos) const {
-    return (pos >= size_)
-        ? (ET_ASSERT_MESSAGE_EMIT(
-               " (%s): "
-               "string_view::operator[] or string_view::at() out of range",
-               pos >= size_),
-           torch::executor::runtime_abort())
-        : at_(pos);
+    ET_CHECK_MSG(
+        pos < size_,
+        "string_view::operator[] or string_view::at() out of range");
+    return at_(pos);
   }
 
   constexpr const_reference front() const {
@@ -140,13 +137,9 @@ class basic_string_view final {
   constexpr basic_string_view substr(size_type pos = 0, size_type count = npos)
       const {
-    return (pos > size_)
-        ? (ET_ASSERT_MESSAGE_EMIT(
-               " (%s): "
-               "basic_string_view::substr parameter out of bounds.",
-               pos > size_),
-           torch::executor::runtime_abort())
-        : substr_(pos, count);
+    ET_CHECK_MSG(
+        pos <= size_, "basic_string_view::substr parameter out of bounds.");
+    return substr_(pos, count);
   }
 
   constexpr int compare(basic_string_view rhs) const noexcept {
diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp
index d39ba875531..a6ed7e354a9 100644
--- a/runtime/executor/method.cpp
+++ b/runtime/executor/method.cpp
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include <executorch/runtime/executor/platform_memory_allocator.h>
 #include
 #include
 #include
@@ -29,6 +30,8 @@
 namespace executorch {
 namespace runtime {
 
+using internal::PlatformMemoryAllocator;
+
 /**
  * Runtime state for a backend delegate.
  */
@@ -527,19 +530,20 @@ Error Method::resolve_operator(
           i,
           static_cast<uint32_t>(err));
       meta[count].dim_order_ =
-          ArrayRef<exec_aten::DimOrderType>(dim_order_ptr, size);
+          Span<exec_aten::DimOrderType>(dim_order_ptr, size);
       count++;
     }
   }
-  // search kernel
-  if (hasOpsFn(operator_name, ArrayRef<TensorMeta>(meta, count))) {
-    kernels[kernel_index] =
-        getOpsFn(operator_name, ArrayRef<TensorMeta>(meta, count));
-    return Error::Ok;
-  } else {
+
+  // Find a kernel with the matching name and tensor meta.
+  Result<OpFunction> op_function =
+      get_op_function_from_registry(operator_name, {meta, count});
+  if (!op_function.ok()) {
     ET_LOG(Error, "Missing operator: [%d] %s", op_index, operator_name);
-    return Error::OperatorMissing;
+    return op_function.error();
   }
+  kernels[kernel_index] = op_function.get();
+  return Error::Ok;
 }
 
 Result<Method> Method::load(
@@ -547,7 +551,16 @@
     const Program* program,
     MemoryManager* memory_manager,
     EventTracer* event_tracer) {
-  Method method(program, memory_manager, event_tracer);
+  MemoryAllocator* temp_allocator = memory_manager->temp_allocator();
+  if (temp_allocator == nullptr) {
+    PlatformMemoryAllocator* platform_allocator =
+        ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(
+            memory_manager->method_allocator(), PlatformMemoryAllocator);
+    new (platform_allocator) PlatformMemoryAllocator();
+    temp_allocator = platform_allocator;
+  }
+  Method method(program, memory_manager, event_tracer, temp_allocator);
+
   Error err = method.init(s_plan);
   if (err != Error::Ok) {
     return err;
@@ -1038,16 +1051,14 @@ Error Method::execute_instruction() {
   auto instruction = instructions->Get(step_state_.instr_idx);
   size_t next_instr_idx = step_state_.instr_idx + 1;
   Error err = Error::Ok;
+
   switch (instruction->instr_args_type()) {
     case executorch_flatbuffer::InstructionArguments::KernelCall: {
       EXECUTORCH_SCOPE_PROF("OPERATOR_CALL");
       internal::EventTracerProfileScope event_tracer_scope =
           internal::EventTracerProfileScope(event_tracer_, "OPERATOR_CALL");
       // TODO(T147221312): Also expose tensor resizer via the context.
-      // The temp_allocator passed can be null, but calling allocate_temp will
-      // fail
-      KernelRuntimeContext context(
-          event_tracer_, memory_manager_->temp_allocator());
+      KernelRuntimeContext context(event_tracer_, temp_allocator_);
       auto args = chain.argument_lists_[step_state_.instr_idx];
       chain.kernels_[step_state_.instr_idx](context, args.data());
       // We reset the temp_allocator after the switch statement
@@ -1095,7 +1106,7 @@
           step_state_.instr_idx);
       BackendExecutionContext backend_execution_context(
           /*event_tracer*/ event_tracer_,
-          /*temp_allocator*/ memory_manager_->temp_allocator());
+          /*temp_allocator*/ temp_allocator_);
       err = delegates_[delegate_idx].Execute(
           backend_execution_context,
           chain.argument_lists_[step_state_.instr_idx].data());
@@ -1167,8 +1178,8 @@
       err = Error::InvalidProgram;
   }
   // Reset the temp allocator for every instruction.
-  if (memory_manager_->temp_allocator() != nullptr) {
-    memory_manager_->temp_allocator()->reset();
+  if (temp_allocator_ != nullptr) {
+    temp_allocator_->reset();
   }
   if (err == Error::Ok) {
     step_state_.instr_idx = next_instr_idx;
diff --git a/runtime/executor/method.h b/runtime/executor/method.h
index 7d96096accf..0a35d6b9282 100644
--- a/runtime/executor/method.h
+++ b/runtime/executor/method.h
@@ -53,6 +53,7 @@ class Method final {
       : step_state_(rhs.step_state_),
         program_(rhs.program_),
         memory_manager_(rhs.memory_manager_),
+        temp_allocator_(rhs.temp_allocator_),
         serialization_plan_(rhs.serialization_plan_),
         event_tracer_(rhs.event_tracer_),
         n_value_(rhs.n_value_),
@@ -273,10 +274,12 @@ class Method final {
   Method(
       const Program* program,
       MemoryManager* memory_manager,
-      EventTracer* event_tracer)
+      EventTracer* event_tracer,
+      MemoryAllocator* temp_allocator)
       : step_state_(),
         program_(program),
         memory_manager_(memory_manager),
+        temp_allocator_(temp_allocator),
         serialization_plan_(nullptr),
         event_tracer_(event_tracer),
         n_value_(0),
@@ -319,6 +322,7 @@ class Method final {
   StepState step_state_;
   const Program* program_;
   MemoryManager* memory_manager_;
+  MemoryAllocator* temp_allocator_;
   executorch_flatbuffer::ExecutionPlan* serialization_plan_;
   EventTracer* event_tracer_;
diff --git a/runtime/executor/platform_memory_allocator.h b/runtime/executor/platform_memory_allocator.h
new file mode 100644
index 00000000000..09195a460ac
--- /dev/null
+++ b/runtime/executor/platform_memory_allocator.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <stdint.h>
+#include <cstddef>
+#include <cstdlib>
+
+#include <executorch/runtime/core/memory_allocator.h>
+#include <executorch/runtime/platform/compiler.h>
+#include <executorch/runtime/platform/platform.h>
+
+namespace executorch {
+namespace runtime {
+namespace internal {
+
+/**
+ * PlatformMemoryAllocator is a memory allocator that uses a linked list to
+ * manage allocated nodes. It overrides the allocate method of MemoryAllocator
+ * using the PAL fallback allocator method `et_pal_allocate`.
+ */
+class PlatformMemoryAllocator final : public MemoryAllocator {
+ private:
+  // We allocate a little more than requested and use that memory as a node in
+  // a linked list, pushing the allocated buffers onto a list that's iterated
+  // and freed when the allocator is reset or destroyed.
+  struct AllocationNode {
+    void* data;
+    AllocationNode* next;
+  };
+
+  AllocationNode* head_ = nullptr;
+
+ public:
+  PlatformMemoryAllocator() : MemoryAllocator(0, nullptr) {}
+
+  void* allocate(size_t size, size_t alignment = kDefaultAlignment) override {
+    if (!isPowerOf2(alignment)) {
+      ET_LOG(Error, "Alignment %zu is not a power of 2", alignment);
+      return nullptr;
+    }
+
+    // Allocate enough memory for the node, the data and the alignment bump.
+    size_t alloc_size = sizeof(AllocationNode) + size + alignment;
+    void* node_memory = et_pal_allocate(alloc_size);
+
+    // If allocation failed, log message and return nullptr.
+    if (node_memory == nullptr) {
+      ET_LOG(Error, "Failed to allocate %zu bytes", alloc_size);
+      return nullptr;
+    }
+
+    // Compute data pointer.
+    uint8_t* data_ptr =
+        reinterpret_cast<uint8_t*>(node_memory) + sizeof(AllocationNode);
+
+    // Align the data pointer.
+    void* aligned_data_ptr = alignPointer(data_ptr, alignment);
+
+    // Assert that the alignment didn't overflow the allocated memory.
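+    // Worked example of this bound (illustrative numbers, not from the diff):
+    // on a 64-bit build sizeof(AllocationNode) == 16, so for size = 16 and
+    // alignment = 8 we get alloc_size = 16 + 16 + 8 = 40 and data_ptr at
+    // node_memory + 16. alignPointer() advances data_ptr by at most
+    // alignment - 1 = 7 bytes, so aligned_data_ptr + size <= node_memory + 39,
+    // which stays within the 40 allocated bytes and the check always holds.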
+    ET_DCHECK_MSG(
+        reinterpret_cast<uintptr_t>(aligned_data_ptr) + size <=
+            reinterpret_cast<uintptr_t>(node_memory) + alloc_size,
+        "aligned_data_ptr %p + size %zu > node_memory %p + alloc_size %zu",
+        aligned_data_ptr,
+        size,
+        node_memory,
+        alloc_size);
+
+    // Construct the node.
+    AllocationNode* new_node = reinterpret_cast<AllocationNode*>(node_memory);
+    new_node->data = aligned_data_ptr;
+    new_node->next = head_;
+    head_ = new_node;
+
+    // Return the aligned data pointer.
+    return head_->data;
+  }
+
+  void reset() override {
+    AllocationNode* current = head_;
+    while (current != nullptr) {
+      AllocationNode* next = current->next;
+      et_pal_free(current);
+      current = next;
+    }
+    head_ = nullptr;
+  }
+
+  ~PlatformMemoryAllocator() override {
+    reset();
+  }
+
+ private:
+  // Disable copy and move.
+  PlatformMemoryAllocator(const PlatformMemoryAllocator&) = delete;
+  PlatformMemoryAllocator& operator=(const PlatformMemoryAllocator&) = delete;
+  PlatformMemoryAllocator(PlatformMemoryAllocator&&) noexcept = delete;
+  PlatformMemoryAllocator& operator=(PlatformMemoryAllocator&&) noexcept =
+      delete;
+};
+
+} // namespace internal
+} // namespace runtime
+} // namespace executorch
diff --git a/runtime/executor/program.h b/runtime/executor/program.h
index a599cc958e0..f7469eb2192 100644
--- a/runtime/executor/program.h
+++ b/runtime/executor/program.h
@@ -123,7 +123,8 @@ class Program final {
    *
    * @param[in] method_name The name of the method to load.
    * @param[in] memory_manager The allocators to use during initialization and
-   *     execution of the loaded method.
+   *     execution of the loaded method. If `memory_manager.temp_allocator()` is
+   *     null, the runtime will allocate temp memory using `et_pal_allocate()`.
    * @param[in] event_tracer The event tracer to use for this method run.
    *
    * @returns The loaded method on success, or an error on failure.
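Not part of the patch, but a minimal sketch of what this fallback enables. It assumes a method that needs no memory-planned buffers; the pool size and the `"forward"` method name are placeholders:

```cpp
#include <executorch/runtime/executor/memory_manager.h>
#include <executorch/runtime/executor/program.h>

using namespace executorch::runtime;

// Method-lifetime allocations (kernel tables, delegate state, etc.).
static uint8_t method_pool[8 * 1024];

Result<Method> load_without_temp_allocator(Program& program) {
  static MemoryAllocator method_allocator(sizeof(method_pool), method_pool);
  // No planned buffers and, crucially, no temp allocator: the second and
  // third MemoryManager arguments default to nullptr.
  static MemoryManager memory_manager(&method_allocator);
  // Before this change a kernel calling context.allocate_temp() would fail;
  // now Method::load() substitutes an internal PlatformMemoryAllocator
  // backed by et_pal_allocate()/et_pal_free().
  return program.load_method("forward", &memory_manager);
}
```

The temp memory is released after every executed instruction (see the `temp_allocator_->reset()` call above), so pointers returned by `allocate_temp()` must not be cached across instructions.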
diff --git a/runtime/executor/targets.bzl b/runtime/executor/targets.bzl index 46f997a80ad..cc91255d7b5 100644 --- a/runtime/executor/targets.bzl +++ b/runtime/executor/targets.bzl @@ -65,6 +65,9 @@ def define_common_targets(): "tensor_parser_exec_aten.cpp", "tensor_parser{}.cpp".format(aten_suffix if aten_mode else "_portable"), ], + headers = [ + "platform_memory_allocator.h", + ], exported_headers = [ "method.h", "method_meta.h", diff --git a/runtime/executor/test/executor_test.cpp b/runtime/executor/test/executor_test.cpp index da0d53374f1..15b3982297c 100644 --- a/runtime/executor/test/executor_test.cpp +++ b/runtime/executor/test/executor_test.cpp @@ -24,11 +24,13 @@ using exec_aten::SizesType; using exec_aten::Tensor; using executorch::runtime::Error; using executorch::runtime::EValue; -using executorch::runtime::getOpsFn; -using executorch::runtime::hasOpsFn; +using executorch::runtime::get_op_function_from_registry; using executorch::runtime::Kernel; using executorch::runtime::KernelRuntimeContext; -using executorch::runtime::register_kernels; +using executorch::runtime::OpFunction; +using executorch::runtime::register_kernel; +using executorch::runtime::registry_has_op_function; +using executorch::runtime::Result; using executorch::runtime::testing::TensorFactory; namespace pytree = ::executorch::extension::pytree; @@ -87,9 +89,9 @@ TEST_F(ExecutorTest, TensorHalf) { TEST_F(ExecutorTest, RegistryLookupAndCall) { const char* op_name = "aten::add.out"; - ASSERT_TRUE(hasOpsFn(op_name)); - auto func = getOpsFn(op_name); - ASSERT_TRUE(func); + Result func = get_op_function_from_registry(op_name); + ASSERT_EQ(func.error(), Error::Ok); + ASSERT_NE(*func, nullptr); TensorFactory tf; constexpr size_t num_evalues = 4; @@ -108,7 +110,7 @@ TEST_F(ExecutorTest, RegistryLookupAndCall) { kernel_args[4] = &evalues[3]; KernelRuntimeContext context{}; - func(context, kernel_args); + (*func)(context, kernel_args); auto c_ptr = evalues[3].toTensor().const_data_ptr(); ASSERT_EQ(c_ptr[3], 12); } @@ -166,15 +168,15 @@ TEST_F(ExecutorTest, EValueToScalar) { void test_op(KernelRuntimeContext& /*unused*/, EValue** /*unused*/) {} TEST_F(ExecutorTest, OpRegistration) { - auto s1 = register_kernels({Kernel("test", test_op)}); - auto s2 = register_kernels({Kernel("test_2", test_op)}); + auto s1 = register_kernel(Kernel("test", test_op)); + auto s2 = register_kernel(Kernel("test_2", test_op)); ASSERT_EQ(Error::Ok, s1); ASSERT_EQ(Error::Ok, s2); ET_EXPECT_DEATH( - []() { (void)register_kernels({Kernel("test", test_op)}); }(), ""); + []() { (void)register_kernel(Kernel("test", test_op)); }(), ""); - ASSERT_TRUE(hasOpsFn("test")); - ASSERT_TRUE(hasOpsFn("test_2")); + ASSERT_TRUE(registry_has_op_function("test")); + ASSERT_TRUE(registry_has_op_function("test_2")); } TEST_F(ExecutorTest, OpRegistrationWithContext) { @@ -184,25 +186,27 @@ TEST_F(ExecutorTest, OpRegistrationWithContext) { (void)context; *(values[0]) = Scalar(100); }); - auto s1 = register_kernels({op}); + auto s1 = register_kernel(op); ASSERT_EQ(Error::Ok, s1); - ASSERT_TRUE(hasOpsFn("test_op_with_context")); - auto func = getOpsFn("test_op_with_context"); + Result func = + get_op_function_from_registry("test_op_with_context"); + ASSERT_EQ(func.error(), Error::Ok); + EValue values[1]; values[0] = Scalar(0); EValue* kernels[1]; kernels[0] = &values[0]; KernelRuntimeContext context{}; - func(context, kernels); + (*func)(context, kernels); auto val = values[0].toScalar().to(); ASSERT_EQ(val, 100); } TEST_F(ExecutorTest, AddMulAlreadyRegistered) { - 
ASSERT_TRUE(hasOpsFn("aten::add.out")); - ASSERT_TRUE(hasOpsFn("aten::mul.out")); + ASSERT_TRUE(registry_has_op_function("aten::add.out")); + ASSERT_TRUE(registry_has_op_function("aten::mul.out")); } TEST(PyTreeEValue, List) { diff --git a/runtime/executor/test/kernel_integration_test.cpp b/runtime/executor/test/kernel_integration_test.cpp index 3e7da810933..4f1ac0240b9 100644 --- a/runtime/executor/test/kernel_integration_test.cpp +++ b/runtime/executor/test/kernel_integration_test.cpp @@ -34,6 +34,7 @@ using executorch::runtime::FreeableBuffer; using executorch::runtime::Kernel; using executorch::runtime::KernelKey; using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::MemoryAllocator; using executorch::runtime::Method; using executorch::runtime::Program; using executorch::runtime::Result; @@ -59,10 +60,26 @@ struct KernelControl { // returning. Error fail_value = Error::Ok; + // If true, the kernel should allocate temporary memory. + bool allocate_temp_memory = false; + + // If true, the kernel should simulate allocating temporary memory. + bool simulate_temp_memory_allocation = false; + + // The size of the temporary memory to allocate. + int temp_memory_size = 0; + + // The total size of all allocations. + int total_allocated_size = 0; + void reset() { call_count = 0; call_context_fail = false; fail_value = Error::Ok; + allocate_temp_memory = false; + simulate_temp_memory_allocation = false; + temp_memory_size = 0; + total_allocated_size = 0; } /** @@ -94,7 +111,7 @@ struct KernelControl { executorch::runtime::KernelKey("v1/6;0,1|6;0,1|6;0,1|6;0,1"); Kernel kernel = executorch::runtime::Kernel( "aten::add.out", key, KernelControl::kernel_hook); - Error err = executorch::runtime::register_kernels({kernel}); + Error err = executorch::runtime::register_kernel(kernel); EXPECT_EQ(err, Error::Ok); registered_ = true; @@ -117,6 +134,33 @@ struct KernelControl { if (control->call_context_fail) { context.fail(control->fail_value); } + + // Allocate temporary memory. + if (control->allocate_temp_memory) { + Result temp_mem_res = + context.allocate_temp(control->temp_memory_size); + if (temp_mem_res.ok()) { + control->total_allocated_size += control->temp_memory_size; + // We actually use the memory, to test default memory allocation was + // successful. + uint8_t* array = (uint8_t*)(temp_mem_res.get()); + for (int i = 0; i < control->temp_memory_size; i++) { + array[i] = i % 256; + } + } + } + + // Simulate allocating temporary memory. We use this, for testing that when + // a temp allocator is provided, the kernel will use it, instead of + // allocating memory with the default platform memory allocator. + // The provided TempMemoryAllocator class in this file, simulates allocating + // memory instead of actually allocating anything. + if (control->simulate_temp_memory_allocation) { + Result temp_mem_res = + context.allocate_temp(control->temp_memory_size); + control->total_allocated_size += control->temp_memory_size; + EXPECT_EQ(temp_mem_res.error(), Error::Ok); + } } static bool registered_; @@ -126,6 +170,44 @@ struct KernelControl { bool KernelControl::registered_ = false; KernelControl KernelControl::singleton_; +/** + * MemoryAllocator that keeps track of the number/sizes of its allocations, + * to test the case where the user provides a temp allocator. + */ +class TempMemoryAllocator final : public MemoryAllocator { + public: + TempMemoryAllocator() : MemoryAllocator(0, nullptr) {} + + // The number of times allocate() has been called. 
+ int number_of_allocations = 0; + + // The number of times reset() has been called. + int number_of_resets = 0; + + // The amount of memory currently allocated (should go to 0 when reset is + // called). + int currently_allocated_size = 0; + + // The total size of all allocations. + int total_allocated_size = 0; + + void* allocate(size_t size, ET_UNUSED size_t alignment = kDefaultAlignment) + override { + number_of_allocations += 1; + currently_allocated_size += size; + total_allocated_size += size; + // This is a simulation, we don't actually allocate memory. But we need to + // return a non-null pointer, so we return a bad, non-zero address that will + // crash if anyone tries to dereference it. + return (void*)1; + } + + void reset() override { + number_of_resets += 1; + currently_allocated_size = 0; + } +}; + class KernelIntegrationTest : public ::testing::Test { protected: void SetUp() override { @@ -152,7 +234,9 @@ class KernelIntegrationTest : public ::testing::Test { // Load the forward method. mmm_ = std::make_unique( - kDefaultNonConstMemBytes, kDefaultRuntimeMemBytes); + kDefaultNonConstMemBytes, + kDefaultRuntimeMemBytes, + temp_allocator_.get()); Result method = program_->load_method("forward", &mmm_->get()); ASSERT_EQ(method.error(), Error::Ok); method_ = std::make_unique(std::move(method.get())); @@ -185,6 +269,19 @@ class KernelIntegrationTest : public ::testing::Test { // The KernelControl associated with method_. KernelControl* control_; + + // The temp memory allocator provided by the user. By default, none is + // provided. + std::unique_ptr temp_allocator_ = nullptr; +}; + +class KernelTempMemoryAllocatorIntegrationTest : public KernelIntegrationTest { + protected: + void SetUp() override { + // Create a temp allocator for the test before calling the parent SetUp. + temp_allocator_ = std::make_unique(); + KernelIntegrationTest::SetUp(); + } }; TEST_F(KernelIntegrationTest, KernelHookIsCalled) { @@ -222,3 +319,63 @@ TEST_F(KernelIntegrationTest, FailurePropagates) { EXPECT_EQ(err, Error::Ok); EXPECT_EQ(control_->call_count, 3); } + +TEST_F(KernelIntegrationTest, DefaultPlatformMemoryAllocator) { + // Tell the kernel to allocate memory. Since no temp allocator is provided, + // this will allocate memory using the default platform memory allocator. + control_->allocate_temp_memory = true; + + control_->temp_memory_size = 4; + // This is not a simulation. This actually allocates memory, using the + // default platform memory allocator. + Error err = method_->execute(); + EXPECT_EQ(err, Error::Ok); + EXPECT_EQ(control_->call_count, 1); + EXPECT_EQ(control_->total_allocated_size, 4); + + control_->temp_memory_size = 8; + // This is not a simulation. This actually allocates memory, using the + // default platform memory allocator. + err = method_->execute(); + EXPECT_EQ(err, Error::Ok); + EXPECT_EQ(control_->call_count, 2); + EXPECT_EQ(control_->total_allocated_size, 12); +} + +TEST_F(KernelTempMemoryAllocatorIntegrationTest, UsingTempMemoryAllocator) { + // In this test we provide a temp allocator to the method, and tell the kernel + // to allocate memory using it. We want to make sure that the kernel uses the + // temp allocator, and that the temp allocator is reset after the execution. + // Since we are testing that the kernel uses the temp allocator, and not the + // temp allocator itself, we don't need to test the actual allocation of + // memory. 
Therefore, we set simulate_temp_memory_allocation to true, so that + // the kernel will not actually allocate memory, but will instead simulate + // allocating memory. + // The provided TempMemoryAllocator, simulates allocating memory by increasing + // total_allocated_size and currently_allocated_size by the requested size. + // We simulate resetting the allocator by setting currently_allocated_size + // back to 0. + control_->simulate_temp_memory_allocation = true; + + control_->temp_memory_size = 4; + Error err = method_->execute(); + EXPECT_EQ(err, Error::Ok); + EXPECT_EQ(control_->call_count, 1); + EXPECT_EQ(control_->total_allocated_size, 4); + EXPECT_EQ(temp_allocator_->number_of_allocations, 1); + EXPECT_EQ(temp_allocator_->total_allocated_size, 4); + // The temp allocator should have been reset after the execution. + EXPECT_EQ(temp_allocator_->number_of_resets, 1); + EXPECT_EQ(temp_allocator_->currently_allocated_size, 0); + + control_->temp_memory_size = 8; + err = method_->execute(); + EXPECT_EQ(err, Error::Ok); + EXPECT_EQ(control_->call_count, 2); + EXPECT_EQ(control_->total_allocated_size, 12); + EXPECT_EQ(temp_allocator_->number_of_allocations, 2); + EXPECT_EQ(temp_allocator_->total_allocated_size, 12); + // The temp allocator should have been reset after the execution. + EXPECT_EQ(temp_allocator_->number_of_resets, 2); + EXPECT_EQ(temp_allocator_->currently_allocated_size, 0); +} diff --git a/runtime/executor/test/kernel_resolution_test.cpp b/runtime/executor/test/kernel_resolution_test.cpp index 7ce16a8e9f3..aae0ff9b7ea 100644 --- a/runtime/executor/test/kernel_resolution_test.cpp +++ b/runtime/executor/test/kernel_resolution_test.cpp @@ -34,7 +34,7 @@ using executorch::runtime::KernelKey; using executorch::runtime::KernelRuntimeContext; using executorch::runtime::Method; using executorch::runtime::Program; -using executorch::runtime::register_kernels; +using executorch::runtime::register_kernel; using executorch::runtime::Result; using executorch::runtime::TensorMeta; using executorch::runtime::testing::ManagedMemoryManager; @@ -77,7 +77,7 @@ TEST_F(KernelResolutionTest, InitExecutionPlanSuccess) { (void)context; *(stack[0]) = Scalar(100); }); - auto s1 = register_kernels({kernel_1}); + auto s1 = register_kernel(kernel_1); EXPECT_EQ(s1, executorch::runtime::Error::Ok); ManagedMemoryManager mmm(kDefaultNonConstMemBytes, kDefaultRuntimeMemBytes); @@ -109,7 +109,7 @@ TEST_F(KernelResolutionTest, ResolveKernelKeySuccess) { (void)context; *(stack[0]) = Scalar(100); }); - auto s1 = register_kernels({kernel_1}); + auto s1 = register_kernel(kernel_1); EXPECT_EQ(s1, executorch::runtime::Error::Ok); ManagedMemoryManager mmm(kDefaultNonConstMemBytes, kDefaultRuntimeMemBytes); diff --git a/runtime/executor/test/managed_memory_manager.h b/runtime/executor/test/managed_memory_manager.h index 667aa35ca24..a01091527b0 100644 --- a/runtime/executor/test/managed_memory_manager.h +++ b/runtime/executor/test/managed_memory_manager.h @@ -27,7 +27,8 @@ class ManagedMemoryManager { public: ManagedMemoryManager( size_t planned_memory_bytes, - size_t method_allocator_bytes) + size_t method_allocator_bytes, + MemoryAllocator* temp_allocator = nullptr) : planned_memory_buffer_(new uint8_t[planned_memory_bytes]), planned_memory_span_( planned_memory_buffer_.get(), @@ -35,7 +36,7 @@ class ManagedMemoryManager { planned_memory_({&planned_memory_span_, 1}), method_allocator_pool_(new uint8_t[method_allocator_bytes]), method_allocator_(method_allocator_bytes, method_allocator_pool_.get()), - 
memory_manager_(&method_allocator_, &planned_memory_) {} + memory_manager_(&method_allocator_, &planned_memory_, temp_allocator) {} MemoryManager& get() { return memory_manager_; diff --git a/runtime/kernel/operator_registry.cpp b/runtime/kernel/operator_registry.cpp index a8fd50d7b91..78aa0a51732 100644 --- a/runtime/kernel/operator_registry.cpp +++ b/runtime/kernel/operator_registry.cpp @@ -8,53 +8,63 @@ #include -#include -#include #include #include +#include +#include namespace executorch { namespace runtime { -OperatorRegistry& getOperatorRegistry(); -OperatorRegistry& getOperatorRegistry() { - static OperatorRegistry operator_registry; - return operator_registry; -} - -Error register_kernels(const ArrayRef& kernels) { - Error success = getOperatorRegistry().register_kernels(kernels); - if (success == Error::InvalidArgument || success == Error::Internal) { - ET_CHECK_MSG( - false, - "Kernel registration failed with error %" PRIu32 - ", see error log for details.", - static_cast(success)); - } - return success; -} - -Error OperatorRegistry::register_kernels(const ArrayRef& kernels) { - // Operator registration happens in static initialization time when PAL init - // may or may not happen already. Here we are assuming et_pal_init() doesn't - // have any side effect even if falled multiple times. +namespace { + +// Maximum number of operators and their associated kernels that can be +// registered. +#ifdef MAX_KERNEL_NUM +constexpr uint32_t kMaxRegisteredKernels = MAX_KERNEL_NUM; +#else +constexpr uint32_t kMaxOperators = 250; +constexpr uint32_t kMaxKernelsPerOp = 8; +constexpr uint32_t kMaxRegisteredKernels = kMaxOperators * kMaxKernelsPerOp; +#endif + +// Data that backs the kernel table. Since Kernel has a custom default +// constructor (implicitly, because it contains KernelKey, which has a custom +// ctor), some toolchains don't like having a global array of them: it would +// require constructing them at init time. Since we don't care about the values +// until we add each entry to the table, allocate static zeroed memory instead +// and point the table at it. +// @lint-ignore CLANGTIDY facebook-hte-CArray +alignas(sizeof(Kernel)) uint8_t + registered_kernels_data[kMaxRegisteredKernels * sizeof(Kernel)]; + +/// Global table of registered kernels. +Kernel* registered_kernels = reinterpret_cast(registered_kernels_data); + +/// The number of kernels registered in the table. +size_t num_registered_kernels = 0; + +// Registers the kernels, but may return an error. +Error register_kernels_internal(const Span kernels) { + // Operator registration happens in static initialization time before or after + // PAL init, so call it here. It is safe to call multiple times. ::et_pal_init(); - if (kernels.size() + this->num_kernels_ > kMaxNumOfKernels) { + if (kernels.size() + num_registered_kernels > kMaxRegisteredKernels) { ET_LOG( Error, - "The total number of kernels to be registered is larger than the limit %" PRIu32 - ". %" PRIu32 - " kernels are already registered and we're trying to register another %" PRIu32 - " kernels.", - kMaxNumOfKernels, - (uint32_t)this->num_kernels_, + "The total number of kernels to be registered is larger than the limit " + "%" PRIu32 ". 
%" PRIu32 + " kernels are already registered and we're trying to register another " + "%" PRIu32 " kernels.", + kMaxRegisteredKernels, + (uint32_t)num_registered_kernels, (uint32_t)kernels.size()); ET_LOG(Error, "======== Kernels already in the registry: ========"); - for (size_t i = 0; i < this->num_kernels_; i++) { - ET_LOG(Error, "%s", this->kernels_[i].name_); - ET_LOG_KERNEL_KEY(this->kernels_[i].kernel_key_); + for (size_t i = 0; i < num_registered_kernels; i++) { + ET_LOG(Error, "%s", registered_kernels[i].name_); + ET_LOG_KERNEL_KEY(registered_kernels[i].kernel_key_); } ET_LOG(Error, "======== Kernels being registered: ========"); for (size_t i = 0; i < kernels.size(); i++) { @@ -67,9 +77,9 @@ Error OperatorRegistry::register_kernels(const ArrayRef& kernels) { const char* lib_name = et_pal_get_shared_library_name(kernels.data()); for (const auto& kernel : kernels) { - // linear search. This is fine if the number of kernels are small. - for (int32_t i = 0; i < this->num_kernels_; i++) { - Kernel k = this->kernels_[i]; + // Linear search. This is fine if the number of kernels is small. + for (int32_t i = 0; i < num_registered_kernels; i++) { + Kernel k = registered_kernels[i]; if (strcmp(kernel.name_, k.name_) == 0 && kernel.kernel_key_ == k.kernel_key_) { ET_LOG(Error, "Re-registering %s, from %s", k.name_, lib_name); @@ -77,7 +87,7 @@ Error OperatorRegistry::register_kernels(const ArrayRef& kernels) { return Error::InvalidArgument; } } - this->kernels_[this->num_kernels_++] = kernel; + registered_kernels[num_registered_kernels++] = kernel; } ET_LOG( Debug, @@ -87,11 +97,23 @@ Error OperatorRegistry::register_kernels(const ArrayRef& kernels) { return Error::Ok; } -bool hasOpsFn(const char* name, ArrayRef kernel_key) { - return getOperatorRegistry().hasOpsFn(name, kernel_key); +} // namespace + +// Registers the kernels, but panics if an error occurs. Always returns Ok. 
+Error register_kernels(const Span kernels) { + Error success = register_kernels_internal(kernels); + if (success == Error::InvalidArgument || success == Error::Internal) { + ET_CHECK_MSG( + false, + "Kernel registration failed with error %" PRIu32 + ", see error log for details.", + static_cast(success)); + } + return success; } -static int copy_char_as_number_to_buf(char num, char* buf) { +namespace { +int copy_char_as_number_to_buf(char num, char* buf) { if ((char)num < 10) { *buf = '0' + (char)num; buf += 1; @@ -104,10 +126,10 @@ static int copy_char_as_number_to_buf(char num, char* buf) { return 2; } } +} // namespace -void make_kernel_key_string(ArrayRef key, char* buf); - -void make_kernel_key_string(ArrayRef key, char* buf) { +namespace internal { +void make_kernel_key_string(Span key, char* buf) { if (key.empty()) { // If no tensor is present in an op, kernel key does not apply return; @@ -130,61 +152,43 @@ void make_kernel_key_string(ArrayRef key, char* buf) { buf += 1; } } +} // namespace internal -bool OperatorRegistry::hasOpsFn( +bool registry_has_op_function( const char* name, - ArrayRef meta_list) { - char buf[KernelKey::MAX_SIZE] = {0}; - make_kernel_key_string(meta_list, buf); - KernelKey kernel_key = KernelKey(buf); - - for (size_t idx = 0; idx < this->num_kernels_; idx++) { - if (strcmp(this->kernels_[idx].name_, name) == 0) { - if (this->kernels_[idx].kernel_key_.is_fallback() || - this->kernels_[idx].kernel_key_ == kernel_key) { - return true; - } - } - } - - return false; + Span meta_list) { + return get_op_function_from_registry(name, meta_list).ok(); } -const OpFunction& getOpsFn(const char* name, ArrayRef kernel_key) { - return getOperatorRegistry().getOpsFn(name, kernel_key); -} - -const OpFunction& OperatorRegistry::getOpsFn( +Result get_op_function_from_registry( const char* name, - ArrayRef meta_list) { + Span meta_list) { + // @lint-ignore CLANGTIDY facebook-hte-CArray char buf[KernelKey::MAX_SIZE] = {0}; - make_kernel_key_string(meta_list, buf); + internal::make_kernel_key_string(meta_list, buf); KernelKey kernel_key = KernelKey(buf); int32_t fallback_idx = -1; - for (size_t idx = 0; idx < this->num_kernels_; idx++) { - if (strcmp(this->kernels_[idx].name_, name) == 0) { - if (this->kernels_[idx].kernel_key_ == kernel_key) { - return this->kernels_[idx].op_; + for (size_t idx = 0; idx < num_registered_kernels; idx++) { + if (strcmp(registered_kernels[idx].name_, name) == 0) { + if (registered_kernels[idx].kernel_key_ == kernel_key) { + return registered_kernels[idx].op_; } - if (this->kernels_[idx].kernel_key_.is_fallback()) { + if (registered_kernels[idx].kernel_key_.is_fallback()) { fallback_idx = idx; } } } if (fallback_idx != -1) { - return this->kernels_[fallback_idx].op_; + return registered_kernels[fallback_idx].op_; } - ET_CHECK_MSG(false, "kernel '%s' not found.", name); + ET_LOG(Error, "kernel '%s' not found.", name); ET_LOG_TENSOR_META(meta_list); + return Error::OperatorMissing; } -ArrayRef get_kernels() { - return getOperatorRegistry().get_kernels(); -} - -ArrayRef OperatorRegistry::get_kernels() { - return ArrayRef(this->kernels_, this->num_kernels_); +Span get_registered_kernels() { + return {registered_kernels, num_registered_kernels}; } } // namespace runtime diff --git a/runtime/kernel/operator_registry.h b/runtime/kernel/operator_registry.h index f1be83306f8..4b71f436d41 100644 --- a/runtime/kernel/operator_registry.h +++ b/runtime/kernel/operator_registry.h @@ -14,8 +14,11 @@ #include #include #include +#include +#include #include #include 
+ // Debug switch for operator registry #if defined(ET_OP_REGISTRY_DEBUG) #include @@ -48,12 +51,10 @@ using OpFunction = void (*)(KernelRuntimeContext&, EValue**); */ struct TensorMeta { exec_aten::ScalarType dtype_; - ArrayRef dim_order_; + Span dim_order_; TensorMeta() = default; - TensorMeta( - exec_aten::ScalarType dtype, - ArrayRef order) + TensorMeta(exec_aten::ScalarType dtype, Span order) : dtype_(dtype), dim_order_(order) {} bool operator==(const TensorMeta& other) const { @@ -190,73 +191,49 @@ struct Kernel { Kernel() {} }; -// Maximum number of operators and their associated kernels that can be -// registered. -constexpr uint32_t kOperatorTableMaxSize = 250; -constexpr uint32_t kMaxNumOfKernelPerOp = 8; -#ifdef MAX_KERNEL_NUM -constexpr uint32_t kMaxNumOfKernels = MAX_KERNEL_NUM; -#else -constexpr uint32_t kMaxNumOfKernels = - kOperatorTableMaxSize * kMaxNumOfKernelPerOp; -#endif +namespace internal { +void make_kernel_key_string(Span key, char* buf); +} // namespace internal + /** - * See OperatorRegistry::hasOpsFn() + * Checks whether an operator exists with a given name and TensorMeta list. When + * TensorMeta is empty, it means this op does not have specialized kernels, so + * it checks whether it has any fallback kernels. */ -bool hasOpsFn(const char* name, ArrayRef meta_list = {}); +bool registry_has_op_function( + const char* name, + Span meta_list = {}); /** - * See OperatorRegistry::getOpsFn() + * Returns the operator with a given name and TensorMeta list, if present. */ -const OpFunction& getOpsFn( +::executorch::runtime::Result get_op_function_from_registry( const char* name, - ArrayRef meta_list = {}); + Span meta_list = {}); /** - * See OperatorRegistry::get_kernels() + * Returns all registered kernels. */ -ArrayRef get_kernels(); +Span get_registered_kernels(); /** - * See OperatorRegistry::register_kernels(). Notice that the returned Error - * object should be handled internally and the reason for keep returning is to - * satisfy the requirement to run this in static initialization time. + * Registers the provided kernels. + * + * @param[in] kernels Kernel objects to register. + * @retval Error::Ok always. Panics on error. This function needs to return a + * non-void type to run at static initialization time. */ -ET_NODISCARD Error register_kernels(const ArrayRef&); - -struct OperatorRegistry { - public: - OperatorRegistry() : num_kernels_(0) {} - - /** - * Registers the Kernels object (i.e. string name and function reference - * pair). The kernels will be merged into Operators based on the op name. - * - * @param[in] kernels Kernel object - * @retval Error code representing whether registration was successful. - */ - ET_NODISCARD Error register_kernels(const ArrayRef&); - - /** - * Checks whether an operator with a given name and TensorMeta list. - * When TensorMeta is empty, it means this op does not have specialized - * kernels, so it checks whether it has any fallback kernels. - */ - bool hasOpsFn(const char* name, ArrayRef meta_list); +ET_NODISCARD Error register_kernels(const Span); - /** - * Get the operator with a given name and TensorMeta list - */ - const OpFunction& getOpsFn(const char* name, ArrayRef meta_list); - - /** - * Return all registered operators. - */ - ArrayRef get_kernels(); - - private: - Kernel kernels_[kMaxNumOfKernels]; - uint32_t num_kernels_; +/** + * Registers a single kernel. + * + * @param[in] kernel Kernel object to register. + * @retval Error::Ok always. Panics on error. 
This function needs to return a + * non-void type to run at static initialization time. + */ +ET_NODISCARD inline Error register_kernel(const Kernel& kernel) { + return register_kernels({&kernel, 1}); }; } // namespace runtime @@ -266,16 +243,32 @@ namespace torch { namespace executor { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. -using ::executorch::runtime::get_kernels; -using ::executorch::runtime::getOpsFn; -using ::executorch::runtime::hasOpsFn; using ::executorch::runtime::Kernel; using ::executorch::runtime::KernelKey; using ::executorch::runtime::KernelRuntimeContext; -using ::executorch::runtime::OperatorRegistry; using ::executorch::runtime::OpFunction; -using ::executorch::runtime::register_kernels; using ::executorch::runtime::TensorMeta; using RuntimeContext = ::executorch::runtime::KernelRuntimeContext; + +inline ::executorch::runtime::Error register_kernels(ArrayRef kernels) { + return ::executorch::runtime::register_kernels( + {kernels.data(), kernels.size()}); +} +inline OpFunction getOpsFn( + const char* name, + ArrayRef meta_list = {}) { + auto result = ::executorch::runtime::get_op_function_from_registry( + name, {meta_list.data(), meta_list.size()}); + ET_CHECK(result.ok()); // get_op_function_from_registry() logs details. + return *result; +} +inline bool hasOpsFn(const char* name, ArrayRef meta_list = {}) { + return ::executorch::runtime::registry_has_op_function( + name, {meta_list.data(), meta_list.size()}); +} +inline ArrayRef get_kernels() { + Span kernels = ::executorch::runtime::get_registered_kernels(); + return ArrayRef(kernels.data(), kernels.size()); +} } // namespace executor } // namespace torch diff --git a/runtime/kernel/test/kernel_double_registration_test.cpp b/runtime/kernel/test/kernel_double_registration_test.cpp index bef3b46f46b..1739dffd31b 100644 --- a/runtime/kernel/test/kernel_double_registration_test.cpp +++ b/runtime/kernel/test/kernel_double_registration_test.cpp @@ -20,6 +20,7 @@ using executorch::runtime::Error; using executorch::runtime::EValue; using executorch::runtime::Kernel; using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::register_kernels; class KernelDoubleRegistrationTest : public ::testing::Test { public: @@ -33,10 +34,9 @@ TEST_F(KernelDoubleRegistrationTest, Basic) { "aten::add.out", "v1/7;0,1,2,3|7;0,1,2,3|7;0,1,2,3", [](KernelRuntimeContext&, EValue**) {})}; - ArrayRef kernels_array = ArrayRef(kernels); Error err = Error::InvalidArgument; ET_EXPECT_DEATH( - { auto res = register_kernels(kernels_array); }, + { (void)register_kernels({kernels}); }, std::to_string(static_cast(err))); } diff --git a/runtime/kernel/test/operator_registry_max_kernel_num_test.cpp b/runtime/kernel/test/operator_registry_max_kernel_num_test.cpp index 16520358c75..6f6fe4b9e1b 100644 --- a/runtime/kernel/test/operator_registry_max_kernel_num_test.cpp +++ b/runtime/kernel/test/operator_registry_max_kernel_num_test.cpp @@ -19,9 +19,10 @@ using namespace ::testing; using executorch::runtime::ArrayRef; using executorch::runtime::Error; using executorch::runtime::EValue; -using executorch::runtime::hasOpsFn; using executorch::runtime::Kernel; using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::register_kernels; +using executorch::runtime::registry_has_op_function; class OperatorRegistryMaxKernelNumTest : public ::testing::Test { public: @@ -33,11 +34,10 @@ class OperatorRegistryMaxKernelNumTest : public ::testing::Test { // 
Register one kernel when max_kernel_num=1; success TEST_F(OperatorRegistryMaxKernelNumTest, RegisterOneOp) { Kernel kernels[] = {Kernel("foo", [](KernelRuntimeContext&, EValue**) {})}; - ArrayRef kernels_array = ArrayRef(kernels); - auto s1 = register_kernels(kernels_array); + auto s1 = register_kernels({kernels}); EXPECT_EQ(s1, Error::Ok); - EXPECT_FALSE(hasOpsFn("fpp")); - EXPECT_TRUE(hasOpsFn("foo")); + EXPECT_FALSE(registry_has_op_function("fpp")); + EXPECT_TRUE(registry_has_op_function("foo")); } // Register two kernels when max_kernel_num=1; fail @@ -45,8 +45,7 @@ TEST_F(OperatorRegistryMaxKernelNumTest, RegisterTwoOpsFail) { Kernel kernels[] = { Kernel("foo1", [](KernelRuntimeContext&, EValue**) {}), Kernel("foo2", [](KernelRuntimeContext&, EValue**) {})}; - ArrayRef kernels_array = ArrayRef(kernels); ET_EXPECT_DEATH( - { (void)register_kernels(kernels_array); }, + { (void)register_kernels({kernels}); }, "The total number of kernels to be registered is larger than the limit 1"); } diff --git a/runtime/kernel/test/operator_registry_test.cpp b/runtime/kernel/test/operator_registry_test.cpp index 60cd5723cd0..57439a2bd0f 100644 --- a/runtime/kernel/test/operator_registry_test.cpp +++ b/runtime/kernel/test/operator_registry_test.cpp @@ -10,6 +10,8 @@ #include #include +#include +#include #include #include #include @@ -20,15 +22,17 @@ using namespace ::testing; using exec_aten::Scalar; using exec_aten::ScalarType; using exec_aten::Tensor; -using executorch::runtime::ArrayRef; using executorch::runtime::Error; using executorch::runtime::EValue; -using executorch::runtime::hasOpsFn; +using executorch::runtime::get_op_function_from_registry; using executorch::runtime::Kernel; using executorch::runtime::KernelKey; using executorch::runtime::KernelRuntimeContext; using executorch::runtime::OpFunction; using executorch::runtime::register_kernels; +using executorch::runtime::registry_has_op_function; +using executorch::runtime::Result; +using executorch::runtime::Span; using executorch::runtime::TensorMeta; using executorch::runtime::testing::make_kernel_key; @@ -41,18 +45,18 @@ class OperatorRegistryTest : public ::testing::Test { TEST_F(OperatorRegistryTest, Basic) { Kernel kernels[] = {Kernel("foo", [](KernelRuntimeContext&, EValue**) {})}; - ArrayRef kernels_array = ArrayRef(kernels); - auto s1 = register_kernels(kernels_array); - EXPECT_FALSE(hasOpsFn("fpp")); - EXPECT_TRUE(hasOpsFn("foo")); + Span kernels_span(kernels); + (void)register_kernels(kernels_span); + EXPECT_FALSE(registry_has_op_function("fpp")); + EXPECT_TRUE(registry_has_op_function("foo")); } TEST_F(OperatorRegistryTest, RegisterOpsMoreThanOnceDie) { Kernel kernels[] = { Kernel("foo", [](KernelRuntimeContext&, EValue**) {}), Kernel("foo", [](KernelRuntimeContext&, EValue**) {})}; - ArrayRef kernels_array = ArrayRef(kernels); - ET_EXPECT_DEATH({ auto res = register_kernels(kernels_array); }, ""); + Span kernels_span = Span(kernels); + ET_EXPECT_DEATH({ (void)register_kernels(kernels_span); }, ""); } constexpr int BUF_SIZE = KernelKey::MAX_SIZE; @@ -91,24 +95,31 @@ TEST_F(OperatorRegistryTest, RegisterKernels) { (void)context; *(stack[0]) = Scalar(100); }); - auto s1 = register_kernels({kernel_1}); + auto s1 = register_kernels({&kernel_1, 1}); EXPECT_EQ(s1, Error::Ok); Tensor::DimOrderType dims[] = {0, 1, 2, 3}; - auto dim_order_type = ArrayRef(dims, 4); + auto dim_order_type = Span(dims, 4); TensorMeta meta[] = {TensorMeta(ScalarType::Long, dim_order_type)}; - ArrayRef user_kernel_key = ArrayRef(meta, 1); - 
EXPECT_TRUE(hasOpsFn("test::boo", user_kernel_key)); + Span user_kernel_key(meta); + // no fallback kernel is registered - EXPECT_FALSE(hasOpsFn("test::boo", {})); - OpFunction func = getOpsFn("test::boo", user_kernel_key); + EXPECT_FALSE(registry_has_op_function("test::boo", {})); + Result fallback_func = + get_op_function_from_registry("test::boo", {}); + EXPECT_NE(fallback_func.error(), Error::Ok); + + EXPECT_TRUE(registry_has_op_function("test::boo", user_kernel_key)); + Result func = + get_op_function_from_registry("test::boo", user_kernel_key); + EXPECT_EQ(func.error(), Error::Ok); EValue values[1]; values[0] = Scalar(0); EValue* kernels[1]; kernels[0] = &values[0]; KernelRuntimeContext context{}; - func(context, kernels); + (*func)(context, kernels); auto val = values[0].toScalar().to(); ASSERT_EQ(val, 100); @@ -136,18 +147,18 @@ TEST_F(OperatorRegistryTest, RegisterTwoKernels) { auto s1 = register_kernels(kernels); // has both kernels Tensor::DimOrderType dims[] = {0, 1, 2, 3}; - auto dim_order_type = ArrayRef(dims, 4); + auto dim_order_type = Span(dims, 4); TensorMeta meta[] = {TensorMeta(ScalarType::Long, dim_order_type)}; - ArrayRef user_kernel_key_1 = ArrayRef(meta, 1); + Span user_kernel_key_1(meta); TensorMeta meta_2[] = {TensorMeta(ScalarType::Float, dim_order_type)}; - ArrayRef user_kernel_key_2 = ArrayRef(meta_2, 1); - - EXPECT_TRUE(hasOpsFn("test::bar", user_kernel_key_1)); - EXPECT_TRUE(hasOpsFn("test::bar", user_kernel_key_2)); + Span user_kernel_key_2(meta_2); // no fallback kernel is registered - EXPECT_FALSE(hasOpsFn("test::bar", {})); + EXPECT_FALSE(registry_has_op_function("test::bar", {})); + Result fallback_func = + get_op_function_from_registry("test::bar", {}); + EXPECT_NE(fallback_func.error(), Error::Ok); EValue values[1]; values[0] = Scalar(0); @@ -156,16 +167,22 @@ TEST_F(OperatorRegistryTest, RegisterTwoKernels) { KernelRuntimeContext context{}; // test kernel_1 - OpFunction func_1 = getOpsFn("test::bar", user_kernel_key_1); - func_1(context, evalues); + EXPECT_TRUE(registry_has_op_function("test::bar", user_kernel_key_1)); + Result func_1 = + get_op_function_from_registry("test::bar", user_kernel_key_1); + EXPECT_EQ(func_1.error(), Error::Ok); + (*func_1)(context, evalues); auto val_1 = values[0].toScalar().to(); ASSERT_EQ(val_1, 100); // test kernel_2 + EXPECT_TRUE(registry_has_op_function("test::bar", user_kernel_key_2)); + Result func_2 = + get_op_function_from_registry("test::bar", user_kernel_key_2); + EXPECT_EQ(func_2.error(), Error::Ok); values[0] = Scalar(0); - OpFunction func_2 = getOpsFn("test::bar", user_kernel_key_2); - func_2(context, evalues); + (*func_2)(context, evalues); auto val_2 = values[0].toScalar().to(); ASSERT_EQ(val_2, 50); @@ -202,27 +219,26 @@ TEST_F(OperatorRegistryTest, ExecutorChecksKernel) { (void)context; *(stack[0]) = Scalar(100); }); - auto s1 = register_kernels({kernel_1}); + auto s1 = register_kernels({&kernel_1, 1}); EXPECT_EQ(s1, Error::Ok); Tensor::DimOrderType dims[] = {0, 1, 2, 3}; - auto dim_order_type = ArrayRef(dims, 4); + auto dim_order_type = Span(dims, 4); TensorMeta meta[] = {TensorMeta(ScalarType::Long, dim_order_type)}; - ArrayRef user_kernel_key_1 = ArrayRef(meta, 1); - EXPECT_TRUE(hasOpsFn("test::qux", user_kernel_key_1)); + Span user_kernel_key_1(meta); + EXPECT_TRUE(registry_has_op_function("test::qux", user_kernel_key_1)); Tensor::DimOrderType dims_channel_first[] = {0, 3, 1, 2}; auto dim_order_type_channel_first = - ArrayRef(dims_channel_first, 4); + Span(dims_channel_first, 4); TensorMeta 
meta_channel_first[] = { TensorMeta(ScalarType::Long, dim_order_type_channel_first)}; - ArrayRef user_kernel_key_2 = - ArrayRef(meta_channel_first, 1); - EXPECT_FALSE(hasOpsFn("test::qux", user_kernel_key_2)); + Span user_kernel_key_2(meta_channel_first); + EXPECT_FALSE(registry_has_op_function("test::qux", user_kernel_key_2)); TensorMeta meta_float[] = {TensorMeta(ScalarType::Float, dim_order_type)}; - ArrayRef user_kernel_key_3 = ArrayRef(meta_float, 1); - EXPECT_FALSE(hasOpsFn("test::qux", ArrayRef(user_kernel_key_3))); + Span user_kernel_key_3(meta_float); + EXPECT_FALSE(registry_has_op_function("test::qux", user_kernel_key_3)); } TEST_F(OperatorRegistryTest, ExecutorUsesKernel) { @@ -235,23 +251,25 @@ TEST_F(OperatorRegistryTest, ExecutorUsesKernel) { (void)context; *(stack[0]) = Scalar(100); }); - auto s1 = register_kernels({kernel_1}); + auto s1 = register_kernels({&kernel_1, 1}); EXPECT_EQ(s1, Error::Ok); Tensor::DimOrderType dims[] = {0, 1, 2, 3}; - auto dim_order_type = ArrayRef(dims, 4); + auto dim_order_type = Span(dims, 4); TensorMeta meta[] = {TensorMeta(ScalarType::Long, dim_order_type)}; - ArrayRef user_kernel_key_1 = ArrayRef(meta, 1); - EXPECT_TRUE(hasOpsFn("test::quux", ArrayRef(meta))); + Span user_kernel_key_1(meta); - OpFunction func = getOpsFn("test::quux", ArrayRef(meta)); + EXPECT_TRUE(registry_has_op_function("test::quux", user_kernel_key_1)); + Result func = + get_op_function_from_registry("test::quux", user_kernel_key_1); + EXPECT_EQ(func.error(), Error::Ok); EValue values[1]; values[0] = Scalar(0); EValue* kernels[1]; kernels[0] = &values[0]; KernelRuntimeContext context{}; - func(context, kernels); + (*func)(context, kernels); auto val = values[0].toScalar().to(); ASSERT_EQ(val, 100); @@ -265,20 +283,21 @@ TEST_F(OperatorRegistryTest, ExecutorUsesFallbackKernel) { (void)context; *(stack[0]) = Scalar(100); }); - auto s1 = register_kernels({kernel_1}); + auto s1 = register_kernels({&kernel_1, 1}); EXPECT_EQ(s1, Error::Ok); - EXPECT_TRUE(hasOpsFn("test::corge")); - EXPECT_TRUE(hasOpsFn("test::corge", ArrayRef())); + EXPECT_TRUE(registry_has_op_function("test::corge")); + EXPECT_TRUE(registry_has_op_function("test::corge", {})); - OpFunction func = getOpsFn("test::corge", ArrayRef()); + Result func = get_op_function_from_registry("test::corge", {}); + EXPECT_EQ(func.error(), Error::Ok); EValue values[1]; values[0] = Scalar(0); EValue* kernels[1]; kernels[0] = &values[0]; KernelRuntimeContext context{}; - func(context, kernels); + (*func)(context, kernels); auto val = values[0].toScalar().to(); ASSERT_EQ(val, 100); diff --git a/runtime/kernel/test/test_kernel_manual_registration.cpp b/runtime/kernel/test/test_kernel_manual_registration.cpp index c150b61ad73..de8853c7813 100644 --- a/runtime/kernel/test/test_kernel_manual_registration.cpp +++ b/runtime/kernel/test/test_kernel_manual_registration.cpp @@ -15,7 +15,7 @@ using namespace ::testing; using executorch::runtime::Error; -using executorch::runtime::hasOpsFn; +using executorch::runtime::registry_has_op_function; class KernelManualRegistrationTest : public ::testing::Test { public: @@ -26,15 +26,15 @@ class KernelManualRegistrationTest : public ::testing::Test { TEST_F(KernelManualRegistrationTest, ManualRegister) { // Before registering, we can't find the add operator. - EXPECT_FALSE(hasOpsFn("aten::add.out")); + EXPECT_FALSE(registry_has_op_function("aten::add.out")); // Call the generated registration function. 
Error result = torch::executor::register_all_kernels(); EXPECT_EQ(result, Error::Ok); // We can now find the registered add operator. - EXPECT_TRUE(hasOpsFn("aten::add.out")); + EXPECT_TRUE(registry_has_op_function("aten::add.out")); // We can't find a random other operator. - EXPECT_FALSE(hasOpsFn("fpp")); + EXPECT_FALSE(registry_has_op_function("fpp")); } diff --git a/runtime/kernel/test/test_util.h b/runtime/kernel/test/test_util.h index 23993fd39d6..0c6c651af32 100644 --- a/runtime/kernel/test/test_util.h +++ b/runtime/kernel/test/test_util.h @@ -16,9 +16,6 @@ namespace executorch { namespace runtime { -// Defined in //executorch/runtime/kernel/operator_registry.cpp. -void make_kernel_key_string(ArrayRef key, char* buf); - namespace testing { inline void make_kernel_key( @@ -28,12 +25,11 @@ inline void make_kernel_key( char* buf) { std::vector meta; for (auto& t : tensors) { - ArrayRef dim_order( - t.second.data(), t.second.size()); + Span dim_order(t.second.data(), t.second.size()); meta.emplace_back(t.first, dim_order); } - auto meatadata = ArrayRef(meta.data(), meta.size()); - make_kernel_key_string(meatadata, buf); + Span metadata(meta.data(), meta.size()); + internal::make_kernel_key_string(metadata, buf); } } // namespace testing diff --git a/runtime/platform/compiler.h b/runtime/platform/compiler.h index c7f603756c8..9a8e18c0f1e 100644 --- a/runtime/platform/compiler.h +++ b/runtime/platform/compiler.h @@ -13,17 +13,32 @@ #pragma once -// Compiler support checks. +/* + * Compiler support checks. Follows the logic used by pytorch/c10/util/C++17.h + * but may support older versions. + */ + +// https://gcc.gnu.org/projects/cxx-status.html#cxx17 +#if !defined(__clang__) && !defined(_MSC_VER) && defined(__GNUC__) && \ + __GNUC__ < 7 +#error \ + "You're trying to build ExecuTorch with a too old version of GCC. We need GCC 7 or later." +#endif + +// https://clang.llvm.org/cxx_status.html#cxx17 +#if defined(__clang__) && __clang_major__ < 5 +#error \ + "You're trying to build ExecuTorch with a too old version of Clang. We need Clang 5 or later." +#endif -#if !defined(__cplusplus) -#error ExecuTorch must be compiled using a C++ compiler. +#if (defined(_MSC_VER) && (!defined(_MSVC_LANG) || _MSVC_LANG < 201703L)) || \ + (!defined(_MSC_VER) && __cplusplus < 201703L) +#error "You need C++17 to compile ExecuTorch" #endif -#if __cplusplus < 201103L && (!defined(_MSC_VER) || _MSC_VER < 1600) && \ - (!defined(__GNUC__) || \ - (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__ < 40400)) -#error ExecuTorch must use a compiler supporting at least the C++11 standard. 
-#error __cplusplus _MSC_VER __GNUC__ __GNUC_MINOR__ __GNUC_PATCHLEVEL__
+#if defined(_WIN32) && (defined(min) || defined(max))
+#error \
+    "Macro clash with min and max -- define NOMINMAX when compiling your program on Windows"
 #endif
 
 /*
diff --git a/runtime/platform/default/minimal.cpp b/runtime/platform/default/minimal.cpp
index e1db2083f4a..8236f993188 100644
--- a/runtime/platform/default/minimal.cpp
+++ b/runtime/platform/default/minimal.cpp
@@ -47,3 +47,9 @@ void et_pal_emit_log_message(
     ET_UNUSED size_t line,
     ET_UNUSED const char* message,
     ET_UNUSED size_t length) {}
+
+void* et_pal_allocate(ET_UNUSED size_t size) {
+  return nullptr;
+}
+
+void et_pal_free(ET_UNUSED void* ptr) {}
diff --git a/runtime/platform/default/posix.cpp b/runtime/platform/default/posix.cpp
index cfc8cafc491..aba504f53e0 100644
--- a/runtime/platform/default/posix.cpp
+++ b/runtime/platform/default/posix.cpp
@@ -170,3 +170,26 @@ void et_pal_emit_log_message(
       message);
   fflush(ET_LOG_OUTPUT_FILE);
 }
+
+/**
+ * NOTE: Core runtime code must not call this directly. It may only be called by
+ * a MemoryAllocator wrapper.
+ *
+ * Allocates size bytes of memory via malloc.
+ *
+ * @param[in] size Number of bytes to allocate.
+ * @returns the allocated memory, or nullptr on failure. Must be freed using
+ *     et_pal_free().
+ */
+void* et_pal_allocate(size_t size) {
+  return malloc(size);
+}
+
+/**
+ * Frees memory allocated by et_pal_allocate().
+ *
+ * @param[in] ptr Pointer to memory to free. May be nullptr.
+ */
+void et_pal_free(void* ptr) {
+  free(ptr);
+}
diff --git a/runtime/platform/platform.h b/runtime/platform/platform.h
index e29dad8e9a8..03cdef8eb2f 100644
--- a/runtime/platform/platform.h
+++ b/runtime/platform/platform.h
@@ -115,4 +115,23 @@ void et_pal_emit_log_message(
     const char* message,
     size_t length) ET_INTERNAL_PLATFORM_WEAKNESS;
 
+/**
+ * NOTE: Core runtime code must not call this directly. It may only be called by
+ * a MemoryAllocator wrapper.
+ *
+ * Allocates size bytes of memory.
+ *
+ * @param[in] size Number of bytes to allocate.
+ * @returns the allocated memory, or nullptr on failure. Must be freed using
+ *     et_pal_free().
+ */
+void* et_pal_allocate(size_t size) ET_INTERNAL_PLATFORM_WEAKNESS;
+
+/**
+ * Frees memory allocated by et_pal_allocate().
+ *
+ * @param[in] ptr Pointer to memory to free. May be nullptr.
+ */
+void et_pal_free(void* ptr) ET_INTERNAL_PLATFORM_WEAKNESS;
+
 } // extern "C"
diff --git a/runtime/platform/test/executor_pal_override_test.cpp b/runtime/platform/test/executor_pal_override_test.cpp
index bb9ea2ce589..9bc500e652e 100644
--- a/runtime/platform/test/executor_pal_override_test.cpp
+++ b/runtime/platform/test/executor_pal_override_test.cpp
@@ -53,12 +53,29 @@ class PalSpy : public PlatformIntercept {
     last_log_message_args.length = length;
   }
 
+  void* allocate(size_t size) override {
+    ++allocate_call_count;
+    last_allocated_size = size;
+    last_allocated_ptr = (void*)0x1234;
+    return nullptr;
+  }
+
+  void free(void* ptr) override {
+    ++free_call_count;
+    last_freed_ptr = ptr;
+  }
+
   virtual ~PalSpy() = default;
 
   size_t init_call_count = 0;
   size_t current_ticks_call_count = 0;
   size_t emit_log_message_call_count = 0;
   et_tick_ratio_t tick_ns_multiplier = {1, 1};
+  size_t allocate_call_count = 0;
+  size_t free_call_count = 0;
+  size_t last_allocated_size = 0;
+  void* last_allocated_ptr = nullptr;
+  void* last_freed_ptr = nullptr;
 
   /// The args that were passed to the most recent call to emit_log_message().
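   /// Captured by the spy so tests can assert on the exact values the runtime
   /// handed to the PAL logging hook.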
   struct {
@@ -158,4 +175,33 @@ TEST(ExecutorPalOverrideTest, TickToNsMultiplier) {
   EXPECT_EQ(et_pal_ticks_to_ns_multiplier().denominator, 1);
 }
 
+TEST(ExecutorPalOverrideTest, AllocateSmokeTest) {
+  PalSpy spy;
+  InterceptWith iw(spy);
+
+  // Validate that et_pal_allocate is overridden.
+  EXPECT_EQ(spy.allocate_call_count, 0);
+  EXPECT_EQ(spy.last_allocated_ptr, nullptr);
+  et_pal_allocate(4);
+  EXPECT_EQ(spy.allocate_call_count, 1);
+  EXPECT_EQ(spy.last_allocated_size, 4);
+  EXPECT_EQ(spy.last_allocated_ptr, (void*)0x1234);
+}
+
+TEST(ExecutorPalOverrideTest, FreeSmokeTest) {
+  PalSpy spy;
+  InterceptWith iw(spy);
+
+  et_pal_allocate(4);
+  EXPECT_EQ(spy.last_allocated_size, 4);
+  EXPECT_EQ(spy.last_allocated_ptr, (void*)0x1234);
+
+  // Validate that et_pal_free is overridden.
+  EXPECT_EQ(spy.free_call_count, 0);
+  EXPECT_EQ(spy.last_freed_ptr, nullptr);
+  et_pal_free(spy.last_allocated_ptr);
+  EXPECT_EQ(spy.free_call_count, 1);
+  EXPECT_EQ(spy.last_freed_ptr, (void*)0x1234);
+}
+
 #endif
diff --git a/runtime/platform/test/stub_platform.cpp b/runtime/platform/test/stub_platform.cpp
index f7ad2f9ee63..8cee404e4e1 100644
--- a/runtime/platform/test/stub_platform.cpp
+++ b/runtime/platform/test/stub_platform.cpp
@@ -75,6 +75,16 @@ void et_pal_emit_log_message(
       timestamp, level, filename, function, line, message, length);
 }
 
+void* et_pal_allocate(size_t size) {
+  ASSERT_INTERCEPT_INSTALLED();
+  return platform_intercept->allocate(size);
+}
+
+void et_pal_free(void* ptr) {
+  ASSERT_INTERCEPT_INSTALLED();
+  platform_intercept->free(ptr);
+}
+
 } // extern "C"
 
 #include
diff --git a/runtime/platform/test/stub_platform.h b/runtime/platform/test/stub_platform.h
index af3756f3136..de5599b53b0 100644
--- a/runtime/platform/test/stub_platform.h
+++ b/runtime/platform/test/stub_platform.h
@@ -45,6 +45,12 @@
       ET_UNUSED const char* message,
       ET_UNUSED size_t length) {}
 
+  virtual void* allocate(ET_UNUSED size_t size) {
+    return nullptr;
+  }
+
+  virtual void free(ET_UNUSED void* ptr) {}
+
   virtual ~PlatformIntercept() = default;
 };
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 5dbe47c8671..b651bd2dd93 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -19,8 +19,7 @@ cmake_minimum_required(VERSION 3.19)
 
 project(size_test)
 
-# Use C++11 for size test.
-set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD 17)
 
 set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..)
diff --git a/test/build_size_test.sh b/test/build_size_test.sh
index 540b78e9f05..428e351cf08 100644
--- a/test/build_size_test.sh
+++ b/test/build_size_test.sh
@@ -11,29 +11,12 @@
 set -e
 
 # shellcheck source=/dev/null
 source "$(dirname "${BASH_SOURCE[0]}")/../.ci/scripts/utils.sh"
 
-# Set compile flags for Clang and GCC.
-# -Wno-gnu allows us to use gnu statement-expressions.
-# -Werror -Wc++17* ensure we do not use features from C++17.
-CXX_FLAGS="-Wno-gnu"
-compiler=$(cc --version)
-if [[ $compiler == *"clang"* ]]; then
-  CXX_FLAGS="$CXX_FLAGS -Werror -Wc++17-extensions -Wc++14-extensions"
-elif [[ $compiler == *"cc"* ]]; then
-  CXX_FLAGS="$CXX_FLAGS -Werror -Wc++17-compat -Wc++14-compat"
-else
-  echo "Unknown compiler: $compiler"
-  exit 1
-fi
-echo "Using compiler $compiler with flags $CXX_FLAGS"
-
 cmake_install_executorch_lib() {
   echo "Installing libexecutorch.a"
   rm -rf cmake-out
   retry cmake -DBUCK2="$BUCK2" \
-          -DCMAKE_CXX_STANDARD=11 \
           -DCMAKE_CXX_STANDARD_REQUIRED=ON \
-          -DCMAKE_CXX_FLAGS="$CXX_FLAGS" \
           -DCMAKE_INSTALL_PREFIX=cmake-out \
           -DCMAKE_BUILD_TYPE=Release \
           -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \
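Note on the new PAL hooks above: et_pal_allocate() and et_pal_free() are declared with ET_INTERNAL_PLATFORM_WEAKNESS, so an embedded port can supply strong definitions of its own, just as stub_platform.cpp does for the tests. A minimal sketch of such an override, assuming a bare-metal target without malloc(); the 16 KiB pool size and the bump-allocation policy are illustrative assumptions, not something this change prescribes:

// Illustrative sketch only: a bump allocator over a static pool, standing in
// for the malloc-based default in posix.cpp on a target without a heap.
#include <cstddef>
#include <cstdint>

namespace {
// Backing storage for the bump allocator. Max-aligned so every block handed
// out below is suitably aligned for any object type.
alignas(alignof(std::max_align_t)) uint8_t pool[16 * 1024];
size_t used = 0;
} // namespace

extern "C" {

void* et_pal_allocate(size_t size) {
  // Round the request up so the next block stays max-aligned.
  const size_t aligned = (size + alignof(std::max_align_t) - 1) &
      ~(alignof(std::max_align_t) - 1);
  if (used + aligned > sizeof(pool)) {
    return nullptr; // Pool exhausted; the contract allows returning nullptr.
  }
  void* ptr = &pool[used];
  used += aligned;
  return ptr;
}

void et_pal_free(void* ptr) {
  (void)ptr; // A bump allocator cannot reclaim individual blocks.
}

} // extern "C"

Because the default definitions are weak, linking strong definitions like these should replace them with no other build changes, mirroring how et_pal_emit_log_message is already overridden.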