diff --git a/.ci/scripts/test_backend.sh b/.ci/scripts/test_backend.sh
index 1a8e3219be0..86d53c996bc 100755
--- a/.ci/scripts/test_backend.sh
+++ b/.ci/scripts/test_backend.sh
@@ -85,7 +85,10 @@ else
 fi
 CMAKE_ARGS="$EXTRA_BUILD_ARGS" ${CONDA_RUN_CMD} $SETUP_SCRIPT --build-tool cmake --build-mode Release --editable true
+GOLDEN_DIR="${ARTIFACT_DIR}/golden-artifacts"
+export GOLDEN_ARTIFACTS_DIR="${GOLDEN_DIR}"
+
 EXIT_CODE=0
-${CONDA_RUN_CMD} pytest -c /dev/nul -n auto backends/test/suite/$SUITE/ -m flow_$FLOW --json-report --json-report-file="$REPORT_FILE" || EXIT_CODE=$?
+${CONDA_RUN_CMD} pytest -c /dev/null -n auto backends/test/suite/$SUITE/ -m flow_$FLOW --json-report --json-report-file="$REPORT_FILE" || EXIT_CODE=$?
 
 # Generate markdown summary.
 ${CONDA_RUN_CMD} python -m executorch.backends.test.suite.generate_markdown_summary_json "$REPORT_FILE" > ${GITHUB_STEP_SUMMARY:-"step_summary.md"} --exit-code $EXIT_CODE
diff --git a/.github/workflows/_test_backend.yml b/.github/workflows/_test_backend.yml
index ec426af8892..73303934d2c 100644
--- a/.github/workflows/_test_backend.yml
+++ b/.github/workflows/_test_backend.yml
@@ -59,6 +59,61 @@ jobs:
 
           source .ci/scripts/test_backend.sh "${{ matrix.suite }}" "${{ matrix.flow }}" "${RUNNER_ARTIFACT_DIR}"
 
+  package-golden-artifacts:
+    if: ${{ inputs.run-linux }}
+    needs: test-backend-linux
+    runs-on: linux.2xlarge
+    steps:
+      - name: Download model test artifacts
+        uses: actions/download-artifact@v4
+        with:
+          pattern: test-report-*-models
+          path: downloaded/
+
+      - name: Package golden artifacts
+        run: |
+          set -eux
+          TIMESTAMP=$(date -u +%y%m%d%H)
+          mkdir -p golden_combined
+
+          # Collect golden artifacts preserving flow directory structure.
+          # Raw files live under downloaded/*/golden-artifacts/{flow}/.
+          for flow_dir in downloaded/*/golden-artifacts/*/; do
+            [ -d "$flow_dir" ] || continue
+            flow_name=$(basename "$flow_dir")
+            if ls "$flow_dir"/*.pte 1>/dev/null 2>&1; then
+              mkdir -p "golden_combined/${flow_name}"
+              cp "$flow_dir"/*.pte "$flow_dir"/*_input*.bin "$flow_dir"/*_expected_output*.bin \
+                "golden_combined/${flow_name}/" 2>/dev/null || true
+            fi
+          done
+
+          if find golden_combined -name '*.pte' | grep -q .; then
+            (cd golden_combined && zip -r "../golden_artifacts_${TIMESTAMP}.zip" .)
+            echo "Created golden_artifacts_${TIMESTAMP}.zip"
+            find golden_combined -type f | head -20
+          else
+            echo "No golden artifacts found."
+          fi
+
+      - name: Upload combined golden artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: golden-artifacts-${{ inputs.backend }}
+          path: golden_artifacts_*.zip
+          if-no-files-found: ignore
+
+      - name: Upload golden artifacts to S3
+        uses: seemethere/upload-artifact-s3@v5
+        if: ${{ hashFiles('golden_artifacts_*.zip') != '' }}
+        with:
+          s3-bucket: gha-artifacts
+          s3-prefix: |
+            ${{ github.repository }}/test-backend-artifacts/golden-artifacts-${{ inputs.backend }}
+          retention-days: 90
+          if-no-files-found: ignore
+          path: golden_artifacts_*.zip
+
   test-backend-macos:
     if: ${{ inputs.run-macos }}
     strategy:
diff --git a/.github/workflows/test-backend-xnnpack.yml b/.github/workflows/test-backend-xnnpack.yml
index 086c9625a38..d51e0c4a4e9 100644
--- a/.github/workflows/test-backend-xnnpack.yml
+++ b/.github/workflows/test-backend-xnnpack.yml
@@ -12,6 +12,9 @@ on:
     paths:
       - .github/workflows/test-backend-xnnpack.yml
       - .github/workflows/_test_backend.yml
+      - .ci/scripts/test_backend.sh
+      - backends/test/harness/**
+      - backends/test/suite/**
   workflow_dispatch:
 
 concurrency:
diff --git a/backends/test/harness/tester.py b/backends/test/harness/tester.py
index 52c661dd748..95465cfe4a5 100644
--- a/backends/test/harness/tester.py
+++ b/backends/test/harness/tester.py
@@ -3,6 +3,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import logging
+import os
 import random
 from collections import Counter, OrderedDict
 from typing import Any, Callable, Dict, List, Optional, Tuple
@@ -317,11 +319,14 @@ def run_method_and_compare_outputs(
         rtol=1e-03,
         qtol=0,
         statistics_callback: Callable[[ErrorStatistics], None] | None = None,
+        artifact_dir: Optional[str] = None,
+        artifact_name: Optional[str] = None,
     ):
         number_of_runs = 1 if inputs is not None else num_runs
         reference_stage = self.stages[StageType.EXPORT]
 
         stage = stage or self.cur
+        artifacts_saved = False
 
         for _ in range(number_of_runs):
             inputs_to_run = inputs if inputs else next(self.generate_random_inputs())
@@ -346,8 +351,54 @@ def run_method_and_compare_outputs(
                 statistics_callback,
             )
 
+            if artifact_dir and artifact_name and not artifacts_saved:
+                try:
+                    self._dump_golden_artifacts(
+                        artifact_dir,
+                        artifact_name,
+                        inputs_to_run,
+                        reference_output,
+                    )
+                except Exception:
+                    logging.getLogger(__name__).warning(
+                        f"Failed to dump golden artifacts for {artifact_name}",
+                        exc_info=True,
+                    )
+                artifacts_saved = True
+
         return self
 
+    @staticmethod
+    def _dump_golden_artifacts(
+        artifact_dir: str,
+        artifact_name: str,
+        inputs: Tuple[torch.Tensor, ...],
+        reference_output,
+    ):
+        logger = logging.getLogger(__name__)
+        os.makedirs(artifact_dir, exist_ok=True)
+
+        for i, inp in enumerate(inputs):
+            if isinstance(inp, torch.Tensor):
+                suffix = "" if len(inputs) == 1 else f"_{i}"
+                path = os.path.join(artifact_dir, f"{artifact_name}_input{suffix}.bin")
+                inp.contiguous().numpy().tofile(path)
+                logger.info(f"Saved golden input to {path}")
+
+        if isinstance(reference_output, torch.Tensor):
+            reference_output = (reference_output,)
+        elif isinstance(reference_output, OrderedDict):
+            reference_output = tuple(reference_output.values())
+
+        for i, out in enumerate(reference_output):
+            if isinstance(out, torch.Tensor):
+                suffix = "" if len(reference_output) == 1 else f"_{i}"
+                path = os.path.join(
+                    artifact_dir, f"{artifact_name}_expected_output{suffix}.bin"
+                )
+                out.contiguous().numpy().tofile(path)
+                logger.info(f"Saved golden output to {path}")
+
     @staticmethod
     def _assert_outputs_equal(
         model_output,
diff --git a/backends/test/suite/conftest.py b/backends/test/suite/conftest.py
index 70a97454c4e..0102de56da1 100644
--- a/backends/test/suite/conftest.py
+++ b/backends/test/suite/conftest.py
@@ -1,3 +1,4 @@
+import os
 from typing import Any
 
 import pytest
@@ -32,6 +33,13 @@ def __init__(self, flow, test_name, test_base_name):
         self._test_base_name = test_base_name
         self._subtest = 0
         self._results = []
+        self._artifact_dir = self._resolve_artifact_dir()
+
+    def _resolve_artifact_dir(self) -> str | None:
+        base = os.environ.get("GOLDEN_ARTIFACTS_DIR")
+        if not base:
+            return None
+        return os.path.join(base, self._flow.name)
 
     def lower_and_run_model(
         self,
@@ -50,6 +58,7 @@ def lower_and_run_model(
             None,
             generate_random_test_inputs=generate_random_test_inputs,
             dynamic_shapes=dynamic_shapes,
+            artifact_dir=self._artifact_dir,
         )
 
         self._subtest += 1
diff --git a/backends/test/suite/runner.py b/backends/test/suite/runner.py
index a6d7d07bce0..e90e951db20 100644
--- a/backends/test/suite/runner.py
+++ b/backends/test/suite/runner.py
@@ -1,6 +1,8 @@
 import argparse
 import hashlib
 import importlib
+import logging
+import os
 import random
 import re
 import time
@@ -92,6 +94,7 @@ def run_test(  # noqa: C901
     params: dict | None,
     dynamic_shapes: Any | None = None,
     generate_random_test_inputs: bool = True,
+    artifact_dir: str | None = None,
 ) -> TestCaseSummary:
     """
     Top-level test run function for a model, input set, and tester. Handles test execution
@@ -201,6 +204,11 @@ def build_result(
         # We can do this if we ever see to_executorch() or serialize() fail due a backend issue.
         return build_result(TestResult.UNKNOWN_FAIL, e)
 
+    artifact_name = None
+    if artifact_dir:
+        base = test_base_name.removeprefix("test_")
+        artifact_name = f"{base}_{subtest_index}" if subtest_index > 0 else base
+
     # TODO We should consider refactoring the tester slightly to return more signal on
     # the cause of a failure in run_method_and_compare_outputs. We can look for
     # AssertionErrors to catch output mismatches, but this might catch more than that.
@@ -210,11 +218,25 @@ def build_result(
             statistics_callback=lambda stats: error_statistics.append(stats),
             atol=1e-1,
             rtol=4e-2,
+            artifact_dir=artifact_dir,
+            artifact_name=artifact_name,
         )
     except AssertionError as e:
         return build_result(TestResult.OUTPUT_MISMATCH_FAIL, e)
     except Exception as e:
         return build_result(TestResult.PTE_RUN_FAIL, e)
+
+        # Dump .pte after successful comparison.
+        if artifact_dir and artifact_name and flow.supports_serialize:
+            logger = logging.getLogger(__name__)
+            try:
+                pte_path = os.path.join(artifact_dir, f"{artifact_name}.pte")
+                tester.stages[StageType.SERIALIZE].dump_artifact(pte_path)
+                logger.info(f"Saved golden .pte to {pte_path}")
+            except Exception:
+                logger.warning(
+                    f"Failed to save .pte for {artifact_name}", exc_info=True
+                )
     else:
         # Skip the test if nothing is delegated
         return build_result(TestResult.SUCCESS_UNDELEGATED)
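
Note on consuming the golden artifacts: `_dump_golden_artifacts` writes each tensor with numpy's `tofile()`, which stores raw bytes only -- no dtype or shape header -- so any consumer must supply both out of band. Below is a minimal sketch of reading a saved tensor back for an offline comparison; the file name, dtype, and shape are hypothetical, while the tolerances mirror the `atol=1e-1` / `rtol=4e-2` used in runner.py above.

```python
# Sketch: load a golden tensor saved by _dump_golden_artifacts and compare
# it against a freshly computed output. The .bin files carry no metadata,
# so dtype and shape are caller-supplied. Path/shape values are examples.
import numpy as np
import torch


def load_golden_tensor(path, dtype, shape):
    # Map the torch dtype to the matching numpy dtype used at save time.
    np_dtype = torch.empty((), dtype=dtype).numpy().dtype
    return torch.from_numpy(np.fromfile(path, dtype=np_dtype)).reshape(shape)


expected = load_golden_tensor(
    "golden-artifacts/xnnpack/add_0_expected_output.bin",  # hypothetical name
    dtype=torch.float32,
    shape=(1, 8),
)
actual = expected.clone()  # stand-in for a real runtime output
torch.testing.assert_close(actual, expected, atol=1e-1, rtol=4e-2)
```

Keeping the files as raw bytes keeps them trivially readable from any language (useful for on-device runners), at the cost of having to carry dtype/shape metadata separately, e.g. derived from the model definition in the test suite.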