From 0022cb7baf2c9a241c7133595d76ff2cafb4963b Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Fri, 22 May 2026 18:36:34 -0700
Subject: [PATCH 1/6] wip commit

---
 .../pytest_plugin/_step_status_capture.py     |  65 ++
 .../pytest_plugin/step_status_states.md       | 126 ++++
 .../_tests/pytest_plugin/test_pass_fail.py    | 621 ++++++++++++++++++
 python/lib/sift_client/pytest_plugin.py       | 158 ++++-
 .../util/test_results/context_manager.py      |  51 +-
 5 files changed, 1001 insertions(+), 20 deletions(-)
 create mode 100644 python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py
 create mode 100644 python/lib/sift_client/_tests/pytest_plugin/step_status_states.md
 create mode 100644 python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py

diff --git a/python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py b/python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py
new file mode 100644
index 000000000..6838fdcc5
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py
@@ -0,0 +1,65 @@
+"""Shared state for the step-status characterization suite.
+
+The outer test in ``test_step_status_states.py`` runs inner pytest sessions
+via ``pytester``. The inner session installs a fake ``sift_client`` (see
+``_INNER_CONFTEST_SRC`` in that file) which records every step status
+write into this module's ``CAPTURED_STEPS`` dict so the outer test can
+assert on what the plugin produced.
+
+This lives in its own module (rather than inside the test file) because
+the inner ``conftest.py`` runs in a fresh pytester tmp dir and needs an
+importable, package-reachable handle to the same dict object.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from sift_client.sift_types.test_report import TestStatus
+
+
+@dataclass
+class CapturedStep:
+    step_id: str
+    name: str
+    step_path: str
+    parent_step_id: str | None
+    statuses: list[TestStatus] = field(default_factory=list)
+
+
+CAPTURED_STEPS: dict[str, CapturedStep] = {}
+
+
+def reset() -> None:
+    CAPTURED_STEPS.clear()
+
+
+def steps_by_name(name: str) -> list[CapturedStep]:
+    return [s for s in CAPTURED_STEPS.values() if s.name == name]
+
+
+def test_step(name: str) -> CapturedStep | None:
+    """The step the autouse ``step`` fixture creates for the test function.
+
+    There can be a deeper step with the same name when the ``makereport``
+    hook also records one (e.g. ``pytest.skip()`` inside the test body, or
+    an ``xfail`` mark). The autouse step is the shallowest of those, so
+    pick by step_path depth.
+    """
+    matches = [s for s in CAPTURED_STEPS.values() if s.name == name]
+    if not matches:
+        return None
+    return min(matches, key=lambda s: s.step_path.count("."))
+
+
+def child_steps(parent: CapturedStep) -> list[CapturedStep]:
+    return [s for s in CAPTURED_STEPS.values() if s.parent_step_id == parent.step_id]
+
+
+def final_status(name: str) -> TestStatus | None:
+    step = test_step(name)
+    if step is None or not step.statuses:
+        return None
+    return step.statuses[-1]
diff --git a/python/lib/sift_client/_tests/pytest_plugin/step_status_states.md b/python/lib/sift_client/_tests/pytest_plugin/step_status_states.md
new file mode 100644
index 000000000..83b64783d
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/step_status_states.md
@@ -0,0 +1,126 @@
+# Pytest-plugin step-status: observed vs. target
+
+Companion document to `test_step_status_states.py`. Each row corresponds to
+one scenario in that suite. The **target** column is the contract the suite
+asserts (sourced from
+[`docs/guides/pytest_plugin/pass_fail_behavior.md`](../../../../docs/guides/pytest_plugin/pass_fail_behavior.md));
+the **observed today** column records what the plugin actually produces
+right now. Every row should be marked `OK`; a `Gap` indicates the plugin has
+regressed against the contract.
+
+`TestStatus` values referenced below come from
+`sift_client.sift_types.test_report.TestStatus`: `PASSED`, `FAILED`, `ERROR`,
+`SKIPPED`, `IN_PROGRESS`. The targets below map every scenario onto these
+existing statuses. An `ABORTED` status for hard process exits (`SystemExit`,
+`KeyboardInterrupt`, signals) is a planned future addition; until it lands
+those cases baseline against `ERROR` or `IN_PROGRESS`. The user-facing
+contract these targets describe is documented in
+[`docs/guides/pytest_plugin/pass_fail_behavior.md`](../../../../docs/guides/pytest_plugin/pass_fail_behavior.md).
+
+## Case ID scheme
+
+Each scenario has a stable case ID of the form `PREFIX-NN`, where the
+prefix names its section. Tests in `test_step_status_states.py` reference
+their case ID in a leading comment so a failing test can be traced back to
+this table without rereading the scenario:
+
+| Prefix  | Section                                  |
+| ------- | ---------------------------------------- |
+| `CALL`  | Call-phase exit paths                    |
+| `SKIP`  | Skip paths                               |
+| `XFAIL` | xfail / xpass                            |
+| `PHASE` | Setup / teardown phases                  |
+| `COLL`  | Collection / fixture-resolution failures |
+| `API`   | Plugin-API exit paths                    |
+
+IDs are stable: a new scenario in a section takes the next free number for
+that prefix; numbers are never reused or shifted when other sections grow.
+
+## Call-phase exit paths
+
+| Case      | Scenario                                | Trigger                                       | Observed today              | Target                                     | Status |
+| --------- | --------------------------------------- | --------------------------------------------- | --------------------------- | ------------------------------------------ | ------ |
+| `CALL-01` | Test passes                             | function body returns cleanly                 | `PASSED`                    | `PASSED`                                   | OK     |
+| `CALL-02` | Assert failure in call phase            | `assert 1 == 2`                               | `FAILED`                    | `FAILED`                                   | OK     |
+| `CALL-03` | Generic exception in call phase         | `raise ValueError("boom")`                    | `ERROR`                     | `ERROR`                                    | OK     |
+| `CALL-04` | `pytest.fail("...")` from body          | `pytest.fail("intentional failure")`          | `FAILED`                    | `FAILED`                                   | OK     |
+| `CALL-05` | `SystemExit` from the test body         | `sys.exit(1)`                                 | `ERROR`                     | `ERROR` (baseline; `ABORTED` planned later) | OK    |
+| `CALL-06` | `KeyboardInterrupt` in body             | `raise KeyboardInterrupt`                     | `IN_PROGRESS` (session aborts before the plugin sees the interrupt) | `ERROR` when the plugin sees the interrupt; a session-aborting interrupt leaves the step in `IN_PROGRESS` | OK |
+
+## Skip paths
+
+| Case      | Scenario                                | Trigger                                       | Observed today                                                              | Target                                                          | Status |
+| --------- | --------------------------------------- | --------------------------------------------- | --------------------------------------------------------------------------- | --------------------------------------------------------------- | ------ |
+| `SKIP-01` | Collection-time skip                    | `@pytest.mark.skip(reason=...)`               | `SKIPPED` (only the makereport hook records a step; no autouse step ran)    | `SKIPPED`                                                       | OK     |
+| `SKIP-02` | Conditional collection-time skip        | `@pytest.mark.skipif(True, reason=...)`       | `SKIPPED` (same route as `@pytest.mark.skip`)                               | `SKIPPED`                                                       | OK     |
+| `SKIP-03` | Runtime skip in body                    | `pytest.skip("...")`                          | Outer step `SKIPPED`; no duplicate nested step                              | Outer step `SKIPPED`; no duplicate nested step                  | OK     |
+| `SKIP-04` | Skip raised inside a fixture            | `@pytest.fixture` calls `pytest.skip("...")`  | Outer step `SKIPPED` (setup-phase skip); no duplicate nested step           | Outer step `SKIPPED` (setup-phase skip); no duplicate nested step | OK   |
+
+## xfail / xpass
+
+| Case       | Scenario                                  | Trigger                                                | Observed today                                                                                  | Target                                                | Status |
+| ---------- | ----------------------------------------- | ------------------------------------------------------ | ----------------------------------------------------------------------------------------------- | ----------------------------------------------------- | ------ |
+| `XFAIL-01` | xfail-marked test that fails              | `@pytest.mark.xfail` + `assert 1 == 2`                 | Outer step `PASSED` (test fulfilled the xfail expectation); no duplicate nested step             | Outer step `PASSED` (test fulfilled the xfail expectation); no duplicate nested step | OK     |
+| `XFAIL-02` | Strict xfail that unexpectedly passes     | `@pytest.mark.xfail(strict=True)` + `assert True`      | Outer step `FAILED` (mark no longer matches reality — either the bug was fixed or the test stopped testing what it claimed) | Outer step `FAILED` (mark no longer matches reality — either the bug was fixed or the test stopped testing what it claimed) | OK     |
+| `XFAIL-03` | Non-strict xfail that unexpectedly passes | `@pytest.mark.xfail()` + `assert True`                 | Outer step `PASSED`                                                                              | Outer step `PASSED` (`strict=False` doesn't insist on the failure) | OK    |
+| `XFAIL-04` | `xfail(raises=...)` with wrong exception  | `@pytest.mark.xfail(raises=ValueError)` + `raise KeyError` | `FAILED` (the `raises=` mismatch is a real test failure)                                     | `FAILED` (the `raises=` mismatch is a real test failure) | OK |
+| `XFAIL-05` | `xfail(run=False)`                        | `@pytest.mark.xfail(run=False)` (body never executed)  | `SKIPPED` (the test never ran)                                                                   | `SKIPPED` (the test never ran)                      | OK    |
+
+## Setup / teardown phases
+
+| Case       | Scenario                                   | Trigger                                                              | Observed today                                                                                                                            | Target                                                  | Status |
+| ---------- | ------------------------------------------ | -------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | ------ |
+| `PHASE-01` | Setup-phase fixture failure (RuntimeError) | `@pytest.fixture` raises before `yield`; test body never runs        | Outer step `ERROR`; the plugin reads the setup-phase report and maps `failed` → `ERROR`                                                   | `ERROR` (a `phase=setup` annotation is a planned follow-up) | OK     |
+| `PHASE-02` | Teardown-phase fixture failure             | `@pytest.fixture` raises after `yield`; test body passed             | Outer step `FAILED`; after teardown the plugin upgrades a passed step when the teardown report shows `failed`                              | `FAILED` (a `phase=teardown` annotation is a planned follow-up) | OK     |
+| `PHASE-03` | Call-phase fail **plus** teardown-phase fail | `assert 1 == 2` in body AND `@pytest.fixture` raises after `yield` | Outer step `FAILED` (the call-phase failure dominates); the teardown error is not yet surfaced separately                                  | `FAILED`; surfacing the teardown error alongside is a planned follow-up | OK |
+
+## Collection / fixture-resolution failures
+
+| Case      | Scenario                                | Trigger                                       | Observed today                                                                                                                                  | Target                                                  | Status |
+| --------- | --------------------------------------- | --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | ------ |
+| `COLL-01` | Missing fixture                         | `def test_x(nonexistent_fixture):`            | Outer step `ERROR` — the missing fixture surfaces as a setup-phase failure, which the plugin now maps to `ERROR`                                  | `ERROR` (a `phase=setup` annotation is a planned follow-up) | OK     |
+
+## Plugin-API exit paths (in-test mutations)
+
+| Case     | Scenario                                | Trigger                                                                | Observed today                                              | Target   | Status |
+| -------- | --------------------------------------- | ---------------------------------------------------------------------- | ----------------------------------------------------------- | -------- | ------ |
+| `API-01` | Manual status override                  | `step.current_step.update({"status": TestStatus.FAILED})`              | `FAILED`                                                    | `FAILED` | OK     |
+| `API-02` | `report_outcome(result=False)`          | `step.report_outcome("the_check", False, "did not match")`             | `FAILED`                                                    | `FAILED` | OK     |
+| `API-03` | `measure(...)` out-of-bounds            | `step.measure(name="m", value=10.0, bounds={"min": 0.0, "max": 5.0})`  | `FAILED`                                                    | `FAILED` | OK     |
+| `API-04` | Failed measurement on a substep         | `with step.substep(...) as s: s.measure(... out-of-bounds)`            | `FAILED` (propagates from substep to parent)                | `FAILED` | OK     |
+| `API-05` | Manually-skipped substep                | `with step.substep(...) as s: s.current_step.update({"status": SKIPPED})` | Parent step `PASSED` (skip does not propagate as a failure) | `PASSED` | OK     |
+
+## Out of scope for this characterization run
+
+- **Timeout** — needs `pytest-timeout` or a manual signal harness. Add as a
+  follow-up once the audit picks a timeout strategy.
+- **Signal (SIGKILL / SIGTERM)** — cannot be caught from inside the process;
+  needs a subprocess-level harness.
+- **`pytest.exit("...")`** — niche; the "aborts subsequent tests" behavior
+  is hard to characterize cleanly because each `pytester` invocation is its
+  own session. Document the expectation alongside `SystemExit`.
+- **`os._exit()`** — bypasses Python cleanup entirely; can't be tested
+  in-process because it would kill the outer pytest run. Document as a
+  guaranteed data-loss case alongside `SystemExit` / `SIGKILL`.
+- **Parametrize-level marks** (`pytest.param(..., marks=pytest.mark.xfail / skip)`)
+  — routes through a different selection path but produces the same
+  `report.outcome`, so behavior should match the function-level marks
+  already covered above. Add only if the plugin's eventual phase-aware
+  handler diverges between the two.
+- **Import error / syntax error / `conftest.py` error** — these fail
+  collection entirely; no `item` is produced and no plugin hook fires.
+  Document explicitly that no Sift step is recorded.
+- **No-data / indeterminate** — tracked separately as part of the sibling
+  status-semantics work.
+
+## How to refresh this table
+
+Run the suite locally:
+
+```
+pytest lib/sift_client/_tests/util/test_step_status_states.py -v
+```
+
+Every row should be `OK`. If a row regresses to `Gap`, the matching test
+fails; update the **Observed today** column here to describe the
+regression and flip the row's status to **Gap** until the plugin is fixed.
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py b/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py
new file mode 100644
index 000000000..c5027fb41
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py
@@ -0,0 +1,621 @@
+"""Contract suite: maps each pytest exit path to the ``TestStatus`` the
+Sift pytest plugin is required to record on the outer step.
+
+Each scenario writes a tiny inner test file and runs it through pytester
+with a fake ``sift_client`` injected via a generated conftest. The fake
+records every step status write into ``_step_status_capture.CAPTURED_STEPS``
+so this outer test can assert on what the plugin produced.
+
+Assertions encode the contract from
+``docs/guides/pytest_plugin/pass_fail_behavior.md``. Tests for scenarios the
+plugin does not yet handle correctly are expected to **fail today** — they
+are the punch list. ``lib/sift_client/_tests/pytest_plugin/step_status_states.md``
+tracks each scenario's observed-today behavior next to the target so the
+remaining gaps are visible without running the suite.
+"""
+
+from __future__ import annotations
+
+import textwrap
+
+import pytest
+
+from sift_client._tests.pytest_plugin import _step_status_capture as capture
+from sift_client.sift_types.test_report import TestStatus
+
+pytest_plugins = ["pytester"]
+
+
+_INNER_CONFTEST_SRC = '''
+"""Auto-generated conftest for the step-status characterization suite.
+
+Installs the Sift pytest plugin and a fake ``sift_client`` that records
+every step status write into the outer test's CAPTURED_STEPS dict.
+"""
+
+from __future__ import annotations
+
+import uuid
+
+import pytest
+
+# Bring the Sift fixtures + the makereport hook into this inner session.
+pytest_plugins = ["sift_client.pytest_plugin"]
+
+from sift_client._tests.pytest_plugin._step_status_capture import CAPTURED_STEPS, CapturedStep
+from sift_client.sift_types.test_report import (
+    TestMeasurement,
+    TestReport,
+    TestStep,
+)
+
+
+
+class _FakeTestResults:
+    def __init__(self, client):
+        self._client = client
+
+    def create(self, test_report, log_file=None):
+        report = TestReport(
+            id_=str(uuid.uuid4()),
+            status=test_report.status,
+            name=test_report.name,
+            test_system_name=test_report.test_system_name,
+            test_case=test_report.test_case,
+            start_time=test_report.start_time,
+            end_time=test_report.end_time,
+            metadata=test_report.metadata or {},
+            is_archived=False,
+        )
+        report._apply_client_to_instance(self._client)
+        return report
+
+    def update(self, test_report, update, log_file=None):
+        return test_report
+
+    def create_step(self, test_step, log_file=None):
+        step_id = str(uuid.uuid4())
+        CAPTURED_STEPS[step_id] = CapturedStep(
+            step_id=step_id,
+            name=test_step.name,
+            step_path=test_step.step_path,
+            parent_step_id=test_step.parent_step_id,
+            statuses=[test_step.status],
+        )
+        step = TestStep(
+            id_=step_id,
+            test_report_id=test_step.test_report_id,
+            parent_step_id=test_step.parent_step_id,
+            name=test_step.name,
+            description=test_step.description,
+            step_type=test_step.step_type,
+            step_path=test_step.step_path,
+            status=test_step.status,
+            start_time=test_step.start_time,
+            end_time=test_step.end_time,
+            error_info=test_step.error_info,
+        )
+        step._apply_client_to_instance(self._client)
+        return step
+
+    def update_step(self, test_step, update, log_file=None):
+        new_status = (
+            update.get("status") if isinstance(update, dict) else update.status
+        )
+        if test_step.id_ in CAPTURED_STEPS and new_status is not None:
+            CAPTURED_STEPS[test_step.id_].statuses.append(new_status)
+        merged_status = new_status if new_status is not None else test_step.status
+        updated = TestStep(
+            id_=test_step.id_,
+            test_report_id=test_step.test_report_id,
+            parent_step_id=test_step.parent_step_id,
+            name=test_step.name,
+            description=test_step.description,
+            step_type=test_step.step_type,
+            step_path=test_step.step_path,
+            status=merged_status,
+            start_time=test_step.start_time,
+            end_time=test_step.end_time,
+            error_info=test_step.error_info,
+        )
+        updated._apply_client_to_instance(self._client)
+        return updated
+
+    def create_measurement(self, test_measurement, update_step=False, log_file=None):
+        measurement = TestMeasurement(
+            id_=str(uuid.uuid4()),
+            measurement_type=test_measurement.measurement_type,
+            name=test_measurement.name,
+            test_step_id=test_measurement.test_step_id,
+            numeric_value=test_measurement.numeric_value,
+            string_value=test_measurement.string_value,
+            boolean_value=test_measurement.boolean_value,
+            unit=test_measurement.unit,
+            numeric_bounds=test_measurement.numeric_bounds,
+            string_expected_value=test_measurement.string_expected_value,
+            passed=test_measurement.passed,
+            timestamp=test_measurement.timestamp,
+        )
+        measurement._apply_client_to_instance(self._client)
+        return measurement
+
+
+class _FakePing:
+    def ping(self):
+        return None
+
+
+class _FakeSiftClient:
+    def __init__(self):
+        self.test_results = _FakeTestResults(self)
+        self.ping = _FakePing()
+
+
+@pytest.fixture(scope="session")
+def sift_client():
+    return _FakeSiftClient()
+'''
+
+
+_RUN_ARGS = (
+    "--sift-test-results-log-file=false",
+    "--no-sift-test-results-git-metadata",
+)
+
+
+@pytest.fixture
+def inner(pytester):
+    """Reset the capture state and install the inner conftest. Returns ``pytester``."""
+    capture.reset()
+    pytester.makeconftest(_INNER_CONFTEST_SRC)
+    return pytester
+
+
+def _run(pytester, body: str) -> None:
+    pytester.makepyfile(textwrap.dedent(body))
+    pytester.runpytest_inprocess(*_RUN_ARGS)
+
+
+# ---------------------------------------------------------------------------
+# Call-phase exit paths
+# ---------------------------------------------------------------------------
+
+
+def test_pass_maps_to_passed(inner):
+    # Case: CALL-01
+    _run(
+        inner,
+        """
+        def test_x():
+            assert True
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.PASSED
+
+
+def test_assert_failure_maps_to_failed(inner):
+    # Case: CALL-02
+    _run(
+        inner,
+        """
+        def test_x():
+            assert 1 == 2
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.FAILED
+
+
+def test_generic_exception_maps_to_error(inner):
+    # Case: CALL-03
+    _run(
+        inner,
+        """
+        def test_x():
+            raise ValueError("boom")
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.ERROR
+
+
+def test_system_exit_maps_to_error(inner):
+    # Case: CALL-05
+    _run(
+        inner,
+        """
+        import sys
+        def test_x():
+            sys.exit(1)
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.ERROR
+
+
+def test_pytest_fail_maps_to_failed(inner):
+    # Case: CALL-04
+    _run(
+        inner,
+        """
+        import pytest
+        def test_x():
+            pytest.fail("intentional failure")
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.FAILED
+
+
+def test_keyboard_interrupt_leaves_step_in_progress(inner):
+    # Case: CALL-06
+    # KeyboardInterrupt aborts the session before the call-phase makereport
+    # fires; the plugin can't observe the interrupt. The contract is that
+    # the step is left in IN_PROGRESS rather than being silently resolved
+    # to PASSED — a session-aborting interrupt should not look like a clean
+    # pass in the report.
+    try:
+        _run(
+            inner,
+            """
+            def test_x():
+                raise KeyboardInterrupt
+            """,
+        )
+    except KeyboardInterrupt:
+        pass
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.IN_PROGRESS
+
+
+# ---------------------------------------------------------------------------
+# Skip paths
+# ---------------------------------------------------------------------------
+
+
+def test_pytest_skip_in_body_maps_to_skipped(inner):
+    # Case: SKIP-03
+    _run(
+        inner,
+        """
+        import pytest
+        def test_x():
+            pytest.skip("not today")
+        """,
+    )
+    # Runtime skip in the body resolves the outer step to SKIPPED. The
+    # makereport hook must not create a duplicate nested step.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.SKIPPED
+    duplicates = [s for s in capture.steps_by_name("test_x") if s is not outer]
+    assert not duplicates, f"expected no duplicate nested step; got {len(duplicates)}"
+
+
+def test_pytest_mark_skip_records_skipped(inner):
+    # Case: SKIP-01
+    _run(
+        inner,
+        """
+        import pytest
+        @pytest.mark.skip(reason="collection-time skip")
+        def test_x():
+            assert False
+        """,
+    )
+    # Collection-time skip: the autouse step fixture never runs. Only the
+    # makereport hook creates a step, with status SKIPPED.
+    assert capture.final_status("test_x") == TestStatus.SKIPPED
+
+
+def test_pytest_mark_skipif_records_skipped(inner):
+    # Case: SKIP-02
+    _run(
+        inner,
+        """
+        import pytest
+        @pytest.mark.skipif(True, reason="conditional skip")
+        def test_x():
+            assert False
+        """,
+    )
+    # `skipif` with a truthy condition follows the same path as
+    # `@pytest.mark.skip`; only the makereport hook records a step.
+    assert capture.final_status("test_x") == TestStatus.SKIPPED
+
+
+def test_skip_inside_fixture_setup(inner):
+    # Case: SKIP-04
+    _run(
+        inner,
+        """
+        import pytest
+
+        @pytest.fixture
+        def skipping_fixture():
+            pytest.skip("environment not ready")
+
+        def test_x(skipping_fixture):
+            assert True
+        """,
+    )
+    # A setup-phase skip resolves the outer step to SKIPPED. The makereport
+    # hook must not create a duplicate nested step.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.SKIPPED
+    duplicates = [s for s in capture.steps_by_name("test_x") if s is not outer]
+    assert not duplicates, f"expected no duplicate nested step; got {len(duplicates)}"
+
+
+# ---------------------------------------------------------------------------
+# xfail / xpass
+# ---------------------------------------------------------------------------
+
+
+def test_xfail_marked_test_that_fails(inner):
+    # Case: XFAIL-01
+    _run(
+        inner,
+        """
+        import pytest
+        @pytest.mark.xfail(reason="known issue")
+        def test_x():
+            assert 1 == 2
+        """,
+    )
+    # xfail + expected failure fulfills the contract; outer step resolves to
+    # PASSED. No duplicate nested step from the makereport hook.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.PASSED
+    duplicates = [s for s in capture.steps_by_name("test_x") if s is not outer]
+    assert not duplicates, f"expected no duplicate nested step; got {len(duplicates)}"
+
+
+def test_xfail_strict_unexpected_pass(inner):
+    # Case: XFAIL-02
+    _run(
+        inner,
+        """
+        import pytest
+        @pytest.mark.xfail(strict=True, reason="should fail")
+        def test_x():
+            assert True
+        """,
+    )
+    # strict xfail that passes must surface as FAILED: either the bug was
+    # fixed (remove the mark) or the test stopped exercising what it claimed.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.FAILED
+
+
+def test_xfail_non_strict_unexpected_pass(inner):
+    # Case: XFAIL-03
+    _run(
+        inner,
+        """
+        import pytest
+        @pytest.mark.xfail(reason="might pass sometimes")
+        def test_x():
+            assert True
+        """,
+    )
+    # Non-strict xfail does not insist on the failure, so a passing run is
+    # PASSED.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.PASSED
+
+
+def test_xfail_raises_mismatch(inner):
+    # Case: XFAIL-04
+    _run(
+        inner,
+        """
+        import pytest
+        @pytest.mark.xfail(raises=ValueError, reason="expected ValueError")
+        def test_x():
+            raise KeyError("wrong exception")
+        """,
+    )
+    # `raises=` mismatch is a real test failure — the contract required a
+    # specific exception type and a different one was thrown.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.FAILED
+
+
+def test_xfail_run_false(inner):
+    # Case: XFAIL-05
+    _run(
+        inner,
+        """
+        import pytest
+        @pytest.mark.xfail(run=False, reason="never run")
+        def test_x():
+            assert False
+        """,
+    )
+    # The test never ran; outer step is SKIPPED.
+    assert capture.final_status("test_x") == TestStatus.SKIPPED
+
+
+# ---------------------------------------------------------------------------
+# Setup-phase / teardown-phase fixture failures
+# ---------------------------------------------------------------------------
+
+
+def test_setup_phase_fixture_failure(inner):
+    # Case: PHASE-01
+    _run(
+        inner,
+        """
+        import pytest
+
+        @pytest.fixture
+        def bad_setup():
+            raise RuntimeError("setup boom")
+
+        def test_x(bad_setup):
+            assert True
+        """,
+    )
+    # A fixture that raises before `yield` fails the setup phase. The outer
+    # step must surface this as ERROR; the test body never executed and a
+    # silently green step would hide the failure.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.ERROR
+
+
+def test_teardown_phase_fixture_failure(inner):
+    # Case: PHASE-02
+    _run(
+        inner,
+        """
+        import pytest
+
+        @pytest.fixture
+        def bad_teardown():
+            yield
+            raise RuntimeError("teardown boom")
+
+        def test_x(bad_teardown):
+            assert True
+        """,
+    )
+    # A fixture that raises after `yield` fails the teardown phase. The
+    # outer step's status reflects the teardown failure as FAILED rather
+    # than the call-phase pass.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.FAILED
+
+
+def test_call_fail_plus_teardown_fail(inner):
+    # Case: PHASE-03
+    _run(
+        inner,
+        """
+        import pytest
+
+        @pytest.fixture
+        def bad_teardown():
+            yield
+            raise RuntimeError("teardown boom")
+
+        def test_x(bad_teardown):
+            assert 1 == 2
+        """,
+    )
+    # Call-phase failure dominates the outer step status; the contract also
+    # requires the teardown error to be surfaced somewhere on the step
+    # (mechanism TBD — see pass_fail_behavior.md). This test asserts the
+    # status today; tighten once a surfacing mechanism is chosen.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.FAILED
+
+
+# ---------------------------------------------------------------------------
+# Collection-phase failures
+# ---------------------------------------------------------------------------
+
+
+def test_missing_fixture_maps_to_error(inner):
+    # Case: COLL-01
+    _run(
+        inner,
+        """
+        def test_x(nonexistent_fixture):
+            assert True
+        """,
+    )
+    # An unresolved fixture is a setup-phase failure. The outer step
+    # surfaces as ERROR rather than a misleading green pass for a test
+    # that never executed.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.ERROR
+
+
+# ---------------------------------------------------------------------------
+# Plugin-API exit paths (in-test mutations)
+# ---------------------------------------------------------------------------
+
+
+def test_manual_status_update_to_failed(inner):
+    # Case: API-01
+    _run(
+        inner,
+        """
+        from sift_client.sift_types.test_report import TestStatus
+        def test_x(step):
+            step.current_step.update({"status": TestStatus.FAILED})
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.FAILED
+
+
+def test_report_outcome_false_maps_to_failed(inner):
+    # Case: API-02
+    _run(
+        inner,
+        """
+        def test_x(step):
+            step.report_outcome("the_check", False, "did not match")
+        """,
+    )
+    # Outer step sees a failed substep and rolls up to FAILED.
+    assert capture.final_status("test_x") == TestStatus.FAILED
+
+
+def test_measure_out_of_bounds_maps_to_failed(inner):
+    # Case: API-03
+    _run(
+        inner,
+        """
+        def test_x(step):
+            step.measure(name="m", value=10.0, bounds={"min": 0.0, "max": 5.0})
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.FAILED
+
+
+def test_substep_failure_propagates_to_parent(inner):
+    # Case: API-04
+    _run(
+        inner,
+        """
+        def test_x(step):
+            with step.substep(name="inner") as inner_step:
+                inner_step.measure(name="m", value=10.0, bounds={"min": 0.0, "max": 5.0})
+        """,
+    )
+    # `test_measure_out_of_bounds_maps_to_failed` exercises a failed
+    # measurement on the function step itself; this one verifies the same
+    # failure on a nested substep propagates up to the parent.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.FAILED
+
+
+def test_skipped_substep_does_not_fail_parent(inner):
+    # Case: API-05
+    _run(
+        inner,
+        """
+        from sift_client.sift_types.test_report import TestStatus
+        def test_x(step):
+            with step.substep(name="optional_check") as cal:
+                cal.current_step.update(
+                    {"status": TestStatus.SKIPPED},
+                    log_file=step.report_context.log_file,
+                )
+        """,
+    )
+    # A manually-resolved SKIPPED on a substep must not propagate as a failure
+    # to the parent. The outer step has no measurements of its own and resolves
+    # to PASSED.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.PASSED
diff --git a/python/lib/sift_client/pytest_plugin.py b/python/lib/sift_client/pytest_plugin.py
index 7c4c1c2f5..07704383d 100644
--- a/python/lib/sift_client/pytest_plugin.py
+++ b/python/lib/sift_client/pytest_plugin.py
@@ -5,14 +5,16 @@
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from pathlib import Path
+from types import SimpleNamespace
 from typing import TYPE_CHECKING, Any, Generator, Tuple
 
 import pytest
 
 from sift_client import SiftClient, SiftConnectionConfig
 from sift_client.errors import SiftWarning
-from sift_client.sift_types.test_report import TestStatus
+from sift_client.sift_types.test_report import ErrorInfo, TestStatus
 from sift_client.util.test_results import ReportContext
+from sift_client.util.test_results.context_manager import format_truncated_traceback
 
 
 class SiftPytestPluginWarning(SiftWarning):
@@ -508,17 +510,151 @@ def _resolve_log_file(pytestconfig: pytest.Config | None) -> str | Path | bool |
     return Path(raw)
 
 
+def _error_info_from_longrepr(longrepr: Any) -> ErrorInfo:
+    """Fall back to the report's longrepr when no Python exception is available."""
+    return ErrorInfo(error_code=1, error_message=str(longrepr) if longrepr is not None else "")
+
+
+def _resolve_initial_status(new_step: NewStep, item: pytest.Item) -> None:
+    """Resolve the function step's status from pytest's per-phase reports.
+
+    Reads ``_sift_phase_setup`` / ``_sift_phase_call`` and the test's xfail marker,
+    then mutates ``new_step.current_step`` in place and flips
+    ``new_step._sift_managed_externally`` so ``NewStep.__exit__`` emits the
+    resolved status without re-classifying.
+
+    When the call phase reports ``passed`` and no override is needed (i.e. the
+    test's own status or substep failures should drive the result), this leaves
+    the step alone so the default ``__exit__`` resolution stays in charge.
+    """
+    setup_phase = getattr(item, "_sift_phase_setup", None)
+    call_phase = getattr(item, "_sift_phase_call", None)
+    xfail_marker = item.get_closest_marker("xfail")
+    xfail_runs = xfail_marker.kwargs.get("run", True) if xfail_marker is not None else True
+
+    status: TestStatus | None = None
+    error_info: ErrorInfo | None = None
+    keep_managed = False
+
+    if setup_phase is not None and setup_phase.report.outcome == "failed":
+        status = TestStatus.ERROR
+        excinfo = setup_phase.call.excinfo
+        if excinfo is not None:
+            error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
+        else:
+            error_info = _error_info_from_longrepr(setup_phase.report.longrepr)
+    elif setup_phase is not None and setup_phase.report.outcome == "skipped":
+        status = TestStatus.SKIPPED
+    elif call_phase is None:
+        # Setup completed but the call-phase report never fired — the inner
+        # pytester session was aborted (e.g. by KeyboardInterrupt) before the
+        # plugin could observe the outcome. Leave the step at IN_PROGRESS so
+        # the report does not lie about a clean pass.
+        keep_managed = True
+    else:
+        wasxfail = getattr(call_phase.report, "wasxfail", None)
+        if wasxfail is not None:
+            if call_phase.report.outcome == "failed":
+                # Strict xpass: pytest synthesizes a failure when an xfail(strict=True)
+                # test unexpectedly passes. The xfail mark no longer matches reality.
+                status = TestStatus.FAILED
+            elif call_phase.report.outcome == "skipped":
+                if xfail_marker is not None and xfail_runs is False:
+                    # xfail(run=False): the test body never executed.
+                    status = TestStatus.SKIPPED
+                else:
+                    # xfail + expected failure: the test fulfilled its xfail expectation.
+                    status = TestStatus.PASSED
+            else:
+                # Non-strict xpass: passes that weren't required to fail.
+                status = TestStatus.PASSED
+        elif call_phase.report.outcome == "passed":
+            # Default __exit__ resolves PASSED/FAILED from open_step_results and any
+            # status the test code may have set. Don't override it here.
+            return
+        elif call_phase.report.outcome == "skipped":
+            status = TestStatus.SKIPPED
+        elif call_phase.report.outcome == "failed":
+            excinfo = call_phase.call.excinfo
+            if excinfo is None:
+                status = TestStatus.FAILED
+            elif isinstance(excinfo.value, AssertionError):
+                status = TestStatus.FAILED
+            elif isinstance(excinfo.value, pytest.fail.Exception):
+                status = TestStatus.FAILED
+            elif isinstance(excinfo.value, KeyboardInterrupt):
+                # Hold status at IN_PROGRESS; a session-aborting interrupt should
+                # not look like a clean pass. Keep the managed flag so the default
+                # exit path does not coerce IN_PROGRESS to PASSED.
+                keep_managed = True
+            elif isinstance(excinfo.value, SystemExit):
+                status = TestStatus.ERROR
+                error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
+            elif xfail_marker is not None:
+                # xfail(raises=X) with a non-matching exception: the contract failed.
+                status = TestStatus.FAILED
+                error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
+            else:
+                status = TestStatus.ERROR
+                error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
+
+    if status is None and not keep_managed:
+        return
+
+    assert new_step.current_step is not None
+    if status is not None:
+        # BaseType is frozen; mutate via __dict__ the same way _apply_client_to_instance does.
+        new_step.current_step.__dict__["status"] = status
+        if error_info is not None:
+            new_step.current_step.__dict__["error_info"] = error_info
+    new_step._sift_managed_externally = True
+
+
+def _finalize_after_teardown(item: pytest.Item, teardown_report: pytest.TestReport) -> None:
+    """Upgrade a closed step to FAILED when the teardown phase failed.
+
+    The autouse step fixture has already exited by the time the teardown
+    makereport hook fires, so call ``step.update`` again to override the status
+    server-side and propagate the failure to the still-open parent step.
+    """
+    step: NewStep | None = getattr(item, "_sift_step", None)
+    if step is None:
+        return
+    assert step.current_step is not None
+    if teardown_report.outcome == "failed" and step.current_step.status == TestStatus.PASSED:
+        step.current_step.update({"status": TestStatus.FAILED})
+        step.report_context.mark_step_failed_after_close(step.current_step)
+
+
 @pytest.hookimpl(tryfirst=True, hookwrapper=True)
 def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo[Any]):
-    """Capture pytest outcomes so assertion failures and skips land on the Sift step."""
+    """Capture per-phase reports and finalize step status after teardown.
+
+    Stashes both ``rep_<when>`` (the ``CallInfo``, kept for pytest plugins that
+    expect that conventional attribute) and ``_sift_phase_<when>`` (a
+    ``SimpleNamespace(call, report)`` used by ``_resolve_initial_status``). The
+    collection-time skip path is strictly gated on ``_sift_step`` being unset
+    so it does not duplicate steps the fixture already created.
+    """
     outcome = yield
     report = outcome.get_result()
-    if report.outcome == "skipped":
-        # Skipped tests bypass the autouse `step` fixture, so we record the step manually here.
-        if REPORT_CONTEXT:
-            with REPORT_CONTEXT.new_step(name=item.name) as new_step:
-                new_step.current_step.update({"status": TestStatus.SKIPPED})
     setattr(item, "rep_" + report.when, call)
+    setattr(item, "_sift_phase_" + report.when, SimpleNamespace(call=call, report=report))
+
+    # Collection-time skip (``@pytest.mark.skip`` / ``skipif``): the autouse
+    # ``step`` fixture never runs, so the hook is the only place that can
+    # record a step. Presence of ``_sift_step`` is the "fixture ran" signal.
+    if (
+        REPORT_CONTEXT
+        and report.when == "setup"
+        and report.outcome == "skipped"
+        and getattr(item, "_sift_step", None) is None
+    ):
+        with REPORT_CONTEXT.new_step(name=item.name) as inline_step:
+            inline_step.current_step.update({"status": TestStatus.SKIPPED})
+
+    if report.when == "teardown":
+        _finalize_after_teardown(item, report)
 
 
 def _report_context_impl(
@@ -748,13 +884,9 @@ def _step_impl(
     with report_context.new_step(
         name=name, description=existing_docstring, assertion_as_fail_not_error=False
     ) as new_step:
+        node._sift_step = new_step
         yield new_step
-        if hasattr(node, "rep_call") and node.rep_call.excinfo:
-            new_step.update_step_from_result(
-                node.rep_call.excinfo,
-                node.rep_call.excinfo.value,
-                node.rep_call.excinfo.tb,
-            )
+        _resolve_initial_status(new_step, node)
 
 
 @pytest.fixture(autouse=True)
diff --git a/python/lib/sift_client/util/test_results/context_manager.py b/python/lib/sift_client/util/test_results/context_manager.py
index bd2ec917f..0f6097abc 100644
--- a/python/lib/sift_client/util/test_results/context_manager.py
+++ b/python/lib/sift_client/util/test_results/context_manager.py
@@ -43,6 +43,17 @@
 logger = logging.getLogger(__name__)
 
 
+def format_truncated_traceback(
+    exc: type[BaseException] | None,
+    exc_value: BaseException | None,
+    tb: object | None,
+) -> ErrorInfo:
+    """Format an ErrorInfo from a traceback, keeping the first frame and the last 10."""
+    stack = traceback.format_exception(exc, exc_value, tb)  # type: ignore[arg-type]
+    stack = [stack[0], *stack[-10:]] if len(stack) > 10 else stack
+    return ErrorInfo(error_code=1, error_message="".join(stack))
+
+
 def log_replay_instructions(log_file: str | Path | None) -> None:
     """Surface replay instructions when an import/replay attempt fails.
 
@@ -363,6 +374,17 @@ def record_step_outcome(self, outcome: bool, step: TestStep):
             self.open_step_results[step.step_path] = False
             self.any_failures = True
 
+    def mark_step_failed_after_close(self, step: TestStep):
+        """Mark a step's parent as failed after the step has already been popped from the stack.
+
+        Used by the pytest plugin when a teardown-phase report fires after the
+        fixture's ``__exit__`` has already resolved and exited the step.
+        """
+        self.any_failures = True
+        path_parts = step.step_path.split(".")
+        if len(path_parts) > 1:
+            self.open_step_results[".".join(path_parts[:-1])] = False
+
     def resolve_and_propagate_step_result(
         self,
         step: TestStep,
@@ -478,13 +500,7 @@ def update_step_from_result(
                 # If we're not showing assertion errors (i.e. pytest), mark step as failed but don't set error info.
                 self.report_context.record_step_outcome(False, self.current_step)
             else:
-                stack = traceback.format_exception(exc, exc_value, tb)  # type: ignore
-                stack = [stack[0], *stack[-10:]] if len(stack) > 10 else stack
-                trace = "".join(stack)
-                error_info = ErrorInfo(
-                    error_code=1,
-                    error_message=trace,
-                )
+                error_info = format_truncated_traceback(exc, exc_value, tb)
 
         # Resolve the status of this step (i.e. fail if children failed) and propagate the result to the parent step.
         result = self.report_context.resolve_and_propagate_step_result(
@@ -509,6 +525,27 @@ def update_step_from_result(
         return result
 
     def __exit__(self, exc, exc_value, tb):
+        if getattr(self, "_sift_managed_externally", False):
+            # The pytest fixture already resolved status from phase reports.
+            # Run the standard propagation so the parent step sees this step's
+            # pass/fail, emit one update_step with the resolved values, and pop
+            # from the stack without re-classifying.
+            assert self.current_step is not None
+            result = self.report_context.resolve_and_propagate_step_result(
+                self.current_step, self.current_step.error_info
+            )
+            self.current_step.update(
+                {
+                    "status": self.current_step.status,
+                    "end_time": datetime.now(timezone.utc),
+                    "error_info": self.current_step.error_info,
+                },
+            )
+            self.report_context.exit_step(self.current_step)
+            if hasattr(self, "force_result"):
+                result = self.force_result
+            return result
+
         result = self.update_step_from_result(exc, exc_value, tb)
 
         # Now that the step is updated. Let the report context handle removing it from the stack and updating the report context.

From 212bf77acb4a471f153dcf6474c959b1952fd44d Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Fri, 22 May 2026 18:55:12 -0700
Subject: [PATCH 2/6] clean up

---
 .../lib/sift_client/_tests/pytest_plugin/test_pass_fail.py  | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py b/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py
index c5027fb41..a05d18982 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py
@@ -146,6 +146,8 @@ def ping(self):
 
 
 class _FakeSiftClient:
+    _simulate = False
+
     def __init__(self):
         self.test_results = _FakeTestResults(self)
         self.ping = _FakePing()
@@ -158,8 +160,8 @@ def sift_client():
 
 
 _RUN_ARGS = (
-    "--sift-test-results-log-file=false",
-    "--no-sift-test-results-git-metadata",
+    "--sift-log-file=false",
+    "--no-sift-git-metadata",
 )
 
 

From 6309d98893e7cd8e1aea4549b98a4d0bb41de12b Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Fri, 22 May 2026 20:22:06 -0700
Subject: [PATCH 3/6] clean up to properly use jsonl that we already have for
 tests

---
 .../_tests/pytest_plugin/_fakes.py            | 132 ------------
 .../pytest_plugin/_step_status_capture.py     | 129 +++++++++---
 .../pytest_plugin/step_status_states.md       | 148 ++++++-------
 .../_tests/pytest_plugin/test_hierarchy.py    | 172 +++++++---------
 .../_tests/pytest_plugin/test_pass_fail.py    | 194 +++++-------------
 python/lib/sift_client/pytest_plugin.py       |  12 +-
 .../util/test_results/context_manager.py      |  78 +++----
 7 files changed, 343 insertions(+), 522 deletions(-)
 delete mode 100644 python/lib/sift_client/_tests/pytest_plugin/_fakes.py

diff --git a/python/lib/sift_client/_tests/pytest_plugin/_fakes.py b/python/lib/sift_client/_tests/pytest_plugin/_fakes.py
deleted file mode 100644
index 460100daa..000000000
--- a/python/lib/sift_client/_tests/pytest_plugin/_fakes.py
+++ /dev/null
@@ -1,132 +0,0 @@
-"""Test doubles for the pytester-driven pytest-plugin tests.
-
-The fake ``ReportContext`` is a drop-in for the real one that records every
-step creation to a JSON file at session exit. Used by ``test_parametrize.py``
-to assert the step tree produced by an inner pytester pytest run.
-"""
-
-from __future__ import annotations
-
-import itertools
-import json
-from typing import TYPE_CHECKING, Any
-from unittest.mock import MagicMock
-
-if TYPE_CHECKING:
-    from pathlib import Path
-
-
-class FakeStep:
-    def __init__(self, id_: str, name: str, parent_step_id: str | None, step_path: str) -> None:
-        self.id_ = id_
-        self.name = name
-        self.parent_step_id = parent_step_id
-        self.step_path = step_path
-        self.status: Any = None
-        self.description: Any = None
-        self.error_info: Any = None
-
-    def update(self, fields: dict[str, Any]) -> None:
-        for k, v in fields.items():
-            setattr(self, k, v)
-
-
-class FakeReport:
-    def __init__(self) -> None:
-        self.id_ = "report-id"
-
-    def update(self, fields: dict[str, Any]) -> None:
-        pass
-
-
-class FakeReportContext:
-    def __init__(self, steps_file: Path) -> None:
-        self.steps_file = steps_file
-        self.report = FakeReport()
-        self.client = MagicMock()
-        self.step_stack: list[FakeStep] = []
-        self.step_number_at_depth: dict[int, int] = {}
-        self.open_step_results: dict[str, bool] = {}
-        self.any_failures = False
-        self.log_file: Path | None = None
-        self.steps: list[dict[str, Any]] = []
-        self._ids = itertools.count(1)
-
-    def __enter__(self) -> FakeReportContext:
-        return self
-
-    def __exit__(self, *_: Any) -> None:
-        self.steps_file.write_text(json.dumps(self.steps))
-
-    def new_step(
-        self,
-        name: str,
-        description: str | None = None,
-        assertion_as_fail_not_error: bool = True,
-        metadata: dict[str, Any] | None = None,
-    ) -> Any:
-        # Reuse the real NewStep machinery — it talks to this fake via the
-        # methods below.
-        from sift_client.util.test_results.context_manager import NewStep
-
-        return NewStep(
-            self,  # type: ignore[arg-type]
-            name=name,
-            description=description,
-            assertion_as_fail_not_error=assertion_as_fail_not_error,
-            metadata=metadata,
-        )
-
-    def get_next_step_path(self) -> str:
-        top = self.step_stack[-1] if self.step_stack else None
-        path = top.step_path if top else ""
-        next_n = self.step_number_at_depth.get(len(self.step_stack), 0) + 1
-        prefix = f"{path}." if path else ""
-        return f"{prefix}{next_n}"
-
-    def create_step(
-        self,
-        name: str,
-        description: str | None = None,
-        metadata: dict[str, Any] | None = None,
-    ) -> FakeStep:
-        step_path = self.get_next_step_path()
-        parent = self.step_stack[-1] if self.step_stack else None
-        step = FakeStep(
-            id_=f"step-{next(self._ids)}",
-            name=name,
-            parent_step_id=parent.id_ if parent else None,
-            step_path=step_path,
-        )
-        self.step_number_at_depth[len(self.step_stack)] = (
-            self.step_number_at_depth.get(len(self.step_stack), 0) + 1
-        )
-        self.step_stack.append(step)
-        self.open_step_results[step.step_path] = True
-        self.steps.append(
-            {
-                "id": step.id_,
-                "name": name,
-                "parent_step_id": step.parent_step_id,
-                "step_path": step_path,
-            }
-        )
-        return step
-
-    def record_step_outcome(self, outcome: bool, step: FakeStep) -> None:
-        if not outcome:
-            self.open_step_results[step.step_path] = False
-            self.any_failures = True
-
-    def resolve_and_propagate_step_result(self, step: FakeStep, error_info: Any = None) -> bool:
-        result = self.open_step_results.get(step.step_path, True)
-        if error_info:
-            result = False
-        return result
-
-    def exit_step(self, step: FakeStep) -> None:
-        self.step_number_at_depth[len(self.step_stack)] = 0
-        stack_top = self.step_stack.pop()
-        self.open_step_results.pop(step.step_path)
-        if stack_top.id_ != step.id_:
-            raise ValueError("popped step was not the top of the stack")
diff --git a/python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py b/python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py
index 6838fdcc5..dd8e8c05a 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py
@@ -1,23 +1,20 @@
-"""Shared state for the step-status characterization suite.
+"""Read step status sequences from a Sift offline-mode log file.
 
-The outer test in ``test_step_status_states.py`` runs inner pytest sessions
-via ``pytester``. The inner session installs a fake ``sift_client`` (see
-``_INNER_CONFTEST_SRC`` in that file) which records every step status
-write into this module's ``CAPTURED_STEPS`` dict so the outer test can
-assert on what the plugin produced.
-
-This lives in its own module (rather than inside the test file) because
-the inner ``conftest.py`` runs in a fresh pytester tmp dir and needs an
-importable, package-reachable handle to the same dict object.
+The contract suite drives each scenario through an inner pytester session
+run with ``--sift-offline``, which causes the real plugin + ``ReportContext``
+to write every test-result API call to a JSONL log. This module parses
+that log into a per-step status timeline that ``test_pass_fail.py`` asserts
+against, with no test-only ``ReportContext`` fake required.
 """
 
 from __future__ import annotations
 
+import json
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING
+from pathlib import Path
 
-if TYPE_CHECKING:
-    from sift_client.sift_types.test_report import TestStatus
+from sift_client._internal.low_level_wrappers._test_results_log import iter_log_data_lines
+from sift_client.sift_types.test_report import TestStatus
 
 
 @dataclass
@@ -29,37 +26,111 @@ class CapturedStep:
     statuses: list[TestStatus] = field(default_factory=list)
 
 
-CAPTURED_STEPS: dict[str, CapturedStep] = {}
+_PROTO_STATUS_NAMES = {
+    "TEST_STATUS_UNSPECIFIED": TestStatus.UNSPECIFIED,
+    "TEST_STATUS_DRAFT": TestStatus.DRAFT,
+    "TEST_STATUS_PASSED": TestStatus.PASSED,
+    "TEST_STATUS_FAILED": TestStatus.FAILED,
+    "TEST_STATUS_ABORTED": TestStatus.ABORTED,
+    "TEST_STATUS_ERROR": TestStatus.ERROR,
+    "TEST_STATUS_IN_PROGRESS": TestStatus.IN_PROGRESS,
+    "TEST_STATUS_SKIPPED": TestStatus.SKIPPED,
+}
+
+
+def _status(name: str | None) -> TestStatus:
+    if name is None:
+        return TestStatus.UNSPECIFIED
+    return _PROTO_STATUS_NAMES.get(name, TestStatus.UNSPECIFIED)
 
 
-def reset() -> None:
-    CAPTURED_STEPS.clear()
+def parse_log(log_path: Path) -> dict[str, CapturedStep]:
+    """Parse the offline log into ``{step_id: CapturedStep}``.
+
+    Walks the JSONL file in order, building a ``CapturedStep`` for each
+    ``CreateTestStep`` entry and appending the new status from each
+    ``UpdateTestStep`` entry.
+    """
+    steps: dict[str, CapturedStep] = {}
+    for request_type, response_id, json_str in iter_log_data_lines(log_path):
+        payload = json.loads(json_str)
+        test_step = payload.get("testStep", {})
+        if request_type == "CreateTestStep" and response_id:
+            steps[response_id] = CapturedStep(
+                step_id=response_id,
+                name=test_step.get("name", ""),
+                step_path=test_step.get("stepPath", ""),
+                parent_step_id=test_step.get("parentStepId") or None,
+                statuses=[_status(test_step.get("status"))],
+            )
+        elif request_type == "UpdateTestStep":
+            step_id = test_step.get("testStepId")
+            new_status = test_step.get("status")
+            if step_id and step_id in steps and new_status is not None:
+                steps[step_id].statuses.append(_status(new_status))
+    return steps
+
+
+_active_log: Path | None = None
+_cached: dict[str, CapturedStep] | None = None
+
+
+def set_log(path: Path) -> None:
+    """Point subsequent queries at a new log file. Clears the parse cache."""
+    global _active_log, _cached
+    _active_log = path
+    _cached = None
+
+
+def _steps() -> dict[str, CapturedStep]:
+    global _cached
+    if _cached is None:
+        if _active_log is None or not _active_log.exists():
+            _cached = {}
+        else:
+            _cached = parse_log(_active_log)
+    return _cached
 
 
 def steps_by_name(name: str) -> list[CapturedStep]:
-    return [s for s in CAPTURED_STEPS.values() if s.name == name]
+    return [s for s in _steps().values() if s.name == name]
 
 
 def test_step(name: str) -> CapturedStep | None:
     """The step the autouse ``step`` fixture creates for the test function.
 
-    There can be a deeper step with the same name when the ``makereport``
-    hook also records one (e.g. ``pytest.skip()`` inside the test body, or
-    an ``xfail`` mark). The autouse step is the shallowest of those, so
-    pick by step_path depth.
+    Multiple steps can share a name (e.g. when the makereport hook records an
+    inline step for a collection-time skip on top of the autouse step). The
+    autouse step is the shallowest by path depth.
     """
-    matches = [s for s in CAPTURED_STEPS.values() if s.name == name]
+    matches = steps_by_name(name)
     if not matches:
         return None
     return min(matches, key=lambda s: s.step_path.count("."))
 
 
-def child_steps(parent: CapturedStep) -> list[CapturedStep]:
-    return [s for s in CAPTURED_STEPS.values() if s.parent_step_id == parent.step_id]
-
-
 def final_status(name: str) -> TestStatus | None:
     step = test_step(name)
-    if step is None or not step.statuses:
-        return None
-    return step.statuses[-1]
+    return step.statuses[-1] if step and step.statuses else None
+
+
+def load_steps(log_path: Path) -> list[dict]:
+    """Load the offline log as a list of step records keyed by hierarchy fields.
+
+    Each record has ``id``, ``name``, ``parent_step_id``, ``step_path``, the
+    shape ``test_hierarchy.py`` expects for its ``_by_name`` and
+    ``_ancestor_names`` walkers. Returns an empty list if the log was never
+    created (e.g. every item in the inner session was ``sift_exclude``-d, so
+    the plugin's ``report_context`` fixture never fired).
+    """
+    if not log_path.exists():
+        return []
+    return [
+        {
+            "id": s.step_id,
+            "name": s.name,
+            "parent_step_id": s.parent_step_id,
+            "step_path": s.step_path,
+        }
+        for s in parse_log(log_path).values()
+    ]
diff --git a/python/lib/sift_client/_tests/pytest_plugin/step_status_states.md b/python/lib/sift_client/_tests/pytest_plugin/step_status_states.md
index 83b64783d..f98d13fee 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/step_status_states.md
+++ b/python/lib/sift_client/_tests/pytest_plugin/step_status_states.md
@@ -1,28 +1,21 @@
-# Pytest-plugin step-status: observed vs. target
-
-Companion document to `test_step_status_states.py`. Each row corresponds to
-one scenario in that suite. The **target** column is the contract the suite
-asserts (sourced from
-[`docs/guides/pytest_plugin/pass_fail_behavior.md`](../../../../docs/guides/pytest_plugin/pass_fail_behavior.md));
-the **observed today** column records what the plugin actually produces
-right now. Every row should be marked `OK`; a `Gap` indicates the plugin has
-regressed against the contract.
-
-`TestStatus` values referenced below come from
-`sift_client.sift_types.test_report.TestStatus`: `PASSED`, `FAILED`, `ERROR`,
-`SKIPPED`, `IN_PROGRESS`. The targets below map every scenario onto these
-existing statuses. An `ABORTED` status for hard process exits (`SystemExit`,
-`KeyboardInterrupt`, signals) is a planned future addition; until it lands
-those cases baseline against `ERROR` or `IN_PROGRESS`. The user-facing
-contract these targets describe is documented in
+# Pytest-plugin step-status: test scenarios
+
+Reference for the pass/fail scenarios covered by
+[`test_pass_fail.py`](test_pass_fail.py). Each row pairs a scenario with the
+`TestStatus` the plugin records, and maps to the user-facing contract in
 [`docs/guides/pytest_plugin/pass_fail_behavior.md`](../../../../docs/guides/pytest_plugin/pass_fail_behavior.md).
 
+`TestStatus` values come from `sift_client.sift_types.test_report.TestStatus`:
+`PASSED`, `FAILED`, `ERROR`, `SKIPPED`, `ABORTED`, `IN_PROGRESS`. Hard process
+exits the plugin can observe (`SystemExit`, `KeyboardInterrupt` when pytest
+delivers a call-phase report) map to `ABORTED`. A session-aborting interrupt
+that fires before the plugin sees it leaves the step in `IN_PROGRESS`.
+
 ## Case ID scheme
 
-Each scenario has a stable case ID of the form `PREFIX-NN`, where the
-prefix names its section. Tests in `test_step_status_states.py` reference
-their case ID in a leading comment so a failing test can be traced back to
-this table without rereading the scenario:
+Each scenario has a stable case ID of the form `PREFIX-NN`. Tests in
+`test_pass_fail.py` reference their case ID in a leading comment so a test can
+be traced back to its row here without rereading the scenario:
 
 | Prefix  | Section                                  |
 | ------- | ---------------------------------------- |
@@ -33,94 +26,79 @@ this table without rereading the scenario:
 | `COLL`  | Collection / fixture-resolution failures |
 | `API`   | Plugin-API exit paths                    |
 
-IDs are stable: a new scenario in a section takes the next free number for
-that prefix; numbers are never reused or shifted when other sections grow.
 
 ## Call-phase exit paths
 
-| Case      | Scenario                                | Trigger                                       | Observed today              | Target                                     | Status |
-| --------- | --------------------------------------- | --------------------------------------------- | --------------------------- | ------------------------------------------ | ------ |
-| `CALL-01` | Test passes                             | function body returns cleanly                 | `PASSED`                    | `PASSED`                                   | OK     |
-| `CALL-02` | Assert failure in call phase            | `assert 1 == 2`                               | `FAILED`                    | `FAILED`                                   | OK     |
-| `CALL-03` | Generic exception in call phase         | `raise ValueError("boom")`                    | `ERROR`                     | `ERROR`                                    | OK     |
-| `CALL-04` | `pytest.fail("...")` from body          | `pytest.fail("intentional failure")`          | `FAILED`                    | `FAILED`                                   | OK     |
-| `CALL-05` | `SystemExit` from the test body         | `sys.exit(1)`                                 | `ERROR`                     | `ERROR` (baseline; `ABORTED` planned later) | OK    |
-| `CALL-06` | `KeyboardInterrupt` in body             | `raise KeyboardInterrupt`                     | `IN_PROGRESS` (session aborts before the plugin sees the interrupt) | `ERROR` when the plugin sees the interrupt; a session-aborting interrupt leaves the step in `IN_PROGRESS` | OK |
+| Case      | Scenario                        | Trigger                              | Outcome                                                                                                  |
+| --------- | ------------------------------- | ------------------------------------ | -------------------------------------------------------------------------------------------------------- |
+| `CALL-01` | Test passes                     | function body returns cleanly        | `PASSED`                                                                                                 |
+| `CALL-02` | Assert failure in call phase    | `assert 1 == 2`                      | `FAILED`                                                                                                 |
+| `CALL-03` | Generic exception in call phase | `raise ValueError("boom")`           | `ERROR`                                                                                                  |
+| `CALL-04` | `pytest.fail("...")` from body  | `pytest.fail("intentional failure")` | `FAILED`                                                                                                 |
+| `CALL-05` | `SystemExit` from the test body | `sys.exit(1)`                        | `ABORTED`                                                                                                |
+| `CALL-06` | `KeyboardInterrupt` in body     | `raise KeyboardInterrupt`            | `IN_PROGRESS` — session aborts before the plugin sees the interrupt; `ABORTED` if the plugin does see it |
 
 ## Skip paths
 
-| Case      | Scenario                                | Trigger                                       | Observed today                                                              | Target                                                          | Status |
-| --------- | --------------------------------------- | --------------------------------------------- | --------------------------------------------------------------------------- | --------------------------------------------------------------- | ------ |
-| `SKIP-01` | Collection-time skip                    | `@pytest.mark.skip(reason=...)`               | `SKIPPED` (only the makereport hook records a step; no autouse step ran)    | `SKIPPED`                                                       | OK     |
-| `SKIP-02` | Conditional collection-time skip        | `@pytest.mark.skipif(True, reason=...)`       | `SKIPPED` (same route as `@pytest.mark.skip`)                               | `SKIPPED`                                                       | OK     |
-| `SKIP-03` | Runtime skip in body                    | `pytest.skip("...")`                          | Outer step `SKIPPED`; no duplicate nested step                              | Outer step `SKIPPED`; no duplicate nested step                  | OK     |
-| `SKIP-04` | Skip raised inside a fixture            | `@pytest.fixture` calls `pytest.skip("...")`  | Outer step `SKIPPED` (setup-phase skip); no duplicate nested step           | Outer step `SKIPPED` (setup-phase skip); no duplicate nested step | OK   |
+| Case      | Scenario                         | Trigger                                      | Outcome                                                                  |
+| --------- | -------------------------------- | -------------------------------------------- | ------------------------------------------------------------------------ |
+| `SKIP-01` | Collection-time skip             | `@pytest.mark.skip(reason=...)`              | `SKIPPED` — only the makereport hook records a step; no autouse step ran |
+| `SKIP-02` | Conditional collection-time skip | `@pytest.mark.skipif(True, reason=...)`      | `SKIPPED` — same route as `@pytest.mark.skip`                            |
+| `SKIP-03` | Runtime skip in body             | `pytest.skip("...")`                         | Outer step `SKIPPED`; no duplicate nested step                           |
+| `SKIP-04` | Skip raised inside a fixture     | `@pytest.fixture` calls `pytest.skip("...")` | Outer step `SKIPPED` (setup-phase skip); no duplicate nested step        |
 
 ## xfail / xpass
 
-| Case       | Scenario                                  | Trigger                                                | Observed today                                                                                  | Target                                                | Status |
-| ---------- | ----------------------------------------- | ------------------------------------------------------ | ----------------------------------------------------------------------------------------------- | ----------------------------------------------------- | ------ |
-| `XFAIL-01` | xfail-marked test that fails              | `@pytest.mark.xfail` + `assert 1 == 2`                 | Outer step `PASSED` (test fulfilled the xfail expectation); no duplicate nested step             | Outer step `PASSED` (test fulfilled the xfail expectation); no duplicate nested step | OK     |
-| `XFAIL-02` | Strict xfail that unexpectedly passes     | `@pytest.mark.xfail(strict=True)` + `assert True`      | Outer step `FAILED` (mark no longer matches reality — either the bug was fixed or the test stopped testing what it claimed) | Outer step `FAILED` (mark no longer matches reality — either the bug was fixed or the test stopped testing what it claimed) | OK     |
-| `XFAIL-03` | Non-strict xfail that unexpectedly passes | `@pytest.mark.xfail()` + `assert True`                 | Outer step `PASSED`                                                                              | Outer step `PASSED` (`strict=False` doesn't insist on the failure) | OK    |
-| `XFAIL-04` | `xfail(raises=...)` with wrong exception  | `@pytest.mark.xfail(raises=ValueError)` + `raise KeyError` | `FAILED` (the `raises=` mismatch is a real test failure)                                     | `FAILED` (the `raises=` mismatch is a real test failure) | OK |
-| `XFAIL-05` | `xfail(run=False)`                        | `@pytest.mark.xfail(run=False)` (body never executed)  | `SKIPPED` (the test never ran)                                                                   | `SKIPPED` (the test never ran)                      | OK    |
+| Case       | Scenario                                  | Trigger                                                    | Outcome                                                  |
+| ---------- | ----------------------------------------- | ---------------------------------------------------------- | -------------------------------------------------------- |
+| `XFAIL-01` | xfail-marked test that fails              | `@pytest.mark.xfail` + `assert 1 == 2`                     | `PASSED` — test fulfilled the xfail expectation          |
+| `XFAIL-02` | Strict xfail that unexpectedly passes     | `@pytest.mark.xfail(strict=True)` + `assert True`          | `FAILED` — mark no longer matches reality                |
+| `XFAIL-03` | Non-strict xfail that unexpectedly passes | `@pytest.mark.xfail()` + `assert True`                     | `PASSED` — `strict=False` doesn't insist on the failure  |
+| `XFAIL-04` | `xfail(raises=...)` with wrong exception  | `@pytest.mark.xfail(raises=ValueError)` + `raise KeyError` | `FAILED` — `raises=` mismatch is a real test failure     |
+| `XFAIL-05` | `xfail(run=False)`                        | `@pytest.mark.xfail(run=False)` (body never executed)      | `SKIPPED` — the test never ran                           |
 
 ## Setup / teardown phases
 
-| Case       | Scenario                                   | Trigger                                                              | Observed today                                                                                                                            | Target                                                  | Status |
-| ---------- | ------------------------------------------ | -------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | ------ |
-| `PHASE-01` | Setup-phase fixture failure (RuntimeError) | `@pytest.fixture` raises before `yield`; test body never runs        | Outer step `ERROR`; the plugin reads the setup-phase report and maps `failed` → `ERROR`                                                   | `ERROR` (a `phase=setup` annotation is a planned follow-up) | OK     |
-| `PHASE-02` | Teardown-phase fixture failure             | `@pytest.fixture` raises after `yield`; test body passed             | Outer step `FAILED`; after teardown the plugin upgrades a passed step when the teardown report shows `failed`                              | `FAILED` (a `phase=teardown` annotation is a planned follow-up) | OK     |
-| `PHASE-03` | Call-phase fail **plus** teardown-phase fail | `assert 1 == 2` in body AND `@pytest.fixture` raises after `yield` | Outer step `FAILED` (the call-phase failure dominates); the teardown error is not yet surfaced separately                                  | `FAILED`; surfacing the teardown error alongside is a planned follow-up | OK |
+| Case       | Scenario                                     | Trigger                                                            | Outcome                                                                                                                          |
+| ---------- | -------------------------------------------- | ------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------- |
+| `PHASE-01` | Setup-phase fixture failure (RuntimeError)   | `@pytest.fixture` raises before `yield`; test body never runs      | `ERROR` — plugin reads the setup-phase report and maps `failed` → `ERROR` (a `phase=setup` annotation is a planned follow-up)    |
+| `PHASE-02` | Teardown-phase fixture failure               | `@pytest.fixture` raises after `yield`; test body passed           | `FAILED` — plugin upgrades a passed step when the teardown report shows `failed` (a `phase=teardown` annotation is a planned follow-up) |
+| `PHASE-03` | Call-phase fail **plus** teardown-phase fail | `assert 1 == 2` in body AND `@pytest.fixture` raises after `yield` | `FAILED` — call-phase failure dominates; surfacing the teardown error alongside is a planned follow-up                           |
 
 ## Collection / fixture-resolution failures
 
-| Case      | Scenario                                | Trigger                                       | Observed today                                                                                                                                  | Target                                                  | Status |
-| --------- | --------------------------------------- | --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | ------ |
-| `COLL-01` | Missing fixture                         | `def test_x(nonexistent_fixture):`            | Outer step `ERROR` — the missing fixture surfaces as a setup-phase failure, which the plugin now maps to `ERROR`                                  | `ERROR` (a `phase=setup` annotation is a planned follow-up) | OK     |
+| Case      | Scenario        | Trigger                            | Outcome                                                                                                            |
+| --------- | --------------- | ---------------------------------- | ------------------------------------------------------------------------------------------------------------------ |
+| `COLL-01` | Missing fixture | `def test_x(nonexistent_fixture):` | `ERROR` — missing fixture surfaces as a setup-phase failure (a `phase=setup` annotation is a planned follow-up)    |
 
 ## Plugin-API exit paths (in-test mutations)
 
-| Case     | Scenario                                | Trigger                                                                | Observed today                                              | Target   | Status |
-| -------- | --------------------------------------- | ---------------------------------------------------------------------- | ----------------------------------------------------------- | -------- | ------ |
-| `API-01` | Manual status override                  | `step.current_step.update({"status": TestStatus.FAILED})`              | `FAILED`                                                    | `FAILED` | OK     |
-| `API-02` | `report_outcome(result=False)`          | `step.report_outcome("the_check", False, "did not match")`             | `FAILED`                                                    | `FAILED` | OK     |
-| `API-03` | `measure(...)` out-of-bounds            | `step.measure(name="m", value=10.0, bounds={"min": 0.0, "max": 5.0})`  | `FAILED`                                                    | `FAILED` | OK     |
-| `API-04` | Failed measurement on a substep         | `with step.substep(...) as s: s.measure(... out-of-bounds)`            | `FAILED` (propagates from substep to parent)                | `FAILED` | OK     |
-| `API-05` | Manually-skipped substep                | `with step.substep(...) as s: s.current_step.update({"status": SKIPPED})` | Parent step `PASSED` (skip does not propagate as a failure) | `PASSED` | OK     |
+| Case     | Scenario                          | Trigger                                                                   | Outcome                                                                                                                     |
+| -------- | --------------------------------- | ------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------- |
+| `API-01` | Manual status override            | `step.current_step.update({"status": TestStatus.FAILED})`                 | `FAILED`                                                                                                                    |
+| `API-02` | `report_outcome(result=False)`    | `step.report_outcome("the_check", False, "did not match")`                | `FAILED`                                                                                                                    |
+| `API-03` | `measure(...)` out-of-bounds      | `step.measure(name="m", value=10.0, bounds={"min": 0.0, "max": 5.0})`     | `FAILED`                                                                                                                    |
+| `API-04` | Failed measurement on a substep   | `with step.substep(...) as s: s.measure(... out-of-bounds)`               | `FAILED` — propagates from substep to parent                                                                                |
+| `API-05` | Manually-skipped substep          | `with step.substep(...) as s: s.current_step.update({"status": SKIPPED})` | Parent step `PASSED` — skip does not propagate as a failure                                                                 |
+| `API-06` | Hard exit inside a nested substep | `with step.substep(...) as s: with s.substep(...): sys.exit(1)`           | Every open step on the unwind path records `ABORTED`; a sibling substep that closed before the abort keeps its prior status |
 
-## Out of scope for this characterization run
+## Out of scope
 
-- **Timeout** — needs `pytest-timeout` or a manual signal harness. Add as a
-  follow-up once the audit picks a timeout strategy.
+Scenarios deliberately not covered by this suite:
+
+- **Timeout** — needs `pytest-timeout` or a manual signal harness.
 - **Signal (SIGKILL / SIGTERM)** — cannot be caught from inside the process;
   needs a subprocess-level harness.
 - **`pytest.exit("...")`** — niche; the "aborts subsequent tests" behavior
-  is hard to characterize cleanly because each `pytester` invocation is its
-  own session. Document the expectation alongside `SystemExit`.
+  is hard to characterize cleanly because each `pytester` invocation is
+  its own session.
 - **`os._exit()`** — bypasses Python cleanup entirely; can't be tested
-  in-process because it would kill the outer pytest run. Document as a
-  guaranteed data-loss case alongside `SystemExit` / `SIGKILL`.
+  in-process because it would kill the outer pytest run. Guaranteed
+  data-loss case alongside `SystemExit` / `SIGKILL`.
 - **Parametrize-level marks** (`pytest.param(..., marks=pytest.mark.xfail / skip)`)
   — routes through a different selection path but produces the same
-  `report.outcome`, so behavior should match the function-level marks
-  already covered above. Add only if the plugin's eventual phase-aware
-  handler diverges between the two.
+  `report.outcome`, so behavior matches the function-level marks already
+  covered above.
 - **Import error / syntax error / `conftest.py` error** — these fail
-  collection entirely; no `item` is produced and no plugin hook fires.
-  Document explicitly that no Sift step is recorded.
-- **No-data / indeterminate** — tracked separately as part of the sibling
-  status-semantics work.
-
-## How to refresh this table
-
-Run the suite locally:
-
-```
-pytest lib/sift_client/_tests/util/test_step_status_states.py -v
-```
-
-Every row should be `OK`. If a row regresses to `Gap`, the matching test
-fails; update the **Observed today** column here to describe the
-regression and flip the row's status to **Gap** until the plugin is fixed.
+  collection entirely; no `item` is produced and no plugin hook fires, so
+  no Sift step is recorded.
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py b/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py
index cecad2df8..866471ce7 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py
@@ -4,65 +4,43 @@
 classes (including nested), parametrize axes — plus the ini opt-out flags,
 failure-cleanup semantics, and the drain helper.
 
-Each test spins up an inner pytest run via ``pytester`` whose conftest swaps
-in a ``FakeReportContext`` (from ``_fakes.py``) that records every step
-creation to a JSON file. The outer test reads that file and asserts the
-resulting step tree.
+Each test spins up an inner pytest run via ``pytester`` configured with
+``--sift-offline`` and a known log path. The plugin writes every test-result
+API call to that JSONL log, and the outer test parses it via
+``_step_status_capture.load_steps`` to reconstruct the step tree.
 """
 
 from __future__ import annotations
 
-import json
-from pathlib import Path as _Path
 from textwrap import dedent
 from typing import TYPE_CHECKING
 
 import pytest
 
+from sift_client._tests.pytest_plugin import _step_status_capture as capture
+
 if TYPE_CHECKING:
     from pathlib import Path
 
-_STEPS_FILE_ENV = "SIFT_FAKE_STEPS_FILE"
-
-# ``_fakes.py`` is excluded from the wheel by ``pyproject.toml``'s
-# ``packages.find`` rule that strips ``sift_client._tests``. The inner
-# pytester subprocess uses the installed package and cannot import from
-# ``sift_client._tests``. Embed the fake source directly into the inner
-# conftest so the subprocess gets a fully self-contained module to load.
-_FAKES_SOURCE = (_Path(__file__).parent / "_fakes.py").read_text()
-
-_INNER_CONFTEST = f"""
-{_FAKES_SOURCE}
-
-import os
-from pathlib import Path
-from unittest.mock import MagicMock
-
-import pytest
-
-pytest_plugins = ["sift_client.pytest_plugin"]
-
 
-@pytest.fixture(scope="session")
-def sift_client():
-    return MagicMock()
+_INNER_CONFTEST = 'pytest_plugins = ["sift_client.pytest_plugin"]\n'
 
 
-@pytest.fixture(scope="session", autouse=True)
-def report_context(sift_client):
-    import sift_client.pytest_plugin as plugin_module
-    steps_file = Path(os.environ[{_STEPS_FILE_ENV!r}])
-    with FakeReportContext(steps_file) as ctx:
-        plugin_module.REPORT_CONTEXT = ctx
-        yield ctx
-"""
+def _base_ini_lines(log_path: Path) -> list[str]:
+    """Default ini settings every inner pytester run needs."""
+    return [
+        "[pytest]",
+        "sift_offline = true",
+        f"sift_log_file = {log_path}",
+        "sift_git_metadata = false",
+    ]
 
 
 @pytest.fixture
-def steps_file(pytester: pytest.Pytester, monkeypatch: pytest.MonkeyPatch) -> Path:
-    path = pytester.path / "captured_steps.json"
+def log_file(pytester: pytest.Pytester) -> Path:
+    path = pytester.path / "sift.log"
     pytester.makeconftest(_INNER_CONFTEST)
-    monkeypatch.setenv(_STEPS_FILE_ENV, str(path))
+    pytester.makefile(".ini", pytest="\n".join(_base_ini_lines(path)) + "\n")
     return path
 
 
@@ -86,7 +64,7 @@ def _ancestor_names(steps: list[dict], leaf: dict) -> list[str]:
 
 
 def test_class_methods_cluster_under_class_step(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
     pytester.makepyfile(
         test_klass=dedent(
@@ -102,7 +80,7 @@ def test_b(self):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     assert len(by_name["TestFoo"]) == 1
     class_id = by_name["TestFoo"][0]["id"]
@@ -110,7 +88,7 @@ def test_b(self):
     assert by_name["test_b"][0]["parent_step_id"] == class_id
 
 
-def test_nested_classes_produce_nested_steps(pytester: pytest.Pytester, steps_file: Path) -> None:
+def test_nested_classes_produce_nested_steps(pytester: pytest.Pytester, log_file: Path) -> None:
     pytester.makepyfile(
         test_nested=dedent(
             """
@@ -123,7 +101,7 @@ def test_a(self):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=1)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     assert len(by_name["TestOuter"]) == 1
     assert len(by_name["TestInner"]) == 1
@@ -136,7 +114,7 @@ def test_a(self):
     ]
 
 
-def test_class_parametrize_nests_under_class(pytester: pytest.Pytester, steps_file: Path) -> None:
+def test_class_parametrize_nests_under_class(pytester: pytest.Pytester, log_file: Path) -> None:
     pytester.makepyfile(
         test_cp=dedent(
             """
@@ -151,7 +129,7 @@ def test_a(self, v):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     class_id = by_name["TestFoo"][0]["id"]
     test_a_id = by_name["test_a"][0]["id"]
@@ -160,7 +138,7 @@ def test_a(self, v):
     assert by_name["v=2"][0]["parent_step_id"] == test_a_id
 
 
-def test_two_sibling_classes_in_module(pytester: pytest.Pytester, steps_file: Path) -> None:
+def test_two_sibling_classes_in_module(pytester: pytest.Pytester, log_file: Path) -> None:
     pytester.makepyfile(
         test_sib=dedent(
             """
@@ -176,7 +154,7 @@ def test_y(self):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     mod_id = by_name["test_sib.py"][0]["id"]
     assert by_name["TestA"][0]["parent_step_id"] == mod_id
@@ -186,7 +164,7 @@ def test_y(self):
     assert len(by_name["TestB"]) == 1
 
 
-def test_mixed_class_and_free_function(pytester: pytest.Pytester, steps_file: Path) -> None:
+def test_mixed_class_and_free_function(pytester: pytest.Pytester, log_file: Path) -> None:
     pytester.makepyfile(
         test_mix=dedent(
             """
@@ -201,7 +179,7 @@ def test_free():
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     mod_id = by_name["test_mix.py"][0]["id"]
     # Class method parents to TestA; free function parents directly to module.
@@ -211,7 +189,7 @@ def test_free():
 
 
 def test_class_with_all_excluded_methods_no_class_step(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
     pytester.makepyfile(
         test_excl=dedent(
@@ -231,14 +209,14 @@ def test_b(self):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     assert "TestFoo" not in by_name
     assert "test_a" not in by_name
     assert "test_b" not in by_name
 
 
-def test_sift_exclude_on_class_propagates(pytester: pytest.Pytester, steps_file: Path) -> None:
+def test_sift_exclude_on_class_propagates(pytester: pytest.Pytester, log_file: Path) -> None:
     pytester.makepyfile(
         test_clsexcl=dedent(
             """
@@ -256,14 +234,14 @@ def test_b(self):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     assert "TestFoo" not in by_name
     assert "test_a" not in by_name
 
 
 def test_class_docstring_becomes_step_description(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
     pytester.makepyfile(
         test_doc=dedent(
@@ -278,7 +256,7 @@ def test_a(self):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=1)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     # The fake records step creation but not all fields — check the class
     # step was recorded, then read the description via the FakeStep's
@@ -289,7 +267,7 @@ def test_a(self):
 
 
 def test_transition_between_class_chains_drains_parametrize(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
     pytester.makepyfile(
         test_trans=dedent(
@@ -310,7 +288,7 @@ def test_y(self, w):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     # Each class opens exactly once; parametrize parents under the right class.
     assert len(by_name["TestA"]) == 1
@@ -396,7 +374,7 @@ def __exit__(self, *_: object) -> None:
 
 
 def test_failing_test_in_class_does_not_orphan_class_step(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
     """A failing class method must not block the class step from cleaning up.
 
@@ -422,7 +400,7 @@ def test_c(self):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2, failed=1)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     assert len(by_name["TestFoo"]) == 1
     assert len(by_name["TestBar"]) == 1
@@ -439,7 +417,7 @@ def test_c(self):
 
 
 def test_failing_parametrized_method_in_class_closes_full_chain(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
     """A failing parametrized class method must not orphan its parametrize parents."""
     pytester.makepyfile(
@@ -460,7 +438,7 @@ def test_b(self):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2, failed=1)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     foo_id = by_name["TestFoo"][0]["id"]
     test_a_id = by_name["test_a"][0]["id"]
@@ -476,18 +454,19 @@ def test_b(self):
 # ---------------------------------------------------------------------------
 
 
-def _write_ini(pytester: pytest.Pytester, **overrides: object) -> None:
-    """Write a pytest.ini with the given sift_* overrides set under [pytest]."""
-    lines = ["[pytest]"]
+def _write_ini(pytester: pytest.Pytester, log_file: Path, **overrides: object) -> None:
+    """Write a pytest.ini with the given sift_* overrides, preserving the
+    offline/log/git-metadata defaults the ``log_file`` fixture installs."""
+    lines = _base_ini_lines(log_file)
     for key, value in overrides.items():
         lines.append(f"{key} = {value}")
     pytester.makefile(".ini", pytest="\n".join(lines) + "\n")
 
 
 def test_sift_class_step_false_skips_class_steps(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
-    _write_ini(pytester, sift_class_step="false")
+    _write_ini(pytester, log_file, sift_class_step="false")
     pytester.makepyfile(
         test_noclass=dedent(
             """
@@ -502,7 +481,7 @@ def test_b(self):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     assert "TestFoo" not in by_name
     mod_id = by_name["test_noclass.py"][0]["id"]
@@ -511,9 +490,9 @@ def test_b(self):
 
 
 def test_sift_module_step_false_skips_module_step(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
-    _write_ini(pytester, sift_module_step="false")
+    _write_ini(pytester, log_file, sift_module_step="false")
     pytester.makepyfile(
         test_nomod=dedent(
             """
@@ -525,7 +504,7 @@ def test_a(self):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=1)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     assert "test_nomod.py" not in by_name
     # TestFoo attaches to the report root (no parent recorded by the fake).
@@ -534,9 +513,9 @@ def test_a(self):
 
 
 def test_sift_parametrize_nesting_false_keeps_flat_leaves(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
-    _write_ini(pytester, sift_parametrize_nesting="false")
+    _write_ini(pytester, log_file, sift_parametrize_nesting="false")
     pytester.makepyfile(
         test_flat=dedent(
             """
@@ -550,7 +529,7 @@ def test_a(v):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     # No parametrize parent step.
     assert "test_a" not in by_name
@@ -564,7 +543,7 @@ def test_a(v):
 
 
 def test_sift_module_step_false_still_drains_across_modules(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
     """sift_module_step=false must not merge same-named classes across modules.
 
@@ -572,7 +551,7 @@ def test_sift_module_step_false_still_drains_across_modules(
     (even when it's not rendered as a step), so two modules each declaring
     ``class TestFoo`` produce two distinct ``TestFoo`` frames in the diff.
     """
-    _write_ini(pytester, sift_module_step="false")
+    _write_ini(pytester, log_file, sift_module_step="false")
     pytester.makepyfile(
         test_a=dedent(
             """
@@ -591,7 +570,7 @@ def test_y(self):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     # Two distinct TestFoo class steps — one per module — not a shared frame.
     assert len(by_name["TestFoo"]) == 2
@@ -605,7 +584,7 @@ def test_y(self):
 
 
 def test_package_step_default_opens_for_init_dirs(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
     """Default: a directory with ``__init__.py`` produces a parent package step."""
     pytester.mkpydir("pkg_a")
@@ -619,7 +598,7 @@ def test_one():
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=1)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     assert "pkg_a" in by_name
     pkg_id = by_name["pkg_a"][0]["id"]
@@ -628,7 +607,7 @@ def test_one():
 
 
 def test_same_named_packages_in_different_dirs_do_not_merge(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
     """Two packages with the same display name but different paths must stay distinct.
 
@@ -663,7 +642,7 @@ def test_two():
     # name on disk don't collide during sys.path-based import.
     result = pytester.runpytest_subprocess("-v", "--import-mode=importlib")
     result.assert_outcomes(passed=2)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     # Two distinct ``utils`` package steps — one per project.
     assert len(by_name["utils"]) == 2
@@ -677,10 +656,10 @@ def test_two():
 
 
 def test_sift_package_step_false_skips_package_steps(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
     """With ``sift_package_step=false`` the directory step is suppressed."""
-    _write_ini(pytester, sift_package_step="false")
+    _write_ini(pytester, log_file, sift_package_step="false")
     pytester.mkpydir("pkg_a")
     (pytester.path / "pkg_a" / "test_x.py").write_text(
         dedent(
@@ -692,7 +671,7 @@ def test_one():
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=1)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     assert "pkg_a" not in by_name
     # The module step still opens and is now the top-level frame.
@@ -700,10 +679,11 @@ def test_one():
 
 
 def test_all_three_flags_false_matches_legacy_behavior(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
     _write_ini(
         pytester,
+        log_file,
         sift_module_step="false",
         sift_class_step="false",
         sift_parametrize_nesting="false",
@@ -722,7 +702,7 @@ def test_a(self, v):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     # No module, class, or parametrize parents — just bracket-mangled leaves.
     assert "test_legacy.py" not in by_name
@@ -740,7 +720,7 @@ def test_a(self, v):
 
 
 def test_single_parametrize_clusters_under_originalname(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
     pytester.makepyfile(
         test_rail=dedent(
@@ -755,7 +735,7 @@ def test_rail(v):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     # Module step + one shared `test_rail` parent + two leaves.
     assert len(by_name["test_rail.py"]) == 1
@@ -768,7 +748,7 @@ def test_rail(v):
 
 
 def test_stacked_parametrize_nests_outer_to_inner(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
     pytester.makepyfile(
         test_iso=dedent(
@@ -784,7 +764,7 @@ def test_iso(voltage, component):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=4)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     # One `test_iso` parent, two `voltage='…'` parents, four `component='…'` leaves.
     assert len(by_name["test_iso"]) == 1
@@ -806,7 +786,7 @@ def test_iso(voltage, component):
         assert leaf["parent_step_id"] in voltage_ids
 
 
-def test_fixture_parametrization_participates(pytester: pytest.Pytester, steps_file: Path) -> None:
+def test_fixture_parametrization_participates(pytester: pytest.Pytester, log_file: Path) -> None:
     pytester.makepyfile(
         test_widget=dedent(
             """
@@ -823,7 +803,7 @@ def test_widget(widget):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     assert len(by_name["test_widget"]) == 1
     parent_id = by_name["test_widget"][0]["id"]
@@ -832,7 +812,7 @@ def test_widget(widget):
 
 
 def test_module_boundary_isolates_parametrize_stack(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
     pytester.makepyfile(
         test_a=dedent(
@@ -856,7 +836,7 @@ def test_two(w):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=4)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     # Each module step contains its own `test_one`/`test_two` parametrize subtree.
     mod_a = by_name["test_a.py"][0]
@@ -866,7 +846,7 @@ def test_two(w):
 
 
 def test_leaf_parent_chain_terminates_at_report(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
     pytester.makepyfile(
         test_chain=dedent(
@@ -882,7 +862,7 @@ def test_chain(a, b):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=1)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     leaf = next(s for s in steps if s["name"].startswith("b="))
     chain = _ancestor_names(steps, leaf)
     # leaf b=… → a=… → test_chain → test_chain.py (module step) → root
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py b/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py
index a05d18982..a950cdb7d 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py
@@ -27,155 +27,41 @@
 
 
 _INNER_CONFTEST_SRC = '''
-"""Auto-generated conftest for the step-status characterization suite.
-
-Installs the Sift pytest plugin and a fake ``sift_client`` that records
-every step status write into the outer test's CAPTURED_STEPS dict.
+"""Auto-generated conftest. Loading the Sift plugin is the only thing the
+inner session needs. ``--sift-offline`` on the CLI causes the plugin's
+default ``sift_client`` fixture to construct a placeholder client and the
+real ``ReportContext`` writes every API call to the JSONL log without
+contacting Sift.
 """
 
-from __future__ import annotations
-
-import uuid
-
-import pytest
-
-# Bring the Sift fixtures + the makereport hook into this inner session.
 pytest_plugins = ["sift_client.pytest_plugin"]
-
-from sift_client._tests.pytest_plugin._step_status_capture import CAPTURED_STEPS, CapturedStep
-from sift_client.sift_types.test_report import (
-    TestMeasurement,
-    TestReport,
-    TestStep,
-)
-
-
-
-class _FakeTestResults:
-    def __init__(self, client):
-        self._client = client
-
-    def create(self, test_report, log_file=None):
-        report = TestReport(
-            id_=str(uuid.uuid4()),
-            status=test_report.status,
-            name=test_report.name,
-            test_system_name=test_report.test_system_name,
-            test_case=test_report.test_case,
-            start_time=test_report.start_time,
-            end_time=test_report.end_time,
-            metadata=test_report.metadata or {},
-            is_archived=False,
-        )
-        report._apply_client_to_instance(self._client)
-        return report
-
-    def update(self, test_report, update, log_file=None):
-        return test_report
-
-    def create_step(self, test_step, log_file=None):
-        step_id = str(uuid.uuid4())
-        CAPTURED_STEPS[step_id] = CapturedStep(
-            step_id=step_id,
-            name=test_step.name,
-            step_path=test_step.step_path,
-            parent_step_id=test_step.parent_step_id,
-            statuses=[test_step.status],
-        )
-        step = TestStep(
-            id_=step_id,
-            test_report_id=test_step.test_report_id,
-            parent_step_id=test_step.parent_step_id,
-            name=test_step.name,
-            description=test_step.description,
-            step_type=test_step.step_type,
-            step_path=test_step.step_path,
-            status=test_step.status,
-            start_time=test_step.start_time,
-            end_time=test_step.end_time,
-            error_info=test_step.error_info,
-        )
-        step._apply_client_to_instance(self._client)
-        return step
-
-    def update_step(self, test_step, update, log_file=None):
-        new_status = (
-            update.get("status") if isinstance(update, dict) else update.status
-        )
-        if test_step.id_ in CAPTURED_STEPS and new_status is not None:
-            CAPTURED_STEPS[test_step.id_].statuses.append(new_status)
-        merged_status = new_status if new_status is not None else test_step.status
-        updated = TestStep(
-            id_=test_step.id_,
-            test_report_id=test_step.test_report_id,
-            parent_step_id=test_step.parent_step_id,
-            name=test_step.name,
-            description=test_step.description,
-            step_type=test_step.step_type,
-            step_path=test_step.step_path,
-            status=merged_status,
-            start_time=test_step.start_time,
-            end_time=test_step.end_time,
-            error_info=test_step.error_info,
-        )
-        updated._apply_client_to_instance(self._client)
-        return updated
-
-    def create_measurement(self, test_measurement, update_step=False, log_file=None):
-        measurement = TestMeasurement(
-            id_=str(uuid.uuid4()),
-            measurement_type=test_measurement.measurement_type,
-            name=test_measurement.name,
-            test_step_id=test_measurement.test_step_id,
-            numeric_value=test_measurement.numeric_value,
-            string_value=test_measurement.string_value,
-            boolean_value=test_measurement.boolean_value,
-            unit=test_measurement.unit,
-            numeric_bounds=test_measurement.numeric_bounds,
-            string_expected_value=test_measurement.string_expected_value,
-            passed=test_measurement.passed,
-            timestamp=test_measurement.timestamp,
-        )
-        measurement._apply_client_to_instance(self._client)
-        return measurement
-
-
-class _FakePing:
-    def ping(self):
-        return None
-
-
-class _FakeSiftClient:
-    _simulate = False
-
-    def __init__(self):
-        self.test_results = _FakeTestResults(self)
-        self.ping = _FakePing()
-
-
-@pytest.fixture(scope="session")
-def sift_client():
-    return _FakeSiftClient()
 '''
 
 
-_RUN_ARGS = (
-    "--sift-log-file=false",
-    "--no-sift-git-metadata",
-)
-
-
 @pytest.fixture
 def inner(pytester):
-    """Reset the capture state and install the inner conftest. Returns ``pytester``."""
-    capture.reset()
+    """Install the inner conftest. Returns ``pytester``."""
     pytester.makeconftest(_INNER_CONFTEST_SRC)
     return pytester
 
 
+# Prepended to every inner test file. Pytest skips marker-based ``skip`` items
+# before any autouse fixture runs, which would leave ``REPORT_CONTEXT`` unset
+# and the plugin's inline-skip recording inert. A single passing item up-front
+# forces ``report_context`` to initialize so the makereport hook can record
+# the skip into the same session's JSONL.
+_WARMUP = "def test_sift_warmup(): pass\n\n"
+
+
 def _run(pytester, body: str) -> None:
-    pytester.makepyfile(textwrap.dedent(body))
-    pytester.runpytest_inprocess(*_RUN_ARGS)
+    pytester.makepyfile(_WARMUP + textwrap.dedent(body))
+    log_path = pytester.path / "sift.log"
+    capture.set_log(log_path)
+    pytester.runpytest_inprocess(
+        "--sift-offline",
+        f"--sift-log-file={log_path}",
+        "--no-sift-git-metadata",
+    )
 
 
 # ---------------------------------------------------------------------------
@@ -219,7 +105,7 @@ def test_x():
     assert capture.final_status("test_x") == TestStatus.ERROR
 
 
-def test_system_exit_maps_to_error(inner):
+def test_system_exit_maps_to_aborted(inner):
     # Case: CALL-05
     _run(
         inner,
@@ -229,7 +115,7 @@ def test_x():
             sys.exit(1)
         """,
     )
-    assert capture.final_status("test_x") == TestStatus.ERROR
+    assert capture.final_status("test_x") == TestStatus.ABORTED
 
 
 def test_pytest_fail_maps_to_failed(inner):
@@ -621,3 +507,35 @@ def test_x(step):
     outer = capture.test_step("test_x")
     assert outer is not None
     assert outer.statuses[-1] == TestStatus.PASSED
+
+
+def test_abort_inside_substep_marks_every_open_step_aborted(inner):
+    # Case: API-06
+    _run(
+        inner,
+        """
+        import sys
+        def test_x(step):
+            with step.substep(name="completed_sub"):
+                pass
+            with step.substep(name="outer_sub") as outer_sub:
+                with outer_sub.substep(name="inner_sub"):
+                    sys.exit(1)
+        """,
+    )
+    # SystemExit unwinds the substep stack on the way out. Every step that was
+    # open when the abort fired (inner substep, outer substep, test step)
+    # must record ABORTED. The sibling substep that closed cleanly before the
+    # abort must retain its PASSED status.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.ABORTED
+    outer_sub = next(iter(capture.steps_by_name("outer_sub")), None)
+    inner_sub = next(iter(capture.steps_by_name("inner_sub")), None)
+    completed_sub = next(iter(capture.steps_by_name("completed_sub")), None)
+    assert outer_sub is not None
+    assert inner_sub is not None
+    assert completed_sub is not None
+    assert outer_sub.statuses[-1] == TestStatus.ABORTED
+    assert inner_sub.statuses[-1] == TestStatus.ABORTED
+    assert completed_sub.statuses[-1] == TestStatus.PASSED
diff --git a/python/lib/sift_client/pytest_plugin.py b/python/lib/sift_client/pytest_plugin.py
index 07704383d..68304cbf7 100644
--- a/python/lib/sift_client/pytest_plugin.py
+++ b/python/lib/sift_client/pytest_plugin.py
@@ -582,13 +582,11 @@ def _resolve_initial_status(new_step: NewStep, item: pytest.Item) -> None:
                 status = TestStatus.FAILED
             elif isinstance(excinfo.value, pytest.fail.Exception):
                 status = TestStatus.FAILED
-            elif isinstance(excinfo.value, KeyboardInterrupt):
-                # Hold status at IN_PROGRESS; a session-aborting interrupt should
-                # not look like a clean pass. Keep the managed flag so the default
-                # exit path does not coerce IN_PROGRESS to PASSED.
-                keep_managed = True
-            elif isinstance(excinfo.value, SystemExit):
-                status = TestStatus.ERROR
+            elif isinstance(excinfo.value, (KeyboardInterrupt, SystemExit)):
+                # Hard exits the plugin can observe: pytest converted the
+                # raise into a call-phase report. The session-aborting variant
+                # (call_phase is None) lands earlier and stays IN_PROGRESS.
+                status = TestStatus.ABORTED
                 error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
             elif xfail_marker is not None:
                 # xfail(raises=X) with a non-matching exception: the contract failed.
diff --git a/python/lib/sift_client/util/test_results/context_manager.py b/python/lib/sift_client/util/test_results/context_manager.py
index 0f6097abc..5fd090bfc 100644
--- a/python/lib/sift_client/util/test_results/context_manager.py
+++ b/python/lib/sift_client/util/test_results/context_manager.py
@@ -385,30 +385,22 @@ def mark_step_failed_after_close(self, step: TestStep):
         if len(path_parts) > 1:
             self.open_step_results[".".join(path_parts[:-1])] = False
 
-    def resolve_and_propagate_step_result(
-        self,
-        step: TestStep,
-        error_info: ErrorInfo | None = None,
-    ) -> bool:
-        """Resolve the result of a step and propagate the result to the parent step if it failed."""
-        result = self.open_step_results.get(step.step_path, True)
-        if error_info:
-            result = False
-        if step.status != TestStatus.IN_PROGRESS:
-            # The step was manually completed so use that result.
-            # Skipped steps are considered passed.
-            result = step.status in (TestStatus.PASSED, TestStatus.SKIPPED)
-
-        # Update the parent step results if this step failed (true by default so no need to do anything if we didn't fail).
-        if not result:
+    def propagate_step_result(self, step: TestStep, status: TestStatus) -> bool:
+        """Propagate this step's final status to the parent step.
+
+        Status is the governor: anything outside ``{PASSED, SKIPPED}`` counts
+        as a failure for the parent. ``error_info`` is intentionally not
+        consulted here; it is free-form diagnostic data that may sit on a
+        step regardless of status.
+        """
+        succeeded = status in (TestStatus.PASSED, TestStatus.SKIPPED)
+        if not succeeded:
             self.any_failures = True
             self.open_step_results[step.step_path] = False
             path_parts = step.step_path.split(".")
             if len(path_parts) > 1:
-                parent_step_path = ".".join(path_parts[:-1])
-                self.open_step_results[parent_step_path] = False
-
-        return result
+                self.open_step_results[".".join(path_parts[:-1])] = False
+        return succeeded
 
     def exit_step(self, step: TestStep):
         """Exit a step and update the report context."""
@@ -494,26 +486,42 @@ def update_step_from_result(
         returns: The false if step failed or errored, true otherwise.
         """
         error_info = None
+        aborted = False
         assert self.current_step is not None
         if exc:
             if isinstance(exc_value, AssertionError) and not self.assertion_as_fail_not_error:
                 # If we're not showing assertion errors (i.e. pytest), mark step as failed but don't set error info.
                 self.report_context.record_step_outcome(False, self.current_step)
+            elif isinstance(exc_value, (KeyboardInterrupt, SystemExit)):
+                # Hard exit propagating through the substep stack: record as
+                # ABORTED so every in-progress step on the way out reflects
+                # the abort rather than coercing to ERROR.
+                aborted = True
+                error_info = format_truncated_traceback(exc, exc_value, tb)
             else:
                 error_info = format_truncated_traceback(exc, exc_value, tb)
 
-        # Resolve the status of this step (i.e. fail if children failed) and propagate the result to the parent step.
-        result = self.report_context.resolve_and_propagate_step_result(
-            self.current_step, error_info
-        )
-
-        # Mark the step as completed
+        # Status is the governor: anything other than IN_PROGRESS was set
+        # deliberately (manual override, plugin pre-resolution, etc.) and must
+        # not be silently overwritten by side-channel signals. When the step is
+        # still IN_PROGRESS, fill it in: hard-exit aborts first, then captured
+        # exceptions, then the substep-driven pass/fail read from
+        # open_step_results.
         status = self.current_step.status
         if status == TestStatus.IN_PROGRESS:
-            # Update the status only if the step was in progress i.e. not updated elsewhere.
-            status = TestStatus.PASSED if result else TestStatus.FAILED
-        if error_info:
-            status = TestStatus.ERROR
+            if aborted:
+                status = TestStatus.ABORTED
+            elif error_info:
+                status = TestStatus.ERROR
+            else:
+                children_passed = self.report_context.open_step_results.get(
+                    self.current_step.step_path, True
+                )
+                status = TestStatus.PASSED if children_passed else TestStatus.FAILED
+
+        # Propagate based on the resolved status; error_info rides along as
+        # pure diagnostics and does not affect propagation.
+        result = self.report_context.propagate_step_result(self.current_step, status)
         self.current_step.update(
             {
                 "status": status,
@@ -527,12 +535,12 @@ def update_step_from_result(
     def __exit__(self, exc, exc_value, tb):
         if getattr(self, "_sift_managed_externally", False):
             # The pytest fixture already resolved status from phase reports.
-            # Run the standard propagation so the parent step sees this step's
-            # pass/fail, emit one update_step with the resolved values, and pop
-            # from the stack without re-classifying.
+            # Propagate based on that resolved status, emit one update_step
+            # with the resolved values, and pop from the stack without
+            # re-classifying.
             assert self.current_step is not None
-            result = self.report_context.resolve_and_propagate_step_result(
-                self.current_step, self.current_step.error_info
+            result = self.report_context.propagate_step_result(
+                self.current_step, self.current_step.status
             )
             self.current_step.update(
                 {

From 798d5ab7d8720bb619b8dfe62067d37a849f0910 Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Fri, 22 May 2026 20:52:48 -0700
Subject: [PATCH 4/6] add pass/fail behavior guide

---
 python/docs/examples/pytest_plugin.md         |   4 +
 .../pytest_plugin/pass_fail_behavior.md       | 187 ++++++++++++++++++
 python/mkdocs.yml                             |   3 +
 3 files changed, 194 insertions(+)
 create mode 100644 python/docs/guides/pytest_plugin/pass_fail_behavior.md

diff --git a/python/docs/examples/pytest_plugin.md b/python/docs/examples/pytest_plugin.md
index c464e564e..5a40d450d 100644
--- a/python/docs/examples/pytest_plugin.md
+++ b/python/docs/examples/pytest_plugin.md
@@ -306,6 +306,10 @@ outcomes into `TestStatus`:
 | Non-`AssertionError` exception escapes the test (e.g. `ValueError`, `TimeoutError`) | `ERROR`, with the formatted traceback (last 10 frames plus the first frame) on `step.error_info.error_message` |
 | Manual `step.current_step.update({"status": ...})` | Whatever you set; the step exit handler honors a manually-resolved status |
 
+For the full contract, including skips, xfail/xpass, hard exits (`SystemExit`,
+`KeyboardInterrupt`), setup/teardown phase failures, and propagation rules,
+see the [Pass/Fail Behavior guide](../guides/pytest_plugin/pass_fail_behavior.md).
+
 A failure or error at any depth propagates upward: the parent substep, the
 function step, the class/module/package steps above it, and the session
 report all get marked failed.
diff --git a/python/docs/guides/pytest_plugin/pass_fail_behavior.md b/python/docs/guides/pytest_plugin/pass_fail_behavior.md
new file mode 100644
index 000000000..0c9636fe6
--- /dev/null
+++ b/python/docs/guides/pytest_plugin/pass_fail_behavior.md
@@ -0,0 +1,187 @@
+# Pass/Fail Behavior
+
+The pytest plugin maps every pytest outcome to a `TestStatus` on the
+corresponding Sift step. Use this page to look up what a given test will
+produce, and how that result rolls up to the parent steps and the report.
+
+## `TestStatus` values
+
+The statuses below come from `sift_client.sift_types.test_report.TestStatus`.
+
+| Status        | Meaning                                                                                              |
+| ------------- | ---------------------------------------------------------------------------------------------------- |
+| `PASSED`      | The step completed and every check it owns succeeded.                                                |
+| `FAILED`      | An assertion, a `pytest.fail(...)`, a failed `report_outcome`, or a failing measurement marked it.   |
+| `ERROR`       | An unexpected exception escaped the test body or a fixture (setup or teardown).                      |
+| `ABORTED`     | A hard exit (`SystemExit`, observed `KeyboardInterrupt`) interrupted the test.                       |
+| `SKIPPED`     | The test was skipped at collection time, at runtime, or from a fixture.                              |
+| `IN_PROGRESS` | The plugin never observed a final outcome (e.g. a session-aborting interrupt killed pytest first).   |
+
+## Normal test outcomes
+
+| Scenario                                  | Trigger                              | Outcome  |
+| ----------------------------------------- | ------------------------------------ | -------- |
+| Test passes                               | function body returns cleanly        | `PASSED` |
+| Assertion failure                         | `assert 1 == 2`                      | `FAILED` |
+| `pytest.fail("...")` from the body        | `pytest.fail("intentional failure")` | `FAILED` |
+| Uncaught non-assertion exception          | `raise ValueError("boom")`           | `ERROR`  |
+
+A non-assertion exception gets its formatted traceback recorded on
+`step.error_info.error_message` (first frame plus the last ten frames).
+Assertions and `pytest.fail` skip the traceback; pytest already prints one
+in the runner output.
+
+## Hard exits
+
+Hard exits the plugin can observe map to `ABORTED`. If pytest tears the
+session down before the plugin sees the exit, the step stays at
+`IN_PROGRESS` instead of resolving.
+
+| Scenario                                       | Trigger                   | Outcome                                                              |
+| ---------------------------------------------- | ------------------------- | -------------------------------------------------------------------- |
+| `SystemExit` from the test body                | `sys.exit(1)`             | `ABORTED`                                                            |
+| `KeyboardInterrupt` the plugin observes        | `raise KeyboardInterrupt` | `ABORTED`                                                            |
+| Session-aborting `KeyboardInterrupt`           | Ctrl-C terminates pytest  | `IN_PROGRESS` (session ends before the plugin's hooks fire)          |
+
+### Abort propagation through nested substeps
+
+When a hard exit fires inside nested substeps, the exception unwinds the
+substep stack. Every step that was open when the abort fired records
+`ABORTED`. A sibling substep that closed cleanly before the abort keeps
+its prior status.
+
+```python title="test_abort.py"
+import sys
+
+
+def test_x(step):
+    with step.substep(name="completed_sub"):
+        pass  # closes as PASSED before the abort
+    with step.substep(name="outer_sub") as outer_sub:
+        with outer_sub.substep(name="inner_sub"):
+            sys.exit(1)  # ABORTED unwinds through inner_sub, outer_sub, and the test step
+```
+
+The Sift report shows `completed_sub` as `PASSED` and the three steps
+still open at the abort (`inner_sub`, `outer_sub`, and the test step
+itself) as `ABORTED`.
+
+## Skips
+
+| Scenario                              | Trigger                                       | Outcome   |
+| ------------------------------------- | --------------------------------------------- | --------- |
+| Collection-time skip                  | `@pytest.mark.skip(reason=...)`               | `SKIPPED` |
+| Conditional collection-time skip      | `@pytest.mark.skipif(True, reason=...)`       | `SKIPPED` |
+| Runtime skip from the test body       | `pytest.skip("...")`                          | `SKIPPED` |
+| Skip raised inside a fixture          | `@pytest.fixture` calls `pytest.skip("...")`  | `SKIPPED` |
+
+`SKIPPED` does not propagate as a failure. A skipped substep or test does
+not block its parent from resolving to `PASSED`.
+
+!!! info "Collection-time skip path"
+    `@pytest.mark.skip` and `@pytest.mark.skipif` short-circuit before any
+    autouse fixture fires, so the plugin records the step from inside its
+    `pytest_runtest_makereport` hook rather than from the `step` fixture.
+    Runtime skips (`pytest.skip(...)`) and skips from inside a fixture go
+    through the normal fixture path, with the autouse `step` resolving to
+    `SKIPPED`.
+
+## Expected failures (xfail / xpass)
+
+xfail marks declare that a test is expected to fail. The plugin follows
+the same semantics pytest does.
+
+| Scenario                                  | Trigger                                                    | Outcome                                                       |
+| ----------------------------------------- | ---------------------------------------------------------- | ------------------------------------------------------------- |
+| xfail-marked test that fails              | `@pytest.mark.xfail` + `assert 1 == 2`                     | `PASSED` (the test fulfilled the xfail expectation)           |
+| Strict xfail that unexpectedly passes     | `@pytest.mark.xfail(strict=True)` + `assert True`          | `FAILED` (the mark no longer matches reality)                 |
+| Non-strict xfail that unexpectedly passes | `@pytest.mark.xfail()` + `assert True`                     | `PASSED` (`strict=False` does not insist on the failure)      |
+| `xfail(raises=...)` with wrong exception  | `@pytest.mark.xfail(raises=ValueError)` + `raise KeyError` | `FAILED` (the `raises=` mismatch is a real test failure)      |
+| `xfail(run=False)`                        | `@pytest.mark.xfail(run=False)`                            | `SKIPPED` (the body never ran)                                |
+
+## Setup and teardown phases
+
+Fixture failures land on the test step the fixture was resolving for.
+Setup-phase and teardown-phase failures resolve differently:
+
+| Scenario                                     | Trigger                                                          | Outcome                                                       |
+| -------------------------------------------- | ---------------------------------------------------------------- | ------------------------------------------------------------- |
+| Setup-phase fixture failure                  | `@pytest.fixture` raises before `yield`; body never runs         | `ERROR`                                                       |
+| Teardown-phase fixture failure               | `@pytest.fixture` raises after `yield`; body passed              | `FAILED` (the plugin upgrades the closed step after teardown) |
+| Call-phase failure plus teardown failure     | `assert 1 == 2` in the body AND a `@pytest.fixture` raises after `yield` | `FAILED` (the call-phase failure dominates)            |
+
+## Collection and fixture resolution
+
+| Scenario        | Trigger                            | Outcome |
+| --------------- | ---------------------------------- | ------- |
+| Missing fixture | `def test_x(nonexistent_fixture):` | `ERROR` (surfaces as a setup-phase failure) |
+
+## Influencing outcomes from test code
+
+A test can also set the step's outcome directly via the helpers below.
+Substeps your test opens follow the same propagation rules as the ones
+the plugin opens for you.
+
+### Manual status override
+
+`step.current_step.update({...})` sets the status directly. The step's
+exit handler does not overwrite it.
+
+```python
+from sift_client.sift_types.test_report import TestStatus
+
+
+def test_manual(step):
+    step.current_step.update({"status": TestStatus.FAILED})
+```
+
+### `report_outcome` for externally computed checks
+
+`report_outcome(name, result, reason)` records a named check whose
+pass/fail was computed elsewhere (a subprocess, a remote system, your own
+comparison logic). A failing outcome marks the step `FAILED`.
+
+```python
+def test_external_check(step):
+    result, reason = run_external_validator()
+    step.report_outcome("ext-validator", result, reason)
+```
+
+### Measurements with bounds
+
+`step.measure(name=, value=, bounds=)` records a measurement and resolves
+the step to `FAILED` if the value is out of bounds. The call returns the
+pass/fail boolean and does not raise, so multiple measurements can run
+without short-circuiting.
+
+```python
+def test_battery(step):
+    step.measure(name="voltage", value=12.1, bounds={"min": 11.5, "max": 13.0}, unit="V")
+    step.measure(name="current", value=0.42, bounds={"max": 1.0}, unit="A")
+```
+
+### Substep failures
+
+A failed substep propagates failure to its parent step. A manually-set
+`SKIPPED` on a substep does not.
+
+```python
+def test_with_substep(step):
+    with step.substep(name="check") as inner:
+        inner.measure(name="value", value=99.0, bounds={"min": 0.0, "max": 5.0})
+    # The outer step resolves to FAILED because the substep failed.
+```
+
+## Propagation rules
+
+Steps the plugin opens (function, class, module, package) and substeps
+your test opens follow the same rules:
+
+- A step whose status is anything other than `PASSED` or `SKIPPED`
+  propagates failure to its parent and to the report.
+- `SKIPPED` does not propagate as a failure.
+- A status set explicitly via `current_step.update` is kept; the plugin
+  does not overwrite it.
+- `error_info` is free-form diagnostic data. The step's status is the
+  authoritative signal; `error_info` rides along for the dashboard but
+  does not change how propagation behaves.
diff --git a/python/mkdocs.yml b/python/mkdocs.yml
index 5108b7e4a..af174aa4f 100644
--- a/python/mkdocs.yml
+++ b/python/mkdocs.yml
@@ -62,6 +62,9 @@ nav:
         # Will migrate to Guides in the future
       - Pytest Plugin: examples/pytest_plugin.md
       - Pytest Plugin Quickstart: examples/pytest_plugin_quickstart.md
+  - Guides:
+      - Pytest Plugin:
+          - Pass/Fail Behavior: guides/pytest_plugin/pass_fail_behavior.md
 #  - Guides:
 #      - Logging
 #      - Error Handling

From 1e7ee6f5e43bb0055ff1ca8aebb1cf6b6a3ce5ba Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Tue, 26 May 2026 10:20:29 -0700
Subject: [PATCH 5/6] update docs

---
 .../pytest_plugin/pass_fail_behavior.md       | 82 +++++++------------
 1 file changed, 30 insertions(+), 52 deletions(-)

diff --git a/python/docs/guides/pytest_plugin/pass_fail_behavior.md b/python/docs/guides/pytest_plugin/pass_fail_behavior.md
index 0c9636fe6..8ec57701f 100644
--- a/python/docs/guides/pytest_plugin/pass_fail_behavior.md
+++ b/python/docs/guides/pytest_plugin/pass_fail_behavior.md
@@ -8,14 +8,14 @@ produce, and how that result rolls up to the parent steps and the report.
 
 The statuses below come from `sift_client.sift_types.test_report.TestStatus`.
 
-| Status        | Meaning                                                                                              |
-| ------------- | ---------------------------------------------------------------------------------------------------- |
-| `PASSED`      | The step completed and every check it owns succeeded.                                                |
-| `FAILED`      | An assertion, a `pytest.fail(...)`, a failed `report_outcome`, or a failing measurement marked it.   |
-| `ERROR`       | An unexpected exception escaped the test body or a fixture (setup or teardown).                      |
-| `ABORTED`     | A hard exit (`SystemExit`, observed `KeyboardInterrupt`) interrupted the test.                       |
-| `SKIPPED`     | The test was skipped at collection time, at runtime, or from a fixture.                              |
-| `IN_PROGRESS` | The plugin never observed a final outcome (e.g. a session-aborting interrupt killed pytest first).   |
+| Status        | Meaning                                                                                                                |
+| ------------- |------------------------------------------------------------------------------------------------------------------------|
+| `PASSED`      | The step completed and every check it owns succeeded.                                                                  |
+| `FAILED`      | An assertion, a `pytest.fail(...)`, a failed `report_outcome`, or a failing measurement marked it.                     |
+| `ERROR`       | An unexpected exception escaped the test body or a fixture (setup or teardown).                                        |
+| `ABORTED`     | A hard exit (`SystemExit`, observed `KeyboardInterrupt`) interrupted the test.                                         |
+| `SKIPPED`     | The test was skipped at collection time, at runtime, or from a fixture.                                                |
+| `IN_PROGRESS` | Test in progress or the plugin never observed a final outcome (e.g. a session-aborting interrupt killed pytest first). |
 
 ## Normal test outcomes
 
@@ -27,9 +27,7 @@ The statuses below come from `sift_client.sift_types.test_report.TestStatus`.
 | Uncaught non-assertion exception          | `raise ValueError("boom")`           | `ERROR`  |
 
 A non-assertion exception gets its formatted traceback recorded on
-`step.error_info.error_message` (first frame plus the last ten frames).
-Assertions and `pytest.fail` skip the traceback; pytest already prints one
-in the runner output.
+`step.error_info.error_message`.
 
 ## Hard exits
 
@@ -45,10 +43,8 @@ session down before the plugin sees the exit, the step stays at
 
 ### Abort propagation through nested substeps
 
-When a hard exit fires inside nested substeps, the exception unwinds the
-substep stack. Every step that was open when the abort fired records
-`ABORTED`. A sibling substep that closed cleanly before the abort keeps
-its prior status.
+Every step that was open when the abort fired records
+`ABORTED`.
 
 ```python title="test_abort.py"
 import sys
@@ -59,7 +55,7 @@ def test_x(step):
         pass  # closes as PASSED before the abort
     with step.substep(name="outer_sub") as outer_sub:
         with outer_sub.substep(name="inner_sub"):
-            sys.exit(1)  # ABORTED unwinds through inner_sub, outer_sub, and the test step
+            sys.exit(1)  # ABORTED applied to inner_sub, outer_sub, and the test step
 ```
 
 The Sift report shows `completed_sub` as `PASSED` and the three steps
@@ -78,14 +74,6 @@ itself) as `ABORTED`.
 `SKIPPED` does not propagate as a failure. A skipped substep or test does
 not block its parent from resolving to `PASSED`.
 
-!!! info "Collection-time skip path"
-    `@pytest.mark.skip` and `@pytest.mark.skipif` short-circuit before any
-    autouse fixture fires, so the plugin records the step from inside its
-    `pytest_runtest_makereport` hook rather than from the `step` fixture.
-    Runtime skips (`pytest.skip(...)`) and skips from inside a fixture go
-    through the normal fixture path, with the autouse `step` resolving to
-    `SKIPPED`.
-
 ## Expected failures (xfail / xpass)
 
 xfail marks declare that a test is expected to fail. The plugin follows
@@ -99,23 +87,6 @@ the same semantics pytest does.
 | `xfail(raises=...)` with wrong exception  | `@pytest.mark.xfail(raises=ValueError)` + `raise KeyError` | `FAILED` (the `raises=` mismatch is a real test failure)      |
 | `xfail(run=False)`                        | `@pytest.mark.xfail(run=False)`                            | `SKIPPED` (the body never ran)                                |
 
-## Setup and teardown phases
-
-Fixture failures land on the test step the fixture was resolving for.
-Setup-phase and teardown-phase failures resolve differently:
-
-| Scenario                                     | Trigger                                                          | Outcome                                                       |
-| -------------------------------------------- | ---------------------------------------------------------------- | ------------------------------------------------------------- |
-| Setup-phase fixture failure                  | `@pytest.fixture` raises before `yield`; body never runs         | `ERROR`                                                       |
-| Teardown-phase fixture failure               | `@pytest.fixture` raises after `yield`; body passed              | `FAILED` (the plugin upgrades the closed step after teardown) |
-| Call-phase failure plus teardown failure     | `assert 1 == 2` in the body AND a `@pytest.fixture` raises after `yield` | `FAILED` (the call-phase failure dominates)            |
-
-## Collection and fixture resolution
-
-| Scenario        | Trigger                            | Outcome |
-| --------------- | ---------------------------------- | ------- |
-| Missing fixture | `def test_x(nonexistent_fixture):` | `ERROR` (surfaces as a setup-phase failure) |
-
 ## Influencing outcomes from test code
 
 A test can also set the step's outcome directly via the helpers below.
@@ -174,14 +145,21 @@ def test_with_substep(step):
 
 ## Propagation rules
 
-Steps the plugin opens (function, class, module, package) and substeps
-your test opens follow the same rules:
-
-- A step whose status is anything other than `PASSED` or `SKIPPED`
-  propagates failure to its parent and to the report.
-- `SKIPPED` does not propagate as a failure.
-- A status set explicitly via `current_step.update` is kept; the plugin
-  does not overwrite it.
-- `error_info` is free-form diagnostic data. The step's status is the
-  authoritative signal; `error_info` rides along for the dashboard but
-  does not change how propagation behaves.
+Every non-`PASSED`/`SKIPPED` step marks its parent as failed. What the
+parent records depends on whether the failure raised an exception:
+
+- When an exception is raised, every step it passes through on its way
+  up records the matching status: `ABORTED` for `SystemExit` and observed
+  `KeyboardInterrupt`, `ERROR` for everything else.
+- When the failure is recorded without raising (failed measurement,
+  failed `report_outcome`, manual non-PASSED status), the parent only
+  sees a child-failed signal and resolves to `FAILED` unless its own
+  logic says otherwise.
+
+Pytest catches the test body's exception at the test function step, so
+the exception-driven chain ends there. Hierarchy parents above (class,
+module, package) and the report only see the child-failed signal and
+resolve to `FAILED`.
+
+`SKIPPED` does not propagate. A status set explicitly via
+`current_step.update` is kept.

From a447c853eafca58ea2e22db84edafe60d978af98 Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Tue, 26 May 2026 11:07:50 -0700
Subject: [PATCH 6/6] codify more explicitly that errors are recorded as ERROR
 and parent steps FAIL instead of error

---
 .../pytest_plugin/pass_fail_behavior.md       | 27 +++++----
 .../pytest_plugin/_step_status_capture.py     |  5 +-
 .../pytest_plugin/step_status_states.md       |  1 +
 .../_tests/pytest_plugin/test_hierarchy.py    | 15 ++---
 .../_tests/pytest_plugin/test_pass_fail.py    | 21 +++++++
 python/lib/sift_client/pytest_plugin.py       | 27 ++++++---
 .../util/test_results/context_manager.py      | 58 ++++++++++++-------
 7 files changed, 101 insertions(+), 53 deletions(-)

diff --git a/python/docs/guides/pytest_plugin/pass_fail_behavior.md b/python/docs/guides/pytest_plugin/pass_fail_behavior.md
index 8ec57701f..6e9b1d6e3 100644
--- a/python/docs/guides/pytest_plugin/pass_fail_behavior.md
+++ b/python/docs/guides/pytest_plugin/pass_fail_behavior.md
@@ -146,20 +146,19 @@ def test_with_substep(step):
 ## Propagation rules
 
 Every non-`PASSED`/`SKIPPED` step marks its parent as failed. What the
-parent records depends on whether the failure raised an exception:
-
-- When an exception is raised, every step it passes through on its way
-  up records the matching status: `ABORTED` for `SystemExit` and observed
-  `KeyboardInterrupt`, `ERROR` for everything else.
-- When the failure is recorded without raising (failed measurement,
-  failed `report_outcome`, manual non-PASSED status), the parent only
-  sees a child-failed signal and resolves to `FAILED` unless its own
-  logic says otherwise.
-
-Pytest catches the test body's exception at the test function step, so
-the exception-driven chain ends there. Hierarchy parents above (class,
-module, package) and the report only see the child-failed signal and
-resolve to `FAILED`.
+parent records depends on whether its own scope had an abort and whether
+a child already failed:
+
+- A hard exit (`SystemExit` or an observed `KeyboardInterrupt`) in the
+  step's own scope records `ABORTED`. `ABORTED` propagates through every
+  step the abort passes through on its way up.
+- A child that already recorded a non-`PASSED`/`SKIPPED` outcome marks
+  the parent as `FAILED`. This holds whether or not an exception is still
+  propagating through the parent's scope: only the originating substep
+  records `ERROR`; ancestors inherit `FAILED`. The traceback stays on
+  the originating step's `error_info`.
+- A step records `ERROR` only when its own scope raised a non-Assertion
+  exception AND no child has failed.
 
 `SKIPPED` does not propagate. A status set explicitly via
 `current_step.update` is kept.
diff --git a/python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py b/python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py
index dd8e8c05a..e92d1726e 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py
@@ -11,11 +11,14 @@
 
 import json
 from dataclasses import dataclass, field
-from pathlib import Path
+from typing import TYPE_CHECKING
 
 from sift_client._internal.low_level_wrappers._test_results_log import iter_log_data_lines
 from sift_client.sift_types.test_report import TestStatus
 
+if TYPE_CHECKING:
+    from pathlib import Path
+
 
 @dataclass
 class CapturedStep:
diff --git a/python/lib/sift_client/_tests/pytest_plugin/step_status_states.md b/python/lib/sift_client/_tests/pytest_plugin/step_status_states.md
index f98d13fee..7e366a512 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/step_status_states.md
+++ b/python/lib/sift_client/_tests/pytest_plugin/step_status_states.md
@@ -37,6 +37,7 @@ be traced back to its row here without rereading the scenario:
 | `CALL-04` | `pytest.fail("...")` from body  | `pytest.fail("intentional failure")` | `FAILED`                                                                                                 |
 | `CALL-05` | `SystemExit` from the test body | `sys.exit(1)`                        | `ABORTED`                                                                                                |
 | `CALL-06` | `KeyboardInterrupt` in body     | `raise KeyboardInterrupt`            | `IN_PROGRESS` — session aborts before the plugin sees the interrupt; `ABORTED` if the plugin does see it |
+| `CALL-07` | Substep raises non-Assertion exception | `with step.substep(...): raise ValueError("boom")` | Substep `ERROR`, test step `FAILED` (child-failed signal outranks the propagating exception) |
 
 ## Skip paths
 
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py b/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py
index 866471ce7..1efd4e817 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py
@@ -63,9 +63,7 @@ def _ancestor_names(steps: list[dict], leaf: dict) -> list[str]:
     return chain
 
 
-def test_class_methods_cluster_under_class_step(
-    pytester: pytest.Pytester, log_file: Path
-) -> None:
+def test_class_methods_cluster_under_class_step(pytester: pytest.Pytester, log_file: Path) -> None:
     pytester.makepyfile(
         test_klass=dedent(
             """
@@ -456,16 +454,15 @@ def test_b(self):
 
 def _write_ini(pytester: pytest.Pytester, log_file: Path, **overrides: object) -> None:
     """Write a pytest.ini with the given sift_* overrides, preserving the
-    offline/log/git-metadata defaults the ``log_file`` fixture installs."""
+    offline/log/git-metadata defaults the ``log_file`` fixture installs.
+    """
     lines = _base_ini_lines(log_file)
     for key, value in overrides.items():
         lines.append(f"{key} = {value}")
     pytester.makefile(".ini", pytest="\n".join(lines) + "\n")
 
 
-def test_sift_class_step_false_skips_class_steps(
-    pytester: pytest.Pytester, log_file: Path
-) -> None:
+def test_sift_class_step_false_skips_class_steps(pytester: pytest.Pytester, log_file: Path) -> None:
     _write_ini(pytester, log_file, sift_class_step="false")
     pytester.makepyfile(
         test_noclass=dedent(
@@ -845,9 +842,7 @@ def test_two(w):
     assert by_name["test_two"][0]["parent_step_id"] == mod_b["id"]
 
 
-def test_leaf_parent_chain_terminates_at_report(
-    pytester: pytest.Pytester, log_file: Path
-) -> None:
+def test_leaf_parent_chain_terminates_at_report(pytester: pytest.Pytester, log_file: Path) -> None:
     pytester.makepyfile(
         test_chain=dedent(
             """
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py b/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py
index a950cdb7d..0e1540ce7 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py
@@ -153,6 +153,27 @@ def test_x():
     assert outer.statuses[-1] == TestStatus.IN_PROGRESS
 
 
+def test_substep_exception_records_error_with_failed_parent(inner):
+    # Case: CALL-07
+    _run(
+        inner,
+        """
+        def test_x(step):
+            with step.substep(name="inner"):
+                raise ValueError("boom")
+        """,
+    )
+    # Only the originating substep records ERROR. The test step inherits the
+    # child-failed signal and resolves to FAILED, even though the same
+    # ValueError propagated through its scope.
+    inner_sub = next(iter(capture.steps_by_name("inner")), None)
+    test_x = capture.test_step("test_x")
+    assert inner_sub is not None
+    assert test_x is not None
+    assert inner_sub.statuses[-1] == TestStatus.ERROR
+    assert test_x.statuses[-1] == TestStatus.FAILED
+
+
 # ---------------------------------------------------------------------------
 # Skip paths
 # ---------------------------------------------------------------------------
diff --git a/python/lib/sift_client/pytest_plugin.py b/python/lib/sift_client/pytest_plugin.py
index 68304cbf7..c3b303ac8 100644
--- a/python/lib/sift_client/pytest_plugin.py
+++ b/python/lib/sift_client/pytest_plugin.py
@@ -527,6 +527,11 @@ def _resolve_initial_status(new_step: NewStep, item: pytest.Item) -> None:
     test's own status or substep failures should drive the result), this leaves
     the step alone so the default ``__exit__`` resolution stays in charge.
     """
+    current_step = new_step.current_step
+    if current_step is None:
+        # The step never opened (the autouse fixture short-circuited or was
+        # disabled). Nothing to resolve.
+        return
     setup_phase = getattr(item, "_sift_phase_setup", None)
     call_phase = getattr(item, "_sift_phase_call", None)
     xfail_marker = item.get_closest_marker("xfail")
@@ -576,6 +581,9 @@ def _resolve_initial_status(new_step: NewStep, item: pytest.Item) -> None:
             status = TestStatus.SKIPPED
         elif call_phase.report.outcome == "failed":
             excinfo = call_phase.call.excinfo
+            children_passed = new_step.report_context.open_step_results.get(
+                current_step.step_path, True
+            )
             if excinfo is None:
                 status = TestStatus.FAILED
             elif isinstance(excinfo.value, AssertionError):
@@ -592,6 +600,10 @@ def _resolve_initial_status(new_step: NewStep, item: pytest.Item) -> None:
                 # xfail(raises=X) with a non-matching exception: the contract failed.
                 status = TestStatus.FAILED
                 error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
+            elif not children_passed:
+                # A substep already recorded the error and carries the traceback;
+                # the test step only inherits the child-failed signal.
+                status = TestStatus.FAILED
             else:
                 status = TestStatus.ERROR
                 error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
@@ -599,12 +611,11 @@ def _resolve_initial_status(new_step: NewStep, item: pytest.Item) -> None:
     if status is None and not keep_managed:
         return
 
-    assert new_step.current_step is not None
     if status is not None:
         # BaseType is frozen; mutate via __dict__ the same way _apply_client_to_instance does.
-        new_step.current_step.__dict__["status"] = status
+        current_step.__dict__["status"] = status
         if error_info is not None:
-            new_step.current_step.__dict__["error_info"] = error_info
+            current_step.__dict__["error_info"] = error_info
     new_step._sift_managed_externally = True
 
 
@@ -618,10 +629,12 @@ def _finalize_after_teardown(item: pytest.Item, teardown_report: pytest.TestRepo
     step: NewStep | None = getattr(item, "_sift_step", None)
     if step is None:
         return
-    assert step.current_step is not None
-    if teardown_report.outcome == "failed" and step.current_step.status == TestStatus.PASSED:
-        step.current_step.update({"status": TestStatus.FAILED})
-        step.report_context.mark_step_failed_after_close(step.current_step)
+    current_step = step.current_step
+    if current_step is None:
+        return
+    if teardown_report.outcome == "failed" and current_step.status == TestStatus.PASSED:
+        current_step.update({"status": TestStatus.FAILED})
+        step.report_context.mark_step_failed_after_close(current_step)
 
 
 @pytest.hookimpl(tryfirst=True, hookwrapper=True)
diff --git a/python/lib/sift_client/util/test_results/context_manager.py b/python/lib/sift_client/util/test_results/context_manager.py
index 5fd090bfc..3454ef5e2 100644
--- a/python/lib/sift_client/util/test_results/context_manager.py
+++ b/python/lib/sift_client/util/test_results/context_manager.py
@@ -421,6 +421,10 @@ class NewStep(AbstractContextManager):
     client: SiftClient
     assertion_as_fail_not_error: bool = True
     current_step: TestStep | None = None
+    # Set by the pytest plugin's ``_resolve_initial_status`` to signal that
+    # status was already resolved upstream and ``__exit__`` should skip
+    # re-classifying. Read via ``getattr`` so unset is treated as False.
+    _sift_managed_externally: bool = False
 
     def __init__(
         self,
@@ -485,13 +489,20 @@ def update_step_from_result(
 
         returns: The false if step failed or errored, true otherwise.
         """
+        current_step = self.current_step
+        if current_step is None:
+            # The step was never opened; nothing to resolve. Treat as a pass
+            # so callers that branch on the return value don't see a spurious
+            # failure.
+            return True
+
         error_info = None
         aborted = False
-        assert self.current_step is not None
+        errored = False
         if exc:
             if isinstance(exc_value, AssertionError) and not self.assertion_as_fail_not_error:
                 # If we're not showing assertion errors (i.e. pytest), mark step as failed but don't set error info.
-                self.report_context.record_step_outcome(False, self.current_step)
+                self.report_context.record_step_outcome(False, current_step)
             elif isinstance(exc_value, (KeyboardInterrupt, SystemExit)):
                 # Hard exit propagating through the substep stack: record as
                 # ABORTED so every in-progress step on the way out reflects
@@ -499,30 +510,34 @@ def update_step_from_result(
                 aborted = True
                 error_info = format_truncated_traceback(exc, exc_value, tb)
             else:
+                errored = True
                 error_info = format_truncated_traceback(exc, exc_value, tb)
 
         # Status is the governor: anything other than IN_PROGRESS was set
         # deliberately (manual override, plugin pre-resolution, etc.) and must
         # not be silently overwritten by side-channel signals. When the step is
-        # still IN_PROGRESS, fill it in: hard-exit aborts first, then captured
-        # exceptions, then the substep-driven pass/fail read from
-        # open_step_results.
-        status = self.current_step.status
+        # still IN_PROGRESS, resolve from independent state: aborts first, then
+        # a child-failed signal (parents inherit FAILED, not the originating
+        # ERROR), then the step's own captured exception, then the children-pass
+        # default. error_info is diagnostic and never drives status.
+        status = current_step.status
         if status == TestStatus.IN_PROGRESS:
+            children_passed = self.report_context.open_step_results.get(
+                current_step.step_path, True
+            )
             if aborted:
                 status = TestStatus.ABORTED
-            elif error_info:
+            elif not children_passed:
+                status = TestStatus.FAILED
+            elif errored:
                 status = TestStatus.ERROR
             else:
-                children_passed = self.report_context.open_step_results.get(
-                    self.current_step.step_path, True
-                )
-                status = TestStatus.PASSED if children_passed else TestStatus.FAILED
+                status = TestStatus.PASSED
 
         # Propagate based on the resolved status; error_info rides along as
         # pure diagnostics and does not affect propagation.
-        result = self.report_context.propagate_step_result(self.current_step, status)
-        self.current_step.update(
+        result = self.report_context.propagate_step_result(current_step, status)
+        current_step.update(
             {
                 "status": status,
                 "end_time": datetime.now(timezone.utc),
@@ -538,18 +553,19 @@ def __exit__(self, exc, exc_value, tb):
             # Propagate based on that resolved status, emit one update_step
             # with the resolved values, and pop from the stack without
             # re-classifying.
-            assert self.current_step is not None
-            result = self.report_context.propagate_step_result(
-                self.current_step, self.current_step.status
-            )
-            self.current_step.update(
+            current_step = self.current_step
+            if current_step is None:
+                # The step was never opened; nothing to propagate.
+                return True
+            result = self.report_context.propagate_step_result(current_step, current_step.status)
+            current_step.update(
                 {
-                    "status": self.current_step.status,
+                    "status": current_step.status,
                     "end_time": datetime.now(timezone.utc),
-                    "error_info": self.current_step.error_info,
+                    "error_info": current_step.error_info,
                 },
             )
-            self.report_context.exit_step(self.current_step)
+            self.report_context.exit_step(current_step)
             if hasattr(self, "force_result"):
                 result = self.force_result
             return result