diff --git a/.claude/scheduled_tasks.lock b/.claude/scheduled_tasks.lock new file mode 100644 index 00000000000..0c60bc6257c --- /dev/null +++ b/.claude/scheduled_tasks.lock @@ -0,0 +1 @@ +{"sessionId":"916957a1-57ef-41d3-9ec0-e64d3dcb2c53","pid":161913,"procStart":"403554","acquiredAt":1778992865319} \ No newline at end of file diff --git a/.github/workflows/reflex_compiler_rust_wheels.yml b/.github/workflows/reflex_compiler_rust_wheels.yml new file mode 100644 index 00000000000..e03abb0269f --- /dev/null +++ b/.github/workflows/reflex_compiler_rust_wheels.yml @@ -0,0 +1,107 @@ +name: Build Rust acceleration wheels + +# Builds both reflex-compiler-rust (IR/compiler pipeline) and +# reflex-markdown-rust (pulldown-cmark) wheels. abi3-py310 means one wheel +# per platform covers Python 3.10–3.13. Matrix kept in sync with plan §9. + +on: + pull_request: + paths: + - "packages/reflex-compiler-rust/**" + - "packages/reflex-markdown-rust/**" + - ".github/workflows/reflex_compiler_rust_wheels.yml" + push: + branches: + - main + paths: + - "packages/reflex-compiler-rust/**" + - "packages/reflex-markdown-rust/**" + workflow_dispatch: + inputs: + publish: + description: "Publish wheels to PyPI" + required: false + default: "false" + type: choice + options: + - "false" + - "true" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build: + name: Build ${{ matrix.package.name }} (${{ matrix.platform.target }} / ${{ matrix.platform.manylinux || matrix.platform.os }}) + runs-on: ${{ matrix.platform.runs-on }} + strategy: + fail-fast: false + matrix: + package: + - { name: reflex-compiler-rust, dir: packages/reflex-compiler-rust, py_module: reflex_compiler_rust, smoke: "from reflex_compiler_rust import _native; print('schema', _native.SCHEMA_VERSION); print('ok')" } + - { name: reflex-markdown-rust, dir: packages/reflex-markdown-rust, py_module: reflex_markdown_rust, smoke: "from reflex_markdown_rust import markdown_to_html; 
assert '<h1>hi</h1>
' in markdown_to_html('# hi'); print('ok')" } + platform: + - { runs-on: ubuntu-22.04, target: x86_64, manylinux: manylinux2014, os: linux } + - { runs-on: ubuntu-22.04, target: x86_64, manylinux: musllinux_1_2, os: linux } + - { runs-on: ubuntu-22.04-arm, target: aarch64, manylinux: manylinux2014, os: linux } + - { runs-on: macos-14, target: aarch64, os: macos } + - { runs-on: macos-13, target: x86_64, os: macos } + - { runs-on: windows-2022, target: x64, os: windows } + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python (host interpreter) + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Build wheel + uses: PyO3/maturin-action@v1 + with: + working-directory: ${{ matrix.package.dir }} + target: ${{ matrix.platform.target }} + manylinux: ${{ matrix.platform.manylinux || 'auto' }} + args: --release --out dist --interpreter python3.12 + sccache: "true" + + - name: Smoke test wheel + if: matrix.platform.os == 'linux' && matrix.platform.target == 'x86_64' && matrix.platform.manylinux == 'manylinux2014' + working-directory: ${{ matrix.package.dir }} + run: | + python -m pip install --upgrade pip + pip install dist/*.whl + python -c "${{ matrix.package.smoke }}" + + - name: Upload wheel artifact + uses: actions/upload-artifact@v4 + with: + name: wheel-${{ matrix.package.name }}-${{ matrix.platform.os }}-${{ matrix.platform.target }}-${{ matrix.platform.manylinux || 'native' }} + path: ${{ matrix.package.dir }}/dist/*.whl + if-no-files-found: error + retention-days: 7 + + publish: + name: Publish to PyPI + runs-on: ubuntu-22.04 + needs: build + if: startsWith(github.ref, 'refs/tags/reflex-compiler-rust-v') || startsWith(github.ref, 'refs/tags/reflex-markdown-rust-v') || github.event.inputs.publish == 'true' + permissions: + id-token: write + steps: + - name: Download all wheels + uses: actions/download-artifact@v4 + with: + pattern: wheel-* + path: dist + merge-multiple: true + + - name: List wheels + run: ls -la dist/ + + - name: 
Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + packages-dir: dist + skip-existing: true diff --git a/.gitignore b/.gitignore index 508d57ca9d6..328b32c400f 100644 --- a/.gitignore +++ b/.gitignore @@ -21,4 +21,5 @@ reflex.db node_modules package-lock.json *.pyi -.pre-commit-config.yaml \ No newline at end of file +.pre-commit-config.yaml +target/ \ No newline at end of file diff --git a/PROFILING_FINDINGS.md b/PROFILING_FINDINGS.md new file mode 100644 index 00000000000..bbf9259390d --- /dev/null +++ b/PROFILING_FINDINGS.md @@ -0,0 +1,448 @@ +# Rust pipeline profiling — findings + +Session date: 2026-05-18. +Bench script: `scripts/benchmark_single_page.py`. +Profile artifacts: `/tmp/runrust*.prof`. + +This document records what we **measured** while profiling the Rust +compile pipeline on the docs app and on a synthetic single-page bench. +No assumptions; every number here is from a `perf_counter_ns` timer or +a cProfile run. + +--- + +## 1. Baseline (docs app, 7 pages compiled) + +Initial `reflex run-rust --frontend-only` on `docs/app`. 
Wall-clock, +no profiler: + +| Phase | Time | +|---|---| +| App import (user `reflex_docs` module) | 1.26 s | +| `rust_pipeline.compile_pages` (7 pages) | **2.10 s** | +| `bun install` | 0.06 s | +| **Total** | **~3.4 s** | + +cProfile breakdown of `compile_pages` (4.92 s under cProfile; +real wall-clock 2.10 s): + +| Phase | cumtime | Share | +|---|---|---| +| `walk_and_memoize` (Python recursion, builds 464 memo wrappers) | 0.73 s | 15% | +| `compile_unevaluated_page` (user page callable + theme) | 0.65 s | 13% | +| `_get_all_imports` recursive Python walks | 0.42 s | 9% | +| `compile_page_from_component` (Rust JSX emit, 7 calls) | 0.40 s | 8% | +| `_get_all_app_wrap_components` Python tree walks | 0.38 s | 8% | +| `_compile_memo_components` (legacy memo for `@rx.memo`) | 0.36 s | 7% | +| `emit_memo_modules` (377 unique memo bodies emitted) | 0.35 s | 7% | +| Rust calls total | ~0.69 s | 14% | +| Python overhead total | ~2.6 s | 53% | +| Other | ~0.6 s | 12% | + +`merge_imports` was hottest by Python self-time: 15,131 calls, +5.2 M iterations through its generator, ~0.6 s self-time. + +--- + +## 2. Round 1 — `_get_all_imports` → Rust walker + +**Change**: added `CompilerSession.collect_all_imports(component)` +backed by `reflex_pyread::collect_all_imports` — walks the Component +tree, calls each node's cached `_get_imports()` via PyO3, merges in a +Rust `HashMap`. Replaced 3 `component._get_all_imports()` call sites +in `rust_pipeline.compile_pages`. + +**Result** (docs app, 3 runs median of `rust-compiled 7 page(s)`): + +| | Before | After | Δ | +|---|---|---|---| +| compile_pages wall-clock | 2095 ms | **1738 ms** | **−357 ms (−17%)** | + +Where the savings came from: 29k fewer Python `extend` calls, deep +`merge_parsed_imports` recursion replaced with Rust HashMap merge. + +--- + +## 3. 
Round 2 — outer `merge_imports` in-place via Rust + +**Change**: added `collect_all_imports_into(target, component)` and +`merge_imports_into(target, source)` — both apply the `$/utils/...` +lib-prefix transform and merge into a caller-owned dict in place. +Replaced the `merge_imports(all_imports, ...)` wrappers in the +compile_pages page loop (eliminating 385 outer Python `merge_imports` +calls). + +**Result** (docs app, 5 runs): + +| | Round 1 final | Round 2 final | Δ | +|---|---|---|---| +| median compile_pages | 1738 ms | 1727 ms | −11 ms | +| mean compile_pages | 1733 ms | 1715 ms | −18 ms | +| min compile_pages | 1725 ms | 1679 ms | −46 ms | + +Under cProfile compile_pages dropped from 4.63 s → 3.75 s (−880 ms), +but the wall-clock delta is in the noise band (±50 ms run-to-run). + +**Important lesson from round 2**: cProfile cumulative time is *not* a +reliable predictor of wall-clock savings. The 415 outer `merge_imports` +calls had high cProfile-attributed cost but cheap actual cost (just +iterating already-built dicts). Round 1's win was real because it +replaced **5.2 M iterations** of a Python generator (deep +`_get_all_imports` recursion), which is real CPU work. Round 2 replaced +thin Python wrappers around C-level list ops — already fast. + +**Takeaway**: target deep recursive Python work, not shallow wrappers. + +--- + +## 4. Single-page bench setup + +`scripts/benchmark_single_page.py` builds **one** feature-rich page +(state vars, foreach over state, cond + Components in props, match, +event handlers, markdown — exercises every surface `compile_pages` +touches) and runs the full per-page flow with `perf_counter_ns` timers +labeled Python / Rust+PyO3 / pure Rust. Memoize is included; static +artifacts are excluded. + +A `--scale N` arg multiplies the page contents N× for scaling +experiments. 
+ +**Single page at scale=1 (47 nodes)**, after rounds 1 + 2, +10 runs aggregated (1 warmup discarded): + +``` +phase kind median (ms) +compile_unevaluated_page python 1.85 +collect_all_imports_into hybrid 3.10 +_get_all_app_wrap_components python 0.28 +walk_and_memoize python 4.69 +_get_all_custom_code python 4.64 +_get_all_hooks + _render_hooks python 0.12 +compile_page_from_component (Rust JSX emit) hybrid 0.65 +page write_text python 0.14 +memo body: collect_all_imports_into hybrid 0.22 +memo body: _harvest_pre_hooks (Python walk) python 0.08 +memo body: compile_memo_from_component (Rust) hybrid 2.67 +memo body: write_text python 0.22 +app_root composition + render python 6.34 +───────────────────────────────────────────────────────────────── +Per-run median total: 25.00 ms + +Python only 18.36 ms ( 73.4%) +Rust + PyO3 callbacks 6.64 ms ( 26.6%) +pure Rust (no callbacks) 0.00 ms ( 0.0%) +``` + +--- + +## 5. Python vs Rust head-to-head — per-page mechanical compile + +Same evaluated Component tree on both sides, memoize skipped, fresh +tree per iteration (no `_imports_cache` warming). + +### Before fusing the 4 walks (initial state of `read_page`) + +Scale sweep, 15 iterations each: + +| Scale | Tree size | Python `_compile_page` | Rust pipeline | Ratio | +|---|---|---|---|---| +| 1 | 48 nodes | 7.87 ms | 9.55 ms | **Rust 0.82× — 18% slower** | +| 2 | 91 nodes | 19.7 ms | 22.4 ms | **Rust 0.88× — 12% slower** | +| 4 | 177 nodes | 30.9 ms | 38.4 ms | **Rust 0.80× — 20% slower** | +| 8 | 349 nodes | 62.9 ms | 78.6 ms | **Rust 0.80× — 20% slower** | + +Rust pipeline lost at *every* size. + +### Detailed sub-step breakdown (scale=1, 47 nodes, 20 runs) + +``` +=== Python _compile_page — sub-steps === +_get_all_imports 2.561 ms +compile_imports (apply+sort) 0.092 ms +_get_all_dynamic_imports + sort 0.028 ms +_get_all_custom_code 4.119 ms +_get_all_hooks 0.144 ms +component.render() (recursive Python) 1.457 ms +page_template(...) 
0.045 ms +───────────────────────────────────────────────────────── +Total: 8.445 ms + +=== Rust pipeline — sub-steps === +collect_all_imports_into (Rust+PyO3) 2.420 ms +_get_all_custom_code 4.036 ms +_get_all_hooks + _render_hooks 0.142 ms +compile_page_from_component (Rust+PyO3) 3.398 ms ← the gap +───────────────────────────────────────────────────────── +Total: 9.996 ms + +Gap = +1.550 ms (+18.4%) +``` + +**The gap is `compile_page_from_component`: 3.40 ms vs Python's +`component.render() + page_template`: 1.50 ms.** Rust was 2.3× +slower on the actual JSX-emit step. + +--- + +## 6. The 4-walk bug + +Inspection of `read_page` (in `reflex_pyread::pyo3_reader`) revealed +it was walking the Python Component tree **four times**: + +```rust +pub fn read_page(...) -> Result { + let root_ir = read_component(py, root, ...)?; // walk 1: build IR + let root_alloc = arena.alloc(root_ir); + + let component_imports = collect_component_imports(...)?; // walk 2 + let state_bindings = collect_state_bindings(...)?; // walk 3 + let needs_ref = scan_needs_ref(...)?; // walk 4 + + Ok(Page { ... }) +} +``` + +Each post-walk re-traversed the Python tree via PyO3 `getattr` to +harvest one piece of metadata. **4× the PyO3 boundary cost for the +same data we'd already read.** + +### Fix: Option A — inline harvests during single walk + +Added `HarvestState` field (via `RefCell`) to `PyRefs`. Inlined the +three harvests: + +- `component_imports`: in `read_element`, register after + `resolve_tag_symbol`. Also in `read_var_data_imports` for VarData + imports. +- `state_bindings`: in `read_value` / `read_bare` / `event_handler_to_js` + — wherever we read a Var's `_js_expr`, scan it for state idents and + register. +- `needs_ref`: in `read_element` — check `id` attr per element. + +`read_page` now does **one** Python walk via `read_component`; harvests +fall out as a side-effect. 
+ +### Result + +Per-page `compile_page_from_component`: **3.40 ms → 1.90 ms (−44%).** + +Full sub-step head-to-head after the fix (scale=1, 20 runs): + +``` +=== Rust pipeline — sub-steps === +collect_all_imports_into (Rust+PyO3) 2.954 ms +_get_all_custom_code 4.643 ms +_get_all_hooks + _render_hooks 0.150 ms +compile_page_from_component (Rust+PyO3) 1.899 ms ← was 3.398 +───────────────────────────────────────────────────────── +Total: 9.646 ms + +Python total: 9.853 ms + +Gap = -0.207 ms (-2.1%) ← Rust now WINS at this size +``` + +### Scale sweep after the fix + +| Scale | Tree size | Python | Rust | Gap | +|---|---|---|---|---| +| 1 | 48 nodes | 8.50 ms | 8.35 ms | **−1.8% (Rust wins)** | +| 2 | 91 nodes | 15.81 ms | 15.78 ms | −0.2% (tie) | +| 4 | 177 nodes | 31.86 ms | 33.53 ms | **+5.2% (Python wins)** | +| 8 | 349 nodes | 63.29 ms | 66.14 ms | **+4.5% (Python wins)** | + +**Closed the gap at small N. Still lose ~5% at larger N.** + +--- + +## 7. Rust-side phase instrumentation + +Added a thread-local `PhaseTimings` cell in `reflex_pyread::timing` +with `Span` RAII guards. Spans cover only **leaf** call sites (no +recursive functions) so totals are self-time. Exposed via +`CompilerSession.last_phase_timings_ns()`. 
+ +### Measurement at scale=4 (177 nodes, 87 elements, 97 vars, 2091 props, 16 event handlers) + +``` +phase ns ms +────────────────────────────────────────────────────────────── +read_page_total_ns 7,065,847 7.066 ← total + emit_ns (pure Rust) 23,749 0.024 ← Rust JSX emit + read_var_data_ns 1,495,238 1.495 ← #1 cost + prop_value_getattr_ns 384,984 0.385 + value_literal_dispatch_ns 234,378 0.234 + var_js_expr_attr_ns 217,419 0.217 + import_alias_ns 152,890 0.153 + resolve_tag_ns 134,754 0.135 + get_props_call_ns 42,837 0.043 + class_name_ns 32,757 0.033 + event_triggers_attr_ns 31,717 0.032 + children_attr_ns 28,040 0.028 + needs_ref_ns 22,340 0.022 + isinstance_var_ns 13,210 0.013 + harvest_register_ns 6,720 0.007 + (unaccounted) 4,244,814 4.245 ← loop control + py_str + dispatch +``` + +### Per-call costs (ns per occurrence) + +| Operation | per-call cost | what it does | +|---|---|---| +| `read_var_data_ns / var` | **15,415 ns** | `var._get_all_var_data()` + decode imports/hooks/deps | +| `var_js_expr_attr_ns / var` | 2,241 ns | `getattr(var, "_js_expr")` for Var values | +| `import_alias_ns / element` | 1,757 ns | reads library/tag/alias for import harvest | +| `resolve_tag_ns / element` | 1,549 ns | reads alias/tag/library/_is_tag_in_global_scope | +| `get_props_call_ns / element` | 492 ns | `component.call_method0("get_props")` | +| `class_name_ns / element` | 377 ns | `type(component).__name__` | +| `event_triggers_attr_ns / element` | 365 ns | `getattr("event_triggers")` + dict downcast | +| `children_attr_ns / element` | 322 ns | `getattr("children")` + iter() setup | +| `needs_ref_ns / element` | 257 ns | `getattr("id")` | +| `prop_value_getattr_ns / prop` | 184 ns | `getattr(prop_name)` | +| `isinstance_var_ns / prop` | **6 ns** | `isinstance(value, var_cls)` | + +### Key findings from the instrumentation + +1. **Pure Rust emit is 24 µs for a 177-node tree.** Free. **Not the bottleneck.** +2. 
**`read_var_data` dominates at 1.5 ms (21% of `read_page`).** ~15 µs per Var. Most of that is Python-side `_get_all_var_data()` walking deps and merging. +3. **`resolve_tag` + `import_alias` = 0.29 ms** at ~3.3 µs/element. They read the same `library`/`tag`/`alias` attrs twice. Fusing would save ~0.15 ms — small. +4. **2091 prop iterations is the surprise.** ~24 declared props per element (Pydantic gives us *all* declared fields whether set or not). Most are None. Per-prop getattr is fast (184 ns) but adds up: 0.39 ms. +5. **The unaccounted 4.25 ms (60% of `read_page`)** is the aggregate of small per-iteration costs — Rust loop control, `py_str` conversions, `strip_suffix`/`to_owned` allocations, `read_value` function entry, vec pushes. ~2091 prop iterations × ~1 µs each ≈ 2 ms there alone. +6. **`isinstance_var_ns / prop = 6 ns`** and **`harvest_register_ns: 7 µs total`** — the things I worried about are nothing. + +--- + +## 8. Why Rust loses at scale — the PyO3 boundary tax + +Per-node cost comparison from the scale sweep: + +| Scale | Nodes | Python ms | Rust ms | Python µs/node | Rust µs/node | +|---|---|---|---|---|---| +| 1 | 48 | 8.5 | 8.3 | 177 µs | 173 µs | +| 4 | 177 | 32.4 | 33.5 | 183 µs | 189 µs | +| 8 | 349 | 63.3 | 66.1 | 181 µs | 189 µs | + +Both paths are O(N), but **Rust pays ~6 µs more per node** at scale. +At 48 nodes this is invisible (absorbed by per-compile fixed costs). +At 349 nodes it dominates. + +### Boundary cost per operation + +| Operation | C-level Python access | PyO3 from Rust | +|---|---|---| +| `__getattribute__` (simple slot) | ~30 ns | ~150 ns | +| `__getattribute__` (descriptor) | ~80 ns | ~200 ns | +| Method call (`call_method0`) | ~200 ns | ~1,000 ns | +| String marshal (`py_str`) | n/a (already Python str) | ~100 ns conversion | + +Python's `component.render()` and `_get_all_*` walks read the same +attrs we do, but **stay entirely inside CPython memory**. No boundary +crossing. 
Pydantic descriptors ~80 ns; PyO3 getattr ~200 ns. **2-3× +per-op overhead × ~10 ops per element = ~6-10 µs/element overhead.** + +**We are not doing less work in Rust — we are doing the same work plus +marshaling tax.** At small N the tax is invisible; at large N it +dominates. + +--- + +## 9. Where the time goes — the 24.66 ms / page budget + +After rounds 1 + 2 + walk-fusion, single page at scale=1: + +| Phase | Time | Where it runs | +|---|---|---| +| `walk_and_memoize` | 5.08 ms | **Python** — recursion + 464 `Component.create()` allocations for memo wrappers | +| `app_root composition + render` | 5.99 ms | **Python** — `_RenderUtils.render` is a recursive Python JSX renderer | +| `_get_all_custom_code` | 4.57 ms | **Python** — Markdown component builds `ComponentMap_*` closure | +| `collect_all_imports_into` | 3.02 ms | **Rust+PyO3** — already moved; PyO3 callbacks dominate | +| `memo body: compile_memo_from_component` | 2.63 ms | **Rust+PyO3** — already in Rust | +| `compile_unevaluated_page` | 1.72 ms | **Python** — user `def page()` callable. **Unmovable.** | +| `compile_page_from_component` | 0.65 ms | **Rust+PyO3** — already Rust | +| Other (hooks, app_wraps, I/O) | 1.00 ms | mix | + +**Python share: 73.7%. Hybrid (Rust + PyO3 callbacks): 26.3%. Pure Rust: 0%.** + +--- + +## 10. What "move to Rust" actually means after these measurements + +The new finding reframes the optimization roadmap: + +### Moves that *won't* win + +Anything that just replaces a Python tree walk with a Rust+PyO3 walk +doing the same per-node work. The Rust pipeline pays ~6 µs/node tax +on top of the same per-node Python method execution. Examples: + +- **`_get_all_custom_code` → Rust walk**: still has to call back into + Python per node to build the markdown closure. ~4 ms Python work + stays; we'd add ~1 ms PyO3 tax. **Net loss possible.** +- **`_get_all_app_wrap_components` → Rust walk**: already small + (0.28 ms); marshaling tax would erase most savings. 
+ +### Moves that *will* win + +Work that has **architectural fat to cut** beyond just porting: + +#### Primary: `walk_and_memoize` → Rust IR transform (~5 ms saved) + +- 5.08 ms / page = biggest single Python phase +- Most of the cost is `_wrap_with_memo()` allocating real + `Component` objects via `Component.create()` for memo wrappers +- If memo wrappers become **IR-only** (Rust `Component::Memoize` IR + variant), the Python allocation goes away +- Fuses into the existing `read_page` walk — no new PyO3 tax +- The downstream walks that currently see the wrappers either run + on IR or are tiny + +Expected savings: **~3-5 ms/page → 24.66 → ~20 ms (15-20% reduction)**. + +#### Secondary: `app_root` through Rust emit (~2-3 ms saved net) + +- 5.99 ms total; ~3-4 ms is `_RenderUtils.render(app_root.render())` +- Replace with `compile_page_from_component(app_root, ...)` which + pays ~1-2 ms of PyO3 walk + Rust emit +- Net: ~1-2 ms saved + +### Moves on the Python side that would help us (if reflex_base could be touched) + +Ranked by impact from the instrumentation: + +1. **Cache `_get_all_var_data()` results on Var instances** — `read_var_data_ns / var = 15,415 ns` is mostly Python-side dep walking. Caching turns it into a ~100 ns attribute read. **Saves ~1.4 ms / page.** +2. **`get_props()` returns only `model_fields_set`** — 2091 prop iterations → ~260. **Saves ~1.5 ms / page** (mostly from the unaccounted slice). +3. **Pre-compute `_module_spec`** combining library/tag/alias — eliminates 3 redundant getattrs/element. **Saves ~0.15 ms / page**. +4. **Component as slotted dataclass** (out of Pydantic on the hot read path) — 2-3× faster getattr globally. Large refactor. + +--- + +## 11. Bottom line + +- The Rust **core itself is essentially free** — pure JSX emit is 24 µs for a 177-node tree. +- **All Rust pipeline cost is PyO3 boundary tax + Python method execution.** +- The 4-walks bug in `read_page` was a real ~1.5 ms / page bug; fixed. 
+- At small N (≤90 nodes) Rust now wins by ~2%. +- At large N (≥170 nodes) Python still wins by ~5%, because the PyO3 tax (~6 µs/node) accumulates. +- **The only path to genuinely beat Python at scale is to reduce per-node PyO3 calls** — either via Python-side caching (Var data, slot-based Components) or by snapshotting once and walking the snapshot in pure Rust thereafter. + +The single highest-ROI Rust-side move is **moving `walk_and_memoize` to a Rust IR transform** — biggest Python phase, no new PyO3 tax (fuses into the existing walk), eliminates Python `Component.create()` allocations. + +--- + +## 12. Reproduce + +```bash +# Build the wheel +cd packages/reflex-compiler-rust +uv run maturin develop --release + +# Single-page bench at scale=1 (47 nodes), 15 iterations +uv run python scripts/benchmark_single_page.py 15 1 + +# Scale sweep — Python wins ratio rises with N +for s in 1 2 4 8; do + uv run python scripts/benchmark_single_page.py 15 $s | tail -5 +done +``` + +Per-phase Rust timings are printed at the end of every run. diff --git a/RUST_REWRITE_PLAN.md b/RUST_REWRITE_PLAN.md new file mode 100644 index 00000000000..0867342dddf --- /dev/null +++ b/RUST_REWRITE_PLAN.md @@ -0,0 +1,1002 @@ +# Reflex Compiler → Rust: One-Go Implementation Spec + +Complete plan with every decision locked. Built so an AI-driven implementation doesn't have to stop and ask. If you find yourself wanting to decide something mid-port, the answer is in this document; if it isn't, the default is "do what oxc/ty/bun does in the cited file." + +> **Status note (post-spike, 2026-05-16).** Scaffold has landed and the 2-day spike has run. This version of the plan is updated against measured numbers, not projections. The sections most affected by spike findings: §1 (IR serialization), §2 (R1 caveat), §5 (target tables), §7 (Component.to_ir API), §11 (pitfalls 1 and 12), §13 (done-criteria). Earlier review-pass issues are also folded in. + +--- + +## 0. 
Status — scaffold + spike results + +**Scaffold:** `packages/reflex-compiler-rust/` exists. Cargo workspace with all 8 crates from §3, `pyproject.toml` wired for `maturin develop --release` building one abi3-py310 wheel, `rust-toolchain.toml` pinned stable. **All 11 D-items (D0–D11) have landed** as of 2026-05-16 — see §8. The wheel exposes `CompilerSession.compile_page(bytes) -> str` and `CompilerSession.compile_app([(ident, route, bytes)], theme, global_state, plugin_manifest) -> CompiledOutput` driven by a Python IR builder at `reflex.compiler.ir` (§7). 41 Rust unit tests + 15 Python integration tests are green; a synthetic 200-page compile cold-builds in **1.9 ms** (plan target was <500 ms). The Component→IR bridge and full snapshot corpus (§6) remain follow-ups. + +``` +packages/reflex-compiler-rust/ +├── Cargo.toml workspace, 8 crates +├── pyproject.toml maturin, abi3-py310 +├── rust-toolchain.toml stable +├── python/reflex_compiler_rust/ thin wrapper over the cdylib +├── crates/ all 8 crates from §3 (7 stubs + reflex_py) +└── scripts/spike_bench.py napkin-math driver +``` + +Build + run today: `cd packages/reflex-compiler-rust && maturin develop --release && python scripts/spike_bench.py`. + +**Spike results:** full writeup at `ignore/SPIKE_RESULTS.md`. Headlines: + +1. **PyO3 crossing is free** (~100 ns/call). The plan's "boundary is cheap" assumption holds with margin. +2. **rmp-serde is 25× slower than a hand-rolled msgpack reader.** Pitfall 12 validated by measurement. Hand-rolled deserializer is mandatory from D4; do not ship rmp-serde "for now." +3. **`msgpack.Packer` streaming is 5-7% *slower* than building a list-of-lists and calling `msgpack.packb`** in one C call. The §1 "no per-node dict" line was wrong direction; §7's `Component.to_ir(packer)` API has been replaced (see §7 below). +4. **Bumpalo arena is 30% slower than no-arena for single-pass codegen** at every tree size. R1 is now conditional — see §2. +5. 
**Real Reflex `compiler.compile` is 273 µs/node Python** (2.81s ÷ 10 290 nodes on the 27-page docs app). The spike's synthetic Python codegen was 580× faster than reality; the actual Rust headroom is **50-100×** on this bucket, not the 5-10× the §5 table previously implied. +6. **Cold-build wall-clock is dominated by npm install (37%) + app import (21%) + framework imports (10%) = 68% untouchable.** Best-case cold compile wins are ~2.3× with hot npm cache, ~1.55× without. The user-visible win is **hot reload** (4-5×) and **per-page compile** (50-100×). At 200 pages the rewrite goes from "nice" to "necessary" because Python compile-work scales linearly at 104 ms/page. + +**Decision-rule outcomes:** + +- IR emit > 30% of wall-clock? **No, 0.4%.** Spike + msgpack on 10k nodes is ~10 ms; compile bucket is 2 810 ms. +- Python-remaining > 50%? Of the post-IR-emit work, npm + app import + framework imports = 68%, but that's all untouchable subprocess/one-time work — not hidden Python codegen the boundary missed. **Proceed.** + +**Proceed to D1.** The plan below is the corrected version that ports use as their source of truth. + +--- + +## 0a. Architecture commitment — *minimum Python in the Rust pipeline* + +This is the single most important constraint for everything below. **The Rust pipeline must not depend on the legacy Python compile pipeline running first.** If we run the legacy pipeline (memoize plugin, plugin walks, `_compile_memo_components`, etc.) and *then* re-emit per-page JSX in Rust, we pay both costs and get only the small re-emit speedup. Wall-clock parity with `reflex run` is the ceiling, not the floor. That defeats the rewrite. + +**Two hard rules:** + +1. **Never edit the Python *compile* pipeline OR the framework primitives.** `reflex/compiler/{compiler.py,plugins/*}`, `reflex/experimental/memo.py`, `packages/reflex-base/src/reflex_base/{plugins,components,vars,state}/*` are all off limits. 
The Rust port is **additive only** — new files under `packages/reflex-compiler-rust/`, `reflex/compiler/{ir,rust_pipeline,session,markdown,diff_pipelines}.py`, and CLI additions like `run-rust`. Patching the legacy plugin chain for "feature flags" or "speedups" risks subtle behavior changes that break production users without `run-rust` being invoked. See [feedback_no_python_compile_changes]. + + **Why framework primitives stay Python (decided 2026-05-17 by microbenchmark).** Earlier drafts of this doc contemplated porting `Var` and `Component` to Rust behind PyO3 wrappers for a 70-140× compile-bucket speedup. `scripts/benchmark_stages.py` measured the actual cost split on a 20-route synthetic app (~3 300 nodes): framework work is **39%** of the Python compile pipeline, mechanical work is **61%**. The Rust mechanical port already delivers a **2.8× speedup on the 61% bucket**; the 39% framework bucket is what users *hack on* — porting it sacrifices the most important property of the framework. Total realistic speedup keeping framework Python: ~3× pipeline-wide. That's the ceiling we're optimizing for. Numbers archived in §0b. + +2. **Minimize Python work performed by `run-rust`.** The split is: + + - **Python** does *only* what Rust fundamentally cannot do: + - Import the user's app module (`prerequisites.get_app()` — import only, no compile). + - Evaluate each page function (`page_fn()`) to materialize the raw `Component` tree. This calls user Python that defines reactive `Var`s, state classes, etc. + - Run the bridge (`reflex.compiler.ir.bridge`) to convert each raw `Component` into IR. The bridge walks Component attributes and `Var._js_expr` / `_get_all_var_data()`, which is Python data. + - **Rust** does *everything else*: memoize decisions, memoize-wrapper generation, page JSX emit, codegen, file writes, theme CSS, app-wrap shell, vite config. 
+ - **Never invoked** in `run-rust`: `prerequisites.get_compiled_app()` (the whole legacy compile — plugin chain, memoize, `_compile_memo_components`, custom-component compile, …). The scaffold under `.web/{package.json,vite.config.js,utils/context.js,utils/state.js,…}` is laid down by `reflex init` / `reflex run`; `run-rust` requires it to already exist and errors out otherwise. + +3. **`run-rust` is single-mode.** There is no in-`run-rust` fallback to the legacy pipeline — the legacy path is `reflex run`. If you want the old behavior, run that. If you want the Rust pipeline, run `reflex run-rust`. Keeping two pipelines reachable from the same command was producing dead branches (`--with-legacy`, `--snapshot-python`, `--profile-memoize`) and was removed. Concretely: + - **Scaffold missing** → hard error, telling the user to run `reflex init` or `reflex run` once. + - **Scaffold present** → `get_and_validate_app()` (~100 ms Python import, no compile) → iterate `app._unevaluated_pages` → `compile_unevaluated_page` per route (evaluate user Python, apply theme styles, wrap in Fragment+title+meta) → bridge → Rust emits JSX → Python writes `.web/app/routes/*.jsx`. Target: **< 500 ms total** for the docs app. + - **Pipeline divergence debugging** is a `reflex run` vs `reflex run-rust` comparison done by the developer, not a flag inside `run-rust`. + +4. 
**Phased Rust takeover of jobs currently done in the legacy compile** (in the order they need to land for `run-rust` to reach parity with `reflex run` on every axis): + + | order | job | currently in Python | Rust takeover | + |-------|-----|---------------------|---------------| + | 1 | Page module JSX | `compile_page_from_context` + `templates.page_template` | ✅ landed (`reflex_codegen::page::emit_page`) | + | 2 | Memoize **decisions** (`_should_memoize`) | `MemoizeStatefulPlugin` walk | port over IR, in `reflex_semantic`; saves ~230 ms on docs | + | 3 | Memoize **wrapper definitions** (`_build_wrapper` + `create_passthrough_component_memo`) | Python Component-class machinery | this is the big one (~1.2 s on docs). Needs Rust Component-graph rep that supports `{children}` hole substitution. | + | 4 | Memoize wrapper **file emit** | `_compile_single_memo_component` + `templates.memo_single_component_template` | same template machinery as page emit; small `export const = memo(({children}) => …)` shell. | + | 5 | Plugin walks (Radix camelCase, `_get_all_imports` aggregation, etc.) | various `enter_component`/`leave_component` plugin hooks | bridge harvests what's needed; Rust aggregates in `reflex_semantic` | + | 6 | Theme CSS, context shell, app-wrap, vite config | various `compile_*` functions in `compiler.py` | ✅ landed (`reflex_codegen::{theme,context,app_root,vite}`) | + + **Until job #3 lands, `run-rust` ships pages without runtime memoize wrappers.** The trade-off: faster compile, slower React re-renders. `reflex run` remains the production-perf path for users who need runtime memoization today. + +5. 
**What `reflex.compiler.ir.bridge` may legitimately depend on from `reflex_base`:** + - `Component.tag`, `Component.alias`, `Component.library`, `Component.children`, `Component.event_triggers`, `Component.id`, `Component.class_name`, `Component.key`, `Component.custom_attrs`, `Component.get_props()`, `Component._memoization_mode`, `Component._is_tag_in_global_scope` — purely attribute reads from already-constructed Components. + - `Var._js_expr`, `Var._get_all_var_data()`, `VarData.{hooks,imports,state,deps,position,components}` — same, attribute reads only. + - `format_library_name` from `reflex_base.utils.format` — a pure string helper. + - `LiteralVar.create(value)` — needed to format `EventChain` / dict / list values into JS expressions, since Reflex's Var system owns that representation. This is a constructor call, not a compile step. + + It must *not* call `compile_*`, `render_*`, or anything that runs the plugin chain. If it does, we've leaked legacy compile work into the Rust pipeline. + +--- + +**Implementation status snapshot (2026-05-16).** All 11 D-items have landed in some form; see §8 for per-item caveats. Python side has `reflex.compiler.ir.{schema,builder,canonical,pack,bridge}` + `reflex.compiler.session.CompilerSession` + `reflex.compiler.markdown`. End-to-end pipeline works: Python builds an IR tree → `msgpack.packb` → Rust parses, aggregates, emits, caches → returns rendered JS. Source maps wired through (`compile_page_with_sourcemap`). Two PyO3 wheels (compiler + markdown) with full CI matrix. **51 Rust tests + 49 Python tests + 18 corpus fixtures green.** Synthetic 200-page app cold-compiles in 1.9 ms; markdown wheel is 110× faster than mistletoe. + +**End-to-end browser verification (2026-05-16).** `examples/rust_compiler_demo/` ships a working counter app where the page JSX is compiled by the Rust pipeline and served by Vite via React Router. 
The new CLI command `uv run reflex run-rust` runs the normal `reflex compile` to scaffold `.web/`, then re-emits each `.web/app/routes/*.jsx` directly from the Rust compiler — **no Python postprocessor**. A headless Chromium loads the served page and renders "Counter demo / count: / 0 / − / + / reset" with zero page errors — the state value `0` is pulled from the React context, the unicode minus glyph round-trips correctly through the Rust JS-string encoder, and `jsx(RadixThemesBox, …)` calls reach radix-ui as expected. Per-page Rust compile time on this app: **49.8 µs vs 2027.2 µs for the legacy Python compiler (38.6× faster)**. + +**Schema v2 (2026-05-16).** `SCHEMA_VERSION` bumped to 2. Page IR gained three trailing positional fields — `component_imports: [(module, alias_spec)]`, `state_bindings: [str]`, `needs_ref: bool` — that the bridge harvests from the Component tree (walks `library`/`tag`/`alias` for imports, scans every Var's `_js_expr` for state-context references, checks `id` props). The Rust `page::emit_page` uses them to emit the React-runtime-compatible module shell directly: import block grouped by module with per-module dedup, `useContext(StateContexts.)` lines, optional `useRef`, and `export default function Component()` as the React-Router-expected default. `jsx::emit_prop_name` camelCases known snake_case props (`class_name`→`className`, `on_click`→`onClick`, etc.). + +**Visual parity confirmed (2026-05-16).** Side-by-side headless screenshots of the counter demo (one capture from `reflex run`, one from `reflex run-rust`) show **pixel-identical** Radix Themes rendering: same heading typography, same `