diff --git a/crates/synth-backend-riscv/src/selector.rs b/crates/synth-backend-riscv/src/selector.rs
index 7b5c19c..2c26710 100644
--- a/crates/synth-backend-riscv/src/selector.rs
+++ b/crates/synth-backend-riscv/src/selector.rs
@@ -196,6 +196,10 @@ pub fn select_with_result_types(
     // selector (and its unit tests) work off a plain bool. Flag off by default →
     // `compute_local_promotion` is never consulted → byte-identical baseline.
     let promote_locals = std::env::var_os("SYNTH_RV_LOCAL_PROMO").is_some();
+    // #472: cmp→select fusion (the VCR-SEL-004 lever, ported from ARM). Same
+    // read-once discipline; flag off by default → `pending_cmp` is never set →
+    // `lower_select` takes the baseline branch-on-boolean path, byte-identical.
+    let cmp_select_fuse = std::env::var_os("SYNTH_RV_CMP_SELECT").is_some(); // presence-only: any value, even "0", enables it
     select_inner(
         wasm_ops,
         num_params,
@@ -203,11 +207,13 @@ pub fn select_with_result_types(
         func_ret_i64,
         type_ret_i64,
         promote_locals,
+        cmp_select_fuse,
     )
 }
 
-/// Shared selection pipeline. `promote_locals` drives the #472 lever; the public
-/// entry point derives it from the env, unit tests pass it explicitly.
+/// Shared selection pipeline. `promote_locals` drives the #472 local-promotion
+/// lever and `cmp_select_fuse` the #472 cmp→select fusion; the public entry
+/// point derives both from the env, unit tests pass them explicitly.
 fn select_inner(
     wasm_ops: &[WasmOp],
     num_params: u32,
@@ -215,11 +221,13 @@ fn select_inner(
     func_ret_i64: &[bool],
     type_ret_i64: &[bool],
     promote_locals: bool,
+    cmp_select_fuse: bool,
 ) -> Result {
     let mut ctx = Selector::new_with_options(num_params, options);
     ctx.func_ret_i64 = func_ret_i64.to_vec();
     ctx.type_ret_i64 = type_ret_i64.to_vec();
     ctx.promote_locals = promote_locals;
+    ctx.cmp_select_fuse = cmp_select_fuse;
     ctx.compute_local_layout(wasm_ops, num_params); // #223 / #312
     // #472: zero-init any promoted local whose first access is a read (#457).
     // Emitted BEFORE the body so (a) `preserve_callee_saved`'s dest scan sees
@@ -829,6 +837,39 @@ fn preserve_callee_saved(out: &mut Vec, local_bytes: i32) {
     *out = new_out;
 }
 
+/// #472 (VCR-SEL-004 port): a just-lowered i32 comparison whose 0/1 boolean is
+/// still sitting on top of the vstack. If the *very next* wasm op is `select`,
+/// the boolean materialization (`slt`/`sltu`/`xor`+`sltiu`/`xori`…) is deleted
+/// and the select branches directly on the comparison via the matching B-type
+/// branch (`beq`/`bne`/`blt`/`bge`/`bltu`/`bgeu`) — RV32's branch comparators
+/// play the role ARM's IT-predicated flags do in `fuse_cmp_select`.
+///
+/// Validity is enforced two ways (belt and braces):
+/// - `lower_one` `take()`s the record at the start of EVERY op, so it only
+///   survives from a comparison to the immediately following op.
+/// - `lower_select` additionally checks the popped condition register is
+///   exactly `bool_reg` and that nothing was emitted since (`end` index),
+///   so a zero-emission op in between (e.g. a promoted `local.get`) can
+///   never smuggle a stale record through.
+///
+/// Deleting the tail is sound: `bool_reg` is a fresh temp pushed exactly once
+/// (`alloc_temp` never returns a live vstack register) and the select pops it,
+/// so nothing else can observe the boolean; `rs1`/`rs2` are unchanged since the
+/// comparison because no instruction was emitted in between.
+struct PendingCmp {
+    /// Branch condition that is TRUE exactly when the wasm comparison is 1.
+    cond: Branch,
+    /// Comparison operands, already in emitted (post-swap) order.
+    rs1: Reg,
+    rs2: Reg,
+    /// Register the deleted sequence would have left the 0/1 boolean in.
+    bool_reg: Reg,
+    /// `out.len()` BEFORE the boolean materialization — truncation point.
+    start: usize,
+    /// `out.len()` AFTER it — `lower_select` re-checks this after its three pops.
+    end: usize,
+}
+
 /// Internal control-flow frame. Every wasm `block`/`loop`/`if` pushes one.
 struct ControlFrame {
     /// What kind of frame it is — affects br semantics (loop targets the
@@ -937,6 +978,16 @@ struct Selector {
     /// public entry point from `SYNTH_RV_LOCAL_PROMO`; the env is read exactly
     /// once there so the decision function stays pure and unit-testable.
     promote_locals: bool,
+    /// #472 (VCR-SEL-004 port): whether cmp→select fusion is enabled. Set by
+    /// the public entry point from `SYNTH_RV_CMP_SELECT`; unit tests pass it
+    /// explicitly. With this false, `pending_cmp` is never populated and the
+    /// select lowering is byte-identical to the baseline.
+    cmp_select_fuse: bool,
+    /// #472: the comparison lowered by the PREVIOUS wasm op, if fusion is
+    /// enabled and its boolean is on top of the vstack. `lower_one` takes it
+    /// at the start of each op so it can only feed the immediately following
+    /// `select`. See [`PendingCmp`].
+    pending_cmp: Option<PendingCmp>,
 }
 
 impl Selector {
@@ -978,6 +1029,8 @@ impl Selector {
             promoted: std::collections::HashMap::new(),
             promoted_zero_init: std::collections::HashSet::new(),
             promote_locals: false,
+            cmp_select_fuse: false,
+            pending_cmp: None,
         }
     }
 
@@ -1256,6 +1309,9 @@ impl Selector {
         // #312: per-op pin scope — operands popped and scratch allocated while
         // lowering `op` stay pinned until the next op starts.
         self.op_pinned.clear();
+        // #472: a fusible comparison record only survives from the comparison
+        // to the IMMEDIATELY following op — any other op invalidates it.
+        let pending_cmp = self.pending_cmp.take();
         match op {
             // ─── Locals ─────────────────────────────────────────────────
             LocalGet(idx) => self.lower_local_get(*idx, op)?,
@@ -1263,7 +1319,7 @@ impl Selector {
             LocalTee(idx) => self.lower_local_tee(*idx, op)?,
 
             // ─── Select (ternary) — #223 ────────────────────────────────
-            Select => self.lower_select(op)?,
+            Select => self.lower_select(op, pending_cmp)?,
 
             // ─── Constants ──────────────────────────────────────────────
             I32Const(v) => {
@@ -1495,10 +1551,48 @@ impl Selector {
     /// Handles both i32 and i64 operands (the i64 case selects both halves under
     /// one branch). `dst` may alias `a`/`b` — safe, because the two arms are
     /// mutually exclusive.
-    fn lower_select(&mut self, op: &WasmOp) -> Result<(), SelectorError> {
+    ///
+    /// #472 (VCR-SEL-004 port): when `pending` carries the comparison lowered by
+    /// the immediately preceding op AND its boolean is exactly the popped `cond`,
+    /// the boolean materialization is deleted and the branch tests the comparison
+    /// directly (`blt a, b` instead of `slt t, a, b; bne t, zero`) — saving the
+    /// 1–2 instructions the boolean cost. Only reachable with
+    /// `SYNTH_RV_CMP_SELECT` set (the record is never created otherwise).
+    /// Note the fused branch is emitted before any `mv`, so the select's freshly
+    /// allocated `dst` aliasing a comparison operand is harmless.
+    fn lower_select(
+        &mut self,
+        op: &WasmOp,
+        pending: Option<PendingCmp>,
+    ) -> Result<(), SelectorError> {
         let cond = self.pop_i32(op)?;
         let b = self.pop_any(op)?;
         let a = self.pop_any(op)?;
+        // Fuse only if the record's boolean is the popped condition and nothing
+        // was emitted since the comparison (see PendingCmp for why both hold
+        // whenever the record survives `lower_one`'s take()).
+        let fused = pending.filter(|p| p.bool_reg == cond && p.end == self.out.len()); // runs after the pops: any code they emitted vetoes fusion
+        if let Some(p) = &fused {
+            // Drop the boolean materialization — dead once we branch on the
+            // comparison itself.
+            self.out.truncate(p.start);
+        }
+        // Branch taken (→ take_a arm) exactly when the wasm condition is
+        // true/non-zero: the fused comparison directly, or `cond != 0`.
+        let take_branch = |label: String| match &fused {
+            Some(p) => RiscVOp::Branch {
+                cond: p.cond,
+                rs1: p.rs1,
+                rs2: p.rs2,
+                label,
+            },
+            None => RiscVOp::Branch {
+                cond: Branch::Ne,
+                rs1: cond,
+                rs2: Reg::ZERO,
+                label,
+            },
+        };
         let take_a = self.fresh_label("Lsel_a");
         let end = self.fresh_label("Lsel_end");
         let mv = |dst: Reg, src: Reg| RiscVOp::Addi {
@@ -1509,12 +1603,7 @@ impl Selector {
         match (a, b) {
             (VstackVal::I32(ra), VstackVal::I32(rb)) => {
                 let dst = self.alloc_temp();
-                self.out.push(RiscVOp::Branch {
-                    cond: Branch::Ne,
-                    rs1: cond,
-                    rs2: Reg::ZERO,
-                    label: take_a.clone(),
-                });
+                self.out.push(take_branch(take_a.clone()));
                 self.out.push(mv(dst, rb)); // cond == 0 → b
                 self.out.push(RiscVOp::Jal {
                     rd: Reg::ZERO,
@@ -1528,12 +1617,7 @@ impl Selector {
             (VstackVal::I64 { lo: alo, hi: ahi }, VstackVal::I64 { lo: blo, hi: bhi }) => {
                 let dlo = self.alloc_temp();
                 let dhi = self.alloc_temp();
-                self.out.push(RiscVOp::Branch {
-                    cond: Branch::Ne,
-                    rs1: cond,
-                    rs2: Reg::ZERO,
-                    label: take_a.clone(),
-                });
+                self.out.push(take_branch(take_a.clone()));
                 self.out.push(mv(dlo, blo)); // cond == 0 → b
                 self.out.push(mv(dhi, bhi));
                 self.out.push(RiscVOp::Jal {
@@ -1917,9 +2001,36 @@ impl Selector {
 
     // ────────── Comparisons ──────────
 
+    /// #472 (VCR-SEL-004 port): record a fusible comparison for a possibly
+    /// following `select`. No-op unless `SYNTH_RV_CMP_SELECT` is set (the
+    /// flag-off path never populates `pending_cmp`, keeping it byte-identical).
+    /// `start` is `out.len()` from BEFORE the boolean materialization was
+    /// emitted; `cond(rs1, rs2)` must be TRUE exactly when the wasm comparison
+    /// yields 1.
+    fn record_pending_cmp(
+        &mut self,
+        cond: Branch,
+        rs1: Reg,
+        rs2: Reg,
+        bool_reg: Reg,
+        start: usize,
+    ) {
+        if self.cmp_select_fuse {
+            self.pending_cmp = Some(PendingCmp {
+                cond,
+                rs1,
+                rs2,
+                bool_reg,
+                start,
+                end: self.out.len(), // captured now, so callers must record only after their last push
+            });
+        }
+    }
+
     fn lower_eqz(&mut self, op: &WasmOp) -> Result<(), SelectorError> {
         let src = self.pop_i32(op)?;
         let dst = self.alloc_temp();
+        let start = self.out.len();
         // sltiu dst, src, 1 → 1 iff src == 0
         self.out.push(RiscVOp::Sltiu {
             rd: dst,
@@ -1927,12 +2038,15 @@ impl Selector {
             imm: 1,
         });
         self.push_i32(dst);
+        // eqz is true iff src == 0 → beq src, zero.
+        self.record_pending_cmp(Branch::Eq, src, Reg::ZERO, dst, start);
         Ok(())
     }
 
     fn lower_cmp_eq(&mut self, op: &WasmOp, invert: bool) -> Result<(), SelectorError> {
         let (lhs, rhs) = self.pop_pair_i32(op)?;
         let diff = self.alloc_temp();
+        let start = self.out.len(); // NOTE(review): `dst` is defined below, outside this hunk — confirm nothing emitted between here and `end` must survive the truncation
         // xor diff, lhs, rhs → 0 iff equal
         self.out.push(RiscVOp::Xor {
             rd: diff,
@@ -1956,6 +2070,9 @@ impl Selector {
             });
         }
         self.push_i32(dst);
+        // eq → beq lhs, rhs ; ne → bne lhs, rhs (both instructions fusible).
+        let cond = if invert { Branch::Ne } else { Branch::Eq };
+        self.record_pending_cmp(cond, lhs, rhs, dst, start);
         Ok(())
     }
 
@@ -1963,8 +2080,11 @@ impl Selector {
         let (a, b) = self.pop_pair_i32(op)?;
         let dst = self.alloc_temp();
         let (rs1, rs2) = if swap { (b, a) } else { (a, b) };
+        let start = self.out.len();
         self.out.push(RiscVOp::Slt { rd: dst, rs1, rs2 });
         self.push_i32(dst);
+        // lt_s (and gt_s via the operand swap) → blt rs1, rs2.
+        self.record_pending_cmp(Branch::Lt, rs1, rs2, dst, start);
         Ok(())
     }
 
@@ -1974,6 +2094,7 @@ impl Selector {
         let lt = self.alloc_temp();
         let dst = self.alloc_temp();
         let (rs1, rs2) = if swap { (b, a) } else { (a, b) };
+        let start = self.out.len();
         self.out.push(RiscVOp::Slt { rd: lt, rs1, rs2 });
         // dst = lt ^ 1 (flip 0/1)
         self.out.push(RiscVOp::Xori {
@@ -1982,6 +2103,8 @@ impl Selector {
             imm: 1,
         });
         self.push_i32(dst);
+        // ge_s (and le_s via the operand swap) → bge rs1, rs2.
+        self.record_pending_cmp(Branch::Ge, rs1, rs2, dst, start);
         Ok(())
     }
 
@@ -1989,8 +2112,11 @@ impl Selector {
         let (a, b) = self.pop_pair_i32(op)?;
         let dst = self.alloc_temp();
         let (rs1, rs2) = if swap { (b, a) } else { (a, b) };
+        let start = self.out.len();
         self.out.push(RiscVOp::Sltu { rd: dst, rs1, rs2 });
         self.push_i32(dst);
+        // lt_u (and gt_u via the operand swap) → bltu rs1, rs2.
+        self.record_pending_cmp(Branch::Ltu, rs1, rs2, dst, start);
         Ok(())
     }
 
@@ -1999,6 +2125,7 @@ impl Selector {
         let lt = self.alloc_temp();
         let dst = self.alloc_temp();
         let (rs1, rs2) = if swap { (b, a) } else { (a, b) };
+        let start = self.out.len();
         self.out.push(RiscVOp::Sltu { rd: lt, rs1, rs2 });
         self.out.push(RiscVOp::Xori {
             rd: dst,
@@ -2006,6 +2133,8 @@ impl Selector {
             imm: 1,
         });
         self.push_i32(dst);
+        // ge_u (and le_u via the operand swap) → bgeu rs1, rs2.
+        self.record_pending_cmp(Branch::Geu, rs1, rs2, dst, start);
         Ok(())
     }
 
@@ -7552,6 +7681,7 @@ mod tests {
             &[],
             &[],
             promote,
+            false,
         )
         .unwrap()
         .ops
@@ -7725,4 +7855,205 @@ mod tests {
         assert_eq!(map.get(&2), Some(&Reg::S9));
         assert_eq!(map.get(&3), Some(&Reg::S10));
     }
+
+    // ─────────── #472: RV32 cmp→select fusion (VCR-SEL-004 port) ───────────
+
+    fn s_fuse(ops: &[WasmOp], num_params: u32, fuse: bool) -> Vec<RiscVOp> {
+        select_inner(
+            ops,
+            num_params,
+            SelectorOptions::wasm_compliant(),
+            &[],
+            &[],
+            false,
+            fuse,
+        )
+        .unwrap()
+        .ops
+    }
+
+    fn count_branch(out: &[RiscVOp], want: Branch) -> usize {
+        count(
+            out,
+            |op| matches!(op, RiscVOp::Branch { cond, .. } if *cond == want),
+        )
+    }
+
+    /// `select(p0, p1, p0 < p1)` — an `i32.lt_s` directly feeding a `select`.
+    fn lt_s_select_ops() -> Vec<WasmOp> {
+        vec![
+            WasmOp::LocalGet(0),
+            WasmOp::LocalGet(1),
+            WasmOp::LocalGet(0),
+            WasmOp::LocalGet(1),
+            WasmOp::I32LtS,
+            WasmOp::Select,
+            WasmOp::End,
+        ]
+    }
+
+    /// Flag OFF must be byte-identical to the default `select()` path: the
+    /// boolean is materialized (`slt`) and the select branches on `bool != 0`.
+    #[test]
+    fn cmp_select_flag_off_is_identical_and_unfused_472() {
+        let ops = lt_s_select_ops();
+        let default = s(&ops, 2); // select() — env unset in tests
+        let off = s_fuse(&ops, 2, false);
+        assert_eq!(default, off, "flag-off must equal the default path");
+        assert_eq!(count(&off, |op| matches!(op, RiscVOp::Slt { .. })), 1);
+        assert_eq!(count_branch(&off, Branch::Ne), 1, "bne bool, zero: {off:?}");
+        assert_eq!(count_branch(&off, Branch::Lt), 0);
+    }
+
+    /// Flag ON fuses `lt_s` into the select's branch: the `slt` disappears and
+    /// the branch becomes `blt a, b` — one instruction saved.
+    #[test]
+    fn cmp_select_flag_on_fuses_lt_s_472() {
+        let ops = lt_s_select_ops();
+        let off = s_fuse(&ops, 2, false);
+        let on = s_fuse(&ops, 2, true);
+        assert_eq!(
+            count(&on, |op| matches!(op, RiscVOp::Slt { .. })),
+            0,
+            "boolean materialization must be deleted: {on:?}"
+        );
+        assert_eq!(count_branch(&on, Branch::Lt), 1, "blt expected: {on:?}");
+        assert_eq!(count_branch(&on, Branch::Ne), 0, "no bne-vs-zero: {on:?}");
+        assert_eq!(on.len(), off.len() - 1, "one instruction saved: {on:?}");
+    }
+
+    /// `eq` costs two instructions to materialize (`xor` + `sltiu`) — fusing
+    /// into `beq a, b` saves both.
+    #[test]
+    fn cmp_select_flag_on_fuses_eq_472() {
+        let ops = [
+            WasmOp::LocalGet(0),
+            WasmOp::LocalGet(1),
+            WasmOp::LocalGet(0),
+            WasmOp::LocalGet(1),
+            WasmOp::I32Eq,
+            WasmOp::Select,
+            WasmOp::End,
+        ];
+        let off = s_fuse(&ops, 2, false);
+        let on = s_fuse(&ops, 2, true);
+        assert_eq!(count(&on, |op| matches!(op, RiscVOp::Xor { .. })), 0);
+        assert_eq!(count(&on, |op| matches!(op, RiscVOp::Sltiu { .. })), 0);
+        assert_eq!(count_branch(&on, Branch::Eq), 1, "beq expected: {on:?}");
+        assert_eq!(on.len(), off.len() - 2, "two instructions saved: {on:?}");
+    }
+
+    /// `ge_u` costs `sltu` + `xori` — fusing into `bgeu a, b` saves both.
+    #[test]
+    fn cmp_select_flag_on_fuses_ge_u_472() {
+        let ops = [
+            WasmOp::LocalGet(0),
+            WasmOp::LocalGet(1),
+            WasmOp::LocalGet(0),
+            WasmOp::LocalGet(1),
+            WasmOp::I32GeU,
+            WasmOp::Select,
+            WasmOp::End,
+        ];
+        let off = s_fuse(&ops, 2, false);
+        let on = s_fuse(&ops, 2, true);
+        assert_eq!(count(&on, |op| matches!(op, RiscVOp::Sltu { .. })), 0);
+        assert_eq!(count(&on, |op| matches!(op, RiscVOp::Xori { .. })), 0);
+        assert_eq!(count_branch(&on, Branch::Geu), 1, "bgeu expected: {on:?}");
+        assert_eq!(on.len(), off.len() - 2, "two instructions saved: {on:?}");
+    }
+
+    /// `eqz` fuses into `beq src, zero` — the `sltiu` disappears.
+    #[test]
+    fn cmp_select_flag_on_fuses_eqz_472() {
+        let ops = [
+            WasmOp::LocalGet(0),
+            WasmOp::LocalGet(1),
+            WasmOp::LocalGet(0),
+            WasmOp::I32Eqz,
+            WasmOp::Select,
+            WasmOp::End,
+        ];
+        let off = s_fuse(&ops, 2, false);
+        let on = s_fuse(&ops, 2, true);
+        assert_eq!(count(&on, |op| matches!(op, RiscVOp::Sltiu { .. })), 0);
+        assert_eq!(
+            count(&on, |op| matches!(
+                op,
+                RiscVOp::Branch {
+                    cond: Branch::Eq,
+                    rs2: Reg::ZERO,
+                    ..
+                }
+            )),
+            1,
+            "beq src, zero expected: {on:?}"
+        );
+        assert_eq!(on.len(), off.len() - 1, "one instruction saved: {on:?}");
+    }
+
+    /// An i32 comparison feeding a select of i64 operands fuses too — the
+    /// condition is i32 regardless of the selected type, and both halves move
+    /// under the single fused branch.
+    #[test]
+    fn cmp_select_flag_on_fuses_i64_operand_select_472() {
+        let ops = [
+            WasmOp::I64Const(11),
+            WasmOp::I64Const(22),
+            WasmOp::LocalGet(0),
+            WasmOp::LocalGet(1),
+            WasmOp::I32LtU,
+            WasmOp::Select,
+            WasmOp::End,
+        ];
+        let off = s_fuse(&ops, 2, false);
+        let on = s_fuse(&ops, 2, true);
+        assert_eq!(count(&on, |op| matches!(op, RiscVOp::Sltu { .. })), 0);
+        assert_eq!(count_branch(&on, Branch::Ltu), 1, "bltu expected: {on:?}");
+        assert_eq!(on.len(), off.len() - 1, "one instruction saved: {on:?}");
+    }
+
+    /// Fusion requires the comparison to IMMEDIATELY precede the select — an
+    /// intervening op (here a `local.tee` of the boolean) invalidates the
+    /// record even though the same register ends up back on top of the stack.
+    #[test]
+    fn cmp_select_no_fuse_when_not_adjacent_472() {
+        let ops = [
+            WasmOp::LocalGet(0),
+            WasmOp::LocalGet(1),
+            WasmOp::LocalGet(0),
+            WasmOp::LocalGet(1),
+            WasmOp::I32LtS,
+            WasmOp::LocalTee(2),
+            WasmOp::Select,
+            WasmOp::End,
+        ];
+        let on = s_fuse(&ops, 2, true);
+        assert_eq!(
+            count(&on, |op| matches!(op, RiscVOp::Slt { .. })),
+            1,
+            "boolean must stay materialized (tee reads it): {on:?}"
+        );
+        assert_eq!(count_branch(&on, Branch::Ne), 1, "unfused select: {on:?}");
+        assert_eq!(count_branch(&on, Branch::Lt), 0, "no blt: {on:?}");
+    }
+
+    /// An i64 comparison feeding a select is NOT fused (out of scope — its
+    /// boolean needs the full multi-instruction sequence); the select stays on
+    /// the baseline branch-on-boolean path.
+    #[test]
+    fn cmp_select_i64_comparison_not_fused_472() {
+        let ops = [
+            WasmOp::LocalGet(0),
+            WasmOp::LocalGet(1),
+            WasmOp::I64Const(3),
+            WasmOp::I64Const(4),
+            WasmOp::I64LtS,
+            WasmOp::Select,
+            WasmOp::End,
+        ];
+        let off = s_fuse(&ops, 2, false);
+        let on = s_fuse(&ops, 2, true);
+        assert_eq!(on, off, "i64 comparisons are out of fusion scope");
+    }
 }
diff --git a/scripts/repro/rv32_cmp_select_472.wat b/scripts/repro/rv32_cmp_select_472.wat
new file mode 100644
index 0000000..bd10c53
--- /dev/null
+++ b/scripts/repro/rv32_cmp_select_472.wat
@@ -0,0 +1,105 @@
+;; #472 — RV32 cmp→select fusion (port of the ARM VCR-SEL-004 lever).
+;; A comparison directly feeding a `select` fuses into the select's branch:
+;; the boolean materialization (`slt`/`sltu`/`xor`+`sltiu`/`xori`) is deleted
+;; and the branch tests the comparison itself (`blt a,b` instead of
+;; `slt t,a,b; bne t,zero`). Flag-gated: SYNTH_RV_CMP_SELECT, default OFF.
+;; Both the flag-on and flag-off codegen must match wasmtime; the flag-on
+;; build must be smaller.
+;;
+;; One export per fusible i32 comparison kind, each selecting between two
+;; arithmetic arms so the result distinguishes the arms even when a == b:
+;;   sel_<cmp>(a, b) = cmp(a, b) ? a + 100 : b + 200
+;; plus:
+;;   sel_eqz     — the one-operand comparison (beq src, zero)
+;;   sel_i64     — i32 comparison selecting i64 operands (both halves under
+;;                 the single fused branch); result = lo ^ hi of the pick
+;;   sel_cmp_i64 — an i64 comparison feeding select (NOT fused — out of
+;;                 scope; must stay correct on the baseline path)
+;;   clamp       — back-to-back fused selects (record invalidation stress)
+(module
+  (func $sel_eq (export "sel_eq") (param $a i32) (param $b i32) (result i32)
+    (select (i32.add (local.get $a) (i32.const 100))
+            (i32.add (local.get $b) (i32.const 200))
+            (i32.eq (local.get $a) (local.get $b))))
+
+  (func $sel_ne (export "sel_ne") (param $a i32) (param $b i32) (result i32)
+    (select (i32.add (local.get $a) (i32.const 100))
+            (i32.add (local.get $b) (i32.const 200))
+            (i32.ne (local.get $a) (local.get $b))))
+
+  (func $sel_lt_s (export "sel_lt_s") (param $a i32) (param $b i32) (result i32)
+    (select (i32.add (local.get $a) (i32.const 100))
+            (i32.add (local.get $b) (i32.const 200))
+            (i32.lt_s (local.get $a) (local.get $b))))
+
+  (func $sel_gt_s (export "sel_gt_s") (param $a i32) (param $b i32) (result i32)
+    (select (i32.add (local.get $a) (i32.const 100))
+            (i32.add (local.get $b) (i32.const 200))
+            (i32.gt_s (local.get $a) (local.get $b))))
+
+  (func $sel_le_s (export "sel_le_s") (param $a i32) (param $b i32) (result i32)
+    (select (i32.add (local.get $a) (i32.const 100))
+            (i32.add (local.get $b) (i32.const 200))
+            (i32.le_s (local.get $a) (local.get $b))))
+
+  (func $sel_ge_s (export "sel_ge_s") (param $a i32) (param $b i32) (result i32)
+    (select (i32.add (local.get $a) (i32.const 100))
+            (i32.add (local.get $b) (i32.const 200))
+            (i32.ge_s (local.get $a) (local.get $b))))
+
+  (func $sel_lt_u (export "sel_lt_u") (param $a i32) (param $b i32) (result i32)
+    (select (i32.add (local.get $a) (i32.const 100))
+            (i32.add (local.get $b) (i32.const 200))
+            (i32.lt_u (local.get $a) (local.get $b))))
+
+  (func $sel_gt_u (export "sel_gt_u") (param $a i32) (param $b i32) (result i32)
+    (select (i32.add (local.get $a) (i32.const 100))
+            (i32.add (local.get $b) (i32.const 200))
+            (i32.gt_u (local.get $a) (local.get $b))))
+
+  (func $sel_le_u (export "sel_le_u") (param $a i32) (param $b i32) (result i32)
+    (select (i32.add (local.get $a) (i32.const 100))
+            (i32.add (local.get $b) (i32.const 200))
+            (i32.le_u (local.get $a) (local.get $b))))
+
+  (func $sel_ge_u (export "sel_ge_u") (param $a i32) (param $b i32) (result i32)
+    (select (i32.add (local.get $a) (i32.const 100))
+            (i32.add (local.get $b) (i32.const 200))
+            (i32.ge_u (local.get $a) (local.get $b))))
+
+  ;; eqz: one-operand comparison → beq src, zero.
+  (func $sel_eqz (export "sel_eqz") (param $a i32) (param $b i32) (result i32)
+    (select (i32.add (local.get $a) (i32.const 100))
+            (i32.add (local.get $b) (i32.const 200))
+            (i32.eqz (local.get $a))))
+
+  ;; i32 comparison selecting i64 operands: both halves of the pick move
+  ;; under ONE fused branch. Result folds lo ^ hi so a wrong half is loud.
+  (func $sel_i64 (export "sel_i64") (param $a i32) (param $b i32) (result i32)
+    (local $v i64)
+    (local.set $v
+      (select (i64.const 0x100000005) (i64.const 0x200000009)
+              (i32.lt_u (local.get $a) (local.get $b))))
+    (i32.xor (i32.wrap_i64 (local.get $v))
+             (i32.wrap_i64 (i64.shr_u (local.get $v) (i64.const 32)))))
+
+  ;; i64 comparison feeding select — OUT of fusion scope (its boolean needs
+  ;; the multi-instruction i64 sequence); must stay correct on the baseline
+  ;; branch-on-boolean path with the flag on.
+  (func $sel_cmp_i64 (export "sel_cmp_i64") (param $a i32) (param $b i32) (result i32)
+    (select (i32.add (local.get $a) (i32.const 100))
+            (i32.add (local.get $b) (i32.const 200))
+            (i64.lt_s (i64.extend_i32_s (local.get $a))
+                      (i64.extend_i32_s (local.get $b)))))
+
+  ;; Back-to-back fused selects (the gust-mix clamp shape from the ARM
+  ;; lever): clamp(a) = max(min(a, 100), 10) — ceiling 100 first, then floor 10.
+  (func $clamp (export "clamp") (param $a i32) (param $b i32) (result i32)
+    (local $x i32)
+    (local.set $x (local.get $a))
+    (local.set $x
+      (select (i32.const 100) (local.get $x)
+              (i32.gt_s (local.get $x) (i32.const 100))))
+    (select (i32.const 10) (local.get $x)
+            (i32.lt_s (local.get $x) (i32.const 10))))
+)
diff --git a/scripts/repro/rv32_cmp_select_472_riscv_differential.py b/scripts/repro/rv32_cmp_select_472_riscv_differential.py
new file mode 100755
index 0000000..5da721d
--- /dev/null
+++ b/scripts/repro/rv32_cmp_select_472_riscv_differential.py
@@ -0,0 +1,163 @@
+#!/usr/bin/env python3
+"""#472 — RV32 cmp→select fusion (VCR-SEL-004 port) differential oracle.
+
+Ports the ARM cmp→select lever to RV32: an i32 comparison that DIRECTLY feeds
+a `select` fuses into the select's branch under `SYNTH_RV_CMP_SELECT` — the
+boolean materialization (`slt`/`sltu`/`xor`+`sltiu`/`xori`) is deleted and the
+branch tests the comparison itself (`blt a, b` instead of
+`slt t, a, b; bne t, zero`), saving 1–2 instructions per select.
+
+This oracle compiles the module for RV32, runs each export under unicorn
+(UC_ARCH_RISCV / UC_MODE_RISCV32, ILP32) and compares with wasmtime. Run it
+against BOTH the flag-off and the flag-on object — both must match wasmtime.
+
+Coverage: every fusible i32 comparison kind (eq/ne/lt_s/gt_s/le_s/ge_s/
+lt_u/gt_u/le_u/ge_u/eqz) feeding a select, an i32 comparison selecting i64
+operands (both halves under one fused branch), an i64 comparison feeding a
+select (out of fusion scope — must stay correct on the baseline path), and
+back-to-back fused selects (record-invalidation stress). Vectors include
+equal operands and sign-boundary values (0x80000000 / 0xFFFFFFF0) so a
+signed/unsigned branch-condition mix-up is loud.
+
+Symbols come from the ELF .symtab (SHT_SYMTAB), not `synth disasm` text.
+
+Run:
+  synth compile scripts/repro/rv32_cmp_select_472.wat -b riscv \
+      --target riscv32imac --relocatable --all-exports -o /tmp/cmpsel.o
+  /tmp/synthvenv/bin/python \
+      scripts/repro/rv32_cmp_select_472_riscv_differential.py /tmp/cmpsel.o
+"""
+import sys
+
+import wasmtime
+from elftools.elf.elffile import ELFFile
+from unicorn import UC_ARCH_RISCV, UC_MODE_RISCV32, Uc, UcError
+from unicorn.riscv_const import (
+    UC_RISCV_REG_A0,
+    UC_RISCV_REG_A1,
+    UC_RISCV_REG_RA,
+    UC_RISCV_REG_S11,
+    UC_RISCV_REG_SP,
+)
+
+WAT = "scripts/repro/rv32_cmp_select_472.wat"
+ELF = sys.argv[1] if len(sys.argv) > 1 else "/tmp/cmpsel.o"
+
+CODE, LIN, RET = 0x100000, 0x40000, 0x200000
+
+FUNCS = [
+    "sel_eq",
+    "sel_ne",
+    "sel_lt_s",
+    "sel_gt_s",
+    "sel_le_s",
+    "sel_ge_s",
+    "sel_lt_u",
+    "sel_gt_u",
+    "sel_le_u",
+    "sel_ge_u",
+    "sel_eqz",
+    "sel_i64",
+    "sel_cmp_i64",
+    "clamp",
+]
+
+# (a, b) pairs: lt / gt / eq, zero (for eqz), and sign-boundary values where
+# signed and unsigned comparisons disagree (0x80000000 = INT_MIN unsigned-huge;
+# 0xFFFFFFF0 = -16 signed, huge unsigned).
+PAIRS = [
+    (5, 9),
+    (9, 5),
+    (7, 7),
+    (0, 3),
+    (3, 0),
+    (0, 0),
+    (0x80000000, 3),
+    (3, 0x80000000),
+    (0xFFFFFFF0, 3),
+    (3, 0xFFFFFFF0),
+    (0xFFFFFFF0, 0x80000000),
+    (150, 4),  # clamp: above ceiling / below floor
+    (4, 150),
+]
+
+ARG_REGS = [UC_RISCV_REG_A0, UC_RISCV_REG_A1]
+
+
+def to_i32(v):
+    """wasmtime wants i32 params in signed range."""
+    v &= 0xFFFFFFFF
+    return v - 0x100000000 if v >= 0x80000000 else v
+
+
+def symbols(path):
+    """Return ({function name: st_value}, .text bytes) read from the ELF at `path`."""
+    # Close the object file deterministically; everything needed is copied
+    # out (names, values, section bytes) before the handle goes away.
+    with open(path, "rb") as fh:
+        elf = ELFFile(fh)
+        st = elf.get_section_by_name(".symtab")
+        text = elf.get_section_by_name(".text")
+        if st is None or text is None:
+            sys.exit(f"{path}: missing .symtab or .text (stripped object?)")
+        syms = {
+            s.name: s["st_value"]
+            for s in st.iter_symbols()
+            if s.name and s["st_info"]["type"] == "STT_FUNC"
+        }
+        return syms, text.data()
+
+
+def main():
+    """Run every export over every vector under unicorn and compare with wasmtime."""
+    engine = wasmtime.Engine()
+    module = wasmtime.Module.from_file(engine, WAT)
+
+    def wt(name, args):
+        store = wasmtime.Store(engine)
+        inst = wasmtime.Instance(store, module, [])
+        return inst.exports(store)[name](store, *(to_i32(a) for a in args)) & 0xFFFFFFFF
+
+    syms, code = symbols(ELF)
+
+    def run(name, args):
+        addr = syms.get(name)
+        if addr is None:
+            return None, f"symbol {name} missing (function skipped?)"
+        mu = Uc(UC_ARCH_RISCV, UC_MODE_RISCV32)
+        for base, size in [(CODE, 0x20000), (LIN, 0x20000), (RET, 0x1000)]:
+            mu.mem_map(base, size)
+        mu.mem_write(CODE, code)
+        mu.reg_write(UC_RISCV_REG_SP, 0x110000)
+        mu.reg_write(UC_RISCV_REG_S11, LIN)
+        for i, v in enumerate(args):
+            mu.reg_write(ARG_REGS[i], v & 0xFFFFFFFF)
+        mu.reg_write(UC_RISCV_REG_RA, RET)
+        try:
+            mu.emu_start(CODE + addr, RET, count=4000)
+        except UcError as e:
+            return None, str(e)
+        return mu.reg_read(UC_RISCV_REG_A0) & 0xFFFFFFFF, ""
+
+    fails = 0
+    for name in FUNCS:
+        for a, b in PAIRS:
+            gt = wt(name, (a, b))
+            res, err = run(name, (a, b))
+            ok = res == gt
+            fails += 0 if ok else 1
+            if not ok:
+                shown = f"0x{res:08x}" if res is not None else f"ERR({err})"
+                print(f"{name}(0x{a:x},0x{b:x}) = {shown} wasmtime=0x{gt:08x} FAIL")
+    print(f"{len(FUNCS) * len(PAIRS)} vectors, {fails} failures")
+
+    print(
+        "RISC-V cmp->select #472 ORACLE: PASS"
+        if not fails
+        else f"RISC-V cmp->select #472 ORACLE: FAIL ({fails})"
+    )
+    sys.exit(1 if fails else 0)
+
+
+if __name__ == "__main__":
+    main()