From ecec8c4295c2a0e3aa413bd561b5c85ce202474c Mon Sep 17 00:00:00 2001 From: Josh Stone Date: Wed, 23 Feb 2022 13:20:21 -0800 Subject: [PATCH 1/2] Use inline asm! for x86 DIV --- src/biguint/convert.rs | 14 ++++++++++---- src/biguint/division.rs | 37 ++++++++++++++++++++++++++++++++++--- 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/src/biguint/convert.rs b/src/biguint/convert.rs index 7e4dee57..384b5f44 100644 --- a/src/biguint/convert.rs +++ b/src/biguint/convert.rs @@ -4,7 +4,7 @@ use super::{biguint_from_vec, BigUint, ToBigUint}; use super::addition::add2; -use super::division::div_rem_digit; +use super::division::{div_rem_digit, FAST_DIV_WIDE}; use super::multiplication::mac_with_carry; use crate::big_digit::{self, BigDigit}; @@ -688,7 +688,13 @@ pub(super) fn to_radix_digits_le(u: &BigUint, radix: u32) -> Vec { let mut digits = u.clone(); - let (base, power) = get_half_radix_base(radix); + // X86 DIV can quickly divide by a full digit, otherwise we choose a divisor + // that's suitable for `div_half` to avoid slow `DoubleBigDigit` division. + let (base, power) = if FAST_DIV_WIDE { + get_radix_base(radix) + } else { + get_half_radix_base(radix) + }; let radix = radix as BigDigit; // For very large numbers, the O(n²) loop of repeated `div_rem_digit` dominates the @@ -696,8 +702,8 @@ pub(super) fn to_radix_digits_le(u: &BigUint, radix: u32) -> Vec { // The threshold for this was chosen by anecdotal performance measurements to // approximate where this starts to make a noticeable difference. if digits.data.len() >= 64 { - let mut big_base = BigUint::from(base * base); - let mut big_power = 2usize; + let mut big_base = BigUint::from(base); + let mut big_power = 1usize; // Choose a target base length near √n. let target_len = digits.data.len().sqrt(); diff --git a/src/biguint/division.rs b/src/biguint/division.rs index 5706d2db..8669410f 100644 --- a/src/biguint/division.rs +++ b/src/biguint/division.rs @@ -10,12 +10,15 @@ use core::ops::{Div, DivAssign, Rem, RemAssign}; use num_integer::Integer; use num_traits::{CheckedDiv, CheckedEuclid, Euclid, One, ToPrimitive, Zero}; +pub(super) const FAST_DIV_WIDE: bool = cfg!(any(target_arch = "x86", target_arch = "x86_64")); + /// Divide a two digit numerator by a one digit divisor, returns quotient and remainder: /// /// Note: the caller must ensure that both the quotient and remainder will fit into a single digit. /// This is _not_ true for an arbitrary numerator/denominator. /// /// (This function also matches what the x86 divide instruction does). +#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] #[inline] fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigit) { debug_assert!(hi < divisor); @@ -25,6 +28,34 @@ fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigi ((lhs / rhs) as BigDigit, (lhs % rhs) as BigDigit) } +/// x86 and x86_64 can use a real `div` instruction. +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[inline] +fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigit) { + // This debug assertion covers the potential #DE for divisor==0 or a quotient too large for one + // register, otherwise in release mode it will become a target-specific fault like SIGFPE. + // This should never occur with the inputs from our few `div_wide` callers. + debug_assert!(hi < divisor); + + // SAFETY: The `div` instruction only affects registers, reading the explicit operand as the + // divisor, and implicitly reading RDX:RAX or EDX:EAX as the dividend. The result is implicitly + // written back to RAX or EAX for the quotient and RDX or EDX for the remainder. No memory is + // used, and flags are not preserved. + unsafe { + let (div, rem); + + core::arch::asm!( + "div {}", + in(reg) divisor, + inout("dx") hi => rem, + inout("ax") lo => div, + options(pure, nomem, nostack), + ); + + (div, rem) + } +} + /// For small divisors, we can divide without promoting to `DoubleBigDigit` by /// using half-size pieces of digit, like long-division. #[inline] @@ -45,7 +76,7 @@ pub(super) fn div_rem_digit(mut a: BigUint, b: BigDigit) -> (BigUint, BigDigit) let mut rem = 0; - if b <= big_digit::HALF { + if !FAST_DIV_WIDE && b <= big_digit::HALF { for d in a.data.iter_mut().rev() { let (q, r) = div_half(rem, *d, b); *d = q; @@ -70,7 +101,7 @@ fn rem_digit(a: &BigUint, b: BigDigit) -> BigDigit { let mut rem = 0; - if b <= big_digit::HALF { + if !FAST_DIV_WIDE && b <= big_digit::HALF { for &digit in a.data.iter().rev() { let (_, r) = div_half(rem, digit, b); rem = r; @@ -230,7 +261,7 @@ fn div_rem_core(mut a: BigUint, b: &[BigDigit]) -> (BigUint, BigUint) { let mut a0 = 0; // [b1, b0] are the two most significant digits of the divisor. They never change. - let b0 = *b.last().unwrap(); + let b0 = b[b.len() - 1]; let b1 = b[b.len() - 2]; let q_len = a.data.len() - b.len() + 1; From b02188d99846de3f3fe1ec05b5cadffd81dfd5eb Mon Sep 17 00:00:00 2001 From: Josh Stone Date: Sat, 4 May 2024 20:08:37 -0700 Subject: [PATCH 2/2] Skip asm under miri --- src/biguint/division.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/biguint/division.rs b/src/biguint/division.rs index 8669410f..adf9db89 100644 --- a/src/biguint/division.rs +++ b/src/biguint/division.rs @@ -18,7 +18,7 @@ pub(super) const FAST_DIV_WIDE: bool = cfg!(any(target_arch = "x86", target_arch /// This is _not_ true for an arbitrary numerator/denominator. /// /// (This function also matches what the x86 divide instruction does). -#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] +#[cfg(any(miri, not(any(target_arch = "x86", target_arch = "x86_64"))))] #[inline] fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigit) { debug_assert!(hi < divisor); @@ -29,7 +29,7 @@ fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigi } /// x86 and x86_64 can use a real `div` instruction. -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[cfg(all(not(miri), any(target_arch = "x86", target_arch = "x86_64")))] #[inline] fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigit) { // This debug assertion covers the potential #DE for divisor==0 or a quotient too large for one