diff --git a/library/core/src/ptr/mod.rs b/library/core/src/ptr/mod.rs
index bca3be56ba5f5..39117b1890ef6 100644
--- a/library/core/src/ptr/mod.rs
+++ b/library/core/src/ptr/mod.rs
@@ -1143,7 +1143,9 @@ pub unsafe fn write_volatile<T>(dst: *mut T, src: T) {
 pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
     // FIXME(#75598): Direct use of these intrinsics improves codegen significantly at opt-level <=
     // 1, where the method versions of these operations are not inlined.
-    use intrinsics::{unchecked_shl, unchecked_shr, unchecked_sub, wrapping_mul, wrapping_sub};
+    use intrinsics::{
+        unchecked_shl, unchecked_shr, unchecked_sub, wrapping_add, wrapping_mul, wrapping_sub,
+    };
 
     /// Calculate multiplicative modular inverse of `x` modulo `m`.
     ///
@@ -1198,8 +1200,17 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
     // SAFETY: `a` is a power-of-two, therefore non-zero.
     let a_minus_one = unsafe { unchecked_sub(a, 1) };
     if stride == 1 {
-        // `stride == 1` case can be computed more efficiently through `-p (mod a)`.
-        return wrapping_sub(0, p as usize) & a_minus_one;
+        // `stride == 1` case can be computed more simply through `-p (mod a)`, but doing so
+        // inhibits LLVM's ability to select instructions like `lea`. Instead we compute
+        //
+        //    round_up_to_next_alignment(p, a) - p
+        //
+        // which distributes operations around the load-bearing, but pessimizing `and` sufficiently
+        // for LLVM to be able to utilize the various optimizations it knows about.
+        return wrapping_sub(
+            wrapping_add(p as usize, a_minus_one) & wrapping_sub(0, a),
+            p as usize,
+        );
     }
 
     let pmoda = p as usize & a_minus_one;
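
The sketch below is not part of the patch; it is a standalone, hedged illustration of why the rewrite is behavior-preserving. Assuming `a` is a power of two (which `align_offset` guarantees before reaching this branch), the new `round_up_to_next_alignment(p, a) - p` form and the old `-p (mod a)` form agree on every input, including wraparound near `usize::MAX`. The function names `align_offset_old` and `align_offset_new` are hypothetical, and stable `wrapping_*` methods stand in for the `core::intrinsics` used in the real code.

```rust
/// Old formula: `-p (mod a)`, computed as `(0 - p) & (a - 1)`.
fn align_offset_old(p: usize, a: usize) -> usize {
    debug_assert!(a.is_power_of_two());
    0usize.wrapping_sub(p) & (a - 1)
}

/// New formula: `round_up_to_next_alignment(p, a) - p`.
/// For a power-of-two `a`, `0usize.wrapping_sub(a)` equals `!(a - 1)`,
/// so the `&` masks `p + (a - 1)` down to the next multiple of `a`.
fn align_offset_new(p: usize, a: usize) -> usize {
    debug_assert!(a.is_power_of_two());
    (p.wrapping_add(a - 1) & 0usize.wrapping_sub(a)).wrapping_sub(p)
}

fn main() {
    for a in [1usize, 2, 4, 8, 64, 4096] {
        for p in [0usize, 1, 2, 3, 7, 8, 1000, usize::MAX - 5, usize::MAX] {
            assert_eq!(align_offset_old(p, a), align_offset_new(p, a));
        }
    }
    println!("old and new `stride == 1` formulas agree on all sampled inputs");
}
```

The point of the patch is not the arithmetic identity itself but instruction selection: spreading the work across an add, a mask, and a subtract gives LLVM more freedom (e.g. to fold parts into `lea` on x86) than the single negate-and-mask form does.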