From 4bfacffb903d382333a39c1646ee2b3956bd59bf Mon Sep 17 00:00:00 2001
From: Simonas Kazlauskas
Date: Thu, 20 Aug 2020 03:38:00 +0300
Subject: [PATCH] Optimise align_offset for stride=1 further
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The `stride == 1` case can be computed more efficiently through
`-p (mod a)`. That then translates to a nice and short sequence of LLVM
instructions:

    %address = ptrtoint i8* %p to i64
    %negptr = sub i64 0, %address
    %offset = and i64 %negptr, %a_minus_one

This produces pretty much ideal code-gen when the function is used in
isolation. Typical use of this function will, however, involve using
the result to offset a pointer, i.e.

    %aligned = getelementptr inbounds i8, i8* %p, i64 %offset

This still looks very good, but LLVM does not really translate it to
what would be considered ideal machine code (on any target). For
example, this is the codegen we obtain for an unknown alignment:

    ; x86_64
    dec     rsi
    mov     rax, rdi
    neg     rax
    and     rax, rsi
    add     rax, rdi

In particular, negating a pointer is not something that CISC
architectures like x86_64 are designed to optimise for; they are much
better at offsetting pointers. So we would love to utilize this ability
and produce code that looks more like this:

    ; x86_64
    lea     rax, [rsi + rdi - 1]
    neg     rsi
    and     rax, rsi

To achieve this we need to give LLVM an opportunity to apply the
various peephole optimisations it performs during DAG selection. In
particular, the `and` instruction appears to be a major inhibitor here.
We cannot, sadly, get rid of this load-bearing operation, but we can
reorder operations such that LLVM has more to work with around this
instruction.

One such ordering is proposed in #75579 and results in LLVM IR that
looks broadly like this:

    ; using add enables `lea` and similar CISCisms
    %offset_ptr = add i64 %address, %a_minus_one
    %mask = sub i64 0, %a
    %masked = and i64 %offset_ptr, %mask
    ; can be folded with `gepi` that may follow
    %offset = sub i64 %masked, %address

…and generates the intended x86_64 machine code. One might also wonder
how the increased amount of code would impact a RISC target. Not much,
it turns out:

    ; aarch64 previous        ; aarch64 new
    sub     x8, x1, #1        add     x8, x1, x0
    neg     x9, x0            sub     x8, x8, #1
    and     x8, x9, x8        neg     x9, x1
    add     x0, x0, x8        and     x0, x8, x9

(and similarly for ppc, sparc, mips, riscv, etc.)

The only target that seems to do worse is… wasm32.

On to the actual measurements: the best way to evaluate snippets like
these is to use llvm-mca. Much as the AArch64 assembly would suggest,
there isn't any performance difference to be found there; both snippets
execute in the same number of cycles for the CPUs I tried. On x86_64,
however, we get a throughput improvement of >50%!
---
 library/core/src/ptr/mod.rs | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/library/core/src/ptr/mod.rs b/library/core/src/ptr/mod.rs
index 68b5d1df71cb2..6f0e1e6a7b322 100644
--- a/library/core/src/ptr/mod.rs
+++ b/library/core/src/ptr/mod.rs
@@ -1168,7 +1168,9 @@ pub unsafe fn write_volatile<T>(dst: *mut T, src: T) {
 pub(crate) unsafe fn align_offset<T>(p: *const T, a: usize) -> usize {
     // FIXME(#75598): Direct use of these intrinsics improves codegen significantly at opt-level <=
     // 1, where the method versions of these operations are not inlined.
-    use intrinsics::{unchecked_shl, unchecked_shr, unchecked_sub, wrapping_mul, wrapping_sub};
+    use intrinsics::{
+        unchecked_shl, unchecked_shr, unchecked_sub, wrapping_add, wrapping_mul, wrapping_sub,
+    };
 
     /// Calculate multiplicative modular inverse of `x` modulo `m`.
     ///
@@ -1223,8 +1225,17 @@ pub(crate) unsafe fn align_offset<T>(p: *const T, a: usize) -> usize {
     // SAFETY: `a` is a power-of-two, therefore non-zero.
     let a_minus_one = unsafe { unchecked_sub(a, 1) };
     if stride == 1 {
-        // `stride == 1` case can be computed more efficiently through `-p (mod a)`.
-        return wrapping_sub(0, p as usize) & a_minus_one;
+        // `stride == 1` case can be computed more simply through `-p (mod a)`, but doing so
+        // inhibits LLVM's ability to select instructions like `lea`. Instead we compute
+        //
+        //    round_up_to_next_alignment(p, a) - p
+        //
+        // which distributes operations around the load-bearing, but pessimizing `and` sufficiently
+        // for LLVM to be able to utilize the various optimizations it knows about.
+        return wrapping_sub(
+            wrapping_add(p as usize, a_minus_one) & wrapping_sub(0, a),
+            p as usize,
+        );
     }
 
     let pmoda = p as usize & a_minus_one;
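
For reference, and not as part of the patch itself, the following is a
minimal standalone Rust sketch checking that the two `stride == 1`
formulations discussed above (the original `-p (mod a)` and the
reordered `round_up_to_next_alignment(p, a) - p`) agree for
power-of-two alignments. The helper names `offset_via_neg` and
`offset_via_round_up` are made up for illustration and do not appear in
the patch:

    // Standalone illustration only; `align` plays the role of `a` and is
    // assumed to be a power of two, as `align_offset` requires.

    /// Original formulation: `-p (mod a)`.
    fn offset_via_neg(addr: usize, align: usize) -> usize {
        addr.wrapping_neg() & (align - 1)
    }

    /// Reordered formulation: `round_up_to_next_alignment(p, a) - p`.
    fn offset_via_round_up(addr: usize, align: usize) -> usize {
        (addr.wrapping_add(align - 1) & align.wrapping_neg()).wrapping_sub(addr)
    }

    fn main() {
        for align in [1usize, 2, 4, 8, 4096] {
            for addr in 0usize..512 {
                assert_eq!(offset_via_neg(addr, align), offset_via_round_up(addr, align));
            }
        }
        println!("both formulations agree for power-of-two alignments");
    }

Both return the same value; the patch only changes the shape of the
computation so that LLVM's instruction selection can fold it with a
subsequent pointer offset.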