I'm writing this up here so that we have a place to discuss it. Consider this program:
#![feature(portable_simd, core_intrinsics, repr_simd)]
use std::simd::Simd;
/// Index vector handed to the `simd_shuffle` intrinsic by the `simd_shuffle!` macro.
///
/// A `#[repr(simd)]` newtype over `[u32; LEN]`, where `LEN` is the number of
/// shuffle indices supplied.
#[allow(unused)]
#[repr(simd)]
pub(crate) struct SimdShuffleIdx<const LEN: usize>(pub(crate) [u32; LEN]);
/// Convenience wrapper around `std::intrinsics::simd::simd_shuffle`.
///
/// Takes the two input vectors `$x` and `$y` plus an index array `$idx`
/// (a trailing comma is accepted) and forwards them to the intrinsic.
/// The expansion calls the intrinsic directly, so every call site in this file
/// wraps the macro invocation in an `unsafe` block.
#[allow(unused)]
macro_rules! simd_shuffle {
($x:expr, $y:expr, $idx:expr $(,)?) => {{
// The index array is wrapped in `SimdShuffleIdx` inside a `const` block,
// so the indices are evaluated at compile time.
std::intrinsics::simd::simd_shuffle($x, $y, const { SimdShuffleIdx($idx) })
}};
}
/// Shuffles `elements` with the constant indices `[0, 2, 4, 6]`.
///
/// Intended to return the lanes at positions 0, 2, 4 and 6 of `elements`;
/// for the `input` used in `main` that is `[0, 1, 2, 3]`.
/// Exported unmangled with the C ABI.
#[unsafe(no_mangle)]
extern "C" fn foo(elements: [i16; 8]) -> Simd<i16, 4> {
// NOTE(review): the intrinsic has no documented runtime precondition visible
// here; the indices are compile-time constants — confirm against the
// intrinsic's docs.
unsafe {
simd_shuffle!(
Simd::from_array(elements),
// Second shuffle operand: an all-zero vector. All indices below are < 8, so
// presumably none of its lanes are selected — TODO confirm.
Simd::from_array([0i16; 8]),
[0u32, 2, 4, 6]
)
}
}
/// Shuffles `elements` with the constant indices `[1, 3, 5, 7]`.
///
/// Intended to return the lanes at positions 1, 3, 5 and 7 of `elements`;
/// for the `input` used in `main` that is `[4, 5, 6, 7]`.
/// Exported unmangled with the C ABI.
#[unsafe(no_mangle)]
extern "C" fn bar(elements: [i16; 8]) -> Simd<i16, 4> {
// NOTE(review): the intrinsic has no documented runtime precondition visible
// here; the indices are compile-time constants — confirm against the
// intrinsic's docs.
unsafe {
simd_shuffle!(
Simd::from_array(elements),
// Second shuffle operand: an all-zero vector. All indices below are < 8, so
// presumably none of its lanes are selected — TODO confirm.
Simd::from_array([0i16; 8]),
[1u32, 3, 5, 7]
)
}
}
/// Runs both shuffles on the same input and prints the results with `dbg!`.
fn main() {
// Even positions hold 0, 1, 2, 3 and odd positions hold 4, 5, 6, 7, so the
// two shuffles are expected to print `[0, 1, 2, 3]` and `[4, 5, 6, 7]`.
let input = [0i16, 4, 1, 5, 2, 6, 3, 7];
// NOTE(review): `foo` and `bar` are declared as safe `extern "C"` functions,
// so this `unsafe` block looks unnecessary; it is kept so the reproducer
// stays exactly as reported.
unsafe {
dbg!(foo(input));
dbg!(bar(input));
}
}
The expected output here is
> cargo +nightly run --target aarch64-unknown-linux-gnu
warning: `playground` (bin "playground") generated 6 warnings
Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.18s
Running `qemu-aarch64 -L /usr/aarch64-linux-gnu target/aarch64-unknown-linux-gnu/debug/playground`
[src/main.rs:42:9] foo(input) = [ 0, 1, 2, 3, ]
[src/main.rs:43:9] bar(input) = [ 4, 5, 6, 7, ]
but with aarch64_be-unknown-linux-gnu I get
> cargo +nightly run --target aarch64_be-unknown-linux-gnu -Zbuild-std=std,panic_abort
warning: `playground` (bin "playground") generated 6 warnings
Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.42s
Running `qemu-aarch64_be -cpu max -L /home/folkertdev/Downloads/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64_be-none-linux-gnu/aarch64_be-none-linux-gnu/libc target/aarch64_be-unknown-linux-gnu/debug/playground`
[src/main.rs:42:9] foo(input) = [ 4, 5, 6, 7, ]
[src/main.rs:43:9] bar(input) = [ 4, 5, 6, 7, ]
Miri does get it right
> cargo +nightly miri run --target aarch64_be-unknown-linux-gnu
[src/main.rs:42:9] foo(input) = [ 0, 1, 2, 3, ]
[src/main.rs:43:9] bar(input) = [ 4, 5, 6, 7, ]
It might be an LLVM issue, though the assembly for foo and bar is different.
foo:
fmov d0, x1
fmov d1, x0
rev64 v0.4h, v0.4h
rev64 v1.4h, v1.4h
uzp1 v0.4h, v1.4h, v0.4h
rev64 v0.4h, v0.4h
ret
bar:
fmov d0, x1
fmov d1, x0
rev64 v0.4h, v0.4h
rev64 v1.4h, v1.4h
uzp2 v0.4h, v1.4h, v0.4h
rev64 v0.4h, v0.4h
ret
So perhaps the actual problem is with qemu-aarch64_be? It's still present with a fresh build
> /home/folkertdev/c/qemu/build/qemu-aarch64_be --version
qemu-aarch64_be version 10.2.94 (v11.0.0-rc4)
Copyright (c) 2003-2026 Fabrice Bellard and the QEMU Project developers
Or maybe all of this is expected on aarch64_be-unknown-linux-gnu? But then Miri would be wrong.
cc @adamgemmell @davidtwco
I'm writing this up here so that we have a place to discuss it. Consider this program:
The expected output here is
but with `aarch64_be-unknown-linux-gnu` I get
Miri does get it right
It might be an LLVM issue, though the assembly for `foo` and `bar` is different.
So perhaps the actual problem is with `qemu-aarch64_be`? It's still present with a fresh build
Or maybe all of this is expected on `aarch64_be-unknown-linux-gnu`? But then Miri would be wrong.
cc @adamgemmell @davidtwco