Make [u8]::reverse() 5x faster #41764

Merged: 3 commits, May 10, 2017
2 changes: 2 additions & 0 deletions src/libcollections/benches/lib.rs
@@ -10,7 +10,9 @@

#![deny(warnings)]

#![feature(i128_type)]
#![feature(rand)]
#![feature(repr_simd)]
#![feature(sort_unstable)]
#![feature(test)]

…
25 changes: 25 additions & 0 deletions src/libcollections/benches/slice.rs
@@ -290,3 +290,28 @@ sort!(sort_unstable, sort_unstable_large_random, gen_random, 10000);
sort!(sort_unstable, sort_unstable_large_big_random, gen_big_random, 10000);
sort!(sort_unstable, sort_unstable_large_strings, gen_strings, 10000);
sort_expensive!(sort_unstable_by, sort_unstable_large_random_expensive, gen_random, 10000);

macro_rules! reverse {
    ($name:ident, $ty:ty, $f:expr) => {
        #[bench]
        fn $name(b: &mut Bencher) {
            // odd length and offset by 1 to be as unaligned as possible
            let n = 0xFFFFF;
            let mut v: Vec<_> =
                (0..1+(n / mem::size_of::<$ty>() as u64))
                .map($f)
                .collect();
            b.iter(|| black_box(&mut v[1..]).reverse());
            b.bytes = n;
        }
    }
}

reverse!(reverse_u8, u8, |x| x as u8);
reverse!(reverse_u16, u16, |x| x as u16);
reverse!(reverse_u8x3, [u8;3], |x| [x as u8, (x>>8) as u8, (x>>16) as u8]);
reverse!(reverse_u32, u32, |x| x as u32);
reverse!(reverse_u64, u64, |x| x as u64);
reverse!(reverse_u128, u128, |x| x as u128);
#[repr(simd)] struct F64x4(f64, f64, f64, f64);
reverse!(reverse_simd_f64x4, F64x4, |x| { let x = x as f64; F64x4(x,x,x,x) });
10 changes: 10 additions & 0 deletions src/libcollections/tests/slice.rs
@@ -379,6 +379,16 @@ fn test_reverse() {
    let mut v3 = Vec::<i32>::new();
    v3.reverse();
    assert!(v3.is_empty());

    // check the 1-byte-types path
    let mut v = (-50..51i8).collect::<Vec<_>>();
    v.reverse();
    assert_eq!(v, (-50..51i8).rev().collect::<Vec<_>>());

    // check the 2-byte-types path
    let mut v = (-50..51i16).collect::<Vec<_>>();
    v.reverse();
    assert_eq!(v, (-50..51i16).rev().collect::<Vec<_>>());
}

#[test]
…
49 changes: 49 additions & 0 deletions src/libcore/slice/mod.rs
@@ -539,6 +539,55 @@ impl<T> SliceExt for [T] {
    fn reverse(&mut self) {
        let mut i: usize = 0;
        let ln = self.len();

        // For very small types, all the individual reads in the normal
        // path perform poorly. We can do better, given efficient unaligned
        // load/store, by loading a larger chunk and reversing a register.

        // Ideally LLVM would do this for us, as it knows better than we do
        // whether unaligned reads are efficient (since that changes between
        // different ARM versions, for example) and what the best chunk size
        // would be. Unfortunately, as of LLVM 4.0 (2017-05) it only unrolls
        // the loop, so we need to do this ourselves. (Hypothesis: reverse
        // is troublesome because the sides can be aligned differently --
        // will be, when the length is odd -- so there's no way of emitting
        // pre- and postludes to use fully-aligned SIMD in the middle.)

        let fast_unaligned =
            cfg!(any(target_arch = "x86", target_arch = "x86_64"));

        if fast_unaligned && mem::size_of::<T>() == 1 {
            // Use the llvm.bswap intrinsic to reverse u8s in a usize
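            // (Illustrative note, not from the original patch: on a
            // little-endian target, reading bytes [1, 2, 3, 4, 5, 6, 7, 8]
            // as a u64 yields 0x0807060504030201; `swap_bytes()` gives
            // 0x0102030405060708, which stores back as [8, 7, 6, 5, 4, 3,
            // 2, 1] -- the whole 8-byte chunk reversed in one register
            // operation. The same reasoning holds on big-endian targets.)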
            let chunk = mem::size_of::<usize>();
            while i + chunk - 1 < ln / 2 {
                unsafe {
                    let pa: *mut T = self.get_unchecked_mut(i);
                    let pb: *mut T = self.get_unchecked_mut(ln - i - chunk);
                    let va = ptr::read_unaligned(pa as *mut usize);
                    let vb = ptr::read_unaligned(pb as *mut usize);
                    ptr::write_unaligned(pa as *mut usize, vb.swap_bytes());
                    ptr::write_unaligned(pb as *mut usize, va.swap_bytes());
                }
                i += chunk;
            }
        }

        if fast_unaligned && mem::size_of::<T>() == 2 {
            // Use rotate-by-16 to reverse u16s in a u32
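            // (Illustrative note, not from the original patch: a u32 read
            // over two adjacent u16s holds both halves, and rotating by 16
            // bits swaps them, e.g. 0xAAAABBBBu32.rotate_left(16) ==
            // 0xBBBBAAAA. So each load/rotate/store reverses one pair of
            // u16 elements, on either endianness.)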
            let chunk = mem::size_of::<u32>() / 2;
            while i + chunk - 1 < ln / 2 {
                unsafe {
                    let pa: *mut T = self.get_unchecked_mut(i);
                    let pb: *mut T = self.get_unchecked_mut(ln - i - chunk);
                    let va = ptr::read_unaligned(pa as *mut u32);
                    let vb = ptr::read_unaligned(pb as *mut u32);
                    ptr::write_unaligned(pa as *mut u32, vb.rotate_left(16));
                    ptr::write_unaligned(pb as *mut u32, va.rotate_left(16));
                }
                i += chunk;
            }
        }

        while i < ln / 2 {
            // Unsafe swap to avoid the bounds check in safe swap.
            unsafe {
…