Add/Rework benchmarks to track initialization cost (#272)

This PR adds more benchmarks so we can get an accurate idea about two things:
  - What is the cost of having to zero the buffer before calling `getrandom`?
  - What is the performance on aligned, 32-byte buffers?
    - This is by far the most common use, as it's used to seed userspace CSPRNGs.

I ran the benchmarks on my system:
  - CPU: AMD Ryzen 7 5700G
  - OS: Linux 5.15.52-1-lts
  - Rust Version: 1.62.0-nightly (ea92b0838 2022-05-07)

I got the following results:
```
test bench_large      ... bench:   3,759,323 ns/iter (+/- 177,100) = 557 MB/s
test bench_large_init ... bench:   3,821,229 ns/iter (+/- 39,132) = 548 MB/s
test bench_page       ... bench:       7,281 ns/iter (+/- 59) = 562 MB/s
test bench_page_init  ... bench:       7,290 ns/iter (+/- 69) = 561 MB/s
test bench_seed       ... bench:         206 ns/iter (+/- 3) = 155 MB/s
test bench_seed_init  ... bench:         206 ns/iter (+/- 1) = 155 MB/s
```

These results were very consistent across multiple runs, and roughly behave as we would expect:
  - The throughput is highest with a buffer large enough to amortize the syscall overhead, but small enough to stay in the L1D cache.
  - There is a _very_ small cost to zeroing the buffer beforehand.
    - This cost is imperceptible in the common 32-byte use case, where the syscall overhead dominates.
    - The cost is slightly higher (1%) with multi-megabyte buffers as the data gets evicted from the L1 cache between the `memset` and the call to `getrandom`.

I would love to see results for other platforms. Could we get someone to run this on an M1 Mac?

Signed-off-by: Joe Richey <joerichey@google.com>
rust-random · Jul 13, 2022 · 7089766 · 7089766
1 parent 3d818a6
commit 7089766
Showing 1 changed file with 80 additions and 8 deletions.
diff --git a/benches/mod.rs b/benches/mod.rs
@@ -1,22 +1,94 @@
 #![feature(test)]
 extern crate test;
 
-#[bench]
-fn bench_64(b: &mut test::Bencher) {
-    let mut buf = [0u8; 64];
+use std::{
+    alloc::{alloc_zeroed, dealloc, Layout},
+    ptr::NonNull,
+};
+
+// AlignedBuffer is like a Box<[u8; N]> except that it is always N-byte aligned
+struct AlignedBuffer<const N: usize>(NonNull<[u8; N]>);
+
+impl<const N: usize> AlignedBuffer<N> {
+    fn layout() -> Layout {
+        Layout::from_size_align(N, N).unwrap()
+    }
+
+    fn new() -> Self {
+        let p = unsafe { alloc_zeroed(Self::layout()) } as *mut [u8; N];
+        Self(NonNull::new(p).unwrap())
+    }
+
+    fn buf(&mut self) -> &mut [u8; N] {
+        unsafe { self.0.as_mut() }
+    }
+}
+
+impl<const N: usize> Drop for AlignedBuffer<N> {
+    fn drop(&mut self) {
+        unsafe { dealloc(self.0.as_ptr() as *mut u8, Self::layout()) }
+    }
+}
+
+// Used to benchmark the throughput of getrandom in an optimal scenario.
+// The buffer is hot, and does not require initialization.
+#[inline(always)]
+fn bench<const N: usize>(b: &mut test::Bencher) {
+    let mut ab = AlignedBuffer::<N>::new();
+    let buf = ab.buf();
     b.iter(|| {
         getrandom::getrandom(&mut buf[..]).unwrap();
         test::black_box(&buf);
     });
-    b.bytes = buf.len() as u64;
+    b.bytes = N as u64;
 }
 
-#[bench]
-fn bench_65536(b: &mut test::Bencher) {
-    let mut buf = [0u8; 65536];
+// Used to benchmark the throughput of getrandom in a slightly less optimal
+// scenario. The buffer is still hot, but requires initialization.
+#[inline(always)]
+fn bench_with_init<const N: usize>(b: &mut test::Bencher) {
+    let mut ab = AlignedBuffer::<N>::new();
+    let buf = ab.buf();
     b.iter(|| {
+        for byte in buf.iter_mut() {
+            *byte = 0;
+        }
         getrandom::getrandom(&mut buf[..]).unwrap();
         test::black_box(&buf);
     });
-    b.bytes = buf.len() as u64;
+    b.bytes = N as u64;
+}
+
+// 32 bytes (256-bit) is the seed size used for rand::thread_rng
+const SEED: usize = 32;
+// Common size of a page, 4 KiB
+const PAGE: usize = 4096;
+// Large buffer to get asymptotic performance, 2 MiB
+const LARGE: usize = 1 << 21;
+
+#[bench]
+fn bench_seed(b: &mut test::Bencher) {
+    bench::<SEED>(b);
+}
+#[bench]
+fn bench_seed_init(b: &mut test::Bencher) {
+    bench_with_init::<SEED>(b);
+}
+
+#[bench]
+fn bench_page(b: &mut test::Bencher) {
+    bench::<PAGE>(b);
+}
+#[bench]
+fn bench_page_init(b: &mut test::Bencher) {
+    bench_with_init::<PAGE>(b);
+}
+
+#[bench]
+fn bench_large(b: &mut test::Bencher) {
+    bench::<LARGE>(b);
+}
+#[bench]
+fn bench_large_init(b: &mut test::Bencher) {
+    bench_with_init::<LARGE>(b);
 }