rayon-demo/src/sieve/mod.rs

const USAGE: &str = "
Usage: sieve bench
       sieve --help

Sieve of Eratosthenes

A sieve finds prime numbers by first assuming all candidates are prime,
then progressively using small primes to mark their multiples composite.

Note that a given prime p only needs to start marking its multiples >=p²,
as anything less will already have been marked by an even smaller prime
factor.  This also means that sieving up to N only requires primes up to
sqrt(N) - a useful fact to prepare for parallel sieving.

Here we have three forms of the sieve:

- `sieve_serial`: direct iteration in serial
- `sieve_chunks`: chunked iteration, still serial
- `sieve_parallel`: chunked iteration in parallel

The performance improvement of `sieve_chunks` is largely due to cache
locality.  Since the sieve has to sweep repeatedly over the same memory,
it's helpful that the chunk size can fit entirely in the CPU cache.  Then
`sieve_parallel` also benefits from this as long as there's cache room for
multiple chunks, for the separate jobs in each thread.

Commands:
    bench              Run the benchmark in different modes and print the timings.

Options:
    -h, --help         Show this message.
";

#[derive(serde::Deserialize)]
pub struct Args {
    cmd_bench: bool,
}

#[cfg(test)]
mod bench;

use docopt::Docopt;
use rayon::prelude::*;
use std::time::{Duration, Instant};

const CHUNK_SIZE: usize = 100_000;

// Number of Primes < 10^n
// https://oeis.org/A006880
const NUM_PRIMES: &[usize] = &[
    0,  // primes in 0..10^0
    4,  // primes in 0..10^1
    25, // etc
    168,
    1_229,
    9_592,
    78_498,
    664_579,
    5_761_455,
    50_847_534,
    455_052_511,
    4_118_054_813,
    37_607_912_018,
    346_065_536_839,
    3_204_941_750_802,
    29_844_570_422_669,
    279_238_341_033_925,
    2_623_557_157_654_233,
    24_739_954_287_740_860,
    234_057_667_276_344_607,
    2_220_819_602_560_918_840,
];

// For all of these sieves, sieve[i]==true -> 2*i+1 is prime

fn max(magnitude: usize) -> usize {
    10_usize.pow(magnitude as u32)
}

/// Sieve odd integers for primes < max.
fn sieve_serial(max: usize) -> Vec<bool> {
    let mut sieve = vec![true; max / 2];
    sieve[0] = false; // 1 is not prime
    for i in 1.. {
        if sieve[i] {
            let p = 2 * i + 1;
            let pp = p * p;
            if pp >= max {
                break;
            }
            clear_stride(&mut sieve, pp / 2, p);
        }
    }
    sieve
}

/// Sieve odd integers for primes < max using chunks.
fn sieve_chunks(max: usize) -> Vec<bool> {
    // first compute the small primes, up to sqrt(max).
    let small_max = (max as f64).sqrt().ceil() as usize;
    let mut sieve = sieve_serial(small_max);
    sieve.resize(max / 2, true);

    {
        let (low, high) = sieve.split_at_mut(small_max / 2);
        for (chunk_index, chunk) in high.chunks_mut(CHUNK_SIZE).enumerate() {
            let i = small_max / 2 + chunk_index * CHUNK_SIZE;
            let base = i * 2 + 1;
            update_chunk(low, chunk, base);
        }
    }

    sieve
}

/// Sieve odd integers for primes < max, in parallel!
fn sieve_parallel(max: usize) -> Vec<bool> {
    // first compute the small primes, up to sqrt(max).
    let small_max = (max as f64).sqrt().ceil() as usize;
    let mut sieve = sieve_serial(small_max);
    sieve.resize(max / 2, true);

    {
        let (low, high) = sieve.split_at_mut(small_max / 2);
        high.par_chunks_mut(CHUNK_SIZE)
            .enumerate() // to figure out where this chunk came from
            .with_max_len(1) // ensure every single chunk is a separate rayon job
            .for_each(|(chunk_index, chunk)| {
                let i = small_max / 2 + chunk_index * CHUNK_SIZE;
                let base = i * 2 + 1;
                update_chunk(low, chunk, base);
            });
    }

    sieve
}

/// Update a chunk with low primes
fn update_chunk(low: &[bool], chunk: &mut [bool], base: usize) {
    let max = base + chunk.len() * 2;
    for (i, &is_prime) in low.iter().enumerate() {
        if is_prime {
            let p = 2 * i + 1;
            let pp = p * p;
            if pp >= max {
                break;
            }

            let pm = if pp < base {
                // p² is too small - find the first odd multiple that's in range
                (((base + p - 1) / p) | 1) * p
            } else {
                pp
            };

            if pm < max {
                clear_stride(chunk, (pm - base) / 2, p);
            }
        }
    }
}

fn clear_stride(slice: &mut [bool], from: usize, stride: usize) {
    slice[from..]
        .iter_mut()
        .step_by(stride)
        .for_each(|x| *x = false)
}

fn measure(f: fn(usize) -> Vec<bool>) -> Duration {
    const MAGNITUDE: usize = 9;

    let start = Instant::now();
    let sieve = f(max(MAGNITUDE));
    let duration = start.elapsed();

    // sanity check the number of primes found
    let num_primes = 1 + sieve.into_iter().filter(|&b| b).count();
    assert_eq!(num_primes, NUM_PRIMES[MAGNITUDE]);

    duration
}

pub fn main(args: &[String]) {
    let args: Args = Docopt::new(USAGE)
        .and_then(|d| d.argv(args).deserialize())
        .unwrap_or_else(|e| e.exit());

    if args.cmd_bench {
        let serial = measure(sieve_serial).as_nanos();
        println!("  serial: {:10} ns", serial);

        let chunks = measure(sieve_chunks).as_nanos();
        println!(
            "  chunks: {:10} ns -> {:.2}x speedup",
            chunks,
            serial as f64 / chunks as f64
        );

        let parallel = measure(sieve_parallel).as_nanos();
        println!(
            "parallel: {:10} ns -> {:.2}x speedup",
            parallel,
            chunks as f64 / parallel as f64
        );
    }
}