-
Notifications
You must be signed in to change notification settings - Fork 496
/
mod.rs
207 lines (173 loc) · 5.63 KB
/
mod.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
const USAGE: &str = "
Usage: sieve bench
sieve --help
Sieve of Eratosthenes
A sieve finds prime numbers by first assuming all candidates are prime,
then progressively using small primes to mark their multiples composite.
Note that a given prime p only needs to start marking its multiples >=p²,
as anything less will already have been marked by an even smaller prime
factor. This also means that sieving up to N only requires primes up to
sqrt(N) - a useful fact to prepare for parallel sieving.
Here we have three forms of the sieve:
- `sieve_serial`: direct iteration in serial
- `sieve_chunks`: chunked iteration, still serial
- `sieve_parallel`: chunked iteration in parallel
The performance improvement of `sieve_chunks` is largely due to cache
locality. Since the sieve has to sweep repeatedly over the same memory,
it's helpful that the chunk size can fit entirely in the CPU cache. Then
`sieve_parallel` also benefits from this as long as there's cache room for
multiple chunks, for the separate jobs in each thread.
Commands:
bench Run the benchmark in different modes and print the timings.
Options:
-h, --help Show this message.
";
#[derive(serde::Deserialize)]
pub struct Args {
cmd_bench: bool,
}
#[cfg(test)]
mod bench;
use docopt::Docopt;
use rayon::prelude::*;
use std::time::{Duration, Instant};
const CHUNK_SIZE: usize = 100_000;
// Number of Primes < 10^n
// https://oeis.org/A006880
const NUM_PRIMES: &[usize] = &[
0, // primes in 0..10^0
4, // primes in 0..10^1
25, // etc
168,
1_229,
9_592,
78_498,
664_579,
5_761_455,
50_847_534,
455_052_511,
4_118_054_813,
37_607_912_018,
346_065_536_839,
3_204_941_750_802,
29_844_570_422_669,
279_238_341_033_925,
2_623_557_157_654_233,
24_739_954_287_740_860,
234_057_667_276_344_607,
2_220_819_602_560_918_840,
];
// For all of these sieves, sieve[i]==true -> 2*i+1 is prime
fn max(magnitude: usize) -> usize {
10_usize.pow(magnitude as u32)
}
/// Sieve odd integers for primes < max.
fn sieve_serial(max: usize) -> Vec<bool> {
let mut sieve = vec![true; max / 2];
sieve[0] = false; // 1 is not prime
for i in 1.. {
if sieve[i] {
let p = 2 * i + 1;
let pp = p * p;
if pp >= max {
break;
}
clear_stride(&mut sieve, pp / 2, p);
}
}
sieve
}
/// Sieve odd integers for primes < max using chunks.
fn sieve_chunks(max: usize) -> Vec<bool> {
// first compute the small primes, up to sqrt(max).
let small_max = (max as f64).sqrt().ceil() as usize;
let mut sieve = sieve_serial(small_max);
sieve.resize(max / 2, true);
{
let (low, high) = sieve.split_at_mut(small_max / 2);
for (chunk_index, chunk) in high.chunks_mut(CHUNK_SIZE).enumerate() {
let i = small_max / 2 + chunk_index * CHUNK_SIZE;
let base = i * 2 + 1;
update_chunk(low, chunk, base);
}
}
sieve
}
/// Sieve odd integers for primes < max, in parallel!
fn sieve_parallel(max: usize) -> Vec<bool> {
// first compute the small primes, up to sqrt(max).
let small_max = (max as f64).sqrt().ceil() as usize;
let mut sieve = sieve_serial(small_max);
sieve.resize(max / 2, true);
{
let (low, high) = sieve.split_at_mut(small_max / 2);
high.par_chunks_mut(CHUNK_SIZE)
.enumerate() // to figure out where this chunk came from
.with_max_len(1) // ensure every single chunk is a separate rayon job
.for_each(|(chunk_index, chunk)| {
let i = small_max / 2 + chunk_index * CHUNK_SIZE;
let base = i * 2 + 1;
update_chunk(low, chunk, base);
});
}
sieve
}
/// Update a chunk with low primes
fn update_chunk(low: &[bool], chunk: &mut [bool], base: usize) {
let max = base + chunk.len() * 2;
for (i, &is_prime) in low.iter().enumerate() {
if is_prime {
let p = 2 * i + 1;
let pp = p * p;
if pp >= max {
break;
}
let pm = if pp < base {
// p² is too small - find the first odd multiple that's in range
(((base + p - 1) / p) | 1) * p
} else {
pp
};
if pm < max {
clear_stride(chunk, (pm - base) / 2, p);
}
}
}
}
fn clear_stride(slice: &mut [bool], from: usize, stride: usize) {
slice[from..]
.iter_mut()
.step_by(stride)
.for_each(|x| *x = false)
}
fn measure(f: fn(usize) -> Vec<bool>) -> Duration {
const MAGNITUDE: usize = 9;
let start = Instant::now();
let sieve = f(max(MAGNITUDE));
let duration = start.elapsed();
// sanity check the number of primes found
let num_primes = 1 + sieve.into_iter().filter(|&b| b).count();
assert_eq!(num_primes, NUM_PRIMES[MAGNITUDE]);
duration
}
pub fn main(args: &[String]) {
let args: Args = Docopt::new(USAGE)
.and_then(|d| d.argv(args).deserialize())
.unwrap_or_else(|e| e.exit());
if args.cmd_bench {
let serial = measure(sieve_serial).as_nanos();
println!(" serial: {:10} ns", serial);
let chunks = measure(sieve_chunks).as_nanos();
println!(
" chunks: {:10} ns -> {:.2}x speedup",
chunks,
serial as f64 / chunks as f64
);
let parallel = measure(sieve_parallel).as_nanos();
println!(
"parallel: {:10} ns -> {:.2}x speedup",
parallel,
chunks as f64 / parallel as f64
);
}
}