In [1]:
import numpy as np

from typing import Dict, List

from core import integer_sqrt

# Prime Number Sieves

A prime number sieve is an algorithm that generates all primes up to a given bound $n$.

## The Sieve of Eratosthenes

The first method we discuss is the oldest; the sieve of Eratosthenes.
In its basic form, we operate on `block`, a boolean array of size $n + 1$, representing each integer $0 \leq k \leq n$.
When the sieve is complete, `block[k]` will be `True` or `False` to indicate the primality or non-primality of $k$.

We begin by setting `block[k] = False` for $k = 0, 1$ to indicate that these values $k$ are not prime.
We then proceed to sieve by each integer $2 \leq k \leq \sqrt{n}$.
When `block[k] == True`, this indicates that $k$ is prime, so we sieve out the multiples of $k$.
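In order to keep `block[k] == True`, we won't mark out `block[k]`, but instead will begin the sieve at the $k^2$ index; every smaller multiple $mk$ with $2 \leq m < k$ has a prime factor less than $k$, and so has already been marked out.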
This can be done in numpy as `block[k*k::k] = False`.

Once this is done for $k \leq \sqrt{n}$, we will have marked out all multiples of primes $\leq \sqrt{n}$, except for these primes themselves. Since every composite $m \leq n$ has a prime factor $\leq \sqrt{n}$, every composite will have been marked out, while each prime $\sqrt{n} < p \leq n$ will be left unmarked. As a result, our array will hold all the primes $\leq n$.

In [2]:
def eratosthenes(n: int) -> List[int]:
    """Returns a list of primes <= n, using the sieve of Eratosthenes.

    Parameters
    ----------
    n : int
        Inclusive upper bound of the search. Any value below 2 (including
        zero and negative values) yields an empty list.

    Returns
    -------
    List[int]
        The primes p with 2 <= p <= n, in increasing order.
    """
    if n < 2:
        # There are no primes below 2. Returning early also avoids writing
        # block[1] on an array that is too short (n = 0) or asking numpy for
        # an array of negative length (n < 0).
        return []

    # block[k] answers "is k still a prime candidate?" for every 0 <= k <= n.
    block = np.ones(n + 1, dtype=bool)
    block[0] = block[1] = False

    for k in range(2, integer_sqrt(n) + 1):
        if block[k]:
            # k survived every smaller sieving pass, so it is prime.
            # Start at k*k so that block[k] itself stays True; the slice is
            # simply empty when k*k lies beyond the end of the array.
            block[k*k::k] = False

    # The indices that are still True are exactly the primes.
    return np.flatnonzero(block).tolist()

This implementation uses $O(n)$ space for the sieve block.

For each of the primes $p\leq \sqrt{n}$ that we encounter, we mark out at most $\left\lfloor\frac{n}{p}\right\rfloor$ multiples of $p$ from the block.
It then follows from the bound
$$
    \sum_{p \leq n} \frac{1}{p} = \log \log n + O(1)
$$
that we have
$$
    \sum_{p \leq \sqrt{n}} \frac{n}{p} = O(n \log \log n).
$$
This gives a time complexity of $O(n \log \log n)$ for our sieve algorithm.

In [3]:
eratosthenes(31)

[2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31]

In [4]:
len(eratosthenes(100))

25

In [5]:
timeit len(eratosthenes(10**7))

157 ms ± 6.35 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## First extension of the Sieve of Eratosthenes

The first extension to the sieve is based on the fact that 2 is the only even prime.
This allows us to reduce our space requirements by half, since we only need to sieve the odd integers $\leq n$.

As before, we have a `block` of `bools`, representing the odd integers $1, 3, \ldots, 2 l + 1 \leq n$.
The primality of each $k = 2 l + 1$ will be indicated by `block[k//2]` once the sieve process is complete.

After setting `block[0] = False` to mark $k = 1$ as not prime, we then sieve by the odd integers $3 \leq k \leq \sqrt{n}$.
Encountering `block[k//2] == True` indicates that `k` is prime, and so we sieve out its multiples by `block[k*k//2::k] = False`. Once the sieve process is complete, locate the `True` indices in the array, translate each to the odd prime it represents, and prepend the even prime to obtain all primes $\leq n$.

In [6]:
def eratosthenes2(n: int) -> List[int]:
    """Returns a list of primes <= n, using an odds-only sieve of Eratosthenes.

    Only the odd integers 1, 3, 5, ... <= n are stored, so the sieve block
    has roughly n/2 entries. The even prime 2 is prepended at the end.

    Parameters
    ----------
    n : int
        Inclusive upper bound of the search. Any value below 2 (including
        zero and negative values) yields an empty list.

    Returns
    -------
    List[int]
        The primes p with 2 <= p <= n, in increasing order.
    """
    if n < 2:
        # Without this guard the unconditional "prepend 2" below would
        # report 2 as a prime <= 1, and n <= 0 would fail on block[0].
        return []

    block_size = (n + 1)//2
    block = np.ones(block_size, dtype=bool)
    # Layout of the block:
    #   index l  <->  odd number 2*l + 1      (1, 3, 5, ..., <= n)
    #   odd k    <->  index k//2
    # so, once sieving is done, odd k is prime iff block[k//2] == True.
    block[0] = False  # index 0 represents 1, which is not prime

    for k in range(3, integer_sqrt(n) + 1, 2):
        if block[k//2]:
            # k*k is odd and lives at index k*k//2. Consecutive odd multiples
            # of k differ by 2*k, which is a step of k in index space.
            block[k*k//2::k] = False

    nonzero = np.flatnonzero(block)
    # Map the surviving indices back to odd numbers and put 2 in front.
    return np.r_[2, 2*nonzero + 1].tolist()

In [7]:
eratosthenes2(11)

[2, 3, 5, 7, 11]

In [8]:
timeit len(eratosthenes2(10**8))

1.17 s ± 10 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


This method has the same time and space complexity as the ordinary sieve of Eratosthenes, but for practical purposes, it uses roughly half the space for a given value $n$.

## Second extension

In [9]:
def luo(n: int) -> List[int]:
    """Returns a list of primes <= n, sieving only the numbers coprime to 6.

    The sieve block stores one flag per integer of the form 6m - 1 or 6m + 1
    (plus the number 1 at index 0), i.e. roughly n/3 entries. Index ``i``
    represents the number ``(3*i + 1) | 1``: 1, 5, 7, 11, 13, 17, ...
    The primes 2 and 3 are never stored and are prepended to the result.

    Parameters
    ----------
    n : int
        Inclusive upper bound of the search. Any value below 2 (including
        zero and negative values) yields an empty list.

    Returns
    -------
    List[int]
        The primes p with 2 <= p <= n, in increasing order.
    """
    if n < 5:
        # Below 5 the block contains no prime at all (only the number 1), so
        # the answer consists of whichever of 2 and 3 fit under the bound.
        # Without this guard 2 and 3 would be reported even for n < 3, and
        # n <= 0 would fail on block[0].
        return [p for p in (2, 3) if p <= n]

    # Number of integers <= n that are congruent to 1 or 5 modulo 6.
    block_size = (n + 1)//3 + (n % 6 == 1)
    block = np.ones(block_size, dtype=bool)
    block[0] = False  # index 0 represents 1, which is not prime

    # Running values, updated incrementally so that inside the loop, with
    # p = (3*i + 1) | 1 being the number represented by index i:
    #   k alternates 2, 1, 2, 1, ...  (2 for odd i, 1 for even i)
    #   c is the index of p*p
    #   t is 2*p, the index-space step corresponding to adding 6*p
    c, k, t = 0, 1, 2

    for i in range(1, integer_sqrt(n)//3 + 1):
        k = 3 - k
        c += 4*k*i
        t += 4*k

        if block[i]:
            # b is the index of p * q, where q is the next stored number
            # after p (p + 2 for odd i, p + 4 for even i).
            b = c + 2*i*(3 - k) + 1
            # Multiples of p that are coprime to 6 fall into two residue
            # classes modulo 6*p; one slice clears each class. A slice that
            # starts past the end of the block is simply empty.
            block[c::t] = False
            block[b::t] = False

    nonzero = np.flatnonzero(block)
    # Translate surviving indices back to numbers and prepend 2 and 3.
    return np.r_[2, 3, (3*nonzero + 1) | 1].tolist()

In [10]:
timeit len(luo(10**7))

98.3 ms ± 831 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
timeit len(eratosthenes2(10**7))

108 ms ± 1.04 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
timeit sum(1 for _ in sieve_interval2(2, 10**7))

184 ms ± 274 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Segmented Sieves

In [None]:
def sieve_interval(a: int, b: int, block_size: int = 10**6):
    """Lazily yields the primes p with a <= p < b, in increasing order.

    This is a segmented sieve: the primes up to ``integer_sqrt(b)`` are
    computed once with ``luo``, and the rest of the interval is then sieved
    in consecutive blocks of at most ``block_size`` integers, so memory use
    is bounded by the block size rather than by ``b``.

    Parameters
    ----------
    a : int
        Inclusive lower bound.
    b : int
        Exclusive upper bound. Nothing is yielded when ``b <= 2``.
    block_size : int, optional
        Number of consecutive integers sieved per block. Must be positive
        (it is used as a ``range`` step). Defaults to 10**6.

    Yields
    ------
    int
        Each prime in ``[a, b)``.
    """
    if b <= 2:
        # The half-open interval [a, b) contains no prime; returning here
        # also keeps non-positive bounds away from the helper sieve.
        return

    sqrt = integer_sqrt(b)
    small_primes = luo(sqrt)

    # The sieving primes themselves may lie inside the requested interval.
    for p in small_primes:
        if a <= p < b:
            yield p

    # Everything still to be examined is > sqrt, so no sieving prime can be
    # an element of a block (and thus none is yielded twice or crossed out).
    for start in range(max(sqrt + 1, a), b, block_size):
        # block[j] represents the integer start + j; the last block may be short.
        block = np.ones(min(block_size, b - start), dtype=bool)
        for p in small_primes:
            # Smallest multiple of p that is >= start (ceiling division).
            multiple = ((start // p) + (start % p > 0))*p
            block[multiple - start::p] = False

        yield from (start + np.flatnonzero(block)).tolist()

In [None]:
def sieve_interval2(a: int, b: int, block_size: int = 10**6):
    """Lazily yields the primes p with a <= p < b, using an odds-only segmented sieve.

    Same contract as a plain segmented sieve over ``[a, b)``, but each block
    stores flags for odd integers only, so a block of ``block_size`` flags
    covers ``2*block_size`` consecutive integers.

    Parameters
    ----------
    a : int
        Inclusive lower bound.
    b : int
        Exclusive upper bound. Nothing is yielded when ``b <= 2``.
    block_size : int, optional
        Number of odd candidates sieved per block. Must be positive (it is
        used to build a ``range`` step). Defaults to 10**6.

    Yields
    ------
    int
        Each prime in ``[a, b)``, in increasing order.
    """
    if b <= 2:
        # [a, b) contains no prime. In particular 2 must not be reported
        # when the exclusive upper bound is 2 or less.
        return

    sqrt = integer_sqrt(b)
    # Drop the leading 2: even numbers are never stored in a block, so only
    # the odd primes are needed for sieving.
    small_primes = luo(sqrt)[1:]

    if a <= 2:
        yield 2

    # The odd sieving primes themselves may lie inside the requested interval.
    for p in small_primes:
        if a <= p < b:
            yield p

    # Round the exclusive upper bound DOWN to an even number, so that the
    # last odd candidate is the largest odd integer strictly below b.
    # (Rounding up would let an odd b itself be yielded.)
    bb = b - b % 2
    # Round the first unexamined integer down to an even number; the first
    # candidate of a block is then start + 1, which is >= the unrounded value.
    aa = max(sqrt + 1, a)
    aa = aa - (aa % 2)

    for start in range(aa, bb, 2*block_size):
        # block[j] represents the odd integer start + 1 + 2*j.
        block = np.ones(min(block_size, (bb - start)//2), dtype=bool)
        for p in small_primes:
            # Smallest j >= 0 with start + 1 + 2*j divisible by p.
            # start is even and p is odd, so start + 1 + p is even and the
            # floor division is exact; Python's % keeps the result in [0, p).
            offset = -(start + 1 + p)//2 % p
            block[offset::p] = False

        yield from (2*np.flatnonzero(block) + (start + 1)).tolist()

In [12]:
timeit sum(1 for _ in sieve_interval(2, 10**7))

NameError: name 'sieve_interval' is not defined

In [None]:
list(sieve_interval(2, 100)) == luo(100)

True

In [None]:
list(sieve_interval(2, 100))

[2,
 3,
 5,
 7,
 11,
 13,
 17,
 19,
 23,
 29,
 31,
 37,
 41,
 43,
 47,
 53,
 59,
 61,
 67,
 71,
 73,
 79,
 83,
 89,
 97]

In [None]:
timeit sum(luo(10**9))

12.2 s ± 485 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
timeit sum(p for p in sieve_interval(2, 10**9))

23.4 s ± 25.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
timeit sum(p for p in sieve_interval2(2, 10**9))

17.8 s ± 50 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
list(sieve_interval(400, 22000)) == list(sieve_interval2(400, 22000))

True

In [None]:
time sum(p for p in sieve_interval(2, 10**8))

CPU times: user 2.08 s, sys: 56 ms, total: 2.14 s
Wall time: 2.14 s


279209790387276

In [None]:
time list(p for p in sieve_interval2(10**15, 10**15 + 200))

CPU times: user 4.01 s, sys: 44.1 ms, total: 4.06 s
Wall time: 4.05 s


[1000000000000037, 1000000000000091, 1000000000000159, 1000000000000187]

In [None]:
time list(p for p in sieve_interval(10**15, 10**15 + 200))

CPU times: user 4.32 s, sys: 115 ms, total: 4.44 s
Wall time: 4.44 s


[1000000000000037, 1000000000000091, 1000000000000159, 1000000000000187]