In [1]:
from numba import cuda, njit, prange
import numpy as np

In [2]:
@cuda.reduce
def sum_reduce_cuda(a, b):
    return a + b

@njit(parallel=True)
def sum_reduce_njit(arr):
    total = 0.0
    for i in prange(len(arr)):
        total += arr[i]
    return total

@cuda.reduce
def custom_reduce(a, b):
    return a + (b * b)

@njit(parallel=True)
def custom_reduce_njit(arr):
    total = 0.0
    for i in prange(len(arr)):
        total += arr[i] * arr[i]
    return total


In [3]:
A = (np.arange(321_456_789, dtype=np.float64)) + 1

In [7]:
%%timeit -n 3 -r 3 -o

custom_reduce(A)



779 ms ± 3.31 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


<TimeitResult : 779 ms ± 3.31 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)>

In [5]:
%%timeit -n 3 -r 3 -o

custom_reduce_njit(A)



The slowest run took 4.47 times longer than the fastest. This could mean that an intermediate result is being cached.
243 ms ± 178 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


<TimeitResult : 243 ms ± 178 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)>

In [None]:
%%timeit -n 3 -r 3 -o

np.sum(A + (A * A))

1.02 s ± 7.47 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


<TimeitResult : 1.02 s ± 7.47 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)>

In [13]:
%%timeit -n 1 -r 1 -o

sum_reduce_cuda(A)

820 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


<TimeitResult : 820 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

In [9]:
%%timeit -n 10 -r 10 -o

sum_reduce_njit(A)

130 ms ± 4.19 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)


<TimeitResult : 130 ms ± 4.19 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)>

In [8]:
%%timeit -n 10 -r 10 -o

np.sum(A)

158 ms ± 4.95 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)


<TimeitResult : 158 ms ± 4.95 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)>

In [54]:
%%timeit -n 10 -r 100 -o

sum(A)

83.2 ms ± 548 μs per loop (mean ± std. dev. of 100 runs, 10 loops each)


<TimeitResult : 83.2 ms ± 548 μs per loop (mean ± std. dev. of 100 runs, 10 loops each)>