# Understanding the impact of how the code is structured per simple example: The Sum

In [1]:
import numba as nb
import numpy as np

@nb.njit(cache=False, parallel=True, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True, nogil=True)
def vectorized_opt(z):
    return z[:,0]+z[:,1]+z[:,2]

@nb.njit(cache=False, parallel=True, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True, nogil=True)
def vectorized_optb(z):
    return np.sum(z, axis=1)


@nb.njit(cache=False, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', nogil=True)
def vl(z, r, i):
    # Sum across z[i, :] and store directly into r at position i
    r[i] = np.sum(z)

@nb.njit(cache=False, parallel=True, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True, nogil=True)
def cvlopt(z, n, r):
    for i in nb.prange(n):
        vl(z[i,:], r, i)

def vlopt(z):
    n = z.shape[0]  # Determine the number of rows in z
    r = np.empty(n, dtype=z.dtype)  # Initialize the result array
    cvlopt(z, n, r)
    return r



@nb.njit(cache=False, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', nogil=True)
def w(zij, ri):
    # Intended to modify ri in place, but you should return the updated value instead
    return ri + zij


@nb.njit(cache=False, parallel=True, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True, nogil=True)
def wl(zi, k, r, i):
    # Accumulate directly into the array r at position i
    for j in nb.prange(k):
        r[i] = w(zi[j], r[i])


@nb.njit(cache=False, parallel=True, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True, nogil=True)
def cwlopt(z, n, k, r):
    for i in nb.prange(n):
        wl(z[i,:], k, r, i)
    return r

def wlopt(z):
    n, k = z.shape
    r = np.zeros(n, dtype=z.dtype)
    return cwlopt(z, n, k, r)

                  
arr = np.random.randn(10000,3).astype(np.float64)
#Do one pass to dont have compile time in benchmarks
vectorized_opt(arr),vectorized_optb(arr), vlopt(arr),wlopt(arr)

The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.readthedocs.io/en/stable/user/parallel.html#diagnostics for help.

File "../../../../../tmp/ipykernel_1504290/2868275802.py", line 4:
<source missing, REPL/exec in use?>

The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.readthedocs.io/en/stable/user/parallel.html#diagnostics for help.

File "../../../../../tmp/ipykernel_1504290/2868275802.py", line 8:
<source missing, REPL/exec in use?>



(array([ 2.6424679 , -1.40860445,  3.26047798, ..., -2.26988148,
        -1.0384263 , -0.6958217 ]),
 array([ 2.6424679 , -1.40860445,  3.26047798, ..., -2.26988148,
        -1.0384263 , -0.6958217 ]),
 array([ 2.6424679 , -1.40860445,  3.26047798, ..., -2.26988148,
        -1.0384263 , -0.6958217 ]),
 array([ 2.6424679 , -1.40860445,  3.26047798, ..., -2.26988148,
        -1.0384263 , -0.6958217 ]))

In [2]:
%timeit vectorized_opt(arr)
%timeit vectorized_optb(arr)
%timeit vlopt(arr)
%timeit wlopt(arr)
%timeit np.sum(arr, axis=1)
%timeit np.fromiter((map(lambda v: np.sum(v), (arr[i,:] for i in range(arr.shape[0])))), dtype=arr.dtype)

9.35 µs ± 22.1 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
11.2 µs ± 250 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
44.3 µs ± 740 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
49.1 µs ± 637 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
143 µs ± 215 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
24.9 ms ± 105 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [3]:
arr = np.random.randn(100_000,3).astype(np.float32)

%timeit vectorized_opt(arr)
%timeit vectorized_optb(arr)
%timeit vlopt(arr)
%timeit wlopt(arr)
%timeit np.sum(arr, axis=1)
%timeit np.fromiter((map(lambda v: np.sum(v), (arr[i,:] for i in range(arr.shape[0])))), dtype=arr.dtype)

The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.readthedocs.io/en/stable/user/parallel.html#diagnostics for help.

File "../../../../../tmp/ipykernel_1504290/2868275802.py", line 4:
<source missing, REPL/exec in use?>



74.3 µs ± 6.15 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.readthedocs.io/en/stable/user/parallel.html#diagnostics for help.

File "../../../../../tmp/ipykernel_1504290/2868275802.py", line 8:
<source missing, REPL/exec in use?>



89.9 µs ± 6.28 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
75.1 µs ± 12.3 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
101 µs ± 13.4 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.38 ms ± 975 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
243 ms ± 275 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [4]:
arr = np.random.randn(1_000_000,3).astype(np.float32)

%timeit vectorized_opt(arr)
%timeit vectorized_optb(arr)
%timeit vlopt(arr)
%timeit wlopt(arr)
%timeit np.sum(arr, axis=1)
%timeit np.fromiter((map(lambda v: np.sum(v), (arr[i,:] for i in range(arr.shape[0])))), dtype=arr.dtype)

703 µs ± 7.28 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
892 µs ± 843 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
222 µs ± 4.07 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
391 µs ± 2.85 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
13.9 ms ± 14.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
2.46 s ± 36.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
arr = np.random.randn(100_000_000,3).astype(np.float32)

%timeit vectorized_optb(arr)
%timeit vlopt(arr)
%timeit wlopt(arr)
%timeit np.sum(arr, axis=1)
%timeit np.fromiter((map(lambda v: np.sum(v), (arr[i,:] for i in range(arr.shape[0])))), dtype=arr.dtype)

341 ms ± 3.29 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
71.7 ms ± 389 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
116 ms ± 1.78 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
1.43 s ± 1.64 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
