In [1]:
import os
# Request LLVM loop vectorization from Numba. The variable is assigned before
# `numba` is imported so that it is already in the environment when Numba
# loads (assumes Numba reads its configuration at import time - see the Numba
# environment-variable reference).
os.environ["NUMBA_LOOP_VECTORIZE"] = "1"
from numba import njit
import numpy as np
# Optional diagnostics, left disabled: passes the `--debug-only=loop-vectorize`
# option to LLVM through llvmlite.
# NOTE(review): presumably this only prints output with an LLVM build that has
# debug output enabled - confirm before relying on it.
#from llvmlite import binding as llvm
#llvm.set_option("", "--debug-only=loop-vectorize")
# Benchmark input: the integers 1..1_000_000 as uint32, so exactly half of the
# values are odd and half are even.
DATA = np.linspace(1, 1_000_000, 1_000_000, dtype=np.uint32)

In [2]:
@njit
def num_even_and_odd(arr):
    """Count even and odd entries of ``arr`` with one branch per element.

    Returns the tuple ``(even_count, odd_count)``.
    """
    total = len(arr)
    odd_count = 0
    for idx in range(total):
        # The lowest bit is set exactly for odd values.
        if (arr[idx] & 1) != 0:
            odd_count += 1
    even_count = total - odd_count
    return even_count, odd_count

num_even_and_odd(DATA)

(500000, 500000)

In [3]:
%timeit num_even_and_odd(DATA)

73.2 µs ± 181 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [4]:
@njit
def num_even_and_odd_2(arr):
    """Branch-free variant: add every element's lowest bit to the odd tally.

    Returns the tuple ``(even_count, odd_count)``.
    """
    total = len(arr)
    odd_count = 0
    for idx in range(total):
        low_bit = arr[idx] & 1  # 1 for odd values, 0 for even values
        odd_count += low_bit
    return total - odd_count, odd_count

num_even_and_odd_2(DATA)

(500000, 500000)

In [5]:
%timeit num_even_and_odd_2(DATA)

73 µs ± 118 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [6]:
@njit
def num_even_and_odd_3(arr):
    """Count even and odd entries, accumulating directly in the output array.

    Returns a two-element ``uint64`` array laid out as
    ``[even_count, odd_count]``.
    """
    total = len(arr)
    counts = np.zeros(2, dtype=np.uint64)
    for idx in range(total):
        # Slot 1 collects the lowest bit of every element (1 for odd values).
        counts[1] += arr[idx] & 1
    # Everything that is not odd is even.
    counts[0] = total - counts[1]
    return counts

num_even_and_odd_3(DATA)

array([500000, 500000], dtype=uint64)

In [7]:
%timeit num_even_and_odd_3(DATA)

283 µs ± 76 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [8]:
@njit
def num_even_and_odd_4(arr):
    """Count even and odd entries using four independent accumulators.

    The main loop walks the array in blocks of four elements, adding the
    lowest bit of each element to its own accumulator slot. The 0-3 elements
    left over after the last full block are handled by a separate tail loop.

    Returns a two-element ``uint64`` array laid out as
    ``[even_count, odd_count]``.
    """
    n = len(arr)
    n_blocks = n // 4

    odds = np.zeros((4,), dtype=np.uint64)
    for i in range(n_blocks):
        odds[0] += arr[i * 4] & 1
        odds[1] += arr[i * 4 + 1] & 1
        odds[2] += arr[i * 4 + 2] & 1
        odds[3] += arr[i * 4 + 3] & 1

    result = np.zeros((2,), dtype=np.uint64)
    # Walk the tail by explicit index. A negative slice such as
    # `arr[-(n % 4):]` turns into `arr[0:]` (the whole array) when n is a
    # multiple of 4, which would count every odd element a second time.
    for i in range(n_blocks * 4, n):
        result[1] += arr[i] & 1
    for o in odds:
        result[1] += o
    result[0] = n - result[1]
    return result

num_even_and_odd_4(DATA)

array([500000, 500000], dtype=uint64)