In [1]:
import numpy as np
from numba import njit
from contextlib import contextmanager

@contextmanager
def disable_simd():
    """Yield an ``njit`` decorator from a numba import made with loop vectorization off.

    On entry, every already-imported ``numba`` / ``llvmlite`` module is evicted
    from ``sys.modules`` and ``NUMBA_LOOP_VECTORIZE`` is set to ``"0"`` before
    ``numba`` is imported again, so the yielded ``njit`` comes from that fresh
    import. On exit (including when the import or the ``with`` body raises),
    the environment variable is put back to exactly what it was before entry
    (or removed if it was not set) and the modules are evicted once more so a
    later ``import numba`` starts from scratch.

    Yields:
        The ``njit`` decorator of the freshly imported ``numba``.

    Raises:
        ImportError: if ``numba`` cannot be imported.
    """
    import os
    import sys

    env_var = "NUMBA_LOOP_VECTORIZE"
    packages = ("numba", "llvmlite")

    def belongs_to_numba(name):
        # Match the package itself or one of its submodules. A bare prefix test
        # would also evict unrelated packages whose name merely starts with
        # "numba" (e.g. ``numbagg``).
        return any(name == pkg or name.startswith(pkg + ".") for pkg in packages)

    def clear_numba():
        # Iterate over a snapshot: sys.modules is mutated while we walk it.
        for mod in list(sys.modules):
            if belongs_to_numba(mod):
                del sys.modules[mod]

    # Remember the caller's setting so it can be restored instead of being
    # overwritten with a hard-coded "1".
    previous = os.environ.get(env_var)

    clear_numba()
    os.environ[env_var] = "0"
    try:
        from numba import njit
        yield njit
    finally:
        if previous is None:
            os.environ.pop(env_var, None)
        else:
            os.environ[env_var] = previous
        clear_numba()

In [2]:
@njit
def sum(x, y):
    """Return a new array holding ``x[i] + y[i]`` for every index of ``x``.

    NOTE: this name shadows the builtin ``sum`` for the rest of the notebook.

    Args:
        x: array whose shape and dtype determine the output's shape and dtype.
        y: array indexed in lockstep with ``x``; must be at least as long.

    Returns:
        A freshly allocated array with ``x.shape`` and ``x.dtype``.

    Raises:
        ValueError: if ``y`` has fewer elements than ``x`` along the first axis.
    """
    # numba does not bounds-check array indexing by default, so a short ``y``
    # would otherwise be read past its end without any error.
    if len(y) < len(x):
        raise ValueError("y must have at least as many elements as x")
    out = np.empty(x.shape, dtype=x.dtype)
    for i in range(len(x)):
        out[i] = x[i] + y[i]
    return out

# Benchmark inputs: one million ones, once as 64-bit and once as 16-bit
# unsigned integers.
DATA_UINT64 = np.ones(1_000_000, dtype=np.uint64)
DATA_UINT16 = np.ones_like(DATA_UINT64, dtype=np.uint16)

# Call the kernel once per dtype before timing it; the last call's result is
# displayed as the cell output.
sum(DATA_UINT64, DATA_UINT64)
sum(DATA_UINT16, DATA_UINT16)

array([2, 2, 2, ..., 2, 2, 2], dtype=uint16)

In [3]:
# Time the kernel decorated with the default `njit` on both dtypes. Each
# dtype was already called once in the previous cell.
%timeit sum(DATA_UINT64, DATA_UINT64)
%timeit sum(DATA_UINT16, DATA_UINT16)

346 µs ± 1.85 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
85 µs ± 318 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [4]:
with disable_simd() as njit_no_simd:
    @njit_no_simd
    def sum_no_simd(x, y):
        """Element-wise ``x[i] + y[i]``, compiled with the ``njit`` yielded by ``disable_simd``.

        Args:
            x: array whose shape and dtype determine the output's shape and dtype.
            y: array indexed in lockstep with ``x``; must be at least as long.

        Returns:
            A freshly allocated array with ``x.shape`` and ``x.dtype``.

        Raises:
            ValueError: if ``y`` has fewer elements than ``x`` along the first axis.
        """
        # numba does not bounds-check array indexing by default, so a short
        # ``y`` would otherwise be read past its end without any error.
        if len(y) < len(x):
            raise ValueError("y must have at least as many elements as x")
        out = np.empty(x.shape, dtype=x.dtype)
        for i in range(len(x)):
            out[i] = x[i] + y[i]
        return out

    # Call once per dtype while the context is still active.
    # NOTE(review): presumably this forces JIT compilation to happen under the
    # SIMD-disabled numba import rather than after it is torn down -- confirm.
    sum_no_simd(DATA_UINT64, DATA_UINT64)
    sum_no_simd(DATA_UINT16, DATA_UINT16)

In [5]:
# Time the kernel built inside `disable_simd()` on both dtypes. Each dtype
# was already called once inside the `with` block of the previous cell.
%timeit sum_no_simd(DATA_UINT64, DATA_UINT64)
%timeit sum_no_simd(DATA_UINT16, DATA_UINT16)

373 µs ± 1.51 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
226 µs ± 143 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
