In [1]:
# Numba works well when the code relies heavily on (1) NumPy, (2) loops, and (3) CUDA

In [13]:
from numba import jit, prange
import numpy as np

# Benchmark input: 100x100 matrix of uniform random samples drawn from [0, 1)
input_ndarray = np.random.rand(100, 100)

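# Decorating a Python function with @jit(nopython=True) makes it run as compiled
# machine code, bypassing the Python interpreter entirely.
# Compilation happens on the first call for a given set of argument types;
# later calls with the same types reuse the already-compiled version.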
@jit(nopython=True)
def go_fast(a):
    """Add the sum of ``tanh`` over the main diagonal of ``a`` to every element.

    Parameters
    ----------
    a : 2-D numpy array
        Indexed as ``a[i, i]``; it does not have to be square.

    Returns
    -------
    numpy array
        ``a + trace``, where ``trace`` is the sum of ``np.tanh(a[i, i])``
        along the main diagonal, broadcast as a scalar over ``a``.
    """
    # Only walk the part of the diagonal that exists. Numba does not
    # bounds-check array indexing by default, so looping over a.shape[0] on an
    # array with more rows than columns would silently read out of bounds.
    diag_len = min(a.shape[0], a.shape[1])
    trace = 0.0  # float accumulator: np.tanh yields floats, keep the type stable
    for i in range(diag_len):
        trace += np.tanh(a[i, i])
    return a + trace

%timeit go_fast(input_ndarray)

5.16 µs ± 132 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [17]:
@jit(nopython=True)
def go_faster(a):
    """Add the sum of ``tanh`` over the main diagonal of ``a`` to every element.

    NOTE(review): this uses the same JIT options and computes the same
    quantity as ``go_fast``; presumably it is kept as a separate timing
    run -- confirm whether a different optimisation was intended here.

    Parameters
    ----------
    a : 2-D numpy array
        Indexed as ``a[i, i]``; it does not have to be square.

    Returns
    -------
    numpy array
        ``a + trace``, where ``trace`` is the sum of ``np.tanh(a[i, i])``
        along the main diagonal, broadcast as a scalar over ``a``.
    """
    # Clamp to the shorter axis: Numba performs no bounds checking by default,
    # so iterating over a.shape[0] alone would read past the end of a row
    # whenever the array has more rows than columns.
    diag_len = min(a.shape[0], a.shape[1])
    trace = 0.0  # float accumulator matches the float result of np.tanh
    for i in range(diag_len):
        trace += np.tanh(a[i, i])
    return a + trace

%timeit go_faster(input_ndarray)

5.5 µs ± 163 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [15]:
@jit(nopython=True, parallel=True)
def go_even_faster(a):
    """Parallel variant: add the diagonal ``tanh`` sum of ``a`` to every element.

    The diagonal loop uses ``prange`` under ``parallel=True``; the ``trace +=``
    accumulation inside it is a sum reduction across loop iterations.

    Parameters
    ----------
    a : 2-D numpy array
        Indexed as ``a[i, i]``; it does not have to be square.

    Returns
    -------
    numpy array
        ``a + trace``, where ``trace`` is the sum of ``np.tanh(a[i, i])``
        along the main diagonal, broadcast as a scalar over ``a``.
    """
    # Clamp to the shorter axis: Numba performs no bounds checking by default,
    # so iterating over a.shape[0] alone would read out of bounds whenever the
    # array has more rows than columns.
    diag_len = min(a.shape[0], a.shape[1])
    trace = 0.0  # float reduction variable, same type as the np.tanh terms
    for i in prange(diag_len):
        trace += np.tanh(a[i, i])
    return a + trace

%timeit go_even_faster(input_ndarray)

46.9 µs ± 24.8 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
