# Numba on Multicore and Manycore

In [1]:
from utiliviz import register_magic
register_magic(setup_bokeh=True)

## Numba on Multicore
### Automatic parallelization and SIMD-vectorization

In [2]:
import numpy as np
from numba import njit, prange

@njit
def log_sum_exp(arr):
    """Return ``log(sum(exp(arr)))`` computed with a plain serial loop.

    This is the single-threaded baseline. Elements are read by flat index
    from ``0`` to ``arr.size - 1``.
    """
    n_items = arr.size
    acc = 0
    for k in range(n_items):
        acc = acc + np.exp(arr[k])

    return np.log(acc)


@njit(parallel=True, fastmath=True)
def log_sum_exp_mt(arr):
    """Return ``log(sum(exp(arr)))`` using an automatically parallelized loop.

    The loop runs over ``prange`` so its iterations can be spread across
    threads, with ``acc`` acting as the sum reduction variable. Compiled
    with fastmath; the original author notes that SVML then provides SIMD
    versions of the transcendental functions.
    """
    n_items = arr.size
    acc = 0
    for k in prange(n_items):  # parallel range
        acc += np.exp(arr[k])

    return np.log(acc)


@njit(parallel=True, fastmath=True)
def log_sum_exp_arrexpr(arr):
    """Return ``log(sum(exp(arr)))`` written as a whole-array expression.

    No explicit loop is written; with ``parallel=True`` the array
    expression is parallelized automatically.
    """
    exp_total = np.sum(np.exp(arr))
    return np.log(exp_total)

#### Quick test & Warm up the JIT functions

In [3]:
# Small random input: used to sanity-check the three implementations and
# to warm up the JIT functions before the profiled runs below.
a = np.random.random(10**4)
# Call each variant once so it is compiled before it is profiled.
r1 = log_sum_exp(a)          # serial reference result
r2 = log_sum_exp_mt(a)       # parallel-loop result
r3 = log_sum_exp_arrexpr(a)  # array-expression result
# Both parallel variants must agree with the serial reference within
# assert_allclose's default tolerance.
np.testing.assert_allclose(r1, r2)
np.testing.assert_allclose(r1, r3)

#### Use a bigger array for profiling

In [4]:
a = np.random.random(2 * 10**7)

#### Profile serial scalar loop

In [5]:
%%utiliviz
%%time
log_sum_exp(a)

CPU times: user 2.33 s, sys: 0 ns, total: 2.33 s
Wall time: 2.33 s


17.35253851608995

#### Profile the parallel loop version

In [6]:
%%utiliviz
%%time
log_sum_exp_mt(a)

CPU times: user 64 ms, sys: 0 ns, total: 64 ms
Wall time: 16.6 ms


17.35253851608988

utiliviz: insufficient sample data for plotting


Note: utiliviz may fail when the runtime of the profiled code-block is too short.

#### Re-run with bigger array

In [7]:
a = np.random.random(4 * 10**8)

In [8]:
%%utiliviz
%%time
log_sum_exp_mt(a)

CPU times: user 1.56 s, sys: 0 ns, total: 1.56 s
Wall time: 205 ms


20.348303717097387

Note: observe from the above plot that all CPU cores are utilized (see "cpu_overall*").

#### Profile the array-expression version

In [9]:
%%utiliviz
%%time
log_sum_exp_arrexpr(a)

CPU times: user 1.5 s, sys: 4 ms, total: 1.5 s
Wall time: 214 ms


20.348303717097387

Note: again, the array-expression version uses all the cores.

#### Profile scipy implementation

In [10]:
import scipy.special

In [11]:
%%utiliviz
%%time
scipy.special.logsumexp(a)

CPU times: user 2.53 s, sys: 9.1 s, total: 11.6 s
Wall time: 3.21 s


20.348303717097384

## Numba on Manycore
### JIT CUDA kernels

In [12]:
from numba import cuda
import math

@cuda.jit
def foo(arr):
    """A simple CUDA kernel: update each element of ``arr`` in place.

    Each thread handles the single element at its global 1D grid index
    and applies ``x += sin(x / 2)`` to it five times.

    Parameters
    ----------
    arr : 1D array
        Array that is modified in place.
    """
    i = cuda.grid(1)
    # A launch is made of whole thread blocks, so the grid can contain more
    # threads than there are elements. Threads past the end must do nothing,
    # otherwise they read and write out of bounds.
    if i < arr.size:
        for _ in range(5):
            arr[i] += math.sin(arr[i] / 2)

In [13]:
arr = np.arange(5 * 10**8, dtype=np.float32)

In [14]:
%%utiliviz --cuda

foo.forall(arr.size)(arr)