In [2]:
!conda env list

# conda environments:
#
base                  *  C:\Users\reza.bonyadi\Anaconda3
car_speed_env            C:\Users\reza.bonyadi\Anaconda3\envs\car_speed_env
cuda_tut                 C:\Users\reza.bonyadi\Anaconda3\envs\cuda_tut
dash_ex                  C:\Users\reza.bonyadi\Anaconda3\envs\dash_ex
finance                  C:\Users\reza.bonyadi\Anaconda3\envs\finance
finance_dashboard        C:\Users\reza.bonyadi\Anaconda3\envs\finance_dashboard
journal_dashboard        C:\Users\reza.bonyadi\Anaconda3\envs\journal_dashboard
sherlock                 C:\Users\reza.bonyadi\Anaconda3\envs\sherlock
                         C:\Users\reza.bonyadi\AppData\Local\Orange



In [12]:
import numpy as np
# Number of elements in every 1-D test vector used by the benchmarks below.
size = 10000

In [13]:
# Three independent float64 vectors of ones, each `size` long: two operands
# (a, b) and a placeholder for the result (c).
a, b, c = (np.ones(size) for _ in range(3))

def operation(i, j):
    """Return the sum of the two scalar operands ``i`` and ``j``."""
    return i + j


def func_cpu(a, b):
    """Add ``a`` and ``b`` element by element using a plain Python loop.

    This is the interpreted baseline: every element goes through a Python
    level call to ``operation``.

    Parameters
    ----------
    a, b : sequence or 1-D numpy.ndarray of numbers
        Operands; they must have the same length.

    Returns
    -------
    numpy.ndarray
        New float64 array where ``res[i] == operation(a[i], b[i])``.

    Raises
    ------
    ValueError
        If ``a`` and ``b`` differ in length.
    """
    # Take the length from the arguments rather than from the notebook-level
    # `size`, so the function is correct for inputs of any length.
    n = len(a)
    if len(b) != n:
        raise ValueError("a and b must have the same length")

    # Every slot is overwritten in the loop, so no initial fill is needed.
    res = np.empty(n)
    for i in range(n):
        res[i] = operation(a[i], b[i])
    return res
# Run the pure-Python baseline once; `c` is rebound to the returned array.
c = func_cpu(a, b)

In [14]:
%timeit func_cpu(a, b)

3.85 ms ± 22.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [15]:
from numba import jit
# Re-create the operands and the result placeholder for the JIT benchmark:
# three separate float64 vectors of ones, each `size` long.
a = np.ones(size)
b = np.ones(size)
c = np.ones(size)

# NOTE: this rebinds the notebook-level name `operation`, replacing the
# pure-Python version defined in the earlier CPU cell.
# nopython=True is requested explicitly so compilation fails loudly instead
# of silently falling back to the slow object mode on older Numba releases.
@jit(nopython=True)
def operation(i, j):
    """Return the sum of the two scalar operands ``i`` and ``j`` (compiled)."""
    return i + j

@jit(nopython=True)
def func_jit(a, b):
    """Add ``a`` and ``b`` element by element in a Numba-compiled loop.

    Parameters
    ----------
    a, b : 1-D numpy.ndarray of numbers
        Operands; they must have the same length.

    Returns
    -------
    numpy.ndarray
        New float64 array where ``res[i] == operation(a[i], b[i])``.

    Raises
    ------
    ValueError
        If ``a`` and ``b`` differ in length.
    """
    # Take the length from the arguments: Numba freezes global variables as
    # compile-time constants, so reading the notebook-level `size` here would
    # keep using the value seen at first compilation.
    n = len(a)
    # Explicit check because nopython mode does not bounds-check array
    # indexing by default.
    if len(b) != n:
        raise ValueError("a and b must have the same length")

    # Every slot is overwritten in the loop, so no initial fill is needed.
    res = np.empty(n)
    for i in range(n):
        res[i] = operation(a[i], b[i])
    return res

# Call once before timing: Numba compiles a signature-less @jit function on
# its first call, so this warm-up should keep compilation out of the
# %timeit measurement in the next cell. `c` is rebound to the result.
c = func_jit(a, b)

In [16]:
%timeit func_jit(a, b)

4.68 µs ± 19 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [17]:
from numba import cuda
# Show the CUDA devices Numba has detected.
print(cuda.gpus)

# Launch configuration: one warp's worth of threads per block, and enough
# blocks to give every one of the `size` elements its own thread.
device = cuda.get_current_device()
tpb = device.WARP_SIZE
bpg = (size + tpb - 1) // tpb  # integer ceiling of size / tpb

print('Blocks per grid:', bpg)
print('Threads per block', tpb)

<Managed Device 0>
Blocks per grid: 313
Threads per block 32


In [20]:
# Host-side operands and result placeholder for the CUDA benchmark: three
# independent float64 vectors of ones, each `size` long.
a, b, c = [np.ones(size) for _ in range(3)]

@cuda.jit('void(f8[:],f8[:],f8[:])')
def kernel(a, b, c):
    """CUDA kernel: each thread writes ``c[i] = a[i] + b[i]`` for its index.

    Parameters
    ----------
    a, b : 1-D float64 arrays
        Input operands.
    c : 1-D float64 array
        Output array, written in place.
    """
    # Absolute position of this thread in the 1-D grid.
    i = cuda.grid(1)
    # The grid is rounded up to a whole number of blocks, so it can contain
    # more threads than there are elements; surplus threads must do nothing
    # rather than access memory past the end of the arrays.
    if i < c.shape[0]:
        c[i] = a[i] + b[i]


def func_cuda(a, b, c):
    """Launch ``kernel`` on device arrays and return the result on the host.

    Parameters
    ----------
    a, b : device arrays
        Input operands passed to ``kernel``.
    c : device array
        Output buffer passed to ``kernel``; its length determines how many
        blocks are launched.

    Returns
    -------
    numpy.ndarray
        Host copy of ``c`` taken after the kernel launch.
    """
    # Size the grid from the array being filled (integer ceiling of
    # len / tpb) rather than from the notebook-level `bpg`, so the launch
    # covers whatever length is actually passed in.
    blocks = (c.shape[0] + tpb - 1) // tpb
    kernel[blocks, tpb](a, b, c)
    return c.copy_to_host()

# Handle to the currently selected GPU.
device = cuda.get_current_device()

# Copy the two operands to the device, and create a device-side output
# buffer modelled on the host array `c`.
da, db = cuda.to_device(a), cuda.to_device(b)
dc = cuda.device_array_like(c)

# Run the kernel once; `c` is rebound to the host copy of the result.
c = func_cuda(da, db, dc)


In [21]:
%timeit func_cuda(da, db, dc)

188 µs ± 2.96 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
