In [1]:
import numpy as np
from numba import cuda 

In [2]:
@cuda.jit('void(float64 [:], float64[:], float64[:])') 
def f(a, b, c):
    """Element-wise vector addition kernel: writes ``a[i] + b[i]`` into ``c[i]``.

    Each thread handles at most one element; threads whose global index
    falls at or beyond ``len(c)`` do nothing.
    """
    # Global 1-D thread index, i.e. threadIdx.x + blockIdx.x * blockDim.x.
    idx = cuda.grid(1)
    # Guard against threads past the end of the output array.
    if idx < len(c):
        c[idx] = a[idx] + b[idx]

In [3]:
# Pair of timing-enabled CUDA events used to bracket each measured region.
start, stop = cuda.event(timing=True), cuda.event(timing=True)

In [11]:
# Benchmark of a float64 vector addition of length N on the GPU. CUDA events
# time three phases separately: host->device transfer, kernel launch, and
# device->host copy of the result.
N = 1000000
THREADS_PER_BLOCK = 256
# Round the block count UP so the grid covers all N elements even when N is
# not a multiple of the block size. With N // 256 the grid had only
# 3906 * 256 = 999936 threads, leaving the last 64 elements of d_c unwritten.
# The surplus threads in the final block are masked by the kernel's bounds check.
BLOCKS_PER_GRID = (N + THREADS_PER_BLOCK - 1) // THREADS_PER_BLOCK

h_a  = np.random.random(N)
h_b = np.random.random(N)

# Phase 1: copy both inputs to the device.
start.record()
d_a = cuda.to_device(h_a)
d_b = cuda.to_device(h_b)
stop.record()
stop.synchronize()  # wait for the stop event before reading the elapsed time
time = start.elapsed_time(stop)

print("It took me {} mseconds to get data to GPU".format(time))

# Output buffer on the device, allocated outside the timed regions.
d_c = cuda.device_array_like(d_a)

# Phase 2: launch the kernel.
start.record()
f[BLOCKS_PER_GRID, THREADS_PER_BLOCK](d_a, d_b, d_c)
stop.record()
stop.synchronize()
time = start.elapsed_time(stop)
print("It took me {} mseconds to add vectors on the GPU".format(time))

# Phase 3: copy the result back to the host.
start.record()
c = d_c.copy_to_host()
stop.record()
stop.synchronize()
time = start.elapsed_time(stop)
print("It took me {} mseconds to copy result back".format(time))

# Sanity check: every element, including the tail, must match the host sum.
np.testing.assert_allclose(c, h_a + h_b)

It took me 11.001376152038574 mseconds to get data to GPU
It took me 0.2717440128326416 mseconds to add vectors on the GPU
It took me 5.0920000076293945 mseconds to copy result back
