In [1]:
# List the conda environments known on this machine.
!conda env list
# NOTE(review): each `!` line runs in its own short-lived subshell, so this
# activation presumably does not change the environment the kernel itself is
# running in -- confirm the notebook kernel was launched from `summarizer1`.
!conda activate summarizer1

# conda environments:
#
                         C:\Program Files\Orange
base                     C:\ProgramData\Anaconda3
car_speed_env            C:\ProgramData\Anaconda3\envs\car_speed_env
cuda_tut_env             C:\ProgramData\Anaconda3\envs\cuda_tut_env
summarizer1           *  C:\ProgramData\Anaconda3\envs\summarizer1
untitled                 C:\ProgramData\Anaconda3\envs\untitled
finance_dashboard        C:\Users\Reza\.conda\envs\finance_dashboard
sherlock                 C:\Users\Reza\.conda\envs\sherlock
vocal                    C:\Users\Reza\.conda\envs\vocal



In [1]:
# Show the packages of the environment conda resolves for this subshell
# (the recorded output below reports C:\ProgramData\Anaconda3).
!conda list

# packages in environment at C:\ProgramData\Anaconda3:
#
# Name                    Version                   Build  Channel
_ipyw_jlab_nb_ext_conf    0.1.0                    py37_0  
alabaster                 0.7.12                   py37_0  
anaconda                  2020.02                  py37_0  
anaconda-client           1.7.2                    py37_0  
anaconda-navigator        1.9.12                   py37_0  
anaconda-project          0.8.4                      py_0  
argh                      0.26.2                   py37_0  
asn1crypto                1.3.0                    py37_0  
astroid                   2.3.3                    py37_0  
astropy                   4.0              py37he774522_0  
atomicwrites              1.3.0                    py37_1  
attrs                     19.3.0                     py_0  
autopep8                  1.4.4                      py_0  
babel                     2.8.0                      py_0  
backcall                  0.1.0     

In [1]:
import numpy as np
# Number of elements in every benchmark array (one hundred million).
size = 100_000_000

In [27]:
# Host operands for the CPU baseline: three independent arrays of ones,
# each `size` elements long (`c` is only a placeholder for the result).
a, b, c = (np.ones(size) for _ in range(3))

def operation(i, j):
    """Return ``i + j``; the scalar operation applied to each pair of elements."""
    total = i + j
    return total

def func_cpu(a, b):
    """Add ``a`` and ``b`` element by element with an interpreted Python loop.

    This is the deliberately slow CPU baseline of the benchmark, so the
    explicit loop is kept rather than replaced by a vectorised ``a + b``.

    Parameters
    ----------
    a, b : indexable sequences (NumPy arrays in this notebook)
        Operands. ``b`` must hold at least ``len(a)`` elements.

    Returns
    -------
    numpy.ndarray
        Float64 array of ``len(a)`` elements where ``res[i]`` is
        ``operation(a[i], b[i])``.
    """
    # Take the length from the input instead of the module-level `size`, so
    # the function stays correct if `size` changes or other arrays are passed.
    n = len(a)
    # Every slot is overwritten below, so there is no need to pre-fill with ones.
    res = np.empty(n)
    for i in range(n):
        res[i] = operation(a[i], b[i])
    return res
# Single reference run of the pure-Python baseline; replaces the placeholder `c`.
c = func_cpu(a, b)

In [28]:
# Baseline timing: interpreted Python loop calling `operation` once per element.
%timeit func_cpu(a, b)

36.8 s ± 368 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [29]:
from numba import jit
# Fresh host operands for the Numba run: three separate arrays of ones of
# length `size` (`c` is overwritten by the first call to func_jit).
a, b, c = [np.ones(size) for _ in range(3)]

# NOTE: this rebinds the module-level name `operation`, so the earlier
# pure-Python `func_cpu` will also call this compiled version if it is
# re-run after this cell.
@jit
def operation(i, j):
    """Numba-compiled scalar addition: return ``i + j``."""
    total = i + j
    return total

# nopython=True makes Numba raise instead of silently falling back to the
# slow object mode (the default behaviour of a bare @jit on older releases).
@jit(nopython=True)
def func_jit(a, b):
    """Add ``a`` and ``b`` element by element inside a Numba-compiled loop.

    Parameters
    ----------
    a, b : 1-D numpy.ndarray
        Operands. ``b`` must hold at least ``a.shape[0]`` elements.

    Returns
    -------
    numpy.ndarray
        Float64 array of ``a.shape[0]`` elements where ``res[i]`` is
        ``operation(a[i], b[i])``.
    """
    # Read the length from the argument: a module-level `size` would be
    # captured as a constant when the function is compiled, so later changes
    # to it would be ignored.
    n = a.shape[0]
    # Every slot is written by the loop, so the buffer needs no initial fill.
    res = np.empty(n)

    for i in range(n):
        res[i] = operation(a[i], b[i])
    return res

# First call: func_jit is declared without an explicit signature, so Numba
# compiles it on first use -- this run pays that cost before the timing cell.
c = func_jit(a, b)

In [30]:
# func_jit was already called once in the previous cell, so compilation for
# these argument types should not be part of the measured time.
%timeit func_jit(a, b)

374 ms ± 2.13 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [31]:
from numba import cuda
print(cuda.gpus)

# Use one warp's worth of threads per block.
# NOTE(review): valid but small; larger multiples of the warp size are
# commonly used -- consider benchmarking other block sizes.
device = cuda.get_current_device()
tpb = device.WARP_SIZE

# Ceiling division: enough blocks so that bpg * tpb >= size.
bpg = (size + tpb - 1) // tpb
print('Blocks per grid:', bpg)
print('Threads per block', tpb)

<Managed Device 0>
Blocks per grid: 3125000
Threads per block 32


In [32]:
# Host operands for the CUDA run; `c` only serves as the shape/dtype template
# for the device output buffer allocated further down.
a = np.ones(size)
b = np.ones_like(a)
c = np.ones_like(a)

@cuda.jit('void(f8[:],f8[:],f8[:])')
def kernel(a, b, c):
    """CUDA kernel: each thread writes ``c[i] = a[i] + b[i]`` for its own index.

    Parameters
    ----------
    a, b : 1-D float64 arrays
        Input operands.
    c : 1-D float64 array
        Output buffer; element ``i`` receives ``a[i] + b[i]``.
    """
    # Absolute index of this thread in the 1-D launch grid.
    i = cuda.grid(1)
    # The grid is rounded up to a whole number of blocks, so trailing threads
    # can have i >= len(c); without this guard they would access memory out
    # of bounds.
    if i < c.shape[0]:
        c[i] = a[i]+b[i]


def func_cuda(a, b, c):
    """Run ``kernel`` on the given arrays and return the result on the host.

    Parameters
    ----------
    a, b : 1-D device arrays
        Input operands passed straight to ``kernel``.
    c : 1-D device array
        Output buffer; must provide ``copy_to_host()``.

    Returns
    -------
    The contents of ``c`` after the launch, as returned by ``c.copy_to_host()``.
    """
    # Size the grid from the array actually passed in rather than from the
    # module-level `bpg`, which is tied to whatever `size` was when it was
    # computed. Ceiling division gives enough blocks to cover every element.
    n = c.shape[0]
    blocks = (n + tpb - 1) // tpb
    kernel[blocks, tpb](a, b, c)
    # The device-to-host copy is part of this function, so it is included in
    # any timing of func_cuda.
    res = c.copy_to_host()
    return res

device = cuda.get_current_device()

# Stage both inputs on the GPU once, outside the timed call, and allocate a
# device output buffer with the same shape and dtype as the host `c`.
da, db = (cuda.to_device(host_arr) for host_arr in (a, b))
dc = cuda.device_array_like(c)

# First launch; the host name `c` now refers to the result copied back from the GPU.
c = func_cuda(da, db, dc)


In [33]:
# Inputs are already on the device (da, db), but each timed call still
# includes the copy_to_host() of the result performed inside func_cuda.
%timeit func_cuda(da, db, dc)

439 ms ± 1.77 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
