In [7]:
import numpy as np
from numba import cuda

In [2]:
@cuda.jit('int32(int32, int32)', device=True)
def add(a, b):
    return a + b

In [3]:
@cuda.jit('void(int32[:], int32[:], int32[:])')
def add_arrays(aryA, aryB, aryOut):
    i = cuda.grid(1) # global position of the thread for a 1D grid.
    aryOut[i] = add(aryA[i], aryB[i])

In [10]:
out = np.array([1], dtype='int32')
add_arrays(np.array([1], dtype='int32'), np.array([1], dtype='int32'), out)
out

array([2], dtype=int32)

In [6]:
import cupy as cp

In [14]:
x = cp.ones((10000, 100))
x.dtype, x.shape

(dtype('float64'), (10000, 100))

In [17]:
y = cp.arange(x.size).reshape(x.shape)
y.shape

(10000, 100)

In [18]:
y[:5, :5]

array([[  0,   1,   2,   3,   4],
       [100, 101, 102, 103, 104],
       [200, 201, 202, 203, 204],
       [300, 301, 302, 303, 304],
       [400, 401, 402, 403, 404]])

In [19]:
%%time
z = cp.dot(x, y.T)

CPU times: user 300 ms, sys: 92.7 ms, total: 393 ms
Wall time: 390 ms


In [22]:
z.device

<CUDA Device 0>

In [24]:
cp.__version__

'7.3.0'

In [25]:
xnp, ynp = cp.asnumpy(y), cp.asnumpy(y)

In [26]:
%%time
znp = np.dot(xnp, ynp.T)

CPU times: user 5.51 s, sys: 43.7 ms, total: 5.56 s
Wall time: 5.55 s
