In [24]:
import numba as nb
import numpy as np
from numba import cuda


In [25]:
import numba as nb
import numpy as np
from numba import cuda

@cuda.jit('void(float32[:], float32[:], float32[:])')
def cu_add1(a, b, c):
    """Element-wise 1-D add: each thread computes a single ``c[i] = a[i] + b[i]``.

    Parameters
    ----------
    a, b : 1-D float32 arrays
        Input vectors (assumed to hold at least ``c.size`` elements -- the
        kernel only checks the index against ``c``).
    c : 1-D float32 array
        Output vector, written in place.

    The launch grid may contain more threads than ``c`` has elements;
    surplus threads return without touching memory.
    """
    block_index = cuda.blockIdx.x   # which block in the grid?
    block_width = cuda.blockDim.x   # how many threads per block?
    thread_index = cuda.threadIdx.x  # thread position within its block
    i = thread_index + block_index * block_width

    # Valid indices are 0 .. c.size - 1, so i == c.size must be rejected too;
    # a strict `>` would let that thread read/write one element past the end.
    if i >= c.size:
        return

    c[i] = a[i] + b[i]
    
    
device = cuda.get_current_device()

n = 100

# Host memory
a = np.arange(n, dtype=np.float32)
b = np.arange(n, dtype=np.float32)

# Assign equivalent storage on device
da = cuda.to_device(a)
db = cuda.to_device(b)

# Assign storage on device for output
dc = cuda.device_array_like(a)

# Set up enough threads for kernel
tpb = device.WARP_SIZE
bpg = int(np.ceil(float(n)/tpb))
print ('Blocks per grid:', bpg)
print ('Threads per block', tpb)

# Launch kernel
%timeit cu_add1[bpg, tpb](da, db, dc)

# Transfer output from device to host
c = dc.copy_to_host()

print (c)    
#
@cuda.jit('void(float32[:,:], float32[:,:], float32, float32[:,:])')
def cu_add_2d(a, b, m, c):
    """Scaled element-wise 2-D add: ``c[i, j] = m * (a[i, j] + b[i, j])``.

    Parameters
    ----------
    a, b : 2-D float32 arrays
        Inputs (assumed to be at least as large as ``c`` along both axes --
        only ``c.shape`` is checked).
    m : float32
        Scalar multiplier applied to the sum.
    c : 2-D float32 array
        Output, written in place.

    Each thread handles the single element at its absolute 2-D grid position;
    threads that fall outside ``c`` do nothing.
    """
    i, j = cuda.grid(2)

    # No barrier is needed here: threads never exchange data (no shared
    # memory), so a block-wide syncthreads() would only add overhead.
    if i < c.shape[0] and j < c.shape[1]:
        c[i, j] = m * (a[i, j] + b[i, j])
    
    
device = cuda.get_current_device()

n = 2023
p = 2023
a = np.random.random((n, p)).astype(np.float32)
b = np.ones((n, p)).astype(np.float32)
c = np.empty_like(a)

threadsperblock = (32, 32)
blockspergrid_x = (n + threadsperblock[0]) // threadsperblock[0]
blockspergrid_y = (p + threadsperblock[1]) // threadsperblock[1]
blockspergrid = (blockspergrid_x, blockspergrid_y)

print (blockspergrid, threadsperblock)
mm = np.float32(2.0)
for i in range(4000):
    cu_add_2d[blockspergrid, threadsperblock](a, b,mm, c)
%timeit cu_add_2d[blockspergrid, threadsperblock](a, b,mm, c)
print (a[-5:, -5:])
print (b[-5:, -5:])
print (c[-5:, -5:])

cuda.close()  # ha.  needs this.

Blocks per grid: 4
Threads per block 32
47.3 µs ± 311 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
[  0.   2.   4.   6.   8.  10.  12.  14.  16.  18.  20.  22.  24.  26.
  28.  30.  32.  34.  36.  38.  40.  42.  44.  46.  48.  50.  52.  54.
  56.  58.  60.  62.  64.  66.  68.  70.  72.  74.  76.  78.  80.  82.
  84.  86.  88.  90.  92.  94.  96.  98. 100. 102. 104. 106. 108. 110.
 112. 114. 116. 118. 120. 122. 124. 126. 128. 130. 132. 134. 136. 138.
 140. 142. 144. 146. 148. 150. 152. 154. 156. 158. 160. 162. 164. 166.
 168. 170. 172. 174. 176. 178. 180. 182. 184. 186. 188. 190. 192. 194.
 196. 198.]
(64, 64) (32, 32)
13.6 ms ± 155 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
[[0.48471838 0.19326018 0.24558993 0.9512417  0.5472509 ]
 [0.19016863 0.6582351  0.7993192  0.24108073 0.50878245]
 [0.16148567 0.49161237 0.2185717  0.8951179  0.8927562 ]
 [0.26769117 0.5325211  0.48802334 0.32992548 0.45702055]
 [0.31001717 0.85399127 0.42110714 0.8299771  0.9273521 ]

In [23]:
# Rebind A, B and C to nb.SmartArray wrappers of their current values.
# NOTE(review): A, B and C are not defined in any visible cell, and the
# recorded output for this cell is "NameError: name 'A' is not defined" --
# confirm where these arrays are supposed to come from. This cell's
# execution count (23) is also lower than the cells above it, so it appears
# to rely on out-of-order / stale kernel state.
# NOTE(review): SmartArray is believed to be deprecated/removed in newer
# Numba releases -- verify against the installed version.
A=nb.SmartArray(A)
B=nb.SmartArray(B)
C=nb.SmartArray(C)

NameError: name 'A' is not defined