In [1]:
from numba import cuda

# Query Numba once for CUDA support and reuse the answer for both the
# summary line and the branch below.
cuda_ok = cuda.is_available()
print("CUDA available:", cuda_ok)

if not cuda_ok:
    print("⚠️ Numba cannot access the GPU.")
else:
    # Report the device Numba has selected for the current context.
    device = cuda.get_current_device()
    print("GPU detected:", device.name)
    print("Compute capability:", device.compute_capability)


CUDA available: True
GPU detected: b'NVIDIA GeForce GTX 1060 6GB'
Compute capability: (6, 1)


In [2]:
from numba import cuda
import numpy as np

@cuda.jit
def add_kernel(a, b, out):
    """Element-wise addition kernel: store ``a[i] + b[i]`` in ``out[i]``.

    Each thread handles the single element at its 1-D grid index
    (``cuda.grid(1)``). Threads whose index is at or beyond ``out.size``
    do nothing. Only ``out.size`` is checked, so ``a`` and ``b`` are
    assumed to hold at least as many elements as ``out``.
    """
    i = cuda.grid(1)
    if i >= out.size:
        # Surplus thread past the end of the output: nothing to write.
        return
    out[i] = a[i] + b[i]
# Create host arrays: both inputs are all ones, so every sum should be 2.
n = 1_000_000
a = np.ones(n, dtype=np.float32)
b = np.ones(n, dtype=np.float32)
out = np.zeros(n, dtype=np.float32)

# Copy the inputs to the GPU.
d_a = cuda.to_device(a)
d_b = cuda.to_device(b)
# add_kernel writes every index below out.size, so the output buffer does not
# need initial contents: allocate it on the device (same shape/dtype as `out`)
# instead of paying for a host-to-device copy of zeros.
d_out = cuda.device_array_like(out)

# Configure kernel launch: ceiling division so the grid covers all n elements
# even when n is not a multiple of the block size.
threads_per_block = 256
blocks_per_grid = (n + threads_per_block - 1) // threads_per_block

print(f"Launching kernel: {blocks_per_grid} blocks × {threads_per_block} threads")

# Launch kernel
add_kernel[blocks_per_grid, threads_per_block](d_a, d_b, d_out)

# Copy result back to the host
result = d_out.copy_to_host()

print("Done!")

# Verify on the host. 1.0 + 1.0 is exactly representable in float32, so an
# exact equality comparison is appropriate here.
if np.all(result == 2):
    print("✅ SUCCESS — GPU computed correct result.")
else:
    print("❌ ERROR — incorrect output.")
    print(result[:10])


Launching kernel: 3907 blocks × 256 threads
Done!
✅ SUCCESS — GPU computed correct result.
