In [1]:
import numpy as np
import time
from numba import cuda, config

config.CUDA_ENABLE_PYNVJITLINK = 0

print("Numba CUDA available:", cuda.is_available())

@cuda.jit
def first_kernel(a, result):
    idx = cuda.grid(1)
    if idx < a.size:
        result[idx] = a[idx]

def main():
    # 1. Initialize data on CPU
    N = 10_000_000
    a_cpu = np.arange(N, dtype=np.float32)

    # ---------------- CPU ----------------
    start = time.time()
    result_cpu = a_cpu
    cpu_time = time.time() - start
    print(f"CPU Time: {cpu_time * 1e3:.2f} ms")

    # ---------------- GPU ----------------
    start = time.time()
    a_gpu = cuda.to_device(a_cpu)
    result_gpu = cuda.device_array_like(a_cpu)
    transfer_in_time = time.time() - start

    threads_per_block = 128
    blocks_per_grid = (N + threads_per_block - 1) // threads_per_block

    start = time.time()
    first_kernel[blocks_per_grid, threads_per_block](a_gpu, result_gpu)
    cuda.synchronize()
    kernel_time = time.time() - start

    start = time.time()
    result_from_gpu = result_gpu.copy_to_host()
    transfer_out_time = time.time() - start

    print(f"GPU transfer to device: {transfer_in_time * 1e3:.2f} ms")
    print(f"GPU kernel execution:   {kernel_time * 1e3:.2f} ms")
    print(f"GPU transfer to host:   {transfer_out_time * 1e3:.2f} ms")

    total_gpu_time_ms = (transfer_in_time + kernel_time + transfer_out_time) * 1e3
    print(f"Total GPU time:         {total_gpu_time_ms:.2f} ms")

if __name__ == "__main__":
    main()


Numba CUDA available: True
CPU Time: 0.00 ms
GPU transfer to device: 244.90 ms
GPU kernel execution:   1741.80 ms
GPU transfer to host:   15.45 ms
Total GPU time:         2002.15 ms
