<a href="https://colab.research.google.com/github/sappyb/Codes-Doc/blob/master/cuda_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!uv pip install -q --system numba-cuda==0.4.0
import os
# Turn on Numba's CUDA_ENABLE_PYNVJITLINK option for this session.
# NOTE(review): presumably this must be set before `numba.cuda` is first imported
# for it to take effect — confirm against the numba-cuda documentation.
from numba import config
config.CUDA_ENABLE_PYNVJITLINK = 1

In [2]:
from numba import cuda
# detect() writes the device report itself (it should list a GPU device) and
# hands back a flag; echoing the flag is what produces the final line of output.
detect_ok = cuda.detect()
print(detect_ok)


Found 1 CUDA devices
id 0             b'Tesla T4'                              [SUPPORTED]
                      Compute Capability: 7.5
                           PCI Device ID: 4
                              PCI Bus ID: 0
                                    UUID: GPU-90b0b386-3a86-eb8a-3365-d9c86cd16b5e
                                Watchdog: Disabled
             FP32/FP64 Performance Ratio: 32
Summary:
	1/1 devices are supported
True


In [3]:
import numpy as np
from numba import cuda

# Prepare data
# A fixed seed makes the inputs reproducible across Restart & Run All.
SEED = 0
N = 100_000_000  # elements per vector
rng = np.random.default_rng(SEED)

# Draw float32 values directly instead of generating float64 and casting:
# this skips a temporary array twice the size of each input vector.
A = rng.random(N, dtype=np.float32)
B = rng.random(N, dtype=np.float32)

# Host-side output buffer with the same shape and dtype as A.
C = np.zeros_like(A)

In [4]:
# Upload both input vectors to the GPU (A first, then B)
dA, dB = (cuda.to_device(host_vec) for host_vec in (A, B))
# Device-side array modelled on A; the kernel launches below use it as their output
dC = cuda.device_array_like(A)

In [5]:
from numba import cuda

# Handle to the CUDA device Numba is currently using
dev = cuda.get_current_device()

# 1. Upper limit on threads in a single block
max_threads = dev.MAX_THREADS_PER_BLOCK
print("Max threads per block:", max_threads)

# 2. Per-axis upper limits on block shape
block_limits = (dev.MAX_BLOCK_DIM_X, dev.MAX_BLOCK_DIM_Y, dev.MAX_BLOCK_DIM_Z)
print("Max block dim (x,y,z):", *block_limits)

# 3. Per-axis upper limits on grid shape
grid_limits = (dev.MAX_GRID_DIM_X, dev.MAX_GRID_DIM_Y, dev.MAX_GRID_DIM_Z)
print("Max grid dim (x,y,z):", *grid_limits)

# 4. Warp size (threads are scheduled in warps of this size)
warp = dev.WARP_SIZE
print("Warp size:", warp)

# 5. Multiprocessor count (relevant for occupancy considerations)
sm_count = dev.MULTIPROCESSOR_COUNT
print("Multiprocessors:", sm_count)
# Deliberately left disabled: the per-multiprocessor thread limit was attempted
# under the attribute name below (SM = Streaming Multiprocessor).
# NOTE(review): the correct attribute name may depend on the CUDA / numba-cuda
# version — confirm before enabling.
#print("Max threads per multiprocessor:", dev.MAX_THREADS_PER_SM)

Max threads per block: 1024
Max block dim (x,y,z): 1024 1024 64
Max grid dim (x,y,z): 2147483647 65535 65535
Warp size: 32
Multiprocessors: 40


In [6]:
# Candidate 1D launch shape: the largest block size, and enough blocks to cover N
threads_per_block = 1024
blocks = -(-N // threads_per_block)  # ceiling division

# Validate the 1D configuration against the limits reported by the device
assert dev.MAX_THREADS_PER_BLOCK >= threads_per_block   # block not too large
assert dev.MAX_GRID_DIM_X >= blocks                     # grid fits along x
assert not threads_per_block % dev.WARP_SIZE            # whole number of warps per block
print("Launch configuration is compatible!")


Launch configuration is compatible!


In [7]:
@cuda.jit
def add_kernel(a, b, c):
    """Element-wise addition: each thread writes c[i] = a[i] + b[i] for its own i.

    Args:
        a: first input array.
        b: second input array, indexed in step with ``a``.
        c: output array receiving the sums.

    Threads whose 1D grid index is not below ``a.size`` do nothing.
    """
    idx = cuda.grid(1)  # this thread's 1D grid index
    if idx >= a.size:
        # Surplus thread past the end of the data
        return
    c[idx] = a[idx] + b[idx]

In [8]:
# One thread per element: round the block count up so that a partially
# filled final block covers whatever is left over.
threads_per_block = 256
full_blocks, leftover = divmod(N, threads_per_block)
blocks = full_blocks + (1 if leftover else 0)

configured_kernel = add_kernel[blocks, threads_per_block]
configured_kernel(dA, dB, dC)

In [9]:
import time
import pandas as pd

# Try different block sizes
thread_options = [128, 256, 512, 1024]
results = []

# Host-side reference result. It does not depend on the launch configuration,
# so it is computed once instead of once per block size.
expected_sum = A + B

for threads_per_block in thread_options:
    # Round up so the last (possibly partial) block still covers the tail of the data
    blocks = (N + threads_per_block - 1) // threads_per_block
    # Allocate output on GPU
    dC = cuda.device_array_like(A)

    # Warm up (untimed launch)
    add_kernel[blocks, threads_per_block](dA, dB, dC)
    cuda.synchronize()

    # Time a single launch from the host. perf_counter() is a monotonic,
    # high-resolution clock meant for measuring short intervals; time.time()
    # is a wall clock and can jump if the system time is adjusted.
    start = time.perf_counter()
    add_kernel[blocks, threads_per_block](dA, dB, dC)
    cuda.synchronize()  # wait for the GPU before reading the clock again
    elapsed = time.perf_counter() - start

    # Copy back and verify against the host reference
    C = dC.copy_to_host()
    correct = np.allclose(C, expected_sum)

    # elapsed is in seconds; the table reports milliseconds
    results.append((threads_per_block, blocks, elapsed * 1000, correct))

# Display results
df = pd.DataFrame(results, columns=["Threads/block", "Blocks", "Time (ms)", "Correct"])
print(df)

   Threads/block  Blocks  Time (ms)  Correct
0            128  781250   4.710197     True
1            256  390625   4.708529     True
2            512  195313   4.801989     True
3           1024   97657   5.205154     True


In [10]:
# Benchmark settings
thread_options = [128, 256, 512]
n_iters = 100  # timed launches per configuration, averaged to smooth out jitter
results = []

# Host-side reference result. It is the same for every block size, so compute
# it once rather than on every pass through the loop.
expected_sum = A + B

for threads_per_block in thread_options:
    # Round up so the last (possibly partial) block still covers the tail of the data
    blocks = (N + threads_per_block - 1) // threads_per_block
    dC = cuda.device_array_like(A)

    # Warm up (untimed launch)
    add_kernel[blocks, threads_per_block](dA, dB, dC)
    cuda.synchronize()

    # Create CUDA events for precise timing
    start_evt = cuda.event()
    end_evt = cuda.event()

    # Bracket the whole batch of launches with a start and an end event
    start_evt.record()
    for _ in range(n_iters):
        add_kernel[blocks, threads_per_block](dA, dB, dC)
    end_evt.record()
    end_evt.synchronize()  # block until the end event has been reached

    # Average elapsed time per launch (ms).
    # NOTE(review): an earlier attempt used `start_evt.time_till(end_evt)` and
    # raised an error, hence the module-level helper here.
    elapsed_ms = cuda.event_elapsed_time(start_evt, end_evt) / n_iters

    # Copy result back and verify correctness against the host reference
    C = dC.copy_to_host()
    correct = np.allclose(C, expected_sum)

    results.append((threads_per_block, blocks, elapsed_ms, correct))

# Prepare results (displayed in the next cell)
df = pd.DataFrame(results, columns=["Threads/block", "Blocks", "Time (ms)", "Correct"])

In [11]:
# Show the event-timed benchmark table built in the previous cell
print(df)

   Threads/block  Blocks  Time (ms)  Correct
0            128  781250   4.592357     True
1            256  390625   4.638435     True
2            512  195313   4.632944     True


In [13]:
import numpy as np
from numba import cuda
import time
import pandas as pd

# Vector-addition kernel.
# NOTE(review): this repeats the add_kernel defined earlier in the notebook,
# apparently so that this cell can run on its own — confirm the duplication is intended.
@cuda.jit
def add_kernel(a, b, c):
    """Write a[i] + b[i] into c[i], one element per thread.

    Args:
        a: first input array.
        b: second input array, indexed in step with ``a``.
        c: output array receiving the sums.

    Only threads whose 1D grid index is below ``a.size`` perform a write.
    """
    pos = cuda.grid(1)
    in_bounds = pos < a.size
    if in_bounds:
        c[pos] = a[pos] + b[pos]

# Problem size
N = 100_000_000  # adjust as needed
SEED = 0  # fixed seed so the benchmark inputs are reproducible

# Prepare host data. Drawing float32 directly avoids building a float64
# temporary (twice the size) for each vector and then casting it down.
rng = np.random.default_rng(SEED)
A = rng.random(N, dtype=np.float32)
B = rng.random(N, dtype=np.float32)

# Host-side reference result; identical for every launch configuration,
# so it is computed once up front.
expected_sum = A + B

# Copy inputs to device once
dA = cuda.to_device(A)
dB = cuda.to_device(B)

# Benchmark settings
thread_options = [128, 256, 512]
results = []

# Traffic per run: each element is read twice (a, b) and written once (c).
# The element size is taken from the array itself (4 bytes for float32).
bytes_moved = N * 3 * A.itemsize

for threads_per_block in thread_options:
    # Round up so the last (possibly partial) block still covers the tail of the data
    blocks = (N + threads_per_block - 1) // threads_per_block
    dC = cuda.device_array_like(A)

    # Warm up (untimed launch)
    add_kernel[blocks, threads_per_block](dA, dB, dC)
    cuda.synchronize()

    # Precise GPU timing with events bracketing a single launch
    start_evt = cuda.event()
    end_evt = cuda.event()
    start_evt.record()
    add_kernel[blocks, threads_per_block](dA, dB, dC)
    end_evt.record()
    end_evt.synchronize()  # block until the end event has been reached
    # Elapsed time between the two events, in milliseconds
    elapsed_ms = cuda.event_elapsed_time(start_evt, end_evt)
    elapsed_s = elapsed_ms / 1000

    # Copy back & verify against the host reference
    C = dC.copy_to_host()
    correct = bool(np.allclose(C, expected_sum))

    # Derived metrics
    bandwidth_gbs = (bytes_moved / elapsed_s) / 1e9
    # Each element: 1 add → N flops
    gflops = (N / elapsed_s) / 1e9

    results.append({
        "Threads/block": threads_per_block,
        "Blocks": blocks,
        "Time (ms)": round(elapsed_ms, 3),
        "Bandwidth (GB/s)": round(bandwidth_gbs, 1),
        "Throughput (GFLOP/s)": round(gflops, 1),
        "Correct": correct
    })

df = pd.DataFrame(results)
print(df)

   Threads/block  Blocks  Time (ms)  Bandwidth (GB/s)  Throughput (GFLOP/s)  \
0            128  781250      4.725             254.0                  21.2   
1            256  390625      4.688             256.0                  21.3   
2            512  195313      4.758             252.2                  21.0   

   Correct  
0     True  
1     True  
2     True  
