In [1]:
!uv pip install -q --system numba-cuda==0.4.0

import numpy as np
import time
import os

# Enable the CUDA simulator before importing Numba: the environment variable
# is assigned ahead of the `from numba import ...` lines below, so it is
# already in place when Numba is first loaded.
os.environ["NUMBA_ENABLE_CUDASIM"] = "1"

from numba import cuda
from numba import config

# NOTE(review): presumably this selects the pynvjitlink-based linker in
# numba-cuda; it likely has no effect while the simulator is active -- confirm
# whether it is still needed here.
config.CUDA_ENABLE_PYNVJITLINK = 1

# Letters A-H; `data` stores their ASCII code points, one uint8 per letter.
characters = list("ABCDEFGH")
data = np.array(list(map(ord, characters)), dtype=np.uint8)

@cuda.jit
def kernel_1d_dims(arr):
    """Print the global 1D thread index and the character stored at that index.

    Each thread obtains its index from ``cuda.grid(1)``. Threads whose index
    falls outside ``arr`` print nothing, so the kernel can be launched with
    more threads than there are elements.

    Args:
        arr: 1D array of character codes; each element is passed to ``chr()``.

    NOTE(review): the f-string and ``chr()`` are plain-Python features;
    presumably this kernel only runs under the CUDA simulator enabled in this
    notebook -- confirm before targeting a real device.
    """
    gidx = cuda.grid(1)

    # Guard clause: surplus threads have no element to report.
    if gidx >= arr.size:
        return

    print(f"GID: {gidx}, Char: {chr(arr[gidx])}")


In [3]:
# Example 1: using blocks and threads.
# 2 blocks x 2 threads per block = 4 threads in total, so only the first four
# elements of `data` are reported.
blocks_per_grid_ex1, threads_per_block_ex1 = 2, 2

# Launch syntax: kernel[blocks_per_grid, threads_per_block](arguments).
kernel_1d_dims[blocks_per_grid_ex1, threads_per_block_ex1](data)
cuda.synchronize()

GID: 0, Char: A
GID: 1, Char: B
GID: 2, Char: C
GID: 3, Char: D


In [5]:
# Example 2: blocks and threads.
# 2 blocks x 8 threads per block = 16 threads in total; `data` has only 8
# elements, so the bounds check inside the kernel silences the other 8 threads.
# NOTE(review): the `_ex1` suffix is reused here and overwrites the values set
# in Example 1; the names are kept as-is in case other cells read them.
blocks_per_grid_ex1, threads_per_block_ex1 = 2, 8

# Launch syntax: kernel[blocks_per_grid, threads_per_block](arguments).
kernel_1d_dims[blocks_per_grid_ex1, threads_per_block_ex1](data)
cuda.synchronize()

GID: 0, Char: A
GID: 1, Char: B
GID: 2, Char: C
GID: 3, Char: D
GID: 4, Char: E
GID: 5, Char: F
GID: 6, Char: G
GID: 7, Char: H


In [7]:
# Example 3: blocks and threads in 2D

@cuda.jit
def whoami():
    """Print this thread's linear global id together with its coordinates.

    The 2D block index and the 2D thread index are each flattened with x as
    the fastest-varying axis. The global id is the flattened block index times
    the number of threads per block, plus the flattened thread index.

    One line is printed per thread, showing the global id, the block
    coordinates and flattened block id, the thread coordinates and flattened
    thread id, and the block and grid dimensions.
    """
    # Flatten (blockIdx.x, blockIdx.y) into a single block number.
    block_id = cuda.gridDim.x * cuda.blockIdx.y + cuda.blockIdx.x

    # Flatten (threadIdx.x, threadIdx.y) into a position inside the block.
    thread_offset = cuda.blockDim.x * cuda.threadIdx.y + cuda.threadIdx.x

    # Number of threads in one block, and the id of this block's first thread.
    block_size = cuda.blockDim.y * cuda.blockDim.x
    first_id_in_block = block_size * block_id

    global_id = thread_offset + first_id_in_block

    print(f"{global_id:03d} | Block({cuda.blockIdx.x},{cuda.blockIdx.y})={block_id:3d} | "
          f"Thread({cuda.threadIdx.x},{cuda.threadIdx.y})={thread_offset:3d} "
          f"BlockDim({cuda.blockDim.x},{cuda.blockDim.y}) GridDim({cuda.gridDim.x},{cuda.gridDim.y})")


# Launch configuration: a 2 x 2 grid of blocks, each block holding 4 x 1 threads.
(b_x, b_y), (t_x, t_y) = (2, 2), (4, 1)
blocks_per_grid, threads_per_block = (b_x, b_y), (t_x, t_y)

# NOTE: despite its name, `total_threads` is the number of threads in ONE
# block; the grid-wide count is `total_blocks * total_threads`.
total_threads = t_x * t_y
total_blocks = b_x * b_y

# Summarize the configuration before launching.
print(f"{total_blocks} blocks/grid")
print(f"{total_threads} threads/block")
print(f"{total_blocks * total_threads} total threads\n")

# Launch syntax: kernel[blocks_per_grid, threads_per_block](); whoami takes no arguments.
whoami[blocks_per_grid, threads_per_block]()
cuda.synchronize()


4 blocks/grid
4 threads/block
16 total threads

000 | Block(0,0)=  0 | Thread(0,0)=  0 BlockDim(4,1) GridDim(2,2)
001 | Block(0,0)=  0 | Thread(1,0)=  1 BlockDim(4,1) GridDim(2,2)
002 | Block(0,0)=  0 | Thread(2,0)=  2 BlockDim(4,1) GridDim(2,2)
003 | Block(0,0)=  0 | Thread(3,0)=  3 BlockDim(4,1) GridDim(2,2)
008 | Block(0,1)=  2 | Thread(0,0)=  0 BlockDim(4,1) GridDim(2,2)
009 | Block(0,1)=  2 | Thread(1,0)=  1 BlockDim(4,1) GridDim(2,2)
010 | Block(0,1)=  2 | Thread(2,0)=  2 BlockDim(4,1) GridDim(2,2)
011 | Block(0,1)=  2 | Thread(3,0)=  3 BlockDim(4,1) GridDim(2,2)
004 | Block(1,0)=  1 | Thread(0,0)=  0 BlockDim(4,1) GridDim(2,2)
005 | Block(1,0)=  1 | Thread(1,0)=  1 BlockDim(4,1) GridDim(2,2)
006 | Block(1,0)=  1 | Thread(2,0)=  2 BlockDim(4,1) GridDim(2,2)
007 | Block(1,0)=  1 | Thread(3,0)=  3 BlockDim(4,1) GridDim(2,2)
012 | Block(1,1)=  3 | Thread(0,0)=  0 BlockDim(4,1) GridDim(2,2)
013 | Block(1,1)=  3 | Thread(1,0)=  1 BlockDim(4,1) GridDim(2,2)
014 | Block(1,1)=  3 | Threa