## Tema 11: Introducción a NVIDIA CUDA

## C - Kernels en CUDA

__Primer Kernel CUDA__

In [None]:
from numba import cuda
import numpy as np

# Kernels decorated with `@cuda.jit` do not return values; the result is
# written into an output array that is passed in as an argument.
# No explicit type signature is required.
@cuda.jit
def add_kernel(x, y, out):
    """Element-wise sum: each thread computes ``out[idx] = x[idx] + y[idx]``.

    Args:
        x: first input array, indexed along one dimension.
        y: second input array, read at the same index as ``x``.
        out: output array that receives the sums.
    """
    # Absolute index of this thread in a one-dimensional grid, i.e.
    # cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    idx = cuda.grid(1)
    # Threads whose index falls past the end of the array do nothing, so the
    # kernel stays safe when the launch has more threads than elements.
    if idx < out.shape[0]:
        out[idx] = x[idx] + y[idx]


# Host inputs: h_x = [0.0 ... 4095.0], h_y = [1.0 ... 1.0]
n = 4096
h_x = np.arange(n, dtype=np.float32)
h_y = np.ones(n, dtype=np.float32)

# Device copies of the inputs, plus a device array shaped like d_x for the result
d_x, d_y = cuda.to_device(h_x), cuda.to_device(h_y)
d_out = cuda.device_array_like(d_x)

# One thread per element is needed: 32 blocks * 128 threads = 4096 threads
threads_per_block = 128
blocks_per_grid = n // threads_per_block

add_kernel[blocks_per_grid, threads_per_block](d_x, d_y, d_out)
# Per the original author's note this explicit synchronisation is not
# actually required here.
cuda.synchronize()
# Expected output: [1 ... 4096]
print(d_out.copy_to_host().astype(np.int16))

__Ejercicio 1: crear un Kernel a partir de una función__

In [None]:
import numpy as np

# Length of the vector built further down in this cell with np.arange(n)
n = 16384


def h_square(a):
    """Return ``a ** 2``; plain Python/NumPy version that runs on the host."""
    squared = a ** 2
    return squared

# TODO: implement a kernel d_square()

# Host input vector: [0.0 ... n-1] as float32
a = np.arange(n, dtype=np.float32)
# TODO: create vector d_a by copying `a` to the GPU
# TODO: create a vector on the GPU to receive the output

# TODO: replace these placeholder values and launch the kernel
blocks = 0
threads = 0

# TODO: launch as a kernel with an appropriate execution configuration
# (until then, the host function h_square is what gets called here)
out = h_square(a)

# Reference result computed on the host with NumPy
out_aux = a**2
np.testing.assert_almost_equal(out, out_aux)
# TODO: replace out_aux with the result produced by the kernel (use .copy_to_host())

## D - Uso de stride en Kernels CUDA

__Sin stride__

In [None]:
from numba import cuda
import numpy as np

@cuda.jit
def add_kernel(x, y, out):
    """Element-wise sum with one element per thread (no stride loop).

    Args:
        x: first input array, indexed along one dimension.
        y: second input array, read at the same index as ``x``.
        out: output array that receives ``x[idx] + y[idx]``.
    """
    # Absolute index of this thread in the one-dimensional grid
    idx = cuda.grid(1)
    # Ignore threads past the end of the array so that a launch with more
    # threads than elements does not index out of bounds.
    if idx < out.shape[0]:
        out[idx] = x[idx] + y[idx]

# Host inputs: h_x counts 0 .. n-1 as float32, h_y is all ones
n = 4096
h_x = np.arange(n, dtype=np.float32)
h_y = np.ones(n, dtype=np.float32)

# Device copies of the inputs, plus a device array shaped like d_x for the result
d_x, d_y = cuda.to_device(h_x), cuda.to_device(h_y)
d_out = cuda.device_array_like(d_x)

# 32 blocks * 128 threads = 4096 threads, exactly one per element
threads_per_block = 128
blocks_per_grid = n // threads_per_block

add_kernel[blocks_per_grid, threads_per_block](d_x, d_y, d_out)
# Last expression of the cell: the result copied back to the host, cast to int16
d_out.copy_to_host().astype(np.int16)

__Con stride__

In [None]:
from numba import cuda
import numpy as np

@cuda.jit
def add_kernel(x, y, out):
    """Element-wise sum in which each thread may process several elements.

    A thread starts at its own grid index and then advances by the total
    number of threads in the grid until it runs past the end of ``x``.

    Args:
        x: first input array; its first dimension bounds the loop.
        y: second input array, read at the same indices as ``x``.
        out: output array that receives ``x[i] + y[i]``.
    """
    first = cuda.grid(1)      # this thread's absolute index in the 1D grid
    step = cuda.gridsize(1)   # total number of threads in the 1D grid
    length = x.shape[0]
    for pos in range(first, length, step):
        out[pos] = x[pos] + y[pos]


# Host inputs: h_x = [0.0 ... 124999.0], h_y = all ones (float32)
n = 125000
h_x = np.arange(n).astype(np.float32)
h_y = np.ones_like(h_x)

# Device copies of the inputs, plus a device array shaped like d_x for the result
d_x = cuda.to_device(h_x)
d_y = cuda.to_device(h_y)
d_out = cuda.device_array_like(d_x)

# 56 blocks * 128 threads = 7168 threads for 125000 elements: fewer threads
# than elements, so the launch relies on add_kernel looping with a stride.
threads_per_block = 128
blocks_per_grid = 56

add_kernel[blocks_per_grid, threads_per_block](d_x, d_y, d_out)
# Show the result as int32. The values go up to 125000, which exceeds the
# float16 maximum (65504) and would be displayed as inf if cast to float16.
d_out.copy_to_host().astype(np.int32)

__Ejercicio 2: kernel CUDA con stride__

In [None]:
import numpy as np
from math import hypot
from numba import cuda

def cpu_hypot(a, b):
    """Return ``np.hypot(a, b)``, the hypotenuse for legs ``a`` and ``b``."""
    result = np.hypot(a, b)
    return result

# TODO: implement this function
# using a stride loop
def gpu_hypot_stride(a, b, c):
    """Exercise placeholder: currently does nothing and returns ``None``.

    The code at the end of this cell launches it as
    ``gpu_hypot_stride[blocks, threads_per_block](d_a, d_b, d_c)``, so the
    finished version has to be a CUDA kernel.
    """
    pass

# Do not modify anything from here on
n = 1000000
# Two host vectors of n random float32 values drawn uniformly from [-12, 12)
h_a = np.random.uniform(-12, 12, n).astype(np.float32)
h_b = np.random.uniform(-12, 12, n).astype(np.float32)
# Device copies of the inputs, plus a device array shaped like d_b for the result
d_a = cuda.to_device(h_a)
d_b = cuda.to_device(h_b)
d_c = cuda.device_array_like(d_b)

# 128 blocks * 64 threads = 8192 threads for 1000000 elements, so every
# thread has to process more than one element.
blocks = 128
threads_per_block = 64
gpu_hypot_stride[blocks, threads_per_block](d_a, d_b, d_c)
# Check the kernel output against NumPy's hypot to 5 decimal places
np.testing.assert_almost_equal(np.hypot(h_a, h_b), d_c.copy_to_host(), decimal=5)

## E - Operaciones atómicas

In [None]:
import numpy as np
from numba import cuda

@cuda.jit
def thread_counter_race_condition(global_counter):
    """Increment ``global_counter[0]`` with a plain, non-atomic update.

    Deliberately wrong: every thread reads, adds and writes back the same
    element without any coordination between threads.
    """
    # Wrong on purpose: unsynchronised read-modify-write
    global_counter[0] = global_counter[0] + 1
    
@cuda.jit
def thread_counter_safe(global_counter):
    """Add 1 to ``global_counter[0]`` through ``cuda.atomic.add``."""
    slot = 0  # index of the single counter element
    cuda.atomic.add(global_counter, slot, 1)

# This version does not work properly.
# The launch uses 64 blocks * 64 threads, so 4096 increments are attempted.
global_counter = cuda.to_device(np.array([0], dtype=np.float32))
thread_counter_race_condition[64, 64](global_counter)
print('Debería dar %d:' % (64*64), global_counter.copy_to_host().astype(np.int16))

# This version does work properly (the counter is reset to 0 first)
global_counter = cuda.to_device(np.array([0], dtype=np.float32))
thread_counter_safe[64, 64](global_counter)
print('Debería dar %d:' % (64*64), global_counter.copy_to_host().astype(np.int16))

## F - Kernels bidimensionales y tridimensionales

In [None]:
import numpy as np
from numba import cuda

@cuda.jit
def get_2D_indices(A):
    """Store each thread's 2D grid position in ``A`` as ``x + y / 10``.

    For single-digit indices the stored number reads as "x.y" (for example
    the thread at x=2, y=3 writes 2.3).

    Args:
        A: two-dimensional output array, written at ``[x, y]``.
    """
    # Both coordinates of this thread in the 2D grid. Equivalent to:
    #   x = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    #   y = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
    x, y = cuda.grid(2)

    # Threads that fall outside the matrix do nothing, so a grid larger than
    # A does not write out of bounds.
    if x < A.shape[0] and y < A.shape[1]:
        # Write index x, then '.', then index y
        A[x, y] = x + y / 10

# 4x4 float32 matrix on the GPU
d_A = cuda.device_array(shape=(4,4), dtype=np.float32)

# 2x2 blocks of 2x2 threads = 4x4 threads, one per matrix element
blocks = (2, 2) # Grid = 2x2 blocks
threads_per_block = (2, 2) # Block = 2x2 threads

get_2D_indices[blocks, threads_per_block](d_A)
# Print with one decimal place and without scientific notation
np.set_printoptions(precision=1, suppress=True)
print(d_A.copy_to_host())

__Kernel bidimensional: suma de matrices__

In [None]:
from numba import cuda
import numpy as np

@cuda.jit
def add_matrices(a, b, c):
    """Element-wise matrix sum: ``c[i, j] = a[i, j] + b[i, j]``.

    Args:
        a: first two-dimensional input array.
        b: second two-dimensional input array, read at the same indices.
        c: two-dimensional output array that receives the sums.
    """
    # Thread indices in two dimensions
    i, j = cuda.grid(2)
    # Threads outside the output matrix do nothing, so the launch grid may be
    # larger than the matrix without writing out of bounds.
    if i < c.shape[0] and j < c.shape[1]:
        c[i, j] = a[i, j] + b[i, j]

# Example usage: add two 4096 x 4096 float32 matrices
rows = cols = 4096

# Random input matrices created on the host
h_a = np.random.rand(rows, cols).astype(np.float32)
h_b = np.random.rand(rows, cols).astype(np.float32)

# Transfer the inputs to the GPU and create a device array shaped like d_b
d_a, d_b = cuda.to_device(h_a), cuda.to_device(h_b)
d_c = cuda.device_array_like(d_b)

# 32 x 32 threads per block; 128 x 128 blocks then span 4096 x 4096 threads
threads_per_block = (32, 32)
blocks = (rows // threads_per_block[0], cols // threads_per_block[1])

add_matrices[blocks, threads_per_block](d_a, d_b, d_c)

# Copy the result back to the host and compare it with the NumPy sum
h_c = d_c.copy_to_host()
np.testing.assert_almost_equal(h_c, h_a + h_b)

In [None]:
%timeit c_aux= (h_a + h_b)
%timeit add_matrices[blocks, threads_per_block](d_a, d_b, d_c)

__Ejercicio 3: kernel bidimensional para procesar una imagen__

In [None]:
# Necesitamos skimage
# Instalar con
#       conda install scikit-image

import matplotlib.pyplot as plt
from skimage import data, color
import numpy as np

@cuda.jit
def blur(input, output):
    """Apply one blur step from ``input`` into ``output``.

    Interior elements become the average of their four direct neighbours
    (previous/next along each axis). Elements on the border are copied
    unchanged.

    Args:
        input: two-dimensional source array. The name shadows the Python
            builtin but is kept so existing callers keep working.
        output: two-dimensional destination array, written at the same
            indices that are read from ``input``.
    """
    x, y = cuda.grid(2)
    n_rows = input.shape[0]
    n_cols = input.shape[1]

    # Threads outside the array do nothing; without this guard the border
    # branch below would write out of bounds whenever the grid is larger
    # than the array.
    if x >= n_rows or y >= n_cols:
        return

    if 0 < x < n_rows - 1 and 0 < y < n_cols - 1:
        output[x, y] = 0.25 * (
            input[x - 1, y] + input[x + 1, y] + input[x, y - 1] + input[x, y + 1]
        )
    else:
        # Border element: no full neighbourhood, copy the value as is
        output[x, y] = input[x, y]
# TODO: define the grid size and the block size
num_cycles = 100

# Grayscale version of the sample image, transformed as (255 - gray) / 255.
# NOTE(review): if rgb2gray returns floats in [0, 1], this maps the image to a
# narrow range just below 1.0 rather than to [0, 1] - confirm this is intended.
astronaut = (255.-color.rgb2gray(data.astronaut()))/255.0
print("Image size: ",astronaut.shape)
# Show the original image
fig, ax = plt.subplots()
im = ax.imshow(astronaut,  cmap='Greys')

# TODO: copy the data to the GPU (duplicate the image)

# TODO: run the blur kernel num_cycles times

# TODO: copy the blurred image back to the host

# Show the blurred image; `astronaut_blurred` has to be defined by the TODO above
fig, ax = plt.subplots()
im = ax.imshow(astronaut_blurred, cmap='Greys')