## Tema 11: Introducción a NVIDIA CUDA (soluciones)

## C - Kernels en CUDA
__Ejercicio 1: crear un Kernel a partir de una función__

In [None]:
import numpy as np
from numba import cuda

n = 1024*512

def h_square(a):
    """Return ``a`` raised to the power 2 (host/CPU reference implementation).

    Works on anything that supports ``**``; with a NumPy array the
    operation is applied element-wise.
    """
    squared = a ** 2
    return squared

@cuda.jit
def d_square(a, a_squared):
    """CUDA kernel: store the square of each element of ``a`` in ``a_squared``.

    Each thread handles the single element located at its global 1D
    thread index.

    Parameters
    ----------
    a : 1D device array
        Input values.
    a_squared : 1D device array
        Output buffer; must have at least as many elements as ``a``.
    """
    idx = cuda.grid(1)
    # Guard against surplus threads: when the launch grid is larger than
    # the array, indexing past the end would read/write out of bounds.
    if idx < a.shape[0]:
        a_squared[idx] = a[idx]**2

# Host input: 0, 1, ..., n-1 as float32.
h_a = np.arange(n, dtype=np.float32)

# Device copies: input is uploaded, output is only allocated (not initialised).
d_a = cuda.to_device(h_a)
d_out = cuda.device_array_like(h_a)

# Launch configuration: fixed block size, grid size rounded up so that
# blocks * threads >= n and every element gets a thread.
threads = 512
blocks = (n + threads - 1) // threads
d_square[blocks, threads](d_a, d_out)

# CPU reference result.
h_out = h_square(h_a)

# Compare with a relative tolerance: the squares reach ~1e11, where an
# absolute "decimal places" tolerance is meaningless for float32.
np.testing.assert_allclose(d_out.copy_to_host(), h_out, rtol=1e-6)

In [None]:
%timeit h_square(h_a)
%timeit d_square[blocks,threads](d_a,d_out)

## D - Uso de stride en Kernels CUDA

__Ejercicio 2: kernel CUDA con stride__



In [None]:
import numpy as np
from math import hypot
from numba import cuda

def cpu_hypot(a, b):
    """CPU reference: element-wise hypotenuse of ``a`` and ``b`` via NumPy."""
    result = np.hypot(a, b)
    return result

@cuda.jit
def gpu_hypot_stride(a, b, c):
    """CUDA kernel: ``c[i] = hypot(a[i], b[i])`` using a grid-stride loop.

    Each thread begins at its global 1D index and advances by the total
    number of threads in the grid until it passes the end of ``a``, so
    the whole array is covered regardless of how many threads are launched.
    """
    n_items = a.shape[0]
    step = cuda.gridsize(1)
    for pos in range(cuda.grid(1), n_items, step):
        c[pos] = hypot(a[pos], b[pos])


# Do not modify the following lines
# Host inputs: two arrays of n uniform random samples between -12 and 12,
# cast to float32.
n = 1000000
h_a = np.random.uniform(-12, 12, n).astype(np.float32)
h_b = np.random.uniform(-12, 12, n).astype(np.float32)
# Upload the inputs and allocate an output buffer shaped like d_b.
d_a = cuda.to_device(h_a)
d_b = cuda.to_device(h_b)
d_c = cuda.device_array_like(d_b)

# 128 * 64 = 8192 threads for 1,000,000 elements: far fewer threads than
# elements, so the kernel's loop has to visit several elements per thread.
blocks = 128
threads_per_block = 64
gpu_hypot_stride[blocks, threads_per_block](d_a, d_b, d_c)
# Check the GPU result against NumPy's hypot to 5 decimal places.
np.testing.assert_almost_equal(np.hypot(h_a, h_b), d_c.copy_to_host(), decimal=5)

In [None]:
%timeit cpu_hypot(h_a,h_b)
%timeit gpu_hypot_stride[128, 64](d_a, d_b, d_c)

__Ejercicio 3: kernel bidimensional para procesar una imagen__

In [None]:
# Necesitamos skimage
# Instalar con
#       conda install scikit-image

import matplotlib.pyplot as plt
from skimage import data, color
import numpy as np

@cuda.jit
def blur(input, output):
    """CUDA kernel: one pass of a 4-neighbour average over a 2D array.

    Interior pixels receive the mean of their up/down/left/right
    neighbours; pixels on the border are copied through unchanged.

    Parameters
    ----------
    input : 2D device array
        Source image (read only).
    output : 2D device array
        Destination image, same shape as ``input``. It must be a
        different buffer from ``input``: neighbouring threads read the
        values this thread would otherwise be overwriting.
    """
    x, y = cuda.grid(2)
    rows = input.shape[0]
    cols = input.shape[1]

    # Surplus threads (grid larger than the image) must not touch memory;
    # without this guard they would fall into the border branch below and
    # access elements outside the arrays.
    if x >= rows or y >= cols:
        return

    if x > 0 and y > 0 and x < rows - 1 and y < cols - 1:
        output[x, y] = 0.25 * (input[x - 1, y] + input[x + 1, y]
                               + input[x, y - 1] + input[x, y + 1])
    else:
        # Border pixel: it lacks one or more neighbours, so copy it as is.
        output[x, y] = input[x, y]

# Launch configuration: 16 x 32 = 512 threads along each axis.
# NOTE(review): presumably chosen to match a 512x512 image -- confirm
# against the size printed below.
blocks = (16, 16) # 2D grid
threads_per_block = (32, 32)  # 2D block
num_cycles = 100

astronaut = (255.-color.rgb2gray(data.astronaut()))/255.0
print("Image size: ",astronaut.shape)

fig, ax = plt.subplots()
im = ax.imshow(astronaut,  cmap='Greys')

d_astronaut = cuda.to_device(astronaut)
d_astronaut_blurred = cuda.device_array_like(d_astronaut)

# Ping-pong between two distinct device buffers. The kernel reads the
# neighbours of every pixel, so source and destination must never be the
# same array; after each pass the roles are swapped so the next pass
# reads what was just written.
d_src, d_dst = d_astronaut, d_astronaut_blurred
for _ in range(num_cycles):
    blur[blocks, threads_per_block](d_src, d_dst)
    cuda.synchronize()
    d_src, d_dst = d_dst, d_src

# After the final swap the most recently written buffer is d_src.
d_astronaut_blurred = d_src
astronaut_blurred = d_astronaut_blurred.copy_to_host()

fig, ax = plt.subplots()
im = ax.imshow(astronaut_blurred, cmap='Greys')