# Tema 12: NVidia CUDA avanzado



__Ejercicio: sumar filas y columnas de matriz__

In [None]:
import numpy as np
from numba import cuda

@cuda.jit
def row_sums(a, sums, n):
    """Store in ``sums[r]`` the sum of the first ``n`` elements of row ``r``.

    One thread handles one row: the thread's global 1D index selects the row
    and the thread walks along it.

    Args:
        a: 2D device array to reduce.
        sums: 1D device array receiving one value per row of ``a``.
        n: number of columns to add up in each row.
    """
    row = cuda.grid(1)
    # The launch grid may contain more threads than there are rows (for
    # example when the row count is not a multiple of the block size); those
    # surplus threads must not touch memory.
    if row < a.shape[0]:
        total = 0.0
        for col in range(n):
            total += a[row, col]
        sums[row] = total

@cuda.jit
def col_sums(a, sums, n):
    """Store in ``sums[c]`` the sum of the first ``n`` elements of column ``c``.

    One thread handles one column: the thread's global 1D index selects the
    column and the thread walks down it.

    Args:
        a: 2D device array to reduce.
        sums: 1D device array receiving one value per column of ``a``.
        n: number of rows to add up in each column.
    """
    col = cuda.grid(1)
    # The launch grid may contain more threads than there are columns; those
    # surplus threads must not touch memory.
    if col < a.shape[1]:
        total = 0.0
        for row in range(n):
            total += a[row, col]
        sums[col] = total

n = 32768  # matrix side length (the matrix is n x n)
threads_per_block = 256
# Ceiling division, so a final partial block is still launched when n is not
# a multiple of the block size. For the n above this equals n / 256 exactly;
# for other sizes the kernels must ignore threads whose index is >= n.
blocks = (n + threads_per_block - 1) // threads_per_block

# Input matrix, created directly as float32. np.ones defaults to float64,
# which for this n would build an 8 GiB temporary before the cast.
h_a = np.ones((n, n), dtype=np.float32)

# Input matrix on the GPU
d_a = cuda.to_device(h_a)

# Sum of rows.
# The output vector starts filled with NaN so the comparison below can only
# pass if the kernel really wrote every element.
d_sums = cuda.to_device(np.full(n, np.nan, dtype=np.float32))
row_sums[blocks, threads_per_block](d_a, d_sums, n)
h_sums = d_sums.copy_to_host()
truth = h_a.sum(axis=1)
np.testing.assert_equal(h_sums, truth)

# Sum of columns.
# A fresh NaN-filled vector is used again: with an all-ones matrix the row
# sums equal the column sums, so leftovers from the previous kernel would
# otherwise satisfy this check even if col_sums wrote nothing.
d_sums = cuda.to_device(np.full(n, np.nan, dtype=np.float32))
col_sums[blocks, threads_per_block](d_a, d_sums, n)
h_sums = d_sums.copy_to_host()
truth = h_a.sum(axis=0)
np.testing.assert_equal(h_sums, truth)

In [None]:
# Time both reductions. cuda.synchronize() is part of the timed statement so
# that the measurement waits for the kernel to finish instead of covering
# only the launch.
# In row_sums neighbouring threads read elements that are one full row apart
# at every loop step; in col_sums neighbouring threads read adjacent elements
# of the same row.
%timeit row_sums[blocks, threads_per_block](d_a, d_sums, n); cuda.synchronize()
%timeit col_sums[blocks, threads_per_block](d_a, d_sums, n); cuda.synchronize()

__Ejercicio: sumar matrices con kernel 2D__

In [None]:
import numpy as np
from numba import cuda

n = 4096  # matrix side length; the matrices below are reshaped to n x n

@cuda.jit
def matrix_add(a, b, out, coalesced):
    """Element-wise ``out = a + b`` with one thread per matrix element.

    Args:
        a, b: 2D device arrays of equal shape.
        out: 2D device array receiving the sum; same shape as the inputs.
        coalesced: selects how the 2D thread index maps to the matrix.
            When true, the x index is used as the column (last array index),
            so threads that are neighbours along x touch neighbouring
            elements of a row. When false, the x index is used as the row, so
            those threads touch elements one full row apart.
    """
    x, y = cuda.grid(2)
    if coalesced:
        row, col = y, x
    else:
        row, col = x, y
    # Threads that fall outside the matrix (grid larger than the data) must
    # not read or write anything.
    if row < out.shape[0] and col < out.shape[1]:
        out[row, col] = a[row, col] + b[row, col]

threads_per_block = (32, 32)  # 2D block
# 2D grid derived from the matrix size instead of being hard-coded: enough
# blocks per axis to cover n elements (ceiling division). For n = 4096 this
# is (128, 128). If n is not a multiple of 32 the grid overshoots the matrix
# and the kernel must ignore the surplus threads.
blocks = tuple((n + side - 1) // side for side in threads_per_block)

# Host inputs: a holds 0 .. n*n-1, b is an independent copy of a.
h_a = np.arange(n * n).reshape(n, n).astype(np.float32)
# h_a is already float32, so a plain copy is enough (a further astype would
# only duplicate the array once more).
h_b = h_a.copy()

# Device inputs and output
d_a = cuda.to_device(h_a)
d_b = cuda.to_device(h_b)
d_out = cuda.device_array(shape=(n, n), dtype=np.float32)

# Run the kernel and compare against the NumPy result
matrix_add[blocks, threads_per_block](d_a, d_b, d_out, True)
h_out = d_out.copy_to_host()
truth = h_a + h_b
np.testing.assert_equal(h_out, truth)

In [None]:
# Time the same kernel with both index mappings (last argument True / False).
# cuda.synchronize() is part of the timed statement so that the measurement
# waits for the kernel to finish instead of covering only the launch.
%timeit matrix_add[blocks, threads_per_block](d_a, d_b, d_out, True); cuda.synchronize()
%timeit matrix_add[blocks, threads_per_block](d_a, d_b, d_out, False); cuda.synchronize()

__Ejercicio: trasponer matriz con coalescencia usando memoria compartida__

In [None]:
from numba import cuda, types as numba_types
import numpy as np
n = 4096*4096 # total number of matrix elements (16M); reshaped to 4096x4096 below

@cuda.jit
def transpose(a, transposed):
    """Naive transpose: each thread copies one element of ``a``.

    The x grid index is the column of ``a`` and the y grid index is its row,
    so ``a`` is read along rows while ``transposed`` is written along
    columns.

    Args:
        a: 2D device array to transpose.
        transposed: 2D device array receiving the result; expected to have
            the shape of ``a`` with both axes swapped.
    """
    x, y = cuda.grid(2)
    # Ignore threads that fall outside the matrix (grid larger than the data).
    if y < a.shape[0] and x < a.shape[1]:
        transposed[x, y] = a[y, x]

@cuda.jit
def tile_transpose(a, transposed):
    """Transpose ``a`` into ``transposed`` through a shared-memory tile.

    Each block copies one sub-matrix of ``a`` into a shared tile and then
    writes it out transposed. The x grid index is the row of ``a`` and the
    y grid index is its column. Block dimensions must not exceed 32 per axis
    (the tile size).

    The load and the store use different thread-to-element mappings so that
    in both phases threads that are neighbours along x touch neighbouring
    global-memory addresses; the tile is what lets the data change hands
    between threads. (Reading the tile at the very slot each thread wrote
    would make the tile pointless and leave the global read strided.)

    Args:
        a: 2D device array to transpose.
        transposed: 2D device array receiving the result; expected to have
            the shape of ``a`` with both axes swapped.
    """
    tile = cuda.shared.array((32, 32), numba_types.float32)
    a_row, a_col = cuda.grid(2)

    block_rows = cuda.blockDim.x  # rows of `a` covered by this block
    block_cols = cuda.blockDim.y  # columns of `a` covered by this block

    # Load phase: renumber the block's threads so that consecutive threads
    # read consecutive columns of the same row of `a`.
    linear = cuda.threadIdx.y * block_rows + cuda.threadIdx.x
    load_row = linear // block_cols
    load_col = linear % block_cols
    src_row = cuda.blockIdx.x * block_rows + load_row
    src_col = cuda.blockIdx.y * block_cols + load_col
    if src_row < a.shape[0] and src_col < a.shape[1]:
        tile[load_row, load_col] = a[src_row, src_col]

    # Every thread of the block must have stored its element before any
    # thread reads an element stored by another one. The barrier is outside
    # the bounds checks so that all threads reach it.
    cuda.syncthreads()

    # Store phase: consecutive x threads write consecutive elements of one
    # row of `transposed`. The matching tile read walks down a tile column,
    # which with a 32-wide tile is prone to shared-memory bank conflicts.
    if a_row < a.shape[0] and a_col < a.shape[1]:
        transposed[a_col, a_row] = tile[cuda.threadIdx.x, cuda.threadIdx.y]

@cuda.jit
def tile_transpose2(a, transposed):
    """Transpose ``a`` into ``transposed`` through a padded shared-memory tile.

    Same scheme as a 32x32 tiled transpose, but the tile has one extra,
    unused column (32x33). The padding shifts successive tile rows by one
    word, so walking down a tile column no longer hits the same
    shared-memory bank on every access.

    The x grid index is the row of ``a`` and the y grid index is its column.
    Block dimensions must not exceed 32 per axis.

    Args:
        a: 2D device array to transpose.
        transposed: 2D device array receiving the result; expected to have
            the shape of ``a`` with both axes swapped.
    """
    tile = cuda.shared.array((32, 33), numba_types.float32)
    a_row, a_col = cuda.grid(2)

    block_rows = cuda.blockDim.x  # rows of `a` covered by this block
    block_cols = cuda.blockDim.y  # columns of `a` covered by this block

    # Load phase: renumber the block's threads so that consecutive threads
    # read consecutive columns of the same row of `a`.
    linear = cuda.threadIdx.y * block_rows + cuda.threadIdx.x
    load_row = linear // block_cols
    load_col = linear % block_cols
    src_row = cuda.blockIdx.x * block_rows + load_row
    src_col = cuda.blockIdx.y * block_cols + load_col
    if src_row < a.shape[0] and src_col < a.shape[1]:
        tile[load_row, load_col] = a[src_row, src_col]

    # Every thread of the block must have stored its element before any
    # thread reads an element stored by another one. The barrier is outside
    # the bounds checks so that all threads reach it.
    cuda.syncthreads()

    # Store phase: consecutive x threads write consecutive elements of one
    # row of `transposed`, reading the tile column-wise.
    if a_row < a.shape[0] and a_col < a.shape[1]:
        transposed[a_col, a_row] = tile[cuda.threadIdx.x, cuda.threadIdx.y]

threads_per_block = (32, 32)  # 2D blocks
side = 4096  # matrix side length; n must equal side * side
# 2D grid: enough blocks per axis to cover `side` elements (ceiling
# division). For side = 4096 this is (128, 128). If side is not a multiple of
# 32 the grid overshoots the matrix and the kernels must ignore the surplus
# threads.
blocks = tuple((side + dim - 1) // dim for dim in threads_per_block)

# side x side input matrix holding 0 .. n-1 (reshape fails if n != side*side)
h_a = np.arange(n).reshape((side, side)).astype(np.float32)
d_a = cuda.to_device(h_a)
# Not used by the checks below; kept so that any code referring to this name
# keeps working.
d_transposed_alt = cuda.device_array(shape=(side, side), dtype=np.float32)

# Run every transpose kernel and compare its output with NumPy's transpose.
expected = h_a.T
for kernel in (transpose, tile_transpose, tile_transpose2):
    # Start from a NaN-filled output buffer: if the buffer still held the
    # result of the previous kernel, a kernel that wrote nothing (or only
    # part of the matrix) would pass the check anyway.
    d_transposed = cuda.to_device(
        np.full((side, side), np.nan, dtype=np.float32)
    )
    kernel[blocks, threads_per_block](d_a, d_transposed)
    result = d_transposed.copy_to_host()
    np.testing.assert_equal(result, expected)

In [None]:
# Time the naive transpose against the 32x32 shared-memory version.
# cuda.synchronize() is part of the timed statement so that the measurement
# waits for the kernel to finish instead of covering only the launch.
%timeit transpose[blocks, threads_per_block](d_a, d_transposed); cuda.synchronize()
%timeit tile_transpose[blocks, threads_per_block](d_a, d_transposed); cuda.synchronize()

In [None]:
# Time all three transposes: naive, 32x32 shared tile, and 32x33 padded tile.
# cuda.synchronize() is part of the timed statement so that the measurement
# waits for the kernel to finish instead of covering only the launch.
%timeit transpose[blocks, threads_per_block](d_a, d_transposed); cuda.synchronize()
%timeit tile_transpose[blocks, threads_per_block](d_a, d_transposed); cuda.synchronize()
%timeit tile_transpose2[blocks, threads_per_block](d_a, d_transposed); cuda.synchronize()