# CUDA

- https://cuda.juliagpu.org/stable/tutorials/introduction/
- https://enccs.github.io/Julia-for-HPC/guide/

### pde tutorials
- https://pde-on-gpu.vaw.ethz.ch/lecture1/
- https://pde-on-gpu.vaw.ethz.ch/lecture3/


In [7]:
import CUDA as cu
import CUDA: @cushow, @cuda, CuArray
import BenchmarkTools: @btime
import Test as t
import .Threads: @threads

@assert CUDA.functional()
# CUDA.versioninfo()

# * https://discourse.julialang.org/t/cuda-threads-and-blocks-confusion/54816
cudimmax = cu.attribute(cu.device(),cu.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)
cudims = (;
  x=cu.attribute(cu.device(), cu.DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X),
  y=cu.attribute(cu.device(), cu.DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y),
  z=cu.attribute(cu.device(), cu.DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z)
)
(cudimmax, cudims)

(1024, (x = 1024, y = 1024, z = 64))

In [3]:
# * gpu utils

function gpu_grid_idx_x()
  block_dim = cu.blockDim().x

  thread_idx = cu.threadIdx().x
  block_idx = cu.blockIdx().x
  grid_idx = (block_idx - 1) * block_dim + thread_idx;

  return grid_idx
end
function gpu_grid_dim_x()
  grid_dim = cu.gridDim().x * cu.blockDim().x
  return grid_dim
end

function gpu_grid_idx_y()
  block_dim = cu.blockDim().y

  thread_idx = cu.threadIdx().y
  block_idx = cu.blockIdx().y
  grid_idx = (block_idx - 1) * block_dim + thread_idx;

  return grid_idx
end
function gpu_grid_dim_y()
  grid_dim = cu.gridDim().y * cu.blockDim().y
  return grid_dim
end

function gpu_grid_idx_z()
  block_dim = cu.blockDim().z

  thread_idx = cu.threadIdx().z
  block_idx = cu.blockIdx().z
  grid_idx = (block_idx - 1) * block_dim + thread_idx;

  return grid_idx
end
function gpu_grid_dim_z()
  grid_dim = cu.gridDim().z * cu.blockDim().z
  return grid_dim
end


gpu_grid_dim_z (generic function with 1 method)

In [3]:
let
  gpu_log = () -> begin
    grid_idx = gpu_grid_idx_x()
    grid_dim = gpu_grid_dim_x()
    @cushow (grid_idx, grid_dim)
    return
  end
  @cuda threads=3 blocks=2 gpu_log()
  cu.synchronize()
end

(grid_idx, grid_dim) = (1, 6)
(grid_idx, grid_dim) = (2, 6)
(grid_idx, grid_dim) = (3, 6)
(grid_idx, grid_dim) = (4, 6)
(grid_idx, grid_dim) = (5, 6)
(grid_idx, grid_dim) = (6, 6)


In [4]:
let
  gpu_log_idx = () -> begin
    grid_idx_x = gpu_grid_idx_x()
    grid_idx_y = gpu_grid_idx_y()
    @cushow (grid_idx_x, grid_idx_y)
    return
  end
  @cuda threads = (2, 3) blocks = (2, 2) gpu_log_idx()
  cu.synchronize()
end

(grid_idx_x, grid_idx_y) = (3, 4)
(grid_idx_x, grid_idx_y) = (4, 4)
(grid_idx_x, grid_idx_y) = (3, 5)
(grid_idx_x, grid_idx_y) = (4, 5)
(grid_idx_x, grid_idx_y) = (3, 6)
(grid_idx_x, grid_idx_y) = (4, 6)
(grid_idx_x, grid_idx_y) = (1, 1)
(grid_idx_x, grid_idx_y) = (2, 1)
(grid_idx_x, grid_idx_y) = (1, 2)
(grid_idx_x, grid_idx_y) = (2, 2)
(grid_idx_x, grid_idx_y) = (1, 3)
(grid_idx_x, grid_idx_y) = (2, 3)
(grid_idx_x, grid_idx_y) = (1, 4)
(grid_idx_x, grid_idx_y) = (2, 4)
(grid_idx_x, grid_idx_y) = (1, 5)
(grid_idx_x, grid_idx_y) = (2, 5)
(grid_idx_x, grid_idx_y) = (1, 6)
(grid_idx_x, grid_idx_y) = (2, 6)
(grid_idx_x, grid_idx_y) = (3, 1)
(grid_idx_x, grid_idx_y) = (4, 1)
(grid_idx_x, grid_idx_y) = (3, 2)
(grid_idx_x, grid_idx_y) = (4, 2)
(grid_idx_x, grid_idx_y) = (3, 3)
(grid_idx_x, grid_idx_y) = (4, 3)


In [3]:
const N = 2^27
x = zeros(Float32, N)
y = zeros(Float32, N)

fill!(x, 1)
fill!(y, 2)

bench_cpu_add! = let
  function cpu_add!(u, v)
    @threads for i in 1:N
      @inbounds u[i] = u[i] + v[i]
    end
    return
  end

  function bench_cpu_add!(u, v)
    cpu_add!(u, v)
  end

  bench_cpu_add!(x, y)
  t.@test t.all(x .== 3)

  bench_cpu_add!
end

fill!(x, 1)
@btime bench_cpu_add!($x, $y)

  119.850 ms (51 allocations: 5.27 KiB)


In [4]:
const N = 2^27
x = cu.fill(1.0f0, N)
y = cu.fill(2.0f0, N)

bench_gpu_add! = let
  function gpu_add!(u, v)
    ix = gpu_grid_idx_x()
    dim = gpu_grid_dim_x()
    for i in ix:dim:N
      @inbounds u[i] += v[i]
    end
    return nothing
  end

  function bench_gpu_add!(u, v; threads, blocks)
    @cuda threads = threads blocks = blocks gpu_add!(x, y)
    cu.synchronize()
  end

  bench_gpu_add!(x, y; threads=1, blocks=1)
  t.@test all(Array(x) .== 3.0f0)

  bench_gpu_add!
end

function bench(threads, blocks)
  @show (threads, blocks)
  cu.fill!(x, 1)
  @btime bench_gpu_add!($x, $y; threads=$threads, blocks=$blocks)
end

bench(1, 1)
bench(128, 1)
bench(256, 1)
bench(128, 2)
bench(256, 2)
bench(cudims.x, 1)
bench(cudims.x, 2)
bench(cudims.x, 14)
bench(128, ceil(Int, N/128))
bench(256, ceil(Int, N/256))
bench(cudims.x, ceil(Int, N/cudims.x))

(threads, blocks) = (1, 1)
  9.497 s (366 allocations: 12.92 KiB)
(threads, blocks) = (128, 1)
  336.437 ms (89 allocations: 4.91 KiB)
(threads, blocks) = (256, 1)
  178.386 ms (83 allocations: 4.78 KiB)
(threads, blocks) = (128, 2)
  178.280 ms (82 allocations: 4.75 KiB)
(threads, blocks) = (256, 2)
  98.386 ms (80 allocations: 4.72 KiB)
(threads, blocks) = (1024, 1)
  61.597 ms (80 allocations: 4.70 KiB)
(threads, blocks) = (1024, 2)
  42.442 ms (80 allocations: 4.70 KiB)
(threads, blocks) = (1024, 14)
  37.092 ms (80 allocations: 4.70 KiB)
(threads, blocks) = (128, 1048576)
  33.837 ms (80 allocations: 4.70 KiB)
(threads, blocks) = (256, 524288)
  34.093 ms (80 allocations: 4.70 KiB)
(threads, blocks) = (1024, 131072)
  34.087 ms (81 allocations: 4.72 KiB)


In [4]:
const M = 2^13
x = zeros(Float32, M, M)
y = zeros(Float32, M, M)

fill!(x, 1)
fill!(y, 2)

bench_cpu_add! = let
  function cpu_add!(u, v)
    @threads for i in 1:M
      for j in 1:M
        @inbounds u[i, j] = u[i, j] + v[i, j]
      end
    end
    return
  end

  function bench_cpu_add!(u, v)
    cpu_add!(u, v)
  end

  bench_cpu_add!(x, y)
  t.@test t.all(x .== 3)

  bench_cpu_add!
end

fill!(x, 1)
@btime bench_cpu_add!($x, $y)

  821.279 ms (51 allocations: 5.27 KiB)


In [6]:
const M = 2^13
x = cu.fill(1.0f0, M, M)
y = cu.fill(2.0f0, M, M)

bench_gpu_add! = let
  function gpu_add!(u, v)
    ix_x = gpu_grid_idx_x()
    dim_x = gpu_grid_dim_x()
    ix_y = gpu_grid_idx_y()
    dim_y = gpu_grid_dim_y()
    for i in ix_x:dim_x:M
      for j in ix_y:dim_y:M
        @inbounds u[i, j] += v[i, j]
      end
    end
    return nothing
  end

  function bench_gpu_add!(u, v; threads, blocks)
    @cuda threads = (threads, threads) blocks = (blocks, blocks) gpu_add!(x, y)
    cu.synchronize()
  end

  bench_gpu_add!(x, y; threads=1, blocks=1)
  t.@test all(Array(x) .== 3.0f0)

  bench_gpu_add!
end

function bench(threads, blocks)
  @show (threads, blocks)
  cu.fill!(x, 1)
  @btime bench_gpu_add!($x, $y; threads=$threads, blocks=$blocks)
end

bench(1, 1)
bench(floor(Int, sqrt(cudims.x)), 10)

(threads, blocks) = (1, 1)
  21.133 s (723 allocations: 23.05 KiB)
(threads, blocks) = (32, 10)
  18.791 ms (81 allocations: 4.88 KiB)
