In [1]:
# Load the package manager and Revise, then activate the project environment
# used by this example.
# NOTE(review): the path is absolute and machine-specific; adjust it before
# running the notebook on another machine.
using Pkg, Revise
Pkg.activate("/home/fxw/julia/DeconvOptim.jl/examples/paper/.")

[32m[1m  Activating[22m[39m environment at `~/julia/DeconvOptim.jl/examples/paper/Project.toml`


In [28]:
# Packages used by the total-variation implementations and benchmarks below.
using CUDA, Tullio, BenchmarkTools, CUDAKernels, KernelAbstractions, Zygote, Distributed
# Turn off scalar indexing of GPU arrays (see the CUDA.jl documentation of
# `allowscalar`).
CUDA.allowscalar(false)

┌ Info: Precompiling CUDAKernels [72cfdca4-0801-4ab0-bf6a-d52aa10adc57]
└ @ Base loading.jl:1342


In [3]:
"""
    TV_cpu(arr, ϵ=eltype(arr)(1e-8))

Total-variation style sum for a three-dimensional array, written as a single
scalar `@tullio` reduction.

For every index triple in the ranges inferred by Tullio, the term
`sqrt(ϵ + Σ_d abs2(arr[x] - arr[x + e_d]))` is formed from the forward
differences along the three dimensions; all terms are summed into one scalar,
which is returned. `ϵ` is added under the square root.
"""
function TV_cpu(arr, ϵ=eltype(arr)(1e-8))
    return @tullio tv = sqrt(ϵ + abs2(arr[x,y,z] - arr[x+1,y,z])
                               + abs2(arr[x,y,z] - arr[x,y+1,z])
                               + abs2(arr[x,y,z] - arr[x,y,z+1]))
end

"""
    TV_gpu(arr, ϵ=eltype(arr)(1e-8))

Total-variation style sum for a three-dimensional array. `@tullio` first
builds an array holding `sqrt(ϵ + Σ_d abs2(forward difference along d))` per
index triple; that array is then reduced with `sum`.

See https://github.com/mcabbott/Tullio.jl/issues/85 — the gradient pass of
this formulation is super slow.
"""
function TV_gpu(arr, ϵ=eltype(arr)(1e-8))
    # Aliases of `arr` (no copies), one per shifted access.
    # NOTE(review): presumably related to the Tullio issue linked above — confirm.
    next_x = arr
    next_y = arr
    next_z = arr
    @tullio tv[x, y, z] := sqrt(ϵ + abs2(arr[x,y,z] - next_x[x+1,y,z])
                                  + abs2(arr[x,y,z] - next_y[x,y+1,z])
                                  + abs2(arr[x,y,z] - next_z[x,y,z+1]))
    return sum(tv)
end


"""
    f_inds(rs, b)

Return a tuple with the same length as `rs` in which entry `b` is `rs[b] .+ 1`
and every other entry is passed through unchanged. If `b` matches no position
(e.g. `b = 0`), the entries of `rs` are returned as they are.
"""
function f_inds(rs, b)
    return ntuple(length(rs)) do d
        d == b ? rs[d] .+ 1 : rs[d]
    end
end

"""
    TV_3D_view(arr, ϵ=1f-8)

Total-variation style sum built from views of `arr` (referred to as the
"naive" implementation).

Every axis is shortened by its last index. `base` is the view on that trimmed
region, and `shift1`, `shift2`, `shift3` are the same region moved forward by
one along dimension 1, 2 and 3 respectively. The result is the sum over all
elements of `sqrt(ϵ + Σ abs2(shift_d - base))`, evaluated under `@fastmath`.
"""
function TV_3D_view(arr::AbstractArray{T, N}, ϵ=1f-8) where {T, N}
    # all axes without their final index, so that the +1 shifts stay in bounds
    trimmed = map(ax -> first(ax):last(ax)-1, axes(arr))

    base = view(arr, f_inds(trimmed, 0)...)
    shift1 = view(arr, f_inds(trimmed, 1)...)
    shift2 = view(arr, f_inds(trimmed, 2)...)
    shift3 = view(arr, f_inds(trimmed, 3)...)

    return @fastmath sum(sqrt.(ϵ .+ abs2.(shift1 .- base) .+
                            abs2.(shift2 .- base) .+ abs2.(shift3 .- base)))
end

"""
    TV_3D_circshift(arr, ϵ=1f-8)

Total-variation style sum built from circularly shifted copies of `arr`.

`arr` is shifted by one position along dimension 1, 2 and 3 with `circshift`;
the result is the sum over all elements of
`sqrt(ϵ + Σ abs2(shifted_d - arr))`, evaluated under `@fastmath`. Because the
shifts wrap around, every element of `arr` contributes one term.
"""
function TV_3D_circshift(arr::AbstractArray{T, N}, ϵ=1f-8) where {T, N}
    rolled1 = circshift(arr, (1, 0, 0))
    rolled2 = circshift(arr, (0, 1, 0))
    rolled3 = circshift(arr, (0, 0, 1))

    return @fastmath sum(sqrt.(ϵ .+ abs2.(rolled1 .- arr) .+
                            abs2.(rolled2 .- arr) .+ abs2.(rolled3 .- arr)))
end

"""
    TV_tullio_2(arr, ϵ=1f-8)

Total-variation style sum for a three-dimensional array, computed in two
Tullio stages.

First the squared forward differences along each dimension are stored in
`diff1`, `diff2` and `diff3`. Then `sqrt(ϵ + diff1 + diff2 + diff3)` is
evaluated on the index region `1:size(arr, d)-1` in every dimension `d`, and
the resulting array is summed.
"""
function TV_tullio_2(arr::AbstractArray{T, N}, ϵ=1f-8) where {T, N}
    @tullio diff1[i,j,k] := abs2(arr[i+1,j,k] - arr[i,j,k])
    @tullio diff2[i,j,k] := abs2(arr[i,j+1,k] - arr[i,j,k])
    @tullio diff3[i,j,k] := abs2(arr[i,j,k+1] - arr[i,j,k])

    # Each diff array is one element shorter along a different dimension, so
    # the ranges are stated explicitly. The third range must name `k`: the
    # previous `l` does not occur in the expression, which left the third axis
    # without an explicit constraint.
    # NOTE(review): the `+0` offsets are kept from the original; presumably
    # they let Tullio combine the differing axis lengths — confirm.
    @tullio res[i, j, k] := sqrt(ϵ + diff1[i+0,j+0,k+0] + diff2[i+0,j+0,k+0] + diff3[i+0,j+0,k+0]) (i in 1:size(arr, 1)-1, j in 1:size(arr, 2)-1, k in 1:size(arr, 3)-1)
    return sum(res)
end

TV_tullio_2 (generic function with 2 methods)

In [6]:
# 300×300×300 array of Float32 normal random numbers, and a CuArray built from it.
arr = randn(Float32, (300, 300, 300));
arr_c = CuArray(arr);

In [7]:
# Time TV_cpu and its gradient on the CPU array. The global `arr` is
# interpolated with `$` so that the benchmark does not measure access to a
# non-constant global (same convention as the other benchmark cells).
@btime TV_cpu($arr)
x = @btime gradient(TV_cpu, $arr);

  4.821 ms (217 allocations: 10.52 KiB)
  117.558 ms (315 allocations: 103.01 MiB)


In [8]:
# Time TV_3D_view and its gradient on the CPU array. The global `arr` is
# interpolated with `$` so that the benchmark does not measure access to a
# non-constant global (same convention as the other benchmark cells).
@btime TV_3D_view($arr);
x = @btime gradient(TV_3D_view, $arr);

  84.030 ms (3 allocations: 101.97 MiB)
  1.018 s (1227 allocations: 3.29 GiB)


In [10]:
# Time TV_3D_view and its gradient on the CuArray input.
# NOTE(review): there is no CUDA.@sync here, unlike the following cell;
# presumably part of the asynchronous GPU work can fall outside the timed
# region — confirm before comparing numbers.
@btime TV_3D_view($arr_c);
x_c = @btime gradient(TV_3D_view, $arr_c);

  3.280 ms (6267 allocations: 127.41 KiB)
  29.742 ms (65355 allocations: 1.08 MiB)


In [11]:
# Run a full garbage collection before each measurement, then time the forward
# pass and the gradient of TV_3D_view on the CuArray, each wrapped in CUDA.@sync.
GC.gc(true)
@btime CUDA.@sync TV_3D_view($arr_c);
GC.gc(true)
@btime CUDA.@sync gradient(TV_3D_view, $arr_c);

  3.280 ms (6265 allocations: 127.38 KiB)
  45.442 ms (94510 allocations: 1.53 MiB)


In [12]:
# Run a full garbage collection before each measurement, then time the forward
# pass and the gradient of TV_3D_circshift on the CuArray, each wrapped in
# CUDA.@sync.
GC.gc(true)
@btime CUDA.@sync TV_3D_circshift($arr_c);
GC.gc(true)
@btime CUDA.@sync gradient(TV_3D_circshift, $arr_c);

  6.134 ms (8122 allocations: 149.61 KiB)
  41.275 ms (84207 allocations: 1.32 MiB)


In [29]:
# Time the Tullio-based TV_gpu and its gradient on the CuArray input.
# NOTE(review): no CUDA.@sync is used here, unlike the TV_3D_view and
# TV_3D_circshift cells above — confirm the timings are comparable.
@btime TV_gpu($arr_c)
@btime gradient(TV_gpu, $arr_c);

  3.407 ms (7373 allocations: 123.97 KiB)
  13.729 s (29217371 allocations: 445.86 MiB)


In [38]:
"""
    TV_cpu_2D(arr, ϵ=eltype(arr)(1e-8))

Two-dimensional total-variation style sum, written as a single scalar
`@tullio` reduction.

For every index pair in the ranges inferred by Tullio, the term
`sqrt(ϵ + abs2(arr[x,y] - arr[x+1,y]) + abs2(arr[x,y] - arr[x,y+1]))` is
formed; all terms are summed into one scalar, which is returned.
"""
function TV_cpu_2D(arr, ϵ=eltype(arr)(1e-8))
    return @tullio tv = sqrt(ϵ + abs2(arr[x,y] - arr[x+1,y])
                               + abs2(arr[x,y] - arr[x,y+1]))
end

"""
    TV_gpu_2D(arr, ϵ=eltype(arr)(1e-8))

Two-dimensional total-variation style sum. `@tullio` first builds an array
holding `sqrt(ϵ + Σ_d abs2(forward difference along d))` per index pair; that
array is then reduced with `sum`.

See https://github.com/mcabbott/Tullio.jl/issues/85 — the gradient pass is
reported there as super slow.
"""
function TV_gpu_2D(arr, ϵ=eltype(arr)(1e-8))
    # Aliases of `arr` (no copies), one per shifted access.
    # NOTE(review): presumably related to the Tullio issue linked above — confirm.
    next_x = arr
    next_y = arr
    @tullio tv[x, y] := sqrt(ϵ + abs2(arr[x,y] - next_x[x+1,y])
                               + abs2(arr[x,y] - next_y[x,y+1]))
    return sum(tv)
end


"""
    f_inds(rs, b)

Return a tuple with the same length as `rs` in which entry `b` is `rs[b] .+ 1`
and every other entry is passed through unchanged. If `b` matches no position
(e.g. `b = 0`), the entries of `rs` are returned as they are.
"""
function f_inds(rs, b)
    return ntuple(length(rs)) do d
        d == b ? rs[d] .+ 1 : rs[d]
    end
end

"""
    TV_2D_view(arr, ϵ=1f-8)

Total-variation style sum built from views of `arr` (referred to as the
"naive" implementation).

Every axis is shortened by its last index. `base` is the view on that trimmed
region, and `shift1`, `shift2` are the same region moved forward by one along
dimension 1 and 2 respectively. The result is the sum over all elements of
`sqrt(ϵ + abs2(shift1 - base) + abs2(shift2 - base))`, evaluated under
`@fastmath`.
"""
function TV_2D_view(arr::AbstractArray{T, N}, ϵ=1f-8) where {T, N}
    # all axes without their final index, so that the +1 shifts stay in bounds
    trimmed = map(ax -> first(ax):last(ax)-1, axes(arr))

    base = view(arr, f_inds(trimmed, 0)...)
    shift1 = view(arr, f_inds(trimmed, 1)...)
    shift2 = view(arr, f_inds(trimmed, 2)...)

    return @fastmath sum(sqrt.(ϵ .+ abs2.(shift1 .- base) .+
                            abs2.(shift2 .- base)))
end


TV_2D_view (generic function with 2 methods)

In [23]:
# 512×512 array of Float32 normal random numbers, and a CuArray built from it.
arr_2d = randn(Float32, (512, 512));
arr_c_2d = CuArray(arr_2d);

In [24]:
# Time the Tullio-based TV_cpu_2D and its gradient on the CPU array; both the
# function and the array are `$`-interpolated in the gradient benchmark.
@btime TV_cpu_2D($arr_2d)
x = @btime gradient($TV_cpu_2D, $arr_2d);

  19.520 μs (203 allocations: 9.80 KiB)
  533.275 μs (283 allocations: 1.01 MiB)


In [30]:
# Time the view-based TV_2D_view and its gradient on the CPU array.
@btime TV_2D_view($arr_2d);
x = @btime gradient($TV_2D_view, $arr_2d);

  387.237 μs (3 allocations: 1020.16 KiB)
  2.663 ms (862 allocations: 23.98 MiB)


In [25]:
# Time the view-based TV_2D_view and its gradient on the CuArray input.
# NOTE(review): no CUDA.@sync is used here, unlike the 3D GPU cells that wrap
# the call in CUDA.@sync — confirm the timings are comparable.
@btime TV_2D_view($arr_c_2d);
x = @btime gradient($TV_2D_view, $arr_c_2d);

  44.280 μs (183 allocations: 18.67 KiB)
  472.536 μs (1477 allocations: 80.95 KiB)


In [40]:
# Time the Tullio-based TV_gpu_2D and its gradient on the CuArray input.
# NOTE(review): no CUDA.@sync is used here, unlike the 3D GPU cells that wrap
# the call in CUDA.@sync — confirm the timings are comparable.
@btime TV_gpu_2D($arr_c_2d)
x = @btime gradient($TV_gpu_2D, $arr_c_2d);

  50.730 μs (256 allocations: 10.98 KiB)
  405.437 μs (570 allocations: 26.11 KiB)
