In [1]:
# Problem size: 2^20 (1_048_576) single-precision elements per vector.
N = 1 << 20
x = ones(Float32, N)   # every entry is 1.0f0
y = fill(2.0f0, N)     # every entry is 2.0f0

# In-place element-wise update: each y[i] becomes y[i] + x[i].
y .= y .+ x

1048576-element Vector{Float32}:
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 ⋮
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0

In [2]:
using Test
# Verify that every element of y now equals 3.0f0.
@test all(y .== 3.0f0)

[32m[1mTest Passed[22m[39m
  Expression: all(y .== 3.0f0)

In [3]:
"""
    sequential_add!(y, x)

Add each element of `x` to the element of `y` at the same index, in place, using a
single sequential loop. Returns `nothing`.
"""
function sequential_add!(y, x)
    # eachindex(y, x) yields the indices common to both arrays, which is what makes
    # skipping the bounds checks on the accesses below acceptable.
    for idx in eachindex(y, x)
        @inbounds y[idx] = y[idx] + x[idx]
    end
    return nothing
end

# Reset y to 2 so the check starts from a known state, apply the sequential
# kernel once, and verify that every element became 3.
fill!(y, 2)
sequential_add!(y, x)
@test all(y .== 3.0f0)

[32m[1mTest Passed[22m[39m
  Expression: all(y .== 3.0f0)

In [4]:
"""
    parallel_add!(y, x)

Add each element of `x` to the element of `y` at the same index, in place, with the
loop iterations distributed by `Threads.@threads`. Returns `nothing`.
"""
function parallel_add!(y, x)
    # Indices common to both arrays; every iteration touches a distinct index.
    shared_indices = eachindex(y, x)
    Threads.@threads for idx in shared_indices
        @inbounds y[idx] = y[idx] + x[idx]
    end
    return nothing
end

# Reset y to 2 so the check starts from a known state, apply the threaded
# kernel once, and verify that every element became 3.
fill!(y, 2)
parallel_add!(y, x)
@test all(y .== 3.0f0)

[32m[1mTest Passed[22m[39m
  Expression: all(y .== 3.0f0)

In [5]:
using BenchmarkTools
# Time the sequential version; `$` interpolates y and x into the benchmarked expression.
# NOTE: sequential_add! mutates y, so every benchmark evaluation adds x to y again.
@btime sequential_add!($y, $x)

  156.200 μs (0 allocations: 0 bytes)


In [6]:
# Time the threaded version; `$` interpolates y and x into the benchmarked expression.
# NOTE: parallel_add! mutates y, so every benchmark evaluation adds x to y again.
@btime parallel_add!($y, $x)

  163.600 μs (6 allocations: 672 bytes)


In [7]:
using CUDA

# GPU counterparts of the N-element vectors, created with CUDA.fill.
x_d = CUDA.fill(1.0f0, N)  # a vector stored on the GPU filled with 1.0 (Float32)
y_d = CUDA.fill(2.0f0, N)  # a vector stored on the GPU filled with 2.0

1048576-element CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}:
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 ⋮
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0

In [8]:
# Same in-place broadcast as on the CPU, now applied to the GPU vectors.
y_d .+= x_d
# Convert y_d to an Array before the element-wise comparison
# (presumably this copies the data back to host memory).
@test all(Array(y_d) .== 3.0f0)

[32m[1mTest Passed[22m[39m
  Expression: all(Array(y_d) .== 3.0f0)

In [9]:
# add_broadcast!(y, x): add x to y element-wise, in place, through a fused broadcast.
# The broadcast is wrapped in CUDA.@sync, which per the CUDA.jl documentation runs the
# expression and then synchronizes the GPU, so the call covers the whole operation.
# Returns nothing.
function add_broadcast!(y, x)
    CUDA.@sync begin
        y .= y .+ x
    end
    return nothing
end

add_broadcast! (generic function with 1 method)

In [10]:
# Time the GPU broadcast; `$` interpolates y_d and x_d into the benchmarked expression.
# NOTE: add_broadcast! mutates y_d, so every benchmark evaluation adds x_d to y_d again.
@btime add_broadcast!($y_d, $x_d)

  60.800 μs (23 allocations: 1.61 KiB)
