## Distributed Arrays

In [1]:
using Distributed

In [2]:
# Launch worker processes only when the session still consists of the master
# process alone; `--project` is passed to each worker's `julia` executable.
isone(nprocs()) && addprocs(; exeflags="--project")

8-element Vector{Int64}:
 2
 3
 4
 5
 6
 7
 8
 9

In [3]:
@everywhere using DistributedArrays

In [4]:
# List the ids of the worker processes currently available.
workers()

8-element Vector{Int64}:
 2
 3
 4
 5
 6
 7
 8
 9

#### Start with a regular array

In [5]:
# Build a 10000×10000 matrix of uniformly distributed random Float64 values.
dat = rand(10_000, 10_000)

10000×10000 Matrix{Float64}:
 0.123723  0.444996  0.246716    …  0.705871   0.655958   0.61486
 0.649506  0.778737  0.731993       0.348227   0.0516317  0.566177
 0.761093  0.405962  0.751093       0.867999   0.255145   0.405285
 0.609628  0.272982  0.359789       0.976385   0.297469   0.71698
 0.380213  0.159257  0.153968       0.35067    0.41966    0.430037
 0.605719  0.540934  0.759018    …  0.461343   0.343755   0.639653
 0.57235   0.707484  0.854691       0.0547383  0.683133   0.882455
 0.271017  0.617553  0.59311        0.181349   0.495497   0.139849
 0.978263  0.84524   0.439817       0.942757   0.660323   0.328855
 0.947058  0.346634  0.497383       0.748693   0.158723   0.360928
 0.6787    0.80356   0.899306    …  0.0999335  0.203494   0.151415
 0.903023  0.71282   0.0341192      0.264255   0.151948   0.870035
 0.979759  0.543438  0.3833         0.987089   0.7874     0.0592685
 ⋮                               ⋱                        
 0.684499  0.827455  0.00167859     0.5599

#### Every element of a regular array is owned by process id 1

In [6]:
# Replace every element with the id of the process that evaluates it; the
# element value itself is ignored.
map(_ -> myid(), dat)

10000×10000 Matrix{Int64}:
 1  1  1  1  1  1  1  1  1  1  1  1  1  …  1  1  1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1  1  1  1     1  1  1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1  1  1  1     1  1  1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1  1  1  1     1  1  1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1  1  1  1     1  1  1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1  1  1  1  …  1  1  1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1  1  1  1     1  1  1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1  1  1  1     1  1  1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1  1  1  1     1  1  1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1  1  1  1     1  1  1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1  1  1  1  …  1  1  1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1  1  1  1     1  1  1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1  1  1  

#### Distribute ownership to the other workers with a distributed array

In [7]:
# Turn the local matrix into a DArray whose blocks are held by the workers.
ddat = distribute(dat)

10000×10000 DArray{Float64, 2, Matrix{Float64}}:
 0.123723  0.444996  0.246716    …  0.705871   0.655958   0.61486
 0.649506  0.778737  0.731993       0.348227   0.0516317  0.566177
 0.761093  0.405962  0.751093       0.867999   0.255145   0.405285
 0.609628  0.272982  0.359789       0.976385   0.297469   0.71698
 0.380213  0.159257  0.153968       0.35067    0.41966    0.430037
 0.605719  0.540934  0.759018    …  0.461343   0.343755   0.639653
 0.57235   0.707484  0.854691       0.0547383  0.683133   0.882455
 0.271017  0.617553  0.59311        0.181349   0.495497   0.139849
 0.978263  0.84524   0.439817       0.942757   0.660323   0.328855
 0.947058  0.346634  0.497383       0.748693   0.158723   0.360928
 0.6787    0.80356   0.899306    …  0.0999335  0.203494   0.151415
 0.903023  0.71282   0.0341192      0.264255   0.151948   0.870035
 0.979759  0.543438  0.3833         0.987089   0.7874     0.0592685
 ⋮                               ⋱                        
 0.684499  0.827455  0

#### Different workers are assigned to different blocks of data

In [8]:
# Same mapping as for the regular array, but now each element reports the id
# of the worker that holds its block; the element value itself is ignored.
map(_ -> myid(), ddat)

10000×10000 DArray{Int64, 2, Matrix{Int64}}:
 2  2  2  2  2  2  2  2  2  2  2  2  2  …  8  8  8  8  8  8  8  8  8  8  8  8
 2  2  2  2  2  2  2  2  2  2  2  2  2     8  8  8  8  8  8  8  8  8  8  8  8
 2  2  2  2  2  2  2  2  2  2  2  2  2     8  8  8  8  8  8  8  8  8  8  8  8
 2  2  2  2  2  2  2  2  2  2  2  2  2     8  8  8  8  8  8  8  8  8  8  8  8
 2  2  2  2  2  2  2  2  2  2  2  2  2     8  8  8  8  8  8  8  8  8  8  8  8
 2  2  2  2  2  2  2  2  2  2  2  2  2  …  8  8  8  8  8  8  8  8  8  8  8  8
 2  2  2  2  2  2  2  2  2  2  2  2  2     8  8  8  8  8  8  8  8  8  8  8  8
 2  2  2  2  2  2  2  2  2  2  2  2  2     8  8  8  8  8  8  8  8  8  8  8  8
 2  2  2  2  2  2  2  2  2  2  2  2  2     8  8  8  8  8  8  8  8  8  8  8  8
 2  2  2  2  2  2  2  2  2  2  2  2  2     8  8  8  8  8  8  8  8  8  8  8  8
 2  2  2  2  2  2  2  2  2  2  2  2  2  …  8  8  8  8  8  8  8  8  8  8  8  8
 2  2  2  2  2  2  2  2  2  2  2  2  2     8  8  8  8  8  8  8  8  8  8  8  8
 2  2  2  2  2  2  

#### Process each chunk locally on the worker that owns it

In [9]:
# Start one remote task per worker; each task sums the squares of the part of
# `ddat` held by that worker and the call returns a `Future` immediately.
# `abs2` replaces `x -> x*x`: for real elements the value is identical, and
# summing `abs2` over an empty local part (a worker that owns no chunk) gives
# zero instead of raising an empty-reduction error.
[@spawnat p sum(abs2, localpart(ddat)) for p in workers()]

8-element Vector{Future}:
 Future(2, 1, 5108, ReentrantLock(nothing, Base.GenericCondition{Base.Threads.SpinLock}(Base.InvasiveLinkedList{Task}(nothing, nothing), Base.Threads.SpinLock(0)), 0), nothing)
 Future(3, 1, 5109, ReentrantLock(nothing, Base.GenericCondition{Base.Threads.SpinLock}(Base.InvasiveLinkedList{Task}(nothing, nothing), Base.Threads.SpinLock(0)), 0), nothing)
 Future(4, 1, 5110, ReentrantLock(nothing, Base.GenericCondition{Base.Threads.SpinLock}(Base.InvasiveLinkedList{Task}(nothing, nothing), Base.Threads.SpinLock(0)), 0), nothing)
 Future(5, 1, 5111, ReentrantLock(nothing, Base.GenericCondition{Base.Threads.SpinLock}(Base.InvasiveLinkedList{Task}(nothing, nothing), Base.Threads.SpinLock(0)), 0), nothing)
 Future(6, 1, 5112, ReentrantLock(nothing, Base.GenericCondition{Base.Threads.SpinLock}(Base.InvasiveLinkedList{Task}(nothing, nothing), Base.Threads.SpinLock(0)), 0), nothing)
 Future(7, 1, 5113, ReentrantLock(nothing, Base.GenericCondition{Base.Threads.SpinLock}(B

#### Fetch the results from each worker working on their corresponding chunk of data

In [10]:
# Spawn the per-worker partial sums first (so they run concurrently), then
# wait for and collect each result with `fetch`.
# `abs2` replaces `x -> x*x`: for real elements the value is identical, and
# summing `abs2` over an empty local part (a worker that owns no chunk) gives
# zero instead of raising an empty-reduction error.
map(fetch, [@spawnat p sum(abs2, localpart(ddat)) for p in workers()])

8-element Vector{Float64}:
 4.165783090590872e6
 4.166645246017161e6
 4.1661054964631647e6
 4.1655412539773807e6
 4.167189334580513e6
 4.1648296769934585e6
 4.1674088697341075e6
 4.1682965613832995e6

#### Finally reduce the returned results from each worker by adding them

In [11]:
# Add the fetched per-worker partial sums into the global sum of squares.
# `abs2` replaces `x -> x*x`: for real elements the value is identical, and
# summing `abs2` over an empty local part (a worker that owns no chunk) gives
# zero instead of raising an empty-reduction error.
# Note: the first timed run also includes compilation time.
@time reduce(+, map(fetch, [@spawnat p sum(abs2, localpart(ddat)) for p in workers()]))

  0.199132 seconds (317.34 k allocations: 17.165 MiB, 50.34% compilation time)


3.333179952973996e7

In [12]:
# Same sum of squares written with broadcasting: `ddat .* ddat` is evaluated
# first and its result is then passed to `sum`. Timed for comparison with the
# explicit per-worker reduction.
@time sum(ddat .* ddat)

  1.782984 seconds (1.64 M allocations: 91.779 MiB, 37.66% compilation time)


3.333179952973996e7

In [13]:
# Show the native code generated for the `sum` call on the broadcast result.
@code_native sum(ddat .* ddat)

	.section	__TEXT,__text,regular,pure_instructions
; ┌ @ reducedim.jl:889 within `sum`
	subq	$40, %rsp
	movq	%rsi, 32(%rsp)
	movq	(%rsi), %rax
	movabsq	$jl_system_image_data, %rcx
; │┌ @ reducedim.jl:889 within `#sum#732`
; ││┌ @ reducedim.jl:893 within `_sum`
; │││┌ @ reducedim.jl:893 within `#_sum#734`
; ││││┌ @ reducedim.jl:894 within `_sum`
; │││││┌ @ reducedim.jl:894 within `#_sum#735`
; ││││││┌ @ reducedim.jl:322 within `mapreduce`
; │││││││┌ @ reducedim.jl:322 within `#mapreduce#725`
; ││││││││┌ @ reducedim.jl:330 within `_mapreduce_dim`
	movq	%rcx, (%rsp)
	movabsq	$jl_system_image_data, %rc

In [14]:
# List the methods that take an argument of type `DArray`.
methodswith(DArray)

In [15]:
# Report which `sum` method is selected for the broadcast result.
@which sum(ddat .* ddat)