In [None]:
using Plots;

# Group Seminar on Julia Pt. II  
# Parallel Techniques

- Julia's parallel architecture, like MPI, uses message passing between worker processes.
- Distributed Memory architecture $\rightarrow$ Each worker has its own memory.
- It is, however, more transparent and higher-level.

## 1. Add worker processes.

In [None]:
# List the ids of the currently available worker processes.
workers()

In [None]:
# Remove the processes with ids 2 through 5.
rmprocs(2:5)

In [None]:
# Add worker processes using the default arguments, then list the worker ids again.
addprocs();workers()

In [None]:
# Number of worker processes and total number of processes.
[nworkers(),
    nprocs()]

## 2. Running functions on other processes.

In [None]:
# Ask process 2 to evaluate +(1, 1); `r` is a reference to the remote result.
r = remotecall(2, +, 1,1)

In [None]:
# Retrieve the value behind the reference.
fetch(r)

In [None]:
# The same call and retrieval combined in a single step.
remotecall_fetch(2, +, 1,1)

### `remotecall` and `fetch` are rather "low level" and cumbersome.
### Julia offers convenient macros `@spawn`, `@spawnat`, `@fetch`

In [None]:
# Compute the SVD of a random 10x10 matrix on a process chosen by `@spawn`.
@spawn svd(rand(10,10))

In [None]:
# `ans` is the value of the previous cell, i.e. the reference returned by `@spawn`.
fetch(ans)

In [None]:
# Spawn and fetch in one step.
@fetch svd(rand(10,10))

### Transforming datasets in parallel with `pmap`.

In [None]:
# Four random 5x5 matrices; `pmap` applies `svd` to each of them.
matrices = [ rand(5,5) for i in 1:4 ];
pmap(svd, matrices);

In [None]:
# Evaluate the expression `ex` with `eval` and return the elapsed wall-clock
# time in seconds (a Float64).
#
# `@elapsed` is used instead of a `tic()`/`toq()` pair: `tic()` pushes onto a
# shared timer stack, which stays unbalanced when `eval(ex)` throws, so a later
# `toq()` would report a wrong interval. `@elapsed` keeps the start time in a
# local variable and measures the same interval.
function tictoq(ex::Expr)
    return @elapsed eval(ex)
end

In [None]:
# Upper bound for the matrix dimension k used in the benchmark.
maxSize = 100;
# For every k in 5:maxSize build a batch of 8 random k-by-k matrices and time
# `pmap(svd, batch)` and `map(svd, batch)` with `tictoq`. Each comprehension entry
# is [k, t_pmap, t_map]; `hcat(...)'` stacks them into a matrix with one row per k.
@time times = hcat([ [size(m[1],1), tictoq(:(pmap(svd, $m))), tictoq( :(map(svd,$m)))]
    for m in [ [rand(k,k) for i in 1:8] for k in 5:maxSize ]]...)';

In [None]:
# Plot both timing columns against the matrix size stored in the first column.
scatter(times[:,1],times[:,2:end],label=[:pmap :map],legend=:topleft)

In [None]:
# Scaling test: time `pmap(svd, ...)` over eight random 500x500 matrices while
# restricting the computation to the first n workers, n = 1, ..., nworkers().
for n in 1:nworkers()
    # `local` keeps this binding separate from any global `matrices`.
    local matrices = [ rand(500,500) for j in 1:8 ]
    @time pmap(svd, matrices, pids=workers()[1:n])
end

__Caveat:__ `pmap` is only suitable for distributing large chunks of work.

In [None]:
# Add 1 to each of 1e5 integers with one `pmap` task per element.
@time pmap(x->x+1, collect(1:Int(1e5)));

In [None]:
# The same result computed as a single array operation on this process.
@time collect(1:Int(1e5)) + 1 ;

### `@parallel` to the rescue!

__Scenario:__ Parallel calculations that are reduced (Matrix$\rightarrow$Vector, Vector$\rightarrow$Number)

In [None]:
# Monte Carlo estimate of pi: draw 1e9 points uniformly in the unit square and
# count, via a parallel (+) reduction, how many fall inside the quarter circle.
piEst = @parallel (+) for trial = 1:Int(1e9)
    (abs2(rand()) + abs2(rand()) <= 1) ? 1 : 0
end
# hits / samples approximates pi/4, so divide the hit count by samples/4.
piEst = piEst / (1e9 / 4)

In [None]:
# Sum of 1e8 standard-normal samples, computed as a parallel (+) reduction.
@time randWalk = @parallel (+) for i = 1:Int(1e8)
    randn()
end

In [None]:
# The same sum, serial: draw all samples into one array, then reduce it.
@time reduce(+, randn(Int(1e8)))

In [None]:
# The same sum as a plain loop at global scope, accumulating into the global `s`.
s = 0.0
@time for i = 1:Int(1e8)
    s += randn()
end
s

In [None]:
# Serial reference implementation: return the sum of `L` standard-normal
# samples, accumulated one draw at a time (0.0 when `L` is not positive).
function randomWalk_serial(L::Int)
    position = 0.0
    step = 1
    while step <= L
        position += randn()
        step += 1
    end
    return position
end
# Time the function-wrapped serial loop for 1e8 steps.
@time randomWalk_serial(Int(1e8))

----

__What if we need the results of every run?__  
Naively, one could concatenate the results ($=$ reduce with `vcat`).

In [None]:
# Number of samples for the following experiments.
L = Int(1e5);

In [None]:
# Baseline: draw L samples serially.
@time randn(L);

In [None]:
# Collect one sample per iteration by reducing the loop with `vcat`.
@time n = @parallel (vcat) for i in 1:L
    randn()
end;

__Abysmal runtime!__ Note the allocations.  
`vcat` allocates new memory in __each__ iteration $\rightarrow$ dynamic resizing of arrays is not a good idea in performance-critical code.

Alright, allocate the memory beforehand...

In [None]:
# Preallocate the result vector on this process. The element type must be
# Float64: `randn()` returns a Float64, and storing it into an Int64 array
# raises an InexactError inside the loop body instead of writing the sample.
a = zeros(Float64, L)
# Try to fill `a` from a parallel loop. There is no `@sync`, so `@time` only
# measures how long it takes to launch the loop.
@time @parallel for i in 1:L
    a[i] = randn()
end;

Ok, that was fast, but did it do what we wanted?

In [None]:
# Print the largest entry of `a` as seen by this process.
println(maximum(a))

__No__, because inside the parallel for-loop `a` is a __local copy__ on each worker process; the array on the master process is never modified.

__Needed:__ Data structure that is shared between processes.

### Shared Arrays

In [None]:
# Run the garbage collector on every process.
@everywhere gc()

In [None]:
# Problem size for the shared-array experiments.
L = Int(1e8);

In [None]:
# Shared Float64 array of length L; the `init` function zeroes the index range
# reported by `localindexes` on the process that runs it.
ShA = SharedArray(Float64, L, init= S->S[localindexes(S)] = 0);

In [None]:
# Size of the array in MiB (8 bytes per Float64 element).
length(ShA)*8/1024^2

In [None]:
# Ids of the processes that have the array mapped.
ShA.pids

In [None]:
# Index range that `localindexes` reports on each worker.
[@fetchfrom i localindexes(ShA) for i in workers()]

In [None]:
# Fill the shared array from a parallel loop; `@sync` makes `@time` cover the
# whole loop rather than only its launch.
@time @sync @parallel for i in 1:length(ShA)
    ShA[i] = randn()
end;

In [None]:
# Fill the chunk of the shared array `S` that `localindexes` assigns to the
# calling process with standard-normal samples. Defined on every process so
# that it can be invoked remotely.
@everywhere function Randn(S::SharedArray)
    for i in localindexes(S)
        S[i] = randn()
    end
end
# Let every participating process fill its own chunk. `remotecall_wait` blocks
# its task until the remote function has finished, so `@sync` (and therefore
# `@time`) covers the actual work. A plain `remotecall` only waits until the
# call has been issued, which would under-report the runtime and let later
# cells read a partially filled array.
@time @sync begin
    for p in ShA.pids
        @async remotecall_wait(p, Randn, ShA)
    end
end

In [None]:
# Inspect the first ten entries of the shared array.
ShA[1:10]

In [None]:
# Serial counterpart for comparison: allocate a Float64 vector of length `L`,
# fill it entry by entry with standard-normal samples (printing the time spent
# in the fill loop), and return it.
function Randn(L::Int)
    samples = zeros(Float64, L)
    @time for idx in eachindex(samples)
        samples[idx] = randn()
    end
    return samples
end;
# Run the serial fill for the current problem size `L`; the result is discarded.
Randn(L);

In [None]:
# Histogram of the shared array's entries with 100 bins and no legend.
histogram(ShA,nbins=100,legend=:none)

In [None]:
# NOTE(review): presumably clears the current figure; confirm that the Plots
# version in use provides `clf`.
Plots.clf()