# Scaling computations using parallel computing

## Przemysław Szufel

In [36]:
println("Number of threads that your Julia is run: ## $(Threads.nthreads())")

Number of threads that your Julia is run: ## 8


In [None]:
using BenchmarkTools, Distributed

### Parallelize via Single Instruction Multiple Data (SIMD)

In [37]:
"""
    dot1(x, y)

Return the dot product of `x` and `y` as a `Float64`, accumulating the
products strictly left to right (no SIMD reordering).

Throws `DimensionMismatch` when `x` and `y` do not share the same indices.
"""
function dot1(x, y)
    s = 0.0
    # eachindex(x, y) verifies that both arrays have identical indices, which is
    # what makes skipping the bounds checks below safe.
    for i in eachindex(x, y)
        @inbounds s += x[i]*y[i]
    end
    return s
end

dot1 (generic function with 1 method)

In [38]:
"""
    dot2(x, y)

Return the dot product of `x` and `y` as a `Float64`, letting the compiler
vectorize the loop with `@simd`. Because `@simd` permits reordering of the
floating-point additions, the result may differ from `dot1` in the last bits.

Throws `DimensionMismatch` when `x` and `y` do not share the same indices.
"""
function dot2(x, y)
    s = 0.0
    # eachindex(x, y) verifies that both arrays have identical indices, which is
    # what makes skipping the bounds checks below safe.
    @simd for i in eachindex(x, y)
        @inbounds s += x[i]*y[i]
    end
    return s
end

dot2 (generic function with 1 method)

In [40]:
x = 100*rand(10000)
y = 100*rand(10000);

#res1 = @btime dot1($x, $y)
#res2 = @btime dot2($x, $y)

#println(res1)
#println(res2)

In [50]:
res1 =  dot1(x, y)

2.426381729118624e7

In [51]:
res2 =  dot2(x, y)

2.4263817291186295e7

In [52]:
res1 == res2

false

In [53]:
@show res1 
@show res2

res1 = 2.426381729118624e7
res2 = 2.4263817291186295e7


2.4263817291186295e7

### Green threading

In [54]:
@time sleep(2)

  2.013307 seconds (61 allocations: 1.859 KiB)


In [58]:
@time t = @async sleep(2)

  0.000105 seconds (29 allocations: 2.164 KiB)


Task (runnable) @0x00000249e833f840

In [60]:
t

Task (done) @0x00000249e833f840

In [None]:
"""
    dojob(i)

Simulate a job with a random duration: draw a delay in seconds (rounded to two
digits), sleep for that long, and return the tuple `(i, delay)`.
"""
function dojob(i)
    delay = round(rand(), digits=2)
    # stand-in for external work that waits on I/O
    sleep(delay)
    return (i, delay)
end

In [71]:
result = Vector{Tuple{Int,Float64}}(undef, 8)

8-element Vector{Tuple{Int64, Float64}}:
 (0, 6.95332130479936e-310)
 (140736790006112, 0.0)
 (140736789999776, 6.9533213044863e-310)
 (0, 6.95332097038016e-310)
 (140736783242288, 0.0)
 (140736783237392, 6.95332096896357e-310)
 (0, 6.95332097038016e-310)
 (140736783208720, 0.0)

In [72]:
# Run the eight jobs one after another; the total time is the sum of all sleeps.
@time for i in 1:8
    result[i] = dojob(i)
end
result

  3.326780 seconds (143 allocations: 4.172 KiB)


8-element Vector{Tuple{Int64, Float64}}:
 (1, 0.7)
 (2, 0.09)
 (3, 0.13)
 (4, 0.11)
 (5, 0.5)
 (6, 0.8)
 (7, 0.31)
 (8, 0.61)

In [73]:
# Fresh, uninitialized result buffer.
result = Vector{Tuple{Int,Float64}}(undef, 8);
# Each iteration only schedules a task; nothing waits for the tasks here, so the
# loop itself returns almost immediately.
@time for i=1:8
   @async result[i] = dojob(i)
end
# Displayed right after scheduling: the tasks have not finished yet, so the
# vector still shows the uninitialized memory it was created with.
result

  0.000106 seconds (83 allocations: 7.139 KiB)


8-element Vector{Tuple{Int64, Float64}}:
 (2517304857952, 1.2437138504313e-311)
 (2517304857984, 1.243713850447e-311)
 (140736823693760, 6.9533229691903e-310)
 (140736823693760, 6.9533229691903e-310)
 (140736823693760, 6.9533229691903e-310)
 (140736823693760, 6.9533229691903e-310)
 (140736823693760, 6.9533229691903e-310)
 (140736823693760, 6.9533229691903e-310)

In [74]:
result

8-element Vector{Tuple{Int64, Float64}}:
 (1, 0.51)
 (2, 0.27)
 (3, 0.38)
 (4, 0.27)
 (5, 0.45)
 (6, 0.44)
 (7, 0.43)
 (8, 0.11)

In [80]:
result = Vector{Tuple{Int,Float64}}(undef, 8);
# @sync blocks until every task started by @async inside the loop has finished,
# so the elapsed time is roughly that of the longest single job.
@time @sync for i in 1:8
    @async result[i] = dojob(i)
end
result

  0.972172 seconds (4.53 k allocations: 344.805 KiB, 1.63% compilation time)


8-element Vector{Tuple{Int64, Float64}}:
 (1, 0.17)
 (2, 0.95)
 (3, 0.91)
 (4, 0.36)
 (5, 0.87)
 (6, 0.63)
 (7, 0.6)
 (8, 0.8)

#### Programming a simple web server
You should be able to connect using the address <a href="http://localhost:9992/3+4" target="about:blank">http://localhost:9992/3+4</a>

To stop the web server, click <a href="http://localhost:9992/stopme" target="about:blank">http://localhost:9992/stopme</a>

In [82]:
using Sockets
println("Starting the web server...")
server = Sockets.listen(9992)

Starting the web server...


Sockets.TCPServer(Base.Libc.WindowsRawSocket(0x00000000000004dc) active)

In [84]:
# Accept loop of the demo web server. Every connection is handled in its own
# task; a request whose first line contains "stopme" clears the `contt` flag,
# which ends the loop after the next accepted connection.
@async begin
    contt = Ref(true)
    while contt[]
        sock = Sockets.accept(server)
        @async begin
            # try/finally guarantees the socket is closed even when the handler
            # throws (for example when the requested path is not a valid Julia
            # expression, such as "favicon.ico").
            try
                data = readline(sock)
                print("Got request:\n", data, "\n")
                # Request line looks like "GET /<expr> HTTP/1.1"; take the path
                # without its leading slash. A malformed line yields an empty
                # command instead of a BoundsError.
                parts = split(data, " ")
                cmd = length(parts) >= 2 ? parts[2][2:end] : ""
                println(sock, "\nHTTP/1.1 200 OK\nContent-Type: text/html\n")
                contt[] = contt[] && (!occursin("stopme", data))
                if contt[]
                    # SECURITY: eval on request data executes arbitrary code sent
                    # by the client. Acceptable only for a local demo; never
                    # expose this server on an untrusted network.
                    println(sock, string("<html><body>", cmd, "=",
                        eval(Meta.parse(cmd)), "</body></html>"))
                else
                    println(sock,"<html><body>stopping</body></html>")
                end
            finally
                close(sock)
            end
        end
    end
    println("Handling requests stopped")
end

Task (runnable) @0x00000249e7e3be10

Got request:
GET /3+4 HTTP/1.1
Got request:
GET /favicon.ico HTTP/1.1
Got request:
GET /3+400 HTTP/1.1
Got request:
GET /favicon.ico HTTP/1.1
Got request:
GET /3+4000 HTTP/1.1
Got request:
GET /favicon.ico HTTP/1.1
Got request:
GET /stopme HTTP/1.1
Handling requests stopped
Got request:
GET /favicon.ico HTTP/1.1


### Multithreading

In [85]:
Threads.nthreads()

8

In [86]:
"""
    ssum(x)

Return the column sums of the matrix `x` as a `Vector{Float64}`, computed
serially on the calling thread.
"""
function ssum(x)
    nrows, ncols = size(x)
    colsums = zeros(ncols)
    # Column-major traversal: the row index varies fastest.
    for col in 1:ncols, row in 1:nrows
        @inbounds colsums[col] += x[row, col]
    end
    return colsums
end

ssum (generic function with 1 method)

In [87]:
"""
    tsum(x)

Return the column sums of the matrix `x` as a `Vector{Float64}`. Columns are
distributed over the available threads; each iteration writes only to its own
output slot.
"""
function tsum(x)
    nrows, ncols = size(x)
    colsums = zeros(ncols)
    Threads.@threads for col in 1:ncols
        @inbounds for row in 1:nrows
            colsums[col] += x[row, col]
        end
    end
    return colsums
end


tsum (generic function with 1 method)

In [93]:
x = rand(1000,10000);

In [94]:
@time ssum(x)
@time ssum(x);

  0.008778 seconds (2 allocations: 78.172 KiB)
  0.009669 seconds (2 allocations: 78.172 KiB)


In [95]:
@time tsum(x)
@time tsum(x);

  0.002482 seconds (53 allocations: 83.594 KiB)
  0.002101 seconds (60 allocations: 83.953 KiB)


#### Locking mechanism for threads

In [96]:
# Deliberately incorrect example: increments a shared counter from several
# threads without any synchronization.
function f_bad()
    x = 0
    Threads.@threads for i in 1:10^6
        # Unsynchronized read-modify-write of the shared variable `x`: updates
        # made by different threads can overwrite each other, so increments are
        # lost (the recorded run returned 125866 instead of 1000000).
        x += 1
    end
    return x
end


f_bad (generic function with 1 method)

In [124]:
@time f_bad()

  0.027474 seconds (999.22 k allocations: 15.251 MiB)


125866

In [129]:
"""
    f_add()

Count to `10^7` with a plain single-threaded loop and return the count.
Serves as the serial baseline for the threaded counters.
"""
function f_add()
    counter = 0
    for _ in 1:10^7
        counter += 1
    end
    return counter
end
@btime f_add()
    

  1.700 ns (0 allocations: 0 bytes)


10000000

In [126]:
"""
    f_atomic()

Increment a shared counter `10^6` times from a threaded loop using an atomic
integer, and return the final count.
"""
function f_atomic()
    counter = Threads.Atomic{Int}(0)
    Threads.@threads for k in 1:10^6
        # atomic read-modify-write: no increment can be lost
        Threads.atomic_add!(counter, 1)
    end
    return counter[]
end
f_atomic()

1000000

In [133]:


"""
    f_spin()

Increment a shared counter `10^6` times from a threaded loop, guarding each
increment with a `Threads.SpinLock`, and return the final count.
"""
function f_spin()
    guard = Threads.SpinLock()
    total = 0
    Threads.@threads for k in 1:10^6
        lock(guard)
        try
            total += 1
        finally
            unlock(guard)
        end
    end
    return total
end

"""
    f_reentrant()

Increment a shared counter `10^6` times from a threaded loop, guarding each
increment with a `ReentrantLock`, and return the final count.
"""
function f_reentrant()
    guard = ReentrantLock()
    total = 0
    Threads.@threads for k in 1:10^6
        # lock(f, l) acquires the lock, runs f, and always releases the lock
        lock(guard) do
            total += 1
        end
    end
    return total
end


f_reentrant (generic function with 1 method)

In [134]:
using DataFrames
# Run every counter implementation twice and record the returned value together
# with the wall-clock time in milliseconds.
stats = DataFrame()
for fn in (f_bad, f_atomic, f_spin, f_reentrant), i in 1:2
    timing = @timed fn()   # first entry is the value, second the time in seconds
    row = (f=string(fn), i=i, value=timing[1], timems=timing[2]*1000)
    push!(stats, row)
end
println(stats)


[1m8×4 DataFrame[0m
[1m Row [0m│[1m f           [0m[1m i     [0m[1m value   [0m[1m timems    [0m
     │[90m String      [0m[90m Int64 [0m[90m Int64   [0m[90m Float64   [0m
─────┼────────────────────────────────────────
   1 │ f_bad            1   128073    46.2
   2 │ f_bad            2   125286    56.2839
   3 │ f_atomic         1  1000000    23.9411
   4 │ f_atomic         2  1000000    26.6469
   5 │ f_spin           1  1000000   980.696
   6 │ f_spin           2  1000000   572.184
   7 │ f_reentrant      1  1000000  1533.87
   8 │ f_reentrant      2  1000000  1305.01


### Multi-processing and distributed computing

In [135]:
using Distributed

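This code tops up the pool of worker processes. In a fresh session `nworkers()` returns 1 (the master process is counted when no workers exist), so the first run adds 4 workers; re-running the cell adds at most one more, and never goes beyond 5 workers.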

In [136]:
addprocs(max(0, 5-nworkers()));

In [137]:
workers()

5-element Vector{Int64}:
 2
 3
 4
 5
 6

In [138]:
"""
    s_rand()

Serial baseline: draw `10^4` vectors of `10^4` uniform random numbers, sum each
vector, and return the average of those sums.
"""
function s_rand()
    n = 10^4
    total = 0.0
    for _ in 1:n
        total += sum(rand(10^4))
    end
    return total / n
end
 
@time s_rand()
@time s_rand()


  0.487433 seconds (20.00 k allocations: 763.397 MiB, 23.77% gc time)
  0.435563 seconds (20.00 k allocations: 763.397 MiB, 15.76% gc time)


4999.997426068711

In [140]:
using Distributed
 
 
"""
    p_rand()

Distributed counterpart of the serial version: the `10^4` iterations are split
across the worker processes, the per-iteration sums are combined with `+`, and
the average of those sums is returned.
"""
function p_rand()
    n = 10^4
    total = @distributed (+) for i in 1:n
        # The loop body may contain any number of statements, but only the value
        # of the last expression is passed to the reducer.
        sum(rand(10^4))
    end
    return total / n
end

@time p_rand()
@time p_rand()


  0.201967 seconds (20.00 k allocations: 1.005 MiB, 10.60% compilation time)
  0.151015 seconds (493 allocations: 25.891 KiB)


4999.765402011155

In [141]:
workers()'

1×5 adjoint(::Vector{Int64}) with eltype Int64:
 2  3  4  5  6

In [143]:
fetch(@spawnat 3 4+3)

7

In [148]:
# Defined on every process: report the id of the process that executes the call
# and return a random number.
@everywhere function f()
    println("I am on worker ", myid())
    return rand()
end
f()

I am on worker 1


0.7399595040430285

In [149]:
fetch(@spawnat 4 f())

      From worker 4:	I am on worker 4


0.6278477606839582

In [150]:
vec(collect(Iterators.product(1:4, 1:5)))
        

20-element Vector{Tuple{Int64, Int64}}:
 (1, 1)
 (2, 1)
 (3, 1)
 (4, 1)
 (1, 2)
 (2, 2)
 (3, 2)
 (4, 2)
 (1, 3)
 (2, 3)
 (3, 3)
 (4, 3)
 (1, 4)
 (2, 4)
 (3, 4)
 (4, 4)
 (1, 5)
 (2, 5)
 (3, 5)
 (4, 5)

In [151]:
using Distributed
@everywhere using Pkg
@everywhere Pkg.activate("..")
@everywhere using Distributed, Random, DataFrames

# Toy computation made available on every process: returns 2x + y.
@everywhere calc(x, y) = 2x + y

# Per-process initialization: seed the default random number generator with the
# process id so that every worker draws a different, reproducible stream.
@everywhere function init_worker()
    seed = myid()
    # placeholder: per-worker setup such as reading a CSV file would go here
    return Random.seed!(seed)
end

# Initialize all workers concurrently and wait until each one has finished.
@sync for wid in workers()
    @async remotecall_fetch(init_worker, wid)
end


[32m[1m  Activating[22m[39m project at `C:\AAABIBLIOTEKA\MIT_Boston\MIT_18.S097_Introduction-to-Julia-for-Data-Science`


      From worker 6:	[32m[1m  Activating[22m[39m project at `C:\AAABIBLIOTEKA\MIT_Boston\MIT_18.S097_Introduction-to-Julia-for-Data-Science`
      From worker 4:	[32m[1m  Activating[22m[39m project at `C:\AAABIBLIOTEKA\MIT_Boston\MIT_18.S097_Introduction-to-Julia-for-Data-Science`
      From worker 3:	[32m[1m  Activating[22m[39m project at `C:\AAABIBLIOTEKA\MIT_Boston\MIT_18.S097_Introduction-to-Julia-for-Data-Science`
      From worker 5:	[32m[1m  Activating[22m[39m project at `C:\AAABIBLIOTEKA\MIT_Boston\MIT_18.S097_Introduction-to-Julia-for-Data-Science`
      From worker 2:	[32m[1m  Activating[22m[39m project at `C:\AAABIBLIOTEKA\MIT_Boston\MIT_18.S097_Introduction-to-Julia-for-Data-Science`


Typically, the results are collected into a `DataFrame`.

In [152]:
# Evaluate every (i, j) combination of the parameter grid on the workers. Each
# iteration yields a one-row DataFrame; the rows are merged with append!.
data = @distributed (append!) for (i, j) in vec(collect(Iterators.product(1:4, 1:5)))
    a = rand(1:499)
    b = rand(1:9)*1000
    c = calc(a, b)
    # procid records which process computed the row
    DataFrame(; i=i, j=j, a=a, b=b, c=c, procid=myid())
end

Row,i,j,a,b,c,procid
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64,Int64
1,1,1,193,1000,1386,2
2,2,1,195,4000,4390,2
3,3,1,82,9000,9164,2
4,4,1,93,5000,5186,2
5,1,2,302,8000,8604,3
6,2,2,241,2000,2482,3
7,3,2,370,1000,1740,3
8,4,2,47,8000,8094,3
9,1,3,218,2000,2436,4
10,2,3,20,3000,3040,4


#### Advanced Interprocess communication - cellular automaton example

In [157]:
addprocs(1)

1-element Vector{Int64}:
 7

In [158]:
using Distributed
@everywhere using ParallelDataTransfer, Distributed


# Apply one step of the automaton to the interior cells of this process's
# `Main.caa`, in place: new cell = left XOR (center OR right). The first and
# last entries are border cells and are not modified here.
@everywhere function rule30()
    cells = Main.caa
    left = cells[1]
    for i in 2:(length(cells)-1)
        center = cells[i]          # remember the old value before overwriting it
        cells[i] = xor(left, center || cells[i+1])
        left = center
    end
    return nothing
end


# Accessor for the cell array stored in `Main` on the process that runs the call.
@everywhere getcaa() = Main.caa
# Refresh the two border cells of this process's `Main.caa` from its ring
# neighbours: the left border takes the last interior cell of the neighbour in
# `Main.neighbours[1]`, the right border takes the first interior cell of the
# neighbour in `Main.neighbours[2]`.
@everywhere function getsetborder()
    # `end-1` is the neighbour's last interior cell for any array length, so the
    # segment size is no longer duplicated here as a hard-coded constant.
    Main.caa[1] = (@fetchfrom Main.neighbours[1] getcaa()[end-1])
    Main.caa[end] = (@fetchfrom Main.neighbours[2] getcaa()[2])
end

"""
    printsimdist(workers::Array{Int})

Print the current state of the distributed automaton as one line of text:
for every process in `workers`, fetch its `caa` array and print its interior
cells (borders excluded) as `#` for `true` and a space for `false`.
"""
function printsimdist(workers::Array{Int})
    for w in workers
        segment = @fetchfrom w caa
        print(join(cell ? "#" : " " for cell in segment[2:end-1]))
    end
    println()
    flush(stdout)
end

"""
    runca(steps::Int, visualize::Bool)

Run the distributed cellular automaton for `steps` iterations.

All worker segments are cleared first, then a single live cell is placed on
the middle worker. Every iteration first exchanges the border cells between
neighbouring workers and then applies the rule on each worker. When
`visualize` is `true`, the state is printed before the first step and after
every step.
"""
function runca(steps::Int, visualize::Bool)
    # Take the worker list once instead of relying on globals defined elsewhere.
    wks = workers()
    @sync for w in wks
        @async @fetchfrom w fill!(caa, false)
    end
    # Integer division keeps this valid for an odd number of workers as well
    # (Int(n/2) throws InexactError when n is odd).
    seed = wks[div(length(wks), 2) + 1]
    @fetchfrom seed caa[2]=true
    visualize && printsimdist(wks)
    for i in 1:steps
        # Phase 1: all workers refresh their borders from the previous state.
        @sync for w in wks
            @async @fetchfrom w getsetborder()
        end
        # Phase 2: only after every border is up to date, advance each segment.
        @sync for w in wks
            @async @fetchfrom w rule30()
        end
        visualize && printsimdist(wks)
    end
    return nothing
end



runca (generic function with 1 method)

In [159]:
# Arrange the workers in a ring: every worker learns the ids of its left and
# right neighbour and gets its own segment of 15 cells plus 2 border cells.
wks = workers()
nwks = length(wks)
for i in eachindex(wks)
    left = wks[mod1(i - 1, nwks)]    # wraps around to the last worker
    right = wks[mod1(i + 1, nwks)]   # wraps around to the first worker
    sendto(wks[i], neighbours = (left, right))
    fetch(@defineat wks[i] const caa = zeros(Bool, 15+2));
end

runca(20,true)


                                             #                                            
                                            ###                                           
                                           ##  #                                          
                                          ## ####                                         
                                         ##  #   #                                        
                                        ## #### ###                                       
                                       ##  #    #  #                                      
                                      ## ####  ######                                     
                                     ##  #   ###     #                                    
                                    ## #### ##  #   ###                                   
                                   ##  #    # #### ##  #                                  

*Preparation of this workshop has been supported by the Polish National Agency for Academic Exchange under the Strategic Partnerships programme, grant number BPI/PST/2021/1/00069/U/00001.*

![SGH & NAWA](logo.png)