# Parallel and Distributed Computing

In [1]:
# Estimate π by Monte Carlo sampling.
#
# Draws `n` points uniformly from the unit square and returns four times the
# fraction of them that falls inside the quarter unit circle.
#
# Arguments:
# - `n`: number of sample points. Must be a positive whole number; an
#   integer-valued float such as `1e6` is accepted and converted to `Int`.
#
# Returns a `Float64` estimate of π.
# Throws `ArgumentError` when `n` is not a positive whole number.
function calculate_pi(n)
    (n isa Real && isinteger(n) && n > 0) ||
        throw(ArgumentError("n must be a positive whole number, got $n"))
    # Convert once so the loop runs over an integer range and the divisor is
    # exactly the number of points drawn, even when the caller passes a float.
    samples = Int(n)
    inside = 0
    for _ in 1:samples
        x = rand()
        y = rand()
        # A Bool adds as 0 or 1, so this counts the points inside the circle.
        inside += (x^2 + y^2 <= 1.0)
    end
    return 4 * inside / samples
end

calculate_pi (generic function with 1 method)

In [2]:
# Pass the sample count as an integer literal rather than the Float64 `1e10`,
# so the sampling loop iterates over an integer range.
calculate_pi(10_000_000_000)

3.1415561152

In [3]:
using Distributed

# Add one worker process per logical CPU thread reported by the system
addprocs(Sys.CPU_THREADS)


4-element Vector{Int64}:
 2
 3
 4
 5

In [4]:
# Define the Monte Carlo π estimator on every process so workers can call it.
#
# Draws `n` points uniformly from the unit square and returns four times the
# fraction of them that falls inside the quarter unit circle.
#
# Arguments:
# - `n`: number of sample points. Must be a positive whole number; an
#   integer-valued float such as `2.5e9` is accepted and converted to `Int`.
#
# Returns a `Float64` estimate of π.
# Throws `ArgumentError` when `n` is not a positive whole number.
@everywhere function calculate_pi(n)
    (n isa Real && isinteger(n) && n > 0) ||
        throw(ArgumentError("n must be a positive whole number, got $n"))
    # Convert once so the loop runs over an integer range and the divisor is
    # exactly the number of points drawn, even when the caller passes a float.
    samples = Int(n)
    inside = 0
    for _ in 1:samples
        x = rand()
        y = rand()
        # A Bool adds as 0 or 1, so this counts the points inside the circle.
        inside += (x^2 + y^2 <= 1.0)
    end
    return 4 * inside / samples
end

In [5]:
using Statistics

# Each loop iteration estimates π from `n` points, so the total sample size is
# n * nworkers(). `n` is an integer literal (not the Float64 `2.5 * 10^9`) so
# the sampling loop iterates over an integer range.
n = 2_500_000_000

# Run one iteration per worker and gather the individual estimates with `vcat`.
pies = @distributed (vcat) for i in 1:nworkers()
    calculate_pi(n)
end

# Every estimate uses the same number of points, so a plain mean combines them.
pi_estimate = mean(pies)

println("Estimate of π: $pi_estimate")


Estimate of π: 3.1415866864


## GPU Programming

In [9]:
# Install the Metal package into the active environment (only needed once)
import Pkg
Pkg.add("Metal")

[32m[1m   Resolving[22m[39m package versions...


[32m[1m   Installed[22m[39m XZ_jll ─────────────── v5.4.5+0
[32m[1m   Installed[22m[39m UnsafeAtomicsLLVM ──── v0.1.3
[32m[1m   Installed[22m[39m TimerOutputs ───────── v0.5.23


[32m[1m   Installed[22m[39m Metal_LLVM_Tools_jll ─ v0.5.1+0
[32m[1m   Installed[22m[39m ExprTools ──────────── v0.1.10
[32m[1m   Installed[22m[39m Python_jll ─────────── v3.10.8+1
[32m[1m   Installed[22m[39m GPUArrays ──────────── v9.1.0


[32m[1m   Installed[22m[39m LLVMExtra_jll ──────── v0.0.27+1
[32m[1m   Installed[22m[39m GPUArraysCore ──────── v0.1.5
[32m[1m   Installed[22m[39m UnsafeAtomics ──────── v0.2.1
[32m[1m   Installed[22m[39m KernelAbstractions ─── v0.9.13
[32m[1m   Installed[22m[39m Atomix ─────────────── v0.1.0


[32m[1m   Installed[22m[39m SQLite_jll ─────────── v3.43.0+0
[32m[1m   Installed[22m[39m LLVM ───────────────── v6.4.0
[32m[1m   Installed[22m[39m GPUCompiler ────────── v0.24.5


[32m[1m   Installed[22m[39m StructIO ───────────── v0.3.0
[32m[1m   Installed[22m[39m LibMPDec_jll ───────── v2.5.1+0
[32m[1m   Installed[22m[39m ObjectiveC ─────────── v1.0.0
[32m[1m   Installed[22m[39m Metal ──────────────── v0.5.1
[32m[1m   Installed[22m[39m ObjectFile ─────────── v0.4.1


[32m[1m    Updating[22m[39m `~/.julia/environments/v1.9/Project.toml`
  [90m[dde4c033] [39m[92m+ Metal v0.5.1[39m
[32m[1m    Updating[22m[39m `~/.julia/environments/v1.9/Manifest.toml`


  [90m[a9b6321e] [39m[92m+ Atomix v0.1.0[39m
  [90m[e2ba6199] [39m[92m+ ExprTools v0.1.10[39m
  [90m[0c68f7d7] [39m[92m+ GPUArrays v9.1.0[39m
  [90m[46192b85] [39m[92m+ GPUArraysCore v0.1.5[39m
[33m⌅[39m [90m[61eb1bfa] [39m[92m+ GPUCompiler v0.24.5[39m
  [90m[63c18a36] [39m[92m+ KernelAbstractions v0.9.13[39m
  [90m[929cbde3] [39m[92m+ LLVM v6.4.0[39m
  [90m[dde4c033] [39m[92m+ Metal v0.5.1[39m
  [90m[d8793406] [39m[92m+ ObjectFile v0.4.1[39m
  [90m[e86c9b32] [39m[92m+ ObjectiveC v1.0.0[39m
  [90m[53d494c1] [39m[92m+ StructIO v0.3.0[39m
  [90m[a759f4b9] [39m[92m+ TimerOutputs v0.5.23[39m
  [90m[013be700] [39m[92m+ UnsafeAtomics v0.2.1[39m
  [90m[d80eeb9a] [39m[92m+ UnsafeAtomicsLLVM v0.1.3[39m
  [90m[dad2f222] [39m[92m+ LLVMExtra_jll v0.0.27+1[39m
  [90m[7106de7a] [39m[92m+ LibMPDec_jll v2.5.1+0[39m
  [90m[0418c028] [39m[92m+ Metal_LLVM_Tools_jll v0.5.1+0[39m
  [90m[93d3a430] [39m[92m+ Python_jll v3.10.8+1[39m
  

[32m[1mPrecompiling[22m[39m 

project...


[32m  ✓ [39m[90mStructIO[39m
[32m  ✓ [39m[90mExprTools[39m


[32m  ✓ [39m[90mMetal_LLVM_Tools_jll[39m


[32m  ✓ [39m[90mLibMPDec_jll[39m


[32m  ✓ [39m[90mSQLite_jll[39m
[32m  ✓ [39m[90mLLVMExtra_jll[39m
[32m  ✓ [39m[90mUnsafeAtomics[39m


[32m  ✓ [39m[90mXZ_jll[39m
[32m  ✓ [39m[90mGPUArraysCore[39m


[32m  ✓ [39m[90mAtomix[39m


[32m  ✓ [39m[90mObjectiveC[39m


[32m  ✓ [39m[90mObjectFile[39m


[32m  ✓ [39m[90mArrayInterface → ArrayInterfaceGPUArraysCoreExt[39m


[32m  ✓ [39m[90mTimerOutputs[39m


[32m  ✓ [39m[90mPython_jll[39m


[32m  ✓ [39m[90mLLVM[39m


[32m  ✓ [39m[90mUnsafeAtomicsLLVM[39m
[32m  ✓ [39m[90mGPUArrays[39m


[32m  ✓ [39m[90mKernelAbstractions[39m


[32m  ✓ [39m[90mGPUCompiler[39m


[32m  ✓ [39mMetal


[32m  ✓ [39m[90mMetal → SpecialFunctionsExt[39m
  22 dependencies successfully precompiled in 32 seconds. 321 already precompiled.


In [14]:
using Metal

# GPU kernel: for the point handled by this thread, store 1 in `inside[i]` when
# (xs[i], ys[i]) satisfies x^2 + y^2 <= 1 and 0 otherwise.
#
# Arguments:
# - `xs`, `ys`: arrays holding the point coordinates.
# - `inside`: array receiving the per-point 0/1 flag.
#
# Returns nothing; results are written to `inside`.
function pi_approximation_kernel(xs, ys, inside)
    i = thread_position_in_grid_1d()
    # The launch grid may be rounded up to a whole number of thread groups, so
    # surplus threads past the end of the data must exit without touching memory.
    i <= length(inside) || return
    x = xs[i]
    y = ys[i]
    # Compare against `one(x)` so the test stays in the element type of `xs`
    # instead of promoting a Float32 coordinate to Float64.
    @inbounds inside[i] = x^2 + y^2 <= one(x) ? 1 : 0
    return
end

# Number of sample points. Each Float32 coordinate array takes 4n bytes, so
# 10^10 points would need about 40 GB per array; 10^8 keeps each array near 400 MB.
n = 10^8

# Create arrays on the GPU
xs = MtlArray(rand(Float32, n))
ys = MtlArray(rand(Float32, n))
inside = MtlArray(zeros(Int32, n))

# Define thread configuration; `cld` rounds the group count up in integer
# arithmetic so every point is covered by a thread.
threads_per_group = 256
num_groups = cld(n, threads_per_group)

# Launch the kernel
@metal threads=threads_per_group groups=num_groups pi_approximation_kernel(xs, ys, inside)

# Copy the result back to the CPU and calculate π
inside_cpu = Array(inside)
pi_estimate = 4 * sum(inside_cpu) / n

println("Estimate of π: $pi_estimate")