In [1]:
ENV["METAL_CAPTURE_ENABLED"] = 1

1

In [2]:
using Metal, BenchmarkTools, LinearAlgebra

In [3]:
function matrix_multiplication_kernel(A, B, C)
    idx = thread_position_in_grid_1d()
    N = 10
    for _ in 1:N
        C[idx] += sin(A[idx]) * exp(B[idx])
    end
    return
end;

In [4]:
m, n, p = 1_000, 1_000, 1_000

A = Metal.rand(Float32, (m, n); storage=Shared)
B = Metal.rand(Float32, (n, p); storage=Shared)
C = Metal.zeros(Float32, (m, p); storage=Shared)

C_cpu = unsafe_wrap(Array{Float32}, C, size(C))

len = 1024*1024*100
n_threads = 1024
n_groups = cld(len, n_threads);


2024-02-27 14:01:04.734 julia[60000:6100152] Metal GPU Frame Capture Enabled


In [5]:
benchmark_results = @benchmark Metal.@sync begin
    @metal threads=$n_threads groups=$n_groups matrix_multiplication_kernel($A, $B, $C)
end

BenchmarkTools.Trial: 483 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m 7.308 ms[22m[39m … [35m 15.547 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m10.392 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m10.368 ms[22m[39m ± [32m966.928 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m█[39m▃[39m▂[39m [39m [39m [39m [39m▂[39m [34m [39m[39m [39m [39m [39m [39m [39m [39m▂[39m▃[39m [39m▁[39m▂[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▃[39m▁[39m▂[39m▁[39m▃[39m▁

In [6]:
function generate_report(benchmark_results)
    println("Benchmark Details:")
    show(benchmark_results)
    println("\nAdditional Information:")
    println("  - Secs: ", minimum(benchmark_results.times))
    println("  - Memory estimate: ", benchmark_results.memory) 
end;

In [7]:
generate_report(benchmark_results)
println(benchmark_results)
finalize(A)
finalize(B)
finalize(C)


Benchmark Details:
Trial(7.308 ms)
Additional Information:
  - Secs: 7.307541e6
  - Memory estimate: 4496
Trial(7.308 ms)


In [8]:
# The resulting matrix can now be modified on the CPU
inv(C_cpu)

1000×1000 Matrix{Float32}:
  7.62464f-6   8.36633f-6   3.32465f-6  …  -7.15063f-6  -6.4118f-7
 -1.04566f-5   9.15481f-6  -1.22784f-5      2.70968f-5   1.34434f-5
  5.63164f-6  -9.32931f-6   5.27309f-6     -2.91817f-5  -8.1776f-6
 -1.40291f-5  -6.98569f-6  -6.24964f-7      2.834f-5    -1.68307f-6
 -8.8691f-6   -4.65356f-6  -5.68081f-6      1.08244f-5   8.83836f-6
 -5.63228f-6   5.87806f-6  -8.96055f-6  …   1.40251f-5   1.0643f-5
 -1.55907f-6   1.02844f-6  -5.60905f-6      1.1255f-5    1.84362f-6
  4.98201f-6   8.72346f-6  -6.78458f-7     -6.25853f-6   5.35293f-6
 -8.15554f-6  -1.48945f-5   5.37479f-6     -2.74923f-6  -1.05896f-5
 -1.10733f-5  -2.75375f-6  -8.89457f-6      1.92805f-5   1.01614f-5
  9.43542f-6   6.30736f-6  -6.79795f-7  …  -2.33425f-5   1.86059f-6
 -4.67163f-6  -6.32272f-6   1.52284f-6      8.49268f-6  -1.32841f-6
  1.33908f-6  -4.64726f-6   1.49257f-6     -4.55839f-6  -2.49072f-6
  ⋮                                     ⋱               
 -5.56182f-6   2.56613f-6  -8.68572