# Profiling

Reference: https://plmlab.math.cnrs.fr/fabreges/julia-2019/

In [2]:
import LinearAlgebra
import Profile

In [3]:
begin
    
    

    N = 10_000_000
    x = rand(Float64, N)
    y = rand(Float64, N)

    niter = 10

    Profile.clear()
    @Profile.profile begin
    for i=1:niter
        C = LinearAlgebra.norm(y - x, 2)
        global y += C * x
    end
    end

    Profile.print(maxdepth=7)
end

Overhead ╎ [+additional indent] Count File:Line; Function
    ╎955  @Base/task.jl:484; (::IJulia.var"#15#18")()
    ╎ 955  ...lia/src/eventloop.jl:8; eventloop(socket::ZMQ.Socket)
    ╎  955  @Base/essentials.jl:726; invokelatest
    ╎   955  @Base/essentials.jl:729; #invokelatest#2
    ╎    955  ...execute_request.jl:67; execute_request(socket::ZMQ.S...
    ╎     955  ...SoftGlobalScope.jl:65; softscope_include_string(m::...
    ╎    ╎ 955  @Base/loading.jl:1428; include_string(mapexpr::typ...
    ╎    ╎  955  @Base/boot.jl:368; eval
    ╎2012 @Base/task.jl:634; task_done_hook(t::Task)
    ╎ 2012 @Base/task.jl:930; wait()
2011╎  2012 @Base/task.jl:921; poptask(W::Base.InvasiveLinked...
    ╎1006 ...eadingconstructs.jl:258; (::Profile.var"#3#4")()
    ╎ 1006 ...file/src/Profile.jl:39; profile_printing_listener()
    ╎  1006 @Base/asyncevent.jl:155; wait
    ╎   1006 @Base/asyncevent.jl:138; _trywait(t::Base.AsyncCondition)
    ╎    1006 @Base/condition.jl:124; wait(c::Base.GenericCondi

In [4]:
begin
    
    N = 10_000_000
    x = rand(Float64, N)
    y = rand(Float64, N)
    
    niter = 10
    
    Profile.clear()
    @Profile.profile begin
    for i=1:niter
        C = LinearAlgebra.norm(y - x, 2)
        global y += C * x
    end
    end
    
    Profile.print(format=:flat, sortedby=:count)
    
end

 Count  Overhead File                    Line Function
     2         0 @Base/int.jl              87 +
     2         0 @Base/simdloop.jl         78 macro expansion
     3         0 @Base/float.jl           384 -
     5         0 @Base/float.jl           385 *
     6         0 @Base/float.jl           383 +
    14         0 @Base/broadcast.jl       670 _broadcast_getindex_evalf
    14         0 @Base/broadcast.jl       643 _broadcast_getindex
    32         0 @Base/boot.jl            459 Array
    32         0 @Base/boot.jl            468 Array
    32         0 @Base/boot.jl            476 Array
    32         0 @Base/abstractarray.jl   841 similar
    32         0 @Base/abstractarray.jl   840 similar
    32         0 @Base/broadcast.jl       212 similar
    32         0 @Base/broadcast.jl       211 similar
   132         0 @Base/arraymath.jl        21 *(A::Float64, B::Vector{Float64})
   149         0 @Base/array.jl           966 setindex!
   154         0 In[4]                     12

## Memory storage

Julia uses "fortran" storage for 2d array along columns

In [11]:
begin
    function compute_dist!(x, dist)
        N = length(x)
        for i=1:N
            for j=1:N
                dist[i, j] = abs(x[i] - x[j])
            end
        end
    end
    
    N = 10_000
    x = rand(Float64, N)
    dist = Array{Float64}(undef, (N, N))
    
    compute_dist!(x, dist)
    @time compute_dist!(x, dist)
end


  1.288974 seconds (9 allocations: 352 bytes)


In [12]:
begin
    function compute_dist!(x, dist)
        N = length(x)
        for j=1:N
            @simd for i=1:N
                dist[i, j] = abs(x[i] - x[j])
            end
        end
    end
    
    N = 10_000
    x = rand(Float64, N)
    dist = Array{Float64}(undef, (N, N))
    
    compute_dist!(x, dist)
    @time compute_dist!(x, dist)
end

  0.086032 seconds (9 allocations: 352 bytes)


In [12]:
N = 50_000_000
a = 1.2
x = rand(Float64, N)
y = rand(Float64, N)

@time for i in 1:N
    @inbounds y[i] += a * x[i]
end

  8.755449 seconds (350.00 M allocations: 5.215 GiB, 11.48% gc time)


In [15]:
using InteractiveUtils

N = 10
a = 1.2
x = rand(Float64, N)
y = rand(Float64, N)

function axpy()
    for i in 1:N
        @inbounds y[i] += a * x[i]
    end
end

@code_warntype axpy()


MethodInstance for axpy()
  from axpy() in Main at In[15]:8
Arguments
  #self#[36m::Core.Const(axpy)[39m
Locals
  @_2[33m[1m::Union{Nothing, Tuple{Int64, Int64}}[22m[39m
  val[36m::Float64[39m
  i[36m::Int64[39m
Body[36m::Nothing[39m
[90m1 ─[39m %1  = (1:Main.N)[36m::Core.PartialStruct(UnitRange{Int64}, Any[Core.Const(1), Int64])[39m
[90m│  [39m       (@_2 = Base.iterate(%1))
[90m│  [39m %3  = (@_2 === nothing)[36m::Bool[39m
[90m│  [39m %4  = Base.not_int(%3)[36m::Bool[39m
[90m└──[39m       goto #4 if not %4
[90m2 ┄[39m %6  = @_2[36m::Tuple{Int64, Int64}[39m
[90m│  [39m       (i = Core.getfield(%6, 1))
[90m│  [39m %8  = Core.getfield(%6, 2)[36m::Int64[39m
[90m│  [39m       nothing
[90m│  [39m %10 = Base.getindex(Main.y, i)[36m::Float64[39m
[90m│  [39m %11 = Base.getindex(Main.x, i)[36m::Float64[39m
[90m│  [39m %12 = (Main.a * %11)[36m::Float64[39m
[90m│  [39m %13 = (%10 + %12)[36m::Float64[39m
[90m│  [39m       Base.setindex!(Ma

In [16]:
let
    N = 50_000_000
    a = 1.2
    x = rand(Float64, N)
    y = rand(Float64, N)

    @time for i in 1:N
        @inbounds y[i] += a * x[i]
    end
end

  0.151305 seconds


In [19]:

N = 50_000_000
a = 1.2
x = rand(Float64, N)
y = rand(Float64, N)

function axpy!(a::Float64, x::Array{Float64}, y::Array{Float64})
    for i in 1:length(x)
        @inbounds y[i] += a * x[i]
    end
end

# warmup
axpy!(a, x, y)

# timing
@time axpy!(a, x, y)

  0.060045 seconds


In [20]:
using InteractiveUtils

function somme(x::Array{Float64, 2}, dim::Int64)
    if(dim > 0)
        return sum(x, dims=dim)
    else
        return sum(x)
    end
end

N = 10
x = ones(N, N)

@code_warntype somme(x, 0)

MethodInstance for somme(::Matrix{Float64}, ::Int64)
  from somme(x::Matrix{Float64}, dim::Int64) in Main at In[20]:3
Arguments
  #self#[36m::Core.Const(somme)[39m
  x[36m::Matrix{Float64}[39m
  dim[36m::Int64[39m
Body[91m[1m::Union{Float64, Matrix{Float64}}[22m[39m
[90m1 ─[39m %1  = (dim > 0)[36m::Bool[39m
[90m└──[39m       goto #3 if not %1
[90m2 ─[39m %3  = (:dims,)[36m::Core.Const((:dims,))[39m
[90m│  [39m %4  = Core.apply_type(Core.NamedTuple, %3)[36m::Core.Const(NamedTuple{(:dims,)})[39m
[90m│  [39m %5  = Core.tuple(dim)[36m::Tuple{Int64}[39m
[90m│  [39m %6  = (%4)(%5)[36m::NamedTuple{(:dims,), Tuple{Int64}}[39m
[90m│  [39m %7  = Core.kwfunc(Main.sum)[36m::Core.Const(Base.var"#sum##kw"())[39m
[90m│  [39m %8  = (%7)(%6, Main.sum, x)[36m::Matrix{Float64}[39m
[90m└──[39m       return %8
[90m3 ─[39m %10 = Main.sum(x)[36m::Float64[39m
[90m└──[39m       return %10



In [26]:
N = 50_000_000
a = 1.2
x = rand(Float64, N)
y = rand(Float64, N)

# warmup
@. y += a * x

# timing
@time @. y += a * x;

  0.057500 seconds (4 allocations: 128 bytes)


In [1]:
const N = 50_000_000
const a = 1.2
const x = rand(Float64, N)
const y = rand(Float64, N)

const nn = 100
const n_start = 1 + nn
const n_end = N - nn

# warmup
@. y[n_start:n_end] += a * x[n_start:n_end]

# timing
@time @. y[n_start:n_end] += a * x[n_start:n_end]

  0.721204 seconds (4 allocations: 762.936 MiB, 16.35% gc time)


49999800-element view(::Vector{Float64}, 101:49999900) with eltype Float64:
 0.9658489623046363
 1.123964221390209
 2.9420081635204243
 1.321540026708714
 2.1148877441179392
 2.1447056903288617
 3.302557273467911
 2.43665777785713
 0.9122455824780733
 1.497097968318496
 1.5477294770833758
 0.49830645650464067
 0.47753771476557116
 ⋮
 1.504530661267116
 0.6759802016915673
 3.1503238951143047
 2.5850247257200003
 2.7421438610175226
 2.84226948477731
 2.306577456667552
 2.4508362574772162
 1.5488522336253285
 0.8025929855866992
 1.3683234557280306
 1.7760543078292006

In [22]:
const N = 50_000_000
const a = 1.2
const x = rand(Float64, N)
const y = rand(Float64, N)

const nn = 100
const n_start = 1 + nn
const n_end = N - nn

# warmup
@. @views y[n_start:n_end] += a * x[n_start:n_end]

# timing
@time @. @views y[n_start:n_end] += a * x[n_start:n_end]

LoadError: cannot declare N constant; it already has a value

In [23]:
N = 100_000_000
a = 1.2
x = rand(Float64, N)
y = Array{Float64}(undef, N)

function func!(a::Float64, x::Array{Float64}, y::Array{Float64})
    @Threads.threads for i in 1:length(x)
        y[i] = exp(a * x[i] * x[i])
    end
end

# warmup
func!(a, x, y)

# timing
@time func!(a, x, y)

  0.317545 seconds (27 allocations: 2.109 KiB)


In [24]:
Threads.nthreads()

4

To get the performance of benchmarks with Julia, you just have to follow some rules:
- Type the variables as much as possible!
- Put the calculations in functions and pass the global variables as parameters of these functions.
- Use the `const` keyword on global variables, when possible. 
- Use loops or vectorize calculations with the @ macro.
- Avoid unnecessary copies by using the `@views` macro for operations on sub-tables.