# Profile and improve performance

notebook from [Benoît Fabrèges](https://plmlab.math.cnrs.fr/fabreges/julia-2019/)


In [1]:
import LinearAlgebra
import Profile

In [3]:
N = 10_000_000
x = rand(Float64, N)
y = rand(Float64, N)

niter = 10  # number of iterations executed under the profiler

Profile.clear()  # drop backtraces left over from any earlier profiling run
@Profile.profile begin  # collect profiling samples while the loop below runs
    for i=1:niter
        C = LinearAlgebra.norm(y - x, 2)  # `y - x` builds a temporary array before its 2-norm is taken
        global y += C * x  # `C * x` and the sum each build a new array; global `y` is rebound to the result
    end
end

Profile.print(maxdepth=7)  # tree view of the collected samples, limited to 7 levels of nesting

Overhead ╎ [+additional indent] Count File:Line; Function
    ╎1050 @Base/task.jl:484; (::IJulia.var"#15#18")()
    ╎ 1050 ...lia/src/eventloop.jl:8; eventloop(socket::ZMQ.Socket)
    ╎  1050 @Base/essentials.jl:726; invokelatest
    ╎   1050 @Base/essentials.jl:729; #invokelatest#2
    ╎    1050 ...execute_request.jl:67; execute_request(socket::ZMQ.S...
    ╎     1050 ...SoftGlobalScope.jl:65; softscope_include_string(m::...
    ╎    ╎ 1050 @Base/loading.jl:1428; include_string(mapexpr::typ...
    ╎    ╎  1050 @Base/boot.jl:368; eval
    ╎2252 @Base/task.jl:634; task_done_hook(t::Task)
    ╎ 2252 @Base/task.jl:930; wait()
2251╎  2252 @Base/task.jl:921; poptask(W::Base.InvasiveLinked...
    ╎1126 ...eadingconstructs.jl:258; (::Profile.var"#3#4")()
    ╎ 1126 ...file/src/Profile.jl:39; profile_printing_listener()
    ╎  1126 @Base/asyncevent.jl:155; wait
    ╎   1126 @Base/asyncevent.jl:138; _trywait(t::Base.AsyncCondition)
    ╎    1126 @Base/condition.jl:124; wait(c::Base.GenericCondi

In [4]:
Profile.print(format=:flat, sortedby=:count)  # flat listing (no call tree), ordered by sample count

 Count  Overhead File                    Line Function
     1         0 @Base/boot.jl            411 LineInfoNode
     1         0 ...r/ssair/inlining.jl   331 ir_inline_item!(compact::Core.C...
     1         0 ...ompiler/optimize.jl   192 stmt_effect_free(stmt::Any, rt:...
     1         0 @Base/array.jl          1058 push!
     1         0 ...r/ssair/slot2ssa.jl    41 scan_slot_def_use(nargs::Int64,...
     1         0 ...r/ssair/slot2ssa.jl   173 rename_uses!
     1         0 ...r/ssair/slot2ssa.jl   761 construct_ssa!(ci::Core.CodeInf...
     1         0 ...r/ssair/slot2ssa.jl   148 fixemup!(cond::Core.Compiler.va...
     1         0 ...r/ssair/slot2ssa.jl   815 construct_ssa!(ci::Core.CodeInf...
     1         0 @Base/sort.jl            704 sort!
     1         0 @Base/sort.jl            769 #sort#9
     1         0 @Base/sort.jl            769 sort
     1         0 ...r/ssair/slot2ssa.jl   400 domsort_ssa!(ir::Core.Compiler....
     1         0 @Base/sort.jl            722 sort!

     1         0 @Base/array.jl           404 getindex
     1         0 ...r/ssair/inlining.jl  1242 handle_const_call!(ir::Core.Com...
     1         0 ...ler/ssair/passes.jl  1101 adce_pass!(ir::Core.Compiler.IR...
     1         0 ...r/ssair/slot2ssa.jl   432 domsort_ssa!(ir::Core.Compiler....
     1         0 ...ompiler/ssair/ir.jl   193 Core.Compiler.InstructionStream...
     1         0 ...r/ssair/slot2ssa.jl   526 compute_live_ins
     1         0 ...r/ssair/slot2ssa.jl   644 construct_ssa!(ci::Core.CodeInf...
     1         0 @Base/array.jl           676 _array_for
     1         0 @Base/array.jl           679 _array_for
     1         0 ...r/ssair/slot2ssa.jl   532 compute_live_ins(cfg::Core.Comp...
     1         0 @Base/array.jl           369 copy
     1         0 ...iler/typelattice.jl   438 stupdate!
     1         0 ...ler/ssair/legacy.jl    35 inflate_ir(ci::Core.CodeInfo, s...
     1         0 ...ompiler/optimize.jl   541 run_passes(ci::Core.CodeInfo, s...
     1       

   159         0 @Base/broadcast.jl       643 _broadcast_getindex
   200         0 @Base/boot.jl            476 Array
   200         0 @Base/broadcast.jl       212 similar
   200         0 @Base/broadcast.jl       211 similar
   201         0 @Base/abstractarray.jl   841 similar
   201         0 @Base/abstractarray.jl   840 similar
   206         0 @Base/boot.jl            468 Array
   213         0 @Base/boot.jl            459 Array
   233         0 @Base/arraymath.jl         8 -(A::Vector{Float64}, B::Vector...
   248         0 @Base/arraymath.jl        16 +(A::Vector{Float64}, Bs::Vecto...
   288         0 @Base/array.jl           966 setindex!
   295         0 @Base/broadcast.jl       597 getindex
   305         0 @Base/arraymath.jl        21 *(A::Float64, B::Vector{Float64})
   424         0 In[3]                     10 macro expansion
   583         0 @Base/broadcast.jl       961 macro expansion
   583         0 @Base/simdloop.jl         77 macro expansion
   586         0 @Base/

## Memory storage

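Julia stores 2D arrays in column-major ("Fortran") order: the elements of a column are contiguous in memory, so the innermost loop should run over the first (row) index.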


In [10]:
using BenchmarkTools

"""
    compute_dist!(x, dist)

Fill `dist` in place so that `dist[i, j] == abs(x[i] - x[j])` for all `i` and `j`
in `1:length(x)`. Returns `nothing`.

The inner loop runs over the second (column) index `j`, so consecutive writes
land in different columns of `dist`.
"""
function compute_dist!(x, dist)
    n = length(x)
    # In a multi-range `for`, the first range is the outer loop: `i` outer, `j` inner.
    for i in 1:n, j in 1:n
        dist[i, j] = abs(x[i] - x[j])
    end
    return nothing
end

N = 10_000
x = rand(Float64, N)
dist = Array{Float64}(undef, (N, N))  # uninitialized N×N buffer; compute_dist! assigns every entry

# `$` interpolates the values into the benchmark, so access to the globals is not part of the timing.
@btime compute_dist!($x, $dist)


  873.708 ms (0 allocations: 0 bytes)


In [11]:
"""
    compute_dist!(x, dist)

Fill `dist` in place so that `dist[i, j] == abs(x[i] - x[j])` for all `i` and `j`
in `1:length(x)`. Returns `nothing`.

The inner loop runs over the first (row) index and is annotated with `@simd`, so
consecutive writes walk down a single column of `dist`.
"""
function compute_dist!(x, dist)
    n = length(x)
    for col in 1:n
        xcol = x[col]  # constant for the whole column
        @simd for row in 1:n
            dist[row, col] = abs(x[row] - xcol)
        end
    end
    return nothing
end

N = 10_000
x = rand(Float64, N)
dist = Array{Float64}(undef, (N, N))  # uninitialized N×N buffer; compute_dist! assigns every entry

# Same benchmark call as for the previous definition of compute_dist!.
@btime compute_dist!($x, $dist)

  63.680 ms (0 allocations: 0 bytes)


In [None]:
N = 50_000_000
a = 1.2
x = rand(Float64, N)
y = rand(Float64, N)

# The loop runs at top level, so `N`, `a`, `x` and `y` are non-constant globals here.
@time for i in 1:N
    @inbounds y[i] += a * x[i]  # @inbounds skips bounds checking on the indexing
end

In [None]:
using InteractiveUtils

N = 10
a = 1.2
x = rand(Float64, N)
y = rand(Float64, N)

# Takes no arguments: `N`, `a`, `x` and `y` are all read from the non-constant
# globals assigned above.
function axpy()
    for i in 1:N
        @inbounds y[i] += a * x[i]
    end
end

@code_warntype axpy()  # print the types inferred for the body of axpy

In [None]:
# `let` opens a new scope: `N`, `a`, `x` and `y` below are locals of this block,
# not globals.
let
    N = 50_000_000
    a = 1.2
    x = rand(Float64, N)
    y = rand(Float64, N)

    @time for i in 1:N
        @inbounds y[i] += a * x[i]  # @inbounds skips bounds checking on the indexing
    end
end

In [None]:
# Inputs for the axpy! calls below: a scalar and two random vectors of length N.
N = 50_000_000
a = 1.2
x = rand(Float64, N)
y = rand(Float64, N)

"""
    axpy!(a::Float64, x::Array{Float64}, y::Array{Float64})

Update `y` in place with `y[i] += a * x[i]` for every index `i` of `x`, and return
`nothing`. When `y` is longer than `x`, only its first `length(x)` entries are
modified.

# Throws
- `DimensionMismatch`: if `y` has fewer elements than `x`.
"""
function axpy!(a::Float64, x::Array{Float64}, y::Array{Float64})
    nx, ny = length(x), length(y)
    # The loop disables bounds checking, so a `y` shorter than `x` would be
    # written past its end; reject that case before touching any data.
    if ny < nx
        throw(DimensionMismatch("y has length $ny, but x has length $nx"))
    end
    for i in eachindex(x)
        @inbounds y[i] += a * x[i]
    end
    return nothing
end

# warmup: the first call includes compilation, so run it once before timing
axpy!(a, x, y)

# timing
@time axpy!(a, x, y)

In [12]:
"""
    somme(x::Array{Float64, 2}, dim::Int64)

Sum the entries of `x`: along dimension `dim` when `dim > 0` (the result is a
matrix), or over the whole array otherwise (the result is a `Float64`).

The return type therefore depends on the *value* of `dim`, not only on the types
of the arguments.
"""
function somme(x::Array{Float64, 2}, dim::Int64)
    # Non-positive `dim`: reduce over every entry and return a scalar.
    dim > 0 || return sum(x)
    return sum(x, dims=dim)
end

N = 10
x = ones(N, N)

@code_warntype somme(x, 0)  # print the types inferred for the body of somme

MethodInstance for somme(::Matrix{Float64}, ::Int64)
  from somme(x::Matrix{Float64}, dim::Int64) in Main at In[12]:1
Arguments
  #self#::Core.Const(somme)
  x::Matrix{Float64}
  dim::Int64
Body::Union{Float64, Matrix{Float64}}
1 ─ %1  = (dim > 0)::Bool
└──       goto #3 if not %1
2 ─ %3  = (:dims,)::Core.Const((:dims,))
│   %4  = Core.apply_type(Core.NamedTuple, %3)::Core.Const(NamedTuple{(:dims,)})
│   %5  = Core.tuple(dim)::Tuple{Int64}
│   %6  = (%4)(%5)::NamedTuple{(:dims,), Tuple{Int64}}
│   %7  = Core.kwfunc(Main.sum)::Core.Const(Base.var"#sum##kw"())
│   %8  = (%7)(%6, Main.sum, x)::Matrix{Float64}
└──       return %8
3 ─ %10 = Main.sum(x)::Float64
└──       return %10



In [13]:
N = 50_000_000
a = 1.2
x = rand(Float64, N)
y = rand(Float64, N)

# warmup: the first evaluation includes compilation
@. y += a * x  # `@.` adds a dot to every operation: y .+= a .* x

# timing
@time @. y += a * x;

  0.059362 seconds (4 allocations: 128 bytes)


In [14]:
# NOTE: `const` cannot be applied to a name that already holds a value in the
# session; the earlier cells assigned `N`, `a`, `x` and `y` as ordinary globals.
const N = 50_000_000
const a = 1.2
const x = rand(Float64, N)
const y = rand(Float64, N)

# Skip `nn` elements at each end of the arrays.
const nn = 100
const n_start = 1 + nn
const n_end = N - nn

# warmup
# Without `@views`, a slice such as `x[n_start:n_end]` read on the right-hand side is a copy.
@. y[n_start:n_end] += a * x[n_start:n_end]

# timing
@time @. y[n_start:n_end] += a * x[n_start:n_end]

LoadError: cannot declare N constant; it already has a value

In [None]:
# NOTE: `const` cannot be applied to a name that already holds a value in the
# session; run this cell in a session where `N`, `a`, `x` and `y` are not yet assigned.
const N = 50_000_000
const a = 1.2
const x = rand(Float64, N)
const y = rand(Float64, N)

# Skip `nn` elements at each end of the arrays.
const nn = 100
const n_start = 1 + nn
const n_end = N - nn

# warmup
# `@views` turns the slices into views of `x` and `y` instead of copies.
@. @views y[n_start:n_end] += a * x[n_start:n_end]

# timing
@time @. @views y[n_start:n_end] += a * x[n_start:n_end]

In [None]:
N = 100_000_000
a = 1.2
x = rand(Float64, N)
y = Array{Float64}(undef, N)  # uninitialized output buffer; func! assigns y[i] for each index of x

"""
    func!(a::Float64, x::Array{Float64}, y::Array{Float64})

Store `exp(a * x[i] * x[i])` into `y[i]` for every `i` in `1:length(x)`, with the
iterations distributed across threads by `Threads.@threads`. Returns `nothing`.
"""
function func!(a::Float64, x::Array{Float64}, y::Array{Float64})
    n = length(x)
    Threads.@threads for k in 1:n
        xk = x[k]
        # Keep the left-to-right product `(a * xk) * xk`; regrouping it could
        # change the floating-point result.
        y[k] = exp(a * xk * xk)
    end
    return nothing
end

# warmup: the first call includes compilation, so run it once before timing
func!(a, x, y)

# timing
@time func!(a, x, y)

In [None]:
Threads.nthreads()  # number of threads available to this Julia session

To get benchmark-level performance from Julia, you just have to follow a few rules:
- Type the variables as much as possible!
- Put the calculations in functions and pass the global variables as parameters of these functions.
- Use the `const` keyword on global variables, when possible. 
- Use loops, or vectorize calculations with the `@.` macro (fused broadcasting).
- Avoid unnecessary copies by using the `@views` macro for operations on sub-arrays (slices).
