# Group Seminar on Julia Pt. II  
## General Performance Tips / Pitfalls to avoid

<img src="http://imgs.xkcd.com/comics/optimization.png"></img>

## From input to machine code

<img src="./JuliaCompilation.png"></img>

Image from __De Sutter et al. (2016)__ https://arxiv.org/abs/1604.03410

In [15]:
parsedExp = parse("10*(2+5)")

:(10 * (2 + 5))

In [16]:
parsedExp.args

3-element Array{Any,1}:
   :*      
 10        
   :(2 + 5)

In [23]:
typeof(parsedExp)

Expr

## Measuring performance

### `@time` is your best friend.

In [22]:
@time begin
    local A = rand(250,250)
    local F = eigfact((A+A')/2)
end

  0.019089 seconds (45 allocations: 3.429 MB)


Base.LinAlg.Eigen{Float64,Float64,Array{Float64,2},Array{Float64,1}}([-6.34939,-6.11064,-6.10436,-6.02585,-5.89586,-5.82497,-5.81078,-5.59252,-5.53155,-5.43081  …  5.53626,5.62821,5.68272,5.80636,6.00727,6.16042,6.176,6.20775,6.26774,125.279],250x250 Array{Float64,2}:
  0.025726      0.0079314   -0.0836035   …   0.0466329   -0.0617622
 -0.0670366    -0.0209826    0.0676174       0.0249354   -0.0611163
  0.111818      0.0321288   -0.00724786     -0.0332679   -0.0626986
 -0.0191875    -0.0152057   -0.0598918       0.00838277  -0.0646816
 -0.0888896    -0.10645     -0.0932347      -0.266264    -0.0617213
  0.022458      0.0210968   -0.0348935   …   0.0657349   -0.0630189
 -0.0102074    -0.0112421    0.0412524       0.069668    -0.0665839
  0.0964242    -0.0264586    0.0014948      -0.00198468  -0.0658235
 -0.125689      0.00607313   0.0535068       0.0190857   -0.0631135
 -0.0240121     0.0138479   -0.0901025      -0.0380534   -0.0628626
 -0.084401      0.0532588    0.0332956   …   0.0992

In [24]:
macroexpand( parse("@time EXPR") )

quote  # util.jl, line 153:
    local #84#stats = Base.gc_num() # util.jl, line 154:
    local #86#elapsedtime = Base.time_ns() # util.jl, line 155:
    local #85#val = EXPR # util.jl, line 156:
    #86#elapsedtime = Base.-(Base.time_ns(),#86#elapsedtime) # util.jl, line 157:
    local #87#diff = Base.GC_Diff(Base.gc_num(),#84#stats) # util.jl, line 158:
    Base.time_print(#86#elapsedtime,#87#diff.allocd,#87#diff.total_time,Base.gc_alloc_count(#87#diff)) # util.jl, line 160:
    #85#val
end

In [35]:
"""
    @timeOften ex n

Evaluate the expression `ex` `n` times in a loop and let `@time` report the
elapsed time and allocations of the whole loop.

`ex` must be an expression (for example a function call) and `n` a literal
`Int`. The result of each evaluation is discarded; the macro call itself
evaluates to `nothing`.
"""
macro timeOften(ex::Expr, n::Int)
    quote
        @time for i = 1:$n
            # `esc` makes the names inside `ex` resolve in the caller's scope
            # rather than in the module that defines this macro.
            local val = $(esc(ex))
        end
    end
end

In [None]:
@timeOften rand() 100000

### More advanced: Profiling

Julia features a __statistical profiler__: it samples the call stack at regular intervals rather than back-tracing every function call.

In [25]:
function myEig(n::Int)  # profiling workload: eigen-factorize a random symmetric n-by-n matrix
    A = rand(n,n)  # n-by-n matrix of random numbers
    return eigfact((A+A')/2)  # (A+A')/2 is the symmetric part of A
end

myEig (generic function with 1 method)

In [26]:
myEig(1);

In [27]:
Profile.clear()

In [28]:
@profile myEig(2500);

In [29]:
Profile.print(format=:flat)

 Count File                       Function                                 Line
   146 ....4/IJulia/src/IJulia.jl eventloop                                 143
   146 .../src/execute_request.jl execute_request_0x535c5df2                183
    12 In[25]                     myEig                                       2
   134 In[25]                     myEig                                       3
    18 array.jl                   reshape                                   148
    17 arraymath.jl               +                                          96
    43 arraymath.jl               ./                                         49
    22 arraymath.jl               transpose!                                323
    21 arraymath.jl               transposeblock!                           340
    87 arraymath.jl               transposeblock!                           346
    95 arraymath.jl               transposeblock!                           347
   111 arraymath.jl               transp

## 1. Type stability

In [31]:
"""
    unstable(x::Integer)

Return `x` converted to `Float64` when it is even and to `Int64` when it is odd.

The return type depends on the *value* of `x`, not only on its type, so this
function is deliberately type-unstable.
"""
unstable(x::Integer) = iseven(x) ? Float64(x) : Int64(x)

unstable (generic function with 1 method)

In [34]:
typeof([ unstable(i) for i in 1:127 ][1])

Int64

In [32]:
typeof( [ unstable(i) for i in 1:127 ] )

Array{Union{Float64,Int64},1}

In [37]:
begin 
    local A = [ unstable(i) for i in 1:127 ]
    @timeOften A+1 100000
end

  0.678259 seconds (12.70 M allocations: 296.021 MB, 7.62% gc time)


In [39]:
begin 
    local A = [ Float64(i) for i in 1:127 ]
    @timeOften A+1 100000
end

  0.038367 seconds (100.00 k allocations: 103.760 MB, 54.03% gc time)


<hr />

In [40]:
function sum_unstable()  # adds i/2 for i = 1..100; deliberately type-unstable
    sum = 0              # the accumulator starts out as an Int ...
    for i = 1:100
        sum = sum + i/2  # ... but `i/2` is a Float64, so `sum` changes type here
    end
    return sum
end
function sum_stable()    # adds i/2 for i = 1..100 with a type-stable accumulator
    sum = 0.0            # Float64 from the start, matching the type of `i/2`
    for i = 1:100
        sum = sum + i/2
    end
    return sum
end

sum_stable (generic function with 1 method)

In [42]:
@timeOften sum_unstable() 100000;
@timeOften sum_stable() 100000;

  0.271960 seconds (20.00 M allocations: 305.176 MB, 20.68% gc time)
  0.007035 seconds


In [44]:
0+"a"

LoadError: LoadError: MethodError: `+` has no method matching +(::Int64, ::ASCIIString)
Closest candidates are:
  +(::Any, ::Any, !Matched::Any, !Matched::Any...)
  +(::Int64, !Matched::Int64)
  +(::Integer, !Matched::Ptr{T})
  ...
while loading In[44], in expression starting on line 1

In [45]:
@code_warntype sum_stable();

Variables:
  sum::Float64
  #s52::Int64
  i::Int64

Body:
  begin  # In[40], line 9:
      sum = 0.0 # In[40], line 10:
      GenSym(0) = $(Expr(:new, UnitRange{Int64}, 1, :(((top(getfield))(Base.Intrinsics,:select_value)::I)((Base.sle_int)(1,100)::Bool,100,(Base.box)(Int64,(Base.sub_int)(1,1)))::Int64)))
      #s52 = (top(getfield))(GenSym(0),:start)::Int64
      unless (Base.box)(Base.Bool,(Base.not_int)(#s52::Int64 === (Base.box)(Base.Int,(Base.add_int)((top(getfield))(GenSym(0),:stop)::Int64,1))::Bool)) goto 1
      2: 
      GenSym(2) = #s52::Int64
      GenSym(3) = (Base.box)(Base.Int,(Base.add_int)(#s52::Int64,1))
      i = GenSym(2)
      #s52 = GenSym(3) # In[40], line 11:
      sum = (Base.box)(Base.Float64,(Base.add_float)(sum::Float64,(Base.box)(Base.Float64,(Base.div_float)((Base.box)(Float64,(Base.sitofp)(Float64,i::Int64)),(Base.box)(Float64,(Base.sitofp)(Float64,2))))))
      3: 
      unless (Base.box)(Base.Bool,(Base.not_int)((Base.box)(Base.Bool,(Base.not_int)(#s52::

In [49]:
@code_native sum_stable()

	.section	__TEXT,__text,regular,pure_instructions
Filename: In[40]
Source line: 11
	pushq	%rbp
	movq	%rsp, %rbp
	xorps	%xmm0, %xmm0
	movl	$1, %eax
	movl	$100, %ecx
	movabsq	$13449345936, %rdx      ## imm = 0x321A4BB90
	movsd	(%rdx), %xmm1
Source line: 11
L31:	xorps	%xmm2, %xmm2
	cvtsi2sdq	%rax, %xmm2
	mulsd	%xmm1, %xmm2
	addsd	%xmm2, %xmm0
Source line: 10
	incq	%rax
Source line: 11
	decq	%rcx
	jne	L31
Source line: 13
	popq	%rbp
	ret


## 2. Be careful with global variables.

In [59]:
# Fine: a `const` global keeps the value (and therefore the type) it is declared with.
const global C = 100000
# Potentially VERY bad: a plain global may be rebound to a value of any type.
global D = 100000

"""
    f(x)

Starting from `s = 0`, apply `s += s + x + i + C` for every `i` in `1:10^6`
and return the final `s`. Reads the constant global `C`.
"""
function f(x)
    s = 0
    for i = 1:10^6
        s = s + (s + x + i + C)
    end
    return s
end

"""
    g(x)

Same loop as `f`, but reading the non-constant global `D`, whose value is
asserted to be an `Int64` at each use.
"""
function g(x)
    s = 0
    for i = 1:10^6
        s = s + (s + x + i + D::Int64)
    end
    return s
end

g (generic function with 1 method)

In [52]:
@time f(10);

  0.000544 seconds (5 allocations: 176 bytes)


In [57]:
@time g(10);

  0.089411 seconds (4.00 M allocations: 61.028 MB, 13.60% gc time)


In [64]:
"""
    g(x, y::Int=D)

Starting from `s = 0`, apply `s += s + x + i + y` for every `i` in
`1:Int(1e6)` and return the final `s`.

`y` defaults to the current value of the global `D`; inside the loop it is an
ordinary `Int` argument.
"""
function g(x, y::Int=D)
    s = 0
    for i = 1:Int(1e6)
        s = s + (s + x + i + y)
    end
    return s
end

g (generic function with 2 methods)

In [67]:
@time g(10,3);

  0.000543 seconds (5 allocations: 176 bytes)


__Q: Why is a `const` so much more efficient?__  
A: The compiler cannot infer the type of a non-constant global, because it may be rebound to a value of any type at any time. A constant's type is fixed at declaration, so its value can be inlined.

In [None]:
@code_llvm f(10)

In [None]:
@code_llvm g(10)

In [None]:
@code_llvm g(10,D)

<span style="font-size:14pt">$\Longrightarrow$ Adopt a __functional__ programming style!</span>  



## 3. Respect the order.
### Unlike C, Julia has _column major_ order!

In [68]:
"""
    add_matrices_one(A::Matrix, B::Matrix)

Return a new matrix `C` with `C[i,j] = A[i,j] + B[i,j]`.

`C` has the size and element type of `A`. The row index `i` is the outer loop
and the column index `j` the inner one; this ordering is kept on purpose so it
can be compared against `add_matrices_two`.

Array accesses are bounds-checked, so a `B` that is smaller than `A` raises a
`BoundsError`.
"""
function add_matrices_one(A::Matrix, B::Matrix)
    # Uninitialised output with the size and element type of `A`.
    C = similar(A)
    for i = 1:size(A, 1)
        for j = 1:size(A, 2)
            C[i, j] = A[i, j] + B[i, j]
        end
    end
    return C
end
"""
    add_matrices_two(A::Matrix, B::Matrix)

Return a new matrix `C` with `C[j,i] = A[j,i] + B[j,i]`.

`C` has the size and element type of `A`. The column index `i` is the outer
loop and the row index `j` the inner one, so consecutive iterations touch
consecutive elements of a column.

Array accesses are bounds-checked, so a `B` that is smaller than `A` raises a
`BoundsError`.
"""
function add_matrices_two(A::Matrix, B::Matrix)
    # Uninitialised output with the size and element type of `A`.
    C = similar(A)
    for i = 1:size(A, 2)
        for j = 1:size(A, 1)
            C[j, i] = A[j, i] + B[j, i]
        end
    end
    return C
end

add_matrices_two (generic function with 1 method)

In [69]:
A = rand(1000,1000);
B = rand(1000,1000);

In [71]:
@time add_matrices_one(A,B);
@time add_matrices_two(A,B);

  0.025518 seconds (6 allocations: 7.630 MB)
  0.004613 seconds (6 allocations: 7.630 MB, 32.92% gc time)


__Note:__ Don't bother implementing linear algebra operations yourself; the built-in ones are already optimized.

In [72]:
@time A+B;

  0.001952 seconds (9 allocations: 7.630 MB)


## 4. Disable safety nets where appropriate.

By default Julia performs __bounds checks__ on every array access. An out-of-bounds index raises an exception instead of crashing the kernel.

In [79]:
add_matrices_three(rand(4,4),rand(3,3))

However, these checks can cost performance and are unnecessary if we can guarantee in-bounds access $\rightarrow$ __@inbounds__

In [73]:
"""
    add_matrices_three(A::Matrix, B::Matrix)

Return a new matrix `C` with `C[j,i] = A[j,i] + B[j,i]`, or `nothing` when
`A` and `B` differ in size.

`C` has the size and element type of `A`. Bounds checks are switched off in
the loop with `@inbounds`; the size comparison at the top is what makes that
safe.
"""
function add_matrices_three(A::Matrix, B::Matrix)
    # Check sizes first so that no output buffer is allocated on the mismatch path.
    if size(A) != size(B)
        return nothing
    end
    # Uninitialised output with the size and element type of `A`.
    C = similar(A)
    @inbounds for i = 1:size(A, 2)
        for j = 1:size(A, 1)
            C[j, i] = A[j, i] + B[j, i]
        end
    end
    return C
end

add_matrices_three (generic function with 1 method)

In [76]:
@time add_matrices_three(A,B);

  0.002876 seconds (6 allocations: 7.630 MB)


## 5. Pre-allocate output

_[Verbatim from http://docs.julialang.org/en/release-0.4/manual/performance-tips/#pre-allocating-outputs]_

In [80]:
# Return a newly built 3-element array holding `x`, `x+1` and `x+2`.
function xinc(x)
    return [x, x+1, x+2]
end

# Call `xinc(i)` for every `i` in `1:10^7` and return the sum of the second
# element of each returned array.
function loopinc()
    y = 0
    for i = 1:10^7
        ret = xinc(i)  # `xinc` builds a new array on every iteration
        y += ret[2]
    end
    y  # last expression: this is the function's return value
end

loopinc (generic function with 1 method)

In [82]:
@time loopinc();

  0.716213 seconds (40.00 M allocations: 1.341 GB, 29.47% gc time)


Every call to `xinc` allocates __a new array__.   
$\rightarrow$ Better allocate the array beforehand and __update it__.

In [83]:
# In-place variant of `xinc`: store `x`, `x+1` and `x+2` in `ret[1]`, `ret[2]`
# and `ret[3]`. The type parameter `T` requires `x` to have the element type of
# `ret`. Returns `nothing`; the result is delivered through `ret`.
function xinc!{T}(ret::AbstractVector{T}, x::T)
    ret[1] = x
    ret[2] = x+1
    ret[3] = x+2
    nothing
end

# Sum `ret[2]` over `i` in `1:10^7`, where `ret` is refilled by `xinc!(ret, i)`
# on each iteration. The buffer `ret` is created once, before the loop.
function loopinc_prealloc()
    ret = Array(Int, 3)  # the one buffer reused by every iteration
    y = 0
    for i = 1:10^7
        xinc!(ret, i)  # overwrites `ret` in place
        y += ret[2]
    end
    y  # last expression: this is the function's return value
end

loopinc_prealloc (generic function with 1 method)

In [85]:
@time loopinc_prealloc();

  0.026791 seconds (6 allocations: 272 bytes)


## Linear Algebra?

__Increase the number of BLAS threads!!__

In [12]:
blas_set_num_threads(1)

In [10]:
A = rand(4096,4096);

In [13]:
@time A*A;

  3.129976 seconds (9 allocations: 128.000 MB)


In [14]:
blas_set_num_threads(4)

In [16]:
@time A*A;

  0.919351 seconds (9 allocations: 128.000 MB)
