# Group Seminar on Julia Pt. II  
## General Performance Tips / Pitfalls to avoid

<img src="http://imgs.xkcd.com/comics/optimization.png"></img>

## From input to machine code

<img src="./JuliaCompilation.png"></img>

Image from __De Sutter et al. (2016)__ https://arxiv.org/abs/1604.03410

## Measuring performance

### `@time` is your best friend.

In [None]:
# Time the creation of a random 250x250 matrix plus the eigen-factorization of
# its symmetric part (A+A')/2. Both variables are declared `local` to the block.
@time begin
    local A = rand(250,250)
    local F = eigfact((A+A')/2)
end

In [None]:
"""
    @timeOften ex n

Evaluate the expression `ex` `n` times in a loop and report the elapsed time and
allocations of the whole loop with a single `@time`.

Both `ex` and `n` are evaluated in the caller's scope, so the caller's local variables
can be used, e.g. `@timeOften A+1 100000` inside a block that declares `local A`.
"""
macro timeOften(ex, n)
    # `esc` is essential here: without it, macro hygiene resolves every free variable
    # of `ex` as a global of the module that defines this macro rather than as the
    # caller's variable. The loop counter `i` stays hygienic and cannot clash with `ex`.
    quote
        @time for i = 1:$(esc(n))
            $(esc(ex))
        end
    end
end

In [None]:
# Time 100000 evaluations of rand() inside one timed loop.
@timeOften rand() 100000

### More advanced: Profiling

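Julia features a __statistical profiler__: it samples the call stack at regular intervals instead of tracing every function call, so very short-lived calls may not show up in the results.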

In [None]:
"""
    myEig(n::Int)

Draw a random `n`-by-`n` matrix, symmetrize it as `(M + M') / 2`, and return the
result of `eigfact` applied to that symmetric matrix.
"""
function myEig(n::Int)
    M = rand(n, n)
    S = (M + M') / 2
    return eigfact(S)
end

In [None]:
# Call once on a tiny input first — presumably a warm-up so that compiling
# myEig is not part of the profile collected afterwards.
myEig(1);

In [None]:
# Discard any profiling samples collected so far.
Profile.clear()

In [None]:
# Collect profiling samples while running myEig on a 2500x2500 matrix.
@profile myEig(2500);

In [None]:
# Print the collected samples as a flat listing rather than as a call tree.
Profile.print(format=:flat)

In [None]:
using ProfileView

## 1. Type stability

In [None]:
"""
    unstable(x::Int)

Return `x` converted to `Float64` when `x` is even and to `Int64` when `x` is odd.

The return type thus depends on the *value* of `x`, not only on its type, which is
what makes this function type-unstable.
"""
function unstable(x::Int)
    return iseven(x) ? Float64(x) : Int64(x)
end

In [None]:
# The comprehension collects both Float64 and Int64 results; inspect the
# resulting array type.
typeof([ unstable(i) for i in 1:127 ])

In [None]:
# Time A+1 (repeated 100000 times) on a 127-element array whose elements come
# from the type-unstable function.
begin 
    local A = [ unstable(i) for i in 1:127 ]
    @timeOften A+1 100000
end

In [None]:
# Same measurement on a 127-element array in which every element is a Float64.
begin 
    local A = [ Float64(i) for i in 1:127 ]
    @timeOften A+1 100000
end

<hr />

In [None]:
"""
    sum_unstable()

Add up `i/2` for `i = 1, ..., 100` and return the total.

The accumulator starts as the integer `0` but receives `Float64` values in the loop,
so its type changes from `Int` to `Float64` while the function runs.
"""
function sum_unstable()
    acc = 0
    for k = 1:100
        acc = acc + k / 2
    end
    return acc
end
"""
    sum_stable()

Add up `i/2` for `i = 1, ..., 100` and return the total.

The accumulator starts as the floating-point `0.0`, so it is a `Float64` throughout.
"""
function sum_stable()
    acc = 0.0
    for k = 1:100
        acc = acc + k / 2
    end
    return acc
end

In [None]:
# Compare both variants; each call is repeated 100000 times inside one timed loop.
@timeOften sum_unstable() 100000;
@timeOften sum_stable() 100000;

In [None]:
# Show the types inferred for the variables and return value of sum_unstable.
@code_warntype sum_unstable();

In [None]:
# Show the LLVM code generated for sum_unstable.
@code_llvm sum_unstable()

## 2. Be careful with global variables.

In [None]:
# Alright: a global declared `const`.
const global C = 100000
# Potentially VERY bad: a global that is not `const` and can therefore be
# rebound later, possibly to a value of a different type.
global D = 100000

"""
    f(x)

Starting from `s = 0`, apply `s = s + (s + x + i + C)` for `i = 1, ..., 10^6` and
return `s`. `C` is read from the `const` global of that name.

Note that `s` roughly doubles in every iteration, so for integer `x` the `Int`
arithmetic wraps around after a few dozen iterations.
"""
function f(x)
    acc = 0
    for k = 1:10^6
        acc = acc + (acc + x + k + C)
    end
    return acc
end
"""
    g(x)

Starting from `s = 0`, apply `s = s + (s + x + i + D)` for `i = 1, ..., 10^6` and
return `s`. `D` is read from the non-constant global of that name in every iteration.

Note that `s` roughly doubles in every iteration, so for integer `x` and `D` the
`Int` arithmetic wraps around after a few dozen iterations.
"""
function g(x)
    acc = 0
    for k = 1:10^6
        acc = acc + (acc + x + k + D)
    end
    return acc
end

In [None]:
# Time the version that reads the const global C.
@time f(10);

In [None]:
# Time the version that reads the non-constant global D.
@time g(10);

In [None]:
"""
    g(x, y::Int)

Starting from `s = 0`, apply `s = s + (s + x + i + y)` for `i = 1, ..., 10^6` and
return `s`. The offset is passed in as the argument `y` instead of being read from a
global variable.

Note that `s` roughly doubles in every iteration, so for integer `x` the `Int`
arithmetic wraps around after a few dozen iterations.
"""
function g(x,y::Int)
    acc = 0
    for k = 1:10^6
        acc = acc + (acc + x + k + y)
    end
    return acc
end

In [None]:
# Pass the global D as an argument: inside g(x, y::Int) it is the typed parameter y.
@time g(10,D);

__Q: Why is a `const` so much more efficient?__  
A: The compiler cannot infer the type of a non-constant global, because it may be rebound to a value of any type at any time. A constant's type is fixed at declaration, so its value can be inlined.

In [None]:
# LLVM code for the version that uses the const global C.
@code_llvm f(10)

In [None]:
# LLVM code for the version that uses the non-constant global D.
@code_llvm g(10)

In [None]:
# LLVM code for the version that receives the value as an argument.
@code_llvm g(10,D)

<span style="font-size:14pt">$\Longrightarrow$ Adopt a __functional__ programming style!</span>  



## 3. Respect the order.
### Unlike C, Julia stores arrays in _column-major_ order!

In [None]:
"""
    add_matrices_one(A::Matrix, B::Matrix)

Return the element-wise sum of `A` and `B` as a newly allocated matrix.

The outer loop runs over the rows and the inner loop over the columns, i.e. the
*second* index varies fastest.

The element type of the result is `promote_type(eltype(A), eltype(B))`, so matrices
with different element types (e.g. `Int` and `Float64`) can be added. The loop bounds
are taken from `A` alone; the sizes of `A` and `B` are not compared beforehand.
"""
function add_matrices_one(A::Matrix, B::Matrix)
    # Allocating with eltype(A) alone would fail as soon as A[i,j] + B[i,j] is not
    # representable in A's element type.
    C = similar(A, promote_type(eltype(A), eltype(B)))
    for i = 1:size(A,1)
        for j = 1:size(A,2)
            C[i,j] = A[i,j] + B[i,j]
        end
    end
    return C
end
"""
    add_matrices_two(A::Matrix, B::Matrix)

Return the element-wise sum of `A` and `B` as a newly allocated matrix.

The outer loop runs over the columns and the inner loop over the rows, i.e. the
*first* index varies fastest.

The element type of the result is `promote_type(eltype(A), eltype(B))`, so matrices
with different element types (e.g. `Int` and `Float64`) can be added. The loop bounds
are taken from `A` alone; the sizes of `A` and `B` are not compared beforehand, so a
smaller `B` leads to an out-of-bounds access on `B`.
"""
function add_matrices_two(A::Matrix, B::Matrix)
    # Allocating with eltype(A) alone would fail as soon as A[j,i] + B[j,i] is not
    # representable in A's element type.
    C = similar(A, promote_type(eltype(A), eltype(B)))
    for i = 1:size(A,2)
        for j = 1:size(A,1)
            C[j,i] = A[j,i] + B[j,i]
        end
    end
    return C
end

In [None]:
# Two random 1000x1000 matrices used as input for the timings.
A = rand(1000,1000);
B = rand(1000,1000);

In [None]:
# Same computation, different loop order: rows outermost vs. columns outermost.
@time add_matrices_one(A,B);
@time add_matrices_two(A,B);

__Note:__ Don't bother implementing linear-algebra operations yourself; use the built-in ones (here simply `A+B`).

In [None]:
# For comparison: the built-in matrix addition.
@time A+B;

## 4. Disable safety nets where appropriate.

By default Julia performs __bounds checks__ on every array access, so an out-of-bounds index raises an exception instead of crashing the kernel.

In [None]:
# B (3x3) is smaller than A (4x4) while the loops run over the size of A, so
# indexing B goes out of bounds.
add_matrices_two(rand(4,4),rand(3,3))

However, this can cost performance and is unnecessary if we can guarantee in-bounds access $\rightarrow$ __@inbounds__

In [None]:
"""
    add_matrices_three(A::Matrix, B::Matrix)

Return the element-wise sum of `A` and `B` as a newly allocated matrix, or `nothing`
if the two matrices differ in size.

The loops run column by column (first index fastest) with bounds checking disabled
via `@inbounds`. The element type of the result is
`promote_type(eltype(A), eltype(B))`, so matrices with different element types
(e.g. `Int` and `Float64`) can be added.
"""
function add_matrices_three(A::Matrix, B::Matrix)
    # Check the sizes before allocating anything. This check is also what makes
    # @inbounds safe below: every index valid for A is valid for B and C as well.
    if size(A) != size(B)
        return nothing
    end
    C = similar(A, promote_type(eltype(A), eltype(B)))
    @inbounds for i = 1:size(A,2)
        for j = 1:size(A,1)
            C[j,i] = A[j,i] + B[j,i]
        end
    end
    return C
end

In [None]:
# Time the column-wise version with bounds checks disabled.
@time add_matrices_three(A,B);

## 5. Pre-allocate output

_[Verbatim from http://docs.julialang.org/en/release-0.4/manual/performance-tips/#pre-allocating-outputs]_

In [None]:
"""
    xinc(x)

Return a newly allocated three-element array containing `x`, `x+1` and `x+2`.
"""
xinc(x) = [x, x + 1, x + 2]

"""
    loopinc()

Call `xinc(i)` for `i = 1, ..., 10^7` and return the sum of the second element of
each returned array.
"""
function loopinc()
    total = 0
    for k = 1:10^7
        total += xinc(k)[2]
    end
    return total
end

Every call to `xinc` allocates __a new array__.   
$\rightarrow$ It is better to allocate the array once beforehand and __update it in place__.

In [None]:
"""
    xinc!(ret::AbstractVector{T}, x::T)

Overwrite the first three entries of `ret` with `x`, `x+1` and `x+2`.
Nothing is allocated; the function returns `nothing`.
"""
function xinc!{T}(ret::AbstractVector{T}, x::T)
    ret[1] = x
    ret[2] = x + 1
    ret[3] = x + 2
    return nothing
end

"""
    loopinc_prealloc()

Allocate one three-element `Int` buffer, refill it with `xinc!(buffer, i)` for
`i = 1, ..., 10^7`, and return the sum of the buffer's second element over all
iterations.
"""
function loopinc_prealloc()
    buffer = Array(Int, 3)
    total = 0
    for k = 1:10^7
        xinc!(buffer, k)
        total += buffer[2]
    end
    return total
end

In [None]:
# Compare allocating a new array on every call with reusing one buffer.
@time loopinc();
@time loopinc_prealloc();