# Group Seminar on Julia Pt. II  
# General Performance Tips

In [6]:
# @timeOften ex
#
# Evaluate the expression `ex` 100000 times inside one `@time` measurement and
# return the value produced by the last evaluation. The timing report is printed
# by `@time`.
macro timeOften(ex::Expr)
    quote
        # `let` makes the accumulator a local, so the assignment inside the loop
        # updates it regardless of the top-level scoping rules in effect.
        let result = 0
            @time for i = 1:100000
                # `esc` makes `ex` evaluate in the caller's scope, so it can
                # refer to the caller's local variables and functions.
                result = $(esc(ex))
            end
            result
        end
    end
end

## 1. Type stability

In [65]:
# Deliberately type-unstable example: the return type depends on the *value* of
# `x` (Float64 for even input, Int8 for odd input), not only on its type.
function unstable(x::Integer)
    return iseven(x) ? Float64(x) : Int8(x)
end

unstable (generic function with 1 method)

In [125]:
# Sum i/2 for i = 1..100, starting from the *integer* 0.
# `i / 2` is a Float64, so `sum` changes type from Int to Float64 on the first
# iteration -- this is the type instability the example demonstrates.
function sum_unstable()
    sum = 0
    for i = 1:100
        sum = sum + i / 2
    end
    return sum
end
# Sum i/2 for i = 1..100, starting from the floating-point 0.0.
# The accumulator is a Float64 from the start and stays one for every iteration.
function sum_stable()
    sum = 0.0
    for i = 1:100
        sum = sum + i / 2
    end
    return sum
end

sum_stable (generic function with 1 method)

In [130]:
# Run each variant 100000 times under one @time measurement (see @timeOften).
@timeOften sum_unstable();
@timeOften sum_stable();

  0.188656 seconds (20.00 M allocations: 305.176 MB, 7.51% gc time)
  0.011303 seconds (100.00 k allocations: 1.526 MB)


In [129]:
# Print the inferred variable types and the lowered body of sum_stable.
@code_warntype sum_stable();

Variables:
  sum::Float64
  #s482::Int64
  i::Int64

Body:
  begin  # In[125], line 9:
      sum = 0.0 # In[125], line 10:
      GenSym(0) = $(Expr(:new, UnitRange{Int64}, 1, :(((top(getfield))(Base.Intrinsics,:select_value)::I)((Base.sle_int)(1,100)::Bool,100,(Base.box)(Int64,(Base.sub_int)(1,1)))::Int64)))
      #s482 = (top(getfield))(GenSym(0),:start)::Int64
      unless (Base.box)(Base.Bool,(Base.not_int)(#s482::Int64 === (Base.box)(Base.Int,(Base.add_int)((top(getfield))(GenSym(0),:stop)::Int64,1))::Bool)) goto 1
      2: 
      GenSym(2) = #s482::Int64
      GenSym(3) = (Base.box)(Base.Int,(Base.add_int)(#s482::Int64,1))
      i = GenSym(2)
      #s482 = GenSym(3) # In[125], line 11:
      sum = (Base.box)(Base.Float64,(Base.add_float)(sum::Float64,(Base.box)(Base.Float64,(Base.div_float)((Base.box)(Float64,(Base.sitofp)(Float64,i::Int64)),(Base.box)(Float64,(Base.sitofp)(Float64,2))))))
      3: 
      unless (Base.box)(Base.Bool,(Base.not_int)((Base.box)(Base.Bool,(Base.not_in

In [89]:
# Print the LLVM IR generated for sum_stable.
@code_llvm sum_stable()


define double @julia_sum_stable_2287() {
top:
  br label %L

L:                                                ; preds = %L, %top
  %lsr.iv = phi i64 [ 100, %top ], [ %lsr.iv.next, %L ]
  %"#s29.0" = phi i64 [ 1, %top ], [ %0, %L ]
  %sum.0 = phi double [ 0.000000e+00, %top ], [ %3, %L ]
  %0 = add i64 %"#s29.0", 1
  %1 = sitofp i64 %"#s29.0" to double
  %2 = fmul double %1, 5.000000e-01
  %3 = fadd double %sum.0, %2
  %lsr.iv.next = add i64 %lsr.iv, -1
  %4 = icmp eq i64 %lsr.iv.next, 0
  br i1 %4, label %L3, label %L

L3:                                               ; preds = %L
  ret double %3
}


## 2. Avoid non-constant global variables.

In [194]:
# OK: `C` is declared `const`.
const global C = 100000
# VERY bad: `D` is a plain (non-constant) global.
global D = 100000

# The same one-line function, once reading the constant and once the plain global.
f(x) = x + C
g(x) = x + D

g (generic function with 1 method)

In [213]:
# Sanity check: both functions return the same value for the same input.
@assert f(10)==g(10)

In [214]:
# Time 100000 calls of the const-based `f` and of the global-based `g`.
@timeOften f(10)
@timeOften g(10)

  0.000245 seconds
  0.001756 seconds (100.00 k allocations: 1.526 MB)


100010

__Q: Why is a `const` so much more efficient?__  
A: The compiler cannot infer the type of a non-constant global, because it could be reassigned to a value of any type at any time. A constant's type is fixed at declaration, so its value can be inlined.

In [16]:
# Print the LLVM IR generated for `f` (reads the constant `C`).
@code_llvm f(10)


define i64 @julia_f_21855(i64) {
top:
  %1 = add i64 %0, 100000
  ret i64 %1
}


In [11]:
# Print the LLVM IR generated for `g` (reads the non-constant global `D`).
@code_llvm g(10)


define %jl_value_t* @julia_g_21781(i64) {
top:
  %1 = alloca [4 x %jl_value_t*], align 8
  %.sub = getelementptr inbounds [4 x %jl_value_t*]* %1, i64 0, i64 0
  %2 = getelementptr [4 x %jl_value_t*]* %1, i64 0, i64 2
  store %jl_value_t* inttoptr (i64 4 to %jl_value_t*), %jl_value_t** %.sub, align 8
  %3 = load %jl_value_t*** @jl_pgcstack, align 8
  %4 = getelementptr [4 x %jl_value_t*]* %1, i64 0, i64 1
  %.c = bitcast %jl_value_t** %3 to %jl_value_t*
  store %jl_value_t* %.c, %jl_value_t** %4, align 8
  store %jl_value_t** %.sub, %jl_value_t*** @jl_pgcstack, align 8
  store %jl_value_t* null, %jl_value_t** %2, align 8
  %5 = getelementptr [4 x %jl_value_t*]* %1, i64 0, i64 3
  store %jl_value_t* null, %jl_value_t** %5, align 8
  %6 = call %jl_value_t* @jl_box_int64(i64 signext %0)
  store %jl_value_t* %6, %jl_value_t** %2, align 8
  %7 = load %jl_value_t** inttoptr (i64 4437588488 to %jl_value_t**), align 8
  store %jl_value_t* %7, %jl_value_t** %5, align 8
  %8 = call %jl_value_t* 

## 3. Respect the order.
### Unlike C, Julia has _column major_ order!

In [162]:
# Element-wise sum of `A` and `B`, returned as a new matrix with the element
# type and size of `A`.
#
# The loops run over rows in the outer loop and columns in the inner loop, i.e.
# the inner loop strides across columns. This is intentionally the slow
# traversal order, kept for comparison with `add_matrices_two`.
function add_matrices_one(A::Matrix, B::Matrix)
    # `similar(A)` allocates an uninitialised array with A's eltype and size;
    # every element is overwritten below.
    C = similar(A)
    for i = 1:size(A, 1)
        for j = 1:size(A, 2)
            C[i, j] = A[i, j] + B[i, j]
        end
    end
    return C
end
# Element-wise sum of `A` and `B`, returned as a new matrix with the element
# type and size of `A`.
#
# The outer loop runs over columns and the inner loop over rows, so consecutive
# inner iterations touch consecutive first indices. The loop bounds come from
# `A` only; indexing a smaller `B` raises a BoundsError.
function add_matrices_two(A::Matrix, B::Matrix)
    # `similar(A)` allocates an uninitialised array with A's eltype and size;
    # every element is overwritten below.
    C = similar(A)
    for i = 1:size(A, 2)
        for j = 1:size(A, 1)
            C[j, i] = A[j, i] + B[j, i]
        end
    end
    return C
end

add_matrices_two (generic function with 1 method)

In [165]:
# Two random 1000x1000 matrices used as inputs for the timings below.
A = rand(1000,1000);
B = rand(1000,1000);

In [167]:
# Compare the two loop orders on the same inputs.
@time add_matrices_one(A,B);
@time add_matrices_two(A,B);

  0.015669 seconds (6 allocations: 7.630 MB)
  0.003556 seconds (6 allocations: 7.630 MB)


Note: Don't bother re-implementing linear algebra operations by hand; the built-in ones (e.g. `A+B`) are already optimized.

In [231]:
# Built-in matrix addition on the same inputs, for comparison.
@time A+B;

  0.002296 seconds (9 allocations: 7.630 MB)


## 4. Disable safety nets where appropriate.

By default Julia performs __bounds checks__ on every array access. An out-of-bounds access raises an exception instead of crashing the kernel.

In [185]:
# Mismatched sizes: the loop bounds come from the 4x4 matrix, so indexing the
# 3x3 matrix goes out of bounds and raises an error.
add_matrices_two(rand(4,4),rand(3,3))

LoadError: LoadError: BoundsError: attempt to access 3x3 Array{Float64,2}:
 0.0407126  0.551313  0.155657
 0.152054   0.683091  0.911898
 0.765163   0.77917   0.248355
  at index [4,1]
while loading In[185], in expression starting on line 1

However, this can cost performance and is unnecessary if we guarantee in-bounds access $\rightarrow$ __@inbounds__

In [202]:
# Element-wise sum of `A` and `B` with bounds checks disabled.
#
# Returns a new matrix with the element type and size of `A`, or `nothing` when
# `A` and `B` differ in size.
function add_matrices_three(A::Matrix, B::Matrix)
    # Check sizes first: this is what makes `@inbounds` below safe, and it
    # avoids allocating the output when the inputs do not match.
    if size(A) != size(B)
        return nothing
    end
    # Uninitialised output with A's eltype and size; fully overwritten below.
    C = similar(A)
    # All indices are within 1:size(A, d), and B and C have the same size as A.
    @inbounds for i = 1:size(A, 2)
        for j = 1:size(A, 1)
            C[j, i] = A[j, i] + B[j, i]
        end
    end
    return C
end

add_matrices_three (generic function with 1 method)

In [227]:
# Time the @inbounds variant on the same 1000x1000 inputs.
@time add_matrices_three(A,B);

  0.002560 seconds (6 allocations: 7.630 MB, 103.71% gc time)


## 5. Pre-allocate output

_[Verbatim from http://docs.julialang.org/en/release-0.4/manual/performance-tips/#pre-allocating-outputs]_

In [207]:
# Return a freshly allocated 3-element array holding x, x+1 and x+2.
xinc(x) = [x, x + 1, x + 2]

# Call `xinc` for every i in 1:10^7 and accumulate the second element of each
# returned array. Each call to `xinc` builds a new array.
function loopinc()
    total = 0
    for i = 1:10^7
        total += xinc(i)[2]
    end
    return total
end

loopinc (generic function with 1 method)

Every call to `xinc` allocates __a new array__.   
$\rightarrow$ Better allocate the array beforehand and __update it__.

In [208]:
# In-place counterpart of `xinc`: write x, x+1 and x+2 into the first three
# slots of the caller-supplied vector `ret` instead of allocating a new array.
# `ret` must have at least three elements. Returns `nothing`.
#
# The signature uses no method type parameter, so the definition parses on both
# old and current Julia versions; values are converted to `ret`'s element type
# on assignment.
function xinc!(ret::AbstractVector, x)
    ret[1] = x
    ret[2] = x + 1
    ret[3] = x + 2
    return nothing
end

# Same computation as `loopinc`, but with a single 3-element buffer that is
# allocated once and refilled in place by `xinc!` on every iteration.
function loopinc_prealloc()
    # Allocate the buffer once; `zeros(Int, 3)` works on old and current Julia
    # versions alike.
    ret = zeros(Int, 3)
    y = 0
    for i = 1:10^7
        xinc!(ret, i)
        y += ret[2]
    end
    return y
end

loopinc_prealloc (generic function with 1 method)

In [212]:
# Compare the allocating version with the pre-allocated version.
@time loopinc();
@time loopinc_prealloc();

  0.593367 seconds (40.00 M allocations: 1.341 GB, 17.44% gc time)
  0.025406 seconds (6 allocations: 272 bytes)
