In [17]:
#!/usr/bin/julia

"""
    scale_nothread(a, s)

Multiply every element of `a` by `s` in place with a plain serial loop; bounds
checks are left enabled. Returns `nothing`.

`@noinline` keeps the call from being inlined into the caller (presumably so
that each benchmark variant is timed as a real function call).
"""
@noinline function scale_nothread(a, s)
    # `eachindex` yields the array's own valid indices, so the loop is also
    # correct for arrays whose indices do not start at 1.
    for i in eachindex(a)
        a[i] *= s
    end
    return nothing
end

"""
    scale_nothread_ib(a, s)

Multiply every element of `a` by `s` in place with a serial loop whose bounds
checks are disabled via `@inbounds`. Returns `nothing`.
"""
@noinline function scale_nothread_ib(a, s)
    # Indices come from `eachindex(a)`, so every access is in bounds by
    # construction and eliding the checks is safe for any index scheme.
    @inbounds for i in eachindex(a)
        a[i] *= s
    end
    return nothing
end

"""
    scale_nothread_simd(a, s)

Multiply every element of `a` by `s` in place with a serial loop annotated
with both `@inbounds` and `@simd`. Returns `nothing`.
"""
@noinline function scale_nothread_simd(a, s)
    # Indices come from `eachindex(a)`, which keeps the `@inbounds` elision
    # valid regardless of how `a` is indexed; iterations are independent, as
    # `@simd` requires.
    @inbounds @simd for i in eachindex(a)
        a[i] *= s
    end
    return nothing
end

"""
    scale_thread(a, s)

Multiply every element of `a` by `s` in place, splitting the index range
across threads with `Threads.@threads`; bounds checks are left enabled.
Returns `nothing`.
"""
@noinline function scale_thread(a, s)
    # Each iteration touches only `a[i]`, so iterations are independent.
    # `eachindex` supplies the array's own valid indices.
    Threads.@threads for i in eachindex(a)
        a[i] *= s
    end
    return nothing
end

"""
    scale_thread_ib(a, s)

Multiply every element of `a` by `s` in place, splitting the index range
across threads with `Threads.@threads` and disabling bounds checks on the
element update. Returns `nothing`.
"""
@noinline function scale_thread_ib(a, s)
    # Indices come from `eachindex(a)`, so the unchecked access below is
    # always in bounds. `@inbounds` sits on the statement inside the loop
    # body, which is where the indexing actually happens.
    Threads.@threads for i in eachindex(a)
        @inbounds a[i] *= s
    end
    return nothing
end

"""
    @time_scale fname

Print `fname`, then time it on three vector sizes. For each stage a fresh
vector filled with `1e100` is allocated, the stage label is printed, `fname`
is called once outside the timed region, and a loop of repeated calls with
scale factor `0.999` is measured with `@time`.

| label | vector length | timed calls |
|:------|:--------------|:------------|
| 10^4  | 10^4          | 10^4        |
| 10^5  | 10^5          | 10^3        |
| 10^6  | 10^6          | 10^2        |
"""
macro time_scale(fname)
    kernel = esc(fname)
    # (label printed before the timing, vector length, number of timed calls)
    stages = (("10^4", 10^4, 10^4), ("10^5", 10^5, 10^3), ("10^6", 10^6, 10^2))
    body = Expr(:block, :(println($kernel)))
    for (label, len, reps) in stages
        push!(body.args, quote
            a = fill(1e100, $len)
            print($label)
            # One call outside the timed loop (presumably to keep
            # first-call compilation out of the measurement).
            $kernel(a, 0.999)
            @time for i in 1:$reps
                $kernel(a, 0.999)
            end
        end)
    end
    return body
end

# Run the `@time_scale` benchmark once for each scaling kernel, in a fixed
# order: the three serial variants first, then the two threaded ones.
# Each kernel is passed to the macro by name, so every timed call is a direct
# call to that specific function.
function time_all()
    @time_scale scale_nothread       # serial, bounds-checked
    @time_scale scale_nothread_ib    # serial, @inbounds
    @time_scale scale_nothread_simd  # serial, @inbounds + @simd
    @time_scale scale_thread         # threaded, bounds-checked
    @time_scale scale_thread_ib      # threaded, @inbounds
end

# Run the whole scaling benchmark; labels and timings are printed as it goes.
time_all()

scale_nothread
10^4  0.080103 seconds
10^5  0.085836 seconds
10^6  0.085435 seconds
scale_nothread_ib
10^4  0.034491 seconds
10^5  0.055978 seconds
10^6  0.055971 seconds
scale_nothread_simd
10^4  0.031586 seconds
10^5  0.045741 seconds
10^6  0.043211 seconds
scale_thread
10^4  0.057908 seconds (20.00 k allocations: 625.000 KB)
10^5  0.012299 seconds (2.00 k allocations: 62.500 KB)
10^6  0.007808 seconds (200 allocations: 6.250 KB)
scale_thread_ib
10^4  0.047813 seconds (20.00 k allocations: 625.000 KB)
10^5  0.006534 seconds (2.00 k allocations: 62.500 KB)
10^6  0.003588 seconds (200 allocations: 6.250 KB)


In [None]:
"""
    threadstest1(A, b, k)

Serial kernel. For every linear index `i` of `A`, add to `A[i]` the sum of a
`sin`, an `exp` and an `erf` of weighted combinations of `k[j][i]`, plus
further weighted linear terms, with weights `b[j]`. `A` is updated in place
and returned.

The loop runs under `@inbounds`, so nothing is checked: `b` and `k` must have
at least 35 entries and every `k[j]` that is read must have at least
`length(A)` elements.

NOTE(review): `erf` must be in scope at call time. It is not part of Base on
Julia ≥ 0.7 (it lives in SpecialFunctions.jl) — confirm how this file loads it.
"""
function threadstest1(
    A::Array{Float64}, b::Vector{Float64}, k::Vector{Array{Float64,N}}
) where {N}
    @inbounds for i in eachindex(A)
        A[i] += sin(b[1]*k[1][i] + b[2]*k[2][i] + b[3]*k[3][i] + b[5]*k[5][i]) +
            exp(b[7]*k[7][i] + b[8]*k[8][i] + b[10]*k[10][i] + b[11]*k[11][i]) +
            erf(b[12]*k[12][i] + b[14]*k[14][i] + b[15]*k[15][i] + b[16]*k[16][i]) +
            (b[18]*k[18][i] + b[19]*k[19][i] + b[20]*k[20][i] + b[21]*k[21][i]) +
            (b[22]*k[22][i] + b[23]*k[23][i] + b[24]*k[24][i] + b[25]*k[25][i]) +
            (b[26]*k[26][i] + b[27]*k[27][i] + b[28]*k[28][i] + b[29]*k[29][i]) +
            (b[30]*k[30][i] + b[31]*k[31][i] + b[32]*k[32][i] + b[33]*k[33][i]) +
            (b[34]*k[34][i] + b[35]*k[35][i])
    end
    return A
end
"""
    threadstest2(A, b, k)

Threaded counterpart of `threadstest1`: the same per-element update of `A`
(a `sin`, an `exp` and an `erf` of weighted combinations of `k[j][i]`, plus
weighted linear terms, with weights `b[j]`), with the index range split
across threads by `Threads.@threads`. `A` is updated in place and returned.

Element accesses are unchecked: `b` and `k` must have at least 35 entries and
every `k[j]` that is read must have at least `length(A)` elements.

NOTE(review): `erf` must be in scope at call time. It is not part of Base on
Julia ≥ 0.7 (it lives in SpecialFunctions.jl) — confirm how this file loads it.
"""
function threadstest2(
    A::Array{Float64}, b::Vector{Float64}, k::Vector{Array{Float64,N}}
) where {N}
    Threads.@threads for i in eachindex(A)
        # `@inbounds` has to sit inside the loop: `@threads` turns the body
        # into a closure, and an `@inbounds` wrapped around the whole
        # `@threads` loop does not reach into it.
        @inbounds begin
            A[i] += sin(b[1]*k[1][i] + b[2]*k[2][i] + b[3]*k[3][i] + b[5]*k[5][i]) +
                exp(b[7]*k[7][i] + b[8]*k[8][i] + b[10]*k[10][i] + b[11]*k[11][i]) +
                erf(b[12]*k[12][i] + b[14]*k[14][i] + b[15]*k[15][i] + b[16]*k[16][i]) +
                (b[18]*k[18][i] + b[19]*k[19][i] + b[20]*k[20][i] + b[21]*k[21][i]) +
                (b[22]*k[22][i] + b[23]*k[23][i] + b[24]*k[24][i] + b[25]*k[25][i]) +
                (b[26]*k[26][i] + b[27]*k[27][i] + b[28]*k[28][i] + b[29]*k[29][i]) +
                (b[30]*k[30][i] + b[31]*k[31][i] + b[32]*k[32][i] + b[33]*k[33][i]) +
                (b[34]*k[34][i] + b[35]*k[35][i])
        end
    end
    return A
end
using BenchmarkTools
"""
    benchmarkthreadtests()

Benchmark `threadstest1` (serial) against `threadstest2` (threaded) on random
`d × d` matrices for `d` in `10, 100, 1000, 2000, 5000`.

Returns a `length(ds) × 2` `Matrix{Any}` holding the `@benchmark` results:
column 1 is the serial run and column 2 the threaded run, one row per size.
Progress labels are printed as each size is processed.

Note that 36 `d × d` `Float64` matrices are allocated per size, which is
several gigabytes at `d = 5000`.
"""
function benchmarkthreadtests()
    ds = [10 100 1000 2000 5000]
    ts = Matrix{Any}(undef, length(ds), 2)
    for i in eachindex(ds)
        d = ds[i]
        A = rand(d, d)
        b = rand(35)
        # 35 coefficient matrices, one per weight in `b`.
        k = [rand(d, d) for _ in 1:35]
        println("d = $d")
        println("Serial")
        # `$` interpolation passes the values themselves into the benchmark.
        ts[i, 1] = @benchmark threadstest1($A, $b, $k)
        println("Threads")
        ts[i, 2] = @benchmark threadstest2($A, $b, $k)
    end
    return ts
end
# `mean` is provided by the Statistics standard library; it is not exported
# from Base on Julia ≥ 0.7, so bring it into scope explicitly.
using Statistics: mean

# Run the serial-vs-threaded benchmark for every problem size.
ts = benchmarkthreadtests()
# Mean of the recorded sample times of each benchmark result, in the same
# layout as `ts` (rows: sizes; columns: serial, threaded).
meantimes = [mean(x.times) for x in ts]


In [11]:
# Problem size: number of elements in the vectors that `driver` allocates.
const N = 10^6

using Base.Threads
"""
    driver()

Report the thread count, allocate an `N`-element random input vector and a
zeroed output vector, run `warmup` once, then run `test1` (serial) and
`test2` (threaded) on the same vectors and print both elapsed times.
"""
function driver()
    println("Number of threads = $(nthreads())")
    input = rand(N)
    output = zeros(N)
    println("Warmup!")
    warmup(input, output)
    # Both timings reuse the same input and output vectors.
    t1 = test1(input, output)
    t2 = test2(input, output)
    println("Serial time = $t1")
    println("Parallel time = $t2")
    return nothing
end
"""
    warmup(x, y)

Untimed pass of the benchmark kernel: store `sin(x[i])^2 + cos(x[i])^2` into
`y[i]` for every index shared by `x` and `y`. Returns `nothing`.

Throws a `DimensionMismatch` if `x` and `y` do not have the same indices.
"""
function warmup(x::Vector{Float64}, y::Vector{Float64})
    # Iterate over the arguments' own indices instead of a global size, so
    # the function works for any pair of equally sized vectors.
    for i in eachindex(x, y)
        y[i] = sin(x[i])^2 + cos(x[i])^2
    end
    return nothing
end
"""
    test1(x, y)

Serial benchmark: store `sin(x[i])^2 + cos(x[i])^2` into `y[i]` for every
index shared by `x` and `y`, and return the elapsed time of that loop in
seconds (as measured by `@elapsed`).

After the loop the result is sanity-checked: every element should be about 1,
so `sum(y)` must be approximately `length(y)`. An `AssertionError` is thrown
otherwise. Throws a `DimensionMismatch` if `x` and `y` differ in size.
"""
function test1(x::Vector{Float64}, y::Vector{Float64})
    t1 = @elapsed for i in eachindex(x, y)
        y[i] = sin(x[i])^2 + cos(x[i])^2
    end
    # Compare approximately: sin^2 + cos^2 is only 1 up to rounding. The
    # default tolerance is still far tighter than one missed element.
    @assert isapprox(sum(y), length(y))
    return t1
end
"""
    test2(x, y)

Threaded benchmark: the same computation as `test1`
(`y[i] = sin(x[i])^2 + cos(x[i])^2` for every index shared by `x` and `y`),
with the loop split across threads by `@threads`. Returns the elapsed time of
the loop in seconds (as measured by `@elapsed`).

After the loop the result is sanity-checked: `sum(y)` must be approximately
`length(y)`, otherwise an `AssertionError` is thrown. Throws a
`DimensionMismatch` if `x` and `y` differ in size.
"""
function test2(x::Vector{Float64}, y::Vector{Float64})
    # Each iteration writes only `y[i]`, so iterations are independent.
    t2 = @elapsed @threads for i in eachindex(x, y)
        y[i] = sin(x[i])^2 + cos(x[i])^2
    end
    # Compare approximately: sin^2 + cos^2 is only 1 up to rounding. The
    # default tolerance is still far tighter than one missed element.
    @assert isapprox(sum(y), length(y))
    return t2
end
# Run the serial-vs-threaded comparison and print the thread count and timings.
driver()

Number of threads = 32
Warmup!
Serial time = 0.027521303
Parallel time = 0.024238235


