## Speed test for loops and reshaped representations

We want to find out which array layout (shape) makes the calculations both convenient and fast.

In [1]:
using Base.Threads
using Distributions
using LaTeXStrings
using Printf
using LsqFit
using BenchmarkTools
using LinearAlgebra

In [2]:
nthreads()

2

Typical reshape for a J1×J2 grid carrying a 2-component vector at each grid point.

In [3]:
# Problem size: D vector components at each point of a J1 × J2 grid.
const D = 2
const J1 = 1_000
const J2 = 1_000

# Flat random storage, plus a D × J1 × J2 reshaped alias of the same data.
EE = rand(Float64, D * J1 * J2)
E = reshape(EE, (D, J1, J2))

"""
    prueba_vel_ji!(EE)

View the flat vector `EE` as a `D × J1 × J2` array and overwrite component 1
at every grid point with `5.0` times component 2.

The outer loop runs over the last grid index `j` and is distributed with
`@threads`; the inner loop runs over `i`. Returns `nothing`.
"""
function prueba_vel_ji!(EE)
    campo = reshape(EE, D, J1, J2)
    @threads for col in 1:J2
        for fila in 1:J1
            # Bounds checks are skipped: the loop ranges are the reshape dimensions.
            @inbounds campo[1, fila, col] = 5.0 * campo[2, fila, col]
        end
    end
    return nothing
end

"""
    prueba_vel_ij!(EE)

View the flat vector `EE` as a `D × J1 × J2` array and overwrite component 1
at every grid point with `5.0` times component 2.

Serial variant with `i` as the outer loop and `j` as the inner loop, kept in
this order on purpose for the timing comparison. Returns `nothing`.
"""
function prueba_vel_ij!(EE)
    campo = reshape(EE, D, J1, J2)
    # In a multi-range `for`, the last range (`col`) is the innermost loop.
    for fila in 1:J1, col in 1:J2
        @inbounds campo[1, fila, col] = 5.0 * campo[2, fila, col]
    end
    return nothing
end

"""
    prueba_vel_no_for!(EE)

Loop-free variant: view `EE` as a `D × J1 × J2` array and assign `5.0` times
the component-2 slice to the component-1 slice. Returns `0`.
"""
function prueba_vel_no_for!(EE)
    campo = reshape(EE, D, J1, J2)
    # The slice copy and the un-dotted product each materialise a temporary
    # array before the broadcast assignment writes into `campo`.
    escalado = 5.0 * campo[2, :, :]
    campo[1, :, :] .= escalado
    return 0
end

# Time the three component-first variants on the same buffer. `$EE` interpolates
# the global into the benchmark expression, and `@show` echoes each call
# expression together with its return value.
@show @btime prueba_vel_ij!($EE)
@show @btime prueba_vel_ji!($EE)
@show @btime prueba_vel_no_for!($EE)

#sum(reshape(EE,2,100,100)[1,:,:] - 5*reshape(EE,2,100,100)[2,:,:])


  2.356 ms (2 allocations: 96 bytes)
#= /Users/reula/Julia/PIC/PIC-1D/Tests/speed_test.ipynb:31 =# @btime(prueba_vel_ij!($(Expr(:$, :EE)))) = nothing


  533.699 μs (14 allocations: 1.09 KiB)
#= /Users/reula/Julia/PIC/PIC-1D/Tests/speed_test.ipynb:32 =# @btime(prueba_vel_ji!($(Expr(:$, :EE)))) = nothing
  

3.704 ms (6 allocations: 15.26 MiB)
#= /Users/reula/Julia/PIC/PIC-1D/Tests/speed_test.ipynb:33 =# @btime(prueba_vel_no_for!($(Expr(:$, :EE)))) = 0


0

### 100x100

12.786 μs (14 allocations: 1.09 KiB) @btime(prueba_vel_ij!($(Expr(:$, :EE))))  **con threads(2)**

3.916 μs (2 allocations: 96 bytes) @btime(prueba_vel_ji!($(Expr(:$, :EE)))) = nothing

23.232 μs (6 allocations: 156.44 KiB) @btime(prueba_vel_no_for!($(Expr(:$, :EE)))) = 0

### 1000x1000 

1.297 ms (14 allocations: 1.09 KiB) @btime(prueba_vel_ij!($(Expr(:$, :EE)))) **con threads(2)** 

2.363 ms (2 allocations: 96 bytes) @btime(prueba_vel_ij!($(Expr(:$, :EE)))) = nothing

534.565 μs (14 allocations: 1.09 KiB) @btime(prueba_vel_ji!($(Expr(:$, :EE)))) = nothing **con threads(2)**

819.136 μs (2 allocations: 96 bytes) @btime(prueba_vel_ji!($(Expr(:$, :EE)))) = nothing

3.745 ms (6 allocations: 15.26 MiB) @btime(prueba_vel_no_for!($(Expr(:$, :EE)))) = 0

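**Clearly the fastest loop order is j (outer), i (inner): the inner loop should run over the leftmost grid index, since Julia arrays are column-major.**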


Checking that the values are written.

In [4]:
let campo = reshape(EE, D, J1, J2)
    # Signed sum of the residual (component 1) - 5 * (component 2).
    sum(campo[1, :, :] - 5 * campo[2, :, :])
end

0.0

### With the vector index at the end

In [6]:
"""
    prueba_2_vel_ji!(EE)

View the flat vector `EE` as a `J1 × J2 × D` array (vector index last) and
overwrite component 1 at every grid point with `5.0` times component 2.

The outer loop runs over `j` and is distributed with `@threads`; the inner
loop runs over `i`. Returns `nothing`.
"""
function prueba_2_vel_ji!(EE)
    campo = reshape(EE, J1, J2, D)
    @threads for col in 1:J2
        for fila in 1:J1
            # Bounds checks are skipped: the loop ranges are the reshape dimensions.
            @inbounds campo[fila, col, 1] = 5.0 * campo[fila, col, 2]
        end
    end
    return nothing
end

"""
    prueba_2_vel_ij!(EE)

View the flat vector `EE` as a `J1 × J2 × D` array (vector index last) and
overwrite component 1 at every grid point with `5.0` times component 2.

Serial variant with `i` as the outer loop and `j` as the inner loop, kept in
this order on purpose for the timing comparison. Returns `nothing`.
"""
function prueba_2_vel_ij!(EE)
    E = reshape(EE, J1, J2, D)
    for i in 1:J1
        for j in 1:J2
            # Read component 2 (not component 1) so this variant computes the
            # same result as the other variants and repeated calls give the
            # same values instead of rescaling component 1 each time.
            @inbounds E[i, j, 1] = 5.0 * E[i, j, 2]
        end
    end
    return nothing
end

"""
    prueba_2_vel_no_for!(EE)

Loop-free variant for the vector-index-last layout: view `EE` as a
`J1 × J2 × D` array and assign `5.0` times the component-2 slice to the
component-1 slice with a fused broadcast. Returns `0`.
"""
function prueba_2_vel_no_for!(EE)
    # Use the same J1 × J2 × D layout as the other `prueba_2_*` functions, so
    # that all three variants of this section compute the same result.
    E = reshape(EE, J1, J2, D)
    # The right-hand slice is still copied before the broadcast, as in the
    # component-first version; only the layout differs.
    E[:, :, 1] .= 5.0 .* E[:, :, 2]
    return 0
end

#@show @btime prueba_2_vel_ij!($EE)
#@show @btime prueba_2_vel_ji!($EE)
#@show @btime prueba_2_vel_no_for!($EE)

  2.600 ms (4 allocations: 7.63 MiB)
#= /Users/reula/Julia/PIC/PIC-1D/Tests/speed_test.ipynb:27 =# @btime(prueba_2_vel_no_for!($(Expr(:$, :EE)))) = 0


0

## 100x100

10.943 μs (14 allocations: 1.09 KiB) **con threads ij**

1.831 μs (2 allocations: 96 bytes) ji

23.328 μs (6 allocations: 156.44 KiB) ::,::

## 1000x1000

869.742 μs (14 allocations: 1.09 KiB) @btime(prueba_2_vel_ij!($(Expr(:$, :EE)))) = nothing **con threads**

572.861 μs (2 allocations: 96 bytes) @btime(prueba_2_vel_ji!($(Expr(:$, :EE)))) = nothing

1.360 ms (2 allocations: 96 bytes) @btime(prueba_2_vel_ij!($(Expr(:$, :EE)))) = nothing

398.227 μs (14 allocations: 1.09 KiB) @btime(prueba_2_vel_ji!($(Expr(:$, :EE)))) = nothing **con threads**
  
3.686 ms (6 allocations: 15.26 MiB) @btime(prueba_2_vel_no_for!($(Expr(:$, :EE)))) = 0

2.600 ms (4 allocations: 7.63 MiB) @btime(prueba_2_vel_no_for!($(Expr(:$, :EE)))) **con `.` en `-` y `*`**


In [None]:
let campo = reshape(EE, J1, J2, D)
    # Signed sum of the residual (component 1) - 5 * (component 2), index-last layout.
    sum(campo[:, :, 1] - 5 * campo[:, :, 2])
end

For a scalar field

In [8]:
#D = 1
#J1 = 100
#J2 = 100

# Flat random storage for a scalar field: one value per point of the J1 × J2 grid.
BB = rand(Float64, J1 * J2)

"""
    prueba_scalar_ji!(BB)

View the flat vector `BB` as a `J1 × J2` array and multiply every entry by
`5.0` in place.

Serial variant with `j` as the outer loop and `i` as the inner loop.
Returns `nothing`.
"""
function prueba_scalar_ji!(BB)
    campo = reshape(BB, J1, J2)
    for col in 1:J2
        for fila in 1:J1
            @inbounds campo[fila, col] = 5.0 * campo[fila, col]
        end
    end
    return nothing
end

"""
    prueba_scalar_ij!(BB)

View the flat vector `BB` as a `J1 × J2` array and multiply every entry by
`5.0` in place.

The outer loop runs over `i` and is distributed with `@threads`; the inner
loop runs over `j`. Returns `nothing`.
"""
function prueba_scalar_ij!(BB)
    campo = reshape(BB, J1, J2)
    @threads for fila in 1:J1
        for col in 1:J2
            # Bounds checks are skipped: the loop ranges are the reshape dimensions.
            @inbounds campo[fila, col] = 5.0 * campo[fila, col]
        end
    end
    return nothing
end

"""
    prueba_scalar_no_for!(BB)

Loop-free variant: view `BB` as a `J1 × J2` array and multiply every entry by
`5.0` through a broadcast assignment. Returns `0`.
"""
function prueba_scalar_no_for!(BB)
    campo = reshape(BB, J1, J2)
    # The right-hand slice is copied first; the broadcast then writes the
    # scaled copy back into `campo`.
    copia = campo[:, :]
    campo[:, :] .= 5.0 .* copia
    return 0
end

# Time the three scalar-field variants on the same buffer. `$BB` interpolates
# the global into the benchmark expression, and `@show` echoes each call
# expression together with its return value.
@show @btime prueba_scalar_ij!($BB)
@show @btime prueba_scalar_ji!($BB)
@show @btime prueba_scalar_no_for!($BB)

  856.074 μs (14 allocations: 1.09 KiB)
#= /Users/reula/Julia/PIC/PIC-1D/Tests/speed_test.ipynb:31 =# @btime(prueba_scalar_ij!($(Expr(:$, :BB)))) = nothing
  

318.992 μs (2 allocations: 96 bytes)
#= /Users/reula/Julia/PIC/PIC-1D/Tests/speed_test.ipynb:32 =# @btime(prueba_scalar_ji!($(Expr(:$, :BB)))) = nothing


  1.934 ms (4 allocations: 7.63 MiB)
#= /Users/reula/Julia/PIC/PIC-1D/Tests/speed_test.ipynb:33 =# @btime(prueba_scalar_no_for!($(Expr(:$, :BB)))) = 0


0

## 100x100

**sin definir las constantes** 

578.613 μs (30117 allocations: 629.31 KiB) @btime(prueba_scalar_ij!($(Expr(:$, :BB)))) = nothing **threads**
  
1.036 ms (29903 allocations: 625.12 KiB) @btime(prueba_scalar_ji!($(Expr(:$, :BB)))) = nothing 

18.836 μs (14 allocations: 156.69 KiB) @btime(prueba_scalar_no_for!($(Expr(:$, :BB)))) = 0

**con las constantes**

11.930 μs (14 allocations: 1.09 KiB) @btime(prueba_scalar_ij!($(Expr(:$, :BB)))) = nothing **threads**

1.494 μs (2 allocations: 96 bytes) @btime(prueba_scalar_ji!($(Expr(:$, :BB)))) = nothing 

16.905 μs (6 allocations: 156.44 KiB) @btime(prueba_scalar_no_for!($(Expr(:$, :BB)))) = 0

3.792 μs (2 allocations: 96 bytes) @btime(prueba_scalar_ij!($(Expr(:$, :BB)))) = nothing

9.847 μs (14 allocations: 1.09 KiB) @btime(prueba_scalar_ji!($(Expr(:$, :BB)))) = nothing **threads**

**Nota: Las threads las ponemos siempre en el loop externo.**

## 1000x1000

1.875 ms (2 allocations: 96 bytes) @btime(prueba_scalar_ij!($(Expr(:$, :BB)))) = nothing

201.421 μs (14 allocations: 1.09 KiB) @btime(prueba_scalar_ji!($(Expr(:$, :BB)))) = nothing **threads**

3.088 ms (6 allocations: 15.26 MiB) @btime(prueba_scalar_no_for!($(Expr(:$, :BB))))

856.074 μs (14 allocations: 1.09 KiB) @btime(prueba_scalar_ij!($(Expr(:$, :BB)))) = nothing **threads**

318.992 μs (2 allocations: 96 bytes) @btime(prueba_scalar_ji!($(Expr(:$, :BB)))) = nothing

1.934 ms (4 allocations: 7.63 MiB) @btime(prueba_scalar_no_for!($(Expr(:$, :BB)))) **doble .**
