# Performance tips

In [1]:
using DataFrames
using BenchmarkTools
using CategoricalArrays
using PooledArrays

## Access by column number is faster than by name

In [2]:
# Build a 5-row, 1000-column data frame; `:auto` generates the column names
# (the second benchmark relies on the name `x500` existing).
x = DataFrame(rand(5, 1000), :auto)
# `$x` interpolates the global variable into the benchmarked expression
@btime $x[!, 500]; # access by column number
@btime $x.x500; # access by column name

  2.480 ns (0 allocations: 0 bytes)
  18.536 ns (1 allocation: 32 bytes)


## When working with a `DataFrame` use barrier functions or type annotations

In [3]:
using Random
function f_bad() # this function will be slow
    # Seed first so the generated data is reproducible between runs.
    Random.seed!(1)
    df = DataFrame(rand(1000000, 2), :auto)
    # The columns are pulled out of the data frame inside the same function
    # that loops over them; this is the pattern the benchmark demonstrates.
    a = df[!, 1]
    b = df[!, 2]
    total = 0.0
    for i in 1:nrow(df)
        total += a[i] * b[i]
    end
    return total
end

@btime f_bad();
# If you run `@code_warntype f_bad()` you will notice
# that Julia does not know the column types of a `DataFrame`.


  84.815 ms (5999017 allocations: 122.06 MiB)


In [4]:
# solution 1 is to use a barrier function (it should be possible to use it in almost any code)
"""
    f_inner(y, z)

Return the sum of the element-wise products `y[i] * z[i]`, accumulated
starting from `0.0`.

This is the barrier function of the example: it receives the already
extracted columns as arguments, so Julia compiles the loop for their
concrete types.

Throws a `DimensionMismatch` if `y` and `z` do not share the same indices.
"""
function f_inner(y, z)
    p = 0.0
    # eachindex(y, z) yields indices valid for both arrays and verifies that
    # they match, instead of assuming 1-based indexing and equal lengths.
    for i in eachindex(y, z)
        p += y[i] * z[i]
    end
    return p
end

function f_barrier() # extract the work to an inner function
    # Same seeded data as in the other variants of this example.
    Random.seed!(1)
    df = DataFrame(rand(1000000, 2), :auto)
    first_col, second_col = df[!, 1], df[!, 2]
    # The loop itself lives in f_inner, which is called with the columns.
    return f_inner(first_col, second_col)
end

using LinearAlgebra
function f_inbuilt() # or use inbuilt function if possible
    # Same seeded data as in the other variants of this example.
    Random.seed!(1)
    df = DataFrame(rand(1000000, 2), :auto)
    first_col = df[!, 1]
    second_col = df[!, 2]
    # Delegate the whole computation to LinearAlgebra.dot.
    return dot(first_col, second_col)
end

# benchmark the barrier-function and built-in variants for comparison with f_bad
@btime f_barrier();
@btime f_inbuilt();

  4.493 ms (37 allocations: 30.52 MiB)
  4.464 ms (37 allocations: 30.52 MiB)


In [5]:
# solution 2 is to provide the types of extracted columns
# it is simpler but there are cases in which you will not know these types
# This example assumes that you have DataFrames master from at least August 31, 2018
function f_typed()
    # Same seeded data as in the other variants of this example.
    Random.seed!(1)
    df = DataFrame(rand(1000000, 2), :auto)
    # Declare the extracted columns with their concrete type so the loop
    # below works on variables of a known type.
    y::Vector{Float64} = df[!, 1]
    z::Vector{Float64} = df[!, 2]
    acc = 0.0
    for i in 1:nrow(df)
        acc += y[i] * z[i]
    end
    return acc
end

# benchmark the type-annotated variant
@btime f_typed();

  5.087 ms (37 allocations: 30.52 MiB)


In general for tall and narrow tables it is often useful to use `Tables.rowtable`, `Tables.columntable` or `Tables.namedtupleiterator` for intermediate processing of data in a type-stable way.

## Consider using delayed `DataFrame` creation technique

Also notice the difference in performance between copying and non-copying data frame creation.

In [6]:
function f1()
    # we work with a DataFrame directly; copycols=false passes the vectors
    # to the constructor with copying disabled
    columns = [Vector{Float64}(undef, 10^4) for i in 1:100]
    df = DataFrame(columns, :auto, copycols=false)
    nrows = nrow(df)
    for c in 1:ncol(df)
        col = df[!, c]
        # fill the column taken from the data frame, one element at a time
        for r in 1:nrows
            col[r] = rand()
        end
    end
    return df
end

function f1a()
    # we work with a DataFrame directly; unlike f1, the constructor is
    # called without copycols=false
    columns = [Vector{Float64}(undef, 10^4) for i in 1:100]
    df = DataFrame(columns, :auto)
    nrows = nrow(df)
    for c in 1:ncol(df)
        col = df[!, c]
        # fill the column taken from the data frame, one element at a time
        for r in 1:nrows
            col[r] = rand()
        end
    end
    return df
end

function f2()
    # Collect the finished columns in a plain vector first.
    columns = Vector{Any}(undef, 100)
    for c in eachindex(columns)
        col = Vector{Float64}(undef, 10^4)
        for r in eachindex(col)
            col[r] = rand()
        end
        columns[c] = col
    end
    # we delay creation of DataFrame after we have our job done
    return DataFrame(columns, :auto, copycols=false)
end

function f2a()
    # Collect the finished columns in a plain vector first.
    columns = Vector{Any}(undef, 100)
    for c in eachindex(columns)
        col = Vector{Float64}(undef, 10^4)
        for r in eachindex(col)
            col[r] = rand()
        end
        columns[c] = col
    end
    # we delay creation of DataFrame after we have our job done;
    # unlike f2, the constructor is called without copycols=false
    return DataFrame(columns, :auto)
end

# f1/f1a fill the columns through the DataFrame, f2/f2a build the DataFrame last;
# f1 and f2 pass copycols=false, f1a and f2a do not
@btime f1();
@btime f1a();
@btime f2();
@btime f2a();

  23.844 ms (1949523 allocations: 37.40 MiB)
  26.127 ms (1949723 allocations: 45.03 MiB)
  1.688 ms (623 allocations: 7.66 MiB)
  2.451 ms (823 allocations: 15.30 MiB)


## You can add rows to a `DataFrame` in place and it is fast

In [7]:
x = DataFrame(rand(10^6, 5), :auto)
y = DataFrame(transpose(1.0:5.0), :auto) # a single-row data frame with five columns
z = [1.0:5.0;] # the same five values as a plain vector

@btime vcat($x, $y); # creates a new DataFrame - slow
# NOTE: append! mutates `x` and @btime evaluates the expression repeatedly,
# so `x` keeps growing while this benchmark runs (hence the reset below)
@btime append!($x, $y); # in place - fast

x = DataFrame(rand(10^6, 5), :auto) # reset to the same starting point
@btime push!($x, $z); # add a single row in place - fast

  4.411 ms (180 allocations: 38.15 MiB)
  782.750 ns (17 allocations: 672 bytes)
  416.700 ns (16 allocations: 256 bytes)


## Allowing `missing` as well as `categorical` slows down computations

In [8]:
using StatsBase

function test(data) # uses countmap function to test performance
    # Print the element type so the timings that follow can be attributed.
    println(eltype(data))
    # Draw 10^6 values from `data`, then build a categorical copy of them.
    raw = rand(data, 10^6)
    catvec = categorical(raw)
    println(" raw:")
    @btime countmap($raw)
    println(" categorical:")
    @btime countmap($catvec)
    return nothing
end

# run the comparison for integers and random strings,
# first as-is and then after passing the data through `allowmissing`
test(1:10)
test([randstring() for i in 1:10])
test(allowmissing(1:10))
test(allowmissing([randstring() for i in 1:10]))


LoadError: ArgumentError: Package StatsBase not found in current path:
- Run `import Pkg; Pkg.add("StatsBase")` to install the StatsBase package.


## When aggregating use column selector and prefer integer, categorical, or pooled array grouping variable

In [9]:
# 10^7 rows: a random Char key column `x` drawn from 'a':'d' and a column `y` filled with 1
df = DataFrame(x=rand('a':'d', 10^7), y=1);

In [10]:
# group by the Char column `x`
gdf = groupby(df, :x)

Unnamed: 0_level_0,x,y
Unnamed: 0_level_1,Char,Int64
1,a,1
2,a,1
3,a,1
4,a,1
5,a,1
6,a,1
7,a,1
8,a,1
9,a,1
10,a,1

Unnamed: 0_level_0,x,y
Unnamed: 0_level_1,Char,Int64
1,d,1
2,d,1
3,d,1
4,d,1
5,d,1
6,d,1
7,d,1
8,d,1
9,d,1
10,d,1


In [11]:
# the anonymous function reads the `y` column of the argument it is given
@btime combine(v -> sum(v.y), $gdf) # traditional syntax, slow

  42.850 ms (334 allocations: 19.10 MiB)


Unnamed: 0_level_0,x,x1
Unnamed: 0_level_1,Char,Int64
1,a,2500849
2,b,2499628
3,c,2500112
4,d,2499411


In [12]:
# `:y => sum` names the input column and the function; the result column is `y_sum`
@btime combine($gdf, :y=>sum) # use column selector

  8.719 ms (230 allocations: 13.31 KiB)


Unnamed: 0_level_0,x,y_sum
Unnamed: 0_level_1,Char,Int64
1,a,2500849
2,b,2499628
3,c,2500112
4,d,2499411


In [13]:
# replace column `x` in place with the result of `categorical` applied to it
transform!(df, :x => categorical => :x);

In [14]:
# regroup now that `x` has been converted with `categorical`
gdf = groupby(df, :x)

Unnamed: 0_level_0,x,y
Unnamed: 0_level_1,Cat…,Int64
1,a,1
2,a,1
3,a,1
4,a,1
5,a,1
6,a,1
7,a,1
8,a,1
9,a,1
10,a,1

Unnamed: 0_level_0,x,y
Unnamed: 0_level_1,Cat…,Int64
1,d,1
2,d,1
3,d,1
4,d,1
5,d,1
6,d,1
7,d,1
8,d,1
9,d,1
10,d,1


In [15]:
# same column-selector aggregation, grouping column converted with `categorical`
@btime combine($gdf, :y=>sum)

  8.908 ms (237 allocations: 13.89 KiB)


Unnamed: 0_level_0,x,y_sum
Unnamed: 0_level_1,Cat…,Int64
1,a,2500849
2,b,2499628
3,c,2500112
4,d,2499411


In [16]:
# replace column `x` in place with a `PooledArray{Char}` built from it
transform!(df, :x => PooledArray{Char} => :x)

Unnamed: 0_level_0,x,y
Unnamed: 0_level_1,Char,Int64
1,a,1
2,b,1
3,c,1
4,a,1
5,b,1
6,a,1
7,b,1
8,b,1
9,a,1
10,c,1


In [17]:
# regroup now that `x` has been converted to a `PooledArray{Char}`
gdf = groupby(df, :x)

Unnamed: 0_level_0,x,y
Unnamed: 0_level_1,Char,Int64
1,a,1
2,a,1
3,a,1
4,a,1
5,a,1
6,a,1
7,a,1
8,a,1
9,a,1
10,a,1

Unnamed: 0_level_0,x,y
Unnamed: 0_level_1,Char,Int64
1,d,1
2,d,1
3,d,1
4,d,1
5,d,1
6,d,1
7,d,1
8,d,1
9,d,1
10,d,1


In [18]:
# same column-selector aggregation, grouping column converted to a `PooledArray{Char}`
@btime combine($gdf, :y=>sum)

  9.534 ms (232 allocations: 13.38 KiB)


Unnamed: 0_level_0,x,y_sum
Unnamed: 0_level_1,Char,Int64
1,a,2500849
2,b,2499628
3,c,2500112
4,d,2499411


## Use views instead of materializing a new DataFrame

In [19]:
# a wide data frame: 100 rows and 1000 auto-named columns
x = DataFrame(rand(100, 1000), :auto)

Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,x7
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.524836,0.72675,0.474059,0.521572,0.460615,0.493288,0.62556
2,0.450111,0.438803,0.914643,0.152005,0.470075,0.477494,0.098748
3,0.399348,0.810325,0.994383,0.636791,0.663914,0.358692,0.557299
4,0.171113,0.995161,0.957723,0.0978758,0.0672851,0.829578,0.87674
5,0.237382,0.367287,0.690186,0.61463,0.69947,0.571084,0.981192
6,0.0738984,0.895709,0.0657372,0.066569,0.479868,0.339752,0.679379
7,0.0838025,0.161537,0.638759,0.839586,0.461109,0.533586,0.968726
8,0.323185,0.679265,0.406851,0.950191,0.102193,0.0965595,0.569159
9,0.939335,0.0349469,0.370833,0.234694,0.499019,0.178378,0.292509
10,0.90771,0.599251,0.921183,0.155048,0.825517,0.417809,0.720356


In [20]:
# range row selector over all columns: the recorded output reports allocations for this form
@btime $x[1:1, :]

  165.450 μs (2985 allocations: 159.44 KiB)


Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,x7,x8
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.524836,0.72675,0.474059,0.521572,0.460615,0.493288,0.62556,0.000997819


In [21]:
# single integer row index over all columns: reported as allocation-free in the recorded output
@btime $x[1, :]

  18.785 ns (0 allocations: 0 bytes)


Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,x7,x8
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.524836,0.72675,0.474059,0.521572,0.460615,0.493288,0.62556,0.000997819


In [22]:
# explicit view of the same one-row, all-columns selection
@btime view($x, 1:1, :)

  17.864 ns (0 allocations: 0 bytes)


Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,x7,x8
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.524836,0.72675,0.474059,0.521572,0.460615,0.493288,0.62556,0.000997819


In [23]:
# range row selector restricted to the first 20 columns
@btime $x[1:1, 1:20]

  3.643 μs (50 allocations: 4.22 KiB)


Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,x7,x8
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.524836,0.72675,0.474059,0.521572,0.460615,0.493288,0.62556,0.000997819


In [24]:
# single integer row index restricted to the first 20 columns
@btime $x[1, 1:20]

  19.097 ns (0 allocations: 0 bytes)


Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,x7,x8
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.524836,0.72675,0.474059,0.521572,0.460615,0.493288,0.62556,0.000997819


In [25]:
# explicit view of the one-row, first-20-columns selection
@btime view($x, 1:1, 1:20)

  18.184 ns (0 allocations: 0 bytes)


Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,x7,x8
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.524836,0.72675,0.474059,0.521572,0.460615,0.493288,0.62556,0.000997819
