# Benchmarking GT4Py

Compare vanilla NumPy, numexpr and GT4Py-powered implementations of some basic
mathematical operations.

## Data initialization

In [None]:
# Install numexpr if needed
!pip install numexpr

In [1]:
from copy import deepcopy
from gt4py import gtscript
import numpy as np
import numexpr as ne
from tasmania.python.utils.gtscript_utils import (
    set_annotations,
    stencil_add_defs,
    stencil_iadd_defs,
    stencil_sub_defs,
    stencil_isub_defs,
    stencil_mul_defs,
    stencil_imul_defs,
    stencil_scale_defs,
    stencil_iscale_defs,
    stencil_copy_defs,
    stencil_copychange_defs,
    stencil_addsub_defs,
    stencil_iaddsub_defs,
    stencil_fma_defs,
    stencil_sts_rk2_0_defs,
    stencil_sts_rk3ws_0_defs
)
from tasmania.python.utils.storage_utils import zeros

# customizable settings
backend = "gtx86"  # GT4Py backend handed to both `zeros` and `gtscript.stencil`
dtype = np.float64
storage_shape = (321, 321, 120)
default_origin = (3, 3, 0)


def _random_storage():
    """Return a storage allocated with `zeros` and filled with `np.random.rand` samples."""
    storage = zeros(storage_shape, backend, dtype, default_origin)
    storage[...] = np.random.rand(*storage_shape)
    return storage


# three randomly filled operands; `a_dc` keeps a copy of `a` so that it can be
# restored before the in-place benchmarks
a = _random_storage()
a_dc = deepcopy(a)
b = _random_storage()
c = _random_storage()

# output storage, allocated with `zeros` and not filled here
d = zeros(storage_shape, backend, dtype, default_origin)

# random scalar coefficient
f = np.random.rand(1).item()

  class DataArray(xr.DataArray):


## add

In [2]:
# numpy: out-of-place sum of a and b, copied into the existing storage c
%timeit c[...] = a[...] + b[...]

31.7 ms ± 818 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [3]:
# numexpr with assignment: evaluate "a + b", then copy the result into the
# existing storage c
%timeit c[...] = ne.evaluate("a + b")

41.1 ms ± 845 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [4]:
# numexpr: evaluate "a + b" and only bind the result to a name, without
# copying it into the existing storage
%timeit c = ne.evaluate("a + b")

27.4 ms ± 431 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
# gt4py
set_annotations(stencil_add_defs, dtype)
stencil_add = gtscript.stencil(
    backend=backend, definition=stencil_add_defs, rebuild=False
)
%timeit stencil_add(in_a=a, in_b=b, out_c=c, origin=(0, 0, 0), domain=storage_shape)

22 ms ± 730 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [6]:
# gt4py: make origin coincide with default_origin
domain = tuple(s - 2*d for s, d in zip(storage_shape, default_origin))
%timeit stencil_add(in_a=a, in_b=b, out_c=c, origin=default_origin, domain=domain)

21 ms ± 148 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## iadd

In [7]:
# numpy: in-place accumulation of b into a; a is not reset here, so its
# values keep changing across the timing loops
%timeit a[...] += b[...]

15.4 ms ± 297 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [8]:
# gt4py
a[...] = a_dc[...]
set_annotations(stencil_iadd_defs, dtype)
stencil_iadd = gtscript.stencil(
    backend=backend, definition=stencil_iadd_defs, rebuild=False
)
%timeit stencil_iadd(inout_a=a, in_b=b, origin=(0, 0, 0), domain=storage_shape)

15 ms ± 247 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [9]:
# gt4py: make origin coincide with default_origin
a[...] = a_dc[...]
domain = tuple(s - 2*d for s, d in zip(storage_shape, default_origin))
%timeit stencil_iadd(inout_a=a, in_b=b, origin=default_origin, domain=domain)

14.6 ms ± 357 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## sub

In [10]:
# numpy: out-of-place difference of a and b, copied into the existing storage c
%timeit c[...] = a[...] - b[...]

31.9 ms ± 490 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
# numexpr with assignment: evaluate "a - b", then copy the result into the
# existing storage c
%timeit c[...] = ne.evaluate("a - b")

37.1 ms ± 711 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [12]:
# numexpr: evaluate "a - b" and only bind the result to a name, without
# copying it into the existing storage
%timeit c = ne.evaluate("a - b")

27 ms ± 1.21 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [13]:
# gt4py
set_annotations(stencil_sub_defs, dtype)
stencil_sub = gtscript.stencil(
    backend=backend, definition=stencil_sub_defs, rebuild=False
)
%timeit stencil_sub(in_a=a, in_b=b, out_c=c, origin=(0, 0, 0), domain=storage_shape)

22.4 ms ± 880 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [14]:
# gt4py: make origin coincide with default_origin
domain = tuple(s - 2*d for s, d in zip(storage_shape, default_origin))
%timeit stencil_sub(in_a=a, in_b=b, out_c=c, origin=default_origin, domain=domain)

21.8 ms ± 975 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## isub

In [15]:
# numpy: in-place subtraction of b from a; a is not reset here, so its
# values keep changing across the timing loops
%timeit a[...] -= b[...]

15.2 ms ± 135 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [16]:
# gt4py
a[...] = a_dc[...]
set_annotations(stencil_isub_defs, dtype)
stencil_isub = gtscript.stencil(
    backend=backend, definition=stencil_isub_defs, rebuild=False
)
%timeit stencil_isub(inout_a=a, in_b=b, origin=(0, 0, 0), domain=storage_shape)

16.6 ms ± 2.21 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [17]:
# gt4py: make origin coincide with default_origin
a[...] = a_dc[...]
domain = tuple(s - 2*d for s, d in zip(storage_shape, default_origin))
%timeit stencil_isub(inout_a=a, in_b=b, origin=default_origin, domain=domain)

14.9 ms ± 389 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## mul

In [18]:
# numpy: out-of-place element-wise product of a and b, copied into the
# existing storage c
%timeit c[...] = a[...] * b[...]

38 ms ± 9.09 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [19]:
# numexpr with assignment: evaluate "a * b", then copy the result into the
# existing storage c
%timeit c[...] = ne.evaluate("a * b")

42.8 ms ± 6.03 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [20]:
# numexpr: evaluate "a * b" and only bind the result to a name, without
# copying it into the existing storage
%timeit c = ne.evaluate("a * b")

27.1 ms ± 96.6 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [21]:
# gt4py
set_annotations(stencil_mul_defs, dtype)
stencil_mul = gtscript.stencil(
    backend=backend, definition=stencil_mul_defs, rebuild=False
)
%timeit stencil_mul(in_a=a, in_b=b, out_c=c, origin=(0, 0, 0), domain=storage_shape)

23.4 ms ± 1.31 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [22]:
# gt4py: make origin coincide with default_origin
domain = tuple(s - 2*d for s, d in zip(storage_shape, default_origin))
%timeit stencil_mul(in_a=a, in_b=b, out_c=c, origin=default_origin, domain=domain)

32.9 ms ± 5.27 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## imul

In [23]:
# numpy: in-place element-wise product; a is not reset here, so its values
# keep changing across the timing loops
%timeit a[...] *= b[...]

16.5 ms ± 376 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [24]:
# gt4py
a[...] = a_dc[...]
set_annotations(stencil_imul_defs, dtype)
stencil_imul = gtscript.stencil(
    backend=backend, definition=stencil_imul_defs, rebuild=False
)
%timeit stencil_imul(inout_a=a, in_b=b, origin=(0, 0, 0), domain=storage_shape)

16.6 ms ± 787 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [25]:
# gt4py: make origin coincide with default_origin
a[...] = a_dc[...]
domain = tuple(s - 2*d for s, d in zip(storage_shape, default_origin))
%timeit stencil_imul(inout_a=a, in_b=b, origin=default_origin, domain=domain)

16.2 ms ± 2 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


## copy

In [26]:
# numpy: element-wise copy of b into the existing storage a
%timeit a[...] = b[...]

10.7 ms ± 541 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [27]:
# gt4py
set_annotations(stencil_copy_defs, dtype)
stencil_copy = gtscript.stencil(
    backend=backend, definition=stencil_copy_defs, rebuild=False
)
%timeit stencil_copy(src=b, dst=a, origin=(0, 0, 0), domain=storage_shape)

15.1 ms ± 227 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## copychange

In [28]:
# numpy: negate b out of place and copy the result into the existing storage a
%timeit a[...] = - b[...]

25.7 ms ± 913 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [29]:
# numexpr with assignment: evaluate "-b", then copy the result into an
# existing storage
# NOTE(review): the numpy and gt4py variants of this benchmark write into a,
# whereas the destination here is c - confirm this is intended
%timeit c[...] = ne.evaluate("-b")

35.3 ms ± 837 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [30]:
# numexpr: evaluate "-b" and only bind the result to a name, without copying
# it into the existing storage
%timeit c = ne.evaluate("-b")

30.8 ms ± 3.87 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [31]:
# gt4py
set_annotations(stencil_copychange_defs, dtype)
stencil_copychange = gtscript.stencil(
    backend=backend, definition=stencil_copychange_defs, rebuild=False
)
%timeit stencil_copychange(src=b, dst=a, origin=(0, 0, 0), domain=storage_shape)

19.6 ms ± 4.24 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## addsub

In [32]:
# numpy: out-of-place a + b - c, copied into the existing storage d
%timeit d[...] = a[...] + b[...] - c[...]

61.9 ms ± 11.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [33]:
# numexpr with assignment: evaluate "a + b - c", then copy the result into
# the existing storage d
%timeit d[...] = ne.evaluate("a + b - c")

51.8 ms ± 438 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [34]:
# numexpr: evaluate "a + b - c" and only bind the result to a name, without
# copying it into the existing storage
%timeit d = ne.evaluate("a + b - c")

38.8 ms ± 416 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [35]:
# gt4py
set_annotations(stencil_addsub_defs, dtype)
stencil_addsub = gtscript.stencil(
    backend=backend, definition=stencil_addsub_defs, rebuild=False
)
%timeit stencil_addsub(in_a=a, in_b=b, in_c=c, out_d=d, origin=(0, 0, 0), domain=storage_shape)

29.9 ms ± 1.14 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [36]:
# gt4py: make origin coincide with default_origin
domain = tuple(s - 2*d for s, d in zip(storage_shape, default_origin))
%timeit stencil_addsub(in_a=a, in_b=b, in_c=c, out_d=d, origin=default_origin, domain=domain)

32.6 ms ± 7.24 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## iaddsub

In [37]:
# numpy: in-place accumulation of (b - c) into a; a is not reset here, so its
# values keep changing across the timing loops
%timeit a[...] += b[...] - c[...]

38.3 ms ± 5.39 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [38]:
# gt4py
a[...] = a_dc[...]
stencil_iaddsub = gtscript.stencil(
    backend=backend, definition=stencil_iaddsub_defs, rebuild=False
)
%timeit stencil_iaddsub(inout_a=a, in_b=b, in_c=c, origin=(0, 0, 0), domain=storage_shape)

20.5 ms ± 63.6 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [39]:
# gt4py: make origin coincide with default_origin
a[...] = a_dc[...]
domain = tuple(s - 2*d for s, d in zip(storage_shape, default_origin))
%timeit stencil_iaddsub(inout_a=a, in_b=b, in_c=c, origin=default_origin, domain=domain)

20 ms ± 546 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## fma

In [40]:
# numpy: out-of-place a + f * b with the scalar f, copied into the existing
# storage c
%timeit c[...] = a[...] + f * b[...]

49.2 ms ± 3.04 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [41]:
# numexpr with assignment: evaluate "a + f * b", then copy the result into
# the existing storage c
%timeit c[...] = ne.evaluate("a + f * b")

37.8 ms ± 479 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [42]:
# numexpr: evaluate "a + f * b" and only bind the result to a name, without
# copying it into the existing storage
%timeit c = ne.evaluate("a + f * b")

26.5 ms ± 307 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [43]:
# gt4py
set_annotations(stencil_fma_defs, dtype)
stencil_fma = gtscript.stencil(
    backend=backend, definition=stencil_fma_defs, rebuild=False
)
%timeit stencil_fma(in_a=a, in_b=b, out_c=c, f=f, origin=(0, 0, 0), domain=storage_shape)

21.7 ms ± 67 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [44]:
# gt4py: make origin coincide with default_origin
domain = tuple(s - 2*d for s, d in zip(storage_shape, default_origin))
%timeit stencil_fma(in_a=a, in_b=b, out_c=c, f=f, origin=default_origin, domain=domain)

21.3 ms ± 616 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## sts_rk2_0

In [None]:
# numpy: out-of-place 0.5 * (a + b + f * c), copied into the existing storage d
%timeit d[...] = 0.5 * (a[...] + b[...] + f * c[...])

In [6]:
# numexpr with assignment
%timeit d[...] = ne.evaluate("a + b + f * c")

54.6 ms ± 8.98 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [7]:
# numexpr
%timeit d = ne.evaluate("a + b + f * c")

35.8 ms ± 784 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [8]:
# gt4py
set_annotations(stencil_sts_rk2_0_defs, dtype)
stencil_sts_rk2_0 = gtscript.stencil(
    backend=backend, definition=stencil_sts_rk2_0_defs, rebuild=False
)
%timeit stencil_sts_rk2_0(a, b, c, d, dt=f, origin=(0, 0, 0), domain=storage_shape)

25.5 ms ± 641 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [9]:
# gt4py: make origin coincide with default_origin
domain = tuple(s - 2*d for s, d in zip(storage_shape, default_origin))
%timeit stencil_sts_rk2_0(a, b, c, d, dt=f, origin=default_origin, domain=domain)

24.4 ms ± 968 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## sts_rk3ws_0

In [None]:
# numpy: out-of-place (2.0 * a + b + f * c) / 3.0, copied into the existing
# storage d
%timeit d[...] = (2.0 * a[...] + b[...] + f * c[...]) / 3.0

In [2]:
# numexpr with assignment: evaluate "(2.0 * a + b + f * c) / 3.0", then copy
# the result into the existing storage d
%timeit d[...] = ne.evaluate("(2.0 * a + b + f * c) / 3.0")

50 ms ± 1.21 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [3]:
# numexpr: evaluate "(2.0 * a + b + f * c) / 3.0" and only bind the result to
# a name, without copying it into the existing storage
%timeit d = ne.evaluate("(2.0 * a + b + f * c) / 3.0")

37.2 ms ± 1.08 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [4]:
# gt4py
set_annotations(stencil_sts_rk3ws_0_defs, dtype)
stencil_sts_rk3ws_0 = gtscript.stencil(
    backend=backend, definition=stencil_sts_rk3ws_0_defs, rebuild=False
)
%timeit stencil_sts_rk3ws_0(a, b, c, d, dt=f, origin=(0, 0, 0), domain=storage_shape)

25.4 ms ± 804 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
# gt4py: make origin coincide with default_origin
domain = tuple(s - 2*d for s, d in zip(storage_shape, default_origin))
%timeit stencil_sts_rk3ws_0(a, b, c, d, dt=f, origin=default_origin, domain=domain)

23.9 ms ± 60.5 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
