# Benchmarking GT4Py

Compare vanilla NumPy and GT4Py-powered implementations of some basic mathematical
operations.

## Data initialization

In [1]:
from copy import deepcopy
from gt4py import gtscript
import numpy as np
from tasmania.python.utils.gtscript_utils import (
    set_annotations,
    stencil_add_defs,
    stencil_iadd_defs,
    stencil_sub_defs,
    stencil_isub_defs,
    stencil_mul_defs,
    stencil_imul_defs,
    stencil_scale_defs,
    stencil_iscale_defs,
    stencil_copy_defs,
    stencil_copychange_defs,
    stencil_addsub_defs,
    stencil_iaddsub_defs,
    stencil_fma_defs,
    stencil_sts_rk2_0_defs,
    stencil_sts_rk3ws_0_defs
)
from tasmania.python.utils.storage_utils import zeros

# customizable settings
# backend is passed both to the storage allocator and to gtscript.stencil
backend = "gtx86"
dtype = np.float64
storage_shape = (321, 321, 120)
default_origin = (3, 3, 0)

# input operands, filled with uniformly distributed random values
a = zeros(storage_shape, backend, dtype, default_origin)
a[...] = np.random.rand(*storage_shape)
# pristine copy of a, used to restore it after the in-place benchmarks
a_dc = deepcopy(a)
b = zeros(storage_shape, backend, dtype, default_origin)
b[...] = np.random.rand(*storage_shape)
c = zeros(storage_shape, backend, dtype, default_origin)
c[...] = np.random.rand(*storage_shape)
# d is only allocated here; the benchmarks use it as an output buffer
d = zeros(storage_shape, backend, dtype, default_origin)

# random scalar coefficient, converted to a plain Python float
f = np.random.rand(1).item()


## add

In [2]:
# numpy
%timeit c[...] = a[...] + b[...]

40.4 ms ± 554 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [3]:
# gt4py
set_annotations(stencil_add_defs, dtype)
stencil_add = gtscript.stencil(
    backend=backend, definition=stencil_add_defs, rebuild=False
)
%timeit stencil_add(in_a=a, in_b=b, out_c=c, origin=(0, 0, 0), domain=storage_shape)

18.6 ms ± 73.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [4]:
# gt4py: make origin coincide with default_origin
domain = tuple(s - 2*d for s, d in zip(storage_shape, default_origin))
%timeit stencil_add(in_a=a, in_b=b, out_c=c, origin=default_origin, domain=domain)

18 ms ± 62.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## iadd

In [5]:
# numpy
%timeit a[...] += b[...]

13.2 ms ± 79.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [6]:
# gt4py
a[...] = a_dc[...]
set_annotations(stencil_iadd_defs, dtype)
stencil_iadd = gtscript.stencil(
    backend=backend, definition=stencil_iadd_defs, rebuild=False
)
%timeit stencil_iadd(inout_a=a, in_b=b, origin=(0, 0, 0), domain=storage_shape)

11 ms ± 95 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [7]:
# gt4py: make origin coincide with default_origin
a[...] = a_dc[...]
domain = tuple(s - 2*d for s, d in zip(storage_shape, default_origin))
%timeit stencil_iadd(inout_a=a, in_b=b, origin=default_origin, domain=domain)

10.3 ms ± 101 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## sub

In [8]:
# numpy
%timeit c[...] = a[...] - b[...]

40.9 ms ± 773 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [9]:
# gt4py
set_annotations(stencil_sub_defs, dtype)
stencil_sub = gtscript.stencil(
    backend=backend, definition=stencil_sub_defs, rebuild=False
)
%timeit stencil_sub(in_a=a, in_b=b, out_c=c, origin=(0, 0, 0), domain=storage_shape)

18.7 ms ± 64.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [10]:
# gt4py: make origin coincide with default_origin
domain = tuple(s - 2*d for s, d in zip(storage_shape, default_origin))
%timeit stencil_sub(in_a=a, in_b=b, out_c=c, origin=default_origin, domain=domain)

18 ms ± 61.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## isub

In [11]:
# numpy
%timeit a[...] -= b[...]

13.3 ms ± 120 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [12]:
# gt4py
a[...] = a_dc[...]
set_annotations(stencil_isub_defs, dtype)
stencil_isub = gtscript.stencil(
    backend=backend, definition=stencil_isub_defs, rebuild=False
)
%timeit stencil_isub(inout_a=a, in_b=b, origin=(0, 0, 0), domain=storage_shape)

11 ms ± 91 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [13]:
# gt4py: make origin coincide with default_origin
a[...] = a_dc[...]
domain = tuple(s - 2*d for s, d in zip(storage_shape, default_origin))
%timeit stencil_isub(inout_a=a, in_b=b, origin=default_origin, domain=domain)

10.3 ms ± 84.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## mul

In [14]:
# numpy
%timeit c[...] = a[...] * b[...]

40.4 ms ± 479 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [15]:
# gt4py
set_annotations(stencil_mul_defs, dtype)
stencil_mul = gtscript.stencil(
    backend=backend, definition=stencil_mul_defs, rebuild=False
)
%timeit stencil_mul(in_a=a, in_b=b, out_c=c, origin=(0, 0, 0), domain=storage_shape)

18.6 ms ± 90.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [16]:
# gt4py: make origin coincide with default_origin
domain = tuple(s - 2*d for s, d in zip(storage_shape, default_origin))
%timeit stencil_mul(in_a=a, in_b=b, out_c=c, origin=default_origin, domain=domain)

18 ms ± 38.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## imul

In [17]:
# numpy
%timeit a[...] *= b[...]

17 ms ± 1.75 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [18]:
# gt4py
a[...] = a_dc[...]
set_annotations(stencil_imul_defs, dtype)
stencil_imul = gtscript.stencil(
    backend=backend, definition=stencil_imul_defs, rebuild=False
)
%timeit stencil_imul(inout_a=a, in_b=b, origin=(0, 0, 0), domain=storage_shape)

10.8 ms ± 102 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [19]:
# gt4py: make origin coincide with default_origin
a[...] = a_dc[...]
domain = tuple(s - 2*d for s, d in zip(storage_shape, default_origin))
%timeit stencil_imul(inout_a=a, in_b=b, origin=default_origin, domain=domain)

10.2 ms ± 159 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## copy

In [20]:
# numpy
%timeit a[...] = b[...]

11.7 ms ± 45.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [21]:
# gt4py
set_annotations(stencil_copy_defs, dtype)
stencil_copy = gtscript.stencil(
    backend=backend, definition=stencil_copy_defs, rebuild=False
)
%timeit stencil_copy(src=b, dst=a, origin=(0, 0, 0), domain=storage_shape)

11 ms ± 57.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## copychange

In [4]:
# numpy
%timeit a[...] = - b[...]

36.1 ms ± 682 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
# gt4py
set_annotations(stencil_copychange_defs, dtype)
stencil_copychange = gtscript.stencil(
    backend=backend, definition=stencil_copychange_defs, rebuild=False
)
%timeit stencil_copychange(src=b, dst=a, origin=(0, 0, 0), domain=storage_shape)

10.9 ms ± 97.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## addsub

In [22]:
# numpy
%timeit d[...] = a[...] + b[...] - c[...]

67.5 ms ± 256 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [23]:
# gt4py
set_annotations(stencil_addsub_defs, dtype)
stencil_addsub = gtscript.stencil(
    backend=backend, definition=stencil_addsub_defs, rebuild=False
)
%timeit stencil_addsub(in_a=a, in_b=b, in_c=c, out_d=d, origin=(0, 0, 0), domain=storage_shape)

24 ms ± 201 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [24]:
# gt4py: make origin coincide with default_origin
domain = tuple(s - 2*d for s, d in zip(storage_shape, default_origin))
%timeit stencil_addsub(in_a=a, in_b=b, in_c=c, out_d=d, origin=default_origin, domain=domain)

23.3 ms ± 158 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## iaddsub

In [25]:
# numpy
%timeit a[...] += b[...] - c[...]

42.1 ms ± 621 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [26]:
# gt4py
a[...] = a_dc[...]
stencil_iaddsub = gtscript.stencil(
    backend=backend, definition=stencil_iaddsub_defs, rebuild=False
)
%timeit stencil_iaddsub(inout_a=a, in_b=b, in_c=c, origin=(0, 0, 0), domain=storage_shape)

18.3 ms ± 43.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [27]:
# gt4py: make origin coincide with default_origin
a[...] = a_dc[...]
domain = tuple(s - 2*d for s, d in zip(storage_shape, default_origin))
%timeit stencil_iaddsub(inout_a=a, in_b=b, in_c=c, origin=default_origin, domain=domain)

17.5 ms ± 392 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## fma

In [28]:
# numpy
%timeit c[...] = a[...] + f * b[...]

62.7 ms ± 498 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [29]:
# gt4py
set_annotations(stencil_fma_defs, dtype)
stencil_fma = gtscript.stencil(
    backend=backend, definition=stencil_fma_defs, rebuild=False
)
%timeit stencil_fma(in_a=a, in_b=b, out_c=c, f=f, origin=(0, 0, 0), domain=storage_shape)

18.4 ms ± 47.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [30]:
# gt4py: make origin coincide with default_origin
domain = tuple(s - 2*d for s, d in zip(storage_shape, default_origin))
%timeit stencil_fma(in_a=a, in_b=b, out_c=c, f=f, origin=default_origin, domain=domain)

17.7 ms ± 60.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## sts_rk2_0

In [31]:
# numpy
%timeit d[...] = 0.5 * (a[...] + b[...] + f * c[...])

112 ms ± 210 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [32]:
# gt4py
set_annotations(stencil_sts_rk2_0_defs, dtype)
stencil_sts_rk2_0 = gtscript.stencil(
    backend=backend, definition=stencil_sts_rk2_0_defs, rebuild=False
)
%timeit stencil_sts_rk2_0(a, b, c, d, dt=f, origin=(0, 0, 0), domain=storage_shape)

23.3 ms ± 101 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [33]:
# gt4py: make origin coincide with default_origin
domain = tuple(s - 2*d for s, d in zip(storage_shape, default_origin))
%timeit stencil_sts_rk2_0(a, b, c, d, dt=f, origin=default_origin, domain=domain)

22.7 ms ± 112 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## sts_rk3ws_0

In [34]:
# numpy
%timeit d[...] = (2.0 * a[...] + b[...] + f * c[...]) / 3.0

135 ms ± 1.79 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [35]:
# gt4py
set_annotations(stencil_sts_rk3ws_0_defs, dtype)
stencil_sts_rk3ws_0 = gtscript.stencil(
    backend=backend, definition=stencil_sts_rk3ws_0_defs, rebuild=False
)
%timeit stencil_sts_rk3ws_0(a, b, c, d, dt=f, origin=(0, 0, 0), domain=storage_shape)

23 ms ± 361 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [36]:
# gt4py: make origin coincide with default_origin
domain = tuple(s - 2*d for s, d in zip(storage_shape, default_origin))
%timeit stencil_sts_rk3ws_0(a, b, c, d, dt=f, origin=default_origin, domain=domain)

22.5 ms ± 1.23 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
