# Benchmarking: cupy

Timing a cupy implementation of some basic mathematical operations.

## User input

In [1]:
import numpy as np

# --- storage: element type and 3-D extent of every benchmarked array ---
storage_shape = (321, 321, 120)
dtype = np.float64

# --- serialization: whether and where the timings are written ---
filename = "timings_daint.xlsx"
serialize = True
# one spreadsheet sheet per array shape, e.g. "321 x 321 x 120"
sheetname = "{0} x {1} x {2}".format(*storage_shape)

## Data initialization

In [2]:
# install and import cupy
is_installed = !pip list 2> /dev/null | grep 'cupy'
if not is_installed:
    !pip install cupy-cuda101
import cupy as cp

In [3]:
from copy import deepcopy

def _random_field():
    """Allocate a zeroed array of `storage_shape`/`dtype` and fill it with uniform random values."""
    field = cp.zeros(storage_shape, dtype=dtype)
    field[...] = cp.random.rand(*storage_shape)
    return field


# input fields, drawn in the order a, b, c
a = _random_field()
a_dc = deepcopy(a)  # independent copy of the initial state of `a`
b = _random_field()
c = _random_field()

# output field, left zero-initialized
d = cp.zeros(storage_shape, dtype=dtype)

# random scalar factor, converted to a plain Python float
f = cp.random.rand(1).item()

## Serialization setup

In [4]:
# install and import openpyxl
is_installed = !pip list 2> /dev/null | grep 'openpyxl'
if not is_installed:
    !pip install openpyxl==2.6
import openpyxl as xl

# install and import pandas
is_installed = !pip list 2> /dev/null | grep 'pandas'
if not is_installed:
    !pip install pandas
import pandas as pd

# install xlrd
is_installed = !pip list 2> /dev/null | grep 'xlrd'
if not is_installed:
    !pip install xlrd

In [5]:
# dataframe's indices (one row per benchmarked operation) and column
index = [
    "copy",
    "copychange",
    "abs",
    "iabs",
    "add",
    "iadd",
    "sub",
    "isub",
    "mul",
    "imul",
    "addsub",
    "iaddsub",
    "fma",
    "sts_rk2_0",
    "sts_rk3ws_0"
]
column = "cupy"

# empty timing table; replaced below when the sheet already holds results
df = pd.DataFrame(data=[None,]*len(index), index=index, columns=[column,])

if serialize:
    import os

    # open the workbook, creating the spreadsheet file first if it is missing
    if os.path.exists(filename):
        wb = xl.load_workbook(filename)
    else:
        wb = xl.Workbook()
        wb.save(filename=filename)

    sheet_exists = sheetname in wb.sheetnames
    if sheet_exists:
        # Load the earlier timings from the file itself (not through the
        # writer), and do so before the writer is created.
        # NOTE(review): newer pandas releases appear to open the target file as
        # soon as an ExcelWriter is constructed; reading first avoids relying
        # on the file still being intact afterwards — confirm.
        df = pd.read_excel(filename, sheet_name=sheetname, index_col=0)

        # remove the stale sheet from the in-memory workbook; the dataframe is
        # written back under the same sheet name when the writer is used again
        sheetid = wb.sheetnames.index(sheetname)
        wb.remove(wb.worksheets[sheetid])

    # Create the ExcelWriter WITHOUT a `with` block: the writer is used again
    # further down the notebook, and leaving a `with` block would close it.
    writer = pd.ExcelWriter(filename, engine="openpyxl")
    writer.book = wb

    if not sheet_exists:
        # register a fresh sheet holding the empty dataframe
        df.to_excel(writer, sheet_name=sheetname)

## Timing

In [6]:
# copy
out = %timeit -o a[...] = b[...]
df.at["copy", column] = 1000 * out.average

383 µs ± 10.7 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [7]:
# copychange
out = %timeit -o a = - b

20.3 µs ± 11.6 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
# copychange with assignment
out = %timeit -o a[...] = - b[...]
df.at["copychange", column] = 1000 * out.average

752 µs ± 40.1 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [9]:
# abs
out = %timeit -o b = cp.abs(a)

25.7 µs ± 13.1 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
# abs with assignment
out = %timeit -o b[...] = cp.abs(a)
df.at["abs", column] = 1000 * out.average

751 µs ± 31 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [11]:
# iabs with assignment
out = %timeit -o a[...] = cp.abs(a)
df.at["iabs", column] = 1000 * out.average

752 µs ± 9.87 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [12]:
# add
out = %timeit -o c = a + b

23.3 µs ± 11.6 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
# add with assignment
out = %timeit -o c[...] = a[...] + b[...]
df.at["add", column] = 1000 * out.average

921 µs ± 20.6 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [14]:
# iadd with assignment
out = %timeit -o a[...] += b[...]
df.at["iadd", column] = 1000 * out.average

536 µs ± 12.7 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [15]:
# sub
out = %timeit -o c = a - b

20.2 µs ± 11.2 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
# sub with assignment
out = %timeit -o c[...] = a[...] - b[...]
df.at["sub", column] = 1000 * out.average

921 µs ± 20 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [17]:
# isub with assignment
out = %timeit -o a[...] -= b[...]
df.at["isub", column] = 1000 * out.average

536 µs ± 15.1 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [18]:
# mul
out = %timeit -o c = a * b

19.9 µs ± 10.9 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
# mul with assignment
out = %timeit -o c[...] = a[...] * b[...]
df.at["mul", column] = 1000 * out.average

921 µs ± 17.3 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [20]:
# imul with assignment
out = %timeit -o a[...] *= b[...]
df.at["imul", column] = 1000 * out.average

536 µs ± 10.5 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [21]:
# addsub
out = %timeit -o d = a + b - c

1.07 ms ± 16.8 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [22]:
# addsub with assignment
out = %timeit -o d[...] = a[...] + b[...] - c[...]
df.at["addsub", column] = 1000 * out.average

1.6 ms ± 249 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [23]:
# iaddsub with assignment
out = %timeit -o a[...] += b[...] - c[...]
df.at["iaddsub", column] = 1000 * out.average

1.07 ms ± 21.3 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [24]:
# fma
out = %timeit -o c = a + f * b

74.8 µs ± 13.6 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [25]:
# fma with assignment
out = %timeit -o c[...] = a[...] + f * b[...]
df.at["fma", column] = 1000 * out.average

1.29 ms ± 39.3 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [26]:
# sts_rk2_0
out = %timeit -o d = 0.5 * (a + b + f * c)

1.81 ms ± 49.2 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [27]:
# sts_rk2_0 with assignment
out = %timeit -o d[...] = 0.5 * (a[...] + b[...] + f * c[...])
df.at["sts_rk2_0", column] = 1000 * out.average

2.24 ms ± 110 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [28]:
# sts_rk3ws_0
out = %timeit -o d = (2.0 * a + b + f * c) / 3.0

181 µs ± 25.7 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [29]:
# sts_rk3ws_0 with assignment
out = %timeit -o d[...] = (2.0 * a[...] + b[...] + f * c[...]) / 3.0
df.at["sts_rk3ws_0", column] = 1000 * out.average

2.56 ms ± 41.8 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## Serialization

In [30]:
if serialize:
    # write the timing table into its sheet of the workbook
    df.to_excel(writer, sheet_name=sheetname)
    # Flush the workbook to disk. close() is used instead of save() because
    # save() is deprecated and later removed in newer pandas releases, while
    # close() is available across versions and also writes the file.
    writer.close()

In [31]:
# display the collected timings (values were stored as 1000 * mean seconds, i.e. milliseconds)
df

Unnamed: 0,numpy,numexpr,cupy
copy,9.449362,,0.383272
copychange,50.33706,20.007233,0.751502
abs,49.926181,,0.751421
iabs,50.135189,,0.751514
add,55.094303,20.875209,0.920538
iadd,15.295182,,0.535888
sub,54.904362,20.907313,0.920541
isub,15.50136,,0.535892
mul,55.079714,22.223766,0.920547
imul,18.34474,,0.53589
