# This notebook benchmarks matrix multiplication at different precisions, optionally followed by a call to cp.array / cp.asarray on the result.

# The hardware used is a 1080 Ti GPU connected via a PCIe 3.0 x8 link

In [17]:
import cupy as cp

In [18]:
# Make GPU 2 the current CUDA device for the CuPy calls that follow.
# NOTE(review): the index is machine-specific; per the heading above this is presumably the 1080 Ti — confirm.
cp.cuda.Device(2).use()

In [53]:
# Display the installed CuPy version so the timings below can be tied to a release.
cp.__version__

'9.1.0'

In [19]:
def mult_asarr(x1, x2):
    """Multiply ``x1 @ x2`` and pass the product through ``cp.asarray``.

    Benchmark variant used to measure what the extra ``cp.asarray`` call
    costs on top of the plain matrix product.

    Parameters
    ----------
    x1, x2 : array-like
        Operands of the matrix product.

    Returns
    -------
    The value returned by ``cp.asarray`` for the product.
    """
    product = x1 @ x2
    return cp.asarray(product)

def mult(x1, x2):
    """Return the matrix product ``x1 @ x2``.

    Baseline benchmark variant: the product is returned as-is, with no
    further array construction call applied to it.
    """
    return x1 @ x2

def mult_arr(x1, x2):
    """Multiply ``x1 @ x2`` and wrap the product with ``cp.array(..., copy=True)``.

    Benchmark variant used to measure what an explicit copy of the result
    costs on top of the plain matrix product.

    Parameters
    ----------
    x1, x2 : array-like
        Operands of the matrix product.

    Returns
    -------
    The value returned by ``cp.array`` for the product, requested with
    ``copy=True``.
    """
    product = x1 @ x2
    return cp.array(product, copy=True)

def mult_arr_nc(x1, x2):
    """Multiply ``x1 @ x2`` and wrap the product with ``cp.array(..., copy=False)``.

    Benchmark variant ("nc" = no copy) that goes through ``cp.array`` but
    asks it not to copy the product.

    Parameters
    ----------
    x1, x2 : array-like
        Operands of the matrix product.

    Returns
    -------
    The value returned by ``cp.array`` for the product, requested with
    ``copy=False``.
    """
    product = x1 @ x2
    return cp.array(product, copy=False)


In [20]:
import numpy as np

# Double Complex benchmarks

In [21]:
# Two 2000x2000 complex128 operands: uniform-random real and imaginary parts are
# drawn on the host with NumPy, then handed to cp.array and reshaped to 2-D.
x1 = cp.array(
    np.random.uniform(size=2000 * 2000) + 1j * np.random.uniform(size=2000 * 2000),
    dtype=cp.complex128,
).reshape((2000, 2000))
x2 = cp.array(
    np.random.uniform(size=2000 * 2000) + 1j * np.random.uniform(size=2000 * 2000),
    dtype=cp.complex128,
).reshape((2000, 2000))

In [22]:
%%timeit -n 100 -r 10
x3 = x1@x2

The slowest run took 2225.38 times longer than the fastest. This could mean that an intermediate result is being cached.
80 ms ± 80.1 ms per loop (mean ± std. dev. of 10 runs, 100 loops each)


In [23]:
%%timeit -n 100 -r 10
x3 = cp.array(x1@x2,copy=True)

The slowest run took 8718.64 times longer than the fastest. This could mean that an intermediate result is being cached.
195 ms ± 227 ms per loop (mean ± std. dev. of 10 runs, 100 loops each)


In [24]:
from cupyx.time import repeat



In [25]:
# Time `mult` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult, args=(x1, x2), n_repeat=100, n_warmup=30))

mult                :    CPU:  116.431 us   +/-36.022 (min:   84.350 / max:  253.268) us     GPU-2:166508.626 us   +/-66.168 (min:166389.725 / max:166788.101) us


In [26]:
# Time `mult_asarr` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult_asarr, args=(x1, x2), n_repeat=100, n_warmup=30))

mult_asarr          :    CPU:  179.477 us   +/-66.521 (min:   90.359 / max:  372.948) us     GPU-2:166558.448 us   +/-65.667 (min:166420.486 / max:166797.318) us


In [27]:
# Time `mult_arr` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult_arr, args=(x1, x2), n_repeat=100, n_warmup=30))

mult_arr            :    CPU:  180.583 us   +/-67.698 (min:  109.039 / max:  352.937) us     GPU-2:166913.096 us   +/-64.060 (min:166813.477 / max:167102.432) us


In [28]:
# Time `mult_arr_nc` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult_arr_nc, args=(x1, x2), n_repeat=100, n_warmup=30))

mult_arr_nc         :    CPU:  141.459 us   +/-51.971 (min:   89.989 / max:  253.288) us     GPU-2:166563.221 us   +/-73.784 (min:166426.620 / max:166861.816) us


In [130]:
%%timeit -n 100 -r 10
mult(x1,x2);

The slowest run took 2078.32 times longer than the fastest. This could mean that an intermediate result is being cached.
82.1 ms ± 82.2 ms per loop (mean ± std. dev. of 10 runs, 100 loops each)


In [131]:
%%timeit -n 100 -r 10
mult_arr(x1,x2);

197 ms ± 37.4 ms per loop (mean ± std. dev. of 10 runs, 100 loops each)


In [132]:
%%timeit -n 100 -r 10
mult_asarr(x1,x2);

139 ms ± 27.3 ms per loop (mean ± std. dev. of 10 runs, 100 loops each)


# Single Complex Benchmarks

In [35]:
# Two 2000x2000 complex64 operands: uniform-random real and imaginary parts are
# drawn on the host with NumPy, then handed to cp.array (which is asked for
# complex64) and reshaped to 2-D.
x1 = cp.array(
    np.random.uniform(size=2000 * 2000) + 1j * np.random.uniform(size=2000 * 2000),
    dtype=cp.complex64,
).reshape((2000, 2000))
x2 = cp.array(
    np.random.uniform(size=2000 * 2000) + 1j * np.random.uniform(size=2000 * 2000),
    dtype=cp.complex64,
).reshape((2000, 2000))

In [36]:
from cupyx.time import repeat



In [37]:
# Time `mult` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult, args=(x1, x2), n_repeat=100, n_warmup=30))

mult                :    CPU:   96.657 us   +/-33.156 (min:   78.560 / max:  250.238) us     GPU-2: 6136.612 us   +/-96.065 (min: 5981.184 / max: 6447.104) us


In [38]:
# Time `mult_asarr` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult_asarr, args=(x1, x2), n_repeat=100, n_warmup=30))

mult_asarr          :    CPU:   89.201 us   +/-19.072 (min:   81.540 / max:  241.718) us     GPU-2: 6121.749 us   +/-40.298 (min: 6022.144 / max: 6293.504) us


In [39]:
# Time `mult_arr` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult_arr, args=(x1, x2), n_repeat=100, n_warmup=30))

mult_arr            :    CPU:  113.073 us   +/-30.773 (min:  100.930 / max:  309.508) us     GPU-2: 6294.140 us   +/-40.418 (min: 6181.888 / max: 6409.216) us


In [40]:
# Time `mult_arr_nc` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult_arr_nc, args=(x1, x2), n_repeat=100, n_warmup=30))

mult_arr_nc         :    CPU:   86.710 us   +/-17.326 (min:   81.820 / max:  248.969) us     GPU-2: 6151.248 us   +/-27.968 (min: 6041.600 / max: 6301.696) us


# Double Benchmarks

In [41]:
# Two identical 2000x2000 float64 operands holding 0 .. 2000*2000 - 1 in row-major order.
x1 = cp.arange(2000 * 2000, dtype=cp.float64).reshape((2000, 2000))
x2 = cp.arange(2000 * 2000, dtype=cp.float64).reshape((2000, 2000))

In [42]:
%%timeit -n 100 -r 10
x3 = x1@x2

The slowest run took 595.32 times longer than the fastest. This could mean that an intermediate result is being cached.
30 ms ± 20.3 ms per loop (mean ± std. dev. of 10 runs, 100 loops each)


In [43]:
%%timeit -n 100 -r 10
x3 = cp.array(x1@x2)

The slowest run took 1581.77 times longer than the fastest. This could mean that an intermediate result is being cached.
49.7 ms ± 38.4 ms per loop (mean ± std. dev. of 10 runs, 100 loops each)


In [44]:
# Time `mult` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult, args=(x1, x2), n_repeat=100, n_warmup=30))

mult                :    CPU:  102.837 us   +/-31.784 (min:   82.109 / max:  248.529) us     GPU-2:45816.684 us   +/-35.946 (min:45784.065 / max:46047.039) us


In [45]:
# Time `mult_asarr` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult_asarr, args=(x1, x2), n_repeat=100, n_warmup=30))

mult_asarr          :    CPU:   97.622 us   +/-21.741 (min:   83.730 / max:  249.818) us     GPU-2:45809.086 us   +/-18.976 (min:45784.065 / max:45946.880) us


In [46]:
# Time `mult_arr` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult_arr, args=(x1, x2), n_repeat=100, n_warmup=30))

mult_arr            :    CPU:  117.513 us   +/-32.771 (min:  104.140 / max:  344.598) us     GPU-2:45974.173 us   +/-23.601 (min:45954.048 / max:46135.296) us


In [47]:
# Time `mult_arr_nc` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult_arr_nc, args=(x1, x2), n_repeat=100, n_warmup=30))

mult_arr_nc         :    CPU:   91.245 us   +/-17.941 (min:   83.300 / max:  244.238) us     GPU-2:45801.933 us   +/-15.540 (min:45784.065 / max:45935.616) us


# Single Benchmarks

In [48]:
# Two identical 2000x2000 float32 operands holding 0 .. 2000*2000 - 1 in row-major order.
x1 = cp.arange(2000 * 2000, dtype=cp.float32).reshape((2000, 2000))
x2 = cp.arange(2000 * 2000, dtype=cp.float32).reshape((2000, 2000))

In [49]:
# Time `mult` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult, args=(x1, x2), n_repeat=100, n_warmup=30))

mult                :    CPU:   84.947 us   +/-10.653 (min:   77.920 / max:  131.269) us     GPU-2: 1739.193 us   +/-12.112 (min: 1718.272 / max: 1769.472) us


In [50]:
# Time `mult_asarr` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult_asarr, args=(x1, x2), n_repeat=100, n_warmup=30))

mult_asarr          :    CPU:   84.086 us   +/- 4.582 (min:   80.399 / max:  121.170) us     GPU-2: 1729.584 us   +/- 4.898 (min: 1722.240 / max: 1762.208) us


In [51]:
# Time `mult_arr` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult_arr, args=(x1, x2), n_repeat=100, n_warmup=30))

mult_arr            :    CPU:  105.760 us   +/- 4.220 (min:  101.959 / max:  138.680) us     GPU-2: 1814.397 us   +/- 4.001 (min: 1807.360 / max: 1832.960) us


In [52]:
# Time `mult_arr_nc` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult_arr_nc, args=(x1, x2), n_repeat=100, n_warmup=30))

mult_arr_nc         :    CPU:   86.511 us   +/-14.318 (min:   81.090 / max:  204.599) us     GPU-2: 1740.374 us   +/-13.098 (min: 1724.416 / max: 1830.656) us


# The hardware used is a 2080 Ti GPU connected via a PCIe 3.0 x16 link

In [54]:
# Switch the current CUDA device to GPU 0 for the CuPy calls that follow.
# NOTE(review): the index is machine-specific; per the heading above this is presumably the 2080 Ti — confirm.
cp.cuda.Device(0).use()

# Double Complex benchmarks

In [55]:
# Two 2000x2000 complex128 operands: uniform-random real and imaginary parts are
# drawn on the host with NumPy, then handed to cp.array and reshaped to 2-D.
x1 = cp.array(
    np.random.uniform(size=2000 * 2000) + 1j * np.random.uniform(size=2000 * 2000),
    dtype=cp.complex128,
).reshape((2000, 2000))
x2 = cp.array(
    np.random.uniform(size=2000 * 2000) + 1j * np.random.uniform(size=2000 * 2000),
    dtype=cp.complex128,
).reshape((2000, 2000))

In [56]:
%%timeit -n 100 -r 10
x3 = x1@x2

The slowest run took 1776.72 times longer than the fastest. This could mean that an intermediate result is being cached.
64 ms ± 63.7 ms per loop (mean ± std. dev. of 10 runs, 100 loops each)


In [57]:
%%timeit -n 100 -r 10
x3 = cp.array(x1@x2,copy=True)

The slowest run took 6979.55 times longer than the fastest. This could mean that an intermediate result is being cached.
165 ms ± 185 ms per loop (mean ± std. dev. of 10 runs, 100 loops each)


In [58]:
from cupyx.time import repeat



In [59]:
# Time `mult` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult, args=(x1, x2), n_repeat=100, n_warmup=30))

mult                :    CPU:  152.365 us   +/-59.756 (min:   83.360 / max:  259.109) us     GPU-0:144359.401 us   +/-5557.052 (min:135135.239 / max:153556.992) us


In [60]:
# Time `mult_asarr` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult_asarr, args=(x1, x2), n_repeat=100, n_warmup=30))

mult_asarr          :    CPU:  167.561 us   +/-63.160 (min:   91.080 / max:  269.889) us     GPU-0:144103.770 us   +/-5486.232 (min:135075.745 / max:154463.226) us


In [61]:
# Time `mult_arr` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult_arr, args=(x1, x2), n_repeat=100, n_warmup=30))

mult_arr            :    CPU:  237.373 us   +/-67.534 (min:  119.950 / max:  402.548) us     GPU-0:144880.880 us   +/-5591.313 (min:135366.013 / max:156071.838) us


In [62]:
# Time `mult_arr_nc` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult_arr_nc, args=(x1, x2), n_repeat=100, n_warmup=30))

mult_arr_nc         :    CPU:  178.968 us   +/-53.741 (min:   90.839 / max:  300.279) us     GPU-0:144002.112 us   +/-5295.327 (min:135077.667 / max:154351.166) us


In [63]:
%%timeit -n 100 -r 10
mult(x1,x2);

The slowest run took 1732.79 times longer than the fastest. This could mean that an intermediate result is being cached.
70.3 ms ± 70.3 ms per loop (mean ± std. dev. of 10 runs, 100 loops each)


In [64]:
%%timeit -n 100 -r 10
mult_arr(x1,x2);

168 ms ± 32.2 ms per loop (mean ± std. dev. of 10 runs, 100 loops each)


In [65]:
%%timeit -n 100 -r 10
mult_asarr(x1,x2);

119 ms ± 23.3 ms per loop (mean ± std. dev. of 10 runs, 100 loops each)


# Single Complex Benchmarks

In [66]:
# Two 2000x2000 complex64 operands: uniform-random real and imaginary parts are
# drawn on the host with NumPy, then handed to cp.array (which is asked for
# complex64) and reshaped to 2-D.
x1 = cp.array(
    np.random.uniform(size=2000 * 2000) + 1j * np.random.uniform(size=2000 * 2000),
    dtype=cp.complex64,
).reshape((2000, 2000))
x2 = cp.array(
    np.random.uniform(size=2000 * 2000) + 1j * np.random.uniform(size=2000 * 2000),
    dtype=cp.complex64,
).reshape((2000, 2000))

In [67]:
from cupyx.time import repeat



In [68]:
# Time `mult` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult, args=(x1, x2), n_repeat=100, n_warmup=30))

mult                :    CPU:   87.802 us   +/-19.414 (min:   82.039 / max:  270.289) us     GPU-0: 5865.767 us   +/-50.319 (min: 5734.944 / max: 6070.528) us


In [69]:
# Time `mult_asarr` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult_asarr, args=(x1, x2), n_repeat=100, n_warmup=30))

mult_asarr          :    CPU:   89.805 us   +/-17.634 (min:   84.180 / max:  255.629) us     GPU-0: 5858.912 us   +/-61.742 (min: 5734.656 / max: 6054.080) us


In [70]:
# Time `mult_arr` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult_arr, args=(x1, x2), n_repeat=100, n_warmup=30))

mult_arr            :    CPU:  114.124 us   +/-10.103 (min:  106.700 / max:  162.310) us     GPU-0: 5966.839 us   +/-61.517 (min: 5857.024 / max: 6064.192) us


In [71]:
# Time `mult_arr_nc` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult_arr_nc, args=(x1, x2), n_repeat=100, n_warmup=30))

mult_arr_nc         :    CPU:   88.828 us   +/-17.024 (min:   83.729 / max:  255.639) us     GPU-0: 5888.378 us   +/-71.942 (min: 5746.432 / max: 6155.264) us


# Double Benchmarks

In [72]:
# Two identical 2000x2000 float64 operands holding 0 .. 2000*2000 - 1 in row-major order.
x1 = cp.arange(2000 * 2000, dtype=cp.float64).reshape((2000, 2000))
x2 = cp.arange(2000 * 2000, dtype=cp.float64).reshape((2000, 2000))

In [73]:
%%timeit -n 100 -r 10
x3 = x1@x2

The slowest run took 453.32 times longer than the fastest. This could mean that an intermediate result is being cached.
25.2 ms ± 17 ms per loop (mean ± std. dev. of 10 runs, 100 loops each)


In [74]:
%%timeit -n 100 -r 10
x3 = cp.array(x1@x2)

The slowest run took 1266.16 times longer than the fastest. This could mean that an intermediate result is being cached.
41.1 ms ± 31.8 ms per loop (mean ± std. dev. of 10 runs, 100 loops each)


In [75]:
# Time `mult` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult, args=(x1, x2), n_repeat=100, n_warmup=30))

mult                :    CPU:  179.479 us   +/-52.906 (min:   88.710 / max:  255.609) us     GPU-0:38019.862 us   +/-1219.887 (min:36090.145 / max:40228.642) us


In [76]:
# Time `mult_asarr` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult_asarr, args=(x1, x2), n_repeat=100, n_warmup=30))

mult_asarr          :    CPU:  116.679 us   +/-50.911 (min:   86.289 / max:  294.279) us     GPU-0:37948.303 us   +/-1001.912 (min:36087.807 / max:39544.830) us


In [77]:
# Time `mult_arr` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult_arr, args=(x1, x2), n_repeat=100, n_warmup=30))

mult_arr            :    CPU:  119.390 us   +/-28.024 (min:  106.580 / max:  299.999) us     GPU-0:37784.357 us   +/-1079.357 (min:36205.055 / max:40059.551) us


In [78]:
# Time `mult_arr_nc` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult_arr_nc, args=(x1, x2), n_repeat=100, n_warmup=30))

mult_arr_nc         :    CPU:   99.056 us   +/-24.263 (min:   85.920 / max:  247.239) us     GPU-0:37877.397 us   +/-1264.400 (min:36084.095 / max:40126.465) us


# Single Benchmarks

In [79]:
# Two identical 2000x2000 float32 operands holding 0 .. 2000*2000 - 1 in row-major order.
x1 = cp.arange(2000 * 2000, dtype=cp.float32).reshape((2000, 2000))
x2 = cp.arange(2000 * 2000, dtype=cp.float32).reshape((2000, 2000))

In [80]:
# Time `mult` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult, args=(x1, x2), n_repeat=100, n_warmup=30))

mult                :    CPU:   92.214 us   +/- 3.939 (min:   89.259 / max:  125.220) us     GPU-0: 1588.180 us   +/-66.454 (min: 1530.112 / max: 1743.200) us


In [81]:
# Time `mult_asarr` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult_asarr, args=(x1, x2), n_repeat=100, n_warmup=30))

mult_asarr          :    CPU:   94.536 us   +/- 4.862 (min:   90.910 / max:  132.240) us     GPU-0: 1544.783 us   +/-10.859 (min: 1529.472 / max: 1581.056) us


In [82]:
# Time `mult_arr` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult_arr, args=(x1, x2), n_repeat=100, n_warmup=30))

mult_arr            :    CPU:  114.795 us   +/- 2.394 (min:  111.319 / max:  127.420) us     GPU-0: 1635.588 us   +/-58.637 (min: 1577.152 / max: 1764.192) us


In [83]:
# Time `mult_arr_nc` on the current x1, x2: 30 warm-up calls, then 100 measured calls.
print(repeat(mult_arr_nc, args=(x1, x2), n_repeat=100, n_warmup=30))

mult_arr_nc         :    CPU:   93.958 us   +/- 3.701 (min:   90.670 / max:  115.840) us     GPU-0: 1540.557 us   +/- 7.272 (min: 1528.448 / max: 1555.296) us
