In [1]:
import shlex
import subprocess

import matplotlib.pyplot as plt
import seaborn as sns

# Start from seaborn's default rc settings, then apply the 'whitegrid' theme
# so every figure drawn later in the notebook shares the same look.
sns.reset_defaults()
sns.set_theme(style='whitegrid')

In [2]:
def bench(source, n=2**16, cc='g++', dec=False):
    """Compile ``{source}.cc`` with ``N`` set to ``n``, run the binary, and return its score.

    The binary is written to ``./run`` in the current working directory and
    executed from there. The captured output lines (compiler output first,
    then program output, stderr merged into stdout) are printed together with
    ``source`` and ``n``.

    Parameters
    ----------
    source : str
        Name of the C++ source file without the ``.cc`` extension.
    n : int
        Value passed to the compiler as ``-D N=<n>``.
    cc : str
        Compiler command; split shell-style, so it may carry extra flags.
    dec : bool
        When true, ``-D DEC`` is passed to the compiler as well.

    Returns
    -------
    float
        First whitespace-separated token of the last output line (the runs
        recorded in this notebook end with a line such as ``'1.58 GFLOPS'``).

    Raises
    ------
    RuntimeError
        If the compiler or the binary cannot be started, exits with a non-zero
        status, or the last output line does not start with a number.
    """
    compile_cmd = [
        *shlex.split(cc),
        '-std=c++17', '-O3', '-funroll-loops', '-march=native',
        '-D', f'N={n}',
    ]
    if dec:
        compile_cmd += ['-D', 'DEC']
    compile_cmd += [f'{source}.cc', '-o', 'run']

    res = []
    # Argument lists (no shell) keep unusual file names or flags from being
    # re-interpreted; the binary is only run when compilation succeeded.
    for cmd in (compile_cmd, ['./run']):
        try:
            proc = subprocess.run(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                universal_newlines=True,
            )
        except OSError as exc:
            raise RuntimeError(f'{source}: could not start {cmd[0]!r}: {exc}') from exc
        if proc.returncode != 0:
            raise RuntimeError(
                f'{source}: {cmd[0]!r} exited with status {proc.returncode}:\n{proc.stdout}'
            )
        res += proc.stdout.splitlines()

    print(source, n, res)
    try:
        return float(res[-1].split()[0])
    except (IndexError, ValueError) as exc:
        raise RuntimeError(f'{source}: unexpected benchmark output {res!r}') from exc

# Array sizes swept by the benchmarks: the powers of two from 2**10 through 2**20.
ns = [1 << k for k in range(10, 21)]

In [24]:
bench('simple')

simple 65536 ['35545 3722', '0.00004148 x 32768', '1.58 GFLOPS']


1.58

In [189]:
def f(n):
    """Return the partial harmonic sum 1/1 + 1/2 + ... + 1/n.

    Terms are added in order of increasing denominator. For n < 1 nothing is
    added and the integer 0 is returned.
    """
    total = 0
    for k in range(n):
        total += 1 / (k + 1)
    return total

f(2**13)

9.588190046095265

In [10]:
bench('simdmin-intrinsics2', n=2**14)

simdmin-intrinsics2 16384 ['1270 100669', '8.82 GFLOPS']


8.82

In [9]:
bench('simdmin-intrinsics-ilp', n=2**14)

simdmin-intrinsics-ilp 16384 ['1270 100669', '10.38 GFLOPS']


10.38

In [11]:
# Kernel variants to compare; each name is the stem of a .cc file handed to bench().
sources = [
    'std', 'simple', 'hint', 'index',
    'simdmin-intrinsics2',
    'simdmin-intrinsics-ilp',
    'simdmin-intrinsics-ilp3',
    'combined',
]

# One score per variant, in the order listed above, all at n = 2**13.
r = [bench(variant, n=2**13) for variant in sources]

std 8192 ['1270 100669', '0.28 GFLOPS']
simple 8192 ['1270 100669', '1.54 GFLOPS']
hint 8192 ['1270 100669', '1.95 GFLOPS']
index 8192 ['1270 100669', '4.08 GFLOPS']
simdmin-intrinsics2 8192 ['1270 100669', '8.51 GFLOPS']
simdmin-intrinsics-ilp 8192 ['1270 100669', '8.82 GFLOPS']
simdmin-intrinsics-ilp3 8192 ['1270 100669', '22.44 GFLOPS']
combined 8192 ['1270 100669', '20.54 GFLOPS']


In [5]:
bench('index', n=2**13)

index 8192 ['1270 100669', '4.06 GFLOPS']


4.06

In [3]:
bench('simple', n=2**13)

simple 8192 ['1270 100669', '1.55 GFLOPS']


1.55

In [8]:
# NOTE(review): no output is recorded for this cell — presumably the run failed or the
# output was cleared; confirm that index-intrinsics.cc still builds and runs.
bench('index-intrinsics', n=2**13)

In [19]:
bench('combined', n=2**13, dec=True)

combined 8192 ['8191 1', '12.92 GFLOPS']


12.92

In [22]:
bench('simdmin-intrinsics-ilp', n=2**13)

simdmin-intrinsics-ilp 8192 ['1270 100669', '9.28 GFLOPS']


9.28

In [14]:
d = [bench(source, n=2**13, dec=True) for source in sources]

std 8192 ['8191 1', '0.28 GFLOPS']
simple 8192 ['8191 1', '1.89 GFLOPS']
hint 8192 ['8191 1', '0.75 GFLOPS']
index 8192 ['8191 1', '4.27 GFLOPS']
simdmin-intrinsics2 8192 ['8191 1', '1.65 GFLOPS']
simdmin-intrinsics-ilp 8192 ['8191 1', '1.74 GFLOPS']
simdmin-intrinsics-ilp3 8192 ['8191 1', '2.70 GFLOPS']
combined 8192 ['8191 1', '18.29 GFLOPS']


In [13]:
bench('combined', n=2**13)

combined 8192 ['1270 100669', '20.21 GFLOPS']


20.21

In [12]:
bench('simdmin-intrinsics-ilp3', n=2**13, dec=True)

simdmin-intrinsics-ilp3 8192 ['8191 1', '2.70 GFLOPS']


2.7

In [25]:
# Full sweep: for every kernel, benchmark each size in `ns` twice — first as-is, then
# compiled with -D DEC — and keep the scores in two dicts keyed by kernel name.
# NOTE(review): the output recorded for this cell prints the same first line
# ('35545 3722') for n=1024 that single runs elsewhere in the notebook print for
# n=65536, so N may not have been applied when this sweep was captured — confirm
# before relying on these numbers.
results, results_dec = {}, {}
for source in ('std', 'simple', 'cmov', 'hint', 'index', 'simdmin-single', 'simdmin', 'simdmin-testz'):
    results[source] = [bench(source, n=size) for size in ns]
    results_dec[source] = [bench(source, n=size, dec=True) for size in ns]

std 1024 ['35545 3722', '0.00023086 x 32768', '0.28 GFLOPS']
std 2048 ['35545 3722', '0.00023119 x 32768', '0.28 GFLOPS']
std 4096 ['35545 3722', '0.00023115 x 32768', '0.28 GFLOPS']
std 8192 ['35545 3722', '0.00023077 x 32768', '0.28 GFLOPS']
std 16384 ['35545 3722', '0.00023088 x 32768', '0.28 GFLOPS']
std 32768 ['35545 3722', '0.00023105 x 32768', '0.28 GFLOPS']
std 65536 ['35545 3722', '0.00023095 x 32768', '0.28 GFLOPS']
std 131072 ['35545 3722', '0.00023054 x 32768', '0.28 GFLOPS']
std 262144 ['35545 3722', '0.00023059 x 32768', '0.28 GFLOPS']
std 524288 ['35545 3722', '0.00023129 x 32768', '0.28 GFLOPS']
std 1048576 ['35545 3722', '0.00023096 x 32768', '0.28 GFLOPS']
std 1024 ['65535 1', '0.00023060 x 32768', '0.28 GFLOPS']
std 2048 ['65535 1', '0.00023070 x 32768', '0.28 GFLOPS']
std 4096 ['65535 1', '0.00023063 x 32768', '0.28 GFLOPS']
std 8192 ['65535 1', '0.00023118 x 32768', '0.28 GFLOPS']
std 16384 ['65535 1', '0.00023055 x 32768', '0.28 GFLOPS']
std 32768 ['65535 1', '0.0

In [26]:
import pickle

# Persist both sweeps to results.pkl as a two-element list [results, results_dec].
# The handle is deliberately not named `f`: that name belongs to the harmonic-sum
# helper defined earlier in this notebook, and rebinding it here would break any
# later call to f(n) when the notebook is run top to bottom.
with open('results.pkl', 'wb') as results_file:
    pickle.dump([results, results_dec], results_file)

In [38]:
bench('simdmin-intrinsics')

simdmin-intrinsics 65536 ['35545 3722', '0.00000679 x 32768', '9.65 GFLOPS']


9.65

In [40]:
# Best score per kernel: plain build first, then the -D DEC build.
for source, plain_scores in results.items():
    print(source, max(plain_scores), max(results_dec[source]))

std 0.28 0.28
simple 1.58 1.94
cmov 1.44 1.94
hint 2.26 1.5
index 4.38 4.38
simdmin-single 9.36 0.54
simdmin 14.65 1.41
simdmin-testz 13.59 1.41


In [148]:
bench('index', n=2**16)

index 65536 ['35545 3722', '4.19 GFLOPS']


4.19

In [86]:
bench('simple', n=2**12)

simple 4096 ['1270 100669', '1.54 GFLOPS']


1.54

In [83]:
bench('hint', n=2**16)

hint 65536 ['35545 3722', '2.61 GFLOPS']


2.61

In [95]:
bench('puremin', n=2**12)

puremin 4096 ['100669 -1', '0.08 GFLOPS']


0.08

In [115]:
[bench('simdmin-intrinsics-ilp', n=n) for n in [2**12, 2**13, 2**14, 2**15, 2**16]]

simdmin-intrinsics-ilp 4096 ['1270 100669', '10.85 GFLOPS']
simdmin-intrinsics-ilp 8192 ['1270 100669', '11.93 GFLOPS']
simdmin-intrinsics-ilp 16384 ['1270 100669', '13.00 GFLOPS']
simdmin-intrinsics-ilp 32768 ['27165 56172', '14.00 GFLOPS']
simdmin-intrinsics-ilp 65536 ['35545 3722', '13.36 GFLOPS']


[10.85, 11.93, 13.0, 14.0, 13.36]

In [117]:
[bench('simdmin-intrinsics-ilp2', n=n) for n in [2**12, 2**13, 2**14, 2**15, 2**16]]

simdmin-intrinsics-ilp2 4096 ['1270 100669', '12.21 GFLOPS']
simdmin-intrinsics-ilp2 8192 ['1270 100669', '14.57 GFLOPS']
simdmin-intrinsics-ilp2 16384 ['1270 100669', '13.81 GFLOPS']
simdmin-intrinsics-ilp2 32768 ['27165 56172', '13.87 GFLOPS']
simdmin-intrinsics-ilp2 65536 ['35545 3722', '14.36 GFLOPS']


[12.21, 14.57, 13.81, 13.87, 14.36]

In [184]:
# NOTE(review): the output recorded for this run ('0 1804289383', 150.90 GFLOPS) does not
# match the '1270 100669' answer that the other n=2**13 runs in this notebook report
# (including an earlier run of this same kernel), so this measurement looks invalid —
# presumably the kernel was broken when the cell was executed; confirm before citing it.
bench('simdmin-intrinsics-ilp2', n=2**13)

simdmin-intrinsics-ilp2 8192 ['0 1804289383', '150.90 GFLOPS']


150.9

In [195]:
bench('simdmin-intrinsics-ilp3', n=2**13)

simdmin-intrinsics-ilp3 8192 ['1270 100669', '22.41 GFLOPS']


22.41

In [201]:
bench('simdmin-intrinsics-ilp3', n=2**13)

simdmin-intrinsics-ilp3 8192 ['1270 100669', '21.88 GFLOPS']


21.88

In [197]:
bench('simdmin-intrinsics-ilp4', n=2**13)

simdmin-intrinsics-ilp4 8192 ['1270 100669', '19.89 GFLOPS']


19.89

In [199]:
bench('simdmin-intrinsics-ilp5', n=2**13)

simdmin-intrinsics-ilp5 8192 ['1270 100669', '21.15 GFLOPS']


21.15

In [218]:
# Run the ilp3 kernel twice at each size from 2**12 to 2**17 to see how much
# two back-to-back measurements differ.
for n in [2**k for k in range(12, 18)]:
    x = bench('simdmin-intrinsics-ilp3', n=n)
    y = bench('simdmin-intrinsics-ilp3', n=n)
    print(n, x, y)

simdmin-intrinsics-ilp3 4096 ['1270 100669', '17.66 GFLOPS']
simdmin-intrinsics-ilp3 4096 ['1270 100669', '17.68 GFLOPS']
4096 17.66 17.68
simdmin-intrinsics-ilp3 8192 ['1270 100669', '21.70 GFLOPS']
simdmin-intrinsics-ilp3 8192 ['1270 100669', '20.63 GFLOPS']
8192 21.7 20.63
simdmin-intrinsics-ilp3 16384 ['1270 100669', '14.52 GFLOPS']
simdmin-intrinsics-ilp3 16384 ['1270 100669', '14.46 GFLOPS']
16384 14.52 14.46
simdmin-intrinsics-ilp3 32768 ['27165 56172', '14.79 GFLOPS']
simdmin-intrinsics-ilp3 32768 ['27165 56172', '14.70 GFLOPS']
32768 14.79 14.7
simdmin-intrinsics-ilp3 65536 ['35545 3722', '15.02 GFLOPS']
simdmin-intrinsics-ilp3 65536 ['35545 3722', '14.43 GFLOPS']
65536 15.02 14.43
simdmin-intrinsics-ilp3 131072 ['35545 3722', '12.69 GFLOPS']
simdmin-intrinsics-ilp3 131072 ['35545 3722', '12.64 GFLOPS']
131072 12.69 12.64


In [221]:
bench('combined', n=2**13, dec=True)

combined 8192 ['8191 1', '18.56 GFLOPS']


18.56

In [202]:
bench('puremin', n=2**13)

puremin 8192 ['100669 -1', '24.25 GFLOPS']


24.25

In [11]:
[bench('simdmin', n=n) for n in ns]

simdmin 1024 ['35545 3722', '0.00000473 x 262144', '13.84 GFLOPS']
simdmin 2048 ['35545 3722', '0.00000450 x 262144', '14.55 GFLOPS']
simdmin 4096 ['35545 3722', '0.00000454 x 262144', '14.45 GFLOPS']
simdmin 8192 ['35545 3722', '0.00000447 x 262144', '14.65 GFLOPS']
simdmin 16384 ['35545 3722', '0.00000486 x 262144', '13.49 GFLOPS']
simdmin 32768 ['35545 3722', '0.00000451 x 262144', '14.53 GFLOPS']
simdmin 65536 ['35545 3722', '0.00000480 x 262144', '13.65 GFLOPS']
simdmin 131072 ['35545 3722', '0.00000447 x 262144', '14.66 GFLOPS']
simdmin 262144 ['35545 3722', '0.00000450 x 262144', '14.56 GFLOPS']
simdmin 524288 ['35545 3722', '0.00000478 x 262144', '13.71 GFLOPS']
simdmin 1048576 ['35545 3722', '0.00000480 x 262144', '13.66 GFLOPS']


[13.84, 14.55, 14.45, 14.65, 13.49, 14.53, 13.65, 14.66, 14.56, 13.71, 13.66]

In [96]:
bench('index')

index 65536 ['35545 3722', '4.21 GFLOPS']


4.21

In [8]:
bench('simple')

simple 65536 ['35545 3722', '0.00004134 x 262144', '1.59 GFLOPS']


1.59

In [9]:
bench('simdmin-testz')

simdmin-testz 65536 ['35545 3722', '0.00000514 x 262144', '12.75 GFLOPS']


12.75

In [None]:
def plot(title=None, ylabel=None, path=None, ch=None, legend=False, ylim=None, xlabel='Array size', loc='best'):
    """Decorate, optionally save, and show the current matplotlib figure.

    Works on pyplot's current figure/axes, so the data must already have been
    plotted before this is called.

    Parameters
    ----------
    title : str, optional
        Figure title; skipped when falsy.
    ylabel : str, optional
        Y-axis label; skipped when falsy.
    path : str, optional
        When given, the figure is saved to this path before being shown.
    ch : float, optional
        When truthy, dashed vertical lines are drawn at x = 2**13, 2**17 and
        2**20 with the labels "32K", "512K" and "4M" placed at this y value.
        (Presumably these mark cache capacities for 4-byte elements — confirm.)
    legend : bool
        Whether to draw a legend.
    ylim : float, optional
        Upper y-axis limit; ``None`` leaves it automatic. The lower limit is always 0.
    xlabel : str
        X-axis label.
    loc : str
        Legend location, used only when ``legend`` is true.
    """
    if ch:
        lines = [
            (2**13, "32K"),
            (2**17, "512K"),
            (2**20, "4M"),
        ]

        for x, t in lines:
            # Nudge the label slightly to the right of its dashed line.
            plt.text(x * 1.2, ch, t)
            plt.axvline(x=x, color='black', linestyle='--')

    # `basex` was renamed to `base` in Matplotlib 3.3 and the old keyword was
    # removed in 3.5, where passing it raises an error.
    plt.xscale('log', base=2)
    plt.xlabel(xlabel)

    if ylabel:
        plt.ylabel(ylabel)

    if title:
        plt.title(title, pad=12)

    if legend:
        plt.legend(loc=loc)

    plt.ylim(bottom=0, top=ylim)
    plt.margins(0)

    if path:
        fig = plt.gcf()
        fig.savefig(path)
    plt.show()


#plt.plot(ns, inc_res, color='darkred')
#plot('for (int i = 0; i < n; i++) a[i]++', 'Increments per second ($10^9$)', 'inc.svg', 4.7)