Purposes:
- speed test to find bottlenecks and further opportunities for optimization
- apply the lessons from Mike Muller's 2019 PyCon talk: https://www.youtube.com/watch?v=EcGWDNlGTNg
    

In [1]:
import cProfile
import os
import sys
import time
import timeit

In [None]:
from candidates import current_hamiltonian, current_simsignals, candidate_hamiltonian, candidate_simsignals

In [2]:
from nmrtools.qm import hamiltonian_dense, hamiltonian_sparse, hs2, nspinspec_dense, nspinspec_sparse, nss2

In [3]:
from simulation_data import spin11, spin8, rioux

In [None]:
def old_h():
    """Return current_hamiltonian applied to the values produced by spin11()."""
    spin11_args = spin11()
    return current_hamiltonian(*spin11_args)

def new_h():
    """Return candidate_hamiltonian applied to the values produced by spin11()."""
    spin11_args = spin11()
    return candidate_hamiltonian(*spin11_args)

In [None]:
# Build both spin11 Hamiltonians once and keep them as notebook globals;
# the old_s()/new_s() wrappers that take a Hamiltonian read these names.
old11_h = old_h()
new11_h = new_h()

In [None]:
def old_s():
    """Return current_simsignals run on the precomputed global old11_h.

    The second argument is the literal 11 (presumably the spin count of the
    spin11 system -- confirm against current_simsignals).
    """
    hamiltonian = old11_h
    return current_simsignals(hamiltonian, 11)

def new_s():
    """Return candidate_simsignals run on the precomputed global new11_h.

    The second argument is the literal 11 (presumably the spin count of the
    spin11 system -- confirm against candidate_simsignals).
    """
    hamiltonian = new11_h
    return candidate_simsignals(hamiltonian, 11)

In [4]:
# Data source shared by every wrapper in this cell.  The wrappers look `f` up
# when they are called, so rebinding it (e.g. to spin8 or rioux) re-targets
# all of them without redefining the functions.
f = spin11
def old_h():
    """Return hamiltonian_dense applied to the values produced by f()."""
    system_args = f()
    return hamiltonian_dense(*system_args)
def new_h():
    """Return hamiltonian_sparse applied to the values produced by f()."""
    system_args = f()
    return hamiltonian_sparse(*system_args)
def newer_h():
    """Return hs2 applied to the values produced by f()."""
    system_args = f()
    return hs2(*system_args)
def old_s():
    """Return nspinspec_dense applied to the values produced by f()."""
    system_args = f()
    return nspinspec_dense(*system_args)
def new_s():
    """Return nspinspec_sparse applied to the values produced by f()."""
    system_args = f()
    return nspinspec_sparse(*system_args)
def newer_s():
    """Return nss2 applied to the values produced by f()."""
    system_args = f()
    return nss2(*system_args)

In [18]:
# Report which container type each Hamiltonian builder returns for the rioux
# system (the recorded output shows two numpy arrays and one sparse COO array).
print(type(hamiltonian_dense(*rioux())))
print(type(hamiltonian_sparse(*rioux())))
print(type(hs2(*rioux())))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'sparse.coo.core.COO'>


Use %timeit for a 1-liner, and %%timeit for multiple lines

In [None]:
%%timeit
old_h()

In [5]:
%%timeit
new_h()

174 ms ± 3.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
%%timeit
newer_h()

125 ms ± 2.49 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
%%timeit
old_s()

In [7]:
%%timeit
new_s()

3.56 s ± 56.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
%%timeit
newer_s()

3.35 s ± 200 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
# Create a profiler object that is driven by hand in the following cells.
profiler = cProfile.Profile()

In [10]:
# Profile a single call of new_h.  As the cell's last expression, the value
# of this call is what the notebook displays (the Hamiltonian array).
profiler.runcall(new_h)

array([[ 5681.875+0.j,     0.   +0.j,     0.   +0.j, ...,     0.   +0.j,
            0.   +0.j,     0.   +0.j],
       [    0.   +0.j,  3389.125+0.j,     0.   +0.j, ...,     0.   +0.j,
            0.   +0.j,     0.   +0.j],
       [    0.   +0.j,     0.   +0.j,  4677.125+0.j, ...,     0.   +0.j,
            0.   +0.j,     0.   +0.j],
       ...,
       [    0.   +0.j,     0.   +0.j,     0.   +0.j, ..., -4670.875+0.j,
            0.   +0.j,     0.   +0.j],
       [    0.   +0.j,     0.   +0.j,     0.   +0.j, ...,     0.   +0.j,
        -3390.875+0.j,     0.   +0.j],
       [    0.   +0.j,     0.   +0.j,     0.   +0.j, ...,     0.   +0.j,
            0.   +0.j, -5666.125+0.j]])

In [11]:
# Dump the collected statistics; with no sort key given, the report is
# ordered by "standard name" (see the header of the recorded output).
profiler.print_stats()

         6854 function calls (6787 primitive calls) in 0.189 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
       20    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:1009(_handle_fromlist)
       12    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:416(parent)
        1    0.002    0.002    0.189    0.189 <ipython-input-4-6fe084f15d15>:4(new_h)
      158    0.000    0.000    0.000    0.000 <string>:1(__new__)
        4    0.000    0.000    0.000    0.000 _dtype.py:319(_name_get)
        2    0.000    0.000    0.000    0.000 _methods.py:45(_all)
       55    0.000    0.000    0.000    0.000 abc.py:137(__instancecheck__)
       30    0.000    0.000    0.000    0.000 abc.py:141(__subclasscheck__)
        4    0.000    0.000    0.000    0.000 arraysetops.py:138(_unpack_tuple)
        4    0.000    0.000    0.000    0.000 arraysetops.py:151(unique)
        4    0.000    0.000    0.000    0.000 

In [None]:
# Same create / run / report sequence as the three cells above, in one cell
# and with a fresh profiler object.
profiler2 = cProfile.Profile()
profiler2.runcall(new_h)
profiler2.print_stats()

You can save your results, then view them later with pstats.

In [None]:
# Profile each Hamiltonian wrapper; passing a filename as the second argument
# writes the statistics to that file so they can be loaded later with pstats.
cProfile.run('old_h()', 'old_h.stats')
cProfile.run('new_h()', 'new_h.stats')

In [None]:
# Profile each spectrum wrapper and save the statistics to old_s.stats and
# new_s.stats for later inspection with pstats.
cProfile.run('old_s()', 'old_s.stats')
cProfile.run('new_s()', 'new_s.stats')

Update 2018-05-18: vectorized_simsignals much improved! e.g. of 3.49 s on spin 11, 3.39s spent on intensity_and_energy (the calculation of I and E) and only 0.083s on the conversion to a spectrum!
Right now, Hamiltonian is not the bottleneck, and is as fast as it's going to get (for now).
In the new simsignals, the eigh is definitely the bottleneck (e.g. 2.8 out of 4.5 s) so probably can't be improved on much. However, of that ~4.5 s, ~0.86 s occurs within simsignals, so presumably in the loop. Can this be vectorized?

In [None]:
import pstats

In [None]:
# Load the profile written by cProfile.run('new_s()', 'new_s.stats').
stats = pstats.Stats('new_s.stats')

In [None]:
# List what the profiled functions selected by the 'dot' restriction call.
# NOTE(review): pstats treats a string restriction as a pattern matched
# against the function entries -- confirm it selects only the intended ones.
stats.print_callees('dot')

Easier in jupyter to do this with prun.
-l 12 limits to 12 lines

In [14]:
%%prun -l 12
new_s()

 

         9474 function calls (9377 primitive calls) in 3.282 seconds

   Ordered by: internal time
   List reduced from 232 to 12 due to restriction <12>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    2.662    2.662    2.662    2.662 linalg.py:1324(eigh)
        1    0.177    0.177    0.177    0.177 {method 'dot' of 'numpy.ndarray' objects}
        1    0.137    0.137    0.137    0.137 common.py:1273(_dot_coo_ndarray)
        2    0.040    0.020    0.040    0.020 common.py:1313(_dot_ndarray_coo)
        1    0.033    0.033    0.072    0.072 qm.py:312(new_compile_spectrum)
        2    0.022    0.011    0.022    0.011 {built-in method numpy.where}
        2    0.020    0.010    0.020    0.010 {method 'argsort' of 'numpy.ndarray' objects}
        3    0.019    0.006    0.025    0.008 core.py:1503(reshape)
        1    0.018    0.018    3.083    3.083 qm.py:322(vectorized_simsignals)
        1    0.017    0.017    0.175    0.175 qm.py:134(hamiltonian

In [15]:
%%prun -l 12
newer_s()

 

         11253 function calls (11151 primitive calls) in 3.161 seconds

   Ordered by: internal time
   List reduced from 302 to 12 due to restriction <12>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    2.554    2.554    2.554    2.554 linalg.py:1324(eigh)
        1    0.186    0.186    0.186    0.186 {method 'dot' of 'numpy.ndarray' objects}
        1    0.146    0.146    0.146    0.146 common.py:1273(_dot_coo_ndarray)
        1    0.033    0.033    0.073    0.073 qm.py:312(new_compile_spectrum)
        4    0.022    0.006    0.022    0.006 {built-in method numpy.where}
        9    0.020    0.002    0.020    0.002 {method 'argsort' of 'numpy.ndarray' objects}
        7    0.019    0.003    0.026    0.004 core.py:1503(reshape)
        3    0.018    0.006    0.018    0.006 {built-in method numpy.copyto}
        1    0.018    0.018    2.994    2.994 qm.py:322(vectorized_simsignals)
      122    0.017    0.000    0.017    0.000 {built-in method zlib

In [None]:
stats_new_s = %prun -r new_s()  # -r returns the pstats object

In [None]:
stats_new_s.print_stats()

In [None]:
%prun -T stats_new_s.txt new_s()  # -T saves results to file

In [None]:
%less stats_new_s.txt

In [None]:
%prun -D stats_new_s.stats new_s()  # saves as binary instead (I think)

In [None]:
def time_new_s():
    """Time one call of new_s() with several clocks and print each duration.

    Every clock is sampled before the call and again afterwards, in the same
    order, so the printed durations can be compared side by side:

    - ``os.times()[0]``: user CPU time of the current process
    - ``time.clock()``: legacy clock, sampled only when the running Python
      still provides it (it was removed in Python 3.8)
    - ``timeit.default_timer()``
    - ``time.perf_counter()``
    - ``time.process_time()``

    Returns nothing; the results are written with print().
    """
    # time.clock() was deprecated in Python 3.3 and removed in 3.8.  Look it
    # up defensively so this comparison still runs on a modern interpreter
    # instead of failing with AttributeError.
    legacy_clock = getattr(time, 'clock', None)

    start_os_time0 = os.times()[0]
    if legacy_clock is not None:
        start_time_clock = legacy_clock()
    start_default_timer = timeit.default_timer()
    start_perf = time.perf_counter()
    start_processtime = time.process_time()

    new_s()

    duration_os_time0 = os.times()[0] - start_os_time0
    if legacy_clock is not None:
        duration_time_clock = legacy_clock() - start_time_clock
    else:
        # Keep the output line so the report layout stays the same.
        duration_time_clock = 'n/a (time.clock was removed in Python 3.8)'
    duration_default_timer = timeit.default_timer() - start_default_timer
    duration_perf = time.perf_counter() - start_perf
    duration_processtime = time.process_time() - start_processtime

    print('os: ', duration_os_time0)
    print('time_clock: ', duration_time_clock)
    print('default_timer: ', duration_default_timer)
    print('perf: ', duration_perf)
    print('processtime: ', duration_processtime)

In [None]:
# Run the clock comparison once; the durations are printed, not returned.
time_new_s()

Muller's recommendation is to use default_timer, which abstracts OS differences away. The behavior is very different between Windows and Mac.

In [None]:
%load_ext snakeviz

In [None]:
time.process_time?

In [None]:
%snakeviz new_s()

In [None]:
# %load candidates.py
"""Collection of the current best candidates for nmrtools functions,
for testing speed etc.

"""


from nmrtools.nmrmath import hamiltonian as current_hamiltonian
from nmrtools.nmrmath import simsignals as current_simsignals

from speedtest.compare_hamiltonians import hamiltonian_sparse as \
    candidate_hamiltonian
from tests.test_simsignals import newer_simsignals as candidate_simsignals


if __name__ == '__main__':
    # Sanity check: run the current and the candidate implementations on the
    # rioux() system and confirm that both produce matching spectra.
    import numpy as np
    from simulation_data import rioux
    # NOTE(review): the literal 3 passed to the simsignals functions is
    # presumably the number of spins in the rioux system -- confirm.
    current_h = current_hamiltonian(*rioux())
    current_spectrum = current_simsignals(current_h, 3)
    candidate_h = candidate_hamiltonian(*rioux())
    candidate_spectrum = candidate_simsignals(candidate_h, 3)

    # Show the first ten entries of each result for a quick visual comparison.
    print(current_spectrum[:10])
    print(candidate_spectrum[:10])
    # Tolerance-based comparison: fails if the candidate's output differs
    # numerically from the current implementation's.
    assert np.allclose(current_spectrum, candidate_spectrum)


In [None]:
%load_ext line_profiler

In [None]:
%lprun -f candidate_simsignals candidate_simsignals(new11_h, 11)

Update 2018-05-18: new vectorization reduces new_compile_spectrum (vectorization of loop) to only 3% of time! Eigh is the clear bottleneck.
kernprof indicates that eigen is bottleneck, but the for loop is not insignificant. Perhaps this can be vectorized at some point?

In [None]:
from tests.test_simsignals import intensity_and_energy

In [None]:
%lprun -f intensity_and_energy candidate_simsignals(new11_h, 11)

In intensity_and_energy, eigh is 90% of the time, and the matrix multiplication 9.7%.