Purposes:
- speed test to find bottlenecks and further opportunities for optimization
- apply the lessons from Mike Muller's 2019 PyCon talk: https://www.youtube.com/watch?v=EcGWDNlGTNg
    

In [1]:
import cProfile
import os
import sys
import time
import timeit

In [None]:
from candidates import current_hamiltonian, current_simsignals, candidate_hamiltonian, candidate_simsignals

In [3]:
from nmrtools.qm import hamiltonian_dense, hamiltonian_sparse, nspinspec_dense, nspinspec_sparse

In [4]:
from simulation_data import spin11, spin8, rioux

In [None]:
def old_h():
    """Call current_hamiltonian on the inputs produced by spin11()."""
    spin11_args = spin11()
    return current_hamiltonian(*spin11_args)

def new_h():
    """Call candidate_hamiltonian on the inputs produced by spin11()."""
    spin11_args = spin11()
    return candidate_hamiltonian(*spin11_args)

In [None]:
# Compute each Hamiltonian once and keep it, so that the simsignals wrappers
# (old_s / new_s) can reuse it without rebuilding it on every call.
old11_h = old_h()
new11_h = new_h()

In [None]:
def old_s():
    """Call current_simsignals on the precomputed old11_h, with 11 as the second argument."""
    call_args = (old11_h, 11)
    return current_simsignals(*call_args)

def new_s():
    """Call candidate_simsignals on the precomputed new11_h, with 11 as the second argument."""
    call_args = (new11_h, 11)
    return candidate_simsignals(*call_args)

In [16]:
# Input factory shared by the four wrappers below; it is looked up at call
# time, so rebinding `f` changes what every wrapper runs on.
# NOTE: these definitions replace any old_h / new_h / old_s / new_s that were
# defined by earlier cells.
f = spin11
def old_h():
    """Call hamiltonian_dense on the inputs produced by f()."""
    inputs = f()
    return hamiltonian_dense(*inputs)
def new_h():
    """Call hamiltonian_sparse on the inputs produced by f()."""
    inputs = f()
    return hamiltonian_sparse(*inputs)

def old_s():
    """Call nspinspec_dense on the inputs produced by f()."""
    inputs = f()
    return nspinspec_dense(*inputs)
def new_s():
    """Call nspinspec_sparse on the inputs produced by f()."""
    inputs = f()
    return nspinspec_sparse(*inputs)


In [None]:
# Show the return type of each Hamiltonian builder when called on the rioux() inputs.
print(type(hamiltonian_dense(*rioux())))
print(type(hamiltonian_sparse(*rioux())))
# NOTE(review): hs2 is not defined or imported in any cell visible here; on a
# fresh kernel this line raises NameError unless it is defined elsewhere — confirm or remove.
print(type(hs2(*rioux())))

Use %timeit for a 1-liner, and %%timeit for multiple lines

In [None]:
%%timeit
# NOTE(review): old_h is defined in two different cells of this notebook; this
# times whichever definition was executed most recently.
old_h()

In [13]:
%%timeit
# NOTE(review): new_h is defined in two different cells of this notebook; this
# times whichever definition was executed most recently.
new_h()

12 ms ± 423 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
%%timeit
# NOTE(review): old_s is defined in two different cells of this notebook; this
# times whichever definition was executed most recently.
old_s()

341 ms ± 20.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
%%timeit
# NOTE(review): new_s is defined in two different cells of this notebook; this
# times whichever definition was executed most recently.
new_s()

30.1 ms ± 1.65 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
# Create a profiler object; it collects data across the runcall() invocations made on it.
profiler = cProfile.Profile()

In [None]:
# Profile one call to new_h. runcall() passes back new_h's return value, which
# becomes this cell's output.
profiler.runcall(new_h)

In [None]:
# Print the statistics collected by `profiler` so far.
profiler.print_stats()

In [None]:
# Create a fresh profiler, profile a single call to new_h, and print the
# collected statistics, all in one cell.
profiler2 = cProfile.Profile()
profiler2.runcall(new_h)
profiler2.print_stats()

You can save your results to a file, then view them later with pstats.

In [None]:
# Profile each Hamiltonian wrapper and save the statistics to a file
# ('old_h.stats' / 'new_h.stats') instead of printing them.
cProfile.run('old_h()', 'old_h.stats')
cProfile.run('new_h()', 'new_h.stats')

In [None]:
# Profile each spectrum wrapper and save the statistics to a file
# ('old_s.stats' / 'new_s.stats') instead of printing them.
cProfile.run('old_s()', 'old_s.stats')
cProfile.run('new_s()', 'new_s.stats')

Update 2018-05-18: vectorized_simsignals is much improved! For example, of the 3.49 s total on spin11, 3.39 s was spent in intensity_and_energy (the calculation of I and E) and only 0.083 s on the conversion to a spectrum!
Right now, Hamiltonian is not the bottleneck, and is as fast as it's going to get (for now).
In the new simsignals, the eigh is definitely the bottleneck (e.g. 2.8 out of 4.5 s) so probably can't be improved on much. However, of that ~4.5 s, ~0.86 s occurs within simsignals, so presumably in the loop. Can this be vectorized?

In [None]:
import pstats

In [None]:
# Load the profile previously saved to 'new_s.stats'.
stats = pstats.Stats('new_s.stats')

In [None]:
# List what is called by the profiled functions whose names match 'dot'.
stats.print_callees('dot')

In Jupyter it is easier to do this with the %prun (line) and %%prun (cell) magics.
The option -l 12 limits the report to 12 lines.

In [None]:
%%prun -l 12
# Profile a single call to new_s(); -l 12 limits the printed report to 12 lines.
new_s()

In [None]:
%%prun -l 12
# NOTE(review): newer_s is not defined or imported in any cell visible here —
# confirm it exists (or replace it with new_s) before running on a fresh kernel.
newer_s()

In [None]:
stats_new_s = %prun -r new_s()  # -r returns the pstats object

In [None]:
stats_new_s.print_stats()

In [None]:
%prun -T stats_new_s.txt new_s()  # -T saves results to file

In [None]:
%less stats_new_s.txt

In [None]:
%prun -D stats_new_s.stats new_s()  # -D dumps the stats to a binary file that pstats.Stats can load

In [None]:
def time_new_s():
    """Time one call to new_s() with several clocks and print each duration.

    The clocks compared are:

    - os.times()[0]: user CPU time of the current process
    - time.clock(): legacy clock, only where the interpreter still has it
    - timeit.default_timer()
    - time.perf_counter()
    - time.process_time()

    time.clock() was deprecated in Python 3.3 and removed in Python 3.8.
    It is therefore sampled only when available; otherwise the 'time_clock'
    line reports that the clock is unavailable, instead of the whole
    function failing with AttributeError.

    Returns None; the results are printed.
    """
    # None on Python >= 3.8, where time.clock no longer exists.
    legacy_clock = getattr(time, 'clock', None)

    # Take the start readings in a fixed order, immediately before the call.
    start_os_time0 = os.times()[0]
    start_time_clock = legacy_clock() if legacy_clock is not None else None
    start_default_timer = timeit.default_timer()
    start_perf = time.perf_counter()
    start_processtime = time.process_time()

    new_s()

    # Take the end readings in the same order as the start readings.
    duration_os_time0 = os.times()[0] - start_os_time0
    if legacy_clock is not None:
        duration_time_clock = legacy_clock() - start_time_clock
    else:
        duration_time_clock = 'unavailable (time.clock was removed in Python 3.8)'
    duration_default_timer = timeit.default_timer() - start_default_timer
    duration_perf = time.perf_counter() - start_perf
    duration_processtime = time.process_time() - start_processtime

    print('os: ', duration_os_time0)
    print('time_clock: ', duration_time_clock)
    print('default_timer: ', duration_default_timer)
    print('perf: ', duration_perf)
    print('processtime: ', duration_processtime)

In [None]:
time_new_s()

Muller's recommendation is to use timeit.default_timer, which abstracts OS differences away. Behavior is very different between Windows and Mac.

In [None]:
%load_ext snakeviz

In [None]:
time.process_time?

In [None]:
%snakeviz new_s()

In [None]:
# %load candidates.py
"""Collection of the current best candidates for nmrtools functions,
for testing speed etc.

"""


from nmrtools.nmrmath import hamiltonian as current_hamiltonian
from nmrtools.nmrmath import simsignals as current_simsignals

from speedtest.compare_hamiltonians import hamiltonian_sparse as \
    candidate_hamiltonian
from tests.test_simsignals import newer_simsignals as candidate_simsignals


if __name__ == '__main__':
    # Sanity check: run the current and candidate implementations on the
    # rioux() inputs and confirm that their results agree numerically.
    import numpy as np
    from simulation_data import rioux
    current_h = current_hamiltonian(*rioux())
    # NOTE(review): the 3 is presumably the number of spins in the rioux system — confirm.
    current_spectrum = current_simsignals(current_h, 3)
    candidate_h = candidate_hamiltonian(*rioux())
    candidate_spectrum = candidate_simsignals(candidate_h, 3)

    # Print the first ten entries of each result for a visual comparison.
    print(current_spectrum[:10])
    print(candidate_spectrum[:10])
    # Fails with AssertionError if the two results differ beyond np.allclose's
    # default tolerances.
    assert np.allclose(current_spectrum, candidate_spectrum)


In [None]:
%load_ext line_profiler

In [None]:
# Line-by-line timings of candidate_simsignals while running it on new11_h.
%lprun -f candidate_simsignals candidate_simsignals(new11_h, 11)

Update 2018-05-18: new vectorization reduces new_compile_spectrum (vectorization of loop) to only 3% of time! Eigh is the clear bottleneck.
kernprof indicates that eigen is bottleneck, but the for loop is not insignificant. Perhaps this can be vectorized at some point?

In [None]:
from tests.test_simsignals import intensity_and_energy

In [None]:
# Same call as the earlier %lprun cell, but the line-by-line timings are
# collected for intensity_and_energy instead.
%lprun -f intensity_and_energy candidate_simsignals(new11_h, 11)

In intensity_and_energy, eigh is 90% of the time, and the matrix multiplication 9.7%.