Purposes:
- speed test to find bottlenecks and further opportunities for optimization
- apply the lessons from Mike Muller's 2019 PyCon talk: https://www.youtube.com/watch?v=EcGWDNlGTNg
    

In [37]:
import cProfile
import os
import sys
import time
import timeit

In [2]:
from candidates import current_hamiltonian, current_simsignals, candidate_hamiltonian, candidate_simsignals

Import type:  <class 'numpy.ndarray'>


In [3]:
from simulation_data import spin11

In [4]:
def old_h():
    """Build a Hamiltonian with the current nmrtools implementation.

    The arguments come from a fresh call to `spin11()`, unpacked
    positionally into `current_hamiltonian`.
    """
    spin_system = spin11()
    return current_hamiltonian(*spin_system)

def new_h():
    """Build a Hamiltonian with the candidate (sparse) implementation.

    The arguments come from a fresh call to `spin11()`, unpacked
    positionally into `candidate_hamiltonian`.
    """
    spin_system = spin11()
    return candidate_hamiltonian(*spin_system)

In [5]:
# Build each Hamiltonian once up front; old_s()/new_s() reuse these objects,
# so their timings cover only the simsignals call and not Hamiltonian
# construction.
old11_h = old_h()
new11_h = new_h()

In [6]:
def old_s():
    """Run the current simsignals on the precomputed `old11_h` Hamiltonian.

    Passes 11 as the second argument (presumably the number of spins).
    """
    spin_count = 11
    return current_simsignals(old11_h, spin_count)

def new_s():
    """Run the candidate simsignals on the precomputed `new11_h` Hamiltonian.

    Passes 11 as the second argument (the candidate's `nspins` parameter).
    """
    spin_count = 11
    return candidate_simsignals(new11_h, spin_count)

In [7]:
%%timeit
old_h()

1.49 s ± 96.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
%%timeit
new_h()

205 ms ± 8.22 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
%%timeit
old_s()

18.5 s ± 131 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%%timeit
new_s()

4.38 s ± 110 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
profiler = cProfile.Profile()

In [12]:
profiler.runcall(old_h)

matrix([[ 5681.875,     0.   ,     0.   , ...,     0.   ,     0.   ,
             0.   ],
        [    0.   ,  3389.125,     0.   , ...,     0.   ,     0.   ,
             0.   ],
        [    0.   ,     0.   ,  4677.125, ...,     0.   ,     0.   ,
             0.   ],
        ...,
        [    0.   ,     0.   ,     0.   , ..., -4670.875,     0.   ,
             0.   ],
        [    0.   ,     0.   ,     0.   , ...,     0.   , -3390.875,
             0.   ],
        [    0.   ,     0.   ,     0.   , ...,     0.   ,     0.   ,
         -5666.125]])

In [13]:
profiler.print_stats()

         29614 function calls (29599 primitive calls) in 1.417 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:1009(_handle_fromlist)
        1    0.003    0.003    1.417    1.417 <ipython-input-4-9838e45a9e8b>:1(old_h)
       41    0.000    0.000    0.000    0.000 <string>:1(__new__)
      506    0.000    0.000    0.000    0.000 _util.py:129(_prune_array)
      132    0.000    0.000    0.000    0.000 abc.py:137(__instancecheck__)
        2    0.000    0.000    0.000    0.000 ast.py:30(parse)
        2    0.000    0.000    0.000    0.000 ast.py:38(literal_eval)
     17/2    0.000    0.000    0.000    0.000 ast.py:64(_convert)
      385    0.000    0.000    0.000    0.000 base.py:1190(isspmatrix)
     1012    0.000    0.000    0.001    0.000 base.py:243(nnz)
      132    0.001    0.000    0.912    0.007 base.py:407(__add__)
      132    0.000    0.00

In [14]:
# Profile a single call of the candidate Hamiltonian builder with its own
# Profile object (separate from `profiler`, which recorded old_h), then print
# the collected statistics.
profiler2 = cProfile.Profile()
profiler2.runcall(new_h)
profiler2.print_stats()

         6803 function calls (6736 primitive calls) in 0.236 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
       20    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:1009(_handle_fromlist)
       12    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:416(parent)
        1    0.002    0.002    0.236    0.236 <ipython-input-4-9838e45a9e8b>:4(new_h)
      158    0.000    0.000    0.000    0.000 <string>:1(__new__)
        4    0.000    0.000    0.000    0.000 _dtype.py:319(_name_get)
        2    0.000    0.000    0.000    0.000 _methods.py:45(_all)
       55    0.000    0.000    0.000    0.000 abc.py:137(__instancecheck__)
       30    0.000    0.000    0.000    0.000 abc.py:141(__subclasscheck__)
        4    0.000    0.000    0.000    0.000 arraysetops.py:138(_unpack_tuple)
        4    0.000    0.000    0.000    0.000 arraysetops.py:151(unique)
        4    0.000    0.000    0.000    0.000 

In [15]:
# Profile both Hamiltonian builders and write the raw profile data to
# .stats files (second argument) instead of printing it, so it can be
# loaded later with pstats.
cProfile.run('old_h()', 'old_h.stats')
cProfile.run('new_h()', 'new_h.stats')

In [16]:
# Same as for the Hamiltonians: profile both simsignals wrappers and save
# the raw profile data to .stats files for inspection with pstats.
cProfile.run('old_s()', 'old_s.stats')
cProfile.run('new_s()', 'new_s.stats')

The Hamiltonian is not the bottleneck, and is as fast as it's going to get (for now).
In the new simsignals, the `eigh` call is definitely the bottleneck (e.g. 2.8 out of 4.5 s); since it is a library routine, it probably can't be improved on much. However, of that ~4.5 s, ~0.86 s is spent in simsignals' own code (its tottime), so presumably in the loop. Can this be vectorized?

In [17]:
import pstats

In [19]:
# Load the profile data that cProfile.run('new_s()', 'new_s.stats') saved.
stats = pstats.Stats('new_s.stats')

In [24]:
stats.print_callees('dot')

   Random listing order was used
   List reduced from 187 to 6 due to restriction <'dot'>

Function                                                                                             called...
                                                                                                         ncalls  tottime  cumtime
C:\Users\Geoffrey\Miniconda3\envs\nmr\lib\site-packages\sparse\coo\core.py:1397(dot)                 ->       1    0.000    0.241  C:\Users\Geoffrey\Miniconda3\envs\nmr\lib\site-packages\sparse\coo\common.py:234(dot)
C:\Users\Geoffrey\Miniconda3\envs\nmr\lib\site-packages\sparse\coo\common.py:59(tensordot)           ->       2    0.000    0.000  C:\Users\Geoffrey\Miniconda3\envs\nmr\lib\site-packages\scipy\sparse\base.py:1190(isspmatrix)
                                                                                                              1    0.000    0.000  C:\Users\Geoffrey\Miniconda3\envs\nmr\lib\site-packages\sparse\coo\common.py:135(<listcomp>)
  

<pstats.Stats at 0x234f5ea79e8>

In [29]:
%%prun -l 12
new_s()

 

         24477 function calls (24447 primitive calls) in 4.662 seconds

   Ordered by: internal time
   List reduced from 187 to 12 due to restriction <12>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    3.070    3.070    3.070    3.070 linalg.py:1324(eigh)
        1    0.849    0.849    4.647    4.647 test_simsignals.py:187(newer_simsignals)
        1    0.491    0.491    0.491    0.491 {method 'dot' of 'numpy.ndarray' objects}
        1    0.232    0.232    0.232    0.232 common.py:1273(_dot_coo_ndarray)
        1    0.014    0.014    4.661    4.661 <ipython-input-6-f8a1f1193dd7>:4(new_s)
        1    0.001    0.001    4.662    4.662 <string>:2(<module>)
    11169    0.001    0.000    0.001    0.000 {method 'append' of 'list' objects}
    10976    0.001    0.000    0.001    0.000 {built-in method builtins.abs}
       11    0.001    0.000    0.001    0.000 {method 'decompress' of 'zlib.Decompress' objects}
       19    0.000    0.000    0.000    0

In [31]:
stats_new_s = %prun -r new_s() 

 

         24477 function calls (24447 primitive calls) in 4.853 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    3.176    3.176    3.176    3.176 linalg.py:1324(eigh)
        1    0.897    0.897    4.835    4.835 test_simsignals.py:187(newer_simsignals)
        1    0.512    0.512    0.512    0.512 {method 'dot' of 'numpy.ndarray' objects}
        1    0.244    0.244    0.244    0.244 common.py:1273(_dot_coo_ndarray)
        1    0.015    0.015    4.849    4.849 <ipython-input-6-f8a1f1193dd7>:4(new_s)
        1    0.003    0.003    4.853    4.853 <string>:1(<module>)
    10976    0.001    0.000    0.001    0.000 {built-in method builtins.abs}
    11169    0.001    0.000    0.001    0.000 {method 'append' of 'list' objects}
       11    0.001    0.000    0.001    0.000 {method 'decompress' of 'zlib.Decompress' objects}
       19    0.000    0.000    0.000    0.000 {built-in method zlib.crc32}
        1    0.000   

In [32]:
stats_new_s.print_stats()

         24477 function calls (24447 primitive calls) in 4.853 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    3.176    3.176    3.176    3.176 linalg.py:1324(eigh)
        1    0.897    0.897    4.835    4.835 test_simsignals.py:187(newer_simsignals)
        1    0.512    0.512    0.512    0.512 {method 'dot' of 'numpy.ndarray' objects}
        1    0.244    0.244    0.244    0.244 common.py:1273(_dot_coo_ndarray)
        1    0.015    0.015    4.849    4.849 <ipython-input-6-f8a1f1193dd7>:4(new_s)
        1    0.003    0.003    4.853    4.853 <string>:1(<module>)
    10976    0.001    0.000    0.001    0.000 {built-in method builtins.abs}
    11169    0.001    0.000    0.001    0.000 {method 'append' of 'list' objects}
       11    0.001    0.000    0.001    0.000 {method 'decompress' of 'zlib.Decompress' objects}
       19    0.000    0.000    0.000    0.000 {built-in method zlib.crc32}
        1    0.000   

<pstats.Stats at 0x234f5e9c710>

In [33]:
%prun -T stats_new_s.txt new_s()

 
*** Profile printout saved to text file 'stats_new_s.txt'. 


         24477 function calls (24447 primitive calls) in 4.572 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    3.064    3.064    3.064    3.064 linalg.py:1324(eigh)
        1    0.853    0.853    4.556    4.556 test_simsignals.py:187(newer_simsignals)
        1    0.401    0.401    0.401    0.401 {method 'dot' of 'numpy.ndarray' objects}
        1    0.233    0.233    0.233    0.233 common.py:1273(_dot_coo_ndarray)
        1    0.014    0.014    4.571    4.571 <ipython-input-6-f8a1f1193dd7>:4(new_s)
        1    0.001    0.001    4.572    4.572 <string>:1(<module>)
    10976    0.001    0.000    0.001    0.000 {built-in method builtins.abs}
    11169    0.001    0.000    0.001    0.000 {method 'append' of 'list' objects}
       11    0.001    0.000    0.001    0.000 {method 'decompress' of 'zlib.Decompress' objects}
       19    0.000    0.000    0.000    0.000 {built-in method zlib.crc32}
        1    0.000   

In [35]:
%less stats_new_s.txt

         24477 function calls (24447 primitive calls) in 4.572 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    3.064    3.064    3.064    3.064 linalg.py:1324(eigh)
        1    0.853    0.853    4.556    4.556 test_simsignals.py:187(newer_simsignals)
        1    0.401    0.401    0.401    0.401 {method 'dot' of 'numpy.ndarray' objects}
        1    0.233    0.233    0.233    0.233 common.py:1273(_dot_coo_ndarray)
        1    0.014    0.014    4.571    4.571 <ipython-input-6-f8a1f1193dd7>:4(new_s)
        1    0.001    0.001    4.572    4.572 <string>:1(<module>)
    10976    0.001    0.000    0.001    0.000 {built-in method builtins.abs}
    11169    0.001    0.000    0.001    0.000 {method 'append' of 'list' objects}
       11    0.001    0.000    0.001    0.000 {method 'decompress' of 'zlib.Decompress' objects}
       19    0.000    0.000    0.000    0.000 {built-in method zlib.crc32}
        1    0.000   

In [36]:
%prun -D stats_new_s.stats new_s()

 
*** Profile stats marshalled to file 'stats_new_s.stats'. 


         24477 function calls (24447 primitive calls) in 5.222 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    3.366    3.366    3.366    3.366 linalg.py:1324(eigh)
        1    0.966    0.966    5.203    5.203 test_simsignals.py:187(newer_simsignals)
        1    0.618    0.618    0.618    0.618 {method 'dot' of 'numpy.ndarray' objects}
        1    0.245    0.245    0.245    0.245 common.py:1273(_dot_coo_ndarray)
        1    0.018    0.018    5.221    5.221 <ipython-input-6-f8a1f1193dd7>:4(new_s)
        1    0.001    0.001    5.222    5.222 <string>:1(<module>)
    10976    0.001    0.000    0.001    0.000 {built-in method builtins.abs}
    11169    0.001    0.000    0.001    0.000 {method 'append' of 'list' objects}
        1    0.001    0.001    0.001    0.001 {built-in method io.open}
       11    0.001    0.000    0.001    0.000 {method 'decompress' of 'zlib.Decompress' objects}
       19    0.000    0.

In [49]:
def time_new_s(func=None):
    """Time one call of a function with several clocks and print each result.

    Each clock is sampled once before and once after the call, in this order:
    ``os.times()[0]``, ``time.clock`` (only where it still exists),
    ``timeit.default_timer``, ``time.perf_counter`` and ``time.process_time``.
    One line per clock is printed with the elapsed value.

    Parameters
    ----------
    func : callable, optional
        Zero-argument callable to time. Defaults to ``new_s`` so existing
        ``time_new_s()`` calls behave as before.

    Returns
    -------
    None
    """
    if func is None:
        func = new_s

    # time.clock was deprecated in Python 3.3 and removed in 3.8. Look it up
    # defensively so the comparison still runs on current interpreters.
    legacy_clock = getattr(time, 'clock', None)

    clocks = [
        ('os: ', lambda: os.times()[0]),
        ('time_clock: ', legacy_clock),
        ('default_timer: ', timeit.default_timer),
        ('perf: ', time.perf_counter),
        ('processtime: ', time.process_time),
    ]

    def sample():
        # Read every available clock, preserving the original sampling order.
        return [clock() if clock is not None else None for _, clock in clocks]

    starts = sample()
    func()
    ends = sample()

    for (label, _), start, end in zip(clocks, starts, ends):
        if start is None:
            print(label, 'unavailable (time.clock was removed in Python 3.8)')
        else:
            print(label, end - start)

In [50]:
time_new_s()

  This is separate from the ipykernel package so we can avoid doing imports until


os:  11.046875
time_clock:  4.962005800000043
default_timer:  4.962005699999281
perf:  4.962005699999281
processtime:  12.0625


  if __name__ == '__main__':


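Muller's recommendation is to use default_timer, which abstracts OS differences away. The other timers behave very differently between Windows and Mac. In the Windows run above, `os.times()[0]` and `process_time` report CPU time (~11–12 s, presumably summed over several threads), while `time.clock`, `default_timer` and `perf_counter` all report ~4.96 s of wall-clock time.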

In [51]:
%load_ext snakeviz

In [48]:
time.process_time?

[1;31mDocstring:[0m
process_time() -> float

Process time for profiling: sum of the kernel and user-space CPU time.
[1;31mType:[0m      builtin_function_or_method


In [52]:
%snakeviz new_s()

 
*** Profile stats marshalled to file 'C:\\Users\\Geoffrey\\AppData\\Local\\Temp\\tmph6zw38ot'. 


In [53]:
# %load candidates.py
"""Collection of the current best candidates for nmrtools functions,
for testing speed etc.

"""


from nmrtools.nmrmath import hamiltonian as current_hamiltonian
from nmrtools.nmrmath import simsignals as current_simsignals

from speedtest.compare_hamiltonians import hamiltonian_sparse as \
    candidate_hamiltonian
from tests.test_simsignals import newer_simsignals as candidate_simsignals


# Sanity check: the current and candidate implementations must produce
# numerically matching spectra for the `rioux()` test data.
if __name__ == '__main__':
    import numpy as np
    from simulation_data import rioux
    # Each pipeline gets its own fresh rioux() arguments; 3 is passed as the
    # second simsignals argument (`nspins` in the candidate).
    current_h = current_hamiltonian(*rioux())
    current_spectrum = current_simsignals(current_h, 3)
    candidate_h = candidate_hamiltonian(*rioux())
    candidate_spectrum = candidate_simsignals(candidate_h, 3)

    # Show the first ten entries of each result for a visual comparison,
    # then require the full spectra to agree within np.allclose tolerances.
    print(current_spectrum[:10])
    print(candidate_spectrum[:10])
    assert np.allclose(current_spectrum, candidate_spectrum)


In [54]:
%load_ext line_profiler

In [59]:
%lprun -f candidate_simsignals candidate_simsignals(new11_h, 11)

Timer unit: 1e-07 s

Total time: 6.63584 s
File: E:\Geoffrey\Documents\GitHub\cythontest\tests\test_simsignals.py
Function: newer_simsignals at line 187

Line #      Hits         Time  Per Hit   % Time  Line Contents
   187                                           @profile
   188                                           def newer_simsignals(H, nspins):
   189                                               """new_simsignals plus faster transition matrix"
   190         1         31.0     31.0      0.0      """
   191         1   28394955.0 28394955.0     42.8      m = 2 ** nspins
   192                                               E, V = np.linalg.eigh(H)
   193         1      56638.0  56638.0      0.1      # T = new_transition_matrix(m)
   194         1    7758819.0 7758819.0     11.7      T = cache_tm(m)
   195         1         29.0     29.0      0.0      I = np.square(V.T.dot(T.dot(V)))
   196      2048      10439.0      5.1      0.0      spectrum = []
   197   2098175   10218156.

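line_profiler (kernprof) indicates that the `np.linalg.eigh` call is the bottleneck, but the for loop is not insignificant. (The timings in the listing above appear shifted one line down relative to the source, so the 42.8% shown next to `m = 2 ** nspins` belongs to the `eigh` line.) Perhaps the loop can be vectorized at some point?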