# CPU Profiling

# Timing

In [None]:
import timeit

In [None]:
a = 1
b = 2
repeat = 1_000_000

In [None]:
total_time = timeit.timeit('a + b', globals=locals(), number=repeat) / repeat

In [None]:
total_time

In [None]:
%timeit a + b

In [None]:
t_var = %timeit -o a + b

In [None]:
t_var.average

In [None]:
t_lit = %timeit -o 1 + 2

In [None]:
t_var.average / t_lit.average

Time the whole cell while excluding the assignments, which move to the setup statement on the `%%timeit` line. A cell magic must be the first line of its cell, so no comment can precede it.

In [None]:
%%timeit a = 1; b = 2

a + b

# Profiling

Might want to automate this for your project.

Note: `sleep` is inaccurate, especially on Windows.

In [None]:
# %load measuring/profile_me.py
# file profile_me.py

"""Example to be profiled.
"""
import sys
import time

if sys.version_info.major < 3:
    range = xrange


def fast():
    """Sleep for one millisecond; the cheap call of the profiling demo."""
    delay = 0.001
    time.sleep(delay)


def slow():
    """Sleep for a tenth of a second; the expensive call of the demo."""
    delay = 1e-1
    time.sleep(delay)


def use_fast():
    """Invoke `fast` one hundred times in a row."""
    calls_left = 100
    while calls_left > 0:
        fast()
        calls_left -= 1


def use_slow():
    """Invoke `slow` one hundred times in a row."""
    calls_left = 100
    while calls_left > 0:
        slow()
        calls_left -= 1


if __name__ == '__main__':
    use_fast()
    use_slow()


In [None]:
import cProfile

In [None]:
profiler = cProfile.Profile()

In [None]:
profiler.runcall(use_fast)

In [None]:
profiler.print_stats()

In [None]:
profiler2 = cProfile.Profile()

In [None]:
profiler2.runcall(use_slow)

In [None]:
profiler2.print_stats()

In [None]:
cProfile.run('use_fast()', 'fast.stats')  # Store results in fast.stats file

In [None]:
import pstats

In [None]:
stats = pstats.Stats('fast.stats')

In [None]:
stats.sort_stats(pstats.SortKey.CUMULATIVE).print_stats(3)

In [None]:
stats.print_callees('use_fast')

# Profiling in notebook

In [None]:
stats_slow = %prun -s cumulative -r use_slow()

In [None]:
stats_slow.print_stats(5)

# Wall Clock vs CPU Time

For multiprocessing, wall clock time might be less than CPU time.

In [None]:
# %load measuring/clock_check.py
# file: measuring/clock_check.py

"""Checking different timing functions.
"""

from __future__ import print_function

import os
import sys
import time
import timeit


if sys.version_info.major < 3:
    range = xrange


def clock_check(duration=1):
    """Compare the elapsed time reported by three different clocks.

    Runs a short busy loop followed by ``time.sleep(duration)`` and prints
    the time measured by ``os.times()[0]``, by a CPU-time clock and by
    ``timeit.default_timer``.

    ``time.clock`` no longer exists since Python 3.8, so
    ``time.process_time`` is used whenever it is available; ``time.clock``
    is only the fallback for interpreters that lack ``process_time``.

    Args:
        duration: Number of seconds to sleep after the busy loop.
    """
    cpu_clock = getattr(time, 'process_time', None)
    if cpu_clock is None:
        # Only reached on old interpreters without time.process_time.
        cpu_clock = time.clock
    start_os_time0 = os.times()[0]  # CPU time
    start_time_clock = cpu_clock()  # CPU time of this process
    start_default_timer = timeit.default_timer()  # Wall clock
    for _ in range(int(1e6)):
        1 + 1
    time.sleep(duration)
    duration_os_time0 = os.times()[0] - start_os_time0
    duration_time_clock = cpu_clock() - start_time_clock
    duration_default_timer = timeit.default_timer() - start_default_timer
    # The printed labels are kept exactly as they were.
    print('durtation_os_time0:     ', duration_os_time0)
    print('durtation_time_clock:   ', duration_time_clock)
    print('durtation_default_timer:', duration_default_timer)


if __name__ == '__main__':
    clock_check()


Note about the deprecation warning: `time.clock` is deprecated. Use `time.perf_counter` for wall clock time and `time.process_time` for CPU time; unlike `time.clock`, their meaning does not depend on the operating system.

If you pass in `time.process_time` into `cProfile.Profile`, you will no longer see wall clock time in the stats.

# Pi

`simple_pi.py` is slower because it calculates one by one. `numpy_pi.py` is faster but it is memory-bound (sooner or later).

In [None]:
# %load pi/simple_pi.py
# file: simple_pi.py

"""Calculating pi with Monte Carlo.
"""

from __future__ import print_function

import math
import random
import sys


if sys.version_info[0] < 3:
    range = xrange


def pi_plain(total):
    """Estimate pi from `total` random points in the unit square.

    The share of points closer than 1 to the origin approximates pi / 4.
    """
    hits = 0
    for _sample in range(total):
        x_pos = random.random()
        y_pos = random.random()
        radius = math.sqrt(x_pos * x_pos + y_pos * y_pos)
        if radius >= 1:
            continue
        hits += 1
    return 4.0 * hits / total

if __name__ == '__main__':

    def test():
        """Run one million samples and print the estimate."""
        sample_count = int(1e6)
        estimate = pi_plain(sample_count)
        print('pi:', estimate)

    test()


In [None]:
# %load pi/numpy_pi.py
# file: numpy_pi.py
"""Calculating pi with Monte Carlo Method and NumPy.
"""

from __future__ import print_function

import numpy                                                   #1


def pi_numpy(total):                                           #2
    """Estimate pi from `total` random points using whole-array operations.
    """
    xs = numpy.random.rand(total)                              #3
    ys = numpy.random.rand(total)                              #4
    radii = numpy.sqrt(xs * xs + ys * ys)                      #5
    inside = radii[radii < 1]                                  #6
    return 4.0 * len(inside) / total

if __name__ == '__main__':

    def test():
        """Print how long one run with one million samples takes.
        """
        from timeit import default_timer
        began = default_timer()
        pi_numpy(int(1e6))
        print('run time', default_timer() - began)
    test()


In [None]:
%load_ext snakeviz

In [None]:
%snakeviz pi_plain(1_000_000)

In [None]:
%load_ext line_profiler

In [None]:
%lprun -f use_fast use_fast()

# Memory Profiling

In [None]:
import sys

In [None]:
L = list(range(1_000_000))

`sys.getsizeof()` only measures the object itself, not the objects it contains.

In [None]:
sys.getsizeof(L)

In [None]:
1e6 * 28

# Pympler

In [None]:
from pympler import tracker

In [None]:
m = tracker.SummaryTracker()

In [None]:
m.print_diff()

In [None]:
del L

In [None]:
m.print_diff()

In [None]:
import gc

gc.collect()

In [None]:
m.print_diff()

In [None]:
# %load measuring/memory_size_pympler.py
# file: memory_size_pympler.py

"""Measure the size of used memory with a decorator.
"""

from __future__ import print_function

import functools                                                #1
import sys

if sys.version_info.major < 3:
    range = xrange

from pympler import tracker                                     #2

memory = {}                                                     #3


def measure_memory(function):                                   #4
    """Wrap `function` so that every call records a memory diff.

    After each call the tracker's diff is stored in the module-level
    `memory` dict under the wrapped function's name.
    """

    @functools.wraps(function)                                  #5
    def _measure_memory(*args, **kwargs):                       #6
        """Call the wrapped function and store the tracker diff.
        """
        summary_tracker = tracker.SummaryTracker()              #7
        # Two diffs are taken and discarded before the measured call.
        for _discarded in range(2):                             #8
            summary_tracker.diff()                              #9
        try:
            return function(*args, **kwargs)                    #10
        finally:                                                #11
            # Stored even when the wrapped call raises.
            memory[function.__name__] = summary_tracker.diff()
    return _measure_memory                                      #12


if __name__ == '__main__':

    @measure_memory                                             #13
    def make_big(number):
        """Build and return a list of `number` consecutive integers.
        """
        numbers = range(number)
        return list(numbers)                                    #14

    make_big(int(1e6))                                          #15
    print('used memory', memory)                                #16


In [None]:
a = 1

In [None]:
sys.getrefcount(a)

In [None]:
sys.getrefcount(0)

In [None]:
# %load measuring/memory_growth_pympler.py
# file memory_growth_pympler.py

"""Measure the memory growth during a function call.
"""
from __future__ import print_function

import sys

if sys.version_info.major < 3:
    range = xrange

from pympler import tracker                                     #1


def check_memory_growth(function, *args, **kwargs):             #2
    """Call `function` and return the tracker diff taken afterwards.

    The return value of `function` itself is discarded.
    """
    growth_tracker = tracker.SummaryTracker()                   #3
    # Two diffs are taken and discarded before the measured call.
    for _discarded in range(2):                                 #4
        growth_tracker.diff()                                   #5
    function(*args, **kwargs)                                   #6
    growth = growth_tracker.diff()
    return growth                                               #7

if __name__ == '__main__':

    def test():
        """Compare the measured memory growth of two usage patterns.

        `make_big` only returns its list, while `grow` appends to a list
        that outlives the call.
        """

        def make_big(number):                                   #8
            """Build a list of `number` integers and return it.

            Nothing else keeps a reference to the list, and
            `check_memory_growth` discards the return value.
            """
            return list(range(number))

        data = []                                               #9

        def grow(number):
            """Append `number` integers to `data` from the enclosing scope.
            """
            for x in range(number):
                data.append(x)                                  #10
        size = int(1e6)
        # Print the diff returned by check_memory_growth for each pattern.
        print('memory make_big:', check_memory_growth(make_big,
                                                      size))     #11
        print('memory grow:', check_memory_growth(grow, size))   #12

    test()


In [None]:
# %load measuring/pympler_list_growth.py
# file: pympler_list_growth.py

"""Measure the size of a list as it grows.
"""
from __future__ import print_function

import sys

from pympler.asizeof import asizeof, flatsize


if sys.version_info.major < 3:
    range = xrange


def list_mem(length, size_func=flatsize):
    """Record the size of a list after each of `length` appends.

    Returns a list with `length + 1` entries; the first one is the size of
    the empty list as reported by `size_func`.
    """
    growing = []
    sizes = [size_func(growing)]
    for item in range(length):
        growing.append(item)
        sizes.append(size_func(growing))
    return sizes


if __name__ == '__main__':

    def main():
        """Plot the recorded sizes, or print a shortened listing when
        matplotlib cannot be imported.
        """
        SIZE = 1000
        SHOW = 20

        for func in [flatsize, asizeof, sys.getsizeof]:
            mem = list_mem(SIZE, size_func=func)
            try:
                from matplotlib import pylab
                pylab.plot(mem)
                pylab.show()
            except ImportError:
                print('matplotlib seems not be installed. Skipping the plot.')
                if SIZE <= SHOW:
                    print(mem)
                    continue
                # Show only the first and last SHOW // 2 measurements.
                half = SHOW // 2
                head = mem[:half]
                tail = mem[-half:]
                print(head,
                      '... skipping %d elements ...' % (SIZE - SHOW),
                      end='')
                print(tail)


`flatsize` is the same as `sys.getsizeof`. For performance, `list` allocates memory in chunks because memory allocation is slow (I think).

In [None]:
%matplotlib inline

main()

In [None]:
# %load measuring/list_alloc_steps.py
# file: list_alloc_steps.py

"""Measure the number of memory allocation steps for a list.
"""
from __future__ import print_function

import sys

if sys.version_info.major < 3:
    range = xrange

from pympler.asizeof import flatsize


def list_steps(lenght, size_func=sys.getsizeof):
    """Count the memory allocation steps of a list while it grows.

    Elements are appended one at a time. A step is counted whenever the
    size of the list grows by more than the size of one `int` between two
    consecutive appends.

    Args:
        lenght: Number of elements to append. The misspelled name is kept
            so that existing keyword callers keep working.
        size_func: Callable returning the size of an object. It is used
            for the reference `int` size and for every list measurement.

    Returns:
        The number of counted allocation steps.
    """
    my_list = []
    steps = 0
    int_size = size_func(int())
    old_size = size_func(my_list)
    for elem in range(lenght):
        my_list.append(elem)
        # Measure with the caller's size function; this line used to call
        # sys.getsizeof unconditionally, silently ignoring `size_func`.
        new_size = size_func(my_list)
        if new_size - old_size > int_size:
            steps += 1
        old_size = new_size
    return steps


if __name__ == '__main__':
    # List lengths to measure, from ten up to ten million elements.
    steps = [10, 100, 1000, 10000, int(1e5), int(1e6), int(1e7)]
    # First pass: list_steps with its default size function.
    print('Using sys.getsizeof:')
    for size in steps:
        print('%10d: %3d' % (size, list_steps(size)))
    # Second pass: same lengths, passing pympler's flatsize as size_func.
    print('Using pympler.asizeof.flatsize:')
    for size in steps:
        print('%10d: %3d' % (size, list_steps(size, flatsize)))


"You can use the tools but take the results with a grain of salt."

Note: Instructor skipped `memory_profiler` due to some "compression" stuff on Macs.

# String Concatenation

In [None]:
n = 1_000_000

The implementation below is no longer a problem in CPython, but it should be avoided in general.

In [None]:
s = ''

for _ in range(n):
    s += 'a'
    
print(len(s))

This below is faster.

In [None]:
s = 'a' * n

# List Comprehension

In [None]:
L = list(range(n))

In [None]:
%timeit res1 = [x + 10 for x in L]

In [None]:
%%timeit

res2 = []

for x in L:
    res2.append(x + 10)

In this case above, list comprehension is faster.

# Globals vs Locals

In [None]:
# %load algorithms/local_global.py
# file: local_global.py

"""Local vs. global.
"""

import sys

if sys.version_info.major < 3:
    range = xrange

GLOBAL = 1


def repeat(counter):
    """Evaluate the module-level name `GLOBAL` once per loop iteration.

    The loop body is a bare name expression, so each of the `counter`
    iterations does nothing but look up `GLOBAL`.
    """
    for count in range(counter):
        GLOBAL  # bare expression: evaluated, result discarded


def repeat_local(counter):
    """Copy `GLOBAL` into a local variable once, then evaluate that local
    on each of the `counter` loop iterations.
    """
    local = GLOBAL  # single global lookup, done before the loop
    for count in range(counter):
        local  # bare expression: evaluated, result discarded


def test(counter):
    """Run `repeat` and then `repeat_local` with the same `counter`.
    """
    repeat(counter)
    repeat_local(counter)


if __name__ == '__main__':

    def do_profile():
        """Profile `test` with 10**8 iterations and print the statistics.
        """
        from cProfile import Profile
        collector = Profile()
        collector.run('test(int(1e8))')
        collector.print_stats()

    do_profile()


In the example above, there is extra overhead in looking up the global namespace, so it is faster to assign the global to a local variable first.

# Locals vs Built-ins

In [None]:
# %load algorithms/local_builtin.py
"""Local vs. built-in.
"""

import sys

if sys.version_info.major < 3:
    range = xrange


def repeat(counter):
    """Evaluate the built-in name `sum` once per loop iteration.

    The loop body is a bare name expression; `sum` is never called.
    """
    for count in range(counter):
        sum  # bare expression: evaluated, result discarded


def repeat_local(counter):
    """Bind the built-in `sum` to a local name once, then evaluate that
    local on each of the `counter` loop iterations.
    """
    sum_ = sum  # single built-in lookup, done before the loop
    for count in range(counter):
        sum_  # bare expression: evaluated, result discarded


def test(counter):
    """Run `repeat` and then `repeat_local` with the same `counter`.
    """
    repeat(counter)
    repeat_local(counter)


if __name__ == '__main__':

    def do_profile():
        """Profile `test` with 10**8 iterations and print the statistics.
        """
        from cProfile import Profile
        collector = Profile()
        collector.run('test(int(1e8))')
        collector.print_stats()

    do_profile()


Like globals, there is overhead in looking up built-ins as opposed to assigning it to local first.

There are some savings from using locals, but not many. What would really help is using a different data structure.

# Data Structures

## List vs Set

In [None]:
# %load datastructure/searching.py
# file: searching.py
"""Measuring the time for searching in a list and a set.
"""

import timeit


def search_list(n):
    """Time one membership test for `n` in `list(range(n))`.

    `n` itself is not in that list. Only the `in` test is timed; building
    the list is not. Returns the elapsed seconds.
    """
    haystack = list(range(n))
    began = timeit.default_timer()
    n in haystack  # pylint: disable=pointless-statement
    finished = timeit.default_timer()
    return finished - began


def search_set(n):
    """Time one membership test for `n` in `set(range(n))`.

    `n` itself is not in that set. Only the `in` test is timed; building
    the set is not. Returns the elapsed seconds.
    """
    haystack = set(range(n))
    began = timeit.default_timer()
    n in haystack  # pylint: disable=pointless-statement
    finished = timeit.default_timer()
    return finished - began


def calculate_ratio(n):
    """Return (list time, set time, list time / set time) for size `n`.
    """
    list_elapsed = search_list(n)
    set_elapsed = search_set(n)
    ratio = list_elapsed / set_elapsed
    return (list_elapsed, set_elapsed, ratio)


def compare(end=8, func=calculate_ratio, header='', col1='List', col2='Set'):
    """Print a timing table for the sizes 10**1 up to 10**(end - 1).

    `func` is called with each size and must return a triple of
    (first time, second time, ratio). `header`, when non-empty, is printed
    above the table; `col1` and `col2` are the titles of the time columns.
    """
    table_width = 43
    rule = '=' * table_width
    width = end + end // 3
    print()
    if header:
        print(rule)
        print(header)
        print(rule)
    titles = '{:>{width}s} {:>9s} {:>9s} {:>12s}'.format(
        'Size', col1, col2, 'Ratio', width=width)
    print(titles)
    print('-' * table_width)
    row = '{count:{width},d} {list_time:9.2e} {set_time:9.2e} {ratio:12,.2f}'
    for exponent in range(1, end):
        count = 10 ** exponent
        first_time, second_time, ratio = func(count)
        print(row.format(count=count, list_time=first_time,
                         set_time=second_time, ratio=ratio, width=width))
    print(rule)


if __name__ == '__main__':
    compare(header='Single run')


The example above shows that searching in a list is slower than searching in a set because the latter uses a hash table. Converting a list to a set just for lookups only makes sense if you do so many lookups that they become a bottleneck.

In [None]:
def compare(end=8, func=calculate_ratio, header='', col1='List', col2='Set'):
    """Print a timing table for the sizes 10**1 up to 10**(end - 1).

    `func` is called with each size and must return a triple of
    (first time, second time, ratio). `header`, when non-empty, is printed
    above the table; `col1` and `col2` are the titles of the time columns.
    """
    table_width = 43
    rule = '=' * table_width
    width = end + end // 3
    print()
    if header:
        print(rule)
        print(header)
        print(rule)
    titles = '{:>{width}s} {:>9s} {:>9s} {:>12s}'.format(
        'Size', col1, col2, 'Ratio', width=width)
    print(titles)
    print('-' * table_width)
    row = '{count:{width},d} {list_time:9.2e} {set_time:9.2e} {ratio:12,.2f}'
    for exponent in range(1, end):
        count = 10 ** exponent
        first_time, second_time, ratio = func(count)
        print(row.format(count=count, list_time=first_time,
                         set_time=second_time, ratio=ratio, width=width))
    print(rule)

In [None]:
# %load datastructure/searching_multiple.py
# file: searching_multiple.py
"""
Measuring the time for searching in a list and a set multiple times.
"""

from statistics import mean
import timeit


def search_multiple(obj, n, repeat=7, limit=1):
    """Return the mean time of a single `n in obj` membership test.

    The test is repeated until its accumulated run time reaches `limit`
    seconds. That measurement is done `repeat` times and the per-test
    averages are averaged again.

    Args:
        obj: Container that is searched.
        n: Value that is looked up with `in`.
        repeat: Number of independent measurement rounds.
        limit: Minimum accumulated search time per round in seconds. The
            default of 1 matches the previously hard-coded value.

    Returns:
        Mean duration of one membership test in seconds.
    """
    res = []
    for _ in range(repeat):
        count = 0
        duration = 0
        # Always measure at least once so that `limit <= 0` cannot lead to
        # a division by zero below.
        while True:
            start = timeit.default_timer()
            n in obj  # pylint: disable=pointless-statement
            duration += timeit.default_timer() - start
            count += 1
            if duration >= limit:
                break
        res.append(duration / count)
    return mean(res)


def calculate_ratio_mutiple(n):
    """Return (list time, set time, ratio) from repeated searches for `n`.

    Both containers hold `range(n)` and are built before any timing starts.
    """
    as_list, as_set = list(range(n)), set(range(n))
    list_elapsed = search_multiple(as_list, n)
    set_elapsed = search_multiple(as_set, n)
    ratio = list_elapsed / set_elapsed
    return (list_elapsed, set_elapsed, ratio)


if __name__ == '__main__':

    compare(func=calculate_ratio_mutiple, header='Multiple runs')


In [None]:
# %load datastructure/searching_magic.py
# file: searching_magic.py
"""
Measuring the time for searching in a list and a set with IPython %timeit.
"""

from IPython.terminal.interactiveshell import TerminalInteractiveShell


def timeit_magic(n, setup, statement):
    """Run the `timeit` cell magic on `statement`.

    The magic line consists of the options `-o -q`, the assignment
    `n = <n>` and the extra `setup` code. The result of
    `run_cell_magic` is returned unchanged.
    """
    shell = TerminalInteractiveShell()
    magic_line = '-o -q n = {n}; '.format(n=n) + setup
    return shell.run_cell_magic('timeit', magic_line, statement)


def search_list(n):
    """Time `n in my_list` for `my_list = list(range(n))` with `%timeit`.

    `n` itself is not an element of that list.
    """
    return timeit_magic(n, 'my_list = list(range(n))', 'n in my_list')


def search_set(n):
    """Time `n in my_set` for `my_set = set(range(n))` with `%timeit`.

    `n` itself is not an element of that set.
    """
    return timeit_magic(n, 'my_set = set(range(n))', 'n in my_set')


def calculate_ratio(n, search_list=search_list, search_set=search_set):
    """Return (list time, set time, ratio) based on the `average`
    attribute of the two timing results.
    """
    list_average = search_list(n).average
    set_average = search_set(n).average
    ratio = list_average / set_average
    return (list_average, set_average, ratio)


if __name__ == '__main__':
    compare(func=calculate_ratio, header='Magic timeit')

In [None]:
# %load datastructure/intersect.py
# file: intersect.py
"""
Measuring the time for searching in a list and a set including
creation time of the data structure.
"""

import timeit


def intersect_list(n):
    """Measure the run time for intersecting two lists.

    The first list holds `range(n)` and the second `range(n - 3, 2 * n)`.
    Both are materialized as real lists before the timer starts. A bare
    `range` object on Python 3 answers `x in ...` arithmetically, so
    without `list()` no list search would be measured at all.

    Args:
        n: Size parameter of the two lists.

    Returns:
        A tuple `(run_time, in_both)` with the elapsed seconds and the list
        of elements found in both lists, in the order of the first list.
    """
    list_a = list(range(n))
    list_b = list(range(n-3, 2 * n))
    start = timeit.default_timer()
    in_both = []
    for x in list_a:
        if x in list_b:
            in_both.append(x)
    run_time = timeit.default_timer() - start
    return run_time, in_both


def intersect_set(n):
    """Measure the run time for intersecting two sets.

    The sets hold `range(n)` and `range(n - 3, 2 * n)`; building them is
    not timed. Returns `(run_time, in_both)`.
    """
    first = set(range(n))
    second = set(range(n-3, 2 * n))
    began = timeit.default_timer()
    common = first.intersection(second)
    elapsed = timeit.default_timer() - began
    return elapsed, common


def calculate_intersect(n):
    """Return (list time, set time, ratio) for intersections of size `n`.

    Asserts that both approaches found the same elements.
    """
    list_time, from_lists = intersect_list(n)
    set_time, from_sets = intersect_set(n)
    assert from_sets == set(from_lists)
    ratio = list_time / set_time
    return (list_time, set_time, ratio)


if __name__ == '__main__':

    compare(func=calculate_intersect, header='Intersection')


# deque

List is efficient if you append, but not if you insert into the beginning.

In [None]:
from collections import deque

In [None]:
L = list(range(10))

In [None]:
L

In [None]:
L[2:4] = []  # Remove elements from list

In [None]:
L

In [None]:
d = deque(range(10))

In [None]:
d.rotate(-4)
d

In [None]:
d.pop()
d.pop()
d

In [None]:
d.rotate(2)
d

In [None]:
# %load datastructure/list_deque.py
# file: list_deque.py

"""Removing elements from a list vs. from a deque.
"""

from collections import deque
from statistics import mean
import timeit


def time_function(func, make_args, repeat=7, limit=1):
    """Return the mean run time of one `func(*make_args())` call.

    In each of the `repeat` rounds, `func` is called until the accumulated
    time of the calls reaches `limit` seconds. Fresh arguments are created
    before every call, and creating them is not timed.
    """
    per_call_times = []
    for _round in range(repeat):
        calls = 0
        elapsed = 0
        while elapsed < limit:
            call_args = make_args()
            began = timeit.default_timer()
            func(*call_args)
            elapsed += timeit.default_timer() - began
            calls += 1
        per_call_times.append(elapsed / calls)
    return mean(per_call_times)


def remove_from_list(my_list, start, end):
    """Delete the slice `start:end` from `my_list` in place.
    """
    del my_list[start:end]


def remove_from_deque(my_deque, start, end):
    """Remove the elements at positions `start` up to `end - 1` in place.

    The deque is rotated so the doomed elements sit at its right end,
    they are popped one by one, and the deque is rotated back.
    """
    my_deque.rotate(-end)
    remaining = end - start
    while remaining > 0:
        my_deque.pop()
        remaining -= 1
    my_deque.rotate(start)


def main():
    """Print list and deque removal timings for several slice sizes.

    One table is printed per time limit. Each row shows the number of
    removed elements, the list time, the deque time and their ratio.
    """
    start = 100
    size = int(1e6)
    row = '{diff:10,d} {list_time:10.2e} {deque_time: 10.2e} {ratio:8.2f}'
    candidates = ((list, remove_from_list), (deque, remove_from_deque))
    for limit in [0.00001, 0.0001, 0.001]:  # 0.01 and 0.1 took very long
        print('Limit:', limit)
        print('{:>10s} {:>10s} {:>10s} {:>8s}'.format(
            'Replaced', 'List', 'Deque', 'Ratio'))
        for end in [101, 110, 1100, 10100, 100100]:
            timings = {}
            for obj, func in candidates:
                def make_args(obj=obj, size=size, start=start, end=end):
                    """Build a fresh container plus the slice bounds.
                    """
                    return obj(range(size)), start, end

                timings[obj.__name__] = time_function(func, make_args,
                                                      limit=limit)
            list_time = timings['list']
            deque_time = timings['deque']
            print(row.format(diff=end - start, list_time=list_time,
                             deque_time=deque_time,
                             ratio=list_time / deque_time))


if __name__ == '__main__':
    main()

The example above shows that algorithm optimization also depends on your data. Do you have big or small arrays?

# defaultdict

In [None]:
s = 'adasdasdsadsad'

In [None]:
d = {}

for k in s:
    d.setdefault(k, 0)
    d[k] += 1
    
print(d)

In [None]:
from collections import defaultdict

d2 = defaultdict(int)  # Has to pass in something callable

for k in s:
    d2[k] += 1
    
print(d2)

In [None]:
# %load datastructure/setdefault_defaultdict.py
# file: setdefault_defaultdict.py

"""Defaultdict can be faster than a standard dict.
"""

from collections import defaultdict


def standard_dict(text):
    """Count the items of `text` in a plain dict using `setdefault`.
    """
    counts = {}
    for item in text:
        counts.setdefault(item, 0)
        counts[item] += 1
    return counts


def default_dict(text):
    """Count the items of `text` in a `defaultdict(int)`.
    """
    counts = defaultdict(int)
    for item in text:
        counts[item] += 1
    return counts


def standard_dict_group(data):
    """Group the values of `(key, value)` pairs in a plain dict of lists.
    """
    groups = {}
    for group_key, item in data:
        bucket = groups.setdefault(group_key, [])
        bucket.append(item)
    return groups


def default_dict_group(data):
    """Group the values of `(key, value)` pairs in a `defaultdict(list)`.
    """
    groups = defaultdict(list)
    for group_key, item in data:
        bucket = groups[group_key]
        bucket.append(item)
    return groups

In [None]:
%timeit standard_dict(s)

In [None]:
%timeit default_dict(s)

In the example above, `defaultdict` is a bit faster but not by much.

`O(1)` is independent of data size. `O(n)` grows linearly with the size. Then there is quadratic, and so on. It is good to understand the big-O notation of your algorithms. Measure, measure, measure!