In [19]:
%run ../00_AdvancedPythonConcepts/talktools.py

<img src="https://www.evernote.com/l/AUV1r1xvhBdPF6lX-2SJLkO-vkkmCXEDrMwB/image.png">
http://www.slideshare.net/ManojitNandi/parallel-programming-in-python-speeding-up-your-analysis

Remember, you can create (fork) many processes, which are copies of the original parent process (memory, data, state) and act independently of each other. To share data between them you have to explicitly do that within each process. The Pythonic way we do multiprocessing (creation of new processes, communication between processes) is with `multiprocessing`.

*"effectively side-stepping the Global Interpreter Lock by using subprocesses instead of threads. Due to this, the multiprocessing module allows the programmer to fully leverage multiple processors on a given machine. It runs on both Unix and Windows."*

- https://docs.python.org/3/library/multiprocessing.html

The analog to `threading.Thread` is `multiprocessing.Process`. You should be able to do a drop-in replacement. Instead of `current_thread()` you'd use `os.getpid()`.

In [None]:
%load_ext snakeviz

In [None]:
%%snakeviz

import logging
import random
import time
import os

# Reset the root logger so re-running this cell does not stack duplicate
# handlers (basicConfig does nothing while the root logger has handlers).
root = logging.getLogger()
root.handlers = []
# Tag each record with the emitting *process* name: the workers below each
# run in their own process, where the thread name is always "MainThread".
logging.basicConfig(level=logging.DEBUG,
                    format='(%(processName)-9s) %(message)s',)

import multiprocessing

def worker(num):
    """Nap for a random whole number of seconds (1-5), logging before and after.

    Used as the target of a ``multiprocessing.Process``. ``num`` is a
    caller-chosen label that is echoed in the first log line together
    with the PID of the process running this call. Returns ``None``.
    """
    nap_seconds = random.randint(1, 5)
    announcement = 'worker: {0} sleeping for {1} s, name: {2}'.format(
        num, nap_seconds, os.getpid())
    logging.debug(announcement)
    time.sleep(nap_seconds)
    logging.debug('done')

# Launch one child process per worker; each runs `worker(i)` independently.
procs = []
for i in range(2):
    p = multiprocessing.Process(target=worker, args=(i,))
    procs.append(p)
    p.start()

# Wait for every child to finish so no process is left un-reaped and the
# cell (and its profile) covers the workers' full run.
for p in procs:
    p.join()

If your machine has multiple cores, these two processes may run on two separate cores, independently of each other.

You may need to share info between processes. You can do this with `Queues`, just like with Threads. You can also use a (UNIX-like) Pipe to have two processes communicate with each other:

In [None]:
# https://docs.python.org/3/library/multiprocessing.html#exchanging-objects-between-processes
from multiprocessing import Process, Pipe

def f(conn):
    """Send one demo payload through ``conn``, then close that end.

    ``conn`` is one end of a ``multiprocessing.Pipe`` (any object with
    ``send`` and ``close`` works). The connection is closed even when
    ``send`` raises, so a failed send does not leak the handle; the
    exception itself still propagates to the caller.
    """
    try:
        conn.send([42, None, 'hello'])
    finally:
        conn.close()

# Parent side of the Pipe demo; runs only when executed as the main module.
if __name__ == '__main__':
    # Pipe() yields the two connected ends; the child end is handed to f.
    parent_conn, child_conn = Pipe()
    p = Process(target=f, args=(child_conn,))
    p.start()
    # Read the message the child sends from its end of the pipe.
    print(parent_conn.recv())   # prints "[42, None, 'hello']"
    # Wait for the child process to exit before leaving the cell.
    p.join()

Using pools of workers (in separate processes) with multiprocessing `Pool`.

https://docs.python.org/3.6/library/multiprocessing.html#using-a-pool-of-workers

In [None]:
from multiprocessing import Pool 
import time


def g(x):
    """Return ``x`` multiplied by itself after a 0.2 s pause.

    The pause is a stand-in for real, domain-specific work.
    """
    time.sleep(0.2)  # placeholder for the expensive part
    squared = x * x
    return squared

pool = Pool(processes=4)     # start 4 worker processes 

In [None]:
pool

In [None]:
%time pool.map(g, range(10))

In [None]:
%time list(map(g, range(10)))

In [None]:
# Same squares as above, printed in whatever order imap_unordered
# yields them, all on one line separated by spaces.
for square in pool.imap_unordered(g, range(10)):
    print(square, end=" ")

Python is packaging (pickling) up your functions and sending them to different processes.

In [None]:
# This will fail: the pool has to pickle the function to send it to the
# worker processes, and a lambda cannot be pickled. Catch the error and
# display it so the demonstration does not abort a "Restart & Run All".
try:
    pool.map(lambda x: x**3, range(10))
except Exception as err:  # exact exception class is not pinned here
    print("{0}: {1}".format(type(err).__name__, err))

In [None]:
# Submit a single call, g(10), to the pool asynchronously.
result = pool.apply_async(g, (10,))

# Wait at most 0.25 s for the answer; prints "100" unless that times out.
print(result.get(timeout=0.25))

In [None]:
result

In [None]:
# Shut the workers down explicitly before dropping the reference;
# `del` alone leaves their cleanup to garbage collection.
pool.close()   # no further tasks will be submitted
pool.join()    # wait for the worker processes to exit
del pool

In [None]:
from multiprocessing import Pool 
import time

def f(x):
    """Return ``x`` multiplied by itself."""
    squared = x * x
    return squared

# Time the same map over pools of increasing size to see how the wall
# time changes with the number of worker processes.
for i in [1, 2, 3, 4, 8, 16, 32]:
    print(i, "*" * 5, flush=True)
    # The context manager terminates the pool on exit, even when map()
    # raises, so a failed run does not leave worker processes behind.
    with Pool(processes=i) as pool:        # start i worker processes
        start = time.perf_counter()        # monotonic clock for intervals
        pool.map(f, range(1000000))
        print("{0:0.4f} sec".format(time.perf_counter() - start))
    del pool

In [None]:
!ulimit -a

# Launching parallel tasks with `concurrent.futures`

Built-in, create different pools for executing **maps** (single loop over data). Local resources.

<i>"The `concurrent.futures` module provides a high-level interface for asynchronously executing callables.

The asynchronous execution can be performed with threads, using `ThreadPoolExecutor`, or separate processes, using `ProcessPoolExecutor`. Both implement the same interface, which is defined by the abstract Executor class."</i>

https://docs.python.org/3/library/concurrent.futures.html

In [None]:
from concurrent.futures import ProcessPoolExecutor
e = ProcessPoolExecutor(2)  # can also use a threadpool

In [None]:
%%time 
from time import sleep

results = []
for i in range(8):
    sleep(1)
    results.append(i + 1)

In [None]:
results

In [None]:
%%time 
from time import sleep

from concurrent.futures import ProcessPoolExecutor
e = ProcessPoolExecutor(2) 

def slowfunc(x):
    """Pause for one second, then return ``x + 1``."""
    sleep(1)
    incremented = x + 1
    return incremented

results = list(e.map(slowfunc, range(8)))

In [None]:
e.shutdown()

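When called without an argument, `ProcessPoolExecutor()` figures out how many cores the machine has (4 on mine) and runs the work in that many separate processes; above we explicitly asked for 2 worker processes.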

## Breakout

Convert the sequential code to parallel using `concurrent.futures`

In [None]:
%%time

import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/Special:Random"

# Sequential baseline: fetch ten random articles one after another and
# record the length of each page's HTML.
lens = []
for i in range(10):
    # Without a timeout a stalled connection would hang the cell forever.
    a = requests.get(url, timeout=10)
    # Fail loudly on an HTTP error instead of measuring an error page.
    a.raise_for_status()
    resp = a.text
    print("title=",BeautifulSoup(resp, 'html.parser')
          .title.string.split("- Wikipedia")[0],"len=",len(resp))
    lens.append(len(resp))

print(lens)

### Executor.submit

`submit` starts an execution in a separate thread or process and immediately returns a `Future` object that points back to the result. Until the function completes, the future is pending. We get the result of a task with `.result()`, which blocks until the computation is complete.

In [None]:
%%time 
from time import sleep

from concurrent.futures import ProcessPoolExecutor
e = ProcessPoolExecutor() 

def slowfunc(x, y, delay=1):
    """Return ``x + y`` after sleeping for ``delay`` seconds (default 1)."""
    sleep(delay)
    total = x + y
    return total

future = e.submit(slowfunc, 1, 2)

In [None]:
future

In [None]:
future.result()

In [None]:
%%time 
futures = [e.submit(slowfunc,1,2, delay=1) for _ in range(10)]
results = [f.result() for f in futures]

## Joblib

http://pythonhosted.org/joblib/

Running Python functions as pipeline jobs. The *vision is to provide tools to easily achieve better performance and reproducibility when working with long running jobs.* Specifically meant to work well with large data (ie. numpy arrays).

  - **Avoid computing twice the same thing**: code is rerun over and over, for instance when prototyping computationally heavy jobs (as in scientific development), but hand-crafted solutions to alleviate this issue are error-prone and often lead to unreproducible results
  - **Persist to disk transparently**: persisting in an efficient way arbitrary objects containing large data is hard. Using joblib’s caching mechanism avoids hand-written persistence and implicitly links the file on disk to the execution context of the original Python object. As a result, joblib’s persistence is good for resuming an application status or computational job, eg after a crash.

Joblib strives to address these problems while leaving your code and your flow control as unmodified as possible (no framework, no new paradigms).

In [None]:
!conda install joblib -y

In [None]:
from math import sqrt
[sqrt(i ** 2) for i in range(10)]

### Parallel Helpers

Joblib provides a simple helper class to write parallel for loops using multiprocessing. The core idea is to write the code to be executed as a generator expression, and convert it to parallel computing.

In [None]:
from math import sqrt
from joblib import Parallel, delayed

By default Parallel uses the Python multiprocessing module to fork separate Python worker processes to execute tasks concurrently on separate CPUs. This is a reasonable default for generic Python programs but it induces some overhead as the input and output data need to be serialized in a queue for communication with the worker processes. 

In [None]:
Parallel(n_jobs=2,backend="threading") \
  (delayed(sqrt)(i ** 2) for i in range(10))

In [None]:
import time
start = time.time()
Parallel(n_jobs=5,verbose=5) \
  (delayed(time.sleep)(1) for _ in range(10))
print(time.time()-start)

### On demand recomputing: the `Memory` class

Caching long running results so it can be reused. Let's try to cache to disk:

In [None]:
from joblib import Memory
# `location` is the current name of the cache-directory argument; the
# older `cachedir` spelling was deprecated and later removed from joblib.
memory = Memory(location="/tmp/", verbose=0)  # try a higher verbosity

In [None]:
@memory.cache
def f(x):
    """Return ``x`` unchanged, announcing each time the body really runs.

    The function is wrapped by ``memory.cache``; the print makes it
    visible whether a call executed the body.
    """
    # Wrap x in a 1-tuple so a tuple argument is formatted as one value
    # instead of being unpacked by the % operator (which would raise).
    print('Running f(%s)' % (x,))
    return x

In [None]:
print(f(1))

In [None]:
print(f(1))

In [None]:
print(f(10))

In [None]:
print(f(20))

In [None]:
!ls -lat  /tmp/joblib/__main__--Users-jbloom-Classes-python-seminar-DataFiles_and_Notebooks-08_Parallelism-__ipython-input__/f

In [None]:
# `location` replaces the `cachedir` argument, which was deprecated and
# later removed from joblib; mmap_mode="r+" is passed through unchanged.
memory = Memory(location="/tmp/",verbose=0, mmap_mode="r+")

In [None]:
@memory.cache
def josh(x,blah=True):
    """Return ``x`` unchanged, announcing each time the body really runs.

    ``blah`` is accepted but not used by the body; the function is
    wrapped by ``memory.cache``.
    """
    # Wrap x in a 1-tuple so a tuple argument is formatted as one value
    # instead of being unpacked by the % operator (which would raise).
    print('Running josh(%s)' % (x,))
    return x

In [None]:
print(josh(1))

In [None]:
print(josh(1))

In [None]:
print(josh(1, blah=False))

Ignoring variables:

In [None]:
@memory.cache(ignore=['blah'])
def h(x,blah=True):
    """Return ``x`` unchanged, announcing each time the body really runs.

    ``blah`` is accepted but not used by the body, and it is listed in
    the ``ignore`` argument given to ``memory.cache``.
    """
    # Wrap x in a 1-tuple so a tuple argument is formatted as one value
    # instead of being unpacked by the % operator (which would raise).
    print('Running h(%s)' % (x,))
    return x

In [None]:
print(h(1))

In [None]:
print(h(1,blah=False))

Note: for persistence, joblib also provides `joblib.dump()` and `joblib.load()`, a replacement for pickle that works efficiently on Python objects containing large data, in particular large numpy arrays.