Parallel Maps are Common
---------------------------

For simple operations like an embarrassingly parallel map (applying the same function to many inputs) we have *many* options.  They all perform about the same.

This notebook goes through the user interface of several of them on the same problem.

In [None]:
from glob import glob
import ujson as json
import pandas as pd

In [None]:
# Collect the input files, then order them lexicographically so every
# approach below processes them in the same sequence.
filenames = glob('../data/data-*.json')
filenames.sort()

def load_parse_store(fn):
    """Read a line-delimited JSON file, order its records, and save them.

    Every line of ``fn`` is parsed as one JSON document.  The parsed
    records are ordered by their ``'type'`` key and written out as a single
    JSON array to ``fn + '.sorted'``.  The function returns nothing.
    """
    with open(fn) as source:
        records = [json.loads(row) for row in source]

    # order the records in place (stable, so ties keep their file order)
    records.sort(key=lambda rec: rec['type'])

    # store the ordered records next to the input file
    target = fn + '.sorted'
    with open(target, 'w') as sink:
        json.dump(records, sink)


### Sequential for loops

In [None]:
%%time

# Baseline: process the files one after another with a plain for loop.
for fn in filenames:
    load_parse_store(fn)

### Concurrent.futures

In [None]:
%%time

from concurrent.futures import ProcessPoolExecutor
e = ProcessPoolExecutor()

list(e.map(load_parse_store, filenames))

### Multiprocessing

In [None]:
%%time 

from multiprocessing import Pool
p = Pool()

list(p.map(load_parse_store, filenames))

### Joblib

In [None]:
%%time 

from joblib import Parallel, delayed

# Each delayed(load_parse_store)(fn) describes one call without running it;
# Parallel executes those calls using the 'multiprocessing' backend.
# Note: n_jobs is fixed at 4 here, whereas the ProcessPoolExecutor() and Pool()
# cells are created without an explicit worker count.
result = Parallel(n_jobs=4, backend='multiprocessing')(delayed(load_parse_store)(fn) for fn in filenames)

### IPython Parallel

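Start an IPython cluster from a shell with the command below (the next cell launches the same command in the background from Python):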

    ipcluster start

In [None]:
from subprocess import Popen
# Launch `ipcluster start` as a child process without waiting for it; the
# process handle is kept in `ipcluster`.
# NOTE(review): nothing here waits for the cluster to finish starting before
# the next cell connects to it — confirm a short pause is not needed.
ipcluster = Popen(['ipcluster', 'start'])

In [None]:
import ipyparallel as ipp
# Connect a client to the cluster.
c = ipp.Client()

# use the same serialization as everyone else
c[:].use_cloudpickle()

# Load-balanced view; the timing cell below maps the work through it.
view = c.load_balanced_view()

In [None]:
%%time

# Map load_parse_store over the files through the load-balanced view;
# list() iterates over whatever view.map returns to collect the results.
result = list(view.map(load_parse_store, filenames))

In [None]:
# Shell out to stop the IPython cluster once the timing run is done.
!ipcluster stop

### PySpark

In [None]:
%%time

import pyspark

sc = pyspark.SparkContext('local[4]')

In [None]:
%%time

# Turn the filenames into an RDD, apply load_parse_store to each element,
# and collect the per-file results back into the notebook.
rdd = sc.parallelize(filenames)
result = rdd.map(load_parse_store).collect()

### Dask.bag

In [None]:
%%time

import dask.bag as db

# Build a bag from the filenames; map() only describes the work and
# compute() actually runs it.
b = db.from_sequence(filenames)
b.map(load_parse_store).compute()

### Dask.delayed

In [None]:
%%time

from dask import delayed, compute
import dask.multiprocessing

compute(*[delayed(load_parse_store)(fn) for fn in filenames], 
        get=dask.multiprocessing.get)

### Dask.distributed

In [None]:
%%time
from dask.distributed import Executor
e = Executor()  # creates local scheduler and workers

In [None]:
%%time

# Submit one load_parse_store task per file, then gather the results of the
# returned futures.
futures = e.map(load_parse_store, filenames)
e.gather(futures)