In [16]:
#First let's create a test script for proof of concept. 
#This is a simple data structure transformation script taken from
#Youtuber 'Real Python' at https://www.youtube.com/watch?v=aysceqdGFw8

#Let's make the proper imports.
import collections
import os
import time
import yt
yt.funcs.mylog.setLevel(40)
from pprint import pprint

# Serial Processing

The following code will generate a tuple consisting of scientist names and data about them. Then, the tuple is passed through the 'map' process which performs a simple act on each element of the input tuple. The resulting act is saved in a second tuple and then the next entry is processed.

This is performed on one processor on one thread, so the computer must wait for the first entry to finish processing until the second one can begin. 

With the addition of a time.sleep() requirement, this can take quite some time!

In [11]:
Scientist = collections.namedtuple('Scientist', [
    'name',
    'field',
    'born',
    'PhD',
])

scientists = (
    Scientist(name='Sean Lewis', field='Astrophysics', born=1994, PhD=False),
    Scientist(name='Weixiang Yu', field='Astronomy', born=1992, PhD=False),
    Scientist(name='Jaqueline Moreno', field='Astronomy', born=1991, PhD=True),
    Scientist(name='Stephen Sclafani', field='Particle', born=1986, PhD=False),
    Scientist(name='Eli Worth', field='Condensed Matter', born=1993, PhD=False),
    Scientist(name='David Lioi', field='Biophysics', born=1990, PhD=True)
)

pprint(scientists)

def transform(x):
    print('Process ' + str(os.getpid()) + ' working record ' + str(x.name))
    time.sleep(1)
    result = {'name': x.name, 'age': 2019 - x.born}
    print('Process ' + str(os.getpid()) + ' done processing record ' + str(x.name))
    return result

start = time.time()
print '\n'

result = tuple(map(
    transform,
    scientists
))

end = time.time()
elapsed = end - start

print '\n'
pprint(result)
print '\n Time to complete:  {0:.3f}'.format(elapsed)

(Scientist(name='Sean Lewis', field='Astrophysics', born=1994, PhD=False),
 Scientist(name='Weixiang Yu', field='Astronomy', born=1992, PhD=False),
 Scientist(name='Jaqueline Moreno', field='Astronomy', born=1991, PhD=True),
 Scientist(name='Stephen Sclafani', field='Particle', born=1986, PhD=False),
 Scientist(name='Eli Worth', field='Condensed Matter', born=1993, PhD=False),
 Scientist(name='David Lioi', field='Biophysics', born=1990, PhD=True))


Process 31830 working record Sean Lewis
Process 31830 done processing record Sean Lewis
Process 31830 working record Weixiang Yu
Process 31830 done processing record Weixiang Yu
Process 31830 working record Jaqueline Moreno
Process 31830 done processing record Jaqueline Moreno
Process 31830 working record Stephen Sclafani
Process 31830 done processing record Stephen Sclafani
Process 31830 working record Eli Worth
Process 31830 done processing record Eli Worth
Process 31830 working record David Lioi
Process 31830 done processing record David

Great! We can see the input, which computer core is doing the processing for each task, the output, and the total time to complete the cell. Obviously from the output, the same core is processing each task, and we can watch each task crawl its way through. How can we utilize the full capacity of the quad-core processor in the MacBook Pro?

# Parallel Processing

In [12]:
import multiprocessing

pprint(scientists)

start = time.time()
print '\n'

pool = multiprocessing.Pool()
result = pool.map(transform, scientists)

end = time.time()


print '\n'
pprint(result)
print '\n Time to complete:  {0:.3f}'.format(end-start)

(Scientist(name='Sean Lewis', field='Astrophysics', born=1994, PhD=False),
 Scientist(name='Weixiang Yu', field='Astronomy', born=1992, PhD=False),
 Scientist(name='Jaqueline Moreno', field='Astronomy', born=1991, PhD=True),
 Scientist(name='Stephen Sclafani', field='Particle', born=1986, PhD=False),
 Scientist(name='Eli Worth', field='Condensed Matter', born=1993, PhD=False),
 Scientist(name='David Lioi', field='Biophysics', born=1990, PhD=True))


Process 31867 working record Sean LewisProcess 31868 working record Weixiang YuProcess 31869 working record Jaqueline Moreno

Process 31871 working record Eli WorthProcess 31870 working record Stephen Sclafani
Process 31872 working record David Lioi


Process 31868 done processing record Weixiang YuProcess 31867 done processing record Sean LewisProcess 31869 done processing record Jaqueline MorenoProcess 31870 done processing record Stephen SclafaniProcess 31871 done processing record Eli Worth



Process 31872 done processing record David 

Process PoolWorker-2:
Process PoolWorker-6:
Process PoolWorker-4:
Process PoolWorker-3:
Process PoolWorker-5:
Process PoolWorker-1:
Process PoolWorker-8:
Process PoolWorker-7:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/sean/anaconda2/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "/Users/sean/anaconda2/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "/Users/sean/anaconda2/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "/Users/sean/anaconda2/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "/Users/sean/anaconda2/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "/Users/sean/anaconda2/lib/python2.7/multiprocessing

As you can see, there are 6 different processors that are active in this execution. They each take on one task, this being the transform function on one data table entry and all execute simultaneously. The result is a process that completes ~6x faster than before! 

# Image Processing

Now let's try to apply the same principle to an image processing function. This will read in data files using yt, plot a data field, and save the plot to a folder. The folder from which we will be pulling data is: ./data-multiproc-testing and we are saving to ./figures-multiproc-testing.

In [20]:
import pandas as pd
import glob
data_files = (glob.glob('/Users/sean/Torch/notebooks/sean_notebooks/data-multiproc-testing/turbsph_hdf5_plt_cnt_00*'))

test_file_names = ['file0', 'file1', 'file2', 'file3', 'file4', 'file5']

In [21]:
def plot_all(file_name):
    print 'Processor ' + str(os.getpid())
    print 'is computing file number ' + str(file_name[-4:])
    ds = yt.load(file_name)
    slice_ = yt.ProjectionPlot(ds, ax, attr)
    slice_.zoom(4)
    slice_.set_zlim('dens', 1e-3, 2e-1)
    slice_.set_font_size(24)
    
    slice_.annotate_timestamp()
    slice_.annotate_title('$10^5$ M$_{\odot}$ Column Density')
    slice_.annotate_scale()
    slice_.save(sp + file_name[-4:])
    print 'Plot ' + str(file_name[-4:]) + ' saved.'
def map_test(f):
    result = f + 'testing'
    return result

start = time.time()

fp = '/Users/sean/Torch/notebooks/sean_notebooks/data-multiproc-testing/'

fn = 'turbsph_hdf5_plt_cnt_'

sp = '/Users/sean/Torch/notebooks/sean_notebooks/figures-multiproc-testing/'

attr = 'dens'
ax = 'z'

pool = multiprocessing.Pool()

#testing = map(map_test, test_file_names)
#print testing
pool.map(plot_all, data_files)

#plot_all(fp, fn, sp, 61, attr, ax)
print '\n All done. Total time:'
end = time.time()
print '{0:.3f} s'.format(end-start)

Processor 32219Processor 32220Processor 32217Processor 32215Processor 32216Processor 32218Processor 32222Processor 32221







is computing file number 0046is computing file number 0040is computing file number 0023is computing file number 0013is computing file number 0022is computing file number 0049is computing file number 0008
is computing file number 0012






Plot 0040 saved.
Processor 32215
is computing file number 0014
Plot 0023 saved.
Processor 32220
is computing file number 0024
Plot 0046 saved.
Processor 32219
is computing file number 0048
Plot 0008 saved.Plot 0013 saved.

Processor 32217
Processor 32222is computing file number 0025

is computing file number 0037
Plot 0022 saved.
Processor 32218
is computing file number 0041
Plot 0012 saved.
Processor 32221
is computing file number 0015
Plot 0049 saved.
Processor 32216
is computing file number 0047
Plot 0014 saved.
Processor 32215
is computing file number 0030
Plot 0024 saved.
Processor 32220
is computing file number 0006
Pl

In [19]:
def plot_all_serial(file_name):
    print 'Processor ' + str(os.getpid())
    print 'is computing file number ' + str(file_name[-4:])
    ds = yt.load(file_name)
    slice_ = yt.ProjectionPlot(ds, ax, attr)
    slice_.zoom(4)
    slice_.set_zlim('dens', 1e-3, 2e-1)
    slice_.set_font_size(24)
    
    slice_.annotate_timestamp()
    slice_.annotate_title('$10^5$ M$_{\odot}$ Column Density')
    slice_.annotate_scale()
    slice_.save(sp + file_name[-4:])
    
start = time.time()
for f in data_files:
    plot_all_serial(f)
end = time.time()

print '{0:.3f} s'.format(end-start)

Processor 31830
is computing file number 0049
Processor 31830
is computing file number 0047
Processor 31830
is computing file number 0040
Processor 31830
is computing file number 0014
Processor 31830
is computing file number 0013
Processor 31830
is computing file number 0025
Processor 31830
is computing file number 0022
Processor 31830
is computing file number 0041
Processor 31830
is computing file number 0046
Processor 31830
is computing file number 0048
Processor 31830
is computing file number 0023
Processor 31830
is computing file number 0024
Processor 31830
is computing file number 0012
Processor 31830
is computing file number 0015
Processor 31830
is computing file number 0008
Processor 31830
is computing file number 0037
Processor 31830
is computing file number 0030
Processor 31830
is computing file number 0039
Processor 31830
is computing file number 0006
Processor 31830
is computing file number 0001
Processor 31830
is computing file number 0055
Processor 31830
is computing file 

[ 0.37454012  0.95071431  0.73199394  0.59865848  0.15601864  0.15599452
  0.05808361  0.86617615  0.60111501  0.70807258]
