# Parallelism

In [1]:
import pandas as pd
import numpy as np


Collecting multiprocess
  Downloading multiprocess-0.70.12.2-py39-none-any.whl (128 kB)
     -------------------------------------- 128.7/128.7 KB 1.3 MB/s eta 0:00:00
Collecting dill>=0.3.4
  Downloading dill-0.3.4-py2.py3-none-any.whl (86 kB)
     ---------------------------------------- 86.9/86.9 KB 1.6 MB/s eta 0:00:00
Installing collected packages: dill, multiprocess
Successfully installed dill-0.3.4 multiprocess-0.70.12.2


# Silly example

In [2]:
import time

def my_sleep(x):
    '''
    Pause for `x` seconds, then hand `x` back to the caller.

    A message is printed right before the pause starts and another one
    right before returning, so the execution order is visible in the output.

    Parameters
    ----------
    x : int or float
        Number of seconds to sleep; also the value returned.

    Returns
    -------
    The same `x` that was passed in.
    '''
    # Imported locally so the function carries its own dependency
    # (presumably to keep it usable when shipped to a worker process).
    from time import sleep

    print(f'Sleeping for {x} seconds.')
    sleep(x)
    print(f'Returning {x}')
    return x

In [3]:
my_sleep(2)

Sleeping for 2 seconds.
Returning 2


2

In [4]:
my_list = [1,2,3,4,5,6]

In [5]:
sum(my_list)

21

In [6]:
from tqdm.auto import tqdm

## Serial code

In [7]:
for item in tqdm(my_list):
    my_sleep(item)

  0%|          | 0/6 [00:00<?, ?it/s]

Sleeping for 1 seconds.
Returning 1
Sleeping for 2 seconds.
Returning 2
Sleeping for 3 seconds.
Returning 3
Sleeping for 4 seconds.
Returning 4
Sleeping for 5 seconds.
Returning 5
Sleeping for 6 seconds.
Returning 6


In [None]:
# magic commands

In [8]:
%%time
list(map(my_sleep, tqdm(my_list)))

  0%|          | 0/6 [00:00<?, ?it/s]

Sleeping for 1 seconds.
Returning 1
Sleeping for 2 seconds.
Returning 2
Sleeping for 3 seconds.
Returning 3
Sleeping for 4 seconds.
Returning 4
Sleeping for 5 seconds.
Returning 5
Sleeping for 6 seconds.
Returning 6
CPU times: total: 31.2 ms
Wall time: 21.1 s


[1, 2, 3, 4, 5, 6]

## Parallel code

In [9]:
from multiprocessing import Pool, cpu_count

cpu_count()

8

## You have to create a pool of `n` processes.

In [10]:
pool = Pool(processes=cpu_count())

### We'll use the `%%time` cell magic here to measure how long this code takes to run in parallel.

However, if you run this code, watch what happens:

In [None]:
%%time

result = pool.map(my_sleep, my_list)
pool.terminate()

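## This happens because `multiprocessing` does not always work in Jupyter notebooks.

Worker processes may need to import the function they are asked to run, and a function defined in a notebook cell exists only in the interactive session, so the workers cannot find it.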

_Linux and some macOS setups may handle it well (yay Unix)_, but it certainly doesn't work on Windows.

### What should we do then? Two solutions.

1. We have to write our functions inside a `.py` file.

2. Install `multiprocess` (note it is different from Python's `multiprocessing` module)

In [7]:
from sleeper import my_sleep_from_file

In [8]:
%%time

result = pool.map(my_sleep_from_file, my_list)
pool.terminate()

Wall time: 6.04 s


In [11]:
pool = Pool(processes=2)

In [None]:
%%time

result = pool.map(my_sleep_from_file, my_list)
pool.terminate()

In [23]:
result

[1, 2, 3, 4, 5, 6]

## Using multiprocess


In [19]:
#!pip install multiprocess

In [None]:
# using multiprocess instead of multiprocessing
from multiprocess import Pool # library used for Jupyter notebooks

In [None]:
pool = Pool(processes=6)

In [None]:
%%time

result = pool.map(my_sleep, [1,2,3,4,5,6])
pool.terminate()

# Running Asynchronous code

## What is asynchrony?

- `result.ready()`
- `result.wait()`
- `result.get()`

In [None]:
pool = Pool(processes=6)

In [None]:
result = pool.map_async(my_sleep, [10, 10, 10, 10, 10, 10])

In [33]:
#result.wait()

In [None]:
result.ready()

In [None]:
print("Do something that doesn't depend on result")
print('...')
print('Now the time came when the result is needed.')

try:
    # get() blocks until every task has finished, so an explicit
    # result.wait() beforehand is not needed.
    result_list = result.get()
finally:
    # Release the worker processes even if one of the tasks raised.
    pool.terminate()
print(f'Now go on and use the results obtained - {result_list}')

# CPU intensive computations

In [None]:
def square(x):
    '''
    Return `x` raised to the second power.
    '''
    return pow(x, 2)

In [21]:
n = 1000000

In [13]:
%%time
    
result = [square(item) for item in np.random.random(size=n)]

Wall time: 358 ms


In [23]:
pool = Pool(processes=6)

In [24]:
random_numbers = np.random.random(size=n)

In [25]:
%%time

result = pool.map(square, random_numbers)

Wall time: 13.6 s


In [2]:
#pool.terminate()
!pip install Cython

Collecting Cython
  Downloading Cython-0.29.27-py2.py3-none-any.whl (983 kB)
     ------------------------------------- 983.7/983.7 KB 12.5 MB/s eta 0:00:00
Installing collected packages: Cython
Successfully installed Cython-0.29.27


## profiling tools

In [None]:
%%prun

result = [square(item) for item in np.random.random(size=n)]

## Usually, for CPU-intensive computations made of many tiny tasks, `Pool.map` won't speed up your code.

Why? It will spend more time managing processes, replicating data and sending it to the other processes than actually computing the result.



In [20]:
!pip install Cython

Collecting Cython
  Using cached Cython-0.29.21-py2.py3-none-any.whl (974 kB)
Installing collected packages: Cython
Successfully installed Cython-0.29.21


In [18]:
## Cython - CPython

In [26]:
%load_ext Cython

In [27]:
%%cython -a
def square_c(x):
    '''
    Return `x` raised to the second power (Cython-compiled version).

    The argument is left untyped, so it is handled as a generic Python object.
    '''
    return pow(x, 2)

In [35]:
n = 1000000

In [36]:
random_numbers = np.random.random(size=n)

In [37]:
%%timeit

result = [square_c(item) for item in random_numbers]

3.42 s ± 25.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [38]:
%%timeit

result = [square(item) for item in random_numbers]

3.81 s ± 34.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# When is multiprocess useful then? 


## I/O bound computations

In [39]:
import pandas as pd

In [40]:
import requests
from bs4 import BeautifulSoup

In [38]:
n_max = 51

In [39]:
%%time
import os

my_range = range(1,n_max)

# Make sure the output folder exists before the first CSV is written.
os.makedirs('tmp', exist_ok=True)

# Serial baseline: download, parse and save one catalogue page at a time.
for i in tqdm(my_range):
    # The site's listing pages are named "page-<n>.html" (with a hyphen).
    response = requests.get(f'http://books.toscrape.com/catalogue/page-{i}.html', timeout=30)
    # Fail loudly on 404/5xx instead of silently parsing an error page.
    response.raise_for_status()
    html=response.content
    # Name the parser explicitly so results don't depend on which parsers are installed.
    soup = BeautifulSoup(html, 'html.parser')
    titles=[s.find('a')['title'] for s in soup.find_all('h3')]
    prices = [s.text for s in soup.find_all('p', attrs={'class':'price_color'})]
    stocks = [s.text.strip()=='In stock' for s in soup.find_all('p', attrs={'class':'instock availability'})]
    df_temp=pd.DataFrame({'Title':titles,'Price':prices,'Stock Availability':stocks})
    df_temp.to_csv(f'tmp/results_{i}.csv', index=False, sep=',')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50.0), HTML(value='')))


Wall time: 18.7 s


In [43]:
def download(i):
    '''
    Scrape catalogue page `i` of books.toscrape.com and save it as a CSV.

    Downloads the page, extracts every book's title, price and in-stock flag,
    and writes them to `tmp/results_{i}.csv` (creating `tmp/` if needed).

    Parameters
    ----------
    i : int
        Catalogue page number substituted into the page URL.

    Returns
    -------
    None
        The only result is the CSV file written to disk.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (e.g. 404).
    requests.Timeout
        If the server does not answer within the timeout.
    '''
    # Imports are local so the function carries its own dependencies
    # when it is executed inside a pool worker.
    import os
    import requests
    from bs4 import BeautifulSoup
    import pandas as pd

    # The site's listing pages are named "page-<n>.html" (with a hyphen).
    response = requests.get(f'http://books.toscrape.com/catalogue/page-{i}.html', timeout=30)
    # Fail loudly on 404/5xx instead of silently parsing an error page.
    response.raise_for_status()

    # Name the parser explicitly so results don't depend on which parsers are installed.
    soup = BeautifulSoup(response.content, 'html.parser')
    titles = [s.find('a')['title'] for s in soup.find_all('h3')]
    prices = [s.text for s in soup.find_all('p', attrs={'class':'price_color'})]
    stocks = [s.text.strip()=='In stock' for s in soup.find_all('p', attrs={'class':'instock availability'})]

    df_temp = pd.DataFrame({'Title':titles,'Price':prices,'Stock Availability':stocks})
    # exist_ok=True keeps this safe when several workers create the folder at once.
    os.makedirs('tmp', exist_ok=True)
    df_temp.to_csv(f'tmp/results_{i}.csv', index=False, sep=',')

In [51]:
pool = Pool(6)

In [52]:
%%time

# pool.map() consumes its whole input before any work is done, so wrapping the
# input in tqdm makes the bar finish instantly. imap() yields results in order
# as they complete, so wrapping its output tracks the real progress.
results = list(tqdm(pool.imap(download, my_range), total=len(my_range)))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50.0), HTML(value='')))


Wall time: 3.24 s


In [53]:
pool.terminate()