# Parallelism

In [1]:
import pandas as pd
import numpy as np

# Silly example

In [2]:
import time

def my_sleep(x):
    """Pause for ``x`` seconds and hand ``x`` back.

    A message is printed right before the pause and another one right
    after it, so the start and end of each call are visible in the output.

    Parameters
    ----------
    x : int or float
        Number of seconds passed to ``time.sleep``.

    Returns
    -------
    The same ``x`` that was passed in.
    """
    announcement = f'Sleeping for {x} seconds.'
    print(announcement)

    time.sleep(x)

    farewell = f'Returning {x}'
    print(farewell)
    return x

In [3]:
my_sleep(5)

Sleeping for 5 seconds.
Returning 5


5

In [4]:
my_list = [1,2,3,4]

In [5]:
sum(my_list)

10

In [6]:
from tqdm.auto import tqdm

## Serial code

In [7]:
for item in tqdm(my_list):
    my_sleep(item)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

Sleeping for 1 seconds.
Returning 1
Sleeping for 2 seconds.
Returning 2
Sleeping for 3 seconds.
Returning 3
Sleeping for 4 seconds.
Returning 4



In [None]:
# In Python 3 `map` is lazy: this only builds an iterator, so `my_sleep` is not
# called (and nothing is printed) until the iterator is consumed, e.g. with `list(...)`.
map(my_sleep, my_list)

In [None]:
# magic commands

In [None]:
%%time

list(map(my_sleep, my_list))

## Parallel code

In [10]:
from multiprocessing import Pool, cpu_count

cpu_count()

6

## You have to create a pool of `n` processes.

In [11]:
pool = Pool(processes=4)

### We'll use the `%%time` cell magic here to measure how long this code takes to run in parallel.

However, if you run this code, watch what happens:

In [None]:
%%time

result = pool.map(my_sleep, my_list)
pool.terminate()

## This happens because `multiprocessing` does not always work in Jupyter Notebooks.

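_Some Linux and macOS setups may handle it well (yay Unix)_, but it certainly doesn't work on Windows: there each worker process is started fresh and has to import `my_sleep`, which it cannot do for a function that is defined only inside the notebook.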

### What should we do then? Two solutions.

1. We have to write our functions inside a `.py` file.

2. Install `multiprocess` (note it is different from Python's `multiprocessing` module)

In [None]:
import sys

In [None]:
# Make the directory that holds `sleeper.py` importable.
# The location can be overridden with the SLEEPER_DIR environment variable so the
# notebook also runs on other machines; the original path stays the default.
import os

sleeper_dir = os.environ.get('SLEEPER_DIR', '/Users/andreaguiar/Desktop/usr/dist/')
if sleeper_dir not in sys.path:  # avoid stacking duplicate entries when the cell is re-run
    sys.path.insert(0, sleeper_dir)

In [None]:
sys.path

In [None]:
from sleeper import my_sleep

In [None]:
%%time

# The pool created earlier was already shut down with `pool.terminate()`, and a
# terminated pool cannot run new work, so start a fresh one for this run.
pool = Pool(processes=4)
try:
    result = pool.map(my_sleep, my_list)
finally:
    pool.terminate()  # always release the worker processes, even on error or interrupt

In [None]:
pool = Pool(processes=2)

In [None]:
%%time

result = pool.map(my_sleep, my_list)
pool.terminate()

In [None]:
result

In [1]:
!pip install multiprocess

Collecting multiprocess
  Downloading multiprocess-0.70.10.zip (2.4 MB)
Collecting dill>=0.3.2
  Downloading dill-0.3.2.zip (177 kB)
Building wheels for collected packages: multiprocess, dill
  Building wheel for multiprocess (setup.py): started
  Building wheel for multiprocess (setup.py): finished with status 'done'
  Created wheel for multiprocess: filename=multiprocess-0.70.10-py3-none-any.whl size=108192 sha256=f5e3b4f02e02ecc6148b2c7135483f35c2e2b811ff2203e1114983b9723d8527
  Stored in directory: c:\users\shcle\appdata\local\pip\cache\wheels\69\a0\aa\14379d16112299afa0b5f8464971648ea2a660a8f6aa1ca088
  Building wheel for dill (setup.py): started
  Building wheel for dill (setup.py): finished with status 'done'
  Created wheel for dill: filename=dill-0.3.2-py3-none-any.whl size=78977 sha256=cac02e0720dd2c02a20d2106fe13e37a6e62c60ba2f47d4bafa67979bdd58487
  Stored in directory: c:\users\shcle\appdata\local\pip\cache\wheels\72\6b\d5\5548aa1b73b8c3d176ea13f9f92066b02e82141549d90e2100

## using multiprocess


In [None]:
# using multiprocess instead of multiprocessing
from multiprocess import Pool

In [None]:
pool = Pool(processes=4)

In [None]:
%%time

result = pool.map(my_sleep, [1,3,6,8])
pool.terminate()

In [None]:
print('oi')

# Running Asynchronous code

## What is asynchrony?

- `result.ready()`
- `result.wait()`
- `result.get()`

In [None]:
pool = Pool(processes=4)

In [None]:
%%time

result = pool.map_async(my_sleep, [60, 60, 60, 60, 60, 60, 60, 60, 60])

In [None]:
result.ready()

In [None]:
print("Do something that doesn't depend on result")
print('...')
print('Now the time came when the result is needed.')
try:
    result.wait()  # block until every submitted task has finished

    result_list = result.get()  # collect the return values (re-raises a worker error, if any)
finally:
    pool.terminate()  # release the workers even if a task failed or the wait was interrupted
print(f'Now go on and use the results obtained - {result_list}')

# CPU intensive computations

In [None]:
def square(x):
    """Return ``x`` raised to the power of two."""
    squared = x ** 2
    return squared

In [None]:
n = 1000000

In [None]:
%%timeit
    
result = [square(item) for item in np.random.random(size=n)]

In [None]:
%%time
    
result = [square(item) for item in np.random.random(size=n)]

In [None]:
pool = Pool(processes=4)

In [None]:
random_numbers = np.random.random(size=n)

In [None]:
# `Pool` takes the worker count as `processes` (plural); `process=4` raises a TypeError.
# Shut down the pool created above first so its workers are not leaked when
# `pool` is rebound to the new one.
pool.terminate()
pool = Pool(processes=4)

In [None]:
%%time

result = pool.map(square, random_numbers)

In [None]:
pool.terminate()

In [None]:
# GIL - global interpreter lock

## profiling tools

In [None]:
%%prun

result = [square(item) for item in np.random.random(size=n)]

## Usually, for CPU intensive computations, Pool.map won't speed up your code.

Why? For tiny tasks like `square`, it spends more time managing processes, copying data and sending it to the other processes than actually computing the result.



In [None]:
## Cython - CPython

In [None]:
%load_ext Cython

In [None]:
%%cython -a
def square_c(x):
    """Return ``x`` raised to the power of two (compiled through the ``%%cython`` cell magic)."""
    squared = x ** 2
    return squared

In [None]:
random_numbers = np.random.random(size=n)

In [None]:
%%timeit

result = [square_c(item) for item in np.random.random(size=n)]

# When is multiprocess useful then? 


## I/O bound computations

In [None]:
import pandas as pd

In [None]:
import requests

In [None]:
# Upper bound on the result offset to fetch: the download code below requests
# pages in offset steps of 100 until this value is covered.
# Presumably the total number of results listed on the site when the notebook
# was written - TODO confirm.
n_max = 51852

In [None]:
import os
from io import StringIO

# NOTE(review): `colnames` is not used in this cell; kept in case other cells rely on it.
colnames = ['team_a','score_a','score_b','team_b','event','stars']
# One entry per page: pages are requested in offset steps of 100 up to `n_max`.
my_range = range(int(np.ceil(n_max/100)))

# `to_csv` does not create missing directories, so make sure the output folder exists.
os.makedirs('tmp', exist_ok=True)

for i in tqdm(my_range):
    # A timeout keeps one stalled connection from hanging the whole loop.
    response = requests.get(f'https://www.hltv.org/results?offset={i * 100}', timeout=30)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    # Wrap the markup in StringIO: passing literal HTML text to `read_html` is deprecated.
    df = pd.concat(pd.read_html(StringIO(response.text)))
    df.to_csv(f'tmp/results_{i}.csv', index=False, sep=',')

In [None]:
def download(i):
    """Download one page of results and save its tables as a CSV file.

    Parameters
    ----------
    i : int
        Page index; the page is requested with ``offset = i * 100``.

    Returns
    -------
    None
        All tables found on the page are concatenated and written to
        ``tmp/results_{i}.csv``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    requests.Timeout
        If the server does not respond within the timeout.
    """
    # Imported here so the function carries its own dependencies with it.
    import os
    from io import StringIO

    # `to_csv` does not create missing directories, so make sure the output folder exists.
    os.makedirs('tmp', exist_ok=True)

    # A timeout keeps one stalled connection from blocking a worker forever.
    response = requests.get(f'https://www.hltv.org/results?offset={i * 100}', timeout=30)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()

    # Wrap the markup in StringIO: passing literal HTML text to `read_html` is deprecated.
    df = pd.concat(pd.read_html(StringIO(response.text)))
    df.to_csv(f'tmp/results_{i}.csv', index=False, sep=',')

In [None]:
pool = Pool(4)

In [None]:
%%time

results = pool.map(download, my_range)

In [None]:
pool.terminate()