In [None]:
# This code cell starts the necessary setup for Hour of CI lesson notebooks.
# First, it enables users to hide and unhide code by producing a 'Toggle raw code' button below.
# Second, it imports the hourofci package, which is necessary for lessons and interactive Jupyter Widgets.
# Third, it helps hide/control other aspects of Jupyter Notebooks to improve the user experience
# This is an initialization cell
# It is not displayed because the Slide Type is 'Skip'

from IPython.display import HTML, IFrame, Javascript, display
from ipywidgets import interactive
import ipywidgets as widgets
from ipywidgets import Layout

import getpass # This library allows us to get the username (User agent string)

# import package for hourofci project
import sys
sys.path.append('../../supplementary') # relative path (may change depending on the location of the lesson notebook)
import hourofci

# Render an HTML snippet as the output of this cell. The snippet does three things:
#   * loads ../../supplementary/js/custom.js -- per the original note this script
#     initializes/hides cells and gets the user agent string (the script itself is
#     not part of this notebook; confirm the details there)
#   * makes the output prompt indicator invisible (CSS opacity of 0)
#   * adds the "Toggle raw code" button (id="toggle_code") used to hide/unhide code
# NOTE: HTML(...) is the last expression in the cell, which is what gets it displayed.
HTML(''' 
    <script type="text/javascript" src=\"../../supplementary/js/custom.js\"></script>
    
    <style>
        .output_prompt{opacity:0;}
    </style>
    
    <input id="toggle_code" type="button" value="Toggle raw code">
''')

# Parallel Computing

## Exploration

Let's put our ideas into actions and give it a go. In this exploration exercise we are going to learn the basics of parallel computation (in Python).

First, let's write a really basic function that does some task. Right now it does not matter what the task does, because you can replace it later.

## Reminder

<font size="+1">

By continuing with this lesson you are granting your permission to take part in this research study for the Hour of Cyberinfrastructure: Developing Cyber Literacy for GIScience project. In this study, you will be learning about cyberinfrastructure and related concepts using a web-based platform that will take approximately one hour per lesson. Participation in this study is voluntary.

Participants in this research must be 18 years or older. If you are under the age of 18 then please exit this webpage or navigate to another website such as the Hour of Code at https://hourofcode.com, which is designed for K-12 students.

If you are not interested in participating please exit the browser or navigate to this website: http://www.umn.edu. Your participation is voluntary and you are free to stop the lesson at any time.

For the full description please navigate to this website: <a href="../../gateway-lesson/gateway/gateway-1.ipynb">Gateway Lesson Research Study Permission</a>.

</font>

## Setting up our problem.
In this first portion of the exploration, we will set up a simple task function.
We will then apply the task function to lots of data (without parallel computing).
In the next section, we will apply parallel computing to this problem.

In [None]:
# Task function that accepts a number and doubles it
# It returns the doubled number. That's it.
def task(some_number):
    """Return twice the value of ``some_number``.

    This is a stand-in for "some unit of work"; the rest of the lesson only
    relies on it taking one input and returning one output.
    """
    return some_number * 2


In [None]:
# Let's try running our task function.
# We pass in 5, so the doubled value that comes back should be 10.

output = task(5)

print("Our output was", output)

## Lots of tasks
Okay, now we have a simple task function. Let's say that we have a lot of tasks to do. In this simple example, we have a lot of numbers that need doubling. Let's start with the manual way of doing a lot of tasks. Don't worry. We will improve the code soon.

In [None]:
# The manual way: call task once for every number we want doubled
# and keep each result in its own variable.
output1 = task(6)
output2 = task(7)
output3 = task(9)
output4 = task(11)
output5 = task(15)

# Print all five results on one line
print("Our outputs", output1, output2, output3, output4, output5)

### Streamlining our tasks. Step 1. Organize our data and loop over it.

Instead of calling task 5 times, let's ...
  * make a list of inputs 
  * save a list of outputs
  * use a for loop

In [None]:
# Notice here we have the same numbers as the task example above,
# but now they are stored together in one list
input_list = [6, 7, 9, 11, 15]

# The results will be collected here, in the same order as the inputs
output_list = []

# Loop over all task numbers in the list
for task_number in input_list:
    # Run the task on one number and save its result
    output = task(task_number)
    output_list.append(output)

print("Our outputs,", output_list)

### Streamlining our tasks. Step 2. Replace the loop with the map function.

Instead of a for loop, let's use the 'map' function.

It just applies the function to everything in the list. Just like the for loop!

So if you compare the code above and the code below, the only thing that changes is that we use the map function.


In [None]:
# Notice here we have the same numbers as the task example above
input_list = [6, 7, 9, 11, 15]

# Start with an empty list, just like the for loop version (it is replaced below)
output_list = []

# Use the map function instead of a for loop
# In Python 3, map gives back a lazy iterator: task has not been run on anything yet
map_list = map(task, input_list)

# Transform (a.k.a. cast) our map list into a Python list.
# This is the step that actually runs task on every input and collects the results.
output_list = list(map_list)

# When we print our results, it should be the same list of numbers.
print("Our outputs,", output_list)

## Adding parallelism

Okay, now we have a task function and we are applying it to many different numbers.
Let's make it run in parallel!

First, let's import a new module called multiprocessing

This module will allow multiple tasks to be processed in parallel. It is the simplest Python module for parallelism and is widely used.

We will make a small change by introducing a 'pool' of processes to run our tasks in parallel. Think about having multiple farmers to plant seeds. We have a 'pool' of farmers that we can use to do our tasks in parallel.

In [None]:
import multiprocessing

# Now we will only make a small change to the code

input_list = [6, 7, 9, 11, 15]

# Start with an empty list (it is replaced by the pool's results below)
output_list = []

# We begin by creating a 'pool' of processes to run our task in parallel
# Here we will create a pool of only 2 processes.
our_pool = multiprocessing.Pool(2)

# Now we run a slightly different map function, it is in our pool
# So this map function will run in parallel!
# Notice, the code is almost identical to the map function above ...
# but we get to use 2 parallel processes
# The pool's map waits for every task to finish and returns a regular Python list,
# so no list(...) cast is needed here.
output_list = our_pool.map(task, input_list)

# Now that we are done, we want to close up our pool of processes
# (close() tells the pool that no more tasks will be submitted)
our_pool.close()

print("Our outputs", output_list)        

... that's it. That is all that it takes to run a parallel computing process in Python.
Well, okay. There are more complex cases that we will get into, but that is the simplest way to add parallelism to your tasks.

#### Optional: There is a slightly better way to program this in Python.

If you are interested, take a look at the code sample below. If you are not, then just skip ahead. Our change is to use Python's 'with' statement, which will automatically shut down the pool for us when the block finishes.


In [None]:

input_list = [6, 7, 9, 11, 15]

# Start with an empty list (it is replaced by the pool's results below)
output_list = []

# Instead of creating a pool and running close(), we can use Python's with statement
# The pool (named p here) only exists inside the indented block;
# Python cleans it up for us as soon as the block ends.
with multiprocessing.Pool(2) as p:
    output_list = p.map(task, input_list)

print("Our outputs", output_list)

## Scale: Let's scale up our problem and our parallel processing

Okay, so we started with a simple problem: doubling numbers. We had a list of 5 numbers that we needed to double.
As you can imagine, this is a fairly small problem. So let's make it a bit bigger. And a bit more complex.

Our new task will be projecting points from latitude/longitude to the Mercator projection system

We need to import the math module to do some of this math.

We will also create our own function for this task. Here we will define our function using the __def__ keyword. We will call our function __proj_task__.

What is going on with all that math down there? Well, this __proj_task__ function is transforming Latitude and Longitude points to a projected coordinate system called the Mercator Projection. You have all seen this projection on a map somewhere. Take a look at the Wikipedia page for an example (https://en.wikipedia.org/wiki/Mercator_projection).


In [None]:
import math

# Inspiration: https://gis.stackexchange.com/questions/156035/calculating-mercator-coordinates-from-lat-lon

def proj_task(latlon_point):
    """Project one latitude/longitude point to (spherical) Mercator coordinates.

    Parameters:
        latlon_point: a sequence whose first item is the latitude and whose
            second item is the longitude, both in degrees (e.g. [44.97, -93.24]).

    Returns:
        A list [y, x] holding the projected northing (y) and easting (x), in the
        same units as the radius below. The order mirrors the [lat, lon] input.

    Note:
        The Mercator projection is not defined at the poles (lat = +/-90).
    """
    # Get lat/lon out of the input parameter
    lat = latlon_point[0]
    lon = latlon_point[1]

    # Radius of the sphere used for the projection (WGS84 semi-major axis, in meters)
    r_major = 6378137.000

    # Easting: the longitude (in radians) scaled by the radius
    x = r_major * math.radians(lon)

    # Northing: the standard Mercator formula scaled by the same radius.
    # The radius is applied directly (rather than derived as x/lon) so that
    # points with a longitude of exactly 0 do not cause a division by zero.
    y = r_major * math.log(math.tan(math.pi / 4.0 + math.radians(lat) / 2.0))

    proj_point = [y, x]

    return proj_point

Let's try running our task function. Run the code cell below to see if it works.

In [None]:
# A single point given as [latitude, longitude]
latlon_point = [44.97, -93.24]

# Project the point; proj_task gives back a list in [y, x] order
output = proj_task(latlon_point)

print("Our output was", output)

### Use the map function

Use the map function to apply our new projection task to a list of latitude and longitude points. This is the same switch we made in the first example at the top of this notebook.


In [None]:

# Four points, each given as [latitude, longitude]
latlon_list = [ [44.5, -94.5], [45.5, -96.5], [44.4, -95.0], [40.5, -89.5] ]

# Start with an empty list (it is replaced by the map result below)
proj_list = []

# Use the map function
# (in Python 3 this is lazy: no point has been projected yet)
proj_list = map(proj_task, latlon_list)

# Transform (a.k.a. cast) our map list into a Python list.
# This is the step that actually runs proj_task on every point.
output_list = list(proj_list)

print("Our outputs,", output_list)


Let's use a pool of processes again and parallelize our __proj_task__ task.


In [None]:
# The same four [latitude, longitude] points as in the cell above
latlon_list = [ [44.5, -94.5], [45.5, -96.5], [44.4, -95.0], [40.5, -89.5] ]

# Start with an empty list (it is replaced by the pool's results below)
proj_list = []

# Create a pool of 2 processes
our_pool = multiprocessing.Pool(2)

# Now we run a slightly different map function, it is in our pool
# So this map function will run in parallel!
# Notice, the code is almost identical to the map function above ...
# but we get to use 2 parallel processes
# The pool's map returns a regular Python list, so no list(...) cast is needed.
proj_list = our_pool.map(proj_task, latlon_list)

# Now that we are done, we want to close up our pool of processes
our_pool.close()

print("Our outputs", proj_list)

### Scaling up!

Let's scale up our list of points by creating a function to create large lists. Here we will use the __random__ package to generate random points.


In [None]:
# We are going to randomly generate points
import random

def make_latlon_list(number_of_points):
    """Generate a list of random latitude/longitude points.

    Every point is a two-item list [lat, lon]. The latitude is 40.0 plus a
    random amount between 0 and 10, and the longitude is -85.0 plus a random
    amount between 0 and 10.

    Parameters:
        number_of_points: how many points to generate.

    Returns:
        A list containing number_of_points [lat, lon] lists.
    """
    # One [lat, lon] pair per requested point; for each pair the latitude
    # is drawn first and the longitude second.
    return [
        [40.0 + random.random() * 10, -85.0 + random.random() * 10]
        for _ in range(number_of_points)
    ]



In [None]:
# Let's try it
# Because the points are random, you will see different numbers every time you run this cell.

latlon_list = make_latlon_list(20)

print("20 lat/lon coordinates", latlon_list)



In [None]:
# Let's really scale

# 1000 latitude longitude points
latlon_list = make_latlon_list(1000)

# 4 processes in our pool
our_pool = multiprocessing.Pool(4)

# Now we run the map function that is in our pool
# So this map function will run in parallel!
# Notice, the code is almost identical to the earlier pool example ...
# but this time we get to use 4 parallel processes
proj_list = our_pool.map(proj_task, latlon_list)

# Now that we are done, we want to close up our pool of processes
our_pool.close()

# Show only the first and last results (index -1 is the last item in a list)
print("Our first projected point", proj_list[0])
print("Our last projected point", proj_list[-1])

print("Number of projected points:", len(proj_list))


## Your turn!

Now we want you to try it.
We added a timer to the code to tell you how long each run takes.
Then, we can measure the speedup.

Your tasks:
  1) Change the number of points from 1000 to five million (5000000) points to increase the data even further.
  2) Run the code using 1 process in our pool, record the time
  3) Run the code using 2 processes in our pool, record the time
  4) Run the code using 4 processes in our pool, record the time
  5) Calculate the speedup for the parallel computing tasks (with 2 and 4 processes)


In [None]:
# Let's give it a go.
import time

# TODO: Make sure the number of points is five million (5000000),
#       up from the 1000 points we used in the previous cell.

latlon_list = make_latlon_list(5000000)

# Start the timer. Creating the points above is not included in the measurement.
# perf_counter() is a clock designed for measuring how long something takes.
start_time = time.perf_counter()

# TODO: Change the number of processes in our pool from 1 to 2 to 4

# Number of processes in our pool
our_pool = multiprocessing.Pool(1)

# Run our projection task in parallel using our pool of processes
proj_list = our_pool.map(proj_task, latlon_list)

# Now that we are done, we want to close up our pool of processes
our_pool.close()

# Stop the timer
end_time = time.perf_counter()

# Show only the first and last results (index -1 is the last item in a list)
print("Our first projected point", proj_list[0])
print("Our last projected point", proj_list[-1])

print("Number of projected points:", len(proj_list))
print("Execution time (seconds):", end_time - start_time)

## Calculating speedup
Now let's use the code above to calculate the speedup of our parallel code. Re-run the code for 5000000 (5 million) points using:
 * 1 process in our pool, 
 * 2 processes in our pool
 * 4 processes in our pool
 
For each run, record the execution time (in seconds) in the matching variable below. Then run the code below.

In [None]:
# Run the code above three times (with 1, 2 and 4 processes in the pool) and
# replace the placeholder values below with the execution times you recorded.
# NOTE: despite the "speed_" prefix, these variables hold *times in seconds*.

speed_1_process = 4.0
speed_2_process = 2.0
speed_4_process = 1.0

# Speedup = (time using 1 process) / (time using N processes)
# A bigger number is better: 2.0 means the work finished twice as fast.
speedup_2_process = speed_1_process / speed_2_process
speedup_4_process = speed_1_process / speed_4_process

print("Speedup for 2 processes (2.0 is perfect speedup)", speedup_2_process)
print("Speedup for 4 processes (4.0 is perfect speedup)", speedup_4_process)

print("Remember Amdahl's Law, think about why you might not get perfect speedup by examining the code above")

# Congratulations!


**You have finished an Hour of CI!**


But, before you go ... 

1. Please fill out a very brief questionnaire to provide feedback and help us improve the Hour of CI lessons. It is fast and your feedback is very important to let us know what you learned and how we can improve the lessons in the future.
2. If you would like a certificate, then please type your name below and click "Create Certificate" and you will be presented with a PDF certificate.

<font size="+1"><a style="background-color:blue;color:white;padding:12px;margin:10px;font-weight:bold;" href="https://forms.gle/JUUBm76rLB8iYppN7">Take the questionnaire and provide feedback</a></font>

In [None]:

# This code cell loads the Interact Textbox that will ask users for their name
# Once they click "Create Certificate" then it will add their name to the certificate template
# And present them a PDF certificate
from PIL import Image
from PIL import ImageFont
from PIL import ImageDraw

from ipywidgets import interact

def make_cert(learner_name, lesson_name):
    """Write the learner's certificate to a PDF file and return the file name.

    The certificate template image is opened, the learner's name is drawn with
    the larger font (size 150), the lesson name is drawn 750 pixels further
    down with the smaller font (size 80), and the result is saved as
    'hourofci_certificate.pdf' (relative to the current working directory).

    Parameters:
        learner_name: text drawn as the learner's name.
        lesson_name: text drawn as the lesson title.

    Returns:
        The file name of the PDF that was written.
    """
    template = Image.open("../../supplementary/hci-certificate-template.jpg")
    canvas = ImageDraw.Draw(template)

    name_font = ImageFont.truetype('../../supplementary/cruft.ttf', 150)
    lesson_font = ImageFont.truetype('../../supplementary/cruft.ttf', 80)

    def draw_centered(text, font, y_shift):
        # getbbox gives four numbers; only the last two (right and bottom edge)
        # are used as the text's width and height when centering around (1650, 1100)
        _left, _top, right, bottom = font.getbbox(text)
        position = (1650 - right / 2, 1100 - bottom / 2 + y_shift)
        canvas.text(xy=position, text=text, fill=(0, 0, 0), font=font)

    # Learner name first, then the lesson name lower on the page
    draw_centered(learner_name, name_font, 0)
    draw_centered(lesson_name, lesson_font, 750)

    output_name = 'hourofci_certificate.pdf'
    template.save(output_name, "PDF", resolution=100.0)
    return output_name


# A version of interact that waits for a button click (labelled "Create Certificate")
# before running the function, instead of running it on every keystroke
interact_cert=interact.options(manual=True, manual_name="Create Certificate")

@interact_cert(name="Your Name")
def f(name):
    """Create the certificate for ``name`` when "Create Certificate" is clicked."""
    print("Congratulations",name)
    # make_cert writes the PDF to disk; the file name it returns is not needed here
    make_cert(name, 'Beginner Parallel Computing')
    print("Download your certificate by clicking the link below.")
    
    
    

<font size="+1"><a style="background-color:blue;color:white;padding:12px;margin:10px;font-weight:bold;" href="hourofci_certificate.pdf?download=1" download="hourofci_certificate.pdf">Download your certificate</a></font>