## Assignment 2 - Parallel Programming!

### Imports

In [1]:
import utils
import images

### Set Up the Project

In [2]:
# Create the project's config file through the `utils` helper module.
utils.create_config_file()

Config file setup properly.


In [None]:
# Download the dataset through the `images` helper module.
# NOTE(review): presumably this fetches the image metadata used by the cells below — confirm in images.py.
images.download_data()

### Exploratory Data Analysis (EDA)

In [None]:
%%time
# Load the dataset into `df` for the EDA cells below (presumably a pandas DataFrame — confirm in images.py).
df = images.get_df()

In [None]:
# Size of the loaded dataset.
df.shape

In [None]:
# TODO: add more exploratory analysis (EDA) here.

In [None]:
# Preview the first rows of the dataset.
df.head()

### Downloading Images

#### Serial Way

In [None]:
%%time

# Serial baseline: download the images at 'regular' quality through the `images` helper.
# The surrounding %%time measurement is the reference for the parallel version below.
images.download_images(quality='regular')

#### Parallel Way

In [None]:
from concurrent.futures import ThreadPoolExecutor

In [None]:
import pathlib
import requests

def download_single_image(id, url, timeout=30):
    """
    Downloads a single image and stores it as data/images/<id>.jpg.

    The response body is streamed into a temporary ``<id>.jpg.part`` file
    which is renamed over the final path only after the download completed,
    so a failed or interrupted request never destroys an image that was
    downloaded earlier.

    Parameters:
    id      : The name of the image to write to folder.
    url     : The url of the image.
    timeout : Seconds to wait for the server before giving up. Without a
              timeout a stalled connection would block its worker forever.

    Returns:
    True when the image was written, False when the server did not answer
    with HTTP 200 (nothing is written in that case).

    Raises:
    Exceptions raised by ``requests`` (connection errors, timeouts) and by
    the file system are propagated to the caller.
    """
    image_path = pathlib.Path(f'data/images/{id}.jpg')
    partial_path = image_path.with_name(image_path.name + '.part')

    # The target folder may not exist yet on a fresh checkout.
    image_path.parent.mkdir(parents=True, exist_ok=True)

    try:
        # The context manager closes the streamed response on every path,
        # including the non-200 one where the body is never read.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                return False

            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    if chunk:
                        f.write(chunk)

        # Swap the finished download into place, replacing any older copy.
        partial_path.replace(image_path)
        return True
    finally:
        # Only still present when the download did not complete.
        if partial_path.exists():
            partial_path.unlink()

In [None]:
# Collect the image records and turn each one into the (id, url) argument
# pair expected by download_single_image, using the 'regular' quality url.
images_list, json_files = images._get_image_files_list()
params_list = [
    (record['id'], record['urls']['regular'])
    for record in images_list
]

In [None]:
# Sanity check: show the first (id, url) work item.
params_list[:1]

In [None]:
%%time

with ThreadPoolExecutor(max_workers=100, thread_name_prefix='my_thread') as executor:
    # executor.map only re-raises a worker's exception when its result is
    # read. Materialising the results makes a failing download (or a
    # malformed work item) stop the cell instead of passing silently.
    list(executor.map(lambda x: download_single_image(*x), params_list))

### Resizing Images

#### Serial Way

In [None]:
%%time

# Serial baseline for resizing, using the `images` helper with a 128x128 target size.
# The surrounding %%time measurement is the reference for the parallel versions below.
images.create_thumbnail(size=(128, 128))

#### Parallel Way

In [3]:
# todo: resize images in parallel way! 
import multiprocessing as mp
from multiprocessing import Pool
import concurrent.futures

In [4]:
from PIL import Image
import pathlib

def create_single_thumbnail(id, size=(128, 128)):
    """
    Create a resized copy of data/images/<id>.jpg next to the original,
    with the same name extended with _thumbnail.

    Parameters:
    id   : The name (without extension) of the image inside data/images.
    size : Maximum (width, height) of the thumbnail; passed to
           ``Image.thumbnail``.

    Returns:
    None. When the source image does not exist nothing is written.
    """
    # NOTE(review): None switches off Pillow's decompression-bomb check for
    # the whole process. The images come from the internet, so consider a
    # generous explicit limit instead of disabling the check.
    Image.MAX_IMAGE_PIXELS = None

    image_path = pathlib.Path(f'data/images/{id}.jpg')
    if not image_path.exists():
        return

    thumbnail_path = image_path.with_name(
        '{0}_thumbnail{1}'.format(image_path.stem, image_path.suffix))

    # The context manager closes the underlying file handle even when
    # decoding or saving fails, which matters when this runs 1000+ times.
    with Image.open(image_path) as image:
        image.thumbnail(size)
        # JPEG cannot store alpha/palette modes, hence the RGB conversion.
        image.convert('RGB').save(thumbnail_path)

In [None]:
# Smoke-test the worker function on one image id before running it in parallel.
create_single_thumbnail('Bnu5e3Bq--A')

In [5]:
# data for parallel execution
images_list, json_files = images._get_image_files_list()
params_list = [image['id'] for image in images_list]

In [6]:
# Peek at the first ten ids and the total number of work items.
params_list[:10], len(params_list)

(['G60LMFznEKs',
  '-Zp0X1oxOkc',
  'd26iwbzYSKM',
  'RDBFUjBqz6c',
  'q7lbJF9XJ9o',
  'emnSxj1yPD8',
  'qayNP9ccw9E',
  'b4w5nzANd8c',
  'D1aqAJGlTP8',
  'CKzs8kOg-F4'],
 1500)

In [None]:
# NOTE(review): `df` is only defined by the EDA cell near the top of the notebook,
# so that cell has to be executed before this one on a fresh kernel.
df.id

In [None]:
%%time

# Upper bound on simultaneously running worker processes. Starting one
# process per image all at once (1500 here) can exhaust memory and OS
# process handles, so the processes are started and joined in batches.
max_parallel = mp.cpu_count()

# One Process object per image id; creating the object does not start it.
processes_list = [
    mp.Process(target=create_single_thumbnail, args=(image_id,))
    for image_id in params_list
]

print('Created processes.')

for offset in range(0, len(processes_list), max_parallel):
    batch = processes_list[offset:offset + max_parallel]

    for process in batch:
        process.start()

    if offset == 0:
        print('Yey! processes started!')

    # Wait for the whole batch before launching the next one.
    for process in batch:
        process.join()

# A joined process is never alive, so checking is_alive() tells nothing.
# The exit code is what reveals a child that crashed.
# NOTE(review): with the 'spawn' start method the child must be able to
# import the target; a function defined in a notebook cell may not qualify,
# in which case every child exits non-zero — confirm on your platform.
failed = [process for process in processes_list if process.exitcode != 0]
if failed:
    print(f'{len(failed)} of {len(processes_list)} processes failed.')

print('Completed!')

In [None]:
%%time

# Fixed pool of 20 worker processes; Pool.map hands the ids to the workers
# and blocks until every one of them has been processed.
with Pool(processes=20) as worker_pool:
    worker_pool.map(create_single_thumbnail, params_list)

In [8]:
%%time

with concurrent.futures.ProcessPoolExecutor() as executor:
    # executor.map only re-raises a worker failure (including a broken
    # process pool) when its results are read. Without consuming them the
    # cell can "finish" quickly while no thumbnail was actually produced,
    # which would make the timing meaningless.
    list(executor.map(create_single_thumbnail, params_list))

Wall time: 354 ms


### Conclusion

You have completed your assignment! Now, it is time to share your results and conclusions!

You may need to comment on three things.

1. Your dataset. Explain your EDA findings.
2. The differences between the serial and parallel approaches. What is the difference between downloading and resizing?
3. Your timing results for both operations, run both serially and in parallel.

In [None]:
# todo: share your conclusions