In [2]:
import os
import tarfile
import numpy as np
import dask
from dask.distributed import Client, LocalCluster 
from dask_saturn import SaturnCluster
import s3fs
import boto3
import io
fs = s3fs.S3FileSystem(anon=False)
s3 = boto3.resource('s3')

In [3]:
cluster = SaturnCluster()
client = Client(cluster)

In [5]:
cluster.scale(40)

In [6]:
client.restart()

0,1
Client  Scheduler: tcp://guillermo-glandmarks-dask.main-namespace:8786  Dashboard: https://guillermo-glandmarks-dask.demo.saturnenterprise.io,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [19]:
@dask.delayed
def image_classification(glandmarks_images_paths):
    import io
    import os
    import gc
    import numpy as np
    import tensorflow as tf
    from tensorflow.keras.mixed_precision import experimental as mixed_precision
    from tensorflow.keras.applications.resnet50 import ResNet50
    from tensorflow.keras.preprocessing import image
    from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions
    from tensorflow.keras.backend import clear_session

    clear_session()
    gc.collect()
    policy = mixed_precision.Policy('mixed_float16')
    mixed_precision.set_policy(policy)
    s3 = boto3.resource('s3')

    def preprocess_image(images_path):
        img = image.load_img(images_path) #, target_size=(224, 224) not necessary as images already resized 
        i = image.img_to_array(img)
        i = np.expand_dims(i, axis=0)
        i = preprocess_input(i)
        return tf.convert_to_tensor(i)

    model = ResNet50(weights='imagenet')
    ims = [preprocess_image(io.BytesIO(s3.Object('glandmarks', p).get()['Body'].read())) for p in glandmarks_images_paths]
    predict_ds = tf.data.Dataset.from_tensor_slices(ims)#.batch(32)
    # iterate over predict_ds to get prediction
    model.predict(predict_ds)
    for image in predict_ds:
        model.predict(image)

    return True

In [8]:
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [10]:
# TOTAL JPEGs of first 16 tars => 132256
files = fs.ls('glandmarks/224-dataset')
files.pop(0)
len(files)

132256

In [11]:
files[0][11:]

'224-dataset/0.jpg'

In [12]:
images_paths = [f[11:] for f in files]

In [14]:
len(images_paths)

132256

# Execution 32 Workers

In [23]:
cluster.scale(32)

In [25]:
client.restart()

0,1
Client  Scheduler: tcp://guillermo-glandmarks-dask.main-namespace:8786  Dashboard: https://guillermo-glandmarks-dask.demo.saturnenterprise.io,Cluster  Workers: 32  Cores: 32  Memory: 496.00 GB


In [26]:
chunksize = 32 #8266 produces 16 chunks # 4133 produces 32 chunks # 32 produces 4133 chunks
image_chunks = chunks(images_paths, chunksize)
results = []
for chunk in image_chunks:
    results.append(image_classification(chunk))

In [27]:
%time res = dask.compute(*results)

CPU times: user 2.61 s, sys: 190 ms, total: 2.8 s
Wall time: 13min 51s


# Execution 40 Workers

In [20]:
client.restart()
cluster.scale(40)

In [21]:
chunksize = 32 #8266 produces 16 chunks # 4133 produces 32 chunks # 32 produces 4133 chunks
image_chunks = chunks(images_paths, chunksize)
results = []
for chunk in image_chunks:
    results.append(image_classification(chunk))

In [22]:
%time res = dask.compute(*results)

CPU times: user 2.66 s, sys: 177 ms, total: 2.84 s
Wall time: 11min 12s
