### MNIST hyperparameter tuning example


In [1]:
# PyCloud


import logging
import time
import json

import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds

# Switch the `tensorflow.compat.v2` module to TF2 behaviour before any
# datasets or models are created.
tf.enable_v2_behavior()

from pycloud.core import PyCloud

# PyCloud handle shared by the whole notebook: it decorates the endpoints,
# collects metrics, and is used to build and launch the deployment.
# NOTE(review): `get_instance()` presumably returns a process-wide singleton
# -- confirm against the PyCloud documentation.
CLOUD = PyCloud.get_instance()

# Named logger for this example (not referenced by the cells shown here).
LOGGER = logging.getLogger("MnistTraining")

### Define the API endpoint accepting lists of hyperparameters

In [2]:
# PyCloud

@CLOUD.endpoint("api")
def train(learning_rates, epochs_list):
    """Run one MNIST training per (learning rate, epochs) combination.

    Both arguments are JSON-encoded lists (for example ``"[0.1, 0.01]"`` and
    ``"[10, 20]"``) and are decoded here. Every learning rate is paired with
    every epoch count; learning rates vary slowest.

    Returns:
        A list of ``(learning_rate, epochs, accuracy)`` tuples, where
        ``accuracy`` is the value ``train_mnist`` returned for that pair.
    """
    lr_candidates = json.loads(learning_rates)
    epoch_candidates = json.loads(epochs_list)
    # Single comprehension; the evaluation order matches nested for-loops.
    return [
        (lr, n_epochs, train_mnist(lr, n_epochs))
        for lr in lr_candidates
        for n_epochs in epoch_candidates
    ]

### Define the service responsible for training MNIST with specific hyperparameters: learning rate and number of epochs

In [3]:
# PyCloud

@CLOUD.endpoint("mnist_trainer")
def load_data():
    """Fetch the MNIST train and test splits through TensorFlow Datasets.

    Returns:
        A ``(ds_train, ds_test, ds_info)`` tuple: the two requested splits,
        loaded with ``as_supervised=True``, followed by the dataset info
        object requested with ``with_info=True``.
    """
    load_options = {
        'split': ['train', 'test'],
        'shuffle_files': True,
        'as_supervised': True,
        'with_info': True,
        'try_gcs': True,
    }
    splits, info = tfds.load('mnist', **load_options)
    # Exactly two splits were requested, in this order.
    train_split, test_split = splits
    return train_split, test_split, info


@CLOUD.endpoint("mnist_trainer")
def train_mnist(learning_rate, epochs):
    """Train a small dense MNIST classifier and report its final accuracy.

    Args:
        learning_rate: Learning rate passed to the Adam optimizer; any value
            ``float()`` accepts.
        epochs: Number of training epochs; any value ``int()`` accepts.
            Must be at least 1.

    Returns:
        The accuracy Keras logged for the last epoch under the
        ``'accuracy'`` history key.

    Raises:
        ValueError: If ``epochs`` is smaller than 1, or if an argument cannot
            be parsed by ``float()`` / ``int()``.
    """
    learning_rate = float(learning_rate)
    epochs = int(epochs)
    if epochs < 1:
        # Without a completed epoch there is no 'accuracy' entry to read
        # after fitting; fail early with a clear message, before the dataset
        # is loaded and a model is built.
        raise ValueError("epochs must be >= 1, got {}".format(epochs))

    ds_train, ds_test, ds_info = load_data()

    batch_size = 128
    autotune = tf.data.experimental.AUTOTUNE

    def normalize_img(image, label):
        """Normalizes images: `uint8` -> `float32`."""
        return tf.cast(image, tf.float32) / 255., label

    # Training pipeline: normalize, cache, shuffle over the whole training
    # split, batch, prefetch.
    ds_train = ds_train.map(normalize_img, num_parallel_calls=autotune)
    ds_train = ds_train.cache()
    ds_train = ds_train.shuffle(ds_info.splits['train'].num_examples)
    ds_train = ds_train.batch(batch_size)
    ds_train = ds_train.prefetch(autotune)

    # Test pipeline: no shuffling, and caching happens after batching.
    ds_test = ds_test.map(normalize_img, num_parallel_calls=autotune)
    ds_test = ds_test.batch(batch_size)
    ds_test = ds_test.cache()
    ds_test = ds_test.prefetch(autotune)

    # Flatten the 28x28x1 image, one hidden ReLU layer, 10-way softmax.
    model = tf.keras.models.Sequential([
        tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        metrics=['accuracy'],
    )

    history = model.fit(
        ds_train,
        epochs=epochs,
        validation_data=ds_test,
    )
    # NOTE(review): 'accuracy' is the metric measured on the training data.
    # Because validation_data is passed, Keras presumably also records
    # 'val_accuracy', which would be the better signal for comparing
    # hyperparameters -- confirm and consider switching.
    accuracy = history.history['accuracy'][-1]
    # Report the value to PyCloud under the 'accuracy' metric name.
    # NOTE(review): ['MAX'] presumably selects the aggregation -- confirm
    # against the PyCloud documentation.
    CLOUD.collect_metric('accuracy', ['MAX'], accuracy)
    return accuracy

### Define building function

In [4]:
def build_mnist_training():
    """Build-time entry point handed to ``CLOUD.build``.

    Runs one small training through ``train`` (learning rate 0.02, a single
    epoch) and prints the resulting list, then configures the ``api``
    service and sets the basic-auth credentials.
    """
    print(train("[0.02]", "[1]"))

    http_port = {'HTTP': 5001}
    CLOUD.configure_service("api", exposed="True", preferred_ports=http_port)
    # NOTE(review): demo credentials are hard-coded in the notebook; replace
    # them before deploying anything that is not a throwaway example.
    CLOUD.set_basic_auth_credentials("pycloud", "demo")


# Hand the build function to PyCloud. The log output of this cell shows it
# executing the endpoints, registering the calls between them, and saving
# the cloud definition to `_graph_.json`.
CLOUD.build(build_mnist_training)

2020-11-16:09:37:00,103 INFO     [core.py:473] Executing <function train at 0x7fd3704e9730> 
2020-11-16:09:37:00,109 INFO     [core.py:473] Executing <function train_mnist at 0x7fd3704e98c8> 
2020-11-16:09:37:00,110 INFO     [core.py:473] Executing <function load_data at 0x7fd3704e96a8> 
2020-11-16:09:37:01,523 INFO     [dataset_info.py:362] Load dataset info from gs://tfds-data/datasets/mnist/3.0.1
2020-11-16:09:37:02,593 INFO     [dataset_info.py:413] Field info.citation from disk and from code do not match. Keeping the one from code.
2020-11-16:09:37:02,859 INFO     [dataset_builder.py:323] Reusing dataset mnist (gs://tfds-data/datasets/mnist/3.0.1)
2020-11-16:09:37:02,860 INFO     [dataset_builder.py:529] Constructing tf.data.Dataset for split ['train', 'test'], from gs://tfds-data/datasets/mnist/3.0.1
2020-11-16:09:37:03,326 INFO     [core.py:480] Registering call on service mnist-trainer, train_mnist@mnist.ipynb -> load_data@mnist.ipynb




2020-11-16:09:37:06,464 INFO     [core.py:480] Registering call on service api, train@mnist.ipynb -> train_mnist@mnist.ipynb
2020-11-16:09:37:06,466 INFO     [core.py:593] Saving cloud definition at path: /home/krzych/git/pycloud-examples/training/_graph_.json


[(0.02, 1, 0.9265166521072388)]


### Deploy to the cluster (change to EksLauncher to deploy on Amazon EKS)

In [5]:
# Launcher that deploys the built CLOUD definition as local Docker containers.
from pycloud_cli.docker import DockerLauncher
# Alternative launcher for Amazon EKS; swap it in for DockerLauncher below.
# from pycloud_cli.eks import EksLauncher
launcher = DockerLauncher(CLOUD)
# Per the output of this cell, exec() prints the app architecture and asks
# for a y/n confirmation before it builds the image and starts the services.
launcher.exec()

2020-11-16:09:37:10,528 INFO     [launcher.py:104] Executing app from directory: /home/krzych/git/pycloud-examples/training


Please review app architecture:
Services:
  id: api
    exposed: True
    endpoints:
      id: train@mnist.ipynb
        edges:
          train_mnist@mnist.ipynb
        protocols: GRPC, HTTP
  id: mnist-trainer
    exposed: False
    endpoints:
      id: train_mnist@mnist.ipynb
        edges:
          load_data@mnist.ipynb
        protocols: GRPC, HTTP
      id: load_data@mnist.ipynb
        edges:
        protocols: GRPC, HTTP
Do you want to deploy above graph to cluster? (y/n)
y
Deployment to cluster: APPROVED
Checking whether docker ps works...
OK
License key file found, copying to /home/krzych/git/pycloud-examples/training/.pycloud_license_key


2020-11-16:09:37:16,543 INFO     [core.py:593] Saving cloud definition at path: /home/krzych/git/pycloud-examples/training/./_graph_.json


Building image pycloud_service:pycloud-cluster-726315
Step 1/13 : FROM python:3.8.3
 ---> 7f5b6ccd03e9
Step 2/13 : RUN mkdir /app
 ---> Using cache
 ---> 37ff4a3eac70
Step 3/13 : WORKDIR /app
 ---> Using cache
 ---> 1012fcbdff0b
Step 4/13 : ADD ./temp_file_runner_requirements.txt /app/pycloud-dist/runner_requirements.txt
 ---> Using cache
 ---> 30a4425296ab
Step 5/13 : RUN pip install -r pycloud-dist/runner_requirements.txt
 ---> Using cache
 ---> 9e6a99cf914d
Step 6/13 : ADD requirements.txt /app/
 ---> Using cache
 ---> edfb021172aa
Step 7/13 : RUN pip install -r requirements.txt
 ---> Using cache
 ---> cfa6c99d9813
Step 8/13 : ADD ./temp_file_pycloud.so /app/pycloud-dist/pycloud.so
 ---> Using cache
 ---> 201114dbb1ae
Step 9/13 : ADD ./temp_file_pycloud_cli.so /app/pycloud-dist/pycloud_cli.so
 ---> Using cache
 ---> c9aecdc16316
Step 10/13 : ADD ./temp_file_run_runner.py /app/pycloud-dist/run_runner.py
 ---> Using cache
 ---> d894f7378331
Step 11/13 : CMD python /app/pycloud-dist/ru

2020-11-16:09:37:21,921 INFO     [docker.py:97] Pulling image: influxdb:1.5
2020-11-16:09:37:24,745 INFO     [docker.py:70] Service deployment params: <pycloud_cli.deployment_params.ServiceDeploymentParams object at 0x7fd3600cf860>
2020-11-16:09:37:25,116 INFO     [docker.py:87] Starting container: influxdb-pycloud-cluster-726315 , envs :{'INFLUXDB_USERNAME': 'plnlndihfn', 'INFLUXDB_PASSWORD': 'nixqdsecdddyuasxxbgm'}, hostname: influxdb, port bindings: {}
2020-11-16:09:37:25,116 INFO     [launcher.py:256] Checking whether RabbitMQ is required...
2020-11-16:09:37:25,117 INFO     [launcher.py:264] No. Skipping.
2020-11-16:09:37:25,117 INFO     [launcher.py:210] Service params: <pycloud_cli.deployment_params.ServiceDeploymentParams object at 0x7fd3600cf860>
2020-11-16:09:37:25,117 INFO     [docker.py:70] Service deployment params: <pycloud_cli.deployment_params.ServiceDeploymentParams object at 0x7fd3600cf860>
2020-11-16:09:37:25,461 INFO     [docker.py:87] Starting container: pycloud-man

PyCloud cluster deployed with id: pycloud-cluster-726315
Internal services:
  id: mnist-trainer
    endpoints:
      train_mnist@mnist.ipynb
      load_data@mnist.ipynb
  id: pycloud-management
    endpoints:
      get_cluster_info@pycloud.monitoring_service.monitoring
      request_history@pycloud.monitoring_service.monitoring
      metrics_graph@pycloud.monitoring_service.monitoring
      authenticate@pycloud.monitoring_service.monitoring
      user_metrics@pycloud.monitoring_service.monitoring
  id: influxdb
Exposed services:
  id: api
    hosts: 
      HTTP localhost:5001
      GRPC localhost:43849
    endpoints:
      train@mnist.ipynb
  id: pycloud-console
    hosts: 
      HTTP localhost:57669
Console credentials: {'username': 'pycloud', 'password': 'demo'}


### Send a request to the API with lists of hyperparameters; receive a list of (lr, epochs, achieved_accuracy)

In [7]:
# Call the exposed `api` service on its HTTP port (5001). The endpoint id and
# the two JSON-encoded hyperparameter lists are sent as form fields.
!curl localhost:5001 -F "endpoint_id=train@mnist.ipynb" -F "learning_rates=[0.1, 0.01, 0.001]" -F "epochs_list=[10, 20, 30]"

[[0.1, 10, 0.8432000279426575], [0.1, 20, 0.8336166739463806], [0.1, 30, 0.8153166770935059], [0.01, 10, 0.9847833514213562], [0.01, 20, 0.9913666844367981], [0.01, 30, 0.9948833584785461], [0.001, 10, 0.9912333488464355], [0.001, 20, 0.9987333416938782], [0.001, 30, 0.9999666810035706]]

## Best training accuracy achieved: 0.99997 (learning rate 0.001, 30 epochs)