In [None]:
# Optional one-time setup (left commented out): install or upgrade the
# google-cloud-aiplatform SDK, then ask the IPython kernel to shut down with
# the restart flag set -- presumably so the freshly installed package version
# is the one that gets imported afterwards.
# !pip3 install -U google-cloud-aiplatform
# import IPython
# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

# Inner training script

In [None]:
%%writefile training_script.py

# Source: https://cloud.google.com/ai-platform-unified/docs/tutorials/image-recognition-custom

import argparse
import logging
import os

import tensorflow as tf
import tensorflow_datasets as tfds

# Side length, in pixels, of the square images used for training. It is the
# resize target in normalize_img and the height/width of the model's
# input_shape.
IMG_WIDTH = 128

def normalize_img(image):
    """Prepares a single image for the model.

    Steps applied:
    * Resize (with padding) to IMG_WIDTH x IMG_WIDTH pixels; as documented
      for this script, this takes the values from `uint8` to `float32`
    * Divide every value by 255, mapping [0, 255] onto [0, 1]

    Returns:
      A tensor with shape (IMG_WIDTH, IMG_WIDTH, 3). (3 color channels)
    """
    resized = tf.image.resize_with_pad(image, IMG_WIDTH, IMG_WIDTH)
    scaled = resized / 255.
    return scaled


def normalize_img_and_label(image, label):
    """Applies normalize_img to the image of an (image, label) example.

    The label is returned exactly as it was received; only the image is
    transformed.

    Returns:
      Tuple (image, label) where
      * image is a tensor with shape (IMG_WIDTH, IMG_WIDTH, 3). (3 color
        channels)
      * label is an unchanged integer [0, 4] representing flower type
    """
    normalized_image = normalize_img(image)
    return normalized_image, label

def get_args(argv=None):
    """Parses the command-line arguments of the training script.

    Args:
      argv: Optional list of argument strings to parse. When None (the
        default), argparse reads the arguments from `sys.argv[1:]`, which is
        the behavior callers that pass no argument get.

    Returns:
      An `argparse.Namespace` with the attributes:
      * tfds: the dataset name/URI given with `--tfds` (required)
      * lr: the learning rate given with `--lr` as a float (default 0.01)

    Raises:
      SystemExit: raised by argparse when `--tfds` is missing or an argument
        cannot be parsed; a usage message is printed to stderr.
    """
    parser = argparse.ArgumentParser(description='Flower classification sample')
    # --tfds has no sensible default: leaving it as None would only fail later
    # with an obscure error when the dataset is loaded, so require it up front.
    parser.add_argument(
        '--tfds',
        required=True,
        help='The tfds URI from https://www.tensorflow.org/datasets/ to load the data from')

    parser.add_argument(
        '--lr',
        type=float,
        default=0.01,
        help='Learning rate passed to the optimizer')

    return parser.parse_args(argv)

# Training settings
args = get_args()

# The root logger defaults to the WARNING level, which would silently drop the
# logging.info progress messages below. basicConfig is a no-op when a handler
# is already installed, so the level is also set explicitly on the root logger.
logging.basicConfig(level=logging.INFO)
logging.getLogger().setLevel(logging.INFO)

# AIP_MODEL_DIR tells the script where the trained model must be exported;
# fail early with a readable message when it is missing.
if 'AIP_MODEL_DIR' not in os.environ:
    raise KeyError(
        'The `AIP_MODEL_DIR` environment variable has not been '
        'set. See https://cloud.google.com/ai-platform-unified/docs/tutorials/image-recognition-custom/training'
    )
output_directory = os.environ['AIP_MODEL_DIR']

logging.info('Loading and preprocessing data ...')
# as_supervised=True makes the dataset yield (image, label) tuples, which is
# the signature normalize_img_and_label expects.
dataset = tfds.load(args.tfds,
                    split='train',
                    try_gcs=True,
                    shuffle_files=True,
                    as_supervised=True)
dataset = dataset.map(normalize_img_and_label,
                      num_parallel_calls=tf.data.experimental.AUTOTUNE)
# Cache the preprocessed examples before shuffling/batching so the resize work
# is not repeated on every epoch.
dataset = dataset.cache()
dataset = dataset.shuffle(1000)
dataset = dataset.batch(128)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

logging.info('Creating and training model ...')
# Small convolutional network: three Conv2D/MaxPooling2D stages followed by a
# dense head that outputs one logit per class.
model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(16,
                           3,
                           padding='same',
                           activation='relu',
                           input_shape=(IMG_WIDTH, IMG_WIDTH, 3)),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(32, 3, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(64, 3, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation="relu"),
    tf.keras.layers.Dense(5)  # 5 classes
])
# The last layer emits raw logits, hence from_logits=True in the loss.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=args.lr),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'])
model.fit(dataset, epochs=10)

logging.info(f'Exporting SavedModel to: {output_directory}')
# Add softmax layer for interpretability: the exported model returns class
# probabilities instead of the raw logits used during training.
probability_model = tf.keras.Sequential([model, tf.keras.layers.Softmax()])
probability_model.save(output_directory)

In [None]:
import sys

# Only when the notebook runs inside Google Colab (detected by the
# `google.colab` module already being loaded) authenticate the current user;
# in any other environment this cell does nothing.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

# Construct the Custom Training Job To pass into HP Tuning

In [None]:
from google.cloud import aiplatform
from google.cloud.aiplatform import hyperparameter_tuning as hpt

# Project-specific settings: replace these values with your own Google Cloud
# project and Cloud Storage staging bucket before running the notebook.
PROJECT_ID = 'sashaproject-1'
STAGING_BUCKET = 'gs://ucaip-mb-sasha-dev'
# Local file produced by the `%%writefile training_script.py` cell.
TRAINING_SCRIPT_PATH = 'training_script.py'

aiplatform.init(project=PROJECT_ID,
                staging_bucket=STAGING_BUCKET)

# Build a custom job from the local training script. The `args` list is
# forwarded to the script's command line (see its `--tfds` / `--lr` options),
# and `requirements` lists extra pip packages for the training environment.
job = aiplatform.CustomJob.from_local_script(
    display_name='my-job',
    script_path=TRAINING_SCRIPT_PATH,
    container_uri="gcr.io/cloud-aiplatform/training/tf-cpu.2-2:latest",
    requirements=["gcsfs==0.7.1"],
    args=["--tfds", "tf_flowers:3.*.*"],
    replica_count=1,
)

In [None]:
# Submit the custom job built from the local script.
# NOTE(review): with the SDK's default arguments this call presumably blocks
# until the job finishes -- confirm against the installed SDK version.
job.run()

Alternatively, you can create a Custom Job from worker pool specs. We'll reuse the Python package we created from the local script above.

In [None]:
# One worker pool with a single replica that runs the training code as a
# Python package on a prebuilt executor image.
worker_pool_specs = [
    {
        "replica_count": 1,
        "machine_spec": {
            "machine_type": "n1-standard-4",
        },
        "python_package_spec": {
            "executor_image_uri": "us-docker.pkg.dev/cloud-aiplatform/training/tf-cpu.2-4:latest",
            # Reuse the package URIs recorded in the spec of the existing `job`
            # instead of packaging the script again.
            "package_uris": job.job_spec.worker_pool_specs[0].python_package_spec.package_uris,
            "python_module": "aiplatform_custom_trainer_script.task",
            "args": ["--tfds", "tf_flowers:3.*.*"],
        },
    }
]

In [None]:
# Create a custom job directly from the worker pool specs defined in
# `worker_pool_specs`, then submit it.
job = aiplatform.CustomJob(
    display_name="test-from_worker_pool_spec",
    worker_pool_specs=worker_pool_specs,
)
job.run()
