In [8]:
#pip install cloudml-hypertune
import os

In [9]:
# Capture the active gcloud project; the `!` shell capture returns the command's
# output lines as a list, so the first line is taken as the project id.
PROJECT = !gcloud config list --format 'value(core.project)'
PROJECT = PROJECT[0]
# Use the last line printed by `gcloud storage ls` and keep only the text after "//".
# NOTE(review): if the listing prints bucket URLs with a trailing "/", BUCKET keeps
# that slash and every "gs://${BUCKET}/..." path below contains "//" -- confirm.
BUCKET = !gcloud storage ls
BUCKET = BUCKET[-1].split("//")[-1]
# Region exported to the shell cells and used for the Vertex AI job.
REGION = "us-central1"

In [10]:
# Export the notebook-level settings so the %%bash cells can read them.
os.environ.update(
    {
        "PROJECT": PROJECT,
        "BUCKET": BUCKET,
        "REGION": REGION,
    }
)

In [11]:
%%bash
# Point the gcloud CLI at the project and Vertex AI region exported by the notebook.
# Expansions are quoted so an empty or odd value is passed as one argument.
gcloud config set project "${PROJECT}"
gcloud config set ai/region "${REGION}"

Updated property [core/project].
Updated property [ai/region].


In [None]:
# Create an init file to identify the following code as a package

In [12]:
%%bash
# Create the trainer package directory and the empty __init__.py that marks it as a package.
PACKAGE_DIR=spectrain_proc_img/trainer
mkdir -p "${PACKAGE_DIR}"
touch "${PACKAGE_DIR}/__init__.py"

In [None]:
# Create a file to parse the arguments
# We will use this later to parse arguments when training the model

In [13]:
%%writefile spectrain_proc_img/trainer/task.py
import argparse
import json
import os

from trainer import model

import tensorflow as tf

if __name__ == "__main__":
    # Command-line entry point: parse the job arguments and hand them to
    # model.train_and_evaluate as a plain dict.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--train_data_path",
        help="GCS location of training data",
        required=True
    )
    parser.add_argument(
        "--eval_data_path",
        help="GCS location of evaluation data",
        required=True
    )
    parser.add_argument(
        "--output_dir",
        help="GCS location to write checkpoints and export models",
        default=os.getenv("AIP_MODEL_DIR")
    )
    parser.add_argument(
        "--batch_size",
        help="Number of examples to compute gradient over.",
        type=int,
        default=64
    )
    parser.add_argument(
        "--nnsize_1",
        help="Number of units in each of the first two dense layers.",
        type=int,
        default=256
    )
    parser.add_argument(
        "--nnsize_2",
        help="Number of units in the third dense layer.",
        type=int,
        default=128
    )
    parser.add_argument(
        "--ksize",
        help="Kernel size of the Conv1D layers.",
        type=int,
        default=4
    )
    parser.add_argument(
        "--pool_ksize",
        help="Pool size of the MaxPooling1D layers.",
        type=int,
        default=2
    )
    parser.add_argument(
        "--filt_size1",
        help="Number of filters in each of the first two Conv1D layers.",
        type=int,
        default=64
    )
    parser.add_argument(
        "--filt_size2",
        help="Number of filters in the third Conv1D layer.",
        type=int,
        default=32
    )
    parser.add_argument(
        "--num_epochs",
        help="Number of epochs to train the model.",
        type=int,
        default=10
    )
    parser.add_argument(
        "--train_examples",
        help="""Number of examples (in hundreds) to run the training job over.
        If this is more than actual # of examples available, it cycles through
        them. So specifying 10000 here when you have only 100k examples makes
        this 10 epochs.""",
        type=int,
        default=5000
    )
    parser.add_argument(
        "--eval_steps",
        help="""Positive number of steps for which to evaluate model. Default
        to None, which means to evaluate until input_fn raises an end-of-input
        exception""",
        type=int,
        default=None
    )

    # Parse all arguments
    args = parser.parse_args()

    # Without an output location the trainer would fail later with a TypeError
    # while joining paths, so reject the invocation up front.
    if not args.output_dir:
        parser.error("--output_dir is required when AIP_MODEL_DIR is not set")

    arguments = vars(args)

    # The flag is expressed in hundreds of examples; convert to an absolute count
    arguments["train_examples"] *= 100

    # Run the training job
    model.train_and_evaluate(arguments)

Overwriting spectrain_proc_img/trainer/task.py


In [None]:
# Put all preprocessing, model-building, training, and evaluation code in the next cell; it is written to
# model.py so the package can later be trained directly on Vertex AI

In [6]:
%%writefile spectrain_proc_img/trainer/model.py
import datetime
import os
import shutil
import numpy as np
import tensorflow as tf
import hypertune
import numpy as np
from google.cloud import bigquery, storage
from google.oauth2 import credentials
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import (Conv1D, Dense, Dropout, Flatten, MaxPooling1D, Softmax)

def get_blob(blobs):
    """Lazily yield every item of ``blobs`` in iteration order."""
    yield from blobs
        
def get_image_paths(image_input_dir):
    """List the GCS URIs of the "output" images stored under a directory.

    Args:
        image_input_dir: URI of the form ``gs://<bucket>/<prefix>``. Extra
            slashes between the bucket and the prefix are tolerated.

    Returns:
        A list of ``gs://<bucket>/<blob name>`` strings, one for every blob
        under the prefix whose name contains ``"output"``.
    """
    # "gs://bucket/a/b" splits into ["gs:", "", "bucket", "a", "b"]
    uri_parts = image_input_dir.split('/')
    image_bucket = uri_parts[2]
    # Strip only leading slashes (left by a doubled "/" after the bucket name)
    # rather than unconditionally dropping the first character of the prefix.
    prefix_dir = '/'.join(uri_parts[3:]).lstrip('/')

    storage_client = storage.Client()

    # Note: Client.list_blobs requires at least package version 1.17.0.
    blobs = storage_client.list_blobs(image_bucket, prefix=prefix_dir)

    # Build the URIs from the bucket that was actually listed, not a fixed name.
    return [
        'gs://{}/{}'.format(image_bucket, blob.name)
        for blob in blobs
        if "output" in blob.name
    ]

def load_images(imagePath):
    """Read one PNG and derive its binary label from the parent directory name.

    Args:
        imagePath: scalar string path of the image file.

    Returns:
        A tuple ``(image, label)``: ``image`` is the single-channel picture
        converted to float32 and resized to 256x256; ``label`` is an int32
        value, 1 when the parent directory is named ``positive`` and 0 otherwise.
    """
    # read the image from disk, decode it, convert the data type to
    # floating point, and resize it
    image = tf.io.read_file(imagePath)
    image = tf.image.decode_png(image, channels=1)
    image = tf.image.convert_image_dtype(image, dtype=tf.float32)
    image = tf.image.resize(image, (256, 256))

    # parse the class label from the file path; GCS object names always use
    # "/" as the separator, independent of the host OS
    parent_dir = tf.strings.split(imagePath, "/")[-2]
    # Tensor-native comparison, so the result does not depend on AutoGraph
    # rewriting a Python `if` on a tensor.
    label = tf.cast(tf.equal(parent_dir, "positive"), tf.int32)

    # return the image and the label
    return (image, label)

def load_dataset(images_dir, batch_size, training):
    """Build a batched ``tf.data`` pipeline of (image, label) pairs.

    Args:
        images_dir: ``gs://`` directory passed to ``get_image_paths``.
        batch_size: number of examples per batch.
        training: when true the dataset repeats indefinitely.

    Returns:
        A ``tf.data.Dataset`` that decodes, caches, shuffles, and batches the
        images, and prefetches batches.

    Raises:
        ValueError: if no image is found under ``images_dir``.
    """
    file_paths = get_image_paths(image_input_dir=images_dir)
    if not file_paths:
        # An empty list would otherwise fail inside tf.data with an obscure
        # dtype / shuffle-buffer error.
        raise ValueError("No images found under {}".format(images_dir))

    ds = tf.data.Dataset.from_tensor_slices(file_paths)
    ds = (
        ds.map(load_images, num_parallel_calls=tf.data.AUTOTUNE)
        .cache()
        .shuffle(len(file_paths))
        .batch(batch_size)
    )

    if training:
        ds = ds.repeat()

    # Overlap input preparation with model execution.
    return ds.prefetch(tf.data.AUTOTUNE)

def build_model(filter_size_1, filter_size_2, ksize, pool_kernel_size, hidden_units_1, hidden_units_2):
    """Assemble and compile the Conv1D binary classifier.

    Args:
        filter_size_1: filters of the first two Conv1D layers.
        filter_size_2: filters of the third Conv1D layer.
        ksize: kernel size shared by all Conv1D layers.
        pool_kernel_size: pool size of both MaxPooling1D layers.
        hidden_units_1: units of the first two Dense layers.
        hidden_units_2: units of the third Dense layer.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output, trained
        with Adam on binary cross-entropy and tracking AUC.
    """
    layer_stack = [
        # Convolutional feature extractor over (256, 256) inputs
        Conv1D(filter_size_1, kernel_size=ksize, activation='relu', input_shape=(256, 256)),
        Conv1D(filter_size_1, kernel_size=ksize, activation='relu'),
        MaxPooling1D(pool_kernel_size),
        Conv1D(filter_size_2, kernel_size=ksize, activation='relu'),
        MaxPooling1D(pool_kernel_size),
        Flatten(),
        # Dense classifier head
        Dense(hidden_units_1, activation='relu'),
        Dense(hidden_units_1, activation='relu'),
        Dense(hidden_units_2, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layer_stack)

    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['AUC'],
    )

    return network

    
# Module-level HyperTune reporter used by the callback below
hpt = hypertune.HyperTune()


class HPTCallback(tf.keras.callbacks.Callback):
    """Keras callback that reports the validation AUC through HyperTune."""

    def on_epoch_end(self, epoch, logs=None):
        """Report ``logs['val_auc']`` under the ``auc`` tag, using the epoch as the step."""
        validation_auc = logs['val_auc']
        hpt.report_hyperparameter_tuning_metric(
            hyperparameter_metric_tag='auc',
            metric_value=validation_auc,
            global_step=epoch,
        )
        
        
def train_and_evaluate(args):
    """Build, train, checkpoint, and export the model described by ``args``.

    Args:
        args: dict of settings. Keys read: ``filt_size1``, ``filt_size2``,
            ``ksize``, ``pool_ksize``, ``nnsize_1``, ``nnsize_2`` (converted
            with ``int``), ``train_data_path``, ``eval_data_path``,
            ``batch_size``, ``num_epochs``, ``train_examples``,
            ``eval_steps`` and ``output_dir``.

    Side effects:
        Writes weight checkpoints under ``<output_dir>/checkpoints`` and a
        SavedModel under ``<output_dir>/<timestamp>``; reports the validation
        AUC through ``HPTCallback`` after every epoch.
    """
    model = build_model(
        filter_size_1=int(args["filt_size1"]),
        filter_size_2=int(args["filt_size2"]),
        ksize=int(args["ksize"]),
        pool_kernel_size=int(args["pool_ksize"]),
        hidden_units_1=int(args["nnsize_1"]),
        hidden_units_2=int(args["nnsize_2"]),
    )

    trainds = load_dataset(args["train_data_path"], args["batch_size"], training=True)

    evalds = load_dataset(args["eval_data_path"], args["batch_size"], training=False)

    # Optionally cap the number of validation batches
    if args["eval_steps"]:
        evalds = evalds.take(count=args["eval_steps"])

    # The training dataset repeats indefinitely, so an epoch is defined by a
    # step count. Clamp to one step so a small train_examples value cannot
    # produce an epoch with zero steps.
    steps_per_epoch = max(1, args["train_examples"] // args["batch_size"])

    checkpoint_path = os.path.join(args["output_dir"], "checkpoints/spectrain_proc_img")
    cp_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path, verbose=1, save_weights_only=True)

    model.fit(
        trainds,
        validation_data=evalds,
        epochs=args["num_epochs"],
        steps_per_epoch=steps_per_epoch,
        verbose=2,
        callbacks=[cp_callback, HPTCallback()])

    # Export into a timestamped sub-directory of the output location
    EXPORT_PATH = os.path.join(
        args["output_dir"], datetime.datetime.now().strftime("%Y%m%d%H%M%S"))
    tf.saved_model.save(
        obj=model, export_dir=EXPORT_PATH)  # with default serving function

    print("Exported trained model to {}".format(EXPORT_PATH))

Overwriting spectrain_proc_img/trainer/model.py


In [None]:
#### The following code trains the model on Vertex AI with randomly selected hyperparameters ####

In [15]:
%%writefile spectrain_proc_img/setup.py
from setuptools import find_packages
from setuptools import setup

# Third-party packages imported by trainer/model.py besides TensorFlow.
# Declaring them lets pip install any that the executor image lacks.
# NOTE(review): TensorFlow is assumed to come from the executor image -- confirm.
REQUIRED_PACKAGES = [
    'cloudml-hypertune',
    'google-cloud-storage',
    'google-cloud-bigquery',
]

setup(
    name='spectrain_proc_img_trainer',
    version='0.1',
    packages=find_packages(),
    include_package_data=True,
    install_requires=REQUIRED_PACKAGES,
    description='spectrain edge detected image model training application.'
)

Overwriting spectrain_proc_img/setup.py


In [None]:
# Build the source distribution package (setup.py sdist writes it to spectrain_proc_img/dist)

In [16]:
%%bash
# Build the gzipped source distribution from inside the package directory.
# The subshell keeps the cell's working directory unchanged, and `&&` stops
# setup.py from running elsewhere if the cd fails; a build failure now
# propagates as the cell's exit status instead of being masked by `cd ..`.
(cd spectrain_proc_img && python ./setup.py sdist --formats=gztar)

running sdist
running egg_info
writing spectrain_proc_img_trainer.egg-info/PKG-INFO
writing dependency_links to spectrain_proc_img_trainer.egg-info/dependency_links.txt
writing top-level names to spectrain_proc_img_trainer.egg-info/top_level.txt
reading manifest file 'spectrain_proc_img_trainer.egg-info/SOURCES.txt'
writing manifest file 'spectrain_proc_img_trainer.egg-info/SOURCES.txt'
running check
creating spectrain_proc_img_trainer-0.1
creating spectrain_proc_img_trainer-0.1/spectrain_proc_img_trainer.egg-info
creating spectrain_proc_img_trainer-0.1/trainer
copying files to spectrain_proc_img_trainer-0.1...
copying setup.py -> spectrain_proc_img_trainer-0.1
copying spectrain_proc_img_trainer.egg-info/PKG-INFO -> spectrain_proc_img_trainer-0.1/spectrain_proc_img_trainer.egg-info
copying spectrain_proc_img_trainer.egg-info/SOURCES.txt -> spectrain_proc_img_trainer-0.1/spectrain_proc_img_trainer.egg-info
copying spectrain_proc_img_trainer.egg-info/dependency_links.txt -> spectrain_pro




In [17]:
%%bash
# Upload the built trainer package to the bucket so the Vertex AI job can fetch it.
PACKAGE_TARBALL=spectrain_proc_img/dist/spectrain_proc_img_trainer-0.1.tar.gz
PACKAGE_DEST="gs://${BUCKET}/spectrain_cnn/"
gsutil cp "${PACKAGE_TARBALL}" "${PACKAGE_DEST}"

Copying file://spectrain_proc_img/dist/spectrain_proc_img_trainer-0.1.tar.gz [Content-Type=application/x-tar]...
/ [1 files][  3.0 KiB/  3.0 KiB]                                                
Operation completed over 1 objects/3.0 KiB.                                      


In [None]:
# Submit a hyperparameter tuning job to Vertex AI. The search space and the fixed trainer
# arguments are written to a .yaml config file; the trainer's task.py receives them as command-line arguments

In [18]:
%%bash
# Write the hyperparameter-tuning job spec and submit it to Vertex AI.
TIMESTAMP=$(date -u +%Y%m%d_%H%M%S)
BASE_OUTPUT_DIR="gs://${BUCKET}/spectrain_cnn/hp_tuning_${TIMESTAMP}"
JOB_NAME="spectrain_cnn_hpt_${TIMESTAMP}"

PYTHON_PACKAGE_URI="gs://${BUCKET}/spectrain_cnn/spectrain_proc_img_trainer-0.1.tar.gz"
PYTHON_PACKAGE_EXECUTOR_IMAGE_URI="us-docker.pkg.dev/vertex-ai/training/tf-gpu.2-8:latest"
PYTHON_MODULE=trainer.task

# batch_size is a tuned parameter, so it is not repeated in the fixed args
# below (a fixed --batch_size would conflict with the per-trial value).
cat > ./hyperparam2.yaml <<EOF
displayName: ${JOB_NAME}
studySpec:
  metrics:
  - metricId: auc
    goal: MAXIMIZE
  parameters:
  - parameterId: batch_size
    discreteValueSpec:
      values:
      - 32
      - 64
  - parameterId: nnsize_1
    discreteValueSpec:
      values:
      - 256
      - 512
  - parameterId: nnsize_2
    discreteValueSpec:
      values:
      - 64
      - 128
  - parameterId: filt_size1
    discreteValueSpec:
      values:
      - 16
      - 32
      - 64
  - parameterId: filt_size2
    discreteValueSpec:
      values:
      - 8
      - 16
      - 32
  algorithm: ALGORITHM_UNSPECIFIED # results in Bayesian optimization
trialJobSpec:
  baseOutputDirectory:
    outputUriPrefix: ${BASE_OUTPUT_DIR}
  workerPoolSpecs:
  - machineSpec:
      machineType: n1-standard-8
      acceleratorType: NVIDIA_TESLA_V100
      acceleratorCount: 1
    pythonPackageSpec:
      executorImageUri: ${PYTHON_PACKAGE_EXECUTOR_IMAGE_URI}
      packageUris:
      - ${PYTHON_PACKAGE_URI}
      pythonModule: ${PYTHON_MODULE}
      args:
      - --train_data_path=gs://${BUCKET}/bhavani/train_images
      - --eval_data_path=gs://${BUCKET}/bhavani/valid_images
      - --num_epochs=20
      - --train_examples=5000
      - --eval_steps=100
    replicaCount: 1
EOF

gcloud ai hp-tuning-jobs create \
    --region="${REGION}" \
    --display-name="${JOB_NAME}" \
    --config=hyperparam2.yaml \
    --max-trial-count=20 \
    --parallel-trial-count=5

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
CustomJob [projects/469700469475/locations/us-central1/customJobs/8681114754829254656] is submitted successfully.

Your job is still active. You may view the status of your job with the command

  $ gcloud ai custom-jobs describe projects/469700469475/locations/us-central1/customJobs/8681114754829254656

or continue streaming the logs with the command

  $ gcloud ai custom-jobs stream-logs projects/469700469475/locations/us-central1/customJobs/8681114754829254656
