In [1]:
#pip install cloudml-hypertune

In [58]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [45]:
import os

In [33]:
def get_blob(blobs):
    for blob in blobs:
        yield blob
        
def get_image_paths(image_input_dir):
    # initialize the GCS client
    image_bucket = image_input_dir.split('/')[2]
    prefix_dir = '/'.join(image_input_dir.split('/')[3:])
    storage_client = storage.Client()

    # get the storage bucket
    bucket = storage_client.get_bucket(image_bucket)

    image_paths=[]
    # Note: Client.list_blobs requires at least package version 1.17.0.
    blobs = storage_client.list_blobs(image_bucket, prefix=prefix_dir)
    
    for blob in get_blob(blobs):
        if "output" in blob.name:
            
            image_paths.append('gs://spectrain/'+blob.name)
            
    return image_paths

def load_images(imagePath):
    # read the image from disk, decode it, convert the data type to
    # floating point, and resize it
    
    image = tf.io.read_file(imagePath)
    image = tf.image.decode_png(image, channels=1)
    image = tf.image.convert_image_dtype(image, dtype=tf.float32)
    image = tf.image.resize(image, (256,256))
    # parse the class label from the file path
    label = tf.strings.split(imagePath, os.path.sep)[-2]
    if label=='positive':
        label=1
    else:
        label=0

    # return the image and the label
    return (image, label)

def load_dataset(images_dir, batch_size, training):
    
    filePaths = get_image_paths(image_input_dir=images_dir)

    ds = tf.data.Dataset.from_tensor_slices(filePaths)
    ds = (ds
        .shuffle(len(filePaths))
        .map(load_images)
        .cache()
        .batch(batch_size)
    )

    if training:
        return ds.repeat()
    else:
        return ds

In [34]:
import datetime
import os
import shutil
import numpy as np
import tensorflow as tf
import hypertune
import numpy as np
from google.cloud import bigquery, storage
from google.oauth2 import credentials
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import (Conv1D, Dense, Dropout, Flatten, MaxPooling1D, Softmax)
def build_model(filter_size_1, filter_size_2, kernel_size, pool_kernel_size, hidden_units_1, hidden_units_2):
    model = Sequential()
    model.add(Conv1D(filter_size_1, kernel_size=kernel_size, activation='relu', input_shape=(256, 256)))
    model.add(Conv1D(filter_size_1, kernel_size=kernel_size, activation='relu'))
    model.add(MaxPooling1D(pool_kernel_size))
    model.add(Conv1D(filter_size_2, kernel_size=kernel_size,activation='relu'))
    model.add(MaxPooling1D(pool_kernel_size))
    model.add(Flatten())
    model.add(Dense(hidden_units_1, activation='relu'))
    model.add(Dense(hidden_units_1, activation='relu'))
    model.add(Dense(hidden_units_2, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation="sigmoid"))

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['AUC'])
    
    return model

2023-06-14 06:14:35.388013: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-14 06:14:38.209818: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2023-06-14 06:14:38.209915: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/loca

In [35]:
model = build_model(filter_size_1=16, filter_size_2=8, 
                        kernel_size=2, pool_kernel_size=2
                        , hidden_units_1=128, hidden_units_2=128)

trainds = load_dataset(images_dir="gs://spectrain/bhavani/valid_images", batch_size=16, training=True)
evalds = load_dataset(images_dir="gs://spectrain/bhavani/test_images", batch_size=16, training=True)

2023-06-14 06:14:59.409138: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-06-14 06:14:59.421237: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-06-14 06:14:59.422862: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-06-14 06:14:59.425117: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorF

In [44]:
from tensorflow.keras.callbacks import TensorBoard
hpt = hypertune.HyperTune()

# Reporting callback
class HPTCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        global hpt
        hpt.report_hyperparameter_tuning_metric(
            hyperparameter_metric_tag='auc',
            metric_value=logs['val_auc'],
            global_step=epoch)
        
history = model.fit(
        trainds,
        validation_data=evalds,
        epochs=16,
        steps_per_epoch=10,
        verbose=2,
        validation_steps=10,
    callbacks=[HPTCallback()])

Epoch 1/16
10/10 - 0s - loss: 0.2117 - auc: 0.9988 - val_loss: 1.1370 - val_auc: 0.5530 - 111ms/epoch - 11ms/step
Epoch 2/16
10/10 - 0s - loss: 0.0865 - auc: 1.0000 - val_loss: 1.3106 - val_auc: 0.5522 - 104ms/epoch - 10ms/step
Epoch 3/16
10/10 - 0s - loss: 0.0223 - auc: 1.0000 - val_loss: 2.0109 - val_auc: 0.5697 - 107ms/epoch - 11ms/step
Epoch 4/16
10/10 - 0s - loss: 0.0112 - auc: 1.0000 - val_loss: 3.3081 - val_auc: 0.5724 - 106ms/epoch - 11ms/step
Epoch 5/16
10/10 - 0s - loss: 0.0321 - auc: 0.9991 - val_loss: 3.4563 - val_auc: 0.5413 - 103ms/epoch - 10ms/step
Epoch 6/16
10/10 - 0s - loss: 0.0279 - auc: 0.9996 - val_loss: 2.0279 - val_auc: 0.5425 - 105ms/epoch - 10ms/step
Epoch 7/16
10/10 - 0s - loss: 0.0708 - auc: 0.9991 - val_loss: 2.3020 - val_auc: 0.5319 - 104ms/epoch - 10ms/step
Epoch 8/16
10/10 - 0s - loss: 0.1123 - auc: 0.9909 - val_loss: 3.1677 - val_auc: 0.5557 - 105ms/epoch - 11ms/step
Epoch 9/16
10/10 - 0s - loss: 0.0782 - auc: 0.9914 - val_loss: 1.9534 - val_auc: 0.5179 

In [46]:
PROJECT = !gcloud config list --format 'value(core.project)'
PROJECT = PROJECT[0]
BUCKET = !gcloud storage ls
BUCKET = BUCKET[-1].split("//")[-1]
REGION = "us-central1"

In [47]:
os.environ["PROJECT"] = PROJECT
os.environ["BUCKET"] = BUCKET
os.environ["REGION"] = REGION

In [48]:
%%bash
gcloud config set project ${PROJECT}
gcloud config set ai/region ${REGION}

Updated property [core/project].
Updated property [ai/region].


In [6]:
# Create an init file to identify the following code as a package

In [51]:
%%bash
mkdir -p spectrain_proc_img/trainer
touch spectrain_proc_img/trainer/__init__.py

In [52]:
# Create a file to parse the arguments
# We will use this later to parse arguments when training the model

In [53]:
%%writefile spectrain_proc_img/trainer/task.py
import argparse
import json
import os

from trainer import model

import tensorflow as tf

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--train_data_path",
        help="GCS location of training data",
        required=True
    )
    parser.add_argument(
        "--eval_data_path",
        help="GCS location of evaluation data",
        required=True
    )
    parser.add_argument(
        "--output_dir",
        help="GCS location to write checkpoints and export models",
        default = os.getenv("AIP_MODEL_DIR")
    )
    parser.add_argument(
        "--batch_size",
        help="Number of examples to compute gradient over.",
        type=int,
        default=64
    )
    parser.add_argument(
        "--nnsize_1",
        help="Hidden layer sizes for DNN -- provide space-separated layers",
        default=512
    )
    parser.add_argument(
        "--nnsize_2",
        help="Hidden layer sizes for DNN -- provide space-separated layers",
        default=64
    )
    parser.add_argument(
        "--ksize",
        help="Hidden layer sizes for DNN -- provide space-separated layers",
        default=4
    )
    parser.add_argument(
        "--pool_ksize",
        help="Hidden layer sizes for DNN -- provide space-separated layers",
        default=2
    )
    parser.add_argument(
        "--filt_size1",
        help="Hidden layer sizes for DNN -- provide space-separated layers",
        default=64
    )
    parser.add_argument(
        "--filt_size2",
        help="Hidden layer sizes for DNN -- provide space-separated layers",
        default=32
    )
    parser.add_argument(
        "--num_epochs",
        help="Number of epochs to train the model.",
        type=int,
        default=10
    )
    parser.add_argument(
        "--train_examples",
        help="""Number of examples (in thousands) to run the training job over.
        If this is more than actual # of examples available, it cycles through
        them. So specifying 1000 here when you have only 100k examples makes
        this 10 epochs.""",
        type=int,
        default=5000
    )
    parser.add_argument(
        "--eval_steps",
        help="""Positive number of steps for which to evaluate model. Default
        to None, which means to evaluate until input_fn raises an end-of-input
        exception""",
        type=int,
        default=None
    )

    # Parse all arguments
    args = parser.parse_args()
    arguments = args.__dict__

    # Modify some arguments
    arguments["train_examples"] *= 100

    # Run the training job
    model.train_and_evaluate(arguments)

Overwriting spectrain_proc_img/trainer/task.py


In [54]:
# Place all the preprocessing, model building, training and evaluation code in this cell to package in
# model.py to later train directly in vertex ai

In [80]:
%%writefile spectrain_proc_img/trainer/model.py
import datetime
import os
import shutil
import numpy as np
import tensorflow as tf
import hypertune
import numpy as np
from google.cloud import bigquery, storage
from google.oauth2 import credentials
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import (Conv1D, Dense, Dropout, Flatten, MaxPooling1D, Softmax)

def get_blob(blobs):
    for blob in blobs:
        yield blob
        
def get_image_paths(image_input_dir):
    # initialize the GCS client
    image_bucket = image_input_dir.split('/')[2]
    prefix_dir = '/'.join(image_input_dir.split('/')[3:])
    prefix_dir = prefix_dir[1:]
    storage_client = storage.Client()

    # get the storage bucket
    bucket = storage_client.get_bucket(image_bucket)

    image_paths=[]
    # Note: Client.list_blobs requires at least package version 1.17.0.
    blobs = storage_client.list_blobs(image_bucket, prefix=prefix_dir)
    
    for blob in get_blob(blobs):
        if "output" in blob.name:
            image_paths.append('gs://spectrain/'+blob.name)
            
    return image_paths

def load_images(imagePath):
    # read the image from disk, decode it, convert the data type to
    # floating point, and resize it
    
    image = tf.io.read_file(imagePath)
    image = tf.image.decode_png(image, channels=1)
    image = tf.image.convert_image_dtype(image, dtype=tf.float32)
    image = tf.image.resize(image, (256,256))
    # parse the class label from the file path
    label = tf.strings.split(imagePath, os.path.sep)[-2]
    if label=='positive':
        label=1
    else:
        label=0
    # return the image and the label
    return (image, label)

def load_dataset(images_dir, batch_size, training):
    
    filePaths = get_image_paths(image_input_dir=images_dir)
    ds = tf.data.Dataset.from_tensor_slices(filePaths)
    ds = (ds
        .map(load_images)
        .cache()
        .shuffle(len(filePaths))
        .batch(batch_size)
    )

    if training:
        return ds.repeat()
    else:
        return ds

def build_model(filter_size_1, filter_size_2, kernel_size, pool_kernel_size, hidden_units_1, hidden_units_2):
    model = Sequential()
    model.add(Conv1D(filter_size_1, kernel_size=kernel_size, activation='relu', input_shape=(256, 256)))
    model.add(Conv1D(filter_size_1, kernel_size=kernel_size, activation='relu'))
    model.add(MaxPooling1D(pool_kernel_size))
    model.add(Conv1D(filter_size_2, kernel_size=kernel_size,activation='relu'))
    model.add(MaxPooling1D(pool_kernel_size))
    model.add(Flatten())
    model.add(Dense(hidden_units_1, activation='relu'))
    model.add(Dense(hidden_units_1, activation='relu'))
    model.add(Dense(hidden_units_2, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['AUC'])
    
    return model

    
# Instantiate the HyperTune reporting object
hpt = hypertune.HyperTune()

# Reporting callback
class HPTCallback(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs=None):
        global hpt
        hpt.report_hyperparameter_tuning_metric(
            hyperparameter_metric_tag='auc',
            metric_value=logs['val_auc'],
            global_step=epoch)
        
        
def train_and_evaluate(args):
    model = build_model(filter_size_1=args["filt_size1"], filter_size_2=args["filt_size2"], 
                        kernel_size=args['ksize'], pool_kernel_size=args['pool_ksize']
                        , hidden_units_1=args['nnsize_1'], hidden_units_2=args['nnsize_2'])

    trainds = load_dataset(args["train_data_path"], args["batch_size"], training=True)

    evalds = load_dataset(args["eval_data_path"], args["batch_size"], training=False)
    
    if args["eval_steps"]:
        evalds = evalds.take(count=args["eval_steps"])

    num_batches = args["batch_size"] * args["num_epochs"]
    steps_per_epoch = args["train_examples"] // args["batch_size"]
    
    checkpoint_path = os.path.join(args["output_dir"], "checkpoints/spectrain_proc_img")
    cp_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path, verbose=1, save_weights_only=True)

    history = model.fit(
        trainds,
        validation_data=evalds,
        epochs=args["batch_size"],
        steps_per_epoch=steps_per_epoch,
        verbose=2,
        callbacks=[cp_callback, HPTCallback()])
    
    EXPORT_PATH = os.path.join(
        args["output_dir"], datetime.datetime.now().strftime("%Y%m%d%H%M%S"))
    tf.saved_model.save(
        obj=model, export_dir=EXPORT_PATH)  # with default serving function
    
    print("Exported trained model to {}".format(EXPORT_PATH))

Overwriting spectrain_proc_img/trainer/model.py


In [81]:
#### Train the model locally to check if everything is good #########

In [82]:
%%bash
OUTDIR=spectrain_proc_img_trained
rm -rf ${OUTDIR}
export PYTHONPATH=${PYTHONPATH}:${PWD}/spectrain_proc_img
python3 -m trainer.task \
    --train_data_path=gs://${BUCKET}/bhavani/train_images \
    --eval_data_path=gs://${BUCKET}/bhavani/valid_images \
    --output_dir=${OUTDIR} \
    --batch_size=10 \
    --num_epochs=1 \
    --train_examples=10 \
    --eval_steps=10

2023-06-14 07:05:07.967708: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-14 07:05:10.740396: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2023-06-14 07:05:10.740534: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/loca

Epoch 1/10


2023-06-14 07:05:27.683796: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 105 of 1064
2023-06-14 07:05:37.679213: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 210 of 1064
2023-06-14 07:05:47.654155: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 313 of 1064
2023-06-14 07:05:57.656797: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 423 of 1064
2023-06-14 07:06:07.699094: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 521 of 1064
2023-06-14 07:06:17.866936: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 629 of 1064
2023-06-14 07:06:27.642595: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle b

CalledProcessError: Command 'b'OUTDIR=spectrain_proc_img_trained\nrm -rf ${OUTDIR}\nexport PYTHONPATH=${PYTHONPATH}:${PWD}/spectrain_proc_img\npython3 -m trainer.task \\\n    --train_data_path=gs://${BUCKET}/bhavani/train_images \\\n    --eval_data_path=gs://${BUCKET}/bhavani/valid_images \\\n    --output_dir=${OUTDIR} \\\n    --batch_size=10 \\\n    --num_epochs=1 \\\n    --train_examples=10 \\\n    --eval_steps=10\n'' returned non-zero exit status 1.

In [None]:
#### The following is the code to train the model on vertex ai with a randomly selected hyperparameters ####

In [None]:
# Create a package source distribution

In [83]:
%%writefile spectrain_proc_img/setup.py
from setuptools import find_packages
from setuptools import setup

setup(
    name='spectrain_proc_img_trainer',
    version='0.1',
    packages=find_packages(),
    include_package_data=True,
    description='spectrain edge detected image model training application.'
)

Writing spectrain_proc_img/setup.py


In [None]:
# Create a local directory to store source distribution package

In [None]:
%%bash
cd spectrain_proc_img
python ./setup.py sdist --formats=gztar
cd ..

In [None]:
# Store the package in the cloud bucket

In [None]:
%%bash
gsutil cp spectrain_proc_img/dist/spectrain_proc_img_trainer-0.1.tar.gz gs://${BUCKET}/spectrain_cnn/

In [None]:
# Submit model training to vertex ai with specific random hyperparameters
# and passing them as arguments through .yaml file to task.py file

In [None]:
%%bash

TIMESTAMP=$(date -u +%Y%m%d_%H%M%S)
OUTDIR=gs://${BUCKET}/spectrain_cnn/trained_model_$TIMESTAMP
JOB_NAME=spectrain_cnn_$TIMESTAMP

PYTHON_PACKAGE_URI=gs://${BUCKET}/spectrain_cnn/spectrain_proc_img_trainer-0.1.tar.gz
PYTHON_PACKAGE_EXECUTOR_IMAGE_URI="us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-8:latest"
PYTHON_MODULE=trainer.task

echo > ./config.yaml "workerPoolSpecs:
  machineSpec:
    machineType: n1-standard-4
  replicaCount: 1
  pythonPackageSpec:
    executorImageUri: $PYTHON_PACKAGE_EXECUTOR_IMAGE_URI
    packageUris: $PYTHON_PACKAGE_URI
    pythonModule: $PYTHON_MODULE
    args:
    - --train_data_path=gs://${BUCKET}/bhavani/train_images
    - --eval_data_path=gs://${BUCKET}/bhavani/valid_images
    - --output_dir=$OUTDIR
    - --num_epochs=10
    - --train_examples=10000
    - --eval_steps=100
    - --batch_size=32"

gcloud ai custom-jobs create \
  --region=${REGION} \
  --display-name=$JOB_NAME \
  --config=config.yaml

In [None]:
### The training of model in vertex ai code ends here #############

In [None]:
##### The following is the code for hyper parameter tuning ############

In [None]:
# Perfomr hyper parameter tuning using arguments in .YAML file

In [None]:
%%bash
TIMESTAMP=$(date -u +%Y%m%d_%H%M%S)
BASE_OUTPUT_DIR=gs://${BUCKET}/spectrain_cnn/hp_tuning_$TIMESTAMP
JOB_NAME=spectrain_cnn_hpt_$TIMESTAMP

PYTHON_PACKAGE_URI=gs://${BUCKET}/spectrain_cnn/spectrain_proc_img_trainer-0.1.tar.gz
PYTHON_PACKAGE_EXECUTOR_IMAGE_URI="us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-8:latest"
PYTHON_MODULE=trainer.task

echo > ./hyperparam.yaml "displayName: $JOB_NAME
studySpec:
  metrics:
  - metricId: val_rmse
    goal: MINIMIZE
  parameters:
  - parameterId: batch_size
    integerValueSpec:
      minValue: 8
      maxValue: 512
    scaleType: UNIT_LOG_SCALE
  - parameterId: nembeds
    integerValueSpec:
      minValue: 3
      maxValue: 30
    scaleType: UNIT_LINEAR_SCALE
  algorithm: ALGORITHM_UNSPECIFIED # results in Bayesian optimization
trialJobSpec:
  baseOutputDirectory:
    outputUriPrefix: $BASE_OUTPUT_DIR
  workerPoolSpecs:
  - machineSpec:
      machineType: n1-standard-8
    pythonPackageSpec:
      executorImageUri: $PYTHON_PACKAGE_EXECUTOR_IMAGE_URI
      packageUris:
      - $PYTHON_PACKAGE_URI
      pythonModule: $PYTHON_MODULE
      args:
      - --train_data_path=gs://${BUCKET}/bhavani/train_images
      - --eval_data_path=gs://${BUCKET}/bhavani/valid_images
      - --num_epochs=10
      - --train_examples=5000
      - --eval_steps=100
      - --batch_size=32
      - --nembeds=8
    replicaCount: 1"
        
gcloud ai hp-tuning-jobs create \
    --region=$REGION \
    --display-name=$JOB_NAME \
    --config=hyperparam.yaml \
    --max-trial-count=20 \
    --parallel-trial-count=5

In [None]:
# Take the best hyperparameters after fine-tuning and train the final model

In [None]:
%%bash
TIMESTAMP=$(date -u +%Y%m%d_%H%M%S)
OUTDIR=gs://${BUCKET}/spectrain_cnn/tuned_$TIMESTAMP
JOB_NAME=spectrain_cnn_tuned_$TIMESTAMP

PYTHON_PACKAGE_URI=gs://${BUCKET}/spectrain_cnn/spectrain_proc_img_trainer-0.1.tar.gz
PYTHON_PACKAGE_EXECUTOR_IMAGE_URI="us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-8:latest"
PYTHON_MODULE=trainer.task

echo > ./tuned_config.yaml "workerPoolSpecs:
  machineSpec:
    machineType: n1-standard-8
  replicaCount: 1
  pythonPackageSpec:
    executorImageUri: $PYTHON_PACKAGE_EXECUTOR_IMAGE_URI
    packageUris: $PYTHON_PACKAGE_URI
    pythonModule: $PYTHON_MODULE
    args:
    - --train_data_path=gs://${BUCKET}/bhavani/train_images
    - --eval_data_path=gs://${BUCKET}/bhavani/valid_images
    - --output_dir=$OUTDIR
    - --num_epochs=10
    - --train_examples=20000
    - --eval_steps=100
    - --batch_size=32
    - --nembeds=8"
    
gcloud ai custom-jobs create \
  --region=${REGION} \
  --display-name=$JOB_NAME \
  --config=tuned_config.yaml