In [None]:
#!pip install opencv-python

In [None]:
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import pandas as pd
import os
import io
from sklearn.model_selection import train_test_split
import gcsfs
from google.cloud import bigquery, storage
from google.oauth2 import credentials
import numpy as np
import cv2 as cv
import tensorflow as tf
from PIL import Image

In [None]:
def get_blob(blobs):
    for blob in blobs:
        yield blob

In [None]:
def generate_images(input_dir, image_dir, transform):

    # initialize the GCS client
    storage_client = storage.Client()

    # get the storage bucket
    bucket = storage_client.get_bucket('spectrain')

    # Note: Client.list_blobs requires at least package version 1.17.0.
    blobs = storage_client.list_blobs('spectrain', prefix=input_dir)
    
    # Note: The call returns a response only when the iterator is consumed.
    for blob in get_blob(blobs):
        if("output" in blob.name):
            if transform:
                
                plt.figure(figsize=(200,40), clear=True)
                data = blob.download_as_bytes()
                img = Image.open(io.BytesIO(data)) 
                img=np.array(img)
                edges = cv.Canny(img,100,200)
                plt.axis('off')
                buf = io.BytesIO()
                edges = Image.fromarray(edges)
                edges.save(buf, "png")
                filename = blob.name.split('.')[0] # remove the suffix/file extension
                filename = filename.split('/')[-1] # remove the containing directory name from filename
                upload_blob = bucket.blob(image_dir+filename+'.png')
                upload_blob.upload_from_file(buf, content_type='image/png', rewind=True)
                buf.close()
                plt.close("all")
                plt.close()
                
            else:
                plt.figure(figsize=(300,40), clear=True)
                df=pd.read_csv("gs://spectrain/"+blob.name, sep='\s', header=None)
                df.columns=['x_axis', 'y_axis']
                plt.plot(df['x_axis'], df['y_axis'])
                plt.axis('off')
                buf = io.BytesIO()
                filename = blob.name.split('.')[0] # remove the suffix/file extension
                filename = filename.split('/')[-1] # remove the containing directory name from filename
                plt.savefig(buf, format="png")
                upload_blob = bucket.blob(image_dir+filename+'.png')
                upload_blob.upload_from_file(buf, content_type='image/png', rewind=True)
                buf.close()
                plt.close("all")
                plt.close()

In [None]:
input_dir='spec_train_output/processed_images'
output_image_dir='bhavani/transformed_images/'
transform=True
generate_images(input_dir, output_image_dir, transform)

In [None]:
# Create import file for AutoML IMAGE CLASSIFICATION

df=pd.read_csv('gs://qwiklabs-asl-00-c812c3b423f2/spec_train_output/input/Kidney_TX_data.csv')
df['Spectrum_file_new'] = df.Spectrum_file.str.split('\.').str[0]
df['Spectrum_file_new'] = "gs://spectrain/spec_train_output/images/" + df['Spectrum_file_new'] + '_nmr.png'
df = df[['Spectrum_file_new', 'Case']]
df.to_csv('gs://spectrain/spec_train_output/image_dir_paths_labels1.csv', index=False, header=None)

In [None]:
# Create data split column for structured data

df=pd.read_csv('gs://qwiklabs-asl-00-c812c3b423f2/spec_train_output/input/Kidney_TX_data.csv')
X=df.drop(columns=['Case'])
y=df[['Case']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, stratify=y_train, random_state=42)
X_train['data_split'] = 'TRAIN'
X_val['data_split'] = 'VALID'
X_test['data_split'] = 'TEST'
X_train['Case'] = y_train
X_val['Case'] = y_val
X_test['Case'] = y_test
final_df=pd.concat([X_train, X_val, X_test])
final_df.to_csv('gs://spectrain/Kidney_TX_data_with_split.csv', index=False)

In [None]:
# Create data split column for images data import file
data_split_df=pd.read_csv('gs://spectrain/Kidney_TX_data_with_split.csv')
image_paths_df=pd.read_csv('gs://spectrain/spec_train_output/image_dir_paths_labels1.csv', header=None)
data_split_df['Spectrum_file_new'] = data_split_df.Spectrum_file.str.split('\.').str[0]
data_split_df['Spectrum_file_new'] = "gs://spectrain/spec_train_output/images/" + data_split_df['Spectrum_file_new'] + '_nmr.png'
data_split_df = data_split_df[['Spectrum_file_new', 'data_split']]
image_paths_df.columns=['Spectrum_file_new', 'Case']
image_paths_df=pd.merge(image_paths_df, data_split_df, on=['Spectrum_file_new'], how='inner')
image_paths_df=image_paths_df[['data_split', 'Spectrum_file_new', 'Case']]
image_paths_df['data_split'] = image_paths_df.data_split.replace({'TRAIN':'TRAINING', 'VALID':'VALIDATION', 'TEST':'TEST'})
image_paths_df.to_csv('gs://spectrain/spec_train_output/image_dir_paths_labels_with_split.csv', header=None, index=False)

In [None]:
from sklearn.linear_model import LogisticRegression
model=LogisticRegression.fit(X_train, y_train)
pred_probas=model.predict_proba(X_test, y_test)

In [None]:
def copy_images(df, split_name):
    pos_files=df[df.label==1].proc_img_dir.values.tolist()
    neg_files=df[df.label==0].proc_img_dir.values.tolist()
    for (label, class_dir) in [('positive', pos_files), ('negative', neg_files)]:
        print(split_name, label)
        for files in class_dir:
            filename = files.split('/')[-1] # remove the containing directory name from filename
            !gsutil cp {files} gs://spectrain/bhavani/{split_name}_images/{label}/{filename}
 
    

In [None]:
def create_train_test_dir(image_file_path):
    df = pd.read_csv(image_file_path, header=None)
    df.columns=['split', 'directory', 'label']
    df["proc_img_dir"] = "gs://spectrain/bhavani/transformed_images/"+df.directory.str.split('/').str[-1]
    train_df = df[df.split=='TRAINING']
    valid_df = df[df.split=='VALIDATION']
    test_df = df[df.split=='TEST']
    copy_images(train_df, 'train')
    #copy_images(valid_df, 'valid')
    #copy_images(test_df, 'test')
    

In [None]:
image_file_path = 'gs://spectrain/spec_train_output/image_dir_paths_labels_with_split.csv'
#create_train_test_dir(image_file_path)

In [None]:
ds=image_dataset_from_directory(images_dir, batch_size, image_size=(1600, 25600), shuffle=True, seed=1)
ds.repeat()

In [None]:
 #### START THE CNN MODEL ####

In [191]:
storage_client = storage.Client()

# get the storage bucket
bucket = storage_client.get_bucket('spectrain')
#this is blobs <google.api_core.page_iterator.HTTPIterator object at 0x7f05b2b50a10>
image_paths=[]
# Note: Client.list_blobs requires at least package version 1.17.0.
blobs = storage_client.list_blobs('spectrain', prefix='/bhavani/train_images')
print('this is blobs', blobs)
for blob in get_blob(blobs):
    if "output" in blob.name:
        print(blob.name)

this is blobs <google.api_core.page_iterator.HTTPIterator object at 0x7f0bc0099fd0>


Updated property [core/project].
Updated property [ai/region].


In [None]:
import tensorflow as tf

def get_image_paths(split_name="train"):
    # initialize the GCS client
    storage_client = storage.Client()

    # get the storage bucket
    bucket = storage_client.get_bucket('spectrain')

    image_paths=[]
    # Note: Client.list_blobs requires at least package version 1.17.0.
    blobs = storage_client.list_blobs('spectrain', prefix="bhavani/train_images/")
    
    for blob in get_blob(blobs):
        if "output" in blob.name:
            print(blob.name)
            #image_paths.append('gs://spectrain/'+blob.name)
            
    return image_paths
'''
def load_images(imagePath, split_name="train"):
    # read the image from disk, decode it, convert the data type to
    # floating point, and resize it
    
    image = tf.io.read_file(imagePath)
    image = tf.image.decode_png(image, channels=1)
    image = tf.image.convert_image_dtype(image, dtype=tf.float32)
    image = tf.image.resize(image, (256,256))
    # parse the class label from the file path
    label = tf.strings.split(imagePath, os.path.sep)[-2]

    # return the image and the label
    return (image, label)

trainPaths = get_image_paths(split_name)

trainDS = tf.data.Dataset.from_tensor_slices(trainPaths)
trainDS = (trainDS
    .shuffle(len(trainPaths))
    .map(load_images)
    .cache()
    .batch(64)
)
'''

In [None]:
imagePath="gs://spectrain/bhavani/test_images/positive/output_NormalizationTool_spectrum_zgpr30-urine-600MHz-310K_12_575_00000_withoutBackground_20001_nmr.png"
image = tf.io.read_file(imagePath)
image = tf.image.decode_png(image, channels=1)
image = tf.image.convert_image_dtype(image, dtype=tf.float32)
image = tf.image.resize(image, (256,256))
# parse the class label from the file path
print(tf.strings.split(imagePath, os.path.sep))
label = tf.strings.split(imagePath, os.path.sep)[-2]
print(label)

In [13]:
### Build Custom CNN model

In [68]:
import os

In [202]:
PROJECT = !gcloud config list --format 'value(core.project)'
PROJECT = PROJECT[0]
BUCKET = !gcloud storage ls
BUCKET = BUCKET[-1].split("//")[-1]
REGION = "us-central1"

In [203]:
os.environ["PROJECT"] = PROJECT
os.environ["BUCKET"] = BUCKET
os.environ["REGION"] = REGION

In [204]:
%%bash
gcloud config set project ${PROJECT}
gcloud config set ai/region ${REGION}

Updated property [core/project].
Updated property [ai/region].


In [72]:
# Create an init file to identify the following code as a package

In [73]:
%%bash
mkdir -p spectrain_proc_img/trainer
touch spectrain_proc_img/trainer/__init__.py

In [74]:
# Create a file to parse the arguments
# We will use this later to parse arguments when training the model

In [80]:
%%writefile spectrain_proc_img/trainer/task.py
import argparse
import json
import os

from trainer import model

import tensorflow as tf

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--train_data_path",
        help="GCS location of training data",
        required=True
    )
    parser.add_argument(
        "--eval_data_path",
        help="GCS location of evaluation data",
        required=True
    )
    parser.add_argument(
        "--output_dir",
        help="GCS location to write checkpoints and export models",
        default = os.getenv("AIP_MODEL_DIR")
    )
    parser.add_argument(
        "--batch_size",
        help="Number of examples to compute gradient over.",
        type=int,
        default=64
    )
    parser.add_argument(
        "--nnsize_1",
        help="Hidden layer sizes for DNN -- provide space-separated layers",
        default=512
    )
    parser.add_argument(
        "--nnsize_2",
        help="Hidden layer sizes for DNN -- provide space-separated layers",
        default=64
    )
    parser.add_argument(
        "--ksize",
        help="Hidden layer sizes for DNN -- provide space-separated layers",
        default=4
    )
    parser.add_argument(
        "--pool_ksize",
        help="Hidden layer sizes for DNN -- provide space-separated layers",
        default=2
    )
    parser.add_argument(
        "--filt_size1",
        help="Hidden layer sizes for DNN -- provide space-separated layers",
        default=64
    )
    parser.add_argument(
        "--filt_size2",
        help="Hidden layer sizes for DNN -- provide space-separated layers",
        default=32
    )
    parser.add_argument(
        "--num_epochs",
        help="Number of epochs to train the model.",
        type=int,
        default=10
    )
    parser.add_argument(
        "--train_examples",
        help="""Number of examples (in thousands) to run the training job over.
        If this is more than actual # of examples available, it cycles through
        them. So specifying 1000 here when you have only 100k examples makes
        this 10 epochs.""",
        type=int,
        default=5000
    )
    parser.add_argument(
        "--eval_steps",
        help="""Positive number of steps for which to evaluate model. Default
        to None, which means to evaluate until input_fn raises an end-of-input
        exception""",
        type=int,
        default=None
    )

    # Parse all arguments
    args = parser.parse_args()
    arguments = args.__dict__

    # Modify some arguments
    arguments["train_examples"] *= 100

    # Run the training job
    model.train_and_evaluate(arguments)

Overwriting spectrain_proc_img/trainer/task.py


In [81]:
# Place all the preprocessing, model building, training and evaluation code in this cell to package in
# model.py to later train directly in vertex ai

In [163]:
%%writefile spectrain_proc_img/trainer/model.py
import datetime
import os
import shutil
import numpy as np
import tensorflow as tf
import hypertune
import numpy as np
from google.cloud import bigquery, storage
from google.oauth2 import credentials
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import (Conv1D, Dense, Dropout, Flatten, MaxPooling1D, Softmax)

def get_blob(blobs):
    for blob in blobs:
        yield blob
        
def get_image_paths(image_input_dir):
    # initialize the GCS client
    image_bucket = image_input_dir.split('/')[2]
    prefix_dir = '/'.join(image_input_dir.split('/')[3:])
    prefix_dir = prefix_dir[1:]
    storage_client = storage.Client()
    # get the storage bucket
    bucket = storage_client.get_bucket(image_bucket)

    image_paths=[]
    # Note: Client.list_blobs requires at least package version 1.17.0.
    blobs = storage_client.list_blobs(image_bucket, prefix=prefix_dir)
    
    for blob in get_blob(blobs):
        if "output" in blob.name:
            image_paths.append('gs://spectrain/'+blob.name)
    return image_paths

def load_images(imagePath):
    # read the image from disk, decode it, convert the data type to
    # floating point, and resize it
    
    image = tf.io.read_file(imagePath)
    image = tf.image.decode_png(image, channels=1)
    image = tf.image.convert_image_dtype(image, dtype=tf.float32)
    image = tf.image.resize(image, (256,256))
    # parse the class label from the file path
    label = tf.strings.split(imagePath, os.path.sep)[-2]
    if label=='positive':
        label=1
    else:
        label=0
    # return the image and the label
    return (image, label)

    # return the image and the label
    return (image, label)

def load_dataset(images_dir, batch_size, training):
    
    filePaths = get_image_paths(image_input_dir=images_dir)
    ds = tf.data.Dataset.from_tensor_slices(filePaths)
    ds = (ds
        .map(load_images)
        .cache()
        .shuffle(len(filePaths))
        .batch(batch_size)
    )

    if training:
        return ds.repeat()
    else:
        return ds

def build_model(filter_size_1, filter_size_2, kernel_size, pool_kernel_size, hidden_units_1, hidden_units_2):
    model = Sequential()
    model.add(Conv1D(filter_size_1, kernel_size=kernel_size, activation='relu', input_shape=(256, 256)))
    model.add(Conv1D(filter_size_1, kernel_size=kernel_size, activation='relu'))
    model.add(MaxPooling1D(pool_kernel_size))
    model.add(Conv1D(filter_size_2, kernel_size=kernel_size,activation='relu'))
    model.add(MaxPooling1D(pool_kernel_size))
    model.add(Flatten())
    model.add(Dense(hidden_units_1, activation='relu'))
    model.add(Dense(hidden_units_1, activation='relu'))
    model.add(Dense(hidden_units_2, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    
    
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['AUC'])
    
    return model

    
# Instantiate the HyperTune reporting object
hpt = hypertune.HyperTune()

# Reporting callback
class HPTCallback(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs=None):
        global hpt
        hpt.report_hyperparameter_tuning_metric(
            hyperparameter_metric_tag='auc',
            metric_value=logs['val_auc'],
            global_step=epoch)
        
        
def train_and_evaluate(args):
    model = build_model(filter_size_1=args["filt_size1"], filter_size_2=args["filt_size2"], 
                        kernel_size=args['ksize'], pool_kernel_size=args['pool_ksize']
                        , hidden_units_1=args['nnsize_1'], hidden_units_2=args['nnsize_2'])

    trainds = load_dataset(args["train_data_path"], args["batch_size"], training=True)

    evalds = load_dataset(args["eval_data_path"], args["batch_size"], training=False)
    
    if args["eval_steps"]:
        evalds = evalds.take(count=args["eval_steps"])

    num_batches = args["batch_size"] * args["num_epochs"]
    steps_per_epoch = args["train_examples"] // args["batch_size"]
    checkpoint_path = os.path.join(args["output_dir"], "checkpoints/spectrain_proc_img")
    cp_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path, verbose=1, save_weights_only=True)

    history = model.fit(
        trainds,
        validation_data=evalds,
        epochs=args["batch_size"],
        steps_per_epoch=steps_per_epoch,
        verbose=2,
        callbacks=[cp_callback, HPTCallback()])
    
    EXPORT_PATH = os.path.join(
        args["output_dir"], datetime.datetime.now().strftime("%Y%m%d%H%M%S"))
    tf.saved_model.save(
        obj=model, export_dir=EXPORT_PATH)  # with default serving function
    
    print("Exported trained model to {}".format(EXPORT_PATH))

Overwriting spectrain_proc_img/trainer/model.py


In [164]:
#### Train the model locally to check if everything is good #########

In [165]:
%%bash
OUTDIR=spectrain_proc_img_trained
rm -rf ${OUTDIR}
export PYTHONPATH=${PYTHONPATH}:${PWD}/spectrain_proc_img
python3 -m trainer.task \
    --train_data_path=gs://${BUCKET}/bhavani/train_images \
    --eval_data_path=gs://${BUCKET}/bhavani/valid_images \
    --output_dir=${OUTDIR} \
    --batch_size=10 \
    --num_epochs=1 \
    --train_examples=1 \
    --eval_steps=1

Epoch 1/10

Epoch 1: saving model to spectrain_proc_img_trained/checkpoints/spectrain_proc_img
10/10 - 129s - loss: 0.6852 - auc: 0.4377 - val_loss: 0.6479 - val_auc: 0.2857 - 129s/epoch - 13s/step
Epoch 2/10

Epoch 2: saving model to spectrain_proc_img_trained/checkpoints/spectrain_proc_img
10/10 - 0s - loss: 0.6046 - auc: 0.5490 - val_loss: 0.6386 - val_auc: 0.4286 - 466ms/epoch - 47ms/step
Epoch 3/10

Epoch 3: saving model to spectrain_proc_img_trained/checkpoints/spectrain_proc_img
10/10 - 0s - loss: 0.6643 - auc: 0.5830 - val_loss: 0.6530 - val_auc: 0.7292 - 413ms/epoch - 41ms/step
Epoch 4/10

Epoch 4: saving model to spectrain_proc_img_trained/checkpoints/spectrain_proc_img
10/10 - 0s - loss: 0.6412 - auc: 0.5767 - val_loss: 0.6194 - val_auc: 0.0938 - 348ms/epoch - 35ms/step
Epoch 5/10

Epoch 5: saving model to spectrain_proc_img_trained/checkpoints/spectrain_proc_img
10/10 - 0s - loss: 0.6297 - auc: 0.6029 - val_loss: 0.5142 - val_auc: 0.8750 - 423ms/epoch - 42ms/step
Epoch 6/10

2023-06-14 07:06:00.147020: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2023-06-14 07:06:00.147078: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2023-06-14 07:06:00.147103: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (asl): /proc/driver/nvidia/version does not exist
2023-06-14 07:06:00.147413: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-14 07:06:11.909

In [None]:
#### The following is the code to train the model on vertex ai with a randomly selected hyperparameters ####

In [179]:
%%writefile spectrain_proc_img/setup.py
from setuptools import find_packages
from setuptools import setup

setup(
    name='spectrain_proc_img_trainer',
    version='0.1',
    packages=find_packages(),
    include_package_data=True,
    description='spectrain edge detected image model training application.'
)

Overwriting spectrain_proc_img/setup.py


In [180]:
# Create a local directory to store source distribution package

In [181]:
%%bash
cd spectrain_proc_img
python ./setup.py sdist --formats=gztar
cd ..

running sdist
running egg_info
writing spectrain_proc_img_trainer.egg-info/PKG-INFO
writing dependency_links to spectrain_proc_img_trainer.egg-info/dependency_links.txt
writing top-level names to spectrain_proc_img_trainer.egg-info/top_level.txt
reading manifest file 'spectrain_proc_img_trainer.egg-info/SOURCES.txt'
writing manifest file 'spectrain_proc_img_trainer.egg-info/SOURCES.txt'
running check
creating spectrain_proc_img_trainer-0.1
creating spectrain_proc_img_trainer-0.1/spectrain_proc_img_trainer.egg-info
creating spectrain_proc_img_trainer-0.1/trainer
copying files to spectrain_proc_img_trainer-0.1...
copying setup.py -> spectrain_proc_img_trainer-0.1
copying spectrain_proc_img_trainer.egg-info/PKG-INFO -> spectrain_proc_img_trainer-0.1/spectrain_proc_img_trainer.egg-info
copying spectrain_proc_img_trainer.egg-info/SOURCES.txt -> spectrain_proc_img_trainer-0.1/spectrain_proc_img_trainer.egg-info
copying spectrain_proc_img_trainer.egg-info/dependency_links.txt -> spectrain_pro




In [207]:
%%bash
gsutil cp spectrain_proc_img/dist/spectrain_proc_img_trainer-0.1.tar.gz gs://${BUCKET}/spectrain_cnn/

Copying file://spectrain_proc_img/dist/spectrain_proc_img_trainer-0.1.tar.gz [Content-Type=application/x-tar]...
/ [1 files][  3.0 KiB/  3.0 KiB]                                                
Operation completed over 1 objects/3.0 KiB.                                      


In [None]:
# Submit model training to vertex ai with specific random hyperparameters
# and passing them as arguments through .yaml file to task.py file

In [197]:
%%bash

TIMESTAMP=$(date -u +%Y%m%d_%H%M%S)
OUTDIR=gs://${BUCKET}/spectrain_cnn/trained_model_$TIMESTAMP
JOB_NAME=spectrain_cnn_$TIMESTAMP

PYTHON_PACKAGE_URI=gs://${BUCKET}/spectrain_cnn/spectrain_proc_img_trainer-0.1.tar.gz
PYTHON_PACKAGE_EXECUTOR_IMAGE_URI="us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-8:latest"
PYTHON_MODULE=trainer.task

echo > ./config.yaml "workerPoolSpecs:
  machineSpec:
    machineType: n1-standard-4
  replicaCount: 1
  pythonPackageSpec:
    executorImageUri: $PYTHON_PACKAGE_EXECUTOR_IMAGE_URI
    packageUris: $PYTHON_PACKAGE_URI
    pythonModule: $PYTHON_MODULE
    args:
    - --train_data_path=gs://${BUCKET}/bhavani/train_images
    - --eval_data_path=gs://${BUCKET}/bhavani/valid_images
    - --output_dir=$OUTDIR
    - --num_epochs=10
    - --train_examples=10000
    - --eval_steps=100
    - --batch_size=32"

gcloud ai custom-jobs create \
  --region=${REGION} \
  --display-name=$JOB_NAME \
  --config=config.yaml

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
CustomJob [projects/469700469475/locations/us-central1/customJobs/6257327133304029184] is submitted successfully.

Your job is still active. You may view the status of your job with the command

  $ gcloud ai custom-jobs describe projects/469700469475/locations/us-central1/customJobs/6257327133304029184

or continue streaming the logs with the command

  $ gcloud ai custom-jobs stream-logs projects/469700469475/locations/us-central1/customJobs/6257327133304029184


In [None]:
### The training of model in vertex ai code ends here #############

In [None]:
##### The following is the code for hyper parameter tuning ############

In [None]:
# Perfomr hyper parameter tuning using arguments in .YAML file

In [208]:
%%bash
TIMESTAMP=$(date -u +%Y%m%d_%H%M%S)
BASE_OUTPUT_DIR=gs://${BUCKET}/spectrain_cnn/hp_tuning_$TIMESTAMP
JOB_NAME=spectrain_cnn_hpt_$TIMESTAMP

PYTHON_PACKAGE_URI=gs://${BUCKET}/spectrain_cnn/spectrain_proc_img_trainer-0.1.tar.gz
PYTHON_PACKAGE_EXECUTOR_IMAGE_URI="us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-8:latest"
PYTHON_MODULE=trainer.task

echo > ./hyperparam.yaml "displayName: $JOB_NAME
studySpec:
  metrics:
  - metricId: val_auc
    goal: MAXIMIZE
  parameters:
  - parameterId: batch_size
    integerValueSpec:
      minValue: 8
      maxValue: 64
    scaleType: UNIT_LOG_SCALE
  - parameterId: filt_size1
    integerValueSpec:
      minValue: 16
      maxValue: 64
    scaleType: UNIT_LINEAR_SCALE
  - parameterId: filt_size2
    integerValueSpec:
      minValue: 8
      maxValue: 32
    scaleType: UNIT_LINEAR_SCALE
  algorithm: ALGORITHM_UNSPECIFIED # results in Bayesian optimization
trialJobSpec:
  baseOutputDirectory:
    outputUriPrefix: $BASE_OUTPUT_DIR
  workerPoolSpecs:
  - machineSpec:
      machineType: n1-standard-8
    pythonPackageSpec:
      executorImageUri: $PYTHON_PACKAGE_EXECUTOR_IMAGE_URI
      packageUris:
      - $PYTHON_PACKAGE_URI
      pythonModule: $PYTHON_MODULE
      args:
      - --train_data_path=gs://${BUCKET}/bhavani/train_images
      - --eval_data_path=gs://${BUCKET}/bhavani/valid_images
      - --num_epochs=10
      - --train_examples=5000
      - --eval_steps=100
      - --batch_size=32
    replicaCount: 1"
        
gcloud ai hp-tuning-jobs create \
    --region=$REGION \
    --display-name=$JOB_NAME \
    --config=hyperparam.yaml \
    --max-trial-count=20 \
    --parallel-trial-count=5

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
ERROR: (gcloud.ai.hp-tuning-jobs.create) FAILED_PRECONDITION: The Cloud Storage bucket of `gs://spectrain//spectrain_cnn/hp_tuning_20230614_073544` is in location `us`. It must be in the same regional location as the service location `us-central1`.


CalledProcessError: Command 'b'TIMESTAMP=$(date -u +%Y%m%d_%H%M%S)\nBASE_OUTPUT_DIR=gs://${BUCKET}/spectrain_cnn/hp_tuning_$TIMESTAMP\nJOB_NAME=spectrain_cnn_hpt_$TIMESTAMP\n\nPYTHON_PACKAGE_URI=gs://${BUCKET}/spectrain_cnn/spectrain_proc_img_trainer-0.1.tar.gz\nPYTHON_PACKAGE_EXECUTOR_IMAGE_URI="us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-8:latest"\nPYTHON_MODULE=trainer.task\n\necho > ./hyperparam.yaml "displayName: $JOB_NAME\nstudySpec:\n  metrics:\n  - metricId: val_auc\n    goal: MAXIMIZE\n  parameters:\n  - parameterId: batch_size\n    integerValueSpec:\n      minValue: 8\n      maxValue: 64\n    scaleType: UNIT_LOG_SCALE\n  - parameterId: filt_size1\n    integerValueSpec:\n      minValue: 16\n      maxValue: 64\n    scaleType: UNIT_LINEAR_SCALE\n  - parameterId: filt_size2\n    integerValueSpec:\n      minValue: 8\n      maxValue: 32\n    scaleType: UNIT_LINEAR_SCALE\n  algorithm: ALGORITHM_UNSPECIFIED # results in Bayesian optimization\ntrialJobSpec:\n  baseOutputDirectory:\n    outputUriPrefix: $BASE_OUTPUT_DIR\n  workerPoolSpecs:\n  - machineSpec:\n      machineType: n1-standard-8\n    pythonPackageSpec:\n      executorImageUri: $PYTHON_PACKAGE_EXECUTOR_IMAGE_URI\n      packageUris:\n      - $PYTHON_PACKAGE_URI\n      pythonModule: $PYTHON_MODULE\n      args:\n      - --train_data_path=gs://${BUCKET}/bhavani/train_images\n      - --eval_data_path=gs://${BUCKET}/bhavani/valid_images\n      - --num_epochs=10\n      - --train_examples=5000\n      - --eval_steps=100\n      - --batch_size=32\n    replicaCount: 1"\n        \ngcloud ai hp-tuning-jobs create \\\n    --region=$REGION \\\n    --display-name=$JOB_NAME \\\n    --config=hyperparam.yaml \\\n    --max-trial-count=20 \\\n    --parallel-trial-count=5\n'' returned non-zero exit status 1.

In [None]:
# Take the best hyperparameters after fine-tuning and train the final model
%%bash
TIMESTAMP=$(date -u +%Y%m%d_%H%M%S)
OUTDIR=gs://${BUCKET}/babyweight/tuned_$TIMESTAMP
JOB_NAME=babyweight_tuned_$TIMESTAMP

PYTHON_PACKAGE_URI=gs://${BUCKET}/babyweight/babyweight_trainer-0.1.tar.gz
PYTHON_PACKAGE_EXECUTOR_IMAGE_URI="us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-8:latest"
PYTHON_MODULE=trainer.task

echo > ./tuned_config.yaml "workerPoolSpecs:
  machineSpec:
    machineType: n1-standard-8
  replicaCount: 1
  pythonPackageSpec:
    executorImageUri: $PYTHON_PACKAGE_EXECUTOR_IMAGE_URI
    packageUris: $PYTHON_PACKAGE_URI
    pythonModule: $PYTHON_MODULE
    args:
    - --train_data_path=gs://${BUCKET}/babyweight/data/train*.csv
    - --eval_data_path=gs://${BUCKET}/babyweight/data/eval*.csv
    - --output_dir=$OUTDIR
    - --num_epochs=10
    - --train_examples=20000
    - --eval_steps=100
    - --batch_size=32
    - --nembeds=8"
    
gcloud ai custom-jobs create \
  --region=${REGION} \
  --display-name=$JOB_NAME \
  --config=tuned_config.yaml

In [None]:
%%bash
OUTDIR=local_training
rm -rf ${OUTDIR}
export PYTHONPATH=${PYTHONPATH}:${PWD}/spectrain_image_model
python3 -m trainer.task \
    --train_data_path=gs://${BUCKET}/spectrain_image_model/train*.csv \
    --eval_data_path=gs://${BUCKET}/spectrain_image_model/eval*.csv \
    --output_dir=${OUTDIR} \
    --batch_size=32 \
    --num_epochs=1 \
    --train_examples=10