In [93]:
import os
import sklearn

In [2]:
# Capture the active gcloud project id; `!cmd` returns the command's output
# lines as a list, so the first element is the project id itself.
PROJECT = !gcloud config list --format 'value(core.project)'
PROJECT = PROJECT[0]
# Take the last line printed by `gcloud storage ls` and drop the "gs://" scheme.
# NOTE(review): the value displayed further down is 'spectrain/', i.e. the
# trailing "/" is kept -- cells that build gs:// paths must account for it.
BUCKET = !gcloud storage ls
BUCKET = BUCKET[-1].split("//")[-1]
# Region passed to the gcloud commands in the cells below.
REGION = "us-central1"

In [3]:
# Export the notebook settings so the %%bash cells can read them as shell variables.
os.environ.update(PROJECT=PROJECT, BUCKET=BUCKET, REGION=REGION)

In [4]:
%%bash
# Point the gcloud CLI at the project and region exported by the notebook.
gcloud config set project $PROJECT
gcloud config set ai/region $REGION

Updated property [core/project].
Updated property [ai/region].


In [5]:
# Display the resolved project id and bucket name as a sanity check.
PROJECT, BUCKET

('qwiklabs-asl-00-c812c3b423f2', 'spectrain/')

In [6]:
%%bash
# Create the trainer package directory and mark it as a Python package.
PKG_DIR=spectrain_csv_dnn/trainer
mkdir -p $PKG_DIR
touch $PKG_DIR/__init__.py

In [7]:
%%writefile spectrain_csv_dnn/trainer/task.py
import argparse
import json
import os

from trainer import model

import tensorflow as tf

def build_parser():
    """Build the command-line parser for the training job.

    Returns:
        argparse.ArgumentParser: parser exposing the data locations, the
        output directory and the training hyperparameters that are handed
        to ``model.train_and_evaluate`` as a dict.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--train_data_path",
        help="GCS location of training data",
        required=True
    )
    parser.add_argument(
        "--eval_data_path",
        help="GCS location of evaluation data",
        required=True
    )
    parser.add_argument(
        "--output_dir",
        help="GCS location to write checkpoints and export models",
        default=os.getenv("AIP_MODEL_DIR")
    )
    parser.add_argument(
        "--batch_size",
        help="Number of examples to compute gradient over.",
        type=int,
        default=64
    )
    # type=int so values given on the command line (e.g. by a tuning job)
    # are validated and converted instead of being passed on as strings.
    parser.add_argument(
        "--nnsize_1",
        help="Number of units in each of the first two hidden layers.",
        type=int,
        default=512
    )
    parser.add_argument(
        "--nnsize_2",
        help="Number of units in each of the last two hidden layers.",
        type=int,
        default=64
    )
    parser.add_argument(
        "--num_epochs",
        help="Number of epochs to train the model.",
        type=int,
        default=10
    )
    parser.add_argument(
        "--train_examples",
        help="""Number of training examples (in hundreds) to process per
        epoch. The training dataset repeats, so a value larger than the
        number of examples available simply cycles through them.""",
        type=int,
        default=5000
    )
    parser.add_argument(
        "--eval_steps",
        help="""Positive number of batches to evaluate the model on. Defaults
        to None, which means the whole evaluation set is used.""",
        type=int,
        default=None
    )
    return parser


if __name__ == "__main__":
    # Parse all arguments
    arguments = vars(build_parser().parse_args())

    # --train_examples is expressed in hundreds on the command line
    arguments["train_examples"] *= 100

    # Run the training job
    model.train_and_evaluate(arguments)

Overwriting spectrain_csv_dnn/trainer/task.py


In [87]:
%%writefile spectrain_csv_dnn/trainer/model.py
import datetime
import os
import shutil
import numpy as np
import tensorflow as tf
import hypertune
import pandas as pd
import numpy as np
import sklearn
from sklearn import preprocessing
from google.cloud import bigquery, storage
from google.oauth2 import credentials
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import (Conv1D, Dense, Dropout, Flatten, MaxPooling1D, Softmax)

# Define the CKD-EPI equation function
def calculate_eGFR(row):
    """Estimate GFR for one patient record with the CKD-EPI creatinine equation.

    Args:
        row: mapping-like record providing 'Sex', 'serum_creatinine' and
            'Patient.Age.at.Biopsy'.

    Returns:
        The value of the equation for that record.
    """
    # (kappa, alpha, multiplier): any 'Sex' value other than 'male' uses the
    # second coefficient set.
    kappa, alpha, beta = (
        (0.9, -0.302, 1.0) if row['Sex'] == 'male' else (0.7, -0.241, 1.012)
    )

    ratio = row['serum_creatinine'] / kappa
    low_term = min(ratio, 1) ** alpha
    high_term = max(ratio, 1) ** (-1.2)
    age_term = 0.9938 ** row['Patient.Age.at.Biopsy']

    return 142 * low_term * high_term * age_term * beta

def get_add_var(image_input_dir):
    """Load the patient CSV and append the derived feature columns.

    Args:
        image_input_dir: path or URL of the CSV file; despite the name it is
            passed straight to ``pd.read_csv``.

    Returns:
        DataFrame with the CSV's columns plus:
        'eGFR'        -- ``calculate_eGFR`` applied to every row,
        'time.TX'     -- |Patient.Age.at.Biopsy - Patient.Age.at.TX|,
        'eGFR_bin'    -- 'eGFR' bucketed into '<60', '60-89', '>=90',
        'time.TX_bin' -- 'time.TX' bucketed into '<1 year', '>1 year'.
    """
    df = pd.read_csv(image_input_dir)
    df['eGFR'] = df.apply(calculate_eGFR, axis=1)
    df['time.TX'] = abs(df['Patient.Age.at.Biopsy'] - df['Patient.Age.at.TX'])

    # Left-closed intervals so the buckets agree with their labels:
    # [-inf, 60) -> '<60', [60, 90) -> '60-89', [90, inf) -> '>=90'.
    # (Right-closed edges at 60/89 put exactly 60 into '<60' and values
    # between 89 and 90 into '>=90'.)
    eGFR_bins = [float('-inf'), 60, 90, float('inf')]
    df['eGFR_bin'] = pd.cut(
        df['eGFR'], bins=eGFR_bins, labels=['<60', '60-89', '>=90'], right=False)

    # Default right-closed intervals: (-inf, 1] -> '<1 year', (1, inf) -> '>1 year'.
    TimeTX_bins = [float('-inf'), 1, float('inf')]
    df['time.TX_bin'] = pd.cut(
        df['time.TX'], bins=TimeTX_bins, labels=['<1 year', '>1 year'])

    return df

# Columns kept from the augmented CSV: every model input plus the label.
CSV_COLUMNS = [
    "serum_creatinine", "urea", "dimethylamine", "UA.Pro",
    "phenylacetylglutamine", "Hypertension", "trigonellin", "lactate",
    "citrate", "hippurate", "Sex", "alanine", "Diabetes", "UA.Hb",
    "eGFR", "time.TX", "eGFR_bin", "time.TX_bin", "Case",
]
LABEL_COLUMN = "Case"

# Inputs that are standardized.
NUMERICAL_COLUMNS = [
    "serum_creatinine",
    "urea",
    "dimethylamine",
    "phenylacetylglutamine",
    "trigonellin",
    "lactate",
    "citrate",
    "hippurate",
    "alanine",
    "eGFR",
    "time.TX",
]

# Inputs that are one-hot encoded.
CATEGORICAL_COLUMNS = [
    "Sex",
    "Hypertension",
    "eGFR_bin",
    "UA.Pro",
    "UA.Hb",
    "Diabetes",
    "time.TX_bin",
]

# Levels expected for each categorical column, in CATEGORICAL_COLUMNS order.
_ONE_HOT_LEVELS = [
    ("Sex", ["female", "male"]),
    ("Hypertension", ["False", "True", "unknown"]),
    ("eGFR_bin", ["<60", "60-89", ">=90"]),
    ("UA.Pro", ["False", "True", "unknown"]),
    ("UA.Hb", ["False", "True", "unknown"]),
    ("Diabetes", ["False", "True", "unknown"]),
    ("time.TX_bin", ["<1 year", ">1 year"]),
]

# Full set of "<column>_<level>" one-hot column names.
ONE_HOT_COLS = [
    column + "_" + level
    for column, levels in _ONE_HOT_LEVELS
    for level in levels
]

def transform_data(features_df):
    """Turn raw feature columns into the fixed-layout numeric matrix fed to the model.

    Numerical columns have missing values replaced by 0 and are then
    standardized with a ``StandardScaler`` fitted on this frame. Categorical
    columns have missing values replaced by 'unknown' and are one-hot encoded.

    The result always contains exactly NUMERICAL_COLUMNS followed by
    ONE_HOT_COLS, in that order. Levels that do not occur in the data become
    all-zero columns and levels not listed in ONE_HOT_COLS are dropped, so the
    layout is identical for every input file.

    Args:
        features_df: DataFrame containing at least NUMERICAL_COLUMNS and
            CATEGORICAL_COLUMNS. It is not modified.

    Returns:
        float64 DataFrame with len(NUMERICAL_COLUMNS) + len(ONE_HOT_COLS)
        columns, indexed like ``features_df``.
    """
    numeric = features_df[NUMERICAL_COLUMNS].fillna(0)

    # Cast to object before filling: Categorical columns (such as the ones
    # produced by pd.cut) only accept fill values that are existing categories.
    categorical = features_df[CATEGORICAL_COLUMNS].astype(object).fillna('unknown')

    # No drop_first here: which level comes "first" depends on the data, so
    # dropping it removed a different (and informative) column per file.
    one_hot = pd.get_dummies(categorical, columns=CATEGORICAL_COLUMNS)
    # Pin both the set and the order of the one-hot columns.
    one_hot = one_hot.reindex(columns=ONE_HOT_COLS, fill_value=0)

    # NOTE(review): the scaler is re-fitted on every call, so each file is
    # standardized with its own mean/variance rather than the training set's.
    scaler = preprocessing.StandardScaler()
    scaled_features = pd.DataFrame(
        scaler.fit_transform(numeric),
        columns=NUMERICAL_COLUMNS,
        index=features_df.index,  # keep rows aligned for the concat below
    )

    return pd.concat([scaled_features, one_hot], axis=1).astype('float64')
    
    

def load_dataset(csv_input_dir, batch_size, mode=tf.estimator.ModeKeys.EVAL):
    """Build a batched ``tf.data`` pipeline of (features, label) pairs from a CSV.

    Args:
        csv_input_dir: location of the CSV file, forwarded to ``get_add_var``.
        batch_size: number of rows per batch.
        mode: when equal to ``tf.estimator.ModeKeys.TRAIN`` the batches are
            additionally shuffled and the dataset repeats indefinitely.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    # Load, keep the modelling columns and split the label off.
    frame = get_add_var(csv_input_dir)[CSV_COLUMNS]
    labels = frame.pop(LABEL_COLUMN)
    features = transform_data(frame)

    dataset = tf.data.Dataset.from_tensor_slices((features.values, labels))
    dataset = dataset.cache()
    dataset = dataset.shuffle(len(features))
    dataset = dataset.batch(batch_size)

    # Training only: shuffle the batches and loop over the data forever.
    if mode == tf.estimator.ModeKeys.TRAIN:
        dataset = dataset.shuffle(buffer_size=1000)
        dataset = dataset.repeat()

    # Keep one batch prefetched while the previous one is being consumed.
    return dataset.prefetch(buffer_size=1)


def build_model(hidden_units_1, hidden_units_2):
    """Create and compile the binary classifier.

    The network takes 30 input features and stacks two Dense layers of
    ``hidden_units_1`` units, a Dense layer of ``hidden_units_2`` units,
    a 0.2 dropout, another Dense layer of ``hidden_units_2`` units and a
    single sigmoid output.

    Args:
        hidden_units_1: width of the first two hidden layers.
        hidden_units_2: width of the last two hidden layers.

    Returns:
        The compiled Keras ``Sequential`` model (adam optimizer,
        binary cross-entropy loss, AUC metric).
    """
    layers = [
        Dense(hidden_units_1, activation='relu', input_shape=(30,)),
        Dense(hidden_units_1, activation='relu'),
        Dense(hidden_units_2, activation='relu'),
        Dropout(0.2),
        Dense(hidden_units_2, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layers)

    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['AUC'],
    )
    return network

    
# Module-level HyperTune reporter used by the callback below.
hpt = hypertune.HyperTune()


class HPTCallback(tf.keras.callbacks.Callback):
    """Keras callback that reports the validation AUC through HyperTune after each epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Report ``logs['val_auc']`` under the metric tag 'auc', using the epoch as step."""
        validation_auc = logs['val_auc']
        hpt.report_hyperparameter_tuning_metric(
            hyperparameter_metric_tag='auc',
            metric_value=validation_auc,
            global_step=epoch,
        )
        
        
def train_and_evaluate(args):
    """Train the model, checkpoint its weights every epoch and export a SavedModel.

    Args:
        args: dict with the keys 'nnsize_1', 'nnsize_2', 'train_data_path',
            'eval_data_path', 'batch_size', 'eval_steps', 'num_epochs',
            'train_examples' and 'output_dir'.

    Raises:
        ValueError: if 'output_dir' is missing or empty.
    """
    # Fail early with a clear message instead of a TypeError from os.path.join
    # after the data has already been loaded.
    output_dir = args.get("output_dir")
    if not output_dir:
        raise ValueError(
            "output_dir is not set: pass --output_dir or define AIP_MODEL_DIR")

    model = build_model(args['nnsize_1'], args['nnsize_2'])

    trainds = load_dataset(
        args["train_data_path"], args["batch_size"], tf.estimator.ModeKeys.TRAIN)
    evalds = load_dataset(
        args["eval_data_path"], args["batch_size"], tf.estimator.ModeKeys.EVAL)

    # Optionally cap the number of evaluation batches.
    if args["eval_steps"]:
        evalds = evalds.take(count=args["eval_steps"])

    # The training dataset repeats forever, so the epoch length has to be
    # given explicitly; always run at least one step.
    steps_per_epoch = max(1, args["train_examples"] // args["batch_size"])

    checkpoint_path = os.path.join(output_dir, "checkpoints/spectrain_csv_dnn")
    cp_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path, verbose=1, save_weights_only=True)

    model.fit(
        trainds,
        validation_data=evalds,
        epochs=args["num_epochs"],
        steps_per_epoch=steps_per_epoch,
        verbose=2,
        callbacks=[cp_callback, HPTCallback()])

    # Export under a timestamped sub-directory of the output location.
    EXPORT_PATH = os.path.join(
        output_dir, datetime.datetime.now().strftime("%Y%m%d%H%M%S"))
    tf.saved_model.save(
        obj=model, export_dir=EXPORT_PATH)  # with default serving function

    print("Exported trained model to {}".format(EXPORT_PATH))

Overwriting spectrain_csv_dnn/trainer/model.py


In [88]:
%%bash
# Smoke-test the trainer locally on a tiny budget before submitting a cloud job.
OUTDIR=spectrain_csv_dnn_trained
rm -rf ${OUTDIR}
export PYTHONPATH=${PYTHONPATH}:${PWD}/spectrain_csv_dnn

# Strip an optional trailing "/" from BUCKET so the data path is well-formed
# whether or not the variable carries one.
DATA_DIR=gs://${BUCKET%/}/bhavani/csv_split

python3 -m trainer.task \
    --train_data_path=${DATA_DIR}/train.csv \
    --eval_data_path=${DATA_DIR}/valid.csv \
    --output_dir=${OUTDIR} \
    --batch_size=10 \
    --num_epochs=1 \
    --train_examples=1 \
    --eval_steps=1

2023-06-14 16:51:28.176531: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-14 16:51:30.871016: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2023-06-14 16:51:30.871146: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/loca


Epoch 1: saving model to spectrain_csv_dnn_trained/checkpoints/spectrain_csv_dnn
10/10 - 3s - loss: 0.6770 - auc: 0.4931 - val_loss: 0.6102 - val_auc: 0.1875 - 3s/epoch - 295ms/step
Exported trained model to spectrain_csv_dnn_trained/20230614165142


In [89]:
%%writefile spectrain_csv_dnn/setup.py
from setuptools import find_packages, setup

# Packaging metadata for the trainer source distribution.
setup(
    name='spectrain_csv_dnn_trainer',
    version='0.1',
    description='spectrain edge detected image model training application.',
    packages=find_packages(),
    include_package_data=True,
)

Overwriting spectrain_csv_dnn/setup.py


In [90]:
%%bash
# Build the source distribution from inside the package directory.
# Chaining with && makes a failed cd or a failed build the cell's exit status;
# a trailing `cd ..` would always report success and hide the failure.
cd spectrain_csv_dnn && python ./setup.py sdist --formats=gztar

running sdist
running egg_info
writing spectrain_csv_dnn_trainer.egg-info/PKG-INFO
writing dependency_links to spectrain_csv_dnn_trainer.egg-info/dependency_links.txt
writing top-level names to spectrain_csv_dnn_trainer.egg-info/top_level.txt
reading manifest file 'spectrain_csv_dnn_trainer.egg-info/SOURCES.txt'
writing manifest file 'spectrain_csv_dnn_trainer.egg-info/SOURCES.txt'





running check
creating spectrain_csv_dnn_trainer-0.1
creating spectrain_csv_dnn_trainer-0.1/spectrain_csv_dnn_trainer.egg-info
creating spectrain_csv_dnn_trainer-0.1/trainer
copying files to spectrain_csv_dnn_trainer-0.1...
copying setup.py -> spectrain_csv_dnn_trainer-0.1
copying spectrain_csv_dnn_trainer.egg-info/PKG-INFO -> spectrain_csv_dnn_trainer-0.1/spectrain_csv_dnn_trainer.egg-info
copying spectrain_csv_dnn_trainer.egg-info/SOURCES.txt -> spectrain_csv_dnn_trainer-0.1/spectrain_csv_dnn_trainer.egg-info
copying spectrain_csv_dnn_trainer.egg-info/dependency_links.txt -> spectrain_csv_dnn_trainer-0.1/spectrain_csv_dnn_trainer.egg-info
copying spectrain_csv_dnn_trainer.egg-info/top_level.txt -> spectrain_csv_dnn_trainer-0.1/spectrain_csv_dnn_trainer.egg-info
copying trainer/__init__.py -> spectrain_csv_dnn_trainer-0.1/trainer
copying trainer/model.py -> spectrain_csv_dnn_trainer-0.1/trainer
copying trainer/task.py -> spectrain_csv_dnn_trainer-0.1/trainer
Writing spectrain_csv_dnn_

In [91]:
%%bash
# Upload the trainer source distribution to the bucket.
# NOTE(review): BUCKET was displayed earlier as 'spectrain/' (trailing slash),
# so this destination expands with a double slash; PYTHON_PACKAGE_URI in the
# job cells is built the same way and must stay in sync with it.
gsutil cp spectrain_csv_dnn/dist/spectrain_csv_dnn_trainer-0.1.tar.gz gs://${BUCKET}/spectrain_csv_dnn/

Copying file://spectrain_csv_dnn/dist/spectrain_csv_dnn_trainer-0.1.tar.gz [Content-Type=application/x-tar]...
/ [1 files][  3.7 KiB/  3.7 KiB]                                                
Operation completed over 1 objects/3.7 KiB.                                      


In [92]:
%%bash

TIMESTAMP=$(date -u +%Y%m%d_%H%M%S)
# BUCKET may carry a trailing "/" (it is displayed as 'spectrain/' above);
# strip it for the paths built in this cell so they contain no "//".
BUCKET_NAME=${BUCKET%/}
OUTDIR=gs://${BUCKET_NAME}/spectrain_csv_dnn/trained_model_$TIMESTAMP
JOB_NAME=spectrain_csv_dnn_$TIMESTAMP
# Same location the local training run reads its CSV files from.
DATA_DIR=gs://${BUCKET_NAME}/bhavani/csv_split

# Built exactly like the destination of the gsutil cp cell so both refer to
# the same object.
PYTHON_PACKAGE_URI=gs://${BUCKET}/spectrain_csv_dnn/spectrain_csv_dnn_trainer-0.1.tar.gz
PYTHON_PACKAGE_EXECUTOR_IMAGE_URI="us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-8:latest"
PYTHON_MODULE=trainer.task

echo > ./config.yaml "workerPoolSpecs:
  machineSpec:
    machineType: n1-standard-4
  replicaCount: 1
  pythonPackageSpec:
    executorImageUri: $PYTHON_PACKAGE_EXECUTOR_IMAGE_URI
    packageUris: $PYTHON_PACKAGE_URI
    pythonModule: $PYTHON_MODULE
    args:
    - --train_data_path=${DATA_DIR}/train.csv
    - --eval_data_path=${DATA_DIR}/valid.csv
    - --output_dir=$OUTDIR
    - --num_epochs=10
    - --train_examples=4000
    - --eval_steps=100
    - --batch_size=32"

gcloud ai custom-jobs create \
  --region=${REGION} \
  --display-name=$JOB_NAME \
  --config=config.yaml

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
CustomJob [projects/469700469475/locations/us-central1/customJobs/85713391015952384] is submitted successfully.

Your job is still active. You may view the status of your job with the command

  $ gcloud ai custom-jobs describe projects/469700469475/locations/us-central1/customJobs/85713391015952384

or continue streaming the logs with the command

  $ gcloud ai custom-jobs stream-logs projects/469700469475/locations/us-central1/customJobs/85713391015952384


In [None]:
%%bash
TIMESTAMP=$(date -u +%Y%m%d_%H%M%S)
# BUCKET may carry a trailing "/" (it is displayed as 'spectrain/' above);
# strip it for the paths built in this cell so they contain no "//".
BUCKET_NAME=${BUCKET%/}
BASE_OUTPUT_DIR=gs://${BUCKET_NAME}/spectrain_csv_dnn/hp_tuning_$TIMESTAMP
JOB_NAME=spectrain_csv_dnn_hpt_$TIMESTAMP
# Same location the local training run reads its CSV files from.
DATA_DIR=gs://${BUCKET_NAME}/bhavani/csv_split

# Built exactly like the destination of the gsutil cp cell so both refer to
# the same object.
PYTHON_PACKAGE_URI=gs://${BUCKET}/spectrain_csv_dnn/spectrain_csv_dnn_trainer-0.1.tar.gz
PYTHON_PACKAGE_EXECUTOR_IMAGE_URI="us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-8:latest"
PYTHON_MODULE=trainer.task

# metricId must equal the hyperparameter_metric_tag reported by HPTCallback
# in trainer/model.py ('auc'), and every parameterId must be an argument that
# trainer/task.py accepts (batch_size, nnsize_1, nnsize_2).
# batch_size is tuned, so it is not repeated in the fixed args list.
echo > ./hyperparam.yaml "displayName: $JOB_NAME
studySpec:
  metrics:
  - metricId: auc
    goal: MAXIMIZE
  parameters:
  - parameterId: batch_size
    integerValueSpec:
      minValue: 8
      maxValue: 64
    scaleType: UNIT_LOG_SCALE
  - parameterId: nnsize_1
    integerValueSpec:
      minValue: 16
      maxValue: 64
    scaleType: UNIT_LINEAR_SCALE
  - parameterId: nnsize_2
    integerValueSpec:
      minValue: 8
      maxValue: 32
    scaleType: UNIT_LINEAR_SCALE
  algorithm: ALGORITHM_UNSPECIFIED # results in Bayesian optimization
trialJobSpec:
  baseOutputDirectory:
    outputUriPrefix: $BASE_OUTPUT_DIR
  workerPoolSpecs:
  - machineSpec:
      machineType: n1-standard-8
    pythonPackageSpec:
      executorImageUri: $PYTHON_PACKAGE_EXECUTOR_IMAGE_URI
      packageUris:
      - $PYTHON_PACKAGE_URI
      pythonModule: $PYTHON_MODULE
      args:
      - --train_data_path=${DATA_DIR}/train.csv
      - --eval_data_path=${DATA_DIR}/valid.csv
      - --num_epochs=10
      - --train_examples=5000
      - --eval_steps=100
    replicaCount: 1"

gcloud ai hp-tuning-jobs create \
    --region=$REGION \
    --display-name=$JOB_NAME \
    --config=hyperparam.yaml \
    --max-trial-count=20 \
    --parallel-trial-count=5

In [None]:
%%bash
TIMESTAMP=$(date -u +%Y%m%d_%H%M%S)
# BUCKET may carry a trailing "/" (it is displayed as 'spectrain/' above);
# strip it for the paths built in this cell so they contain no "//".
BUCKET_NAME=${BUCKET%/}
OUTDIR=gs://${BUCKET_NAME}/spectrain_csv_dnn/tuned_$TIMESTAMP
JOB_NAME=spectrain_csv_dnn_tuned_$TIMESTAMP
# Same location the local training run reads its CSV files from.
DATA_DIR=gs://${BUCKET_NAME}/bhavani/csv_split

# Built exactly like the destination of the gsutil cp cell so both refer to
# the same object.
PYTHON_PACKAGE_URI=gs://${BUCKET}/spectrain_csv_dnn/spectrain_csv_dnn_trainer-0.1.tar.gz
PYTHON_PACKAGE_EXECUTOR_IMAGE_URI="us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-8:latest"
PYTHON_MODULE=trainer.task

# Hidden-layer sizes for the final run. These are the trainer's defaults;
# replace them (and batch_size below) with the best trial's values.
# trainer/task.py has no --nembeds argument, so it must not be passed.
NNSIZE_1=512
NNSIZE_2=64

echo > ./tuned_config.yaml "workerPoolSpecs:
  machineSpec:
    machineType: n1-standard-8
  replicaCount: 1
  pythonPackageSpec:
    executorImageUri: $PYTHON_PACKAGE_EXECUTOR_IMAGE_URI
    packageUris: $PYTHON_PACKAGE_URI
    pythonModule: $PYTHON_MODULE
    args:
    - --train_data_path=${DATA_DIR}/train.csv
    - --eval_data_path=${DATA_DIR}/valid.csv
    - --output_dir=$OUTDIR
    - --num_epochs=10
    - --train_examples=20000
    - --eval_steps=100
    - --batch_size=32
    - --nnsize_1=$NNSIZE_1
    - --nnsize_2=$NNSIZE_2"

gcloud ai custom-jobs create \
  --region=${REGION} \
  --display-name=$JOB_NAME \
  --config=tuned_config.yaml