### Write file task.py

In [24]:
!pip install cloudml-hypertune

Collecting cloudml-hypertune
  Downloading cloudml-hypertune-0.1.0.dev6.tar.gz (3.2 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: cloudml-hypertune
  Building wheel for cloudml-hypertune (setup.py) ... [?25ldone
[?25h  Created wheel for cloudml-hypertune: filename=cloudml_hypertune-0.1.0.dev6-py2.py3-none-any.whl size=3987 sha256=313e6726477b1117be4b12ad7df401812f7e7a5f55d047b8e94af1790dd3a20b
  Stored in directory: /home/jupyter/.cache/pip/wheels/a7/ff/87/e7bed0c2741fe219b3d6da67c2431d7f7fedb183032e00f81e
Successfully built cloudml-hypertune
Installing collected packages: cloudml-hypertune
Successfully installed cloudml-hypertune-0.1.0.dev6


In [60]:
%%writefile task.py

# Owner - Hasan Rafiq
# Load the TensorBoard notebook extension

import hypertune
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import argparse
import tensorflow as tf
import tensorflow.keras as keras
  
# Determine CSV, label, and key columns
#Columns in training sheet -> Can have extra columns too
CSV_COLUMNS = ['fare', 'trip_start_month', 'trip_start_hour', 'trip_start_day',
       'pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
       'dropoff_longitude']
LABEL_COLUMN = 'fare'

# Set default values for each CSV column( Including Y column )
DEFAULTS = [[0.0], ['1'], ['1'],['1'],[0.0],[0.0],[0.0],[0.0]]

bins_lat = [41.66367065, 41.85934972, 41.87740612, 41.87925508, 41.88099447,
       41.88498719, 41.88530002, 41.89204214, 41.89207263, 41.89265811,
       41.89830587, 41.89960211, 41.90026569, 41.90741282, 41.92187746,
       41.92926299, 41.9442266 , 41.95402765, 41.97907082, 42.02122359]

bins_lon = [-87.9136246 , -87.76550161, -87.68751552, -87.67161972,
       -87.66341641, -87.65599818, -87.65253448, -87.64629348,
       -87.642649  , -87.63784421, -87.63330804, -87.63274649,
       -87.63186395, -87.62887416, -87.62621491, -87.62519214,
       -87.62099291, -87.62076287, -87.61886836, -87.54093551]

RAW_DATA_FEATURE_SPEC = dict([
        ('fare', tf.io.VarLenFeature(tf.float32)),
        ('trip_start_month', tf.io.VarLenFeature(tf.string)),
        ('trip_start_hour', tf.io.VarLenFeature(tf.string)),
        ('trip_start_day', tf.io.VarLenFeature(tf.string)),
        ('pickup_latitude', tf.io.FixedLenFeature([], tf.float32)),
        ('pickup_longitude', tf.io.FixedLenFeature([], tf.float32)),
        ('dropoff_latitude', tf.io.FixedLenFeature([], tf.float32)),
        ('dropoff_longitude', tf.io.FixedLenFeature([], tf.float32)),
        ])
    
###############################
##Feature engineering functions
def feature_engg_features(features):
  #Add new features( Non-TFT transformation ) -> Just for study purposes
  features['distance'] = ((features['pickup_latitude'] - features['dropoff_latitude'])**2 +  (features['pickup_longitude'] - features['dropoff_longitude'])**2)**0.5

  return(features)

#To be called from TF
def feature_engg(features, label):
  #Add new features
  features = feature_engg_features(features)

  return(features, label)

###############################
###Data Input pipeline function

def make_input_fn(filename, mode, vnum_epochs = None, batch_size = 512):
    def _input_fn(v_test=False):     
        # Create list of files that match pattern
        file_list = tf.io.gfile.glob(filename)

        if mode == tf.estimator.ModeKeys.TRAIN:
            num_epochs = vnum_epochs # indefinitely
        else:
            num_epochs = 1 # end-of-input after this        
        
        # Create dataset from file list
        dataset = tf.compat.v1.data.experimental.make_csv_dataset(file_list,
                                                   batch_size=batch_size,
                                                   column_names=CSV_COLUMNS,
                                                   column_defaults=DEFAULTS,
                                                   label_name=LABEL_COLUMN,
                                                   num_epochs = num_epochs,
                                                   num_parallel_reads=30)
        
        dataset = dataset.prefetch(buffer_size = batch_size)

        #Feature engineering
        dataset = dataset.map(feature_engg)

        if mode == tf.estimator.ModeKeys.TRAIN:
            num_epochs = vnum_epochs # indefinitely
            dataset = dataset.shuffle(buffer_size = batch_size)
        else:
            num_epochs = 1 # end-of-input after this

        dataset = dataset.repeat(num_epochs)       
        
        #Begins - Uncomment for testing only -----------------------------------------------------<
        if v_test == True:
          print(next(dataset.__iter__()))
          
        #End - Uncomment for testing only -----------------------------------------------------<
        return dataset
    return _input_fn

# Define feature columns(Including feature engineered ones )
# These are the features which come from the TF Data pipeline
def create_feature_cols():
    #Keras format features
    # k_pickup_longitude_scaled = tf.keras.Input(name='pickup_longitude_scaled', shape=(1,), dtype=tf.float32, sparse=False) #-> Sparse because VarLenFeature
    # k_pickup_latitude_scaled = tf.keras.Input(name='pickup_latitude_scaled', shape=(1,), dtype=tf.float32, sparse=False) #-> Sparse because VarLenFeature
    k_month = tf.keras.Input(name='trip_start_month', shape=(1,), dtype=tf.string, sparse=False)
    k_hour  = tf.keras.Input(name='trip_start_hour', shape=(1,), dtype=tf.string, sparse=False)
    k_day  = tf.keras.Input(name='trip_start_day', shape=(1,), dtype=tf.string, sparse=False)
    k_picklat  = tf.keras.Input(name='pickup_latitude', shape=(1,), dtype=tf.float32, sparse=False)
    k_picklon  = tf.keras.Input(name='pickup_longitude', shape=(1,), dtype=tf.float32, sparse=False)
    k_droplat  = tf.keras.Input(name='dropoff_latitude', shape=(1,), dtype=tf.float32, sparse=False)
    k_droplon  = tf.keras.Input(name='dropoff_longitude', shape=(1,), dtype=tf.float32, sparse=False)
    k_distance  = tf.keras.Input(name='distance', shape=(1,), dtype=tf.float32, sparse=False)
    keras_dict_input = {'trip_start_month': k_month, 'trip_start_hour': k_hour, 'trip_start_day' : k_day,
                        'pickup_latitude': k_picklat, 'pickup_longitude': k_picklon,
                        'dropoff_latitude': k_droplat, 'dropoff_longitude': k_droplon, 'distance' : k_distance,
                        # 'pickup_longitude_scaled': k_pickup_longitude_scaled,
                        # 'pickup_latitude_scaled' : k_pickup_latitude_scaled
                        }

    return({'K' : keras_dict_input})

def create_keras_model(params, feature_cols):
    METRICS = [
            keras.metrics.RootMeanSquaredError(name='rmse')
    ]

    #Input layers
    input_feats = []
    for inp in feature_cols['K'].keys():
      input_feats.append(feature_cols['K'][inp])

    ##Input processing
    ##https://keras.io/examples/structured_data/structured_data_classification_from_scratch/
    ##https://github.com/tensorflow/community/blob/master/rfcs/20191212-keras-categorical-inputs.md

    ##Handle categorical attributes( One-hot encoding )
    cat_day = tf.keras.layers.experimental.preprocessing.StringLookup(vocabulary = ['0','1','2','3','4','5','6','7'], mask_token=None, oov_token = '0')(feature_cols['K']['trip_start_day'])
    cat_day = tf.keras.layers.experimental.preprocessing.CategoryEncoding(num_tokens=8)(cat_day)

    cat_hour = tf.keras.layers.experimental.preprocessing.StringLookup(vocabulary=['1','2','3','4','5','6','7','8'
                                                                                      '9','10','11','12','13','14','15','16',
                                                                                      '17','18','19','20','21','22','23','0'
                                                                                      ], mask_token=None)(feature_cols['K']['trip_start_hour'])
    cat_hour = tf.keras.layers.experimental.preprocessing.CategoryEncoding(num_tokens=24)(cat_hour)

    cat_month = tf.keras.layers.experimental.preprocessing.StringLookup(vocabulary=['1','2','3','4','5','6','7','8'
                                                                                      '9','10','11','12'], mask_token=None)(feature_cols['K']['trip_start_month'])
    cat_month = tf.keras.layers.experimental.preprocessing.CategoryEncoding(num_tokens=12)(cat_month)

    # cat_company = tf.keras.layers.experimental.preprocessing.StringLookup(vocabulary=df['company'].unique(), mask_token=None)(feature_cols['K']['company'])
    # cat_company = tf.keras.layers.experimental.preprocessing.CategoryEncoding(num_tokens=len(df['company'].unique()))(cat_company)

    ##Binning
    bins_pickup_lat = tf.keras.layers.experimental.preprocessing.Discretization(bins = bins_lat)(feature_cols['K']['pickup_latitude'])
    cat_pickup_lat = tf.keras.layers.experimental.preprocessing.CategoryEncoding(len(bins_lat)+1)(bins_pickup_lat)

    bins_pickup_lon = tf.keras.layers.experimental.preprocessing.Discretization(bins = bins_lon)(feature_cols['K']['pickup_longitude'])
    cat_pickup_lon = tf.keras.layers.experimental.preprocessing.CategoryEncoding(len(bins_lon)+1)(bins_pickup_lon)

    bins_drop_lat = tf.keras.layers.experimental.preprocessing.Discretization(bins = bins_lat)(feature_cols['K']['dropoff_latitude'])
    cat_drop_lat = tf.keras.layers.experimental.preprocessing.CategoryEncoding(len(bins_lat)+1)(bins_drop_lat)

    bins_drop_lon = tf.keras.layers.experimental.preprocessing.Discretization(bins = bins_lon)(feature_cols['K']['dropoff_longitude'])
    cat_drop_lon = tf.keras.layers.experimental.preprocessing.CategoryEncoding(len(bins_lon)+1)(bins_drop_lon)

    ##Categorical cross
    cross_day_hour = tf.keras.layers.experimental.preprocessing.CategoryCrossing()([cat_day, cat_hour])
    # hash_cross_day_hour = tf.keras.layers.experimental.preprocessing.HashedCrossing(num_bins=24 * 8, output_mode='one_hot')(cross_day_hour)

#     cross_pick_lon_lat = tf.keras.layers.experimental.preprocessing.CategoryCrossing()([cat_pickup_lat, cat_pickup_lon])
#     hash_cross_pick_lon_lat = tf.keras.layers.experimental.preprocessing.HashedCrossing(num_bins=(len(bins_lat) + 1) ** 2)(cross_pick_lon_lat)

#     cross_drop_lon_lat = tf.keras.layers.experimental.preprocessing.CategoryCrossing()([cat_drop_lat, cat_drop_lon])
#     hash_cross_drop_lon_lat = tf.keras.layers.experimental.preprocessing.HashedCrossing(num_bins=(len(bins_lat) + 1) ** 2)(cross_drop_lon_lat)

    # Cross to embedding
#     embed_cross_pick_lon_lat = tf.keras.layers.Embedding(((len(bins_lat) + 1) ** 2), 4)(hash_cross_pick_lon_lat)
#     embed_cross_pick_lon_lat = tf.reduce_sum(embed_cross_pick_lon_lat, axis=-2)

#     embed_cross_drop_lon_lat = tf.keras.layers.Embedding(((len(bins_lat) + 1) ** 2), 4)(hash_cross_drop_lon_lat)
#     embed_cross_drop_lon_lat = tf.reduce_sum(embed_cross_drop_lon_lat, axis=-2)

    # Also pass time attributes as Deep signal( Cast to integer )
    int_trip_start_day = tf.strings.to_number(feature_cols['K']['trip_start_day'], tf.float32)
    int_trip_start_hour = tf.strings.to_number(feature_cols['K']['trip_start_hour'], tf.float32)
    int_trip_start_month = tf.strings.to_number(feature_cols['K']['trip_start_month'], tf.float32)

    #Add feature engineered columns - LAMBDA layer

    ###Create MODEL
    ####Concatenate all features( Numerical input )
    x_input_numeric = tf.keras.layers.concatenate([
                    feature_cols['K']['pickup_latitude'], feature_cols['K']['pickup_longitude'],
                    feature_cols['K']['dropoff_latitude'], feature_cols['K']['dropoff_longitude'],
                    # feature_cols['K']['pickup_latitude_scaled'], feature_cols['K']['pickup_longitude_scaled'],
                    feature_cols['K']['distance'], 
                    # embed_cross_pick_lon_lat, embed_cross_drop_lon_lat,
                    int_trip_start_day, int_trip_start_hour, int_trip_start_month
                    ])

    #DEEP - This Dense layer connects to input layer - Numeric Data
    # x_numeric = tf.keras.layers.Dense(32, activation='selu', kernel_initializer="lecun_normal")(x_input_numeric)
    x_numeric = tf.keras.layers.BatchNormalization()(x_input_numeric)

    ####Concatenate all Categorical features( Categorical converted )
    x_categ = tf.keras.layers.concatenate([
                    cat_month, #cat_cross_day_hour, 
                    cat_pickup_lat, cat_pickup_lon,
                    cat_drop_lat, cat_drop_lon
                    ])
    
    #WIDE - This Dense layer connects to input layer - Categorical Data
    # x_categ = tf.keras.layers.Dense(32, activation='selu', kernel_initializer="lecun_normal")(x_input_categ)

    ####Concatenate both Wide and Deep layers
    x = tf.keras.layers.concatenate([x_categ, x_numeric])

    for l_ in range(params['hidden_layers']):
        x = tf.keras.layers.Dense(32, activation='selu', kernel_initializer="lecun_normal",
                                  activity_regularizer=tf.keras.regularizers.l2(0.00001))(x)
        x = tf.keras.layers.BatchNormalization()(x)

    #Final Layer
    out = tf.keras.layers.Dense(1, activation='linear')(x)
    model = tf.keras.Model(input_feats, out)

    #Set optimizer
    opt = tf.keras.optimizers.Adam(lr= params['lr'])

    #Compile model
    model.compile(loss='mean_squared_error',  optimizer=opt, metrics = METRICS)

    #Print Summary
    print(model.summary())
    return model

def keras_train_and_evaluate(model, train_dataset, validation_dataset, epochs=100):
  #Add callbacks
  reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                                patience=5, min_lr=0.00001, verbose = 1)
  
  tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs/")

  #Train and Evaluate
  out = model.fit(train_dataset, 
                  validation_data = validation_dataset,
                  epochs=epochs,
                  # validation_steps = 3,   ###Keep this none for running evaluation on full EVAL data every epoch
                  steps_per_epoch = 100,   ###Has to be passed - Cant help it :) [ Number of batches per epoch ]
                  callbacks=[reduce_lr, #modelsave_callback, 
                             tensorboard_callback, 
                             keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True, verbose=True)]
                  )
  
  return (min(out.history['val_rmse']))
  
def main(args):
    @tf.function
    def serving(dropoff_latitude, dropoff_longitude, pickup_latitude, pickup_longitude, trip_start_day, trip_start_hour, trip_start_month):
        #Params coming in request
        features = {
            'dropoff_latitude': dropoff_latitude,
            'dropoff_longitude': dropoff_longitude,
            'pickup_latitude': pickup_latitude,
            'pickup_longitude': pickup_longitude,
            'trip_start_day': trip_start_day,
            'trip_start_hour': trip_start_hour,
            'trip_start_month': trip_start_month
        }

        #Add TFT transformations
        raw_features = {}
        for key, val in features.items():
          if key not in RAW_DATA_FEATURE_SPEC:
            continue
          if isinstance(RAW_DATA_FEATURE_SPEC[key], tf.io.VarLenFeature):
            raw_features[key] = tf.RaggedTensor.from_tensor(
                tf.expand_dims(val, -1)).to_sparse()
            continue
          raw_features[key] = val
        # tft_new_features = tft_layer(raw_features)

        # pickup_longitude_scaled = tft_new_features['pickup_longitude_scaled'] 
        # pickup_latitude_scaled = tft_new_features['pickup_latitude_scaled']
        distance = ((features['pickup_latitude'] - features['dropoff_latitude'])**2 +  (features['pickup_longitude'] - features['dropoff_longitude'])**2)**0.5 ##tft_new_features['distance']

        ##Feature engineering( calculate distance )
        # distance = tf.cast( tf.sqrt((tf.abs(dropoff_latitude - pickup_latitude))**2 + (tf.abs(dropoff_longitude - pickup_longitude))**2), tf.float32)

        #Params in request + New Feature engineering params
        payload = {
            'dropoff_latitude': dropoff_latitude,
            'dropoff_longitude': dropoff_longitude,
            'pickup_latitude': pickup_latitude,
            'pickup_longitude': pickup_longitude,
            'trip_start_day': trip_start_day,
            'trip_start_hour': trip_start_hour,
            'trip_start_month': trip_start_month,
            'distance': distance,
            # 'pickup_longitude_scaled': pickup_longitude_scaled,
            # 'pickup_latitude_scaled': pickup_latitude_scaled,
        }

        ## Predict
        ##IF THERE IS AN ERROR IN NUMBER OF PARAMS PASSED HERE OR DATA TYPE THEN IT GIVES ERROR, "COULDN'T COMPUTE OUTPUT TENSOR"
        predictions = m_(payload)
        return predictions

    #####MAIN STARTS
    ##Device Strategy
    device = "cpu"
    if device == "tpu":
      resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
      tf.config.experimental_connect_to_cluster(resolver)
      # This is the TPU initialization code that has to be at the beginning.
      tf.tpu.experimental.initialize_tpu_system(resolver)
      strategy = tf.distribute.experimental.TPUStrategy(resolver)
    else:
      strategy = tf.distribute.MultiWorkerMirroredStrategy()
    
    #Create FC
    create_feature_cols()

    #Create model
    params_default = {
        'lr' : args.lr,
        'beta_1' : 0.99,
        'beta_2' : 0.999,
        'epsilon' : 1e-08,
        'decay' : 0.01,
        'hidden_layers' : args.hidden_layers
    }

    #Create dataset input functions
    train_dataset = make_input_fn(filename = args.train_file,
                        mode = tf.estimator.ModeKeys.TRAIN,
                        batch_size = 128)()

    validation_dataset = make_input_fn(filename = args.eval_file,
                        mode = tf.estimator.ModeKeys.EVAL,
                        batch_size = 512)()

    m_ = create_keras_model(params = params_default, feature_cols = create_feature_cols())
    # tf.keras.utils.plot_model(m_, show_shapes=True, rankdir="LR")

    #Train Model
    rmse = keras_train_and_evaluate(m_, train_dataset, validation_dataset, args.epochs)
    print("Final Val RMSE: ", rmse)
    
    #Report metrics for HPT
    hpt = hypertune.HyperTune()
    hpt.report_hyperparameter_tuning_metric(
      hyperparameter_metric_tag='eval_rmse',
      metric_value=rmse,
      global_step=args.epochs)
    
    #Save model
    serving = serving.get_concrete_function(trip_start_day=tf.TensorSpec([None,], dtype= tf.string, name='trip_start_day'), 
                                            trip_start_hour=tf.TensorSpec([None,], dtype= tf.string, name='trip_start_hour'),
                                            trip_start_month=tf.TensorSpec([None], dtype= tf.string, name='trip_start_month'), 
                                            dropoff_latitude=tf.TensorSpec([None,], dtype= tf.float32, name='dropoff_latitude'),
                                            dropoff_longitude=tf.TensorSpec([None,], dtype= tf.float32, name='dropoff_longitude'), 
                                            pickup_latitude=tf.TensorSpec([None,], dtype= tf.float32, name='pickup_latitude'),
                                            pickup_longitude=tf.TensorSpec([None,], dtype= tf.float32, name='pickup_longitude')
                                            )

    print("Saving model...")
    version = "1"  #{'serving_default': call_output}
    tf.saved_model.save(
        m_,
        args.model_save_location + version,
        signatures=serving
    )

if __name__ == '__main__':
    ##Parse Arguments
    parser = argparse.ArgumentParser()
    parser.add_argument(
      '--train_file', required=True, type=str, help='Training file')
    parser.add_argument(
      '--eval_file', required=True, type=str, help='Eval file')
    parser.add_argument(
      '--model_save_location', required=True, type=str, help='Model save location')
    parser.add_argument(
      '--epochs', required=False, type=int, help='Epochs', default=100)
    parser.add_argument(
      '--lr', required=False, type=float, help='Learning Rate', default=0.001)
    parser.add_argument(
      '--hidden_layers', required=False, type=int, help='Hidden layers', default=1)
    args = parser.parse_args()
    
    #Run Main Trainer
    main(args)

Overwriting task.py


### Test task.py locally

In [61]:
!python task.py \
--train_file='gs://gcs-hasanrafiq-test-331814/ml_data/taxi_dataset/train.csv' \
--eval_file='gs://gcs-hasanrafiq-test-331814/ml_data/taxi_dataset/eval.csv' \
--model_save_location='gs://gcs-hasanrafiq-test-331814/ml_data/taxi_dataset/model/' \
--epochs=3 \
--lr=0.001 \
--hidden_layers=2

2021-12-28 12:53:22.825845: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
  super(Adam, self).__init__(name, **kwargs)
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 trip_start_month (InputLayer)  [(None, 1)]          0           []                               
                                                                                                  
 pickup_latitude (InputLayer)   [(None, 1)]          0           []                               
                                                                                                  
 pickup_longitude (InputLayer)  [(None, 1)]          0           []                               
                                            

In [None]:
# %load_ext tensorboard
# %tensorboard --logdir logs

### Test task.py Gcloud Local Run

In [3]:
!gcloud ai custom-jobs local-run \
--executor-image-uri='us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-7:latest' \
--script=task.py \
-- \
--train_file='gs://gcs-hasanrafiq-test-331814/ml_data/taxi_dataset/train.csv' \
--eval_file='gs://gcs-hasanrafiq-test-331814/ml_data/taxi_dataset/eval.csv' \
--model_save_location='gs://gcs-hasanrafiq-test-331814/ml_data/taxi_dataset/model/' \
--epochs=10 \
--lr=0.001 \
--hidden_layers=2

Package is set to /home/jupyter.
Sending build context to Docker daemon   33.8MB
Step 1/12 : FROM us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-7:latest
 ---> 7dc88f3ac4e2
Step 2/12 : RUN mkdir -m 777 -p /usr/app /home
 ---> Running in 0b4ac9ca389b
Removing intermediate container 0b4ac9ca389b
 ---> 00ff2a8b2176
Step 3/12 : WORKDIR /usr/app
 ---> Running in a14147673617
Removing intermediate container a14147673617
 ---> 65dde0dfc4ce
Step 4/12 : ENV HOME=/home
 ---> Running in c82959819862
Removing intermediate container c82959819862
 ---> 7b6c44904df1
Step 5/12 : ENV PYTHONDONTWRITEBYTECODE=1
 ---> Running in 21d6eb032407
Removing intermediate container 21d6eb032407
 ---> 6460d4394084
Step 6/12 : RUN rm -rf /var/sitecustomize
 ---> Running in a70f90745d8a
Removing intermediate container a70f90745d8a
 ---> 5bac260fb8a0
Step 7/12 : COPY ["./setup.py", "./setup.py"]
 ---> 2218fd57196e
Step 8/12 : RUN pip3 install --no-cache-dir .
 ---> Running in f474a7322a0d
Processing /usr/app
  Preparin

### Prebuilt container - GCloud Custom Training Job

https://cloud.google.com/vertex-ai/docs/training/create-custom-job

In [58]:
%%writefile setup.py

from setuptools import find_packages
from setuptools import setup

REQUIRED_PACKAGES = ['absl-py','pandas',
                     'google-cloud','google-cloud-storage','google-cloud-firestore','google-api-python-client', 'google-auth']

setup(
    name='trainer',
    version='0.1',
    install_requires=REQUIRED_PACKAGES,
    packages=find_packages(),
    include_package_data=True,
    description='Hasan - Vertex AI Taxi Trainer Job'
)

Overwriting setup.py


In [59]:
%%writefile __init__.py
  

Overwriting __init__.py


In [60]:
# Create ML train package
!rm -rf dist*
!rm -rf trainer*
!rm -rf trainer/
!rm -rf dist/

!mkdir trainer/
!cp task.py trainer/
!cp __init__.py trainer/
!python setup.py sdist

# Copy trainer.gz to GCS training path
!gsutil cp dist/trainer-0.1.tar.gz gs://gcs-hasanrafiq-test-331814/ml_data/taxi_dataset/ml_scripts/

running sdist
running egg_info
creating trainer.egg-info
writing trainer.egg-info/PKG-INFO
writing dependency_links to trainer.egg-info/dependency_links.txt
writing requirements to trainer.egg-info/requires.txt
writing top-level names to trainer.egg-info/top_level.txt
writing manifest file 'trainer.egg-info/SOURCES.txt'
reading manifest file 'trainer.egg-info/SOURCES.txt'
writing manifest file 'trainer.egg-info/SOURCES.txt'

running check


creating trainer-0.1
creating trainer-0.1/trainer
creating trainer-0.1/trainer.egg-info
copying files to trainer-0.1...
copying setup.py -> trainer-0.1
copying trainer/__init__.py -> trainer-0.1/trainer
copying trainer/task.py -> trainer-0.1/trainer
copying trainer.egg-info/PKG-INFO -> trainer-0.1/trainer.egg-info
copying trainer.egg-info/SOURCES.txt -> trainer-0.1/trainer.egg-info
copying trainer.egg-info/dependency_links.txt -> trainer-0.1/trainer.egg-info
copying trainer.egg-info/requires.txt -> trainer-0.1/trainer.egg-info
copying trainer.egg-in

In [56]:
##Launch Job on GCLOUD
!gcloud ai custom-jobs create \
--region=us-central1 \
--display-name=vertex-custom-taxi5 \
--python-package-uris=gs://gcs-hasanrafiq-test-331814/ml_data/taxi_dataset/ml_scripts/trainer-0.1.tar.gz \
--worker-pool-spec=machine-type=n1-standard-4,executor-image-uri='us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-7:latest',python-module=trainer.task \
--args='--train_file=gs://gcs-hasanrafiq-test-331814/ml_data/taxi_dataset/train.csv' \
--args='--eval_file=gs://gcs-hasanrafiq-test-331814/ml_data/taxi_dataset/eval.csv' \
--args='--model_save_location=gs://gcs-hasanrafiq-test-331814/ml_data/taxi_dataset/model/' \
--args='--epochs=10' \
--args='--hidden_layers=2'

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
CustomJob [projects/318948681665/locations/us-central1/customJobs/6510310602643079168] is submitted successfully.

Your job is still active. You may view the status of your job with the command

  $ gcloud ai custom-jobs describe projects/318948681665/locations/us-central1/customJobs/6510310602643079168

or continue streaming the logs with the command

  $ gcloud ai custom-jobs stream-logs projects/318948681665/locations/us-central1/customJobs/6510310602643079168


### Prebuilt container - GCloud HPT Job

https://cloud.google.com/vertex-ai/docs/training/using-hyperparameter-tuning

In [56]:
%%writefile config.yaml
studySpec:
  metrics:
  - metricId: eval_rmse
    goal: MINIMIZE
  parameters:
  - parameterId: lr
    doubleValueSpec:
      minValue: 0.001
      maxValue: 0.05
  - parameterId: hidden_layers
    integerValueSpec:
      minValue: 1
      maxValue: 4
  algorithm: RANDOM_SEARCH
trialJobSpec:
  workerPoolSpecs:
  - machineSpec:
      machineType: n1-standard-4
    replicaCount: "1"
    pythonPackageSpec: 
        packageUris: gs://gcs-hasanrafiq-test-331814/ml_data/taxi_dataset/ml_scripts/trainer-0.1.tar.gz
        pythonModule: trainer.task
        executorImageUri: us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-7:latest
        args: [
          "--train_file=gs://gcs-hasanrafiq-test-331814/ml_data/taxi_dataset/train.csv",
          "--eval_file=gs://gcs-hasanrafiq-test-331814/ml_data/taxi_dataset/eval.csv",
          "--model_save_location=gs://gcs-hasanrafiq-test-331814/ml_data/taxi_dataset/model/"
        ]
    

Overwriting config.yaml


In [57]:
!gcloud ai hp-tuning-jobs create \
--region=us-central1 \
--display-name=vertex-custom-hpt-taxi1 \
--max-trial-count=10 \
--parallel-trial-count=2 \
--config=config.yaml \
# --verbosity=debug

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
Hyperparameter tuning job [6260923773277437952] submitted successfully.

Your job is still active. You may view the status of your job with the command

  $ gcloud ai hp-tuning-jobs describe 6260923773277437952 --region=us-central1

Job State: JOB_STATE_PENDING


### Custom container - GCloud Custom Training Job

https://cloud.google.com/vertex-ai/docs/training/create-custom-job

In [62]:
%%writefile requirements.txt
tensorflow==2.7.0
pandas==1.3.5
matplotlib
cloudml-hypertune
graphviz
pydot

Overwriting requirements.txt


In [63]:
%%writefile Dockerfile
##Build Dockerfile
FROM python:3.7-slim AS builder
COPY requirements.txt .

RUN pip install -r requirements.txt

COPY . .

# Sets up the entry point to invoke the trainer.
ENTRYPOINT ["python", "task.py"]

Overwriting Dockerfile


In [64]:
#Get access to write to GAR
!gcloud auth configure-docker us-central1-docker.pkg.dev


{
  "credHelpers": {
    "gcr.io": "gcloud",
    "us.gcr.io": "gcloud",
    "eu.gcr.io": "gcloud",
    "asia.gcr.io": "gcloud",
    "staging-k8s.gcr.io": "gcloud",
    "marketplace.gcr.io": "gcloud",
    "us-central1-docker.pkg.dev": "gcloud"
  }
}
Adding credentials for: us-central1-docker.pkg.dev
gcloud credential helpers already registered correctly.


In [65]:
## Build Docker image
!docker build -t us-central1-docker.pkg.dev/hasanrafiq-test-331814/vertex-custom-training-docker/latest:latest -f Dockerfile .

Sending build context to Docker daemon  34.14MB
Step 1/5 : FROM python:3.7-slim AS builder
 ---> d3c9ad326043
Step 2/5 : COPY requirements.txt .
 ---> Using cache
 ---> 5f9591bbc943
Step 3/5 : RUN pip install -r requirements.txt
 ---> Using cache
 ---> 463635d3dacb
Step 4/5 : COPY . .
 ---> 8c9941fb103f
Step 5/5 : ENTRYPOINT ["python", "task.py"]
 ---> Running in 9f7c3ac03341
Removing intermediate container 9f7c3ac03341
 ---> f1225905dc39
Successfully built f1225905dc39
Successfully tagged us-central1-docker.pkg.dev/hasanrafiq-test-331814/vertex-custom-training-docker/latest:latest


In [66]:
## Test run docker image in local
!docker run 'us-central1-docker.pkg.dev/hasanrafiq-test-331814/vertex-custom-training-docker/latest:latest' \
--train_file='gs://gcs-hasanrafiq-test-331814/ml_data/taxi_dataset/train.csv' \
--eval_file='gs://gcs-hasanrafiq-test-331814/ml_data/taxi_dataset/eval.csv' \
--model_save_location='gs://gcs-hasanrafiq-test-331814/ml_data/taxi_dataset/model/' \
--epochs=10 \
--lr=0.001 \
--hidden_layers=2

2021-12-28 12:54:20.223260: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-28 12:54:20.223310: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2021-12-28 12:54:22.005427: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-12-28 12:54:22.005482: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-12-28 12:54:22.005512: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (7154c91bb4ee): /proc/driver/nvidia/version does not exist
2021-12-28 12:54:22.006113: I tensorflow/core/platform/cpu_featu

In [67]:
## Push Docker image to GAR
!docker push us-central1-docker.pkg.dev/hasanrafiq-test-331814/vertex-custom-training-docker/latest:latest

The push refers to repository [us-central1-docker.pkg.dev/hasanrafiq-test-331814/vertex-custom-training-docker/latest]

[1B72c73987: Preparing 
[1B115bd59e: Preparing 
[1B65b7decb: Preparing 
[1B0307b4c1: Preparing 
[1B45955cb1: Preparing 
[1B23303735: Preparing 
[1B20bfdce7: Preparing 
[8B72c73987: Pushed   34.14MB/32.45MB[3A[2K[1A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2Klatest: digest: sha256:6fe3155710c3d76ce2a9e5c17188552dd2b22f3858ef7c88dc48f0dfb40540fb size: 2003


In [68]:
##Launch Job on GCLOUD
!gcloud ai custom-jobs create \
--region=us-central1 \
--display-name=vertex-custom-taxicustomcontainer-3 \
--worker-pool-spec=machine-type=n1-standard-4,replica-count=1,container-image-uri='us-central1-docker.pkg.dev/hasanrafiq-test-331814/vertex-custom-training-docker/latest:latest' \
--args='--train_file=gs://gcs-hasanrafiq-test-331814/ml_data/taxi_dataset/train.csv' \
--args='--eval_file=gs://gcs-hasanrafiq-test-331814/ml_data/taxi_dataset/eval.csv' \
--args='--model_save_location=gs://gcs-hasanrafiq-test-331814/ml_data/taxi_dataset/model/' \
--args='--epochs=10' \
--args='--hidden_layers=2'

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
CustomJob [projects/318948681665/locations/us-central1/customJobs/217093073346232320] is submitted successfully.

Your job is still active. You may view the status of your job with the command

  $ gcloud ai custom-jobs describe projects/318948681665/locations/us-central1/customJobs/217093073346232320

or continue streaming the logs with the command

  $ gcloud ai custom-jobs stream-logs projects/318948681665/locations/us-central1/customJobs/217093073346232320


### Custom container - GCloud HPT Job

https://cloud.google.com/vertex-ai/docs/training/using-hyperparameter-tuning

In [69]:
%%writefile config_custom_container.yaml
studySpec:
  metrics:
  - metricId: eval_rmse
    goal: MINIMIZE
  parameters:
  - parameterId: lr
    doubleValueSpec:
      minValue: 0.001
      maxValue: 0.05
  - parameterId: hidden_layers
    integerValueSpec:
      minValue: 1
      maxValue: 4
  algorithm: RANDOM_SEARCH
trialJobSpec:
  workerPoolSpecs:
  - machineSpec:
      machineType: n1-standard-4
    replicaCount: "1"
    containerSpec:
        imageUri: us-central1-docker.pkg.dev/hasanrafiq-test-331814/vertex-custom-training-docker/latest:latest
        args: [
          "--train_file=gs://gcs-hasanrafiq-test-331814/ml_data/taxi_dataset/train.csv",
          "--eval_file=gs://gcs-hasanrafiq-test-331814/ml_data/taxi_dataset/eval.csv",
          "--model_save_location=gs://gcs-hasanrafiq-test-331814/ml_data/taxi_dataset/model/"
          ]
    

Overwriting config_custom_container.yaml


In [70]:
!gcloud ai hp-tuning-jobs create \
--region=us-central1 \
--display-name=vertex-customcontainer-hpt-taxi1 \
--max-trial-count=10 \
--parallel-trial-count=2 \
--config=config_custom_container.yaml \
# --verbosity=debug

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
Hyperparameter tuning job [7783140447328665600] submitted successfully.

Your job is still active. You may view the status of your job with the command

  $ gcloud ai hp-tuning-jobs describe 7783140447328665600 --region=us-central1

Job State: JOB_STATE_PENDING
