# AI Platform - Training Job and Serving

Adapted from: https://github.com/GoogleCloudPlatform/ai-platform-samples/blob/master/ai-platform-unified/notebooks/custom_job_image_classification_model_for_online_prediction.ipynb

Using Google Cloud Client Libraries: https://googleapis.dev/python/aiplatform/latest/index.html

## Training Job

Set up the environment:

In [232]:
from google.cloud import aiplatform
from google.protobuf import json_format
from google.protobuf.struct_pb2 import Value
from datetime import datetime

Define Parameters:

In [233]:
# Locations
REGION = 'us-central1'
PROJECT_ID = 'statmike-mlops'
# Cloud Storage prefix that holds the training package and each job's output folder.
BUCKET_NAME = 'gs://statmike-models/digits/aip_train_job'
# The timestamp suffix gives every run its own job name and output folder.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"
JOB_NAME = f'AIP_DIGITS_{TIMESTAMP}'
MODEL_DIR = f'{BUCKET_NAME}/{JOB_NAME}'
PARENT = f"projects/{PROJECT_ID}/locations/{REGION}"

# Resources: container images and machine types for training and serving.
TRAIN_IMAGE = 'us-docker.pkg.dev/cloud-aiplatform/training/tf-cpu.2-4:latest'
DEPLOY_IMAGE = 'us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest'
TRAIN_COMPUTE = 'n1-standard-4'
DEPLOY_COMPUTE = 'n1-standard-4'

# TF parameters forwarded to the training script as command-line flags.
EPOCHS = 25
BATCH_SIZE = 30

Create a client for the AIP Job Service:

In [234]:
# The API host name is derived from REGION.
API_ENDPOINT = f"{REGION}-aiplatform.googleapis.com"
client_options = dict(api_endpoint=API_ENDPOINT)
# Service clients are collected in one dict keyed by service name.
clients = {'job': aiplatform.gapic.JobServiceClient(client_options=client_options)}

Helper functions that wrap the job client: create, list, get, cancel

In [235]:
def create_custom_job(custom_job):
    """Submit a custom training job and print a short summary of it.

    Args:
        custom_job: Job definition forwarded to the Job Service as
            ``custom_job``.

    Returns:
        The resource name of the created job (``response.name``).
    """
    # Submit the argument rather than the module-level CUSTOM_JOB, so the
    # helper creates exactly the job the caller passed in.
    response = clients['job'].create_custom_job(parent=PARENT, custom_job=custom_job)
    print("name:", response.name)
    print("display_name:", response.display_name)
    print("state:", response.state)
    print("create_time:", response.create_time)
    print("update_time:", response.update_time)
    return response.name

def list_custom_jobs():
    """Print every custom job found under PARENT, one job per print call."""
    response = clients['job'].list_custom_jobs(parent=PARENT)
    for job in response:
        # Print the individual job, not the whole response on every pass.
        print(job)
        
def get_custom_job(name, silent=False):
    """Fetch a custom job by resource name and return the response.

    Args:
        name: Resource name of the job to look up.
        silent: When true, skip printing the summary fields.

    Returns:
        The response object returned by the job client.
    """
    job = clients['job'].get_custom_job(name=name)
    if not silent:
        # Same fields, in the same order, as the summary printed on creation.
        for field in ("name", "display_name", "state", "create_time", "update_time"):
            print(field + ":", getattr(job, field))
    return job

def cancel_job(name):
    """Ask the Job Service to cancel a custom job and print the outcome.

    Best effort: any exception is printed instead of being raised.

    Args:
        name: Resource name of the job to cancel.
    """
    try:
        print(clients['job'].cancel_custom_job(name=name))
    except Exception as err:
        print(err)

Define job parameters to pass to the client:

In [236]:
# Machine type for each training replica; no accelerators are requested.
MACHINE_SPEC = dict(machine_type=TRAIN_COMPUTE, accelerator_count=0)

# Command-line flags handed to the training module.
CMDARGS = [f"--epochs={EPOCHS}", f"--batch_size={BATCH_SIZE}"]

# A single worker pool with one replica running the packaged trainer module.
WORKER_POOL_SPEC = [
    {
        "replica_count": 1,
        "machine_spec": MACHINE_SPEC,
        "python_package_spec": {
            "executor_image_uri": TRAIN_IMAGE,
            "package_uris": [f"{BUCKET_NAME}/trainer_cifar.tar.gz"],
            "python_module": "trainer.task",
            "args": CMDARGS,
        },
    },
]

# Job output is written under MODEL_DIR.
JOB_SPEC = {
    "worker_pool_specs": WORKER_POOL_SPEC,
    "base_output_directory": {"output_uri_prefix": MODEL_DIR},
}

CUSTOM_JOB = {"display_name": JOB_NAME, "job_spec": JOB_SPEC}

Assemble the Python training package:

In [237]:
!rm -rf custom
!mkdir custom


setup_py = "from setuptools import setup\n\
if __name__ == '__main__':\n\
    setup()"

setup_py = "import setuptools\n\
REQUIRED_PACKAGES = ['tensorflow_io']\n\
setuptools.setup(\n\
    name='trainer',\n\
    version='0.1',\n\
    #install_requires=REQUIRED_PACKAGES,\n\
    packages=setuptools.find_packages(),\n\
    #include_package_data=True,\n\
    description='Digit Training Package')"

!echo "$setup_py" > custom/setup.py

!mkdir custom/trainer
!touch custom/trainer/__init__.py

In [238]:
%%writefile custom/trainer/task.py
from tensorflow_io.bigquery import BigQueryClient
from tensorflow_io.bigquery import BigQueryReadSession
import tensorflow as tf
from google.cloud import bigquery
import argparse
import os
import sys

parser = argparse.ArgumentParser()
# the passed param, dest: a name for the param, default: if absent fetch this param from the OS, type: type to convert to, help: description of argument
parser.add_argument('--model-dir', dest='model_dir', default=os.getenv("AIP_MODEL_DIR"), type=str, help='Model dir.')
parser.add_argument('--epochs',dest='epochs', default=10, type=int, help='Number of Epochs')
parser.add_argument('--batch_size',dest='batch_size', default=32, type=int, help='Batch Size')
#parser.add_argument('',dest='', default=, type=, help='')
args = parser.parse_args()

# built in parameters for data source:
PROJECT_ID='statmike-mlops'
BQDATASET_ID='digits'
BQTABLE_ID='digits_prepped'

selected_fields = ['p0', 'p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9', 'p10', 'p11', 'p12', 'p13', 'p14', 'p15', 'p16', 'p17', 'p18', 'p19', 'p20', 'p21', 'p22', 'p23', 'p24', 'p25', 'p26', 'p27', 'p28', 'p29', 'p30', 'p31', 'p32', 'p33', 'p34', 'p35', 'p36', 'p37', 'p38', 'p39', 'p40', 'p41', 'p42', 'p43', 'p44', 'p45', 'p46', 'p47', 'p48', 'p49', 'p50', 'p51', 'p52', 'p53', 'p54', 'p55', 'p56', 'p57', 'p58', 'p59', 'p60', 'p61', 'p62', 'p63', 'target']
output_types = ['FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'FLOAT64', 'INT64']

feature_columns = []
feature_layer_inputs = {}
for header in selected_fields:
    if header != 'target':
        feature_columns.append(tf.feature_column.numeric_column(header))
        feature_layer_inputs[header] = tf.keras.Input(shape=(1,),name=header)

from tensorflow.python.framework import dtypes
output_types = [dtypes.float64 if x=='FLOAT64' else dtypes.int64 for x in output_types]

def transTable(row_dict):
    target=row_dict.pop('target')
    target = tf.one_hot(tf.cast(target,tf.int64),10)
    target = tf.cast(target,tf.float32)
    return(row_dict,target)

client = BigQueryClient()
session = client.read_session("projects/"+PROJECT_ID,PROJECT_ID,BQTABLE_ID,BQDATASET_ID,selected_fields,output_types,row_restriction="SPLITS='TRAIN'",requested_streams=3)
table = session.parallel_read_rows()
table = table.map(transTable)
train = table.shuffle(100000).batch(args.batch_size)

client = BigQueryClient()
session = client.read_session("projects/"+PROJECT_ID,PROJECT_ID,BQTABLE_ID,BQDATASET_ID,selected_fields,output_types,row_restriction="SPLITS='TEST'",requested_streams=3)
table = session.parallel_read_rows()
table = table.map(transTable)
test = table.batch(args.batch_size)

feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
feature_layer_outputs = feature_layer(feature_layer_inputs)
model = tf.keras.Model(inputs=[v for v in feature_layer_inputs.values()],outputs=tf.keras.layers.Dense(10,activation=tf.nn.softmax)(feature_layer_outputs))
model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
tf.keras.utils.plot_model(model,show_shapes=True, show_dtype=True)

history = model.fit(train,epochs=args.epochs)

model.save(args.model_dir)

Writing custom/trainer/task.py


Store the training package in a Cloud Storage Bucket:

In [239]:
# Remove archives left over from a previous run, then rebuild the gzipped tarball.
!rm -f custom.tar custom.tar.gz
!tar cvf custom.tar custom
!gzip custom.tar
# Upload under the object name that the job spec's package_uris points to.
!gsutil cp custom.tar.gz $BUCKET_NAME/trainer_cifar.tar.gz

E0408 20:57:08.298529294   10113 backup_poller.cc:133]       Run client channel backup poller: {"created":"@1617915428.298371494","description":"pollset_work","file":"src/core/lib/iomgr/ev_epollex_linux.cc","file_line":321,"referenced_errors":[{"created":"@1617915428.298353495","description":"Bad file descriptor","errno":9,"file":"src/core/lib/iomgr/ev_epollex_linux.cc","file_line":956,"os_error":"Bad file descriptor","syscall":"epoll_wait"}]}
custom/
custom/setup.py
custom/trainer/
custom/trainer/__init__.py
custom/trainer/task.py
Copying file://custom.tar.gz [Content-Type=application/x-tar]...
/ [1 files][  1.6 KiB/  1.6 KiB]                                                
Operation completed over 1 objects/1.6 KiB.                                      


Submit the training job:

In [240]:
# Submit the job definition; JOB_ID holds the resource name returned by the helper.
JOB_ID = create_custom_job(CUSTOM_JOB)

name: projects/691911073727/locations/us-central1/customJobs/2185556437134999552
display_name: AIP_DIGITS_20210408205703
state: JobState.JOB_STATE_PENDING
create_time: 2021-04-08 20:57:18.380660+00:00
update_time: 2021-04-08 20:57:18.380660+00:00


Get information as the job runs:

In [285]:
# Re-run this cell to refresh the job's state; the deployment cells read trainjob_response.
trainjob_response = get_custom_job(JOB_ID)

name: projects/691911073727/locations/us-central1/customJobs/2185556437134999552
display_name: AIP_DIGITS_20210408205703
state: JobState.JOB_STATE_RUNNING
create_time: 2021-04-08 20:57:18.380660+00:00
update_time: 2021-04-08 21:07:25.297154+00:00


## Deployment

### Upload the Model

Check that model training was successful (and update the path to the model store):

In [286]:
# Once training has succeeded, point MODEL_DIR at the "model" subfolder of the job output.
if trainjob_response.state == aiplatform.gapic.JobState.JOB_STATE_SUCCEEDED:
    # The guard keeps this cell idempotent: re-running it must not append "/model" twice.
    if not MODEL_DIR.endswith("/model"):
        MODEL_DIR = MODEL_DIR + "/model"
else:
    # Say so explicitly instead of leaving MODEL_DIR silently unchanged.
    print("Training job has not succeeded; state:", trainjob_response.state)

Create a client to the Model Service:

In [287]:
# Model Service client, created with the same client options as the job client.
clients['model'] = aiplatform.gapic.ModelServiceClient(client_options=client_options)

Upload the model using the client:

In [289]:
# Model definition: the trained artifacts at MODEL_DIR served by the DEPLOY_IMAGE container.
MODEL = dict(
    display_name=trainjob_response.display_name,
    metadata_schema_uri="",
    artifact_uri=MODEL_DIR,
    container_spec=dict(
        image_uri=DEPLOY_IMAGE,
        command=[],
        args=[],
        env=[dict(name="env_name", value="env_value")],
        ports=[dict(container_port=8080)],
        predict_route="",
        health_route="",
    ),
)

# The upload call returns an operation object; its result is read in the next cell.
uploaded_model = clients['model'].upload_model(parent=PARENT, model=MODEL)

Review the uploaded model's information:

In [310]:
# result(timeout=180) waits for the upload operation; its .model field is the
# resource name used to fetch the model.
model_info = clients['model'].get_model(name=uploaded_model.result(timeout=180).model)
model_info.name

'projects/691911073727/locations/us-central1/models/2001515782831341568'

### Endpoint Creation

Create a client to the endpoint service:

In [311]:
# Endpoint Service client, created with the same client options as the other clients.
clients['endpoint'] = aiplatform.gapic.EndpointServiceClient(client_options=client_options)

Create the endpoint:

In [313]:
# The endpoint's display name reuses the job name, tying it to this training run.
ENDPOINT_NAME = f'ENDPOINT_{JOB_NAME}'
endpoint = clients['endpoint'].create_endpoint(
    parent=PARENT,
    endpoint=dict(display_name=ENDPOINT_NAME),
)

In [321]:
# result(timeout=180) waits for the create operation; its .name is then used to fetch the endpoint.
endpoint_info = clients['endpoint'].get_endpoint(name=endpoint.result(timeout=180).name)
endpoint_info.name

'projects/691911073727/locations/us-central1/endpoints/8465377516759023616'

### Deploy Model to Endpoint

## Prediction

In [None]:
# Prediction Service client, created with the same client options as the other clients.
clients['prediction'] = aiplatform.gapic.PredictionServiceClient(client_options=client_options)

In [None]:
import json

# Load the first instance from the request file; the context manager closes
# the file even if parsing fails.
with open('newob.json') as f:
    newob = json.load(f)['instances'][0]
newob

In [None]:
from google.cloud import aiplatform
from google.protobuf import json_format
from google.protobuf.struct_pb2 import Value

# The endpoint ID is the last segment of the resource name fetched when the
# endpoint was created.
ENDPOINT_ID = endpoint_info.name.split('/')[-1]

# The service clients live in the `clients` dict; `client` is not defined in this notebook.
endpoint = clients['prediction'].endpoint_path(project=PROJECT_ID, location=REGION, endpoint=ENDPOINT_ID)
response = clients['prediction'].predict(
    endpoint=endpoint,
    instances=[json_format.ParseDict(newob, Value())],
    parameters=json_format.ParseDict({}, Value()),
)
prediction = response.predictions
prediction

In [None]:
import numpy as np
# Index of the largest value in the first prediction - presumably the predicted
# digit, since the model ends in a 10-unit softmax layer; confirm against the served output.
np.argmax(prediction[0])