# AI Platform - Training Job and Serving

Adapted from: https://github.com/GoogleCloudPlatform/ai-platform-samples/blob/master/ai-platform-unified/notebooks/custom_job_image_classification_model_for_online_prediction.ipynb

## Training Job

In [None]:
from google.cloud import aiplatform
from google.protobuf import json_format
from google.protobuf.struct_pb2 import Value

In [None]:
# Project and location that own every resource created by this notebook.
PROJECT_ID = 'statmike-mlops'
REGION = 'us-central1'
PARENT = f"projects/{PROJECT_ID}/locations/{REGION}"

# Cloud Storage prefix for job artifacts and the display name of the job.
BUCKET_NAME = 'gs://statmike-models/digits/aip_train_job'
JOB_NAME = 'AIP_DIGITS_1'

# Container images used for training and for serving predictions.
TRAIN_IMAGE = 'us-docker.pkg.dev/cloud-aiplatform/training/tf-cpu.2-4:latest'
DEPLOY_IMAGE = 'us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest'

# Machine types for the training worker and the deployed model.
TRAIN_COMPUTE = 'n1-standard-4'
DEPLOY_COMPUTE = 'n1-standard-4'

In [None]:
# The service endpoint is regional, so it is derived from REGION.
API_ENDPOINT = f"{REGION}-aiplatform.googleapis.com"
client_options = {"api_endpoint": API_ENDPOINT}

# Registry of service clients, keyed by the kind of resource they manage.
clients = {
    'job': aiplatform.gapic.JobServiceClient(client_options=client_options),
}

In [None]:

# Job-specific output location: <bucket prefix>/<job name>.
MODEL_DIR = f"{BUCKET_NAME}/{JOB_NAME}"

# Container to run, with the output location passed as a command-line flag.
CONTAINER_SPEC = {
    "image_uri": TRAIN_IMAGE,
    "args": [f"--model-dir={MODEL_DIR}"],
}

# CPU-only machine for the single training replica.
machine_spec = {
    "machine_type": TRAIN_COMPUTE,
    "accelerator_count": 0,
}

# One worker pool with one replica.
WORKER_POOL_SPEC = [
    {
        "replica_count": 1,
        "machine_spec": machine_spec,
        "container_spec": CONTAINER_SPEC,
    },
]

# Full custom job request body submitted to the job service.
CUSTOM_JOB = {
    "display_name": JOB_NAME,
    "job_spec": {
        "worker_pool_specs": WORKER_POOL_SPEC,
        "base_output_directory": {"output_uri_prefix": MODEL_DIR},
    },
}

In [None]:
def create_custom_job(custom_job):
    """Submit a custom training job and return its resource name.

    Args:
        custom_job: The custom job specification to submit (for example the
            ``CUSTOM_JOB`` dictionary built earlier in this notebook).

    Returns:
        The ``name`` of the created job, usable with ``get_custom_job`` and
        ``cancel_job``.
    """
    # Submit the job that was passed in, not the module-level CUSTOM_JOB, so
    # callers can create jobs from any specification.
    response = clients['job'].create_custom_job(parent=PARENT, custom_job=custom_job)
    print("name:", response.name)
    print("display_name:", response.display_name)
    print("state:", response.state)
    print("create_time:", response.create_time)
    print("update_time:", response.update_time)
    return response.name

def list_custom_jobs():
    """Print every custom job found under ``PARENT``, one job per print call."""
    response = clients['job'].list_custom_jobs(parent=PARENT)
    for job in response:
        # Print the individual job; printing `response` would repeat the
        # whole listing once per job.
        print(job)
        
def get_custom_job(name, silent=False):
    """Fetch a custom job by resource name and return the response.

    Args:
        name: Resource name of the job, as returned by ``create_custom_job``.
        silent: When true, skip printing the job summary.

    Returns:
        The job object returned by the job service client.
    """
    job = clients['job'].get_custom_job(name=name)
    if not silent:
        # Same summary fields, in the same order, as create_custom_job prints.
        for field in ("name", "display_name", "state", "create_time", "update_time"):
            print(field + ":", getattr(job, field))
    return job

def cancel_job(name):
    """Request cancellation of a custom job and print the outcome.

    Best-effort: any exception raised by the request is printed rather than
    propagated, so the call never fails the notebook cell.

    Args:
        name: Resource name of the job to cancel.
    """
    try:
        print(clients['job'].cancel_custom_job(name=name))
    except Exception as err:
        print(err)

In [3]:
!rm -rf custom
!mkdir custom
!mkdir custom/trainer
!touch custom/trainer/__init__.py

In [6]:
%%writefile custom/trainer/task.py

"""Train a softmax classifier on the BigQuery digits table and save it.

The column list is read from BigQuery's INFORMATION_SCHEMA, the TRAIN and
TEST splits are streamed through the BigQuery reader, and a single dense
softmax layer over all numeric feature columns is fitted and saved to
MODEL_DIR.
"""
# `tf` and `BigQueryClient` are used throughout this script, so they must be
# imported here: the script runs on its own, outside the notebook kernel.
import tensorflow as tf
from google.cloud import bigquery
from tensorflow.python.framework import dtypes
from tensorflow_io.bigquery import BigQueryClient

PROJECT_ID = 'statmike-mlops'
BQDATASET_ID = 'digits'
BQTABLE_ID = 'digits_prepped'
# NOTE(review): the save location is fixed here; command-line arguments
# (such as a --model-dir flag) are not read by this script. Confirm that is
# intended.
MODEL_DIR = 'gs://statmike-models/digits/keras'
BATCH_SIZE = 30

# Columns that must not be fed to the model.
OMIT = ['target_OE', 'SPLITS']

# Look up the table's columns and their BigQuery data types.
bqclient = bigquery.Client()
bqjob = bqclient.query(
f"""
SELECT * FROM `{BQDATASET_ID}.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS`
WHERE TABLE_NAME = '{BQTABLE_ID}' """
)
schema = bqjob.result().to_dataframe()

# Filter the schema once; both the field names and their types come from it.
kept = schema[~schema.column_name.isin(OMIT)]
selected_fields = kept.column_name.tolist()
# FLOAT64 columns are read as float64; every other type is read as int64.
output_types = [
    dtypes.float64 if x == 'FLOAT64' else dtypes.int64
    for x in kept.data_type.tolist()
]

# One numeric feature column and one scalar Keras input per non-target field.
feature_columns = []
feature_layer_inputs = {}
for header in selected_fields:
    if header != 'target':
        feature_columns.append(tf.feature_column.numeric_column(header))
        feature_layer_inputs[header] = tf.keras.Input(shape=(1,), name=header)


def transTable(row_dict):
    """Split a row into (features, target), one-hot encoding the target.

    The target is removed from ``row_dict`` and encoded as a float32
    one-hot vector of depth 10.
    """
    target = row_dict.pop('target')
    target = tf.one_hot(tf.cast(target, tf.int64), 10)
    target = tf.cast(target, tf.float32)
    return (row_dict, target)


def read_split(split):
    """Return the rows whose SPLITS column equals ``split`` as (features, target) pairs."""
    session = BigQueryClient().read_session(
        "projects/" + PROJECT_ID,
        PROJECT_ID,
        BQTABLE_ID,
        BQDATASET_ID,
        selected_fields,
        output_types,
        row_restriction="SPLITS='" + split + "'",
        requested_streams=3,
    )
    return session.parallel_read_rows().map(transTable)


# Only the training data is shuffled.
train = read_split('TRAIN').shuffle(100000).batch(BATCH_SIZE)
# NOTE(review): `test` is built but not used below - confirm whether an
# evaluation step is missing.
test = read_split('TEST').batch(BATCH_SIZE)

# Single dense softmax layer on top of the combined numeric features.
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
feature_layer_outputs = feature_layer(feature_layer_inputs)
model = tf.keras.Model(
    inputs=list(feature_layer_inputs.values()),
    outputs=tf.keras.layers.Dense(10, activation=tf.nn.softmax)(feature_layer_outputs),
)
model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
# NOTE(review): plot_model needs pydot/graphviz in the runtime image - confirm
# they are available in the training container or drop this call.
tf.keras.utils.plot_model(model, show_shapes=True, show_dtype=True)

history = model.fit(train, epochs=25)

model.save(MODEL_DIR)

Overwriting custom/trainer/task.py


In [None]:
!rm -f custom.tar custom.tar.gz
!tar cvf custom.tar custom
!gzip custom.tar
!gsutil cp custom.tar.gz gs://$BUCKET_NAME/trainer_cifar.tar.gz

In [None]:
# Submit the training job; JOB_ID holds the resource name returned by create_custom_job.
JOB_ID = create_custom_job(CUSTOM_JOB)

In [None]:
# Fetch the submitted job and print its current summary (re-run to refresh the state).
response = get_custom_job(JOB_ID)

## Deployment