In [1]:
# Ask gcloud for the active project; the `!` capture returns the command's output lines as a list.
project = !gcloud config get-value project
# The first output line is taken as the project ID.
PROJECT_ID = project[0]
# Bare expression so the notebook displays the resolved project ID.
PROJECT_ID

'ml-pipeline-project-401216'

In [None]:
# Region passed to aiplatform.init as the location and used to build the regional API endpoint.
REGION = 'us-central1'
# Dataset name: used for the BigQuery dataset/table in the source URI and in resource display names.
DATANAME = 'fraud'
# Notebook identifier: used in display names, the 'notebook' label, and the temp directory path.
NOTEBOOK = 'automl_with_client_code'

# resources
# Machine type for serving; same value the endpoint deploy call passes as machine_type.
DEPLOY_COMPUTE = 'n1-highmem-4'

# model training
# Target column handed to the training job.
VAR_TARGET = 'Class'
# Columns excluded from training; the string is later split on whitespace.
VAR_OMIT = 'transaction_id' #can add more variables to the string with space delimiters

In [8]:
# packages

from google.cloud import aiplatform
from datetime import datetime

from google.cloud import bigquery
from google.protobuf import json_format
from google.protobuf.struct_pb2 import Value
import json
import numpy as np

In [10]:
# clients
# Initialise the Vertex AI SDK for this project and region.
aiplatform.init(project = PROJECT_ID, location=REGION)

# The name `bigquery` is rebound from the module to a Client below. Re-importing the
# module under an alias keeps this cell safe to re-run; with `bigquery.Client()` a second
# execution fails with AttributeError because `bigquery` is already a Client instance.
from google.cloud import bigquery as bigquery_module

# Pin the client to the same project as the Vertex AI SDK instead of relying on
# whatever default project the environment resolves.
bq_client = bigquery_module.Client(project = PROJECT_ID)
# Keep the original name bound to the client so any cell that uses `bigquery` as a client still works.
bigquery = bq_client

In [13]:
# parameters
# Working directory for this notebook's temporary files.
DIR = f"temp/{NOTEBOOK}"
# Second-resolution timestamp of this run, used as a suffix in resource display names.
TIMESTAMP = format(datetime.now(), "%Y%m%d%H%M%S")
# Show the timestamp that this run's resources will carry.
TIMESTAMP

'20231016131756'

In [15]:
# environment
# Start from an empty working directory: remove any previous contents, then recreate it.
# Uses the standard library instead of `!rm -rf {DIR}` / `!mkdir -p {DIR}` so the path is
# never interpolated into a shell command and the cell does not require a POSIX shell.
import shutil
from pathlib import Path

shutil.rmtree(DIR, ignore_errors=True)  # like `rm -rf`: a missing directory is not an error
Path(DIR).mkdir(parents=True, exist_ok=True)  # like `mkdir -p`: creates parents as needed

In [44]:
# Create a Vertex AI TabularDataset that points at the prepped BigQuery table.
dataset_options = dict(
    # Display name: notebook, data name and run timestamp.
    display_name = f'{NOTEBOOK}_{DATANAME}_{TIMESTAMP}',
    # Source table URI: <project>.<dataset>.<dataset>_prepped
    bq_source = f'bq://{PROJECT_ID}.{DATANAME}.{DATANAME}_prepped',
    # Label the resource with the notebook that created it.
    labels = {'notebook' : f'{NOTEBOOK}'},
)
dataset = aiplatform.TabularDataset.create(**dataset_options)

Creating TabularDataset
Create TabularDataset backing LRO: projects/1181571513/locations/us-central1/datasets/5593058420333740032/operations/6212980049784602624
TabularDataset created. Resource name: projects/1181571513/locations/us-central1/datasets/5593058420333740032
To use this TabularDataset in another session:
ds = aiplatform.TabularDataset('projects/1181571513/locations/us-central1/datasets/5593058420333740032')


In [45]:
# Display the column names reported for the dataset; the next cell filters this list into the training features.
dataset.column_names

['V21',
 'V10',
 'V2',
 'V17',
 'V11',
 'V18',
 'V12',
 'V25',
 'V5',
 'V9',
 'Time',
 'V26',
 'V23',
 'V15',
 'V19',
 'V13',
 'V4',
 'transaction_id',
 'V7',
 'V22',
 'V6',
 'V20',
 'V27',
 'V16',
 'splits',
 'V8',
 'V28',
 'V3',
 'V24',
 'Class',
 'V1',
 'V14',
 'Amount']

In [46]:
# train model with AutoML

# Feature columns = every dataset column except the omitted ones, the target, and the
# 'splits' column. Filtering the list directly (rather than chaining set differences)
# keeps the columns in the order dataset.column_names returns them, so the result does
# not depend on set iteration order, which can change between kernel sessions.
excluded_columns = {*VAR_OMIT.split(), VAR_TARGET, 'splits'}
column_specs = [name for name in dataset.column_names if name not in excluded_columns]


In [47]:
# Give every selected column the value 'auto'; this dict is what the training job receives as column_specs.
column_specs = {feature: 'auto' for feature in column_specs}
# Display the resulting mapping.
column_specs

{'V21': 'auto',
 'V10': 'auto',
 'V2': 'auto',
 'V17': 'auto',
 'V11': 'auto',
 'V18': 'auto',
 'V12': 'auto',
 'V25': 'auto',
 'V5': 'auto',
 'V9': 'auto',
 'Time': 'auto',
 'V26': 'auto',
 'V23': 'auto',
 'V15': 'auto',
 'V19': 'auto',
 'V4': 'auto',
 'V13': 'auto',
 'V7': 'auto',
 'V22': 'auto',
 'V6': 'auto',
 'V20': 'auto',
 'V27': 'auto',
 'V16': 'auto',
 'V8': 'auto',
 'V28': 'auto',
 'V3': 'auto',
 'V24': 'auto',
 'V1': 'auto',
 'V14': 'auto',
 'Amount': 'auto'}

In [50]:
# creating training job and training the model using AutoML

###### Define a Job:
###### a. Consider Weighting
###### b. Model type
###### c. Optimization objective

In [48]:
# Define the AutoML tabular training job: a classification job optimised for
# 'maximize-au-prc', restricted to the columns listed in column_specs.
job_display_name = f'{NOTEBOOK}_{DATANAME}_{TIMESTAMP}'
job_labels = {'notebook' : f'{NOTEBOOK}'}

tabular_classification_job = aiplatform.AutoMLTabularTrainingJob(
    display_name = job_display_name,
    optimization_prediction_type = 'classification',
    optimization_objective = 'maximize-au-prc',
    column_specs = column_specs,
    labels = job_labels,
)

In [49]:
# Run the training job on the dataset and keep the resulting model.
training_options = {
    'dataset': dataset,
    'target_column': VAR_TARGET,
    # Row assignment to splits is read from the 'splits' column.
    'predefined_split_column_name': 'splits',
    # Training budget in milli node hours (1000 = one node hour).
    'budget_milli_node_hours': 1000,
    'model_display_name': f'{NOTEBOOK}_{DATANAME}_{TIMESTAMP}',
    # Early stopping stays enabled.
    'disable_early_stopping': False,
    'model_labels': {'notebook' : f'{NOTEBOOK}'},
}
model = tabular_classification_job.run(**training_options)

View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/6921408998017073152?project=1181571513
AutoMLTabularTrainingJob projects/1181571513/locations/us-central1/trainingPipelines/6921408998017073152 current state:
PipelineState.PIPELINE_STATE_PENDING
AutoMLTabularTrainingJob projects/1181571513/locations/us-central1/trainingPipelines/6921408998017073152 current state:
PipelineState.PIPELINE_STATE_PENDING
AutoMLTabularTrainingJob projects/1181571513/locations/us-central1/trainingPipelines/6921408998017073152 current state:
PipelineState.PIPELINE_STATE_PENDING
AutoMLTabularTrainingJob projects/1181571513/locations/us-central1/trainingPipelines/6921408998017073152 current state:
PipelineState.PIPELINE_STATE_PENDING
AutoMLTabularTrainingJob projects/1181571513/locations/us-central1/trainingPipelines/6921408998017073152 current state:
PipelineState.PIPELINE_STATE_PENDING
AutoMLTabularTrainingJob projects/1181571513/locations/us-central1/trainingPipelines/

In [54]:
# creating model client for this model

# Model service client pointed at the regional API endpoint; the evaluation cells
# below use it to list and fetch the model's evaluations.
# (The stray `model.resource_name` expression that used to sit here was removed: its
# value was discarded, so it neither displayed nor stored anything.)
model_client = aiplatform.gapic.ModelServiceClient(
    client_options = {
        'api_endpoint' : f'{REGION}-aiplatform.googleapis.com'
    }
)

# Display the client object as the cell output.
model_client

<google.cloud.aiplatform_v1.services.model_service.client.ModelServiceClient at 0x7f3f4198da00>

In [61]:
# Retrieve the aggregate model evaluation metrics: first list the model's evaluations to
# obtain an evaluation id, then fetch that evaluation with get_model_evaluation.

evaluations = model_client.list_model_evaluations(parent = model.resource_name)
evals = iter(evaluations)
first_evaluation = next(evals, None)
if first_evaluation is None:
    # Without this guard an empty listing surfaces as a bare StopIteration with no context.
    raise RuntimeError(f'No model evaluations found for {model.resource_name}')
# Resource name of the first evaluation; later cells use it as the parent for slice listing.
evals_id = first_evaluation.name
geteval = model_client.get_model_evaluation(name = evals_id)

# Display the aggregate area under the precision-recall curve.
geteval.metrics['auPrc']

0.99967927

In [63]:
# Print one line per true label: its display name next to the matching confusion-matrix row.
confusion = geteval.metrics['confusionMatrix']
for row_idx, spec in enumerate(confusion['annotationSpecs']):
    print('True Label = ', spec['displayName'],
          ' has predicted labels = ', confusion['rows'][row_idx])

True Label =  0  has predicted labels =  [28678.0, 1.0]
True Label =  1  has predicted labels =  [10.0, 37.0]


In [66]:
# List the evaluation slices under the evaluation whose id was retrieved above.
slices = model_client.list_model_evaluation_slices(parent = evals_id)

In [68]:
# Print the auPrc metric for each evaluation slice.
# The loop variable is deliberately not called `slice`: at notebook top level that would
# rebind the builtin `slice` for every cell that runs afterwards.
for eval_slice in slices:
    print('Label = ', eval_slice.slice_.value, 'has auPrc = ', eval_slice.metrics['auPrc'])

Label =  0 has auPrc =  0.99975467
Label =  1 has auPrc =  0.83377564


In [69]:
# Endpoint and deployment

# Create the Vertex AI endpoint that the model is deployed to in the next cell.
endpoint_display_name = f'{NOTEBOOK}_{DATANAME}_{TIMESTAMP}'
endpoint_labels = {'notebook' : f'{NOTEBOOK}'}
endpoint = aiplatform.Endpoint.create(
    display_name = endpoint_display_name,
    labels = endpoint_labels,
)

Creating Endpoint
Create Endpoint backing LRO: projects/1181571513/locations/us-central1/endpoints/4060085224027979776/operations/128898428183773184
Endpoint created. Resource name: projects/1181571513/locations/us-central1/endpoints/4060085224027979776
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/1181571513/locations/us-central1/endpoints/4060085224027979776')


In [None]:
# Deploy the trained model to the endpoint with 100% of traffic and exactly one replica.
endpoint.deploy(
    model = model,
    deployed_model_display_name = f'{NOTEBOOK}_{DATANAME}_{TIMESTAMP}',
    traffic_percentage = 100,
    # Take the machine type from the configuration cell instead of repeating the literal,
    # so changing DEPLOY_COMPUTE actually changes what gets deployed.
    machine_type = DEPLOY_COMPUTE,
    min_replica_count = 1,
    max_replica_count = 1
)

Deploying Model projects/1181571513/locations/us-central1/models/2652292526056144896 to Endpoint : projects/1181571513/locations/us-central1/endpoints/4060085224027979776
Deploy Endpoint model backing LRO: projects/1181571513/locations/us-central1/endpoints/4060085224027979776/operations/5145908413074505728
