# Custom Prediction Routines for Endpoints That Train

- https://cloud.google.com/vertex-ai/docs/predictions/custom-prediction-routines
- https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.prediction.LocalModel

In [1]:
# Ask gcloud for the active project; the `!` capture returns the command output as a list of lines,
# so the project id is the first element.
project = !gcloud config get-value project
PROJECT_ID = project[0]
PROJECT_ID

'statmike-mlops-349915'

In [2]:
# Naming: SERIES and EXPERIMENT are combined into the storage URI, image name and model display name below
REGION = 'us-central1'
EXPERIMENT = 'cpr_training'
SERIES = '04'

# source data: BigQuery table `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}` that training samples are drawn from
BQ_PROJECT = PROJECT_ID
BQ_DATASET = 'fraud'
BQ_TABLE = 'fraud_prepped'

# Resources: machine type passed to model.deploy()
DEPLOY_COMPUTE = 'n1-standard-2'

# Model Training
# VAR_TARGET is the label column split off from the features.
# NOTE(review): in the cells shown here VAR_OMIT is not referenced; the queries hard-code
# `transaction_id` in their EXCEPT(...) list instead.
VAR_TARGET = 'Class'
VAR_OMIT = 'transaction_id' # add more variables to the string with space delimiters

In [59]:
from google.cloud import aiplatform
from google.cloud import bigquery
from google.cloud import service_usage_v1

from datetime import datetime
import json
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import gridspec
from sklearn import metrics

from google.protobuf import json_format
from google.protobuf.struct_pb2 import Value

In [58]:
# Set the project/region defaults for the Vertex AI SDK and create the BigQuery client used to sample training rows
aiplatform.init(project=PROJECT_ID, location=REGION)
bq = bigquery.Client(project = PROJECT_ID)

In [275]:
# Derived locations; the bucket name and the Docker repository name are both taken to be the project id
BUCKET = PROJECT_ID
URI = f"gs://{BUCKET}/{SERIES}/{EXPERIMENT}"
# Docker repository path on the regional `docker.pkg.dev` host (the serving image is pushed under it)
REPOSITORY = f"{REGION}-docker.pkg.dev/{PROJECT_ID}/{PROJECT_ID}"
# Local working directory for the generated predictor source files
DIR = f"temp/{EXPERIMENT}"

In [11]:
# Account that gcloud is currently configured with; the captured output is a list of lines, keep the first
SERVICE_ACCOUNT = !gcloud config list --format='value(core.account)' 
SERVICE_ACCOUNT = SERVICE_ACCOUNT[0]
SERVICE_ACCOUNT

'1026793852137-compute@developer.gserviceaccount.com'

In [12]:
!rm -rf {DIR}
!mkdir -p {DIR}

## Idea: Decision Tree on Samples

- The input parameter is a sample size `n`
- Retrieve a random sample of up to `n` rows from a BigQuery table into a Pandas dataframe
- Use `sklearn.tree.DecisionTreeClassifier` to fit a classifier on the sample
- Retrieve the decision rules that lead to each leaf of the tree

In [23]:
# sample size: used as the LIMIT of the training query in the next cell
n = 400

In [24]:
# Draw a random sample of at most `n` TRAIN rows: RAND() < 0.1 keeps roughly 10% of the rows
# and LIMIT caps the result, so the sample differs from run to run.
train = bq.query(query = f"""
        SELECT * EXCEPT(splits, transaction_id)
            FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}`
            WHERE splits = 'TRAIN' and RAND() < 0.1
            LIMIT {n}
        """).to_dataframe()

# Separate the label column from the feature columns
y = train[VAR_TARGET]
X = train.drop(VAR_TARGET, axis = 1)

In [25]:
from sklearn.tree import DecisionTreeClassifier

# Cap the tree at 3 leaves so the extracted rule set stays small.
# random_state is fixed for the tree itself; the sampled rows still vary between runs (RAND() in the query).
clf = DecisionTreeClassifier(max_leaf_nodes = 3, random_state = 0)
clf.fit(X, y)

DecisionTreeClassifier(max_leaf_nodes=3, random_state=0)

In [26]:
# Arrays describing the fitted tree, indexed by node id. They are read as globals by
# path_builder() and rule_builder() defined below.
n_nodes = clf.tree_.node_count
# child node ids; the helpers treat -1 as "no child on this side"
children_left = clf.tree_.children_left
children_right = clf.tree_.children_right
# per-node column index into X.columns and split threshold, used to render each rule
feature = clf.tree_.feature
threshold = clf.tree_.threshold

def path_builder(node_num, path, x):
    """Depth-first search for node `x`, recording the route from `node_num` down to it.

    Reads the module-level `children_left` / `children_right` arrays, where -1 marks a
    missing child.

    Args:
        node_num: id of the node to start from (0 for the root).
        path: list that is mutated in place; on success it holds the node ids from
            `node_num` to `x` in traversal order, on failure it is left as it was passed in.
        x: id of the node being searched for.

    Returns:
        True if `x` was found in the subtree rooted at `node_num`, otherwise False.
    """
    path.append(node_num)
    if node_num == x:
        return True
    # Stop as soon as one side contains the target: node ids are unique, so walking the
    # other subtree afterwards can never change the result.
    for child in (children_left[node_num], children_right[node_num]):
        if child != -1 and path_builder(child, path, x):
            return True
    # Dead end: every failed descendant already removed itself, so this node is the last entry.
    path.pop()
    return False


def rule_builder(path, column_names):
    """Render a root-to-leaf path as a single human-readable rule string.

    Reads the module-level `children_left`, `feature` and `threshold` arrays.

    Args:
        path: node ids ordered from the root to the leaf.
        column_names: indexable of feature names, looked up by `feature[node]`.

    Returns:
        The split conditions along the path joined with ' and '; an empty string when
        the path has fewer than two nodes.
    """
    conditions = []
    # Walk consecutive (parent, child) pairs; the leaf itself carries no condition.
    for parent, child in zip(path[:-1], path[1:]):
        # Going to the left child means the split test passed (<=), otherwise it failed (>).
        comparison = '<=' if children_left[parent] == child else '>'
        conditions.append(f"{column_names[feature[parent]]} {comparison} {threshold[parent]}")
    return ' and '.join(conditions)

# For every leaf that at least one training row lands in, collect the node ids on its root-to-leaf path
paths = {}
for leaf_id in np.unique(clf.apply(X)):
    trail = []
    path_builder(0, trail, leaf_id)
    # np.unique already returns its values sorted, which puts the path in ascending node-id order
    paths[leaf_id] = np.unique(trail)

# Turn each path into its rule text, keyed by leaf id
rules = {leaf_id: rule_builder(node_ids, X.columns) for leaf_id, node_ids in paths.items()}

rules

{1: 'V10 <= -2.574588894844055',
 3: 'V10 > -2.574588894844055 and V7 <= 1.294544279575348',
 4: 'V10 > -2.574588894844055 and V7 > 1.294544279575348'}

---
## Build CPR

In [32]:
!pip install google-cloud-aiplatform[prediction] -U -q

In [276]:
%%writefile {DIR}/SRC/requirements.txt
fastapi
uvicorn==0.17.6
#joblib~=1.0
numpy~=1.20
scikit-learn~=0.24
pandas
#google-cloud-storage>=1.26.0,<2.0.0dev
google-cloud-aiplatform[prediction]>=1.16.0
google-cloud-bigquery
pyarrow

Overwriting temp/cpr_training/SRC/requirements.txt


In [277]:
%%writefile {DIR}/SRC/predictor.py

# packages
import numpy as np
import json
from sklearn.tree import DecisionTreeClassifier
from google.cloud.aiplatform.prediction.predictor import Predictor
from google.cloud import bigquery

##################################################################################################

# clients
# NOTE(review): created at import time without an explicit project, so project and credentials
# come from the runtime environment - confirm this resolves as intended inside the serving container.
bq = bigquery.Client()

# source data
# Hard-coded because this file is written out as a standalone module; keep in sync with the
# notebook's PROJECT_ID / BQ_DATASET / BQ_TABLE.
BQ_PROJECT = 'statmike-mlops-349915'
BQ_DATASET = 'fraud'
BQ_TABLE = 'fraud_prepped'

# Model Training
# label column separated from the features before fitting
VAR_TARGET = 'Class'

def ruler(n):
    """Train a small decision tree on a fresh BigQuery sample and return its leaf rules.

    Args:
        n: requested sample size, used as the row LIMIT of the training query. The value
            arrives from the prediction request, so it is coerced to a positive integer
            before it is placed into the SQL text.

    Returns:
        dict mapping leaf node id (int) to a rule string whose conditions are joined with
        ' and '. A tree consisting of a single node yields {0: ''}.

    Raises:
        ValueError: if `n` cannot be interpreted as a positive integer.
    """
    # Validate before interpolating: anything that is not a plain positive integer is rejected
    # so request content can never alter the query.
    try:
        sample_size = int(n)
    except (TypeError, ValueError, OverflowError) as err:
        raise ValueError(f"sample size must be a positive integer, got {n!r}") from err
    if sample_size <= 0:
        raise ValueError(f"sample size must be a positive integer, got {n!r}")

    # helper function: depth-first search recording the node ids from node_num down to x
    def path_builder(node_num, path, x):
        path.append(node_num)
        if node_num == x:
            return True
        # stop at the first side that contains the target; -1 marks a missing child
        for child in (children_left[node_num], children_right[node_num]):
            if child != -1 and path_builder(child, path, x):
                return True
        # dead end: failed descendants already removed themselves, so this node is last
        path.pop()
        return False

    # helper function: render the split conditions along a root-to-leaf path
    def rule_builder(path, column_names):
        conditions = []
        for parent, child in zip(path[:-1], path[1:]):
            comparison = '<=' if children_left[parent] == child else '>'
            conditions.append(f"{column_names[feature[parent]]} {comparison} {threshold[parent]}")
        return ' and '.join(conditions)

    # data: RAND() keeps roughly 10% of the TRAIN rows and LIMIT caps the sample size
    train = bq.query(query = f"""
        SELECT * EXCEPT(splits, transaction_id)
            FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}`
            WHERE splits = 'TRAIN' and RAND() < 0.1
            LIMIT {sample_size}
        """).to_dataframe()
    y = train[VAR_TARGET]
    X = train.drop(VAR_TARGET, axis = 1)

    # model
    clf = DecisionTreeClassifier(max_leaf_nodes = 3, random_state = 0)
    clf.fit(X, y)

    # tree structure arrays read by the two helpers above
    children_left = clf.tree_.children_left
    children_right = clf.tree_.children_right
    feature = clf.tree_.feature
    threshold = clf.tree_.threshold

    # decision: one rule per leaf that at least one sampled row falls into
    rules = {}
    for leaf in np.unique(clf.apply(X)):
        path_leaf = []
        path_builder(0, path_leaf, leaf)
        # path_leaf is already ordered root -> leaf; plain int keys keep the result serializable
        rules[int(leaf)] = rule_builder(path_leaf, X.columns)

    return rules

##################################################################################################

class CprPredictor(Predictor):
    """Predictor that trains on demand: every instance is a sample size handed to ruler()."""

    def __init__(self):
        # nothing to set up: no state is kept between requests
        pass

    def load(self, artifacts_uri: str) -> None:
        """No-op: there is no stored model artifact, the model is trained inside predict()."""
        return None

    def predict(self, instances):
        """Run ruler() once per instance and return the stringified rule dictionaries.

        Args:
            instances: request payload; its "instances" entry is the list of sample sizes.

        Returns:
            {"predictions": [...]} with one string per requested sample size.
        """
        sample_sizes = instances["instances"]

        predictions = []
        for sample_size in sample_sizes:
            predictions.append(str(ruler(sample_size)))

        return {"predictions": predictions}

Overwriting temp/cpr_training/SRC/predictor.py


In [278]:
from google.cloud.aiplatform.prediction import LocalModel
# load the local predictor class:
# NOTE: this dotted path mirrors DIR ("temp/cpr_training") plus "/SRC"; update it if DIR or EXPERIMENT changes
from temp.cpr_training.SRC.predictor import CprPredictor

# Build the serving image from the source folder; the image name follows the
# {SERIES}_{EXPERIMENT} convention inside REPOSITORY.
local_model = LocalModel.build_cpr_model(
    src_dir = f"{DIR}/SRC",
    output_image_uri = f"{REPOSITORY}/{SERIES}_{EXPERIMENT}",
    predictor = CprPredictor,
    requirements_path = os.path.join(f"{DIR}/SRC", "requirements.txt"),
)

In [279]:
# Serve the built image locally for the duration of the `with` block and send one test request.
# Each instance is a sample size, so the request below asks for four separately trained trees.
with local_model.deploy_to_local_endpoint() as local_endpoint:
    predict_response = local_endpoint.predict(
        request = '{"instances": [100, 1000, 2000, 3000]}',
        headers={"Content-Type": "application/json"}
    )

    # also capture the health check result of the running local container
    health_check_response = local_endpoint.run_health_check()

In [280]:
# Print each returned rule set followed by a blank line. A plain loop is used so the cell
# does not also display a list of the None values returned by print().
for prediction in json.loads(predict_response.content)['predictions']:
    print(prediction + '\n')

{0: ''}

{1: 'V7 <= -11.742929458618164', 2: 'V7 > -11.742929458618164'}

{1: 'V10 <= -3.1931523084640503', 3: 'V10 > -3.1931523084640503 and V7 <= 1.2830491065979004', 4: 'V10 > -3.1931523084640503 and V7 > 1.2830491065979004'}

{1: 'V10 <= -3.0351459980010986', 3: 'V10 > -3.0351459980010986 and V13 <= -2.43116557598114', 4: 'V10 > -3.0351459980010986 and V13 > -2.43116557598114'}



[None, None, None, None]

In [281]:
# Register gcloud as the Docker credential helper for the regional registry host; needed before pushing the image
!gcloud auth configure-docker {REGION}-docker.pkg.dev --quiet


{
  "credHelpers": {
    "gcr.io": "gcloud",
    "us.gcr.io": "gcloud",
    "eu.gcr.io": "gcloud",
    "asia.gcr.io": "gcloud",
    "staging-k8s.gcr.io": "gcloud",
    "marketplace.gcr.io": "gcloud",
    "us-central1-docker.pkg.dev": "gcloud"
  }
}
Adding credentials for: us-central1-docker.pkg.dev
gcloud credential helpers already registered correctly.


In [282]:
# Push the locally built serving image (named by output_image_uri at build time) to the registry
local_model.push_image()

In [283]:
# Register the pushed container as a Vertex AI Model; the display name follows the {SERIES}_{EXPERIMENT} convention
model = aiplatform.Model.upload(
    local_model = local_model,
    display_name = f"{SERIES}_{EXPERIMENT}"
)

Creating Model
Create Model backing LRO: projects/1026793852137/locations/us-central1/models/3594015439153266688/operations/1390498823175929856
Model created. Resource name: projects/1026793852137/locations/us-central1/models/3594015439153266688@1
To use this Model in another session:
model = aiplatform.Model('projects/1026793852137/locations/us-central1/models/3594015439153266688@1')


In [284]:
# Deploy the model; no endpoint is passed, and the output below shows a new one being created.
# Replica count is allowed to range from 1 to 5 on DEPLOY_COMPUTE machines.
endpoint = model.deploy(
    machine_type = DEPLOY_COMPUTE,
    min_replica_count = 1,
    max_replica_count = 5
)

Creating Endpoint
Create Endpoint backing LRO: projects/1026793852137/locations/us-central1/endpoints/2800574863201271808/operations/5587290725931810816
Endpoint created. Resource name: projects/1026793852137/locations/us-central1/endpoints/2800574863201271808
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/1026793852137/locations/us-central1/endpoints/2800574863201271808')
Deploying model to Endpoint : projects/1026793852137/locations/us-central1/endpoints/2800574863201271808
Deploy Endpoint model backing LRO: projects/1026793852137/locations/us-central1/endpoints/2800574863201271808/operations/3166605926220169216
Endpoint model deployed. Resource name: projects/1026793852137/locations/us-central1/endpoints/2800574863201271808


In [None]:
# Same request shape as the local test: each instance is a sample size to train on
endpoint.predict(instances = [100, 1000, 2000])

In [286]:
# Clean up: with force = True the deployed model is undeployed before the endpoint is deleted (see output)
endpoint.delete(force = True)

Undeploying Endpoint model: projects/1026793852137/locations/us-central1/endpoints/2800574863201271808
Undeploy Endpoint model backing LRO: projects/1026793852137/locations/us-central1/endpoints/2800574863201271808/operations/1596538506128130048
Endpoint model undeployed. Resource name: projects/1026793852137/locations/us-central1/endpoints/2800574863201271808
Deleting Endpoint : projects/1026793852137/locations/us-central1/endpoints/2800574863201271808
Delete Endpoint  backing LRO: projects/1026793852137/locations/us-central1/operations/518489345326317568
Endpoint deleted. . Resource name: projects/1026793852137/locations/us-central1/endpoints/2800574863201271808


In [287]:
# Clean up: remove the uploaded model resource
model.delete()

Deleting Model : projects/1026793852137/locations/us-central1/models/3594015439153266688
Delete Model  backing LRO: projects/1026793852137/locations/us-central1/operations/7309917583401025536
Model deleted. . Resource name: projects/1026793852137/locations/us-central1/models/3594015439153266688
