# Google Cloud Platform Vertex AI - Model Building and Deployment with watsonx.gov AI Factsheet

Write the dependencies needed to build and serve the model to `requirements.txt`

In [None]:
%%writefile requirements.txt
fastapi
uvicorn==0.17.6
joblib~=1.0
numpy~=1.20
scikit-learn
pandas
google-cloud-storage>=1.26.0,<2.0.0dev
google-cloud-aiplatform[prediction]>=1.16.0

Pip install the dependencies in the notebook.

In [None]:
!pip install -U --user -r requirements.txt

## Setup AI Factsheet client

In [None]:
try:
    from ibm_aigov_facts_client import AIGovFactsClient
except:
    !pip install -U ibm-aigov-facts-client
    from ibm_aigov_facts_client import AIGovFactsClient
        
from ibm_aigov_facts_client import AIGovFactsClient,CloudPakforDataConfig

In [None]:
import os

# Connection details are read from the environment so that real credentials never
# have to be saved in the notebook. The "xxxxxx" placeholders remain as fallbacks
# for readers who prefer to edit the values inline.
creds = CloudPakforDataConfig(
    service_url=os.environ.get("CPD_SERVICE_URL", "xxxxxx"),
    username=os.environ.get("CPD_USERNAME", "xxxxxx"),
    api_key=os.environ.get("CPD_API_KEY", "xxxxxx"),
)

In [None]:
EXPERIMENT_NAME='credit-default-model'

In [None]:
# Create the AI Factsheet client bound to EXPERIMENT_NAME, with the
# external_model and enable_autolog options switched on.
facts_client = AIGovFactsClient(
    cloud_pak_for_data_configs=creds,
    experiment_name=EXPERIMENT_NAME,
    set_as_current_experiment=True,
    external_model=True,
    enable_autolog=True,
)

## Create the directories where the model artifacts are stored

In [None]:
USER_SRC_DIR = "src_dir"

In [None]:
!mkdir $USER_SRC_DIR

In [None]:
!mkdir model_artifacts

In [None]:
# copy the requirements to the source dir
!cp requirements.txt $USER_SRC_DIR/requirements.txt

The model is built with scikit-learn; import the necessary packages

In [None]:
import seaborn as sns
import numpy as np
import pandas as pd
import shutil

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

import joblib
import logging

# set logging to see the docker container logs
logging.basicConfig(level=logging.INFO)

# delete previous mlruns record if exists
shutil.rmtree('./mlruns', ignore_errors=True)

Identifiers to describe the model and the cloud storage bucket details.

In [None]:
REGION = "us-central1"
MODEL_ARTIFACT_DIR = "credit-default-model"
REPOSITORY = "credit-default"
IMAGE = "credit-default-image"
MODEL_DISPLAY_NAME = "credit-default-model"

# Replace with your project
PROJECT_ID = "prime-rainfall-425716-j6"

# Replace with your bucket
BUCKET_NAME = "gs://driven-density-wos-cpr-bucket"

Load the data

In [None]:
# Load the dataset from UCI_Credit_Card.csv in the working directory.
data = pd.read_csv("UCI_Credit_Card.csv")

# Name of the target column; reused below so the label is spelled in one place only.
label = 'defaultpaymentnextmonth'

# Target vector, and the feature matrix without the target and the ID column.
y = data[label]
X = data.drop(columns=[label, "ID"])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
X_train.head()

In [None]:
y_train.head()

In [None]:
len(X_train), len(y_test)

Column transformations - One hot encode the categorical features and scale the numerical features

In [None]:
# Positional indices refer to the columns of X (ID and the label are already dropped).
CATEGORICAL_COLUMNS = [1, 2, 3]
NUMERIC_COLUMNS = [0] + list(range(4, 23))

column_transform = make_column_transformer(
    # handle_unknown="ignore" encodes a category that was absent from the training
    # split as all zeros instead of raising at prediction time.
    (preprocessing.OneHotEncoder(handle_unknown="ignore"), CATEGORICAL_COLUMNS),
    (preprocessing.StandardScaler(), NUMERIC_COLUMNS),
)

Create a Logistic Regression

In [None]:
# instantiate the model (using the default parameters)
regr = LogisticRegression(random_state=16)

Create the model pipeline and fit it with the training data

In [None]:
my_pipeline = make_pipeline(column_transform, regr)

In [None]:
my_pipeline.fit(X_train, y_train)

Perform local predictions

In [None]:
my_pipeline.predict_proba(X_train.iloc[0:10])

In [None]:
# Column 1 of predict_proba for the training and test splits.
y_pred_train, y_pred_test = (
    my_pipeline.predict_proba(split)[:, 1] for split in (X_train, X_test)
)

In [None]:
# ROC operating points for each split.
fpr_train, tpr_train, thresholds_train = metrics.roc_curve(y_train, y_pred_train)
fpr_test, tpr_test, thresholds_test = metrics.roc_curve(y_test, y_pred_test)

# Area under each ROC curve.
auc_train, auc_test = (
    metrics.auc(fpr_train, tpr_train),
    metrics.auc(fpr_test, tpr_test),
)

print(f"Training AUC : {np.round(auc_train, 3)} and Test AUC :{np.round(auc_test, 3)}")

In [None]:
import matplotlib.pyplot as plt

# ROC curve of the test split drawn against the chance diagonal.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot([0, 1], [0, 1], label='random')

# Legend label carries the test AUC rounded to two decimals, in balanced parentheses.
ax.plot(fpr_test, tpr_test, label=f'Test (AUC={np.round(auc_test, 2)})')

ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.set_title('ROC curve')
ax.legend(loc='best')

# The saved image is attached to the factsheet later in the notebook.
fig.savefig('./ROC_curve.png')
plt.show()

Export the model pipeline to the artifacts folder

In [None]:
joblib.dump(my_pipeline, 'model_artifacts/model.joblib')

Copy the model artifact to the cloud storage bucket

In [None]:
!gsutil cp model_artifacts/model.joblib {BUCKET_NAME}/{MODEL_ARTIFACT_DIR}/

Define the pre-processing map for model inference

In [None]:
# Mapping from long-form clarity grades to their abbreviations; it is written to
# model_artifacts/preprocessor.json and uploaded to the bucket below.
# NOTE(review): the predictor defined later in this notebook does not read this
# mapping, and none of the credit-default features is a clarity grade — it looks
# like a leftover from a different (diamonds) example. Confirm whether it is needed.
clarity_dict={"Flawless": "FL",
              "Internally Flawless": "IF",
              "Very Very Slightly Included": "VVS1",
              "Very Slightly Included": "VS2",
              "Slightly Included": "S12",
              "Included": "I3"}

In [None]:
import json
with open("model_artifacts/preprocessor.json", "w") as f:
    json.dump(clarity_dict, f)

In [None]:
!gsutil cp model_artifacts/preprocessor.json {BUCKET_NAME}/{MODEL_ARTIFACT_DIR}/

Define the Custom Prediction Routine to load the model, pre-process the data and post-process the scoring response to what the wrapping WML scoring endpoint and thereby OpenScale expects

In [None]:
%%writefile $USER_SRC_DIR/predictor.py

import joblib
import numpy as np
import json

from google.cloud import storage
from google.cloud.aiplatform.prediction.sklearn.predictor import SklearnPredictor


class CprPredictor(SklearnPredictor):
    """Custom Prediction Routine that serves the scikit-learn pipeline.

    Loading and request pre-processing are delegated to ``SklearnPredictor``.
    ``predict`` is replaced to return class probabilities and ``postprocess``
    to emit the ``fields``/``values`` scoring layout.
    """

    def __init__(self):
        return

    def load(self, artifacts_uri: str) -> None:
        """Loads the model artifact found under ``artifacts_uri`` via the base class."""
        super().load(artifacts_uri)

    def preprocess(self, prediction_input: np.ndarray) -> np.ndarray:
        """Returns the instances produced by the base-class pre-processing, unchanged."""
        return super().preprocess(prediction_input)

    def predict(self, instances):
        """Returns the loaded model's ``predict_proba`` output for ``instances``."""
        return self._model.predict_proba(instances)

    def postprocess(self, prediction_results: np.ndarray) -> dict:
        """Converts class probabilities into the fields/values scoring format.

        Args:
            prediction_results: One row of class probabilities per instance, as
                returned by ``predict``.

        Returns:
            ``{"predictions": [{"fields": ["prediction", "probability"],
            "values": [[predicted_class, [p0, p1, ...]], ...]}]}``.
        """
        # The predicted class is the index of the largest probability. Rounding
        # the first probability (P(class 0)) instead would report 1 exactly when
        # class 0 is the more likely one.
        # NOTE(review): the index equals the label only while the classes are
        # 0..k-1, as with the 0/1 target used here — confirm if labels change.
        values = [
            [int(np.argmax(probabilities)), probabilities.tolist()]
            for probabilities in prediction_results
        ]
        return {"predictions": [{"fields": ["prediction", "probability"], "values": values}]}

Build the Custom Prediction Routine Docker image

In [None]:
import importlib
import os

from google.cloud import aiplatform
from google.cloud.aiplatform.prediction import LocalModel

aiplatform.init(project=PROJECT_ID, location=REGION)

# Import the predictor from whichever directory USER_SRC_DIR names instead of a
# hard-coded "src_dir" package. USER_SRC_DIR must be a plain directory name inside
# the working directory for this dotted import to resolve.
CprPredictor = importlib.import_module(f"{USER_SRC_DIR}.predictor").CprPredictor

# Build the serving image from the predictor sources and their requirements file.
local_model = LocalModel.build_cpr_model(
    USER_SRC_DIR,
    f"{REGION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY}/{IMAGE}",
    predictor=CprPredictor,
    requirements_path=os.path.join(USER_SRC_DIR, "requirements.txt"),
)

In [None]:
X_test.iloc[0:10].to_dict("split")["data"]

In [None]:
import json

sample = {"instances": X_test.iloc[0:10].to_dict("split")["data"]}

with open('instances.json', 'w') as fp:
    json.dump(sample, fp)

Make local predictions against the predictor routine

In [None]:
# Serve the image on a local endpoint fed from the local model_artifacts/ directory,
# score instances.json against it, then query the endpoint's health check.
# predict_response is inspected in the next cell; health_check_response is not
# used again in this notebook.
with local_model.deploy_to_local_endpoint(
    artifact_uri = 'model_artifacts/', # local path to artifacts
) as local_endpoint:
    predict_response = local_endpoint.predict(
        request_file='instances.json',
        headers={"Content-Type": "application/json"},
    )

    health_check_response = local_endpoint.run_health_check()

The scoring response:

In [None]:
predict_response.content

In [None]:
REPOSITORY

In [None]:
!gcloud artifacts repositories create {REPOSITORY} --repository-format=docker --location=us-central1 --description="Docker repository"

In [None]:
!gcloud auth configure-docker {REGION}-docker.pkg.dev --quiet

Push the Custom Prediction Routine Docker image

In [None]:
local_model.push_image()

In [None]:
MODEL_DISPLAY_NAME

In [None]:
BUCKET_NAME

In [None]:
MODEL_ARTIFACT_DIR

In [None]:
model = aiplatform.Model.upload(local_model = local_model,
                                display_name=MODEL_DISPLAY_NAME,
                                artifact_uri=f"{BUCKET_NAME}/{MODEL_ARTIFACT_DIR}",)

Deploy the model to an endpoint backed by the Custom Prediction Routine image

In [None]:
endpoint = model.deploy(machine_type="n1-standard-2")

Perform scoring against the endpoint

In [None]:
endpoint.predict(instances=X_test.iloc[0:100].to_dict("split")["data"])

In [None]:
endpoint.predict(instances=X_test.iloc[0:1].to_dict("split")["data"])

Add deployment details for factsheet

In [None]:
# The SDK exposes the fully qualified endpoint name directly, so it does not have
# to be parsed out of str(endpoint); resource_name is therefore always defined
# for the deployment details built in the following cells.
resource_name = endpoint.resource_name
print(resource_name)

## Send facts to watsonx AI Factsheet

In [None]:
from ibm_aigov_facts_client.supporting_classes.factsheet_utils import DeploymentDetails,TrainingDataReference,ExternalModelSchemas
import json

In [None]:
# Identifier and display name under which the model is saved to the factsheet.
model_identifier = "GCP-credit-default-model"
model_name = "GCP-VertexAI-Credit-Default-logreg--model"

# Online deployment whose scoring endpoint is the Vertex AI endpoint resource name.
deployment_details = DeploymentDetails(
    identifier="GCP-credit-default-model",
    name="GCP-credit-model-default-deployment",
    deployment_type="online",
    scoring_endpoint=resource_name,
)

In [None]:
#add custom facts definitions
import wget, time

!rm Asset_type_definition.csv
wget.download("https://raw.githubusercontent.com/IBM/ai-governance-factsheet-samples/main/Assets/data/Asset_type_definition.csv")
pd.read_csv('Asset_type_definition.csv')
facts_client.assets.create_custom_facts_definitions("Asset_type_definition.csv",overwrite=True)
time.sleep(5)

In [None]:
# function to convert input columns as input schema for AI factsheet
def convert_column_to_json(column_name, frame=None):
    """Describe one column as a schema field entry for the AI factsheet.

    Args:
        column_name: Name of the column to describe.
        frame: DataFrame holding the column. Defaults to the notebook-level
            ``data`` frame when omitted.

    Returns:
        A dict with ``metadata``, ``name``, ``nullable`` and ``type`` keys.
        ``type`` is ``"string"`` for object columns, ``"double"`` for
        floating-point columns and ``"integer"`` for everything else.
    """
    source = data if frame is None else frame
    dtype = source[column_name].dtype

    if dtype == "object":
        field_type = "string"
    elif pd.api.types.is_float_dtype(dtype):
        # Floating-point columns must not be advertised as integers.
        field_type = "double"
    else:
        field_type = "integer"

    return {
        "metadata": {
            "columnInfo": {
                "columnLength": 64
            },
            "measure": "discrete",
            "modeling_role": "feature"
        },
        "name": column_name,
        "nullable": True,
        "type": field_type
    }

# Get column names from DataFrame
# NOTE(review): only the label column is dropped here, so "ID" stays in the
# schema even though the model is trained on X, which excludes "ID" — confirm
# whether the input schema should list the model features only.
input_df = data.drop(['defaultpaymentnextmonth'],axis=1)
columns = input_df.columns.tolist()

# Convert each column to JSON format
fields = [convert_column_to_json(column) for column in columns]

# Create the final JSON structure
input_payload = [{"fields": fields, "type": "struct"}]

In [None]:
# Derive training data reference schema from model training input schema
training_data_schema={}

input_schema= input_payload[0]
training_data_schema["schema"]=input_schema
train_data_ref=TrainingDataReference(schema=training_data_schema)

external_schemas=ExternalModelSchemas(input=input_payload)

In [None]:
# Save the external model asset together with its deployment details, schemas
# and training-data reference.
external_model = facts_client.external_model_facts.save_external_model_asset(
    model_identifier=model_identifier,
    name=model_name,
    deployment_details=deployment_details,
    schemas=external_schemas,
    training_data_reference=train_data_ref,
)

### Optional custom model facts

In [None]:
# Report the actual split sizes and shares rather than fixed figures, so the
# facts stay consistent with the train/test split performed earlier.
n_train, n_test = len(X_train), len(X_test)
n_total = n_train + n_test
train_share = round(100 * n_train / n_total)
test_share = round(100 * n_test / n_total)

external_model.set_custom_fact(fact_id="TrainingData_Size", value=n_train)
external_model.set_custom_fact(fact_id="TrainingData_Ratio", value=f"{train_share}% of total")
external_model.set_custom_fact(fact_id="TestData_Size", value=n_test)
external_model.set_custom_fact(fact_id="TestData_Ratio", value=f"{test_share}% of total")

In [None]:
external_model.set_attachment_fact(file_to_upload="ROC_curve.png",description="ROC Curve",fact_id="0001")

In [None]:
external_model.get_all_facts()

In [None]:
#model_usecase=external_model.get_tracking_model_usecase()
#model_usecase.get_info()

In [None]:
#model_usecase.get_tracked_models()

In [None]:
external_model.get_environment_type()

In [None]:
# Change the model lifecycle stage if needed: move the asset from the "test"
# container to the "validate" container.
external_model.set_environment_type(from_container="test",to_container="validate")

In [None]:
external_model.set_environment_type(from_container="validate",to_container="operate")