# Notebook to reproduce Explainability issues using prebuilt container for custom scikit-learn model 

 
I have created a scikit-learn based classifier ( RandomForest Classifier) and deployed it to Vertex AI endpoit using prebuilt container. I am able to use the endpoint for generating the predictions but not for generating explanations.

In this note book, I created some test data, trained the model, uploaded it to Model registry. Then deployed it to endpoint. 

In [30]:
# Import standard libraries. Set the Tensorflow log level to minimum to avoid information level messages from TFDV
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import warnings
warnings.filterwarnings("ignore")
import json
import pprint
import pandas as pd
import random
import sys
import random
import numpy as np
import string
import warnings
warnings.filterwarnings("ignore")
# Modelling
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
import joblib
from google.cloud import storage
import tensorflow_data_validation as tfdv
from sklearn.metrics import confusion_matrix
from typing import List
from google.cloud import aiplatform

In [2]:
print(f"Version of scikit learn {sklearn.__version__}")

Version of scikit learn 1.2.2


In [3]:
# Set the static prameters
RANDOM_SEED=42
PROJECT='motorway-appsbroker'
REGION='europe-west2'
MODEL_DIR= f'./model'
STAGING_BUCKET='mo_ab_vertex_ai_dev_env'
MODEL_PATH = f'{MODEL_DIR}/model.joblib'

## Generate some test data 

In [4]:
# Helper method to generate sample data 
def generate_sample(num_samples=5000, random_seed=42):
    
    def generate_unique_identifier():
        chars = string.ascii_uppercase + string.digits
        return ''.join(random.choice(chars) for _ in range(6))
    
    makes = ["BMW", "Mercedes", "Audi", "Toyota", "Honda"]
    fuel_types = ["Petrol", "Diesel", "Hybrid", "Electric"]
    transmissions = ["Automatic", "Manual"]
    body_types = ["Hatchback", "Sedan", "SUV", "Convertible"]
    vehicle_colours = ["Red", "Blue", "White", "Black", "Silver", "Orange"]
    damage = ['1','0']
    #random.seed(random_seed)
    data = []
    for _ in range(num_samples):
        sample = {
            "request_unique_identifier": generate_unique_identifier(),
            "make": random.choice(makes),
            "generic_model": random.choice(['yes', 'no','None']),
            "fuel": random.choice(fuel_types),
            "transmission": random.choice(transmissions),
            "body": random.choice(body_types),
            "vehicle_colour": random.choice(vehicle_colours),
            "previous_keepers": random.randint(1, 3),
            "number_of_doors": random.randint(2, 5),
            "engine_size": random.randint(1000, 5000),
            "year": random.randint(2010, 2023),
            "mileage": random.randint(1000, 80000),
            "cap_price": random.randint(20000, 30000),
            "damage": random.choices(damage,weights=(0.3,0.7))[0]
        }
        data.append(sample)
    
    return pd.DataFrame(data)

In [5]:
# Generate the DataFrame with 5000 samples
data = generate_sample(num_samples=5000, random_seed=RANDOM_SEED)
data.head(5)

Unnamed: 0,request_unique_identifier,make,generic_model,fuel,transmission,body,vehicle_colour,previous_keepers,number_of_doors,engine_size,year,mileage,cap_price,damage
0,JT1LMP,Toyota,yes,Diesel,Automatic,SUV,Red,3,5,4727,2013,48792,28396,1
1,QJY9IZ,Honda,,Hybrid,Manual,SUV,Orange,3,4,3784,2010,75072,27629,0
2,VEM1BK,Mercedes,yes,Electric,Automatic,Convertible,Red,2,4,2357,2020,78813,21360,0
3,V3Z8G2,Audi,yes,Electric,Automatic,SUV,Orange,2,5,1626,2021,60183,28836,0
4,BCFOY3,Toyota,yes,Hybrid,Manual,SUV,Blue,1,5,2649,2018,33287,20433,0


In [6]:
# Check data types
data.dtypes

request_unique_identifier    object
make                         object
generic_model                object
fuel                         object
transmission                 object
body                         object
vehicle_colour               object
previous_keepers              int64
number_of_doors               int64
engine_size                   int64
year                          int64
mileage                       int64
cap_price                     int64
damage                       object
dtype: object

In [7]:
# Create a new feature called age 
data['age']=data['year'].apply(lambda x: 2023-x)
data.drop(columns=['year'],inplace=True)

In [8]:
# Check the distribution of label 
data.damage.value_counts()

0    3565
1    1435
Name: damage, dtype: int64

# TFDV visualisation

In [9]:
stats = tfdv.generate_statistics_from_dataframe(data)

In [10]:
# Visualise the data distributions.
tfdv.visualize_statistics(stats)

In [11]:
# Extract the schema 
schema=tfdv.infer_schema(stats)
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'request_unique_identifier',BYTES,required,,-
'make',STRING,required,,'make'
'generic_model',STRING,required,,'generic_model'
'fuel',STRING,required,,'fuel'
'transmission',STRING,required,,'transmission'
'body',STRING,required,,'body'
'vehicle_colour',STRING,required,,'vehicle_colour'
'previous_keepers',INT,required,,-
'number_of_doors',INT,required,,-
'engine_size',INT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'make',"'Audi', 'BMW', 'Honda', 'Mercedes', 'Toyota'"
'generic_model',"'None', 'no', 'yes'"
'fuel',"'Diesel', 'Electric', 'Hybrid', 'Petrol'"
'transmission',"'Automatic', 'Manual'"
'body',"'Convertible', 'Hatchback', 'SUV', 'Sedan'"
'vehicle_colour',"'Black', 'Blue', 'Orange', 'Red', 'Silver', 'White'"


In [12]:
# Save the categorical encoding features 
domains=schema.string_domain
domain_kv={}
for domain in domains:
    domain_kv[domain.name]= list(domain.value)
print(domain_kv)
with open(f'{MODEL_DIR}/categorical_features.json', 'w') as f:
    json.dump(domain_kv, f)

{'make': ['Audi', 'BMW', 'Honda', 'Mercedes', 'Toyota'], 'generic_model': ['None', 'no', 'yes'], 'fuel': ['Diesel', 'Electric', 'Hybrid', 'Petrol'], 'transmission': ['Automatic', 'Manual'], 'body': ['Convertible', 'Hatchback', 'SUV', 'Sedan'], 'vehicle_colour': ['Black', 'Blue', 'Orange', 'Red', 'Silver', 'White']}


In [13]:
# Set the columns
categorical_cols=['make', 'generic_model', 'fuel','transmission', 'body', 'vehicle_colour' ]
numeric_cols=['previous_keepers','number_of_doors','engine_size','age','mileage','cap_price']
feature_cols=categorical_cols + numeric_cols
label_cols=['damage']

In [14]:
# Set the X and Y datasets
X=data[feature_cols]
Y=data[label_cols]

In [15]:
# Split of data into Train and Test
X_train, X_test, Y_train, Y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.30, 
                                                    random_state=RANDOM_SEED,
                                                     shuffle=True)

In [16]:
# Use onehot encoding
encoder = OneHotEncoder(handle_unknown='ignore')
encoded_categorical_train = encoder.fit_transform(X_train[categorical_cols])
encoded_categorical_test = encoder.transform(X_test[categorical_cols])
encoded_feature_names = encoder.get_feature_names_out(input_features=categorical_cols)
X_train_encoded = pd.concat([pd.DataFrame(encoded_categorical_train.toarray(), columns=encoded_feature_names),
                                      X_train[numeric_cols].reset_index(drop=True)], axis=1)
X_test_encoded = pd.concat([pd.DataFrame(encoded_categorical_test.toarray(), columns=encoded_feature_names),
                                     X_test[numeric_cols].reset_index(drop=True)], axis=1)

print(f"Dimensions of X : {X_train_encoded.shape}")

Dimensions of X : (3500, 30)


In [17]:
# Use TFDV to record the feature set 
stats = tfdv.generate_statistics_from_dataframe(X_train_encoded)
schema=tfdv.infer_schema(stats)
# Create 
features = [feature.name for feature in schema.feature]
with open(f'{MODEL_DIR}/features.txt','w') as f:
    for feature in features:
        f.write(feature)
        f.write('\n')

In [18]:
# Create a pipeline
pipeline = Pipeline([
    ('clf', RandomForestClassifier())
])

In [19]:
# Train the pipeline
pipeline.fit(X_train_encoded, Y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test_encoded)

# Evaluate the model
cm = confusion_matrix(Y_test, y_pred)
print(f"Confusion matrix : {cm}")

Confusion matrix : [[1067   11]
 [ 412   10]]


In [20]:
# Save the trained model
joblib.dump(pipeline, MODEL_PATH)

['./model/model.joblib']

# Serving side code

In [44]:
# Helper function runs on serving serving server

def pre_processing(requests: List, model_dir: str) -> pd.DataFrame:
    # Load instance as dataframe
    instance = pd.DataFrame.from_dict(requests)

    instance['age'] = instance['year'].apply(lambda x: 2023 - x)
    instance.drop(columns='year', inplace=True)

    # Load known categorical features 
    with open(f'{model_dir}/categorical_features.json') as f:
        try:
            cat_cols = json.load(f)
        except Exception as e:
            raise Exception(f'Error encountered while reading cat_cols. Error {e}')

    # Load sequence of features 
    with open(f'{model_dir}/features.txt') as f:
        try:
            features = f.readlines()
        except Exception as e:
            raise Exception(f'Error encountered while reading features file. Error {e}')

    # Apply one hot encoding
    categorical_cols = list(cat_cols.keys())
    updated_instance = instance.copy(deep=True)
    for col in categorical_cols:
        temp = pd.Categorical(instance[col], categories=cat_cols[col])
        updated_instance = pd.concat([updated_instance, pd.get_dummies(temp, prefix=f'{col}')], axis=1)

    updated_instance.drop(columns=categorical_cols, inplace=True)

    # Extract any features that are not needed
    request_unique_identifier = updated_instance.pop('request_unique_identifier')
    features = [i.replace('\n', '') for i in features]

    # Get the order of features correct
    updated_instance = updated_instance[features]

    return updated_instance, request_unique_identifier.values[0]

In [45]:
# Create a sample prediction request
prediction_request = generate_sample(num_samples=1, random_seed=RANDOM_SEED)
prediction_request.drop(columns=['damage'],inplace=True)
prediction_request = prediction_request.to_json(orient='records')
prediction_request = { 'instances': json.loads(prediction_request)}
prediction_request

{'instances': [{'request_unique_identifier': 'K933PF',
   'make': 'Audi',
   'generic_model': 'no',
   'fuel': 'Electric',
   'transmission': 'Manual',
   'body': 'Convertible',
   'vehicle_colour': 'Silver',
   'previous_keepers': 1,
   'number_of_doors': 3,
   'engine_size': 3345,
   'year': 2011,
   'mileage': 5710,
   'cap_price': 26856}]}

In [46]:
# Run pre-processing on the request 
processed_instance,enquiry_id=pre_processing(prediction_request['instances'],model_dir=MODEL_DIR)

In [50]:
# Load model into memory
source='model'
pipeline=joblib.load(f'./{MODEL_DIR}/model.joblib')

In [51]:
# Run prediction
output = {'probabilities':[ {
            'label' : pipeline.predict_proba(processed_instance),
            'source': source,
            'enquiry_id': enquiry_id
}
    ]
        }
output

{'probabilities': [{'label': array([[0.58, 0.42]]),
   'source': 'model',
   'enquiry_id': 'K933PF'}]}

# Upload the model to Vertex AI registry using pre-built container

In [28]:
!gsutil cp -r 'model' gs://{STAGING_BUCKET}/

Copying file://model/tfdv_graph.html [Content-Type=text/html]...
Copying file://model/features.txt [Content-Type=text/plain]...                  
Copying file://model/model.joblib [Content-Type=application/octet-stream]...    
Copying file://model/categorical_features.json [Content-Type=application/json]...
- [4 files][ 12.4 MiB/ 12.4 MiB]    1.0 MiB/s                                   
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying file://model/input_schema.pb [Content-Type=application/octet-stream]...
- [5 files][ 12.4 MiB/ 12.4 MiB]   1006 KiB/s                                   
Operation completed over 5 objects/12.4 MiB.                                     


In [39]:
# Extract all the features into memory
with open(f'./{MODEL_DIR}/features.txt') as f:
    features=f.readlines()
features = [feature.replace('\n','') for feature in features]

# built input schema for explanability metadata
inputs={}
for feature in features:
    inputs[feature]={}
inputs

{'make_Audi': {},
 'make_BMW': {},
 'make_Honda': {},
 'make_Mercedes': {},
 'make_Toyota': {},
 'generic_model_None': {},
 'generic_model_no': {},
 'generic_model_yes': {},
 'fuel_Diesel': {},
 'fuel_Electric': {},
 'fuel_Hybrid': {},
 'fuel_Petrol': {},
 'transmission_Automatic': {},
 'transmission_Manual': {},
 'body_Convertible': {},
 'body_Hatchback': {},
 'body_SUV': {},
 'body_Sedan': {},
 'vehicle_colour_Black': {},
 'vehicle_colour_Blue': {},
 'vehicle_colour_Orange': {},
 'vehicle_colour_Red': {},
 'vehicle_colour_Silver': {},
 'vehicle_colour_White': {},
 'previous_keepers': {},
 'number_of_doors': {},
 'engine_size': {},
 'age': {},
 'mileage': {},
 'cap_price': {}}

In [41]:
explanation_parameters=aiplatform.explain.ExplanationParameters({"sampled_shapley_attribution": {"path_count": 25}})
explanation_metadata = aiplatform.explain.ExplanationMetadata(
    inputs=inputs,
    outputs={"predictions": {}},
)

In [52]:
# Prebuilt images : https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers#scikit-learn
%%time
serving_container_image_uri='europe-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-2:latest'
model_display_name='dummy-classifier'
model = aiplatform.Model.upload(project=PROJECT,
                                location=REGION,
                                display_name='dummy-classifier',
                                artifact_uri=f'gs://{STAGING_BUCKET}/model',
                                serving_container_image_uri=serving_container_image_uri,
                                explanation_parameters=explanation_parameters,
                                explanation_metadata=explanation_metadata,
                                upload_request_timeout=1800,
                                sync=True)


Creating Model
Create Model backing LRO: projects/234586475385/locations/europe-west2/models/5516214652180103168/operations/3218592691407290368
Model created. Resource name: projects/234586475385/locations/europe-west2/models/5516214652180103168@1
To use this Model in another session:
model = aiplatform.Model('projects/234586475385/locations/europe-west2/models/5516214652180103168@1')


In [53]:
model.deploy(machine_type='n1-standard-8')

Creating Endpoint
Create Endpoint backing LRO: projects/234586475385/locations/europe-west2/endpoints/6435582294861283328/operations/1723397615120285696
Endpoint created. Resource name: projects/234586475385/locations/europe-west2/endpoints/6435582294861283328
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/234586475385/locations/europe-west2/endpoints/6435582294861283328')
Deploying model to Endpoint : projects/234586475385/locations/europe-west2/endpoints/6435582294861283328
Deploy Endpoint model backing LRO: projects/234586475385/locations/europe-west2/endpoints/6435582294861283328/operations/4303960201603579904
Endpoint model deployed. Resource name: projects/234586475385/locations/europe-west2/endpoints/6435582294861283328


<google.cloud.aiplatform.models.Endpoint object at 0x7fa89258fac0> 
resource name: projects/234586475385/locations/europe-west2/endpoints/6435582294861283328

In [61]:
model_display_name='dummy-classifier'

In [67]:
endpoints=aiplatform.Endpoint.list(project=PROJECT,
                                  location=REGION,filter=f'display_name={model_display_name}_endpoint')
endpoint=aiplatform.Endpoint(endpoints[0].resource_name)

In [78]:
prediction_request = generate_sample(num_samples=1, random_seed=RANDOM_SEED)
prediction_request.drop(columns=['damage'],inplace=True)
prediction_request = prediction_request.to_json(orient='records')
prediction_request = { 'instances': json.loads(prediction_request)}
# Run pre-processing on the request 
processed_instance,enquiry_id=pre_processing(prediction_request['instances'],model_dir=MODEL_DIR)

In [101]:
# Predict using preprocessed instance 
endpoint.predict(processed_instance.values.reshape(1,-1).tolist())

Prediction(predictions=['0'], deployed_model_id='2810835505711677440', model_version_id='1', model_resource_name='projects/234586475385/locations/europe-west2/models/5516214652180103168', explanations=None)

In [103]:
# Get explanations
endpoint.explain(processed_instance.values.reshape(1,-1).tolist())

InvalidArgument: 400 {"error": "Unable to explain the requested instance(s) because: Nameless inputs are allowed only if there is a single input in the explanation metadata."}

In [113]:
%%time
# As the explanations fail with Nameless inputs, I tried with single input in the metadata 
inputs={
"features": {}

}
explanation_parameters=aiplatform.explain.ExplanationParameters({"sampled_shapley_attribution": {"path_count": 25}})
explanation_metadata = aiplatform.explain.ExplanationMetadata(
    inputs=inputs,
    outputs={"predictions": {}},
)


serving_container_image_uri='europe-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-2:latest'
model_display_name='dummy-classifier'
model = aiplatform.Model.upload(project=PROJECT,
                                location=REGION,
                                display_name='dummy-classifier',
                                artifact_uri=f'gs://{STAGING_BUCKET}/model',
                                is_default_version=True,            # make the new model default
                                serving_container_image_uri=serving_container_image_uri,
                                explanation_parameters=explanation_parameters,
                                explanation_metadata=explanation_metadata,
                                upload_request_timeout=1800,
                                sync=True)



Creating Model
Create Model backing LRO: projects/234586475385/locations/europe-west2/models/1261438904221827072/operations/6455554923579834368
Model created. Resource name: projects/234586475385/locations/europe-west2/models/1261438904221827072@1
To use this Model in another session:
model = aiplatform.Model('projects/234586475385/locations/europe-west2/models/1261438904221827072@1')
CPU times: user 88 ms, sys: 56 ms, total: 144 ms
Wall time: 2min 39s


In [115]:
# Deploy the new model 
endpoints=aiplatform.Endpoint.list(project=PROJECT,
                                  location=REGION,filter=f'display_name={model_display_name}_endpoint')
endpoint=aiplatform.Endpoint(endpoints[0].resource_name)


In [117]:
endpoint.deploy(model)

Deploying Model projects/234586475385/locations/europe-west2/models/1261438904221827072 to Endpoint : projects/234586475385/locations/europe-west2/endpoints/6435582294861283328
Using default machine_type: n1-standard-2
Deploy Endpoint model backing LRO: projects/234586475385/locations/europe-west2/endpoints/6435582294861283328/operations/3340189881346293760
Endpoint model deployed. Resource name: projects/234586475385/locations/europe-west2/endpoints/6435582294861283328


In [119]:
prediction_request = generate_sample(num_samples=1, random_seed=RANDOM_SEED)
prediction_request.drop(columns=['damage'],inplace=True)
prediction_request = prediction_request.to_json(orient='records')
prediction_request = { 'instances': json.loads(prediction_request)}
# Run pre-processing on the request 
processed_instance,enquiry_id=pre_processing(prediction_request['instances'],model_dir=MODEL_DIR)
endpoint.predict(processed_instance.values.reshape(1,-1).tolist())

Prediction(predictions=['0'], deployed_model_id='2810835505711677440', model_version_id='1', model_resource_name='projects/234586475385/locations/europe-west2/models/5516214652180103168', explanations=None)

In [122]:
# I get the same error 
endpoint.explain(processed_instance.values.reshape(1,-1).tolist())

InvalidArgument: 400 {"error": "Unable to explain the requested instance(s) because: Nameless inputs are allowed only if there is a single input in the explanation metadata."}