In [1]:
import kfp
from kfp import dsl
from kfp import compiler
from kfp.dsl import (component, pipeline, Input, Output, Dataset, Model, Artifact, Metrics)
import google.cloud.aiplatform as aiplatform

# NOTE(review): package versions are unpinned, exactly as in the original runtime
# `pip install`; pin them (and consider an explicit `base_image`) for reproducible runs.
@component(packages_to_install=['google-cloud-storage', 'pandas'])
def ingest_test_data(raw_data: Output[Dataset]):
    """Copy the Black Friday test CSV from Cloud Storage into the ``raw_data`` artifact.

    Downloads ``test.csv`` from the ``final_demo2_blackfriday`` bucket, parses it
    with pandas, and writes it back out as CSV (without the index column) at
    ``raw_data.path``.

    Dependencies are declared through ``packages_to_install`` on the decorator,
    so they are installed before the function body starts instead of the body
    shelling out to ``pip`` itself.

    Args:
        raw_data: Output dataset artifact whose ``path`` receives the CSV.
    """
    # Imports live inside the function because the body runs on its own in the
    # component container, not in the notebook kernel.
    from io import BytesIO

    import pandas as pd
    from google.cloud import storage

    # No explicit project or credentials are passed to the client.
    storage_client = storage.Client()
    test_blob = storage_client.bucket("final_demo2_blackfriday").blob("test.csv")
    black_friday_data = test_blob.download_as_bytes()

    # Round-trip through pandas so the artifact is a parsed, re-serialised CSV.
    black_friday_test_df = pd.read_csv(BytesIO(black_friday_data))
    black_friday_test_df.to_csv(raw_data.path, index=False)


# NOTE(review): package versions are unpinned, exactly as in the original runtime
# `pip install`. The pickled model presumably needs library versions compatible
# with the ones it was trained with - confirm and pin.
@component(
    packages_to_install=['joblib', 'pandas', 'scikit-learn', 'xgboost', 'google-cloud-storage']
)
def batch_prediction(
    engineered_test_df: Input[Dataset],
    predictions_output: Output[Dataset]
):
    """Score the engineered test set with the model stored in Cloud Storage.

    Fetches ``model.joblib`` and ``selected_features_names.json`` from the
    ``final_demo2_blackfriday`` bucket, calls ``model.predict`` on the selected
    feature columns of the engineered CSV, and writes a CSV with a single
    ``Prediction`` column (no index) to ``predictions_output.path``.

    Dependencies are declared through ``packages_to_install`` on the decorator
    rather than installed with ``pip`` from inside the function body.

    Args:
        engineered_test_df: Input dataset artifact; its ``path`` is read as CSV.
        predictions_output: Output dataset artifact whose ``path`` receives the
            predictions CSV.
    """
    # Imports live inside the function because the body runs on its own in the
    # component container, not in the notebook kernel.
    import json
    import os
    import tempfile

    import joblib
    import pandas as pd
    # Not referenced by name; kept from the original so that a missing xgboost
    # install fails here rather than while unpickling the model.
    # NOTE(review): presumably the stored model is xgboost-backed - confirm.
    import xgboost  # noqa: F401
    from google.cloud import storage

    # GCS locations of the trained model and the feature list.
    bucket_name = 'final_demo2_blackfriday'
    model_blob_name = 'model.joblib'
    selected_features_blob_name = 'selected_features_names.json'

    bucket = storage.Client().bucket(bucket_name)

    # The feature list is parsed straight from the downloaded bytes; no temp file needed.
    selected_feature_names = json.loads(
        bucket.blob(selected_features_blob_name).download_as_bytes()
    )
    # 'Purchase' (presumably the training target) is never used as a model input.
    selected_feature_names = [
        name for name in selected_feature_names if name != 'Purchase'
    ]

    # joblib loads from a file path, so stage the model in a private scratch
    # directory that is removed afterwards instead of a fixed /tmp location.
    with tempfile.TemporaryDirectory() as scratch_dir:
        local_model_path = os.path.join(scratch_dir, model_blob_name)
        bucket.blob(model_blob_name).download_to_filename(local_model_path)
        # SECURITY: joblib.load unpickles the file and can execute arbitrary code.
        # Only load model artifacts from a bucket whose writers are trusted.
        model = joblib.load(local_model_path)

    # Load the engineered data and predict on the selected columns only.
    df = pd.read_csv(engineered_test_df.path)
    predictions = model.predict(df[selected_feature_names])

    # Save predictions to the output artifact as CSV.
    predictions_df = pd.DataFrame({'Prediction': predictions})
    predictions_df.to_csv(predictions_output.path, index=False)


  return component_factory.create_component_from_func(


In [2]:
# Load components from the separate file
from preprocessing_module import preprocessing, feature_engineering

@dsl.pipeline(name="black_friday_sales_inference_pipeline")
def black_friday_sales_inference_pipeline():
    # Linear four-step inference pipeline; each step consumes the named output
    # artifact of the step before it:
    #   ingest_test_data -> preprocessing -> feature_engineering -> batch_prediction

    # Step 1: pull the raw test CSV.
    raw = ingest_test_data()

    # Step 2: preprocess the raw data (component imported from preprocessing_module).
    preprocessed = preprocessing(raw_data=raw.outputs['raw_data'])

    # Step 3: feature engineering (component imported from preprocessing_module).
    engineered = feature_engineering(
        preprocessed_df=preprocessed.outputs['preprocessed_df']
    )

    # Step 4: score the engineered data. Nothing downstream uses this task,
    # so its handle is not kept.
    batch_prediction(engineered_test_df=engineered.outputs['engineered_test_df'])

if __name__ == "__main__":
    # Compile the pipeline function into a spec file in the working directory.
    # The file name must stay in sync with the `template_path` used when the
    # PipelineJob is created.
    compiler.Compiler().compile(
        package_path='black_friday_sales_inference_pipeline.json',
        pipeline_func=black_friday_sales_inference_pipeline,
    )

In [4]:
import os

from google.cloud import aiplatform

# Initialize the Vertex AI client
aiplatform.init(project='brldi-gcpcapabilities-ai-audit', location='us-central1')

# Service account the pipeline steps run as.
# A previous run of this cell failed with
#   "400 You do not have permission to act as service_account: ...-compute@developer.gserviceaccount.com"
# because no run-as account was given. Set PIPELINE_SERVICE_ACCOUNT to the e-mail
# of a service account the submitting identity is allowed to act as.
# When the variable is unset or empty, None is passed and the behaviour is the
# same as calling job.run() with no arguments.
pipeline_service_account = os.environ.get("PIPELINE_SERVICE_ACCOUNT") or None

# Create a pipeline job from the compiled pipeline spec
job = aiplatform.PipelineJob(
    display_name="black_friday_sales_inference_pipeline",
    template_path="black_friday_sales_inference_pipeline.json",
    pipeline_root="gs://bucketdemo2blackfridaysales/root",
    enable_caching=True,
)

# Run the pipeline job under the chosen service account
job.run(service_account=pipeline_service_account)

Creating PipelineJob


InvalidArgument: 400 You do not have permission to act as service_account: 971203737354-compute@developer.gserviceaccount.com. (or it may not exist).