In [1]:
import kfp
from kfp import dsl
from kfp import compiler
from kfp.dsl import (component, pipeline, Input, Output, Dataset, Model, Artifact, Metrics)

@component
def ingest(raw_data: Output[Dataset]):
    """Fetch the raw Black Friday training CSV from Cloud Storage.

    Downloads ``train.csv`` from the ``final_demo2_blackfriday`` bucket,
    parses it with pandas and writes it, without the index column, to
    ``raw_data.path``.

    Args:
        raw_data: Output dataset artifact that receives the CSV file.
    """
    # Third-party packages are installed at run time, before they are imported.
    import subprocess
    subprocess.run(['pip', 'install', 'google-cloud-storage', 'pandas'], check=True)

    import pandas as pd
    from io import BytesIO
    from google.cloud import storage

    # Address the source object: bucket first, then the blob inside it.
    gcs_client = storage.Client()
    source_blob = gcs_client.bucket("final_demo2_blackfriday").blob("train.csv")

    # Pull the whole object into memory and parse it as CSV.
    csv_bytes = source_blob.download_as_bytes()
    raw_frame = pd.read_csv(BytesIO(csv_bytes))

    # Persist the parsed frame as this component's output artifact.
    raw_frame.to_csv(raw_data.path, index=False)
    
@component
def feature_selection(
    engineered_data: Input[Dataset],
    finalized_features_data: Output[Dataset]
):
    """Keep the k best features for predicting ``Purchase``.

    A grid search (5-fold CV, R^2 scoring) over a ``SelectKBest(f_regression)``
    + ``RandomForestRegressor`` pipeline picks ``k`` from 1 up to the number of
    feature columns. A fresh selector with that ``k`` is then fitted on the
    full data to obtain the retained columns.

    Side effects:
        * writes ``selected_features_names.json`` to the working directory and
          uploads it to the ``final_demo2_blackfriday`` bucket;
        * writes the retained columns plus ``Purchase`` to
          ``finalized_features_data.path`` as CSV.

    Args:
        engineered_data: Input dataset (CSV) that includes a ``Purchase`` column.
        finalized_features_data: Output dataset artifact for the reduced CSV.
    """
    # Third-party packages are installed at run time, before they are imported.
    import subprocess
    subprocess.run(['pip', 'install', 'pandas', 'scikit-learn', 'google-cloud-storage'], check=True)

    import json
    import pandas as pd
    from google.cloud import storage
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.feature_selection import SelectKBest, f_regression
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline

    target = 'Purchase'

    # Separate the predictors from the target.
    source_df = pd.read_csv(engineered_data.path)
    features = source_df.drop(columns=[target])
    labels = source_df[target]

    # Try every possible number of retained features, from 1 to all of them.
    candidate_ks = list(range(1, features.shape[1] + 1))
    search = GridSearchCV(
        Pipeline([
            ('selector', SelectKBest(score_func=f_regression)),
            ('model', RandomForestRegressor(random_state=42)),
        ]),
        {'selector__k': candidate_ks},
        cv=5,
        scoring='r2',
    )
    search.fit(features, labels)

    best_k = search.best_params_['selector__k']
    print(f"Best k value: {best_k}")

    # Refit a standalone selector with the winning k to learn which columns survive.
    best_selector = SelectKBest(score_func=f_regression, k=best_k)
    reduced_values = best_selector.fit_transform(features, labels)
    kept_columns = features.columns[best_selector.get_support(indices=True)].tolist()

    # Rebuild a labelled frame (same index as the input) and re-attach the target.
    selected_df = pd.DataFrame(reduced_values, columns=kept_columns, index=source_df.index)
    selected_df[target] = labels

    # Record the retained column names locally, then publish the file to GCS.
    names_file = "selected_features_names.json"
    with open(names_file, 'w') as handle:
        json.dump(kept_columns, handle)

    names_blob = storage.Client().bucket('final_demo2_blackfriday').blob('selected_features_names.json')
    names_blob.upload_from_filename(names_file)

    # Emit the reduced dataset as the component output.
    selected_df.to_csv(finalized_features_data.path, index=False)

@component
def train_validation_test_split(
    finalized_features_data: Input[Dataset],
    train_data: Output[Dataset],
    validation_data: Output[Dataset],
    test_data: Output[Dataset]
):
    """Partition the dataset into train, validation and test CSV files.

    10% of the rows are set aside as the test set; the remaining 90% is split
    again, with 20% of it becoming the validation set. Both splits use
    ``random_state=42``.

    Args:
        finalized_features_data: Input dataset (CSV) to partition.
        train_data: Output artifact for the training rows.
        validation_data: Output artifact for the validation rows.
        test_data: Output artifact for the test rows.
    """
    # Third-party packages are installed at run time, before they are imported.
    import subprocess
    subprocess.run(['pip', 'install', 'pandas', 'scikit-learn'], check=True)

    import pandas as pd
    from sklearn.model_selection import train_test_split

    full_df = pd.read_csv(finalized_features_data.path)

    # First cut: hold out the test rows.
    remainder_df, test_df = train_test_split(full_df, test_size=0.1, random_state=42)

    # Second cut: carve the validation rows out of what is left.
    train_df, validation_df = train_test_split(remainder_df, test_size=0.2, random_state=42)

    # Write each partition to its output artifact, without the index column.
    partitions = (
        (train_df, train_data),
        (validation_df, validation_data),
        (test_df, test_data),
    )
    for frame, artifact in partitions:
        frame.to_csv(artifact.path, index=False)

@component
def hyperparameter_tuning(
    validation_data: Input[Dataset],
    best_params: Output[Artifact]
):
    """
    Search XGBoost regressor hyperparameters with Optuna on the validation dataset.

    The validation CSV is split once (seeded) into a fitting part (80%) and a
    held-out part (20%). Each of the 50 trials fits an ``XGBRegressor`` on the
    fitting part and is scored by RMSE on the held-out part, so the study
    minimizes error on rows the model was not fitted on. The best parameters
    are written as JSON to ``best_params.path + ".json"``.

    Args:
        validation_data: Input dataset (CSV) containing the feature columns and
            the ``Purchase`` target column.
        best_params: Output artifact; the JSON file is written at its path with
            a ``.json`` suffix appended.
    """
    # Third-party packages are installed at run time, before they are imported.
    import subprocess
    subprocess.run(['pip', 'install', 'pandas', 'scikit-learn', 'xgboost==1.6', 'optuna'], check=True)

    import json
    from functools import partial

    import optuna
    import pandas as pd
    import xgboost as xgb
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import train_test_split

    # Load the validation dataset
    validation_df = pd.read_csv(validation_data.path)
    X_val = validation_df.drop('Purchase', axis=1)
    y_val = validation_df['Purchase']

    # Score every trial on rows it was not fitted on. Scoring on the fitting
    # rows would measure training error and steer the search towards the most
    # overfit configuration.
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X_val, y_val, test_size=0.2, random_state=42
    )

    def objective(trial, X_fit, y_fit, X_holdout, y_holdout):
        """
        Objective function for Optuna: hold-out RMSE of one sampled configuration.
        """
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform;
        # parameter names and ranges are unchanged.
        params = {
            "objective": "reg:squarederror",
            "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
            "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
            "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
            "max_depth": trial.suggest_int("max_depth", 3, 9),
            "eta": trial.suggest_float("eta", 1e-8, 1.0, log=True),
            "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
            "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 1.0),
            "colsample_bynode": trial.suggest_float("colsample_bynode", 0.5, 1.0),
            "min_child_weight": trial.suggest_float("min_child_weight", 1e-8, 1.0, log=True),
            "max_delta_step": trial.suggest_int("max_delta_step", 0, 10),
            "scale_pos_weight": trial.suggest_float("scale_pos_weight", 1e-8, 1.0, log=True),
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "sampling_method": trial.suggest_categorical("sampling_method", ["uniform"])
        }

        model = xgb.XGBRegressor(**params)
        model.fit(X_fit, y_fit)
        y_pred = model.predict(X_holdout)

        # Take the square root of the MSE instead of passing squared=False:
        # that keyword is deprecated/removed in recent scikit-learn releases,
        # and scikit-learn is installed unpinned above.
        rmse = float(mean_squared_error(y_holdout, y_pred) ** 0.5)
        return rmse

    # Create and optimize the study
    study = optuna.create_study(direction="minimize")
    study.optimize(
        partial(
            objective,
            X_fit=X_fit,
            y_fit=y_fit,
            X_holdout=X_holdout,
            y_holdout=y_holdout,
        ),
        n_trials=50,
    )
    best_params_dict = study.best_params

    # Save the best parameters to a JSON file. The ".json" suffix is kept
    # because model_building reads the parameters from this same location.
    best_params_path = best_params.path + ".json"
    with open(best_params_path, 'w') as f:
        json.dump(best_params_dict, f)
        
@component
def model_building(
    train_data: Input[Dataset],
    best_params: Input[Artifact],
    model_output: Output[Model]
):
    """Fit an XGBoost regressor with the tuned parameters and publish it to GCS.

    The parameters are read from ``best_params.path + ".json"``. The fitted
    model is dumped with joblib to ``/tmp/model.joblib``, uploaded as
    ``model.joblib`` to the ``final_demo2_blackfriday`` bucket, and the
    artifact URI is pointed at that object.

    Args:
        train_data: Input dataset (CSV) with the features and the ``Purchase``
            target column.
        best_params: Input artifact whose path, with ``.json`` appended,
            holds the tuned parameters.
        model_output: Output model artifact; only its ``uri`` is set here.
    """
    # Third-party packages are installed at run time, before they are imported.
    import subprocess
    subprocess.run(['pip', 'install', 'pandas','scikit-learn', 'xgboost==1.6', 'joblib', 'google-cloud-storage'], check=True)

    import json
    import joblib
    import pandas as pd
    import xgboost as xgb
    from google.cloud import storage

    # Training matrix and target vector.
    training_frame = pd.read_csv(train_data.path)
    targets = training_frame['Purchase']
    predictors = training_frame.drop('Purchase', axis=1)

    # Tuned hyperparameters live next to the artifact path, with a ".json" suffix.
    with open(best_params.path + ".json", 'r') as params_file:
        tuned_params = json.load(params_file)

    # Fit the regressor with exactly the tuned settings.
    regressor = xgb.XGBRegressor(**tuned_params)
    regressor.fit(predictors, targets)

    # Serialize to local disk first; the upload below works from a file.
    dump_path = "/tmp/model.joblib"
    joblib.dump(regressor, dump_path)

    # Publish the serialized model to the project bucket.
    bucket = storage.Client().bucket('final_demo2_blackfriday')
    bucket.blob('model.joblib').upload_from_filename(dump_path)

    # Point the output artifact at the uploaded object.
    model_output.uri = f"gs://{bucket.name}/model.joblib"
        
@component
def upload_model_to_vertex_ai(
    model_output: Input[Model],
    project_id: str,
    region: str,
    display_name: str,
    serving_image: str,
    parent_model: str
):
    """
    Register a trained model in the Vertex AI Model Registry.

    The artifact directory passed to Vertex AI is the parent "directory" of
    ``model_output.uri``. The serving container is declared to listen on port
    5005 with ``/health`` and ``/predict`` routes, and the upload runs
    synchronously.

    Args:
        model_output (Input[Model]): The input model artifact; its ``uri`` is used.
        project_id (str): GCP project ID.
        region (str): The region for Vertex AI.
        display_name (str): Display name for the model in Vertex AI.
        serving_image (str): The container image URI for serving the model.
        parent_model (str): Parent model ID; the string ``"none"`` (any
            letter case) means no parent model.
    """
    # Third-party packages are installed at run time, before they are imported.
    import subprocess
    subprocess.run(['pip', 'install', 'google-cloud-aiplatform', 'google-cloud-storage'], check=True)

    import os
    from google.cloud import aiplatform
    from google.cloud import storage  # imported by the original code; not referenced below

    # Vertex AI expects the folder that contains the model file, not the file itself.
    artifact_dir = os.path.dirname(model_output.uri)

    # Translate the "none" sentinel into an actual None.
    if parent_model.lower() == 'none':
        resolved_parent = None
    else:
        resolved_parent = parent_model

    # Register the model and wait for the upload to finish.
    aiplatform.init(project=project_id, location=region)
    aiplatform.Model.upload(
        display_name=display_name,
        artifact_uri=artifact_dir,
        serving_container_image_uri=serving_image,
        serving_container_ports=[5005],
        serving_container_health_route="/health",
        serving_container_predict_route="/predict",
        parent_model=resolved_parent,
        sync=True
    )

@component
def model_evaluation(
    train_data: Input[Dataset],
    test_data: Input[Dataset],
    model_output: Input[Model],
    metrics: Output[Metrics]
):
    """
    Evaluate a trained model on training and test datasets, and log metrics.

    The model file is downloaded from the ``gs://bucket/object`` location given
    by ``model_output.uri``. If the URI does not have that form, the location
    ``gs://final_demo2_blackfriday/model.joblib`` is used instead. RMSE and
    R^2 are computed on both datasets and logged as "Train RMSE", "Train R2",
    "Test RMSE" and "Test R2".

    Args:
        train_data: Input dataset (CSV) with features and the ``Purchase`` column.
        test_data: Input dataset (CSV) with features and the ``Purchase`` column.
        model_output: Input model artifact whose ``uri`` locates the joblib file.
        metrics: Output metrics artifact that receives the four scores.
    """
    # Third-party packages are installed at run time, before they are imported.
    import subprocess
    subprocess.run(['pip', 'install', 'pandas', 'xgboost==1.6', 'joblib', 'scikit-learn', 'google-cloud-storage'], check=True)

    import pandas as pd
    import xgboost as xgb
    import joblib
    from sklearn.metrics import mean_squared_error, r2_score
    from google.cloud import storage
    import os

    def rmse(y_true, y_pred):
        """Root mean squared error as a plain float.

        Computed as sqrt(MSE) rather than via ``squared=False``, which is
        deprecated/removed in recent scikit-learn releases (scikit-learn is
        installed unpinned above).
        """
        return float(mean_squared_error(y_true, y_pred) ** 0.5)

    # Load the train and test data
    train_df = pd.read_csv(train_data.path)
    X_train = train_df.drop('Purchase', axis=1)
    y_train = train_df['Purchase']

    test_df = pd.read_csv(test_data.path)
    X_test = test_df.drop('Purchase', axis=1)
    y_test = test_df['Purchase']

    # Locate the model through the input artifact so the evaluated file is the
    # one the upstream step produced. Fall back to the fixed bucket/object when
    # the URI is not of the form gs://<bucket>/<object>.
    bucket_name = 'final_demo2_blackfriday'
    blob_name = 'model.joblib'
    model_uri = model_output.uri or ''
    if model_uri.startswith('gs://'):
        parsed_bucket, _, parsed_blob = model_uri[len('gs://'):].partition('/')
        if parsed_bucket and parsed_blob:
            bucket_name, blob_name = parsed_bucket, parsed_blob

    client = storage.Client()
    blob = client.bucket(bucket_name).blob(blob_name)
    local_model_path = "/tmp/model.joblib"

    # Create the directory if it doesn't exist
    os.makedirs(os.path.dirname(local_model_path), exist_ok=True)

    # Download the model
    blob.download_to_filename(local_model_path)

    # Load the model.
    # NOTE(review): joblib.load unpickles the file, which can execute arbitrary
    # code; only point this component at trusted model artifacts.
    model = joblib.load(local_model_path)

    # Predict and evaluate on train data
    y_train_pred = model.predict(X_train)
    train_rmse = rmse(y_train, y_train_pred)
    train_r2 = float(r2_score(y_train, y_train_pred))

    # Predict and evaluate on test data
    y_test_pred = model.predict(X_test)
    test_rmse = rmse(y_test, y_test_pred)
    test_r2 = float(r2_score(y_test, y_test_pred))

    # Log metrics
    metrics.log_metric("Train RMSE", train_rmse)
    metrics.log_metric("Train R2", train_r2)
    metrics.log_metric("Test RMSE", test_rmse)
    metrics.log_metric("Test R2", test_r2)

# Load components from the separate file
from preprocessing_module import preprocessing, feature_engineering

@pipeline(
    name='black_friday_purchase_model_training_pipeline',
    description='A pipeline that processes data from Black Friday sales and builds a predictive model.',
)
def black_friday_purchase_model_training_pipeline(
    project_id: str,
    region: str,
    display_name: str,
    serving_image: str,
    parent_model: str
):
    # Wiring overview:
    #   ingest -> preprocessing -> feature_engineering -> feature_selection
    #          -> train_validation_test_split
    #          -> hyperparameter_tuning (validation split)
    #          -> model_building (train split + tuned parameters)
    #          -> upload_model_to_vertex_ai and model_evaluation (both consume the model)
    # The five pipeline parameters are forwarded unchanged to the upload step.

    # --- Data preparation chain ---
    raw_step = ingest()
    cleaned_step = preprocessing(raw_data=raw_step.output)
    engineered_step = feature_engineering(preprocessed_data=cleaned_step.output)
    selection_step = feature_selection(engineered_data=engineered_step.output)

    # --- Dataset partitioning ---
    split_step = train_validation_test_split(
        finalized_features_data=selection_step.output
    )
    train_split = split_step.outputs['train_data']

    # --- Tuning and training ---
    tuning_step = hyperparameter_tuning(
        validation_data=split_step.outputs['validation_data']
    )
    training_step = model_building(
        train_data=train_split,
        best_params=tuning_step.output
    )
    # model_building has a single output, so this is the same channel that
    # `training_step.output` would return.
    model_artifact = training_step.outputs['model_output']

    # --- Registration in the Vertex AI Model Registry ---
    upload_model_to_vertex_ai(
        model_output=model_artifact,
        project_id=project_id,
        region=region,
        display_name=display_name,
        serving_image=serving_image,
        parent_model=parent_model
    )

    # --- Evaluation on the train and test splits ---
    model_evaluation(
        train_data=train_split,
        test_data=split_step.outputs['test_data'],
        model_output=model_artifact
    )

if __name__ == "__main__":
    # Compile the pipeline definition into a YAML package in the working directory.
    pipeline_compiler = kfp.compiler.Compiler()
    pipeline_compiler.compile(
        pipeline_func=black_friday_purchase_model_training_pipeline,
        package_path='black_friday_purchase_model_training_pipeline.yaml',
    )

  return component_factory.create_component_from_func(


# Deploying to Endpoint

In [2]:
from google.cloud import aiplatform

# Configure the Vertex AI SDK with the project and region used below.
aiplatform.init(project='brldi-gcpcapabilities-ai-audit', location='us-central1')

# Reference the model by its full resource name.
my_model = aiplatform.Model("projects/971203737354/locations/us-central1/models/6291045993831989248")

# Deployment settings: one n1-standard-4 replica (min == max) and a traffic
# split that assigns 100 to key "0".
deploy_settings = dict(
    deployed_model_display_name='black_friday_endpoint',
    traffic_split={"0": 100},
    machine_type="n1-standard-4",
    min_replica_count=1,
    max_replica_count=1,
)

# No endpoint is passed to deploy(); the cell output shows one being created.
endpoint = my_model.deploy(**deploy_settings)

Creating Endpoint
Create Endpoint backing LRO: projects/971203737354/locations/us-central1/endpoints/5150514885758550016/operations/2505232055567122432
Endpoint created. Resource name: projects/971203737354/locations/us-central1/endpoints/5150514885758550016
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/971203737354/locations/us-central1/endpoints/5150514885758550016')
Deploying model to Endpoint : projects/971203737354/locations/us-central1/endpoints/5150514885758550016
Deploy Endpoint model backing LRO: projects/971203737354/locations/us-central1/endpoints/5150514885758550016/operations/1190180964374937600
Endpoint model deployed. Resource name: projects/971203737354/locations/us-central1/endpoints/5150514885758550016


# Online Predictions

In [3]:
from google.cloud import aiplatform

# Configure the Vertex AI SDK and attach to the endpoint created by the deployment cell.
aiplatform.init(project='brldi-gcpcapabilities-ai-audit', location='us-central1')
endpoint = aiplatform.Endpoint('projects/971203737354/locations/us-central1/endpoints/5150514885758550016')

# A single example record to score.
sample_instance = {
    "User_ID": 1000011,
    "Product_ID": "P00053842",
    "Gender": "F",
    "Age": "26-35",
    "Occupation": 1,
    "City_Category": "C",
    "Stay_In_Current_City_Years": 1,
    "Marital_Status": 0,
    "Product_Category_1": 4,
    "Product_Category_2": 5,
    "Product_Category_3": 12,
}
data = {"instances": [sample_instance]}

# Send the request, then keep only the predictions from the response.
response = endpoint.predict(instances=data['instances'])
prediction = response.predictions

# Print the prediction results
print(prediction)

[{'label': 2509.952880859375}]
