In [None]:
USER_FLAG = "--user"
# Install ai platform and kfp
!pip3 install {USER_FLAG} google-cloud-aiplatform==1.3.0 --upgrade
!pip3 install {USER_FLAG} kfp --upgrade
!pip install google_cloud_pipeline_component



In [None]:
from typing import NamedTuple
from kfp.v2 import dsl
from kfp.v2.dsl import (Artifact,
                        Dataset,
                        Input,
                        Model,
                        Output,
                        Metrics,
                        ClassificationMetrics,
                        component, 
                        OutputPath, 
                        InputPath)

from kfp.v2 import compiler
from google.cloud import aiplatform
from google.cloud.aiplatform import pipeline_jobs

# Capture the kernel's current PATH, then extend it with the user-level bin
# directory (presumably where `pip install --user` places console scripts
# on this notebook VM - confirm for other environments).
PATH=%env PATH
%env PATH={PATH}:/home/jupyter/.local/bin
# Region used as the PipelineJob location and as the pipeline's default `region`.
REGION="us-east1"

# Placeholder: replace with a real GCP project id before running. It is the
# default value of the pipeline's `project` parameter.
PROJECT_ID= "{your-project-id}"


# Cloud Storage bucket (gs:// URI) under which pipeline artifacts are stored.
BUCKET_NAME = "gs://vertex-ai-udemy"

# Root folder handed to @dsl.pipeline as `pipeline_root`.
PIPELINE_ROOT = f"{BUCKET_NAME}/cc-fraud-kfpl/"

# Re-declared with the same value as in the install cell, so this cell does not
# depend on that cell having been executed.
USER_FLAG = "--user"

In [None]:
@component(
    packages_to_install=["pandas","pyarrow","scikit-learn","google-cloud-storage==1.26.0","fsspec","gcsfs==0.6.2"],
    base_image="python:3.9",
    output_component_file="get_cc_fraud_data.yaml"
)
def get_cc_fraud_data(dataset_train: Output[Dataset],dataset_test: Output[Dataset]):
    """Load the credit-card payments CSV and split it into train and test sets.

    The CSV is read from a fixed Cloud Storage location, split 70/30, and each
    part is written next to the corresponding output artifact path with a
    ".csv" suffix (downstream components read `<artifact path>.csv`).

    Args:
        dataset_train: Output artifact receiving the training split.
        dataset_test: Output artifact receiving the test split.
    """
    import pandas as pd
    from sklearn.model_selection import train_test_split as tts

    df = pd.read_csv('gs://vertex-ai-udemy/cc-fraud-model-input/credit_card_payments.csv')

    # A fixed seed keeps the split identical across pipeline runs, so model
    # metrics from different runs are comparable.
    train, test = tts(df, test_size=0.3, random_state=42)
    train.to_csv(dataset_train.path + ".csv" , index=False, encoding='utf-8-sig')
    test.to_csv(dataset_test.path + ".csv" , index=False, encoding='utf-8-sig')


In [None]:
@component(
    packages_to_install=["pandas", "scikit-learn"],
    base_image="python:3.9",
)
def train_cc_fraud_model(dataset:Input[Dataset],model: Output[Model]):
    """Fit a random forest on the training split and pickle it.

    Args:
        dataset: Input artifact; the training CSV is read from
            `<dataset.path>.csv` and must contain a "Class" column, which is
            used as the label. All remaining columns are used as features.
        model: Output artifact; the fitted estimator is pickled to
            `<model.path>.pkl` and its metadata gets `framework = "RF"`.
    """
    import pickle

    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier

    train_df = pd.read_csv(f"{dataset.path}.csv")
    features = train_df.drop(columns=["Class"])
    labels = train_df.Class

    classifier = RandomForestClassifier(n_estimators=10)
    classifier.fit(features, labels)

    model.metadata["framework"] = "RF"

    # Persist the fitted estimator alongside the artifact path.
    with open(f"{model.path}.pkl", 'wb') as pkl_file:
        pickle.dump(classifier, pkl_file)

In [None]:
@component(
    packages_to_install = [
        "pandas",
        "scikit-learn"
    ], base_image="python:3.9",
)
def model_evaluation(
    test_set:  Input[Dataset],
    rf_adclick_model: Input[Model],
    thresholds_dict_str: str,
    metrics: Output[ClassificationMetrics],
    kpi: Output[Metrics]
) -> NamedTuple("output", [("deploy", str)]):
    """Evaluate the trained model on the test split and decide on deployment.

    Logs a ROC curve and confusion matrix to `metrics`, logs accuracy to `kpi`,
    and compares the accuracy with the configured threshold.

    Args:
        test_set: Input artifact; the test CSV is read from
            `<test_set.path>.csv` and must contain a "Class" column.
        rf_adclick_model: Input artifact; the pickled estimator is read from
            `<rf_adclick_model.path>.pkl`.
        thresholds_dict_str: JSON object string with a numeric "roc" entry.
            Despite the key name, the value is compared against accuracy.
        metrics: Output artifact receiving the ROC curve and confusion matrix.
        kpi: Output artifact receiving the scalar "accuracy" metric.

    Returns:
        A 1-tuple `(deploy,)` where `deploy` is the string "true" when
        accuracy >= threshold and "false" otherwise.
    """
    import json
    import pickle

    import pandas as pd
    from sklearn.metrics import roc_curve, confusion_matrix, accuracy_score

    def threshold_check(val1, val2):
        """Return "true" when val1 >= val2, otherwise "false"."""
        return "true" if val1 >= val2 else "false"

    data = pd.read_csv(test_set.path+".csv")
    features = data.drop(columns=["Class"])
    y_true = data.Class

    # NOTE: pickle.load can execute arbitrary code; only load artifacts that
    # were produced by a trusted upstream step.
    file_name = rf_adclick_model.path + ".pkl"
    with open(file_name, 'rb') as file:
        model = pickle.load(file)

    y_pred = model.predict(features)

    # Probability of the positive class (second column of predict_proba).
    y_scores = model.predict_proba(features)[:, 1]
    fpr, tpr, thresholds = roc_curve(
        y_true=y_true.to_numpy(), y_score=y_scores, pos_label=True
    )
    metrics.log_roc_curve(fpr.tolist(), tpr.tolist(), thresholds.tolist())

    metrics.log_confusion_matrix(
        ["False", "True"],
        confusion_matrix(y_true, y_pred).tolist(),
    )

    accuracy = float(accuracy_score(y_true, y_pred))
    thresholds_dict = json.loads(thresholds_dict_str)
    rf_adclick_model.metadata["accuracy"] = accuracy
    kpi.log_metric("accuracy", accuracy)

    # The threshold is fractional (e.g. 0.8), so it must be parsed as a float:
    # int() would truncate it to 0 and the deploy gate would always pass.
    deploy = threshold_check(accuracy, float(thresholds_dict['roc']))
    return (deploy,)

In [None]:
@component(
    packages_to_install=["google-cloud-aiplatform","scikit-learn==1.0.0","kfp"],
    base_image="python:3.9",
    output_component_file="model_cc_fraud_component.yml"
)
def deploy_cc_fraud_models(
    model: Input[Model],
    project: str,
    region: str,
    serving_container_image_uri : str, 
    vertex_endpoint: Output[Artifact],
    vertex_model: Output[Model]
):
    """Upload the trained model to Vertex AI and deploy it to an endpoint.

    An endpoint with display name "cc-fraud-ep" is reused when one exists
    (the most recently created one), otherwise it is created. The model is
    uploaded from the directory that contains the model artifact and deployed
    to that endpoint with 100% of the traffic.

    Args:
        model: Input model artifact; the parent directory of `model.uri` is
            used as the upload `artifact_uri`.
        project: GCP project id.
        region: Vertex AI region.
        serving_container_image_uri: Serving container image for the model.
        vertex_endpoint: Output artifact; its `uri` is set to the endpoint
            resource name.
        vertex_model: Output artifact; its `uri` is set to the uploaded
            model's resource name.
    """
    from google.cloud import aiplatform
    aiplatform.init(project=project, location=region)

    DISPLAY_NAME  = "cc-fraud"
    MODEL_NAME = "cc-fraud"
    ENDPOINT_NAME = "cc-fraud-ep"

    def create_endpoint():
        """Return the newest endpoint named ENDPOINT_NAME, creating it if absent."""
        endpoints = aiplatform.Endpoint.list(
            filter='display_name="{}"'.format(ENDPOINT_NAME),
            order_by='create_time desc',
            project=project,
            location=region,
        )
        if len(endpoints) > 0:
            return endpoints[0]
        # The helper must return the endpoint: without a return value the
        # caller gets None and the lookup/creation above is ignored.
        return aiplatform.Endpoint.create(
            display_name=ENDPOINT_NAME, project=project, location=region
        )

    endpoint = create_endpoint()

    # Upload from the directory holding the artifact, not the artifact itself.
    artifact_uri_tmp = model.uri

    model_upload = aiplatform.Model.upload(
        display_name = DISPLAY_NAME,
        artifact_uri = artifact_uri_tmp[:artifact_uri_tmp.rfind('/')],
        serving_container_image_uri =  serving_container_image_uri,
        serving_container_health_route=f"/v1/models/{MODEL_NAME}",
        serving_container_predict_route=f"/v1/models/{MODEL_NAME}:predict",
        serving_container_environment_variables={
            "MODEL_NAME": MODEL_NAME,
        },
    )

    # Model.deploy returns the endpoint the model was deployed to.
    deployed_endpoint = model_upload.deploy(
        machine_type="n1-standard-2",
        endpoint=endpoint,
        traffic_split={"0": 100},
        deployed_model_display_name=DISPLAY_NAME,
    )

    # Record each resource on its own output artifact.
    vertex_endpoint.uri = deployed_endpoint.resource_name
    vertex_model.uri = model_upload.resource_name

In [None]:
from datetime import datetime

# Second-resolution timestamp (YYYYmmddHHMMSS) taken when this cell runs,
# appended to the job display name so each run gets a distinct name.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"
DISPLAY_NAME = f"cc-fraud-detection-job{TIMESTAMP}"

In [None]:
@dsl.pipeline(
    pipeline_root=PIPELINE_ROOT,
    name="cc-fraud-pipeline",   
)
def pipeline(
    project: str = PROJECT_ID,
    region: str = REGION, 
    display_name: str = "cc-fraud-detection-pipeline",
    serving_container_image_uri: str = "europe-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.0-24:latest" #Done change this
    ):
    """Credit-card fraud pipeline: split data, train, evaluate, conditionally deploy.

    Steps:
        1. `get_cc_fraud_data` produces the train and test datasets.
        2. `train_cc_fraud_model` fits a model on the train dataset.
        3. `model_evaluation` scores the model on the test dataset and emits
           a "deploy" output.
        4. `deploy_cc_fraud_models` runs only when that output equals "true".

    Args:
        project: GCP project id forwarded to the deploy step.
        region: Vertex AI region forwarded to the deploy step.
        display_name: Not referenced inside the pipeline body.
        serving_container_image_uri: Serving image forwarded to the deploy step.
    """
    split_task = get_cc_fraud_data()

    training_task = train_cc_fraud_model(
        dataset=split_task.outputs["dataset_train"],
    )

    evaluation_task = model_evaluation(
        test_set=split_task.outputs["dataset_test"],
        rf_adclick_model=training_task.outputs["model"],
        # JSON string; the evaluation step reads the "roc" entry as its threshold.
        thresholds_dict_str='{"roc":0.8}',
    )

    # Gate deployment on the evaluation verdict.
    should_deploy = evaluation_task.outputs["deploy"] == "true"
    with dsl.Condition(should_deploy, name="deploy-cc-fraud-model"):
        deploy_cc_fraud_models(
            model=training_task.outputs['model'],
            project=project,
            region=region,
            serving_container_image_uri=serving_container_image_uri,
        )

In [None]:
# Compile the pipeline function into a JSON job spec in the working directory.
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path='cc-fraud-pipeline.json',
)

In [None]:
# Build the Vertex AI pipeline job from the compiled spec.
# `project` is passed explicitly so the job is created in the same project the
# pipeline's `project` parameter defaults to, rather than in whatever default
# project the SDK resolves from the environment.
# NOTE(review): the timestamped DISPLAY_NAME defined earlier is not used here;
# confirm whether it was meant to replace the fixed "cc-pipeline" name.
start_pipeline = pipeline_jobs.PipelineJob(
    display_name="cc-pipeline",
    template_path="cc-fraud-pipeline.json",
    enable_caching=False,
    project=PROJECT_ID,
    location=REGION,
)

In [None]:
# Submit the pipeline job built in the previous cell.
# NOTE(review): with default arguments this call is expected to block until
# the job finishes - confirm against the installed SDK version.
start_pipeline.run()