In [6]:
import os

# Location of the shared component.yaml used to download and extract the dataset archive.
DOWNLOAD_AND_EXTRACT_COMPONENT_URL = "https://raw.githubusercontent.com/lehrig/kubeflow-ppc64le-components/main/data-extraction/download-and-extract-from-url/component.yaml"

# Dataset archive and the name it is stored under; model name passed to the pipeline.
DATASET_URL = "https://ibm.box.com/shared/static/5mhxb1k13mnklij8w3zngog9rqpbg32e.zip"
DATASET_FILE_NAME = "card_transdata_big.zip"
MODEL_NAME = "fraud_detection-classification"

# MinIO connection settings. The credentials can be overridden through
# environment variables so real secrets never have to be committed with the
# notebook; the literals below are only fallback defaults.
MINIO_URL = "minio-service.kubeflow:9000"
MINIO_USER = os.environ.get("MINIO_USER", "minio")
MINIO_PASS = os.environ.get("MINIO_PASS", "minio123")

# Read the namespace from the mounted service-account secret. Surrounding
# whitespace (e.g. a trailing newline) is stripped so the value can be passed
# on as a namespace name unchanged.
with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace") as f:
    NAMESPACE = f.read().strip()
NAMESPACE

'user-example-com'

In [15]:
import kfp
import kfp.components as comp
from typing import NamedTuple
import kfp.dsl as dsl

In [16]:
# Create the Kubeflow Pipelines client with its default configuration
# (no host or credentials are passed explicitly).
client = kfp.Client()

# 2 Pipeline Components
## 2.1 Load Dataset

In [17]:
# Build the download-and-extract component factory from its remote component.yaml.
download_and_extract_comp = comp.load_component_from_url(url=DOWNLOAD_AND_EXTRACT_COMPONENT_URL)

## 2.2 Preprocessing

### Data loading, train/test split and training

In [18]:
def preprocess_data(
    data_dir: comp.InputPath(str),
    prep_data_dir: comp.OutputPath(str)
):
    """Load the raw transaction CSV and store it as a pickled DataFrame.

    Args:
        data_dir: Directory that contains ``card_transdata_big.csv``.
        prep_data_dir: Output directory; the DataFrame is written to
            ``df.pkl`` inside it.
    """
    # Imports stay inside the function: it is wrapped with
    # create_component_from_func, so the body must be self-contained.
    import os
    import pandas as pd

    df = pd.read_csv(f'{data_dir}/card_transdata_big.csv', delimiter=',')
    print(df.columns)

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(prep_data_dir, exist_ok=True)

    df.to_pickle(f'{prep_data_dir}/df.pkl')


# Wrap preprocess_data as a pipeline component running in the pinned notebook image.
# NOTE(review): this image digest differs from the one used by the train and
# evaluate components — confirm that is intended.
preprocess_data_comp = comp.create_component_from_func(
    preprocess_data,
    base_image="quay.io/ibm/kubeflow-notebook-image-ppc64le@sha256:23b21f6563eb6cb8b33a0c061c82516f3967d1222e66690f46785a6cda81cc83",
)

In [19]:
def train_model(
    prep_data_dir: comp.InputPath(str),
    model_dir: comp.OutputPath(str),
    traintest_dir: comp.OutputPath(str)
):
    """Split the prepared data, fit an MLP classifier and persist the results.

    Args:
        prep_data_dir: Directory containing the pickled DataFrame ``df.pkl``,
            which must have a ``fraud`` column used as the target.
        model_dir: Output directory; the fitted classifier is pickled to
            ``trained_model.sav`` inside it.
        traintest_dir: Output directory; receives ``train_data.npz`` and
            ``val_data.npz``, each holding the features followed by the labels.
    """
    # Imports stay inside the function: it is wrapped with
    # create_component_from_func, so the body must be self-contained.
    import os
    import pickle

    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.neural_network import MLPClassifier

    df = pd.read_pickle(f'{prep_data_dir}/df.pkl')

    y = df.fraud
    X = df.drop('fraud', axis=1)

    # Fixed seed keeps the 80/20 split identical between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # The classifier itself is created without a random_state, so the fitted
    # weights can differ from run to run.
    mlp = MLPClassifier()
    print("Starting training...")
    mlp.fit(X_train, y_train)
    print("Training finished.")

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(model_dir, exist_ok=True)
    os.makedirs(traintest_dir, exist_ok=True)

    # The context manager guarantees the file is flushed and closed even if
    # pickling fails part-way.
    with open(f'{model_dir}/trained_model.sav', 'wb') as model_file:
        pickle.dump(mlp, model_file)

    # Arrays are saved positionally, so the archives contain arr_0 (features)
    # and arr_1 (labels).
    np.savez(f'{traintest_dir}/train_data.npz', X_train, y_train)
    np.savez(f'{traintest_dir}/val_data.npz', X_test, y_test)

# Wrap train_model as a pipeline component running in the pinned notebook image.
train_model_comp = comp.create_component_from_func(
    train_model,
    base_image="quay.io/ibm/kubeflow-notebook-image-ppc64le@sha256:97695b7b4dfab12a65b3d9aaea65649bee1769e578c0965f96648aa55f81fb27",
)

# Evaluation

In [20]:
def evaluate_model(
    prep_data_dir: comp.InputPath(str),
    model_dir: comp.InputPath(str),
):
    """Score the trained classifier on the held-out validation split.

    Prints the accuracy, followed by the weighted precision, recall and F1.

    Args:
        prep_data_dir: Directory containing ``val_data.npz`` (features first,
            labels second). Despite its name, the pipeline in this notebook
            feeds it the ``traintest_dir`` output of the training step.
        model_dir: Directory containing the pickled classifier
            ``trained_model.sav``.
    """
    # Imports stay inside the function: it is wrapped with
    # create_component_from_func, so the body must be self-contained.
    import pickle

    import numpy as np
    from sklearn.metrics import (
        precision_score,
        recall_score,
        f1_score
    )

    # Close the .npz archive as soon as both arrays have been read from it.
    with np.load(f'{prep_data_dir}/val_data.npz') as val_data:
        X_test = val_data[val_data.files[0]]
        y_test = val_data[val_data.files[1]]

    # NOTE: unpickling executes arbitrary code; only load model artifacts
    # produced by a trusted upstream step.
    with open(f'{model_dir}/trained_model.sav', 'rb') as model_file:
        mlp = pickle.load(model_file)

    # make prediction
    preds = mlp.predict(X_test)
    # calculate accuracy
    accuracy = np.mean(preds == y_test)
    print(accuracy)

    # calculate precision, recall, f1-score
    precision = precision_score(y_test, preds, average='weighted')
    recall = recall_score(y_test, preds, average='weighted')
    f1 = f1_score(y_test, preds, average='weighted')
    print(precision, recall, f1)


# Wrap evaluate_model as a pipeline component running in the pinned notebook image.
evaluate_model_comp = comp.create_component_from_func(
    evaluate_model,
    base_image="quay.io/ibm/kubeflow-notebook-image-ppc64le@sha256:97695b7b4dfab12a65b3d9aaea65649bee1769e578c0965f96648aa55f81fb27",
)

# 3 Pipeline Definition and Run

In [21]:
@dsl.pipeline(
    name='Fraud detection classification pipeline',
    description='Fraud detection',
)
def fraud_detection_pipeline(
    dataset_url: str,
    dataset_file_name: str = "data.zip",
    data_dir: str = "/train/data",
    prep_data_dir: str = "/train/prep_data",
    model_dir: str = "/train/model",
    model_name: str = "fraud_detection-classification",
    minio_url: str = MINIO_URL,
    minio_user: str = MINIO_USER,
    minio_pass: str = MINIO_PASS,
):
    # Steps: download + extract -> preprocess -> train -> evaluate.
    # Only dataset_url and dataset_file_name are consumed below; the remaining
    # parameters are accepted but not wired into any step.

    # Step 1: fetch the archive and unpack it.
    fetch_task = download_and_extract_comp(
        url=dataset_url,
        file_name=dataset_file_name,
    )
    raw_data = fetch_task.outputs['data_path']

    # Step 2: convert the raw CSV into a pickled DataFrame.
    prep_task = preprocess_data_comp(raw_data)

    # Step 3: train the classifier.
    # NOTE(review): both GPU-limited steps only use scikit-learn — confirm the
    # GPU request is actually needed.
    train_task = train_model_comp(prep_task.output)
    train_task.set_gpu_limit(1)

    # Step 4: evaluate on the validation split written by the training step.
    eval_task = evaluate_model_comp(
        train_task.outputs['traintest_dir'],
        train_task.outputs['model_dir'],
    )
    eval_task.set_gpu_limit(1)

In [22]:
# Argument values for this pipeline run, keyed by pipeline parameter name.
arguments = dict(
    dataset_url=DATASET_URL,
    dataset_file_name=DATASET_FILE_NAME,
    data_dir='/train/data',
    prep_data_dir='/train/prep_data',
    model_dir='/train/model',
    model_name=MODEL_NAME,
    minio_url=MINIO_URL,
    minio_user=MINIO_USER,
    minio_pass=MINIO_PASS,
)

# Submit the run in the notebook's namespace; the call result is the cell output.
client.create_run_from_pipeline_func(
    fraud_detection_pipeline, arguments=arguments, namespace=NAMESPACE
)

RunPipelineResult(run_id=342f8bdc-d03e-4b38-9ff9-31b880c31383)