In [1]:
# !pip install kfp==1.8.0

In [2]:
from typing import NamedTuple

import kfp
from kfp.components import InputPath, InputTextFile, OutputPath, OutputTextFile
from kfp.components import func_to_container_op

from datetime import datetime

import sys
sys.path.insert(0, "..")
from constants import NAMESPACE, HOST
from utils.auth import get_session_cookie
from utils import helpers

In [3]:
import pandas as pd

### Define several constants

In [4]:
# --- Kubeflow bookkeeping ---------------------------------------------------
EXPERIMENT_NAME = "tutorial"
PIPELINE_NAME = "linear regression"
PIPELINE_DESCRIPTION = "Using linear regression to predict house prices"
PIPELINE_VERSION = "0.0.1"  # remember to change every run

# --- Input data ---------------------------------------------------------------
# CSV passed to the pipeline as its `url` argument.
DATASET_URL = "https://raw.githubusercontent.com/quan-dang/kubeflow-tutorials/master/data/housing.csv"

### Create components from Python functions

In [5]:
def prepare_data(
    url: str,
    X_train_path: OutputPath('PKL'),
    y_train_path: OutputPath('PKL'),
    X_val_path: OutputPath('PKL'),
    y_val_path: OutputPath('PKL'),
    X_test_path: OutputPath('PKL'),
    y_test_path: OutputPath('PKL'),
):
    """Download a CSV dataset and split it into train, validation and test sets.

    The ``price`` column is used as the target; every other column is a feature.
    10% of the rows are held out as the test set, then 10% of the remaining
    rows are held out as the validation set. Both splits use ``random_state=42``.

    Args:
        url: Location of the CSV file to download. It must contain a ``price`` column.
        X_train_path: Where the training features are dumped with joblib.
        y_train_path: Where the training targets are dumped with joblib.
        X_val_path: Where the validation features are dumped with joblib.
        y_val_path: Where the validation targets are dumped with joblib.
        X_test_path: Where the test features are dumped with joblib.
        y_test_path: Where the test targets are dumped with joblib.
    """
    # NOTE(review): imports are kept inside the function on the assumption that
    # func_to_container_op runs this function standalone, without the
    # notebook's module-level imports -- confirm against the KFP docs.
    import pandas as pd
    import wget
    from sklearn.model_selection import train_test_split
    import joblib

    # wget.download returns the path it actually wrote to. Read from that path
    # instead of a hard-coded "housing.csv", so that other URLs work and a
    # renamed download (when the name is already taken) is still picked up.
    csv_path = wget.download(url)

    df = pd.read_csv(csv_path)
    X = df.drop(columns=["price"])
    y = df["price"]

    # create train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    # continue to split train set into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

    # dump data to pkl
    joblib.dump(X_train, X_train_path)
    joblib.dump(y_train, y_train_path)
    joblib.dump(X_val, X_val_path)
    joblib.dump(y_val, y_val_path)
    joblib.dump(X_test, X_test_path)
    joblib.dump(y_test, y_test_path)


prepare_data_op = func_to_container_op(
    func=prepare_data,
    packages_to_install=[
        "scikit-learn==1.0.2",
        "joblib==1.1.0",
        "pandas==1.3.5",
        "wget==3.2",
    ],
)

In [6]:
def train(
    X_train_path: InputPath('PKL'),
    y_train_path: InputPath('PKL'),
    X_val_path: InputPath('PKL'),
    y_val_path: InputPath('PKL'),
    clf_path: OutputPath('Model')
):
    """Fit a one-hot-encoding + linear-regression pipeline and save it.

    Columns of object dtype are one-hot encoded; all other columns are passed
    through unchanged. The fitted pipeline is scored on the validation set
    (R^2 is printed) and then dumped with joblib.

    Args:
        X_train_path: joblib file holding the training features.
        y_train_path: joblib file holding the training targets.
        X_val_path: joblib file holding the validation features.
        y_val_path: joblib file holding the validation targets.
        clf_path: Where the fitted scikit-learn pipeline is dumped with joblib.
    """
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.pipeline import Pipeline
    from sklearn.linear_model import LinearRegression
    from sklearn.compose import ColumnTransformer
    from sklearn.metrics import r2_score
    import joblib

    # load data
    X_train = joblib.load(X_train_path)
    y_train = joblib.load(y_train_path)
    X_val = joblib.load(X_val_path)
    y_val = joblib.load(y_val_path)

    # Columns stored as Python objects (strings) are treated as categorical.
    categorical_features = X_train.columns[X_train.dtypes == object]

    # handle_unknown="ignore": a category that appears only in the validation
    # or test split is encoded as all zeros instead of raising at predict
    # time (the default, handle_unknown="error", would abort the pipeline).
    categorical_transformer = OneHotEncoder(handle_unknown="ignore")

    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", categorical_transformer, categorical_features),
        ],
        remainder='passthrough'
    )

    clf = Pipeline(
        steps=[("preprocessor", preprocessor), ("regressor", LinearRegression())]
    )

    clf.fit(X_train, y_train)

    # make prediction on the val data
    y_val_pred = clf.predict(X_val)
    # evaluate on the val data
    print("r2_score: ", r2_score(y_val, y_val_pred))

    joblib.dump(clf, clf_path)


train_op = func_to_container_op(
    func=train,
    packages_to_install=[
        "scikit-learn==1.0.2",
        "joblib==1.1.0",
        "pandas==1.3.5",
    ],
)

In [7]:
def evaluate(
    X_test_path: InputPath('PKL'),
    y_test_path: InputPath('PKL'),
    clf_path: InputPath('Model'),
    y_test_pred_path: OutputPath('PKL')
) -> NamedTuple('Outputs', [
  ('mlpipeline_metrics', 'Metrics'),
]):
    """Score the trained model on the test set and save its predictions.

    Args:
        X_test_path: joblib file holding the test features.
        y_test_path: joblib file holding the test targets.
        clf_path: joblib file holding the fitted model.
        y_test_pred_path: Where the test-set predictions are dumped with joblib.

    Returns:
        A one-element list containing a JSON string with a single ``r2_score``
        metric, returned as the ``mlpipeline_metrics`` output.
    """
    import json
    import joblib
    from sklearn.metrics import r2_score

    # Restore the fitted model and the held-out split.
    model = joblib.load(clf_path)
    features = joblib.load(X_test_path)
    targets = joblib.load(y_test_path)

    # Predict once; the same predictions are persisted and scored.
    predictions = model.predict(features)
    joblib.dump(predictions, y_test_pred_path)

    r2_metric = {
        # The name of the metric. Visualized as the column name in the runs table.
        'name': 'r2_score',
        # The value of the metric. Must be a numeric value.
        'numberValue': r2_score(targets, predictions),
        # Optional display format: "RAW" or "PERCENTAGE".
        'format': "RAW",
    }
    return [json.dumps({'metrics': [r2_metric]})]


evaluate_op = func_to_container_op(
    func=evaluate,
    packages_to_install=[
        "scikit-learn==1.0.2",
        "joblib==1.1.0",
        "pandas==1.3.5",
    ],
)

In [8]:
def visualize(
    X_test_path: InputPath('PKL'),
    y_test_path: InputPath('PKL'),
    y_test_pred_path: InputPath('PKL'),
    mlpipeline_ui_metadata_path: kfp.components.OutputPath(),
):
    """Plot actual vs. predicted targets against each test feature.

    Draws a 3x4 grid of scatter plots, one per feature column: actual targets
    in red, predicted targets in blue. The figure is embedded as a base64 PNG
    in an HTML ``<img>`` tag and written out as inline ``web-app`` UI metadata.

    Args:
        X_test_path: joblib file holding the test features.
        y_test_path: joblib file holding the test targets.
        y_test_pred_path: joblib file holding the test-set predictions.
        mlpipeline_ui_metadata_path: Where the UI metadata JSON is written.
    """
    import joblib
    import matplotlib.pyplot as plt
    import base64
    from io import BytesIO
    import json

    # load data
    X_test = joblib.load(X_test_path)
    y_test = joblib.load(y_test_path)
    y_test_pred = joblib.load(y_test_pred_path)

    ncols = 4
    nrows = 3
    n_features = X_test.shape[1]

    fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(10, 5),
                            constrained_layout=True)

    for row in range(nrows):
        for col in range(ncols):
            # Row-major position of this subplot. The stride must be the
            # number of columns; using nrows here repeated some features and
            # never reached the last ones.
            feature_index = row * ncols + col
            if feature_index >= n_features:
                # Fewer features than panels: leave the remaining axes empty.
                continue
            axs[row, col].scatter(X_test.iloc[:, feature_index], y_test, color="red")
            axs[row, col].scatter(X_test.iloc[:, feature_index], y_test_pred, color="blue")
            axs[row, col].set_title(X_test.columns[feature_index])

    fig.suptitle('Test data')

    # Ref: https://stackoverflow.com/questions/48717794/matplotlib-embed-figures-in-auto-generated-html
    tmpfile = BytesIO()
    fig.savefig(tmpfile, format='png')
    encoded = base64.b64encode(tmpfile.getvalue()).decode('utf-8')
    html = '<img src=\'data:image/png;base64,{}\'>'.format(encoded)

    # Local copy of the HTML; nothing in this function reads it back.
    with open('test.html', 'w') as f:
        f.write(html)

    metadata = {
        'outputs' : [{
          'type': 'web-app',
          'storage': 'inline',
          'source': html,
        }]
    }

    with open(mlpipeline_ui_metadata_path, 'w') as metadata_file:
        json.dump(metadata, metadata_file)


visualize_op = func_to_container_op(
    func=visualize,
    packages_to_install=[
        "matplotlib==3.5.1",
        "joblib==1.1.0",
        "pandas==1.3.5",
    ],
)

In [9]:
# Define the pipeline: prepare data -> train -> evaluate -> visualize.
# Each *_op call below creates one task from the matching component.
def my_pipeline(url):
    # Download the dataset at `url` and split it.
    data_task = prepare_data_op(url=url)
    splits = data_task.outputs

    # Fit the model on the train split; the validation split is used for scoring.
    fit_task = train_op(
        x_train=splits['X_train'],
        y_train=splits['y_train'],
        x_val=splits['X_val'],
        y_val=splits['y_val'],
    )

    # Score the fitted model on the held-out test split.
    eval_task = evaluate_op(
        x_test=splits['X_test'],
        y_test=splits['y_test'],
        clf=fit_task.outputs['clf'],
    )

    # Plot actual vs. predicted values for the test split.
    visualize_op(
        x_test=splits['X_test'],
        y_test=splits['y_test'],
        y_test_pred=eval_task.outputs['y_test_pred'],
    )

In [10]:
# Obtain a session cookie and build a client for the pipeline API.
session_cookie = get_session_cookie()
client = kfp.Client(
    host=f"{HOST}/pipeline",
    cookies=f"authservice_session={session_cookie}",
    namespace=NAMESPACE,
)

# Submit a one-off run of the pipeline; the returned result is the cell's
# displayed output.
client.create_run_from_pipeline_func(my_pipeline, arguments={'url': DATASET_URL})

RunPipelineResult(run_id=974d04c1-22df-41c9-addf-8a18ec45282e)