## Simple pipeline with MLflow model tracking

An experiment with the Iris dataset. To use MLflow inside a Kubeflow Pipeline, the required environment variables must be passed to every container that runs MLflow code. This is implemented in the `add_env_to_mlflow_kfp_container` function.

In [None]:
import os

import kfp.dsl as dsl
from kfp.client import Client
from kfp.dsl import Dataset, Input, Model, Output


def add_env_to_mlflow_kfp_container(container_task):
    """Copy the MLflow-related environment variables onto a pipeline task.

    The values are read from the current process environment (``os.environ``)
    and attached to ``container_task`` one by one via ``set_env_variable``.

    Args:
        container_task: Object exposing ``set_env_variable(name, value)``.

    Returns:
        The same ``container_task`` object that was passed in.

    Raises:
        KeyError: If one of the required variables is not set in the current
            environment. All lookups happen before the task is modified.
    """
    required_names = (
        "AWS_ENDPOINT_URL",
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
        "DB_URI",
        "MLFLOW_S3_BUCKET",
        "MLFLOW_URI",
    )
    # Resolve every value up front so that a missing variable fails before
    # anything has been attached to the task.
    resolved = {name: os.environ[name] for name in required_names}
    for name in required_names:
        container_task.set_env_variable(name, resolved[name])
    return container_task


@dsl.component(
    packages_to_install=["pandas", "pyarrow", "scikit-learn"],
    base_image="python:3.11",
)
def preprocess_data(
    x_train_df: Output[Dataset],
    y_train_df: Output[Dataset],
    x_test_df: Output[Dataset],
    y_test_df: Output[Dataset],
    test_size: float = 0.2,
    seed: int = 42,
):
    """Load the iris dataset, split it, and store each part as parquet.

    Args:
        x_train_df: Output artifact that receives the training features.
        y_train_df: Output artifact that receives the training targets.
        x_test_df: Output artifact that receives the test features.
        y_test_df: Output artifact that receives the test targets.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        seed: Forwarded to ``train_test_split`` as ``random_state``.
    """
    # Imports are kept inside the function body so the component stays
    # self-contained.
    from sklearn import datasets
    from sklearn.model_selection import train_test_split

    iris = datasets.load_iris(as_frame=True)
    features = iris.data
    # The target is converted to a frame so it can be written with to_parquet.
    labels = iris.target.to_frame()

    features_train, features_test, labels_train, labels_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=seed,
    )

    # One parquet file per output artifact.
    features_train.to_parquet(x_train_df.path)
    features_test.to_parquet(x_test_df.path)
    labels_train.to_parquet(y_train_df.path)
    labels_test.to_parquet(y_test_df.path)


@dsl.component(
    packages_to_install=["pandas", "pyarrow", "scikit-learn", "mlflow", "boto3"],
    base_image="python:3.11",
)
def train_and_log_model(
    x_train: Input[Dataset],
    y_train: Input[Dataset],
    x_test: Input[Dataset],
    y_test: Input[Dataset],
    trained_model: Output[Model],
    seed: int = 42,
):
    """Train a logistic regression on the given split and log it to MLflow.

    The fitted estimator is written to ``trained_model.path`` with joblib.
    Hyperparameters, test accuracy, a tag and the model itself are then
    logged to the MLflow tracking server named by the ``MLFLOW_URI``
    environment variable.

    Args:
        x_train: Parquet artifact holding the training features.
        y_train: Parquet artifact holding the training targets.
        x_test: Parquet artifact holding the test features.
        y_test: Parquet artifact holding the test targets.
        trained_model: Output artifact that receives the joblib dump.
        seed: Used as ``random_state`` of the estimator.

    Raises:
        KeyError: If ``MLFLOW_URI`` is not set in the container environment.
    """
    import os

    import mlflow
    import pandas as pd
    from joblib import dump
    from mlflow.models import infer_signature
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score

    train_features = pd.read_parquet(x_train.path)
    # The targets are stored as single-column frames; squeeze them to a
    # Series so scikit-learn receives 1-d labels and does not emit a
    # DataConversionWarning. Frames with several columns are left untouched.
    train_labels = pd.read_parquet(y_train.path).squeeze("columns")
    test_features = pd.read_parquet(x_test.path)
    test_labels = pd.read_parquet(y_test.path).squeeze("columns")

    # Define the model hyperparameters.
    # "multi_class" is intentionally not passed: it is deprecated since
    # scikit-learn 1.5 and "auto" was its default, so omitting it keeps the
    # behaviour while staying compatible with the unpinned scikit-learn.
    params = {
        "solver": "lbfgs",
        "max_iter": 1000,
        "random_state": seed,
    }

    # Train the model
    lr = LogisticRegression(**params)
    lr.fit(train_features, train_labels)

    # Save the trained model as a pipeline artifact
    dump(lr, trained_model.path)

    # Predict on the test set
    y_pred = lr.predict(test_features)

    # Calculate metrics
    accuracy = accuracy_score(test_labels, y_pred)

    # Set our tracking server uri for logging
    mlflow.set_tracking_uri(uri=os.environ["MLFLOW_URI"])

    # Select the MLflow experiment the run is recorded under
    mlflow.set_experiment("MLflow Quickstart with KFP")

    # Start an MLflow run
    with mlflow.start_run():
        # Log the hyperparameters
        mlflow.log_params(params)

        # Log the accuracy measured on the test set
        mlflow.log_metric("accuracy", accuracy)

        # Set a tag that we can use to remind ourselves what this run was for
        mlflow.set_tag("Training Info", "Basic LR model for iris data, KFP")

        # Infer the model signature from the training inputs and predictions
        signature = infer_signature(train_features, lr.predict(train_features))

        # Log the model and register it under a fixed model name
        mlflow.sklearn.log_model(
            sk_model=lr,
            artifact_path="iris_model",
            signature=signature,
            input_example=train_features,
            registered_model_name="tracking-quickstart-pipeline",
        )


@dsl.pipeline
def simple_pipeline():
    """Preprocess the iris data, then train a model and log it to MLflow."""
    # Step 1: produce the train/test artifacts
    split_task = preprocess_data()
    split_outputs = split_task.outputs

    # Step 2: feed the four artifacts into the training component
    training_task = train_and_log_model(
        x_train=split_outputs["x_train_df"],
        y_train=split_outputs["y_train_df"],
        x_test=split_outputs["x_test_df"],
        y_test=split_outputs["y_test_df"],
    )

    # The training step uses MLflow, so it gets the MLflow env vars attached.
    add_env_to_mlflow_kfp_container(training_task)


# Initialize the Kubeflow Pipelines client. No host or credentials are passed,
# so the client falls back to whatever defaults it resolves on its own.
client = Client()

# Create a new run from the pipeline function under the
# "iris-dataset-classification" experiment, with caching switched on.
# NOTE(review): with caching enabled, a re-run with unchanged inputs may reuse
# earlier step results instead of executing the training step again, in which
# case nothing new would be logged to MLflow -- confirm this is intended.
client.create_run_from_pipeline_func(
    simple_pipeline,
    experiment_name="iris-dataset-classification",
    enable_caching=True,
)

# Alternative (disabled): compile the pipeline to a YAML file instead of
# submitting it. The `kfp` package itself is not imported in this cell, so an
# `import kfp` / `from kfp import compiler` is needed before enabling it.
# kfp.compiler.Compiler().compile(simple_pipeline, 'simple_pipeline.yaml')