# Mobile Price Predictions

In this notebook, you will learn how to build a Kubeflow pipeline using lightweight components. This is the simplest way to get started with Kubeflow Pipelines.
For this tutorial, we will utilize the Mobile Price Classification dataset available [on Kaggle](https://www.kaggle.com/datasets/iabhishekofficial/mobile-price-classification?datasetId=11167&sortBy=voteCount).

As a prerequisite, please download the dataset, unzip it, and upload it to MinIO. In our example, we uploaded both tables to a MinIO bucket, such that the respective paths are: 'kubeflow-examples/mobile-price-classification/test.csv' and 'kubeflow-examples/mobile-price-classification/train.csv'. These paths are input parameters for the pipeline. If you decide to store the files in a different location, you can easily modify these parameters in the "Compile and run pipeline" section below.


In [None]:
!pip install kfp[all]==2.0.1

In [None]:
import os
import pandas as pd
import numpy as np
import kfp
import kfp.dsl as dsl
from kfp.dsl import HTML, Input, Output, Dataset, Artifact, Model, ClassificationMetrics, Markdown
from kfp.client import Client
from typing import Dict, Tuple, List

## Create components

### Read data

In [None]:
@dsl.component(
    base_image="python:3.9",
    packages_to_install=["pandas", "pyarrow"],
)
def read_data(
    minio_train_data: Input[Dataset],
    minio_test_data: Input[Dataset],
    train_df: Output[Dataset],
    test_df: Output[Dataset],
):
    """Read the training and test CSV files and write them out as parquet artifacts.

    Args:
        minio_train_data: Input artifact whose path holds the training CSV.
        minio_test_data: Input artifact whose path holds the test CSV.
        train_df: Output artifact that receives the training table as parquet.
        test_df: Output artifact that receives the test table as parquet.
    """
    import pandas as pd

    # Load both tables before writing anything, so a failed read leaves no output.
    frames = [pd.read_csv(source.path) for source in (minio_train_data, minio_test_data)]

    for frame, target in zip(frames, (train_df, test_df)):
        frame.to_parquet(target.path)

### Split data

In [None]:
@dsl.component(
    packages_to_install=["pandas", "scikit-learn", "pyarrow"],
    base_image="python:3.9",
)
def split_data(
    train_df: Input[Dataset],
    x_train_df: Output[Dataset],
    y_train_df: Output[Dataset],
    x_val_df: Output[Dataset],
    y_val_df: Output[Dataset],
    test_size: float = 0.5,
    seed: int = 42,
):
    """Split the training table into training and validation features and targets.

    Args:
        train_df: Parquet artifact containing the features and the
            ``price_range`` target column.
        x_train_df: Output artifact for the training features.
        y_train_df: Output artifact for the training target.
        x_val_df: Output artifact for the validation features.
        y_val_df: Output artifact for the validation target.
        test_size: Passed to ``train_test_split`` as ``test_size``.
        seed: Passed to ``train_test_split`` as ``random_state``.
    """
    import pandas as pd
    from sklearn.model_selection import train_test_split

    # Read the input dataset
    data = pd.read_parquet(train_df.path)

    # Separate the target from the features; the target stays a one-column
    # DataFrame so that it can be written to parquet below.
    y = data["price_range"].to_frame()
    x_data = data.drop(columns=["price_range"])

    # Split the data into training and validation sets
    x_train, x_val, y_train, y_val = train_test_split(
        x_data, y, test_size=test_size, random_state=seed
    )

    # Save each split to its output artifact. The loop variable is named
    # `frame` so that the builtin `object` is not shadowed.
    outputs = (
        (x_train, x_train_df),
        (x_val, x_val_df),
        (y_train, y_train_df),
        (y_val, y_val_df),
    )
    for frame, artifact in outputs:
        frame.to_parquet(artifact.path)


### Fit scaler

In [None]:
@dsl.component(
    base_image="python:3.9",
    packages_to_install=["pandas", "scikit-learn", "pyarrow"],
)
def fit_scaler(
    train_x: Input[Dataset],
    fitted_scaler: Output[Artifact]
):
    """Fit a MinMaxScaler on the training features and persist it with joblib.

    Args:
        train_x: Parquet artifact holding the training features.
        fitted_scaler: Output artifact that receives the dumped scaler.
    """
    import pandas as pd
    from joblib import dump
    from sklearn.preprocessing import MinMaxScaler

    features = pd.read_parquet(train_x.path)

    # `fit` returns the estimator itself, so construction and fitting chain.
    scaler = MinMaxScaler().fit(features)

    dump(scaler, fitted_scaler.path)

### Run grid search

In [None]:
@dsl.component(
    base_image="python:3.9",
    packages_to_install=["pandas", "scikit-learn", "pyarrow"],
)
def tune_hyperparams(
    train_x: Input[Dataset],
    train_y: Input[Dataset],
    fitted_scaler: Input[Artifact],
    C: List = [1, 0.1, 0.25, 0.5, 2, 0.75],
    kernel: List = ["linear", "rbf"],
    gamma: List = ["auto", 0.01, 0.001, 0.0001, 1],
    decision_function_shape: List[str] = ["ovo", "ovr"],
    seed: int = 42,
) -> dict:
    """Grid-search SVC hyperparameters with 5-fold cross-validation.

    The training features are scaled with the previously fitted scaler before
    the search is run.

    Args:
        train_x: Parquet artifact holding the training features.
        train_y: Parquet artifact holding the ``price_range`` target.
        fitted_scaler: Artifact holding the joblib-dumped scaler.
        C: Candidate values for the SVC ``C`` parameter.
        kernel: Candidate values for the SVC ``kernel`` parameter.
        gamma: Candidate values for the SVC ``gamma`` parameter.
        decision_function_shape: Candidate values for the SVC
            ``decision_function_shape`` parameter.
        seed: ``random_state`` given to the SVC estimator.

    Returns:
        The ``best_params_`` dictionary of the fitted grid search.
    """
    import pandas as pd
    from joblib import load
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC

    scaler = load(fitted_scaler.path)

    features = pd.read_parquet(train_x.path)
    targets = pd.read_parquet(train_y.path)

    # Rebuild a DataFrame after scaling so the column names are kept.
    scaled_features = pd.DataFrame(
        scaler.transform(features), columns=features.columns
    )

    search_space = {
        "kernel": kernel,
        "C": C,
        "gamma": gamma,
        "decision_function_shape": decision_function_shape,
    }
    search = GridSearchCV(
        estimator=SVC(random_state=seed),
        param_grid=search_space,
        cv=5,
    )
    search.fit(scaled_features, targets['price_range'].values)

    print("Best score: ", search.best_score_)

    return search.best_params_

### Train model with optimal hyperparameters

In [None]:
@dsl.component(
    base_image="python:3.9",
    packages_to_install=["pandas", "scikit-learn", "pyarrow"],
)
def train_model(
    train_x: Input[Dataset],
    train_y: Input[Dataset],
    fitted_scaler: Input[Artifact],
    hparams: Dict,
    trained_model: Output[Model],
    seed: int = 42,
):
    """Fit an SVC with the given hyperparameters and dump it with joblib.

    Args:
        train_x: Parquet artifact holding the training features.
        train_y: Parquet artifact holding the ``price_range`` target.
        fitted_scaler: Artifact holding the joblib-dumped scaler.
        hparams: Keyword arguments forwarded to the ``SVC`` constructor.
        trained_model: Output artifact that receives the fitted model.
        seed: ``random_state`` given to the SVC estimator.
    """
    import pandas as pd
    from joblib import dump, load
    from sklearn.svm import SVC

    scaler = load(fitted_scaler.path)

    features = pd.read_parquet(train_x.path)
    targets = pd.read_parquet(train_y.path)

    # Scale the features and keep the original column names.
    scaled_features = pd.DataFrame(
        scaler.transform(features), columns=features.columns
    )

    # `fit` returns the estimator, so the fitted model is bound directly.
    classifier = SVC(random_state=seed, **hparams).fit(
        scaled_features, targets['price_range'].values
    )

    dump(classifier, trained_model.path)


### Evaluate model on validation dataset

In [None]:
@dsl.component(
    packages_to_install=["pandas", "scikit-learn", "pyarrow"],
    base_image="python:3.9",
)
def evaluate_model(
    val_x: Input[Dataset],
    val_y: Input[Dataset],
    fitted_scaler: Input[Artifact],
    trained_model: Input[Model],
    confusion_matrix_plot: Output[ClassificationMetrics],
    classification_report_md: Output[Markdown]
):
    """Evaluate the trained model on the validation split.

    Logs a confusion matrix to ``confusion_matrix_plot`` and writes a
    classification report, wrapped in a fenced code block, to
    ``classification_report_md``.

    Args:
        val_x: Parquet artifact holding the validation features.
        val_y: Parquet artifact holding the ``price_range`` target.
        fitted_scaler: Artifact holding the joblib-dumped scaler.
        trained_model: Artifact holding the joblib-dumped model.
        confusion_matrix_plot: Output metrics artifact for the confusion matrix.
        classification_report_md: Output Markdown artifact for the report.
    """
    import pandas as pd
    from joblib import load
    from sklearn.metrics import classification_report, confusion_matrix

    # Load the fitted scaler and preprocess validation data
    scaler = load(fitted_scaler.path)
    x_val, y_val = [pd.read_parquet(path) for path in (val_x.path, val_y.path)]
    x_val = pd.DataFrame(scaler.transform(x_val), columns=x_val.columns)

    # Load the trained model and make predictions on validation data
    svm_model = load(trained_model.path)
    predictions = svm_model.predict(x_val)

    y_true = y_val['price_range'].values.tolist()
    y_pred = predictions.tolist()

    # confusion_matrix orders rows and columns by sorted label value, whereas
    # Series.unique() yields labels in order of first appearance. Build the
    # sorted label list once and use it for both the matrix and the category
    # names so they are guaranteed to line up.
    labels = sorted(set(y_true) | set(y_pred))
    matrix = confusion_matrix(y_true, y_pred, labels=labels)

    # Log the confusion matrix
    confusion_matrix_plot.log_confusion_matrix(
        [str(label) for label in labels],
        matrix.tolist(),
    )

    # Create the markdown content for classification report
    markdown_content = f"```\n{classification_report(y_val['price_range'].values, predictions)}\n```"

    # Write the content to a Markdown file
    with open(classification_report_md.path, 'w', encoding='utf-8') as f:
        f.write(markdown_content)

### Run predictions on test dataset

In [None]:
@dsl.component(
    base_image="python:3.9",
    packages_to_install=["pandas", "scikit-learn", "pyarrow", "plotly"],
)
def test_model(
    test_x: Input[Dataset],
    trained_model: Input[Model],
    fitted_scaler: Input[Artifact],
    column_x: str,
    column_y: str,
    scatter_plot: Output[HTML]
):
    """Predict on the test table and save a scatter plot coloured by prediction.

    The ``id`` column is dropped, the remaining columns are scaled with the
    fitted scaler, and the plot is drawn from the scaled values.

    Args:
        test_x: Parquet artifact holding the test table.
        trained_model: Artifact holding the joblib-dumped model.
        fitted_scaler: Artifact holding the joblib-dumped scaler.
        column_x: Column plotted on the x axis.
        column_y: Column plotted on the y axis.
        scatter_plot: Output HTML artifact that receives the plot.
    """
    import pandas as pd
    import plotly.express as px
    from joblib import load

    scaler = load(fitted_scaler.path)

    raw_table = pd.read_parquet(test_x.path)
    features = raw_table.drop(columns=['id'])
    scaled = pd.DataFrame(scaler.transform(features), columns=features.columns)

    svm_model = load(trained_model.path)

    # Predict first, then attach the result as an extra column for plotting.
    predicted = svm_model.predict(scaled)
    scaled['Predicted Class'] = predicted

    plot_options = dict(
        x=column_x,
        y=column_y,
        color='Predicted Class',
        color_continuous_scale='Viridis',
        title=f"Scatter plot of {column_x} vs. {column_y} colored by Predicted Class",
        template='plotly_dark',
    )
    figure = px.scatter(scaled, **plot_options)

    figure.write_html(scatter_plot.path)

## Build pipeline

In [None]:
@dsl.pipeline
def mobile_price_classification_pipeline(
    minio_train_data_path: str = 'minio://kubeflow-examples/mobile-price-classification/train.csv',
    minio_test_data_path: str = 'minio://kubeflow-examples/mobile-price-classification/test.csv',
    test_size: float = 0.5,
    C: List = [1, 0.1, 0.25, 0.5, 2, 0.75],
    kernel: List = ["linear", "rbf"],
    gamma: List = ["auto", 0.01, 0.001, 0.0001, 1],
    decision_function_shape: List[str] = ["ovo", "ovr"],
    scatter_plot_column_x: str = 'ram',
    scatter_plot_column_y: str = 'battery_power',
    seed: int = 42,
):
    """
    Define the mobile price classification pipeline.

    This pipeline covers the following steps:
    1. Read data from the specified paths.
    2. Split the data into training and validation sets.
    3. Fit the MinMax scaler.
    4. Tune hyperparameters for the SVM model.
    5. Train the SVM model with the best hyperparameters.
    6. Evaluate the trained model.
    7. Test the model and visualize the results with a scatter plot.

    Args:
        minio_train_data_path: URI of the training CSV to import.
        minio_test_data_path: URI of the test CSV to import.
        test_size: Validation share forwarded to the split step.
        C: Candidate ``C`` values forwarded to the tuning step.
        kernel: Candidate ``kernel`` values forwarded to the tuning step.
        gamma: Candidate ``gamma`` values forwarded to the tuning step.
        decision_function_shape: Candidate ``decision_function_shape`` values
            forwarded to the tuning step.
        scatter_plot_column_x: Column plotted on the x axis in the test step.
        scatter_plot_column_y: Column plotted on the y axis in the test step.
        seed: Random seed forwarded to the split, tuning and training steps.
    """
    import_train = dsl.importer(
        artifact_uri=minio_train_data_path,
        artifact_class=dsl.Dataset,
        reimport=True)

    import_test = dsl.importer(
        artifact_uri=minio_test_data_path,
        artifact_class=dsl.Dataset,
        reimport=True)

    # Step 1: Read the data
    read_data_task = read_data(
        minio_train_data=import_train.output,
        minio_test_data=import_test.output
    )

    # Step 2: Split the data
    split_data_task = split_data(
        train_df=read_data_task.outputs['train_df'],
        test_size=test_size,
        seed=seed
    )

    # Step 3: Fit the scaler
    fit_scaler_task = fit_scaler(
        train_x=split_data_task.outputs['x_train_df']
    )

    # Step 4: Tune hyperparameters.
    # The search grid and the seed are forwarded explicitly; without this the
    # component falls back to its own defaults and the pipeline parameters
    # supplied at run time have no effect.
    tune_hyperparams_task = tune_hyperparams(
        train_x=split_data_task.outputs['x_train_df'],
        train_y=split_data_task.outputs['y_train_df'],
        fitted_scaler=fit_scaler_task.outputs['fitted_scaler'],
        C=C,
        kernel=kernel,
        gamma=gamma,
        decision_function_shape=decision_function_shape,
        seed=seed
    )

    # Step 5: Train the model (same seed as the tuning step)
    train_model_task = train_model(
        train_x=split_data_task.outputs['x_train_df'],
        train_y=split_data_task.outputs['y_train_df'],
        hparams=tune_hyperparams_task.output,
        fitted_scaler=fit_scaler_task.outputs['fitted_scaler'],
        seed=seed
    )

    # Step 6: Evaluate the model
    evaluate_model(
        val_x=split_data_task.outputs['x_val_df'],
        val_y=split_data_task.outputs['y_val_df'],
        trained_model=train_model_task.outputs['trained_model'],
        fitted_scaler=fit_scaler_task.outputs['fitted_scaler']
    )

    # Step 7: Test the model and visualize
    test_model(
        test_x=read_data_task.outputs['test_df'],
        trained_model=train_model_task.outputs['trained_model'],
        fitted_scaler=fit_scaler_task.outputs['fitted_scaler'],
        column_x=scatter_plot_column_x,
        column_y=scatter_plot_column_y
    )


## Compile and run pipeline

In [None]:
# Connect to the Kubeflow Pipelines API
client = Client()

# Pipeline arguments for this run; edit the two data paths if the CSV files
# are stored in a different location.
args = {
    "minio_train_data_path": 'minio://kubeflow-examples/mobile-price-classification/train.csv',
    "minio_test_data_path": 'minio://kubeflow-examples/mobile-price-classification/test.csv',
    "test_size": 0.2,
    "C": [1, 0.1, 0.25, 0.5, 2, 0.75],
    "kernel": ["linear", "rbf"],
    "gamma": ["auto", 0.01, 0.001, 0.0001, 1],
    "decision_function_shape": ["ovo", "ovr"],
    "scatter_plot_column_x": 'ram',
    "scatter_plot_column_y": 'battery_power',
    "seed": 42,
}

# Submit a run built directly from the pipeline function
client.create_run_from_pipeline_func(
    pipeline_func=mobile_price_classification_pipeline,
    arguments=args,
    experiment_name="mobile-price-classification",
    enable_caching=True,
)


# Debugging with Lightweight Components
Debugging can be challenging when using lightweight components in Kubeflow Pipelines. A practical approach is to download from MinIO the artifacts produced by the steps that precede the failing one, and then run the component functions locally on them. You can copy the paths to these artifacts from the Kubeflow Pipelines UI. Once you have these paths, you can use them as shown in the example below to read the artifacts into Pandas DataFrames or perform similar operations. Make sure to adjust the paths to match your own setup.

In [None]:
# Credentials and endpoint for the S3-compatible store, taken from the environment
storage_options = dict(
    key=os.environ["AWS_ACCESS_KEY_ID"],
    secret=os.environ["AWS_SECRET_ACCESS_KEY"],
    client_kwargs=dict(endpoint_url=f'http://{os.environ["S3_ENDPOINT"]}'),
)

In [None]:
# Artifact locations copied from the Kubeflow Pipelines UI (Modify paths!)

# Outputs of the read-data step
raw_train_data_path = 's3://mlpipeline/v2/artifacts/mmobile-price-classification-pipeline/23e796ae-df8e-4b4b-a36c-aad4a85da4b0/read-data/train_df'
x_test_path = 's3://mlpipeline/v2/artifacts/mmobile-price-classification-pipeline/23e796ae-df8e-4b4b-a36c-aad4a85da4b0/read-data/test_df'

# Outputs of the split-data step
x_train_path = 's3://mlpipeline/v2/artifacts/mmobile-price-classification-pipeline/f053e118-abc8-43f0-a1a2-5b8a11156287/split-data/x_train_df'
y_train_path = 's3://mlpipeline/v2/artifacts/mmobile-price-classification-pipeline/f053e118-abc8-43f0-a1a2-5b8a11156287/split-data/y_train_df'
x_val_path = 's3://mlpipeline/v2/artifacts/mmobile-price-classification-pipeline/f053e118-abc8-43f0-a1a2-5b8a11156287/split-data/x_val_df'
y_val_path = 's3://mlpipeline/v2/artifacts/mmobile-price-classification-pipeline/f053e118-abc8-43f0-a1a2-5b8a11156287/split-data/y_val_df'

# Read every artifact as a DataFrame, in the same order as the names on the left
x_train_raw, x_train, y_train, x_val, y_val, x_test = (
    pd.read_parquet(uri, storage_options=storage_options)
    for uri in (
        raw_train_data_path,
        x_train_path,
        y_train_path,
        x_val_path,
        y_val_path,
        x_test_path,
    )
)

In [None]:
# use the mc tool to download model artifact
!mc cp minio/mlpipeline/v2/artifacts/mmobile-price-classification-pipeline/90e172f1-5143-475d-b02c-92fbc34338cb/train-model/trained_model ./trained_model

In [None]:
from joblib import load
# Load the model file that the `mc cp` cell above copied to ./trained_model.
# NOTE(review): joblib.load unpickles the file, so only load artifacts that
# come from a bucket you trust.
svm_model = load('./trained_model')