In [None]:
!pip install kfp --upgrade

In [1]:
import getpass
import os

import kfp
from kfp import components

### Load pre-built components for our pipeline

In [2]:
# Each URL below embeds a fixed revision hash of the kubeflow/pipelines
# repository, so every load presumably resolves to one specific version of
# the component definition.
load_component = components.load_component_from_url

# Dataset source: Chicago Taxi trips.
chicago_taxi_dataset_op = load_component(
    url='https://raw.githubusercontent.com/kubeflow/pipelines/e3337b8bdcd63636934954e592d4b32c95b49129/components/datasets/Chicago%20Taxi/component.yaml'
)

# Converter: CSV -> Apache Parquet.
convert_csv_to_apache_parquet_op = load_component(
    url='https://raw.githubusercontent.com/kubeflow/pipelines/0d7d6f41c92bdc05c2825232afe2b47e5cb6c4b3/components/_converters/ApacheParquet/from_CSV/component.yaml'
)

# XGBoost train / predict operating on CSV data.
xgboost_train_on_csv_op = load_component(
    url='https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Train/component.yaml'
)
xgboost_predict_on_csv_op = load_component(
    url='https://raw.githubusercontent.com/kubeflow/pipelines/31939086d66d633732f75300ce69eb60e9fb0269/components/XGBoost/Predict/component.yaml'
)

# XGBoost train / predict operating on Apache Parquet data.
xgboost_train_on_parquet_op = load_component(
    url='https://raw.githubusercontent.com/kubeflow/pipelines/0ae2f30ff24beeef1c64cc7c434f1f652c065192/components/XGBoost/Train/from_ApacheParquet/component.yaml'
)
xgboost_predict_on_parquet_op = load_component(
    url='https://raw.githubusercontent.com/kubeflow/pipelines/31939086d66d633732f75300ce69eb60e9fb0269/components/XGBoost/Predict/from_ApacheParquet/component.yaml'
)


### Creating a pipeline

In [3]:
@kfp.dsl.pipeline(name='xgboost')
def xgboost_pipeline():
    # Pipeline outline:
    #   1. fetch a sample of Chicago Taxi trips as CSV,
    #   2. train and predict with XGBoost on the CSV data,
    #   3. convert the CSV to Apache Parquet, then train and predict on that,
    #   4. run each model against the other data format as a cross-check.
    # Every prediction step reuses the training data as its input.

    # Based on experimentation, many steps need 1Gi memory.
    step_memory = '1Gi'

    # `tips` is listed first in `select`; the CSV steps refer to it as
    # label_column=0 and the Parquet steps as label_column_name='tips'
    # (assumes the dataset component keeps the selected column order).
    taxi_query = chicago_taxi_dataset_op(
        where='trip_start_timestamp >= "2019-01-01" AND trip_start_timestamp < "2019-02-01"',
        select='tips,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tolls,extras,trip_total',
        limit=10000,
    )
    taxi_csv = taxi_query.output

    # --- CSV: train, then predict ---
    csv_training = xgboost_train_on_csv_op(
        training_data=taxi_csv,
        label_column=0,
        objective='reg:squarederror',
        num_iterations=200,
    ).set_memory_limit(step_memory)
    csv_model = csv_training.outputs['model']

    xgboost_predict_on_csv_op(
        data=taxi_csv,
        model=csv_model,
        label_column=0,
    ).set_memory_limit(step_memory)

    # --- Apache Parquet: convert, train, then predict ---
    parquet_conversion = convert_csv_to_apache_parquet_op(taxi_csv)
    taxi_parquet = parquet_conversion.output

    parquet_training = xgboost_train_on_parquet_op(
        training_data=taxi_parquet,
        label_column_name='tips',
        objective='reg:squarederror',
        num_iterations=200,
    ).set_memory_limit(step_memory)
    parquet_model = parquet_training.outputs['model']

    xgboost_predict_on_parquet_op(
        data=taxi_parquet,
        model=parquet_model,
        label_column_name='tips',
    ).set_memory_limit(step_memory)

    # --- Cross-format checks: each model predicts on the other format ---
    xgboost_predict_on_parquet_op(
        data=taxi_parquet,
        model=csv_model,
        label_column_name='tips',
    ).set_memory_limit(step_memory)

    xgboost_predict_on_csv_op(
        data=taxi_csv,
        model=parquet_model,
        label_column=0,
    ).set_memory_limit(step_memory)


### Connect to your kfp client

In [None]:
# The session cookie is a credential, so it is obtained at run time instead
# of being pasted into the notebook, where it would be saved with the file.
# Set the KFP_AUTHSERVICE_SESSION environment variable for unattended runs;
# when it is unset or empty, a hidden prompt asks for the cookie value.
session_cookie = (
    os.environ.get('KFP_AUTHSERVICE_SESSION')
    or getpass.getpass('authservice_session cookie value: ')
)

# Same "authservice_session=<value>" format that the client was given before.
authservice_session = f'authservice_session={session_cookie}'
client = kfp.Client(host="dev-tap.aiops-platform.io/pipeline", cookies=authservice_session)

###### A pipeline run must be attached to an experiment. If you do not specify an experiment name, the run is placed under the 'Default' experiment.

In [None]:
# Create the experiment and keep the returned object in `experiment`.
# All three values are placeholders: replace them before running this cell.
experiment = client.create_experiment(
    namespace="<your-namespace>",
    name="<specify-the-experiment-name>",
    description="add a desccription",
)

In [None]:
# Submit `xgboost_pipeline` as a run through the connected client.
# NOTE(review): no namespace is passed here although the experiment cell
# specifies one -- confirm the client's default namespace is the intended one.
client.create_run_from_pipeline_func(
    pipeline_func=xgboost_pipeline,
    arguments={},  # the pipeline function declares no parameters
    experiment_name="<specify-the-experiment-name>",
    run_name="<specify-the-run-name>",
)