In [1]:
import kfp
from kfp.components import load_component_from_url, create_component_from_func
from kfp.components import InputPath, OutputPath

import sys
sys.path.insert(0, "..")
from constants import NAMESPACE, HOST
from utils.auth import get_session_cookie

In [2]:
# Reusable "Download" component, loaded from the component spec published in
# the kubeflow/pipelines repository at tag 1.7.1.
web_downloader_op = load_component_from_url(
    url='https://raw.githubusercontent.com/kubeflow/pipelines/1.7.1/components/web/Download/component.yaml',
)

In [3]:
def merge_csv(file_path: InputPath('Tarball'),
              output_csv: OutputPath('CSV')):
    """Merge every CSV file found in a gzipped tarball into a single CSV file.

    The archive is unpacked into a local ``data`` directory, each
    ``data/*.csv`` file is read as a header-less CSV, and all rows are
    concatenated and written out without header or index.

    Args:
        file_path: Path of the gzip-compressed tar archive to unpack.
        output_csv: Path where the merged CSV file is written.

    Raises:
        ValueError: If no ``data/*.csv`` files exist after extraction.
    """
    # Imports live inside the function because kfp builds the component from
    # this function's source alone; module-level imports are not carried over.
    import glob
    import pandas as pd
    import tarfile

    # NOTE(review): extractall() trusts the member paths stored in the
    # archive; only feed this component archives from a trusted source.
    # The context manager guarantees the archive handle is closed, even if
    # extraction fails part-way through.
    with tarfile.open(name=file_path, mode="r|gz") as archive:
        archive.extractall('data')

    # glob.glob() yields matches in arbitrary, filesystem-dependent order;
    # sorting makes the row order of the merged file reproducible.
    csv_files = sorted(glob.glob('data/*.csv'))
    if not csv_files:
        # pd.concat() would raise a ValueError on an empty list anyway; raise
        # the same exception type with a message that names the actual cause.
        raise ValueError(f"No CSV files found in archive: {file_path}")

    df = pd.concat(
        [pd.read_csv(csv_file, header=None) for csv_file in csv_files])
    df.to_csv(output_csv, index=False, header=False)
    
# Turn merge_csv into a pipeline component factory.
# output_component_file is optional: it saves the component spec for future use.
create_step_merge_csv = create_component_from_func(
    merge_csv,
    base_image='python:3.7',
    packages_to_install=['pandas==1.1.4'],
    output_component_file='../components/merge_csv/component.yaml',
)

In [4]:
def get_csv_info(input_csv: InputPath('CSV')) -> tuple:
    """Read a header-less CSV file and return the shape of the loaded table.

    Args:
        input_csv: Path of the CSV file to inspect.

    Returns:
        The ``shape`` attribute of the DataFrame read from ``input_csv``.
    """
    # Imported inside the function because kfp builds the component from this
    # function's source alone; module-level imports are not carried over.
    import pandas as pd

    df = pd.read_csv(input_csv, header=None)
    table_shape = df.shape
    # Echo the shape to stdout as well as returning it.
    print(f"[Debug] df.shape: {df.shape}")
    return table_shape
    
# Turn get_csv_info into a pipeline component factory.
# output_component_file is optional: it saves the component spec for future use.
get_csv_info_op = create_component_from_func(
    get_csv_info,
    base_image='python:3.7',
    packages_to_install=['pandas==1.1.4'],
    output_component_file='../components/get_csv_info/component.yaml',
)

In [5]:
# Define a pipeline and create one task per component:
def my_pipeline(url):
    """Chain the download, merge, and inspect components into one pipeline.

    Args:
        url: Passed to the web downloader component as its ``url`` input.
    """
    download_task = web_downloader_op(url=url)

    # merge_csv declares its input as `file_path`; kfp drops the `_path`
    # suffix from InputPath parameters, hence the `file` keyword here.
    merge_task = create_step_merge_csv(file=download_task.outputs['Data'])

    # Final step: its task object is bound but not consumed by any later step.
    csv_info_task = get_csv_info_op(input_csv=merge_task.outputs['output_csv'])

In [6]:
# Obtain a session cookie via the project helper and use it to build a client
# for the pipeline API under HOST (presumably a Dex/authservice-protected
# Kubeflow deployment — confirm against utils.auth).
session_cookie = get_session_cookie()
client = kfp.Client(
    namespace=NAMESPACE,
    cookies=f"authservice_session={session_cookie}",
    host=f"{HOST}/pipeline",
)

# Compile my_pipeline and submit it as a run. This call stays the last
# expression of the cell so the notebook displays the returned run result.
client.create_run_from_pipeline_func(
    pipeline_func=my_pipeline,
    arguments={'url': 'https://storage.googleapis.com/ml-pipeline-playground/iris-csv-files.tar.gz'},
)

RunPipelineResult(run_id=e83b8d06-1906-435b-8631-a033c77b2023)