In [1]:
!gcloud auth activate-service-account --key-file=${GOOGLE_APPLICATION_CREDENTIALS}

Activated service account credentials for: [kubeflow-user@learning-gcp-255620.iam.gserviceaccount.com]


In [2]:
import os
import posixpath

import kfp
import kfp.gcp

In [3]:
# Notebook-wide configuration.
# PROJECT_ID / REGION are passed to the pipeline as its `project_id` / `region`
# arguments when the run is submitted.
PROJECT_ID = 'learning-gcp-255620'
REGION = 'us-central1'
# GCS prefix under which the pipeline's working directory is placed; the bucket
# name is derived from the project id.
BASE_GCS_PATH = f'gs://{PROJECT_ID}-kubeflow-dev/kubeflow_on_gcp_demo'

In [4]:
# Build a container image from the local `src` directory on top of the
# TensorFlow 1.14 (Python 3) base image. The returned image reference is used
# as the `image` of the component steps defined below.
# NOTE(review): presumably `src` contains the `demo` package that the
# components run with `python3 -m demo....` — confirm.
BASE_IMAGE = kfp.containers.build_image_from_working_dir(
    working_dir='src',
    base_image='tensorflow/tensorflow:1.14.0-py3',
)
# Display the resulting image reference as the cell output.
BASE_IMAGE

'gcr.io/learning-gcp-255620/notebook-kubeflow-rafaelbarreto87-testing/kfp_container@sha256:b972afe8bcfdda2da2cc3ccbb5d6d35346c4bae6cd133e9fc25017c5e971c23b'

In [5]:
@kfp.dsl.component
def print_comp(msg: str):
    """Create a pipeline step that echoes ``msg``.

    Args:
        msg: Value handed to ``echo`` as its only argument.

    Returns:
        A ``kfp.dsl.ContainerOp`` named ``print`` that runs on ``alpine:3.6``.
    """
    echo_command = ['echo', msg]
    return kfp.dsl.ContainerOp(name='print', image='alpine:3.6', command=echo_command)


@kfp.dsl.component
def big_query_examples_gen_comp(
    project_id: str,
    region: str,
    temp_dir: str,
    query: str,
    output_dir: str,
    runner: str='DirectRunner'
):
    """Create a pipeline step that runs ``demo.examples_gen`` inside ``BASE_IMAGE``.

    Each parameter is forwarded unchanged as a command-line flag of the module.

    Args:
        project_id: Value for ``--project``.
        region: Value for ``--region``.
        temp_dir: Value for ``--temp_location``.
        query: Value for ``--query``.
        output_dir: Value for ``--output_dir``.
        runner: Value for ``--runner``; defaults to ``'DirectRunner'``.

    Returns:
        A ``kfp.dsl.ContainerOp`` named ``BigQuery examples generator``.
    """
    # Flag/value pairs, in the order the module receives them.
    module_args = [
        '--runner', runner,
        '--project', project_id,
        '--region', region,
        '--temp_location', temp_dir,
        '--query', query,
        '--output_dir', output_dir,
    ]
    entrypoint = ['python3', '-m', 'demo.examples_gen']

    return kfp.dsl.ContainerOp(
        name='BigQuery examples generator',
        image=BASE_IMAGE,
        command=entrypoint,
        arguments=module_args,
    )


@kfp.dsl.component
def tfrecord_stats_gen_comp(
    project_id: str,
    region: str,
    temp_dir: str,
    data_location: str,
    output_dir: str,
    runner: str='DirectRunner'
) -> {'stats_output_path': str,
      'stats_viz_output_path': str,
      'inferred_schema_output_path': str}:
    """Create a pipeline step that runs ``demo.tfrecord_stats_gen`` inside ``BASE_IMAGE``.

    Each parameter is forwarded unchanged as a command-line flag of the module.

    Args:
        project_id: Value for ``--project``.
        region: Value for ``--region``.
        temp_dir: Value for ``--temp_location``.
        data_location: Value for ``--data_location``.
        output_dir: Value for ``--output_dir``.
        runner: Value for ``--runner``; defaults to ``'DirectRunner'``.

    Returns:
        A ``kfp.dsl.ContainerOp`` named ``TFRecord statistics generator`` with
        three file outputs: ``stats_output_path``, ``stats_viz_output_path``
        and ``inferred_schema_output_path``. The return annotation lists all
        three so the declared outputs match ``file_outputs`` below.
    """
    return kfp.dsl.ContainerOp(
        name='TFRecord statistics generator',
        image=BASE_IMAGE,
        command=['python3', '-m', 'demo.tfrecord_stats_gen'],
        arguments=[
            '--runner', runner,
            '--project', project_id,
            '--region', region,
            '--temp_location', temp_dir,
            '--data_location', data_location,
            '--output_dir', output_dir,
        ],
        # Files inside the container whose contents become the step's outputs.
        file_outputs={
            'stats_output_path': '/tmp/stats_output_path.txt',
            'stats_viz_output_path': '/tmp/stats_viz_output_path.txt',
            'inferred_schema_output_path': '/tmp/inferred_schema_output_path.txt'
        },
        # Extra artifact collected from the container under the
        # 'mlpipeline-ui-metadata' name.
        output_artifact_paths={
            'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'
        }
    )

In [6]:
@kfp.dsl.component
def chicago_taxi_trips_examples_gen_comp(
    project_id: str,
    region: str,
    temp_dir: str,
    output_dir: str,
    runner: str='DirectRunner'
):
    """Create the examples-generation step for the Chicago taxi trips dataset.

    Thin wrapper around ``big_query_examples_gen_comp`` that supplies a fixed
    query against ``bigquery-public-data.chicago_taxi_trips.taxi_trips``
    (capped at 10000 rows) and forwards every other argument unchanged.

    Args:
        project_id: Forwarded as ``project_id``.
        region: Forwarded as ``region``.
        temp_dir: Forwarded as ``temp_dir``.
        output_dir: Forwarded as ``output_dir``.
        runner: Forwarded as ``runner``; defaults to ``'DirectRunner'``.

    Returns:
        Whatever ``big_query_examples_gen_comp`` returns for these arguments.
    """
    taxi_trips_sql = """
    SELECT
        pickup_community_area,
        fare,
        EXTRACT(MONTH FROM trip_start_timestamp) AS trip_start_month,
        EXTRACT(HOUR FROM trip_start_timestamp) AS trip_start_hour,
        EXTRACT(DAYOFWEEK FROM trip_start_timestamp) AS trip_start_day,
        UNIX_SECONDS(trip_start_timestamp) AS trip_start_timestamp,
        pickup_latitude,
        pickup_longitude,
        dropoff_latitude,
        dropoff_longitude,
        trip_miles,
        pickup_census_tract,
        dropoff_census_tract,
        payment_type,
        company,
        trip_seconds,
        dropoff_community_area,
        tips
    FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
    LIMIT 10000
    """

    forwarded = dict(
        project_id=project_id,
        region=region,
        temp_dir=temp_dir,
        output_dir=output_dir,
        runner=runner,
    )
    return big_query_examples_gen_comp(query=taxi_trips_sql, **forwarded)

In [7]:
@kfp.dsl.pipeline(
    name='Chicago taxi trips',
    description='Demo of TFX and Kubeflow Pipelines on GCP using the Chicago taxi trips dataset.'
)
def chicago_taxi_trips_pipeline(
    project_id: str,
    region: str,
    working_dir: str
):
    """Wire the examples-generation and statistics steps into one pipeline.

    Args:
        project_id: Forwarded to both steps as ``project_id``.
        region: Forwarded to both steps as ``region``.
        working_dir: Base location; ``temp``, ``examples`` and ``stats``
            sub-paths are created beneath ``<working_dir>/{{workflow.name}}``.
    """
    # These are URIs rather than local filesystem paths, so they are joined
    # with posixpath to always get '/' separators regardless of the host OS.
    run_dir = posixpath.join(str(working_dir), '{{workflow.name}}')
    temp_dir = posixpath.join(run_dir, 'temp')
    examples_output_dir = posixpath.join(run_dir, 'examples')
    stats_output_dir = posixpath.join(run_dir, 'stats')

    examples_gen = chicago_taxi_trips_examples_gen_comp(
        project_id=project_id,
        region=region,
        temp_dir=temp_dir,
        output_dir=examples_output_dir
    ).apply(kfp.gcp.use_gcp_secret('user-gcp-sa'))

    tfrecord_stats_gen = tfrecord_stats_gen_comp(
        project_id=project_id,
        region=region,
        temp_dir=temp_dir,
        data_location=posixpath.join(examples_output_dir, 'part-*.tfrecord.gz'),
        output_dir=stats_output_dir
    ).apply(kfp.gcp.use_gcp_secret('user-gcp-sa'))

    # The statistics step receives the examples location as a plain path string,
    # not as an output of `examples_gen`, so the ordering is declared explicitly.
    tfrecord_stats_gen.after(examples_gen)

    # Echo each of the statistics step's file outputs.
    print_comp(tfrecord_stats_gen.outputs['inferred_schema_output_path'])
    print_comp(tfrecord_stats_gen.outputs['stats_output_path'])
    print_comp(tfrecord_stats_gen.outputs['stats_viz_output_path'])

In [9]:
# Submit a run of the pipeline. The working directory is a GCS URI, so it is
# built with an explicit '/' separator instead of the OS-specific os.path.join.
kfp.Client().create_run_from_pipeline_func(chicago_taxi_trips_pipeline, arguments={
    'project_id': PROJECT_ID,
    'region': REGION,
    'working_dir': f'{BASE_GCS_PATH}/chicago_taxi_trips',
})

<kfp._client.Client.create_run_from_pipeline_package.<locals>.RunPipelineResult at 0x7f2d609b7e80>