In [1]:
import os
import time

import pandas as pd

In [97]:
# Setting constants, properties
#

REGION = "us-central1"

PROJECT_ID = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID[0]

ARTIFACT_STORE = f"gs://{PROJECT_ID}-spectrain-artifact-store"

DATA_ROOT = f"{ARTIFACT_STORE}/data"
JOB_DIR_ROOT = f"{ARTIFACT_STORE}/jobs"
TRAINING_FILE_PATH = f"{DATA_ROOT}/training/dataset.csv"
VALIDATION_FILE_PATH = f"{DATA_ROOT}/validation/dataset.csv"
TEST_FILE_PATH = f"{DATA_ROOT}/test/dataset.csv"
API_ENDPOINT = f"{REGION}-aiplatform.googleapis.com"

In [87]:
# This checks if spectrain artifact store is present, creates otherwise
# to see this go to cloud storage bucket and there should be a bucket
# qwiklabs-asl-00-c812c3b423f2-spectrain-artifact-store
#
!gsutil ls | grep ^{ARTIFACT_STORE}/$ || gsutil mb -l {REGION} {ARTIFACT_STORE}

gs://qwiklabs-asl-00-c812c3b423f2-spectrain-artifact-store/


In [88]:
os.environ["JOB_DIR_ROOT"] = JOB_DIR_ROOT
os.environ["TRAINING_FILE_PATH"] = TRAINING_FILE_PATH
os.environ["VALIDATION_FILE_PATH"] = VALIDATION_FILE_PATH
os.environ["PROJECT_ID"] = PROJECT_ID
os.environ["REGION"] = REGION

In [89]:
%%bash

# This loads the source csv file into BQ table to 
# get it ready for training & validation split 

DATASET_LOCATION=US
DATASET_ID=kidney_tx_dataset
TABLE_ID=kidneytx
DATA_SOURCE=gs://spectrain/Kidney_TX_Data/Kidney_TX_data.csv
SCHEMA=Patient_Sample_ID:STRING,\
Patient_ID:STRING,\
Nmr_sample_ID:STRING,\
Patient_Age_at_Biopsy:INTEGER,\
Patient_Age_at_TX:INTEGER,\
Case:INTEGER,\
Sex:STRING,\
serum_creatinine:FLOAT64,\
hippurate:FLOAT64,\
phenylacetylglutamine:FLOAT64,\
trigonellin:FLOAT64,\
urea:FLOAT64,\
alanine:FLOAT64,\
citrate:FLOAT64,\
dimethylamine:FLOAT64,\
lactate:FLOAT64,\
Biopsy_reason:STRING,\
Banff1_y:BOOL,\
Banff2_y:BOOL,\
Banff2Act:BOOL,\
Banff2Chron:BOOL,\
Banff2C4d:BOOL,\
Banff2C4dNoRej:BOOL,\
Banff3_y:BOOL,\
Banff4_y:BOOL,\
Banff4IA:BOOL,\
Banff4IB:BOOL,\
Banff4IIA:BOOL,\
Banff4IIB:BOOL,\
Banff4III:BOOL,\
Banff4Chron:BOOL,\
Banff5_y:BOOL,\
Banff5I:BOOL,\
Banff5II:BOOL,\
Banff5III:BOOL,\
Banff6_y:BOOL,\
LS_i:INTEGER,\
LS_t:INTEGER,\
LS_v:INTEGER,\
LS_g:INTEGER,\
LS_ptc:INTEGER,\
LS_ci:INTEGER,\
LS_ct:INTEGER,\
LS_cv:INTEGER,\
LS_cg:INTEGER,\
LS_mm:INTEGER,\
LS_ah:INTEGER,\
LS_ti:INTEGER,\
LS_i_IFTA:INTEGER,\
LS_IFTA:INTEGER,\
Biopsy_BKV:BOOL,\
Diabetes:BOOL,\
Hypertension:BOOL,\
UA_Pro:BOOL,\
UA_Hb:BOOL,\
Source:STRING,\
Spectrum_file:STRING

# Remove an old copy, if exists
bq --location=$DATASET_LOCATION --project_id=$PROJECT_ID rm -r -f $DATASET_ID

# Recreate the datset afresh
bq --location=$DATASET_LOCATION --project_id=$PROJECT_ID mk --dataset $DATASET_ID

# Load the data from the csv file
bq --project_id=$PROJECT_ID --dataset_id=$DATASET_ID load \
--source_format=CSV \
--skip_leading_rows=1 \
--replace \
$TABLE_ID \
$DATA_SOURCE \
$SCHEMA

Dataset 'qwiklabs-asl-00-c812c3b423f2:kidney_tx_dataset' successfully created.


W0613 06:53:57.976574 139646178285376 bigquery_client.py:731] There is no apilog flag so non-critical logging is disabled.
W0613 06:53:59.720565 140136463415104 bigquery_client.py:731] There is no apilog flag so non-critical logging is disabled.
W0613 06:54:01.852635 140422348556096 bigquery_client.py:731] There is no apilog flag so non-critical logging is disabled.
Waiting on bqjob_r175256c56508d618_00000188b3886a7e_1 ... (1s) Current status: DONE   


In [90]:
%%bigquery
SELECT *
FROM `kidney_tx_dataset.kidneytx`

Query complete after 0.00s: 100%|██████████| 2/2 [00:00<00:00, 689.34query/s]                         
Downloading: 100%|██████████| 1474/1474 [00:01<00:00, 832.27rows/s]


Unnamed: 0,Patient_Sample_ID,Patient_ID,Nmr_sample_ID,Patient_Age_at_Biopsy,Patient_Age_at_TX,Case,Sex,serum_creatinine,hippurate,phenylacetylglutamine,...,LS_ti,LS_i_IFTA,LS_IFTA,Biopsy_BKV,Diabetes,Hypertension,UA_Pro,UA_Hb,Source,Spectrum_file
0,YB)ecoX1atQDG1nxP^Z*#yfa2,76ScIFYYUt^x8YKu5w6*0*ZmY,YB)ecoX1atQDG1nxP^Z*#yfa2,56,54,0,male,1.48000,0.109185,0.136410,...,,,,False,False,True,False,False,numares,output_NormalizationTool_spectrum_zgpr30-urine...
1,AuumE0FJhj^b#jGya*ULAt2,gd2440dWG3MaafL9a4BoTEYHs,AuumE0FJhj^b#jGya*ULAt2,52,44,0,female,2.36000,0.048538,0.121817,...,,,,False,True,True,False,False,numares,output_NormalizationTool_spectrum_zgpr30-urine...
2,VcO95f@pi0ZmgN4qFXeQA9tHg,*lgfIZ$7gzpjJU7lSYHiLM9My,VcO95f@pi0ZmgN4qFXeQA9tHg,54,53,1,female,2.71652,0.032343,0.065281,...,0.0,,,False,False,True,True,True,numares,output_NormalizationTool_spectrum_zgpr30-urine...
3,8t2*c9bM9l5nhF#N4y9rGSW4K,B(ocEgL#hGpPR6*duK17kyYVo,8t2*c9bM9l5nhF#N4y9rGSW4K,37,26,1,male,2.98433,0.062100,0.071755,...,3.0,,,False,False,True,True,False,numares,output_NormalizationTool_spectrum_zgpr30-urine...
4,DetnzZqt$UQa(jWSQ2J*MOZQt,bhdLeIiEnep6TPu8qeostZ8o(,DetnzZqt$UQa(jWSQ2J*MOZQt,55,55,0,male,2.15000,0.027913,0.039593,...,0.0,,,True,False,False,,,mayo,output_NormalizationTool_spectrum_zgpr30-urine...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1469,oL#Lxy@k#PIoW*Jl8I0Y37pr$,cIVL8e0*hGuL5AI4I7z4pZim6,oL#Lxy@k#PIoW*Jl8I0Y37pr$,52,44,1,female,5.76000,0.055037,0.135501,...,3.0,3.0,3.0,False,False,True,True,False,numares,output_NormalizationTool_spectrum_zgpr30-urine...
1470,hqoHdM9eOPbhRRPAjbZJ^a*jk,0(1r9AP4@zw3HyDwnKbzHpMZn,hqoHdM9eOPbhRRPAjbZJ^a*jk,65,64,1,female,1.65000,0.081151,0.100113,...,3.0,3.0,3.0,True,False,True,True,False,numares,output_NormalizationTool_spectrum_zgpr30-urine...
1471,Xgw*jgSPW7Vj^dA3DkhPUrCy1,F1QPuFO7ATXcqI@RO$Psj8MIH,Xgw*jgSPW7Vj^dA3DkhPUrCy1,52,51,1,female,1.04000,0.285604,0.032253,...,3.0,3.0,3.0,False,False,True,False,False,numares,output_NormalizationTool_spectrum_zgpr30-urine...
1472,VCvdHASgOPQB^F1IxIiB1CDzq,4VxEFeLvNP7vNyiIw*XFeYcfa,VCvdHASgOPQB^F1IxIiB1CDzq,54,51,1,male,4.13000,0.058046,0.063102,...,3.0,3.0,,False,False,True,True,False,numares,output_NormalizationTool_spectrum_zgpr30-urine...


In [19]:
# 80% Training

!bq query \
-n 0 \
--destination_table kidney_tx_dataset.training \
--replace \
--use_legacy_sql=false \
'SELECT * \
FROM `kidney_tx_dataset.kidneytx` AS kidney \
WHERE \
MOD(ABS(FARM_FINGERPRINT(TO_JSON_STRING(kidney))), 10) IN (0, 1, 2, 3, 4, 5, 6, 7)' 

W0612 20:16:21.837682 139953877661504 bigquery_client.py:731] There is no apilog flag so non-critical logging is disabled.
Waiting on bqjob_r5d7941199bcb74ec_00000188b1409d4e_1 ... (1s) Current status: DONE   


In [20]:
!bq extract \
--destination_format CSV \
kidney_tx_dataset.training \
$TRAINING_FILE_PATH

W0612 20:16:26.454799 139785366738752 bigquery_client.py:731] There is no apilog flag so non-critical logging is disabled.
Waiting on bqjob_r6f21918c9d779f8a_00000188b140af57_1 ... (0s) Current status: DONE   


In [24]:
# 10% Validation

!bq query \
-n 0 \
--destination_table kidney_tx_dataset.validation \
--replace \
--use_legacy_sql=false \
'SELECT * \
FROM `kidney_tx_dataset.kidneytx` AS kidney \
WHERE \
MOD(ABS(FARM_FINGERPRINT(TO_JSON_STRING(kidney))), 10) IN (8)' 

W0612 20:20:22.968087 140656345532224 bigquery_client.py:731] There is no apilog flag so non-critical logging is disabled.
Waiting on bqjob_r796b6c862757a81_00000188b1444b39_1 ... (1s) Current status: DONE   


In [25]:
!bq extract \
--destination_format CSV \
kidney_tx_dataset.validation \
$VALIDATION_FILE_PATH

W0612 20:20:29.893918 140010312943424 bigquery_client.py:731] There is no apilog flag so non-critical logging is disabled.
Waiting on bqjob_r36ccb5530623fe72_00000188b1446647_1 ... (0s) Current status: DONE   


In [30]:
# 10% Test

!bq query \
-n 0 \
--destination_table kidney_tx_dataset.test \
--replace \
--use_legacy_sql=false \
'SELECT * \
FROM `kidney_tx_dataset.kidneytx` AS kidney \
WHERE \
MOD(ABS(FARM_FINGERPRINT(TO_JSON_STRING(kidney))), 10) IN (9)' 

W0612 20:24:18.652364 140450821646144 bigquery_client.py:731] There is no apilog flag so non-critical logging is disabled.
Waiting on bqjob_r4303df2142f30f1c_00000188b147e3dd_1 ... (1s) Current status: DONE   


In [31]:
!bq extract \
--destination_format CSV \
kidney_tx_dataset.test \
$TEST_FILE_PATH

W0612 20:24:24.520304 140502015493952 bigquery_client.py:731] There is no apilog flag so non-critical logging is disabled.
Waiting on bqjob_r51799517980d7866_00000188b147fac9_1 ... (0s) Current status: DONE   


In [138]:
%%writefile ./pogue_python/pipeline_vertex_automl.py
# ADDED BY ALEX
# Copyright 2021 Google LLC

# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at

# https://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS"
# BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

"""Kubeflow Pipeline."""

import os

from google_cloud_pipeline_components.aiplatform import (
    AutoMLTabularTrainingJobRunOp,
    EndpointCreateOp,
    ModelDeployOp,
    TabularDatasetCreateOp,
)
from kfp.v2 import dsl

PIPELINE_ROOT = os.getenv("PIPELINE_ROOT")
PROJECT = os.getenv("PROJECT")
DATASET_SOURCE = os.getenv("DATASET_SOURCE")
PIPELINE_NAME = os.getenv("PIPELINE_NAME", "covertype")
DISPLAY_NAME = os.getenv("MODEL_DISPLAY_NAME", PIPELINE_NAME)
TARGET_COLUMN = os.getenv("TARGET_COLUMN", "Cover_Type")
SERVING_MACHINE_TYPE = os.getenv("SERVING_MACHINE_TYPE", "n1-standard-16")


@dsl.pipeline(
    name=f"{PIPELINE_NAME}-vertex-automl-pipeline",
    description=f"AutoML Vertex Pipeline for {PIPELINE_NAME}",
    pipeline_root=PIPELINE_ROOT,
)
def create_pipeline():
    # DATASET_SOURCE = 'bq://qwiklabs-asl-00-c812c3b423f2.kidney_tx_dataset.kidneytx'

    dataset_create_task = TabularDatasetCreateOp(
        display_name=DISPLAY_NAME,
        bq_source=DATASET_SOURCE,
        project=PROJECT,
    )

    automl_training_task = AutoMLTabularTrainingJobRunOp(
        project=PROJECT,
        display_name=DISPLAY_NAME,
        optimization_prediction_type="classification",
        dataset=dataset_create_task.outputs["dataset"],
        target_column=TARGET_COLUMN,
    )

    endpoint_create_task = EndpointCreateOp(
        project=PROJECT,
        display_name=DISPLAY_NAME,
    )

    model_deploy_task = ModelDeployOp(  # pylint: disable=unused-variable
        model=automl_training_task.outputs["model"],
        endpoint=endpoint_create_task.outputs["endpoint"],
        deployed_model_display_name=DISPLAY_NAME,
        dedicated_resources_machine_type=SERVING_MACHINE_TYPE,
        dedicated_resources_min_replica_count=1,
        dedicated_resources_max_replica_count=1,
    )

Writing ./pogue_python/pipeline_vertex_automl.py


In [139]:
# ADDED BY ALEX
PIPELINE_ROOT=f"{ARTIFACT_STORE}/pipeline"
DATASET_SOURCE=f"bq://{PROJECT_ID}.kidney_tx_dataset.kidneytx"
PIPELINE_NAME="kidneytx_2"
TARGET_COLUMN="Case"

%env PIPELINE_ROOT={PIPELINE_ROOT}
%env PROJECT={PROJECT_ID}
%env REGION={REGION}
%env DATASET_SOURCE={DATASET_SOURCE}
%env TARGET_COLUMN={TARGET_COLUMN}
%env PIPELINE_NAME={PIPELINE_NAME}

env: PIPELINE_ROOT=gs://qwiklabs-asl-00-c812c3b423f2-spectrain-artifact-store/pipeline
env: PROJECT=qwiklabs-asl-00-c812c3b423f2
env: REGION=us-central1
env: DATASET_SOURCE=bq://qwiklabs-asl-00-c812c3b423f2.kidney_tx_dataset.kidneytx
env: TARGET_COLUMN=Case
env: PIPELINE_NAME=kidneytx_2


In [140]:
PIPELINE_JSON = "spectrain_automl_vertex_pipeline.json"

In [141]:
!gsutil ls | grep ^{ARTIFACT_STORE}/$ || gsutil mb -l {REGION} {ARTIFACT_STORE}

gs://qwiklabs-asl-00-c812c3b423f2-spectrain-artifact-store/


In [142]:
from kfp.v2 import compiler
from pogue_python.pipeline_vertex_automl import create_pipeline

print(os.getenv("DATASET_SOURCE"))

compiler.Compiler().compile(
    pipeline_func=create_pipeline, 
    package_path=PIPELINE_JSON,
)
print(os.getenv("DATASET_SOURCE"))

bq://qwiklabs-asl-00-c812c3b423f2.kidney_tx_dataset.kidneytx
bq://qwiklabs-asl-00-c812c3b423f2.kidney_tx_dataset.kidneytx


In [143]:
!cat {PIPELINE_JSON}

{
  "pipelineSpec": {
    "components": {
      "comp-automl-tabular-training-job": {
        "executorLabel": "exec-automl-tabular-training-job",
        "inputDefinitions": {
          "artifacts": {
            "dataset": {
              "artifactType": {
                "schemaTitle": "google.VertexDataset",
                "schemaVersion": "0.0.1"
              }
            }
          },
          "parameters": {
            "disable_early_stopping": {
              "type": "STRING"
            },
            "display_name": {
              "type": "STRING"
            },
            "export_evaluated_data_items": {
              "type": "STRING"
            },
            "labels": {
              "type": "STRING"
            },
            "location": {
              "type": "STRING"
            },
            "optimization_prediction_type": {
              "type": "STRING"
            },
            "project": {
              "type": "STRING"
            },
            "targe

In [65]:
print(os.getenv("DATASET_SOURCE"))

bq://qwiklabs-asl-00-c812c3b423f2.kidney_tx_dataset.kidneytx


In [64]:
from google.cloud import aiplatform

aiplatform.init(project=PROJECT_ID, location=REGION)

pipeline = aiplatform.PipelineJob(
    display_name="automl_spectrain_kfp_pipeline",
    template_path=PIPELINE_JSON,
    enable_caching=True,
)

pipeline.run()

Creating PipelineJob
PipelineJob created. Resource name: projects/469700469475/locations/us-central1/pipelineJobs/covertype-vertex-automl-pipeline-20230613063551
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/469700469475/locations/us-central1/pipelineJobs/covertype-vertex-automl-pipeline-20230613063551')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/covertype-vertex-automl-pipeline-20230613063551?project=469700469475
PipelineJob projects/469700469475/locations/us-central1/pipelineJobs/covertype-vertex-automl-pipeline-20230613063551 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/469700469475/locations/us-central1/pipelineJobs/covertype-vertex-automl-pipeline-20230613063551 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/469700469475/locations/us-central1/pipelineJobs/covertype-vertex-automl-pipeline-20230613063551 current state:
Pipeli

RuntimeError: Job failed with:
code: 9
message: "The DAG failed because some tasks failed. The failed tasks are: [tabular-dataset-create].; Job (project_id = qwiklabs-asl-00-c812c3b423f2, job_id = 3834232891299069952) is failed due to the above error.; Failed to handle the job: {project_number = 469700469475, job_id = 3834232891299069952}"
