In [None]:
import os
import time
import logging
import kfp
from google.cloud import bigquery, storage
from google.cloud import aiplatform as vertex_ai
from google_cloud_pipeline_components.experimental.custom_job import utils
from kfp.v2 import compiler, dsl
from kfp.v2.dsl import component
from typing import NamedTuple
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output, Metrics,
                        OutputPath, component)

from google_cloud_pipeline_components.experimental.custom_job import utils

In [None]:
# Configure the root logger so INFO-level messages and above are shown.
logging.basicConfig(level=logging.INFO)

## Load Params and Resource Config

In [None]:
from config.gcp_resource import *

In [None]:
if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    
if SERVICE_ACCOUNT == "" or SERVICE_ACCOUNT is None or SERVICE_ACCOUNT == "[your-service-account]":
    # Get your GCP project id from gcloud
    shell_output = !gcloud config list --format 'value(core.account)' 2>/dev/null
    SERVICE_ACCOUNT = shell_output[0]
    
if GCS_BUCKET == "" or GCS_BUCKET is None or GCS_BUCKET == "[your-bucket-name]":
    # Get your bucket name to GCP projet id
    GCS_BUCKET = PROJECT_ID
    # Try to create the bucket if it doesn'exists
    ! gsutil mb -l $REGION gs://$BUCKET
    print("")
    
!gcloud config set project {PROJECT_ID}

Updated property [core/project].


In [None]:
# Echo the compute, image and storage settings loaded from the config module.
print(f"Train machine type {TRAIN_COMPUTE}")
print(f"Deploy machine type {DEPLOY_COMPUTE}")
print(f"Deployment: {DEPLOY_IMAGE}")
print(f"PIPELINE_ROOT: {PIPELINE_ROOT}")
print(f"MODULE_ROOT: {MODULE_ROOT}")
print(f"DATA_ROOT: {DATA_ROOT}")
print(f"SERVING_MODEL_DIR: {SERVING_MODEL_DIR}")

Train machine type n1-standard-4
Deploy machine type n1-standard-4
Deployment: us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-8:latest
PIPELINE_ROOT: gs://mle_airbus_dataset/airbusmlepipeline/pipeline_root
MODULE_ROOT: gs://mle_airbus_dataset/airbusmlepipeline/pipeline_module
DATA_ROOT: gs://mle_airbus_dataset/airbusmlepipeline/data
SERVING_MODEL_DIR: gs://mle_airbus_dataset/airbusmlepipeline/serving_model


## Data Ingest Component

In [14]:
%%writefile ./src/dataset/ingest_component.py

import argparse
import logging
import os
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import requests
import tensorflow as tf
from google.cloud import bigquery, storage
from google.oauth2 import service_account
from skimage.io import imread
from skimage.morphology import label
from skimage.segmentation import mark_boundaries
from skimage.segmentation import mark_boundaries
from skimage.util import montage
from skimage.util import montage as montage2d
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

from src.utils.dataset import *
from src.utils.common import *
    

logging.basicConfig(level=logging.INFO)


def parse_args(argv=None):
    """Parse the command-line options of the ingest component.

    Args:
        argv: Optional list of argument strings. ``None`` lets argparse read
            the process command line.

    Returns:
        argparse.Namespace with ``project_id``, ``gcs_bucket``, ``region``,
        ``table_bq``, ``train_output``, ``test_output`` and ``n_truncate``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--project_id', dest='project_id',
                        default='mle-airbus-detection-smu', type=str,
                        help='Project id.')
    parser.add_argument('--gcs_bucket', dest='gcs_bucket',
                        default='mle_airbus_dataset', type=str,
                        help='GCS bucket url.')
    parser.add_argument('--region', dest='region',
                        default='asia-east1', type=str,
                        help='Project location/region.')
    parser.add_argument('--table-bq', dest='table_bq',
                        default='mle-airbus-detection-smu.airbus_data.label_data', type=str,
                        help='BigQuery big table for source data.')
    # NOTE(review): train_output / test_output are parsed but not consumed by
    # the script; they are kept so existing invocations keep working.
    parser.add_argument('--train-output', dest='train_output',
                        default='train.txt', type=str,
                        help='Filename of training parquet file')
    parser.add_argument('--test-output', dest='test_output',
                        default='test.txt', type=str,
                        help='Filename of test parquet file')
    parser.add_argument('--n-truncate', dest='n_truncate',
                        default=20000, type=int,
                        help='Maximum number of source rows kept before preprocessing.')
    return parser.parse_args(argv)


def count_ships(encoded_pixels):
    """Count the non-empty ';'-separated segments in a joined EncodedPixels string.

    Counting the separators themselves would report k-1 for an image with k
    masks (and 0 for a single mask), so the segments are counted instead.

    Args:
        encoded_pixels: String of per-mask entries joined with ';'. An empty
            string means the image has no mask.

    Returns:
        Number of non-empty segments as an int.
    """
    return sum(1 for segment in encoded_pixels.split(';') if segment)


def balance_by_ship_count(frame, n_samples):
    """Resample ``frame`` so every distinct ``ships`` value contributes ``n_samples`` rows.

    Args:
        frame: DataFrame with a ``ships`` column.
        n_samples: Number of rows drawn (via ``sklearn.utils.resample``) per
            distinct ``ships`` value.

    Returns:
        A new DataFrame with a fresh 0..n-1 index; empty if ``frame`` has no rows.
    """
    groups = [
        resample(frame[frame['ships'] == ship_num], n_samples=n_samples)
        for ship_num in frame['ships'].unique()
    ]
    if not groups:
        # pd.concat rejects an empty list; mirror the empty-frame result instead.
        return pd.DataFrame()
    # Build the result in one concat: DataFrame.append was removed in pandas 2.0.
    return pd.concat(groups, ignore_index=True)


def main(argv=None):
    """Read mask rows from BigQuery, build balanced train/validation sets and upload them.

    The two parquet files are written to the working directory and then
    uploaded to the GCS bucket under a ``<YYYYmmdd_HHMM>/`` prefix.

    Args:
        argv: Optional list of argument strings forwarded to ``parse_args``.

    Returns:
        Tuple ``(train_uri, test_uri)`` with the ``gs://`` locations of the
        uploaded files.

    Raises:
        Exception: Any error from client creation or upload is logged and re-raised.
    """
    args = parse_args(argv)
    project_id = args.project_id
    gcs_bucket = args.gcs_bucket
    region = args.region
    table_bq = args.table_bq

    try:
        bucket = storage.Client().bucket(gcs_bucket)
        bqclient = bigquery.Client(project=project_id, location=region)
    except Exception:
        # Without the clients nothing below can work, so fail the step loudly.
        logging.exception(
            "Could not create the BigQuery / GCS clients for project %s.", project_id)
        raise
    logging.info(
        "Connection to BigQuery table %s and GCS Bucket %s successfully.",
        table_bq, gcs_bucket)

    # Download the table.
    table = bigquery.TableReference.from_string(table_bq)
    rows = bqclient.list_rows(table)
    masks = rows.to_dataframe(create_bqstorage_client=True)

    # Preprocess the RLE data: truncate, blank out missing masks, then collapse
    # to one row per image with all of its masks joined by ';'.
    masks = masks[:args.n_truncate].replace(to_replace=[None], value='')
    masks = (
        masks.groupby(['ImageId'])['EncodedPixels']
        .apply(lambda rles: ';'.join(rles))
        .reset_index()
    )

    masks['ships'] = masks['EncodedPixels'].map(count_ships)
    unique_img_ids = masks.groupby('ImageId').agg({'ships': 'sum'}).reset_index()
    unique_img_ids['has_ship'] = unique_img_ids['ships'].map(lambda n: 1.0 if n > 0 else 0.0)
    unique_img_ids['has_ship_vec'] = unique_img_ids['has_ship'].map(lambda flag: [flag])
    # `ships` comes back through the merge with the per-image table below.
    masks = masks.drop(columns=['ships'])
    masks['EncodedPixels'] = masks['EncodedPixels'].apply(merge_rle_encode)

    train_ids, valid_ids = train_test_split(
        unique_img_ids,
        test_size=0.3,
        stratify=unique_img_ids['ships'],
    )
    train_df = pd.merge(masks, train_ids)
    valid_df = pd.merge(masks, valid_ids)
    logging.info("%d training masks", train_df.shape[0])
    logging.info("%d validation masks", valid_df.shape[0])

    # NOTE(review): N_SAMPLE is not defined in this script; presumably it is
    # provided by one of the `src.utils` star imports - confirm.
    train_df_balanced = balance_by_ship_count(train_df, N_SAMPLE)
    valid_df_balanced = balance_by_ship_count(valid_df, N_SAMPLE // 10)

    train_df_balanced.to_parquet("train.parquet")
    valid_df_balanced.to_parquet("test.parquet")

    # Upload under a timestamped prefix and report exactly those locations, so
    # the returned URIs always point at the objects that were written.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    uris = []
    try:
        for filename in ("train.parquet", "test.parquet"):
            blob_name = f"{timestamp}/{filename}"
            bucket.blob(blob_name).upload_from_filename(filename)
            uris.append(f"gs://{gcs_bucket}/{blob_name}")
    except Exception:
        logging.exception("File upload to GCS Bucket failed!")
        raise
    logging.info("File uploaded to GCS bucket successfully.")

    return tuple(uris)


if __name__ == "__main__":
    main()

Writing ./src/dataset/ingest_component.py


FileNotFoundError: [Errno 2] No such file or directory: './src/dataset/ingest_component.py'

In [13]:
from pathlib import Path

# Scratch check: express "./build/test" relative to "./" and display it as a string.
str(Path("./build/test").relative_to("./"))



'build/test'