# Model Training

In [3]:
import os
import time
import logging
import kfp
from google.cloud import bigquery, storage
from google.cloud import aiplatform as vertex_ai
from google_cloud_pipeline_components.experimental.custom_job import utils
from kfp.v2 import compiler, dsl
from kfp.v2.dsl import component
from typing import NamedTuple
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output, Metrics,
                        OutputPath, component)

from google_cloud_pipeline_components.experimental.custom_job import utils

## Setup

In [4]:
PROJECT = 'mle-airbus-detection-smu' # Change to your project id.
PROJECT_NUMBER = '484894607141'
REGION = 'asia-east1' # Change to your region.
BUCKET = 'mle_airbus_dataset' # Change to your bucket name.
SERVICE_ACCOUNT = "service-account-for-mle@mle-airbus-detection-smu.iam.gserviceaccount.com"

if PROJECT == "" or PROJECT is None or PROJECT == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT = shell_output[0]
    
if SERVICE_ACCOUNT == "" or SERVICE_ACCOUNT is None or SERVICE_ACCOUNT == "[your-service-account]":
    # Get the active gcloud account to use as the service account
    shell_output = !gcloud config list --format 'value(core.account)' 2>/dev/null
    SERVICE_ACCOUNT = shell_output[0]
    
if BUCKET == "" or BUCKET is None or BUCKET == "[your-bucket-name]":
    # Default the bucket name to the GCP project id
    BUCKET = PROJECT
    # Try to create the bucket if it doesn't exist
    ! gsutil mb -l $REGION gs://$BUCKET
    print("")
    
PARENT = f"projects/{PROJECT}/locations/{REGION}"

PIPELINE_NAME = 'airbusmlepipeline'

# Path to various pipeline artifact.
PIPELINE_ROOT = 'gs://{}/{}/pipeline_root'.format(
    BUCKET, PIPELINE_NAME)

# Paths for users' Python module.
MODULE_ROOT = 'gs://{}/{}/pipeline_module'.format(
    BUCKET, PIPELINE_NAME)

# Paths for users' data.
DATA_ROOT = 'gs://{}/{}/data'.format(BUCKET, PIPELINE_NAME)

# This is the path where your model will be pushed for serving.
SERVING_MODEL_DIR = 'gs://{}/{}/serving_model'.format(
    BUCKET, PIPELINE_NAME)

print('PIPELINE_ROOT: {}'.format(PIPELINE_ROOT))
print('MODULE_ROOT: {}'.format(MODULE_ROOT))
print('DATA_ROOT: {}'.format(DATA_ROOT))
print('SERVING_MODEL_DIR: {}'.format(SERVING_MODEL_DIR))

BQ_DATASET_NAME = 'mle-airbus-detection-smu.airbus_label_dataset' # Change to your BQ dataset name.
BQ_TABLE_NAME = 'airbus_label'
# Same value as REGION; the stray leading space in the original value has been removed.
BQ_LOCATION = 'asia-east1'
    
print("Project ID:", PROJECT)
print("Region:", REGION)
print("Bucket name:", BUCKET)
print("Service Account:", SERVICE_ACCOUNT)
print("Vertex API Parent URI:", PARENT)

! gsutil ls -al "gs://"$BUCKET


storage_client = storage.Client(project=PROJECT)
bucket = storage_client.get_bucket(BUCKET)
storage_path = f"gs://{BUCKET}/train_v2/"
print (storage_path)

VERSION = 'v01'
DATASET_DISPLAY_NAME = 'airbus-ship-dataset-display'
MODEL_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier-{VERSION}'

WORKSPACE = f'gs://{BUCKET}/{DATASET_DISPLAY_NAME}'
EXPERIMENT_ARTIFACTS_DIR = os.path.join(WORKSPACE, 'experiments')

TENSORBOARD_DISPLAY_NAME = f'tb-{DATASET_DISPLAY_NAME}'
EXPERIMENT_NAME = f'{MODEL_DISPLAY_NAME}'

BATCH_SIZE = 16
EDGE_CROP = 16
NB_EPOCHS = 5
GAUSSIAN_NOISE = 0.1
UPSAMPLE_MODE = 'SIMPLE'
# downsampling inside the network
NET_SCALING = None
# downsampling in preprocessing
IMG_SCALING = (1, 1)
# number of validation images to use
VALID_IMG_COUNT = 400
# maximum number of steps_per_epoch in training
MAX_TRAIN_STEPS = 200
AUGMENT_BRIGHTNESS = False
N_SAMPLE = 10
IMG_SHAPE = (768, 768)

PIPELINE_ROOT: gs://mle_airbus_dataset/airbusmlepipeline/pipeline_root
MODULE_ROOT: gs://mle_airbus_dataset/airbusmlepipeline/pipeline_module
DATA_ROOT: gs://mle_airbus_dataset/airbusmlepipeline/data
SERVING_MODEL_DIR: gs://mle_airbus_dataset/airbusmlepipeline/serving_model
Project ID: mle-airbus-detection-smu
Region: asia-east1
Bucket name: mle_airbus_dataset
Service Account: service-account-for-mle@mle-airbus-detection-smu.iam.gserviceaccount.com
Vertex API Parent URI: projects/mle-airbus-detection-smu/locations/asia-east1
      2373  2022-06-26T04:02:36Z  gs://mle_airbus_dataset/mle-airbus-detection-smu-b1f8ee58e814.json#1656216156596465  metageneration=1
    194297  2022-07-03T10:12:15Z  gs://mle_airbus_dataset/test.parquet#1656843135799925  metageneration=1
   1541531  2022-07-03T10:12:15Z  gs://mle_airbus_dataset/train.parquet#1656843135725932  metageneration=1
 419430400  2022-07-03T10:13:34Z  gs://mle_airbus_dataset/train_hist.csv#1656843214414169  metageneration=1
        18  

In [None]:
# NOTE(review): `components` and `context` are not defined or imported in any
# visible cell of this notebook, so this cell fails on a fresh kernel. It looks
# like a leftover from a TFX interactive-context template -- confirm whether it
# is still needed.
hyperparams_gen = components.hyperparameters_gen(
    num_epochs=5,
    learning_rate=0.001,
    batch_size=512,
    hidden_units='64,64',
)

context.run(hyperparams_gen, enable_cache=False)

In [None]:
# Reads 'hyperparameters.json' from the URI of the first 'hyperparameters'
# output artifact of `hyperparams_gen` and displays the parsed content.
# NOTE(review): `json` and `tf` are not imported in the visible notebook cells
# (only inside the %%writefile modules) -- confirm before running.
json.load(
    tf.io.gfile.GFile(
        os.path.join(
            hyperparams_gen.outputs['hyperparameters'].get()[0].uri, 'hyperparameters.json')
    )
)

## Model training

### Model Training Files

#### trainer.py

In [None]:
%%writefile ./src/model_training/trainer.py
import logging
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.losses import binary_crossentropy

from tensorflow import keras
import numpy as np
import pandas as pd

# from src.model_training 
import data, model
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau

def train(
    train_data_dir,
    eval_data_dir,
    hyperparams,
    base_model_dir=None,
):
    """Build, compile and fit the segmentation model.

    Args:
        train_data_dir: Forwarded to ``data.get_dataset`` to build the
            training dataset.
        eval_data_dir: Forwarded to ``data.get_dataset`` to build the
            validation dataset.
        hyperparams: Dict read here for ``learning_rate``, ``decay_rate`` and
            ``num_epochs``; it is also forwarded unchanged to
            ``data.get_dataset`` and ``model.create_model``.
        base_model_dir: Optional path of a previously saved model to start
            from. If it cannot be loaded, a warning is logged and training
            continues with the freshly created model (best effort).

    Returns:
        The fitted Keras model.
    """
    logging.basicConfig(level=logging.INFO)
    logging.info(f"Loading dataset from {train_data_dir}")

    train_dataset = data.get_dataset(
        train_data_dir,
        hyperparams,
    )

    eval_dataset = data.get_dataset(
        eval_data_dir,
        hyperparams,
    )

    def dice_coef(y_true, y_pred, smooth=1):
        # Per-sample Dice coefficient over axes 1-3, averaged over the batch.
        intersection = K.sum(y_true * y_pred, axis=[1,2,3])
        union = K.sum(y_true, axis=[1,2,3]) + K.sum(y_pred, axis=[1,2,3])
        return K.mean( (2. * intersection + smooth) / (union + smooth), axis=0)

    def dice_p_bce(in_gt, in_pred):
        # Loss: lightly weighted binary cross-entropy minus Dice, so that
        # minimising the loss increases the Dice coefficient.
        return 1e-3*binary_crossentropy(in_gt, in_pred) - dice_coef(in_gt, in_pred)

    def true_positive_rate(y_true, y_pred):
        # Fraction of ground-truth positives that the rounded prediction hits.
        return K.sum(K.flatten(y_true)*K.flatten(K.round(y_pred)))/K.sum(y_true)

    weight_path = "{}_weights.best.ctph".format('seg_model')

    # Keep only the weights of the epoch with the best validation Dice.
    checkpoint = ModelCheckpoint(
        weight_path,
        monitor='val_dice_coef',
        verbose=1,
        save_best_only=True,
        mode='max',
        save_weights_only=True)

    optimizer = keras.optimizers.Adam(
        learning_rate=hyperparams["learning_rate"],
        decay=hyperparams["decay_rate"])

    metrics = [dice_coef, 'binary_accuracy', true_positive_rate]

    # `min_delta` is the current name of the deprecated `epsilon` argument.
    reduceLROnPlat = ReduceLROnPlateau(
        monitor='val_dice_coef',
        factor=0.5,
        patience=10,
        verbose=1,
        mode='max',
        min_delta=0.0001,
        cooldown=2,
        min_lr=1e-6
    )

    early_stopping = EarlyStopping(
        monitor="val_dice_coef",
        mode="max",
        patience=30)

    callbacks_list = [checkpoint, reduceLROnPlat, early_stopping]

    seg_model = model.create_model(hyperparams)
    if base_model_dir:
        try:
            # compile=False: the model is compiled right below with the custom
            # loss/metrics, so the saved training config is not needed (and
            # could not be deserialised without custom_objects anyway).
            seg_model = keras.models.load_model(base_model_dir, compile=False)
        except Exception as err:  # best effort: fall back to the new model
            logging.warning(
                "Could not load base model from %s (%s); training from scratch.",
                base_model_dir,
                err,
            )

    seg_model.compile(
        optimizer=optimizer,
        loss=dice_p_bce,
        metrics=metrics
    )

    logging.info("Model training started...")
    seg_model.fit(
        train_dataset,
        epochs=hyperparams["num_epochs"],
        # Only the first batch of the evaluation dataset is used for validation.
        validation_data=eval_dataset.take(1),
        callbacks=callbacks_list,
        verbose=1,
        workers=1 # the generator is not very thread safe
    )

    logging.info("Model training completed.")

    return seg_model

def evaluate(model, data_dir, raw_schema_location, hyperparams):
    """Placeholder for model evaluation; not implemented.

    All arguments are accepted and ignored. Always returns ``None``.
    """
    return None

Overwriting ./src/model_training/trainer.py


#### model.py

In [None]:
%%writefile ./src/model_training/model.py
"""A U-Net Keras segmentation model."""

from keras import models, layers

# Build U-Net model
def upsample_conv(filters, kernel_size, strides, padding):
    """Return a ``Conv2DTranspose`` layer configured with the given arguments."""
    deconv_layer = layers.Conv2DTranspose(
        filters,
        kernel_size,
        strides=strides,
        padding=padding,
    )
    return deconv_layer

def upsample_simple(filters, kernel_size, strides, padding):
    """Return an ``UpSampling2D`` layer whose size is ``strides``.

    ``filters``, ``kernel_size`` and ``padding`` are accepted only so the
    signature matches ``upsample_conv``; they are not used.
    """
    resize_layer = layers.UpSampling2D(strides)
    return resize_layer

def UNet_keras(gaussian_noise=0.1, upsample_mode='SIMPLE', net_scaling=None, img_shape=(768, 768), edge_crop=16):
    """Assemble the U-Net Keras model.

    Args:
        gaussian_noise: Standard deviation given to the ``GaussianNoise`` layer.
        upsample_mode: ``'DECONV'`` selects ``upsample_conv``; any other value
            selects ``upsample_simple``.
        net_scaling: If not ``None``, the input is average-pooled by this
            factor and the output is upsampled by the same factor.
        img_shape: ``(height, width)`` of the 3-channel input.
        edge_crop: Border width that is cropped from the output and then
            zero-padded back.

    Returns:
        A ``models.Model`` mapping the RGB input to a one-channel sigmoid map.
    """
    upsample = upsample_conv if upsample_mode == 'DECONV' else upsample_simple

    def conv_pair(tensor, filters):
        # Two stacked 3x3 ReLU convolutions with 'same' padding.
        for _ in range(2):
            tensor = layers.Conv2D(filters, (3, 3), activation='relu', padding='same')(tensor)
        return tensor

    input_img = layers.Input((img_shape[0], img_shape[1], 3), name='RGB_Input')

    # Pre-processing: optional downscale, then noise and batch normalisation.
    net = input_img
    if net_scaling is not None:
        net = layers.AvgPool2D(net_scaling)(net)
    net = layers.GaussianNoise(gaussian_noise)(net)
    net = layers.BatchNormalization()(net)

    # Contracting path; keep each stage's output for the skip connections.
    skips = []
    for filters in (8, 16, 32, 64):
        net = conv_pair(net, filters)
        skips.append(net)
        net = layers.MaxPooling2D((2, 2))(net)

    # Bottleneck.
    net = conv_pair(net, 128)

    # Expanding path; the last concatenate keeps its explicit axis=3.
    decoder_plan = ((64, -1), (32, -1), (16, -1), (8, 3))
    for (filters, concat_axis), skip in zip(decoder_plan, reversed(skips)):
        net = upsample(filters, (2, 2), strides=(2, 2), padding='same')(net)
        net = layers.concatenate([net, skip], axis=concat_axis)
        net = conv_pair(net, filters)

    # Output head: 1x1 sigmoid conv, then blank out the border region.
    out = layers.Conv2D(1, (1, 1), activation='sigmoid')(net)
    out = layers.Cropping2D((edge_crop, edge_crop))(out)
    out = layers.ZeroPadding2D((edge_crop, edge_crop))(out)

    if net_scaling is not None:
        out = layers.UpSampling2D(net_scaling)(out)

    return models.Model(inputs=[input_img], outputs=[out])

def create_model(hyperparams):
    """Build the U-Net from a hyperparameter dict.

    Reads the keys ``guassian_noise`` (sic), ``upsample_mode``,
    ``net_scaling``, ``img_shape`` and ``edge_crop`` and forwards them to
    ``UNet_keras``. A missing key raises ``KeyError``.
    """
    unet_kwargs = dict(
        gaussian_noise=hyperparams["guassian_noise"],
        upsample_mode=hyperparams["upsample_mode"],
        net_scaling=hyperparams["net_scaling"],
        img_shape=hyperparams["img_shape"],
        edge_crop=hyperparams["edge_crop"],
    )
    return UNet_keras(**unet_kwargs)

Overwriting ./src/model_training/model.py


#### data.py

In [None]:
%%writefile ./src/model_training/data.py
"""Functions for reading data as tf.data.Dataset."""

import tensorflow as tf
import numpy as np
import pandas as pd

from tensorflow.keras import layers

from google.cloud import storage

class Augment(tf.keras.layers.Layer):
    """Apply matching preprocessing pipelines to an image batch and its masks.

    Two ``tf.keras.Sequential`` pipelines are built: one for the images
    (which additionally rescales pixel values by 1/255) and one for the
    labels. Both receive the same ``seed`` with the intent that they make the
    same random changes.

    Args:
        resize_shape: ``(height, width)`` both outputs are resized to.
        train: When true, random flip/rotation/height/width/zoom layers are
            prepended; otherwise only rescaling (images) and resizing are used.
        seed: Seed shared by all random layers. Pass ``None`` to draw a
            random seed, which is what the previous implementation always did
            regardless of this argument.
    """

    def __init__(self,  resize_shape=(768, 768), train=True, seed=42):
        super().__init__()
        # Honour the caller's seed; only fall back to a random one on request.
        if seed is None:
            seed = np.random.randint(1000)

        # Use height AND width (previously resize_shape[0] was used for both).
        height, width = resize_shape[0], resize_shape[1]

        # both use the same seed, so they'll make the same random changes.
        self.augment_inputs = self._build_pipeline(train, seed, height, width, rescale=True)
        self.augment_labels = self._build_pipeline(train, seed, height, width, rescale=False)

    @staticmethod
    def _build_pipeline(train, seed, height, width, rescale):
        """Return the Sequential pipeline for images (rescale=True) or labels."""
        preprocessing = layers.experimental.preprocessing
        steps = []
        if train:
            steps.extend([
                preprocessing.RandomFlip(seed=seed),
                preprocessing.RandomRotation(0.1, seed=seed),
                preprocessing.RandomHeight(0.1, seed=seed),
                preprocessing.RandomWidth(0.1, seed=seed),
                preprocessing.RandomZoom(0.9, seed=seed),
            ])
        if rescale:
            steps.append(preprocessing.Rescaling(1.0 / 255))
        steps.append(preprocessing.Resizing(height, width))
        return tf.keras.Sequential(steps)

    def call(self, inputs, labels):
        """Run each pipeline on its tensor and return ``(inputs, labels)``."""
        inputs = self.augment_inputs(inputs)
        labels = self.augment_labels(labels)
        return inputs, labels


def rle_decode_tf(mask_rle, shape=(768, 768)):
    """Decode a run-length-encoded mask string with TensorFlow ops.

    ``mask_rle`` is split on whitespace into alternating (1-based start,
    length) numbers. Ones are scattered into a flat ``uint8`` buffer of
    ``prod(shape)`` elements, which is reshaped to ``shape``, transposed, and
    given a trailing axis of size 1.
    """
    shape = tf.convert_to_tensor(shape, tf.int64)
    n_pixels = tf.math.reduce_prod(shape)

    # Parse the alternating start/length tokens.
    tokens = tf.strings.to_number(tf.strings.split(mask_rle), tf.int64)
    run_starts = tokens[::2] - 1  # convert to 0-based offsets
    run_lengths = tokens[1::2]

    # One value per foreground pixel.
    n_ones = tf.reduce_sum(run_lengths)
    fill = tf.ones([n_ones], tf.uint8)

    # For every foreground pixel find the run it belongs to, then translate
    # its rank within all foreground pixels into a flat mask index.
    positions = tf.range(n_ones)
    cum_lengths = tf.math.cumsum(run_lengths)
    run_of_position = tf.searchsorted(cum_lengths, positions, 'right')
    run_offsets = run_starts - tf.pad(cum_lengths[:-1], [(1, 0)])
    flat_idx = positions + tf.gather(run_offsets, run_of_position)

    flat_mask = tf.scatter_nd(tf.expand_dims(flat_idx, 1), fill, [n_pixels])
    grid = tf.reshape(flat_mask, shape)
    return tf.expand_dims(tf.transpose(grid), axis=2)

def multi_rle_encode(img):
    """Run-length encode every connected component of a mask separately.

    Args:
        img: Array indexed as ``img[:, :, 0]``; non-zero pixels are foreground.

    Returns:
        A list with one RLE string (from ``rle_encode``) per connected
        component; an empty list when the mask has no foreground.
    """
    # The original called an undefined name `label`. scipy.ndimage.label is
    # used instead, imported locally so the module's import block is untouched.
    # NOTE(review): the 3x3 all-ones structure gives 8-connectivity, presumably
    # matching the labelling routine originally intended -- confirm.
    from scipy import ndimage

    labels, _ = ndimage.label(img[:, :, 0], structure=np.ones((3, 3), dtype=int))
    return [rle_encode(labels == k) for k in np.unique(labels[labels > 0])]

# ref: https://www.kaggle.com/paulorzp/run-length-encode-and-decode
def rle_encode(img):
    """Run-length encode a binary mask.

    img: numpy array, 1 - mask, 0 - background.
    Returns a space-separated string of alternating (1-based start, length)
    values, taken over the transposed, flattened array.
    """
    # Pad with a zero on both ends so every run has a start and an end edge.
    padded = np.concatenate([[0], img.T.flatten(), [0]])
    # 1-based positions at which the value changes.
    edges = np.flatnonzero(padded[1:] != padded[:-1]) + 1
    # Turn each (start, end) pair into (start, length).
    edges[1::2] -= edges[::2]
    return ' '.join(map(str, edges))

def rle_decode(mask_rle, shape=(768, 768)):
    """Decode a run-length string into a binary mask.

    mask_rle: whitespace-separated alternating (1-based start, length) values.
    shape: the flat buffer of shape[0]*shape[1] pixels is reshaped to this
        shape and then transposed before being returned.
    Returns a uint8 numpy array, 1 - mask, 0 - background.
    """
    tokens = mask_rle.split()
    run_starts = np.asarray(tokens[0::2], dtype=int) - 1
    run_lengths = np.asarray(tokens[1::2], dtype=int)

    flat = np.zeros(shape[0]*shape[1], dtype=np.uint8)
    for begin, length in zip(run_starts, run_lengths):
        flat[begin:begin + length] = 1

    # The transpose aligns the result with the RLE direction.
    return flat.reshape(shape).T

def masks_as_image(in_mask_list):
    """Combine individual ship masks into a single mask array.

    Every ``str`` entry of ``in_mask_list`` is decoded with ``rle_decode`` and
    added to a 768x768 ``int16`` accumulator; non-string entries are skipped.
    Returns the accumulator with a trailing axis of size 1.
    """
    combined = np.zeros((768, 768), dtype=np.int16)
    for encoded in (entry for entry in in_mask_list if isinstance(entry, str)):
        combined += rle_decode(encoded)
    return np.expand_dims(combined, -1)

def merge_rle_encode(mask_rle, shape=(768, 768)):
    """Merge several ';'-separated RLE masks into one RLE string.

    Args:
        mask_rle: RLE strings joined by ``;``.
        shape: Shape passed to ``rle_decode`` for every part (previously the
            argument was ignored and the 768x768 default was always used).

    Returns:
        The RLE string of the union of all decoded masks.
    """
    # rle_decode returns the transposed array, so the accumulator is
    # transposed as well to stay shape-compatible for non-square shapes.
    merged = np.zeros(shape=shape, dtype=np.uint8).T

    for rle in mask_rle.split(";"):
        # Union rather than sum: overlapping masks must stay 0/1, otherwise
        # rle_encode sees extra value changes and emits corrupted runs.
        merged = np.maximum(merged, rle_decode(rle, shape=shape))

    return rle_encode(merged)

def parse_db_to_img(filename, label):
    """Load one training example.

    Reads the file at ``filename``, decodes it as a 3-channel JPEG, and
    decodes ``label`` with ``rle_decode_tf``. Returns ``(image, label_img)``.
    """
    raw_bytes = tf.io.read_file(filename)
    rgb_image = tf.image.decode_jpeg(raw_bytes, channels=3)
    mask_image = rle_decode_tf(label)
    return rgb_image, mask_image

def get_dataset(file_name, feature_spec, train=True):
    """Build a batched ``tf.data.Dataset`` of (image, mask) pairs.

    The parquet file ``file_name`` is downloaded from the GCS bucket to the
    same local path, read with pandas, and its ``ImageId`` / ``EncodedPixels``
    columns are turned into decoded images and masks.

    Args:
      file_name: Blob name inside the bucket; also used as the local path.
      feature_spec: Dict providing ``gcs_image`` (prefix prepended to each
        ``ImageId``), ``img_shape``, ``gcs_bucket`` and ``batch_size``.
      train: Forwarded to ``Augment``; when true (the default, matching the
        previous behaviour) random augmentation is applied.
    Returns:
      A dataset yielding ``(images, masks)`` batches as produced by
      ``Augment``.
    """
    gcs_images = feature_spec['gcs_image']
    img_shape = feature_spec['img_shape']
    # Bucket names cannot contain '/'; tolerate a trailing slash in config.
    gcs_bucket = feature_spec['gcs_bucket'].rstrip('/')
    # Dataset.batch needs an integer, but the value may arrive as a float.
    batch_size = int(feature_spec['batch_size'])

    bucket = storage.Client().bucket(gcs_bucket)
    blob = bucket.blob(file_name)
    blob.download_to_filename(file_name)

    frame = pd.read_parquet(file_name)
    dataset = tf.data.Dataset.from_tensor_slices(
        (frame['ImageId'].values, frame['EncodedPixels'].values))
    dataset = dataset.shuffle(buffer_size=100)
    dataset = dataset.map(lambda x, y: parse_db_to_img(gcs_images + x, y))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(Augment(resize_shape=img_shape, train=train))
    # No .cache() here: caching after shuffle and random augmentation would
    # replay the first epoch's order and augmentations in every later epoch.
    dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
    return dataset

Overwriting ./src/model_training/data.py


#### __ init __.py

In [None]:
%%writefile ./src/model_training/__init__.py


UsageError: %%writefile is a cell magic, but the cell body is empty.


#### defaults.py

In [None]:
%%writefile ./src/model_training/defaults.py
"""Defaults for the model.
These values can be tweaked to affect model training performance.
"""


NET_SCALING = None
IMG_SHAPE = (128, 128)
GUASSIAN_NOISE = 0.1
BATCH_SIZE = 16
NUM_EPOCHS = 1
NUM_EVAL_STEPS = 10
EDGE_CROP = 16
UPSAMPLE_MODE = "SIMPLE"
LEARNING_RATE = 1e-4 
DECAY_RATE = 1e-6

GCS_IMAGES = "gs://mle_airbus_dataset/train_v2/"
# Bucket *name* only: no trailing slash (bucket names cannot contain '/').
GCS_BUCKET = "mle_airbus_dataset"

def update_hyperparams(hyperparams: dict) -> dict:
    """Fill in every hyperparameter that is missing from ``hyperparams``.

    Keys already present are left untouched (even if their value is ``None``).
    The dict is modified in place and also returned.

    Fixes over the previous version: a missing ``guassian_noise`` no longer
    overwrites ``num_epochs``, and ``decay_rate`` defaults to ``DECAY_RATE``
    instead of referencing the undefined name ``DECAY``.
    """
    # Note: the key is spelled "guassian_noise" (sic) because the model and
    # the CLI use that spelling.
    defaults = {
        "net_scaling": NET_SCALING,
        "img_shape": IMG_SHAPE,
        "guassian_noise": GUASSIAN_NOISE,
        "batch_size": BATCH_SIZE,
        "num_epochs": NUM_EPOCHS,
        "num_eval_steps": NUM_EVAL_STEPS,
        "edge_crop": EDGE_CROP,
        "upsample_mode": UPSAMPLE_MODE,
        "learning_rate": LEARNING_RATE,
        "decay_rate": DECAY_RATE,
        "gcs_image": GCS_IMAGES,
        "gcs_bucket": GCS_BUCKET,
    }
    for key, value in defaults.items():
        hyperparams.setdefault(key, value)
    return hyperparams

Overwriting ./src/model_training/defaults.py


#### task.py


In [None]:
%%writefile ./src/model_training/task.py
"""The entrypoint for the Vertex training job."""

import os
import sys
from datetime import datetime
import logging
import tensorflow as tf
from tensorflow.python.client import device_lib
import argparse
from typing import Optional

from google.cloud import aiplatform as vertex_ai

# from src.model_training 
import defaults, trainer


# Directory of this file with any "/model_training" substring removed.
# NOTE(review): `dirname` is not referenced anywhere else in the visible code.
dirname = os.path.dirname(__file__)
dirname = dirname.replace("/model_training", "")


def _int_pair(text):
    """Parse ``"H,W"`` (or a single ``"N"``, meaning ``(N, N)``) into two ints."""
    parts = [int(part) for part in text.strip("()[] ").split(",") if part.strip()]
    if len(parts) == 1:
        return (parts[0], parts[0])
    if len(parts) == 2:
        return (parts[0], parts[1])
    raise argparse.ArgumentTypeError(
        "expected one integer or two comma-separated integers, got {!r}".format(text)
    )


def _optional_int_pair(text):
    """Like ``_int_pair`` but maps an empty string or ``"none"`` to ``None``."""
    if text.strip().lower() in ("", "none"):
        return None
    return _int_pair(text)


def get_args(argv=None):
    """Parse the trainer's command-line arguments.

    Args:
        argv: Optional list of argument strings; ``None`` (the default) makes
            argparse read ``sys.argv`` as before.

    Returns:
        The parsed ``argparse.Namespace``.

    Fixes over the previous version:
        * ``--net-scaling`` used ``Optional[bool]`` as converter, which cannot
          convert a string; it now accepts ``N``, ``H,W`` or ``none``.
        * ``--image-shape`` used ``tuple``, which splits the string into
          characters; it now accepts ``N`` or ``H,W`` and is stored under
          ``img_shape``, the key the model and data code read.
        * ``--batch-size`` is parsed as ``int`` instead of ``float``.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--model-dir",
        default="",
        type=str,
    )

    parser.add_argument(
        "--train-data-dir",
        type=str,
    )

    parser.add_argument(
        "--eval-data-dir",
        type=str,
    )

    parser.add_argument(
        "--out-model",
        type=str,
    )

    parser.add_argument("--net-scaling", default=None, type=_optional_int_pair)
    parser.add_argument("--image-shape", dest="img_shape", default=(128, 128), type=_int_pair)
    parser.add_argument("--guassian-noise", default=0.1, type=float)
    parser.add_argument("--batch-size", default=16, type=int)
    parser.add_argument("--num-epochs", default=1, type=int)
    parser.add_argument("--num-eval-steps", default=10, type=int)
    parser.add_argument("--edge-crop", default=16, type=int)
    parser.add_argument("--upsample-mode", default="SIMPLE", type=str)
    parser.add_argument("--learning-rate", default=1e-4, type=float)
    parser.add_argument("--decay-rate", default=1e-6, type=float)

    parser.add_argument("--gcs-bucket", default="mle_airbus_dataset", type=str)
    parser.add_argument("--gcs-image", default="gs://mle_airbus_dataset/train_v2/", type=str)

    return parser.parse_args(argv)


def main() -> str:
    """Train the model, export it as a SavedModel and report its location.

    Steps: parse CLI arguments, fill in default hyperparameters, run
    ``trainer.train``, save the model under a timestamped
    ``gs://mle_airbus_dataset/trained_model/`` path, and, when ``--out-model``
    was given, write that path into the file it names.

    Returns:
        The export directory (previously the function returned ``None``
        despite its ``-> str`` annotation).
    """
    args = get_args()
    started_at = datetime.now()
    hyperparams = defaults.update_hyperparams(vars(args))
    logging.info(f"Hyperparameter: {hyperparams}")

    # TODO: Vertex AI experiment tracking, trainer.evaluate() and hypertune
    # metric reporting are not wired in yet.

    seg_model = trainer.train(
        train_data_dir=args.train_data_dir,
        eval_data_dir=args.eval_data_dir,
        hyperparams=hyperparams,
    )

    logging.info(f"exporting model")
    timestamp = started_at.strftime("%Y%m%d-%H%M%S")

    # NOTE(review): the export location is fixed; --model-dir is not used.
    export_dir = "gs://mle_airbus_dataset/trained_model/segm_full_{}".format(timestamp)
    print('Exporting to {}'.format(export_dir))
    tf.saved_model.save(seg_model, export_dir)

    logging.info(f"exported model: {export_dir}")

    # --out-model is optional; skip the hand-off file when it was not given
    # instead of failing on open(None). Create the parent directory because
    # the caller may hand over a path whose directory does not exist yet.
    out_model = hyperparams.get('out_model')
    if out_model:
        out_dir = os.path.dirname(out_model)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(out_model, 'w') as f:
            f.write(export_dir)

    print(export_dir)
    return export_dir

# Script entry point: raise the root logger to INFO, log the Python and
# TensorFlow versions, the TF_CONFIG environment variable and the local
# devices, then run main().
if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)
    logging.info(f"Python Version = {sys.version}")
    logging.info(f"TensorFlow Version = {tf.__version__}")
    logging.info(f'TF_CONFIG = {os.environ.get("TF_CONFIG", "Not found")}')
    logging.info(f"DEVICES = {device_lib.list_local_devices()}")
    logging.info(f"Task started...")
    main()
    logging.info(f"Task completed.")

Overwriting ./src/model_training/task.py


#### Create DockerFile

In [None]:
%%writefile Dockerfile
# Base image: Vertex AI training image tagged tf-cpu.2-8.
FROM asia-docker.pkg.dev/vertex-ai/training/tf-cpu.2-8:latest
WORKDIR /app
# Copy the trainer package to /app/src/model_training.
COPY src/model_training src/model_training
# Run the training entry point; extra container arguments are passed to task.py.
ENTRYPOINT ["python","./src/model_training/task.py"]

Overwriting Dockerfile


#### Create Component

In [8]:
# Required Parameters, change according to your setups
PROJECT_ID='mle-airbus-detection-smu'
GCS_BUCKET='mle_airbus_dataset'
REGION = 'asia-east1'
ARTIFACT_REGISTRY_REPO="airbus-mle"
CONTAINER_REGISTRY=f"{REGION}-docker.pkg.dev/{PROJECT_ID}/{ARTIFACT_REGISTRY_REPO}/trainer-airbus-model-ey:latest"

In [9]:
%%bash -s "{CONTAINER_REGISTRY}"

# First positional argument: full URI of the trainer container image.
CONTAINER_REGISTRY=$1
echo "${CONTAINER_REGISTRY}"
# Create the component definition .yaml file
# (make sure the target directory exists so the redirect cannot fail).
mkdir -p ./build
cat > ./build/tensorflow_airbus.yaml <<HERE
name: Tensorflow Airbus Model Training
description: Train a tf model and save to GCS
inputs:
  - name: model_dir
    description: 'Path to save model.'
    type: String
  - name: train_data_dir
    description: 'Training dataset directory.'
    type: String
  - name: eval_data_dir
    description: 'Evaluation dataset directory.'
    type: String
    
outputs:
  - name: gcs_model_path
    description: 'Trained model path.'
    type: String
implementation:
    container:
        image: ${CONTAINER_REGISTRY}
        command: [
          python, ./src/model_training/task.py,
          --model-dir, {inputValue: model_dir},
          --train-data-dir, {inputValue: train_data_dir},
          --eval-data-dir, {inputValue: eval_data_dir},
          --out-model, {outputPath: gcs_model_path}
        ]
HERE

asia-east1-docker.pkg.dev/mle-airbus-detection-smu/airbus-mle/trainer-airbus-model-ey:latest


In [None]:
import kfp
trainer_component = kfp.components.load_component_from_file("./build/tensorflow_airbus.yaml")

In [None]:
from tfx.dsl.components.common.resolver import Resolver
from tfx.dsl.experimental import latest_artifacts_resolver
from tfx.dsl.experimental import latest_blessed_model_resolver

### Train the model

In [2]:
import subprocess
# Build the trainer image and tag it with CONTAINER_REGISTRY.
# NOTE(review): this builds from 'Dockerfile_trainer', while the notebook cell
# above writes a file named 'Dockerfile' -- confirm which file is intended.
cmd = ['docker', 'build', '-f', 'Dockerfile_trainer', '.', '--tag', CONTAINER_REGISTRY]

# Only stdout is captured; its last byte is dropped before decoding
# (presumably the trailing newline). stderr and the exit code are not checked.
build_log = (subprocess.run(cmd, stdout=subprocess.PIPE).stdout[:-1].decode('utf-8'))
print(build_log)

In [None]:
import subprocess
# Run the trainer container locally with the train/eval parquet file names.
# NOTE(review): MODEL_OUTPUT_NAME is not defined in any visible cell, and
# --out-model is not passed although task.py opens that path for writing.
cmd = ['docker', 'run', CONTAINER_REGISTRY, 
       '--model-dir', f"gs://{GCS_BUCKET}/airbusmlepipeline/{MODEL_OUTPUT_NAME}", 
       '--train-data-dir', f"train.parquet", 
       '--eval-data-dir', f"test.parquet"]

# Only stdout is captured; its last byte is dropped before decoding.
build_log = (subprocess.run(cmd, stdout=subprocess.PIPE).stdout[:-1].decode('utf-8'))
print(build_log)

### Evaluate and validate the model against the baseline model.

In [None]:
# Print the validation verdict and every non-zero metric (rounded to three
# decimals) from the first metrics record at the evaluator's output URI.
# NOTE(review): `evaluator` and `tfma` are not defined or imported in any
# visible cell -- this looks like a leftover from a TFX template; confirm.
evaluation_results = evaluator.outputs['evaluation'].get()[0].uri
print("validation_ok:", tfma.load_validation_result(evaluation_results).validation_ok, '\n')

for entry in list(tfma.load_metrics(evaluation_results))[0].metric_keys_and_values:
    value = entry.value.double_value.value
    if value:
        print(entry.key.name, ":", round(entry.value.double_value.value, 3))