# 03 - Data Statistics Generation

This notebook creates a component that calculates statistics over the image data; these statistics are used for input data validation at a later stage.

In [None]:
import os
import time
import logging
import kfp
from google.cloud import bigquery, storage
from google.cloud import aiplatform as vertex_ai
from google_cloud_pipeline_components.experimental.custom_job import utils
from kfp.v2 import compiler, dsl
from kfp.v2.dsl import component
from typing import NamedTuple
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output, Metrics,
                        OutputPath, component)

from google_cloud_pipeline_components.experimental.custom_job import utils

In [2]:
# Configure the root logger at INFO level so informational messages are shown in the notebook.
logging.basicConfig(level=logging.INFO)

## Load Params and Resource Config

In [3]:
from config.gcp_resource import *

In [4]:
if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    
if SERVICE_ACCOUNT == "" or SERVICE_ACCOUNT is None or SERVICE_ACCOUNT == "[your-service-account]":
    # Get your GCP project id from gcloud
    shell_output = !gcloud config list --format 'value(core.account)' 2>/dev/null
    SERVICE_ACCOUNT = shell_output[0]
    
if GCS_BUCKET == "" or GCS_BUCKET is None or GCS_BUCKET == "[your-bucket-name]":
    # Get your bucket name to GCP projet id
    GCS_BUCKET = PROJECT_ID
    # Try to create the bucket if it doesn'exists
    ! gsutil mb -l $REGION gs://$BUCKET
    print("")
    
!gcloud config set project {PROJECT_ID}

Updated property [core/project].


In [5]:
# Echo the resolved compute, serving-image and storage settings so the run
# records which configuration it used.
for caption, setting in (
    ("Train machine type", TRAIN_COMPUTE),
    ("Deploy machine type", DEPLOY_COMPUTE),
    ("Deployment:", DEPLOY_IMAGE),
):
    print(caption, setting)

for template, location in (
    ('PIPELINE_ROOT: {}', PIPELINE_ROOT),
    ('MODULE_ROOT: {}', MODULE_ROOT),
    ('DATA_ROOT: {}', DATA_ROOT),
    ('SERVING_MODEL_DIR: {}', SERVING_MODEL_DIR),
):
    print(template.format(location))

Train machine type n1-standard-4
Deploy machine type n1-standard-4
Deployment: us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-8:latest
PIPELINE_ROOT: gs://mle_airbus_dataset/airbusmlepipeline/pipeline_root
MODULE_ROOT: gs://mle_airbus_dataset/airbusmlepipeline/pipeline_module
DATA_ROOT: gs://mle_airbus_dataset/airbusmlepipeline/data
SERVING_MODEL_DIR: gs://mle_airbus_dataset/airbusmlepipeline/serving_model


## Data Statistics Schema Generation

In [6]:
@component(packages_to_install=["google-cloud-storage","opencv-python-headless","pandas","pyarrow","fsspec","gcsfs"],
           base_image='python:3.9',
           output_component_file="./build/gen_train_hist_component.yaml")
def gen_train_hist_component(
    project_dict: dict
    ) -> NamedTuple(
    "Outputs",
    [
        ("train_hist_fpath", str),  # Return path to histogram of training data.
        ("train_threshold_fpath", str),  # Return path to threshold value.
    ],
    ):
    """Build the reference colour histogram of the training images.

    Reads the image ids listed in ``train.parquet`` and ``test.parquet`` from
    the bucket, accumulates one 256x256x256 three-channel histogram per split
    over *all* images of that split, min-max normalises both histograms to
    [0, 1] and compares them with the correlation method. The training
    histogram and the resulting correlation value are written to the bucket
    as ``train_hist.csv`` and ``train_threshold.txt``.

    Args:
        project_dict: Mapping that must contain the keys ``'PROJECT_ID'`` and
            ``'GCS_BUCKET'``.

    Returns:
        A ``(train_hist_fpath, train_threshold_fpath)`` tuple holding the
        ``gs://`` paths of the two uploaded files.

    Raises:
        KeyError: If ``project_dict`` lacks one of the required keys.
        ValueError: If no image of a split could be downloaded and decoded.
    """
    # Imports are function-local so the component is self-contained when it
    # runs inside its own container image.
    import logging
    import urllib.request  # `import urllib` alone does not guarantee the submodule is loaded

    import cv2
    import numpy as np
    import pandas as pd
    from google.cloud import storage

    PROJECT_ID = project_dict['PROJECT_ID']
    GCS_BUCKET = project_dict['GCS_BUCKET']
    GCS_TRAIN_IMAGES=f"gs://{GCS_BUCKET}/train_v2/"

    # read the parquet files
    train_data = pd.read_parquet(f"gs://{GCS_BUCKET}/train.parquet")
    test_data = pd.read_parquet(f"gs://{GCS_BUCKET}/test.parquet")

    # Images of both splits are fetched from the train_v2/ folder over plain
    # HTTPS. NOTE(review): this is an unauthenticated request, so presumably
    # the objects are publicly readable - confirm.
    gcs_url = f"https://storage.googleapis.com/{GCS_TRAIN_IMAGES.replace('gs://','')}"

    channels = [0, 1, 2]
    hist_size = [256] * 3
    hist_ranges = [0, 256] * 3

    def _normalized_hist(image_ids):
        """Return the min-max normalised histogram accumulated over image_ids."""
        # Accumulate in float64: a float32 accumulator stops registering +1
        # increments once a bin exceeds 2**24.
        total = np.zeros(hist_size, dtype=np.float64)
        used = 0
        for image_id in image_ids:
            # Close each response promptly instead of leaking the connection.
            with urllib.request.urlopen(f'{gcs_url}{image_id}') as resp:
                raw = np.frombuffer(resp.read(), dtype=np.uint8)
            image = cv2.imdecode(raw, cv2.IMREAD_COLOR) if raw.size else None
            if image is None:
                # imdecode returns None for data it cannot decode.
                logging.warning("Skipping undecodable image %s", image_id)
                continue
            # cv2.calcHist numbers channels across the whole image list, so
            # passing every image at once with channels [0, 1, 2] would only
            # measure the first image (and require equal image sizes).
            # Histogram one image at a time and sum the counts instead; this
            # also avoids holding the full dataset in memory.
            total += cv2.calcHist([image], channels, None, hist_size, hist_ranges)
            used += 1
        if used == 0:
            raise ValueError("No image could be decoded; cannot compute a histogram.")
        # compareHist works on float32 histograms.
        hist = total.astype(np.float32)
        cv2.normalize(hist, hist, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX)
        return hist

    # compute the image histograms for training data
    train_hist = _normalized_hist(train_data['ImageId'])
    # compute the image histograms for test data
    test_hist = _normalized_hist(test_data['ImageId'])

    # use correlation method for comparison (HISTCMP_CORREL == 0)
    threshold = cv2.compareHist(train_hist, test_hist, cv2.HISTCMP_CORREL)

    # reshape the 3D histogram to a 2D matrix so np.savetxt can write it
    arrReshaped = train_hist.reshape(train_hist.shape[0], -1)
    np.savetxt('train_hist.csv', arrReshaped)

    with open('train_threshold.txt','w') as f:
        f.write(f'{threshold}')

    # move the files to GCS
    bucket = storage.Client(project=PROJECT_ID).bucket(GCS_BUCKET)
    for fname in ('train_hist.csv', 'train_threshold.txt'):
        bucket.blob(fname).upload_from_filename(fname)

    return f"gs://{GCS_BUCKET}/train_hist.csv", f"gs://{GCS_BUCKET}/train_threshold.txt"