# AgriAutoML Pipeline Execution in Vertex AI

This notebook demonstrates how to execute the AgriAutoML pipeline directly in Vertex AI Studio.

In [None]:
# First, uninstall all related packages
%pip uninstall -y google-cloud-aiplatform google-cloud-storage google-cloud-datastore protobuf google-cloud-bigquery google-genai kfp pydantic

# Install packages with pinned or range-constrained versions
%pip install "protobuf>=3.20.2,<4.0.0dev"
%pip install "google-cloud-storage>=1.32.0,<3.0.0"
%pip install "google-cloud-datastore==1.15.5"
%pip install "google-cloud-aiplatform==1.104.0"
%pip install "google-cloud-bigquery<3.0.0"
%pip install "pydantic>=2.0.0,<3.0.0"
%pip install "kfp>=2.0.0"
%pip install "google-genai>=1.25.0"

# Install remaining dependencies
%pip install pandas numpy pillow scikit-learn tensorflow google-auth google-auth-httplib2 google-api-python-client

In [None]:
import os
from google.cloud import aiplatform
from google.cloud import storage
from google.auth import default
from datetime import datetime
from kfp import dsl, components, compiler
from google.cloud import aiplatform
from google.cloud.aiplatform import pipeline_jobs
from PIL import Image
import numpy as np 
import io
import pandas as pd



# Resolve Application Default Credentials and the project associated with them.
# NOTE(review): `default()` can return None for the project when the
# environment does not define one — confirm `project_id` is set before use.
credentials, project_id = default()


# Configuration
# Region used for the Vertex AI client and passed to the pipeline as `region`.
REGION = "us-central1"
# NOTE(review): hard-coded lab bucket name; must be edited to run in another project.
BUCKET_NAME = "qwiklabs-gcp-01-d4f6611afd55-bucket"
# GCS location handed to the pipeline job as its pipeline root.
PIPELINE_ROOT = f"gs://{BUCKET_NAME}/pipeline_root"

In [None]:
# Create sample datasets
def create_sample_data():
    """Upload one random image and a small synthetic farming table to GCS.

    Writes two objects into ``BUCKET_NAME``:

    * ``sample_vision_data/image1.png`` - a 224x224 RGB image of random pixels.
    * ``sample_tabular_data/farming_data.csv`` - 100 rows with the columns
      ``planting_date``, ``temperature``, ``rainfall``, ``soil_quality`` and
      ``yield``.

    The data is drawn from NumPy's global random state, so the content differs
    between runs unless the caller seeds it beforehand.

    Returns:
        tuple: ``(vision_uri, tabular_uri)`` - the ``gs://`` *prefixes* under
        which the objects were uploaded (not the full object paths).
    """
    n_rows = 100

    bucket = storage.Client().bucket(BUCKET_NAME)

    # --- Vision sample -----------------------------------------------------
    # randint's upper bound is exclusive: 256 is required for pixels to cover
    # the full uint8 range 0..255.
    pixels = np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8)
    png_buffer = io.BytesIO()
    Image.fromarray(pixels).save(png_buffer, format='PNG')
    bucket.blob('sample_vision_data/image1.png').upload_from_string(
        png_buffer.getvalue(), content_type='image/png'
    )

    # --- Tabular sample ----------------------------------------------------
    df = pd.DataFrame({
        'planting_date': pd.date_range(start='2025-01-01', periods=n_rows),
        'temperature': np.random.normal(25, 5, n_rows),
        'rainfall': np.random.normal(50, 10, n_rows),
        'soil_quality': np.random.choice(['good', 'medium', 'poor'], n_rows),
        'yield': np.random.normal(75, 15, n_rows)
    })
    # Set the content type explicitly; upload_from_string otherwise stores the
    # object as text/plain.
    bucket.blob('sample_tabular_data/farming_data.csv').upload_from_string(
        df.to_csv(index=False), content_type='text/csv'
    )

    vision_uri = f"gs://{BUCKET_NAME}/sample_vision_data"
    tabular_uri = f"gs://{BUCKET_NAME}/sample_tabular_data"
    return vision_uri, tabular_uri

# Create the sample data and get the URIs
vision_uri, tabular_uri = create_sample_data()
print(f"Created vision dataset at: {vision_uri}")
print(f"Created tabular dataset at: {tabular_uri}")

In [None]:
def preprocess_data(vision_data: str, tabular_data: str, bucket_name: str) -> tuple[str, str]:
    """
    Record the input dataset URIs in the bucket and return the new object URIs.

    The two input strings are written verbatim as the *contents* of
    ``processed_vision_data.txt`` and ``processed_tabular_data.csv`` in
    ``bucket_name``; no image or table is downloaded or transformed here.

    NOTE(review): the nested ``process_image`` / ``process_tabular`` helpers
    are defined but never called, so the "processed" objects only contain the
    URI strings. Presumably they are meant to be applied to the downloaded
    data - confirm the intended behaviour before relying on the outputs.

    Args:
        vision_data: GCS URI for vision dataset
        tabular_data: GCS URI for tabular dataset
        bucket_name: GCS bucket for processed data

    Returns:
        tuple: (vision_dataset_uri, tabular_dataset_uri)
    """
    # Initialize GCS client
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    # Process vision data (currently unused, see NOTE in the docstring)
    def process_image(image_bytes):
        img = Image.open(io.BytesIO(image_bytes))
        img = img.resize((224, 224))  # Standard size for many vision models
        return np.array(img)

    # Process tabular data (currently unused, see NOTE in the docstring)
    def process_tabular(df):
        # Fill gaps in numeric columns with the column mean. numeric_only=True
        # is required: without it pandas >= 2 raises TypeError as soon as the
        # frame has a string column (e.g. 'soil_quality').
        df = df.fillna(df.mean(numeric_only=True))

        # Feature engineering
        if "planting_date" in df.columns:
            df["planting_date"] = pd.to_datetime(df["planting_date"])
            df["planting_month"] = df["planting_date"].dt.month
            df["planting_day"] = df["planting_date"].dt.day

        return df

    # Process and save datasets
    vision_blob = bucket.blob('processed_vision_data.txt')
    vision_blob.upload_from_string(vision_data)
    vision_output_uri = f"gs://{bucket_name}/{vision_blob.name}"

    tabular_blob = bucket.blob('processed_tabular_data.csv')
    tabular_blob.upload_from_string(tabular_data)
    tabular_output_uri = f"gs://{bucket_name}/{tabular_blob.name}"

    return vision_output_uri, tabular_output_uri

In [None]:
def train_vision_model(
    project_id: str,
    region: str,
    dataset: str,
    min_accuracy: float,
    budget_milli_node_hours: int = 8000,
) -> dict:
    """
    Train AutoML Vision model for crop analysis

    Creates a Vertex AI image dataset from ``dataset``, trains a single-label
    AutoML image classification model on an 80/10/10 split, and rejects the
    model when its ``auRoc`` evaluation metric is below ``min_accuracy``.

    Args:
        project_id: GCP project ID
        region: GCP region
        dataset: GCS URI passed as ``gcs_source`` to ``ImageDataset.create``.
            NOTE(review): the import is expected to need an import file
            (CSV/JSONL listing images and labels) rather than a folder or a
            text file containing a URI - confirm what the caller provides.
        min_accuracy: Minimum required accuracy (compared against ``auRoc``)
        budget_milli_node_hours: Training budget handed to ``job.run``.
            NOTE(review): 8000 is understood to be the service minimum for
            cloud image classification; the former hard-coded 83 was below
            that range - confirm against the current Vertex AI documentation.

    Returns:
        dict: ``{'model': <model resource name>, 'accuracy': <auRoc as float>}``

    Raises:
        ValueError: if the model's ``auRoc`` is below ``min_accuracy``.
    """
    # Initialize Vertex AI
    aiplatform.init(project=project_id, location=region)

    # Create dataset. Importing from gcs_source requires the matching import
    # schema; single-label classification matches the job configured below.
    ai_dataset = aiplatform.ImageDataset.create(
        display_name="crop_vision_dataset",
        gcs_source=dataset,
        import_schema_uri=aiplatform.schema.dataset.ioformat.image.single_label_classification,
    )

    # Configure the training job. The budget is a run() argument, not a
    # constructor argument, so it is not passed here.
    job = aiplatform.AutoMLImageTrainingJob(
        display_name="crop_vision_model",
        prediction_type="classification",
        model_type="CLOUD",
        base_model=None,
    )

    # Run the training job with a fraction-based split (no filter split).
    ai_model = job.run(
        dataset=ai_dataset,
        budget_milli_node_hours=budget_milli_node_hours,
        model_display_name="crop_vision_model",
        training_fraction_split=0.8,
        validation_fraction_split=0.1,
        test_fraction_split=0.1,
    )

    # Get model evaluation (first evaluation returned for the model)
    eval_metrics = ai_model.list_model_evaluations()[0]
    au_roc = eval_metrics.metrics['auRoc']

    # Check if model meets accuracy threshold
    if au_roc < min_accuracy:
        raise ValueError(f"Model accuracy {au_roc} below threshold {min_accuracy}")

    # Return model info
    return {
        'model': ai_model.resource_name,
        'accuracy': float(au_roc),
    }


In [None]:
def train_tabular_model(
    project_id: str,
    region: str,
    dataset: str,
    min_accuracy: float,
    budget_milli_node_hours: int = 1000,
) -> dict:
    """
    Train AutoML Tabular model for crop yield prediction

    Creates a Vertex AI tabular dataset from ``dataset``, trains an AutoML
    regression model predicting the ``yield`` column on an 80/10/10 split,
    and rejects the model when its RMSE exceeds ``min_accuracy``.

    The column transformations describe the sample table produced by
    ``create_sample_data`` in this notebook (``planting_date``,
    ``temperature``, ``rainfall``, ``soil_quality``); a dataset with a
    different schema needs a matching list.

    Args:
        project_id: GCP project ID
        region: GCP region
        dataset: URI of the processed tabular dataset
        min_accuracy: Minimum required accuracy (RMSE threshold; the model is
            rejected when its RMSE is *above* this value)
        budget_milli_node_hours: Training budget handed to ``job.run``.
            NOTE(review): 1000 is understood to be the service minimum for
            AutoML tabular training; the former hard-coded 83 was below that
            range - confirm against the current Vertex AI documentation.

    Returns:
        dict: ``{'model': <model resource name>, 'rmse': <RMSE as float>}``

    Raises:
        ValueError: if the model's RMSE is above ``min_accuracy``.
    """
    # Initialize Vertex AI
    aiplatform.init(project=project_id, location=region)

    # Create dataset
    ai_dataset = aiplatform.TabularDataset.create(
        display_name="crop_tabular_dataset",
        gcs_source=dataset
    )

    # Configure the training job. Only job-level settings belong here; the
    # target column, budget and experiments are arguments of run().
    job = aiplatform.AutoMLTabularTrainingJob(
        display_name="crop_tabular_model",
        optimization_prediction_type="regression",
        optimization_objective="minimize-rmse",
        column_transformations=[
            {"numeric": {"column_name": "temperature"}},
            {"numeric": {"column_name": "rainfall"}},
            {"categorical": {"column_name": "soil_quality"}},
            {"timestamp": {"column_name": "planting_date"}},
        ],
    )

    # Run the training job
    ai_model = job.run(
        dataset=ai_dataset,
        target_column="yield",
        budget_milli_node_hours=budget_milli_node_hours,
        model_display_name="crop_yield_model",
        training_fraction_split=0.8,
        validation_fraction_split=0.1,
        test_fraction_split=0.1,
        # NOTE(review): experiment flag carried over unchanged - confirm it is
        # still recognised by the service.
        additional_experiments=["enable_model_compression"],
    )

    # Get model evaluation (first evaluation returned for the model)
    eval_metrics = ai_model.list_model_evaluations()[0]
    # Regression evaluations report RMSE under 'rootMeanSquaredError'.
    rmse = eval_metrics.metrics['rootMeanSquaredError']

    # Check if model meets accuracy threshold
    if rmse > min_accuracy:
        raise ValueError(f"Model RMSE {rmse} above threshold {min_accuracy}")

    # Return model info
    return {
        'model': ai_model.resource_name,
        'rmse': float(rmse),
    }


In [None]:
def deploy_models(project_id: str, region: str, vision_model: dict, tabular_model: dict) -> tuple[str, str]:
    """
    Deploy the vision and tabular models, each to an endpoint of its own.

    Both models are deployed with the same settings: one ``n1-standard-4``
    replica, no autoscaling (min = max = 1). The vision model is deployed
    first, then the tabular model.

    Args:
        project_id: GCP project ID
        region: GCP region
        vision_model: Vision model information; its ``'model'`` entry is the
            model resource name
        tabular_model: Tabular model information; its ``'model'`` entry is the
            model resource name

    Returns:
        tuple: (vision_endpoint_name, tabular_endpoint_name)
    """
    aiplatform.init(project=project_id, location=region)

    # Shared serving configuration for both deployments.
    serving_config = {
        'machine_type': 'n1-standard-4',
        'min_replica_count': 1,
        'max_replica_count': 1,
    }

    endpoint_names = []
    for model_info in (vision_model, tabular_model):
        trained_model = aiplatform.Model(model_info['model'])
        endpoint = trained_model.deploy(**serving_config)
        endpoint_names.append(endpoint.resource_name)

    return endpoint_names[0], endpoint_names[1]


In [None]:

# Absolute, normalised path of the `components` directory that sits in the
# parent of the current working directory (i.e. next to the directory the
# kernel is running in).
COMPONENTS_DIR = os.path.abspath(os.path.join(os.path.dirname(os.getcwd()), 'components'))

# Function to get absolute component path
def get_component_path(component_name):
    """Return the path of ``component_name`` inside the ``components`` directory.

    The directory is resolved at call time as ``<parent of cwd>/components``.
    """
    parent_dir = os.path.dirname(os.getcwd())
    components_dir = os.path.join(parent_dir, 'components')
    return os.path.join(components_dir, component_name)
    
# Load components from their YAML definitions (currently disabled; the lines below are commented out)
#preprocess_op = components.load_component_from_file(get_component_path('preprocess.yaml'))
#train_vision_op = components.load_component_from_file(get_component_path('train_vision.yaml'))
#train_tabular_op = components.load_component_from_file(get_component_path('train_tabular.yaml'))
#deploy_op = components.load_component_from_file(get_component_path('deploy.yaml'))

# Define pipeline
# Pipeline wiring: preprocess -> (train vision, train tabular) -> deploy.
#
# NOTE(review): preprocess_data, train_vision_model, train_tabular_model and
# deploy_models are defined in this notebook as plain Python functions that
# return a tuple or dict, while this function reads `.outputs[...]` and calls
# `.after(...)` on their results. Those attributes belong to KFP tasks, so the
# functions presumably need to be turned into components (the YAML loaders
# above are commented out) - confirm before compiling.
#
# NOTE(review): the same `min_accuracy` is given to both trainers; the vision
# trainer treats it as a lower bound on auRoc and the tabular trainer as an
# upper bound on RMSE - confirm a single value is intended for both.
@dsl.pipeline(
    name='AgriAutoML Pipeline',
    description='End-to-end pipeline for agricultural yield prediction'
)
def agri_automl_pipeline(
    project_id: str,
    region: str,
    bucket_name: str,
    vision_dataset_uri: str,
    tabular_dataset_uri: str,
    min_accuracy: float = 0.8
):
    # Arguments shared by both training steps.
    training_args = dict(
        project_id=project_id,
        region=region,
        min_accuracy=min_accuracy,
    )

    # Step 1: preprocessing
    prep = preprocess_data(
        vision_data=vision_dataset_uri,
        tabular_data=tabular_dataset_uri,
        bucket_name=bucket_name,
    )

    # Step 2a: vision training, explicitly ordered after preprocessing
    vision_training = train_vision_model(
        dataset=prep.outputs['vision_dataset'],
        **training_args,
    )
    vision_training.after(prep)

    # Step 2b: tabular training, explicitly ordered after preprocessing
    tabular_training = train_tabular_model(
        dataset=prep.outputs['tabular_dataset'],
        **training_args,
    )
    tabular_training.after(prep)

    # Step 3: deployment, once both training steps have finished
    deployment = deploy_models(
        project_id=project_id,
        region=region,
        vision_model=vision_training.outputs['model_info'],
        tabular_model=tabular_training.outputs['model_info'],
    )
    deployment.after(vision_training, tabular_training)

In [None]:
# Initialize the Vertex AI SDK with the project and credentials resolved by
# google.auth.default() earlier in the notebook.
aiplatform.init(
    project=project_id,
    location=REGION,
    credentials=credentials
)


# Compile the pipeline function into a local spec file, pipeline.yaml
# (written relative to the current working directory).
compiler.Compiler().compile(
    pipeline_func=agri_automl_pipeline,
    package_path='pipeline.yaml'
)


# Create the pipeline job from the compiled spec; parameter_values supplies
# one entry per parameter of agri_automl_pipeline.
job = pipeline_jobs.PipelineJob(
    display_name='agri-automl-pipeline',
    template_path='pipeline.yaml',
    pipeline_root=PIPELINE_ROOT,
    parameter_values={
        'project_id': project_id,  # project returned by google.auth.default()
        'region': REGION,
        'bucket_name': BUCKET_NAME,
        'vision_dataset_uri': vision_uri,  # GCS prefix returned by create_sample_data()
        'tabular_dataset_uri': tabular_uri,  # GCS prefix returned by create_sample_data()
        'min_accuracy': 0.8
    }
)

# Submit the job to Vertex AI; nothing after this call waits for or polls the run.
job.submit()