# AgriAutoML Pipeline Execution in Vertex AI

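This notebook demonstrates how to execute the AgriAutoML pipeline directly in Vertex AI Studio. It installs the required packages, uploads small sample datasets to Cloud Storage, defines the preprocessing, training, and deployment components, and then compiles and submits the pipeline as a Vertex AI pipeline job.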

In [None]:
%pip cache purge

# First, uninstall all related packages
%pip uninstall -y google-cloud-aiplatform google-cloud-storage google-cloud-datastore protobuf google-cloud-bigquery google-genai kfp pydantic


%pip install --no-cache-dir "protobuf>=3.20.2,<4.0.0dev"
%pip install --no-cache-dir "pydantic"

%pip install --no-cache-dir "google-cloud-bigquery<3.0.0"
#%pip install --upgrade pip setuptools wheel

# Install core dependencies first

%pip install --no-cache-dir "setuptools>=65.5.1"
%pip install --no-cache-dir "wheel>=0.40.0"

# Try installing KFP 2.0.1 specifically (a stable version)
%pip install --no-cache-dir kfp>=2.0.0 --use-pep517


# Install remaining dependencies after KFP is installed
%pip install --no-cache-dir "google-cloud-storage>=1.32.0,<3.0.0"
%pip install --no-cache-dir "google-cloud-datastore==1.15.5"
%pip install --no-cache-dir "google-cloud-aiplatform==1.104.0"
%pip install --no-cache-dir "google-cloud-bigquery<3.0.0"
%pip install --no-cache-dir pandas numpy pillow scikit-learn tensorflow
%pip install --no-cache-dir google-auth google-auth-httplib2 google-api-python-client


# Install remaining dependencies
%pip install pandas numpy pillow scikit-learn tensorflow google-auth google-auth-httplib2 google-api-python-client

Files removed: 2112 (787.1 MB)
Note: you may need to restart the kernel to use updated packages.
Found existing installation: google-cloud-aiplatform 1.104.0
Uninstalling google-cloud-aiplatform-1.104.0:
  Successfully uninstalled google-cloud-aiplatform-1.104.0
Found existing installation: google-cloud-storage 2.14.0
Uninstalling google-cloud-storage-2.14.0:
  Successfully uninstalled google-cloud-storage-2.14.0
Found existing installation: google-cloud-datastore 1.15.5
Uninstalling google-cloud-datastore-1.15.5:
  Successfully uninstalled google-cloud-datastore-1.15.5
Found existing installation: protobuf 4.25.8
Uninstalling protobuf-4.25.8:
  Successfully uninstalled protobuf-4.25.8
Found existing installation: google-cloud-bigquery 2.6.1
Uninstalling google-cloud-bigquery-2.6.1:
  Successfully uninstalled google-cloud-bigquery-2.6.1
Found existing installation: google-genai 1.26.0
Uninstalling google-genai-1.26.0:
  Successfully uninstalled google-genai-1.26.0
Found existing instal

You can safely remove it manually.


Collecting protobuf<4.0.0dev,>=3.20.2
  Downloading protobuf-3.20.3-py2.py3-none-any.whl.metadata (720 bytes)
Downloading protobuf-3.20.3-py2.py3-none-any.whl (162 kB)
Installing collected packages: protobuf
Successfully installed protobuf-3.20.3
Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
kfp-pipeline-spec 0.6.0 requires protobuf<5,>=4.21.1, but you have protobuf 3.20.3 which is incompatible.


Collecting pydantic
  Downloading pydantic-2.11.7-py3-none-any.whl.metadata (67 kB)
Downloading pydantic-2.11.7-py3-none-any.whl (444 kB)
Installing collected packages: pydantic
Successfully installed pydantic-2.11.7
Note: you may need to restart the kernel to use updated packages.
Collecting google-cloud-bigquery<3.0.0
  Downloading google_cloud_bigquery-2.6.1-py2.py3-none-any.whl.metadata (7.0 kB)
Collecting google-api-core<2.0.0dev,>=1.23.0 (from google-api-core[grpc]<2.0.0dev,>=1.23.0->google-cloud-bigquery<3.0.0)
  Downloading google_api_core-1.34.1-py3-none-any.whl.metadata (2.4 kB)
Collecting google-cloud-core<2.0dev,>=1.4.1 (from google-cloud-bigquery<3.0.0)
  Downloading google_cloud_core-1.7.3-py2.py3-none-any.whl.metadata (2.4 kB)
Collecting google-resumable-media<2.0dev,>=0.6.0 (from google-cloud-bigquery<3.0.0)
  Downloading google_resumable_media-1.3.3-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting google-auth<3.0dev,>=1.25.0 (from google-api-core<2.0.0dev,>=1.23.0->go

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-cloud-bigquery 2.6.1 requires google-api-core[grpc]<2.0.0dev,>=1.23.0, but you have google-api-core 2.25.1 which is incompatible.
google-cloud-bigquery 2.6.1 requires google-cloud-core<2.0dev,>=1.4.1, but you have google-cloud-core 2.4.3 which is incompatible.
google-cloud-bigquery 2.6.1 requires google-resumable-media<2.0dev,>=0.6.0, but you have google-resumable-media 2.7.2 which is incompatible.


Collecting google-cloud-storage<3.0.0,>=1.32.0
  Downloading google_cloud_storage-2.19.0-py2.py3-none-any.whl.metadata (9.1 kB)
Downloading google_cloud_storage-2.19.0-py2.py3-none-any.whl (131 kB)
Installing collected packages: google-cloud-storage
  Attempting uninstall: google-cloud-storage
    Found existing installation: google-cloud-storage 3.2.0
    Uninstalling google-cloud-storage-3.2.0:
      Successfully uninstalled google-cloud-storage-3.2.0
Successfully installed google-cloud-storage-2.19.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
import io
import os
from datetime import datetime

import numpy as np
import pandas as pd
from google.auth import default
from google.cloud import aiplatform
from google.cloud import storage
from google.cloud.aiplatform import pipeline_jobs
from kfp import compiler
from kfp import dsl
from kfp.dsl import Output, Dataset, Input, Artifact
from PIL import Image



# Get default credentials and project.
# google.auth.default() returns None for the project when it cannot be
# determined from the environment; fail here with an actionable message
# instead of passing None on to the pipeline parameters further down.
credentials, project_id = default()
if not project_id:
    raise RuntimeError(
        "Application Default Credentials did not provide a project ID; "
        "set the GOOGLE_CLOUD_PROJECT environment variable or run "
        "`gcloud config set project <PROJECT_ID>` and restart the kernel."
    )


# Configuration
REGION = "us-central1"  # Vertex AI location used for every job in this notebook
bucket_name = "agrifingcpflow-465809-bucket"  # bucket for sample data and pipeline output
PIPELINE_ROOT = f"gs://{bucket_name}/pipeline_root"  # where pipeline runs store artifacts

In [None]:
# Create sample datasets
def create_sample_data(n_rows: int = 1000, seed: int = 42):
    """Create sample datasets for vision and tabular models.

    Uploads one random 224x224 RGB PNG and one CSV of synthetic farming
    records to the notebook-level ``bucket_name`` bucket.

    Args:
        n_rows: Number of synthetic tabular rows to generate. Defaults to
            1000 because AutoML tabular training requires at least 1,000
            rows.
        seed: Seed for the random generator, so that re-running the cell
            uploads identical sample data.

    Returns:
        tuple: ``(vision_uri, tabular_uri)``, the ``gs://`` folder URIs the
        image and the CSV were uploaded under.

    Raises:
        ValueError: If ``n_rows`` is smaller than 1.
    """
    if n_rows < 1:
        raise ValueError(f"n_rows must be at least 1, got {n_rows}")

    rng = np.random.default_rng(seed)

    vision_prefix = "sample_vision_data"
    tabular_prefix = "sample_tabular_data"

    # Initialize GCS client
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    # Create and upload sample vision data (dummy image).
    # The upper bound is exclusive, so 256 is needed to cover the full
    # 0-255 range of an 8-bit channel.
    pixels = rng.integers(0, 256, size=(224, 224, 3), dtype=np.uint8)
    png_buffer = io.BytesIO()
    Image.fromarray(pixels).save(png_buffer, format='PNG')

    vision_blob = bucket.blob(f'{vision_prefix}/image1.png')
    vision_blob.upload_from_string(png_buffer.getvalue(), content_type='image/png')

    # Create and upload sample tabular data
    df = pd.DataFrame({
        'planting_date': pd.date_range(start='2025-01-01', periods=n_rows),
        'temperature': rng.normal(25, 5, n_rows),
        'rainfall': rng.normal(50, 10, n_rows),
        'soil_quality': rng.choice(['good', 'medium', 'poor'], n_rows),
        'yield': rng.normal(75, 15, n_rows)
    })

    tabular_blob = bucket.blob(f'{tabular_prefix}/farming_data.csv')
    tabular_blob.upload_from_string(df.to_csv(index=False), content_type='text/csv')

    vision_uri = f"gs://{bucket_name}/{vision_prefix}"
    tabular_uri = f"gs://{bucket_name}/{tabular_prefix}"
    return vision_uri, tabular_uri

# Upload the sample files, keep both folder URIs for the pipeline
# parameters, and report where each dataset was written.
vision_uri, tabular_uri = create_sample_data()
print(
    f"Created vision dataset at: {vision_uri}",
    f"Created tabular dataset at: {tabular_uri}",
    sep="\n",
)

In [None]:
@dsl.component(
    packages_to_install=["google-cloud-storage", "pandas", "pillow"],
)
def preprocess_data(
    vision_data: str,
    tabular_data: str,
    bucket_name: str,
    vision_dataset: Output[Artifact],
    tabular_dataset: Output[Artifact]
):
    """
    Preprocess vision and tabular data for training

    Images found under ``vision_data`` are converted to RGB, resized to
    224x224 and re-uploaded as PNG under ``processed_vision_data/``; a CSV
    listing the processed image URIs (one per line) is written next to
    them. CSV files found under ``tabular_data`` are concatenated, cleaned
    and uploaded as ``processed_tabular_data.csv``.

    Args:
        vision_data: GCS URI (folder or single object) of the raw images
        tabular_data: GCS URI (folder or single object) of the raw CSV data
        bucket_name: GCS bucket for processed data
        vision_dataset: Output artifact; its file receives the ``gs://`` URI
            of the image list
        tabular_dataset: Output artifact; its file receives the ``gs://``
            URI of the processed CSV

    Raises:
        ValueError: If an input URI does not start with ``gs://``.
        FileNotFoundError: If no image or no CSV file is found under the
            respective input URI.
    """
    # A lightweight KFP component is executed from its own source in a
    # separate container, so notebook-level imports are not available and
    # everything used here has to be imported inside the function body.
    import io

    import pandas as pd
    from google.cloud import storage
    from PIL import Image

    image_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')

    # Initialize GCS client
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    def list_source_blobs(gcs_uri):
        """Return ``(blob, relative_name)`` pairs for the objects at ``gcs_uri``."""
        if not gcs_uri.startswith('gs://'):
            raise ValueError(f"Expected a gs:// URI, got {gcs_uri!r}")
        source_bucket, _, prefix = gcs_uri[len('gs://'):].partition('/')
        prefix = prefix.rstrip('/')
        folder = f"{prefix}/" if prefix else ""

        found = []
        for blob in storage_client.list_blobs(source_bucket, prefix=prefix):
            if blob.name.endswith('/'):
                # Zero-byte "folder" placeholder object, not real data
                continue
            if blob.name == prefix:
                # The URI names a single object rather than a folder
                found.append((blob, blob.name.rsplit('/', 1)[-1]))
            elif blob.name.startswith(folder):
                # Requiring the trailing slash keeps sibling prefixes such
                # as "<prefix>_old/" out of the result
                found.append((blob, blob.name[len(folder):]))
        return found

    # Process vision data
    def process_image(image_bytes):
        """Decode an image, resize it to 224x224 RGB and return PNG bytes."""
        with Image.open(io.BytesIO(image_bytes)) as img:
            resized = img.convert('RGB').resize((224, 224))  # Standard size for many vision models
        buffer = io.BytesIO()
        resized.save(buffer, format='PNG')
        return buffer.getvalue()

    # Process tabular data
    def process_tabular(df):
        """Fill numeric gaps with column means and derive date features."""
        # Handle missing values; numeric_only keeps text/date columns out of
        # the mean, which would otherwise raise on mixed-type frames.
        df = df.fillna(df.mean(numeric_only=True))

        # Feature engineering
        if "planting_date" in df.columns:
            df["planting_date"] = pd.to_datetime(df["planting_date"])
            df["planting_month"] = df["planting_date"].dt.month
            df["planting_day"] = df["planting_date"].dt.day

        return df

    # ---- Vision: resize every image and record where the copies live ----
    processed_image_uris = []
    for blob, relative_name in list_source_blobs(vision_data):
        if not relative_name.lower().endswith(image_suffixes):
            continue
        target_name = relative_name.rsplit('.', 1)[0] + '.png'
        image_blob = bucket.blob(f'processed_vision_data/{target_name}')
        image_blob.upload_from_string(
            process_image(blob.download_as_bytes()), content_type='image/png'
        )
        processed_image_uris.append(f"gs://{bucket_name}/{image_blob.name}")

    if not processed_image_uris:
        raise FileNotFoundError(f"No image files found under {vision_data}")

    # NOTE(review): the list carries image URIs only. The source layout
    # provides no labels, and a classification training job presumably
    # needs labelled items -- confirm how labels should be supplied.
    vision_blob = bucket.blob('processed_vision_data.csv')
    vision_blob.upload_from_string(
        "\n".join(processed_image_uris) + "\n", content_type='text/csv'
    )
    vision_output_uri = f"gs://{bucket_name}/{vision_blob.name}"

    # Save to the KFP output location
    with open(vision_dataset.path, 'w') as f:
        f.write(vision_output_uri)

    # ---- Tabular: merge all CSV files, clean them, upload one CSV ----
    frames = [
        pd.read_csv(io.BytesIO(blob.download_as_bytes()))
        for blob, relative_name in list_source_blobs(tabular_data)
        if relative_name.lower().endswith('.csv')
    ]
    if not frames:
        raise FileNotFoundError(f"No CSV files found under {tabular_data}")

    processed_df = process_tabular(pd.concat(frames, ignore_index=True))

    tabular_blob = bucket.blob('processed_tabular_data.csv')
    tabular_blob.upload_from_string(
        processed_df.to_csv(index=False), content_type='text/csv'
    )
    tabular_output_uri = f"gs://{bucket_name}/{tabular_blob.name}"

    # Save to the KFP output location
    with open(tabular_dataset.path, 'w') as f:
        f.write(tabular_output_uri)

In [None]:
@dsl.component(
    packages_to_install=["google-cloud-aiplatform"],
)
def train_vision_model(
    project_id: str,
    region: str,
    dataset: Input[Artifact],
    min_accuracy: float,
    model_info: Output[Artifact]
):
    """
    Train AutoML Vision model for crop analysis

    Args:
        project_id: GCP project ID
        region: GCP region
        dataset: Input artifact whose file contains the GCS URI of the
            image import file
        min_accuracy: Minimum required accuracy (compared against the
            ``auRoc`` evaluation metric)
        model_info: Output artifact; receives a JSON document with the keys
            ``model`` (model resource name) and ``accuracy``

    Raises:
        RuntimeError: If the trained model has no evaluation.
        ValueError: If the evaluation metric is below ``min_accuracy``.
    """
    # A lightweight KFP component runs from its own source in a separate
    # container, so its dependencies are imported inside the function body.
    import json

    from google.cloud import aiplatform

    # Read dataset URI from input artifact
    with open(dataset.path, 'r') as f:
        dataset_uri = f.read().strip()

    # Initialize Vertex AI
    aiplatform.init(project=project_id, location=region)

    # Create dataset; the import schema tells Vertex AI how to interpret
    # the import file referenced by gcs_source.
    ai_dataset = aiplatform.ImageDataset.create(
        display_name="crop_vision_dataset",
        gcs_source=dataset_uri,
        import_schema_uri=aiplatform.schema.dataset.ioformat.image.single_label_classification
    )

    # Train model. The training budget is an argument of run(), not of the
    # job constructor.
    job = aiplatform.AutoMLImageTrainingJob(
        display_name="crop_vision_model",
        prediction_type="classification",
        model_type="CLOUD",
        base_model=None
    )

    # Run the training job
    ai_model = job.run(
        dataset=ai_dataset,
        # 8,000 milli node hours (8 node hours) is the smallest budget
        # accepted for a CLOUD image classification model.
        budget_milli_node_hours=8000,
        model_display_name="crop_vision_model",
        training_fraction_split=0.8,
        validation_fraction_split=0.1,
        test_fraction_split=0.1
    )

    # Get model evaluation
    evaluations = ai_model.list_model_evaluations()
    if not evaluations:
        raise RuntimeError(f"No evaluation available for model {ai_model.resource_name}")

    # NOTE(review): assumes the evaluation exposes an 'auRoc' entry for this
    # model type -- confirm against a real evaluation.
    accuracy = float(evaluations[0].metrics['auRoc'])

    # Check if model meets accuracy threshold
    if accuracy < min_accuracy:
        raise ValueError(f"Model accuracy {accuracy} below threshold {min_accuracy}")

    # Write model info to the output artifact. The dict gets its own name
    # so it does not shadow the `model_info` artifact parameter.
    info = {
        'model': ai_model.resource_name,
        'accuracy': accuracy
    }
    with open(model_info.path, 'w') as f:
        json.dump(info, f)


In [None]:
@dsl.component(
    packages_to_install=["google-cloud-aiplatform"],
)
def train_tabular_model(
    project_id: str,
    region: str,
    dataset: Input[Artifact],
    min_accuracy: float,
    model_info: Output[Artifact]
):
    """
    Train AutoML Tabular model for crop yield prediction

    Args:
        project_id: GCP project ID
        region: GCP region
        dataset: Input artifact whose file contains the GCS URI of the
            processed tabular CSV
        min_accuracy: RMSE threshold; despite the name this is an upper
            bound, training fails when the model's RMSE is above it
        model_info: Output artifact; receives a JSON document with the keys
            ``model`` (model resource name) and ``rmse``

    Raises:
        RuntimeError: If the trained model has no evaluation.
        ValueError: If the model's RMSE is above ``min_accuracy``.
    """
    # A lightweight KFP component runs from its own source in a separate
    # container, so its dependencies are imported inside the function body.
    import json

    from google.cloud import aiplatform

    # Read dataset URI from input artifact
    with open(dataset.path, 'r') as f:
        dataset_uri = f.read().strip()

    # Initialize Vertex AI
    aiplatform.init(project=project_id, location=region)

    # Create dataset
    ai_dataset = aiplatform.TabularDataset.create(
        display_name="crop_tabular_dataset",
        gcs_source=dataset_uri
    )

    # Train model. The constructor only describes the job; target column,
    # budget and experiments are arguments of run().
    job = aiplatform.AutoMLTabularTrainingJob(
        display_name="crop_tabular_model",
        optimization_prediction_type="regression",
        optimization_objective="minimize-rmse",
        # Columns match the sample CSV this notebook uploads
        # (planting_date, temperature, rainfall, soil_quality, yield).
        column_transformations=[
            {"numeric": {"column_name": "temperature"}},
            {"numeric": {"column_name": "rainfall"}},
            {"categorical": {"column_name": "soil_quality"}},
            {"timestamp": {"column_name": "planting_date"}}
        ]
    )

    # Run the training job
    ai_model = job.run(
        dataset=ai_dataset,
        target_column="yield",
        # 1,000 milli node hours (1 node hour) is the smallest budget
        # accepted for AutoML tabular training.
        budget_milli_node_hours=1000,
        # NOTE(review): experiment name carried over unchanged -- confirm it
        # is recognised by the service.
        additional_experiments=["enable_model_compression"],
        model_display_name="crop_yield_model",
        training_fraction_split=0.8,
        validation_fraction_split=0.1,
        test_fraction_split=0.1
    )

    # Get model evaluation
    evaluations = ai_model.list_model_evaluations()
    if not evaluations:
        raise RuntimeError(f"No evaluation available for model {ai_model.resource_name}")

    # Regression evaluations report RMSE under 'rootMeanSquaredError'
    rmse = float(evaluations[0].metrics['rootMeanSquaredError'])

    # Check if model meets accuracy threshold
    if rmse > min_accuracy:
        raise ValueError(f"Model RMSE {rmse} above threshold {min_accuracy}")

    # Write model info to the output artifact. The dict gets its own name
    # so it does not shadow the `model_info` artifact parameter.
    info = {
        'model': ai_model.resource_name,
        'rmse': rmse
    }
    with open(model_info.path, 'w') as f:
        json.dump(info, f)


In [None]:
@dsl.component(
    packages_to_install=["google-cloud-aiplatform"],
)
def deploy_models(
    project_id: str,
    region: str,
    vision_model: Input[Artifact],
    tabular_model: Input[Artifact],
    endpoints: Output[Artifact]
):
    """
    Deploy trained models to endpoints

    Args:
        project_id: GCP project ID
        region: GCP region
        vision_model: Input artifact containing vision model information
            (JSON with a ``model`` resource name)
        tabular_model: Input artifact containing tabular model information
            (JSON with a ``model`` resource name)
        endpoints: Output artifact; receives a JSON document with the keys
            ``vision_endpoint`` and ``tabular_endpoint``
    """
    # A lightweight KFP component runs from its own source in a separate
    # container, so its dependencies are imported inside the function body.
    import json

    from google.cloud import aiplatform

    def read_model_name(artifact_path):
        """Return the ``model`` resource name stored in a model-info file."""
        with open(artifact_path, 'r') as f:
            return json.load(f)['model']

    def deploy_single_replica(model_resource_name):
        """Deploy one model with exactly one replica and return the endpoint."""
        model = aiplatform.Model(model_resource_name)
        deploy_kwargs = {'min_replica_count': 1, 'max_replica_count': 1}
        # A machine type may only be requested from models that support
        # dedicated resources; without it the SDK deploys with automatic
        # resources.
        # NOTE(review): AutoML image models are expected to take the
        # automatic path -- confirm on a trained model.
        supported = {
            resource_type.name
            for resource_type in model.supported_deployment_resources_types
        }
        if 'DEDICATED_RESOURCES' in supported:
            deploy_kwargs['machine_type'] = 'n1-standard-4'
        return model.deploy(**deploy_kwargs)

    # Read model info from input artifacts
    vision_model_name = read_model_name(vision_model.path)
    tabular_model_name = read_model_name(tabular_model.path)

    # Initialize Vertex AI
    aiplatform.init(project=project_id, location=region)

    # Deploy vision model
    vision_endpoint = deploy_single_replica(vision_model_name)

    # Deploy tabular model
    tabular_endpoint = deploy_single_replica(tabular_model_name)

    # Write endpoint information to output artifact
    endpoint_info = {
        'vision_endpoint': vision_endpoint.resource_name,
        'tabular_endpoint': tabular_endpoint.resource_name
    }

    with open(endpoints.path, 'w') as f:
        json.dump(endpoint_info, f)


In [None]:
# Define pipeline
# NOTE(review): the name is lower-case with hyphens because KFP appears to
# validate pipeline names against ^[a-z0-9][a-z0-9-]{0,127}$ -- confirm for
# the installed KFP version.
@dsl.pipeline(
    name='agri-automl-pipeline',
    description='End-to-end pipeline for agricultural yield prediction'
)
def agri_automl_pipeline(
    project_id: str,
    region: str,
    bucket_name: str,
    vision_dataset_uri: str,
    tabular_dataset_uri: str,
    min_accuracy: float = 0.8,
    max_rmse: float = 0.8
):
    """Preprocess the data, train both models and deploy them.

    Args:
        project_id: GCP project ID passed to the training and deployment steps
        region: GCP region passed to the training and deployment steps
        bucket_name: GCS bucket the preprocessing step writes to
        vision_dataset_uri: GCS URI of the raw vision data
        tabular_dataset_uri: GCS URI of the raw tabular data
        min_accuracy: Minimum accuracy required from the vision model
        max_rmse: Maximum RMSE accepted from the tabular model. Kept
            separate from ``min_accuracy`` because RMSE is expressed in the
            units of the target column, not on a 0-1 scale. The default
            equals the ``min_accuracy`` default, which is the value the
            tabular step received before this parameter existed.
    """
    # Preprocess data
    preprocess_task = preprocess_data(
        vision_data=vision_dataset_uri,
        tabular_data=tabular_dataset_uri,
        bucket_name=bucket_name
    )

    # Consuming an upstream output already orders the tasks, so no explicit
    # .after() calls are needed below.

    # Train vision model
    train_vision_task = train_vision_model(
        project_id=project_id,
        region=region,
        dataset=preprocess_task.outputs['vision_dataset'],
        min_accuracy=min_accuracy
    )

    # Train tabular model (the component's `min_accuracy` is its RMSE ceiling)
    train_tabular_task = train_tabular_model(
        project_id=project_id,
        region=region,
        dataset=preprocess_task.outputs['tabular_dataset'],
        min_accuracy=max_rmse
    )

    # Deploy models
    deploy_task = deploy_models(
        project_id=project_id,
        region=region,
        vision_model=train_vision_task.outputs['model_info'],
        tabular_model=train_tabular_task.outputs['model_info']
    )

In [None]:
# Point the Vertex AI SDK at the project, region and credentials resolved
# in the configuration cell.
aiplatform.init(project=project_id, location=REGION, credentials=credentials)


# Compile the pipeline function into a local job spec file
pipeline_package_path = 'pipeline.yaml'
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    pipeline_func=agri_automl_pipeline,
    package_path=pipeline_package_path,
)


# Runtime values for the pipeline's parameters
pipeline_parameters = dict(
    project_id=project_id,
    region=REGION,
    bucket_name=bucket_name,
    vision_dataset_uri=vision_uri,
    tabular_dataset_uri=tabular_uri,
    min_accuracy=0.8,
)

# Create the pipeline job from the compiled spec and submit it
job = pipeline_jobs.PipelineJob(
    display_name='agri-automl-pipeline',
    template_path=pipeline_package_path,
    pipeline_root=PIPELINE_ROOT,
    parameter_values=pipeline_parameters,
)

job.submit()