# Crop Yield Prediction Pipeline in Vertex AI

This notebook builds a Kubeflow Pipelines (KFP) workflow that preprocesses tabular farm data, trains a crop yield prediction model, and deploys it, and then submits that pipeline to Vertex AI Pipelines.

In [None]:
%pip cache purge

# Install core dependencies
%pip install --no-cache-dir "setuptools>=65.5.1"
%pip install --no-cache-dir "wheel>=0.40.0"
%pip install --no-cache-dir "protobuf>=4.21.1,<5.0.0"
%pip install --no-cache-dir "pydantic>=2.0.0,<3.0.0"

# Install GCP dependencies
%pip install --no-cache-dir "google-cloud-aiplatform==1.104.0"
%pip install --no-cache-dir "google-cloud-storage>=2.0.0"
%pip install --no-cache-dir 'google-cloud-bigquery[bqstorage,pandas]>=3.31.0,<4.0.0'

# Install KFP and related packages
%pip install --no-cache-dir kfp>=2.0.0 --use-pep517
%pip install --no-cache-dir 'kfp-pipeline-spec==0.6.0'

# Install data processing packages
%pip install --no-cache-dir pandas numpy scikit-learn

In [None]:
import os
from google.cloud import aiplatform
from google.cloud import storage
from google.auth import default
from datetime import datetime
from kfp import dsl, compiler
from kfp.dsl import Output, Dataset, Input, Artifact
from google.cloud.aiplatform import pipeline_jobs
import pandas as pd
import numpy as np

# Static settings shared by every cell below
REGION = "us-central1"
bucket_name = "agrifingcpflow-465809-bucket"
# Vertex AI writes pipeline run artifacts beneath this GCS prefix
PIPELINE_ROOT = f"gs://{bucket_name}/pipeline_root"

# Resolve Application Default Credentials and the project they are bound to
credentials, project_id = default()

In [None]:
def create_sample_data(n_rows: int = 100, seed: int = 42):
    """Create a synthetic crop-yield table and upload it to GCS as CSV.

    The table has four feature columns (``field_size``, ``temperature``,
    ``rainfall``, ``soil_quality``) and one target column (``yield``), all
    drawn at random. A seeded generator is used so that re-running the
    notebook from a fresh kernel uploads the same dataset.

    Args:
        n_rows: Number of rows to generate.
        seed: Seed for the NumPy random generator.

    Returns:
        The ``gs://`` URI of the uploaded CSV inside ``bucket_name``.
    """
    rng = np.random.default_rng(seed)

    # Create sample tabular data
    df = pd.DataFrame({
        'field_size': rng.uniform(1, 100, n_rows),
        'temperature': rng.normal(25, 5, n_rows),
        'rainfall': rng.normal(50, 10, n_rows),
        'soil_quality': rng.choice(['good', 'medium', 'poor'], n_rows),
        'yield': rng.normal(75, 15, n_rows),
    })

    # Upload to GCS; the object path is defined once so the upload target and
    # the returned URI cannot drift apart.
    blob_name = 'sample_tabular_data/farming_data.csv'
    storage_client = storage.Client()
    blob = storage_client.bucket(bucket_name).blob(blob_name)
    blob.upload_from_string(df.to_csv(index=False), content_type='text/csv')

    return f"gs://{bucket_name}/{blob_name}"

# Create the sample data and get the URI
tabular_uri = create_sample_data()
print(f"Created tabular dataset at: {tabular_uri}")

In [None]:
@dsl.component(
    packages_to_install=[
        'google-cloud-storage>=2.0.0',
        'google-cloud-aiplatform==1.104.0',
        'pandas',
        'scikit-learn'
    ]
)
def preprocess_data(
    tabular_data: str,
    bucket_name: str,
    project_id: str,
    region: str,
    tabular_dataset: Output[Dataset]
):
    """Preprocess tabular data for crop yield prediction.

    Loads the raw CSV, label-encodes the ``soil_quality`` column, uploads the
    result to ``gs://<bucket_name>/processed_data/farming_data_processed.csv``
    and writes that URI (not the data itself) into the output artifact file.

    Args:
        tabular_data: Location of the raw CSV. ``gs://`` URIs are fetched with
            the Cloud Storage client; anything else is handed to pandas as-is.
        bucket_name: Bucket that receives the processed CSV.
        project_id: Accepted for interface parity; not used in this step.
        region: Accepted for interface parity; not used in this step.
        tabular_dataset: Output artifact whose file receives the processed
            CSV's ``gs://`` URI as plain text.
    """
    import io

    from google.cloud import storage
    import pandas as pd
    from sklearn.preprocessing import LabelEncoder

    storage_client = storage.Client()

    # Read data. pandas can only open gs:// URLs when the optional gcsfs
    # package is installed, and it is not in packages_to_install, so GCS
    # inputs are downloaded through the storage client instead.
    gcs_prefix = "gs://"
    if tabular_data.startswith(gcs_prefix):
        source_bucket, _, source_blob = tabular_data[len(gcs_prefix):].partition("/")
        raw_csv = storage_client.bucket(source_bucket).blob(source_blob).download_as_text()
        df = pd.read_csv(io.StringIO(raw_csv))
    else:
        df = pd.read_csv(tabular_data)

    # Encode categorical variables
    le = LabelEncoder()
    df['soil_quality'] = le.fit_transform(df['soil_quality'])

    # Save processed data
    processed_blob_name = 'processed_data/farming_data_processed.csv'
    output_uri = f"gs://{bucket_name}/{processed_blob_name}"
    blob = storage_client.bucket(bucket_name).blob(processed_blob_name)
    blob.upload_from_string(df.to_csv(index=False), content_type='text/csv')

    # Save to the KFP output location
    with open(tabular_dataset.path, 'w') as f:
        f.write(output_uri)

In [None]:
@dsl.pipeline(
    # KFP v2 validates pipeline names against ^[a-z0-9][a-z0-9-]{0,127}$, so
    # spaces and capitals are rejected; this also matches the job display name.
    name='crop-yield-prediction-pipeline',
    description='Pipeline for agricultural yield prediction using tabular data'
)
def crop_prediction_pipeline(
    project_id: str,
    region: str,
    bucket_name: str,
    tabular_dataset_uri: str,
    min_accuracy: float = 0.8
):
    """Chain preprocessing, training and deployment for crop yield prediction.

    Args:
        project_id: GCP project forwarded to every step.
        region: GCP region forwarded to every step.
        bucket_name: Bucket the preprocessing step writes its output to.
        tabular_dataset_uri: Location of the raw tabular CSV.
        min_accuracy: Threshold forwarded to the training step.
    """
    # NOTE(review): train_tabular_model and deploy_model are not defined in the
    # cells visible here; they must be defined or imported before this cell runs.

    # Preprocess data
    preprocess_task = preprocess_data(
        tabular_data=tabular_dataset_uri,
        bucket_name=bucket_name,
        project_id=project_id,
        region=region
    )

    # Train tabular model
    train_tabular_task = train_tabular_model(
        project_id=project_id,
        region=region,
        dataset=preprocess_task.outputs['tabular_dataset'],
        min_accuracy=min_accuracy
    )
    # Explicit ordering; the data dependency on the dataset output already implies it
    train_tabular_task.after(preprocess_task)

    # Deploy model
    deploy_task = deploy_model(
        project_id=project_id,
        region=region,
        model=train_tabular_task.outputs['model_info']
    )
    # Explicit ordering; the data dependency on model_info already implies it
    deploy_task.after(train_tabular_task)

In [None]:
# Report which project, region and identity this session is using
print(f"Current Project ID: {project_id}")
print(f"Current Region: {REGION}")
# Only some credential types expose service_account_email; fall back to a label otherwise
account_label = getattr(credentials, 'service_account_email', "User Account")
print("Authenticated as:", account_label)

# Smoke-test Cloud Storage access by listing at most one bucket
storage_client = storage.Client()
try:
    buckets = list(storage_client.list_buckets(max_results=1))
    print("✅ Storage API access successful")
except Exception as err:
    # Diagnostic only: report the failure and carry on
    print("❌ Storage API access failed:", err)

# Point the Vertex AI SDK at the project, region and credentials
aiplatform.init(project=project_id, location=REGION, credentials=credentials)

# Compile the pipeline function into a package file
compiled_package = 'pipeline.yaml'
compiler.Compiler().compile(
    pipeline_func=crop_prediction_pipeline,
    package_path=compiled_package,
)

# Runtime arguments, one entry per pipeline parameter
run_parameters = {
    'project_id': project_id,
    'region': REGION,
    'bucket_name': bucket_name,
    'tabular_dataset_uri': tabular_uri,
    'min_accuracy': 0.8,
}

# Build the pipeline job from the compiled package and submit it
job = pipeline_jobs.PipelineJob(
    display_name='crop-yield-prediction-pipeline',
    template_path=compiled_package,
    pipeline_root=PIPELINE_ROOT,
    parameter_values=run_parameters,
)
job.submit()