# Customer.IO Data Pipelines API - Advanced Pipeline Integration

## Purpose

This notebook demonstrates advanced data pipeline integration and orchestration with Customer.IO's Data Pipelines API.
It covers pipeline design patterns, data flow orchestration, transformation pipelines, scheduling, dependency management, and integration with external data sources.

## Prerequisites

- Complete setup from `00_setup_and_configuration.ipynb`
- Complete authentication setup from `01_authentication_and_utilities.ipynb`
- Understanding of batch operations from `09_batch_operations.ipynb`
- Customer.IO API key configured in Databricks secrets
- Understanding of data pipeline concepts and ETL processes

## Key Concepts

- **Pipeline Orchestration**: Coordinating complex data workflows
- **Data Transformation**: ETL processes and data quality management
- **Dependency Management**: Pipeline dependencies and execution order
- **Scheduling**: Time-based and event-driven pipeline execution
- **Data Quality**: Validation, cleansing, and enrichment
- **Integration Patterns**: Connecting multiple data sources and systems

## Pipeline Operations Covered

1. **Pipeline Design**: Multi-stage pipelines with dependencies
2. **Data Ingestion**: Batch and streaming data ingestion patterns
3. **Transformation**: Data cleansing, enrichment, and standardization
4. **Orchestration**: Workflow management and execution coordination
5. **Monitoring**: Pipeline health, performance, and data quality metrics
6. **Integration**: External systems, APIs, and data sources

## Setup and Imports

In [ ]:
# Standard library imports
import json
import os
import statistics
import sys
import time
import uuid
from collections import defaultdict, deque
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Optional, Tuple, Union

# Import warnings handler
import warnings
warnings.filterwarnings('ignore')

print("SUCCESS: Standard libraries imported")

In [None]:
# Add utils directory to Python path. The membership check keeps the cell
# idempotent: re-running it no longer appends duplicate entries to sys.path.
# NOTE(review): later cells import `utils.<module>`, which resolves against the
# parent directory of a `utils` package rather than the `utils` directory
# itself — confirm this is the entry the imports actually rely on.
_UTILS_PATH = '/Workspace/Repos/customer_io_notebooks/utils'
if _UTILS_PATH not in sys.path:
    sys.path.append(_UTILS_PATH)
print("SUCCESS: Utils directory added to Python path")

In [ ]:
# Import Customer.IO utilities including PipelineManager
from utils.api_client import CustomerIOClient
from utils.authentication_manager import AuthenticationManager, AuthenticationConfig
from utils.pipeline_manager import (
    PipelineManager,
    PipelineStageType,
    PipelineStatus,
    ExecutionStrategy,
    DataSourceType,
    DataQualityRule,
    TriggerType,
    DataQualityMetrics,
    PipelineExecution,
    PipelineStage
)
from utils.event_manager import EventManager
from utils.people_manager import PeopleManager

print("SUCCESS: Customer.IO utilities imported with existing PipelineManager")

In [ ]:
# Import validation and transformation utilities
from utils.validators import validate_request_size, create_context
from utils.transformers import BatchTransformer, ContextTransformer
from utils.error_handlers import (
    CustomerIOError,
    RateLimitError,
    ValidationError,
    NetworkError,
    retry_on_error,
    ErrorContext
)

print("SUCCESS: Validation and transformation utilities imported")

In [ ]:
# Import Databricks and Spark utilities
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from delta.tables import DeltaTable

# Import validation and logging
import structlog
from pydantic import ValidationError as PydanticValidationError

# Initialize logger
logger = structlog.get_logger("pipeline_integration")

print("SUCCESS: Databricks, Spark, and logging utilities imported")

In [ ]:
# Resolve the notebook configuration from the widgets of the setup notebook.
# An empty widget value is replaced by its default; if reading the widgets
# fails for any reason, every setting falls back to its default.
try:
    CUSTOMERIO_REGION, DATABASE_NAME, CATALOG_NAME, ENVIRONMENT = [
        dbutils.widgets.get(widget_name) or fallback
        for widget_name, fallback in (
            ("customerio_region", "us"),
            ("database_name", "customerio_demo"),
            ("catalog_name", "main"),
            ("environment", "test"),
        )
    ]

    print("Configuration loaded from setup notebook:")
    print(f"  Region: {CUSTOMERIO_REGION}")
    print(f"  Database: {CATALOG_NAME}.{DATABASE_NAME}")
    print(f"  Environment: {ENVIRONMENT}")

except Exception as exc:
    print(f"WARNING: Could not load configuration from setup notebook: {exc}")
    print("INFO: Using fallback configuration")
    CUSTOMERIO_REGION, DATABASE_NAME, CATALOG_NAME, ENVIRONMENT = (
        "us",
        "customerio_demo",
        "main",
        "test",
    )

# Make the configured catalog/database the session default
spark.sql(f"USE {CATALOG_NAME}.{DATABASE_NAME}")
print("SUCCESS: Database configured")

## Pipeline Setup, Creation, and Execution

In [ ]:
# Wire up authentication and the manager objects used by the rest of the notebook
try:
    # Read the API key from the "customerio" secret scope
    CUSTOMERIO_API_KEY = dbutils.secrets.get("customerio", "api_key")

    # Bundle the key with the region and environment resolved earlier
    auth_config = AuthenticationConfig(
        api_key=CUSTOMERIO_API_KEY, region=CUSTOMERIO_REGION, environment=ENVIRONMENT
    )

    # connect() returns the client object handed to every manager below
    auth_manager = AuthenticationManager(auth_config, spark)
    client = auth_manager.connect()

    # Orchestration goes through the existing PipelineManager implementation
    pipeline_manager = PipelineManager(client, spark)

    # Event and people managers are built on the same client
    event_manager = EventManager(client)
    people_manager = PeopleManager(client)

    print("SUCCESS: PipelineManager and other managers initialized")
    print("  PipelineManager ready for orchestration")
    print("  Event and People managers available for data operations")

except Exception as exc:
    # Report the failure in the cell output, then let it propagate
    print(f"ERROR: Failed to initialize managers: {exc}")
    raise

In [ ]:
# Build the customer data pipeline definition through PipelineManager
print("=== Creating Customer Data Pipeline ===")

# One database-backed source feeds the pipeline
data_sources = [
    dict(
        source_id="customer_database",
        source_type=DataSourceType.DATABASE,
        config=dict(table_name="customers", batch_size=1000, quality_checks=True),
    )
]

# Settings passed to PipelineManager.create_pipeline
pipeline_config = dict(
    pipeline_id="customer_data_pipeline_v2",
    pipeline_name="Customer Data Integration Pipeline",
    description="Comprehensive customer data ingestion and processing",
    version="2.0",
    environment=ENVIRONMENT,
    data_sources=data_sources,
    quality_threshold=0.85,
    enable_monitoring=True,
    retry_failed_stages=True,
    max_parallel_stages=3,
)

# The manager returns the resulting pipeline definition
pipeline_definition = pipeline_manager.create_pipeline(pipeline_config)

print("Pipeline created successfully:")
print(f"  Pipeline ID: {pipeline_definition['pipeline_id']}")
print(f"  Version: {pipeline_definition['version']}")
print(f"  Total Stages: {len(pipeline_definition['stages'])}")
print(f"  Data Sources: {len(pipeline_definition['data_sources'])}")

# List each stage with a 1-based position and its dependencies
print("\nPipeline Stages:")
for position, stage in enumerate(pipeline_definition['stages'], start=1):
    stage_dependencies = stage.get('dependencies')
    deps = ", ".join(stage_dependencies) if stage_dependencies else "None"
    print(f"  {position}. {stage['stage_name']} ({stage['stage_type']}) - Dependencies: {deps}")

In [ ]:
# Run the pipeline through PipelineManager and report the outcome
print("=== Pipeline Execution ===")

# Options passed to PipelineManager.execute_pipeline
execution_config = dict(
    execution_strategy=ExecutionStrategy.HYBRID,
    trigger_type=TriggerType.MANUAL,
    enable_monitoring=True,
    enable_alerts=True,
    timeout_minutes=60,
    max_retries=3,
)

# Time the call from the notebook side
print("Starting pipeline execution...")
start_time = time.time()

execution_result = pipeline_manager.execute_pipeline(
    pipeline_definition['pipeline_id'], execution_config
)

end_time = time.time()
execution_time = end_time - start_time

print("\nPipeline execution completed:")
print(f"  Execution ID: {execution_result['execution_id']}")
print(f"  Status: {execution_result['status']}")
print(f"  Total Time: {execution_time:.2f} seconds")
print(f"  Stages Completed: {execution_result['completed_stages']}")
print(f"  Records Processed: {execution_result.get('records_processed', 0):,}")

# Quality figures are printed only when the result carries them
if execution_result.get('quality_metrics'):
    qm = execution_result['quality_metrics']
    for label, metric_key in (("Valid Records", 'valid_records'), ("Invalid Records", 'invalid_records')):
        if metric_key == 'valid_records':
            print(f"  Overall Quality Score: {qm.get('overall_score', 0):.3f}")
        print(f"  {label}: {qm.get(metric_key, 0):,}")

print("SUCCESS: Pipeline executed using PipelineManager")

# Run PipelineManager's validation on the pipeline definition and print the report
print("=== Pipeline Validation ===")

# Ask the manager to validate the definition
validation_result = pipeline_manager.validate_pipeline(pipeline_definition)

print("Pipeline validation results:")
print(f"  Valid: {validation_result['is_valid']}")
print(f"  Stage count: {validation_result['stage_count']}")
print(f"  Dependency validation: {validation_result['dependencies_valid']}")

# Warnings first, then errors; a section is printed only when it has entries
for section in ("warnings", "errors"):
    findings = validation_result.get(section)
    if not findings:
        continue
    print(f"  {section.capitalize()}: {len(findings)}")
    for finding in findings:
        print(f"    - {finding}")

print("SUCCESS: Pipeline validation completed")

## Pipeline Monitoring

In [ ]:
# Pull execution history, aggregate metrics and anomaly reports from PipelineManager
print("=== Pipeline Monitoring ===")

# Ask the manager for the pipeline's execution history (limit=10)
execution_history = pipeline_manager.get_execution_history(
    pipeline_id=pipeline_definition['pipeline_id'], limit=10
)

print(f"Recent executions for {pipeline_definition['pipeline_id']}:")
print(f"  Total executions found: {len(execution_history)}")

# Print at most five entries, each prefixed with a status glyph
for position, execution in enumerate(execution_history[:5], start=1):
    if execution['status'] == 'completed':
        status_icon = "✓"
    elif execution['status'] == 'failed':
        status_icon = "✗"
    else:
        status_icon = "⏳"
    short_id = execution['execution_id'][:8]
    duration = execution.get('execution_time', 'N/A')
    print(f"  {position}. {status_icon} {short_id}... - {execution['status']} ({duration})")

# Aggregate metrics; a missing key is displayed as 0
metrics = pipeline_manager.get_pipeline_metrics(pipeline_definition['pipeline_id'])

print("\nPipeline Performance Metrics:")
print(f"  Success Rate: {metrics.get('success_rate_percent', 0):.1f}%")
print(f"  Average Execution Time: {metrics.get('avg_execution_time_minutes', 0):.2f} minutes")
print(f"  Average Quality Score: {metrics.get('avg_quality_score', 0):.3f}")
print(f"  Total Records Processed: {metrics.get('total_records_processed', 0):,}")

# Anomaly report from the manager
anomalies = pipeline_manager.detect_performance_anomalies(pipeline_definition['pipeline_id'])

print("\nAnomaly Detection:")
if not anomalies:
    print("  No performance anomalies detected")
else:
    print(f"  Found {len(anomalies)} potential issues:")
    for anomaly in anomalies:
        print(f"    [{anomaly['severity']}] {anomaly['description']}")

print("SUCCESS: Pipeline monitoring data retrieved")

## Pipeline Validation

In [ ]:
# Run PipelineManager's validation on the pipeline definition and print the report
print("=== Pipeline Validation ===")

# Ask the manager to validate the definition
validation_result = pipeline_manager.validate_pipeline(pipeline_definition)

print("Pipeline validation results:")
print(f"  Valid: {validation_result['is_valid']}")
print(f"  Stage count: {validation_result['stage_count']}")
print(f"  Dependency validation: {validation_result['dependencies_valid']}")

# Warnings first, then errors; a section is printed only when it has entries
for section in ("warnings", "errors"):
    findings = validation_result.get(section)
    if not findings:
        continue
    print(f"  {section.capitalize()}: {len(findings)}")
    for finding in findings:
        print(f"    - {finding}")

print("SUCCESS: Pipeline validation completed")

## Advanced Pipeline Features

In [ ]:
# Walk through scheduling, cross-pipeline dependencies and versioning
print("=== Advanced Pipeline Features ===")

# 1. Scheduling: register a cron-based trigger for the pipeline
schedule_config = dict(
    trigger_type=TriggerType.SCHEDULE,
    cron_expression="0 2 * * *",  # minute 0, hour 2, every day
    timezone="UTC",
    enabled=True,
)

pipeline_manager.schedule_pipeline(pipeline_definition['pipeline_id'], schedule_config)

print("Pipeline scheduled:")
print("  Schedule: Daily at 2:00 AM UTC")
print("  Status: Enabled")

# 2. Dependencies: a second pipeline that lists the first one as a dependency
dependent_pipeline_config = dict(
    pipeline_id="event_processing_pipeline",
    pipeline_name="Event Processing Pipeline",
    description="Process events after customer data is loaded",
    dependencies=[pipeline_definition['pipeline_id']],
    data_sources=[],
    quality_threshold=0.9,
)

dependent_pipeline = pipeline_manager.create_pipeline(dependent_pipeline_config)

print("\nDependent pipeline created:")
print(f"  Pipeline: {dependent_pipeline['pipeline_name']}")
upstream_pipelines = ', '.join(dependent_pipeline['dependencies'])
print(f"  Depends on: {upstream_pipelines}")

# 3. Versioning: register version 2.1 of the original pipeline
version_details = dict(
    version="2.1",
    changes=["Improved data quality rules", "Added monitoring"],
    backward_compatible=True,
)
new_version = pipeline_manager.create_pipeline_version(
    pipeline_definition['pipeline_id'], version_details
)

print("\nPipeline versioning:")
print(f"  New version: {new_version['version']}")
print(f"  Changes: {len(new_version['changes'])} improvements")
print(f"  Backward compatible: {new_version['backward_compatible']}")

print("SUCCESS: Advanced pipeline features demonstrated")

In [None]:
# Set up and execute the pipeline
# NOTE(review): PipelineOrchestrator, DataExtractionStage, DataTransformationStage,
# CustomerIOLoadStage and customer_pipeline are neither defined nor imported in
# the visible part of this notebook — confirm they exist before running this cell.
orchestrator = PipelineOrchestrator(client)

# Register stage executors
orchestrator.register_stage_executor(PipelineStageType.EXTRACT, DataExtractionStage)
orchestrator.register_stage_executor(PipelineStageType.TRANSFORM, DataTransformationStage)
orchestrator.register_stage_executor(PipelineStageType.VALIDATE, DataTransformationStage)  # Reuse transform logic
orchestrator.register_stage_executor(PipelineStageType.LOAD, CustomerIOLoadStage)

print("=== Executing Customer Data Pipeline ===")

# Execute the pipeline, timing the call from the notebook side
start_time = time.time()
completed_pipeline = orchestrator.execute_pipeline(customer_pipeline)
end_time = time.time()

actual_execution_time = end_time - start_time

print("\n=== Pipeline Execution Results ===")
print(f"Execution ID: {completed_pipeline.execution_id}")
print(f"Final Status: {completed_pipeline.status}")
print(f"Progress: {completed_pipeline.get_progress_percent():.1f}%")
print(f"Total Execution Time: {completed_pipeline.total_execution_time_minutes:.2f} minutes")
print(f"Records Processed: {completed_pipeline.total_records_processed:,}")
# Format the score on its own so the label is always printed. Previously the
# conditional wrapped the whole f-string, so a missing score printed a bare
# "N/A" line without the "Overall Quality Score:" label.
overall_quality_text = (
    f"{completed_pipeline.overall_quality_score:.3f}"
    if completed_pipeline.overall_quality_score
    else "N/A"
)
print(f"Overall Quality Score: {overall_quality_text}")

print("\n=== Stage Results ===")
for stage in completed_pipeline.stages:
    status_icon = "✓" if stage.status == PipelineStatus.COMPLETED else "✗" if stage.status == PipelineStatus.FAILED else "⏳"
    # Missing (or zero) runtime and absent quality metrics are shown as "N/A"
    runtime = f"{stage.execution_time_minutes:.2f}m" if stage.execution_time_minutes else "N/A"
    quality = f"{stage.quality_metrics.quality_score:.3f}" if stage.quality_metrics else "N/A"

    print(f"  {status_icon} {stage.stage_name}: {stage.status} ({runtime}, quality: {quality})")

    if stage.error_message:
        print(f"    Error: {stage.error_message}")

    if stage.quality_metrics:
        qm = stage.quality_metrics
        print(f"    Records: {qm.total_records} total, {qm.valid_records} valid, {qm.invalid_records} invalid")

print("\n=== Execution Summary ===")
print(f"Completed Stages: {len(completed_pipeline.completed_stages)}")
print(f"Failed Stages: {len(completed_pipeline.failed_stages)}")
print(f"Skipped Stages: {len(completed_pipeline.skipped_stages)}")

if completed_pipeline.failed_stages:
    print(f"Failed Stage IDs: {', '.join(completed_pipeline.failed_stages)}")

In [None]:
# Implementation: Pipeline monitoring and analytics
class PipelineMonitor:
    """Record pipeline executions in memory and derive analytics from them.

    Executions are added with :meth:`record_execution`. The remaining methods
    only read what has been recorded so far:

    * :meth:`get_pipeline_analytics` - success rates and performance statistics
      for one pipeline.
    * :meth:`detect_anomalies` - compares the latest metric sample of a
      pipeline against the average of its recorded samples.
    * :meth:`get_system_health` - success ratio over the last hour across all
      recorded pipelines.
    """

    def __init__(self) -> None:
        """Create an empty monitor."""
        self.logger = structlog.get_logger("pipeline_monitor")
        # Bounded history: once 1000 records are stored, the oldest is evicted
        # for every new one.
        self.execution_history = deque(maxlen=1000)
        # Metric series keyed "<pipeline_id>_<metric>". Unlike the history
        # above, these lists are not bounded.
        self.performance_metrics = defaultdict(list)

    @staticmethod
    def _rounded_mean(values: List[float], ndigits: int) -> float:
        """Return the mean of ``values`` rounded to ``ndigits``, or 0 if empty."""
        return round(statistics.mean(values), ndigits) if values else 0

    @staticmethod
    def _count_status(records: List[Dict[str, Any]], status: Any) -> int:
        """Count the records whose ``"status"`` equals ``status``."""
        return sum(1 for record in records if record["status"] == status)

    def record_execution(self, execution: "PipelineExecution") -> None:
        """Store a summary of ``execution`` and update the metric series.

        Args:
            execution: Finished (or failed) pipeline execution. Only its
                attributes are read; the object itself is not retained.
        """
        self.execution_history.append({
            "execution_id": execution.execution_id,
            "pipeline_id": execution.pipeline_id,
            "status": execution.status,
            "started_at": execution.started_at,
            "completed_at": execution.completed_at,
            "total_execution_time_minutes": execution.total_execution_time_minutes,
            "records_processed": execution.total_records_processed,
            "overall_quality_score": execution.overall_quality_score,
            "stage_count": len(execution.stages),
            "completed_stages": len(execution.completed_stages),
            "failed_stages": len(execution.failed_stages)
        })

        # NOTE(review): the truthiness checks below skip None *and* 0, so e.g.
        # a quality score of exactly 0.0 is never recorded — confirm intended.
        pipeline_id = execution.pipeline_id
        elapsed_minutes = execution.total_execution_time_minutes

        if elapsed_minutes:
            self.performance_metrics[f"{pipeline_id}_execution_time"].append(elapsed_minutes)

        if execution.overall_quality_score:
            self.performance_metrics[f"{pipeline_id}_quality_score"].append(
                execution.overall_quality_score
            )

        if execution.total_records_processed:
            # Throughput is records per minute; a missing/zero duration is
            # treated as one minute to avoid dividing by zero.
            self.performance_metrics[f"{pipeline_id}_throughput"].append(
                execution.total_records_processed / (elapsed_minutes or 1)
            )

    def get_pipeline_analytics(self, pipeline_id: str) -> Dict[str, Any]:
        """Summarise all recorded executions of one pipeline.

        Args:
            pipeline_id: Identifier of the pipeline to analyse.

        Returns:
            A dict with execution counts, ``success_rate_percent``,
            ``performance_stats``, ``recent_trend`` (the ten most recently
            started executions), ``last_execution`` and ``analyzed_at``.
            If nothing was recorded for ``pipeline_id`` the result is
            ``{"error": "No execution data found for pipeline"}``.
        """
        records = [
            record for record in self.execution_history
            if record["pipeline_id"] == pipeline_id
        ]
        if not records:
            return {"error": "No execution data found for pipeline"}

        total_executions = len(records)
        successful_executions = self._count_status(records, PipelineStatus.COMPLETED)
        failed_executions = self._count_status(records, PipelineStatus.FAILED)
        success_rate = successful_executions / total_executions * 100

        # Falsy values (None or 0) are left out of the statistics.
        execution_times = [
            r["total_execution_time_minutes"] for r in records
            if r["total_execution_time_minutes"]
        ]
        quality_scores = [
            r["overall_quality_score"] for r in records
            if r["overall_quality_score"]
        ]
        records_processed = [
            r["records_processed"] for r in records
            if r["records_processed"]
        ]

        # Ten most recently started executions; records without a start time
        # sort last.
        # NOTE(review): the fallback key is timezone-aware, so this presumably
        # expects aware `started_at` values — confirm.
        recent_executions = sorted(
            records,
            key=lambda r: r["started_at"] or datetime.min.replace(tzinfo=timezone.utc),
            reverse=True
        )[:10]
        # Collected separately so that recent runs without any recorded
        # duration yield 0 instead of statistics.mean([]) raising
        # StatisticsError.
        recent_times = [
            r["total_execution_time_minutes"] for r in recent_executions
            if r["total_execution_time_minutes"]
        ]

        return {
            "pipeline_id": pipeline_id,
            "total_executions": total_executions,
            "successful_executions": successful_executions,
            "failed_executions": failed_executions,
            "success_rate_percent": round(success_rate, 2),
            "performance_stats": {
                "avg_execution_time_minutes": self._rounded_mean(execution_times, 2),
                "min_execution_time_minutes": round(min(execution_times), 2) if execution_times else 0,
                "max_execution_time_minutes": round(max(execution_times), 2) if execution_times else 0,
                "avg_quality_score": self._rounded_mean(quality_scores, 3),
                "avg_records_per_execution": self._rounded_mean(records_processed, 0),
                "total_records_processed": sum(records_processed)
            },
            "recent_trend": {
                "last_10_executions": len(recent_executions),
                "recent_success_rate": (
                    self._count_status(recent_executions, PipelineStatus.COMPLETED)
                    / len(recent_executions) * 100
                ),
                "recent_avg_time": self._rounded_mean(recent_times, 2)
            },
            "last_execution": recent_executions[0],
            "analyzed_at": datetime.now(timezone.utc).isoformat()
        }

    def detect_anomalies(self, pipeline_id: str) -> List[Dict[str, Any]]:
        """Compare the latest metric samples of a pipeline with their averages.

        Each check runs only when at least five samples exist for its metric.
        The latest sample is itself part of the average it is compared with.

        Args:
            pipeline_id: Identifier of the pipeline to inspect.

        Returns:
            A list of anomaly dicts (``type``, ``severity``, ``description``,
            ``metric``, ``value``, ``threshold``); empty when nothing is flagged.
        """
        anomalies = []

        # .get() avoids creating empty series in the defaultdict.
        execution_times = self.performance_metrics.get(f"{pipeline_id}_execution_time", [])
        quality_scores = self.performance_metrics.get(f"{pipeline_id}_quality_score", [])
        throughputs = self.performance_metrics.get(f"{pipeline_id}_throughput", [])

        # Slow execution: latest run more than two standard deviations above the mean.
        if len(execution_times) >= 5:
            avg_time = statistics.mean(execution_times)
            slow_threshold = avg_time + (2 * statistics.stdev(execution_times))
            recent_time = execution_times[-1]

            if recent_time > slow_threshold:
                anomalies.append({
                    "type": "slow_execution",
                    "severity": "warning",
                    "description": f"Execution time {recent_time:.2f}m is significantly above average {avg_time:.2f}m",
                    "metric": "execution_time",
                    "value": recent_time,
                    "threshold": slow_threshold
                })

        # Quality degradation: latest score more than 0.1 below the mean.
        if len(quality_scores) >= 5:
            avg_quality = statistics.mean(quality_scores)
            quality_threshold = avg_quality - 0.1
            recent_quality = quality_scores[-1]

            if recent_quality < quality_threshold:
                anomalies.append({
                    "type": "quality_degradation",
                    "severity": "critical",
                    "description": f"Quality score {recent_quality:.3f} is significantly below average {avg_quality:.3f}",
                    "metric": "quality_score",
                    "value": recent_quality,
                    "threshold": quality_threshold
                })

        # Low throughput: latest value below 70% of the mean.
        if len(throughputs) >= 5:
            avg_throughput = statistics.mean(throughputs)
            throughput_threshold = avg_throughput * 0.7
            recent_throughput = throughputs[-1]

            if recent_throughput < throughput_threshold:
                anomalies.append({
                    "type": "low_throughput",
                    "severity": "warning",
                    "description": f"Throughput {recent_throughput:.1f} records/min is significantly below average {avg_throughput:.1f}",
                    "metric": "throughput",
                    "value": recent_throughput,
                    "threshold": throughput_threshold
                })

        return anomalies

    def get_system_health(self) -> Dict[str, Any]:
        """Report the success ratio of executions started within the last hour.

        Returns:
            ``{"status": "no_data"}`` when nothing has been recorded. Otherwise
            a dict with ``status`` ("healthy" at >= 95%, "degraded" at >= 80%,
            else "unhealthy"), ``health_score_percent``, per-hour counts,
            ``total_pipelines_monitored`` and ``last_updated``. With no
            executions in the last hour the health score is 100.
        """
        if not self.execution_history:
            return {"status": "no_data"}

        # NOTE(review): comparing against an aware cutoff presumably requires
        # timezone-aware `started_at` values — confirm against the producer.
        cutoff_time = datetime.now(timezone.utc) - timedelta(hours=1)
        recent_executions = [
            record for record in self.execution_history
            if record["started_at"] and record["started_at"] > cutoff_time
        ]

        total_recent = len(recent_executions)
        successful_recent = self._count_status(recent_executions, PipelineStatus.COMPLETED)

        health_score = (successful_recent / total_recent * 100) if total_recent > 0 else 100

        if health_score >= 95:
            status = "healthy"
        elif health_score >= 80:
            status = "degraded"
        else:
            status = "unhealthy"

        return {
            "status": status,
            "health_score_percent": round(health_score, 2),
            "recent_executions_1h": total_recent,
            "successful_executions_1h": successful_recent,
            "failed_executions_1h": total_recent - successful_recent,
            "total_pipelines_monitored": len({record["pipeline_id"] for record in self.execution_history}),
            "last_updated": datetime.now(timezone.utc).isoformat()
        }

print("SUCCESS: PipelineMonitor class defined")

## Pipeline Data from Spark Integration

## Spark Integration and Analytics

## Clean Up and Summary