# Customer.IO Data Pipelines API - Setup and Configuration

## Purpose

This notebook establishes the foundation for working with Customer.IO's Data Pipelines API in Databricks.
It covers environment setup, authentication configuration, Delta Lake table creation, and synthetic data generation for demonstrations.

## Prerequisites

- Databricks Runtime 11.3 LTS or higher
- Customer.IO API key (for test/sandbox environment)
- Databricks secrets configured for API credentials
- Cluster with Delta Lake enabled

## Key Concepts

- **Customer.IO Data Pipelines API**: REST API for sending customer data and events
- **Regional Endpoints**: Separate US and EU endpoints for data residency
- **Rate Limits**: 3000 requests per 3 seconds
- **Request Limits**: 32KB per request, 500KB per batch
- **Delta Lake Integration**: Structured data storage for analytics and processing

## Environment Setup and Package Installation

In [None]:
# Package Management - Production Ready with Version Constraints

# Minimum versions of the third-party packages this notebook depends on.
REQUIRED_PACKAGES = {
    "httpx": ">=0.25.0",
    "pydantic": ">=2.0.0",
    "structlog": ">=24.0.0",
    "faker": ">=20.0.0",
    "python-dateutil": ">=2.8.0",
}


def install_packages():
    """Build the argument string for ``%pip install`` from REQUIRED_PACKAGES.

    Despite its name, this function does not install anything: it prints the
    requirement list and returns the string that the next cell interpolates
    into the ``%pip install`` magic.

    Each requirement is shell-quoted individually. Version specifiers contain
    ``>``, which a shell treats as output redirection when left unquoted, so
    the constraint would be dropped instead of being passed to pip.

    Returns:
        str: Space-separated, individually quoted requirement specifiers.
    """
    # Local import keeps this cell self-contained (it runs before the
    # notebook's main import cell).
    import shlex

    print("Installing packages with version constraints:")
    for pkg, version in REQUIRED_PACKAGES.items():
        print(f"  {pkg} {version}")

    return " ".join(
        shlex.quote(f"{pkg}{version}") for pkg, version in REQUIRED_PACKAGES.items()
    )


# Prepare package installation
package_string = install_packages()
print(f"SUCCESS: Package installation prepared - {len(REQUIRED_PACKAGES)} packages")

In [None]:
# Install packages and restart Python kernel
#
# `package_string` is the requirement list prepared by the previous cell.
# NOTE(review): presumably the notebook expands `{package_string}` before the
# magic command runs — confirm for the target Databricks runtime.

%pip install {package_string}

# NOTE(review): these messages are printed unconditionally after the magic
# line; nothing here checks that the installation actually succeeded.
print("SUCCESS: Packages installed with version constraints")
print("INFO: Restarting Python kernel to use newly installed packages...")

# Restart Python kernel to use newly installed packages
# NOTE(review): restartPython is expected to reset Python state, so this cell
# should run before any other setup cell — confirm against Databricks docs.
dbutils.library.restartPython()

## Import Required Libraries

In [None]:
import json
import os
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Any
import uuid

# Databricks and Spark imports
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, BooleanType, DoubleType, ArrayType, MapType
from pyspark.sql import functions as F
from delta.tables import DeltaTable

# HTTP and validation libraries
import httpx
from pydantic import BaseModel, Field, validator
import structlog

# Data generation
from faker import Faker
from dateutil import tz

# Initialize Faker for generating realistic test data
fake = Faker()
fake.seed_instance(42)  # For reproducible data

# Initialize structured logging
logger = structlog.get_logger()

print("SUCCESS: All libraries imported successfully")

In [None]:
# Configuration widgets (non-sensitive data only)
# SECURITY: Never expose API keys in widgets!

# Each call passes: widget name, default value, (choices for dropdowns), label.
dbutils.widgets.dropdown("customerio_region", "us", ["us", "eu"], "Customer.IO Region")
dbutils.widgets.text("database_name", "customerio_demo", "Database Name")
dbutils.widgets.text("catalog_name", "main", "Unity Catalog Name")
dbutils.widgets.dropdown("environment", "test", ["test", "sandbox", "production"], "Environment")

# Get configuration values
# These module-level names are read by later cells (secret lookup, database
# setup, synthetic data generation, validation).
CUSTOMERIO_REGION = dbutils.widgets.get("customerio_region")
DATABASE_NAME = dbutils.widgets.get("database_name")
CATALOG_NAME = dbutils.widgets.get("catalog_name")
ENVIRONMENT = dbutils.widgets.get("environment")

# Echo the selected values; the database is shown as <catalog>.<database>.
print(f"Widget configuration:")
print(f"  Region: {CUSTOMERIO_REGION}")
print(f"  Database: {CATALOG_NAME}.{DATABASE_NAME}")
print(f"  Environment: {ENVIRONMENT}")
print("SUCCESS: Configuration widgets set up")

In [None]:
# SECURE: Get API key from Databricks secrets
# This cell demonstrates proper secret management
#
# "production" and "sandbox" read their key from the "customerio" secret
# scope; any other environment value uses a hard-coded mock key.

try:
    if ENVIRONMENT == "production":
        CUSTOMERIO_API_KEY = dbutils.secrets.get(scope="customerio", key="production_api_key")
        secret_source = "production secrets"
    elif ENVIRONMENT == "sandbox":
        CUSTOMERIO_API_KEY = dbutils.secrets.get(scope="customerio", key="sandbox_api_key")
        secret_source = "sandbox secrets"
    else:
        # Test environment - use mock key
        CUSTOMERIO_API_KEY = "test_key_demo_12345"
        secret_source = "test mode"
        print("WARNING: Using test mode with mock API key")
        
except Exception as e:
    # Any failure while reading the secret is reported with setup hints and
    # then downgraded to test mode instead of stopping the notebook.
    print(f"ERROR: Failed to retrieve API key from secrets: {str(e)}")
    print("INFO: To configure secrets, run:")
    print("   databricks secrets create-scope customerio")
    print("   databricks secrets put customerio production_api_key")
    print("   databricks secrets put customerio sandbox_api_key")
    
    # Fallback to test mode
    # NOTE(review): this overwrites ENVIRONMENT, so a failed production lookup
    # silently continues as "test" for every later cell.
    CUSTOMERIO_API_KEY = "test_key_demo_12345"
    ENVIRONMENT = "test"
    secret_source = "fallback test mode"
    print("INFO: Falling back to test mode")

# NOTE(review): the SUCCESS line is printed on the fallback path as well.
print(f"SUCCESS: API key retrieved from {secret_source}")
print(f"Environment: {ENVIRONMENT}")
# The key itself is never printed; only a marker derived from ENVIRONMENT.
print(f"API Key: {'SECURED' if ENVIRONMENT != 'test' else 'TEST_MODE'}")

In [None]:
# Import the shared CustomerIOConfig from the utils module for type safety and reusability.
# NOTE: a later cell in this notebook also defines CustomerIOConfig inline; once that cell runs, it shadows this import.

from utils.validators import CustomerIOConfig

# Test API key validation using the extracted utility
def test_api_key_validation():
    """Build a CustomerIOConfig from the notebook settings, with a test fallback.

    Tries the API key and region chosen earlier in the notebook. If building
    the config raises, the error is printed and a config built from the mock
    test key in the "us" region is used instead.

    Returns:
        CustomerIOConfig: The validated config, or the fallback test config.
    """
    try:
        settings = CustomerIOConfig(api_key=CUSTOMERIO_API_KEY, region=CUSTOMERIO_REGION)
        print("SUCCESS: Configuration validation passed")
    except Exception as exc:
        print(f"ERROR: Configuration validation failed: {str(exc)}")
        # Replace the rejected settings with the known-good mock configuration.
        settings = CustomerIOConfig(api_key="test_key_demo_12345", region="us")
        print("INFO: Using fallback test configuration")
    return settings

# Initialize and validate configuration
# `config` is the module-level object that later cells read (for example the
# API configuration validation).
config = test_api_key_validation()

# Summarize the effective settings. Size limits are stored in bytes and
# divided by 1024 for display in KB.
print(f"SUCCESS: Customer.IO API configured")
print(f"   Base URL: {config.base_url}")
print(f"   Rate Limit: {config.RATE_LIMIT_REQUESTS} requests per {config.RATE_LIMIT_WINDOW} seconds")
print(f"   Max Request Size: {config.MAX_REQUEST_SIZE / 1024:.0f}KB")
print(f"   Max Batch Size: {config.MAX_BATCH_SIZE / 1024:.0f}KB")
# Only the number of headers is printed, so the Authorization value is not exposed.
print(f"   Headers configured: {len(config.get_headers())} headers")

## Customer.IO API Configuration

In [None]:
# Customer.IO API Configuration with Type Safety and Validation
from pydantic import BaseModel, Field, validator
from typing import Dict, Literal

class CustomerIOConfig(BaseModel):
    """Type-safe configuration for Customer.IO API settings.

    Holds the API key and region, the documented rate/size limits, and the
    retry settings, and derives the regional base URL and the HTTP headers
    used for API requests.

    Raises:
        ValueError (surfaced by pydantic as a validation error): if the API
        key is empty, whitespace-only, or shorter than 10 characters after
        trimming, or if the region is not "us" or "eu" (case-insensitive).
    """

    api_key: str = Field(..., description="Customer.IO API key")
    region: Literal["us", "eu"] = Field(default="us", description="API region")

    # Rate limiting configuration. These are annotated model fields with
    # defaults (not ClassVars), so they can also be overridden per instance.
    RATE_LIMIT_REQUESTS: int = 3000
    RATE_LIMIT_WINDOW: int = 3  # seconds

    # Request size limits (bytes)
    MAX_REQUEST_SIZE: int = 32 * 1024  # 32KB
    MAX_BATCH_SIZE: int = 500 * 1024   # 500KB

    # Retry configuration
    MAX_RETRIES: int = 3
    RETRY_BACKOFF_FACTOR: float = 2.0

    @validator('api_key')
    def validate_api_key(cls, v: str) -> str:
        """Trim the API key and reject empty or implausibly short values."""
        # Strip first so that padding whitespace cannot satisfy the length check.
        key = v.strip()
        if not key:
            raise ValueError("API key cannot be empty")
        if len(key) < 10:  # Reasonable minimum length
            raise ValueError("API key appears to be too short")
        return key

    # pre=True: normalize before the Literal["us", "eu"] check runs; as a
    # post-validator the lowercasing could never take effect because values
    # such as "EU" were already rejected.
    @validator('region', pre=True)
    def validate_region(cls, v: object) -> object:
        """Normalize the region to lowercase before it is type-checked."""
        if isinstance(v, str):
            return v.strip().lower()
        # Non-string input is passed through so the Literal check reports it.
        return v

    @property
    def base_url(self) -> str:
        """Return the API base URL for the configured region."""
        if self.region == "eu":
            return "https://cdp-eu.customer.io/v1"
        else:
            return "https://cdp.customer.io/v1"

    def get_headers(self) -> Dict[str, str]:
        """Return the HTTP headers for API requests, including Basic auth."""
        import base64

        # Customer.IO uses Basic Auth with API key as username, empty password
        auth_string = base64.b64encode(f"{self.api_key}:".encode()).decode()

        return {
            "Authorization": f"Basic {auth_string}",
            "Content-Type": "application/json",
            "User-Agent": "CustomerIO-Databricks-Notebooks/1.0.0",
            "Accept": "application/json"
        }

    class Config:
        """Pydantic model configuration."""
        validate_assignment = True  # re-run validation when attributes are assigned
        extra = "forbid"            # reject unknown constructor arguments

# Test API key validation
def test_api_key_validation():
    """Return a validated CustomerIOConfig, or a mock test config on failure.

    The first attempt uses the API key and region selected earlier in the
    notebook. Any exception raised while building that config is printed, and
    a config using the mock test key and the "us" region is returned instead.

    Returns:
        CustomerIOConfig: The validated config, or the fallback test config.
    """
    try:
        checked = CustomerIOConfig(
            api_key=CUSTOMERIO_API_KEY,
            region=CUSTOMERIO_REGION,
        )
        print("SUCCESS: Configuration validation passed")
        return checked
    except Exception as failure:
        print(f"ERROR: Configuration validation failed: {str(failure)}")
        # Fall back to the mock configuration so the notebook can continue.
        fallback = CustomerIOConfig(api_key="test_key_demo_12345", region="us")
        print("INFO: Using fallback test configuration")
        return fallback

# Initialize and validate configuration
# Rebinds the module-level `config` using the CustomerIOConfig class defined
# in this cell.
config = test_api_key_validation()

# Summarize the effective settings. Size limits are stored in bytes and
# divided by 1024 for display in KB.
print(f"SUCCESS: Customer.IO API configured")
print(f"   Base URL: {config.base_url}")
print(f"   Rate Limit: {config.RATE_LIMIT_REQUESTS} requests per {config.RATE_LIMIT_WINDOW} seconds")
print(f"   Max Request Size: {config.MAX_REQUEST_SIZE / 1024:.0f}KB")
print(f"   Max Batch Size: {config.MAX_BATCH_SIZE / 1024:.0f}KB")
# Only the number of headers is printed, so the Authorization value is not exposed.
print(f"   Headers configured: {len(config.get_headers())} headers")

## Database and Table Setup

In [None]:
# Create database if it doesn't exist
# The catalog and database names come straight from the notebook widgets and
# are interpolated into the SQL text unquoted.
# NOTE(review): names containing characters such as "-" or "." would need
# backtick quoting — confirm the allowed widget values.
spark.sql(f"CREATE DATABASE IF NOT EXISTS {CATALOG_NAME}.{DATABASE_NAME}")
# Make it the current database for the rest of the session.
spark.sql(f"USE {CATALOG_NAME}.{DATABASE_NAME}")

print(f"SUCCESS: Using database: {CATALOG_NAME}.{DATABASE_NAME}")

## Delta Lake Schema Definitions

Define schemas for Delta Lake tables that align with Customer.IO API data structures.

In [None]:
# In every StructField below the third argument is the nullable flag:
# False marks a required column, True an optional one.

# Schema for customers table (aligns with /identify endpoint)
# traits and custom_attributes are string-to-string maps, so numeric values
# must be stored as strings.
customers_schema = StructType([
    StructField("customer_id", StringType(), False),
    StructField("user_id", StringType(), True),
    StructField("anonymous_id", StringType(), True),
    StructField("email", StringType(), True),
    StructField("created_at", TimestampType(), True),
    StructField("updated_at", TimestampType(), True),
    StructField("traits", MapType(StringType(), StringType()), True),
    StructField("custom_attributes", MapType(StringType(), StringType()), True),
    StructField("is_active", BooleanType(), True),
    StructField("last_seen", TimestampType(), True),
    StructField("source", StringType(), True),
    StructField("region", StringType(), True)
])

# Schema for events table (aligns with /track endpoint)
# event_id, event_name and timestamp are required; the identity columns are optional.
events_schema = StructType([
    StructField("event_id", StringType(), False),
    StructField("customer_id", StringType(), True),
    StructField("user_id", StringType(), True),
    StructField("anonymous_id", StringType(), True),
    StructField("event_name", StringType(), False),
    StructField("timestamp", TimestampType(), False),
    StructField("properties", MapType(StringType(), StringType()), True),
    StructField("context", MapType(StringType(), StringType()), True),
    StructField("is_semantic_event", BooleanType(), True),
    StructField("event_category", StringType(), True),
    StructField("source", StringType(), True),
    StructField("processed_at", TimestampType(), True)
])

# Schema for groups table (aligns with /group endpoint)
# Only group_id is required.
groups_schema = StructType([
    StructField("group_id", StringType(), False),
    StructField("group_type", StringType(), True),
    StructField("name", StringType(), True),
    StructField("created_at", TimestampType(), True),
    StructField("updated_at", TimestampType(), True),
    StructField("traits", MapType(StringType(), StringType()), True),
    StructField("parent_group_id", StringType(), True),
    StructField("is_active", BooleanType(), True)
])

# Schema for devices table (device management)
# The device identity, owning customer, token and type are all required.
devices_schema = StructType([
    StructField("device_id", StringType(), False),
    StructField("customer_id", StringType(), False),
    StructField("device_token", StringType(), False),
    StructField("device_type", StringType(), False),  # ios, android, web
    StructField("platform", StringType(), True),
    StructField("app_version", StringType(), True),
    StructField("os_version", StringType(), True),
    StructField("created_at", TimestampType(), True),
    StructField("last_used", TimestampType(), True),
    StructField("is_active", BooleanType(), True)
])

# Schema for API responses (logging and monitoring)
# One row per API request; timing, sizes, error text and retry count are optional.
api_responses_schema = StructType([
    StructField("request_id", StringType(), False),
    StructField("endpoint", StringType(), False),
    StructField("method", StringType(), False),
    StructField("status_code", IntegerType(), False),
    StructField("response_time_ms", IntegerType(), True),
    StructField("request_size_bytes", IntegerType(), True),
    StructField("response_size_bytes", IntegerType(), True),
    StructField("timestamp", TimestampType(), False),
    StructField("error_message", StringType(), True),
    StructField("retry_count", IntegerType(), True),
    StructField("customer_id", StringType(), True)
])

# Schema for batch operations tracking
# error_summary is an array of strings; completed_at stays null until it is set.
batch_operations_schema = StructType([
    StructField("batch_id", StringType(), False),
    StructField("operation_type", StringType(), False),
    StructField("total_records", IntegerType(), False),
    StructField("successful_records", IntegerType(), True),
    StructField("failed_records", IntegerType(), True),
    StructField("started_at", TimestampType(), False),
    StructField("completed_at", TimestampType(), True),
    StructField("status", StringType(), False),  # pending, processing, completed, failed
    StructField("error_summary", ArrayType(StringType()), True)
])

# Report the number of columns in each schema as a quick sanity check.
print("SUCCESS: Delta Lake schemas defined")
print(f"   Customers schema: {len(customers_schema.fields)} fields")
print(f"   Events schema: {len(events_schema.fields)} fields")
print(f"   Groups schema: {len(groups_schema.fields)} fields")
print(f"   Devices schema: {len(devices_schema.fields)} fields")
print(f"   API responses schema: {len(api_responses_schema.fields)} fields")
print(f"   Batch operations schema: {len(batch_operations_schema.fields)} fields")

## Create Delta Lake Tables

In [None]:
# Import synthetic data generation utilities
from utils.transformers import add_timestamp
from typing import Dict, List, Any, Optional
from datetime import datetime
import uuid

def generate_synthetic_customers(num_customers: int = 1000) -> List[Dict[str, Any]]:
    """
    Generate synthetic customer data for testing.

    Each customer gets a random UUID ``customer_id``, a sequential ``user_id``
    (``user_000001``, ...), an optional ``anonymous_id``, Faker-generated
    profile data, and the notebook's selected region. Trait and attribute
    values are stored as strings.

    Timestamps are kept mutually consistent: ``updated_at`` and ``last_seen``
    never precede ``created_at``, and ``last_seen`` falls within the last
    30 days.

    Args:
        num_customers: Number of customers to generate

    Returns:
        List of customer dictionaries with required fields

    Raises:
        ValueError: If num_customers is not positive
        Exception: If data generation fails (the original error is chained
            as ``__cause__``)
    """
    # Local import keeps the function self-contained; this cell only imports
    # `datetime` itself.
    from datetime import timedelta

    if num_customers <= 0:
        raise ValueError("num_customers must be positive")

    customers: List[Dict[str, Any]] = []

    try:
        for i in range(num_customers):
            customer_id = str(uuid.uuid4())
            created_at = fake.date_time_between(start_date='-2y', end_date='now', tzinfo=tz.UTC)

            # A customer cannot have been seen before the account existed, so
            # the 30-day window is clamped to start no earlier than created_at.
            recent_floor = datetime.now(tz.UTC) - timedelta(days=30)
            last_seen_start = max(created_at, recent_floor)

            customer = {
                "customer_id": customer_id,
                "user_id": f"user_{i+1:06d}",
                "anonymous_id": str(uuid.uuid4()) if fake.boolean(chance_of_getting_true=30) else None,
                "email": fake.email(),
                "created_at": created_at,
                "updated_at": fake.date_time_between(start_date=created_at, end_date='now', tzinfo=tz.UTC),
                "traits": {
                    "first_name": fake.first_name(),
                    "last_name": fake.last_name(),
                    "age": str(fake.random_int(min=18, max=80)),
                    "city": fake.city(),
                    "country": fake.country(),
                    "plan": fake.random_element(["free", "basic", "premium", "enterprise"]),
                    "signup_source": fake.random_element(["website", "mobile_app", "referral", "social"])
                },
                "custom_attributes": {
                    "lifetime_value": str(round(fake.random.uniform(0, 5000), 2)),
                    "last_purchase_amount": str(round(fake.random.uniform(10, 500), 2)) if fake.boolean(chance_of_getting_true=60) else None,
                    "subscription_status": fake.random_element(["active", "canceled", "trial", "expired"])
                },
                "is_active": fake.boolean(chance_of_getting_true=85),
                "last_seen": fake.date_time_between(start_date=last_seen_start, end_date='now', tzinfo=tz.UTC),
                "source": "synthetic_data",
                "region": CUSTOMERIO_REGION
            }
            customers.append(customer)
    except Exception as e:
        # Chain explicitly so the original traceback is kept as the cause.
        raise Exception(f"Failed to generate customer data: {str(e)}") from e

    return customers

print("SUCCESS: Customer data generation function defined")
print("   Generates realistic customer profiles with traits and attributes")
print("   Includes validation and error handling")

In [None]:
# Generate full synthetic dataset for demonstrations
#
# Builds 1000 customers and 5000 events that reference them.
# NOTE(review): generate_synthetic_events is defined in a later cell of this
# notebook; running the cells strictly top-to-bottom would presumably raise a
# NameError here — confirm the intended cell order.

print("Generating full synthetic dataset...")
try:
    synthetic_customers = generate_synthetic_customers(1000)
    synthetic_events = generate_synthetic_events(synthetic_customers, 5000)
    
    print(f"SUCCESS: Generated synthetic data")
    print(f"   Customers: {len(synthetic_customers):,}")
    print(f"   Events: {len(synthetic_events):,}")
    print(f"   Event categories: ecommerce, engagement, lifecycle, mobile")
    print(f"   Data ready for Delta Lake loading")
    
except Exception as e:
    # Report the failure, then re-raise so the cell still fails visibly.
    print(f"ERROR: Data generation failed: {str(e)}")
    raise

In [None]:
# Test-driven approach: Test data generation functions

def test_customer_generation():
    """Check generate_synthetic_customers on a small sample.

    Verifies the record count, the presence of the required keys, the types
    of selected values, and that a count of zero is rejected with ValueError.

    Returns:
        The five generated customer dictionaries.
    """
    print("TEST: Testing customer data generation...")

    sample = generate_synthetic_customers(5)
    assert len(sample) == 5, "Should generate exactly 5 customers"

    # Keys every record must carry, and the expected type for selected values.
    required_fields = ("customer_id", "user_id", "email", "traits", "region")
    type_checks = (
        ("customer_id", str, "customer_id should be string"),
        ("traits", dict, "traits should be dict"),
        ("is_active", bool, "is_active should be bool"),
    )

    for record in sample:
        for field_name in required_fields:
            assert field_name in record, f"{field_name} is required"
        for field_name, expected_type, message in type_checks:
            assert isinstance(record[field_name], expected_type), message

    # A non-positive count must be rejected.
    rejected_zero = False
    try:
        generate_synthetic_customers(0)
    except ValueError:
        rejected_zero = True
    assert rejected_zero, "Should raise ValueError for zero customers"

    print("SUCCESS: Customer generation test passed")
    return sample

def test_event_generation():
    """Check generate_synthetic_events on a small sample.

    Generates three customers and ten events for them, then verifies the
    event count, the presence of the required keys, the types of selected
    values, and that an empty customer list is rejected with ValueError.

    Returns:
        The ten generated event dictionaries.
    """
    print("TEST: Testing event data generation...")

    # Events need customers to attach to.
    owners = generate_synthetic_customers(3)
    sample = generate_synthetic_events(owners, 10)
    assert len(sample) == 10, "Should generate exactly 10 events"

    required_fields = ("event_id", "event_name", "customer_id", "timestamp", "event_category")
    type_checks = (
        ("event_id", str, "event_id should be string"),
        ("properties", dict, "properties should be dict"),
        ("context", dict, "context should be dict"),
        ("is_semantic_event", bool, "is_semantic_event should be bool"),
    )

    for record in sample:
        for field_name in required_fields:
            assert field_name in record, f"{field_name} is required"
        for field_name, expected_type, message in type_checks:
            assert isinstance(record[field_name], expected_type), message

    # An empty customer list must be rejected.
    rejected_empty = False
    try:
        generate_synthetic_events([], 5)
    except ValueError:
        rejected_empty = True
    assert rejected_empty, "Should raise ValueError for empty customers"

    print("SUCCESS: Event generation test passed")
    return sample

# Run tests first to validate functionality
# Both tests raise AssertionError on failure, which stops this cell.
# NOTE(review): test_event_generation calls generate_synthetic_events, which
# is defined in a later cell — confirm the intended cell order.
print("Running data generation tests...")
test_customers = test_customer_generation()
test_events = test_event_generation()
print("SUCCESS: All data generation tests passed")

In [None]:
# Define event generation function for comprehensive testing

def generate_synthetic_events(customers: List[Dict[str, Any]], num_events: int = 5000) -> List[Dict[str, Any]]:
    """
    Generate synthetic event data for testing.

    Each event is attached to a randomly chosen customer and drawn from one
    of four categories (ecommerce, engagement, lifecycle, mobile). Only
    ecommerce and engagement events carry category-specific properties; the
    other categories get an empty ``properties`` dict. An event is flagged as
    semantic when its name is among the first three names of any category.

    Args:
        customers: List of customer dictionaries
        num_events: Number of events to generate

    Returns:
        List of event dictionaries with required fields

    Raises:
        ValueError: If customers list is empty or num_events is not positive
        Exception: If event generation fails (the original error is chained
            as ``__cause__``)
    """
    if not customers:
        raise ValueError("customers list cannot be empty")
    if num_events <= 0:
        raise ValueError("num_events must be positive")

    events: List[Dict[str, Any]] = []

    # Define event types and their categories
    event_types = {
        "ecommerce": [
            "Product Viewed", "Product Added", "Cart Viewed", "Checkout Started",
            "Order Completed", "Product Removed", "Coupon Applied"
        ],
        "engagement": [
            "Page Viewed", "Button Clicked", "Form Submitted", "Video Played",
            "Document Downloaded", "Search Performed"
        ],
        "lifecycle": [
            "User Registered", "Profile Updated", "Settings Changed", "Account Upgraded",
            "Subscription Canceled", "Password Reset"
        ],
        "mobile": [
            "Application Opened", "Application Backgrounded", "Push Notification Clicked",
            "Screen Viewed", "Feature Used"
        ]
    }

    # Loop-invariant lookups, built once instead of on every iteration.
    categories = list(event_types.keys())
    semantic_event_names = {
        name for names in event_types.values() for name in names[:3]
    }

    try:
        for _ in range(num_events):
            customer = fake.random_element(customers)
            category = fake.random_element(categories)
            event_name = fake.random_element(event_types[category])

            # Generate event properties based on category
            properties: Dict[str, str] = {}
            if category == "ecommerce":
                properties.update({
                    "product_id": f"prod_{fake.random_int(min=1, max=1000)}",
                    "product_name": fake.catch_phrase(),
                    "price": str(round(fake.random.uniform(9.99, 299.99), 2)),
                    "currency": "USD",
                    "category": fake.random_element(["electronics", "clothing", "books", "home", "sports"])
                })
            elif category == "engagement":
                properties.update({
                    "page_url": fake.url(),
                    "referrer": fake.url() if fake.boolean(chance_of_getting_true=30) else "",
                    "session_id": str(uuid.uuid4())
                })

            event = {
                "event_id": str(uuid.uuid4()),
                "customer_id": customer["customer_id"],
                "user_id": customer["user_id"],
                "anonymous_id": customer.get("anonymous_id"),
                "event_name": event_name,
                "timestamp": fake.date_time_between(start_date='-90d', end_date='now', tzinfo=tz.UTC),
                "properties": properties,
                "context": {
                    "ip": fake.ipv4(),
                    "user_agent": fake.user_agent(),
                    "locale": fake.locale(),
                    "timezone": str(fake.timezone())
                },
                "is_semantic_event": event_name in semantic_event_names,
                "event_category": category,
                "source": "synthetic_data",
                "processed_at": datetime.now(tz.UTC)
            }
            events.append(event)
    except Exception as e:
        # Chain explicitly so the original traceback is kept as the cause.
        raise Exception(f"Failed to generate event data: {str(e)}") from e

    return events

print("SUCCESS: Event data generation function defined")
print("   Generates events across 4 categories with realistic properties")
print("   Includes semantic event identification")

In [None]:
# Import validation utilities from utils module for reusability
from utils.error_handlers import CircuitBreaker
from typing import List, Tuple, Dict, Any
from dataclasses import dataclass

@dataclass
class ValidationResult:
    """Type-safe validation result.

    One instance describes the outcome of a single setup check. Instances are
    built positionally elsewhere in the notebook, so the field order
    (status, component, result, error) is part of the interface.
    """
    # Outcome marker; the summary code counts exactly these three values.
    status: str  # "SUCCESS", "ERROR", "WARNING"
    # Human-readable name of what was checked, e.g. "Database access".
    component: str
    # Short outcome text, e.g. "OK", "Missing" or a record count.
    result: str
    # The exception behind a failure, when there is one.
    error: Optional[Exception] = None

# Initialize circuit breaker for validation operations
# `validation_breaker` is used later as a callable wrapper around each
# individual validation function.
# NOTE(review): presumably failure_threshold=2 opens the breaker after two
# failures and timeout=30 is a cool-down in seconds — confirm in
# utils.error_handlers.
validation_breaker = CircuitBreaker(failure_threshold=2, timeout=30)

print("SUCCESS: Validation utilities imported and configured")
print("   Circuit breaker configured for fault tolerance")
print("   ValidationResult dataclass defined for type safety")

In [None]:
# Execute validation and display results

def display_validation_results(validations: List[ValidationResult]) -> Dict[str, Any]:
    """Print a validation report and return its summary.

    Prints one line per validation (with the exception type and message for
    failed checks that carry an error), then the pass/warning/failure counts
    and an overall verdict.

    Args:
        validations: Results of the individual setup checks.

    Returns:
        Dict with "overall_status" ("success", "warning" or "failed"), the
        "passed", "warnings", "failed" and "total" counts, the original
        "validations" list, and the current "circuit_breaker_state".
    """
    print("Setup Validation Results:")
    print("=" * 60)

    for item in validations:
        print(f"{item.status} {item.component:<25} {item.result}")
        # Error details are only shown for failed checks that carry an exception.
        if item.status == "ERROR" and item.error:
            print(f"    Error: {type(item.error).__name__}: {str(item.error)}")

    # Tally the outcomes by status.
    statuses = [item.status for item in validations]
    passed = statuses.count("SUCCESS")
    warnings = statuses.count("WARNING")
    failed = statuses.count("ERROR")
    total = len(validations)

    print("\nValidation Summary:")
    print(f"  SUCCESS: Passed: {passed}")
    print(f"  WARNING: Warnings: {warnings}")
    print(f"  ERROR: Failed: {failed}")
    print(f"  DATA: Total: {total}")

    # Failures outrank warnings, which outrank a clean pass.
    if failed:
        print("ERROR: Some validation checks failed. Please fix issues before proceeding.")
        overall_status = "failed"
    elif warnings:
        print("WARNING: Validation passed with warnings. Review before proceeding.")
        overall_status = "warning"
    else:
        print("COMPLETED: All validation checks passed! Ready to proceed.")
        overall_status = "success"

    summary = {
        "overall_status": overall_status,
        "passed": passed,
        "warnings": warnings,
        "failed": failed,
        "total": total,
        "validations": validations,
        "circuit_breaker_state": validation_breaker.state
    }
    return summary

# Run validation and display results
# NOTE(review): validate_setup is defined in a later cell of this notebook —
# confirm the intended cell order.
validations = validate_setup()
validation_results = display_validation_results(validations)

# Raise exception if critical validations failed
# Only failed checks whose component name contains "Database" are treated as
# critical; other failures are reported but do not stop the notebook.
if validation_results["overall_status"] == "failed":
    critical_failures = [v for v in validations 
                       if v.status == "ERROR" and "Database" in v.component]
    
    if critical_failures:
        raise Exception("Critical validation failures detected - cannot proceed")

# NOTE(review): this message is also printed when non-critical checks failed.
print("\nSUCCESS: Setup validation completed successfully")

In [None]:
# Comprehensive setup validation with circuit breaker protection

def validate_setup() -> "List[ValidationResult]":
    """
    Comprehensive setup validation with error handling and circuit breaker.

    Runs, in order: the database access check, an existence check for each
    required table, a data quality check for the customers and events tables,
    and the API configuration check. Every check is invoked through
    ``validation_breaker``; if that call raises, an "ERROR" result prefixed
    with "Circuit breaker:" is recorded instead of propagating the exception.

    Returns:
        List of ValidationResult objects, one per check, in execution order.
    """
    print("ANALYSIS: Running comprehensive setup validation...")

    validations: List[ValidationResult] = []

    def _run_check(component: str, check, *args) -> None:
        """Run one check through the circuit breaker and record its result."""
        try:
            validations.append(validation_breaker(check, *args))
        except Exception as e:
            validations.append(
                ValidationResult("ERROR", component, f"Circuit breaker: {str(e)}", e)
            )

    # Database validation
    _run_check("Database access", validate_database_access)

    # Table validation
    required_tables = ["customers", "events", "groups", "devices", "api_responses", "batch_operations"]
    for table in required_tables:
        _run_check(f"Table {table}", validate_table_exists, table)

    # Data quality validation
    data_tables = ["customers", "events"]
    for table in data_tables:
        _run_check(f"{table} data", validate_data_quality, table)

    # API configuration validation
    _run_check("API configuration", validate_api_configuration)

    return validations

print("SUCCESS: Comprehensive validation function defined")
print("   Includes circuit breaker protection for fault tolerance")
print("   Validates database, tables, data quality, and API configuration")

In [None]:
# Define individual validation functions with error handling

def validate_database_access() -> ValidationResult:
    """Check that the configured catalog/database can be selected.

    Returns:
        A "SUCCESS" result when the USE statement runs, otherwise an "ERROR"
        result carrying the exception. The function itself never raises.
    """
    component = "Database access"
    try:
        spark.sql(f"USE {CATALOG_NAME}.{DATABASE_NAME}")
        return ValidationResult(status="SUCCESS", component=component, result="OK")
    except Exception as exc:
        return ValidationResult(
            status="ERROR",
            component=component,
            result=f"Failed: {str(exc)}",
            error=exc,
        )

def validate_table_exists(table_name: str) -> ValidationResult:
    """Check that a table can be resolved in the configured database.

    Args:
        table_name: Unqualified table name; the catalog and database from the
            notebook configuration are prepended.

    Returns:
        A "SUCCESS" result when the table resolves; any exception is reported
        as an "ERROR" result with the text "Missing".
    """
    component = f"Table {table_name}"
    qualified_name = f"{CATALOG_NAME}.{DATABASE_NAME}.{table_name}"
    try:
        spark.table(qualified_name)
        return ValidationResult(status="SUCCESS", component=component, result="Exists")
    except Exception as exc:
        return ValidationResult(status="ERROR", component=component, result="Missing", error=exc)

def validate_data_quality(table_name: str) -> ValidationResult:
    """Check that a table contains at least one record.

    Args:
        table_name: Unqualified table name in the configured database.

    Returns:
        "SUCCESS" with the record count when the table has rows, "WARNING"
        when it is empty, or "ERROR" carrying the exception when the table
        cannot be read or counted.
    """
    component = f"{table_name} data"
    try:
        row_count = spark.table(f"{CATALOG_NAME}.{DATABASE_NAME}.{table_name}").count()

        # An empty table is only a warning, not a failure.
        if row_count <= 0:
            return ValidationResult(status="WARNING", component=component, result="No records")
        return ValidationResult(status="SUCCESS", component=component, result=f"{row_count} records")
    except Exception as exc:
        return ValidationResult(
            status="ERROR",
            component=component,
            result=f"Failed: {str(exc)}",
            error=exc,
        )

def validate_api_configuration() -> ValidationResult:
    """Sanity-check the module-level API ``config`` object.

    The checks run in order and stop at the first problem: API key present,
    base URL present, ``get_headers()`` yields a non-empty ``Authorization``
    entry. When all pass, the result depends on ``ENVIRONMENT``.

    Returns:
        ValidationResult: ``ERROR`` for a missing/invalid setting or an
        unexpected exception, ``WARNING`` when ``ENVIRONMENT == "test"``,
        ``SUCCESS`` otherwise.
    """
    component = "API configuration"
    try:
        if not config.api_key:
            problem = "Missing API key"
        elif not config.base_url:
            problem = "Missing base URL"
        elif not config.get_headers().get("Authorization"):
            # Exercises header generation as well as the header's presence.
            problem = "Invalid authorization header"
        else:
            problem = None

        if problem is not None:
            return ValidationResult("ERROR", component, problem)
        if ENVIRONMENT == "test":
            return ValidationResult("WARNING", component, "Test mode - mock key")
        return ValidationResult("SUCCESS", component, "Valid")
    except Exception as exc:
        return ValidationResult("ERROR", component, f"Failed: {exc!s}", exc)

# Status banner listing the validation helpers defined in this cell.
print("SUCCESS: Individual validation functions defined")
print("   Database access validation")
print("   Table existence validation")
print("   Data quality validation")
print("   API configuration validation")

In [None]:
# Type-safe synthetic data generation with comprehensive testing
from typing import Dict, List, Any, Optional
from datetime import datetime
import uuid

def generate_synthetic_customers(num_customers: int = 1000) -> List[Dict[str, Any]]:
    """
    Generate synthetic customer data for testing.

    Relies on the notebook-level ``fake`` generator, the ``tz`` module and
    ``CUSTOMERIO_REGION`` (all defined outside this cell; presumably a Faker
    instance and ``dateutil.tz`` - confirm against the setup cells).

    Args:
        num_customers: Number of customers to generate

    Returns:
        List of customer dictionaries with required fields. ``user_id`` is
        sequential (``user_000001`` ...); numeric trait/attribute values are
        stored as strings.

    Raises:
        ValueError: If num_customers is not positive
        Exception: If data generation fails; the underlying error is chained
            as ``__cause__`` so the original traceback is not lost
    """
    if num_customers <= 0:
        raise ValueError("num_customers must be positive")

    customers: List[Dict[str, Any]] = []

    try:
        for sequence_number in range(1, num_customers + 1):
            customer_id = str(uuid.uuid4())
            created_at = fake.date_time_between(start_date='-2y', end_date='now', tzinfo=tz.UTC)

            customer = {
                "customer_id": customer_id,
                "user_id": f"user_{sequence_number:06d}",
                # Roughly 30% of customers also carry an anonymous id.
                "anonymous_id": str(uuid.uuid4()) if fake.boolean(chance_of_getting_true=30) else None,
                "email": fake.email(),
                "created_at": created_at,
                # Bounded below by created_at so an update never precedes creation.
                "updated_at": fake.date_time_between(start_date=created_at, end_date='now', tzinfo=tz.UTC),
                "traits": {
                    "first_name": fake.first_name(),
                    "last_name": fake.last_name(),
                    "age": str(fake.random_int(min=18, max=80)),
                    "city": fake.city(),
                    "country": fake.country(),
                    "plan": fake.random_element(["free", "basic", "premium", "enterprise"]),
                    "signup_source": fake.random_element(["website", "mobile_app", "referral", "social"])
                },
                "custom_attributes": {
                    "lifetime_value": str(round(fake.random.uniform(0, 5000), 2)),
                    "last_purchase_amount": str(round(fake.random.uniform(10, 500), 2)) if fake.boolean(chance_of_getting_true=60) else None,
                    "subscription_status": fake.random_element(["active", "canceled", "trial", "expired"])
                },
                "is_active": fake.boolean(chance_of_getting_true=85),
                "last_seen": fake.date_time_between(start_date='-30d', end_date='now', tzinfo=tz.UTC),
                "source": "synthetic_data",
                "region": CUSTOMERIO_REGION
            }
            customers.append(customer)
    except Exception as e:
        # Same exception type and message as before, but explicitly chained.
        raise Exception(f"Failed to generate customer data: {str(e)}") from e

    return customers

def generate_synthetic_events(customers: List[Dict[str, Any]], num_events: int = 5000) -> List[Dict[str, Any]]:
    """
    Generate synthetic event data for testing.

    Each event is attached to a randomly chosen customer and a randomly
    chosen category/event name. Only "ecommerce" and "engagement" events get
    category-specific ``properties``; other categories get an empty dict.

    Relies on the notebook-level ``fake`` generator and ``tz`` module
    (defined outside this cell).

    Args:
        customers: List of customer dictionaries; each must provide
            ``customer_id`` and ``user_id`` (``anonymous_id`` is optional)
        num_events: Number of events to generate

    Returns:
        List of event dictionaries with required fields

    Raises:
        ValueError: If customers list is empty or num_events is not positive
        Exception: If event generation fails; the underlying error is chained
            as ``__cause__``
    """
    if not customers:
        raise ValueError("customers list cannot be empty")
    if num_events <= 0:
        raise ValueError("num_events must be positive")

    events: List[Dict[str, Any]] = []

    # Define event types and their categories
    event_types = {
        "ecommerce": [
            "Product Viewed", "Product Added", "Cart Viewed", "Checkout Started",
            "Order Completed", "Product Removed", "Coupon Applied"
        ],
        "engagement": [
            "Page Viewed", "Button Clicked", "Form Submitted", "Video Played",
            "Document Downloaded", "Search Performed"
        ],
        "lifecycle": [
            "User Registered", "Profile Updated", "Settings Changed", "Account Upgraded",
            "Subscription Canceled", "Password Reset"
        ],
        "mobile": [
            "Application Opened", "Application Backgrounded", "Push Notification Clicked",
            "Screen Viewed", "Feature Used"
        ]
    }

    # Loop invariants, built once instead of once per event.
    categories = list(event_types)
    # The first three names of every category are flagged as "semantic" events.
    semantic_event_names = frozenset(
        name for names in event_types.values() for name in names[:3]
    )

    try:
        for _ in range(num_events):
            customer = fake.random_element(customers)
            category = fake.random_element(categories)
            event_name = fake.random_element(event_types[category])

            # Generate event properties based on category
            properties: Dict[str, str] = {}
            if category == "ecommerce":
                properties = {
                    "product_id": f"prod_{fake.random_int(min=1, max=1000)}",
                    "product_name": fake.catch_phrase(),
                    "price": str(round(fake.random.uniform(9.99, 299.99), 2)),
                    "currency": "USD",
                    "category": fake.random_element(["electronics", "clothing", "books", "home", "sports"])
                }
            elif category == "engagement":
                properties = {
                    "page_url": fake.url(),
                    # About 30% of engagement events carry a referrer; others use "".
                    "referrer": fake.url() if fake.boolean(chance_of_getting_true=30) else "",
                    "session_id": str(uuid.uuid4())
                }

            event = {
                "event_id": str(uuid.uuid4()),
                "customer_id": customer["customer_id"],
                "user_id": customer["user_id"],
                "anonymous_id": customer.get("anonymous_id"),
                "event_name": event_name,
                "timestamp": fake.date_time_between(start_date='-90d', end_date='now', tzinfo=tz.UTC),
                "properties": properties,
                "context": {
                    "ip": fake.ipv4(),
                    "user_agent": fake.user_agent(),
                    "locale": fake.locale(),
                    "timezone": str(fake.timezone())
                },
                "is_semantic_event": event_name in semantic_event_names,
                "event_category": category,
                "source": "synthetic_data",
                "processed_at": datetime.now(tz.UTC)
            }
            events.append(event)
    except Exception as e:
        # Same exception type and message as before, but explicitly chained.
        raise Exception(f"Failed to generate event data: {str(e)}") from e

    return events

# Test-driven approach: Test data generation functions
def test_customer_generation():
    """Smoke-test ``generate_synthetic_customers``.

    Generates five customers, asserts the presence and types of key fields,
    and confirms that a request for zero customers raises ``ValueError``.

    Returns:
        The five generated customer dictionaries.
    """
    print("TEST: Testing customer data generation...")

    # Happy path: a small batch with the expected size.
    test_customers = generate_synthetic_customers(5)
    assert len(test_customers) == 5, "Should generate exactly 5 customers"

    required_fields = ("customer_id", "user_id", "email", "traits", "region")
    type_expectations = (
        ("customer_id", str, "customer_id should be string"),
        ("traits", dict, "traits should be dict"),
        ("is_active", bool, "is_active should be bool"),
    )

    for record in test_customers:
        for field_name in required_fields:
            assert field_name in record, f"{field_name} is required"
        for field_name, expected_type, failure_message in type_expectations:
            assert isinstance(record[field_name], expected_type), failure_message

    # Error path: a non-positive count must be rejected.
    try:
        generate_synthetic_customers(0)
    except ValueError:
        pass  # Expected
    else:
        assert False, "Should raise ValueError for zero customers"

    print("SUCCESS: Customer generation test passed")
    return test_customers

def test_event_generation():
    """Smoke-test ``generate_synthetic_events``.

    Builds three customers, generates ten events for them, asserts the
    presence and types of key fields, and confirms that an empty customer
    list raises ``ValueError``.

    Returns:
        The ten generated event dictionaries.
    """
    print("TEST: Testing event data generation...")

    # Events need customers to attach to.
    seed_customers = generate_synthetic_customers(3)
    test_events = generate_synthetic_events(seed_customers, 10)
    assert len(test_events) == 10, "Should generate exactly 10 events"

    required_fields = ("event_id", "event_name", "customer_id", "timestamp", "event_category")
    type_expectations = (
        ("event_id", str, "event_id should be string"),
        ("properties", dict, "properties should be dict"),
        ("context", dict, "context should be dict"),
        ("is_semantic_event", bool, "is_semantic_event should be bool"),
    )

    for record in test_events:
        for field_name in required_fields:
            assert field_name in record, f"{field_name} is required"
        for field_name, expected_type, failure_message in type_expectations:
            assert isinstance(record[field_name], expected_type), failure_message

    # Error path: generating events without customers must be rejected.
    try:
        generate_synthetic_events([], 5)
    except ValueError:
        pass  # Expected
    else:
        assert False, "Should raise ValueError for empty customers"

    print("SUCCESS: Event generation test passed")
    return test_events

# Run tests and generate data
# The smoke tests run first; an assertion failure there stops the cell before
# the full dataset is built.
print("Running data generation tests...")
test_customers = test_customer_generation()
test_events = test_event_generation()

# Build the full dataset (1000 customers, 5000 events) that later cells load
# into Delta tables via synthetic_customers / synthetic_events.
print("\nGenerating full synthetic dataset...")
try:
    synthetic_customers = generate_synthetic_customers(1000)
    synthetic_events = generate_synthetic_events(synthetic_customers, 5000)
    print(f"SUCCESS: Generated {len(synthetic_customers)} customers and {len(synthetic_events)} events")
except Exception as e:
    # Report the failure, then re-raise so the notebook run stops here.
    print(f"ERROR: Data generation failed: {str(e)}")
    raise

## Load Synthetic Data into Delta Tables

In [None]:
# Convert synthetic data to Spark DataFrames and load into Delta tables
# customers_schema / events_schema are defined elsewhere in the notebook (not
# visible in this cell). Both writes use mode("overwrite"), so re-running the
# cell rewrites the tables with the freshly generated data.

# Load customers data
customers_df = spark.createDataFrame(synthetic_customers, customers_schema)
customers_df.write.format("delta").mode("overwrite").saveAsTable(f"{CATALOG_NAME}.{DATABASE_NAME}.customers")

# Load events data
events_df = spark.createDataFrame(synthetic_events, events_schema)
events_df.write.format("delta").mode("overwrite").saveAsTable(f"{CATALOG_NAME}.{DATABASE_NAME}.events")

print("SUCCESS: Synthetic data loaded into Delta tables")

# Show sample data
# Read back from the saved tables (not the in-memory DataFrames) so the
# preview reflects what was actually written.
print("\nSample customer data:")
spark.table(f"{CATALOG_NAME}.{DATABASE_NAME}.customers").select("customer_id", "email", "traits", "is_active").show(3, truncate=False)

print("\nSample event data:")
spark.table(f"{CATALOG_NAME}.{DATABASE_NAME}.events").select("event_name", "customer_id", "timestamp", "event_category").show(5, truncate=False)

## Validate Setup and Configuration

In [None]:
# Comprehensive validation with error handling and circuit breaker patterns
from typing import List, Tuple, Dict, Any
import time
from dataclasses import dataclass

@dataclass
class ValidationResult:
    """Type-safe validation result.

    One instance describes the outcome of a single setup check.
    """
    # Outcome label; the checks in this notebook use "SUCCESS", "ERROR", "WARNING".
    status: str  # "SUCCESS", "ERROR", "WARNING"
    # Human-readable name of what was checked (e.g. "Database access").
    component: str
    # Short detail text for the outcome (e.g. "OK", "Missing", "Failed: ...").
    result: str
    # Exception that caused an ERROR result, when one was caught.
    error: Optional[Exception] = None

class CircuitBreaker:
    """Circuit breaker for validation operations.

    States:
        closed:    calls pass through; failures are counted.
        open:      calls are rejected immediately until ``timeout`` seconds
                   have elapsed since the last failure.
        half-open: one trial call is allowed; success closes the breaker,
                   failure re-opens it.

    The failure counter tracks *consecutive* failures: any successful call
    resets it, so isolated failures separated by successes do not add up and
    trip the breaker.
    """

    def __init__(self, failure_threshold: int = 3, timeout: int = 60):
        """Create a closed breaker.

        Args:
            failure_threshold: Consecutive failures needed to open the breaker.
            timeout: Seconds to stay open before allowing a trial call.
        """
        self.failure_threshold = failure_threshold
        self.timeout = timeout
        self.failure_count = 0
        self.last_failure_time = 0  # time.time() of the most recent failure
        self.state = "closed"  # closed, open, half-open

    def call(self, func, *args, **kwargs):
        """Execute function with circuit breaker protection.

        Args:
            func: Callable to invoke.
            *args: Positional arguments forwarded to ``func``.
            **kwargs: Keyword arguments forwarded to ``func``.

        Returns:
            Whatever ``func`` returns.

        Raises:
            Exception: "Circuit breaker is open" when the breaker is open and
                the timeout has not elapsed (``func`` is not called).
            Exception: Any exception raised by ``func`` is re-raised unchanged
                after being counted as a failure.
        """
        if self.state == "open":
            if time.time() - self.last_failure_time > self.timeout:
                # Cool-down elapsed: let a single trial call through.
                self.state = "half-open"
            else:
                raise Exception("Circuit breaker is open")

        try:
            result = func(*args, **kwargs)
        except Exception:
            self.failure_count += 1
            self.last_failure_time = time.time()

            if self.failure_count >= self.failure_threshold:
                self.state = "open"

            # Bare raise keeps the original exception and traceback intact.
            raise

        # Success: a half-open trial closes the breaker, and in every state
        # the consecutive-failure streak is over.
        self.state = "closed"
        self.failure_count = 0
        return result

# Initialize circuit breaker for validation
# Shared by every check in validate_setup(): opens after 2 counted failures
# and rejects calls until 30 seconds have passed since the last failure.
validation_breaker = CircuitBreaker(failure_threshold=2, timeout=30)

def validate_database_access() -> ValidationResult:
    """Check that the configured catalog/database can be selected.

    Issues a ``USE <catalog>.<database>`` statement through ``spark.sql``.

    Returns:
        ValidationResult: ``SUCCESS`` when the statement runs; otherwise
        ``ERROR`` with the failure text and the caught exception attached.
    """
    component = "Database access"
    try:
        spark.sql(f"USE {CATALOG_NAME}.{DATABASE_NAME}")
        outcome = ValidationResult("SUCCESS", component, "OK")
    except Exception as exc:
        # Never raise: callers expect a result object describing the failure.
        outcome = ValidationResult("ERROR", component, f"Failed: {exc!s}", exc)
    return outcome

def validate_table_exists(table_name: str) -> ValidationResult:
    """Check that ``table_name`` can be resolved in the configured database.

    Args:
        table_name: Unqualified table name; it is looked up as
            ``<catalog>.<database>.<table_name>`` via ``spark.table``.

    Returns:
        ValidationResult: ``SUCCESS``/"Exists" when the lookup works,
        ``ERROR``/"Missing" (with the caught exception) when it raises.
    """
    component = f"Table {table_name}"
    try:
        spark.table(f"{CATALOG_NAME}.{DATABASE_NAME}.{table_name}")
        outcome = ValidationResult("SUCCESS", component, "Exists")
    except Exception as exc:
        # Any lookup failure is reported as "Missing"; the exception is kept
        # on the result so the real cause is still available.
        outcome = ValidationResult("ERROR", component, "Missing", exc)
    return outcome

def validate_data_quality(table_name: str) -> ValidationResult:
    """Report whether a table contains any rows.

    Args:
        table_name: Unqualified table name inside the configured
            catalog/database.

    Returns:
        ValidationResult: ``SUCCESS`` with the row count when the table has
        rows, ``WARNING`` when it is empty, ``ERROR`` (with the caught
        exception) when reading or counting fails.
    """
    component = f"{table_name} data"
    try:
        row_count = spark.table(f"{CATALOG_NAME}.{DATABASE_NAME}.{table_name}").count()
        if row_count > 0:
            status, detail = "SUCCESS", f"{row_count} records"
        else:
            status, detail = "WARNING", "No records"
        return ValidationResult(status, component, detail)
    except Exception as exc:
        return ValidationResult("ERROR", component, f"Failed: {exc!s}", exc)

def validate_api_configuration() -> ValidationResult:
    """Sanity-check the module-level API ``config`` object.

    The checks run in order and stop at the first problem: API key present,
    base URL present, ``get_headers()`` yields a non-empty ``Authorization``
    entry. When all pass, the result depends on ``ENVIRONMENT``.

    Returns:
        ValidationResult: ``ERROR`` for a missing/invalid setting or an
        unexpected exception, ``WARNING`` when ``ENVIRONMENT == "test"``,
        ``SUCCESS`` otherwise.
    """
    component = "API configuration"
    try:
        if not config.api_key:
            problem = "Missing API key"
        elif not config.base_url:
            problem = "Missing base URL"
        elif not config.get_headers().get("Authorization"):
            # Exercises header generation as well as the header's presence.
            problem = "Invalid authorization header"
        else:
            problem = None

        if problem is not None:
            return ValidationResult("ERROR", component, problem)
        if ENVIRONMENT == "test":
            return ValidationResult("WARNING", component, "Test mode - mock key")
        return ValidationResult("SUCCESS", component, "Valid")
    except Exception as exc:
        return ValidationResult("ERROR", component, f"Failed: {exc!s}", exc)

def validate_setup() -> Dict[str, Any]:
    """
    Run every setup check through the shared circuit breaker and report.

    Checks, in order: database access, existence of the six required tables,
    row presence in ``customers`` and ``events``, and API configuration.
    Results are printed as a table with a summary, then returned.

    Note: the ``validate_*`` helpers defined in this file catch their own
    exceptions and return ERROR results, so the breaker only records a
    failure when a helper lets an exception escape.

    Returns:
        Dict with keys ``overall_status`` ("success", "warning" or "failed"),
        ``passed``, ``warnings``, ``failed``, ``total``, ``validations``
        (list of ValidationResult) and ``circuit_breaker_state``.
    """
    print("ANALYSIS: Running comprehensive setup validation...")

    validations: List[ValidationResult] = []

    def run_guarded(component, check, *check_args):
        """Run one check via the breaker; record an ERROR result if it raises."""
        try:
            outcome = validation_breaker.call(check, *check_args)
        except Exception as exc:
            outcome = ValidationResult("ERROR", component, f"Circuit breaker: {exc!s}", exc)
        validations.append(outcome)

    # Database validation
    run_guarded("Database access", validate_database_access)

    # Table validation
    for table in ("customers", "events", "groups", "devices", "api_responses", "batch_operations"):
        run_guarded(f"Table {table}", validate_table_exists, table)

    # Data quality validation
    for table in ("customers", "events"):
        run_guarded(f"{table} data", validate_data_quality, table)

    # API configuration validation
    run_guarded("API configuration", validate_api_configuration)

    # Print detailed results
    print("Setup Validation Results:")
    print("=" * 60)

    tally = {"SUCCESS": 0, "WARNING": 0, "ERROR": 0}
    for entry in validations:
        print(f"{entry.status} {entry.component:<25} {entry.result}")
        if entry.error and entry.status == "ERROR":
            print(f"    Error: {type(entry.error).__name__}: {str(entry.error)}")
        if entry.status in tally:
            tally[entry.status] += 1

    # Calculate summary
    passed = tally["SUCCESS"]
    warnings = tally["WARNING"]
    failed = tally["ERROR"]
    total = len(validations)

    print("\nValidation Summary:")
    print(f"  SUCCESS: Passed: {passed}")
    print(f"  WARNING: Warnings: {warnings}")
    print(f"  ERROR: Failed: {failed}")
    print(f"  DATA: Total: {total}")

    # Determine overall status: any failure wins, then any warning.
    if failed:
        print("ERROR: Some validation checks failed. Please fix issues before proceeding.")
        overall_status = "failed"
    elif warnings:
        print("WARNING: Validation passed with warnings. Review before proceeding.")
        overall_status = "warning"
    else:
        print("COMPLETED: All validation checks passed! Ready to proceed.")
        overall_status = "success"

    # Return detailed results for programmatic use
    return {
        "overall_status": overall_status,
        "passed": passed,
        "warnings": warnings,
        "failed": failed,
        "total": total,
        "validations": validations,
        "circuit_breaker_state": validation_breaker.state
    }

def test_validation_function():
    """Check that ``validate_setup`` returns a well-formed report.

    Runs ``validate_setup()`` once and asserts the shape of its return value;
    it does not require the individual checks to pass.

    Returns:
        True when every structural assertion holds, False when any assertion
        or the validation run itself raises (the error is printed).
    """
    print("TEST: Testing validation function...")

    try:
        results = validate_setup()

        # Top-level keys the rest of the notebook relies on.
        for key in ("overall_status", "validations"):
            assert key in results, f"{key} is required"

        recorded = results["validations"]
        assert isinstance(recorded, list), "validations should be list"
        assert len(recorded) > 0, "Should have validation results"

        # Every entry must expose the ValidationResult attributes.
        for item in recorded:
            for attribute in ("status", "component", "result"):
                assert hasattr(item, attribute), f"ValidationResult should have {attribute}"

        print("SUCCESS: Validation function test passed")
        return True
    except Exception as exc:
        print(f"ERROR: Validation function test failed: {exc!s}")
        return False

# Test and run validation
# Note: test_validation_function() already calls validate_setup() once, so the
# checks (and their printed report) run twice in this cell.
if test_validation_function():
    validation_results = validate_setup()
    
    # Raise exception if critical validations failed
    # Only ERROR results whose component mentions "Database" are treated as
    # critical; other failures are reported but do not stop the notebook.
    if validation_results["overall_status"] == "failed":
        critical_failures = [v for v in validation_results["validations"] 
                           if v.status == "ERROR" and "Database" in v.component]
        
        if critical_failures:
            raise Exception("Critical validation failures detected - cannot proceed")
else:
    # The structural self-test failed, so the validation report cannot be trusted.
    raise Exception("Validation function test failed")

## Cluster Configuration Recommendations

In [None]:
# Display recommended cluster configuration
# Informational only: this cell prints the recommendations and does not apply
# any of these settings to the running cluster.
print("Recommended Databricks Cluster Configuration:")
print("=" * 50)

# Flat entries are printed as "key: value"; dict-valued entries are printed as
# an indented sub-list.
recommended_config = {
    "Databricks Runtime": "11.3.x-scala2.12 or higher",
    "Node Type (Driver)": "Standard_DS3_v2 (14 GB Memory, 4 Cores)",
    "Node Type (Workers)": "Standard_DS3_v2 (14 GB Memory, 4 Cores)",
    "Workers": "2-4 (autoscaling enabled)",
    "Auto Termination": "120 minutes",
    "Spark Config": {
        "spark.sql.adaptive.enabled": "true",
        "spark.sql.adaptive.coalescePartitions.enabled": "true",
        "spark.sql.adaptive.coalescePartitions.minPartitionNum": "1",
        "spark.sql.adaptive.coalescePartitions.initialPartitionNum": "200",
        "spark.sql.adaptive.skewJoin.enabled": "true",
        "spark.databricks.delta.preview.enabled": "true",
        "spark.databricks.delta.properties.defaults.autoOptimize.optimizeWrite": "true",
        "spark.databricks.delta.properties.defaults.autoOptimize.autoCompact": "true"
    },
    # Values come from the notebook's own configuration variables.
    "Environment Variables": {
        "CUSTOMERIO_REGION": CUSTOMERIO_REGION,
        "DATABRICKS_ENV": ENVIRONMENT
    }
}

for key, value in recommended_config.items():
    if isinstance(value, dict):
        # Nested section: header line followed by indented entries.
        print(f"{key}:")
        for sub_key, sub_value in value.items():
            print(f"  {sub_key}: {sub_value}")
    else:
        print(f"{key}: {value}")

print("\nNOTE: These configurations are optimized for Customer.IO API workloads")
print("   with Delta Lake and structured streaming capabilities.")