In [None]:
# Data Mesh Implementation
# 
# This notebook provides a concise implementation of a data mesh on AWS.
%%capture
!pip install boto3 pandas aws-cdk-lib

import json
import textwrap
import uuid

import boto3
import pandas as pd
from IPython.display import Markdown, display

# Helper function to display information
def display_md(text):
    """Render ``text`` as Markdown in the notebook's output area."""
    rendered = Markdown(text)
    display(rendered)

In [None]:
# Cell 2: Setup unique identifier for the workshop
# The six-character random suffix is appended to the bucket and database names
# built in the following cells.
workshop_id = uuid.uuid4().hex[:6]
display_md(f"Workshop ID: **{workshop_id}** (Ensures unique resource names)")

In [None]:
# Cell 3: Create domain data products
def create_domain_data_products():
    """Create sample domain data products for customer and sales domains"""
    display_md("## 1. Creating Domain Data Products")
    
    # Initialize clients
    s3 = boto3.client('s3')
    glue = boto3.client('glue')
    
    # Create buckets for each domain
    customer_bucket = f"customer-domain-{workshop_id}"
    sales_bucket = f"sales-domain-{workshop_id}"
    
    try:
        # Create the buckets
        s3.create_bucket(Bucket=customer_bucket)
        s3.create_bucket(Bucket=sales_bucket)
        
        # Generate sample data
        customers_df = pd.DataFrame({
            'customer_id': [f'C{i:03d}' for i in range(1, 11)],
            'name': [f'Customer {i}' for i in range(1, 11)],
            'segment': ['HIGH_VALUE', 'STANDARD', 'GROWTH', 'AT_RISK'] * 3,
            'lifetime_value': [i * 500 for i in range(1, 11)]
        })
        
        sales_df = pd.DataFrame({
            'transaction_id': [f'T{i:03d}' for i in range(1, 21)],
            'customer_id': [f'C{(i % 10) + 1:03d}' for i in range(1, 21)],
            'amount': [i * 50 + 25 for i in range(1, 21)],
            'date': pd.date_range(start='2023-01-01', periods=20).astype(str)
        })
        
        # Save to CSV and upload to S3
        customers_df.to_csv('customers.csv', index=False)
        sales_df.to_csv('sales.csv', index=False)
        
        s3.upload_file('customers.csv', customer_bucket, 'customers.csv')
        s3.upload_file('sales.csv', sales_bucket, 'sales.csv')
        
        # Create Glue databases and tables for each domain
        customer_db = f"customer_domain_{workshop_id}"
        sales_db = f"sales_domain_{workshop_id}"
        
        glue.create_database(
            DatabaseInput={'Name': customer_db, 'Description': 'Customer domain data'}
        )
        glue.create_database(
            DatabaseInput={'Name': sales_db, 'Description': 'Sales domain data'}
        )
        
        # Create tables with domain-specific metadata
        glue.create_table(
            DatabaseName=customer_db,
            TableInput={
                'Name': 'customer_profiles',
                'StorageDescriptor': {
                    'Columns': [
                        {'Name': 'customer_id', 'Type': 'string'},
                        {'Name': 'name', 'Type': 'string'},
                        {'Name': 'segment', 'Type': 'string'},
                        {'Name': 'lifetime_value', 'Type': 'double'}
                    ],
                    'Location': f"s3://{customer_bucket}/",
                    'InputFormat': 'org.apache.hadoop.mapred.TextInputFormat',
                    'OutputFormat': 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',
                    'SerdeInfo': {
                        'SerializationLibrary': 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe',
                        'Parameters': {'field.delim': ',', 'skip.header.line.count': '1'}
                    }
                },
                'TableType': 'EXTERNAL_TABLE',
                'Parameters': {
                    'domain': 'customer',
                    'owner': 'customer_team',
                    'data_quality.completeness': '99.5%',
                    'data_update_frequency': 'daily',
                    'documentation_url': 'https://wiki.example.com/customer_domain',
                    'EXTERNAL': 'TRUE'
                }
            }
        )
        
        glue.create_table(
            DatabaseName=sales_db,
            TableInput={
                'Name': 'transactions',
                'StorageDescriptor': {
                    'Columns': [
                        {'Name': 'transaction_id', 'Type': 'string'},
                        {'Name': 'customer_id', 'Type': 'string'},
                        {'Name': 'amount', 'Type': 'double'},
                        {'Name': 'date', 'Type': 'string'}
                    ],
                    'Location': f"s3://{sales_bucket}/",
                    'InputFormat': 'org.apache.hadoop.mapred.TextInputFormat',
                    'OutputFormat': 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',
                    'SerdeInfo': {
                        'SerializationLibrary': 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe',
                        'Parameters': {'field.delim': ',', 'skip.header.line.count': '1'}
                    }
                },
                'TableType': 'EXTERNAL_TABLE',
                'Parameters': {
                    'domain': 'sales',
                    'owner': 'sales_team',
                    'data_quality.completeness': '99.8%',
                    'data_update_frequency': 'hourly',
                    'documentation_url': 'https://wiki.example.com/sales_domain',
                    'EXTERNAL': 'TRUE'
                }
            }
        )
        
        display_md(f"""
        ✅ Created domain data products:
        
        **Customer Domain**
        - Bucket: `{customer_bucket}`
        - Database: `{customer_db}`
        - Table: `customer_profiles`
        - Owner: `customer_team`
        
        **Sales Domain**
        - Bucket: `{sales_bucket}`
        - Database: `{sales_db}`
        - Table: `transactions`
        - Owner: `sales_team`
        """)
        
        return {
            'customer_bucket': customer_bucket,
            'sales_bucket': sales_bucket,
            'customer_db': customer_db,
            'sales_db': sales_db
        }
        
    except Exception as e:
        display_md(f"❌ Error creating domain data products: {str(e)}")
        return None

# Execute the function
# The result is kept in `domain_resources`, which the later cells read; it is
# None when the creation failed.
domain_resources = create_domain_data_products()

In [None]:
# Cell 4: Create self-serve infrastructure template
def create_self_serve_template():
    """Create a template for self-serve data product creation"""
    display_md("## 2. Implementing Self-serve Infrastructure")
    
    # Create a simple CDK-like template as a Python function
    template = """
    # data_product_template.py
    
    def create_data_product(domain_name, product_name, description, update_frequency='daily'):
        \"\"\"Template for creating a new domain data product\"\"\"
        import boto3
        import uuid
        
        # Generate a unique ID for the data product
        product_id = str(uuid.uuid4())[:6]
        
        # Initialize clients
        s3 = boto3.client('s3')
        glue = boto3.client('glue')
        
        # Create resources
        bucket_name = f"{domain_name}-{product_name}-{product_id}"
        database_name = f"{domain_name}_{product_name}_{product_id}"
        
        # Create S3 bucket with appropriate lifecycle policies
        s3.create_bucket(Bucket=bucket_name)
        
        # Configure lifecycle rules (simplified)
        s3.put_bucket_lifecycle_configuration(
            Bucket=bucket_name,
            LifecycleConfiguration={
                'Rules': [
                    {
                        'Status': 'Enabled',
                        'Prefix': '',
                        'Transitions': [
                            {
                                'Days': 30,
                                'StorageClass': 'INTELLIGENT_TIERING'
                            }
                        ],
                        'ID': 'Standard-to-Intelligent'
                    }
                ]
            }
        )
        
        # Create Glue database with domain metadata
        glue.create_database(
            DatabaseInput={
                'Name': database_name,
                'Description': description,
                'Parameters': {
                    'domain': domain_name,
                    'product': product_name,
                    'update_frequency': update_frequency
                }
            }
        )
        
        print(f"Created data product resources:")
        print(f"- S3 Bucket: {bucket_name}")
        print(f"- Glue Database: {database_name}")
        
        return {
            'bucket_name': bucket_name,
            'database_name': database_name
        }
    """
    
    # Write the template to a file
    with open('data_product_template.py', 'w') as f:
        f.write(template)
    
    display_md("""
    ✅ Created self-serve infrastructure template
    
    This template allows domain teams to create new data products with standardized:
    - S3 bucket with proper lifecycle policies
    - Glue database with domain metadata
    - Consistent naming and tagging conventions
    
    Example usage:
    ```python
    # Example of how domain teams would use the template
    from data_product_template import create_data_product
    
    # Create a new marketing campaign data product
    resources = create_data_product(
        domain_name='marketing',
        product_name='campaign_analytics',
        description='Marketing campaign performance data',
        update_frequency='hourly'
    )
    ```
    """)
    
    # For demonstration, actually create a marketing data product
    display_md("Creating a marketing data product using the template...")
    
    # Create marketing domain data product manually for demonstration
    try:
        marketing_bucket = f"marketing-campaigns-{workshop_id}"
        marketing_db = f"marketing_campaigns_{workshop_id}"
        
        s3 = boto3.client('s3')
        glue = boto3.client('glue')
        
        s3.create_bucket(Bucket=marketing_bucket)
        glue.create_database(
            DatabaseInput={
                'Name': marketing_db,
                'Description': 'Marketing campaign performance data',
                'Parameters': {
                    'domain': 'marketing',
                    'product': 'campaign_analytics',
                    'update_frequency': 'hourly'
                }
            }
        )
        
        # Create sample campaign data
        campaigns_df = pd.DataFrame({
            'campaign_id': [f'C{i:03d}' for i in range(1, 6)],
            'name': [f'Campaign {i}' for i in range(1, 6)],
            'channel': ['EMAIL', 'SOCIAL', 'SEARCH', 'DISPLAY', 'EMAIL'],
            'target_segment': ['HIGH_VALUE', 'GROWTH', 'STANDARD', 'ALL', 'HIGH_VALUE'],
            'budget': [5000, 3000, 2000, 10000, 4000],
            'impressions': [50000, 30000, 20000, 100000, 40000],
            'conversions': [500, 300, 200, 800, 450]
        })
        
        campaigns_df.to_csv('campaigns.csv', index=False)
        s3.upload_file('campaigns.csv', marketing_bucket, 'campaigns.csv')
        
        glue.create_table(
            DatabaseName=marketing_db,
            TableInput={
                'Name': 'campaigns',
                'StorageDescriptor': {
                    'Columns': [
                        {'Name': 'campaign_id', 'Type': 'string'},
                        {'Name': 'name', 'Type': 'string'},
                        {'Name': 'channel', 'Type': 'string'},
                        {'Name': 'target_segment', 'Type': 'string'},
                        {'Name': 'budget', 'Type': 'double'},
                        {'Name': 'impressions', 'Type': 'int'},
                        {'Name': 'conversions', 'Type': 'int'}
                    ],
                    'Location': f"s3://{marketing_bucket}/",
                    'InputFormat': 'org.apache.hadoop.mapred.TextInputFormat',
                    'OutputFormat': 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',
                    'SerdeInfo': {
                        'SerializationLibrary': 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe',
                        'Parameters': {'field.delim': ',', 'skip.header.line.count': '1'}
                    }
                },
                'TableType': 'EXTERNAL_TABLE',
                'Parameters': {
                    'domain': 'marketing',
                    'owner': 'marketing_team',
                    'data_quality.completeness': '99.0%',
                    'data_update_frequency': 'hourly',
                    'documentation_url': 'https://wiki.example.com/marketing_domain',
                    'EXTERNAL': 'TRUE'
                }
            }
        )
        
        display_md(f"""
        ✅ Created marketing data product:
        - Bucket: `{marketing_bucket}`
        - Database: `{marketing_db}`
        - Table: `campaigns`
        """)
        
        domain_resources['marketing_bucket'] = marketing_bucket
        domain_resources['marketing_db'] = marketing_db
        
    except Exception as e:
        display_md(f"⚠️ Note: Could not create marketing data product: {str(e)}")

# Execute the function
# Writes data_product_template.py and creates the marketing data product.
create_self_serve_template()

In [None]:
# Cell 5: Implement federated governance
def implement_federated_governance():
    """Implement federated governance using Lake Formation tags"""
    display_md("## 3. Implementing Federated Governance")
    
    try:
        # Create Lake Formation tags and policies
        # Note: For real implementation, you would need proper Lake Formation permissions
        # This cell demonstrates the approach conceptually
        
        display_md("""
        For federated governance, we would:
        
        1. Create Lake Formation tags for data classification and domains:
        ```python
        lakeformation_client = boto3.client('lakeformation')
        
        # Create data classification tags
        lakeformation_client.create_lf_tag(
            TagKey='data_classification',
            TagValues=['public', 'internal', 'confidential', 'restricted']
        )
        
        # Create domain tags
        lakeformation_client.create_lf_tag(
            TagKey='domain',
            TagValues=['customer', 'sales', 'marketing', 'product', 'finance']
        )
        ```
        
        2. Assign tags to resources:
        ```python
        # Tag the customer domain resources
        lakeformation_client.add_lf_tags_to_resource(
            Resource={
                'Database': {'Name': customer_db}
            },
            LFTags=[
                {'TagKey': 'domain', 'TagValues': ['customer']},
                {'TagKey': 'data_classification', 'TagValues': ['internal']}
            ]
        )
        ```
        
        3. Create tag-based access policies:
        ```python
        # Grant marketing analysts access to customer and sales data with internal classification
        lakeformation_client.grant_permissions(
            Principal={'DataLakePrincipalIdentifier': 'arn:aws:iam::account-id:role/marketing-analysts'},
            Resource={
                'LFTagPolicy': {
                    'ResourceType': 'TABLE',
                    'Expression': [
                        {'TagKey': 'data_classification', 'TagValues': ['public', 'internal']},
                        {'TagKey': 'domain', 'TagValues': ['customer', 'sales', 'marketing']}
                    ]
                }
            },
            Permissions=['SELECT']
        )
        ```
        
        This approach enables centrally-defined but distributed governance:
        - Consistent data classification across domains
        - Role-based access that spans multiple domains
        - Policy management by capability rather than by table
        """)
        
        # Create a diagram of the governance model
        governance_diagram = """
        # Federated Governance Model
        
                      ┌─────────────────────────┐
                      │ Centralized Governance  │
                      │                         │
                      │ • Data classification   │
                      │ • Access policies       │
                      │ • Compliance standards  │
                      └───────────┬─────────────┘
                                  │
                                  ▼
        ┌─────────────────────────────────────────────────────────┐
        │                  Lake Formation Tags                    │
        │                                                         │
        │  domain:[customer,sales,marketing,product,finance]      │
        │  data_classification:[public,internal,confidential]     │
        │  data_quality:[gold,silver,bronze]                      │
        └──────────────┬─────────────────┬────────────────────────┘
                       │                 │
          ┌────────────▼──────┐   ┌──────▼─────────┐    ┌─────────▼─────────┐
          │ Customer Domain   │   │ Sales Domain   │    │ Marketing Domain  │
          │                   │   │                │    │                   │
          │ • Local ownership │   │ • Local       │    │ • Local ownership │
          │ • Domain-specific │   │   ownership   │    │ • Domain-specific │
          │   quality rules   │   │ • Domain-     │    │   quality rules   │
          │ • Schema evolution│   │   specific    │    │ • Schema evolution│
          └───────────────────┘   │   quality     │    └───────────────────┘
                                  │   rules       │
                                  └───────────────┘
        """
        
        display_md(governance_diagram)
        
    except Exception as e:
        display_md(f"⚠️ Note: Skipping Lake Formation operations in this example")

# Execute the function
# Only renders explanatory Markdown; no Lake Formation request is sent.
implement_federated_governance()


In [None]:
# Cell 6: Cross-domain analytics
def implement_cross_domain_analytics():
    """Set up the cross-domain analytics database and show its view definitions.

    Creates the Glue database ``cross_domain_analytics_<workshop_id>`` (an
    existing one is reused) and displays the SQL of the cross-domain views.
    The SQL is only displayed, not executed. Illustrative query results are
    shown at the end.

    Requires the module-level ``domain_resources`` dict with the keys
    ``customer_db`` and ``sales_db``; the campaign view is included only when
    ``marketing_db`` is present as well.

    Returns:
        None. Errors are displayed rather than raised.
    """
    display_md("## 4. Implementing Cross-Domain Analytics")

    if not domain_resources:
        display_md("❌ Domain resources must be created first")
        return

    try:
        # Create cross-domain analytics database
        glue = boto3.client('glue')
        analytics_db = f"cross_domain_analytics_{workshop_id}"

        try:
            glue.create_database(
                DatabaseInput={
                    'Name': analytics_db,
                    'Description': 'Cross-domain analytics views',
                    'Parameters': {
                        'purpose': 'integrated_analytics'
                    }
                }
            )
        except glue.exceptions.AlreadyExistsException:
            # Re-running the cell must not fail on the database it created.
            display_md(f"ℹ️ Database `{analytics_db}` already exists; reusing it")

        customer_db = domain_resources['customer_db']
        sales_db = domain_resources['sales_db']
        # Absent when the marketing data product could not be created
        marketing_db = domain_resources.get('marketing_db')

        # Demonstrate cross-domain views with SQL
        customer_360_view = textwrap.dedent(f"""\
            CREATE OR REPLACE VIEW {analytics_db}.customer_360 AS
            SELECT
              c.customer_id,
              c.name,
              c.segment,
              c.lifetime_value,
              COUNT(DISTINCT s.transaction_id) AS transaction_count,
              SUM(s.amount) AS total_spend,
              MAX(s.date) AS last_purchase_date
            FROM
              {customer_db}.customer_profiles c
            LEFT JOIN
              {sales_db}.transactions s ON c.customer_id = s.customer_id
            GROUP BY
              c.customer_id,
              c.name,
              c.segment,
              c.lifetime_value
            """).strip()

        sections = [
            f"✅ Cross-domain analytics database: `{analytics_db}`",
            "The following cross-domain view definitions can be run against it "
            "(they are shown here, not executed):",
            "**Customer 360 View**",
            f"```sql\n{customer_360_view}\n```",
        ]

        if marketing_db:
            campaign_performance_view = textwrap.dedent(f"""\
                CREATE OR REPLACE VIEW {analytics_db}.campaign_performance AS
                SELECT
                  m.campaign_id,
                  m.name AS campaign_name,
                  m.channel,
                  m.target_segment,
                  m.budget,
                  m.impressions,
                  m.conversions,
                  COUNT(DISTINCT s.customer_id) AS unique_customers,
                  COUNT(s.transaction_id) AS transactions,
                  SUM(s.amount) AS revenue,
                  -- net return per unit of budget; NULLIF avoids division by zero
                  (SUM(s.amount) - m.budget) / NULLIF(m.budget, 0) AS roi
                FROM
                  {marketing_db}.campaigns m
                LEFT JOIN
                  {customer_db}.customer_profiles c
                    -- campaigns targeting 'ALL' apply to every segment
                    ON m.target_segment = c.segment OR m.target_segment = 'ALL'
                LEFT JOIN
                  {sales_db}.transactions s ON c.customer_id = s.customer_id
                GROUP BY
                  m.campaign_id,
                  m.name,
                  m.channel,
                  m.target_segment,
                  m.budget,
                  m.impressions,
                  m.conversions
                """).strip()
            sections.append("**Campaign Performance View**")
            sections.append(f"```sql\n{campaign_performance_view}\n```")
        else:
            sections.append(
                "⚠️ Campaign Performance View skipped: the marketing data "
                "product is not available."
            )

        sections.append(
            "These views enable cross-domain analytics while maintaining domain ownership:\n"
            "- Each domain team still owns and maintains their core data products\n"
            "- Analysts can query integrated views without understanding the underlying structure\n"
            "- Changes in domain data products automatically reflect in cross-domain views"
        )

        # Sections are joined at column 0 so the fences and lists render as
        # Markdown instead of as an indented literal block.
        display_md("\n\n".join(sections))

        # Create sample results for demonstration
        customer_360_results = pd.DataFrame({
            'customer_id': ['C001', 'C002', 'C003', 'C004', 'C005'],
            'name': ['Customer 1', 'Customer 2', 'Customer 3', 'Customer 4', 'Customer 5'],
            'segment': ['HIGH_VALUE', 'STANDARD', 'GROWTH', 'AT_RISK', 'HIGH_VALUE'],
            'lifetime_value': [500, 1000, 1500, 2000, 2500],
            'transaction_count': [3, 2, 4, 1, 5],
            'total_spend': [325, 175, 400, 75, 625],
            'last_purchase_date': ['2023-01-15', '2023-01-10', '2023-01-18', '2023-01-05', '2023-01-20']
        })

        display_md("### Sample Customer 360 Query Results")
        display(customer_360_results)

    except Exception as e:
        display_md(f"❌ Error implementing cross-domain analytics: {str(e)}")

# Execute the function
# Creates the analytics Glue database and displays the cross-domain view SQL.
implement_cross_domain_analytics()

In [None]:
# Cell 7: Summary
# Static recap of the four data mesh building blocks shown above; the text is
# kept at column 0 so that it renders as Markdown.
display_md("""
## Data Mesh Implementation Summary

This notebook has demonstrated the key components of a data mesh architecture on AWS:

1. **Domain-oriented data products**
   - Each domain team owns their data and its quality
   - Clear metadata and documentation
   - Domain-specific optimizations

2. **Self-serve infrastructure**
   - Standardized templates for data product creation
   - Consistent infrastructure with appropriate policies
   - Domain teams empowered to create their own products

3. **Federated governance**
   - Tag-based access policies that scale across domains
   - Centrally defined, locally implemented rules
   - Balance between consistency and domain autonomy

4. **Cross-domain analytics**
   - Integrated views that span multiple domains
   - Maintained domain ownership with unified analysis
   - Business insights that transcend organizational boundaries

This implementation provides a simplified but practical foundation for building a data mesh architecture on AWS. In a production environment, you would expand on these patterns with more sophisticated monitoring, quality controls, and further integration between domains.
""")

In [None]:
# Cell 8: Cleanup (Not auto-executed to avoid accidental resource deletion)
def cleanup_resources():
    """Clean up the Glue databases and S3 buckets created in this notebook.

    Names recorded in the module-level ``domain_resources`` dict are used
    where available. Every name also has a fallback derived from
    ``workshop_id``, so resources left behind by a partially failed cell
    are removed too. Resources that do not exist are skipped silently;
    any other failure is displayed and counted.

    Returns:
        None. Progress and failures are displayed.
    """
    display_md("## Cleaning up resources")
    display_md("Removing all resources created in this workshop...")

    # Missing or None when the domain cell was skipped or failed
    registered = globals().get('domain_resources') or {}

    # Initialize clients
    s3 = boto3.client('s3')
    glue = boto3.client('glue')

    databases = [
        registered.get('customer_db') or f"customer_domain_{workshop_id}",
        registered.get('sales_db') or f"sales_domain_{workshop_id}",
        registered.get('marketing_db') or f"marketing_campaigns_{workshop_id}",
        f"cross_domain_analytics_{workshop_id}"
    ]
    buckets = [
        registered.get('customer_bucket') or f"customer-domain-{workshop_id}",
        registered.get('sales_bucket') or f"sales-domain-{workshop_id}",
        registered.get('marketing_bucket') or f"marketing-campaigns-{workshop_id}"
    ]

    deleted = 0
    failures = 0

    # Delete Glue resources
    for db in databases:
        try:
            glue.delete_database(Name=db)
        except glue.exceptions.EntityNotFoundException:
            continue  # never created, nothing to remove
        except Exception as e:
            failures += 1
            display_md(f"- ⚠️ Could not delete database `{db}`: {str(e)}")
        else:
            deleted += 1
            display_md(f"- Deleted database: {db}")

    # Delete S3 resources
    for bucket in buckets:
        try:
            # Empty the bucket first; listing is paginated because a single
            # response holds at most 1000 keys.
            paginator = s3.get_paginator('list_objects_v2')
            for page in paginator.paginate(Bucket=bucket):
                keys = [{'Key': obj['Key']} for obj in page.get('Contents', [])]
                if keys:
                    s3.delete_objects(Bucket=bucket, Delete={'Objects': keys})

            # Delete the bucket
            s3.delete_bucket(Bucket=bucket)
        except s3.exceptions.NoSuchBucket:
            continue  # never created, nothing to remove
        except Exception as e:
            failures += 1
            display_md(f"- ⚠️ Could not delete bucket `{bucket}`: {str(e)}")
        else:
            deleted += 1
            display_md(f"- Deleted bucket: {bucket}")

    if failures:
        display_md(f"❌ Error cleaning up resources: {failures} resource(s) could not be deleted")
    elif deleted:
        display_md("✅ Cleanup complete")
    else:
        display_md("❌ No resources to clean up")

# Run this cell to clean up all resources when you're done
# (kept commented out so that "Run All" cannot delete the resources by accident)
# cleanup_resources()

display_md("To clean up all resources, uncomment and run the cleanup_resources() line in this cell.")