# Capítulo 10: Micro-Lakehouse — Arquitetura Completa

Implementação completa de um Micro-Lakehouse com DuckDB.

## Setup

In [None]:
!pip install duckdb pandas numpy matplotlib seaborn pyarrow faker httpx -q

In [None]:
import duckdb
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import time
import matplotlib.pyplot as plt
import seaborn as sns
from faker import Faker
import json

# Shared notebook state: one Faker instance for the synthetic dimension data
# and the seaborn/matplotlib defaults used by the charts in later cells.
fake = Faker()
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (14, 6)})

print(f"DuckDB: {duckdb.__version__}")

## 10.1 Criar Lakehouse Database

In [None]:
# Connect to the DuckDB database file used by every cell below
con = duckdb.connect('micro_lakehouse.db')

print("✅ Micro-Lakehouse database criado")

## 10.2 Schema Design: Star Schema

In [None]:
# Create the star schema: three dimension tables plus one fact table.
#
# The database lives in a file that survives between runs, while the load
# cells below always INSERT freshly generated rows. Tables left over from a
# previous run would make those INSERTs fail on the primary keys, so the
# schema is rebuilt from scratch here. fact_orders is dropped first because
# its foreign keys reference the three dimensions.
for stale_table in ('fact_orders', 'dim_time', 'dim_products', 'dim_customers'):
    con.execute(f"DROP TABLE IF EXISTS {stale_table}")

# Dimension: customers
con.execute("""
    CREATE TABLE IF NOT EXISTS dim_customers (
        customer_id INTEGER PRIMARY KEY,
        name VARCHAR,
        email VARCHAR,
        segment VARCHAR,
        country VARCHAR,
        created_at TIMESTAMP
    )
""")

# Dimension: products
con.execute("""
    CREATE TABLE IF NOT EXISTS dim_products (
        product_id INTEGER PRIMARY KEY,
        product_name VARCHAR,
        category VARCHAR,
        subcategory VARCHAR,
        unit_price DECIMAL(10,2)
    )
""")

# Dimension: time (one row per calendar day, keyed by a YYYYMMDD integer)
con.execute("""
    CREATE TABLE IF NOT EXISTS dim_time (
        date_id INTEGER PRIMARY KEY,
        date DATE,
        year INTEGER,
        quarter INTEGER,
        month INTEGER,
        week INTEGER,
        day INTEGER,
        day_of_week INTEGER,
        is_weekend BOOLEAN
    )
""")

# Fact: orders, with a foreign key to each dimension
con.execute("""
    CREATE TABLE IF NOT EXISTS fact_orders (
        order_id VARCHAR PRIMARY KEY,
        customer_id INTEGER,
        product_id INTEGER,
        date_id INTEGER,
        quantity INTEGER,
        unit_price DECIMAL(10,2),
        discount DECIMAL(10,2),
        total_amount DECIMAL(10,2),
        timestamp BIGINT,
        FOREIGN KEY (customer_id) REFERENCES dim_customers(customer_id),
        FOREIGN KEY (product_id) REFERENCES dim_products(product_id),
        FOREIGN KEY (date_id) REFERENCES dim_time(date_id)
    )
""")

print("✅ Star Schema criado:")
print("  - dim_customers")
print("  - dim_products")
print("  - dim_time")
print("  - fact_orders")

## 10.3 Carregar Dados

In [None]:
# Build the customer dimension: 10,000 synthetic customers
segment_options = ['Enterprise', 'SMB', 'Consumer']
country_options = ['USA', 'UK', 'Brazil', 'Germany', 'Japan']

customers = [
    {
        'customer_id': customer_id,
        'name': fake.name(),
        'email': fake.email(),
        'segment': np.random.choice(segment_options),
        'country': np.random.choice(country_options),
        'created_at': fake.date_time_between(start_date='-2y', end_date='now'),
    }
    for customer_id in range(1, 10001)
]

# The DataFrame columns are in the same order as dim_customers, so the
# positional SELECT * lines up with the table definition.
customers_df = pd.DataFrame(customers)
con.execute("INSERT INTO dim_customers SELECT * FROM customers_df")
print(f"✅ {len(customers):,} clientes carregados")

In [None]:
# Build the product dimension: 50 products for each subcategory
categories = {
    'Electronics': ['Laptop', 'Phone', 'Tablet', 'Headphones'],
    'Clothing': ['Shirt', 'Pants', 'Shoes', 'Jacket'],
    'Home': ['Furniture', 'Appliances', 'Decor', 'Kitchen']
}

# Flatten category -> subcategory -> item number into one ordered list so
# product ids can be assigned sequentially starting at 1.
catalog = [
    (category, subcategory, n)
    for category, subcategories in categories.items()
    for subcategory in subcategories
    for n in range(50)
]

products = [
    {
        'product_id': pid,
        'product_name': f"{subcategory} {n+1}",
        'category': category,
        'subcategory': subcategory,
        'unit_price': round(np.random.uniform(10, 2000), 2),
    }
    for pid, (category, subcategory, n) in enumerate(catalog, start=1)
]

products_df = pd.DataFrame(products)
con.execute("INSERT INTO dim_products SELECT * FROM products_df")
print(f"✅ {len(products):,} produtos carregados")

In [None]:
# Build the time dimension: 365 consecutive days starting 365 days ago
start_date = datetime.now() - timedelta(days=365)


def _time_dim_row(day):
    """Return the dim_time row (as a dict) for the given datetime."""
    weekday = day.weekday()  # 0 = Monday ... 6 = Sunday
    return {
        'date_id': int(day.strftime('%Y%m%d')),
        'date': day.date(),
        'year': day.year,
        'quarter': (day.month - 1) // 3 + 1,
        'month': day.month,
        'week': day.isocalendar()[1],
        'day': day.day,
        'day_of_week': weekday,
        'is_weekend': weekday >= 5,
    }


time_dims = [_time_dim_row(start_date + timedelta(days=offset)) for offset in range(365)]

time_df = pd.DataFrame(time_dims)
con.execute("INSERT INTO dim_time SELECT * FROM time_df")
print(f"✅ {len(time_dims)} dias carregados")

In [None]:
# Generate the orders fact table (1 million transactions).
#
# Every column is drawn as a whole NumPy array instead of building one dict
# per order inside a Python loop: a million scalar np.random calls plus a
# million dicts are far slower and much heavier on memory than the
# vectorized equivalent, and the resulting distributions are the same.
print("Gerando 1 milhão de transações...")

n_orders = 1_000_000

# Per-day lookup tables. Only 365 distinct dates exist (the same ones loaded
# into dim_time from start_date), so date_id and the epoch timestamp are
# computed once per day and then selected by a random day offset.
order_days = [start_date + timedelta(days=offset) for offset in range(365)]
day_date_ids = np.array([int(day.strftime('%Y%m%d')) for day in order_days])
day_timestamps = np.array([int(day.timestamp()) for day in order_days])

# Per-product lookup tables, in the same order as the products list
product_ids = np.array([product['product_id'] for product in products])
product_prices = np.array([product['unit_price'] for product in products])

day_idx = np.random.randint(0, 365, size=n_orders)
product_idx = np.random.randint(0, len(products), size=n_orders)
quantities = np.random.randint(1, 10, size=n_orders)  # 1..9 units
discounts = np.random.choice(
    [0, 0.05, 0.10, 0.15, 0.20],
    size=n_orders,
    p=[0.6, 0.2, 0.1, 0.05, 0.05],
)
unit_prices = product_prices[product_idx]

# Column order matches fact_orders, since the INSERT below is positional
orders_df = pd.DataFrame({
    'order_id': [f'ORD-{i:08d}' for i in range(n_orders)],
    'customer_id': np.random.randint(1, 10001, size=n_orders),
    'product_id': product_ids[product_idx],
    'date_id': day_date_ids[day_idx],
    'quantity': quantities,
    'unit_price': unit_prices,
    'discount': discounts,
    'total_amount': np.round(quantities * unit_prices * (1 - discounts), 2),
    'timestamp': day_timestamps[day_idx],
})

con.execute("INSERT INTO fact_orders SELECT * FROM orders_df")
print(f"\n✅ {len(orders_df):,} transações carregadas")

## 10.4 Analytics Queries

In [None]:
# Query 1: monthly revenue for the 12 most recent months, newest first
print("=== Query 1: Revenue Mensal ===")

monthly_revenue_sql = """
    SELECT 
        t.year,
        t.month,
        COUNT(DISTINCT o.order_id) as num_orders,
        COUNT(DISTINCT o.customer_id) as unique_customers,
        ROUND(SUM(o.total_amount), 2) as total_revenue,
        ROUND(AVG(o.total_amount), 2) as avg_order_value
    FROM fact_orders o
    JOIN dim_time t ON o.date_id = t.date_id
    GROUP BY t.year, t.month
    ORDER BY t.year DESC, t.month DESC
    LIMIT 12
"""

# The measured time covers query execution plus conversion to a DataFrame
query_started = time.perf_counter()
monthly_revenue = con.execute(monthly_revenue_sql).fetchdf()
elapsed = time.perf_counter() - query_started

print(f"Tempo: {elapsed*1000:.1f}ms\n")
print(monthly_revenue.to_string(index=False))

In [None]:
# Plot monthly revenue as a line chart.
# 'period' is a zero-padded 'YYYY-MM' label, so sorting it as text is the
# same as sorting chronologically.
monthly_revenue['period'] = monthly_revenue['year'].astype(str) + '-' + monthly_revenue['month'].astype(str).str.zfill(2)

# The query returns the newest month first, and matplotlib draws string
# categories in the order they are given. Sort first so the x-axis reads
# from the oldest month on the left to the newest on the right.
revenue_by_period = monthly_revenue.sort_values('period')

fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(revenue_by_period['period'], revenue_by_period['total_revenue'],
        marker='o', linewidth=2, markersize=8, color='#2196F3')
ax.set_xlabel('Período')
ax.set_ylabel('Revenue ($)')
ax.set_title('Revenue Mensal', fontsize=14, fontweight='bold')
ax.grid(axis='y', alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Query 2: the five best-selling products (by revenue) in each category
print("\n=== Query 2: Top 5 Produtos por Categoria ===")

top_products_sql = """
    WITH product_sales AS (
        SELECT 
            p.category,
            p.product_name,
            SUM(o.quantity) as units_sold,
            SUM(o.total_amount) as revenue,
            ROW_NUMBER() OVER (PARTITION BY p.category ORDER BY SUM(o.total_amount) DESC) as rank
        FROM fact_orders o
        JOIN dim_products p ON o.product_id = p.product_id
        GROUP BY p.category, p.product_name
    )
    SELECT 
        category,
        product_name,
        units_sold,
        ROUND(revenue, 2) as revenue
    FROM product_sales
    WHERE rank <= 5
    ORDER BY category, rank
"""

# The measured time covers query execution plus conversion to a DataFrame
query_started = time.perf_counter()
top_products = con.execute(top_products_sql).fetchdf()
elapsed = time.perf_counter() - query_started

print(f"Tempo: {elapsed*1000:.1f}ms\n")
print(top_products.to_string(index=False))

In [None]:
# Query 3: customer segmentation by RFM (Recency, Frequency, Monetary).
#
# Each customer gets a 1-5 score per metric via NTILE(5), where 5 is always
# the best bucket:
#   - recency:   ordered by last_order_ts ascending, so the most recent
#                buyers land in bucket 5 (ordering DESC would put them in
#                bucket 1 and label the least recent buyers 'Champions');
#   - frequency: ordered ascending, most orders -> 5;
#   - monetary:  ordered ascending, highest spend -> 5.
#
# The customer's own segment is exposed as customer_segment so that the
# `segment` output alias used by the final GROUP BY is unambiguous (it is
# otherwise carried along by SELECT * under the same name).
print("\n=== Query 3: Análise RFM (Recency, Frequency, Monetary) ===")
start = time.perf_counter()

rfm = con.execute("""
    WITH customer_metrics AS (
        SELECT 
            c.customer_id,
            c.name,
            c.segment as customer_segment,
            MAX(o.timestamp) as last_order_ts,
            COUNT(DISTINCT o.order_id) as frequency,
            SUM(o.total_amount) as monetary
        FROM dim_customers c
        JOIN fact_orders o ON c.customer_id = o.customer_id
        GROUP BY c.customer_id, c.name, c.segment
    ),
    rfm_scores AS (
        SELECT 
            *,
            NTILE(5) OVER (ORDER BY last_order_ts) as recency_score,
            NTILE(5) OVER (ORDER BY frequency) as frequency_score,
            NTILE(5) OVER (ORDER BY monetary) as monetary_score
        FROM customer_metrics
    )
    SELECT 
        CASE 
            WHEN recency_score >= 4 AND frequency_score >= 4 THEN 'Champions'
            WHEN recency_score >= 3 AND frequency_score >= 3 THEN 'Loyal'
            WHEN recency_score >= 4 AND frequency_score <= 2 THEN 'Recent'
            WHEN recency_score <= 2 AND frequency_score >= 3 THEN 'At Risk'
            ELSE 'Others'
        END as segment,
        COUNT(*) as customers,
        ROUND(AVG(monetary), 2) as avg_ltv,
        ROUND(SUM(monetary), 2) as total_revenue
    FROM rfm_scores
    GROUP BY segment
    ORDER BY total_revenue DESC
""").fetchdf()

elapsed = time.perf_counter() - start
print(f"Tempo: {elapsed*1000:.1f}ms\n")
print(rfm.to_string(index=False))

In [None]:
# Visualize the RFM segmentation: customer share (pie) and revenue (bars)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# Left panel: share of customers in each segment
ax1.pie(rfm['customers'], labels=rfm['segment'], autopct='%1.1f%%', startangle=90)
ax1.set_title('Distribuição de Clientes por Segmento')

# Right panel: total revenue per segment
colors = ['#4CAF50', '#2196F3', '#FF9800', '#F44336', '#9E9E9E']
bars = ax2.bar(rfm['segment'], rfm['total_revenue'], color=colors, alpha=0.7)
ax2.set_ylabel('Revenue ($)')
ax2.set_title('Revenue por Segmento RFM')
ax2.grid(axis='y', alpha=0.3)
plt.xticks(rotation=45)

# Label each bar with its revenue in thousands, centered just above the bar
for rect in bars:
    height = rect.get_height()
    x_center = rect.get_x() + rect.get_width()/2.
    ax2.text(x_center, height, f'${height/1000:.0f}K',
             ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

## 10.5 Performance Benchmark

In [None]:
# Benchmark: run each query three times and report the mean wall-clock time
queries = {
    'Simple Aggregation': """
        SELECT COUNT(*), SUM(total_amount) 
        FROM fact_orders
    """,
    'Join with 1 Dimension': """
        SELECT p.category, SUM(o.total_amount) 
        FROM fact_orders o
        JOIN dim_products p ON o.product_id = p.product_id
        GROUP BY p.category
    """,
    'Join with 3 Dimensions': """
        SELECT 
            c.segment, 
            p.category, 
            t.quarter,
            SUM(o.total_amount) as revenue
        FROM fact_orders o
        JOIN dim_customers c ON o.customer_id = c.customer_id
        JOIN dim_products p ON o.product_id = p.product_id
        JOIN dim_time t ON o.date_id = t.date_id
        GROUP BY c.segment, p.category, t.quarter
    """,
    'Window Function': """
        SELECT 
            customer_id,
            SUM(total_amount) OVER (PARTITION BY customer_id) as total_spent,
            ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY timestamp DESC) as recency
        FROM fact_orders
        LIMIT 100000
    """
}

benchmark_results = []

print("=== Performance Benchmark ===")
for name, sql in queries.items():
    samples_ms = []
    for _ in range(3):
        # Each sample covers execution plus materializing the DataFrame
        t0 = time.perf_counter()
        con.execute(sql).fetchdf()
        samples_ms.append((time.perf_counter() - t0) * 1000)

    avg_time = np.mean(samples_ms)
    # The mean is stored pre-formatted as text with one decimal place
    benchmark_results.append({'Query': name, 'Avg Time (ms)': f"{avg_time:.1f}"})
    print(f"  {name}: {avg_time:.1f}ms")

benchmark_df = pd.DataFrame(benchmark_results)
print(f"\n{benchmark_df.to_string(index=False)}")

## 10.6 Database Statistics

In [None]:
# Database statistics: per-table row estimate and column count, plus the
# size of the database file on disk.
import os

# starts_with is used instead of LIKE 'dim_%' because '_' is a
# single-character wildcard in LIKE, so that pattern would also match names
# without a literal underscore after the prefix.
stats = con.execute("""
    SELECT 
        table_name,
        estimated_size as rows,
        column_count as columns
    FROM duckdb_tables()
    WHERE starts_with(table_name, 'dim_') OR starts_with(table_name, 'fact_')
    ORDER BY table_name
""").fetchdf()

print("\n=== Database Statistics ===")
print(stats.to_string(index=False))

# File size in MB. CHECKPOINT first so that changes still held in the
# write-ahead log are written into the main database file; otherwise the
# measured size can understate the stored data.
con.execute("CHECKPOINT")
db_size = os.path.getsize('micro_lakehouse.db') / 1024 / 1024
print(f"\nDatabase Size: {db_size:.1f} MB")

## 10.7 Export Data

In [None]:
# Export the fact table to a Snappy-compressed Parquet file and compare its
# size with the database file.
import os

print("\n=== Exportando para Parquet ===")

# The export/ folder is not created anywhere else in this notebook, so make
# sure it exists before COPY tries to write into it.
os.makedirs('export', exist_ok=True)

con.execute("""
    COPY fact_orders 
    TO 'export/fact_orders.parquet' 
    (FORMAT PARQUET, COMPRESSION SNAPPY)
""")

export_size = os.path.getsize('export/fact_orders.parquet') / 1024 / 1024
print(f"✅ fact_orders exportado: {export_size:.1f} MB")

# NOTE: db_size is the size of the whole database file (all four tables),
# while export_size covers fact_orders only, so this ratio is a rough
# indicator rather than a like-for-like comparison.
compression_ratio = db_size / export_size
print(f"Compression ratio: {compression_ratio:.2f}x")

## 10.8 Resumo Final

In [None]:
# Chapter summary: one (component, implementation) pair per table row
summary_rows = [
    ('Schema', 'Star Schema (3 dims + 1 fact)'),
    ('Dados', '1M transações + 10K clientes'),
    ('Queries', 'Analytics SQL (aggregations, joins, window functions)'),
    ('Performance', 'Sub-segundo para queries complexas'),
    ('Storage', f'{db_size:.1f} MB database, {export_size:.1f} MB Parquet'),
]
summary = pd.DataFrame(summary_rows, columns=['Componente', 'Implementação'])

print("\n=== RESUMO DO CAPÍTULO 10 ===")
print(summary.to_string(index=False))

# Key takeaways, printed one per line
takeaways = (
    "\n✅ Principais Conclusões:",
    "  1. Micro-Lakehouse completo com Star Schema",
    "  2. Queries analytics em sub-segundo",
    "  3. Compressão eficiente (Parquet)",
    "  4. Fácil integração com ferramentas BI",
)
for line in takeaways:
    print(line)

In [None]:
# Cleanup: close the DuckDB connection opened at the start of the notebook
con.close()
print("\n✅ Notebook concluído!")