## 3. Melhores Práticas Gerais

**Arquitetura:**
- Use Spark para ETL pesado (TBs)
- Use DuckDB para analytics (GBs-TBs)
- Use Pandas/Polars para transformações em memória

**Performance:**
- Aplique partition pruning
- Use projection pushdown
- Cache queries frequentes
- Configure row groups adequadamente

**Segurança:**
- Use secrets para credenciais
- Implemente rotação de credenciais
- Valide a qualidade dos dados

**Organização:**
- Estruture código modular
- Implemente logging
- Adicione testes automatizados

In [None]:
import duckdb
from typing import List
from dataclasses import dataclass

@dataclass
class QualityIssue:
    """One data-quality finding produced by a monitor check."""

    # Path of the table the check ran against.
    table: str
    # Affected column, or a comma-joined list of key columns.
    column: str
    # Kind of problem, e.g. 'null_values' or 'duplicates'.
    issue_type: str
    # Severity label, e.g. 'high' or 'medium'.
    severity: str
    # Number of offending rows.
    count: int

class DataQualityMonitor:
    """Run data-quality checks on Delta tables through DuckDB.

    Issues found by ``run_all_checks`` are appended to ``self.issues``, so
    that list accumulates across calls made on the same monitor instance.
    """

    def __init__(self):
        self.con = duckdb.connect()
        self.issues: List[QualityIssue] = []

    @staticmethod
    def _quote_literal(value: str) -> str:
        """Return *value* as a single-quoted SQL string literal."""
        return "'" + str(value).replace("'", "''") + "'"

    @staticmethod
    def _quote_identifier(name: str) -> str:
        """Return *name* as a double-quoted SQL identifier."""
        return '"' + str(name).replace('"', '""') + '"'

    def check_nulls(self, table_path: str, critical_columns: List[str]) -> List[QualityIssue]:
        """Report columns of the table that contain NULL values.

        Args:
            table_path: Path passed to ``delta_scan``.
            critical_columns: Columns that are expected to have no NULLs.

        Returns:
            One ``QualityIssue`` per column with at least one NULL. Severity
            is 'high' when more than 5% of the rows are NULL, otherwise
            'medium'. Empty when there is nothing to report.
        """
        if not critical_columns:
            return []

        # COUNT(col) skips NULLs, so COUNT(*) - COUNT(col) is the NULL count.
        # Unlike SUM(CASE ...), it yields 0 (not NULL) on an empty table, and
        # all columns are measured in a single scan of the table.
        null_exprs = ', '.join(
            f'COUNT(*) - COUNT({self._quote_identifier(column)})'
            for column in critical_columns
        )
        query = f"""
            SELECT COUNT(*) AS total, {null_exprs}
            FROM delta_scan({self._quote_literal(table_path)})
            """
        row = self.con.execute(query).fetchone()
        total, null_counts = row[0], row[1:]

        issues = []
        for column, nulls in zip(critical_columns, null_counts):
            # Treat a missing/NULL count the same as zero instead of crashing.
            if not nulls:
                continue
            null_pct = (nulls / total) * 100
            issues.append(QualityIssue(
                table=table_path,
                column=column,
                issue_type='null_values',
                severity='high' if null_pct > 5 else 'medium',
                count=nulls
            ))

        return issues

    def check_duplicates(self, table_path: str, key_columns: List[str]) -> List[QualityIssue]:
        """Report rows that share the same combination of key columns.

        Args:
            table_path: Path passed to ``delta_scan``.
            key_columns: Columns that together should identify a row.

        Returns:
            A single 'high' severity ``QualityIssue`` whose ``count`` is the
            total number of rows in duplicated groups, or an empty list when
            there are no duplicates or no key columns were given.
        """
        if not key_columns:
            # Without a key there is nothing to group by; the generated SQL
            # would be invalid.
            return []

        key_cols = ', '.join(self._quote_identifier(col) for col in key_columns)

        query = f"""
        WITH dups AS (
            SELECT {key_cols}, COUNT(*) as cnt
            FROM delta_scan({self._quote_literal(table_path)})
            GROUP BY {key_cols}
            HAVING COUNT(*) > 1
        )
        SELECT COUNT(*) as dup_groups, SUM(cnt) as total_dups
        FROM dups
        """

        dup_groups, total_dups = self.con.execute(query).fetchone()

        # SUM(cnt) is NULL when there are no duplicated groups, so only the
        # group count is used to decide whether to report.
        if not dup_groups:
            return []

        return [QualityIssue(
            table=table_path,
            column=','.join(key_columns),
            issue_type='duplicates',
            severity='high',
            count=total_dups
        )]

    def run_all_checks(self, table_path: str, critical_cols: List[str], key_cols: List[str]):
        """Run the NULL and duplicate checks and print a report.

        The issues found are appended to ``self.issues``; the printed report
        lists only the issues found by this call, so running the monitor on
        several tables does not re-report earlier findings.

        Args:
            table_path: Path passed to ``delta_scan``.
            critical_cols: Columns checked for NULL values.
            key_cols: Columns checked for duplicated combinations.
        """
        print(f"Running quality checks on {table_path}...")

        null_issues = self.check_nulls(table_path, critical_cols)
        self.issues.extend(null_issues)
        duplicate_issues = self.check_duplicates(table_path, key_cols)
        self.issues.extend(duplicate_issues)

        found = null_issues + duplicate_issues
        if found:
            print(f"\n⚠ {len(found)} issues encontrados:")
            for issue in found:
                print(f"  - {issue.severity.upper()}: {issue.issue_type} em {issue.column} ({issue.count})")
        else:
            print("\n✓ Sem issues de qualidade!")

# Usage: check the partitioned sales table for NULLs in the critical
# columns and for duplicated order ids.
monitor = DataQualityMonitor()
monitor.run_all_checks(
    key_cols=['order_id'],
    critical_cols=['order_id', 'amount'],
    table_path='./sales_partitioned',
)

## 2. Data Quality Monitor

Monitor de qualidade de dados com checagens automatizadas de valores nulos e duplicatas.

In [None]:
import duckdb
import pandas as pd
from datetime import datetime, timedelta

class EcommerceAnalytics:
    """Analytics queries for e-commerce sales stored in a Delta table.

    All queries read ``base_path`` through DuckDB's ``delta_scan``.
    """

    def __init__(self, base_path: str):
        """Store the table path and open a DuckDB connection.

        Args:
            base_path: Path of the Delta table passed to ``delta_scan``.
        """
        self.base_path = base_path
        self.con = duckdb.connect()

    def _source(self) -> str:
        """Return the ``delta_scan(...)`` expression for ``base_path``."""
        # Double any single quote so the path cannot end the SQL literal.
        escaped = str(self.base_path).replace("'", "''")
        return f"delta_scan('{escaped}')"

    def get_sales_metrics(self, start_date: str, end_date: str) -> pd.DataFrame:
        """Return daily sales metrics for an inclusive date range.

        Args:
            start_date: Lower bound for ``order_date``.
            end_date: Upper bound for ``order_date``.

        Returns:
            One row per day, newest first, with order count, unique
            customers, revenue, average order value and a moving average of
            revenue.
        """
        # NOTE(review): the moving average uses ROWS, so it spans the current
        # day and the 6 preceding rows; days without orders produce no row
        # and are not counted as zero revenue.
        # NOTE(review): if order_date is a timestamp, a date-only end_date
        # presumably excludes orders placed later on that day - confirm.
        query = f"""
        SELECT
            DATE_TRUNC('day', order_date) as date,
            COUNT(DISTINCT order_id) as orders,
            COUNT(DISTINCT customer_id) as unique_customers,
            SUM(amount) as revenue,
            AVG(amount) as avg_order_value,
            -- 7-row moving average of daily revenue
            AVG(SUM(amount)) OVER (
                ORDER BY DATE_TRUNC('day', order_date)
                ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
            ) as revenue_7d_ma
        FROM {self._source()}
        WHERE order_date BETWEEN ? AND ?
        GROUP BY date
        ORDER BY date DESC
        """
        # Bind the dates as parameters instead of splicing them into the SQL
        # text, so quoting in the inputs cannot change the query.
        return self.con.execute(query, [start_date, end_date]).df()

    def get_customer_segments(self) -> pd.DataFrame:
        """Return an RFM-based customer segmentation summary.

        Recency is the number of days since each customer's latest order,
        frequency is their row count, and monetary is their summed amount.

        Returns:
            One row per segment with the customer count and the average
            recency, frequency and monetary value.
        """
        # The CASE branches are evaluated in order, so a customer lands in
        # the first segment whose condition matches.
        query = f"""
        WITH rfm AS (
            SELECT
                customer_id,
                DATEDIFF('day', MAX(order_date), CURRENT_DATE) as recency,
                COUNT(*) as frequency,
                SUM(amount) as monetary
            FROM {self._source()}
            GROUP BY customer_id
        )
        SELECT
            CASE
                WHEN recency <= 30 AND frequency >= 10 THEN 'Champions'
                WHEN recency <= 60 AND frequency >= 5 THEN 'Loyal'
                WHEN recency <= 90 THEN 'Promising'
                WHEN recency > 180 AND frequency >= 5 THEN 'At Risk'
                ELSE 'Hibernating'
            END as segment,
            COUNT(*) as customer_count,
            AVG(recency) as avg_recency,
            AVG(frequency) as avg_frequency,
            AVG(monetary) as avg_monetary
        FROM rfm
        GROUP BY segment
        """
        return self.con.execute(query).df()

# Usage: build the pipeline on the partitioned sales table.
pipeline = EcommerceAnalytics('./sales_partitioned')

# Date window covering the last 30 days, formatted as YYYY-MM-DD.
end_date = f"{datetime.now():%Y-%m-%d}"
start_date = f"{datetime.now() - timedelta(days=30):%Y-%m-%d}"

# Daily sales metrics: show the header and the first rows.
metrics = pipeline.get_sales_metrics(start_date, end_date)
print("=== SALES METRICS ===", metrics.head(), sep="\n")

# Customer segmentation summary.
segments = pipeline.get_customer_segments()
print("\n=== CUSTOMER SEGMENTS ===", segments, sep="\n")

## 1. E-commerce Analytics Pipeline

Pipeline completo para análise de vendas.

In [None]:
# Installation: install duckdb, deltalake, pyarrow and pandas into the
# notebook's environment; -q keeps pip's output quiet.
%pip install duckdb deltalake pyarrow pandas -q