# Capítulo 10: Otimização e Boas Práticas

Notebook gerado automaticamente a partir do código-fonte Python.


In [None]:
# -*- coding: utf-8 -*-
"""
capitulo_10_otimizacao_boas_praticas
"""

# capitulo_10_otimizacao_boas_praticas
import duckdb
import os

# Exemplo/Bloco 1
import duckdb
# Open an in-memory DuckDB database; `con` is reused by the examples that follow.
con = duckdb.connect(":memory:")

# Anti-pattern: SELECT * fetches every column of the remote Parquet file.
print(con.execute("""
-- Baixa TODAS as colunas (desperdício de banda e tempo)
SELECT *
FROM 's3://my-bucket/wide_table.parquet'
WHERE id = 12345;
""").fetchall())

# Preferred: name only the columns that are actually needed.
print(con.execute("""
-- Baixa apenas colunas necessárias
SELECT id, name, email, created_at
FROM 's3://my-bucket/wide_table.parquet'
WHERE id = 12345;
""").fetchall())

# Export the `transactions` table to S3 as zstd-compressed Parquet, sorted by
# timestamp and customer_id, with 100_000-row row groups.
print(con.execute("""
-- Escrever dados ordenados
COPY (
    SELECT *
    FROM transactions
    ORDER BY timestamp, customer_id
) TO 's3://my-bucket/transactions.parquet' (
    FORMAT parquet,
    ROW_GROUP_SIZE 100_000,
    COMPRESSION zstd
);
""").fetchall())

# The two counts are executed separately: when several statements are sent in
# one execute() call, only the result of the last statement can be fetched, so
# the first count would never be printed.

# count(*) - described in the SQL note as reading metadata only.
print(con.execute("""
-- Muito rápido - lê apenas metadados
SELECT count(*) FROM 's3://my-bucket/large.parquet';
""").fetchall())

# count(id) - described in the SQL note as reading a single column.
print(con.execute("""
-- Também rápido - lê apenas uma coluna
SELECT count(id) FROM 's3://my-bucket/large.parquet';
""").fetchall())

# Anti-pattern: a recursive glob over the whole bucket, filtered afterwards.
print(con.execute("""
-- Lista TODO o bucket (pode ser muito lento)
SELECT * FROM 's3://huge-bucket/**/*.parquet'
WHERE date = '2024-01-15';
""").fetchall())

# Preferred: a glob narrowed to the date prefix, so fewer objects are listed.
print(con.execute("""
-- Glob específico (lista menos objetos)
SELECT * FROM 's3://huge-bucket/2024/01/15/*.parquet';
""").fetchall())

# Filter on the year/month/day partition columns so that, per the SQL note,
# only the matching partition is read.
print(con.execute("""
-- Query eficiente com partições
SELECT *
FROM 's3://bucket/data/**/*.parquet'
WHERE year = 2024 AND month = 1 AND day = 15;

-- DuckDB lê apenas a partição específica
""").fetchall())

# Three exports showing the codec suggested for each data "temperature".

# Hot data: snappy (or lz4) - fast, with good compression.
print(con.execute("""
-- Snappy ou LZ4: rápido, boa compressão
COPY hot_data TO 's3://bucket/hot.parquet' (
    FORMAT parquet,
    COMPRESSION snappy  -- ou lz4
);
""").fetchall())

# Warm data: zstd - better compression at good speed.
print(con.execute("""
-- Zstd: melhor compressão, boa velocidade
COPY warm_data TO 's3://bucket/warm.parquet' (
    FORMAT parquet,
    COMPRESSION zstd
);
""").fetchall())

# Cold/archive data: brotli - maximum compression.
print(con.execute("""
-- Brotli: máxima compressão
COPY cold_data TO 's3://bucket/archive.parquet' (
    FORMAT parquet,
    COMPRESSION brotli
);
""").fetchall())

# Row-group sizing trade-off. Both statements write to the same object key,
# so the second export targets the path used by the first.

# Smaller row groups (50_000 rows): better pruning.
print(con.execute("""
-- Row groups menores = melhor pruning
COPY filtered_data TO 's3://bucket/data.parquet' (
    FORMAT parquet,
    ROW_GROUP_SIZE 50_000,
    COMPRESSION zstd
);
""").fetchall())

# Larger row groups (500_000 rows): better compression.
print(con.execute("""
-- Row groups maiores = melhor compressão
COPY bulk_data TO 's3://bucket/data.parquet' (
    FORMAT parquet,
    ROW_GROUP_SIZE 500_000,
    COMPRESSION zstd
);
""").fetchall())

# Partition the `sales` export by the columns that queries filter on most
# often (year, month, region).
print(con.execute("""
-- Se queries frequentemente filtram por data e região
COPY sales TO 's3://bucket/sales' (
    FORMAT parquet,
    PARTITION_BY (year, month, region),
    COMPRESSION zstd
);
""").fetchall())

# Export `product_events` with an explicit string-dictionary page size limit.
print(con.execute("""
COPY product_events TO 's3://bucket/events.parquet' (
    FORMAT parquet,
    STRING_DICTIONARY_PAGE_SIZE_LIMIT 100_000,
    COMPRESSION zstd
);
""").fetchall())

# Recommended: a persistent S3 secret resolved through the AWS credential
# chain (environment, EC2 role, ...), so no keys appear in the code.
print(con.execute("""
-- Usa AWS credential chain (ENV, EC2 role, etc)
CREATE PERSISTENT SECRET production (
    TYPE s3,
    PROVIDER credential_chain
);
""").fetchall())

# Counter-example, kept deliberately: credentials hard-coded in the statement.
# Do NOT do this in production.
print(con.execute("""
-- NÃO faça isso em produção
CREATE SECRET bad_practice (
    TYPE s3,
    PROVIDER config,
    KEY_ID 'AKIAIOSFODNN7EXAMPLE',  -- Exposto em código
    SECRET 'wJalrXUtnFEMI/K7MDENG'  -- Risco de segurança
);
""").fetchall())

# Two persistent secrets, each restricted to its own bucket prefix via SCOPE.
print(con.execute("""
CREATE PERSISTENT SECRET finance_data (
    TYPE s3,
    PROVIDER credential_chain,
    SCOPE 's3://finance-bucket/sensitive'
);

CREATE PERSISTENT SECRET public_data (
    TYPE s3,
    PROVIDER credential_chain,
    SCOPE 's3://public-bucket'
);
""").fetchall())

# An unscoped persistent secret named `personal`.
print(con.execute("""
CREATE PERSISTENT SECRET personal (
    TYPE s3,
    PROVIDER credential_chain
);
""").fetchall())

# Security note on persistent secrets: the secrets file is stored unencrypted
# under ~/.duckdb/stored_secrets, which is a risk in shared environments.
#
# This note used to be passed to con.execute() as a string containing only SQL
# comments. Such a string holds no statement, so there is no result set for
# fetchall() to read; it is kept as a plain Python comment instead.

# Temporary (non-persistent) secret from the credential chain with REFRESH auto.
print(con.execute("""
CREATE SECRET temp_creds (
    TYPE s3,
    PROVIDER credential_chain,
    REFRESH auto
);
""").fetchall())

# Parallelism settings: the worker thread count and the S3 uploader thread
# limit. The second SQL comment previously said "downloads", but
# s3_uploader_thread_limit applies to the uploader, so the comment now says so.
print(con.execute("""
-- Ajustar número de threads (padrão: número de CPUs)
SET threads = 8;

-- Para uploads para o S3
SET s3_uploader_thread_limit = 10;
""").fetchall())

# HTTP timeout alternatives. Both SETs run in the same session, so the second
# value is the one left in effect.
# NOTE(review): the SQL comments assume http_timeout is in milliseconds;
# confirm the unit against the installed DuckDB/httpfs version.
print(con.execute("""
-- Aumentar timeout para arquivos grandes
SET http_timeout = 300000;  -- 5 minutos

-- Para redes lentas
SET http_timeout = 600000;  -- 10 minutos
""").fetchall())

# Memory limit alternatives. As above, the second SET overrides the first.
print(con.execute("""
-- Limitar memória (útil em ambientes com recursos limitados)
SET memory_limit = '4GB';

-- Para máquinas com mais RAM
SET memory_limit = '32GB';
""").fetchall())

# Single-statement ETL: read raw Parquet from S3, transform it and write the
# result back to S3 partitioned by region.
#
# Changes from the earlier version:
#   * The COPY target must be a string literal, so the dated path is built
#     here instead of with `||` inside the statement.
#   * The KV_METADATA timestamp is passed as a literal rather than as an
#     expression inside the option list.
#   * `region` is added to the projection, because PARTITION_BY can only
#     reference columns that the exported query produces.
# The date and timestamp are asked from DuckDB itself so they keep the same
# meaning as current_date / current_timestamp in the original statement.
run_date, processed_at = con.execute(
    "SELECT current_date, current_timestamp::VARCHAR"
).fetchone()
etl_target = f"s3://processed-bucket/output/{run_date}.parquet"

# Doubled braces ({{ }}) are literal braces inside the f-string.
print(con.execute(f"""
-- Extract, Transform, Load em um comando
COPY (
    SELECT
        id,
        upper(name) as name,
        amount * 1.1 as amount_with_tax,
        region,
        current_timestamp() as processed_at
    FROM 's3://raw-bucket/input/**/*.parquet'
    WHERE date >= '2024-01-01'
      AND status = 'active'
) TO '{etl_target}' (
    FORMAT parquet,
    PARTITION_BY (region),
    COMPRESSION zstd,
    KV_METADATA {{
        etl_version: '2.0',
        processed_at: '{processed_at}'
    }}
);
""").fetchall())

# Incremental load driven by a one-row watermark table.
#
# Changes from the earlier version:
#   * The COPY target must be a string literal, so the dated path is built
#     here instead of with `||` inside the statement.
#   * The watermark table is seeded when empty. Without a row,
#     max(last_processed_date) is NULL, the `date > NULL` filter selects
#     nothing, and the final UPDATE touches zero rows, so the watermark would
#     never advance.
run_date = con.execute("SELECT current_date").fetchone()[0]
incremental_target = f"s3://target-bucket/incremental/{run_date}.parquet"

print(con.execute(f"""
-- Tabela de controle (local ou em S3)
CREATE TABLE IF NOT EXISTS watermark (
    last_processed_date DATE
);

-- Seed the watermark on first run so the filter and the UPDATE have a row
INSERT INTO watermark
SELECT DATE '1970-01-01'
WHERE NOT EXISTS (SELECT 1 FROM watermark);

-- Processar dados incrementais
COPY (
    SELECT *
    FROM 's3://source-bucket/**/*.parquet'
    WHERE date > (SELECT max(last_processed_date) FROM watermark)
) TO '{incremental_target}';

-- Atualizar watermark
UPDATE watermark SET last_processed_date = current_date();
""").fetchall())

# Medallion-style promotion in one call: bronze -> silver (validated rows,
# partitioned by date), then silver -> gold (totals and transaction counts per
# date and category). Only the result of the last statement is printed.
# NOTE(review): is_valid() is not defined in this file; presumably a
# user-supplied macro/function - confirm it exists before running.
print(con.execute("""
-- Bronze → Silver
COPY (
    SELECT *
    FROM 's3://bronze-bucket/raw/**/*.parquet'
    WHERE is_valid(data)  -- Validação
) TO 's3://silver-bucket/cleaned/' (
    FORMAT parquet,
    PARTITION_BY (date)
);

-- Silver → Gold
COPY (
    SELECT
        date,
        category,
        sum(amount) as total,
        count(*) as transactions
    FROM 's3://silver-bucket/cleaned/**/*.parquet'
    GROUP BY date, category
) TO 's3://gold-bucket/aggregated/' (
    FORMAT parquet,
    PARTITION_BY (date)
);
""").fetchall())

# Cost example: select 3 columns from the wide table instead of all of them.
print(con.execute("""
-- Baixa apenas 3 colunas ao invés de 100
SELECT id, name, amount
FROM 's3://bucket/wide_table.parquet';
""").fetchall())

# Frequently accessed data on R2: register an R2 secret (placeholder
# credentials) and read from the r2:// bucket. The printed rows come from the
# final SELECT.
print(con.execute("""
-- Dados acessados frequentemente
CREATE SECRET r2_hot (
    TYPE r2,
    KEY_ID 'key',
    SECRET 'secret',
    ACCOUNT_ID 'account'
);

-- Ler do R2 sem custos de saída
SELECT * FROM 'r2://hot-data/**/*.parquet';
""").fetchall())

# Archive export using brotli, the maximum-compression option.
print(con.execute("""
COPY archive_data TO 's3://archive-bucket/data.parquet' (
    FORMAT parquet,
    COMPRESSION brotli  -- Máxima compressão
);
""").fetchall())

# Show the execution plan of a date-filtered scan instead of running it.
print(con.execute("""
-- Ver plano de execução
EXPLAIN SELECT *
FROM 's3://bucket/data.parquet'
WHERE date = '2024-01-15';
""").fetchall())

# Per-file compression report from Parquet metadata: compressed size,
# uncompressed size (both divided by 1024 twice) and their ratio.
print(con.execute("""
-- Comparar compressed vs uncompressed
SELECT
    file_name,
    sum(total_compressed_size) / 1024 / 1024 as compressed_mb,
    sum(total_uncompressed_size) / 1024 / 1024 as uncompressed_mb,
    sum(total_uncompressed_size) / sum(total_compressed_size) as ratio
FROM parquet_metadata('s3://bucket/**/*.parquet')
GROUP BY file_name;
""").fetchall())

# Enable JSON profiling, direct the profile to /tmp/profile.json, then run a
# query to be profiled. The printed rows are those of the SELECT.
#
# The query previously ended in `WHERE ...`, a placeholder that is not valid
# SQL. It now uses the same date predicate as the EXPLAIN example on this file.
print(con.execute("""
-- Habilitar profiling
PRAGMA enable_profiling = 'json';
PRAGMA profiling_output = '/tmp/profile.json';

-- Executar query
SELECT * FROM 's3://bucket/data.parquet' WHERE date = '2024-01-15';

-- Analisar profile.json
""").fetchall())

# Secret carrying a KMS key id, scoped to the sensitive bucket, followed by an
# export into that bucket. Per the SQL note, data written there is encrypted
# automatically.
print(con.execute("""
CREATE SECRET encrypted (
    TYPE s3,
    PROVIDER credential_chain,
    KMS_KEY_ID 'arn:aws:kms:region:account:key/id',
    SCOPE 's3://sensitive-bucket'
);

-- Dados escritos são criptografados automaticamente
COPY sensitive_data TO 's3://sensitive-bucket/data.parquet';
""").fetchall())

# End-to-end production example: secrets, performance settings, a partitioned
# ETL export and a validation count. As before, only the validation result is
# printed.
#
# Changes from the earlier version:
#   * The COPY target and the validation FROM path must be string literals, so
#     the dated prefix is built here instead of with `||` inside the SQL.
#   * The KV_METADATA values that were a function call and a subquery are
#     resolved beforehand and passed as literals.
# Splitting the work into several execute() calls keeps the original order:
# secrets and settings first, then the reads that depend on them.

# Steps 1-2: secrets per environment and performance settings.
con.execute("""
-- 1. Configurar secrets por ambiente
CREATE PERSISTENT SECRET prod_raw (
    TYPE s3,
    PROVIDER credential_chain,
    CHAIN 'env;config',
    SCOPE 's3://prod-raw-data',
    REFRESH auto
);

CREATE PERSISTENT SECRET prod_processed (
    TYPE s3,
    PROVIDER credential_chain,
    SCOPE 's3://prod-processed-data',
    KMS_KEY_ID 'arn:aws:kms:us-east-1:123456789:key/abcd',
    REFRESH auto
);

-- 2. Configurar performance
SET threads = 16;
SET memory_limit = '64GB';
SET s3_uploader_thread_limit = 20;
""")

# Values that end up as literals in the COPY statement. They are asked from
# DuckDB so they keep the meaning they had as in-statement expressions.
run_date, processed_at = con.execute(
    "SELECT current_date, current_timestamp::VARCHAR"
).fetchone()
source_row_count = con.execute("""
SELECT count(*)::VARCHAR
FROM 's3://prod-raw-data/transactions/**/*.parquet'
WHERE timestamp >= current_date() - INTERVAL '1 day'
""").fetchone()[0]
processed_prefix = f"s3://prod-processed-data/transactions/{run_date}/"

# Step 3: the ETL export. Doubled braces ({{ }}) are literal braces inside the
# f-string.
con.execute(f"""
-- 3. Pipeline ETL otimizado
COPY (
    SELECT
        -- Column pruning: apenas colunas necessárias
        id,
        customer_id,
        product_id,
        amount,
        timestamp,
        -- Enriquecimento
        EXTRACT(year FROM timestamp) as year,
        EXTRACT(month FROM timestamp) as month,
        EXTRACT(day FROM timestamp) as day,
        -- Validação
        CASE
            WHEN amount > 0 AND amount < 1000000 THEN 'valid'
            ELSE 'invalid'
        END as validation_status
    FROM 's3://prod-raw-data/transactions/**/*.parquet'
    WHERE
        -- Filtro temporal
        timestamp >= current_date() - INTERVAL '1 day'
        AND timestamp < current_date()
        -- Filtro de qualidade
        AND customer_id IS NOT NULL
        AND amount > 0
    -- Ordenação para melhor compression e pruning
    ORDER BY timestamp, customer_id
) TO '{processed_prefix}' (
    FORMAT parquet,
    COMPRESSION zstd,
    ROW_GROUP_SIZE 100_000,
    PARTITION_BY (year, month, day),
    OVERWRITE_OR_IGNORE true,
    KV_METADATA {{
        pipeline_version: '3.1.0',
        processed_at: '{processed_at}',
        source_bucket: 'prod-raw-data',
        row_count: '{source_row_count}'
    }}
);
""")

# Step 4: count what was written under today's prefix.
print(con.execute(f"""
-- 4. Validação
SELECT
    'Processed Records: ' || count(*) as summary
FROM '{processed_prefix}**/*.parquet';
""").fetchall())

# Complete project walk-through: setup, bronze ingestion, silver cleaning,
# gold aggregation and a final analysis query whose rows are printed.
#
# The bronze COPY target must be a string literal, so the dated path is built
# here instead of with `||` inside the statement. The date is asked from
# DuckDB to keep the meaning of current_date.
# NOTE(review): is_valid() is not defined in this file; presumably a
# user-supplied macro/function - confirm it exists before running.
run_date = con.execute("SELECT current_date").fetchone()[0]
bronze_target = f"s3://my-project/bronze/events/{run_date}.parquet"

print(con.execute(f"""
-- 1. Setup
CREATE PERSISTENT SECRET my_project (TYPE s3, PROVIDER credential_chain);
SET threads = 8;
SET memory_limit = '16GB';

-- 2. Ingestão (Bronze)
COPY raw_events TO '{bronze_target}';

-- 3. Limpeza (Silver)
COPY (
    SELECT *
    FROM 's3://my-project/bronze/events/**/*.parquet'
    WHERE is_valid(event_data)
) TO 's3://my-project/silver/events/' (
    FORMAT parquet,
    PARTITION_BY (date),
    COMPRESSION zstd
);

-- 4. Agregação (Gold)
COPY (
    SELECT
        date,
        event_type,
        count(*) as events,
        count(DISTINCT user_id) as unique_users
    FROM 's3://my-project/silver/events/**/*.parquet'
    GROUP BY date, event_type
) TO 's3://my-project/gold/daily_summary.parquet';

-- 5. Análise
SELECT *
FROM 's3://my-project/gold/daily_summary.parquet'
ORDER BY date DESC, events DESC;
""").fetchall())


