# Capítulo 07: Trabalhando com Parquet no S3

Notebook gerado automaticamente a partir do código-fonte Python.


In [None]:
# -*- coding: utf-8 -*-
"""
capitulo_07_trabalhando_parquet_s3
"""

# capitulo_07_trabalhando_parquet_s3
import duckdb
import os

# Example/Block 1
# Open an in-memory DuckDB database; every statement below runs on `con`.
import duckdb
con = duckdb.connect(database=":memory:")

# Four ways of pointing a query at Parquet files: a bare path, read_parquet()
# with the alternative .parq extension, an explicit list of files, and a glob.
# NOTE(review): all four statements are sent in a single execute() call, so
# presumably only the last statement's rows reach fetchall() -- confirm.
print(con.execute("""
-- Leitura básica
SELECT * FROM 'test.parquet';

-- Extensão alternativa (.parq)
SELECT * FROM read_parquet('test.parq');

-- Lista de arquivos
SELECT * FROM read_parquet(['file1.parquet', 'file2.parquet', 'file3.parquet']);

-- Glob pattern
SELECT * FROM 'test/*.parquet';
""").fetchall())

# Read every file matching the glob and also select the `filename` column
# that the `filename = true` option enables.
print(con.execute("""
-- Ver origem dos dados
SELECT *, filename
FROM read_parquet('test/*.parquet', filename = true);
""").fetchall())

# Inspect one S3 Parquet object with each of the four metadata table
# functions, printing the fetched rows after every call.

# parquet_metadata()
print(con.execute("""
SELECT * FROM parquet_metadata('s3://my-bucket/data.parquet');
""").fetchall())

# parquet_schema()
print(con.execute("""
SELECT * FROM parquet_schema('s3://my-bucket/data.parquet');
""").fetchall())

# parquet_file_metadata()
print(con.execute("""
SELECT * FROM parquet_file_metadata('s3://my-bucket/data.parquet');
""").fetchall())

# parquet_kv_metadata()
print(con.execute("""
SELECT * FROM parquet_kv_metadata('s3://my-bucket/data.parquet');
""").fetchall())

# Write Parquet files to S3 with different COMPRESSION settings, printing
# whatever each COPY statement returns.

# No COMPRESSION option given (the SQL comment labels this the Snappy default).
# `tbl` is expected to exist already; it is not created in this script.
print(con.execute("""
-- Snappy (padrão, bom balanço)
COPY (SELECT * FROM tbl) TO 's3://my-bucket/result-snappy.parquet' (
    FORMAT parquet
);
""").fetchall())

# zstd, with an explicit ROW_GROUP_SIZE, over a generated series.
print(con.execute("""
-- Zstd: melhor taxa de compressão com boa velocidade
COPY (FROM generate_series(100_000)) TO 's3://my-bucket/test.parquet' (
    FORMAT parquet,
    COMPRESSION zstd,
    ROW_GROUP_SIZE 100_000
);
""").fetchall())

# lz4 over the same generated series.
print(con.execute("""
-- LZ4: compressão muito rápida
COPY (FROM generate_series(100_000)) TO 's3://my-bucket/result-lz4.parquet' (
    FORMAT parquet,
    COMPRESSION lz4
);
""").fetchall())

# brotli over the same generated series.
print(con.execute("""
-- Brotli: máxima compressão
COPY (FROM generate_series(100_000)) TO 's3://my-bucket/result-brotli.parquet' (
    FORMAT parquet,
    COMPRESSION brotli
);
""").fetchall())

# uncompressed; the COPY source here is the local file 'test.csv'.
print(con.execute("""
-- Uncompressed: sem compressão
COPY 'test.csv' TO 's3://my-bucket/result-uncompressed.parquet' (
    FORMAT parquet,
    COMPRESSION uncompressed
);
""").fetchall())

# Export `large_table` with an explicit ROW_GROUP_SIZE of 100_000.
# `large_table` is expected to exist already; it is not created in this script.
print(con.execute("""
-- Definir tamanho do row group
COPY large_table TO 's3://my-bucket/data.parquet' (
    FORMAT parquet,
    ROW_GROUP_SIZE 100_000
);
""").fetchall())

# Export `events` with smaller row groups plus zstd, then run a
# timestamp-range query against the file that was just written.
# NOTE(review): `events` is only created near the end of this script, so this
# statement assumes the table already exists at this point -- confirm.
print(con.execute("""
-- Dados que serão frequentemente filtrados por timestamp
COPY events TO 's3://my-bucket/events.parquet' (
    FORMAT parquet,
    ROW_GROUP_SIZE 50_000,  -- Menor para melhor pruning
    COMPRESSION zstd
);

-- Query otimizada
SELECT *
FROM 's3://my-bucket/events.parquet'
WHERE timestamp BETWEEN '2024-01-01' AND '2024-01-02';
-- DuckDB pode pular row groups inteiros baseado em min/max stats
""").fetchall())

# Write a one-row Parquet file carrying two custom key/value metadata entries
# (`number` and `is_even`) supplied through the KV_METADATA option.
print(con.execute("""
-- Adicionar metadados descritivos
COPY (
    SELECT 42 AS number, true AS is_even
) TO 's3://my-bucket/kv_metadata.parquet' (
    FORMAT parquet,
    KV_METADATA {
        number: 'Answer to life, universe, and everything',
        is_even: 'not ''odd'''
    }
);
""").fetchall())

# Read the key/value metadata back from the file written above.
print(con.execute("""
SELECT * FROM parquet_kv_metadata('s3://my-bucket/kv_metadata.parquet');
""").fetchall())

# Export today's rows of `raw_data` to a date-stamped Parquet file on S3 and
# attach processing metadata (ETL version, timestamp, source table, row count).
#
# The original statement wrote the COPY target as
#   's3://processed-bucket/daily_' || current_date() || '.parquet'
# and put function calls and a scalar subquery directly inside KV_METADATA.
# The COPY target is expected to be a plain string literal, so the
# concatenation does not parse. To avoid relying on expression or subquery
# evaluation inside COPY options at all, the three dynamic values are computed
# first with an ordinary SELECT and then spliced into the statement as
# literals.
#
# The interpolated values come from DuckDB itself (a date, a timestamp and an
# integer count rendered as VARCHAR), not from user input, so embedding them
# in the SQL text is safe here.
# `raw_data` is expected to exist already; it is not created in this script.
run_date, processed_at, record_count = con.execute("""
SELECT
    current_date()::VARCHAR,
    current_timestamp()::VARCHAR,
    (SELECT count(*) FROM raw_data WHERE date = current_date())::VARCHAR;
""").fetchone()

# Braces of the KV_METADATA struct are doubled because this is an f-string.
con.execute(f"""
-- Adicionar metadados de processamento
COPY (
    SELECT *
    FROM raw_data
    WHERE date = current_date()
) TO 's3://processed-bucket/daily_{run_date}.parquet' (
    FORMAT parquet,
    COMPRESSION zstd,
    KV_METADATA {{
        etl_version: '2.1.0',
        processed_at: '{processed_at}',
        source_table: 'raw_data',
        record_count: '{record_count}'
    }}
);
""")
print(con.fetchall()) # Inspect result

# Export `lineitem` with STRING_DICTIONARY_PAGE_SIZE_LIMIT set to 100_000.
# `lineitem` is expected to exist already; it is not created in this script.
print(con.execute("""
-- Aumentar tamanho do dictionary
COPY lineitem TO 's3://my-bucket/lineitem-custom-dict.parquet' (
    FORMAT parquet,
    STRING_DICTIONARY_PAGE_SIZE_LIMIT 100_000
);
""").fetchall())

# Column projection: a SELECT * next to a query that names only three columns
# of the same wide S3 file.
# NOTE(review): both statements go through one execute() call, so presumably
# only the second query's rows are what fetchall() returns -- confirm.
print(con.execute("""
-- ❌ Baixa todas as 50 colunas
SELECT * FROM 's3://my-bucket/wide_table.parquet';

-- ✅ Baixa apenas 3 colunas
SELECT id, name, amount
FROM 's3://my-bucket/wide_table.parquet';
""").fetchall())

# Same comparison with an `id` filter on a larger file. The sizes and timings
# quoted in the SQL comments are the author's figures; this script does not
# measure them.
print(con.execute("""
-- Tabela com 100 colunas, 1M linhas, 500MB

-- Query 1: SELECT * (baixa 500MB)
SELECT *
FROM 's3://my-bucket/large.parquet'
WHERE id = 12345;
-- Tempo: ~8 segundos

-- Query 2: SELECT específico (baixa ~5MB)
SELECT id, name, email
FROM 's3://my-bucket/large.parquet'
WHERE id = 12345;
-- Tempo: ~0.3 segundos
""").fetchall())

# Filter on `timestamp` against the events file. The SQL comments describe
# the row-group min/max statistics the author expects the reader to use.
print(con.execute("""
-- Parquet tem estatísticas por row group:
-- Row Group 1: timestamp min=2024-01-01, max=2024-01-15
-- Row Group 2: timestamp min=2024-01-16, max=2024-01-31
-- Row Group 3: timestamp min=2024-02-01, max=2024-02-15

-- Query com filtro
SELECT *
FROM 's3://my-bucket/events.parquet'
WHERE timestamp = '2024-02-05';

-- DuckDB pula Row Groups 1 e 2 (fora do intervalo)
-- Lê apenas Row Group 3
""").fetchall())

# Write `events` sorted by timestamp, then count the rows of a single day in
# the sorted file.
print(con.execute("""
-- Dados ordenados por timestamp
COPY (
    SELECT *
    FROM events
    ORDER BY timestamp
) TO 's3://my-bucket/events_sorted.parquet' (
    FORMAT parquet,
    ROW_GROUP_SIZE 100_000
);

-- Queries com filtro temporal são muito eficientes
SELECT count(*)
FROM 's3://my-bucket/events_sorted.parquet'
WHERE timestamp >= '2024-01-15' AND timestamp < '2024-01-16';
-- Muito rápido graças ao row group pruning
""").fetchall())

# Read files whose schemas differ, combining columns by name via
# `union_by_name = true`.
print(con.execute("""
-- file1.parquet: id, name, age
-- file2.parquet: id, name, salary, department

SELECT *
FROM read_parquet(
    's3://my-bucket/*.parquet',
    union_by_name = true
);
""").fetchall())

# Query a recursive glob and filter on `year` / `month`; no partitioning
# option is passed here (the SQL comment says detection is automatic).
print(con.execute("""
-- DuckDB detecta automaticamente partições Hive
SELECT *
FROM 's3://my-bucket/data/**/*.parquet'
WHERE year = 2024 AND month = 1;
""").fetchall())

# Same glob, this time requesting Hive partitioning explicitly.
print(con.execute("""
SELECT *
FROM read_parquet(
    's3://my-bucket/data/**/*.parquet',
    hive_partitioning = true
)
WHERE year = 2024;
""").fetchall())

# Write `sales` partitioned by year, month and day; the trailing SQL comments
# sketch the directory layout the author expects.
# `sales` is expected to exist already; it is not created in this script.
print(con.execute("""
-- Escrever com particionamento Hive
COPY sales TO 's3://my-bucket/sales' (
    FORMAT parquet,
    PARTITION_BY (year, month, day)
);

-- Estrutura resultante:
-- s3://my-bucket/sales/
--   year=2024/
--     month=1/
--       day=1/
--         data_0.parquet
""").fetchall())

# Aggregate parquet_metadata() per file: value count, compressed and
# uncompressed byte totals, and their ratio, largest compressed size first.
print(con.execute("""
-- Ver estatísticas de todos os arquivos em um diretório
SELECT
    file_name,
    sum(num_values) as total_values,
    sum(total_compressed_size) as compressed_bytes,
    sum(total_uncompressed_size) as uncompressed_bytes,
    sum(total_uncompressed_size) / sum(total_compressed_size) as compression_ratio
FROM parquet_metadata('s3://my-bucket/data/*.parquet')
GROUP BY file_name
ORDER BY compressed_bytes DESC;
""").fetchall())

# List the files whose compressed/uncompressed ratio is above 0.5.
print(con.execute("""
-- Identificar arquivos com baixa compressão
WITH compression_stats AS (
    SELECT
        file_name,
        sum(total_compressed_size)::DOUBLE / sum(total_uncompressed_size) as ratio
    FROM parquet_metadata('s3://my-bucket/**/*.parquet')
    GROUP BY file_name
)
SELECT *
FROM compression_stats
WHERE ratio > 0.5  -- Menos de 50% de compressão
ORDER BY ratio DESC;
""").fetchall())

# Schema evolution: read older and newer files together with
# `union_by_name = true`.
print(con.execute("""
-- Arquivo antigo: id, name
-- Arquivo novo: id, name, email, phone

-- Ler ambos com union_by_name
SELECT *
FROM read_parquet(
    's3://my-bucket/*.parquet',
    union_by_name = true
);

-- Colunas ausentes aparecem como NULL
""").fetchall())

# Convert a column with TRY_CAST while selecting it.
print(con.execute("""
-- Se houver mudanças de tipo incompatíveis,
-- carregar como VARCHAR e converter
SELECT
    id,
    name,
    TRY_CAST(problematic_column AS INTEGER) as problematic_column
FROM 's3://my-bucket/data.parquet';
""").fetchall())

# Write `large_table` ordered by (timestamp, category) with zstd compression
# and 100_000-row row groups.
print(con.execute("""
-- Dados ordenados melhoram compressão e pruning
COPY (
    SELECT *
    FROM large_table
    ORDER BY timestamp, category
) TO 's3://my-bucket/optimized.parquet' (
    FORMAT parquet,
    COMPRESSION zstd,
    ROW_GROUP_SIZE 100_000
);
""").fetchall())

# Partition `sales` by year, month and region.
print(con.execute("""
-- Se queries frequentemente filtram por data e região
COPY sales TO 's3://my-bucket/sales' (
    FORMAT parquet,
    PARTITION_BY (year, month, region)
);
""").fetchall())

# One codec per storage tier: lz4 for hot_data, zstd for warm_data and
# brotli for cold_data.
# NOTE(review): three COPY statements share one execute() call, so presumably
# only the last one's result is what fetchall() returns -- confirm.
print(con.execute("""
-- Hot data (acesso frequente): snappy ou lz4
COPY hot_data TO 's3://hot-bucket/data.parquet' (
    FORMAT parquet,
    COMPRESSION lz4
);

-- Warm data (acesso ocasional): zstd
COPY warm_data TO 's3://warm-bucket/data.parquet' (
    FORMAT parquet,
    COMPRESSION zstd
);

-- Cold data (arquivamento): brotli
COPY cold_data TO 's3://archive-bucket/data.parquet' (
    FORMAT parquet,
    COMPRESSION brotli
);
""").fetchall())

# Exercise 1: create a 10000-row `sample` table, write it to S3, then query
# the file's schema, row-group metadata and file-level metadata.
# NOTE(review): each exercise sends several statements in one execute() call,
# so presumably only the final statement's rows are printed -- confirm.
print(con.execute("""
-- 1. Criar e escrever arquivo
CREATE TABLE sample AS SELECT range as id, 'Value_' || range as name FROM range(10000);
COPY sample TO 's3://your-bucket/sample.parquet';

-- 2. Ver schema
SELECT * FROM parquet_schema('s3://your-bucket/sample.parquet');

-- 3. Ver metadados
SELECT * FROM parquet_metadata('s3://your-bucket/sample.parquet');

-- 4. Ver info do arquivo
SELECT * FROM parquet_file_metadata('s3://your-bucket/sample.parquet');
""").fetchall())

# Exercise 2: write the same 100000-row `test` table with four codecs and
# compare per-file size (MB) and compression ratio from parquet_metadata().
print(con.execute("""
-- Criar tabela de teste
CREATE TABLE test AS SELECT range as id, random() as value FROM range(100000);

-- Escrever com diferentes compressões
COPY test TO 's3://your-bucket/snappy.parquet' (FORMAT parquet, COMPRESSION snappy);
COPY test TO 's3://your-bucket/zstd.parquet' (FORMAT parquet, COMPRESSION zstd);
COPY test TO 's3://your-bucket/lz4.parquet' (FORMAT parquet, COMPRESSION lz4);
COPY test TO 's3://your-bucket/brotli.parquet' (FORMAT parquet, COMPRESSION brotli);

-- Comparar tamanhos
SELECT
    file_name,
    sum(total_compressed_size) / 1024 / 1024 as size_mb,
    sum(total_uncompressed_size) / sum(total_compressed_size) as compression_ratio
FROM parquet_metadata('s3://your-bucket/*.parquet')
GROUP BY file_name;
""").fetchall())

# Exercise 3: build an `events` table with dates cycling over 90 days, derive
# `year` / `month` columns, write it partitioned by them, and count the rows
# under the year=2024/month=1 directory.
print(con.execute("""
-- 1. Criar dados com datas
CREATE TABLE events AS
SELECT
    range as id,
    DATE '2024-01-01' + INTERVAL (range % 90) DAY as date,
    'Event_' || range as description
FROM range(10000);

-- 2. Adicionar colunas de partição
ALTER TABLE events ADD COLUMN year INTEGER;
ALTER TABLE events ADD COLUMN month INTEGER;
UPDATE events SET year = EXTRACT(year FROM date), month = EXTRACT(month FROM date);

-- 3. Escrever particionado
COPY events TO 's3://your-bucket/events' (
    FORMAT parquet,
    PARTITION_BY (year, month)
);

-- 4. Ler partição específica
SELECT count(*) FROM 's3://your-bucket/events/year=2024/month=1/*.parquet';
""").fetchall())


