# Capítulo 05: Escrita de Dados no S3

Notebook gerado automaticamente a partir do código-fonte Python.


In [None]:
# -*- coding: utf-8 -*-
"""
capitulo_05_escrita_dados_s3
"""

# capitulo_05_escrita_dados_s3
import os
from datetime import datetime, timezone

import duckdb

# Example / Block 1
import duckdb

# Every statement in this script runs against this single in-memory connection.
con = duckdb.connect(database=":memory:")

# General shape of a COPY ... TO statement that targets an S3 URL.
# NOTE(review): `table_name` and the bucket path look like placeholders —
# confirm they exist before running this cell.
copy_syntax_sql = """
COPY table_name TO 's3://bucket-name/path/filename.extension';
"""
con.execute(copy_syntax_sql)
print(con.fetchall())  # inspect result

# Create the 10,000-row `sales` sample table and copy it to S3 in one batch.
create_and_copy_sales_sql = """
-- Criar uma tabela de exemplo
CREATE TABLE sales AS
SELECT
    range as id,
    'Product_' || (range % 100) as product,
    (random() * 1000)::INTEGER as amount
FROM range(10000);

-- Escrever para S3
COPY sales TO 's3://my-bucket/sales.parquet';
"""
con.execute(create_and_copy_sales_sql)
print(con.fetchall())  # inspect result

# Copy the output of an aggregate query (per-product totals and counts)
# straight to S3 without materialising an intermediate table.
sales_summary_sql = """
-- Escrever resultado de uma query diretamente
COPY (
    SELECT
        product,
        sum(amount) as total_sales,
        count(*) as transactions
    FROM sales
    GROUP BY product
) TO 's3://my-bucket/sales_summary.parquet';
"""
con.execute(sales_summary_sql)
print(con.fetchall())  # inspect result

# Parquet output with the format named explicitly.
parquet_rows = con.execute("""
-- Formato Parquet com compressão Snappy (padrão)
COPY sales TO 's3://my-bucket/data.parquet' (FORMAT parquet);
""").fetchall()
print(parquet_rows)  # inspect result

# CSV output with a header row and a comma delimiter.
csv_rows = con.execute("""
-- Escrever como CSV
COPY sales TO 's3://my-bucket/data.csv' (
    FORMAT csv,
    HEADER true,
    DELIMITER ','
);
""").fetchall()
print(csv_rows)  # inspect result

# JSON output twice: once with default options, once with ARRAY false.
json_rows = con.execute("""
-- Escrever como JSON
COPY sales TO 's3://my-bucket/data.json' (FORMAT json);

-- JSON Lines (NDJSON)
COPY sales TO 's3://my-bucket/data.jsonl' (
    FORMAT json,
    ARRAY false
);
""").fetchall()
print(json_rows)  # inspect result

# Parquet write with no explicit COMPRESSION option.
# NOTE(review): `tbl` is not created anywhere in the visible script —
# presumably a placeholder; confirm.
snappy_sql = """
-- Compressão Snappy (padrão, não precisa especificar)
COPY (SELECT * FROM tbl) TO 's3://my-bucket/result-snappy.parquet' (
    FORMAT parquet
);
"""
con.execute(snappy_sql)
print(con.fetchall())  # inspect result

# Zstd-compressed Parquet with an explicit row group size.
zstd_sql = """
-- Compressão Zstd (melhor taxa de compressão)
COPY (FROM generate_series(100_000)) TO 's3://my-bucket/test.parquet' (
    FORMAT parquet,
    COMPRESSION zstd,
    ROW_GROUP_SIZE 100_000
);
"""
con.execute(zstd_sql)
print(con.fetchall())  # inspect result

# LZ4-compressed Parquet.
lz4_sql = """
-- Compressão LZ4 (mais rápida)
COPY (FROM generate_series(100_000)) TO 's3://my-bucket/result-lz4.parquet' (
    FORMAT parquet,
    COMPRESSION lz4
);
"""
con.execute(lz4_sql)
print(con.fetchall())  # inspect result

# Brotli-compressed Parquet.
brotli_sql = """
-- Compressão Brotli (máxima compressão)
COPY (FROM generate_series(100_000)) TO 's3://my-bucket/result-brotli.parquet' (
    FORMAT parquet,
    COMPRESSION brotli
);
"""
con.execute(brotli_sql)
print(con.fetchall())  # inspect result

# Uncompressed Parquet, sourced from a file path instead of a table.
# NOTE(review): this reads 'test.csv' relative to the working directory —
# confirm the file exists before running.
uncompressed_sql = """
-- Sem compressão
COPY 'test.csv' TO 's3://my-bucket/result-uncompressed.parquet' (
    FORMAT parquet,
    COMPRESSION uncompressed
);
"""
con.execute(uncompressed_sql)
print(con.fetchall())  # inspect result

# Partitioned writes: first keyed on one column, then on two.
# NOTE(review): the `sales` table created earlier in this script only has
# id/product/amount columns, so `year`/`month` may not resolve — confirm the
# intended source table.
single_key_partition_sql = """
-- Particionar por uma coluna
COPY sales TO 's3://my-bucket/partitioned' (
    FORMAT parquet,
    PARTITION_BY (year)
);
"""
con.execute(single_key_partition_sql)
print(con.fetchall())  # inspect result

multi_key_partition_sql = """
-- Particionar por múltiplas colunas
COPY sales TO 's3://my-bucket/partitioned' (
    FORMAT parquet,
    PARTITION_BY (year, month)
);
"""
con.execute(multi_key_partition_sql)
print(con.fetchall())  # inspect result

# End-to-end partitioning example executed as one batch:
#   1. build a 100,000-row `transactions` table with a derived date column,
#   2. add `year` / `month` columns and fill them from that date,
#   3. write the table partitioned by (year, month) with zstd compression.
transactions_partitioned_sql = """
-- Criar tabela com dados temporais
CREATE TABLE transactions AS
SELECT
    range as id,
    DATE '2024-01-01' + INTERVAL (range % 365) DAY as date,
    (random() * 1000)::INTEGER as amount
FROM range(100000);

-- Adicionar colunas de particionamento
ALTER TABLE transactions ADD COLUMN year INTEGER;
ALTER TABLE transactions ADD COLUMN month INTEGER;

UPDATE transactions
SET
    year = EXTRACT(year FROM date),
    month = EXTRACT(month FROM date);

-- Escrever particionado
COPY transactions TO 's3://my-bucket/transactions' (
    FORMAT parquet,
    PARTITION_BY (year, month),
    COMPRESSION zstd
);
"""
transactions_rows = con.execute(transactions_partitioned_sql).fetchall()
print(transactions_rows)  # inspect result

# Partitioned write that tolerates files already present under the target
# prefix (OVERWRITE_OR_IGNORE).
# The source is the `transactions` table built earlier in this script, which
# carries the `year` / `month` partition columns. The previous text used the
# bare identifier `table`, which is a reserved SQL keyword and fails to parse.
overwrite_sql = """
-- Sobrescrever arquivos existentes
COPY transactions TO 's3://my-bucket/partitioned' (
    FORMAT parquet,
    PARTITION_BY (year, month),
    OVERWRITE_OR_IGNORE true
);
"""
con.execute(overwrite_sql)
print(con.fetchall())  # inspect result

# Parquet write with an explicit ROW_GROUP_SIZE.
# NOTE(review): `large_table` is not defined in the visible script —
# presumably a placeholder; confirm.
row_group_rows = con.execute("""
-- Definir tamanho do row group (padrão: 122880 linhas)
COPY large_table TO 's3://my-bucket/data.parquet' (
    FORMAT parquet,
    ROW_GROUP_SIZE 100_000
);
""").fetchall()
print(row_group_rows)  # inspect result

# Smaller row groups combined with zstd compression.
# NOTE(review): `events` is likewise not defined in the visible script.
small_row_group_rows = con.execute("""
-- Row groups menores para melhor pruning
COPY events TO 's3://my-bucket/events.parquet' (
    FORMAT parquet,
    ROW_GROUP_SIZE 50_000,
    COMPRESSION zstd
);
""").fetchall()
print(small_row_group_rows)  # inspect result

# Write a one-row Parquet file and attach two key/value metadata entries
# through the KV_METADATA option.
write_kv_metadata_sql = """
-- Adicionar metadados ao arquivo Parquet
COPY (
    SELECT 42 AS number, true AS is_even
) TO 's3://my-bucket/kv_metadata.parquet' (
    FORMAT parquet,
    KV_METADATA {
        number: 'Answer to life, universe, and everything',
        is_even: 'not ''odd'''
    }
);
"""
con.execute(write_kv_metadata_sql)
print(con.fetchall())  # inspect result

# Read the metadata back from the file that was just written.
read_kv_metadata_sql = """
-- Verificar metadados escritos
SELECT * FROM parquet_kv_metadata('s3://my-bucket/kv_metadata.parquet');
"""
con.execute(read_kv_metadata_sql)
print(con.fetchall())  # inspect result

# Parquet write with a custom STRING_DICTIONARY_PAGE_SIZE_LIMIT.
# NOTE(review): `lineitem` is not created in the visible script —
# presumably a placeholder; confirm.
dictionary_limit_sql = """
-- Configurar tamanho do dictionary page para strings
COPY lineitem TO 's3://my-bucket/lineitem-custom-dict.parquet' (
    FORMAT parquet,
    STRING_DICTIONARY_PAGE_SIZE_LIMIT 100_000
);
"""
dictionary_limit_rows = con.execute(dictionary_limit_sql).fetchall()
print(dictionary_limit_rows)  # inspect result

# Register an S3 secret scoped to 's3://encrypted-bucket' that carries a
# KMS key id, then write a table to that bucket in the same batch.
# NOTE(review): `sensitive_data` is not defined in the visible script, and
# the KMS ARN looks like a sample value — confirm both.
kms_write_sql = """
-- Configurar secret com KMS key
CREATE OR REPLACE SECRET encrypted_secret (
    TYPE s3,
    PROVIDER credential_chain,
    CHAIN 'config',
    REGION 'eu-west-1',
    KMS_KEY_ID 'arn:aws:kms:eu-west-1:123456789:key/abcd-1234-5678',
    SCOPE 's3://encrypted-bucket'
);

-- Escrever dados (serão criptografados automaticamente)
COPY sensitive_data TO 's3://encrypted-bucket/data.parquet';
"""
con.execute(kms_write_sql)
print(con.fetchall())  # inspect result

# Set the three s3_uploader_* options for this connection.
uploader_settings_sql = """
-- Configurar parâmetros de upload (via SET)
SET s3_uploader_max_parts_per_file = 10000;
SET s3_uploader_max_filesize = '5GB';
SET s3_uploader_thread_limit = 50;
"""
con.execute(uploader_settings_sql)
print(con.fetchall())  # inspect result

# Write a large table as zstd-compressed Parquet.
# NOTE(review): `huge_table` is not defined in the visible script —
# presumably a placeholder; confirm.
huge_write_sql = """
-- Escrever tabela muito grande (multipart upload automático)
COPY huge_table TO 's3://my-bucket/huge_data.parquet' (
    FORMAT parquet,
    COMPRESSION zstd
);
"""
con.execute(huge_write_sql)
print(con.fetchall())  # inspect result

# Incremental partitioned writes: January 2024 first, then February 2024,
# both under the same 's3://my-bucket/transactions' prefix.
#
# OVERWRITE_OR_IGNORE is set on both statements because the target prefix is
# already populated (an earlier cell of this script writes `transactions`
# there, and the second COPY follows the first). Without the flag DuckDB's
# existing-directory check is documented to reject a partitioned COPY into a
# non-empty target. The two date ranges map to different (year, month)
# partitions, so the second write does not replace the first one's files.
incremental_write_sql = """
-- Primeira escrita
COPY (
    SELECT * FROM transactions
    WHERE date >= '2024-01-01' AND date < '2024-02-01'
) TO 's3://my-bucket/transactions' (
    FORMAT parquet,
    PARTITION_BY (year, month),
    OVERWRITE_OR_IGNORE true
);

-- Segunda escrita (diferentes partições, não sobrescreve)
COPY (
    SELECT * FROM transactions
    WHERE date >= '2024-02-01' AND date < '2024-03-01'
) TO 's3://my-bucket/transactions' (
    FORMAT parquet,
    PARTITION_BY (year, month),
    OVERWRITE_OR_IGNORE true
);
"""
con.execute(incremental_write_sql)
print(con.fetchall())  # inspect result

# Write a batch to an object whose key embeds the current UTC time.
#
# The key is assembled in Python because the COPY target here must be a plain
# string literal; the previous `'...' || current_timestamp() || '...'` form is
# an expression. A compact, digits-and-letters-only stamp also keeps spaces
# and colons out of the S3 key.
# NOTE(review): `batch_data` is not defined in the visible script —
# presumably a placeholder; confirm.
batch_stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
batch_target = f"s3://my-bucket/data/batch_{batch_stamp}.parquet"
con.execute(f"""
-- Escrever com timestamp no nome do arquivo
COPY batch_data TO '{batch_target}';
""")
print(con.fetchall())  # inspect result

# CSV with header, comma delimiter and double-quote quoting.
quoted_csv_rows = con.execute("""
COPY sales TO 's3://my-bucket/sales.csv' (
    FORMAT csv,
    HEADER true,
    DELIMITER ',',
    QUOTE '"'
);
""").fetchall()
print(quoted_csv_rows)  # inspect result

# Tab-separated output. The string is deliberately not a raw string, so
# Python turns \t into a real tab character before the SQL reaches DuckDB.
tsv_rows = con.execute("""
COPY sales TO 's3://my-bucket/sales.tsv' (
    FORMAT csv,
    HEADER true,
    DELIMITER '\t'
);
""").fetchall()
print(tsv_rows)  # inspect result

# Gzip-compressed CSV.
gzip_csv_rows = con.execute("""
COPY sales TO 's3://my-bucket/sales.csv.gz' (
    FORMAT csv,
    COMPRESSION gzip
);
""").fetchall()
print(gzip_csv_rows)  # inspect result

# Best-practice examples. The source tables named in this section
# (analytics_data, export_data, logs, hot_data, archive_data, large_dataset,
# data) are not created in the visible script.
# NOTE(review): presumably placeholders — confirm before running.

# One format per use case: Parquet, CSV and JSON.
format_choice_sql = """
-- ✅ Parquet para dados analíticos
COPY analytics_data TO 's3://my-bucket/data.parquet' (FORMAT parquet);

-- ✅ CSV para interoperabilidade
COPY export_data TO 's3://my-bucket/export.csv' (FORMAT csv);

-- ✅ JSON para dados semi-estruturados
COPY logs TO 's3://my-bucket/logs.jsonl' (FORMAT json);
"""
con.execute(format_choice_sql)
print(con.fetchall())  # inspect result

# Codec choice: zstd for one dataset, brotli for another.
codec_choice_sql = """
-- ✅ zstd para dados que serão lidos frequentemente
COPY hot_data TO 's3://my-bucket/hot.parquet' (
    FORMAT parquet,
    COMPRESSION zstd
);

-- ✅ brotli para dados arquivados
COPY archive_data TO 's3://my-bucket/archive.parquet' (
    FORMAT parquet,
    COMPRESSION brotli
);
"""
con.execute(codec_choice_sql)
print(con.fetchall())  # inspect result

# Partition by year, month and day.
date_partition_sql = """
-- ✅ Particionar por data para queries temporais
COPY large_dataset TO 's3://my-bucket/data' (
    FORMAT parquet,
    PARTITION_BY (year, month, day)
);
"""
con.execute(date_partition_sql)
print(con.fetchall())  # inspect result

# Explicit row group size together with zstd compression.
tuned_row_group_sql = """
-- ✅ Ajustar row group para padrão de acesso
COPY data TO 's3://my-bucket/data.parquet' (
    FORMAT parquet,
    ROW_GROUP_SIZE 100_000,
    COMPRESSION zstd
);
"""
con.execute(tuned_row_group_sql)
print(con.fetchall())  # inspect result

# Troubleshooting: show which registered secret DuckDB would pick for the
# given S3 path and secret type.
# `which_secret` is a table function, so it is queried in the FROM clause;
# calling it as a scalar in the SELECT list does not resolve.
which_secret_sql = """
-- Verifique se o bucket existe e você tem acesso
SELECT * FROM which_secret('s3://my-bucket/test.parquet', 's3');
"""
con.execute(which_secret_sql)
print(con.fetchall())  # inspect result

# Troubleshooting: adjust the uploader thread limit and the HTTP timeout.
# NOTE(review): the inline SQL comment treats http_timeout = 120000 as
# milliseconds ("2 minutos"); recent DuckDB documentation describes this
# setting in seconds — confirm the unit for the installed version.
timeout_settings_sql = """
-- Aumentar timeout e threads
SET s3_uploader_thread_limit = 10;
SET http_timeout = 120000;  -- 2 minutos
"""
timeout_settings_rows = con.execute(timeout_settings_sql).fetchall()
print(timeout_settings_rows)  # inspect result

# Exercise: create a 1,000-row table, write it to S3, then count the rows
# read back from the written file.
round_trip_sql = """
-- 1. Criar tabela de teste
CREATE TABLE test_data AS
SELECT
    range as id,
    'Value_' || range as value
FROM range(1000);

-- 2. Escrever para S3
COPY test_data TO 's3://your-bucket/test.parquet';

-- 3. Verificar escrita
SELECT count(*) FROM 's3://your-bucket/test.parquet';
"""
con.execute(round_trip_sql)
print(con.fetchall())  # inspect result

# Exercise: write the same table with snappy and with zstd so the resulting
# object sizes can be compared outside DuckDB.
codec_comparison_sql = """
-- Comparar tamanhos com diferentes compressões
COPY test_data TO 's3://your-bucket/snappy.parquet' (
    FORMAT parquet,
    COMPRESSION snappy
);

COPY test_data TO 's3://your-bucket/zstd.parquet' (
    FORMAT parquet,
    COMPRESSION zstd
);

-- Verificar tamanhos via AWS CLI ou console
"""
con.execute(codec_comparison_sql)
print(con.fetchall())  # inspect result

# Exercise: build `sales_data` (10,000 rows, dates spread over 90 days from
# 2024-01-01, three regions), derive `year` / `month`, write it partitioned
# by (year, month), then count the rows under the year=2024/month=1 prefix.
partition_exercise_sql = """
-- Criar dados com múltiplas categorias
CREATE TABLE sales_data AS
SELECT
    range as id,
    DATE '2024-01-01' + INTERVAL (range % 90) DAY as date,
    'Region_' || (range % 3) as region,
    (random() * 1000)::INTEGER as amount
FROM range(10000);

-- Adicionar colunas de partição
ALTER TABLE sales_data ADD COLUMN year INTEGER;
ALTER TABLE sales_data ADD COLUMN month INTEGER;

UPDATE sales_data
SET
    year = EXTRACT(year FROM date),
    month = EXTRACT(month FROM date);

-- Escrever particionado
COPY sales_data TO 's3://your-bucket/partitioned' (
    FORMAT parquet,
    PARTITION_BY (year, month)
);

-- Ler apenas uma partição
SELECT count(*)
FROM 's3://your-bucket/partitioned/year=2024/month=1/*.parquet';
"""
partition_exercise_rows = con.execute(partition_exercise_sql).fetchall()
print(partition_exercise_rows)  # inspect result


