# Capítulo 04 – Leitura de Dados no S3

Notebook gerado automaticamente a partir do código-fonte Python.


In [None]:
# -*- coding: utf-8 -*-
"""
capitulo_04_leitura_dados_s3
"""

# capitulo_04_leitura_dados_s3
import duckdb
import os

# Example / block 1: open an in-memory DuckDB database and query one S3 object.
import duckdb  # already imported at the top of the file; re-importing is a no-op

# Module-level connection shared by every query in this script.
con = duckdb.connect(database=':memory:')

# Simplest form: the S3 URL is used directly as the table in the FROM clause.
first_query_sql = """
SELECT * FROM 's3://your-bucket/filename.extension';
"""
con.execute(first_query_sql)
first_rows = con.fetchall()
print(first_rows)  # Inspect result

# Reading a Parquet file by using its S3 URL directly in the FROM clause.
#
# When several statements are passed to a single con.execute() call, only the
# result of the LAST statement can be fetched afterwards. Each example is
# therefore executed and printed on its own so that no result is silently
# discarded.

# Whole file, every column.
con.execute("""
-- Ler arquivo Parquet completo
SELECT * FROM 's3://my-bucket/data/sales.parquet';
""")
print(con.fetchall())  # Inspect result

# Column projection: only three columns are requested.
con.execute("""
-- Ler apenas algumas colunas
SELECT product_id, quantity, price
FROM 's3://my-bucket/data/sales.parquet';
""")
print(con.fetchall())  # Inspect result

# Row filter on the date column.
con.execute("""
-- Ler com filtro
SELECT *
FROM 's3://my-bucket/data/sales.parquet'
WHERE date >= '2024-01-01';
""")
print(con.fetchall())  # Inspect result

# The same reads expressed with the explicit read_parquet() table function.
#
# Only the last statement of a multi-statement execute() call yields a
# fetchable result, so every example runs in its own call and is printed
# individually.

# Basic syntax.
con.execute("""
-- Sintaxe básica
SELECT * FROM read_parquet('s3://your-bucket/file.parquet');
""")
print(con.fetchall())  # Inspect result

# Selecting specific columns.
con.execute("""
-- Com múltiplas colunas
SELECT column_a, column_b
FROM read_parquet('s3://your-bucket/file.parquet');
""")
print(con.fetchall())  # Inspect result

# Aggregation: total amount per category.
con.execute("""
-- Com agregação
SELECT category, sum(amount) as total
FROM read_parquet('s3://my-bucket/transactions.parquet')
GROUP BY category;
""")
print(con.fetchall())  # Inspect result

# Reading CSV and JSON objects from S3.
#
# Each statement is executed separately: a multi-statement execute() call only
# exposes the result of its final statement, which previously hid the output
# of the first query in each pair.

# CSV with everything auto-detected.
con.execute("""
-- Leitura automática de CSV (detecta delimitador, tipos, etc)
SELECT * FROM read_csv_auto('s3://my-bucket/data.csv');
""")
print(con.fetchall())  # Inspect result

# CSV with header, delimiter and quote character given explicitly.
con.execute("""
-- Com parâmetros específicos
SELECT * FROM read_csv_auto(
    's3://my-bucket/data.csv',
    header = true,
    delim = ',',
    quote = '"'
);
""")
print(con.fetchall())  # Inspect result

# Regular JSON file.
con.execute("""
-- Leitura de JSON
SELECT * FROM read_json_auto('s3://my-bucket/data.json');
""")
print(con.fetchall())  # Inspect result

# JSON Lines / NDJSON file.
con.execute("""
-- JSON Lines (NDJSON)
SELECT * FROM read_json_auto('s3://my-bucket/logs.jsonl');
""")
print(con.fetchall())  # Inspect result

# Reading several explicitly listed Parquet files through one read_parquet()
# call that receives a list of S3 paths.

# All rows from three monthly files.
monthly_files_sql = """
-- Ler múltiplos arquivos Parquet
SELECT * FROM read_parquet([
    's3://my-bucket/data/2024-01.parquet',
    's3://my-bucket/data/2024-02.parquet',
    's3://my-bucket/data/2024-03.parquet'
]);
"""

# Per-month transaction count and amount total over three listed files.
monthly_summary_sql = """
-- Consolidar dados de vários meses
SELECT
    date_trunc('month', date) as month,
    count(*) as total_transactions,
    sum(amount) as total_amount
FROM read_parquet([
    's3://sales-bucket/2024/january.parquet',
    's3://sales-bucket/2024/february.parquet',
    's3://sales-bucket/2024/march.parquet'
])
GROUP BY month
ORDER BY month;
"""

# Run the queries in order, printing each result right after it is produced.
for file_list_sql in (monthly_files_sql, monthly_summary_sql):
    con.execute(file_list_sql)
    print(con.fetchall())  # Inspect result

# Reading many files at once with glob patterns in the S3 path.
#
# One statement per execute() call: when several statements share a call only
# the last one's result can be fetched, so the earlier examples used to run
# without their output ever being shown.

# Every Parquet file directly inside one prefix.
con.execute("""
-- Ler todos os arquivos Parquet em um diretório
SELECT * FROM 's3://my-bucket/data/*.parquet';
""")
print(con.fetchall())  # Inspect result

# Every CSV file directly inside one prefix.
con.execute("""
-- Ler todos os CSV
SELECT * FROM read_csv_auto('s3://my-bucket/csv-files/*.csv');
""")
print(con.fetchall())  # Inspect result

# Recursive glob (**): Parquet files at any depth below the prefix.
con.execute("""
-- Ler todos os Parquet em todos os subdiretórios
SELECT * FROM 's3://my-bucket/data/**/*.parquet';
""")
print(con.fetchall())  # Inspect result

# Recursive glob below a key=value style prefix.
con.execute("""
-- Exemplo com estrutura hierárquica
SELECT * FROM 's3://logs-bucket/year=2024/**/*.parquet';
""")
print(con.fetchall())  # Inspect result

# Glob pattern syntax: single-character wildcard, character sets, and a
# combination of both.
#
# The three counts are executed separately because a multi-statement
# execute() call only returns the result of its last statement; previously
# the first two counts were computed and then thrown away.

# '?' matches exactly one character.
con.execute("""
-- Caractere único (?)
SELECT count(*) FROM 's3://my-bucket/data/file_?.parquet';
-- Corresponde: file_1.parquet, file_2.parquet, file_a.parquet
""")
print(con.fetchall())  # Inspect result

# '[0-9]' matches one character from the given set.
con.execute("""
-- Conjunto de caracteres ([])
SELECT count(*) FROM 's3://my-bucket/data/file_[0-9].parquet';
-- Corresponde: file_0.parquet, file_1.parquet, ..., file_9.parquet
""")
print(con.fetchall())  # Inspect result

# '*', '?' and '[...]' combined across several path segments.
con.execute("""
-- Padrão complexo
SELECT count(*)
FROM read_parquet('s3://my-bucket/folder*/100?/t[0-9].parquet');
-- Corresponde: folder1/1001/t5.parquet, folder2/1009/t3.parquet, etc.
""")
print(con.fetchall())  # Inspect result

# Date-structured prefixes and the `filename = true` option of read_parquet().

# All Parquet files below the 2024/01 prefix, additionally filtered on the
# timestamp column to the month of January 2024.
january_logs_sql = """
-- Estrutura: s3://logs/2024/01/01/app.parquet
--           s3://logs/2024/01/02/app.parquet
--           ...

-- Ler todos os logs de janeiro de 2024
SELECT *
FROM 's3://logs-bucket/2024/01/**/*.parquet'
WHERE timestamp >= '2024-01-01'
  AND timestamp < '2024-02-01';
"""

# filename = true is passed so that the source file name is available as a column.
rows_with_filename_sql = """
-- Adicionar nome do arquivo como coluna
SELECT *
FROM read_parquet('s3://my-bucket/data/*.parquet', filename = true);
"""

# Record count and amount total grouped by the file each row came from.
per_file_totals_sql = """
-- Ver de qual arquivo cada registro veio
SELECT
    filename,
    count(*) as records,
    sum(amount) as total
FROM read_parquet('s3://sales-bucket/2024/*.parquet', filename = true)
GROUP BY filename
ORDER BY filename;
"""

# Execute in the original order, printing each result before the next query.
for logs_sql in (january_logs_sql, rows_with_filename_sql, per_file_totals_sql):
    con.execute(logs_sql)
    print(con.fetchall())  # Inspect result

# Inspecting a Parquet file through DuckDB's metadata table functions.

# parquet_schema(): the file's schema.
schema_sql = """
-- Ver o schema do arquivo sem baixar os dados
SELECT * FROM parquet_schema('s3://my-bucket/data.parquet');
"""

# parquet_metadata(): the file's metadata.
metadata_sql = """
-- Metadados completos do arquivo
SELECT * FROM parquet_metadata('s3://my-bucket/data.parquet');
"""

# parquet_file_metadata(): file-level metadata.
file_metadata_sql = """
-- Metadados de nível de arquivo (informações sobre row groups)
SELECT * FROM parquet_file_metadata('s3://my-bucket/data.parquet');
"""

# parquet_kv_metadata(): custom key-value metadata stored in the file.
kv_metadata_sql = """
-- Metadados key-value customizados
SELECT * FROM parquet_kv_metadata('s3://my-bucket/data.parquet');
"""

# Same order as before: schema, metadata, file metadata, key-value metadata.
for inspection_sql in (schema_sql, metadata_sql, file_metadata_sql, kv_metadata_sql):
    con.execute(inspection_sql)
    print(con.fetchall())  # Inspect result

# Partial reads: selecting a single column, and counting rows.

# Only the product_id column is requested from the file.
single_column_sql = """
-- Baixa apenas a coluna 'product_id' (não o arquivo inteiro)
SELECT product_id
FROM 's3://my-bucket/large_sales.parquet';
"""
single_column_rows = con.execute(single_column_sql).fetchall()
print(single_column_rows)  # Inspect result

# Row count of the file; no column values are selected.
row_count_sql = """
-- Lê apenas metadados (não baixa dados das colunas)
SELECT count(*) FROM 's3://my-bucket/data.parquet';

-- Muito mais rápido que baixar o arquivo inteiro!
"""
row_count_rows = con.execute(row_count_sql).fetchall()
print(row_count_rows)  # Inspect result

# Contrast between SELECT * and an explicit column list on a wide table.
#
# The two variants are executed separately: when both are sent in a single
# execute() call only the second result can be fetched, so the first query
# would run without its output ever being shown.

# Inefficient variant: every column of the wide table is requested.
con.execute("""
-- ❌ Ineficiente: baixa todas as 50 colunas
SELECT *
FROM 's3://my-bucket/wide_table.parquet'
WHERE id = 123;
""")
print(con.fetchall())  # Inspect result

# Efficient variant: only the columns that are actually needed.
con.execute("""
-- ✅ Eficiente: baixa apenas coluna 'id' e as colunas selecionadas
SELECT id, name, amount
FROM 's3://my-bucket/wide_table.parquet'
WHERE id = 123;
""")
print(con.fetchall())  # Inspect result

# Hive-style partitioned layouts and files whose column sets differ.

# Filters on year/month columns over a recursive glob, without passing any
# partitioning option.
hive_auto_sql = """
-- DuckDB detecta automaticamente as partições Hive
SELECT *
FROM 's3://my-bucket/sales/**/*.parquet'
WHERE year = 2024 AND month = 1;
"""

# Same kind of read with hive_partitioning = true requested explicitly.
hive_forced_sql = """
-- Forçar uso de Hive partitioning
SELECT *
FROM read_parquet('s3://my-bucket/sales/**/*.parquet', hive_partitioning = true)
WHERE year = 2024;
"""

# union_by_name = true is passed for files that do not share the same columns.
union_by_name_sql = """
-- Arquivos podem ter colunas diferentes
-- file1.parquet: id, name, age
-- file2.parquet: id, name, salary

SELECT *
FROM read_parquet(
    's3://my-bucket/*.parquet',
    union_by_name = true
);
"""

# Execute in the original order, printing each result before the next query.
for partition_sql in (hive_auto_sql, hive_forced_sql, union_by_name_sql):
    con.execute(partition_sql)
    print(con.fetchall())  # Inspect result

# Ways to restrict the rows returned: LIMIT, sampling, and WHERE filters.

# First 1000 rows only.
limited_sql = """
-- Ler apenas primeiras 1000 linhas
SELECT *
FROM 's3://my-bucket/huge_file.parquet'
LIMIT 1000;
"""
print(con.execute(limited_sql).fetchall())  # Inspect result

# 10% sample of the file.
sampled_sql = """
-- Amostragem de 10% dos dados
SELECT *
FROM 's3://my-bucket/large_file.parquet'
USING SAMPLE 10%;
"""
print(con.execute(sampled_sql).fetchall())  # Inspect result

# Equality filters on date and status.
filtered_sql = """
-- Pushdown de filtros: DuckDB aplica filtro antes de baixar dados
SELECT *
FROM 's3://my-bucket/data.parquet'
WHERE date = '2024-01-15'  -- Filtro aplicado no Parquet row group level
  AND status = 'active';
"""
print(con.execute(filtered_sql).fetchall())  # Inspect result

# Format-specific options for CSV and JSON sources.

# CSV read with ';' as delimiter, a header row, and 'NULL' as the null marker.
custom_csv_sql = """
-- CSV com delimitador customizado
SELECT * FROM read_csv_auto(
    's3://my-bucket/data.csv',
    delim = ';',
    header = true,
    nullstr = 'NULL'
);
"""

# Extracting the 'name' and 'email' fields from a nested `data` column.
nested_json_sql = """
-- JSON com estrutura aninhada
SELECT
    id,
    data->>'name' as name,
    data->>'email' as email
FROM read_json_auto('s3://my-bucket/users.json');
"""

# JSON Lines file (one JSON object per line).
json_lines_sql = """
-- Cada linha é um objeto JSON
SELECT *
FROM read_json_auto('s3://my-bucket/logs.jsonl');
"""

# Execute in the original order, printing each result before the next query.
for format_sql in (custom_csv_sql, nested_json_sql, json_lines_sql):
    con.execute(format_sql)
    print(con.fetchall())  # Inspect result

# Joining a local DuckDB table with Parquet data stored on S3.
# The SQL text holds two statements that are sent together in one execute()
# call: a CREATE TABLE for the local lookup table, followed by the SELECT that
# joins it against the S3 files and keeps only the 'premium' segment.
segment_join_sql = """
-- Criar tabela local
CREATE TABLE customer_segments (
    customer_id INTEGER,
    segment VARCHAR
);

-- Join com dados no S3
SELECT
    t.transaction_id,
    t.customer_id,
    t.amount,
    c.segment
FROM 's3://transactions-bucket/2024/*.parquet' t
JOIN customer_segments c ON t.customer_id = c.customer_id
WHERE c.segment = 'premium';
"""
premium_rows = con.execute(segment_join_sql).fetchall()
print(premium_rows)  # Inspect result

# Analytical queries computed directly over the S3 data.

# Transactions, revenue and average order value per region and month.
revenue_by_region_sql = """
-- Análise de vendas por região
SELECT
    region,
    date_trunc('month', date) as month,
    count(*) as transactions,
    sum(amount) as revenue,
    avg(amount) as avg_order_value
FROM 's3://sales-bucket/2024/**/*.parquet'
GROUP BY region, month
ORDER BY region, month;
"""
revenue_by_region = con.execute(revenue_by_region_sql).fetchall()
print(revenue_by_region)  # Inspect result

# Monthly revenue per product (inner query), ranked within each month by
# revenue in descending order using the rank() window function.
product_ranking_sql = """
-- Ranking de produtos por vendas mensais
SELECT
    product_id,
    month,
    revenue,
    rank() OVER (PARTITION BY month ORDER BY revenue DESC) as rank
FROM (
    SELECT
        product_id,
        date_trunc('month', date) as month,
        sum(amount) as revenue
    FROM 's3://sales-bucket/2024/*.parquet'
    GROUP BY product_id, month
)
ORDER BY month, rank;
"""
product_ranking = con.execute(product_ranking_sql).fetchall()
print(product_ranking)  # Inspect result

# Demonstrations of common failure modes when reading from S3.
#
# These queries are written to fail. Left uncaught, the first exception would
# abort the script and none of the later examples would run, so each failure
# is caught (duckdb.Error is the base class of DuckDB's exceptions), reported,
# and execution continues with the next example.

# Object that does not exist.
missing_file_sql = """
-- ❌ Erro se arquivo não existir
SELECT * FROM 's3://my-bucket/nonexistent.parquet';
-- Error: HTTP Error 404: Not Found
"""

# Object the current credentials are not allowed to read.
access_denied_sql = """
-- ❌ Erro se não tiver permissão de leitura
SELECT * FROM 's3://restricted-bucket/data.parquet';
-- Error: Access Denied
"""

# CSV object read with the Parquet reader.
wrong_format_sql = """
-- ❌ Erro se tentar ler CSV como Parquet
SELECT * FROM read_parquet('s3://my-bucket/data.csv');
-- Error: Invalid Parquet file
"""

for failing_sql in (missing_file_sql, access_denied_sql, wrong_format_sql):
    try:
        con.execute(failing_sql)
        print(con.fetchall())  # Inspect result
    except duckdb.Error as exc:
        # Show the error text instead of terminating the script.
        print(f"Expected error: {exc}")

# Practice exercises.
#
# Every step is executed and printed on its own: a multi-statement execute()
# call only returns the result of its last statement, so previously just step
# 3 of each exercise produced visible output.

# --- Exercise 1: a single Parquet file ---

# Step 1: preview the first 10 rows.
con.execute("""
-- 1. Ler um arquivo Parquet do S3
SELECT * FROM 's3://your-bucket/test.parquet' LIMIT 10;
""")
print(con.fetchall())  # Inspect result

# Step 2: total number of rows.
con.execute("""
-- 2. Contar total de registros
SELECT count(*) FROM 's3://your-bucket/test.parquet';
""")
print(con.fetchall())  # Inspect result

# Step 3: file schema.
con.execute("""
-- 3. Ver schema
SELECT * FROM parquet_schema('s3://your-bucket/test.parquet');
""")
print(con.fetchall())  # Inspect result

# --- Exercise 2: many files ---

# Step 1: row count across every Parquet file in the prefix.
con.execute("""
-- 1. Ler todos os arquivos de um diretório
SELECT count(*) FROM 's3://your-bucket/data/*.parquet';
""")
print(con.fetchall())  # Inspect result

# Step 2: row count per source file via the filename column.
con.execute("""
-- 2. Adicionar coluna filename
SELECT filename, count(*) as records
FROM read_parquet('s3://your-bucket/data/*.parquet', filename = true)
GROUP BY filename;
""")
print(con.fetchall())  # Inspect result

# Step 3: only files whose name matches the 2024-01 .. 2024-03 pattern.
con.execute("""
-- 3. Filtrar por padrão específico
SELECT * FROM 's3://your-bucket/data/2024-0[1-3]-*.parquet';
""")
print(con.fetchall())  # Inspect result

# Daily event count and distinct-user count across all 2024 log files,
# restricted to timestamps from 2024-01-01 onwards.
daily_activity_sql = """
-- Análise consolidada de múltiplos arquivos
SELECT
    date_trunc('day', timestamp) as day,
    count(*) as events,
    count(DISTINCT user_id) as unique_users
FROM 's3://logs-bucket/2024/**/*.parquet'
WHERE timestamp >= '2024-01-01'
GROUP BY day
ORDER BY day;
"""
daily_activity = con.execute(daily_activity_sql).fetchall()
print(daily_activity)  # Inspect result


