# Capítulo 08: Padrões Avançados de Globbing

Notebook gerado automaticamente a partir do código-fonte Python.


In [None]:
# -*- coding: utf-8 -*-
"""
capitulo_08_padroes_avancados_globbing
"""

# capitulo_08_padroes_avancados_globbing
import duckdb
import os

# Example/Block 1: open the in-memory DuckDB connection used by the queries
# in this script.
import duckdb

con = duckdb.connect(database=":memory:")

# Single-level `*` wildcard: every .parquet file in a directory, then any
# file name at all.
# NOTE(review): both statements are sent in one execute() call, so the
# fetchall() presumably yields only the last statement's rows — confirm.
query = """
-- Todos os arquivos .parquet em um diretório
SELECT * FROM 's3://my-bucket/data/*.parquet';

-- Qualquer nome de arquivo
SELECT * FROM 's3://my-bucket/data/*';
"""
print(con.execute(query).fetchall())  # show the fetched rows

# Recursive `**` wildcard: .parquet files in every subdirectory of data/.
# The SQL comments sketch the year/month directory tree being matched.
query = """
-- Todos os .parquet em todos os subdiretórios
SELECT * FROM 's3://my-bucket/data/**/*.parquet';

-- Estrutura:
-- s3://my-bucket/data/
--   2024/
--     01/
--       file1.parquet
--     02/
--       file2.parquet
--   2023/
--     12/
--       file3.parquet
"""
print(con.execute(query).fetchall())  # show the fetched rows

# `?` wildcard: exactly one character after "file_", so file_10.parquet
# (two characters) is listed in the SQL comments as a non-match.
query = """
-- Corresponde a qualquer caractere único
SELECT * FROM 's3://my-bucket/data/file_?.parquet';

-- Corresponde:
-- file_1.parquet ✅
-- file_2.parquet ✅
-- file_a.parquet ✅
-- file_10.parquet ❌ (dois caracteres)
"""
print(con.execute(query).fetchall())  # show the fetched rows

# Bracket character classes: a digit range, an explicit set of letters,
# and a letter range.
query = """
-- Dígitos de 0-9
SELECT * FROM 's3://my-bucket/data/file_[0-9].parquet';

-- Letras específicas
SELECT * FROM 's3://my-bucket/data/file_[abc].parquet';

-- Intervalo de letras
SELECT * FROM 's3://my-bucket/data/file_[a-z].parquet';
"""
print(con.execute(query).fetchall())  # show the fetched rows

# Combined pattern mixing `*`, `?` and `[0-9]` in one path; the SQL comments
# enumerate example keys that do and do not match.
query = """
-- Padrão complexo do exemplo da documentação
SELECT count(*)
FROM read_parquet('s3://my-bucket/folder*/100?/t[0-9].parquet');

-- Corresponde:
-- s3://my-bucket/folder1/1001/t5.parquet ✅
-- s3://my-bucket/folder2/1009/t3.parquet ✅
-- s3://my-bucket/folderA/1000/t7.parquet ✅
-- s3://my-bucket/folder/1001/t5.parquet ❌ (folder não tem sufixo)
-- s3://my-bucket/folder1/100/t5.parquet ❌ (100 tem 3 dígitos, não 4)
-- s3://my-bucket/folder1/1001/t15.parquet ❌ (t15 tem dois dígitos)
"""
print(con.execute(query).fetchall())  # show the fetched rows

# Logs stored under year/month/day directories: a whole month, one specific
# day, and the first day of every month of 2024.
query = """
-- Estrutura de diretórios por data
-- s3://logs/2024/01/01/app.parquet
-- s3://logs/2024/01/02/app.parquet

-- Todos os logs de janeiro 2024
SELECT *
FROM 's3://logs-bucket/2024/01/**/*.parquet';

-- Dia específico
SELECT *
FROM 's3://logs-bucket/2024/01/15/*.parquet';

-- Primeiro dia de cada mês em 2024
SELECT *
FROM 's3://logs-bucket/2024/*/01/*.parquet';
"""
print(con.execute(query).fetchall())  # show the fetched rows

# Dates embedded in the file name (data-YYYY-MM-DD.parquet): all of January,
# then days 01-09 unioned with day 10 to cover the first ten days of a month.
query = """
-- Arquivos nomeados: data-2024-01-15.parquet

-- Todos de janeiro
SELECT *
FROM 's3://bucket/data/data-2024-01-*.parquet';

-- Primeiros 10 dias de qualquer mês
SELECT *
FROM 's3://bucket/data/data-2024-*-0[1-9].parquet'
UNION ALL
SELECT *
FROM 's3://bucket/data/data-2024-*-10.parquet';
"""
print(con.execute(query).fetchall())  # show the fetched rows

# Hive-style key=value directories: a specific month, the first day of each
# month, and the first quarter selected with a month=0[1-3] class.
query = """
-- Estrutura Hive:
-- s3://data/year=2024/month=01/day=15/data.parquet

-- Mês específico
SELECT *
FROM 's3://data/year=2024/month=01/**/*.parquet'
WHERE year = 2024 AND month = 1;

-- Primeiro dia de cada mês
SELECT *
FROM 's3://data/year=2024/**/day=01/*.parquet';

-- Q1 2024
SELECT *
FROM 's3://data/year=2024/month=0[1-3]/**/*.parquet';
"""
print(con.execute(query).fetchall())  # show the fetched rows

# One directory per hour (YYYY-MM-DDTHH): a full day via a prefix wildcard
# plus a timestamp filter, then the 10h-12h window via T1[0-2].
query = """
-- Estrutura:
-- s3://logs/2024-01-15T10/app.parquet
-- s3://logs/2024-01-15T11/app.parquet
-- s3://logs/2024-01-15T12/app.parquet

-- Logs de um dia específico
SELECT *
FROM 's3://logs-bucket/2024-01-15*/*.parquet'
WHERE timestamp >= '2024-01-15 00:00:00'
  AND timestamp < '2024-01-16 00:00:00';

-- Logs de horário específico (10h-12h)
SELECT *
FROM 's3://logs-bucket/2024-01-15T1[0-2]/*.parquet';
"""
print(con.execute(query).fetchall())  # show the fetched rows

# Region-partitioned sales data: all US regions via a prefix wildcard, then
# two specific regions.
# Brace alternation (`{a,b}`) is not part of DuckDB's documented glob syntax
# (`*`, `**`, `?`, `[...]`), so the specific regions are spelled out as a
# list of globs, which read_parquet accepts.
con.execute("""
-- Estrutura:
-- s3://sales/region=us-east/2024/01/data.parquet
-- s3://sales/region=us-west/2024/01/data.parquet
-- s3://sales/region=eu-west/2024/01/data.parquet

-- Apenas regiões US
SELECT *
FROM 's3://sales-bucket/region=us-*/**/*.parquet';

-- Específicas regiões
SELECT *
FROM read_parquet([
    's3://sales-bucket/region=us-east/**/*.parquet',
    's3://sales-bucket/region=eu-west/**/*.parquet'
]);
""")
print(con.fetchall())  # show the fetched rows

# Versioned directories: single-character versions via a digit class, then
# an explicit set of versions.
# Brace alternation (`{1,5,10}`) is not part of DuckDB's documented glob
# syntax, so the chosen versions are passed as a list of globs instead.
con.execute("""
-- Estrutura:
-- s3://data/v1/file.parquet
-- s3://data/v2/file.parquet
-- s3://data/v10/file.parquet

-- Apenas versões de um dígito (v1-v9)
SELECT * FROM 's3://data-bucket/v[0-9]/*.parquet';

-- Versões específicas
SELECT * FROM read_parquet([
    's3://data-bucket/v1/*.parquet',
    's3://data-bucket/v5/*.parquet',
    's3://data-bucket/v10/*.parquet'
]);
""")
print(con.fetchall())  # show the fetched rows

# Zero-padded batch numbers: three-digit names starting with 0 (labelled
# "batches 1-99" in the SQL comment), then the 050-059 range.
query = """
-- Estrutura:
-- s3://processed/batch_001.parquet
-- s3://processed/batch_002.parquet
-- ...
-- s3://processed/batch_100.parquet

-- Apenas batches 1-99
SELECT *
FROM 's3://bucket/batch_0[0-9][0-9].parquet';

-- Batches específicos (50-59)
SELECT *
FROM 's3://bucket/batch_05[0-9].parquet';
"""
print(con.execute(query).fetchall())  # show the fetched rows

# `filename = true` is passed to read_parquet and the `filename` column is
# selected next to the data, to show where each row came from.
query = """
-- Ver origem dos dados
SELECT *, filename
FROM read_parquet('s3://my-bucket/**/*.parquet', filename = true);
"""
print(con.execute(query).fetchall())  # show the fetched rows

# Derive metadata from the path with split_part: the second-to-last segment
# as the region and the file stem as the date, with a row count per file.
query = """
-- Filenames: s3://bucket/region-us/2024-01-15.parquet

SELECT
    filename,
    split_part(filename, '/', -2) as region,  -- Extrai região do path
    split_part(split_part(filename, '/', -1), '.', 1) as date,  -- Extrai data do nome
    count(*) as records
FROM read_parquet('s3://my-bucket/**/*.parquet', filename = true)
GROUP BY filename;
"""
print(con.execute(query).fetchall())  # show the fetched rows

# Keep only rows whose source path contains a given region marker, using a
# LIKE filter on the `filename` column.
query = """
-- Apenas arquivos de uma região específica
SELECT *
FROM read_parquet('s3://my-bucket/**/*.parquet', filename = true)
WHERE filename LIKE '%region=us-east%';
"""
print(con.execute(query).fetchall())  # show the fetched rows

# Scope comparison: a bucket-wide recursive glob versus one restricted to a
# year/month prefix (marked in the SQL comments as the more efficient form).
query = """
-- ❌ Menos eficiente: busca em toda a estrutura
SELECT * FROM 's3://huge-bucket/**/*.parquet';

-- ✅ Mais eficiente: limita scope
SELECT * FROM 's3://huge-bucket/2024/01/**/*.parquet';
"""
print(con.execute(query).fetchall())  # show the fetched rows

# Broad glob narrowed by a WHERE clause versus a glob that already names the
# partition directories (the SQL comment notes fewer files get listed).
query = """
-- ❌ Glob amplo + filtro na query
SELECT *
FROM 's3://data/**/*.parquet'
WHERE year = 2024 AND month = 1;

-- ✅ Glob específico (menos arquivos listados)
SELECT *
FROM 's3://data/year=2024/month=01/**/*.parquet';
"""
print(con.execute(query).fetchall())  # show the fetched rows

# Glob over the month=0[1-3] directories with hive_partitioning enabled,
# then filter on the `day` column.
query = """
-- Estrutura Hive otimizada
SELECT *
FROM read_parquet(
    's3://data/year=2024/month=0[1-3]/**/*.parquet',
    hive_partitioning = true
)
WHERE day <= 15;
"""
print(con.execute(query).fetchall())  # show the fetched rows

# The SQL comments outline the steps behind an S3 glob: a ListObjectsV2 call,
# filtering the results by pattern, then reading the matching files.
query = """
-- Este comando faz:
-- 1. ListObjectsV2('s3://bucket/data/', prefix='2024/')
-- 2. Filtra resultados pelo padrão *.parquet
-- 3. Lê apenas arquivos correspondentes
SELECT * FROM 's3://bucket/data/2024/**/*.parquet';
"""
print(con.execute(query).fetchall())  # show the fetched rows

# Same LIMIT 10 on a bucket-wide glob and on one subdirectory; the SQL
# comments mark the first as slow (lists the whole bucket) and the second
# as fast.
query = """
-- ❌ Lista TODO o bucket (lento)
SELECT * FROM 's3://huge-bucket/**/*.parquet' LIMIT 10;

-- ✅ Lista apenas subdiretório (rápido)
SELECT * FROM 's3://huge-bucket/recent/**/*.parquet' LIMIT 10;
"""
print(con.execute(query).fetchall())  # show the fetched rows

# CDC layout (table/date/hour/batch): all changes for a day, one hour, and
# the greatest file name per hour directory as the "latest batch".
query = """
-- Estrutura:
-- s3://cdc/table_name/YYYY-MM-DD/HH/batch_NNN.parquet

-- Todas as mudanças de um dia
SELECT *
FROM 's3://cdc-bucket/users/2024-01-15/**/*.parquet';

-- Mudanças de uma hora específica
SELECT *
FROM 's3://cdc-bucket/users/2024-01-15/14/*.parquet';

-- Últimos batches de cada hora
SELECT
    split_part(filename, '/', -2) as hour,
    max(filename) as latest_batch
FROM read_parquet('s3://cdc-bucket/users/2024-01-15/**/*.parquet', filename = true)
GROUP BY hour;
"""
print(con.execute(query).fetchall())  # show the fetched rows

# Multi-tenant layout: every tenant for one month, a chosen set of tenants,
# and a per-tenant aggregate keyed on the tenant path segment.
# Brace alternation (`{001,002,005}`) is not part of DuckDB's documented glob
# syntax, so the chosen tenants are passed as a list of globs instead.
con.execute("""
-- Estrutura:
-- s3://data/tenant_001/2024/01/data.parquet
-- s3://data/tenant_002/2024/01/data.parquet

-- Todos os tenants, mês específico
SELECT *
FROM 's3://data-bucket/tenant_*/2024/01/**/*.parquet';

-- Tenants específicos
SELECT *
FROM read_parquet([
    's3://data-bucket/tenant_001/2024/**/*.parquet',
    's3://data-bucket/tenant_002/2024/**/*.parquet',
    's3://data-bucket/tenant_005/2024/**/*.parquet'
]);

-- Análise por tenant
SELECT
    split_part(filename, '/', -4) as tenant,
    count(*) as records,
    sum(amount) as total
FROM read_parquet('s3://data-bucket/tenant_*/2024/01/**/*.parquet', filename = true)
GROUP BY tenant;
""")
print(con.fetchall())  # show the fetched rows

# A/B experiment layout: conversion metrics per variant for one experiment,
# then revenue per experiment and variant across experiments.
# Brace alternation (`variant_{A,B}`) is not part of DuckDB's documented glob
# syntax; the two one-letter variants are matched with the equivalent
# character class `variant_[AB]`.
con.execute("""
-- Estrutura:
-- s3://experiments/exp_001/variant_A/2024-01-15.parquet
-- s3://experiments/exp_001/variant_B/2024-01-15.parquet

-- Experimento específico, todas as variantes
SELECT
    split_part(filename, '/', -2) as variant,
    count(*) as impressions,
    sum(converted) as conversions,
    sum(converted)::FLOAT / count(*) as conversion_rate
FROM read_parquet('s3://experiments/exp_001/**/*.parquet', filename = true)
GROUP BY variant;

-- Múltiplos experimentos, comparação
SELECT
    split_part(filename, '/', -3) as experiment,
    split_part(filename, '/', -2) as variant,
    sum(revenue) as total_revenue
FROM read_parquet('s3://experiments/exp_*/variant_[AB]/**/*.parquet', filename = true)
GROUP BY experiment, variant;
""")
print(con.fetchall())  # show the fetched rows

# Per-minute metrics layout (year/month/day/hour/minute): one hour, four
# specific minutes, and an hourly aggregate keyed on the hour path segment.
# Brace alternation (`{00,15,30,45}`) is not part of DuckDB's documented glob
# syntax, so the chosen minutes are passed as a list of globs instead.
con.execute("""
-- Estrutura por minuto:
-- s3://metrics/2024/01/15/10/30/metrics.parquet

-- Hora específica (10:00-10:59)
SELECT *
FROM 's3://metrics-bucket/2024/01/15/10/**/*.parquet';

-- Minutos específicos (10:00, 10:15, 10:30, 10:45)
SELECT *
FROM read_parquet([
    's3://metrics-bucket/2024/01/15/10/00/*.parquet',
    's3://metrics-bucket/2024/01/15/10/15/*.parquet',
    's3://metrics-bucket/2024/01/15/10/30/*.parquet',
    's3://metrics-bucket/2024/01/15/10/45/*.parquet'
]);

-- Agregação por hora
SELECT
    split_part(filename, '/', -3) as hour,
    avg(cpu_usage) as avg_cpu,
    max(memory_usage) as max_memory
FROM read_parquet('s3://metrics-bucket/2024/01/15/**/*.parquet', filename = true)
GROUP BY hour
ORDER BY hour;
""")
print(con.fetchall())  # show the fetched rows

# Combine three monthly globs into one result set with UNION ALL.
query = """
-- Combinar dados de múltiplas fontes
SELECT * FROM 's3://bucket/2024/01/**/*.parquet'
UNION ALL
SELECT * FROM 's3://bucket/2024/02/**/*.parquet'
UNION ALL
SELECT * FROM 's3://bucket/2024/03/**/*.parquet';
"""
print(con.execute(query).fetchall())  # show the fetched rows

# Pass several glob patterns to read_parquet as a list in a single call.
query = """
-- Não suportado diretamente, mas pode usar:
SELECT *
FROM read_parquet([
    's3://bucket/2024/01/**/*.parquet',
    's3://bucket/2024/02/**/*.parquet',
    's3://bucket/2024/03/**/*.parquet'
]);
"""
print(con.execute(query).fetchall())  # show the fetched rows

# Debugging aid: list up to 20 distinct file paths matched by a pattern.
query = """
-- Ver quais arquivos correspondem ao padrão
SELECT DISTINCT filename
FROM read_parquet('s3://bucket/complex/pattern/**/*.parquet', filename = true)
LIMIT 20;
"""
print(con.execute(query).fetchall())  # show the fetched rows

# Distribution of files per parent directory (second-to-last path segment).
# count(*) would count data rows, not files, so `file_count` is computed
# with count(DISTINCT filename), the same way the other file-counting
# queries in this script do.
con.execute("""
-- Ver distribuição de arquivos
SELECT
    split_part(filename, '/', -2) as directory,
    count(DISTINCT filename) as file_count
FROM read_parquet('s3://bucket/**/*.parquet', filename = true)
GROUP BY directory
ORDER BY file_count DESC;
""")
print(con.fetchall())  # show the fetched rows

# Sanity-check a pattern: number of distinct files matched plus the smallest
# and largest file path.
query = """
-- Verificar se padrão retorna arquivos esperados
SELECT
    count(DISTINCT filename) as total_files,
    min(filename) as first_file,
    max(filename) as last_file
FROM read_parquet('s3://bucket/pattern/**/*.parquet', filename = true);
"""
print(con.execute(query).fetchall())  # show the fetched rows

# Exercise: write three 100-row test files (file_1, file_2, file_a) and count
# rows through the `*`, `[0-9]` and `?` patterns.
query = """
-- 1. Criar estrutura de teste
COPY (SELECT range as id FROM range(100))
  TO 's3://your-bucket/test/file_1.parquet';
COPY (SELECT range as id FROM range(100))
  TO 's3://your-bucket/test/file_2.parquet';
COPY (SELECT range as id FROM range(100))
  TO 's3://your-bucket/test/file_a.parquet';

-- 2. Testar padrões
SELECT count(*) FROM 's3://your-bucket/test/file_*.parquet';
SELECT count(*) FROM 's3://your-bucket/test/file_[0-9].parquet';
SELECT count(*) FROM 's3://your-bucket/test/file_?.parquet';
"""
print(con.execute(query).fetchall())  # show the fetched rows

# Exercise: write three 100-row files into year/month/day directories and
# count rows through recursive and single-level wildcard patterns.
query = """
-- 1. Criar estrutura de diretórios
COPY (SELECT range as id, '2024-01-15' as date FROM range(100))
  TO 's3://your-bucket/data/2024/01/15/data.parquet';
COPY (SELECT range as id, '2024-01-16' as date FROM range(100))
  TO 's3://your-bucket/data/2024/01/16/data.parquet';
COPY (SELECT range as id, '2024-02-01' as date FROM range(100))
  TO 's3://your-bucket/data/2024/02/01/data.parquet';

-- 2. Testar padrões recursivos
SELECT count(*) FROM 's3://your-bucket/data/**/*.parquet';
SELECT count(*) FROM 's3://your-bucket/data/2024/01/**/*.parquet';
SELECT count(*) FROM 's3://your-bucket/data/2024/*/15/*.parquet';
"""
print(con.execute(query).fetchall())  # show the fetched rows

# Record and file counts per year/month, derived from the file path.
# The files written earlier in this script live at
# s3://your-bucket/data/YYYY/MM/DD/data.parquet, so counting path segments
# from the end gives: -1 = file name, -2 = day, -3 = month, -4 = year.
# The year and month are therefore taken from segments -4 and -3.
con.execute("""
-- Análise de distribuição de arquivos
SELECT
    split_part(filename, '/', -4) as year,
    split_part(filename, '/', -3) as month,
    count(*) as total_records,
    count(DISTINCT filename) as file_count
FROM read_parquet('s3://your-bucket/data/**/*.parquet', filename = true)
GROUP BY year, month
ORDER BY year, month;
""")
print(con.fetchall())  # show the fetched rows


