# Capítulo 03: Arrow Tables e Datasets

Notebook gerado automaticamente a partir do código fonte Python.


In [None]:
# Instala√ß√£o de pacotes necess√°rios
!pip install pyarrow duckdb pandas numpy

## 📚 Introdução

Este notebook aborda os conceitos de Arrow Tables e Datasets:
- Criação de Tables
- Schemas e tipos
- Datasets particionados
- Leitura de Parquet
- Filtros e projections

In [None]:
# -*- coding: utf-8 -*-
"""
Capítulo 03: Arrow Tables e Datasets
Curso: Apache Arrow + DuckDB
Nota: UTF-8 é configurado automaticamente em notebooks Jupyter
"""

import pyarrow as pa
import duckdb
import pandas as pd
import numpy as np

# Print the chapter banner between two 60-character rules.
print("=" * 60)
print("CAPÍTULO 03: ARROW TABLES E DATASETS")
print("=" * 60)

## 🔧 Preparação dos Dados

Criação de dados de exemplo e conexão com DuckDB

In [None]:
# Global sample data: 1000 rows with an integer id, a random float value and a
# random category drawn from A/B/C.
try:
    print("\nGerando dados de exemplo...")
    data = pa.table({
        'id': range(1000),
        'valor': np.random.randn(1000),
        'categoria': np.random.choice(['A', 'B', 'C'], 1000)
    })
except Exception as e:
    # Best effort: report the problem and keep the notebook running, but leave
    # `data` bound so a later reference does not fail with a NameError.
    data = None
    print(f"Erro ao criar dados: {e}")
else:
    print(f"Tabela PyArrow criada: {data.num_rows} linhas")

# DuckDB connection shared by every query in the notebook
# (no database path is given to connect()).
con = duckdb.connect()

## 📋 Tópico 1: Criação de Tables

Exemplos práticos de como criar tabelas Arrow

In [None]:
print(f"\n--- {'Criação de Tables'.upper()} ---")

# 3.1.1 Anatomy of an Arrow Table
print("\n1. Anatomia de uma Arrow Table:")
print("-" * 40)

# Build an Arrow table from a mapping of column name -> column values
table = pa.table({
    'id': [1, 2, 3, 4, 5],
    'name': ['Alice', 'Bob', 'Carol', 'David', 'Eve'],
    'age': [30, 25, 35, 28, 32],
    'salary': [75000.00, 65000.00, 85000.00, 70000.00, 80000.00]
})

# Show the table followed by its basic shape and schema information
print("Arrow Table:")
print(table)
print(f"\nNúmero de linhas: {table.num_rows}")
print(f"Número de colunas: {table.num_columns}")
print(f"Nome das colunas: {table.column_names}")
print(f"Schema: {table.schema}")

# Size in memory, as reported by the table's `nbytes`
print(f"\nTamanho total: {table.nbytes:,} bytes")

# 3.1.2 Accessing Table data
print("\n2. Acessar Dados da Table:")
print("-" * 40)

table_sales = pa.table(
    {
        'product': ['A', 'B', 'C', 'D', 'E'],
        'sales': [100, 200, 150, 300, 250],
        'region': ['North', 'South', 'North', 'East', 'South'],
    }
)

# Column lookup by name (the printed type is expected to be
# pyarrow.lib.ChunkedArray)
sales_column = table_sales.column('sales')
print(f"Coluna 'sales': {sales_column}")
print(f"Tipo: {type(sales_column)}")

# Column lookup by position
first_column = table_sales.column(0)
print(f"\nPrimeira coluna: {first_column}")

# Materialize a column as a plain Python list
sales_list = table_sales.column('sales').to_pylist()
print(f"\nSales como lista: {sales_list}")

# Slice the table: three rows starting at offset 1 (rows 1-3)
subset = table_sales.slice(offset=1, length=3)
print("\nSubset (linhas 1-3):")
print(subset)

# Filter with DuckDB; the SQL refers to the Python variable `table_sales` by name
filtered = con.execute("""
    SELECT * FROM table_sales
    WHERE sales > 150
    ORDER BY sales DESC
""").arrow()

print("\nFiltrado (sales > 150):")
print(filtered)

# 3.1.3 Column operations
print("\n3. Operações com Colunas:")
print("-" * 40)

import pyarrow.compute as pc

table_ops = pa.table({
    'id': [1, 2, 3, 4, 5],
    'value': [10, 20, 30, 40, 50]
})

# Add a column: compute value * 2 and append it. `append_column` returns a new
# table, so `table_ops` itself keeps its two original columns.
doubled = pc.multiply(table_ops['value'], 2)
table_with_doubled = table_ops.append_column('value_doubled', doubled)

print("Table com nova coluna:")
print(table_with_doubled)

# Remove the 'id' column. The position is resolved from the column name so the
# code keeps matching the message below if the column order ever changes.
table_without_id = table_ops.remove_column(table_ops.schema.get_field_index('id'))
print("\nTable sem coluna 'id':")
print(table_without_id)

# Rename columns (new names are given positionally)
renamed = table_ops.rename_columns(['product_id', 'quantity'])
print("\nTable com colunas renomeadas:")
print(renamed)

# Select a subset of columns
selected = table_ops.select(['value'])
print("\nApenas coluna 'value':")
print(selected)

# 3.1.4 Concatenating Tables
print("\n4. Concatenar Tables:")
print("-" * 40)

# Three small tables that share the same columns (id, name)
table1 = pa.table({'id': [1, 2, 3], 'name': ['Alice', 'Bob', 'Carol']})
table2 = pa.table({'id': [4, 5, 6], 'name': ['David', 'Eve', 'Frank']})
table3 = pa.table({'id': [7, 8, 9], 'name': ['Grace', 'Henry', 'Ivy']})

# Vertical concatenation: the rows of the tables are stacked in order
combined = pa.concat_tables([table1, table2, table3])
print("Tables concatenadas:")
print(combined)

# Cross-check the row count through DuckDB
result = con.execute("SELECT count(*) FROM combined").fetchone()
print(f"\nTotal de linhas: {result[0]}")

## 🔤 Tópico 2: Schemas e tipos

Trabalhando com schemas e diferentes tipos de dados

In [None]:
print(f"\n--- {'Schemas e tipos'.upper()} ---")

# 3.3.1 Nested types (struct)
print("\n1. Tipos Aninhados (Struct):")
print("-" * 40)

# Schema with a struct column (nested type) and a list-of-strings column
schema = pa.schema([
    ('id', pa.int64()),
    ('name', pa.string()),
    ('address', pa.struct([
        ('street', pa.string()),
        ('city', pa.string()),
        ('zip', pa.string())
    ])),
    ('phone_numbers', pa.list_(pa.string()))
])

# Column data, listed in the same order as the schema fields
data_arrays = [
    pa.array([1, 2, 3], type=pa.int64()),
    pa.array(['Alice', 'Bob', 'Carol'], type=pa.string()),
    # The struct column is assembled from one child array per struct field
    pa.StructArray.from_arrays(
        [
            pa.array(['123 Main St', '456 Oak Ave', '789 Pine Rd']),
            pa.array(['New York', 'Los Angeles', 'Chicago']),
            pa.array(['10001', '90001', '60601'])
        ],
        names=['street', 'city', 'zip']
    ),
    # One list of phone numbers per row; the lists have different lengths
    pa.array([
        ['555-1234', '555-5678'],
        ['555-9012'],
        ['555-3456', '555-7890', '555-1111']
    ])
]

table_nested = pa.Table.from_arrays(data_arrays, schema=schema)

print("Table com tipos aninhados:")
print(table_nested)
print(f"\nSchema:\n{table_nested.schema}")

# DuckDB query that reads struct fields and the list length
result_nested = con.execute("""
    SELECT
        name,
        address.city as city,
        address.zip as zip,
        len(phone_numbers) as phone_count
    FROM table_nested
""").arrow()

print("\nResultado (campos extraídos):")
print(result_nested)

# 3.3.2 List types
print("\n2. Tipos de Lista:")
print("-" * 40)

# Table whose last two columns hold one list per customer
table_lists = pa.table(
    {
        'customer_id': [1, 2, 3],
        'customer_name': ['Alice', 'Bob', 'Carol'],
        'order_ids': [[101, 102, 103], [201], [301, 302]],
        'order_amounts': [[50.00, 75.50, 100.00], [200.00], [30.00, 45.00]],
    }
)

print("Table com listas:")
print(table_lists)

# DuckDB query that explodes both list columns with unnest()
result_unnest = con.execute("""
    SELECT
        customer_id,
        customer_name,
        unnest(order_ids) as order_id,
        unnest(order_amounts) as amount
    FROM table_lists
""").arrow()

print("\nDados expandidos:")
print(result_unnest)

# Per-customer aggregation computed directly on the list columns
summary = con.execute("""
    SELECT
        customer_name,
        len(order_ids) as order_count,
        list_sum(order_amounts) as total_spent
    FROM table_lists
    ORDER BY total_spent DESC
""").df()

print("\nResumo por cliente:")
print(summary)

# 3.3.3 Primitive types
print("\n3. Tipos Primitivos Diversos:")
print("-" * 40)

# Only `datetime` and `date` are needed here. `datetime.time` is deliberately
# not imported: it is unused and would shadow the `time` module that the
# benchmark cells of this notebook rely on.
from datetime import datetime, date
from decimal import Decimal

# Table with one column per Arrow type being demonstrated; every array is
# created with an explicit `type=` instead of relying on type inference.
table_types = pa.table({
    'int8_col': pa.array([1, 2, 3], type=pa.int8()),
    'int16_col': pa.array([100, 200, 300], type=pa.int16()),
    'int32_col': pa.array([10000, 20000, 30000], type=pa.int32()),
    'int64_col': pa.array([1000000, 2000000, 3000000], type=pa.int64()),
    'float32_col': pa.array([1.1, 2.2, 3.3], type=pa.float32()),
    'float64_col': pa.array([1.111, 2.222, 3.333], type=pa.float64()),
    'bool_col': pa.array([True, False, True], type=pa.bool_()),
    'string_col': pa.array(['a', 'b', 'c'], type=pa.string()),
    'binary_col': pa.array([b'x', b'y', b'z'], type=pa.binary()),
    'date_col': pa.array([date(2024, 1, 1), date(2024, 1, 2), date(2024, 1, 3)], type=pa.date32()),
    'timestamp_col': pa.array([
        datetime(2024, 1, 1, 10, 30),
        datetime(2024, 1, 2, 11, 30),
        datetime(2024, 1, 3, 12, 30)
    ], type=pa.timestamp('s')),
    'decimal_col': pa.array([
        Decimal('123.45'),
        Decimal('678.90'),
        Decimal('111.22')
    ], type=pa.decimal128(10, 2))
})

print("Table com tipos diversos:")
print(table_types)

# List every field of the schema with its Arrow type
print("\nSchema detalhado:")
for field in table_types.schema:
    print(f"  {field.name}: {field.type}")

# Query a subset of the typed columns through DuckDB
result_types = con.execute("""
    SELECT
        int32_col,
        float64_col,
        bool_col,
        string_col,
        date_col,
        decimal_col
    FROM table_types
""").arrow()

print("\nResultado da query:")
print(result_types)

# 3.3.4 Nullable vs non-nullable fields
print("\n4. Tipos Nullable vs Non-Nullable:")
print("-" * 40)

# Schema mixing required (nullable=False) and optional (nullable=True) fields
schema_nullable = pa.schema([
    pa.field('id', pa.int32(), nullable=False),
    pa.field('name', pa.string(), nullable=False),
    pa.field('email', pa.string(), nullable=True),
    pa.field('age', pa.int32(), nullable=True)
])

# Table with null values; nulls only appear in the two nullable columns
table_nullable = pa.table({
    'id': [1, 2, 3],
    'name': ['Alice', 'Bob', 'Carol'],
    'email': ['alice@example.com', None, 'carol@example.com'],
    'age': [30, None, 35]
}, schema=schema_nullable)

print("Schema com nullable:")
for field in schema_nullable:
    print(f"  {field.name}: {field.type} (nullable={field.nullable})")

print("\nTable com valores nulos:")
print(table_nullable)

# Keep only the rows where both optional columns are populated
result_nulls = con.execute("""
    SELECT *
    FROM table_nullable
    WHERE email IS NOT NULL AND age IS NOT NULL
""").arrow()

print("\nApós filtrar nulos:")
print(result_nulls)

## 📂 Tópico 3: Datasets particionados

Trabalhando com datasets particionados para melhor performance

In [None]:
print(f"\n--- {'Datasets particionados'.upper()} ---")

import pyarrow.dataset as ds
import pyarrow.parquet as pq
import os
import shutil

# 3.2.1 Simple (multi-file) dataset
print("\n1. Dataset Simples (Multi-arquivo Parquet):")
print("-" * 40)

# Start from an empty data directory so files from earlier runs are not picked up
data_dir = 'data_sales'
if os.path.exists(data_dir):
    shutil.rmtree(data_dir)
os.makedirs(data_dir, exist_ok=True)

# Write one Parquet file per month (simulated monthly data, 10 days each)
for month in range(1, 4):
    # Named `table_month` so the loop does not overwrite the global `table`
    # created in the "Anatomia de uma Arrow Table" section.
    table_month = pa.table({
        'date': pa.array([f'2024-{month:02d}-{day:02d}' for day in range(1, 11)]),
        'sales': pa.array([100 * month + day for day in range(1, 11)], type=pa.int32()),
        'region': pa.array(['North' if day % 2 == 0 else 'South' for day in range(1, 11)])
    })

    pq.write_table(table_month, f'{data_dir}/month_{month}.parquet')

# os.listdir() returns entries in arbitrary order; sort for a stable listing
print("Arquivos criados:")
for f in sorted(os.listdir(data_dir)):
    print(f"  - {f}")

# Create a Dataset that points at the whole directory
dataset = ds.dataset(data_dir, format='parquet')

print("\nDataset criado")
print(f"Schema: {dataset.schema}")
print(f"Arquivos: {len(list(dataset.get_fragments()))}")

# Aggregate over all files through DuckDB (the SQL refers to `dataset` by name)
result = con.execute("""
    SELECT
        region,
        count(*) as count,
        sum(sales) as total_sales
    FROM dataset
    GROUP BY region
""").df()

print("\nResultado da query:")
print(result)

# 3.2.2 Partitioned dataset (Hive partitioning)
print("\n2. Dataset Particionado (Hive Partitioning):")
print("-" * 40)

# Start from a clean partition root
partition_dir = 'data_partitioned'
if os.path.exists(partition_dir):
    shutil.rmtree(partition_dir)

# Data is partitioned by region and year
regions = ['North', 'South', 'East']
years = [2023, 2024]

for region in regions:
    for year in years:
        # Explicit types are used to avoid type inconsistencies
        table_part = pa.table({
            'date': pa.array([f'{year}-01-{day:02d}' for day in range(1, 21)]),
            'product': pa.array([f'Product_{i%3}' for i in range(20)]),
            'sales': pa.array([100 + i * 10 for i in range(20)], type=pa.int32()),
            'region': pa.array([region] * 20),
            'year': pa.array([year] * 20, type=pa.int32())
        })

        # Hive-style directory layout: region=X/year=Y
        partition_path = f'{partition_dir}/region={region}/year={year}'
        os.makedirs(partition_path, exist_ok=True)

        pq.write_table(table_part, f'{partition_path}/data.parquet')

# Print the directory tree, indenting two spaces per nesting level
print("Dataset particionado criado com estrutura Hive:")
for root, dirs, files in os.walk(partition_dir):
    # Sorting `dirs` in place makes os.walk visit sub-directories in a stable order
    dirs.sort()
    level = root.replace(partition_dir, '').count(os.sep)
    indent = ' ' * 2 * level
    print(f'{indent}{os.path.basename(root)}/')
    subindent = ' ' * 2 * (level + 1)
    for file in sorted(files):
        print(f'{subindent}{file}')

# Open the dataset, letting pyarrow discover the Hive partitions
dataset_hive = ds.dataset(partition_dir, partitioning='hive')

print("\nSchema (com colunas de partição):")
print(dataset_hive.schema)

# Query filtered on the partition columns
result_partition = con.execute("""
    SELECT
        product,
        sum(sales) as total_sales
    FROM dataset_hive
    WHERE region = 'North' AND year = 2024
    GROUP BY product
    ORDER BY total_sales DESC
""").arrow()

print("\nVendas em North (2024):")
print(result_partition)

# 3.2.3 Scanning the dataset with filters and projections
print("\n3. Scanear Dataset com Filtros e Projeções:")
print("-" * 40)

# Scanner that projects three columns and keeps only the rows with year == 2024
scanner = dataset_hive.scanner(
    columns=['product', 'sales', 'region'],
    filter=ds.field('year') == 2024
)

print("Scanner criado com filtro year=2024")
print(f"Colunas projetadas: {scanner.projected_schema.names}")

# Materialize the scan as an Arrow table
filtered_table = scanner.to_table()
print(f"Linhas filtradas: {filtered_table.num_rows}")

# Count the filtered rows per region through DuckDB
result_scan = con.execute("""
    SELECT region, count(*) as count
    FROM filtered_table
    GROUP BY region
""").df()

print("\nContagem por região (2024):")
print(result_scan)

# 3.2.4 Performance comparison
print("\n4. Comparação: Arquivo vs Dataset Particionado:")
print("-" * 40)

import time

# Elapsed times are measured with time.perf_counter(), the monotonic
# high-resolution clock intended for timing short operations.

# Test 1: read a single file and aggregate it
print("\nTeste 1: Arquivo único (month_1.parquet)")
start = time.perf_counter()
single_file = pq.read_table(f'{data_dir}/month_1.parquet')
query_single = con.execute("""
    SELECT sum(sales) FROM single_file
""").fetchone()
time_single = time.perf_counter() - start
print(f"  Tempo: {time_single:.4f}s")

# Test 2: aggregate the multi-file dataset (no partitioning)
print("\nTeste 2: Dataset multi-arquivo")
start = time.perf_counter()
result_multi = con.execute("""
    SELECT sum(sales) FROM dataset
""").fetchone()
time_multi = time.perf_counter() - start
print(f"  Tempo: {time_multi:.4f}s")

# Test 3: aggregate the partitioned dataset with a filter on both partition
# columns; the label lists the same conditions as the query.
print("\nTeste 3: Dataset particionado com filtro (region='North', year=2024)")
start = time.perf_counter()
result_filtered = con.execute("""
    SELECT sum(sales) FROM dataset_hive
    WHERE region = 'North' AND year = 2024
""").fetchone()
time_filtered = time.perf_counter() - start
print(f"  Tempo: {time_filtered:.4f}s")

print("\nOs datasets particionados permitem pruning de arquivos,")
print("evitando ler dados desnecessários!")

# Cleanup: remove both example directories created by this topic
shutil.rmtree(data_dir, ignore_errors=True)
shutil.rmtree(partition_dir, ignore_errors=True)

## 📄 Tópico 4: Leitura de Parquet

Carregando e manipulando arquivos Parquet

In [None]:
print("\n--- " + 'Leitura de Parquet'.upper() + " ---")

import pyarrow.parquet as pq
import os

# 3.4.1 Prepare a sample Parquet file
parquet_dir = 'parquet_examples'
os.makedirs(parquet_dir, exist_ok=True)
parquet_file = os.path.join(parquet_dir, 'sample_data.parquet')

# 100 rows: sequential id, derived user name, random score in [0, 100) and an
# `active` flag that is True for even ids
table_to_save = pa.table(
    {
        'id': range(100),
        'name': [f'User_{i}' for i in range(100)],
        'score': np.random.randint(0, 100, 100),
        'active': [i % 2 == 0 for i in range(100)],
    }
)

pq.write_table(table_to_save, parquet_file)
print(f"Arquivo '{parquet_file}' criado.")

# 3.4.2 Basic read: load the whole file
print("\n1. Leitura Completa:")
full_table = pq.read_table(parquet_file)
print(f"Linhas lidas: {full_table.num_rows}")

# 3.4.3 Selective read (projection): load only the requested columns
print("\n2. Leitura Seletiva (Apenas 'id' e 'name'):")
partial_table = pq.read_table(parquet_file, columns=['id', 'name'])

print("Table lida (colunas selecionadas):")
print(partial_table.slice(0, 5))
print(f"\nNúmero de colunas: {partial_table.num_columns}")
print(f"Tamanho em memória: {partial_table.nbytes:,} bytes")

# 3.4.4 Inspect the file metadata without reading the data
print("\n3. Inspeção de Metadados do Arquivo:")
metadata = pq.read_metadata(parquet_file)
print(f"Número de row groups: {metadata.num_row_groups}")
print(f"Esquema no arquivo: {metadata.schema.to_arrow_schema()}")

# Cleanup: remove the example directory, as the other topics do for theirs.
# The tables above were already read, so they do not depend on the file.
import shutil
shutil.rmtree(parquet_dir, ignore_errors=True)

## 🔍 Tópico 5: Filtros e projections

Otimizando consultas com filtros e projeções

In [None]:
print(f"\n--- {'Filtros e projections'.upper()} ---")

import pyarrow.compute as pc
import pyarrow.parquet as pq  # used below by write_table; imported here so the cell is self-contained
import time
import os
import shutil

# Prepare a clean working directory for the optimization examples
optimize_dir = 'optimize_data'
if os.path.exists(optimize_dir):
    shutil.rmtree(optimize_dir)
os.makedirs(optimize_dir, exist_ok=True)

# Build a 10,000-row, 10-column table for the demonstration; the
# np.random columns differ on every run.
large_table = pa.table({
    'id': list(range(1, 10001)),
    'customer_name': [f'Customer_{i}' for i in range(1, 10001)],
    'email': [f'customer{i}@example.com' for i in range(1, 10001)],
    'country': np.random.choice(['USA', 'UK', 'Canada', 'Germany', 'France'], 10000),
    'age': np.random.randint(18, 85, 10000),
    'salary': np.random.uniform(30000, 200000, 10000),
    'department': np.random.choice(['Sales', 'Engineering', 'HR', 'Marketing', 'Finance'], 10000),
    # Dates are stored as 'YYYY-MM-DD' strings; month and day cycle with the row index
    'hire_date': pa.array([f'2020-{(i%12)+1:02d}-{(i%28)+1:02d}' for i in range(10000)]),
    'is_active': np.random.choice([True, False], 10000),
    'last_login': pa.array([f'2024-{(i%12)+1:02d}-{(i%28)+1:02d}' for i in range(10000)])
})

# Write the table to Parquet; the following sections read from this file
parquet_optimize = f'{optimize_dir}/large_dataset.parquet'
pq.write_table(large_table, parquet_optimize)

print(f"Dataset criado: {large_table.num_rows} linhas, {large_table.num_columns} colunas")
print(f"Tamanho em memória: {large_table.nbytes:,} bytes")

# 3.5.1 Push-down filters (filter while reading)
print("\n1. Push-down Filters (Filtros na leitura):")
print("-" * 40)

# Timings use time.perf_counter(): unlike time.time(), it is a monotonic,
# high-resolution clock, so a very fast read does not produce a zero elapsed
# time (and a ZeroDivisionError in the percentage below).

# Option A: no filter at read time, load everything and filter in memory
print("Opção A: Ler tudo e depois filtrar em memória")
start = time.perf_counter()
full_table = pq.read_table(parquet_optimize)
filtered_memory = full_table.filter(pc.field('salary') > 100000)
time_filter_memory = time.perf_counter() - start
print(f"  Tempo: {time_filter_memory:.4f}s")
print(f"  Resultados: {filtered_memory.num_rows} linhas")

# Option B: hand the predicate to read_table so it is applied during the read
print("\nOpção B: Usar push-down filter (ler com filtro)")
start = time.perf_counter()
filters = [('salary', '>', 100000)]
filtered_pushdown = pq.read_table(parquet_optimize, filters=filters)
time_filter_pushdown = time.perf_counter() - start
print(f"  Tempo: {time_filter_pushdown:.4f}s")
print(f"  Resultados: {filtered_pushdown.num_rows} linhas")

# Relative gain of option B over option A (negative when B was slower)
print(f"\nMelhoria: {((time_filter_memory - time_filter_pushdown) / time_filter_memory * 100):.1f}% mais rápido com push-down")

# Option C: complex filter in disjunctive normal form; the tuples inside
# each inner list are AND-ed and the inner lists are OR-ed together.
print("\nOpção C: Filtros complexos (múltiplas condições)")
complex_filters = [
    [('department', '==', 'Engineering'), ('salary', '>', 80000)],
    [('department', '==', 'Finance'), ('age', '>', 40)]
]
filtered_complex = pq.read_table(parquet_optimize, filters=complex_filters)
print(f"  Resultados: {filtered_complex.num_rows} linhas")
print(f"  Colunas: {filtered_complex.column_names}")

# 3.5.2 Column projection (read only the useful columns)
print("\n2. Column Projection (Projeção de colunas):")
print("-" * 40)

# Timings use time.perf_counter() (monotonic, high resolution) so the
# baseline used in the percentage below is never measured as zero.

# Option A: no projection, read every column
print("Opção A: Ler todas as colunas (10 colunas)")
start = time.perf_counter()
all_cols = pq.read_table(parquet_optimize)
time_all_cols = time.perf_counter() - start
memory_all_cols = all_cols.nbytes
print(f"  Tempo: {time_all_cols:.4f}s")
print(f"  Memória: {memory_all_cols:,} bytes")

# Option B: with projection, read only the columns that are needed
print("\nOpção B: Projetar apenas 3 colunas (id, salary, department)")
needed_cols = ['id', 'salary', 'department']
start = time.perf_counter()
projected = pq.read_table(parquet_optimize, columns=needed_cols)
time_projected = time.perf_counter() - start
memory_projected = projected.nbytes
print(f"  Tempo: {time_projected:.4f}s")
print(f"  Memória: {memory_projected:,} bytes")

# Share of memory (and time) saved by the projection relative to option A
economy = (1 - memory_projected / memory_all_cols) * 100
print(f"\nEconomia de memória: {economy:.1f}%")
print(f"Redução de tempo: {((time_all_cols - time_projected) / time_all_cols * 100):.1f}%")

# 3.5.3 Filters + projections (combined optimization)
print("\n3. Filtros + Projections (Otimização combinada):")
print("-" * 40)

print("Cenário: Filtrar por país e selecionar apenas dados relevantes")

# Timings use time.perf_counter() (monotonic, high resolution) so the
# baseline used in the percentage below is never measured as zero.

# Approach 1 (no optimization): read everything, then filter, then project
print("\nAbordagem 1: Ler tudo, depois filtrar e projetar")
start = time.perf_counter()
step1 = pq.read_table(parquet_optimize)
step2 = step1.filter(pc.field('country') == 'USA')
step3 = step2.select(['id', 'customer_name', 'salary', 'department'])
time_naive = time.perf_counter() - start
print(f"  Tempo: {time_naive:.4f}s")
print(f"  Resultado: {step3.num_rows} linhas, {step3.num_columns} colunas")

# Approach 2 (optimized): projection and filter are both given to read_table.
# Note that 'country' is part of the projection here, so this result has one
# more column than approach 1.
print("\nAbordagem 2: Projetar + Filtrar na leitura (otimizado)")
start = time.perf_counter()
filters_opt = [('country', '==', 'USA')]
optimized = pq.read_table(
    parquet_optimize,
    columns=['id', 'customer_name', 'salary', 'department', 'country'],
    filters=filters_opt
)
time_optimized = time.perf_counter() - start
print(f"  Tempo: {time_optimized:.4f}s")
print(f"  Resultado: {optimized.num_rows} linhas, {optimized.num_columns} colunas")

# Relative gain of approach 2 over approach 1 (negative when it was slower)
improvement = ((time_naive - time_optimized) / time_naive * 100)
print(f"\nMelhoria de performance: {improvement:.1f}% mais rápido")

# 3.5.4 Full benchmark with the Dataset scanner
print("\n4. Benchmark: Diferentes estratégias de acesso")
print("-" * 40)

# Imported here so this cell does not depend on an earlier cell having run
import pyarrow.dataset as ds

dataset = ds.dataset(parquet_optimize, format='parquet')

# Each strategy is timed with time.perf_counter() (monotonic, high resolution)
# from scanner creation until the result table is materialized.

# Strategy 1: default read, no optimization
print("\nEstratégia 1: Scanner básico (sem otimizações)")
start = time.perf_counter()
scanner_basic = dataset.scanner()
result_basic = scanner_basic.to_table()
time_basic = time.perf_counter() - start
print(f"  Tempo: {time_basic:.4f}s")
print(f"  Linhas: {result_basic.num_rows}")

# Strategy 2: projection only
print("\nEstratégia 2: Scanner com projeção de 3 colunas")
start = time.perf_counter()
scanner_proj = dataset.scanner(columns=['id', 'salary', 'department'])
result_proj = scanner_proj.to_table()
time_proj = time.perf_counter() - start
print(f"  Tempo: {time_proj:.4f}s")
print(f"  Linhas: {result_proj.num_rows}")

# Strategy 3: filter only
print("\nEstratégia 3: Scanner com filtro (salary > 120000)")
start = time.perf_counter()
scanner_filter = dataset.scanner(
    filter=ds.field('salary') > 120000
)
result_filter = scanner_filter.to_table()
time_filter = time.perf_counter() - start
print(f"  Tempo: {time_filter:.4f}s")
print(f"  Linhas: {result_filter.num_rows}")

# Strategy 4: filter + projection (the filter also restricts country to 'USA')
print("\nEstratégia 4: Scanner otimizado (filtro + projeção)")
start = time.perf_counter()
scanner_optimized = dataset.scanner(
    columns=['id', 'customer_name', 'salary', 'department'],
    filter=(ds.field('salary') > 120000) & (ds.field('country') == 'USA')
)
result_optimized = scanner_optimized.to_table()
time_optimized_scan = time.perf_counter() - start
print(f"  Tempo: {time_optimized_scan:.4f}s")
print(f"  Linhas: {result_optimized.num_rows}")

# Performance summary: rank the four strategies from fastest to slowest
print("\n📊 Resumo de Performance:")
print("-" * 40)
results = [
    ("Scanner básico", time_basic),
    ("Scanner com projeção", time_proj),
    ("Scanner com filtro", time_filter),
    ("Scanner otimizado", time_optimized_scan)
]
results.sort(key=lambda x: x[1])

# After the ascending sort the last entry is the slowest strategy; it is the
# baseline ("referência") and every other strategy is reported as the
# percentage of time it saves relative to that baseline.
slowest = results[-1][1]

for i, (strategy, elapsed) in enumerate(results, 1):
    # Guard against a zero baseline so the summary never divides by zero
    pct = (1 - elapsed / slowest) * 100 if slowest > 0 else 0
    faster = f"({pct:.1f}% mais rápido)" if pct > 0 else "(referência)"
    print(f"  {i}. {strategy:.<30} {elapsed:.4f}s {faster}")

# Analysis with DuckDB, reading the Parquet file directly with read_parquet().
# The interpolated path is the constant built earlier in this notebook, not
# user input.
print("\nAnálise com DuckDB (push-down automático):")
result_duckdb = con.execute(f"""
    SELECT
        COUNT(*) as total,
        COUNT(CASE WHEN salary > 120000 THEN 1 END) as high_salary,
        AVG(salary) as avg_salary,
        COUNT(DISTINCT country) as countries
    FROM read_parquet('{parquet_optimize}')
""").df()

print(result_duckdb)

# Cleanup: remove the working directory created for this topic
shutil.rmtree(optimize_dir, ignore_errors=True)

print("\n✅ Filtros e Projections demonstram o poder das otimizações!")
print("   - Push-down filters reduzem dados lidos do disco")
print("   - Column projection reduz memória utilizada")
print("   - Combinadas = máxima performance!")