# Capítulo 04: Zero-Copy e Performance

Notebook gerado automaticamente a partir do código-fonte Python.


In [None]:
# Instala√ß√£o de pacotes necess√°rios
!pip install pyarrow duckdb pandas numpy

## 📚 Introdução

Este notebook aborda Zero-Copy e Performance:
- Fundamentos zero-copy
- Processamento vetorizado
- Memory mapping
- Buffer management
- Benchmarks

In [None]:
# -*- coding: utf-8 -*-
"""
Cap√≠tulo 04: Zero-Copy e Performance
Curso: Apache Arrow + DuckDB
"""

import sys
import os
import shutil
import psutil
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import pyarrow.compute as pc
import duckdb
import pandas as pd
import numpy as np
import time

# Chapter banner: title framed by two 60-character rules
for banner_line in ("=" * 60, "CAP√çTULO 04: ZERO-COPY E PERFORMANCE", "=" * 60):
    print(banner_line)


## 🔧 Preparação dos Dados

Criação de dados de exemplo e conexão com DuckDB

In [None]:
# Global sample table (1000 rows) created once for the notebook
try:
    print("\nGerando dados de exemplo...")
    sample_columns = {
        'id': range(1000),
        'valor': np.random.randn(1000),
        'categoria': np.random.choice(['A', 'B', 'C'], 1000),
    }
    data = pa.table(sample_columns)
    print(f"Tabela PyArrow criada: {data.num_rows} linhas")
except Exception as e:
    # Best-effort: the failure is reported and the notebook carries on
    print(f"Erro ao criar dados: {e}")

# DuckDB connection (no database path given) reused by the later cells
con = duckdb.connect()

## 🚀 Tópico 1: Fundamentos zero-copy

Entendendo os conceitos de zero-copy e suas vantagens

In [None]:
print(f"\n--- {'Fundamentos zero-copy'.upper()} ---")

import sys
import tracemalloc

# 4.1.1 The problem: copy vs reference
print("\n1. Zero-Copy vs Copy: Compara√ß√£o de Mem√≥ria:")
print("-" * 40)

# Source data: a NumPy array holding the values 1..10 repeated 1000 times
numpy_array = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10] * 1000)

print(f"Array NumPy:")
print(f"  Shape: {numpy_array.shape}")
print(f"  Dtype: {numpy_array.dtype}")
print(f"  Memory: {numpy_array.nbytes:,} bytes")

# Approach 1: real copies (inefficient)
print("\nAbordagem 1: COPY (Ineficiente)")
tracemalloc.start()

array_copy1 = numpy_array.copy()
array_copy2 = numpy_array.copy()
array_copy3 = numpy_array.copy()

# Bytes currently allocated while all three copies are still alive
copied_bytes = tracemalloc.get_traced_memory()[0]
tracemalloc.stop()

total_memory_copy = numpy_array.nbytes * 3
print(f"  3 c√≥pias = {total_memory_copy:,} bytes extras em mem√≥ria")
# Back the computed figure with what tracemalloc actually observed
print(f"  Medido (tracemalloc): {copied_bytes:,} bytes")

# Approach 2: references (efficient)
print("\nAbordagem 2: ZERO-COPY (Eficiente)")
tracemalloc.start()

# Plain name bindings: every name points at the same array object
array_ref1 = numpy_array
array_ref2 = numpy_array
array_ref3 = numpy_array

# Whatever is traced here is interpreter bookkeeping, not array data
ref_bytes = tracemalloc.get_traced_memory()[0]
tracemalloc.stop()

print(f"  3 refer√™ncias = 0 bytes extras em mem√≥ria")
print(f"  Medido (tracemalloc): {ref_bytes:,} bytes")
print(f"  Mesmos dados? {array_ref1 is numpy_array}")

# 4.1.2 Arrow: zero-copy between languages
# Builds a small Arrow table, converts it to pandas and back, then compares
# the round-tripped table with the original.
print("\n2. Arrow: Zero-Copy entre linguagens:")
print("-" * 40)

# Arrow table with 100 rows: list of ints, random floats, random 'A'/'B'/'C' labels
arrow_table = pa.table({
    'id': list(range(100)),
    'values': np.random.randn(100),
    'category': np.random.choice(['A', 'B', 'C'], 100)
})

print(f"Arrow Table:")
print(f"  Shape: {arrow_table.num_rows} linhas, {arrow_table.num_columns} colunas")
print(f"  Memory: {arrow_table.nbytes:,} bytes")

# Convert to pandas
# NOTE(review): the message printed below says "not copied", but nothing in this
# cell requests or verifies zero-copy — confirm before relying on the claim
print("\nConvers√£o Arrow ‚Üí Pandas:")
pandas_df = arrow_table.to_pandas()
print(f"  Pandas DataFrame created (not copied)")

# Convert back to Arrow
# NOTE(review): same caveat — "zero-copy" is asserted by the message, not checked
print("\nConvers√£o Pandas ‚Üí Arrow:")
arrow_table2 = pa.Table.from_pandas(pandas_df)
print(f"  Arrow Table restored (zero-copy)")

# Check that the round trip preserved the table
print(f"\nTables s√£o iguais? {arrow_table.equals(arrow_table2)}")

# 4.1.3 Shared buffer
print("\n3. Buffer Compartilhado (Shared Memory):")
print("-" * 40)

# Allocate a 1000-byte Arrow buffer and fill it with the int64 values 1..125
# through a NumPy view over the same memory (1000 bytes / 8 bytes per int64)
buffer = pa.allocate_buffer(1000)
np.frombuffer(buffer, dtype=np.int64)[:] = np.arange(1, 126, dtype=np.int64)
print(f"Buffer alocado: {buffer.size} bytes")

# Two slices of the same buffer
slice1 = buffer.slice(0, 500)
slice2 = buffer.slice(500, 500)

print(f"  Slice 1: offset 0, size 500")
print(f"  Slice 2: offset 500, size 500")

# Build the array directly on top of slice1 instead of from a fresh Python list:
# buffers = [validity bitmap (None = no nulls), data buffer]
array_from_buffer = pa.Array.from_buffers(pa.int64(), 5, [None, slice1])
print(f"\nArray from buffer: {array_from_buffer}")

# 4.1.4 Demonstration with DuckDB
# Aggregates a 10,000-row Arrow table per country through a SQL query.
print("\n4. Zero-Copy com DuckDB:")
print("-" * 40)

# Arrow table: sequential customer ids, random amounts in [10, 1000), random country
large_table = pa.table({
    'customer_id': list(range(10000)),
    'amount': np.random.uniform(10, 1000, 10000),
    'country': np.random.choice(['USA', 'UK', 'Canada', 'Germany'], 10000)
})

print(f"Large Table: {large_table.num_rows} linhas")

# The SQL refers to the Python variable `large_table` by name, so the variable
# must keep this exact name.
# NOTE(review): presumably DuckDB reads the Arrow buffers in place (zero-copy) — confirm
result = con.execute("""
    SELECT 
        country,
        COUNT(*) as count,
        AVG(amount) as avg_amount,
        SUM(amount) as total_amount
    FROM large_table
    GROUP BY country
    ORDER BY total_amount DESC
""").arrow()

print("\nResultado (processado sem copiar dados):")
print(result)

# 4.1.5 Memory ownership
# Creates an Arrow array, derives a NumPy array from it, deletes the Arrow
# name and shows the NumPy array is still usable.
print("\n5. Ownership de Mem√≥ria:", "-" * 40, sep="\n")

arrow_data = pa.array([1, 2, 3, 4, 5])
print(
    f"Arrow Array: {arrow_data}",
    f"  Type: {arrow_data.type}",
    f"  Length: {len(arrow_data)}",
    sep="\n",
)

# NumPy array obtained from the Arrow array
numpy_from_arrow = arrow_data.to_numpy()
print(
    f"\nNumPy array from Arrow:",
    f"  Data: {numpy_from_arrow}",
    f"  Dtype: {numpy_from_arrow.dtype}",
    sep="\n",
)

# Drop the Arrow name; the NumPy array is printed afterwards to show it survives
del arrow_data
print(
    f"\nArrow array deletado (mem√≥ria liberada automaticamente)",
    f"NumPy array ainda existe: {numpy_from_arrow}",
    sep="\n",
)

print(
    "\n‚úÖ Zero-Copy permite compartilhar buffers de mem√≥ria",
    "   entre diferentes estruturas de dados sem c√≥pias!",
    sep="\n",
)


## ⚡ Tópico 2: Processamento vetorizado

Otimizando operações com processamento vetorizado

In [None]:
print(f"\n--- {'Processamento vetorizado'.upper()} ---")

# 4.2.1 Scalar vs vectorized operations
# Times the same computation (a * 2 + b) three ways: Python loop, NumPy, Arrow compute.
print("\n1. Opera√ß√µes Escalares vs Vetorizadas:")
print("-" * 40)

# 100,000-row table: random ints in [1, 100) for a and b, random floats for c
n = 100000
data = pa.table({
    'a': np.random.randint(1, 100, n),
    'b': np.random.randint(1, 100, n),
    'c': np.random.uniform(10, 1000, n)
})

print(f"Dataset: {data.num_rows:,} linhas")

# Strategy 1: scalar loop (slow)
# The conversion to Python lists happens before the timer starts
print("\nEstrat√©gia 1: Loop Escalar (Python puro)")
a_list = data['a'].to_pylist()
b_list = data['b'].to_pylist()

start = time.perf_counter()
result_scalar = []
for i in range(len(a_list)):
    result_scalar.append(a_list[i] * 2 + b_list[i])
time_scalar = time.perf_counter() - start

print(f"  Tempo: {time_scalar:.4f}s")
print(f"  Primeiros resultados: {result_scalar[:5]}")

# Strategy 2: vectorized NumPy (fast)
# The conversion to NumPy also happens outside the timed region
print("\nEstrat√©gia 2: NumPy Vetorizado")
a_numpy = data['a'].to_numpy()
b_numpy = data['b'].to_numpy()

start = time.perf_counter()
result_numpy = a_numpy * 2 + b_numpy
time_numpy = time.perf_counter() - start

print(f"  Tempo: {time_numpy:.6f}s")
# Guard against division by zero when the timer reads 0
speedup_np = time_scalar / max(time_numpy, 1e-9)
print(f"  Speedup: {speedup_np:.1f}x mais r√°pido")

# Strategy 3: Arrow compute
# Here the column lookups are inside the timed region
print("\nEstrat√©gia 3: Arrow Compute (Vetorizado)")

start = time.perf_counter()
a_col = data['a']
b_col = data['b']
result_arrow = pc.add(pc.multiply(a_col, 2), b_col)
time_arrow = time.perf_counter() - start

print(f"  Tempo: {time_arrow:.6f}s")
speedup_arrow_scalar = time_scalar / max(time_arrow, 1e-9)
speedup_arrow_numpy = time_numpy / max(time_arrow, 1e-9)
print(f"  Speedup vs Escalar: {speedup_arrow_scalar:.1f}x")
print(f"  Speedup vs NumPy: {speedup_arrow_numpy:.1f}x")

# 4.2.2 Complex vectorized operations
# Computes discounted sale totals, filters them and aggregates, all with Arrow compute.
print("\n2. Opera√ß√µes Complexas Vetorizadas:")
print("-" * 40)

# 50,000-row sales table with random product, price, quantity and discount
sales_data = pa.table({
    'product': np.random.choice(['A', 'B', 'C', 'D'], 50000),
    'price': np.random.uniform(10, 1000, 50000),
    'quantity': np.random.randint(1, 100, 50000),
    'discount': np.random.uniform(0, 0.5, 50000)
})

print(f"Sales Data: {sales_data.num_rows:,} registros")

# Everything between here and time_vectorized is inside the timed region
start = time.perf_counter()

# discounted_price = price * quantity * (1 - discount)
total_price = pc.multiply(sales_data['price'], sales_data['quantity'])
discounted_price = pc.multiply(total_price, pc.subtract(1, sales_data['discount']))

# Keep rows whose discounted price exceeds 500
# NOTE(review): `filtered` is not used below; the aggregations run on the
# unfiltered discounted_price — confirm this is intended
mask = pc.greater(discounted_price, 500)
filtered = sales_data.filter(mask)

# Aggregations over discounted_price
avg_discounted = pc.mean(discounted_price)
max_discounted = pc.max(discounted_price)
count = pc.count(discounted_price)

time_vectorized = time.perf_counter() - start

print(f"\nResultados (tempo: {time_vectorized:.4f}s):")
print(f"  Quantidade de registros: {count.as_py()}")
print(f"  Pre√ßo m√©dio (com desconto): {avg_discounted.as_py():.2f}")
print(f"  Pre√ßo m√°ximo (com desconto): {max_discounted.as_py():.2f}")

# 4.2.3 Operations with DuckDB
# Runs the discounted-sales aggregation as SQL, grouped by product.
print("\n3. DuckDB: Processamento Vetorizado Autom√°tico:")
print("-" * 40)

# The query refers to the Python variable `sales_data` by name; the timer
# covers query execution and the conversion to a pandas DataFrame (.df())
start = time.perf_counter()

result_duckdb = con.execute(f"""
    SELECT 
        product,
        COUNT(*) as count,
        AVG(price * quantity * (1 - discount)) as avg_sale,
        SUM(price * quantity * (1 - discount)) as total_sale,
        MAX(price * quantity * (1 - discount)) as max_sale
    FROM sales_data
    WHERE price * quantity * (1 - discount) > 500
    GROUP BY product
    ORDER BY total_sale DESC
""").df()

time_duckdb = time.perf_counter() - start

print(f"Tempo DuckDB: {time_duckdb:.4f}s")
print("\nResultados por produto:")
print(result_duckdb)

# 4.2.4 Full benchmark
# Computes the mean of sales above 5000 with four strategies and ranks them by time.
print("\n4. Benchmark: Compara√ß√£o de Estrat√©gias")
print("-" * 40)

# One million random sales in [100, 10000) with a random category
operation_data = pa.table({
    'sales': np.random.uniform(100, 10000, 1000000),
    'category': np.random.choice(['A', 'B', 'C', 'D', 'E'], 1000000)
})

print(f"Dataset: {operation_data.num_rows:,} vendas")

# Strategy 1: pure Python
# The to_pylist() conversion is part of the timed region
print("\nEstrat√©gia 1: Python Puro (loop + condi√ß√µes)")
start = time.perf_counter()
sales_list = operation_data['sales'].to_pylist()
total = 0
count = 0
for sale in sales_list:
    if sale > 5000:
        total += sale
        count += 1
avg_python = total / count if count > 0 else 0
time_python = time.perf_counter() - start

print(f"  Tempo: {time_python:.4f}s")
print(f"  Resultado: {avg_python:.2f}")

# Strategy 2: vectorized NumPy (conversion included in the timing)
print("\nEstrat√©gia 2: NumPy Vetorizado")
start = time.perf_counter()
sales_np = operation_data['sales'].to_numpy()
mask_np = sales_np > 5000
avg_numpy = sales_np[mask_np].mean()
time_numpy_calc = time.perf_counter() - start

print(f"  Tempo: {time_numpy_calc:.6f}s")
speedup_np_calc = time_python / max(time_numpy_calc, 1e-9)
print(f"  Speedup: {speedup_np_calc:.1f}x")

# Strategy 3: Arrow compute
print("\nEstrat√©gia 3: Arrow Compute")
start = time.perf_counter()
sales_col = operation_data['sales']
mask_arrow = pc.greater(sales_col, 5000)
filtered_sales = pc.filter(sales_col, mask_arrow)
avg_arrow = pc.mean(filtered_sales).as_py()
time_arrow_calc = time.perf_counter() - start

print(f"  Tempo: {time_arrow_calc:.6f}s")
speedup_arrow_python = time_python / max(time_arrow_calc, 1e-9)
speedup_arrow_numpy_calc = time_numpy_calc / max(time_arrow_calc, 1e-9)
print(f"  Speedup vs Python: {speedup_arrow_python:.1f}x")
print(f"  Speedup vs NumPy: {speedup_arrow_numpy_calc:.1f}x")

# Strategy 4: DuckDB SQL (the query refers to `operation_data` by name)
print("\nEstrat√©gia 4: DuckDB SQL (vetorizado autom√°tico)")
start = time.perf_counter()
result_sql = con.execute("""
    SELECT AVG(sales) as avg_sale
    FROM operation_data
    WHERE sales > 5000
""").fetchone()
time_sql = time.perf_counter() - start

print(f"  Tempo: {time_sql:.4f}s")
speedup_sql = time_python / max(time_sql, 1e-9)
print(f"  Speedup vs Python: {speedup_sql:.1f}x")

# Summary: strategies sorted fastest first, speedup relative to pure Python
print("\nüìä Resumo de Velocidades:")
print("-" * 40)
timings = [
    ("Python Puro", time_python),
    ("NumPy", time_numpy_calc),
    ("Arrow Compute", time_arrow_calc),
    ("DuckDB SQL", time_sql)
]
timings.sort(key=lambda x: x[1])

for i, (strategy, elapsed) in enumerate(timings, 1):
    # max(..., 1e-9) avoids dividing by a zero timer reading
    speedup = time_python / max(elapsed, 1e-9)
    print(f"  {i}. {strategy:.<20} {elapsed:.6f}s ({speedup:.1f}x)")

print("\n‚úÖ Processamento vetorizado √© fundamental para performance!")
print("   Loop scalares devem ser evitados a todo custo!")


## 💾 Tópico 3: Memory mapping

Técnicas de mapeamento de memória para eficiência

In [None]:
print(f"\n--- {'Memory mapping'.upper()} ---")

import os
import shutil
import psutil

# 4.3.1 Memory-mapping concepts
# Writes a one-million-row Parquet file that the following cells read back.
print("\n1. Memory Mapping: Conceito e Benef√≠cios:")
print("-" * 40)

# Start from an empty working directory
mmap_dir = 'mmap_data'
if os.path.exists(mmap_dir):
    shutil.rmtree(mmap_dir)
os.makedirs(mmap_dir, exist_ok=True)

# One million records; ids come from np.arange so no Python list of one
# million int objects is materialised (int64, same Arrow type as before)
n_records = 1000000
mmap_table = pa.table({
    'id': np.arange(1, n_records + 1, dtype=np.int64),
    'value': np.random.randn(n_records),
    'category': np.random.choice(['A', 'B', 'C', 'D', 'E'], n_records),
    # Date-like strings: month cycles 01..12, day cycles 01..28
    'timestamp': pa.array([f'2024-{(i%12)+1:02d}-{(i%28)+1:02d}' for i in range(n_records)]),
    'amount': np.random.uniform(10, 10000, n_records)
})

parquet_file = f'{mmap_dir}/large_dataset.parquet'
pq.write_table(mmap_table, parquet_file)

file_size = os.path.getsize(parquet_file)
print(f"Arquivo Parquet criado:")
print(f"  Caminho: {parquet_file}")
print(f"  Tamanho: {file_size / 1024 / 1024:.2f} MB")
print(f"  Registros: {n_records:,}")

# 4.3.2 Traditional read vs filtered read
# Each strategy samples the process RSS before and after the read and times the read.
print("\n2. Leitura Tradicional vs Memory Mapping:")
print("-" * 40)

# Strategy 1: read the whole file into a table
print("Estrat√©gia 1: Leitura Tradicional (load all into RAM)")
process = psutil.Process(os.getpid())
mem_before = process.memory_info().rss / 1024 / 1024  # RSS in MB

start = time.time()
full_table = pq.read_table(parquet_file)
time_traditional = time.time() - start

mem_after = process.memory_info().rss / 1024 / 1024
mem_used = mem_after - mem_before

print(f"  Tempo de leitura: {time_traditional:.4f}s")
print(f"  Mem√≥ria usada: {mem_used:.2f} MB")
print(f"  Dados na mem√≥ria: {full_table.nbytes / 1024 / 1024:.2f} MB")

# Strategy 2: read with a filter so only rows with amount > 5000 are returned
print("\nEstrat√©gia 2: Leitura com Filtro (load only filtered data)")
process = psutil.Process(os.getpid())
mem_before = process.memory_info().rss / 1024 / 1024

start = time.time()
filtered_table = pq.read_table(
    parquet_file,
    filters=[('amount', '>', 5000)]
)
time_filtered = time.time() - start

mem_after = process.memory_info().rss / 1024 / 1024
mem_used_filtered = mem_after - mem_before

print(f"  Tempo de leitura: {time_filtered:.4f}s")
print(f"  Mem√≥ria usada: {mem_used_filtered:.2f} MB")
print(f"  Dados na mem√≥ria: {filtered_table.nbytes / 1024 / 1024:.2f} MB")
print(f"  Registros recuperados: {filtered_table.num_rows:,}")

# 4.3.3 Streaming read
# Iterates the Parquet file batch by batch, counting batches and rows.
print("\n3. Leitura Streaming (Memory Mapping):")
print("-" * 40)

# Scanner that yields batches with batch_size=50000
# NOTE(review): the scanner is created with batch_size only; nothing in this
# cell explicitly enables memory mapping — confirm the "memory mapped" wording
print("Usando PyArrow Scanner com batch_size:")

dataset = ds.dataset(parquet_file, format='parquet')
scanner = dataset.scanner(batch_size=50000)

total_rows = 0
batch_count = 0
start = time.time()

for batch in scanner.to_batches():
    batch_count += 1
    total_rows += batch.num_rows
    
    # Only the first batch is reported; the rest are just counted
    if batch_count == 1:
        print(f"  Batch 1: {batch.num_rows:,} registros")

time_streaming = time.time() - start

print(f"  Total de batches: {batch_count}")
print(f"  Total de registros: {total_rows:,}")
print(f"  Tempo: {time_streaming:.4f}s")

# 4.3.4 Column selection
# Compares reading every column with reading only 'id' and 'amount'.
print("\n4. Column Mapping (Ler colunas seletivamente):")
print("-" * 40)

# Option A: read all columns
print("Op√ß√£o A: Ler todas as 5 colunas")
start = time.time()
all_cols = pq.read_table(parquet_file)
time_all = time.time() - start
mem_all = all_cols.nbytes / 1024 / 1024  # table size in MB

print(f"  Tempo: {time_all:.4f}s")
print(f"  Mem√≥ria: {mem_all:.2f} MB")

# Option B: read only the two columns that are needed
print("\nOp√ß√£o B: Mapear apenas 2 colunas (id, amount)")
start = time.time()
mapped_cols = pq.read_table(parquet_file, columns=['id', 'amount'])
time_mapped = time.time() - start
mem_mapped = mapped_cols.nbytes / 1024 / 1024

print(f"  Tempo: {time_mapped:.4f}s")
print(f"  Mem√≥ria: {mem_mapped:.2f} MB")
# Saving expressed as a percentage of the full-table size
print(f"  Economia: {(1 - mem_mapped / mem_all) * 100:.1f}%")

# 4.3.5 DuckDB reading the Parquet file directly
# Two queries run against read_parquet(parquet_file); each is timed including
# the conversion of the result to a pandas DataFrame.
print("\n5. DuckDB: Memory Mapping Autom√°tico:")
print("-" * 40)

# NOTE(review): the claim that DuckDB uses memory mapping here is not
# demonstrated by this cell — confirm
print("Query 1: Agrega√ß√£o com filtro")
start = time.time()

# The file path is interpolated into the SQL text
result1 = con.execute(f"""
    SELECT 
        category,
        COUNT(*) as count,
        AVG(amount) as avg_amount,
        MAX(amount) as max_amount
    FROM read_parquet('{parquet_file}')
    WHERE amount > 5000
    GROUP BY category
    ORDER BY avg_amount DESC
""").df()

time_query1 = time.time() - start

print(f"  Tempo: {time_query1:.4f}s")
print(f"  Resultados: {len(result1)} categorias")
print(result1)

# Query 2: global statistics over the amount column
print("\nQuery 2: Stat√≠sticas globais (sem carregar tudo)")
start = time.time()

result2 = con.execute(f"""
    SELECT 
        COUNT(*) as total_records,
        AVG(amount) as global_avg,
        MIN(amount) as min_amount,
        MAX(amount) as max_amount,
        STDDEV(amount) as std_amount
    FROM read_parquet('{parquet_file}')
""").df()

time_query2 = time.time() - start

print(f"  Tempo: {time_query2:.4f}s")
print(result2)

# 4.3.6 Strategy comparison
# Ranks the access strategies timed in the previous cells, fastest first, and
# reports each one's speedup relative to the traditional full read.
print("\n6. Compara√ß√£o: Strategies de Acesso a Dados")
print("-" * 40)

strategies = [
    ("Full Load + Process", time_traditional + time_query1),
    ("Filtered Load", time_filtered),
    ("Streaming (Memory Mapped)", time_streaming),
    ("DuckDB Query", time_query2)
]

strategies.sort(key=lambda x: x[1])

print("\nOrdenado por velocidade:")
for i, (strategy, elapsed) in enumerate(strategies, 1):
    # These timings come from time.time(), which can read 0.0 for very fast
    # operations; clamp the divisor like the other summaries in this notebook
    speedup = time_traditional / max(elapsed, 1e-9)
    print(f"  {i}. {strategy:.<35} {elapsed:.4f}s ({speedup:.1f}x)")

print("\n‚úÖ Memory Mapping benef√≠cios:")
print("   - Carrega apenas dados necess√°rios")
print("   - Economiza RAM quando processando em streaming")
print("   - Deixa o SO gerenciar cache de p√°ginas")
print("   - Ideal para arquivos maiores que a RAM")

# Cleanup: remove the working directory created for this topic
shutil.rmtree(mmap_dir, ignore_errors=True)

## 🔄 Tópico 4: Buffer management

Gerenciamento eficiente de buffers de memória

In [None]:
print(f"\n--- {'Buffer management'.upper()} ---")

# 4.4.1 Batch management
# Writes a five-million-row Parquet file used by the batch-size tests below.
print("\n1. Gerenciamento de Batch Size:")
print("-" * 40)

# ids come from np.arange so no Python list of five million int objects is
# materialised (int64, same Arrow type as before)
n = 5000000
large_data = pa.table({
    'id': np.arange(1, n + 1, dtype=np.int64),
    'value': np.random.randn(n),
    'category': np.random.choice(['A', 'B', 'C', 'D'], n),
    'amount': np.random.uniform(10, 10000, n)
})

# Start from an empty working directory and save the table for the tests
buffer_dir = 'buffer_data'
if os.path.exists(buffer_dir):
    shutil.rmtree(buffer_dir)
os.makedirs(buffer_dir, exist_ok=True)

parquet_buffer = f'{buffer_dir}/buffer_test.parquet'
pq.write_table(large_data, parquet_buffer)

print(f"Dataset: {n:,} registros, {large_data.nbytes / 1024 / 1024:.2f} MB")

def _scan_batches(scanner):
    """Drain *scanner*; return (batch count, largest batch in bytes, elapsed seconds)."""
    seen = 0
    largest = 0
    began = time.perf_counter()
    for record_batch in scanner.to_batches():
        seen += 1
        largest = max(largest, record_batch.nbytes)
    return seen, largest, time.perf_counter() - began


# Test 1: small batches
print("\nTeste 1: Batch Size = 10.000 registros")
dataset = ds.dataset(parquet_buffer, format='parquet')
scanner_small = dataset.scanner(batch_size=10000)
batch_count, peak_memory, time_small_batch = _scan_batches(scanner_small)
print(
    f"  Batches processados: {batch_count}",
    f"  Mem√≥ria de pico por batch: {peak_memory / 1024 / 1024:.2f} MB",
    f"  Tempo total: {time_small_batch:.4f}s",
    sep="\n",
)

# Test 2: medium batches
print("\nTeste 2: Batch Size = 100.000 registros")
scanner_medium = dataset.scanner(batch_size=100000)
batch_count, peak_memory, time_medium_batch = _scan_batches(scanner_medium)
print(
    f"  Batches processados: {batch_count}",
    f"  Mem√≥ria de pico por batch: {peak_memory / 1024 / 1024:.2f} MB",
    f"  Tempo total: {time_medium_batch:.4f}s",
    sep="\n",
)

# Test 3: large batches
print("\nTeste 3: Batch Size = 500.000 registros")
scanner_large = dataset.scanner(batch_size=500000)
batch_count, peak_memory, time_large_batch = _scan_batches(scanner_large)
print(
    f"  Batches processados: {batch_count}",
    f"  Mem√≥ria de pico por batch: {peak_memory / 1024 / 1024:.2f} MB",
    f"  Tempo total: {time_large_batch:.4f}s",
    sep="\n",
)

# 4.4.2 Buffer pool management
print("\n2. Buffer Pool: Reutilizar buffers:")
print("-" * 40)


class BufferPool:
    """Fixed-size pool of reusable ``bytearray`` buffers.

    ``buffers`` holds the pre-allocated buffers and ``available`` the indices
    of the ones that are currently free. When the pool is exhausted,
    ``acquire`` falls back to a fresh, unpooled ``bytearray``.
    """

    def __init__(self, buffer_size=65536, pool_size=10):
        """Pre-allocate ``pool_size`` buffers of ``buffer_size`` bytes each."""
        self.buffer_size = buffer_size
        self.buffers = [bytearray(buffer_size) for _ in range(pool_size)]
        self.available = list(range(pool_size))
        # Buffers are tracked by identity: list.index() compares bytearrays by
        # content, so equal-looking buffers would resolve to the wrong slot.
        self._slot_by_id = {id(buf): slot for slot, buf in enumerate(self.buffers)}
        self._leased = set()

    def acquire(self):
        """Return a free pooled buffer, or a new unpooled one if none is free."""
        if self.available:
            slot = self.available.pop()
            self._leased.add(slot)
            return self.buffers[slot]
        return bytearray(self.buffer_size)

    def release(self, buffer):
        """Return ``buffer`` to the pool.

        Buffers that do not belong to the pool (overflow allocations) and
        buffers that are not currently leased (double release) are ignored,
        so a slot can never appear twice in ``available``.
        """
        slot = self._slot_by_id.get(id(buffer))
        if slot is None or self.buffers[slot] is not buffer:
            return
        if slot not in self._leased:
            return
        self._leased.discard(slot)
        self.available.append(slot)

def _time_pool_cycles(pool, cycles=1000):
    """Run acquire / write first byte / release *cycles* times; return elapsed seconds."""
    began = time.perf_counter()
    for step in range(cycles):
        leased = pool.acquire()
        leased[0] = step % 256
        pool.release(leased)
    return time.perf_counter() - began


# Small pool: 10 buffers of 65536 bytes each
print("Buffer Pool pequeno (10 buffers x 64KB):")
pool_small = BufferPool(buffer_size=65536, pool_size=10)
time_pool_small = _time_pool_cycles(pool_small)
print(f"  Tempo (1000 opera√ß√µes): {time_pool_small:.4f}s")

# Large pool: 100 buffers of 65536 bytes each
print("\nBuffer Pool grande (100 buffers x 64KB):")
pool_large = BufferPool(buffer_size=65536, pool_size=100)
time_pool_large = _time_pool_cycles(pool_large)
print(f"  Tempo (1000 opera√ß√µes): {time_pool_large:.4f}s")

# 4.4.3 Efficient memory allocation
# Allocates two 1 MiB buffers from Arrow's default memory pool and prints the
# pool's own statistics.
print("\n3. Aloca√ß√£o de Mem√≥ria Eficiente:", "-" * 40, sep="\n")

print("Arrow Memory Allocator:")

allocator = pa.default_memory_pool()
print(f"  Default pool: {allocator.backend_name}")

one_mib = 1024 * 1024

# Both buffers are requested explicitly from the same pool
buffer1 = pa.allocate_buffer(one_mib, memory_pool=allocator)
print(f"  Alocado buffer 1: {buffer1.size / 1024 / 1024:.1f} MB")

buffer2 = pa.allocate_buffer(one_mib, memory_pool=allocator)
print(f"  Alocado buffer 2: {buffer2.size / 1024 / 1024:.1f} MB")

# Pool statistics: bytes currently allocated and the peak seen so far
pool_allocated_mb = allocator.bytes_allocated() / 1024 / 1024
pool_peak_mb = allocator.max_memory() / 1024 / 1024
print(f"  Mem√≥ria alocada no pool: {pool_allocated_mb:.2f} MB")
print(f"  Pico de mem√≥ria no pool: {pool_peak_mb:.2f} MB")

# 4.4.4 Processing with memory control
print("\n4. Processamento com Limite de Mem√≥ria:", "-" * 40, sep="\n")


def process_data_with_limit(parquet_file, batch_size=50000, memory_limit_mb=100):
    """Scan a Parquet file in batches, stopping at the first oversized batch.

    Args:
        parquet_file: Path of the Parquet file to scan.
        batch_size: Batch size passed to the dataset scanner.
        memory_limit_mb: Largest batch size, in MB, that will be processed.

    Returns:
        Tuple ``(rows processed, batches processed)``. A batch larger than the
        limit prints a warning, is not counted, and ends the scan.
    """
    scanner = ds.dataset(parquet_file, format='parquet').scanner(batch_size=batch_size)

    rows_seen = 0
    batches_seen = 0

    for record_batch in scanner.to_batches():
        size_mb = record_batch.nbytes / 1024 / 1024

        if size_mb > memory_limit_mb:
            print(f"  ‚ö†Ô∏è Batch de {size_mb:.2f} MB excede limite de {memory_limit_mb} MB")
            break

        # "Processing" here only counts the rows
        rows_seen += record_batch.num_rows
        batches_seen += 1

    return rows_seen, batches_seen


print(f"Processando com limite de 50 MB por batch:")
total, batch_count = process_data_with_limit(parquet_buffer, batch_size=50000, memory_limit_mb=50)
print(f"  Registros processados: {total:,}", f"  Batches: {batch_count}", sep="\n")

# 4.4.5 Optimised configuration
# Times a full scan of the Parquet file for three named configurations and
# prints one table row per configuration.
print("\n5. Configura√ß√£o Otimizada de Buffer:")
print("-" * 40)

# Each entry: (name, batch_size, io concurrency value)
# NOTE(review): the third value is only printed in the "IO Conc" column; it is
# never passed to the scanner, so it has no effect on the measured time
configs = [
    ("Conservador", 10000, 100),      # batch_size, max_io_concurrency
    ("Balanceado", 50000, 50),
    ("Agressivo", 200000, 10)
]

print("Benchmark de configura√ß√µes:")
print(f"{'Modo':<15} {'Batch Size':<12} {'IO Conc':<10} {'Tempo (s)':<10} {'Batches':<10}")
print("-" * 65)

for name, batch_size, io_conc in configs:
    # Dataset and scanner creation are outside the timed region
    dataset = ds.dataset(parquet_buffer, format='parquet')
    scanner = dataset.scanner(batch_size=batch_size)
    
    start = time.perf_counter()
    batch_count = 0
    
    for batch in scanner.to_batches():
        batch_count += 1
    
    elapsed = time.perf_counter() - start
    print(f"{name:<15} {batch_size:<12,} {io_conc:<10} {elapsed:<10.4f} {batch_count:<10}")

# 4.4.6 DuckDB buffer management
# Runs a filtered, grouped aggregation directly over the Parquet file and times it.
print("\n6. DuckDB Buffer Management Autom√°tico:")
print("-" * 40)

# NOTE(review): the statement that DuckDB manages buffers automatically is
# asserted by the message, not demonstrated by this cell
print("DuckDB otimiza buffer management automaticamente")

# The timer covers query execution and conversion to a pandas DataFrame
start = time.perf_counter()

result = con.execute(f"""
    SELECT 
        category,
        COUNT(*) as count,
        AVG(amount) as avg_amount,
        SUM(amount) as total_amount
    FROM (
        SELECT * FROM read_parquet('{parquet_buffer}')
        WHERE amount > 1000
    )
    GROUP BY category
    ORDER BY total_amount DESC
""").df()

elapsed = time.perf_counter() - start

print(f"\nQuery com streaming:")
print(f"  Tempo: {elapsed:.4f}s")
print(f"  Resultados:")
print(result)

print("\n‚úÖ Buffer Management otimizado:")
print("   - Batch size apropriado = melhor performance")
print("   - Reutilizar buffers reduz aloca√ß√µes")
print("   - DuckDB gerencia automaticamente")
print("   - Limite de mem√≥ria = processamento previs√≠vel")

# Cleanup: remove the working directory created for this topic
shutil.rmtree(buffer_dir, ignore_errors=True)


## 📊 Tópico 5: Benchmarks

Medindo e comparando performance

In [None]:
print(f"\n--- {'Benchmarks'.upper()} ---")

# 4.5.1 Benchmark: zero-copy vs copy
# Compares an Arrow -> pandas -> Arrow -> pandas round trip with building new
# tables that reuse the existing Arrow columns.
print("\n1. Benchmark: Zero-Copy vs Copy:")
print("-" * 40)

# Ten million rows; ids come from np.arange so no Python list of ten million
# int objects is materialised (int64, same Arrow type as before)
n = 10000000
bench_data = pa.table({
    'id': np.arange(1, n + 1, dtype=np.int64),
    'value1': np.random.randn(n),
    'value2': np.random.randn(n),
    'category': np.random.choice(['A', 'B', 'C', 'D', 'E'], n)
})

print(f"Dataset: {n:,} registros, {bench_data.nbytes / 1024 / 1024:.2f} MB")

# Test 1: conversions that create new objects
print("\nTeste 1: COM C√≥pia (create new objects)")
start = time.perf_counter()
copy1 = bench_data.to_pandas()
copy2 = pa.Table.from_pandas(copy1)
copy3 = copy2.to_pandas()
time_with_copy = time.perf_counter() - start

print(f"  Tempo: {time_with_copy:.4f}s")
print(f"  Opera√ß√µes: to_pandas() -> from_pandas() -> to_pandas()")

# Test 2: column projections and recombination inside Arrow
print("\nTeste 2: ZERO-Copy (reference same buffers)")
start = time.perf_counter()
view1 = bench_data.select(['id', 'value1'])
view2 = bench_data.select(['value2', 'category'])

# New table assembled from the columns of the two projections
view3 = pa.table({
    'id': view1['id'],
    'value1': view1['value1'],
    'value2': view2['value2'],
    'category': view2['category']
})
time_zero_copy = time.perf_counter() - start

print(f"  Tempo: {time_zero_copy:.6f}s")
# max(..., 1e-9) avoids dividing by a zero timer reading
speedup_zero = time_with_copy / max(time_zero_copy, 1e-9)
print(f"  Speedup: {speedup_zero:.1f}x")

# 4.5.2 Benchmark: vectorized vs scalar
# Computes (a * 2 + b) / c with a Python loop on a sample (time extrapolated to
# the full dataset) and with Arrow compute on the full dataset.
print("\n2. Benchmark: Vetorizado vs Escalar:")
print("-" * 40)

# Five million rows of random ints (a, b) and floats (c)
n_comp = 5000000
comp_data = pa.table({
    'a': np.random.randint(1, 1000, n_comp),
    'b': np.random.randint(1, 1000, n_comp),
    'c': np.random.uniform(10, 1000, n_comp)
})

print(f"Dataset: {n_comp:,} registros")

# Scalar: only a 10k-row sample is looped over, so only that sample is
# converted to Python objects instead of all three full columns
print("\nOp√ß√£o 1: Loop Escalar (Python loop)")
scalar_sample = min(10000, n_comp)
a_list = comp_data['a'].slice(0, scalar_sample).to_pylist()
b_list = comp_data['b'].slice(0, scalar_sample).to_pylist()
c_list = comp_data['c'].slice(0, scalar_sample).to_pylist()

start = time.perf_counter()
result_scalar = []
for i in range(scalar_sample):
    result_scalar.append((a_list[i] * 2 + b_list[i]) / c_list[i])
# Extrapolate the sample time to the full dataset using the actual sample size
time_scalar = (time.perf_counter() - start) * (n_comp / scalar_sample)

print(f"  Tempo (extrapolado para {n_comp:,} linhas): {time_scalar:.4f}s")

# Vectorized with Arrow compute over the full columns
print("\nOp√ß√£o 2: Arrow Compute (Vetorizado)")
start = time.perf_counter()

result_arrow = pc.divide(
    pc.add(pc.multiply(comp_data['a'], 2), comp_data['b']),
    comp_data['c']
)

time_vector = time.perf_counter() - start

print(f"  Tempo: {time_vector:.6f}s")
speedup_vec = time_scalar / max(time_vector, 1e-9)
print(f"  Speedup: {speedup_vec:.1f}x")

# 4.5.3 Benchmark: I/O strategies
# Writes a 100,000-row Parquet file and times four ways of reading it, each
# followed by a COUNT(*) query over the resulting table.
print("\n3. Benchmark: Estrat√©gias de I/O:")
print("-" * 40)

# Start from an empty working directory for the Parquet file
bench_dir = 'bench_data'
if os.path.exists(bench_dir):
    shutil.rmtree(bench_dir)
os.makedirs(bench_dir, exist_ok=True)

io_table = pa.table({
    'id': list(range(100000)),
    'value': np.random.randn(100000),
    'category': np.random.choice(['A', 'B', 'C'], 100000),
    'amount': np.random.uniform(10, 1000, 100000)
})

parquet_bench = f'{bench_dir}/benchmark.parquet'
pq.write_table(io_table, parquet_bench)

print(f"Arquivo: {os.path.getsize(parquet_bench) / 1024 / 1024:.2f} MB")

# Strategy 1: full load (baseline for the speedups below)
# The SQL refers to the Python variable `full_tbl` by name
print("\nEstrat√©gia 1: Full Load (ler arquivo completo)")
start = time.perf_counter()
full_tbl = pq.read_table(parquet_bench)
result_full = con.execute("SELECT COUNT(*) FROM full_tbl").fetchone()
time_full = time.perf_counter() - start

print(f"  Tempo: {time_full:.4f}s")

# Strategy 2: read with a filter on amount
print("\nEstrat√©gia 2: With Filter (push-down no Parquet)")
start = time.perf_counter()
filtered_tbl = pq.read_table(parquet_bench, filters=[('amount', '>', 500)])
result_filtered = con.execute("SELECT COUNT(*) FROM filtered_tbl").fetchone()
time_filtered = time.perf_counter() - start

print(f"  Tempo: {time_filtered:.4f}s")
speedup_filt = time_full / max(time_filtered, 1e-9)
print(f"  Speedup: {speedup_filt:.1f}x")

# Strategy 3: read only the 'id' and 'amount' columns
print("\nEstrat√©gia 3: With Projection (selecionar colunas)")
start = time.perf_counter()
projected_tbl = pq.read_table(parquet_bench, columns=['id', 'amount'])
result_proj = con.execute("SELECT COUNT(*) FROM projected_tbl").fetchone()
time_proj = time.perf_counter() - start

print(f"  Tempo: {time_proj:.4f}s")
speedup_proj = time_full / max(time_proj, 1e-9)
print(f"  Speedup: {speedup_proj:.1f}x")

# Strategy 4: filter and column selection combined
print("\nEstrat√©gia 4: With Filter + Projection (otimizado)")
start = time.perf_counter()
optimized_tbl = pq.read_table(
    parquet_bench,
    columns=['id', 'amount'],
    filters=[('amount', '>', 500)]
)
result_opt = con.execute("SELECT COUNT(*) FROM optimized_tbl").fetchone()
time_opt = time.perf_counter() - start

print(f"  Tempo: {time_opt:.4f}s")
speedup_opt = time_full / max(time_opt, 1e-9)
print(f"  Speedup: {speedup_opt:.1f}x")

# 4.5.4 Benchmark: memory usage
# For each technique, records the change in process RSS (MB) and the elapsed
# time, then prints one table row.
print("\n4. Benchmark: Uso de Mem√≥ria:")
print("-" * 40)

print("Compara√ß√£o de t√©cnicas:")
print(f"{'T√©cnica':<30} {'Mem√≥ria (MB)':<15} {'Tempo (s)':<10}")
print("-" * 55)

# Technique 1: convert the whole table to pandas
# The RSS is sampled before the timer is stopped, so the psutil call is
# included in the reported time
mem_before = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
start = time.perf_counter()
df_pd = io_table.to_pandas()
mem_pandas = (psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024) - mem_before
time_pandas = time.perf_counter() - start

print(f"{'Pandas Full':<30} {mem_pandas:<15.2f} {time_pandas:<10.4f}")

# Technique 2: Arrow column selection
mem_before = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
start = time.perf_counter()
arrow_view = io_table.select(['id', 'amount'])
mem_arrow = (psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024) - mem_before
time_arrow = time.perf_counter() - start

print(f"{'Arrow Zero-Copy':<30} {mem_arrow:<15.2f} {time_arrow:<10.4f}")

# Technique 3: stream the Parquet file in batches with batch_size=10000
mem_before = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
start = time.perf_counter()
dataset_bench = ds.dataset(parquet_bench, format='parquet')
scanner_bench = dataset_bench.scanner(batch_size=10000)
batch_count_idx = 0
for _ in scanner_bench.to_batches():
    batch_count_idx += 1
mem_stream = (psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024) - mem_before
time_stream = time.perf_counter() - start

print(f"{'Streaming (10K batch)':<30} {mem_stream:<15.2f} {time_stream:<10.4f}")

# 4.5.5 Final benchmark: end to end
# Times two complete workflows over the same Parquet file: read, filter
# amount > 500, aggregate per category, write the result to CSV.
print("\n5. Benchmark End-to-End: Diferentes Fluxos:")
print("-" * 40)

# Workload description
print("Carga: Ler, filtrar, agregar, salvar resultado")

# Workflow 1: pandas
print("\nFluxo 1: Pandas (copy-based)")
start = time.perf_counter()
pdf = pq.read_table(parquet_bench).to_pandas()
filtered_pdf = pdf[pdf['amount'] > 500]
result_pandas = filtered_pdf.groupby('category')['amount'].agg(['count', 'mean', 'sum'])
result_pandas.to_csv(f'{bench_dir}/result_pandas.csv')
time_workflow_pandas = time.perf_counter() - start

print(f"  Tempo: {time_workflow_pandas:.4f}s")

# Workflow 2: Arrow table queried by DuckDB
# The SQL refers to the Python variable `tbl_bench` by name
print("\nFluxo 2: Arrow + DuckDB (zero-copy)")
start = time.perf_counter()
tbl_bench = pq.read_table(parquet_bench)
result_duckdb = con.execute(f"""
    SELECT 
        category,
        COUNT(*) as count,
        AVG(amount) as mean,
        SUM(amount) as sum
    FROM tbl_bench
    WHERE amount > 500
    GROUP BY category
""").df()
result_duckdb.to_csv(f'{bench_dir}/result_duckdb.csv', index=False)
time_workflow_arrow = time.perf_counter() - start

print(f"  Tempo: {time_workflow_arrow:.4f}s")
# max(..., 1e-9) avoids dividing by a zero timer reading
speedup_workflow = time_workflow_pandas / max(time_workflow_arrow, 1e-9)
print(f"  Speedup: {speedup_workflow:.1f}x")

# 4.5.6 Performance summary
# Prints one row per benchmark: the time of the improved approach and its
# speedup over the baseline approach.
print("\n6. Resumo de Performance:")
print("-" * 40)

# Each entry is (name, time of the improved approach, time of the baseline).
# The last two rows previously had the two times swapped, which inverted the
# speedup relative to the values printed in their own sections.
benchmarks = [
    ("Zero-Copy vs Copy", time_zero_copy, time_with_copy),
    ("Vetorizado vs Escalar", time_vector, time_scalar),
    ("Full vs Optimized", time_opt, time_full),
    ("Pandas vs Arrow", time_workflow_arrow, time_workflow_pandas)
]

print(f"{'Benchmark':<30} {'Tempo':<12} {'Speedup':<10}")
print("-" * 55)

for name, time_new, time_old in benchmarks:
    # max(..., 1e-9) avoids dividing by a zero timer reading
    speedup = time_old / max(time_new, 1e-9)
    print(f"{name:<30} {time_new:<12.6f}s {speedup:<10.1f}x")

print("\n‚úÖ Performance Insights:")
print("   ‚Ä¢ Zero-Copy: evite c√≥pias desnecess√°rias")
print("   ‚Ä¢ Vetoriza√ß√£o: 10-100x mais r√°pido que loops")
print("   ‚Ä¢ Push-down Filters: reduz I/O")
print("   ‚Ä¢ Projections: economiza mem√≥ria")
print("   ‚Ä¢ Arrow + DuckDB: otimiza√ß√£o autom√°tica")

# Cleanup: remove the working directory created for the I/O benchmarks
shutil.rmtree(bench_dir, ignore_errors=True)
