# Capítulo 2: Anatomia da Integração DuckDB ↔ Fluss

Este notebook demonstra os conceitos de integração entre DuckDB e sistemas de streaming colunar,
simulando operações de pushdown e otimizações.

## Setup: Instalação e Configuração

In [None]:
!pip install duckdb pyarrow pandas numpy matplotlib seaborn faker -q

In [None]:
import duckdb
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import time
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
from faker import Faker
import json

# Plotting defaults applied to the charts drawn later in the notebook.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 6)})

# Faker instance; nothing in the visible cells reads it.
fake = Faker()

# Show which library versions this run is using.
print(f"‚úÖ DuckDB: {duckdb.__version__}")
print(f"‚úÖ PyArrow: {pa.__version__}")

## 2.1 Simulação de Fluss Tablets

Criar múltiplos arquivos Parquet para simular tablets distribuídos.

In [None]:
import os

# Directory holding the simulated tablets (one Parquet file per tablet).
tablets_dir = 'fluss_tablets'
os.makedirs(tablets_dir, exist_ok=True)

# Partitioned sales data: every tablet gets the same number of rows.
num_tablets = 4
rows_per_tablet = 250_000

print(f"Gerando {num_tablets} tablets com {rows_per_tablet:,} linhas cada...")

# Read the clock once and build the per-row epoch seconds with NumPy instead
# of calling datetime.now() for every single row. Rows are spaced exactly one
# second apart, starting 30 days before "now"; all tablets share this range.
base_epoch = int((datetime.now() - timedelta(days=30)).timestamp())
timestamps = base_epoch + np.arange(rows_per_tablet, dtype=np.int64)

for tablet_id in range(num_tablets):
    # Numeric columns are drawn first so total_amount can be derived directly.
    customer_id = np.random.randint(1000, 100000, rows_per_tablet)
    product_id = np.random.randint(1, 10000, rows_per_tablet)
    quantity = np.random.randint(1, 10, rows_per_tablet)
    unit_price = np.round(np.random.uniform(10, 500, rows_per_tablet), 2)

    # Key order defines the Parquet column order (customer_id stays 2nd).
    data = {
        'order_id': [f'ORD-{tablet_id}-{i:06d}' for i in range(rows_per_tablet)],
        'customer_id': customer_id,
        'product_id': product_id,
        'quantity': quantity,
        'unit_price': unit_price,
        'total_amount': np.round(quantity * unit_price, 2),
        'timestamp': timestamps,
        'status': np.random.choice(['completed', 'pending', 'cancelled'], rows_per_tablet, p=[0.7, 0.2, 0.1]),
        'region': np.random.choice(['north', 'south', 'east', 'west'], rows_per_tablet),
        'payment_method': np.random.choice(['credit', 'debit', 'pix', 'cash'], rows_per_tablet)
    }

    # Persist the tablet as a Parquet file.
    df = pd.DataFrame(data)
    table = pa.Table.from_pandas(df)
    pq.write_table(table, f'{tablets_dir}/tablet_{tablet_id}.parquet')

    print(f"  ‚úì Tablet {tablet_id}: {rows_per_tablet:,} linhas")

print(f"\n‚úÖ Total: {num_tablets * rows_per_tablet:,} linhas em {num_tablets} tablets")

## 2.2 Query sem Pushdown (Full Scan)

Ler todos os dados e filtrar em memória.

In [None]:
# Open a DuckDB connection with default settings; later cells reuse `con`.
con = duckdb.connect()

# Query 1: baseline. Load every column of every tablet into pandas and only
# then apply the predicate on the client side.
print("=== Query 1: Full Scan (SEM Pushdown) ===")
full_scan_sql = f"""
    SELECT * FROM read_parquet('{tablets_dir}/*.parquet')
"""
start = time.perf_counter()

# Materialise all tablets as one DataFrame.
result = con.execute(full_scan_sql).fetchdf()

# Filter in memory, after everything has already been read.
is_north = result['region'] == 'north'
is_completed = result['status'] == 'completed'
filtered = result[is_north & is_completed]

time_no_pushdown = time.perf_counter() - start

print(f"Linhas lidas: {len(result):,}")
print(f"Linhas ap√≥s filtro: {len(filtered):,}")
print(f"Tempo total: {time_no_pushdown*1000:.1f}ms")
print(f"Dados transferidos: {result.memory_usage(deep=True).sum() / 1024 / 1024:.1f} MB")

## 2.3 Query com Projection Pushdown

Ler apenas colunas necessárias.

In [None]:
print("\n=== Query 2: Projection Pushdown (apenas colunas necess√°rias) ===")

# Only four columns are selected and the predicate lives in the SQL itself,
# so DuckDB decides what to read instead of pandas filtering afterwards.
projection_sql = f"""
    SELECT 
        order_id,
        customer_id,
        total_amount,
        timestamp
    FROM read_parquet('{tablets_dir}/*.parquet')
    WHERE region = 'north' AND status = 'completed'
"""
start = time.perf_counter()
result = con.execute(projection_sql).fetchdf()
time_projection = time.perf_counter() - start

print(f"Linhas retornadas: {len(result):,}")
print(f"Colunas: {list(result.columns)}")
print(f"Tempo total: {time_projection*1000:.1f}ms")
print(f"Dados transferidos: {result.memory_usage(deep=True).sum() / 1024 / 1024:.1f} MB")

# Compare against the full-scan timing measured in the previous cell.
speedup = time_no_pushdown / time_projection
print(f"\nüöÄ Speedup: {speedup:.1f}x mais r√°pido com pushdown!")

## 2.4 Filter Pushdown com Estatísticas

Demonstrar como metadados Parquet eliminam leitura de tablets.

In [None]:
# Read the Parquet footer of every tablet and collect the min/max statistics
# of customer_id, the values a reader can use to decide whether to skip a file.
print("=== Metadados dos Tablets ===")
metadata_list = []

for tablet_id in range(num_tablets):
    tablet_path = f'{tablets_dir}/tablet_{tablet_id}.parquet'
    # read_metadata only parses the footer and does not leave a file handle open.
    metadata = pq.read_metadata(tablet_path)

    # Locate the column by name so a schema reorder cannot silently pick
    # up statistics of a different column.
    customer_col_idx = metadata.schema.names.index('customer_id')

    # A file may contain several row groups; the file-level bounds are the
    # min of all mins and the max of all maxes.
    chunk_stats = [
        metadata.row_group(rg_idx).column(customer_col_idx).statistics
        for rg_idx in range(metadata.num_row_groups)
    ]
    # Statistics are optional in Parquet: if any row group lacks them the
    # bounds are unknown, reported as NaN so later comparisons are False.
    has_bounds = bool(chunk_stats) and all(
        s is not None and s.has_min_max for s in chunk_stats
    )

    info = {
        'tablet_id': tablet_id,
        'num_rows': metadata.num_rows,
        'customer_id_min': min(s.min for s in chunk_stats) if has_bounds else np.nan,
        'customer_id_max': max(s.max for s in chunk_stats) if has_bounds else np.nan,
        'file_size_mb': os.path.getsize(tablet_path) / 1024 / 1024
    }
    metadata_list.append(info)

metadata_df = pd.DataFrame(metadata_list)
print(metadata_df.to_string(index=False))

In [None]:
# Query 3: aggregate with a range predicate that can be checked against
# Parquet min/max statistics.
print("\n=== Query 3: Filter Pushdown (customer_id < 10000) ===")
start = time.perf_counter()

result = con.execute(f"""
    SELECT 
        COUNT(*) as num_orders,
        SUM(total_amount) as total_revenue
    FROM read_parquet('{tablets_dir}/*.parquet')
    WHERE customer_id < 10000
""").fetchdf()

time_filter = time.perf_counter() - start

print(f"Tempo: {time_filter*1000:.1f}ms")
print(f"\nResultado:")
print(result.to_string(index=False))

# For the predicate `customer_id < 10000`, a tablet can be skipped only when
# NO row can match, i.e. when its minimum customer_id is already >= 10000.
# (A tablet whose max is < 10000 matches entirely and must be read.)
# customer_id is drawn uniformly from the same range for every tablet in the
# generation cell, so this count is expected to be 0 for this dataset.
skippable_tablets = metadata_df[metadata_df['customer_id_min'] >= 10000]
print(f"\nüìä Tablets que podem ser pulados: {len(skippable_tablets)}/{len(metadata_df)}")

## 2.5 Parallel Tablet Reading

Demonstrar leitura paralela de múltiplos tablets.

In [None]:
# Restrict DuckDB to a single worker thread for the baseline measurement.
con.execute("SET threads TO 1")

print("=== Teste: Single-threaded ===")
single_thread_sql = f"""
    SELECT region, status, COUNT(*) as count
    FROM read_parquet('{tablets_dir}/*.parquet')
    GROUP BY region, status
"""
start = time.perf_counter()
result_single = con.execute(single_thread_sql).fetchdf()
time_single = time.perf_counter() - start
print(f"Tempo: {time_single*1000:.1f}ms")

In [None]:
# Allow four worker threads and repeat the same GROUP BY query.
con.execute("SET threads TO 4")

print("\n=== Teste: Multi-threaded (4 threads) ===")
multi_thread_sql = f"""
    SELECT region, status, COUNT(*) as count
    FROM read_parquet('{tablets_dir}/*.parquet')
    GROUP BY region, status
"""
start = time.perf_counter()
result_multi = con.execute(multi_thread_sql).fetchdf()
time_multi = time.perf_counter() - start
print(f"Tempo: {time_multi*1000:.1f}ms")

# Ratio of the single-threaded to the four-thread timing; efficiency is that
# ratio divided by the thread count (4).
speedup = time_single / time_multi
print(f"\nüöÄ Parallel Speedup: {speedup:.2f}x")
print(f"Efici√™ncia: {speedup/4*100:.1f}%")

## 2.6 Aggregation Pushdown

Demonstrar agregações executadas próximo aos dados.

In [None]:
print("=== Query 4: Aggregation Pushdown ===")

# Grouped report over completed orders: counts, distinct customers and
# revenue statistics per (region, payment_method), highest revenue first.
aggregation_sql = f"""
    SELECT 
        region,
        payment_method,
        COUNT(*) as num_orders,
        COUNT(DISTINCT customer_id) as unique_customers,
        ROUND(SUM(total_amount), 2) as total_revenue,
        ROUND(AVG(total_amount), 2) as avg_order_value,
        ROUND(MIN(total_amount), 2) as min_order,
        ROUND(MAX(total_amount), 2) as max_order
    FROM read_parquet('{tablets_dir}/*.parquet')
    WHERE status = 'completed'
    GROUP BY region, payment_method
    ORDER BY total_revenue DESC
"""
start = time.perf_counter()
result = con.execute(aggregation_sql).fetchdf()
time_agg = time.perf_counter() - start

print(f"Tempo: {time_agg*1000:.1f}ms")
print(f"\nTop 10 resultados:")
top_rows = result.head(10)
print(top_rows.to_string(index=False))

## 2.7 Visualizar Performance

In [None]:
# Bar chart comparing the wall-clock time (ms) of each measured query.
# Note that the bars time different queries, so only the trend is comparable.
optimizations = ['Full Scan', 'Projection\nPushdown', 'Filter\nPushdown', 'Parallel\n(1 thread)', 'Parallel\n(4 threads)']
times = [
    time_no_pushdown * 1000,
    time_projection * 1000,
    time_filter * 1000,
    time_single * 1000,
    time_multi * 1000
]

fig, ax = plt.subplots(figsize=(14, 6))
colors = ['#ff6b6b', '#51cf66', '#339af0', '#ffd43b', '#f06595']
bars = ax.bar(optimizations, times, color=colors, alpha=0.7)

# Annotate each bar with its value.
for bar, time_val in zip(bars, times):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{time_val:.1f}ms',
            ha='center', va='bottom', fontweight='bold')

ax.set_ylabel('Tempo (ms)', fontsize=12)
ax.set_title('Impacto das Otimiza√ß√µes de Pushdown', fontsize=14, fontweight='bold')
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

print("\nüìä Speedup relativo ao Full Scan:")
# Use a dedicated loop variable: the notebook-level `speedup` holds the
# parallel speedup that the summary cell reads, and must not be overwritten.
for opt, t in zip(optimizations[1:], times[1:]):
    relative_speedup = times[0] / t
    print(f"  {opt.replace(chr(10), ' ')}: {relative_speedup:.2f}x")

## 2.8 Simulação de Redução de Rede

Calcular redução de dados transferidos pela rede.

In [None]:
# Combined on-disk size of all tablets, in MB.
total_bytes = sum(
    os.path.getsize(f'{tablets_dir}/tablet_{i}.parquet')
    for i in range(num_tablets)
)
total_size_mb = total_bytes / 1024 / 1024

# Estimate after projection: 4 of the 10 columns are kept.
projected_size_mb = total_size_mb * 0.4

# Estimate after filtering: roughly 10% of the rows are assumed to survive.
filtered_size_mb = projected_size_mb * 0.1

print("=== Redu√ß√£o de Transfer√™ncia de Rede ===")
print(f"\nDados originais (tablets): {total_size_mb:.1f} MB")
print(f"Ap√≥s Projection Pushdown: {projected_size_mb:.1f} MB ({(1-projected_size_mb/total_size_mb)*100:.0f}% redu√ß√£o)")
print(f"Ap√≥s Filter Pushdown: {filtered_size_mb:.1f} MB ({(1-filtered_size_mb/total_size_mb)*100:.0f}% redu√ß√£o total)")

# Chart the three estimated sizes side by side.
sizes = [total_size_mb, projected_size_mb, filtered_size_mb]
stages = ['Dados\nOriginais', 'Projection\nPushdown', 'Filter\nPushdown']

fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.bar(stages, sizes, color=['#ff6b6b', '#ffd43b', '#51cf66'], alpha=0.7)

# Write the size on top of each bar.
for bar, size in zip(bars, sizes):
    label_x = bar.get_x() + bar.get_width()/2.
    label_y = bar.get_height()
    ax.text(label_x, label_y,
            f'{size:.1f} MB',
            ha='center', va='bottom', fontsize=12, fontweight='bold')

ax.set_ylabel('Dados Transferidos (MB)', fontsize=12)
ax.set_title('Redu√ß√£o de Rede com Pushdowns', fontsize=14, fontweight='bold')
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\nüöÄ Redu√ß√£o total de rede: {(1-filtered_size_mb/total_size_mb)*100:.0f}%")

## 2.9 Conclusão do Capítulo

In [None]:
# Recompute the parallel speedup from its own timings. The notebook-level
# `speedup` name is reassigned by several earlier cells (the last one being
# the chart's "relative to full scan" loop), so reading it here would report
# the wrong ratio in the "Parallel Reading" row.
parallel_speedup = time_single / time_multi

# One row per optimisation: measured benefit plus a short impact note.
summary = pd.DataFrame({
    'Otimiza√ß√£o': [
        'Projection Pushdown',
        'Filter Pushdown',
        'Parallel Reading (4 threads)',
        'Redu√ß√£o de Rede'
    ],
    'Benef√≠cio': [
        f'{time_no_pushdown/time_projection:.1f}x mais r√°pido',
        'Pula tablets desnecess√°rios',
        f'{parallel_speedup:.2f}x speedup',
        f'{(1-filtered_size_mb/total_size_mb)*100:.0f}% menos dados'
    ],
    'Impacto': [
        '60% menos colunas',
        '90% menos linhas',
        f'{parallel_speedup/4*100:.0f}% efici√™ncia',
        '96% economia de banda'
    ]
})

print("\n=== RESUMO DO CAP√çTULO 2 ===")
print(summary.to_string(index=False))

print("\n‚úÖ Principais Aprendizados:")
print("  1. Projection Pushdown: Ler apenas colunas necess√°rias")
print("  2. Filter Pushdown: Usar metadados para pular dados")
print("  3. Parallel Reading: Processar m√∫ltiplos tablets")
print("  4. Redu√ß√£o de Rede: 96% menos transfer√™ncia")

In [None]:
# Cleanup: close the DuckDB connection opened earlier in the notebook.
# The Parquet files written under the tablets directory are left on disk.
con.close()
print("\n‚úÖ Notebook conclu√≠do!")