# Capítulo 4: Consultando Log vs. Tabela — A Dualidade do Fluss

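Demonstração prática de log (append-only) vs. table (upsert com primary key). Os exemplos usam o DuckDB como substituto local para ilustrar os conceitos; nenhum cluster Fluss é necessário para executar o notebook.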

## Setup

In [None]:
# Install the third-party packages this notebook imports; -q keeps pip's output quiet.
!pip install duckdb pandas numpy matplotlib seaborn faker -q

In [None]:
import duckdb
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import time
import matplotlib.pyplot as plt
import seaborn as sns
from faker import Faker

# Faker instance plus plotting defaults: whitegrid theme and 14x6 inch figures.
fake = Faker()
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (14, 6)})

# Show which DuckDB release is running this notebook.
print(f"DuckDB: {duckdb.__version__}")

## 4.1 Modo LOG: Append-Only Stream

In [None]:
# Open (or create) the on-disk DuckDB database file used by this notebook.
con = duckdb.connect('fluss_demo.db')

# Event table standing in for LOG mode. A log table is append-only, so it is
# declared WITHOUT a primary key: rows are only ever inserted, never looked up
# or replaced by key. event_id remains a plain identifier column.
con.execute("""
    CREATE OR REPLACE TABLE user_events (
        event_id VARCHAR,
        user_id INTEGER,
        event_type VARCHAR,
        page VARCHAR,
        timestamp BIGINT,
        session_id VARCHAR
    )
""")

print("✅ Tabela user_events criada (modo LOG)")

In [None]:
# Generate synthetic navigation events and append them to user_events.
import random
import uuid

n_events = 50000
event_types = ['page_view', 'click', 'scroll', 'form_submit']
pages = ['/home', '/products', '/cart', '/checkout', '/profile']

# Events are spaced one second apart, starting at the current epoch second.
base_time = int(datetime.now().timestamp())

# Key order mirrors the user_events column order: the INSERT below uses
# SELECT *, which matches DataFrame columns to table columns by position.
events = [
    {
        'event_id': str(uuid.uuid4()),
        'user_id': random.randint(1, 1000),
        'event_type': random.choice(event_types),
        'page': random.choice(pages),
        'timestamp': base_time + offset,
        'session_id': f'sess-{random.randint(1, 5000)}',
    }
    for offset in range(n_events)
]

# DuckDB resolves events_df by variable name, so the name must stay as is.
events_df = pd.DataFrame(events)
con.execute("INSERT INTO user_events SELECT * FROM events_df")

print(f"✅ {n_events:,} eventos inseridos")

In [None]:
# Query 1: funnel analysis over sessions.
print("=== Análise de Funil (Funnel) ===")

# session_pages: one row per session with 0/1 flags for each page seen.
# funnel_steps:  makes the steps cumulative -- a session only counts for a step
#                if it also hit every earlier step -- so counts can never grow
#                from one step to the next and conversions stay within 0-100%.
#                Visit order inside a session is not enforced.
# totals:        sessions reaching each step.
# Conversions are computed in DOUBLE (FLOAT is single precision and can print
# with rounding artifacts) and NULLIF turns an empty previous step into NULL
# instead of dividing by zero.
funnel = con.execute("""
    WITH session_pages AS (
        SELECT
            session_id,
            MAX(CASE WHEN page = '/home' THEN 1 ELSE 0 END) AS saw_home,
            MAX(CASE WHEN page = '/products' THEN 1 ELSE 0 END) AS saw_products,
            MAX(CASE WHEN page = '/cart' THEN 1 ELSE 0 END) AS saw_cart,
            MAX(CASE WHEN page = '/checkout' THEN 1 ELSE 0 END) AS saw_checkout
        FROM user_events
        GROUP BY session_id
    ),
    funnel_steps AS (
        SELECT
            saw_home AS visited_home,
            saw_home * saw_products AS visited_products,
            saw_home * saw_products * saw_cart AS visited_cart,
            saw_home * saw_products * saw_cart * saw_checkout AS visited_checkout
        FROM session_pages
    ),
    totals AS (
        SELECT
            SUM(visited_home) AS step1_home,
            SUM(visited_products) AS step2_products,
            SUM(visited_cart) AS step3_cart,
            SUM(visited_checkout) AS step4_checkout
        FROM funnel_steps
    )
    SELECT
        step1_home,
        step2_products,
        step3_cart,
        step4_checkout,
        ROUND(step2_products::DOUBLE / NULLIF(step1_home, 0) * 100, 1) AS conversion_1_2,
        ROUND(step3_cart::DOUBLE / NULLIF(step2_products, 0) * 100, 1) AS conversion_2_3,
        ROUND(step4_checkout::DOUBLE / NULLIF(step3_cart, 0) * 100, 1) AS conversion_3_4
    FROM totals
""").fetchdf()

print(funnel.to_string(index=False))

In [None]:
# Plot the funnel as horizontal bars, one bar per step.
steps = ['Home', 'Products', 'Cart', 'Checkout']
step_columns = ('step1_home', 'step2_products', 'step3_cart', 'step4_checkout')
# The funnel frame has a single row; take each step count from it.
values = [int(funnel[column].iloc[0]) for column in step_columns]

fig, ax = plt.subplots(figsize=(12, 6))
colors = ['#4CAF50', '#2196F3', '#FF9800', '#F44336']
bars = ax.barh(steps, values, color=colors, alpha=0.7)

# Label each bar with its count and its share of the first step. When the
# first step is empty the share is reported as 0% rather than raising
# ZeroDivisionError.
first_step = values[0]
for bar, val in zip(bars, values):
    pct = val / first_step * 100 if first_step else 0.0
    ax.text(val, bar.get_y() + bar.get_height()/2,
            f' {val:,} ({pct:.1f}%)',
            va='center', fontweight='bold', fontsize=11)

ax.set_xlabel('Número de Sessões', fontsize=12)
ax.set_title('Análise de Funil de Conversão', fontsize=14, fontweight='bold')
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

## 4.2 Modo TABLE: Upsert com Primary Key

In [None]:
# Inventory table for TABLE mode: product_id is the primary key, so each
# product has exactly one row.
inventory_ddl = """
    CREATE OR REPLACE TABLE product_inventory (
        product_id INTEGER PRIMARY KEY,
        product_name VARCHAR,
        quantity INTEGER,
        price DECIMAL(10,2),
        last_updated BIGINT
    )
"""
con.execute(inventory_ddl)

print("✅ Tabela product_inventory criada (modo TABLE)")

In [None]:
# Initial inventory state: products 1..100 with random stock and price.
# Key order mirrors the product_inventory column order because the INSERT
# below matches columns by position.
products = [
    {
        'product_id': pid,
        'product_name': f'Product {pid}',
        'quantity': random.randint(0, 1000),
        'price': round(random.uniform(10, 500), 2),
        'last_updated': int(datetime.now().timestamp()),
    }
    for pid in range(1, 101)
]

# DuckDB resolves products_df by variable name in the INSERT statement.
products_df = pd.DataFrame(products)
con.execute("INSERT INTO product_inventory SELECT * FROM products_df")

print(f"✅ {len(products)} produtos inseridos")
print("\nPrimeiros 5 produtos:")
first_rows = con.execute("SELECT * FROM product_inventory LIMIT 5").fetchdf()
print(first_rows.to_string(index=False))

In [None]:
# Simulate sales by updating existing inventory rows in place, keyed by PK.
print("\n=== Simulando vendas (UPSERT) ===")

# 20 sale events as (product_id, qty_sold). Product ids are drawn with
# replacement, so the same product may be sold more than once.
updates = [(random.randint(1, 100), random.randint(1, 50)) for _ in range(20)]

# Values are bound as parameters instead of being formatted into the SQL
# text. GREATEST clamps the result at zero so that a sale larger than the
# remaining stock cannot leave a negative quantity.
sale_sql = """
    UPDATE product_inventory
    SET quantity = GREATEST(quantity - ?, 0),
        last_updated = ?
    WHERE product_id = ?
"""
for product_id, qty_sold in updates:
    con.execute(sale_sql, [qty_sold, int(datetime.now().timestamp()), product_id])

print(f"✅ {len(updates)} atualizações aplicadas")

# List the ten products with the lowest stock among those below 50 units.
low_stock = con.execute("""
    SELECT product_id, product_name, quantity, price
    FROM product_inventory
    WHERE quantity < 50
    ORDER BY quantity ASC
    LIMIT 10
""").fetchdf()

print("\nProdutos com estoque baixo (<50):")
print(low_stock.to_string(index=False))

## 4.3 Point Query: Lookup por Primary Key

In [None]:
# Benchmark: point lookups by primary key.
print("=== Point Query Benchmark ===")

# Number of lookups; every figure below is derived from it, so changing it
# keeps the reported latency and throughput consistent.
n_queries = 1000
product_ids = [random.randint(1, 100) for _ in range(n_queries)]

# The product id is bound as a parameter rather than formatted into the SQL.
point_query = "SELECT quantity, price FROM product_inventory WHERE product_id = ?"

start = time.perf_counter()
for pid in product_ids:
    con.execute(point_query, [pid]).fetchone()
time_point = time.perf_counter() - start  # total wall time in seconds

# seconds per query, converted to milliseconds
latency_per_query = time_point / n_queries * 1000

print(f"{n_queries} point queries: {time_point*1000:.1f}ms")
print(f"Latência média: {latency_per_query:.2f}ms por query")
print(f"Throughput: {n_queries/time_point:.0f} queries/segundo")

## 4.4 Temporal Join: ASOF JOIN

In [None]:
# Historical price table: one row per (product, price change).
price_history_ddl = """
    CREATE OR REPLACE TABLE price_history (
        product_id INTEGER,
        price DECIMAL(10,2),
        timestamp BIGINT
    )
"""
con.execute(price_history_ddl)

# The history starts 30 days before now and has one price per product per day.
base_time = int((datetime.now() - timedelta(days=30)).timestamp())
price_changes = []

for product_id in range(1, 21):  # 20 products
    base_price = random.uniform(100, 500)
    # Each daily price is the product's base price varied by up to +/-10%.
    price_changes.extend(
        {
            'product_id': product_id,
            'price': round(base_price * (1 + random.uniform(-0.1, 0.1)), 2),
            'timestamp': base_time + (day * 86400),
        }
        for day in range(30)
    )

# DuckDB resolves price_df by variable name in the INSERT statement.
price_df = pd.DataFrame(price_changes)
con.execute("INSERT INTO price_history SELECT * FROM price_df")

print(f"✅ {len(price_changes)} registros de preço inseridos")

In [None]:
# Purchases table: each row is one purchase of a product at a point in time.
purchases_ddl = """
    CREATE OR REPLACE TABLE purchases (
        purchase_id INTEGER,
        product_id INTEGER,
        quantity INTEGER,
        timestamp BIGINT
    )
"""
con.execute(purchases_ddl)

# 1000 purchases of products 1..20, at random offsets of up to 30 days
# (in seconds) after base_time.
purchases = [
    {
        'purchase_id': purchase_no,
        'product_id': random.randint(1, 20),
        'quantity': random.randint(1, 10),
        'timestamp': base_time + random.randint(0, 30 * 86400),
    }
    for purchase_no in range(1000)
]

# DuckDB resolves purchases_df by variable name in the INSERT statement.
purchases_df = pd.DataFrame(purchases)
con.execute("INSERT INTO purchases SELECT * FROM purchases_df")

print(f"✅ {len(purchases)} compras inseridas")

In [None]:
# ASOF JOIN: find the price in effect at the moment of each purchase.
print("\n=== ASOF JOIN: Preço no momento da compra ===")

# For every purchase, the ASOF join selects the price_history row of the same
# product with the latest timestamp that is not after the purchase timestamp.
# LEFT keeps purchases that have no such price row.
priced_purchases_sql = """
    SELECT 
        p.purchase_id,
        p.product_id,
        p.quantity,
        p.timestamp as purchase_time,
        ph.price,
        ph.timestamp as price_time,
        ROUND(p.quantity * ph.price, 2) as total_value
    FROM purchases p
    ASOF LEFT JOIN price_history ph
        ON p.product_id = ph.product_id
        AND p.timestamp >= ph.timestamp
    ORDER BY p.purchase_id
    LIMIT 10
"""
result = con.execute(priced_purchases_sql).fetchdf()

print(result.to_string(index=False))

# Same join without the LIMIT, aggregated into a purchase count and revenue.
revenue_sql = """
    SELECT 
        COUNT(*) as num_purchases,
        ROUND(SUM(p.quantity * ph.price), 2) as total_revenue
    FROM purchases p
    ASOF LEFT JOIN price_history ph
        ON p.product_id = ph.product_id
        AND p.timestamp >= ph.timestamp
"""
total_revenue = con.execute(revenue_sql).fetchdf()

revenue_value = total_revenue['total_revenue'].iloc[0]
print(f"\nTotal Revenue: ${revenue_value:,.2f}")

## 4.5 Changelog Query

In [None]:
# Changelog table: one row per recorded change to a product's quantity.
changelog_ddl = """
    CREATE OR REPLACE TABLE inventory_changelog (
        change_id INTEGER PRIMARY KEY,
        product_id INTEGER,
        operation VARCHAR,  -- INSERT, UPDATE, DELETE
        old_quantity INTEGER,
        new_quantity INTEGER,
        timestamp BIGINT
    )
"""
con.execute(changelog_ddl)

# Simulated changelog: products 1..50, five consecutive UPDATE entries each.
changes = []
change_id = 0
current_time = int(datetime.now().timestamp())

for product_id in range(1, 51):
    qty_before = random.randint(100, 500)

    for step in range(5):
        # Random adjustment in [-100, 100]; the quantity is floored at zero.
        qty_after = max(0, qty_before + random.randint(-100, 100))

        changes.append({
            'change_id': change_id,
            'product_id': product_id,
            'operation': 'UPDATE',
            'old_quantity': qty_before,
            'new_quantity': qty_after,
            'timestamp': current_time + (step * 60),  # entries one minute apart
        })

        # The next entry starts from the quantity this one produced.
        change_id += 1
        qty_before = qty_after

# DuckDB resolves changelog_df by variable name in the INSERT statement.
changelog_df = pd.DataFrame(changes)
con.execute("INSERT INTO inventory_changelog SELECT * FROM changelog_df")

print(f"✅ {len(changes)} mudanças no changelog")

In [None]:
# Analyse the net quantity change per product.
print("\n=== Análise de Changelog ===")

# initial_qty is the old_quantity of the product's FIRST change and final_qty
# is the new_quantity of its LAST change. MIN(old_quantity) / MAX(new_quantity)
# would instead return the smallest and largest values seen, which are not the
# first and last ones. Changes are ordered by change_id, the table's primary
# key, which the generator assigns in increasing order as changes are recorded.
# product_id breaks ties so the top-10 ordering is deterministic.
analysis = con.execute("""
    WITH per_product AS (
        SELECT
            product_id,
            COUNT(*) AS num_changes,
            ARG_MIN(old_quantity, change_id) AS initial_qty,
            ARG_MAX(new_quantity, change_id) AS final_qty
        FROM inventory_changelog
        GROUP BY product_id
    )
    SELECT
        product_id,
        num_changes,
        initial_qty,
        final_qty,
        final_qty - initial_qty AS net_change
    FROM per_product
    ORDER BY ABS(final_qty - initial_qty) DESC, product_id
    LIMIT 10
""").fetchdf()

print(analysis.to_string(index=False))

## 4.6 Resumo Final

In [None]:
# Side-by-side comparison of the two modes, one record per mode. The latency
# cell for TABLE mode reuses the value measured by the point-query benchmark.
summary = pd.DataFrame([
    {
        'Modo': 'LOG (Append-Only)',
        'Caso de Uso': 'Eventos, Logs, Auditoria',
        'Operações': 'INSERT only',
        'Queries': 'Funnel, Time-series, Agregações',
        'Latência': 'Scan-based',
    },
    {
        'Modo': 'TABLE (Upsert PK)',
        'Caso de Uso': 'Estado atual, Inventário, Perfis',
        'Operações': 'INSERT, UPDATE, DELETE',
        'Queries': 'Point query (1-5ms), Joins',
        'Latência': f'{latency_per_query:.2f}ms (point query)',
    },
])

print("\n=== RESUMO DO CAPÍTULO 4 ===")
print(summary.to_string(index=False))

# Closing takeaways, printed one per line.
print("\n✅ Principais Conclusões:")
for conclusion in (
    "  1. LOG: Imutável, ideal para eventos e análises temporais",
    "  2. TABLE: Mutável com PK, ideal para estado atual",
    "  3. ASOF JOIN: Query temporal com consistência",
    "  4. Changelog: Rastreamento de mudanças em tempo real",
):
    print(conclusion)

In [None]:
# Cleanup: close the connection, then delete the database file it used.
import os

con.close()

db_file = 'fluss_demo.db'
if os.path.exists(db_file):
    os.remove(db_file)

print("\n✅ Notebook concluído!")