# Capítulo 5: Tiered Storage Transparente

Demonstração de armazenamento em camadas (Memory → SSD → S3) com queries transparentes.

## Setup

In [None]:
!pip install duckdb pandas numpy matplotlib seaborn pyarrow -q

In [None]:
import duckdb
import pandas as pd
import numpy as np
import time
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
import os
import shutil

# Notebook-wide plotting defaults: seaborn grid style plus a wide default figure.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 6)})

# Show which DuckDB version this notebook is running against.
print(f"DuckDB: {duckdb.__version__}")

## 5.1 Simular Tiers de Storage

In [None]:
# Local directories that stand in for the three storage tiers.
tiers = {
    'hot': 'tier_hot',      # stands in for memory/SSD (most recent data)
    'warm': 'tier_warm',    # stands in for local SSD (data 7-30 days old)
    'cold': 'tier_cold'     # stands in for S3 (data older than 30 days)
}

# Only the paths are needed to create the folders; exist_ok=True keeps the
# cell re-runnable when the directories are already present.
for path in tiers.values():
    os.makedirs(path, exist_ok=True)

print("✅ Tiers de storage criados:")
for tier, path in tiers.items():
    print(f"  - {tier.upper()}: {path}/")

## 5.2 Gerar Dados Históricos

In [None]:
import pyarrow as pa
import pyarrow.parquet as pq

def generate_data(start_date, days, rows_per_day):
    """Build a synthetic orders table covering consecutive days.

    Random columns are drawn with vectorized NumPy calls (one call per column)
    rather than one scalar call per row.

    Args:
        start_date: ``datetime`` of the first day to generate.
        days: Number of consecutive days, starting at ``start_date``.
        rows_per_day: Number of orders generated for each day.

    Returns:
        A ``pandas.DataFrame`` with ``days * rows_per_day`` rows and the
        columns ``order_id``, ``customer_id``, ``amount``, ``timestamp``,
        ``date``, ``region`` and ``status``. When either count is zero or
        negative the frame is empty but still carries every column.
    """
    columns = ['order_id', 'customer_id', 'amount', 'timestamp',
               'date', 'region', 'status']
    if days <= 0 or rows_per_day <= 0:
        return pd.DataFrame(columns=columns)

    day_starts = [start_date + timedelta(days=offset) for offset in range(days)]
    row_count = days * rows_per_day

    # Position of each row inside its day: 0..rows_per_day-1, repeated per day.
    sequence = np.tile(np.arange(rows_per_day, dtype=np.int64), days)

    # order_id is "<YYYYMMDD>-<zero-padded position within the day>".
    order_ids = [
        f'{day_key}-{i:06d}'
        for day_key in (d.strftime('%Y%m%d') for d in day_starts)
        for i in range(rows_per_day)
    ]
    # Epoch seconds of each day's start; rows within a day are one second apart.
    day_epochs = np.array([int(d.timestamp()) for d in day_starts], dtype=np.int64)
    day_labels = np.array([d.strftime('%Y-%m-%d') for d in day_starts], dtype=object)

    return pd.DataFrame({
        'order_id': order_ids,
        'customer_id': np.random.randint(1, 100000, size=row_count, dtype=np.int64),
        'amount': np.round(np.random.uniform(10, 1000, size=row_count), 2),
        'timestamp': np.repeat(day_epochs, rows_per_day) + sequence,
        'date': np.repeat(day_labels, rows_per_day),
        'region': np.random.choice(['north', 'south', 'east', 'west'], size=row_count),
        'status': np.random.choice(['completed', 'pending'], size=row_count, p=[0.9, 0.1]),
    }, columns=columns)

print("Gerando dados históricos...")

# Take a single reference instant for all three windows. Generation takes a
# while, so separate datetime.now() calls could fall on different calendar
# days and make the tier date ranges overlap.
reference_time = datetime.now()

# HOT: last 7 days (100k rows/day)
hot_data = generate_data(reference_time - timedelta(days=7), 7, 100000)
pq.write_table(pa.Table.from_pandas(hot_data), f"{tiers['hot']}/recent.parquet")
print(f"  ✓ HOT tier: {len(hot_data):,} linhas (últimos 7 dias)")

# WARM: 8-30 days ago (50k rows/day); 23 days starting 30 days back
warm_data = generate_data(reference_time - timedelta(days=30), 23, 50000)
pq.write_table(pa.Table.from_pandas(warm_data), f"{tiers['warm']}/medium.parquet")
print(f"  ✓ WARM tier: {len(warm_data):,} linhas (8-30 dias)")

# COLD: 31-90 days ago (10k rows/day); 60 days starting 90 days back
cold_data = generate_data(reference_time - timedelta(days=90), 60, 10000)
pq.write_table(pa.Table.from_pandas(cold_data), f"{tiers['cold']}/archive.parquet")
print(f"  ✓ COLD tier: {len(cold_data):,} linhas (31-90 dias)")

total_rows = len(hot_data) + len(warm_data) + len(cold_data)
print(f"\n✅ Total: {total_rows:,} linhas em 3 tiers")

## 5.3 Query Cross-Tier Transparente

In [None]:
# Open a DuckDB connection; no database path is given (DuckDB documents this
# as an in-memory database).
con = duckdb.connect()

# A single query spanning every tier: read_parquet receives one glob per tier
# directory, so the SQL never has to name the tier a row came from.
print("=== Query Cross-Tier: Últimos 90 dias ===")

# Derive the file list from `tiers` instead of repeating the directory names.
all_tier_files = ", ".join(f"'{path}/*.parquet'" for path in tiers.values())

start = time.perf_counter()
result = con.execute(f"""
    SELECT
        date,
        COUNT(*) as num_orders,
        ROUND(SUM(amount), 2) as total_revenue,
        ROUND(AVG(amount), 2) as avg_order_value
    FROM read_parquet([{all_tier_files}])
    WHERE status = 'completed'
    GROUP BY date
    ORDER BY date DESC
    LIMIT 10
""").fetchdf()
query_time = time.perf_counter() - start

print(f"Tempo: {query_time*1000:.1f}ms\n")
print(result.to_string(index=False))

## 5.4 Performance por Tier

In [None]:
# Run the same aggregate against each tier separately, recording the elapsed
# time together with the size of that tier's Parquet files.
tier_performance = []

for tier_name, tier_path in tiers.items():
    tier_sql = f"""
        SELECT COUNT(*), SUM(amount)
        FROM read_parquet('{tier_path}/*.parquet')
    """
    start = time.perf_counter()
    result = con.execute(tier_sql).fetchone()
    elapsed = time.perf_counter() - start

    # Total bytes of the .parquet files directly inside the tier directory.
    parquet_bytes = 0
    for file_name in os.listdir(tier_path):
        if file_name.endswith('.parquet'):
            parquet_bytes += os.path.getsize(f"{tier_path}/{file_name}")
    size_mb = parquet_bytes / 1024 / 1024

    # Values are stored pre-formatted as strings for display.
    row_count = result[0]
    tier_performance.append({
        'Tier': tier_name.upper(),
        'Linhas': f"{row_count:,}",
        'Tamanho (MB)': f"{size_mb:.1f}",
        'Query Time (ms)': f"{elapsed*1000:.1f}",
        'Throughput (MB/s)': f"{size_mb/elapsed:.1f}"
    })

perf_df = pd.DataFrame(tier_performance)
print("\n=== Performance por Tier ===")
print(perf_df.to_string(index=False))

In [None]:
# Two bar charts side by side: file size and query time for each tier.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

tiers_list = ['HOT', 'WARM', 'COLD']

# perf_df holds formatted strings, so each value is parsed back to float.
sizes, times = [], []
for tier_label in tiers_list:
    tier_row = perf_df[perf_df['Tier'] == tier_label].iloc[0]
    sizes.append(float(tier_row['Tamanho (MB)']))
    times.append(float(tier_row['Query Time (ms)']))

colors = ['#ff6b6b', '#ffd43b', '#51cf66']

# Left panel: size per tier, with the value written above each bar.
ax1.bar(tiers_list, sizes, color=colors, alpha=0.7)
ax1.set(ylabel='Tamanho (MB)', title='Tamanho de Dados por Tier')
ax1.grid(axis='y', alpha=0.3)
for x_pos, size in enumerate(sizes):
    ax1.text(x_pos, size, f'{size:.1f} MB', ha='center', va='bottom', fontweight='bold')

# Right panel: query time per tier, labelled the same way.
ax2.bar(tiers_list, times, color=colors, alpha=0.7)
ax2.set(ylabel='Query Time (ms)', title='Performance de Query por Tier')
ax2.grid(axis='y', alpha=0.3)
for x_pos, t in enumerate(times):
    ax2.text(x_pos, t, f'{t:.1f}ms', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

## 5.5 Retention Policy Simulation

In [None]:
# Illustrative retention policy per tier. This cell only prints the settings;
# nothing here moves or deletes data.
retention_config = {
    'hot': {'days': 7, 'format': 'Arrow', 'compression': None},
    'warm': {'days': 30, 'format': 'Arrow IPC', 'compression': 'LZ4'},
    'cold': {'days': 365, 'format': 'Parquet', 'compression': 'Snappy'}
}

print("=== Política de Retenção ===")
for tier, config in retention_config.items():
    print(f"\n{tier.upper()}:")
    print(f"  - Retenção: {config['days']} dias")
    print(f"  - Formato: {config['format']}")
    # A compression of None is shown as 'Nenhuma'.
    print(f"  - Compressão: {config['compression'] or 'Nenhuma'}")

## 5.6 Cost Analysis

In [None]:
# Cost analysis with simulated prices: monthly price per GB for the storage
# medium associated with each tier.
costs = {
    'Memory': {'price_gb_month': 10.00, 'tier': 'HOT'},
    'SSD': {'price_gb_month': 0.20, 'tier': 'WARM'},
    'S3': {'price_gb_month': 0.023, 'tier': 'COLD'}
}

# Accumulate the monthly cost of each tier and keep a display row for it.
total_cost = 0.0
cost_breakdown = []

for storage, info in costs.items():
    tier = info['tier']
    # perf_df stores the size as a formatted MB string: parse it, convert to GB.
    size_mb_text = perf_df[perf_df['Tier'] == tier]['Tamanho (MB)'].iloc[0]
    size_gb = float(size_mb_text) / 1024
    monthly_cost = size_gb * info['price_gb_month']
    total_cost += monthly_cost

    # Display-only values; the exact total is kept in total_cost.
    cost_breakdown.append({
        'Storage': storage,
        'Tier': tier,
        'Size (GB)': f"{size_gb:.2f}",
        'Price/GB/Month': f"${info['price_gb_month']:.3f}",
        'Monthly Cost': f"${monthly_cost:.2f}"
    })

cost_df = pd.DataFrame(cost_breakdown)
print("\n=== Análise de Custo Mensal ===")
print(cost_df.to_string(index=False))
print(f"\nCusto Total Mensal: ${total_cost:.2f}")

In [None]:
# Baseline for comparison: the same data volume stored entirely on SSD.
# Sizes are taken from perf_df (MB, one decimal), the same source total_cost
# was computed from. The 'Size (GB)' strings in cost_df carry only two
# decimals, which rounds datasets of a few MB down to 0.00-0.01 GB.
total_size_gb = perf_df['Tamanho (MB)'].astype(float).sum() / 1024
ssd_only_cost = total_size_gb * costs['SSD']['price_gb_month']

# Guard the percentage against a zero baseline (no data on disk).
# NOTE(review): the HOT tier is priced as memory ($10/GB), so the tiered total
# may exceed the SSD-only baseline and this value can be negative — confirm
# the intended baseline.
if ssd_only_cost:
    savings = (ssd_only_cost - total_cost) / ssd_only_cost * 100
else:
    savings = 0.0

print("\n=== Comparação ===")
print(f"Tudo em SSD: ${ssd_only_cost:.2f}/mês")
print(f"Tiered Storage: ${total_cost:.2f}/mês")
print(f"Economia: {savings:.1f}%")

In [None]:
# Cost charts: share of monthly cost per storage medium (pie) and the
# SSD-only versus tiered totals (bars).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Pie chart of cost. Values are recomputed numerically from the price table
# and perf_df; parsing the "$x.xx" display strings in cost_df would collapse
# any tier costing less than half a cent to zero.
labels = list(costs)
size_mb_by_tier = perf_df.set_index('Tier')['Tamanho (MB)'].astype(float)
costs_values = [
    size_mb_by_tier[info['tier']] / 1024 * info['price_gb_month']
    for info in costs.values()
]
colors_pie = ['#ff6b6b', '#ffd43b', '#51cf66']

ax1.pie(costs_values, labels=labels, autopct='%1.1f%%', colors=colors_pie, startangle=90)
ax1.set_title('Distribuição de Custo por Storage')

# Bar chart comparing the two strategies.
strategies = ['SSD Only', 'Tiered Storage']
strategy_costs = [ssd_only_cost, total_cost]
colors_bar = ['#ff6b6b', '#51cf66']

bars = ax2.bar(strategies, strategy_costs, color=colors_bar, alpha=0.7)
ax2.set_ylabel('Custo Mensal ($)')
ax2.set_title('Comparação de Estratégias')
ax2.grid(axis='y', alpha=0.3)

# Write each total centred above its bar.
for bar, cost in zip(bars, strategy_costs):
    ax2.text(bar.get_x() + bar.get_width()/2., bar.get_height(),
             f'${cost:.2f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

print(f"\n💰 Economia de ${ssd_only_cost - total_cost:.2f}/mês com Tiered Storage!")

## 5.7 Time-Travel Query

In [None]:
# Trend query over every tier: daily aggregates, overall totals, and the
# 7-row moving average of revenue ending at the latest date.
print("=== Time-Travel Query ===")
print("Analisando tendências ao longo de 90 dias...\n")

# Derive the file list from `tiers` instead of repeating the directory names.
all_tier_files = ", ".join(f"'{path}/*.parquet'" for path in tiers.values())

start = time.perf_counter()
trends = con.execute(f"""
    WITH daily_stats AS (
        SELECT
            date,
            COUNT(*) as orders,
            SUM(amount) as revenue,
            AVG(amount) as avg_order
        FROM read_parquet([{all_tier_files}])
        WHERE status = 'completed'
        GROUP BY date
    ),
    weekly_avg AS (
        SELECT
            AVG(revenue) OVER (ORDER BY date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) as ma7
        FROM daily_stats
        ORDER BY date DESC
        LIMIT 1
    )
    SELECT
        COUNT(DISTINCT date) as total_days,
        SUM(orders) as total_orders,
        ROUND(SUM(revenue), 2) as total_revenue,
        ROUND(AVG(revenue), 2) as daily_avg,
        ROUND((SELECT ma7 FROM weekly_avg), 2) as ma7_revenue
    FROM daily_stats
""").fetchdf()
query_time = time.perf_counter() - start

print(f"Tempo: {query_time*1000:.1f}ms\n")
print(trends.to_string(index=False))

print("\n✅ Query acessou 90 dias de dados cross-tier transparentemente!")

## 5.8 Resumo Final

In [None]:
# Chapter recap table: one row per tier.
summary = pd.DataFrame({
    'Tier': ['HOT (Memory/SSD)', 'WARM (SSD Local)', 'COLD (S3)'],
    'Retenção': ['7 dias', '8-30 dias', '31-365 dias'],
    'Formato': ['Arrow', 'Arrow IPC + LZ4', 'Parquet + Snappy'],
    'Custo/GB': ['$10.00', '$0.20', '$0.023'],
    'Caso de Uso': [
        # The hot tier serves real-time reads, i.e. low latency; the previous
        # text said "high latency", which contradicted "Real-time".
        'Real-time, baixa latência',
        'Analytics recente',
        'Histórico, compliance'
    ]
})

print("\n=== RESUMO DO CAPÍTULO 5 ===")
print(summary.to_string(index=False))

print("\n✅ Principais Conclusões:")
print("  1. Queries cross-tier são transparentes")
# `savings` is the percentage computed in the cost-comparison cell.
print(f"  2. Economia de custo: {savings:.1f}% vs SSD único")
print("  3. Performance adequada por tier")
print("  4. Retenção automática por política")

In [None]:
# Cleanup: close the DuckDB connection, then delete the simulated tier folders.
con.close()

for tier_path in tiers.values():
    # Skip folders that are already gone (e.g. when this cell is re-run).
    if os.path.exists(tier_path):
        shutil.rmtree(tier_path)

print("\n✅ Notebook concluído!")