In [None]:
import gzip, random, string, csv
from pathlib import Path

# Number of bytes in one GiB; generate_csv multiplies size_gb by this to get a byte target.
GiB = 1024 ** 3
# Filler text (a 22-character ASCII chunk repeated 20 times) that make_payload
# tiles and truncates to the requested payload length.
PATTERN = ("loremIPSUM0123456789_-" * 20)

def make_payload(length: int) -> str:
    """Return a filler string of exactly ``length`` characters.

    The text is produced by repeating the module-level ``PATTERN`` often
    enough to cover ``length`` and cutting the result to size. A zero or
    negative ``length`` yields the empty string.
    """
    if length > 0:
        # One extra repetition guarantees the tiled text is at least `length` long.
        repeats = length // len(PATTERN) + 1
        return (PATTERN * repeats)[:length]
    return ""

class _ByteCountingWriter:
    """Write-only proxy around a text file that tracks the encoded bytes written so far."""

    def __init__(self, handle, encoding: str) -> None:
        self._handle = handle
        self._encoding = encoding
        self.bytes_written = 0

    def write(self, text: str) -> int:
        # Count what the text layer will emit once it encodes `text`.
        self.bytes_written += len(text.encode(self._encoding))
        return self._handle.write(text)


def write_csv_until_size(path: Path, target_bytes: int, *, with_header: bool = True,
                         payload_len: int = 128, progress_mb: int = 256, seed: int = 42) -> None:
    """Write a synthetic CSV (``id, category, payload``) until it reaches ``target_bytes``.

    Rows are appended one at a time and the size is checked after each row, so
    at least one data row is always written and the final file may exceed
    ``target_bytes`` by up to one row.

    Args:
        path: Output file; missing parent directories are created and an
            existing file is overwritten.
        target_bytes: Size in bytes at which generation stops.
        with_header: Whether to write the ``id,category,payload`` header row first.
        payload_len: Number of characters in the ``payload`` column of each row.
        progress_mb: A progress line is printed each time at least this many
            MiB have been written since the previous report.
        seed: Seed of the private ``random.Random`` used to pick categories.
    """
    rng = random.Random(seed)
    categories = [f"cat{c}" for c in range(1, 1001)]
    path.parent.mkdir(parents=True, exist_ok=True)

    report_every = progress_mb * 1024 * 1024
    written_last_report = 0
    with path.open("w", newline='', encoding="utf-8") as f:
        # Track the size ourselves instead of calling f.tell() per row: on a text
        # stream tell() forces a flush on every call and is documented as an
        # opaque number rather than a byte offset.
        counter = _ByteCountingWriter(f, "utf-8")
        w = csv.writer(counter)
        if with_header:
            w.writerow(["id","category","payload"])

        # The payload depends only on payload_len, so build it once.
        payload = make_payload(payload_len)

        i = 0
        while True:
            i += 1
            cat = rng.choice(categories)
            w.writerow([i, cat, payload])

            pos = counter.bytes_written
            if pos - written_last_report >= report_every:
                written_last_report = pos
                print(f"[{path.name}] {pos/1024/1024:.1f} MiB escritos...")

            if pos >= target_bytes:
                break

    print(f"Concluído: {path} ({path.stat().st_size/1024/1024:.2f} MiB)")

def shard_naming(base_out: Path, shard_idx: int) -> Path:
    """Build the path of shard ``shard_idx`` next to ``base_out``.

    The shard name is ``<stem>-<5-digit index><ext>`` in the same directory.
    An existing ``.csv`` extension (any letter case) is kept as written; any
    other extension, or none at all, is replaced by ``.csv``.
    """
    ext = base_out.suffix
    root = base_out.stem if ext else base_out.name
    if ext.lower() != ".csv":
        ext = ".csv"
    return base_out.with_name("{}-{:05d}{}".format(root, shard_idx, ext))

def generate_csv(out: Path, size_gb: float, with_header: bool = True,
                     payload_len: int = 128, progress_mb: int = 256,
                     seed: int = 42, n_shards: int = 1) -> None:
    """Generate ``size_gb`` GiB of synthetic CSV data as one file or several shards.

    With ``n_shards <= 1`` a single file is written at ``out``. Otherwise the
    byte budget is divided evenly across ``n_shards`` files named by
    ``shard_naming``; the last shard also receives the division remainder, and
    shard ``i`` is generated with seed ``seed + i``.

    Args:
        out: Output file path, or the base name for the shard files.
        size_gb: Total requested size in GiB.
        with_header: Whether each file starts with the header row.
        payload_len: Characters in each row's payload column.
        progress_mb: Progress reporting interval in MiB.
        seed: Base random seed.
        n_shards: Number of files to produce.
    """
    total_bytes = int(size_gb * GiB)
    common = dict(with_header=with_header, payload_len=payload_len, progress_mb=progress_mb)

    # Single-file case: nothing to split.
    if n_shards <= 1:
        write_csv_until_size(out, total_bytes, seed=seed, **common)
        return

    base_size, leftover = divmod(total_bytes, n_shards)
    print(f"Escrevendo {n_shards} shards, ~{base_size/1024/1024:.1f} MiB cada (+ resto {leftover} bytes).")

    last_idx = n_shards - 1
    for idx in range(n_shards):
        # Only the final shard absorbs the bytes left over by the integer division.
        shard_size = base_size + leftover if idx == last_idx else base_size
        write_csv_until_size(shard_naming(out, idx), shard_size, seed=seed + idx, **common)


In [None]:
# Output CSV path handed to generate_csv. Despite the OUT_DIR name this is a
# file path; with N_SHARDS > 1 it serves as the base name of the shard files.
OUT_DIR = Path("./dados/base_200mb.csv")
# Requested total size in GiB (generate_csv multiplies it by 1024**3).
SIZE_GB = 0.2

# Number of output files; values <= 1 produce a single file at OUT_DIR.
N_SHARDS = 1
# Whether each generated file starts with the header row.
WITH_HEADER = True

# Seed forwarded to generate_csv.
SEED = 42
# Seeds the global `random` module.
# NOTE(review): write_csv_until_size draws from its own random.Random(seed),
# so this call looks redundant here — confirm nothing else relies on it.
random.seed(SEED)

PAYLOAD_LEN = 128 # Number of characters in each row's payload column
PROGRESS_MB = 128 # Print a progress line roughly every PROGRESS_MB MiB written

CONFIRM_LARGE = False # Must be set to True to allow runs with SIZE_GB >= 1


In [None]:
# === Run ===
# Safety guard: refuse to generate 1 GiB or more unless CONFIRM_LARGE is True.
if SIZE_GB >= 1 and not CONFIRM_LARGE:
    raise SystemExit("Ative CONFIRM_LARGE=True antes de gerar arquivos >= 1 GiB.")

# Generate the file(s) using the values from the configuration cell.
generate_csv(OUT_DIR, SIZE_GB, with_header=WITH_HEADER, payload_len=PAYLOAD_LEN,
                 progress_mb=PROGRESS_MB, seed=SEED, n_shards=N_SHARDS)


In [None]:
from datetime import datetime, timedelta
import csv
import random
from pathlib import Path

# Two-letter codes sampled for the customer "state" column
# (presumably the 27 Brazilian federative units — confirm).
STATES = ["AC","AL","AP","AM","BA","CE","DF","ES","GO","MA","MT","MS","MG","PA","PB","PR","PE","PI","RJ","RN","RS","RO","RR","SC","SP","SE","TO"]
# Product category labels "cat1" .. "cat500".
CATEGORIES = [f"cat{c}" for c in range(1, 501)]
# Filler text that make_payload tiles and truncates to build the payload column.
# NOTE: this rebinds the PATTERN defined in the first cell; payloads are unchanged
# because the earlier value is this same string repeated 20 times.
PATTERN = "loremIPSUM0123456789_-"

def make_payload(length: int) -> str:
    """Build a deterministic filler string that is exactly ``length`` characters long.

    ``PATTERN`` is repeated enough times to cover the requested length and the
    result is then truncated. Non-positive lengths produce ``""``.
    """
    if length <= 0:
        return ""
    whole_copies, _ = divmod(length, len(PATTERN))
    # whole_copies + 1 repetitions always reach (or exceed) `length` characters.
    tiled = PATTERN * (whole_copies + 1)
    return tiled[:length]

def write_csv_header(path: Path, header):
    """Create (or truncate) ``path`` so that it contains only ``header`` as a CSV row.

    Missing parent directories are created before the file is opened.
    """
    folder = path.parent
    folder.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8", newline='') as handle:
        writer = csv.writer(handle)
        writer.writerow(header)

def append_row(path: Path, row):
    """Append ``row`` as one CSV record to the end of ``path``.

    The file is opened in append mode for this single write and closed again.
    """
    with path.open("a", encoding="utf-8", newline='') as handle:
        writer = csv.writer(handle)
        writer.writerow(row)

def _open_csv_for_write(path: Path):
    """Open ``path`` for writing UTF-8 text with newline translation disabled, as ``csv.writer`` expects."""
    return path.open("w", newline='', encoding="utf-8")


def generate_bases(out_dir: Path, target_bytes: int, seed: int = 42,
                   n_customers: int = 200_000, n_products: int = 100_000,
                   avg_items_per_order: float = 3.0, payload_len: int = 64) -> None:
    """Generate the same synthetic orders as normalized tables and as one denormalized table.

    Files written into ``out_dir`` (created if missing, existing files overwritten):

    * ``customers.csv``, ``products.csv``, ``orders.csv``, ``order_items.csv``
      — the normalized layout;
    * ``denormalized.csv`` — one wide row per order item.

    Orders are generated until ``denormalized.csv`` reaches ``target_bytes``.
    The size is checked once per order, so the file may overshoot the target
    by up to one order. The total sizes of both layouts are printed at the end.

    All five files are opened once and stay open for the whole run, so no
    file is reopened for individual rows.

    Args:
        out_dir: Directory that receives the five CSV files.
        target_bytes: Size of ``denormalized.csv`` at which generation stops.
        seed: Seed of the private ``random.Random`` driving all random draws.
        n_customers: Customer ids are drawn uniformly from ``1..n_customers``.
        n_products: Product ids are drawn uniformly from ``1..n_products``.
        avg_items_per_order: Mean of the Gaussian (std. dev. 1.0) used to pick
            the number of items per order; the result is floored at 1.
        payload_len: Number of characters in each item's ``payload`` column.
    """
    rng = random.Random(seed)
    out_dir.mkdir(parents=True, exist_ok=True)

    customers_p = out_dir / "customers.csv"
    products_p = out_dir / "products.csv"
    orders_p = out_dir / "orders.csv"
    items_p = out_dir / "order_items.csv"
    denorm_p = out_dir / "denormalized.csv"

    with _open_csv_for_write(customers_p) as fcust, \
            _open_csv_for_write(products_p) as fprod, \
            _open_csv_for_write(orders_p) as ford, \
            _open_csv_for_write(items_p) as fitems, \
            _open_csv_for_write(denorm_p) as fden:
        wcust = csv.writer(fcust)
        wprod = csv.writer(fprod)
        word = csv.writer(ford)
        witems = csv.writer(fitems)
        wden = csv.writer(fden)

        wcust.writerow(["customer_id","name","email","state"])
        wprod.writerow(["product_id","category","price_cents"])
        word.writerow(["order_id","customer_id","order_ts","order_total_cents"])
        witems.writerow(["order_id","product_id","qty","unit_price_cents","line_total_cents","payload"])
        wden.writerow([
            "order_id","order_ts","customer_id","customer_name","customer_email","customer_state",
            "product_id","product_category","qty","unit_price_cents","line_total_cents","payload"
        ])

        # The payload depends only on payload_len, so build it once.
        payload = make_payload(payload_len)

        customers = {}
        products = {}
        order_id = 0
        t0 = datetime(2020, 1, 1)

        # Keep generating orders until the denormalized file reaches the target size.
        while fden.tell() < target_bytes:
            order_id += 1
            cust_id = rng.randint(1, n_customers)
            # The candidate record is built (and the RNG advanced) on every lookup,
            # even for an id seen before; setdefault then keeps the first record.
            # Skipping the draw for known ids would change the data produced for a
            # given seed, so the eager evaluation is kept on purpose.
            cust = customers.setdefault(cust_id, {
                "id": cust_id,
                "name": f"Customer {cust_id}",
                "email": f"c{cust_id}@exemplo.com",
                "state": rng.choice(STATES)
            })
            # Random timestamp within five 365-day years after t0.
            order_ts = (t0 + timedelta(seconds=rng.randint(0, 60*60*24*365*5))).isoformat()

            n_items = max(1, int(rng.gauss(avg_items_per_order, 1.0)))
            order_total = 0

            for _ in range(n_items):
                prod_id = rng.randint(1, n_products)
                # Same eager-draw behaviour as for customers (see above).
                prod = products.setdefault(prod_id, {
                    "id": prod_id,
                    "cat": rng.choice(CATEGORIES),
                    "price": rng.randint(500, 50_000)
                })
                qty = max(1, int(rng.expovariate(1/2)))
                unit_price = prod["price"]
                line_total = unit_price * qty
                order_total += line_total

                wden.writerow([
                    order_id, order_ts, cust["id"], cust["name"], cust["email"], cust["state"],
                    prod["id"], prod["cat"], qty, unit_price, line_total, payload
                ])
                witems.writerow([order_id, prod["id"], qty, unit_price, line_total, payload])

            word.writerow([order_id, cust["id"], order_ts, order_total])

            if order_id % 1000 == 0:
                print(f"[progress] ~{fden.tell()/1024/1024:.1f} MiB escritos (desnormalizada)...")

        # Dimension tables are only known once all orders exist; rows follow
        # first-seen (dict insertion) order.
        for cust_rec in customers.values():
            wcust.writerow([cust_rec["id"], cust_rec["name"], cust_rec["email"], cust_rec["state"]])
        for prod_rec in products.values():
            wprod.writerow([prod_rec["id"], prod_rec["cat"], prod_rec["price"]])

    # Files are closed (and flushed) here, so the on-disk sizes are final.
    norm_total = sum(table.stat().st_size for table in (customers_p, products_p, orders_p, items_p))
    print(f"Concluído a Normalizada e a Desnormalizada: {out_dir}")
    print(f" - desnormalizada: {denorm_p.stat().st_size/1024/1024:.8f} MiB")
    print(f" - normalizada: {norm_total/1024/1024:.8f} MiB")

In [None]:
# === Example run ===
# NOTE: these assignments rebind OUT_DIR, SIZE_GB and CONFIRM_LARGE from the
# earlier configuration cell.
OUT_DIR = Path("dados/base_comparacao_teste")
SIZE_GB = 0.000001  # int(SIZE_GB * 1024**3) == 1073 bytes: a tiny smoke-test size
CONFIRM_LARGE = True

# Safety guard: refuse to generate 1 GiB or more unless CONFIRM_LARGE is True.
if SIZE_GB >= 1 and not CONFIRM_LARGE:
    raise SystemExit("Ative CONFIRM_LARGE=True antes de gerar bases >= 1 GiB.")

# target_bytes applies to the denormalized file (see generate_bases).
generate_bases(
    OUT_DIR,
    target_bytes=int(SIZE_GB * (1024**3)),
    seed=42,
    n_customers=200_000,
    n_products=100_000,
    avg_items_per_order=3.0,
    payload_len=64
)
