In [1]:
!pip install -q pyiceberg faker 

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "faker",
#     "pyiceberg",
#     "pyarrow",
#     "pandas",
# ]
# ///

import random
import time
from datetime import datetime, timezone

import faker
import pandas as pd
import pyarrow as pa
from pyiceberg.catalog.rest import RestCatalog

# -------------------
# CONFIG
# -------------------
# Volume and pacing of the synthetic transaction stream.
TOTAL_TRANSACTIONS = 1_000  # total number of transactions to generate
BATCH_SIZE = 10             # buffered records per append to the table
MIN_DELAY = 0.1             # lower bound (seconds) of the pause after each transaction
MAX_DELAY = 1.0             # upper bound (seconds) of the pause after each transaction

# Catalog endpoint, warehouse and identifiers of the target table.
CATALOG_URL = "http://lakekeeper:8181/catalog"
WAREHOUSE = "sepahram"
NAMESPACE = ("banking",)
TABLE_NAME = NAMESPACE + ("source_transactions",)  # (namespace, table) identifier

# Shared Faker instance used to synthesise transaction field values.
fake = faker.Faker()

# -------------------
# Setup Catalog/Table
# -------------------
catalog_properties = {
    "warehouse": WAREHOUSE,
    "uri": CATALOG_URL,
    "token": "dummy",
}
catalog = RestCatalog(name="banking_catalog", **catalog_properties)

# Create the namespace only when the catalog does not list it yet.
existing_namespaces = catalog.list_namespaces()
if NAMESPACE not in existing_namespaces:
    catalog.create_namespace(NAMESPACE)

# Every run starts from a fresh table: drop the one left by a previous run.
existing_tables = catalog.list_tables(namespace=NAMESPACE)
if TABLE_NAME in existing_tables:
    catalog.drop_table(TABLE_NAME)

# Column name -> Arrow type for each field of a transaction record.
sample_tx = {
    "transactionId": pa.string(),
    "userId": pa.string(),
    "timestamp": pa.timestamp("us", tz="UTC"),
    "amount": pa.float64(),
    "currency": pa.string(),
    "city": pa.string(),
    "country": pa.string(),
    "merchantName": pa.string(),
    "paymentMethod": pa.string(),
    "ipAddress": pa.string(),
    "voucherCode": pa.string(),
    "affiliateId": pa.string(),
}
schema = pa.schema(
    [pa.field(column, arrow_type) for column, arrow_type in sample_tx.items()]
)

# Create the (now guaranteed absent) table with the Arrow schema above.
table = catalog.create_table(TABLE_NAME, schema=schema)

In [5]:
# -------------------
# Transaction Generator
# -------------------
def generate_transaction():
    """Build one synthetic transaction record.

    Returns:
        dict: One value per transaction field. The timestamp is the current
        UTC time, the amount is a random value between 10 and 1000 rounded
        to two decimals, and the voucher code is drawn from a three-item
        list in which two entries are the empty string.
    """
    profile = fake.simple_profile()
    record = dict(
        transactionId=fake.uuid4(),
        userId=profile["username"],
        timestamp=datetime.now(timezone.utc),
        amount=round(random.uniform(10, 1000), 2),
        currency=random.choice(["USD", "GBP"]),
        city=fake.city(),
        country=fake.country(),
        merchantName=fake.company(),
        paymentMethod=random.choice(["credit_card", "debit_card", "online_transfer"]),
        ipAddress=fake.ipv4(),
        voucherCode=random.choice(["", "DISCOUNT10", ""]),
        affiliateId=fake.uuid4(),
    )
    return record


def write_batch(batch):
    """Append a batch of transactions to Iceberg.

    Args:
        batch: List of transaction dicts (one per row), e.g. as produced by
            ``generate_transaction``. An empty batch is skipped: nothing is
            written and nothing is printed.

    Returns:
        None
    """
    if not batch:
        # An empty DataFrame carries none of the schema's columns, so the
        # Arrow conversion below would fail; there is nothing to append anyway.
        return
    df = pd.DataFrame(batch)
    # Convert against the module-level `schema` so column types match the table.
    pa_df = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
    table.append(pa_df)
    print(f"✅ Written batch of {len(batch)} records to Iceberg")

In [6]:
# -------------------
# Main Loop
# -------------------
if __name__ == "__main__":
    # Records generated since the last write to the table.
    buffer = []

    for seq in range(1, TOTAL_TRANSACTIONS + 1):
        txn = generate_transaction()
        buffer.append(txn)
        print(f"Generated Record #{seq:>3} → {txn['amount']} {txn['currency']} at {txn['merchantName']}")

        # Write once a full batch has accumulated, then start a new one.
        batch_is_full = len(buffer) >= BATCH_SIZE
        if batch_is_full:
            write_batch(buffer)
            del buffer[:]

        # Random pause after every transaction.
        pause_seconds = random.uniform(MIN_DELAY, MAX_DELAY)
        time.sleep(pause_seconds)

    # Write whatever is left when the total is not a multiple of BATCH_SIZE.
    if len(buffer) > 0:
        write_batch(buffer)

    print(f"🎉 Done! Inserted {TOTAL_TRANSACTIONS} transactions into Iceberg.")

Generated Record #  1 → 39.92 GBP at Moss, Williamson and Wise
Generated Record #  2 → 286.63 GBP at Woodard Group
Generated Record #  3 → 500.02 USD at Rodriguez, Ball and King
Generated Record #  4 → 62.87 USD at Martinez-Gibson
Generated Record #  5 → 828.68 GBP at Nicholson, Chan and Ballard
Generated Record #  6 → 754.13 GBP at Chan-Rodriguez
Generated Record #  7 → 332.61 USD at Smith, Brown and Thompson
Generated Record #  8 → 704.13 GBP at Mcgee Ltd
Generated Record #  9 → 565.0 GBP at Johns, Reyes and Knight
Generated Record # 10 → 92.06 GBP at Spencer LLC
✅ Written batch of 10 records to Iceberg
Generated Record # 11 → 682.56 GBP at Miller-Benson
Generated Record # 12 → 933.96 GBP at Mckay, Griffin and Russo
Generated Record # 13 → 537.89 GBP at Vance, Lopez and Branch
Generated Record # 14 → 870.49 GBP at Martinez PLC
Generated Record # 15 → 648.59 GBP at Mueller, Jacobson and Brock
Generated Record # 16 → 685.76 USD at Benson-Potter
Generated Record # 17 → 873.38 GBP at Tuc