# The Engine Wars: Code Examples

**Benchmarking & Selecting the Right Compute Engine for ETL, Analytics, and Multimodal AI**

This notebook contains runnable code examples from the article.

---
## Setup: Generate Synthetic Data

First, we generate synthetic datasets that will be used throughout the examples.

In [1]:
import os
import random
import numpy as np
import pandas as pd

# Create data directory
DATA_DIR = "../.data"
os.makedirs(DATA_DIR, exist_ok=True)

# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)

print("Data directory created:", DATA_DIR)

Data directory created: ../.data


In [2]:
# Generate sales_data.csv
n_sales = 100_000
regions = ["North", "South", "East", "West", "Central"]
products = ["Widget A", "Widget B", "Widget C", "Gadget X", "Gadget Y"]

sales_data = pd.DataFrame(
    {
        "region": np.random.choice(regions, n_sales),
        "product": np.random.choice(products, n_sales),
        "revenue": np.random.uniform(10, 1000, n_sales).round(2),
        "quantity": np.random.randint(1, 50, n_sales),
        "date": pd.date_range("2024-01-01", periods=n_sales, freq="min"),
    }
)
sales_data.to_csv(f"{DATA_DIR}/sales_data.csv", index=False)
print(f"Created sales_data.csv with {n_sales:,} rows")
sales_data.head()

Created sales_data.csv with 100,000 rows


Unnamed: 0,region,product,revenue,quantity,date
0,West,Widget B,50.75,34,2024-01-01 00:00:00
1,Central,Widget A,780.27,42,2024-01-01 00:01:00
2,East,Widget C,492.88,14,2024-01-01 00:02:00
3,Central,Gadget X,923.47,28,2024-01-01 00:03:00
4,Central,Gadget X,803.18,13,2024-01-01 00:04:00


In [3]:
# Generate events.parquet for ETL examples
n_events = 200_000
user_ids = [f"user_{i:05d}" for i in range(1, 5001)]
event_types = ["page_view", "click", "purchase", "signup", "logout"]

events_data = pd.DataFrame(
    {
        "event_id": [f"evt_{i:08d}" for i in range(n_events)],
        "user_id": np.random.choice(user_ids, n_events),
        "event_type": np.random.choice(
            event_types, n_events, p=[0.5, 0.25, 0.1, 0.1, 0.05]
        ),
        "event_date": pd.to_datetime(
            np.random.choice(
                pd.date_range("2024-06-01", "2025-06-01", freq="h"), n_events
            )
        ),
        "purchase_amount": np.where(
            np.random.choice(event_types, n_events) == "purchase",
            np.random.uniform(5, 500, n_events).round(2),
            np.random.uniform(0, 100, n_events).round(2),
        ),
        "session_duration": np.random.exponential(300, n_events).round(0),
    }
)
events_data.to_parquet(f"{DATA_DIR}/events.parquet", index=False)
print(f"Created events.parquet with {n_events:,} rows")
events_data.head()

Created events.parquet with 200,000 rows


Unnamed: 0,event_id,user_id,event_type,event_date,purchase_amount,session_duration
0,evt_00000000,user_04302,page_view,2025-05-26 09:00:00,77.08,593.0
1,evt_00000001,user_04107,page_view,2024-12-17 10:00:00,63.05,565.0
2,evt_00000002,user_01734,click,2025-05-30 18:00:00,71.97,107.0
3,evt_00000003,user_04333,page_view,2024-12-26 00:00:00,109.2,359.0
4,evt_00000004,user_02095,click,2025-04-08 21:00:00,32.9,362.0


In [4]:
# Generate taxi_data.parquet for DataFusion examples
n_trips = 100_000

taxi_data = pd.DataFrame(
    {
        "trip_id": range(n_trips),
        "passenger_count": np.random.choice(
            [1, 2, 3, 4, 5, 6], n_trips, p=[0.7, 0.15, 0.08, 0.04, 0.02, 0.01]
        ),
        "trip_distance": np.abs(np.random.normal(3, 5, n_trips)).round(2),
        "total_amount": np.abs(np.random.normal(15, 20, n_trips)).round(2),
        "payment_type": np.random.choice(
            [1, 2, 3, 4], n_trips, p=[0.6, 0.3, 0.05, 0.05]
        ),
        "pickup_datetime": pd.date_range("2024-01-01", periods=n_trips, freq="30s"),
    }
)
taxi_data.to_parquet(f"{DATA_DIR}/taxi_data.parquet", index=False)
print(f"Created taxi_data.parquet with {n_trips:,} rows")
taxi_data.head()

Created taxi_data.parquet with 100,000 rows


Unnamed: 0,trip_id,passenger_count,trip_distance,total_amount,payment_type,pickup_datetime
0,0,2,0.63,19.25,4,2024-01-01 00:00:00
1,1,1,6.87,3.47,1,2024-01-01 00:00:30
2,2,1,4.23,32.37,1,2024-01-01 00:01:00
3,3,1,7.84,28.1,2,2024-01-01 00:01:30
4,4,1,1.64,13.13,1,2024-01-01 00:02:00


In [5]:
# Generate web_logs.parquet for DataFusion advanced example
n_logs = 150_000
status_codes = [200, 201, 301, 400, 404, 500, 502, 503]

web_logs = pd.DataFrame(
    {
        "log_id": range(n_logs),
        "event_timestamp": pd.date_range("2024-06-01", periods=n_logs, freq="5s"),
        "status_code": np.random.choice(
            status_codes, n_logs, p=[0.7, 0.1, 0.05, 0.05, 0.04, 0.03, 0.02, 0.01]
        ),
        "response_time_ms": np.abs(np.random.exponential(50, n_logs)).round(1),
        "endpoint": np.random.choice(
            ["/api/users", "/api/orders", "/api/products", "/health", "/"], n_logs
        ),
    }
)
web_logs.to_parquet(f"{DATA_DIR}/web_logs.parquet", index=False)
print(f"Created web_logs.parquet with {n_logs:,} rows")
web_logs.head()

Created web_logs.parquet with 150,000 rows


Unnamed: 0,log_id,event_timestamp,status_code,response_time_ms,endpoint
0,0,2024-06-01 00:00:00,200,13.0,/api/users
1,1,2024-06-01 00:00:05,200,33.8,/api/users
2,2,2024-06-01 00:00:10,200,25.9,/api/users
3,3,2024-06-01 00:00:15,200,23.2,/api/users
4,4,2024-06-01 00:00:20,201,2.6,/health


In [6]:
# Generate sample image URLs using picsum.photos
# These are real, accessible images for the multimodal examples
image_urls = pd.DataFrame(
    {
        "url": [f"https://picsum.photos/seed/{i}/400/300" for i in range(1, 21)],
        "category": np.random.choice(["nature", "city", "people", "animals"], 20),
    }
)
image_urls.to_csv(f"{DATA_DIR}/image_urls.csv", index=False)
print(f"Created image_urls.csv with {len(image_urls)} image URLs")
image_urls.head()

Created image_urls.csv with 20 image URLs


Unnamed: 0,url,category
0,https://picsum.photos/seed/1/400/300,people
1,https://picsum.photos/seed/2/400/300,nature
2,https://picsum.photos/seed/3/400/300,city
3,https://picsum.photos/seed/4/400/300,people
4,https://picsum.photos/seed/5/400/300,people


In [7]:
print("\n‚úÖ All synthetic data generated successfully!")
print("\nFiles created:")
for f in os.listdir(DATA_DIR):
    size = os.path.getsize(os.path.join(DATA_DIR, f)) / (1024 * 1024)
    print(f"  - {f}: {size:.2f} MB")


‚úÖ All synthetic data generated successfully!

Files created:
  - sales_data.csv: 4.26 MB
  - events.parquet: 2.66 MB
  - taxi_data.parquet: 1.84 MB
  - web_logs.parquet: 2.33 MB
  - image_urls.csv: 0.00 MB


In [8]:
# ============================================================
# Pre-import all libraries to avoid import overhead in benchmarks
# ============================================================
import time
from io import BytesIO

# Data processing libraries
import pandas as pd
import polars as pl
from datafusion import SessionContext

# Image processing
from PIL import Image
import requests

# Multimodal processing
import daft

# Optional: PySpark (only if installed)
try:
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import udf
    from pyspark.sql.types import BinaryType

    print("‚úÖ All libraries imported successfully (including PySpark)")
except ImportError:
    print("‚úÖ All libraries imported successfully (PySpark not available)")

‚úÖ All libraries imported successfully (including PySpark)


---
## Import All Libraries (Warmup)

To ensure fair performance comparisons, we pre-import all libraries here. This eliminates first-time import overhead and initialization costs (like Ray startup for Daft) from our timing measurements.

---
## Part 2: The Single-Node Revolution

### 2.1 Polars: The Tabular Speedster

#### GroupBy Aggregation ‚Äî Pandas vs. Polars

In [9]:
# ============================================================
# Pandas: Eager, Single-Threaded Execution
# ============================================================

start = time.perf_counter()

# Loads ALL data into memory immediately ‚Äî no optimization possible
df = pd.read_csv(f"{DATA_DIR}/sales_data.csv")

# Runs on a single core; other cores sit idle
result = df.groupby("region")["revenue"].mean()

elapsed = time.perf_counter() - start
print(f"Pandas execution time: {elapsed:.3f}s")
print(result)

Pandas execution time: 0.034s
region
Central    505.965290
East       508.634425
North      507.502784
South      504.473092
West       501.384616
Name: revenue, dtype: float64


In [10]:
# ============================================================
# Polars: Lazy, Multi-Threaded Execution
# ============================================================

start = time.perf_counter()

# scan_csv reads only file headers ‚Äî no data loaded yet
q = (
    pl.scan_csv(f"{DATA_DIR}/sales_data.csv")
    # The query optimizer sees this full chain BEFORE execution
    .group_by("region")
    .agg(pl.col("revenue").mean())
)

# .collect() triggers optimized execution:
# - Predicate pushdown filters data during I/O
# - Column pruning reads only "region" and "revenue"
# - Parallel streaming across all CPU cores
result = q.collect()

elapsed = time.perf_counter() - start
print(f"Polars execution time: {elapsed:.3f}s")
print(result)

Polars execution time: 0.005s
shape: (5, 2)
‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
‚îÇ region  ‚îÜ revenue    ‚îÇ
‚îÇ ---     ‚îÜ ---        ‚îÇ
‚îÇ str     ‚îÜ f64        ‚îÇ
‚ïû‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï™‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï°
‚îÇ Central ‚îÜ 505.96529  ‚îÇ
‚îÇ West    ‚îÜ 501.384616 ‚îÇ
‚îÇ East    ‚îÜ 508.634425 ‚îÇ
‚îÇ South   ‚îÜ 504.473092 ‚îÇ
‚îÇ North   ‚îÜ 507.502784 ‚îÇ
‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¥‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò


#### Chained ETL Pipeline ‚Äî Where Lazy Evaluation Truly Shines

In [11]:
# ============================================================
# Polars: Complex ETL with automatic optimization
# ============================================================

start = time.perf_counter()

result = (
    pl.scan_parquet(f"{DATA_DIR}/events.parquet")
    # The optimizer will "push" this filter into the Parquet reader,
    # skipping entire row groups that don't match
    .filter(pl.col("event_date") >= pl.datetime(2025, 1, 1))
    # Column pruning: only the columns referenced below are read
    .group_by("user_id")
    .agg(
        [
            pl.col("purchase_amount").sum().alias("total_spend"),
            pl.col("event_id").count().alias("event_count"),
            pl.col("session_duration").mean().alias("avg_session"),
        ]
    )
    .filter(pl.col("total_spend") > 100)
    .sort("total_spend", descending=True)
    .collect()  # Entire optimized plan executes here
)

elapsed = time.perf_counter() - start
print(f"Polars ETL execution time: {elapsed:.3f}s")
print(f"Result rows: {len(result):,}")
result.head(10)

Polars ETL execution time: 0.005s
Result rows: 5,000


user_id,total_spend,event_count,avg_session
str,f64,u32,f64
"""user_00468""",4399.11,19,337.631579
"""user_00034""",4217.25,25,289.52
"""user_04979""",4168.54,29,241.482759
"""user_04369""",4105.63,27,333.555556
"""user_02468""",3818.64,27,383.333333
"""user_03436""",3714.97,19,268.736842
"""user_01292""",3698.47,27,363.518519
"""user_04904""",3690.2,26,324.961538
"""user_02466""",3687.03,29,274.206897
"""user_04788""",3636.13,17,324.411765


### 2.2 Apache DataFusion: The Extensible Query Engine

#### SQL on Parquet with DataFusion

In [12]:
# ============================================================
# DataFusion: SQL queries directly on Parquet files
# ============================================================

start = time.perf_counter()

# Create a session context ‚Äî similar to a SparkSession, but lightweight
ctx = SessionContext()

# Register a Parquet file as a virtual table (lazy ‚Äî no data loaded)
ctx.register_parquet("taxi_data", f"{DATA_DIR}/taxi_data.parquet")

# Execute SQL with Rust-native vectorized execution.
# DataFusion applies the same optimizations as Polars:
# predicate pushdown, column pruning, and parallel execution.
df = ctx.sql("""
    SELECT 
        passenger_count, 
        COUNT(*) as trip_count,
        AVG(trip_distance) as avg_distance,
        AVG(total_amount) as avg_fare
    FROM taxi_data 
    WHERE trip_distance > 5.0
      AND payment_type = 1
    GROUP BY passenger_count
    ORDER BY trip_count DESC
""")

# Zero-copy conversion to Pandas or Arrow for downstream use
result = df.to_pandas()

elapsed = time.perf_counter() - start
print(f"DataFusion execution time: {elapsed:.3f}s")
print(result)

DataFusion execution time: 0.005s
   passenger_count  trip_count  avg_distance   avg_fare
0                1       16852      8.198771  20.046604
1                2        3560      8.194528  20.356059
2                3        1896      8.144051  20.416250
3                4         964      8.194647  20.092438
4                5         489      8.126135  20.075726
5                6         236      8.261398  20.132881


#### DataFusion with Complex Analytics

In [13]:
# ============================================================
# DataFusion: Complex analytical queries
# ============================================================
# Note: For larger-than-memory workloads, DataFusion supports disk spilling
# via RuntimeEnvBuilder with .with_disk_manager_os() and .with_fair_spill_pool()

start = time.perf_counter()

ctx = SessionContext()

# Register multiple Parquet files as tables
ctx.register_parquet("web_logs", f"{DATA_DIR}/web_logs.parquet")

# Complex analytical query
# Note: Using 2024-06-01 to match our generated data range
result = ctx.sql("""
    SELECT 
        DATE_TRUNC('hour', event_timestamp) as hour,
        status_code,
        COUNT(*) as request_count,
        AVG(response_time_ms) as avg_latency,
        MAX(response_time_ms) as max_latency
    FROM web_logs
    WHERE event_timestamp >= '2024-06-01'
    GROUP BY 1, 2
    ORDER BY hour, request_count DESC
    LIMIT 20
""")

elapsed = time.perf_counter() - start
print(f"DataFusion analytics execution time: {elapsed:.3f}s")
print(result.to_pandas())

DataFusion analytics execution time: 0.001s
                  hour  status_code  request_count  avg_latency  max_latency
0  2024-06-01 00:00:00          200            506    49.423123        322.7
1  2024-06-01 00:00:00          201             86    53.167442        372.0
2  2024-06-01 00:00:00          301             35    55.348571        173.3
3  2024-06-01 00:00:00          404             31    61.077419        267.9
4  2024-06-01 00:00:00          400             26    35.661538        134.9
5  2024-06-01 00:00:00          500             18    55.505556        209.7
6  2024-06-01 00:00:00          502             13    50.169231        147.6
7  2024-06-01 00:00:00          503              5    95.660000        270.8
8  2024-06-01 01:00:00          200            497    46.583903        395.4
9  2024-06-01 01:00:00          201             70    51.877143        241.8
10 2024-06-01 01:00:00          404             37    48.270270        190.3
11 2024-06-01 01:00:00          

---
## Part 3: The Multimodal Shift

### 3.1 Pandas: The Manual Approach (Slow and Memory-Hungry)

In [14]:
# ============================================================
# Pandas + Pillow: Sequential, single-threaded image processing
# ============================================================

# Load URLs
df = pd.read_csv(f"{DATA_DIR}/image_urls.csv")
# Use only first 5 images for demo (to avoid long wait)
df = df.head(5).copy()


def download_and_resize(url):
    """Each image is downloaded and processed one at a time."""
    try:
        response = requests.get(url, timeout=10)
        img = Image.open(BytesIO(response.content))
        img = img.resize((224, 224))
        return f"Image {img.size}"
    except Exception as e:
        return f"Error: {e}"


start = time.perf_counter()

# This runs sequentially on a single core
df["result"] = df["url"].apply(download_and_resize)

elapsed = time.perf_counter() - start
print(f"Pandas+Pillow execution time: {elapsed:.3f}s")
print(df)

Pandas+Pillow execution time: 0.780s
                                    url category            result
0  https://picsum.photos/seed/1/400/300   people  Image (224, 224)
1  https://picsum.photos/seed/2/400/300   nature  Image (224, 224)
2  https://picsum.photos/seed/3/400/300     city  Image (224, 224)
3  https://picsum.photos/seed/4/400/300   people  Image (224, 224)
4  https://picsum.photos/seed/5/400/300   people  Image (224, 224)


### 3.2 PySpark: The JVM Serialization Tax

**Requirements:**
- PySpark: `uv sync --extra distributed`
- Java 8, 11, or 17 installed (on macOS: `brew install openjdk@17`)
- Set `JAVA_HOME` environment variable

**About This Example:**

This demonstrates PySpark's UDF-based image processing with inherent serialization overhead:

1. Data starts in JVM (Spark executor)
2. Serialized via Py4J to Python process  
3. Python UDF runs (download + Pillow resize)
4. Result serialized back to JVM

This JVM ‚Üî Python round-trip happens for **EVERY row**, making it significantly slower than Daft's Rust-native approach where download, decode, and resize all happen in Rust without any language boundary crossings.

At scale (100K+ images), this serialization tax dominates runtime.

**Note:** If PySpark or Java is not installed, this cell will fail. Skip it if you don't have the dependencies.

In [15]:
# ============================================================
# PySpark: Image processing with serialization overhead
# ============================================================

# Create a local Spark session
spark = (
    SparkSession.builder.appName("ImagePipeline")
    .master("local[*]")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

# Suppress verbose Spark logging
spark.sparkContext.setLogLevel("WARN")

# Load image URLs
df = spark.read.csv(f"{DATA_DIR}/image_urls.csv", header=True)
df = df.limit(5)  # Use only 5 images for demo


# UDFs in PySpark serialize data from JVM ‚Üí Python ‚Üí JVM.
# This "Py4J bridge" is the performance killer: each row
# crosses the language boundary twice.
@udf(returnType=BinaryType())
def process_image(url):
    response = requests.get(url, timeout=10)
    img = Image.open(BytesIO(response.content))
    img = img.resize((224, 224))
    buffer = BytesIO()
    img.save(buffer, format="JPEG")
    return buffer.getvalue()


start = time.perf_counter()

# The JVM manages task scheduling, but every image passes through
# Python's GIL and Py4J serialization
result = df.withColumn("processed_image", process_image(df["url"]))

# Force execution
count = result.count()

elapsed = time.perf_counter() - start
print(f"PySpark execution time: {elapsed:.3f}s")
print(f"Processed {count} images")
print("\nNote: The serialization overhead becomes significant at scale.")
print("Each image crosses the JVM ‚Üî Python boundary twice.")

spark.stop()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/02 21:53:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


PySpark execution time: 0.187s
Processed 5 images

Note: The serialization overhead becomes significant at scale.
Each image crosses the JVM ‚Üî Python boundary twice.


### 3.3 Daft: Rust-Native Multimodal Processing

Compare the same operation using Daft's native operators:

**Note on First-Run Performance**: If you run the Daft cell below multiple times, you'll notice the first run is slower (~0.8s) and subsequent runs are much faster (~0.09s). This is because:
1. **Ray initialization** (Daft's execution engine) - happens once on first import
2. **HTTP connection pooling** - connections are established on first request

By pre-importing Daft in the warmup cell above, we've already paid this initialization cost, so the timing reflects only the actual work being done.

In [20]:
# ============================================================
# Daft: Zero-serialization multimodal pipeline
# ============================================================

start = time.perf_counter()

# Scan URLs from a CSV file ‚Äî lazy, no data loaded yet
df = daft.read_csv(f"{DATA_DIR}/image_urls.csv").limit(5)

# Everything below runs in Rust, not in Python loops:
# - .download() performs parallel HTTP downloads
# - .decode_image() decodes JPEG/PNG bytes into tensors
# - .resize() resizes in Rust's image-rs library
# No JVM. No Py4J. No GIL. No serialization tax.
df = df.with_column("image", df["url"].download().decode_image())
df = df.with_column("thumbnail", df["image"].resize(224, 224))

result = df.collect()

elapsed = time.perf_counter() - start
print(f"Daft execution time: {elapsed:.3f}s")
print(f"Processed {len(result)} images")
result

[00:00] üó°Ô∏è üêü Csv Scan: 6 rows out, 0 B bytes read | üó°Ô∏è üêü Limit 5: 6 rows in, 5 rows out | üó°Ô∏è üêü id-c0d1c1b9-5bd3-4876-9452-8b140f75ce56: 5 rows in, 5 rows out | üó°Ô∏è üêü image: 5 rows in, 5 rows out | üó°Ô∏è üêü Rename & Reorder: 5 rows in, 5 rows out | üó°Ô∏è üêü thumbnail: 5 rows in, 5 rows out

Daft execution time: 0.093s
Processed 5 images


url String,category String,image Image[RGB],thumbnail Image[RGB; 224 x 224]
https://picsum.photos/seed/1/400/300,people,,
https://picsum.photos/seed/2/400/300,nature,,
https://picsum.photos/seed/3/400/300,city,,
https://picsum.photos/seed/4/400/300,people,,
https://picsum.photos/seed/5/400/300,people,,


---
## Part 4: The Convergence on Rust & Arrow

**Note:** This notebook focuses on single-node examples. For distributed computing examples (Ray Data, PySpark, Daft Flotilla), see the scripts in `src/engine_comparison/distributed/`.

### Zero-Copy Interop Between DataFusion and Polars

In [21]:
# ============================================================
# Zero-copy interoperability via Apache Arrow
# ============================================================

start = time.perf_counter()

# Compute a result in DataFusion
ctx = SessionContext()
ctx.register_parquet("events", f"{DATA_DIR}/events.parquet")

# DataFusion executes the heavy lifting in Rust
df_datafusion = ctx.sql("""
    SELECT user_id, COUNT(*) as event_count
    FROM events
    WHERE event_type = 'purchase'
    GROUP BY user_id
    HAVING COUNT(*) > 5
""")

# Convert to Arrow batches ‚Äî zero-copy, no serialization
arrow_batches = df_datafusion.to_arrow_table()

# Polars reads Arrow natively ‚Äî again, zero-copy
df_polars = pl.from_arrow(arrow_batches)

# Continue analysis in Polars with its rich expression API
result = df_polars.with_columns(pl.col("event_count").rank().alias("rank")).sort("rank")

elapsed = time.perf_counter() - start
print(f"DataFusion ‚Üí Arrow ‚Üí Polars execution time: {elapsed:.3f}s")
print(f"Users with >5 purchases: {len(result):,}")
result.head(10)

DataFusion ‚Üí Arrow ‚Üí Polars execution time: 0.011s
Users with >5 purchases: 1,051


user_id,event_count,rank
str,i64,f64
"""user_00094""",6,259.5
"""user_03579""",6,259.5
"""user_04329""",6,259.5
"""user_02403""",6,259.5
"""user_00899""",6,259.5
"""user_03543""",6,259.5
"""user_00120""",6,259.5
"""user_00511""",6,259.5
"""user_00069""",6,259.5
"""user_01341""",6,259.5


---
## Summary: Performance Comparison

Run a quick benchmark comparing the engines on the same task:

In [22]:
# Benchmark: GroupBy aggregation on events data
results = []

# Pandas
start = time.perf_counter()
df_pd = pd.read_parquet(f"{DATA_DIR}/events.parquet")
_ = df_pd.groupby("user_id").agg({"purchase_amount": "sum", "event_id": "count"})
results.append(("Pandas", time.perf_counter() - start))

# Polars (Lazy)
start = time.perf_counter()
_ = (
    pl.scan_parquet(f"{DATA_DIR}/events.parquet")
    .group_by("user_id")
    .agg([pl.col("purchase_amount").sum(), pl.col("event_id").count()])
    .collect()
)
results.append(("Polars (Lazy)", time.perf_counter() - start))

# DataFusion
start = time.perf_counter()
ctx = SessionContext()
ctx.register_parquet("events", f"{DATA_DIR}/events.parquet")
_ = ctx.sql("""
    SELECT user_id, SUM(purchase_amount), COUNT(event_id)
    FROM events GROUP BY user_id
""").to_pandas()
results.append(("DataFusion", time.perf_counter() - start))

# Display results
print("\nüìä Performance Comparison (GroupBy Aggregation on 200K rows)")
print("=" * 50)
for engine, elapsed in sorted(results, key=lambda x: x[1]):
    print(f"{engine:20s}: {elapsed:.3f}s")

fastest = min(results, key=lambda x: x[1])
slowest = max(results, key=lambda x: x[1])
print(f"\nüèÜ {fastest[0]} is {slowest[1] / fastest[1]:.1f}x faster than {slowest[0]}")


üìä Performance Comparison (GroupBy Aggregation on 200K rows)
Polars (Lazy)       : 0.007s
DataFusion          : 0.008s
Pandas              : 0.021s

üèÜ Polars (Lazy) is 2.9x faster than Pandas


---
## Cleanup (Optional)

---
## Distributed Examples

This notebook covers single-node operations. For distributed computing examples that require cluster infrastructure:

- **PySpark ETL**: `src/engine_comparison/distributed/spark_etl.py`
- **Ray Data GPU Inference**: `src/engine_comparison/distributed/ray_inference.py`
- **Daft Distributed Pipeline**: `src/engine_comparison/distributed/daft_pipeline.py`

Install distributed dependencies: `uv sync --extra distributed`

In [23]:
# # Uncomment to remove generated data files
# import shutil
# shutil.rmtree(DATA_DIR)
# print(f"Removed {DATA_DIR} directory")