# Python DataFrame Query Benchmarks 

Follow-along notebook for the Ibis article that compares and contrasts the available dataframe processing libraries for Python.

**Article Link:** https://ibis-project.org/posts/1tbc/

## Generating the Data

In [None]:
# generate TPC-H data with scale factor of 1
bench gen-data -s 1

## Benchmarking

In [None]:
# run a benchmark
bench run -s 1 ibis-duckdb ibis-duckdb-sql ibis-datafusion ibis-datafusion-sql ibis-polars polars-lazy

In [10]:
import os 
import glob
import pandas as pd 
import polars as pl 
import plotly.express as px 
import ibis

# Notebook-wide display configuration.
# Use Plotly's "plotly_dark" template as the default for every figure.
px.defaults.template = "plotly_dark"
# Ibis interactive mode: expressions are executed when displayed, so a cell that
# ends in e.g. `t.head(3)` shows rows instead of an unevaluated expression.
ibis.options.interactive = True

In [11]:
# set data directory
data_directory = "../data/tpch_data/parquet/sf=1/n=1"

# Each table is a sub-directory of parquet files. Sort the matches so the table
# order is stable across runs (glob makes no ordering guarantee), and keep only
# directories so a stray file cannot break the per-table glob below.
tables = sorted(
    path for path in glob.glob(f"{data_directory}/*") if os.path.isdir(path)
)

# Fail early with an actionable message instead of reporting "Total Rows: 0"
# and letting a later cell fail on the missing data.
if not tables:
    raise FileNotFoundError(
        f"No table directories found under {data_directory!r}; "
        "generate the data first (see 'Generating the Data')."
    )

total_rows = 0

# for each table, read its parquet files and add its row count to the total
for table in tables:
    t = ibis.read_parquet(f"{table}/*.parquet")
    total_rows += t.count().to_pyarrow().as_py()

print(f"Total Rows: {total_rows:,}")

Total Rows: 8,661,245


In [15]:
def get_dir_size(path):
    """Return the combined size, in bytes, of all files found under ``path``.

    The tree rooted at ``path`` is searched recursively. Only entries for which
    ``Path.is_file()`` is true contribute; directories add nothing to the total.
    """
    from pathlib import Path

    total_bytes = 0
    for entry in Path(path).rglob("*"):
        if entry.is_file():
            total_bytes += entry.stat().st_size
    return total_bytes

# Label each table by its directory name and measure its on-disk footprint
names = [os.path.basename(table) for table in tables]
sizes = [get_dir_size(table) for table in tables]

# Small in-memory table of sizes, largest first.
# `size_gb_mem` scales the on-disk size by 11/5 as a rough in-memory estimate
# (NOTE(review): the 11/5 factor is not derived in this notebook — confirm its source).
tmp = ibis.memtable({"name": names, "size": sizes})
tmp = tmp.mutate(size_gb=tmp["size"] / (1024**3))
tmp = tmp.mutate(size_gb_mem=tmp["size_gb"] * 11 / 5).order_by(ibis.desc("size_gb"))

chart = px.bar(
    tmp,
    x="name",
    y="size_gb",
    title="Table Sizes in TPC-H Data",
    hover_data=["size_gb_mem"],
    labels=dict(
        name="table name",
        size_gb="Size (GB on-disk in compressed Parquet files)",
        size_gb_mem="Size (approx. GB in memory)",
    ),
)

print(
    f"Total Size: {tmp['size_gb'].sum().to_pyarrow().as_py():,.2f} GBs (compressed Parquet files)"
)

# Displaying the figure inline requires `nbformat>=4.2.0` in the kernel environment
chart

Total Size: 0.37 GBs (compressed Parquet files)


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [16]:
# Point `data` at the parquet files of the table used in the comparisons below
table_name = "lineitem"
data = f"{data_directory}/{table_name}/*.parquet"

# Report the table's dimensions
t = ibis.read_parquet(data)
row_count = t.count().to_pyarrow().as_py()
column_count = len(t.columns)
print(f"Total Rows: {row_count:,} | Columns: {column_count}")

Total Rows: 6,001,215 | Columns: 18


### Display Comparison 

Which dataframe libraries let you preview the data, and how long does the load take?
- Ibis and Polars with lazy execution and streaming provide optimal performance
- Pandas and eager Polars force us to load the data into memory before previewing it, and therefore struggle a bit

In [22]:
# Ibis: re-read the lineitem parquet files and preview the first 3 rows
t = ibis.read_parquet(data)
t.head(3)

In [23]:
# pandas: every matching parquet file is read and concatenated before the preview.
# Sort the glob matches so the concatenated row order is the same on every run
# (glob makes no guarantee about the order in which paths are returned).
df = pd.concat(
    [pd.read_parquet(f) for f in sorted(glob.glob(data))], ignore_index=True
)
df.head(3)

Unnamed: 0,l_orderkey,l_partkey,l_suppkey,l_linenumber,l_quantity,l_extendedprice,l_discount,l_tax,l_returnflag,l_linestatus,l_shipdate,l_commitdate,l_receiptdate,l_shipinstruct,l_shipmode,l_comment
0,1,155190,7706,1,17.0,21168.23,0.04,0.02,N,O,1996-03-13,1996-02-12,1996-03-22,DELIVER IN PERSON,TRUCK,to beans x-ray carefull
1,1,67310,7311,2,36.0,45983.16,0.09,0.06,N,O,1996-04-12,1996-02-28,1996-04-20,TAKE BACK RETURN,MAIL,according to the final foxes. qui
2,1,63700,3701,3,8.0,13309.6,0.1,0.02,N,O,1996-01-29,1996-03-05,1996-01-31,TAKE BACK RETURN,REG AIR,ourts cajole above the furiou


In [24]:
# Polars eager load: `pl.read_parquet` returns an in-memory DataFrame,
# and only then are the first 3 rows selected
df = pl.read_parquet(data)
df.head(3)

l_orderkey,l_partkey,l_suppkey,l_linenumber,l_quantity,l_extendedprice,l_discount,l_tax,l_returnflag,l_linestatus,l_shipdate,l_commitdate,l_receiptdate,l_shipinstruct,l_shipmode,l_comment
i64,i64,i64,i64,"decimal[15,2]","decimal[15,2]","decimal[15,2]","decimal[15,2]",str,str,date,date,date,str,str,str
1,155190,7706,1,17.0,21168.23,0.04,0.02,"""N""","""O""",1996-03-13,1996-02-12,1996-03-22,"""DELIVER IN PERSON""","""TRUCK""","""to beans x-ray carefull"""
1,67310,7311,2,36.0,45983.16,0.09,0.06,"""N""","""O""",1996-04-12,1996-02-28,1996-04-20,"""TAKE BACK RETURN""","""MAIL""",""" according to the final foxes.…"
1,63700,3701,3,8.0,13309.6,0.1,0.02,"""N""","""O""",1996-01-29,1996-03-05,1996-01-31,"""TAKE BACK RETURN""","""REG AIR""","""ourts cajole above the furiou"""


In [25]:
# Polars lazy load: `pl.scan_parquet` builds a query over the files;
# the 3-row preview is only computed when `.collect()` is called
df = pl.scan_parquet(data)
df.head(3).collect()

l_orderkey,l_partkey,l_suppkey,l_linenumber,l_quantity,l_extendedprice,l_discount,l_tax,l_returnflag,l_linestatus,l_shipdate,l_commitdate,l_receiptdate,l_shipinstruct,l_shipmode,l_comment
i64,i64,i64,i64,"decimal[15,2]","decimal[15,2]","decimal[15,2]","decimal[15,2]",str,str,date,date,date,str,str,str
1,155190,7706,1,17.0,21168.23,0.04,0.02,"""N""","""O""",1996-03-13,1996-02-12,1996-03-22,"""DELIVER IN PERSON""","""TRUCK""","""to beans x-ray carefull"""
1,67310,7311,2,36.0,45983.16,0.09,0.06,"""N""","""O""",1996-04-12,1996-02-28,1996-04-20,"""TAKE BACK RETURN""","""MAIL""",""" according to the final foxes.…"
1,63700,3701,3,8.0,13309.6,0.1,0.02,"""N""","""O""",1996-01-29,1996-03-05,1996-01-31,"""TAKE BACK RETURN""","""REG AIR""","""ourts cajole above the furiou"""


In [26]:
# Polars lazy load, collected with streaming enabled
# NOTE(review): recent Polars releases appear to deprecate `streaming=True` in favour
# of `collect(engine="streaming")` — confirm against the installed Polars version
df = pl.scan_parquet(data)
df.head(3).collect(streaming=True)

l_orderkey,l_partkey,l_suppkey,l_linenumber,l_quantity,l_extendedprice,l_discount,l_tax,l_returnflag,l_linestatus,l_shipdate,l_commitdate,l_receiptdate,l_shipinstruct,l_shipmode,l_comment
i64,i64,i64,i64,"decimal[15,2]","decimal[15,2]","decimal[15,2]","decimal[15,2]",str,str,date,date,date,str,str,str
1,155190,7706,1,17.0,21168.23,0.04,0.02,"""N""","""O""",1996-03-13,1996-02-12,1996-03-22,"""DELIVER IN PERSON""","""TRUCK""","""to beans x-ray carefull"""
1,67310,7311,2,36.0,45983.16,0.09,0.06,"""N""","""O""",1996-04-12,1996-02-28,1996-04-20,"""TAKE BACK RETURN""","""MAIL""",""" according to the final foxes.…"
1,63700,3701,3,8.0,13309.6,0.1,0.02,"""N""","""O""",1996-01-29,1996-03-05,1996-01-31,"""TAKE BACK RETURN""","""REG AIR""","""ourts cajole above the furiou"""


## Partial Sort Operations

Force some columns from all rows to pass through a query engine. How does each of these libraries and query engines fare?
- Ibis with DuckDB and DataFusion complete without a hitch 
- Polars lazy is expected to crash as datasets grow larger

In [27]:
# Ibis on DuckDB: order by the three key columns and show the first 3 rows.
# `set_backend` changes the default backend used by `ibis.read_parquet` from here on.
ibis.set_backend("duckdb")
t = ibis.read_parquet(data)
t.order_by("l_orderkey", "l_partkey", "l_suppkey").head(3)

In [28]:
# Ibis on DataFusion: same ordered preview as the DuckDB cell, different engine.
# `set_backend` changes the default backend used by `ibis.read_parquet` from here on.
ibis.set_backend("datafusion")
t = ibis.read_parquet(data)
t.order_by("l_orderkey", "l_partkey", "l_suppkey").head(3)

In [29]:
# Polars lazy: sort by the three key columns, keep the first 3 rows, then execute
df = pl.scan_parquet(data)
df.sort("l_orderkey", "l_partkey", "l_suppkey").head(3).collect()

l_orderkey,l_partkey,l_suppkey,l_linenumber,l_quantity,l_extendedprice,l_discount,l_tax,l_returnflag,l_linestatus,l_shipdate,l_commitdate,l_receiptdate,l_shipinstruct,l_shipmode,l_comment
i64,i64,i64,i64,"decimal[15,2]","decimal[15,2]","decimal[15,2]","decimal[15,2]",str,str,date,date,date,str,str,str
1,2132,4633,4,28.0,28955.64,0.09,0.06,"""N""","""O""",1996-04-21,1996-03-30,1996-05-16,"""NONE""","""AIR""","""s cajole busily above t"""
1,15635,638,6,32.0,49620.16,0.07,0.02,"""N""","""O""",1996-01-30,1996-02-07,1996-02-03,"""DELIVER IN PERSON""","""MAIL""","""rouches. special """
1,24027,1534,5,24.0,22824.48,0.1,0.04,"""N""","""O""",1996-03-30,1996-03-14,1996-04-01,"""NONE""","""FOB""",""" the regular, regular pa"""


In [30]:
# Polars lazy with streaming: same sorted preview, collected with streaming enabled
df = pl.scan_parquet(data)
df.sort("l_orderkey", "l_partkey", "l_suppkey").head(3).collect(streaming=True)

l_orderkey,l_partkey,l_suppkey,l_linenumber,l_quantity,l_extendedprice,l_discount,l_tax,l_returnflag,l_linestatus,l_shipdate,l_commitdate,l_receiptdate,l_shipinstruct,l_shipmode,l_comment
i64,i64,i64,i64,"decimal[15,2]","decimal[15,2]","decimal[15,2]","decimal[15,2]",str,str,date,date,date,str,str,str
1,2132,4633,4,28.0,28955.64,0.09,0.06,"""N""","""O""",1996-04-21,1996-03-30,1996-05-16,"""NONE""","""AIR""","""s cajole busily above t"""
1,15635,638,6,32.0,49620.16,0.07,0.02,"""N""","""O""",1996-01-30,1996-02-07,1996-02-03,"""DELIVER IN PERSON""","""MAIL""","""rouches. special """
1,24027,1534,5,24.0,22824.48,0.1,0.04,"""N""","""O""",1996-03-30,1996-03-14,1996-04-01,"""NONE""","""FOB""",""" the regular, regular pa"""
