In [1]:
!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals>=0.9.5 ibis-framework 

Collecting polars
  Downloading polars-0.20.16-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Downloading polars-0.20.16-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.3/26.3 MB[0m [31m67.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: polars
  Attempting uninstall: polars
    Found existing installation: polars 0.20.15
    Uninstalling polars-0.20.15:
      Successfully uninstalled polars-0.20.15
Successfully installed polars-0.20.16


In [3]:
import pandas as pd
import polars as pl

pd.options.mode.copy_on_write = True
pd.options.future.infer_string = True

In [None]:
from typing import Any
from datetime import date

def q3_pandas_native(
    customer_ds: Any,
    line_item_ds: Any,
    orders_ds: Any,
) -> Any:
    """Run the notebook's Q3 benchmark query with plain pandas operations.

    Keeps customers whose ``c_mktsegment`` is ``"BUILDING"``, joins them to
    their orders and line items, keeps rows ordered before 1995-03-15 and
    shipped after it, and sums ``l_extendedprice * (1 - l_discount)`` per
    (order key, order date, ship priority).

    Args:
        customer_ds: pandas frame with ``c_custkey`` and ``c_mktsegment``.
        line_item_ds: pandas frame with ``l_orderkey``, ``l_shipdate``,
            ``l_extendedprice`` and ``l_discount``.
        orders_ds: pandas frame with ``o_orderkey``, ``o_custkey``,
            ``o_orderdate`` and ``o_shippriority``.

    Returns:
        The ten highest-revenue rows (ties broken by earliest order date) with
        columns ``l_orderkey``, ``revenue``, ``o_orderdate``, ``o_shippriority``.
    """
    segment = "BUILDING"
    cutoff = date(1995, 3, 15)

    building_customers = customer_ds[customer_ds["c_mktsegment"] == segment]

    joined = building_customers.merge(
        orders_ds, left_on="c_custkey", right_on="o_custkey"
    ).merge(line_item_ds, left_on="o_orderkey", right_on="l_orderkey")

    joined = joined[joined["o_orderdate"] < cutoff]
    joined = joined[joined["l_shipdate"] > cutoff]
    # assign() builds a new frame instead of writing into a filtered slice, so
    # the function does not rely on the global copy_on_write option to stay
    # free of SettingWithCopyWarning.
    joined = joined.assign(
        revenue=joined["l_extendedprice"] * (1 - joined["l_discount"])
    )

    revenue_per_order = joined.groupby(
        ["o_orderkey", "o_orderdate", "o_shippriority"], as_index=False
    )["revenue"].sum()

    selected = revenue_per_order.loc[
        :, ["o_orderkey", "revenue", "o_orderdate", "o_shippriority"]
    ].rename(columns={"o_orderkey": "l_orderkey"})

    # Named `ranked` rather than `sorted` so the builtin is not shadowed.
    ranked = selected.sort_values(
        by=["revenue", "o_orderdate"], ascending=[False, True]
    )
    return ranked.head(10)

In [4]:
from typing import Any
from datetime import datetime
import narwhals as nw

def q3(
    customer_ds_raw: Any,
    line_item_ds_raw: Any,
    orders_ds_raw: Any,
) -> Any:
    """Run the notebook's Q3 benchmark query through Narwhals.

    Each input is wrapped with ``nw.from_native`` and the result is unwrapped
    with ``nw.to_native``, so the caller gets back an object of the library it
    passed in.

    The query keeps "BUILDING" customers, joins orders and line items, keeps
    rows ordered before 1995-03-15 and shipped after it, sums
    ``l_extendedprice * (1 - l_discount)`` per order, and returns the ten rows
    with the highest revenue (ties broken by earliest order date).
    """
    cutoff = datetime(1995, 3, 15)
    segment = "BUILDING"

    customers = nw.from_native(customer_ds_raw)
    line_items = nw.from_native(line_item_ds_raw)
    orders = nw.from_native(orders_ds_raw)

    # Customers in the target segment, with their orders and order lines.
    building_customers = customers.filter(nw.col("c_mktsegment") == segment)
    with_orders = building_customers.join(
        orders, left_on="c_custkey", right_on="o_custkey"
    )
    with_lines = with_orders.join(
        line_items, left_on="o_orderkey", right_on="l_orderkey"
    )

    # Ordered before the cutoff but shipped after it.
    in_window = with_lines.filter(
        nw.col("o_orderdate") < cutoff,
        nw.col("l_shipdate") > cutoff,
    )

    discounted_price = nw.col("l_extendedprice") * (1 - nw.col("l_discount"))
    with_revenue = in_window.with_columns(discounted_price.alias("revenue"))

    revenue_per_order = with_revenue.group_by(
        ["o_orderkey", "o_orderdate", "o_shippriority"]
    ).agg([nw.sum("revenue")])

    output_columns = [
        nw.col("o_orderkey").alias("l_orderkey"),
        "revenue",
        "o_orderdate",
        "o_shippriority",
    ]
    top_ten = (
        revenue_per_order.select(output_columns)
        .sort(by=["revenue", "o_orderdate"], descending=[True, False])
        .head(10)
    )

    return nw.to_native(top_ten)

In [None]:
from typing import Any
from datetime import datetime
import narwhals as nw
import ibis

def q3_ibis(
    customer: Any,
    lineitem: Any,
    orders: Any,
    *,
    tool: str,
) -> Any:
    """Run the notebook's Q3 benchmark query as an ibis expression.

    Keeps "BUILDING" customers, joins orders and line items, keeps rows ordered
    before 1995-03-15 and shipped after it, sums
    ``l_extendedprice * (1 - l_discount)`` per order, and returns the ten rows
    with the highest revenue (ties broken by earliest order date).

    Args:
        customer: ibis table expression for the customer table.
        lineitem: ibis table expression for the lineitem table.
        orders: ibis table expression for the orders table.
        tool: ``'pandas'`` to materialise with ``to_pandas()`` or ``'polars'``
            to materialise with ``to_polars()``.

    Returns:
        The materialised result with columns ``l_orderkey``, ``revenue``,
        ``o_orderdate`` and ``o_shippriority``.

    Raises:
        ValueError: if ``tool`` is neither ``'pandas'`` nor ``'polars'``.
    """
    # Imported locally because this cell only imports `datetime`; without this
    # the name `date` resolved only when the pandas-native cell had run first.
    from datetime import date

    var1 = "BUILDING"
    var2 = date(1995, 3, 15)

    q_final = (
        customer.filter(customer["c_mktsegment"] == var1)
        .join(orders, customer["c_custkey"] == orders["o_custkey"])
        .join(lineitem, orders["o_orderkey"] == lineitem["l_orderkey"])
        .filter(ibis._["o_orderdate"] < var2)
        .filter(ibis._["l_shipdate"] > var2)
        .mutate(revenue=(lineitem["l_extendedprice"] * (1 - lineitem["l_discount"])))
        .group_by(
            "o_orderkey",
            "o_orderdate",
            "o_shippriority",
        )
        .agg(revenue=ibis._["revenue"].sum())
        .select(
            # Renamed to l_orderkey so the output columns match q3 and
            # q3_pandas_native; naming it "o_orderkey" again was a no-op.
            ibis._["o_orderkey"].name("l_orderkey"),
            "revenue",
            "o_orderdate",
            "o_shippriority",
        )
        .order_by(ibis.desc("revenue"), "o_orderdate")
        .limit(10)
    )
    if tool == 'pandas':
        return q_final.to_pandas()
    if tool == 'polars':
        return q_final.to_polars()
    raise ValueError("expected pandas or polars")

In [5]:
# Directory holding the TPC-H parquet files. The trailing slash matters: the
# file names below are appended to it without any separator.
dir_ = "/kaggle/input/tpc-h-data-parquet-s-2/"

# One path per table; only customer, lineitem and orders are read by Q3.
region = f"{dir_}region.parquet"
nation = f"{dir_}nation.parquet"
customer = f"{dir_}customer.parquet"
lineitem = f"{dir_}lineitem.parquet"
orders = f"{dir_}orders.parquet"
supplier = f"{dir_}supplier.parquet"
part = f"{dir_}part.parquet"
partsupp = f"{dir_}partsupp.parquet"

In [6]:
import ibis

# One ibis connection per backend, shared by the ibis-based readers below.
con_pd = ibis.pandas.connect()
con_pl = ibis.polars.connect()

# Maps a benchmark label to a one-argument reader that takes a parquet path
# and returns the input object for that configuration.
IO_FUNCS = {}

# pandas: default dtypes, pyarrow-backed dtypes, and pyarrow-backed via ibis.
IO_FUNCS['pandas'] = lambda x: pd.read_parquet(x, engine='pyarrow')
IO_FUNCS['pandas[pyarrow]'] = lambda x: pd.read_parquet(
    x, engine='pyarrow', dtype_backend='pyarrow'
)
IO_FUNCS['pandas[pyarrow][ibis]'] = lambda x: con_pd.read_parquet(
    x, engine='pyarrow', dtype_backend='pyarrow'
)

# Polars: eager read, lazy scan, and via the ibis Polars connection.
IO_FUNCS['polars[eager]'] = lambda x: pl.read_parquet(x)
IO_FUNCS['polars[lazy]'] = lambda x: pl.scan_parquet(x)
IO_FUNCS['polars[lazy][ibis]'] = lambda x: con_pl.read_parquet(x)

In [None]:
# Benchmark label -> raw per-run timings, filled in by the cells below.
results = dict()

## pandas, pyarrow dtypes, via ibis

In [None]:
# Benchmark q3_ibis with inputs read through the ibis pandas connection and the
# result materialised via to_pandas() (tool='pandas').
# The fn(...) calls are evaluated inside the timed expression, so input
# loading is part of every measured run.
tool = 'pandas[pyarrow][ibis]'
fn = IO_FUNCS[tool]
timings = %timeit -o q3_ibis(fn(customer), fn(lineitem), fn(orders), tool='pandas')
# -o makes %timeit return its result object; keep the raw per-run timings.
results[tool] = timings.all_runs

24 s ± 142 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


23.841894793999984

## Polars, lazy, via ibis

In [None]:
# Benchmark q3_ibis with inputs read through the ibis Polars connection and the
# result materialised via to_polars() (tool='polars').
# The fn(...) calls are evaluated inside the timed expression, so input
# loading is part of every measured run.
tool = 'polars[lazy][ibis]'
fn = IO_FUNCS[tool]
timings = %timeit -o q3_ibis(fn(customer), fn(lineitem), fn(orders), tool='polars')
# -o makes %timeit return its result object; keep the raw per-run timings.
results[tool] = timings.all_runs

24 s ± 142 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


23.841894793999984

## pandas, pyarrow dtypes, native

In [None]:
# Benchmark the hand-written pandas implementation on pyarrow-backed frames.
# The fn(...) calls are evaluated inside the timed expression, so reading the
# parquet files is part of every measured run.
tool = 'pandas[pyarrow]'
fn = IO_FUNCS[tool]
timings = %timeit -o q3_pandas_native(fn(customer), fn(lineitem), fn(orders))
# Stored under a '[native]' suffix so it does not collide with the Narwhals run
# that is saved under the plain 'pandas[pyarrow]' key.
results[tool+'[native]'] = timings.all_runs

24 s ± 142 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


23.841894793999984

## pandas via Narwhals

In [7]:
# Benchmark the Narwhals q3 on pandas frames read without dtype_backend.
# The fn(...) calls are evaluated inside the timed expression, so reading the
# parquet files is part of every measured run.
tool = 'pandas'
fn = IO_FUNCS[tool]
timings = %timeit -o q3(fn(customer), fn(lineitem), fn(orders))
# -o makes %timeit return its result object; keep the raw per-run timings.
results[tool] = timings.all_runs

24 s ± 142 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


23.841894793999984

## pandas, pyarrow dtypes, via Narwhals

In [8]:
# Benchmark the Narwhals q3 on pandas frames read with dtype_backend='pyarrow'.
# The fn(...) calls are evaluated inside the timed expression, so reading the
# parquet files is part of every measured run.
tool = 'pandas[pyarrow]'
fn = IO_FUNCS[tool]
timings = %timeit -o q3(fn(customer), fn(lineitem), fn(orders))
# -o makes %timeit return its result object; keep the raw per-run timings.
results[tool] = timings.all_runs

20.2 s ± 5.8 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


16.42582530300001

## Polars, eager (read_parquet), via Narwhals

In [9]:
# Benchmark the Narwhals q3 on Polars frames loaded with pl.read_parquet.
# The fn(...) calls are evaluated inside the timed expression, so reading the
# parquet files is part of every measured run.
tool = 'polars[eager]'
fn = IO_FUNCS[tool]
timings = %timeit -o q3(fn(customer), fn(lineitem), fn(orders))
# -o makes %timeit return its result object; keep the raw per-run timings.
results[tool] = timings.all_runs

4.67 s ± 85 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


4.574684939999997

## Polars, lazy (scan_parquet), via Narwhals

In [10]:
# Benchmark the Narwhals q3 on inputs created with pl.scan_parquet.
# q3 hands back the native lazy result here, so the .collect() inside the
# timed expression is what makes the measurement include actual execution.
tool = 'polars[lazy]'
fn = IO_FUNCS[tool]
timings = %timeit -o q3(fn(customer), fn(lineitem), fn(orders)).collect()
# -o makes %timeit return its result object; keep the raw per-run timings.
results[tool] = timings.all_runs

595 ms ± 18.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


0.5674880569999914

## Save

In [None]:
import json
# Persist every collected timing so the runs can be compared outside the
# notebook; the file is written to the current working directory.
with open('results.json', 'w') as out_file:
    json.dump(results, fp=out_file)
