# Select subset of rows

In [1]:
import duckdb
import polars as pl
import pyarrow.compute as pc
import pyarrow.dataset as ds

from dw.sample_data import get_data_dir

In [2]:
# Directory that the cells below glob for `*.parquet` files; its location is
# whatever the project helper `get_data_dir` returns.
data_dir = get_data_dir()

## Polars

In [3]:
# Build a lazy query over every parquet file matching the glob in data_dir;
# the query is only executed when `.collect()` is called in the next cell.
df_pl = pl.scan_parquet(f"{data_dir}/*.parquet")

In [4]:
%%time
(df_pl
 .filter(pl.col("id").is_in([4000000, 6000000]))
 .collect(streaming=True)
)

STREAMING CHUNK SIZE: 16666 rows


CPU times: user 5.36 s, sys: 12.6 s, total: 17.9 s
Wall time: 11.6 s


id,text,to_be_ignored
i64,str,i64
4000000,"""Lorem ipsum do…",29
6000000,"""Lorem ipsum do…",20


## DuckDB

In [5]:
# Open a DuckDB connection; no database path is passed, so DuckDB's default
# (an in-memory database) is used. The query cell below runs on this handle.
cursor = duckdb.connect()

In [6]:
%%time
cursor.execute(f"""
SELECT table_a.id, to_be_ignored, text
FROM (VALUES (4000000), (6000000)) table_a(id)
LEFT JOIN read_parquet("{data_dir}/*.parquet") table_b
ON table_a.id == table_b.id
""").df()

CPU times: user 235 ms, sys: 30.6 ms, total: 265 ms
Wall time: 308 ms


Unnamed: 0,id,to_be_ignored,text
0,4000000,29,"Lorem ipsum dolor sit amet, consectetur adipis..."
1,6000000,20,"Lorem ipsum dolor sit amet, consectetur adipis..."


## PyArrow

In [7]:
%%time
ds.dataset(data_dir, format="parquet").filter(pc.field("id").isin([4000000, 6000000])).to_table().to_pandas()

CPU times: user 27.6 s, sys: 58.9 s, total: 1min 26s
Wall time: 37.7 s


Unnamed: 0,id,text,to_be_ignored
0,4000000,"Lorem ipsum dolor sit amet, consectetur adipis...",29
1,6000000,"Lorem ipsum dolor sit amet, consectetur adipis...",20
