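# Select a subset of rows

Compares how Polars, DuckDB, and PyArrow retrieve the rows for two given `id` values from a directory of Parquet files, timing each approach with `%%time`.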

In [1]:
import duckdb
import polars as pl
import pyarrow.compute as pc
import pyarrow.dataset as ds

In [2]:
# Directory holding the Parquet files that every engine in this notebook reads.
data_dir = "../tmp"

## Polars

In [3]:
# Lazy scan over every *.parquet file in data_dir; the query only runs
# when .collect() is called on the resulting LazyFrame.
df_pl = pl.scan_parquet(f"{data_dir}/*.parquet")

In [4]:
%%time
(df_pl
 .filter(pl.col("id").is_in([4000000, 6000000]))
 .collect(streaming=True)
)

STREAMING CHUNK SIZE: 16666 rows


CPU times: user 5.98 s, sys: 14.9 s, total: 20.9 s
Wall time: 11.8 s


id,text,to_be_ignored
i64,str,i64
4000000,""" Lorem ipsum d…",6
6000000,""" Lorem ipsum d…",29


## DuckDB

In [5]:
# No database path is passed, so this opens DuckDB's default in-memory database.
cursor = duckdb.connect()

In [6]:
%%time
cursor.execute(f"""
SELECT table_a.id, to_be_ignored, text
FROM (VALUES (4000000), (6000000)) table_a(id)
LEFT JOIN read_parquet("{data_dir}/*.parquet") table_b
ON table_a.id == table_b.id
""").df()

CPU times: user 234 ms, sys: 26.4 ms, total: 260 ms
Wall time: 301 ms


Unnamed: 0,id,to_be_ignored,text
0,4000000,6,"\nLorem ipsum dolor sit amet, consectetur adip..."
1,6000000,29,"\nLorem ipsum dolor sit amet, consectetur adip..."


## PyArrow

In [7]:
%%time
ds.dataset(data_dir, format="parquet").filter(pc.field("id").isin([4000000, 6000000])).to_table().to_pandas()

CPU times: user 29.8 s, sys: 1min 4s, total: 1min 34s
Wall time: 40.3 s


Unnamed: 0,id,text,to_be_ignored
0,4000000,"\nLorem ipsum dolor sit amet, consectetur adip...",6
1,6000000,"\nLorem ipsum dolor sit amet, consectetur adip...",29
