# 4-4 Lazy

In [1]:
import polars as pl

## eager mode

In [2]:
path = "../../data/reservation.parquet"
df = pl.read_parquet(path)

In [3]:
df.head()

reservation_id,hotel_id,customer_id,reserved_at,checkin_date,checkout_date,length_of_stay,people_num,total_price,status,canceled_at
i64,i64,i64,"datetime[ns, Asia/Tokyo]","datetime[ns, Asia/Tokyo]","datetime[ns, Asia/Tokyo]",i64,i64,i64,str,"datetime[ns, Asia/Tokyo]"
1,2460,53431,2013-12-31 07:00:14 JST,2014-12-31 00:00:00 JST,2015-01-03 00:00:00 JST,3,2,37800,"""reserved""",
2,962,488390,2013-12-31 08:23:35 JST,2014-12-31 00:00:00 JST,2015-01-02 00:00:00 JST,2,3,42000,"""reserved""",
3,558,341335,2013-12-31 09:02:05 JST,2014-12-31 00:00:00 JST,2015-01-01 00:00:00 JST,1,2,20400,"""reserved""",
4,3666,398981,2013-12-31 23:44:54 JST,2014-12-31 00:00:00 JST,2015-01-01 00:00:00 JST,1,4,39600,"""reserved""",
5,2180,220381,2014-01-01 02:47:50 JST,2014-12-31 00:00:00 JST,2015-01-01 00:00:00 JST,1,3,16500,"""reserved""",


In [4]:
(
    df
    # extract specific row
    .filter(pl.col("reserved_at").dt.year() >= 2016)
    .filter(pl.col("people_num") == 1)
    # extract specific column
    .select(["reservation_id", "total_price"])
)

reservation_id,total_price
i64,i64
595174,15500
595177,7900
595183,5600
595189,13200
595202,8500
…,…
1999972,9200
1999977,15200
1999997,7100
1999999,17000


## lazy mode

In [5]:
df = pl.scan_parquet(path)

In [6]:
query = (
    df
    .filter(pl.col("reserved_at").dt.year() >= 2016)
    .filter(pl.col("people_num") == 1)
    .select(["reservation_id", "total_price"])
)

In [7]:
query.collect()

reservation_id,total_price
i64,i64
595174,15500
595177,7900
595183,5600
595189,13200
595202,8500
…,…
1999972,9200
1999977,15200
1999997,7100
1999999,17000


In [8]:
print(query.explain(optimized=False))

 SELECT [col("reservation_id"), col("total_price")] FROM
  FILTER [(col("people_num")) == (1)] FROM
    FILTER [(col("reserved_at").dt.year()) >= (2016)] FROM
      Parquet SCAN [../../data/reservation.parquet]
      PROJECT */11 COLUMNS


In [9]:
print(query.explain(optimized=True))

simple π 2/4 ["reservation_id", ... 1 other column]
  Parquet SCAN [../../data/reservation.parquet]
  PROJECT 4/11 COLUMNS
  SELECTION: [([(col("people_num")) == (1)]) & ([(col("reserved_at").dt.year()) >= (2016)])]


## lazy function

In [10]:
df = pl.read_parquet(path)
query = (
    df
    .lazy()
    .filter(pl.col("reserved_at") >= 2016)
    .filter(pl.col("people_num") == 1)
    .select(["reservation_id", "total_price"])
)

query.collect()

reservation_id,total_price
i64,i64
9,6200
14,8800
28,5300
33,10100
43,8600
…,…
1999972,9200
1999977,15200
1999997,7100
1999999,17000
