# 6-1 Aggregate function for dataset

## Aggregate value for reservation data

### Pandas

In [1]:
import pandas as pd

In [2]:
pd_reservation = pd.read_parquet("../../data/reservation.parquet")

In [3]:
pd_reservation.head()

Unnamed: 0,reservation_id,hotel_id,customer_id,reserved_at,checkin_date,checkout_date,length_of_stay,people_num,total_price,status,canceled_at
0,1,2460,53431,2013-12-31 07:00:14+09:00,2014-12-31 00:00:00+09:00,2015-01-03 00:00:00+09:00,3,2,37800,reserved,NaT
1,2,962,488390,2013-12-31 08:23:35+09:00,2014-12-31 00:00:00+09:00,2015-01-02 00:00:00+09:00,2,3,42000,reserved,NaT
2,3,558,341335,2013-12-31 09:02:05+09:00,2014-12-31 00:00:00+09:00,2015-01-01 00:00:00+09:00,1,2,20400,reserved,NaT
3,4,3666,398981,2013-12-31 23:44:54+09:00,2014-12-31 00:00:00+09:00,2015-01-01 00:00:00+09:00,1,4,39600,reserved,NaT
4,5,2180,220381,2014-01-01 02:47:50+09:00,2014-12-31 00:00:00+09:00,2015-01-01 00:00:00+09:00,1,3,16500,reserved,NaT


In [12]:
(
    pd_reservation
    # Extract by condition
    .query("status != 'canceled'")
    # aggregate value
    .agg(
        reservation_cnt=("reservation_id", "size"),
        sales=("total_price", "sum"),
        mean_sales=("total_price", "mean"),
        min_sales=("total_price", "min"),
        max_sales=("total_price", "max"),
        var_sales=("total_price", "var"),
        std_sales=("total_price", "std"),
    )
)

Unnamed: 0,reservation_id,total_price
reservation_cnt,1799589.0,
sales,,72048930000.0
mean_sales,,40036.32
min_sales,,4000.0
max_sales,,597000.0
var_sales,,1223721000.0
std_sales,,34981.72


### Polars

In [5]:
import polars as pl

In [7]:
pl_reservation = pl.read_parquet("../../data/reservation.parquet")

In [8]:
pl_reservation.head()

reservation_id,hotel_id,customer_id,reserved_at,checkin_date,checkout_date,length_of_stay,people_num,total_price,status,canceled_at
i64,i64,i64,"datetime[ns, Asia/Tokyo]","datetime[ns, Asia/Tokyo]","datetime[ns, Asia/Tokyo]",i64,i64,i64,str,"datetime[ns, Asia/Tokyo]"
1,2460,53431,2013-12-31 07:00:14 JST,2014-12-31 00:00:00 JST,2015-01-03 00:00:00 JST,3,2,37800,"""reserved""",
2,962,488390,2013-12-31 08:23:35 JST,2014-12-31 00:00:00 JST,2015-01-02 00:00:00 JST,2,3,42000,"""reserved""",
3,558,341335,2013-12-31 09:02:05 JST,2014-12-31 00:00:00 JST,2015-01-01 00:00:00 JST,1,2,20400,"""reserved""",
4,3666,398981,2013-12-31 23:44:54 JST,2014-12-31 00:00:00 JST,2015-01-01 00:00:00 JST,1,4,39600,"""reserved""",
5,2180,220381,2014-01-01 02:47:50 JST,2014-12-31 00:00:00 JST,2015-01-01 00:00:00 JST,1,3,16500,"""reserved""",


In [None]:
(
    pl_reservation
    .filter(pl.col("status") != "canceled")
    .select([
        pl.col("reservation_id").len().alias("reservation_cnt"),
        pl.col("total_price").sum().alias("sales"),
        pl.col("total_price").mean().alias("mean_sales"),
        pl.col("total_price").min().alias("min_sales"),
        pl.col("total_price").max().alias("max_sales"),
        pl.col("total_price").var().alias("var_sales"),
        pl.col("total_price").std().alias("std_sales"),
    ])
)

## Unique count

### Pandas

In [18]:
(
    pd_reservation
    .query("status != 'canceled'")
    .loc[:, "customer_id"]
    .nunique()
)

411336

### Polars

In [22]:
(
    pl_reservation
    .filter(pl.col("status") != "canceled")
    .select(pl.col("customer_id"))
    # also count null
    .n_unique()
)

411336

## Mean value and percentile

### Pandas

In [24]:
(
    pd_reservation
    .query("status != 'cancled'")
    .agg(
        median_sales=("total_price", "median"),
        p25_sales=(),
    )
)

Unnamed: 0,reservation_id,hotel_id,customer_id,reserved_at,checkin_date,checkout_date,length_of_stay,people_num,total_price,status,canceled_at
0,1,2460,53431,2013-12-31 07:00:14+09:00,2014-12-31 00:00:00+09:00,2015-01-03 00:00:00+09:00,3,2,37800,reserved,NaT
1,2,962,488390,2013-12-31 08:23:35+09:00,2014-12-31 00:00:00+09:00,2015-01-02 00:00:00+09:00,2,3,42000,reserved,NaT
2,3,558,341335,2013-12-31 09:02:05+09:00,2014-12-31 00:00:00+09:00,2015-01-01 00:00:00+09:00,1,2,20400,reserved,NaT
3,4,3666,398981,2013-12-31 23:44:54+09:00,2014-12-31 00:00:00+09:00,2015-01-01 00:00:00+09:00,1,4,39600,reserved,NaT
4,5,2180,220381,2014-01-01 02:47:50+09:00,2014-12-31 00:00:00+09:00,2015-01-01 00:00:00+09:00,1,3,16500,reserved,NaT
...,...,...,...,...,...,...,...,...,...,...,...
1999995,1999996,2357,280303,2019-12-27 12:14:07+09:00,2019-12-29 00:00:00+09:00,2019-12-30 00:00:00+09:00,1,4,26000,reserved,NaT
1999996,1999997,319,499387,2019-12-27 14:08:57+09:00,2019-12-29 00:00:00+09:00,2019-12-30 00:00:00+09:00,1,1,7100,canceled,2019-12-28 21:37:30+09:00
1999997,1999998,2834,461799,2019-12-28 04:59:51+09:00,2019-12-30 00:00:00+09:00,2019-12-31 00:00:00+09:00,1,5,105000,reserved,NaT
1999998,1999999,3643,163568,2019-12-28 11:56:19+09:00,2019-12-30 00:00:00+09:00,2019-12-31 00:00:00+09:00,1,1,17000,reserved,NaT
