# 6-3 Aggregate by interval

## Count hotels by equal-width price range

### Pandas

In [1]:
import pandas as pd

In [2]:
# Load the hotel table from the Parquet file into a pandas DataFrame.
pd_hotel = pd.read_parquet("../../data/hotel.parquet")

In [3]:
# Preview the first rows to check which columns and values were loaded.
pd_hotel.head()

Unnamed: 0,hotel_id,hotel_name,hotel_type,address_prefecture,address_city,address_town,address_zipcode,unit_price,user_rating,tag_001,...,tag_021,tag_022,tag_023,tag_024,tag_025,tag_026,tag_027,tag_028,tag_029,tag_030
0,1,北飯岡ホテル,ビジネスホテル,岩手県,盛岡市,北飯岡,020-0857,10300,3.256591,0.0,...,0,1,,0.0,0.0,1.0,0,0,0,0
1,2,西二条南温泉ホテル,リゾートホテル,北海道,中川郡美深町,西二条南,098-2242,6800,3.453642,0.0,...,0,0,0.0,,1.0,0.0,0,0,0,0
2,3,小屋敷ペンション,民宿,青森県,黒石市,小屋敷,036-0511,18600,1.381796,,...,0,0,0.0,1.0,,,0,0,0,0
3,4,中後町民宿,民宿,愛知県,碧南市,中後町,447-0042,4900,2.090353,0.0,...,0,0,0.0,0.0,0.0,0.0,0,0,1,0
4,5,鵜沼台ホテル,ビジネスホテル,岐阜県,各務原市,鵜沼台,509-0121,9000,2.77149,0.0,...,0,0,0.0,0.0,0.0,1.0,1,1,0,0


In [4]:
import numpy as np

# Bucket each hotel's unit_price into a 5000-wide bin labelled by the bin's
# lower edge (0, 5000, 10000, ...), then count the hotels in each bin.
(
    pd_hotel
    # Floor division followed by multiplication snaps a price down to its bin
    # edge directly; unit_price is displayed as whole numbers above, so
    # (assuming an integer dtype) no float round trip or cast is needed.
    .assign(unit_price_range=lambda df: df["unit_price"] // 5000 * 5000)
    .groupby("unit_price_range")
    .size()
)

unit_price_range
0         220
5000     2760
10000    1389
15000     456
20000     139
25000      29
30000       6
35000       1
dtype: int64

### Polars

In [5]:
import polars as pl

In [6]:
# Load the same Parquet file as a Polars DataFrame.
pl_hotel = pl.read_parquet("../../data/hotel.parquet")

In [7]:
# Compute the price bin inside group_by itself, so no helper column has to be
# added to the frame first.
(
    pl_hotel
    .group_by(
        # Lower edge of the 5000-wide bin; the alias spells out the name the
        # key column carries in the result.
        ((pl.col("unit_price") / 5000).floor().cast(pl.Int32) * 5000)
        .alias("unit_price")
    )
    .agg(pl.len())
    .sort(by="unit_price")
)

unit_price,len
i32,u32
0,220
5000,2760
10000,1389
15000,456
20000,139
25000,29
30000,6
35000,1


## Count hotels by unequal-width price range

### Pandas

In [8]:
# Assign each hotel to an unequal-width price bin labelled by the bin's lower
# edge, then count the hotels in each bin.
(
    pd_hotel
    .assign(unit_price_range=lambda df: np.select(
        # Conditions are checked in order and the first match wins, so each
        # entry only needs to test its own upper bound.
        [
            df["unit_price"] < 5000,
            df["unit_price"] < 10000,
            df["unit_price"] < 20000,
            df["unit_price"] < 30000,
        ],
        [0, 5000, 10000, 20000],
        default=30000,  # rows that match none of the conditions above
    ))
    .groupby("unit_price_range")
    .size()
)

unit_price_range
0         220
5000     2760
10000    1845
20000     168
30000       7
dtype: int64

### Polars

In [9]:
# Label each hotel with the lower edge of its unequal-width price bin and
# count the hotels per label.
(
    pl_hotel
    .group_by(
        # Branches are tried top to bottom, so each one only tests its own
        # upper bound; anything left over falls through to `otherwise`.
        pl.when(pl.col("unit_price").lt(5000)).then(0)
        .when(pl.col("unit_price").lt(10000)).then(5000)
        .when(pl.col("unit_price").lt(20000)).then(10000)
        .when(pl.col("unit_price").lt(30000)).then(20000)
        .otherwise(30000)
        .alias("unit_price")
    )
    .agg(pl.len())
    .sort(by="unit_price")
)

unit_price,len
i32,u32
0,220
5000,2760
10000,1845
20000,168
30000,7
