# All combinations

## Monthly sales per customer

### Pandas

In [1]:
import pandas as pd

In [2]:
# Load the customer and reservation tables as pandas DataFrames.
pd_customer    = pd.read_parquet("../../data/customer.parquet")
pd_reservation = pd.read_parquet("../../data/reservation.parquet")

In [3]:
# Preview the first five reservation rows.
pd_reservation.head(n=5)

Unnamed: 0,reservation_id,hotel_id,customer_id,reserved_at,checkin_date,checkout_date,length_of_stay,people_num,total_price,status,canceled_at
0,1,2460,53431,2013-12-31 07:00:14+09:00,2014-12-31 00:00:00+09:00,2015-01-03 00:00:00+09:00,3,2,37800,reserved,NaT
1,2,962,488390,2013-12-31 08:23:35+09:00,2014-12-31 00:00:00+09:00,2015-01-02 00:00:00+09:00,2,3,42000,reserved,NaT
2,3,558,341335,2013-12-31 09:02:05+09:00,2014-12-31 00:00:00+09:00,2015-01-01 00:00:00+09:00,1,2,20400,reserved,NaT
3,4,3666,398981,2013-12-31 23:44:54+09:00,2014-12-31 00:00:00+09:00,2015-01-01 00:00:00+09:00,1,4,39600,reserved,NaT
4,5,2180,220381,2014-01-01 02:47:50+09:00,2014-12-31 00:00:00+09:00,2015-01-01 00:00:00+09:00,1,3,16500,reserved,NaT


In [4]:
# Preview the first five customer rows.
pd_customer.head(n=5)

Unnamed: 0,customer_id,name,age,sex,address_prefecture,address_city,address_town,address_zipcode
0,1,山田 裕太,75,,岐阜県,岐阜市,鷺山清洲町,502-0853
1,2,藤井 稔,83,M,大阪府,豊能郡能勢町,地黄,563-0121
2,3,青木 太一,62,M,佐賀県,佐賀市,本庄町袋,840-0023
3,4,渡辺 裕太,28,M,福島県,喜多方市,豊川町高堂太,966-0911
4,5,渡辺 明美,62,F,兵庫県,西宮市,津門西口町,663-8231


In [5]:
# Cross join: pair every customer with every month of 2019, then attach the
# monthly sales (sum of total_price of non-canceled reservations, bucketed by
# checkout month). Customer/month pairs without any reservation get 0.
(
    pd_customer[["customer_id"]]
    .merge(
        # Twelve monthly periods, 2019-01 .. 2019-12, as a Series named "month".
        pd.period_range("2019-01", "2019-12", freq="M").to_series(name="month"),
        how="cross"
    )
    .merge(
        pd_reservation
        .query("status != 'canceled'")
        # checkout_date is tz-aware (+09:00). Periods cannot carry a timezone,
        # so calling to_period() directly emits a "will drop timezone
        # information" UserWarning. Drop the timezone explicitly first;
        # tz_localize(None) keeps the local wall-clock time, so the resulting
        # months are unchanged.
        .assign(
            month=lambda df: df.checkout_date.dt.tz_localize(None).dt.to_period("M")
        )
        .groupby(["customer_id", "month"]).total_price.sum(),
        on=["customer_id", "month"],
        how="left"
    )
    # The left join leaves NaN where a customer has no sales in a month;
    # replace those with 0 (the column is therefore float, e.g. 0.0).
    .fillna({"total_price": 0})
)

  .assign(month=lambda df: df.checkout_date.dt.to_period("M"))


Unnamed: 0,customer_id,month,total_price
0,1,2019-01,0.0
1,1,2019-02,0.0
2,1,2019-03,0.0
3,1,2019-04,0.0
4,1,2019-05,0.0
...,...,...,...
5999995,500000,2019-08,0.0
5999996,500000,2019-09,0.0
5999997,500000,2019-10,0.0
5999998,500000,2019-11,0.0


### Polars

In [6]:
import polars as pl

In [7]:
# Load the same customer and reservation tables as Polars DataFrames.
pl_customer    = pl.read_parquet("../../data/customer.parquet")
pl_reservation = pl.read_parquet("../../data/reservation.parquet")

In [8]:
# Preview the first five reservation rows.
pl_reservation.head(n=5)

reservation_id,hotel_id,customer_id,reserved_at,checkin_date,checkout_date,length_of_stay,people_num,total_price,status,canceled_at
i64,i64,i64,"datetime[ns, Asia/Tokyo]","datetime[ns, Asia/Tokyo]","datetime[ns, Asia/Tokyo]",i64,i64,i64,str,"datetime[ns, Asia/Tokyo]"
1,2460,53431,2013-12-31 07:00:14 JST,2014-12-31 00:00:00 JST,2015-01-03 00:00:00 JST,3,2,37800,"""reserved""",
2,962,488390,2013-12-31 08:23:35 JST,2014-12-31 00:00:00 JST,2015-01-02 00:00:00 JST,2,3,42000,"""reserved""",
3,558,341335,2013-12-31 09:02:05 JST,2014-12-31 00:00:00 JST,2015-01-01 00:00:00 JST,1,2,20400,"""reserved""",
4,3666,398981,2013-12-31 23:44:54 JST,2014-12-31 00:00:00 JST,2015-01-01 00:00:00 JST,1,4,39600,"""reserved""",
5,2180,220381,2014-01-01 02:47:50 JST,2014-12-31 00:00:00 JST,2015-01-01 00:00:00 JST,1,3,16500,"""reserved""",


In [9]:
# Preview the first five customer rows.
pl_customer.head(n=5)

customer_id,name,age,sex,address_prefecture,address_city,address_town,address_zipcode
i64,str,i64,str,str,str,str,str
1,"""山田 裕太""",75,,"""岐阜県""","""岐阜市""","""鷺山清洲町""","""502-0853"""
2,"""藤井 稔""",83,"""M""","""大阪府""","""豊能郡能勢町""","""地黄""","""563-0121"""
3,"""青木 太一""",62,"""M""","""佐賀県""","""佐賀市""","""本庄町袋""","""840-0023"""
4,"""渡辺 裕太""",28,"""M""","""福島県""","""喜多方市""","""豊川町高堂太""","""966-0911"""
5,"""渡辺 明美""",62,"""F""","""兵庫県""","""西宮市""","""津門西口町""","""663-8231"""


In [10]:
from datetime import date

# One row per month of 2019 (first day of each month), as a single-column
# DataFrame named "month" so it can be cross joined.
months_2019 = (
    pl.date_range(date(2019, 1, 1), date(2019, 12, 1), "1mo", eager=True)
    .alias("month")
    .to_frame()
)

# Sum of total_price per customer and checkout month, ignoring canceled
# reservations. The checkout timestamp is truncated to the start of its month
# and cast to a date so it matches the "month" column above.
monthly_sales = (
    pl_reservation
    .filter(pl.col("status") != "canceled")
    .group_by([
        "customer_id",
        pl.col("checkout_date").dt.truncate("1mo").dt.date().alias("month"),
    ])
    .agg(pl.col("total_price").sum())
)

# Every customer x every month, with sales attached where they exist and
# 0 where the left join found no matching reservations.
(
    pl_customer.select("customer_id")
    .join(months_2019, how="cross")
    .join(monthly_sales, on=["customer_id", "month"], how="left")
    .with_columns(pl.col("total_price").fill_null(0))
)

customer_id,month,total_price
i64,date,i64
1,2019-01-01,0
1,2019-02-01,0
1,2019-03-01,0
1,2019-04-01,0
1,2019-05-01,0
…,…,…
500000,2019-08-01,0
500000,2019-09-01,0
500000,2019-10-01,0
500000,2019-11-01,0
