# EDA for monthly data for revenue

First we have to get daily stats/KPI. And then we aggregate the stats by week or month.

In [1]:
import pandas as pd

In [2]:
import sys
from pathlib import Path

PROJ_ROOT = Path.cwd().parent

if str(PROJ_ROOT) not in sys.path:
    sys.path.append(str(PROJ_ROOT))

In [3]:
[f.name for f in PROJ_ROOT.iterdir() if f.is_dir() and not f.name.startswith(".")]

['tmp', 'hotels', 'data', 'notebooks', 'pages', 'script']

In [4]:
from hotels.processing import enrich_reservation_data

df = pd.read_parquet("../data/hotels.parquet")
df = enrich_reservation_data(df)
df["reservation_id"] = range(1, 1 + len(df))
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,departure_date,total_transaction,is_last_minute_cancellation,actual_departure_date,n_stay_actual,is_early_departure,breakfast,lunch,dinner,reservation_id
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,2015-07-01,0.0,False,2015-07-01,0.0,False,True,False,False,1
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,2015-07-01,0.0,False,2015-07-01,0.0,False,True,False,False,2
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,2015-07-02,75.0,False,2015-07-02,1.0,False,True,False,False,3
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,2015-07-02,75.0,False,2015-07-02,1.0,False,True,False,False,4
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,2015-07-03,196.0,False,2015-07-03,2.0,False,True,False,False,5


- number of used rooms
- number of lodgers 
- number of check-in/-out
- parking spaces 
- no-show / last minute cancellation 
- country (reservation/lodgers)
- adr 
- adr by room type 
- meal type (breakfast, lunch, dinner: these are complicated..)

In [5]:
## count: no-show, cancels, check-ins (WIP)
(df.groupby(["hotel", "arrival_date", "reservation_status"]).size().rename("n_checkin").reset_index())

Unnamed: 0,hotel,arrival_date,reservation_status,n_checkin
0,City Hotel,2015-07-01,Canceled,14
1,City Hotel,2015-07-01,Check-Out,65
2,City Hotel,2015-07-02,Canceled,46
3,City Hotel,2015-07-02,Check-Out,1
4,City Hotel,2015-07-02,No-Show,2
...,...,...,...,...
3748,Resort Hotel,2017-08-30,Canceled,14
3749,Resort Hotel,2017-08-30,Check-Out,28
3750,Resort Hotel,2017-08-31,Canceled,12
3751,Resort Hotel,2017-08-31,Check-Out,41


In [6]:
cols = [
    "reservation_id",
    "hotel",
    "is_canceled",
    "arrival_date",
    "n_lodgers",
    "country",
    "adr",
    "required_car_parking_spaces",
    "n_nights",
    "departure_date",
    "total_transaction",
    "actual_departure_date",
    "n_stay_actual",
]
df_stayed = df.query("is_canceled == 0")[cols].drop("is_canceled", axis=1)
df_stayed

Unnamed: 0,reservation_id,hotel,arrival_date,n_lodgers,country,adr,required_car_parking_spaces,n_nights,departure_date,total_transaction,actual_departure_date,n_stay_actual
0,1,Resort Hotel,2015-07-01,2.0,PRT,0.00,0,0,2015-07-01,0.00,2015-07-01,0.0
1,2,Resort Hotel,2015-07-01,2.0,PRT,0.00,0,0,2015-07-01,0.00,2015-07-01,0.0
2,3,Resort Hotel,2015-07-01,1.0,GBR,75.00,0,1,2015-07-02,75.00,2015-07-02,1.0
3,4,Resort Hotel,2015-07-01,1.0,GBR,75.00,0,1,2015-07-02,75.00,2015-07-02,1.0
4,5,Resort Hotel,2015-07-01,2.0,GBR,98.00,0,2,2015-07-03,196.00,2015-07-03,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...
119385,119202,City Hotel,2017-08-30,2.0,BEL,96.14,0,7,2017-09-06,672.98,2017-09-06,7.0
119386,119203,City Hotel,2017-08-31,3.0,FRA,225.43,0,7,2017-09-07,1578.01,2017-09-07,7.0
119387,119204,City Hotel,2017-08-31,2.0,DEU,157.71,0,7,2017-09-07,1103.97,2017-09-07,7.0
119388,119205,City Hotel,2017-08-31,2.0,GBR,104.40,0,7,2017-09-07,730.80,2017-09-07,7.0


In [7]:
df_stayed.query("n_stay_actual == 0").sort_values(by="adr", ascending=False)

Unnamed: 0,reservation_id,hotel,arrival_date,n_lodgers,country,adr,required_car_parking_spaces,n_nights,departure_date,total_transaction,actual_departure_date,n_stay_actual
70660,70613,City Hotel,2017-06-20,5.0,ITA,318.50,0,2,2017-06-22,637.00,2017-06-20,0.0
803,804,Resort Hotel,2015-07-30,4.0,ESP,280.74,1,3,2015-08-02,842.22,2015-07-30,0.0
114002,113827,City Hotel,2017-06-15,4.0,PRT,230.00,0,1,2017-06-16,230.00,2017-06-15,0.0
114003,113828,City Hotel,2017-06-15,4.0,PRT,230.00,0,1,2017-06-16,230.00,2017-06-15,0.0
7886,7881,Resort Hotel,2016-09-01,2.0,PRT,153.00,0,5,2016-09-06,765.00,2016-09-01,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
25072,25066,Resort Hotel,2016-06-13,2.0,PRT,0.00,0,0,2016-06-13,0.00,2016-06-13,0.0
25083,25077,Resort Hotel,2016-06-14,2.0,ESP,0.00,0,0,2016-06-14,0.00,2016-06-14,0.0
25084,25078,Resort Hotel,2016-06-14,2.0,PRT,0.00,0,0,2016-06-14,0.00,2016-06-14,0.0
25113,25107,Resort Hotel,2016-06-15,2.0,PRT,0.00,0,0,2016-06-15,0.00,2016-06-15,0.0


In [8]:
def expand_reservation(reservation: pd.Series) -> pd.DataFrame:
    reservation_id = reservation["reservation_id"]
    checkin = reservation["arrival_date"]
    checkout = reservation["actual_departure_date"]
    dates = pd.date_range(checkin, checkout).rename("date").to_frame().reset_index(drop=True)
    overnight = pd.Series([True] * (len(dates) - 1) + [False], name="overnight").to_frame()

    expanded_reservation = pd.concat([dates, overnight], axis=1).assign(reservation_id=reservation_id)
    return expanded_reservation

In [9]:
def concat_expanded_reservation(data: pd.DataFrame) -> pd.DataFrame:
    df_expanded = pd.concat(
        [expand_reservation(reservation) for _, reservation in df_stayed.iterrows()], axis=0
    ).sort_values(by=["date", "reservation_id"])
    return df_expanded

In [10]:
s_group = df_stayed["reservation_id"] % 100
s_group.nunique()

100

In [11]:
from joblib import Parallel, delayed

job = (delayed(concat_expanded_reservation)(data) for _, data in df_stayed.groupby(s_group))

In [12]:
%%time

results = Parallel(n_jobs=6)(job)

CPU times: user 3 s, sys: 842 ms, total: 3.84 s
Wall time: 1h 5min 29s


In [13]:
pd.concat(results, axis=0).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32910700 entries, 0 to 14
Data columns (total 3 columns):
 #   Column          Dtype         
---  ------          -----         
 0   date            datetime64[ns]
 1   overnight       bool          
 2   reservation_id  int64         
dtypes: bool(1), datetime64[ns](1), int64(1)
memory usage: 784.7 MB


In [14]:
sum(len(result) for result in results)

32910700