In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print one path per line.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import seaborn as sns

In [None]:
# Notebook-wide display settings.
# The fully qualified "display.*" keys are required: a bare 'max_columns' /
# 'max_rows' pattern also matches the "styler.render.*" options in newer pandas
# releases, which makes set_option raise OptionError (pattern matched multiple keys).
pd.set_option('display.max_columns', None)  # None = never truncate the columns of a wide frame
pd.set_option('display.max_rows', 100)  # truncate printed output beyond 100 rows

In [None]:
# Load the raw bookings. The arrival date is spread over the columns at positions
# 3, 4 and 6, so those are merged into a single datetime column after reading.
# read_csv's nested-list form parse_dates=[[3, 4, 6]] used to do this in one step, but
# it is deprecated since pandas 2.2 and removed in pandas 3, so the merge is explicit.
data = pd.read_csv('/kaggle/input/hotel-booking-demand/hotel_bookings.csv')
date_parts = data.columns[[3, 4, 6]].tolist()

# Join the parts with single spaces, which is the text the old read_csv path
# handed to the date parser.
date_text = data[date_parts[0]].astype(str).str.cat(
    [data[col].astype(str) for col in date_parts[1:]], sep=' '
)

# Mirror the old result: the part columns are dropped and the merged column is placed
# first, named after its parts joined with underscores.
data = data.drop(columns=date_parts)
data.insert(0, '_'.join(date_parts), pd.to_datetime(date_text))
data.head()

In [None]:
# Give the merged arrival-date column a short, readable name.
data = data.rename(
    columns={'arrival_date_year_arrival_date_month_arrival_date_day_of_month': 'arrival_date'}
)
data.head()

In [None]:
# Show the rows whose previous_cancellations value is exactly 11.
data[data['previous_cancellations'] == 11]

# First impression

In [None]:
# Number of rows per meal value, most frequent first.
# data.groupby('meal').size() yields the same counts, ordered by label instead.
data.meal.value_counts()

In [None]:
# Stacked bar chart: one bar per reserved room type, segmented by meal.
meal_by_room = data.groupby(['reserved_room_type', 'meal']).size()
meal_by_room = meal_by_room.unstack(level='meal', fill_value=0)
meal_by_room.plot.bar(stacked=True);

In [None]:
# Keep only the market segments that have more than MIN_RECORDS rows.
MIN_RECORDS = 100
by_segment = data.groupby('market_segment')
data = by_segment.filter(lambda rows: MIN_RECORDS < len(rows))

In [None]:
# Five most frequent company values; missing values are counted as their own entry.
data['company'].value_counts(dropna=False).head()

In [None]:
# Absolute frequency of the 20 most common days_in_waiting_list values.
# (Use value_counts(normalize=True) to get shares instead of counts.)
data['days_in_waiting_list'].value_counts().head(20)

In [None]:
# Count bookings for every (reserved room type, assigned room type) pair;
# pairs that never occur are filled with 0.
crosstab = data.groupby(['reserved_room_type', 'assigned_room_type']).size().unstack(fill_value=0)

# A bare `crosstab` line in the middle of a cell renders nothing (only a cell's last
# expression is shown), so display the table explicitly. `display` is provided by IPython.
display(crosstab)

# vmax caps the colour scale at 1000; the trailing semicolon hides the Axes repr.
sns.heatmap(crosstab, vmax=1000);

In [None]:
# Frequency of each company value (missing values excluded).
data['company'].value_counts()

In [None]:
# Frequency of each customer_type value.
data['customer_type'].value_counts()

In [None]:
# Fine-grained (1000-bin) histogram of the adr column.
data['adr'].plot.hist(bins=1000)

In [None]:
# Frequency of each required_car_parking_spaces value.
data['required_car_parking_spaces'].value_counts()

In [None]:
# Frequency of each reservation_status value.
data['reservation_status'].value_counts()

In [None]:
# Frequency of each days_in_waiting_list value.
data['days_in_waiting_list'].value_counts()

**Notes:**

* Rows with 0 nights? Maybe spa?
* Adult-child-babies combos
* Should split observations to PRT and others?
* Specific countries order shorter vacations?
* Groups in the data?
* No agent == direct distribution ???
* New feature - group size + Room type vs. group size
* Remove rare categories in market_segment and/or room_type.
* Compatibility of agent / company / distribution...
* Split people with waiting list and without. Should we drop them?
* Why is the adr sometimes >1000 or <10? We should take the group size and the number of nights into account.
* Remove rows with 8 car parking spaces. They probably stand for special parking requirements.
* Found 188 rows of 0 visitors. Should we drop them? What do they mean?
* What about groups with >10 visitors? What does it mean?
* What are the weird family combinations? 1 adult & 9 babies, 55 visitors, etc.


# Analyses

## Trend of reservations

In [None]:
# Rows per arrival date, averaged over 2-week bins and drawn as a line.
rows_per_day = data.groupby('arrival_date').size()
rows_per_day.resample('2W').mean().plot()

## Do previous cancellations tell us something about the current reservation?

In [None]:
# Share of earlier bookings that were cancelled:
# previous_cancellations / (previous_cancellations + previous_bookings_not_canceled).
# When both counts are 0 the division is 0 / 0, which evaluates to NaN.
cancelled_before = data['previous_cancellations']
kept_before = data['previous_bookings_not_canceled']
data['cancellation_ratio'] = cancelled_before / (cancelled_before + kept_before)

# One histogram per is_canceled value, sharing the y axis.
data.hist(column='cancellation_ratio', by='is_canceled', sharey=True);

In [None]:
# Same histograms restricted to ratios below 0.2, each panel with its own y axis.
low_ratio_rows = data[data['cancellation_ratio'] < 0.2]
low_ratio_rows.hist(column='cancellation_ratio', by='is_canceled', sharey=False);

## Family type and its relation to vacation duration

In [None]:
# data.loc[data[['adults', 'children', 'babies']].sum(axis=1)==0]

In [None]:
# Number of rows for every (adults, children, babies) combination, ordered by the keys.
# Append .sort_values(ascending=False) to order by frequency instead.
data.groupby(['adults', 'children', 'babies']).size()

In [None]:
def division2familytype(row, group_threshold=30):
    """Classify a booking by the composition of its party.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'adults', 'children' and 'babies'.
    group_threshold : int, default 30
        Parties with strictly more adults than this are labelled 'Group'.

    Returns
    -------
    str
        One of 'No adults', 'Single', 'Single parent', 'Couple',
        'Couple with children', 'Group' or 'Other'.
    """
    adults = row['adults']

    # A missing children/babies count is treated as 0. Without this, NaN + x == 0
    # is False and a booking with an unknown count would be labelled as having children.
    dependants = sum(
        0 if pd.isna(row[key]) else row[key]
        for key in ('children', 'babies')
    )
    has_dependants = dependants != 0

    if adults == 0:
        return 'No adults'
    if adults == 1:
        return 'Single parent' if has_dependants else 'Single'
    if adults == 2:
        return 'Couple with children' if has_dependants else 'Couple'
    if adults > group_threshold:
        return 'Group'
    # Three or more adults up to the threshold (or a missing adult count).
    return 'Other'

In [None]:
# Label every row with its party composition (row-wise apply), then preview.
family_labels = data.apply(division2familytype, axis=1)
data['family_type'] = family_labels
data.head()

In [None]:
# Grouped bars: row counts per family type, one bar per is_canceled value.
family_by_cancel = data.groupby(['family_type', 'is_canceled']).size().unstack()
family_by_cancel.plot.bar()

In [None]:
# Total stay length = weekend nights + week nights.
data['total_nights'] = data['stays_in_weekend_nights'].add(data['stays_in_week_nights'])

In [None]:
# Mean total_nights per family type, bars sorted from shortest to longest.
mean_nights = data.groupby('family_type')['total_nights'].mean()
mean_nights.sort_values().plot.bar(figsize=[10,5])

In [None]:
# Median total_nights per family type, bars sorted from shortest to longest.
median_nights = data.groupby('family_type')['total_nights'].median()
median_nights.sort_values().plot.bar(figsize=[10,5])