Import necessary packages.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

In [None]:
import warnings
warnings.filterwarnings("ignore")

Load dataset as pandas dataframe.

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _dirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Read the hotel bookings CSV into a DataFrame.
df = pd.read_csv("../input/hotel-booking-demand/hotel_bookings.csv")

First, we look at the data structure and calculate the number of unique values and missing values in each column. The total number of missing values in the dataset is 129425.

In [None]:
# Per-column overview of `df`: dtype, number of distinct values, number of
# missing values, and the share of missing values in percent (3 decimals).
types = df.dtypes
distinct = df.nunique()
count_na = df.isna().sum()
percent_na = (count_na / len(df) * 100).round(3)

# Place the four Series side by side, name them via `keys`, and turn the
# index (the column names of `df`) into a regular column called 'column'.
data = (
    pd.concat(
        [types, distinct, count_na, percent_na],
        axis=1,
        keys=['types', 'distinct', 'count_na', 'percent_na'],
    )
    .rename_axis('column')
    .reset_index()
)

In [None]:
# Total number of missing cells over all columns, then the summary table.
total_missing = data['count_na'].sum()
print(total_missing)
data

Only 4 columns in the dataset have missing values, and the company ID column is missing for almost 95% of the records.

In [None]:
# Show only the summary rows for columns that contain missing values.
has_missing = data['count_na'] > 0
data.loc[has_missing]

In [None]:
# Bar chart of the 20 most frequent values of `country`, each bar labelled
# with its count.
top_countries = df['country'].value_counts()[:20]
labels = list(top_countries)

ax = top_countries.plot(kind='bar', figsize=(15, 6), rot=0)
ax.set_title('Top 20 Country Distribution', fontsize=15)
ax.set_xlabel("Country", fontsize=12)
ax.set_ylabel("Count", fontsize=12)
rects = ax.patches

# Write each count slightly above the top centre of its bar.
for bar, count in zip(rects, labels):
    x_mid = bar.get_x() + bar.get_width() / 2
    ax.text(x_mid, bar.get_height() + 10, count, ha='center', va='bottom')

plt.show()

Portugal has the highest booking demand in the data (**more than 40%**). This is not surprising: [the publication](http://www.sciencedirect.com/science/article/pii/S2352340918315191) page states that "both hotels are located in Portugal: H1 at the resort region of Algarve and H2 at the city of Lisbon".
<br>
Interestingly, China appears under two different codes in this dataframe, "CN" and "CHN". According to the data source description, the country column holds the country of origin in the ISO 3155–3:2013 format, which consists of three letters (what ISO 3166-1 calls alpha-3 codes), whereas "CN" is a two-letter (alpha-2) code. We need to find all two-letter codes in the dataset.

In [None]:
# Rows whose country code is exactly two characters long (missing codes
# compare as False), followed by the distinct codes among them.
mask = df['country'].str.len().eq(2)
ctr = df[mask]
ctr['country'].unique()

As we can see, there is only one two-letter code in the dataset, "CN". So I think we need to change it to the ISO 3155–3:2013 format, i.e. "CHN".

In [None]:
# Replace the two-letter code 'CN' with the three-letter code 'CHN'.
# The assignment is restricted to the `country` column: indexing with the
# row mask alone (`df[row_mask] = 'CHN'`) would write 'CHN' into every
# column of the matching rows, including `hotel` and the numeric columns.
df.loc[df['country'] == 'CN', 'country'] = 'CHN'
print("Number of 'CN' = " + str(len(df[df['country'] == 'CN'])))

# Redraw the top-20 country chart after the replacement, labelling each
# bar with its count.
top_countries = df['country'].value_counts()[:20]
labels = list(top_countries)
ax = top_countries.plot(kind='bar', figsize=(15, 6), rot=0)
ax.set_title('Top 20 Country Distribution', fontsize=15)
ax.set_xlabel("Country", fontsize=12)
ax.set_ylabel("Count", fontsize=12)
rects = ax.patches

for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, height + 10, label,
            ha='center', va='bottom')

plt.show()

From the bar chart above we can see that the number of bookings for China changed from 999 to 2278.
<br>
Now let's look at the data from the perspective of each country. 

In [None]:
# The ten most frequent countries and the bookings that belong to them.
list_ctr = list(df['country'].value_counts(sort=True)[:10].index)
top10 = df[df['country'].isin(list_ctr)]

# Number of bookings per (country, hotel) pair as a long table.
group_10 = top10.groupby(['country', 'hotel']).size()
group_10 = group_10.to_frame(name='size').reset_index()

# Reshape to one row per country and one column per hotel type, then plot
# grouped bars. `pivot` is called with keyword arguments because recent
# pandas releases no longer accept them positionally.
ax = group_10.pivot(index="country", columns="hotel", values="size").plot(
    kind='bar', figsize=(15, 6), rot=0)

ax.set_title('Hotel Type Distribution in Top 10 Country', fontsize=15)
ax.set_xlabel("Country", fontsize=12)
ax.set_ylabel("Count", fontsize=12)
rects = ax.patches

# Annotate every non-empty bar with its height, written vertically.
for p in rects:
    h = p.get_height()
    x = p.get_x() + p.get_width() / 2.
    if h != 0:
        ax.annotate("%g" % h, xy=(x, h), xytext=(0, 4), rotation=90,
                    textcoords="offset points", ha="center", va="bottom")

plt.show()

As we can see, there is a strange hotel category for China called 'CHN'. It occurs far more often than either the city hotel or the resort hotel category. These are the rows that previously had the country code 'CN': the replacement above assigned 'CHN' to every column of those rows, which is how the value ended up in the hotel column. Judging by the trend above, where city hotels outnumber resort hotels, most of the 'CHN' occurrences probably belonged to China's city hotels. However, we cannot simply relabel them as city hotel or resort hotel, because nothing in the data justifies either choice. So I think it is better to drop the rows with this category.

In [None]:
# Keep only the rows whose `hotel` value is not 'CHN'.
mask1 = df['hotel'].ne('CHN')
df_1 = df[mask1]

# Confirm that no 'CHN' hotel rows are left.
remaining_chn = df_1[df_1['hotel'] == 'CHN']
print("Number of 'CHN' = " + str(len(remaining_chn)))

# Top-20 country chart on the filtered data.
top_countries = df_1['country'].value_counts()[:20]
labels = list(top_countries)
ax = top_countries.plot(kind='bar', figsize=(15, 6), rot=0)
ax.set_title('Top 20 Country Distribution', fontsize=15)
ax.set_xlabel("Country", fontsize=12)
ax.set_ylabel("Count", fontsize=12)
rects = ax.patches

# Label every bar of non-zero height with its value.
for p in rects:
    h = p.get_height()
    if h == 0:
        continue
    x = p.get_x() + p.get_width() / 2.
    ax.annotate("%g" % h, xy=(x, h), xytext=(0, 4),
                textcoords="offset points", ha="center", va="bottom")

plt.show()

Now China has only 999 booking records, so the top 10 is different.

In [None]:
# The ten most frequent countries in the filtered data and their bookings.
list_ctr = list(df_1['country'].value_counts(sort=True)[:10].index)
top10 = df_1[df_1['country'].isin(list_ctr)]

# Number of bookings per (country, hotel) pair as a long table.
group_10 = top10.groupby(['country', 'hotel']).size()
group_10 = group_10.to_frame(name='size').reset_index()

# Reshape to one row per country and one column per hotel type, then plot
# grouped bars. `pivot` is called with keyword arguments because recent
# pandas releases no longer accept them positionally.
ax = group_10.pivot(index="country", columns="hotel", values="size").plot(
    kind='bar', figsize=(15, 6), rot=0)

ax.set_title('Hotel Type Distribution in Top 10 Country', fontsize=15)
ax.set_xlabel("Country", fontsize=12)
ax.set_ylabel("Count", fontsize=12)
rects = ax.patches

# Annotate every non-empty bar with its height, written vertically.
for p in rects:
    h = p.get_height()
    x = p.get_x() + p.get_width() / 2.
    if h != 0:
        ax.annotate("%g" % h, xy=(x, h), xytext=(0, 4), rotation=90,
                    textcoords="offset points", ha="center", va="bottom")

plt.show()

The graph shows that the Netherlands, previously ranked 11th, has entered the top 10. Bookings for almost every country are dominated by city hotels; only Great Britain is dominated by resort hotels.
<br>
Let's look further at this group and add customer type as another perspective. The customer type column here is actually a type of booking.

In [None]:
# Booking counts per (country, hotel, customer_type) as a long table.
group_10 = (
    top10.groupby(['country', 'hotel', 'customer_type'])
    .size()
    .to_frame(name='size')
    .reset_index()
)

# One facet per country, three per row, each with its own x and y axis.
g = sns.FacetGrid(group_10, col='country', col_wrap=3,
                  sharex=False, sharey=False, height=4)
# NOTE(review): no explicit `order` is passed for the hotel axis; seaborn
# may warn about this, and warnings are filtered earlier in this notebook.
# Confirm the facets look correct.
g = g.map(sns.barplot, 'hotel', 'size', "customer_type",
          hue_order=np.unique(group_10["customer_type"]), palette="hls")
g.add_legend()
g._legend.set_title("Type of Booking")
g.set_titles(row_template='{row_name}', col_template='{col_name}')

# Label every bar whose height is not zero, in every facet.
for ax in g.axes.flatten():
    for p in ax.patches:
        h = p.get_height()
        if h == 0:
            continue
        x = p.get_x() + p.get_width() / 2.
        ax.annotate("%g" % h, xy=(x, h), xytext=(0, 4), rotation=0,
                    textcoords="offset points", ha="center", va="bottom")

plt.show()



The type of booking is clearly dominated by transient bookings. An interesting detail is the resort hotel category for Great Britain, where the contract type has a high count, second only to Portugal.

In [None]:
# Booking counts per (country, hotel, deposit_type), with the innermost
# group level (`is_canceled`) spread across the columns.
group_10 = (
    top10.groupby(['country', 'hotel', 'deposit_type', 'is_canceled'])
    .size()
    .to_frame(name='size')
    .unstack()
)
group_10