In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# EDA of Hotel Booking Demand
## Content:
&emsp;&emsp;Hi, this is my first EDA project. In this project, I refer the questions provided by the most voted project "EDA of bookings and ML to predict cancelations" on Kaggle and use my own methods to realize and analyst it again. This is a great experience but due to my poor Python skill I complete it slowly and the codes are very naive. Hoping I can do better work in the future.

&emsp;&emsp;The following preface is copied from Kaggle, I will choose several questions in them.

&emsp;&emsp;The dataset contains data from two different hotels. One Resort hotel and one City hotel.    

&emsp;&emsp;The data contains "bookings due to arrive between the 1st of July of 2015 and the 31st of August 2017".  
Note: For most questions I will only use bookings that were not canceled, to get acutal guest numbers. As you will see, this is quite a big difference.

&emsp;&emsp;Topics covered and questions to answer from the data:  

    Where do the guests come from?
    How much do guests pay for a room per night?
    How does the price per night vary over the year?
    Which are the most busy month?
    How long do people stay at the hotels?
    Bookings by market segment
    How many bookings were canceled?
    Which month have the highest number of cancelations?

In [None]:
# Data Input and Cleaning

# Read the raw bookings CSV; the cleaning cell below builds on `full_data`.
bookings_csv = '../input/hotel-booking-demand/hotel_bookings.csv'
full_data = pd.read_csv(bookings_csv)
full_data.head()

In [None]:
# Number of missing values in each column of the raw data.
full_data.isna().sum()

In [None]:
# fill nan values

# Missing values in these four columns are replaced by 0.
# NOTE(review): "country" otherwise appears to hold country codes, so the
# integer 0 acts as an "unknown country" marker there.
nan_replacements = {"children": 0, "country": 0, "agent": 0, "company": 0}
full_data_cln = full_data.fillna(nan_replacements)

# Recode the 'Undefined' meal value as 'SC'. The result is assigned back
# instead of calling replace(..., inplace=True) on the column selection:
# that chained in-place form warns on recent pandas and silently leaves the
# frame unchanged when Copy-on-Write is enabled.
full_data_cln['meal'] = full_data_cln['meal'].replace('Undefined', 'SC')

# Drop bookings that list no adults, no children and no babies.
guest_total = full_data_cln['adults'] + full_data_cln['children'] + full_data_cln['babies']
zero_guest = list(full_data_cln.index[guest_total == 0])
full_data_cln = full_data_cln.drop(index=zero_guest)

### 1. Where do the guests come from?

In [None]:
# First, let us group the data by hotel and only use the uncanceled data

# Boolean masks over the cleaned bookings.
not_canceled = full_data_cln['is_canceled'] == 0
is_resort = full_data_cln['hotel'] == 'Resort Hotel'
is_city = full_data_cln['hotel'] == 'City Hotel'

# Independent frames per hotel, so later cells can add columns to them.
rh = full_data_cln.loc[is_resort & not_canceled].copy()
ch = full_data_cln.loc[is_city & not_canceled].copy()
rh.head()

In [None]:
# sort country data

# Count non-canceled bookings per country; value_counts() returns them
# sorted from most to least frequent.
country_counts = full_data_cln.loc[full_data_cln["is_canceled"] == 0, "country"].value_counts()

# Name the count column explicitly. pandas < 2.0 names the value_counts()
# result after the source column ('country'), pandas >= 2.0 names it 'count',
# so renaming a column called 'country' afterwards only works on old versions.
# rename_axis(None) keeps the index unnamed, as it was on old versions.
country_data = country_counts.rename('guests number').rename_axis(None).to_frame()

# Keep the country code as a regular column for plotting, and add each
# country's share of all non-canceled bookings.
# Note: 'guests number' counts bookings (rows), not individual guests.
country_data['country'] = country_data.index
country_data['percent'] = country_data['guests number'] / country_data['guests number'].sum()
country_data.head()

In [None]:
# plot

# Shared label font; later cells reuse `font` for their axis labels.
font = {'family': 'Times New Roman', 'weight': 'normal', 'size': 20}

# Both panels show the ten most frequent countries.
country_data_plot = country_data.head(10)
country_pie_data = country_data.head(10)

# One figure, two panels: bar chart on the left, pie chart on the right.
fig, (bar_ax, ax) = plt.subplots(1, 2, figsize=(16, 9))

bar_ax.bar(country_data_plot['country'], country_data_plot['guests number'])
bar_ax.set_xlabel('Country', fontdict=font)
bar_ax.set_ylabel('Number', fontdict=font)

# The pie percentages are shares within these ten countries only.
ax.pie(x=country_pie_data['guests number'],
       labels=country_pie_data['country'],
       autopct='%.3f%%')

fig.suptitle('The Number of Guests From Different Countries', fontsize=20)
ax.legend()

&emsp;&emsp;Sorry that I can't use python to draw a map so I use tableau instead.

[![2VNMin.png](https://z3.ax1x.com/2021/05/30/2VNMin.png)](https://imgtu.com/i/2VNMin)

Analysis:  
&emsp;&emsp;From the pictures we can see that most of the guests come from PRT, GBR, FRA or other Europe countries. Here the guests from PRT account for about a third.  
&emsp;&emsp;But pay attention that we sum all the guests of different hotels together, so I also want to see the differences between the two hotels.

### 2. How does the price per night vary over the year?

In [None]:
# First let us calculate the average price per person

# Price per person = adr divided by adults + children (babies are not counted).
# The cleaning step only removes bookings where adults + children + babies is 0,
# so a booking with babies only can still have a zero divisor. Such a row would
# yield inf (or NaN when adr is 0) and distort the mean, so a zero head-count
# is treated as missing and the row is left out of the averages.
rh['adrpp'] = rh['adr'] / (rh['adults'] + rh['children']).replace(0, np.nan)
ch['adrpp'] = ch['adr'] / (ch['adults'] + ch['children']).replace(0, np.nan)
print("The price per person of Resort hotel is {:.2f}".format(rh['adrpp'].mean()))
print("The price per person of City hotel is {:.2f}".format(ch['adrpp'].mean()))

In [None]:
# Calendar order for the x axis (the data stores months by name).
months = ["January", "February", "March", "April", "May", "June",
          "July", "August", "September", "October", "November", "December"]

# Month and price-per-person columns of the Resort Hotel bookings, per arrival year.
rh_2015 = rh.loc[rh['arrival_date_year'] == 2015, ['arrival_date_month', 'adrpp']]
rh_2016 = rh.loc[rh['arrival_date_year'] == 2016, ['arrival_date_month', 'adrpp']]
rh_2017 = rh.loc[rh['arrival_date_year'] == 2017, ['arrival_date_month', 'adrpp']]

# Mean price per person for each month of each year.
# groupby + reindex replaces the former row-by-row DataFrame.append loops
# (append was removed in pandas 2.0, and building the empty frames from a set
# of column names is rejected by newer pandas). reindex(months) restores
# calendar order and leaves NaN for months with no bookings in that year.
rh_2015_plot, rh_2016_plot, rh_2017_plot = (
    year_slice.groupby('arrival_date_month')['adrpp'].mean()
              .reindex(months)
              .rename_axis('month')
              .reset_index()
    for year_slice in (rh_2015, rh_2016, rh_2017)
)

fig = plt.figure(figsize = (16, 9))
for label, monthly in (('Price in 2015', rh_2015_plot),
                       ('Price in 2016', rh_2016_plot),
                       ('Price in 2017', rh_2017_plot)):
    plt.plot(monthly['month'], monthly['adrpp'], label = label)
plt.title("Room price per night and person over the year [Resort Hotel]", fontsize=16)
plt.xlabel("Month", fontsize=16)
plt.xticks(rotation=45)
plt.ylabel("Price", fontsize=16)
plt.ylim(0, 120)
plt.legend()

In [None]:
# Calendar order for the x axis (the data stores months by name).
months = ["January", "February", "March", "April", "May", "June",
          "July", "August", "September", "October", "November", "December"]

# Month and price-per-person columns of the City Hotel bookings, per arrival year.
ch_2015 = ch.loc[ch['arrival_date_year'] == 2015, ['arrival_date_month', 'adrpp']]
ch_2016 = ch.loc[ch['arrival_date_year'] == 2016, ['arrival_date_month', 'adrpp']]
ch_2017 = ch.loc[ch['arrival_date_year'] == 2017, ['arrival_date_month', 'adrpp']]

# Mean price per person for each month of each year.
# groupby + reindex replaces the former row-by-row DataFrame.append loops
# (append was removed in pandas 2.0, and building the empty frames from a set
# of column names is rejected by newer pandas). reindex(months) restores
# calendar order and leaves NaN for months with no bookings in that year.
ch_2015_plot, ch_2016_plot, ch_2017_plot = (
    year_slice.groupby('arrival_date_month')['adrpp'].mean()
              .reindex(months)
              .rename_axis('month')
              .reset_index()
    for year_slice in (ch_2015, ch_2016, ch_2017)
)

fig = plt.figure(figsize = (16, 9))
for label, monthly in (('Price in 2015', ch_2015_plot),
                       ('Price in 2016', ch_2016_plot),
                       ('Price in 2017', ch_2017_plot)):
    plt.plot(monthly['month'], monthly['adrpp'], label = label)
plt.title("Room price per night and person over the year [City Hotel]", fontsize=16)
plt.xlabel("Month", fontsize=16)
plt.xticks(rotation=45)
plt.ylabel("Price", fontsize=16)
plt.ylim(0, 120)
plt.legend()


## 这一大段后来看的时候发现用groupby就可以解决了...就是需要调整下月份的顺序...当时经验不足

Analysis:  
&emsp;&emsp;From the pictures we see different trends. For the Resort hotel, the price is regular, which reachs its maximum in September (This is not surprising because September is in summer vacation). For the City hotel, the price has two peaks and one is in May. But the price actually is very stable during thw whole year.  
&emsp;&emsp;Besides, no matter which hotel, the price is increasing with the year.

### 3. Bookings by market segment

In [None]:
# The segment proportion of two hotels

fig = plt.figure(figsize = (16, 9))

# Bookings per market segment for each hotel.
# The count column is named explicitly: pandas >= 2.0 calls the value_counts()
# result 'count' rather than 'market_segment', so looking the column up by the
# source column's name only works on older versions. rename_axis(None) keeps
# the index unnamed, as it was on old versions.
rh_segment_pie = (rh['market_segment'].value_counts()
                  .rename('market_segment').rename_axis(None).to_frame())
ch_segment_pie = (ch['market_segment'].value_counts()
                  .rename('market_segment').rename_axis(None).to_frame())

# Left panel: Resort Hotel. Draw on the axes object directly so `ax` stays an
# Axes instead of being rebound to the tuple returned by plt.pie.
ax = fig.add_subplot(121)
ax.set_title('The Market Segment of Resort Hotel', fontsize = 14)
ax.pie(x = rh_segment_pie['market_segment'], labels = rh_segment_pie.index, autopct = '%.3f%%')

# Right panel: City Hotel.
ax = fig.add_subplot(122)
ax.set_title('The Market Segment of City Hotel', fontsize = 14)
ax.pie(x = ch_segment_pie['market_segment'], labels = ch_segment_pie.index, autopct = '%.3f%%')

plt.show()

Analysis:  
&emsp;&emsp;From the pie graphs we can see that the Online TA takes a large part, meaning that Internet booking has been a main current for hotel booking even the whole travel industry since 2015. This inspires merchants that they should expand Internet business and improving corresponding service. But according to my experience, some scenic spots or hotels have no good online software environment and depend on other software like Ctrip or Meituan Greatly.

In [None]:
# The price on different segments

# Mean price per person for each market segment of the Resort Hotel,
# with one bar per reserved room type.
resort_bar_options = dict(
    x="market_segment",
    y="adrpp",
    hue="reserved_room_type",
    data=rh,
    ci="sd",
    errwidth=0,   # zero-width error bars, i.e. effectively hidden
    capsize=0.1,
)

plt.figure(figsize=(16, 9))
resort_ax = sns.barplot(**resort_bar_options)

resort_ax.set_title("ADR by market segment and room type [Resort Hotel]", fontsize=16)
resort_ax.set_xlabel("Market segment", fontsize=16)
resort_ax.set_ylabel("ADR per person", fontsize=16)
resort_ax.tick_params(axis="x", labelrotation=45)
resort_ax.set_ylim(0, 300)
resort_ax.legend()
plt.show()

In [None]:
# Same chart for the City Hotel: mean price per person for each market
# segment, with one bar per reserved room type.
city_bar_options = dict(
    x="market_segment",
    y="adrpp",
    hue="reserved_room_type",
    data=ch,
    ci="sd",
    errwidth=0,   # zero-width error bars, i.e. effectively hidden
    capsize=0.1,
)

plt.figure(figsize=(16, 9))
city_ax = sns.barplot(**city_bar_options)

city_ax.set_title("ADR by market segment and room type [City Hotel]", fontsize=16)
city_ax.set_xlabel("Market segment", fontsize=16)
city_ax.tick_params(axis="x", labelrotation=45)
city_ax.set_ylabel("ADR per person", fontsize=16)
city_ax.legend()
city_ax.set_ylim(0, 300)
plt.show()

We can see that the price of aviation is very high. Now let's find the reason. 

In [None]:
# Summary statistics for City Hotel bookings in the Aviation market segment.
aviation_cols = ['is_canceled', 'adults', 'children', 'babies',
                 'lead_time', 'adrpp', 'required_car_parking_spaces']
aviation = ch.loc[ch['market_segment'] == 'Aviation', aviation_cols]
aviation.describe()

In [None]:
# The same summary for every other City Hotel market segment, for comparison.
comparison_cols = ['is_canceled', 'adults', 'children', 'babies',
                   'lead_time', 'adrpp', 'required_car_parking_spaces']
non_aviation = ch.loc[ch['market_segment'] != 'Aviation', comparison_cols]
non_aviation.describe()

We can find that the lead time is very short for aviation booking, and the booking often has just one adult but the guests of non aviation booking order seem like families.  

### 4. Which are the most busy months?

In [None]:
# Non-canceled bookings per arrival month for each hotel, in calendar order.
# Relies on `months` and `font` defined in earlier cells.
#
# The count column is named explicitly because pandas >= 2.0 calls the
# value_counts() result 'count' instead of 'arrival_date_month'.
# reindex(..., fill_value=0) orders the rows by calendar month and, unlike
# .loc[months], does not raise if a month has no bookings.
rh_month_guests = (rh['arrival_date_month'].value_counts()
                   .reindex(months, fill_value=0)
                   .rename('arrival_date_month').rename_axis(None).to_frame())
ch_month_guests = (ch['arrival_date_month'].value_counts()
                   .reindex(months, fill_value=0)
                   .rename('arrival_date_month').rename_axis(None).to_frame())

# NOTE(review): these are raw totals over the whole dataset. The preface says
# the data runs from July 2015 to August 2017, so July and August would be
# counted for three years and every other month for two — confirm before
# comparing months directly.

fig = plt.figure(figsize = (16, 9))

# Side-by-side bars: shift each hotel's bars by half a bar width around the tick.
bar_width = 0.4
x = list(range(len(months)))
x_rh = [pos - bar_width / 2 for pos in x]
x_ch = [pos + bar_width / 2 for pos in x]

plt.bar(x_rh, rh_month_guests['arrival_date_month'], width = bar_width, label = 'Resort Hotel')
plt.bar(x_ch, ch_month_guests['arrival_date_month'], width = bar_width, label = 'City Hotel')
# Overlay a line through the bar tops to make the seasonal trend easier to follow.
plt.plot(x_rh, rh_month_guests['arrival_date_month'])
plt.plot(x_ch, ch_month_guests['arrival_date_month'])
plt.ylabel('Number of Booking', font)
plt.xlabel('Month', font)
plt.xticks(x, months, rotation=45)
plt.title('The Booking Number of different months', font)
plt.legend()