In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
## Import libraries
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

## Render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline

# Hide warnings from displaying in jupyter notebook
# NOTE(review): filterwarnings('ignore') silences every warning category, so
# deprecation warnings from the plotting/data calls below stay hidden as well —
# consider narrowing the filter once the notebook is stable.
import warnings
warnings.filterwarnings('ignore')

In [None]:
## List the files available in the hotel-booking-demand input directory
os.listdir('../input/hotel-booking-demand')

In [None]:
## Load the hotel bookings CSV into the main DataFrame: df
csv_path = '../input/hotel-booking-demand/hotel_bookings.csv'
df = pd.read_csv(csv_path)

In [None]:
## Preview the first five rows of the data frame
df.head(n=5)

In [None]:
## Print a summary of the data frame's columns (names, non-null counts, dtypes)
df.info()

<h2 style="color:LightSlateGray">DataSet Glossary</h2>
<p><b>hotel</b>: H1 = Resort Hotel or H2 = City Hotel

<b>is_canceled</b>: Booking was canceled (1) or not (0)

<b>lead_time</b>: Number of days that elapsed between the entering date of the booking into the PMS and the arrival date

<b>arrival_date_year</b>: Year of arrival date

<b>arrival_date_month</b>: Month of arrival date

<b>arrival_date_week_number</b>: Week number of year for arrival date

<b>arrival_date_day_of_month</b>: Day of arrival date

<b>stays_in_weekend_nights</b>: Number of weekend nights (Saturday or Sunday) the guest stayed or booked to stay at the hotel

<b>stays_in_week_nights</b>: Number of week nights (Monday to Friday) the guest stayed or booked to stay at the hotel

<b>adults</b>: Number of adults

<b>children</b>: Number of children

<b>babies</b>: Number of babies

<b>meal</b>: 
•	Type of meal booked. Categories are presented in standard hospitality meal packages: Undefined/SC – no meal package. 
•	BB – Bed and Breakfast. 
•	HB – Half board (breakfast and one other meal – usually dinner); 
•	FB – Full board (breakfast, lunch and dinner)

<b>country</b>: Country of origin. Categories are represented in the ISO 3166-1 alpha-3 (three-letter country code) format

<b>market_segment</b>: Market segment designation. In categories, the term “TA” means “Travel Agents” and “TO” means “Tour Operators”

<b>distribution_channel</b>: Booking distribution channel. The term “TA” means “Travel Agents” and “TO” means “Tour Operators”

<b>is_repeated_guest</b>: Value indicating if the booking name was from a repeated guest (1) or not (0)

<b>previous_cancellations</b>: Number of previous bookings that were cancelled by the customer prior to the current booking

<b>previous_bookings_not_canceled</b>: Number of previous bookings not cancelled by the customer prior to the current booking

<b>reserved_room_type</b>: Code of room type reserved. Code is presented instead of designation for anonymity reasons.

<b>assigned_room_type</b>: Code for the type of room assigned to the booking. Sometimes the assigned room type differs from the reserved room type due to hotel operation reasons (e.g. overbooking) or by customer request. Code is presented instead of designation for anonymity reasons.

<b>booking_changes</b>: Number of changes/amendments made to the booking from the moment the booking was entered on the PMS until the moment of check-in or cancellation 

<b>deposit_type</b>: 
Indication on if the customer made a deposit to guarantee the booking. This variable can assume three categories: 
•	No Deposit – no deposit was made; 
•	Non Refund – a deposit was made in the value of the total stay cost; 
•	Refundable – a deposit was made with a value under the total cost of stay.

<b>agent</b>: ID of the travel agency that made the booking

<b>company</b>: ID of the company/entity that made the booking or responsible for paying the booking. ID is presented instead of designation for anonymity reasons

<b>days_in_waiting_list</b>: Number of days the booking was in the waiting list before it was confirmed to the customer

<b>customer_type</b>: Type of booking, assuming one of four categories: 
•	Contract - when the booking has an allotment or other type of contract associated to it; 
•	Group – when the booking is associated to a group; 
•	Transient – when the booking is not part of a group or contract, and is not associated to other transient booking; 
•	Transient-party – when the booking is transient, but is associated to at least other transient booking 

<b>adr</b>: Average Daily Rate as defined by dividing the sum of all lodging transactions by the total number of staying nights

<b>required_car_parking_spaces</b>: Number of car parking spaces required by the customer

<b>total_of_special_requests</b>: Number of special requests made by the customer (e.g. twin bed or high floor)

<b>reservation_status</b>: Reservation last status, assuming one of three categories: 
•	Canceled – booking was canceled by the customer; 
•	Check-Out – customer has checked in but already departed; 
•	No-Show – customer did not check-in and did inform the hotel of the reason why

<b>reservation_status_date</b>: Date at which the last status was set. This variable can be used in conjunction with the Reservation Status to understand when the booking was canceled or when the customer checked out of the hotel
    </p>

<h2 style="color:LightSlateGray">Cleaning Data</h2>

In [None]:
## Count the missing values in every column
df.isna().sum()

<b>Four columns contain null values: children, country, agent, and company</b>

In [None]:
## Since only 4 rows miss data for children, we can just fill them with 0.
## Assign the filled column back to df instead of calling
## fillna(inplace=True) on the df['children'] selection: with pandas
## Copy-on-Write the selection is a temporary object, so the in-place fill
## would leave df itself unchanged.
df['children'] = df['children'].fillna(0)

In [None]:
## Statistical overview of each numeric column
df.describe()

In [None]:
## Correlation overview between the numeric columns.
## The frame also holds string columns (hotel, meal, country, ...); recent
## pandas versions no longer drop them silently in corr() and raise instead,
## so restrict the frame to numeric dtypes explicitly.
df.select_dtypes(include='number').corr()

<h2 style="color:LightSlateGray">General Analysis on Booking Count</h2>

In [None]:
# Number of bookings for each hotel type, shown as a one-column frame
df['hotel'].value_counts().to_frame()

This data only has two types of hotels, and there are more bookings for the city hotel.

In [None]:
## Total bookings ('count') and canceled bookings ('sum') per hotel type and arrival year
df.pivot_table(
    index=['hotel', 'arrival_date_year'],
    values='is_canceled',
    aggfunc=['count', 'sum'],
)

The datatype of column "is_canceled" is int64: 1 represents a canceled booking and 0 a booking that was not canceled. <br>
We can therefore use the count function to get the total number of bookings, and sum the ones in this column to get the number of canceled bookings.

In [None]:
## Month names in calendar order, used to order the month axis in later plots
Month = (
    'January February March April May June '
    'July August September October November December'
).split()

In [None]:
## Count bookings per arrival month, with one panel per arrival year
c = sns.catplot(
    data=df,
    kind='count',
    x='arrival_date_month',
    col='arrival_date_year',
    order=Month,
)
c.set_xticklabels(rotation=60)

This data is from July 2015 to August 2017

<h2 style="color:LightSlateGray">Where are visitors from?</h2>

In [None]:
## Import libraries needed for choropleth maps
import plotly.graph_objs as go 
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [None]:
## Number of bookings per country of origin
country_data = df.country.value_counts()
country_data

In [None]:
## Describe the choropleth trace as a plain dictionary:
## one location per country code, colored by its booking count
data = {
    'type': 'choropleth',
    'locations': country_data.index,
    'z': country_data,
}

In [None]:
## Layout of the map: title, no frame, Mercator projection
layout = {
    'title': 'Where are the Visitors Coming from',
    'geo': {
        'showframe': False,
        'projection': {'type': 'mercator'},
    },
}

In [None]:
## Build the figure from the trace and the layout, then render the world map
choromap = go.Figure(layout=layout, data=[data])
iplot(choromap)

The World Visitors choropleth map shows the number of bookings by country of origin. Most visitors come from Western Europe, with France, the UK and Portugal having the highest counts.

<h2 style="color:LightSlateGray">Visitors Count</h2>

In [None]:
# Count bookings by number of adults, split by hotel type.
plt.figure(figsize=(15, 8))
sns.countplot(data=df, x='adults', hue='hotel', palette='pastel')
plt.title("Number of adults in both hotels", size=20, fontweight="bold")

In [None]:
# Build a new dataframe with hotel, adults, children and babies, plus a
# derived 'kids' column (children + babies).
# .assign() returns a new DataFrame, so the new column is never written onto
# a slice of df (the previous column assignment on the slice triggers
# SettingWithCopyWarning).
df2 = df[['hotel', 'adults', 'children', 'babies']].assign(
    kids=lambda frame: frame['children'] + frame['babies']
)
df2

In [None]:
# Count bookings by number of kids (children + babies), split by hotel type.
plt.figure(figsize=(15, 8))
sns.countplot(data=df2, x='kids', hue='hotel', palette='coolwarm')
plt.title("Number of kids in both hotels", size=20, fontweight="bold")

plt.show()

In [None]:
# Build a new dataframe with the party and stay columns plus two derived ones:
#   total_nights = weekend nights + week nights
#   total_people = adults + children + babies
# .assign() returns a new DataFrame, avoiding column writes onto a slice of df
# (which trigger SettingWithCopyWarning).
df6 = df[['hotel', 'adults', 'children', 'babies',
          'stays_in_weekend_nights', 'stays_in_week_nights']].assign(
    total_nights=lambda frame: frame['stays_in_weekend_nights'] + frame['stays_in_week_nights'],
    total_people=lambda frame: frame['adults'] + frame['children'] + frame['babies'],
)

In [None]:
# Share of bookings for each party size (total people per booking).
df6['total_people'].value_counts().plot(kind='pie', autopct='%1.1f%%', shadow=True)

In [None]:
# Share of bookings for each length of stay (total nights per booking).
df6['total_nights'].value_counts().plot(kind='pie', autopct='%1.1f%%', shadow=True)

In [None]:
# Average total nights for each party size.
sns.barplot(data=df6, x='total_people', y='total_nights')

### Conclusion
Parties of 2 adults with no kids are the most frequent party size for both hotels; in particular, the City hotel has about twice as many bookings as the Resort hotel. In addition, we counted the total number of people per booking (adults, children, and babies) and found that a party size of 2 accounts for 68.7% of all bookings, the highest share. We also counted the total nights per stay (week nights plus weekend nights) and found that 2-night stays account for 23.2% of bookings, slightly ahead of the others; 3-night stays account for 22.7%, the second most common length of stay. The shares drop dramatically beyond 4 nights. Comparing total people against total nights did not reveal much; however, from the bar plot above we can roughly say that parties of 12 people stay about 8 nights, the longest stay of any party size. Moreover, parties of 6 to 26 people usually stay more nights than other party sizes.

<h2 style="color:LightSlateGray">Analyze data about cancellation</h2>

In [None]:
# Calculate the overall cancellation percentage.
# is_canceled holds 1 for canceled and 0 for kept bookings, so its sum is the
# number of canceled bookings. Unlike value_counts()[label], this does not
# raise KeyError when one of the two classes is absent, and the column is
# only scanned once.
total_bookings = len(df)
total_canceled = int(df['is_canceled'].sum())
print(f' Booking is canceled: {total_canceled}')
total_not_canceled = total_bookings - total_canceled
print(f' Booking is not canceled: {total_not_canceled}')
cancelation_rate = total_canceled / total_bookings * 100
print(f' Cancelation rate is {cancelation_rate:.2f}%')

In [None]:
## Keep only the bookings that were canceled in a new dataframe: canceled_table
canceled_table = df.loc[df['is_canceled'] == 1]
## Number of canceled bookings for each hotel type
canceled_table['hotel'].value_counts()

In [None]:
# Cancellation percentage per hotel type: canceled bookings / all bookings.
# Scale to percent first and format with a fixed two-decimal precision;
# rounding the fraction and multiplying by 100 afterwards can leak binary
# floating-point artifacts (long trailing digits) into the displayed strings.
cancel_share_by_hotel = canceled_table['hotel'].value_counts() / df['hotel'].value_counts()
cancel_share_by_hotel.mul(100).map('{:.2f}%'.format)

<h5>Cancellation by year and month</h5>

In [None]:
# Cancellation rate by arrival year (left) and arrival month (right), split
# by hotel type. sns.barplot shows the mean of is_canceled per group, i.e.
# the share of canceled bookings.
# Explicit axes replace the implicit pyplot "current subplot" state, and the
# month axis reuses the Month list defined earlier in the notebook instead of
# redefining it here.
fig, (ax_year, ax_month) = plt.subplots(1, 2, figsize=(30, 8))

# Cancellation by year
sns.barplot(data=df, x='arrival_date_year', y='is_canceled', hue='hotel', ax=ax_year)
ax_year.set_title('cancellation percentage by year', fontweight="bold", size=25)
ax_year.set_xlabel('year', size=20)
ax_year.set_ylabel('cancellation percentage', size=20)

# Cancellation by month, in calendar order
sns.barplot(data=df, x='arrival_date_month', y='is_canceled', order=Month, hue='hotel', ax=ax_month)
ax_month.set_title('cancellation percentage by month', fontweight="bold", size=25)
ax_month.set_xlabel('month', size=20)
ax_month.set_ylabel('cancellation percentage', size=20)

<h2 style="color:LightSlateGray">Analysis of Lead Time and Cancellation Rate</h2>

In [None]:
## Total number of bookings for each lead_time value
lead_time_count = df['is_canceled'].groupby(df['lead_time']).count()
lead_time_count

In [None]:
# Distribution of the lead time of the bookings.
# sns.distplot is deprecated; histplot with stat='density' and kde=True draws
# the equivalent normalized histogram with a KDE curve on top.
plt.figure(figsize=(10, 5))
sns.histplot(x=df['lead_time'], bins=10, stat='density', kde=True)
plt.title('Distribution of lead time', fontsize=20)

In [None]:
# Number of canceled bookings for each lead_time value
lead_time_canceled_count = canceled_table['hotel'].groupby(canceled_table['lead_time']).count()
lead_time_canceled_count

In [None]:
# Cancellation rate per lead time = canceled bookings / all bookings.
# Lead times without a single cancellation do not appear in
# lead_time_canceled_count. Align it to every observed lead time and count
# those as 0, so they get a rate of 0.0 instead of NaN. Dropping the NaN rows
# (the previous behaviour) removed every zero-cancellation lead time from the
# regression and correlation that follow.
canceled_per_lead_time = lead_time_canceled_count.reindex(lead_time_count.index, fill_value=0)
cancellation_rate = (canceled_per_lead_time / lead_time_count).rename('cancellation_rate')
cancellation_rate

In [None]:
# Show the cancellation-rate series as a one-column data frame (displayed only; the result is not stored)
cancellation_rate.to_frame()

In [None]:
# Regression plot of lead time (x) against cancellation rate (y).
# x and y are passed by keyword: recent seaborn versions accept only `data`
# positionally, so regplot(x_values, y_values) raises a TypeError there.
plt.figure(figsize=(10, 8))
sns.regplot(x=cancellation_rate.index, y=cancellation_rate.values)
plt.title('Lead Time vs Cancellation Rate', fontsize=20)
plt.xlabel('lead_time')
plt.ylabel('cancellation_rate')

In [None]:
#  Pearson's correlation test of lead time and cancellation rate
from scipy.stats import pearsonr

stat, p = pearsonr(cancellation_rate.index, cancellation_rate.values)
print('stat=%.3f, p=%.3f' % (stat, p))
# A p-value above 0.05 is reported as independent, anything else as dependent
print('Probably independent' if p > 0.05 else 'Probably dependent')

### Conclusion
The x axis shows the lead time in days; the y axis shows the cancellation rate. We used a regression plot of lead time and cancellation rate to reveal the relationship: it draws a scatter plot of the two variables together with a fitted linear regression line and its confidence band. Overall, it suggests that visitors who book hotels far in advance are more likely to cancel, whereas last-minute bookings have a lower cancellation rate.

<h2 style="color:LightSlateGray">What are some facts about the daily rates?</h2>

In [None]:
# First of all, visualize the Average Daily Rate (adr) for the entire data.
# Pass the column as x= explicitly: recent seaborn versions treat the first
# positional argument as `data`, not as the x variable.
sns.boxplot(x=df['adr'])

There is an erroneous value (over 5000) that needs to be removed. Let's remove it and try again.

In [None]:
## Get rid of the outlier: keep only bookings whose adr is below 5000.
## NOTE: this rebinds df, so every later cell works on the filtered frame.
df = df[df['adr'] < 5000]
## Pass the column as x= explicitly: recent seaborn versions treat the first
## positional argument as `data`, not as the x variable.
sns.boxplot(x=df['adr'])

<h5>Which factors affect daily price?</h5>

In [None]:
## Mean daily rate for each hotel type
df.pivot_table(index=['hotel'], values='adr', aggfunc='mean')

In general, city hotel is more expensive than resort hotel

In [None]:
## Distribution of the daily rate per arrival year, split by hotel type
sns.boxplot(data=df, x='arrival_date_year', y='adr', hue='hotel')

In [None]:
## Average daily rate per arrival month, split by hotel type.
## Month is the list that orders the months from January to December.
plt.figure(figsize=(14, 8))
sns.barplot(x='arrival_date_month', y='adr', hue='hotel', order=Month, data=df)

The price of the resort hotel fluctuates more than that of the city hotel.<br>
The average daily rate for resort hotel is more expensive in summer season (July & August are peak months). <br>
City hotel's daily price is always between 75 and 125.

In [None]:
## Average daily rate for each market segment (booking source)
plt.figure(figsize=(12, 6))
sns.barplot(data=df, x='market_segment', y='adr')

If visitors book hotels directly or through an Online Travel Agency (Online TA), the price tends to be higher than for other booking sources.

<h5>Check price based on room types</h5>

In [None]:
## Room type codes in alphabetical order for the x axis
letter_order = list('ABCDEFGHP')

## Draw on a larger canvas
plt.figure(figsize=(12, 8))

## Daily rate distribution per reserved room type, split by hotel type
sns.boxplot(data=df, x='reserved_room_type', y='adr', hue='hotel', order=letter_order)
plt.title('Room Type vs Average Daily Rate')

Since the price of the resort hotel is seasonal, we examine the price of the city hotel based on room type.<br>
This boxplot shows that the prices for room types A, B, and C are similar, so they should be standard room types.<br>
Alphabetically, the price gradually increases from room type D to G.

<h5>Apply ECDF</h5>

In [None]:
#create a function that takes adr as argument and return x and y.
def ecdf(adr):
    """Return the x and y coordinates of the empirical CDF of ``adr``.

    x holds the values of ``adr`` sorted in ascending order; y runs from
    1/n to 1 in steps of 1/n, where n is the number of data points, so
    y[i] is (i + 1) / n.
    """
    sorted_values = np.sort(adr)
    count = len(adr)
    cumulative_share = np.arange(1, count + 1) / count
    return sorted_values, cumulative_share

In [None]:
# Apply the ECDF definition to the booking price (adr) of each hotel type and
# draw both curves on the same plot.
plt.figure(figsize=(12, 8))
x_city, y_city = ecdf(df.loc[df['hotel'] == 'City Hotel', 'adr'])
x_resort, y_resort = ecdf(df.loc[df['hotel'] == 'Resort Hotel', 'adr'])
# Label each series where it is drawn so the legend cannot drift out of sync
# with the plotting order. The return values of the plotting calls are not
# needed, so they are no longer bound to a throwaway variable.
plt.plot(x_city, y_city, marker='.', linestyle='none', label='City Hotel')
plt.plot(x_resort, y_resort, marker='.', linestyle='none', label='Resort Hotel')
# Label the plot
plt.legend(loc='lower right')
plt.xlabel('Booking price')
plt.ylabel('percentage')
# Display the plot
plt.show()

70 % of the city hotel booking price is under 100 while 50 % of the resort hotel booking price is under 100.

<h5>Let's compare different room types (use 2017 city hotel data)</h5>

In [None]:
## Select the sample: bookings arriving in 2017, then only the City Hotel ones
df2017 = df.loc[df['arrival_date_year'] == 2017]
city = df2017.loc[df2017['hotel'] == 'City Hotel']

In [None]:
## Number of sampled bookings for each reserved room type
city.reserved_room_type.value_counts()

I don't want to compare the price for room type C and P since the sample population is too small

In [None]:
## Extract the room type and price data
df_anova = city[['reserved_room_type', 'adr']]

## Group the prices by room type.
## Group by the column name itself rather than a one-element list: with a
## list key, newer pandas versions expect tuple group names such as ('A',)
## and warn about (or reject) the scalar get_group("A") lookups that the
## ANOVA cells perform on this object.
grouped_anova = df_anova.groupby("reserved_room_type")

In [None]:
## Import needed library
from scipy import stats

In [None]:
# One-way ANOVA via scipy's f_oneway on the adr samples of two room types:
# here room type A against room type D.
anova_result_1 = stats.f_oneway(
    *(grouped_anova.get_group(room_type)["adr"] for room_type in ("A", "D"))
)

print( "ANOVA results: F=",anova_result_1)

The prices between room type <b>A and D</b> are significantly different since the F-score is very large. F equals 5110, and the p-value is small.

In [None]:
# One-way ANOVA via scipy's f_oneway on the adr samples of two room types:
# here room type A against room type B.
anova_result_1 = stats.f_oneway(
    *(grouped_anova.get_group(room_type)["adr"] for room_type in ("A", "B"))
)

print( "ANOVA results: F=",anova_result_1)

The prices between room type A and room type B are not significantly different, as the F-test score is less than 1 and p-value is larger than 0.05

<h5>City Hotel: Lead Time vs Price (use 2017 data)</h5>

In [None]:
## June 2017 bookings: lead time against adr, one regression line per hotel type
test = df2017.loc[df2017['arrival_date_month'] == 'June']
sns.lmplot(
    data=test,
    x='lead_time',
    y='adr',
    hue='hotel',
    scatter_kws={'s': 10, 'alpha': 0.15},
)

In [None]:
## Import needed library
from scipy import stats

In [None]:
## Narrow the 2017 data to City Hotel bookings (less affected by the summer
## season), then to those from the 'Direct' market segment
test_city = df2017.loc[df2017['hotel'] == 'City Hotel']
test_city_direct = test_city.loc[test_city['market_segment'] == 'Direct']
test_city_direct

In [None]:
## Pearson correlation between lead time and adr for direct City Hotel bookings
pearson_coef, p_value = stats.pearsonr(
    test_city_direct.lead_time,
    test_city_direct.adr,
)
print(
    "The Pearson Correlation Coefficient is",
    pearson_coef,
    " with a P-value of P =",
    p_value,
)

<p>Since the p-value is $>$ 0.1 and the linear relationship isn't extremely strong, the correlation between lead time and price is not significant.</p>

<h3>Average Daily Rate Conclusion</h3>
<p>The average daily rate for both hotels is 101.8, where the city hotel averages 105.23 and the resort hotel 94.95. <br>
The price for the <b>resort hotel</b> varies by season: over the summer season it is more expensive than in other months.<br>
The price for the <b>city hotel</b> varies by room type.<br>
Also, the source of the booking affects the daily rate: it is more expensive to book directly or through an online travel agent.<br>
<b>No significant trend in daily rate</b> by lead time.
</p>