In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Import all the necessary packages for EDA

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
# Input data files are available in the read-only "../input/" directory

# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load dataset
df = pd.read_csv('/kaggle/input/hotel-booking-demand/hotel_bookings.csv')

## Getting to know your data

In [None]:
df.head()

In [None]:
# There are too many columns to show with the default settings, so remove the
# limit on the number of displayed columns and preview the first rows again.
pd.set_option('display.max_columns', None)
df.head()

In [None]:
df.info()

In [None]:
df.describe()

## Let's visualize our data!

# Count Plot

In [None]:
# Count plot: number of bookings per hotel type.
sns.set(style='whitegrid')
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(x='hotel', data=df, palette='Set1', ax=ax)
ax.set_title(label='Hotel', size=30, color='b')
plt.show()

About two thirds of the customers booked City Hotel.

# Box Plot

In [None]:
# Box plot of lead_time for each value of is_canceled, split by hotel.
sns.set(style='ticks')
plt.figure(figsize=(15,12))
ax = sns.boxplot(x=df['is_canceled'],y=df['lead_time'],hue=df['hotel'])
ax.set_title('Lead_Time_Box_Plot', size=25)
# is_canceled is on the x-axis and lead_time on the y-axis, so label them that
# way round (the labels used to be swapped).
plt.xlabel('is canceled',fontsize=20)
plt.ylabel('lead time',fontsize=20)
plt.show()

From the box plot, we can conclude that when lead_time is higher, there is a higher chance that a customer cancels the reservation, which makes sense. When a customer books a hotel 100 days (about 3 months) ahead, it is quite likely that he or she will change the itinerary and cancel the reservation. Besides, for both cancelled and non-cancelled bookings, the typical lead time does not differ much between the resort hotel and the city hotel.

Let's try to use boxen plot to visualize it!

# Boxen Plot

In [None]:
# Boxen plot of lead_time for each value of is_canceled, split by hotel.
sns.set(style='ticks')
plt.figure(figsize=(15,12))
ax = sns.boxenplot(x=df['is_canceled'],y=df['lead_time'],hue=df['hotel'])
# The title now names the plot type actually drawn (it used to say "Box_Plot").
ax.set_title('Lead_Time_Boxen_Plot', size=25)
# is_canceled is on the x-axis and lead_time on the y-axis, so label them that
# way round (the labels used to be swapped).
plt.xlabel('is canceled',fontsize=20)
plt.ylabel('lead time',fontsize=20)
plt.show()

Let's take a look at the arrival date week number. My hypothesis would be there will be more hotel reservations in July, August because it's summer time.

In [None]:
# Number of distinct arrival weeks, then reservations per week from most to least frequent
week_numbers = df['arrival_date_week_number']
print(week_numbers.nunique())
print(week_numbers.value_counts().sort_values(ascending=False))

# Histogram

Let's use a histogram to visualize how the number of hotel reservations is distributed across arrival date week numbers.

In [None]:
# Histogram of reservations per arrival week number.
sns.set(style='ticks')
plt.figure(figsize=(20,12))
# Bin edges at -0.5, 0.5, ..., 53.5 give every integer week number its own
# unit-wide bin centred on that integer.
# NOTE: plt.hist returns (counts, bin_edges, patches), so `ax` is a tuple here, not an Axes.
ax = plt.hist(x=df['arrival_date_week_number'],bins=np.arange(55)-0.5,facecolor='blue',alpha=0.5)
plt.xlabel('arrival_date_week_number',size=20)
plt.ylabel('count',size=20)
plt.title('arrival_date_week_number_count',size=30)
# Tick every tenth week (1, 11, ..., 51).
plt.xticks(range(1,54,10))

plt.show()

My hypothesis was right!
<br><br> Indeed, hotel reservations reach their peak around July and August.

Let's take a look at previous cancellations. 
<br> If a customer has cancelled before, he or she should be more likely to cancel again.

We take a look at the correlation between 'previous cancellations' and 'is cancelled' first.

In [None]:
df[['is_canceled','previous_cancellations']].corr()

 Correlation is only 0.1101. Let's check the distribution of previous cancellations.

In [None]:
# Sort previous cancellations by index (cancellation times)
df['previous_cancellations'].value_counts().sort_index()

Let's take a look at the cancellation rate for customers who have cancelled more than 10 times before!

In [None]:
# Percentage of cancelled bookings among rows with more than 10 previous cancellations
heavy_cancellers = df[df['previous_cancellations'] > 10]
rate_pct = round(heavy_cancellers['is_canceled'].mean() * 100, 2)
print('Cancellation rate for customers who have canceled more than 10 times before:',
      str(rate_pct) + '%')


85.56%! 
<br>Let's dive in further and look at the cancellation rate for each number of previous cancellations.

In [None]:
# Distinct previous_cancellations values as a list, in ascending order
a = sorted(df['previous_cancellations'].value_counts().index.to_list())
a

In [None]:
# Mean of is_canceled (rounded to 2 decimals) for each previous_cancellations value in `a`
b = [
    round(df.loc[df['previous_cancellations'] == ccltime, 'is_canceled'].mean(), 2)
    for ccltime in a
]
b

In [None]:

df[df['previous_cancellations']==1]['is_canceled'].value_counts()

In [None]:
round(df[df['previous_cancellations']==1]['is_canceled'].mean(),2)

To our surprise, among customers who previously had exactly one hotel reservation cancellation, 94% cancelled again. Notice that we have a fairly large amount of data for this group, too (about 6,000 data points).
<br><br> Let's dig a little further!

Notice there is a column "previous_bookings_not_canceled" next to the column "previous_cancellations". A customer with 0 previous cancellations may either have never booked before or have kept every reservation he or she made. Thus, looking only at the number of previous cancellations does not give us all the embedded information. We define a new variable, "previous cancellation rate", as previous cancellations divided by total previous reservations. Let's see if the new feature makes more sense in our data.

In [None]:
# Define a new feature, previous_cancellation_rate: previous cancellations divided by
# all previous bookings (cancelled + not cancelled).
# Rows where both counts are 0 divide 0 by 0 and therefore become NaN.
df['previous_cancellation_rate']=df['previous_cancellations']/(df['previous_cancellations']+df['previous_bookings_not_canceled'])

In [None]:
df['previous_cancellation_rate'].value_counts(dropna=False)

There are 109933 NaN values for previous_cancellation_rate. They are possibly caused by a zero denominator. Let's check whether that is the case.

In [None]:
(df['previous_cancellations']+df['previous_bookings_not_canceled']).value_counts()

There are 109933 "0" values for the denominator, which confirms our hypothesis.

Let's divide "previous_cancellation_rate" into 10 equal length intervals and compare their mean of "is_canceled".

In [None]:
df.sort_values('previous_cancellation_rate')

In [None]:
# Bin edges 0.0, 0.1, ..., 1.0 (eleven edges -> ten equal-width intervals)
l = [i/10 for i in range(11)]
# Bucket the previous cancellation rate into those intervals; include_lowest keeps 0.0 in the first one
df['previous_cancellation_rate_interval'] = pd.cut(
    x=df['previous_cancellation_rate'],
    bins=l,
    include_lowest=True,
)
df

Check value counts

In [None]:
df['previous_cancellation_rate_interval'].value_counts()

In [None]:
# Mean of every numeric column (including is_canceled) per previous cancellation rate interval.
# numeric_only=True is required because the frame also holds string columns:
# recent pandas versions raise a TypeError instead of silently skipping them.
df.groupby('previous_cancellation_rate_interval').mean(numeric_only=True)

This data makes more sense! Overall, when a customer is in a higher previous cancellation rate interval, he or she is more inclined to cancel the reservation this time.
<br>We also need to check those customers who have not previously booked a reservation.

In [None]:
# Column means for bookings without any previous booking history (rate is NaN).
# numeric_only=True skips the string/categorical columns, which would otherwise
# make DataFrame.mean raise a TypeError on recent pandas versions.
df[df['previous_cancellation_rate'].isna()].mean(numeric_only=True)

34.76%! Without any previous booking data, a customer has roughly a 35% chance of cancelling his or her reservation.

We will use a scatter plot to visualize this. Since it's very difficult to deal with "NaN" values in previous cancellation rate, we will keep previous cancellations and previous bookings not canceled for machine learning purpose.
<br> For the scatter plot, we will use the mean of previous cancellation rate in each interval on the X-axis.

# Scatter Plot

In [None]:
# Column previous_cancellation_rate_r: mean previous cancellation rate of the row's interval.
interval_col = 'previous_cancellation_rate_interval'
mean_prev_rate = df.groupby(interval_col)['previous_cancellation_rate'].mean()
df = df.join(mean_prev_rate, on=interval_col, rsuffix='_r')
df = df.sort_values(by='previous_cancellation_rate_r')
# Column is_canceled_r: mean of is_canceled (cancellation rate of the current booking) of the row's interval.
mean_is_canceled = df.groupby(interval_col)['is_canceled'].mean()
df = df.join(mean_is_canceled, on=interval_col, rsuffix='_r')

In [None]:
df['previous_cancellation_rate_r'].value_counts()

In [None]:
# Define X,Y for scatter plot.
# Rows without a previous-cancellation interval carry NaN in both columns. They
# cannot be drawn, yet they used to be passed to np.unique and so polluted the
# unique rows and the counts that drive the marker sizes. Drop them first.
pairs = df[['previous_cancellation_rate_r', 'is_canceled_r']].dropna()
x = pairs['previous_cancellation_rate_r']
y = pairs['is_canceled_r']
# u: distinct (x, y) pairs, c: number of bookings sharing each pair
u, c = np.unique(np.c_[x,y], return_counts=True, axis=0)

In [None]:
# Marker-size function: min-max scale the counts to [0, 1], shift to [1, 2],
# multiply by 20 and square, giving sizes between 400 and 1600.
# NOTE(review): divides by zero if every count is identical (max == min).
s = lambda x : (((x-x.min())/float(x.max()-x.min())+1)*20)**2
# Scatter Plot: one marker per distinct (x, y) pair in u, sized by its count in c
plt.figure(figsize = (20,12))
plt.scatter(u[:,0],u[:,1],s=s(c))
# Add trend line (regression fitted on the per-row columns of df, drawn on the same axes)
sns.regplot(x='previous_cancellation_rate_r',y='is_canceled_r',data=df, fit_reg=True) 
plt.xlabel('previous cancellation rate',size=20)
plt.ylabel('last booking cancellation rate',size=20)
plt.title('Previous and last booking cancellation rate',size=30)
plt.show()

From the scatter plot along with the trend line, we can confirm that the previous cancellation rate is a good indicator of the cancellation rate of the last booking. Let's use a bar plot to illustrate this again.

# Bar Plot

In [None]:
# Bar plot: cancellation rate of the current booking for each interval mean
# of the previous cancellation rate.
sns.set_context('paper')
plt.figure(figsize=(20, 12))
sns.barplot(x='previous_cancellation_rate_r', y='is_canceled_r', data=df)
plt.title('Previous and last booking cancellation rate', size=30)
plt.xlabel('previous cancellation rate', size=20)
plt.ylabel('last booking cancellation rate', size=20)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.show()

Perfect!

We can also use a line plot to visualize!

# Line Plot

In [None]:
# Line plot: cancellation rate of the current booking against the interval mean
# of the previous cancellation rate.
sns.set_context('paper')
plt.figure(figsize=(20, 12))
sns.lineplot(x='previous_cancellation_rate_r', y='is_canceled_r', data=df)
plt.title('Previous and last booking cancellation rate', size=30)
plt.xlabel('previous cancellation rate', size=20)
plt.ylabel('last booking cancellation rate', size=20)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.show()

The line plot also shows a clear trend.

Reserved room type and assigned room type together could also be a good feature to predict whether the last reservation was canceled. For instance, if a customer specifically wants one room type but is assigned another, this customer may have a strong tendency to cancel the reservation and try to find a new hotel with his or her ideal room type. Let's explore our data to see whether this makes sense.

In [None]:
df[['is_canceled','reserved_room_type','assigned_room_type']]

In [None]:
# Row-level check: did the customer get the room type that was reserved?
def rrt_art_same(df):
    """Return the string 'True' when the reserved and assigned room types match, else 'False'.

    `df` is a single record (e.g. a DataFrame row passed by ``DataFrame.apply(axis=1)``)
    that can be indexed by 'reserved_room_type' and 'assigned_room_type'.
    The result is a string label, not a bool.
    """
    got_reserved_room = df['reserved_room_type'] == df['assigned_room_type']
    return 'True' if got_reserved_room else 'False'

In [None]:
# Flag, for every booking, whether the assigned room type equals the reserved one.
# A vectorised column comparison replaces the row-by-row df.apply(rrt_art_same, axis=1),
# which calls a Python function once per row. The labels stay the strings
# 'True' / 'False', exactly as rrt_art_same returns them.
df['rrt_art_same'] = np.where(df['reserved_room_type'] == df['assigned_room_type'], 'True', 'False')
# Check dataframe
df[['is_canceled','reserved_room_type','assigned_room_type','rrt_art_same']]

In [None]:

df.groupby('rrt_art_same')['is_canceled'].mean()

41% and 5%! Indeed, the reserved room type together with the assigned room type is a good indicator of whether the last reservation was canceled. Let's dig a little further and plot a stacked bar graph to visualize it!

In [None]:
df[['reserved_room_type','assigned_room_type']].describe()

Let's encode the room type variable.
For data visualization purposes, we will first encode it as ordinal data. Later, we will encode it as nominal data for model training.

In [None]:
# Sorted distinct labels of the reserved (room_type_1) and assigned (room_type_2) room types
room_type_1, room_type_2 = (
    np.unique(df[column]) for column in ('reserved_room_type', 'assigned_room_type')
)
room_type_1, room_type_2

There are "I" and "K" room types which were never reserved but were assigned.

We will first encode the reserved room type and then manually encode the assigned room type, to make sure "L" and "P" get the same index in both columns.

In [None]:
# Encode reserved room type with scikit-learn's OrdinalEncoder
enc = OrdinalEncoder()
df['en_reserved_room_type'] = enc.fit_transform(df[['reserved_room_type']])
# Encode assigned room type with a hand-written map instead of a second encoder,
# so that letters shared by both columns get the same code.
# NOTE(review): the map presumes the encoder above produced A-H -> 0-7, L -> 8, P -> 9;
# 'I' and 'K' take the extra codes 10 and 11. Verify against enc.categories_.
enc_ord_map = {'A': 0, 'B': 1, 'C': 2, 
               'D': 3, 'E': 4, 'F': 5,
               'G': 6, 'H': 7, 'I': 10,
               'K': 11, 'L': 8, 'P': 9}
df['en_assigned_room_type'] = df['assigned_room_type'].map(enc_ord_map)
# Spot-check a few rows of both encoded columns
df[['en_reserved_room_type', 'en_assigned_room_type']].iloc[4:10]

In [None]:
# Cast en_reserved_room_type to integers (OrdinalEncoder emits floats by default)
df['en_reserved_room_type'] = df['en_reserved_room_type'].astype(int)


In [None]:
# Booking counts per reserved room type, ordered by ordinal code
reserved_counts_by_code = df['en_reserved_room_type'].value_counts().sort_index()
# Append two zeros for the room types "I" and "K", which never occur as reserved types
en_reserved_room_type_list = reserved_counts_by_code.to_list() + [0, 0]
# Booking counts per assigned room type, ordered by ordinal code
assigned_counts_by_code = df['en_assigned_room_type'].value_counts().sort_index()
en_assigned_room_type_list = assigned_counts_by_code.to_list()


# Stacked Bar Plot

In [None]:
# Stacked Bar Plot of reserved and assigned counts per room type.
# Both count lists are ordered by ordinal code (A-H, L, P, I, K per enc_ord_map),
# whereas room_type_2 is sorted alphabetically (..., I, K, L, P). Using room_type_2
# as x labels therefore attached the I/K/L/P labels to the wrong bars, so the
# labels are derived from the code order instead.
room_type_labels = sorted(enc_ord_map, key=enc_ord_map.get)

plt.figure(figsize=(20,12))
p1 = plt.bar(height=en_reserved_room_type_list,x=room_type_labels)
# Assigned counts are stacked on top of the reserved counts
p2 = plt.bar(height=en_assigned_room_type_list,x=room_type_labels,
             bottom=en_reserved_room_type_list)

plt.xlabel('room type',size=20)
plt.ylabel('Counts',size=20)
plt.title('Reserved and assigned room types',size=30)
plt.legend((p1, p2), ('reserved_room_counts', 'assigned_room_counts'))

plt.show()

In fact, a stacked bar plot is not a good representation of the relationship between reserved room type and assigned room type. It is difficult for us to see what percentage of the customers is assigned their reserved room type. In addition, since most of the customers reserved and were assigned room type A, we can barely see the relationship between reserved and assigned rooms for room types B, C, I, K and so on.

Let's try using a different plot technique.

In [None]:
# NumPy copies of the two count lists so the arithmetic below is element-wise
en_reserved_room_type_list_2, en_assigned_room_type_list_2 = (
    np.array(counts) for counts in (en_reserved_room_type_list, en_assigned_room_type_list)
)
# Per room type: reserved + assigned counts, and each side's share of that total in percent
room_type_total_counts = en_reserved_room_type_list_2 + en_assigned_room_type_list_2
reserved_room_type_percentage = en_reserved_room_type_list_2 / room_type_total_counts * 100
assigned_room_type_percentage = en_assigned_room_type_list_2 / room_type_total_counts * 100


# Percent Stacked Bar Plot

In [None]:
# Percent stacked bar plot of reserved vs. assigned share per room type.
# The percentage arrays are ordered by ordinal code (A-H, L, P, I, K per enc_ord_map),
# whereas room_type_2 is sorted alphabetically (..., I, K, L, P). Using room_type_2
# as x positions attached the I/K/L/P labels and annotations to the wrong bars,
# so the labels are derived from the code order instead.
room_type_labels = sorted(enc_ord_map, key=enc_ord_map.get)

plt.figure(figsize=(20,12))

# stack bars
plt.bar(room_type_labels, reserved_room_type_percentage, label='reserved_room_type_percentage')
plt.bar(room_type_labels, assigned_room_type_percentage, bottom=reserved_room_type_percentage, label='assigned_room_type_percentage')

# add text annotation corresponding to the percentage of each data,
# centred vertically inside its own bar segment.
for xpos, ypos, yval in zip(room_type_labels, reserved_room_type_percentage/2, reserved_room_type_percentage):
    plt.text(xpos, ypos, "{:.2%}".format(yval/100), ha="center", va="center")
for xpos, ypos, yval in zip(room_type_labels, reserved_room_type_percentage+assigned_room_type_percentage/2, assigned_room_type_percentage):
    plt.text(xpos, ypos, "{:.2%}".format(yval/100), ha="center", va="center")

# add text annotation corresponding to the "total" value of each bar
for xpos, ypos, yval in zip(room_type_labels, reserved_room_type_percentage+assigned_room_type_percentage, room_type_total_counts):
    plt.text(xpos, ypos, "N=%d"%yval, ha="center", va="bottom")

# Leave head-room above 100% for the "N=" annotations
plt.ylim(0,110)

# Labels, title, ticks and legend
plt.xlabel('Room type',size=20)
plt.ylabel('Percentage',size=20)
plt.title('Percentage of requested and assigned room type',size=30)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.legend(bbox_to_anchor=(1.01,0.5), loc='center left',fontsize=12)
plt.savefig('normalized_stacked_barplot_with_number.png', bbox_inches='tight', pad_inches=0.02)

Wonderful!

Let's now encode room types as nominal data for model training purposes. Before doing that, let's drop the previously encoded columns.

In [None]:
df.columns

In [None]:
# Remove the two ordinal-encoded helper columns
ordinal_columns = ['en_reserved_room_type', 'en_assigned_room_type']
df = df.drop(columns=ordinal_columns)

In [None]:
# Use get_dummies to one-hot encode both room-type columns.
# NOTE(review): reserved_room_type is first replaced by its integer category codes,
# so its dummy columns are presumably suffixed with codes while the assigned_room_type
# dummies keep the letter labels -- confirm this asymmetry is intended.
df.reserved_room_type = df.reserved_room_type.astype('category').cat.codes
df = pd.get_dummies(df,columns=['reserved_room_type','assigned_room_type'])
df

Let's keep going through the features to see which are explanatory variables.

Deposit type should be included in our feature set. If a customer has paid a non-refundable deposit, we would expect that he or she won't cancel the booking because it's already a sunk cost. Let's check that!



In [None]:
# Mean of is_canceled (cancellation rate) per deposit type.
# Select the column before aggregating: averaging the whole frame first is wasteful
# and raises a TypeError on recent pandas versions because of the string columns.
df.groupby('deposit_type')['is_canceled'].mean()

Our hypothesis is right again!
Let's use a pie graph to visualize the composition of different deposit types.

# Pie Chart

In [None]:
# Pie chart of the deposit types; slices are ordered and plotted counter-clockwise.
deposit_counts = df['deposit_type'].value_counts()
labels = deposit_counts.index.tolist()
sizes = deposit_counts.tolist()
explode = (0, 0.1, 0)  # pull out only the 2nd slice; assumes exactly three deposit types

fig1, ax1 = plt.subplots(figsize=(20,12))
ax1.pie(
    sizes,
    explode=explode,
    labels=labels,
    autopct='%1.1f%%',
    textprops={'fontsize': 14},
    shadow=True,
    startangle=90,
)
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.title('Deposit Type',fontsize=30)
plt.show()

The total number of special requests appears to be strongly negatively related to cancellation. My hypothesis is that when a customer has more special requests and they are met, the customer is more likely to stay with the hotel.

# Point Plot

In [None]:
# Point Plot: is_canceled (y) for each number of special requests (x).
sns.catplot(y='is_canceled',x='total_of_special_requests',kind='point',data=df,height=8,aspect=2)
# The x-axis holds the number of special requests and the y-axis the cancellation
# indicator; the labels used to be swapped (and "Special" was misspelled).
plt.xlabel('Num of Special Requests',size=10)
plt.ylabel('Cancelation',size=10)
plt.title('Special Requests',size=20)
plt.show()

Great! We have done a great job in exploratory data analysis. Now let's delete all the unnecessary features for model training purposes.

In [None]:
# Remove the helper columns that were created only for the exploratory analysis
eda_only_columns = [
    'previous_cancellation_rate',
    'previous_cancellation_rate_interval',
    'previous_cancellation_rate_r',
    'is_canceled_r',
    'rrt_art_same',
]
df = df.drop(columns=eda_only_columns)

Well done!