In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# For Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Any results you write to the current directory are saved as output.

In [None]:
# Load the hotel bookings CSV into a DataFrame and preview the first rows
BOOKINGS_CSV = '/kaggle/input/hotel-booking-demand/hotel_bookings.csv'
df_bookings = pd.read_csv(BOOKINGS_CSV)
df_bookings.head()

In [None]:
# Size of the DataFrame as (number of rows, number of columns)
n_rows, n_cols = df_bookings.shape
(n_rows, n_cols)

There are a total of 119390 records and 32 features.

In [None]:
# Print a per-column summary of the DataFrame (column name, non-null count, dtype).
# Columns whose non-null count is below the row count contain missing values.
df_bookings.info()

Here we have integer, float and object (i.e. text) columns; none of the columns is parsed as a date or time type.  
Also, there are null values in the features 'children', 'country', 'agent' and 'company', as their non-null counts are lower than the total number of records.  
  
Following are the ways in which we can eliminate the null:  
1. **Children:** Null values represent no children in guest list and hence replacing the nulls with '0'.  
2. **Country:** Here, we do not know from which country the guests are and hence replacing all null values with 'Unknown'.  
3. **Agent:** Here also, no agent number means it is a private booking and hence should be replaced with '0.0'.  
4. **Company:** There is only 5% data in this feature. So, let's just drop this column.

In [None]:
# Handle missing values:
#   children -> 0          (a null is treated as "no children")
#   country  -> 'Unknown'  (the guest's country was not recorded)
#   agent    -> 0.0        (a null is treated as a booking made without an agent)
# and drop the sparsely populated 'company' column, so no nulls remain in these four features.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run on an already-cleaned frame.
df_bookings = (
    df_bookings
    .fillna({'children': 0, 'country': 'Unknown', 'agent': 0.0})
    .drop(columns='company', errors='ignore')
)

df_bookings.head()

Now the data is free of null values.  
Let's check for the duplicate values in the dataset

In [None]:
# Count the rows that exactly repeat an earlier row
duplicate_mask = df_bookings.duplicated()
int(duplicate_mask.sum())

In [None]:
# Remove the repeated records found above, keeping the first occurrence of each
df_bookings.drop_duplicates(subset=None, keep='first', inplace=True)

Let's figure out the correlation between all the features by visualizing it on Heat Map

In [None]:
# Correlation heatmap of the numeric features.
# Only numeric/boolean columns are passed to corr(): recent pandas versions raise
# on string columns instead of silently skipping them.
fig, axes = plt.subplots(1, 1, figsize=(10, 7))

numeric_corr = df_bookings.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(numeric_corr, cmap='coolwarm', linecolor='white', ax=axes)
axes.set_title('Correlation between numeric features')

plt.show()

The relationships are hard to read from this map. Let's go through some features and visualize them individually.

### Hotel
See the list of hotels and the number of bookings in each of them.

In [None]:
# Number of bookings recorded for each value of 'hotel'
hotel_counts = df_bookings['hotel'].value_counts()
hotel_counts

From here we can say that 'City Hotel' gets more number of bookings than 'Resort Hotel'.  
### Lead Time
Let's see how far in advance guests usually make their bookings.

In [None]:
# Histogram of 'lead_time' over all bookings, split into 40 equal-width bins
fig, ax = plt.subplots(figsize=(15, 5))

ax.hist(df_bookings['lead_time'], bins=40)
ax.set_xlabel('Lead time')
ax.set_ylabel('count')
ax.set_title('How much time guests do the bookings?')

plt.show()

Here, we can see that most guests book on the same day or just a few days before arrival.

### Cancellation rate

In [None]:
# Frequency of each value in the 'is_canceled' column
cancel_counts = df_bookings['is_canceled'].value_counts()
cancel_counts

In [None]:
# Booking counts per hotel, split by the value of 'is_canceled'.
# The column is passed as x= because recent seaborn releases no longer accept
# the plotted variable as the first positional argument.
sns.countplot(x='hotel', hue='is_canceled', data=df_bookings)

plt.show()

In [None]:
# Share (in percent) of each 'is_canceled' value within every hotel
cancel_share = df_bookings.groupby('hotel')['is_canceled'].value_counts(normalize=True)
cancel_share * 100

So, there are **30% cancellations for 'City Hotel'** while only **23.48% for 'Resort Hotel'**

### Average Daily Price (adr) 
Price of the hotel room is one of the biggest factors in deciding whether a person will book it or not, and these prices keep changing with the season of the year.  
Here, we will see the prices for both hotels in each month of the year.

In [None]:
# Boxplot of 'adr' across all bookings (assumption carried over from the
# original analysis: the hotels do not charge for babies).
# The series is passed as x= so the box is drawn horizontally on every seaborn
# version; newer releases deprecate or reinterpret a positional series.
sns.boxplot(x=df_bookings['adr'])

plt.show()

From the above boxplot, we can see one outlier.  
This graph shows that all the prices are well below 1000 and only one price is above 5000. This could be a human error made while recording the price.
Let us remove the outlier where the price is too high.

In [None]:
# Remove the record(s) whose 'adr' exceeds the threshold and redraw the boxplot.
# Filtering with a negated mask keeps exactly the rows the original drop kept
# (including any row with a missing 'adr'), and rebinding avoids inplace mutation.
ADR_OUTLIER_THRESHOLD = 5000
df_bookings = df_bookings[~(df_bookings['adr'] > ADR_OUTLIER_THRESHOLD)]

# x= keeps the box horizontal regardless of the installed seaborn version
sns.boxplot(x=df_bookings['adr'])

plt.show()

In [None]:
# Average 'adr' per arrival month for each hotel.
# Months are shown in calendar order; without an explicit order seaborn follows
# the order in which the month labels happen to appear in the data.
CALENDAR_MONTHS = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
months_present = set(df_bookings['arrival_date_month'].dropna())
month_order = (
    [month for month in CALENDAR_MONTHS if month in months_present]
    # any label that is not a full English month name is appended so nothing is hidden
    + sorted(months_present - set(CALENDAR_MONTHS))
)

sns.barplot(x='adr', y='arrival_date_month', data=df_bookings, hue='hotel', order=month_order)

plt.show()

From above plot, it is observed that the price range for 'Resort Hotel' varies a lot while this variation is less for 'City Hotel'.  
Maybe, because of these variations, total number of customers for 'Resort Hotel' is less than 'City Hotel'. 

### Stay in Week nights v/s Stay in Weekend nights
As we saw that maximum bookings are done during summer time. Let's try to see whether the bookings are done for weekdays or weekends

In [None]:
# Overlaid histograms of week-night and weekend-night stay lengths (stays under 10 nights).
fig, ax = plt.subplots(figsize=(15, 5))

# Night counts are whole numbers, so use one bin per integer value (edges at
# -0.5, 0.5, ..., 9.5). The previous bins=8 spread eight bins over 0..9, which
# merged 0 with 1 night and 8 with 9 nights into single bars.
MAX_NIGHTS = 10
night_bins = np.arange(MAX_NIGHTS + 1) - 0.5

week_nights = df_bookings['stays_in_week_nights']
weekend_nights = df_bookings['stays_in_weekend_nights']

ax.hist(week_nights[week_nights < MAX_NIGHTS].dropna(),
        bins=night_bins, alpha=1, color='lemonchiffon', label='Stays in week night')

ax.hist(weekend_nights[weekend_nights < MAX_NIGHTS].dropna(),
        bins=night_bins, alpha=0.5, color='blueviolet', label='Stays in weekend night')

ax.set_xticks(range(MAX_NIGHTS))
ax.set_xlabel('No.of days')
ax.set_ylabel('Count')
ax.set_title('No. of Bookings in Week & Weekends')
ax.legend(loc=1)

plt.show()

Here, we can say that the maximum bookings are received for weekends and the stay length is between 0 and 2 nights.

### Average Daily Price with respect to the room type
Now, price of any room won't only depend on season, but also on the type of room offered.  
So, let's see the price according to the room type.

In [None]:
# Mean 'adr' and number of bookings for every reserved room type.
# Named aggregation is used because passing a {new_name: func} dict to
# SeriesGroupBy.agg raises SpecificationError in pandas >= 1.0; the dict is
# unpacked with ** since 'No. of bookings' is not a valid keyword identifier.
df_price = (
    df_bookings
    .groupby('reserved_room_type')['adr']
    .agg(**{'Average_price': 'mean', 'No. of bookings': 'size'})
    .reset_index()
)

df_price

In [None]:
# Side-by-side line plots per room type: average price (left) and booking count (right)
fig, ax = plt.subplots(1, 2, figsize=(15, 5))

# (df_price column, line colour, y-axis label, panel title) for each panel
panels = [
    ('Average_price', 'red', 'Average Price', 'Average price per room type'),
    ('No. of bookings', 'green', 'No. of bookings', 'Number of booking for each room'),
]

for axis, (column, colour, y_label, heading) in zip(ax, panels):
    axis.plot(df_price['reserved_room_type'], df_price[column], color=colour)
    axis.set_xlabel('Room Types')
    axis.set_ylabel(y_label)
    axis.set_title(heading)

plt.show()

From first plot, we can understand that hotel room type 'H' is the costliest and the room type 'P' is the cheapest.  
While from second plot, it is very clear that room type 'A' is booked the most while room type 'P' is least selected.  

### Market Segment
Let's see how many market segments are booking in these hotels.

In [None]:
# Number of bookings coming from each market segment
segment_counts = df_bookings['market_segment'].value_counts()
segment_counts

In [None]:
# Pie chart of the share of bookings per market segment
fig = plt.figure(figsize=(10, 10))

# Tally once, then split into slice sizes and their matching labels
segment_counts = df_bookings['market_segment'].value_counts()
market_size = segment_counts.tolist()
labels = segment_counts.index.tolist()

plt.pie(market_size, labels=labels, autopct='%1.1f%%', startangle=90)

plt.show()

So, most of the bookings (almost 59%) are made through Online TA, followed by the offline segment.  
Let's see which agent has made the most bookings in these hotels.

### Agent

In [None]:
# Count the distinct values that occur in the 'agent' column
agent_counts = df_bookings['agent'].value_counts()
agent_list = list(agent_counts.index)

print('Total number of agents in list: ', len(agent_list))

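There are a total of 334 agents available for booking in these hotels (this count includes the placeholder '0.0' used for bookings made without an agent). Let us see the top 10 agents by number of bookings in these hotels.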

In [None]:
# Pie chart of the 10 'agent' values with the most bookings
fig = plt.figure(figsize=(10, 10))

# Booking tallies (sorted descending by value_counts) and the matching agent ids
agent_counts = df_bookings['agent'].value_counts()
bookings = agent_counts.tolist()
agent_list = agent_counts.index.tolist()

# Pull the three largest slices outwards; the remaining seven stay in place
explode = (0.10, 0.07, 0.03) + (0,) * 7

plt.pie(bookings[:10], labels=agent_list[:10], explode=explode, autopct='%1.1f%%', startangle=90)
plt.tight_layout()
plt.title('Best agent')

plt.show()

Here, we can see that the most bookings are made by Agent number '9.0', who accounts for 42.3% of the bookings made by the 10 agents shown in the chart.

### Country
Let's see which countries the guests are booking from.

In [None]:
# Bar chart of the 10 countries with the most bookings
fig = plt.figure(figsize=(15, 5))

# Tally once; value_counts sorts descending, so the first ten are the largest
country_counts = df_bookings['country'].value_counts()
x = country_counts.index[:10]
y = country_counts.head(10)

plt.bar(x, y, color='green')
plt.xlabel('Countries')
plt.ylabel('Customer count')
plt.title('Top 10 customer count')

plt.show()

So, people from Portugal are visiting the hotels more than any other country.

### Customer Type

In [None]:
# Booking counts per customer type, split by hotel, with the bars ordered from
# the most to the least frequent customer type.
# Columns are referenced by name through x=/hue=/data= because recent seaborn
# releases no longer accept the plotted series as the first positional argument.
customer_type_order = df_bookings['customer_type'].value_counts().index

sns.countplot(x='customer_type', hue='hotel', data=df_bookings, order=customer_type_order)

plt.show()