# Hotel analysis
# The dataset contains data from two different hotels. One Resort hotel and one City hotel.
# Both hotels are located in Portugal (southern Europe). The distance between these two locations is ca. 280 km by car, and both locations border on the North Atlantic.
# The data contains "bookings due to arrive between the 1st of July of 2015 and the 31st of August 2017".
# Note: For most questions I will only use bookings that were not cancelled, to get actual guest numbers. As you will see, this is quite a big difference.
# Topics covered and questions to answer from the data:


# Where do the guests come from?
# How much do guests pay for a room per night?
# How does the price per night vary over the year?
# Which are the busiest months?
# How long do people stay at the hotels?
# Bookings by market segment
# How many bookings were canceled?
# Which month has the highest number of cancellations?



In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subfolders, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the hotel booking records into a DataFrame and preview the first rows.
csv_path = '../input/hotel-booking-demand/hotel_bookings.csv'
df = pd.read_csv(csv_path)
df.head()

# # Data cleaning and pre-processing

In [None]:
# Shape of the data as a (rows, columns) tuple
df.shape

In [None]:
# True if at least one value anywhere in the frame is null
df.isnull().values.any()

In [None]:
# Number of null values in each column
df.isnull().sum()

In [None]:
# The null counts above show that some bookings have missing values, e.g.
# bookings made privately without an agent.
# A blanket df.fillna(0) would also write the integer 0 into any text column
# that has gaps (for example 'country', if it has any), and that 0 would later
# be counted and plotted as if it were a real category. Non-numeric columns
# therefore get the placeholder 'Unknown'; every remaining gap is filled with 0.
text_columns = df.select_dtypes(exclude='number').columns
df.fillna({column: 'Unknown' for column in text_columns}, inplace=True)
df.fillna(0, inplace=True)

# The 'na' in 'fillna' stands for 'not available', i.e. missing values.

In [None]:
# Check the null counts again after filling
df.isnull().sum()

In [None]:
# Number of bookings per meal code
df['meal'].value_counts()

In [None]:
# Distinct values found in each of the three guest-count columns
df['children'].unique()

In [None]:
df['adults'].unique()

In [None]:
df['babies'].unique()

In [None]:
# from above 3 o/p it is clear that children,adults,babies value not be zero at a time
# so lets check for such condition children = adults = babies =0 by creating filter

filter = (df['children']==0) & (df['adults']==0) & (df['babies']==0)

# now lets pass this filter into datafarme

df[filter]

In [None]:
# from above o/p it is clear that there are total 180 row which are fill with wrong info/entry , so we remove thoes row 
df[~filter] 
# '~filter' negation of filter

In [None]:
data = df[~filter] # this is final data on which we will perform different analysis

# Where do the guests come from? (spatial analysis)

In [None]:
# Where do the guests come from? (spatial analysis)
# Spatial analysis, or spatial statistics, includes any of the formal techniques
# that study entities using their topological, geometric, or geographic properties.

In [None]:
data['hotel'].value_counts() # the 'hotel' column has two categories, so the guest-origin analysis can be done separately for each

In [None]:
# Keep only bookings that were not cancelled and split them by hotel type.
not_cancelled = data['is_canceled'] == 0
resort_data = data[not_cancelled & (data['hotel'] == 'Resort Hotel')]
city_data = data[not_cancelled & (data['hotel'] == 'City Hotel')]

In [None]:
# Preview the first rows of the resort hotel bookings
resort_data.head()

In [None]:
# Preview the first rows of the city hotel bookings
city_data.head()

In [None]:

from plotly.offline import plot, iplot, init_notebook_mode
import plotly.graph_objs as go
init_notebook_mode(connected=True)
import plotly.express as px

In [None]:
# Resort hotel bookings per country: the index carries the country values,
# the values carry the booking counts.
resort_country_counts = resort_data['country'].value_counts()
labels = resort_country_counts.index
values = resort_country_counts

In [None]:
# Pie trace: hover shows label and percentage, the text inside each slice
# shows the count and the label.
trace = go.Pie(
    labels=labels,
    values=values,
    hoverinfo='label+percent',
    textinfo='value + label',
    textposition='inside',
)



In [None]:
# Render the pie chart inline in the notebook
iplot([trace])

In [None]:
# Non-cancelled bookings per home country (both hotels), as a two-column frame.
active_bookings = data[data['is_canceled'] == 0]
country_wise_data = active_bookings['country'].value_counts().reset_index()
country_wise_data.columns = ['country', 'no of guests']
country_wise_data.head()

In [None]:
# World map coloured by the number of guests per home country.
# A choropleth is a map whose bounded areas are marked to show the distribution of some property.
px.choropleth(country_wise_data,
              locations = country_wise_data['country'],
              color=country_wise_data['no of guests'],
              hover_name=country_wise_data['country'],
              title='Home country of guest')

In [None]:
# From the map above we conclude that most guests come from European countries.
# Portugal has the highest number of guests.

# # How does guest pay for a room per night ?

In [None]:
# All non-cancelled bookings, from both hotels
data2=data[data['is_canceled']==0]

In [None]:
# Column names available in data2
data2.columns

In [None]:
# Box plot of the nightly price ('adr') per reserved room type, one box per hotel
plt.figure(figsize=(12,8))
sns.boxplot(x='reserved_room_type',y='adr',data=data2,hue='hotel') # 'adr' is used here as the room price (presumably "average daily rate" — confirm against the dataset description)
plt.title('Price of room type per night', fontsize=16)
plt.xlabel('room type')
plt.ylabel('Price in [EUR]' )
plt.show()

In [None]:
# This figure shows the distribution of the price per night for each room type and hotel (median, quartiles and outliers).
# Note that due to data anonymization, rooms with the same type letter may not necessarily be the same across hotels.
# From the output above it is clear that room type A has the most extreme outlier, i.e. the highest price, at about 500,
# whereas in the city hotel the type G rooms have the higher prices.

# # How does the price of room vary over the year?

In [None]:
# resort_data and city_data were already restricted to is_canceled == 0 when
# they were created, so these filters keep every row; they are repeated here
# only to make the "not cancelled" condition explicit for this section.
resort1 = resort_data[resort_data['is_canceled']==0]
city1 = city_data[city_data['is_canceled']==0]

In [None]:
# Preview the resort hotel rows
resort1.head()

In [None]:
# Preview the city hotel rows
city1.head()

In [None]:
# to find cost of room varying cross yr lets groupby the data on basis of(arrival_date_month)

In [None]:
# Mean room price ('adr') per arrival month across the years, for each hotel.
# as_index=False keeps 'arrival_date_month' as a regular column, so the result
# is a two-column DataFrame rather than a Series indexed by month.
resort_hotel = resort1.groupby('arrival_date_month', as_index=False)['adr'].mean()
city_hotel = city1.groupby('arrival_date_month', as_index=False)['adr'].mean()

In [None]:
# Mean price per month for the resort hotel
resort_hotel

In [None]:
# Mean price per month for the city hotel
city_hotel

In [None]:
# Both frames share the 'arrival_date_month' column, so join them on it.
final = pd.merge(resort_hotel, city_hotel, on='arrival_date_month')
final

In [None]:
# Give the merged columns descriptive names (month, resort price, city price).
price_columns = ['Month', 'Price_for_resort', 'Price_for_city_hotel']
final.columns = price_columns
final

In [None]:
# If we plotted now, the plot would give a wrong impression, because the months in the final dataframe are not in sorted order.
# So first sort the dataframe by month.
# This could be done with hand-written logic or with a handy Python package; the hand-written route is harder, so a package is used.

In [None]:
!pip install sort-dataframeby-monthorweek

## Dependency package needs to be installed
!pip install sorted-months-weekdays

In [None]:
import sort_dataframeby_monthorweek as sd

In [None]:
# Sort the rows of `final` by the 'Month' column with the helper package imported as `sd`
# (presumably into calendar order — confirm against the package documentation)
final2 = sd.Sort_Dataframeby_Month(final,'Month')
final2.head()
# final2 is the data used for the plot

In [None]:
# One line per hotel: price per month over the year.
px.line(
    final2,
    x='Month',
    y=['Price_for_resort', 'Price_for_city_hotel'],
    title='Room price over the year',
)

In [None]:
# This clearly shows that the prices in the Resort hotel are much higher during the summer (no surprise here).
# The price of the city hotel varies less and is most expensive during spring (March to June) and autumn (September to December).

# # Distribution of nights Spent at hotels by market segment and hotel type

In [None]:
# Market segments present in the data, most frequent first
data['market_segment'].value_counts().index

In [None]:
# Box plot of weekend nights per booking for each market segment, split by hotel.
# NOTE(review): only 'stays_in_weekend_nights' is plotted, and `data` still
# includes cancelled bookings — confirm this is what the section intends.
plt.figure(figsize=(15,10))
sns.boxplot(x='market_segment',y='stays_in_weekend_nights',data=data,hue='hotel')

In [None]:
# The plot above shows that most of the distributions are normally distributed and some have positive skewness.
# From the plot above it is clear that people mostly prefer to stay at the resort for more than 1 week, compared to the city hotel.

# # Analysing Preference of guests, what they basically prefer?

In [None]:
# Number of bookings per meal code
data['meal'].value_counts()

In [None]:
# its clear that people mostly prefer for breakfast type meal

In [None]:
# Donut chart of bookings per meal type.
# The counts are computed once and passed directly as arrays. No data_frame is
# given, because `names` and `values` are not columns of `data`; passing
# `data` alongside them only suggested a link that does not exist.
meal_counts = data['meal'].value_counts()
px.pie(names=meal_counts.index, values=meal_counts, hole=0.5)

In [None]:
data.columns

In [None]:
# Number of bookings for each count of special requests.
# The column is passed by keyword: in current seaborn releases the first
# positional parameter of countplot is `data`, so a positional Series is not
# treated as the x variable.
sns.countplot(x='total_of_special_requests', data=data)

In [None]:
# lets groupby our data on the basis of 'total_of_special_requests' & 'is_canceled'

In [None]:
# Number of bookings for every (special-request count, is_canceled) pair
data.groupby(['total_of_special_requests','is_canceled']).agg({'total_of_special_requests':'count'})

In [None]:
# Reshape the grouped counts into a pivot-style table: one row per number of
# special requests, one column per is_canceled value; the counted column is
# renamed to 'count' on the way.
grouped_requests = data.groupby(['total_of_special_requests', 'is_canceled'])
request_counts = grouped_requests.agg({'total_of_special_requests': 'count'})
request_counts = request_counts.rename(columns={'total_of_special_requests': 'count'})
pivot = request_counts.unstack()

In [None]:
# Show the pivot table
pivot

In [None]:
# Bar chart of the pivot table: one group of bars per number of special requests
pivot.plot(kind='bar')

In [None]:
# we can say that half of booking without any special request has been canceled and another half of them not canceled

# # analysis most busy month

In [None]:
data.columns

In [None]:
# Non-cancelled resort hotel bookings per arrival month.
resort_month_counts = resort_data['arrival_date_month'].value_counts()
rush_resort = resort_month_counts.reset_index()
rush_resort.columns = ['month', 'no of guests']
rush_resort

In [None]:
# Non-cancelled city hotel bookings per arrival month.
city_month_counts = city_data['arrival_date_month'].value_counts()
rush_city = city_month_counts.reset_index()
rush_city.columns = ['month', 'no of guests']
rush_city

In [None]:
# Join the two per-month count frames on their shared 'month' column.
final_rush = pd.merge(rush_resort, rush_city, on='month')
final_rush

In [None]:
# Give the merged count columns descriptive names, one per hotel.
guest_columns = ['month', 'no of guests in resort', 'no of guests in city hotel']
final_rush.columns = guest_columns
final_rush

In [None]:
# Sort the rows of final_rush by the 'month' column with the helper package imported as `sd`
# (presumably into calendar order — confirm against the package documentation)
final_rush2 = sd.Sort_Dataframeby_Month(final_rush,'month')
final_rush2

In [None]:
# One line per hotel: number of guests per month.
px.line(
    data_frame=final_rush2,
    x='month',
    y=['no of guests in resort', 'no of guests in city hotel'],
    title='Total no of guests per month',
)

In [None]:
# The City hotel has more guests during spring and autumn, when the prices are also highest.
# In July and August there are fewer visitors, although prices are lower.
# Guest numbers for the Resort hotel go down slightly from June to September, which is also when the prices are highest.
# Both hotels have the fewest guests during the winter.

# # how long do people stay at the hotels ?

In [None]:
# data2 holds the non-cancelled bookings (data[data['is_canceled']==0]).
# Work on an independent copy: with a plain alias (clean_data = data2), every
# column added to clean_data would also be added to data2, and because data2 is
# itself a filtered selection of `data`, pandas can emit SettingWithCopyWarning
# on such assignments.
clean_data = data2.copy()
clean_data.columns

In [None]:
# Add a new column 'Total no of night': weekend nights plus week nights of each booking
clean_data['Total no of night'] = clean_data['stays_in_weekend_nights'] + clean_data['stays_in_week_nights']

In [None]:
# Confirm that the new column is present
clean_data.columns

In [None]:
# Non-null count of every column for each (hotel, total nights) pair
clean_data.groupby(['hotel','Total no of night']).agg('count').reset_index()

In [None]:
# Number of non-cancelled bookings for every (total nights, hotel) pair.
# Only 'is_canceled' is counted. That yields the same three columns
# ('Total no of night', 'hotel', 'is_canceled') as counting every column and
# keeping the first three by position, but it does not depend on the column
# order of clean_data and skips counting the columns that were thrown away.
stay = clean_data.groupby(['Total no of night','hotel'])['is_canceled'].count().reset_index()
stay.head()

In [None]:
# The counted 'is_canceled' column holds the number of stays, so name it accordingly.
stay = stay.rename({'is_canceled': 'no of stays'}, axis='columns')
stay.head()

In [None]:
# Bar chart: number of stays per total number of nights, city hotel bars
# first and resort hotel bars second within each group.
plt.figure(figsize=(20, 8))
sns.barplot(
    x='Total no of night',
    y='no of stays',
    hue='hotel',
    hue_order=['City Hotel', 'Resort Hotel'],
    data=stay,
)

## booking by market segment

In [None]:
# Number of non-cancelled bookings per market segment
clean_data['market_segment'].value_counts()

In [None]:
# Pie chart of non-cancelled bookings per market segment.
# The counts are computed once and passed directly as arrays. No data_frame is
# given, because `names` and `values` are not columns of clean_data.
segment_counts = clean_data['market_segment'].value_counts()
px.pie(names=segment_counts.index, values=segment_counts, title='Booking per market segment')

In [None]:
clean_data.columns