In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file
# found, so the available dataset file names are visible before loading them.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Libraries

In [None]:
#data processing
import pandas as pd
#linear algebra
import numpy as np
#data visualisation
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Switch matplotlib to its "dark_background" style sheet for the figures drawn after this call
plt.style.use("dark_background")

# Data wrangling

In [None]:
#A few important attributes to examine, with the objective of optimizing the business
#-Average room rates in different cities
#-Number of hotels in different cities
#-Cancellation Rate
#-Number of bookings in a given time period
#-Discounts offered
#-Average Number of nights stayed by customers
#-How many days prior to the stay bookings are made
#-Revenue

In [None]:
#Loading datasets
# Both CSV files are read with pandas' default parsing options.
# data_hotel supplies the 'hotel_id' key and data_city the 'Hotel_id' key used
# when the two frames are merged later in the notebook.
data_hotel=pd.read_csv('../input/hotel-analysis1/Hotel Details.csv')
data_city=pd.read_csv('../input/hotel-analysis1/Hotel_City.csv')

In [None]:
# Preview the first three rows of data_hotel
data_hotel.head(3)

In [None]:
# Preview the first three rows of data_city
data_city.head(3)

In [None]:
# Report the (rows, columns) shape of each raw dataset
hotel_shape = data_hotel.shape
city_shape = data_city.shape
print('Hotels & City datasets have', hotel_shape, city_shape, 'rows and columns respectively')

In [None]:
# Left-join: keep every row of data_hotel and attach the matching data_city row.
# The key is spelled differently in the two files ('hotel_id' vs 'Hotel_id'),
# so both key columns remain in the merged frame.
data = pd.merge(
    data_hotel,
    data_city,
    how='left',
    left_on='hotel_id',
    right_on='Hotel_id',
)
data.head()

In [None]:
# Summary statistics of the merged frame, using pandas' describe() defaults
data.describe()

In [None]:
#The booking amount varies from as low as ~INR 900 to as high as ~ INR 1.2Lacs

In [None]:
# Per-column count of missing (NaN/None) values in the merged frame
data.isna().sum()

In [None]:
#check for any other missing data
# Counts, per column, the cells whose value is exactly the string '?'
# (presumably used as a missing-value placeholder in some exports — confirm against the CSVs).
data.isin(['?']).sum()

In [None]:
# Number of distinct hotels and of distinct cities in the merged data
distinct_hotels = data['hotel_id'].nunique()
distinct_cities = data['City'].nunique()
print('There are', distinct_hotels, 'distinct hotels in the dataset spread across', distinct_cities, 'cities')

# 1. Distribution of hotels by cities

In [None]:
# Distinct hotels per city, largest first.
# 'hotel_id' is selected *before* nunique() so only that column is aggregated,
# instead of computing distinct counts for every column and discarding the rest.
hotels_by_cities = (
    data.groupby('City')['hotel_id']
    .nunique()
    .reset_index()
    .sort_values(by='hotel_id', ascending=False)
)
# Each city's share of the summed per-city hotel counts, in percent
hotels_by_cities['%Distribution'] = hotels_by_cities['hotel_id'] / hotels_by_cities['hotel_id'].sum() * 100
# Text label: the rounded share followed by a percent sign
hotels_by_cities['Labels'] = hotels_by_cities['%Distribution'].round().astype(str) + '%'
hotels_by_cities

In [None]:
# Pie chart of each city's share of hotels.
plt.figure(figsize=(5,5))
plt.title("Distribution of hotels by City")
colors = ['#191970','#001CF0','#0038E2','#0055D4','#0071C6','#008DB8','#00AAAA','#00C69C','#00E28E','#00FF80']

# Offsets that pull the later slices outwards so their labels do not collide.
# matplotlib requires len(explode) == number of slices, so the hand-tuned
# 10-value list is trimmed (fewer cities) or padded with its last value
# (more cities) instead of raising a ValueError when the city count changes.
base_explode = [0, 0, 0, 0, 0.1, 0.1, 0.15, 0.175, 0.2, 0.25]
n_slices = len(hotels_by_cities)
explode = (base_explode + [base_explode[-1]] * n_slices)[:n_slices]

plt.axis("equal")  # equal aspect ratio so the pie is drawn as a circle
g1 = plt.pie(
    hotels_by_cities['%Distribution'],
    labels=hotels_by_cities['City'],
    colors=colors,
    autopct='%0.0f%%',
    shadow=True,
    explode=explode,
)

plt.show()

In [None]:
#More than half the hotels are concentrated in Delhi, Bangalore & Gurgaon

# 2. Cancellation Rate

In [None]:
# List the merged frame's column names
data.columns

In [None]:
# Bookings per (City, status) pair.
# 'booking_id' is selected before count() so only that column is aggregated,
# and reset_index() is chained rather than applied in place so the cell can be
# re-run safely.
status_check = (
    data.groupby(['City', 'status'])['booking_id']
    .count()
    .reset_index()
)
# Total bookings of the row's city, repeated on every status row of that city
status_check['Total'] = status_check.groupby('City')['booking_id'].transform('sum')
# Share of the city's bookings that have this row's status, in percent.
# The column is named 'Cancellation Rate' but holds the share for *every*
# status; it is the cancellation rate only on rows where status == 'Cancelled'.
status_check['Cancellation Rate'] = (status_check['booking_id'] / status_check['Total'] * 100).round(1)
status_check['Labels'] = status_check['Cancellation Rate'].astype(str) + '%'
status_check.head()

In [None]:
# Bar chart of the cancelled-booking share per city
cancelled_only = status_check[status_check['status'] == 'Cancelled']

plt.figure(figsize=(10, 5))
g2 = sns.barplot(data=cancelled_only, x='City', y='Cancellation Rate', palette='GnBu_r')
g2.set(title='Cancellation Rate By Cities')

# Label every bar with its value, anchored at the bar's top centre and nudged 9 points upward
for bar in g2.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    g2.annotate(
        f'{bar_height:.1f}%',
        (bar_centre, bar_height),
        xytext=(0, 9),
        textcoords="offset points",
        ha="center",
        va="top",
    )

In [None]:
#Delhi has the highest cancellation rate while Pune has the lowest

# 3. Bookings by Month

In [None]:

# Derive the check-in month label from the second '-'-separated field of check_in.
# NOTE(review): assumes that field is the month number (e.g. YYYY-MM-DD or DD-MM-YYYY) — confirm against the CSV.
MONTH_NAMES = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# All twelve months are mapped: with only Jan-Mar in the lookup, any other month
# became NaN and its bookings silently dropped out of the groupby below.
month_lookup = {f'{number:02d}': name for number, name in enumerate(MONTH_NAMES, start=1)}

# The .str accessor leaves missing check_in values as NaN instead of raising,
# and zfill(2) tolerates month numbers written without a leading zero.
data['Month'] = (
    data['check_in']
    .str.split('-')
    .str[1]
    .str.zfill(2)
    .map(month_lookup)
)

# Bookings per (City, Month); booking_id is selected first so only it is counted
bookings_by_month = (
    data.groupby(['City', 'Month'])['booking_id']
    .count()
    .reset_index()
)
bookings_by_month.head()

In [None]:
# Grouped bar chart: number of bookings per city, one bar per month.

# Show the months in calendar order instead of the order in which they happen
# to appear in bookings_by_month (alphabetical after the groupby).
calendar_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                  'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
months_present = set(bookings_by_month['Month'].dropna())
month_order = [month for month in calendar_order if month in months_present]
# Keep any label outside the calendar list rather than silently hiding it
month_order += sorted(months_present - set(calendar_order), key=str)

plt.figure(figsize=(15,6))
g3 = sns.barplot(
    data=bookings_by_month,
    x='City',
    y='booking_id',
    hue='Month',
    hue_order=month_order,
    palette='crest',
    dodge=True,
)
g3.set(title='Number of Bookings by City and Month')

for bar in g3.patches:
    bar_height = bar.get_height()
    # g3.patches can also hold zero-size rectangles that seaborn adds as legend
    # proxies, and empty city/month slots have NaN height; neither is a real
    # bar, so skip them instead of printing stray "0"/"nan" labels.
    if not bar_height > 0:
        continue
    g3.annotate(
        '{:.0f}'.format(bar_height),
        (bar.get_x() + bar.get_width() / 2, bar_height),
        ha="center", va="center",
        xytext=(0, 9),
        textcoords="offset points",
    )

# 4. Revenue Distribution by Cities

In [None]:

# Price actually charged for each booking: amount less discount
data['Total Price'] = data['amount'] - data['discount']

# Rows whose booking was not cancelled
not_cancelled = data['status'] != 'Cancelled'

# Net revenue: per-city sum of 'Total Price' over non-cancelled bookings only.
# The result is aligned on the index, so cancelled rows receive NaN here.
data['Net Revenue'] = data[not_cancelled].groupby('City')['Total Price'].transform('sum')
# Gross revenue: per-city sum of 'amount' over all bookings, cancelled included
data['Gross Revenue'] = data.groupby('City')['amount'].transform('sum')

# One row per city (taken from the non-cancelled rows), then reshaped to long
# form with a Revenue_Type column for plotting.
Revenue = (
    data.loc[not_cancelled, ['City', 'Net Revenue', 'Gross Revenue']]
    .drop_duplicates()
    .melt(id_vars="City", var_name="Revenue_Type", value_name="Revenue")
)

In [None]:
# Grouped bar chart of Net vs Gross revenue per city.
# sns.factorplot was renamed to sns.catplot and later removed from seaborn, so
# catplot is called directly. catplot is figure-level: it creates its own
# figure and ignores a preceding plt.figure(figsize=...), which only left an
# empty figure behind. The intended 16x5 size is therefore requested through
# height/aspect (width = height * aspect).
g4 = sns.catplot(
    data=Revenue,
    x='City',
    y='Revenue',
    hue='Revenue_Type',
    kind='bar',
    palette='crest_r',
    dodge=True,
    height=5,
    aspect=16 / 5,
)
g4.set(title='Revenue by City')
plt.xticks(rotation=90)  # city names are long; rotate so they do not overlap
plt.show()