In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of each file it contains.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns

In [None]:
# Load the raw hotel-bookings table from the Kaggle input directory.
df = pd.read_csv(
    filepath_or_buffer="../input/hotel-booking-demand/hotel_bookings.csv",
)

In [None]:
# Summary statistics of the raw bookings table, shown as the cell output.
df.describe()

In [None]:
# Preview the first five rows (five is the default row count of head()).
df.head()

In [None]:
# Work on an independent copy of the raw frame. A plain `data = df` only creates a
# second name for the same object, so the in-place cleaning steps further down
# would silently modify `df` as well.
data = df.copy()

In [None]:
# Number of distinct values in each column, shown as the cell output.
# Gives a quick feel for which columns are categorical or low-cardinality.
data.nunique()

### Using the above results, we understand that:-
##### 1) The dataset captures data of two hotels. 
##### 2) The column arrival_date_year has 3 unique values i.e. it captures data of 3 years of bookings made by their customers.
##### 3) The dataset deals with international travel as well, as the "country" column has 177 unique values.
#### More inferences can be made out of this but we'll have a detailed look as we move forward with the EDA.

In [None]:
# Distinct values of the 'hotel' column, i.e. the hotels covered by the dataset.
data['hotel'].unique()

## Checking missing values in the dataset.

In [None]:
# Count missing entries per column and keep only the columns that have at least one.
null_check = data.isna().sum()
cols_with_missing_values = null_check.loc[null_check.ne(0)]
cols_with_missing_values

#### As there are only 4 null values in the 'children' column and 488 null values in the 'country' column (out of 119,390 entries), we can simply drop those rows without losing much information.

#### For 'agent' and 'company' columns, we can do a little more digging and decide.

In [None]:
# Remove, in place, every booking whose 'children' or 'country' value is missing.
required_columns = ['children', 'country']
data.dropna(subset=required_columns, inplace=True)

In [None]:
# Keep the rows with a missing 'agent' or 'company' and fill the gaps with 0 rather
# than dropping them; both columns contain numerical data.
#
# The fill is done on the DataFrame itself. Calling fillna(inplace=True) on the
# `data.agent` / `data.company` accessors is chained assignment: pandas warns about
# it, and with copy-on-write enabled it leaves `data` unchanged.
data = data.fillna(value={'agent': 0, 'company': 0})

In [None]:
# Re-run the missing-value count after cleaning; an empty result means no column
# has any nulls left.
null_check = data.isna().sum()
has_missing = null_check != 0
cols_with_missing_values = null_check[has_missing]
cols_with_missing_values

#### Using the above result, we have verified that the dataset doesn't have any missing values now.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

## Distribution of Countries with Maximum Number of Bookings

In [None]:
# Bookings per country: one row per country (indexed by country code) holding the
# count as "No. of Bookings" and the code repeated as a regular "Country" column,
# ordered from most to fewest bookings.
bookings_per_country = data.groupby('country')['hotel'].count()
country_dist = (
    pd.DataFrame(bookings_per_country)
    .assign(Country=lambda frame: frame.index)
    .rename(columns={'hotel': 'No. of Bookings'})
    .sort_values(by='No. of Bookings', ascending=False)
)

# Keep only the countries with more than 1000 bookings.
is_popular = country_dist['No. of Bookings'] > 1000
popular_country_dist = country_dist[is_popular]

In [None]:
# Top five countries by number of bookings (five is the default row count of head()).
popular_country_dist.head()

### We can see that PRT has an unusually high number of bookings, i.e. approximately 36k more bookings than GBR, which has the second-highest count.

### For this reason, we treat PRT as an outlier and exclude it from our barplot.

In [None]:
# Skip the first (largest) row of the sorted table and plot the remaining countries.
countries_to_plot = popular_country_dist.iloc[1:]
sns.barplot(x=countries_to_plot['Country'], y=countries_to_plot['No. of Bookings'])

## No. of Bookings Monthly (Cancelled vs Successful) 

In [None]:
# Count bookings per arrival month, separately for bookings that were kept
# (is_canceled == 0) and bookings that were cancelled (is_canceled == 1).
successful_rows = data[data['is_canceled'] == 0]
cancelled_rows = data[data['is_canceled'] == 1]
monthly_dist = successful_rows.groupby('arrival_date_month')['hotel'].count()
monthly_cancelled_dist = cancelled_rows.groupby('arrival_date_month')['hotel'].count()

In [None]:
def _monthly_counts_frame(counts, cancelled_label):
    """Turn per-month booking counts into a labelled frame.

    Args:
        counts: Per-month booking counts (Series or DataFrame) whose values sit
            under the name "hotel" and whose index holds the month.
        cancelled_label: Value written to every row of the "is_canceled" column.

    Returns:
        DataFrame with the columns "No. of Bookings", "is_canceled" and "Month",
        in that order, keeping the month index of `counts`.
    """
    return (
        pd.DataFrame(counts)
        .rename(columns={"hotel": "No. of Bookings"})
        .assign(is_canceled=cancelled_label, Month=lambda frame: frame.index)
    )


# Build one frame per booking status with the same three columns, then stack them
# into a single long frame with an "is_canceled" column distinguishing the two.
monthly_dist = _monthly_counts_frame(monthly_dist, 'No')
monthly_cancelled_dist = _monthly_counts_frame(monthly_cancelled_dist, 'Yes')

monthly_freq = pd.concat([monthly_dist, monthly_cancelled_dist])

In [None]:
# Calendar order of the month names, used to order the x-axis of monthly plots
# (grouping by month name alone would sort them alphabetically).
months_in_order = [
    'January', 'February', 'March',
    'April', 'May', 'June',
    'July', 'August', 'September',
    'October', 'November', 'December',
]

In [None]:
# One bar chart per booking status (column facet on "is_canceled"), with the months
# in calendar order. `catplot` replaces `factorplot`, which no longer exists in
# current seaborn, and x / y are passed by keyword because current seaborn does
# not accept them positionally.
grid = sns.catplot(
    x="Month",
    y="No. of Bookings",
    col="is_canceled",
    data=monthly_freq,
    kind="bar",
    order=months_in_order,
)
grid.set_xticklabels(rotation=45)

# Dashed horizontal reference lines: 7000 on the first panel, 5000 on the second.
ax1, ax2 = grid.axes[0]
ax1.axhline(7000, ls='--', linewidth=2)
ax2.axhline(5000, ls='--', linewidth=2)
plt.show()


In [None]:
# Peek at the first two rows of the cleaned data.
data.head(n=2)

## Lead Time

In [None]:
# Collect every lead time, order the values from longest to shortest and store
# them in a one-column frame named 'Lead'.
lead_values = data['lead_time'].tolist()
lead_values.sort(reverse=True)
lead_time = pd.DataFrame({'Lead': lead_values})


In [None]:
# Distribution of the lead time over all bookings.
# NOTE(review): distplot is deprecated in recent seaborn releases (histplot / displot
# are the documented replacements) — consider migrating once the minimum seaborn
# version of this environment is confirmed.
sns.distplot(lead_time)

### We have the following observations:
####     1) Most of the bookings in the dataset have a lead time between 0 to 100 days.
####     2) We can see a huge peak where the lead time is less than 10-20 days. We can infer that a large share of guests arrived less than a month after making their booking.
####     3) As the lead time increases, the number of bookings decreases, and there are very few bookings with a lead time of more than a year ( > 365 ).

### Therefore, plotting the Lead Time distribution when:
#### 1) Distribution when Lead Time < 100 Days.
#### 2) Distribution when Lead Time > 100 Days and < 365 Days.
#### 3) Distribution when Lead Time is more than a year.

In [None]:
# Three side-by-side panels that together cover the whole lead-time range:
# under 100 days, 100 to 365 days, and more than a year.
# The boundary values (exactly 100 and exactly 365 days) belong to the middle
# panel, so no booking is left out of the figure.
a4_dims = (21, 6)
fig, ax = plt.subplots(1, 3, figsize=a4_dims)

lead_days = lead_time['Lead']
lead_time_panels = [
    ('Lead time < 100 days', lead_days < 100),
    ('Lead time 100 to 365 days', (lead_days >= 100) & (lead_days <= 365)),
    ('Lead time > 365 days', lead_days > 365),
]
for panel, (panel_title, in_range) in zip(ax, lead_time_panels):
    sns.distplot(lead_time[in_range], ax=panel)
    # Title each panel so the figure can be read without the surrounding text.
    panel.set_title(panel_title)

In [None]:
# Peek at the first two rows of the cleaned data.
data.head(n=2)