In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import plotly.express as px
import plotly.graph_objects as go
import calendar
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score,confusion_matrix
from sklearn.linear_model import LogisticRegression
#from sklearn.pipeline import Pipeline
#from sklearn.compose import ColumnTransformer
#from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
#from category_encoders import BinaryEncoder
#from sklearn.pipeline import Pipeline
#from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the hotel booking demand CSV into the raw DataFrame used by the rest of the notebook.
HOTEL_BOOKINGS_CSV = '../input/hotel-booking-demand/hotel_bookings.csv'
Hotel_Df = pd.read_csv(HOTEL_BOOKINGS_CSV)

 ### Exploratory Data Analysis for Hotel Data Set ###

In [None]:
## Report the dataset dimensions as a (rows, columns) tuple
n_rows, n_cols = Hotel_Df.shape
print((n_rows, n_cols))

The shape shows that the dataset has 119,390 records and 32 columns.

In [None]:
### Show the column names, non-null counts and dtypes of the dataset
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
Hotel_Df.info()

The dataset contains 32 columns and 119,390 rows.
The output above lists the data type of each column.
A few columns, such as children, have the wrong data type.

In [None]:
### Print the number of rows in the dataset
print('Length of the dataset:', Hotel_Df.shape[0])

### Pre-Processing of the Data Set ###

In [None]:
### Preview the first five rows of the raw dataset
first_rows = Hotel_Df.head()  # head() returns five rows by default
print(first_rows)

The top 5 rows of the dataset are displayed above. Some records contain NaN values.

In [None]:
### Summary statistics of the numerical columns
summary_stats = Hotel_Df.describe()
print(summary_stats)

The summary above shows the mean, standard deviation, minimum, maximum and quantile values of the numerical columns.

In [None]:
## Count the missing values in each column of the raw dataset
print(Hotel_Df.isna().sum())

Four columns contain null values: children, country, agent and company.

In [None]:
## Bar plot to see the null values per column in the raw dataset
## (nothing has been filled or dropped at this point)
msno.bar(Hotel_Df)

More than 94% of the values in the company column are null.
The company column holds the ID of the company that made the booking.
The agent column holds the ID of the booking agent.
We plan to drop these columns in the upcoming steps.

In [None]:
### Build the cleaned dataset:
###   - drop the company, agent and arrival_date_week_number columns
###   - drop the rows that have no country or no children value
###   - renumber the remaining rows from 0
# Every step returns a new frame, so the raw Hotel_Df is left untouched.
Hotel_Book_DF = (
    Hotel_Df
    .drop(columns=['company', 'agent', 'arrival_date_week_number'])
    .dropna(subset=['country', 'children'])
    .reset_index(drop=True)
)
Hotel_Book_DF.head()

After dropping these columns, 29 columns remain to explore.

In [None]:
## Re-count the missing values per column on the cleaned dataset
remaining_nulls = Hotel_Book_DF.isna().sum()
print(remaining_nulls)

There are no more null values.

In [None]:
## Bar plot to see the null values per column in the cleaned dataset
## (after the columns and rows with missing values were dropped)
msno.bar(Hotel_Book_DF)

Now the graph looks good with all the bars equally lined up

In [None]:
## Collect the names of the numerical columns, i.e. every column whose dtype is not object
numerical_cols = Hotel_Book_DF.columns[
    [col_dtype != object for col_dtype in Hotel_Book_DF.dtypes]
]

In [None]:
# Show the names of the columns classified as numerical
print(numerical_cols)

In [None]:
## Collect the names of the categorical columns, i.e. every column whose dtype is object
categorical_cols = Hotel_Book_DF.columns[
    [col_dtype == object for col_dtype in Hotel_Book_DF.dtypes]
]

In [None]:
# Show the names of the columns classified as categorical
print(categorical_cols)

In [None]:
# Histogram of every numerical column of the cleaned dataset.
# DataFrame.hist() returns the matplotlib axes, not a DataFrame, so its result
# is deliberately NOT assigned back to Book_Hotel_DF: doing so would replace the
# DataFrame with an array of axes until a later cell re-creates it.
Book_Hotel_DF = Hotel_Book_DF.copy()
Book_Hotel_DF.hist(figsize=(20, 14))
plt.show()

In [None]:
## Number of canceled vs. not-canceled bookings
Book_Hotel_DF = Hotel_Book_DF.copy()
sns.set(style="darkgrid")
title_font = {'fontsize': 20}
# The title is set on the current axes, which countplot then draws on.
plt.title("Is Booking Canceled or not", fontdict=title_font)
ax = sns.countplot(data=Book_Hotel_DF, x="is_canceled")

From the above graph we can infer that more than 70,000 records of the bookings were not cancelled 
and around 45000 of the bookings were cancelled.

In [None]:
# Canceled vs. not-canceled bookings, split by hotel type
Book_Hotel_DF = Hotel_Book_DF.copy()
sns.set(style="darkgrid")
title_font = {'fontsize': 20}
# The title is set on the current axes, which countplot then draws on.
plt.title("Is Canceled or not by Hotel Type", fontdict=title_font)
ax = sns.countplot(data=Book_Hotel_DF, x="hotel", hue='is_canceled')

City Hotel has the highest number of canceled bookings and also the highest number of bookings that were not canceled.

In [None]:
## Overview of the arrival period: bookings per arrival year, split by hotel type
lst3 = ['hotel', 'arrival_date_year', 'arrival_date_month', 'arrival_date_day_of_month']
period_arrival = Book_Hotel_DF.loc[:, lst3]
sns.countplot(x='arrival_date_year', hue='hotel', data=period_arrival)

2016 seems to be the year with the highest number of bookings at the City Hotel. The Resort Hotel also peaks in the same year, but with fewer bookings than the City Hotel.

In [None]:
### Count of bookings by arrival month, cancelled vs non-cancelled
# The columns are passed by name through data=/x=/hue=. Recent seaborn releases
# treat the first positional argument of countplot as `data`, so passing the
# month Series positionally no longer puts it on the x axis.
Book_Hotel_DF = Hotel_Book_DF.copy()
plt.figure(figsize=(15, 5));
plt.title("Count of orders by month");
# calendar.month_name[1:] gives the month names in calendar order (index 0 is an empty string).
sns.countplot(data=Book_Hotel_DF, x="arrival_date_month", hue="is_canceled",
              palette="mako", order=calendar.month_name[1:]);

August has the highest number of bookings and 
January has the lowest number of cancellations.

In [None]:
# Which month is the most occupied with bookings at the hotel?
# The month column is passed by name through data=/x=. Recent seaborn releases
# treat the first positional argument of countplot as `data`, so passing the
# Series positionally no longer puts it on the x axis.
Book_Hotel_DF = Hotel_Book_DF.copy()
# Explicit calendar order; without it the bars follow the order in which months appear in the data.
month_sorted = ['January','February','March','April','May','June','July','August','September','October','November','December']
plt.figure(figsize=(14,6))
plt.title("what times of the year do we have the highest bookings", fontdict = {'fontsize': 20})
sns.countplot(data=Book_Hotel_DF, x='arrival_date_month', palette='tab10', order=month_sorted)
plt.xticks(rotation = 90)
plt.show()

August has the highest number of bookings, possibly because it is summer, and
January has the lowest number of bookings, possibly because it is winter.

In [None]:
## Average daily rate (adr) per reserved room type, split by hotel type
plt.figure(figsize=(12, 6))
# Only bookings that were not canceled are plotted.
ds = Book_Hotel_DF.loc[Book_Hotel_DF['is_canceled'].eq(0)]
sns.violinplot(data=ds, x='reserved_room_type', y='adr', hue='hotel', palette='Set1')

From the above plot we infer that the average price per room depends on its type and the standard deviation.
Average daily rate is high for City hotel with room type G.

In [None]:
### Number of booking records per country, drawn as a world map
# One row per country (the index) with a single 'count' column.
country_freq = Book_Hotel_DF['country'].value_counts().to_frame(name='count')
fig = px.choropleth(
    country_freq,
    locations=country_freq.index,
    hover_name=country_freq.index,
    color='count',
    color_continuous_scale=px.colors.sequential.Magenta,
)
border_style = dict(line=dict(color='#000000', width=1))
fig.update_traces(marker=border_style)
# Centre the main title and set its font size.
fig.update_layout(
    title_text='Number of Records by Countries',
    title_x=0.5,
    title_font=dict(size=22),
)
fig.show()

The Darker color represents the country with highest number of bookings
PRT (Portugal) has the highest number of booking records.

In [None]:
## Company booking IDs vs. type of hotel
# Drawn from the raw Hotel_Df because the company column is dropped from the cleaned frame.
sns.boxplot(data=Hotel_Df, x='company', y='hotel')

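Resort hotel has more bookings than City hotel.
City hotel has outliers.
Note: company is an ID column (mostly null), so this box plot shows how the company IDs are distributed per hotel type rather than a booking count.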

In [None]:
## Share of each reservation status, drawn as a pie chart
reservation_status = Book_Hotel_DF['reservation_status'].value_counts()
status_pie = go.Pie(
    labels=reservation_status.index,
    values=reservation_status,
    opacity=0.8,
)
fig = go.Figure(data=[status_pie])
# Label every slice with its percentage and status name, outlined in black.
slice_outline = dict(line=dict(color='#000000', width=2))
fig.update_traces(textinfo='percent+label', marker=slice_outline)
fig.update_layout(
    title_text='Distribution of the Reservation Status',
    title_x=0.5,
    title_font=dict(size=32),
)
fig.show()

From the pie chart we infer that 62.9% of customers stayed in the hotel and checked out,
36.1% of customers cancelled their bookings and only 1.01% of customers did not show up after making a reservation.

### Bivariate Analysis ###

In [None]:
### Count of canceled bookings per arrival month, one line per arrival year
canceled = Book_Hotel_DF.loc[Book_Hotel_DF['is_canceled'] == 1]
# pivot_table counts every column per (year, month); the count of the adr
# column is the one used as the number of bookings.
counts_per_year_month = canceled.pivot_table(
    index="arrival_date_year",
    columns="arrival_date_month",
    aggfunc="count",
).fillna(0)
# Transpose to months x years, then put the months in calendar order.
canceled_by_month_year = counts_per_year_month['adr'].T.reindex(calendar.month_name[1:])
plt.figure(figsize=(15, 5));
plt.title("Count of canceled orders by month and year");
sns.lineplot(data=canceled_by_month_year, palette="mako_r", linewidth=2);

For the year 2017, the count of cancellations is high in May.
For the year 2016, the count of cancellations is high in October.
For the year 2015, the count of cancellations is high in September.

In [None]:
### Count of non-canceled bookings per arrival month, one line per arrival year
# These rows have is_canceled == 0, so they get their own `not_canceled*` names
# instead of overwriting the `canceled*` variables with the opposite data.
not_canceled = Book_Hotel_DF[Book_Hotel_DF['is_canceled'] == 0]
# pivot_table counts every column per (year, month); the count of the adr
# column is the one used as the number of bookings. Transposed to months x years.
not_canceled_by_month_year = not_canceled.pivot_table(index="arrival_date_year", columns="arrival_date_month", aggfunc="count").fillna(0).adr.T

# Put the months in calendar order (index 0 of calendar.month_name is an empty string).
not_canceled_by_month_year = not_canceled_by_month_year.reindex(calendar.month_name[1:])
plt.figure(figsize=(15, 5));
plt.title("Count of Non-Cancelled booking by month and year");
sns.lineplot(data=not_canceled_by_month_year, palette="mako_r", linewidth=2);

The year 2016 has the highest number of non-cancelled bookings.
The year 2015 has the lowest number of non-cancelled bookings.
The year 2017 has a moderate number of non-cancelled bookings.

In [None]:
### Week-night vs. weekend-night stays, coloured and marked by the number of children
plt.figure(figsize=(5, 4), dpi=100)
sns.scatterplot(
    x='stays_in_week_nights',
    y='stays_in_weekend_nights',
    hue='children',
    style='children',
    palette='bright',
    data=Book_Hotel_DF,
)

Most of the time, no children stayed during the week nights.

## Pearson Correlation Matrix

In [None]:
## Correlation matrix of the numerical hotel booking features
Book_Hotel_DF = Hotel_Book_DF.copy()
plt.figure(figsize=(15, 8))
# Restrict to numerical columns explicitly: pandas >= 2.0 no longer drops the
# text columns silently inside corr() and raises on them instead.
numeric_features = Book_Hotel_DF.select_dtypes(include='number')
correlation = sns.heatmap(numeric_features.corr(), vmin=-1, vmax=1, annot=True, linewidths=1, linecolor='black', cmap = "viridis")
correlation.set_title('Correlation Matrix of the Hotel Booking', fontdict={'fontsize': 24})

There is no clear, strong relationship between any of the features and cancellation; we may need to include more factors in the dataset.

## Feature Engineering to Predict the Model

Dropping a few columns that are not required for prediction

In [None]:
# Columns that are not used as predictors.
unused_cols = ['meal', 'country', 'reserved_room_type', 'assigned_room_type',
               'deposit_type', 'reservation_status', 'reservation_status_date']
# Categorical predictors that are one-hot encoded.
columns_to_encode = ['hotel', 'arrival_date_month', 'market_segment',
                     'distribution_channel', 'customer_type']

hotel = Book_Hotel_DF.drop(columns=unused_cols)
# get_dummies(columns=...) replaces each listed column with its dummy columns
# and prefixes them with the source column name. Without the prefix, two source
# columns that share a category label would produce dummy columns with the
# same name, making the feature matrix ambiguous.
# drop_first=True drops one level per feature to avoid redundant columns.
# dtype=int keeps the dummies numeric on every pandas version (newer versions
# default to bool, which turns `.values` of the mixed frame into an object array).
# The untouched columns keep their original order, so is_canceled stays first.
hotel = pd.get_dummies(hotel, columns=columns_to_encode, drop_first=True, dtype=int)

Train and Test Data sets

In [None]:
# Separate the target from the predictors, then hold out 30% of the rows for testing.
from sklearn.model_selection import train_test_split
y = hotel['is_canceled']
X = hotel.drop(columns='is_canceled')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

Scaling for logistic regression

In [None]:
# Standardise the features. The scaler is fitted on the training split only and
# then applied to both splits, so nothing is learned from the test rows.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Logistic Regression ###

In [None]:
# Fit a cross-validated logistic regression on the scaled training split and
# report its performance on the held-out test split.
from sklearn.linear_model import LogisticRegressionCV

reg_model = LogisticRegressionCV(max_iter=1000)
reg_model.fit(X_train,y_train)
y_pred_reg = reg_model.predict(X_test)
# acc_reg is kept as a fraction in [0, 1]; it is only converted to percent for display.
acc_reg = accuracy_score(y_test, y_pred_reg)
print("Classification Report of Logistic Regression:")
print(classification_report(y_test,y_pred_reg))
print("Confusion Matrix:\n",confusion_matrix(y_test,y_pred_reg))
# Both scores are printed in percent so that they can be compared directly.
print("Training Score:\n",reg_model.score(X_train,y_train)*100)
print(f"Accuracy Score of Logistic Regression is : {acc_reg * 100:.2f} %")

Training Score: 78.69 %
Accuracy score of Logistic Regression = 79.08 %

Splitting the dataset again for KNN and Random Forest Prediction

In [None]:
# Fresh, unscaled split (25% test) for the KNN and Random Forest models.
# The target is selected by name rather than by position, so the split stays
# correct even if the column order of `hotel` changes.
X = hotel.drop(columns='is_canceled').values
y = hotel['is_canceled'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [None]:
# Accuracy (in percent) of every fitted model, keyed by model name
model_accuracy_results = {}


def model_accuracy(y_test, y_pred):
    """Return the accuracy of a set of predictions as a percentage.

    The accuracy is the share of predictions equal to the true label, which is
    the diagonal of the confusion matrix divided by the number of samples.
    Counting the matches directly works for any number of classes, including
    a test set that contains a single class.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, same length as ``y_test``.

    Returns
    -------
    numpy.float64
        Accuracy in percent, rounded to two decimals.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    actual = np.asarray(y_test).ravel()
    predicted = np.asarray(y_pred).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_test and y_pred must have the same length, got {actual.size} and {predicted.size}"
        )
    if actual.size == 0:
        raise ValueError("cannot compute the accuracy of an empty prediction set")
    n_correct = int(np.count_nonzero(actual == predicted))
    return np.float64(n_correct * 100 / actual.size).round(2)

In [None]:
# Fit and train
# KNN classifies by distance between samples, so the features are standardised
# first; otherwise the columns with the largest numeric range dominate the
# distance. Wrapping both steps in a pipeline fits the scaler on the training
# split only and applies the same scaling when predicting.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
classifier = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors = 10))
classifier.fit(X_train,y_train)

# Predict
y_pred = classifier.predict(X_test)

# Report the confusion matrix and the per-class precision/recall/F1
result = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_test, y_pred)
print("Classification Report:")
print(result1)

# Computing accuracy (in percent) and storing it for the final comparison table
model_accuracy_results['KNearestNeighbors'] = model_accuracy(y_test, y_pred)

In [None]:
# Train a 10-tree random forest (entropy split criterion, fixed seed) on the training split
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(random_state=0, criterion='entropy', n_estimators=10)
classifier.fit(X_train, y_train)

# Predict the labels of the held-out test split
y_pred = classifier.predict(X_test)

# Report the confusion matrix and the per-class precision/recall/F1
result = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", result, sep="\n")
result1 = classification_report(y_test, y_pred)
print("Classification Report:", result1, sep="\n")

# Store the accuracy (in percent) for the final comparison table
model_accuracy_results['RandomForest'] = model_accuracy(y_test, y_pred)

In [None]:
# Collect the stored accuracies into a one-column table, one row per model.
model_names = list(model_accuracy_results)
accuracy_values = [model_accuracy_results[model_name] for model_name in model_names]
acurracies = pd.DataFrame(accuracy_values, index=model_names, columns=['Accuracy'])
acurracies