In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file found under the Kaggle input directory and print its full path.
input_paths = (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_paths:
    print(path)

# Any results you write to the current directory are saved as output.

The given dataset revolves around two hotels, a City hotel and a Resort hotel. It provides information about the services and facilities provided to customers during their stay at the hotel. It also shows the booking habits of customers from various regions.

We need to build a machine learning model to predict the cancellation habits of the customers. For this we need to take the characteristics of the customer as the features for the model. We will also analyse the dataset and try to extract some valuable information from it.

Importing necessary libraries and functions.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
%matplotlib inline

Importing the dataset

In [None]:
# Read the bookings CSV, using the 'hotel' column as the row index while parsing.
bookings_csv = '/kaggle/input/hotel-booking-demand/hotel_bookings.csv'
df = pd.read_csv(bookings_csv, index_col='hotel')
df.head()

Check the columns in the dataset.

In [None]:
# Show the column labels of the loaded frame ('hotel' is the index, so it is not listed).
df.columns

##### Check for the total number of NaN (Not a Number) values in each column.
We basically want to filter the dataset for NaN values and take out the most relevant information.

In [None]:
# Per-column count of missing entries (isnull is an alias of isna).
df.isnull().sum()

The output above shows that the columns 'company', 'agent' and 'country' contain the largest number of missing (NaN) values.

In [None]:
# Remove the 'company', 'agent' and 'country' columns; the result is rebound to
# `df` rather than mutating the frame in place.
df = df.drop(columns=['company', 'agent', 'country'])
df.head()

The column 'children' also has some missing values but they are not sufficient in number for the whole column to be ignored in calculation.

So we will check for the most common value in the 'children' column using the mode() function and fill the missing values by the same.

In [None]:
# Most frequent value(s) of 'children'; mode() returns a Series because ties are possible.
df['children'].mode()

In [None]:
# Fill the missing 'children' entries with the column's most frequent value.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: the chained in-place form works
# on an intermediate object, raises a FutureWarning in pandas 2.x and leaves
# the DataFrame unchanged once copy-on-write is enabled.
children_mode = df['children'].mode()[0]
df['children'] = df['children'].fillna(children_mode)

Now we will make use of the Label Encoder to convert the non-numeric entries to the encoded values that can be understood by the model.

In [None]:
# Replace each categorical text column with integer codes. The single
# LabelEncoder is refitted for every column, so after the loop it only holds
# the classes of the last column processed ('reservation_status').
encode = LabelEncoder()
categorical_columns = [
    'arrival_date_month',
    'meal',
    'market_segment',
    'distribution_channel',
    'reserved_room_type',
    'assigned_room_type',
    'deposit_type',
    'customer_type',
    'reservation_status',
]
for column in categorical_columns:
    df[column] = encode.fit_transform(df[column])

We also convert the year column to encoded values using the map function.

In [None]:
# Ordinal codes for the arrival years; any year missing from the mapping becomes NaN.
year_codes = {2015: 1, 2016: 2, 2017: 3}
df['arrival_date_year'] = df['arrival_date_year'].map(year_codes)

Now, to prevent the machine learning model from producing inaccurate results, we scale down the values of the 'lead_time' and 'adr' columns. They are brought into the range 0 to 1 via the MinMaxScaler() function (its default `feature_range`).

In [None]:
# Min-max scale 'lead_time' and 'adr' independently. The same scaler object is
# refitted for each column, so it ends up fitted on 'adr' only.
scaler = MinMaxScaler()
for column in ('lead_time', 'adr'):
    column_2d = df[column].to_numpy().reshape(-1, 1)  # scikit-learn expects a 2-D array
    df[column] = scaler.fit_transform(column_2d).ravel()

For our model we need the most relevant and highly correlated columns to produce precise results. So to check the relationship between the variables we derive their correlation.

In [None]:
# Pairwise Pearson correlation between the numeric columns.
# numeric_only=True is required on pandas >= 2.0, where corr() no longer drops
# non-numeric columns silently and instead raises if any column still holds
# text (for example a date stored as a string).
df.corr(numeric_only=True)

Plotting the heatmap for better visualisation of the result.

In [None]:
# Heatmap of the correlation matrix of the numeric columns.
# numeric_only=True keeps corr() from raising on leftover text columns
# (pandas >= 2.0 no longer excludes them automatically).
plt.figure(figsize=(10, 10))
sns.heatmap(df.corr(numeric_only=True))

From the derived result we will only select those variables which have high correlation to the decision variable('is_canceled'). Those with very low correlation will be neglected.

In [None]:
# Features kept for modelling, plus the target 'is_canceled'.
# 'reservation_status' is deliberately NOT included: it records the final
# outcome of the booking (cancelled / checked out / no-show), so it is only
# known after the fact and encodes the target itself. Training on it is target
# leakage and yields a meaninglessly high score.
feature_columns = [
    'total_of_special_requests',
    'required_car_parking_spaces',
    'deposit_type',
    'booking_changes',
    'assigned_room_type',
    'previous_cancellations',
    'distribution_channel',
    'lead_time',
]
data = df[feature_columns + ['is_canceled']]

Dividing the dataset

In [None]:
# Separate the target column from the feature matrix.
target_column = 'is_canceled'
y = data[target_column]
X = data.drop(columns=[target_column])

Using Linear Regression to train and test the dataset.

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2
)
# Fit ordinary least squares directly on the 'is_canceled' target.
linreg = LinearRegression()
linreg.fit(X_train, y_train)

In [None]:
# Continuous predictions of the linear model for the held-out rows (not class labels).
y_pred = linreg.predict(X_test)

Checking the accuracy of the model and constructing the confusion matrix.

In [None]:
# LinearRegression.score() returns the coefficient of determination (R^2), not
# a classification accuracy, so it cannot be compared with the accuracy of a
# classifier. Convert the continuous predictions to 0/1 labels (round, then
# clip anything outside the valid range) and measure the share of correct ones.
predicted_labels = np.clip(linreg.predict(X_test).round(), 0, 1)
accuracy = (predicted_labels == y_test).mean()
print(accuracy)

In [None]:
# Confusion matrix for the linear model. Rounded regression outputs are not
# guaranteed to be 0 or 1 (values such as -1 or 2 are possible), which would
# add spurious classes to the matrix; clip them into {0, 1} first.
predicted_labels = np.clip(y_pred.round(), 0, 1).astype(int)
matrix = confusion_matrix(y_test, predicted_labels)
matrix

Using Logistic Regression to train and test the dataset.

In [None]:
# Same 80/20 split as used for the linear model (identical seed, identical rows).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
# The default solver stops after 100 iterations, which may be before it has
# converged on features that are not all on the same scale (scikit-learn then
# emits a ConvergenceWarning). A higher cap lets the optimisation finish.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

In [None]:
# Predicted class labels of the logistic model for the held-out rows (overwrites the earlier y_pred).
y_pred = logreg.predict(X_test)

Checking the accuracy of the model and constructing the confusion matrix.

In [None]:
# For a classifier, score() is the mean accuracy on the given rows.
accuracy = logreg.score(X=X_test, y=y_test)
print(accuracy)

In [None]:
# Confusion matrix of the logistic model (true labels first, predictions second).
rounded_predictions = y_pred.round()
matrix = confusion_matrix(y_true=y_test, y_pred=rounded_predictions)
matrix

Since the accuracy of the Logistic Regression is better, we select it for constructing the machine learning model.

# Data Visualisation

Importing the dataset for visualisation purposes.

In [None]:
# Fresh, unmodified copy of the bookings for plotting (original labels, no encoding or scaling).
raw_bookings_csv = '/kaggle/input/hotel-booking-demand/hotel_bookings.csv'
df1 = pd.read_csv(raw_bookings_csv)

Check the most prominent month in which the customers are visiting the hotels.

In [None]:
# Number of bookings per arrival month, drawn in calendar order.
# A plain groupby/count sorts the month names alphabetically (April, August,
# December, ...), which hides any seasonal pattern.
# Assumes the column holds full English month names; months that never occur
# are shown as 0.
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
monthly_bookings = (
    df1.groupby(['arrival_date_month'])['arrival_date_month']
    .count()
    .reindex(month_order, fill_value=0)
)
plt.figure(figsize=(10, 5))
monthly_bookings.plot.bar()

Calculating the number of entries to the hotel year-wise.

In [None]:
# Bookings per arrival year, one bar per hotel.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df1, x='arrival_date_year', hue='hotel', ax=ax)

Grouping the customers according to their country.

In [None]:
# Bookings per country of origin, largest first; the wide figure leaves room for the many country codes.
country_counts = df1.groupby(['country'])['country'].size()
country_counts = country_counts.sort_values(ascending=False)
plt.figure(figsize=(30, 5))
country_counts.plot.bar()

The above result shows that the people from Portugal are the ones who had visited the given two hotels the maximum number of times.

Checking the data specifically for the Portuguese people.

In [None]:
# Bookings whose country code is 'PRT', counted per arrival month.
maxpop = df1[df1['country'] == 'PRT']
# Show the months in calendar order; a plain groupby/count would sort the
# names alphabetically. Assumes full English month names; months that never
# occur are shown as 0.
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
prt_monthly_bookings = (
    maxpop.groupby(['arrival_date_month'])['arrival_date_month']
    .count()
    .reindex(month_order, fill_value=0)
)
plt.figure(figsize=(10, 5))
prt_monthly_bookings.plot.bar()

It is seen that the Portuguese customers travelled the most in the months of August, July and October.

In [None]:
# Bookings in the 'PRT' subset per arrival year, split by hotel.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=maxpop, x='arrival_date_year', hue='hotel', ax=ax)

In [None]:
# Bookings per market segment, split by hotel.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df1, x='market_segment', hue='hotel', ax=ax)

In [None]:
# Counts of each 'is_canceled' value, split by hotel.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df1, x='is_canceled', hue='hotel', ax=ax)

Dividing the dataset on the basis that the customers have received the same room or not as they had selected at the time of the booking.

In [None]:
# Bookings where the assigned room type differs from the reserved one.
room_was_changed = df1['reserved_room_type'].ne(df1['assigned_room_type'])
change_room = df1.loc[room_was_changed]

In [None]:
# Counts of each 'is_canceled' value among bookings with a changed room, split by hotel.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=change_room, x='is_canceled', hue='hotel', ax=ax)

This shows that the allotment of rooms different from those that were reserved did not lead to the reservation getting canceled.

In [None]:
# Number of bookings for every (deposit type, is_canceled) combination,
# printed as a table and then drawn as grouped bars.
deposit = df1.groupby(['deposit_type', 'is_canceled'])['deposit_type'].count()
print(deposit)
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df1, x='deposit_type', hue='is_canceled', ax=ax)

This shows that the customers who had submitted a non-refundable amount had the most number of cancellations.

In [None]:
# Number of bookings for every (customer type, is_canceled) combination,
# printed as a table and then drawn as grouped bars.
customer = df1.groupby(['customer_type', 'is_canceled'])['customer_type'].count()
print(customer)
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df1, x='customer_type', hue='is_canceled', ax=ax)

Customers of the 'Transient' type account for the largest number of canceled reservations.