In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Plotting stack.
import matplotlib.pyplot as plt
import seaborn as sns

# scikit-learn helpers: the train/test split and the categorical encoders.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Never truncate columns when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

In [None]:
# Load the hotel bookings CSV from the Kaggle input directory into `data`.
data = pd.read_csv("/kaggle/input/hotel-booking-demand/hotel_bookings.csv")

In [None]:

# Display the loaded frame as this cell's output.
data

# Comparison of hotel and cancellation
### It can be observed that the city hotel has more cancellations than the resort hotel

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(x=data['hotel'], y=data['is_canceled'], palette=sns.color_palette("magma"))

In [None]:
sns.countplot(data.hotel,hue=data.is_canceled, palette=sns.color_palette("magma"))
plt.show()

# Comparison of lead_time and cancellation
### It can be observed that there may be a weak linear relationship between lead time and cancellation

In [None]:
# Scatter of lead_time against the cancellation flag with a fitted regression line.
sns.lmplot(x="is_canceled", y="lead_time", data=data)

# Distribution of cancellation and month of the year
### It can be observed that cancellation is more probable at the beginning of the new year

In [None]:
plt.figure(figsize=(20,10))
sns.barplot(x=data['arrival_date_month'], y=data['is_canceled'], palette=sns.color_palette("magma"))

In [None]:
plt.figure(figsize=(20,10))
sns.countplot(data.arrival_date_month,hue=data.is_canceled, palette=sns.color_palette("magma"))
plt.show()

# Distribution of leadtime and month of the year
### It can be observed that the lead time is lower at the beginning of the new year; we saw previously that cancellation is more probable with a longer lead time, due to the positive correlation

In [None]:
# Mean lead time for each arrival month.
plt.figure(figsize=(20, 10))
sns.barplot(
    x="arrival_date_month",
    y="lead_time",
    data=data,
    palette=sns.color_palette("magma"),
)

# Distribution of cancellation and week of the year
### No strong patterns can be observed

In [None]:
# Mean of `is_canceled` for each arrival week number.
plt.figure(figsize=(20, 10))
sns.barplot(
    x="arrival_date_week_number",
    y="is_canceled",
    data=data,
    palette=sns.color_palette("pastel"),
)

# Distribution of cancellation and day of the month
### No strong patterns can be observed

In [None]:
# Mean of `is_canceled` for each arrival day of the month.
plt.figure(figsize=(20, 10))
sns.barplot(
    x="arrival_date_day_of_month",
    y="is_canceled",
    data=data,
    palette=sns.color_palette("pastel"),
)

# Distribution of cancellation and weekend nights
### For bookings with fewer weekend nights, the standard error is smaller and the chance of cancellation is lower
#### With more than 5 weekend nights (about a month), the chance of cancellation seems higher, with a larger standard error

In [None]:
plt.figure(figsize=(20,10))
sns.barplot(x=data.stays_in_weekend_nights, y=data['is_canceled'], palette=sns.color_palette("magma"))

In [None]:
data["stays_in_weekend_nights"] = data["stays_in_weekend_nights"]>4

In [None]:
plt.figure(figsize=(20,10))
sns.barplot(x=data.stays_in_weekend_nights, y=data['is_canceled'], palette=sns.color_palette("magma"))

# Distribution of cancellation and week nights
### For bookings with fewer week nights, the standard error is smaller and the chance of cancellation is lower
#### With more than 10 week nights, the chance of cancellation seems higher, with a larger standard error

In [None]:
plt.figure(figsize=(20,10))
sns.barplot(x=data.stays_in_week_nights, y=data['is_canceled'])

In [None]:
data["stays_in_week_nights"] = data["stays_in_week_nights"]>10

In [None]:
plt.figure(figsize=(20,10))
sns.barplot(x=data.stays_in_week_nights, y=data['is_canceled'])

# Data Distribution of Number of Adults

In [None]:
data.adults.value_counts()

In [None]:
sns.distplot(a=data['adults'], kde=False)

# Distribution of cancellation and Number of Adults
### For bookings with fewer adults, the standard error is smaller and the chance of cancellation is lower
#### This is most likely because people booking for more than 5 people were probably not serious about it

In [None]:
plt.figure(figsize=(20,10))
sns.barplot(x=data.adults, y=data['is_canceled'])

In [None]:
data["adults"] = data["adults"]>4

In [None]:
plt.figure(figsize=(20,10))
sns.barplot(x=data.adults, y=data['is_canceled'])

# Data Distribution of deposit Type

In [None]:
data.deposit_type.unique()

In [None]:
plt.figure(figsize=(20,10))
sns.catplot(x="deposit_type", kind="count", data=data)

In [None]:
data.deposit_type.isnull().any()

# Distribution of cancellation and Deposit Type
### It Seems more bookings were non refundable and were cancelled more as well 

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(x=data['deposit_type'], y=data['is_canceled'], palette=sns.color_palette("Blues"))

In [None]:
plt.figure(figsize=(20,10))
sns.countplot(data.deposit_type,hue=data.is_canceled, palette=sns.color_palette("magma"))
plt.show()

# Data Distribution of Customer Type

In [None]:
data.customer_type.value_counts()

In [None]:
plt.figure(figsize=(20,10))
sns.catplot(x="customer_type", kind="count", data=data)

# Distribution of cancellation and Customer Type
### It Seems more bookings were cancelled by transient customers 

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(x=data['customer_type'], y=data['is_canceled'], palette=sns.color_palette("Blues"))

In [None]:
plt.figure(figsize=(20,10))
sns.countplot(data.customer_type,hue=data.is_canceled, palette=sns.color_palette("magma"))
plt.show()

# Distribution of cancellation and car parking
### It seems that people who booked a parking space didn't cancel their booking

In [None]:
data.required_car_parking_spaces.value_counts()

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(x=data['required_car_parking_spaces'], y=data['is_canceled'], palette=sns.color_palette("Blues"))

In [None]:
data.total_of_special_requests.value_counts()

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(x=data['total_of_special_requests'], y=data['is_canceled'], palette=sns.color_palette("Blues"))

In [None]:
plt.figure(figsize=(20,10))
sns.countplot(data.total_of_special_requests,hue=data.is_canceled, palette=sns.color_palette("magma"))
plt.show()

In [None]:
data

# Distribution of cancellation and children
### It doesn't show any significant pattern

In [None]:
data.children.value_counts()

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(x=data['children'], y=data['is_canceled'], palette=sns.color_palette("Blues"))

In [None]:
plt.figure(figsize=(20,10))
sns.countplot(data.children,hue=data.is_canceled, palette=sns.color_palette("magma"))
plt.show()

In [None]:
data["children"] = data["children"]>4

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(x=data['children'], y=data['is_canceled'], palette=sns.color_palette("Blues"))

# Distribution of cancellation and Babies
### It shows a high standard error for bookings with more than 2 babies, so this feature is better left out

In [None]:
data.babies.value_counts()

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(x=data['babies'], y=data['is_canceled'], palette=sns.color_palette("Blues"))

# Distribution of cancellation and Meal
### It shows that a large proportion of the data is undefined, so it may affect the predictions

In [None]:
data.meal.value_counts()

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(x=data['meal'], y=data['is_canceled'], palette=sns.color_palette("Blues"))

# Observation with country Data, some countries show more cancellation

In [None]:
data.country.isnull().any()

In [None]:
plt.figure(figsize=(100,10))
sns.barplot(x=data['country'], y=data['is_canceled'], palette=sns.color_palette("Blues"))

# Observation between is_repeated_guest and cancellation

Repeated guests have a lower chance of cancellation

In [None]:
data.is_repeated_guest.value_counts()

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(x=data['is_repeated_guest'], y=data['is_canceled'], palette=sns.color_palette("Blues"))

In [None]:
plt.figure(figsize=(20,10))
sns.countplot(data.is_repeated_guest,hue=data.is_canceled, palette=sns.color_palette("magma"))
plt.show()

# Observation between previous_cancellations and cancellation

A pattern can be seen: for some irregular numbers of previous cancellations, the chances of cancellation are very high

In [None]:
data.previous_cancellations.value_counts()

In [None]:
plt.figure(figsize=(20,10))
sns.barplot(x=data['previous_cancellations'], y=data['is_canceled'], palette=sns.color_palette("Blues"))

In [None]:
data["previous_cancellations"] = data["previous_cancellations"] > 0

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(x=data['previous_cancellations'], y=data['is_canceled'], palette=sns.color_palette("Blues"))

# Observation between previous_bookings_not_canceled and cancellation

There is a lot of standard error in the data

In [None]:
data.previous_bookings_not_canceled.value_counts()

In [None]:
plt.figure(figsize=(100,10))
sns.barplot(x=data['previous_bookings_not_canceled'], y=data['is_canceled'], palette=sns.color_palette("Blues"))

# Observation between reserved_room_type and cancellation

There is a lot of standard error in the data, and no significant pattern visible 

In [None]:
data.reserved_room_type.value_counts()

In [None]:
plt.figure(figsize=(100,10))
sns.barplot(x=data['reserved_room_type'], y=data['is_canceled'], palette=sns.color_palette("Blues"))

# Observation between assigned room type and cancellation

Some room types don't have enough data to show a concrete pattern

In [None]:
data.assigned_room_type.value_counts()

In [None]:
plt.figure(figsize=(100,10))
sns.barplot(x=data['assigned_room_type'], y=data['is_canceled'], palette=sns.color_palette("Blues"))

In [None]:
temp = data

In [None]:
temp["booking_changes"]>0

In [None]:
data["booking_changes"] = data["booking_changes"]>0

In [None]:
data.booking_changes.value_counts()

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(x=data['booking_changes'], y=data['is_canceled'], palette=sns.color_palette("Blues"))

In [None]:
plt.figure(figsize=(20,10))
sns.countplot(data.booking_changes,hue=data.is_canceled, palette=sns.color_palette("magma"))
plt.show()

In [None]:
data.market_segment.value_counts()

In [None]:
plt.figure(figsize=(100,10))
sns.barplot(x=data['market_segment'], y=data['is_canceled'], palette=sns.color_palette("Blues"))

In [None]:
data.distribution_channel.value_counts()

In [None]:
plt.figure(figsize=(50,10))
sns.barplot(x=data['distribution_channel'], y=data['is_canceled'], palette=sns.color_palette("Blues"))

In [None]:
data.booking_changes.value_counts()

# From the observations, I can assume that the features in the following cell can be used to build a predictive model that estimates the chance of a booking being cancelled or not

In [None]:
data

In [None]:
features = ["hotel", "lead_time","meal", "arrival_date_month", "stays_in_weekend_nights", "stays_in_week_nights" ,"market_segment","distribution_channel","adr","adults","children","previous_cancellations","customer_type","reserved_room_type","assigned_room_type", "is_repeated_guest","days_in_waiting_list", "booking_changes", "is_canceled","required_car_parking_spaces"]

# The effect of the different features has been observed
## Some are not considered because they are undefined or have too much standard error

## Now I will preprocess the data, starting from the raw CSV again
### We will do label encoding later; first we will categorize some of the existing data

In [None]:
 # Re-read the raw CSV, replacing the frame whose columns were overwritten in place during exploration.
 # NOTE(review): this cell starts with a stray space; presumably it only parses because
 # IPython strips a cell's leading indentation — consider removing it.
 data = pd.read_csv("/kaggle/input/hotel-booking-demand/hotel_bookings.csv")

In [None]:
data

In [None]:
data = data[features]

In [None]:
data

In [None]:
# Threshold-based categorisation tried during exploration. Every statement in this cell
# is commented out, so nothing runs here.
# # More than 4 weekend nights had more chances of cancellation
# data["stays_in_weekend_nights"] = data["stays_in_weekend_nights"]>4
# # More than 10 week nights had more chances of cancellation
# data["stays_in_week_nights"] = data["stays_in_week_nights"]>10
# # More than 4 adults had more chances of cancellation
# data["adults"] = data["adults"]>4
# # More than 4 children had more chances of cancellation
# data["children"] = data["children"]>4
# # More previous cancellations increases chance of cancellation
# data["previous_cancellations"] = data["previous_cancellations"] > 0
# # More booking changes lowers chances of cancellation
# data["booking_changes"] = data["booking_changes"]>0

In [None]:
# Display the frame (unchanged by the cell above).
data

## I am done with categorizing the data, now I will label encode it

In [None]:
# One LabelEncoder per encoded column. Each encoder keeps its own module-level name so
# that its fitted state (`classes_`, `inverse_transform`) stays reachable afterwards.
labelHotel = LabelEncoder()
labelmonth = LabelEncoder()
labelWeekendNights = LabelEncoder()
labelWeekNights = LabelEncoder()
labelAdults = LabelEncoder()
labelChildren = LabelEncoder()
labelMeal = LabelEncoder()
labelCtype = LabelEncoder()
labelmarket_segment = LabelEncoder()
labelres_room_type = LabelEncoder()
labelassigned_room = LabelEncoder()
labeldist_channel = LabelEncoder()
labelPrevCan = LabelEncoder()
labelRepGuest = LabelEncoder()
labelBooking_changes = LabelEncoder()

# Column -> encoder, in the order the columns are encoded.
encoder_by_column = {
    "hotel": labelHotel,
    "arrival_date_month": labelmonth,
    "stays_in_weekend_nights": labelWeekendNights,
    "stays_in_week_nights": labelWeekNights,
    "adults": labelAdults,
    "children": labelChildren,
    "meal": labelMeal,
    "customer_type": labelCtype,
    "market_segment": labelmarket_segment,
    "reserved_room_type": labelres_room_type,
    "assigned_room_type": labelassigned_room,
    "distribution_channel": labeldist_channel,
    "previous_cancellations": labelPrevCan,
    "is_repeated_guest": labelRepGuest,
    "booking_changes": labelBooking_changes,
}

# Fit each encoder on its column and replace the column with the integer codes.
# Bracket assignment is used instead of `data.<column> = ...`: attribute-style
# assignment silently sets a plain attribute when the name is not an existing column.
# NOTE(review): LabelEncoder numbers classes in sorted order, so month names receive
# alphabetical codes rather than calendar order — confirm that is acceptable.
for column_name, encoder in encoder_by_column.items():
    data[column_name] = encoder.fit_transform(data[column_name])

In [None]:
data

In [None]:
data.isnull().any()

In [None]:
X=data.drop(['is_canceled'],axis=1)
y=data['is_canceled']

In [None]:
X

In [None]:
y

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)

In [None]:
print("Training X shape ", X_train.shape)
print("Training y shape ", y_train.shape)
print("Testing X shape ", X_test.shape)
print("Testing y shape ", y_test.shape)

# Applying Different Models to the Problem 

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

### Gaussian Naive bayes

In [None]:

# Fit Gaussian Naive Bayes on the training split and predict the held-out split.
clf = GaussianNB()
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

# Compute the three scores, then print them one per line: accuracy, recall, precision.
accuracy = metrics.accuracy_score(y_test, preds)
recall = metrics.recall_score(y_test, preds)
precision = metrics.precision_score(y_test, preds)
print(accuracy, recall, precision, sep="\n")

### XGB Classifier

In [None]:
from xgboost import XGBClassifier

# Fit an XGBoost classifier with default settings and predict the held-out split.
clf = XGBClassifier()
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

# Compute the three scores, then print them one per line: accuracy, recall, precision.
accuracy = metrics.accuracy_score(y_test, preds)
recall = metrics.recall_score(y_test, preds)
precision = metrics.precision_score(y_test, preds)
print(accuracy, recall, precision, sep="\n")

### Multilayered Perceptron

In [None]:
# Multilayer-perceptron experiment, left disabled: every statement in this cell is
# commented out, so nothing runs here.
# from sklearn.neural_network import MLPClassifier
# clf=MLPClassifier(alpha=0.0001)
# clf.fit(X_train,y_train)
# preds=clf.predict(X_test)
# accuracy=metrics.accuracy_score(y_test,preds)
# print(accuracy)
# recall=metrics.recall_score(y_test,preds)
# precision = metrics.precision_score(y_test,preds)
# print(recall)
# print(precision)

### Decision Tree Classifier

In [None]:
from sklearn import tree

# Fit a decision tree and predict the held-out split.
# `random_state` is fixed: tree building involves randomness, so without a seed two
# runs on identical data could report different scores.
clf=tree.DecisionTreeClassifier(random_state=42)
clf.fit(X_train,y_train)
preds=clf.predict(X_test)

# Report accuracy, recall and precision, in that order.
accuracy=metrics.accuracy_score(y_test,preds)
print(accuracy)
recall=metrics.recall_score(y_test,preds)
precision = metrics.precision_score(y_test,preds)
print(recall)
print(precision)

### KNeighborsClassifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 20-nearest-neighbours classifier and predict the held-out split.
clf = KNeighborsClassifier(n_neighbors=20)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

# Compute the three scores, then print them one per line: accuracy, recall, precision.
accuracy = metrics.accuracy_score(y_test, preds)
recall = metrics.recall_score(y_test, preds)
precision = metrics.precision_score(y_test, preds)
print(accuracy, recall, precision, sep="\n")

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest and predict the held-out split.
# `random_state` is fixed: the forest bootstraps rows and samples features at random,
# so without a seed two runs on identical data report different scores.
clf=RandomForestClassifier(random_state=42)
clf.fit(X_train,y_train)
preds=clf.predict(X_test)

# Report accuracy, recall and precision, in that order.
accuracy=metrics.accuracy_score(y_test,preds)
print(accuracy)
recall=metrics.recall_score(y_test,preds)
precision = metrics.precision_score(y_test,preds)
print(recall)
print(precision)

In [None]:
from sklearn.linear_model import RidgeClassifier

# Fit a ridge classifier with default settings and predict the held-out split.
clf = RidgeClassifier()
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

# Compute the three scores, then print them one per line: accuracy, recall, precision.
accuracy = metrics.accuracy_score(y_test, preds)
recall = metrics.recall_score(y_test, preds)
precision = metrics.precision_score(y_test, preds)
print(accuracy, recall, precision, sep="\n")