In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

import warnings
from warnings import filterwarnings
filterwarnings('ignore')
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
df = pd.read_csv("/kaggle/input/hotel-booking-demand-complete/hotel_booking.csv")
df.head()

In [None]:
df.shape

## DATA CLEANING

In [None]:
df.isna().sum() # get missing values

In [None]:
# Fill missing values with 0. For "agent" and "company" this is a natural choice:
# a missing agent most likely means the booking was made without an agency,
# and a missing company means the booking was made privately.
# Note that the cleaning function below applies the same 0-fill to every column.

In [None]:
# deal with missing values
def data_clean(df):
    """Replace every missing value in ``df`` with 0 and report what is left.

    The frame is modified in place (nothing is returned). After filling,
    the number of remaining nulls per column is printed so the result can
    be checked at a glance.
    """
    df.fillna(value=0, inplace=True)
    remaining_nulls = df.isna().sum()
    print(remaining_nulls)

data_clean(df)

In [None]:
df.columns

In [None]:
# Show the distinct head-counts recorded for each guest category.
# The column names are kept in `guest_columns` rather than `list`, so the
# built-in `list` stays usable in later cells.
guest_columns = ['adults', 'children', 'babies']

for column in guest_columns:
    print('{} has unique values as {}'.format(column, df[column].unique()))

In [None]:
# wrong entries where adults, childern and babies are '0' at a time
filter = (df['children']==0) & (df['adults']==0) & (df['babies']==0) 
df[filter]

In [None]:
# data after cleaning
data = df[~filter] # negation of filter i.e., data without above data
data.head()

## Analysing home country of guests

In [None]:
country_wise = data[data['is_canceled'] == 0]['country'].value_counts()
country_wise

In [None]:
# Turn the per-country counts into a two-column frame: country, number of guests.
country_wise_df = (
    country_wise
    .rename_axis('country')
    .reset_index(name='no. of guests')
)
country_wise_df

In [None]:
import folium
from folium.plugins import HeatMap
basemap = folium.Map()

import plotly.express as px
# World map shaded by the number of guests per home country.
# Columns are referenced by name; plotly looks them up in country_wise_df.
map_guests = px.choropleth(
    country_wise_df,
    locations='country',
    color='no. of guests',
    hover_name='country',
    title='Home country of guests',
)
map_guests.show()

In [None]:
data2 = data[data['is_canceled']==0]
data2.columns

In [None]:
# Distribution of adr for each reserved room type, split by hotel.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=data2, x='reserved_room_type', y='adr', hue='hotel', ax=ax)
ax.set_title('Price of rooms types per night & per person')
ax.set_xlabel('Room type')
ax.set_ylabel('Price(Euro)')
ax.legend()
plt.show()

For the City Hotel, the best price distribution is seen for room type "G"; for the Resort Hotel, it is seen for room types "H" and "C".

## Analysing prices of hotel across year for Resort Hotel and City Hotel  
(how much price varies over year?)

In [None]:
# Split the bookings that were not cancelled by hotel type.
not_canceled = data['is_canceled'] == 0
data_resort = data[not_canceled & (data['hotel'] == 'Resort Hotel')]
data_city = data[not_canceled & (data['hotel'] == 'City Hotel')]

In [None]:
data_resort.head()

In [None]:
resort_hotel_df = data_resort.groupby(['arrival_date_month'])['adr'].mean().reset_index()
resort_hotel_df

In [None]:
city_hotel_df = data_city.groupby(['arrival_date_month'])['adr'].mean().reset_index()
city_hotel_df

In [None]:
# Combine the two monthly averages into one frame with one price column per hotel.
# Each side is renamed first, so the merged frame needs no positional relabelling.
resort_prices = resort_hotel_df.rename(
    columns={'arrival_date_month': 'month', 'adr': 'price_for_resort_hotel'}
)
city_prices = city_hotel_df.rename(
    columns={'arrival_date_month': 'month', 'adr': 'price_for_city_hotel'}
)
final_df = resort_prices.merge(city_prices, on='month')
final_df

In [None]:
# python modules for sorting the month, weekday etc
!pip install sort_dataframeby_monthorweek
!pip install sorted-months-weekdays

In [None]:
import sort_dataframeby_monthorweek as sd

def sort_data(df, column):
    """Return a copy of ``df`` with rows in calendar order of ``column``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sort; it is not modified.
    column : str
        Name of the column holding full English month names
        ('January' ... 'December').

    Returns
    -------
    pandas.DataFrame
        The rows ordered January to December with a fresh 0..n-1 index.
        Rows whose value is not a recognised month name are placed last,
        and rows sharing a month keep their original relative order.

    The ordering is done with pandas alone, so it does not depend on any
    package installed at run time.
    """
    month_order = (
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    )
    rank_of = {name: position for position, name in enumerate(month_order)}
    ranks = df[column].map(rank_of)
    # Unrecognised labels map to NaN, which a stable argsort places at the end.
    row_order = ranks.to_numpy(dtype=float).argsort(kind='stable')
    return df.iloc[row_order].reset_index(drop=True)

In [None]:
final_df = sort_data(final_df, 'month')
final_df

In [None]:
final_df.columns

In [None]:
# line plot
px.line(final_df, x='month', y=['price_for_resort_hotel', 'price_for_city_hotel'], 
        title='Room price per night over the months')

## Analysing the demand of hotels  
(highest number of guests in which season?)

In [None]:
# Number of resort-hotel bookings per arrival month.
rush_resort_df = (
    data_resort['arrival_date_month']
    .value_counts()
    .rename_axis('month')
    .reset_index(name='no. of guests')
)
rush_resort_df

In [None]:
# Number of city-hotel bookings per arrival month.
rush_city_df = (
    data_city['arrival_date_month']
    .value_counts()
    .rename_axis('month')
    .reset_index(name='no. of guests')
)
rush_city_df

In [None]:
# Join the two monthly counts on month. Both frames have a 'no. of guests' column,
# so the merge suffixes produce the final per-hotel column names directly.
final_rush_df = rush_resort_df.merge(
    rush_city_df,
    on='month',
    suffixes=(' in resort', ' in city hotel'),
)
final_rush_df

In [None]:
final_rush_df = sort_data(final_rush_df, 'month')
final_rush_df

In [None]:
# line plot
px.line(final_rush_df, x='month', y=['no. of guests in resort', 'no. of guests in city hotel'], 
        title='Total number of guests per months')

## Select important features using ML (correlation matrix)

In [None]:
# Pairwise correlation of the numeric columns only. String columns cannot be
# correlated, and DataFrame.corr() raises on them in pandas >= 2.0.
data.select_dtypes(include=['number', 'bool']).corr()

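e.g. "is_canceled" and "lead_time" have a correlation of 0.29, i.e., a weak positive linear relationship: bookings with a larger lead_time tend to be cancelled somewhat more often. (A correlation coefficient measures the strength of a linear relationship; it is not a probability.)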

In [None]:
# Correlation of every numeric feature with "is_canceled", the dependent feature
# (how strongly does each variable move together with cancellations?).
# Numeric columns are selected explicitly because DataFrame.corr() raises on
# string columns in pandas >= 2.0.

correlation = data.select_dtypes(include=['number', 'bool']).corr()['is_canceled']
correlation

In [None]:
correlation.abs().sort_values(ascending=False)

In [None]:
data.groupby('is_canceled')['reservation_status'].value_counts()

- bookings that were not cancelled and whose reservation status is check-out: 75011 guests
- bookings that were cancelled and whose reservation status is canceled: 42993 guests

In [None]:
data.columns

In [None]:
# exclude numerical columns which are not going to contribute much
num_not = ['days_in_waiting_list', 'arrival_date_year']

In [None]:
# get numerical features: every non-object column that is not listed in num_not
num_features = [
    col
    for col, dtype in data.dtypes.items()
    if dtype != 'O' and col not in num_not
]
num_features

In [None]:
# exclude the categorical features that are not going to contribute much
cat_not = ['arrival_date_year', 'assigned_room_type', 'booking_changes', 'reservation_status', 'country', 'days_in_waiting_list']

In [None]:
# get categorical features: every object-typed column that is not listed in cat_not
cat_features = [
    col
    for col, dtype in data.dtypes.items()
    if dtype == 'O' and col not in cat_not
]
cat_features

## Extract derived features from data

In [None]:
# Take an explicit copy of the categorical columns. Later cells add and overwrite
# columns on data_cat; doing that on a slice of `data` triggers pandas'
# SettingWithCopyWarning, whereas a copy makes data_cat an independent frame.
data_cat = data[cat_features].copy()
data_cat.head()

In [None]:
data_cat.dtypes

In [None]:
data_cat['reservation_status_date'] = pd.to_datetime(data_cat['reservation_status_date'])

In [None]:
# derived features: split the status date into year / month / day
# and carry the target along as 'cancellation'
status_date = data_cat['reservation_status_date']
data_cat = data_cat.assign(
    year=status_date.dt.year,
    month=status_date.dt.month,
    day=status_date.dt.day,
    cancellation=data['is_canceled'],
)

In [None]:
# The derived year / month / day columns already carry this information,
# so the original date column is removed.
data_cat = data_cat.drop(columns=['reservation_status_date'])

In [None]:
data_cat.head()

## Handling Categorical features (Mean Encoding)

In [None]:
data_cat['market_segment'].unique()

In [None]:
# Columns to mean-encode: every column of data_cat that still holds strings.
# Selecting by dtype instead of the positional slice [0:8] stays correct if the
# number or order of categorical features changes; the numeric 'year', 'month',
# 'day' and 'cancellation' columns are left out.
cols = [col for col in data_cat.columns if data_cat[col].dtype == 'O']

In [None]:
# mean encoding, for example
data_cat.groupby(['hotel'])['cancellation'].mean() # it will replace the resort hotel with value '0.277674'

In [None]:
# For each categorical feature, show the value every category will be replaced by:
# the mean of 'cancellation' within that category, as a dictionary ready for mapping.

for col in cols:
    category_means = data_cat.groupby([col])['cancellation'].mean()
    print(category_means.to_dict())
    print('\n')

In [None]:
# mapping the data into dataframe
for col in cols:
    dict = data_cat.groupby([col])['cancellation'].mean().to_dict()
    data_cat[col] = data_cat[col].map(dict)

In [None]:
data_cat.head()

In [None]:
dataframe = pd.concat([data_cat, data[num_features]], axis=1)

In [None]:
dataframe.columns

In [None]:
# 'cancellation' was copied from 'is_canceled', so only one of the two is kept.
dataframe = dataframe.drop(columns=['cancellation'])
dataframe

## Handling Outliers

In [None]:
dataframe.head()

In [None]:
def handle_outlier(col):
    """Overwrite column ``col`` of the global ``dataframe`` with ``log1p`` of its values.

    The transform is applied in place on the module-level ``dataframe`` and
    nothing is returned. Calling it again on the same column applies the
    logarithm a second time.
    """
    transformed = np.log1p(dataframe[col])
    dataframe[col] = transformed

In [None]:
# for lead_time: distribution before the log transform
# (sns.histplot replaces sns.distplot, which is deprecated since seaborn 0.11)
sns.histplot(dataframe['lead_time'], kde=True, stat='density')

In [None]:
# Log-transform lead_time in place, then plot the new distribution.
# (sns.histplot replaces sns.distplot, which is deprecated since seaborn 0.11)
handle_outlier('lead_time')
sns.histplot(dataframe['lead_time'], kde=True, stat='density')

In [None]:
# for adr: distribution before the log transform
# (sns.histplot replaces sns.distplot, which is deprecated since seaborn 0.11)
sns.histplot(dataframe['adr'], kde=True, stat='density')

In [None]:
# Log-transform adr in place, then plot the new distribution.
# Missing values are dropped before plotting, as in the original call.
# (sns.histplot replaces sns.distplot, which is deprecated since seaborn 0.11)
handle_outlier('adr')
sns.histplot(dataframe['adr'].dropna(), kde=True, stat='density')

## Applying techniques of Feature Importance
To select most important features for ML model

In [None]:
dataframe.isnull().sum()

In [None]:
dataframe.dropna(inplace=True) # drop and update the dataframe

In [None]:
x = dataframe.drop('is_canceled', axis=1) # independent features
y = dataframe['is_canceled'] # dependent feature

In [None]:
x.shape

In [None]:
y.shape

In [None]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [None]:
feature_sel_model = SelectFromModel(Lasso(alpha=0.005,random_state=0))

In [None]:
feature_sel_model.fit(x,y)

In [None]:
feature_sel_model.get_support() # if a feature is going to contribute or not

In [None]:
# Names of the features the selector kept: get_support() is a boolean mask over
# x's columns, and indexing with it keeps the entries marked True.
# A separate name is used instead of reassigning `cols`, which the mean-encoding
# cells use for the categorical columns.
feature_names = x.columns
selected_features = feature_names[feature_sel_model.get_support()]

print('total features: {}'.format(x.shape[1]))
print('selected features: {}'.format(len(selected_features)))

In [None]:
# update x
x = x[selected_features]

In [None]:
x.shape

## Apply Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Hold out 25% of the rows for evaluation; random_state fixes the split so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)
logreg = LogisticRegression(solver='liblinear')

# Fit on the training portion only; X_test / y_test are used for scoring in the next cells.
logreg.fit(X_train,y_train)

In [None]:
y_pred = logreg.predict(X_test)
y_pred

In [None]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

In [None]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

## Apply Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
score = cross_val_score(logreg, x, y, cv=10)
score.mean()

## Apply multiple ML algorithms

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Candidate classifiers, each paired with the name printed next to its scores.
# The random forest and decision tree are seeded (same seed as the train/test split)
# so that repeated runs of the notebook report the same results.
models = [
    ('LogisticRegression', LogisticRegression(solver='liblinear')),
    ('Naive bayes', GaussianNB()),
    ('RandomForest', RandomForestClassifier(random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('KNN', KNeighborsClassifier()),
]

In [None]:
models

In [None]:
# Fit every candidate on the training split and report its performance on the test split.
for name, model in models:
    print(name)
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    # scikit-learn metrics expect (y_true, y_pred). With this order the confusion
    # matrix has true classes as rows and predicted classes as columns, matching
    # the logistic-regression cell above instead of being its transpose.
    print(confusion_matrix(y_test, predictions))

    print(accuracy_score(y_test, predictions))
    print('\n')