### Importing required libraries

In [None]:
!pip install wordcloud

In [None]:
# import 'Numpy'
import numpy as np

# import 'Pandas' 
import pandas as pd

# import subpackage of Matplotlib
import matplotlib.pyplot as plt

# import color package from matplotlib
from matplotlib.colors import ListedColormap

# import 'Seaborn'
import seaborn as sns

# to suppress warnings
from warnings import filterwarnings
filterwarnings('ignore')

# import regular expression
import re

# import ast
import ast

# display all columns of the dataframe
pd.options.display.max_columns = None

# import label encoder , ordinal encoder , onehot encoder 
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder,OneHotEncoder

# import stats for performing statistical tests
import scipy.stats as stats

# import train-test split
from sklearn.model_selection import train_test_split

# import PowerTransformer
from sklearn.preprocessing import PowerTransformer

# import SMOTE to create synthetic data
from imblearn.over_sampling import SMOTE

# import various functions from sklearn
from sklearn.metrics import accuracy_score,roc_curve,roc_auc_score,classification_report,confusion_matrix,f1_score
from sklearn.model_selection import KFold,GridSearchCV

# import Linear Regression
from sklearn.linear_model import LinearRegression

# import statsmodels
import statsmodels.api as sma

# import Logistic Regression
from sklearn.linear_model import LogisticRegression

# import DecisionTree Classifier
from sklearn.tree import DecisionTreeClassifier

# import tree to visualize DecisiontTree
from sklearn import tree

# import RandomForest Classifier
from sklearn.ensemble import RandomForestClassifier

# import AdaBoost Classifier
from sklearn.ensemble import AdaBoostClassifier

# import GradientBoosting Classifier
from sklearn.ensemble import GradientBoostingClassifier

# import XtremeGradientBoost Classifier
from xgboost import XGBClassifier

# import wordlcoud, stopwords
from wordcloud import WordCloud,STOPWORDS


In [None]:
# Default canvas for the plots that follow: [width, height] in inches.
# Values written to rcParams stay in effect until they are overwritten again.
plt.rcParams.update({'figure.figsize': [8,8]})

# Custom palette shared by the plots in this notebook.
colors = [
    '#97C1A9', '#DCDCDC', '#AFC197', '#9AC197',
    '#97C1BE', '#97C1A2', '#C1979A', '#A997C1',
    '#77AE8F', '#DCDCDC', '#67A481', '#C197AF',
]

### Reading the dataset and viewing the first 10 rows of it

In [None]:
# Show at most 20 columns when a DataFrame is displayed
# (this replaces the unlimited setting made in the import cell).
pd.options.display.max_columns = 20

In [None]:
# Read the bookings file from the working directory and preview its first ten rows.
df_hotel = pd.read_csv(filepath_or_buffer = 'Hotel Reservations.csv')
df_hotel.head(n = 10)

### Checking the shape/dimension of the dataset

In [None]:
# shape is (number of rows, number of columns).
row_count, column_count = df_hotel.shape
print(f'The dataset has {row_count} rows and {column_count} columns')

In [None]:
# ndim is the number of axes of the DataFrame.
print(f'Dimension of the dataset is {df_hotel.ndim}')

### Checking the datatype, number of non null values and name of each variable in the dataset

In [None]:
# Print every column's name, non-null count and dtype.
df_hotel.info()

### Checking for the missing values. Displaying number of missing values per column

In [None]:
# Null count per column, computed once and reused for the percentage column.
null_counts = df_hotel.isnull().sum()

# One row per column of df_hotel: its name, absolute null count and null share in percent.
missing_values = (
    pd.DataFrame({'No of missing values': null_counts,
                  '% of missing values': (null_counts/df_hotel.shape[0])*100})
    .reset_index()
    .rename(columns = {'index':'Name'})
)

missing_values

From above dataframe we can see that there are no missing values present in the dataset

### Dropping of irrelevant columns from the dataset

The variable Booking_ID is a unique identifier of each booking. This variable can be dropped, as it is unlikely to contribute to the prediction of booking status and might introduce noise into the model.

In [None]:
# Drop the identifier column. Rebinding the result (instead of inplace=True) and
# ignoring an already-missing column keeps this cell safe to re-run.
df_hotel = df_hotel.drop(columns = 'Booking_ID', errors = 'ignore')

### Checking for the descriptive statistics of the dataset

In [None]:
# Summary of the object-dtype columns only, transposed so each column becomes a row.
df_hotel.describe(include = 'object').T

In [None]:
# (rows, columns) of the frame after Booking_ID was dropped.
df_hotel.shape

In [None]:
# Percentage share of each booking_status class, rounded to whole numbers.
round(df_hotel['booking_status'].value_counts(normalize=True)*100)

### Checking for the summary statistics of the dataset

In [None]:
# Summary statistics (describe() defaults), transposed so each column becomes a row.
df_hotel.describe().T

### Univariate Analysis

#### Type_of_meal_plan

This variable explains the type of meal plan booked by the customer.

In [None]:
# Shrink the default figure size to 5 x 5 inches for the plots that follow.
plt.rcParams['figure.figsize'] = [5,5]

# Distinct meal-plan labels present in the data.
df_hotel['type_of_meal_plan'].unique()

In [None]:
# Bookings per meal plan, sorted by label. The same Series drives both the bar
# order and the count labels, so the two cannot drift apart.
meal_plan_counts = df_hotel['type_of_meal_plan'].value_counts().sort_index()

# Pass the column as keyword x: newer seaborn releases treat the first positional
# argument as `data`, not as the x variable.
sns.countplot(x = df_hotel['type_of_meal_plan'], order = meal_plan_counts.index.tolist(), palette = colors)
plt.title('Frequency of Type of Meal Plan')

# Write each count slightly above its bar.
for i,v in enumerate(meal_plan_counts):
    plt.text( x = i , y = v + 100, s = v, ha = 'center')

plt.savefig("Frequency of Type of Meal Plan.jpeg")

Inference:

From the above countplot it is clearly evident that Meal Plan 1 is preferred by most of the customers. It is followed by Not Selected and Meal Plan 2, with counts of 5130 and 3305 respectively. Only a few customers prefer Meal Plan 3.

#### Room_type_reserved

It explains the type of room reserved by the customer.

In [None]:
# Bookings per room type, sorted by label; reused for bar order and count labels.
room_type_counts = df_hotel['room_type_reserved'].value_counts().sort_index()

# Keyword x: newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x = df_hotel['room_type_reserved'], order = room_type_counts.index.tolist(), palette= colors)
plt.title('Frequency of Room Type Reserved')
plt.xticks(rotation = 90)

# Write each count slightly above its bar.
for i,v in enumerate(room_type_counts):
    plt.text( x = i , y = v + 300, s = v, ha = 'center')

plt.savefig("Frequency of Room Type Reserved.jpeg")

In [None]:
# Mean avg_price_per_room for each reserved room type.
df_hotel.groupby(["room_type_reserved"])["avg_price_per_room"].mean()

#### Market_segment_type

The "Market Segment" refers to the designation or classification of the source of the booking made by the guest. It is a categorical variable that provides information about the type of customer or entity that made the reservation.

In [None]:
# Bookings per market segment, sorted by label; reused for bar order and count labels.
segment_counts = df_hotel['market_segment_type'].value_counts().sort_index()

# Keyword x: newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x = df_hotel['market_segment_type'], order = segment_counts.index.tolist(), palette= colors)
plt.title('Count of Market Segment Type')
plt.xticks(rotation = 45)

# Write each count slightly above its bar.
for i,v in enumerate(segment_counts):
    plt.text( x = i , y = v + 200, s = v, ha = 'center')

plt.savefig('Count of Market Segment Type.jpeg')

Inference: 

From the above plot it is clearly evident that more customers reserved rooms through online modes than through any other mode of reservation.
The industry has been driven by online bookings of late, so the cancellation of online bookings deserves closer attention.

#### No_of_adults

The "number of adults" refers to the number of guests over the age of 18 who will be occupying a hotel room

In [None]:
# Distinct values of no_of_adults.
df_hotel['no_of_adults'].unique()

In [None]:
# Bookings per adult count, sorted by value; reused for bar order and count labels.
adult_counts = df_hotel['no_of_adults'].value_counts().sort_index()

# Keyword x: newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x = df_hotel['no_of_adults'], order = adult_counts.index.tolist(), palette = colors)
plt.title('Breakdown of No of Adults')

# Write each count slightly above its bar.
for i,v in enumerate(adult_counts):
    plt.text( x = i , y = v + 200, s = v, ha = 'center')

plt.savefig("Breakdown of No of Adults.jpeg")

#### No_of_children

The "number of children" refers to the number of guests below the age of 18 who will be occupying a hotel room

In [None]:
# Bookings per child count, sorted by value; reused for bar order and count labels.
children_counts = df_hotel['no_of_children'].value_counts().sort_index()

# Keyword x: newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x = df_hotel['no_of_children'], order = children_counts.index.tolist(), palette = colors)
plt.title('Breakdown of No of Children')

# Write each count slightly above its bar.
for i,v in enumerate(children_counts):
    plt.text( x = i , y = v + 350, s = v, ha = 'center')

plt.savefig("Breakdown of No of Children.jpeg")

#### No_of_weekend_nights

The "no_of_weekend_nights" variable refers to the number of weekend nights (i.e., Saturday, or Sunday nights) that a guest will be staying at the hotel as part of their reservation. 

In [None]:
# Bookings per number of weekend nights, sorted by value; reused for bar order and labels.
weekend_night_counts = df_hotel['no_of_weekend_nights'].value_counts().sort_index()

# Keyword x: newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x = df_hotel['no_of_weekend_nights'], order = weekend_night_counts.index.tolist(), palette = colors)
plt.title('Frequency of No of Weekend Nights')

# Write each count slightly above its bar.
for i,v in enumerate(weekend_night_counts):
    plt.text( x = i , y = v + 350, s = v, ha = 'center')

plt.savefig("Frequency of No of Weekend Nights.jpeg")

#### No_of_week_nights

It is the number of weekday nights (i.e., Monday to Friday nights) that a guest will be staying at the hotel as part of their reservation.

In [None]:
# Bookings per number of week nights, sorted by value; reused for bar order and labels.
week_night_counts = df_hotel['no_of_week_nights'].value_counts().sort_index()

# Keyword x: newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x = df_hotel['no_of_week_nights'], order = week_night_counts.index.tolist(), palette = colors)
plt.title('Investigating the Frequency of No of Week Nights')

# Write each count slightly above its bar.
for i,v in enumerate(week_night_counts):
    plt.text( x = i , y = v + 150, s = v, ha = 'center')

plt.savefig("Investigating the Frequency of No of Week Nights.jpeg")

#### Lead_time

It is the amount of time between when a guest makes a reservation and when their planned arrival date at the hotel is. It is a measure of how much advance notice the hotel has before the guest arrives.

In [None]:
plt.rcParams["figure.figsize"] = [8,5]

# Plot the lead_time column itself so the figure matches its title and file name.
# lead_time is handled as a numeric column elsewhere in this notebook, so it is
# shown as a binned histogram rather than one bar per distinct value.
plt.hist(df_hotel['lead_time'], bins = 30, color = colors[0], edgecolor = 'white')
plt.title('Investigating the Frequency of Lead Time')
plt.xlabel('lead_time')
plt.ylabel('count')

plt.savefig("Investigating the Frequency of Lead Time.jpeg")

#### Arrival_year

"Arrival year" refers to the year in which a guest is scheduled to arrive for their reservation at a hotel. 

In [None]:
# Bookings per arrival year; the index supplies the wedge labels.
year_counts = df_hotel['arrival_year'].value_counts()

# explode lists one offset per wedge, so this call expects exactly two years.
plt.pie(year_counts, labels = year_counts.index, autopct = '%.2f%%',
        shadow = True, explode = [0.1,0.05], colors = colors)
plt.title('Proportions of Bookings Yearly')

plt.savefig("Proportions of Bookings Yearly.jpeg")
plt.show()

Inference: 
The given dataset is dominated by 2018 data; in other words, bookings increased sharply in the year 2018.

#### Arrival_month

"Arrival month" refers to the month in which a guest is scheduled to arrive for their reservation at a hotel.

In [None]:
# Bookings per arrival month; the index supplies the wedge labels.
month_counts = df_hotel['arrival_month'].value_counts()

plt.pie(month_counts, labels = month_counts.index, autopct = '%.1f%%',
        shadow = True, colors = colors)
plt.title('Proportions of monthly bookings')

plt.savefig('Proportions of monthly bookings.jpeg')
plt.show()

Inference: 
Months 8, 9 and 10 (August, September and October) have the highest number of bookings. So, from the financial-year perspective, the 2nd and 3rd quarters have the highest footfall / bookings.
There is a scope for feature engineering  for this column, to bin the months quarter wise or bin it season wise to establish better pattern with the target variable


#### Repeated_guest

This variable explains whether a guest who made a hotel reservation has stayed at the same hotel before.

In [None]:
plt.rcParams["figure.figsize"] = [8,5]

# Bookings per repeated_guest value, sorted; reused for bar order and count labels.
repeated_guest_counts = df_hotel['repeated_guest'].value_counts().sort_index()

# Keyword x: newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x = df_hotel['repeated_guest'], order = repeated_guest_counts.index.tolist(), palette = colors)
plt.title('Frequency of Repeated Guest')

# Write each count slightly above its bar.
for i,v in enumerate(repeated_guest_counts):
    plt.text( x = i , y = v + 150, s = v, ha = 'center')

plt.savefig("Frequency of Repeated Guest.jpeg")

#### No_of_previous_cancellations

"No of previous cancellations" refers to the number of times a guest has previously cancelled a reservation at the hotel

In [None]:
# Bookings per number of previous cancellations, sorted; reused for bar order and labels.
previous_cancel_counts = df_hotel['no_of_previous_cancellations'].value_counts().sort_index()

# Keyword x: newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x = df_hotel['no_of_previous_cancellations'],
              order = previous_cancel_counts.index.tolist(), palette = colors)
plt.title('Breakdown of No of Previous Cancellations')

# Write each count slightly above its bar.
for i,v in enumerate(previous_cancel_counts):
    plt.text( x = i , y = v + 150, s = v, ha = 'center')

plt.savefig("Breakdown of No of Previous Cancellations.jpeg")

#### Avg_price_per_room

Average price per day of the reservation; prices of the rooms are dynamic. (in Dollars)

In [None]:
# Distribution of the average room price.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# histplot/displot once the installed seaborn version is confirmed.
sns.distplot(df_hotel['avg_price_per_room'], color = '#97C1A9')
plt.title('Understanding the distribution of Average Price Per Room')
plt.show()

#### No_of_special_requests

The "Number of special requests" in the Hotel Reservations dataset refers to the number of additional services or special requests made by a guest when booking a hotel reservation. 

In [None]:
# Bookings per number of special requests, sorted; reused for bar order and count labels.
special_request_counts = df_hotel['no_of_special_requests'].value_counts().sort_index()

# Keyword x: newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x = df_hotel['no_of_special_requests'],
              order = special_request_counts.index.tolist(), palette = colors)
plt.title('Breakdown of No of Special Requests')

# Write each count slightly above its bar.
for i,v in enumerate(special_request_counts):
    plt.text( x = i , y = v + 150, s = v, ha = 'center')

plt.savefig('Breakdown of No of Special Requests.jpeg')

In [None]:
# Percentage share of each no_of_special_requests value, rounded to whole numbers.
round(df_hotel["no_of_special_requests"].value_counts(normalize= True)*100)

#### Booking_status

Flag indicating if the booking was canceled or not. This is the target variable

In [None]:
# Bookings per target class; the index supplies the wedge labels.
status_counts = df_hotel['booking_status'].value_counts()

# explode lists one offset per wedge, so this call expects exactly two classes.
plt.pie(status_counts, labels = status_counts.index, autopct = '%.2f%%',
        shadow = True, explode = [0.05,0.025], colors = colors)
plt.title('Distribution of Booking status')
plt.savefig('Distribution of Booking status.jpeg')
plt.show()


From the above distribution it is clearly evident that there is class imbalance in the target variable

### Bivariate Analysis

#### Avg_price_per_room vs Booking_status

In [None]:
# Mean room price per booking status (groupby sorts the classes by label).
price_by_status = round(df_hotel.groupby('booking_status')['avg_price_per_room'].mean(),2)

# Pin the bar order to the groupby order so each label below is drawn on its own
# bar, without relying on how seaborn realigns a re-sorted x Series.
sns.barplot(x = df_hotel['booking_status'], y = df_hotel['avg_price_per_room'],
            order = price_by_status.index.tolist(), palette= colors)
plt.title('Examining the Variability of Average Price Per Room in Booking Status')

# Write each mean slightly above its bar.
for i,v in enumerate(price_by_status):
    plt.text(x = i , y = v + 2, s = v , ha = 'center')

plt.savefig("Examining the Variability of Average Price Per Room in Booking Status.jpeg")

#### No_of_week_nights vs Booking_status

In [None]:
# Mean number of week nights per booking status (groupby sorts the classes by label).
week_nights_by_status = round(df_hotel.groupby('booking_status')['no_of_week_nights'].mean(),2)

# Pin the bar order to the groupby order so each label below is drawn on its own bar.
sns.barplot(x = df_hotel['booking_status'], y = df_hotel['no_of_week_nights'],
            order = week_nights_by_status.index.tolist(), palette = colors)
plt.title('Examining the Variability of No of Week Nights in Booking Status')

# Write each mean at the top of its bar.
for i,v in enumerate(week_nights_by_status):
    plt.text(x = i , y = v , s = v , ha = 'center')

plt.savefig('Examining the Variability of No of Week Nights in Booking Status.jpeg')

Inference: 
This plot can help us visualize how booking status varies with the length of the stay. For example, we might find that bookings with longer weeknight stays are more likely to result in a cancellation

#### No_of_adults vs Booking_status

In [None]:
# Mean number of adults per booking status, kept to two decimals like the other
# bar labels in this notebook (rounding to a whole number hides the difference
# between the classes).
adults_by_status = round(df_hotel.groupby('booking_status')['no_of_adults'].mean(),2)

# Pin the bar order to the groupby order so each label below is drawn on its own bar.
sns.barplot(x = df_hotel['booking_status'], y = df_hotel['no_of_adults'],
            order = adults_by_status.index.tolist(), palette = colors)
plt.title('Examining the Variability of No of Adults in Booking Status')

# Write each mean inside its bar, just below the top.
for i,v in enumerate(adults_by_status):
    plt.text(x = i , y = v - 0.5, s = v , ha = 'center')

plt.savefig('Examining the Variability of No of Adults in Booking Status.jpeg')

#### Type_of_meal_plan vs Booking_status

In [None]:
# Meal-plan counts split by booking outcome.
sns.countplot(data = df_hotel, x = 'type_of_meal_plan', hue = 'booking_status', palette = colors)
plt.title('Describing the Count of Type of Meal Plan by Booking Status')
plt.savefig('Describing the Count of Type of Meal Plan by Booking Status.jpeg')
plt.show()

#### Repeated_guest vs Booking_status

In [None]:
# Repeated-guest counts split by booking outcome.
sns.countplot(data = df_hotel, x = 'repeated_guest', hue = 'booking_status', palette = colors)
plt.title('Describing the Count of Repeated Guests by Booking Status')
plt.savefig("Describing the Count of Repeated Guests by Booking Status.jpeg")
plt.show()

#### No_of_weekend_nights vs Booking_status

In [None]:
# Mean number of weekend nights per booking status, to two decimals like the other
# bar labels (rounding to a whole number hides the difference between the classes).
weekend_nights_by_status = round(df_hotel.groupby('booking_status')['no_of_weekend_nights'].mean(),2)

# Without an explicit order the bars follow the order in which the classes first
# appear in the data, while the labels follow the sorted groupby order. Pin the bar
# order so each label is drawn on its own bar.
sns.barplot(x = df_hotel['booking_status'] , y = df_hotel['no_of_weekend_nights'],
            order = weekend_nights_by_status.index.tolist(), palette = colors)
plt.title('Examining the Variability of No of Weekend Nights in Booking Status')

# Write each mean inside its bar, just below the top.
for i,v in enumerate(weekend_nights_by_status):
    plt.text(x = i , y = v - 0.2, s = v , ha = 'center')

plt.savefig("Examining the Variability of No of Weekend Nights in Booking Status.jpeg")

#### Room_type_reserved vs Booking_status

In [None]:
# Room-type counts split by booking outcome.
sns.countplot(data = df_hotel, x = 'room_type_reserved', hue = 'booking_status', palette = colors)
plt.title('Describing the Count of Room Type Reserved by Booking Status')
plt.savefig("Describing the Count of Room Type Reserved by Booking Status.jpeg")
plt.show()

#### Required_car_parking_space vs Avg_price_per_room

In [None]:
# Mean room price for each required_car_parking_space value, used for the bar labels.
parking_price = round(df_hotel.groupby('required_car_parking_space')['avg_price_per_room'].mean(),2)

sns.barplot(data = df_hotel, x = 'required_car_parking_space', y = 'avg_price_per_room', palette = colors)
plt.title('Examining the Variability of Average Price Per Room in Required Car Parking Space')

# Write each mean slightly above its bar.
for position, mean_price in enumerate(parking_price):
    plt.text(x = position , y = mean_price + 3, s = mean_price , ha = 'center')

plt.savefig("Examining the Variability of Average Price Per Room in Required Car Parking Space.jpeg")

#### Type_of_meal_plan vs Avg_price_per_room

In [None]:
# Mean room price per meal plan (groupby sorts the plans by label).
price_by_meal_plan = round(df_hotel.groupby('type_of_meal_plan')['avg_price_per_room'].mean(),2)

# Pin the bar order to the groupby order so each label below is drawn on its own
# bar, without relying on how seaborn realigns a re-sorted x Series.
sns.barplot(x = df_hotel['type_of_meal_plan'] , y = df_hotel['avg_price_per_room'],
            order = price_by_meal_plan.index.tolist(), palette = colors)
plt.title('Examining the Variability of Average Price Per Room in Type of Meal Plan')

# Write each mean slightly above its bar.
for i,v in enumerate(price_by_meal_plan):
    plt.text(x = i , y = v + 3, s = v , ha = 'center')

plt.savefig("Examining the Variability of Average Price Per Room in Type of Meal Plan.jpeg")

#### Market_segment_type vs Booking_status

In [None]:
# Market-segment counts split by booking outcome.
sns.countplot(data = df_hotel, x = 'market_segment_type', hue = 'booking_status', palette = colors)
plt.title('Comparison of Market Segment Type by Booking Status')
plt.savefig("Comparison of Market Segment Type by Booking Status.jpeg")
plt.show()

#### No_of_week_nights vs Avg_price_per_room

In [None]:
# Room price against the number of week nights, one point per booking.
sns.scatterplot(data = df_hotel, x = 'no_of_week_nights', y = 'avg_price_per_room')
plt.title('Visualizing the Spread of Average Price Per Room in No of Week Nights')
plt.savefig('Visualizing the Spread of Average Price Per Room in No of Week Nights.jpeg')
plt.show()

#### No_of_weekend_nights vs Avg_price_per_room

In [None]:
# Room price against the number of weekend nights, one point per booking.
sns.scatterplot(data = df_hotel, x = 'no_of_weekend_nights', y = 'avg_price_per_room')
plt.title('Visualizing the Spread of Average Price Per Room in No of Weekend Nights')
plt.savefig('Visualizing the Spread of Average Price Per Room in No of Weekend Nights.jpeg')
plt.show()

### Multivariate Analysis

#### Type_of_meal_plan vs Avg_price_per_room vs Booking_status

In [None]:
# Mean room price for every (meal plan, booking status) pair:
# rows = meal plans, columns = booking statuses, both sorted by label.
mean_price = round(df_hotel.groupby(['type_of_meal_plan','booking_status']
                                   )['avg_price_per_room'].mean(),2).unstack()

# Pin the category and hue order to the table above so bars and labels agree.
sns.barplot(x = df_hotel['type_of_meal_plan'] , y = df_hotel['avg_price_per_room'],
            hue = df_hotel['booking_status'], order = mean_price.index.tolist(),
            hue_order = mean_price.columns.tolist(), palette = colors)

plt.title('Exploring the Interactions among Type of Meal Plan , Average Price Per Room , Booking Status')

# Seaborn splits its default group width of 0.8 evenly between the hue levels, so
# the j-th bar of category i is centred at i - 0.4 + (j + 0.5) * bar_width.
bar_width = 0.8 / mean_price.shape[1]
for i, category in enumerate(mean_price.index):
    for j, status in enumerate(mean_price.columns):
        v = mean_price.loc[category, status]
        # A pair with no bookings has no bar, so it gets no label.
        if pd.notna(v):
            plt.text(x = i - 0.4 + (j + 0.5) * bar_width, y = v + 3, s = v , ha = 'center')

plt.savefig('Exploring the Interactions among Type of Meal Plan , Average Price Per Room , Booking Status.jpeg')

#### Room_type_reserved vs Avg_price_per_room vs Booking_status

In [None]:
# Mean room price for every (room type, booking status) pair:
# rows = room types, columns = booking statuses, both sorted by label.
mean_price = round(df_hotel.groupby(['room_type_reserved','booking_status']
                                   )['avg_price_per_room'].mean(),2).unstack()

# Pin the category and hue order to the table above so bars and labels agree.
sns.barplot(x = df_hotel['room_type_reserved'] , y = df_hotel['avg_price_per_room'],
            hue = df_hotel['booking_status'], order = mean_price.index.tolist(),
            hue_order = mean_price.columns.tolist(), palette = colors)

plt.title('Exploring the Interactions among Room Type Reserved , Average Price Per Room , Booking Status')

# Seaborn splits its default group width of 0.8 evenly between the hue levels, so
# the j-th bar of category i is centred at i - 0.4 + (j + 0.5) * bar_width.
bar_width = 0.8 / mean_price.shape[1]
for i, category in enumerate(mean_price.index):
    for j, status in enumerate(mean_price.columns):
        v = mean_price.loc[category, status]
        # A pair with no bookings has no bar, so it gets no label.
        if pd.notna(v):
            plt.text(x = i - 0.4 + (j + 0.5) * bar_width, y = v + 3, s = v , ha = 'center')

plt.savefig('Exploring the Interactions among Room Type Reserved , Average Price Per Room , Booking Status.jpeg')

#### Market_segment_type vs Avg_price_per_room vs Booking_status

In [None]:
# Mean room price per market segment, split by booking outcome.
sns.barplot(data = df_hotel, x = 'market_segment_type', y = 'avg_price_per_room',
            hue = 'booking_status', palette = colors)
plt.title('Examining the Predictive Power of Market Segment Type and Average Price Per Room on Booking_status')
plt.savefig('Examining the Predictive Power of Market Segment Type and Average Price Per Room on Booking_status.jpeg')
plt.show()

#### Room_type_reserved vs No_of_weekend_nights vs Booking_status

In [None]:
# Mean weekend nights for every (room type, booking status) pair:
# rows = room types, columns = booking statuses, both sorted by label.
mean_nights = round(df_hotel.groupby(['room_type_reserved','booking_status']
                                    )['no_of_weekend_nights'].mean(),2).unstack()

# Pin the category and hue order to the table above so bars and labels agree.
sns.barplot(x = df_hotel['room_type_reserved'] , y = df_hotel['no_of_weekend_nights'],
            hue = df_hotel['booking_status'], order = mean_nights.index.tolist(),
            hue_order = mean_nights.columns.tolist(), palette = colors)

plt.title('Examining the Predictive Power of Room Type Reserved and No of Weekend Nights on Booking_status')

# Seaborn splits its default group width of 0.8 evenly between the hue levels, so
# the j-th bar of category i is centred at i - 0.4 + (j + 0.5) * bar_width.
bar_width = 0.8 / mean_nights.shape[1]
for i, category in enumerate(mean_nights.index):
    for j, status in enumerate(mean_nights.columns):
        v = mean_nights.loc[category, status]
        # A pair with no bookings has no bar, so it gets no label.
        if pd.notna(v):
            plt.text(x = i - 0.4 + (j + 0.5) * bar_width, y = v + 0.05, s = v , ha = 'center')

plt.savefig('Examining the Predictive Power of Room Type Reserved and No of Weekend Nights on Booking_status.jpeg')

#### Room_type_reserved vs No_of_week_nights vs Booking_status

In [None]:
# Mean week nights for every (room type, booking status) pair:
# rows = room types, columns = booking statuses, both sorted by label.
mean_nights = round(df_hotel.groupby(['room_type_reserved','booking_status']
                                    )['no_of_week_nights'].mean(),2).unstack()

# Pin the category and hue order to the table above so bars and labels agree.
sns.barplot(x = df_hotel['room_type_reserved'] , y = df_hotel['no_of_week_nights'],
            hue = df_hotel['booking_status'], order = mean_nights.index.tolist(),
            hue_order = mean_nights.columns.tolist(), palette = colors)

plt.title('Exploring the Interactions among Room Type Reserved , No of Week Nights , Booking_status')

# Seaborn splits its default group width of 0.8 evenly between the hue levels, so
# the j-th bar of category i is centred at i - 0.4 + (j + 0.5) * bar_width.
bar_width = 0.8 / mean_nights.shape[1]
for i, category in enumerate(mean_nights.index):
    for j, status in enumerate(mean_nights.columns):
        v = mean_nights.loc[category, status]
        # A pair with no bookings has no bar, so it gets no label.
        if pd.notna(v):
            plt.text(x = i - 0.4 + (j + 0.5) * bar_width, y = v + 0.05, s = v , ha = 'center')

plt.savefig('Exploring the Interactions among Room Type Reserved , No of Week Nights , Booking_status.jpeg')

#### Repeated_guest vs No_of_weekend_nights vs Booking_status

In [None]:
# Mean weekend nights for every (repeated_guest, booking status) pair:
# rows = repeated_guest values, columns = booking statuses, both sorted.
mean_nights = round(df_hotel.groupby(['repeated_guest','booking_status']
                                    )['no_of_weekend_nights'].mean(),2).unstack()

# Pin the category and hue order to the table above so bars and labels agree.
sns.barplot(x = df_hotel['repeated_guest'] , y = df_hotel['no_of_weekend_nights'],
            hue = df_hotel['booking_status'], order = mean_nights.index.tolist(),
            hue_order = mean_nights.columns.tolist(), palette = colors)

plt.title('Exploring the Interactions among Repeated Guest , No of Weekend Nights , Booking_status')

# Seaborn splits its default group width of 0.8 evenly between the hue levels, so
# the j-th bar of category i is centred at i - 0.4 + (j + 0.5) * bar_width.
bar_width = 0.8 / mean_nights.shape[1]
for i, category in enumerate(mean_nights.index):
    for j, status in enumerate(mean_nights.columns):
        v = mean_nights.loc[category, status]
        # A pair with no bookings has no bar, so it gets no label.
        if pd.notna(v):
            plt.text(x = i - 0.4 + (j + 0.5) * bar_width, y = v + 0.05, s = v , ha = 'center')

plt.savefig('Exploring the Interactions among Repeated Guest , No of Weekend Nights , Booking_status.jpeg')

#### No_of_week_nights vs Avg_price_per_room vs Booking_status

In [None]:
# Room price against week nights, coloured by booking outcome.
sns.scatterplot(data = df_hotel, x = 'no_of_week_nights', y = 'avg_price_per_room',
                hue = 'booking_status')

plt.title('Visualizing the Spread of No of Week Nights and Average Price Per Room on Booking Status')
plt.savefig('Visualizing the Spread of No of Week Nights and Average Price Per Room on Booking Status.jpeg')
plt.show()

#### No_of_weekend_nights vs Avg_price_per_room vs Booking_status

In [None]:
# Room price against weekend nights, coloured by booking outcome.
sns.scatterplot(data = df_hotel, x = 'no_of_weekend_nights', y = 'avg_price_per_room',
                hue = 'booking_status')
plt.title('Visualizing the Spread of No of Weekend Nights and Average Price Per Room on Booking Status')
plt.savefig('Visualizing the Spread of No of Weekend Nights and Average Price Per Room on Booking Status.jpeg')
plt.show()

In [None]:
plt.rcParams["figure.figsize"] = [15,10]

# Correlate the numeric columns only: df_hotel still holds text columns here, and
# DataFrame.corr() raises on non-numeric data in pandas 2.x. Computing the matrix
# once also avoids repeating the work for the mask.
corr_matrix = df_hotel.select_dtypes(include = 'number').corr()

# np.triu keeps the diagonal and upper triangle; non-zero entries there are masked,
# leaving only the lower triangle visible.
sns.heatmap(corr_matrix, annot = True, cmap = "Blues" , mask = np.triu(corr_matrix))
plt.title('Correlation Heatmap')
plt.savefig('Correlation Heatmap.jpeg')
plt.show()

### Performing hypothesis testing to find the significant variables

Hypothesis :

H0 : There is no significant relationship between the dependent and independent variable

Ha : There is significant relationship between the dependent and independent variable

Significance level :

Considering significance level as 0.05

In [None]:
# No_of_adults is a numerical column and Booking_status is a categorical column.
# Numerical vs Categorical - f_oneway test
# f_oneway expects one numeric sample per group, so split no_of_adults by
# booking_status instead of passing the text status column as a second sample.
adults_by_status = [values for _, values in df_hotel.groupby('booking_status')['no_of_adults']]

stats.f_oneway(*adults_by_status)

since pvalue is less than significance level, reject H0. There is significant relationship between no of adults and booking status

In [None]:
# No_of_children is a numerical column and Booking_status is a categorical column.
# Numerical vs Categorical - f_oneway test
# f_oneway expects one numeric sample per group, so split no_of_children by
# booking_status instead of passing the text status column as a second sample.
children_by_status = [values for _, values in df_hotel.groupby('booking_status')['no_of_children']]

stats.f_oneway(*children_by_status)

since pvalue is less than significance level, reject H0. There is significant relationship between no_of_children and booking status

In [None]:
# No_of_weekend_nights is a numerical column and Booking_status is a categorical column.
# Numerical vs Categorical - f_oneway test
# f_oneway expects one numeric sample per group, so split no_of_weekend_nights by
# booking_status instead of passing the text status column as a second sample.
weekend_nights_by_status = [values for _, values in df_hotel.groupby('booking_status')['no_of_weekend_nights']]

stats.f_oneway(*weekend_nights_by_status)

since pvalue is less than significance level, reject H0. There is significant relationship between no_of_weekend_nights and booking status

In [None]:
# No_of_week_nights is a numerical column and Booking_status is a categorical column.
# Numerical vs Categorical - f_oneway test
# f_oneway expects one numeric sample per group, so split no_of_week_nights by
# booking_status instead of passing the text status column as a second sample.
week_nights_by_status = [values for _, values in df_hotel.groupby('booking_status')['no_of_week_nights']]

stats.f_oneway(*week_nights_by_status)

since pvalue is less than significance level, reject H0. There is significant relationship between no_of_week_nights and booking status

In [None]:
# Both Type_of_meal_plan and Booking_status are categorical, so test their
# association with chi2_contingency on the table of observed counts.
ge = pd.crosstab(index = df_hotel['type_of_meal_plan'], columns = df_hotel['booking_status'])

stats.chi2_contingency(observed = ge)

since pvalue is less than significance level, reject H0. There is significant relationship between type_of_meal_plan and booking status

In [None]:
# Both Required_car_parking_space and Booking_status are categorical, so test their
# association with chi2_contingency on the table of observed counts.
ge = pd.crosstab(index = df_hotel['required_car_parking_space'], columns = df_hotel['booking_status'])

stats.chi2_contingency(observed = ge)

since pvalue is less than significance level, reject H0. There is significant relationship between required_car_parking_space and booking status

In [None]:
# Both Room_type_reserved and Booking_status are categorical, so test their
# association with chi2_contingency on the table of observed counts.
ge = pd.crosstab(index = df_hotel['room_type_reserved'], columns = df_hotel['booking_status'])

stats.chi2_contingency(observed = ge)

since pvalue is less than significance level, reject H0. There is significant relationship between room_type_reserved and booking status

In [None]:
# Lead_time is a numerical column and Booking_status is a categorical column.
# Numerical vs Categorical - f_oneway test
# f_oneway expects one numeric sample per group, so split lead_time by
# booking_status instead of passing the text status column as a second sample.
lead_time_by_status = [values for _, values in df_hotel.groupby('booking_status')['lead_time']]

stats.f_oneway(*lead_time_by_status)

since pvalue is less than significance level, reject H0. There is significant relationship between lead_time and booking status

In [None]:
# Both Market_segment_type and Booking_status are categorical, so test their
# association with chi2_contingency on the table of observed counts.
ge = pd.crosstab(index = df_hotel['market_segment_type'], columns = df_hotel['booking_status'])

stats.chi2_contingency(observed = ge)

since pvalue is less than significance level, reject H0. There is significant relationship between market_segment_type and booking status

In [None]:
# Both Repeated_guest and Booking_status are categorical, so test their
# association with chi2_contingency on the table of observed counts.
ge = pd.crosstab(index = df_hotel['repeated_guest'], columns = df_hotel['booking_status'])

stats.chi2_contingency(observed = ge)

since pvalue is less than significance level, reject H0. There is significant relationship between repeated_guest and booking status

In [None]:
# No_of_previous_cancellations is a numerical column and Booking_status is a categorical column.
# Numerical vs Categorical - f_oneway test
# f_oneway expects one numeric sample per group, so split the column by
# booking_status instead of passing the text status column as a second sample.
previous_cancellations_by_status = [
    values for _, values in df_hotel.groupby('booking_status')['no_of_previous_cancellations']
]

stats.f_oneway(*previous_cancellations_by_status)

since pvalue is less than significance level, reject H0. There is significant relationship between no_of_previous_cancellations and booking status

In [None]:
# No_of_previous_bookings_not_canceled is a numerical column and Booking_status is a categorical column.
# Numerical vs Categorical - f_oneway test
# f_oneway expects one numeric sample per group, so split the column by
# booking_status instead of passing the text status column as a second sample.
previous_bookings_by_status = [
    values for _, values in df_hotel.groupby('booking_status')['no_of_previous_bookings_not_canceled']
]

stats.f_oneway(*previous_bookings_by_status)

since pvalue is less than significance level, reject H0. There is significant relationship between no_of_previous_bookings_not_canceled and booking status

In [None]:
# Avg_price_per_room is a numerical column and Booking_status is a categorical column.
# Numerical vs Categorical - f_oneway test
# f_oneway expects one numeric sample per group, so split avg_price_per_room by
# booking_status instead of passing the text status column as a second sample.
price_by_status = [values for _, values in df_hotel.groupby('booking_status')['avg_price_per_room']]

stats.f_oneway(*price_by_status)

since pvalue is less than significance level, reject H0. There is significant relationship between avg_price_per_room and booking status

In [None]:
# No_of_special_requests is a numerical column and Booking_status is a categorical column.
# Numerical vs Categorical - f_oneway test
# f_oneway expects one numeric sample per group, so split no_of_special_requests by
# booking_status instead of passing the text status column as a second sample.
special_requests_by_status = [
    values for _, values in df_hotel.groupby('booking_status')['no_of_special_requests']
]

stats.f_oneway(*special_requests_by_status)

since pvalue is less than significance level, reject H0. There is significant relationship between no_of_special_requests and booking status

###### Insights from statistical tests
From the statistical tests performed above, the null hypothesis was rejected for every tested column, which means none of the tested columns is insignificant to the target variable booking_status. At this stage we can't drop any variable. Later, after building a few models and checking their metrics, if the performance is considerably low we can drop some columns based on the feature_importance score.

### Splitting the dataset randomly into train and test dataset using ratio of 70:30 

In [None]:
# Remove arrival_year before building the feature matrix. errors='ignore' lets the
# cell be re-run after the column is already gone instead of raising a KeyError.
df_hotel = df_hotel.drop(columns="arrival_year", errors="ignore")

In [None]:
# Target: the booking outcome. Features: every remaining column.
y = df_hotel["booking_status"]
x = df_hotel.drop('booking_status', axis = 1)

In [None]:
# Hold out 30% of the rows as the test split. stratify=y keeps the booking_status
# class proportions the same in both splits; random_state fixes the shuffle.
xtrain , xtest , ytrain , ytest = train_test_split(x, y, test_size = 0.30, random_state = 7, stratify=y)

### Checking and treating of outliers

In [None]:
# Default figure size for the box plots that follow: 8 x 5 inches.
plt.rcParams["figure.figsize"] = [8,5]

In [None]:
# Numeric columns inspected for outliers in the box plots below.
num_cols = ['no_of_adults','no_of_children','no_of_weekend_nights','no_of_week_nights','lead_time',
            'no_of_previous_cancellations','no_of_previous_bookings_not_canceled','avg_price_per_room']
num_cols

For Train Data 

In [None]:
# 2 x 4 grid: one box plot per numeric column of the training split.
fig, axes = plt.subplots(nrows = 2, ncols = 4)
for column, axis in zip(num_cols, axes.flatten()):
    sns.boxplot(y = xtrain[column], ax = axis)

plt.tight_layout()
plt.show()

For Test Data 

In [None]:
# 2 x 4 grid: one box plot per numeric column of the test split.
fig, axes = plt.subplots(nrows = 2, ncols = 4)
for column, axis in zip(num_cols, axes.flatten()):
    sns.boxplot(y = xtest[column], ax = axis)

plt.tight_layout()
plt.show()

From the above it is clearly evident that there are outliers in both the train and test data. Removing them with the IQR method would lose data, so we proceed with a Power Transformer instead.

In [None]:
# Columns passed through the power transform below (the same eight names as num_cols).
out_cols =['no_of_adults','no_of_children','no_of_weekend_nights','no_of_week_nights','lead_time',
            'no_of_previous_cancellations','no_of_previous_bookings_not_canceled','avg_price_per_room']

In [None]:
import pickle

In [None]:
# Work on explicit copies so the column assignments below never write into a view of
# the original frame.
xtrain, xtest = xtrain.copy(), xtest.copy()

# Fit the power transform on the training data only, then apply the same fitted
# transform to the test data so that no test-set information leaks into the fit.
# PowerTransformer estimates its parameters independently for every column, so fitting
# all outlier columns in one call yields the same values as fitting them one at a time,
# while leaving `pt` as a single fitted object that covers every column (the per-column
# loop left it fitted on the last column only).
pt = PowerTransformer()

xtrain[out_cols] = pt.fit_transform(xtrain[out_cols])
xtest[out_cols] = pt.transform(xtest[out_cols])

Since the Power Transformation has a built-in scaling feature, we do not scale the data again.

### Encoding of categorical variables

#### Type_of_meal_plan

In [None]:
# with open("oe_market.pkl", "wb") as file:
#     pickle.dump(oe_market, file)

In [None]:
xtrain['type_of_meal_plan'].unique()

The type of meal plan is an ordinal categorical variable: there is a hierarchy between its subclasses. The meal plan affects the price of the room, and each plan has a different price.

In [None]:
# Mean room price per meal plan, sorted ascending (training data only).
# Note: avg_price_per_room has already been power-transformed above, so these means
# are on the transformed scale; only their relative order is meaningful here.
xtrain.groupby('type_of_meal_plan')['avg_price_per_room'].mean().sort_values()

In [None]:

# The encoded value of a meal plan is its position in this list
# ('Meal Plan 3' -> 0 ... 'Meal Plan 2' -> 3).
meal_plan_order = ['Meal Plan 3', 'Not Selected', 'Meal Plan 1', 'Meal Plan 2']
oe = OrdinalEncoder(categories=[meal_plan_order])

# Fit on the training column only, then encode train and test with the same encoder.
oe_mealplan = oe.fit(xtrain[["type_of_meal_plan"]])

for frame in (xtrain, xtest):
    frame['type_of_meal_plan'] = oe_mealplan.transform(frame[['type_of_meal_plan']])

xtrain['type_of_meal_plan'].unique()


#### Room_type_reserved

In [None]:
df_hotel['room_type_reserved'].unique()

The room type is an ordinal categorical variable: there is a hierarchy between its subclasses, because different room types have different prices.

In [None]:
xtrain.groupby('room_type_reserved')['avg_price_per_room'].mean().sort_values(ascending=True)

In [None]:
# The encoded value of a room type is its position in this list
# ('Room_Type 3' -> 0 ... 'Room_Type 6' -> 6).
room_type_order = [
    'Room_Type 3', 'Room_Type 2', 'Room_Type 1', 'Room_Type 5',
    'Room_Type 4', 'Room_Type 7', 'Room_Type 6',
]
oe = OrdinalEncoder(categories=[room_type_order])

# Fit on the training column only, then encode train and test with the same encoder.
oe_roomtype = oe.fit(xtrain[["room_type_reserved"]])

for frame in (xtrain, xtest):
    frame['room_type_reserved'] = oe_roomtype.transform(frame[['room_type_reserved']])

xtrain['room_type_reserved'].unique()

#### Market_segment_type

In [None]:
xtrain['market_segment_type'].unique()

The market segment type is an ordinal categorical variable: there is a hierarchy between its subclasses. For example, when a corporate company makes a bulk booking there may be a slight discount on the room price, whereas a customer who books online has to pay more because of internet service charges, platform charges, etc. The same reasoning applies to the other subclasses.

In [None]:
xtrain.groupby('market_segment_type')['avg_price_per_room'].mean().sort_values(ascending=False)

From the groupby above we can see that the prices differ between market segment types, so a hierarchy is followed.

In [None]:
# The encoded value of a market segment is its position in this list
# ('Complementary' -> 0 ... 'Online' -> 4).
market_segment_order = ['Complementary', 'Corporate', 'Offline', 'Aviation', 'Online']
oe = OrdinalEncoder(categories=[market_segment_order])

# Fit on the training column only, then encode train and test with the same encoder.
oe_market = oe.fit(xtrain[["market_segment_type"]])

for frame in (xtrain, xtest):
    frame['market_segment_type'] = oe_market.transform(frame[['market_segment_type']])

xtrain['market_segment_type'].unique()

#### Booking_status

In [None]:
ytrain.unique()

For the booking status variable we replace Not_Canceled with 0 and Canceled with 1.

In [None]:
# Encode the target: Not_Canceled -> 0, Canceled -> 1.
# The result is re-assigned instead of using replace(..., inplace=True), which would
# mutate a Series that came out of train_test_split. astype(int) makes the numeric
# dtype explicit rather than relying on pandas silently downcasting the replaced
# object column. Re-running the cell is harmless: the values are already 0/1.
status_codes = {'Not_Canceled': 0, 'Canceled': 1}
ytrain = ytrain.replace(status_codes).astype(int)
ytest = ytest.replace(status_codes).astype(int)

ytrain.unique()

### Checking for imbalance in target variable

In [None]:
# Share of each booking_status class in the training target.
status_counts = ytrain.value_counts()
plt.pie(
    status_counts,
    labels=status_counts.index,
    autopct='%.2f%%',
    shadow=True,
    explode=[0.05, 0.025],
    colors=colors,
)
plt.title('Investigating the Frequency of Booking Status')
plt.show()

In [None]:
ytrain.value_counts()

From the plot above it is clearly evident that there is a class imbalance in the target variable: there are 24390 instances in class 0 and 11885 instances in class 1. We can use an oversampling technique to overcome this imbalance.

### Oversampling technique using SMOTE

For train data 

In [None]:
# Balance the training classes with synthetic minority samples.
# A fixed random_state makes the generated samples, and therefore every metric
# computed downstream, reproducible across runs.
smtrain = SMOTE(random_state=7)

xtrain,ytrain = smtrain.fit_resample(xtrain,ytrain)

In [None]:
# The test set is intentionally NOT oversampled. Running SMOTE on it would add
# synthetic rows, so the reported metrics would no longer describe real, unseen
# bookings at their true class ratio. Only the training data is balanced.
ytest.value_counts(normalize=True)

In [None]:
# Checking of instances after performing over sampling

ytrain.value_counts(normalize=True)

From above value counts we can see that class imbalance is rectified

### Building a base model

We build the base model with Logistic Regression because its coefficients are easy to interpret, which gives it the highest explanatory power among the models considered.

In [None]:
model_lr = sma.Logit(ytrain,sma.add_constant(xtrain)).fit()

##### checking for summary

In [None]:
model_lr.summary()

#### Calculating various metrics to evaluate the model performance

In [None]:
# Predicted cancellation probabilities from the statsmodels Logit model. The constant
# column is added to the inputs in the same way as when the model was fitted.
xtrain_with_const = sma.add_constant(xtrain)
xtest_with_const = sma.add_constant(xtest)

pred_prob_train = model_lr.predict(xtrain_with_const)
pred_prob_test = model_lr.predict(xtest_with_const)

In [None]:
# Calculating Youden's index (J = TPR - FPR) to convert probability prediction to
# class prediction. J is a score in [0, 1]; the value that is used for classification
# is the ROC threshold at which J is largest, so that cut-off is reported as well.

# For train data

fpr , tpr , threshold = roc_curve(ytrain,pred_prob_train)

youden_index_train = tpr - fpr
yi_train = np.round(youden_index_train.max(), 2)

# probability cut-off that achieves the maximum J on the training data
best_threshold_train = threshold[np.argmax(youden_index_train)]


# For test data

fpr , tpr , threshold = roc_curve(ytest,pred_prob_test)

youden_index_test = tpr - fpr
yi_test = np.round(youden_index_test.max(), 2)

print(f'Youdens index for train data is {yi_train}')
print(f'Youdens index for test data is {yi_test}')
print(f'Probability cut-off maximising Youdens index on train data is {best_threshold_train:.2f}')

In [None]:
# Converting probability prediction to class prediction.
# The cut-off is the ROC threshold that maximises Youden's J (TPR - FPR) on the
# training data, and the same cut-off is applied to the test data. This replaces the
# hard-coded 0.59: Youden's J is a score, not a probability, so its value must not be
# used as the classification threshold itself.
_fpr, _tpr, _cutoffs = roc_curve(ytrain, pred_prob_train)
class_threshold = _cutoffs[np.argmax(_tpr - _fpr)]

# roc_curve counts a sample as positive when its score is >= the threshold
pred_train = [1 if prob >= class_threshold else 0 for prob in pred_prob_train]
pred_test = [1 if prob >= class_threshold else 0 for prob in pred_prob_test]


In [None]:
# plotting confusion matrix for train data

cm = confusion_matrix(ytrain, pred_train)
conf_matrix = pd.DataFrame(
    data=cm,
    index=['Actual : 0', 'Actual : 1'],
    columns=['Predicted : 0', 'Predicted : 1'],
)
c = ['#97C1A9']  # one-colour palette: the annotated counts carry the information

train_heatmap_options = dict(annot=True, cbar=False, fmt='d', cmap=c, linewidth=1.5,
                             annot_kws={'size': 25})
sns.heatmap(data=conf_matrix, **train_heatmap_options)

In [None]:
# plotting confusion matrix for test data

cm = confusion_matrix(ytest, pred_test)
conf_matrix = pd.DataFrame(
    data=cm,
    index=['Actual : 0', 'Actual : 1'],
    columns=['Predicted : 0', 'Predicted : 1'],
)
c = ['#97C1A9']  # one-colour palette: the annotated counts carry the information

test_heatmap_options = dict(annot=True, cbar=False, fmt='d', cmap=c, linewidth=1.5,
                            annot_kws={'size': 25})
sns.heatmap(data=conf_matrix, **test_heatmap_options)

In [None]:
# Classification report for the base model on train and test data

train_report = classification_report(ytrain, pred_train)
test_report = classification_report(ytest, pred_test)

print(f'Train report : \n{train_report}\n')
print(f'Testreport : \n{test_report}')

From the report above we can conclude that our base model performs well on both the train and the unseen data, with an accuracy of almost 80%. Going forward, we try to improve the performance by building other models, tuning their hyperparameters, and selecting columns based on the feature importance scores.

In [None]:
# Odds ratios: exponentiate the fitted Logit coefficients (rounded to 2 decimals) and
# list them from largest to smallest. The single unnamed column is labelled 0.
odds = pd.DataFrame(np.round(np.exp(model_lr.params),2))
odds.sort_values(by = 0 , ascending=False)

In [None]:
# The eight terms with the largest odds ratios, i.e. those that raise the odds of
# cancellation the most according to the Logit model.
odds.sort_values(by = 0 , ascending=False)[0:8]

In [None]:
# Classification using logistic regression: 
# scikit-learn counterpart of the statsmodels base model, fitted with default settings.

from sklearn.linear_model import LogisticRegression

model_lr1 = LogisticRegression().fit(xtrain , ytrain)

# NOTE(review): model_performance_remarks is defined in a later cell of this notebook,
# so that cell has to be run before this one. The name 'Base Model' makes the helper
# record the remark 'Base Model' instead of a fit verdict.
model_performance_remarks(model_lr1, 'Base Model')

In [None]:
# Decision Tree Model
# random_state is fixed (same seed as the train/test split) so the fitted tree and
# its metrics are reproducible across runs.

dt = DecisionTreeClassifier(random_state=7)

model_dt = dt.fit(xtrain,ytrain)

model_performance_remarks(model_dt, 'Decision Tree')

In [None]:
# Random Forest Model
# random_state is fixed (same seed as the train/test split) so the bootstrap samples
# and feature sub-sampling, and therefore the metrics, are reproducible across runs.

rf = RandomForestClassifier(random_state=7)

model_rf = rf.fit(xtrain,ytrain)

model_performance_remarks(model_rf, "Random Forest")

In [None]:
# XGBoost Model:

xg = XGBClassifier()

model_xg = xg.fit(xtrain, ytrain)

model_performance_remarks(model_xg, "XGBoost")

# Persist the fitted model. This has to come after the fit: the dump used to sit in a
# cell of its own above the training cell, where model_xg does not exist yet on a
# fresh kernel.
with open("xg.pkl", "wb") as file:
    pickle.dump(model_xg, file)

In [None]:
# Ada Boost Model
# random_state is fixed (same seed as the train/test split) so the fitted ensemble
# and its metrics are reproducible across runs.

ab = AdaBoostClassifier(random_state=7)

model_ab = ab.fit(xtrain,ytrain)

model_performance_remarks(model_ab, "AdaBoost")

In [None]:
# Grid Search CV -- RF params

params = {
    "n_estimators": range(100, 200, 25),
    "criterion": ['gini', 'entropy'],
    "max_depth": range(1, 6, 1),
    # an integer min_samples_split must be at least 2; the previous grid started at 1,
    # which made every candidate using that value fail to fit
    "min_samples_split": range(2, 11, 2),
    "min_samples_leaf": range(1, 5, 1),
    "max_features": ['sqrt', 'log2'],
}

# scoring="f1" selects the metric that ranks the candidates. It was previously passed
# as return_train_score="f1", but return_train_score is only a boolean switch for
# also recording the training scores, so the search was scored with the default metric.
cv = GridSearchCV(estimator=rf, param_grid=params, cv=3, n_jobs=-1,
                  verbose=5, scoring="f1", return_train_score=True)

cv.fit(xtrain,ytrain)

cv.best_score_

In [None]:
# GridSearchCV refits the best parameter combination on the full training data by
# default, so best_estimator_ is already fitted and needs no second fit.
model_grid = cv.best_estimator_

# Uses the same reporting helper as every other model, so the tuned forest lands in
# performance_remarks_df alongside them.
model_performance_remarks(model_grid, "GridSearchCV")

In [None]:
# pip install catboost

In [None]:
from catboost import CatBoostClassifier

In [None]:
# CatBoost model with default settings, fitted on the (oversampled) training data.
cb = CatBoostClassifier()

model_cb = cb.fit(xtrain, ytrain)

In [None]:
model_performance_remarks(model_cb, "CatBoost")

In [None]:
# User Defined Function with remarks
# NOTE(review): the model cells earlier in the notebook call model_performance_remarks,
# so this cell has to be executed before them. Move it above the first model cell so
# that Restart & Run All works.

from sklearn.metrics import accuracy_score , f1_score , cohen_kappa_score

# One row per evaluated model; re-running this cell resets the table.
performance_remarks_df = pd.DataFrame(columns = ['Name','Train_accuracy','Test_accuracy','Train_f1score',
                                        'Test_f1score','Train_Kappa','Test_Kappa','Remarks'])


def model_performance_remarks(model , name , xtrain = xtrain, xtest = xtest):
    """Evaluate a fitted classifier and append its scores to performance_remarks_df.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    name : label stored in the 'Name' column. The special value 'Base Model' is
        recorded with the remark 'Base Model' instead of a fit verdict.
    xtrain, xtest : feature sets to predict on. The defaults are bound to the global
        frames that exist when this cell is executed. The true labels are always
        taken from the global ``ytrain`` / ``ytest``.

    Side effects
    ------------
    Adds one row to the global ``performance_remarks_df`` and prints the train and
    test classification reports. Returns None.
    """

    global performance_remarks_df

    pred_train = model.predict(xtrain)
    pred_test = model.predict(xtest)

    # accuracies are expressed in percent; the remark thresholds below rely on that
    acc_train = round(accuracy_score(ytrain , pred_train),2)*100
    acc_test = round(accuracy_score(ytest , pred_test),2)*100

    f1_train = f1_score(ytrain , pred_train)
    f1_test = f1_score(ytest , pred_test)

    kappa_train = cohen_kappa_score(ytrain , pred_train)
    kappa_test = cohen_kappa_score(ytest , pred_test)

    #remarks

    def remark(train , test):
        """Classify the fit from the train/test accuracies (both in percent)."""

        if name == 'Base Model':
            return 'Base Model'

        # a train/test gap above 10 points, or a near-perfect train score, is
        # treated as over-fitting
        if abs(train - test) > 10 or train > 95:
            return 'Over Fit'
        if train < 81 or test < 78:
            return 'Under Fit'
        return 'Good Fit'

    row = {'Name': name,
           'Train_accuracy': round(acc_train,2),
           'Test_accuracy': round(acc_test,2),
           'Train_f1score': f1_train,
           'Test_f1score': f1_test,
           'Train_Kappa': kappa_train,
           'Test_Kappa': kappa_test,
           'Remarks': remark(acc_train , acc_test)}

    # DataFrame.append was removed in pandas 2.0. Adding the row by label keeps the
    # 0..n-1 index that append(..., ignore_index=True) produced.
    performance_remarks_df.loc[len(performance_remarks_df)] = [
        row[column] for column in performance_remarks_df.columns
    ]

    print('Train report \n',classification_report(ytrain , pred_train))
    print('Test report \n', classification_report(ytest , pred_test))

In [None]:
# model_performance_remarks(model_lr1, "Linear Regression")
# model_performance_remarks(model_dt, "Decision Tree")
# model_performance_remarks(model_rf, "Random forest")
# model_performance_remarks(model_xg, "XGBoost")
# model_performance_remarks(model_ab, "AdaBoost")
# # model_performance_remarks(model_grid, "Grid SearchCV")

In [None]:
performance_remarks_df

In [None]:
def highlight_row(df):
    """Return one CSS background rule per cell of a row, for ``Styler.apply(axis=1)``.

    ``df`` is a single row. Rows whose 'Remarks' value is 'Good Fit' are shaded
    pink, every other row is shaded white.
    """
    shade = 'pink' if df['Remarks'] == 'Good Fit' else 'white'
    return [f'background-color : {shade}'] * len(df)

In [None]:
performance_remarks_df.style.apply(highlight_row, axis =1)

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
X = df.drop(columns=["Booking_ID","arrival_year","booking_status"])
y = df["booking_status"]

In [None]:
df_num.columns

In [None]:
df_hotel.columns

In [None]:
df = pd.read_csv("Hotel reservations.csv")

In [None]:
# Minimum and maximum of every column, one row per column.
column_ranges = df.agg(["min", "max"])
column_ranges.T

In [None]:
# Ordinal encoders for the raw-data pipeline. Each encoded value is the category's
# position in its list.
# NOTE(review): these category orders are the exact reverse of the orders used by the
# encoders fitted on xtrain earlier in the notebook; confirm that this is intended.
meal_plan_oe = OrdinalEncoder(categories = [['Meal Plan 2','Meal Plan 1','Not Selected', 'Meal Plan 3']])

room_type_oe =OrdinalEncoder(categories = [['Room_Type 6','Room_Type 7','Room_Type 4', 'Room_Type 5',
                                  'Room_Type 1','Room_Type 2','Room_Type 3']])

segment_type_oe = OrdinalEncoder(categories = [['Online','Aviation','Offline','Corporate','Complementary']])

# Not referenced by the pipeline cell that follows; the pipeline is trained on the
# raw string target.
booking_status_oe = OrdinalEncoder(categories = [['Not_Canceled','Canceled']])

In [None]:
from sklearn.pipeline import Pipeline

# End-to-end pipeline on the raw data: preprocessing and the classifier are fitted
# together, so the fitted object accepts un-encoded input rows directly.
pt = PowerTransformer()

# fixed seed so the fitted forest and the reported metrics are reproducible
rf = RandomForestClassifier(random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        ('Power Trns', pt, df_num.columns),
        ("meal plan", meal_plan_oe, ["type_of_meal_plan"]),
        ("room type", room_type_oe, ["room_type_reserved"]),
        ("segment type", segment_type_oe, ["market_segment_type"]),
    ],
    # columns not listed above are passed to the classifier unchanged
    remainder='passthrough')

pipeline = Pipeline([("preprocessor",preprocessor), ("classifier", rf)])

# stratify=y keeps the class ratio of the imbalanced target identical in both parts,
# consistent with the stratified split used for the earlier models
x_train,x_test,y_train,y_test = train_test_split(X,y, test_size=0.2,random_state=42, stratify=y)

pipeline.fit(x_train,y_train)
y_pred = pipeline.predict(x_test)


In [None]:
# y_pred

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
# Freeze the model

import pickle #preserve all the learning params that made this model

# The XGBoost model, as before. `with` closes the file even if the dump fails.
with open("xg.pickle", "wb") as model_file:
    pickle.dump(model_xg, model_file)

# app.py loads "rf.pickle", feeds it raw un-encoded inputs and compares the result
# with the labels "Not_Canceled" / "Canceled". Only the fitted `pipeline`
# (preprocessing + random forest, trained on the string target) meets that contract,
# and no cell wrote this file before.
with open("rf.pickle", "wb") as model_file:
    pickle.dump(pipeline, model_file)

In [None]:
pip install streamlit-extras

In [None]:
xtrain.info()

In [None]:
%%writefile app.py
import streamlit as st
# from IPython.display import display, HTML
import pandas as pd 
import matplotlib.pyplot as plt 
import pickle
from sklearn.preprocessing import PowerTransformer

# st.balloons()
# st.snow()


# st.markdown(
#          f"""
#          <style>
#          .stApp {{
#              background-image: url("https://cdn.pixabay.com/photo/2019/04/24/11/27/flowers-4151900_960_720.jpg");
#              background-attachment: fixed;
#              background-size: cover
#          }}
#          </style>
#          """,
#          unsafe_allow_html=True
#      )

import base64

with open("hotel_bg.jpg", "rb") as image_file:
    encoded_string = base64.b64encode(image_file.read())
st.markdown(
f"""
<style>
.stApp {{
    background-image: url(data:image/{"png"};base64,{encoded_string.decode()});
    background-size: cover
}}
</style>
""",
unsafe_allow_html=True
)




# from PIL import Image

# image = Image.open("Hotel - BG.jpg")

# st.image(image, caption='Hotel')


st.title("Hotel Booking Cancellation Prediction")
st.markdown("Will this customer honour the booking? ")

# step 1 load the pickled model --> rb read binary

model = open("rf.pickle","rb")
clf = pickle.load(model)
model.close()

# step2 get the user input from the front end
no_of_adults= st.number_input('No of Adults',0,4,step = 1) 
no_of_children = st.slider('No of Children',0,10,1) 
no_of_weekend_nights = st.slider("No of weekend nights",0,7,1)
no_of_week_nights = st.slider('No of week nights',0,17,1)
type_of_meal_plan = st.selectbox("Select a meal plan ", ('Meal Plan 1', 'Meal Plan 2', 'Meal Plan 3', 'Not Selected'))
required_car_parking_space = st.selectbox("Parking required or not ", (0,1))
room_type_reserved = st.selectbox("Type of room type reserved ", ('Room_Type 1', 'Room_Type 2', 'Room_Type 3', 'Room_Type 4',
                       'Room_Type 5', 'Room_Type 6', 'Room_Type 7'))
lead_time = st.number_input("Lead Time" , 0,443,1)
arrival_month = st.slider("Month of arrival " , 1,12,1)
arrival_date = st.slider("Date of arrival", 1,30,1)
market_segment_type = st.selectbox("Mode of Booking ", ('Online','Aviation','Offline','Corporate','Complementary'))
repeated_guest = st.selectbox("Repeat visit(if yes --> 1, no -->0)" , (0,1))
no_of_previous_cancellations = st.slider("No of previous cancellations", 0,13,1)
no_of_previous_bookings_not_canceled = st.slider("No of successful visits" , 0,58,1)
avg_price_per_room = st.slider("Price per room" , 0, 540, 10)
no_of_special_requests = st.slider("Special requests if any" , 0,5,1)


# step3 : converting user input to model input 

data = {'no_of_adults': no_of_adults,
        'no_of_children' : no_of_children, 
        'no_of_weekend_nights' : no_of_weekend_nights, 
        'no_of_week_nights': no_of_week_nights,
        'type_of_meal_plan' : type_of_meal_plan,
       'required_car_parking_space': required_car_parking_space,
        'room_type_reserved': room_type_reserved,
        'lead_time':lead_time,
       "arrival_month": arrival_month,
       "arrival_date": arrival_date,
       "market_segment_type": market_segment_type,
       "repeated_guest": repeated_guest,
       "no_of_previous_cancellations" : no_of_previous_cancellations,
       "no_of_previous_bookings_not_canceled" : no_of_previous_bookings_not_canceled,
       "avg_price_per_room": avg_price_per_room,
       "no_of_special_requests" : no_of_special_requests}
input_data = pd.DataFrame([data])
# st.write(input_data)


prediction = clf.predict(input_data)
if st.button("Prediction"):
    if prediction == "Not_Canceled":
        st.subheader("Booking will be honoured")
    if prediction=="Canceled":
        st.subheader("Booking will be cancelled")

In [None]:
df.head(1)