# Importing Libraries

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os

from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, auc
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Graphics in retina format are more sharp and legible
%config InlineBackend.figure_format = 'retina'

In [None]:
# Print the full path of every file found under the Kaggle input directory
input_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

In [None]:
# Load the hotel bookings CSV from the Kaggle input directory into a DataFrame
df = pd.read_csv(
    filepath_or_buffer = '/kaggle/input/hotel-booking-demand/hotel_bookings.csv'
)

In [None]:
# Lift the column display limit so every column is shown, then preview the first rows
pd.set_option('display.max_columns', None)
df.head()

In [None]:
# (rows, columns) of the raw dataset
df.shape

In [None]:
# Column dtypes and non-null counts (used below to spot missing values)
df.info()

There are 32 columns. 
* 12 Categorical
* 20 Numerical

There are 4 columns with the missing values-
1. country
2. agent
3. company
4. children

Lets try to study each features first and then we shall deal with these missing columns.

In [None]:
# Convert the date column to a real datetime dtype; otherwise it stays an object
# column, is counted among the categorical features and distorts the visualisations.
# pd.to_datetime is used because a unit-less .astype('datetime64') raises a
# TypeError on pandas >= 2.0.
df['reservation_status_date'] = pd.to_datetime(df['reservation_status_date'])

In [None]:
# Names of the categorical (object dtype) columns, used for the categorical charts
categoricals = df.select_dtypes(include = 'object').columns.tolist()
# Names of the genuinely numeric columns, used for the numerical charts.
# Selecting on 'number' (rather than "not object") keeps the datetime column
# reservation_status_date out of the numeric list.
numericals = df.select_dtypes(include = 'number').columns.tolist()


In [None]:
# List the categorical column names, one per line
print("Categorical Columns are: ", *categoricals, sep = '\n')
# Two spacer prints separate the two listings
for _spacer in range(2):
    print("\n")
# List the numerical column names, one per line
print("Numerical Columns are: ", *numericals, sep = '\n')

In [None]:
# Show the distinct values and their count for every categorical column
category_summary = "{} : {} Total nunique = {} \n"
for column_name in categoricals:
    column_values = df[column_name]
    print(category_summary.format(column_name, column_values.unique(), column_values.nunique()))

* **Given the categories, we can understand the different possible categories for each feature.**
* **Some of them are intuitive, like hotels, months and countries, while others, like meals, are not.**
* **A few things are also unclear: what do the room labels mean, what are the different distribution channels for, and what is the market segment a segment of?**
* **Who defined them and how are we getting that data?**

In [None]:
# Check the summary statistics of the numerical columns first to spot obvious discrepancies in the data
df.describe()

# Quick observations
* 37 % of the people have cancelled their booking as per the dataset.
* Avg. lead time is 104 days, around 3.5 months.
* Each booking has on an average 1.8 adults and 0.1 children. 
* Only  3% of the guests are repeated.
* Median lead time is 69 days. 

### Lets try to observe all these features and try to relate them to cancellation


In [None]:
# Preview of the colour palette used in the charts below (8 colours of "Set2").
# It is not the ideal palette for this dataset and may be changed later.

sns.color_palette("Set2", 8)

In [None]:
# Bookings per hotel, split by cancellation flag
sns.set_palette('Set2')
fig, hotel_ax = plt.subplots(figsize = (8, 6))
sns.countplot(data = df, x = 'hotel', hue = 'is_canceled', ax = hotel_ax)
hotel_ax.set_title("No. of bookings according to reservation status in both of the hotels")
plt.show()

* The city hotel has a higher cancellation rate than the resort hotel.
* Around 30% for resort hotel and greater than 50 % for city  hotel.

* Well this might have many reasons - 
1. Might be the group that is highly likely to cancel booking dont prefer resort hotel.
2. It might have strict booking cancellation policy.


In [None]:
# Number of bookings per is_canceled value within each hotel
df.groupby(['hotel'])['is_canceled'].value_counts()

In [None]:
# Mean of is_canceled per hotel, i.e. the cancellation rate of each hotel
df.groupby(['hotel'])['is_canceled'].mean()

* That's a quite high booking cancellation rate.
* It would be great if we are able to find the cause for it as well. 

# Market Segment

In [None]:
# Top: bookings per market segment; bottom: the same counts split by cancellation flag
_, ax = plt.subplots(nrows = 2, ncols = 1, figsize = (10, 10))
for panel, split_by in zip(ax, (None, 'is_canceled')):
    sns.countplot(data = df, x = 'market_segment', hue = split_by, ax = panel)
plt.show()

* The chart shows the no. of bookings by different market segment as per the reservation status.
> Online TA -> Offline TA -> Groups -> Direct -> Corporate 
* Descending order of No. of bookings.
* Online TA, Offline TA and groups tend to have high cancellation rates. 

# Customer Type

In [None]:
# Left: bookings per customer type; right: the same counts split by cancellation flag
_, ax = plt.subplots(nrows = 1, ncols = 2, figsize = (12, 5))
for panel, split_by in zip(ax, (None, 'is_canceled')):
    sns.countplot(data = df, x = 'customer_type', hue = split_by, ax = panel)
plt.show()

* The plots show the count of bookings by different customer type and their reservation status.
1. Transient > Transient-Party > Contract > Group.
> Same order for no. of bookings and cancellation rate in descending order

In [None]:
# Top: bookings per reservation status; bottom: the same counts split by cancellation flag
_, ax = plt.subplots(nrows = 2, ncols = 1, figsize = (10, 8))
for panel, split_by in zip(ax, (None, 'is_canceled')):
    sns.countplot(data = df, x = 'reservation_status', hue = split_by, ax = panel)
plt.show()

* There are very few no-shows. Most people cancel the reservation if they don't wish to come. 
* This suggests that there must be some cancellation fee or pre-paid deposit; otherwise we would see many no-show cases here.
* `reservation_status` and `is_canceled` carry the same information. When we make is_canceled the target variable, remember to remove `reservation_status` as well, because it is effectively the same column that we would have to predict.

## Cancellation %age in different years

In [None]:
# Mean of is_canceled per arrival year, i.e. the yearly cancellation rate
df.groupby(['arrival_date_year'])['is_canceled'].mean()

* Cancellation rates have remained almost consistent across all 3 years, at around 35-36%.

# Deposit Type

In [None]:
# Bookings per deposit type, split by hotel (top) and by cancellation flag (bottom)
_, ax = plt.subplots(nrows = 2, ncols = 1, figsize = (10, 8))
for panel, split_by in zip(ax, ('hotel', 'is_canceled')):
    sns.countplot(data = df, x = 'deposit_type', hue = split_by, ax = panel)
plt.show()


* Deposit type has 3 categories - No Deposit, refundable, Non Refund
* Either customers have opted for no deposit or non refundable deposits. 
* Maybe refundable deposit type is not offered by the hotels.
* All of the non refund bookings have been cancelled in our dataset. That might prove important feature based on how many such bookings are part of cancelled bookings.

* No hotel has refundable deposit type

## No. of Bookings during the year as per the cancellation status

In [None]:
# Calendar order of the month names, used to sort the x axis of the monthly charts
order = [
    'January', 'February', 'March', 'April',
    'May', 'June', 'July', 'August',
    'September', 'October', 'November', 'December',
]

In [None]:
# Top: bookings per arrival month; bottom: the same counts split by cancellation flag.
# Months are shown in calendar order via `order`.
_, ax = plt.subplots(nrows = 2, ncols = 1, figsize = (10, 8))
for panel, split_by in zip(ax, (None, 'is_canceled')):
    sns.countplot(data = df, x = 'arrival_date_month', hue = split_by, order = order, ax = panel)
plt.show()



* The cancellation rate is quite consistently high during april to october having its peak at august.

## Countries with high cancellation rate

In [None]:
# The 30 countries with the highest mean of is_canceled (cancellation rate)
country_cancel_rate = df.groupby(['country'])['is_canceled'].mean()
country_cancel_rate.sort_values(ascending = False).head(30)

* These are the top 30 countries out of 177 countries  with mean cancellation rate > 60%.

# Correlation Heat Map of features

In [None]:
# Columns included in the correlation heat map: the target first, then the features
mat = [
    'is_canceled',
    'lead_time', 'stays_in_weekend_nights', 'stays_in_week_nights',
    'adults', 'children', 'babies',
    'is_repeated_guest', 'previous_cancellations', 'previous_bookings_not_canceled',
    'booking_changes', 'days_in_waiting_list',
    'required_car_parking_spaces', 'total_of_special_requests',
]

In [None]:
# Pairwise correlation of the columns in `mat`, drawn as a heat map on a 10x10 figure
cormat = df.loc[:, mat].corr()
plt.figure(figsize = (10, 10))
sns.heatmap(data = cormat)

In [None]:
# Rank the features by the absolute value of their correlation with the target,
# strongest first.
# Only numeric columns are correlated: DataFrame.corr() raises on string columns
# in pandas >= 2.0 instead of silently skipping them.
cancel_correlation_array = df.select_dtypes(include = 'number').corr()['is_canceled']
# Remove the target's self-correlation by label rather than by position, so the
# result does not depend on how ties are ordered after sorting.
cancel_correlation_array.drop('is_canceled').abs().sort_values(ascending = False)

In [None]:
# Target plus the subset of features drawn in the histograms and the pair plot
cat = [
    'is_canceled',
    'lead_time',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'is_repeated_guest',
    'previous_bookings_not_canceled',
    'required_car_parking_spaces',
    'total_of_special_requests',
]

In [None]:
# Switch inline figures to PNG for this cell onwards
%config InlineBackend.figure_format = 'png' #loads faster
# One histogram per column listed in `cat`, drawn on a single 50x50 figure
df[cat].hist(figsize = (50,50))

# Pair plot of selected numerical features

In [None]:
# Keep inline figures in PNG format for the pair plot
%config InlineBackend.figure_format = 'png'
# Pairwise plots of the columns in `cat`, coloured by the is_canceled flag
sns.pairplot(df[cat], hue = 'is_canceled', palette = 'husl')

There is not much information/insight directly being conveyed by the pair plot.
The exception is the KDE plot of the lead time, which shows a significant difference in shape between the two classes.

Let's try to find the behaviour of the two groups.

# Box plot of numerical features vs cancellation status

In [None]:
# Features compared against the cancellation flag in the box/violin plots
plot_list = [
    'lead_time',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'is_repeated_guest',
    'previous_bookings_not_canceled',
    'required_car_parking_spaces',
    'total_of_special_requests',
]

In [None]:
# For every feature in plot_list, compare its distribution across the is_canceled
# groups with a box plot (left) and a violin plot (right).
for i in plot_list:
    _, ax = plt.subplots( nrows = 1, ncols = 2, figsize = (12,6))
    # Outliers are hidden so the boxes stay readable
    sns.boxplot(x = 'is_canceled', y = i, data = df, showfliers = False, ax = ax[0])
    # `showfliers` is a boxplot-only option: seaborn >= 0.13 forwards unknown
    # violinplot keywords to matplotlib, which rejects it, so it is not passed here.
    sns.violinplot(x = 'is_canceled', y = i, data = df, ax = ax[1])

    plt.show()


1. As expected, a few of the features contribute very little to telling whether the customer will cancel the booking.
They are - 
* stays in weekend nights
* stays in week nights
* repeated guest
* previous bookings not canceled
* required car parking spaces
2. Although the median of the special requests feature differs between the two groups, its 75th percentile is 1, which means it is zero for most guests. Hence, this would also not help much in predicting. 

3. Now, lets discuss about the lead time. 
* It is quite significantly different for both of the groups. 
* The people more likely to cancel have longer lead time for booking.

# Cancellation due to the difference in wanted and assigned room type

In [None]:
# Bookings per reserved room type, split by hotel
room_type_plot = dict(data = df, x = 'reserved_room_type', hue = 'hotel')
sns.countplot(**room_type_plot)
plt.show()

This figure illustrates that room types are common across the hotels. Both have room types with the same names. 

In [None]:
# Cancellations per reserved room type (left) and per assigned room type (right)
_, ax = plt.subplots(nrows = 1, ncols = 2, figsize = (12, 5))
for panel, room_column in zip(ax, ('reserved_room_type', 'assigned_room_type')):
    sns.countplot(data = df, x = room_column, hue = 'is_canceled', ax = panel)
plt.show()

Room types A, D and E have quite high reservation and assignment counts.
As a result, they have high cancellation counts as well.
One reason for cancellation might be the allotment of an unwanted room type.
Let's check it.

In [None]:
# Label each booking 'Wanted' when the assigned room type equals the reserved one,
# otherwise 'Unwanted'
room_type_matches = df['reserved_room_type'].eq(df['assigned_room_type'])
unwanted_room = np.where(room_type_matches, 'Wanted', 'Unwanted')

In [None]:
# Attach the Wanted/Unwanted labels to the frame as a temporary column for plotting
df['unwanted_room'] = unwanted_room

In [None]:
# Bookings with a wanted vs. unwanted room assignment, split by cancellation flag
unwanted_ax = sns.countplot(data = df, x = 'unwanted_room', hue = 'is_canceled')
unwanted_ax.set_title("No. of guests who cancelled the booking on getting room type as wanted or unwanted")
plt.show()

* Our hypothesis was that a guest is highly likely to cancel the reservation if the allotted room type is unwanted.
* But here the cancellation rate for unwanted rooms is very low. 
* We could formally test the hypothesis using p-value statistics,
* but here we shall just go with the intuition from the chart.

* Since the data doesn't follow the hypothesis, should we drop this column now? What if it's an important column?
* Well, I guess we should keep the original columns and not worry about this one, as its information is already carried by the previous 2 columns, and they contain more information than this since room types A, D and E had more cancellations.
Let's drop this column.


In [None]:
# Shape while the temporary unwanted_room column is still attached
df.shape

In [None]:
# Remove the temporary unwanted_room column again and preview the frame
df.drop(columns = 'unwanted_room', inplace = True)
df.head()

# **Its Feature Engineering Time Now**

# **Data Pre-Processing before applying ML Models**
1. Handling missing values
2. Handling non-numeric data (handling categorical, ordinal variables and strings)
3. Feature engineering and creating new features out of existing features.
4. Extracting only useful and relevant features: Feature selection
5. Remove features with missing values above a particular threshold
6. Splitting data into Training and test sets


# Handling Missing Values


In [None]:
# Number of missing values in each column
df.isna().sum()

> 1. Customers with no children may be assumed to have left the children field empty. Hence, let's replace NaN values with zero here.
> 2. People with missing values in agent id might have booked themselves, so we may replace it with 0.0
> 3.  Similarly, a NaN value in the company column signifies the customer has booked personally and not at the expense of a company, so we replace it with 0.
> 4. Finally, let's replace the missing values in country with an "unknown" category (`UKNWN`).

In [None]:
# Per-column replacement for missing values: numeric placeholders for children,
# agent and company, and a sentinel category for country
nan_replacement_dict = {
    "children": 0,
    "country": "UKNWN",
    'agent': 0.0,
    'company': 0,
}
df.fillna(value = nan_replacement_dict, inplace = True)

In [None]:
# Number of columns that still contain at least one missing value
df.isna().any().sum()

In [None]:
# Drop the columns that are not used as model inputs.
# reservation_status carries the same information as the target is_canceled, so
# keeping it (and presumably its companion reservation_status_date) would leak the target.
leakage_columns = ['reservation_status', 'reservation_status_date']
# NOTE(review): the remaining columns are not leakage; presumably they are dropped
# as redundant or hard-to-encode features - confirm the rationale.
other_unused_columns = ['arrival_date_week_number', 'stays_in_weekend_nights',
                        'arrival_date_month', 'agent']
# Re-assigning with errors='ignore' (instead of an in-place drop) keeps the cell
# safe to re-run: a second execution no longer raises KeyError for columns that
# were already removed.
df = df.drop(columns = leakage_columns + other_unused_columns, errors = 'ignore')
df.shape

In [None]:
# Columns remaining after the drop
df.columns

# Pre Processing Steps

In [None]:
# Work on a copy so the encoding steps below do not modify the main dataframe
df1 = df.copy()

In [None]:
# Manual binary encoding of the hotel variable: hotel name -> 0/1 code
hotel = {
    'Resort Hotel': 0,
    'City Hotel': 1,
}

In [None]:
# Replace the hotel names with their 0/1 codes from the `hotel` mapping
df1['hotel'] = df1['hotel'].map(hotel)

In [None]:
# One-hot encode the low-cardinality categorical columns; the first level of each
# column is dropped (drop_first) to avoid a redundant dummy column
one_hot_columns = [
    'meal',
    'market_segment',
    'distribution_channel',
    'reserved_room_type',
    'assigned_room_type',
    'customer_type',
    'deposit_type',
]
df1 = pd.get_dummies(df1, columns = one_hot_columns, drop_first = True)

In [None]:
# Encoder used below to turn the country codes into integer labels
le = LabelEncoder()

In [None]:
# Replace each country value with its integer label (fitted on the full column)
df1['country'] = le.fit_transform(df1['country'])

* Label encoding the country column as it has a large number of categories (177 countries, as noted above). 
* One-hot encoding it would create a large number of columns.

In [None]:
# Preview the fully encoded frame
df1.head()

In [None]:
# (rows, columns) after encoding
df1.shape

In [None]:
# Split the encoded frame into the target (is_canceled) and the feature matrix
y = df1['is_canceled']
X = df1.drop(columns = 'is_canceled')

In [None]:
# Hold out 30% of the rows as the test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state = 40,
    test_size = 0.30,
)

In [None]:
# Shared seed for the stochastic steps that follow. It only takes effect where it
# is passed explicitly (random_state = random_state). A `global` statement is not
# needed at notebook top level, where every assignment is already module-level.
random_state = 40

In [None]:
# Candidate classification models, keyed by a short display name.
# Every estimator that accepts a seed receives the shared `random_state`, so that
# repeated runs reproduce the same scores (KNeighborsClassifier takes no seed).
model_dict = {
    'LOR_Model' : LogisticRegression(n_jobs = -1, random_state = random_state),
    'KNN_Model' : KNeighborsClassifier(),
    'RFC_Model' : RandomForestClassifier(n_jobs = -1, random_state = random_state),
    'XGB_Model' : XGBClassifier(n_jobs = -1, random_state = random_state)
}

In [None]:
def model1(algorithm, X_train, X_test, y_train, y_test):
    """Fit ``algorithm`` on the training split and report its test-set scores.

    Prints the accuracy score and the confusion matrix of the predictions
    made for ``X_test``. The predictions are also stored in the module-level
    variable ``y_pred`` so they can be inspected after the call.

    Parameters
    ----------
    algorithm : estimator whose ``fit(X, y)`` returns an object with ``predict(X)``
    X_train, y_train : training features and labels passed to ``fit``
    X_test, y_test : held-out features passed to ``predict`` and the labels
        the predictions are scored against

    Returns
    -------
    None
    """
    global y_pred
    fitted_model = algorithm.fit(X_train, y_train)
    y_pred = fitted_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)
    print('Accuracy Score: {}\n\nConfusion Matrix:\n {}'.format(accuracy, matrix))

In [None]:
# Train and score every candidate model, printing its name as a header first
for model_name in model_dict:
    candidate = model_dict[model_name]
    print("\n")
    print(model_name, "\n")
    model1(candidate, X_train, X_test, y_train, y_test)


The best model accuracy is obtained with Random Forest Classifier, that is 88.47%

# Tuning the model with the best accuracy

In [None]:
# Hyper-parameter grid explored for the random forest (2 x 3 x 2 = 12 combinations)
rf_parameters = dict(
    max_depth = [10, 13],
    n_estimators = [10, 100, 500],
    min_samples_split = [2, 5],
)

In [None]:
# Base random forest (default settings) handed to the grid search below
rf_model = RandomForestClassifier()


In [None]:
# Grid search over rf_parameters with 10-fold cross-validation, using all CPU
# cores and printing progress (verbose = 2)
grid_search_options = dict(cv = 10, n_jobs = -1, verbose = 2)
rf_cv_model = GridSearchCV(estimator = rf_model, param_grid = rf_parameters, **grid_search_options)

rf_cv_model.fit(X_train, y_train)

In [None]:
# Report the parameter combination selected by the grid search
print('Best parameters: ' + str(rf_cv_model.best_params_))

In [None]:
# Build the tuned forest from the parameters the grid search actually selected,
# rather than hard-coding them, so this cell stays in sync with rf_cv_model
# if the grid or the data changes. The shared seed makes the score repeatable.
rf_tuned = RandomForestClassifier(random_state = random_state,
                                  **rf_cv_model.best_params_)

print('Model: Random Forest Tuned\n')
model1(rf_tuned, X_train, X_test, y_train, y_test)

* The tuned Random Forest model gives worse accuracy than the default one. 
* In the default model there is no limit on max depth. 
* Increasing max depth gives us better accuracy scores but may decrease generalization.

# Conclusion


## Feature Importance

In [None]:
# Default random forest used only to read off feature importances
rf = RandomForestClassifier()

In [None]:
# Fit on the training split; note this re-binds rf_model (previously the grid-search base estimator)
rf_model = rf.fit(X_train, y_train)

In [None]:
# Feature importances of the fitted forest as percentages, indexed by feature name
importance_percent = rf_model.feature_importances_ * 100
imp_features = pd.DataFrame({"Importances": importance_percent}, index = X_train.columns)


In [None]:
# Horizontal bar chart of the 15 most important features
top_features = imp_features.sort_values("Importances", ascending = False).head(15)
top_features.plot.barh()
plt.xlabel("Feature Importances (%)")

# Summary
* Random Forest Classifier is the best of the algorithms tried for predicting the cancellation. 
* It gives nearly 88% accurate predictions.
* Having noted the important features and their statistics for both groups, a general prediction can also be made regarding the cancellation.
* This analysis can help us out in this way.