In [None]:
# Setup

# common:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import matplotlib.patches as mpatches
from scipy.stats import norm
from scipy import stats
import time
import folium
import collections
import eli5 # Feature importance evaluation
import urllib
from PIL import Image

# for ML:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, average_precision_score, roc_curve, precision_recall_curve, classification_report, confusion_matrix, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, ShuffleSplit, cross_validate, cross_val_score, cross_val_predict, RandomizedSearchCV, GridSearchCV, learning_curve
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from xgboost import XGBClassifier

# Imported Libraries
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from collections import Counter

import warnings
# Suppress all warnings for the rest of the notebook.
warnings.simplefilter("ignore")

# Notebook-wide display options: seaborn grid style and
# how many DataFrame columns pandas renders.
sns.set(style="whitegrid")
pd.options.display.max_columns = 36

In [None]:
import os

# Print the full path of every file found below the Kaggle input directory.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

In [None]:
# Read the hotel bookings CSV into the working DataFrame `df`.
file_path = '/kaggle/input/hotel-booking-demand/hotel_bookings.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Render floats with two decimals from here on, then summarise the columns.
pd.options.display.float_format = "{:.2f}".format
df.describe()

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Drop the 'company' column. errors='ignore' keeps this cell re-runnable
# once the column is already gone.
df = df.drop(columns='company', errors='ignore')

# "meal" contains values "Undefined", which is equal to SC.
# Assign the result back to the column: an in-place replace on df["meal"]
# acts on a temporary object and does not update df under Copy-on-Write.
df["meal"] = df["meal"].replace("Undefined", "SC")

# Some rows contain entries with 0 adults, 0 children and 0 babies.
# I'm dropping these entries with no guests.
zero_guests = list(df.loc[df["adults"]
                   + df["children"]
                   + df["babies"] == 0].index)
# zero_guests holds index *labels*, so drop by label. Feeding them through
# df.index[...] is positional and only correct for a default RangeIndex.
df = df.drop(index=zero_guests)

In [None]:
# Columns excluded from the rest of the analysis and from the model.
dropped_columns = [
    'country',
    'arrival_date_year',
    'arrival_date_week_number',
    'arrival_date_day_of_month',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'days_in_waiting_list',
    'required_car_parking_spaces',
    'reservation_status',
]
df = df.drop(columns=dropped_columns)

In [None]:
# Every object-dtype column is treated as a categorical feature.
categorical_features = [col for col in df.columns if df[col].dtype == object]

# Print each selected column name followed by a separator line.
for col in categorical_features:
    print(f"{col}")
    print("====================================")

In [None]:
# Every non-object column is treated as a numerical feature.
numerical_features = [col for col in df.columns if df[col].dtype != object]

# Print each selected column name followed by a separator line.
for col in numerical_features:
    print(f"{col}")
    print("====================================")

In [None]:
# 'is_canceled' is the prediction target, not a feature.
# Filtering (rather than list.remove) keeps this cell re-runnable:
# remove() raises ValueError once the entry is already gone.
numerical_features = [col for col in numerical_features if col != 'is_canceled']

# EDA

### Cancellation rate

In [None]:
# Share of each target class, as a percentage of all rows.
class_share = df['is_canceled'].value_counts() / len(df) * 100
print('No Canceled', round(class_share[0], 2), '% of the dataset')
print('Canceled', round(class_share[1], 2), '% of the dataset')

In [None]:
# Bar chart of the two target classes.
ax = sns.countplot(data=df, x='is_canceled')
ax.set_title('Is_canceled Distributions \n (0: No Canceled || 1: Canceled)', fontsize=14)

In [None]:
# Overlaid histograms of every numerical feature, split by booking outcome.
plt.figure(figsize=(20, 20))

kept_bookings = df[df["is_canceled"] == 0]
canceled_bookings = df[df["is_canceled"] == 1]

for panel, feature in enumerate(numerical_features, start=1):
    plt.subplot(4, 3, panel)  # panels are laid out on a 4x3 grid
    kept_bookings[feature].hist(bins=35, color='blue', label='Not Cancelation', alpha=0.6)
    canceled_bookings[feature].hist(bins=35, color='red', label='Cancelation', alpha=0.6)
    plt.legend()
    plt.xlabel(feature)
    plt.ylabel('count')

In [None]:
# Overlaid histograms of every categorical feature, split by booking outcome.
plt.figure(figsize=(20, 20))

kept_bookings = df[df["is_canceled"] == 0]
canceled_bookings = df[df["is_canceled"] == 1]

for panel, feature in enumerate(categorical_features, start=1):
    plt.subplot(4, 3, panel)  # panels are laid out on a 4x3 grid
    kept_bookings[feature].hist(bins=35, color='blue', label='Not Cancelation', alpha=0.6)
    canceled_bookings[feature].hist(bins=35, color='red', label='Cancelation', alpha=0.6)
    plt.legend()
    plt.xlabel(feature)
    plt.ylabel('count')

### hotel vs is_canceled

In [None]:
# Bookings per hotel type, split by cancellation outcome.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='hotel', hue='is_canceled', ax=ax)
plt.show()

In [None]:
# Booking counts per hotel type: all bookings, and canceled bookings only.
hotel_data = df['hotel'].value_counts()
canceled_hotel_data = df.loc[df['is_canceled'] == 1, 'hotel'].value_counts()

# Look the counts up by label. Unpacking value_counts() positionally only
# works while "City Hotel" happens to be the most frequent value in BOTH
# series; if the order ever differs the two rates are silently swapped.
city_hotel_count = hotel_data['City Hotel']
resort_hotel_count = hotel_data['Resort Hotel']
city_hotel_canceled_count = canceled_hotel_data.get('City Hotel', 0)
resort_hotel_canceled_count = canceled_hotel_data.get('Resort Hotel', 0)

# Cancellation rate per hotel type, in percent.
percent_city_hotel_canceled = round(city_hotel_canceled_count / city_hotel_count * 100, 2)
percent_resort_hotel_canceled = round(resort_hotel_canceled_count / resort_hotel_count * 100, 2)

print(
f"""
Cancelation rate on hotel type
City hotel: {percent_city_hotel_canceled:.2f} %
Resort hotel: {percent_resort_hotel_canceled:.2f} %
"""
)


The City Hotel has a higher cancellation rate than the Resort Hotel.

It is around 30 % for the Resort Hotel and more than 40 % for the City Hotel.

In [None]:
# Non-canceled bookings per hotel type.
# .copy() makes these independent frames, so adding a column below does not
# write to a slice of df (the SettingWithCopy situation).
rh = df.loc[(df["hotel"] == "Resort Hotel") & (df["is_canceled"] == 0)].copy()
ch = df.loc[(df["hotel"] == "City Hotel") & (df["is_canceled"] == 0)].copy()

# Average daily rate divided by the number of guests on the booking.
rh["adr_pp"] = rh["adr"] / (rh["adults"] + rh["children"] + rh["babies"])
ch["adr_pp"] = ch["adr"] / (ch["adults"] + ch["children"] + ch["babies"])

print(
f"""
From all non-cnceled bookings, across all room types and meals, the average prices are:
City hotel: {ch["adr_pp"].mean():.2f} € per night and person.
Resort hotel: {rh["adr_pp"].mean():.2f} € per night and person.
"""
)

I think the average price is one of the most important reasons why City hotel has higher Cancellation rate than Resort Hotel.

In [None]:
# Price per night and person: adr divided by the number of guests.
guest_count = df["adults"] + df["children"] + df["babies"]
df["adr_pp"] = df["adr"] / guest_count

# Keep only bookings that were not canceled, ordered by room type.
full_data_guests = df.loc[df["is_canceled"] == 0]
room_prices = full_data_guests[["hotel", "reserved_room_type", "adr_pp"]].sort_values("reserved_room_type")

# One box per room type and hotel; outlier markers are hidden (fliersize=0).
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=room_prices,
            x="reserved_room_type",
            y="adr_pp",
            hue="hotel",
            hue_order=["City Hotel", "Resort Hotel"],
            fliersize=0,
            ax=ax)
ax.set_title("Price of room types per night and person", fontsize=16)
ax.set_xlabel("Room type", fontsize=16)
ax.set_ylabel("Price [EUR]", fontsize=16)
ax.legend(loc="upper right")
ax.set_ylim(0, 160)
plt.show()

This figure shows the distribution of the price per night and person for each room type: each box marks the median and the interquartile range (outliers are not drawn).
 
Note that due to data anonymization rooms with the same type letter may not necessarily be the same across hotels.

### market_segment vs is_canceled

In [None]:
# Top: bookings per market segment. Bottom: the same, split by cancellation.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(10, 8))
top_axis, bottom_axis = ax
sns.countplot(data=df, x='market_segment', ax=top_axis)
sns.countplot(data=df, x='market_segment', hue='is_canceled', ax=bottom_axis)
plt.show()

### customer_type vs is_canceled

In [None]:
# Top: bookings per customer type. Bottom: the same, split by cancellation.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(10, 8))
top_axis, bottom_axis = ax
sns.countplot(data=df, x='customer_type', ax=top_axis)
sns.countplot(data=df, x='customer_type', hue='is_canceled', ax=bottom_axis)
plt.show()

### deposit_type vs is_canceled

In [None]:
# Top: bookings per deposit type. Bottom: the same, split by cancellation.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(10, 8))
top_axis, bottom_axis = ax
sns.countplot(data=df, x='deposit_type', ax=top_axis)
sns.countplot(data=df, x='deposit_type', hue='is_canceled', ax=bottom_axis)
plt.show()

### month vs is_canceled

In [None]:
# Calendar order of the month names, used to sort the x-axis of the month plots.
order = [
    'January', 'February', 'March',
    'April', 'May', 'June',
    'July', 'August', 'September',
    'October', 'November', 'December',
]

In [None]:
# Top: bookings per arrival month. Bottom: the same, split by cancellation.
# Months are shown in calendar order.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(10, 8))
top_axis, bottom_axis = ax
sns.countplot(data=df, x='arrival_date_month', order=order, ax=top_axis)
sns.countplot(data=df, x='arrival_date_month', hue='is_canceled', order=order, ax=bottom_axis)
plt.show()

### lead_time vs is_canceled

In [None]:
# Left: box plot of lead_time per outcome. Right: violin plot of the same.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 8))
left_axis, right_axis = ax
sns.boxplot(data=df, x='is_canceled', y='lead_time', ax=left_axis)
sns.violinplot(data=df, x='is_canceled', y='lead_time', hue='is_canceled', ax=right_axis)
plt.show()

In [None]:
# Make sure we use the subsample in our correlation
fig = plt.figure(figsize=(15, 12))

palette = sns.diverging_palette(20, 220, n=256)

corr = df.corr(method='pearson')
sns.heatmap(corr, cmap=palette, vmax=.3, center=0, square=True, linewidths=.5, annot_kws={"size":15}, cbar_kws={'shrink': .5})
plt.title("Imbalanced Correlation Matrix \n (don't use for reference)")

plt.show()

In [None]:
# Correlation of every numeric column with the target.
# select_dtypes avoids the error DataFrame.corr() raises on object columns
# in pandas >= 2.0.
cancel_corr = df.select_dtypes(include="number").corr()["is_canceled"]

# Rank by absolute correlation. The target's correlation with itself is
# removed by label rather than by assuming it sorts into first position.
cancel_corr.drop("is_canceled").abs().sort_values(ascending=False)

# ML

In [None]:
# Feature matrix (numerical columns first, then categorical) and target vector.
features = numerical_features + categorical_features
X = df[features]
Y = df['is_canceled']

# Hold out 30 % of the rows as the test set.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

# Numerical columns: fill missing values with a constant, then standardise.
# No fill_value is passed, so the imputer's default constant is used.
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant")),
    ('scaler', StandardScaler()),
])

# Categorical columns: missing values become "Unknown", then one-hot encode.
# Categories unseen during fit are ignored at transform time.
cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
    ("onehot", OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ("num", num_transformer, numerical_features),
    ("cat", cat_transformer, categorical_features),
])

In [None]:
# Candidate classifiers, each with its library-default hyperparameters.
base_models = {
    'LOR_model': LogisticRegression(),
    'KNC_model': KNeighborsClassifier(),
    'SVM_model': SVC(),
    'DTR_model': DecisionTreeClassifier(),
    'RFC_model': RandomForestClassifier(),
    'ETC_model': ExtraTreesClassifier(),
    'BAG_model': BaggingClassifier(),
    'MLP_model': MLPClassifier(),
    'XGB_model': XGBClassifier(),
}

# Maps model name -> array of per-fold accuracies.
model_score = {}

# 4 shuffled folds: every round trains on 75 % and validates on 25 % of X_train.
kfolds = 4
split = KFold(n_splits=kfolds, shuffle=True, random_state=42)

# Cross-validate each candidate behind the shared preprocessing step.
for name, model in base_models.items():
    model_steps = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    cv_results = cross_val_score(
        model_steps,
        X_train,
        Y_train,
        scoring="accuracy",
        cv=split,
        n_jobs=-1,
    )
    model_score[name] = cv_results

    # Summarise the fold scores for this model.
    mean_score = round(np.mean(cv_results), 4)
    std_dev = round(np.std(cv_results), 4)
    min_score = round(cv_results.min(), 4)
    max_score = round(cv_results.max(), 4)
    print(f"{name} cross validation accuracy score: {mean_score} +/- {std_dev} (std) min: {min_score}, max: {max_score}")

In [None]:
# Mean cross-validation accuracy per model, with a +/- one standard
# deviation band around it.
figure, ax = plt.subplots(figsize=(15, 12))

model_name = list(model_score)
mean_score = [round(np.mean(score), 4) for score in model_score.values()]
std_score = [round(np.std(score), 4) for score in model_score.values()]
lower_mean_score = [mean - std for mean, std in zip(mean_score, std_score)]
upper_mean_score = [mean + std for mean, std in zip(mean_score, std_score)]

# Both artists get a label; without labels the legend() call below has
# nothing to show and draws an empty box.
ax.plot(model_name, mean_score, 'o-', label='mean CV accuracy')
ax.fill_between(model_name, lower_mean_score, upper_mean_score, alpha=0.1, label='+/- 1 std')
ax.set_title("Score Curve", fontsize=14)
ax.set_xlabel('model name')
ax.set_ylabel('Score')
ax.grid(True)
ax.legend(loc="best")
plt.show()

In [None]:
# Random forest behind the shared preprocessing step.
model = RandomForestClassifier(n_jobs=-1, random_state=42)
model_steps = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Fit on the training split; the fitted steps are inspected in later cells.
model_steps.fit(X_train, Y_train)

# Compare predictions on the held-out split with the true labels.
Y_pred = model_steps.predict(X_test)
ActVPred = pd.DataFrame({'Actual': Y_test}).assign(Predicted=Y_pred)
print(ActVPred)

labels = ['No Canceled', 'Canceled']
report = classification_report(Y_test, Y_pred, target_names=labels)
print(report)

In [None]:
# Names of all (encoded) features are needed.
# Fetch the fitted one-hot encoder from inside the pipeline:
onehot_encoder = (model_steps.named_steps['preprocessor']
                  .named_transformers_['cat']
                  .named_steps['onehot'])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
if hasattr(onehot_encoder, "get_feature_names_out"):
    onehot_columns = list(onehot_encoder.get_feature_names_out(categorical_features))
else:
    onehot_columns = list(onehot_encoder.get_feature_names(input_features=categorical_features))

# Add num_features for full list.
# Order must be as in definition of X, where num_features are first:
feat_imp_list = numerical_features + onehot_columns

# show 10 most important features, provide names of features:
feat_imp_df = eli5.formatters.as_dataframe.explain_weights_df(
    model_steps.named_steps['model'],
    feature_names=feat_imp_list)
feat_imp_df.head(10)

### Looking at the three most important features:  
- lead_time  
- deposit_type  
- adr

### lead_time:

In [None]:
# Summary statistics of is_canceled for every distinct lead_time value.
lead_cancel_data = df.groupby("lead_time")["is_canceled"].describe()

# Keep only lead times backed by at least 10 bookings.
has_enough_bookings = lead_cancel_data["count"] >= 10
lead_cancel_data_10 = lead_cancel_data.loc[has_enough_bookings]

# Cancellation rate in percent against lead time, with a fitted regression line.
cancel_percent = lead_cancel_data_10["mean"].values * 100
fig, ax = plt.subplots(figsize=(12, 8))
sns.regplot(x=lead_cancel_data_10.index, y=cancel_percent, ax=ax)
ax.set_title("Effect of lead time on cancelation", fontsize=16)
ax.set_xlabel("Lead time", fontsize=16)
ax.set_ylabel("Cancelations [%]", fontsize=16)
plt.show()

Bookings made a few days before the arrival date are rarely canceled, whereas bookings made over one year in advance are canceled very often. 

### Deposit type:

In [None]:
# Summary statistics of is_canceled for every deposit type.
deposit_cancel_data = df.groupby("deposit_type")["is_canceled"].describe()

# Cancellation rate in percent, one bar per deposit type.
cancel_percent = deposit_cancel_data["mean"] * 100
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=deposit_cancel_data.index, y=cancel_percent, ax=ax)
ax.set_title("Effect of deposit_type on cancelation", fontsize=16)
ax.set_xlabel("Deposit type", fontsize=16)
ax.set_ylabel("Cancelations [%]", fontsize=16)
plt.show()

As Susmit Vengurlekar already pointed out in the Discussion section of the dataset, the deposit_type 'Non Refund' and the 'is_canceled' column are correlated in a counter-intuitive way.  
Over 99 % of people who paid the entire amount upfront canceled. This raises the question if there is something wrong with the data (or the description).  
What else stands out for Non Refund deposits?  
Here is a table of all mean values of the data, grouped by deposit type:

In [None]:
# Mean of every numeric column per deposit type.
# numeric_only=True is required on pandas >= 2.0, where averaging the
# remaining object columns raises instead of silently dropping them.
deposit_mean_data = df.groupby("deposit_type").mean(numeric_only=True)
deposit_mean_data

Comparing the mean values for Non refund to No Deposit shows the following:
- Non Refund deposits are characterized by > 2x longer lead_time   
- is_repeated_guest is ~ 1/10th  
- previous_cancellations is 10x higher 
- previous_bookings_not_canceled is 1/15th  
- required_car_parking_spaces is almost zero  
- special requests are very rare 
  
Based on these findings it seems that especially people who have not previously visited one of the hotels book, pay and cancel repeatedly... this is strange!
  
To address this issue, I will make a model without this feature below.

### ADR:

In [None]:
# Summary statistics of is_canceled for every distinct adr value.
adr_cancel_data = df.groupby("adr")["is_canceled"].describe()

# Cancellation rate in percent against adr, with a fitted regression line.
# Both axes are clipped to the ranges set below.
cancel_percent = adr_cancel_data["mean"].values * 100
fig, ax = plt.subplots(figsize=(12, 8))
sns.regplot(x=adr_cancel_data.index, y=cancel_percent, ax=ax)
ax.set_title("Effect of ADR on cancelation", fontsize=16)
ax.set_xlabel("ADR", fontsize=16)
ax.set_ylabel("Cancelations [%]", fontsize=16)
ax.set_xlim(0, 400)
ax.set_ylim(0, 100)
plt.show()

In [None]:
# Separate features and predicted value, this time without 'deposit_type'.
# A new list is built instead of calling .remove(), so re-running the cell
# does not raise, and the change no longer depends on the earlier
# preprocessor sharing (and seeing the mutation of) the same list object.
categorical_features = [col for col in categorical_features if col != 'deposit_type']
features = numerical_features + categorical_features
Y = df['is_canceled']
X = df[features]

# Re-split so the train/test frames really exclude the feature; the
# test_size and random_state match the earlier split.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# Rebuild preprocessing and model explicitly for the reduced feature set.
preprocessor = ColumnTransformer(transformers=[("num", num_transformer, numerical_features),
                                               ("cat", cat_transformer, categorical_features)])
model = RandomForestClassifier(random_state=42, n_jobs=-1)
model_steps = Pipeline(steps=[
                                ('preprocessor', preprocessor),
                                ('model', model)])

# Same folds and metric as the base-model comparison.
cv_results = cross_val_score(model_steps,
                             X_train, Y_train,
                             cv=split,
                             scoring="accuracy",
                             n_jobs=-1)

# output:
min_score = round(min(cv_results), 4)
max_score = round(max(cv_results), 4)
mean_score = round(np.mean(cv_results), 4)
std_dev = round(np.std(cv_results), 4)
print(f"Enhanced RFC model cross validation accuracy score: {mean_score} +/- {std_dev} (std) min: {min_score}, max: {max_score}")

In [None]:
# fit model(pipeline) so values can be accessed:
model_steps.fit(X_train, Y_train)

# Names of all (encoded) features are needed.
# Fetch the fitted one-hot encoder from inside the pipeline:
onehot_encoder = (model_steps.named_steps['preprocessor']
                  .named_transformers_['cat']
                  .named_steps['onehot'])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
if hasattr(onehot_encoder, "get_feature_names_out"):
    onehot_columns = list(onehot_encoder.get_feature_names_out(categorical_features))
else:
    onehot_columns = list(onehot_encoder.get_feature_names(input_features=categorical_features))

# Add num_features for full list.
# Order must be as in definition of X, where num_features are first:
feat_imp_list = numerical_features + onehot_columns

# show 10 most important features, provide names of features:
feat_imp_df = eli5.formatters.as_dataframe.explain_weights_df(
    model_steps.named_steps['model'],
    feature_names=feat_imp_list)
feat_imp_df.head(10)

In [None]:
# Compare the refitted pipeline's predictions on the held-out split
# with the true labels.
Y_pred = model_steps.predict(X_test)
ActVPred = pd.DataFrame({'Actual': Y_test}).assign(Predicted=Y_pred)
print(ActVPred)

# Per-class precision / recall / F1 report.
labels = ['No Canceled', 'Canceled']
report = classification_report(Y_test, Y_pred, target_names=labels)
print(report)

The new accuracy score of 0.8819 is almost identical to the one obtained with the deposit_type included (0.8819),  
which placed significant weight on this feature.  
The new model compensated for this by placing increased weight on lead_time, country_PRT, adr and others. 