## Setting up environment

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
from sklearn.exceptions import ConvergenceWarning
# Silence the two warning categories that would otherwise flood the cell outputs
for ignored_category in (FutureWarning, ConvergenceWarning):
    warnings.simplefilter('ignore', category=ignored_category)

## Load data

In [None]:
import pandas as pd

# Read the raw bookings file and summarise its columns, dtypes and non-null counts
DATASET_PATH = "../input/hotel-booking-demand/hotel_bookings.csv"
orig_dataset = pd.read_csv(DATASET_PATH)
orig_dataset.info()

### Split data
Split the data into train and test datasets. <br> 
Only the train dataset is selected and explored, to avoid biasing training with information from the test set.

In [None]:
import numpy as np
import matplotlib.pyplot as plt 
    
# Clean the raw bookings.
# Every step returns a new frame instead of mutating in place, so the result is
# assigned back explicitly and the cell can be re-run on its own without errors.

# Explore missing data in dataset
print(orig_dataset.isnull().sum())

orig_dataset = (
    orig_dataset
    # Drop duplicate rows (compared on all original columns)
    .drop_duplicates()
    # Drop features with lots of missing values; errors='ignore' keeps re-runs safe
    .drop(columns=['company', 'agent'], errors='ignore')
    .assign(
        # Nullable integer dtype, because 'children' still holds missing values here
        children=lambda frame: frame['children'].astype('Int64'),
        # Impute missing countries with the last observed value (LOCF).
        # Assigning the result back avoids the chained `fillna(..., inplace=True)`,
        # which does not reliably write through to the parent frame.
        country=lambda frame: frame['country'].ffill(),
    )
    # Drop the rows whose 'children' value is missing
    .loc[lambda frame: frame['children'].notna()]
)

print(orig_dataset.isnull().sum())

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictors
label = orig_dataset['is_canceled']
features = orig_dataset.drop(columns=['is_canceled'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.2,
    random_state=3,
)

In [None]:
# Exploration frame: the training predictors with the training target appended as the last column
train_parts = [X_train, y_train]
explore_data = pd.concat(train_parts, axis=1).copy()

explore_data.columns

## EDA - Exploratory Data Analysis
Investigate hidden patterns and information in the data

### 1. Time series analysis
The graphs below show the number of bookings in each month of the year, broken down by year, hotel type, customer type and country.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Calendar position of each month name, used to put the crosstab rows in calendar order
month_dict = {
    month_name: position
    for position, month_name in enumerate([
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    ])
}


def _monthly_counts(frame, column):
    """Count the rows of `frame` per arrival month and per value of `column`.

    Returns a crosstab whose rows are the arrival months in calendar order
    and whose columns are the distinct values of `column`.
    """
    counts = pd.crosstab(frame['arrival_date_month'], frame[column])
    return counts.sort_values(by=['arrival_date_month'],
                              key=lambda month_names: month_names.map(month_dict))


_, axs = plt.subplots(4, 1, figsize = (15,15))

# Amount of lodging in each month separated by years
lodging_trend = _monthly_counts(explore_data, 'arrival_date_year')
sns.lineplot(data=lodging_trend, ax=axs[0])

# Amount of hotel reserved in each month
hotel_type = _monthly_counts(explore_data, 'hotel')
sns.lineplot(data=hotel_type, ax=axs[1])

# Amount of bookings per customer type in each month
customer_type_month = _monthly_counts(explore_data, 'customer_type')
sns.lineplot(data=customer_type_month, ax=axs[2])

# Amount of bookings per country in each month, restricted to the ten most frequent countries
top10_country = explore_data['country'].value_counts().head(10).index
from_top10 = explore_data['country'].isin(top10_country)
country_month = _monthly_counts(explore_data.loc[from_top10], 'country')
sns.lineplot(data=country_month, ax=axs[3])

### Observations
Exploring the time series exposes the useful information listed below:
* The third quarter of each year has the most lodging transactions, and the numbers start to decrease in the fourth quarter. <br>
* City and resort hotels have similar demand growth throughout the year. <br>
* The Transient customer type has significantly larger demand than the others throughout the year. <br>
* BEL is the country with the most lodging at the hotels.

### 2. Bivariate analysis
This section tries to find hidden patterns or relations between the features of interest and the target feature. <br>
It also tries to test the assumptions that came to my mind.

**2.1 Cancellation due to days in waiting list**

In [None]:
# Find ratio of cancellation due to days in waiting list

# Wedge label and colour are looked up from the class value itself.
# value_counts() orders its result by frequency, so relying on position would
# swap the labels whenever cancellations are the majority in a group.
cancel_labels = {0: 'Not canceled', 1: 'Canceled'}
cancel_colors = {0: 'C0', 1: 'C1'}

# Separate into three groups (panel title -> row mask)
waiting_days = explore_data['days_in_waiting_list']
waiting_groups = {
    'Less than 100 days': (waiting_days >= 0) & (waiting_days < 100),
    'Between 100 and 200 days': (waiting_days >= 100) & (waiting_days < 200),
    'More than 200 days': waiting_days >= 200,
}

# Plot
_, axs = plt.subplots(1, 3, figsize=(12, 18))
for ax, (group_title, group_mask) in zip(axs, waiting_groups.items()):
    # Percentage of each class inside the group, in a fixed 0 -> 1 order
    share = (explore_data.loc[group_mask, 'is_canceled']
             .value_counts(normalize=True)
             .sort_index() * 100)
    if share.empty:
        # No booking falls into this group in the training split
        ax.text(0.5, 0.5, 'No bookings', ha='center', va='center')
        ax.set_axis_off()
    else:
        ax.pie(share,
               labels=[cancel_labels[cls] for cls in share.index],
               colors=[cancel_colors[cls] for cls in share.index],
               autopct='%1.1f%%', startangle=90)
    ax.title.set_text(group_title)

plt.show()

### Observations
Investigating cancellation in relation to days in the waiting list. <br>
I assumed that the longer customers wait in the list, the more likely they are to cancel the lodging because of dissatisfaction with the service. <br>
I tried the steps below to visualize the data:
1. Split the time customers spent waiting in the list into three groups: less than 100 days, at least 100 days but less than 200 days, and 200 days or more.
2. Calculate the percentage of canceled and not canceled bookings for each group.
3. Visualize each group in pie charts. <br>

The result shows that a longer waiting time does not significantly affect the cancellation decision.


**2.2 Cancellation due to previous cancellation**

In [None]:
# Find ratio of cancellation due to previous cancellation

# Wedge label and colour are looked up from the class value itself.
# value_counts() orders its result by frequency, and a group may contain one or
# both classes, so hard-coding the label list can mislabel wedges or raise.
cancel_labels = {0: 'Not canceled', 1: 'Canceled'}
cancel_colors = {0: 'forestgreen', 1: 'firebrick'}

# Separate into three segments (panel title -> row mask)
previous_cancellations = explore_data['previous_cancellations']
previous_cancel_groups = {
    'Less than 9 times': previous_cancellations < 9,
    'Between 9 and 18 times': (previous_cancellations >= 9) & (previous_cancellations < 18),
    'More than 18 times': previous_cancellations >= 18,
}

# Plot
_, axs = plt.subplots(1, 3, figsize=(12, 18))
for ax, (group_title, group_mask) in zip(axs, previous_cancel_groups.items()):
    # Percentage of each class inside the segment, in a fixed 0 -> 1 order
    share = (explore_data.loc[group_mask, 'is_canceled']
             .value_counts(normalize=True)
             .sort_index() * 100)
    if share.empty:
        # No booking falls into this segment in the training split
        ax.text(0.5, 0.5, 'No bookings', ha='center', va='center')
        ax.set_axis_off()
    else:
        ax.pie(share,
               labels=[cancel_labels[cls] for cls in share.index],
               colors=[cancel_colors[cls] for cls in share.index],
               autopct='%1.1f%%', startangle=90)
    ax.title.set_text(group_title)

plt.show()

**2.3 Cancellation due to distribution channel**

In [None]:
# Compare percentage of cancellation for each distribution channel

# One row per channel; normalize='index' turns the counts into per-channel
# fractions, which are then scaled to percentages.
channel_pct = (
    pd.crosstab(explore_data['distribution_channel'], explore_data['is_canceled'],
                normalize='index')
    .mul(100)
    # Guarantee both class columns exist even if one class is absent
    .reindex(columns=[0, 1], fill_value=0)
    .rename(columns={0: 'Not canceled', 1: 'Canceled'})
    .rename_axis(index='Channel', columns=None)
    .reset_index()
)
# Full-height background bar; a scalar broadcasts to however many channels exist,
# so no row count has to be hard-coded.
channel_pct['Total'] = 100

# Draw the 100% bar first and overlay the cancelled share on the same axes
ax = sns.barplot(x='Channel', y='Total', data=channel_pct, color='salmon')
sns.barplot(x='Channel', y='Canceled', data=channel_pct, color='red', ax=ax)
ax.set_ylabel('Canceled (%)')
ax.set_title('Portion of cancellation for each channel')

**2.4 Cancellation due to customer segment**

In [None]:
# Compare percentage of cancellation for each customer type

# One row per customer type; normalize='index' turns the counts into per-type
# fractions, which are then scaled to percentages.
cus_type_pct = (
    pd.crosstab(explore_data['customer_type'], explore_data['is_canceled'],
                normalize='index')
    .mul(100)
    # Guarantee both class columns exist even if one class is absent
    .reindex(columns=[0, 1], fill_value=0)
    .rename(columns={0: 'Not canceled', 1: 'Canceled'})
    .rename_axis(index='Types', columns=None)
    .reset_index()
)
# Full-height background bar; a scalar broadcasts to however many customer types
# exist, so no spurious all-NaN row is appended when the count is not 5.
cus_type_pct['Total'] = 100

# Draw the 100% bar first and overlay the cancelled share on the same axes
ax = sns.barplot(x='Types', y='Total', data=cus_type_pct, color='salmon')
sns.barplot(x='Types', y='Canceled', data=cus_type_pct, color='red', ax=ax)
ax.set_ylabel('Canceled (%)')
ax.set_title('Portion of cancellation for each customer type')

**2.5 Cancellation due to repeated guest**

In [None]:
# Relation between having been a guest before and cancelling the booking

# Counts: rows are the cancellation flag, columns the (renamed) repeated-guest flag
guest_counts = pd.crosstab(explore_data['is_canceled'], explore_data['is_repeated_guest'])
guest_counts = guest_counts.rename(columns={0: 'Not repeated guest', 1: 'Repeated guest'})

# One row per guest group holding the percentage of each cancellation class
guest_share = pd.DataFrame([
    guest_counts[group] / guest_counts[group].sum() * 100
    for group in guest_counts
])
cross_cancel = guest_share.reset_index().rename(
    columns={'index': 'Is repeated guest', 0: 'Not canceled', 1: 'Canceled'}
)
# Extra column (named 0) of constant 100s used as the full-height background bar
full_bar = pd.DataFrame([100, 100])
cross_cancel = pd.concat([cross_cancel, full_bar], axis=1)

# Visualize: background bar first, cancelled share drawn on top
sns.barplot(data=cross_cancel, x='Is repeated guest', y=0, color='salmon')
sns.barplot(data=cross_cancel, x='Is repeated guest', y='Canceled', color='red')
plt.title('Portion of cancellation for each repeated and not repeated guest')

## Feature engineering
Investigate features to find the appropriate features for prediction

In [None]:
# Show the columns of the exploration frame (training predictors plus the target)
explore_data.columns

In [None]:
# Check class balance: number of training rows per value of the target
balance_grid = sns.catplot(data=explore_data, x='is_canceled', kind='count')
balance_grid.ax.set_title('Portion of cancellation in dataset')

### Construct new features from features

In [None]:
def add_booking_features(bookings):
    """Return a copy of `bookings` with three engineered columns appended.

    Added columns, in this order:
      * total_guest        - adults + children + babies
      * cancellation_rate  - previous_cancellations divided by all previous
                             bookings; -1 when there is no previous booking
      * total_nights_stay  - week nights + weekend nights

    The input frame is not modified. The result is recomputed from the raw
    columns, so applying the function twice gives the same frame.
    """
    previous_bookings = (bookings['previous_cancellations']
                         + bookings['previous_bookings_not_canceled'])
    # 0 / 0 yields NaN for customers without history; flag those with -1.
    # The fill is applied to the computed Series before assignment, so it always
    # lands in the frame (an in-place fillna on `frame.loc[:, col]` may only
    # change a temporary copy).
    cancellation_rate = (bookings['previous_cancellations'] / previous_bookings).fillna(-1)

    return bookings.assign(
        total_guest=bookings['adults'] + bookings['children'] + bookings['babies'],
        cancellation_rate=cancellation_rate,
        total_nights_stay=bookings['stays_in_week_nights'] + bookings['stays_in_weekend_nights'],
    )


# Apply the same feature construction to the exploration, train and test frames
explore_data = add_booking_features(explore_data)
X_train = add_booking_features(X_train)
X_test = add_booking_features(X_test)

In [None]:
# Split the column names of the exploration frame by dtype
numeric_dtypes = ['int64', 'float64']
numerical_data = explore_data.select_dtypes(include=numeric_dtypes).columns
categorical_data = explore_data.select_dtypes(include='object').columns

In [None]:
# Pairwise correlation of the numerical columns (the target is one of them)
features_corr = explore_data[numerical_data].corr()

# Visualise the correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(features_corr)

# Correlation of every numerical feature with the target, sorted ascending
sig_features = features_corr['is_canceled'].sort_values().drop('is_canceled', axis=0)

# Keep the three lowest and the three highest correlations as selected numerical features
selected_num = [
    *sig_features.index[:3].tolist(),
    *sig_features.index[-3:].tolist(),
]
plt.title('Heat map of numerical correlation')

In [None]:
from scipy.stats import chisquare
from scipy.stats import chi2_contingency
from scipy.stats import chi2

# Columns that are derived from the booking outcome and must never be used as
# predictors: 'reservation_status' restates the target and
# 'reservation_status_date' is the date on which that status was set.
# NOTE(review): 'reservation_status_date' is treated as leakage based on its
# name and pairing with 'reservation_status' — confirm against the dataset docs.
LEAKY_FEATURES = {'reservation_status', 'reservation_status_date'}
SIGNIFICANCE_LEVEL = 0.05

# Declare variable for keeping selected categorical features
selected_cate = []

# Chi-square test of independence between each categorical feature and the target
for cat in categorical_data:
    contingency_tab = pd.crosstab(explore_data[cat], explore_data['is_canceled'])
    _, p, _, _ = chi2_contingency(contingency_tab)
    # Select features with p-value below the significance level, skipping leaky ones
    if p < SIGNIFICANCE_LEVEL and cat not in LEAKY_FEATURES:
        selected_cate.append(cat)
    print('p-value of {} is {}'.format(cat, p))

In [None]:
# Report the outcome of both feature-selection steps
print(f'Selected categorical features: {selected_cate}')
print(f'Selected numerical features: {selected_num}')

# Preprocessing

In [None]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from category_encoders import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Calendar order of the month names, used as the category order of the ordinal encoder
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

# The arrival month is ordinal-encoded separately, so keep it out of the one-hot list.
# Filtering (instead of list.remove) makes the cell safe to re-run and does not
# fail when the month was not among the selected categorical features.
selected_cate = [feature for feature in selected_cate if feature != 'arrival_date_month']

# Define the transformer that handles each feature type separately:
#   * selected numerical features   -> standard scaling
#   * selected categorical features -> one-hot encoding
#   * arrival month                 -> ordinal encoding in calendar order
preprocessor = ColumnTransformer([
        ('scaler', StandardScaler(), pd.Index(selected_num)),
        ('one_hot', OneHotEncoder(), pd.Index(selected_cate)),
        ('ordinal_enc', OrdinalEncoder(categories= [months]), pd.Index(['arrival_date_month']))
])

# Modeling
Try multiple algorithms to fit the data

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn import svm

# Define model training pipelines.
# Each pipeline applies the shared `preprocessor` and then one classifier.
lgs = Pipeline([('preprocess', preprocessor),
                ('logistic', LogisticRegression(max_iter=300))])

# NOTE: this rebinds the name `svm` from the imported sklearn module to the
# pipeline; the following cells rely on `svm` being the pipeline.
svm = Pipeline([('preprocess', preprocessor),
                ('svm', svm.SVC(max_iter=300))])

# A fixed random_state makes the tree (and the grid search built on it)
# reproducible across runs; it matches the seed used for the train/test split.
tree = Pipeline([('preprocess', preprocessor),
                ('decisionTree', DecisionTreeClassifier(random_state=3))])

In [None]:
from sklearn.model_selection import cross_val_score

# Keep only the columns the preprocessor consumes
selected_features = [*selected_num, *selected_cate, 'arrival_date_month']
X_train = X_train.loc[:, selected_features].copy()
X_test = X_test.loc[:, selected_features].copy()

# 5-fold cross-validation scores on the training set, one array per pipeline
lgs_cv, svm_cv, tree_cv = (
    cross_val_score(pipeline, X_train, y_train, cv=5)
    for pipeline in (lgs, svm, tree)
)

In [None]:
# Mean cross-validation score of each pipeline
print(f'Logistic Regression mean: {lgs_cv.mean()}')
print(f'SVM mean: {svm_cv.mean()}')
print(f'Decision Tree mean: {tree_cv.mean()}')

In [None]:
# Fit every pipeline on the full training set
lgs_model = lgs.fit(X_train, y_train)
svm_model = svm.fit(X_train, y_train)
tree_model = tree.fit(X_train, y_train)

# Predict the held-out test set with each fitted pipeline
lgs_pred, svm_pred, tree_pred = (
    fitted.predict(X_test) for fitted in (lgs_model, svm_model, tree_model)
)

# Show one classification report per model, in the order logistic, SVM, tree
for predictions in (lgs_pred, svm_pred, tree_pred):
    print(classification_report(y_test, predictions), end='\n\n')

# Parameter Tuning
Select the type of model that fit the data best in the previous section and tune its parameters with GridSearchCV.

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the decision tree; keys are prefixed with the pipeline step name
tree_options = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [3,4,5],
    'min_samples_leaf': [1,2,3],
    'min_samples_split': [2,3,4],
}
tree_params = {
    f'decisionTree__{option}': candidates
    for option, candidates in tree_options.items()
}

# Exhaustive search over the grid, scored by accuracy with 10-fold cross-validation
model = GridSearchCV(
    estimator=tree,
    param_grid=tree_params,
    scoring='accuracy',
    cv=10,
)

# Train, then report the best configuration and its cross-validated accuracy
model.fit(X_train, y_train)
print(f'Best params are: {model.best_params_}')
print(f'Best training accuracy: {model.best_score_}')

# Evaluate the tuned model on the held-out test set
y_pred = model.predict(X_test)
print(f'Test set accuracy: {accuracy_score(y_test, y_pred)}')