In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

# PreProcessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from category_encoders import BinaryEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Splitting Data
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, RandomizedSearchCV

# Modeling
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score

In [None]:
# Load only the columns used in this analysis from the hotel-booking CSV.
hotel = pd.read_csv(
    '../input/hotel-booking-demand/hotel_bookings.csv',
    usecols = [
        'hotel', 'is_canceled',
        'adults', 'children', 'babies',
        'meal', 'country',
        'market_segment', 'distribution_channel',
        'reserved_room_type', 'booking_changes', 'deposit_type',
        'days_in_waiting_list', 'customer_type',
        'required_car_parking_spaces', 'total_of_special_requests',
    ],
)
hotel.head()

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
hotel.info()

# EDA

In [None]:
# Explore on an independent copy so columns added during EDA never reach `hotel`.
hotel_eda = hotel.copy(deep = True)
# Number of bookings per hotel type.
hotel_eda.loc[:, 'hotel'].value_counts()

In [None]:
# Bookings per cancellation status, split by hotel type.
fig, ax = plt.subplots(figsize = (18, 6))
sns.countplot(data = hotel_eda, x = 'is_canceled', hue = 'hotel', ax = ax)
ax.set_title('Cancellation Situation')
plt.show()

- City Hotel has the highest number of canceled bookings and also the highest number of bookings that were not canceled.

In [None]:
# Share (in %) of each cancellation status. The denominator is taken from the same
# frame as the counts (`hotel_eda`), matching the other percentage cells.
hotel_eda['is_canceled'].value_counts() / len(hotel_eda) * 100

- The ratio between canceled and not canceled bookings indicates imbalanced data:
    * Not Canceled: 62.96%
    * Canceled: 37.04%

### Customer Profiling

In [None]:
# `children` contains missing values; a plain `+` would turn those rows into NaN and
# silently drop them from the crosstab. fill_value = 0 counts a missing entry as
# "no children" so every booking is kept.
hotel_eda['all_child'] = hotel_eda['children'].add(hotel_eda['babies'], fill_value = 0)
# Adults (rows) versus total children + babies (columns), with row/column totals.
pd.crosstab(hotel_eda['adults'], hotel_eda['all_child'], margins=True, margins_name = 'Total')

- In this data, I combine children and babies in the all_child column to simplify the calculation.

#### Adults
- Judging from the data above, the largest group consists of 4175 adults, 3777 of whom came without bringing any children. They may be travelling for business, or be couples on honeymoon.
- In second place are adults who came alone: 654 people, 642 of whom did not bring children. They are likely travelling for business.
- Among bookings with more than 4 adults, no one brings a single child. These are likely business trips or group travel.

#### All Child
- In the data above, the majority of adults (4560) came without bringing children, followed by 237 adults who brought 1 child and 201 adults who brought 2 children.

In [None]:
# Relative frequency of each country of origin, largest first.
country_booking = (
    hotel_eda['country']
    .value_counts(normalize = True)
    .rename_axis('country')
    .reset_index(name = 'Percentage')
)

# Plot only the ten most frequent countries.
fig, ax = plt.subplots(figsize = (18, 6))
sns.barplot(data = country_booking.head(10), x = 'country', y = 'Percentage', ax = ax)
ax.set_title('Country of Customers')
plt.show()

- Categories are represented as ISO 3166-1 alpha-3 country codes (e.g. PRT for Portugal).
- PRT or Portugal has the most booking demand based on the data (more than 60%). It is pretty obvious because if we trace to the publication page, the description tells us that the data source locations are from hotels located in Portugal.

In [None]:
# Number of bookings per customer type.
fig, ax = plt.subplots(figsize = (18, 6))
sns.countplot(data = hotel_eda, x = 'customer_type', ax = ax)
ax.set_title('Customer Type')
plt.show()

In [None]:
# Share (in %) of bookings per customer type.
hotel_eda['customer_type'].value_counts().div(len(hotel_eda['customer_type'])).mul(100)

Type of booking, assuming one of four categories:
- Contract, when the booking has an allotment or other type of contract associated to it
- Group, when the booking is associated to a group
- Transient, when the booking is not part of a group or contract, and is not associated to other transient booking
- Transient-party, when the booking is transient, but is associated to at least other transient booking

From the graph:
- Transient as much as 75.05%
- Transient-party as much as 21.04%
- Contract as much as 3.41%
- Group as much as 0.48%

Most customers are Transient, which means they are walk-in guests, last-minute bookers, or simply people who require a very short-term stay in the facility. Transient customers are one of the major market segments and consist of individuals or groups.

In [None]:
# Relative frequency of each reserved room type, largest first.
room_booking = (
    hotel_eda['reserved_room_type']
    .value_counts(normalize = True)
    .rename_axis('reserved_room_type')
    .reset_index(name = 'Percentage')
)

fig, ax = plt.subplots(figsize = (18, 6))
sns.barplot(data = room_booking, x = 'reserved_room_type', y = 'Percentage', ax = ax)
ax.set_title('Customer Reserved Room Type')
plt.show()

- Type of Reserved Room is A, D, E, G, C, F, H, L, P. Code is presented instead of designation for anonymity reasons.
- Most customers (more than 70%) book the type A room.

### Customer Behavior

In [None]:
# Share (in %) of bookings per meal package.
hotel_eda['meal'].value_counts().div(len(hotel_eda['meal'])).mul(100)

Type of meal booked. Categories are presented in standard hospitality meal packages: 
- BB means Bed & Breakfast as much as 77.31%
- HB means Half board (breakfast and one other meal – usually dinner) as much as 12.11%
- FB means Full board (breakfast, lunch and dinner) as much as 0.66%
- Undefined/SC means no meal as much as 9.89%

Most customers prefer to book with breakfast included, which is the usual complimentary option when booking a hotel, in case they don't have time to find breakfast outside the hotel. Around 12.11% of them book with another meal, usually dinner. Those who come for business can get more rest because they do not need to look for dinner outside the hotel.

In [None]:
# Two stacked panels sharing one figure: deposit type (top) and cancellation (bottom),
# each split by market segment.
fig, (ax_deposit, ax_cancel) = plt.subplots(2, 1, figsize = (18, 10))

sns.countplot(data = hotel_eda, x = 'deposit_type', hue = 'market_segment', ax = ax_deposit)
ax_deposit.set_title('Deposit Type for Market Segment')
ax_deposit.legend(loc = 1)

sns.countplot(data = hotel_eda, x = 'is_canceled', hue = 'market_segment', ax = ax_cancel)
ax_cancel.set_title('Cancellation for Market Segment')

plt.show()

#### Deposit Type for Market Segment
- The highest book is using Online TA, with No Deposit. While Groups and Offline TA/TO are next with Refundable and Non refund. In this era, it will be easy to book a hotel using Online TA, especially most Online TA offers the customers without deposit facility. Customers can choose any type of room up to any special request by Online TA.
- Looking at Offline TA/TO and Groups, the situations where the deposit was received were only in the scenarios where the groups came. It is quite logical to apply a deposit for a large number of customers who will fill important amount of the hotel capacity.

#### Cancellation for Market Segment
- The highest number of cancellations comes from bookings made through Online TA. Most Online TAs give customers the privilege to cancel without any charge. Second, the Groups segment has a cancellation rate of around 50%.

In [None]:
# Number of bookings per count of special requests.
fig, ax = plt.subplots(figsize = (18, 6))
sns.countplot(data = hotel_eda, x = 'total_of_special_requests', ax = ax)
ax.set_title('Total of Special Requests from Customers')
plt.show()

- Number of special requests made by the customer (e.g. twin bed or high floor)
- Most customers don't make any special requests. This is likely related to customer type: they don't ask for additional requests because they are only transient customers.

# Data Cleaning

In [None]:
# Percentage of missing values per column (mean of the boolean NaN mask, times 100).
hotel.isna().mean().mul(100)

* There are missing values and it will be processed on ColumnTransformer.

# PreProcessing

*Preprocessing Scheme*

- OneHot: hotel, meal, market_segment, distribution_channel, reserved_room_type, deposit_type, customer_type
- Binary:
    * mode: country (the country column has more than 10 countries, so I decided to use binary encoding)
- Mode Impute: children
- PassThrough: adults, babies, booking_changes, days_in_waiting_list, required_car_parking_spaces, total_of_special_requests
- Target: is_canceled
- No need to scale the data because there isn't any interval nominal between columns

In [None]:
# `country`: fill missing values with the most frequent value, then binary-encode.
mode_binary = Pipeline(
    steps = [
        ('encoder', SimpleImputer(strategy = 'most_frequent')),
        ('binary', BinaryEncoder()),
    ]
)

# Column-wise preprocessing; every column not listed here is passed through unchanged.
transformer = ColumnTransformer(
    transformers = [
        (
            'one hot',
            OneHotEncoder(handle_unknown = 'ignore'),
            [
                'hotel', 'meal', 'market_segment', 'distribution_channel',
                'reserved_room_type', 'deposit_type', 'customer_type',
            ],
        ),
        ('mode binary', mode_binary, ['country']),
        ('impute mode', SimpleImputer(strategy = 'most_frequent'), ['children']),
    ],
    remainder = 'passthrough',
)

*Define Target Data*

In [None]:
# Number of bookings per value of the target column `is_canceled`.
hotel['is_canceled'].value_counts()

In [None]:
# Features: every loaded column except the target.
X = hotel.drop(columns = ['is_canceled'])
# Target: the cancellation flag.
y = hotel.loc[:, 'is_canceled']

* *0 = Not Canceled*
* *1 = Canceled*

        - TN: Guest is predicted as Not Canceled and the actual is Not Canceled
        - TP: Guest is predicted as Canceled and the actual is Canceled
        - FP: Guest is predicted as Canceled and the actual is Not Canceled
        - FN: Guest is predicted as Not Canceled and the actual is Canceled

Actions:
* FP: When the customer arrives, the room is not available. Customers won't book at the hotel again because the hotel is considered to provide poor service; the hotel suffers financial losses and damage to its good name.
* FN: The hotel is left with empty rooms because the booking is canceled. Customers have more privilege to choose facilities if they want.

#### I want to reduce one of these situations: FP. The metric to optimize is therefore the precision score.

*Splitting Data*

In [None]:
# (rows, columns) of the feature frame.
X.shape

In [None]:
# Hold out 20% of the rows for testing; `stratify = y` keeps the class ratio of the
# target in both splits and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 1515,
    stratify = y,
)

- I use 0.2 for test_size and a fixed random_state of 1515 (a value taken from X.shape) so the split is reproducible; stratify = y keeps the canceled / not canceled ratio the same in the train and test sets.

# Modeling

*Define Model*
- I use 3 basic models to predict:
    * Logistic Regression
    * Decision Tree Classifier
    * K-Nearest Neighbor (I add scaler because usually, the score will be better if using a scaler)

### Cross Validation

In [None]:
def model_evaluation(model, metric):
    """Cross-validate `model` on the training split.

    Runs 5-fold stratified cross-validation on the notebook-level
    `X_train` / `y_train` and returns the array of per-fold scores
    computed with the scorer named by `metric`.
    """
    folds = StratifiedKFold(n_splits = 5)
    return cross_val_score(model, X_train, y_train, cv = folds, scoring = metric)


# Base estimators.
logreg = LogisticRegression()
tree = DecisionTreeClassifier(random_state = 1515)
knn = KNeighborsClassifier()

# One pipeline per estimator; all share the same preprocessing, KNN additionally
# rescales the transformed features.
logreg_pipe = Pipeline(steps = [('transformer', transformer), ('logreg', logreg)])
tree_pipe = Pipeline(steps = [('transformer', transformer), ('tree', tree)])
knn_pipe = Pipeline(steps = [('transformer', transformer), ('scale', MinMaxScaler()), ('knn', knn)])

# Cross-validated precision on the training split, in the order logreg, tree, knn.
logreg_pipe_cv, tree_pipe_cv, knn_pipe_cv = (
    model_evaluation(pipe, 'precision')
    for pipe in (logreg_pipe, tree_pipe, knn_pipe)
)

# Fit every pipeline on the full training split before scoring on the test split.
for model in [logreg_pipe, tree_pipe, knn_pipe]:
    model.fit(X_train, y_train)

cv_scores = [logreg_pipe_cv, tree_pipe_cv, knn_pipe_cv]
score_mean = [fold_scores.mean() for fold_scores in cv_scores]
score_std = [fold_scores.std() for fold_scores in cv_scores]
score_precision_score = [
    precision_score(y_test, pipe.predict(X_test))
    for pipe in (logreg_pipe, tree_pipe, knn_pipe)
]
method_name = ['Logistic Regression', 'Decision Tree Classifier', 'KNN Classifier']

# One row per model: CV mean / std and test-set precision.
cv_summary = pd.DataFrame({
    'method': method_name,
    'mean score': score_mean,
    'std score': score_std,
    'precision score': score_precision_score,
})
cv_summary

From this method, I will choose Logistic Regression because it has the highest precision score, and continue to the HyperParameter Tuning process.

# HyperParam Tuning

In [None]:
# Pipeline to tune: the shared preprocessing followed by the logistic-regression model.
estimator = Pipeline([
    ('transformer', transformer),
    ('model', logreg)])

# Candidate values; keys use the `<step>__<param>` form to reach the 'model' step.
hyperparam_space = {
    'model__C': [1, 5, 10, 20, 30, 50],
    # class_weight accepts a {label: weight} dict, 'balanced' or None. The string
    # 'dict' is none of these and is rejected by scikit-learn's parameter
    # validation, so the unweighted option is spelled None.
    'model__class_weight': [None, 'balanced'],
    'model__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
    'model__max_iter': [50, 100, 150, 200, 300],
    'model__random_state': [1515],
    'model__n_jobs': [-1]
}

# Randomized search: 10 sampled candidates, each scored by 5-fold stratified CV precision.
random = RandomizedSearchCV(
                estimator,
                param_distributions = hyperparam_space,
                cv = StratifiedKFold(n_splits = 5),
                scoring = 'precision',
                n_iter = 10,
                n_jobs = -1,
                # Seed the candidate sampler so the same 10 combinations are drawn on every run.
                random_state = 1515)

random.fit(X_train, y_train)

print('best score', random.best_score_)
print('best param', random.best_params_)

- So far, the best parameter is:
    * solver: saga
    * max_iter: 50
    * class_weight: dict
    * C: 20

# Before and After Tuning

In [None]:
# Baseline: the untuned pipeline fitted on the training split.
estimator.fit(X_train, y_train)
y_pred_estimator = estimator.predict(X_test)
before = precision_score(y_test, y_pred_estimator)

# Tuned: the search was built with the default refit=True, so best_estimator_ has
# already been refitted on the whole training split; fitting it again is redundant.
y_predict = random.best_estimator_.predict(X_test)
after = precision_score(y_test, y_predict)

# Test-set precision before and after tuning, side by side.
score_list = [before, after]
method_name = ['Logistic Regression Before Tuning', 'Logistic Regression After Tuning']
best_summary = pd.DataFrame({
    'method': method_name,
    'score': score_list
})
best_summary

After the tuning process, the precision score increases. I decide to use Logistic Regression After Tuning to predict hotel cancellation.