In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Basic Data Preparation

In [None]:
# Read the hotel bookings CSV into `df`, print its (rows, columns) shape
# and display the first rows.
df = pd.read_csv('/kaggle/input/hotel-booking-demand/hotel_bookings.csv')
print(f'Dataframe shape: {df.shape}')
df.head()

In [None]:
# Number of missing (NaN) values in each column.
df.isna().sum()

In [None]:
# Remove the columns listed below, then replace missing `children` values with 0.
columns_to_drop = [
    'country', 'agent', 'company', 'meal', 'required_car_parking_spaces',
    'arrival_date_week_number', 'arrival_date_day_of_month', 'total_of_special_requests',
]
df = (
    df.drop(columns=columns_to_drop)
      .assign(children=lambda frame: frame['children'].fillna(0))
)

In [None]:
# Preview the first rows of the reduced frame.
df.head()

In [None]:
# Data type of every column.
df.dtypes

In [None]:
# Store the child count as int64 (missing values were replaced by 0 in an earlier cell).
df['children'] = df['children'].astype('int64')

## Variable Types

### Categorical Variables
- arrival_date_month (nominal)
- arrival_date_year (nominal)
- market_segment (nominal)
- distribution_channel (nominal)
- reserved_room_type (would be ordinal, but because we don't know anything about the rooms, it is treated as nominal)
- hotel (nominal)
- assigned_room_type (would be ordinal, but because we don't know anything about the rooms, it is treated as nominal)
- deposit_type (nominal)
- customer_type (nominal)
- is_repeated_guest (binary)
- is_canceled (binary)

### Numerical Variables
- lead_time (discrete)
- adults, children, babies (discrete)
- stays_in_weekend_nights, stays_in_week_nights, stays_in_total_nights (discrete)
- previous_cancellations (discrete)
- previous_bookings_not_canceled (discrete)
- booking_changes (discrete)
- days_in_waiting_list (discrete)
- adr (continuous)
- total_of_special_requests (discrete)

# EDA

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# print(plt.style.available)
plt.style.use('ggplot')

### Hotel Type

In [None]:
# Share of bookings per hotel type (value counts normalised to proportions).
df['hotel'].value_counts(normalize=True).plot.bar();

In [None]:
# `is_canceled` aggregated per arrival month (seaborn's barplot shows the mean by default), split by hotel.
fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(data=df, x='arrival_date_month', y='is_canceled', hue='hotel', ax=ax);

### Lead Time

In [None]:
# lead_time: histogram on the left, distribution per cancellation outcome on the right
fig, (hist_ax, violin_ax) = plt.subplots(1, 2, figsize=(17, 7))
df['lead_time'].plot(kind='hist', bins=25, ax=hist_ax, legend='lead_time');
sns.violinplot(data=df, x='is_canceled', y='lead_time', ax=violin_ax);

In [None]:
# Density of lead_time, one curve per value of is_canceled, x-axis limited to 0-600.
lead_time_grid = sns.FacetGrid(df, hue='is_canceled', height=6, xlim=(0, 600))
lead_time_grid.map(sns.kdeplot, 'lead_time', shade=True)
lead_time_grid.add_legend();

### Number of Bookings vs Number of Cancellations

In [None]:
# Compare number of bookings and number of cancellations, per arrival year (left)
# and per arrival month (right).
#
# Only `is_canceled` is aggregated (the previous version summed every column of
# the frame just to read one of them), and both result columns are named
# explicitly instead of relying on the name pandas gives to `value_counts()`,
# which changed to "count" in pandas 2.0.
fig, (year_ax, month_ax) = plt.subplots(1, 2, figsize=(17, 5))

bkng_cancel_year = (
    df.groupby('arrival_date_year')['is_canceled']
      .agg(num_cancellations='sum', num_bookings='size')
      # Same ordering as before: most cancellations first.
      .sort_values('num_cancellations', ascending=False)
)
bkng_cancel_year.plot.bar(ax=year_ax);

bkng_cancel_month = (
    df.groupby('arrival_date_month')['is_canceled']
      .agg(num_cancellations='sum', num_bookings='size')
      .sort_values('num_cancellations', ascending=False)
)
bkng_cancel_month.plot.bar(ax=month_ax);

### Duration of Stay

In [None]:
# Visualizing duration of stay
# Total stay length is the sum of week nights and weekend nights.
df['stays_in_total_nights'] = df['stays_in_week_nights'] + df['stays_in_weekend_nights']

# Both panels only show bookings shorter than 25 nights.
stays_under_25 = df[df['stays_in_total_nights'] < 25]

fig, (hist_ax, violin_ax) = plt.subplots(1, 2, figsize=(17, 7))
stays_under_25['stays_in_total_nights'].plot.hist(bins=25, ax=hist_ax);
hist_ax.set_xlabel("Duration of stay in nights");
sns.violinplot(data=stays_under_25, x='is_canceled', y='stays_in_total_nights', ax=violin_ax);

The majority of stays at the hotels are (roughly) shorter than a week. We see a peak at roughly 7 nights and another peak at around 2 nights. The peak around 2-3 nights likely corresponds to guests who only book a stay for the weekend.
We only look at stays shorter than 25 nights, because the proportion of bookings longer than that is very small.

In [None]:
# Density of total nights stayed, one curve per value of is_canceled, x-axis limited to 0-25.
stay_length_grid = sns.FacetGrid(df, hue='is_canceled', height=6, xlim=(0, 25))
stay_length_grid.map(sns.kdeplot, 'stays_in_total_nights', shade=True)
stay_length_grid.add_legend();

### Adults, Children, Babies

In [None]:
# Number of bookings for each adult count, split by hotel.
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(data=df, x='adults', hue='hotel', ax=ax);

In [None]:
# Bookings that list more than 10 adults.
df[df['adults'] > 10]

In [None]:
# Interesting cases. Could be considered as outliers?
# Rows with more than 3 babies or more than 5 children.
df[(df['babies'] > 3) | (df['children'] > 5)]

### Market Segment

In [None]:
fig, (pie_ax, bar_ax) = plt.subplots(1, 2, figsize=(20, 7))
df['market_segment'].value_counts().plot(kind='pie', ax=pie_ax);

# Compare number of cancellations and number of bookings for each market segment.
# Only `is_canceled` is aggregated, and the columns are named explicitly so the
# result does not depend on how pandas names the output of `value_counts()`
# (it became "count" in pandas 2.0, which made the old rename a silent no-op).
cancellations_per_market_segment = (
    df.groupby('market_segment')['is_canceled']
      .agg(num_cancellations='sum', num_bookings='size')
      # Same ordering as before: most cancellations first.
      .sort_values('num_cancellations', ascending=False)
)
cancellations_per_market_segment.plot.bar(ax=bar_ax);

In [None]:
# Booking counts per market segment, split by reservation status.
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(data=df, x='market_segment', hue='reservation_status', ax=ax);

In [None]:
# Distribution of total nights stayed per market segment, split by hotel.
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(data=df, x='market_segment', y='stays_in_total_nights', hue='hotel', ax=ax);

In [None]:
# Week-night (left) and weekend-night (right) distributions per market segment, split by hotel.
fig, (week_ax, weekend_ax) = plt.subplots(1, 2, figsize=(30, 10))
panels = (
    ('stays_in_week_nights', week_ax),
    ('stays_in_weekend_nights', weekend_ax),
)
for night_column, panel in panels:
    sns.violinplot(data=df, x='market_segment', y=night_column, hue='hotel', ax=panel);

Although there are more Offline TA/TO bookings than Group Bookings, the number of cancellations for Groups is higher. 

### Distribution Channel

In [None]:
fig, (pie_ax, bar_ax) = plt.subplots(1, 2, figsize=(20, 7))
df['distribution_channel'].value_counts().plot(kind='pie', ax=pie_ax);

# Compare number of cancellations and number of bookings for each distribution channel.
# Only `is_canceled` is aggregated, and the columns are named explicitly so the
# result does not depend on how pandas names the output of `value_counts()`
# (it became "count" in pandas 2.0, which made the old rename a silent no-op).
cancellations_per_distribution_channel = (
    df.groupby('distribution_channel')['is_canceled']
      .agg(num_cancellations='sum', num_bookings='size')
      # Same ordering as before: most cancellations first.
      .sort_values('num_cancellations', ascending=False)
)
cancellations_per_distribution_channel.plot.bar(ax=bar_ax);

In [None]:
# Distribution of total nights stayed per distribution channel, split by cancellation outcome.
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(data=df, x='distribution_channel', y='stays_in_total_nights', hue='is_canceled', ax=ax);

### Past Record of Cancellations, Bookings

In [None]:
# Percentage of bookings per value of is_repeated_guest (index 0/1 relabelled to No/Yes).
(df['is_repeated_guest'].value_counts(normalize=True) * 100).rename(index={0: 'No', 1: 'Yes'}).plot(kind='bar', xlabel='Repeated Guest', figsize=(9, 6));

In [None]:
# Left: previous cancellations vs outcome (coloured by reservation status).
# Middle: previous non-cancelled bookings vs outcome.
# Right: the two history counts against each other, coloured by outcome.
fig, (prev_cancel_ax, prev_kept_ax, history_ax) = plt.subplots(1, 3, figsize=(30, 7))
sns.stripplot(data=df, x='is_canceled', y='previous_cancellations', hue='reservation_status', ax=prev_cancel_ax);
sns.stripplot(data=df, x='is_canceled', y='previous_bookings_not_canceled', ax=prev_kept_ax);
sns.scatterplot(data=df, x='previous_cancellations', y='previous_bookings_not_canceled', hue='is_canceled', ax=history_ax);

In [None]:
# Booking counts per cancellation outcome, split by repeated-guest flag.
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(data=df, x='is_canceled', hue='is_repeated_guest', ax=ax);
# The bars count all bookings in each is_canceled group (0 and 1), not only
# cancellations, so the axis is labelled as a booking count.
ax.set_ylabel('num_bookings');

### Room Type Characteristics

In [None]:
# Per room type: how often it was reserved / assigned, and how many of those
# bookings were cancelled. `is_canceled` is selected before summing so that
# only the needed column is aggregated instead of every column of the frame.
room_type_stats = pd.concat(
    [
        df['reserved_room_type'].value_counts().rename('num_reserved_rooms'),
        df['assigned_room_type'].value_counts().rename('num_assigned_rooms'),
        df.groupby('reserved_room_type')['is_canceled'].sum().rename('reserved_rooms_cancelled'),
        df.groupby('assigned_room_type')['is_canceled'].sum().rename('assigned_rooms_cancelled'),
    ],
    axis=1,
)
room_type_stats

In [None]:
# One bar chart per pair of room-type statistics, laid out on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(20, 9))
column_pairs = [
    ['num_reserved_rooms', 'reserved_rooms_cancelled'],
    ['num_assigned_rooms', 'assigned_rooms_cancelled'],
    ['num_reserved_rooms', 'num_assigned_rooms'],
    ['reserved_rooms_cancelled', 'assigned_rooms_cancelled'],
]
for panel, pair in zip(axes.flat, column_pairs):
    room_type_stats[pair].plot(kind='bar', ax=panel);

A detailed analysis could be performed to understand the characteristics of each room type, possibly using clustering of some kind and/or observing the correlation of the room types with other features. The following are some observations:
- Room A has the most bookings, and therefore will also have the most number of cancellations, regardless of whether it was reserved or assigned
- Room A is reserved the most. 
- For Room A, num_reserved_rooms > num_assigned_rooms. We notice on the other hand that for the other room types, num_assigned_rooms > num_reserved_rooms.

### Booking Changes

In [None]:
# Number of booking changes for each cancellation outcome.
sns.stripplot(x='is_canceled', y='booking_changes', data=df);

### Deposit Type
Close to 100% of those that had a non-refundable booking cancelled their booking. Big difference in contrast to the bookings that either had 'No Deposit' or had 'Refundable' bookings

In [None]:
# Cancellations, bookings and cancellation percentage per deposit type.
# Only `is_canceled` is aggregated and every column is named explicitly, so the
# table no longer depends on the name pandas assigns to `value_counts()` output
# (it became "count" in pandas 2.0) or on the positional name `0` that concat
# gives to an unnamed series.
deposit_type_cancellations_stats = (
    df.groupby('deposit_type')['is_canceled']
      .agg(num_cancellations='sum', total_num_bookings='size')
      # Same ordering as before: most cancellations first.
      .sort_values('num_cancellations', ascending=False)
      .assign(percent_cancellations=lambda stats: stats['num_cancellations'] / stats['total_num_bookings'] * 100)
)

fig, (counts_ax, percent_ax) = plt.subplots(1, 2, figsize=(17, 6))
deposit_type_cancellations_stats.drop(columns='percent_cancellations').plot(kind='bar', ax=counts_ax);
sns.barplot(x=deposit_type_cancellations_stats.index, y='percent_cancellations', data=deposit_type_cancellations_stats, ax=percent_ax);

### Days in the Waiting List

No definitive correlation here

In [None]:
# Days spent on the waiting list for each cancellation outcome.
sns.stripplot(x='is_canceled', y='days_in_waiting_list', data=df);

### Customer Type
Groups had the lowest percentage of cancellations

In [None]:
# Cancellations, bookings and cancellation percentage per customer type.
# Only `is_canceled` is aggregated and every column is named explicitly, so the
# table no longer depends on the name pandas assigns to `value_counts()` output
# (it became "count" in pandas 2.0) or on the positional name `0` that concat
# gives to an unnamed series.
customer_type_cancellations_stats = (
    df.groupby('customer_type')['is_canceled']
      .agg(num_cancellations='sum', total_num_bookings='size')
      # Same ordering as before: most cancellations first.
      .sort_values('num_cancellations', ascending=False)
      .assign(percent_cancellations=lambda stats: stats['num_cancellations'] / stats['total_num_bookings'] * 100)
)

fig, (counts_ax, percent_ax) = plt.subplots(1, 2, figsize=(17, 6))
customer_type_cancellations_stats.drop(columns='percent_cancellations').plot(kind='bar', ax=counts_ax);
sns.barplot(x=customer_type_cancellations_stats.index, y='percent_cancellations', data=customer_type_cancellations_stats, ax=percent_ax);

### ADR vs is_canceled

No significant correlation. 

In [None]:
# ADR values for each cancellation outcome (before any outlier filtering).
sns.stripplot(x='is_canceled', y='adr', data=df);

In [None]:
# Drop outlier and replot: only bookings with adr below 1000 are kept.
adr_below_1000 = df['adr'] < 1000
df = df[adr_below_1000]
sns.stripplot(data=df, x='is_canceled', y='adr');

In [None]:
# Density of ADR, one curve per value of is_canceled.
# The x-axis previously stopped at 25 (the limit used for the stay-length plot),
# which hides almost all of the ADR range. The upper limit is now taken from the
# data: the 99th percentile keeps the bulk of the distribution readable without
# stretching the axis to the largest remaining values.
adr_upper_limit = df['adr'].quantile(0.99)
(sns.FacetGrid(df, hue = 'is_canceled',
             height = 6,
              xlim=(0, adr_upper_limit))
    .map(sns.kdeplot, 'adr', shade = True)
    .add_legend());

### Reservation Status
Those that were marked as no-show also have 'is_canceled = 1', but we see that a little less than 40% of the people canceled their bookings

In [None]:
# Share of bookings per reservation status (value counts normalised to proportions).
df['reservation_status'].value_counts(normalize=True).plot.bar(figsize=(8, 5));

In [None]:
# Booking counts per reservation status, split by repeated-guest flag.
sns.countplot(x='reservation_status', hue='is_repeated_guest', data=df);

# ML Modeling

## Data Preprocessing

In [None]:
# Reload the CSV so modelling starts from the unmodified data: the EDA cells
# above dropped columns and filtered rows of `df`.
df = pd.read_csv('/kaggle/input/hotel-booking-demand/hotel_bookings.csv')
print(f'Dataframe shape: {df.shape}')
df.head()

In [None]:
# Remove the columns listed below, then store `children` as int64 with
# missing values counted as 0.
columns_to_drop = [
    'country', 'agent', 'company', 'meal',
    'arrival_date_week_number', 'arrival_date_day_of_month', 'reservation_status_date',
]
df = df.drop(columns=columns_to_drop)
df['children'] = df['children'].fillna(0).astype('int64')

In [None]:
# Creation of New Features
#   stays_in_total_nights: week nights + weekend nights
#   total_guests:          adults + children + babies (the three source columns are then removed)
#   room_type_changed:     1 when the assigned room type differs from the reserved one, else 0
df = df.assign(
    stays_in_total_nights=df['stays_in_week_nights'] + df['stays_in_weekend_nights'],
    total_guests=df['adults'] + df['children'] + df['babies'],
    room_type_changed=(df['reserved_room_type'] != df['assigned_room_type']).astype('int'),
).drop(columns=['adults', 'children', 'babies'])

In [None]:
# Data type of every column after feature creation.
df.dtypes

In [None]:
# Drop outliers: keep only bookings with adr below 1000.
df = df[df['adr'] < 1000]

In [None]:
from sklearn import preprocessing

## Preprocess Features to Evaluate Correlation

In [None]:
# Work on a copy so the encoding done for the correlation plot does not alter `df`.
df_copy = df.copy()

In [None]:
# Columns that are replaced by integer codes in `df_copy`.
features_to_label_encode = [
    'arrival_date_year', 'arrival_date_month', 'market_segment',
    'distribution_channel', 'reserved_room_type', 'assigned_room_type',
    'deposit_type', 'customer_type', 'reservation_status',
]
le = preprocessing.LabelEncoder()

# The same encoder object is re-fitted for every column, reading the values from `df`.
df_copy = df_copy.assign(
    **{name: le.fit_transform(df[name]) for name in features_to_label_encode}
)

In [None]:
# Correlation of every numeric column with the target.
# `df_copy` still contains the string column `hotel` (it is not label-encoded),
# and `DataFrame.corr()` raises on non-numeric columns from pandas 2.0 on, so
# the frame is restricted to numeric columns first. Older pandas versions
# dropped such columns silently, so the plotted values are unchanged.
fig, ax = plt.subplots(figsize=(10, 7))
(df_copy.select_dtypes(include='number')
        .corr()['is_canceled']
        .drop('is_canceled')  # the target's correlation with itself is not informative
        .sort_values()
        .plot.bar(ax=ax));
ax.set_title('Correlation with is_canceled');

### Preprocessing Categorical Features

In [None]:
# Drop certain columns
# Drop reservation status because it is basically directly correlated with the target column
columns_to_remove = ['reserved_room_type', 'assigned_room_type', 'reservation_status']
df = df.drop(columns=columns_to_remove)

In [None]:
# One-hot encode every nominal feature.
nominal_features = [
    'arrival_date_month',
    'arrival_date_year',
    'market_segment',
    'distribution_channel',
    'hotel',
    'deposit_type',
    'customer_type',
]

df = pd.get_dummies(data=df, columns=nominal_features)

### Preprocessing Continuous Features

In [None]:
# Split lead_time into three equal-frequency bins (terciles) and one-hot encode the bin labels.
lead_time_labels = ['less_than_a_month', 'more_than_a_month', 'more_than_4_months']
lead_time_binned = pd.qcut(df['lead_time'], q=3, labels=lead_time_labels)
df = pd.get_dummies(df.assign(lead_time=lead_time_binned), columns=['lead_time'])

In [None]:
# Number of bookings for each party size.
df['total_guests'].value_counts()

In [None]:
# A reservation with no guests seems rather odd, so only rows with total_guests > 0 are kept.
df = df[df['total_guests'] > 0]

In [None]:
def bin_num_guests(data):
    """Return the party-size bucket for one booking.

    `data` must support `data['total_guests']` (e.g. a DataFrame row or a dict).
    Fewer than 3 guests -> 'small', 3 to 6 guests -> 'medium', anything else -> 'large'.
    """
    guests = data['total_guests']

    if guests < 3:
        return 'small'
    return 'medium' if guests <= 6 else 'large'


# Add the party-size bucket and a flag for single-guest bookings, then one-hot encode the bucket.
df = df.assign(
    guest_group_size=df.apply(bin_num_guests, axis=1),
    is_single=(df['total_guests'] == 1).astype(int),
)
df = pd.get_dummies(df, columns=['guest_group_size'])

In [None]:
# weekend_only: 1 when the stay has weekend nights but no week nights
# week_only:    1 when the stay has week nights but no weekend nights
week_nights = df['stays_in_week_nights']
weekend_nights = df['stays_in_weekend_nights']
df = df.assign(
    weekend_only=((week_nights == 0) & (weekend_nights > 0)).astype('int'),
    week_only=((week_nights > 0) & (weekend_nights == 0)).astype('int'),
)

In [None]:
# Week nights vs weekend nights for stays shorter than 6 nights in total.
sns.jointplot(x='stays_in_week_nights', y='stays_in_weekend_nights', data=df[df['stays_in_total_nights'] < 6], kind='hex', color="#4CB391");

In [None]:
def categorize_duration_of_stay(data):
    """Return the visit category for one booking.

    `data` must support lookup of 'stays_in_total_nights',
    'stays_in_weekend_nights' and 'stays_in_week_nights'.

    Stays of 6 or 7 nights are 'week_long_visit'; longer stays are 'long_visit'.
    Stays shorter than 6 nights are 'long_weekend' (both weekend and week
    nights), 'weekend_visit' (weekend nights only) or 'weekday_visit' (week
    nights only). A short stay matching none of these, e.g. zero nights,
    returns None.
    """
    total_nights = data['stays_in_total_nights']

    if not total_nights < 6:
        return "week_long_visit" if 6 <= total_nights <= 7 else 'long_visit'

    weekend_nights = data['stays_in_weekend_nights']
    week_nights = data['stays_in_week_nights']

    if weekend_nights > 0:
        if week_nights > 0:
            return 'long_weekend'
        if week_nights == 0:
            return 'weekend_visit'
    elif weekend_nights == 0 and week_nights > 0:
        return 'weekday_visit'

    return None

In [None]:
# Label every booking with its visit category and preview it next to the night counts.
df['type_of_visit'] = df.apply(categorize_duration_of_stay, axis=1)
df[['type_of_visit', 'stays_in_week_nights', 'stays_in_weekend_nights']].head()

In [None]:
# It turns out there are bookings whose duration of stay is zero nights; these
# are not useful, so only rows with at least one night are kept.
df = df[df['stays_in_total_nights'] > 0]

In [None]:
# One-hot encode type_of_visit. With an empty prefix the new column names start with '_'.
df = pd.get_dummies(df, columns=['type_of_visit'], prefix='')

In [None]:
# Remove the leading underscore from every column name that starts with one.
df = df.rename(columns={name: name[1:] for name in df.columns if name[0] == '_'})

## ML Models

In [None]:
from sklearn import model_selection as ms
from sklearn import metrics

# Algorithms to consider
from sklearn import tree, linear_model, ensemble, neural_network
import xgboost

### Setup train and test data

In [None]:
# Target is `is_canceled`; every other column is a feature.
target_column = 'is_canceled'
y = df[target_column]
X = df.drop(columns=[target_column])

# 70/30 shuffled split with a fixed seed.
X_train, X_test, y_train, y_test = ms.train_test_split(
    X, y, test_size=0.3, random_state=42, shuffle=True
)

### Wrapper methods

In [None]:
def fit_and_evaluate(model, X_train, y_train, X_test, y_test):
    """Fit `model` on the training data and report its accuracy on the test data.

    Prints the model's class name, its accuracy score and its confusion matrix.

    Returns:
        A tuple `(fitted, score)`: the object returned by `model.fit` and the
        accuracy of its predictions on `X_test`.
    """
    fitted = model.fit(X_train, y_train)
    predictions = fitted.predict(X_test)

    score = metrics.accuracy_score(y_test, predictions)

    print(f"Evaluation of trained {type(model).__name__} model")
    print(f"Accuracy score: {score}")
    print(f"Confusion Matrix: {metrics.confusion_matrix(y_test, predictions)}")

    return fitted, score

def plot_roc_curve(X_test, y_test, models):
    """Draw the ROC curve of every fitted model in `models` on one shared set of axes.

    Args:
        X_test: features passed to each model to obtain its scores.
        y_test: true labels matching `X_test`.
        models: iterable of fitted estimators.

    `metrics.plot_roc_curve` was removed in scikit-learn 1.2, so
    `metrics.RocCurveDisplay.from_estimator` is used when it exists; the old
    function is kept as a fallback for older scikit-learn versions.
    """
    _, ax = plt.subplots(1, 1, figsize=(10, 7))

    roc_display = getattr(metrics, 'RocCurveDisplay', None)
    if roc_display is not None and hasattr(roc_display, 'from_estimator'):
        draw_curve = roc_display.from_estimator
    else:
        draw_curve = metrics.plot_roc_curve

    for model in models:
        draw_curve(model, X_test, y_test, ax=ax)

### Training and Evaluation of a few ML Models

In [None]:
training_results = []

# SVC was excluded in this because it took too long to train

# The tree, forest, neural-network and boosting estimators are stochastic.
# Seeding them makes the scores reproducible across runs; 42 is the same seed
# the train/test split uses.
RANDOM_STATE = 42

models = [
    linear_model.LogisticRegression(solver='liblinear'),
    tree.DecisionTreeClassifier(max_depth=10, random_state=RANDOM_STATE),
    ensemble.RandomForestClassifier(random_state=RANDOM_STATE),
    neural_network.MLPClassifier(solver='adam', random_state=RANDOM_STATE),
    xgboost.XGBClassifier(random_state=RANDOM_STATE),
]

# Each entry of training_results is a (trained model, accuracy score) tuple.
for model in models:
    trained_model, acc_score = fit_and_evaluate(model, X_train, y_train, X_test, y_test)
    training_results.append((trained_model, acc_score))

In [None]:
# Each training result is a (trained model, accuracy) pair; only the models are plotted.
trained_models = [trained for trained, _ in training_results]
plot_roc_curve(X_test, y_test, trained_models)