In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import plotly.express as ex

In [None]:
# Read the raw hotel bookings CSV from the relative ../input directory.
DATA_PATH = "../input/hotel-booking-demand/hotel_bookings.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# (rows, columns) of the raw bookings frame.
df.shape

In [None]:
# Column dtypes and non-null counts for the raw frame.
df.info()

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Number of missing values in each column.
df.isna().sum()

# # Exploratory Data Analysis

In [None]:
# Share of bookings per value of the is_canceled target.
ex.pie(
    data_frame=df,
    names='is_canceled',
    title='Cancelled vs Not Cancelled',
)

In [None]:
# Booking counts per hotel type: overall (top) and split by cancellation outcome (bottom).
# Columns are passed by keyword with data=df; seaborn >= 0.12 reads the first
# positional argument as `data`, not `x`.
fig, ax = plt.subplots(2, 1, figsize=(7, 10))
sns.countplot(data=df, x='hotel', ax=ax[0])
sns.countplot(data=df, x='hotel', hue='is_canceled', ax=ax[1])

In [None]:
# Lead-time distribution: overall (top) and per cancellation outcome (bottom).
# Columns are passed by keyword with data=df; seaborn >= 0.12 reads the first
# positional argument as `data`, not `x`.
fig, ax = plt.subplots(2, 1, figsize=(7, 10))
sns.boxplot(data=df, x='lead_time', ax=ax[0])
sns.boxplot(data=df, x='is_canceled', y='lead_time', ax=ax[1])

In [None]:
# There is possibly a slight proportional increase in cancellations in 2017, but no noticeable
# difference between the years. The year column is dropped because it is irrelevant for
# predicting future cancellations.
# Columns are passed by keyword with data=df; seaborn >= 0.12 reads the first
# positional argument as `data`, not `x`.
fig, ax = plt.subplots(2, 1, figsize=(7, 10))
sns.countplot(data=df, x='arrival_date_year', ax=ax[0])
sns.countplot(data=df, x='arrival_date_year', hue='is_canceled', ax=ax[1])
# Start the list of columns removed during preprocessing; later cells append to it.
drop_columns = []
drop_columns.append('arrival_date_year')

In [None]:
# Months can be relevant for prediction; this column must be one-hot encoded during preprocessing.
# Columns are passed by keyword with data=df; seaborn >= 0.12 reads the first
# positional argument as `data`, not `x`.
fig, ax = plt.subplots(2, 1, figsize=(7, 13))
sns.countplot(data=df, x='arrival_date_month', ax=ax[0])
sns.countplot(data=df, x='arrival_date_month', hue='is_canceled', ax=ax[1])
# Rotate the month labels on both panels; a separate loop name keeps `ax` bound to the axes array.
for axis in fig.axes:
    axis.tick_params(axis='x', labelrotation=90)
# Start the list of columns flagged for one-hot encoding; later cells append to it.
ohe_columns = []
ohe_columns.append('arrival_date_month')

In [None]:
# As expected, more people travel during the summer, followed by a spike during the holidays.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale is its
# documented replacement.
sns.histplot(data=df, x='arrival_date_week_number', kde=True, stat='density')

In [None]:
# Canceled and non-canceled bookings follow a similar weekly pattern, so the column adds
# complexity without much signal. Week number is also most likely redundant with month.
week_numbers = df['arrival_date_week_number']
week_numbers.hist(by=df['is_canceled'], figsize=(6, 7))
drop_columns.append('arrival_date_week_number')

In [None]:
# Bookings appear to be spread evenly across the days of the month.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale is its
# documented replacement.
sns.histplot(data=df, x='arrival_date_day_of_month', kde=True, stat='density')

In [None]:
# Day of month shows a similar pattern for canceled and non-canceled bookings,
# so this column is dropped as well to simplify the model.
days_of_month = df['arrival_date_day_of_month']
days_of_month.hist(by=df['is_canceled'], figsize=(9, 9))
drop_columns.append('arrival_date_day_of_month')

In [None]:
# Booking counts per number of weekend nights: overall (top) and split by cancellation outcome (bottom).
# Columns are passed by keyword with data=df; seaborn >= 0.12 reads the first
# positional argument as `data`, not `x`.
fig, ax = plt.subplots(2, 1, figsize=(7, 10))
sns.countplot(data=df, x='stays_in_weekend_nights', ax=ax[0])
sns.countplot(data=df, x='stays_in_weekend_nights', hue='is_canceled', ax=ax[1])

In [None]:
# Booking counts per number of week nights: overall (top) and split by cancellation outcome (bottom).
# Columns are passed by keyword with data=df; seaborn >= 0.12 reads the first
# positional argument as `data`, not `x`.
fig, ax = plt.subplots(2, 1, figsize=(7, 10))
sns.countplot(data=df, x='stays_in_week_nights', ax=ax[0])
sns.countplot(data=df, x='stays_in_week_nights', hue='is_canceled', ax=ax[1])

In [None]:
# Booking counts per number of adults: overall (top) and split by cancellation outcome (bottom).
# Columns are passed by keyword with data=df; seaborn >= 0.12 reads the first
# positional argument as `data`, not `x`.
fig, ax = plt.subplots(2, 1, figsize=(7, 10))
sns.countplot(data=df, x='adults', ax=ax[0])
sns.countplot(data=df, x='adults', hue='is_canceled', ax=ax[1])

In [None]:
# Booking counts per number of children: overall (top) and split by cancellation outcome (bottom).
# Columns are passed by keyword with data=df; seaborn >= 0.12 reads the first
# positional argument as `data`, not `x`.
fig, ax = plt.subplots(2, 1, figsize=(7, 10))
sns.countplot(data=df, x='children', ax=ax[0])
sns.countplot(data=df, x='children', hue='is_canceled', ax=ax[1])

In [None]:
# Booking counts per number of babies: overall (top) and split by cancellation outcome (bottom).
# Columns are passed by keyword with data=df; seaborn >= 0.12 reads the first
# positional argument as `data`, not `x`.
fig, ax = plt.subplots(2, 1, figsize=(7, 10))
sns.countplot(data=df, x='babies', ax=ax[0])
sns.countplot(data=df, x='babies', hue='is_canceled', ax=ax[1])

In [None]:
# Booking counts per meal plan: overall (top) and split by cancellation outcome (bottom).
# Columns are passed by keyword with data=df; seaborn >= 0.12 reads the first
# positional argument as `data`, not `x`.
fig, ax = plt.subplots(2, 1, figsize=(7, 10))
sns.countplot(data=df, x='meal', ax=ax[0])
sns.countplot(data=df, x='meal', hue='is_canceled', ax=ax[1])
# Categorical column: flag it for one-hot encoding.
ohe_columns.append('meal')

In [None]:
# Country has high cardinality (177 countries); the column is dropped for simplification.
# Columns are passed by keyword with data=df; seaborn >= 0.12 reads the first
# positional argument as `data`, not `x`.
fig, ax = plt.subplots(2, 1, figsize=(15, 15))
sns.countplot(data=df, x='country', ax=ax[0])
sns.countplot(data=df, x='country', hue='is_canceled', ax=ax[1])
drop_columns.append('country')

In [None]:
# Number of bookings per country value, most frequent first.
df['country'].value_counts()

In [None]:
# The "Groups" market segment has a high proportion of cancellations; "Direct" has a low proportion.
# Columns are passed by keyword with data=df; seaborn >= 0.12 reads the first
# positional argument as `data`, not `x`.
fig, ax = plt.subplots(2, 1, figsize=(9, 15))
sns.countplot(data=df, x='market_segment', ax=ax[0])
sns.countplot(data=df, x='market_segment', hue='is_canceled', ax=ax[1])
# Tilt the segment labels on both panels; a separate loop name keeps `ax` bound to the axes array.
for axis in fig.axes:
    axis.tick_params(axis='x', labelrotation=45)
# Categorical column: flag it for one-hot encoding.
ohe_columns.append('market_segment')

In [None]:
# distribution_channel seems to be redundant with market_segment, so it is dropped.
# Columns are passed by keyword with data=df; seaborn >= 0.12 reads the first
# positional argument as `data`, not `x`.
fig, ax = plt.subplots(2, 1, figsize=(7, 10))
sns.countplot(data=df, x='distribution_channel', ax=ax[0])
sns.countplot(data=df, x='distribution_channel', hue='is_canceled', ax=ax[1])
drop_columns.append('distribution_channel')

In [None]:
# Booking counts for repeated vs. new guests: overall (top) and split by cancellation outcome (bottom).
# Columns are passed by keyword with data=df; seaborn >= 0.12 reads the first
# positional argument as `data`, not `x`.
fig, ax = plt.subplots(2, 1, figsize=(7, 10))
sns.countplot(data=df, x='is_repeated_guest', ax=ax[0])
sns.countplot(data=df, x='is_repeated_guest', hue='is_canceled', ax=ax[1])

In [None]:
# previous_cancellations in four panels: counts, counts split by cancellation outcome,
# overall box plot, and box plot per cancellation outcome.
# Columns are passed by keyword with data=df; seaborn >= 0.12 reads the first
# positional argument as `data`, not `x`.
fig, ax = plt.subplots(4, 1, figsize=(7, 14))
sns.countplot(data=df, x='previous_cancellations', ax=ax[0])
sns.countplot(data=df, x='previous_cancellations', hue='is_canceled', ax=ax[1])
sns.boxplot(data=df, x='previous_cancellations', ax=ax[2])
sns.boxplot(data=df, x='is_canceled', y='previous_cancellations', ax=ax[3])

In [None]:
# previous_bookings_not_canceled in four panels: counts, counts split by cancellation outcome,
# overall box plot, and box plot per cancellation outcome.
# Columns are passed by keyword with data=df; seaborn >= 0.12 reads the first
# positional argument as `data`, not `x`.
fig, ax = plt.subplots(4, 1, figsize=(7, 14))
sns.countplot(data=df, x='previous_bookings_not_canceled', ax=ax[0])
sns.countplot(data=df, x='previous_bookings_not_canceled', hue='is_canceled', ax=ax[1])
sns.boxplot(data=df, x='previous_bookings_not_canceled', ax=ax[2])
sns.boxplot(data=df, x='is_canceled', y='previous_bookings_not_canceled', ax=ax[3])

In [None]:
# Booking counts per reserved room type: overall (top) and split by cancellation outcome (bottom).
# Columns are passed by keyword with data=df; seaborn >= 0.12 reads the first
# positional argument as `data`, not `x`.
fig, ax = plt.subplots(2, 1, figsize=(7, 10))
sns.countplot(data=df, x='reserved_room_type', ax=ax[0])
sns.countplot(data=df, x='reserved_room_type', hue='is_canceled', ax=ax[1])
# NOTE(review): no reason is recorded for dropping this column; presumably it overlaps
# with assigned_room_type, which is kept -- confirm.
drop_columns.append('reserved_room_type')

In [None]:
# Room type A has a higher proportion of cancellations than the other room types.
# Columns are passed by keyword with data=df; seaborn >= 0.12 reads the first
# positional argument as `data`, not `x`.
fig, ax = plt.subplots(2, 1, figsize=(7, 10))
sns.countplot(data=df, x='assigned_room_type', ax=ax[0])
sns.countplot(data=df, x='assigned_room_type', hue='is_canceled', ax=ax[1])

In [None]:
# Bookings with 0 changes cancel at a higher rate than those with any changes; will create a custom encoding.
# Columns are passed by keyword with data=df; seaborn >= 0.12 reads the first
# positional argument as `data`, not `x`.
fig, ax = plt.subplots(2, 1, figsize=(7, 10))
sns.countplot(data=df, x='booking_changes', ax=ax[0])
sns.countplot(data=df, x='booking_changes', hue='is_canceled', ax=ax[1])

In [None]:
# This column will be dropped because it is calculated after a cancellation occurs, so it is not a valid predictor.
# Columns are passed by keyword with data=df; seaborn >= 0.12 reads the first
# positional argument as `data`, not `x`.
fig, ax = plt.subplots(2, 1, figsize=(7, 10))
sns.countplot(data=df, x='deposit_type', ax=ax[0])
sns.countplot(data=df, x='deposit_type', hue='is_canceled', ax=ax[1])
drop_columns.append('deposit_type')

In [None]:
# Booking counts per agent ID: overall (top) and split by cancellation outcome (bottom).
# NOTE(review): the previous comment here duplicated the deposit_type cell's rationale
# ("calculated after a cancel occurs"); this cell itself only plots the column.
# Columns are passed by keyword with data=df; seaborn >= 0.12 reads the first
# positional argument as `data`, not `x`.
fig, ax = plt.subplots(2, 1, figsize=(7, 10))
sns.countplot(data=df, x='agent', ax=ax[0])
sns.countplot(data=df, x='agent', hue='is_canceled', ax=ax[1])

In [None]:
# Booking counts of the ten most frequent agent IDs.
agent_counts = df['agent'].value_counts()
df_agent = agent_counts.sort_values(ascending=False).head(10)
df_agent

In [None]:
# Same top-agent counts as a one-column frame; its index (agent IDs) is used for filtering below.
df_agent_df = df_agent.to_frame()
df_agent_df

In [None]:
# High cancel rate for agent 1. However, since agents are likely to change, using agents as
# predictors will not make for a robust model, so the agent column is dropped.
# numeric_only=True restricts the per-agent mean to numeric columns; pandas >= 2.0 raises
# a TypeError when asked to average the string columns of the bookings frame.
agent_avg = df.groupby('agent').mean(numeric_only=True)
# Keep only the agents that appear in the top-agent table built above.
top_agents = agent_avg[agent_avg.index.isin(df_agent_df.index)]
print("Very High Cancel Rate for Agent 1", '\n\n' ,top_agents['is_canceled'].head(10))
drop_columns.append('agent')

In [None]:
# Booking counts per company ID: overall (top) and split by cancellation outcome (bottom).
# Columns are passed by keyword with data=df; seaborn >= 0.12 reads the first
# positional argument as `data`, not `x`.
fig, ax = plt.subplots(2, 1, figsize=(7, 10))
sns.countplot(data=df, x='company', ax=ax[0])
sns.countplot(data=df, x='company', hue='is_canceled', ax=ax[1])

In [None]:
# Number of bookings with no company value.
df['company'].isna().sum()

In [None]:
# The company column has too many missing values, so it is dropped.
drop_columns += ['company']

In [None]:
# Days on the waiting list: overall (top) and per cancellation outcome (bottom).
# Columns are passed by keyword with data=df; seaborn >= 0.12 reads the first
# positional argument as `data`, not `x`.
fig, ax = plt.subplots(2, 1, figsize=(7, 10))
sns.boxplot(data=df, x='days_in_waiting_list', ax=ax[0])
sns.boxplot(data=df, x='is_canceled', y='days_in_waiting_list', ax=ax[1])

In [None]:
# Histogram of waiting-list days, one panel per is_canceled value.
waiting_days = df['days_in_waiting_list']
waiting_days.hist(by=df['is_canceled'], figsize=(9, 9))

In [None]:
# days_in_waiting_list is dropped to keep the model simple.
drop_columns += ['days_in_waiting_list']

In [None]:
# Booking counts per customer type: overall (top) and split by cancellation outcome (bottom).
# Columns are passed by keyword with data=df; seaborn >= 0.12 reads the first
# positional argument as `data`, not `x`.
fig, ax = plt.subplots(2, 1, figsize=(7, 10))
sns.countplot(data=df, x='customer_type', ax=ax[0])
sns.countplot(data=df, x='customer_type', hue='is_canceled', ax=ax[1])
# Categorical column: flag it for one-hot encoding.
ohe_columns.append('customer_type')

In [None]:
# adr distribution: overall (top) and per cancellation outcome (bottom).
# Columns are passed by keyword with data=df; seaborn >= 0.12 reads the first
# positional argument as `data`, not `x`.
fig, ax = plt.subplots(2, 1, figsize=(7, 10))
sns.boxplot(data=df, x='adr', ax=ax[0])
sns.boxplot(data=df, x='is_canceled', y='adr', ax=ax[1])

In [None]:
# Booking counts per number of required parking spaces: overall (top) and split by cancellation outcome (bottom).
# Columns are passed by keyword with data=df; seaborn >= 0.12 reads the first
# positional argument as `data`, not `x`.
fig, ax = plt.subplots(2, 1, figsize=(7, 10))
sns.countplot(data=df, x='required_car_parking_spaces', ax=ax[0])
sns.countplot(data=df, x='required_car_parking_spaces', hue='is_canceled', ax=ax[1])

In [None]:
# Booking counts per number of special requests: overall (top) and split by cancellation outcome (bottom).
# Columns are passed by keyword with data=df; seaborn >= 0.12 reads the first
# positional argument as `data`, not `x`.
fig, ax = plt.subplots(2, 1, figsize=(7, 10))
sns.countplot(data=df, x='total_of_special_requests', ax=ax[0])
sns.countplot(data=df, x='total_of_special_requests', hue='is_canceled', ax=ax[1])

In [None]:
# Mark both reservation-status columns for removal.
# NOTE(review): presumably these record the final booking outcome and would leak the target -- confirm.
drop_columns.extend(['reservation_status', 'reservation_status_date'])

# Preprocessing

In [None]:
# Work on an independent copy so preprocessing can never write through to the raw frame
# that the EDA cells above read (plain assignment would only alias `df`).
df1 = df.copy()

In [None]:
# Columns collected for removal during the EDA above.
drop_columns

In [None]:
# Remove every column collected in drop_columns from the working frame.
df1 = df1.drop(columns=drop_columns)

In [None]:
# Columns flagged for one-hot encoding during the EDA above.
ohe_columns

In [None]:
# Preview the working frame after the column drop.
df1.head()

In [None]:
# Encode the hotel type as a binary flag (Resort Hotel -> 0, City Hotel -> 1).
df1['hotel'] = df1['hotel'].replace({'Resort Hotel':0, 'City Hotel':1})

# One-hot encode the categorical columns and append the indicator columns to df1.
# Each indicator is prefixed with its source column: unprefixed dummies are named after
# the raw category value, so a label shared by two columns would produce duplicate,
# ambiguous column names (presumably 'Undefined' occurs in more than one of these
# columns -- TODO confirm against the data).
# The source columns are left in place here; they are removed in a separate step.
categorical_columns = ['arrival_date_month', 'meal', 'market_segment',
                       'assigned_room_type', 'customer_type']
dummy_frames = [pd.get_dummies(df1[col], prefix=col) for col in categorical_columns]
df1 = pd.concat([df1, *dummy_frames], axis=1)

In [None]:
# Remove the raw categorical columns now that their indicator columns exist.
encoded_sources = [
    'arrival_date_month',
    'meal',
    'market_segment',
    'assigned_room_type',
    'customer_type',
]
df1 = df1.drop(columns=encoded_sources)

In [None]:
# Treat a missing children count as zero children.
df1 = df1.fillna({'children': 0})

In [None]:
# Remaining missing values per column after the fill.
df1.isna().sum()

In [None]:
# Preview the fully preprocessed frame.
df1.head()

In [None]:
# Pearson correlation between every pair of preprocessed columns, drawn as a heatmap.
fig, ax = plt.subplots(figsize=(16, 16))
correlations = df1.corr(method='pearson')
sns.heatmap(correlations, ax=ax)
ax.set_title("Correlation Heatmap")

In [None]:
# Work on a 20% random subsample of the preprocessed rows (presumably to keep the
# hyper-parameter searches below fast -- confirm). The seed fixes which rows are drawn,
# so the split, the searches and the reported scores are reproducible across kernel restarts.
df_s = df1.sample(frac=0.2, random_state=9)
df_s.shape

In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Target is the cancellation flag; every other column of the subsample is a feature.
y = df_s['is_canceled']
X = df_s.drop(columns=['is_canceled'])

In [None]:
# Confirm the feature matrix contains no missing values before modelling.
X.isna().sum()

In [None]:
# Hold out 20% of the subsample for the final evaluation; the seed fixes the split.
split_parts = train_test_split(X, y, test_size=0.2, random_state=9)
X_train, X_test, y_train, y_test = split_parts

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Seed the forest so its bootstrap samples and feature draws -- and therefore the
# cross-validation and test scores -- are repeatable between runs.
rf = RandomForestClassifier(random_state=9)
# Scaler instance reused as the first step of each model pipeline.
scaler = StandardScaler()

In [None]:
# Candidate hyper-parameters for the search; keys are prefixed with the pipeline step name.
rf_params = dict(
    rf__n_estimators=[100, 200, 300, 500, 1000],
    rf__max_features=[2, 3, 5, 7, 9],
)
# Scale the features, then fit the random forest.
rf_pipe = Pipeline(steps=[('scaler', scaler), ('rf', rf)])

In [None]:
# Randomized search: 10 sampled parameter combinations, 7-fold CV, scored by accuracy,
# run on all available cores.
rf_grid = RandomizedSearchCV(
    estimator=rf_pipe,
    param_distributions=rf_params,
    n_iter=10,
    scoring='accuracy',
    cv=7,
    n_jobs=-1,
    random_state=9,
)
rf_grid.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated accuracy.
rf_grid.best_params_

In [None]:
# Mean cross-validated accuracy of that best combination.
rf_grid.best_score_

In [None]:
# Score the best random-forest pipeline on the held-out test set.
rf_pred = rf_grid.predict(X_test)
rf_acc = accuracy_score(y_true=y_test, y_pred=rf_pred)
print('Random Forest Accuracy Score: ',rf_acc)

In [None]:
# confusion_matrix expects (y_true, y_pred) and returns true classes as rows and predicted
# classes as columns. Passing the predictions first transposed the matrix, so the
# "Actual"/"Predicted" labels below were attached to the wrong axes.
rf_conf = confusion_matrix(y_test, rf_pred)
# Flat label lists give a plain Index; the nested lists created one-level MultiIndexes.
rf_confusion = pd.DataFrame(
    data=rf_conf,
    index=['Actual Not Canceled', 'Actual Canceled'],
    columns=['Predicted Not Canceled', 'Predicted Canceled'],
)
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(rf_confusion, annot=True, fmt='g', ax=ax)

# Logistic Regression

In [None]:
from sklearn import linear_model

# Logistic-regression candidates: a single solver and a range of C values to sample from.
lr_params = dict(
    lr__solver=['newton-cg'],
    lr__C=[.1, .5, 1, 2, 5, 7, 10, 15, 20, 50],
)
# Scale the features, then fit the logistic regression.
lr = linear_model.LogisticRegression()
lr_pipe = Pipeline(steps=[('scaler', scaler), ('lr', lr)])

In [None]:
# Randomized search: 10 sampled parameter combinations, 7-fold CV, scored by accuracy,
# run on all available cores.
lr_grid = RandomizedSearchCV(
    estimator=lr_pipe,
    param_distributions=lr_params,
    n_iter=10,
    scoring='accuracy',
    cv=7,
    n_jobs=-1,
    random_state=9,
)
lr_grid.fit(X_train, y_train)

In [None]:
# Mean cross-validated accuracy of the best logistic-regression candidate.
lr_grid.best_score_

In [None]:
# Parameter combination that achieved that score.
lr_grid.best_params_

In [None]:
# Score the best logistic-regression pipeline on the held-out test set.
lr_pred = lr_grid.predict(X_test)
lr_acc = accuracy_score(y_true=y_test, y_pred=lr_pred)
print('Logistic Regression Accuracy Score: ', lr_acc)

In [None]:
# confusion_matrix expects (y_true, y_pred) and returns true classes as rows and predicted
# classes as columns. Passing the predictions first transposed the matrix, so the
# "Actual"/"Predicted" labels below were attached to the wrong axes.
lr_conf = confusion_matrix(y_test, lr_pred)
# Flat label lists give a plain Index; the nested lists created one-level MultiIndexes.
lr_confusion = pd.DataFrame(
    data=lr_conf,
    index=['Actual Not Canceled', 'Actual Canceled'],
    columns=['Predicted Not Canceled', 'Predicted Canceled'],
)
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(lr_confusion, annot=True, fmt='g', ax=ax)

# Artificial Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier

# Seed the network so weight initialisation and SGD shuffling -- and therefore the
# cross-validation and test scores -- are repeatable between runs.
mlp = MLPClassifier(random_state=9)
# Scale the features, then fit the multi-layer perceptron.
mlp_pipe = Pipeline([('scaler',scaler), ('mlp',mlp)])
# Candidates: SGD solver, three equal-width hidden layers, and several iteration caps.
mlp_params = {'mlp__solver': ['sgd'],
              'mlp__hidden_layer_sizes': [(10,10,10),(15,15,15),(20,20,20),(30,30,30),(40,40,40)],
              'mlp__max_iter': [2000,6000,8000,10000]
             }

In [None]:
# Randomized search: 10 sampled parameter combinations, 7-fold CV, scored by accuracy,
# run on all available cores.
mlp_grid = RandomizedSearchCV(
    estimator=mlp_pipe,
    param_distributions=mlp_params,
    n_iter=10,
    scoring='accuracy',
    cv=7,
    n_jobs=-1,
    random_state=9,
)
mlp_grid.fit(X_train, y_train)

In [None]:
# Mean cross-validated accuracy of the best neural-network candidate.
mlp_grid.best_score_

In [None]:
# Parameter combination that achieved that score.
mlp_grid.best_params_

In [None]:
# Score the best neural-network pipeline on the held-out test set.
mlp_pred = mlp_grid.predict(X_test)
mlp_acc = accuracy_score(y_true=y_test, y_pred=mlp_pred)
print('Accuracy Score for Neural Network: ', mlp_acc)

In [None]:
# confusion_matrix expects (y_true, y_pred) and returns true classes as rows and predicted
# classes as columns. Passing the predictions first transposed the matrix, so the
# "Actual"/"Predicted" labels below were attached to the wrong axes.
mlp_conf = confusion_matrix(y_test, mlp_pred)
# Flat label lists give a plain Index; the nested lists created one-level MultiIndexes.
mlp_confusion = pd.DataFrame(
    data=mlp_conf,
    index=['Actual Not Canceled', 'Actual Canceled'],
    columns=['Predicted Not Canceled', 'Predicted Canceled'],
)
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(mlp_confusion, annot=True, fmt='g', ax=ax)

# Results

In [None]:
# Collect each model's held-out test accuracy into one comparison table.
model_names = ['Random Forest', 'Logistic Regression', 'Neural Network']
model_scores = [rf_acc, lr_acc, mlp_acc]
results = pd.DataFrame({'Model': model_names, 'Accuracy': model_scores})

In [None]:
# Test accuracy per model, in the order the models were trained.
results

In [None]:
# Rank the models from most to least accurate and index the table by model name.
results = (
    results
    .sort_values(by='Accuracy', ascending=False)
    .set_index('Model')
)

In [None]:
# Bar chart of test accuracy per model.
ax = results.plot.bar(
    title='Model Results',
    figsize=(7, 5),
    legend=True,
    fontsize=13,
)