#                     Hotel Booking Cancellation EDA and Prediction

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, plot_confusion_matrix

from sklearn.neural_network import MLPClassifier


import folium
from folium.plugins import HeatMap
import plotly.express as px

plt.style.use('fivethirtyeight')
%matplotlib inline
pd.set_option('display.max_columns', 32)

In [None]:
# reading data: load the bookings CSV into `df` and preview the first rows
df = pd.read_csv('../input/hotel-booking-demand/hotel_bookings.csv')
df.head()

In [None]:
# summary statistics of df's columns
df.describe()

In [None]:
# column dtypes and non-null counts
df.info()

In [None]:
# checking for null values: count and share (%) of missing entries per column

missing_counts = df.isna().sum()
null = pd.DataFrame({"Null Values": missing_counts,
                     'Percentage': missing_counts / df.shape[0] * 100})
# keep only the columns that actually have something missing
null = null.loc[null['Percentage'] > 0]
null

In [None]:
# filling null values with zero
# Every missing entry in every column of df is replaced by 0, in place.

df.fillna(0, inplace=True)

In [None]:
# visualizing null values
# NOTE(review): df.fillna(0) runs in the preceding cell, so when the cells are
# executed in order this chart can no longer show any gaps -- consider
# plotting before filling.

msno.bar(df)
plt.show();

In [None]:
# Flag bookings that have no guests at all (zero children, adults and babies).
# NOTE: the name shadows the built-in `filter`; it is kept because the cell
# that removes these rows reads it.
filter = (df[['children', 'adults', 'babies']] == 0).all(axis=1)
df.loc[filter].head(2)

In [None]:
# Keep only the rows NOT flagged by `filter`, then preview the result.
df = df.loc[~filter]
df.head(2)

In [None]:
# Absolute correlation of every numeric column with the target, strongest first.
# Numeric columns are selected explicitly: df still holds string columns here,
# and DataFrame.corr() raises on those in pandas >= 2.0 instead of silently
# skipping them as older versions did.
numeric_df = df.select_dtypes(include='number')
correlation = numeric_df.corr()['is_canceled'].abs().sort_values(ascending=False)
correlation

In [None]:

# Columns excluded from modelling; each label is listed exactly once.
useless_col = ['days_in_waiting_list', 'arrival_date_year',
               'assigned_room_type', 'booking_changes',
               'reservation_status', 'country']

# Rebind instead of dropping in place: df is a boolean-mask selection of the
# loaded frame, and mutating such a selection in place is what triggers
# pandas' SettingWithCopyWarning.
df = df.drop(columns=useless_col)

df.head()

In [None]:
# creating numerical and categorical dataframes
# cat_cols: names of all object-dtype columns of df, in column order

cat_cols = [name for name, dtype in df.dtypes.items() if dtype == 'O']
cat_cols

In [None]:
# Independent copy of the categorical columns. Later cells add and overwrite
# columns on cat_df; without .copy() those writes target a selection of df,
# which is chained assignment (SettingWithCopyWarning, and not guaranteed to
# behave the same across pandas versions).
cat_df = df[cat_cols].copy()
cat_df.head()

In [None]:
# Break reservation_status_date into numeric year / month / day columns,
# then drop the raw date column together with arrival_date_month.
status_date = pd.to_datetime(cat_df['reservation_status_date'])

for part in ('year', 'month', 'day'):
    cat_df[part] = getattr(status_date.dt, part)

cat_df.drop(columns=['reservation_status_date', 'arrival_date_month'],
            inplace=True)

cat_df.head()

In [None]:
# printing unique values of each column

for name, values in cat_df.items():
    print(f"{name}: \n{values.unique()}\n")

In [None]:
# Integer code tables for the categorical features, keyed by column name.
# Series.map() turns any value that is absent from its table into NaN.
category_codes = {
    'hotel': {'Resort Hotel': 0, 'City Hotel': 1},
    'meal': {'BB': 0, 'FB': 1, 'HB': 2, 'SC': 3, 'Undefined': 4},
    'market_segment': {
        'Direct': 0,
        'Corporate': 1,
        'Online TA': 2,
        'Offline TA/TO': 3,
        'Complementary': 4,
        'Groups': 5,
        'Undefined': 6,
        'Aviation': 7,
    },
    'distribution_channel': {
        'Direct': 0,
        'Corporate': 1,
        'TA/TO': 2,
        'Undefined': 3,
        'GDS': 4,
    },
    'reserved_room_type': {
        'C': 0, 'A': 1, 'D': 2, 'E': 3, 'G': 4, 'F': 5, 'H': 6, 'L': 7, 'B': 8,
    },
    # NOTE(review): code 2 is skipped for deposit_type -- presumably unintended; confirm.
    'deposit_type': {'No Deposit': 0, 'Refundable': 1, 'Non Refund': 3},
    'customer_type': {'Transient': 0, 'Contract': 1, 'Transient-Party': 2, 'Group': 3},
    'year': {2015: 0, 2014: 1, 2016: 2, 2017: 3},
}

# Replace each listed column by its integer codes.
for column, codes in category_codes.items():
    cat_df[column] = cat_df[column].map(codes)

In [None]:
# preview the encoded categorical frame
cat_df.head()

In [None]:
# Numerical features: every column of df that is neither listed in cat_cols
# nor the target 'is_canceled'.
num_df = df.drop(columns=cat_cols + ['is_canceled'])
num_df.head()

In [None]:
# per-column variance of the numerical features
num_df.var() 

In [None]:

# Histogram (no KDE curve) of every numerical column, one per cell of a
# 5x4 subplot grid.
fig = plt.figure(figsize=(18, 16))
for position, col in enumerate(num_df.columns, start=1):
    plt.subplot(5, 4, position)
    sns.distplot(num_df[col].dropna(), kde=False)
fig.tight_layout(pad=1.0)

In [None]:
# Apply log(x + 1) to the selected columns; the +1 maps 0 to 0.
# np.log yields NaN for inputs below -1 (after the shift: negative numbers).
log_scaled_cols = ['lead_time', 'arrival_date_week_number',
                   'arrival_date_day_of_month', 'agent', 'company', 'adr']

for col in log_scaled_cols:
    num_df[col] = np.log(num_df[col] + 1)

In [None]:
# Redraw the per-column histograms (no KDE curve) on a 5x4 subplot grid.
fig = plt.figure(figsize=(18, 16))
for position, col in enumerate(num_df.columns, start=1):
    plt.subplot(5, 4, position)
    sns.distplot(num_df[col].dropna(), kde=False)
fig.tight_layout(pad=1.0)

In [None]:
# per-column variance of the numerical features
num_df.var()

In [None]:
# Replace missing 'adr' entries with the mean of that column.
adr_mean = num_df['adr'].mean()
num_df['adr'] = num_df['adr'].fillna(adr_mean)
num_df.head()

In [None]:
# Feature matrix: encoded categorical columns followed by the numerical ones,
# placed side by side. Target: the 'is_canceled' column of df.
X = pd.concat(objs=[cat_df, num_df], axis='columns')
y = df.is_canceled
X.shape, y.shape

In [None]:
# Hold out 30% of the rows for evaluation. The fixed seed makes the split --
# and therefore every score computed from it -- reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,
                                                    random_state = 42)

# MLPs are sensitive to feature scale, so standardise every feature. The scaler
# is fitted on the training rows only, so no test-set statistics leak into
# training; the results are wrapped back into DataFrames so column names and
# row index are preserved.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train),
                       columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test),
                      columns=X_test.columns, index=X_test.index)

X_train.head()

In [None]:
# preview the held-out features
X_test.head()

In [None]:
# preview the training and held-out targets
y_train.head(), y_test.head()

In [None]:
                                # Model Building

In [None]:
# MLPClassifier with hidden layers (50, 100, 50), trained with Adam

mlp = MLPClassifier(hidden_layer_sizes=(50, 100, 50), solver='adam',
                    alpha=1e-5, random_state=1)
mlp.fit(X_train, y_train)

# Score the fitted network on the held-out rows.
y_pred_mlp = mlp.predict(X_test)
acc_mlp = accuracy_score(y_test, y_pred_mlp)
conf = confusion_matrix(y_test, y_pred_mlp)
clf_report = classification_report(y_test, y_pred_mlp)

for line in (f"Accuracy Score of MLP Classifier is : {acc_mlp}",
             f"Confusion Matrix : \n{conf}",
             f"Classification Report : \n{clf_report}"):
    print(line)

In [None]:
# Same experiment with a wider last hidden layer: (50, 100, 100).
mlp = MLPClassifier(hidden_layer_sizes=(50, 100, 100), solver='adam',
                    alpha=1e-5, random_state=1)
mlp.fit(X_train, y_train)

# Score the fitted network on the held-out rows.
y_pred_mlp = mlp.predict(X_test)
acc_mlp = accuracy_score(y_test, y_pred_mlp)
conf = confusion_matrix(y_test, y_pred_mlp)
clf_report = classification_report(y_test, y_pred_mlp)

print(f"Accuracy Score of MLP Classifier is : {acc_mlp}",
      f"Confusion Matrix : \n{conf}",
      f"Classification Report : \n{clf_report}",
      sep="\n")

In [None]:
# Random Search Optimizer

from sklearn.model_selection import RandomizedSearchCV

# Candidate hyper-parameter values; the search samples 15 combinations.
# NOTE(review): scikit-learn documents `learning_rate` as used only when
# solver='sgd', so with 'adam' both values presumably give the same model --
# confirm and consider removing it from the search space.
parameter_space = dict(
    hidden_layer_sizes=[(50, 100, 50), (96, 144, 192)],
    activation=['tanh', 'relu'],
    solver=['adam'],
    alpha=[0.001, 0.05],
    beta_1=[0.93, 0.94],
    beta_2=[0.993, 0.994],
    learning_rate=['constant', 'adaptive'],
)

mlp = MLPClassifier(max_iter=1000, random_state=42)

# Both metrics are tracked during 3-fold CV; the final model is refitted
# using the candidate with the best precision.
score = ['accuracy', 'precision']
clf = RandomizedSearchCV(
    estimator=mlp,
    param_distributions=parameter_space,
    n_iter=15,
    scoring=score,
    refit='precision',
    cv=3,
    n_jobs=-1,
    random_state=0,
)

In [None]:
# run the randomized hyper-parameter search on the training data
clf.fit(X_train, y_train)

In [None]:
# The search was built with refit='precision', so clf.score() would return
# precision, not accuracy. Compute accuracy explicitly from the refitted
# model's predictions so the printed label matches the number.
score = accuracy_score(y_test, clf.predict(X_test))
print("Validation Accuracy",score*100,"%")

In [None]:
# confusion matrix of the tuned search estimator on the held-out rows
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; ConfusionMatrixDisplay.from_estimator is the replacement --
# confirm the target scikit-learn version before migrating.
plot_confusion_matrix(clf, X_test, y_test)