In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree lazily and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, entry)
    for folder, _subfolders, entries in os.walk('/kaggle/input')
    for entry in entries
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the hotel bookings CSV from the Kaggle input directory into a DataFrame
hotel_review = pd.read_csv('/kaggle/input/hotel-booking-demand/hotel_bookings.csv')

In [None]:
# Preview the first rows of the dataset
hotel_review.head()

In [None]:
# Preview the last rows of the dataset
hotel_review.tail()

In [None]:
# Summarise the column dtypes and non-null counts; columns with missing values show a lower count
hotel_review.info()

In [None]:
# Separate numerical and categorical columns:
# object-dtype columns are treated as categorical, everything else as numerical.
categorical_columns = [
    name for name in hotel_review.columns
    if hotel_review[name].dtype == 'object'
]
numerical_columns = [
    name for name in hotel_review.columns
    if hotel_review[name].dtype != 'object'
]

In [None]:
hotel_review.describe() # summary statistics; with default arguments only the numeric columns are described

In [None]:
hotel_review[categorical_columns].describe()# summary statistics for the object-dtype (categorical) columns collected above

In [None]:
# CHECKING FOR MISSING VALUES
# The info() output earlier showed that a few columns have missing values.
# Count the missing values in every column.
hotel_review.isna().sum()

Company values cannot be filled in accurately with any preprocessing technique, so we drop the company column. We do the same with agent, because it has a large number of missing observations. Finally, for the country column we remove only the rows with missing values, since dropping the whole column could have a significant impact on the analysis.

In [None]:
# Take a closer look at the columns with many missing values:
# correlate agent and company with the cancellation flag.
columns_under_review = ['is_canceled', 'agent', 'company']
check_for_corelation = hotel_review.loc[:, columns_under_review]
check_for_corelation.corr()

In [None]:
# Drop the agent and company columns, then remove every row that still
# contains a missing value, and show the resulting shape.
columns_to_discard = ['agent', 'company']
hotel_review.drop(labels=columns_to_discard, axis='columns', inplace=True)
hotel_review.dropna(axis='index', inplace=True)
hotel_review.shape

In [None]:
# Show the country column without missing entries. The result is not assigned,
# so this does not modify hotel_review; rows with any missing value were
# already removed by the earlier dropna(axis=0, inplace=True) call.
hotel_review.country.dropna()

In [None]:
# Count the missing values left in the country column
hotel_review.country.isna().sum()

In [None]:
# Work on a copy of the data for the correlation check, so the encodings
# applied to it do not touch hotel_review.
from sklearn.preprocessing import LabelEncoder, StandardScaler
corelation_of_data = hotel_review.copy()
# Single encoder instance; it is re-fitted on every column it is applied to.
le = LabelEncoder()

In [None]:
# Label-encode the string columns of the copy so they can take part in the
# correlation matrix. Each column is encoded from its OWN values.
#
# Fix: the previous version assigned the encoded `deposit_type` to both
# `reservation_status_date` and `is_canceled`. That overwrote the target with a
# copy of deposit_type and made all three columns look perfectly correlated.
# `is_canceled` is now left untouched, and `reservation_status_date` is encoded
# from its own values. Any conclusion drawn from that artificial perfect
# correlation should be re-checked against the new output.
columns_to_encode = [
    'meal',
    'distribution_channel',
    'reserved_room_type',
    'assigned_room_type',
    'customer_type',
    'reservation_status',
    'market_segment',
    'deposit_type',
    'reservation_status_date',
]
for column in columns_to_encode:
    corelation_of_data[column] = le.fit_transform(corelation_of_data[column])

In [None]:
# Correlation heatmap over the numeric columns only. Some columns of this copy
# are still strings (they were not label-encoded in the previous cell) and
# cannot be correlated; recent pandas versions raise on them instead of
# silently skipping them, so they are filtered out explicitly.
plt.figure(figsize=(20,10))
sns.heatmap(corelation_of_data.select_dtypes(include='number').corr(),annot=True,cmap='viridis')

In [None]:
# Correlation of every numeric column with is_canceled, strongest positive first.
# Non-numeric columns are excluded explicitly because corr() cannot handle
# string columns on recent pandas versions.
corelation_of_data.select_dtypes(include='number').corr()['is_canceled'].sort_values(ascending=False)

As we can see, reservation_status_date and deposit_type are perfectly correlated with is_canceled, which is our dependent variable. We need to take care of them separately later; to get better results, we will handle this at modeling time.

In [None]:
# Bar chart of each numeric column's correlation with is_canceled.
# The target's correlation with itself (always 1) is dropped by label; the
# previous positional slice [:-1] removed whichever column happened to be
# last instead. Non-numeric columns are excluded because corr() cannot
# handle string columns on recent pandas versions.
is_canceled_corr = corelation_of_data.select_dtypes(include='number').corr()['is_canceled']
is_canceled_corr.drop('is_canceled').sort_values().plot(kind='bar')

EXPLORATORY DATA ANALYSIS

In [None]:
# Take a closer look at the distinct values held by individual attributes,
# starting with reservation_status
hotel_review.reservation_status.unique()

In [None]:
# Distinct customer types present in the data
hotel_review.customer_type.unique()

In [None]:
# Number of bookings per customer type
hotel_review.customer_type.value_counts()

In [None]:
# Bar plot of adr per hotel type, split by cancellation flag.
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title('Cancellation by ADR & Hotel Type')
sns.barplot(data=hotel_review, x='hotel', y='adr', hue='is_canceled', ax=ax)
plt.show()

In [None]:
# Count of bookings per market segment, split by cancellation flag.
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title('Cancellation by Market Segments')
# Tick rotation is requested before plotting, in the same order as before.
plt.xticks(rotation=45)
sns.countplot(data=hotel_review, x='market_segment', hue='is_canceled', ax=ax)
plt.show()

CHECKING FOR THE BUSIEST MONTH IN A YEAR

In [None]:
# Number of bookings per arrival month
hotel_review.arrival_date_month.value_counts()

In [None]:
# Bar plot of adr per arrival month, one bar per hotel type.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='arrival_date_month', y='adr', hue='hotel', data=hotel_review, ax=ax)

In [None]:
# Total adr per (arrival month, hotel) pair.
# Only the adr column is aggregated: summing the whole frame first would also
# aggregate every other column, including the string ones, just to throw
# those results away.
most_occupied_month_price = hotel_review.groupby(['arrival_date_month','hotel'])['adr'].sum()
most_occupied_month_price

As we can see, August and July are the most occupied booking months.

In [None]:
# Next, look at the number and kinds of guests per booking.
# Adults and children are combined into a single 'Family' head count, as their
# expense is treated as roughly the same; babies are excluded.
hotel_review['Family'] = hotel_review['adults'].add(hotel_review['children'])

In [None]:
# Drop the original guest-count columns now that 'Family' exists.
guest_count_columns = ['adults', 'children', 'babies']
hotel_review.drop(labels=guest_count_columns, axis='columns', inplace=True)

In [None]:
# Cast the combined head count to an integer dtype
hotel_review['Family'] = hotel_review['Family'].astype(int)

Variation in price with respect to hotels

In [None]:
# Check which hotel type has more cancellations; this cell covers City Hotel only.
# Normalised counts give the fraction of City Hotel bookings per is_canceled value.
is_city_hotel = hotel_review['hotel'] == 'City Hotel'
hotel_review.loc[is_city_hotel, 'is_canceled'].value_counts(normalize=True)

In [None]:
# Cancellation with respect to lead time: lead_time per hotel type,
# split by cancellation flag.
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title('Cancellation by Lead Time')
sns.barplot(data=hotel_review, x='hotel', y='lead_time', hue='is_canceled', ax=ax)
plt.show()

* MODEL PRE-PROCESSING
Converting the categorical features in the columns into numerical values, so that it is easier and faster for the algorithm to learn their characteristics

In [None]:
# Convert the hotel type and the arrival month name into numeric codes.
hotel_codes = {'Resort Hotel': 0, 'City Hotel': 1}
month_names = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
               'August', 'September', 'October', 'November', 'December']
# Months are numbered 1..12 in calendar order.
month_codes = {month: number for number, month in enumerate(month_names, start=1)}

hotel_review['hotel'] = hotel_review['hotel'].map(hotel_codes)
hotel_review['arrival_date_month'] = hotel_review['arrival_date_month'].map(month_codes)

In [None]:
# Number of distinct countries in the data
hotel_review.country.nunique()

In [None]:
# Number of bookings per Family head count
hotel_review.Family.value_counts()

In [None]:
# Number of bookings per deposit type
hotel_review.deposit_type.value_counts()

In [None]:
# As discussed earlier, columns that are highly correlated with the target would
# give misleading results, so they are dropped in the following cells.
# List the current columns first.
hotel_review.columns

In [None]:
# Remove the reservation_status_date column in place.
hotel_review.drop(columns=['reservation_status_date'], inplace=True)

In [None]:
# Number of bookings per reservation status
hotel_review.reservation_status.value_counts()

In [None]:
# Remove the reservation_status column in place.
hotel_review.drop(labels=['reservation_status'], axis='columns', inplace=True)

In [None]:
# Label-encode the remaining feature columns in place, re-fitting the shared
# encoder on each column in turn.
# NOTE(review): adr is used as a numeric value in the plots earlier in the
# notebook; label-encoding replaces it with ordinal codes — confirm this is
# intended.
columns_to_label_encode = [
    'country',
    'deposit_type',
    'adr',
    'market_segment',
    'meal',
    'distribution_channel',
    'reserved_room_type',
    'assigned_room_type',
    'customer_type',
]
for feature in columns_to_label_encode:
    hotel_review[feature] = le.fit_transform(hotel_review[feature])

In [None]:
# Rows and columns remaining after preprocessing
hotel_review.shape

In [None]:
# APPLYING MACHINE LEARNING MODELS
import statsmodels.formula.api as smf

from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix, auc
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.neural_network import MLPClassifier

# Silence all Python warnings from this point on
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
# The cancellation flag is the target; every other column is a feature.
X = hotel_review.drop(columns="is_canceled")
y = hotel_review["is_canceled"]

# Hold out 30 percent of the rows for testing and train on the remaining 70 percent.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [None]:
# Decision tree limited to depth 10. random_state is fixed (same seed as the
# train/test split) so that repeated runs build the same tree and report the
# same scores.
tree = DecisionTreeClassifier(max_depth = 10, random_state = 42)

In [None]:
# Fit the decision tree on the training split
tree_model = tree.fit(X_train, y_train)

In [None]:
# Score the decision tree on the held-out test split.
y_pred = tree_model.predict(X_test)
tree_accuracy = accuracy_score(y_test, y_pred)
tree_confusion = confusion_matrix(y_test, y_pred)

print('Decision Tree Model')
print('Accuracy Score: {}\n\nConfusion Matrix:\n {}'.format(tree_accuracy, tree_confusion))

In [None]:
# APPLYING RANDOM FOREST
# random_state is fixed (same seed as the train/test split) so that the
# forest, its predictions and its feature importances are reproducible
# between runs.
rf_model = RandomForestClassifier(min_samples_leaf = 6, min_samples_split=6,
                                  n_estimators = 100, random_state = 42)

# Fit the model; fit() returns the fitted estimator itself, so `estimator`
# refers to the same object as rf_model.
estimator= rf_model.fit(X_train, y_train)
# Predict on the held-out test split
predict_rf = rf_model.predict(X_test)

In [None]:
# Confusion matrix of the random-forest predictions against the test labels
RF_matrix = confusion_matrix(y_test, predict_rf)

In [None]:
# Plot the random-forest confusion matrix as an annotated heatmap.
# confusion_matrix puts the true labels on the rows and the predicted labels
# on the columns, so the axes are labelled accordingly.
# Fix: `ax = plt.plot()` returned an empty list rather than an Axes; a real
# figure/axes pair is created and passed to seaborn instead.
RF_matrix = confusion_matrix(y_test, predict_rf)
fig, ax = plt.subplots()
sns.heatmap(RF_matrix, annot=True, fmt="d", cbar=False, cmap="Pastel2", ax=ax)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Random Forest confusion matrix')
plt.show()

In [None]:
# Feature importances of the fitted random forest, in the column order of X
rf_model.feature_importances_

In [None]:
# Print every feature name next to its random-forest importance.
importances = rf_model.feature_importances_
for position, feature_name in enumerate(X.columns):
    print(feature_name, "=", importances[position])

In [None]:
# MODELLING WITH EXTREME GRADIENT BOOSTING
# Wrap the train and test splits, together with their labels, in xgboost DMatrix objects
D_train = xgb.DMatrix(X_train, label=y_train)
D_test = xgb.DMatrix(X_test, label=y_test)

In [None]:
# XGBoost training parameters.
# Fix: num_class was 3, but the target is_canceled is binary (the notebook's
# own neural network uses a single sigmoid output for it). With 3 classes the
# booster fits a class that never occurs. 'multi:softprob' is kept so
# predict() still returns one probability per class, as the argmax step in
# the prediction cell expects.
param = {
    'eta': 0.3,                       # learning rate (step-size shrinkage)
    'max_depth': 3,                   # maximum depth of each tree
    'objective': 'multi:softprob',    # per-class probabilities
    'num_class': 2}                   # not canceled / canceled

steps = 20  # The number of training iterations

In [None]:
# Train a booster on D_train for `steps` boosting rounds using `param`
model = xgb.train(param, D_train, steps)

In [None]:
# Predict class probabilities for the test split and take the most probable
# class of each row as the prediction.
preds = model.predict(D_test)
best_preds = np.argmax(preds, axis=1)

# Macro-averaged precision and recall, plus plain accuracy.
xgb_precision = precision_score(y_test, best_preds, average='macro')
xgb_recall = recall_score(y_test, best_preds, average='macro')
xgb_accuracy = accuracy_score(y_test, best_preds)

print("Precision = {}".format(xgb_precision))
print("Recall = {}".format(xgb_recall))
print("Accuracy = {}".format(xgb_accuracy))

In [None]:
# first neural network with keras 
from numpy import loadtxt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
# Build a small fully connected binary classifier.
# NOTE: `model` is rebound here; it previously referred to the XGBoost booster.

# Import `Sequential` from `keras.models`
from keras.models import Sequential

# Import `Dense` from `keras.layers`
from keras.layers import Dense

# Initialize the constructor
model = Sequential()

# Add an input layer.
# The input width is taken from the training data instead of the hard-coded
# 25, so the network keeps matching X_train if the set of feature columns
# changes during preprocessing.
model.add(Dense(12, activation='relu', input_shape=(X_train.shape[1],)))

# Add one hidden layer 
model.add(Dense(8, activation='relu'))

# Add an output layer: a single sigmoid unit for the binary target
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Compile for binary classification: binary cross-entropy loss, Adam optimizer, accuracy metric
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Train for 5 epochs with one sample per batch.
# NOTE(review): batch_size=1 means one weight update per row, which is likely
# very slow on the full training split — confirm this is intended.
model.fit(X_train, y_train,epochs=5, batch_size=1, verbose=1)

In [None]:
# Predict on the test split with the neural network.
# NOTE: this rebinds y_pred, which earlier held the decision-tree predictions.
y_pred = model.predict(X_test)

In [None]:
# Evaluate the neural network on the test split.
# Given the compile settings used in this notebook, `score` presumably holds
# [loss, accuracy] — verify against the printed output.
score = model.evaluate(X_test, y_test,verbose=1)

print(score)

The neural nets can be trained, and effective losses can be calculated.
Overall the Random Forest algorithm provides the best fit for better decision making on the parameters given.


The idea here is to understand the questions and train the model accordingly, whereas exploratory data analysis gives us a clear idea of the types of data and how they are measured.
Domain experience can also play an important role in the analysis; therefore, looking at past and present trends, we should understand the behavior of the model and then make meaningful decisions based on it.
Finally, we need to be transparent about what matters more for our objective, precision or recall, as there is a trade-off between the two in most cases.
