# Task: To predict hotel cancellations using a Keras-based neural network.

## Original hotel booking demand datasets by authors Nuno Antonio, Ana de Almeida, and Luis Nunes available at:

### https://www.sciencedirect.com/science/article/pii/S2352340918315191

## The same neural network is run, but this time using TF 2.0 standards.

In [None]:
import tensorflow as tf
# Show which TensorFlow version is installed; this notebook is written
# against the TF 2.x Keras API.
print(tf.__version__)

## Import libraries

In [None]:
import os
import csv
import random
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier

## Import training dataset H1 and sort by year and week number.

In [None]:
# Load the H1 hotel bookings that are used to train the model.
train_df = pd.read_csv('H1.csv')

# DataFrame.sort_values returns a new frame instead of sorting in place, so
# the result has to be assigned back; otherwise the chronological ordering
# (year, then week number) is silently thrown away.
train_df = train_df.sort_values(
    ['ArrivalDateYear', 'ArrivalDateWeekNumber'],
    ascending=True,
)

# Last expression of the cell: display the sorted frame.
train_df

## Dependent variable (y). Cancellation by customer = 1, no cancellation by customer = 0.

In [None]:
# Target vector: the IsCanceled column of the training frame
# (1 = cancelled, 0 = not cancelled). Both names refer to the same Series.
y = IsCanceled = train_df['IsCanceled']

## Features (or independent variables) hypothesised to influence hotel cancellations.

In [None]:
# Pull every candidate numeric feature out of the training frame as its own
# Series. The trailing numbers are the reference numbers carried over from
# the original per-line comments.
(
    leadtime,                      # 1
    staysweekendnights,            # 2
    staysweeknights,               # 3
    adults,                        # 4
    children,                      # 5
    babies,                        # 6
    isrepeatedguest,               # 11
    previouscancellations,         # 12
    previousbookingsnotcanceled,   # 13
    bookingchanges,                # 16
    agent,                         # 18
    company,                       # 19
    dayswaitinglist,               # 20
    adr,                           # 22
    rcps,                          # 23
    totalsqr,                      # 24
) = (
    train_df[column]
    for column in (
        'LeadTime',
        'StaysInWeekendNights',
        'StaysInWeekNights',
        'Adults',
        'Children',
        'Babies',
        'IsRepeatedGuest',
        'PreviousCancellations',
        'PreviousBookingsNotCanceled',
        'BookingChanges',
        'Agent',
        'Company',
        'DaysInWaitingList',
        'ADR',
        'RequiredCarParkingSpaces',
        'TotalOfSpecialRequests',
    )
)

## Categorical variables - variables that do not have an interval scale, e.g. 1-100.

### cat.codes is used to encode these categorical variables: each column is first declared as a category and its levels are then mapped to integer codes, because assigning numbers to the values without declaring the column as a category would lead to Python treating the variable as interval.

In [None]:
# Encode each categorical column of the training frame as integer codes:
# the column is cast to the pandas "category" dtype and .cat.codes yields
# one integer per row identifying its category.
mealcat = pd.Series(train_df['Meal'].astype("category").cat.codes)
countrycat = pd.Series(train_df['Country'].astype("category").cat.codes)
marketsegmentcat = pd.Series(
    train_df['MarketSegment'].astype("category").cat.codes
)
distributionchannelcat = pd.Series(
    train_df['DistributionChannel'].astype("category").cat.codes
)
reservedroomtypecat = pd.Series(
    train_df['ReservedRoomType'].astype("category").cat.codes
)
assignedroomtypecat = pd.Series(
    train_df['AssignedRoomType'].astype("category").cat.codes
)
deposittypecat = pd.Series(
    train_df['DepositType'].astype("category").cat.codes
)
customertypecat = pd.Series(
    train_df['CustomerType'].astype("category").cat.codes
)
reservationstatuscat = pd.Series(
    train_df['ReservationStatus'].astype("category").cat.codes
)

## minmax_scale is used to scale the relevant independent variable (in this case, lead time) to a value between 0 and 1.

### If the variables in the neural network do not have a common scale, then it will increase the likelihood of incorrect interpretations by the neural network.

In [None]:
from sklearn.preprocessing import minmax_scale
# Rescale the training lead times onto the [0, 1] interval (spelled out
# here via feature_range) and replace the raw Series with the scaled values.
leadtime = minmax_scale(leadtime, feature_range=(0, 1))

# Last expression of the cell: display the scaled values.
leadtime

## The selected independent variables (lead time, deposit type and country) are stacked together using numpy, and a constant column is added.

In [None]:
# Build the training design matrix: scaled lead time, deposit-type code and
# country code side by side, with a constant column placed in front.
x1 = sm.add_constant(
    np.column_stack((leadtime, deposittypecat, countrycat)),
    prepend=True,
)

## A train-test split is used to partition the data into a training set (used to fit the model) and a validation set (used to check the model's predictions against the actual cancellation incidences, so that the model can be adjusted accordingly).

### The actual test data (or unseen data) is the H2 dataset - you will use this soon to test the model predictions against the actual cancellations for H2.

In [None]:
# Hold out part of H1 for validation. The split proportions are left at
# train_test_split's defaults; random_state pins the shuffle so the split
# is reproducible.
x1_train, x1_test, y1_train, y1_test = train_test_split(
    x1,
    y,
    random_state=0,
)

## In TF v1.0, Sequential, Dense, and KerasRegressor were being imported separately. In TF 2.0, models and layers are being imported from keras, and the above no longer need to be manually imported.

In [None]:
from tensorflow.keras import models
from tensorflow.keras import layers

### 'sigmoid' is used as the activation function for the output node given that this is a classification problem.

In [None]:
# Two-layer feed-forward classifier:
#   * hidden layer - 8 ReLU units fed by the 4 input columns
#     (the constant plus the three stacked features);
#   * output layer - a single sigmoid unit producing a value in (0, 1).
model = models.Sequential([
    layers.Dense(8, activation='relu', input_shape=(4,)),
    layers.Dense(1, activation='sigmoid'),
])

## The adam optimizer is used to train the model, and the binary_crossentropy is used as the loss function.

### Loss = degree of error between the predicted and actual values.

### 500 epochs are used to train the model. One epoch is one complete pass (forward and backward) over the whole training set.

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, and
# accuracy tracked under the key 'acc' (the plots below rely on that key).
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

# Fit on the training partition and evaluate on the held-out partition
# after every epoch; the per-epoch metrics are kept in `history`.
history = model.fit(
    x=x1_train,
    y=y1_train,
    batch_size=512,
    epochs=500,
    validation_data=(x1_test, y1_test),
)

## Here is a plot of the train and validation loss.

In [None]:
import matplotlib.pyplot as plt
# List the metric names that were recorded during training.
print(history.history.keys())

# Draw training loss first, then validation loss, so the curves line up
# with the legend entries below.
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve])

plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

## This is a plot of the train vs. validation accuracy.

In [None]:
# Draw training accuracy first, then validation accuracy, so the curves
# line up with the legend entries below.
for curve in ('acc', 'val_acc'):
    plt.plot(history.history[curve])

plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('acc')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

## Here are the predictions from the model.

In [None]:
# Score the held-out H1 rows with the fitted network; the sigmoid output
# gives one value between 0 and 1 per row.
pred = model.predict(x=x1_test)

## AUC (or area under the curve) is used to assess the classifier accuracy.

### An AUC of 0.5 means that the model performs no better than random guessing. A value above 0.5 means that the model has predictive power.

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

# Compute the ROC curve once and reuse it for the plot (the original cell
# evaluated roc_curve twice with identical arguments).
falsepos, truepos, thresholds = roc_curve(y1_test, pred)

# Aliases kept so that code expecting the fpr/tpr names keeps working.
fpr, tpr = falsepos, truepos

# The score is stored as `roc_auc` rather than `auc`, so it does not
# overwrite the `auc` function imported from sklearn.metrics in this cell.
roc_auc = roc_auc_score(y1_test, pred)
print('AUC: %.3f' % roc_auc)

# Diagonal reference line first, then the model's ROC curve.
plt.plot([0, 1], [0, 1], linestyle='-')
plt.plot(falsepos, truepos, marker='.')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()

## H2 - TEST SET WITH UNSEEN DATA

### Now, data from H2.csv (the second hotel in the study) is imported and processed. As you will see in the exercise below, your task is to apply the model that has already been built to this new data, generate new classifications and compare these to the existing records in H2.

In [None]:
# Load the H2 bookings, which serve as the unseen test set.
h2data = pd.read_csv('H2.csv')

# Keep a preview of the first five rows and display it as the cell output.
a = h2data.head(n=5)
a

In [None]:
# Pull the same candidate features out of the H2 frame, one Series each,
# using a t_ prefix to keep them apart from the H1 variables. The trailing
# numbers are the reference numbers carried over from the original
# per-line comments.
(
    t_leadtime,                      # 1
    t_staysweekendnights,            # 2
    t_staysweeknights,               # 3
    t_adults,                        # 4
    t_children,                      # 5
    t_babies,                        # 6
    t_isrepeatedguest,               # 11
    t_previouscancellations,         # 12
    t_previousbookingsnotcanceled,   # 13
    t_bookingchanges,                # 16
    t_agent,                         # 18
    t_company,                       # 19
    t_dayswaitinglist,               # 20
    t_adr,                           # 22
    t_rcps,                          # 23
    t_totalsqr,                      # 24
) = (
    h2data[column]
    for column in (
        'LeadTime',
        'StaysInWeekendNights',
        'StaysInWeekNights',
        'Adults',
        'Children',
        'Babies',
        'IsRepeatedGuest',
        'PreviousCancellations',
        'PreviousBookingsNotCanceled',
        'BookingChanges',
        'Agent',
        'Company',
        'DaysInWaitingList',
        'ADR',
        'RequiredCarParkingSpaces',
        'TotalOfSpecialRequests',
    )
)

In [None]:
def _encode_with_training_categories(column):
    """Return integer codes for ``h2data[column]`` using H1's categories.

    Deriving the codes from H2 on its own would number the categories by
    H2's own set of values, so the same value (for example a country) could
    receive a different integer than it had when the model was trained on
    H1. Casting H2 to the categorical dtype built from ``train_df[column]``
    keeps the value -> code mapping identical across both datasets.

    Values that do not occur in ``train_df[column]`` are not valid
    categories of that dtype and therefore get the code -1, the same code
    pandas uses for missing values.
    """
    training_dtype = train_df[column].astype("category").dtype
    return h2data[column].astype(training_dtype).cat.codes


t_mealcat = _encode_with_training_categories('Meal')
t_countrycat = _encode_with_training_categories('Country')
t_marketsegmentcat = _encode_with_training_categories('MarketSegment')
t_distributionchannelcat = _encode_with_training_categories('DistributionChannel')
t_reservedroomtypecat = _encode_with_training_categories('ReservedRoomType')
t_assignedroomtypecat = _encode_with_training_categories('AssignedRoomType')
t_deposittypecat = _encode_with_training_categories('DepositType')
t_customertypecat = _encode_with_training_categories('CustomerType')
t_reservationstatuscat = _encode_with_training_categories('ReservationStatus')

In [None]:
# Replace the H2 lead-time values with a standalone NumPy array copy.
t_leadtime = np.array(t_leadtime, copy=True)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Scale the H2 lead times with the min/max of the H1 lead times, i.e. the
# same mapping the model saw during training. Scaling H2 by its own range
# would map the same number of days to a different feature value than in
# training. As a consequence, H2 values outside the H1 range fall outside
# [0, 1].
_leadtime_scaler = MinMaxScaler().fit(
    train_df[['LeadTime']].to_numpy(dtype=float)
)
lttest = _leadtime_scaler.transform(
    np.asarray(t_leadtime, dtype=float).reshape(-1, 1)
).ravel()  # back to 1-D, matching what minmax_scale returns for 1-D input

# Last expression of the cell: display the scaled values.
lttest

In [None]:
# Build the H2 design matrix with the same column layout as the training
# matrix: constant first, then scaled lead time, deposit-type code and
# country code.
t1 = sm.add_constant(
    np.column_stack((lttest, t_deposittypecat, t_countrycat)),
    prepend=True,
)

In [None]:
# Actual H2 outcomes: IsCanceled keeps the Series, b holds its underlying
# array of values for comparison against the model's predictions.
IsCanceled = h2data['IsCanceled']
b = IsCanceled.values

# EXERCISE

## Using the model that has been built, generate new predictions for H2, plot the ROC curve (true vs. false positive rate) and compute the AUC.

### Hint: You must use the model.predict command on the new data you are trying to predict, and then generate the ROC curve and AUC in the same manner as before by comparing the predictions to the actual data.