# Task: To predict hotel cancellations using a Keras-based neural network.

## Original hotel booking demand datasets by authors Nuno Antonio, Ana de Almeida, and Luis Nunes available at:

### https://www.sciencedirect.com/science/article/pii/S2352340918315191

# What is Feature Selection?

## Feature Selection is a process of selecting the features (or independent variables) that are hypothesised to have the greatest influence on hotel cancellations.

## Import Libraries

In [1]:
import os
import csv
import random
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import MinMaxScaler
import os;

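## Import training dataset H1 and display it sorted by arrival year and week number (the sort is for display only; `train_df` itself keeps its original row order).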

In [2]:
# Read the H1 training data. `a` holds the first rows and `b` is a second
# name for the very same frame (an alias, not a copy).
train_df = pd.read_csv('H1.csv')
a = train_df.head()
b = train_df
# Shown for inspection only: sort_values returns a new frame, so neither `b`
# nor `train_df` is reordered by this line.
b.sort_values(by=['ArrivalDateYear', 'ArrivalDateWeekNumber'])

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,...,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate
73,1,152,2015,July,27,2,4,10,2,0,...,No Deposit,243,,0,Contract,67.58,0,0,Canceled,2015-04-03
272,0,45,2015,July,27,2,2,3,2,0,...,No Deposit,8,,0,Contract,79.50,0,0,Check-Out,2015-07-07
303,0,1,2015,July,27,2,0,1,2,0,...,No Deposit,240,,0,Transient,147.00,1,0,Check-Out,2015-07-03
339,0,130,2015,July,27,4,2,6,2,0,...,No Deposit,196,,0,Transient,96.95,1,1,Check-Out,2015-07-12
430,0,37,2015,July,27,1,1,4,1,0,...,No Deposit,241,,0,Transient,97.29,0,1,Check-Out,2015-07-06
529,0,98,2015,July,27,4,2,1,2,0,...,No Deposit,8,,0,Transient,71.55,1,0,Check-Out,2015-07-07
756,0,150,2015,July,27,2,2,4,2,0,...,No Deposit,156,,0,Contract,55.68,0,0,Check-Out,2015-07-08
874,0,157,2015,July,27,4,4,6,2,0,...,No Deposit,156,,0,Contract,55.68,0,0,Check-Out,2015-07-14
1182,0,1,2015,July,27,1,0,1,1,0,...,No Deposit,,270,0,Transient,134.00,0,0,Check-Out,2015-07-02
1279,1,26,2015,July,27,4,2,5,2,2,...,No Deposit,240,,0,Transient,163.00,0,0,Canceled,2015-06-09


## Dependent variable (y). Cancellation by customer = 1, no cancellation by customer = 0.

In [3]:
# Target vector: the IsCanceled column; both names refer to the same Series.
y = IsCanceled = train_df['IsCanceled']

## Features (or independent variables) hypothesised to influence hotel cancellations.

In [4]:
# Pull every numeric predictor out of train_df as its own Series, in one
# unpacking; the n-th name on the left receives the n-th column on the right.
# The "#n" tags are carried over from the original cell.
# NOTE(review): they look like the author's variable numbers; from 18 onward
# they do not line up with column positions in the stacked matrix `x`,
# because `agent` and `company` are not stacked there -- confirm intent.
(
    leadtime,
    staysweekendnights,
    staysweeknights,
    adults,
    children,
    babies,
    isrepeatedguest,
    previouscancellations,
    previousbookingsnotcanceled,
    bookingchanges,
    agent,
    company,
    dayswaitinglist,
    adr,
    rcps,
    totalsqr,
) = (
    train_df[column_name]
    for column_name in (
        'LeadTime',                     # 1
        'StaysInWeekendNights',         # 2
        'StaysInWeekNights',            # 3
        'Adults',                       # 4
        'Children',                     # 5
        'Babies',                       # 6
        'IsRepeatedGuest',              # 11
        'PreviousCancellations',        # 12
        'PreviousBookingsNotCanceled',  # 13
        'BookingChanges',               # 16
        'Agent',                        # 18
        'Company',                      # 19
        'DaysInWaitingList',            # 20
        'ADR',                          # 22
        'RequiredCarParkingSpaces',     # 23
        'TotalOfSpecialRequests',       # 24
    )
)

## Categorical variables - variables that do not have an interval scale, e.g. 1-100.

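### `cat.codes` is used to encode these categorical variables: each column is first cast to the pandas `category` dtype, and every distinct category is then mapped to an integer code. Note that the resulting codes are still plain integers, so a model that receives them directly will treat them as ordered numbers unless they are encoded further (e.g. one-hot encoding).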

In [5]:
# Encode each categorical column as integer codes: cast to the pandas
# "category" dtype, take .cat.codes, and wrap the result in a Series.
# (pandas assigns the code -1 to missing values.)
# The n-th name on the left receives the codes of the n-th column on the right.
(
    mealcat,
    countrycat,
    marketsegmentcat,
    distributionchannelcat,
    reservedroomtypecat,
    assignedroomtypecat,
    deposittypecat,
    customertypecat,
    reservationstatuscat,
) = (
    pd.Series(train_df[column_name].astype("category").cat.codes)
    for column_name in (
        "Meal",
        "Country",
        "MarketSegment",
        "DistributionChannel",
        "ReservedRoomType",
        "AssignedRoomType",
        "DepositType",
        "CustomerType",
        "ReservationStatus",
    )
)

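## The independent variables (all of the above except `agent` and `company`, which are not included) are stacked column-wise using numpy, and a constant column is prepended as column 0.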

In [6]:
# Build the design matrix: one column per predictor, in the order listed, with
# a constant column prepended at position 0 (so leadtime is column 1 and
# reservationstatuscat is column 23). `agent` and `company` are not stacked.
# NOTE(review): reservationstatuscat is derived from ReservationStatus, which
# appears to record the booking outcome itself -- likely target leakage;
# confirm before treating it as a genuine predictor.
x = sm.add_constant(
    np.column_stack([
        leadtime,
        staysweekendnights,
        staysweeknights,
        adults,
        children,
        babies,
        mealcat,
        countrycat,
        marketsegmentcat,
        distributionchannelcat,
        isrepeatedguest,
        previouscancellations,
        previousbookingsnotcanceled,
        reservedroomtypecat,
        assignedroomtypecat,
        bookingchanges,
        deposittypecat,
        dayswaitinglist,
        customertypecat,
        adr,
        rcps,
        totalsqr,
        reservationstatuscat,
    ]),
    prepend=True,
)

## The ExtraTreesClassifier is used to assess the importance of each feature in predicting hotel cancellations. The higher the value, the more important the variable is hypothesised to be.

In [7]:
# Fit an extra-trees ensemble on the stacked features and print one importance
# score per column of x (position 0 is the prepended constant).
# ExtraTreesClassifier is already imported in the notebook's import cell.
# random_state is pinned so the scores are identical on every
# Restart & Run All; unseeded, each fit draws different random splits and the
# printed importances cannot be reproduced.
model = ExtraTreesClassifier(random_state=0)
model.fit(x, y)
print(model.feature_importances_)

[0.00000000e+00 1.89050158e-02 3.07011698e-03 3.79718729e-03
 2.23887371e-03 2.18611909e-03 3.69910647e-04 3.23892795e-03
 2.64885000e-02 1.68447950e-02 8.51588916e-03 8.31000207e-03
 3.95862221e-03 1.36135665e-03 8.55717447e-03 5.55767908e-03
 5.70356133e-03 3.52220830e-02 2.92386198e-04 1.49214184e-02
 5.46337713e-03 3.05836817e-02 6.49765945e-03 7.87915663e-01]




## The variable number (left-hand column; 0 is the prepended constant) is listed alongside its importance score, sorted in ascending order of score.

In [8]:
# One row per column of x, holding its importance score; the row label is the
# column's position in x.
ext = pd.DataFrame({"extratrees": model.feature_importances_})
# Display lowest-scoring variables first (sort_values returns a new frame,
# `ext` itself stays in positional order).
ext.sort_values(by="extratrees")

Unnamed: 0,extratrees
0,0.0
18,0.000292
6,0.00037
13,0.001361
5,0.002186
4,0.002239
2,0.00307
7,0.003239
3,0.003797
12,0.003959
