In [1]:
import pandas as pd
import numpy as np
import seaborn as sns


In [2]:
data = pd.read_csv('hotel_bookings.csv')  #reading the data

In [3]:
# removing irrelevant attributes.

# a list of irrelevant attributes
irrelevant_attributes = ['arrival_date_year','arrival_date_month', 'arrival_date_week_number','arrival_date_day_of_month',
                        'children','babies', 'country', 'market_segment', 'previous_cancellations',
                        'previous_bookings_not_canceled', 'booking_changes', 'agent', 'company','days_in_waiting_list',
                        'required_car_parking_spaces','reservation_status_date']

for attribute in irrelevant_attributes:
    data.drop(attribute, axis = 1, inplace = True)

In [4]:
# The following code encodes categorical attributes in to binary. It uses a Panda function known as get_dummies()
# and drops the which converts data into binary. The attributes are dropped after conversion is made.


# encoding reservation status category and drop it after.
status = np.array(data.reservation_status)
Reservation_status = []
for value in status:
    if value == 'Canceled':
        Reservation_status.append(0)
    elif value == 'Check-Out':
        Reservation_status.append(1)
    elif value == 'No-Show':
        Reservation_status.append(2)
data['Reservation_status'] = Reservation_status
data.drop('reservation_status', axis = 1, inplace = True)

# encoding meal category
data = pd.concat([data, pd.get_dummies(data['meal'], prefix = 'meal_type')], axis = 1)
data.drop('meal', axis = 1, inplace = True)

# encoding distribution channel
data = pd.concat([data, pd.get_dummies(data['distribution_channel'], prefix = 'distr_channel')], axis = 1)
data.drop('distribution_channel', axis = 1, inplace = True)

# encoding deposit_type
data = pd.concat([data, pd.get_dummies(data['deposit_type'], prefix = 'deposit')], axis = 1)
data.drop('deposit_type', axis = 1, inplace = True)

# enocding customer_type
data = pd.concat([data, pd.get_dummies(data['customer_type'], prefix = 'cust_type')], axis = 1)
data.drop('customer_type', axis = 1, inplace = True)


In [5]:
data['total_of_special_requests'].unique()


array([0, 1, 3, 2, 4, 5])

In [6]:
# The following code combines two attributes -reserved_room_type and assigned_room_type together. These attributes
# are strongly related because when a customer reserves a prefered room, they may be able to get the room or not. 
# Therefore to combine these two attributes and still have their significance in the data, the code compares every
# value in the lists element wise. if the values are similar, 1 is appended in the list(prefered_room) meaning 
# that the customer got the room they requested and 0 otherwise.
# The new attribute(prefered_room) with binary values is merged with the dataframe
reserved_room = np.array(data.reserved_room_type)
assigned_room = np.array(data.assigned_room_type)
combined_room = list(zip(reserved_room, assigned_room))
prefered_room = []
for item in combined_room:
    if item[0] == item[1]:
        prefered_room.append(1)
    else:
        prefered_room.append(0)
        
# merge the attribute to the dataframe and drop attributes - reserved_room_type and assigned_room_type       
data['prefered_room'] = prefered_room 
data.drop('reserved_room_type', axis = 1, inplace = True)
data.drop('assigned_room_type', axis = 1, inplace = True)


In [7]:
data.head(7)

Unnamed: 0,hotel,is_canceled,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,is_repeated_guest,adr,total_of_special_requests,Reservation_status,...,distr_channel_TA/TO,distr_channel_Undefined,deposit_No Deposit,deposit_Non Refund,deposit_Refundable,cust_type_Contract,cust_type_Group,cust_type_Transient,cust_type_Transient-Party,prefered_room
0,Resort Hotel,0,342,0,0,2,0,0.0,0,1,...,0,0,1,0,0,0,0,1,0,1
1,Resort Hotel,0,737,0,0,2,0,0.0,0,1,...,0,0,1,0,0,0,0,1,0,1
2,Resort Hotel,0,7,0,1,1,0,75.0,0,1,...,0,0,1,0,0,0,0,1,0,0
3,Resort Hotel,0,13,0,1,1,0,75.0,0,1,...,0,0,1,0,0,0,0,1,0,1
4,Resort Hotel,0,14,0,2,2,0,98.0,1,1,...,1,0,1,0,0,0,0,1,0,1
5,Resort Hotel,0,14,0,2,2,0,98.0,1,1,...,1,0,1,0,0,0,0,1,0,1
6,Resort Hotel,0,0,0,2,2,0,107.0,0,1,...,0,0,1,0,0,0,0,1,0,1


In [8]:
# Extracting Resort Hotel data
resort_data = data[data["hotel"] == "Resort Hotel"]
resort_data.columns

Index(['hotel', 'is_canceled', 'lead_time', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'is_repeated_guest', 'adr',
       'total_of_special_requests', 'Reservation_status', 'meal_type_BB',
       'meal_type_FB', 'meal_type_HB', 'meal_type_SC', 'meal_type_Undefined',
       'distr_channel_Corporate', 'distr_channel_Direct', 'distr_channel_GDS',
       'distr_channel_TA/TO', 'distr_channel_Undefined', 'deposit_No Deposit',
       'deposit_Non Refund', 'deposit_Refundable', 'cust_type_Contract',
       'cust_type_Group', 'cust_type_Transient', 'cust_type_Transient-Party',
       'prefered_room'],
      dtype='object')

In [9]:
# predict the average daily rate and anaylse what factors affect it

# # standardizing the data (Resort data) 
from sklearn.preprocessing import StandardScaler, MinMaxScaler 
scaler = StandardScaler()
scaler2 = MinMaxScaler()
X = resort_data.drop(['hotel', 'adr'], axis = 1)
for attribute in X.columns:
    X[attribute] = scaler.fit_transform(np.array(X[attribute]).reshape(-1,1))

# target variable. average daily rate    
Y = resort_data.adr

# split the data into train and test set
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 0)

# Using the linear classifier to predict the average daily rate
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures 

regressor = LinearRegression()
regressor.fit(X_train, Y_train)

# predicting the average daily rate
y_pred = regressor.predict(X_test)

# These coefficients could help us know which variables are highly correlated with 
# the average daily rate
coefficient_df = pd.DataFrame(regressor.coef_, X.columns, columns = ['coefficients'])
print(coefficient_df)

print(r2_score(Y_test, y_pred))



                           coefficients
is_canceled                7.046421e-01
lead_time                 -5.237547e-01
stays_in_weekend_nights   -1.133074e+00
stays_in_week_nights       3.259250e+00
adults                     7.208646e+00
is_repeated_guest         -3.393437e+00
total_of_special_requests  8.534869e+00
Reservation_status        -4.680830e+00
meal_type_BB               5.161743e+13
meal_type_FB               1.617795e+13
meal_type_HB               4.769452e+13
meal_type_SC               5.509936e+12
meal_type_Undefined        2.003735e+13
distr_channel_Corporate   -4.053446e+12
distr_channel_Direct      -5.881521e+12
distr_channel_GDS         -6.536225e+13
distr_channel_TA/TO       -6.633273e+12
distr_channel_Undefined   -7.397690e+10
deposit_No Deposit        -2.169595e+14
deposit_Non Refund        -2.089052e+14
deposit_Refundable        -6.126438e+13
cust_type_Contract         4.638488e+14
cust_type_Group            1.890671e+14
cust_type_Transient        9.704085e+14


In [10]:

# predicting the reservation status of customers who booked resort hotels

# Standardizing target data
from sklearn.preprocessing import StandardScaler, MinMaxScaler 
scaler = StandardScaler()
X2 = resort_data.drop(['hotel','Reservation_status'], axis = 1)
for attribute in X2.columns:
    X2[attribute] = scaler.fit_transform(np.array(X2[attribute]).reshape(-1,1))
    
# target variable. reservation status
Y2 = resort_data['Reservation_status']


# Using KNN classifier to predict the reservation status

# split the data
X2_train, X2_test, Y2_train, Y2_test = train_test_split(X2, Y2, test_size = 0.25, random_state = 0)

from sklearn.neighbors import KNeighborsClassifier
knn_classifier = KNeighborsClassifier()
knn_classifier.fit(X2_train, Y2_train)
y2_pred = knn_classifier.predict(X2_test)





In [11]:
# Evaluating the model.Print the accuracy and confusion matrix
from sklearn import metrics
print(metrics.accuracy_score(Y2_test, y2_pred))
print(metrics.confusion_matrix(Y2_test, y2_pred))


0.9912131802296555
[[2649   20    2]
 [   1 7265    0]
 [  62    3   13]]


In [12]:
# Using support vector machine (svm) to predict the reservation status
from sklearn.svm import SVC
svc_classifier = SVC(kernel = 'linear')
svc_classifier.fit(X2_train, Y2_train)
y3_predSVC = svc_classifier.predict(X2_test)

print(metrics.accuracy_score(Y2_test, y3_predSVC))
print(metrics.confusion_matrix(Y2_test, y3_predSVC))

0.9922116824762856
[[2671    0    0]
 [   0 7266    0]
 [  78    0    0]]


In [13]:
# using random forest classification to predict the reservation status
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X2_train, Y2_train)
y_predrf = rf_classifier.predict(X2_test)
print(metrics.accuracy_score(Y2_test, y_predrf))
print(metrics.confusion_matrix(Y2_test, y_predrf))

0.9935097353969047
[[2667    0    4]
 [   0 7266    0]
 [  61    0   17]]




In [14]:
# comparing the perfomance of each classifier using cross validation
from sklearn.model_selection import cross_val_score
classifiers = [knn_classifier, svc_classifier, rf_classifier] # a list of classfiers
for i in classifiers:
    print(cross_val_score(i, X2, Y2, cv = 10).mean())


0.9820764540373125
0.990388842174253
0.9922865387006246


In [15]:
# Analysing the likelihood for customers to cancel their booking for resort hotels

# target variable canceled = 1, not canceled = 0
y = resort_data.is_canceled

# Scaling dependent variables
scaler2 = StandardScaler()
x = resort_data.drop(['is_canceled','hotel'], axis = 1)
for attribute in x.columns:
    x[attribute] = scaler2.fit_transform(np.array(x[attribute]).reshape(-1,1))
    


# split the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25)

# use logistic regression to predict binomial outcomes

from sklearn.linear_model import LogisticRegression #importing Logistic regression
log_regressor = LogisticRegression()

log_regressor.fit(x_train,y_train) #fitting the model to the data

y_pred = log_regressor.predict(x_test) # predicting x test data
print("confusion matrix and accuracy for logistic regression")
print(metrics.confusion_matrix(y_test, y_pred)) #confusion matrix

print(metrics.accuracy_score(y_test, y_pred))


# using random forest to predict the likelihood for customers to cancel
# their booking
random_forest_regressor = RandomForestClassifier()
random_forest_regressor.fit(x_train, y_train) 
rf_ypred = random_forest_regressor.predict(x_test)





confusion matrix and accuracy for logistic regression
[[7212    0]
 [  74 2729]]
0.9926110833749376




In [16]:
print("confusion matrix and accuracy for random forest classifier")
print(metrics.confusion_matrix(y_test, rf_ypred)) #confusion matrix
print()
print(metrics.accuracy_score(y_test, rf_ypred)) #accuracy


confusion matrix and accuracy for random forest classifier
[[7212    0]
 [   9 2794]]

0.999101347978033


In [17]:
# using cross validation to evluate the logistic regression and 
# random classification model
from sklearn.model_selection import cross_val_score
print(cross_val_score(log_regressor, x, y, cv= 10).mean())
print(cross_val_score(random_forest_regressor, x, y, cv=10).mean())



0.9892409135242722
0.9988266975130641


In [18]:
# Analysis on the likelihood for a hotel to have a repeated customer

# target variable variable repeated guest = 1, not a repeated guest = 0
y_repeated_customer = resort_data.is_repeated_guest

# Scaling dependent variables
scaler3 = StandardScaler()
x_repeated_customer = resort_data.drop(['hotel', 'is_repeated_guest'], axis = 1)
for attribute in x_repeated_customer.columns:
    x_repeated_customer[attribute] = scaler3.fit_transform(np.array(x_repeated_customer[attribute]).reshape(-1,1))
    


# split the data 
X_training, X_testing, y_training, y_testing = train_test_split(x_repeated_customer,
                                                             y_repeated_customer, test_size = 0.25)

# use logistic regression to predict wheather a customer will be a repeated guest

from sklearn.linear_model import LogisticRegression #importing Logistic regression
log_regressor2 = LogisticRegression()

log_regressor2.fit(X_training,y_training) #fitting the model to the data

y_prediction = log_regressor.predict(X_testing) # predicting x test data

                                                            



In [19]:
print("confusion matrix and accuracy for logistic regression")
print(metrics.confusion_matrix(y_testing, y_prediction)) #confusion matrix
print()
print(metrics.accuracy_score(y_testing, y_prediction))


confusion matrix and accuracy for logistic regression
[[6883 2673]
 [ 434   25]]

0.6897653519720419


In [20]:
# using random forest to predict repeated customer 
random_forest_regressor2 = RandomForestClassifier()
random_forest_regressor2.fit(X_training, y_training) 
rf_yprediction = random_forest_regressor.predict(X_testing)
print("confusion matrix and accuracy for random forest classifier")
print(metrics.confusion_matrix(y_testing, rf_yprediction)) #confusion matrix
print()
print(metrics.accuracy_score(y_testing, rf_yprediction))



confusion matrix and accuracy for random forest classifier
[[6815 2741]
 [ 433   26]]

0.6830753869196206


In [21]:
# using cross validation to evluate the logistic regression and 
# random classification model
print(cross_val_score(log_regressor2, x, y, cv= 10).mean())
print(cross_val_score(random_forest_regressor2, x_repeated_customer, y_repeated_customer, cv=10).mean())



0.9892409135242722
0.9425855089046047
