In [1]:
import pandas as pd
import numpy as np
import seaborn as sns


In [2]:
# Load the hotel bookings dataset; the path is resolved relative to the
# current working directory.
data = pd.read_csv('hotel_bookings.csv')

In [3]:
# Remove the attributes listed as irrelevant.

# Columns to discard from the bookings table.
irrelevant_attributes = [
    'arrival_date_year', 'arrival_date_month', 'arrival_date_week_number',
    'arrival_date_day_of_month', 'children', 'babies', 'country',
    'market_segment', 'previous_cancellations',
    'previous_bookings_not_canceled', 'booking_changes', 'agent', 'company',
    'days_in_waiting_list', 'required_car_parking_spaces',
    'reservation_status_date',
]

# Drop every listed column in a single call rather than one in-place drop per
# column. A missing column still raises KeyError, but the frame is left
# untouched instead of partially dropped.
data = data.drop(columns=irrelevant_attributes)

In [4]:
# One-hot encode the categorical attributes with pandas' get_dummies().
# Each source column is replaced by one indicator column per category, named
# '<prefix>_<category>', and the original column is removed from the frame.

# Source column -> prefix used for its indicator columns. The insertion order
# is the order in which the indicator groups are appended to the frame.
dummy_prefixes = {
    'reservation_status': 'reservation',
    'meal': 'meal_type',
    'distribution_channel': 'distr_channel',
    'deposit_type': 'deposit',
    'customer_type': 'cust_type',
}

# NOTE(review): the 'reservation_Canceled' indicator presumably mirrors
# 'is_canceled' — confirm before feeding both to a model.

# A single call encodes all listed columns and drops the originals, replacing
# five copies of the same concat/get_dummies/drop stanza.
data = pd.get_dummies(data, columns=list(dummy_prefixes), prefix=dummy_prefixes)


In [5]:
# Combine reserved_room_type and assigned_room_type into one binary attribute.
# A customer reserves a preferred room type but may or may not be assigned it,
# so the two columns are compared row by row: prefered_room is 1 when the
# assigned type equals the reserved type (the customer got the room they
# requested) and 0 otherwise.

# Vectorised element-wise comparison; no Python-level loop over the rows.
# int64 matches the dtype pandas infers from a list of Python ints.
prefered_room = (data['reserved_room_type'] == data['assigned_room_type']).astype('int64')

# Append the new attribute, then drop the two columns it was derived from.
data = (
    data
    .assign(prefered_room=prefered_room)
    .drop(columns=['reserved_room_type', 'assigned_room_type'])
)


In [6]:
# Preview the first 7 rows of the transformed frame.
data.head(7)

Unnamed: 0,hotel,is_canceled,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,is_repeated_guest,adr,total_of_special_requests,reservation_Canceled,...,distr_channel_TA/TO,distr_channel_Undefined,deposit_No Deposit,deposit_Non Refund,deposit_Refundable,cust_type_Contract,cust_type_Group,cust_type_Transient,cust_type_Transient-Party,prefered_room
0,Resort Hotel,0,342,0,0,2,0,0.0,0,0,...,0,0,1,0,0,0,0,1,0,1
1,Resort Hotel,0,737,0,0,2,0,0.0,0,0,...,0,0,1,0,0,0,0,1,0,1
2,Resort Hotel,0,7,0,1,1,0,75.0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,Resort Hotel,0,13,0,1,1,0,75.0,0,0,...,0,0,1,0,0,0,0,1,0,1
4,Resort Hotel,0,14,0,2,2,0,98.0,1,0,...,1,0,1,0,0,0,0,1,0,1
5,Resort Hotel,0,14,0,2,2,0,98.0,1,0,...,1,0,1,0,0,0,0,1,0,1
6,Resort Hotel,0,0,0,2,2,0,107.0,0,0,...,0,0,1,0,0,0,0,1,0,1
