In [1]:
import pandas as pd

hotels = pd.read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-11/hotels.csv')

# is_canceled: Value indicating if the booking was canceled (1) or not (0)

hotels['children'] = hotels['children'].fillna(0).astype(int)
hotels.loc[hotels['meal'] == 'Undefined', 'meal'] = 'SC'

# conditional column if the reserved room was assigned or other one was assigned
hotels['correctly_assigned'] = 0
hotels.loc[hotels['reserved_room_type'] == hotels['assigned_room_type'], 'correctly_assigned'] = 1

# agent: there is null agent, impute using zero and convert to int (remember to treat it as a categorical variable)
hotels['agent'] = hotels['agent'].fillna(0).astype(int)

# company: there is null company, impute using zero and convert to int (remember to treat it as a categorical variable)
hotels['company'] = hotels['company'].fillna(0).astype(int)

# There are 2 reservations with 8 required parking spaces which are removed from the dataset
hotels = hotels[hotels['required_car_parking_spaces']!=8]

# Drop these 2 columns as they dont provide any data
hotels.drop(['arrival_date_week_number', 'arrival_date_day_of_month', 'reservation_status', 'reservation_status_date'], axis=1, inplace = True)

hotels

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,...,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,correctly_assigned
0,Resort Hotel,0,342,2015,July,0,0,2,0,0,...,3,No Deposit,0,0,0,Transient,0.00,0,0,1
1,Resort Hotel,0,737,2015,July,0,0,2,0,0,...,4,No Deposit,0,0,0,Transient,0.00,0,0,1
2,Resort Hotel,0,7,2015,July,0,1,1,0,0,...,0,No Deposit,0,0,0,Transient,75.00,0,0,0
3,Resort Hotel,0,13,2015,July,0,1,1,0,0,...,0,No Deposit,304,0,0,Transient,75.00,0,0,1
4,Resort Hotel,0,14,2015,July,0,2,2,0,0,...,0,No Deposit,240,0,0,Transient,98.00,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,0,23,2017,August,2,5,2,0,0,...,0,No Deposit,394,0,0,Transient,96.14,0,0,1
119386,City Hotel,0,102,2017,August,2,5,3,0,0,...,0,No Deposit,9,0,0,Transient,225.43,0,2,1
119387,City Hotel,0,34,2017,August,2,5,2,0,0,...,0,No Deposit,9,0,0,Transient,157.71,0,4,1
119388,City Hotel,0,109,2017,August,2,5,2,0,0,...,0,No Deposit,89,0,0,Transient,104.40,0,0,1


In [2]:
hotels.dtypes

hotel                              object
is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                            int32
babies                              int64
meal                               object
country                            object
market_segment                     object
distribution_channel               object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                     int64
deposit_type                       object
agent                               int32
company                             int32
days_in_waiting_list              

In [3]:
onehot_vars = pd.concat([pd.get_dummies(hotels.hotel),
                         pd.get_dummies(hotels.arrival_date_year, prefix = 'year_'),
                         pd.get_dummies(hotels.arrival_date_month),
                         pd.get_dummies(hotels.meal),
                         pd.get_dummies(hotels.country),
                         pd.get_dummies(hotels.market_segment),
                         pd.get_dummies(hotels.distribution_channel),
                         pd.get_dummies(hotels.reserved_room_type),
                         pd.get_dummies(hotels.assigned_room_type),
                         pd.get_dummies(hotels.deposit_type),
                         pd.get_dummies(hotels.customer_type)], 
                         axis=1)
onehot_vars

Unnamed: 0,City Hotel,Resort Hotel,year__2015,year__2016,year__2017,April,August,December,February,January,...,K,L,P,No Deposit,Non Refund,Refundable,Contract,Group,Transient,Transient-Party
0,0,1,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,0,1,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
3,0,1,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
4,0,1,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,1,0,0,0,1,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
119386,1,0,0,0,1,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
119387,1,0,0,0,1,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
119388,1,0,0,0,1,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [4]:
data = pd.concat([hotels, onehot_vars], axis = 1)

data.drop(['hotel', 'arrival_date_year', 'arrival_date_month', 
           'meal', 'country', 'market_segment', 'distribution_channel',
           'reserved_room_type', 'assigned_room_type', 'deposit_type', 'customer_type'], axis = 1, inplace = True)

In [5]:
data.dtypes

is_canceled                int64
lead_time                  int64
stays_in_weekend_nights    int64
stays_in_week_nights       int64
adults                     int64
                           ...  
Refundable                 uint8
Contract                   uint8
Group                      uint8
Transient                  uint8
Transient-Party            uint8
Length: 258, dtype: object

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

X = data.drop('is_canceled', axis = 1)
y = data['is_canceled']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

cla = DecisionTreeClassifier()

cla.fit(X_train, y_train) 

# Predict y data with classifier: 
y_test_pred = cla.predict(X_test)

# Print results: 
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred)) 

[[13271  1653]
 [ 1586  7368]]
              precision    recall  f1-score   support

           0       0.89      0.89      0.89     14924
           1       0.82      0.82      0.82      8954

    accuracy                           0.86     23878
   macro avg       0.86      0.86      0.86     23878
weighted avg       0.86      0.86      0.86     23878



In [7]:
X

Unnamed: 0,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,...,K,L,P,No Deposit,Non Refund,Refundable,Contract,Group,Transient,Transient-Party
0,342,0,0,2,0,0,0,0,0,3,...,0,0,0,1,0,0,0,0,1,0
1,737,0,0,2,0,0,0,0,0,4,...,0,0,0,1,0,0,0,0,1,0
2,7,0,1,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
3,13,0,1,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
4,14,0,2,2,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,23,2,5,2,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
119386,102,2,5,3,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
119387,34,2,5,2,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
119388,109,2,5,2,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [8]:
# Predict y data with classifier: 
y_train_pred = cla.predict(X_train)

# Print results: 
print(confusion_matrix(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred)) 

[[60158    82]
 [  274 34996]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     60240
           1       1.00      0.99      0.99     35270

    accuracy                           1.00     95510
   macro avg       1.00      1.00      1.00     95510
weighted avg       1.00      1.00      1.00     95510

