<a href="https://colab.research.google.com/github/stevansehn/python-hotel-reservation-classification/blob/main/hotel_reservation_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import KFold, cross_validate
from sklearn.tree import DecisionTreeClassifier

import kagglehub

path = kagglehub.dataset_download("ahsan81/hotel-reservations-classification-dataset")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/ahsan81/hotel-reservations-classification-dataset/versions/1


In [2]:
df = pd.read_csv(path + "/Hotel Reservations.csv")
df.head()

df.info()

df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36275 entries, 0 to 36274
Data columns (total 19 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Booking_ID                            36275 non-null  object 
 1   no_of_adults                          36275 non-null  int64  
 2   no_of_children                        36275 non-null  int64  
 3   no_of_weekend_nights                  36275 non-null  int64  
 4   no_of_week_nights                     36275 non-null  int64  
 5   type_of_meal_plan                     36275 non-null  object 
 6   required_car_parking_space            36275 non-null  int64  
 7   room_type_reserved                    36275 non-null  object 
 8   lead_time                             36275 non-null  int64  
 9   arrival_year                          36275 non-null  int64  
 10  arrival_month                         36275 non-null  int64  
 11  arrival_date   

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,lead_time,arrival_year,arrival_month,arrival_date,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests
count,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0
mean,1.844962,0.105279,0.810724,2.2043,0.030986,85.232557,2017.820427,7.423653,15.596995,0.025637,0.023349,0.153411,103.423539,0.619655
std,0.518715,0.402648,0.870644,1.410905,0.173281,85.930817,0.383836,3.069894,8.740447,0.158053,0.368331,1.754171,35.089424,0.786236
min,0.0,0.0,0.0,0.0,0.0,0.0,2017.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,1.0,0.0,17.0,2018.0,5.0,8.0,0.0,0.0,0.0,80.3,0.0
50%,2.0,0.0,1.0,2.0,0.0,57.0,2018.0,8.0,16.0,0.0,0.0,0.0,99.45,0.0
75%,2.0,0.0,2.0,3.0,0.0,126.0,2018.0,10.0,23.0,0.0,0.0,0.0,120.0,1.0
max,4.0,10.0,7.0,17.0,1.0,443.0,2018.0,12.0,31.0,1.0,13.0,58.0,540.0,5.0


In [3]:
df.isnull().sum()

Unnamed: 0,0
Booking_ID,0
no_of_adults,0
no_of_children,0
no_of_weekend_nights,0
no_of_week_nights,0
type_of_meal_plan,0
required_car_parking_space,0
room_type_reserved,0
lead_time,0
arrival_year,0


In [4]:
# 1) Faça a remoção da coluna 'Booking_ID':

df.drop('Booking_ID', axis=1, inplace = True)

In [5]:
# 2) Obtenha o nome das colunas int64 e float64 e faça a normalização com StandardScaler:

scale_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
print(scale_columns)

['no_of_adults', 'no_of_children', 'no_of_weekend_nights', 'no_of_week_nights', 'required_car_parking_space', 'lead_time', 'arrival_year', 'arrival_month', 'arrival_date', 'repeated_guest', 'no_of_previous_cancellations', 'no_of_previous_bookings_not_canceled', 'avg_price_per_room', 'no_of_special_requests']


In [6]:
# 3) Aplique a transformação das colunas categóricas com get_dummies()

df = pd.get_dummies(df, columns=['market_segment_type', 'room_type_reserved', 'type_of_meal_plan'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36275 entries, 0 to 36274
Data columns (total 31 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   no_of_adults                          36275 non-null  int64  
 1   no_of_children                        36275 non-null  int64  
 2   no_of_weekend_nights                  36275 non-null  int64  
 3   no_of_week_nights                     36275 non-null  int64  
 4   required_car_parking_space            36275 non-null  int64  
 5   lead_time                             36275 non-null  int64  
 6   arrival_year                          36275 non-null  int64  
 7   arrival_month                         36275 non-null  int64  
 8   arrival_date                          36275 non-null  int64  
 9   repeated_guest                        36275 non-null  int64  
 10  no_of_previous_cancellations          36275 non-null  int64  
 11  no_of_previous_

In [7]:
# 4) Preparação para amostragem. Efetuar a separação entre os atributos previsores (X) e a classe (y)

X = df.drop('booking_status', axis=1)
y = df['booking_status']
X.shape, y.shape

((36275, 30), (36275,))

In [8]:
print(y.unique())
print(y.name)

['Not_Canceled' 'Canceled']
booking_status


In [9]:
# Codificar valores categóricos de y ['Not_Canceled' 'Canceled']

le = LabelEncoder()
y = le.fit_transform(y)
print(y)

[1 1 0 ... 1 0 1]


In [10]:
# 5) Aplicar a amostragem Holdout com 80% treino e 20% teste

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(29020, 30) (29020,)
(7255, 30) (7255,)


In [11]:
# 6) Aplicar o balanceamento com a abordagem SMOTE

sm = SMOTE()
X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)
X_train_resampled.shape, y_train_resampled.shape

((39102, 30), (39102,))

In [12]:
# 7) Faça a amostragem por validação cruzada utilizando a abordagem KFold
# com 5 partições e uso da métrica acurácia

DT = DecisionTreeClassifier()
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_validate(DT, X_train_resampled, y_train_resampled, cv=kf, scoring=['accuracy'])
scores

{'fit_time': array([0.51529813, 0.61832643, 0.44436741, 0.45888686, 0.551018  ]),
 'score_time': array([0.0102706 , 0.02222705, 0.00958347, 0.02218008, 0.05445814]),
 'test_accuracy': array([0.89144611, 0.89144611, 0.892711  , 0.88580563, 0.88734015])}