# Dataset: Hotel Reservations


In [2]:
import numpy as np
import pandas as pd


In [3]:
df = pd.read_csv('./Hotel.csv')

Conjunto de dados para coleta de informações de reservas de hóspedes de hotéis
Sobre as colunas

* ID: identificador único de cada reserva
* n_adults: Número de adultos
* n_children: Número de crianças
* weekend_nights: Número de noites de fim de semana (sábado ou domingo) em que o hóspede se hospedou ou reservou hospedagem no hotel
* week_nights: Número de noites de semana (segunda a sexta) em que o hóspede se hospedou ou reservou hospedagem no hotel
* meal_plan: Tipo de plano de refeições reservado pelo cliente
* car_parking_space: O cliente precisa de uma vaga de estacionamento? (0 - Não, 1 - Sim)
* room_type: Tipo de quarto reservado pelo cliente. Os valores são criptografados (codificados) pela INN Hotels.
* lead_time: Número de dias entre a data da reserva e a data de chegada
* year: Ano da data de chegada
* month: Mês da data de chegada
* date: Dia do mês
* market_segment: Designação do segmento de mercado.

* repeated_guest: O cliente é um hóspede frequente? (0 - Não, 1 - Sim)
* previous_cancellations: Número de reservas anteriores que foram canceladas pelo cliente antes da reserva atual
* previous_bookings_not_canceled: Número de reservas anteriores não canceladas pelo cliente antes da reserva atual
* avg_room_price: Preço médio por dia da reserva; os preços dos quartos são dinâmicos. (em euros)
* special_requests: Número total de pedidos especiais feitos pelo cliente (por exemplo, andar alto, vista do quarto, etc.)
* status: Indicador que mostra se a reserva foi cancelada ou não.

In [4]:
df.head(10)

Unnamed: 0,ID,n_adults,n_children,weekend_nights,week_nights,meal_plan,car_parking_space,room_type,lead_time,year,month,date,market_segment,repeated_guest,previous_cancellations,previous_bookings_not_canceled,avg_room_price,special_requests,status
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled
3,INN00004,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled
4,INN00005,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled
5,INN00006,2,0,0,2,Meal Plan 2,0,Room_Type 1,346,2018,9,13,Online,0,0,0,115.0,1,Canceled
6,INN00007,2,0,1,3,Meal Plan 1,0,Room_Type 1,34,2017,10,15,Online,0,0,0,107.55,1,Not_Canceled
7,INN00008,2,0,1,3,Meal Plan 1,0,Room_Type 4,83,2018,12,26,Online,0,0,0,105.61,1,Not_Canceled
8,INN00009,3,0,0,4,Meal Plan 1,0,Room_Type 1,121,2018,7,6,Offline,0,0,0,96.9,1,Not_Canceled
9,INN00010,2,0,0,5,Meal Plan 1,0,Room_Type 4,44,2018,10,18,Online,0,0,0,133.44,3,Not_Canceled


## Análises Básicas

In [5]:
# total de registros
len(df) #número de linhas (registros)
df.shape #número de linhas e colunas

(36275, 19)

In [6]:
# linhas com dados faltantes
linhas_com_nulos = df[df.isnull().any(axis=1)]
len(linhas_com_nulos) #não retornou nenhum registro

0

In [7]:
# linhas duplicadas
quantidade_duplicadas = df.duplicated().sum()
quantidade_duplicadas #possui 36024 linhas duplicadas

np.int64(0)

## Estatísticas descritivas

In [8]:
#lista os campos da tabela
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36275 entries, 0 to 36274
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   ID                              36275 non-null  object 
 1   n_adults                        36275 non-null  int64  
 2   n_children                      36275 non-null  int64  
 3   weekend_nights                  36275 non-null  int64  
 4   week_nights                     36275 non-null  int64  
 5   meal_plan                       36275 non-null  object 
 6   car_parking_space               36275 non-null  int64  
 7   room_type                       36275 non-null  object 
 8   lead_time                       36275 non-null  int64  
 9   year                            36275 non-null  int64  
 10  month                           36275 non-null  int64  
 11  date                            36275 non-null  int64  
 12  market_segment                  

In [9]:
df.describe().round() #estatística dos campos numéricos

Unnamed: 0,n_adults,n_children,weekend_nights,week_nights,car_parking_space,lead_time,year,month,date,repeated_guest,previous_cancellations,previous_bookings_not_canceled,avg_room_price,special_requests
count,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0
mean,2.0,0.0,1.0,2.0,0.0,85.0,2018.0,7.0,16.0,0.0,0.0,0.0,103.0,1.0
std,1.0,0.0,1.0,1.0,0.0,86.0,0.0,3.0,9.0,0.0,0.0,2.0,35.0,1.0
min,0.0,0.0,0.0,0.0,0.0,0.0,2017.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,1.0,0.0,17.0,2018.0,5.0,8.0,0.0,0.0,0.0,80.0,0.0
50%,2.0,0.0,1.0,2.0,0.0,57.0,2018.0,8.0,16.0,0.0,0.0,0.0,99.0,0.0
75%,2.0,0.0,2.0,3.0,0.0,126.0,2018.0,10.0,23.0,0.0,0.0,0.0,120.0,1.0
max,4.0,10.0,7.0,17.0,1.0,443.0,2018.0,12.0,31.0,1.0,13.0,58.0,540.0,5.0
