In [1]:
import pandas as pd

In [2]:
# reading the bookings.csv dataset with the ";" separator
# saving the read dataset in bookings and display the first 7 lines
bookings = pd.read_csv('2_bookings.csv', sep=';')
bookings.head(7)

Unnamed: 0,Hotel,Is Canceled,Lead Time,arrival full date,Arrival Date Year,Arrival Date Month,Arrival Date Week Number,Arrival Date Day of Month,Stays in Weekend nights,Stays in week nights,...,Adults,Children,Babies,Meal,Country,Reserved Room Type,Assigned room type,customer type,Reservation Status,Reservation status_date
0,Resort Hotel,0,342,2015-07-01,2015,July,27,1,0,0,...,2,0.0,0,BB,PRT,C,C,Transient,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015-07-01,2015,July,27,1,0,0,...,2,0.0,0,BB,PRT,C,C,Transient,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015-07-01,2015,July,27,1,0,1,...,1,0.0,0,BB,GBR,A,C,Transient,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015-07-01,2015,July,27,1,0,1,...,1,0.0,0,BB,GBR,A,A,Transient,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015-07-01,2015,July,27,1,0,2,...,2,0.0,0,BB,GBR,A,A,Transient,Check-Out,2015-07-03
5,Resort Hotel,0,14,2015-07-01,2015,July,27,1,0,2,...,2,0.0,0,BB,GBR,A,A,Transient,Check-Out,2015-07-03
6,Resort Hotel,0,0,2015-07-01,2015,July,27,1,0,2,...,2,0.0,0,BB,PRT,C,C,Transient,Check-Out,2015-07-03


In [3]:
# checking how many rows and columns there are in the dataset.
rows, cols = bookings.shape
print(f'{rows = }, {cols = }')

rows = 119390, cols = 21


In [4]:
# looking at the types of columns
bookings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 21 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Hotel                      119390 non-null  object 
 1   Is Canceled                119390 non-null  int64  
 2   Lead Time                  119390 non-null  int64  
 3   arrival full date          119390 non-null  object 
 4   Arrival Date Year          119390 non-null  int64  
 5   Arrival Date Month         119390 non-null  object 
 6   Arrival Date Week Number   119390 non-null  int64  
 7   Arrival Date Day of Month  119390 non-null  int64  
 8   Stays in Weekend nights    119390 non-null  int64  
 9   Stays in week nights       119390 non-null  int64  
 10  stays total nights         119390 non-null  int64  
 11  Adults                     119390 non-null  int64  
 12  Children                   119386 non-null  float64
 13  Babies                     11

In [5]:
# converting the column names to lower case and replacing spaces with underscores
bookings.columns = bookings.columns.str.lower().str.replace(' ', '_')
bookings.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_full_date,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,...,adults,children,babies,meal,country,reserved_room_type,assigned_room_type,customer_type,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015-07-01,2015,July,27,1,0,0,...,2,0.0,0,BB,PRT,C,C,Transient,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015-07-01,2015,July,27,1,0,0,...,2,0.0,0,BB,PRT,C,C,Transient,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015-07-01,2015,July,27,1,0,1,...,1,0.0,0,BB,GBR,A,C,Transient,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015-07-01,2015,July,27,1,0,1,...,1,0.0,0,BB,GBR,A,A,Transient,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015-07-01,2015,July,27,1,0,2,...,2,0.0,0,BB,GBR,A,A,Transient,Check-Out,2015-07-03


In [6]:
# users from which countries made the most successful bookings
(bookings
    .loc[bookings.is_canceled == 0]
    .country.value_counts()
    .head(5))

country
PRT    21071
GBR     9676
FRA     8481
ESP     6391
DEU     6069
Name: count, dtype: int64

In [7]:
# how many nights (stays_total_nights) are City Hotels booked on average
(bookings
    .groupby('hotel')
    .stays_total_nights.mean()
    .round(2))

hotel
City Hotel      2.98
Resort Hotel    4.32
Name: stays_total_nights, dtype: float64

In [8]:
# in how many bookings is the room type assigned to the client (assigned_room_type) different from the one originally booked (reserved_room_type)
len(bookings.loc[bookings.assigned_room_type != bookings.reserved_room_type])

14917

In [9]:
# analyzing the dates of planned arrival (arrival_date_year) and finding the most popular months
(bookings
     .groupby('arrival_date_year')
     .arrival_date_month.agg(pd.Series.mode))

arrival_date_year
2015    September
2016      October
2017          May
Name: arrival_date_month, dtype: object

In [10]:
# what month (arrival_date_month) were City Hotel reservations canceled most often in 2015? 2016? 2017?
(bookings[bookings.hotel == 'City Hotel']
    .groupby(['arrival_date_year', 'arrival_date_month'])
    .is_canceled.sum()
    .groupby('arrival_date_year')
    .nlargest(1))

arrival_date_year  arrival_date_year  arrival_date_month
2015               2015               September             1543
2016               2016               October               1947
2017               2017               May                   2217
Name: is_canceled, dtype: int64

In [11]:
# highest average value of customer categories
bookings[['adults', 'children', 'babies']].mean()

adults      1.856403
children    0.103890
babies      0.007949
dtype: float64

In [12]:
# for which type of hotels did the average value of total_kids turn out to be the highest?
bookings['total_kids'] = bookings['children'] + bookings ['babies']
(bookings
    .groupby('hotel')
    .total_kids.mean()
    .round(2))

hotel
City Hotel      0.10
Resort Hotel    0.14
Name: total_kids, dtype: float64

In [13]:
# True if greater than or equal to 1, False if less
bookings['has_kids'] = bookings['total_kids'] >= 1

# group by the desired column, for each group take the 'is_canceled' column
# use value_counts with the parameter normalize=True to show the percentage
rates = bookings.groupby('has_kids')['is_canceled'].value_counts(normalize=True)

# round and display the result
round(rates * 100, 2)

has_kids  is_canceled
False     0              62.78
          1              37.22
True      0              65.08
          1              34.92
Name: proportion, dtype: float64