In [1]:
import numpy as np
import pandas as pd

In [3]:
# Load the booking records into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
hotels = pd.read_csv("hotel_booking_data.csv")

In [4]:
# Number of rows in the data set (first entry of the frame's shape)

hotels.shape[0]

119390

In [6]:
# Count the missing values in each column of the data set

hotels.isna().sum()

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company         

In [7]:
# Drop the 'company' column.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been removed is a no-op instead of raising KeyError.

hotels = hotels.drop(columns='company', errors='ignore')

In [13]:
# The 5 country codes that occur most often
# (value_counts() already sorts by descending frequency)

hotels['country'].value_counts().head(5)

PRT    48590
GBR    12129
FRA    10415
ESP     8568
DEU     7287
Name: country, dtype: int64

In [19]:
# Guest with the highest ADR: sort by 'adr' descending and take the first row

hotels.loc[:, ['name', 'adr']].sort_values(by='adr', ascending=False).iloc[0]

name    Daniel Walter
adr            5400.0
Name: 48515, dtype: object

In [21]:
# Mean of the 'adr' column over all bookings, rounded to 2 decimals

round(hotels.loc[:, 'adr'].mean(), 2)

101.83

In [22]:
# Total nights per booking = week nights + weekend nights
hotels['total_nights'] = hotels['stays_in_week_nights'].add(hotels['stays_in_weekend_nights'])

# Average number of nights per booking, rounded to 2 decimals
round(hotels.loc[:, 'total_nights'].mean(), 2)

3.43

In [24]:
# Average total cost per booking
# Total cost of a booking = number of nights * adr

hotels['total_cost'] = hotels['total_nights'].mul(hotels['adr'])

round(hotels.loc[:, 'total_cost'].mean(), 2)

357.85

In [26]:
# Names and e-mail addresses of the guests who made exactly 5 special requests.
# NOTE: the displayed result contains personal data (names, e-mails).
hotels.loc[hotels['total_of_special_requests'] == 5, ['name', 'email']]

Unnamed: 0,name,email
7860,Amanda Harper,Amanda.H66@yahoo.com
11125,Laura Sanders,Sanders_Laura@hotmail.com
14596,Tommy Ortiz,Tommy_O@hotmail.com
14921,Gilbert Miller,Miller.Gilbert@aol.com
14922,Timothy Torres,TTorres@protonmail.com
24630,Jennifer Weaver,Jennifer_W@aol.com
27288,Crystal Horton,Crystal.H@mail.com
27477,Brittney Burke,Burke_Brittney16@att.com
29906,Cynthia Cabrera,Cabrera.Cynthia@xfinity.com
29949,Sarah Floyd,Sarah_F@gmail.com


In [31]:
# Percentage of bookings made by repeat guests, rounded to 2 decimals
# Repeat guests are labeled with is_repeated_guest == 1

round(len(hotels.loc[hotels['is_repeated_guest'] == 1]) / len(hotels) * 100, 2)

3.19

In [38]:
# Top 5 most common last names
# The last whitespace-separated token of each name is taken as the last name.
# The vectorized .str accessor yields NaN for a missing or empty name instead of
# raising the way a Python-level lambda would; value_counts() skips NaN by default.

hotels['name'].str.split().str[-1].value_counts().head(5)

Smith       2503
Johnson     1990
Williams    1618
Jones       1434
Brown       1423
Name: name, dtype: int64

In [45]:
# Top 5 bookings by number of kids, where kids = babies + children
# A missing value in either column makes total_kids NaN for that row;
# sort_values() places NaN rows last, so they do not reach the top 5.
hotels['total_kids'] = hotels['babies'].add(hotels['children'])

hotels.loc[:, ['name', 'adults', 'total_kids']].sort_values(by='total_kids', ascending=False).head(5)

Unnamed: 0,name,adults,total_kids
328,Jamie Ramirez,2,10.0
46619,Nicholas Parker,2,10.0
78656,Marc Robinson,1,9.0
19718,Mr. Jeffrey Cross,2,3.0
107837,Albert French,2,3.0


In [49]:
# Top 5 phone area codes
# The area code is the part of the phone number before the first "-".
# The vectorized .str accessor propagates missing phone numbers as NaN instead
# of raising; value_counts() skips NaN by default.

hotels['phone-number'].str.split("-").str[0].value_counts().head(5)

799    168
185    167
541    166
739    163
763    163
Name: phone-number, dtype: int64

In [50]:
# Total counts for each day of the week a person arrived

def convert(month, day, year):
    """Join three date parts into one "month-day-year" string.

    Each part is rendered with its default string formatting and the parts
    are separated by hyphens, e.g. convert("July", 1, 2015) -> "July-1-2015".
    """
    return "{}-{}-{}".format(month, day, year)

In [52]:
# Build a "month-day-year" string for every booking by calling convert()
# once per row on the three arrival-date columns.
hotels['date'] = list(map(
    convert,
    hotels['arrival_date_month'],
    hotels['arrival_date_day_of_month'],
    hotels['arrival_date_year'],
))

In [53]:
# Display the 'date' column to inspect the assembled values
hotels['date']

0            July-1-2015
1            July-1-2015
2            July-1-2015
3            July-1-2015
4            July-1-2015
               ...      
119385    August-30-2017
119386    August-31-2017
119387    August-31-2017
119388    August-31-2017
119389    August-29-2017
Name: date, Length: 119390, dtype: object

In [55]:
# Parse the "month-day-year" strings (e.g. "July-1-2015") into datetimes.
# The explicit format (full month name, day of month, 4-digit year) makes the
# parsing deterministic instead of relying on pandas' format inference.
# NOTE(review): %B matches month names in the active locale - confirm the
# notebook runs under an English locale.
hotels['date'] = pd.to_datetime(hotels['date'], format='%B-%d-%Y')

In [62]:
# Map each arrival date to its weekday name and count the bookings per weekday
hotels['date'].dt.day_name().value_counts()

Friday       19631
Thursday     19254
Monday       18171
Saturday     18055
Wednesday    16139
Sunday       14141
Tuesday      13999
Name: date, dtype: int64