# Data Cleaning & Exploration - Hotel Bookings

# Import Libraries

In [134]:
from datetime import datetime

import numpy as np
import pandas as pd

# Load Dataset

In [135]:
# Read the raw hotel bookings CSV into the working DataFrame `hb`.
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to the project directory so the notebook runs elsewhere.
raw_csv_path = '/Users/raheem-gsu/Documents/Github Portfolio Projects/Hotel-Booking-Analysis/hotel_bookings.csv'
hb = pd.read_csv(raw_csv_path)
hb

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.00,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.00,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.00,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.00,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.00,0,1,Check-Out,2015-07-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,0,23,2017,August,35,30,2,5,2,...,No Deposit,394.0,,0,Transient,96.14,0,0,Check-Out,2017-09-06
119386,City Hotel,0,102,2017,August,35,31,2,5,3,...,No Deposit,9.0,,0,Transient,225.43,0,2,Check-Out,2017-09-07
119387,City Hotel,0,34,2017,August,35,31,2,5,2,...,No Deposit,9.0,,0,Transient,157.71,0,4,Check-Out,2017-09-07
119388,City Hotel,0,109,2017,August,35,31,2,5,2,...,No Deposit,89.0,,0,Transient,104.40,0,0,Check-Out,2017-09-07


# Initial Data Inspection

In [136]:
# Print each column's name, non-null count and dtype for the raw frame.
hb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

- `company` is an ID number. Rename it to company_id and convert it to an object. Remove the decimal point.
- `agent` is an ID number. Rename it to agent_id and convert it to an object. Remove the decimal point.
- Convert `reservation_status_date` to datetime64.
- Combine "arrival" fields into a new datetime64 field called `arrival_date`.


In [137]:
# Count the missing (NULL) values in every column.
missing_per_column = hb.isna().sum()
missing_per_column

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company         

In [138]:
# Show the rows whose `children` value is missing.
children_is_missing = hb['children'].isna()
hb[children_is_missing]

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
40600,City Hotel,1,2,2015,August,32,3,1,0,2,...,No Deposit,,,0,Transient-Party,12.0,0,1,Canceled,2015-08-01
40667,City Hotel,1,1,2015,August,32,5,0,2,2,...,No Deposit,14.0,,0,Transient-Party,12.0,0,1,Canceled,2015-08-04
40679,City Hotel,1,1,2015,August,32,5,0,2,3,...,No Deposit,,,0,Transient-Party,18.0,0,2,Canceled,2015-08-04
41160,City Hotel,1,8,2015,August,33,13,2,5,2,...,No Deposit,9.0,,0,Transient-Party,76.5,0,1,Canceled,2015-08-09


- The `children` column has 4 NULL values - set them to zero, then convert the column to int64.

# Data Cleaning

## Fixing company

In [139]:
# Cast `company` to strings; missing entries become the literal text 'nan'.
company_as_text = hb['company'].astype(str)
hb['company'] = company_as_text
hb['company']

0         nan
1         nan
2         nan
3         nan
4         nan
         ... 
119385    nan
119386    nan
119387    nan
119388    nan
119389    nan
Name: company, Length: 119390, dtype: object

In [140]:
# Replace the placeholder text 'nan' with a real missing value (None).
company_is_placeholder = hb['company'].eq('nan')
hb.loc[company_is_placeholder, 'company'] = None
hb['company']

0         None
1         None
2         None
3         None
4         None
          ... 
119385    None
119386    None
119387    None
119388    None
119389    None
Name: company, Length: 119390, dtype: object

In [141]:
# Keep only the text before the decimal point (e.g. '40.0' -> '40').
company_pieces = hb['company'].str.split('.')
hb['company'] = company_pieces.str.get(0)
hb['company'].value_counts()

company
40     927
223    784
67     267
45     250
153    215
      ... 
104      1
531      1
160      1
413      1
386      1
Name: count, Length: 352, dtype: int64

In [142]:
# `company` holds identifiers, so expose it under the name `company_id`.
company_rename = {'company': 'company_id'}
hb = hb.rename(columns=company_rename)

## Fixing agent 

In [143]:
# Cast `agent` to strings; missing entries become the literal text 'nan'.
agent_as_text = hb['agent'].astype(str)
hb['agent'] = agent_as_text
hb['agent']

0           nan
1           nan
2           nan
3         304.0
4         240.0
          ...  
119385    394.0
119386      9.0
119387      9.0
119388     89.0
119389      9.0
Name: agent, Length: 119390, dtype: object

In [144]:
# Replace the placeholder text 'nan' with a real missing value (None).
agent_is_placeholder = hb['agent'].eq('nan')
hb.loc[agent_is_placeholder, 'agent'] = None
hb['agent']

0          None
1          None
2          None
3         304.0
4         240.0
          ...  
119385    394.0
119386      9.0
119387      9.0
119388     89.0
119389      9.0
Name: agent, Length: 119390, dtype: object

In [145]:
# Keep only the text before the decimal point (e.g. '304.0' -> '304').
agent_pieces = hb['agent'].str.split('.')
hb['agent'] = agent_pieces.str.get(0)
hb['agent'].value_counts()

agent
9      31961
240    13922
1       7191
14      3640
7       3539
       ...  
289        1
432        1
265        1
93         1
304        1
Name: count, Length: 333, dtype: int64

In [146]:
# `agent` holds identifiers, so expose it under the name `agent_id`.
agent_rename = {'agent': 'agent_id'}
hb = hb.rename(columns=agent_rename)

## Converting reservation_status_date to datetime64

In [147]:
# Cast `reservation_status_date` to datetime64[ns] so it can be used as a date.
status_dates = hb['reservation_status_date']
hb['reservation_status_date'] = status_dates.astype('datetime64[ns]')
hb['reservation_status_date']

0        2015-07-01
1        2015-07-01
2        2015-07-02
3        2015-07-02
4        2015-07-03
            ...    
119385   2017-09-06
119386   2017-09-07
119387   2017-09-07
119388   2017-09-07
119389   2017-09-07
Name: reservation_status_date, Length: 119390, dtype: datetime64[ns]

In [148]:
# Preview the three arrival columns that will be merged into one date.
arrival_fields = ['arrival_date_year', 'arrival_date_month', 'arrival_date_day_of_month']
hb[arrival_fields]

Unnamed: 0,arrival_date_year,arrival_date_month,arrival_date_day_of_month
0,2015,July,1
1,2015,July,1
2,2015,July,1
3,2015,July,1
4,2015,July,1
...,...,...,...
119385,2017,August,30
119386,2017,August,31
119387,2017,August,31
119388,2017,August,31


In [149]:
# Convert arrival month names (e.g. 'July') to month numbers (e.g. 7).
# Each distinct name is parsed once and the lookup is mapped onto the column,
# instead of calling strptime for every row. `datetime` comes from the
# notebook's top import cell.
# NOTE: strptime's '%B' matches full month names in the current locale.
month_number_by_name = {
    month_name: datetime.strptime(month_name, '%B').month
    for month_name in hb['arrival_date_month'].unique()
}

hb['arrival_date_month'] = hb['arrival_date_month'].map(month_number_by_name)
hb['arrival_date_month']

0         7
1         7
2         7
3         7
4         7
         ..
119385    8
119386    8
119387    8
119388    8
119389    8
Name: arrival_date_month, Length: 119390, dtype: int64

In [150]:
# Combine `arrival_date_year`, `arrival_date_month`, and `arrival_date_day_of_month`
# into a new datetime64 field called `arrival_date`.
# pd.to_datetime assembles dates from a frame whose columns are named
# year/month/day, so no non-zero-padded strings such as '2015-7-1' have to be
# built and re-parsed. This expects `arrival_date_month` to be numeric already.
arrival_parts = hb[['arrival_date_year', 'arrival_date_month', 'arrival_date_day_of_month']].rename(
    columns={
        'arrival_date_year': 'year',
        'arrival_date_month': 'month',
        'arrival_date_day_of_month': 'day',
    }
)
hb['arrival_date'] = pd.to_datetime(arrival_parts)
hb['arrival_date']

0        2015-07-01
1        2015-07-01
2        2015-07-01
3        2015-07-01
4        2015-07-01
            ...    
119385   2017-08-30
119386   2017-08-31
119387   2017-08-31
119388   2017-08-31
119389   2017-08-29
Name: arrival_date, Length: 119390, dtype: datetime64[ns]

In [151]:
# The separate arrival columns are now redundant; keep only `arrival_date`.
redundant_arrival_columns = [
    'arrival_date_year',
    'arrival_date_month',
    'arrival_date_day_of_month',
    'arrival_date_week_number',
]
hb = hb.drop(columns=redundant_arrival_columns)
hb

Unnamed: 0,hotel,is_canceled,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,...,agent_id,company_id,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date
0,Resort Hotel,0,342,0,0,2,0.0,0,BB,PRT,...,,,0,Transient,0.00,0,0,Check-Out,2015-07-01,2015-07-01
1,Resort Hotel,0,737,0,0,2,0.0,0,BB,PRT,...,,,0,Transient,0.00,0,0,Check-Out,2015-07-01,2015-07-01
2,Resort Hotel,0,7,0,1,1,0.0,0,BB,GBR,...,,,0,Transient,75.00,0,0,Check-Out,2015-07-02,2015-07-01
3,Resort Hotel,0,13,0,1,1,0.0,0,BB,GBR,...,304,,0,Transient,75.00,0,0,Check-Out,2015-07-02,2015-07-01
4,Resort Hotel,0,14,0,2,2,0.0,0,BB,GBR,...,240,,0,Transient,98.00,0,1,Check-Out,2015-07-03,2015-07-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,0,23,2,5,2,0.0,0,BB,BEL,...,394,,0,Transient,96.14,0,0,Check-Out,2017-09-06,2017-08-30
119386,City Hotel,0,102,2,5,3,0.0,0,BB,FRA,...,9,,0,Transient,225.43,0,2,Check-Out,2017-09-07,2017-08-31
119387,City Hotel,0,34,2,5,2,0.0,0,BB,DEU,...,9,,0,Transient,157.71,0,4,Check-Out,2017-09-07,2017-08-31
119388,City Hotel,0,109,2,5,2,0.0,0,BB,GBR,...,89,,0,Transient,104.40,0,0,Check-Out,2017-09-07,2017-08-31


# Handling Missing Values

- `company_id`, `agent_id` (formerly `company` and `agent`), and `country` are categorical identifiers by nature and contain many NULL values. We will keep them because they account for a large portion of the dataset and contain other information valuable for analysis.

## Fixing children column

In [152]:
# Treat missing `children` counts as 0, then store the column as integers.
children_filled = hb['children'].fillna(0)
hb['children'] = children_filled.astype(int)
hb['children'].value_counts()

children
0     110800
1       4861
2       3652
3         76
10         1
Name: count, dtype: int64

# Descriptive Statistics

In [153]:
# Summary statistics, transposed so each column of `hb` becomes one row.
summary_stats = hb.describe()
summary_stats.transpose()

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
is_canceled,119390.0,0.370416,0.0,0.0,0.0,1.0,1.0,0.482918
lead_time,119390.0,104.011416,0.0,18.0,69.0,160.0,737.0,106.863097
stays_in_weekend_nights,119390.0,0.927599,0.0,0.0,1.0,2.0,19.0,0.998613
stays_in_week_nights,119390.0,2.500302,0.0,1.0,2.0,3.0,50.0,1.908286
adults,119390.0,1.856403,0.0,2.0,2.0,2.0,55.0,0.579261
children,119390.0,0.103886,0.0,0.0,0.0,0.0,10.0,0.398555
babies,119390.0,0.007949,0.0,0.0,0.0,0.0,10.0,0.097436
is_repeated_guest,119390.0,0.031912,0.0,0.0,0.0,0.0,1.0,0.175767
previous_cancellations,119390.0,0.087118,0.0,0.0,0.0,0.0,26.0,0.844336
previous_bookings_not_canceled,119390.0,0.137097,0.0,0.0,0.0,0.0,72.0,1.497437


# Exploratory Data Analysis (EDA)

## What is the year range of bookings?

- Arrival Date: 2015 - 2017


- Reservation Status Date: 2014 - 2017




In [154]:
# Number of bookings for each `hotel` value.
hotel_counts = hb['hotel'].value_counts()
hotel_counts

hotel
City Hotel      79330
Resort Hotel    40060
Name: count, dtype: int64

In [155]:
# Number of bookings for each `market_segment` value.
market_segment_counts = hb['market_segment'].value_counts()
market_segment_counts

market_segment
Online TA        56477
Offline TA/TO    24219
Groups           19811
Direct           12606
Corporate         5295
Complementary      743
Aviation           237
Undefined            2
Name: count, dtype: int64

In [156]:
# Number of bookings for each `distribution_channel` value.
distribution_channel_counts = hb['distribution_channel'].value_counts()
distribution_channel_counts

distribution_channel
TA/TO        97870
Direct       14645
Corporate     6677
GDS            193
Undefined        5
Name: count, dtype: int64

In [157]:
# Number of bookings for each `deposit_type` value.
deposit_type_counts = hb['deposit_type'].value_counts()
deposit_type_counts

deposit_type
No Deposit    104641
Non Refund     14587
Refundable       162
Name: count, dtype: int64

In [158]:
# Number of bookings for each `customer_type` value.
customer_type_counts = hb['customer_type'].value_counts()
customer_type_counts

customer_type
Transient          89613
Transient-Party    25124
Contract            4076
Group                577
Name: count, dtype: int64

In [159]:
# Number of bookings for each `reservation_status` value.
reservation_status_counts = hb['reservation_status'].value_counts()
reservation_status_counts

reservation_status
Check-Out    75166
Canceled     43017
No-Show       1207
Name: count, dtype: int64

# Export Cleaned Dataset

In [160]:
# Write the cleaned bookings to a CSV file, leaving out the row index.
cleaned_csv_name = 'hotel_bookings (cleaned).csv'
hb.to_csv(cleaned_csv_name, index=False)