In [None]:

import numpy as np 
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import warnings
warnings.filterwarnings("ignore")

import seaborn as sns
import matplotlib.pyplot as plt 

import datetime
import calendar
import matplotlib.dates as mdates

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

#### In this notebook, I am showing my final prep work on the Hotel Bookings data.  I arrived at this point after starting some EDA and noticing values in the data that don't show what I would expect them to show.  There were a few iterations and this is where I landed.  Since this is a data set that I imagine has gone through some sort of ETL process, or is something someone put together after a SQL query, I do my best to replace data I have questions about with something I feel is more logical.  Of course, if this were a work setting, I would be able to meet with the person that this came from to look at some of the code going into this.

In [None]:
# Load the hotel bookings CSV (a Kaggle dataset) from the Kaggle input directory.
bookings_csv = '/kaggle/input/hotel-booking-demand/hotel_bookings.csv'
df = pd.read_csv(bookings_csv)

In [None]:
# The frame is too wide for the regular view, so preview the first rows
# transposed (one output row per column).
df.head().transpose()

### Data Dictionary:

#### Hotel: Hotel (H1 = Resort Hotel or H2 = City Hotel)

#### is_canceled: Value indicating if the booking was canceled (1) or not (0)

#### lead_time: Number of days that elapsed between the entering date of the booking into the PMS and the arrival date

#### arrival_date_year: Year of arrival date

#### arrival_date_month: Month of arrival date

#### arrival_date_week_number: Week number of year for arrival date

#### arrival_date_day_of_month: Day of arrival date

#### stays_in_weekend_nights: Number of weekend nights (Saturday or Sunday) the guest stayed or booked to stay at the hotel

#### stays_in_week_nights: Number of week nights (Monday to Friday) the guest stayed or booked to stay at the hotel

#### adults: Number of adults

#### children: Number of children

#### babies: Number of babies

#### meal: Type of meal booked. Categories are presented in standard hospitality meal packages: Undefined/SC – no meal package; BB – Bed & Breakfast; HB – Half board (breakfast and one other meal – usually dinner); FB – Full board (breakfast, lunch and dinner)

#### country: Country of origin. Categories are represented as three-letter ISO 3166-1 alpha-3 codes (the source data dictionary cites this as “ISO 3155–3:2013”)

#### market_segment: Market segment designation. In categories, the term “TA” means “Travel Agents” and “TO” means “Tour Operators”

#### distribution_channel: Booking distribution channel. The term “TA” means “Travel Agents” and “TO” means “Tour Operators”

#### is_repeated_guest: Value indicating if the booking name was from a repeated guest (1) or not (0)

#### previous_cancellations: Number of previous bookings that were cancelled by the customer prior to the current booking

#### previous_bookings_not_canceled: Number of previous bookings not cancelled by the customer prior to the current booking

#### reserved_room_type: Code of room type reserved. Code is presented instead of designation for anonymity reasons.

#### assigned_room_type: Code for the type of room assigned to the booking. Sometimes the assigned room type differs from the reserved room type due to hotel operation reasons (e.g. overbooking) or by customer request. Code is presented instead of designation for anonymity reasons.

#### booking_changes: Number of changes/amendments made to the booking from the moment the booking was entered on the PMS until the moment of check-in or cancellation

#### deposit_type: Indication on if the customer made a deposit to guarantee the booking. This variable can assume three categories: No Deposit – no deposit was made; Non Refund – a deposit was made in the value of the total stay cost; Refundable – a deposit was made with a value under the total cost of stay.

#### agent: ID of the travel agency that made the booking

#### company: ID of the company/entity that made the booking or responsible for paying the booking. ID is presented instead of designation for anonymity reasons

#### days_in_waiting_list: Number of days the booking was in the waiting list before it was confirmed to the customer

#### customer_type: Type of booking, assuming one of four categories: Contract - when the booking has an allotment or other type of contract associated to it; Group – when the booking is associated to a group; Transient – when the booking is not part of a group or contract, and is not associated to other transient booking; Transient-party – when the booking is transient, but is associated to at least other transient booking

#### adr: Average Daily Rate as defined by dividing the sum of all lodging transactions by the total number of staying nights

#### required_car_parking_spaces: Number of car parking spaces required by the customer

#### total_of_special_requests: Number of special requests made by the customer (e.g. twin bed or high floor)

#### reservation_status: Reservation last status, assuming one of three categories: Canceled – booking was canceled by the customer; Check-Out – customer has checked in but already departed; No-Show – customer did not check-in and did inform the hotel of the reason why

#### reservation_status_date: Date at which the last status was set. This variable can be used in conjunction with reservation_status to understand when the booking was canceled or when the customer checked out of the hotel

## Data Dictionary Thoughts:
### Possible data collection recommendation: These do not appear to be associated with a particular customer/account number.  This additional data could be useful in identifying possible cancellation patterns.

In [None]:
# Summary statistics for the numeric columns (count, mean, std, min, quartiles, max).
df.describe()

## What stands out from the describe?
### There is a max lead time of 737 days!  Do hotels even allow that?  Is that a good data point?
### Someone stayed 19 weekend nights.  Did this person/family stay for 10-11 weeks?  Looking at stays_in_week_nights, this seems plausible as a 50 weekday stretch would be 10 weeks.
### 55 adults in a single room? 10 children?  10 babies?
### \$5,400 for an average daily rate?
### 8 Parking spots?  I guess that can work with 55 adults.
### There are guests that show 0 total nights stay?
### Someone averaged -\$6.38/night?

# Some feature engineering:
## Create an arrival date:

In [None]:
# Translate the month name into its calendar number (January -> 1 ... December -> 12)
# so it can later be combined with the year and day into a datetime.
month_names = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
month_to_number = {name: number for number, name in enumerate(month_names, start=1)}
df['month'] = df['arrival_date_month'].map(month_to_number)
df.head(3)

In [None]:
# Shorten the year and day-of-month column names to 'year' and 'day' so that,
# together with 'month', pd.to_datetime can assemble them into a date.
date_part_names = {'arrival_date_year': 'year', 'arrival_date_day_of_month': 'day'}
df.rename(columns=date_part_names, inplace=True)

# Assemble the arrival date from its year / month / day parts.
date_parts = df[['year', 'month', 'day']]
df['arrival_date'] = pd.to_datetime(date_parts)

## Create an arrival day of week.  This will be used to replace rows that have 0's in both stays in weekend or weekday nights:

In [None]:
#Values assigned will be 0-6, with Monday = 0 and Sunday = 6.
#With this logic, the code in the next cell can be used to create a weekend/weekday column.
#The vectorized .dt accessor yields the same day numbers as reading .dayofweek
#from each Timestamp one at a time, without a Python-level loop over every row.
df['arrival_day_of_week'] = df['arrival_date'].dt.dayofweek

## Create a boolean column indicating whether the arrival date was a weekday or weekend:

In [None]:
#Saturdays are 5's and Sundays are 6's (Monday = 0).  Where 5 or 6, put a 1 to indicate the arrival
#date was on a weekend and put a 0 to indicate it wasn't a weekend.
df['arrival_day_is_weekend'] = np.where(df['arrival_day_of_week'] >= 5, 1, 0)

## Create a column where previous bookings canceled is divided by the sum of cancels and non-cancels, or total visits:

In [None]:
# Share of a guest's prior bookings that were cancelled:
#   previous_cancellations / (previous_cancellations + previous_bookings_not_canceled)
prior_bookings = df['previous_cancellations'] + df['previous_bookings_not_canceled']

# Guests with no prior bookings give 0/0 = NaN; treat them as a 0 cancel rate.
# The filled Series is assigned back instead of calling
# df['cancel_rate'].fillna(..., inplace=True): that chained form acts on a
# temporary object and leaves df unchanged when pandas Copy-on-Write is active.
df['cancel_rate'] = (df['previous_cancellations'] / prior_bookings).fillna(0)

df[df['cancel_rate'] > 0].head(3)

## Create a column for total number of guests:

In [None]:
#Replacing 0's with 1's in the adult column on the assumption
#that an adult must check in and use the room.
#The result is assigned back: calling replace(..., inplace=True) on df['adults']
#acts on a temporary object and leaves df unchanged under pandas Copy-on-Write.
df['adults'] = df['adults'].replace({0: 1})

In [None]:
# Party size = adults + children + babies.
df['total_guests'] = df['adults'].add(df['children']).add(df['babies'])

# Preview a few bookings with more than one guest.
df.loc[df['total_guests'].gt(1)].head()

## Create a column for total number of days, which is the sum of stays_in_week_nights and stays_in_weekend_nights:

In [None]:
# Total length of stay = week nights + weekend nights.
df['total_nights_stay'] = df['stays_in_week_nights'].add(df['stays_in_weekend_nights'])

In [None]:
# Rows with total_nights_stay <= 0 look like a data quality issue: a guest who
# stayed and checked out should have at least one night. Count the checked-out
# bookings that show no nights.
zero_night_checkouts = df['total_nights_stay'].le(0) & df['reservation_status'].eq('Check-Out')
night_columns = ['total_nights_stay', 'stays_in_week_nights', 'stays_in_weekend_nights']
df.loc[zero_night_checkouts, night_columns].groupby('total_nights_stay').count()

In [None]:
# Some rows have total_nights_stay == 0. Before changing them all to 1, check
# how those rows are spread across the reservation_status values rather than
# assuming they are all Canceled or No-Show.
no_nights = df['total_nights_stay'].le(0)
df.loc[no_nights, ['total_nights_stay', 'reservation_status']].groupby('reservation_status').count()

In [None]:
# Few Canceled / No-Show rows have 0 nights. If the data were built so that a
# cancellation or no-show automatically showed 0 nights, those groups would hold
# nothing but 0's -- so count the rows per nights value within each status
# other than Check-Out.
not_checked_out = df['reservation_status'].ne('Check-Out')
(
    df.loc[not_checked_out, ['total_nights_stay', 'reservation_status', 'adr']]
    .groupby(['reservation_status', 'total_nights_stay'])
    .count()
)

## We do not see this.  We see that cancel and no-show rows can have > 0 values in the nights stay column.  For this reason, we will change all rows with 0, to at least 1.  If we were able to meet with the person that put this data together, we would most likely have a more accurate solution, but we'll do what we can on this one.

In [None]:
# Bookings with no nights recorded whose arrival day is not a weekend
# get one night in stays_in_week_nights.
no_nights_weekday_arrival = df['total_nights_stay'].eq(0) & df['arrival_day_is_weekend'].eq(0)
df.loc[no_nights_weekday_arrival, 'stays_in_week_nights'] = 1

In [None]:
# Bookings with no nights recorded whose arrival day is a weekend
# get one night in stays_in_weekend_nights.
no_nights_weekend_arrival = df['total_nights_stay'].eq(0) & df['arrival_day_is_weekend'].eq(1)
df.loc[no_nights_weekend_arrival, 'stays_in_weekend_nights'] = 1

In [None]:
# total_nights_stay is the sum of the week night and weekend night columns.
# Those columns were edited for the zero-night rows, so rebuild the total
# to reflect the changes.
df['total_nights_stay'] = df['stays_in_week_nights'].add(df['stays_in_weekend_nights'])

## Create a total \$ Amount column:

In [None]:
# Total amount for the booking = number of nights x average daily rate.
df['total_dollar'] = df['total_nights_stay'].mul(df['adr'])

## Create a column summing previous cancellations and non-cancellations:

In [None]:
# All prior bookings for the guest, cancelled or not.
df['total_previous_bookings'] = df['previous_cancellations'].add(df['previous_bookings_not_canceled'])

# Some validation:
## With this new 'cancel_rate' column, my thought is that someone that is not a repeat guest (0), should have only a 0.0 in the cancel_rate column.

In [None]:
# Range of cancel_rate for non-repeat (0) and repeat (1) guests.
df.groupby('is_repeated_guest')[['cancel_rate']].agg(['min', 'max'])

## That's not the case.  Looking below, it looks like there are just over 5,500 guests that have previous visits that are labeled as 0, or not a repeat guest:

In [None]:
#If a guest has previously cancelled, they will be considered a repeat guest.
#Count, then show 3 rows, where the guest has previous cancellations
#(cancel_rate > 0) but is not listed as a repeat guest (0).
non_repeat_with_cancellations = (df['is_repeated_guest'] == 0) & (df['cancel_rate'] > 0)

res_non_repeat_w_cancel_rate_count = df.loc[non_repeat_with_cancellations, 'is_repeated_guest'].count()

#The message is built from two adjacent literals with an explicit space after
#"and"; a backslash continuation inside a single literal printed "andvisits".
message = ('There appear to be {0} rows where we see previous cancellations and '
           'visits, but the guest shows as a new guest.')
print(message.format(res_non_repeat_w_cancel_rate_count))

df.loc[non_repeat_with_cancellations,
       ['previous_cancellations', 'previous_bookings_not_canceled', 'is_repeated_guest',
        'cancel_rate']].head(3)

In [None]:
# Double check with guests not flagged as repeat guests whose cancel rate is
# strictly between 0 and 1, i.e. they show both previous cancellations and
# previous stays.
partial_cancel_non_repeat = (
    df['is_repeated_guest'].eq(0)
    & df['cancel_rate'].gt(0)
    & df['cancel_rate'].lt(1)
)
history_columns = ['previous_cancellations', 'previous_bookings_not_canceled',
                   'is_repeated_guest', 'cancel_rate']
df.loc[partial_cancel_non_repeat, history_columns].tail(3)

In [None]:
# Guests listed as non-repeat that have both previous cancellations and previous
# stays, counted per reservation status to confirm they do not all fall under a
# single status value.
non_repeat_with_both = (
    df['is_repeated_guest'].eq(0)
    & df['previous_cancellations'].gt(0)
    & df['previous_bookings_not_canceled'].gt(0)
)
status_columns = ['is_repeated_guest', 'reservation_status',
                  'previous_cancellations', 'previous_bookings_not_canceled']
df.loc[non_repeat_with_both, status_columns].groupby(['is_repeated_guest', 'reservation_status']).count()

## I'm headed down a rabbit hole here.  There are customers that have checked out but are not repeat customers.  I will change any customer with a cancel_rate > 0 to a 1 in the is_repeated_guest column.  Any customer with 0 total bookings will be changed to 0 in the is_repeated_guest column as well.  This might fix a lot.

In [None]:
# Guests flagged as repeat guests even though they have no previous bookings
# of either kind (no cancellations and no non-cancelled bookings).
repeat_without_history = (
    df['is_repeated_guest'].eq(1)
    & df['previous_cancellations'].eq(0)
    & df['previous_bookings_not_canceled'].eq(0)
)
history_columns = ['previous_cancellations', 'previous_bookings_not_canceled',
                   'is_repeated_guest', 'cancel_rate']
flagged_rows = df.loc[repeat_without_history, history_columns]

message = ('There are {} rows where the customer is listed as a repeated guest, '
           'but they have 0 previous bookings. \nHere are the first 5 rows:')
print(message.format(flagged_rows.shape[0]))

flagged_rows.head()

In [None]:
# Show every column (transposed) for the first 10 rows where the customer is
# listed as a previous guest but has no previous reservation.
repeat_without_history = (
    df['is_repeated_guest'].eq(1)
    & df['previous_cancellations'].eq(0)
    & df['previous_bookings_not_canceled'].eq(0)
)
df.loc[repeat_without_history].head(10).transpose()

## Similarly, there are rows where the customer has no previous cancel or non-cancel event, but they are labeled as a repeat customer.  I pulled all columns for the first 10 observations to check things like reservation_status (maybe these were all canceled due to an input error).  Some of these have a reservation_status of Check-Out, indicating the party checked in, stayed the night, and then departed.  Another observation from this: someone checked out, but stayed 0 nights...?  The two observations with 0 nights also have 0 adr.

In [None]:
# The night columns were adjusted so every booking has at least one week or
# weekend night, so both the count and the table below should come back empty.
no_nights = df['total_nights_stay'].le(0)
print('There are {} rows where the total nights stayed are 0 or less.'
      .format(df.loc[no_nights, 'total_nights_stay'].count()))

(
    df.loc[no_nights, ['is_canceled', 'reservation_status', 'total_nights_stay', 'adr']]
    .groupby(['is_canceled', 'reservation_status', 'total_nights_stay'], as_index=False)
    .agg(['min', 'max', 'count'])
)

## There are a lot of people staying with 0 ADR.  The initial thought is that these customers are using reward points or some sort of voucher for their stay.

In [None]:
# Count the bookings whose ADR is zero or negative, then break them down by
# cancellation flag, reservation status and length of stay.
free_or_negative = df['adr'].le(0)
print('There are {} rows where the ADR is <= 0.'
      .format(df.loc[free_or_negative, 'adr'].count()))

(
    df.loc[free_or_negative, ['is_canceled', 'reservation_status', 'total_nights_stay', 'adr']]
    .groupby(['is_canceled', 'reservation_status', 'total_nights_stay'], as_index=False)
    .agg(['min', 'max', 'count'])
)

## Changing values based on logic:

In [None]:
# A customer listed as a repeated guest who has no previous bookings is
# changed to 0; every other row keeps its current flag.
no_history = df['total_previous_bookings'].eq(0)
df.loc[no_history, 'is_repeated_guest'] = 0

In [None]:
# A customer listed as a first time guest who has previous bookings is
# changed to 1; every other row keeps its current flag.
has_history = df['total_previous_bookings'].gt(0)
df.loc[has_history, 'is_repeated_guest'] = 1

In [None]:
# Guests listed as first time guests should now only have 0 in cancel_rate:
# both the min and the max are expected to be 0.
first_time = df['is_repeated_guest'].eq(0)
df.loc[first_time, ['is_repeated_guest', 'cancel_rate']].groupby('is_repeated_guest').agg(['min', 'max'])

In [None]:
# Customers with a 0 cancel_rate should have no previous cancellations,
# so the sum over those rows is expected to be 0.
zero_rate = df['cancel_rate'].eq(0)
df.loc[zero_rate, ['previous_cancellations']].sum()

In [None]:
#Change min ADR to 0.  $0 a night is feasible based on a rewards program.
#I don't like the thought of a customer receiving money for their stay.
#I think that's a data issue.
df['adr'] = df['adr'].clip(lower=0.0)

#total_dollar was derived from adr before it was clipped, so rebuild it here;
#otherwise a row whose adr was negative keeps a negative total.
df['total_dollar'] = df['total_nights_stay'] * df['adr']

# Looking at Null's:

In [None]:
# Null count and null percentage per column, listing only the columns that
# contain nulls, largest count first.
null_count = df.isnull().sum()
null_percent = (null_count / df.shape[0]) * 100
df_nulls = pd.DataFrame({'null_count': null_count, 'null_percent': null_percent})

has_nulls = df_nulls['null_count'].gt(0)
df_nulls.loc[has_nulls].sort_values(by='null_count', ascending=False)

## Company is almost all Null values.  This will most likely be dropped.
## Children has only 3 rows with Nulls.  Let's replace those with 0 and then recreate the total guests column so those rows are filled with the totals of adults and babies.

In [None]:
# Missing children counts are replaced with 0. The filled Series is assigned
# back: fillna(..., inplace=True) on df['children'] acts on a temporary object
# and leaves df unchanged under pandas Copy-on-Write.
df['children'] = df['children'].fillna(0)

# Rebuild total_guests so rows that had a missing children count get a total.
df['total_guests'] = df['adults'] + df['children'] + df['babies']
df[df['total_guests'] > 1].head()

In [None]:
# Number of distinct (non-null) agent values.
df['agent'].nunique()

In [None]:
# Number of distinct (non-null) country values.
df['country'].nunique()

## At this point, we will drop the 'company' column.
## We can see that agent is a float type, so we will change null's to 0.0.  Country is a 3 letter string, so we will replace null's with 'Unknown'.

In [None]:
# Fill the remaining nulls: 0.0 for the float agent id, 'Unknown' for country.
# The results are assigned back: fillna(..., inplace=True) on a selected column
# acts on a temporary object and leaves df unchanged under pandas Copy-on-Write.
df['agent'] = df['agent'].fillna(0.0)
df['country'] = df['country'].fillna('Unknown')

# company is almost entirely null, so drop the column.
df = df.drop(columns=['company'])

In [None]:
# This should now show nothing: the column with lots of nulls was dropped
# and the nulls in the other columns were replaced.
null_count = df.isnull().sum()
null_percent = (null_count / df.shape[0]) * 100
df_nulls = pd.DataFrame({'null_count': null_count, 'null_percent': null_percent})

has_nulls = df_nulls['null_count'].gt(0)
df_nulls.loc[has_nulls].sort_values(by='null_count', ascending=False)

In [None]:
# Final look at the cleaned frame, transposed so every column is visible.
df.head().transpose()

# At this point, we have completed enough cleaning of the data that we will be able to move forward with some EDA and visualization in the next notebook.  Keep in mind that what we've done here doesn't mean that this data is at its absolute best state.  We are saying that we know we have made changes and we feel comfortable moving forward.  If we do see some issues later, we can always iterate and make changes in this stage.